diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..14f5756aa0c03176509bd70ffec821cc02f5dd8f 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+checkpoint-10575/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-12690/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-2115/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-4230/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-6345/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-8460/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..ec3daa06f23591a6040e6b86b735afdf596aa153
--- /dev/null
+++ b/README.md
@@ -0,0 +1,143 @@
+---
+library_name: peft
+license: llama3.1
+base_model: meta-llama/Llama-3.1-8B
+tags:
+- generated_from_trainer
+datasets:
+- ugaoo/instruction_conciseoutput__clinical_trails
+model-index:
+- name: out/meta_llama_Llama_3.1_8B_ugaoo_instruction_conciseoutput__clinical_trails
+ results: []
+---
+
+
+
+[
](https://github.com/axolotl-ai-cloud/axolotl)
+See axolotl config
+
+axolotl version: `0.8.0.dev0`
+```yaml
+base_model: meta-llama/Llama-3.1-8B
+model_type: AutoModelForCausalLM
+tokenizer_type: AutoTokenizer
+trust_remote_code: true
+
+load_in_8bit: false
+load_in_4bit: true
+strict: false
+
+datasets:
+ - path: ugaoo/instruction_conciseoutput__clinical_trails
+ type: alpaca
+val_set_size: 0
+output_dir: ./out/meta_llama_Llama_3.1_8B_ugaoo_instruction_conciseoutput__clinical_trails
+
+sequence_len: 4000
+sample_packing: true
+pad_to_sequence_len: true
+
+adapter: qlora
+lora_r: 256
+lora_alpha: 512
+lora_dropout: 0.05
+lora_target_linear: true
+lora_target_modules:
+ - q_proj
+ - k_proj
+ - v_proj
+ - o_proj
+ - up_proj
+ - down_proj
+ - gate_proj
+lora_modules_to_save:
+ - embed_tokens
+ - lm_head
+
+wandb_project: cosmosearch
+wandb_entity:
+wandb_watch:
+wandb_name: meta_llama_Llama_3.1_8B_ugaoo_instruction_conciseoutput__clinical_trails
+wandb_log_model:
+
+gradient_accumulation_steps: 3
+micro_batch_size: 4
+num_epochs: 6
+optimizer: adamw_torch
+lr_scheduler: cosine
+learning_rate: 5e-6
+
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16: false
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 100
+evals_per_epoch: 6
+eval_table_size:
+saves_per_epoch: 1
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+save_total_limit: 6
+special_tokens:
+ pad_token: <|end_of_text|>
+```
+
+
+
+# out/meta_llama_Llama_3.1_8B_ugaoo_instruction_conciseoutput__clinical_trails
+
+This model is a fine-tuned version of [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) on the ugaoo/instruction_conciseoutput__clinical_trails dataset.
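+
+A minimal loading sketch: it assumes this repository's LoRA adapter is applied on top of `meta-llama/Llama-3.1-8B` with 4-bit loading, mirroring `load_in_4bit: true` in the config above. The adapter path is a placeholder, and the alpaca-style prompt only illustrates the `type: alpaca` dataset format used for training.
+
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from peft import PeftModel
+
+base_id = "meta-llama/Llama-3.1-8B"
+adapter_id = "path/to/this/adapter"  # placeholder: replace with this repo id or a local path
+
+# Load the base model in 4-bit, mirroring load_in_4bit: true from the config above
+bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
+
+tokenizer = AutoTokenizer.from_pretrained(adapter_id)
+base = AutoModelForCausalLM.from_pretrained(
+    base_id,
+    quantization_config=bnb_config,
+    device_map="auto",
+)
+# Apply the trained LoRA adapter on top of the quantized base model
+model = PeftModel.from_pretrained(base, adapter_id)
+
+# Alpaca-style prompt (the dataset is configured with type: alpaca)
+prompt = (
+    "### Instruction:\nSummarize the eligibility criteria of the clinical trial below.\n\n"
+    "### Input:\n<trial description>\n\n### Response:\n"
+)
+inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+output = model.generate(**inputs, max_new_tokens=256)
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```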
+
+## Model description
+
+More information needed
+
+## Intended uses & limitations
+
+More information needed
+
+## Training and evaluation data
+
+More information needed
+
+## Training procedure
+
+### Training hyperparameters
+
+The following hyperparameters were used during training:
+- learning_rate: 5e-06
+- train_batch_size: 4
+- eval_batch_size: 4
+- seed: 42
+- distributed_type: multi-GPU
+- gradient_accumulation_steps: 3
+- total_train_batch_size: 12
+- optimizer: AdamW (torch) with betas=(0.9, 0.999), epsilon=1e-08, and no additional optimizer arguments
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_steps: 100
+- num_epochs: 6.0
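+
+For reference, the effective batch size of 12 above is micro_batch_size × gradient_accumulation_steps = 4 × 3; at that batch size the run logs 2,115 optimizer steps per epoch, which matches the once-per-epoch checkpoints saved as checkpoint-2115 through checkpoint-12690 (6 × 2,115 = 12,690).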
+
+### Training results
+
+
+
+### Framework versions
+
+- PEFT 0.14.0
+- Transformers 4.49.0
+- Pytorch 2.5.1+cu124
+- Datasets 3.2.0
+- Tokenizers 0.21.0
\ No newline at end of file
diff --git a/adapter_config.json b/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..38890ca21f7e1854f7350049a9113a9eec8a443a
--- /dev/null
+++ b/adapter_config.json
@@ -0,0 +1,40 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "meta-llama/Llama-3.1-8B",
+ "bias": "none",
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 512,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": [
+ "embed_tokens",
+ "lm_head"
+ ],
+ "peft_type": "LORA",
+ "r": 256,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "v_proj",
+ "q_proj",
+ "gate_proj",
+ "k_proj",
+ "down_proj",
+ "up_proj",
+ "o_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/adapter_model.safetensors b/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a86dd37eb82fb1d46b236ea471b6db3ec90b904d
--- /dev/null
+++ b/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1f07f4ee71740fb0bcf54ffe982c86734f08671b6b0cb55e4add89e30c744f1b
+size 3443586272
diff --git a/checkpoint-10575/README.md b/checkpoint-10575/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..049d467664ca6172b7ffbe6ba60b3eac7479cac4
--- /dev/null
+++ b/checkpoint-10575/README.md
@@ -0,0 +1,202 @@
+---
+base_model: meta-llama/Llama-3.1-8B
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.14.0
\ No newline at end of file
diff --git a/checkpoint-10575/adapter_config.json b/checkpoint-10575/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..38890ca21f7e1854f7350049a9113a9eec8a443a
--- /dev/null
+++ b/checkpoint-10575/adapter_config.json
@@ -0,0 +1,40 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "meta-llama/Llama-3.1-8B",
+ "bias": "none",
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 512,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": [
+ "embed_tokens",
+ "lm_head"
+ ],
+ "peft_type": "LORA",
+ "r": 256,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "v_proj",
+ "q_proj",
+ "gate_proj",
+ "k_proj",
+ "down_proj",
+ "up_proj",
+ "o_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-10575/adapter_model.safetensors b/checkpoint-10575/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..dd54bf712e5af1f879554a9d4b6c82513809423d
--- /dev/null
+++ b/checkpoint-10575/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:666c561a647dd6b4f82d8fea18727b69dbae4c01c62666d44986414a5bd74a87
+size 3443586272
diff --git a/checkpoint-10575/global_step10575/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-10575/global_step10575/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2bd3486dad95f939747e6070cf659a4318c3eb7d
--- /dev/null
+++ b/checkpoint-10575/global_step10575/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4e27a0f175b033408dbfdf484eb68a273b80f9d9aaa9c00f007f85016a6987a2
+size 20661195036
diff --git a/checkpoint-10575/global_step10575/mp_rank_00_model_states.pt b/checkpoint-10575/global_step10575/mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..ece1202286a91934faa857aeaf7d67de4f397fdf
--- /dev/null
+++ b/checkpoint-10575/global_step10575/mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:29cfba8bd4c3d53e2fb1257abbe7914c55537330662b4e6dcdd6468dcc783459
+size 3555326841
diff --git a/checkpoint-10575/latest b/checkpoint-10575/latest
new file mode 100644
index 0000000000000000000000000000000000000000..56565e819bc8ef07835bdab964cc11cd67edcd70
--- /dev/null
+++ b/checkpoint-10575/latest
@@ -0,0 +1 @@
+global_step10575
\ No newline at end of file
diff --git a/checkpoint-10575/rng_state.pth b/checkpoint-10575/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..ffe45090af0d1ab26ff3adcca35aeeee802c4527
--- /dev/null
+++ b/checkpoint-10575/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bbb84f257048c95820717aea86696c56a0aa84f41af814a650be05453aa1aa01
+size 14244
diff --git a/checkpoint-10575/scheduler.pt b/checkpoint-10575/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c4e80fb5e19f1b1c29bd0920238b2b31259cc1e2
--- /dev/null
+++ b/checkpoint-10575/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b9f49b370776cfadd191a2189ce69293b7feb8856f50fc048212b0479f81c0ab
+size 1064
diff --git a/checkpoint-10575/special_tokens_map.json b/checkpoint-10575/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..e5b39b6305d89284b04934011c68dbb26bf588ca
--- /dev/null
+++ b/checkpoint-10575/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-10575/tokenizer.json b/checkpoint-10575/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2
--- /dev/null
+++ b/checkpoint-10575/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
diff --git a/checkpoint-10575/tokenizer_config.json b/checkpoint-10575/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..81dd14db6632ad5b35b9d447732e37ac074873a5
--- /dev/null
+++ b/checkpoint-10575/tokenizer_config.json
@@ -0,0 +1,2063 @@
+{
+ "added_tokens_decoder": {
+ "128000": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128001": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128002": {
+ "content": "<|reserved_special_token_0|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128003": {
+ "content": "<|reserved_special_token_1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128004": {
+ "content": "<|finetune_right_pad_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128005": {
+ "content": "<|reserved_special_token_2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128006": {
+ "content": "<|start_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128007": {
+ "content": "<|end_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128008": {
+ "content": "<|eom_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128009": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128010": {
+ "content": "<|python_tag|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128011": {
+ "content": "<|reserved_special_token_3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128012": {
+ "content": "<|reserved_special_token_4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128013": {
+ "content": "<|reserved_special_token_5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128014": {
+ "content": "<|reserved_special_token_6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128015": {
+ "content": "<|reserved_special_token_7|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128016": {
+ "content": "<|reserved_special_token_8|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128017": {
+ "content": "<|reserved_special_token_9|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128018": {
+ "content": "<|reserved_special_token_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128019": {
+ "content": "<|reserved_special_token_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128020": {
+ "content": "<|reserved_special_token_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128021": {
+ "content": "<|reserved_special_token_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128022": {
+ "content": "<|reserved_special_token_14|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128023": {
+ "content": "<|reserved_special_token_15|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128024": {
+ "content": "<|reserved_special_token_16|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128025": {
+ "content": "<|reserved_special_token_17|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128026": {
+ "content": "<|reserved_special_token_18|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128027": {
+ "content": "<|reserved_special_token_19|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128028": {
+ "content": "<|reserved_special_token_20|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128029": {
+ "content": "<|reserved_special_token_21|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128030": {
+ "content": "<|reserved_special_token_22|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128031": {
+ "content": "<|reserved_special_token_23|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128032": {
+ "content": "<|reserved_special_token_24|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128033": {
+ "content": "<|reserved_special_token_25|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128034": {
+ "content": "<|reserved_special_token_26|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128035": {
+ "content": "<|reserved_special_token_27|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128036": {
+ "content": "<|reserved_special_token_28|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128037": {
+ "content": "<|reserved_special_token_29|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128038": {
+ "content": "<|reserved_special_token_30|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128039": {
+ "content": "<|reserved_special_token_31|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128040": {
+ "content": "<|reserved_special_token_32|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128041": {
+ "content": "<|reserved_special_token_33|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128042": {
+ "content": "<|reserved_special_token_34|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128043": {
+ "content": "<|reserved_special_token_35|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128044": {
+ "content": "<|reserved_special_token_36|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128045": {
+ "content": "<|reserved_special_token_37|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128046": {
+ "content": "<|reserved_special_token_38|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128047": {
+ "content": "<|reserved_special_token_39|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128048": {
+ "content": "<|reserved_special_token_40|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128049": {
+ "content": "<|reserved_special_token_41|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128050": {
+ "content": "<|reserved_special_token_42|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128051": {
+ "content": "<|reserved_special_token_43|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128052": {
+ "content": "<|reserved_special_token_44|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128053": {
+ "content": "<|reserved_special_token_45|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128054": {
+ "content": "<|reserved_special_token_46|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128055": {
+ "content": "<|reserved_special_token_47|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128056": {
+ "content": "<|reserved_special_token_48|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128057": {
+ "content": "<|reserved_special_token_49|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128058": {
+ "content": "<|reserved_special_token_50|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128059": {
+ "content": "<|reserved_special_token_51|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128060": {
+ "content": "<|reserved_special_token_52|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128061": {
+ "content": "<|reserved_special_token_53|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128062": {
+ "content": "<|reserved_special_token_54|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128063": {
+ "content": "<|reserved_special_token_55|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128064": {
+ "content": "<|reserved_special_token_56|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128065": {
+ "content": "<|reserved_special_token_57|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128066": {
+ "content": "<|reserved_special_token_58|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128067": {
+ "content": "<|reserved_special_token_59|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128068": {
+ "content": "<|reserved_special_token_60|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128069": {
+ "content": "<|reserved_special_token_61|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128070": {
+ "content": "<|reserved_special_token_62|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128071": {
+ "content": "<|reserved_special_token_63|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128072": {
+ "content": "<|reserved_special_token_64|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128073": {
+ "content": "<|reserved_special_token_65|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128074": {
+ "content": "<|reserved_special_token_66|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128075": {
+ "content": "<|reserved_special_token_67|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128076": {
+ "content": "<|reserved_special_token_68|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128077": {
+ "content": "<|reserved_special_token_69|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128078": {
+ "content": "<|reserved_special_token_70|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128079": {
+ "content": "<|reserved_special_token_71|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128080": {
+ "content": "<|reserved_special_token_72|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128081": {
+ "content": "<|reserved_special_token_73|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128082": {
+ "content": "<|reserved_special_token_74|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128083": {
+ "content": "<|reserved_special_token_75|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128084": {
+ "content": "<|reserved_special_token_76|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128085": {
+ "content": "<|reserved_special_token_77|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128086": {
+ "content": "<|reserved_special_token_78|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128087": {
+ "content": "<|reserved_special_token_79|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128088": {
+ "content": "<|reserved_special_token_80|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128089": {
+ "content": "<|reserved_special_token_81|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128090": {
+ "content": "<|reserved_special_token_82|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128091": {
+ "content": "<|reserved_special_token_83|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128092": {
+ "content": "<|reserved_special_token_84|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128093": {
+ "content": "<|reserved_special_token_85|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128094": {
+ "content": "<|reserved_special_token_86|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128095": {
+ "content": "<|reserved_special_token_87|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128096": {
+ "content": "<|reserved_special_token_88|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128097": {
+ "content": "<|reserved_special_token_89|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128098": {
+ "content": "<|reserved_special_token_90|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128099": {
+ "content": "<|reserved_special_token_91|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128100": {
+ "content": "<|reserved_special_token_92|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128101": {
+ "content": "<|reserved_special_token_93|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128102": {
+ "content": "<|reserved_special_token_94|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128103": {
+ "content": "<|reserved_special_token_95|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128104": {
+ "content": "<|reserved_special_token_96|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128105": {
+ "content": "<|reserved_special_token_97|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128106": {
+ "content": "<|reserved_special_token_98|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128107": {
+ "content": "<|reserved_special_token_99|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128108": {
+ "content": "<|reserved_special_token_100|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128109": {
+ "content": "<|reserved_special_token_101|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128110": {
+ "content": "<|reserved_special_token_102|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128111": {
+ "content": "<|reserved_special_token_103|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128112": {
+ "content": "<|reserved_special_token_104|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128113": {
+ "content": "<|reserved_special_token_105|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128114": {
+ "content": "<|reserved_special_token_106|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128115": {
+ "content": "<|reserved_special_token_107|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128116": {
+ "content": "<|reserved_special_token_108|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128117": {
+ "content": "<|reserved_special_token_109|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128118": {
+ "content": "<|reserved_special_token_110|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128119": {
+ "content": "<|reserved_special_token_111|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128120": {
+ "content": "<|reserved_special_token_112|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128121": {
+ "content": "<|reserved_special_token_113|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128122": {
+ "content": "<|reserved_special_token_114|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128123": {
+ "content": "<|reserved_special_token_115|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128124": {
+ "content": "<|reserved_special_token_116|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128125": {
+ "content": "<|reserved_special_token_117|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128126": {
+ "content": "<|reserved_special_token_118|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128127": {
+ "content": "<|reserved_special_token_119|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128128": {
+ "content": "<|reserved_special_token_120|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128129": {
+ "content": "<|reserved_special_token_121|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128130": {
+ "content": "<|reserved_special_token_122|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128131": {
+ "content": "<|reserved_special_token_123|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128132": {
+ "content": "<|reserved_special_token_124|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128133": {
+ "content": "<|reserved_special_token_125|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128134": {
+ "content": "<|reserved_special_token_126|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128135": {
+ "content": "<|reserved_special_token_127|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128136": {
+ "content": "<|reserved_special_token_128|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128137": {
+ "content": "<|reserved_special_token_129|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128138": {
+ "content": "<|reserved_special_token_130|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128139": {
+ "content": "<|reserved_special_token_131|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128140": {
+ "content": "<|reserved_special_token_132|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128141": {
+ "content": "<|reserved_special_token_133|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128142": {
+ "content": "<|reserved_special_token_134|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128143": {
+ "content": "<|reserved_special_token_135|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128144": {
+ "content": "<|reserved_special_token_136|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128145": {
+ "content": "<|reserved_special_token_137|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128146": {
+ "content": "<|reserved_special_token_138|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128147": {
+ "content": "<|reserved_special_token_139|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128148": {
+ "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128149": {
+ "content": "<|reserved_special_token_141|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128150": {
+ "content": "<|reserved_special_token_142|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128151": {
+ "content": "<|reserved_special_token_143|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128152": {
+ "content": "<|reserved_special_token_144|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128153": {
+ "content": "<|reserved_special_token_145|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128154": {
+ "content": "<|reserved_special_token_146|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128155": {
+ "content": "<|reserved_special_token_147|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128156": {
+ "content": "<|reserved_special_token_148|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128157": {
+ "content": "<|reserved_special_token_149|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128158": {
+ "content": "<|reserved_special_token_150|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128159": {
+ "content": "<|reserved_special_token_151|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128160": {
+ "content": "<|reserved_special_token_152|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128161": {
+ "content": "<|reserved_special_token_153|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128162": {
+ "content": "<|reserved_special_token_154|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128163": {
+ "content": "<|reserved_special_token_155|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128164": {
+ "content": "<|reserved_special_token_156|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128165": {
+ "content": "<|reserved_special_token_157|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128166": {
+ "content": "<|reserved_special_token_158|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128167": {
+ "content": "<|reserved_special_token_159|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128168": {
+ "content": "<|reserved_special_token_160|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128169": {
+ "content": "<|reserved_special_token_161|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128170": {
+ "content": "<|reserved_special_token_162|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128171": {
+ "content": "<|reserved_special_token_163|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128172": {
+ "content": "<|reserved_special_token_164|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128173": {
+ "content": "<|reserved_special_token_165|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128174": {
+ "content": "<|reserved_special_token_166|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128175": {
+ "content": "<|reserved_special_token_167|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128176": {
+ "content": "<|reserved_special_token_168|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128177": {
+ "content": "<|reserved_special_token_169|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128178": {
+ "content": "<|reserved_special_token_170|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128179": {
+ "content": "<|reserved_special_token_171|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128180": {
+ "content": "<|reserved_special_token_172|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128181": {
+ "content": "<|reserved_special_token_173|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128182": {
+ "content": "<|reserved_special_token_174|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128183": {
+ "content": "<|reserved_special_token_175|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128184": {
+ "content": "<|reserved_special_token_176|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128185": {
+ "content": "<|reserved_special_token_177|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128186": {
+ "content": "<|reserved_special_token_178|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128187": {
+ "content": "<|reserved_special_token_179|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128188": {
+ "content": "<|reserved_special_token_180|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128189": {
+ "content": "<|reserved_special_token_181|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128190": {
+ "content": "<|reserved_special_token_182|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128191": {
+ "content": "<|reserved_special_token_183|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128192": {
+ "content": "<|reserved_special_token_184|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128193": {
+ "content": "<|reserved_special_token_185|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128194": {
+ "content": "<|reserved_special_token_186|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128195": {
+ "content": "<|reserved_special_token_187|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128196": {
+ "content": "<|reserved_special_token_188|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128197": {
+ "content": "<|reserved_special_token_189|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128198": {
+ "content": "<|reserved_special_token_190|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128199": {
+ "content": "<|reserved_special_token_191|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128200": {
+ "content": "<|reserved_special_token_192|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128201": {
+ "content": "<|reserved_special_token_193|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128202": {
+ "content": "<|reserved_special_token_194|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128203": {
+ "content": "<|reserved_special_token_195|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128204": {
+ "content": "<|reserved_special_token_196|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128205": {
+ "content": "<|reserved_special_token_197|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128206": {
+ "content": "<|reserved_special_token_198|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128207": {
+ "content": "<|reserved_special_token_199|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128208": {
+ "content": "<|reserved_special_token_200|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128209": {
+ "content": "<|reserved_special_token_201|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128210": {
+ "content": "<|reserved_special_token_202|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128211": {
+ "content": "<|reserved_special_token_203|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128212": {
+ "content": "<|reserved_special_token_204|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128213": {
+ "content": "<|reserved_special_token_205|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128214": {
+ "content": "<|reserved_special_token_206|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128215": {
+ "content": "<|reserved_special_token_207|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128216": {
+ "content": "<|reserved_special_token_208|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128217": {
+ "content": "<|reserved_special_token_209|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128218": {
+ "content": "<|reserved_special_token_210|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128219": {
+ "content": "<|reserved_special_token_211|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128220": {
+ "content": "<|reserved_special_token_212|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128221": {
+ "content": "<|reserved_special_token_213|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128222": {
+ "content": "<|reserved_special_token_214|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128223": {
+ "content": "<|reserved_special_token_215|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128224": {
+ "content": "<|reserved_special_token_216|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128225": {
+ "content": "<|reserved_special_token_217|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128226": {
+ "content": "<|reserved_special_token_218|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128227": {
+ "content": "<|reserved_special_token_219|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128228": {
+ "content": "<|reserved_special_token_220|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128229": {
+ "content": "<|reserved_special_token_221|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128230": {
+ "content": "<|reserved_special_token_222|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128231": {
+ "content": "<|reserved_special_token_223|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128232": {
+ "content": "<|reserved_special_token_224|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128233": {
+ "content": "<|reserved_special_token_225|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128234": {
+ "content": "<|reserved_special_token_226|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128235": {
+ "content": "<|reserved_special_token_227|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128236": {
+ "content": "<|reserved_special_token_228|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128237": {
+ "content": "<|reserved_special_token_229|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128238": {
+ "content": "<|reserved_special_token_230|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128239": {
+ "content": "<|reserved_special_token_231|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128240": {
+ "content": "<|reserved_special_token_232|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128241": {
+ "content": "<|reserved_special_token_233|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128242": {
+ "content": "<|reserved_special_token_234|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128243": {
+ "content": "<|reserved_special_token_235|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128244": {
+ "content": "<|reserved_special_token_236|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128245": {
+ "content": "<|reserved_special_token_237|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128246": {
+ "content": "<|reserved_special_token_238|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128247": {
+ "content": "<|reserved_special_token_239|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128248": {
+ "content": "<|reserved_special_token_240|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128249": {
+ "content": "<|reserved_special_token_241|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128250": {
+ "content": "<|reserved_special_token_242|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128251": {
+ "content": "<|reserved_special_token_243|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128252": {
+ "content": "<|reserved_special_token_244|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128253": {
+ "content": "<|reserved_special_token_245|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128254": {
+ "content": "<|reserved_special_token_246|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128255": {
+ "content": "<|reserved_special_token_247|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|begin_of_text|>",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|end_of_text|>",
+ "extra_special_tokens": {},
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 131072,
+ "pad_token": "<|end_of_text|>",
+ "tokenizer_class": "PreTrainedTokenizer"
+}
diff --git a/checkpoint-10575/trainer_state.json b/checkpoint-10575/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..fd40e2b98cbd186a1474c3a0212450bb18500eef
--- /dev/null
+++ b/checkpoint-10575/trainer_state.json
@@ -0,0 +1,74058 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 5.0,
+ "eval_steps": 500,
+ "global_step": 10575,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.00047281323877068556,
+ "grad_norm": 5.163570880889893,
+ "learning_rate": 5.0000000000000004e-08,
+ "loss": 1.4628,
+ "step": 1
+ },
+ {
+ "epoch": 0.0009456264775413711,
+ "grad_norm": 6.298020839691162,
+ "learning_rate": 1.0000000000000001e-07,
+ "loss": 1.5003,
+ "step": 2
+ },
+ {
+ "epoch": 0.0014184397163120568,
+ "grad_norm": 5.853623390197754,
+ "learning_rate": 1.5000000000000002e-07,
+ "loss": 1.4495,
+ "step": 3
+ },
+ {
+ "epoch": 0.0018912529550827422,
+ "grad_norm": 5.456025123596191,
+ "learning_rate": 2.0000000000000002e-07,
+ "loss": 1.3798,
+ "step": 4
+ },
+ {
+ "epoch": 0.002364066193853428,
+ "grad_norm": 5.757407188415527,
+ "learning_rate": 2.5000000000000004e-07,
+ "loss": 1.4515,
+ "step": 5
+ },
+ {
+ "epoch": 0.0028368794326241137,
+ "grad_norm": 5.872277736663818,
+ "learning_rate": 3.0000000000000004e-07,
+ "loss": 1.4424,
+ "step": 6
+ },
+ {
+ "epoch": 0.003309692671394799,
+ "grad_norm": 6.7816009521484375,
+ "learning_rate": 3.5000000000000004e-07,
+ "loss": 1.4004,
+ "step": 7
+ },
+ {
+ "epoch": 0.0037825059101654845,
+ "grad_norm": 6.229667663574219,
+ "learning_rate": 4.0000000000000003e-07,
+ "loss": 1.4494,
+ "step": 8
+ },
+ {
+ "epoch": 0.00425531914893617,
+ "grad_norm": 5.336202621459961,
+ "learning_rate": 4.5000000000000003e-07,
+ "loss": 1.3916,
+ "step": 9
+ },
+ {
+ "epoch": 0.004728132387706856,
+ "grad_norm": 5.589445114135742,
+ "learning_rate": 5.000000000000001e-07,
+ "loss": 1.2318,
+ "step": 10
+ },
+ {
+ "epoch": 0.005200945626477541,
+ "grad_norm": 5.720539569854736,
+ "learning_rate": 5.5e-07,
+ "loss": 1.4367,
+ "step": 11
+ },
+ {
+ "epoch": 0.005673758865248227,
+ "grad_norm": 5.913913726806641,
+ "learning_rate": 6.000000000000001e-07,
+ "loss": 1.342,
+ "step": 12
+ },
+ {
+ "epoch": 0.006146572104018913,
+ "grad_norm": 5.899744987487793,
+ "learning_rate": 6.5e-07,
+ "loss": 1.4307,
+ "step": 13
+ },
+ {
+ "epoch": 0.006619385342789598,
+ "grad_norm": 5.571037292480469,
+ "learning_rate": 7.000000000000001e-07,
+ "loss": 1.3372,
+ "step": 14
+ },
+ {
+ "epoch": 0.0070921985815602835,
+ "grad_norm": 5.480010509490967,
+ "learning_rate": 7.5e-07,
+ "loss": 1.3923,
+ "step": 15
+ },
+ {
+ "epoch": 0.007565011820330969,
+ "grad_norm": 5.254702091217041,
+ "learning_rate": 8.000000000000001e-07,
+ "loss": 1.2928,
+ "step": 16
+ },
+ {
+ "epoch": 0.008037825059101654,
+ "grad_norm": 6.090312480926514,
+ "learning_rate": 8.500000000000001e-07,
+ "loss": 1.4984,
+ "step": 17
+ },
+ {
+ "epoch": 0.00851063829787234,
+ "grad_norm": 5.689319610595703,
+ "learning_rate": 9.000000000000001e-07,
+ "loss": 1.4108,
+ "step": 18
+ },
+ {
+ "epoch": 0.008983451536643027,
+ "grad_norm": 5.386685848236084,
+ "learning_rate": 9.500000000000001e-07,
+ "loss": 1.425,
+ "step": 19
+ },
+ {
+ "epoch": 0.009456264775413711,
+ "grad_norm": 6.451584815979004,
+ "learning_rate": 1.0000000000000002e-06,
+ "loss": 1.5507,
+ "step": 20
+ },
+ {
+ "epoch": 0.009929078014184398,
+ "grad_norm": 5.37647008895874,
+ "learning_rate": 1.0500000000000001e-06,
+ "loss": 1.4109,
+ "step": 21
+ },
+ {
+ "epoch": 0.010401891252955082,
+ "grad_norm": 4.716553211212158,
+ "learning_rate": 1.1e-06,
+ "loss": 1.2028,
+ "step": 22
+ },
+ {
+ "epoch": 0.010874704491725768,
+ "grad_norm": 4.950989723205566,
+ "learning_rate": 1.1500000000000002e-06,
+ "loss": 1.3043,
+ "step": 23
+ },
+ {
+ "epoch": 0.011347517730496455,
+ "grad_norm": 4.688975811004639,
+ "learning_rate": 1.2000000000000002e-06,
+ "loss": 1.2708,
+ "step": 24
+ },
+ {
+ "epoch": 0.01182033096926714,
+ "grad_norm": 4.905868053436279,
+ "learning_rate": 1.25e-06,
+ "loss": 1.3268,
+ "step": 25
+ },
+ {
+ "epoch": 0.012293144208037825,
+ "grad_norm": 4.503395080566406,
+ "learning_rate": 1.3e-06,
+ "loss": 1.1799,
+ "step": 26
+ },
+ {
+ "epoch": 0.01276595744680851,
+ "grad_norm": 4.77382230758667,
+ "learning_rate": 1.3500000000000002e-06,
+ "loss": 1.3882,
+ "step": 27
+ },
+ {
+ "epoch": 0.013238770685579196,
+ "grad_norm": 4.734329700469971,
+ "learning_rate": 1.4000000000000001e-06,
+ "loss": 1.3476,
+ "step": 28
+ },
+ {
+ "epoch": 0.013711583924349883,
+ "grad_norm": 4.775066375732422,
+ "learning_rate": 1.45e-06,
+ "loss": 1.2429,
+ "step": 29
+ },
+ {
+ "epoch": 0.014184397163120567,
+ "grad_norm": 4.978334426879883,
+ "learning_rate": 1.5e-06,
+ "loss": 1.2119,
+ "step": 30
+ },
+ {
+ "epoch": 0.014657210401891253,
+ "grad_norm": 4.506785869598389,
+ "learning_rate": 1.5500000000000002e-06,
+ "loss": 1.3157,
+ "step": 31
+ },
+ {
+ "epoch": 0.015130023640661938,
+ "grad_norm": 4.007757186889648,
+ "learning_rate": 1.6000000000000001e-06,
+ "loss": 1.1451,
+ "step": 32
+ },
+ {
+ "epoch": 0.015602836879432624,
+ "grad_norm": 3.6621618270874023,
+ "learning_rate": 1.6500000000000003e-06,
+ "loss": 1.093,
+ "step": 33
+ },
+ {
+ "epoch": 0.01607565011820331,
+ "grad_norm": 3.8733766078948975,
+ "learning_rate": 1.7000000000000002e-06,
+ "loss": 1.2289,
+ "step": 34
+ },
+ {
+ "epoch": 0.016548463356973995,
+ "grad_norm": 4.3391900062561035,
+ "learning_rate": 1.75e-06,
+ "loss": 1.1453,
+ "step": 35
+ },
+ {
+ "epoch": 0.01702127659574468,
+ "grad_norm": 3.287623643875122,
+ "learning_rate": 1.8000000000000001e-06,
+ "loss": 1.0257,
+ "step": 36
+ },
+ {
+ "epoch": 0.017494089834515367,
+ "grad_norm": 3.591721773147583,
+ "learning_rate": 1.85e-06,
+ "loss": 0.9976,
+ "step": 37
+ },
+ {
+ "epoch": 0.017966903073286054,
+ "grad_norm": 4.028271675109863,
+ "learning_rate": 1.9000000000000002e-06,
+ "loss": 1.0773,
+ "step": 38
+ },
+ {
+ "epoch": 0.018439716312056736,
+ "grad_norm": 3.3543951511383057,
+ "learning_rate": 1.9500000000000004e-06,
+ "loss": 1.1677,
+ "step": 39
+ },
+ {
+ "epoch": 0.018912529550827423,
+ "grad_norm": 3.807624340057373,
+ "learning_rate": 2.0000000000000003e-06,
+ "loss": 1.1232,
+ "step": 40
+ },
+ {
+ "epoch": 0.01938534278959811,
+ "grad_norm": 4.242797374725342,
+ "learning_rate": 2.05e-06,
+ "loss": 1.1819,
+ "step": 41
+ },
+ {
+ "epoch": 0.019858156028368795,
+ "grad_norm": 3.4574992656707764,
+ "learning_rate": 2.1000000000000002e-06,
+ "loss": 0.9878,
+ "step": 42
+ },
+ {
+ "epoch": 0.02033096926713948,
+ "grad_norm": 3.906695604324341,
+ "learning_rate": 2.15e-06,
+ "loss": 1.0592,
+ "step": 43
+ },
+ {
+ "epoch": 0.020803782505910164,
+ "grad_norm": 3.7543163299560547,
+ "learning_rate": 2.2e-06,
+ "loss": 1.0309,
+ "step": 44
+ },
+ {
+ "epoch": 0.02127659574468085,
+ "grad_norm": 3.3777148723602295,
+ "learning_rate": 2.25e-06,
+ "loss": 1.0664,
+ "step": 45
+ },
+ {
+ "epoch": 0.021749408983451537,
+ "grad_norm": 3.6003634929656982,
+ "learning_rate": 2.3000000000000004e-06,
+ "loss": 1.0482,
+ "step": 46
+ },
+ {
+ "epoch": 0.022222222222222223,
+ "grad_norm": 3.3961377143859863,
+ "learning_rate": 2.35e-06,
+ "loss": 1.0252,
+ "step": 47
+ },
+ {
+ "epoch": 0.02269503546099291,
+ "grad_norm": 3.1601035594940186,
+ "learning_rate": 2.4000000000000003e-06,
+ "loss": 1.0435,
+ "step": 48
+ },
+ {
+ "epoch": 0.023167848699763592,
+ "grad_norm": 3.4192967414855957,
+ "learning_rate": 2.4500000000000003e-06,
+ "loss": 1.0935,
+ "step": 49
+ },
+ {
+ "epoch": 0.02364066193853428,
+ "grad_norm": 3.1225922107696533,
+ "learning_rate": 2.5e-06,
+ "loss": 0.8988,
+ "step": 50
+ },
+ {
+ "epoch": 0.024113475177304965,
+ "grad_norm": 3.1423380374908447,
+ "learning_rate": 2.55e-06,
+ "loss": 1.0159,
+ "step": 51
+ },
+ {
+ "epoch": 0.02458628841607565,
+ "grad_norm": 3.4782402515411377,
+ "learning_rate": 2.6e-06,
+ "loss": 1.0231,
+ "step": 52
+ },
+ {
+ "epoch": 0.025059101654846337,
+ "grad_norm": 3.8362693786621094,
+ "learning_rate": 2.6500000000000005e-06,
+ "loss": 1.0725,
+ "step": 53
+ },
+ {
+ "epoch": 0.02553191489361702,
+ "grad_norm": 3.033294916152954,
+ "learning_rate": 2.7000000000000004e-06,
+ "loss": 0.9377,
+ "step": 54
+ },
+ {
+ "epoch": 0.026004728132387706,
+ "grad_norm": 3.849741220474243,
+ "learning_rate": 2.7500000000000004e-06,
+ "loss": 1.0046,
+ "step": 55
+ },
+ {
+ "epoch": 0.026477541371158392,
+ "grad_norm": 3.141876220703125,
+ "learning_rate": 2.8000000000000003e-06,
+ "loss": 0.9226,
+ "step": 56
+ },
+ {
+ "epoch": 0.02695035460992908,
+ "grad_norm": 2.773594856262207,
+ "learning_rate": 2.85e-06,
+ "loss": 0.8662,
+ "step": 57
+ },
+ {
+ "epoch": 0.027423167848699765,
+ "grad_norm": 3.1460225582122803,
+ "learning_rate": 2.9e-06,
+ "loss": 0.9304,
+ "step": 58
+ },
+ {
+ "epoch": 0.027895981087470448,
+ "grad_norm": 3.293583631515503,
+ "learning_rate": 2.95e-06,
+ "loss": 1.0374,
+ "step": 59
+ },
+ {
+ "epoch": 0.028368794326241134,
+ "grad_norm": 3.8190863132476807,
+ "learning_rate": 3e-06,
+ "loss": 0.971,
+ "step": 60
+ },
+ {
+ "epoch": 0.02884160756501182,
+ "grad_norm": 3.4566776752471924,
+ "learning_rate": 3.05e-06,
+ "loss": 0.9631,
+ "step": 61
+ },
+ {
+ "epoch": 0.029314420803782507,
+ "grad_norm": 3.355741500854492,
+ "learning_rate": 3.1000000000000004e-06,
+ "loss": 1.0097,
+ "step": 62
+ },
+ {
+ "epoch": 0.029787234042553193,
+ "grad_norm": 3.29746675491333,
+ "learning_rate": 3.1500000000000003e-06,
+ "loss": 0.9459,
+ "step": 63
+ },
+ {
+ "epoch": 0.030260047281323876,
+ "grad_norm": 3.3122968673706055,
+ "learning_rate": 3.2000000000000003e-06,
+ "loss": 0.8594,
+ "step": 64
+ },
+ {
+ "epoch": 0.030732860520094562,
+ "grad_norm": 3.477701187133789,
+ "learning_rate": 3.2500000000000002e-06,
+ "loss": 0.9197,
+ "step": 65
+ },
+ {
+ "epoch": 0.031205673758865248,
+ "grad_norm": 3.3363406658172607,
+ "learning_rate": 3.3000000000000006e-06,
+ "loss": 0.9478,
+ "step": 66
+ },
+ {
+ "epoch": 0.03167848699763593,
+ "grad_norm": 4.143295764923096,
+ "learning_rate": 3.3500000000000005e-06,
+ "loss": 1.0534,
+ "step": 67
+ },
+ {
+ "epoch": 0.03215130023640662,
+ "grad_norm": 3.2363274097442627,
+ "learning_rate": 3.4000000000000005e-06,
+ "loss": 0.9454,
+ "step": 68
+ },
+ {
+ "epoch": 0.032624113475177303,
+ "grad_norm": 3.198746681213379,
+ "learning_rate": 3.45e-06,
+ "loss": 0.9388,
+ "step": 69
+ },
+ {
+ "epoch": 0.03309692671394799,
+ "grad_norm": 3.5751023292541504,
+ "learning_rate": 3.5e-06,
+ "loss": 0.9444,
+ "step": 70
+ },
+ {
+ "epoch": 0.033569739952718676,
+ "grad_norm": 3.1745729446411133,
+ "learning_rate": 3.5500000000000003e-06,
+ "loss": 0.8683,
+ "step": 71
+ },
+ {
+ "epoch": 0.03404255319148936,
+ "grad_norm": 3.3210883140563965,
+ "learning_rate": 3.6000000000000003e-06,
+ "loss": 0.8811,
+ "step": 72
+ },
+ {
+ "epoch": 0.03451536643026005,
+ "grad_norm": 3.2502429485321045,
+ "learning_rate": 3.65e-06,
+ "loss": 1.0012,
+ "step": 73
+ },
+ {
+ "epoch": 0.034988179669030735,
+ "grad_norm": 3.44598126411438,
+ "learning_rate": 3.7e-06,
+ "loss": 0.9217,
+ "step": 74
+ },
+ {
+ "epoch": 0.03546099290780142,
+ "grad_norm": 3.439117431640625,
+ "learning_rate": 3.7500000000000005e-06,
+ "loss": 0.8976,
+ "step": 75
+ },
+ {
+ "epoch": 0.03593380614657211,
+ "grad_norm": 3.523627758026123,
+ "learning_rate": 3.8000000000000005e-06,
+ "loss": 0.8996,
+ "step": 76
+ },
+ {
+ "epoch": 0.03640661938534279,
+ "grad_norm": 3.3716015815734863,
+ "learning_rate": 3.85e-06,
+ "loss": 0.9061,
+ "step": 77
+ },
+ {
+ "epoch": 0.03687943262411347,
+ "grad_norm": 3.33518385887146,
+ "learning_rate": 3.900000000000001e-06,
+ "loss": 0.9371,
+ "step": 78
+ },
+ {
+ "epoch": 0.03735224586288416,
+ "grad_norm": 3.833829879760742,
+ "learning_rate": 3.95e-06,
+ "loss": 0.9669,
+ "step": 79
+ },
+ {
+ "epoch": 0.037825059101654845,
+ "grad_norm": 3.260446786880493,
+ "learning_rate": 4.000000000000001e-06,
+ "loss": 0.9449,
+ "step": 80
+ },
+ {
+ "epoch": 0.03829787234042553,
+ "grad_norm": 3.532451629638672,
+ "learning_rate": 4.05e-06,
+ "loss": 0.897,
+ "step": 81
+ },
+ {
+ "epoch": 0.03877068557919622,
+ "grad_norm": 3.1156492233276367,
+ "learning_rate": 4.1e-06,
+ "loss": 0.8463,
+ "step": 82
+ },
+ {
+ "epoch": 0.039243498817966904,
+ "grad_norm": 2.8801751136779785,
+ "learning_rate": 4.15e-06,
+ "loss": 0.8616,
+ "step": 83
+ },
+ {
+ "epoch": 0.03971631205673759,
+ "grad_norm": 3.072476863861084,
+ "learning_rate": 4.2000000000000004e-06,
+ "loss": 0.8387,
+ "step": 84
+ },
+ {
+ "epoch": 0.04018912529550828,
+ "grad_norm": 2.9601376056671143,
+ "learning_rate": 4.25e-06,
+ "loss": 0.8538,
+ "step": 85
+ },
+ {
+ "epoch": 0.04066193853427896,
+ "grad_norm": 3.521664619445801,
+ "learning_rate": 4.3e-06,
+ "loss": 0.8894,
+ "step": 86
+ },
+ {
+ "epoch": 0.04113475177304964,
+ "grad_norm": 3.2670981884002686,
+ "learning_rate": 4.350000000000001e-06,
+ "loss": 0.8387,
+ "step": 87
+ },
+ {
+ "epoch": 0.04160756501182033,
+ "grad_norm": 3.422089099884033,
+ "learning_rate": 4.4e-06,
+ "loss": 0.7728,
+ "step": 88
+ },
+ {
+ "epoch": 0.042080378250591015,
+ "grad_norm": 3.414034128189087,
+ "learning_rate": 4.450000000000001e-06,
+ "loss": 0.7968,
+ "step": 89
+ },
+ {
+ "epoch": 0.0425531914893617,
+ "grad_norm": 4.234285354614258,
+ "learning_rate": 4.5e-06,
+ "loss": 0.8502,
+ "step": 90
+ },
+ {
+ "epoch": 0.04302600472813239,
+ "grad_norm": 3.1446919441223145,
+ "learning_rate": 4.5500000000000005e-06,
+ "loss": 0.8236,
+ "step": 91
+ },
+ {
+ "epoch": 0.043498817966903074,
+ "grad_norm": 3.683443307876587,
+ "learning_rate": 4.600000000000001e-06,
+ "loss": 0.9792,
+ "step": 92
+ },
+ {
+ "epoch": 0.04397163120567376,
+ "grad_norm": 3.664219617843628,
+ "learning_rate": 4.65e-06,
+ "loss": 0.8743,
+ "step": 93
+ },
+ {
+ "epoch": 0.044444444444444446,
+ "grad_norm": 3.369479179382324,
+ "learning_rate": 4.7e-06,
+ "loss": 0.8741,
+ "step": 94
+ },
+ {
+ "epoch": 0.04491725768321513,
+ "grad_norm": 3.694949150085449,
+ "learning_rate": 4.75e-06,
+ "loss": 0.7574,
+ "step": 95
+ },
+ {
+ "epoch": 0.04539007092198582,
+ "grad_norm": 3.5144498348236084,
+ "learning_rate": 4.800000000000001e-06,
+ "loss": 0.9934,
+ "step": 96
+ },
+ {
+ "epoch": 0.0458628841607565,
+ "grad_norm": 3.164451837539673,
+ "learning_rate": 4.85e-06,
+ "loss": 0.7463,
+ "step": 97
+ },
+ {
+ "epoch": 0.046335697399527184,
+ "grad_norm": 3.222785472869873,
+ "learning_rate": 4.9000000000000005e-06,
+ "loss": 0.7698,
+ "step": 98
+ },
+ {
+ "epoch": 0.04680851063829787,
+ "grad_norm": 2.9129555225372314,
+ "learning_rate": 4.95e-06,
+ "loss": 0.7856,
+ "step": 99
+ },
+ {
+ "epoch": 0.04728132387706856,
+ "grad_norm": 3.5061235427856445,
+ "learning_rate": 5e-06,
+ "loss": 0.8588,
+ "step": 100
+ },
+ {
+ "epoch": 0.04775413711583924,
+ "grad_norm": 3.2805044651031494,
+ "learning_rate": 4.999999922167982e-06,
+ "loss": 0.7643,
+ "step": 101
+ },
+ {
+ "epoch": 0.04822695035460993,
+ "grad_norm": 3.5461678504943848,
+ "learning_rate": 4.999999688671929e-06,
+ "loss": 0.8253,
+ "step": 102
+ },
+ {
+ "epoch": 0.048699763593380616,
+ "grad_norm": 3.2238264083862305,
+ "learning_rate": 4.99999929951186e-06,
+ "loss": 0.7622,
+ "step": 103
+ },
+ {
+ "epoch": 0.0491725768321513,
+ "grad_norm": 3.818955898284912,
+ "learning_rate": 4.999998754687795e-06,
+ "loss": 0.8471,
+ "step": 104
+ },
+ {
+ "epoch": 0.04964539007092199,
+ "grad_norm": 3.1252424716949463,
+ "learning_rate": 4.99999805419977e-06,
+ "loss": 0.8409,
+ "step": 105
+ },
+ {
+ "epoch": 0.050118203309692674,
+ "grad_norm": 3.604283571243286,
+ "learning_rate": 4.999997198047828e-06,
+ "loss": 0.9027,
+ "step": 106
+ },
+ {
+ "epoch": 0.050591016548463354,
+ "grad_norm": 3.6752424240112305,
+ "learning_rate": 4.999996186232023e-06,
+ "loss": 0.9336,
+ "step": 107
+ },
+ {
+ "epoch": 0.05106382978723404,
+ "grad_norm": 3.517557144165039,
+ "learning_rate": 4.9999950187524184e-06,
+ "loss": 0.8351,
+ "step": 108
+ },
+ {
+ "epoch": 0.051536643026004726,
+ "grad_norm": 3.427285671234131,
+ "learning_rate": 4.999993695609085e-06,
+ "loss": 0.8457,
+ "step": 109
+ },
+ {
+ "epoch": 0.05200945626477541,
+ "grad_norm": 3.2792510986328125,
+ "learning_rate": 4.999992216802107e-06,
+ "loss": 0.8391,
+ "step": 110
+ },
+ {
+ "epoch": 0.0524822695035461,
+ "grad_norm": 3.581094741821289,
+ "learning_rate": 4.999990582331576e-06,
+ "loss": 0.7533,
+ "step": 111
+ },
+ {
+ "epoch": 0.052955082742316785,
+ "grad_norm": 3.1667377948760986,
+ "learning_rate": 4.999988792197593e-06,
+ "loss": 0.9562,
+ "step": 112
+ },
+ {
+ "epoch": 0.05342789598108747,
+ "grad_norm": 3.3609890937805176,
+ "learning_rate": 4.99998684640027e-06,
+ "loss": 0.8181,
+ "step": 113
+ },
+ {
+ "epoch": 0.05390070921985816,
+ "grad_norm": 3.260627269744873,
+ "learning_rate": 4.999984744939729e-06,
+ "loss": 0.8012,
+ "step": 114
+ },
+ {
+ "epoch": 0.054373522458628844,
+ "grad_norm": 3.4535653591156006,
+ "learning_rate": 4.9999824878160985e-06,
+ "loss": 0.919,
+ "step": 115
+ },
+ {
+ "epoch": 0.05484633569739953,
+ "grad_norm": 3.4880740642547607,
+ "learning_rate": 4.999980075029522e-06,
+ "loss": 0.8114,
+ "step": 116
+ },
+ {
+ "epoch": 0.05531914893617021,
+ "grad_norm": 3.2546932697296143,
+ "learning_rate": 4.999977506580147e-06,
+ "loss": 0.8274,
+ "step": 117
+ },
+ {
+ "epoch": 0.055791962174940896,
+ "grad_norm": 3.2762744426727295,
+ "learning_rate": 4.999974782468136e-06,
+ "loss": 0.9018,
+ "step": 118
+ },
+ {
+ "epoch": 0.05626477541371158,
+ "grad_norm": 3.42825984954834,
+ "learning_rate": 4.999971902693657e-06,
+ "loss": 0.8262,
+ "step": 119
+ },
+ {
+ "epoch": 0.05673758865248227,
+ "grad_norm": 3.082496404647827,
+ "learning_rate": 4.99996886725689e-06,
+ "loss": 0.8181,
+ "step": 120
+ },
+ {
+ "epoch": 0.057210401891252954,
+ "grad_norm": 3.322869300842285,
+ "learning_rate": 4.9999656761580225e-06,
+ "loss": 0.8382,
+ "step": 121
+ },
+ {
+ "epoch": 0.05768321513002364,
+ "grad_norm": 3.6365339756011963,
+ "learning_rate": 4.9999623293972555e-06,
+ "loss": 0.7489,
+ "step": 122
+ },
+ {
+ "epoch": 0.05815602836879433,
+ "grad_norm": 3.376352548599243,
+ "learning_rate": 4.999958826974796e-06,
+ "loss": 0.9012,
+ "step": 123
+ },
+ {
+ "epoch": 0.05862884160756501,
+ "grad_norm": 3.49088716506958,
+ "learning_rate": 4.999955168890862e-06,
+ "loss": 0.8999,
+ "step": 124
+ },
+ {
+ "epoch": 0.0591016548463357,
+ "grad_norm": 3.3265068531036377,
+ "learning_rate": 4.999951355145682e-06,
+ "loss": 0.8161,
+ "step": 125
+ },
+ {
+ "epoch": 0.059574468085106386,
+ "grad_norm": 3.697282314300537,
+ "learning_rate": 4.999947385739493e-06,
+ "loss": 0.9623,
+ "step": 126
+ },
+ {
+ "epoch": 0.06004728132387707,
+ "grad_norm": 2.7901928424835205,
+ "learning_rate": 4.999943260672542e-06,
+ "loss": 0.7371,
+ "step": 127
+ },
+ {
+ "epoch": 0.06052009456264775,
+ "grad_norm": 3.110319137573242,
+ "learning_rate": 4.999938979945086e-06,
+ "loss": 0.715,
+ "step": 128
+ },
+ {
+ "epoch": 0.06099290780141844,
+ "grad_norm": 3.2211520671844482,
+ "learning_rate": 4.999934543557392e-06,
+ "loss": 0.8888,
+ "step": 129
+ },
+ {
+ "epoch": 0.061465721040189124,
+ "grad_norm": 3.2466187477111816,
+ "learning_rate": 4.999929951509735e-06,
+ "loss": 0.9389,
+ "step": 130
+ },
+ {
+ "epoch": 0.06193853427895981,
+ "grad_norm": 3.3574399948120117,
+ "learning_rate": 4.999925203802403e-06,
+ "loss": 0.8263,
+ "step": 131
+ },
+ {
+ "epoch": 0.062411347517730496,
+ "grad_norm": 3.275601625442505,
+ "learning_rate": 4.99992030043569e-06,
+ "loss": 0.8338,
+ "step": 132
+ },
+ {
+ "epoch": 0.06288416075650118,
+ "grad_norm": 3.6011312007904053,
+ "learning_rate": 4.999915241409902e-06,
+ "loss": 0.8351,
+ "step": 133
+ },
+ {
+ "epoch": 0.06335697399527186,
+ "grad_norm": 2.969011068344116,
+ "learning_rate": 4.999910026725352e-06,
+ "loss": 0.79,
+ "step": 134
+ },
+ {
+ "epoch": 0.06382978723404255,
+ "grad_norm": 3.690784454345703,
+ "learning_rate": 4.999904656382369e-06,
+ "loss": 0.8209,
+ "step": 135
+ },
+ {
+ "epoch": 0.06430260047281323,
+ "grad_norm": 3.3363115787506104,
+ "learning_rate": 4.999899130381283e-06,
+ "loss": 0.858,
+ "step": 136
+ },
+ {
+ "epoch": 0.06477541371158392,
+ "grad_norm": 3.206881523132324,
+ "learning_rate": 4.9998934487224405e-06,
+ "loss": 0.834,
+ "step": 137
+ },
+ {
+ "epoch": 0.06524822695035461,
+ "grad_norm": 2.773146152496338,
+ "learning_rate": 4.999887611406195e-06,
+ "loss": 0.7576,
+ "step": 138
+ },
+ {
+ "epoch": 0.0657210401891253,
+ "grad_norm": 3.307725667953491,
+ "learning_rate": 4.999881618432908e-06,
+ "loss": 0.7487,
+ "step": 139
+ },
+ {
+ "epoch": 0.06619385342789598,
+ "grad_norm": 4.273657321929932,
+ "learning_rate": 4.999875469802956e-06,
+ "loss": 0.8176,
+ "step": 140
+ },
+ {
+ "epoch": 0.06666666666666667,
+ "grad_norm": 3.0898005962371826,
+ "learning_rate": 4.999869165516719e-06,
+ "loss": 0.7578,
+ "step": 141
+ },
+ {
+ "epoch": 0.06713947990543735,
+ "grad_norm": 3.25150990486145,
+ "learning_rate": 4.9998627055745915e-06,
+ "loss": 0.7873,
+ "step": 142
+ },
+ {
+ "epoch": 0.06761229314420804,
+ "grad_norm": 2.9705755710601807,
+ "learning_rate": 4.999856089976974e-06,
+ "loss": 0.6473,
+ "step": 143
+ },
+ {
+ "epoch": 0.06808510638297872,
+ "grad_norm": 3.5658507347106934,
+ "learning_rate": 4.9998493187242804e-06,
+ "loss": 0.855,
+ "step": 144
+ },
+ {
+ "epoch": 0.06855791962174941,
+ "grad_norm": 3.3994076251983643,
+ "learning_rate": 4.99984239181693e-06,
+ "loss": 0.7926,
+ "step": 145
+ },
+ {
+ "epoch": 0.0690307328605201,
+ "grad_norm": 2.8266260623931885,
+ "learning_rate": 4.999835309255357e-06,
+ "loss": 0.7564,
+ "step": 146
+ },
+ {
+ "epoch": 0.06950354609929078,
+ "grad_norm": 3.1143875122070312,
+ "learning_rate": 4.999828071039999e-06,
+ "loss": 0.8398,
+ "step": 147
+ },
+ {
+ "epoch": 0.06997635933806147,
+ "grad_norm": 2.9364278316497803,
+ "learning_rate": 4.99982067717131e-06,
+ "loss": 0.7381,
+ "step": 148
+ },
+ {
+ "epoch": 0.07044917257683216,
+ "grad_norm": 3.4155616760253906,
+ "learning_rate": 4.999813127649748e-06,
+ "loss": 0.7933,
+ "step": 149
+ },
+ {
+ "epoch": 0.07092198581560284,
+ "grad_norm": 4.371236324310303,
+ "learning_rate": 4.999805422475784e-06,
+ "loss": 0.8292,
+ "step": 150
+ },
+ {
+ "epoch": 0.07139479905437353,
+ "grad_norm": 3.3967185020446777,
+ "learning_rate": 4.999797561649897e-06,
+ "loss": 0.8712,
+ "step": 151
+ },
+ {
+ "epoch": 0.07186761229314421,
+ "grad_norm": 3.343303680419922,
+ "learning_rate": 4.999789545172578e-06,
+ "loss": 0.8177,
+ "step": 152
+ },
+ {
+ "epoch": 0.07234042553191489,
+ "grad_norm": 3.040235757827759,
+ "learning_rate": 4.999781373044325e-06,
+ "loss": 0.7379,
+ "step": 153
+ },
+ {
+ "epoch": 0.07281323877068557,
+ "grad_norm": 3.4069204330444336,
+ "learning_rate": 4.999773045265647e-06,
+ "loss": 0.7939,
+ "step": 154
+ },
+ {
+ "epoch": 0.07328605200945626,
+ "grad_norm": 3.1939475536346436,
+ "learning_rate": 4.999764561837063e-06,
+ "loss": 0.8037,
+ "step": 155
+ },
+ {
+ "epoch": 0.07375886524822695,
+ "grad_norm": 4.452004909515381,
+ "learning_rate": 4.999755922759101e-06,
+ "loss": 0.8421,
+ "step": 156
+ },
+ {
+ "epoch": 0.07423167848699763,
+ "grad_norm": 3.2031240463256836,
+ "learning_rate": 4.999747128032298e-06,
+ "loss": 0.794,
+ "step": 157
+ },
+ {
+ "epoch": 0.07470449172576832,
+ "grad_norm": 3.175920009613037,
+ "learning_rate": 4.999738177657203e-06,
+ "loss": 0.759,
+ "step": 158
+ },
+ {
+ "epoch": 0.075177304964539,
+ "grad_norm": 3.7679688930511475,
+ "learning_rate": 4.9997290716343725e-06,
+ "loss": 0.8174,
+ "step": 159
+ },
+ {
+ "epoch": 0.07565011820330969,
+ "grad_norm": 3.7020037174224854,
+ "learning_rate": 4.999719809964373e-06,
+ "loss": 0.7116,
+ "step": 160
+ },
+ {
+ "epoch": 0.07612293144208038,
+ "grad_norm": 4.357471942901611,
+ "learning_rate": 4.999710392647783e-06,
+ "loss": 0.7649,
+ "step": 161
+ },
+ {
+ "epoch": 0.07659574468085106,
+ "grad_norm": 3.3439087867736816,
+ "learning_rate": 4.999700819685187e-06,
+ "loss": 0.7907,
+ "step": 162
+ },
+ {
+ "epoch": 0.07706855791962175,
+ "grad_norm": 3.210815191268921,
+ "learning_rate": 4.999691091077182e-06,
+ "loss": 0.8446,
+ "step": 163
+ },
+ {
+ "epoch": 0.07754137115839244,
+ "grad_norm": 3.1029553413391113,
+ "learning_rate": 4.9996812068243735e-06,
+ "loss": 0.7232,
+ "step": 164
+ },
+ {
+ "epoch": 0.07801418439716312,
+ "grad_norm": 2.9389400482177734,
+ "learning_rate": 4.999671166927378e-06,
+ "loss": 0.7413,
+ "step": 165
+ },
+ {
+ "epoch": 0.07848699763593381,
+ "grad_norm": 3.7062697410583496,
+ "learning_rate": 4.9996609713868185e-06,
+ "loss": 0.8773,
+ "step": 166
+ },
+ {
+ "epoch": 0.0789598108747045,
+ "grad_norm": 3.2768924236297607,
+ "learning_rate": 4.999650620203332e-06,
+ "loss": 0.8046,
+ "step": 167
+ },
+ {
+ "epoch": 0.07943262411347518,
+ "grad_norm": 3.380373001098633,
+ "learning_rate": 4.999640113377561e-06,
+ "loss": 0.7529,
+ "step": 168
+ },
+ {
+ "epoch": 0.07990543735224587,
+ "grad_norm": 3.520022392272949,
+ "learning_rate": 4.999629450910162e-06,
+ "loss": 0.7352,
+ "step": 169
+ },
+ {
+ "epoch": 0.08037825059101655,
+ "grad_norm": 3.43269419670105,
+ "learning_rate": 4.999618632801796e-06,
+ "loss": 0.9371,
+ "step": 170
+ },
+ {
+ "epoch": 0.08085106382978724,
+ "grad_norm": 3.555877923965454,
+ "learning_rate": 4.99960765905314e-06,
+ "loss": 0.8276,
+ "step": 171
+ },
+ {
+ "epoch": 0.08132387706855793,
+ "grad_norm": 3.597050189971924,
+ "learning_rate": 4.999596529664874e-06,
+ "loss": 0.8164,
+ "step": 172
+ },
+ {
+ "epoch": 0.0817966903073286,
+ "grad_norm": 3.2002956867218018,
+ "learning_rate": 4.999585244637693e-06,
+ "loss": 0.7824,
+ "step": 173
+ },
+ {
+ "epoch": 0.08226950354609928,
+ "grad_norm": 3.527275562286377,
+ "learning_rate": 4.999573803972299e-06,
+ "loss": 0.8033,
+ "step": 174
+ },
+ {
+ "epoch": 0.08274231678486997,
+ "grad_norm": 3.5184452533721924,
+ "learning_rate": 4.999562207669405e-06,
+ "loss": 0.724,
+ "step": 175
+ },
+ {
+ "epoch": 0.08321513002364066,
+ "grad_norm": 3.6635067462921143,
+ "learning_rate": 4.999550455729732e-06,
+ "loss": 0.819,
+ "step": 176
+ },
+ {
+ "epoch": 0.08368794326241134,
+ "grad_norm": 3.192399740219116,
+ "learning_rate": 4.999538548154012e-06,
+ "loss": 0.7999,
+ "step": 177
+ },
+ {
+ "epoch": 0.08416075650118203,
+ "grad_norm": 3.0946953296661377,
+ "learning_rate": 4.999526484942988e-06,
+ "loss": 0.7367,
+ "step": 178
+ },
+ {
+ "epoch": 0.08463356973995272,
+ "grad_norm": 2.847198009490967,
+ "learning_rate": 4.99951426609741e-06,
+ "loss": 0.7536,
+ "step": 179
+ },
+ {
+ "epoch": 0.0851063829787234,
+ "grad_norm": 2.7674827575683594,
+ "learning_rate": 4.999501891618037e-06,
+ "loss": 0.701,
+ "step": 180
+ },
+ {
+ "epoch": 0.08557919621749409,
+ "grad_norm": 3.357933521270752,
+ "learning_rate": 4.999489361505643e-06,
+ "loss": 0.8331,
+ "step": 181
+ },
+ {
+ "epoch": 0.08605200945626477,
+ "grad_norm": 3.1464426517486572,
+ "learning_rate": 4.999476675761004e-06,
+ "loss": 0.7931,
+ "step": 182
+ },
+ {
+ "epoch": 0.08652482269503546,
+ "grad_norm": 3.310697078704834,
+ "learning_rate": 4.999463834384915e-06,
+ "loss": 0.753,
+ "step": 183
+ },
+ {
+ "epoch": 0.08699763593380615,
+ "grad_norm": 2.9794881343841553,
+ "learning_rate": 4.999450837378171e-06,
+ "loss": 0.7091,
+ "step": 184
+ },
+ {
+ "epoch": 0.08747044917257683,
+ "grad_norm": 3.0776889324188232,
+ "learning_rate": 4.999437684741584e-06,
+ "loss": 0.7226,
+ "step": 185
+ },
+ {
+ "epoch": 0.08794326241134752,
+ "grad_norm": 3.6657519340515137,
+ "learning_rate": 4.999424376475972e-06,
+ "loss": 0.845,
+ "step": 186
+ },
+ {
+ "epoch": 0.0884160756501182,
+ "grad_norm": 3.872718572616577,
+ "learning_rate": 4.999410912582164e-06,
+ "loss": 0.812,
+ "step": 187
+ },
+ {
+ "epoch": 0.08888888888888889,
+ "grad_norm": 2.9184508323669434,
+ "learning_rate": 4.9993972930609976e-06,
+ "loss": 0.6823,
+ "step": 188
+ },
+ {
+ "epoch": 0.08936170212765958,
+ "grad_norm": 3.5567142963409424,
+ "learning_rate": 4.999383517913321e-06,
+ "loss": 0.7614,
+ "step": 189
+ },
+ {
+ "epoch": 0.08983451536643026,
+ "grad_norm": 3.3688533306121826,
+ "learning_rate": 4.999369587139992e-06,
+ "loss": 0.858,
+ "step": 190
+ },
+ {
+ "epoch": 0.09030732860520095,
+ "grad_norm": 2.893223524093628,
+ "learning_rate": 4.99935550074188e-06,
+ "loss": 0.6761,
+ "step": 191
+ },
+ {
+ "epoch": 0.09078014184397164,
+ "grad_norm": 3.400225877761841,
+ "learning_rate": 4.999341258719859e-06,
+ "loss": 0.7531,
+ "step": 192
+ },
+ {
+ "epoch": 0.09125295508274232,
+ "grad_norm": 3.6167714595794678,
+ "learning_rate": 4.999326861074817e-06,
+ "loss": 0.8164,
+ "step": 193
+ },
+ {
+ "epoch": 0.091725768321513,
+ "grad_norm": 4.325016498565674,
+ "learning_rate": 4.9993123078076506e-06,
+ "loss": 0.7069,
+ "step": 194
+ },
+ {
+ "epoch": 0.09219858156028368,
+ "grad_norm": 3.195317029953003,
+ "learning_rate": 4.999297598919266e-06,
+ "loss": 0.726,
+ "step": 195
+ },
+ {
+ "epoch": 0.09267139479905437,
+ "grad_norm": 3.146530866622925,
+ "learning_rate": 4.999282734410579e-06,
+ "loss": 0.7888,
+ "step": 196
+ },
+ {
+ "epoch": 0.09314420803782505,
+ "grad_norm": 3.5166752338409424,
+ "learning_rate": 4.999267714282515e-06,
+ "loss": 0.8473,
+ "step": 197
+ },
+ {
+ "epoch": 0.09361702127659574,
+ "grad_norm": 3.3140196800231934,
+ "learning_rate": 4.99925253853601e-06,
+ "loss": 0.7233,
+ "step": 198
+ },
+ {
+ "epoch": 0.09408983451536643,
+ "grad_norm": 3.0318164825439453,
+ "learning_rate": 4.999237207172008e-06,
+ "loss": 0.7543,
+ "step": 199
+ },
+ {
+ "epoch": 0.09456264775413711,
+ "grad_norm": 3.662214756011963,
+ "learning_rate": 4.999221720191464e-06,
+ "loss": 0.7783,
+ "step": 200
+ },
+ {
+ "epoch": 0.0950354609929078,
+ "grad_norm": 3.452078104019165,
+ "learning_rate": 4.9992060775953425e-06,
+ "loss": 0.7868,
+ "step": 201
+ },
+ {
+ "epoch": 0.09550827423167849,
+ "grad_norm": 3.4051287174224854,
+ "learning_rate": 4.999190279384617e-06,
+ "loss": 0.7849,
+ "step": 202
+ },
+ {
+ "epoch": 0.09598108747044917,
+ "grad_norm": 3.1377196311950684,
+ "learning_rate": 4.999174325560271e-06,
+ "loss": 0.8364,
+ "step": 203
+ },
+ {
+ "epoch": 0.09645390070921986,
+ "grad_norm": 3.129473924636841,
+ "learning_rate": 4.999158216123299e-06,
+ "loss": 0.7458,
+ "step": 204
+ },
+ {
+ "epoch": 0.09692671394799054,
+ "grad_norm": 3.169548749923706,
+ "learning_rate": 4.999141951074703e-06,
+ "loss": 0.7256,
+ "step": 205
+ },
+ {
+ "epoch": 0.09739952718676123,
+ "grad_norm": 3.186009168624878,
+ "learning_rate": 4.999125530415495e-06,
+ "loss": 0.783,
+ "step": 206
+ },
+ {
+ "epoch": 0.09787234042553192,
+ "grad_norm": 3.0995123386383057,
+ "learning_rate": 4.9991089541467e-06,
+ "loss": 0.7519,
+ "step": 207
+ },
+ {
+ "epoch": 0.0983451536643026,
+ "grad_norm": 3.1854088306427,
+ "learning_rate": 4.999092222269348e-06,
+ "loss": 0.7444,
+ "step": 208
+ },
+ {
+ "epoch": 0.09881796690307329,
+ "grad_norm": 3.1512246131896973,
+ "learning_rate": 4.999075334784482e-06,
+ "loss": 0.7882,
+ "step": 209
+ },
+ {
+ "epoch": 0.09929078014184398,
+ "grad_norm": 3.6199698448181152,
+ "learning_rate": 4.999058291693153e-06,
+ "loss": 0.8048,
+ "step": 210
+ },
+ {
+ "epoch": 0.09976359338061466,
+ "grad_norm": 2.956907272338867,
+ "learning_rate": 4.999041092996422e-06,
+ "loss": 0.7663,
+ "step": 211
+ },
+ {
+ "epoch": 0.10023640661938535,
+ "grad_norm": 3.3493971824645996,
+ "learning_rate": 4.99902373869536e-06,
+ "loss": 0.7639,
+ "step": 212
+ },
+ {
+ "epoch": 0.10070921985815603,
+ "grad_norm": 3.144812822341919,
+ "learning_rate": 4.9990062287910475e-06,
+ "loss": 0.7953,
+ "step": 213
+ },
+ {
+ "epoch": 0.10118203309692671,
+ "grad_norm": 3.5986971855163574,
+ "learning_rate": 4.998988563284576e-06,
+ "loss": 0.8297,
+ "step": 214
+ },
+ {
+ "epoch": 0.1016548463356974,
+ "grad_norm": 3.447584867477417,
+ "learning_rate": 4.998970742177044e-06,
+ "loss": 0.808,
+ "step": 215
+ },
+ {
+ "epoch": 0.10212765957446808,
+ "grad_norm": 3.791353940963745,
+ "learning_rate": 4.998952765469562e-06,
+ "loss": 0.8005,
+ "step": 216
+ },
+ {
+ "epoch": 0.10260047281323877,
+ "grad_norm": 3.4490807056427,
+ "learning_rate": 4.998934633163247e-06,
+ "loss": 0.8135,
+ "step": 217
+ },
+ {
+ "epoch": 0.10307328605200945,
+ "grad_norm": 3.1053314208984375,
+ "learning_rate": 4.998916345259232e-06,
+ "loss": 0.7888,
+ "step": 218
+ },
+ {
+ "epoch": 0.10354609929078014,
+ "grad_norm": 3.407862663269043,
+ "learning_rate": 4.9988979017586514e-06,
+ "loss": 0.7099,
+ "step": 219
+ },
+ {
+ "epoch": 0.10401891252955082,
+ "grad_norm": 3.116656541824341,
+ "learning_rate": 4.998879302662658e-06,
+ "loss": 0.8344,
+ "step": 220
+ },
+ {
+ "epoch": 0.10449172576832151,
+ "grad_norm": 3.339264154434204,
+ "learning_rate": 4.998860547972406e-06,
+ "loss": 0.8496,
+ "step": 221
+ },
+ {
+ "epoch": 0.1049645390070922,
+ "grad_norm": 3.251892566680908,
+ "learning_rate": 4.998841637689066e-06,
+ "loss": 0.7455,
+ "step": 222
+ },
+ {
+ "epoch": 0.10543735224586288,
+ "grad_norm": 4.098135471343994,
+ "learning_rate": 4.998822571813814e-06,
+ "loss": 0.7772,
+ "step": 223
+ },
+ {
+ "epoch": 0.10591016548463357,
+ "grad_norm": 3.9871134757995605,
+ "learning_rate": 4.998803350347837e-06,
+ "loss": 0.8261,
+ "step": 224
+ },
+ {
+ "epoch": 0.10638297872340426,
+ "grad_norm": 3.2822303771972656,
+ "learning_rate": 4.998783973292333e-06,
+ "loss": 0.8623,
+ "step": 225
+ },
+ {
+ "epoch": 0.10685579196217494,
+ "grad_norm": 3.0356857776641846,
+ "learning_rate": 4.998764440648507e-06,
+ "loss": 0.7426,
+ "step": 226
+ },
+ {
+ "epoch": 0.10732860520094563,
+ "grad_norm": 2.8932785987854004,
+ "learning_rate": 4.998744752417576e-06,
+ "loss": 0.6741,
+ "step": 227
+ },
+ {
+ "epoch": 0.10780141843971631,
+ "grad_norm": 3.085820436477661,
+ "learning_rate": 4.998724908600767e-06,
+ "loss": 0.6549,
+ "step": 228
+ },
+ {
+ "epoch": 0.108274231678487,
+ "grad_norm": 3.135829210281372,
+ "learning_rate": 4.998704909199314e-06,
+ "loss": 0.6702,
+ "step": 229
+ },
+ {
+ "epoch": 0.10874704491725769,
+ "grad_norm": 5.016134262084961,
+ "learning_rate": 4.9986847542144625e-06,
+ "loss": 0.7852,
+ "step": 230
+ },
+ {
+ "epoch": 0.10921985815602837,
+ "grad_norm": 3.9056200981140137,
+ "learning_rate": 4.998664443647468e-06,
+ "loss": 0.9654,
+ "step": 231
+ },
+ {
+ "epoch": 0.10969267139479906,
+ "grad_norm": 3.0880749225616455,
+ "learning_rate": 4.998643977499595e-06,
+ "loss": 0.7579,
+ "step": 232
+ },
+ {
+ "epoch": 0.11016548463356975,
+ "grad_norm": 3.6893601417541504,
+ "learning_rate": 4.998623355772118e-06,
+ "loss": 0.713,
+ "step": 233
+ },
+ {
+ "epoch": 0.11063829787234042,
+ "grad_norm": 4.181536674499512,
+ "learning_rate": 4.998602578466319e-06,
+ "loss": 0.7331,
+ "step": 234
+ },
+ {
+ "epoch": 0.1111111111111111,
+ "grad_norm": 3.036386728286743,
+ "learning_rate": 4.998581645583496e-06,
+ "loss": 0.7115,
+ "step": 235
+ },
+ {
+ "epoch": 0.11158392434988179,
+ "grad_norm": 3.6333255767822266,
+ "learning_rate": 4.998560557124948e-06,
+ "loss": 0.7544,
+ "step": 236
+ },
+ {
+ "epoch": 0.11205673758865248,
+ "grad_norm": 2.926417827606201,
+ "learning_rate": 4.9985393130919915e-06,
+ "loss": 0.715,
+ "step": 237
+ },
+ {
+ "epoch": 0.11252955082742316,
+ "grad_norm": 2.969158172607422,
+ "learning_rate": 4.998517913485946e-06,
+ "loss": 0.7304,
+ "step": 238
+ },
+ {
+ "epoch": 0.11300236406619385,
+ "grad_norm": 3.5254971981048584,
+ "learning_rate": 4.9984963583081466e-06,
+ "loss": 0.7725,
+ "step": 239
+ },
+ {
+ "epoch": 0.11347517730496454,
+ "grad_norm": 3.7840335369110107,
+ "learning_rate": 4.998474647559936e-06,
+ "loss": 0.8685,
+ "step": 240
+ },
+ {
+ "epoch": 0.11394799054373522,
+ "grad_norm": 3.0333125591278076,
+ "learning_rate": 4.9984527812426625e-06,
+ "loss": 0.7793,
+ "step": 241
+ },
+ {
+ "epoch": 0.11442080378250591,
+ "grad_norm": 3.290159225463867,
+ "learning_rate": 4.99843075935769e-06,
+ "loss": 0.7158,
+ "step": 242
+ },
+ {
+ "epoch": 0.1148936170212766,
+ "grad_norm": 3.3935494422912598,
+ "learning_rate": 4.99840858190639e-06,
+ "loss": 0.7643,
+ "step": 243
+ },
+ {
+ "epoch": 0.11536643026004728,
+ "grad_norm": 3.333965539932251,
+ "learning_rate": 4.998386248890142e-06,
+ "loss": 0.7255,
+ "step": 244
+ },
+ {
+ "epoch": 0.11583924349881797,
+ "grad_norm": 2.8129613399505615,
+ "learning_rate": 4.998363760310339e-06,
+ "loss": 0.768,
+ "step": 245
+ },
+ {
+ "epoch": 0.11631205673758865,
+ "grad_norm": 2.8678107261657715,
+ "learning_rate": 4.998341116168378e-06,
+ "loss": 0.7403,
+ "step": 246
+ },
+ {
+ "epoch": 0.11678486997635934,
+ "grad_norm": 2.8898239135742188,
+ "learning_rate": 4.998318316465672e-06,
+ "loss": 0.6844,
+ "step": 247
+ },
+ {
+ "epoch": 0.11725768321513003,
+ "grad_norm": 3.139777898788452,
+ "learning_rate": 4.998295361203637e-06,
+ "loss": 0.7936,
+ "step": 248
+ },
+ {
+ "epoch": 0.11773049645390071,
+ "grad_norm": 3.393721103668213,
+ "learning_rate": 4.998272250383707e-06,
+ "loss": 0.8173,
+ "step": 249
+ },
+ {
+ "epoch": 0.1182033096926714,
+ "grad_norm": 3.240973949432373,
+ "learning_rate": 4.998248984007318e-06,
+ "loss": 0.8252,
+ "step": 250
+ },
+ {
+ "epoch": 0.11867612293144209,
+ "grad_norm": 3.384855031967163,
+ "learning_rate": 4.998225562075918e-06,
+ "loss": 0.7244,
+ "step": 251
+ },
+ {
+ "epoch": 0.11914893617021277,
+ "grad_norm": 3.1881816387176514,
+ "learning_rate": 4.9982019845909675e-06,
+ "loss": 0.6818,
+ "step": 252
+ },
+ {
+ "epoch": 0.11962174940898346,
+ "grad_norm": 2.888364553451538,
+ "learning_rate": 4.998178251553934e-06,
+ "loss": 0.6753,
+ "step": 253
+ },
+ {
+ "epoch": 0.12009456264775414,
+ "grad_norm": 3.630093812942505,
+ "learning_rate": 4.9981543629662944e-06,
+ "loss": 0.7995,
+ "step": 254
+ },
+ {
+ "epoch": 0.12056737588652482,
+ "grad_norm": 2.9820947647094727,
+ "learning_rate": 4.998130318829537e-06,
+ "loss": 0.7478,
+ "step": 255
+ },
+ {
+ "epoch": 0.1210401891252955,
+ "grad_norm": 2.7094738483428955,
+ "learning_rate": 4.998106119145159e-06,
+ "loss": 0.7237,
+ "step": 256
+ },
+ {
+ "epoch": 0.12151300236406619,
+ "grad_norm": 3.1808104515075684,
+ "learning_rate": 4.9980817639146665e-06,
+ "loss": 0.7915,
+ "step": 257
+ },
+ {
+ "epoch": 0.12198581560283688,
+ "grad_norm": 3.1661291122436523,
+ "learning_rate": 4.998057253139575e-06,
+ "loss": 0.8053,
+ "step": 258
+ },
+ {
+ "epoch": 0.12245862884160756,
+ "grad_norm": 3.528749942779541,
+ "learning_rate": 4.998032586821413e-06,
+ "loss": 0.7946,
+ "step": 259
+ },
+ {
+ "epoch": 0.12293144208037825,
+ "grad_norm": 3.125964879989624,
+ "learning_rate": 4.998007764961716e-06,
+ "loss": 0.7569,
+ "step": 260
+ },
+ {
+ "epoch": 0.12340425531914893,
+ "grad_norm": 3.0778942108154297,
+ "learning_rate": 4.997982787562029e-06,
+ "loss": 0.7184,
+ "step": 261
+ },
+ {
+ "epoch": 0.12387706855791962,
+ "grad_norm": 3.3531930446624756,
+ "learning_rate": 4.997957654623906e-06,
+ "loss": 0.7586,
+ "step": 262
+ },
+ {
+ "epoch": 0.1243498817966903,
+ "grad_norm": 3.229278564453125,
+ "learning_rate": 4.997932366148913e-06,
+ "loss": 0.6092,
+ "step": 263
+ },
+ {
+ "epoch": 0.12482269503546099,
+ "grad_norm": 3.7286155223846436,
+ "learning_rate": 4.997906922138626e-06,
+ "loss": 0.7965,
+ "step": 264
+ },
+ {
+ "epoch": 0.12529550827423167,
+ "grad_norm": 3.300311803817749,
+ "learning_rate": 4.997881322594628e-06,
+ "loss": 0.7665,
+ "step": 265
+ },
+ {
+ "epoch": 0.12576832151300235,
+ "grad_norm": 3.411482572555542,
+ "learning_rate": 4.9978555675185115e-06,
+ "loss": 0.7253,
+ "step": 266
+ },
+ {
+ "epoch": 0.12624113475177304,
+ "grad_norm": 3.0884511470794678,
+ "learning_rate": 4.9978296569118825e-06,
+ "loss": 0.659,
+ "step": 267
+ },
+ {
+ "epoch": 0.12671394799054372,
+ "grad_norm": 3.0652925968170166,
+ "learning_rate": 4.9978035907763535e-06,
+ "loss": 0.6739,
+ "step": 268
+ },
+ {
+ "epoch": 0.1271867612293144,
+ "grad_norm": 3.280555009841919,
+ "learning_rate": 4.997777369113547e-06,
+ "loss": 0.8003,
+ "step": 269
+ },
+ {
+ "epoch": 0.1276595744680851,
+ "grad_norm": 2.980860948562622,
+ "learning_rate": 4.997750991925096e-06,
+ "loss": 0.7097,
+ "step": 270
+ },
+ {
+ "epoch": 0.12813238770685578,
+ "grad_norm": 3.301760673522949,
+ "learning_rate": 4.997724459212644e-06,
+ "loss": 0.7894,
+ "step": 271
+ },
+ {
+ "epoch": 0.12860520094562647,
+ "grad_norm": 2.9584903717041016,
+ "learning_rate": 4.997697770977841e-06,
+ "loss": 0.733,
+ "step": 272
+ },
+ {
+ "epoch": 0.12907801418439716,
+ "grad_norm": 3.5632214546203613,
+ "learning_rate": 4.99767092722235e-06,
+ "loss": 0.7228,
+ "step": 273
+ },
+ {
+ "epoch": 0.12955082742316784,
+ "grad_norm": 3.5900983810424805,
+ "learning_rate": 4.997643927947843e-06,
+ "loss": 0.7634,
+ "step": 274
+ },
+ {
+ "epoch": 0.13002364066193853,
+ "grad_norm": 3.332650661468506,
+ "learning_rate": 4.997616773156e-06,
+ "loss": 0.797,
+ "step": 275
+ },
+ {
+ "epoch": 0.13049645390070921,
+ "grad_norm": 3.1094167232513428,
+ "learning_rate": 4.997589462848512e-06,
+ "loss": 0.7849,
+ "step": 276
+ },
+ {
+ "epoch": 0.1309692671394799,
+ "grad_norm": 3.5359463691711426,
+ "learning_rate": 4.99756199702708e-06,
+ "loss": 0.6871,
+ "step": 277
+ },
+ {
+ "epoch": 0.1314420803782506,
+ "grad_norm": 3.190441846847534,
+ "learning_rate": 4.997534375693414e-06,
+ "loss": 0.6883,
+ "step": 278
+ },
+ {
+ "epoch": 0.13191489361702127,
+ "grad_norm": 3.063518762588501,
+ "learning_rate": 4.997506598849234e-06,
+ "loss": 0.7586,
+ "step": 279
+ },
+ {
+ "epoch": 0.13238770685579196,
+ "grad_norm": 3.4112050533294678,
+ "learning_rate": 4.997478666496269e-06,
+ "loss": 0.796,
+ "step": 280
+ },
+ {
+ "epoch": 0.13286052009456265,
+ "grad_norm": 3.231886386871338,
+ "learning_rate": 4.997450578636259e-06,
+ "loss": 0.7714,
+ "step": 281
+ },
+ {
+ "epoch": 0.13333333333333333,
+ "grad_norm": 3.279425621032715,
+ "learning_rate": 4.9974223352709515e-06,
+ "loss": 0.7793,
+ "step": 282
+ },
+ {
+ "epoch": 0.13380614657210402,
+ "grad_norm": 3.2154316902160645,
+ "learning_rate": 4.9973939364021075e-06,
+ "loss": 0.791,
+ "step": 283
+ },
+ {
+ "epoch": 0.1342789598108747,
+ "grad_norm": 3.2090768814086914,
+ "learning_rate": 4.9973653820314925e-06,
+ "loss": 0.6433,
+ "step": 284
+ },
+ {
+ "epoch": 0.1347517730496454,
+ "grad_norm": 3.1712026596069336,
+ "learning_rate": 4.997336672160886e-06,
+ "loss": 0.8128,
+ "step": 285
+ },
+ {
+ "epoch": 0.13522458628841608,
+ "grad_norm": 2.929229497909546,
+ "learning_rate": 4.997307806792076e-06,
+ "loss": 0.7594,
+ "step": 286
+ },
+ {
+ "epoch": 0.13569739952718676,
+ "grad_norm": 3.0363314151763916,
+ "learning_rate": 4.997278785926859e-06,
+ "loss": 0.7336,
+ "step": 287
+ },
+ {
+ "epoch": 0.13617021276595745,
+ "grad_norm": 3.1352357864379883,
+ "learning_rate": 4.997249609567042e-06,
+ "loss": 0.7225,
+ "step": 288
+ },
+ {
+ "epoch": 0.13664302600472814,
+ "grad_norm": 3.3171157836914062,
+ "learning_rate": 4.997220277714442e-06,
+ "loss": 0.7777,
+ "step": 289
+ },
+ {
+ "epoch": 0.13711583924349882,
+ "grad_norm": 3.050717353820801,
+ "learning_rate": 4.997190790370885e-06,
+ "loss": 0.6836,
+ "step": 290
+ },
+ {
+ "epoch": 0.1375886524822695,
+ "grad_norm": 3.0297694206237793,
+ "learning_rate": 4.997161147538208e-06,
+ "loss": 0.6883,
+ "step": 291
+ },
+ {
+ "epoch": 0.1380614657210402,
+ "grad_norm": 3.0566554069519043,
+ "learning_rate": 4.997131349218256e-06,
+ "loss": 0.6674,
+ "step": 292
+ },
+ {
+ "epoch": 0.13853427895981088,
+ "grad_norm": 3.799111843109131,
+ "learning_rate": 4.997101395412885e-06,
+ "loss": 0.8256,
+ "step": 293
+ },
+ {
+ "epoch": 0.13900709219858157,
+ "grad_norm": 3.1394248008728027,
+ "learning_rate": 4.9970712861239576e-06,
+ "loss": 0.7306,
+ "step": 294
+ },
+ {
+ "epoch": 0.13947990543735225,
+ "grad_norm": 3.0605666637420654,
+ "learning_rate": 4.997041021353352e-06,
+ "loss": 0.7212,
+ "step": 295
+ },
+ {
+ "epoch": 0.13995271867612294,
+ "grad_norm": 3.8813397884368896,
+ "learning_rate": 4.997010601102951e-06,
+ "loss": 0.769,
+ "step": 296
+ },
+ {
+ "epoch": 0.14042553191489363,
+ "grad_norm": 3.0514819622039795,
+ "learning_rate": 4.996980025374649e-06,
+ "loss": 0.7422,
+ "step": 297
+ },
+ {
+ "epoch": 0.1408983451536643,
+ "grad_norm": 2.9544146060943604,
+ "learning_rate": 4.99694929417035e-06,
+ "loss": 0.6912,
+ "step": 298
+ },
+ {
+ "epoch": 0.141371158392435,
+ "grad_norm": 3.2635602951049805,
+ "learning_rate": 4.996918407491966e-06,
+ "loss": 0.7395,
+ "step": 299
+ },
+ {
+ "epoch": 0.14184397163120568,
+ "grad_norm": 3.373882532119751,
+ "learning_rate": 4.996887365341423e-06,
+ "loss": 0.7799,
+ "step": 300
+ },
+ {
+ "epoch": 0.14231678486997637,
+ "grad_norm": 3.001128673553467,
+ "learning_rate": 4.996856167720652e-06,
+ "loss": 0.7168,
+ "step": 301
+ },
+ {
+ "epoch": 0.14278959810874706,
+ "grad_norm": 3.1026835441589355,
+ "learning_rate": 4.996824814631595e-06,
+ "loss": 0.7492,
+ "step": 302
+ },
+ {
+ "epoch": 0.14326241134751774,
+ "grad_norm": 3.41947603225708,
+ "learning_rate": 4.996793306076205e-06,
+ "loss": 0.6659,
+ "step": 303
+ },
+ {
+ "epoch": 0.14373522458628843,
+ "grad_norm": 3.2272400856018066,
+ "learning_rate": 4.996761642056444e-06,
+ "loss": 0.7184,
+ "step": 304
+ },
+ {
+ "epoch": 0.14420803782505912,
+ "grad_norm": 2.9488935470581055,
+ "learning_rate": 4.996729822574284e-06,
+ "loss": 0.7451,
+ "step": 305
+ },
+ {
+ "epoch": 0.14468085106382977,
+ "grad_norm": 3.268231153488159,
+ "learning_rate": 4.9966978476317065e-06,
+ "loss": 0.7798,
+ "step": 306
+ },
+ {
+ "epoch": 0.14515366430260046,
+ "grad_norm": 3.9086556434631348,
+ "learning_rate": 4.996665717230701e-06,
+ "loss": 0.7871,
+ "step": 307
+ },
+ {
+ "epoch": 0.14562647754137115,
+ "grad_norm": 3.3483879566192627,
+ "learning_rate": 4.996633431373269e-06,
+ "loss": 0.7415,
+ "step": 308
+ },
+ {
+ "epoch": 0.14609929078014183,
+ "grad_norm": 2.839400053024292,
+ "learning_rate": 4.99660099006142e-06,
+ "loss": 0.7192,
+ "step": 309
+ },
+ {
+ "epoch": 0.14657210401891252,
+ "grad_norm": 3.177302598953247,
+ "learning_rate": 4.996568393297175e-06,
+ "loss": 0.755,
+ "step": 310
+ },
+ {
+ "epoch": 0.1470449172576832,
+ "grad_norm": 3.5477044582366943,
+ "learning_rate": 4.996535641082563e-06,
+ "loss": 0.7531,
+ "step": 311
+ },
+ {
+ "epoch": 0.1475177304964539,
+ "grad_norm": 3.418576717376709,
+ "learning_rate": 4.996502733419624e-06,
+ "loss": 0.8009,
+ "step": 312
+ },
+ {
+ "epoch": 0.14799054373522458,
+ "grad_norm": 3.711341619491577,
+ "learning_rate": 4.996469670310407e-06,
+ "loss": 0.7362,
+ "step": 313
+ },
+ {
+ "epoch": 0.14846335697399526,
+ "grad_norm": 3.2419373989105225,
+ "learning_rate": 4.99643645175697e-06,
+ "loss": 0.7761,
+ "step": 314
+ },
+ {
+ "epoch": 0.14893617021276595,
+ "grad_norm": 3.121858835220337,
+ "learning_rate": 4.996403077761381e-06,
+ "loss": 0.6495,
+ "step": 315
+ },
+ {
+ "epoch": 0.14940898345153664,
+ "grad_norm": 3.123054265975952,
+ "learning_rate": 4.996369548325719e-06,
+ "loss": 0.7444,
+ "step": 316
+ },
+ {
+ "epoch": 0.14988179669030732,
+ "grad_norm": 2.780880928039551,
+ "learning_rate": 4.996335863452072e-06,
+ "loss": 0.672,
+ "step": 317
+ },
+ {
+ "epoch": 0.150354609929078,
+ "grad_norm": 3.3738629817962646,
+ "learning_rate": 4.996302023142536e-06,
+ "loss": 0.7972,
+ "step": 318
+ },
+ {
+ "epoch": 0.1508274231678487,
+ "grad_norm": 3.4874777793884277,
+ "learning_rate": 4.99626802739922e-06,
+ "loss": 0.8252,
+ "step": 319
+ },
+ {
+ "epoch": 0.15130023640661938,
+ "grad_norm": 3.7074787616729736,
+ "learning_rate": 4.9962338762242395e-06,
+ "loss": 0.8216,
+ "step": 320
+ },
+ {
+ "epoch": 0.15177304964539007,
+ "grad_norm": 3.281912326812744,
+ "learning_rate": 4.996199569619721e-06,
+ "loss": 0.8175,
+ "step": 321
+ },
+ {
+ "epoch": 0.15224586288416075,
+ "grad_norm": 2.9485340118408203,
+ "learning_rate": 4.996165107587801e-06,
+ "loss": 0.707,
+ "step": 322
+ },
+ {
+ "epoch": 0.15271867612293144,
+ "grad_norm": 3.3757646083831787,
+ "learning_rate": 4.996130490130625e-06,
+ "loss": 0.7955,
+ "step": 323
+ },
+ {
+ "epoch": 0.15319148936170213,
+ "grad_norm": 2.962181568145752,
+ "learning_rate": 4.996095717250349e-06,
+ "loss": 0.7067,
+ "step": 324
+ },
+ {
+ "epoch": 0.1536643026004728,
+ "grad_norm": 3.114272356033325,
+ "learning_rate": 4.996060788949136e-06,
+ "loss": 0.7486,
+ "step": 325
+ },
+ {
+ "epoch": 0.1541371158392435,
+ "grad_norm": 3.0621590614318848,
+ "learning_rate": 4.996025705229165e-06,
+ "loss": 0.6547,
+ "step": 326
+ },
+ {
+ "epoch": 0.15460992907801419,
+ "grad_norm": 2.8745882511138916,
+ "learning_rate": 4.995990466092616e-06,
+ "loss": 0.6435,
+ "step": 327
+ },
+ {
+ "epoch": 0.15508274231678487,
+ "grad_norm": 2.90841007232666,
+ "learning_rate": 4.995955071541686e-06,
+ "loss": 0.7331,
+ "step": 328
+ },
+ {
+ "epoch": 0.15555555555555556,
+ "grad_norm": 2.694580316543579,
+ "learning_rate": 4.9959195215785784e-06,
+ "loss": 0.6731,
+ "step": 329
+ },
+ {
+ "epoch": 0.15602836879432624,
+ "grad_norm": 3.158083438873291,
+ "learning_rate": 4.995883816205507e-06,
+ "loss": 0.7257,
+ "step": 330
+ },
+ {
+ "epoch": 0.15650118203309693,
+ "grad_norm": 3.3234715461730957,
+ "learning_rate": 4.995847955424694e-06,
+ "loss": 0.7389,
+ "step": 331
+ },
+ {
+ "epoch": 0.15697399527186762,
+ "grad_norm": 2.9406495094299316,
+ "learning_rate": 4.995811939238373e-06,
+ "loss": 0.643,
+ "step": 332
+ },
+ {
+ "epoch": 0.1574468085106383,
+ "grad_norm": 3.3191726207733154,
+ "learning_rate": 4.995775767648785e-06,
+ "loss": 0.7879,
+ "step": 333
+ },
+ {
+ "epoch": 0.157919621749409,
+ "grad_norm": 3.711925745010376,
+ "learning_rate": 4.995739440658185e-06,
+ "loss": 0.7586,
+ "step": 334
+ },
+ {
+ "epoch": 0.15839243498817968,
+ "grad_norm": 9.573421478271484,
+ "learning_rate": 4.995702958268833e-06,
+ "loss": 0.7842,
+ "step": 335
+ },
+ {
+ "epoch": 0.15886524822695036,
+ "grad_norm": 3.4154508113861084,
+ "learning_rate": 4.995666320483001e-06,
+ "loss": 0.6735,
+ "step": 336
+ },
+ {
+ "epoch": 0.15933806146572105,
+ "grad_norm": 3.4169859886169434,
+ "learning_rate": 4.995629527302971e-06,
+ "loss": 0.741,
+ "step": 337
+ },
+ {
+ "epoch": 0.15981087470449173,
+ "grad_norm": 3.287503242492676,
+ "learning_rate": 4.9955925787310335e-06,
+ "loss": 0.7139,
+ "step": 338
+ },
+ {
+ "epoch": 0.16028368794326242,
+ "grad_norm": 3.288409471511841,
+ "learning_rate": 4.995555474769488e-06,
+ "loss": 0.7636,
+ "step": 339
+ },
+ {
+ "epoch": 0.1607565011820331,
+ "grad_norm": 2.8021693229675293,
+ "learning_rate": 4.995518215420646e-06,
+ "loss": 0.5883,
+ "step": 340
+ },
+ {
+ "epoch": 0.1612293144208038,
+ "grad_norm": 2.7038564682006836,
+ "learning_rate": 4.995480800686827e-06,
+ "loss": 0.657,
+ "step": 341
+ },
+ {
+ "epoch": 0.16170212765957448,
+ "grad_norm": 3.2370235919952393,
+ "learning_rate": 4.9954432305703615e-06,
+ "loss": 0.6999,
+ "step": 342
+ },
+ {
+ "epoch": 0.16217494089834517,
+ "grad_norm": 2.8666412830352783,
+ "learning_rate": 4.995405505073588e-06,
+ "loss": 0.7199,
+ "step": 343
+ },
+ {
+ "epoch": 0.16264775413711585,
+ "grad_norm": 3.6467232704162598,
+ "learning_rate": 4.995367624198856e-06,
+ "loss": 0.7317,
+ "step": 344
+ },
+ {
+ "epoch": 0.16312056737588654,
+ "grad_norm": 2.7576327323913574,
+ "learning_rate": 4.9953295879485246e-06,
+ "loss": 0.647,
+ "step": 345
+ },
+ {
+ "epoch": 0.1635933806146572,
+ "grad_norm": 2.922232151031494,
+ "learning_rate": 4.995291396324959e-06,
+ "loss": 0.6686,
+ "step": 346
+ },
+ {
+ "epoch": 0.16406619385342788,
+ "grad_norm": 2.8693501949310303,
+ "learning_rate": 4.995253049330542e-06,
+ "loss": 0.6756,
+ "step": 347
+ },
+ {
+ "epoch": 0.16453900709219857,
+ "grad_norm": 3.671865701675415,
+ "learning_rate": 4.995214546967658e-06,
+ "loss": 0.7347,
+ "step": 348
+ },
+ {
+ "epoch": 0.16501182033096926,
+ "grad_norm": 3.024219274520874,
+ "learning_rate": 4.995175889238706e-06,
+ "loss": 0.7547,
+ "step": 349
+ },
+ {
+ "epoch": 0.16548463356973994,
+ "grad_norm": 2.8470778465270996,
+ "learning_rate": 4.995137076146091e-06,
+ "loss": 0.6764,
+ "step": 350
+ },
+ {
+ "epoch": 0.16595744680851063,
+ "grad_norm": 2.905057907104492,
+ "learning_rate": 4.9950981076922324e-06,
+ "loss": 0.6814,
+ "step": 351
+ },
+ {
+ "epoch": 0.16643026004728131,
+ "grad_norm": 3.504377841949463,
+ "learning_rate": 4.995058983879555e-06,
+ "loss": 0.7145,
+ "step": 352
+ },
+ {
+ "epoch": 0.166903073286052,
+ "grad_norm": 3.0029661655426025,
+ "learning_rate": 4.995019704710495e-06,
+ "loss": 0.7114,
+ "step": 353
+ },
+ {
+ "epoch": 0.1673758865248227,
+ "grad_norm": 2.8666274547576904,
+ "learning_rate": 4.994980270187499e-06,
+ "loss": 0.7416,
+ "step": 354
+ },
+ {
+ "epoch": 0.16784869976359337,
+ "grad_norm": 3.1644718647003174,
+ "learning_rate": 4.994940680313021e-06,
+ "loss": 0.661,
+ "step": 355
+ },
+ {
+ "epoch": 0.16832151300236406,
+ "grad_norm": 3.050391674041748,
+ "learning_rate": 4.994900935089527e-06,
+ "loss": 0.7243,
+ "step": 356
+ },
+ {
+ "epoch": 0.16879432624113475,
+ "grad_norm": 2.985466480255127,
+ "learning_rate": 4.994861034519491e-06,
+ "loss": 0.6917,
+ "step": 357
+ },
+ {
+ "epoch": 0.16926713947990543,
+ "grad_norm": 2.909342050552368,
+ "learning_rate": 4.9948209786053995e-06,
+ "loss": 0.6636,
+ "step": 358
+ },
+ {
+ "epoch": 0.16973995271867612,
+ "grad_norm": 3.2214784622192383,
+ "learning_rate": 4.9947807673497435e-06,
+ "loss": 0.7903,
+ "step": 359
+ },
+ {
+ "epoch": 0.1702127659574468,
+ "grad_norm": 2.5654983520507812,
+ "learning_rate": 4.994740400755029e-06,
+ "loss": 0.6129,
+ "step": 360
+ },
+ {
+ "epoch": 0.1706855791962175,
+ "grad_norm": 3.775646448135376,
+ "learning_rate": 4.99469987882377e-06,
+ "loss": 0.7145,
+ "step": 361
+ },
+ {
+ "epoch": 0.17115839243498818,
+ "grad_norm": 2.8965413570404053,
+ "learning_rate": 4.994659201558487e-06,
+ "loss": 0.7177,
+ "step": 362
+ },
+ {
+ "epoch": 0.17163120567375886,
+ "grad_norm": 3.485597848892212,
+ "learning_rate": 4.9946183689617146e-06,
+ "loss": 0.8107,
+ "step": 363
+ },
+ {
+ "epoch": 0.17210401891252955,
+ "grad_norm": 3.277839183807373,
+ "learning_rate": 4.994577381035995e-06,
+ "loss": 0.691,
+ "step": 364
+ },
+ {
+ "epoch": 0.17257683215130024,
+ "grad_norm": 2.8807685375213623,
+ "learning_rate": 4.99453623778388e-06,
+ "loss": 0.7627,
+ "step": 365
+ },
+ {
+ "epoch": 0.17304964539007092,
+ "grad_norm": 3.0659940242767334,
+ "learning_rate": 4.994494939207932e-06,
+ "loss": 0.6858,
+ "step": 366
+ },
+ {
+ "epoch": 0.1735224586288416,
+ "grad_norm": 3.0881855487823486,
+ "learning_rate": 4.994453485310723e-06,
+ "loss": 0.8212,
+ "step": 367
+ },
+ {
+ "epoch": 0.1739952718676123,
+ "grad_norm": 2.7199201583862305,
+ "learning_rate": 4.994411876094832e-06,
+ "loss": 0.6516,
+ "step": 368
+ },
+ {
+ "epoch": 0.17446808510638298,
+ "grad_norm": 2.955889940261841,
+ "learning_rate": 4.994370111562851e-06,
+ "loss": 0.6579,
+ "step": 369
+ },
+ {
+ "epoch": 0.17494089834515367,
+ "grad_norm": 3.1321663856506348,
+ "learning_rate": 4.994328191717382e-06,
+ "loss": 0.6891,
+ "step": 370
+ },
+ {
+ "epoch": 0.17541371158392435,
+ "grad_norm": 3.0560388565063477,
+ "learning_rate": 4.994286116561034e-06,
+ "loss": 0.7243,
+ "step": 371
+ },
+ {
+ "epoch": 0.17588652482269504,
+ "grad_norm": 3.1560704708099365,
+ "learning_rate": 4.994243886096425e-06,
+ "loss": 0.7262,
+ "step": 372
+ },
+ {
+ "epoch": 0.17635933806146573,
+ "grad_norm": 2.913541316986084,
+ "learning_rate": 4.994201500326187e-06,
+ "loss": 0.7318,
+ "step": 373
+ },
+ {
+ "epoch": 0.1768321513002364,
+ "grad_norm": 3.098376512527466,
+ "learning_rate": 4.994158959252958e-06,
+ "loss": 0.6419,
+ "step": 374
+ },
+ {
+ "epoch": 0.1773049645390071,
+ "grad_norm": 2.977508544921875,
+ "learning_rate": 4.994116262879387e-06,
+ "loss": 0.6709,
+ "step": 375
+ },
+ {
+ "epoch": 0.17777777777777778,
+ "grad_norm": 3.168186902999878,
+ "learning_rate": 4.994073411208133e-06,
+ "loss": 0.6608,
+ "step": 376
+ },
+ {
+ "epoch": 0.17825059101654847,
+ "grad_norm": 3.436844825744629,
+ "learning_rate": 4.994030404241864e-06,
+ "loss": 0.7227,
+ "step": 377
+ },
+ {
+ "epoch": 0.17872340425531916,
+ "grad_norm": 2.8998289108276367,
+ "learning_rate": 4.993987241983258e-06,
+ "loss": 0.6512,
+ "step": 378
+ },
+ {
+ "epoch": 0.17919621749408984,
+ "grad_norm": 3.407191514968872,
+ "learning_rate": 4.993943924435002e-06,
+ "loss": 0.616,
+ "step": 379
+ },
+ {
+ "epoch": 0.17966903073286053,
+ "grad_norm": 3.744858741760254,
+ "learning_rate": 4.993900451599793e-06,
+ "loss": 0.8599,
+ "step": 380
+ },
+ {
+ "epoch": 0.18014184397163122,
+ "grad_norm": 3.486283779144287,
+ "learning_rate": 4.993856823480338e-06,
+ "loss": 0.6634,
+ "step": 381
+ },
+ {
+ "epoch": 0.1806146572104019,
+ "grad_norm": 2.895719051361084,
+ "learning_rate": 4.993813040079355e-06,
+ "loss": 0.6972,
+ "step": 382
+ },
+ {
+ "epoch": 0.1810874704491726,
+ "grad_norm": 2.814133882522583,
+ "learning_rate": 4.993769101399569e-06,
+ "loss": 0.6271,
+ "step": 383
+ },
+ {
+ "epoch": 0.18156028368794327,
+ "grad_norm": 2.8609800338745117,
+ "learning_rate": 4.993725007443715e-06,
+ "loss": 0.6481,
+ "step": 384
+ },
+ {
+ "epoch": 0.18203309692671396,
+ "grad_norm": 3.2829644680023193,
+ "learning_rate": 4.99368075821454e-06,
+ "loss": 0.7999,
+ "step": 385
+ },
+ {
+ "epoch": 0.18250591016548465,
+ "grad_norm": 3.1417458057403564,
+ "learning_rate": 4.993636353714798e-06,
+ "loss": 0.6972,
+ "step": 386
+ },
+ {
+ "epoch": 0.1829787234042553,
+ "grad_norm": 3.0679385662078857,
+ "learning_rate": 4.993591793947256e-06,
+ "loss": 0.667,
+ "step": 387
+ },
+ {
+ "epoch": 0.183451536643026,
+ "grad_norm": 3.1387410163879395,
+ "learning_rate": 4.993547078914686e-06,
+ "loss": 0.7618,
+ "step": 388
+ },
+ {
+ "epoch": 0.18392434988179668,
+ "grad_norm": 2.9181406497955322,
+ "learning_rate": 4.993502208619872e-06,
+ "loss": 0.7391,
+ "step": 389
+ },
+ {
+ "epoch": 0.18439716312056736,
+ "grad_norm": 2.8952157497406006,
+ "learning_rate": 4.993457183065611e-06,
+ "loss": 0.6988,
+ "step": 390
+ },
+ {
+ "epoch": 0.18486997635933805,
+ "grad_norm": 3.2274813652038574,
+ "learning_rate": 4.993412002254704e-06,
+ "loss": 0.688,
+ "step": 391
+ },
+ {
+ "epoch": 0.18534278959810874,
+ "grad_norm": 3.4693779945373535,
+ "learning_rate": 4.993366666189965e-06,
+ "loss": 0.6634,
+ "step": 392
+ },
+ {
+ "epoch": 0.18581560283687942,
+ "grad_norm": 3.5358526706695557,
+ "learning_rate": 4.993321174874217e-06,
+ "loss": 0.7343,
+ "step": 393
+ },
+ {
+ "epoch": 0.1862884160756501,
+ "grad_norm": 3.013338088989258,
+ "learning_rate": 4.993275528310292e-06,
+ "loss": 0.7579,
+ "step": 394
+ },
+ {
+ "epoch": 0.1867612293144208,
+ "grad_norm": 2.694772720336914,
+ "learning_rate": 4.993229726501033e-06,
+ "loss": 0.718,
+ "step": 395
+ },
+ {
+ "epoch": 0.18723404255319148,
+ "grad_norm": 3.070612907409668,
+ "learning_rate": 4.9931837694492915e-06,
+ "loss": 0.6438,
+ "step": 396
+ },
+ {
+ "epoch": 0.18770685579196217,
+ "grad_norm": 2.9193027019500732,
+ "learning_rate": 4.993137657157928e-06,
+ "loss": 0.6788,
+ "step": 397
+ },
+ {
+ "epoch": 0.18817966903073285,
+ "grad_norm": 3.047682046890259,
+ "learning_rate": 4.993091389629816e-06,
+ "loss": 0.6826,
+ "step": 398
+ },
+ {
+ "epoch": 0.18865248226950354,
+ "grad_norm": 2.9629905223846436,
+ "learning_rate": 4.993044966867834e-06,
+ "loss": 0.7196,
+ "step": 399
+ },
+ {
+ "epoch": 0.18912529550827423,
+ "grad_norm": 3.0692050457000732,
+ "learning_rate": 4.992998388874874e-06,
+ "loss": 0.7015,
+ "step": 400
+ },
+ {
+ "epoch": 0.1895981087470449,
+ "grad_norm": 3.5427212715148926,
+ "learning_rate": 4.992951655653836e-06,
+ "loss": 0.8292,
+ "step": 401
+ },
+ {
+ "epoch": 0.1900709219858156,
+ "grad_norm": 2.643526554107666,
+ "learning_rate": 4.992904767207629e-06,
+ "loss": 0.624,
+ "step": 402
+ },
+ {
+ "epoch": 0.19054373522458629,
+ "grad_norm": 3.1185996532440186,
+ "learning_rate": 4.992857723539173e-06,
+ "loss": 0.7354,
+ "step": 403
+ },
+ {
+ "epoch": 0.19101654846335697,
+ "grad_norm": 3.006856679916382,
+ "learning_rate": 4.992810524651398e-06,
+ "loss": 0.7752,
+ "step": 404
+ },
+ {
+ "epoch": 0.19148936170212766,
+ "grad_norm": 2.9913275241851807,
+ "learning_rate": 4.9927631705472425e-06,
+ "loss": 0.7306,
+ "step": 405
+ },
+ {
+ "epoch": 0.19196217494089834,
+ "grad_norm": 2.6794071197509766,
+ "learning_rate": 4.992715661229655e-06,
+ "loss": 0.6136,
+ "step": 406
+ },
+ {
+ "epoch": 0.19243498817966903,
+ "grad_norm": 3.5933966636657715,
+ "learning_rate": 4.992667996701593e-06,
+ "loss": 0.7024,
+ "step": 407
+ },
+ {
+ "epoch": 0.19290780141843972,
+ "grad_norm": 2.862187623977661,
+ "learning_rate": 4.992620176966025e-06,
+ "loss": 0.692,
+ "step": 408
+ },
+ {
+ "epoch": 0.1933806146572104,
+ "grad_norm": 3.076845407485962,
+ "learning_rate": 4.9925722020259286e-06,
+ "loss": 0.7475,
+ "step": 409
+ },
+ {
+ "epoch": 0.1938534278959811,
+ "grad_norm": 3.372919797897339,
+ "learning_rate": 4.9925240718842895e-06,
+ "loss": 0.6886,
+ "step": 410
+ },
+ {
+ "epoch": 0.19432624113475178,
+ "grad_norm": 2.922977924346924,
+ "learning_rate": 4.992475786544108e-06,
+ "loss": 0.7049,
+ "step": 411
+ },
+ {
+ "epoch": 0.19479905437352246,
+ "grad_norm": 2.908034324645996,
+ "learning_rate": 4.992427346008387e-06,
+ "loss": 0.6498,
+ "step": 412
+ },
+ {
+ "epoch": 0.19527186761229315,
+ "grad_norm": 3.096723794937134,
+ "learning_rate": 4.992378750280144e-06,
+ "loss": 0.7151,
+ "step": 413
+ },
+ {
+ "epoch": 0.19574468085106383,
+ "grad_norm": 2.895237684249878,
+ "learning_rate": 4.992329999362405e-06,
+ "loss": 0.7277,
+ "step": 414
+ },
+ {
+ "epoch": 0.19621749408983452,
+ "grad_norm": 2.718230724334717,
+ "learning_rate": 4.9922810932582065e-06,
+ "loss": 0.6375,
+ "step": 415
+ },
+ {
+ "epoch": 0.1966903073286052,
+ "grad_norm": 3.187743663787842,
+ "learning_rate": 4.992232031970592e-06,
+ "loss": 0.6528,
+ "step": 416
+ },
+ {
+ "epoch": 0.1971631205673759,
+ "grad_norm": 2.996406316757202,
+ "learning_rate": 4.992182815502616e-06,
+ "loss": 0.6552,
+ "step": 417
+ },
+ {
+ "epoch": 0.19763593380614658,
+ "grad_norm": 3.301084041595459,
+ "learning_rate": 4.992133443857345e-06,
+ "loss": 0.7061,
+ "step": 418
+ },
+ {
+ "epoch": 0.19810874704491727,
+ "grad_norm": 3.7874677181243896,
+ "learning_rate": 4.992083917037853e-06,
+ "loss": 0.7859,
+ "step": 419
+ },
+ {
+ "epoch": 0.19858156028368795,
+ "grad_norm": 3.124253511428833,
+ "learning_rate": 4.992034235047222e-06,
+ "loss": 0.7615,
+ "step": 420
+ },
+ {
+ "epoch": 0.19905437352245864,
+ "grad_norm": 3.0488970279693604,
+ "learning_rate": 4.991984397888546e-06,
+ "loss": 0.6916,
+ "step": 421
+ },
+ {
+ "epoch": 0.19952718676122932,
+ "grad_norm": 3.1241321563720703,
+ "learning_rate": 4.991934405564929e-06,
+ "loss": 0.7055,
+ "step": 422
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 3.396632432937622,
+ "learning_rate": 4.991884258079484e-06,
+ "loss": 0.7675,
+ "step": 423
+ },
+ {
+ "epoch": 0.2004728132387707,
+ "grad_norm": 3.7776873111724854,
+ "learning_rate": 4.9918339554353316e-06,
+ "loss": 0.7371,
+ "step": 424
+ },
+ {
+ "epoch": 0.20094562647754138,
+ "grad_norm": 3.3356032371520996,
+ "learning_rate": 4.991783497635606e-06,
+ "loss": 0.6778,
+ "step": 425
+ },
+ {
+ "epoch": 0.20141843971631207,
+ "grad_norm": 2.988856792449951,
+ "learning_rate": 4.9917328846834474e-06,
+ "loss": 0.6795,
+ "step": 426
+ },
+ {
+ "epoch": 0.20189125295508276,
+ "grad_norm": 3.264183282852173,
+ "learning_rate": 4.99168211658201e-06,
+ "loss": 0.7707,
+ "step": 427
+ },
+ {
+ "epoch": 0.20236406619385341,
+ "grad_norm": 3.878068208694458,
+ "learning_rate": 4.991631193334451e-06,
+ "loss": 0.857,
+ "step": 428
+ },
+ {
+ "epoch": 0.2028368794326241,
+ "grad_norm": 3.6377553939819336,
+ "learning_rate": 4.991580114943943e-06,
+ "loss": 0.8033,
+ "step": 429
+ },
+ {
+ "epoch": 0.2033096926713948,
+ "grad_norm": 2.95393967628479,
+ "learning_rate": 4.991528881413667e-06,
+ "loss": 0.6809,
+ "step": 430
+ },
+ {
+ "epoch": 0.20378250591016547,
+ "grad_norm": 3.058704376220703,
+ "learning_rate": 4.9914774927468125e-06,
+ "loss": 0.6664,
+ "step": 431
+ },
+ {
+ "epoch": 0.20425531914893616,
+ "grad_norm": 2.7783217430114746,
+ "learning_rate": 4.9914259489465795e-06,
+ "loss": 0.6478,
+ "step": 432
+ },
+ {
+ "epoch": 0.20472813238770685,
+ "grad_norm": 2.4825217723846436,
+ "learning_rate": 4.991374250016177e-06,
+ "loss": 0.6598,
+ "step": 433
+ },
+ {
+ "epoch": 0.20520094562647753,
+ "grad_norm": 2.8753600120544434,
+ "learning_rate": 4.991322395958824e-06,
+ "loss": 0.6947,
+ "step": 434
+ },
+ {
+ "epoch": 0.20567375886524822,
+ "grad_norm": 3.2339367866516113,
+ "learning_rate": 4.99127038677775e-06,
+ "loss": 0.8201,
+ "step": 435
+ },
+ {
+ "epoch": 0.2061465721040189,
+ "grad_norm": 2.9065537452697754,
+ "learning_rate": 4.991218222476193e-06,
+ "loss": 0.6679,
+ "step": 436
+ },
+ {
+ "epoch": 0.2066193853427896,
+ "grad_norm": 3.283228874206543,
+ "learning_rate": 4.991165903057401e-06,
+ "loss": 0.8039,
+ "step": 437
+ },
+ {
+ "epoch": 0.20709219858156028,
+ "grad_norm": 3.429872751235962,
+ "learning_rate": 4.991113428524631e-06,
+ "loss": 0.7392,
+ "step": 438
+ },
+ {
+ "epoch": 0.20756501182033096,
+ "grad_norm": 3.118943452835083,
+ "learning_rate": 4.991060798881152e-06,
+ "loss": 0.6794,
+ "step": 439
+ },
+ {
+ "epoch": 0.20803782505910165,
+ "grad_norm": 3.395970106124878,
+ "learning_rate": 4.99100801413024e-06,
+ "loss": 0.6862,
+ "step": 440
+ },
+ {
+ "epoch": 0.20851063829787234,
+ "grad_norm": 2.869191884994507,
+ "learning_rate": 4.99095507427518e-06,
+ "loss": 0.6076,
+ "step": 441
+ },
+ {
+ "epoch": 0.20898345153664302,
+ "grad_norm": 3.1934661865234375,
+ "learning_rate": 4.990901979319272e-06,
+ "loss": 0.6927,
+ "step": 442
+ },
+ {
+ "epoch": 0.2094562647754137,
+ "grad_norm": 2.9068603515625,
+ "learning_rate": 4.990848729265819e-06,
+ "loss": 0.6864,
+ "step": 443
+ },
+ {
+ "epoch": 0.2099290780141844,
+ "grad_norm": 3.0535948276519775,
+ "learning_rate": 4.9907953241181375e-06,
+ "loss": 0.6396,
+ "step": 444
+ },
+ {
+ "epoch": 0.21040189125295508,
+ "grad_norm": 2.871511459350586,
+ "learning_rate": 4.990741763879554e-06,
+ "loss": 0.6743,
+ "step": 445
+ },
+ {
+ "epoch": 0.21087470449172577,
+ "grad_norm": 2.9184393882751465,
+ "learning_rate": 4.9906880485534015e-06,
+ "loss": 0.6786,
+ "step": 446
+ },
+ {
+ "epoch": 0.21134751773049645,
+ "grad_norm": 3.0628271102905273,
+ "learning_rate": 4.990634178143026e-06,
+ "loss": 0.6326,
+ "step": 447
+ },
+ {
+ "epoch": 0.21182033096926714,
+ "grad_norm": 3.7878305912017822,
+ "learning_rate": 4.990580152651782e-06,
+ "loss": 0.7944,
+ "step": 448
+ },
+ {
+ "epoch": 0.21229314420803783,
+ "grad_norm": 2.8577189445495605,
+ "learning_rate": 4.990525972083031e-06,
+ "loss": 0.71,
+ "step": 449
+ },
+ {
+ "epoch": 0.2127659574468085,
+ "grad_norm": 3.307769775390625,
+ "learning_rate": 4.99047163644015e-06,
+ "loss": 0.6893,
+ "step": 450
+ },
+ {
+ "epoch": 0.2132387706855792,
+ "grad_norm": 2.7391717433929443,
+ "learning_rate": 4.990417145726519e-06,
+ "loss": 0.712,
+ "step": 451
+ },
+ {
+ "epoch": 0.21371158392434988,
+ "grad_norm": 2.938044786453247,
+ "learning_rate": 4.990362499945534e-06,
+ "loss": 0.7516,
+ "step": 452
+ },
+ {
+ "epoch": 0.21418439716312057,
+ "grad_norm": 2.7831056118011475,
+ "learning_rate": 4.990307699100595e-06,
+ "loss": 0.6168,
+ "step": 453
+ },
+ {
+ "epoch": 0.21465721040189126,
+ "grad_norm": 2.907977342605591,
+ "learning_rate": 4.990252743195116e-06,
+ "loss": 0.6706,
+ "step": 454
+ },
+ {
+ "epoch": 0.21513002364066194,
+ "grad_norm": 3.7882161140441895,
+ "learning_rate": 4.990197632232517e-06,
+ "loss": 0.6847,
+ "step": 455
+ },
+ {
+ "epoch": 0.21560283687943263,
+ "grad_norm": 2.899716854095459,
+ "learning_rate": 4.990142366216232e-06,
+ "loss": 0.6699,
+ "step": 456
+ },
+ {
+ "epoch": 0.21607565011820332,
+ "grad_norm": 2.907003879547119,
+ "learning_rate": 4.990086945149701e-06,
+ "loss": 0.6864,
+ "step": 457
+ },
+ {
+ "epoch": 0.216548463356974,
+ "grad_norm": 3.2407333850860596,
+ "learning_rate": 4.9900313690363736e-06,
+ "loss": 0.692,
+ "step": 458
+ },
+ {
+ "epoch": 0.2170212765957447,
+ "grad_norm": 2.9055583477020264,
+ "learning_rate": 4.989975637879712e-06,
+ "loss": 0.7113,
+ "step": 459
+ },
+ {
+ "epoch": 0.21749408983451538,
+ "grad_norm": 2.9836206436157227,
+ "learning_rate": 4.989919751683184e-06,
+ "loss": 0.6673,
+ "step": 460
+ },
+ {
+ "epoch": 0.21796690307328606,
+ "grad_norm": 3.371035575866699,
+ "learning_rate": 4.989863710450273e-06,
+ "loss": 0.7181,
+ "step": 461
+ },
+ {
+ "epoch": 0.21843971631205675,
+ "grad_norm": 2.9636635780334473,
+ "learning_rate": 4.989807514184465e-06,
+ "loss": 0.6082,
+ "step": 462
+ },
+ {
+ "epoch": 0.21891252955082743,
+ "grad_norm": 2.9634664058685303,
+ "learning_rate": 4.9897511628892615e-06,
+ "loss": 0.7086,
+ "step": 463
+ },
+ {
+ "epoch": 0.21938534278959812,
+ "grad_norm": 3.154763698577881,
+ "learning_rate": 4.98969465656817e-06,
+ "loss": 0.7027,
+ "step": 464
+ },
+ {
+ "epoch": 0.2198581560283688,
+ "grad_norm": 2.9959890842437744,
+ "learning_rate": 4.98963799522471e-06,
+ "loss": 0.6498,
+ "step": 465
+ },
+ {
+ "epoch": 0.2203309692671395,
+ "grad_norm": 3.5470590591430664,
+ "learning_rate": 4.989581178862408e-06,
+ "loss": 0.7199,
+ "step": 466
+ },
+ {
+ "epoch": 0.22080378250591018,
+ "grad_norm": 7.1873369216918945,
+ "learning_rate": 4.989524207484802e-06,
+ "loss": 0.6676,
+ "step": 467
+ },
+ {
+ "epoch": 0.22127659574468084,
+ "grad_norm": 3.1099541187286377,
+ "learning_rate": 4.98946708109544e-06,
+ "loss": 0.6785,
+ "step": 468
+ },
+ {
+ "epoch": 0.22174940898345152,
+ "grad_norm": 2.830991506576538,
+ "learning_rate": 4.9894097996978795e-06,
+ "loss": 0.6456,
+ "step": 469
+ },
+ {
+ "epoch": 0.2222222222222222,
+ "grad_norm": 3.0212316513061523,
+ "learning_rate": 4.989352363295687e-06,
+ "loss": 0.6048,
+ "step": 470
+ },
+ {
+ "epoch": 0.2226950354609929,
+ "grad_norm": 3.18776798248291,
+ "learning_rate": 4.989294771892437e-06,
+ "loss": 0.7078,
+ "step": 471
+ },
+ {
+ "epoch": 0.22316784869976358,
+ "grad_norm": 2.9972598552703857,
+ "learning_rate": 4.989237025491717e-06,
+ "loss": 0.7082,
+ "step": 472
+ },
+ {
+ "epoch": 0.22364066193853427,
+ "grad_norm": 3.4935688972473145,
+ "learning_rate": 4.989179124097123e-06,
+ "loss": 0.8199,
+ "step": 473
+ },
+ {
+ "epoch": 0.22411347517730495,
+ "grad_norm": 2.6485543251037598,
+ "learning_rate": 4.9891210677122595e-06,
+ "loss": 0.6371,
+ "step": 474
+ },
+ {
+ "epoch": 0.22458628841607564,
+ "grad_norm": 2.969233512878418,
+ "learning_rate": 4.989062856340742e-06,
+ "loss": 0.6879,
+ "step": 475
+ },
+ {
+ "epoch": 0.22505910165484633,
+ "grad_norm": 2.881875514984131,
+ "learning_rate": 4.989004489986194e-06,
+ "loss": 0.7415,
+ "step": 476
+ },
+ {
+ "epoch": 0.225531914893617,
+ "grad_norm": 2.624540090560913,
+ "learning_rate": 4.98894596865225e-06,
+ "loss": 0.6522,
+ "step": 477
+ },
+ {
+ "epoch": 0.2260047281323877,
+ "grad_norm": 3.61075496673584,
+ "learning_rate": 4.988887292342555e-06,
+ "loss": 0.7109,
+ "step": 478
+ },
+ {
+ "epoch": 0.2264775413711584,
+ "grad_norm": 2.9368972778320312,
+ "learning_rate": 4.988828461060762e-06,
+ "loss": 0.6843,
+ "step": 479
+ },
+ {
+ "epoch": 0.22695035460992907,
+ "grad_norm": 3.0670197010040283,
+ "learning_rate": 4.988769474810533e-06,
+ "loss": 0.6807,
+ "step": 480
+ },
+ {
+ "epoch": 0.22742316784869976,
+ "grad_norm": 2.9662792682647705,
+ "learning_rate": 4.988710333595542e-06,
+ "loss": 0.6796,
+ "step": 481
+ },
+ {
+ "epoch": 0.22789598108747045,
+ "grad_norm": 2.971235752105713,
+ "learning_rate": 4.988651037419472e-06,
+ "loss": 0.696,
+ "step": 482
+ },
+ {
+ "epoch": 0.22836879432624113,
+ "grad_norm": 2.931884527206421,
+ "learning_rate": 4.988591586286013e-06,
+ "loss": 0.7323,
+ "step": 483
+ },
+ {
+ "epoch": 0.22884160756501182,
+ "grad_norm": 2.8114213943481445,
+ "learning_rate": 4.988531980198868e-06,
+ "loss": 0.6584,
+ "step": 484
+ },
+ {
+ "epoch": 0.2293144208037825,
+ "grad_norm": 3.2785916328430176,
+ "learning_rate": 4.98847221916175e-06,
+ "loss": 0.7514,
+ "step": 485
+ },
+ {
+ "epoch": 0.2297872340425532,
+ "grad_norm": 3.0520215034484863,
+ "learning_rate": 4.988412303178377e-06,
+ "loss": 0.7564,
+ "step": 486
+ },
+ {
+ "epoch": 0.23026004728132388,
+ "grad_norm": 3.181002616882324,
+ "learning_rate": 4.988352232252483e-06,
+ "loss": 0.6768,
+ "step": 487
+ },
+ {
+ "epoch": 0.23073286052009456,
+ "grad_norm": 3.4953625202178955,
+ "learning_rate": 4.988292006387805e-06,
+ "loss": 0.7143,
+ "step": 488
+ },
+ {
+ "epoch": 0.23120567375886525,
+ "grad_norm": 3.326571226119995,
+ "learning_rate": 4.988231625588096e-06,
+ "loss": 0.7318,
+ "step": 489
+ },
+ {
+ "epoch": 0.23167848699763594,
+ "grad_norm": 3.09614634513855,
+ "learning_rate": 4.988171089857113e-06,
+ "loss": 0.6574,
+ "step": 490
+ },
+ {
+ "epoch": 0.23215130023640662,
+ "grad_norm": 2.7439446449279785,
+ "learning_rate": 4.9881103991986265e-06,
+ "loss": 0.6637,
+ "step": 491
+ },
+ {
+ "epoch": 0.2326241134751773,
+ "grad_norm": 3.0681190490722656,
+ "learning_rate": 4.988049553616416e-06,
+ "loss": 0.6326,
+ "step": 492
+ },
+ {
+ "epoch": 0.233096926713948,
+ "grad_norm": 3.0757341384887695,
+ "learning_rate": 4.98798855311427e-06,
+ "loss": 0.695,
+ "step": 493
+ },
+ {
+ "epoch": 0.23356973995271868,
+ "grad_norm": 2.8637635707855225,
+ "learning_rate": 4.987927397695985e-06,
+ "loss": 0.6598,
+ "step": 494
+ },
+ {
+ "epoch": 0.23404255319148937,
+ "grad_norm": 3.3641068935394287,
+ "learning_rate": 4.9878660873653715e-06,
+ "loss": 0.7435,
+ "step": 495
+ },
+ {
+ "epoch": 0.23451536643026005,
+ "grad_norm": 3.5025596618652344,
+ "learning_rate": 4.987804622126245e-06,
+ "loss": 0.735,
+ "step": 496
+ },
+ {
+ "epoch": 0.23498817966903074,
+ "grad_norm": 2.9298837184906006,
+ "learning_rate": 4.987743001982434e-06,
+ "loss": 0.7063,
+ "step": 497
+ },
+ {
+ "epoch": 0.23546099290780143,
+ "grad_norm": 2.70358943939209,
+ "learning_rate": 4.987681226937774e-06,
+ "loss": 0.6799,
+ "step": 498
+ },
+ {
+ "epoch": 0.2359338061465721,
+ "grad_norm": 3.027871608734131,
+ "learning_rate": 4.9876192969961125e-06,
+ "loss": 0.6881,
+ "step": 499
+ },
+ {
+ "epoch": 0.2364066193853428,
+ "grad_norm": 3.362306594848633,
+ "learning_rate": 4.987557212161304e-06,
+ "loss": 0.7906,
+ "step": 500
+ },
+ {
+ "epoch": 0.23687943262411348,
+ "grad_norm": 3.3136050701141357,
+ "learning_rate": 4.987494972437217e-06,
+ "loss": 0.6878,
+ "step": 501
+ },
+ {
+ "epoch": 0.23735224586288417,
+ "grad_norm": 3.017089605331421,
+ "learning_rate": 4.9874325778277255e-06,
+ "loss": 0.7279,
+ "step": 502
+ },
+ {
+ "epoch": 0.23782505910165486,
+ "grad_norm": 2.8300516605377197,
+ "learning_rate": 4.987370028336714e-06,
+ "loss": 0.6864,
+ "step": 503
+ },
+ {
+ "epoch": 0.23829787234042554,
+ "grad_norm": 3.201860189437866,
+ "learning_rate": 4.987307323968077e-06,
+ "loss": 0.7531,
+ "step": 504
+ },
+ {
+ "epoch": 0.23877068557919623,
+ "grad_norm": 2.685396194458008,
+ "learning_rate": 4.987244464725721e-06,
+ "loss": 0.5849,
+ "step": 505
+ },
+ {
+ "epoch": 0.23924349881796692,
+ "grad_norm": 2.8715312480926514,
+ "learning_rate": 4.987181450613557e-06,
+ "loss": 0.675,
+ "step": 506
+ },
+ {
+ "epoch": 0.2397163120567376,
+ "grad_norm": 2.813908815383911,
+ "learning_rate": 4.987118281635511e-06,
+ "loss": 0.6841,
+ "step": 507
+ },
+ {
+ "epoch": 0.2401891252955083,
+ "grad_norm": 3.2738473415374756,
+ "learning_rate": 4.987054957795514e-06,
+ "loss": 0.7158,
+ "step": 508
+ },
+ {
+ "epoch": 0.24066193853427895,
+ "grad_norm": 2.896134376525879,
+ "learning_rate": 4.986991479097511e-06,
+ "loss": 0.7542,
+ "step": 509
+ },
+ {
+ "epoch": 0.24113475177304963,
+ "grad_norm": 3.0390403270721436,
+ "learning_rate": 4.986927845545454e-06,
+ "loss": 0.6733,
+ "step": 510
+ },
+ {
+ "epoch": 0.24160756501182032,
+ "grad_norm": 3.0300254821777344,
+ "learning_rate": 4.9868640571433044e-06,
+ "loss": 0.722,
+ "step": 511
+ },
+ {
+ "epoch": 0.242080378250591,
+ "grad_norm": 3.3037352561950684,
+ "learning_rate": 4.986800113895035e-06,
+ "loss": 0.6811,
+ "step": 512
+ },
+ {
+ "epoch": 0.2425531914893617,
+ "grad_norm": 3.0358474254608154,
+ "learning_rate": 4.986736015804627e-06,
+ "loss": 0.7348,
+ "step": 513
+ },
+ {
+ "epoch": 0.24302600472813238,
+ "grad_norm": 3.108792304992676,
+ "learning_rate": 4.986671762876071e-06,
+ "loss": 0.6096,
+ "step": 514
+ },
+ {
+ "epoch": 0.24349881796690306,
+ "grad_norm": 3.1316237449645996,
+ "learning_rate": 4.986607355113367e-06,
+ "loss": 0.6357,
+ "step": 515
+ },
+ {
+ "epoch": 0.24397163120567375,
+ "grad_norm": 3.3095219135284424,
+ "learning_rate": 4.986542792520528e-06,
+ "loss": 0.7515,
+ "step": 516
+ },
+ {
+ "epoch": 0.24444444444444444,
+ "grad_norm": 3.4775984287261963,
+ "learning_rate": 4.986478075101572e-06,
+ "loss": 0.7104,
+ "step": 517
+ },
+ {
+ "epoch": 0.24491725768321512,
+ "grad_norm": 3.341708183288574,
+ "learning_rate": 4.986413202860528e-06,
+ "loss": 0.7339,
+ "step": 518
+ },
+ {
+ "epoch": 0.2453900709219858,
+ "grad_norm": 2.9646966457366943,
+ "learning_rate": 4.986348175801438e-06,
+ "loss": 0.6032,
+ "step": 519
+ },
+ {
+ "epoch": 0.2458628841607565,
+ "grad_norm": 3.1853902339935303,
+ "learning_rate": 4.986282993928349e-06,
+ "loss": 0.6925,
+ "step": 520
+ },
+ {
+ "epoch": 0.24633569739952718,
+ "grad_norm": 3.286909818649292,
+ "learning_rate": 4.98621765724532e-06,
+ "loss": 0.7447,
+ "step": 521
+ },
+ {
+ "epoch": 0.24680851063829787,
+ "grad_norm": 3.2255051136016846,
+ "learning_rate": 4.986152165756419e-06,
+ "loss": 0.7747,
+ "step": 522
+ },
+ {
+ "epoch": 0.24728132387706855,
+ "grad_norm": 3.002352237701416,
+ "learning_rate": 4.986086519465724e-06,
+ "loss": 0.6472,
+ "step": 523
+ },
+ {
+ "epoch": 0.24775413711583924,
+ "grad_norm": 3.4738974571228027,
+ "learning_rate": 4.986020718377322e-06,
+ "loss": 0.7381,
+ "step": 524
+ },
+ {
+ "epoch": 0.24822695035460993,
+ "grad_norm": 3.4470200538635254,
+ "learning_rate": 4.985954762495312e-06,
+ "loss": 0.6878,
+ "step": 525
+ },
+ {
+ "epoch": 0.2486997635933806,
+ "grad_norm": 2.9219350814819336,
+ "learning_rate": 4.985888651823799e-06,
+ "loss": 0.6317,
+ "step": 526
+ },
+ {
+ "epoch": 0.2491725768321513,
+ "grad_norm": 3.061767101287842,
+ "learning_rate": 4.985822386366899e-06,
+ "loss": 0.6842,
+ "step": 527
+ },
+ {
+ "epoch": 0.24964539007092199,
+ "grad_norm": 3.0291247367858887,
+ "learning_rate": 4.985755966128742e-06,
+ "loss": 0.6852,
+ "step": 528
+ },
+ {
+ "epoch": 0.25011820330969264,
+ "grad_norm": 2.964280843734741,
+ "learning_rate": 4.985689391113457e-06,
+ "loss": 0.7738,
+ "step": 529
+ },
+ {
+ "epoch": 0.25059101654846333,
+ "grad_norm": 3.058302164077759,
+ "learning_rate": 4.9856226613251955e-06,
+ "loss": 0.6677,
+ "step": 530
+ },
+ {
+ "epoch": 0.251063829787234,
+ "grad_norm": 3.345141649246216,
+ "learning_rate": 4.985555776768109e-06,
+ "loss": 0.7837,
+ "step": 531
+ },
+ {
+ "epoch": 0.2515366430260047,
+ "grad_norm": 3.565031051635742,
+ "learning_rate": 4.9854887374463636e-06,
+ "loss": 0.7231,
+ "step": 532
+ },
+ {
+ "epoch": 0.2520094562647754,
+ "grad_norm": 2.7953789234161377,
+ "learning_rate": 4.985421543364132e-06,
+ "loss": 0.6102,
+ "step": 533
+ },
+ {
+ "epoch": 0.2524822695035461,
+ "grad_norm": 2.887606620788574,
+ "learning_rate": 4.9853541945256e-06,
+ "loss": 0.6289,
+ "step": 534
+ },
+ {
+ "epoch": 0.25295508274231676,
+ "grad_norm": 3.1480495929718018,
+ "learning_rate": 4.985286690934961e-06,
+ "loss": 0.6348,
+ "step": 535
+ },
+ {
+ "epoch": 0.25342789598108745,
+ "grad_norm": 2.8912761211395264,
+ "learning_rate": 4.985219032596416e-06,
+ "loss": 0.595,
+ "step": 536
+ },
+ {
+ "epoch": 0.25390070921985813,
+ "grad_norm": 2.947936534881592,
+ "learning_rate": 4.98515121951418e-06,
+ "loss": 0.6196,
+ "step": 537
+ },
+ {
+ "epoch": 0.2543735224586288,
+ "grad_norm": 3.1085827350616455,
+ "learning_rate": 4.985083251692474e-06,
+ "loss": 0.6387,
+ "step": 538
+ },
+ {
+ "epoch": 0.2548463356973995,
+ "grad_norm": 3.1688334941864014,
+ "learning_rate": 4.985015129135531e-06,
+ "loss": 0.7055,
+ "step": 539
+ },
+ {
+ "epoch": 0.2553191489361702,
+ "grad_norm": 3.075042963027954,
+ "learning_rate": 4.984946851847593e-06,
+ "loss": 0.7515,
+ "step": 540
+ },
+ {
+ "epoch": 0.2557919621749409,
+ "grad_norm": 3.1933093070983887,
+ "learning_rate": 4.98487841983291e-06,
+ "loss": 0.7054,
+ "step": 541
+ },
+ {
+ "epoch": 0.25626477541371157,
+ "grad_norm": 3.043473958969116,
+ "learning_rate": 4.984809833095744e-06,
+ "loss": 0.6281,
+ "step": 542
+ },
+ {
+ "epoch": 0.25673758865248225,
+ "grad_norm": 3.0532584190368652,
+ "learning_rate": 4.9847410916403645e-06,
+ "loss": 0.6155,
+ "step": 543
+ },
+ {
+ "epoch": 0.25721040189125294,
+ "grad_norm": 3.608480215072632,
+ "learning_rate": 4.984672195471053e-06,
+ "loss": 0.7363,
+ "step": 544
+ },
+ {
+ "epoch": 0.2576832151300236,
+ "grad_norm": 2.7491862773895264,
+ "learning_rate": 4.9846031445921e-06,
+ "loss": 0.6594,
+ "step": 545
+ },
+ {
+ "epoch": 0.2581560283687943,
+ "grad_norm": 2.8602418899536133,
+ "learning_rate": 4.984533939007802e-06,
+ "loss": 0.6742,
+ "step": 546
+ },
+ {
+ "epoch": 0.258628841607565,
+ "grad_norm": 3.1782007217407227,
+ "learning_rate": 4.98446457872247e-06,
+ "loss": 0.731,
+ "step": 547
+ },
+ {
+ "epoch": 0.2591016548463357,
+ "grad_norm": 2.796147584915161,
+ "learning_rate": 4.984395063740423e-06,
+ "loss": 0.6617,
+ "step": 548
+ },
+ {
+ "epoch": 0.25957446808510637,
+ "grad_norm": 2.8392202854156494,
+ "learning_rate": 4.984325394065991e-06,
+ "loss": 0.6753,
+ "step": 549
+ },
+ {
+ "epoch": 0.26004728132387706,
+ "grad_norm": 3.134672164916992,
+ "learning_rate": 4.984255569703508e-06,
+ "loss": 0.7222,
+ "step": 550
+ },
+ {
+ "epoch": 0.26052009456264774,
+ "grad_norm": 2.734330177307129,
+ "learning_rate": 4.984185590657325e-06,
+ "loss": 0.6098,
+ "step": 551
+ },
+ {
+ "epoch": 0.26099290780141843,
+ "grad_norm": 3.739010810852051,
+ "learning_rate": 4.984115456931798e-06,
+ "loss": 0.7457,
+ "step": 552
+ },
+ {
+ "epoch": 0.2614657210401891,
+ "grad_norm": 2.8412528038024902,
+ "learning_rate": 4.9840451685312925e-06,
+ "loss": 0.6972,
+ "step": 553
+ },
+ {
+ "epoch": 0.2619385342789598,
+ "grad_norm": 3.017395496368408,
+ "learning_rate": 4.983974725460188e-06,
+ "loss": 0.6887,
+ "step": 554
+ },
+ {
+ "epoch": 0.2624113475177305,
+ "grad_norm": 3.2746949195861816,
+ "learning_rate": 4.98390412772287e-06,
+ "loss": 0.7047,
+ "step": 555
+ },
+ {
+ "epoch": 0.2628841607565012,
+ "grad_norm": 3.1561965942382812,
+ "learning_rate": 4.983833375323732e-06,
+ "loss": 0.7726,
+ "step": 556
+ },
+ {
+ "epoch": 0.26335697399527186,
+ "grad_norm": 3.2367217540740967,
+ "learning_rate": 4.9837624682671816e-06,
+ "loss": 0.6348,
+ "step": 557
+ },
+ {
+ "epoch": 0.26382978723404255,
+ "grad_norm": 2.8195858001708984,
+ "learning_rate": 4.983691406557633e-06,
+ "loss": 0.6387,
+ "step": 558
+ },
+ {
+ "epoch": 0.26430260047281323,
+ "grad_norm": 3.349820852279663,
+ "learning_rate": 4.983620190199511e-06,
+ "loss": 0.6776,
+ "step": 559
+ },
+ {
+ "epoch": 0.2647754137115839,
+ "grad_norm": 2.8025588989257812,
+ "learning_rate": 4.98354881919725e-06,
+ "loss": 0.6512,
+ "step": 560
+ },
+ {
+ "epoch": 0.2652482269503546,
+ "grad_norm": 2.9125499725341797,
+ "learning_rate": 4.983477293555295e-06,
+ "loss": 0.7024,
+ "step": 561
+ },
+ {
+ "epoch": 0.2657210401891253,
+ "grad_norm": 3.3479275703430176,
+ "learning_rate": 4.983405613278098e-06,
+ "loss": 0.688,
+ "step": 562
+ },
+ {
+ "epoch": 0.266193853427896,
+ "grad_norm": 3.123971462249756,
+ "learning_rate": 4.983333778370123e-06,
+ "loss": 0.6743,
+ "step": 563
+ },
+ {
+ "epoch": 0.26666666666666666,
+ "grad_norm": 2.891625165939331,
+ "learning_rate": 4.983261788835843e-06,
+ "loss": 0.5971,
+ "step": 564
+ },
+ {
+ "epoch": 0.26713947990543735,
+ "grad_norm": 3.5066864490509033,
+ "learning_rate": 4.98318964467974e-06,
+ "loss": 0.6958,
+ "step": 565
+ },
+ {
+ "epoch": 0.26761229314420804,
+ "grad_norm": 2.570547342300415,
+ "learning_rate": 4.983117345906306e-06,
+ "loss": 0.609,
+ "step": 566
+ },
+ {
+ "epoch": 0.2680851063829787,
+ "grad_norm": 3.005106210708618,
+ "learning_rate": 4.983044892520044e-06,
+ "loss": 0.6791,
+ "step": 567
+ },
+ {
+ "epoch": 0.2685579196217494,
+ "grad_norm": 3.429675340652466,
+ "learning_rate": 4.982972284525463e-06,
+ "loss": 0.6625,
+ "step": 568
+ },
+ {
+ "epoch": 0.2690307328605201,
+ "grad_norm": 3.825657367706299,
+ "learning_rate": 4.982899521927086e-06,
+ "loss": 0.6368,
+ "step": 569
+ },
+ {
+ "epoch": 0.2695035460992908,
+ "grad_norm": 2.8699095249176025,
+ "learning_rate": 4.982826604729443e-06,
+ "loss": 0.6425,
+ "step": 570
+ },
+ {
+ "epoch": 0.26997635933806147,
+ "grad_norm": 3.1688714027404785,
+ "learning_rate": 4.982753532937074e-06,
+ "loss": 0.6904,
+ "step": 571
+ },
+ {
+ "epoch": 0.27044917257683215,
+ "grad_norm": 3.3889992237091064,
+ "learning_rate": 4.98268030655453e-06,
+ "loss": 0.7575,
+ "step": 572
+ },
+ {
+ "epoch": 0.27092198581560284,
+ "grad_norm": 3.108315944671631,
+ "learning_rate": 4.982606925586367e-06,
+ "loss": 0.6648,
+ "step": 573
+ },
+ {
+ "epoch": 0.2713947990543735,
+ "grad_norm": 3.209831953048706,
+ "learning_rate": 4.982533390037159e-06,
+ "loss": 0.657,
+ "step": 574
+ },
+ {
+ "epoch": 0.2718676122931442,
+ "grad_norm": 3.1740927696228027,
+ "learning_rate": 4.982459699911482e-06,
+ "loss": 0.7262,
+ "step": 575
+ },
+ {
+ "epoch": 0.2723404255319149,
+ "grad_norm": 3.0190417766571045,
+ "learning_rate": 4.982385855213924e-06,
+ "loss": 0.6368,
+ "step": 576
+ },
+ {
+ "epoch": 0.2728132387706856,
+ "grad_norm": 3.05049467086792,
+ "learning_rate": 4.982311855949084e-06,
+ "loss": 0.72,
+ "step": 577
+ },
+ {
+ "epoch": 0.27328605200945627,
+ "grad_norm": 2.984816551208496,
+ "learning_rate": 4.98223770212157e-06,
+ "loss": 0.6856,
+ "step": 578
+ },
+ {
+ "epoch": 0.27375886524822696,
+ "grad_norm": 2.744969606399536,
+ "learning_rate": 4.982163393735998e-06,
+ "loss": 0.6023,
+ "step": 579
+ },
+ {
+ "epoch": 0.27423167848699764,
+ "grad_norm": 3.170564889907837,
+ "learning_rate": 4.982088930796996e-06,
+ "loss": 0.6678,
+ "step": 580
+ },
+ {
+ "epoch": 0.27470449172576833,
+ "grad_norm": 2.8686118125915527,
+ "learning_rate": 4.982014313309199e-06,
+ "loss": 0.6157,
+ "step": 581
+ },
+ {
+ "epoch": 0.275177304964539,
+ "grad_norm": 2.8768694400787354,
+ "learning_rate": 4.981939541277254e-06,
+ "loss": 0.6566,
+ "step": 582
+ },
+ {
+ "epoch": 0.2756501182033097,
+ "grad_norm": 2.621481418609619,
+ "learning_rate": 4.981864614705818e-06,
+ "loss": 0.7372,
+ "step": 583
+ },
+ {
+ "epoch": 0.2761229314420804,
+ "grad_norm": 3.527374267578125,
+ "learning_rate": 4.981789533599554e-06,
+ "loss": 0.6485,
+ "step": 584
+ },
+ {
+ "epoch": 0.2765957446808511,
+ "grad_norm": 3.3141074180603027,
+ "learning_rate": 4.981714297963138e-06,
+ "loss": 0.6816,
+ "step": 585
+ },
+ {
+ "epoch": 0.27706855791962176,
+ "grad_norm": 2.9247069358825684,
+ "learning_rate": 4.981638907801255e-06,
+ "loss": 0.7217,
+ "step": 586
+ },
+ {
+ "epoch": 0.27754137115839245,
+ "grad_norm": 2.875236749649048,
+ "learning_rate": 4.981563363118599e-06,
+ "loss": 0.6662,
+ "step": 587
+ },
+ {
+ "epoch": 0.27801418439716313,
+ "grad_norm": 2.9540364742279053,
+ "learning_rate": 4.981487663919874e-06,
+ "loss": 0.7225,
+ "step": 588
+ },
+ {
+ "epoch": 0.2784869976359338,
+ "grad_norm": 2.90889310836792,
+ "learning_rate": 4.981411810209793e-06,
+ "loss": 0.6054,
+ "step": 589
+ },
+ {
+ "epoch": 0.2789598108747045,
+ "grad_norm": 2.8541409969329834,
+ "learning_rate": 4.981335801993078e-06,
+ "loss": 0.6539,
+ "step": 590
+ },
+ {
+ "epoch": 0.2794326241134752,
+ "grad_norm": 3.1600730419158936,
+ "learning_rate": 4.981259639274465e-06,
+ "loss": 0.6415,
+ "step": 591
+ },
+ {
+ "epoch": 0.2799054373522459,
+ "grad_norm": 3.569376230239868,
+ "learning_rate": 4.981183322058693e-06,
+ "loss": 0.6944,
+ "step": 592
+ },
+ {
+ "epoch": 0.28037825059101656,
+ "grad_norm": 3.067667007446289,
+ "learning_rate": 4.981106850350515e-06,
+ "loss": 0.7378,
+ "step": 593
+ },
+ {
+ "epoch": 0.28085106382978725,
+ "grad_norm": 3.082073450088501,
+ "learning_rate": 4.981030224154693e-06,
+ "loss": 0.693,
+ "step": 594
+ },
+ {
+ "epoch": 0.28132387706855794,
+ "grad_norm": 2.902932643890381,
+ "learning_rate": 4.980953443475998e-06,
+ "loss": 0.6549,
+ "step": 595
+ },
+ {
+ "epoch": 0.2817966903073286,
+ "grad_norm": 2.6821181774139404,
+ "learning_rate": 4.980876508319211e-06,
+ "loss": 0.6231,
+ "step": 596
+ },
+ {
+ "epoch": 0.2822695035460993,
+ "grad_norm": 3.1747355461120605,
+ "learning_rate": 4.9807994186891215e-06,
+ "loss": 0.6826,
+ "step": 597
+ },
+ {
+ "epoch": 0.28274231678487,
+ "grad_norm": 2.6975860595703125,
+ "learning_rate": 4.980722174590531e-06,
+ "loss": 0.6669,
+ "step": 598
+ },
+ {
+ "epoch": 0.2832151300236407,
+ "grad_norm": 2.924285650253296,
+ "learning_rate": 4.9806447760282486e-06,
+ "loss": 0.689,
+ "step": 599
+ },
+ {
+ "epoch": 0.28368794326241137,
+ "grad_norm": 2.941417694091797,
+ "learning_rate": 4.980567223007093e-06,
+ "loss": 0.6672,
+ "step": 600
+ },
+ {
+ "epoch": 0.28416075650118205,
+ "grad_norm": 2.8582186698913574,
+ "learning_rate": 4.980489515531892e-06,
+ "loss": 0.6229,
+ "step": 601
+ },
+ {
+ "epoch": 0.28463356973995274,
+ "grad_norm": 2.6462013721466064,
+ "learning_rate": 4.9804116536074865e-06,
+ "loss": 0.606,
+ "step": 602
+ },
+ {
+ "epoch": 0.2851063829787234,
+ "grad_norm": 2.9029998779296875,
+ "learning_rate": 4.980333637238723e-06,
+ "loss": 0.5915,
+ "step": 603
+ },
+ {
+ "epoch": 0.2855791962174941,
+ "grad_norm": 3.9359042644500732,
+ "learning_rate": 4.980255466430462e-06,
+ "loss": 0.7035,
+ "step": 604
+ },
+ {
+ "epoch": 0.2860520094562648,
+ "grad_norm": 3.200524091720581,
+ "learning_rate": 4.980177141187566e-06,
+ "loss": 0.7156,
+ "step": 605
+ },
+ {
+ "epoch": 0.2865248226950355,
+ "grad_norm": 3.1708686351776123,
+ "learning_rate": 4.980098661514916e-06,
+ "loss": 0.746,
+ "step": 606
+ },
+ {
+ "epoch": 0.28699763593380617,
+ "grad_norm": 2.8926830291748047,
+ "learning_rate": 4.980020027417397e-06,
+ "loss": 0.6282,
+ "step": 607
+ },
+ {
+ "epoch": 0.28747044917257686,
+ "grad_norm": 3.0526294708251953,
+ "learning_rate": 4.979941238899906e-06,
+ "loss": 0.6594,
+ "step": 608
+ },
+ {
+ "epoch": 0.28794326241134754,
+ "grad_norm": 2.9869306087493896,
+ "learning_rate": 4.9798622959673486e-06,
+ "loss": 0.7771,
+ "step": 609
+ },
+ {
+ "epoch": 0.28841607565011823,
+ "grad_norm": 2.7894513607025146,
+ "learning_rate": 4.979783198624638e-06,
+ "loss": 0.6819,
+ "step": 610
+ },
+ {
+ "epoch": 0.28888888888888886,
+ "grad_norm": 2.958575963973999,
+ "learning_rate": 4.9797039468767025e-06,
+ "loss": 0.6474,
+ "step": 611
+ },
+ {
+ "epoch": 0.28936170212765955,
+ "grad_norm": 3.423748016357422,
+ "learning_rate": 4.979624540728475e-06,
+ "loss": 0.7389,
+ "step": 612
+ },
+ {
+ "epoch": 0.28983451536643023,
+ "grad_norm": 2.9641635417938232,
+ "learning_rate": 4.9795449801849e-06,
+ "loss": 0.6005,
+ "step": 613
+ },
+ {
+ "epoch": 0.2903073286052009,
+ "grad_norm": 3.02274227142334,
+ "learning_rate": 4.979465265250933e-06,
+ "loss": 0.6358,
+ "step": 614
+ },
+ {
+ "epoch": 0.2907801418439716,
+ "grad_norm": 3.0562758445739746,
+ "learning_rate": 4.979385395931534e-06,
+ "loss": 0.6313,
+ "step": 615
+ },
+ {
+ "epoch": 0.2912529550827423,
+ "grad_norm": 3.301816701889038,
+ "learning_rate": 4.97930537223168e-06,
+ "loss": 0.7264,
+ "step": 616
+ },
+ {
+ "epoch": 0.291725768321513,
+ "grad_norm": 2.975360870361328,
+ "learning_rate": 4.979225194156351e-06,
+ "loss": 0.613,
+ "step": 617
+ },
+ {
+ "epoch": 0.29219858156028367,
+ "grad_norm": 2.9245030879974365,
+ "learning_rate": 4.97914486171054e-06,
+ "loss": 0.6646,
+ "step": 618
+ },
+ {
+ "epoch": 0.29267139479905435,
+ "grad_norm": 3.1336188316345215,
+ "learning_rate": 4.979064374899249e-06,
+ "loss": 0.6421,
+ "step": 619
+ },
+ {
+ "epoch": 0.29314420803782504,
+ "grad_norm": 3.6298763751983643,
+ "learning_rate": 4.978983733727491e-06,
+ "loss": 0.6433,
+ "step": 620
+ },
+ {
+ "epoch": 0.2936170212765957,
+ "grad_norm": 2.919597625732422,
+ "learning_rate": 4.9789029382002845e-06,
+ "loss": 0.6288,
+ "step": 621
+ },
+ {
+ "epoch": 0.2940898345153664,
+ "grad_norm": 3.2206127643585205,
+ "learning_rate": 4.978821988322662e-06,
+ "loss": 0.7102,
+ "step": 622
+ },
+ {
+ "epoch": 0.2945626477541371,
+ "grad_norm": 3.1767101287841797,
+ "learning_rate": 4.978740884099664e-06,
+ "loss": 0.6722,
+ "step": 623
+ },
+ {
+ "epoch": 0.2950354609929078,
+ "grad_norm": 3.3425452709198,
+ "learning_rate": 4.97865962553634e-06,
+ "loss": 0.6492,
+ "step": 624
+ },
+ {
+ "epoch": 0.29550827423167847,
+ "grad_norm": 3.0408358573913574,
+ "learning_rate": 4.97857821263775e-06,
+ "loss": 0.6522,
+ "step": 625
+ },
+ {
+ "epoch": 0.29598108747044916,
+ "grad_norm": 2.8144783973693848,
+ "learning_rate": 4.978496645408963e-06,
+ "loss": 0.7237,
+ "step": 626
+ },
+ {
+ "epoch": 0.29645390070921984,
+ "grad_norm": 3.7010560035705566,
+ "learning_rate": 4.978414923855057e-06,
+ "loss": 0.7509,
+ "step": 627
+ },
+ {
+ "epoch": 0.29692671394799053,
+ "grad_norm": 2.9438371658325195,
+ "learning_rate": 4.978333047981122e-06,
+ "loss": 0.6244,
+ "step": 628
+ },
+ {
+ "epoch": 0.2973995271867612,
+ "grad_norm": 3.285982370376587,
+ "learning_rate": 4.978251017792255e-06,
+ "loss": 0.7553,
+ "step": 629
+ },
+ {
+ "epoch": 0.2978723404255319,
+ "grad_norm": 3.7021138668060303,
+ "learning_rate": 4.978168833293564e-06,
+ "loss": 0.7859,
+ "step": 630
+ },
+ {
+ "epoch": 0.2983451536643026,
+ "grad_norm": 3.481858730316162,
+ "learning_rate": 4.9780864944901654e-06,
+ "loss": 0.7146,
+ "step": 631
+ },
+ {
+ "epoch": 0.2988179669030733,
+ "grad_norm": 3.693824529647827,
+ "learning_rate": 4.978004001387188e-06,
+ "loss": 0.6608,
+ "step": 632
+ },
+ {
+ "epoch": 0.29929078014184396,
+ "grad_norm": 3.0069146156311035,
+ "learning_rate": 4.9779213539897665e-06,
+ "loss": 0.6506,
+ "step": 633
+ },
+ {
+ "epoch": 0.29976359338061465,
+ "grad_norm": 3.037644147872925,
+ "learning_rate": 4.977838552303048e-06,
+ "loss": 0.6487,
+ "step": 634
+ },
+ {
+ "epoch": 0.30023640661938533,
+ "grad_norm": 3.018554449081421,
+ "learning_rate": 4.977755596332188e-06,
+ "loss": 0.6128,
+ "step": 635
+ },
+ {
+ "epoch": 0.300709219858156,
+ "grad_norm": 3.000312089920044,
+ "learning_rate": 4.977672486082351e-06,
+ "loss": 0.6431,
+ "step": 636
+ },
+ {
+ "epoch": 0.3011820330969267,
+ "grad_norm": 2.836803913116455,
+ "learning_rate": 4.977589221558713e-06,
+ "loss": 0.5914,
+ "step": 637
+ },
+ {
+ "epoch": 0.3016548463356974,
+ "grad_norm": 3.080469846725464,
+ "learning_rate": 4.977505802766457e-06,
+ "loss": 0.7265,
+ "step": 638
+ },
+ {
+ "epoch": 0.3021276595744681,
+ "grad_norm": 3.2245471477508545,
+ "learning_rate": 4.97742222971078e-06,
+ "loss": 0.6895,
+ "step": 639
+ },
+ {
+ "epoch": 0.30260047281323876,
+ "grad_norm": 3.559006452560425,
+ "learning_rate": 4.977338502396882e-06,
+ "loss": 0.7439,
+ "step": 640
+ },
+ {
+ "epoch": 0.30307328605200945,
+ "grad_norm": 2.9116289615631104,
+ "learning_rate": 4.9772546208299795e-06,
+ "loss": 0.6907,
+ "step": 641
+ },
+ {
+ "epoch": 0.30354609929078014,
+ "grad_norm": 3.3645524978637695,
+ "learning_rate": 4.977170585015295e-06,
+ "loss": 0.6983,
+ "step": 642
+ },
+ {
+ "epoch": 0.3040189125295508,
+ "grad_norm": 3.080148458480835,
+ "learning_rate": 4.977086394958058e-06,
+ "loss": 0.7016,
+ "step": 643
+ },
+ {
+ "epoch": 0.3044917257683215,
+ "grad_norm": 2.9276750087738037,
+ "learning_rate": 4.977002050663515e-06,
+ "loss": 0.6509,
+ "step": 644
+ },
+ {
+ "epoch": 0.3049645390070922,
+ "grad_norm": 3.183609962463379,
+ "learning_rate": 4.976917552136914e-06,
+ "loss": 0.6814,
+ "step": 645
+ },
+ {
+ "epoch": 0.3054373522458629,
+ "grad_norm": 3.0980000495910645,
+ "learning_rate": 4.976832899383519e-06,
+ "loss": 0.6319,
+ "step": 646
+ },
+ {
+ "epoch": 0.30591016548463357,
+ "grad_norm": 3.211376190185547,
+ "learning_rate": 4.9767480924086e-06,
+ "loss": 0.6365,
+ "step": 647
+ },
+ {
+ "epoch": 0.30638297872340425,
+ "grad_norm": 3.214430093765259,
+ "learning_rate": 4.976663131217437e-06,
+ "loss": 0.6006,
+ "step": 648
+ },
+ {
+ "epoch": 0.30685579196217494,
+ "grad_norm": 3.0914318561553955,
+ "learning_rate": 4.976578015815321e-06,
+ "loss": 0.7162,
+ "step": 649
+ },
+ {
+ "epoch": 0.3073286052009456,
+ "grad_norm": 2.7644500732421875,
+ "learning_rate": 4.976492746207551e-06,
+ "loss": 0.6045,
+ "step": 650
+ },
+ {
+ "epoch": 0.3078014184397163,
+ "grad_norm": 3.1913280487060547,
+ "learning_rate": 4.9764073223994374e-06,
+ "loss": 0.6796,
+ "step": 651
+ },
+ {
+ "epoch": 0.308274231678487,
+ "grad_norm": 2.8919692039489746,
+ "learning_rate": 4.976321744396299e-06,
+ "loss": 0.6683,
+ "step": 652
+ },
+ {
+ "epoch": 0.3087470449172577,
+ "grad_norm": 2.862234115600586,
+ "learning_rate": 4.976236012203463e-06,
+ "loss": 0.6631,
+ "step": 653
+ },
+ {
+ "epoch": 0.30921985815602837,
+ "grad_norm": 2.9708092212677,
+ "learning_rate": 4.976150125826268e-06,
+ "loss": 0.6326,
+ "step": 654
+ },
+ {
+ "epoch": 0.30969267139479906,
+ "grad_norm": 2.892465353012085,
+ "learning_rate": 4.976064085270063e-06,
+ "loss": 0.6574,
+ "step": 655
+ },
+ {
+ "epoch": 0.31016548463356974,
+ "grad_norm": 3.9215126037597656,
+ "learning_rate": 4.975977890540205e-06,
+ "loss": 0.7351,
+ "step": 656
+ },
+ {
+ "epoch": 0.31063829787234043,
+ "grad_norm": 2.9544081687927246,
+ "learning_rate": 4.975891541642059e-06,
+ "loss": 0.7264,
+ "step": 657
+ },
+ {
+ "epoch": 0.3111111111111111,
+ "grad_norm": 2.995035409927368,
+ "learning_rate": 4.975805038581005e-06,
+ "loss": 0.7405,
+ "step": 658
+ },
+ {
+ "epoch": 0.3115839243498818,
+ "grad_norm": 2.9653120040893555,
+ "learning_rate": 4.975718381362427e-06,
+ "loss": 0.679,
+ "step": 659
+ },
+ {
+ "epoch": 0.3120567375886525,
+ "grad_norm": 2.93976092338562,
+ "learning_rate": 4.9756315699917205e-06,
+ "loss": 0.627,
+ "step": 660
+ },
+ {
+ "epoch": 0.3125295508274232,
+ "grad_norm": 3.106522560119629,
+ "learning_rate": 4.9755446044742915e-06,
+ "loss": 0.6329,
+ "step": 661
+ },
+ {
+ "epoch": 0.31300236406619386,
+ "grad_norm": 3.0238280296325684,
+ "learning_rate": 4.975457484815554e-06,
+ "loss": 0.6643,
+ "step": 662
+ },
+ {
+ "epoch": 0.31347517730496455,
+ "grad_norm": 2.943528175354004,
+ "learning_rate": 4.9753702110209356e-06,
+ "loss": 0.668,
+ "step": 663
+ },
+ {
+ "epoch": 0.31394799054373523,
+ "grad_norm": 2.6840121746063232,
+ "learning_rate": 4.9752827830958676e-06,
+ "loss": 0.5482,
+ "step": 664
+ },
+ {
+ "epoch": 0.3144208037825059,
+ "grad_norm": 2.823875904083252,
+ "learning_rate": 4.975195201045794e-06,
+ "loss": 0.7017,
+ "step": 665
+ },
+ {
+ "epoch": 0.3148936170212766,
+ "grad_norm": 3.148181200027466,
+ "learning_rate": 4.975107464876168e-06,
+ "loss": 0.747,
+ "step": 666
+ },
+ {
+ "epoch": 0.3153664302600473,
+ "grad_norm": 2.630584478378296,
+ "learning_rate": 4.9750195745924545e-06,
+ "loss": 0.5987,
+ "step": 667
+ },
+ {
+ "epoch": 0.315839243498818,
+ "grad_norm": 3.075866460800171,
+ "learning_rate": 4.974931530200124e-06,
+ "loss": 0.664,
+ "step": 668
+ },
+ {
+ "epoch": 0.31631205673758866,
+ "grad_norm": 2.947197914123535,
+ "learning_rate": 4.974843331704659e-06,
+ "loss": 0.631,
+ "step": 669
+ },
+ {
+ "epoch": 0.31678486997635935,
+ "grad_norm": 3.519646644592285,
+ "learning_rate": 4.974754979111552e-06,
+ "loss": 0.7154,
+ "step": 670
+ },
+ {
+ "epoch": 0.31725768321513004,
+ "grad_norm": 2.8687186241149902,
+ "learning_rate": 4.974666472426305e-06,
+ "loss": 0.6366,
+ "step": 671
+ },
+ {
+ "epoch": 0.3177304964539007,
+ "grad_norm": 2.6966612339019775,
+ "learning_rate": 4.974577811654426e-06,
+ "loss": 0.7112,
+ "step": 672
+ },
+ {
+ "epoch": 0.3182033096926714,
+ "grad_norm": 3.1390228271484375,
+ "learning_rate": 4.974488996801439e-06,
+ "loss": 0.6882,
+ "step": 673
+ },
+ {
+ "epoch": 0.3186761229314421,
+ "grad_norm": 3.4667599201202393,
+ "learning_rate": 4.974400027872871e-06,
+ "loss": 0.7153,
+ "step": 674
+ },
+ {
+ "epoch": 0.3191489361702128,
+ "grad_norm": 2.9632184505462646,
+ "learning_rate": 4.974310904874265e-06,
+ "loss": 0.7081,
+ "step": 675
+ },
+ {
+ "epoch": 0.31962174940898347,
+ "grad_norm": 3.46150279045105,
+ "learning_rate": 4.9742216278111666e-06,
+ "loss": 0.6242,
+ "step": 676
+ },
+ {
+ "epoch": 0.32009456264775416,
+ "grad_norm": 3.380403757095337,
+ "learning_rate": 4.974132196689137e-06,
+ "loss": 0.6863,
+ "step": 677
+ },
+ {
+ "epoch": 0.32056737588652484,
+ "grad_norm": 3.4279606342315674,
+ "learning_rate": 4.974042611513746e-06,
+ "loss": 0.6388,
+ "step": 678
+ },
+ {
+ "epoch": 0.3210401891252955,
+ "grad_norm": 2.634523391723633,
+ "learning_rate": 4.973952872290568e-06,
+ "loss": 0.6038,
+ "step": 679
+ },
+ {
+ "epoch": 0.3215130023640662,
+ "grad_norm": 3.19693922996521,
+ "learning_rate": 4.973862979025194e-06,
+ "loss": 0.6383,
+ "step": 680
+ },
+ {
+ "epoch": 0.3219858156028369,
+ "grad_norm": 3.437692165374756,
+ "learning_rate": 4.973772931723218e-06,
+ "loss": 0.7288,
+ "step": 681
+ },
+ {
+ "epoch": 0.3224586288416076,
+ "grad_norm": 2.506301164627075,
+ "learning_rate": 4.97368273039025e-06,
+ "loss": 0.5707,
+ "step": 682
+ },
+ {
+ "epoch": 0.3229314420803783,
+ "grad_norm": 3.0942845344543457,
+ "learning_rate": 4.9735923750319044e-06,
+ "loss": 0.6348,
+ "step": 683
+ },
+ {
+ "epoch": 0.32340425531914896,
+ "grad_norm": 3.0889835357666016,
+ "learning_rate": 4.973501865653809e-06,
+ "loss": 0.6697,
+ "step": 684
+ },
+ {
+ "epoch": 0.32387706855791965,
+ "grad_norm": 3.0391931533813477,
+ "learning_rate": 4.973411202261598e-06,
+ "loss": 0.7091,
+ "step": 685
+ },
+ {
+ "epoch": 0.32434988179669033,
+ "grad_norm": 3.0333497524261475,
+ "learning_rate": 4.973320384860917e-06,
+ "loss": 0.6403,
+ "step": 686
+ },
+ {
+ "epoch": 0.324822695035461,
+ "grad_norm": 2.9714622497558594,
+ "learning_rate": 4.973229413457421e-06,
+ "loss": 0.6977,
+ "step": 687
+ },
+ {
+ "epoch": 0.3252955082742317,
+ "grad_norm": 3.057558298110962,
+ "learning_rate": 4.973138288056774e-06,
+ "loss": 0.7236,
+ "step": 688
+ },
+ {
+ "epoch": 0.3257683215130024,
+ "grad_norm": 2.921093463897705,
+ "learning_rate": 4.97304700866465e-06,
+ "loss": 0.576,
+ "step": 689
+ },
+ {
+ "epoch": 0.3262411347517731,
+ "grad_norm": 3.0287256240844727,
+ "learning_rate": 4.972955575286732e-06,
+ "loss": 0.7077,
+ "step": 690
+ },
+ {
+ "epoch": 0.32671394799054376,
+ "grad_norm": 2.8621346950531006,
+ "learning_rate": 4.972863987928716e-06,
+ "loss": 0.6952,
+ "step": 691
+ },
+ {
+ "epoch": 0.3271867612293144,
+ "grad_norm": 2.631359100341797,
+ "learning_rate": 4.9727722465963006e-06,
+ "loss": 0.6931,
+ "step": 692
+ },
+ {
+ "epoch": 0.3276595744680851,
+ "grad_norm": 2.8484320640563965,
+ "learning_rate": 4.972680351295201e-06,
+ "loss": 0.6292,
+ "step": 693
+ },
+ {
+ "epoch": 0.32813238770685577,
+ "grad_norm": 2.593001365661621,
+ "learning_rate": 4.972588302031138e-06,
+ "loss": 0.5942,
+ "step": 694
+ },
+ {
+ "epoch": 0.32860520094562645,
+ "grad_norm": 2.6321065425872803,
+ "learning_rate": 4.972496098809844e-06,
+ "loss": 0.65,
+ "step": 695
+ },
+ {
+ "epoch": 0.32907801418439714,
+ "grad_norm": 3.2516732215881348,
+ "learning_rate": 4.972403741637059e-06,
+ "loss": 0.7385,
+ "step": 696
+ },
+ {
+ "epoch": 0.3295508274231678,
+ "grad_norm": 3.180854320526123,
+ "learning_rate": 4.972311230518535e-06,
+ "loss": 0.6569,
+ "step": 697
+ },
+ {
+ "epoch": 0.3300236406619385,
+ "grad_norm": 4.161016941070557,
+ "learning_rate": 4.972218565460031e-06,
+ "loss": 0.6416,
+ "step": 698
+ },
+ {
+ "epoch": 0.3304964539007092,
+ "grad_norm": 3.153897762298584,
+ "learning_rate": 4.972125746467317e-06,
+ "loss": 0.7196,
+ "step": 699
+ },
+ {
+ "epoch": 0.3309692671394799,
+ "grad_norm": 2.9595556259155273,
+ "learning_rate": 4.972032773546173e-06,
+ "loss": 0.7093,
+ "step": 700
+ },
+ {
+ "epoch": 0.33144208037825057,
+ "grad_norm": 3.1086833477020264,
+ "learning_rate": 4.9719396467023875e-06,
+ "loss": 0.6963,
+ "step": 701
+ },
+ {
+ "epoch": 0.33191489361702126,
+ "grad_norm": 2.958921432495117,
+ "learning_rate": 4.971846365941759e-06,
+ "loss": 0.6518,
+ "step": 702
+ },
+ {
+ "epoch": 0.33238770685579194,
+ "grad_norm": 2.8745479583740234,
+ "learning_rate": 4.971752931270096e-06,
+ "loss": 0.696,
+ "step": 703
+ },
+ {
+ "epoch": 0.33286052009456263,
+ "grad_norm": 3.224358558654785,
+ "learning_rate": 4.971659342693217e-06,
+ "loss": 0.6769,
+ "step": 704
+ },
+ {
+ "epoch": 0.3333333333333333,
+ "grad_norm": 2.696319580078125,
+ "learning_rate": 4.9715656002169486e-06,
+ "loss": 0.6833,
+ "step": 705
+ },
+ {
+ "epoch": 0.333806146572104,
+ "grad_norm": 2.9283502101898193,
+ "learning_rate": 4.971471703847127e-06,
+ "loss": 0.6784,
+ "step": 706
+ },
+ {
+ "epoch": 0.3342789598108747,
+ "grad_norm": 2.654914140701294,
+ "learning_rate": 4.9713776535896e-06,
+ "loss": 0.6337,
+ "step": 707
+ },
+ {
+ "epoch": 0.3347517730496454,
+ "grad_norm": 3.041555643081665,
+ "learning_rate": 4.971283449450224e-06,
+ "loss": 0.6227,
+ "step": 708
+ },
+ {
+ "epoch": 0.33522458628841606,
+ "grad_norm": 2.893008232116699,
+ "learning_rate": 4.971189091434863e-06,
+ "loss": 0.655,
+ "step": 709
+ },
+ {
+ "epoch": 0.33569739952718675,
+ "grad_norm": 2.8806653022766113,
+ "learning_rate": 4.971094579549393e-06,
+ "loss": 0.7077,
+ "step": 710
+ },
+ {
+ "epoch": 0.33617021276595743,
+ "grad_norm": 3.4830048084259033,
+ "learning_rate": 4.9709999137996986e-06,
+ "loss": 0.7461,
+ "step": 711
+ },
+ {
+ "epoch": 0.3366430260047281,
+ "grad_norm": 3.155444860458374,
+ "learning_rate": 4.970905094191674e-06,
+ "loss": 0.652,
+ "step": 712
+ },
+ {
+ "epoch": 0.3371158392434988,
+ "grad_norm": 2.7608706951141357,
+ "learning_rate": 4.970810120731225e-06,
+ "loss": 0.684,
+ "step": 713
+ },
+ {
+ "epoch": 0.3375886524822695,
+ "grad_norm": 2.8209474086761475,
+ "learning_rate": 4.970714993424265e-06,
+ "loss": 0.6009,
+ "step": 714
+ },
+ {
+ "epoch": 0.3380614657210402,
+ "grad_norm": 3.6532654762268066,
+ "learning_rate": 4.9706197122767145e-06,
+ "loss": 0.702,
+ "step": 715
+ },
+ {
+ "epoch": 0.33853427895981086,
+ "grad_norm": 2.6276566982269287,
+ "learning_rate": 4.970524277294508e-06,
+ "loss": 0.6338,
+ "step": 716
+ },
+ {
+ "epoch": 0.33900709219858155,
+ "grad_norm": 3.509871482849121,
+ "learning_rate": 4.970428688483589e-06,
+ "loss": 0.6853,
+ "step": 717
+ },
+ {
+ "epoch": 0.33947990543735224,
+ "grad_norm": 5.332682132720947,
+ "learning_rate": 4.970332945849906e-06,
+ "loss": 0.6684,
+ "step": 718
+ },
+ {
+ "epoch": 0.3399527186761229,
+ "grad_norm": 2.718801975250244,
+ "learning_rate": 4.970237049399424e-06,
+ "loss": 0.6676,
+ "step": 719
+ },
+ {
+ "epoch": 0.3404255319148936,
+ "grad_norm": 3.891003131866455,
+ "learning_rate": 4.970140999138112e-06,
+ "loss": 0.7043,
+ "step": 720
+ },
+ {
+ "epoch": 0.3408983451536643,
+ "grad_norm": 2.8863155841827393,
+ "learning_rate": 4.970044795071951e-06,
+ "loss": 0.6563,
+ "step": 721
+ },
+ {
+ "epoch": 0.341371158392435,
+ "grad_norm": 3.2527518272399902,
+ "learning_rate": 4.969948437206932e-06,
+ "loss": 0.7244,
+ "step": 722
+ },
+ {
+ "epoch": 0.34184397163120567,
+ "grad_norm": 2.9726758003234863,
+ "learning_rate": 4.969851925549054e-06,
+ "loss": 0.6548,
+ "step": 723
+ },
+ {
+ "epoch": 0.34231678486997635,
+ "grad_norm": 3.118309497833252,
+ "learning_rate": 4.969755260104327e-06,
+ "loss": 0.7293,
+ "step": 724
+ },
+ {
+ "epoch": 0.34278959810874704,
+ "grad_norm": 3.373068332672119,
+ "learning_rate": 4.969658440878769e-06,
+ "loss": 0.6444,
+ "step": 725
+ },
+ {
+ "epoch": 0.3432624113475177,
+ "grad_norm": 2.7157437801361084,
+ "learning_rate": 4.969561467878409e-06,
+ "loss": 0.642,
+ "step": 726
+ },
+ {
+ "epoch": 0.3437352245862884,
+ "grad_norm": 2.58929705619812,
+ "learning_rate": 4.969464341109285e-06,
+ "loss": 0.6165,
+ "step": 727
+ },
+ {
+ "epoch": 0.3442080378250591,
+ "grad_norm": 2.8811306953430176,
+ "learning_rate": 4.969367060577445e-06,
+ "loss": 0.7127,
+ "step": 728
+ },
+ {
+ "epoch": 0.3446808510638298,
+ "grad_norm": 3.494358539581299,
+ "learning_rate": 4.969269626288946e-06,
+ "loss": 0.7103,
+ "step": 729
+ },
+ {
+ "epoch": 0.34515366430260047,
+ "grad_norm": 2.9753928184509277,
+ "learning_rate": 4.969172038249855e-06,
+ "loss": 0.6911,
+ "step": 730
+ },
+ {
+ "epoch": 0.34562647754137116,
+ "grad_norm": 3.2885913848876953,
+ "learning_rate": 4.969074296466247e-06,
+ "loss": 0.6968,
+ "step": 731
+ },
+ {
+ "epoch": 0.34609929078014184,
+ "grad_norm": 2.7564568519592285,
+ "learning_rate": 4.968976400944211e-06,
+ "loss": 0.6843,
+ "step": 732
+ },
+ {
+ "epoch": 0.34657210401891253,
+ "grad_norm": 2.9255006313323975,
+ "learning_rate": 4.96887835168984e-06,
+ "loss": 0.6024,
+ "step": 733
+ },
+ {
+ "epoch": 0.3470449172576832,
+ "grad_norm": 3.1808290481567383,
+ "learning_rate": 4.968780148709239e-06,
+ "loss": 0.7377,
+ "step": 734
+ },
+ {
+ "epoch": 0.3475177304964539,
+ "grad_norm": 2.956666946411133,
+ "learning_rate": 4.968681792008523e-06,
+ "loss": 0.65,
+ "step": 735
+ },
+ {
+ "epoch": 0.3479905437352246,
+ "grad_norm": 2.9631855487823486,
+ "learning_rate": 4.9685832815938175e-06,
+ "loss": 0.677,
+ "step": 736
+ },
+ {
+ "epoch": 0.3484633569739953,
+ "grad_norm": 2.501917600631714,
+ "learning_rate": 4.968484617471256e-06,
+ "loss": 0.6282,
+ "step": 737
+ },
+ {
+ "epoch": 0.34893617021276596,
+ "grad_norm": 2.750779628753662,
+ "learning_rate": 4.968385799646981e-06,
+ "loss": 0.6507,
+ "step": 738
+ },
+ {
+ "epoch": 0.34940898345153665,
+ "grad_norm": 2.872300624847412,
+ "learning_rate": 4.968286828127146e-06,
+ "loss": 0.5949,
+ "step": 739
+ },
+ {
+ "epoch": 0.34988179669030733,
+ "grad_norm": 2.6316142082214355,
+ "learning_rate": 4.9681877029179124e-06,
+ "loss": 0.6328,
+ "step": 740
+ },
+ {
+ "epoch": 0.350354609929078,
+ "grad_norm": 3.244364023208618,
+ "learning_rate": 4.968088424025454e-06,
+ "loss": 0.7393,
+ "step": 741
+ },
+ {
+ "epoch": 0.3508274231678487,
+ "grad_norm": 2.620465040206909,
+ "learning_rate": 4.967988991455951e-06,
+ "loss": 0.6797,
+ "step": 742
+ },
+ {
+ "epoch": 0.3513002364066194,
+ "grad_norm": 2.854513645172119,
+ "learning_rate": 4.967889405215596e-06,
+ "loss": 0.6368,
+ "step": 743
+ },
+ {
+ "epoch": 0.3517730496453901,
+ "grad_norm": 2.579854726791382,
+ "learning_rate": 4.9677896653105886e-06,
+ "loss": 0.6489,
+ "step": 744
+ },
+ {
+ "epoch": 0.35224586288416077,
+ "grad_norm": 3.0697381496429443,
+ "learning_rate": 4.96768977174714e-06,
+ "loss": 0.6313,
+ "step": 745
+ },
+ {
+ "epoch": 0.35271867612293145,
+ "grad_norm": 3.369338035583496,
+ "learning_rate": 4.96758972453147e-06,
+ "loss": 0.7416,
+ "step": 746
+ },
+ {
+ "epoch": 0.35319148936170214,
+ "grad_norm": 2.836221933364868,
+ "learning_rate": 4.967489523669807e-06,
+ "loss": 0.6422,
+ "step": 747
+ },
+ {
+ "epoch": 0.3536643026004728,
+ "grad_norm": 2.929579496383667,
+ "learning_rate": 4.967389169168392e-06,
+ "loss": 0.6482,
+ "step": 748
+ },
+ {
+ "epoch": 0.3541371158392435,
+ "grad_norm": 2.9243831634521484,
+ "learning_rate": 4.967288661033472e-06,
+ "loss": 0.5813,
+ "step": 749
+ },
+ {
+ "epoch": 0.3546099290780142,
+ "grad_norm": 3.7555336952209473,
+ "learning_rate": 4.967187999271306e-06,
+ "loss": 0.6501,
+ "step": 750
+ },
+ {
+ "epoch": 0.3550827423167849,
+ "grad_norm": 3.4279143810272217,
+ "learning_rate": 4.9670871838881615e-06,
+ "loss": 0.6326,
+ "step": 751
+ },
+ {
+ "epoch": 0.35555555555555557,
+ "grad_norm": 2.875066041946411,
+ "learning_rate": 4.9669862148903166e-06,
+ "loss": 0.664,
+ "step": 752
+ },
+ {
+ "epoch": 0.35602836879432626,
+ "grad_norm": 3.130394697189331,
+ "learning_rate": 4.966885092284057e-06,
+ "loss": 0.706,
+ "step": 753
+ },
+ {
+ "epoch": 0.35650118203309694,
+ "grad_norm": 2.9606287479400635,
+ "learning_rate": 4.96678381607568e-06,
+ "loss": 0.693,
+ "step": 754
+ },
+ {
+ "epoch": 0.35697399527186763,
+ "grad_norm": 3.0584909915924072,
+ "learning_rate": 4.966682386271491e-06,
+ "loss": 0.6034,
+ "step": 755
+ },
+ {
+ "epoch": 0.3574468085106383,
+ "grad_norm": 2.8215200901031494,
+ "learning_rate": 4.966580802877805e-06,
+ "loss": 0.6217,
+ "step": 756
+ },
+ {
+ "epoch": 0.357919621749409,
+ "grad_norm": 2.7348055839538574,
+ "learning_rate": 4.966479065900949e-06,
+ "loss": 0.6194,
+ "step": 757
+ },
+ {
+ "epoch": 0.3583924349881797,
+ "grad_norm": 3.2347466945648193,
+ "learning_rate": 4.966377175347257e-06,
+ "loss": 0.6377,
+ "step": 758
+ },
+ {
+ "epoch": 0.3588652482269504,
+ "grad_norm": 3.311845302581787,
+ "learning_rate": 4.966275131223072e-06,
+ "loss": 0.6234,
+ "step": 759
+ },
+ {
+ "epoch": 0.35933806146572106,
+ "grad_norm": 3.0384368896484375,
+ "learning_rate": 4.96617293353475e-06,
+ "loss": 0.609,
+ "step": 760
+ },
+ {
+ "epoch": 0.35981087470449175,
+ "grad_norm": 3.516854763031006,
+ "learning_rate": 4.966070582288653e-06,
+ "loss": 0.6627,
+ "step": 761
+ },
+ {
+ "epoch": 0.36028368794326243,
+ "grad_norm": 3.2425215244293213,
+ "learning_rate": 4.9659680774911534e-06,
+ "loss": 0.7355,
+ "step": 762
+ },
+ {
+ "epoch": 0.3607565011820331,
+ "grad_norm": 3.2665750980377197,
+ "learning_rate": 4.965865419148636e-06,
+ "loss": 0.6787,
+ "step": 763
+ },
+ {
+ "epoch": 0.3612293144208038,
+ "grad_norm": 2.729428291320801,
+ "learning_rate": 4.96576260726749e-06,
+ "loss": 0.6272,
+ "step": 764
+ },
+ {
+ "epoch": 0.3617021276595745,
+ "grad_norm": 3.299969434738159,
+ "learning_rate": 4.965659641854119e-06,
+ "loss": 0.6552,
+ "step": 765
+ },
+ {
+ "epoch": 0.3621749408983452,
+ "grad_norm": 2.7090916633605957,
+ "learning_rate": 4.965556522914934e-06,
+ "loss": 0.6661,
+ "step": 766
+ },
+ {
+ "epoch": 0.36264775413711586,
+ "grad_norm": 2.488846778869629,
+ "learning_rate": 4.965453250456355e-06,
+ "loss": 0.5821,
+ "step": 767
+ },
+ {
+ "epoch": 0.36312056737588655,
+ "grad_norm": 2.5267233848571777,
+ "learning_rate": 4.965349824484813e-06,
+ "loss": 0.5593,
+ "step": 768
+ },
+ {
+ "epoch": 0.36359338061465724,
+ "grad_norm": 3.0646679401397705,
+ "learning_rate": 4.965246245006748e-06,
+ "loss": 0.6341,
+ "step": 769
+ },
+ {
+ "epoch": 0.3640661938534279,
+ "grad_norm": 2.9877712726593018,
+ "learning_rate": 4.965142512028609e-06,
+ "loss": 0.7202,
+ "step": 770
+ },
+ {
+ "epoch": 0.3645390070921986,
+ "grad_norm": 3.7494113445281982,
+ "learning_rate": 4.965038625556854e-06,
+ "loss": 0.7643,
+ "step": 771
+ },
+ {
+ "epoch": 0.3650118203309693,
+ "grad_norm": 2.8382890224456787,
+ "learning_rate": 4.964934585597954e-06,
+ "loss": 0.6522,
+ "step": 772
+ },
+ {
+ "epoch": 0.3654846335697399,
+ "grad_norm": 3.091655731201172,
+ "learning_rate": 4.9648303921583854e-06,
+ "loss": 0.7117,
+ "step": 773
+ },
+ {
+ "epoch": 0.3659574468085106,
+ "grad_norm": 3.0608325004577637,
+ "learning_rate": 4.964726045244635e-06,
+ "loss": 0.6538,
+ "step": 774
+ },
+ {
+ "epoch": 0.3664302600472813,
+ "grad_norm": 2.8492867946624756,
+ "learning_rate": 4.964621544863203e-06,
+ "loss": 0.6079,
+ "step": 775
+ },
+ {
+ "epoch": 0.366903073286052,
+ "grad_norm": 3.0669894218444824,
+ "learning_rate": 4.964516891020594e-06,
+ "loss": 0.6223,
+ "step": 776
+ },
+ {
+ "epoch": 0.36737588652482267,
+ "grad_norm": 3.089984893798828,
+ "learning_rate": 4.964412083723325e-06,
+ "loss": 0.671,
+ "step": 777
+ },
+ {
+ "epoch": 0.36784869976359336,
+ "grad_norm": 2.905242443084717,
+ "learning_rate": 4.964307122977921e-06,
+ "loss": 0.62,
+ "step": 778
+ },
+ {
+ "epoch": 0.36832151300236404,
+ "grad_norm": 3.954436779022217,
+ "learning_rate": 4.964202008790918e-06,
+ "loss": 0.6535,
+ "step": 779
+ },
+ {
+ "epoch": 0.36879432624113473,
+ "grad_norm": 2.6026058197021484,
+ "learning_rate": 4.9640967411688615e-06,
+ "loss": 0.5865,
+ "step": 780
+ },
+ {
+ "epoch": 0.3692671394799054,
+ "grad_norm": 2.9876346588134766,
+ "learning_rate": 4.963991320118306e-06,
+ "loss": 0.6698,
+ "step": 781
+ },
+ {
+ "epoch": 0.3697399527186761,
+ "grad_norm": 2.9411263465881348,
+ "learning_rate": 4.963885745645815e-06,
+ "loss": 0.6173,
+ "step": 782
+ },
+ {
+ "epoch": 0.3702127659574468,
+ "grad_norm": 2.5679805278778076,
+ "learning_rate": 4.963780017757962e-06,
+ "loss": 0.6285,
+ "step": 783
+ },
+ {
+ "epoch": 0.3706855791962175,
+ "grad_norm": 3.3100640773773193,
+ "learning_rate": 4.963674136461332e-06,
+ "loss": 0.5968,
+ "step": 784
+ },
+ {
+ "epoch": 0.37115839243498816,
+ "grad_norm": 3.1293699741363525,
+ "learning_rate": 4.963568101762515e-06,
+ "loss": 0.697,
+ "step": 785
+ },
+ {
+ "epoch": 0.37163120567375885,
+ "grad_norm": 3.043853759765625,
+ "learning_rate": 4.963461913668115e-06,
+ "loss": 0.5881,
+ "step": 786
+ },
+ {
+ "epoch": 0.37210401891252953,
+ "grad_norm": 3.07351016998291,
+ "learning_rate": 4.963355572184744e-06,
+ "loss": 0.6307,
+ "step": 787
+ },
+ {
+ "epoch": 0.3725768321513002,
+ "grad_norm": 2.7381317615509033,
+ "learning_rate": 4.9632490773190225e-06,
+ "loss": 0.716,
+ "step": 788
+ },
+ {
+ "epoch": 0.3730496453900709,
+ "grad_norm": 2.892221450805664,
+ "learning_rate": 4.963142429077582e-06,
+ "loss": 0.6867,
+ "step": 789
+ },
+ {
+ "epoch": 0.3735224586288416,
+ "grad_norm": 3.133122205734253,
+ "learning_rate": 4.963035627467064e-06,
+ "loss": 0.659,
+ "step": 790
+ },
+ {
+ "epoch": 0.3739952718676123,
+ "grad_norm": 3.032599925994873,
+ "learning_rate": 4.962928672494116e-06,
+ "loss": 0.6848,
+ "step": 791
+ },
+ {
+ "epoch": 0.37446808510638296,
+ "grad_norm": 3.0076355934143066,
+ "learning_rate": 4.9628215641654e-06,
+ "loss": 0.6549,
+ "step": 792
+ },
+ {
+ "epoch": 0.37494089834515365,
+ "grad_norm": 2.8904454708099365,
+ "learning_rate": 4.962714302487585e-06,
+ "loss": 0.6484,
+ "step": 793
+ },
+ {
+ "epoch": 0.37541371158392434,
+ "grad_norm": 2.881364107131958,
+ "learning_rate": 4.9626068874673486e-06,
+ "loss": 0.721,
+ "step": 794
+ },
+ {
+ "epoch": 0.375886524822695,
+ "grad_norm": 3.11668062210083,
+ "learning_rate": 4.962499319111379e-06,
+ "loss": 0.7824,
+ "step": 795
+ },
+ {
+ "epoch": 0.3763593380614657,
+ "grad_norm": 2.9201436042785645,
+ "learning_rate": 4.962391597426374e-06,
+ "loss": 0.6911,
+ "step": 796
+ },
+ {
+ "epoch": 0.3768321513002364,
+ "grad_norm": 2.926598072052002,
+ "learning_rate": 4.962283722419043e-06,
+ "loss": 0.6715,
+ "step": 797
+ },
+ {
+ "epoch": 0.3773049645390071,
+ "grad_norm": 2.7267675399780273,
+ "learning_rate": 4.962175694096101e-06,
+ "loss": 0.6111,
+ "step": 798
+ },
+ {
+ "epoch": 0.37777777777777777,
+ "grad_norm": 3.194031000137329,
+ "learning_rate": 4.962067512464275e-06,
+ "loss": 0.6558,
+ "step": 799
+ },
+ {
+ "epoch": 0.37825059101654845,
+ "grad_norm": 2.6249136924743652,
+ "learning_rate": 4.9619591775303e-06,
+ "loss": 0.6166,
+ "step": 800
+ },
+ {
+ "epoch": 0.37872340425531914,
+ "grad_norm": 2.6356167793273926,
+ "learning_rate": 4.961850689300923e-06,
+ "loss": 0.6112,
+ "step": 801
+ },
+ {
+ "epoch": 0.3791962174940898,
+ "grad_norm": 3.030724287033081,
+ "learning_rate": 4.961742047782898e-06,
+ "loss": 0.6511,
+ "step": 802
+ },
+ {
+ "epoch": 0.3796690307328605,
+ "grad_norm": 3.4987757205963135,
+ "learning_rate": 4.96163325298299e-06,
+ "loss": 0.5888,
+ "step": 803
+ },
+ {
+ "epoch": 0.3801418439716312,
+ "grad_norm": 3.0371780395507812,
+ "learning_rate": 4.961524304907974e-06,
+ "loss": 0.6385,
+ "step": 804
+ },
+ {
+ "epoch": 0.3806146572104019,
+ "grad_norm": 3.302570104598999,
+ "learning_rate": 4.961415203564632e-06,
+ "loss": 0.6515,
+ "step": 805
+ },
+ {
+ "epoch": 0.38108747044917257,
+ "grad_norm": 2.7597038745880127,
+ "learning_rate": 4.961305948959759e-06,
+ "loss": 0.6126,
+ "step": 806
+ },
+ {
+ "epoch": 0.38156028368794326,
+ "grad_norm": 2.789811849594116,
+ "learning_rate": 4.9611965411001575e-06,
+ "loss": 0.6601,
+ "step": 807
+ },
+ {
+ "epoch": 0.38203309692671394,
+ "grad_norm": 3.0403921604156494,
+ "learning_rate": 4.961086979992639e-06,
+ "loss": 0.6947,
+ "step": 808
+ },
+ {
+ "epoch": 0.38250591016548463,
+ "grad_norm": 3.2139980792999268,
+ "learning_rate": 4.960977265644026e-06,
+ "loss": 0.6876,
+ "step": 809
+ },
+ {
+ "epoch": 0.3829787234042553,
+ "grad_norm": 2.918515205383301,
+ "learning_rate": 4.960867398061149e-06,
+ "loss": 0.5997,
+ "step": 810
+ },
+ {
+ "epoch": 0.383451536643026,
+ "grad_norm": 3.197636604309082,
+ "learning_rate": 4.9607573772508495e-06,
+ "loss": 0.5754,
+ "step": 811
+ },
+ {
+ "epoch": 0.3839243498817967,
+ "grad_norm": 2.8848466873168945,
+ "learning_rate": 4.960647203219979e-06,
+ "loss": 0.6424,
+ "step": 812
+ },
+ {
+ "epoch": 0.3843971631205674,
+ "grad_norm": 3.4810187816619873,
+ "learning_rate": 4.960536875975397e-06,
+ "loss": 0.6851,
+ "step": 813
+ },
+ {
+ "epoch": 0.38486997635933806,
+ "grad_norm": 3.713934898376465,
+ "learning_rate": 4.960426395523972e-06,
+ "loss": 0.6122,
+ "step": 814
+ },
+ {
+ "epoch": 0.38534278959810875,
+ "grad_norm": 2.862600803375244,
+ "learning_rate": 4.960315761872585e-06,
+ "loss": 0.6493,
+ "step": 815
+ },
+ {
+ "epoch": 0.38581560283687943,
+ "grad_norm": 3.133882522583008,
+ "learning_rate": 4.960204975028123e-06,
+ "loss": 0.7535,
+ "step": 816
+ },
+ {
+ "epoch": 0.3862884160756501,
+ "grad_norm": 3.1526732444763184,
+ "learning_rate": 4.960094034997485e-06,
+ "loss": 0.6512,
+ "step": 817
+ },
+ {
+ "epoch": 0.3867612293144208,
+ "grad_norm": 2.7213544845581055,
+ "learning_rate": 4.959982941787579e-06,
+ "loss": 0.6121,
+ "step": 818
+ },
+ {
+ "epoch": 0.3872340425531915,
+ "grad_norm": 3.4935851097106934,
+ "learning_rate": 4.9598716954053214e-06,
+ "loss": 0.7852,
+ "step": 819
+ },
+ {
+ "epoch": 0.3877068557919622,
+ "grad_norm": 2.691016435623169,
+ "learning_rate": 4.9597602958576395e-06,
+ "loss": 0.6861,
+ "step": 820
+ },
+ {
+ "epoch": 0.38817966903073287,
+ "grad_norm": 2.8621015548706055,
+ "learning_rate": 4.959648743151469e-06,
+ "loss": 0.6262,
+ "step": 821
+ },
+ {
+ "epoch": 0.38865248226950355,
+ "grad_norm": 3.3887462615966797,
+ "learning_rate": 4.959537037293758e-06,
+ "loss": 0.7103,
+ "step": 822
+ },
+ {
+ "epoch": 0.38912529550827424,
+ "grad_norm": 2.7565438747406006,
+ "learning_rate": 4.95942517829146e-06,
+ "loss": 0.6471,
+ "step": 823
+ },
+ {
+ "epoch": 0.3895981087470449,
+ "grad_norm": 2.7920358180999756,
+ "learning_rate": 4.959313166151541e-06,
+ "loss": 0.6239,
+ "step": 824
+ },
+ {
+ "epoch": 0.3900709219858156,
+ "grad_norm": 3.18904185295105,
+ "learning_rate": 4.959201000880973e-06,
+ "loss": 0.7461,
+ "step": 825
+ },
+ {
+ "epoch": 0.3905437352245863,
+ "grad_norm": 2.727872371673584,
+ "learning_rate": 4.959088682486743e-06,
+ "loss": 0.6333,
+ "step": 826
+ },
+ {
+ "epoch": 0.391016548463357,
+ "grad_norm": 2.906378746032715,
+ "learning_rate": 4.958976210975844e-06,
+ "loss": 0.7547,
+ "step": 827
+ },
+ {
+ "epoch": 0.39148936170212767,
+ "grad_norm": 2.96482515335083,
+ "learning_rate": 4.958863586355278e-06,
+ "loss": 0.6312,
+ "step": 828
+ },
+ {
+ "epoch": 0.39196217494089836,
+ "grad_norm": 3.2890889644622803,
+ "learning_rate": 4.958750808632059e-06,
+ "loss": 0.6943,
+ "step": 829
+ },
+ {
+ "epoch": 0.39243498817966904,
+ "grad_norm": 2.7004311084747314,
+ "learning_rate": 4.958637877813207e-06,
+ "loss": 0.5918,
+ "step": 830
+ },
+ {
+ "epoch": 0.39290780141843973,
+ "grad_norm": 2.7487950325012207,
+ "learning_rate": 4.9585247939057566e-06,
+ "loss": 0.6201,
+ "step": 831
+ },
+ {
+ "epoch": 0.3933806146572104,
+ "grad_norm": 2.7873897552490234,
+ "learning_rate": 4.958411556916747e-06,
+ "loss": 0.6268,
+ "step": 832
+ },
+ {
+ "epoch": 0.3938534278959811,
+ "grad_norm": 2.8501343727111816,
+ "learning_rate": 4.958298166853229e-06,
+ "loss": 0.7119,
+ "step": 833
+ },
+ {
+ "epoch": 0.3943262411347518,
+ "grad_norm": 3.0391547679901123,
+ "learning_rate": 4.958184623722265e-06,
+ "loss": 0.6375,
+ "step": 834
+ },
+ {
+ "epoch": 0.3947990543735225,
+ "grad_norm": 2.850520133972168,
+ "learning_rate": 4.958070927530922e-06,
+ "loss": 0.5962,
+ "step": 835
+ },
+ {
+ "epoch": 0.39527186761229316,
+ "grad_norm": 3.351914644241333,
+ "learning_rate": 4.957957078286281e-06,
+ "loss": 0.7247,
+ "step": 836
+ },
+ {
+ "epoch": 0.39574468085106385,
+ "grad_norm": 2.9559543132781982,
+ "learning_rate": 4.957843075995431e-06,
+ "loss": 0.6571,
+ "step": 837
+ },
+ {
+ "epoch": 0.39621749408983453,
+ "grad_norm": 3.225785255432129,
+ "learning_rate": 4.95772892066547e-06,
+ "loss": 0.7074,
+ "step": 838
+ },
+ {
+ "epoch": 0.3966903073286052,
+ "grad_norm": 2.7842373847961426,
+ "learning_rate": 4.957614612303505e-06,
+ "loss": 0.6469,
+ "step": 839
+ },
+ {
+ "epoch": 0.3971631205673759,
+ "grad_norm": 4.249724864959717,
+ "learning_rate": 4.957500150916655e-06,
+ "loss": 0.741,
+ "step": 840
+ },
+ {
+ "epoch": 0.3976359338061466,
+ "grad_norm": 3.138221263885498,
+ "learning_rate": 4.957385536512046e-06,
+ "loss": 0.6676,
+ "step": 841
+ },
+ {
+ "epoch": 0.3981087470449173,
+ "grad_norm": 3.456423759460449,
+ "learning_rate": 4.957270769096816e-06,
+ "loss": 0.6877,
+ "step": 842
+ },
+ {
+ "epoch": 0.39858156028368796,
+ "grad_norm": 2.8676278591156006,
+ "learning_rate": 4.957155848678109e-06,
+ "loss": 0.5986,
+ "step": 843
+ },
+ {
+ "epoch": 0.39905437352245865,
+ "grad_norm": 2.705324411392212,
+ "learning_rate": 4.957040775263082e-06,
+ "loss": 0.6356,
+ "step": 844
+ },
+ {
+ "epoch": 0.39952718676122934,
+ "grad_norm": 3.0767486095428467,
+ "learning_rate": 4.9569255488589e-06,
+ "loss": 0.6844,
+ "step": 845
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 2.7787704467773438,
+ "learning_rate": 4.956810169472736e-06,
+ "loss": 0.6641,
+ "step": 846
+ },
+ {
+ "epoch": 0.4004728132387707,
+ "grad_norm": 2.584277868270874,
+ "learning_rate": 4.956694637111777e-06,
+ "loss": 0.6256,
+ "step": 847
+ },
+ {
+ "epoch": 0.4009456264775414,
+ "grad_norm": 2.751641273498535,
+ "learning_rate": 4.956578951783215e-06,
+ "loss": 0.5954,
+ "step": 848
+ },
+ {
+ "epoch": 0.4014184397163121,
+ "grad_norm": 3.0181658267974854,
+ "learning_rate": 4.956463113494253e-06,
+ "loss": 0.6569,
+ "step": 849
+ },
+ {
+ "epoch": 0.40189125295508277,
+ "grad_norm": 3.0933220386505127,
+ "learning_rate": 4.956347122252104e-06,
+ "loss": 0.6248,
+ "step": 850
+ },
+ {
+ "epoch": 0.40236406619385345,
+ "grad_norm": 3.3767428398132324,
+ "learning_rate": 4.956230978063991e-06,
+ "loss": 0.719,
+ "step": 851
+ },
+ {
+ "epoch": 0.40283687943262414,
+ "grad_norm": 3.7666573524475098,
+ "learning_rate": 4.956114680937145e-06,
+ "loss": 0.6467,
+ "step": 852
+ },
+ {
+ "epoch": 0.4033096926713948,
+ "grad_norm": 2.9836843013763428,
+ "learning_rate": 4.955998230878808e-06,
+ "loss": 0.6993,
+ "step": 853
+ },
+ {
+ "epoch": 0.4037825059101655,
+ "grad_norm": 2.981497049331665,
+ "learning_rate": 4.955881627896229e-06,
+ "loss": 0.6578,
+ "step": 854
+ },
+ {
+ "epoch": 0.40425531914893614,
+ "grad_norm": 3.1369056701660156,
+ "learning_rate": 4.955764871996672e-06,
+ "loss": 0.6763,
+ "step": 855
+ },
+ {
+ "epoch": 0.40472813238770683,
+ "grad_norm": 2.7675817012786865,
+ "learning_rate": 4.9556479631874036e-06,
+ "loss": 0.6488,
+ "step": 856
+ },
+ {
+ "epoch": 0.4052009456264775,
+ "grad_norm": 3.035334825515747,
+ "learning_rate": 4.9555309014757034e-06,
+ "loss": 0.7076,
+ "step": 857
+ },
+ {
+ "epoch": 0.4056737588652482,
+ "grad_norm": 3.493704319000244,
+ "learning_rate": 4.955413686868862e-06,
+ "loss": 0.6773,
+ "step": 858
+ },
+ {
+ "epoch": 0.4061465721040189,
+ "grad_norm": 3.245487928390503,
+ "learning_rate": 4.9552963193741765e-06,
+ "loss": 0.6915,
+ "step": 859
+ },
+ {
+ "epoch": 0.4066193853427896,
+ "grad_norm": 3.189969539642334,
+ "learning_rate": 4.955178798998956e-06,
+ "loss": 0.7318,
+ "step": 860
+ },
+ {
+ "epoch": 0.40709219858156026,
+ "grad_norm": 2.7987146377563477,
+ "learning_rate": 4.955061125750517e-06,
+ "loss": 0.6162,
+ "step": 861
+ },
+ {
+ "epoch": 0.40756501182033095,
+ "grad_norm": 3.020118474960327,
+ "learning_rate": 4.954943299636187e-06,
+ "loss": 0.6678,
+ "step": 862
+ },
+ {
+ "epoch": 0.40803782505910163,
+ "grad_norm": 2.715463876724243,
+ "learning_rate": 4.954825320663302e-06,
+ "loss": 0.668,
+ "step": 863
+ },
+ {
+ "epoch": 0.4085106382978723,
+ "grad_norm": 2.595050096511841,
+ "learning_rate": 4.9547071888392085e-06,
+ "loss": 0.6557,
+ "step": 864
+ },
+ {
+ "epoch": 0.408983451536643,
+ "grad_norm": 3.131596088409424,
+ "learning_rate": 4.954588904171261e-06,
+ "loss": 0.6548,
+ "step": 865
+ },
+ {
+ "epoch": 0.4094562647754137,
+ "grad_norm": 2.5742313861846924,
+ "learning_rate": 4.954470466666827e-06,
+ "loss": 0.6592,
+ "step": 866
+ },
+ {
+ "epoch": 0.4099290780141844,
+ "grad_norm": 2.8612802028656006,
+ "learning_rate": 4.9543518763332785e-06,
+ "loss": 0.5391,
+ "step": 867
+ },
+ {
+ "epoch": 0.41040189125295506,
+ "grad_norm": 2.8973186016082764,
+ "learning_rate": 4.954233133178001e-06,
+ "loss": 0.6649,
+ "step": 868
+ },
+ {
+ "epoch": 0.41087470449172575,
+ "grad_norm": 2.802525043487549,
+ "learning_rate": 4.954114237208388e-06,
+ "loss": 0.6212,
+ "step": 869
+ },
+ {
+ "epoch": 0.41134751773049644,
+ "grad_norm": 2.5919506549835205,
+ "learning_rate": 4.953995188431843e-06,
+ "loss": 0.6596,
+ "step": 870
+ },
+ {
+ "epoch": 0.4118203309692671,
+ "grad_norm": 3.139169454574585,
+ "learning_rate": 4.953875986855777e-06,
+ "loss": 0.6799,
+ "step": 871
+ },
+ {
+ "epoch": 0.4122931442080378,
+ "grad_norm": 3.99727725982666,
+ "learning_rate": 4.953756632487614e-06,
+ "loss": 0.6519,
+ "step": 872
+ },
+ {
+ "epoch": 0.4127659574468085,
+ "grad_norm": 3.238706350326538,
+ "learning_rate": 4.953637125334784e-06,
+ "loss": 0.7361,
+ "step": 873
+ },
+ {
+ "epoch": 0.4132387706855792,
+ "grad_norm": 2.780019998550415,
+ "learning_rate": 4.9535174654047295e-06,
+ "loss": 0.6406,
+ "step": 874
+ },
+ {
+ "epoch": 0.41371158392434987,
+ "grad_norm": 2.7629551887512207,
+ "learning_rate": 4.953397652704901e-06,
+ "loss": 0.6131,
+ "step": 875
+ },
+ {
+ "epoch": 0.41418439716312055,
+ "grad_norm": 2.8008246421813965,
+ "learning_rate": 4.9532776872427585e-06,
+ "loss": 0.6464,
+ "step": 876
+ },
+ {
+ "epoch": 0.41465721040189124,
+ "grad_norm": 3.0970115661621094,
+ "learning_rate": 4.953157569025772e-06,
+ "loss": 0.7066,
+ "step": 877
+ },
+ {
+ "epoch": 0.4151300236406619,
+ "grad_norm": 2.8375589847564697,
+ "learning_rate": 4.9530372980614195e-06,
+ "loss": 0.6551,
+ "step": 878
+ },
+ {
+ "epoch": 0.4156028368794326,
+ "grad_norm": 2.718843936920166,
+ "learning_rate": 4.952916874357191e-06,
+ "loss": 0.5947,
+ "step": 879
+ },
+ {
+ "epoch": 0.4160756501182033,
+ "grad_norm": 2.7104697227478027,
+ "learning_rate": 4.952796297920585e-06,
+ "loss": 0.6708,
+ "step": 880
+ },
+ {
+ "epoch": 0.416548463356974,
+ "grad_norm": 2.8223445415496826,
+ "learning_rate": 4.952675568759108e-06,
+ "loss": 0.6214,
+ "step": 881
+ },
+ {
+ "epoch": 0.41702127659574467,
+ "grad_norm": 2.6598153114318848,
+ "learning_rate": 4.952554686880279e-06,
+ "loss": 0.6116,
+ "step": 882
+ },
+ {
+ "epoch": 0.41749408983451536,
+ "grad_norm": 2.8639824390411377,
+ "learning_rate": 4.952433652291623e-06,
+ "loss": 0.5971,
+ "step": 883
+ },
+ {
+ "epoch": 0.41796690307328604,
+ "grad_norm": 2.9578304290771484,
+ "learning_rate": 4.952312465000677e-06,
+ "loss": 0.6785,
+ "step": 884
+ },
+ {
+ "epoch": 0.41843971631205673,
+ "grad_norm": 2.872144937515259,
+ "learning_rate": 4.952191125014987e-06,
+ "loss": 0.6772,
+ "step": 885
+ },
+ {
+ "epoch": 0.4189125295508274,
+ "grad_norm": 2.7513675689697266,
+ "learning_rate": 4.952069632342108e-06,
+ "loss": 0.702,
+ "step": 886
+ },
+ {
+ "epoch": 0.4193853427895981,
+ "grad_norm": 2.9275078773498535,
+ "learning_rate": 4.951947986989606e-06,
+ "loss": 0.589,
+ "step": 887
+ },
+ {
+ "epoch": 0.4198581560283688,
+ "grad_norm": 2.740549325942993,
+ "learning_rate": 4.951826188965053e-06,
+ "loss": 0.5942,
+ "step": 888
+ },
+ {
+ "epoch": 0.4203309692671395,
+ "grad_norm": 2.92452073097229,
+ "learning_rate": 4.951704238276035e-06,
+ "loss": 0.6819,
+ "step": 889
+ },
+ {
+ "epoch": 0.42080378250591016,
+ "grad_norm": 2.842491865158081,
+ "learning_rate": 4.951582134930144e-06,
+ "loss": 0.6304,
+ "step": 890
+ },
+ {
+ "epoch": 0.42127659574468085,
+ "grad_norm": 2.613478422164917,
+ "learning_rate": 4.951459878934983e-06,
+ "loss": 0.6912,
+ "step": 891
+ },
+ {
+ "epoch": 0.42174940898345153,
+ "grad_norm": 3.2408607006073,
+ "learning_rate": 4.951337470298165e-06,
+ "loss": 0.6755,
+ "step": 892
+ },
+ {
+ "epoch": 0.4222222222222222,
+ "grad_norm": 3.1022439002990723,
+ "learning_rate": 4.9512149090273125e-06,
+ "loss": 0.6138,
+ "step": 893
+ },
+ {
+ "epoch": 0.4226950354609929,
+ "grad_norm": 2.6418895721435547,
+ "learning_rate": 4.951092195130055e-06,
+ "loss": 0.639,
+ "step": 894
+ },
+ {
+ "epoch": 0.4231678486997636,
+ "grad_norm": 3.010744333267212,
+ "learning_rate": 4.950969328614035e-06,
+ "loss": 0.7102,
+ "step": 895
+ },
+ {
+ "epoch": 0.4236406619385343,
+ "grad_norm": 2.673292636871338,
+ "learning_rate": 4.950846309486901e-06,
+ "loss": 0.5676,
+ "step": 896
+ },
+ {
+ "epoch": 0.42411347517730497,
+ "grad_norm": 3.6974737644195557,
+ "learning_rate": 4.950723137756314e-06,
+ "loss": 0.5722,
+ "step": 897
+ },
+ {
+ "epoch": 0.42458628841607565,
+ "grad_norm": 3.69028902053833,
+ "learning_rate": 4.9505998134299435e-06,
+ "loss": 0.6337,
+ "step": 898
+ },
+ {
+ "epoch": 0.42505910165484634,
+ "grad_norm": 3.2136125564575195,
+ "learning_rate": 4.950476336515469e-06,
+ "loss": 0.6469,
+ "step": 899
+ },
+ {
+ "epoch": 0.425531914893617,
+ "grad_norm": 2.7396016120910645,
+ "learning_rate": 4.950352707020577e-06,
+ "loss": 0.6656,
+ "step": 900
+ },
+ {
+ "epoch": 0.4260047281323877,
+ "grad_norm": 2.825416088104248,
+ "learning_rate": 4.950228924952967e-06,
+ "loss": 0.6298,
+ "step": 901
+ },
+ {
+ "epoch": 0.4264775413711584,
+ "grad_norm": 3.401658535003662,
+ "learning_rate": 4.950104990320345e-06,
+ "loss": 0.778,
+ "step": 902
+ },
+ {
+ "epoch": 0.4269503546099291,
+ "grad_norm": 2.7002272605895996,
+ "learning_rate": 4.9499809031304294e-06,
+ "loss": 0.6536,
+ "step": 903
+ },
+ {
+ "epoch": 0.42742316784869977,
+ "grad_norm": 2.62386417388916,
+ "learning_rate": 4.949856663390945e-06,
+ "loss": 0.6629,
+ "step": 904
+ },
+ {
+ "epoch": 0.42789598108747046,
+ "grad_norm": 2.584247589111328,
+ "learning_rate": 4.94973227110963e-06,
+ "loss": 0.5813,
+ "step": 905
+ },
+ {
+ "epoch": 0.42836879432624114,
+ "grad_norm": 3.4365768432617188,
+ "learning_rate": 4.9496077262942265e-06,
+ "loss": 0.7648,
+ "step": 906
+ },
+ {
+ "epoch": 0.42884160756501183,
+ "grad_norm": 2.8993639945983887,
+ "learning_rate": 4.949483028952492e-06,
+ "loss": 0.6696,
+ "step": 907
+ },
+ {
+ "epoch": 0.4293144208037825,
+ "grad_norm": 2.922809362411499,
+ "learning_rate": 4.94935817909219e-06,
+ "loss": 0.6892,
+ "step": 908
+ },
+ {
+ "epoch": 0.4297872340425532,
+ "grad_norm": 2.85478138923645,
+ "learning_rate": 4.9492331767210944e-06,
+ "loss": 0.536,
+ "step": 909
+ },
+ {
+ "epoch": 0.4302600472813239,
+ "grad_norm": 2.8639259338378906,
+ "learning_rate": 4.949108021846988e-06,
+ "loss": 0.634,
+ "step": 910
+ },
+ {
+ "epoch": 0.4307328605200946,
+ "grad_norm": 3.0533697605133057,
+ "learning_rate": 4.948982714477664e-06,
+ "loss": 0.6318,
+ "step": 911
+ },
+ {
+ "epoch": 0.43120567375886526,
+ "grad_norm": 2.331674814224243,
+ "learning_rate": 4.9488572546209255e-06,
+ "loss": 0.6562,
+ "step": 912
+ },
+ {
+ "epoch": 0.43167848699763595,
+ "grad_norm": 3.0154623985290527,
+ "learning_rate": 4.9487316422845835e-06,
+ "loss": 0.6675,
+ "step": 913
+ },
+ {
+ "epoch": 0.43215130023640663,
+ "grad_norm": 2.7354514598846436,
+ "learning_rate": 4.948605877476459e-06,
+ "loss": 0.6012,
+ "step": 914
+ },
+ {
+ "epoch": 0.4326241134751773,
+ "grad_norm": 2.863736629486084,
+ "learning_rate": 4.948479960204383e-06,
+ "loss": 0.6062,
+ "step": 915
+ },
+ {
+ "epoch": 0.433096926713948,
+ "grad_norm": 3.01998233795166,
+ "learning_rate": 4.948353890476197e-06,
+ "loss": 0.6749,
+ "step": 916
+ },
+ {
+ "epoch": 0.4335697399527187,
+ "grad_norm": 2.7550456523895264,
+ "learning_rate": 4.94822766829975e-06,
+ "loss": 0.6507,
+ "step": 917
+ },
+ {
+ "epoch": 0.4340425531914894,
+ "grad_norm": 3.370572805404663,
+ "learning_rate": 4.948101293682901e-06,
+ "loss": 0.714,
+ "step": 918
+ },
+ {
+ "epoch": 0.43451536643026006,
+ "grad_norm": 2.9736790657043457,
+ "learning_rate": 4.947974766633519e-06,
+ "loss": 0.729,
+ "step": 919
+ },
+ {
+ "epoch": 0.43498817966903075,
+ "grad_norm": 3.1036548614501953,
+ "learning_rate": 4.947848087159483e-06,
+ "loss": 0.7547,
+ "step": 920
+ },
+ {
+ "epoch": 0.43546099290780144,
+ "grad_norm": 2.895094871520996,
+ "learning_rate": 4.947721255268679e-06,
+ "loss": 0.6089,
+ "step": 921
+ },
+ {
+ "epoch": 0.4359338061465721,
+ "grad_norm": 2.798476219177246,
+ "learning_rate": 4.947594270969005e-06,
+ "loss": 0.5432,
+ "step": 922
+ },
+ {
+ "epoch": 0.4364066193853428,
+ "grad_norm": 2.7675702571868896,
+ "learning_rate": 4.94746713426837e-06,
+ "loss": 0.5693,
+ "step": 923
+ },
+ {
+ "epoch": 0.4368794326241135,
+ "grad_norm": 2.6851553916931152,
+ "learning_rate": 4.947339845174687e-06,
+ "loss": 0.6503,
+ "step": 924
+ },
+ {
+ "epoch": 0.4373522458628842,
+ "grad_norm": 2.909635543823242,
+ "learning_rate": 4.947212403695883e-06,
+ "loss": 0.6494,
+ "step": 925
+ },
+ {
+ "epoch": 0.43782505910165487,
+ "grad_norm": 2.604526996612549,
+ "learning_rate": 4.947084809839894e-06,
+ "loss": 0.6349,
+ "step": 926
+ },
+ {
+ "epoch": 0.43829787234042555,
+ "grad_norm": 3.118149518966675,
+ "learning_rate": 4.946957063614664e-06,
+ "loss": 0.6219,
+ "step": 927
+ },
+ {
+ "epoch": 0.43877068557919624,
+ "grad_norm": 2.7452616691589355,
+ "learning_rate": 4.9468291650281465e-06,
+ "loss": 0.6096,
+ "step": 928
+ },
+ {
+ "epoch": 0.4392434988179669,
+ "grad_norm": 3.30098819732666,
+ "learning_rate": 4.946701114088307e-06,
+ "loss": 0.6277,
+ "step": 929
+ },
+ {
+ "epoch": 0.4397163120567376,
+ "grad_norm": 2.789482593536377,
+ "learning_rate": 4.946572910803116e-06,
+ "loss": 0.7,
+ "step": 930
+ },
+ {
+ "epoch": 0.4401891252955083,
+ "grad_norm": 2.7283935546875,
+ "learning_rate": 4.946444555180559e-06,
+ "loss": 0.5375,
+ "step": 931
+ },
+ {
+ "epoch": 0.440661938534279,
+ "grad_norm": 3.101304054260254,
+ "learning_rate": 4.946316047228627e-06,
+ "loss": 0.6131,
+ "step": 932
+ },
+ {
+ "epoch": 0.44113475177304967,
+ "grad_norm": 3.573908805847168,
+ "learning_rate": 4.946187386955321e-06,
+ "loss": 0.7073,
+ "step": 933
+ },
+ {
+ "epoch": 0.44160756501182036,
+ "grad_norm": 3.214979648590088,
+ "learning_rate": 4.946058574368653e-06,
+ "loss": 0.6508,
+ "step": 934
+ },
+ {
+ "epoch": 0.44208037825059104,
+ "grad_norm": 3.145082712173462,
+ "learning_rate": 4.945929609476643e-06,
+ "loss": 0.64,
+ "step": 935
+ },
+ {
+ "epoch": 0.4425531914893617,
+ "grad_norm": 2.991780996322632,
+ "learning_rate": 4.945800492287321e-06,
+ "loss": 0.6315,
+ "step": 936
+ },
+ {
+ "epoch": 0.44302600472813236,
+ "grad_norm": 3.2441139221191406,
+ "learning_rate": 4.945671222808727e-06,
+ "loss": 0.7144,
+ "step": 937
+ },
+ {
+ "epoch": 0.44349881796690305,
+ "grad_norm": 2.9397029876708984,
+ "learning_rate": 4.94554180104891e-06,
+ "loss": 0.6818,
+ "step": 938
+ },
+ {
+ "epoch": 0.44397163120567373,
+ "grad_norm": 3.2471461296081543,
+ "learning_rate": 4.945412227015929e-06,
+ "loss": 0.6921,
+ "step": 939
+ },
+ {
+ "epoch": 0.4444444444444444,
+ "grad_norm": 3.0882487297058105,
+ "learning_rate": 4.945282500717851e-06,
+ "loss": 0.718,
+ "step": 940
+ },
+ {
+ "epoch": 0.4449172576832151,
+ "grad_norm": 2.6035783290863037,
+ "learning_rate": 4.945152622162753e-06,
+ "loss": 0.621,
+ "step": 941
+ },
+ {
+ "epoch": 0.4453900709219858,
+ "grad_norm": 2.83659029006958,
+ "learning_rate": 4.945022591358724e-06,
+ "loss": 0.6403,
+ "step": 942
+ },
+ {
+ "epoch": 0.4458628841607565,
+ "grad_norm": 2.824463129043579,
+ "learning_rate": 4.944892408313859e-06,
+ "loss": 0.6594,
+ "step": 943
+ },
+ {
+ "epoch": 0.44633569739952716,
+ "grad_norm": 2.753735065460205,
+ "learning_rate": 4.9447620730362645e-06,
+ "loss": 0.6116,
+ "step": 944
+ },
+ {
+ "epoch": 0.44680851063829785,
+ "grad_norm": 3.0659725666046143,
+ "learning_rate": 4.944631585534056e-06,
+ "loss": 0.5983,
+ "step": 945
+ },
+ {
+ "epoch": 0.44728132387706854,
+ "grad_norm": 2.969113349914551,
+ "learning_rate": 4.944500945815357e-06,
+ "loss": 0.6859,
+ "step": 946
+ },
+ {
+ "epoch": 0.4477541371158392,
+ "grad_norm": 2.810303211212158,
+ "learning_rate": 4.944370153888303e-06,
+ "loss": 0.7025,
+ "step": 947
+ },
+ {
+ "epoch": 0.4482269503546099,
+ "grad_norm": 3.027721643447876,
+ "learning_rate": 4.944239209761038e-06,
+ "loss": 0.7268,
+ "step": 948
+ },
+ {
+ "epoch": 0.4486997635933806,
+ "grad_norm": 2.661503314971924,
+ "learning_rate": 4.944108113441716e-06,
+ "loss": 0.6702,
+ "step": 949
+ },
+ {
+ "epoch": 0.4491725768321513,
+ "grad_norm": 2.738591432571411,
+ "learning_rate": 4.943976864938498e-06,
+ "loss": 0.6728,
+ "step": 950
+ },
+ {
+ "epoch": 0.44964539007092197,
+ "grad_norm": 3.447505474090576,
+ "learning_rate": 4.943845464259557e-06,
+ "loss": 0.6586,
+ "step": 951
+ },
+ {
+ "epoch": 0.45011820330969265,
+ "grad_norm": 3.0968854427337646,
+ "learning_rate": 4.943713911413075e-06,
+ "loss": 0.7666,
+ "step": 952
+ },
+ {
+ "epoch": 0.45059101654846334,
+ "grad_norm": 2.4113779067993164,
+ "learning_rate": 4.943582206407244e-06,
+ "loss": 0.6173,
+ "step": 953
+ },
+ {
+ "epoch": 0.451063829787234,
+ "grad_norm": 2.6357979774475098,
+ "learning_rate": 4.943450349250263e-06,
+ "loss": 0.5589,
+ "step": 954
+ },
+ {
+ "epoch": 0.4515366430260047,
+ "grad_norm": 2.9182233810424805,
+ "learning_rate": 4.9433183399503425e-06,
+ "loss": 0.6252,
+ "step": 955
+ },
+ {
+ "epoch": 0.4520094562647754,
+ "grad_norm": 2.832740306854248,
+ "learning_rate": 4.943186178515703e-06,
+ "loss": 0.6882,
+ "step": 956
+ },
+ {
+ "epoch": 0.4524822695035461,
+ "grad_norm": 2.9508981704711914,
+ "learning_rate": 4.943053864954574e-06,
+ "loss": 0.5722,
+ "step": 957
+ },
+ {
+ "epoch": 0.4529550827423168,
+ "grad_norm": 3.044729471206665,
+ "learning_rate": 4.9429213992751925e-06,
+ "loss": 0.6772,
+ "step": 958
+ },
+ {
+ "epoch": 0.45342789598108746,
+ "grad_norm": 2.606003522872925,
+ "learning_rate": 4.9427887814858075e-06,
+ "loss": 0.6445,
+ "step": 959
+ },
+ {
+ "epoch": 0.45390070921985815,
+ "grad_norm": 2.4634225368499756,
+ "learning_rate": 4.942656011594676e-06,
+ "loss": 0.6151,
+ "step": 960
+ },
+ {
+ "epoch": 0.45437352245862883,
+ "grad_norm": 2.8872334957122803,
+ "learning_rate": 4.942523089610066e-06,
+ "loss": 0.6255,
+ "step": 961
+ },
+ {
+ "epoch": 0.4548463356973995,
+ "grad_norm": 2.870605707168579,
+ "learning_rate": 4.942390015540253e-06,
+ "loss": 0.7481,
+ "step": 962
+ },
+ {
+ "epoch": 0.4553191489361702,
+ "grad_norm": 2.952680826187134,
+ "learning_rate": 4.942256789393524e-06,
+ "loss": 0.5556,
+ "step": 963
+ },
+ {
+ "epoch": 0.4557919621749409,
+ "grad_norm": 2.623680353164673,
+ "learning_rate": 4.9421234111781725e-06,
+ "loss": 0.6115,
+ "step": 964
+ },
+ {
+ "epoch": 0.4562647754137116,
+ "grad_norm": 2.6933600902557373,
+ "learning_rate": 4.941989880902505e-06,
+ "loss": 0.6102,
+ "step": 965
+ },
+ {
+ "epoch": 0.45673758865248226,
+ "grad_norm": 2.6047189235687256,
+ "learning_rate": 4.941856198574836e-06,
+ "loss": 0.612,
+ "step": 966
+ },
+ {
+ "epoch": 0.45721040189125295,
+ "grad_norm": 2.779186725616455,
+ "learning_rate": 4.9417223642034885e-06,
+ "loss": 0.5424,
+ "step": 967
+ },
+ {
+ "epoch": 0.45768321513002364,
+ "grad_norm": 2.6177165508270264,
+ "learning_rate": 4.941588377796795e-06,
+ "loss": 0.4661,
+ "step": 968
+ },
+ {
+ "epoch": 0.4581560283687943,
+ "grad_norm": 2.959676742553711,
+ "learning_rate": 4.941454239363101e-06,
+ "loss": 0.6966,
+ "step": 969
+ },
+ {
+ "epoch": 0.458628841607565,
+ "grad_norm": 2.9788379669189453,
+ "learning_rate": 4.941319948910756e-06,
+ "loss": 0.6181,
+ "step": 970
+ },
+ {
+ "epoch": 0.4591016548463357,
+ "grad_norm": 4.642750263214111,
+ "learning_rate": 4.941185506448122e-06,
+ "loss": 0.5602,
+ "step": 971
+ },
+ {
+ "epoch": 0.4595744680851064,
+ "grad_norm": 2.793002128601074,
+ "learning_rate": 4.941050911983572e-06,
+ "loss": 0.602,
+ "step": 972
+ },
+ {
+ "epoch": 0.46004728132387707,
+ "grad_norm": 2.6833035945892334,
+ "learning_rate": 4.9409161655254845e-06,
+ "loss": 0.5549,
+ "step": 973
+ },
+ {
+ "epoch": 0.46052009456264775,
+ "grad_norm": 3.905032157897949,
+ "learning_rate": 4.94078126708225e-06,
+ "loss": 0.6335,
+ "step": 974
+ },
+ {
+ "epoch": 0.46099290780141844,
+ "grad_norm": 2.922609329223633,
+ "learning_rate": 4.94064621666227e-06,
+ "loss": 0.5839,
+ "step": 975
+ },
+ {
+ "epoch": 0.4614657210401891,
+ "grad_norm": 2.8277416229248047,
+ "learning_rate": 4.940511014273952e-06,
+ "loss": 0.629,
+ "step": 976
+ },
+ {
+ "epoch": 0.4619385342789598,
+ "grad_norm": 3.07511043548584,
+ "learning_rate": 4.940375659925714e-06,
+ "loss": 0.7058,
+ "step": 977
+ },
+ {
+ "epoch": 0.4624113475177305,
+ "grad_norm": 3.65043044090271,
+ "learning_rate": 4.940240153625984e-06,
+ "loss": 0.7174,
+ "step": 978
+ },
+ {
+ "epoch": 0.4628841607565012,
+ "grad_norm": 2.755167245864868,
+ "learning_rate": 4.9401044953832e-06,
+ "loss": 0.6548,
+ "step": 979
+ },
+ {
+ "epoch": 0.46335697399527187,
+ "grad_norm": 2.9881057739257812,
+ "learning_rate": 4.939968685205808e-06,
+ "loss": 0.6245,
+ "step": 980
+ },
+ {
+ "epoch": 0.46382978723404256,
+ "grad_norm": 2.9484212398529053,
+ "learning_rate": 4.939832723102266e-06,
+ "loss": 0.655,
+ "step": 981
+ },
+ {
+ "epoch": 0.46430260047281324,
+ "grad_norm": 2.898918628692627,
+ "learning_rate": 4.939696609081038e-06,
+ "loss": 0.6178,
+ "step": 982
+ },
+ {
+ "epoch": 0.46477541371158393,
+ "grad_norm": 2.7052435874938965,
+ "learning_rate": 4.9395603431506e-06,
+ "loss": 0.6393,
+ "step": 983
+ },
+ {
+ "epoch": 0.4652482269503546,
+ "grad_norm": 2.5610013008117676,
+ "learning_rate": 4.939423925319436e-06,
+ "loss": 0.4847,
+ "step": 984
+ },
+ {
+ "epoch": 0.4657210401891253,
+ "grad_norm": 3.229083299636841,
+ "learning_rate": 4.939287355596042e-06,
+ "loss": 0.6473,
+ "step": 985
+ },
+ {
+ "epoch": 0.466193853427896,
+ "grad_norm": 2.907097816467285,
+ "learning_rate": 4.9391506339889195e-06,
+ "loss": 0.652,
+ "step": 986
+ },
+ {
+ "epoch": 0.4666666666666667,
+ "grad_norm": 2.6929478645324707,
+ "learning_rate": 4.939013760506582e-06,
+ "loss": 0.6175,
+ "step": 987
+ },
+ {
+ "epoch": 0.46713947990543736,
+ "grad_norm": 3.414813280105591,
+ "learning_rate": 4.938876735157554e-06,
+ "loss": 0.7597,
+ "step": 988
+ },
+ {
+ "epoch": 0.46761229314420805,
+ "grad_norm": 3.297360420227051,
+ "learning_rate": 4.938739557950365e-06,
+ "loss": 0.6824,
+ "step": 989
+ },
+ {
+ "epoch": 0.46808510638297873,
+ "grad_norm": 3.083155393600464,
+ "learning_rate": 4.938602228893557e-06,
+ "loss": 0.6505,
+ "step": 990
+ },
+ {
+ "epoch": 0.4685579196217494,
+ "grad_norm": 2.9781153202056885,
+ "learning_rate": 4.938464747995681e-06,
+ "loss": 0.666,
+ "step": 991
+ },
+ {
+ "epoch": 0.4690307328605201,
+ "grad_norm": 3.1494534015655518,
+ "learning_rate": 4.9383271152652975e-06,
+ "loss": 0.6422,
+ "step": 992
+ },
+ {
+ "epoch": 0.4695035460992908,
+ "grad_norm": 2.547868490219116,
+ "learning_rate": 4.938189330710976e-06,
+ "loss": 0.5766,
+ "step": 993
+ },
+ {
+ "epoch": 0.4699763593380615,
+ "grad_norm": 2.684736967086792,
+ "learning_rate": 4.938051394341297e-06,
+ "loss": 0.6407,
+ "step": 994
+ },
+ {
+ "epoch": 0.47044917257683216,
+ "grad_norm": 2.9619693756103516,
+ "learning_rate": 4.937913306164847e-06,
+ "loss": 0.6936,
+ "step": 995
+ },
+ {
+ "epoch": 0.47092198581560285,
+ "grad_norm": 2.9698498249053955,
+ "learning_rate": 4.937775066190227e-06,
+ "loss": 0.6464,
+ "step": 996
+ },
+ {
+ "epoch": 0.47139479905437354,
+ "grad_norm": 3.121049642562866,
+ "learning_rate": 4.937636674426042e-06,
+ "loss": 0.6383,
+ "step": 997
+ },
+ {
+ "epoch": 0.4718676122931442,
+ "grad_norm": 3.113672971725464,
+ "learning_rate": 4.93749813088091e-06,
+ "loss": 0.6892,
+ "step": 998
+ },
+ {
+ "epoch": 0.4723404255319149,
+ "grad_norm": 3.126113176345825,
+ "learning_rate": 4.937359435563458e-06,
+ "loss": 0.6728,
+ "step": 999
+ },
+ {
+ "epoch": 0.4728132387706856,
+ "grad_norm": 3.353966236114502,
+ "learning_rate": 4.937220588482321e-06,
+ "loss": 0.6041,
+ "step": 1000
+ },
+ {
+ "epoch": 0.4732860520094563,
+ "grad_norm": 2.8860628604888916,
+ "learning_rate": 4.937081589646144e-06,
+ "loss": 0.6798,
+ "step": 1001
+ },
+ {
+ "epoch": 0.47375886524822697,
+ "grad_norm": 3.0510590076446533,
+ "learning_rate": 4.936942439063584e-06,
+ "loss": 0.5841,
+ "step": 1002
+ },
+ {
+ "epoch": 0.47423167848699765,
+ "grad_norm": 2.6998369693756104,
+ "learning_rate": 4.936803136743303e-06,
+ "loss": 0.6403,
+ "step": 1003
+ },
+ {
+ "epoch": 0.47470449172576834,
+ "grad_norm": 2.875347137451172,
+ "learning_rate": 4.9366636826939765e-06,
+ "loss": 0.5811,
+ "step": 1004
+ },
+ {
+ "epoch": 0.475177304964539,
+ "grad_norm": 2.9122262001037598,
+ "learning_rate": 4.936524076924287e-06,
+ "loss": 0.6852,
+ "step": 1005
+ },
+ {
+ "epoch": 0.4756501182033097,
+ "grad_norm": 2.5167057514190674,
+ "learning_rate": 4.9363843194429265e-06,
+ "loss": 0.5367,
+ "step": 1006
+ },
+ {
+ "epoch": 0.4761229314420804,
+ "grad_norm": 2.5745551586151123,
+ "learning_rate": 4.9362444102585985e-06,
+ "loss": 0.6241,
+ "step": 1007
+ },
+ {
+ "epoch": 0.4765957446808511,
+ "grad_norm": 2.5024216175079346,
+ "learning_rate": 4.9361043493800125e-06,
+ "loss": 0.6133,
+ "step": 1008
+ },
+ {
+ "epoch": 0.47706855791962177,
+ "grad_norm": 2.7281384468078613,
+ "learning_rate": 4.935964136815892e-06,
+ "loss": 0.6834,
+ "step": 1009
+ },
+ {
+ "epoch": 0.47754137115839246,
+ "grad_norm": 3.0118913650512695,
+ "learning_rate": 4.935823772574965e-06,
+ "loss": 0.6922,
+ "step": 1010
+ },
+ {
+ "epoch": 0.47801418439716314,
+ "grad_norm": 3.016216993331909,
+ "learning_rate": 4.935683256665973e-06,
+ "loss": 0.6653,
+ "step": 1011
+ },
+ {
+ "epoch": 0.47848699763593383,
+ "grad_norm": 2.9526784420013428,
+ "learning_rate": 4.9355425890976636e-06,
+ "loss": 0.6423,
+ "step": 1012
+ },
+ {
+ "epoch": 0.4789598108747045,
+ "grad_norm": 6.222797393798828,
+ "learning_rate": 4.9354017698787985e-06,
+ "loss": 0.5884,
+ "step": 1013
+ },
+ {
+ "epoch": 0.4794326241134752,
+ "grad_norm": 2.6553597450256348,
+ "learning_rate": 4.935260799018143e-06,
+ "loss": 0.6624,
+ "step": 1014
+ },
+ {
+ "epoch": 0.4799054373522459,
+ "grad_norm": 3.0942065715789795,
+ "learning_rate": 4.935119676524475e-06,
+ "loss": 0.6623,
+ "step": 1015
+ },
+ {
+ "epoch": 0.4803782505910166,
+ "grad_norm": 2.626359224319458,
+ "learning_rate": 4.934978402406585e-06,
+ "loss": 0.6195,
+ "step": 1016
+ },
+ {
+ "epoch": 0.4808510638297872,
+ "grad_norm": 2.7954699993133545,
+ "learning_rate": 4.934836976673265e-06,
+ "loss": 0.5545,
+ "step": 1017
+ },
+ {
+ "epoch": 0.4813238770685579,
+ "grad_norm": 2.913557291030884,
+ "learning_rate": 4.934695399333324e-06,
+ "loss": 0.6288,
+ "step": 1018
+ },
+ {
+ "epoch": 0.4817966903073286,
+ "grad_norm": 3.1043739318847656,
+ "learning_rate": 4.9345536703955746e-06,
+ "loss": 0.6771,
+ "step": 1019
+ },
+ {
+ "epoch": 0.48226950354609927,
+ "grad_norm": 2.789357900619507,
+ "learning_rate": 4.934411789868845e-06,
+ "loss": 0.6227,
+ "step": 1020
+ },
+ {
+ "epoch": 0.48274231678486995,
+ "grad_norm": 2.480609655380249,
+ "learning_rate": 4.934269757761967e-06,
+ "loss": 0.5779,
+ "step": 1021
+ },
+ {
+ "epoch": 0.48321513002364064,
+ "grad_norm": 2.7946252822875977,
+ "learning_rate": 4.934127574083785e-06,
+ "loss": 0.6166,
+ "step": 1022
+ },
+ {
+ "epoch": 0.4836879432624113,
+ "grad_norm": 3.0670509338378906,
+ "learning_rate": 4.933985238843153e-06,
+ "loss": 0.7766,
+ "step": 1023
+ },
+ {
+ "epoch": 0.484160756501182,
+ "grad_norm": 2.8567559719085693,
+ "learning_rate": 4.933842752048932e-06,
+ "loss": 0.5088,
+ "step": 1024
+ },
+ {
+ "epoch": 0.4846335697399527,
+ "grad_norm": 2.5674657821655273,
+ "learning_rate": 4.933700113709996e-06,
+ "loss": 0.6036,
+ "step": 1025
+ },
+ {
+ "epoch": 0.4851063829787234,
+ "grad_norm": 2.782339096069336,
+ "learning_rate": 4.933557323835224e-06,
+ "loss": 0.5335,
+ "step": 1026
+ },
+ {
+ "epoch": 0.48557919621749407,
+ "grad_norm": 2.6334071159362793,
+ "learning_rate": 4.93341438243351e-06,
+ "loss": 0.6327,
+ "step": 1027
+ },
+ {
+ "epoch": 0.48605200945626476,
+ "grad_norm": 3.0853965282440186,
+ "learning_rate": 4.933271289513751e-06,
+ "loss": 0.7102,
+ "step": 1028
+ },
+ {
+ "epoch": 0.48652482269503544,
+ "grad_norm": 2.619997501373291,
+ "learning_rate": 4.933128045084859e-06,
+ "loss": 0.6138,
+ "step": 1029
+ },
+ {
+ "epoch": 0.48699763593380613,
+ "grad_norm": 2.8316116333007812,
+ "learning_rate": 4.932984649155753e-06,
+ "loss": 0.6346,
+ "step": 1030
+ },
+ {
+ "epoch": 0.4874704491725768,
+ "grad_norm": 3.153486490249634,
+ "learning_rate": 4.932841101735361e-06,
+ "loss": 0.7626,
+ "step": 1031
+ },
+ {
+ "epoch": 0.4879432624113475,
+ "grad_norm": 3.1831274032592773,
+ "learning_rate": 4.9326974028326214e-06,
+ "loss": 0.6607,
+ "step": 1032
+ },
+ {
+ "epoch": 0.4884160756501182,
+ "grad_norm": 2.791078567504883,
+ "learning_rate": 4.932553552456481e-06,
+ "loss": 0.6141,
+ "step": 1033
+ },
+ {
+ "epoch": 0.4888888888888889,
+ "grad_norm": 2.627263307571411,
+ "learning_rate": 4.932409550615898e-06,
+ "loss": 0.6777,
+ "step": 1034
+ },
+ {
+ "epoch": 0.48936170212765956,
+ "grad_norm": 2.8550007343292236,
+ "learning_rate": 4.932265397319838e-06,
+ "loss": 0.6379,
+ "step": 1035
+ },
+ {
+ "epoch": 0.48983451536643025,
+ "grad_norm": 4.505824089050293,
+ "learning_rate": 4.932121092577276e-06,
+ "loss": 0.5892,
+ "step": 1036
+ },
+ {
+ "epoch": 0.49030732860520093,
+ "grad_norm": 3.100191116333008,
+ "learning_rate": 4.931976636397199e-06,
+ "loss": 0.6443,
+ "step": 1037
+ },
+ {
+ "epoch": 0.4907801418439716,
+ "grad_norm": 2.921494245529175,
+ "learning_rate": 4.9318320287886e-06,
+ "loss": 0.6821,
+ "step": 1038
+ },
+ {
+ "epoch": 0.4912529550827423,
+ "grad_norm": 4.577807903289795,
+ "learning_rate": 4.931687269760485e-06,
+ "loss": 0.5946,
+ "step": 1039
+ },
+ {
+ "epoch": 0.491725768321513,
+ "grad_norm": 2.7347636222839355,
+ "learning_rate": 4.931542359321865e-06,
+ "loss": 0.5689,
+ "step": 1040
+ },
+ {
+ "epoch": 0.4921985815602837,
+ "grad_norm": 2.5289158821105957,
+ "learning_rate": 4.931397297481765e-06,
+ "loss": 0.5632,
+ "step": 1041
+ },
+ {
+ "epoch": 0.49267139479905436,
+ "grad_norm": 3.3518471717834473,
+ "learning_rate": 4.9312520842492165e-06,
+ "loss": 0.6349,
+ "step": 1042
+ },
+ {
+ "epoch": 0.49314420803782505,
+ "grad_norm": 3.0469748973846436,
+ "learning_rate": 4.931106719633261e-06,
+ "loss": 0.5734,
+ "step": 1043
+ },
+ {
+ "epoch": 0.49361702127659574,
+ "grad_norm": 3.104682445526123,
+ "learning_rate": 4.930961203642951e-06,
+ "loss": 0.6101,
+ "step": 1044
+ },
+ {
+ "epoch": 0.4940898345153664,
+ "grad_norm": 2.776705503463745,
+ "learning_rate": 4.930815536287346e-06,
+ "loss": 0.6397,
+ "step": 1045
+ },
+ {
+ "epoch": 0.4945626477541371,
+ "grad_norm": 2.760380983352661,
+ "learning_rate": 4.930669717575516e-06,
+ "loss": 0.668,
+ "step": 1046
+ },
+ {
+ "epoch": 0.4950354609929078,
+ "grad_norm": 2.70084547996521,
+ "learning_rate": 4.930523747516541e-06,
+ "loss": 0.5729,
+ "step": 1047
+ },
+ {
+ "epoch": 0.4955082742316785,
+ "grad_norm": 2.7319583892822266,
+ "learning_rate": 4.930377626119511e-06,
+ "loss": 0.6258,
+ "step": 1048
+ },
+ {
+ "epoch": 0.49598108747044917,
+ "grad_norm": 3.2515223026275635,
+ "learning_rate": 4.930231353393521e-06,
+ "loss": 0.7412,
+ "step": 1049
+ },
+ {
+ "epoch": 0.49645390070921985,
+ "grad_norm": 3.0646486282348633,
+ "learning_rate": 4.930084929347682e-06,
+ "loss": 0.5809,
+ "step": 1050
+ },
+ {
+ "epoch": 0.49692671394799054,
+ "grad_norm": 3.1621921062469482,
+ "learning_rate": 4.9299383539911096e-06,
+ "loss": 0.6282,
+ "step": 1051
+ },
+ {
+ "epoch": 0.4973995271867612,
+ "grad_norm": 2.864713191986084,
+ "learning_rate": 4.929791627332931e-06,
+ "loss": 0.6263,
+ "step": 1052
+ },
+ {
+ "epoch": 0.4978723404255319,
+ "grad_norm": 3.181016683578491,
+ "learning_rate": 4.929644749382283e-06,
+ "loss": 0.5697,
+ "step": 1053
+ },
+ {
+ "epoch": 0.4983451536643026,
+ "grad_norm": 2.9064836502075195,
+ "learning_rate": 4.929497720148309e-06,
+ "loss": 0.6161,
+ "step": 1054
+ },
+ {
+ "epoch": 0.4988179669030733,
+ "grad_norm": 3.058112859725952,
+ "learning_rate": 4.9293505396401655e-06,
+ "loss": 0.6477,
+ "step": 1055
+ },
+ {
+ "epoch": 0.49929078014184397,
+ "grad_norm": 2.5227596759796143,
+ "learning_rate": 4.929203207867016e-06,
+ "loss": 0.5819,
+ "step": 1056
+ },
+ {
+ "epoch": 0.49976359338061466,
+ "grad_norm": 3.386862277984619,
+ "learning_rate": 4.929055724838035e-06,
+ "loss": 0.7342,
+ "step": 1057
+ },
+ {
+ "epoch": 0.5002364066193853,
+ "grad_norm": 3.368346929550171,
+ "learning_rate": 4.928908090562404e-06,
+ "loss": 0.6622,
+ "step": 1058
+ },
+ {
+ "epoch": 0.500709219858156,
+ "grad_norm": 2.9108314514160156,
+ "learning_rate": 4.928760305049317e-06,
+ "loss": 0.6598,
+ "step": 1059
+ },
+ {
+ "epoch": 0.5011820330969267,
+ "grad_norm": 2.822305917739868,
+ "learning_rate": 4.928612368307977e-06,
+ "loss": 0.5841,
+ "step": 1060
+ },
+ {
+ "epoch": 0.5016548463356973,
+ "grad_norm": 2.689131259918213,
+ "learning_rate": 4.928464280347592e-06,
+ "loss": 0.6631,
+ "step": 1061
+ },
+ {
+ "epoch": 0.502127659574468,
+ "grad_norm": 3.337214946746826,
+ "learning_rate": 4.9283160411773864e-06,
+ "loss": 0.6105,
+ "step": 1062
+ },
+ {
+ "epoch": 0.5026004728132387,
+ "grad_norm": 3.035911798477173,
+ "learning_rate": 4.928167650806588e-06,
+ "loss": 0.6981,
+ "step": 1063
+ },
+ {
+ "epoch": 0.5030732860520094,
+ "grad_norm": 2.8820855617523193,
+ "learning_rate": 4.9280191092444375e-06,
+ "loss": 0.6408,
+ "step": 1064
+ },
+ {
+ "epoch": 0.5035460992907801,
+ "grad_norm": 3.080432415008545,
+ "learning_rate": 4.927870416500183e-06,
+ "loss": 0.6398,
+ "step": 1065
+ },
+ {
+ "epoch": 0.5040189125295508,
+ "grad_norm": 2.761612892150879,
+ "learning_rate": 4.927721572583084e-06,
+ "loss": 0.6126,
+ "step": 1066
+ },
+ {
+ "epoch": 0.5044917257683215,
+ "grad_norm": 2.8561882972717285,
+ "learning_rate": 4.927572577502408e-06,
+ "loss": 0.584,
+ "step": 1067
+ },
+ {
+ "epoch": 0.5049645390070922,
+ "grad_norm": 3.3386311531066895,
+ "learning_rate": 4.927423431267432e-06,
+ "loss": 0.6666,
+ "step": 1068
+ },
+ {
+ "epoch": 0.5054373522458628,
+ "grad_norm": 2.632906675338745,
+ "learning_rate": 4.927274133887443e-06,
+ "loss": 0.632,
+ "step": 1069
+ },
+ {
+ "epoch": 0.5059101654846335,
+ "grad_norm": 2.8737308979034424,
+ "learning_rate": 4.927124685371737e-06,
+ "loss": 0.6051,
+ "step": 1070
+ },
+ {
+ "epoch": 0.5063829787234042,
+ "grad_norm": 3.042222738265991,
+ "learning_rate": 4.926975085729619e-06,
+ "loss": 0.6954,
+ "step": 1071
+ },
+ {
+ "epoch": 0.5068557919621749,
+ "grad_norm": 3.3341481685638428,
+ "learning_rate": 4.926825334970404e-06,
+ "loss": 0.7148,
+ "step": 1072
+ },
+ {
+ "epoch": 0.5073286052009456,
+ "grad_norm": 2.7415387630462646,
+ "learning_rate": 4.926675433103418e-06,
+ "loss": 0.5456,
+ "step": 1073
+ },
+ {
+ "epoch": 0.5078014184397163,
+ "grad_norm": 2.7545325756073,
+ "learning_rate": 4.926525380137993e-06,
+ "loss": 0.6213,
+ "step": 1074
+ },
+ {
+ "epoch": 0.508274231678487,
+ "grad_norm": 2.9153690338134766,
+ "learning_rate": 4.926375176083472e-06,
+ "loss": 0.6466,
+ "step": 1075
+ },
+ {
+ "epoch": 0.5087470449172576,
+ "grad_norm": 4.210638523101807,
+ "learning_rate": 4.926224820949209e-06,
+ "loss": 0.6192,
+ "step": 1076
+ },
+ {
+ "epoch": 0.5092198581560283,
+ "grad_norm": 2.4357898235321045,
+ "learning_rate": 4.926074314744565e-06,
+ "loss": 0.594,
+ "step": 1077
+ },
+ {
+ "epoch": 0.509692671394799,
+ "grad_norm": 2.8004701137542725,
+ "learning_rate": 4.92592365747891e-06,
+ "loss": 0.6276,
+ "step": 1078
+ },
+ {
+ "epoch": 0.5101654846335697,
+ "grad_norm": 2.920675039291382,
+ "learning_rate": 4.925772849161628e-06,
+ "loss": 0.6043,
+ "step": 1079
+ },
+ {
+ "epoch": 0.5106382978723404,
+ "grad_norm": 2.791555404663086,
+ "learning_rate": 4.9256218898021055e-06,
+ "loss": 0.6837,
+ "step": 1080
+ },
+ {
+ "epoch": 0.5111111111111111,
+ "grad_norm": 3.1702463626861572,
+ "learning_rate": 4.925470779409746e-06,
+ "loss": 0.668,
+ "step": 1081
+ },
+ {
+ "epoch": 0.5115839243498818,
+ "grad_norm": 2.7149479389190674,
+ "learning_rate": 4.925319517993955e-06,
+ "loss": 0.5842,
+ "step": 1082
+ },
+ {
+ "epoch": 0.5120567375886524,
+ "grad_norm": 2.916311025619507,
+ "learning_rate": 4.925168105564153e-06,
+ "loss": 0.6893,
+ "step": 1083
+ },
+ {
+ "epoch": 0.5125295508274231,
+ "grad_norm": 2.917654514312744,
+ "learning_rate": 4.925016542129767e-06,
+ "loss": 0.6513,
+ "step": 1084
+ },
+ {
+ "epoch": 0.5130023640661938,
+ "grad_norm": 2.5568928718566895,
+ "learning_rate": 4.924864827700234e-06,
+ "loss": 0.6177,
+ "step": 1085
+ },
+ {
+ "epoch": 0.5134751773049645,
+ "grad_norm": 2.816720485687256,
+ "learning_rate": 4.924712962285001e-06,
+ "loss": 0.5833,
+ "step": 1086
+ },
+ {
+ "epoch": 0.5139479905437352,
+ "grad_norm": 2.6989188194274902,
+ "learning_rate": 4.9245609458935235e-06,
+ "loss": 0.6332,
+ "step": 1087
+ },
+ {
+ "epoch": 0.5144208037825059,
+ "grad_norm": 2.959599494934082,
+ "learning_rate": 4.924408778535268e-06,
+ "loss": 0.626,
+ "step": 1088
+ },
+ {
+ "epoch": 0.5148936170212766,
+ "grad_norm": 2.872814416885376,
+ "learning_rate": 4.924256460219708e-06,
+ "loss": 0.6407,
+ "step": 1089
+ },
+ {
+ "epoch": 0.5153664302600472,
+ "grad_norm": 2.6989097595214844,
+ "learning_rate": 4.924103990956329e-06,
+ "loss": 0.6391,
+ "step": 1090
+ },
+ {
+ "epoch": 0.5158392434988179,
+ "grad_norm": 2.986492156982422,
+ "learning_rate": 4.9239513707546235e-06,
+ "loss": 0.6911,
+ "step": 1091
+ },
+ {
+ "epoch": 0.5163120567375886,
+ "grad_norm": 3.069920301437378,
+ "learning_rate": 4.9237985996240954e-06,
+ "loss": 0.671,
+ "step": 1092
+ },
+ {
+ "epoch": 0.5167848699763593,
+ "grad_norm": 2.8214917182922363,
+ "learning_rate": 4.9236456775742555e-06,
+ "loss": 0.5885,
+ "step": 1093
+ },
+ {
+ "epoch": 0.51725768321513,
+ "grad_norm": 2.9416961669921875,
+ "learning_rate": 4.923492604614627e-06,
+ "loss": 0.6293,
+ "step": 1094
+ },
+ {
+ "epoch": 0.5177304964539007,
+ "grad_norm": 2.761780023574829,
+ "learning_rate": 4.923339380754741e-06,
+ "loss": 0.649,
+ "step": 1095
+ },
+ {
+ "epoch": 0.5182033096926714,
+ "grad_norm": 2.7648792266845703,
+ "learning_rate": 4.923186006004138e-06,
+ "loss": 0.5906,
+ "step": 1096
+ },
+ {
+ "epoch": 0.518676122931442,
+ "grad_norm": 3.5535428524017334,
+ "learning_rate": 4.923032480372367e-06,
+ "loss": 0.7138,
+ "step": 1097
+ },
+ {
+ "epoch": 0.5191489361702127,
+ "grad_norm": 2.6252479553222656,
+ "learning_rate": 4.922878803868988e-06,
+ "loss": 0.5499,
+ "step": 1098
+ },
+ {
+ "epoch": 0.5196217494089834,
+ "grad_norm": 2.901002883911133,
+ "learning_rate": 4.9227249765035715e-06,
+ "loss": 0.6991,
+ "step": 1099
+ },
+ {
+ "epoch": 0.5200945626477541,
+ "grad_norm": 2.621877431869507,
+ "learning_rate": 4.9225709982856925e-06,
+ "loss": 0.6269,
+ "step": 1100
+ },
+ {
+ "epoch": 0.5205673758865248,
+ "grad_norm": 2.872483015060425,
+ "learning_rate": 4.92241686922494e-06,
+ "loss": 0.6657,
+ "step": 1101
+ },
+ {
+ "epoch": 0.5210401891252955,
+ "grad_norm": 2.730447769165039,
+ "learning_rate": 4.922262589330912e-06,
+ "loss": 0.6061,
+ "step": 1102
+ },
+ {
+ "epoch": 0.5215130023640662,
+ "grad_norm": 2.646247386932373,
+ "learning_rate": 4.922108158613213e-06,
+ "loss": 0.5923,
+ "step": 1103
+ },
+ {
+ "epoch": 0.5219858156028369,
+ "grad_norm": 2.6488895416259766,
+ "learning_rate": 4.92195357708146e-06,
+ "loss": 0.6293,
+ "step": 1104
+ },
+ {
+ "epoch": 0.5224586288416075,
+ "grad_norm": 2.756338357925415,
+ "learning_rate": 4.921798844745278e-06,
+ "loss": 0.6374,
+ "step": 1105
+ },
+ {
+ "epoch": 0.5229314420803782,
+ "grad_norm": 3.1441280841827393,
+ "learning_rate": 4.921643961614301e-06,
+ "loss": 0.6652,
+ "step": 1106
+ },
+ {
+ "epoch": 0.5234042553191489,
+ "grad_norm": 3.050002098083496,
+ "learning_rate": 4.921488927698172e-06,
+ "loss": 0.6809,
+ "step": 1107
+ },
+ {
+ "epoch": 0.5238770685579196,
+ "grad_norm": 2.71750807762146,
+ "learning_rate": 4.921333743006547e-06,
+ "loss": 0.6266,
+ "step": 1108
+ },
+ {
+ "epoch": 0.5243498817966903,
+ "grad_norm": 2.8439245223999023,
+ "learning_rate": 4.921178407549086e-06,
+ "loss": 0.5663,
+ "step": 1109
+ },
+ {
+ "epoch": 0.524822695035461,
+ "grad_norm": 3.0722241401672363,
+ "learning_rate": 4.921022921335464e-06,
+ "loss": 0.6791,
+ "step": 1110
+ },
+ {
+ "epoch": 0.5252955082742317,
+ "grad_norm": 3.4381656646728516,
+ "learning_rate": 4.920867284375358e-06,
+ "loss": 0.6687,
+ "step": 1111
+ },
+ {
+ "epoch": 0.5257683215130023,
+ "grad_norm": 2.819812774658203,
+ "learning_rate": 4.920711496678463e-06,
+ "loss": 0.6299,
+ "step": 1112
+ },
+ {
+ "epoch": 0.526241134751773,
+ "grad_norm": 3.6587414741516113,
+ "learning_rate": 4.9205555582544765e-06,
+ "loss": 0.7392,
+ "step": 1113
+ },
+ {
+ "epoch": 0.5267139479905437,
+ "grad_norm": 2.774296522140503,
+ "learning_rate": 4.920399469113109e-06,
+ "loss": 0.6652,
+ "step": 1114
+ },
+ {
+ "epoch": 0.5271867612293144,
+ "grad_norm": 2.7480580806732178,
+ "learning_rate": 4.920243229264081e-06,
+ "loss": 0.596,
+ "step": 1115
+ },
+ {
+ "epoch": 0.5276595744680851,
+ "grad_norm": 3.213057518005371,
+ "learning_rate": 4.920086838717119e-06,
+ "loss": 0.6986,
+ "step": 1116
+ },
+ {
+ "epoch": 0.5281323877068558,
+ "grad_norm": 2.940546989440918,
+ "learning_rate": 4.919930297481962e-06,
+ "loss": 0.6481,
+ "step": 1117
+ },
+ {
+ "epoch": 0.5286052009456265,
+ "grad_norm": 2.5970494747161865,
+ "learning_rate": 4.9197736055683555e-06,
+ "loss": 0.5658,
+ "step": 1118
+ },
+ {
+ "epoch": 0.5290780141843971,
+ "grad_norm": 4.49385404586792,
+ "learning_rate": 4.919616762986057e-06,
+ "loss": 0.605,
+ "step": 1119
+ },
+ {
+ "epoch": 0.5295508274231678,
+ "grad_norm": 2.971857786178589,
+ "learning_rate": 4.919459769744833e-06,
+ "loss": 0.6539,
+ "step": 1120
+ },
+ {
+ "epoch": 0.5300236406619385,
+ "grad_norm": 2.6192965507507324,
+ "learning_rate": 4.919302625854457e-06,
+ "loss": 0.6226,
+ "step": 1121
+ },
+ {
+ "epoch": 0.5304964539007092,
+ "grad_norm": 2.665088176727295,
+ "learning_rate": 4.919145331324716e-06,
+ "loss": 0.6647,
+ "step": 1122
+ },
+ {
+ "epoch": 0.5309692671394799,
+ "grad_norm": 2.612126111984253,
+ "learning_rate": 4.918987886165403e-06,
+ "loss": 0.6965,
+ "step": 1123
+ },
+ {
+ "epoch": 0.5314420803782506,
+ "grad_norm": 3.80017352104187,
+ "learning_rate": 4.9188302903863205e-06,
+ "loss": 0.7396,
+ "step": 1124
+ },
+ {
+ "epoch": 0.5319148936170213,
+ "grad_norm": 2.781752824783325,
+ "learning_rate": 4.918672543997282e-06,
+ "loss": 0.5985,
+ "step": 1125
+ },
+ {
+ "epoch": 0.532387706855792,
+ "grad_norm": 2.6067914962768555,
+ "learning_rate": 4.91851464700811e-06,
+ "loss": 0.6159,
+ "step": 1126
+ },
+ {
+ "epoch": 0.5328605200945626,
+ "grad_norm": 2.670807123184204,
+ "learning_rate": 4.918356599428636e-06,
+ "loss": 0.5958,
+ "step": 1127
+ },
+ {
+ "epoch": 0.5333333333333333,
+ "grad_norm": 2.608611822128296,
+ "learning_rate": 4.9181984012687e-06,
+ "loss": 0.5768,
+ "step": 1128
+ },
+ {
+ "epoch": 0.533806146572104,
+ "grad_norm": 2.586764097213745,
+ "learning_rate": 4.918040052538154e-06,
+ "loss": 0.661,
+ "step": 1129
+ },
+ {
+ "epoch": 0.5342789598108747,
+ "grad_norm": 3.1317451000213623,
+ "learning_rate": 4.917881553246856e-06,
+ "loss": 0.6626,
+ "step": 1130
+ },
+ {
+ "epoch": 0.5347517730496454,
+ "grad_norm": 2.7135281562805176,
+ "learning_rate": 4.917722903404676e-06,
+ "loss": 0.6572,
+ "step": 1131
+ },
+ {
+ "epoch": 0.5352245862884161,
+ "grad_norm": 3.4546358585357666,
+ "learning_rate": 4.917564103021493e-06,
+ "loss": 0.5597,
+ "step": 1132
+ },
+ {
+ "epoch": 0.5356973995271868,
+ "grad_norm": 3.0943493843078613,
+ "learning_rate": 4.917405152107193e-06,
+ "loss": 0.7258,
+ "step": 1133
+ },
+ {
+ "epoch": 0.5361702127659574,
+ "grad_norm": 2.6069352626800537,
+ "learning_rate": 4.917246050671674e-06,
+ "loss": 0.6209,
+ "step": 1134
+ },
+ {
+ "epoch": 0.5366430260047281,
+ "grad_norm": 2.584883689880371,
+ "learning_rate": 4.917086798724844e-06,
+ "loss": 0.658,
+ "step": 1135
+ },
+ {
+ "epoch": 0.5371158392434988,
+ "grad_norm": 3.001976490020752,
+ "learning_rate": 4.9169273962766166e-06,
+ "loss": 0.6306,
+ "step": 1136
+ },
+ {
+ "epoch": 0.5375886524822695,
+ "grad_norm": 2.5013928413391113,
+ "learning_rate": 4.916767843336918e-06,
+ "loss": 0.572,
+ "step": 1137
+ },
+ {
+ "epoch": 0.5380614657210402,
+ "grad_norm": 2.9114553928375244,
+ "learning_rate": 4.916608139915684e-06,
+ "loss": 0.5841,
+ "step": 1138
+ },
+ {
+ "epoch": 0.5385342789598109,
+ "grad_norm": 2.8878467082977295,
+ "learning_rate": 4.9164482860228564e-06,
+ "loss": 0.6654,
+ "step": 1139
+ },
+ {
+ "epoch": 0.5390070921985816,
+ "grad_norm": 2.9827866554260254,
+ "learning_rate": 4.91628828166839e-06,
+ "loss": 0.6674,
+ "step": 1140
+ },
+ {
+ "epoch": 0.5394799054373522,
+ "grad_norm": 3.8696281909942627,
+ "learning_rate": 4.916128126862248e-06,
+ "loss": 0.6241,
+ "step": 1141
+ },
+ {
+ "epoch": 0.5399527186761229,
+ "grad_norm": 2.9556291103363037,
+ "learning_rate": 4.915967821614402e-06,
+ "loss": 0.6478,
+ "step": 1142
+ },
+ {
+ "epoch": 0.5404255319148936,
+ "grad_norm": 2.392942428588867,
+ "learning_rate": 4.915807365934834e-06,
+ "loss": 0.6097,
+ "step": 1143
+ },
+ {
+ "epoch": 0.5408983451536643,
+ "grad_norm": 3.032235860824585,
+ "learning_rate": 4.915646759833534e-06,
+ "loss": 0.7193,
+ "step": 1144
+ },
+ {
+ "epoch": 0.541371158392435,
+ "grad_norm": 2.840416193008423,
+ "learning_rate": 4.915486003320501e-06,
+ "loss": 0.5506,
+ "step": 1145
+ },
+ {
+ "epoch": 0.5418439716312057,
+ "grad_norm": 2.5438895225524902,
+ "learning_rate": 4.915325096405747e-06,
+ "loss": 0.6487,
+ "step": 1146
+ },
+ {
+ "epoch": 0.5423167848699764,
+ "grad_norm": 2.544334650039673,
+ "learning_rate": 4.9151640390992905e-06,
+ "loss": 0.6168,
+ "step": 1147
+ },
+ {
+ "epoch": 0.542789598108747,
+ "grad_norm": 2.8535678386688232,
+ "learning_rate": 4.91500283141116e-06,
+ "loss": 0.678,
+ "step": 1148
+ },
+ {
+ "epoch": 0.5432624113475177,
+ "grad_norm": 2.8086955547332764,
+ "learning_rate": 4.9148414733513915e-06,
+ "loss": 0.6473,
+ "step": 1149
+ },
+ {
+ "epoch": 0.5437352245862884,
+ "grad_norm": 2.4709885120391846,
+ "learning_rate": 4.914679964930034e-06,
+ "loss": 0.6797,
+ "step": 1150
+ },
+ {
+ "epoch": 0.5442080378250591,
+ "grad_norm": 2.8546934127807617,
+ "learning_rate": 4.9145183061571435e-06,
+ "loss": 0.6247,
+ "step": 1151
+ },
+ {
+ "epoch": 0.5446808510638298,
+ "grad_norm": 2.991184711456299,
+ "learning_rate": 4.9143564970427844e-06,
+ "loss": 0.5977,
+ "step": 1152
+ },
+ {
+ "epoch": 0.5451536643026005,
+ "grad_norm": 3.011216402053833,
+ "learning_rate": 4.914194537597033e-06,
+ "loss": 0.7005,
+ "step": 1153
+ },
+ {
+ "epoch": 0.5456264775413712,
+ "grad_norm": 2.807521343231201,
+ "learning_rate": 4.9140324278299744e-06,
+ "loss": 0.5412,
+ "step": 1154
+ },
+ {
+ "epoch": 0.5460992907801419,
+ "grad_norm": 3.0401229858398438,
+ "learning_rate": 4.913870167751701e-06,
+ "loss": 0.6394,
+ "step": 1155
+ },
+ {
+ "epoch": 0.5465721040189125,
+ "grad_norm": 2.853914976119995,
+ "learning_rate": 4.913707757372317e-06,
+ "loss": 0.6745,
+ "step": 1156
+ },
+ {
+ "epoch": 0.5470449172576832,
+ "grad_norm": 4.505620956420898,
+ "learning_rate": 4.913545196701935e-06,
+ "loss": 0.6668,
+ "step": 1157
+ },
+ {
+ "epoch": 0.5475177304964539,
+ "grad_norm": 3.0505781173706055,
+ "learning_rate": 4.913382485750676e-06,
+ "loss": 0.6926,
+ "step": 1158
+ },
+ {
+ "epoch": 0.5479905437352246,
+ "grad_norm": 2.798435688018799,
+ "learning_rate": 4.913219624528672e-06,
+ "loss": 0.605,
+ "step": 1159
+ },
+ {
+ "epoch": 0.5484633569739953,
+ "grad_norm": 2.7814908027648926,
+ "learning_rate": 4.913056613046065e-06,
+ "loss": 0.6678,
+ "step": 1160
+ },
+ {
+ "epoch": 0.548936170212766,
+ "grad_norm": 3.2089321613311768,
+ "learning_rate": 4.9128934513130025e-06,
+ "loss": 0.5995,
+ "step": 1161
+ },
+ {
+ "epoch": 0.5494089834515367,
+ "grad_norm": 2.7699952125549316,
+ "learning_rate": 4.9127301393396455e-06,
+ "loss": 0.7062,
+ "step": 1162
+ },
+ {
+ "epoch": 0.5498817966903073,
+ "grad_norm": 2.859368324279785,
+ "learning_rate": 4.912566677136162e-06,
+ "loss": 0.6063,
+ "step": 1163
+ },
+ {
+ "epoch": 0.550354609929078,
+ "grad_norm": 2.727334499359131,
+ "learning_rate": 4.91240306471273e-06,
+ "loss": 0.6848,
+ "step": 1164
+ },
+ {
+ "epoch": 0.5508274231678487,
+ "grad_norm": 2.6017510890960693,
+ "learning_rate": 4.912239302079537e-06,
+ "loss": 0.5808,
+ "step": 1165
+ },
+ {
+ "epoch": 0.5513002364066194,
+ "grad_norm": 3.539583206176758,
+ "learning_rate": 4.912075389246781e-06,
+ "loss": 0.7053,
+ "step": 1166
+ },
+ {
+ "epoch": 0.5517730496453901,
+ "grad_norm": 2.918280601501465,
+ "learning_rate": 4.911911326224666e-06,
+ "loss": 0.5904,
+ "step": 1167
+ },
+ {
+ "epoch": 0.5522458628841608,
+ "grad_norm": 3.0067362785339355,
+ "learning_rate": 4.9117471130234095e-06,
+ "loss": 0.6392,
+ "step": 1168
+ },
+ {
+ "epoch": 0.5527186761229315,
+ "grad_norm": 2.4374797344207764,
+ "learning_rate": 4.911582749653236e-06,
+ "loss": 0.5793,
+ "step": 1169
+ },
+ {
+ "epoch": 0.5531914893617021,
+ "grad_norm": 3.121182918548584,
+ "learning_rate": 4.911418236124378e-06,
+ "loss": 0.6636,
+ "step": 1170
+ },
+ {
+ "epoch": 0.5536643026004728,
+ "grad_norm": 3.1289851665496826,
+ "learning_rate": 4.91125357244708e-06,
+ "loss": 0.656,
+ "step": 1171
+ },
+ {
+ "epoch": 0.5541371158392435,
+ "grad_norm": 2.7034592628479004,
+ "learning_rate": 4.911088758631596e-06,
+ "loss": 0.6001,
+ "step": 1172
+ },
+ {
+ "epoch": 0.5546099290780142,
+ "grad_norm": 2.710146188735962,
+ "learning_rate": 4.910923794688187e-06,
+ "loss": 0.6007,
+ "step": 1173
+ },
+ {
+ "epoch": 0.5550827423167849,
+ "grad_norm": 2.5424487590789795,
+ "learning_rate": 4.910758680627124e-06,
+ "loss": 0.5193,
+ "step": 1174
+ },
+ {
+ "epoch": 0.5555555555555556,
+ "grad_norm": 2.615893602371216,
+ "learning_rate": 4.91059341645869e-06,
+ "loss": 0.5525,
+ "step": 1175
+ },
+ {
+ "epoch": 0.5560283687943263,
+ "grad_norm": 3.3179728984832764,
+ "learning_rate": 4.910428002193174e-06,
+ "loss": 0.7285,
+ "step": 1176
+ },
+ {
+ "epoch": 0.556501182033097,
+ "grad_norm": 2.7234175205230713,
+ "learning_rate": 4.910262437840875e-06,
+ "loss": 0.574,
+ "step": 1177
+ },
+ {
+ "epoch": 0.5569739952718676,
+ "grad_norm": 3.0416605472564697,
+ "learning_rate": 4.9100967234121034e-06,
+ "loss": 0.5623,
+ "step": 1178
+ },
+ {
+ "epoch": 0.5574468085106383,
+ "grad_norm": 3.067786455154419,
+ "learning_rate": 4.909930858917177e-06,
+ "loss": 0.6491,
+ "step": 1179
+ },
+ {
+ "epoch": 0.557919621749409,
+ "grad_norm": 3.0037379264831543,
+ "learning_rate": 4.909764844366422e-06,
+ "loss": 0.5696,
+ "step": 1180
+ },
+ {
+ "epoch": 0.5583924349881797,
+ "grad_norm": 2.966179609298706,
+ "learning_rate": 4.909598679770178e-06,
+ "loss": 0.6042,
+ "step": 1181
+ },
+ {
+ "epoch": 0.5588652482269504,
+ "grad_norm": 2.6000657081604004,
+ "learning_rate": 4.909432365138789e-06,
+ "loss": 0.5883,
+ "step": 1182
+ },
+ {
+ "epoch": 0.5593380614657211,
+ "grad_norm": 2.6794495582580566,
+ "learning_rate": 4.909265900482612e-06,
+ "loss": 0.6809,
+ "step": 1183
+ },
+ {
+ "epoch": 0.5598108747044918,
+ "grad_norm": 2.6765122413635254,
+ "learning_rate": 4.9090992858120115e-06,
+ "loss": 0.6601,
+ "step": 1184
+ },
+ {
+ "epoch": 0.5602836879432624,
+ "grad_norm": 2.6051928997039795,
+ "learning_rate": 4.908932521137363e-06,
+ "loss": 0.5946,
+ "step": 1185
+ },
+ {
+ "epoch": 0.5607565011820331,
+ "grad_norm": 3.0405542850494385,
+ "learning_rate": 4.908765606469048e-06,
+ "loss": 0.6998,
+ "step": 1186
+ },
+ {
+ "epoch": 0.5612293144208038,
+ "grad_norm": 2.7975668907165527,
+ "learning_rate": 4.908598541817462e-06,
+ "loss": 0.6218,
+ "step": 1187
+ },
+ {
+ "epoch": 0.5617021276595745,
+ "grad_norm": 2.5367627143859863,
+ "learning_rate": 4.908431327193005e-06,
+ "loss": 0.6354,
+ "step": 1188
+ },
+ {
+ "epoch": 0.5621749408983452,
+ "grad_norm": 3.7939631938934326,
+ "learning_rate": 4.908263962606091e-06,
+ "loss": 0.6376,
+ "step": 1189
+ },
+ {
+ "epoch": 0.5626477541371159,
+ "grad_norm": 2.864079475402832,
+ "learning_rate": 4.908096448067139e-06,
+ "loss": 0.5485,
+ "step": 1190
+ },
+ {
+ "epoch": 0.5631205673758866,
+ "grad_norm": 2.7855563163757324,
+ "learning_rate": 4.9079287835865804e-06,
+ "loss": 0.6645,
+ "step": 1191
+ },
+ {
+ "epoch": 0.5635933806146572,
+ "grad_norm": 2.6156625747680664,
+ "learning_rate": 4.9077609691748556e-06,
+ "loss": 0.5751,
+ "step": 1192
+ },
+ {
+ "epoch": 0.5640661938534279,
+ "grad_norm": 3.0475659370422363,
+ "learning_rate": 4.907593004842412e-06,
+ "loss": 0.6739,
+ "step": 1193
+ },
+ {
+ "epoch": 0.5645390070921986,
+ "grad_norm": 2.9176738262176514,
+ "learning_rate": 4.9074248905997104e-06,
+ "loss": 0.6493,
+ "step": 1194
+ },
+ {
+ "epoch": 0.5650118203309693,
+ "grad_norm": 2.6168384552001953,
+ "learning_rate": 4.907256626457216e-06,
+ "loss": 0.6154,
+ "step": 1195
+ },
+ {
+ "epoch": 0.56548463356974,
+ "grad_norm": 2.893980026245117,
+ "learning_rate": 4.907088212425408e-06,
+ "loss": 0.5808,
+ "step": 1196
+ },
+ {
+ "epoch": 0.5659574468085107,
+ "grad_norm": 3.3832836151123047,
+ "learning_rate": 4.90691964851477e-06,
+ "loss": 0.7888,
+ "step": 1197
+ },
+ {
+ "epoch": 0.5664302600472814,
+ "grad_norm": 3.088932752609253,
+ "learning_rate": 4.906750934735801e-06,
+ "loss": 0.6516,
+ "step": 1198
+ },
+ {
+ "epoch": 0.566903073286052,
+ "grad_norm": 2.494471549987793,
+ "learning_rate": 4.906582071099004e-06,
+ "loss": 0.6286,
+ "step": 1199
+ },
+ {
+ "epoch": 0.5673758865248227,
+ "grad_norm": 2.716550588607788,
+ "learning_rate": 4.906413057614895e-06,
+ "loss": 0.5939,
+ "step": 1200
+ },
+ {
+ "epoch": 0.5678486997635934,
+ "grad_norm": 2.5821073055267334,
+ "learning_rate": 4.906243894293995e-06,
+ "loss": 0.6668,
+ "step": 1201
+ },
+ {
+ "epoch": 0.5683215130023641,
+ "grad_norm": 3.651787042617798,
+ "learning_rate": 4.90607458114684e-06,
+ "loss": 0.6124,
+ "step": 1202
+ },
+ {
+ "epoch": 0.5687943262411348,
+ "grad_norm": 2.7567858695983887,
+ "learning_rate": 4.9059051181839705e-06,
+ "loss": 0.6656,
+ "step": 1203
+ },
+ {
+ "epoch": 0.5692671394799055,
+ "grad_norm": 2.8067586421966553,
+ "learning_rate": 4.90573550541594e-06,
+ "loss": 0.6306,
+ "step": 1204
+ },
+ {
+ "epoch": 0.5697399527186762,
+ "grad_norm": 2.6136393547058105,
+ "learning_rate": 4.905565742853307e-06,
+ "loss": 0.5992,
+ "step": 1205
+ },
+ {
+ "epoch": 0.5702127659574469,
+ "grad_norm": 2.899049758911133,
+ "learning_rate": 4.905395830506644e-06,
+ "loss": 0.621,
+ "step": 1206
+ },
+ {
+ "epoch": 0.5706855791962175,
+ "grad_norm": 3.036583185195923,
+ "learning_rate": 4.9052257683865294e-06,
+ "loss": 0.652,
+ "step": 1207
+ },
+ {
+ "epoch": 0.5711583924349882,
+ "grad_norm": 2.7947216033935547,
+ "learning_rate": 4.905055556503553e-06,
+ "loss": 0.6636,
+ "step": 1208
+ },
+ {
+ "epoch": 0.5716312056737589,
+ "grad_norm": 3.1646955013275146,
+ "learning_rate": 4.9048851948683135e-06,
+ "loss": 0.6376,
+ "step": 1209
+ },
+ {
+ "epoch": 0.5721040189125296,
+ "grad_norm": 2.8175766468048096,
+ "learning_rate": 4.904714683491417e-06,
+ "loss": 0.5929,
+ "step": 1210
+ },
+ {
+ "epoch": 0.5725768321513003,
+ "grad_norm": 2.923923969268799,
+ "learning_rate": 4.904544022383483e-06,
+ "loss": 0.6633,
+ "step": 1211
+ },
+ {
+ "epoch": 0.573049645390071,
+ "grad_norm": 2.7471134662628174,
+ "learning_rate": 4.9043732115551356e-06,
+ "loss": 0.6551,
+ "step": 1212
+ },
+ {
+ "epoch": 0.5735224586288417,
+ "grad_norm": 2.8660807609558105,
+ "learning_rate": 4.90420225101701e-06,
+ "loss": 0.6423,
+ "step": 1213
+ },
+ {
+ "epoch": 0.5739952718676123,
+ "grad_norm": 2.769247531890869,
+ "learning_rate": 4.904031140779754e-06,
+ "loss": 0.5982,
+ "step": 1214
+ },
+ {
+ "epoch": 0.574468085106383,
+ "grad_norm": 2.9043145179748535,
+ "learning_rate": 4.90385988085402e-06,
+ "loss": 0.5843,
+ "step": 1215
+ },
+ {
+ "epoch": 0.5749408983451537,
+ "grad_norm": 2.6639609336853027,
+ "learning_rate": 4.903688471250471e-06,
+ "loss": 0.5858,
+ "step": 1216
+ },
+ {
+ "epoch": 0.5754137115839244,
+ "grad_norm": 2.6967573165893555,
+ "learning_rate": 4.903516911979781e-06,
+ "loss": 0.5755,
+ "step": 1217
+ },
+ {
+ "epoch": 0.5758865248226951,
+ "grad_norm": 2.8865857124328613,
+ "learning_rate": 4.903345203052633e-06,
+ "loss": 0.6051,
+ "step": 1218
+ },
+ {
+ "epoch": 0.5763593380614658,
+ "grad_norm": 2.381979465484619,
+ "learning_rate": 4.903173344479717e-06,
+ "loss": 0.5727,
+ "step": 1219
+ },
+ {
+ "epoch": 0.5768321513002365,
+ "grad_norm": 2.7717981338500977,
+ "learning_rate": 4.903001336271734e-06,
+ "loss": 0.6406,
+ "step": 1220
+ },
+ {
+ "epoch": 0.577304964539007,
+ "grad_norm": 2.6431570053100586,
+ "learning_rate": 4.902829178439395e-06,
+ "loss": 0.6226,
+ "step": 1221
+ },
+ {
+ "epoch": 0.5777777777777777,
+ "grad_norm": 2.8090415000915527,
+ "learning_rate": 4.902656870993419e-06,
+ "loss": 0.5761,
+ "step": 1222
+ },
+ {
+ "epoch": 0.5782505910165484,
+ "grad_norm": 2.4769368171691895,
+ "learning_rate": 4.902484413944535e-06,
+ "loss": 0.5602,
+ "step": 1223
+ },
+ {
+ "epoch": 0.5787234042553191,
+ "grad_norm": 2.693316698074341,
+ "learning_rate": 4.902311807303481e-06,
+ "loss": 0.5222,
+ "step": 1224
+ },
+ {
+ "epoch": 0.5791962174940898,
+ "grad_norm": 2.7623913288116455,
+ "learning_rate": 4.902139051081004e-06,
+ "loss": 0.6978,
+ "step": 1225
+ },
+ {
+ "epoch": 0.5796690307328605,
+ "grad_norm": 2.6133766174316406,
+ "learning_rate": 4.901966145287863e-06,
+ "loss": 0.5802,
+ "step": 1226
+ },
+ {
+ "epoch": 0.5801418439716312,
+ "grad_norm": 2.7345972061157227,
+ "learning_rate": 4.901793089934821e-06,
+ "loss": 0.6294,
+ "step": 1227
+ },
+ {
+ "epoch": 0.5806146572104018,
+ "grad_norm": 2.7545835971832275,
+ "learning_rate": 4.9016198850326555e-06,
+ "loss": 0.6085,
+ "step": 1228
+ },
+ {
+ "epoch": 0.5810874704491725,
+ "grad_norm": 2.6947758197784424,
+ "learning_rate": 4.90144653059215e-06,
+ "loss": 0.6025,
+ "step": 1229
+ },
+ {
+ "epoch": 0.5815602836879432,
+ "grad_norm": 2.692967414855957,
+ "learning_rate": 4.901273026624099e-06,
+ "loss": 0.5715,
+ "step": 1230
+ },
+ {
+ "epoch": 0.5820330969267139,
+ "grad_norm": 2.78347110748291,
+ "learning_rate": 4.901099373139307e-06,
+ "loss": 0.6063,
+ "step": 1231
+ },
+ {
+ "epoch": 0.5825059101654846,
+ "grad_norm": 2.346496343612671,
+ "learning_rate": 4.900925570148585e-06,
+ "loss": 0.5869,
+ "step": 1232
+ },
+ {
+ "epoch": 0.5829787234042553,
+ "grad_norm": 2.606639862060547,
+ "learning_rate": 4.900751617662755e-06,
+ "loss": 0.6197,
+ "step": 1233
+ },
+ {
+ "epoch": 0.583451536643026,
+ "grad_norm": 2.5825929641723633,
+ "learning_rate": 4.900577515692649e-06,
+ "loss": 0.6721,
+ "step": 1234
+ },
+ {
+ "epoch": 0.5839243498817966,
+ "grad_norm": 2.731349468231201,
+ "learning_rate": 4.900403264249107e-06,
+ "loss": 0.6273,
+ "step": 1235
+ },
+ {
+ "epoch": 0.5843971631205673,
+ "grad_norm": 3.2133874893188477,
+ "learning_rate": 4.90022886334298e-06,
+ "loss": 0.6231,
+ "step": 1236
+ },
+ {
+ "epoch": 0.584869976359338,
+ "grad_norm": 2.9213852882385254,
+ "learning_rate": 4.900054312985127e-06,
+ "loss": 0.6677,
+ "step": 1237
+ },
+ {
+ "epoch": 0.5853427895981087,
+ "grad_norm": 2.815425157546997,
+ "learning_rate": 4.899879613186414e-06,
+ "loss": 0.6405,
+ "step": 1238
+ },
+ {
+ "epoch": 0.5858156028368794,
+ "grad_norm": 2.730782985687256,
+ "learning_rate": 4.899704763957721e-06,
+ "loss": 0.6233,
+ "step": 1239
+ },
+ {
+ "epoch": 0.5862884160756501,
+ "grad_norm": 2.6432766914367676,
+ "learning_rate": 4.899529765309936e-06,
+ "loss": 0.6267,
+ "step": 1240
+ },
+ {
+ "epoch": 0.5867612293144208,
+ "grad_norm": 2.616215229034424,
+ "learning_rate": 4.899354617253953e-06,
+ "loss": 0.6268,
+ "step": 1241
+ },
+ {
+ "epoch": 0.5872340425531914,
+ "grad_norm": 2.7630255222320557,
+ "learning_rate": 4.899179319800679e-06,
+ "loss": 0.6348,
+ "step": 1242
+ },
+ {
+ "epoch": 0.5877068557919621,
+ "grad_norm": 2.785095453262329,
+ "learning_rate": 4.899003872961029e-06,
+ "loss": 0.5839,
+ "step": 1243
+ },
+ {
+ "epoch": 0.5881796690307328,
+ "grad_norm": 2.9050328731536865,
+ "learning_rate": 4.898828276745927e-06,
+ "loss": 0.651,
+ "step": 1244
+ },
+ {
+ "epoch": 0.5886524822695035,
+ "grad_norm": 2.958092212677002,
+ "learning_rate": 4.8986525311663065e-06,
+ "loss": 0.6395,
+ "step": 1245
+ },
+ {
+ "epoch": 0.5891252955082742,
+ "grad_norm": 2.952310800552368,
+ "learning_rate": 4.898476636233111e-06,
+ "loss": 0.6731,
+ "step": 1246
+ },
+ {
+ "epoch": 0.5895981087470449,
+ "grad_norm": 2.9876346588134766,
+ "learning_rate": 4.898300591957293e-06,
+ "loss": 0.7015,
+ "step": 1247
+ },
+ {
+ "epoch": 0.5900709219858156,
+ "grad_norm": 2.8941752910614014,
+ "learning_rate": 4.898124398349813e-06,
+ "loss": 0.6452,
+ "step": 1248
+ },
+ {
+ "epoch": 0.5905437352245863,
+ "grad_norm": 2.9809536933898926,
+ "learning_rate": 4.897948055421642e-06,
+ "loss": 0.5736,
+ "step": 1249
+ },
+ {
+ "epoch": 0.5910165484633569,
+ "grad_norm": 2.927046775817871,
+ "learning_rate": 4.897771563183761e-06,
+ "loss": 0.5918,
+ "step": 1250
+ },
+ {
+ "epoch": 0.5914893617021276,
+ "grad_norm": 2.865020275115967,
+ "learning_rate": 4.897594921647158e-06,
+ "loss": 0.6924,
+ "step": 1251
+ },
+ {
+ "epoch": 0.5919621749408983,
+ "grad_norm": 2.7406699657440186,
+ "learning_rate": 4.897418130822832e-06,
+ "loss": 0.509,
+ "step": 1252
+ },
+ {
+ "epoch": 0.592434988179669,
+ "grad_norm": 2.781606912612915,
+ "learning_rate": 4.897241190721791e-06,
+ "loss": 0.5555,
+ "step": 1253
+ },
+ {
+ "epoch": 0.5929078014184397,
+ "grad_norm": 2.79209303855896,
+ "learning_rate": 4.8970641013550535e-06,
+ "loss": 0.6722,
+ "step": 1254
+ },
+ {
+ "epoch": 0.5933806146572104,
+ "grad_norm": 3.0672268867492676,
+ "learning_rate": 4.896886862733645e-06,
+ "loss": 0.6366,
+ "step": 1255
+ },
+ {
+ "epoch": 0.5938534278959811,
+ "grad_norm": 2.7456953525543213,
+ "learning_rate": 4.896709474868602e-06,
+ "loss": 0.6246,
+ "step": 1256
+ },
+ {
+ "epoch": 0.5943262411347517,
+ "grad_norm": 3.6731202602386475,
+ "learning_rate": 4.896531937770968e-06,
+ "loss": 0.668,
+ "step": 1257
+ },
+ {
+ "epoch": 0.5947990543735224,
+ "grad_norm": 2.6056087017059326,
+ "learning_rate": 4.8963542514518e-06,
+ "loss": 0.5815,
+ "step": 1258
+ },
+ {
+ "epoch": 0.5952718676122931,
+ "grad_norm": 2.719698905944824,
+ "learning_rate": 4.89617641592216e-06,
+ "loss": 0.6058,
+ "step": 1259
+ },
+ {
+ "epoch": 0.5957446808510638,
+ "grad_norm": 2.625838279724121,
+ "learning_rate": 4.895998431193121e-06,
+ "loss": 0.6143,
+ "step": 1260
+ },
+ {
+ "epoch": 0.5962174940898345,
+ "grad_norm": 2.7166085243225098,
+ "learning_rate": 4.895820297275767e-06,
+ "loss": 0.5187,
+ "step": 1261
+ },
+ {
+ "epoch": 0.5966903073286052,
+ "grad_norm": 2.7544102668762207,
+ "learning_rate": 4.8956420141811875e-06,
+ "loss": 0.5928,
+ "step": 1262
+ },
+ {
+ "epoch": 0.5971631205673759,
+ "grad_norm": 2.6678333282470703,
+ "learning_rate": 4.895463581920484e-06,
+ "loss": 0.611,
+ "step": 1263
+ },
+ {
+ "epoch": 0.5976359338061465,
+ "grad_norm": 2.853384494781494,
+ "learning_rate": 4.895285000504768e-06,
+ "loss": 0.642,
+ "step": 1264
+ },
+ {
+ "epoch": 0.5981087470449172,
+ "grad_norm": 2.637852430343628,
+ "learning_rate": 4.895106269945158e-06,
+ "loss": 0.6308,
+ "step": 1265
+ },
+ {
+ "epoch": 0.5985815602836879,
+ "grad_norm": 2.9880387783050537,
+ "learning_rate": 4.8949273902527826e-06,
+ "loss": 0.5781,
+ "step": 1266
+ },
+ {
+ "epoch": 0.5990543735224586,
+ "grad_norm": 3.5984015464782715,
+ "learning_rate": 4.89474836143878e-06,
+ "loss": 0.5865,
+ "step": 1267
+ },
+ {
+ "epoch": 0.5995271867612293,
+ "grad_norm": 2.719855546951294,
+ "learning_rate": 4.8945691835142975e-06,
+ "loss": 0.6393,
+ "step": 1268
+ },
+ {
+ "epoch": 0.6,
+ "grad_norm": 2.7885141372680664,
+ "learning_rate": 4.894389856490492e-06,
+ "loss": 0.66,
+ "step": 1269
+ },
+ {
+ "epoch": 0.6004728132387707,
+ "grad_norm": 2.698819875717163,
+ "learning_rate": 4.894210380378529e-06,
+ "loss": 0.6144,
+ "step": 1270
+ },
+ {
+ "epoch": 0.6009456264775414,
+ "grad_norm": 2.278045654296875,
+ "learning_rate": 4.894030755189584e-06,
+ "loss": 0.5609,
+ "step": 1271
+ },
+ {
+ "epoch": 0.601418439716312,
+ "grad_norm": 2.8729357719421387,
+ "learning_rate": 4.893850980934841e-06,
+ "loss": 0.6715,
+ "step": 1272
+ },
+ {
+ "epoch": 0.6018912529550827,
+ "grad_norm": 2.8541221618652344,
+ "learning_rate": 4.893671057625495e-06,
+ "loss": 0.6787,
+ "step": 1273
+ },
+ {
+ "epoch": 0.6023640661938534,
+ "grad_norm": 2.4561476707458496,
+ "learning_rate": 4.893490985272748e-06,
+ "loss": 0.6331,
+ "step": 1274
+ },
+ {
+ "epoch": 0.6028368794326241,
+ "grad_norm": 2.565739154815674,
+ "learning_rate": 4.893310763887812e-06,
+ "loss": 0.587,
+ "step": 1275
+ },
+ {
+ "epoch": 0.6033096926713948,
+ "grad_norm": 2.384951591491699,
+ "learning_rate": 4.8931303934819095e-06,
+ "loss": 0.5358,
+ "step": 1276
+ },
+ {
+ "epoch": 0.6037825059101655,
+ "grad_norm": 2.380808115005493,
+ "learning_rate": 4.89294987406627e-06,
+ "loss": 0.5402,
+ "step": 1277
+ },
+ {
+ "epoch": 0.6042553191489362,
+ "grad_norm": 2.764815092086792,
+ "learning_rate": 4.892769205652136e-06,
+ "loss": 0.6103,
+ "step": 1278
+ },
+ {
+ "epoch": 0.6047281323877068,
+ "grad_norm": 2.463350296020508,
+ "learning_rate": 4.892588388250754e-06,
+ "loss": 0.5937,
+ "step": 1279
+ },
+ {
+ "epoch": 0.6052009456264775,
+ "grad_norm": 3.099689245223999,
+ "learning_rate": 4.8924074218733855e-06,
+ "loss": 0.6354,
+ "step": 1280
+ },
+ {
+ "epoch": 0.6056737588652482,
+ "grad_norm": 2.804450035095215,
+ "learning_rate": 4.892226306531297e-06,
+ "loss": 0.6595,
+ "step": 1281
+ },
+ {
+ "epoch": 0.6061465721040189,
+ "grad_norm": 3.1559767723083496,
+ "learning_rate": 4.892045042235765e-06,
+ "loss": 0.6664,
+ "step": 1282
+ },
+ {
+ "epoch": 0.6066193853427896,
+ "grad_norm": 2.844341993331909,
+ "learning_rate": 4.891863628998079e-06,
+ "loss": 0.7454,
+ "step": 1283
+ },
+ {
+ "epoch": 0.6070921985815603,
+ "grad_norm": 2.686602830886841,
+ "learning_rate": 4.891682066829532e-06,
+ "loss": 0.6755,
+ "step": 1284
+ },
+ {
+ "epoch": 0.607565011820331,
+ "grad_norm": 2.736457347869873,
+ "learning_rate": 4.8915003557414285e-06,
+ "loss": 0.6305,
+ "step": 1285
+ },
+ {
+ "epoch": 0.6080378250591016,
+ "grad_norm": 2.661362409591675,
+ "learning_rate": 4.891318495745086e-06,
+ "loss": 0.5958,
+ "step": 1286
+ },
+ {
+ "epoch": 0.6085106382978723,
+ "grad_norm": 2.707348108291626,
+ "learning_rate": 4.8911364868518255e-06,
+ "loss": 0.5824,
+ "step": 1287
+ },
+ {
+ "epoch": 0.608983451536643,
+ "grad_norm": 2.9798858165740967,
+ "learning_rate": 4.890954329072981e-06,
+ "loss": 0.5981,
+ "step": 1288
+ },
+ {
+ "epoch": 0.6094562647754137,
+ "grad_norm": 2.6285455226898193,
+ "learning_rate": 4.890772022419895e-06,
+ "loss": 0.6194,
+ "step": 1289
+ },
+ {
+ "epoch": 0.6099290780141844,
+ "grad_norm": 2.9254322052001953,
+ "learning_rate": 4.890589566903917e-06,
+ "loss": 0.6002,
+ "step": 1290
+ },
+ {
+ "epoch": 0.6104018912529551,
+ "grad_norm": 2.6458325386047363,
+ "learning_rate": 4.89040696253641e-06,
+ "loss": 0.5457,
+ "step": 1291
+ },
+ {
+ "epoch": 0.6108747044917258,
+ "grad_norm": 2.508242607116699,
+ "learning_rate": 4.890224209328743e-06,
+ "loss": 0.6168,
+ "step": 1292
+ },
+ {
+ "epoch": 0.6113475177304964,
+ "grad_norm": 3.034785509109497,
+ "learning_rate": 4.890041307292296e-06,
+ "loss": 0.664,
+ "step": 1293
+ },
+ {
+ "epoch": 0.6118203309692671,
+ "grad_norm": 3.52469539642334,
+ "learning_rate": 4.889858256438455e-06,
+ "loss": 0.7301,
+ "step": 1294
+ },
+ {
+ "epoch": 0.6122931442080378,
+ "grad_norm": 2.9145348072052,
+ "learning_rate": 4.889675056778622e-06,
+ "loss": 0.6494,
+ "step": 1295
+ },
+ {
+ "epoch": 0.6127659574468085,
+ "grad_norm": 2.831829071044922,
+ "learning_rate": 4.8894917083242e-06,
+ "loss": 0.6064,
+ "step": 1296
+ },
+ {
+ "epoch": 0.6132387706855792,
+ "grad_norm": 2.6883130073547363,
+ "learning_rate": 4.889308211086608e-06,
+ "loss": 0.5642,
+ "step": 1297
+ },
+ {
+ "epoch": 0.6137115839243499,
+ "grad_norm": 3.0605485439300537,
+ "learning_rate": 4.889124565077269e-06,
+ "loss": 0.6695,
+ "step": 1298
+ },
+ {
+ "epoch": 0.6141843971631206,
+ "grad_norm": 3.44062876701355,
+ "learning_rate": 4.88894077030762e-06,
+ "loss": 0.6415,
+ "step": 1299
+ },
+ {
+ "epoch": 0.6146572104018913,
+ "grad_norm": 2.5970818996429443,
+ "learning_rate": 4.888756826789105e-06,
+ "loss": 0.6518,
+ "step": 1300
+ },
+ {
+ "epoch": 0.6151300236406619,
+ "grad_norm": 4.2233567237854,
+ "learning_rate": 4.8885727345331755e-06,
+ "loss": 0.6555,
+ "step": 1301
+ },
+ {
+ "epoch": 0.6156028368794326,
+ "grad_norm": 2.645385503768921,
+ "learning_rate": 4.888388493551297e-06,
+ "loss": 0.6762,
+ "step": 1302
+ },
+ {
+ "epoch": 0.6160756501182033,
+ "grad_norm": 2.907954454421997,
+ "learning_rate": 4.8882041038549385e-06,
+ "loss": 0.6526,
+ "step": 1303
+ },
+ {
+ "epoch": 0.616548463356974,
+ "grad_norm": 2.482771873474121,
+ "learning_rate": 4.888019565455583e-06,
+ "loss": 0.628,
+ "step": 1304
+ },
+ {
+ "epoch": 0.6170212765957447,
+ "grad_norm": 2.7165915966033936,
+ "learning_rate": 4.88783487836472e-06,
+ "loss": 0.5743,
+ "step": 1305
+ },
+ {
+ "epoch": 0.6174940898345154,
+ "grad_norm": 3.095627546310425,
+ "learning_rate": 4.88765004259385e-06,
+ "loss": 0.627,
+ "step": 1306
+ },
+ {
+ "epoch": 0.6179669030732861,
+ "grad_norm": 2.5018465518951416,
+ "learning_rate": 4.8874650581544805e-06,
+ "loss": 0.5215,
+ "step": 1307
+ },
+ {
+ "epoch": 0.6184397163120567,
+ "grad_norm": 3.094337224960327,
+ "learning_rate": 4.8872799250581316e-06,
+ "loss": 0.6979,
+ "step": 1308
+ },
+ {
+ "epoch": 0.6189125295508274,
+ "grad_norm": 3.1002209186553955,
+ "learning_rate": 4.887094643316329e-06,
+ "loss": 0.6565,
+ "step": 1309
+ },
+ {
+ "epoch": 0.6193853427895981,
+ "grad_norm": 2.551431894302368,
+ "learning_rate": 4.88690921294061e-06,
+ "loss": 0.5748,
+ "step": 1310
+ },
+ {
+ "epoch": 0.6198581560283688,
+ "grad_norm": 2.8282904624938965,
+ "learning_rate": 4.886723633942521e-06,
+ "loss": 0.676,
+ "step": 1311
+ },
+ {
+ "epoch": 0.6203309692671395,
+ "grad_norm": 2.8887810707092285,
+ "learning_rate": 4.886537906333617e-06,
+ "loss": 0.5971,
+ "step": 1312
+ },
+ {
+ "epoch": 0.6208037825059102,
+ "grad_norm": 2.9989118576049805,
+ "learning_rate": 4.886352030125462e-06,
+ "loss": 0.6341,
+ "step": 1313
+ },
+ {
+ "epoch": 0.6212765957446809,
+ "grad_norm": 2.8042776584625244,
+ "learning_rate": 4.886166005329629e-06,
+ "loss": 0.6578,
+ "step": 1314
+ },
+ {
+ "epoch": 0.6217494089834515,
+ "grad_norm": 2.4980967044830322,
+ "learning_rate": 4.8859798319577026e-06,
+ "loss": 0.6711,
+ "step": 1315
+ },
+ {
+ "epoch": 0.6222222222222222,
+ "grad_norm": 2.762369155883789,
+ "learning_rate": 4.885793510021274e-06,
+ "loss": 0.5747,
+ "step": 1316
+ },
+ {
+ "epoch": 0.6226950354609929,
+ "grad_norm": 3.136327028274536,
+ "learning_rate": 4.885607039531945e-06,
+ "loss": 0.7544,
+ "step": 1317
+ },
+ {
+ "epoch": 0.6231678486997636,
+ "grad_norm": 2.8736963272094727,
+ "learning_rate": 4.885420420501327e-06,
+ "loss": 0.6603,
+ "step": 1318
+ },
+ {
+ "epoch": 0.6236406619385343,
+ "grad_norm": 2.766237497329712,
+ "learning_rate": 4.885233652941039e-06,
+ "loss": 0.581,
+ "step": 1319
+ },
+ {
+ "epoch": 0.624113475177305,
+ "grad_norm": 2.4740939140319824,
+ "learning_rate": 4.88504673686271e-06,
+ "loss": 0.6335,
+ "step": 1320
+ },
+ {
+ "epoch": 0.6245862884160757,
+ "grad_norm": 3.324795961380005,
+ "learning_rate": 4.884859672277978e-06,
+ "loss": 0.6019,
+ "step": 1321
+ },
+ {
+ "epoch": 0.6250591016548463,
+ "grad_norm": 3.521327257156372,
+ "learning_rate": 4.884672459198493e-06,
+ "loss": 0.6104,
+ "step": 1322
+ },
+ {
+ "epoch": 0.625531914893617,
+ "grad_norm": 2.7728071212768555,
+ "learning_rate": 4.884485097635909e-06,
+ "loss": 0.6714,
+ "step": 1323
+ },
+ {
+ "epoch": 0.6260047281323877,
+ "grad_norm": 3.0738155841827393,
+ "learning_rate": 4.884297587601895e-06,
+ "loss": 0.604,
+ "step": 1324
+ },
+ {
+ "epoch": 0.6264775413711584,
+ "grad_norm": 2.719240427017212,
+ "learning_rate": 4.884109929108124e-06,
+ "loss": 0.6795,
+ "step": 1325
+ },
+ {
+ "epoch": 0.6269503546099291,
+ "grad_norm": 2.4108200073242188,
+ "learning_rate": 4.883922122166282e-06,
+ "loss": 0.5846,
+ "step": 1326
+ },
+ {
+ "epoch": 0.6274231678486998,
+ "grad_norm": 2.393899917602539,
+ "learning_rate": 4.883734166788063e-06,
+ "loss": 0.6188,
+ "step": 1327
+ },
+ {
+ "epoch": 0.6278959810874705,
+ "grad_norm": 4.555255889892578,
+ "learning_rate": 4.883546062985169e-06,
+ "loss": 0.5962,
+ "step": 1328
+ },
+ {
+ "epoch": 0.6283687943262412,
+ "grad_norm": 2.571075439453125,
+ "learning_rate": 4.883357810769315e-06,
+ "loss": 0.6165,
+ "step": 1329
+ },
+ {
+ "epoch": 0.6288416075650118,
+ "grad_norm": 2.553115129470825,
+ "learning_rate": 4.8831694101522185e-06,
+ "loss": 0.6787,
+ "step": 1330
+ },
+ {
+ "epoch": 0.6293144208037825,
+ "grad_norm": 3.2564642429351807,
+ "learning_rate": 4.882980861145614e-06,
+ "loss": 0.659,
+ "step": 1331
+ },
+ {
+ "epoch": 0.6297872340425532,
+ "grad_norm": 2.535216808319092,
+ "learning_rate": 4.882792163761241e-06,
+ "loss": 0.6176,
+ "step": 1332
+ },
+ {
+ "epoch": 0.6302600472813239,
+ "grad_norm": 3.097921848297119,
+ "learning_rate": 4.882603318010847e-06,
+ "loss": 0.6822,
+ "step": 1333
+ },
+ {
+ "epoch": 0.6307328605200946,
+ "grad_norm": 2.8135175704956055,
+ "learning_rate": 4.882414323906192e-06,
+ "loss": 0.6782,
+ "step": 1334
+ },
+ {
+ "epoch": 0.6312056737588653,
+ "grad_norm": 2.724634885787964,
+ "learning_rate": 4.882225181459044e-06,
+ "loss": 0.6545,
+ "step": 1335
+ },
+ {
+ "epoch": 0.631678486997636,
+ "grad_norm": 2.9585227966308594,
+ "learning_rate": 4.882035890681179e-06,
+ "loss": 0.6218,
+ "step": 1336
+ },
+ {
+ "epoch": 0.6321513002364066,
+ "grad_norm": 2.6952011585235596,
+ "learning_rate": 4.881846451584385e-06,
+ "loss": 0.6,
+ "step": 1337
+ },
+ {
+ "epoch": 0.6326241134751773,
+ "grad_norm": 3.1400704383850098,
+ "learning_rate": 4.881656864180455e-06,
+ "loss": 0.6687,
+ "step": 1338
+ },
+ {
+ "epoch": 0.633096926713948,
+ "grad_norm": 2.8382487297058105,
+ "learning_rate": 4.881467128481197e-06,
+ "loss": 0.574,
+ "step": 1339
+ },
+ {
+ "epoch": 0.6335697399527187,
+ "grad_norm": 2.8520095348358154,
+ "learning_rate": 4.881277244498422e-06,
+ "loss": 0.6582,
+ "step": 1340
+ },
+ {
+ "epoch": 0.6340425531914894,
+ "grad_norm": 2.703498363494873,
+ "learning_rate": 4.881087212243956e-06,
+ "loss": 0.7224,
+ "step": 1341
+ },
+ {
+ "epoch": 0.6345153664302601,
+ "grad_norm": 3.697205066680908,
+ "learning_rate": 4.880897031729629e-06,
+ "loss": 0.6582,
+ "step": 1342
+ },
+ {
+ "epoch": 0.6349881796690308,
+ "grad_norm": 2.7625808715820312,
+ "learning_rate": 4.880706702967284e-06,
+ "loss": 0.574,
+ "step": 1343
+ },
+ {
+ "epoch": 0.6354609929078014,
+ "grad_norm": 2.949984073638916,
+ "learning_rate": 4.880516225968771e-06,
+ "loss": 0.66,
+ "step": 1344
+ },
+ {
+ "epoch": 0.6359338061465721,
+ "grad_norm": 2.548269748687744,
+ "learning_rate": 4.8803256007459525e-06,
+ "loss": 0.642,
+ "step": 1345
+ },
+ {
+ "epoch": 0.6364066193853428,
+ "grad_norm": 2.5102174282073975,
+ "learning_rate": 4.8801348273106945e-06,
+ "loss": 0.6238,
+ "step": 1346
+ },
+ {
+ "epoch": 0.6368794326241135,
+ "grad_norm": 2.9847946166992188,
+ "learning_rate": 4.8799439056748786e-06,
+ "loss": 0.5416,
+ "step": 1347
+ },
+ {
+ "epoch": 0.6373522458628842,
+ "grad_norm": 2.8711049556732178,
+ "learning_rate": 4.879752835850391e-06,
+ "loss": 0.6427,
+ "step": 1348
+ },
+ {
+ "epoch": 0.6378250591016549,
+ "grad_norm": 2.7901716232299805,
+ "learning_rate": 4.879561617849129e-06,
+ "loss": 0.6026,
+ "step": 1349
+ },
+ {
+ "epoch": 0.6382978723404256,
+ "grad_norm": 2.659778356552124,
+ "learning_rate": 4.879370251682999e-06,
+ "loss": 0.6623,
+ "step": 1350
+ },
+ {
+ "epoch": 0.6387706855791963,
+ "grad_norm": 3.224386692047119,
+ "learning_rate": 4.879178737363917e-06,
+ "loss": 0.6485,
+ "step": 1351
+ },
+ {
+ "epoch": 0.6392434988179669,
+ "grad_norm": 2.6385605335235596,
+ "learning_rate": 4.8789870749038076e-06,
+ "loss": 0.5866,
+ "step": 1352
+ },
+ {
+ "epoch": 0.6397163120567376,
+ "grad_norm": 2.807713270187378,
+ "learning_rate": 4.8787952643146045e-06,
+ "loss": 0.6537,
+ "step": 1353
+ },
+ {
+ "epoch": 0.6401891252955083,
+ "grad_norm": 2.5689280033111572,
+ "learning_rate": 4.878603305608251e-06,
+ "loss": 0.6216,
+ "step": 1354
+ },
+ {
+ "epoch": 0.640661938534279,
+ "grad_norm": 2.7347843647003174,
+ "learning_rate": 4.8784111987967e-06,
+ "loss": 0.6318,
+ "step": 1355
+ },
+ {
+ "epoch": 0.6411347517730497,
+ "grad_norm": 2.5210378170013428,
+ "learning_rate": 4.878218943891911e-06,
+ "loss": 0.5472,
+ "step": 1356
+ },
+ {
+ "epoch": 0.6416075650118204,
+ "grad_norm": 2.866785764694214,
+ "learning_rate": 4.878026540905858e-06,
+ "loss": 0.7108,
+ "step": 1357
+ },
+ {
+ "epoch": 0.642080378250591,
+ "grad_norm": 2.923314332962036,
+ "learning_rate": 4.877833989850519e-06,
+ "loss": 0.5557,
+ "step": 1358
+ },
+ {
+ "epoch": 0.6425531914893617,
+ "grad_norm": 2.925463914871216,
+ "learning_rate": 4.8776412907378845e-06,
+ "loss": 0.6382,
+ "step": 1359
+ },
+ {
+ "epoch": 0.6430260047281324,
+ "grad_norm": 2.909644365310669,
+ "learning_rate": 4.877448443579952e-06,
+ "loss": 0.5603,
+ "step": 1360
+ },
+ {
+ "epoch": 0.6434988179669031,
+ "grad_norm": 3.501148223876953,
+ "learning_rate": 4.8772554483887306e-06,
+ "loss": 0.6722,
+ "step": 1361
+ },
+ {
+ "epoch": 0.6439716312056738,
+ "grad_norm": 2.823765516281128,
+ "learning_rate": 4.877062305176235e-06,
+ "loss": 0.6408,
+ "step": 1362
+ },
+ {
+ "epoch": 0.6444444444444445,
+ "grad_norm": 2.9807584285736084,
+ "learning_rate": 4.8768690139544935e-06,
+ "loss": 0.5984,
+ "step": 1363
+ },
+ {
+ "epoch": 0.6449172576832152,
+ "grad_norm": 2.8411378860473633,
+ "learning_rate": 4.8766755747355405e-06,
+ "loss": 0.6231,
+ "step": 1364
+ },
+ {
+ "epoch": 0.6453900709219859,
+ "grad_norm": 3.158952236175537,
+ "learning_rate": 4.8764819875314215e-06,
+ "loss": 0.6441,
+ "step": 1365
+ },
+ {
+ "epoch": 0.6458628841607565,
+ "grad_norm": 2.9614369869232178,
+ "learning_rate": 4.876288252354189e-06,
+ "loss": 0.6308,
+ "step": 1366
+ },
+ {
+ "epoch": 0.6463356973995272,
+ "grad_norm": 3.073805570602417,
+ "learning_rate": 4.876094369215907e-06,
+ "loss": 0.6046,
+ "step": 1367
+ },
+ {
+ "epoch": 0.6468085106382979,
+ "grad_norm": 2.719189405441284,
+ "learning_rate": 4.875900338128648e-06,
+ "loss": 0.6082,
+ "step": 1368
+ },
+ {
+ "epoch": 0.6472813238770686,
+ "grad_norm": 2.676726818084717,
+ "learning_rate": 4.8757061591044914e-06,
+ "loss": 0.6344,
+ "step": 1369
+ },
+ {
+ "epoch": 0.6477541371158393,
+ "grad_norm": 2.955256938934326,
+ "learning_rate": 4.87551183215553e-06,
+ "loss": 0.6506,
+ "step": 1370
+ },
+ {
+ "epoch": 0.64822695035461,
+ "grad_norm": 2.5672218799591064,
+ "learning_rate": 4.875317357293864e-06,
+ "loss": 0.5284,
+ "step": 1371
+ },
+ {
+ "epoch": 0.6486997635933807,
+ "grad_norm": 2.5860238075256348,
+ "learning_rate": 4.875122734531602e-06,
+ "loss": 0.667,
+ "step": 1372
+ },
+ {
+ "epoch": 0.6491725768321513,
+ "grad_norm": 3.1037003993988037,
+ "learning_rate": 4.8749279638808605e-06,
+ "loss": 0.6902,
+ "step": 1373
+ },
+ {
+ "epoch": 0.649645390070922,
+ "grad_norm": 2.7715282440185547,
+ "learning_rate": 4.874733045353769e-06,
+ "loss": 0.6291,
+ "step": 1374
+ },
+ {
+ "epoch": 0.6501182033096927,
+ "grad_norm": 2.527071475982666,
+ "learning_rate": 4.874537978962463e-06,
+ "loss": 0.5565,
+ "step": 1375
+ },
+ {
+ "epoch": 0.6505910165484634,
+ "grad_norm": 2.722092628479004,
+ "learning_rate": 4.874342764719091e-06,
+ "loss": 0.5724,
+ "step": 1376
+ },
+ {
+ "epoch": 0.6510638297872341,
+ "grad_norm": 2.6342411041259766,
+ "learning_rate": 4.874147402635805e-06,
+ "loss": 0.6308,
+ "step": 1377
+ },
+ {
+ "epoch": 0.6515366430260048,
+ "grad_norm": 2.3850719928741455,
+ "learning_rate": 4.8739518927247695e-06,
+ "loss": 0.5692,
+ "step": 1378
+ },
+ {
+ "epoch": 0.6520094562647755,
+ "grad_norm": 2.9787259101867676,
+ "learning_rate": 4.873756234998161e-06,
+ "loss": 0.6953,
+ "step": 1379
+ },
+ {
+ "epoch": 0.6524822695035462,
+ "grad_norm": 2.634141683578491,
+ "learning_rate": 4.873560429468159e-06,
+ "loss": 0.6077,
+ "step": 1380
+ },
+ {
+ "epoch": 0.6529550827423168,
+ "grad_norm": 2.803046941757202,
+ "learning_rate": 4.873364476146958e-06,
+ "loss": 0.6657,
+ "step": 1381
+ },
+ {
+ "epoch": 0.6534278959810875,
+ "grad_norm": 2.762827157974243,
+ "learning_rate": 4.8731683750467574e-06,
+ "loss": 0.6061,
+ "step": 1382
+ },
+ {
+ "epoch": 0.6539007092198581,
+ "grad_norm": 2.6654391288757324,
+ "learning_rate": 4.872972126179768e-06,
+ "loss": 0.6387,
+ "step": 1383
+ },
+ {
+ "epoch": 0.6543735224586288,
+ "grad_norm": 2.4363625049591064,
+ "learning_rate": 4.872775729558209e-06,
+ "loss": 0.5623,
+ "step": 1384
+ },
+ {
+ "epoch": 0.6548463356973995,
+ "grad_norm": 2.528959035873413,
+ "learning_rate": 4.87257918519431e-06,
+ "loss": 0.5609,
+ "step": 1385
+ },
+ {
+ "epoch": 0.6553191489361702,
+ "grad_norm": 2.718383312225342,
+ "learning_rate": 4.872382493100309e-06,
+ "loss": 0.5575,
+ "step": 1386
+ },
+ {
+ "epoch": 0.6557919621749408,
+ "grad_norm": 2.660841226577759,
+ "learning_rate": 4.872185653288453e-06,
+ "loss": 0.6106,
+ "step": 1387
+ },
+ {
+ "epoch": 0.6562647754137115,
+ "grad_norm": 2.508753538131714,
+ "learning_rate": 4.871988665770997e-06,
+ "loss": 0.5705,
+ "step": 1388
+ },
+ {
+ "epoch": 0.6567375886524822,
+ "grad_norm": 2.5134334564208984,
+ "learning_rate": 4.871791530560208e-06,
+ "loss": 0.5592,
+ "step": 1389
+ },
+ {
+ "epoch": 0.6572104018912529,
+ "grad_norm": 2.7475597858428955,
+ "learning_rate": 4.871594247668361e-06,
+ "loss": 0.6277,
+ "step": 1390
+ },
+ {
+ "epoch": 0.6576832151300236,
+ "grad_norm": 2.793616533279419,
+ "learning_rate": 4.871396817107739e-06,
+ "loss": 0.595,
+ "step": 1391
+ },
+ {
+ "epoch": 0.6581560283687943,
+ "grad_norm": 2.8285086154937744,
+ "learning_rate": 4.871199238890635e-06,
+ "loss": 0.6094,
+ "step": 1392
+ },
+ {
+ "epoch": 0.658628841607565,
+ "grad_norm": 2.74124813079834,
+ "learning_rate": 4.871001513029352e-06,
+ "loss": 0.6296,
+ "step": 1393
+ },
+ {
+ "epoch": 0.6591016548463356,
+ "grad_norm": 2.761237621307373,
+ "learning_rate": 4.870803639536202e-06,
+ "loss": 0.5702,
+ "step": 1394
+ },
+ {
+ "epoch": 0.6595744680851063,
+ "grad_norm": 2.761038064956665,
+ "learning_rate": 4.870605618423504e-06,
+ "loss": 0.6195,
+ "step": 1395
+ },
+ {
+ "epoch": 0.660047281323877,
+ "grad_norm": 2.8812482357025146,
+ "learning_rate": 4.870407449703589e-06,
+ "loss": 0.616,
+ "step": 1396
+ },
+ {
+ "epoch": 0.6605200945626477,
+ "grad_norm": 2.9966578483581543,
+ "learning_rate": 4.870209133388797e-06,
+ "loss": 0.6547,
+ "step": 1397
+ },
+ {
+ "epoch": 0.6609929078014184,
+ "grad_norm": 2.7969017028808594,
+ "learning_rate": 4.870010669491474e-06,
+ "loss": 0.5762,
+ "step": 1398
+ },
+ {
+ "epoch": 0.6614657210401891,
+ "grad_norm": 2.557783842086792,
+ "learning_rate": 4.86981205802398e-06,
+ "loss": 0.6184,
+ "step": 1399
+ },
+ {
+ "epoch": 0.6619385342789598,
+ "grad_norm": 2.5393927097320557,
+ "learning_rate": 4.86961329899868e-06,
+ "loss": 0.5953,
+ "step": 1400
+ },
+ {
+ "epoch": 0.6624113475177305,
+ "grad_norm": 2.7745981216430664,
+ "learning_rate": 4.86941439242795e-06,
+ "loss": 0.5967,
+ "step": 1401
+ },
+ {
+ "epoch": 0.6628841607565011,
+ "grad_norm": 2.650381326675415,
+ "learning_rate": 4.869215338324176e-06,
+ "loss": 0.5667,
+ "step": 1402
+ },
+ {
+ "epoch": 0.6633569739952718,
+ "grad_norm": 2.583169937133789,
+ "learning_rate": 4.869016136699751e-06,
+ "loss": 0.549,
+ "step": 1403
+ },
+ {
+ "epoch": 0.6638297872340425,
+ "grad_norm": 2.984978437423706,
+ "learning_rate": 4.868816787567079e-06,
+ "loss": 0.5931,
+ "step": 1404
+ },
+ {
+ "epoch": 0.6643026004728132,
+ "grad_norm": 3.1947181224823,
+ "learning_rate": 4.868617290938573e-06,
+ "loss": 0.5473,
+ "step": 1405
+ },
+ {
+ "epoch": 0.6647754137115839,
+ "grad_norm": 2.562927007675171,
+ "learning_rate": 4.868417646826654e-06,
+ "loss": 0.6878,
+ "step": 1406
+ },
+ {
+ "epoch": 0.6652482269503546,
+ "grad_norm": 2.8741261959075928,
+ "learning_rate": 4.868217855243754e-06,
+ "loss": 0.6312,
+ "step": 1407
+ },
+ {
+ "epoch": 0.6657210401891253,
+ "grad_norm": 2.9834797382354736,
+ "learning_rate": 4.868017916202312e-06,
+ "loss": 0.5624,
+ "step": 1408
+ },
+ {
+ "epoch": 0.6661938534278959,
+ "grad_norm": 2.6935982704162598,
+ "learning_rate": 4.8678178297147785e-06,
+ "loss": 0.5857,
+ "step": 1409
+ },
+ {
+ "epoch": 0.6666666666666666,
+ "grad_norm": 2.8200576305389404,
+ "learning_rate": 4.86761759579361e-06,
+ "loss": 0.6153,
+ "step": 1410
+ },
+ {
+ "epoch": 0.6671394799054373,
+ "grad_norm": 2.831425189971924,
+ "learning_rate": 4.867417214451276e-06,
+ "loss": 0.6495,
+ "step": 1411
+ },
+ {
+ "epoch": 0.667612293144208,
+ "grad_norm": 2.733565092086792,
+ "learning_rate": 4.867216685700253e-06,
+ "loss": 0.6036,
+ "step": 1412
+ },
+ {
+ "epoch": 0.6680851063829787,
+ "grad_norm": 3.0609400272369385,
+ "learning_rate": 4.867016009553027e-06,
+ "loss": 0.6773,
+ "step": 1413
+ },
+ {
+ "epoch": 0.6685579196217494,
+ "grad_norm": 2.665452241897583,
+ "learning_rate": 4.866815186022093e-06,
+ "loss": 0.6256,
+ "step": 1414
+ },
+ {
+ "epoch": 0.6690307328605201,
+ "grad_norm": 2.9480721950531006,
+ "learning_rate": 4.866614215119956e-06,
+ "loss": 0.535,
+ "step": 1415
+ },
+ {
+ "epoch": 0.6695035460992907,
+ "grad_norm": 2.5514180660247803,
+ "learning_rate": 4.866413096859128e-06,
+ "loss": 0.6588,
+ "step": 1416
+ },
+ {
+ "epoch": 0.6699763593380614,
+ "grad_norm": 3.3442373275756836,
+ "learning_rate": 4.866211831252134e-06,
+ "loss": 0.5754,
+ "step": 1417
+ },
+ {
+ "epoch": 0.6704491725768321,
+ "grad_norm": 2.521467685699463,
+ "learning_rate": 4.866010418311504e-06,
+ "loss": 0.5546,
+ "step": 1418
+ },
+ {
+ "epoch": 0.6709219858156028,
+ "grad_norm": 2.930706262588501,
+ "learning_rate": 4.865808858049781e-06,
+ "loss": 0.589,
+ "step": 1419
+ },
+ {
+ "epoch": 0.6713947990543735,
+ "grad_norm": 2.6298375129699707,
+ "learning_rate": 4.865607150479513e-06,
+ "loss": 0.5915,
+ "step": 1420
+ },
+ {
+ "epoch": 0.6718676122931442,
+ "grad_norm": 2.9554293155670166,
+ "learning_rate": 4.8654052956132615e-06,
+ "loss": 0.6654,
+ "step": 1421
+ },
+ {
+ "epoch": 0.6723404255319149,
+ "grad_norm": 3.2706902027130127,
+ "learning_rate": 4.865203293463593e-06,
+ "loss": 0.7115,
+ "step": 1422
+ },
+ {
+ "epoch": 0.6728132387706856,
+ "grad_norm": 3.041539430618286,
+ "learning_rate": 4.865001144043088e-06,
+ "loss": 0.5818,
+ "step": 1423
+ },
+ {
+ "epoch": 0.6732860520094562,
+ "grad_norm": 3.1314544677734375,
+ "learning_rate": 4.864798847364331e-06,
+ "loss": 0.5822,
+ "step": 1424
+ },
+ {
+ "epoch": 0.6737588652482269,
+ "grad_norm": 2.5301461219787598,
+ "learning_rate": 4.86459640343992e-06,
+ "loss": 0.5525,
+ "step": 1425
+ },
+ {
+ "epoch": 0.6742316784869976,
+ "grad_norm": 2.809295892715454,
+ "learning_rate": 4.864393812282458e-06,
+ "loss": 0.6768,
+ "step": 1426
+ },
+ {
+ "epoch": 0.6747044917257683,
+ "grad_norm": 2.794664144515991,
+ "learning_rate": 4.864191073904562e-06,
+ "loss": 0.5793,
+ "step": 1427
+ },
+ {
+ "epoch": 0.675177304964539,
+ "grad_norm": 2.7771105766296387,
+ "learning_rate": 4.863988188318854e-06,
+ "loss": 0.6453,
+ "step": 1428
+ },
+ {
+ "epoch": 0.6756501182033097,
+ "grad_norm": 2.6431946754455566,
+ "learning_rate": 4.863785155537967e-06,
+ "loss": 0.5877,
+ "step": 1429
+ },
+ {
+ "epoch": 0.6761229314420804,
+ "grad_norm": 2.951353073120117,
+ "learning_rate": 4.863581975574544e-06,
+ "loss": 0.6793,
+ "step": 1430
+ },
+ {
+ "epoch": 0.676595744680851,
+ "grad_norm": 3.1336071491241455,
+ "learning_rate": 4.863378648441235e-06,
+ "loss": 0.6695,
+ "step": 1431
+ },
+ {
+ "epoch": 0.6770685579196217,
+ "grad_norm": 2.735982656478882,
+ "learning_rate": 4.8631751741507e-06,
+ "loss": 0.5239,
+ "step": 1432
+ },
+ {
+ "epoch": 0.6775413711583924,
+ "grad_norm": 2.7085206508636475,
+ "learning_rate": 4.862971552715611e-06,
+ "loss": 0.6837,
+ "step": 1433
+ },
+ {
+ "epoch": 0.6780141843971631,
+ "grad_norm": 3.136528730392456,
+ "learning_rate": 4.8627677841486436e-06,
+ "loss": 0.683,
+ "step": 1434
+ },
+ {
+ "epoch": 0.6784869976359338,
+ "grad_norm": 2.7879369258880615,
+ "learning_rate": 4.862563868462486e-06,
+ "loss": 0.608,
+ "step": 1435
+ },
+ {
+ "epoch": 0.6789598108747045,
+ "grad_norm": 2.7937729358673096,
+ "learning_rate": 4.862359805669837e-06,
+ "loss": 0.6131,
+ "step": 1436
+ },
+ {
+ "epoch": 0.6794326241134752,
+ "grad_norm": 2.5988364219665527,
+ "learning_rate": 4.862155595783401e-06,
+ "loss": 0.6303,
+ "step": 1437
+ },
+ {
+ "epoch": 0.6799054373522458,
+ "grad_norm": 3.251070499420166,
+ "learning_rate": 4.861951238815894e-06,
+ "loss": 0.7246,
+ "step": 1438
+ },
+ {
+ "epoch": 0.6803782505910165,
+ "grad_norm": 2.646759271621704,
+ "learning_rate": 4.861746734780039e-06,
+ "loss": 0.6313,
+ "step": 1439
+ },
+ {
+ "epoch": 0.6808510638297872,
+ "grad_norm": 2.773866891860962,
+ "learning_rate": 4.861542083688573e-06,
+ "loss": 0.6463,
+ "step": 1440
+ },
+ {
+ "epoch": 0.6813238770685579,
+ "grad_norm": 2.759965658187866,
+ "learning_rate": 4.861337285554235e-06,
+ "loss": 0.5428,
+ "step": 1441
+ },
+ {
+ "epoch": 0.6817966903073286,
+ "grad_norm": 3.3250818252563477,
+ "learning_rate": 4.861132340389779e-06,
+ "loss": 0.6522,
+ "step": 1442
+ },
+ {
+ "epoch": 0.6822695035460993,
+ "grad_norm": 2.661797523498535,
+ "learning_rate": 4.860927248207965e-06,
+ "loss": 0.5871,
+ "step": 1443
+ },
+ {
+ "epoch": 0.68274231678487,
+ "grad_norm": 2.706289052963257,
+ "learning_rate": 4.860722009021563e-06,
+ "loss": 0.6651,
+ "step": 1444
+ },
+ {
+ "epoch": 0.6832151300236406,
+ "grad_norm": 2.8459298610687256,
+ "learning_rate": 4.860516622843354e-06,
+ "loss": 0.5827,
+ "step": 1445
+ },
+ {
+ "epoch": 0.6836879432624113,
+ "grad_norm": 3.1041831970214844,
+ "learning_rate": 4.860311089686125e-06,
+ "loss": 0.6727,
+ "step": 1446
+ },
+ {
+ "epoch": 0.684160756501182,
+ "grad_norm": 2.9382801055908203,
+ "learning_rate": 4.8601054095626746e-06,
+ "loss": 0.6002,
+ "step": 1447
+ },
+ {
+ "epoch": 0.6846335697399527,
+ "grad_norm": 2.782475471496582,
+ "learning_rate": 4.859899582485808e-06,
+ "loss": 0.6951,
+ "step": 1448
+ },
+ {
+ "epoch": 0.6851063829787234,
+ "grad_norm": 3.313894510269165,
+ "learning_rate": 4.859693608468343e-06,
+ "loss": 0.6363,
+ "step": 1449
+ },
+ {
+ "epoch": 0.6855791962174941,
+ "grad_norm": 3.1639695167541504,
+ "learning_rate": 4.8594874875231045e-06,
+ "loss": 0.7002,
+ "step": 1450
+ },
+ {
+ "epoch": 0.6860520094562648,
+ "grad_norm": 2.6762218475341797,
+ "learning_rate": 4.859281219662926e-06,
+ "loss": 0.6246,
+ "step": 1451
+ },
+ {
+ "epoch": 0.6865248226950355,
+ "grad_norm": 2.8368663787841797,
+ "learning_rate": 4.85907480490065e-06,
+ "loss": 0.5906,
+ "step": 1452
+ },
+ {
+ "epoch": 0.6869976359338061,
+ "grad_norm": 2.887373208999634,
+ "learning_rate": 4.858868243249131e-06,
+ "loss": 0.5931,
+ "step": 1453
+ },
+ {
+ "epoch": 0.6874704491725768,
+ "grad_norm": 2.8115322589874268,
+ "learning_rate": 4.858661534721229e-06,
+ "loss": 0.6337,
+ "step": 1454
+ },
+ {
+ "epoch": 0.6879432624113475,
+ "grad_norm": 2.8470499515533447,
+ "learning_rate": 4.8584546793298174e-06,
+ "loss": 0.632,
+ "step": 1455
+ },
+ {
+ "epoch": 0.6884160756501182,
+ "grad_norm": 2.8229613304138184,
+ "learning_rate": 4.8582476770877725e-06,
+ "loss": 0.6494,
+ "step": 1456
+ },
+ {
+ "epoch": 0.6888888888888889,
+ "grad_norm": 2.4235479831695557,
+ "learning_rate": 4.858040528007987e-06,
+ "loss": 0.5709,
+ "step": 1457
+ },
+ {
+ "epoch": 0.6893617021276596,
+ "grad_norm": 2.9348199367523193,
+ "learning_rate": 4.857833232103356e-06,
+ "loss": 0.5404,
+ "step": 1458
+ },
+ {
+ "epoch": 0.6898345153664303,
+ "grad_norm": 2.8274219036102295,
+ "learning_rate": 4.857625789386789e-06,
+ "loss": 0.701,
+ "step": 1459
+ },
+ {
+ "epoch": 0.6903073286052009,
+ "grad_norm": 3.136929988861084,
+ "learning_rate": 4.857418199871203e-06,
+ "loss": 0.6971,
+ "step": 1460
+ },
+ {
+ "epoch": 0.6907801418439716,
+ "grad_norm": 2.8987185955047607,
+ "learning_rate": 4.8572104635695214e-06,
+ "loss": 0.6613,
+ "step": 1461
+ },
+ {
+ "epoch": 0.6912529550827423,
+ "grad_norm": 2.5073442459106445,
+ "learning_rate": 4.857002580494681e-06,
+ "loss": 0.6032,
+ "step": 1462
+ },
+ {
+ "epoch": 0.691725768321513,
+ "grad_norm": 2.7019522190093994,
+ "learning_rate": 4.856794550659625e-06,
+ "loss": 0.567,
+ "step": 1463
+ },
+ {
+ "epoch": 0.6921985815602837,
+ "grad_norm": 2.4795594215393066,
+ "learning_rate": 4.8565863740773054e-06,
+ "loss": 0.5777,
+ "step": 1464
+ },
+ {
+ "epoch": 0.6926713947990544,
+ "grad_norm": 3.032506227493286,
+ "learning_rate": 4.856378050760687e-06,
+ "loss": 0.607,
+ "step": 1465
+ },
+ {
+ "epoch": 0.6931442080378251,
+ "grad_norm": 3.052091121673584,
+ "learning_rate": 4.85616958072274e-06,
+ "loss": 0.591,
+ "step": 1466
+ },
+ {
+ "epoch": 0.6936170212765957,
+ "grad_norm": 2.704831838607788,
+ "learning_rate": 4.855960963976443e-06,
+ "loss": 0.6528,
+ "step": 1467
+ },
+ {
+ "epoch": 0.6940898345153664,
+ "grad_norm": 2.680995225906372,
+ "learning_rate": 4.855752200534788e-06,
+ "loss": 0.6294,
+ "step": 1468
+ },
+ {
+ "epoch": 0.6945626477541371,
+ "grad_norm": 2.3948659896850586,
+ "learning_rate": 4.855543290410774e-06,
+ "loss": 0.6091,
+ "step": 1469
+ },
+ {
+ "epoch": 0.6950354609929078,
+ "grad_norm": 2.6407411098480225,
+ "learning_rate": 4.855334233617407e-06,
+ "loss": 0.5572,
+ "step": 1470
+ },
+ {
+ "epoch": 0.6955082742316785,
+ "grad_norm": 2.5526835918426514,
+ "learning_rate": 4.8551250301677064e-06,
+ "loss": 0.5432,
+ "step": 1471
+ },
+ {
+ "epoch": 0.6959810874704492,
+ "grad_norm": 3.1237430572509766,
+ "learning_rate": 4.8549156800746965e-06,
+ "loss": 0.5944,
+ "step": 1472
+ },
+ {
+ "epoch": 0.6964539007092199,
+ "grad_norm": 2.8112540245056152,
+ "learning_rate": 4.854706183351412e-06,
+ "loss": 0.604,
+ "step": 1473
+ },
+ {
+ "epoch": 0.6969267139479906,
+ "grad_norm": 2.664644479751587,
+ "learning_rate": 4.8544965400109e-06,
+ "loss": 0.5647,
+ "step": 1474
+ },
+ {
+ "epoch": 0.6973995271867612,
+ "grad_norm": 3.26310133934021,
+ "learning_rate": 4.854286750066212e-06,
+ "loss": 0.6999,
+ "step": 1475
+ },
+ {
+ "epoch": 0.6978723404255319,
+ "grad_norm": 2.9717442989349365,
+ "learning_rate": 4.8540768135304115e-06,
+ "loss": 0.6655,
+ "step": 1476
+ },
+ {
+ "epoch": 0.6983451536643026,
+ "grad_norm": 2.5302982330322266,
+ "learning_rate": 4.85386673041657e-06,
+ "loss": 0.6384,
+ "step": 1477
+ },
+ {
+ "epoch": 0.6988179669030733,
+ "grad_norm": 2.864877700805664,
+ "learning_rate": 4.853656500737769e-06,
+ "loss": 0.6834,
+ "step": 1478
+ },
+ {
+ "epoch": 0.699290780141844,
+ "grad_norm": 2.5522031784057617,
+ "learning_rate": 4.853446124507098e-06,
+ "loss": 0.5929,
+ "step": 1479
+ },
+ {
+ "epoch": 0.6997635933806147,
+ "grad_norm": 3.096477746963501,
+ "learning_rate": 4.853235601737656e-06,
+ "loss": 0.5737,
+ "step": 1480
+ },
+ {
+ "epoch": 0.7002364066193854,
+ "grad_norm": 2.884779214859009,
+ "learning_rate": 4.853024932442552e-06,
+ "loss": 0.6362,
+ "step": 1481
+ },
+ {
+ "epoch": 0.700709219858156,
+ "grad_norm": 3.368558406829834,
+ "learning_rate": 4.852814116634903e-06,
+ "loss": 0.6721,
+ "step": 1482
+ },
+ {
+ "epoch": 0.7011820330969267,
+ "grad_norm": 2.742414951324463,
+ "learning_rate": 4.852603154327837e-06,
+ "loss": 0.6212,
+ "step": 1483
+ },
+ {
+ "epoch": 0.7016548463356974,
+ "grad_norm": 2.53454852104187,
+ "learning_rate": 4.8523920455344864e-06,
+ "loss": 0.6675,
+ "step": 1484
+ },
+ {
+ "epoch": 0.7021276595744681,
+ "grad_norm": 2.9354238510131836,
+ "learning_rate": 4.852180790267999e-06,
+ "loss": 0.6692,
+ "step": 1485
+ },
+ {
+ "epoch": 0.7026004728132388,
+ "grad_norm": 2.585070848464966,
+ "learning_rate": 4.8519693885415274e-06,
+ "loss": 0.6215,
+ "step": 1486
+ },
+ {
+ "epoch": 0.7030732860520095,
+ "grad_norm": 2.9047999382019043,
+ "learning_rate": 4.851757840368235e-06,
+ "loss": 0.6231,
+ "step": 1487
+ },
+ {
+ "epoch": 0.7035460992907802,
+ "grad_norm": 3.0930933952331543,
+ "learning_rate": 4.851546145761295e-06,
+ "loss": 0.7267,
+ "step": 1488
+ },
+ {
+ "epoch": 0.7040189125295508,
+ "grad_norm": 3.0224719047546387,
+ "learning_rate": 4.8513343047338875e-06,
+ "loss": 0.6293,
+ "step": 1489
+ },
+ {
+ "epoch": 0.7044917257683215,
+ "grad_norm": 2.5758471488952637,
+ "learning_rate": 4.851122317299203e-06,
+ "loss": 0.5855,
+ "step": 1490
+ },
+ {
+ "epoch": 0.7049645390070922,
+ "grad_norm": 2.579272508621216,
+ "learning_rate": 4.850910183470441e-06,
+ "loss": 0.582,
+ "step": 1491
+ },
+ {
+ "epoch": 0.7054373522458629,
+ "grad_norm": 2.8148300647735596,
+ "learning_rate": 4.85069790326081e-06,
+ "loss": 0.6396,
+ "step": 1492
+ },
+ {
+ "epoch": 0.7059101654846336,
+ "grad_norm": 2.6380527019500732,
+ "learning_rate": 4.850485476683528e-06,
+ "loss": 0.6114,
+ "step": 1493
+ },
+ {
+ "epoch": 0.7063829787234043,
+ "grad_norm": 2.7736263275146484,
+ "learning_rate": 4.850272903751823e-06,
+ "loss": 0.6683,
+ "step": 1494
+ },
+ {
+ "epoch": 0.706855791962175,
+ "grad_norm": 3.1958179473876953,
+ "learning_rate": 4.8500601844789285e-06,
+ "loss": 0.6265,
+ "step": 1495
+ },
+ {
+ "epoch": 0.7073286052009456,
+ "grad_norm": 3.783212423324585,
+ "learning_rate": 4.8498473188780916e-06,
+ "loss": 0.6078,
+ "step": 1496
+ },
+ {
+ "epoch": 0.7078014184397163,
+ "grad_norm": 2.6656646728515625,
+ "learning_rate": 4.849634306962566e-06,
+ "loss": 0.5756,
+ "step": 1497
+ },
+ {
+ "epoch": 0.708274231678487,
+ "grad_norm": 2.757141590118408,
+ "learning_rate": 4.849421148745615e-06,
+ "loss": 0.5596,
+ "step": 1498
+ },
+ {
+ "epoch": 0.7087470449172577,
+ "grad_norm": 3.0391886234283447,
+ "learning_rate": 4.849207844240511e-06,
+ "loss": 0.5293,
+ "step": 1499
+ },
+ {
+ "epoch": 0.7092198581560284,
+ "grad_norm": 2.981912851333618,
+ "learning_rate": 4.848994393460535e-06,
+ "loss": 0.598,
+ "step": 1500
+ },
+ {
+ "epoch": 0.7096926713947991,
+ "grad_norm": 2.5470798015594482,
+ "learning_rate": 4.848780796418978e-06,
+ "loss": 0.6266,
+ "step": 1501
+ },
+ {
+ "epoch": 0.7101654846335698,
+ "grad_norm": 2.8394415378570557,
+ "learning_rate": 4.8485670531291415e-06,
+ "loss": 0.6844,
+ "step": 1502
+ },
+ {
+ "epoch": 0.7106382978723405,
+ "grad_norm": 3.2023508548736572,
+ "learning_rate": 4.848353163604331e-06,
+ "loss": 0.6134,
+ "step": 1503
+ },
+ {
+ "epoch": 0.7111111111111111,
+ "grad_norm": 2.98245906829834,
+ "learning_rate": 4.848139127857867e-06,
+ "loss": 0.7084,
+ "step": 1504
+ },
+ {
+ "epoch": 0.7115839243498818,
+ "grad_norm": 2.5917441844940186,
+ "learning_rate": 4.847924945903076e-06,
+ "loss": 0.5676,
+ "step": 1505
+ },
+ {
+ "epoch": 0.7120567375886525,
+ "grad_norm": 2.8736681938171387,
+ "learning_rate": 4.847710617753294e-06,
+ "loss": 0.6304,
+ "step": 1506
+ },
+ {
+ "epoch": 0.7125295508274232,
+ "grad_norm": 2.7832682132720947,
+ "learning_rate": 4.847496143421866e-06,
+ "loss": 0.5705,
+ "step": 1507
+ },
+ {
+ "epoch": 0.7130023640661939,
+ "grad_norm": 2.480560779571533,
+ "learning_rate": 4.847281522922147e-06,
+ "loss": 0.5595,
+ "step": 1508
+ },
+ {
+ "epoch": 0.7134751773049646,
+ "grad_norm": 2.357675313949585,
+ "learning_rate": 4.847066756267499e-06,
+ "loss": 0.5065,
+ "step": 1509
+ },
+ {
+ "epoch": 0.7139479905437353,
+ "grad_norm": 2.632669448852539,
+ "learning_rate": 4.846851843471296e-06,
+ "loss": 0.6949,
+ "step": 1510
+ },
+ {
+ "epoch": 0.7144208037825059,
+ "grad_norm": 2.7691073417663574,
+ "learning_rate": 4.84663678454692e-06,
+ "loss": 0.6638,
+ "step": 1511
+ },
+ {
+ "epoch": 0.7148936170212766,
+ "grad_norm": 2.5647685527801514,
+ "learning_rate": 4.846421579507761e-06,
+ "loss": 0.6098,
+ "step": 1512
+ },
+ {
+ "epoch": 0.7153664302600473,
+ "grad_norm": 2.476701021194458,
+ "learning_rate": 4.846206228367218e-06,
+ "loss": 0.592,
+ "step": 1513
+ },
+ {
+ "epoch": 0.715839243498818,
+ "grad_norm": 2.805727958679199,
+ "learning_rate": 4.845990731138702e-06,
+ "loss": 0.5466,
+ "step": 1514
+ },
+ {
+ "epoch": 0.7163120567375887,
+ "grad_norm": 2.551392078399658,
+ "learning_rate": 4.84577508783563e-06,
+ "loss": 0.6039,
+ "step": 1515
+ },
+ {
+ "epoch": 0.7167848699763594,
+ "grad_norm": 2.6861350536346436,
+ "learning_rate": 4.845559298471429e-06,
+ "loss": 0.6427,
+ "step": 1516
+ },
+ {
+ "epoch": 0.7172576832151301,
+ "grad_norm": 3.1908371448516846,
+ "learning_rate": 4.845343363059535e-06,
+ "loss": 0.5447,
+ "step": 1517
+ },
+ {
+ "epoch": 0.7177304964539007,
+ "grad_norm": 2.9021761417388916,
+ "learning_rate": 4.845127281613394e-06,
+ "loss": 0.5836,
+ "step": 1518
+ },
+ {
+ "epoch": 0.7182033096926714,
+ "grad_norm": 2.476670742034912,
+ "learning_rate": 4.844911054146461e-06,
+ "loss": 0.5863,
+ "step": 1519
+ },
+ {
+ "epoch": 0.7186761229314421,
+ "grad_norm": 2.662935495376587,
+ "learning_rate": 4.844694680672198e-06,
+ "loss": 0.5678,
+ "step": 1520
+ },
+ {
+ "epoch": 0.7191489361702128,
+ "grad_norm": 2.677896738052368,
+ "learning_rate": 4.844478161204079e-06,
+ "loss": 0.6195,
+ "step": 1521
+ },
+ {
+ "epoch": 0.7196217494089835,
+ "grad_norm": 2.781921863555908,
+ "learning_rate": 4.844261495755585e-06,
+ "loss": 0.643,
+ "step": 1522
+ },
+ {
+ "epoch": 0.7200945626477542,
+ "grad_norm": 3.0157392024993896,
+ "learning_rate": 4.844044684340206e-06,
+ "loss": 0.7559,
+ "step": 1523
+ },
+ {
+ "epoch": 0.7205673758865249,
+ "grad_norm": 2.8109354972839355,
+ "learning_rate": 4.843827726971444e-06,
+ "loss": 0.6264,
+ "step": 1524
+ },
+ {
+ "epoch": 0.7210401891252955,
+ "grad_norm": 3.0953569412231445,
+ "learning_rate": 4.8436106236628064e-06,
+ "loss": 0.6429,
+ "step": 1525
+ },
+ {
+ "epoch": 0.7215130023640662,
+ "grad_norm": 2.6850643157958984,
+ "learning_rate": 4.843393374427812e-06,
+ "loss": 0.6598,
+ "step": 1526
+ },
+ {
+ "epoch": 0.7219858156028369,
+ "grad_norm": 3.043480634689331,
+ "learning_rate": 4.8431759792799874e-06,
+ "loss": 0.6331,
+ "step": 1527
+ },
+ {
+ "epoch": 0.7224586288416076,
+ "grad_norm": 2.723870038986206,
+ "learning_rate": 4.842958438232868e-06,
+ "loss": 0.6259,
+ "step": 1528
+ },
+ {
+ "epoch": 0.7229314420803783,
+ "grad_norm": 2.822492837905884,
+ "learning_rate": 4.842740751300002e-06,
+ "loss": 0.6554,
+ "step": 1529
+ },
+ {
+ "epoch": 0.723404255319149,
+ "grad_norm": 2.7866315841674805,
+ "learning_rate": 4.842522918494941e-06,
+ "loss": 0.6991,
+ "step": 1530
+ },
+ {
+ "epoch": 0.7238770685579197,
+ "grad_norm": 2.8881826400756836,
+ "learning_rate": 4.84230493983125e-06,
+ "loss": 0.5876,
+ "step": 1531
+ },
+ {
+ "epoch": 0.7243498817966904,
+ "grad_norm": 2.7456939220428467,
+ "learning_rate": 4.8420868153225e-06,
+ "loss": 0.6188,
+ "step": 1532
+ },
+ {
+ "epoch": 0.724822695035461,
+ "grad_norm": 3.0257532596588135,
+ "learning_rate": 4.841868544982274e-06,
+ "loss": 0.63,
+ "step": 1533
+ },
+ {
+ "epoch": 0.7252955082742317,
+ "grad_norm": 3.1581954956054688,
+ "learning_rate": 4.841650128824164e-06,
+ "loss": 0.7214,
+ "step": 1534
+ },
+ {
+ "epoch": 0.7257683215130024,
+ "grad_norm": 2.9174306392669678,
+ "learning_rate": 4.841431566861767e-06,
+ "loss": 0.704,
+ "step": 1535
+ },
+ {
+ "epoch": 0.7262411347517731,
+ "grad_norm": 2.5019054412841797,
+ "learning_rate": 4.8412128591086935e-06,
+ "loss": 0.6298,
+ "step": 1536
+ },
+ {
+ "epoch": 0.7267139479905438,
+ "grad_norm": 2.724285125732422,
+ "learning_rate": 4.840994005578562e-06,
+ "loss": 0.6289,
+ "step": 1537
+ },
+ {
+ "epoch": 0.7271867612293145,
+ "grad_norm": 2.5882341861724854,
+ "learning_rate": 4.840775006284998e-06,
+ "loss": 0.6355,
+ "step": 1538
+ },
+ {
+ "epoch": 0.7276595744680852,
+ "grad_norm": 3.1281991004943848,
+ "learning_rate": 4.840555861241638e-06,
+ "loss": 0.5551,
+ "step": 1539
+ },
+ {
+ "epoch": 0.7281323877068558,
+ "grad_norm": 2.6064817905426025,
+ "learning_rate": 4.840336570462127e-06,
+ "loss": 0.5543,
+ "step": 1540
+ },
+ {
+ "epoch": 0.7286052009456265,
+ "grad_norm": 2.67112398147583,
+ "learning_rate": 4.840117133960122e-06,
+ "loss": 0.6044,
+ "step": 1541
+ },
+ {
+ "epoch": 0.7290780141843972,
+ "grad_norm": 2.838022232055664,
+ "learning_rate": 4.839897551749282e-06,
+ "loss": 0.6814,
+ "step": 1542
+ },
+ {
+ "epoch": 0.7295508274231679,
+ "grad_norm": 2.8897151947021484,
+ "learning_rate": 4.839677823843283e-06,
+ "loss": 0.593,
+ "step": 1543
+ },
+ {
+ "epoch": 0.7300236406619386,
+ "grad_norm": 2.9238014221191406,
+ "learning_rate": 4.839457950255805e-06,
+ "loss": 0.5544,
+ "step": 1544
+ },
+ {
+ "epoch": 0.7304964539007093,
+ "grad_norm": 3.016876459121704,
+ "learning_rate": 4.839237931000538e-06,
+ "loss": 0.6099,
+ "step": 1545
+ },
+ {
+ "epoch": 0.7309692671394799,
+ "grad_norm": 2.9415392875671387,
+ "learning_rate": 4.839017766091182e-06,
+ "loss": 0.6413,
+ "step": 1546
+ },
+ {
+ "epoch": 0.7314420803782505,
+ "grad_norm": 2.658067226409912,
+ "learning_rate": 4.838797455541446e-06,
+ "loss": 0.6534,
+ "step": 1547
+ },
+ {
+ "epoch": 0.7319148936170212,
+ "grad_norm": 2.460358142852783,
+ "learning_rate": 4.838576999365049e-06,
+ "loss": 0.5307,
+ "step": 1548
+ },
+ {
+ "epoch": 0.7323877068557919,
+ "grad_norm": 2.5818674564361572,
+ "learning_rate": 4.838356397575716e-06,
+ "loss": 0.6265,
+ "step": 1549
+ },
+ {
+ "epoch": 0.7328605200945626,
+ "grad_norm": 3.009197473526001,
+ "learning_rate": 4.838135650187183e-06,
+ "loss": 0.6957,
+ "step": 1550
+ },
+ {
+ "epoch": 0.7333333333333333,
+ "grad_norm": 2.738543748855591,
+ "learning_rate": 4.837914757213196e-06,
+ "loss": 0.646,
+ "step": 1551
+ },
+ {
+ "epoch": 0.733806146572104,
+ "grad_norm": 2.8208494186401367,
+ "learning_rate": 4.837693718667508e-06,
+ "loss": 0.5936,
+ "step": 1552
+ },
+ {
+ "epoch": 0.7342789598108747,
+ "grad_norm": 3.1574649810791016,
+ "learning_rate": 4.837472534563883e-06,
+ "loss": 0.6455,
+ "step": 1553
+ },
+ {
+ "epoch": 0.7347517730496453,
+ "grad_norm": 2.6737420558929443,
+ "learning_rate": 4.837251204916093e-06,
+ "loss": 0.5921,
+ "step": 1554
+ },
+ {
+ "epoch": 0.735224586288416,
+ "grad_norm": 2.424983024597168,
+ "learning_rate": 4.837029729737918e-06,
+ "loss": 0.6346,
+ "step": 1555
+ },
+ {
+ "epoch": 0.7356973995271867,
+ "grad_norm": 2.5163493156433105,
+ "learning_rate": 4.836808109043151e-06,
+ "loss": 0.6061,
+ "step": 1556
+ },
+ {
+ "epoch": 0.7361702127659574,
+ "grad_norm": 2.8377044200897217,
+ "learning_rate": 4.836586342845588e-06,
+ "loss": 0.611,
+ "step": 1557
+ },
+ {
+ "epoch": 0.7366430260047281,
+ "grad_norm": 2.5929181575775146,
+ "learning_rate": 4.83636443115904e-06,
+ "loss": 0.5496,
+ "step": 1558
+ },
+ {
+ "epoch": 0.7371158392434988,
+ "grad_norm": 2.5017223358154297,
+ "learning_rate": 4.836142373997323e-06,
+ "loss": 0.6235,
+ "step": 1559
+ },
+ {
+ "epoch": 0.7375886524822695,
+ "grad_norm": 2.822500228881836,
+ "learning_rate": 4.835920171374265e-06,
+ "loss": 0.6147,
+ "step": 1560
+ },
+ {
+ "epoch": 0.7380614657210401,
+ "grad_norm": 2.7234230041503906,
+ "learning_rate": 4.8356978233037e-06,
+ "loss": 0.6228,
+ "step": 1561
+ },
+ {
+ "epoch": 0.7385342789598108,
+ "grad_norm": 2.9565515518188477,
+ "learning_rate": 4.835475329799472e-06,
+ "loss": 0.5728,
+ "step": 1562
+ },
+ {
+ "epoch": 0.7390070921985815,
+ "grad_norm": 2.4356038570404053,
+ "learning_rate": 4.835252690875438e-06,
+ "loss": 0.6723,
+ "step": 1563
+ },
+ {
+ "epoch": 0.7394799054373522,
+ "grad_norm": 2.765913248062134,
+ "learning_rate": 4.835029906545458e-06,
+ "loss": 0.5805,
+ "step": 1564
+ },
+ {
+ "epoch": 0.7399527186761229,
+ "grad_norm": 2.4481914043426514,
+ "learning_rate": 4.834806976823405e-06,
+ "loss": 0.599,
+ "step": 1565
+ },
+ {
+ "epoch": 0.7404255319148936,
+ "grad_norm": 2.620779514312744,
+ "learning_rate": 4.834583901723158e-06,
+ "loss": 0.63,
+ "step": 1566
+ },
+ {
+ "epoch": 0.7408983451536643,
+ "grad_norm": 2.654426097869873,
+ "learning_rate": 4.83436068125861e-06,
+ "loss": 0.6544,
+ "step": 1567
+ },
+ {
+ "epoch": 0.741371158392435,
+ "grad_norm": 2.589623212814331,
+ "learning_rate": 4.834137315443656e-06,
+ "loss": 0.5596,
+ "step": 1568
+ },
+ {
+ "epoch": 0.7418439716312056,
+ "grad_norm": 2.572883129119873,
+ "learning_rate": 4.833913804292209e-06,
+ "loss": 0.5974,
+ "step": 1569
+ },
+ {
+ "epoch": 0.7423167848699763,
+ "grad_norm": 2.8744914531707764,
+ "learning_rate": 4.833690147818181e-06,
+ "loss": 0.5364,
+ "step": 1570
+ },
+ {
+ "epoch": 0.742789598108747,
+ "grad_norm": 2.9800851345062256,
+ "learning_rate": 4.833466346035502e-06,
+ "loss": 0.6287,
+ "step": 1571
+ },
+ {
+ "epoch": 0.7432624113475177,
+ "grad_norm": 2.627784490585327,
+ "learning_rate": 4.833242398958105e-06,
+ "loss": 0.621,
+ "step": 1572
+ },
+ {
+ "epoch": 0.7437352245862884,
+ "grad_norm": 2.5187721252441406,
+ "learning_rate": 4.833018306599933e-06,
+ "loss": 0.5901,
+ "step": 1573
+ },
+ {
+ "epoch": 0.7442080378250591,
+ "grad_norm": 2.4843688011169434,
+ "learning_rate": 4.832794068974944e-06,
+ "loss": 0.6336,
+ "step": 1574
+ },
+ {
+ "epoch": 0.7446808510638298,
+ "grad_norm": 2.774911880493164,
+ "learning_rate": 4.832569686097096e-06,
+ "loss": 0.6091,
+ "step": 1575
+ },
+ {
+ "epoch": 0.7451536643026004,
+ "grad_norm": 3.2562527656555176,
+ "learning_rate": 4.8323451579803615e-06,
+ "loss": 0.7686,
+ "step": 1576
+ },
+ {
+ "epoch": 0.7456264775413711,
+ "grad_norm": 2.799570083618164,
+ "learning_rate": 4.832120484638721e-06,
+ "loss": 0.6233,
+ "step": 1577
+ },
+ {
+ "epoch": 0.7460992907801418,
+ "grad_norm": 2.661893367767334,
+ "learning_rate": 4.831895666086164e-06,
+ "loss": 0.5841,
+ "step": 1578
+ },
+ {
+ "epoch": 0.7465721040189125,
+ "grad_norm": 3.0382652282714844,
+ "learning_rate": 4.831670702336689e-06,
+ "loss": 0.5769,
+ "step": 1579
+ },
+ {
+ "epoch": 0.7470449172576832,
+ "grad_norm": 2.676398515701294,
+ "learning_rate": 4.831445593404304e-06,
+ "loss": 0.619,
+ "step": 1580
+ },
+ {
+ "epoch": 0.7475177304964539,
+ "grad_norm": 2.717916965484619,
+ "learning_rate": 4.831220339303024e-06,
+ "loss": 0.5787,
+ "step": 1581
+ },
+ {
+ "epoch": 0.7479905437352246,
+ "grad_norm": 2.3918066024780273,
+ "learning_rate": 4.830994940046876e-06,
+ "loss": 0.5108,
+ "step": 1582
+ },
+ {
+ "epoch": 0.7484633569739952,
+ "grad_norm": 2.709144115447998,
+ "learning_rate": 4.830769395649895e-06,
+ "loss": 0.6875,
+ "step": 1583
+ },
+ {
+ "epoch": 0.7489361702127659,
+ "grad_norm": 2.8711116313934326,
+ "learning_rate": 4.830543706126123e-06,
+ "loss": 0.6745,
+ "step": 1584
+ },
+ {
+ "epoch": 0.7494089834515366,
+ "grad_norm": 2.612339496612549,
+ "learning_rate": 4.830317871489614e-06,
+ "loss": 0.5738,
+ "step": 1585
+ },
+ {
+ "epoch": 0.7498817966903073,
+ "grad_norm": 2.4355857372283936,
+ "learning_rate": 4.830091891754429e-06,
+ "loss": 0.5907,
+ "step": 1586
+ },
+ {
+ "epoch": 0.750354609929078,
+ "grad_norm": 2.676051378250122,
+ "learning_rate": 4.829865766934638e-06,
+ "loss": 0.6628,
+ "step": 1587
+ },
+ {
+ "epoch": 0.7508274231678487,
+ "grad_norm": 2.66489839553833,
+ "learning_rate": 4.829639497044323e-06,
+ "loss": 0.5984,
+ "step": 1588
+ },
+ {
+ "epoch": 0.7513002364066194,
+ "grad_norm": 2.5358035564422607,
+ "learning_rate": 4.829413082097572e-06,
+ "loss": 0.5867,
+ "step": 1589
+ },
+ {
+ "epoch": 0.75177304964539,
+ "grad_norm": 2.6530144214630127,
+ "learning_rate": 4.8291865221084815e-06,
+ "loss": 0.5917,
+ "step": 1590
+ },
+ {
+ "epoch": 0.7522458628841607,
+ "grad_norm": 2.5160958766937256,
+ "learning_rate": 4.82895981709116e-06,
+ "loss": 0.6347,
+ "step": 1591
+ },
+ {
+ "epoch": 0.7527186761229314,
+ "grad_norm": 2.61592698097229,
+ "learning_rate": 4.8287329670597225e-06,
+ "loss": 0.5472,
+ "step": 1592
+ },
+ {
+ "epoch": 0.7531914893617021,
+ "grad_norm": 2.7528622150421143,
+ "learning_rate": 4.828505972028296e-06,
+ "loss": 0.5842,
+ "step": 1593
+ },
+ {
+ "epoch": 0.7536643026004728,
+ "grad_norm": 2.8154072761535645,
+ "learning_rate": 4.828278832011011e-06,
+ "loss": 0.5757,
+ "step": 1594
+ },
+ {
+ "epoch": 0.7541371158392435,
+ "grad_norm": 3.118515729904175,
+ "learning_rate": 4.828051547022013e-06,
+ "loss": 0.6472,
+ "step": 1595
+ },
+ {
+ "epoch": 0.7546099290780142,
+ "grad_norm": 2.452033758163452,
+ "learning_rate": 4.827824117075453e-06,
+ "loss": 0.5571,
+ "step": 1596
+ },
+ {
+ "epoch": 0.7550827423167848,
+ "grad_norm": 2.984388828277588,
+ "learning_rate": 4.827596542185492e-06,
+ "loss": 0.6656,
+ "step": 1597
+ },
+ {
+ "epoch": 0.7555555555555555,
+ "grad_norm": 2.61356782913208,
+ "learning_rate": 4.8273688223663014e-06,
+ "loss": 0.6444,
+ "step": 1598
+ },
+ {
+ "epoch": 0.7560283687943262,
+ "grad_norm": 2.8967196941375732,
+ "learning_rate": 4.8271409576320595e-06,
+ "loss": 0.6457,
+ "step": 1599
+ },
+ {
+ "epoch": 0.7565011820330969,
+ "grad_norm": 2.852367639541626,
+ "learning_rate": 4.826912947996954e-06,
+ "loss": 0.5629,
+ "step": 1600
+ },
+ {
+ "epoch": 0.7569739952718676,
+ "grad_norm": 2.905280590057373,
+ "learning_rate": 4.826684793475182e-06,
+ "loss": 0.6245,
+ "step": 1601
+ },
+ {
+ "epoch": 0.7574468085106383,
+ "grad_norm": 2.6156530380249023,
+ "learning_rate": 4.826456494080951e-06,
+ "loss": 0.5869,
+ "step": 1602
+ },
+ {
+ "epoch": 0.757919621749409,
+ "grad_norm": 2.6490228176116943,
+ "learning_rate": 4.826228049828475e-06,
+ "loss": 0.5461,
+ "step": 1603
+ },
+ {
+ "epoch": 0.7583924349881797,
+ "grad_norm": 2.9626693725585938,
+ "learning_rate": 4.825999460731978e-06,
+ "loss": 0.6842,
+ "step": 1604
+ },
+ {
+ "epoch": 0.7588652482269503,
+ "grad_norm": 2.6866023540496826,
+ "learning_rate": 4.825770726805695e-06,
+ "loss": 0.5726,
+ "step": 1605
+ },
+ {
+ "epoch": 0.759338061465721,
+ "grad_norm": 2.5525858402252197,
+ "learning_rate": 4.825541848063866e-06,
+ "loss": 0.6061,
+ "step": 1606
+ },
+ {
+ "epoch": 0.7598108747044917,
+ "grad_norm": 2.703977584838867,
+ "learning_rate": 4.825312824520743e-06,
+ "loss": 0.6726,
+ "step": 1607
+ },
+ {
+ "epoch": 0.7602836879432624,
+ "grad_norm": 2.856534957885742,
+ "learning_rate": 4.825083656190588e-06,
+ "loss": 0.625,
+ "step": 1608
+ },
+ {
+ "epoch": 0.7607565011820331,
+ "grad_norm": 2.8564887046813965,
+ "learning_rate": 4.824854343087668e-06,
+ "loss": 0.7251,
+ "step": 1609
+ },
+ {
+ "epoch": 0.7612293144208038,
+ "grad_norm": 2.327650308609009,
+ "learning_rate": 4.824624885226262e-06,
+ "loss": 0.526,
+ "step": 1610
+ },
+ {
+ "epoch": 0.7617021276595745,
+ "grad_norm": 3.0025737285614014,
+ "learning_rate": 4.824395282620659e-06,
+ "loss": 0.6043,
+ "step": 1611
+ },
+ {
+ "epoch": 0.7621749408983451,
+ "grad_norm": 2.5441737174987793,
+ "learning_rate": 4.824165535285152e-06,
+ "loss": 0.6276,
+ "step": 1612
+ },
+ {
+ "epoch": 0.7626477541371158,
+ "grad_norm": 2.4177372455596924,
+ "learning_rate": 4.823935643234049e-06,
+ "loss": 0.6419,
+ "step": 1613
+ },
+ {
+ "epoch": 0.7631205673758865,
+ "grad_norm": 2.9210550785064697,
+ "learning_rate": 4.823705606481664e-06,
+ "loss": 0.5663,
+ "step": 1614
+ },
+ {
+ "epoch": 0.7635933806146572,
+ "grad_norm": 2.6353724002838135,
+ "learning_rate": 4.82347542504232e-06,
+ "loss": 0.5669,
+ "step": 1615
+ },
+ {
+ "epoch": 0.7640661938534279,
+ "grad_norm": 2.419081926345825,
+ "learning_rate": 4.823245098930349e-06,
+ "loss": 0.5777,
+ "step": 1616
+ },
+ {
+ "epoch": 0.7645390070921986,
+ "grad_norm": 2.5077571868896484,
+ "learning_rate": 4.823014628160093e-06,
+ "loss": 0.5924,
+ "step": 1617
+ },
+ {
+ "epoch": 0.7650118203309693,
+ "grad_norm": 2.816056251525879,
+ "learning_rate": 4.822784012745902e-06,
+ "loss": 0.7273,
+ "step": 1618
+ },
+ {
+ "epoch": 0.76548463356974,
+ "grad_norm": 2.7163147926330566,
+ "learning_rate": 4.8225532527021366e-06,
+ "loss": 0.5545,
+ "step": 1619
+ },
+ {
+ "epoch": 0.7659574468085106,
+ "grad_norm": 2.4784302711486816,
+ "learning_rate": 4.822322348043164e-06,
+ "loss": 0.556,
+ "step": 1620
+ },
+ {
+ "epoch": 0.7664302600472813,
+ "grad_norm": 2.712467670440674,
+ "learning_rate": 4.822091298783361e-06,
+ "loss": 0.6501,
+ "step": 1621
+ },
+ {
+ "epoch": 0.766903073286052,
+ "grad_norm": 2.7217724323272705,
+ "learning_rate": 4.821860104937115e-06,
+ "loss": 0.5989,
+ "step": 1622
+ },
+ {
+ "epoch": 0.7673758865248227,
+ "grad_norm": 2.5622854232788086,
+ "learning_rate": 4.821628766518821e-06,
+ "loss": 0.5263,
+ "step": 1623
+ },
+ {
+ "epoch": 0.7678486997635934,
+ "grad_norm": 3.230923891067505,
+ "learning_rate": 4.821397283542884e-06,
+ "loss": 0.6707,
+ "step": 1624
+ },
+ {
+ "epoch": 0.7683215130023641,
+ "grad_norm": 2.37929105758667,
+ "learning_rate": 4.821165656023718e-06,
+ "loss": 0.6124,
+ "step": 1625
+ },
+ {
+ "epoch": 0.7687943262411348,
+ "grad_norm": 2.9811325073242188,
+ "learning_rate": 4.820933883975745e-06,
+ "loss": 0.6435,
+ "step": 1626
+ },
+ {
+ "epoch": 0.7692671394799054,
+ "grad_norm": 2.887380838394165,
+ "learning_rate": 4.820701967413395e-06,
+ "loss": 0.621,
+ "step": 1627
+ },
+ {
+ "epoch": 0.7697399527186761,
+ "grad_norm": 2.6762876510620117,
+ "learning_rate": 4.820469906351109e-06,
+ "loss": 0.5713,
+ "step": 1628
+ },
+ {
+ "epoch": 0.7702127659574468,
+ "grad_norm": 2.7347512245178223,
+ "learning_rate": 4.820237700803337e-06,
+ "loss": 0.6136,
+ "step": 1629
+ },
+ {
+ "epoch": 0.7706855791962175,
+ "grad_norm": 2.7244746685028076,
+ "learning_rate": 4.820005350784539e-06,
+ "loss": 0.5816,
+ "step": 1630
+ },
+ {
+ "epoch": 0.7711583924349882,
+ "grad_norm": 2.9293999671936035,
+ "learning_rate": 4.8197728563091795e-06,
+ "loss": 0.6649,
+ "step": 1631
+ },
+ {
+ "epoch": 0.7716312056737589,
+ "grad_norm": 2.4402127265930176,
+ "learning_rate": 4.819540217391736e-06,
+ "loss": 0.6481,
+ "step": 1632
+ },
+ {
+ "epoch": 0.7721040189125296,
+ "grad_norm": 3.083941698074341,
+ "learning_rate": 4.819307434046694e-06,
+ "loss": 0.6951,
+ "step": 1633
+ },
+ {
+ "epoch": 0.7725768321513002,
+ "grad_norm": 2.544952392578125,
+ "learning_rate": 4.819074506288548e-06,
+ "loss": 0.539,
+ "step": 1634
+ },
+ {
+ "epoch": 0.7730496453900709,
+ "grad_norm": 2.7791268825531006,
+ "learning_rate": 4.818841434131801e-06,
+ "loss": 0.5827,
+ "step": 1635
+ },
+ {
+ "epoch": 0.7735224586288416,
+ "grad_norm": 2.7349796295166016,
+ "learning_rate": 4.818608217590967e-06,
+ "loss": 0.5584,
+ "step": 1636
+ },
+ {
+ "epoch": 0.7739952718676123,
+ "grad_norm": 2.637652635574341,
+ "learning_rate": 4.818374856680565e-06,
+ "loss": 0.6386,
+ "step": 1637
+ },
+ {
+ "epoch": 0.774468085106383,
+ "grad_norm": 2.9821584224700928,
+ "learning_rate": 4.818141351415127e-06,
+ "loss": 0.6734,
+ "step": 1638
+ },
+ {
+ "epoch": 0.7749408983451537,
+ "grad_norm": 2.992938995361328,
+ "learning_rate": 4.817907701809192e-06,
+ "loss": 0.5899,
+ "step": 1639
+ },
+ {
+ "epoch": 0.7754137115839244,
+ "grad_norm": 4.35719633102417,
+ "learning_rate": 4.8176739078773076e-06,
+ "loss": 0.6281,
+ "step": 1640
+ },
+ {
+ "epoch": 0.775886524822695,
+ "grad_norm": 2.838146209716797,
+ "learning_rate": 4.8174399696340315e-06,
+ "loss": 0.5766,
+ "step": 1641
+ },
+ {
+ "epoch": 0.7763593380614657,
+ "grad_norm": 3.3116989135742188,
+ "learning_rate": 4.81720588709393e-06,
+ "loss": 0.6409,
+ "step": 1642
+ },
+ {
+ "epoch": 0.7768321513002364,
+ "grad_norm": 2.9843590259552,
+ "learning_rate": 4.816971660271579e-06,
+ "loss": 0.6108,
+ "step": 1643
+ },
+ {
+ "epoch": 0.7773049645390071,
+ "grad_norm": 2.843770742416382,
+ "learning_rate": 4.816737289181562e-06,
+ "loss": 0.6053,
+ "step": 1644
+ },
+ {
+ "epoch": 0.7777777777777778,
+ "grad_norm": 2.7608556747436523,
+ "learning_rate": 4.816502773838473e-06,
+ "loss": 0.5854,
+ "step": 1645
+ },
+ {
+ "epoch": 0.7782505910165485,
+ "grad_norm": 3.343682289123535,
+ "learning_rate": 4.816268114256914e-06,
+ "loss": 0.6329,
+ "step": 1646
+ },
+ {
+ "epoch": 0.7787234042553192,
+ "grad_norm": 2.769768476486206,
+ "learning_rate": 4.816033310451496e-06,
+ "loss": 0.6242,
+ "step": 1647
+ },
+ {
+ "epoch": 0.7791962174940898,
+ "grad_norm": 2.989851713180542,
+ "learning_rate": 4.815798362436838e-06,
+ "loss": 0.6493,
+ "step": 1648
+ },
+ {
+ "epoch": 0.7796690307328605,
+ "grad_norm": 3.170736312866211,
+ "learning_rate": 4.8155632702275716e-06,
+ "loss": 0.6341,
+ "step": 1649
+ },
+ {
+ "epoch": 0.7801418439716312,
+ "grad_norm": 2.7372522354125977,
+ "learning_rate": 4.815328033838334e-06,
+ "loss": 0.5445,
+ "step": 1650
+ },
+ {
+ "epoch": 0.7806146572104019,
+ "grad_norm": 2.6947238445281982,
+ "learning_rate": 4.8150926532837715e-06,
+ "loss": 0.6437,
+ "step": 1651
+ },
+ {
+ "epoch": 0.7810874704491726,
+ "grad_norm": 2.472323179244995,
+ "learning_rate": 4.81485712857854e-06,
+ "loss": 0.5751,
+ "step": 1652
+ },
+ {
+ "epoch": 0.7815602836879433,
+ "grad_norm": 2.791114091873169,
+ "learning_rate": 4.814621459737308e-06,
+ "loss": 0.5996,
+ "step": 1653
+ },
+ {
+ "epoch": 0.782033096926714,
+ "grad_norm": 3.1957521438598633,
+ "learning_rate": 4.814385646774745e-06,
+ "loss": 0.5803,
+ "step": 1654
+ },
+ {
+ "epoch": 0.7825059101654847,
+ "grad_norm": 2.4120798110961914,
+ "learning_rate": 4.8141496897055364e-06,
+ "loss": 0.5814,
+ "step": 1655
+ },
+ {
+ "epoch": 0.7829787234042553,
+ "grad_norm": 2.9262423515319824,
+ "learning_rate": 4.813913588544374e-06,
+ "loss": 0.6292,
+ "step": 1656
+ },
+ {
+ "epoch": 0.783451536643026,
+ "grad_norm": 2.8251047134399414,
+ "learning_rate": 4.813677343305959e-06,
+ "loss": 0.6787,
+ "step": 1657
+ },
+ {
+ "epoch": 0.7839243498817967,
+ "grad_norm": 2.931659698486328,
+ "learning_rate": 4.8134409540050005e-06,
+ "loss": 0.6163,
+ "step": 1658
+ },
+ {
+ "epoch": 0.7843971631205674,
+ "grad_norm": 2.7160706520080566,
+ "learning_rate": 4.813204420656219e-06,
+ "loss": 0.6831,
+ "step": 1659
+ },
+ {
+ "epoch": 0.7848699763593381,
+ "grad_norm": 3.2134454250335693,
+ "learning_rate": 4.81296774327434e-06,
+ "loss": 0.6002,
+ "step": 1660
+ },
+ {
+ "epoch": 0.7853427895981088,
+ "grad_norm": 2.4002513885498047,
+ "learning_rate": 4.812730921874103e-06,
+ "loss": 0.5488,
+ "step": 1661
+ },
+ {
+ "epoch": 0.7858156028368795,
+ "grad_norm": 2.5559282302856445,
+ "learning_rate": 4.812493956470251e-06,
+ "loss": 0.5802,
+ "step": 1662
+ },
+ {
+ "epoch": 0.7862884160756501,
+ "grad_norm": 2.57478404045105,
+ "learning_rate": 4.812256847077541e-06,
+ "loss": 0.646,
+ "step": 1663
+ },
+ {
+ "epoch": 0.7867612293144208,
+ "grad_norm": 2.811851978302002,
+ "learning_rate": 4.812019593710736e-06,
+ "loss": 0.6245,
+ "step": 1664
+ },
+ {
+ "epoch": 0.7872340425531915,
+ "grad_norm": 2.5228829383850098,
+ "learning_rate": 4.811782196384609e-06,
+ "loss": 0.5949,
+ "step": 1665
+ },
+ {
+ "epoch": 0.7877068557919622,
+ "grad_norm": 2.744096040725708,
+ "learning_rate": 4.8115446551139415e-06,
+ "loss": 0.6006,
+ "step": 1666
+ },
+ {
+ "epoch": 0.7881796690307329,
+ "grad_norm": 3.129242420196533,
+ "learning_rate": 4.811306969913524e-06,
+ "loss": 0.7251,
+ "step": 1667
+ },
+ {
+ "epoch": 0.7886524822695036,
+ "grad_norm": 2.7855660915374756,
+ "learning_rate": 4.811069140798156e-06,
+ "loss": 0.6534,
+ "step": 1668
+ },
+ {
+ "epoch": 0.7891252955082743,
+ "grad_norm": 2.836603879928589,
+ "learning_rate": 4.810831167782647e-06,
+ "loss": 0.6661,
+ "step": 1669
+ },
+ {
+ "epoch": 0.789598108747045,
+ "grad_norm": 2.5339887142181396,
+ "learning_rate": 4.810593050881813e-06,
+ "loss": 0.5354,
+ "step": 1670
+ },
+ {
+ "epoch": 0.7900709219858156,
+ "grad_norm": 2.9553709030151367,
+ "learning_rate": 4.810354790110482e-06,
+ "loss": 0.6001,
+ "step": 1671
+ },
+ {
+ "epoch": 0.7905437352245863,
+ "grad_norm": 2.6581788063049316,
+ "learning_rate": 4.8101163854834885e-06,
+ "loss": 0.6802,
+ "step": 1672
+ },
+ {
+ "epoch": 0.791016548463357,
+ "grad_norm": 3.2002551555633545,
+ "learning_rate": 4.809877837015677e-06,
+ "loss": 0.6641,
+ "step": 1673
+ },
+ {
+ "epoch": 0.7914893617021277,
+ "grad_norm": 2.918792963027954,
+ "learning_rate": 4.809639144721902e-06,
+ "loss": 0.6758,
+ "step": 1674
+ },
+ {
+ "epoch": 0.7919621749408984,
+ "grad_norm": 2.7993946075439453,
+ "learning_rate": 4.8094003086170245e-06,
+ "loss": 0.5889,
+ "step": 1675
+ },
+ {
+ "epoch": 0.7924349881796691,
+ "grad_norm": 2.3698952198028564,
+ "learning_rate": 4.809161328715916e-06,
+ "loss": 0.6244,
+ "step": 1676
+ },
+ {
+ "epoch": 0.7929078014184398,
+ "grad_norm": 2.8891594409942627,
+ "learning_rate": 4.808922205033458e-06,
+ "loss": 0.5835,
+ "step": 1677
+ },
+ {
+ "epoch": 0.7933806146572104,
+ "grad_norm": 2.838345766067505,
+ "learning_rate": 4.808682937584537e-06,
+ "loss": 0.6907,
+ "step": 1678
+ },
+ {
+ "epoch": 0.7938534278959811,
+ "grad_norm": 2.8443174362182617,
+ "learning_rate": 4.808443526384053e-06,
+ "loss": 0.6692,
+ "step": 1679
+ },
+ {
+ "epoch": 0.7943262411347518,
+ "grad_norm": 2.7355034351348877,
+ "learning_rate": 4.808203971446913e-06,
+ "loss": 0.5799,
+ "step": 1680
+ },
+ {
+ "epoch": 0.7947990543735225,
+ "grad_norm": 2.7108020782470703,
+ "learning_rate": 4.807964272788033e-06,
+ "loss": 0.652,
+ "step": 1681
+ },
+ {
+ "epoch": 0.7952718676122932,
+ "grad_norm": 2.397650957107544,
+ "learning_rate": 4.807724430422338e-06,
+ "loss": 0.5418,
+ "step": 1682
+ },
+ {
+ "epoch": 0.7957446808510639,
+ "grad_norm": 2.4981582164764404,
+ "learning_rate": 4.807484444364762e-06,
+ "loss": 0.5731,
+ "step": 1683
+ },
+ {
+ "epoch": 0.7962174940898346,
+ "grad_norm": 2.7943713665008545,
+ "learning_rate": 4.8072443146302475e-06,
+ "loss": 0.5913,
+ "step": 1684
+ },
+ {
+ "epoch": 0.7966903073286052,
+ "grad_norm": 2.5691423416137695,
+ "learning_rate": 4.807004041233746e-06,
+ "loss": 0.6475,
+ "step": 1685
+ },
+ {
+ "epoch": 0.7971631205673759,
+ "grad_norm": 3.2367498874664307,
+ "learning_rate": 4.8067636241902195e-06,
+ "loss": 0.675,
+ "step": 1686
+ },
+ {
+ "epoch": 0.7976359338061466,
+ "grad_norm": 3.000595808029175,
+ "learning_rate": 4.806523063514637e-06,
+ "loss": 0.5481,
+ "step": 1687
+ },
+ {
+ "epoch": 0.7981087470449173,
+ "grad_norm": 2.702014207839966,
+ "learning_rate": 4.806282359221976e-06,
+ "loss": 0.5993,
+ "step": 1688
+ },
+ {
+ "epoch": 0.798581560283688,
+ "grad_norm": 2.383671998977661,
+ "learning_rate": 4.806041511327226e-06,
+ "loss": 0.562,
+ "step": 1689
+ },
+ {
+ "epoch": 0.7990543735224587,
+ "grad_norm": 2.6965041160583496,
+ "learning_rate": 4.8058005198453834e-06,
+ "loss": 0.5955,
+ "step": 1690
+ },
+ {
+ "epoch": 0.7995271867612294,
+ "grad_norm": 2.5906765460968018,
+ "learning_rate": 4.805559384791453e-06,
+ "loss": 0.5151,
+ "step": 1691
+ },
+ {
+ "epoch": 0.8,
+ "grad_norm": 2.5454652309417725,
+ "learning_rate": 4.8053181061804475e-06,
+ "loss": 0.5843,
+ "step": 1692
+ },
+ {
+ "epoch": 0.8004728132387707,
+ "grad_norm": 2.661343812942505,
+ "learning_rate": 4.8050766840273935e-06,
+ "loss": 0.5995,
+ "step": 1693
+ },
+ {
+ "epoch": 0.8009456264775414,
+ "grad_norm": 2.5635924339294434,
+ "learning_rate": 4.8048351183473215e-06,
+ "loss": 0.5676,
+ "step": 1694
+ },
+ {
+ "epoch": 0.8014184397163121,
+ "grad_norm": 2.5936667919158936,
+ "learning_rate": 4.804593409155274e-06,
+ "loss": 0.6291,
+ "step": 1695
+ },
+ {
+ "epoch": 0.8018912529550828,
+ "grad_norm": 2.6902432441711426,
+ "learning_rate": 4.804351556466299e-06,
+ "loss": 0.6114,
+ "step": 1696
+ },
+ {
+ "epoch": 0.8023640661938535,
+ "grad_norm": 2.7764673233032227,
+ "learning_rate": 4.804109560295457e-06,
+ "loss": 0.5768,
+ "step": 1697
+ },
+ {
+ "epoch": 0.8028368794326242,
+ "grad_norm": 2.9587221145629883,
+ "learning_rate": 4.803867420657816e-06,
+ "loss": 0.6048,
+ "step": 1698
+ },
+ {
+ "epoch": 0.8033096926713948,
+ "grad_norm": 2.9238998889923096,
+ "learning_rate": 4.803625137568453e-06,
+ "loss": 0.6329,
+ "step": 1699
+ },
+ {
+ "epoch": 0.8037825059101655,
+ "grad_norm": 2.70473313331604,
+ "learning_rate": 4.803382711042455e-06,
+ "loss": 0.5427,
+ "step": 1700
+ },
+ {
+ "epoch": 0.8042553191489362,
+ "grad_norm": 3.1604979038238525,
+ "learning_rate": 4.803140141094914e-06,
+ "loss": 0.626,
+ "step": 1701
+ },
+ {
+ "epoch": 0.8047281323877069,
+ "grad_norm": 2.9567699432373047,
+ "learning_rate": 4.802897427740936e-06,
+ "loss": 0.5319,
+ "step": 1702
+ },
+ {
+ "epoch": 0.8052009456264776,
+ "grad_norm": 2.90983247756958,
+ "learning_rate": 4.802654570995632e-06,
+ "loss": 0.586,
+ "step": 1703
+ },
+ {
+ "epoch": 0.8056737588652483,
+ "grad_norm": 2.783480167388916,
+ "learning_rate": 4.8024115708741255e-06,
+ "loss": 0.5773,
+ "step": 1704
+ },
+ {
+ "epoch": 0.806146572104019,
+ "grad_norm": 3.3307793140411377,
+ "learning_rate": 4.802168427391547e-06,
+ "loss": 0.6257,
+ "step": 1705
+ },
+ {
+ "epoch": 0.8066193853427897,
+ "grad_norm": 3.0475001335144043,
+ "learning_rate": 4.801925140563034e-06,
+ "loss": 0.6612,
+ "step": 1706
+ },
+ {
+ "epoch": 0.8070921985815603,
+ "grad_norm": 2.8278894424438477,
+ "learning_rate": 4.8016817104037375e-06,
+ "loss": 0.6449,
+ "step": 1707
+ },
+ {
+ "epoch": 0.807565011820331,
+ "grad_norm": 2.760244369506836,
+ "learning_rate": 4.801438136928812e-06,
+ "loss": 0.7007,
+ "step": 1708
+ },
+ {
+ "epoch": 0.8080378250591016,
+ "grad_norm": 2.827634572982788,
+ "learning_rate": 4.801194420153427e-06,
+ "loss": 0.6418,
+ "step": 1709
+ },
+ {
+ "epoch": 0.8085106382978723,
+ "grad_norm": 2.8655009269714355,
+ "learning_rate": 4.800950560092754e-06,
+ "loss": 0.6231,
+ "step": 1710
+ },
+ {
+ "epoch": 0.808983451536643,
+ "grad_norm": 2.738112688064575,
+ "learning_rate": 4.800706556761981e-06,
+ "loss": 0.6463,
+ "step": 1711
+ },
+ {
+ "epoch": 0.8094562647754137,
+ "grad_norm": 2.4781179428100586,
+ "learning_rate": 4.800462410176296e-06,
+ "loss": 0.5365,
+ "step": 1712
+ },
+ {
+ "epoch": 0.8099290780141843,
+ "grad_norm": 2.6049838066101074,
+ "learning_rate": 4.800218120350906e-06,
+ "loss": 0.6035,
+ "step": 1713
+ },
+ {
+ "epoch": 0.810401891252955,
+ "grad_norm": 2.9089980125427246,
+ "learning_rate": 4.79997368730102e-06,
+ "loss": 0.5828,
+ "step": 1714
+ },
+ {
+ "epoch": 0.8108747044917257,
+ "grad_norm": 2.831871747970581,
+ "learning_rate": 4.799729111041857e-06,
+ "loss": 0.5953,
+ "step": 1715
+ },
+ {
+ "epoch": 0.8113475177304964,
+ "grad_norm": 2.5611300468444824,
+ "learning_rate": 4.799484391588647e-06,
+ "loss": 0.6302,
+ "step": 1716
+ },
+ {
+ "epoch": 0.8118203309692671,
+ "grad_norm": 2.744070053100586,
+ "learning_rate": 4.799239528956625e-06,
+ "loss": 0.5561,
+ "step": 1717
+ },
+ {
+ "epoch": 0.8122931442080378,
+ "grad_norm": 2.7344231605529785,
+ "learning_rate": 4.798994523161041e-06,
+ "loss": 0.6317,
+ "step": 1718
+ },
+ {
+ "epoch": 0.8127659574468085,
+ "grad_norm": 2.3420889377593994,
+ "learning_rate": 4.798749374217149e-06,
+ "loss": 0.5415,
+ "step": 1719
+ },
+ {
+ "epoch": 0.8132387706855791,
+ "grad_norm": 2.57384991645813,
+ "learning_rate": 4.798504082140212e-06,
+ "loss": 0.6383,
+ "step": 1720
+ },
+ {
+ "epoch": 0.8137115839243498,
+ "grad_norm": 2.8819844722747803,
+ "learning_rate": 4.798258646945505e-06,
+ "loss": 0.6355,
+ "step": 1721
+ },
+ {
+ "epoch": 0.8141843971631205,
+ "grad_norm": 2.908123254776001,
+ "learning_rate": 4.79801306864831e-06,
+ "loss": 0.701,
+ "step": 1722
+ },
+ {
+ "epoch": 0.8146572104018912,
+ "grad_norm": 2.6500701904296875,
+ "learning_rate": 4.797767347263917e-06,
+ "loss": 0.6152,
+ "step": 1723
+ },
+ {
+ "epoch": 0.8151300236406619,
+ "grad_norm": 2.5513017177581787,
+ "learning_rate": 4.797521482807628e-06,
+ "loss": 0.6241,
+ "step": 1724
+ },
+ {
+ "epoch": 0.8156028368794326,
+ "grad_norm": 2.6239185333251953,
+ "learning_rate": 4.7972754752947495e-06,
+ "loss": 0.6072,
+ "step": 1725
+ },
+ {
+ "epoch": 0.8160756501182033,
+ "grad_norm": 2.673436403274536,
+ "learning_rate": 4.797029324740601e-06,
+ "loss": 0.5802,
+ "step": 1726
+ },
+ {
+ "epoch": 0.816548463356974,
+ "grad_norm": 2.533831834793091,
+ "learning_rate": 4.796783031160508e-06,
+ "loss": 0.5566,
+ "step": 1727
+ },
+ {
+ "epoch": 0.8170212765957446,
+ "grad_norm": 2.9806582927703857,
+ "learning_rate": 4.796536594569807e-06,
+ "loss": 0.6945,
+ "step": 1728
+ },
+ {
+ "epoch": 0.8174940898345153,
+ "grad_norm": 2.7093560695648193,
+ "learning_rate": 4.796290014983842e-06,
+ "loss": 0.7143,
+ "step": 1729
+ },
+ {
+ "epoch": 0.817966903073286,
+ "grad_norm": 2.814507246017456,
+ "learning_rate": 4.796043292417967e-06,
+ "loss": 0.6122,
+ "step": 1730
+ },
+ {
+ "epoch": 0.8184397163120567,
+ "grad_norm": 2.537156820297241,
+ "learning_rate": 4.795796426887543e-06,
+ "loss": 0.6229,
+ "step": 1731
+ },
+ {
+ "epoch": 0.8189125295508274,
+ "grad_norm": 2.4878013134002686,
+ "learning_rate": 4.795549418407944e-06,
+ "loss": 0.5442,
+ "step": 1732
+ },
+ {
+ "epoch": 0.8193853427895981,
+ "grad_norm": 2.839383363723755,
+ "learning_rate": 4.795302266994548e-06,
+ "loss": 0.6717,
+ "step": 1733
+ },
+ {
+ "epoch": 0.8198581560283688,
+ "grad_norm": 3.1981801986694336,
+ "learning_rate": 4.795054972662744e-06,
+ "loss": 0.6596,
+ "step": 1734
+ },
+ {
+ "epoch": 0.8203309692671394,
+ "grad_norm": 2.781730890274048,
+ "learning_rate": 4.79480753542793e-06,
+ "loss": 0.5845,
+ "step": 1735
+ },
+ {
+ "epoch": 0.8208037825059101,
+ "grad_norm": 2.689948558807373,
+ "learning_rate": 4.794559955305513e-06,
+ "loss": 0.5928,
+ "step": 1736
+ },
+ {
+ "epoch": 0.8212765957446808,
+ "grad_norm": 2.7267637252807617,
+ "learning_rate": 4.7943122323109105e-06,
+ "loss": 0.5224,
+ "step": 1737
+ },
+ {
+ "epoch": 0.8217494089834515,
+ "grad_norm": 2.4346601963043213,
+ "learning_rate": 4.794064366459544e-06,
+ "loss": 0.6431,
+ "step": 1738
+ },
+ {
+ "epoch": 0.8222222222222222,
+ "grad_norm": 2.7440176010131836,
+ "learning_rate": 4.793816357766849e-06,
+ "loss": 0.6083,
+ "step": 1739
+ },
+ {
+ "epoch": 0.8226950354609929,
+ "grad_norm": 2.6558027267456055,
+ "learning_rate": 4.793568206248268e-06,
+ "loss": 0.698,
+ "step": 1740
+ },
+ {
+ "epoch": 0.8231678486997636,
+ "grad_norm": 2.591658353805542,
+ "learning_rate": 4.793319911919251e-06,
+ "loss": 0.6601,
+ "step": 1741
+ },
+ {
+ "epoch": 0.8236406619385342,
+ "grad_norm": 2.5431172847747803,
+ "learning_rate": 4.79307147479526e-06,
+ "loss": 0.5917,
+ "step": 1742
+ },
+ {
+ "epoch": 0.8241134751773049,
+ "grad_norm": 2.7335588932037354,
+ "learning_rate": 4.792822894891762e-06,
+ "loss": 0.5925,
+ "step": 1743
+ },
+ {
+ "epoch": 0.8245862884160756,
+ "grad_norm": 2.2500839233398438,
+ "learning_rate": 4.792574172224237e-06,
+ "loss": 0.4984,
+ "step": 1744
+ },
+ {
+ "epoch": 0.8250591016548463,
+ "grad_norm": 2.691343069076538,
+ "learning_rate": 4.79232530680817e-06,
+ "loss": 0.6262,
+ "step": 1745
+ },
+ {
+ "epoch": 0.825531914893617,
+ "grad_norm": 2.612204074859619,
+ "learning_rate": 4.792076298659058e-06,
+ "loss": 0.5822,
+ "step": 1746
+ },
+ {
+ "epoch": 0.8260047281323877,
+ "grad_norm": 3.0163519382476807,
+ "learning_rate": 4.791827147792406e-06,
+ "loss": 0.6263,
+ "step": 1747
+ },
+ {
+ "epoch": 0.8264775413711584,
+ "grad_norm": 2.742183208465576,
+ "learning_rate": 4.791577854223727e-06,
+ "loss": 0.6628,
+ "step": 1748
+ },
+ {
+ "epoch": 0.826950354609929,
+ "grad_norm": 2.872213840484619,
+ "learning_rate": 4.791328417968542e-06,
+ "loss": 0.6332,
+ "step": 1749
+ },
+ {
+ "epoch": 0.8274231678486997,
+ "grad_norm": 2.725006580352783,
+ "learning_rate": 4.7910788390423844e-06,
+ "loss": 0.6266,
+ "step": 1750
+ },
+ {
+ "epoch": 0.8278959810874704,
+ "grad_norm": 3.0366697311401367,
+ "learning_rate": 4.790829117460793e-06,
+ "loss": 0.6403,
+ "step": 1751
+ },
+ {
+ "epoch": 0.8283687943262411,
+ "grad_norm": 2.594881772994995,
+ "learning_rate": 4.790579253239318e-06,
+ "loss": 0.521,
+ "step": 1752
+ },
+ {
+ "epoch": 0.8288416075650118,
+ "grad_norm": 2.4496347904205322,
+ "learning_rate": 4.790329246393517e-06,
+ "loss": 0.54,
+ "step": 1753
+ },
+ {
+ "epoch": 0.8293144208037825,
+ "grad_norm": 3.102278470993042,
+ "learning_rate": 4.790079096938956e-06,
+ "loss": 0.6142,
+ "step": 1754
+ },
+ {
+ "epoch": 0.8297872340425532,
+ "grad_norm": 2.4645912647247314,
+ "learning_rate": 4.789828804891212e-06,
+ "loss": 0.5212,
+ "step": 1755
+ },
+ {
+ "epoch": 0.8302600472813239,
+ "grad_norm": 2.7482516765594482,
+ "learning_rate": 4.789578370265868e-06,
+ "loss": 0.6712,
+ "step": 1756
+ },
+ {
+ "epoch": 0.8307328605200945,
+ "grad_norm": 2.61360502243042,
+ "learning_rate": 4.7893277930785195e-06,
+ "loss": 0.6367,
+ "step": 1757
+ },
+ {
+ "epoch": 0.8312056737588652,
+ "grad_norm": 2.79028058052063,
+ "learning_rate": 4.789077073344767e-06,
+ "loss": 0.5099,
+ "step": 1758
+ },
+ {
+ "epoch": 0.8316784869976359,
+ "grad_norm": 2.647662401199341,
+ "learning_rate": 4.788826211080222e-06,
+ "loss": 0.6698,
+ "step": 1759
+ },
+ {
+ "epoch": 0.8321513002364066,
+ "grad_norm": 3.0214831829071045,
+ "learning_rate": 4.7885752063005055e-06,
+ "loss": 0.6121,
+ "step": 1760
+ },
+ {
+ "epoch": 0.8326241134751773,
+ "grad_norm": 2.8244032859802246,
+ "learning_rate": 4.788324059021247e-06,
+ "loss": 0.6921,
+ "step": 1761
+ },
+ {
+ "epoch": 0.833096926713948,
+ "grad_norm": 3.1501076221466064,
+ "learning_rate": 4.788072769258082e-06,
+ "loss": 0.6872,
+ "step": 1762
+ },
+ {
+ "epoch": 0.8335697399527187,
+ "grad_norm": 2.6989903450012207,
+ "learning_rate": 4.7878213370266594e-06,
+ "loss": 0.5884,
+ "step": 1763
+ },
+ {
+ "epoch": 0.8340425531914893,
+ "grad_norm": 2.6982665061950684,
+ "learning_rate": 4.787569762342633e-06,
+ "loss": 0.6112,
+ "step": 1764
+ },
+ {
+ "epoch": 0.83451536643026,
+ "grad_norm": 2.6918323040008545,
+ "learning_rate": 4.7873180452216685e-06,
+ "loss": 0.5315,
+ "step": 1765
+ },
+ {
+ "epoch": 0.8349881796690307,
+ "grad_norm": 2.5494401454925537,
+ "learning_rate": 4.78706618567944e-06,
+ "loss": 0.5909,
+ "step": 1766
+ },
+ {
+ "epoch": 0.8354609929078014,
+ "grad_norm": 2.7532095909118652,
+ "learning_rate": 4.786814183731627e-06,
+ "loss": 0.5566,
+ "step": 1767
+ },
+ {
+ "epoch": 0.8359338061465721,
+ "grad_norm": 2.550865888595581,
+ "learning_rate": 4.786562039393923e-06,
+ "loss": 0.555,
+ "step": 1768
+ },
+ {
+ "epoch": 0.8364066193853428,
+ "grad_norm": 2.4477791786193848,
+ "learning_rate": 4.786309752682028e-06,
+ "loss": 0.5844,
+ "step": 1769
+ },
+ {
+ "epoch": 0.8368794326241135,
+ "grad_norm": 2.6982262134552,
+ "learning_rate": 4.7860573236116485e-06,
+ "loss": 0.6136,
+ "step": 1770
+ },
+ {
+ "epoch": 0.8373522458628841,
+ "grad_norm": 2.456263542175293,
+ "learning_rate": 4.785804752198503e-06,
+ "loss": 0.5055,
+ "step": 1771
+ },
+ {
+ "epoch": 0.8378250591016548,
+ "grad_norm": 2.428544521331787,
+ "learning_rate": 4.78555203845832e-06,
+ "loss": 0.5859,
+ "step": 1772
+ },
+ {
+ "epoch": 0.8382978723404255,
+ "grad_norm": 2.1782307624816895,
+ "learning_rate": 4.785299182406833e-06,
+ "loss": 0.5325,
+ "step": 1773
+ },
+ {
+ "epoch": 0.8387706855791962,
+ "grad_norm": 3.137956142425537,
+ "learning_rate": 4.785046184059786e-06,
+ "loss": 0.6097,
+ "step": 1774
+ },
+ {
+ "epoch": 0.8392434988179669,
+ "grad_norm": 2.6269001960754395,
+ "learning_rate": 4.7847930434329336e-06,
+ "loss": 0.5972,
+ "step": 1775
+ },
+ {
+ "epoch": 0.8397163120567376,
+ "grad_norm": 2.732659339904785,
+ "learning_rate": 4.784539760542037e-06,
+ "loss": 0.6054,
+ "step": 1776
+ },
+ {
+ "epoch": 0.8401891252955083,
+ "grad_norm": 2.5346736907958984,
+ "learning_rate": 4.784286335402866e-06,
+ "loss": 0.5521,
+ "step": 1777
+ },
+ {
+ "epoch": 0.840661938534279,
+ "grad_norm": 3.1420228481292725,
+ "learning_rate": 4.784032768031202e-06,
+ "loss": 0.6165,
+ "step": 1778
+ },
+ {
+ "epoch": 0.8411347517730496,
+ "grad_norm": 3.073793411254883,
+ "learning_rate": 4.783779058442831e-06,
+ "loss": 0.6414,
+ "step": 1779
+ },
+ {
+ "epoch": 0.8416075650118203,
+ "grad_norm": 2.6621336936950684,
+ "learning_rate": 4.783525206653554e-06,
+ "loss": 0.5836,
+ "step": 1780
+ },
+ {
+ "epoch": 0.842080378250591,
+ "grad_norm": 2.7029049396514893,
+ "learning_rate": 4.7832712126791745e-06,
+ "loss": 0.5897,
+ "step": 1781
+ },
+ {
+ "epoch": 0.8425531914893617,
+ "grad_norm": 2.4733822345733643,
+ "learning_rate": 4.783017076535509e-06,
+ "loss": 0.5913,
+ "step": 1782
+ },
+ {
+ "epoch": 0.8430260047281324,
+ "grad_norm": 2.8119473457336426,
+ "learning_rate": 4.782762798238381e-06,
+ "loss": 0.6105,
+ "step": 1783
+ },
+ {
+ "epoch": 0.8434988179669031,
+ "grad_norm": 2.5290818214416504,
+ "learning_rate": 4.782508377803622e-06,
+ "loss": 0.6119,
+ "step": 1784
+ },
+ {
+ "epoch": 0.8439716312056738,
+ "grad_norm": 3.193472385406494,
+ "learning_rate": 4.782253815247076e-06,
+ "loss": 0.6665,
+ "step": 1785
+ },
+ {
+ "epoch": 0.8444444444444444,
+ "grad_norm": 3.206759452819824,
+ "learning_rate": 4.781999110584592e-06,
+ "loss": 0.6012,
+ "step": 1786
+ },
+ {
+ "epoch": 0.8449172576832151,
+ "grad_norm": 2.6227457523345947,
+ "learning_rate": 4.781744263832029e-06,
+ "loss": 0.5845,
+ "step": 1787
+ },
+ {
+ "epoch": 0.8453900709219858,
+ "grad_norm": 2.838365316390991,
+ "learning_rate": 4.781489275005257e-06,
+ "loss": 0.5695,
+ "step": 1788
+ },
+ {
+ "epoch": 0.8458628841607565,
+ "grad_norm": 2.8348326683044434,
+ "learning_rate": 4.78123414412015e-06,
+ "loss": 0.6136,
+ "step": 1789
+ },
+ {
+ "epoch": 0.8463356973995272,
+ "grad_norm": 2.5698344707489014,
+ "learning_rate": 4.780978871192597e-06,
+ "loss": 0.6576,
+ "step": 1790
+ },
+ {
+ "epoch": 0.8468085106382979,
+ "grad_norm": 2.5198330879211426,
+ "learning_rate": 4.780723456238492e-06,
+ "loss": 0.5521,
+ "step": 1791
+ },
+ {
+ "epoch": 0.8472813238770686,
+ "grad_norm": 3.001325845718384,
+ "learning_rate": 4.780467899273737e-06,
+ "loss": 0.6075,
+ "step": 1792
+ },
+ {
+ "epoch": 0.8477541371158392,
+ "grad_norm": 2.7732746601104736,
+ "learning_rate": 4.780212200314247e-06,
+ "loss": 0.6245,
+ "step": 1793
+ },
+ {
+ "epoch": 0.8482269503546099,
+ "grad_norm": 2.6950337886810303,
+ "learning_rate": 4.77995635937594e-06,
+ "loss": 0.5723,
+ "step": 1794
+ },
+ {
+ "epoch": 0.8486997635933806,
+ "grad_norm": 2.82051420211792,
+ "learning_rate": 4.779700376474749e-06,
+ "loss": 0.6184,
+ "step": 1795
+ },
+ {
+ "epoch": 0.8491725768321513,
+ "grad_norm": 2.757791757583618,
+ "learning_rate": 4.779444251626611e-06,
+ "loss": 0.608,
+ "step": 1796
+ },
+ {
+ "epoch": 0.849645390070922,
+ "grad_norm": 2.394108533859253,
+ "learning_rate": 4.779187984847475e-06,
+ "loss": 0.6174,
+ "step": 1797
+ },
+ {
+ "epoch": 0.8501182033096927,
+ "grad_norm": 2.427562713623047,
+ "learning_rate": 4.778931576153296e-06,
+ "loss": 0.5618,
+ "step": 1798
+ },
+ {
+ "epoch": 0.8505910165484634,
+ "grad_norm": 2.891268491744995,
+ "learning_rate": 4.778675025560042e-06,
+ "loss": 0.6865,
+ "step": 1799
+ },
+ {
+ "epoch": 0.851063829787234,
+ "grad_norm": 2.665534257888794,
+ "learning_rate": 4.778418333083685e-06,
+ "loss": 0.5852,
+ "step": 1800
+ },
+ {
+ "epoch": 0.8515366430260047,
+ "grad_norm": 2.5492889881134033,
+ "learning_rate": 4.7781614987402095e-06,
+ "loss": 0.5161,
+ "step": 1801
+ },
+ {
+ "epoch": 0.8520094562647754,
+ "grad_norm": 2.400177001953125,
+ "learning_rate": 4.777904522545607e-06,
+ "loss": 0.5128,
+ "step": 1802
+ },
+ {
+ "epoch": 0.8524822695035461,
+ "grad_norm": 2.3949809074401855,
+ "learning_rate": 4.777647404515878e-06,
+ "loss": 0.571,
+ "step": 1803
+ },
+ {
+ "epoch": 0.8529550827423168,
+ "grad_norm": 2.3624472618103027,
+ "learning_rate": 4.7773901446670325e-06,
+ "loss": 0.5486,
+ "step": 1804
+ },
+ {
+ "epoch": 0.8534278959810875,
+ "grad_norm": 2.711366891860962,
+ "learning_rate": 4.7771327430150885e-06,
+ "loss": 0.5667,
+ "step": 1805
+ },
+ {
+ "epoch": 0.8539007092198582,
+ "grad_norm": 2.7681493759155273,
+ "learning_rate": 4.776875199576073e-06,
+ "loss": 0.5686,
+ "step": 1806
+ },
+ {
+ "epoch": 0.8543735224586289,
+ "grad_norm": 3.0369436740875244,
+ "learning_rate": 4.776617514366023e-06,
+ "loss": 0.6635,
+ "step": 1807
+ },
+ {
+ "epoch": 0.8548463356973995,
+ "grad_norm": 2.919649600982666,
+ "learning_rate": 4.776359687400983e-06,
+ "loss": 0.5749,
+ "step": 1808
+ },
+ {
+ "epoch": 0.8553191489361702,
+ "grad_norm": 2.7986185550689697,
+ "learning_rate": 4.776101718697007e-06,
+ "loss": 0.559,
+ "step": 1809
+ },
+ {
+ "epoch": 0.8557919621749409,
+ "grad_norm": 2.5951223373413086,
+ "learning_rate": 4.775843608270158e-06,
+ "loss": 0.5654,
+ "step": 1810
+ },
+ {
+ "epoch": 0.8562647754137116,
+ "grad_norm": 2.674138069152832,
+ "learning_rate": 4.775585356136505e-06,
+ "loss": 0.5286,
+ "step": 1811
+ },
+ {
+ "epoch": 0.8567375886524823,
+ "grad_norm": 3.045437812805176,
+ "learning_rate": 4.775326962312131e-06,
+ "loss": 0.6185,
+ "step": 1812
+ },
+ {
+ "epoch": 0.857210401891253,
+ "grad_norm": 2.6145293712615967,
+ "learning_rate": 4.775068426813124e-06,
+ "loss": 0.6075,
+ "step": 1813
+ },
+ {
+ "epoch": 0.8576832151300237,
+ "grad_norm": 2.6320106983184814,
+ "learning_rate": 4.7748097496555824e-06,
+ "loss": 0.561,
+ "step": 1814
+ },
+ {
+ "epoch": 0.8581560283687943,
+ "grad_norm": 2.5038623809814453,
+ "learning_rate": 4.774550930855612e-06,
+ "loss": 0.593,
+ "step": 1815
+ },
+ {
+ "epoch": 0.858628841607565,
+ "grad_norm": 2.8168089389801025,
+ "learning_rate": 4.774291970429329e-06,
+ "loss": 0.5196,
+ "step": 1816
+ },
+ {
+ "epoch": 0.8591016548463357,
+ "grad_norm": 2.778130292892456,
+ "learning_rate": 4.774032868392858e-06,
+ "loss": 0.5984,
+ "step": 1817
+ },
+ {
+ "epoch": 0.8595744680851064,
+ "grad_norm": 2.536458730697632,
+ "learning_rate": 4.7737736247623305e-06,
+ "loss": 0.568,
+ "step": 1818
+ },
+ {
+ "epoch": 0.8600472813238771,
+ "grad_norm": 2.6669719219207764,
+ "learning_rate": 4.77351423955389e-06,
+ "loss": 0.6233,
+ "step": 1819
+ },
+ {
+ "epoch": 0.8605200945626478,
+ "grad_norm": 2.578242540359497,
+ "learning_rate": 4.773254712783687e-06,
+ "loss": 0.579,
+ "step": 1820
+ },
+ {
+ "epoch": 0.8609929078014185,
+ "grad_norm": 2.816664457321167,
+ "learning_rate": 4.772995044467881e-06,
+ "loss": 0.6635,
+ "step": 1821
+ },
+ {
+ "epoch": 0.8614657210401891,
+ "grad_norm": 3.1111979484558105,
+ "learning_rate": 4.77273523462264e-06,
+ "loss": 0.6372,
+ "step": 1822
+ },
+ {
+ "epoch": 0.8619385342789598,
+ "grad_norm": 2.764552354812622,
+ "learning_rate": 4.772475283264142e-06,
+ "loss": 0.6216,
+ "step": 1823
+ },
+ {
+ "epoch": 0.8624113475177305,
+ "grad_norm": 2.9126830101013184,
+ "learning_rate": 4.772215190408572e-06,
+ "loss": 0.6396,
+ "step": 1824
+ },
+ {
+ "epoch": 0.8628841607565012,
+ "grad_norm": 2.7502307891845703,
+ "learning_rate": 4.7719549560721264e-06,
+ "loss": 0.6186,
+ "step": 1825
+ },
+ {
+ "epoch": 0.8633569739952719,
+ "grad_norm": 2.6279006004333496,
+ "learning_rate": 4.771694580271007e-06,
+ "loss": 0.5557,
+ "step": 1826
+ },
+ {
+ "epoch": 0.8638297872340426,
+ "grad_norm": 2.996563196182251,
+ "learning_rate": 4.7714340630214276e-06,
+ "loss": 0.6259,
+ "step": 1827
+ },
+ {
+ "epoch": 0.8643026004728133,
+ "grad_norm": 3.231323480606079,
+ "learning_rate": 4.771173404339609e-06,
+ "loss": 0.5473,
+ "step": 1828
+ },
+ {
+ "epoch": 0.864775413711584,
+ "grad_norm": 3.143519878387451,
+ "learning_rate": 4.770912604241781e-06,
+ "loss": 0.593,
+ "step": 1829
+ },
+ {
+ "epoch": 0.8652482269503546,
+ "grad_norm": 2.515484094619751,
+ "learning_rate": 4.770651662744184e-06,
+ "loss": 0.538,
+ "step": 1830
+ },
+ {
+ "epoch": 0.8657210401891253,
+ "grad_norm": 2.629058837890625,
+ "learning_rate": 4.770390579863064e-06,
+ "loss": 0.5745,
+ "step": 1831
+ },
+ {
+ "epoch": 0.866193853427896,
+ "grad_norm": 2.5826802253723145,
+ "learning_rate": 4.770129355614677e-06,
+ "loss": 0.6397,
+ "step": 1832
+ },
+ {
+ "epoch": 0.8666666666666667,
+ "grad_norm": 2.954623222351074,
+ "learning_rate": 4.769867990015289e-06,
+ "loss": 0.6106,
+ "step": 1833
+ },
+ {
+ "epoch": 0.8671394799054374,
+ "grad_norm": 2.742192268371582,
+ "learning_rate": 4.769606483081175e-06,
+ "loss": 0.6902,
+ "step": 1834
+ },
+ {
+ "epoch": 0.8676122931442081,
+ "grad_norm": 2.2619097232818604,
+ "learning_rate": 4.769344834828618e-06,
+ "loss": 0.5414,
+ "step": 1835
+ },
+ {
+ "epoch": 0.8680851063829788,
+ "grad_norm": 2.7384188175201416,
+ "learning_rate": 4.769083045273908e-06,
+ "loss": 0.5787,
+ "step": 1836
+ },
+ {
+ "epoch": 0.8685579196217494,
+ "grad_norm": 2.6734485626220703,
+ "learning_rate": 4.768821114433346e-06,
+ "loss": 0.5923,
+ "step": 1837
+ },
+ {
+ "epoch": 0.8690307328605201,
+ "grad_norm": 2.286140203475952,
+ "learning_rate": 4.768559042323243e-06,
+ "loss": 0.5822,
+ "step": 1838
+ },
+ {
+ "epoch": 0.8695035460992908,
+ "grad_norm": 3.0243725776672363,
+ "learning_rate": 4.768296828959915e-06,
+ "loss": 0.6623,
+ "step": 1839
+ },
+ {
+ "epoch": 0.8699763593380615,
+ "grad_norm": 2.4026312828063965,
+ "learning_rate": 4.768034474359689e-06,
+ "loss": 0.5554,
+ "step": 1840
+ },
+ {
+ "epoch": 0.8704491725768322,
+ "grad_norm": 2.7469029426574707,
+ "learning_rate": 4.767771978538903e-06,
+ "loss": 0.6316,
+ "step": 1841
+ },
+ {
+ "epoch": 0.8709219858156029,
+ "grad_norm": 2.729659080505371,
+ "learning_rate": 4.767509341513899e-06,
+ "loss": 0.5807,
+ "step": 1842
+ },
+ {
+ "epoch": 0.8713947990543736,
+ "grad_norm": 2.5336945056915283,
+ "learning_rate": 4.76724656330103e-06,
+ "loss": 0.6109,
+ "step": 1843
+ },
+ {
+ "epoch": 0.8718676122931442,
+ "grad_norm": 2.519880533218384,
+ "learning_rate": 4.76698364391666e-06,
+ "loss": 0.5313,
+ "step": 1844
+ },
+ {
+ "epoch": 0.8723404255319149,
+ "grad_norm": 2.698862075805664,
+ "learning_rate": 4.766720583377159e-06,
+ "loss": 0.5953,
+ "step": 1845
+ },
+ {
+ "epoch": 0.8728132387706856,
+ "grad_norm": 3.0195560455322266,
+ "learning_rate": 4.766457381698907e-06,
+ "loss": 0.5965,
+ "step": 1846
+ },
+ {
+ "epoch": 0.8732860520094563,
+ "grad_norm": 2.5972697734832764,
+ "learning_rate": 4.766194038898291e-06,
+ "loss": 0.6014,
+ "step": 1847
+ },
+ {
+ "epoch": 0.873758865248227,
+ "grad_norm": 2.7132294178009033,
+ "learning_rate": 4.76593055499171e-06,
+ "loss": 0.5638,
+ "step": 1848
+ },
+ {
+ "epoch": 0.8742316784869977,
+ "grad_norm": 2.7134575843811035,
+ "learning_rate": 4.765666929995568e-06,
+ "loss": 0.52,
+ "step": 1849
+ },
+ {
+ "epoch": 0.8747044917257684,
+ "grad_norm": 2.3804993629455566,
+ "learning_rate": 4.765403163926282e-06,
+ "loss": 0.5435,
+ "step": 1850
+ },
+ {
+ "epoch": 0.875177304964539,
+ "grad_norm": 2.8782761096954346,
+ "learning_rate": 4.765139256800274e-06,
+ "loss": 0.5843,
+ "step": 1851
+ },
+ {
+ "epoch": 0.8756501182033097,
+ "grad_norm": 2.836209774017334,
+ "learning_rate": 4.764875208633977e-06,
+ "loss": 0.6667,
+ "step": 1852
+ },
+ {
+ "epoch": 0.8761229314420804,
+ "grad_norm": 2.608851194381714,
+ "learning_rate": 4.764611019443831e-06,
+ "loss": 0.5436,
+ "step": 1853
+ },
+ {
+ "epoch": 0.8765957446808511,
+ "grad_norm": 2.788738965988159,
+ "learning_rate": 4.764346689246288e-06,
+ "loss": 0.7331,
+ "step": 1854
+ },
+ {
+ "epoch": 0.8770685579196218,
+ "grad_norm": 2.524277687072754,
+ "learning_rate": 4.764082218057805e-06,
+ "loss": 0.5067,
+ "step": 1855
+ },
+ {
+ "epoch": 0.8775413711583925,
+ "grad_norm": 3.7559316158294678,
+ "learning_rate": 4.763817605894851e-06,
+ "loss": 0.6809,
+ "step": 1856
+ },
+ {
+ "epoch": 0.8780141843971632,
+ "grad_norm": 2.9070613384246826,
+ "learning_rate": 4.763552852773899e-06,
+ "loss": 0.5913,
+ "step": 1857
+ },
+ {
+ "epoch": 0.8784869976359339,
+ "grad_norm": 2.7050609588623047,
+ "learning_rate": 4.7632879587114386e-06,
+ "loss": 0.6074,
+ "step": 1858
+ },
+ {
+ "epoch": 0.8789598108747045,
+ "grad_norm": 2.891134262084961,
+ "learning_rate": 4.76302292372396e-06,
+ "loss": 0.5939,
+ "step": 1859
+ },
+ {
+ "epoch": 0.8794326241134752,
+ "grad_norm": 2.8581702709198,
+ "learning_rate": 4.762757747827968e-06,
+ "loss": 0.5972,
+ "step": 1860
+ },
+ {
+ "epoch": 0.8799054373522459,
+ "grad_norm": 2.8266196250915527,
+ "learning_rate": 4.762492431039971e-06,
+ "loss": 0.5993,
+ "step": 1861
+ },
+ {
+ "epoch": 0.8803782505910166,
+ "grad_norm": 2.4853954315185547,
+ "learning_rate": 4.762226973376493e-06,
+ "loss": 0.6388,
+ "step": 1862
+ },
+ {
+ "epoch": 0.8808510638297873,
+ "grad_norm": 3.2212886810302734,
+ "learning_rate": 4.761961374854059e-06,
+ "loss": 0.6698,
+ "step": 1863
+ },
+ {
+ "epoch": 0.881323877068558,
+ "grad_norm": 3.1254501342773438,
+ "learning_rate": 4.761695635489211e-06,
+ "loss": 0.5263,
+ "step": 1864
+ },
+ {
+ "epoch": 0.8817966903073287,
+ "grad_norm": 2.6891462802886963,
+ "learning_rate": 4.761429755298491e-06,
+ "loss": 0.5359,
+ "step": 1865
+ },
+ {
+ "epoch": 0.8822695035460993,
+ "grad_norm": 2.8557538986206055,
+ "learning_rate": 4.761163734298457e-06,
+ "loss": 0.5933,
+ "step": 1866
+ },
+ {
+ "epoch": 0.88274231678487,
+ "grad_norm": 2.53548264503479,
+ "learning_rate": 4.7608975725056724e-06,
+ "loss": 0.6397,
+ "step": 1867
+ },
+ {
+ "epoch": 0.8832151300236407,
+ "grad_norm": 3.0237956047058105,
+ "learning_rate": 4.76063126993671e-06,
+ "loss": 0.6845,
+ "step": 1868
+ },
+ {
+ "epoch": 0.8836879432624114,
+ "grad_norm": 3.222886800765991,
+ "learning_rate": 4.76036482660815e-06,
+ "loss": 0.6055,
+ "step": 1869
+ },
+ {
+ "epoch": 0.8841607565011821,
+ "grad_norm": 3.1867551803588867,
+ "learning_rate": 4.760098242536584e-06,
+ "loss": 0.6592,
+ "step": 1870
+ },
+ {
+ "epoch": 0.8846335697399527,
+ "grad_norm": 2.782209873199463,
+ "learning_rate": 4.7598315177386115e-06,
+ "loss": 0.5833,
+ "step": 1871
+ },
+ {
+ "epoch": 0.8851063829787233,
+ "grad_norm": 2.899871587753296,
+ "learning_rate": 4.759564652230838e-06,
+ "loss": 0.6129,
+ "step": 1872
+ },
+ {
+ "epoch": 0.885579196217494,
+ "grad_norm": 2.5690579414367676,
+ "learning_rate": 4.759297646029882e-06,
+ "loss": 0.5827,
+ "step": 1873
+ },
+ {
+ "epoch": 0.8860520094562647,
+ "grad_norm": 2.666130304336548,
+ "learning_rate": 4.759030499152368e-06,
+ "loss": 0.5272,
+ "step": 1874
+ },
+ {
+ "epoch": 0.8865248226950354,
+ "grad_norm": 2.7030911445617676,
+ "learning_rate": 4.758763211614932e-06,
+ "loss": 0.6415,
+ "step": 1875
+ },
+ {
+ "epoch": 0.8869976359338061,
+ "grad_norm": 2.717512845993042,
+ "learning_rate": 4.7584957834342135e-06,
+ "loss": 0.5827,
+ "step": 1876
+ },
+ {
+ "epoch": 0.8874704491725768,
+ "grad_norm": 2.665823459625244,
+ "learning_rate": 4.758228214626867e-06,
+ "loss": 0.6209,
+ "step": 1877
+ },
+ {
+ "epoch": 0.8879432624113475,
+ "grad_norm": 2.636653184890747,
+ "learning_rate": 4.75796050520955e-06,
+ "loss": 0.6413,
+ "step": 1878
+ },
+ {
+ "epoch": 0.8884160756501182,
+ "grad_norm": 2.585115671157837,
+ "learning_rate": 4.7576926551989345e-06,
+ "loss": 0.5518,
+ "step": 1879
+ },
+ {
+ "epoch": 0.8888888888888888,
+ "grad_norm": 2.808526039123535,
+ "learning_rate": 4.757424664611697e-06,
+ "loss": 0.5717,
+ "step": 1880
+ },
+ {
+ "epoch": 0.8893617021276595,
+ "grad_norm": 3.5957939624786377,
+ "learning_rate": 4.757156533464524e-06,
+ "loss": 0.6323,
+ "step": 1881
+ },
+ {
+ "epoch": 0.8898345153664302,
+ "grad_norm": 2.5003883838653564,
+ "learning_rate": 4.756888261774111e-06,
+ "loss": 0.5937,
+ "step": 1882
+ },
+ {
+ "epoch": 0.8903073286052009,
+ "grad_norm": 2.749061346054077,
+ "learning_rate": 4.756619849557161e-06,
+ "loss": 0.6642,
+ "step": 1883
+ },
+ {
+ "epoch": 0.8907801418439716,
+ "grad_norm": 2.6757891178131104,
+ "learning_rate": 4.756351296830389e-06,
+ "loss": 0.5887,
+ "step": 1884
+ },
+ {
+ "epoch": 0.8912529550827423,
+ "grad_norm": 2.811925172805786,
+ "learning_rate": 4.756082603610516e-06,
+ "loss": 0.6571,
+ "step": 1885
+ },
+ {
+ "epoch": 0.891725768321513,
+ "grad_norm": 2.5054616928100586,
+ "learning_rate": 4.755813769914271e-06,
+ "loss": 0.6312,
+ "step": 1886
+ },
+ {
+ "epoch": 0.8921985815602836,
+ "grad_norm": 2.7518467903137207,
+ "learning_rate": 4.755544795758395e-06,
+ "loss": 0.6685,
+ "step": 1887
+ },
+ {
+ "epoch": 0.8926713947990543,
+ "grad_norm": 2.7527287006378174,
+ "learning_rate": 4.755275681159634e-06,
+ "loss": 0.5886,
+ "step": 1888
+ },
+ {
+ "epoch": 0.893144208037825,
+ "grad_norm": 2.6162452697753906,
+ "learning_rate": 4.755006426134745e-06,
+ "loss": 0.546,
+ "step": 1889
+ },
+ {
+ "epoch": 0.8936170212765957,
+ "grad_norm": 2.4016737937927246,
+ "learning_rate": 4.754737030700495e-06,
+ "loss": 0.5726,
+ "step": 1890
+ },
+ {
+ "epoch": 0.8940898345153664,
+ "grad_norm": 2.528327703475952,
+ "learning_rate": 4.754467494873656e-06,
+ "loss": 0.5682,
+ "step": 1891
+ },
+ {
+ "epoch": 0.8945626477541371,
+ "grad_norm": 2.3139286041259766,
+ "learning_rate": 4.7541978186710115e-06,
+ "loss": 0.6108,
+ "step": 1892
+ },
+ {
+ "epoch": 0.8950354609929078,
+ "grad_norm": 2.7269136905670166,
+ "learning_rate": 4.753928002109354e-06,
+ "loss": 0.5875,
+ "step": 1893
+ },
+ {
+ "epoch": 0.8955082742316784,
+ "grad_norm": 4.425495147705078,
+ "learning_rate": 4.753658045205482e-06,
+ "loss": 0.5572,
+ "step": 1894
+ },
+ {
+ "epoch": 0.8959810874704491,
+ "grad_norm": 2.535409927368164,
+ "learning_rate": 4.753387947976206e-06,
+ "loss": 0.5868,
+ "step": 1895
+ },
+ {
+ "epoch": 0.8964539007092198,
+ "grad_norm": 2.722458600997925,
+ "learning_rate": 4.753117710438343e-06,
+ "loss": 0.5935,
+ "step": 1896
+ },
+ {
+ "epoch": 0.8969267139479905,
+ "grad_norm": 2.743861436843872,
+ "learning_rate": 4.75284733260872e-06,
+ "loss": 0.572,
+ "step": 1897
+ },
+ {
+ "epoch": 0.8973995271867612,
+ "grad_norm": 2.60640549659729,
+ "learning_rate": 4.752576814504173e-06,
+ "loss": 0.567,
+ "step": 1898
+ },
+ {
+ "epoch": 0.8978723404255319,
+ "grad_norm": 2.7486042976379395,
+ "learning_rate": 4.7523061561415435e-06,
+ "loss": 0.5768,
+ "step": 1899
+ },
+ {
+ "epoch": 0.8983451536643026,
+ "grad_norm": 3.8410251140594482,
+ "learning_rate": 4.752035357537686e-06,
+ "loss": 0.6034,
+ "step": 1900
+ },
+ {
+ "epoch": 0.8988179669030733,
+ "grad_norm": 3.0935890674591064,
+ "learning_rate": 4.751764418709462e-06,
+ "loss": 0.5644,
+ "step": 1901
+ },
+ {
+ "epoch": 0.8992907801418439,
+ "grad_norm": 2.7989892959594727,
+ "learning_rate": 4.751493339673742e-06,
+ "loss": 0.656,
+ "step": 1902
+ },
+ {
+ "epoch": 0.8997635933806146,
+ "grad_norm": 3.6940557956695557,
+ "learning_rate": 4.751222120447403e-06,
+ "loss": 0.6632,
+ "step": 1903
+ },
+ {
+ "epoch": 0.9002364066193853,
+ "grad_norm": 2.3428797721862793,
+ "learning_rate": 4.750950761047335e-06,
+ "loss": 0.4485,
+ "step": 1904
+ },
+ {
+ "epoch": 0.900709219858156,
+ "grad_norm": 2.622544050216675,
+ "learning_rate": 4.750679261490432e-06,
+ "loss": 0.5857,
+ "step": 1905
+ },
+ {
+ "epoch": 0.9011820330969267,
+ "grad_norm": 2.4911322593688965,
+ "learning_rate": 4.750407621793601e-06,
+ "loss": 0.5618,
+ "step": 1906
+ },
+ {
+ "epoch": 0.9016548463356974,
+ "grad_norm": 2.6434662342071533,
+ "learning_rate": 4.750135841973755e-06,
+ "loss": 0.6057,
+ "step": 1907
+ },
+ {
+ "epoch": 0.902127659574468,
+ "grad_norm": 3.115443706512451,
+ "learning_rate": 4.749863922047817e-06,
+ "loss": 0.6064,
+ "step": 1908
+ },
+ {
+ "epoch": 0.9026004728132387,
+ "grad_norm": 2.5671091079711914,
+ "learning_rate": 4.749591862032718e-06,
+ "loss": 0.5625,
+ "step": 1909
+ },
+ {
+ "epoch": 0.9030732860520094,
+ "grad_norm": 3.2008655071258545,
+ "learning_rate": 4.749319661945398e-06,
+ "loss": 0.5547,
+ "step": 1910
+ },
+ {
+ "epoch": 0.9035460992907801,
+ "grad_norm": 2.905987024307251,
+ "learning_rate": 4.749047321802805e-06,
+ "loss": 0.6033,
+ "step": 1911
+ },
+ {
+ "epoch": 0.9040189125295508,
+ "grad_norm": 3.1456053256988525,
+ "learning_rate": 4.748774841621897e-06,
+ "loss": 0.5651,
+ "step": 1912
+ },
+ {
+ "epoch": 0.9044917257683215,
+ "grad_norm": 2.8116416931152344,
+ "learning_rate": 4.748502221419641e-06,
+ "loss": 0.5853,
+ "step": 1913
+ },
+ {
+ "epoch": 0.9049645390070922,
+ "grad_norm": 3.123835325241089,
+ "learning_rate": 4.748229461213011e-06,
+ "loss": 0.5427,
+ "step": 1914
+ },
+ {
+ "epoch": 0.9054373522458629,
+ "grad_norm": 2.4750146865844727,
+ "learning_rate": 4.747956561018989e-06,
+ "loss": 0.6517,
+ "step": 1915
+ },
+ {
+ "epoch": 0.9059101654846335,
+ "grad_norm": 2.6174299716949463,
+ "learning_rate": 4.7476835208545705e-06,
+ "loss": 0.6119,
+ "step": 1916
+ },
+ {
+ "epoch": 0.9063829787234042,
+ "grad_norm": 2.7390382289886475,
+ "learning_rate": 4.747410340736755e-06,
+ "loss": 0.5664,
+ "step": 1917
+ },
+ {
+ "epoch": 0.9068557919621749,
+ "grad_norm": 2.7940444946289062,
+ "learning_rate": 4.747137020682552e-06,
+ "loss": 0.5628,
+ "step": 1918
+ },
+ {
+ "epoch": 0.9073286052009456,
+ "grad_norm": 2.477365016937256,
+ "learning_rate": 4.7468635607089795e-06,
+ "loss": 0.5261,
+ "step": 1919
+ },
+ {
+ "epoch": 0.9078014184397163,
+ "grad_norm": 2.7016685009002686,
+ "learning_rate": 4.746589960833066e-06,
+ "loss": 0.5576,
+ "step": 1920
+ },
+ {
+ "epoch": 0.908274231678487,
+ "grad_norm": 2.8806519508361816,
+ "learning_rate": 4.746316221071846e-06,
+ "loss": 0.5925,
+ "step": 1921
+ },
+ {
+ "epoch": 0.9087470449172577,
+ "grad_norm": 3.0315234661102295,
+ "learning_rate": 4.746042341442365e-06,
+ "loss": 0.6142,
+ "step": 1922
+ },
+ {
+ "epoch": 0.9092198581560283,
+ "grad_norm": 4.2446160316467285,
+ "learning_rate": 4.745768321961676e-06,
+ "loss": 0.5352,
+ "step": 1923
+ },
+ {
+ "epoch": 0.909692671394799,
+ "grad_norm": 2.6517012119293213,
+ "learning_rate": 4.745494162646841e-06,
+ "loss": 0.6118,
+ "step": 1924
+ },
+ {
+ "epoch": 0.9101654846335697,
+ "grad_norm": 2.774900197982788,
+ "learning_rate": 4.7452198635149304e-06,
+ "loss": 0.572,
+ "step": 1925
+ },
+ {
+ "epoch": 0.9106382978723404,
+ "grad_norm": 3.0133683681488037,
+ "learning_rate": 4.744945424583024e-06,
+ "loss": 0.5897,
+ "step": 1926
+ },
+ {
+ "epoch": 0.9111111111111111,
+ "grad_norm": 2.7344839572906494,
+ "learning_rate": 4.744670845868211e-06,
+ "loss": 0.6207,
+ "step": 1927
+ },
+ {
+ "epoch": 0.9115839243498818,
+ "grad_norm": 2.636578321456909,
+ "learning_rate": 4.744396127387586e-06,
+ "loss": 0.6687,
+ "step": 1928
+ },
+ {
+ "epoch": 0.9120567375886525,
+ "grad_norm": 2.8663458824157715,
+ "learning_rate": 4.744121269158255e-06,
+ "loss": 0.5002,
+ "step": 1929
+ },
+ {
+ "epoch": 0.9125295508274232,
+ "grad_norm": 2.661079168319702,
+ "learning_rate": 4.743846271197333e-06,
+ "loss": 0.5848,
+ "step": 1930
+ },
+ {
+ "epoch": 0.9130023640661938,
+ "grad_norm": 2.881256341934204,
+ "learning_rate": 4.743571133521943e-06,
+ "loss": 0.5911,
+ "step": 1931
+ },
+ {
+ "epoch": 0.9134751773049645,
+ "grad_norm": 2.5540573596954346,
+ "learning_rate": 4.743295856149217e-06,
+ "loss": 0.5647,
+ "step": 1932
+ },
+ {
+ "epoch": 0.9139479905437352,
+ "grad_norm": 2.7060387134552,
+ "learning_rate": 4.743020439096293e-06,
+ "loss": 0.6267,
+ "step": 1933
+ },
+ {
+ "epoch": 0.9144208037825059,
+ "grad_norm": 2.694481372833252,
+ "learning_rate": 4.742744882380323e-06,
+ "loss": 0.6283,
+ "step": 1934
+ },
+ {
+ "epoch": 0.9148936170212766,
+ "grad_norm": 2.711555242538452,
+ "learning_rate": 4.7424691860184625e-06,
+ "loss": 0.5784,
+ "step": 1935
+ },
+ {
+ "epoch": 0.9153664302600473,
+ "grad_norm": 2.9077224731445312,
+ "learning_rate": 4.742193350027879e-06,
+ "loss": 0.5948,
+ "step": 1936
+ },
+ {
+ "epoch": 0.915839243498818,
+ "grad_norm": 2.9824187755584717,
+ "learning_rate": 4.7419173744257476e-06,
+ "loss": 0.6115,
+ "step": 1937
+ },
+ {
+ "epoch": 0.9163120567375886,
+ "grad_norm": 2.5127830505371094,
+ "learning_rate": 4.7416412592292515e-06,
+ "loss": 0.5803,
+ "step": 1938
+ },
+ {
+ "epoch": 0.9167848699763593,
+ "grad_norm": 3.1307175159454346,
+ "learning_rate": 4.741365004455583e-06,
+ "loss": 0.5657,
+ "step": 1939
+ },
+ {
+ "epoch": 0.91725768321513,
+ "grad_norm": 2.8205273151397705,
+ "learning_rate": 4.741088610121944e-06,
+ "loss": 0.6145,
+ "step": 1940
+ },
+ {
+ "epoch": 0.9177304964539007,
+ "grad_norm": 2.6119720935821533,
+ "learning_rate": 4.7408120762455444e-06,
+ "loss": 0.6058,
+ "step": 1941
+ },
+ {
+ "epoch": 0.9182033096926714,
+ "grad_norm": 2.421276092529297,
+ "learning_rate": 4.7405354028436025e-06,
+ "loss": 0.5973,
+ "step": 1942
+ },
+ {
+ "epoch": 0.9186761229314421,
+ "grad_norm": 2.9846808910369873,
+ "learning_rate": 4.740258589933346e-06,
+ "loss": 0.6892,
+ "step": 1943
+ },
+ {
+ "epoch": 0.9191489361702128,
+ "grad_norm": 2.6899871826171875,
+ "learning_rate": 4.739981637532009e-06,
+ "loss": 0.5705,
+ "step": 1944
+ },
+ {
+ "epoch": 0.9196217494089834,
+ "grad_norm": 2.8636131286621094,
+ "learning_rate": 4.739704545656839e-06,
+ "loss": 0.5775,
+ "step": 1945
+ },
+ {
+ "epoch": 0.9200945626477541,
+ "grad_norm": 2.7659449577331543,
+ "learning_rate": 4.739427314325087e-06,
+ "loss": 0.5823,
+ "step": 1946
+ },
+ {
+ "epoch": 0.9205673758865248,
+ "grad_norm": 4.71295166015625,
+ "learning_rate": 4.739149943554016e-06,
+ "loss": 0.5601,
+ "step": 1947
+ },
+ {
+ "epoch": 0.9210401891252955,
+ "grad_norm": 2.642636775970459,
+ "learning_rate": 4.738872433360896e-06,
+ "loss": 0.5278,
+ "step": 1948
+ },
+ {
+ "epoch": 0.9215130023640662,
+ "grad_norm": 2.4658217430114746,
+ "learning_rate": 4.7385947837630065e-06,
+ "loss": 0.6392,
+ "step": 1949
+ },
+ {
+ "epoch": 0.9219858156028369,
+ "grad_norm": 2.851602792739868,
+ "learning_rate": 4.738316994777636e-06,
+ "loss": 0.6164,
+ "step": 1950
+ },
+ {
+ "epoch": 0.9224586288416076,
+ "grad_norm": 2.394226551055908,
+ "learning_rate": 4.738039066422081e-06,
+ "loss": 0.5556,
+ "step": 1951
+ },
+ {
+ "epoch": 0.9229314420803783,
+ "grad_norm": 2.7985100746154785,
+ "learning_rate": 4.737760998713647e-06,
+ "loss": 0.5799,
+ "step": 1952
+ },
+ {
+ "epoch": 0.9234042553191489,
+ "grad_norm": 2.5974674224853516,
+ "learning_rate": 4.737482791669648e-06,
+ "loss": 0.6984,
+ "step": 1953
+ },
+ {
+ "epoch": 0.9238770685579196,
+ "grad_norm": 2.707636594772339,
+ "learning_rate": 4.737204445307406e-06,
+ "loss": 0.5548,
+ "step": 1954
+ },
+ {
+ "epoch": 0.9243498817966903,
+ "grad_norm": 2.7882707118988037,
+ "learning_rate": 4.736925959644254e-06,
+ "loss": 0.6026,
+ "step": 1955
+ },
+ {
+ "epoch": 0.924822695035461,
+ "grad_norm": 2.474482774734497,
+ "learning_rate": 4.7366473346975304e-06,
+ "loss": 0.5832,
+ "step": 1956
+ },
+ {
+ "epoch": 0.9252955082742317,
+ "grad_norm": 2.6196324825286865,
+ "learning_rate": 4.736368570484585e-06,
+ "loss": 0.5861,
+ "step": 1957
+ },
+ {
+ "epoch": 0.9257683215130024,
+ "grad_norm": 2.826864004135132,
+ "learning_rate": 4.736089667022775e-06,
+ "loss": 0.6173,
+ "step": 1958
+ },
+ {
+ "epoch": 0.926241134751773,
+ "grad_norm": 2.414473056793213,
+ "learning_rate": 4.735810624329466e-06,
+ "loss": 0.5753,
+ "step": 1959
+ },
+ {
+ "epoch": 0.9267139479905437,
+ "grad_norm": 2.8037970066070557,
+ "learning_rate": 4.7355314424220335e-06,
+ "loss": 0.6207,
+ "step": 1960
+ },
+ {
+ "epoch": 0.9271867612293144,
+ "grad_norm": 2.645458698272705,
+ "learning_rate": 4.735252121317861e-06,
+ "loss": 0.5959,
+ "step": 1961
+ },
+ {
+ "epoch": 0.9276595744680851,
+ "grad_norm": 2.7983884811401367,
+ "learning_rate": 4.734972661034339e-06,
+ "loss": 0.5696,
+ "step": 1962
+ },
+ {
+ "epoch": 0.9281323877068558,
+ "grad_norm": 3.0568997859954834,
+ "learning_rate": 4.73469306158887e-06,
+ "loss": 0.6194,
+ "step": 1963
+ },
+ {
+ "epoch": 0.9286052009456265,
+ "grad_norm": 2.7205135822296143,
+ "learning_rate": 4.734413322998863e-06,
+ "loss": 0.5292,
+ "step": 1964
+ },
+ {
+ "epoch": 0.9290780141843972,
+ "grad_norm": 3.3168489933013916,
+ "learning_rate": 4.734133445281735e-06,
+ "loss": 0.5654,
+ "step": 1965
+ },
+ {
+ "epoch": 0.9295508274231679,
+ "grad_norm": 3.0095653533935547,
+ "learning_rate": 4.733853428454916e-06,
+ "loss": 0.6508,
+ "step": 1966
+ },
+ {
+ "epoch": 0.9300236406619385,
+ "grad_norm": 2.7726712226867676,
+ "learning_rate": 4.733573272535838e-06,
+ "loss": 0.644,
+ "step": 1967
+ },
+ {
+ "epoch": 0.9304964539007092,
+ "grad_norm": 2.474397659301758,
+ "learning_rate": 4.7332929775419456e-06,
+ "loss": 0.5479,
+ "step": 1968
+ },
+ {
+ "epoch": 0.9309692671394799,
+ "grad_norm": 2.4518635272979736,
+ "learning_rate": 4.733012543490693e-06,
+ "loss": 0.6,
+ "step": 1969
+ },
+ {
+ "epoch": 0.9314420803782506,
+ "grad_norm": 2.9292192459106445,
+ "learning_rate": 4.73273197039954e-06,
+ "loss": 0.6647,
+ "step": 1970
+ },
+ {
+ "epoch": 0.9319148936170213,
+ "grad_norm": 2.425004720687866,
+ "learning_rate": 4.732451258285958e-06,
+ "loss": 0.6338,
+ "step": 1971
+ },
+ {
+ "epoch": 0.932387706855792,
+ "grad_norm": 2.904479503631592,
+ "learning_rate": 4.7321704071674255e-06,
+ "loss": 0.5923,
+ "step": 1972
+ },
+ {
+ "epoch": 0.9328605200945627,
+ "grad_norm": 2.477085590362549,
+ "learning_rate": 4.731889417061428e-06,
+ "loss": 0.5984,
+ "step": 1973
+ },
+ {
+ "epoch": 0.9333333333333333,
+ "grad_norm": 2.585240364074707,
+ "learning_rate": 4.731608287985465e-06,
+ "loss": 0.558,
+ "step": 1974
+ },
+ {
+ "epoch": 0.933806146572104,
+ "grad_norm": 2.658714532852173,
+ "learning_rate": 4.731327019957039e-06,
+ "loss": 0.5567,
+ "step": 1975
+ },
+ {
+ "epoch": 0.9342789598108747,
+ "grad_norm": 2.7593026161193848,
+ "learning_rate": 4.731045612993662e-06,
+ "loss": 0.5772,
+ "step": 1976
+ },
+ {
+ "epoch": 0.9347517730496454,
+ "grad_norm": 2.4386026859283447,
+ "learning_rate": 4.7307640671128585e-06,
+ "loss": 0.6199,
+ "step": 1977
+ },
+ {
+ "epoch": 0.9352245862884161,
+ "grad_norm": 2.681910514831543,
+ "learning_rate": 4.730482382332158e-06,
+ "loss": 0.5971,
+ "step": 1978
+ },
+ {
+ "epoch": 0.9356973995271868,
+ "grad_norm": 3.7593860626220703,
+ "learning_rate": 4.7302005586691e-06,
+ "loss": 0.6346,
+ "step": 1979
+ },
+ {
+ "epoch": 0.9361702127659575,
+ "grad_norm": 2.5789096355438232,
+ "learning_rate": 4.729918596141232e-06,
+ "loss": 0.5684,
+ "step": 1980
+ },
+ {
+ "epoch": 0.9366430260047282,
+ "grad_norm": 3.0607335567474365,
+ "learning_rate": 4.729636494766111e-06,
+ "loss": 0.6223,
+ "step": 1981
+ },
+ {
+ "epoch": 0.9371158392434988,
+ "grad_norm": 2.906643867492676,
+ "learning_rate": 4.729354254561303e-06,
+ "loss": 0.6513,
+ "step": 1982
+ },
+ {
+ "epoch": 0.9375886524822695,
+ "grad_norm": 3.192430019378662,
+ "learning_rate": 4.7290718755443795e-06,
+ "loss": 0.5095,
+ "step": 1983
+ },
+ {
+ "epoch": 0.9380614657210402,
+ "grad_norm": 2.661536931991577,
+ "learning_rate": 4.7287893577329255e-06,
+ "loss": 0.5525,
+ "step": 1984
+ },
+ {
+ "epoch": 0.9385342789598109,
+ "grad_norm": 2.8436734676361084,
+ "learning_rate": 4.728506701144531e-06,
+ "loss": 0.6323,
+ "step": 1985
+ },
+ {
+ "epoch": 0.9390070921985816,
+ "grad_norm": 2.75544810295105,
+ "learning_rate": 4.728223905796796e-06,
+ "loss": 0.6018,
+ "step": 1986
+ },
+ {
+ "epoch": 0.9394799054373523,
+ "grad_norm": 3.0652759075164795,
+ "learning_rate": 4.727940971707329e-06,
+ "loss": 0.62,
+ "step": 1987
+ },
+ {
+ "epoch": 0.939952718676123,
+ "grad_norm": 2.802567720413208,
+ "learning_rate": 4.727657898893747e-06,
+ "loss": 0.5809,
+ "step": 1988
+ },
+ {
+ "epoch": 0.9404255319148936,
+ "grad_norm": 2.6208512783050537,
+ "learning_rate": 4.7273746873736745e-06,
+ "loss": 0.5762,
+ "step": 1989
+ },
+ {
+ "epoch": 0.9408983451536643,
+ "grad_norm": 2.5901873111724854,
+ "learning_rate": 4.727091337164748e-06,
+ "loss": 0.6111,
+ "step": 1990
+ },
+ {
+ "epoch": 0.941371158392435,
+ "grad_norm": 3.002347707748413,
+ "learning_rate": 4.726807848284609e-06,
+ "loss": 0.6419,
+ "step": 1991
+ },
+ {
+ "epoch": 0.9418439716312057,
+ "grad_norm": 2.522151470184326,
+ "learning_rate": 4.72652422075091e-06,
+ "loss": 0.642,
+ "step": 1992
+ },
+ {
+ "epoch": 0.9423167848699764,
+ "grad_norm": 2.5571532249450684,
+ "learning_rate": 4.726240454581311e-06,
+ "loss": 0.5729,
+ "step": 1993
+ },
+ {
+ "epoch": 0.9427895981087471,
+ "grad_norm": 2.7704918384552,
+ "learning_rate": 4.72595654979348e-06,
+ "loss": 0.6816,
+ "step": 1994
+ },
+ {
+ "epoch": 0.9432624113475178,
+ "grad_norm": 2.517040491104126,
+ "learning_rate": 4.7256725064050955e-06,
+ "loss": 0.5782,
+ "step": 1995
+ },
+ {
+ "epoch": 0.9437352245862884,
+ "grad_norm": 2.613955020904541,
+ "learning_rate": 4.725388324433843e-06,
+ "loss": 0.6291,
+ "step": 1996
+ },
+ {
+ "epoch": 0.9442080378250591,
+ "grad_norm": 2.848891258239746,
+ "learning_rate": 4.725104003897418e-06,
+ "loss": 0.6544,
+ "step": 1997
+ },
+ {
+ "epoch": 0.9446808510638298,
+ "grad_norm": 3.0162429809570312,
+ "learning_rate": 4.724819544813523e-06,
+ "loss": 0.6301,
+ "step": 1998
+ },
+ {
+ "epoch": 0.9451536643026005,
+ "grad_norm": 2.613614559173584,
+ "learning_rate": 4.72453494719987e-06,
+ "loss": 0.5829,
+ "step": 1999
+ },
+ {
+ "epoch": 0.9456264775413712,
+ "grad_norm": 2.4838767051696777,
+ "learning_rate": 4.724250211074182e-06,
+ "loss": 0.6042,
+ "step": 2000
+ },
+ {
+ "epoch": 0.9460992907801419,
+ "grad_norm": 2.526470899581909,
+ "learning_rate": 4.723965336454185e-06,
+ "loss": 0.6167,
+ "step": 2001
+ },
+ {
+ "epoch": 0.9465721040189126,
+ "grad_norm": 2.504506826400757,
+ "learning_rate": 4.723680323357618e-06,
+ "loss": 0.6061,
+ "step": 2002
+ },
+ {
+ "epoch": 0.9470449172576832,
+ "grad_norm": 3.0547544956207275,
+ "learning_rate": 4.723395171802228e-06,
+ "loss": 0.6619,
+ "step": 2003
+ },
+ {
+ "epoch": 0.9475177304964539,
+ "grad_norm": 2.8692407608032227,
+ "learning_rate": 4.723109881805771e-06,
+ "loss": 0.5985,
+ "step": 2004
+ },
+ {
+ "epoch": 0.9479905437352246,
+ "grad_norm": 2.7929654121398926,
+ "learning_rate": 4.7228244533860094e-06,
+ "loss": 0.5869,
+ "step": 2005
+ },
+ {
+ "epoch": 0.9484633569739953,
+ "grad_norm": 2.764869451522827,
+ "learning_rate": 4.7225388865607146e-06,
+ "loss": 0.6288,
+ "step": 2006
+ },
+ {
+ "epoch": 0.948936170212766,
+ "grad_norm": 2.7656404972076416,
+ "learning_rate": 4.722253181347671e-06,
+ "loss": 0.5831,
+ "step": 2007
+ },
+ {
+ "epoch": 0.9494089834515367,
+ "grad_norm": 2.6698336601257324,
+ "learning_rate": 4.7219673377646635e-06,
+ "loss": 0.6087,
+ "step": 2008
+ },
+ {
+ "epoch": 0.9498817966903074,
+ "grad_norm": 2.524935722351074,
+ "learning_rate": 4.7216813558294946e-06,
+ "loss": 0.5675,
+ "step": 2009
+ },
+ {
+ "epoch": 0.950354609929078,
+ "grad_norm": 2.5998785495758057,
+ "learning_rate": 4.721395235559969e-06,
+ "loss": 0.5667,
+ "step": 2010
+ },
+ {
+ "epoch": 0.9508274231678487,
+ "grad_norm": 2.758021354675293,
+ "learning_rate": 4.721108976973902e-06,
+ "loss": 0.4931,
+ "step": 2011
+ },
+ {
+ "epoch": 0.9513002364066194,
+ "grad_norm": 2.767695903778076,
+ "learning_rate": 4.72082258008912e-06,
+ "loss": 0.5778,
+ "step": 2012
+ },
+ {
+ "epoch": 0.9517730496453901,
+ "grad_norm": 2.982314348220825,
+ "learning_rate": 4.720536044923453e-06,
+ "loss": 0.6096,
+ "step": 2013
+ },
+ {
+ "epoch": 0.9522458628841608,
+ "grad_norm": 2.7608799934387207,
+ "learning_rate": 4.720249371494743e-06,
+ "loss": 0.6242,
+ "step": 2014
+ },
+ {
+ "epoch": 0.9527186761229315,
+ "grad_norm": 2.60054349899292,
+ "learning_rate": 4.71996255982084e-06,
+ "loss": 0.6249,
+ "step": 2015
+ },
+ {
+ "epoch": 0.9531914893617022,
+ "grad_norm": 2.654355764389038,
+ "learning_rate": 4.719675609919603e-06,
+ "loss": 0.6327,
+ "step": 2016
+ },
+ {
+ "epoch": 0.9536643026004729,
+ "grad_norm": 2.589404582977295,
+ "learning_rate": 4.719388521808899e-06,
+ "loss": 0.6357,
+ "step": 2017
+ },
+ {
+ "epoch": 0.9541371158392435,
+ "grad_norm": 2.8016581535339355,
+ "learning_rate": 4.719101295506603e-06,
+ "loss": 0.5901,
+ "step": 2018
+ },
+ {
+ "epoch": 0.9546099290780142,
+ "grad_norm": 3.1408045291900635,
+ "learning_rate": 4.7188139310306e-06,
+ "loss": 0.598,
+ "step": 2019
+ },
+ {
+ "epoch": 0.9550827423167849,
+ "grad_norm": 2.7432665824890137,
+ "learning_rate": 4.718526428398783e-06,
+ "loss": 0.5508,
+ "step": 2020
+ },
+ {
+ "epoch": 0.9555555555555556,
+ "grad_norm": 2.947800874710083,
+ "learning_rate": 4.718238787629053e-06,
+ "loss": 0.6439,
+ "step": 2021
+ },
+ {
+ "epoch": 0.9560283687943263,
+ "grad_norm": 2.50828218460083,
+ "learning_rate": 4.71795100873932e-06,
+ "loss": 0.5441,
+ "step": 2022
+ },
+ {
+ "epoch": 0.956501182033097,
+ "grad_norm": 2.8558974266052246,
+ "learning_rate": 4.717663091747503e-06,
+ "loss": 0.5416,
+ "step": 2023
+ },
+ {
+ "epoch": 0.9569739952718677,
+ "grad_norm": 2.4803316593170166,
+ "learning_rate": 4.71737503667153e-06,
+ "loss": 0.5317,
+ "step": 2024
+ },
+ {
+ "epoch": 0.9574468085106383,
+ "grad_norm": 4.36754035949707,
+ "learning_rate": 4.717086843529336e-06,
+ "loss": 0.5808,
+ "step": 2025
+ },
+ {
+ "epoch": 0.957919621749409,
+ "grad_norm": 2.730185031890869,
+ "learning_rate": 4.7167985123388665e-06,
+ "loss": 0.5257,
+ "step": 2026
+ },
+ {
+ "epoch": 0.9583924349881797,
+ "grad_norm": 2.8136069774627686,
+ "learning_rate": 4.716510043118074e-06,
+ "loss": 0.5836,
+ "step": 2027
+ },
+ {
+ "epoch": 0.9588652482269504,
+ "grad_norm": 2.793975353240967,
+ "learning_rate": 4.71622143588492e-06,
+ "loss": 0.5706,
+ "step": 2028
+ },
+ {
+ "epoch": 0.9593380614657211,
+ "grad_norm": 2.3883821964263916,
+ "learning_rate": 4.7159326906573745e-06,
+ "loss": 0.5291,
+ "step": 2029
+ },
+ {
+ "epoch": 0.9598108747044918,
+ "grad_norm": 2.6135976314544678,
+ "learning_rate": 4.715643807453417e-06,
+ "loss": 0.6199,
+ "step": 2030
+ },
+ {
+ "epoch": 0.9602836879432625,
+ "grad_norm": 2.6245670318603516,
+ "learning_rate": 4.715354786291035e-06,
+ "loss": 0.5585,
+ "step": 2031
+ },
+ {
+ "epoch": 0.9607565011820332,
+ "grad_norm": 2.7870967388153076,
+ "learning_rate": 4.715065627188225e-06,
+ "loss": 0.6196,
+ "step": 2032
+ },
+ {
+ "epoch": 0.9612293144208038,
+ "grad_norm": 2.6983911991119385,
+ "learning_rate": 4.714776330162991e-06,
+ "loss": 0.6424,
+ "step": 2033
+ },
+ {
+ "epoch": 0.9617021276595744,
+ "grad_norm": 2.3221919536590576,
+ "learning_rate": 4.7144868952333465e-06,
+ "loss": 0.568,
+ "step": 2034
+ },
+ {
+ "epoch": 0.9621749408983451,
+ "grad_norm": 2.9408178329467773,
+ "learning_rate": 4.714197322417314e-06,
+ "loss": 0.6175,
+ "step": 2035
+ },
+ {
+ "epoch": 0.9626477541371158,
+ "grad_norm": 2.404057264328003,
+ "learning_rate": 4.713907611732921e-06,
+ "loss": 0.4943,
+ "step": 2036
+ },
+ {
+ "epoch": 0.9631205673758865,
+ "grad_norm": 3.547607660293579,
+ "learning_rate": 4.71361776319821e-06,
+ "loss": 0.5488,
+ "step": 2037
+ },
+ {
+ "epoch": 0.9635933806146572,
+ "grad_norm": 2.679614543914795,
+ "learning_rate": 4.713327776831227e-06,
+ "loss": 0.6234,
+ "step": 2038
+ },
+ {
+ "epoch": 0.9640661938534278,
+ "grad_norm": 2.526914119720459,
+ "learning_rate": 4.7130376526500286e-06,
+ "loss": 0.5891,
+ "step": 2039
+ },
+ {
+ "epoch": 0.9645390070921985,
+ "grad_norm": 2.6953470706939697,
+ "learning_rate": 4.71274739067268e-06,
+ "loss": 0.69,
+ "step": 2040
+ },
+ {
+ "epoch": 0.9650118203309692,
+ "grad_norm": 2.546660900115967,
+ "learning_rate": 4.712456990917254e-06,
+ "loss": 0.6185,
+ "step": 2041
+ },
+ {
+ "epoch": 0.9654846335697399,
+ "grad_norm": 3.3920490741729736,
+ "learning_rate": 4.712166453401832e-06,
+ "loss": 0.587,
+ "step": 2042
+ },
+ {
+ "epoch": 0.9659574468085106,
+ "grad_norm": 2.5961573123931885,
+ "learning_rate": 4.711875778144504e-06,
+ "loss": 0.6105,
+ "step": 2043
+ },
+ {
+ "epoch": 0.9664302600472813,
+ "grad_norm": 2.5111498832702637,
+ "learning_rate": 4.711584965163372e-06,
+ "loss": 0.5533,
+ "step": 2044
+ },
+ {
+ "epoch": 0.966903073286052,
+ "grad_norm": 2.4878132343292236,
+ "learning_rate": 4.7112940144765405e-06,
+ "loss": 0.5604,
+ "step": 2045
+ },
+ {
+ "epoch": 0.9673758865248226,
+ "grad_norm": 2.5714077949523926,
+ "learning_rate": 4.711002926102128e-06,
+ "loss": 0.5794,
+ "step": 2046
+ },
+ {
+ "epoch": 0.9678486997635933,
+ "grad_norm": 2.7069091796875,
+ "learning_rate": 4.710711700058257e-06,
+ "loss": 0.594,
+ "step": 2047
+ },
+ {
+ "epoch": 0.968321513002364,
+ "grad_norm": 2.8104631900787354,
+ "learning_rate": 4.710420336363063e-06,
+ "loss": 0.6247,
+ "step": 2048
+ },
+ {
+ "epoch": 0.9687943262411347,
+ "grad_norm": 2.8464386463165283,
+ "learning_rate": 4.7101288350346865e-06,
+ "loss": 0.6162,
+ "step": 2049
+ },
+ {
+ "epoch": 0.9692671394799054,
+ "grad_norm": 2.7187976837158203,
+ "learning_rate": 4.709837196091279e-06,
+ "loss": 0.6109,
+ "step": 2050
+ },
+ {
+ "epoch": 0.9697399527186761,
+ "grad_norm": 2.556734085083008,
+ "learning_rate": 4.709545419550999e-06,
+ "loss": 0.6297,
+ "step": 2051
+ },
+ {
+ "epoch": 0.9702127659574468,
+ "grad_norm": 2.937195062637329,
+ "learning_rate": 4.709253505432014e-06,
+ "loss": 0.6862,
+ "step": 2052
+ },
+ {
+ "epoch": 0.9706855791962175,
+ "grad_norm": 2.792175531387329,
+ "learning_rate": 4.7089614537525015e-06,
+ "loss": 0.6105,
+ "step": 2053
+ },
+ {
+ "epoch": 0.9711583924349881,
+ "grad_norm": 2.625636100769043,
+ "learning_rate": 4.708669264530644e-06,
+ "loss": 0.5849,
+ "step": 2054
+ },
+ {
+ "epoch": 0.9716312056737588,
+ "grad_norm": 2.6752610206604004,
+ "learning_rate": 4.708376937784637e-06,
+ "loss": 0.5949,
+ "step": 2055
+ },
+ {
+ "epoch": 0.9721040189125295,
+ "grad_norm": 2.6072793006896973,
+ "learning_rate": 4.708084473532681e-06,
+ "loss": 0.5776,
+ "step": 2056
+ },
+ {
+ "epoch": 0.9725768321513002,
+ "grad_norm": 2.728632926940918,
+ "learning_rate": 4.707791871792988e-06,
+ "loss": 0.6352,
+ "step": 2057
+ },
+ {
+ "epoch": 0.9730496453900709,
+ "grad_norm": 2.5841758251190186,
+ "learning_rate": 4.707499132583775e-06,
+ "loss": 0.5488,
+ "step": 2058
+ },
+ {
+ "epoch": 0.9735224586288416,
+ "grad_norm": 2.8464293479919434,
+ "learning_rate": 4.707206255923271e-06,
+ "loss": 0.7051,
+ "step": 2059
+ },
+ {
+ "epoch": 0.9739952718676123,
+ "grad_norm": 2.547297239303589,
+ "learning_rate": 4.706913241829712e-06,
+ "loss": 0.5937,
+ "step": 2060
+ },
+ {
+ "epoch": 0.9744680851063829,
+ "grad_norm": 2.6572306156158447,
+ "learning_rate": 4.706620090321341e-06,
+ "loss": 0.6041,
+ "step": 2061
+ },
+ {
+ "epoch": 0.9749408983451536,
+ "grad_norm": 2.3262805938720703,
+ "learning_rate": 4.706326801416414e-06,
+ "loss": 0.5144,
+ "step": 2062
+ },
+ {
+ "epoch": 0.9754137115839243,
+ "grad_norm": 2.9693965911865234,
+ "learning_rate": 4.706033375133191e-06,
+ "loss": 0.551,
+ "step": 2063
+ },
+ {
+ "epoch": 0.975886524822695,
+ "grad_norm": 2.5993731021881104,
+ "learning_rate": 4.7057398114899435e-06,
+ "loss": 0.6143,
+ "step": 2064
+ },
+ {
+ "epoch": 0.9763593380614657,
+ "grad_norm": 2.453336477279663,
+ "learning_rate": 4.70544611050495e-06,
+ "loss": 0.6093,
+ "step": 2065
+ },
+ {
+ "epoch": 0.9768321513002364,
+ "grad_norm": 2.898629665374756,
+ "learning_rate": 4.705152272196497e-06,
+ "loss": 0.6007,
+ "step": 2066
+ },
+ {
+ "epoch": 0.9773049645390071,
+ "grad_norm": 2.7990612983703613,
+ "learning_rate": 4.7048582965828815e-06,
+ "loss": 0.6687,
+ "step": 2067
+ },
+ {
+ "epoch": 0.9777777777777777,
+ "grad_norm": 2.635284423828125,
+ "learning_rate": 4.704564183682408e-06,
+ "loss": 0.5564,
+ "step": 2068
+ },
+ {
+ "epoch": 0.9782505910165484,
+ "grad_norm": 3.014547109603882,
+ "learning_rate": 4.704269933513389e-06,
+ "loss": 0.6084,
+ "step": 2069
+ },
+ {
+ "epoch": 0.9787234042553191,
+ "grad_norm": 2.659357786178589,
+ "learning_rate": 4.703975546094147e-06,
+ "loss": 0.6031,
+ "step": 2070
+ },
+ {
+ "epoch": 0.9791962174940898,
+ "grad_norm": 2.326932668685913,
+ "learning_rate": 4.703681021443013e-06,
+ "loss": 0.5859,
+ "step": 2071
+ },
+ {
+ "epoch": 0.9796690307328605,
+ "grad_norm": 2.958803653717041,
+ "learning_rate": 4.7033863595783235e-06,
+ "loss": 0.5586,
+ "step": 2072
+ },
+ {
+ "epoch": 0.9801418439716312,
+ "grad_norm": 2.921386957168579,
+ "learning_rate": 4.703091560518427e-06,
+ "loss": 0.6126,
+ "step": 2073
+ },
+ {
+ "epoch": 0.9806146572104019,
+ "grad_norm": 2.6500775814056396,
+ "learning_rate": 4.702796624281679e-06,
+ "loss": 0.5678,
+ "step": 2074
+ },
+ {
+ "epoch": 0.9810874704491725,
+ "grad_norm": 2.7740228176116943,
+ "learning_rate": 4.702501550886445e-06,
+ "loss": 0.6067,
+ "step": 2075
+ },
+ {
+ "epoch": 0.9815602836879432,
+ "grad_norm": 2.3296213150024414,
+ "learning_rate": 4.702206340351096e-06,
+ "loss": 0.5247,
+ "step": 2076
+ },
+ {
+ "epoch": 0.9820330969267139,
+ "grad_norm": 2.748300790786743,
+ "learning_rate": 4.701910992694016e-06,
+ "loss": 0.5197,
+ "step": 2077
+ },
+ {
+ "epoch": 0.9825059101654846,
+ "grad_norm": 2.250985622406006,
+ "learning_rate": 4.7016155079335926e-06,
+ "loss": 0.5214,
+ "step": 2078
+ },
+ {
+ "epoch": 0.9829787234042553,
+ "grad_norm": 2.389845848083496,
+ "learning_rate": 4.701319886088226e-06,
+ "loss": 0.519,
+ "step": 2079
+ },
+ {
+ "epoch": 0.983451536643026,
+ "grad_norm": 2.818220853805542,
+ "learning_rate": 4.701024127176322e-06,
+ "loss": 0.607,
+ "step": 2080
+ },
+ {
+ "epoch": 0.9839243498817967,
+ "grad_norm": 3.4058034420013428,
+ "learning_rate": 4.700728231216297e-06,
+ "loss": 0.5711,
+ "step": 2081
+ },
+ {
+ "epoch": 0.9843971631205674,
+ "grad_norm": 2.5297787189483643,
+ "learning_rate": 4.700432198226575e-06,
+ "loss": 0.5979,
+ "step": 2082
+ },
+ {
+ "epoch": 0.984869976359338,
+ "grad_norm": 3.0548105239868164,
+ "learning_rate": 4.7001360282255885e-06,
+ "loss": 0.6041,
+ "step": 2083
+ },
+ {
+ "epoch": 0.9853427895981087,
+ "grad_norm": 2.8983733654022217,
+ "learning_rate": 4.699839721231779e-06,
+ "loss": 0.5926,
+ "step": 2084
+ },
+ {
+ "epoch": 0.9858156028368794,
+ "grad_norm": 3.2717764377593994,
+ "learning_rate": 4.699543277263596e-06,
+ "loss": 0.6477,
+ "step": 2085
+ },
+ {
+ "epoch": 0.9862884160756501,
+ "grad_norm": 3.03729248046875,
+ "learning_rate": 4.699246696339497e-06,
+ "loss": 0.6786,
+ "step": 2086
+ },
+ {
+ "epoch": 0.9867612293144208,
+ "grad_norm": 2.852301597595215,
+ "learning_rate": 4.698949978477951e-06,
+ "loss": 0.6565,
+ "step": 2087
+ },
+ {
+ "epoch": 0.9872340425531915,
+ "grad_norm": 2.843485116958618,
+ "learning_rate": 4.698653123697431e-06,
+ "loss": 0.6627,
+ "step": 2088
+ },
+ {
+ "epoch": 0.9877068557919622,
+ "grad_norm": 2.6315064430236816,
+ "learning_rate": 4.698356132016423e-06,
+ "loss": 0.6577,
+ "step": 2089
+ },
+ {
+ "epoch": 0.9881796690307328,
+ "grad_norm": 2.7482151985168457,
+ "learning_rate": 4.698059003453417e-06,
+ "loss": 0.5514,
+ "step": 2090
+ },
+ {
+ "epoch": 0.9886524822695035,
+ "grad_norm": 2.826673746109009,
+ "learning_rate": 4.6977617380269145e-06,
+ "loss": 0.565,
+ "step": 2091
+ },
+ {
+ "epoch": 0.9891252955082742,
+ "grad_norm": 3.0273752212524414,
+ "learning_rate": 4.697464335755427e-06,
+ "loss": 0.6331,
+ "step": 2092
+ },
+ {
+ "epoch": 0.9895981087470449,
+ "grad_norm": 2.7551653385162354,
+ "learning_rate": 4.6971667966574695e-06,
+ "loss": 0.6486,
+ "step": 2093
+ },
+ {
+ "epoch": 0.9900709219858156,
+ "grad_norm": 2.656299114227295,
+ "learning_rate": 4.696869120751571e-06,
+ "loss": 0.6562,
+ "step": 2094
+ },
+ {
+ "epoch": 0.9905437352245863,
+ "grad_norm": 2.785322904586792,
+ "learning_rate": 4.696571308056265e-06,
+ "loss": 0.5892,
+ "step": 2095
+ },
+ {
+ "epoch": 0.991016548463357,
+ "grad_norm": 2.9334635734558105,
+ "learning_rate": 4.696273358590095e-06,
+ "loss": 0.6346,
+ "step": 2096
+ },
+ {
+ "epoch": 0.9914893617021276,
+ "grad_norm": 2.7944300174713135,
+ "learning_rate": 4.695975272371613e-06,
+ "loss": 0.5859,
+ "step": 2097
+ },
+ {
+ "epoch": 0.9919621749408983,
+ "grad_norm": 2.5416972637176514,
+ "learning_rate": 4.695677049419381e-06,
+ "loss": 0.5658,
+ "step": 2098
+ },
+ {
+ "epoch": 0.992434988179669,
+ "grad_norm": 2.4056856632232666,
+ "learning_rate": 4.695378689751966e-06,
+ "loss": 0.5121,
+ "step": 2099
+ },
+ {
+ "epoch": 0.9929078014184397,
+ "grad_norm": 2.614548683166504,
+ "learning_rate": 4.695080193387948e-06,
+ "loss": 0.5961,
+ "step": 2100
+ },
+ {
+ "epoch": 0.9933806146572104,
+ "grad_norm": 2.8966517448425293,
+ "learning_rate": 4.69478156034591e-06,
+ "loss": 0.5985,
+ "step": 2101
+ },
+ {
+ "epoch": 0.9938534278959811,
+ "grad_norm": 2.9514098167419434,
+ "learning_rate": 4.694482790644448e-06,
+ "loss": 0.5677,
+ "step": 2102
+ },
+ {
+ "epoch": 0.9943262411347518,
+ "grad_norm": 2.4326791763305664,
+ "learning_rate": 4.694183884302165e-06,
+ "loss": 0.5698,
+ "step": 2103
+ },
+ {
+ "epoch": 0.9947990543735225,
+ "grad_norm": 2.9242892265319824,
+ "learning_rate": 4.6938848413376735e-06,
+ "loss": 0.6245,
+ "step": 2104
+ },
+ {
+ "epoch": 0.9952718676122931,
+ "grad_norm": 2.9134104251861572,
+ "learning_rate": 4.693585661769593e-06,
+ "loss": 0.6164,
+ "step": 2105
+ },
+ {
+ "epoch": 0.9957446808510638,
+ "grad_norm": 2.472564458847046,
+ "learning_rate": 4.693286345616551e-06,
+ "loss": 0.5616,
+ "step": 2106
+ },
+ {
+ "epoch": 0.9962174940898345,
+ "grad_norm": 3.2456448078155518,
+ "learning_rate": 4.692986892897186e-06,
+ "loss": 0.6977,
+ "step": 2107
+ },
+ {
+ "epoch": 0.9966903073286052,
+ "grad_norm": 3.4032769203186035,
+ "learning_rate": 4.692687303630143e-06,
+ "loss": 0.643,
+ "step": 2108
+ },
+ {
+ "epoch": 0.9971631205673759,
+ "grad_norm": 2.722200870513916,
+ "learning_rate": 4.692387577834076e-06,
+ "loss": 0.5873,
+ "step": 2109
+ },
+ {
+ "epoch": 0.9976359338061466,
+ "grad_norm": 2.687532663345337,
+ "learning_rate": 4.692087715527648e-06,
+ "loss": 0.5423,
+ "step": 2110
+ },
+ {
+ "epoch": 0.9981087470449173,
+ "grad_norm": 2.578613042831421,
+ "learning_rate": 4.6917877167295305e-06,
+ "loss": 0.5689,
+ "step": 2111
+ },
+ {
+ "epoch": 0.9985815602836879,
+ "grad_norm": 3.1806094646453857,
+ "learning_rate": 4.691487581458402e-06,
+ "loss": 0.6133,
+ "step": 2112
+ },
+ {
+ "epoch": 0.9990543735224586,
+ "grad_norm": 2.4449520111083984,
+ "learning_rate": 4.691187309732952e-06,
+ "loss": 0.5841,
+ "step": 2113
+ },
+ {
+ "epoch": 0.9995271867612293,
+ "grad_norm": 2.908749580383301,
+ "learning_rate": 4.690886901571875e-06,
+ "loss": 0.534,
+ "step": 2114
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 4.019968032836914,
+ "learning_rate": 4.6905863569938785e-06,
+ "loss": 0.596,
+ "step": 2115
+ },
+ {
+ "epoch": 1.0004728132387706,
+ "grad_norm": 2.4319307804107666,
+ "learning_rate": 4.690285676017675e-06,
+ "loss": 0.4973,
+ "step": 2116
+ },
+ {
+ "epoch": 1.0009456264775414,
+ "grad_norm": 2.6366477012634277,
+ "learning_rate": 4.689984858661986e-06,
+ "loss": 0.5682,
+ "step": 2117
+ },
+ {
+ "epoch": 1.001418439716312,
+ "grad_norm": 2.815114974975586,
+ "learning_rate": 4.689683904945542e-06,
+ "loss": 0.5616,
+ "step": 2118
+ },
+ {
+ "epoch": 1.0018912529550827,
+ "grad_norm": 2.6680490970611572,
+ "learning_rate": 4.689382814887084e-06,
+ "loss": 0.5161,
+ "step": 2119
+ },
+ {
+ "epoch": 1.0023640661938533,
+ "grad_norm": 2.7406351566314697,
+ "learning_rate": 4.689081588505358e-06,
+ "loss": 0.4937,
+ "step": 2120
+ },
+ {
+ "epoch": 1.0028368794326241,
+ "grad_norm": 2.2832298278808594,
+ "learning_rate": 4.68878022581912e-06,
+ "loss": 0.4986,
+ "step": 2121
+ },
+ {
+ "epoch": 1.0033096926713947,
+ "grad_norm": 2.5525307655334473,
+ "learning_rate": 4.688478726847136e-06,
+ "loss": 0.4909,
+ "step": 2122
+ },
+ {
+ "epoch": 1.0037825059101655,
+ "grad_norm": 2.9843199253082275,
+ "learning_rate": 4.688177091608176e-06,
+ "loss": 0.6046,
+ "step": 2123
+ },
+ {
+ "epoch": 1.004255319148936,
+ "grad_norm": 2.5231106281280518,
+ "learning_rate": 4.687875320121024e-06,
+ "loss": 0.5423,
+ "step": 2124
+ },
+ {
+ "epoch": 1.0047281323877069,
+ "grad_norm": 2.567599058151245,
+ "learning_rate": 4.68757341240447e-06,
+ "loss": 0.5092,
+ "step": 2125
+ },
+ {
+ "epoch": 1.0052009456264774,
+ "grad_norm": 2.768111228942871,
+ "learning_rate": 4.687271368477311e-06,
+ "loss": 0.5175,
+ "step": 2126
+ },
+ {
+ "epoch": 1.0056737588652482,
+ "grad_norm": 2.7223286628723145,
+ "learning_rate": 4.686969188358355e-06,
+ "loss": 0.5412,
+ "step": 2127
+ },
+ {
+ "epoch": 1.0061465721040188,
+ "grad_norm": 2.488299608230591,
+ "learning_rate": 4.686666872066418e-06,
+ "loss": 0.5288,
+ "step": 2128
+ },
+ {
+ "epoch": 1.0066193853427896,
+ "grad_norm": 2.882981777191162,
+ "learning_rate": 4.6863644196203215e-06,
+ "loss": 0.6117,
+ "step": 2129
+ },
+ {
+ "epoch": 1.0070921985815602,
+ "grad_norm": 3.0019447803497314,
+ "learning_rate": 4.686061831038901e-06,
+ "loss": 0.5308,
+ "step": 2130
+ },
+ {
+ "epoch": 1.007565011820331,
+ "grad_norm": 3.0056138038635254,
+ "learning_rate": 4.685759106340996e-06,
+ "loss": 0.5833,
+ "step": 2131
+ },
+ {
+ "epoch": 1.0080378250591016,
+ "grad_norm": 2.5709075927734375,
+ "learning_rate": 4.685456245545454e-06,
+ "loss": 0.5071,
+ "step": 2132
+ },
+ {
+ "epoch": 1.0085106382978724,
+ "grad_norm": 2.4641504287719727,
+ "learning_rate": 4.685153248671136e-06,
+ "loss": 0.4813,
+ "step": 2133
+ },
+ {
+ "epoch": 1.008983451536643,
+ "grad_norm": 2.374413013458252,
+ "learning_rate": 4.684850115736906e-06,
+ "loss": 0.5179,
+ "step": 2134
+ },
+ {
+ "epoch": 1.0094562647754137,
+ "grad_norm": 2.6504571437835693,
+ "learning_rate": 4.684546846761641e-06,
+ "loss": 0.437,
+ "step": 2135
+ },
+ {
+ "epoch": 1.0099290780141843,
+ "grad_norm": 2.5977871417999268,
+ "learning_rate": 4.684243441764221e-06,
+ "loss": 0.497,
+ "step": 2136
+ },
+ {
+ "epoch": 1.010401891252955,
+ "grad_norm": 2.4950785636901855,
+ "learning_rate": 4.683939900763541e-06,
+ "loss": 0.5624,
+ "step": 2137
+ },
+ {
+ "epoch": 1.0108747044917257,
+ "grad_norm": 3.065718412399292,
+ "learning_rate": 4.6836362237785e-06,
+ "loss": 0.512,
+ "step": 2138
+ },
+ {
+ "epoch": 1.0113475177304965,
+ "grad_norm": 2.7419207096099854,
+ "learning_rate": 4.6833324108280045e-06,
+ "loss": 0.5585,
+ "step": 2139
+ },
+ {
+ "epoch": 1.011820330969267,
+ "grad_norm": 2.623610496520996,
+ "learning_rate": 4.6830284619309744e-06,
+ "loss": 0.5163,
+ "step": 2140
+ },
+ {
+ "epoch": 1.0122931442080378,
+ "grad_norm": 2.774322986602783,
+ "learning_rate": 4.682724377106334e-06,
+ "loss": 0.527,
+ "step": 2141
+ },
+ {
+ "epoch": 1.0127659574468084,
+ "grad_norm": 2.959935188293457,
+ "learning_rate": 4.682420156373017e-06,
+ "loss": 0.6166,
+ "step": 2142
+ },
+ {
+ "epoch": 1.0132387706855792,
+ "grad_norm": 2.584026336669922,
+ "learning_rate": 4.682115799749968e-06,
+ "loss": 0.5086,
+ "step": 2143
+ },
+ {
+ "epoch": 1.0137115839243498,
+ "grad_norm": 2.6039700508117676,
+ "learning_rate": 4.6818113072561346e-06,
+ "loss": 0.49,
+ "step": 2144
+ },
+ {
+ "epoch": 1.0141843971631206,
+ "grad_norm": 2.466381072998047,
+ "learning_rate": 4.681506678910479e-06,
+ "loss": 0.4959,
+ "step": 2145
+ },
+ {
+ "epoch": 1.0146572104018912,
+ "grad_norm": 2.432636260986328,
+ "learning_rate": 4.681201914731969e-06,
+ "loss": 0.5057,
+ "step": 2146
+ },
+ {
+ "epoch": 1.015130023640662,
+ "grad_norm": 2.6134090423583984,
+ "learning_rate": 4.680897014739579e-06,
+ "loss": 0.4874,
+ "step": 2147
+ },
+ {
+ "epoch": 1.0156028368794325,
+ "grad_norm": 2.774481773376465,
+ "learning_rate": 4.680591978952295e-06,
+ "loss": 0.4967,
+ "step": 2148
+ },
+ {
+ "epoch": 1.0160756501182033,
+ "grad_norm": 2.66050124168396,
+ "learning_rate": 4.68028680738911e-06,
+ "loss": 0.4932,
+ "step": 2149
+ },
+ {
+ "epoch": 1.016548463356974,
+ "grad_norm": 3.020594835281372,
+ "learning_rate": 4.679981500069026e-06,
+ "loss": 0.5788,
+ "step": 2150
+ },
+ {
+ "epoch": 1.0170212765957447,
+ "grad_norm": 2.697758436203003,
+ "learning_rate": 4.679676057011053e-06,
+ "loss": 0.5441,
+ "step": 2151
+ },
+ {
+ "epoch": 1.0174940898345153,
+ "grad_norm": 6.986445903778076,
+ "learning_rate": 4.679370478234209e-06,
+ "loss": 0.6483,
+ "step": 2152
+ },
+ {
+ "epoch": 1.017966903073286,
+ "grad_norm": 2.6637115478515625,
+ "learning_rate": 4.679064763757522e-06,
+ "loss": 0.5859,
+ "step": 2153
+ },
+ {
+ "epoch": 1.0184397163120567,
+ "grad_norm": 2.7501862049102783,
+ "learning_rate": 4.678758913600027e-06,
+ "loss": 0.5745,
+ "step": 2154
+ },
+ {
+ "epoch": 1.0189125295508275,
+ "grad_norm": 2.7959372997283936,
+ "learning_rate": 4.678452927780768e-06,
+ "loss": 0.5076,
+ "step": 2155
+ },
+ {
+ "epoch": 1.019385342789598,
+ "grad_norm": 2.4377388954162598,
+ "learning_rate": 4.678146806318798e-06,
+ "loss": 0.5061,
+ "step": 2156
+ },
+ {
+ "epoch": 1.0198581560283688,
+ "grad_norm": 2.5478947162628174,
+ "learning_rate": 4.677840549233176e-06,
+ "loss": 0.4941,
+ "step": 2157
+ },
+ {
+ "epoch": 1.0203309692671394,
+ "grad_norm": 3.0956528186798096,
+ "learning_rate": 4.677534156542973e-06,
+ "loss": 0.5879,
+ "step": 2158
+ },
+ {
+ "epoch": 1.0208037825059102,
+ "grad_norm": 2.5247607231140137,
+ "learning_rate": 4.6772276282672666e-06,
+ "loss": 0.5532,
+ "step": 2159
+ },
+ {
+ "epoch": 1.0212765957446808,
+ "grad_norm": 3.1972787380218506,
+ "learning_rate": 4.676920964425143e-06,
+ "loss": 0.6081,
+ "step": 2160
+ },
+ {
+ "epoch": 1.0217494089834516,
+ "grad_norm": 2.6173388957977295,
+ "learning_rate": 4.6766141650356955e-06,
+ "loss": 0.5001,
+ "step": 2161
+ },
+ {
+ "epoch": 1.0222222222222221,
+ "grad_norm": 2.9914398193359375,
+ "learning_rate": 4.676307230118029e-06,
+ "loss": 0.5566,
+ "step": 2162
+ },
+ {
+ "epoch": 1.022695035460993,
+ "grad_norm": 2.8011834621429443,
+ "learning_rate": 4.676000159691254e-06,
+ "loss": 0.4909,
+ "step": 2163
+ },
+ {
+ "epoch": 1.0231678486997635,
+ "grad_norm": 2.6049559116363525,
+ "learning_rate": 4.67569295377449e-06,
+ "loss": 0.5018,
+ "step": 2164
+ },
+ {
+ "epoch": 1.0236406619385343,
+ "grad_norm": 2.8175013065338135,
+ "learning_rate": 4.675385612386866e-06,
+ "loss": 0.5309,
+ "step": 2165
+ },
+ {
+ "epoch": 1.0241134751773049,
+ "grad_norm": 2.854696750640869,
+ "learning_rate": 4.675078135547519e-06,
+ "loss": 0.5627,
+ "step": 2166
+ },
+ {
+ "epoch": 1.0245862884160757,
+ "grad_norm": 3.1856436729431152,
+ "learning_rate": 4.674770523275594e-06,
+ "loss": 0.5475,
+ "step": 2167
+ },
+ {
+ "epoch": 1.0250591016548463,
+ "grad_norm": 2.8289129734039307,
+ "learning_rate": 4.674462775590244e-06,
+ "loss": 0.5878,
+ "step": 2168
+ },
+ {
+ "epoch": 1.025531914893617,
+ "grad_norm": 2.8824517726898193,
+ "learning_rate": 4.6741548925106325e-06,
+ "loss": 0.4392,
+ "step": 2169
+ },
+ {
+ "epoch": 1.0260047281323876,
+ "grad_norm": 2.7044589519500732,
+ "learning_rate": 4.673846874055928e-06,
+ "loss": 0.5264,
+ "step": 2170
+ },
+ {
+ "epoch": 1.0264775413711584,
+ "grad_norm": 2.575035810470581,
+ "learning_rate": 4.673538720245312e-06,
+ "loss": 0.4615,
+ "step": 2171
+ },
+ {
+ "epoch": 1.026950354609929,
+ "grad_norm": 2.48168683052063,
+ "learning_rate": 4.67323043109797e-06,
+ "loss": 0.4404,
+ "step": 2172
+ },
+ {
+ "epoch": 1.0274231678486998,
+ "grad_norm": 2.926593065261841,
+ "learning_rate": 4.672922006633098e-06,
+ "loss": 0.54,
+ "step": 2173
+ },
+ {
+ "epoch": 1.0278959810874704,
+ "grad_norm": 2.4610698223114014,
+ "learning_rate": 4.672613446869901e-06,
+ "loss": 0.5555,
+ "step": 2174
+ },
+ {
+ "epoch": 1.0283687943262412,
+ "grad_norm": 3.026901960372925,
+ "learning_rate": 4.672304751827592e-06,
+ "loss": 0.62,
+ "step": 2175
+ },
+ {
+ "epoch": 1.0288416075650118,
+ "grad_norm": 2.3946213722229004,
+ "learning_rate": 4.671995921525391e-06,
+ "loss": 0.5228,
+ "step": 2176
+ },
+ {
+ "epoch": 1.0293144208037825,
+ "grad_norm": 2.985020399093628,
+ "learning_rate": 4.671686955982528e-06,
+ "loss": 0.6256,
+ "step": 2177
+ },
+ {
+ "epoch": 1.0297872340425531,
+ "grad_norm": 3.0910139083862305,
+ "learning_rate": 4.671377855218239e-06,
+ "loss": 0.5893,
+ "step": 2178
+ },
+ {
+ "epoch": 1.030260047281324,
+ "grad_norm": 2.507805109024048,
+ "learning_rate": 4.6710686192517744e-06,
+ "loss": 0.5329,
+ "step": 2179
+ },
+ {
+ "epoch": 1.0307328605200945,
+ "grad_norm": 2.4514641761779785,
+ "learning_rate": 4.670759248102386e-06,
+ "loss": 0.4585,
+ "step": 2180
+ },
+ {
+ "epoch": 1.0312056737588653,
+ "grad_norm": 2.742838144302368,
+ "learning_rate": 4.670449741789337e-06,
+ "loss": 0.6255,
+ "step": 2181
+ },
+ {
+ "epoch": 1.0316784869976359,
+ "grad_norm": 2.374349594116211,
+ "learning_rate": 4.670140100331901e-06,
+ "loss": 0.5049,
+ "step": 2182
+ },
+ {
+ "epoch": 1.0321513002364067,
+ "grad_norm": 2.78894305229187,
+ "learning_rate": 4.669830323749356e-06,
+ "loss": 0.6061,
+ "step": 2183
+ },
+ {
+ "epoch": 1.0326241134751772,
+ "grad_norm": 2.7195091247558594,
+ "learning_rate": 4.6695204120609905e-06,
+ "loss": 0.592,
+ "step": 2184
+ },
+ {
+ "epoch": 1.033096926713948,
+ "grad_norm": 2.824411630630493,
+ "learning_rate": 4.6692103652861035e-06,
+ "loss": 0.5666,
+ "step": 2185
+ },
+ {
+ "epoch": 1.0335697399527186,
+ "grad_norm": 2.4981014728546143,
+ "learning_rate": 4.6689001834439975e-06,
+ "loss": 0.5045,
+ "step": 2186
+ },
+ {
+ "epoch": 1.0340425531914894,
+ "grad_norm": 2.7375214099884033,
+ "learning_rate": 4.668589866553988e-06,
+ "loss": 0.5305,
+ "step": 2187
+ },
+ {
+ "epoch": 1.03451536643026,
+ "grad_norm": 2.625345468521118,
+ "learning_rate": 4.668279414635396e-06,
+ "loss": 0.4819,
+ "step": 2188
+ },
+ {
+ "epoch": 1.0349881796690308,
+ "grad_norm": 2.60479736328125,
+ "learning_rate": 4.667968827707553e-06,
+ "loss": 0.55,
+ "step": 2189
+ },
+ {
+ "epoch": 1.0354609929078014,
+ "grad_norm": 2.642014741897583,
+ "learning_rate": 4.667658105789797e-06,
+ "loss": 0.5264,
+ "step": 2190
+ },
+ {
+ "epoch": 1.0359338061465722,
+ "grad_norm": 2.5439083576202393,
+ "learning_rate": 4.667347248901476e-06,
+ "loss": 0.4657,
+ "step": 2191
+ },
+ {
+ "epoch": 1.0364066193853427,
+ "grad_norm": 2.5537586212158203,
+ "learning_rate": 4.667036257061945e-06,
+ "loss": 0.527,
+ "step": 2192
+ },
+ {
+ "epoch": 1.0368794326241135,
+ "grad_norm": 2.595466375350952,
+ "learning_rate": 4.666725130290569e-06,
+ "loss": 0.5336,
+ "step": 2193
+ },
+ {
+ "epoch": 1.037352245862884,
+ "grad_norm": 3.5106313228607178,
+ "learning_rate": 4.666413868606719e-06,
+ "loss": 0.5176,
+ "step": 2194
+ },
+ {
+ "epoch": 1.037825059101655,
+ "grad_norm": 2.931553363800049,
+ "learning_rate": 4.666102472029778e-06,
+ "loss": 0.549,
+ "step": 2195
+ },
+ {
+ "epoch": 1.0382978723404255,
+ "grad_norm": 2.4325125217437744,
+ "learning_rate": 4.665790940579133e-06,
+ "loss": 0.5095,
+ "step": 2196
+ },
+ {
+ "epoch": 1.0387706855791963,
+ "grad_norm": 2.708477258682251,
+ "learning_rate": 4.665479274274184e-06,
+ "loss": 0.5264,
+ "step": 2197
+ },
+ {
+ "epoch": 1.0392434988179668,
+ "grad_norm": 2.905977487564087,
+ "learning_rate": 4.665167473134335e-06,
+ "loss": 0.5575,
+ "step": 2198
+ },
+ {
+ "epoch": 1.0397163120567376,
+ "grad_norm": 2.428938865661621,
+ "learning_rate": 4.664855537179003e-06,
+ "loss": 0.5099,
+ "step": 2199
+ },
+ {
+ "epoch": 1.0401891252955082,
+ "grad_norm": 2.8432137966156006,
+ "learning_rate": 4.6645434664276075e-06,
+ "loss": 0.5331,
+ "step": 2200
+ },
+ {
+ "epoch": 1.040661938534279,
+ "grad_norm": 2.5185136795043945,
+ "learning_rate": 4.6642312608995825e-06,
+ "loss": 0.5217,
+ "step": 2201
+ },
+ {
+ "epoch": 1.0411347517730496,
+ "grad_norm": 2.556607723236084,
+ "learning_rate": 4.663918920614366e-06,
+ "loss": 0.4431,
+ "step": 2202
+ },
+ {
+ "epoch": 1.0416075650118204,
+ "grad_norm": 3.1271166801452637,
+ "learning_rate": 4.663606445591407e-06,
+ "loss": 0.5398,
+ "step": 2203
+ },
+ {
+ "epoch": 1.042080378250591,
+ "grad_norm": 2.573680877685547,
+ "learning_rate": 4.663293835850162e-06,
+ "loss": 0.4713,
+ "step": 2204
+ },
+ {
+ "epoch": 1.0425531914893618,
+ "grad_norm": 2.5230324268341064,
+ "learning_rate": 4.662981091410096e-06,
+ "loss": 0.5571,
+ "step": 2205
+ },
+ {
+ "epoch": 1.0430260047281323,
+ "grad_norm": 2.552182912826538,
+ "learning_rate": 4.662668212290681e-06,
+ "loss": 0.5173,
+ "step": 2206
+ },
+ {
+ "epoch": 1.0434988179669031,
+ "grad_norm": 2.832345724105835,
+ "learning_rate": 4.6623551985113995e-06,
+ "loss": 0.525,
+ "step": 2207
+ },
+ {
+ "epoch": 1.0439716312056737,
+ "grad_norm": 2.9729080200195312,
+ "learning_rate": 4.6620420500917416e-06,
+ "loss": 0.6308,
+ "step": 2208
+ },
+ {
+ "epoch": 1.0444444444444445,
+ "grad_norm": 2.618187665939331,
+ "learning_rate": 4.661728767051206e-06,
+ "loss": 0.4942,
+ "step": 2209
+ },
+ {
+ "epoch": 1.044917257683215,
+ "grad_norm": 2.515566349029541,
+ "learning_rate": 4.661415349409299e-06,
+ "loss": 0.5229,
+ "step": 2210
+ },
+ {
+ "epoch": 1.0453900709219859,
+ "grad_norm": 2.8651459217071533,
+ "learning_rate": 4.6611017971855356e-06,
+ "loss": 0.5029,
+ "step": 2211
+ },
+ {
+ "epoch": 1.0458628841607565,
+ "grad_norm": 2.502405881881714,
+ "learning_rate": 4.660788110399439e-06,
+ "loss": 0.4732,
+ "step": 2212
+ },
+ {
+ "epoch": 1.0463356973995273,
+ "grad_norm": 2.540668249130249,
+ "learning_rate": 4.660474289070541e-06,
+ "loss": 0.547,
+ "step": 2213
+ },
+ {
+ "epoch": 1.0468085106382978,
+ "grad_norm": 2.803469181060791,
+ "learning_rate": 4.660160333218384e-06,
+ "loss": 0.5441,
+ "step": 2214
+ },
+ {
+ "epoch": 1.0472813238770686,
+ "grad_norm": 3.233325481414795,
+ "learning_rate": 4.659846242862514e-06,
+ "loss": 0.4457,
+ "step": 2215
+ },
+ {
+ "epoch": 1.0477541371158392,
+ "grad_norm": 2.549548387527466,
+ "learning_rate": 4.659532018022489e-06,
+ "loss": 0.5684,
+ "step": 2216
+ },
+ {
+ "epoch": 1.04822695035461,
+ "grad_norm": 2.6112852096557617,
+ "learning_rate": 4.659217658717875e-06,
+ "loss": 0.5323,
+ "step": 2217
+ },
+ {
+ "epoch": 1.0486997635933806,
+ "grad_norm": 2.347418785095215,
+ "learning_rate": 4.658903164968245e-06,
+ "loss": 0.5349,
+ "step": 2218
+ },
+ {
+ "epoch": 1.0491725768321514,
+ "grad_norm": 2.695502281188965,
+ "learning_rate": 4.658588536793182e-06,
+ "loss": 0.4883,
+ "step": 2219
+ },
+ {
+ "epoch": 1.049645390070922,
+ "grad_norm": 2.7575674057006836,
+ "learning_rate": 4.658273774212275e-06,
+ "loss": 0.5517,
+ "step": 2220
+ },
+ {
+ "epoch": 1.0501182033096927,
+ "grad_norm": 2.787855386734009,
+ "learning_rate": 4.6579588772451245e-06,
+ "loss": 0.5744,
+ "step": 2221
+ },
+ {
+ "epoch": 1.0505910165484633,
+ "grad_norm": 3.0699398517608643,
+ "learning_rate": 4.657643845911337e-06,
+ "loss": 0.5258,
+ "step": 2222
+ },
+ {
+ "epoch": 1.0510638297872341,
+ "grad_norm": 2.652040719985962,
+ "learning_rate": 4.657328680230527e-06,
+ "loss": 0.5141,
+ "step": 2223
+ },
+ {
+ "epoch": 1.0515366430260047,
+ "grad_norm": 2.6896369457244873,
+ "learning_rate": 4.657013380222322e-06,
+ "loss": 0.5139,
+ "step": 2224
+ },
+ {
+ "epoch": 1.0520094562647755,
+ "grad_norm": 2.551839590072632,
+ "learning_rate": 4.65669794590635e-06,
+ "loss": 0.5099,
+ "step": 2225
+ },
+ {
+ "epoch": 1.052482269503546,
+ "grad_norm": 2.8543262481689453,
+ "learning_rate": 4.656382377302255e-06,
+ "loss": 0.6085,
+ "step": 2226
+ },
+ {
+ "epoch": 1.0529550827423169,
+ "grad_norm": 2.871469259262085,
+ "learning_rate": 4.656066674429685e-06,
+ "loss": 0.6108,
+ "step": 2227
+ },
+ {
+ "epoch": 1.0534278959810874,
+ "grad_norm": 2.4840824604034424,
+ "learning_rate": 4.655750837308296e-06,
+ "loss": 0.4994,
+ "step": 2228
+ },
+ {
+ "epoch": 1.0539007092198582,
+ "grad_norm": 2.5203280448913574,
+ "learning_rate": 4.6554348659577555e-06,
+ "loss": 0.4928,
+ "step": 2229
+ },
+ {
+ "epoch": 1.0543735224586288,
+ "grad_norm": 2.9327683448791504,
+ "learning_rate": 4.655118760397737e-06,
+ "loss": 0.6324,
+ "step": 2230
+ },
+ {
+ "epoch": 1.0548463356973996,
+ "grad_norm": 2.6766855716705322,
+ "learning_rate": 4.654802520647924e-06,
+ "loss": 0.5178,
+ "step": 2231
+ },
+ {
+ "epoch": 1.0553191489361702,
+ "grad_norm": 2.8438873291015625,
+ "learning_rate": 4.654486146728006e-06,
+ "loss": 0.509,
+ "step": 2232
+ },
+ {
+ "epoch": 1.055791962174941,
+ "grad_norm": 2.538661241531372,
+ "learning_rate": 4.6541696386576826e-06,
+ "loss": 0.5463,
+ "step": 2233
+ },
+ {
+ "epoch": 1.0562647754137116,
+ "grad_norm": 2.829030990600586,
+ "learning_rate": 4.653852996456662e-06,
+ "loss": 0.5404,
+ "step": 2234
+ },
+ {
+ "epoch": 1.0567375886524824,
+ "grad_norm": 2.5657269954681396,
+ "learning_rate": 4.653536220144659e-06,
+ "loss": 0.5479,
+ "step": 2235
+ },
+ {
+ "epoch": 1.057210401891253,
+ "grad_norm": 2.6641297340393066,
+ "learning_rate": 4.653219309741399e-06,
+ "loss": 0.5503,
+ "step": 2236
+ },
+ {
+ "epoch": 1.0576832151300237,
+ "grad_norm": 2.966350555419922,
+ "learning_rate": 4.652902265266615e-06,
+ "loss": 0.6404,
+ "step": 2237
+ },
+ {
+ "epoch": 1.0581560283687943,
+ "grad_norm": 2.462430000305176,
+ "learning_rate": 4.6525850867400455e-06,
+ "loss": 0.4885,
+ "step": 2238
+ },
+ {
+ "epoch": 1.058628841607565,
+ "grad_norm": 2.1791880130767822,
+ "learning_rate": 4.652267774181443e-06,
+ "loss": 0.4405,
+ "step": 2239
+ },
+ {
+ "epoch": 1.0591016548463357,
+ "grad_norm": 2.5473732948303223,
+ "learning_rate": 4.651950327610563e-06,
+ "loss": 0.5295,
+ "step": 2240
+ },
+ {
+ "epoch": 1.0595744680851065,
+ "grad_norm": 2.70904803276062,
+ "learning_rate": 4.651632747047172e-06,
+ "loss": 0.5169,
+ "step": 2241
+ },
+ {
+ "epoch": 1.060047281323877,
+ "grad_norm": 3.8442928791046143,
+ "learning_rate": 4.651315032511045e-06,
+ "loss": 0.5473,
+ "step": 2242
+ },
+ {
+ "epoch": 1.0605200945626478,
+ "grad_norm": 2.8613383769989014,
+ "learning_rate": 4.650997184021963e-06,
+ "loss": 0.5445,
+ "step": 2243
+ },
+ {
+ "epoch": 1.0609929078014184,
+ "grad_norm": 2.5995829105377197,
+ "learning_rate": 4.6506792015997184e-06,
+ "loss": 0.5525,
+ "step": 2244
+ },
+ {
+ "epoch": 1.0614657210401892,
+ "grad_norm": 2.5465996265411377,
+ "learning_rate": 4.650361085264111e-06,
+ "loss": 0.5093,
+ "step": 2245
+ },
+ {
+ "epoch": 1.0619385342789598,
+ "grad_norm": 2.46553111076355,
+ "learning_rate": 4.650042835034948e-06,
+ "loss": 0.5375,
+ "step": 2246
+ },
+ {
+ "epoch": 1.0624113475177306,
+ "grad_norm": 2.6907830238342285,
+ "learning_rate": 4.649724450932045e-06,
+ "loss": 0.572,
+ "step": 2247
+ },
+ {
+ "epoch": 1.0628841607565012,
+ "grad_norm": 3.0671346187591553,
+ "learning_rate": 4.649405932975226e-06,
+ "loss": 0.4974,
+ "step": 2248
+ },
+ {
+ "epoch": 1.063356973995272,
+ "grad_norm": 2.5392491817474365,
+ "learning_rate": 4.649087281184325e-06,
+ "loss": 0.524,
+ "step": 2249
+ },
+ {
+ "epoch": 1.0638297872340425,
+ "grad_norm": 2.7498562335968018,
+ "learning_rate": 4.648768495579183e-06,
+ "loss": 0.5801,
+ "step": 2250
+ },
+ {
+ "epoch": 1.0643026004728133,
+ "grad_norm": 2.8536248207092285,
+ "learning_rate": 4.648449576179649e-06,
+ "loss": 0.5384,
+ "step": 2251
+ },
+ {
+ "epoch": 1.064775413711584,
+ "grad_norm": 2.7062792778015137,
+ "learning_rate": 4.64813052300558e-06,
+ "loss": 0.5262,
+ "step": 2252
+ },
+ {
+ "epoch": 1.0652482269503547,
+ "grad_norm": 2.798650026321411,
+ "learning_rate": 4.647811336076841e-06,
+ "loss": 0.5719,
+ "step": 2253
+ },
+ {
+ "epoch": 1.0657210401891253,
+ "grad_norm": 2.9793951511383057,
+ "learning_rate": 4.647492015413311e-06,
+ "loss": 0.5377,
+ "step": 2254
+ },
+ {
+ "epoch": 1.066193853427896,
+ "grad_norm": 2.572129011154175,
+ "learning_rate": 4.647172561034868e-06,
+ "loss": 0.4791,
+ "step": 2255
+ },
+ {
+ "epoch": 1.0666666666666667,
+ "grad_norm": 3.7490930557250977,
+ "learning_rate": 4.646852972961405e-06,
+ "loss": 0.5423,
+ "step": 2256
+ },
+ {
+ "epoch": 1.0671394799054374,
+ "grad_norm": 2.626255750656128,
+ "learning_rate": 4.646533251212821e-06,
+ "loss": 0.5558,
+ "step": 2257
+ },
+ {
+ "epoch": 1.067612293144208,
+ "grad_norm": 2.8408126831054688,
+ "learning_rate": 4.646213395809023e-06,
+ "loss": 0.55,
+ "step": 2258
+ },
+ {
+ "epoch": 1.0680851063829788,
+ "grad_norm": 3.255606174468994,
+ "learning_rate": 4.645893406769929e-06,
+ "loss": 0.547,
+ "step": 2259
+ },
+ {
+ "epoch": 1.0685579196217494,
+ "grad_norm": 2.4352102279663086,
+ "learning_rate": 4.645573284115461e-06,
+ "loss": 0.4898,
+ "step": 2260
+ },
+ {
+ "epoch": 1.0690307328605202,
+ "grad_norm": 2.408634662628174,
+ "learning_rate": 4.6452530278655535e-06,
+ "loss": 0.5264,
+ "step": 2261
+ },
+ {
+ "epoch": 1.0695035460992908,
+ "grad_norm": 2.4220449924468994,
+ "learning_rate": 4.644932638040146e-06,
+ "loss": 0.5166,
+ "step": 2262
+ },
+ {
+ "epoch": 1.0699763593380616,
+ "grad_norm": 2.9188082218170166,
+ "learning_rate": 4.644612114659188e-06,
+ "loss": 0.5611,
+ "step": 2263
+ },
+ {
+ "epoch": 1.0704491725768321,
+ "grad_norm": 2.906557083129883,
+ "learning_rate": 4.644291457742638e-06,
+ "loss": 0.5515,
+ "step": 2264
+ },
+ {
+ "epoch": 1.070921985815603,
+ "grad_norm": 2.9039015769958496,
+ "learning_rate": 4.643970667310462e-06,
+ "loss": 0.5732,
+ "step": 2265
+ },
+ {
+ "epoch": 1.0713947990543735,
+ "grad_norm": 2.9985480308532715,
+ "learning_rate": 4.643649743382632e-06,
+ "loss": 0.563,
+ "step": 2266
+ },
+ {
+ "epoch": 1.0718676122931443,
+ "grad_norm": 2.5780906677246094,
+ "learning_rate": 4.6433286859791335e-06,
+ "loss": 0.502,
+ "step": 2267
+ },
+ {
+ "epoch": 1.0723404255319149,
+ "grad_norm": 2.590209722518921,
+ "learning_rate": 4.643007495119955e-06,
+ "loss": 0.4995,
+ "step": 2268
+ },
+ {
+ "epoch": 1.0728132387706855,
+ "grad_norm": 2.378894805908203,
+ "learning_rate": 4.642686170825097e-06,
+ "loss": 0.4886,
+ "step": 2269
+ },
+ {
+ "epoch": 1.0732860520094563,
+ "grad_norm": 2.6826229095458984,
+ "learning_rate": 4.642364713114567e-06,
+ "loss": 0.465,
+ "step": 2270
+ },
+ {
+ "epoch": 1.073758865248227,
+ "grad_norm": 2.627819538116455,
+ "learning_rate": 4.64204312200838e-06,
+ "loss": 0.4954,
+ "step": 2271
+ },
+ {
+ "epoch": 1.0742316784869976,
+ "grad_norm": 2.993021249771118,
+ "learning_rate": 4.641721397526561e-06,
+ "loss": 0.5073,
+ "step": 2272
+ },
+ {
+ "epoch": 1.0747044917257682,
+ "grad_norm": 2.719052791595459,
+ "learning_rate": 4.64139953968914e-06,
+ "loss": 0.538,
+ "step": 2273
+ },
+ {
+ "epoch": 1.075177304964539,
+ "grad_norm": 2.729252576828003,
+ "learning_rate": 4.6410775485161605e-06,
+ "loss": 0.552,
+ "step": 2274
+ },
+ {
+ "epoch": 1.0756501182033098,
+ "grad_norm": 2.924142599105835,
+ "learning_rate": 4.640755424027671e-06,
+ "loss": 0.522,
+ "step": 2275
+ },
+ {
+ "epoch": 1.0761229314420804,
+ "grad_norm": 3.329162120819092,
+ "learning_rate": 4.640433166243728e-06,
+ "loss": 0.5965,
+ "step": 2276
+ },
+ {
+ "epoch": 1.076595744680851,
+ "grad_norm": 2.9810245037078857,
+ "learning_rate": 4.640110775184396e-06,
+ "loss": 0.5653,
+ "step": 2277
+ },
+ {
+ "epoch": 1.0770685579196217,
+ "grad_norm": 2.61772084236145,
+ "learning_rate": 4.639788250869751e-06,
+ "loss": 0.5382,
+ "step": 2278
+ },
+ {
+ "epoch": 1.0775413711583925,
+ "grad_norm": 2.741225004196167,
+ "learning_rate": 4.639465593319874e-06,
+ "loss": 0.4866,
+ "step": 2279
+ },
+ {
+ "epoch": 1.0780141843971631,
+ "grad_norm": 2.7945218086242676,
+ "learning_rate": 4.639142802554856e-06,
+ "loss": 0.4711,
+ "step": 2280
+ },
+ {
+ "epoch": 1.0784869976359337,
+ "grad_norm": 2.4282329082489014,
+ "learning_rate": 4.638819878594795e-06,
+ "loss": 0.4911,
+ "step": 2281
+ },
+ {
+ "epoch": 1.0789598108747045,
+ "grad_norm": 2.551741361618042,
+ "learning_rate": 4.638496821459799e-06,
+ "loss": 0.453,
+ "step": 2282
+ },
+ {
+ "epoch": 1.0794326241134753,
+ "grad_norm": 2.5622754096984863,
+ "learning_rate": 4.638173631169983e-06,
+ "loss": 0.5983,
+ "step": 2283
+ },
+ {
+ "epoch": 1.0799054373522459,
+ "grad_norm": 2.7748284339904785,
+ "learning_rate": 4.6378503077454715e-06,
+ "loss": 0.5143,
+ "step": 2284
+ },
+ {
+ "epoch": 1.0803782505910164,
+ "grad_norm": 2.7693238258361816,
+ "learning_rate": 4.637526851206394e-06,
+ "loss": 0.5929,
+ "step": 2285
+ },
+ {
+ "epoch": 1.0808510638297872,
+ "grad_norm": 2.705548048019409,
+ "learning_rate": 4.637203261572893e-06,
+ "loss": 0.5577,
+ "step": 2286
+ },
+ {
+ "epoch": 1.081323877068558,
+ "grad_norm": 2.739307165145874,
+ "learning_rate": 4.636879538865117e-06,
+ "loss": 0.5676,
+ "step": 2287
+ },
+ {
+ "epoch": 1.0817966903073286,
+ "grad_norm": 2.514059543609619,
+ "learning_rate": 4.636555683103221e-06,
+ "loss": 0.5001,
+ "step": 2288
+ },
+ {
+ "epoch": 1.0822695035460992,
+ "grad_norm": 2.7166874408721924,
+ "learning_rate": 4.636231694307372e-06,
+ "loss": 0.5411,
+ "step": 2289
+ },
+ {
+ "epoch": 1.08274231678487,
+ "grad_norm": 2.7661683559417725,
+ "learning_rate": 4.635907572497741e-06,
+ "loss": 0.6353,
+ "step": 2290
+ },
+ {
+ "epoch": 1.0832151300236406,
+ "grad_norm": 2.598381996154785,
+ "learning_rate": 4.635583317694512e-06,
+ "loss": 0.5213,
+ "step": 2291
+ },
+ {
+ "epoch": 1.0836879432624114,
+ "grad_norm": 2.821491003036499,
+ "learning_rate": 4.6352589299178744e-06,
+ "loss": 0.6172,
+ "step": 2292
+ },
+ {
+ "epoch": 1.084160756501182,
+ "grad_norm": 2.5422823429107666,
+ "learning_rate": 4.634934409188025e-06,
+ "loss": 0.5245,
+ "step": 2293
+ },
+ {
+ "epoch": 1.0846335697399527,
+ "grad_norm": 2.8264620304107666,
+ "learning_rate": 4.634609755525173e-06,
+ "loss": 0.5004,
+ "step": 2294
+ },
+ {
+ "epoch": 1.0851063829787233,
+ "grad_norm": 2.3286643028259277,
+ "learning_rate": 4.63428496894953e-06,
+ "loss": 0.4561,
+ "step": 2295
+ },
+ {
+ "epoch": 1.085579196217494,
+ "grad_norm": 2.462005376815796,
+ "learning_rate": 4.633960049481321e-06,
+ "loss": 0.4948,
+ "step": 2296
+ },
+ {
+ "epoch": 1.0860520094562647,
+ "grad_norm": 2.760258913040161,
+ "learning_rate": 4.633634997140777e-06,
+ "loss": 0.5407,
+ "step": 2297
+ },
+ {
+ "epoch": 1.0865248226950355,
+ "grad_norm": 3.0234217643737793,
+ "learning_rate": 4.633309811948138e-06,
+ "loss": 0.4914,
+ "step": 2298
+ },
+ {
+ "epoch": 1.086997635933806,
+ "grad_norm": 2.8380849361419678,
+ "learning_rate": 4.63298449392365e-06,
+ "loss": 0.5562,
+ "step": 2299
+ },
+ {
+ "epoch": 1.0874704491725768,
+ "grad_norm": 2.6201648712158203,
+ "learning_rate": 4.632659043087572e-06,
+ "loss": 0.5882,
+ "step": 2300
+ },
+ {
+ "epoch": 1.0879432624113474,
+ "grad_norm": 2.586339235305786,
+ "learning_rate": 4.632333459460165e-06,
+ "loss": 0.4991,
+ "step": 2301
+ },
+ {
+ "epoch": 1.0884160756501182,
+ "grad_norm": 2.500115394592285,
+ "learning_rate": 4.632007743061705e-06,
+ "loss": 0.552,
+ "step": 2302
+ },
+ {
+ "epoch": 1.0888888888888888,
+ "grad_norm": 2.816390037536621,
+ "learning_rate": 4.63168189391247e-06,
+ "loss": 0.5301,
+ "step": 2303
+ },
+ {
+ "epoch": 1.0893617021276596,
+ "grad_norm": 2.975400924682617,
+ "learning_rate": 4.631355912032753e-06,
+ "loss": 0.6056,
+ "step": 2304
+ },
+ {
+ "epoch": 1.0898345153664302,
+ "grad_norm": 2.747985363006592,
+ "learning_rate": 4.631029797442846e-06,
+ "loss": 0.5335,
+ "step": 2305
+ },
+ {
+ "epoch": 1.090307328605201,
+ "grad_norm": 2.609281539916992,
+ "learning_rate": 4.630703550163059e-06,
+ "loss": 0.5189,
+ "step": 2306
+ },
+ {
+ "epoch": 1.0907801418439715,
+ "grad_norm": 2.624131202697754,
+ "learning_rate": 4.630377170213705e-06,
+ "loss": 0.5646,
+ "step": 2307
+ },
+ {
+ "epoch": 1.0912529550827423,
+ "grad_norm": 2.6186959743499756,
+ "learning_rate": 4.630050657615107e-06,
+ "loss": 0.5187,
+ "step": 2308
+ },
+ {
+ "epoch": 1.091725768321513,
+ "grad_norm": 2.9961764812469482,
+ "learning_rate": 4.629724012387594e-06,
+ "loss": 0.6207,
+ "step": 2309
+ },
+ {
+ "epoch": 1.0921985815602837,
+ "grad_norm": 2.665799140930176,
+ "learning_rate": 4.629397234551505e-06,
+ "loss": 0.5046,
+ "step": 2310
+ },
+ {
+ "epoch": 1.0926713947990543,
+ "grad_norm": 2.6154725551605225,
+ "learning_rate": 4.629070324127187e-06,
+ "loss": 0.5553,
+ "step": 2311
+ },
+ {
+ "epoch": 1.093144208037825,
+ "grad_norm": 2.702967643737793,
+ "learning_rate": 4.628743281134996e-06,
+ "loss": 0.5159,
+ "step": 2312
+ },
+ {
+ "epoch": 1.0936170212765957,
+ "grad_norm": 2.578080177307129,
+ "learning_rate": 4.628416105595295e-06,
+ "loss": 0.4934,
+ "step": 2313
+ },
+ {
+ "epoch": 1.0940898345153665,
+ "grad_norm": 2.8763060569763184,
+ "learning_rate": 4.628088797528456e-06,
+ "loss": 0.5404,
+ "step": 2314
+ },
+ {
+ "epoch": 1.094562647754137,
+ "grad_norm": 2.5301198959350586,
+ "learning_rate": 4.6277613569548585e-06,
+ "loss": 0.524,
+ "step": 2315
+ },
+ {
+ "epoch": 1.0950354609929078,
+ "grad_norm": 2.559903144836426,
+ "learning_rate": 4.627433783894892e-06,
+ "loss": 0.5177,
+ "step": 2316
+ },
+ {
+ "epoch": 1.0955082742316784,
+ "grad_norm": 2.430863380432129,
+ "learning_rate": 4.627106078368952e-06,
+ "loss": 0.5368,
+ "step": 2317
+ },
+ {
+ "epoch": 1.0959810874704492,
+ "grad_norm": 2.687567949295044,
+ "learning_rate": 4.626778240397444e-06,
+ "loss": 0.5385,
+ "step": 2318
+ },
+ {
+ "epoch": 1.0964539007092198,
+ "grad_norm": 3.053466558456421,
+ "learning_rate": 4.62645027000078e-06,
+ "loss": 0.5814,
+ "step": 2319
+ },
+ {
+ "epoch": 1.0969267139479906,
+ "grad_norm": 2.4612979888916016,
+ "learning_rate": 4.6261221671993815e-06,
+ "loss": 0.5069,
+ "step": 2320
+ },
+ {
+ "epoch": 1.0973995271867611,
+ "grad_norm": 2.6153628826141357,
+ "learning_rate": 4.625793932013679e-06,
+ "loss": 0.5422,
+ "step": 2321
+ },
+ {
+ "epoch": 1.097872340425532,
+ "grad_norm": 2.8918874263763428,
+ "learning_rate": 4.62546556446411e-06,
+ "loss": 0.5326,
+ "step": 2322
+ },
+ {
+ "epoch": 1.0983451536643025,
+ "grad_norm": 3.62565279006958,
+ "learning_rate": 4.625137064571119e-06,
+ "loss": 0.5164,
+ "step": 2323
+ },
+ {
+ "epoch": 1.0988179669030733,
+ "grad_norm": 2.4285085201263428,
+ "learning_rate": 4.624808432355164e-06,
+ "loss": 0.5084,
+ "step": 2324
+ },
+ {
+ "epoch": 1.099290780141844,
+ "grad_norm": 2.593979835510254,
+ "learning_rate": 4.624479667836702e-06,
+ "loss": 0.4986,
+ "step": 2325
+ },
+ {
+ "epoch": 1.0997635933806147,
+ "grad_norm": 2.490752935409546,
+ "learning_rate": 4.624150771036208e-06,
+ "loss": 0.5296,
+ "step": 2326
+ },
+ {
+ "epoch": 1.1002364066193853,
+ "grad_norm": 2.67694091796875,
+ "learning_rate": 4.6238217419741595e-06,
+ "loss": 0.5229,
+ "step": 2327
+ },
+ {
+ "epoch": 1.100709219858156,
+ "grad_norm": 2.594147205352783,
+ "learning_rate": 4.623492580671044e-06,
+ "loss": 0.4916,
+ "step": 2328
+ },
+ {
+ "epoch": 1.1011820330969266,
+ "grad_norm": 2.943472385406494,
+ "learning_rate": 4.623163287147356e-06,
+ "loss": 0.5591,
+ "step": 2329
+ },
+ {
+ "epoch": 1.1016548463356974,
+ "grad_norm": 2.569410562515259,
+ "learning_rate": 4.622833861423601e-06,
+ "loss": 0.4648,
+ "step": 2330
+ },
+ {
+ "epoch": 1.102127659574468,
+ "grad_norm": 2.5490405559539795,
+ "learning_rate": 4.6225043035202886e-06,
+ "loss": 0.5493,
+ "step": 2331
+ },
+ {
+ "epoch": 1.1026004728132388,
+ "grad_norm": 2.5964598655700684,
+ "learning_rate": 4.622174613457941e-06,
+ "loss": 0.5358,
+ "step": 2332
+ },
+ {
+ "epoch": 1.1030732860520094,
+ "grad_norm": 2.6456820964813232,
+ "learning_rate": 4.621844791257085e-06,
+ "loss": 0.5864,
+ "step": 2333
+ },
+ {
+ "epoch": 1.1035460992907802,
+ "grad_norm": 2.861180067062378,
+ "learning_rate": 4.621514836938259e-06,
+ "loss": 0.6064,
+ "step": 2334
+ },
+ {
+ "epoch": 1.1040189125295508,
+ "grad_norm": 2.8199548721313477,
+ "learning_rate": 4.621184750522005e-06,
+ "loss": 0.5244,
+ "step": 2335
+ },
+ {
+ "epoch": 1.1044917257683216,
+ "grad_norm": 2.7398853302001953,
+ "learning_rate": 4.6208545320288795e-06,
+ "loss": 0.5496,
+ "step": 2336
+ },
+ {
+ "epoch": 1.1049645390070921,
+ "grad_norm": 2.7941031455993652,
+ "learning_rate": 4.620524181479441e-06,
+ "loss": 0.5496,
+ "step": 2337
+ },
+ {
+ "epoch": 1.105437352245863,
+ "grad_norm": 2.973785161972046,
+ "learning_rate": 4.620193698894259e-06,
+ "loss": 0.5492,
+ "step": 2338
+ },
+ {
+ "epoch": 1.1059101654846335,
+ "grad_norm": 2.650355815887451,
+ "learning_rate": 4.6198630842939144e-06,
+ "loss": 0.5392,
+ "step": 2339
+ },
+ {
+ "epoch": 1.1063829787234043,
+ "grad_norm": 2.9092214107513428,
+ "learning_rate": 4.61953233769899e-06,
+ "loss": 0.5305,
+ "step": 2340
+ },
+ {
+ "epoch": 1.1068557919621749,
+ "grad_norm": 2.6329731941223145,
+ "learning_rate": 4.61920145913008e-06,
+ "loss": 0.5031,
+ "step": 2341
+ },
+ {
+ "epoch": 1.1073286052009457,
+ "grad_norm": 2.7214207649230957,
+ "learning_rate": 4.618870448607788e-06,
+ "loss": 0.5536,
+ "step": 2342
+ },
+ {
+ "epoch": 1.1078014184397162,
+ "grad_norm": 2.873119592666626,
+ "learning_rate": 4.618539306152724e-06,
+ "loss": 0.4531,
+ "step": 2343
+ },
+ {
+ "epoch": 1.108274231678487,
+ "grad_norm": 2.701042413711548,
+ "learning_rate": 4.618208031785507e-06,
+ "loss": 0.5217,
+ "step": 2344
+ },
+ {
+ "epoch": 1.1087470449172576,
+ "grad_norm": 2.7189881801605225,
+ "learning_rate": 4.6178766255267635e-06,
+ "loss": 0.6205,
+ "step": 2345
+ },
+ {
+ "epoch": 1.1092198581560284,
+ "grad_norm": 2.546382188796997,
+ "learning_rate": 4.61754508739713e-06,
+ "loss": 0.5475,
+ "step": 2346
+ },
+ {
+ "epoch": 1.109692671394799,
+ "grad_norm": 2.8429276943206787,
+ "learning_rate": 4.617213417417249e-06,
+ "loss": 0.4809,
+ "step": 2347
+ },
+ {
+ "epoch": 1.1101654846335698,
+ "grad_norm": 2.9515812397003174,
+ "learning_rate": 4.616881615607772e-06,
+ "loss": 0.5067,
+ "step": 2348
+ },
+ {
+ "epoch": 1.1106382978723404,
+ "grad_norm": 2.5910723209381104,
+ "learning_rate": 4.616549681989358e-06,
+ "loss": 0.5368,
+ "step": 2349
+ },
+ {
+ "epoch": 1.1111111111111112,
+ "grad_norm": 2.80855655670166,
+ "learning_rate": 4.616217616582678e-06,
+ "loss": 0.5827,
+ "step": 2350
+ },
+ {
+ "epoch": 1.1115839243498817,
+ "grad_norm": 2.604383945465088,
+ "learning_rate": 4.6158854194084044e-06,
+ "loss": 0.5716,
+ "step": 2351
+ },
+ {
+ "epoch": 1.1120567375886525,
+ "grad_norm": 3.0585904121398926,
+ "learning_rate": 4.6155530904872246e-06,
+ "loss": 0.4998,
+ "step": 2352
+ },
+ {
+ "epoch": 1.112529550827423,
+ "grad_norm": 2.660961627960205,
+ "learning_rate": 4.61522062983983e-06,
+ "loss": 0.4533,
+ "step": 2353
+ },
+ {
+ "epoch": 1.113002364066194,
+ "grad_norm": 2.8042070865631104,
+ "learning_rate": 4.614888037486923e-06,
+ "loss": 0.5592,
+ "step": 2354
+ },
+ {
+ "epoch": 1.1134751773049645,
+ "grad_norm": 2.681664228439331,
+ "learning_rate": 4.61455531344921e-06,
+ "loss": 0.5439,
+ "step": 2355
+ },
+ {
+ "epoch": 1.1139479905437353,
+ "grad_norm": 2.905054807662964,
+ "learning_rate": 4.61422245774741e-06,
+ "loss": 0.5497,
+ "step": 2356
+ },
+ {
+ "epoch": 1.1144208037825059,
+ "grad_norm": 2.7979753017425537,
+ "learning_rate": 4.6138894704022484e-06,
+ "loss": 0.5374,
+ "step": 2357
+ },
+ {
+ "epoch": 1.1148936170212767,
+ "grad_norm": 2.965611696243286,
+ "learning_rate": 4.613556351434458e-06,
+ "loss": 0.5145,
+ "step": 2358
+ },
+ {
+ "epoch": 1.1153664302600472,
+ "grad_norm": 2.583134889602661,
+ "learning_rate": 4.613223100864782e-06,
+ "loss": 0.535,
+ "step": 2359
+ },
+ {
+ "epoch": 1.115839243498818,
+ "grad_norm": 2.5979621410369873,
+ "learning_rate": 4.61288971871397e-06,
+ "loss": 0.5514,
+ "step": 2360
+ },
+ {
+ "epoch": 1.1163120567375886,
+ "grad_norm": 3.0117669105529785,
+ "learning_rate": 4.612556205002779e-06,
+ "loss": 0.5266,
+ "step": 2361
+ },
+ {
+ "epoch": 1.1167848699763594,
+ "grad_norm": 2.425133466720581,
+ "learning_rate": 4.612222559751976e-06,
+ "loss": 0.4838,
+ "step": 2362
+ },
+ {
+ "epoch": 1.11725768321513,
+ "grad_norm": 2.5102691650390625,
+ "learning_rate": 4.611888782982337e-06,
+ "loss": 0.3947,
+ "step": 2363
+ },
+ {
+ "epoch": 1.1177304964539008,
+ "grad_norm": 3.0327367782592773,
+ "learning_rate": 4.611554874714645e-06,
+ "loss": 0.5753,
+ "step": 2364
+ },
+ {
+ "epoch": 1.1182033096926713,
+ "grad_norm": 2.4561009407043457,
+ "learning_rate": 4.6112208349696875e-06,
+ "loss": 0.5054,
+ "step": 2365
+ },
+ {
+ "epoch": 1.1186761229314421,
+ "grad_norm": 3.3898050785064697,
+ "learning_rate": 4.610886663768267e-06,
+ "loss": 0.5946,
+ "step": 2366
+ },
+ {
+ "epoch": 1.1191489361702127,
+ "grad_norm": 2.8112242221832275,
+ "learning_rate": 4.61055236113119e-06,
+ "loss": 0.5475,
+ "step": 2367
+ },
+ {
+ "epoch": 1.1196217494089835,
+ "grad_norm": 3.152946710586548,
+ "learning_rate": 4.610217927079272e-06,
+ "loss": 0.5165,
+ "step": 2368
+ },
+ {
+ "epoch": 1.120094562647754,
+ "grad_norm": 2.7847867012023926,
+ "learning_rate": 4.609883361633336e-06,
+ "loss": 0.5533,
+ "step": 2369
+ },
+ {
+ "epoch": 1.1205673758865249,
+ "grad_norm": 2.6376686096191406,
+ "learning_rate": 4.6095486648142155e-06,
+ "loss": 0.4942,
+ "step": 2370
+ },
+ {
+ "epoch": 1.1210401891252955,
+ "grad_norm": 3.123072862625122,
+ "learning_rate": 4.609213836642749e-06,
+ "loss": 0.616,
+ "step": 2371
+ },
+ {
+ "epoch": 1.1215130023640663,
+ "grad_norm": 2.802694320678711,
+ "learning_rate": 4.608878877139786e-06,
+ "loss": 0.5323,
+ "step": 2372
+ },
+ {
+ "epoch": 1.1219858156028368,
+ "grad_norm": 2.3567938804626465,
+ "learning_rate": 4.6085437863261825e-06,
+ "loss": 0.4822,
+ "step": 2373
+ },
+ {
+ "epoch": 1.1224586288416076,
+ "grad_norm": 2.553112030029297,
+ "learning_rate": 4.608208564222804e-06,
+ "loss": 0.5447,
+ "step": 2374
+ },
+ {
+ "epoch": 1.1229314420803782,
+ "grad_norm": 3.0020132064819336,
+ "learning_rate": 4.607873210850521e-06,
+ "loss": 0.6486,
+ "step": 2375
+ },
+ {
+ "epoch": 1.123404255319149,
+ "grad_norm": 2.832442045211792,
+ "learning_rate": 4.607537726230216e-06,
+ "loss": 0.5257,
+ "step": 2376
+ },
+ {
+ "epoch": 1.1238770685579196,
+ "grad_norm": 2.471527099609375,
+ "learning_rate": 4.607202110382778e-06,
+ "loss": 0.4816,
+ "step": 2377
+ },
+ {
+ "epoch": 1.1243498817966904,
+ "grad_norm": 2.4232118129730225,
+ "learning_rate": 4.606866363329105e-06,
+ "loss": 0.5533,
+ "step": 2378
+ },
+ {
+ "epoch": 1.124822695035461,
+ "grad_norm": 2.477506637573242,
+ "learning_rate": 4.6065304850901025e-06,
+ "loss": 0.5223,
+ "step": 2379
+ },
+ {
+ "epoch": 1.1252955082742317,
+ "grad_norm": 3.54127836227417,
+ "learning_rate": 4.6061944756866824e-06,
+ "loss": 0.6514,
+ "step": 2380
+ },
+ {
+ "epoch": 1.1257683215130023,
+ "grad_norm": 2.5148677825927734,
+ "learning_rate": 4.605858335139768e-06,
+ "loss": 0.4864,
+ "step": 2381
+ },
+ {
+ "epoch": 1.1262411347517731,
+ "grad_norm": 2.8363659381866455,
+ "learning_rate": 4.605522063470289e-06,
+ "loss": 0.5034,
+ "step": 2382
+ },
+ {
+ "epoch": 1.1267139479905437,
+ "grad_norm": 2.4996654987335205,
+ "learning_rate": 4.605185660699184e-06,
+ "loss": 0.4126,
+ "step": 2383
+ },
+ {
+ "epoch": 1.1271867612293145,
+ "grad_norm": 2.352543830871582,
+ "learning_rate": 4.604849126847398e-06,
+ "loss": 0.5224,
+ "step": 2384
+ },
+ {
+ "epoch": 1.127659574468085,
+ "grad_norm": 2.60101056098938,
+ "learning_rate": 4.6045124619358875e-06,
+ "loss": 0.4867,
+ "step": 2385
+ },
+ {
+ "epoch": 1.1281323877068559,
+ "grad_norm": 2.9471068382263184,
+ "learning_rate": 4.604175665985613e-06,
+ "loss": 0.6474,
+ "step": 2386
+ },
+ {
+ "epoch": 1.1286052009456264,
+ "grad_norm": 2.5933351516723633,
+ "learning_rate": 4.603838739017546e-06,
+ "loss": 0.5081,
+ "step": 2387
+ },
+ {
+ "epoch": 1.1290780141843972,
+ "grad_norm": 2.3740346431732178,
+ "learning_rate": 4.6035016810526665e-06,
+ "loss": 0.4438,
+ "step": 2388
+ },
+ {
+ "epoch": 1.1295508274231678,
+ "grad_norm": 2.675020217895508,
+ "learning_rate": 4.6031644921119614e-06,
+ "loss": 0.4968,
+ "step": 2389
+ },
+ {
+ "epoch": 1.1300236406619386,
+ "grad_norm": 2.599472999572754,
+ "learning_rate": 4.602827172216424e-06,
+ "loss": 0.5131,
+ "step": 2390
+ },
+ {
+ "epoch": 1.1304964539007092,
+ "grad_norm": 2.8176097869873047,
+ "learning_rate": 4.602489721387061e-06,
+ "loss": 0.5549,
+ "step": 2391
+ },
+ {
+ "epoch": 1.13096926713948,
+ "grad_norm": 2.466914176940918,
+ "learning_rate": 4.602152139644881e-06,
+ "loss": 0.5052,
+ "step": 2392
+ },
+ {
+ "epoch": 1.1314420803782506,
+ "grad_norm": 2.8938796520233154,
+ "learning_rate": 4.601814427010905e-06,
+ "loss": 0.6181,
+ "step": 2393
+ },
+ {
+ "epoch": 1.1319148936170214,
+ "grad_norm": 2.7390825748443604,
+ "learning_rate": 4.601476583506161e-06,
+ "loss": 0.5178,
+ "step": 2394
+ },
+ {
+ "epoch": 1.132387706855792,
+ "grad_norm": 3.180112838745117,
+ "learning_rate": 4.601138609151685e-06,
+ "loss": 0.6071,
+ "step": 2395
+ },
+ {
+ "epoch": 1.1328605200945627,
+ "grad_norm": 2.9282350540161133,
+ "learning_rate": 4.600800503968521e-06,
+ "loss": 0.5557,
+ "step": 2396
+ },
+ {
+ "epoch": 1.1333333333333333,
+ "grad_norm": 2.6689717769622803,
+ "learning_rate": 4.6004622679777215e-06,
+ "loss": 0.4679,
+ "step": 2397
+ },
+ {
+ "epoch": 1.133806146572104,
+ "grad_norm": 2.651582956314087,
+ "learning_rate": 4.600123901200347e-06,
+ "loss": 0.4907,
+ "step": 2398
+ },
+ {
+ "epoch": 1.1342789598108747,
+ "grad_norm": 2.5702924728393555,
+ "learning_rate": 4.599785403657464e-06,
+ "loss": 0.4919,
+ "step": 2399
+ },
+ {
+ "epoch": 1.1347517730496455,
+ "grad_norm": 2.636812448501587,
+ "learning_rate": 4.599446775370153e-06,
+ "loss": 0.5091,
+ "step": 2400
+ },
+ {
+ "epoch": 1.135224586288416,
+ "grad_norm": 2.5965442657470703,
+ "learning_rate": 4.599108016359497e-06,
+ "loss": 0.5035,
+ "step": 2401
+ },
+ {
+ "epoch": 1.1356973995271868,
+ "grad_norm": 2.689732313156128,
+ "learning_rate": 4.5987691266465885e-06,
+ "loss": 0.5307,
+ "step": 2402
+ },
+ {
+ "epoch": 1.1361702127659574,
+ "grad_norm": 2.7256956100463867,
+ "learning_rate": 4.59843010625253e-06,
+ "loss": 0.5066,
+ "step": 2403
+ },
+ {
+ "epoch": 1.1366430260047282,
+ "grad_norm": 2.726020574569702,
+ "learning_rate": 4.59809095519843e-06,
+ "loss": 0.4805,
+ "step": 2404
+ },
+ {
+ "epoch": 1.1371158392434988,
+ "grad_norm": 2.703339099884033,
+ "learning_rate": 4.597751673505406e-06,
+ "loss": 0.4992,
+ "step": 2405
+ },
+ {
+ "epoch": 1.1375886524822696,
+ "grad_norm": 2.54455304145813,
+ "learning_rate": 4.5974122611945835e-06,
+ "loss": 0.5251,
+ "step": 2406
+ },
+ {
+ "epoch": 1.1380614657210402,
+ "grad_norm": 2.623507022857666,
+ "learning_rate": 4.597072718287096e-06,
+ "loss": 0.4831,
+ "step": 2407
+ },
+ {
+ "epoch": 1.138534278959811,
+ "grad_norm": 2.653590202331543,
+ "learning_rate": 4.596733044804086e-06,
+ "loss": 0.5646,
+ "step": 2408
+ },
+ {
+ "epoch": 1.1390070921985815,
+ "grad_norm": 2.8230600357055664,
+ "learning_rate": 4.5963932407667035e-06,
+ "loss": 0.514,
+ "step": 2409
+ },
+ {
+ "epoch": 1.1394799054373523,
+ "grad_norm": 2.6077451705932617,
+ "learning_rate": 4.5960533061961065e-06,
+ "loss": 0.4713,
+ "step": 2410
+ },
+ {
+ "epoch": 1.139952718676123,
+ "grad_norm": 2.3945798873901367,
+ "learning_rate": 4.595713241113461e-06,
+ "loss": 0.466,
+ "step": 2411
+ },
+ {
+ "epoch": 1.1404255319148937,
+ "grad_norm": 2.8100006580352783,
+ "learning_rate": 4.595373045539941e-06,
+ "loss": 0.5365,
+ "step": 2412
+ },
+ {
+ "epoch": 1.1408983451536643,
+ "grad_norm": 2.6825881004333496,
+ "learning_rate": 4.59503271949673e-06,
+ "loss": 0.4457,
+ "step": 2413
+ },
+ {
+ "epoch": 1.141371158392435,
+ "grad_norm": 2.969435691833496,
+ "learning_rate": 4.594692263005016e-06,
+ "loss": 0.5459,
+ "step": 2414
+ },
+ {
+ "epoch": 1.1418439716312057,
+ "grad_norm": 2.4103164672851562,
+ "learning_rate": 4.594351676086002e-06,
+ "loss": 0.4573,
+ "step": 2415
+ },
+ {
+ "epoch": 1.1423167848699765,
+ "grad_norm": 2.9450128078460693,
+ "learning_rate": 4.594010958760892e-06,
+ "loss": 0.5529,
+ "step": 2416
+ },
+ {
+ "epoch": 1.142789598108747,
+ "grad_norm": 2.6416335105895996,
+ "learning_rate": 4.593670111050901e-06,
+ "loss": 0.5153,
+ "step": 2417
+ },
+ {
+ "epoch": 1.1432624113475178,
+ "grad_norm": 2.473177194595337,
+ "learning_rate": 4.593329132977253e-06,
+ "loss": 0.4962,
+ "step": 2418
+ },
+ {
+ "epoch": 1.1437352245862884,
+ "grad_norm": 2.4494502544403076,
+ "learning_rate": 4.592988024561179e-06,
+ "loss": 0.5182,
+ "step": 2419
+ },
+ {
+ "epoch": 1.1442080378250592,
+ "grad_norm": 2.773930311203003,
+ "learning_rate": 4.592646785823918e-06,
+ "loss": 0.4442,
+ "step": 2420
+ },
+ {
+ "epoch": 1.1446808510638298,
+ "grad_norm": 2.4733314514160156,
+ "learning_rate": 4.592305416786718e-06,
+ "loss": 0.5106,
+ "step": 2421
+ },
+ {
+ "epoch": 1.1451536643026006,
+ "grad_norm": 2.6870038509368896,
+ "learning_rate": 4.591963917470834e-06,
+ "loss": 0.5316,
+ "step": 2422
+ },
+ {
+ "epoch": 1.1456264775413711,
+ "grad_norm": 2.8989531993865967,
+ "learning_rate": 4.591622287897529e-06,
+ "loss": 0.5906,
+ "step": 2423
+ },
+ {
+ "epoch": 1.1460992907801417,
+ "grad_norm": 2.6349124908447266,
+ "learning_rate": 4.591280528088077e-06,
+ "loss": 0.6225,
+ "step": 2424
+ },
+ {
+ "epoch": 1.1465721040189125,
+ "grad_norm": 3.19022274017334,
+ "learning_rate": 4.5909386380637555e-06,
+ "loss": 0.555,
+ "step": 2425
+ },
+ {
+ "epoch": 1.1470449172576833,
+ "grad_norm": 3.1473541259765625,
+ "learning_rate": 4.5905966178458535e-06,
+ "loss": 0.537,
+ "step": 2426
+ },
+ {
+ "epoch": 1.147517730496454,
+ "grad_norm": 2.6996145248413086,
+ "learning_rate": 4.590254467455667e-06,
+ "loss": 0.565,
+ "step": 2427
+ },
+ {
+ "epoch": 1.1479905437352245,
+ "grad_norm": 2.830188274383545,
+ "learning_rate": 4.5899121869145015e-06,
+ "loss": 0.6773,
+ "step": 2428
+ },
+ {
+ "epoch": 1.1484633569739953,
+ "grad_norm": 2.4937260150909424,
+ "learning_rate": 4.589569776243667e-06,
+ "loss": 0.5484,
+ "step": 2429
+ },
+ {
+ "epoch": 1.148936170212766,
+ "grad_norm": 2.54011869430542,
+ "learning_rate": 4.589227235464486e-06,
+ "loss": 0.5307,
+ "step": 2430
+ },
+ {
+ "epoch": 1.1494089834515366,
+ "grad_norm": 2.8764214515686035,
+ "learning_rate": 4.5888845645982845e-06,
+ "loss": 0.5296,
+ "step": 2431
+ },
+ {
+ "epoch": 1.1498817966903072,
+ "grad_norm": 2.637033462524414,
+ "learning_rate": 4.588541763666402e-06,
+ "loss": 0.5975,
+ "step": 2432
+ },
+ {
+ "epoch": 1.150354609929078,
+ "grad_norm": 2.8534255027770996,
+ "learning_rate": 4.5881988326901815e-06,
+ "loss": 0.5431,
+ "step": 2433
+ },
+ {
+ "epoch": 1.1508274231678488,
+ "grad_norm": 2.8546559810638428,
+ "learning_rate": 4.587855771690976e-06,
+ "loss": 0.469,
+ "step": 2434
+ },
+ {
+ "epoch": 1.1513002364066194,
+ "grad_norm": 2.9084973335266113,
+ "learning_rate": 4.587512580690146e-06,
+ "loss": 0.5566,
+ "step": 2435
+ },
+ {
+ "epoch": 1.15177304964539,
+ "grad_norm": 3.0993130207061768,
+ "learning_rate": 4.587169259709063e-06,
+ "loss": 0.5612,
+ "step": 2436
+ },
+ {
+ "epoch": 1.1522458628841608,
+ "grad_norm": 10.847400665283203,
+ "learning_rate": 4.5868258087691e-06,
+ "loss": 0.4678,
+ "step": 2437
+ },
+ {
+ "epoch": 1.1527186761229316,
+ "grad_norm": 2.6648571491241455,
+ "learning_rate": 4.586482227891645e-06,
+ "loss": 0.5951,
+ "step": 2438
+ },
+ {
+ "epoch": 1.1531914893617021,
+ "grad_norm": 2.529043197631836,
+ "learning_rate": 4.586138517098091e-06,
+ "loss": 0.5048,
+ "step": 2439
+ },
+ {
+ "epoch": 1.1536643026004727,
+ "grad_norm": 2.833904504776001,
+ "learning_rate": 4.585794676409839e-06,
+ "loss": 0.536,
+ "step": 2440
+ },
+ {
+ "epoch": 1.1541371158392435,
+ "grad_norm": 3.507657766342163,
+ "learning_rate": 4.585450705848298e-06,
+ "loss": 0.5954,
+ "step": 2441
+ },
+ {
+ "epoch": 1.1546099290780143,
+ "grad_norm": 2.6108388900756836,
+ "learning_rate": 4.585106605434887e-06,
+ "loss": 0.5684,
+ "step": 2442
+ },
+ {
+ "epoch": 1.1550827423167849,
+ "grad_norm": 2.490708589553833,
+ "learning_rate": 4.58476237519103e-06,
+ "loss": 0.4678,
+ "step": 2443
+ },
+ {
+ "epoch": 1.1555555555555554,
+ "grad_norm": 2.8192343711853027,
+ "learning_rate": 4.584418015138161e-06,
+ "loss": 0.5291,
+ "step": 2444
+ },
+ {
+ "epoch": 1.1560283687943262,
+ "grad_norm": 3.0878679752349854,
+ "learning_rate": 4.584073525297722e-06,
+ "loss": 0.5691,
+ "step": 2445
+ },
+ {
+ "epoch": 1.156501182033097,
+ "grad_norm": 3.1444318294525146,
+ "learning_rate": 4.583728905691163e-06,
+ "loss": 0.5643,
+ "step": 2446
+ },
+ {
+ "epoch": 1.1569739952718676,
+ "grad_norm": 3.02382230758667,
+ "learning_rate": 4.583384156339942e-06,
+ "loss": 0.6008,
+ "step": 2447
+ },
+ {
+ "epoch": 1.1574468085106382,
+ "grad_norm": 2.5942490100860596,
+ "learning_rate": 4.583039277265525e-06,
+ "loss": 0.5105,
+ "step": 2448
+ },
+ {
+ "epoch": 1.157919621749409,
+ "grad_norm": 2.938608407974243,
+ "learning_rate": 4.582694268489386e-06,
+ "loss": 0.5123,
+ "step": 2449
+ },
+ {
+ "epoch": 1.1583924349881798,
+ "grad_norm": 2.4622268676757812,
+ "learning_rate": 4.5823491300330075e-06,
+ "loss": 0.4538,
+ "step": 2450
+ },
+ {
+ "epoch": 1.1588652482269504,
+ "grad_norm": 2.4380505084991455,
+ "learning_rate": 4.5820038619178795e-06,
+ "loss": 0.4682,
+ "step": 2451
+ },
+ {
+ "epoch": 1.159338061465721,
+ "grad_norm": 2.479896068572998,
+ "learning_rate": 4.581658464165501e-06,
+ "loss": 0.4877,
+ "step": 2452
+ },
+ {
+ "epoch": 1.1598108747044917,
+ "grad_norm": 2.3373546600341797,
+ "learning_rate": 4.5813129367973765e-06,
+ "loss": 0.445,
+ "step": 2453
+ },
+ {
+ "epoch": 1.1602836879432625,
+ "grad_norm": 2.8586013317108154,
+ "learning_rate": 4.5809672798350214e-06,
+ "loss": 0.5232,
+ "step": 2454
+ },
+ {
+ "epoch": 1.160756501182033,
+ "grad_norm": 3.2302439212799072,
+ "learning_rate": 4.5806214932999595e-06,
+ "loss": 0.5336,
+ "step": 2455
+ },
+ {
+ "epoch": 1.1612293144208037,
+ "grad_norm": 3.1005783081054688,
+ "learning_rate": 4.580275577213721e-06,
+ "loss": 0.5123,
+ "step": 2456
+ },
+ {
+ "epoch": 1.1617021276595745,
+ "grad_norm": 2.7131073474884033,
+ "learning_rate": 4.579929531597842e-06,
+ "loss": 0.5648,
+ "step": 2457
+ },
+ {
+ "epoch": 1.1621749408983453,
+ "grad_norm": 2.5067050457000732,
+ "learning_rate": 4.579583356473874e-06,
+ "loss": 0.5324,
+ "step": 2458
+ },
+ {
+ "epoch": 1.1626477541371159,
+ "grad_norm": 2.7870543003082275,
+ "learning_rate": 4.579237051863366e-06,
+ "loss": 0.5094,
+ "step": 2459
+ },
+ {
+ "epoch": 1.1631205673758864,
+ "grad_norm": 2.739196300506592,
+ "learning_rate": 4.578890617787887e-06,
+ "loss": 0.5103,
+ "step": 2460
+ },
+ {
+ "epoch": 1.1635933806146572,
+ "grad_norm": 2.7108185291290283,
+ "learning_rate": 4.578544054269003e-06,
+ "loss": 0.533,
+ "step": 2461
+ },
+ {
+ "epoch": 1.1640661938534278,
+ "grad_norm": 3.028005361557007,
+ "learning_rate": 4.578197361328295e-06,
+ "loss": 0.636,
+ "step": 2462
+ },
+ {
+ "epoch": 1.1645390070921986,
+ "grad_norm": 2.4855129718780518,
+ "learning_rate": 4.5778505389873505e-06,
+ "loss": 0.501,
+ "step": 2463
+ },
+ {
+ "epoch": 1.1650118203309692,
+ "grad_norm": 2.6314198970794678,
+ "learning_rate": 4.577503587267764e-06,
+ "loss": 0.5812,
+ "step": 2464
+ },
+ {
+ "epoch": 1.16548463356974,
+ "grad_norm": 2.4209671020507812,
+ "learning_rate": 4.5771565061911385e-06,
+ "loss": 0.5168,
+ "step": 2465
+ },
+ {
+ "epoch": 1.1659574468085105,
+ "grad_norm": 2.526388645172119,
+ "learning_rate": 4.576809295779085e-06,
+ "loss": 0.5047,
+ "step": 2466
+ },
+ {
+ "epoch": 1.1664302600472813,
+ "grad_norm": 2.8278191089630127,
+ "learning_rate": 4.576461956053224e-06,
+ "loss": 0.4759,
+ "step": 2467
+ },
+ {
+ "epoch": 1.166903073286052,
+ "grad_norm": 2.7862167358398438,
+ "learning_rate": 4.576114487035182e-06,
+ "loss": 0.5492,
+ "step": 2468
+ },
+ {
+ "epoch": 1.1673758865248227,
+ "grad_norm": 2.6303019523620605,
+ "learning_rate": 4.575766888746594e-06,
+ "loss": 0.5538,
+ "step": 2469
+ },
+ {
+ "epoch": 1.1678486997635933,
+ "grad_norm": 2.613104820251465,
+ "learning_rate": 4.5754191612091034e-06,
+ "loss": 0.5114,
+ "step": 2470
+ },
+ {
+ "epoch": 1.168321513002364,
+ "grad_norm": 2.653958320617676,
+ "learning_rate": 4.5750713044443625e-06,
+ "loss": 0.5858,
+ "step": 2471
+ },
+ {
+ "epoch": 1.1687943262411347,
+ "grad_norm": 3.1143975257873535,
+ "learning_rate": 4.574723318474031e-06,
+ "loss": 0.5193,
+ "step": 2472
+ },
+ {
+ "epoch": 1.1692671394799055,
+ "grad_norm": 3.05454421043396,
+ "learning_rate": 4.574375203319775e-06,
+ "loss": 0.464,
+ "step": 2473
+ },
+ {
+ "epoch": 1.169739952718676,
+ "grad_norm": 2.66626238822937,
+ "learning_rate": 4.574026959003272e-06,
+ "loss": 0.4988,
+ "step": 2474
+ },
+ {
+ "epoch": 1.1702127659574468,
+ "grad_norm": 2.8871963024139404,
+ "learning_rate": 4.573678585546203e-06,
+ "loss": 0.5557,
+ "step": 2475
+ },
+ {
+ "epoch": 1.1706855791962174,
+ "grad_norm": 2.592949628829956,
+ "learning_rate": 4.573330082970262e-06,
+ "loss": 0.5178,
+ "step": 2476
+ },
+ {
+ "epoch": 1.1711583924349882,
+ "grad_norm": 2.9111456871032715,
+ "learning_rate": 4.572981451297148e-06,
+ "loss": 0.5712,
+ "step": 2477
+ },
+ {
+ "epoch": 1.1716312056737588,
+ "grad_norm": 2.8152248859405518,
+ "learning_rate": 4.57263269054857e-06,
+ "loss": 0.5548,
+ "step": 2478
+ },
+ {
+ "epoch": 1.1721040189125296,
+ "grad_norm": 3.0292418003082275,
+ "learning_rate": 4.572283800746241e-06,
+ "loss": 0.5937,
+ "step": 2479
+ },
+ {
+ "epoch": 1.1725768321513002,
+ "grad_norm": 3.454618215560913,
+ "learning_rate": 4.571934781911886e-06,
+ "loss": 0.5537,
+ "step": 2480
+ },
+ {
+ "epoch": 1.173049645390071,
+ "grad_norm": 2.7817866802215576,
+ "learning_rate": 4.571585634067239e-06,
+ "loss": 0.5649,
+ "step": 2481
+ },
+ {
+ "epoch": 1.1735224586288415,
+ "grad_norm": 2.7989349365234375,
+ "learning_rate": 4.571236357234037e-06,
+ "loss": 0.5448,
+ "step": 2482
+ },
+ {
+ "epoch": 1.1739952718676123,
+ "grad_norm": 2.8863933086395264,
+ "learning_rate": 4.57088695143403e-06,
+ "loss": 0.63,
+ "step": 2483
+ },
+ {
+ "epoch": 1.174468085106383,
+ "grad_norm": 2.5738039016723633,
+ "learning_rate": 4.570537416688972e-06,
+ "loss": 0.4702,
+ "step": 2484
+ },
+ {
+ "epoch": 1.1749408983451537,
+ "grad_norm": 3.003643274307251,
+ "learning_rate": 4.570187753020629e-06,
+ "loss": 0.5918,
+ "step": 2485
+ },
+ {
+ "epoch": 1.1754137115839243,
+ "grad_norm": 2.8619167804718018,
+ "learning_rate": 4.569837960450772e-06,
+ "loss": 0.5268,
+ "step": 2486
+ },
+ {
+ "epoch": 1.175886524822695,
+ "grad_norm": 2.876077175140381,
+ "learning_rate": 4.569488039001181e-06,
+ "loss": 0.4915,
+ "step": 2487
+ },
+ {
+ "epoch": 1.1763593380614656,
+ "grad_norm": 3.407115936279297,
+ "learning_rate": 4.569137988693644e-06,
+ "loss": 0.5761,
+ "step": 2488
+ },
+ {
+ "epoch": 1.1768321513002364,
+ "grad_norm": 2.7292826175689697,
+ "learning_rate": 4.568787809549958e-06,
+ "loss": 0.541,
+ "step": 2489
+ },
+ {
+ "epoch": 1.177304964539007,
+ "grad_norm": 2.8805999755859375,
+ "learning_rate": 4.568437501591926e-06,
+ "loss": 0.6223,
+ "step": 2490
+ },
+ {
+ "epoch": 1.1777777777777778,
+ "grad_norm": 2.9264373779296875,
+ "learning_rate": 4.56808706484136e-06,
+ "loss": 0.6081,
+ "step": 2491
+ },
+ {
+ "epoch": 1.1782505910165484,
+ "grad_norm": 2.5167033672332764,
+ "learning_rate": 4.567736499320082e-06,
+ "loss": 0.5393,
+ "step": 2492
+ },
+ {
+ "epoch": 1.1787234042553192,
+ "grad_norm": 3.4647862911224365,
+ "learning_rate": 4.567385805049918e-06,
+ "loss": 0.4826,
+ "step": 2493
+ },
+ {
+ "epoch": 1.1791962174940898,
+ "grad_norm": 2.9824202060699463,
+ "learning_rate": 4.5670349820527055e-06,
+ "loss": 0.541,
+ "step": 2494
+ },
+ {
+ "epoch": 1.1796690307328606,
+ "grad_norm": 2.997105836868286,
+ "learning_rate": 4.5666840303502885e-06,
+ "loss": 0.5771,
+ "step": 2495
+ },
+ {
+ "epoch": 1.1801418439716311,
+ "grad_norm": 2.8728017807006836,
+ "learning_rate": 4.56633294996452e-06,
+ "loss": 0.4877,
+ "step": 2496
+ },
+ {
+ "epoch": 1.180614657210402,
+ "grad_norm": 2.626498222351074,
+ "learning_rate": 4.5659817409172565e-06,
+ "loss": 0.5296,
+ "step": 2497
+ },
+ {
+ "epoch": 1.1810874704491725,
+ "grad_norm": 2.87037992477417,
+ "learning_rate": 4.565630403230371e-06,
+ "loss": 0.539,
+ "step": 2498
+ },
+ {
+ "epoch": 1.1815602836879433,
+ "grad_norm": 2.5719685554504395,
+ "learning_rate": 4.5652789369257375e-06,
+ "loss": 0.5653,
+ "step": 2499
+ },
+ {
+ "epoch": 1.1820330969267139,
+ "grad_norm": 2.4842135906219482,
+ "learning_rate": 4.56492734202524e-06,
+ "loss": 0.515,
+ "step": 2500
+ },
+ {
+ "epoch": 1.1825059101654847,
+ "grad_norm": 2.640951156616211,
+ "learning_rate": 4.564575618550773e-06,
+ "loss": 0.5601,
+ "step": 2501
+ },
+ {
+ "epoch": 1.1829787234042553,
+ "grad_norm": 2.624394655227661,
+ "learning_rate": 4.564223766524234e-06,
+ "loss": 0.5551,
+ "step": 2502
+ },
+ {
+ "epoch": 1.183451536643026,
+ "grad_norm": 3.014537811279297,
+ "learning_rate": 4.563871785967533e-06,
+ "loss": 0.5212,
+ "step": 2503
+ },
+ {
+ "epoch": 1.1839243498817966,
+ "grad_norm": 2.8756890296936035,
+ "learning_rate": 4.563519676902585e-06,
+ "loss": 0.5132,
+ "step": 2504
+ },
+ {
+ "epoch": 1.1843971631205674,
+ "grad_norm": 2.636781692504883,
+ "learning_rate": 4.5631674393513145e-06,
+ "loss": 0.5323,
+ "step": 2505
+ },
+ {
+ "epoch": 1.184869976359338,
+ "grad_norm": 2.7233786582946777,
+ "learning_rate": 4.562815073335655e-06,
+ "loss": 0.5608,
+ "step": 2506
+ },
+ {
+ "epoch": 1.1853427895981088,
+ "grad_norm": 2.7158713340759277,
+ "learning_rate": 4.562462578877546e-06,
+ "loss": 0.5373,
+ "step": 2507
+ },
+ {
+ "epoch": 1.1858156028368794,
+ "grad_norm": 2.9754762649536133,
+ "learning_rate": 4.562109955998936e-06,
+ "loss": 0.5712,
+ "step": 2508
+ },
+ {
+ "epoch": 1.1862884160756502,
+ "grad_norm": 2.8815054893493652,
+ "learning_rate": 4.561757204721781e-06,
+ "loss": 0.6126,
+ "step": 2509
+ },
+ {
+ "epoch": 1.1867612293144207,
+ "grad_norm": 2.866319417953491,
+ "learning_rate": 4.561404325068045e-06,
+ "loss": 0.506,
+ "step": 2510
+ },
+ {
+ "epoch": 1.1872340425531915,
+ "grad_norm": 2.6187376976013184,
+ "learning_rate": 4.561051317059701e-06,
+ "loss": 0.4674,
+ "step": 2511
+ },
+ {
+ "epoch": 1.1877068557919621,
+ "grad_norm": 2.642552137374878,
+ "learning_rate": 4.560698180718729e-06,
+ "loss": 0.4793,
+ "step": 2512
+ },
+ {
+ "epoch": 1.188179669030733,
+ "grad_norm": 2.7815041542053223,
+ "learning_rate": 4.560344916067117e-06,
+ "loss": 0.5034,
+ "step": 2513
+ },
+ {
+ "epoch": 1.1886524822695035,
+ "grad_norm": 2.70853590965271,
+ "learning_rate": 4.559991523126862e-06,
+ "loss": 0.4811,
+ "step": 2514
+ },
+ {
+ "epoch": 1.1891252955082743,
+ "grad_norm": 2.7049436569213867,
+ "learning_rate": 4.559638001919967e-06,
+ "loss": 0.547,
+ "step": 2515
+ },
+ {
+ "epoch": 1.1895981087470449,
+ "grad_norm": 2.766773223876953,
+ "learning_rate": 4.559284352468445e-06,
+ "loss": 0.5362,
+ "step": 2516
+ },
+ {
+ "epoch": 1.1900709219858157,
+ "grad_norm": 3.0064334869384766,
+ "learning_rate": 4.558930574794316e-06,
+ "loss": 0.5915,
+ "step": 2517
+ },
+ {
+ "epoch": 1.1905437352245862,
+ "grad_norm": 2.4899885654449463,
+ "learning_rate": 4.558576668919609e-06,
+ "loss": 0.4379,
+ "step": 2518
+ },
+ {
+ "epoch": 1.191016548463357,
+ "grad_norm": 2.925963878631592,
+ "learning_rate": 4.558222634866358e-06,
+ "loss": 0.5389,
+ "step": 2519
+ },
+ {
+ "epoch": 1.1914893617021276,
+ "grad_norm": 6.087667465209961,
+ "learning_rate": 4.55786847265661e-06,
+ "loss": 0.4777,
+ "step": 2520
+ },
+ {
+ "epoch": 1.1919621749408984,
+ "grad_norm": 2.4560582637786865,
+ "learning_rate": 4.5575141823124145e-06,
+ "loss": 0.5576,
+ "step": 2521
+ },
+ {
+ "epoch": 1.192434988179669,
+ "grad_norm": 3.184252977371216,
+ "learning_rate": 4.557159763855834e-06,
+ "loss": 0.5151,
+ "step": 2522
+ },
+ {
+ "epoch": 1.1929078014184398,
+ "grad_norm": 2.359722137451172,
+ "learning_rate": 4.556805217308935e-06,
+ "loss": 0.478,
+ "step": 2523
+ },
+ {
+ "epoch": 1.1933806146572103,
+ "grad_norm": 3.0821568965911865,
+ "learning_rate": 4.5564505426937935e-06,
+ "loss": 0.5784,
+ "step": 2524
+ },
+ {
+ "epoch": 1.1938534278959811,
+ "grad_norm": 2.9905128479003906,
+ "learning_rate": 4.5560957400324936e-06,
+ "loss": 0.6087,
+ "step": 2525
+ },
+ {
+ "epoch": 1.1943262411347517,
+ "grad_norm": 2.462102174758911,
+ "learning_rate": 4.555740809347128e-06,
+ "loss": 0.4739,
+ "step": 2526
+ },
+ {
+ "epoch": 1.1947990543735225,
+ "grad_norm": 2.7931067943573,
+ "learning_rate": 4.555385750659796e-06,
+ "loss": 0.4961,
+ "step": 2527
+ },
+ {
+ "epoch": 1.195271867612293,
+ "grad_norm": 2.660320997238159,
+ "learning_rate": 4.555030563992607e-06,
+ "loss": 0.487,
+ "step": 2528
+ },
+ {
+ "epoch": 1.195744680851064,
+ "grad_norm": 2.8135557174682617,
+ "learning_rate": 4.554675249367675e-06,
+ "loss": 0.5269,
+ "step": 2529
+ },
+ {
+ "epoch": 1.1962174940898345,
+ "grad_norm": 2.661933422088623,
+ "learning_rate": 4.554319806807126e-06,
+ "loss": 0.4723,
+ "step": 2530
+ },
+ {
+ "epoch": 1.1966903073286053,
+ "grad_norm": 2.568176507949829,
+ "learning_rate": 4.553964236333089e-06,
+ "loss": 0.5258,
+ "step": 2531
+ },
+ {
+ "epoch": 1.1971631205673758,
+ "grad_norm": 2.6890947818756104,
+ "learning_rate": 4.553608537967705e-06,
+ "loss": 0.4965,
+ "step": 2532
+ },
+ {
+ "epoch": 1.1976359338061466,
+ "grad_norm": 3.133470058441162,
+ "learning_rate": 4.553252711733124e-06,
+ "loss": 0.5423,
+ "step": 2533
+ },
+ {
+ "epoch": 1.1981087470449172,
+ "grad_norm": 2.7086687088012695,
+ "learning_rate": 4.552896757651498e-06,
+ "loss": 0.5326,
+ "step": 2534
+ },
+ {
+ "epoch": 1.198581560283688,
+ "grad_norm": 2.8411715030670166,
+ "learning_rate": 4.552540675744994e-06,
+ "loss": 0.5793,
+ "step": 2535
+ },
+ {
+ "epoch": 1.1990543735224586,
+ "grad_norm": 3.041077136993408,
+ "learning_rate": 4.552184466035782e-06,
+ "loss": 0.5068,
+ "step": 2536
+ },
+ {
+ "epoch": 1.1995271867612294,
+ "grad_norm": 2.5921192169189453,
+ "learning_rate": 4.551828128546041e-06,
+ "loss": 0.5189,
+ "step": 2537
+ },
+ {
+ "epoch": 1.2,
+ "grad_norm": 2.923305034637451,
+ "learning_rate": 4.5514716632979605e-06,
+ "loss": 0.516,
+ "step": 2538
+ },
+ {
+ "epoch": 1.2004728132387708,
+ "grad_norm": 2.7083024978637695,
+ "learning_rate": 4.551115070313734e-06,
+ "loss": 0.4825,
+ "step": 2539
+ },
+ {
+ "epoch": 1.2009456264775413,
+ "grad_norm": 2.746842384338379,
+ "learning_rate": 4.550758349615567e-06,
+ "loss": 0.5691,
+ "step": 2540
+ },
+ {
+ "epoch": 1.2014184397163121,
+ "grad_norm": 2.6596429347991943,
+ "learning_rate": 4.550401501225669e-06,
+ "loss": 0.5983,
+ "step": 2541
+ },
+ {
+ "epoch": 1.2018912529550827,
+ "grad_norm": 2.9057931900024414,
+ "learning_rate": 4.550044525166261e-06,
+ "loss": 0.5069,
+ "step": 2542
+ },
+ {
+ "epoch": 1.2023640661938535,
+ "grad_norm": 2.6139039993286133,
+ "learning_rate": 4.5496874214595686e-06,
+ "loss": 0.5102,
+ "step": 2543
+ },
+ {
+ "epoch": 1.202836879432624,
+ "grad_norm": 2.630286455154419,
+ "learning_rate": 4.5493301901278285e-06,
+ "loss": 0.4902,
+ "step": 2544
+ },
+ {
+ "epoch": 1.2033096926713949,
+ "grad_norm": 2.639174222946167,
+ "learning_rate": 4.548972831193284e-06,
+ "loss": 0.4566,
+ "step": 2545
+ },
+ {
+ "epoch": 1.2037825059101654,
+ "grad_norm": 2.9569664001464844,
+ "learning_rate": 4.548615344678186e-06,
+ "loss": 0.5636,
+ "step": 2546
+ },
+ {
+ "epoch": 1.2042553191489362,
+ "grad_norm": 2.981734037399292,
+ "learning_rate": 4.5482577306047924e-06,
+ "loss": 0.4884,
+ "step": 2547
+ },
+ {
+ "epoch": 1.2047281323877068,
+ "grad_norm": 2.6760342121124268,
+ "learning_rate": 4.547899988995371e-06,
+ "loss": 0.5426,
+ "step": 2548
+ },
+ {
+ "epoch": 1.2052009456264776,
+ "grad_norm": 2.825805902481079,
+ "learning_rate": 4.547542119872198e-06,
+ "loss": 0.4989,
+ "step": 2549
+ },
+ {
+ "epoch": 1.2056737588652482,
+ "grad_norm": 2.856426954269409,
+ "learning_rate": 4.547184123257555e-06,
+ "loss": 0.5734,
+ "step": 2550
+ },
+ {
+ "epoch": 1.206146572104019,
+ "grad_norm": 2.555682420730591,
+ "learning_rate": 4.5468259991737334e-06,
+ "loss": 0.5299,
+ "step": 2551
+ },
+ {
+ "epoch": 1.2066193853427896,
+ "grad_norm": 2.6324024200439453,
+ "learning_rate": 4.546467747643032e-06,
+ "loss": 0.5906,
+ "step": 2552
+ },
+ {
+ "epoch": 1.2070921985815604,
+ "grad_norm": 3.4145350456237793,
+ "learning_rate": 4.546109368687757e-06,
+ "loss": 0.5153,
+ "step": 2553
+ },
+ {
+ "epoch": 1.207565011820331,
+ "grad_norm": 2.658691644668579,
+ "learning_rate": 4.545750862330225e-06,
+ "loss": 0.5759,
+ "step": 2554
+ },
+ {
+ "epoch": 1.2080378250591017,
+ "grad_norm": 3.162605047225952,
+ "learning_rate": 4.545392228592755e-06,
+ "loss": 0.5379,
+ "step": 2555
+ },
+ {
+ "epoch": 1.2085106382978723,
+ "grad_norm": 2.8631198406219482,
+ "learning_rate": 4.545033467497681e-06,
+ "loss": 0.5959,
+ "step": 2556
+ },
+ {
+ "epoch": 1.208983451536643,
+ "grad_norm": 2.457109212875366,
+ "learning_rate": 4.54467457906734e-06,
+ "loss": 0.4864,
+ "step": 2557
+ },
+ {
+ "epoch": 1.2094562647754137,
+ "grad_norm": 2.5307061672210693,
+ "learning_rate": 4.544315563324078e-06,
+ "loss": 0.5308,
+ "step": 2558
+ },
+ {
+ "epoch": 1.2099290780141845,
+ "grad_norm": 2.8482773303985596,
+ "learning_rate": 4.543956420290251e-06,
+ "loss": 0.5126,
+ "step": 2559
+ },
+ {
+ "epoch": 1.210401891252955,
+ "grad_norm": 2.4990832805633545,
+ "learning_rate": 4.5435971499882195e-06,
+ "loss": 0.4534,
+ "step": 2560
+ },
+ {
+ "epoch": 1.2108747044917259,
+ "grad_norm": 2.6292665004730225,
+ "learning_rate": 4.543237752440354e-06,
+ "loss": 0.4434,
+ "step": 2561
+ },
+ {
+ "epoch": 1.2113475177304964,
+ "grad_norm": 2.865983247756958,
+ "learning_rate": 4.542878227669033e-06,
+ "loss": 0.5667,
+ "step": 2562
+ },
+ {
+ "epoch": 1.2118203309692672,
+ "grad_norm": 2.745614528656006,
+ "learning_rate": 4.542518575696644e-06,
+ "loss": 0.4724,
+ "step": 2563
+ },
+ {
+ "epoch": 1.2122931442080378,
+ "grad_norm": 2.8562581539154053,
+ "learning_rate": 4.5421587965455785e-06,
+ "loss": 0.5405,
+ "step": 2564
+ },
+ {
+ "epoch": 1.2127659574468086,
+ "grad_norm": 2.6670095920562744,
+ "learning_rate": 4.5417988902382385e-06,
+ "loss": 0.5432,
+ "step": 2565
+ },
+ {
+ "epoch": 1.2132387706855792,
+ "grad_norm": 2.9320743083953857,
+ "learning_rate": 4.541438856797036e-06,
+ "loss": 0.5862,
+ "step": 2566
+ },
+ {
+ "epoch": 1.21371158392435,
+ "grad_norm": 2.577505588531494,
+ "learning_rate": 4.541078696244386e-06,
+ "loss": 0.4742,
+ "step": 2567
+ },
+ {
+ "epoch": 1.2141843971631205,
+ "grad_norm": 3.4476120471954346,
+ "learning_rate": 4.540718408602717e-06,
+ "loss": 0.5903,
+ "step": 2568
+ },
+ {
+ "epoch": 1.2146572104018913,
+ "grad_norm": 2.816210985183716,
+ "learning_rate": 4.540357993894459e-06,
+ "loss": 0.5033,
+ "step": 2569
+ },
+ {
+ "epoch": 1.215130023640662,
+ "grad_norm": 3.0806639194488525,
+ "learning_rate": 4.539997452142058e-06,
+ "loss": 0.6064,
+ "step": 2570
+ },
+ {
+ "epoch": 1.2156028368794327,
+ "grad_norm": 2.563060760498047,
+ "learning_rate": 4.5396367833679586e-06,
+ "loss": 0.5597,
+ "step": 2571
+ },
+ {
+ "epoch": 1.2160756501182033,
+ "grad_norm": 3.1014397144317627,
+ "learning_rate": 4.5392759875946215e-06,
+ "loss": 0.54,
+ "step": 2572
+ },
+ {
+ "epoch": 1.216548463356974,
+ "grad_norm": 3.124190330505371,
+ "learning_rate": 4.53891506484451e-06,
+ "loss": 0.5122,
+ "step": 2573
+ },
+ {
+ "epoch": 1.2170212765957447,
+ "grad_norm": 2.6688716411590576,
+ "learning_rate": 4.538554015140097e-06,
+ "loss": 0.5615,
+ "step": 2574
+ },
+ {
+ "epoch": 1.2174940898345155,
+ "grad_norm": 2.775543689727783,
+ "learning_rate": 4.538192838503866e-06,
+ "loss": 0.496,
+ "step": 2575
+ },
+ {
+ "epoch": 1.217966903073286,
+ "grad_norm": 2.7877283096313477,
+ "learning_rate": 4.537831534958303e-06,
+ "loss": 0.4995,
+ "step": 2576
+ },
+ {
+ "epoch": 1.2184397163120568,
+ "grad_norm": 2.824810028076172,
+ "learning_rate": 4.537470104525906e-06,
+ "loss": 0.5481,
+ "step": 2577
+ },
+ {
+ "epoch": 1.2189125295508274,
+ "grad_norm": 2.801269292831421,
+ "learning_rate": 4.53710854722918e-06,
+ "loss": 0.5628,
+ "step": 2578
+ },
+ {
+ "epoch": 1.2193853427895982,
+ "grad_norm": 2.7780683040618896,
+ "learning_rate": 4.536746863090637e-06,
+ "loss": 0.4845,
+ "step": 2579
+ },
+ {
+ "epoch": 1.2198581560283688,
+ "grad_norm": 2.536010265350342,
+ "learning_rate": 4.536385052132798e-06,
+ "loss": 0.4771,
+ "step": 2580
+ },
+ {
+ "epoch": 1.2203309692671396,
+ "grad_norm": 2.768775701522827,
+ "learning_rate": 4.536023114378191e-06,
+ "loss": 0.5366,
+ "step": 2581
+ },
+ {
+ "epoch": 1.2208037825059102,
+ "grad_norm": 2.658125877380371,
+ "learning_rate": 4.535661049849352e-06,
+ "loss": 0.524,
+ "step": 2582
+ },
+ {
+ "epoch": 1.2212765957446807,
+ "grad_norm": 2.558696746826172,
+ "learning_rate": 4.535298858568825e-06,
+ "loss": 0.5482,
+ "step": 2583
+ },
+ {
+ "epoch": 1.2217494089834515,
+ "grad_norm": 2.5284535884857178,
+ "learning_rate": 4.534936540559164e-06,
+ "loss": 0.4454,
+ "step": 2584
+ },
+ {
+ "epoch": 1.2222222222222223,
+ "grad_norm": 7.617330074310303,
+ "learning_rate": 4.534574095842927e-06,
+ "loss": 0.5615,
+ "step": 2585
+ },
+ {
+ "epoch": 1.222695035460993,
+ "grad_norm": 2.9120311737060547,
+ "learning_rate": 4.534211524442682e-06,
+ "loss": 0.5624,
+ "step": 2586
+ },
+ {
+ "epoch": 1.2231678486997635,
+ "grad_norm": 2.5004289150238037,
+ "learning_rate": 4.533848826381005e-06,
+ "loss": 0.4743,
+ "step": 2587
+ },
+ {
+ "epoch": 1.2236406619385343,
+ "grad_norm": 2.8395533561706543,
+ "learning_rate": 4.53348600168048e-06,
+ "loss": 0.4457,
+ "step": 2588
+ },
+ {
+ "epoch": 1.224113475177305,
+ "grad_norm": 2.832211494445801,
+ "learning_rate": 4.533123050363699e-06,
+ "loss": 0.5559,
+ "step": 2589
+ },
+ {
+ "epoch": 1.2245862884160756,
+ "grad_norm": 2.6318583488464355,
+ "learning_rate": 4.53275997245326e-06,
+ "loss": 0.5281,
+ "step": 2590
+ },
+ {
+ "epoch": 1.2250591016548462,
+ "grad_norm": 3.0509233474731445,
+ "learning_rate": 4.532396767971771e-06,
+ "loss": 0.6003,
+ "step": 2591
+ },
+ {
+ "epoch": 1.225531914893617,
+ "grad_norm": 2.6863620281219482,
+ "learning_rate": 4.532033436941847e-06,
+ "loss": 0.5219,
+ "step": 2592
+ },
+ {
+ "epoch": 1.2260047281323878,
+ "grad_norm": 2.401463747024536,
+ "learning_rate": 4.5316699793861104e-06,
+ "loss": 0.5994,
+ "step": 2593
+ },
+ {
+ "epoch": 1.2264775413711584,
+ "grad_norm": 2.613517999649048,
+ "learning_rate": 4.531306395327194e-06,
+ "loss": 0.5785,
+ "step": 2594
+ },
+ {
+ "epoch": 1.226950354609929,
+ "grad_norm": 2.5016374588012695,
+ "learning_rate": 4.530942684787735e-06,
+ "loss": 0.5695,
+ "step": 2595
+ },
+ {
+ "epoch": 1.2274231678486998,
+ "grad_norm": 2.576464891433716,
+ "learning_rate": 4.53057884779038e-06,
+ "loss": 0.4427,
+ "step": 2596
+ },
+ {
+ "epoch": 1.2278959810874706,
+ "grad_norm": 2.5688700675964355,
+ "learning_rate": 4.530214884357785e-06,
+ "loss": 0.4966,
+ "step": 2597
+ },
+ {
+ "epoch": 1.2283687943262411,
+ "grad_norm": 3.179013729095459,
+ "learning_rate": 4.52985079451261e-06,
+ "loss": 0.5239,
+ "step": 2598
+ },
+ {
+ "epoch": 1.2288416075650117,
+ "grad_norm": 2.6015284061431885,
+ "learning_rate": 4.529486578277527e-06,
+ "loss": 0.5135,
+ "step": 2599
+ },
+ {
+ "epoch": 1.2293144208037825,
+ "grad_norm": 2.3029589653015137,
+ "learning_rate": 4.529122235675214e-06,
+ "loss": 0.4044,
+ "step": 2600
+ },
+ {
+ "epoch": 1.2297872340425533,
+ "grad_norm": 2.994093656539917,
+ "learning_rate": 4.528757766728357e-06,
+ "loss": 0.5419,
+ "step": 2601
+ },
+ {
+ "epoch": 1.2302600472813239,
+ "grad_norm": 2.6297390460968018,
+ "learning_rate": 4.52839317145965e-06,
+ "loss": 0.488,
+ "step": 2602
+ },
+ {
+ "epoch": 1.2307328605200945,
+ "grad_norm": 2.4814043045043945,
+ "learning_rate": 4.528028449891793e-06,
+ "loss": 0.4917,
+ "step": 2603
+ },
+ {
+ "epoch": 1.2312056737588652,
+ "grad_norm": 3.6052863597869873,
+ "learning_rate": 4.527663602047499e-06,
+ "loss": 0.5301,
+ "step": 2604
+ },
+ {
+ "epoch": 1.231678486997636,
+ "grad_norm": 2.6984751224517822,
+ "learning_rate": 4.5272986279494825e-06,
+ "loss": 0.5253,
+ "step": 2605
+ },
+ {
+ "epoch": 1.2321513002364066,
+ "grad_norm": 2.514000415802002,
+ "learning_rate": 4.526933527620469e-06,
+ "loss": 0.5661,
+ "step": 2606
+ },
+ {
+ "epoch": 1.2326241134751772,
+ "grad_norm": 2.890921115875244,
+ "learning_rate": 4.526568301083195e-06,
+ "loss": 0.5585,
+ "step": 2607
+ },
+ {
+ "epoch": 1.233096926713948,
+ "grad_norm": 2.6390011310577393,
+ "learning_rate": 4.526202948360397e-06,
+ "loss": 0.5168,
+ "step": 2608
+ },
+ {
+ "epoch": 1.2335697399527188,
+ "grad_norm": 2.7370636463165283,
+ "learning_rate": 4.5258374694748266e-06,
+ "loss": 0.5453,
+ "step": 2609
+ },
+ {
+ "epoch": 1.2340425531914894,
+ "grad_norm": 2.8203976154327393,
+ "learning_rate": 4.52547186444924e-06,
+ "loss": 0.5763,
+ "step": 2610
+ },
+ {
+ "epoch": 1.23451536643026,
+ "grad_norm": 2.7567849159240723,
+ "learning_rate": 4.5251061333064025e-06,
+ "loss": 0.5194,
+ "step": 2611
+ },
+ {
+ "epoch": 1.2349881796690307,
+ "grad_norm": 2.767519474029541,
+ "learning_rate": 4.524740276069085e-06,
+ "loss": 0.5355,
+ "step": 2612
+ },
+ {
+ "epoch": 1.2354609929078015,
+ "grad_norm": 3.072035312652588,
+ "learning_rate": 4.5243742927600695e-06,
+ "loss": 0.5391,
+ "step": 2613
+ },
+ {
+ "epoch": 1.2359338061465721,
+ "grad_norm": 2.5957462787628174,
+ "learning_rate": 4.524008183402143e-06,
+ "loss": 0.5645,
+ "step": 2614
+ },
+ {
+ "epoch": 1.2364066193853427,
+ "grad_norm": 2.774897575378418,
+ "learning_rate": 4.523641948018101e-06,
+ "loss": 0.5576,
+ "step": 2615
+ },
+ {
+ "epoch": 1.2368794326241135,
+ "grad_norm": 2.635887622833252,
+ "learning_rate": 4.5232755866307496e-06,
+ "loss": 0.5254,
+ "step": 2616
+ },
+ {
+ "epoch": 1.2373522458628843,
+ "grad_norm": 2.4860997200012207,
+ "learning_rate": 4.522909099262899e-06,
+ "loss": 0.4692,
+ "step": 2617
+ },
+ {
+ "epoch": 1.2378250591016549,
+ "grad_norm": 2.595513105392456,
+ "learning_rate": 4.522542485937369e-06,
+ "loss": 0.5166,
+ "step": 2618
+ },
+ {
+ "epoch": 1.2382978723404254,
+ "grad_norm": 2.961474895477295,
+ "learning_rate": 4.522175746676986e-06,
+ "loss": 0.5455,
+ "step": 2619
+ },
+ {
+ "epoch": 1.2387706855791962,
+ "grad_norm": 2.813889741897583,
+ "learning_rate": 4.521808881504588e-06,
+ "loss": 0.5249,
+ "step": 2620
+ },
+ {
+ "epoch": 1.239243498817967,
+ "grad_norm": 2.8434813022613525,
+ "learning_rate": 4.521441890443015e-06,
+ "loss": 0.472,
+ "step": 2621
+ },
+ {
+ "epoch": 1.2397163120567376,
+ "grad_norm": 2.4264845848083496,
+ "learning_rate": 4.521074773515119e-06,
+ "loss": 0.4783,
+ "step": 2622
+ },
+ {
+ "epoch": 1.2401891252955082,
+ "grad_norm": 2.615169048309326,
+ "learning_rate": 4.520707530743761e-06,
+ "loss": 0.5324,
+ "step": 2623
+ },
+ {
+ "epoch": 1.240661938534279,
+ "grad_norm": 2.6772537231445312,
+ "learning_rate": 4.520340162151803e-06,
+ "loss": 0.5224,
+ "step": 2624
+ },
+ {
+ "epoch": 1.2411347517730495,
+ "grad_norm": 2.683393955230713,
+ "learning_rate": 4.519972667762124e-06,
+ "loss": 0.4863,
+ "step": 2625
+ },
+ {
+ "epoch": 1.2416075650118203,
+ "grad_norm": 3.0335750579833984,
+ "learning_rate": 4.519605047597603e-06,
+ "loss": 0.544,
+ "step": 2626
+ },
+ {
+ "epoch": 1.242080378250591,
+ "grad_norm": 2.8694353103637695,
+ "learning_rate": 4.519237301681132e-06,
+ "loss": 0.5576,
+ "step": 2627
+ },
+ {
+ "epoch": 1.2425531914893617,
+ "grad_norm": 3.217808246612549,
+ "learning_rate": 4.518869430035609e-06,
+ "loss": 0.5459,
+ "step": 2628
+ },
+ {
+ "epoch": 1.2430260047281323,
+ "grad_norm": 2.7700083255767822,
+ "learning_rate": 4.518501432683937e-06,
+ "loss": 0.5579,
+ "step": 2629
+ },
+ {
+ "epoch": 1.243498817966903,
+ "grad_norm": 2.4759175777435303,
+ "learning_rate": 4.5181333096490335e-06,
+ "loss": 0.5049,
+ "step": 2630
+ },
+ {
+ "epoch": 1.2439716312056737,
+ "grad_norm": 2.8652584552764893,
+ "learning_rate": 4.517765060953818e-06,
+ "loss": 0.5366,
+ "step": 2631
+ },
+ {
+ "epoch": 1.2444444444444445,
+ "grad_norm": 2.776334524154663,
+ "learning_rate": 4.517396686621218e-06,
+ "loss": 0.5677,
+ "step": 2632
+ },
+ {
+ "epoch": 1.244917257683215,
+ "grad_norm": 2.676708221435547,
+ "learning_rate": 4.517028186674174e-06,
+ "loss": 0.5055,
+ "step": 2633
+ },
+ {
+ "epoch": 1.2453900709219858,
+ "grad_norm": 2.6851537227630615,
+ "learning_rate": 4.516659561135629e-06,
+ "loss": 0.5537,
+ "step": 2634
+ },
+ {
+ "epoch": 1.2458628841607564,
+ "grad_norm": 2.619971513748169,
+ "learning_rate": 4.516290810028536e-06,
+ "loss": 0.5765,
+ "step": 2635
+ },
+ {
+ "epoch": 1.2463356973995272,
+ "grad_norm": 2.7302334308624268,
+ "learning_rate": 4.515921933375855e-06,
+ "loss": 0.5611,
+ "step": 2636
+ },
+ {
+ "epoch": 1.2468085106382978,
+ "grad_norm": 2.5005829334259033,
+ "learning_rate": 4.5155529312005554e-06,
+ "loss": 0.442,
+ "step": 2637
+ },
+ {
+ "epoch": 1.2472813238770686,
+ "grad_norm": 2.713587522506714,
+ "learning_rate": 4.515183803525612e-06,
+ "loss": 0.5023,
+ "step": 2638
+ },
+ {
+ "epoch": 1.2477541371158392,
+ "grad_norm": 2.5146236419677734,
+ "learning_rate": 4.514814550374009e-06,
+ "loss": 0.5195,
+ "step": 2639
+ },
+ {
+ "epoch": 1.24822695035461,
+ "grad_norm": 2.761060953140259,
+ "learning_rate": 4.51444517176874e-06,
+ "loss": 0.5138,
+ "step": 2640
+ },
+ {
+ "epoch": 1.2486997635933805,
+ "grad_norm": 3.082329273223877,
+ "learning_rate": 4.5140756677328026e-06,
+ "loss": 0.6105,
+ "step": 2641
+ },
+ {
+ "epoch": 1.2491725768321513,
+ "grad_norm": 2.6933493614196777,
+ "learning_rate": 4.513706038289205e-06,
+ "loss": 0.5185,
+ "step": 2642
+ },
+ {
+ "epoch": 1.249645390070922,
+ "grad_norm": 2.515856981277466,
+ "learning_rate": 4.513336283460962e-06,
+ "loss": 0.5375,
+ "step": 2643
+ },
+ {
+ "epoch": 1.2501182033096927,
+ "grad_norm": 2.8553731441497803,
+ "learning_rate": 4.512966403271096e-06,
+ "loss": 0.5582,
+ "step": 2644
+ },
+ {
+ "epoch": 1.2505910165484633,
+ "grad_norm": 2.640880823135376,
+ "learning_rate": 4.5125963977426405e-06,
+ "loss": 0.5125,
+ "step": 2645
+ },
+ {
+ "epoch": 1.251063829787234,
+ "grad_norm": 2.9845943450927734,
+ "learning_rate": 4.512226266898631e-06,
+ "loss": 0.4749,
+ "step": 2646
+ },
+ {
+ "epoch": 1.2515366430260046,
+ "grad_norm": 2.5131032466888428,
+ "learning_rate": 4.511856010762116e-06,
+ "loss": 0.4764,
+ "step": 2647
+ },
+ {
+ "epoch": 1.2520094562647754,
+ "grad_norm": 2.370638370513916,
+ "learning_rate": 4.511485629356148e-06,
+ "loss": 0.5153,
+ "step": 2648
+ },
+ {
+ "epoch": 1.252482269503546,
+ "grad_norm": 2.912461996078491,
+ "learning_rate": 4.511115122703791e-06,
+ "loss": 0.6117,
+ "step": 2649
+ },
+ {
+ "epoch": 1.2529550827423168,
+ "grad_norm": 2.7308082580566406,
+ "learning_rate": 4.510744490828113e-06,
+ "loss": 0.5076,
+ "step": 2650
+ },
+ {
+ "epoch": 1.2534278959810874,
+ "grad_norm": 2.8524296283721924,
+ "learning_rate": 4.510373733752193e-06,
+ "loss": 0.542,
+ "step": 2651
+ },
+ {
+ "epoch": 1.2539007092198582,
+ "grad_norm": 2.799377202987671,
+ "learning_rate": 4.5100028514991145e-06,
+ "loss": 0.486,
+ "step": 2652
+ },
+ {
+ "epoch": 1.2543735224586288,
+ "grad_norm": 2.7248027324676514,
+ "learning_rate": 4.509631844091973e-06,
+ "loss": 0.4972,
+ "step": 2653
+ },
+ {
+ "epoch": 1.2548463356973996,
+ "grad_norm": 2.8041458129882812,
+ "learning_rate": 4.5092607115538686e-06,
+ "loss": 0.588,
+ "step": 2654
+ },
+ {
+ "epoch": 1.2553191489361701,
+ "grad_norm": 2.679417133331299,
+ "learning_rate": 4.50888945390791e-06,
+ "loss": 0.4639,
+ "step": 2655
+ },
+ {
+ "epoch": 1.255791962174941,
+ "grad_norm": 3.1049270629882812,
+ "learning_rate": 4.508518071177214e-06,
+ "loss": 0.5857,
+ "step": 2656
+ },
+ {
+ "epoch": 1.2562647754137115,
+ "grad_norm": 2.8590362071990967,
+ "learning_rate": 4.508146563384904e-06,
+ "loss": 0.5451,
+ "step": 2657
+ },
+ {
+ "epoch": 1.2567375886524823,
+ "grad_norm": 2.9774081707000732,
+ "learning_rate": 4.507774930554114e-06,
+ "loss": 0.5493,
+ "step": 2658
+ },
+ {
+ "epoch": 1.2572104018912529,
+ "grad_norm": 2.617643356323242,
+ "learning_rate": 4.507403172707983e-06,
+ "loss": 0.5472,
+ "step": 2659
+ },
+ {
+ "epoch": 1.2576832151300237,
+ "grad_norm": 2.9195587635040283,
+ "learning_rate": 4.507031289869658e-06,
+ "loss": 0.5403,
+ "step": 2660
+ },
+ {
+ "epoch": 1.2581560283687943,
+ "grad_norm": 2.706089496612549,
+ "learning_rate": 4.506659282062295e-06,
+ "loss": 0.4899,
+ "step": 2661
+ },
+ {
+ "epoch": 1.258628841607565,
+ "grad_norm": 2.8229358196258545,
+ "learning_rate": 4.506287149309057e-06,
+ "loss": 0.5336,
+ "step": 2662
+ },
+ {
+ "epoch": 1.2591016548463356,
+ "grad_norm": 2.5295674800872803,
+ "learning_rate": 4.505914891633117e-06,
+ "loss": 0.4806,
+ "step": 2663
+ },
+ {
+ "epoch": 1.2595744680851064,
+ "grad_norm": 3.098208427429199,
+ "learning_rate": 4.505542509057651e-06,
+ "loss": 0.6039,
+ "step": 2664
+ },
+ {
+ "epoch": 1.260047281323877,
+ "grad_norm": 2.5118041038513184,
+ "learning_rate": 4.5051700016058475e-06,
+ "loss": 0.5279,
+ "step": 2665
+ },
+ {
+ "epoch": 1.2605200945626478,
+ "grad_norm": 2.6901369094848633,
+ "learning_rate": 4.5047973693009005e-06,
+ "loss": 0.5515,
+ "step": 2666
+ },
+ {
+ "epoch": 1.2609929078014184,
+ "grad_norm": 2.5622377395629883,
+ "learning_rate": 4.504424612166012e-06,
+ "loss": 0.5405,
+ "step": 2667
+ },
+ {
+ "epoch": 1.2614657210401892,
+ "grad_norm": 2.685751438140869,
+ "learning_rate": 4.5040517302243915e-06,
+ "loss": 0.5797,
+ "step": 2668
+ },
+ {
+ "epoch": 1.2619385342789597,
+ "grad_norm": 2.8525350093841553,
+ "learning_rate": 4.503678723499259e-06,
+ "loss": 0.5561,
+ "step": 2669
+ },
+ {
+ "epoch": 1.2624113475177305,
+ "grad_norm": 2.803386926651001,
+ "learning_rate": 4.503305592013836e-06,
+ "loss": 0.5376,
+ "step": 2670
+ },
+ {
+ "epoch": 1.2628841607565011,
+ "grad_norm": 2.78633189201355,
+ "learning_rate": 4.502932335791359e-06,
+ "loss": 0.4739,
+ "step": 2671
+ },
+ {
+ "epoch": 1.263356973995272,
+ "grad_norm": 2.8337297439575195,
+ "learning_rate": 4.502558954855069e-06,
+ "loss": 0.5406,
+ "step": 2672
+ },
+ {
+ "epoch": 1.2638297872340425,
+ "grad_norm": 2.610275983810425,
+ "learning_rate": 4.502185449228213e-06,
+ "loss": 0.5343,
+ "step": 2673
+ },
+ {
+ "epoch": 1.2643026004728133,
+ "grad_norm": 2.7842252254486084,
+ "learning_rate": 4.501811818934048e-06,
+ "loss": 0.532,
+ "step": 2674
+ },
+ {
+ "epoch": 1.2647754137115839,
+ "grad_norm": 2.4472389221191406,
+ "learning_rate": 4.501438063995839e-06,
+ "loss": 0.4976,
+ "step": 2675
+ },
+ {
+ "epoch": 1.2652482269503547,
+ "grad_norm": 3.076580762863159,
+ "learning_rate": 4.501064184436858e-06,
+ "loss": 0.507,
+ "step": 2676
+ },
+ {
+ "epoch": 1.2657210401891252,
+ "grad_norm": 2.5952908992767334,
+ "learning_rate": 4.500690180280384e-06,
+ "loss": 0.5498,
+ "step": 2677
+ },
+ {
+ "epoch": 1.266193853427896,
+ "grad_norm": 2.476943016052246,
+ "learning_rate": 4.500316051549706e-06,
+ "loss": 0.557,
+ "step": 2678
+ },
+ {
+ "epoch": 1.2666666666666666,
+ "grad_norm": 2.730579376220703,
+ "learning_rate": 4.499941798268118e-06,
+ "loss": 0.4975,
+ "step": 2679
+ },
+ {
+ "epoch": 1.2671394799054374,
+ "grad_norm": 2.7916698455810547,
+ "learning_rate": 4.499567420458924e-06,
+ "loss": 0.5673,
+ "step": 2680
+ },
+ {
+ "epoch": 1.267612293144208,
+ "grad_norm": 2.4249091148376465,
+ "learning_rate": 4.4991929181454355e-06,
+ "loss": 0.4836,
+ "step": 2681
+ },
+ {
+ "epoch": 1.2680851063829788,
+ "grad_norm": 2.661911725997925,
+ "learning_rate": 4.498818291350969e-06,
+ "loss": 0.5332,
+ "step": 2682
+ },
+ {
+ "epoch": 1.2685579196217494,
+ "grad_norm": 2.693657875061035,
+ "learning_rate": 4.498443540098852e-06,
+ "loss": 0.5257,
+ "step": 2683
+ },
+ {
+ "epoch": 1.2690307328605201,
+ "grad_norm": 2.609386682510376,
+ "learning_rate": 4.4980686644124195e-06,
+ "loss": 0.4918,
+ "step": 2684
+ },
+ {
+ "epoch": 1.2695035460992907,
+ "grad_norm": 3.2104930877685547,
+ "learning_rate": 4.4976936643150124e-06,
+ "loss": 0.6097,
+ "step": 2685
+ },
+ {
+ "epoch": 1.2699763593380615,
+ "grad_norm": 2.707860231399536,
+ "learning_rate": 4.49731853982998e-06,
+ "loss": 0.5109,
+ "step": 2686
+ },
+ {
+ "epoch": 1.270449172576832,
+ "grad_norm": 3.5046379566192627,
+ "learning_rate": 4.49694329098068e-06,
+ "loss": 0.5883,
+ "step": 2687
+ },
+ {
+ "epoch": 1.270921985815603,
+ "grad_norm": 2.5362324714660645,
+ "learning_rate": 4.496567917790477e-06,
+ "loss": 0.5301,
+ "step": 2688
+ },
+ {
+ "epoch": 1.2713947990543735,
+ "grad_norm": 2.7095518112182617,
+ "learning_rate": 4.496192420282746e-06,
+ "loss": 0.4772,
+ "step": 2689
+ },
+ {
+ "epoch": 1.2718676122931443,
+ "grad_norm": 2.416433095932007,
+ "learning_rate": 4.495816798480865e-06,
+ "loss": 0.5012,
+ "step": 2690
+ },
+ {
+ "epoch": 1.2723404255319148,
+ "grad_norm": 2.5362391471862793,
+ "learning_rate": 4.495441052408224e-06,
+ "loss": 0.5197,
+ "step": 2691
+ },
+ {
+ "epoch": 1.2728132387706856,
+ "grad_norm": 2.9093947410583496,
+ "learning_rate": 4.495065182088218e-06,
+ "loss": 0.4893,
+ "step": 2692
+ },
+ {
+ "epoch": 1.2732860520094562,
+ "grad_norm": 2.520470142364502,
+ "learning_rate": 4.494689187544251e-06,
+ "loss": 0.5072,
+ "step": 2693
+ },
+ {
+ "epoch": 1.273758865248227,
+ "grad_norm": 2.4385125637054443,
+ "learning_rate": 4.494313068799735e-06,
+ "loss": 0.4923,
+ "step": 2694
+ },
+ {
+ "epoch": 1.2742316784869976,
+ "grad_norm": 2.636852502822876,
+ "learning_rate": 4.493936825878089e-06,
+ "loss": 0.5409,
+ "step": 2695
+ },
+ {
+ "epoch": 1.2747044917257684,
+ "grad_norm": 2.7027053833007812,
+ "learning_rate": 4.493560458802741e-06,
+ "loss": 0.5906,
+ "step": 2696
+ },
+ {
+ "epoch": 1.275177304964539,
+ "grad_norm": 2.58752179145813,
+ "learning_rate": 4.493183967597123e-06,
+ "loss": 0.5292,
+ "step": 2697
+ },
+ {
+ "epoch": 1.2756501182033098,
+ "grad_norm": 2.7658379077911377,
+ "learning_rate": 4.49280735228468e-06,
+ "loss": 0.5613,
+ "step": 2698
+ },
+ {
+ "epoch": 1.2761229314420803,
+ "grad_norm": 3.272688388824463,
+ "learning_rate": 4.492430612888861e-06,
+ "loss": 0.5654,
+ "step": 2699
+ },
+ {
+ "epoch": 1.2765957446808511,
+ "grad_norm": 2.806819438934326,
+ "learning_rate": 4.492053749433125e-06,
+ "loss": 0.5388,
+ "step": 2700
+ },
+ {
+ "epoch": 1.2770685579196217,
+ "grad_norm": 2.879727602005005,
+ "learning_rate": 4.491676761940936e-06,
+ "loss": 0.5033,
+ "step": 2701
+ },
+ {
+ "epoch": 1.2775413711583925,
+ "grad_norm": 2.733347177505493,
+ "learning_rate": 4.4912996504357695e-06,
+ "loss": 0.5113,
+ "step": 2702
+ },
+ {
+ "epoch": 1.278014184397163,
+ "grad_norm": 2.7431252002716064,
+ "learning_rate": 4.490922414941104e-06,
+ "loss": 0.5417,
+ "step": 2703
+ },
+ {
+ "epoch": 1.2784869976359339,
+ "grad_norm": 2.9287240505218506,
+ "learning_rate": 4.490545055480431e-06,
+ "loss": 0.5875,
+ "step": 2704
+ },
+ {
+ "epoch": 1.2789598108747045,
+ "grad_norm": 2.576775550842285,
+ "learning_rate": 4.490167572077244e-06,
+ "loss": 0.5176,
+ "step": 2705
+ },
+ {
+ "epoch": 1.2794326241134752,
+ "grad_norm": 2.4335594177246094,
+ "learning_rate": 4.4897899647550505e-06,
+ "loss": 0.4749,
+ "step": 2706
+ },
+ {
+ "epoch": 1.2799054373522458,
+ "grad_norm": 2.6798062324523926,
+ "learning_rate": 4.489412233537361e-06,
+ "loss": 0.5439,
+ "step": 2707
+ },
+ {
+ "epoch": 1.2803782505910166,
+ "grad_norm": 2.8440675735473633,
+ "learning_rate": 4.489034378447693e-06,
+ "loss": 0.552,
+ "step": 2708
+ },
+ {
+ "epoch": 1.2808510638297872,
+ "grad_norm": 2.9059503078460693,
+ "learning_rate": 4.488656399509577e-06,
+ "loss": 0.5667,
+ "step": 2709
+ },
+ {
+ "epoch": 1.281323877068558,
+ "grad_norm": 2.7415006160736084,
+ "learning_rate": 4.488278296746548e-06,
+ "loss": 0.5676,
+ "step": 2710
+ },
+ {
+ "epoch": 1.2817966903073286,
+ "grad_norm": 2.4584875106811523,
+ "learning_rate": 4.487900070182147e-06,
+ "loss": 0.4787,
+ "step": 2711
+ },
+ {
+ "epoch": 1.2822695035460994,
+ "grad_norm": 2.990940809249878,
+ "learning_rate": 4.487521719839924e-06,
+ "loss": 0.5239,
+ "step": 2712
+ },
+ {
+ "epoch": 1.28274231678487,
+ "grad_norm": 3.075201988220215,
+ "learning_rate": 4.487143245743441e-06,
+ "loss": 0.5103,
+ "step": 2713
+ },
+ {
+ "epoch": 1.2832151300236407,
+ "grad_norm": 2.543341875076294,
+ "learning_rate": 4.486764647916259e-06,
+ "loss": 0.5475,
+ "step": 2714
+ },
+ {
+ "epoch": 1.2836879432624113,
+ "grad_norm": 2.9927213191986084,
+ "learning_rate": 4.486385926381957e-06,
+ "loss": 0.4923,
+ "step": 2715
+ },
+ {
+ "epoch": 1.284160756501182,
+ "grad_norm": 2.4220657348632812,
+ "learning_rate": 4.486007081164111e-06,
+ "loss": 0.543,
+ "step": 2716
+ },
+ {
+ "epoch": 1.2846335697399527,
+ "grad_norm": 2.468214988708496,
+ "learning_rate": 4.4856281122863134e-06,
+ "loss": 0.5248,
+ "step": 2717
+ },
+ {
+ "epoch": 1.2851063829787235,
+ "grad_norm": 2.633711099624634,
+ "learning_rate": 4.48524901977216e-06,
+ "loss": 0.4764,
+ "step": 2718
+ },
+ {
+ "epoch": 1.285579196217494,
+ "grad_norm": 2.8399546146392822,
+ "learning_rate": 4.484869803645254e-06,
+ "loss": 0.5503,
+ "step": 2719
+ },
+ {
+ "epoch": 1.2860520094562649,
+ "grad_norm": 2.769063949584961,
+ "learning_rate": 4.484490463929209e-06,
+ "loss": 0.5468,
+ "step": 2720
+ },
+ {
+ "epoch": 1.2865248226950354,
+ "grad_norm": 2.617863893508911,
+ "learning_rate": 4.4841110006476465e-06,
+ "loss": 0.5906,
+ "step": 2721
+ },
+ {
+ "epoch": 1.2869976359338062,
+ "grad_norm": 2.7639541625976562,
+ "learning_rate": 4.4837314138241905e-06,
+ "loss": 0.552,
+ "step": 2722
+ },
+ {
+ "epoch": 1.2874704491725768,
+ "grad_norm": 2.7711129188537598,
+ "learning_rate": 4.483351703482478e-06,
+ "loss": 0.5229,
+ "step": 2723
+ },
+ {
+ "epoch": 1.2879432624113476,
+ "grad_norm": 2.611205577850342,
+ "learning_rate": 4.482971869646152e-06,
+ "loss": 0.5055,
+ "step": 2724
+ },
+ {
+ "epoch": 1.2884160756501182,
+ "grad_norm": 2.8602211475372314,
+ "learning_rate": 4.482591912338862e-06,
+ "loss": 0.5561,
+ "step": 2725
+ },
+ {
+ "epoch": 1.2888888888888888,
+ "grad_norm": 2.5882298946380615,
+ "learning_rate": 4.4822118315842675e-06,
+ "loss": 0.5555,
+ "step": 2726
+ },
+ {
+ "epoch": 1.2893617021276595,
+ "grad_norm": 2.7533531188964844,
+ "learning_rate": 4.481831627406033e-06,
+ "loss": 0.5346,
+ "step": 2727
+ },
+ {
+ "epoch": 1.2898345153664303,
+ "grad_norm": 2.4296958446502686,
+ "learning_rate": 4.481451299827835e-06,
+ "loss": 0.4915,
+ "step": 2728
+ },
+ {
+ "epoch": 1.290307328605201,
+ "grad_norm": 2.4403445720672607,
+ "learning_rate": 4.481070848873352e-06,
+ "loss": 0.5648,
+ "step": 2729
+ },
+ {
+ "epoch": 1.2907801418439715,
+ "grad_norm": 2.473224401473999,
+ "learning_rate": 4.480690274566274e-06,
+ "loss": 0.4849,
+ "step": 2730
+ },
+ {
+ "epoch": 1.2912529550827423,
+ "grad_norm": 2.637899875640869,
+ "learning_rate": 4.480309576930297e-06,
+ "loss": 0.4968,
+ "step": 2731
+ },
+ {
+ "epoch": 1.291725768321513,
+ "grad_norm": 2.7156927585601807,
+ "learning_rate": 4.479928755989127e-06,
+ "loss": 0.4759,
+ "step": 2732
+ },
+ {
+ "epoch": 1.2921985815602837,
+ "grad_norm": 2.632786989212036,
+ "learning_rate": 4.479547811766475e-06,
+ "loss": 0.5468,
+ "step": 2733
+ },
+ {
+ "epoch": 1.2926713947990542,
+ "grad_norm": 2.529218912124634,
+ "learning_rate": 4.479166744286061e-06,
+ "loss": 0.4852,
+ "step": 2734
+ },
+ {
+ "epoch": 1.293144208037825,
+ "grad_norm": 2.561978340148926,
+ "learning_rate": 4.4787855535716115e-06,
+ "loss": 0.546,
+ "step": 2735
+ },
+ {
+ "epoch": 1.2936170212765958,
+ "grad_norm": 2.3684909343719482,
+ "learning_rate": 4.478404239646862e-06,
+ "loss": 0.5369,
+ "step": 2736
+ },
+ {
+ "epoch": 1.2940898345153664,
+ "grad_norm": 2.8940367698669434,
+ "learning_rate": 4.4780228025355566e-06,
+ "loss": 0.568,
+ "step": 2737
+ },
+ {
+ "epoch": 1.294562647754137,
+ "grad_norm": 2.6950316429138184,
+ "learning_rate": 4.477641242261445e-06,
+ "loss": 0.4576,
+ "step": 2738
+ },
+ {
+ "epoch": 1.2950354609929078,
+ "grad_norm": 2.4211716651916504,
+ "learning_rate": 4.4772595588482835e-06,
+ "loss": 0.4341,
+ "step": 2739
+ },
+ {
+ "epoch": 1.2955082742316786,
+ "grad_norm": 3.141097068786621,
+ "learning_rate": 4.47687775231984e-06,
+ "loss": 0.5944,
+ "step": 2740
+ },
+ {
+ "epoch": 1.2959810874704492,
+ "grad_norm": 3.077522039413452,
+ "learning_rate": 4.476495822699887e-06,
+ "loss": 0.5786,
+ "step": 2741
+ },
+ {
+ "epoch": 1.2964539007092197,
+ "grad_norm": 2.708139419555664,
+ "learning_rate": 4.476113770012206e-06,
+ "loss": 0.5014,
+ "step": 2742
+ },
+ {
+ "epoch": 1.2969267139479905,
+ "grad_norm": 2.7572035789489746,
+ "learning_rate": 4.475731594280586e-06,
+ "loss": 0.594,
+ "step": 2743
+ },
+ {
+ "epoch": 1.2973995271867613,
+ "grad_norm": 2.673126459121704,
+ "learning_rate": 4.475349295528822e-06,
+ "loss": 0.5317,
+ "step": 2744
+ },
+ {
+ "epoch": 1.297872340425532,
+ "grad_norm": 2.6757819652557373,
+ "learning_rate": 4.4749668737807195e-06,
+ "loss": 0.5614,
+ "step": 2745
+ },
+ {
+ "epoch": 1.2983451536643025,
+ "grad_norm": 2.7077620029449463,
+ "learning_rate": 4.47458432906009e-06,
+ "loss": 0.4916,
+ "step": 2746
+ },
+ {
+ "epoch": 1.2988179669030733,
+ "grad_norm": 2.446570873260498,
+ "learning_rate": 4.474201661390752e-06,
+ "loss": 0.5005,
+ "step": 2747
+ },
+ {
+ "epoch": 1.299290780141844,
+ "grad_norm": 2.642695665359497,
+ "learning_rate": 4.473818870796533e-06,
+ "loss": 0.5048,
+ "step": 2748
+ },
+ {
+ "epoch": 1.2997635933806146,
+ "grad_norm": 2.519824743270874,
+ "learning_rate": 4.4734359573012686e-06,
+ "loss": 0.5131,
+ "step": 2749
+ },
+ {
+ "epoch": 1.3002364066193852,
+ "grad_norm": 2.5901925563812256,
+ "learning_rate": 4.4730529209287995e-06,
+ "loss": 0.4582,
+ "step": 2750
+ },
+ {
+ "epoch": 1.300709219858156,
+ "grad_norm": 2.6789121627807617,
+ "learning_rate": 4.472669761702978e-06,
+ "loss": 0.5685,
+ "step": 2751
+ },
+ {
+ "epoch": 1.3011820330969268,
+ "grad_norm": 2.408003807067871,
+ "learning_rate": 4.472286479647659e-06,
+ "loss": 0.4329,
+ "step": 2752
+ },
+ {
+ "epoch": 1.3016548463356974,
+ "grad_norm": 2.681403398513794,
+ "learning_rate": 4.47190307478671e-06,
+ "loss": 0.4853,
+ "step": 2753
+ },
+ {
+ "epoch": 1.302127659574468,
+ "grad_norm": 2.9923183917999268,
+ "learning_rate": 4.4715195471440025e-06,
+ "loss": 0.5184,
+ "step": 2754
+ },
+ {
+ "epoch": 1.3026004728132388,
+ "grad_norm": 2.5100321769714355,
+ "learning_rate": 4.471135896743418e-06,
+ "loss": 0.5148,
+ "step": 2755
+ },
+ {
+ "epoch": 1.3030732860520096,
+ "grad_norm": 2.267881393432617,
+ "learning_rate": 4.4707521236088444e-06,
+ "loss": 0.5028,
+ "step": 2756
+ },
+ {
+ "epoch": 1.3035460992907801,
+ "grad_norm": 2.7779829502105713,
+ "learning_rate": 4.4703682277641775e-06,
+ "loss": 0.5724,
+ "step": 2757
+ },
+ {
+ "epoch": 1.3040189125295507,
+ "grad_norm": 2.4262194633483887,
+ "learning_rate": 4.4699842092333205e-06,
+ "loss": 0.5341,
+ "step": 2758
+ },
+ {
+ "epoch": 1.3044917257683215,
+ "grad_norm": 2.8682050704956055,
+ "learning_rate": 4.469600068040185e-06,
+ "loss": 0.6114,
+ "step": 2759
+ },
+ {
+ "epoch": 1.3049645390070923,
+ "grad_norm": 2.647853374481201,
+ "learning_rate": 4.46921580420869e-06,
+ "loss": 0.5107,
+ "step": 2760
+ },
+ {
+ "epoch": 1.3054373522458629,
+ "grad_norm": 2.561998128890991,
+ "learning_rate": 4.468831417762762e-06,
+ "loss": 0.6019,
+ "step": 2761
+ },
+ {
+ "epoch": 1.3059101654846335,
+ "grad_norm": 2.763425350189209,
+ "learning_rate": 4.468446908726334e-06,
+ "loss": 0.572,
+ "step": 2762
+ },
+ {
+ "epoch": 1.3063829787234043,
+ "grad_norm": 2.7052934169769287,
+ "learning_rate": 4.468062277123348e-06,
+ "loss": 0.4876,
+ "step": 2763
+ },
+ {
+ "epoch": 1.306855791962175,
+ "grad_norm": 2.997845411300659,
+ "learning_rate": 4.467677522977755e-06,
+ "loss": 0.5683,
+ "step": 2764
+ },
+ {
+ "epoch": 1.3073286052009456,
+ "grad_norm": 2.503129005432129,
+ "learning_rate": 4.46729264631351e-06,
+ "loss": 0.4951,
+ "step": 2765
+ },
+ {
+ "epoch": 1.3078014184397162,
+ "grad_norm": 2.617492437362671,
+ "learning_rate": 4.466907647154578e-06,
+ "loss": 0.5054,
+ "step": 2766
+ },
+ {
+ "epoch": 1.308274231678487,
+ "grad_norm": 2.934967279434204,
+ "learning_rate": 4.4665225255249315e-06,
+ "loss": 0.5299,
+ "step": 2767
+ },
+ {
+ "epoch": 1.3087470449172578,
+ "grad_norm": 2.787252187728882,
+ "learning_rate": 4.46613728144855e-06,
+ "loss": 0.4652,
+ "step": 2768
+ },
+ {
+ "epoch": 1.3092198581560284,
+ "grad_norm": 2.567439556121826,
+ "learning_rate": 4.465751914949422e-06,
+ "loss": 0.538,
+ "step": 2769
+ },
+ {
+ "epoch": 1.309692671394799,
+ "grad_norm": 2.6386024951934814,
+ "learning_rate": 4.4653664260515416e-06,
+ "loss": 0.464,
+ "step": 2770
+ },
+ {
+ "epoch": 1.3101654846335697,
+ "grad_norm": 2.966848134994507,
+ "learning_rate": 4.464980814778912e-06,
+ "loss": 0.4889,
+ "step": 2771
+ },
+ {
+ "epoch": 1.3106382978723405,
+ "grad_norm": 2.571256637573242,
+ "learning_rate": 4.464595081155542e-06,
+ "loss": 0.4979,
+ "step": 2772
+ },
+ {
+ "epoch": 1.3111111111111111,
+ "grad_norm": 2.774203062057495,
+ "learning_rate": 4.4642092252054515e-06,
+ "loss": 0.5366,
+ "step": 2773
+ },
+ {
+ "epoch": 1.3115839243498817,
+ "grad_norm": 2.682969331741333,
+ "learning_rate": 4.463823246952666e-06,
+ "loss": 0.5118,
+ "step": 2774
+ },
+ {
+ "epoch": 1.3120567375886525,
+ "grad_norm": 2.4873905181884766,
+ "learning_rate": 4.463437146421217e-06,
+ "loss": 0.5548,
+ "step": 2775
+ },
+ {
+ "epoch": 1.3125295508274233,
+ "grad_norm": 2.6769661903381348,
+ "learning_rate": 4.463050923635147e-06,
+ "loss": 0.5023,
+ "step": 2776
+ },
+ {
+ "epoch": 1.3130023640661939,
+ "grad_norm": 2.7190892696380615,
+ "learning_rate": 4.462664578618503e-06,
+ "loss": 0.5546,
+ "step": 2777
+ },
+ {
+ "epoch": 1.3134751773049644,
+ "grad_norm": 2.8193624019622803,
+ "learning_rate": 4.462278111395343e-06,
+ "loss": 0.5265,
+ "step": 2778
+ },
+ {
+ "epoch": 1.3139479905437352,
+ "grad_norm": 2.7324538230895996,
+ "learning_rate": 4.461891521989728e-06,
+ "loss": 0.5449,
+ "step": 2779
+ },
+ {
+ "epoch": 1.314420803782506,
+ "grad_norm": 2.87320876121521,
+ "learning_rate": 4.4615048104257305e-06,
+ "loss": 0.5367,
+ "step": 2780
+ },
+ {
+ "epoch": 1.3148936170212766,
+ "grad_norm": 2.6777031421661377,
+ "learning_rate": 4.4611179767274306e-06,
+ "loss": 0.5026,
+ "step": 2781
+ },
+ {
+ "epoch": 1.3153664302600472,
+ "grad_norm": 3.714524269104004,
+ "learning_rate": 4.460731020918913e-06,
+ "loss": 0.569,
+ "step": 2782
+ },
+ {
+ "epoch": 1.315839243498818,
+ "grad_norm": 2.7493600845336914,
+ "learning_rate": 4.460343943024273e-06,
+ "loss": 0.5826,
+ "step": 2783
+ },
+ {
+ "epoch": 1.3163120567375888,
+ "grad_norm": 2.6544079780578613,
+ "learning_rate": 4.459956743067609e-06,
+ "loss": 0.5399,
+ "step": 2784
+ },
+ {
+ "epoch": 1.3167848699763594,
+ "grad_norm": 2.4338037967681885,
+ "learning_rate": 4.459569421073036e-06,
+ "loss": 0.5186,
+ "step": 2785
+ },
+ {
+ "epoch": 1.31725768321513,
+ "grad_norm": 2.9312374591827393,
+ "learning_rate": 4.459181977064665e-06,
+ "loss": 0.5571,
+ "step": 2786
+ },
+ {
+ "epoch": 1.3177304964539007,
+ "grad_norm": 2.5988922119140625,
+ "learning_rate": 4.458794411066624e-06,
+ "loss": 0.5926,
+ "step": 2787
+ },
+ {
+ "epoch": 1.3182033096926715,
+ "grad_norm": 2.5193772315979004,
+ "learning_rate": 4.458406723103044e-06,
+ "loss": 0.5243,
+ "step": 2788
+ },
+ {
+ "epoch": 1.318676122931442,
+ "grad_norm": 2.8653743267059326,
+ "learning_rate": 4.458018913198066e-06,
+ "loss": 0.5421,
+ "step": 2789
+ },
+ {
+ "epoch": 1.3191489361702127,
+ "grad_norm": 2.486245632171631,
+ "learning_rate": 4.457630981375834e-06,
+ "loss": 0.4862,
+ "step": 2790
+ },
+ {
+ "epoch": 1.3196217494089835,
+ "grad_norm": 3.155435800552368,
+ "learning_rate": 4.457242927660506e-06,
+ "loss": 0.5386,
+ "step": 2791
+ },
+ {
+ "epoch": 1.3200945626477543,
+ "grad_norm": 3.102023124694824,
+ "learning_rate": 4.456854752076242e-06,
+ "loss": 0.5527,
+ "step": 2792
+ },
+ {
+ "epoch": 1.3205673758865248,
+ "grad_norm": 2.7995986938476562,
+ "learning_rate": 4.456466454647215e-06,
+ "loss": 0.4364,
+ "step": 2793
+ },
+ {
+ "epoch": 1.3210401891252954,
+ "grad_norm": 2.8328311443328857,
+ "learning_rate": 4.456078035397599e-06,
+ "loss": 0.5516,
+ "step": 2794
+ },
+ {
+ "epoch": 1.3215130023640662,
+ "grad_norm": 2.606161594390869,
+ "learning_rate": 4.455689494351581e-06,
+ "loss": 0.5042,
+ "step": 2795
+ },
+ {
+ "epoch": 1.321985815602837,
+ "grad_norm": 2.6344757080078125,
+ "learning_rate": 4.455300831533354e-06,
+ "loss": 0.4807,
+ "step": 2796
+ },
+ {
+ "epoch": 1.3224586288416076,
+ "grad_norm": 2.8539786338806152,
+ "learning_rate": 4.454912046967118e-06,
+ "loss": 0.4694,
+ "step": 2797
+ },
+ {
+ "epoch": 1.3229314420803782,
+ "grad_norm": 2.849066734313965,
+ "learning_rate": 4.454523140677081e-06,
+ "loss": 0.5037,
+ "step": 2798
+ },
+ {
+ "epoch": 1.323404255319149,
+ "grad_norm": 2.6803371906280518,
+ "learning_rate": 4.454134112687458e-06,
+ "loss": 0.4959,
+ "step": 2799
+ },
+ {
+ "epoch": 1.3238770685579198,
+ "grad_norm": 3.0546066761016846,
+ "learning_rate": 4.453744963022473e-06,
+ "loss": 0.5935,
+ "step": 2800
+ },
+ {
+ "epoch": 1.3243498817966903,
+ "grad_norm": 2.625602960586548,
+ "learning_rate": 4.453355691706356e-06,
+ "loss": 0.5349,
+ "step": 2801
+ },
+ {
+ "epoch": 1.324822695035461,
+ "grad_norm": 2.7568554878234863,
+ "learning_rate": 4.452966298763345e-06,
+ "loss": 0.5012,
+ "step": 2802
+ },
+ {
+ "epoch": 1.3252955082742317,
+ "grad_norm": 2.940427303314209,
+ "learning_rate": 4.452576784217686e-06,
+ "loss": 0.5246,
+ "step": 2803
+ },
+ {
+ "epoch": 1.3257683215130025,
+ "grad_norm": 2.5485289096832275,
+ "learning_rate": 4.452187148093633e-06,
+ "loss": 0.5282,
+ "step": 2804
+ },
+ {
+ "epoch": 1.326241134751773,
+ "grad_norm": 2.8152987957000732,
+ "learning_rate": 4.4517973904154455e-06,
+ "loss": 0.5468,
+ "step": 2805
+ },
+ {
+ "epoch": 1.3267139479905437,
+ "grad_norm": 2.9399688243865967,
+ "learning_rate": 4.451407511207393e-06,
+ "loss": 0.5586,
+ "step": 2806
+ },
+ {
+ "epoch": 1.3271867612293144,
+ "grad_norm": 2.3870036602020264,
+ "learning_rate": 4.451017510493751e-06,
+ "loss": 0.4807,
+ "step": 2807
+ },
+ {
+ "epoch": 1.327659574468085,
+ "grad_norm": 3.4667887687683105,
+ "learning_rate": 4.450627388298805e-06,
+ "loss": 0.5571,
+ "step": 2808
+ },
+ {
+ "epoch": 1.3281323877068558,
+ "grad_norm": 2.685986042022705,
+ "learning_rate": 4.450237144646844e-06,
+ "loss": 0.5525,
+ "step": 2809
+ },
+ {
+ "epoch": 1.3286052009456264,
+ "grad_norm": 2.8529131412506104,
+ "learning_rate": 4.449846779562168e-06,
+ "loss": 0.491,
+ "step": 2810
+ },
+ {
+ "epoch": 1.3290780141843972,
+ "grad_norm": 2.7360332012176514,
+ "learning_rate": 4.449456293069082e-06,
+ "loss": 0.5574,
+ "step": 2811
+ },
+ {
+ "epoch": 1.3295508274231678,
+ "grad_norm": 2.4656026363372803,
+ "learning_rate": 4.4490656851919015e-06,
+ "loss": 0.4678,
+ "step": 2812
+ },
+ {
+ "epoch": 1.3300236406619386,
+ "grad_norm": 2.602651357650757,
+ "learning_rate": 4.448674955954947e-06,
+ "loss": 0.5118,
+ "step": 2813
+ },
+ {
+ "epoch": 1.3304964539007091,
+ "grad_norm": 3.0129756927490234,
+ "learning_rate": 4.448284105382548e-06,
+ "loss": 0.6136,
+ "step": 2814
+ },
+ {
+ "epoch": 1.33096926713948,
+ "grad_norm": 2.8499927520751953,
+ "learning_rate": 4.447893133499039e-06,
+ "loss": 0.5286,
+ "step": 2815
+ },
+ {
+ "epoch": 1.3314420803782505,
+ "grad_norm": 2.8320744037628174,
+ "learning_rate": 4.447502040328767e-06,
+ "loss": 0.5186,
+ "step": 2816
+ },
+ {
+ "epoch": 1.3319148936170213,
+ "grad_norm": 2.499950885772705,
+ "learning_rate": 4.447110825896084e-06,
+ "loss": 0.5338,
+ "step": 2817
+ },
+ {
+ "epoch": 1.3323877068557919,
+ "grad_norm": 2.530895233154297,
+ "learning_rate": 4.446719490225346e-06,
+ "loss": 0.5151,
+ "step": 2818
+ },
+ {
+ "epoch": 1.3328605200945627,
+ "grad_norm": 2.5276098251342773,
+ "learning_rate": 4.446328033340921e-06,
+ "loss": 0.5424,
+ "step": 2819
+ },
+ {
+ "epoch": 1.3333333333333333,
+ "grad_norm": 2.90218186378479,
+ "learning_rate": 4.4459364552671845e-06,
+ "loss": 0.5747,
+ "step": 2820
+ },
+ {
+ "epoch": 1.333806146572104,
+ "grad_norm": 2.500943183898926,
+ "learning_rate": 4.445544756028518e-06,
+ "loss": 0.5459,
+ "step": 2821
+ },
+ {
+ "epoch": 1.3342789598108746,
+ "grad_norm": 2.960374355316162,
+ "learning_rate": 4.44515293564931e-06,
+ "loss": 0.6092,
+ "step": 2822
+ },
+ {
+ "epoch": 1.3347517730496454,
+ "grad_norm": 2.813671827316284,
+ "learning_rate": 4.444760994153958e-06,
+ "loss": 0.5536,
+ "step": 2823
+ },
+ {
+ "epoch": 1.335224586288416,
+ "grad_norm": 2.7147483825683594,
+ "learning_rate": 4.444368931566867e-06,
+ "loss": 0.5291,
+ "step": 2824
+ },
+ {
+ "epoch": 1.3356973995271868,
+ "grad_norm": 2.710101842880249,
+ "learning_rate": 4.443976747912447e-06,
+ "loss": 0.5138,
+ "step": 2825
+ },
+ {
+ "epoch": 1.3361702127659574,
+ "grad_norm": 2.711419105529785,
+ "learning_rate": 4.443584443215121e-06,
+ "loss": 0.5223,
+ "step": 2826
+ },
+ {
+ "epoch": 1.3366430260047282,
+ "grad_norm": 2.887472152709961,
+ "learning_rate": 4.443192017499313e-06,
+ "loss": 0.5464,
+ "step": 2827
+ },
+ {
+ "epoch": 1.3371158392434987,
+ "grad_norm": 2.8867223262786865,
+ "learning_rate": 4.4427994707894585e-06,
+ "loss": 0.5748,
+ "step": 2828
+ },
+ {
+ "epoch": 1.3375886524822695,
+ "grad_norm": 2.407247543334961,
+ "learning_rate": 4.44240680311e-06,
+ "loss": 0.4727,
+ "step": 2829
+ },
+ {
+ "epoch": 1.3380614657210401,
+ "grad_norm": 2.578420877456665,
+ "learning_rate": 4.4420140144853865e-06,
+ "loss": 0.5129,
+ "step": 2830
+ },
+ {
+ "epoch": 1.338534278959811,
+ "grad_norm": 2.884373426437378,
+ "learning_rate": 4.441621104940077e-06,
+ "loss": 0.5366,
+ "step": 2831
+ },
+ {
+ "epoch": 1.3390070921985815,
+ "grad_norm": 2.8652374744415283,
+ "learning_rate": 4.441228074498534e-06,
+ "loss": 0.5045,
+ "step": 2832
+ },
+ {
+ "epoch": 1.3394799054373523,
+ "grad_norm": 2.5380210876464844,
+ "learning_rate": 4.440834923185231e-06,
+ "loss": 0.509,
+ "step": 2833
+ },
+ {
+ "epoch": 1.3399527186761229,
+ "grad_norm": 2.415734052658081,
+ "learning_rate": 4.440441651024648e-06,
+ "loss": 0.5066,
+ "step": 2834
+ },
+ {
+ "epoch": 1.3404255319148937,
+ "grad_norm": 2.503051996231079,
+ "learning_rate": 4.440048258041272e-06,
+ "loss": 0.5118,
+ "step": 2835
+ },
+ {
+ "epoch": 1.3408983451536642,
+ "grad_norm": 3.351001024246216,
+ "learning_rate": 4.439654744259598e-06,
+ "loss": 0.5758,
+ "step": 2836
+ },
+ {
+ "epoch": 1.341371158392435,
+ "grad_norm": 2.7368781566619873,
+ "learning_rate": 4.439261109704129e-06,
+ "loss": 0.5674,
+ "step": 2837
+ },
+ {
+ "epoch": 1.3418439716312056,
+ "grad_norm": 3.008199453353882,
+ "learning_rate": 4.438867354399372e-06,
+ "loss": 0.5891,
+ "step": 2838
+ },
+ {
+ "epoch": 1.3423167848699764,
+ "grad_norm": 2.538907766342163,
+ "learning_rate": 4.438473478369847e-06,
+ "loss": 0.5102,
+ "step": 2839
+ },
+ {
+ "epoch": 1.342789598108747,
+ "grad_norm": 2.7169063091278076,
+ "learning_rate": 4.438079481640079e-06,
+ "loss": 0.6131,
+ "step": 2840
+ },
+ {
+ "epoch": 1.3432624113475178,
+ "grad_norm": 2.7411608695983887,
+ "learning_rate": 4.437685364234601e-06,
+ "loss": 0.5337,
+ "step": 2841
+ },
+ {
+ "epoch": 1.3437352245862884,
+ "grad_norm": 3.2374939918518066,
+ "learning_rate": 4.43729112617795e-06,
+ "loss": 0.5401,
+ "step": 2842
+ },
+ {
+ "epoch": 1.3442080378250592,
+ "grad_norm": 2.4712226390838623,
+ "learning_rate": 4.436896767494676e-06,
+ "loss": 0.5365,
+ "step": 2843
+ },
+ {
+ "epoch": 1.3446808510638297,
+ "grad_norm": 2.661619186401367,
+ "learning_rate": 4.436502288209334e-06,
+ "loss": 0.4919,
+ "step": 2844
+ },
+ {
+ "epoch": 1.3451536643026005,
+ "grad_norm": 2.5943779945373535,
+ "learning_rate": 4.4361076883464845e-06,
+ "loss": 0.5253,
+ "step": 2845
+ },
+ {
+ "epoch": 1.345626477541371,
+ "grad_norm": 2.672297477722168,
+ "learning_rate": 4.4357129679307e-06,
+ "loss": 0.541,
+ "step": 2846
+ },
+ {
+ "epoch": 1.346099290780142,
+ "grad_norm": 2.6830925941467285,
+ "learning_rate": 4.435318126986557e-06,
+ "loss": 0.5641,
+ "step": 2847
+ },
+ {
+ "epoch": 1.3465721040189125,
+ "grad_norm": 2.7394626140594482,
+ "learning_rate": 4.434923165538639e-06,
+ "loss": 0.5591,
+ "step": 2848
+ },
+ {
+ "epoch": 1.3470449172576833,
+ "grad_norm": 2.9656317234039307,
+ "learning_rate": 4.434528083611541e-06,
+ "loss": 0.515,
+ "step": 2849
+ },
+ {
+ "epoch": 1.3475177304964538,
+ "grad_norm": 3.30155086517334,
+ "learning_rate": 4.434132881229861e-06,
+ "loss": 0.5871,
+ "step": 2850
+ },
+ {
+ "epoch": 1.3479905437352246,
+ "grad_norm": 2.6222476959228516,
+ "learning_rate": 4.433737558418209e-06,
+ "loss": 0.5143,
+ "step": 2851
+ },
+ {
+ "epoch": 1.3484633569739952,
+ "grad_norm": 2.903158187866211,
+ "learning_rate": 4.4333421152011965e-06,
+ "loss": 0.4484,
+ "step": 2852
+ },
+ {
+ "epoch": 1.348936170212766,
+ "grad_norm": 2.863116979598999,
+ "learning_rate": 4.432946551603449e-06,
+ "loss": 0.5213,
+ "step": 2853
+ },
+ {
+ "epoch": 1.3494089834515366,
+ "grad_norm": 2.8253962993621826,
+ "learning_rate": 4.432550867649596e-06,
+ "loss": 0.5713,
+ "step": 2854
+ },
+ {
+ "epoch": 1.3498817966903074,
+ "grad_norm": 2.652493953704834,
+ "learning_rate": 4.432155063364273e-06,
+ "loss": 0.5559,
+ "step": 2855
+ },
+ {
+ "epoch": 1.350354609929078,
+ "grad_norm": 2.4289376735687256,
+ "learning_rate": 4.431759138772127e-06,
+ "loss": 0.5122,
+ "step": 2856
+ },
+ {
+ "epoch": 1.3508274231678488,
+ "grad_norm": 2.6329853534698486,
+ "learning_rate": 4.43136309389781e-06,
+ "loss": 0.5332,
+ "step": 2857
+ },
+ {
+ "epoch": 1.3513002364066193,
+ "grad_norm": 2.431103229522705,
+ "learning_rate": 4.430966928765982e-06,
+ "loss": 0.4863,
+ "step": 2858
+ },
+ {
+ "epoch": 1.3517730496453901,
+ "grad_norm": 2.7529025077819824,
+ "learning_rate": 4.4305706434013106e-06,
+ "loss": 0.5263,
+ "step": 2859
+ },
+ {
+ "epoch": 1.3522458628841607,
+ "grad_norm": 2.884605646133423,
+ "learning_rate": 4.43017423782847e-06,
+ "loss": 0.564,
+ "step": 2860
+ },
+ {
+ "epoch": 1.3527186761229315,
+ "grad_norm": 3.027771234512329,
+ "learning_rate": 4.4297777120721435e-06,
+ "loss": 0.5846,
+ "step": 2861
+ },
+ {
+ "epoch": 1.353191489361702,
+ "grad_norm": 3.0140626430511475,
+ "learning_rate": 4.4293810661570205e-06,
+ "loss": 0.6621,
+ "step": 2862
+ },
+ {
+ "epoch": 1.3536643026004729,
+ "grad_norm": 2.721799612045288,
+ "learning_rate": 4.428984300107799e-06,
+ "loss": 0.5566,
+ "step": 2863
+ },
+ {
+ "epoch": 1.3541371158392435,
+ "grad_norm": 3.0016496181488037,
+ "learning_rate": 4.428587413949183e-06,
+ "loss": 0.5525,
+ "step": 2864
+ },
+ {
+ "epoch": 1.3546099290780143,
+ "grad_norm": 2.77138614654541,
+ "learning_rate": 4.428190407705886e-06,
+ "loss": 0.6016,
+ "step": 2865
+ },
+ {
+ "epoch": 1.3550827423167848,
+ "grad_norm": 2.9783477783203125,
+ "learning_rate": 4.427793281402627e-06,
+ "loss": 0.5556,
+ "step": 2866
+ },
+ {
+ "epoch": 1.3555555555555556,
+ "grad_norm": 2.2490382194519043,
+ "learning_rate": 4.427396035064132e-06,
+ "loss": 0.5138,
+ "step": 2867
+ },
+ {
+ "epoch": 1.3560283687943262,
+ "grad_norm": 2.442225217819214,
+ "learning_rate": 4.426998668715139e-06,
+ "loss": 0.4843,
+ "step": 2868
+ },
+ {
+ "epoch": 1.356501182033097,
+ "grad_norm": 2.74040150642395,
+ "learning_rate": 4.426601182380388e-06,
+ "loss": 0.54,
+ "step": 2869
+ },
+ {
+ "epoch": 1.3569739952718676,
+ "grad_norm": 2.4434332847595215,
+ "learning_rate": 4.426203576084629e-06,
+ "loss": 0.5199,
+ "step": 2870
+ },
+ {
+ "epoch": 1.3574468085106384,
+ "grad_norm": 2.6380388736724854,
+ "learning_rate": 4.42580584985262e-06,
+ "loss": 0.5049,
+ "step": 2871
+ },
+ {
+ "epoch": 1.357919621749409,
+ "grad_norm": 2.7324254512786865,
+ "learning_rate": 4.425408003709125e-06,
+ "loss": 0.5036,
+ "step": 2872
+ },
+ {
+ "epoch": 1.3583924349881797,
+ "grad_norm": 2.661012649536133,
+ "learning_rate": 4.425010037678916e-06,
+ "loss": 0.4965,
+ "step": 2873
+ },
+ {
+ "epoch": 1.3588652482269503,
+ "grad_norm": 2.5380208492279053,
+ "learning_rate": 4.424611951786773e-06,
+ "loss": 0.4293,
+ "step": 2874
+ },
+ {
+ "epoch": 1.3593380614657211,
+ "grad_norm": 2.6060714721679688,
+ "learning_rate": 4.424213746057483e-06,
+ "loss": 0.5335,
+ "step": 2875
+ },
+ {
+ "epoch": 1.3598108747044917,
+ "grad_norm": 2.98282527923584,
+ "learning_rate": 4.423815420515841e-06,
+ "loss": 0.5626,
+ "step": 2876
+ },
+ {
+ "epoch": 1.3602836879432625,
+ "grad_norm": 2.779371500015259,
+ "learning_rate": 4.423416975186647e-06,
+ "loss": 0.5353,
+ "step": 2877
+ },
+ {
+ "epoch": 1.360756501182033,
+ "grad_norm": 2.8033530712127686,
+ "learning_rate": 4.423018410094713e-06,
+ "loss": 0.538,
+ "step": 2878
+ },
+ {
+ "epoch": 1.3612293144208039,
+ "grad_norm": 3.225177764892578,
+ "learning_rate": 4.422619725264855e-06,
+ "loss": 0.5441,
+ "step": 2879
+ },
+ {
+ "epoch": 1.3617021276595744,
+ "grad_norm": 2.959135055541992,
+ "learning_rate": 4.422220920721896e-06,
+ "loss": 0.5293,
+ "step": 2880
+ },
+ {
+ "epoch": 1.3621749408983452,
+ "grad_norm": 2.5558884143829346,
+ "learning_rate": 4.4218219964906704e-06,
+ "loss": 0.442,
+ "step": 2881
+ },
+ {
+ "epoch": 1.3626477541371158,
+ "grad_norm": 2.694899797439575,
+ "learning_rate": 4.421422952596015e-06,
+ "loss": 0.5318,
+ "step": 2882
+ },
+ {
+ "epoch": 1.3631205673758866,
+ "grad_norm": 2.7909531593322754,
+ "learning_rate": 4.421023789062777e-06,
+ "loss": 0.6648,
+ "step": 2883
+ },
+ {
+ "epoch": 1.3635933806146572,
+ "grad_norm": 2.421995162963867,
+ "learning_rate": 4.420624505915813e-06,
+ "loss": 0.4644,
+ "step": 2884
+ },
+ {
+ "epoch": 1.364066193853428,
+ "grad_norm": 2.5876688957214355,
+ "learning_rate": 4.420225103179981e-06,
+ "loss": 0.5743,
+ "step": 2885
+ },
+ {
+ "epoch": 1.3645390070921986,
+ "grad_norm": 2.89341139793396,
+ "learning_rate": 4.419825580880152e-06,
+ "loss": 0.5454,
+ "step": 2886
+ },
+ {
+ "epoch": 1.3650118203309693,
+ "grad_norm": 2.534708261489868,
+ "learning_rate": 4.419425939041203e-06,
+ "loss": 0.5572,
+ "step": 2887
+ },
+ {
+ "epoch": 1.36548463356974,
+ "grad_norm": 2.6052141189575195,
+ "learning_rate": 4.419026177688017e-06,
+ "loss": 0.4763,
+ "step": 2888
+ },
+ {
+ "epoch": 1.3659574468085105,
+ "grad_norm": 2.723720073699951,
+ "learning_rate": 4.4186262968454854e-06,
+ "loss": 0.5659,
+ "step": 2889
+ },
+ {
+ "epoch": 1.3664302600472813,
+ "grad_norm": 2.8909599781036377,
+ "learning_rate": 4.418226296538507e-06,
+ "loss": 0.4996,
+ "step": 2890
+ },
+ {
+ "epoch": 1.366903073286052,
+ "grad_norm": 2.551375389099121,
+ "learning_rate": 4.417826176791988e-06,
+ "loss": 0.5259,
+ "step": 2891
+ },
+ {
+ "epoch": 1.3673758865248227,
+ "grad_norm": 3.360267162322998,
+ "learning_rate": 4.417425937630843e-06,
+ "loss": 0.5381,
+ "step": 2892
+ },
+ {
+ "epoch": 1.3678486997635932,
+ "grad_norm": 2.7611942291259766,
+ "learning_rate": 4.417025579079992e-06,
+ "loss": 0.6022,
+ "step": 2893
+ },
+ {
+ "epoch": 1.368321513002364,
+ "grad_norm": 2.5931224822998047,
+ "learning_rate": 4.416625101164365e-06,
+ "loss": 0.5102,
+ "step": 2894
+ },
+ {
+ "epoch": 1.3687943262411348,
+ "grad_norm": 2.5888102054595947,
+ "learning_rate": 4.416224503908897e-06,
+ "loss": 0.4955,
+ "step": 2895
+ },
+ {
+ "epoch": 1.3692671394799054,
+ "grad_norm": 2.6262896060943604,
+ "learning_rate": 4.41582378733853e-06,
+ "loss": 0.5101,
+ "step": 2896
+ },
+ {
+ "epoch": 1.369739952718676,
+ "grad_norm": 3.339170217514038,
+ "learning_rate": 4.415422951478218e-06,
+ "loss": 0.4939,
+ "step": 2897
+ },
+ {
+ "epoch": 1.3702127659574468,
+ "grad_norm": 2.940866708755493,
+ "learning_rate": 4.415021996352917e-06,
+ "loss": 0.5157,
+ "step": 2898
+ },
+ {
+ "epoch": 1.3706855791962176,
+ "grad_norm": 2.7423818111419678,
+ "learning_rate": 4.414620921987594e-06,
+ "loss": 0.5308,
+ "step": 2899
+ },
+ {
+ "epoch": 1.3711583924349882,
+ "grad_norm": 2.7177040576934814,
+ "learning_rate": 4.414219728407221e-06,
+ "loss": 0.5429,
+ "step": 2900
+ },
+ {
+ "epoch": 1.3716312056737587,
+ "grad_norm": 2.560774087905884,
+ "learning_rate": 4.4138184156367794e-06,
+ "loss": 0.5266,
+ "step": 2901
+ },
+ {
+ "epoch": 1.3721040189125295,
+ "grad_norm": 2.5649116039276123,
+ "learning_rate": 4.413416983701256e-06,
+ "loss": 0.4718,
+ "step": 2902
+ },
+ {
+ "epoch": 1.3725768321513003,
+ "grad_norm": 2.8547167778015137,
+ "learning_rate": 4.413015432625648e-06,
+ "loss": 0.5129,
+ "step": 2903
+ },
+ {
+ "epoch": 1.373049645390071,
+ "grad_norm": 2.5413618087768555,
+ "learning_rate": 4.412613762434958e-06,
+ "loss": 0.5738,
+ "step": 2904
+ },
+ {
+ "epoch": 1.3735224586288415,
+ "grad_norm": 3.3252241611480713,
+ "learning_rate": 4.412211973154195e-06,
+ "loss": 0.5639,
+ "step": 2905
+ },
+ {
+ "epoch": 1.3739952718676123,
+ "grad_norm": 2.869102954864502,
+ "learning_rate": 4.411810064808376e-06,
+ "loss": 0.5384,
+ "step": 2906
+ },
+ {
+ "epoch": 1.374468085106383,
+ "grad_norm": 2.703199863433838,
+ "learning_rate": 4.411408037422529e-06,
+ "loss": 0.5742,
+ "step": 2907
+ },
+ {
+ "epoch": 1.3749408983451537,
+ "grad_norm": 2.685450792312622,
+ "learning_rate": 4.411005891021684e-06,
+ "loss": 0.5121,
+ "step": 2908
+ },
+ {
+ "epoch": 1.3754137115839242,
+ "grad_norm": 2.9572203159332275,
+ "learning_rate": 4.410603625630882e-06,
+ "loss": 0.5444,
+ "step": 2909
+ },
+ {
+ "epoch": 1.375886524822695,
+ "grad_norm": 2.707002878189087,
+ "learning_rate": 4.410201241275169e-06,
+ "loss": 0.5125,
+ "step": 2910
+ },
+ {
+ "epoch": 1.3763593380614658,
+ "grad_norm": 3.0158939361572266,
+ "learning_rate": 4.409798737979602e-06,
+ "loss": 0.5299,
+ "step": 2911
+ },
+ {
+ "epoch": 1.3768321513002364,
+ "grad_norm": 2.7932698726654053,
+ "learning_rate": 4.4093961157692415e-06,
+ "loss": 0.5437,
+ "step": 2912
+ },
+ {
+ "epoch": 1.377304964539007,
+ "grad_norm": 2.459510326385498,
+ "learning_rate": 4.408993374669156e-06,
+ "loss": 0.5548,
+ "step": 2913
+ },
+ {
+ "epoch": 1.3777777777777778,
+ "grad_norm": 2.7500696182250977,
+ "learning_rate": 4.408590514704425e-06,
+ "loss": 0.5186,
+ "step": 2914
+ },
+ {
+ "epoch": 1.3782505910165486,
+ "grad_norm": 2.7824268341064453,
+ "learning_rate": 4.4081875359001315e-06,
+ "loss": 0.4762,
+ "step": 2915
+ },
+ {
+ "epoch": 1.3787234042553191,
+ "grad_norm": 2.4202158451080322,
+ "learning_rate": 4.4077844382813675e-06,
+ "loss": 0.5005,
+ "step": 2916
+ },
+ {
+ "epoch": 1.3791962174940897,
+ "grad_norm": 2.5566670894622803,
+ "learning_rate": 4.4073812218732316e-06,
+ "loss": 0.5377,
+ "step": 2917
+ },
+ {
+ "epoch": 1.3796690307328605,
+ "grad_norm": 3.400874376296997,
+ "learning_rate": 4.406977886700831e-06,
+ "loss": 0.6637,
+ "step": 2918
+ },
+ {
+ "epoch": 1.3801418439716313,
+ "grad_norm": 2.8187878131866455,
+ "learning_rate": 4.406574432789278e-06,
+ "loss": 0.5033,
+ "step": 2919
+ },
+ {
+ "epoch": 1.3806146572104019,
+ "grad_norm": 2.5578041076660156,
+ "learning_rate": 4.406170860163697e-06,
+ "loss": 0.5293,
+ "step": 2920
+ },
+ {
+ "epoch": 1.3810874704491725,
+ "grad_norm": 2.6709718704223633,
+ "learning_rate": 4.405767168849213e-06,
+ "loss": 0.5144,
+ "step": 2921
+ },
+ {
+ "epoch": 1.3815602836879433,
+ "grad_norm": 3.049365997314453,
+ "learning_rate": 4.405363358870965e-06,
+ "loss": 0.4894,
+ "step": 2922
+ },
+ {
+ "epoch": 1.382033096926714,
+ "grad_norm": 2.5569891929626465,
+ "learning_rate": 4.404959430254095e-06,
+ "loss": 0.4929,
+ "step": 2923
+ },
+ {
+ "epoch": 1.3825059101654846,
+ "grad_norm": 2.8288230895996094,
+ "learning_rate": 4.404555383023754e-06,
+ "loss": 0.5438,
+ "step": 2924
+ },
+ {
+ "epoch": 1.3829787234042552,
+ "grad_norm": 2.8363358974456787,
+ "learning_rate": 4.404151217205102e-06,
+ "loss": 0.545,
+ "step": 2925
+ },
+ {
+ "epoch": 1.383451536643026,
+ "grad_norm": 2.720972776412964,
+ "learning_rate": 4.403746932823302e-06,
+ "loss": 0.5732,
+ "step": 2926
+ },
+ {
+ "epoch": 1.3839243498817968,
+ "grad_norm": 2.728043794631958,
+ "learning_rate": 4.403342529903528e-06,
+ "loss": 0.4944,
+ "step": 2927
+ },
+ {
+ "epoch": 1.3843971631205674,
+ "grad_norm": 2.4366135597229004,
+ "learning_rate": 4.402938008470961e-06,
+ "loss": 0.4441,
+ "step": 2928
+ },
+ {
+ "epoch": 1.384869976359338,
+ "grad_norm": 2.858454704284668,
+ "learning_rate": 4.402533368550788e-06,
+ "loss": 0.5359,
+ "step": 2929
+ },
+ {
+ "epoch": 1.3853427895981087,
+ "grad_norm": 2.805795907974243,
+ "learning_rate": 4.402128610168205e-06,
+ "loss": 0.4954,
+ "step": 2930
+ },
+ {
+ "epoch": 1.3858156028368795,
+ "grad_norm": 3.3514177799224854,
+ "learning_rate": 4.401723733348413e-06,
+ "loss": 0.579,
+ "step": 2931
+ },
+ {
+ "epoch": 1.3862884160756501,
+ "grad_norm": 2.6255125999450684,
+ "learning_rate": 4.401318738116624e-06,
+ "loss": 0.5002,
+ "step": 2932
+ },
+ {
+ "epoch": 1.3867612293144207,
+ "grad_norm": 2.3480796813964844,
+ "learning_rate": 4.400913624498054e-06,
+ "loss": 0.4688,
+ "step": 2933
+ },
+ {
+ "epoch": 1.3872340425531915,
+ "grad_norm": 2.710165023803711,
+ "learning_rate": 4.400508392517927e-06,
+ "loss": 0.5099,
+ "step": 2934
+ },
+ {
+ "epoch": 1.3877068557919623,
+ "grad_norm": 2.5820295810699463,
+ "learning_rate": 4.400103042201477e-06,
+ "loss": 0.512,
+ "step": 2935
+ },
+ {
+ "epoch": 1.3881796690307329,
+ "grad_norm": 2.750596523284912,
+ "learning_rate": 4.399697573573942e-06,
+ "loss": 0.463,
+ "step": 2936
+ },
+ {
+ "epoch": 1.3886524822695034,
+ "grad_norm": 3.497537612915039,
+ "learning_rate": 4.399291986660569e-06,
+ "loss": 0.5676,
+ "step": 2937
+ },
+ {
+ "epoch": 1.3891252955082742,
+ "grad_norm": 2.4046003818511963,
+ "learning_rate": 4.398886281486612e-06,
+ "loss": 0.5408,
+ "step": 2938
+ },
+ {
+ "epoch": 1.389598108747045,
+ "grad_norm": 2.941606283187866,
+ "learning_rate": 4.398480458077332e-06,
+ "loss": 0.5734,
+ "step": 2939
+ },
+ {
+ "epoch": 1.3900709219858156,
+ "grad_norm": 3.030214309692383,
+ "learning_rate": 4.398074516458e-06,
+ "loss": 0.5353,
+ "step": 2940
+ },
+ {
+ "epoch": 1.3905437352245862,
+ "grad_norm": 2.9991626739501953,
+ "learning_rate": 4.397668456653889e-06,
+ "loss": 0.5989,
+ "step": 2941
+ },
+ {
+ "epoch": 1.391016548463357,
+ "grad_norm": 4.163141250610352,
+ "learning_rate": 4.397262278690285e-06,
+ "loss": 0.5436,
+ "step": 2942
+ },
+ {
+ "epoch": 1.3914893617021278,
+ "grad_norm": 2.6576037406921387,
+ "learning_rate": 4.396855982592478e-06,
+ "loss": 0.5206,
+ "step": 2943
+ },
+ {
+ "epoch": 1.3919621749408984,
+ "grad_norm": 2.7729203701019287,
+ "learning_rate": 4.396449568385768e-06,
+ "loss": 0.5403,
+ "step": 2944
+ },
+ {
+ "epoch": 1.392434988179669,
+ "grad_norm": 2.4560446739196777,
+ "learning_rate": 4.396043036095457e-06,
+ "loss": 0.4924,
+ "step": 2945
+ },
+ {
+ "epoch": 1.3929078014184397,
+ "grad_norm": 2.6370556354522705,
+ "learning_rate": 4.39563638574686e-06,
+ "loss": 0.5543,
+ "step": 2946
+ },
+ {
+ "epoch": 1.3933806146572105,
+ "grad_norm": 2.593914270401001,
+ "learning_rate": 4.395229617365298e-06,
+ "loss": 0.5133,
+ "step": 2947
+ },
+ {
+ "epoch": 1.393853427895981,
+ "grad_norm": 2.3583998680114746,
+ "learning_rate": 4.394822730976099e-06,
+ "loss": 0.4436,
+ "step": 2948
+ },
+ {
+ "epoch": 1.3943262411347517,
+ "grad_norm": 3.2768537998199463,
+ "learning_rate": 4.394415726604596e-06,
+ "loss": 0.5489,
+ "step": 2949
+ },
+ {
+ "epoch": 1.3947990543735225,
+ "grad_norm": 2.88662052154541,
+ "learning_rate": 4.394008604276133e-06,
+ "loss": 0.5194,
+ "step": 2950
+ },
+ {
+ "epoch": 1.3952718676122933,
+ "grad_norm": 2.46610426902771,
+ "learning_rate": 4.393601364016059e-06,
+ "loss": 0.5255,
+ "step": 2951
+ },
+ {
+ "epoch": 1.3957446808510638,
+ "grad_norm": 3.122509241104126,
+ "learning_rate": 4.393194005849731e-06,
+ "loss": 0.6046,
+ "step": 2952
+ },
+ {
+ "epoch": 1.3962174940898344,
+ "grad_norm": 2.724926471710205,
+ "learning_rate": 4.392786529802513e-06,
+ "loss": 0.4958,
+ "step": 2953
+ },
+ {
+ "epoch": 1.3966903073286052,
+ "grad_norm": 2.491485595703125,
+ "learning_rate": 4.3923789358997785e-06,
+ "loss": 0.5209,
+ "step": 2954
+ },
+ {
+ "epoch": 1.397163120567376,
+ "grad_norm": 2.61110520362854,
+ "learning_rate": 4.3919712241669056e-06,
+ "loss": 0.5202,
+ "step": 2955
+ },
+ {
+ "epoch": 1.3976359338061466,
+ "grad_norm": 2.3814501762390137,
+ "learning_rate": 4.39156339462928e-06,
+ "loss": 0.4966,
+ "step": 2956
+ },
+ {
+ "epoch": 1.3981087470449172,
+ "grad_norm": 2.762498617172241,
+ "learning_rate": 4.391155447312296e-06,
+ "loss": 0.6025,
+ "step": 2957
+ },
+ {
+ "epoch": 1.398581560283688,
+ "grad_norm": 2.964975595474243,
+ "learning_rate": 4.390747382241355e-06,
+ "loss": 0.4845,
+ "step": 2958
+ },
+ {
+ "epoch": 1.3990543735224588,
+ "grad_norm": 3.0117249488830566,
+ "learning_rate": 4.3903391994418655e-06,
+ "loss": 0.5326,
+ "step": 2959
+ },
+ {
+ "epoch": 1.3995271867612293,
+ "grad_norm": 2.578626871109009,
+ "learning_rate": 4.389930898939243e-06,
+ "loss": 0.5271,
+ "step": 2960
+ },
+ {
+ "epoch": 1.4,
+ "grad_norm": 2.747441053390503,
+ "learning_rate": 4.38952248075891e-06,
+ "loss": 0.5553,
+ "step": 2961
+ },
+ {
+ "epoch": 1.4004728132387707,
+ "grad_norm": 2.8273086547851562,
+ "learning_rate": 4.389113944926297e-06,
+ "loss": 0.5475,
+ "step": 2962
+ },
+ {
+ "epoch": 1.4009456264775415,
+ "grad_norm": 2.55238676071167,
+ "learning_rate": 4.388705291466843e-06,
+ "loss": 0.4864,
+ "step": 2963
+ },
+ {
+ "epoch": 1.401418439716312,
+ "grad_norm": 2.597214460372925,
+ "learning_rate": 4.388296520405992e-06,
+ "loss": 0.4845,
+ "step": 2964
+ },
+ {
+ "epoch": 1.4018912529550827,
+ "grad_norm": 2.608962297439575,
+ "learning_rate": 4.387887631769196e-06,
+ "loss": 0.5544,
+ "step": 2965
+ },
+ {
+ "epoch": 1.4023640661938535,
+ "grad_norm": 2.2754876613616943,
+ "learning_rate": 4.3874786255819165e-06,
+ "loss": 0.5045,
+ "step": 2966
+ },
+ {
+ "epoch": 1.4028368794326243,
+ "grad_norm": 2.9900264739990234,
+ "learning_rate": 4.387069501869618e-06,
+ "loss": 0.562,
+ "step": 2967
+ },
+ {
+ "epoch": 1.4033096926713948,
+ "grad_norm": 2.8069417476654053,
+ "learning_rate": 4.386660260657778e-06,
+ "loss": 0.5284,
+ "step": 2968
+ },
+ {
+ "epoch": 1.4037825059101654,
+ "grad_norm": 2.68894624710083,
+ "learning_rate": 4.386250901971875e-06,
+ "loss": 0.5879,
+ "step": 2969
+ },
+ {
+ "epoch": 1.4042553191489362,
+ "grad_norm": 2.614485025405884,
+ "learning_rate": 4.385841425837399e-06,
+ "loss": 0.4771,
+ "step": 2970
+ },
+ {
+ "epoch": 1.4047281323877068,
+ "grad_norm": 2.487950325012207,
+ "learning_rate": 4.385431832279848e-06,
+ "loss": 0.5552,
+ "step": 2971
+ },
+ {
+ "epoch": 1.4052009456264776,
+ "grad_norm": 2.5098392963409424,
+ "learning_rate": 4.385022121324723e-06,
+ "loss": 0.5267,
+ "step": 2972
+ },
+ {
+ "epoch": 1.4056737588652481,
+ "grad_norm": 2.825838565826416,
+ "learning_rate": 4.384612292997537e-06,
+ "loss": 0.5336,
+ "step": 2973
+ },
+ {
+ "epoch": 1.406146572104019,
+ "grad_norm": 2.898188829421997,
+ "learning_rate": 4.384202347323806e-06,
+ "loss": 0.5685,
+ "step": 2974
+ },
+ {
+ "epoch": 1.4066193853427895,
+ "grad_norm": 2.8722569942474365,
+ "learning_rate": 4.383792284329057e-06,
+ "loss": 0.5977,
+ "step": 2975
+ },
+ {
+ "epoch": 1.4070921985815603,
+ "grad_norm": 2.832951307296753,
+ "learning_rate": 4.3833821040388235e-06,
+ "loss": 0.5766,
+ "step": 2976
+ },
+ {
+ "epoch": 1.407565011820331,
+ "grad_norm": 2.7353670597076416,
+ "learning_rate": 4.3829718064786446e-06,
+ "loss": 0.5461,
+ "step": 2977
+ },
+ {
+ "epoch": 1.4080378250591017,
+ "grad_norm": 2.6050429344177246,
+ "learning_rate": 4.3825613916740675e-06,
+ "loss": 0.5501,
+ "step": 2978
+ },
+ {
+ "epoch": 1.4085106382978723,
+ "grad_norm": 2.79719877243042,
+ "learning_rate": 4.382150859650647e-06,
+ "loss": 0.502,
+ "step": 2979
+ },
+ {
+ "epoch": 1.408983451536643,
+ "grad_norm": 2.5538079738616943,
+ "learning_rate": 4.381740210433946e-06,
+ "loss": 0.4762,
+ "step": 2980
+ },
+ {
+ "epoch": 1.4094562647754136,
+ "grad_norm": 2.7256062030792236,
+ "learning_rate": 4.381329444049533e-06,
+ "loss": 0.4692,
+ "step": 2981
+ },
+ {
+ "epoch": 1.4099290780141844,
+ "grad_norm": 2.7778146266937256,
+ "learning_rate": 4.3809185605229855e-06,
+ "loss": 0.5366,
+ "step": 2982
+ },
+ {
+ "epoch": 1.410401891252955,
+ "grad_norm": 2.6289451122283936,
+ "learning_rate": 4.380507559879887e-06,
+ "loss": 0.5412,
+ "step": 2983
+ },
+ {
+ "epoch": 1.4108747044917258,
+ "grad_norm": 2.697204828262329,
+ "learning_rate": 4.380096442145827e-06,
+ "loss": 0.5065,
+ "step": 2984
+ },
+ {
+ "epoch": 1.4113475177304964,
+ "grad_norm": 2.4709219932556152,
+ "learning_rate": 4.379685207346407e-06,
+ "loss": 0.568,
+ "step": 2985
+ },
+ {
+ "epoch": 1.4118203309692672,
+ "grad_norm": 2.9740655422210693,
+ "learning_rate": 4.379273855507231e-06,
+ "loss": 0.5512,
+ "step": 2986
+ },
+ {
+ "epoch": 1.4122931442080378,
+ "grad_norm": 3.0090627670288086,
+ "learning_rate": 4.378862386653911e-06,
+ "loss": 0.5459,
+ "step": 2987
+ },
+ {
+ "epoch": 1.4127659574468086,
+ "grad_norm": 2.8835368156433105,
+ "learning_rate": 4.378450800812071e-06,
+ "loss": 0.5357,
+ "step": 2988
+ },
+ {
+ "epoch": 1.4132387706855791,
+ "grad_norm": 2.558824062347412,
+ "learning_rate": 4.378039098007335e-06,
+ "loss": 0.536,
+ "step": 2989
+ },
+ {
+ "epoch": 1.41371158392435,
+ "grad_norm": 2.5572092533111572,
+ "learning_rate": 4.377627278265339e-06,
+ "loss": 0.5183,
+ "step": 2990
+ },
+ {
+ "epoch": 1.4141843971631205,
+ "grad_norm": 2.7356579303741455,
+ "learning_rate": 4.377215341611727e-06,
+ "loss": 0.5087,
+ "step": 2991
+ },
+ {
+ "epoch": 1.4146572104018913,
+ "grad_norm": 2.7541024684906006,
+ "learning_rate": 4.376803288072146e-06,
+ "loss": 0.4509,
+ "step": 2992
+ },
+ {
+ "epoch": 1.4151300236406619,
+ "grad_norm": 2.7548446655273438,
+ "learning_rate": 4.376391117672254e-06,
+ "loss": 0.5532,
+ "step": 2993
+ },
+ {
+ "epoch": 1.4156028368794327,
+ "grad_norm": 2.9107465744018555,
+ "learning_rate": 4.375978830437715e-06,
+ "loss": 0.5719,
+ "step": 2994
+ },
+ {
+ "epoch": 1.4160756501182032,
+ "grad_norm": 2.7077393531799316,
+ "learning_rate": 4.3755664263942e-06,
+ "loss": 0.5084,
+ "step": 2995
+ },
+ {
+ "epoch": 1.416548463356974,
+ "grad_norm": 2.764209270477295,
+ "learning_rate": 4.375153905567388e-06,
+ "loss": 0.5976,
+ "step": 2996
+ },
+ {
+ "epoch": 1.4170212765957446,
+ "grad_norm": 2.7792932987213135,
+ "learning_rate": 4.374741267982964e-06,
+ "loss": 0.5358,
+ "step": 2997
+ },
+ {
+ "epoch": 1.4174940898345154,
+ "grad_norm": 2.459212064743042,
+ "learning_rate": 4.374328513666622e-06,
+ "loss": 0.5181,
+ "step": 2998
+ },
+ {
+ "epoch": 1.417966903073286,
+ "grad_norm": 2.548546552658081,
+ "learning_rate": 4.373915642644062e-06,
+ "loss": 0.528,
+ "step": 2999
+ },
+ {
+ "epoch": 1.4184397163120568,
+ "grad_norm": 2.998138189315796,
+ "learning_rate": 4.373502654940992e-06,
+ "loss": 0.5233,
+ "step": 3000
+ },
+ {
+ "epoch": 1.4189125295508274,
+ "grad_norm": 2.604341983795166,
+ "learning_rate": 4.373089550583126e-06,
+ "loss": 0.5274,
+ "step": 3001
+ },
+ {
+ "epoch": 1.4193853427895982,
+ "grad_norm": 2.6792588233947754,
+ "learning_rate": 4.372676329596188e-06,
+ "loss": 0.5061,
+ "step": 3002
+ },
+ {
+ "epoch": 1.4198581560283687,
+ "grad_norm": 2.5182368755340576,
+ "learning_rate": 4.372262992005906e-06,
+ "loss": 0.541,
+ "step": 3003
+ },
+ {
+ "epoch": 1.4203309692671395,
+ "grad_norm": 2.690718173980713,
+ "learning_rate": 4.371849537838018e-06,
+ "loss": 0.5308,
+ "step": 3004
+ },
+ {
+ "epoch": 1.42080378250591,
+ "grad_norm": 2.6797590255737305,
+ "learning_rate": 4.371435967118266e-06,
+ "loss": 0.5728,
+ "step": 3005
+ },
+ {
+ "epoch": 1.421276595744681,
+ "grad_norm": 2.847900152206421,
+ "learning_rate": 4.371022279872403e-06,
+ "loss": 0.5053,
+ "step": 3006
+ },
+ {
+ "epoch": 1.4217494089834515,
+ "grad_norm": 2.497810125350952,
+ "learning_rate": 4.370608476126186e-06,
+ "loss": 0.5057,
+ "step": 3007
+ },
+ {
+ "epoch": 1.4222222222222223,
+ "grad_norm": 2.5259225368499756,
+ "learning_rate": 4.370194555905382e-06,
+ "loss": 0.5508,
+ "step": 3008
+ },
+ {
+ "epoch": 1.4226950354609929,
+ "grad_norm": 2.774118423461914,
+ "learning_rate": 4.369780519235763e-06,
+ "loss": 0.5419,
+ "step": 3009
+ },
+ {
+ "epoch": 1.4231678486997636,
+ "grad_norm": 2.2764663696289062,
+ "learning_rate": 4.369366366143111e-06,
+ "loss": 0.5032,
+ "step": 3010
+ },
+ {
+ "epoch": 1.4236406619385342,
+ "grad_norm": 2.736347198486328,
+ "learning_rate": 4.368952096653211e-06,
+ "loss": 0.5184,
+ "step": 3011
+ },
+ {
+ "epoch": 1.424113475177305,
+ "grad_norm": 2.476762056350708,
+ "learning_rate": 4.36853771079186e-06,
+ "loss": 0.5331,
+ "step": 3012
+ },
+ {
+ "epoch": 1.4245862884160756,
+ "grad_norm": 2.8006162643432617,
+ "learning_rate": 4.3681232085848585e-06,
+ "loss": 0.5331,
+ "step": 3013
+ },
+ {
+ "epoch": 1.4250591016548464,
+ "grad_norm": 2.509143590927124,
+ "learning_rate": 4.367708590058016e-06,
+ "loss": 0.5127,
+ "step": 3014
+ },
+ {
+ "epoch": 1.425531914893617,
+ "grad_norm": 3.030137538909912,
+ "learning_rate": 4.3672938552371505e-06,
+ "loss": 0.5555,
+ "step": 3015
+ },
+ {
+ "epoch": 1.4260047281323878,
+ "grad_norm": 3.0536904335021973,
+ "learning_rate": 4.3668790041480835e-06,
+ "loss": 0.5241,
+ "step": 3016
+ },
+ {
+ "epoch": 1.4264775413711583,
+ "grad_norm": 2.6400439739227295,
+ "learning_rate": 4.366464036816647e-06,
+ "loss": 0.4946,
+ "step": 3017
+ },
+ {
+ "epoch": 1.4269503546099291,
+ "grad_norm": 2.7302589416503906,
+ "learning_rate": 4.366048953268679e-06,
+ "loss": 0.5105,
+ "step": 3018
+ },
+ {
+ "epoch": 1.4274231678486997,
+ "grad_norm": 2.504549264907837,
+ "learning_rate": 4.365633753530026e-06,
+ "loss": 0.4844,
+ "step": 3019
+ },
+ {
+ "epoch": 1.4278959810874705,
+ "grad_norm": 2.3872320652008057,
+ "learning_rate": 4.365218437626539e-06,
+ "loss": 0.4402,
+ "step": 3020
+ },
+ {
+ "epoch": 1.428368794326241,
+ "grad_norm": 2.531649351119995,
+ "learning_rate": 4.364803005584078e-06,
+ "loss": 0.4913,
+ "step": 3021
+ },
+ {
+ "epoch": 1.4288416075650119,
+ "grad_norm": 2.4683783054351807,
+ "learning_rate": 4.364387457428512e-06,
+ "loss": 0.515,
+ "step": 3022
+ },
+ {
+ "epoch": 1.4293144208037825,
+ "grad_norm": 2.632336378097534,
+ "learning_rate": 4.363971793185713e-06,
+ "loss": 0.5398,
+ "step": 3023
+ },
+ {
+ "epoch": 1.4297872340425533,
+ "grad_norm": 2.7456719875335693,
+ "learning_rate": 4.363556012881565e-06,
+ "loss": 0.5254,
+ "step": 3024
+ },
+ {
+ "epoch": 1.4302600472813238,
+ "grad_norm": 2.607177972793579,
+ "learning_rate": 4.363140116541955e-06,
+ "loss": 0.5266,
+ "step": 3025
+ },
+ {
+ "epoch": 1.4307328605200946,
+ "grad_norm": 2.640127420425415,
+ "learning_rate": 4.3627241041927796e-06,
+ "loss": 0.5157,
+ "step": 3026
+ },
+ {
+ "epoch": 1.4312056737588652,
+ "grad_norm": 2.4210736751556396,
+ "learning_rate": 4.362307975859941e-06,
+ "loss": 0.4599,
+ "step": 3027
+ },
+ {
+ "epoch": 1.431678486997636,
+ "grad_norm": 2.6007790565490723,
+ "learning_rate": 4.361891731569352e-06,
+ "loss": 0.5298,
+ "step": 3028
+ },
+ {
+ "epoch": 1.4321513002364066,
+ "grad_norm": 2.5352046489715576,
+ "learning_rate": 4.361475371346928e-06,
+ "loss": 0.5128,
+ "step": 3029
+ },
+ {
+ "epoch": 1.4326241134751774,
+ "grad_norm": 2.4204049110412598,
+ "learning_rate": 4.361058895218596e-06,
+ "loss": 0.4669,
+ "step": 3030
+ },
+ {
+ "epoch": 1.433096926713948,
+ "grad_norm": 2.525240182876587,
+ "learning_rate": 4.360642303210286e-06,
+ "loss": 0.4925,
+ "step": 3031
+ },
+ {
+ "epoch": 1.4335697399527187,
+ "grad_norm": 2.839646339416504,
+ "learning_rate": 4.360225595347939e-06,
+ "loss": 0.5868,
+ "step": 3032
+ },
+ {
+ "epoch": 1.4340425531914893,
+ "grad_norm": 2.5043296813964844,
+ "learning_rate": 4.359808771657501e-06,
+ "loss": 0.4951,
+ "step": 3033
+ },
+ {
+ "epoch": 1.4345153664302601,
+ "grad_norm": 2.9082300662994385,
+ "learning_rate": 4.359391832164927e-06,
+ "loss": 0.5259,
+ "step": 3034
+ },
+ {
+ "epoch": 1.4349881796690307,
+ "grad_norm": 2.6651999950408936,
+ "learning_rate": 4.3589747768961745e-06,
+ "loss": 0.537,
+ "step": 3035
+ },
+ {
+ "epoch": 1.4354609929078015,
+ "grad_norm": 2.577077865600586,
+ "learning_rate": 4.358557605877216e-06,
+ "loss": 0.5186,
+ "step": 3036
+ },
+ {
+ "epoch": 1.435933806146572,
+ "grad_norm": 2.7445287704467773,
+ "learning_rate": 4.3581403191340236e-06,
+ "loss": 0.5573,
+ "step": 3037
+ },
+ {
+ "epoch": 1.4364066193853429,
+ "grad_norm": 2.502086639404297,
+ "learning_rate": 4.357722916692582e-06,
+ "loss": 0.5039,
+ "step": 3038
+ },
+ {
+ "epoch": 1.4368794326241134,
+ "grad_norm": 2.4476163387298584,
+ "learning_rate": 4.357305398578879e-06,
+ "loss": 0.5638,
+ "step": 3039
+ },
+ {
+ "epoch": 1.4373522458628842,
+ "grad_norm": 2.7705588340759277,
+ "learning_rate": 4.356887764818915e-06,
+ "loss": 0.5485,
+ "step": 3040
+ },
+ {
+ "epoch": 1.4378250591016548,
+ "grad_norm": 2.498225450515747,
+ "learning_rate": 4.356470015438691e-06,
+ "loss": 0.5486,
+ "step": 3041
+ },
+ {
+ "epoch": 1.4382978723404256,
+ "grad_norm": 2.394320011138916,
+ "learning_rate": 4.356052150464219e-06,
+ "loss": 0.512,
+ "step": 3042
+ },
+ {
+ "epoch": 1.4387706855791962,
+ "grad_norm": 2.8725767135620117,
+ "learning_rate": 4.3556341699215185e-06,
+ "loss": 0.5202,
+ "step": 3043
+ },
+ {
+ "epoch": 1.439243498817967,
+ "grad_norm": 3.1707918643951416,
+ "learning_rate": 4.355216073836615e-06,
+ "loss": 0.5229,
+ "step": 3044
+ },
+ {
+ "epoch": 1.4397163120567376,
+ "grad_norm": 2.532578468322754,
+ "learning_rate": 4.3547978622355415e-06,
+ "loss": 0.4569,
+ "step": 3045
+ },
+ {
+ "epoch": 1.4401891252955084,
+ "grad_norm": 3.0111029148101807,
+ "learning_rate": 4.354379535144338e-06,
+ "loss": 0.5801,
+ "step": 3046
+ },
+ {
+ "epoch": 1.440661938534279,
+ "grad_norm": 2.9554224014282227,
+ "learning_rate": 4.353961092589052e-06,
+ "loss": 0.5968,
+ "step": 3047
+ },
+ {
+ "epoch": 1.4411347517730497,
+ "grad_norm": 2.7562637329101562,
+ "learning_rate": 4.353542534595738e-06,
+ "loss": 0.5005,
+ "step": 3048
+ },
+ {
+ "epoch": 1.4416075650118203,
+ "grad_norm": 3.083254337310791,
+ "learning_rate": 4.3531238611904595e-06,
+ "loss": 0.5389,
+ "step": 3049
+ },
+ {
+ "epoch": 1.442080378250591,
+ "grad_norm": 2.7778005599975586,
+ "learning_rate": 4.352705072399282e-06,
+ "loss": 0.5342,
+ "step": 3050
+ },
+ {
+ "epoch": 1.4425531914893617,
+ "grad_norm": 2.6673996448516846,
+ "learning_rate": 4.3522861682482845e-06,
+ "loss": 0.5213,
+ "step": 3051
+ },
+ {
+ "epoch": 1.4430260047281322,
+ "grad_norm": 2.637605905532837,
+ "learning_rate": 4.351867148763548e-06,
+ "loss": 0.4893,
+ "step": 3052
+ },
+ {
+ "epoch": 1.443498817966903,
+ "grad_norm": 2.834469795227051,
+ "learning_rate": 4.351448013971166e-06,
+ "loss": 0.5391,
+ "step": 3053
+ },
+ {
+ "epoch": 1.4439716312056738,
+ "grad_norm": 2.824153184890747,
+ "learning_rate": 4.351028763897234e-06,
+ "loss": 0.6403,
+ "step": 3054
+ },
+ {
+ "epoch": 1.4444444444444444,
+ "grad_norm": 2.558966875076294,
+ "learning_rate": 4.350609398567857e-06,
+ "loss": 0.4912,
+ "step": 3055
+ },
+ {
+ "epoch": 1.444917257683215,
+ "grad_norm": 2.281726360321045,
+ "learning_rate": 4.3501899180091475e-06,
+ "loss": 0.4655,
+ "step": 3056
+ },
+ {
+ "epoch": 1.4453900709219858,
+ "grad_norm": 2.499472141265869,
+ "learning_rate": 4.349770322247225e-06,
+ "loss": 0.4878,
+ "step": 3057
+ },
+ {
+ "epoch": 1.4458628841607566,
+ "grad_norm": 2.578615188598633,
+ "learning_rate": 4.349350611308215e-06,
+ "loss": 0.4855,
+ "step": 3058
+ },
+ {
+ "epoch": 1.4463356973995272,
+ "grad_norm": 2.7111165523529053,
+ "learning_rate": 4.348930785218252e-06,
+ "loss": 0.5415,
+ "step": 3059
+ },
+ {
+ "epoch": 1.4468085106382977,
+ "grad_norm": 2.8081610202789307,
+ "learning_rate": 4.348510844003476e-06,
+ "loss": 0.4881,
+ "step": 3060
+ },
+ {
+ "epoch": 1.4472813238770685,
+ "grad_norm": 2.9439868927001953,
+ "learning_rate": 4.348090787690036e-06,
+ "loss": 0.5485,
+ "step": 3061
+ },
+ {
+ "epoch": 1.4477541371158393,
+ "grad_norm": 2.592532157897949,
+ "learning_rate": 4.347670616304085e-06,
+ "loss": 0.4912,
+ "step": 3062
+ },
+ {
+ "epoch": 1.44822695035461,
+ "grad_norm": 2.960592746734619,
+ "learning_rate": 4.347250329871787e-06,
+ "loss": 0.5473,
+ "step": 3063
+ },
+ {
+ "epoch": 1.4486997635933805,
+ "grad_norm": 2.5786688327789307,
+ "learning_rate": 4.3468299284193116e-06,
+ "loss": 0.5348,
+ "step": 3064
+ },
+ {
+ "epoch": 1.4491725768321513,
+ "grad_norm": 2.6084046363830566,
+ "learning_rate": 4.346409411972834e-06,
+ "loss": 0.527,
+ "step": 3065
+ },
+ {
+ "epoch": 1.449645390070922,
+ "grad_norm": 2.489748239517212,
+ "learning_rate": 4.3459887805585385e-06,
+ "loss": 0.4943,
+ "step": 3066
+ },
+ {
+ "epoch": 1.4501182033096927,
+ "grad_norm": 2.452131986618042,
+ "learning_rate": 4.345568034202617e-06,
+ "loss": 0.4886,
+ "step": 3067
+ },
+ {
+ "epoch": 1.4505910165484632,
+ "grad_norm": 2.4034671783447266,
+ "learning_rate": 4.345147172931266e-06,
+ "loss": 0.4689,
+ "step": 3068
+ },
+ {
+ "epoch": 1.451063829787234,
+ "grad_norm": 2.6045448780059814,
+ "learning_rate": 4.344726196770691e-06,
+ "loss": 0.5842,
+ "step": 3069
+ },
+ {
+ "epoch": 1.4515366430260048,
+ "grad_norm": 2.697593927383423,
+ "learning_rate": 4.3443051057471045e-06,
+ "loss": 0.5358,
+ "step": 3070
+ },
+ {
+ "epoch": 1.4520094562647754,
+ "grad_norm": 2.6080820560455322,
+ "learning_rate": 4.343883899886727e-06,
+ "loss": 0.5361,
+ "step": 3071
+ },
+ {
+ "epoch": 1.452482269503546,
+ "grad_norm": 2.4605307579040527,
+ "learning_rate": 4.343462579215783e-06,
+ "loss": 0.4941,
+ "step": 3072
+ },
+ {
+ "epoch": 1.4529550827423168,
+ "grad_norm": 2.8025355339050293,
+ "learning_rate": 4.343041143760509e-06,
+ "loss": 0.5116,
+ "step": 3073
+ },
+ {
+ "epoch": 1.4534278959810876,
+ "grad_norm": 2.432515859603882,
+ "learning_rate": 4.3426195935471434e-06,
+ "loss": 0.4991,
+ "step": 3074
+ },
+ {
+ "epoch": 1.4539007092198581,
+ "grad_norm": 2.5838661193847656,
+ "learning_rate": 4.342197928601935e-06,
+ "loss": 0.4994,
+ "step": 3075
+ },
+ {
+ "epoch": 1.4543735224586287,
+ "grad_norm": 2.421692371368408,
+ "learning_rate": 4.341776148951141e-06,
+ "loss": 0.4945,
+ "step": 3076
+ },
+ {
+ "epoch": 1.4548463356973995,
+ "grad_norm": 2.5354676246643066,
+ "learning_rate": 4.341354254621021e-06,
+ "loss": 0.4859,
+ "step": 3077
+ },
+ {
+ "epoch": 1.4553191489361703,
+ "grad_norm": 2.7316789627075195,
+ "learning_rate": 4.340932245637846e-06,
+ "loss": 0.5136,
+ "step": 3078
+ },
+ {
+ "epoch": 1.455791962174941,
+ "grad_norm": 3.5903496742248535,
+ "learning_rate": 4.340510122027891e-06,
+ "loss": 0.6451,
+ "step": 3079
+ },
+ {
+ "epoch": 1.4562647754137115,
+ "grad_norm": 2.95190167427063,
+ "learning_rate": 4.340087883817442e-06,
+ "loss": 0.6354,
+ "step": 3080
+ },
+ {
+ "epoch": 1.4567375886524823,
+ "grad_norm": 2.8659214973449707,
+ "learning_rate": 4.339665531032789e-06,
+ "loss": 0.5514,
+ "step": 3081
+ },
+ {
+ "epoch": 1.457210401891253,
+ "grad_norm": 2.5681674480438232,
+ "learning_rate": 4.339243063700231e-06,
+ "loss": 0.5135,
+ "step": 3082
+ },
+ {
+ "epoch": 1.4576832151300236,
+ "grad_norm": 2.7353906631469727,
+ "learning_rate": 4.338820481846072e-06,
+ "loss": 0.4608,
+ "step": 3083
+ },
+ {
+ "epoch": 1.4581560283687942,
+ "grad_norm": 2.6116466522216797,
+ "learning_rate": 4.3383977854966245e-06,
+ "loss": 0.4924,
+ "step": 3084
+ },
+ {
+ "epoch": 1.458628841607565,
+ "grad_norm": 2.6676487922668457,
+ "learning_rate": 4.337974974678207e-06,
+ "loss": 0.5747,
+ "step": 3085
+ },
+ {
+ "epoch": 1.4591016548463358,
+ "grad_norm": 2.909031629562378,
+ "learning_rate": 4.337552049417147e-06,
+ "loss": 0.4618,
+ "step": 3086
+ },
+ {
+ "epoch": 1.4595744680851064,
+ "grad_norm": 2.7614190578460693,
+ "learning_rate": 4.33712900973978e-06,
+ "loss": 0.5154,
+ "step": 3087
+ },
+ {
+ "epoch": 1.460047281323877,
+ "grad_norm": 2.452188014984131,
+ "learning_rate": 4.336705855672444e-06,
+ "loss": 0.542,
+ "step": 3088
+ },
+ {
+ "epoch": 1.4605200945626478,
+ "grad_norm": 3.0004117488861084,
+ "learning_rate": 4.336282587241488e-06,
+ "loss": 0.5857,
+ "step": 3089
+ },
+ {
+ "epoch": 1.4609929078014185,
+ "grad_norm": 2.870783567428589,
+ "learning_rate": 4.335859204473268e-06,
+ "loss": 0.5506,
+ "step": 3090
+ },
+ {
+ "epoch": 1.4614657210401891,
+ "grad_norm": 3.1078689098358154,
+ "learning_rate": 4.335435707394145e-06,
+ "loss": 0.5138,
+ "step": 3091
+ },
+ {
+ "epoch": 1.4619385342789597,
+ "grad_norm": 2.8516197204589844,
+ "learning_rate": 4.335012096030488e-06,
+ "loss": 0.5842,
+ "step": 3092
+ },
+ {
+ "epoch": 1.4624113475177305,
+ "grad_norm": 2.615922212600708,
+ "learning_rate": 4.334588370408675e-06,
+ "loss": 0.4896,
+ "step": 3093
+ },
+ {
+ "epoch": 1.4628841607565013,
+ "grad_norm": 3.1911802291870117,
+ "learning_rate": 4.334164530555088e-06,
+ "loss": 0.4974,
+ "step": 3094
+ },
+ {
+ "epoch": 1.4633569739952719,
+ "grad_norm": 3.075051784515381,
+ "learning_rate": 4.3337405764961186e-06,
+ "loss": 0.567,
+ "step": 3095
+ },
+ {
+ "epoch": 1.4638297872340424,
+ "grad_norm": 2.550625801086426,
+ "learning_rate": 4.333316508258163e-06,
+ "loss": 0.4887,
+ "step": 3096
+ },
+ {
+ "epoch": 1.4643026004728132,
+ "grad_norm": 2.3986475467681885,
+ "learning_rate": 4.332892325867629e-06,
+ "loss": 0.5047,
+ "step": 3097
+ },
+ {
+ "epoch": 1.464775413711584,
+ "grad_norm": 2.5045125484466553,
+ "learning_rate": 4.332468029350926e-06,
+ "loss": 0.4721,
+ "step": 3098
+ },
+ {
+ "epoch": 1.4652482269503546,
+ "grad_norm": 2.347365617752075,
+ "learning_rate": 4.332043618734474e-06,
+ "loss": 0.4913,
+ "step": 3099
+ },
+ {
+ "epoch": 1.4657210401891252,
+ "grad_norm": 2.459928512573242,
+ "learning_rate": 4.331619094044699e-06,
+ "loss": 0.523,
+ "step": 3100
+ },
+ {
+ "epoch": 1.466193853427896,
+ "grad_norm": 2.5771310329437256,
+ "learning_rate": 4.331194455308035e-06,
+ "loss": 0.593,
+ "step": 3101
+ },
+ {
+ "epoch": 1.4666666666666668,
+ "grad_norm": 3.1351823806762695,
+ "learning_rate": 4.330769702550921e-06,
+ "loss": 0.5852,
+ "step": 3102
+ },
+ {
+ "epoch": 1.4671394799054374,
+ "grad_norm": 2.589817523956299,
+ "learning_rate": 4.330344835799806e-06,
+ "loss": 0.508,
+ "step": 3103
+ },
+ {
+ "epoch": 1.467612293144208,
+ "grad_norm": 3.1140341758728027,
+ "learning_rate": 4.329919855081144e-06,
+ "loss": 0.469,
+ "step": 3104
+ },
+ {
+ "epoch": 1.4680851063829787,
+ "grad_norm": 2.8186635971069336,
+ "learning_rate": 4.329494760421396e-06,
+ "loss": 0.5088,
+ "step": 3105
+ },
+ {
+ "epoch": 1.4685579196217495,
+ "grad_norm": 2.676077365875244,
+ "learning_rate": 4.329069551847031e-06,
+ "loss": 0.52,
+ "step": 3106
+ },
+ {
+ "epoch": 1.46903073286052,
+ "grad_norm": 2.5543313026428223,
+ "learning_rate": 4.328644229384526e-06,
+ "loss": 0.5066,
+ "step": 3107
+ },
+ {
+ "epoch": 1.4695035460992907,
+ "grad_norm": 2.8176217079162598,
+ "learning_rate": 4.328218793060362e-06,
+ "loss": 0.6404,
+ "step": 3108
+ },
+ {
+ "epoch": 1.4699763593380615,
+ "grad_norm": 2.485217332839966,
+ "learning_rate": 4.3277932429010314e-06,
+ "loss": 0.4578,
+ "step": 3109
+ },
+ {
+ "epoch": 1.4704491725768323,
+ "grad_norm": 2.6741621494293213,
+ "learning_rate": 4.327367578933031e-06,
+ "loss": 0.5068,
+ "step": 3110
+ },
+ {
+ "epoch": 1.4709219858156029,
+ "grad_norm": 2.377242088317871,
+ "learning_rate": 4.326941801182863e-06,
+ "loss": 0.5249,
+ "step": 3111
+ },
+ {
+ "epoch": 1.4713947990543734,
+ "grad_norm": 2.790046215057373,
+ "learning_rate": 4.32651590967704e-06,
+ "loss": 0.5532,
+ "step": 3112
+ },
+ {
+ "epoch": 1.4718676122931442,
+ "grad_norm": 2.78019642829895,
+ "learning_rate": 4.326089904442081e-06,
+ "loss": 0.5362,
+ "step": 3113
+ },
+ {
+ "epoch": 1.472340425531915,
+ "grad_norm": 2.5661380290985107,
+ "learning_rate": 4.32566378550451e-06,
+ "loss": 0.5041,
+ "step": 3114
+ },
+ {
+ "epoch": 1.4728132387706856,
+ "grad_norm": 2.522153615951538,
+ "learning_rate": 4.3252375528908605e-06,
+ "loss": 0.5074,
+ "step": 3115
+ },
+ {
+ "epoch": 1.4732860520094562,
+ "grad_norm": 2.874688148498535,
+ "learning_rate": 4.3248112066276725e-06,
+ "loss": 0.59,
+ "step": 3116
+ },
+ {
+ "epoch": 1.473758865248227,
+ "grad_norm": 3.067866802215576,
+ "learning_rate": 4.324384746741492e-06,
+ "loss": 0.5924,
+ "step": 3117
+ },
+ {
+ "epoch": 1.4742316784869978,
+ "grad_norm": 3.359463930130005,
+ "learning_rate": 4.323958173258873e-06,
+ "loss": 0.6346,
+ "step": 3118
+ },
+ {
+ "epoch": 1.4747044917257683,
+ "grad_norm": 2.193024158477783,
+ "learning_rate": 4.323531486206376e-06,
+ "loss": 0.4594,
+ "step": 3119
+ },
+ {
+ "epoch": 1.475177304964539,
+ "grad_norm": 2.886889934539795,
+ "learning_rate": 4.323104685610569e-06,
+ "loss": 0.523,
+ "step": 3120
+ },
+ {
+ "epoch": 1.4756501182033097,
+ "grad_norm": 2.7558681964874268,
+ "learning_rate": 4.322677771498028e-06,
+ "loss": 0.5387,
+ "step": 3121
+ },
+ {
+ "epoch": 1.4761229314420805,
+ "grad_norm": 2.639277935028076,
+ "learning_rate": 4.322250743895335e-06,
+ "loss": 0.5599,
+ "step": 3122
+ },
+ {
+ "epoch": 1.476595744680851,
+ "grad_norm": 2.786198616027832,
+ "learning_rate": 4.321823602829078e-06,
+ "loss": 0.5405,
+ "step": 3123
+ },
+ {
+ "epoch": 1.4770685579196217,
+ "grad_norm": 2.582315683364868,
+ "learning_rate": 4.321396348325853e-06,
+ "loss": 0.4452,
+ "step": 3124
+ },
+ {
+ "epoch": 1.4775413711583925,
+ "grad_norm": 2.8574297428131104,
+ "learning_rate": 4.320968980412265e-06,
+ "loss": 0.4846,
+ "step": 3125
+ },
+ {
+ "epoch": 1.4780141843971633,
+ "grad_norm": 2.705281972885132,
+ "learning_rate": 4.320541499114922e-06,
+ "loss": 0.5548,
+ "step": 3126
+ },
+ {
+ "epoch": 1.4784869976359338,
+ "grad_norm": 2.3152754306793213,
+ "learning_rate": 4.320113904460444e-06,
+ "loss": 0.5216,
+ "step": 3127
+ },
+ {
+ "epoch": 1.4789598108747044,
+ "grad_norm": 3.230764150619507,
+ "learning_rate": 4.319686196475453e-06,
+ "loss": 0.6192,
+ "step": 3128
+ },
+ {
+ "epoch": 1.4794326241134752,
+ "grad_norm": 2.463380813598633,
+ "learning_rate": 4.319258375186583e-06,
+ "loss": 0.4872,
+ "step": 3129
+ },
+ {
+ "epoch": 1.479905437352246,
+ "grad_norm": 2.8477656841278076,
+ "learning_rate": 4.31883044062047e-06,
+ "loss": 0.5371,
+ "step": 3130
+ },
+ {
+ "epoch": 1.4803782505910166,
+ "grad_norm": 2.393911123275757,
+ "learning_rate": 4.318402392803762e-06,
+ "loss": 0.5334,
+ "step": 3131
+ },
+ {
+ "epoch": 1.4808510638297872,
+ "grad_norm": 2.6113736629486084,
+ "learning_rate": 4.317974231763109e-06,
+ "loss": 0.5572,
+ "step": 3132
+ },
+ {
+ "epoch": 1.481323877068558,
+ "grad_norm": 2.3941731452941895,
+ "learning_rate": 4.317545957525173e-06,
+ "loss": 0.4849,
+ "step": 3133
+ },
+ {
+ "epoch": 1.4817966903073285,
+ "grad_norm": 2.9536755084991455,
+ "learning_rate": 4.317117570116619e-06,
+ "loss": 0.6058,
+ "step": 3134
+ },
+ {
+ "epoch": 1.4822695035460993,
+ "grad_norm": 2.595754623413086,
+ "learning_rate": 4.316689069564123e-06,
+ "loss": 0.5193,
+ "step": 3135
+ },
+ {
+ "epoch": 1.48274231678487,
+ "grad_norm": 2.569833993911743,
+ "learning_rate": 4.316260455894364e-06,
+ "loss": 0.543,
+ "step": 3136
+ },
+ {
+ "epoch": 1.4832151300236407,
+ "grad_norm": 2.5137455463409424,
+ "learning_rate": 4.315831729134031e-06,
+ "loss": 0.5415,
+ "step": 3137
+ },
+ {
+ "epoch": 1.4836879432624113,
+ "grad_norm": 2.5582292079925537,
+ "learning_rate": 4.3154028893098176e-06,
+ "loss": 0.5338,
+ "step": 3138
+ },
+ {
+ "epoch": 1.484160756501182,
+ "grad_norm": 2.666426181793213,
+ "learning_rate": 4.3149739364484265e-06,
+ "loss": 0.5435,
+ "step": 3139
+ },
+ {
+ "epoch": 1.4846335697399526,
+ "grad_norm": 2.790851354598999,
+ "learning_rate": 4.314544870576568e-06,
+ "loss": 0.5746,
+ "step": 3140
+ },
+ {
+ "epoch": 1.4851063829787234,
+ "grad_norm": 2.620326042175293,
+ "learning_rate": 4.314115691720956e-06,
+ "loss": 0.5076,
+ "step": 3141
+ },
+ {
+ "epoch": 1.485579196217494,
+ "grad_norm": 3.075674533843994,
+ "learning_rate": 4.313686399908314e-06,
+ "loss": 0.5486,
+ "step": 3142
+ },
+ {
+ "epoch": 1.4860520094562648,
+ "grad_norm": 3.1347315311431885,
+ "learning_rate": 4.3132569951653745e-06,
+ "loss": 0.531,
+ "step": 3143
+ },
+ {
+ "epoch": 1.4865248226950354,
+ "grad_norm": 2.5783653259277344,
+ "learning_rate": 4.312827477518871e-06,
+ "loss": 0.5818,
+ "step": 3144
+ },
+ {
+ "epoch": 1.4869976359338062,
+ "grad_norm": 3.0247137546539307,
+ "learning_rate": 4.3123978469955505e-06,
+ "loss": 0.5347,
+ "step": 3145
+ },
+ {
+ "epoch": 1.4874704491725768,
+ "grad_norm": 2.4789345264434814,
+ "learning_rate": 4.311968103622163e-06,
+ "loss": 0.5,
+ "step": 3146
+ },
+ {
+ "epoch": 1.4879432624113476,
+ "grad_norm": 2.663341522216797,
+ "learning_rate": 4.311538247425466e-06,
+ "loss": 0.4825,
+ "step": 3147
+ },
+ {
+ "epoch": 1.4884160756501181,
+ "grad_norm": 2.633711099624634,
+ "learning_rate": 4.311108278432226e-06,
+ "loss": 0.5244,
+ "step": 3148
+ },
+ {
+ "epoch": 1.488888888888889,
+ "grad_norm": 2.51312518119812,
+ "learning_rate": 4.310678196669216e-06,
+ "loss": 0.513,
+ "step": 3149
+ },
+ {
+ "epoch": 1.4893617021276595,
+ "grad_norm": 2.5263755321502686,
+ "learning_rate": 4.310248002163214e-06,
+ "loss": 0.5236,
+ "step": 3150
+ },
+ {
+ "epoch": 1.4898345153664303,
+ "grad_norm": 2.559216260910034,
+ "learning_rate": 4.309817694941007e-06,
+ "loss": 0.5107,
+ "step": 3151
+ },
+ {
+ "epoch": 1.4903073286052009,
+ "grad_norm": 2.5023303031921387,
+ "learning_rate": 4.309387275029386e-06,
+ "loss": 0.4685,
+ "step": 3152
+ },
+ {
+ "epoch": 1.4907801418439717,
+ "grad_norm": 3.0314254760742188,
+ "learning_rate": 4.308956742455155e-06,
+ "loss": 0.5462,
+ "step": 3153
+ },
+ {
+ "epoch": 1.4912529550827422,
+ "grad_norm": 2.675295114517212,
+ "learning_rate": 4.308526097245119e-06,
+ "loss": 0.5398,
+ "step": 3154
+ },
+ {
+ "epoch": 1.491725768321513,
+ "grad_norm": 2.6613399982452393,
+ "learning_rate": 4.308095339426094e-06,
+ "loss": 0.5376,
+ "step": 3155
+ },
+ {
+ "epoch": 1.4921985815602836,
+ "grad_norm": 2.58937668800354,
+ "learning_rate": 4.307664469024899e-06,
+ "loss": 0.5385,
+ "step": 3156
+ },
+ {
+ "epoch": 1.4926713947990544,
+ "grad_norm": 2.583631992340088,
+ "learning_rate": 4.3072334860683655e-06,
+ "loss": 0.4927,
+ "step": 3157
+ },
+ {
+ "epoch": 1.493144208037825,
+ "grad_norm": 2.5889222621917725,
+ "learning_rate": 4.306802390583327e-06,
+ "loss": 0.47,
+ "step": 3158
+ },
+ {
+ "epoch": 1.4936170212765958,
+ "grad_norm": 2.9362716674804688,
+ "learning_rate": 4.3063711825966244e-06,
+ "loss": 0.4902,
+ "step": 3159
+ },
+ {
+ "epoch": 1.4940898345153664,
+ "grad_norm": 2.5385425090789795,
+ "learning_rate": 4.305939862135111e-06,
+ "loss": 0.5396,
+ "step": 3160
+ },
+ {
+ "epoch": 1.4945626477541372,
+ "grad_norm": 2.776326894760132,
+ "learning_rate": 4.305508429225641e-06,
+ "loss": 0.5169,
+ "step": 3161
+ },
+ {
+ "epoch": 1.4950354609929077,
+ "grad_norm": 2.575063467025757,
+ "learning_rate": 4.305076883895076e-06,
+ "loss": 0.4938,
+ "step": 3162
+ },
+ {
+ "epoch": 1.4955082742316785,
+ "grad_norm": 2.7552313804626465,
+ "learning_rate": 4.304645226170291e-06,
+ "loss": 0.6211,
+ "step": 3163
+ },
+ {
+ "epoch": 1.4959810874704491,
+ "grad_norm": 2.57149338722229,
+ "learning_rate": 4.30421345607816e-06,
+ "loss": 0.5241,
+ "step": 3164
+ },
+ {
+ "epoch": 1.49645390070922,
+ "grad_norm": 2.8142426013946533,
+ "learning_rate": 4.303781573645568e-06,
+ "loss": 0.5699,
+ "step": 3165
+ },
+ {
+ "epoch": 1.4969267139479905,
+ "grad_norm": 2.6344845294952393,
+ "learning_rate": 4.303349578899407e-06,
+ "loss": 0.5049,
+ "step": 3166
+ },
+ {
+ "epoch": 1.4973995271867613,
+ "grad_norm": 2.554410934448242,
+ "learning_rate": 4.302917471866575e-06,
+ "loss": 0.4404,
+ "step": 3167
+ },
+ {
+ "epoch": 1.4978723404255319,
+ "grad_norm": 2.896240711212158,
+ "learning_rate": 4.302485252573978e-06,
+ "loss": 0.602,
+ "step": 3168
+ },
+ {
+ "epoch": 1.4983451536643027,
+ "grad_norm": 2.4044477939605713,
+ "learning_rate": 4.302052921048527e-06,
+ "loss": 0.4857,
+ "step": 3169
+ },
+ {
+ "epoch": 1.4988179669030732,
+ "grad_norm": 2.7447879314422607,
+ "learning_rate": 4.301620477317144e-06,
+ "loss": 0.5438,
+ "step": 3170
+ },
+ {
+ "epoch": 1.499290780141844,
+ "grad_norm": 2.851820945739746,
+ "learning_rate": 4.301187921406752e-06,
+ "loss": 0.5245,
+ "step": 3171
+ },
+ {
+ "epoch": 1.4997635933806146,
+ "grad_norm": 3.247114419937134,
+ "learning_rate": 4.300755253344287e-06,
+ "loss": 0.504,
+ "step": 3172
+ },
+ {
+ "epoch": 1.5002364066193854,
+ "grad_norm": 3.117490291595459,
+ "learning_rate": 4.300322473156688e-06,
+ "loss": 0.4627,
+ "step": 3173
+ },
+ {
+ "epoch": 1.500709219858156,
+ "grad_norm": 2.558319330215454,
+ "learning_rate": 4.299889580870904e-06,
+ "loss": 0.5721,
+ "step": 3174
+ },
+ {
+ "epoch": 1.5011820330969265,
+ "grad_norm": 2.8983113765716553,
+ "learning_rate": 4.2994565765138865e-06,
+ "loss": 0.5257,
+ "step": 3175
+ },
+ {
+ "epoch": 1.5016548463356973,
+ "grad_norm": 2.744056463241577,
+ "learning_rate": 4.299023460112599e-06,
+ "loss": 0.4892,
+ "step": 3176
+ },
+ {
+ "epoch": 1.5021276595744681,
+ "grad_norm": 2.5506751537323,
+ "learning_rate": 4.29859023169401e-06,
+ "loss": 0.4933,
+ "step": 3177
+ },
+ {
+ "epoch": 1.5026004728132387,
+ "grad_norm": 2.842615842819214,
+ "learning_rate": 4.298156891285092e-06,
+ "loss": 0.6124,
+ "step": 3178
+ },
+ {
+ "epoch": 1.5030732860520093,
+ "grad_norm": 2.5355329513549805,
+ "learning_rate": 4.2977234389128305e-06,
+ "loss": 0.641,
+ "step": 3179
+ },
+ {
+ "epoch": 1.50354609929078,
+ "grad_norm": 2.674781084060669,
+ "learning_rate": 4.297289874604213e-06,
+ "loss": 0.475,
+ "step": 3180
+ },
+ {
+ "epoch": 1.5040189125295509,
+ "grad_norm": 2.6845548152923584,
+ "learning_rate": 4.296856198386235e-06,
+ "loss": 0.5328,
+ "step": 3181
+ },
+ {
+ "epoch": 1.5044917257683215,
+ "grad_norm": 2.9686241149902344,
+ "learning_rate": 4.296422410285902e-06,
+ "loss": 0.6216,
+ "step": 3182
+ },
+ {
+ "epoch": 1.504964539007092,
+ "grad_norm": 2.5095980167388916,
+ "learning_rate": 4.295988510330222e-06,
+ "loss": 0.4993,
+ "step": 3183
+ },
+ {
+ "epoch": 1.5054373522458628,
+ "grad_norm": 2.4906392097473145,
+ "learning_rate": 4.2955544985462125e-06,
+ "loss": 0.4795,
+ "step": 3184
+ },
+ {
+ "epoch": 1.5059101654846336,
+ "grad_norm": 2.5593366622924805,
+ "learning_rate": 4.295120374960897e-06,
+ "loss": 0.5527,
+ "step": 3185
+ },
+ {
+ "epoch": 1.5063829787234042,
+ "grad_norm": 2.691495180130005,
+ "learning_rate": 4.294686139601308e-06,
+ "loss": 0.5646,
+ "step": 3186
+ },
+ {
+ "epoch": 1.5068557919621748,
+ "grad_norm": 2.74320387840271,
+ "learning_rate": 4.294251792494483e-06,
+ "loss": 0.6149,
+ "step": 3187
+ },
+ {
+ "epoch": 1.5073286052009456,
+ "grad_norm": 2.8827052116394043,
+ "learning_rate": 4.293817333667465e-06,
+ "loss": 0.5414,
+ "step": 3188
+ },
+ {
+ "epoch": 1.5078014184397164,
+ "grad_norm": 2.5652425289154053,
+ "learning_rate": 4.293382763147308e-06,
+ "loss": 0.5006,
+ "step": 3189
+ },
+ {
+ "epoch": 1.508274231678487,
+ "grad_norm": 2.729295253753662,
+ "learning_rate": 4.29294808096107e-06,
+ "loss": 0.522,
+ "step": 3190
+ },
+ {
+ "epoch": 1.5087470449172575,
+ "grad_norm": 2.348118305206299,
+ "learning_rate": 4.292513287135817e-06,
+ "loss": 0.4125,
+ "step": 3191
+ },
+ {
+ "epoch": 1.5092198581560283,
+ "grad_norm": 2.809551954269409,
+ "learning_rate": 4.292078381698621e-06,
+ "loss": 0.5577,
+ "step": 3192
+ },
+ {
+ "epoch": 1.5096926713947991,
+ "grad_norm": 2.6925361156463623,
+ "learning_rate": 4.291643364676563e-06,
+ "loss": 0.62,
+ "step": 3193
+ },
+ {
+ "epoch": 1.5101654846335697,
+ "grad_norm": 2.4200620651245117,
+ "learning_rate": 4.291208236096729e-06,
+ "loss": 0.5464,
+ "step": 3194
+ },
+ {
+ "epoch": 1.5106382978723403,
+ "grad_norm": 2.5659191608428955,
+ "learning_rate": 4.290772995986211e-06,
+ "loss": 0.5402,
+ "step": 3195
+ },
+ {
+ "epoch": 1.511111111111111,
+ "grad_norm": 2.3877315521240234,
+ "learning_rate": 4.290337644372113e-06,
+ "loss": 0.463,
+ "step": 3196
+ },
+ {
+ "epoch": 1.5115839243498819,
+ "grad_norm": 2.7063233852386475,
+ "learning_rate": 4.289902181281538e-06,
+ "loss": 0.5253,
+ "step": 3197
+ },
+ {
+ "epoch": 1.5120567375886524,
+ "grad_norm": 2.56788969039917,
+ "learning_rate": 4.289466606741603e-06,
+ "loss": 0.5012,
+ "step": 3198
+ },
+ {
+ "epoch": 1.512529550827423,
+ "grad_norm": 2.637164831161499,
+ "learning_rate": 4.28903092077943e-06,
+ "loss": 0.5236,
+ "step": 3199
+ },
+ {
+ "epoch": 1.5130023640661938,
+ "grad_norm": 2.767526865005493,
+ "learning_rate": 4.288595123422146e-06,
+ "loss": 0.5832,
+ "step": 3200
+ },
+ {
+ "epoch": 1.5134751773049646,
+ "grad_norm": 2.33365535736084,
+ "learning_rate": 4.2881592146968866e-06,
+ "loss": 0.4548,
+ "step": 3201
+ },
+ {
+ "epoch": 1.5139479905437352,
+ "grad_norm": 2.544189453125,
+ "learning_rate": 4.287723194630793e-06,
+ "loss": 0.5115,
+ "step": 3202
+ },
+ {
+ "epoch": 1.5144208037825058,
+ "grad_norm": 2.588793992996216,
+ "learning_rate": 4.2872870632510155e-06,
+ "loss": 0.4766,
+ "step": 3203
+ },
+ {
+ "epoch": 1.5148936170212766,
+ "grad_norm": 2.5382184982299805,
+ "learning_rate": 4.286850820584709e-06,
+ "loss": 0.5401,
+ "step": 3204
+ },
+ {
+ "epoch": 1.5153664302600474,
+ "grad_norm": 2.597930669784546,
+ "learning_rate": 4.286414466659038e-06,
+ "loss": 0.5346,
+ "step": 3205
+ },
+ {
+ "epoch": 1.515839243498818,
+ "grad_norm": 2.8522393703460693,
+ "learning_rate": 4.28597800150117e-06,
+ "loss": 0.486,
+ "step": 3206
+ },
+ {
+ "epoch": 1.5163120567375885,
+ "grad_norm": 2.4801454544067383,
+ "learning_rate": 4.285541425138285e-06,
+ "loss": 0.5162,
+ "step": 3207
+ },
+ {
+ "epoch": 1.5167848699763593,
+ "grad_norm": 2.353665351867676,
+ "learning_rate": 4.285104737597563e-06,
+ "loss": 0.5066,
+ "step": 3208
+ },
+ {
+ "epoch": 1.51725768321513,
+ "grad_norm": 2.767976760864258,
+ "learning_rate": 4.2846679389061975e-06,
+ "loss": 0.5331,
+ "step": 3209
+ },
+ {
+ "epoch": 1.5177304964539007,
+ "grad_norm": 2.9307682514190674,
+ "learning_rate": 4.284231029091385e-06,
+ "loss": 0.5291,
+ "step": 3210
+ },
+ {
+ "epoch": 1.5182033096926713,
+ "grad_norm": 2.39719820022583,
+ "learning_rate": 4.283794008180329e-06,
+ "loss": 0.4759,
+ "step": 3211
+ },
+ {
+ "epoch": 1.518676122931442,
+ "grad_norm": 2.452244758605957,
+ "learning_rate": 4.283356876200242e-06,
+ "loss": 0.4283,
+ "step": 3212
+ },
+ {
+ "epoch": 1.5191489361702128,
+ "grad_norm": 2.4911608695983887,
+ "learning_rate": 4.282919633178343e-06,
+ "loss": 0.4812,
+ "step": 3213
+ },
+ {
+ "epoch": 1.5196217494089834,
+ "grad_norm": 2.5813944339752197,
+ "learning_rate": 4.282482279141856e-06,
+ "loss": 0.4911,
+ "step": 3214
+ },
+ {
+ "epoch": 1.520094562647754,
+ "grad_norm": 2.503542184829712,
+ "learning_rate": 4.282044814118013e-06,
+ "loss": 0.4969,
+ "step": 3215
+ },
+ {
+ "epoch": 1.5205673758865248,
+ "grad_norm": 2.5090713500976562,
+ "learning_rate": 4.281607238134053e-06,
+ "loss": 0.5293,
+ "step": 3216
+ },
+ {
+ "epoch": 1.5210401891252956,
+ "grad_norm": 2.425994396209717,
+ "learning_rate": 4.281169551217223e-06,
+ "loss": 0.5365,
+ "step": 3217
+ },
+ {
+ "epoch": 1.5215130023640662,
+ "grad_norm": 2.637655258178711,
+ "learning_rate": 4.2807317533947765e-06,
+ "loss": 0.5589,
+ "step": 3218
+ },
+ {
+ "epoch": 1.5219858156028367,
+ "grad_norm": 2.9335296154022217,
+ "learning_rate": 4.28029384469397e-06,
+ "loss": 0.6071,
+ "step": 3219
+ },
+ {
+ "epoch": 1.5224586288416075,
+ "grad_norm": 2.898683547973633,
+ "learning_rate": 4.279855825142073e-06,
+ "loss": 0.5392,
+ "step": 3220
+ },
+ {
+ "epoch": 1.5229314420803783,
+ "grad_norm": 2.613914966583252,
+ "learning_rate": 4.279417694766359e-06,
+ "loss": 0.4968,
+ "step": 3221
+ },
+ {
+ "epoch": 1.523404255319149,
+ "grad_norm": 2.500682830810547,
+ "learning_rate": 4.278979453594106e-06,
+ "loss": 0.471,
+ "step": 3222
+ },
+ {
+ "epoch": 1.5238770685579195,
+ "grad_norm": 2.5269598960876465,
+ "learning_rate": 4.278541101652605e-06,
+ "loss": 0.471,
+ "step": 3223
+ },
+ {
+ "epoch": 1.5243498817966903,
+ "grad_norm": 2.8153114318847656,
+ "learning_rate": 4.2781026389691465e-06,
+ "loss": 0.5742,
+ "step": 3224
+ },
+ {
+ "epoch": 1.524822695035461,
+ "grad_norm": 2.5648019313812256,
+ "learning_rate": 4.277664065571034e-06,
+ "loss": 0.5315,
+ "step": 3225
+ },
+ {
+ "epoch": 1.5252955082742317,
+ "grad_norm": 2.778355836868286,
+ "learning_rate": 4.277225381485575e-06,
+ "loss": 0.5543,
+ "step": 3226
+ },
+ {
+ "epoch": 1.5257683215130022,
+ "grad_norm": 2.6736745834350586,
+ "learning_rate": 4.2767865867400846e-06,
+ "loss": 0.4947,
+ "step": 3227
+ },
+ {
+ "epoch": 1.526241134751773,
+ "grad_norm": 2.9560294151306152,
+ "learning_rate": 4.276347681361884e-06,
+ "loss": 0.5835,
+ "step": 3228
+ },
+ {
+ "epoch": 1.5267139479905438,
+ "grad_norm": 2.5580296516418457,
+ "learning_rate": 4.275908665378302e-06,
+ "loss": 0.4751,
+ "step": 3229
+ },
+ {
+ "epoch": 1.5271867612293144,
+ "grad_norm": 3.0705175399780273,
+ "learning_rate": 4.2754695388166755e-06,
+ "loss": 0.5327,
+ "step": 3230
+ },
+ {
+ "epoch": 1.527659574468085,
+ "grad_norm": 2.664652109146118,
+ "learning_rate": 4.275030301704346e-06,
+ "loss": 0.4934,
+ "step": 3231
+ },
+ {
+ "epoch": 1.5281323877068558,
+ "grad_norm": 2.308499813079834,
+ "learning_rate": 4.274590954068663e-06,
+ "loss": 0.4412,
+ "step": 3232
+ },
+ {
+ "epoch": 1.5286052009456266,
+ "grad_norm": 2.871189594268799,
+ "learning_rate": 4.2741514959369815e-06,
+ "loss": 0.5001,
+ "step": 3233
+ },
+ {
+ "epoch": 1.5290780141843971,
+ "grad_norm": 2.5274453163146973,
+ "learning_rate": 4.273711927336666e-06,
+ "loss": 0.4938,
+ "step": 3234
+ },
+ {
+ "epoch": 1.5295508274231677,
+ "grad_norm": 2.8848133087158203,
+ "learning_rate": 4.273272248295087e-06,
+ "loss": 0.5397,
+ "step": 3235
+ },
+ {
+ "epoch": 1.5300236406619385,
+ "grad_norm": 2.3927090167999268,
+ "learning_rate": 4.27283245883962e-06,
+ "loss": 0.5497,
+ "step": 3236
+ },
+ {
+ "epoch": 1.5304964539007093,
+ "grad_norm": 2.5413873195648193,
+ "learning_rate": 4.27239255899765e-06,
+ "loss": 0.5108,
+ "step": 3237
+ },
+ {
+ "epoch": 1.53096926713948,
+ "grad_norm": 2.7692389488220215,
+ "learning_rate": 4.271952548796567e-06,
+ "loss": 0.5768,
+ "step": 3238
+ },
+ {
+ "epoch": 1.5314420803782505,
+ "grad_norm": 2.4621126651763916,
+ "learning_rate": 4.271512428263768e-06,
+ "loss": 0.4698,
+ "step": 3239
+ },
+ {
+ "epoch": 1.5319148936170213,
+ "grad_norm": 2.6423375606536865,
+ "learning_rate": 4.271072197426659e-06,
+ "loss": 0.4929,
+ "step": 3240
+ },
+ {
+ "epoch": 1.532387706855792,
+ "grad_norm": 2.7097692489624023,
+ "learning_rate": 4.270631856312649e-06,
+ "loss": 0.4836,
+ "step": 3241
+ },
+ {
+ "epoch": 1.5328605200945626,
+ "grad_norm": 2.545706272125244,
+ "learning_rate": 4.270191404949158e-06,
+ "loss": 0.4636,
+ "step": 3242
+ },
+ {
+ "epoch": 1.5333333333333332,
+ "grad_norm": 3.138781785964966,
+ "learning_rate": 4.26975084336361e-06,
+ "loss": 0.5988,
+ "step": 3243
+ },
+ {
+ "epoch": 1.533806146572104,
+ "grad_norm": 2.492715835571289,
+ "learning_rate": 4.269310171583438e-06,
+ "loss": 0.5095,
+ "step": 3244
+ },
+ {
+ "epoch": 1.5342789598108748,
+ "grad_norm": 2.5705838203430176,
+ "learning_rate": 4.268869389636077e-06,
+ "loss": 0.4818,
+ "step": 3245
+ },
+ {
+ "epoch": 1.5347517730496454,
+ "grad_norm": 2.7633554935455322,
+ "learning_rate": 4.268428497548979e-06,
+ "loss": 0.547,
+ "step": 3246
+ },
+ {
+ "epoch": 1.535224586288416,
+ "grad_norm": 2.654528856277466,
+ "learning_rate": 4.2679874953495905e-06,
+ "loss": 0.5261,
+ "step": 3247
+ },
+ {
+ "epoch": 1.5356973995271868,
+ "grad_norm": 2.5039751529693604,
+ "learning_rate": 4.2675463830653744e-06,
+ "loss": 0.4941,
+ "step": 3248
+ },
+ {
+ "epoch": 1.5361702127659576,
+ "grad_norm": 2.897268295288086,
+ "learning_rate": 4.267105160723794e-06,
+ "loss": 0.5404,
+ "step": 3249
+ },
+ {
+ "epoch": 1.5366430260047281,
+ "grad_norm": 2.500732421875,
+ "learning_rate": 4.266663828352324e-06,
+ "loss": 0.5375,
+ "step": 3250
+ },
+ {
+ "epoch": 1.5371158392434987,
+ "grad_norm": 2.6310064792633057,
+ "learning_rate": 4.266222385978444e-06,
+ "loss": 0.5217,
+ "step": 3251
+ },
+ {
+ "epoch": 1.5375886524822695,
+ "grad_norm": 2.7440476417541504,
+ "learning_rate": 4.265780833629642e-06,
+ "loss": 0.5419,
+ "step": 3252
+ },
+ {
+ "epoch": 1.5380614657210403,
+ "grad_norm": 2.7037577629089355,
+ "learning_rate": 4.2653391713334095e-06,
+ "loss": 0.5634,
+ "step": 3253
+ },
+ {
+ "epoch": 1.5385342789598109,
+ "grad_norm": 2.548525810241699,
+ "learning_rate": 4.264897399117248e-06,
+ "loss": 0.535,
+ "step": 3254
+ },
+ {
+ "epoch": 1.5390070921985815,
+ "grad_norm": 2.6127355098724365,
+ "learning_rate": 4.264455517008663e-06,
+ "loss": 0.4619,
+ "step": 3255
+ },
+ {
+ "epoch": 1.5394799054373522,
+ "grad_norm": 2.5597004890441895,
+ "learning_rate": 4.264013525035171e-06,
+ "loss": 0.4477,
+ "step": 3256
+ },
+ {
+ "epoch": 1.539952718676123,
+ "grad_norm": 2.642432689666748,
+ "learning_rate": 4.263571423224292e-06,
+ "loss": 0.4749,
+ "step": 3257
+ },
+ {
+ "epoch": 1.5404255319148936,
+ "grad_norm": 2.5121877193450928,
+ "learning_rate": 4.2631292116035526e-06,
+ "loss": 0.4693,
+ "step": 3258
+ },
+ {
+ "epoch": 1.5408983451536642,
+ "grad_norm": 2.390292167663574,
+ "learning_rate": 4.262686890200489e-06,
+ "loss": 0.4872,
+ "step": 3259
+ },
+ {
+ "epoch": 1.541371158392435,
+ "grad_norm": 2.5898337364196777,
+ "learning_rate": 4.2622444590426405e-06,
+ "loss": 0.5193,
+ "step": 3260
+ },
+ {
+ "epoch": 1.5418439716312058,
+ "grad_norm": 2.508821487426758,
+ "learning_rate": 4.261801918157558e-06,
+ "loss": 0.511,
+ "step": 3261
+ },
+ {
+ "epoch": 1.5423167848699764,
+ "grad_norm": 2.6992101669311523,
+ "learning_rate": 4.261359267572795e-06,
+ "loss": 0.5069,
+ "step": 3262
+ },
+ {
+ "epoch": 1.542789598108747,
+ "grad_norm": 2.6011030673980713,
+ "learning_rate": 4.2609165073159145e-06,
+ "loss": 0.5887,
+ "step": 3263
+ },
+ {
+ "epoch": 1.5432624113475177,
+ "grad_norm": 2.887053966522217,
+ "learning_rate": 4.260473637414483e-06,
+ "loss": 0.5556,
+ "step": 3264
+ },
+ {
+ "epoch": 1.5437352245862885,
+ "grad_norm": 2.6433887481689453,
+ "learning_rate": 4.260030657896079e-06,
+ "loss": 0.4728,
+ "step": 3265
+ },
+ {
+ "epoch": 1.544208037825059,
+ "grad_norm": 2.6134607791900635,
+ "learning_rate": 4.259587568788282e-06,
+ "loss": 0.483,
+ "step": 3266
+ },
+ {
+ "epoch": 1.5446808510638297,
+ "grad_norm": 2.5308640003204346,
+ "learning_rate": 4.259144370118684e-06,
+ "loss": 0.5115,
+ "step": 3267
+ },
+ {
+ "epoch": 1.5451536643026005,
+ "grad_norm": 2.8256733417510986,
+ "learning_rate": 4.258701061914879e-06,
+ "loss": 0.5414,
+ "step": 3268
+ },
+ {
+ "epoch": 1.5456264775413713,
+ "grad_norm": 2.8648319244384766,
+ "learning_rate": 4.258257644204471e-06,
+ "loss": 0.5695,
+ "step": 3269
+ },
+ {
+ "epoch": 1.5460992907801419,
+ "grad_norm": 2.8568081855773926,
+ "learning_rate": 4.257814117015069e-06,
+ "loss": 0.5264,
+ "step": 3270
+ },
+ {
+ "epoch": 1.5465721040189124,
+ "grad_norm": 2.6065011024475098,
+ "learning_rate": 4.257370480374289e-06,
+ "loss": 0.5646,
+ "step": 3271
+ },
+ {
+ "epoch": 1.5470449172576832,
+ "grad_norm": 2.7840216159820557,
+ "learning_rate": 4.256926734309756e-06,
+ "loss": 0.5191,
+ "step": 3272
+ },
+ {
+ "epoch": 1.547517730496454,
+ "grad_norm": 2.85906982421875,
+ "learning_rate": 4.256482878849099e-06,
+ "loss": 0.5911,
+ "step": 3273
+ },
+ {
+ "epoch": 1.5479905437352246,
+ "grad_norm": 2.916029930114746,
+ "learning_rate": 4.256038914019954e-06,
+ "loss": 0.5589,
+ "step": 3274
+ },
+ {
+ "epoch": 1.5484633569739952,
+ "grad_norm": 2.6748716831207275,
+ "learning_rate": 4.255594839849967e-06,
+ "loss": 0.5323,
+ "step": 3275
+ },
+ {
+ "epoch": 1.548936170212766,
+ "grad_norm": 2.717212200164795,
+ "learning_rate": 4.255150656366787e-06,
+ "loss": 0.453,
+ "step": 3276
+ },
+ {
+ "epoch": 1.5494089834515368,
+ "grad_norm": 2.4974849224090576,
+ "learning_rate": 4.254706363598072e-06,
+ "loss": 0.4516,
+ "step": 3277
+ },
+ {
+ "epoch": 1.5498817966903073,
+ "grad_norm": 2.648151397705078,
+ "learning_rate": 4.254261961571485e-06,
+ "loss": 0.5452,
+ "step": 3278
+ },
+ {
+ "epoch": 1.550354609929078,
+ "grad_norm": 2.932905435562134,
+ "learning_rate": 4.253817450314699e-06,
+ "loss": 0.4813,
+ "step": 3279
+ },
+ {
+ "epoch": 1.5508274231678487,
+ "grad_norm": 2.862912178039551,
+ "learning_rate": 4.25337282985539e-06,
+ "loss": 0.5689,
+ "step": 3280
+ },
+ {
+ "epoch": 1.5513002364066195,
+ "grad_norm": 2.532156467437744,
+ "learning_rate": 4.2529281002212436e-06,
+ "loss": 0.485,
+ "step": 3281
+ },
+ {
+ "epoch": 1.55177304964539,
+ "grad_norm": 2.583299160003662,
+ "learning_rate": 4.25248326143995e-06,
+ "loss": 0.4661,
+ "step": 3282
+ },
+ {
+ "epoch": 1.5522458628841607,
+ "grad_norm": 2.5790653228759766,
+ "learning_rate": 4.252038313539209e-06,
+ "loss": 0.5455,
+ "step": 3283
+ },
+ {
+ "epoch": 1.5527186761229315,
+ "grad_norm": 2.872864007949829,
+ "learning_rate": 4.251593256546724e-06,
+ "loss": 0.5317,
+ "step": 3284
+ },
+ {
+ "epoch": 1.5531914893617023,
+ "grad_norm": 3.0382463932037354,
+ "learning_rate": 4.251148090490208e-06,
+ "loss": 0.5131,
+ "step": 3285
+ },
+ {
+ "epoch": 1.5536643026004728,
+ "grad_norm": 2.574399709701538,
+ "learning_rate": 4.250702815397379e-06,
+ "loss": 0.5399,
+ "step": 3286
+ },
+ {
+ "epoch": 1.5541371158392434,
+ "grad_norm": 2.9784770011901855,
+ "learning_rate": 4.250257431295962e-06,
+ "loss": 0.5209,
+ "step": 3287
+ },
+ {
+ "epoch": 1.5546099290780142,
+ "grad_norm": 2.6482062339782715,
+ "learning_rate": 4.249811938213689e-06,
+ "loss": 0.5416,
+ "step": 3288
+ },
+ {
+ "epoch": 1.555082742316785,
+ "grad_norm": 2.82142972946167,
+ "learning_rate": 4.2493663361783e-06,
+ "loss": 0.594,
+ "step": 3289
+ },
+ {
+ "epoch": 1.5555555555555556,
+ "grad_norm": 2.815595865249634,
+ "learning_rate": 4.24892062521754e-06,
+ "loss": 0.5381,
+ "step": 3290
+ },
+ {
+ "epoch": 1.5560283687943262,
+ "grad_norm": 2.689764976501465,
+ "learning_rate": 4.248474805359161e-06,
+ "loss": 0.5141,
+ "step": 3291
+ },
+ {
+ "epoch": 1.556501182033097,
+ "grad_norm": 2.7718515396118164,
+ "learning_rate": 4.248028876630922e-06,
+ "loss": 0.5324,
+ "step": 3292
+ },
+ {
+ "epoch": 1.5569739952718678,
+ "grad_norm": 3.0196774005889893,
+ "learning_rate": 4.247582839060591e-06,
+ "loss": 0.4971,
+ "step": 3293
+ },
+ {
+ "epoch": 1.5574468085106383,
+ "grad_norm": 2.608475923538208,
+ "learning_rate": 4.247136692675939e-06,
+ "loss": 0.5795,
+ "step": 3294
+ },
+ {
+ "epoch": 1.557919621749409,
+ "grad_norm": 2.4912326335906982,
+ "learning_rate": 4.246690437504746e-06,
+ "loss": 0.5348,
+ "step": 3295
+ },
+ {
+ "epoch": 1.5583924349881797,
+ "grad_norm": 2.519303560256958,
+ "learning_rate": 4.246244073574799e-06,
+ "loss": 0.4953,
+ "step": 3296
+ },
+ {
+ "epoch": 1.5588652482269505,
+ "grad_norm": 2.5667171478271484,
+ "learning_rate": 4.24579760091389e-06,
+ "loss": 0.5353,
+ "step": 3297
+ },
+ {
+ "epoch": 1.559338061465721,
+ "grad_norm": 2.8835761547088623,
+ "learning_rate": 4.24535101954982e-06,
+ "loss": 0.578,
+ "step": 3298
+ },
+ {
+ "epoch": 1.5598108747044916,
+ "grad_norm": 3.0506930351257324,
+ "learning_rate": 4.244904329510395e-06,
+ "loss": 0.6418,
+ "step": 3299
+ },
+ {
+ "epoch": 1.5602836879432624,
+ "grad_norm": 2.579446315765381,
+ "learning_rate": 4.244457530823428e-06,
+ "loss": 0.5027,
+ "step": 3300
+ },
+ {
+ "epoch": 1.5607565011820332,
+ "grad_norm": 2.72012996673584,
+ "learning_rate": 4.24401062351674e-06,
+ "loss": 0.5438,
+ "step": 3301
+ },
+ {
+ "epoch": 1.5612293144208038,
+ "grad_norm": 2.527007818222046,
+ "learning_rate": 4.243563607618158e-06,
+ "loss": 0.5303,
+ "step": 3302
+ },
+ {
+ "epoch": 1.5617021276595744,
+ "grad_norm": 2.4415159225463867,
+ "learning_rate": 4.243116483155516e-06,
+ "loss": 0.4893,
+ "step": 3303
+ },
+ {
+ "epoch": 1.5621749408983452,
+ "grad_norm": 2.462256669998169,
+ "learning_rate": 4.242669250156653e-06,
+ "loss": 0.5671,
+ "step": 3304
+ },
+ {
+ "epoch": 1.562647754137116,
+ "grad_norm": 2.479865074157715,
+ "learning_rate": 4.242221908649418e-06,
+ "loss": 0.5038,
+ "step": 3305
+ },
+ {
+ "epoch": 1.5631205673758866,
+ "grad_norm": 2.74670672416687,
+ "learning_rate": 4.241774458661662e-06,
+ "loss": 0.5689,
+ "step": 3306
+ },
+ {
+ "epoch": 1.5635933806146571,
+ "grad_norm": 2.55938982963562,
+ "learning_rate": 4.24132690022125e-06,
+ "loss": 0.492,
+ "step": 3307
+ },
+ {
+ "epoch": 1.564066193853428,
+ "grad_norm": 2.634956121444702,
+ "learning_rate": 4.240879233356048e-06,
+ "loss": 0.503,
+ "step": 3308
+ },
+ {
+ "epoch": 1.5645390070921987,
+ "grad_norm": 2.381775140762329,
+ "learning_rate": 4.240431458093928e-06,
+ "loss": 0.4939,
+ "step": 3309
+ },
+ {
+ "epoch": 1.5650118203309693,
+ "grad_norm": 2.8176610469818115,
+ "learning_rate": 4.239983574462774e-06,
+ "loss": 0.5609,
+ "step": 3310
+ },
+ {
+ "epoch": 1.5654846335697399,
+ "grad_norm": 3.0268442630767822,
+ "learning_rate": 4.239535582490471e-06,
+ "loss": 0.5427,
+ "step": 3311
+ },
+ {
+ "epoch": 1.5659574468085107,
+ "grad_norm": 2.5881481170654297,
+ "learning_rate": 4.239087482204916e-06,
+ "loss": 0.5538,
+ "step": 3312
+ },
+ {
+ "epoch": 1.5664302600472815,
+ "grad_norm": 2.5317704677581787,
+ "learning_rate": 4.238639273634008e-06,
+ "loss": 0.4915,
+ "step": 3313
+ },
+ {
+ "epoch": 1.566903073286052,
+ "grad_norm": 2.9608731269836426,
+ "learning_rate": 4.238190956805658e-06,
+ "loss": 0.564,
+ "step": 3314
+ },
+ {
+ "epoch": 1.5673758865248226,
+ "grad_norm": 3.022686243057251,
+ "learning_rate": 4.237742531747777e-06,
+ "loss": 0.5503,
+ "step": 3315
+ },
+ {
+ "epoch": 1.5678486997635934,
+ "grad_norm": 2.763622283935547,
+ "learning_rate": 4.23729399848829e-06,
+ "loss": 0.5241,
+ "step": 3316
+ },
+ {
+ "epoch": 1.5683215130023642,
+ "grad_norm": 2.6112794876098633,
+ "learning_rate": 4.236845357055122e-06,
+ "loss": 0.4919,
+ "step": 3317
+ },
+ {
+ "epoch": 1.5687943262411348,
+ "grad_norm": 2.649829149246216,
+ "learning_rate": 4.23639660747621e-06,
+ "loss": 0.5472,
+ "step": 3318
+ },
+ {
+ "epoch": 1.5692671394799054,
+ "grad_norm": 2.8888115882873535,
+ "learning_rate": 4.2359477497794955e-06,
+ "loss": 0.5077,
+ "step": 3319
+ },
+ {
+ "epoch": 1.5697399527186762,
+ "grad_norm": 2.5666911602020264,
+ "learning_rate": 4.235498783992927e-06,
+ "loss": 0.5365,
+ "step": 3320
+ },
+ {
+ "epoch": 1.570212765957447,
+ "grad_norm": 2.448758363723755,
+ "learning_rate": 4.2350497101444575e-06,
+ "loss": 0.5043,
+ "step": 3321
+ },
+ {
+ "epoch": 1.5706855791962175,
+ "grad_norm": 2.595207691192627,
+ "learning_rate": 4.234600528262052e-06,
+ "loss": 0.5303,
+ "step": 3322
+ },
+ {
+ "epoch": 1.5711583924349881,
+ "grad_norm": 2.7814228534698486,
+ "learning_rate": 4.234151238373676e-06,
+ "loss": 0.4521,
+ "step": 3323
+ },
+ {
+ "epoch": 1.571631205673759,
+ "grad_norm": 2.781538724899292,
+ "learning_rate": 4.233701840507308e-06,
+ "loss": 0.5193,
+ "step": 3324
+ },
+ {
+ "epoch": 1.5721040189125297,
+ "grad_norm": 2.771907329559326,
+ "learning_rate": 4.233252334690928e-06,
+ "loss": 0.497,
+ "step": 3325
+ },
+ {
+ "epoch": 1.5725768321513003,
+ "grad_norm": 2.5557498931884766,
+ "learning_rate": 4.232802720952525e-06,
+ "loss": 0.4913,
+ "step": 3326
+ },
+ {
+ "epoch": 1.5730496453900709,
+ "grad_norm": 2.478267192840576,
+ "learning_rate": 4.232352999320094e-06,
+ "loss": 0.4967,
+ "step": 3327
+ },
+ {
+ "epoch": 1.5735224586288417,
+ "grad_norm": 3.1548502445220947,
+ "learning_rate": 4.231903169821639e-06,
+ "loss": 0.5009,
+ "step": 3328
+ },
+ {
+ "epoch": 1.5739952718676125,
+ "grad_norm": 2.634824275970459,
+ "learning_rate": 4.231453232485168e-06,
+ "loss": 0.5223,
+ "step": 3329
+ },
+ {
+ "epoch": 1.574468085106383,
+ "grad_norm": 2.579102039337158,
+ "learning_rate": 4.231003187338695e-06,
+ "loss": 0.5513,
+ "step": 3330
+ },
+ {
+ "epoch": 1.5749408983451536,
+ "grad_norm": 2.8477070331573486,
+ "learning_rate": 4.230553034410245e-06,
+ "loss": 0.561,
+ "step": 3331
+ },
+ {
+ "epoch": 1.5754137115839244,
+ "grad_norm": 2.6714725494384766,
+ "learning_rate": 4.2301027737278446e-06,
+ "loss": 0.4687,
+ "step": 3332
+ },
+ {
+ "epoch": 1.5758865248226952,
+ "grad_norm": 2.6562764644622803,
+ "learning_rate": 4.229652405319532e-06,
+ "loss": 0.5925,
+ "step": 3333
+ },
+ {
+ "epoch": 1.5763593380614658,
+ "grad_norm": 2.750946283340454,
+ "learning_rate": 4.229201929213348e-06,
+ "loss": 0.4748,
+ "step": 3334
+ },
+ {
+ "epoch": 1.5768321513002364,
+ "grad_norm": 2.760470151901245,
+ "learning_rate": 4.228751345437342e-06,
+ "loss": 0.5989,
+ "step": 3335
+ },
+ {
+ "epoch": 1.5773049645390071,
+ "grad_norm": 3.1451845169067383,
+ "learning_rate": 4.2283006540195706e-06,
+ "loss": 0.562,
+ "step": 3336
+ },
+ {
+ "epoch": 1.5777777777777777,
+ "grad_norm": 2.563011407852173,
+ "learning_rate": 4.227849854988095e-06,
+ "loss": 0.5473,
+ "step": 3337
+ },
+ {
+ "epoch": 1.5782505910165483,
+ "grad_norm": 2.310469388961792,
+ "learning_rate": 4.2273989483709856e-06,
+ "loss": 0.5033,
+ "step": 3338
+ },
+ {
+ "epoch": 1.578723404255319,
+ "grad_norm": 2.677978754043579,
+ "learning_rate": 4.226947934196318e-06,
+ "loss": 0.5291,
+ "step": 3339
+ },
+ {
+ "epoch": 1.57919621749409,
+ "grad_norm": 3.0423545837402344,
+ "learning_rate": 4.226496812492176e-06,
+ "loss": 0.5201,
+ "step": 3340
+ },
+ {
+ "epoch": 1.5796690307328605,
+ "grad_norm": 2.357513904571533,
+ "learning_rate": 4.226045583286647e-06,
+ "loss": 0.4421,
+ "step": 3341
+ },
+ {
+ "epoch": 1.580141843971631,
+ "grad_norm": 2.719860315322876,
+ "learning_rate": 4.225594246607828e-06,
+ "loss": 0.4855,
+ "step": 3342
+ },
+ {
+ "epoch": 1.5806146572104018,
+ "grad_norm": 3.2645058631896973,
+ "learning_rate": 4.2251428024838215e-06,
+ "loss": 0.6654,
+ "step": 3343
+ },
+ {
+ "epoch": 1.5810874704491726,
+ "grad_norm": 2.2997004985809326,
+ "learning_rate": 4.224691250942737e-06,
+ "loss": 0.4565,
+ "step": 3344
+ },
+ {
+ "epoch": 1.5815602836879432,
+ "grad_norm": 2.8103034496307373,
+ "learning_rate": 4.2242395920126926e-06,
+ "loss": 0.5543,
+ "step": 3345
+ },
+ {
+ "epoch": 1.5820330969267138,
+ "grad_norm": 2.720254898071289,
+ "learning_rate": 4.223787825721808e-06,
+ "loss": 0.5028,
+ "step": 3346
+ },
+ {
+ "epoch": 1.5825059101654846,
+ "grad_norm": 2.735544204711914,
+ "learning_rate": 4.223335952098214e-06,
+ "loss": 0.5169,
+ "step": 3347
+ },
+ {
+ "epoch": 1.5829787234042554,
+ "grad_norm": 2.784254550933838,
+ "learning_rate": 4.222883971170047e-06,
+ "loss": 0.4989,
+ "step": 3348
+ },
+ {
+ "epoch": 1.583451536643026,
+ "grad_norm": 2.7192094326019287,
+ "learning_rate": 4.22243188296545e-06,
+ "loss": 0.502,
+ "step": 3349
+ },
+ {
+ "epoch": 1.5839243498817965,
+ "grad_norm": 2.716501474380493,
+ "learning_rate": 4.221979687512573e-06,
+ "loss": 0.5687,
+ "step": 3350
+ },
+ {
+ "epoch": 1.5843971631205673,
+ "grad_norm": 2.8420114517211914,
+ "learning_rate": 4.22152738483957e-06,
+ "loss": 0.5903,
+ "step": 3351
+ },
+ {
+ "epoch": 1.5848699763593381,
+ "grad_norm": 2.734872579574585,
+ "learning_rate": 4.2210749749746065e-06,
+ "loss": 0.5397,
+ "step": 3352
+ },
+ {
+ "epoch": 1.5853427895981087,
+ "grad_norm": 2.4343836307525635,
+ "learning_rate": 4.220622457945851e-06,
+ "loss": 0.436,
+ "step": 3353
+ },
+ {
+ "epoch": 1.5858156028368793,
+ "grad_norm": 2.728177547454834,
+ "learning_rate": 4.2201698337814785e-06,
+ "loss": 0.5703,
+ "step": 3354
+ },
+ {
+ "epoch": 1.58628841607565,
+ "grad_norm": 2.502098560333252,
+ "learning_rate": 4.219717102509674e-06,
+ "loss": 0.5275,
+ "step": 3355
+ },
+ {
+ "epoch": 1.5867612293144209,
+ "grad_norm": 2.6595494747161865,
+ "learning_rate": 4.219264264158627e-06,
+ "loss": 0.4659,
+ "step": 3356
+ },
+ {
+ "epoch": 1.5872340425531914,
+ "grad_norm": 2.5307185649871826,
+ "learning_rate": 4.218811318756532e-06,
+ "loss": 0.5048,
+ "step": 3357
+ },
+ {
+ "epoch": 1.587706855791962,
+ "grad_norm": 2.9300129413604736,
+ "learning_rate": 4.218358266331593e-06,
+ "loss": 0.5137,
+ "step": 3358
+ },
+ {
+ "epoch": 1.5881796690307328,
+ "grad_norm": 2.686586618423462,
+ "learning_rate": 4.21790510691202e-06,
+ "loss": 0.4529,
+ "step": 3359
+ },
+ {
+ "epoch": 1.5886524822695036,
+ "grad_norm": 2.9981517791748047,
+ "learning_rate": 4.217451840526029e-06,
+ "loss": 0.6054,
+ "step": 3360
+ },
+ {
+ "epoch": 1.5891252955082742,
+ "grad_norm": 2.6943674087524414,
+ "learning_rate": 4.216998467201841e-06,
+ "loss": 0.5153,
+ "step": 3361
+ },
+ {
+ "epoch": 1.5895981087470448,
+ "grad_norm": 2.707084894180298,
+ "learning_rate": 4.216544986967689e-06,
+ "loss": 0.5235,
+ "step": 3362
+ },
+ {
+ "epoch": 1.5900709219858156,
+ "grad_norm": 2.6553728580474854,
+ "learning_rate": 4.216091399851808e-06,
+ "loss": 0.5275,
+ "step": 3363
+ },
+ {
+ "epoch": 1.5905437352245864,
+ "grad_norm": 2.9136953353881836,
+ "learning_rate": 4.215637705882439e-06,
+ "loss": 0.5834,
+ "step": 3364
+ },
+ {
+ "epoch": 1.591016548463357,
+ "grad_norm": 2.7647159099578857,
+ "learning_rate": 4.2151839050878325e-06,
+ "loss": 0.5641,
+ "step": 3365
+ },
+ {
+ "epoch": 1.5914893617021275,
+ "grad_norm": 2.4556827545166016,
+ "learning_rate": 4.214729997496246e-06,
+ "loss": 0.5636,
+ "step": 3366
+ },
+ {
+ "epoch": 1.5919621749408983,
+ "grad_norm": 2.6111652851104736,
+ "learning_rate": 4.2142759831359414e-06,
+ "loss": 0.5097,
+ "step": 3367
+ },
+ {
+ "epoch": 1.592434988179669,
+ "grad_norm": 2.4886903762817383,
+ "learning_rate": 4.213821862035189e-06,
+ "loss": 0.531,
+ "step": 3368
+ },
+ {
+ "epoch": 1.5929078014184397,
+ "grad_norm": 2.5245840549468994,
+ "learning_rate": 4.213367634222263e-06,
+ "loss": 0.5085,
+ "step": 3369
+ },
+ {
+ "epoch": 1.5933806146572103,
+ "grad_norm": 2.970214605331421,
+ "learning_rate": 4.212913299725447e-06,
+ "loss": 0.5851,
+ "step": 3370
+ },
+ {
+ "epoch": 1.593853427895981,
+ "grad_norm": 2.5433361530303955,
+ "learning_rate": 4.212458858573032e-06,
+ "loss": 0.48,
+ "step": 3371
+ },
+ {
+ "epoch": 1.5943262411347519,
+ "grad_norm": 2.3550102710723877,
+ "learning_rate": 4.212004310793312e-06,
+ "loss": 0.4405,
+ "step": 3372
+ },
+ {
+ "epoch": 1.5947990543735224,
+ "grad_norm": 2.4824719429016113,
+ "learning_rate": 4.2115496564145896e-06,
+ "loss": 0.4634,
+ "step": 3373
+ },
+ {
+ "epoch": 1.595271867612293,
+ "grad_norm": 2.4751930236816406,
+ "learning_rate": 4.211094895465176e-06,
+ "loss": 0.5662,
+ "step": 3374
+ },
+ {
+ "epoch": 1.5957446808510638,
+ "grad_norm": 2.4193356037139893,
+ "learning_rate": 4.210640027973386e-06,
+ "loss": 0.4441,
+ "step": 3375
+ },
+ {
+ "epoch": 1.5962174940898346,
+ "grad_norm": 2.4477498531341553,
+ "learning_rate": 4.210185053967543e-06,
+ "loss": 0.5205,
+ "step": 3376
+ },
+ {
+ "epoch": 1.5966903073286052,
+ "grad_norm": 2.7954161167144775,
+ "learning_rate": 4.209729973475976e-06,
+ "loss": 0.4951,
+ "step": 3377
+ },
+ {
+ "epoch": 1.5971631205673757,
+ "grad_norm": 3.1907570362091064,
+ "learning_rate": 4.209274786527019e-06,
+ "loss": 0.6024,
+ "step": 3378
+ },
+ {
+ "epoch": 1.5976359338061465,
+ "grad_norm": 2.485245704650879,
+ "learning_rate": 4.2088194931490165e-06,
+ "loss": 0.5652,
+ "step": 3379
+ },
+ {
+ "epoch": 1.5981087470449173,
+ "grad_norm": 2.589310884475708,
+ "learning_rate": 4.208364093370317e-06,
+ "loss": 0.5085,
+ "step": 3380
+ },
+ {
+ "epoch": 1.598581560283688,
+ "grad_norm": 2.8941214084625244,
+ "learning_rate": 4.207908587219276e-06,
+ "loss": 0.53,
+ "step": 3381
+ },
+ {
+ "epoch": 1.5990543735224585,
+ "grad_norm": 2.480509042739868,
+ "learning_rate": 4.207452974724258e-06,
+ "loss": 0.4543,
+ "step": 3382
+ },
+ {
+ "epoch": 1.5995271867612293,
+ "grad_norm": 2.7884905338287354,
+ "learning_rate": 4.206997255913629e-06,
+ "loss": 0.5483,
+ "step": 3383
+ },
+ {
+ "epoch": 1.6,
+ "grad_norm": 2.7976696491241455,
+ "learning_rate": 4.206541430815766e-06,
+ "loss": 0.4734,
+ "step": 3384
+ },
+ {
+ "epoch": 1.6004728132387707,
+ "grad_norm": 2.5463132858276367,
+ "learning_rate": 4.206085499459051e-06,
+ "loss": 0.4931,
+ "step": 3385
+ },
+ {
+ "epoch": 1.6009456264775412,
+ "grad_norm": 2.8384251594543457,
+ "learning_rate": 4.205629461871871e-06,
+ "loss": 0.5066,
+ "step": 3386
+ },
+ {
+ "epoch": 1.601418439716312,
+ "grad_norm": 2.8578574657440186,
+ "learning_rate": 4.205173318082626e-06,
+ "loss": 0.458,
+ "step": 3387
+ },
+ {
+ "epoch": 1.6018912529550828,
+ "grad_norm": 2.7779932022094727,
+ "learning_rate": 4.204717068119715e-06,
+ "loss": 0.5293,
+ "step": 3388
+ },
+ {
+ "epoch": 1.6023640661938534,
+ "grad_norm": 2.9123778343200684,
+ "learning_rate": 4.204260712011546e-06,
+ "loss": 0.4866,
+ "step": 3389
+ },
+ {
+ "epoch": 1.602836879432624,
+ "grad_norm": 2.757922887802124,
+ "learning_rate": 4.203804249786537e-06,
+ "loss": 0.4925,
+ "step": 3390
+ },
+ {
+ "epoch": 1.6033096926713948,
+ "grad_norm": 3.287733316421509,
+ "learning_rate": 4.203347681473107e-06,
+ "loss": 0.6694,
+ "step": 3391
+ },
+ {
+ "epoch": 1.6037825059101656,
+ "grad_norm": 3.2117912769317627,
+ "learning_rate": 4.202891007099687e-06,
+ "loss": 0.5269,
+ "step": 3392
+ },
+ {
+ "epoch": 1.6042553191489362,
+ "grad_norm": 2.8489456176757812,
+ "learning_rate": 4.20243422669471e-06,
+ "loss": 0.5073,
+ "step": 3393
+ },
+ {
+ "epoch": 1.6047281323877067,
+ "grad_norm": 2.7660224437713623,
+ "learning_rate": 4.201977340286619e-06,
+ "loss": 0.5014,
+ "step": 3394
+ },
+ {
+ "epoch": 1.6052009456264775,
+ "grad_norm": 2.68182110786438,
+ "learning_rate": 4.201520347903862e-06,
+ "loss": 0.4542,
+ "step": 3395
+ },
+ {
+ "epoch": 1.6056737588652483,
+ "grad_norm": 2.7546045780181885,
+ "learning_rate": 4.2010632495748934e-06,
+ "loss": 0.516,
+ "step": 3396
+ },
+ {
+ "epoch": 1.606146572104019,
+ "grad_norm": 2.744668483734131,
+ "learning_rate": 4.200606045328176e-06,
+ "loss": 0.5243,
+ "step": 3397
+ },
+ {
+ "epoch": 1.6066193853427895,
+ "grad_norm": 2.935343027114868,
+ "learning_rate": 4.200148735192177e-06,
+ "loss": 0.5624,
+ "step": 3398
+ },
+ {
+ "epoch": 1.6070921985815603,
+ "grad_norm": 2.7392852306365967,
+ "learning_rate": 4.19969131919537e-06,
+ "loss": 0.5796,
+ "step": 3399
+ },
+ {
+ "epoch": 1.607565011820331,
+ "grad_norm": 2.864750385284424,
+ "learning_rate": 4.199233797366239e-06,
+ "loss": 0.549,
+ "step": 3400
+ },
+ {
+ "epoch": 1.6080378250591016,
+ "grad_norm": 2.684157371520996,
+ "learning_rate": 4.198776169733269e-06,
+ "loss": 0.5532,
+ "step": 3401
+ },
+ {
+ "epoch": 1.6085106382978722,
+ "grad_norm": 2.4717135429382324,
+ "learning_rate": 4.198318436324957e-06,
+ "loss": 0.5174,
+ "step": 3402
+ },
+ {
+ "epoch": 1.608983451536643,
+ "grad_norm": 2.640242338180542,
+ "learning_rate": 4.197860597169802e-06,
+ "loss": 0.5117,
+ "step": 3403
+ },
+ {
+ "epoch": 1.6094562647754138,
+ "grad_norm": 2.4957473278045654,
+ "learning_rate": 4.197402652296313e-06,
+ "loss": 0.474,
+ "step": 3404
+ },
+ {
+ "epoch": 1.6099290780141844,
+ "grad_norm": 2.416138172149658,
+ "learning_rate": 4.196944601733004e-06,
+ "loss": 0.4858,
+ "step": 3405
+ },
+ {
+ "epoch": 1.610401891252955,
+ "grad_norm": 2.4498109817504883,
+ "learning_rate": 4.196486445508395e-06,
+ "loss": 0.5048,
+ "step": 3406
+ },
+ {
+ "epoch": 1.6108747044917258,
+ "grad_norm": 2.415895938873291,
+ "learning_rate": 4.196028183651014e-06,
+ "loss": 0.4745,
+ "step": 3407
+ },
+ {
+ "epoch": 1.6113475177304966,
+ "grad_norm": 2.843665838241577,
+ "learning_rate": 4.195569816189395e-06,
+ "loss": 0.5219,
+ "step": 3408
+ },
+ {
+ "epoch": 1.6118203309692671,
+ "grad_norm": 2.608579158782959,
+ "learning_rate": 4.195111343152079e-06,
+ "loss": 0.4941,
+ "step": 3409
+ },
+ {
+ "epoch": 1.6122931442080377,
+ "grad_norm": 2.643789529800415,
+ "learning_rate": 4.194652764567611e-06,
+ "loss": 0.515,
+ "step": 3410
+ },
+ {
+ "epoch": 1.6127659574468085,
+ "grad_norm": 2.8099429607391357,
+ "learning_rate": 4.194194080464547e-06,
+ "loss": 0.4935,
+ "step": 3411
+ },
+ {
+ "epoch": 1.6132387706855793,
+ "grad_norm": 2.595628261566162,
+ "learning_rate": 4.193735290871446e-06,
+ "loss": 0.5571,
+ "step": 3412
+ },
+ {
+ "epoch": 1.6137115839243499,
+ "grad_norm": 2.7903778553009033,
+ "learning_rate": 4.193276395816876e-06,
+ "loss": 0.5228,
+ "step": 3413
+ },
+ {
+ "epoch": 1.6141843971631205,
+ "grad_norm": 2.83910870552063,
+ "learning_rate": 4.192817395329409e-06,
+ "loss": 0.6124,
+ "step": 3414
+ },
+ {
+ "epoch": 1.6146572104018913,
+ "grad_norm": 2.6155734062194824,
+ "learning_rate": 4.192358289437626e-06,
+ "loss": 0.552,
+ "step": 3415
+ },
+ {
+ "epoch": 1.615130023640662,
+ "grad_norm": 2.795832872390747,
+ "learning_rate": 4.191899078170113e-06,
+ "loss": 0.5561,
+ "step": 3416
+ },
+ {
+ "epoch": 1.6156028368794326,
+ "grad_norm": 2.3402161598205566,
+ "learning_rate": 4.191439761555464e-06,
+ "loss": 0.4889,
+ "step": 3417
+ },
+ {
+ "epoch": 1.6160756501182032,
+ "grad_norm": 3.1183433532714844,
+ "learning_rate": 4.190980339622276e-06,
+ "loss": 0.5337,
+ "step": 3418
+ },
+ {
+ "epoch": 1.616548463356974,
+ "grad_norm": 2.6262872219085693,
+ "learning_rate": 4.190520812399158e-06,
+ "loss": 0.525,
+ "step": 3419
+ },
+ {
+ "epoch": 1.6170212765957448,
+ "grad_norm": 2.578340530395508,
+ "learning_rate": 4.190061179914722e-06,
+ "loss": 0.4975,
+ "step": 3420
+ },
+ {
+ "epoch": 1.6174940898345154,
+ "grad_norm": 3.19482159614563,
+ "learning_rate": 4.189601442197586e-06,
+ "loss": 0.5832,
+ "step": 3421
+ },
+ {
+ "epoch": 1.617966903073286,
+ "grad_norm": 2.6398792266845703,
+ "learning_rate": 4.189141599276378e-06,
+ "loss": 0.4676,
+ "step": 3422
+ },
+ {
+ "epoch": 1.6184397163120567,
+ "grad_norm": 2.624865770339966,
+ "learning_rate": 4.1886816511797275e-06,
+ "loss": 0.4507,
+ "step": 3423
+ },
+ {
+ "epoch": 1.6189125295508275,
+ "grad_norm": 2.4136857986450195,
+ "learning_rate": 4.1882215979362775e-06,
+ "loss": 0.4616,
+ "step": 3424
+ },
+ {
+ "epoch": 1.6193853427895981,
+ "grad_norm": 2.6906614303588867,
+ "learning_rate": 4.18776143957467e-06,
+ "loss": 0.5142,
+ "step": 3425
+ },
+ {
+ "epoch": 1.6198581560283687,
+ "grad_norm": 2.5149154663085938,
+ "learning_rate": 4.187301176123558e-06,
+ "loss": 0.5252,
+ "step": 3426
+ },
+ {
+ "epoch": 1.6203309692671395,
+ "grad_norm": 2.677405834197998,
+ "learning_rate": 4.186840807611602e-06,
+ "loss": 0.4635,
+ "step": 3427
+ },
+ {
+ "epoch": 1.6208037825059103,
+ "grad_norm": 2.7164649963378906,
+ "learning_rate": 4.186380334067464e-06,
+ "loss": 0.5634,
+ "step": 3428
+ },
+ {
+ "epoch": 1.6212765957446809,
+ "grad_norm": 2.8299832344055176,
+ "learning_rate": 4.185919755519817e-06,
+ "loss": 0.5166,
+ "step": 3429
+ },
+ {
+ "epoch": 1.6217494089834514,
+ "grad_norm": 2.465848207473755,
+ "learning_rate": 4.18545907199734e-06,
+ "loss": 0.4696,
+ "step": 3430
+ },
+ {
+ "epoch": 1.6222222222222222,
+ "grad_norm": 2.407616376876831,
+ "learning_rate": 4.1849982835287175e-06,
+ "loss": 0.5111,
+ "step": 3431
+ },
+ {
+ "epoch": 1.622695035460993,
+ "grad_norm": 2.452146291732788,
+ "learning_rate": 4.184537390142639e-06,
+ "loss": 0.4574,
+ "step": 3432
+ },
+ {
+ "epoch": 1.6231678486997636,
+ "grad_norm": 2.653071165084839,
+ "learning_rate": 4.1840763918678055e-06,
+ "loss": 0.5611,
+ "step": 3433
+ },
+ {
+ "epoch": 1.6236406619385342,
+ "grad_norm": 2.5920350551605225,
+ "learning_rate": 4.183615288732919e-06,
+ "loss": 0.5437,
+ "step": 3434
+ },
+ {
+ "epoch": 1.624113475177305,
+ "grad_norm": 2.782900810241699,
+ "learning_rate": 4.18315408076669e-06,
+ "loss": 0.5824,
+ "step": 3435
+ },
+ {
+ "epoch": 1.6245862884160758,
+ "grad_norm": 2.8769774436950684,
+ "learning_rate": 4.1826927679978365e-06,
+ "loss": 0.5271,
+ "step": 3436
+ },
+ {
+ "epoch": 1.6250591016548463,
+ "grad_norm": 2.488598585128784,
+ "learning_rate": 4.182231350455084e-06,
+ "loss": 0.4684,
+ "step": 3437
+ },
+ {
+ "epoch": 1.625531914893617,
+ "grad_norm": 2.6472036838531494,
+ "learning_rate": 4.181769828167161e-06,
+ "loss": 0.5372,
+ "step": 3438
+ },
+ {
+ "epoch": 1.6260047281323877,
+ "grad_norm": 2.6498794555664062,
+ "learning_rate": 4.1813082011628045e-06,
+ "loss": 0.4805,
+ "step": 3439
+ },
+ {
+ "epoch": 1.6264775413711585,
+ "grad_norm": 2.5386533737182617,
+ "learning_rate": 4.1808464694707595e-06,
+ "loss": 0.5015,
+ "step": 3440
+ },
+ {
+ "epoch": 1.626950354609929,
+ "grad_norm": 2.8812551498413086,
+ "learning_rate": 4.180384633119775e-06,
+ "loss": 0.5225,
+ "step": 3441
+ },
+ {
+ "epoch": 1.6274231678486997,
+ "grad_norm": 2.870124578475952,
+ "learning_rate": 4.179922692138609e-06,
+ "loss": 0.537,
+ "step": 3442
+ },
+ {
+ "epoch": 1.6278959810874705,
+ "grad_norm": 2.5759785175323486,
+ "learning_rate": 4.179460646556021e-06,
+ "loss": 0.5142,
+ "step": 3443
+ },
+ {
+ "epoch": 1.6283687943262413,
+ "grad_norm": 2.629347324371338,
+ "learning_rate": 4.1789984964007836e-06,
+ "loss": 0.5007,
+ "step": 3444
+ },
+ {
+ "epoch": 1.6288416075650118,
+ "grad_norm": 2.751128673553467,
+ "learning_rate": 4.178536241701672e-06,
+ "loss": 0.5677,
+ "step": 3445
+ },
+ {
+ "epoch": 1.6293144208037824,
+ "grad_norm": 2.7582364082336426,
+ "learning_rate": 4.178073882487469e-06,
+ "loss": 0.499,
+ "step": 3446
+ },
+ {
+ "epoch": 1.6297872340425532,
+ "grad_norm": 3.136711359024048,
+ "learning_rate": 4.177611418786963e-06,
+ "loss": 0.5294,
+ "step": 3447
+ },
+ {
+ "epoch": 1.630260047281324,
+ "grad_norm": 2.7363100051879883,
+ "learning_rate": 4.17714885062895e-06,
+ "loss": 0.5264,
+ "step": 3448
+ },
+ {
+ "epoch": 1.6307328605200946,
+ "grad_norm": 2.7305946350097656,
+ "learning_rate": 4.176686178042233e-06,
+ "loss": 0.5235,
+ "step": 3449
+ },
+ {
+ "epoch": 1.6312056737588652,
+ "grad_norm": 2.6500556468963623,
+ "learning_rate": 4.176223401055619e-06,
+ "loss": 0.5463,
+ "step": 3450
+ },
+ {
+ "epoch": 1.631678486997636,
+ "grad_norm": 2.756321907043457,
+ "learning_rate": 4.175760519697924e-06,
+ "loss": 0.545,
+ "step": 3451
+ },
+ {
+ "epoch": 1.6321513002364068,
+ "grad_norm": 2.6234960556030273,
+ "learning_rate": 4.17529753399797e-06,
+ "loss": 0.4927,
+ "step": 3452
+ },
+ {
+ "epoch": 1.6326241134751773,
+ "grad_norm": 2.6358842849731445,
+ "learning_rate": 4.174834443984584e-06,
+ "loss": 0.5445,
+ "step": 3453
+ },
+ {
+ "epoch": 1.633096926713948,
+ "grad_norm": 2.541147470474243,
+ "learning_rate": 4.174371249686601e-06,
+ "loss": 0.4691,
+ "step": 3454
+ },
+ {
+ "epoch": 1.6335697399527187,
+ "grad_norm": 2.566981077194214,
+ "learning_rate": 4.173907951132863e-06,
+ "loss": 0.4932,
+ "step": 3455
+ },
+ {
+ "epoch": 1.6340425531914895,
+ "grad_norm": 2.670940399169922,
+ "learning_rate": 4.173444548352216e-06,
+ "loss": 0.4979,
+ "step": 3456
+ },
+ {
+ "epoch": 1.63451536643026,
+ "grad_norm": 2.5440268516540527,
+ "learning_rate": 4.172981041373515e-06,
+ "loss": 0.4716,
+ "step": 3457
+ },
+ {
+ "epoch": 1.6349881796690307,
+ "grad_norm": 2.3801631927490234,
+ "learning_rate": 4.17251743022562e-06,
+ "loss": 0.5126,
+ "step": 3458
+ },
+ {
+ "epoch": 1.6354609929078014,
+ "grad_norm": 2.5051121711730957,
+ "learning_rate": 4.1720537149373985e-06,
+ "loss": 0.4964,
+ "step": 3459
+ },
+ {
+ "epoch": 1.6359338061465722,
+ "grad_norm": 3.5521697998046875,
+ "learning_rate": 4.171589895537724e-06,
+ "loss": 0.5447,
+ "step": 3460
+ },
+ {
+ "epoch": 1.6364066193853428,
+ "grad_norm": 2.6041572093963623,
+ "learning_rate": 4.171125972055477e-06,
+ "loss": 0.4637,
+ "step": 3461
+ },
+ {
+ "epoch": 1.6368794326241134,
+ "grad_norm": 2.2297258377075195,
+ "learning_rate": 4.170661944519543e-06,
+ "loss": 0.4702,
+ "step": 3462
+ },
+ {
+ "epoch": 1.6373522458628842,
+ "grad_norm": 2.6764535903930664,
+ "learning_rate": 4.170197812958815e-06,
+ "loss": 0.5111,
+ "step": 3463
+ },
+ {
+ "epoch": 1.637825059101655,
+ "grad_norm": 2.86892032623291,
+ "learning_rate": 4.169733577402193e-06,
+ "loss": 0.5437,
+ "step": 3464
+ },
+ {
+ "epoch": 1.6382978723404256,
+ "grad_norm": 2.9007070064544678,
+ "learning_rate": 4.1692692378785825e-06,
+ "loss": 0.5425,
+ "step": 3465
+ },
+ {
+ "epoch": 1.6387706855791961,
+ "grad_norm": 2.5902905464172363,
+ "learning_rate": 4.168804794416896e-06,
+ "loss": 0.5252,
+ "step": 3466
+ },
+ {
+ "epoch": 1.639243498817967,
+ "grad_norm": 2.821183681488037,
+ "learning_rate": 4.168340247046053e-06,
+ "loss": 0.5265,
+ "step": 3467
+ },
+ {
+ "epoch": 1.6397163120567377,
+ "grad_norm": 2.7928314208984375,
+ "learning_rate": 4.167875595794978e-06,
+ "loss": 0.5151,
+ "step": 3468
+ },
+ {
+ "epoch": 1.6401891252955083,
+ "grad_norm": 2.3130412101745605,
+ "learning_rate": 4.167410840692603e-06,
+ "loss": 0.4941,
+ "step": 3469
+ },
+ {
+ "epoch": 1.6406619385342789,
+ "grad_norm": 2.6078619956970215,
+ "learning_rate": 4.1669459817678655e-06,
+ "loss": 0.493,
+ "step": 3470
+ },
+ {
+ "epoch": 1.6411347517730497,
+ "grad_norm": 2.5335731506347656,
+ "learning_rate": 4.166481019049712e-06,
+ "loss": 0.4969,
+ "step": 3471
+ },
+ {
+ "epoch": 1.6416075650118205,
+ "grad_norm": 2.8181469440460205,
+ "learning_rate": 4.166015952567093e-06,
+ "loss": 0.5062,
+ "step": 3472
+ },
+ {
+ "epoch": 1.642080378250591,
+ "grad_norm": 2.7256782054901123,
+ "learning_rate": 4.165550782348966e-06,
+ "loss": 0.5397,
+ "step": 3473
+ },
+ {
+ "epoch": 1.6425531914893616,
+ "grad_norm": 2.284345865249634,
+ "learning_rate": 4.1650855084242946e-06,
+ "loss": 0.4448,
+ "step": 3474
+ },
+ {
+ "epoch": 1.6430260047281324,
+ "grad_norm": 3.0383145809173584,
+ "learning_rate": 4.164620130822049e-06,
+ "loss": 0.5873,
+ "step": 3475
+ },
+ {
+ "epoch": 1.6434988179669032,
+ "grad_norm": 2.754448652267456,
+ "learning_rate": 4.1641546495712085e-06,
+ "loss": 0.4852,
+ "step": 3476
+ },
+ {
+ "epoch": 1.6439716312056738,
+ "grad_norm": 2.6820101737976074,
+ "learning_rate": 4.1636890647007535e-06,
+ "loss": 0.5325,
+ "step": 3477
+ },
+ {
+ "epoch": 1.6444444444444444,
+ "grad_norm": 2.6396398544311523,
+ "learning_rate": 4.163223376239676e-06,
+ "loss": 0.466,
+ "step": 3478
+ },
+ {
+ "epoch": 1.6449172576832152,
+ "grad_norm": 2.395049810409546,
+ "learning_rate": 4.162757584216972e-06,
+ "loss": 0.4531,
+ "step": 3479
+ },
+ {
+ "epoch": 1.645390070921986,
+ "grad_norm": 2.596670627593994,
+ "learning_rate": 4.162291688661645e-06,
+ "loss": 0.5207,
+ "step": 3480
+ },
+ {
+ "epoch": 1.6458628841607565,
+ "grad_norm": 2.4391872882843018,
+ "learning_rate": 4.161825689602703e-06,
+ "loss": 0.5133,
+ "step": 3481
+ },
+ {
+ "epoch": 1.6463356973995271,
+ "grad_norm": 2.6169841289520264,
+ "learning_rate": 4.161359587069162e-06,
+ "loss": 0.5096,
+ "step": 3482
+ },
+ {
+ "epoch": 1.646808510638298,
+ "grad_norm": 2.634089946746826,
+ "learning_rate": 4.1608933810900445e-06,
+ "loss": 0.4921,
+ "step": 3483
+ },
+ {
+ "epoch": 1.6472813238770687,
+ "grad_norm": 2.815877914428711,
+ "learning_rate": 4.160427071694379e-06,
+ "loss": 0.5045,
+ "step": 3484
+ },
+ {
+ "epoch": 1.6477541371158393,
+ "grad_norm": 2.417525053024292,
+ "learning_rate": 4.159960658911199e-06,
+ "loss": 0.4997,
+ "step": 3485
+ },
+ {
+ "epoch": 1.6482269503546099,
+ "grad_norm": 2.5713605880737305,
+ "learning_rate": 4.15949414276955e-06,
+ "loss": 0.5246,
+ "step": 3486
+ },
+ {
+ "epoch": 1.6486997635933807,
+ "grad_norm": 3.49833607673645,
+ "learning_rate": 4.159027523298475e-06,
+ "loss": 0.4901,
+ "step": 3487
+ },
+ {
+ "epoch": 1.6491725768321515,
+ "grad_norm": 2.985464334487915,
+ "learning_rate": 4.158560800527033e-06,
+ "loss": 0.5726,
+ "step": 3488
+ },
+ {
+ "epoch": 1.649645390070922,
+ "grad_norm": 2.72745680809021,
+ "learning_rate": 4.158093974484282e-06,
+ "loss": 0.5119,
+ "step": 3489
+ },
+ {
+ "epoch": 1.6501182033096926,
+ "grad_norm": 2.4885571002960205,
+ "learning_rate": 4.157627045199289e-06,
+ "loss": 0.4838,
+ "step": 3490
+ },
+ {
+ "epoch": 1.6505910165484634,
+ "grad_norm": 2.7622628211975098,
+ "learning_rate": 4.157160012701128e-06,
+ "loss": 0.5269,
+ "step": 3491
+ },
+ {
+ "epoch": 1.6510638297872342,
+ "grad_norm": 2.615122079849243,
+ "learning_rate": 4.156692877018879e-06,
+ "loss": 0.5501,
+ "step": 3492
+ },
+ {
+ "epoch": 1.6515366430260048,
+ "grad_norm": 2.827753782272339,
+ "learning_rate": 4.156225638181631e-06,
+ "loss": 0.5452,
+ "step": 3493
+ },
+ {
+ "epoch": 1.6520094562647754,
+ "grad_norm": 2.724820137023926,
+ "learning_rate": 4.155758296218474e-06,
+ "loss": 0.5155,
+ "step": 3494
+ },
+ {
+ "epoch": 1.6524822695035462,
+ "grad_norm": 2.5806174278259277,
+ "learning_rate": 4.155290851158508e-06,
+ "loss": 0.5292,
+ "step": 3495
+ },
+ {
+ "epoch": 1.652955082742317,
+ "grad_norm": 2.5655179023742676,
+ "learning_rate": 4.154823303030838e-06,
+ "loss": 0.4959,
+ "step": 3496
+ },
+ {
+ "epoch": 1.6534278959810875,
+ "grad_norm": 2.656548261642456,
+ "learning_rate": 4.154355651864579e-06,
+ "loss": 0.5703,
+ "step": 3497
+ },
+ {
+ "epoch": 1.653900709219858,
+ "grad_norm": 2.9085004329681396,
+ "learning_rate": 4.153887897688847e-06,
+ "loss": 0.5061,
+ "step": 3498
+ },
+ {
+ "epoch": 1.654373522458629,
+ "grad_norm": 2.608010768890381,
+ "learning_rate": 4.1534200405327665e-06,
+ "loss": 0.5165,
+ "step": 3499
+ },
+ {
+ "epoch": 1.6548463356973995,
+ "grad_norm": 2.600463628768921,
+ "learning_rate": 4.152952080425471e-06,
+ "loss": 0.4946,
+ "step": 3500
+ },
+ {
+ "epoch": 1.65531914893617,
+ "grad_norm": 2.5561563968658447,
+ "learning_rate": 4.152484017396098e-06,
+ "loss": 0.4804,
+ "step": 3501
+ },
+ {
+ "epoch": 1.6557919621749408,
+ "grad_norm": 2.788594961166382,
+ "learning_rate": 4.152015851473791e-06,
+ "loss": 0.5635,
+ "step": 3502
+ },
+ {
+ "epoch": 1.6562647754137116,
+ "grad_norm": 2.693302631378174,
+ "learning_rate": 4.151547582687699e-06,
+ "loss": 0.5139,
+ "step": 3503
+ },
+ {
+ "epoch": 1.6567375886524822,
+ "grad_norm": 2.7887485027313232,
+ "learning_rate": 4.1510792110669825e-06,
+ "loss": 0.4952,
+ "step": 3504
+ },
+ {
+ "epoch": 1.6572104018912528,
+ "grad_norm": 2.8982298374176025,
+ "learning_rate": 4.150610736640803e-06,
+ "loss": 0.4136,
+ "step": 3505
+ },
+ {
+ "epoch": 1.6576832151300236,
+ "grad_norm": 2.7569408416748047,
+ "learning_rate": 4.150142159438331e-06,
+ "loss": 0.5272,
+ "step": 3506
+ },
+ {
+ "epoch": 1.6581560283687944,
+ "grad_norm": 2.531648874282837,
+ "learning_rate": 4.149673479488742e-06,
+ "loss": 0.5016,
+ "step": 3507
+ },
+ {
+ "epoch": 1.658628841607565,
+ "grad_norm": 2.7706353664398193,
+ "learning_rate": 4.149204696821219e-06,
+ "loss": 0.5512,
+ "step": 3508
+ },
+ {
+ "epoch": 1.6591016548463355,
+ "grad_norm": 2.7307450771331787,
+ "learning_rate": 4.148735811464951e-06,
+ "loss": 0.4968,
+ "step": 3509
+ },
+ {
+ "epoch": 1.6595744680851063,
+ "grad_norm": 3.0097429752349854,
+ "learning_rate": 4.1482668234491335e-06,
+ "loss": 0.4797,
+ "step": 3510
+ },
+ {
+ "epoch": 1.6600472813238771,
+ "grad_norm": 2.6045308113098145,
+ "learning_rate": 4.147797732802969e-06,
+ "loss": 0.5496,
+ "step": 3511
+ },
+ {
+ "epoch": 1.6605200945626477,
+ "grad_norm": 2.702061176300049,
+ "learning_rate": 4.147328539555664e-06,
+ "loss": 0.5302,
+ "step": 3512
+ },
+ {
+ "epoch": 1.6609929078014183,
+ "grad_norm": 3.3724892139434814,
+ "learning_rate": 4.1468592437364356e-06,
+ "loss": 0.5124,
+ "step": 3513
+ },
+ {
+ "epoch": 1.661465721040189,
+ "grad_norm": 2.5117242336273193,
+ "learning_rate": 4.146389845374502e-06,
+ "loss": 0.4953,
+ "step": 3514
+ },
+ {
+ "epoch": 1.6619385342789599,
+ "grad_norm": 2.86547589302063,
+ "learning_rate": 4.145920344499092e-06,
+ "loss": 0.5337,
+ "step": 3515
+ },
+ {
+ "epoch": 1.6624113475177305,
+ "grad_norm": 2.745149850845337,
+ "learning_rate": 4.14545074113944e-06,
+ "loss": 0.5187,
+ "step": 3516
+ },
+ {
+ "epoch": 1.662884160756501,
+ "grad_norm": 2.5560994148254395,
+ "learning_rate": 4.1449810353247855e-06,
+ "loss": 0.5183,
+ "step": 3517
+ },
+ {
+ "epoch": 1.6633569739952718,
+ "grad_norm": 2.2318122386932373,
+ "learning_rate": 4.144511227084374e-06,
+ "loss": 0.4452,
+ "step": 3518
+ },
+ {
+ "epoch": 1.6638297872340426,
+ "grad_norm": 2.6980903148651123,
+ "learning_rate": 4.14404131644746e-06,
+ "loss": 0.4974,
+ "step": 3519
+ },
+ {
+ "epoch": 1.6643026004728132,
+ "grad_norm": 2.6875357627868652,
+ "learning_rate": 4.1435713034433025e-06,
+ "loss": 0.4582,
+ "step": 3520
+ },
+ {
+ "epoch": 1.6647754137115838,
+ "grad_norm": 2.9430019855499268,
+ "learning_rate": 4.143101188101166e-06,
+ "loss": 0.5004,
+ "step": 3521
+ },
+ {
+ "epoch": 1.6652482269503546,
+ "grad_norm": 2.4447221755981445,
+ "learning_rate": 4.142630970450323e-06,
+ "loss": 0.5436,
+ "step": 3522
+ },
+ {
+ "epoch": 1.6657210401891254,
+ "grad_norm": 2.571023941040039,
+ "learning_rate": 4.142160650520053e-06,
+ "loss": 0.5307,
+ "step": 3523
+ },
+ {
+ "epoch": 1.666193853427896,
+ "grad_norm": 2.9725306034088135,
+ "learning_rate": 4.14169022833964e-06,
+ "loss": 0.5918,
+ "step": 3524
+ },
+ {
+ "epoch": 1.6666666666666665,
+ "grad_norm": 2.5958926677703857,
+ "learning_rate": 4.141219703938375e-06,
+ "loss": 0.5036,
+ "step": 3525
+ },
+ {
+ "epoch": 1.6671394799054373,
+ "grad_norm": 2.935788631439209,
+ "learning_rate": 4.140749077345556e-06,
+ "loss": 0.5773,
+ "step": 3526
+ },
+ {
+ "epoch": 1.6676122931442081,
+ "grad_norm": 2.5460526943206787,
+ "learning_rate": 4.140278348590485e-06,
+ "loss": 0.4762,
+ "step": 3527
+ },
+ {
+ "epoch": 1.6680851063829787,
+ "grad_norm": 2.5729143619537354,
+ "learning_rate": 4.139807517702475e-06,
+ "loss": 0.5515,
+ "step": 3528
+ },
+ {
+ "epoch": 1.6685579196217493,
+ "grad_norm": 2.4377381801605225,
+ "learning_rate": 4.13933658471084e-06,
+ "loss": 0.5383,
+ "step": 3529
+ },
+ {
+ "epoch": 1.66903073286052,
+ "grad_norm": 2.6284425258636475,
+ "learning_rate": 4.138865549644905e-06,
+ "loss": 0.5396,
+ "step": 3530
+ },
+ {
+ "epoch": 1.6695035460992909,
+ "grad_norm": 2.857250928878784,
+ "learning_rate": 4.138394412533998e-06,
+ "loss": 0.5861,
+ "step": 3531
+ },
+ {
+ "epoch": 1.6699763593380614,
+ "grad_norm": 2.9226012229919434,
+ "learning_rate": 4.137923173407456e-06,
+ "loss": 0.5262,
+ "step": 3532
+ },
+ {
+ "epoch": 1.670449172576832,
+ "grad_norm": 4.839131832122803,
+ "learning_rate": 4.137451832294619e-06,
+ "loss": 0.651,
+ "step": 3533
+ },
+ {
+ "epoch": 1.6709219858156028,
+ "grad_norm": 2.4727771282196045,
+ "learning_rate": 4.1369803892248375e-06,
+ "loss": 0.5149,
+ "step": 3534
+ },
+ {
+ "epoch": 1.6713947990543736,
+ "grad_norm": 2.5391688346862793,
+ "learning_rate": 4.1365088442274635e-06,
+ "loss": 0.4907,
+ "step": 3535
+ },
+ {
+ "epoch": 1.6718676122931442,
+ "grad_norm": 2.5168209075927734,
+ "learning_rate": 4.136037197331862e-06,
+ "loss": 0.5091,
+ "step": 3536
+ },
+ {
+ "epoch": 1.6723404255319148,
+ "grad_norm": 2.6278600692749023,
+ "learning_rate": 4.135565448567396e-06,
+ "loss": 0.4357,
+ "step": 3537
+ },
+ {
+ "epoch": 1.6728132387706856,
+ "grad_norm": 2.835184097290039,
+ "learning_rate": 4.135093597963441e-06,
+ "loss": 0.4786,
+ "step": 3538
+ },
+ {
+ "epoch": 1.6732860520094563,
+ "grad_norm": 2.385328531265259,
+ "learning_rate": 4.134621645549379e-06,
+ "loss": 0.4849,
+ "step": 3539
+ },
+ {
+ "epoch": 1.673758865248227,
+ "grad_norm": 2.6504149436950684,
+ "learning_rate": 4.134149591354593e-06,
+ "loss": 0.6037,
+ "step": 3540
+ },
+ {
+ "epoch": 1.6742316784869975,
+ "grad_norm": 2.945634126663208,
+ "learning_rate": 4.1336774354084786e-06,
+ "loss": 0.532,
+ "step": 3541
+ },
+ {
+ "epoch": 1.6747044917257683,
+ "grad_norm": 2.8373215198516846,
+ "learning_rate": 4.133205177740434e-06,
+ "loss": 0.5138,
+ "step": 3542
+ },
+ {
+ "epoch": 1.675177304964539,
+ "grad_norm": 2.6616621017456055,
+ "learning_rate": 4.1327328183798634e-06,
+ "loss": 0.5543,
+ "step": 3543
+ },
+ {
+ "epoch": 1.6756501182033097,
+ "grad_norm": 3.0843071937561035,
+ "learning_rate": 4.13226035735618e-06,
+ "loss": 0.6585,
+ "step": 3544
+ },
+ {
+ "epoch": 1.6761229314420802,
+ "grad_norm": 2.2214272022247314,
+ "learning_rate": 4.131787794698802e-06,
+ "loss": 0.5413,
+ "step": 3545
+ },
+ {
+ "epoch": 1.676595744680851,
+ "grad_norm": 2.4515018463134766,
+ "learning_rate": 4.131315130437152e-06,
+ "loss": 0.4966,
+ "step": 3546
+ },
+ {
+ "epoch": 1.6770685579196218,
+ "grad_norm": 2.647414207458496,
+ "learning_rate": 4.130842364600663e-06,
+ "loss": 0.5401,
+ "step": 3547
+ },
+ {
+ "epoch": 1.6775413711583924,
+ "grad_norm": 2.648941993713379,
+ "learning_rate": 4.13036949721877e-06,
+ "loss": 0.4796,
+ "step": 3548
+ },
+ {
+ "epoch": 1.678014184397163,
+ "grad_norm": 2.7835679054260254,
+ "learning_rate": 4.129896528320919e-06,
+ "loss": 0.5653,
+ "step": 3549
+ },
+ {
+ "epoch": 1.6784869976359338,
+ "grad_norm": 2.995964288711548,
+ "learning_rate": 4.129423457936556e-06,
+ "loss": 0.4999,
+ "step": 3550
+ },
+ {
+ "epoch": 1.6789598108747046,
+ "grad_norm": 2.5980007648468018,
+ "learning_rate": 4.1289502860951405e-06,
+ "loss": 0.5177,
+ "step": 3551
+ },
+ {
+ "epoch": 1.6794326241134752,
+ "grad_norm": 2.442254066467285,
+ "learning_rate": 4.128477012826133e-06,
+ "loss": 0.5062,
+ "step": 3552
+ },
+ {
+ "epoch": 1.6799054373522457,
+ "grad_norm": 2.3007538318634033,
+ "learning_rate": 4.1280036381590025e-06,
+ "loss": 0.5029,
+ "step": 3553
+ },
+ {
+ "epoch": 1.6803782505910165,
+ "grad_norm": 2.4169347286224365,
+ "learning_rate": 4.1275301621232245e-06,
+ "loss": 0.515,
+ "step": 3554
+ },
+ {
+ "epoch": 1.6808510638297873,
+ "grad_norm": 2.6456379890441895,
+ "learning_rate": 4.127056584748279e-06,
+ "loss": 0.5343,
+ "step": 3555
+ },
+ {
+ "epoch": 1.681323877068558,
+ "grad_norm": 2.6406595706939697,
+ "learning_rate": 4.1265829060636546e-06,
+ "loss": 0.5047,
+ "step": 3556
+ },
+ {
+ "epoch": 1.6817966903073285,
+ "grad_norm": 2.9344475269317627,
+ "learning_rate": 4.126109126098846e-06,
+ "loss": 0.5501,
+ "step": 3557
+ },
+ {
+ "epoch": 1.6822695035460993,
+ "grad_norm": 2.3292455673217773,
+ "learning_rate": 4.125635244883351e-06,
+ "loss": 0.463,
+ "step": 3558
+ },
+ {
+ "epoch": 1.68274231678487,
+ "grad_norm": 2.4150657653808594,
+ "learning_rate": 4.125161262446677e-06,
+ "loss": 0.4802,
+ "step": 3559
+ },
+ {
+ "epoch": 1.6832151300236406,
+ "grad_norm": 2.604292392730713,
+ "learning_rate": 4.124687178818339e-06,
+ "loss": 0.5683,
+ "step": 3560
+ },
+ {
+ "epoch": 1.6836879432624112,
+ "grad_norm": 2.5676791667938232,
+ "learning_rate": 4.1242129940278544e-06,
+ "loss": 0.5519,
+ "step": 3561
+ },
+ {
+ "epoch": 1.684160756501182,
+ "grad_norm": 3.078514814376831,
+ "learning_rate": 4.123738708104748e-06,
+ "loss": 0.5194,
+ "step": 3562
+ },
+ {
+ "epoch": 1.6846335697399528,
+ "grad_norm": 2.893577814102173,
+ "learning_rate": 4.123264321078552e-06,
+ "loss": 0.5107,
+ "step": 3563
+ },
+ {
+ "epoch": 1.6851063829787234,
+ "grad_norm": 2.772413730621338,
+ "learning_rate": 4.122789832978804e-06,
+ "loss": 0.6147,
+ "step": 3564
+ },
+ {
+ "epoch": 1.685579196217494,
+ "grad_norm": 2.5804643630981445,
+ "learning_rate": 4.12231524383505e-06,
+ "loss": 0.5057,
+ "step": 3565
+ },
+ {
+ "epoch": 1.6860520094562648,
+ "grad_norm": 2.599571466445923,
+ "learning_rate": 4.121840553676839e-06,
+ "loss": 0.5591,
+ "step": 3566
+ },
+ {
+ "epoch": 1.6865248226950356,
+ "grad_norm": 2.9124577045440674,
+ "learning_rate": 4.1213657625337275e-06,
+ "loss": 0.565,
+ "step": 3567
+ },
+ {
+ "epoch": 1.6869976359338061,
+ "grad_norm": 2.6582155227661133,
+ "learning_rate": 4.120890870435281e-06,
+ "loss": 0.4607,
+ "step": 3568
+ },
+ {
+ "epoch": 1.6874704491725767,
+ "grad_norm": 2.929227590560913,
+ "learning_rate": 4.120415877411066e-06,
+ "loss": 0.5705,
+ "step": 3569
+ },
+ {
+ "epoch": 1.6879432624113475,
+ "grad_norm": 2.4443247318267822,
+ "learning_rate": 4.11994078349066e-06,
+ "loss": 0.4592,
+ "step": 3570
+ },
+ {
+ "epoch": 1.6884160756501183,
+ "grad_norm": 2.4799163341522217,
+ "learning_rate": 4.119465588703645e-06,
+ "loss": 0.5361,
+ "step": 3571
+ },
+ {
+ "epoch": 1.6888888888888889,
+ "grad_norm": 2.9408936500549316,
+ "learning_rate": 4.1189902930796085e-06,
+ "loss": 0.5347,
+ "step": 3572
+ },
+ {
+ "epoch": 1.6893617021276595,
+ "grad_norm": 3.3348076343536377,
+ "learning_rate": 4.118514896648146e-06,
+ "loss": 0.5612,
+ "step": 3573
+ },
+ {
+ "epoch": 1.6898345153664303,
+ "grad_norm": 2.764889717102051,
+ "learning_rate": 4.118039399438857e-06,
+ "loss": 0.4745,
+ "step": 3574
+ },
+ {
+ "epoch": 1.690307328605201,
+ "grad_norm": 2.7023751735687256,
+ "learning_rate": 4.11756380148135e-06,
+ "loss": 0.5106,
+ "step": 3575
+ },
+ {
+ "epoch": 1.6907801418439716,
+ "grad_norm": 2.8816208839416504,
+ "learning_rate": 4.117088102805238e-06,
+ "loss": 0.6016,
+ "step": 3576
+ },
+ {
+ "epoch": 1.6912529550827422,
+ "grad_norm": 2.215733289718628,
+ "learning_rate": 4.11661230344014e-06,
+ "loss": 0.4404,
+ "step": 3577
+ },
+ {
+ "epoch": 1.691725768321513,
+ "grad_norm": 2.8190999031066895,
+ "learning_rate": 4.116136403415683e-06,
+ "loss": 0.5038,
+ "step": 3578
+ },
+ {
+ "epoch": 1.6921985815602838,
+ "grad_norm": 2.616424083709717,
+ "learning_rate": 4.115660402761499e-06,
+ "loss": 0.5493,
+ "step": 3579
+ },
+ {
+ "epoch": 1.6926713947990544,
+ "grad_norm": 2.7738113403320312,
+ "learning_rate": 4.115184301507226e-06,
+ "loss": 0.5416,
+ "step": 3580
+ },
+ {
+ "epoch": 1.693144208037825,
+ "grad_norm": 2.4793593883514404,
+ "learning_rate": 4.114708099682509e-06,
+ "loss": 0.4526,
+ "step": 3581
+ },
+ {
+ "epoch": 1.6936170212765957,
+ "grad_norm": 2.390652894973755,
+ "learning_rate": 4.114231797316999e-06,
+ "loss": 0.4908,
+ "step": 3582
+ },
+ {
+ "epoch": 1.6940898345153665,
+ "grad_norm": 2.513197660446167,
+ "learning_rate": 4.113755394440352e-06,
+ "loss": 0.4738,
+ "step": 3583
+ },
+ {
+ "epoch": 1.6945626477541371,
+ "grad_norm": 2.504497766494751,
+ "learning_rate": 4.113278891082234e-06,
+ "loss": 0.4661,
+ "step": 3584
+ },
+ {
+ "epoch": 1.6950354609929077,
+ "grad_norm": 2.4966917037963867,
+ "learning_rate": 4.112802287272314e-06,
+ "loss": 0.4979,
+ "step": 3585
+ },
+ {
+ "epoch": 1.6955082742316785,
+ "grad_norm": 2.3129689693450928,
+ "learning_rate": 4.112325583040265e-06,
+ "loss": 0.4933,
+ "step": 3586
+ },
+ {
+ "epoch": 1.6959810874704493,
+ "grad_norm": 2.822136878967285,
+ "learning_rate": 4.111848778415774e-06,
+ "loss": 0.5087,
+ "step": 3587
+ },
+ {
+ "epoch": 1.6964539007092199,
+ "grad_norm": 2.5181210041046143,
+ "learning_rate": 4.111371873428527e-06,
+ "loss": 0.4836,
+ "step": 3588
+ },
+ {
+ "epoch": 1.6969267139479904,
+ "grad_norm": 2.7564687728881836,
+ "learning_rate": 4.110894868108218e-06,
+ "loss": 0.5224,
+ "step": 3589
+ },
+ {
+ "epoch": 1.6973995271867612,
+ "grad_norm": 2.424421787261963,
+ "learning_rate": 4.11041776248455e-06,
+ "loss": 0.4552,
+ "step": 3590
+ },
+ {
+ "epoch": 1.697872340425532,
+ "grad_norm": 2.7013823986053467,
+ "learning_rate": 4.10994055658723e-06,
+ "loss": 0.5535,
+ "step": 3591
+ },
+ {
+ "epoch": 1.6983451536643026,
+ "grad_norm": 2.5660946369171143,
+ "learning_rate": 4.10946325044597e-06,
+ "loss": 0.5351,
+ "step": 3592
+ },
+ {
+ "epoch": 1.6988179669030732,
+ "grad_norm": 2.5598108768463135,
+ "learning_rate": 4.10898584409049e-06,
+ "loss": 0.5246,
+ "step": 3593
+ },
+ {
+ "epoch": 1.699290780141844,
+ "grad_norm": 2.6318907737731934,
+ "learning_rate": 4.108508337550518e-06,
+ "loss": 0.5002,
+ "step": 3594
+ },
+ {
+ "epoch": 1.6997635933806148,
+ "grad_norm": 2.527099132537842,
+ "learning_rate": 4.108030730855784e-06,
+ "loss": 0.5366,
+ "step": 3595
+ },
+ {
+ "epoch": 1.7002364066193854,
+ "grad_norm": 2.8629603385925293,
+ "learning_rate": 4.107553024036029e-06,
+ "loss": 0.5742,
+ "step": 3596
+ },
+ {
+ "epoch": 1.700709219858156,
+ "grad_norm": 2.8084018230438232,
+ "learning_rate": 4.107075217120994e-06,
+ "loss": 0.5618,
+ "step": 3597
+ },
+ {
+ "epoch": 1.7011820330969267,
+ "grad_norm": 3.6470065116882324,
+ "learning_rate": 4.1065973101404325e-06,
+ "loss": 0.508,
+ "step": 3598
+ },
+ {
+ "epoch": 1.7016548463356975,
+ "grad_norm": 3.0332422256469727,
+ "learning_rate": 4.106119303124102e-06,
+ "loss": 0.51,
+ "step": 3599
+ },
+ {
+ "epoch": 1.702127659574468,
+ "grad_norm": 2.4887590408325195,
+ "learning_rate": 4.105641196101765e-06,
+ "loss": 0.5109,
+ "step": 3600
+ },
+ {
+ "epoch": 1.7026004728132387,
+ "grad_norm": 2.6102066040039062,
+ "learning_rate": 4.105162989103191e-06,
+ "loss": 0.5278,
+ "step": 3601
+ },
+ {
+ "epoch": 1.7030732860520095,
+ "grad_norm": 2.771578073501587,
+ "learning_rate": 4.104684682158156e-06,
+ "loss": 0.498,
+ "step": 3602
+ },
+ {
+ "epoch": 1.7035460992907803,
+ "grad_norm": 2.5452702045440674,
+ "learning_rate": 4.1042062752964425e-06,
+ "loss": 0.4939,
+ "step": 3603
+ },
+ {
+ "epoch": 1.7040189125295508,
+ "grad_norm": 2.4287021160125732,
+ "learning_rate": 4.103727768547838e-06,
+ "loss": 0.4819,
+ "step": 3604
+ },
+ {
+ "epoch": 1.7044917257683214,
+ "grad_norm": 2.412280321121216,
+ "learning_rate": 4.103249161942138e-06,
+ "loss": 0.5196,
+ "step": 3605
+ },
+ {
+ "epoch": 1.7049645390070922,
+ "grad_norm": 2.8850717544555664,
+ "learning_rate": 4.102770455509142e-06,
+ "loss": 0.5724,
+ "step": 3606
+ },
+ {
+ "epoch": 1.705437352245863,
+ "grad_norm": 2.7979609966278076,
+ "learning_rate": 4.102291649278659e-06,
+ "loss": 0.5295,
+ "step": 3607
+ },
+ {
+ "epoch": 1.7059101654846336,
+ "grad_norm": 2.762238025665283,
+ "learning_rate": 4.1018127432805e-06,
+ "loss": 0.5166,
+ "step": 3608
+ },
+ {
+ "epoch": 1.7063829787234042,
+ "grad_norm": 2.921586513519287,
+ "learning_rate": 4.101333737544485e-06,
+ "loss": 0.5607,
+ "step": 3609
+ },
+ {
+ "epoch": 1.706855791962175,
+ "grad_norm": 3.001929998397827,
+ "learning_rate": 4.100854632100439e-06,
+ "loss": 0.6255,
+ "step": 3610
+ },
+ {
+ "epoch": 1.7073286052009458,
+ "grad_norm": 2.752713918685913,
+ "learning_rate": 4.100375426978196e-06,
+ "loss": 0.5732,
+ "step": 3611
+ },
+ {
+ "epoch": 1.7078014184397163,
+ "grad_norm": 2.6496472358703613,
+ "learning_rate": 4.099896122207593e-06,
+ "loss": 0.5138,
+ "step": 3612
+ },
+ {
+ "epoch": 1.708274231678487,
+ "grad_norm": 3.0079452991485596,
+ "learning_rate": 4.099416717818473e-06,
+ "loss": 0.5746,
+ "step": 3613
+ },
+ {
+ "epoch": 1.7087470449172577,
+ "grad_norm": 2.5762360095977783,
+ "learning_rate": 4.098937213840687e-06,
+ "loss": 0.5308,
+ "step": 3614
+ },
+ {
+ "epoch": 1.7092198581560285,
+ "grad_norm": 2.6026158332824707,
+ "learning_rate": 4.098457610304092e-06,
+ "loss": 0.4857,
+ "step": 3615
+ },
+ {
+ "epoch": 1.709692671394799,
+ "grad_norm": 2.587583541870117,
+ "learning_rate": 4.097977907238551e-06,
+ "loss": 0.4591,
+ "step": 3616
+ },
+ {
+ "epoch": 1.7101654846335697,
+ "grad_norm": 2.6996991634368896,
+ "learning_rate": 4.097498104673932e-06,
+ "loss": 0.5298,
+ "step": 3617
+ },
+ {
+ "epoch": 1.7106382978723405,
+ "grad_norm": 2.600029945373535,
+ "learning_rate": 4.097018202640111e-06,
+ "loss": 0.4726,
+ "step": 3618
+ },
+ {
+ "epoch": 1.7111111111111112,
+ "grad_norm": 2.8261220455169678,
+ "learning_rate": 4.096538201166969e-06,
+ "loss": 0.5242,
+ "step": 3619
+ },
+ {
+ "epoch": 1.7115839243498818,
+ "grad_norm": 3.053027629852295,
+ "learning_rate": 4.096058100284394e-06,
+ "loss": 0.5568,
+ "step": 3620
+ },
+ {
+ "epoch": 1.7120567375886524,
+ "grad_norm": 2.9638442993164062,
+ "learning_rate": 4.0955779000222805e-06,
+ "loss": 0.5325,
+ "step": 3621
+ },
+ {
+ "epoch": 1.7125295508274232,
+ "grad_norm": 2.731095790863037,
+ "learning_rate": 4.095097600410527e-06,
+ "loss": 0.4733,
+ "step": 3622
+ },
+ {
+ "epoch": 1.713002364066194,
+ "grad_norm": 2.632490873336792,
+ "learning_rate": 4.09461720147904e-06,
+ "loss": 0.5253,
+ "step": 3623
+ },
+ {
+ "epoch": 1.7134751773049646,
+ "grad_norm": 2.847689390182495,
+ "learning_rate": 4.094136703257732e-06,
+ "loss": 0.57,
+ "step": 3624
+ },
+ {
+ "epoch": 1.7139479905437351,
+ "grad_norm": 3.1078696250915527,
+ "learning_rate": 4.0936561057765215e-06,
+ "loss": 0.5368,
+ "step": 3625
+ },
+ {
+ "epoch": 1.714420803782506,
+ "grad_norm": 2.696349620819092,
+ "learning_rate": 4.0931754090653334e-06,
+ "loss": 0.491,
+ "step": 3626
+ },
+ {
+ "epoch": 1.7148936170212767,
+ "grad_norm": 2.712958812713623,
+ "learning_rate": 4.092694613154099e-06,
+ "loss": 0.5768,
+ "step": 3627
+ },
+ {
+ "epoch": 1.7153664302600473,
+ "grad_norm": 2.5421478748321533,
+ "learning_rate": 4.092213718072754e-06,
+ "loss": 0.4839,
+ "step": 3628
+ },
+ {
+ "epoch": 1.715839243498818,
+ "grad_norm": 2.5176162719726562,
+ "learning_rate": 4.091732723851243e-06,
+ "loss": 0.5049,
+ "step": 3629
+ },
+ {
+ "epoch": 1.7163120567375887,
+ "grad_norm": 2.642185926437378,
+ "learning_rate": 4.091251630519514e-06,
+ "loss": 0.589,
+ "step": 3630
+ },
+ {
+ "epoch": 1.7167848699763595,
+ "grad_norm": 2.587348461151123,
+ "learning_rate": 4.0907704381075245e-06,
+ "loss": 0.5281,
+ "step": 3631
+ },
+ {
+ "epoch": 1.71725768321513,
+ "grad_norm": 2.4628195762634277,
+ "learning_rate": 4.090289146645234e-06,
+ "loss": 0.5592,
+ "step": 3632
+ },
+ {
+ "epoch": 1.7177304964539006,
+ "grad_norm": 2.2751028537750244,
+ "learning_rate": 4.0898077561626125e-06,
+ "loss": 0.502,
+ "step": 3633
+ },
+ {
+ "epoch": 1.7182033096926714,
+ "grad_norm": 2.7712769508361816,
+ "learning_rate": 4.089326266689632e-06,
+ "loss": 0.5143,
+ "step": 3634
+ },
+ {
+ "epoch": 1.7186761229314422,
+ "grad_norm": 2.5297727584838867,
+ "learning_rate": 4.088844678256275e-06,
+ "loss": 0.5035,
+ "step": 3635
+ },
+ {
+ "epoch": 1.7191489361702128,
+ "grad_norm": 2.739130735397339,
+ "learning_rate": 4.088362990892527e-06,
+ "loss": 0.5959,
+ "step": 3636
+ },
+ {
+ "epoch": 1.7196217494089834,
+ "grad_norm": 2.3708314895629883,
+ "learning_rate": 4.08788120462838e-06,
+ "loss": 0.4796,
+ "step": 3637
+ },
+ {
+ "epoch": 1.7200945626477542,
+ "grad_norm": 2.7664241790771484,
+ "learning_rate": 4.087399319493832e-06,
+ "loss": 0.6052,
+ "step": 3638
+ },
+ {
+ "epoch": 1.720567375886525,
+ "grad_norm": 2.5900204181671143,
+ "learning_rate": 4.0869173355188895e-06,
+ "loss": 0.4955,
+ "step": 3639
+ },
+ {
+ "epoch": 1.7210401891252955,
+ "grad_norm": 2.6771862506866455,
+ "learning_rate": 4.0864352527335635e-06,
+ "loss": 0.4889,
+ "step": 3640
+ },
+ {
+ "epoch": 1.7215130023640661,
+ "grad_norm": 2.888479471206665,
+ "learning_rate": 4.085953071167871e-06,
+ "loss": 0.5719,
+ "step": 3641
+ },
+ {
+ "epoch": 1.721985815602837,
+ "grad_norm": 2.5967187881469727,
+ "learning_rate": 4.085470790851833e-06,
+ "loss": 0.4959,
+ "step": 3642
+ },
+ {
+ "epoch": 1.7224586288416077,
+ "grad_norm": 2.5317695140838623,
+ "learning_rate": 4.084988411815483e-06,
+ "loss": 0.4596,
+ "step": 3643
+ },
+ {
+ "epoch": 1.7229314420803783,
+ "grad_norm": 2.6531455516815186,
+ "learning_rate": 4.084505934088853e-06,
+ "loss": 0.5346,
+ "step": 3644
+ },
+ {
+ "epoch": 1.7234042553191489,
+ "grad_norm": 2.6525208950042725,
+ "learning_rate": 4.084023357701987e-06,
+ "loss": 0.5178,
+ "step": 3645
+ },
+ {
+ "epoch": 1.7238770685579197,
+ "grad_norm": 2.461954116821289,
+ "learning_rate": 4.083540682684932e-06,
+ "loss": 0.4802,
+ "step": 3646
+ },
+ {
+ "epoch": 1.7243498817966905,
+ "grad_norm": 2.794696807861328,
+ "learning_rate": 4.083057909067743e-06,
+ "loss": 0.5148,
+ "step": 3647
+ },
+ {
+ "epoch": 1.724822695035461,
+ "grad_norm": 2.867572546005249,
+ "learning_rate": 4.082575036880479e-06,
+ "loss": 0.5352,
+ "step": 3648
+ },
+ {
+ "epoch": 1.7252955082742316,
+ "grad_norm": 2.642820358276367,
+ "learning_rate": 4.082092066153207e-06,
+ "loss": 0.4652,
+ "step": 3649
+ },
+ {
+ "epoch": 1.7257683215130024,
+ "grad_norm": 2.782142400741577,
+ "learning_rate": 4.081608996915999e-06,
+ "loss": 0.5591,
+ "step": 3650
+ },
+ {
+ "epoch": 1.7262411347517732,
+ "grad_norm": 2.327331304550171,
+ "learning_rate": 4.081125829198934e-06,
+ "loss": 0.4339,
+ "step": 3651
+ },
+ {
+ "epoch": 1.7267139479905438,
+ "grad_norm": 2.7959988117218018,
+ "learning_rate": 4.0806425630320965e-06,
+ "loss": 0.5783,
+ "step": 3652
+ },
+ {
+ "epoch": 1.7271867612293144,
+ "grad_norm": 2.595053195953369,
+ "learning_rate": 4.080159198445578e-06,
+ "loss": 0.4602,
+ "step": 3653
+ },
+ {
+ "epoch": 1.7276595744680852,
+ "grad_norm": 3.0968129634857178,
+ "learning_rate": 4.079675735469475e-06,
+ "loss": 0.5775,
+ "step": 3654
+ },
+ {
+ "epoch": 1.728132387706856,
+ "grad_norm": 2.628044605255127,
+ "learning_rate": 4.07919217413389e-06,
+ "loss": 0.486,
+ "step": 3655
+ },
+ {
+ "epoch": 1.7286052009456265,
+ "grad_norm": 2.782799005508423,
+ "learning_rate": 4.078708514468933e-06,
+ "loss": 0.5282,
+ "step": 3656
+ },
+ {
+ "epoch": 1.729078014184397,
+ "grad_norm": 2.655365467071533,
+ "learning_rate": 4.0782247565047205e-06,
+ "loss": 0.4873,
+ "step": 3657
+ },
+ {
+ "epoch": 1.729550827423168,
+ "grad_norm": 2.9461584091186523,
+ "learning_rate": 4.077740900271371e-06,
+ "loss": 0.548,
+ "step": 3658
+ },
+ {
+ "epoch": 1.7300236406619387,
+ "grad_norm": 2.5094761848449707,
+ "learning_rate": 4.077256945799015e-06,
+ "loss": 0.5437,
+ "step": 3659
+ },
+ {
+ "epoch": 1.7304964539007093,
+ "grad_norm": 2.555793285369873,
+ "learning_rate": 4.0767728931177845e-06,
+ "loss": 0.5268,
+ "step": 3660
+ },
+ {
+ "epoch": 1.7309692671394799,
+ "grad_norm": 2.4433486461639404,
+ "learning_rate": 4.07628874225782e-06,
+ "loss": 0.5211,
+ "step": 3661
+ },
+ {
+ "epoch": 1.7314420803782506,
+ "grad_norm": 2.365206003189087,
+ "learning_rate": 4.075804493249267e-06,
+ "loss": 0.5084,
+ "step": 3662
+ },
+ {
+ "epoch": 1.7319148936170212,
+ "grad_norm": 2.514305830001831,
+ "learning_rate": 4.075320146122278e-06,
+ "loss": 0.4693,
+ "step": 3663
+ },
+ {
+ "epoch": 1.7323877068557918,
+ "grad_norm": 2.9270083904266357,
+ "learning_rate": 4.074835700907012e-06,
+ "loss": 0.5724,
+ "step": 3664
+ },
+ {
+ "epoch": 1.7328605200945626,
+ "grad_norm": 2.938692569732666,
+ "learning_rate": 4.0743511576336315e-06,
+ "loss": 0.5361,
+ "step": 3665
+ },
+ {
+ "epoch": 1.7333333333333334,
+ "grad_norm": 3.1978867053985596,
+ "learning_rate": 4.073866516332307e-06,
+ "loss": 0.6277,
+ "step": 3666
+ },
+ {
+ "epoch": 1.733806146572104,
+ "grad_norm": 2.3477370738983154,
+ "learning_rate": 4.073381777033217e-06,
+ "loss": 0.5139,
+ "step": 3667
+ },
+ {
+ "epoch": 1.7342789598108745,
+ "grad_norm": 2.5954184532165527,
+ "learning_rate": 4.072896939766543e-06,
+ "loss": 0.537,
+ "step": 3668
+ },
+ {
+ "epoch": 1.7347517730496453,
+ "grad_norm": 2.8999998569488525,
+ "learning_rate": 4.072412004562472e-06,
+ "loss": 0.5486,
+ "step": 3669
+ },
+ {
+ "epoch": 1.7352245862884161,
+ "grad_norm": 2.7320556640625,
+ "learning_rate": 4.071926971451201e-06,
+ "loss": 0.6025,
+ "step": 3670
+ },
+ {
+ "epoch": 1.7356973995271867,
+ "grad_norm": 2.499234676361084,
+ "learning_rate": 4.0714418404629304e-06,
+ "loss": 0.456,
+ "step": 3671
+ },
+ {
+ "epoch": 1.7361702127659573,
+ "grad_norm": 2.485924243927002,
+ "learning_rate": 4.070956611627867e-06,
+ "loss": 0.5097,
+ "step": 3672
+ },
+ {
+ "epoch": 1.736643026004728,
+ "grad_norm": 2.513723373413086,
+ "learning_rate": 4.070471284976225e-06,
+ "loss": 0.4744,
+ "step": 3673
+ },
+ {
+ "epoch": 1.7371158392434989,
+ "grad_norm": 2.281977653503418,
+ "learning_rate": 4.06998586053822e-06,
+ "loss": 0.5124,
+ "step": 3674
+ },
+ {
+ "epoch": 1.7375886524822695,
+ "grad_norm": 2.3683905601501465,
+ "learning_rate": 4.069500338344081e-06,
+ "loss": 0.4816,
+ "step": 3675
+ },
+ {
+ "epoch": 1.73806146572104,
+ "grad_norm": 2.5635924339294434,
+ "learning_rate": 4.069014718424038e-06,
+ "loss": 0.5665,
+ "step": 3676
+ },
+ {
+ "epoch": 1.7385342789598108,
+ "grad_norm": 2.7308456897735596,
+ "learning_rate": 4.068529000808328e-06,
+ "loss": 0.534,
+ "step": 3677
+ },
+ {
+ "epoch": 1.7390070921985816,
+ "grad_norm": 2.788452625274658,
+ "learning_rate": 4.068043185527196e-06,
+ "loss": 0.5609,
+ "step": 3678
+ },
+ {
+ "epoch": 1.7394799054373522,
+ "grad_norm": 2.832368850708008,
+ "learning_rate": 4.067557272610889e-06,
+ "loss": 0.553,
+ "step": 3679
+ },
+ {
+ "epoch": 1.7399527186761228,
+ "grad_norm": 2.9987435340881348,
+ "learning_rate": 4.067071262089665e-06,
+ "loss": 0.5,
+ "step": 3680
+ },
+ {
+ "epoch": 1.7404255319148936,
+ "grad_norm": 3.04913067817688,
+ "learning_rate": 4.066585153993785e-06,
+ "loss": 0.5158,
+ "step": 3681
+ },
+ {
+ "epoch": 1.7408983451536644,
+ "grad_norm": 2.5177130699157715,
+ "learning_rate": 4.066098948353516e-06,
+ "loss": 0.4508,
+ "step": 3682
+ },
+ {
+ "epoch": 1.741371158392435,
+ "grad_norm": 2.8991222381591797,
+ "learning_rate": 4.065612645199133e-06,
+ "loss": 0.5268,
+ "step": 3683
+ },
+ {
+ "epoch": 1.7418439716312055,
+ "grad_norm": 2.4928159713745117,
+ "learning_rate": 4.0651262445609156e-06,
+ "loss": 0.5024,
+ "step": 3684
+ },
+ {
+ "epoch": 1.7423167848699763,
+ "grad_norm": 2.9737319946289062,
+ "learning_rate": 4.06463974646915e-06,
+ "loss": 0.5429,
+ "step": 3685
+ },
+ {
+ "epoch": 1.7427895981087471,
+ "grad_norm": 2.6485493183135986,
+ "learning_rate": 4.064153150954128e-06,
+ "loss": 0.5619,
+ "step": 3686
+ },
+ {
+ "epoch": 1.7432624113475177,
+ "grad_norm": 2.564861297607422,
+ "learning_rate": 4.063666458046148e-06,
+ "loss": 0.4878,
+ "step": 3687
+ },
+ {
+ "epoch": 1.7437352245862883,
+ "grad_norm": 2.6048383712768555,
+ "learning_rate": 4.063179667775514e-06,
+ "loss": 0.4836,
+ "step": 3688
+ },
+ {
+ "epoch": 1.744208037825059,
+ "grad_norm": 2.751638650894165,
+ "learning_rate": 4.062692780172536e-06,
+ "loss": 0.5558,
+ "step": 3689
+ },
+ {
+ "epoch": 1.7446808510638299,
+ "grad_norm": 3.3866634368896484,
+ "learning_rate": 4.062205795267531e-06,
+ "loss": 0.4825,
+ "step": 3690
+ },
+ {
+ "epoch": 1.7451536643026004,
+ "grad_norm": 3.0112249851226807,
+ "learning_rate": 4.061718713090822e-06,
+ "loss": 0.5732,
+ "step": 3691
+ },
+ {
+ "epoch": 1.745626477541371,
+ "grad_norm": 2.5889365673065186,
+ "learning_rate": 4.061231533672736e-06,
+ "loss": 0.483,
+ "step": 3692
+ },
+ {
+ "epoch": 1.7460992907801418,
+ "grad_norm": 2.624598979949951,
+ "learning_rate": 4.0607442570436085e-06,
+ "loss": 0.5706,
+ "step": 3693
+ },
+ {
+ "epoch": 1.7465721040189126,
+ "grad_norm": 2.9219250679016113,
+ "learning_rate": 4.060256883233779e-06,
+ "loss": 0.5153,
+ "step": 3694
+ },
+ {
+ "epoch": 1.7470449172576832,
+ "grad_norm": 3.2219252586364746,
+ "learning_rate": 4.059769412273595e-06,
+ "loss": 0.5184,
+ "step": 3695
+ },
+ {
+ "epoch": 1.7475177304964538,
+ "grad_norm": 2.890697956085205,
+ "learning_rate": 4.05928184419341e-06,
+ "loss": 0.5312,
+ "step": 3696
+ },
+ {
+ "epoch": 1.7479905437352246,
+ "grad_norm": 2.673809289932251,
+ "learning_rate": 4.0587941790235816e-06,
+ "loss": 0.4893,
+ "step": 3697
+ },
+ {
+ "epoch": 1.7484633569739954,
+ "grad_norm": 2.5339348316192627,
+ "learning_rate": 4.058306416794474e-06,
+ "loss": 0.5115,
+ "step": 3698
+ },
+ {
+ "epoch": 1.748936170212766,
+ "grad_norm": 2.6525840759277344,
+ "learning_rate": 4.05781855753646e-06,
+ "loss": 0.5256,
+ "step": 3699
+ },
+ {
+ "epoch": 1.7494089834515365,
+ "grad_norm": 2.7868754863739014,
+ "learning_rate": 4.057330601279914e-06,
+ "loss": 0.5227,
+ "step": 3700
+ },
+ {
+ "epoch": 1.7498817966903073,
+ "grad_norm": 3.1629884243011475,
+ "learning_rate": 4.056842548055221e-06,
+ "loss": 0.5617,
+ "step": 3701
+ },
+ {
+ "epoch": 1.750354609929078,
+ "grad_norm": 2.9350688457489014,
+ "learning_rate": 4.056354397892769e-06,
+ "loss": 0.4753,
+ "step": 3702
+ },
+ {
+ "epoch": 1.7508274231678487,
+ "grad_norm": 2.9688615798950195,
+ "learning_rate": 4.0558661508229525e-06,
+ "loss": 0.596,
+ "step": 3703
+ },
+ {
+ "epoch": 1.7513002364066192,
+ "grad_norm": 2.802205801010132,
+ "learning_rate": 4.055377806876174e-06,
+ "loss": 0.5793,
+ "step": 3704
+ },
+ {
+ "epoch": 1.75177304964539,
+ "grad_norm": 2.4933416843414307,
+ "learning_rate": 4.054889366082839e-06,
+ "loss": 0.4824,
+ "step": 3705
+ },
+ {
+ "epoch": 1.7522458628841608,
+ "grad_norm": 3.7904608249664307,
+ "learning_rate": 4.054400828473361e-06,
+ "loss": 0.5124,
+ "step": 3706
+ },
+ {
+ "epoch": 1.7527186761229314,
+ "grad_norm": 2.694838762283325,
+ "learning_rate": 4.053912194078159e-06,
+ "loss": 0.5604,
+ "step": 3707
+ },
+ {
+ "epoch": 1.753191489361702,
+ "grad_norm": 2.3721256256103516,
+ "learning_rate": 4.053423462927659e-06,
+ "loss": 0.4978,
+ "step": 3708
+ },
+ {
+ "epoch": 1.7536643026004728,
+ "grad_norm": 2.718512773513794,
+ "learning_rate": 4.052934635052292e-06,
+ "loss": 0.5029,
+ "step": 3709
+ },
+ {
+ "epoch": 1.7541371158392436,
+ "grad_norm": 3.061558246612549,
+ "learning_rate": 4.052445710482493e-06,
+ "loss": 0.4886,
+ "step": 3710
+ },
+ {
+ "epoch": 1.7546099290780142,
+ "grad_norm": 3.0490729808807373,
+ "learning_rate": 4.051956689248709e-06,
+ "loss": 0.5363,
+ "step": 3711
+ },
+ {
+ "epoch": 1.7550827423167847,
+ "grad_norm": 2.611661672592163,
+ "learning_rate": 4.051467571381385e-06,
+ "loss": 0.5397,
+ "step": 3712
+ },
+ {
+ "epoch": 1.7555555555555555,
+ "grad_norm": 2.7829177379608154,
+ "learning_rate": 4.050978356910979e-06,
+ "loss": 0.4973,
+ "step": 3713
+ },
+ {
+ "epoch": 1.7560283687943263,
+ "grad_norm": 2.6228256225585938,
+ "learning_rate": 4.0504890458679525e-06,
+ "loss": 0.4551,
+ "step": 3714
+ },
+ {
+ "epoch": 1.756501182033097,
+ "grad_norm": 2.6801326274871826,
+ "learning_rate": 4.049999638282771e-06,
+ "loss": 0.5581,
+ "step": 3715
+ },
+ {
+ "epoch": 1.7569739952718675,
+ "grad_norm": 2.4476819038391113,
+ "learning_rate": 4.049510134185908e-06,
+ "loss": 0.5226,
+ "step": 3716
+ },
+ {
+ "epoch": 1.7574468085106383,
+ "grad_norm": 2.5661075115203857,
+ "learning_rate": 4.049020533607844e-06,
+ "loss": 0.5163,
+ "step": 3717
+ },
+ {
+ "epoch": 1.757919621749409,
+ "grad_norm": 2.3923349380493164,
+ "learning_rate": 4.048530836579065e-06,
+ "loss": 0.5076,
+ "step": 3718
+ },
+ {
+ "epoch": 1.7583924349881797,
+ "grad_norm": 2.8204405307769775,
+ "learning_rate": 4.0480410431300585e-06,
+ "loss": 0.5883,
+ "step": 3719
+ },
+ {
+ "epoch": 1.7588652482269502,
+ "grad_norm": 2.323107957839966,
+ "learning_rate": 4.047551153291325e-06,
+ "loss": 0.5116,
+ "step": 3720
+ },
+ {
+ "epoch": 1.759338061465721,
+ "grad_norm": 2.8306009769439697,
+ "learning_rate": 4.047061167093368e-06,
+ "loss": 0.5094,
+ "step": 3721
+ },
+ {
+ "epoch": 1.7598108747044918,
+ "grad_norm": 2.568765640258789,
+ "learning_rate": 4.046571084566695e-06,
+ "loss": 0.4725,
+ "step": 3722
+ },
+ {
+ "epoch": 1.7602836879432624,
+ "grad_norm": 2.7212061882019043,
+ "learning_rate": 4.046080905741822e-06,
+ "loss": 0.4741,
+ "step": 3723
+ },
+ {
+ "epoch": 1.760756501182033,
+ "grad_norm": 2.802917003631592,
+ "learning_rate": 4.04559063064927e-06,
+ "loss": 0.5691,
+ "step": 3724
+ },
+ {
+ "epoch": 1.7612293144208038,
+ "grad_norm": 3.1044139862060547,
+ "learning_rate": 4.0451002593195675e-06,
+ "loss": 0.5472,
+ "step": 3725
+ },
+ {
+ "epoch": 1.7617021276595746,
+ "grad_norm": 2.5855562686920166,
+ "learning_rate": 4.044609791783246e-06,
+ "loss": 0.4852,
+ "step": 3726
+ },
+ {
+ "epoch": 1.7621749408983451,
+ "grad_norm": 2.6235129833221436,
+ "learning_rate": 4.0441192280708465e-06,
+ "loss": 0.5269,
+ "step": 3727
+ },
+ {
+ "epoch": 1.7626477541371157,
+ "grad_norm": 3.535630464553833,
+ "learning_rate": 4.043628568212914e-06,
+ "loss": 0.5266,
+ "step": 3728
+ },
+ {
+ "epoch": 1.7631205673758865,
+ "grad_norm": 2.7783355712890625,
+ "learning_rate": 4.043137812239998e-06,
+ "loss": 0.5609,
+ "step": 3729
+ },
+ {
+ "epoch": 1.7635933806146573,
+ "grad_norm": 2.9344944953918457,
+ "learning_rate": 4.042646960182657e-06,
+ "loss": 0.5056,
+ "step": 3730
+ },
+ {
+ "epoch": 1.7640661938534279,
+ "grad_norm": 2.6205739974975586,
+ "learning_rate": 4.042156012071453e-06,
+ "loss": 0.4914,
+ "step": 3731
+ },
+ {
+ "epoch": 1.7645390070921985,
+ "grad_norm": 2.8004493713378906,
+ "learning_rate": 4.041664967936958e-06,
+ "loss": 0.4901,
+ "step": 3732
+ },
+ {
+ "epoch": 1.7650118203309693,
+ "grad_norm": 2.944589138031006,
+ "learning_rate": 4.041173827809745e-06,
+ "loss": 0.5572,
+ "step": 3733
+ },
+ {
+ "epoch": 1.76548463356974,
+ "grad_norm": 2.5021605491638184,
+ "learning_rate": 4.040682591720397e-06,
+ "loss": 0.4637,
+ "step": 3734
+ },
+ {
+ "epoch": 1.7659574468085106,
+ "grad_norm": 2.448030948638916,
+ "learning_rate": 4.040191259699497e-06,
+ "loss": 0.4785,
+ "step": 3735
+ },
+ {
+ "epoch": 1.7664302600472812,
+ "grad_norm": 2.7171032428741455,
+ "learning_rate": 4.039699831777643e-06,
+ "loss": 0.4919,
+ "step": 3736
+ },
+ {
+ "epoch": 1.766903073286052,
+ "grad_norm": 2.453118324279785,
+ "learning_rate": 4.03920830798543e-06,
+ "loss": 0.4326,
+ "step": 3737
+ },
+ {
+ "epoch": 1.7673758865248228,
+ "grad_norm": 3.112877368927002,
+ "learning_rate": 4.038716688353466e-06,
+ "loss": 0.5375,
+ "step": 3738
+ },
+ {
+ "epoch": 1.7678486997635934,
+ "grad_norm": 2.742239236831665,
+ "learning_rate": 4.038224972912361e-06,
+ "loss": 0.5267,
+ "step": 3739
+ },
+ {
+ "epoch": 1.768321513002364,
+ "grad_norm": 2.544785737991333,
+ "learning_rate": 4.037733161692731e-06,
+ "loss": 0.5032,
+ "step": 3740
+ },
+ {
+ "epoch": 1.7687943262411348,
+ "grad_norm": 2.4639062881469727,
+ "learning_rate": 4.037241254725201e-06,
+ "loss": 0.5532,
+ "step": 3741
+ },
+ {
+ "epoch": 1.7692671394799055,
+ "grad_norm": 2.866290330886841,
+ "learning_rate": 4.036749252040398e-06,
+ "loss": 0.5503,
+ "step": 3742
+ },
+ {
+ "epoch": 1.7697399527186761,
+ "grad_norm": 2.3466262817382812,
+ "learning_rate": 4.0362571536689575e-06,
+ "loss": 0.5286,
+ "step": 3743
+ },
+ {
+ "epoch": 1.7702127659574467,
+ "grad_norm": 2.246464967727661,
+ "learning_rate": 4.03576495964152e-06,
+ "loss": 0.4656,
+ "step": 3744
+ },
+ {
+ "epoch": 1.7706855791962175,
+ "grad_norm": 2.667558431625366,
+ "learning_rate": 4.035272669988733e-06,
+ "loss": 0.5205,
+ "step": 3745
+ },
+ {
+ "epoch": 1.7711583924349883,
+ "grad_norm": 2.974666118621826,
+ "learning_rate": 4.034780284741249e-06,
+ "loss": 0.6007,
+ "step": 3746
+ },
+ {
+ "epoch": 1.7716312056737589,
+ "grad_norm": 2.7164433002471924,
+ "learning_rate": 4.034287803929726e-06,
+ "loss": 0.4913,
+ "step": 3747
+ },
+ {
+ "epoch": 1.7721040189125294,
+ "grad_norm": 2.5923962593078613,
+ "learning_rate": 4.033795227584829e-06,
+ "loss": 0.5275,
+ "step": 3748
+ },
+ {
+ "epoch": 1.7725768321513002,
+ "grad_norm": 2.606027126312256,
+ "learning_rate": 4.033302555737229e-06,
+ "loss": 0.4869,
+ "step": 3749
+ },
+ {
+ "epoch": 1.773049645390071,
+ "grad_norm": 3.0110089778900146,
+ "learning_rate": 4.032809788417602e-06,
+ "loss": 0.4956,
+ "step": 3750
+ },
+ {
+ "epoch": 1.7735224586288416,
+ "grad_norm": 3.004598617553711,
+ "learning_rate": 4.032316925656632e-06,
+ "loss": 0.5159,
+ "step": 3751
+ },
+ {
+ "epoch": 1.7739952718676122,
+ "grad_norm": 2.731539249420166,
+ "learning_rate": 4.031823967485005e-06,
+ "loss": 0.5237,
+ "step": 3752
+ },
+ {
+ "epoch": 1.774468085106383,
+ "grad_norm": 2.7466373443603516,
+ "learning_rate": 4.0313309139334155e-06,
+ "loss": 0.4948,
+ "step": 3753
+ },
+ {
+ "epoch": 1.7749408983451538,
+ "grad_norm": 2.8596460819244385,
+ "learning_rate": 4.030837765032565e-06,
+ "loss": 0.5016,
+ "step": 3754
+ },
+ {
+ "epoch": 1.7754137115839244,
+ "grad_norm": 3.2886788845062256,
+ "learning_rate": 4.03034452081316e-06,
+ "loss": 0.5377,
+ "step": 3755
+ },
+ {
+ "epoch": 1.775886524822695,
+ "grad_norm": 2.5629258155822754,
+ "learning_rate": 4.029851181305912e-06,
+ "loss": 0.519,
+ "step": 3756
+ },
+ {
+ "epoch": 1.7763593380614657,
+ "grad_norm": 2.5988714694976807,
+ "learning_rate": 4.029357746541539e-06,
+ "loss": 0.5521,
+ "step": 3757
+ },
+ {
+ "epoch": 1.7768321513002365,
+ "grad_norm": 2.987884759902954,
+ "learning_rate": 4.028864216550765e-06,
+ "loss": 0.6225,
+ "step": 3758
+ },
+ {
+ "epoch": 1.777304964539007,
+ "grad_norm": 2.6875851154327393,
+ "learning_rate": 4.02837059136432e-06,
+ "loss": 0.5321,
+ "step": 3759
+ },
+ {
+ "epoch": 1.7777777777777777,
+ "grad_norm": 2.6414570808410645,
+ "learning_rate": 4.02787687101294e-06,
+ "loss": 0.4831,
+ "step": 3760
+ },
+ {
+ "epoch": 1.7782505910165485,
+ "grad_norm": 2.581475019454956,
+ "learning_rate": 4.027383055527368e-06,
+ "loss": 0.5204,
+ "step": 3761
+ },
+ {
+ "epoch": 1.7787234042553193,
+ "grad_norm": 2.811298131942749,
+ "learning_rate": 4.026889144938349e-06,
+ "loss": 0.5486,
+ "step": 3762
+ },
+ {
+ "epoch": 1.7791962174940898,
+ "grad_norm": 3.1589081287384033,
+ "learning_rate": 4.026395139276639e-06,
+ "loss": 0.4979,
+ "step": 3763
+ },
+ {
+ "epoch": 1.7796690307328604,
+ "grad_norm": 2.3773093223571777,
+ "learning_rate": 4.025901038572996e-06,
+ "loss": 0.503,
+ "step": 3764
+ },
+ {
+ "epoch": 1.7801418439716312,
+ "grad_norm": 2.962541341781616,
+ "learning_rate": 4.025406842858187e-06,
+ "loss": 0.4613,
+ "step": 3765
+ },
+ {
+ "epoch": 1.780614657210402,
+ "grad_norm": 2.603092908859253,
+ "learning_rate": 4.024912552162982e-06,
+ "loss": 0.5142,
+ "step": 3766
+ },
+ {
+ "epoch": 1.7810874704491726,
+ "grad_norm": 2.648927927017212,
+ "learning_rate": 4.024418166518159e-06,
+ "loss": 0.4491,
+ "step": 3767
+ },
+ {
+ "epoch": 1.7815602836879432,
+ "grad_norm": 3.3239917755126953,
+ "learning_rate": 4.023923685954502e-06,
+ "loss": 0.6272,
+ "step": 3768
+ },
+ {
+ "epoch": 1.782033096926714,
+ "grad_norm": 2.672821283340454,
+ "learning_rate": 4.023429110502798e-06,
+ "loss": 0.5171,
+ "step": 3769
+ },
+ {
+ "epoch": 1.7825059101654848,
+ "grad_norm": 2.364332437515259,
+ "learning_rate": 4.022934440193844e-06,
+ "loss": 0.4513,
+ "step": 3770
+ },
+ {
+ "epoch": 1.7829787234042553,
+ "grad_norm": 3.03108549118042,
+ "learning_rate": 4.022439675058441e-06,
+ "loss": 0.4324,
+ "step": 3771
+ },
+ {
+ "epoch": 1.783451536643026,
+ "grad_norm": 2.647557020187378,
+ "learning_rate": 4.021944815127393e-06,
+ "loss": 0.5162,
+ "step": 3772
+ },
+ {
+ "epoch": 1.7839243498817967,
+ "grad_norm": 2.4111907482147217,
+ "learning_rate": 4.021449860431517e-06,
+ "loss": 0.4712,
+ "step": 3773
+ },
+ {
+ "epoch": 1.7843971631205675,
+ "grad_norm": 2.796175718307495,
+ "learning_rate": 4.020954811001629e-06,
+ "loss": 0.5131,
+ "step": 3774
+ },
+ {
+ "epoch": 1.784869976359338,
+ "grad_norm": 2.4594924449920654,
+ "learning_rate": 4.020459666868553e-06,
+ "loss": 0.4739,
+ "step": 3775
+ },
+ {
+ "epoch": 1.7853427895981087,
+ "grad_norm": 2.5735671520233154,
+ "learning_rate": 4.0199644280631215e-06,
+ "loss": 0.4716,
+ "step": 3776
+ },
+ {
+ "epoch": 1.7858156028368795,
+ "grad_norm": 2.419990062713623,
+ "learning_rate": 4.01946909461617e-06,
+ "loss": 0.4866,
+ "step": 3777
+ },
+ {
+ "epoch": 1.7862884160756503,
+ "grad_norm": 2.5597951412200928,
+ "learning_rate": 4.01897366655854e-06,
+ "loss": 0.5569,
+ "step": 3778
+ },
+ {
+ "epoch": 1.7867612293144208,
+ "grad_norm": 2.462383985519409,
+ "learning_rate": 4.018478143921081e-06,
+ "loss": 0.4588,
+ "step": 3779
+ },
+ {
+ "epoch": 1.7872340425531914,
+ "grad_norm": 2.536701202392578,
+ "learning_rate": 4.017982526734646e-06,
+ "loss": 0.5278,
+ "step": 3780
+ },
+ {
+ "epoch": 1.7877068557919622,
+ "grad_norm": 2.691077470779419,
+ "learning_rate": 4.017486815030095e-06,
+ "loss": 0.4815,
+ "step": 3781
+ },
+ {
+ "epoch": 1.788179669030733,
+ "grad_norm": 2.4277288913726807,
+ "learning_rate": 4.016991008838294e-06,
+ "loss": 0.4877,
+ "step": 3782
+ },
+ {
+ "epoch": 1.7886524822695036,
+ "grad_norm": 2.6740009784698486,
+ "learning_rate": 4.016495108190115e-06,
+ "loss": 0.572,
+ "step": 3783
+ },
+ {
+ "epoch": 1.7891252955082741,
+ "grad_norm": 3.179232120513916,
+ "learning_rate": 4.0159991131164355e-06,
+ "loss": 0.4821,
+ "step": 3784
+ },
+ {
+ "epoch": 1.789598108747045,
+ "grad_norm": 3.2747793197631836,
+ "learning_rate": 4.015503023648138e-06,
+ "loss": 0.5517,
+ "step": 3785
+ },
+ {
+ "epoch": 1.7900709219858157,
+ "grad_norm": 2.671367645263672,
+ "learning_rate": 4.015006839816113e-06,
+ "loss": 0.5158,
+ "step": 3786
+ },
+ {
+ "epoch": 1.7905437352245863,
+ "grad_norm": 2.6600193977355957,
+ "learning_rate": 4.014510561651256e-06,
+ "loss": 0.535,
+ "step": 3787
+ },
+ {
+ "epoch": 1.791016548463357,
+ "grad_norm": 2.481509208679199,
+ "learning_rate": 4.014014189184466e-06,
+ "loss": 0.5596,
+ "step": 3788
+ },
+ {
+ "epoch": 1.7914893617021277,
+ "grad_norm": 2.759816884994507,
+ "learning_rate": 4.013517722446652e-06,
+ "loss": 0.5201,
+ "step": 3789
+ },
+ {
+ "epoch": 1.7919621749408985,
+ "grad_norm": 2.6913561820983887,
+ "learning_rate": 4.013021161468724e-06,
+ "loss": 0.5758,
+ "step": 3790
+ },
+ {
+ "epoch": 1.792434988179669,
+ "grad_norm": 2.775087594985962,
+ "learning_rate": 4.0125245062816044e-06,
+ "loss": 0.499,
+ "step": 3791
+ },
+ {
+ "epoch": 1.7929078014184396,
+ "grad_norm": 2.6134777069091797,
+ "learning_rate": 4.012027756916216e-06,
+ "loss": 0.5659,
+ "step": 3792
+ },
+ {
+ "epoch": 1.7933806146572104,
+ "grad_norm": 2.7109756469726562,
+ "learning_rate": 4.0115309134034895e-06,
+ "loss": 0.5337,
+ "step": 3793
+ },
+ {
+ "epoch": 1.7938534278959812,
+ "grad_norm": 2.5389950275421143,
+ "learning_rate": 4.0110339757743595e-06,
+ "loss": 0.4501,
+ "step": 3794
+ },
+ {
+ "epoch": 1.7943262411347518,
+ "grad_norm": 2.634648561477661,
+ "learning_rate": 4.010536944059771e-06,
+ "loss": 0.4411,
+ "step": 3795
+ },
+ {
+ "epoch": 1.7947990543735224,
+ "grad_norm": 2.527070999145508,
+ "learning_rate": 4.0100398182906695e-06,
+ "loss": 0.5145,
+ "step": 3796
+ },
+ {
+ "epoch": 1.7952718676122932,
+ "grad_norm": 2.62988543510437,
+ "learning_rate": 4.0095425984980105e-06,
+ "loss": 0.4981,
+ "step": 3797
+ },
+ {
+ "epoch": 1.795744680851064,
+ "grad_norm": 2.6032519340515137,
+ "learning_rate": 4.009045284712752e-06,
+ "loss": 0.453,
+ "step": 3798
+ },
+ {
+ "epoch": 1.7962174940898346,
+ "grad_norm": 2.735173463821411,
+ "learning_rate": 4.008547876965863e-06,
+ "loss": 0.5925,
+ "step": 3799
+ },
+ {
+ "epoch": 1.7966903073286051,
+ "grad_norm": 2.6296730041503906,
+ "learning_rate": 4.00805037528831e-06,
+ "loss": 0.5651,
+ "step": 3800
+ },
+ {
+ "epoch": 1.797163120567376,
+ "grad_norm": 2.641214370727539,
+ "learning_rate": 4.0075527797110735e-06,
+ "loss": 0.4973,
+ "step": 3801
+ },
+ {
+ "epoch": 1.7976359338061467,
+ "grad_norm": 2.6104819774627686,
+ "learning_rate": 4.007055090265136e-06,
+ "loss": 0.4432,
+ "step": 3802
+ },
+ {
+ "epoch": 1.7981087470449173,
+ "grad_norm": 2.8200619220733643,
+ "learning_rate": 4.0065573069814865e-06,
+ "loss": 0.4899,
+ "step": 3803
+ },
+ {
+ "epoch": 1.7985815602836879,
+ "grad_norm": 2.982354164123535,
+ "learning_rate": 4.006059429891119e-06,
+ "loss": 0.5488,
+ "step": 3804
+ },
+ {
+ "epoch": 1.7990543735224587,
+ "grad_norm": 2.7561678886413574,
+ "learning_rate": 4.005561459025034e-06,
+ "loss": 0.5637,
+ "step": 3805
+ },
+ {
+ "epoch": 1.7995271867612295,
+ "grad_norm": 2.702212333679199,
+ "learning_rate": 4.005063394414241e-06,
+ "loss": 0.4804,
+ "step": 3806
+ },
+ {
+ "epoch": 1.8,
+ "grad_norm": 2.8655319213867188,
+ "learning_rate": 4.004565236089748e-06,
+ "loss": 0.5759,
+ "step": 3807
+ },
+ {
+ "epoch": 1.8004728132387706,
+ "grad_norm": 2.703676223754883,
+ "learning_rate": 4.0040669840825756e-06,
+ "loss": 0.4728,
+ "step": 3808
+ },
+ {
+ "epoch": 1.8009456264775414,
+ "grad_norm": 2.802645683288574,
+ "learning_rate": 4.003568638423747e-06,
+ "loss": 0.5421,
+ "step": 3809
+ },
+ {
+ "epoch": 1.8014184397163122,
+ "grad_norm": 2.4723124504089355,
+ "learning_rate": 4.003070199144292e-06,
+ "loss": 0.4944,
+ "step": 3810
+ },
+ {
+ "epoch": 1.8018912529550828,
+ "grad_norm": 2.4889068603515625,
+ "learning_rate": 4.0025716662752475e-06,
+ "loss": 0.4774,
+ "step": 3811
+ },
+ {
+ "epoch": 1.8023640661938534,
+ "grad_norm": 2.5408077239990234,
+ "learning_rate": 4.002073039847653e-06,
+ "loss": 0.5233,
+ "step": 3812
+ },
+ {
+ "epoch": 1.8028368794326242,
+ "grad_norm": 2.734602689743042,
+ "learning_rate": 4.001574319892557e-06,
+ "loss": 0.5403,
+ "step": 3813
+ },
+ {
+ "epoch": 1.803309692671395,
+ "grad_norm": 3.3786163330078125,
+ "learning_rate": 4.001075506441012e-06,
+ "loss": 0.6969,
+ "step": 3814
+ },
+ {
+ "epoch": 1.8037825059101655,
+ "grad_norm": 2.7375378608703613,
+ "learning_rate": 4.000576599524078e-06,
+ "loss": 0.4907,
+ "step": 3815
+ },
+ {
+ "epoch": 1.804255319148936,
+ "grad_norm": 3.041804075241089,
+ "learning_rate": 4.000077599172818e-06,
+ "loss": 0.6021,
+ "step": 3816
+ },
+ {
+ "epoch": 1.804728132387707,
+ "grad_norm": 2.697599411010742,
+ "learning_rate": 3.999578505418305e-06,
+ "loss": 0.4743,
+ "step": 3817
+ },
+ {
+ "epoch": 1.8052009456264777,
+ "grad_norm": 2.276921272277832,
+ "learning_rate": 3.999079318291612e-06,
+ "loss": 0.4885,
+ "step": 3818
+ },
+ {
+ "epoch": 1.8056737588652483,
+ "grad_norm": 2.4896953105926514,
+ "learning_rate": 3.998580037823825e-06,
+ "loss": 0.503,
+ "step": 3819
+ },
+ {
+ "epoch": 1.8061465721040189,
+ "grad_norm": 2.6232175827026367,
+ "learning_rate": 3.998080664046029e-06,
+ "loss": 0.5058,
+ "step": 3820
+ },
+ {
+ "epoch": 1.8066193853427897,
+ "grad_norm": 2.695861339569092,
+ "learning_rate": 3.997581196989319e-06,
+ "loss": 0.4949,
+ "step": 3821
+ },
+ {
+ "epoch": 1.8070921985815604,
+ "grad_norm": 2.912886142730713,
+ "learning_rate": 3.997081636684795e-06,
+ "loss": 0.4971,
+ "step": 3822
+ },
+ {
+ "epoch": 1.807565011820331,
+ "grad_norm": 2.876500368118286,
+ "learning_rate": 3.996581983163561e-06,
+ "loss": 0.5584,
+ "step": 3823
+ },
+ {
+ "epoch": 1.8080378250591016,
+ "grad_norm": 2.857069730758667,
+ "learning_rate": 3.99608223645673e-06,
+ "loss": 0.5457,
+ "step": 3824
+ },
+ {
+ "epoch": 1.8085106382978724,
+ "grad_norm": 2.486743211746216,
+ "learning_rate": 3.995582396595419e-06,
+ "loss": 0.5291,
+ "step": 3825
+ },
+ {
+ "epoch": 1.808983451536643,
+ "grad_norm": 2.509441375732422,
+ "learning_rate": 3.9950824636107486e-06,
+ "loss": 0.4747,
+ "step": 3826
+ },
+ {
+ "epoch": 1.8094562647754135,
+ "grad_norm": 2.931394100189209,
+ "learning_rate": 3.99458243753385e-06,
+ "loss": 0.5116,
+ "step": 3827
+ },
+ {
+ "epoch": 1.8099290780141843,
+ "grad_norm": 2.4868650436401367,
+ "learning_rate": 3.994082318395856e-06,
+ "loss": 0.4671,
+ "step": 3828
+ },
+ {
+ "epoch": 1.8104018912529551,
+ "grad_norm": 2.5554752349853516,
+ "learning_rate": 3.993582106227907e-06,
+ "loss": 0.4969,
+ "step": 3829
+ },
+ {
+ "epoch": 1.8108747044917257,
+ "grad_norm": 2.8367133140563965,
+ "learning_rate": 3.99308180106115e-06,
+ "loss": 0.5507,
+ "step": 3830
+ },
+ {
+ "epoch": 1.8113475177304963,
+ "grad_norm": 2.68245792388916,
+ "learning_rate": 3.992581402926737e-06,
+ "loss": 0.5115,
+ "step": 3831
+ },
+ {
+ "epoch": 1.811820330969267,
+ "grad_norm": 2.406674385070801,
+ "learning_rate": 3.992080911855824e-06,
+ "loss": 0.545,
+ "step": 3832
+ },
+ {
+ "epoch": 1.8122931442080379,
+ "grad_norm": 2.5003464221954346,
+ "learning_rate": 3.991580327879575e-06,
+ "loss": 0.4331,
+ "step": 3833
+ },
+ {
+ "epoch": 1.8127659574468085,
+ "grad_norm": 2.49320912361145,
+ "learning_rate": 3.99107965102916e-06,
+ "loss": 0.5118,
+ "step": 3834
+ },
+ {
+ "epoch": 1.813238770685579,
+ "grad_norm": 2.6183295249938965,
+ "learning_rate": 3.990578881335752e-06,
+ "loss": 0.5286,
+ "step": 3835
+ },
+ {
+ "epoch": 1.8137115839243498,
+ "grad_norm": 3.1999518871307373,
+ "learning_rate": 3.990078018830534e-06,
+ "loss": 0.5048,
+ "step": 3836
+ },
+ {
+ "epoch": 1.8141843971631206,
+ "grad_norm": 2.4351117610931396,
+ "learning_rate": 3.9895770635446915e-06,
+ "loss": 0.514,
+ "step": 3837
+ },
+ {
+ "epoch": 1.8146572104018912,
+ "grad_norm": 2.6859259605407715,
+ "learning_rate": 3.989076015509416e-06,
+ "loss": 0.5575,
+ "step": 3838
+ },
+ {
+ "epoch": 1.8151300236406618,
+ "grad_norm": 2.790421962738037,
+ "learning_rate": 3.988574874755909e-06,
+ "loss": 0.5467,
+ "step": 3839
+ },
+ {
+ "epoch": 1.8156028368794326,
+ "grad_norm": 2.5202765464782715,
+ "learning_rate": 3.988073641315369e-06,
+ "loss": 0.5229,
+ "step": 3840
+ },
+ {
+ "epoch": 1.8160756501182034,
+ "grad_norm": 2.623652219772339,
+ "learning_rate": 3.987572315219009e-06,
+ "loss": 0.509,
+ "step": 3841
+ },
+ {
+ "epoch": 1.816548463356974,
+ "grad_norm": 2.6038360595703125,
+ "learning_rate": 3.987070896498044e-06,
+ "loss": 0.5304,
+ "step": 3842
+ },
+ {
+ "epoch": 1.8170212765957445,
+ "grad_norm": 2.9378011226654053,
+ "learning_rate": 3.9865693851836955e-06,
+ "loss": 0.5845,
+ "step": 3843
+ },
+ {
+ "epoch": 1.8174940898345153,
+ "grad_norm": 2.4061124324798584,
+ "learning_rate": 3.98606778130719e-06,
+ "loss": 0.4333,
+ "step": 3844
+ },
+ {
+ "epoch": 1.8179669030732861,
+ "grad_norm": 2.483489751815796,
+ "learning_rate": 3.985566084899759e-06,
+ "loss": 0.4827,
+ "step": 3845
+ },
+ {
+ "epoch": 1.8184397163120567,
+ "grad_norm": 2.7774932384490967,
+ "learning_rate": 3.985064295992642e-06,
+ "loss": 0.5016,
+ "step": 3846
+ },
+ {
+ "epoch": 1.8189125295508273,
+ "grad_norm": 2.5936765670776367,
+ "learning_rate": 3.984562414617083e-06,
+ "loss": 0.4448,
+ "step": 3847
+ },
+ {
+ "epoch": 1.819385342789598,
+ "grad_norm": 2.8608627319335938,
+ "learning_rate": 3.9840604408043325e-06,
+ "loss": 0.5735,
+ "step": 3848
+ },
+ {
+ "epoch": 1.8198581560283689,
+ "grad_norm": 2.6212472915649414,
+ "learning_rate": 3.983558374585646e-06,
+ "loss": 0.5091,
+ "step": 3849
+ },
+ {
+ "epoch": 1.8203309692671394,
+ "grad_norm": 2.832460641860962,
+ "learning_rate": 3.983056215992284e-06,
+ "loss": 0.5169,
+ "step": 3850
+ },
+ {
+ "epoch": 1.82080378250591,
+ "grad_norm": 2.5293610095977783,
+ "learning_rate": 3.982553965055514e-06,
+ "loss": 0.4708,
+ "step": 3851
+ },
+ {
+ "epoch": 1.8212765957446808,
+ "grad_norm": 2.9362871646881104,
+ "learning_rate": 3.982051621806611e-06,
+ "loss": 0.575,
+ "step": 3852
+ },
+ {
+ "epoch": 1.8217494089834516,
+ "grad_norm": 2.69073486328125,
+ "learning_rate": 3.98154918627685e-06,
+ "loss": 0.5278,
+ "step": 3853
+ },
+ {
+ "epoch": 1.8222222222222222,
+ "grad_norm": 2.6711034774780273,
+ "learning_rate": 3.98104665849752e-06,
+ "loss": 0.4918,
+ "step": 3854
+ },
+ {
+ "epoch": 1.8226950354609928,
+ "grad_norm": 2.571110963821411,
+ "learning_rate": 3.980544038499907e-06,
+ "loss": 0.5234,
+ "step": 3855
+ },
+ {
+ "epoch": 1.8231678486997636,
+ "grad_norm": 3.2603371143341064,
+ "learning_rate": 3.980041326315309e-06,
+ "loss": 0.5996,
+ "step": 3856
+ },
+ {
+ "epoch": 1.8236406619385344,
+ "grad_norm": 2.8472323417663574,
+ "learning_rate": 3.979538521975028e-06,
+ "loss": 0.4769,
+ "step": 3857
+ },
+ {
+ "epoch": 1.824113475177305,
+ "grad_norm": 2.6714751720428467,
+ "learning_rate": 3.979035625510371e-06,
+ "loss": 0.4826,
+ "step": 3858
+ },
+ {
+ "epoch": 1.8245862884160755,
+ "grad_norm": 2.6816468238830566,
+ "learning_rate": 3.97853263695265e-06,
+ "loss": 0.5127,
+ "step": 3859
+ },
+ {
+ "epoch": 1.8250591016548463,
+ "grad_norm": 2.6464123725891113,
+ "learning_rate": 3.978029556333185e-06,
+ "loss": 0.4925,
+ "step": 3860
+ },
+ {
+ "epoch": 1.825531914893617,
+ "grad_norm": 2.5317227840423584,
+ "learning_rate": 3.977526383683301e-06,
+ "loss": 0.4765,
+ "step": 3861
+ },
+ {
+ "epoch": 1.8260047281323877,
+ "grad_norm": 2.5052425861358643,
+ "learning_rate": 3.977023119034328e-06,
+ "loss": 0.4804,
+ "step": 3862
+ },
+ {
+ "epoch": 1.8264775413711583,
+ "grad_norm": 2.7022836208343506,
+ "learning_rate": 3.976519762417602e-06,
+ "loss": 0.4824,
+ "step": 3863
+ },
+ {
+ "epoch": 1.826950354609929,
+ "grad_norm": 2.7445900440216064,
+ "learning_rate": 3.976016313864464e-06,
+ "loss": 0.5698,
+ "step": 3864
+ },
+ {
+ "epoch": 1.8274231678486998,
+ "grad_norm": 2.442518711090088,
+ "learning_rate": 3.975512773406262e-06,
+ "loss": 0.5133,
+ "step": 3865
+ },
+ {
+ "epoch": 1.8278959810874704,
+ "grad_norm": 2.4100050926208496,
+ "learning_rate": 3.975009141074351e-06,
+ "loss": 0.5044,
+ "step": 3866
+ },
+ {
+ "epoch": 1.828368794326241,
+ "grad_norm": 2.9507648944854736,
+ "learning_rate": 3.974505416900088e-06,
+ "loss": 0.5367,
+ "step": 3867
+ },
+ {
+ "epoch": 1.8288416075650118,
+ "grad_norm": 2.5662600994110107,
+ "learning_rate": 3.974001600914837e-06,
+ "loss": 0.5878,
+ "step": 3868
+ },
+ {
+ "epoch": 1.8293144208037826,
+ "grad_norm": 2.4306657314300537,
+ "learning_rate": 3.973497693149971e-06,
+ "loss": 0.4647,
+ "step": 3869
+ },
+ {
+ "epoch": 1.8297872340425532,
+ "grad_norm": 2.974686622619629,
+ "learning_rate": 3.972993693636864e-06,
+ "loss": 0.4911,
+ "step": 3870
+ },
+ {
+ "epoch": 1.8302600472813237,
+ "grad_norm": 2.5711987018585205,
+ "learning_rate": 3.972489602406899e-06,
+ "loss": 0.5089,
+ "step": 3871
+ },
+ {
+ "epoch": 1.8307328605200945,
+ "grad_norm": 3.259617328643799,
+ "learning_rate": 3.971985419491463e-06,
+ "loss": 0.5966,
+ "step": 3872
+ },
+ {
+ "epoch": 1.8312056737588653,
+ "grad_norm": 2.7437000274658203,
+ "learning_rate": 3.971481144921949e-06,
+ "loss": 0.5097,
+ "step": 3873
+ },
+ {
+ "epoch": 1.831678486997636,
+ "grad_norm": 2.9597461223602295,
+ "learning_rate": 3.970976778729757e-06,
+ "loss": 0.5672,
+ "step": 3874
+ },
+ {
+ "epoch": 1.8321513002364065,
+ "grad_norm": 2.5775723457336426,
+ "learning_rate": 3.970472320946291e-06,
+ "loss": 0.4749,
+ "step": 3875
+ },
+ {
+ "epoch": 1.8326241134751773,
+ "grad_norm": 2.7381200790405273,
+ "learning_rate": 3.969967771602961e-06,
+ "loss": 0.5255,
+ "step": 3876
+ },
+ {
+ "epoch": 1.833096926713948,
+ "grad_norm": 2.651698350906372,
+ "learning_rate": 3.969463130731183e-06,
+ "loss": 0.5098,
+ "step": 3877
+ },
+ {
+ "epoch": 1.8335697399527187,
+ "grad_norm": 2.7277021408081055,
+ "learning_rate": 3.968958398362381e-06,
+ "loss": 0.5251,
+ "step": 3878
+ },
+ {
+ "epoch": 1.8340425531914892,
+ "grad_norm": 2.5184953212738037,
+ "learning_rate": 3.968453574527978e-06,
+ "loss": 0.5086,
+ "step": 3879
+ },
+ {
+ "epoch": 1.83451536643026,
+ "grad_norm": 2.8227882385253906,
+ "learning_rate": 3.967948659259412e-06,
+ "loss": 0.5742,
+ "step": 3880
+ },
+ {
+ "epoch": 1.8349881796690308,
+ "grad_norm": 2.547922134399414,
+ "learning_rate": 3.967443652588119e-06,
+ "loss": 0.5411,
+ "step": 3881
+ },
+ {
+ "epoch": 1.8354609929078014,
+ "grad_norm": 2.6572835445404053,
+ "learning_rate": 3.966938554545545e-06,
+ "loss": 0.4854,
+ "step": 3882
+ },
+ {
+ "epoch": 1.835933806146572,
+ "grad_norm": 2.9416658878326416,
+ "learning_rate": 3.966433365163139e-06,
+ "loss": 0.5236,
+ "step": 3883
+ },
+ {
+ "epoch": 1.8364066193853428,
+ "grad_norm": 2.344325304031372,
+ "learning_rate": 3.965928084472357e-06,
+ "loss": 0.4916,
+ "step": 3884
+ },
+ {
+ "epoch": 1.8368794326241136,
+ "grad_norm": 2.890418291091919,
+ "learning_rate": 3.965422712504662e-06,
+ "loss": 0.5287,
+ "step": 3885
+ },
+ {
+ "epoch": 1.8373522458628841,
+ "grad_norm": 2.6063363552093506,
+ "learning_rate": 3.96491724929152e-06,
+ "loss": 0.4842,
+ "step": 3886
+ },
+ {
+ "epoch": 1.8378250591016547,
+ "grad_norm": 2.5582427978515625,
+ "learning_rate": 3.964411694864404e-06,
+ "loss": 0.4768,
+ "step": 3887
+ },
+ {
+ "epoch": 1.8382978723404255,
+ "grad_norm": 2.84356951713562,
+ "learning_rate": 3.963906049254793e-06,
+ "loss": 0.5284,
+ "step": 3888
+ },
+ {
+ "epoch": 1.8387706855791963,
+ "grad_norm": 2.7048516273498535,
+ "learning_rate": 3.963400312494172e-06,
+ "loss": 0.5271,
+ "step": 3889
+ },
+ {
+ "epoch": 1.839243498817967,
+ "grad_norm": 2.5401699542999268,
+ "learning_rate": 3.962894484614031e-06,
+ "loss": 0.4734,
+ "step": 3890
+ },
+ {
+ "epoch": 1.8397163120567375,
+ "grad_norm": 2.208256244659424,
+ "learning_rate": 3.962388565645864e-06,
+ "loss": 0.4113,
+ "step": 3891
+ },
+ {
+ "epoch": 1.8401891252955083,
+ "grad_norm": 2.775139331817627,
+ "learning_rate": 3.961882555621173e-06,
+ "loss": 0.5172,
+ "step": 3892
+ },
+ {
+ "epoch": 1.840661938534279,
+ "grad_norm": 2.7540855407714844,
+ "learning_rate": 3.961376454571466e-06,
+ "loss": 0.5252,
+ "step": 3893
+ },
+ {
+ "epoch": 1.8411347517730496,
+ "grad_norm": 2.6731574535369873,
+ "learning_rate": 3.960870262528255e-06,
+ "loss": 0.4495,
+ "step": 3894
+ },
+ {
+ "epoch": 1.8416075650118202,
+ "grad_norm": 2.791492223739624,
+ "learning_rate": 3.960363979523058e-06,
+ "loss": 0.5457,
+ "step": 3895
+ },
+ {
+ "epoch": 1.842080378250591,
+ "grad_norm": 2.9280290603637695,
+ "learning_rate": 3.959857605587401e-06,
+ "loss": 0.5373,
+ "step": 3896
+ },
+ {
+ "epoch": 1.8425531914893618,
+ "grad_norm": 2.5652217864990234,
+ "learning_rate": 3.95935114075281e-06,
+ "loss": 0.5191,
+ "step": 3897
+ },
+ {
+ "epoch": 1.8430260047281324,
+ "grad_norm": 2.7297749519348145,
+ "learning_rate": 3.958844585050824e-06,
+ "loss": 0.5366,
+ "step": 3898
+ },
+ {
+ "epoch": 1.843498817966903,
+ "grad_norm": 2.5302982330322266,
+ "learning_rate": 3.958337938512983e-06,
+ "loss": 0.569,
+ "step": 3899
+ },
+ {
+ "epoch": 1.8439716312056738,
+ "grad_norm": 2.644777297973633,
+ "learning_rate": 3.957831201170832e-06,
+ "loss": 0.521,
+ "step": 3900
+ },
+ {
+ "epoch": 1.8444444444444446,
+ "grad_norm": 2.8375515937805176,
+ "learning_rate": 3.957324373055925e-06,
+ "loss": 0.573,
+ "step": 3901
+ },
+ {
+ "epoch": 1.8449172576832151,
+ "grad_norm": 2.512296676635742,
+ "learning_rate": 3.956817454199819e-06,
+ "loss": 0.5081,
+ "step": 3902
+ },
+ {
+ "epoch": 1.8453900709219857,
+ "grad_norm": 2.3662109375,
+ "learning_rate": 3.956310444634079e-06,
+ "loss": 0.4989,
+ "step": 3903
+ },
+ {
+ "epoch": 1.8458628841607565,
+ "grad_norm": 2.6849682331085205,
+ "learning_rate": 3.955803344390272e-06,
+ "loss": 0.5459,
+ "step": 3904
+ },
+ {
+ "epoch": 1.8463356973995273,
+ "grad_norm": 2.8364317417144775,
+ "learning_rate": 3.9552961534999756e-06,
+ "loss": 0.5704,
+ "step": 3905
+ },
+ {
+ "epoch": 1.8468085106382979,
+ "grad_norm": 2.6006948947906494,
+ "learning_rate": 3.954788871994768e-06,
+ "loss": 0.5696,
+ "step": 3906
+ },
+ {
+ "epoch": 1.8472813238770684,
+ "grad_norm": 2.558300018310547,
+ "learning_rate": 3.9542814999062375e-06,
+ "loss": 0.5047,
+ "step": 3907
+ },
+ {
+ "epoch": 1.8477541371158392,
+ "grad_norm": 2.6343321800231934,
+ "learning_rate": 3.953774037265974e-06,
+ "loss": 0.525,
+ "step": 3908
+ },
+ {
+ "epoch": 1.84822695035461,
+ "grad_norm": 2.5050008296966553,
+ "learning_rate": 3.953266484105576e-06,
+ "loss": 0.4867,
+ "step": 3909
+ },
+ {
+ "epoch": 1.8486997635933806,
+ "grad_norm": 2.3775103092193604,
+ "learning_rate": 3.952758840456647e-06,
+ "loss": 0.4349,
+ "step": 3910
+ },
+ {
+ "epoch": 1.8491725768321512,
+ "grad_norm": 2.508376359939575,
+ "learning_rate": 3.952251106350794e-06,
+ "loss": 0.539,
+ "step": 3911
+ },
+ {
+ "epoch": 1.849645390070922,
+ "grad_norm": 2.7403106689453125,
+ "learning_rate": 3.951743281819633e-06,
+ "loss": 0.4478,
+ "step": 3912
+ },
+ {
+ "epoch": 1.8501182033096928,
+ "grad_norm": 2.5332062244415283,
+ "learning_rate": 3.951235366894784e-06,
+ "loss": 0.4658,
+ "step": 3913
+ },
+ {
+ "epoch": 1.8505910165484634,
+ "grad_norm": 3.0137248039245605,
+ "learning_rate": 3.950727361607872e-06,
+ "loss": 0.5047,
+ "step": 3914
+ },
+ {
+ "epoch": 1.851063829787234,
+ "grad_norm": 2.5820653438568115,
+ "learning_rate": 3.950219265990528e-06,
+ "loss": 0.542,
+ "step": 3915
+ },
+ {
+ "epoch": 1.8515366430260047,
+ "grad_norm": 2.555133819580078,
+ "learning_rate": 3.949711080074389e-06,
+ "loss": 0.5253,
+ "step": 3916
+ },
+ {
+ "epoch": 1.8520094562647755,
+ "grad_norm": 2.876882791519165,
+ "learning_rate": 3.949202803891099e-06,
+ "loss": 0.5242,
+ "step": 3917
+ },
+ {
+ "epoch": 1.852482269503546,
+ "grad_norm": 2.5929203033447266,
+ "learning_rate": 3.948694437472305e-06,
+ "loss": 0.5358,
+ "step": 3918
+ },
+ {
+ "epoch": 1.8529550827423167,
+ "grad_norm": 2.468513250350952,
+ "learning_rate": 3.948185980849659e-06,
+ "loss": 0.5119,
+ "step": 3919
+ },
+ {
+ "epoch": 1.8534278959810875,
+ "grad_norm": 2.9259560108184814,
+ "learning_rate": 3.947677434054824e-06,
+ "loss": 0.4756,
+ "step": 3920
+ },
+ {
+ "epoch": 1.8539007092198583,
+ "grad_norm": 2.5247011184692383,
+ "learning_rate": 3.947168797119462e-06,
+ "loss": 0.4627,
+ "step": 3921
+ },
+ {
+ "epoch": 1.8543735224586289,
+ "grad_norm": 2.7396671772003174,
+ "learning_rate": 3.946660070075245e-06,
+ "loss": 0.5013,
+ "step": 3922
+ },
+ {
+ "epoch": 1.8548463356973994,
+ "grad_norm": 2.7059738636016846,
+ "learning_rate": 3.946151252953849e-06,
+ "loss": 0.5875,
+ "step": 3923
+ },
+ {
+ "epoch": 1.8553191489361702,
+ "grad_norm": 2.5638437271118164,
+ "learning_rate": 3.945642345786955e-06,
+ "loss": 0.5063,
+ "step": 3924
+ },
+ {
+ "epoch": 1.855791962174941,
+ "grad_norm": 2.6647839546203613,
+ "learning_rate": 3.945133348606251e-06,
+ "loss": 0.5421,
+ "step": 3925
+ },
+ {
+ "epoch": 1.8562647754137116,
+ "grad_norm": 3.7235286235809326,
+ "learning_rate": 3.944624261443431e-06,
+ "loss": 0.5958,
+ "step": 3926
+ },
+ {
+ "epoch": 1.8567375886524822,
+ "grad_norm": 2.769984245300293,
+ "learning_rate": 3.944115084330192e-06,
+ "loss": 0.5678,
+ "step": 3927
+ },
+ {
+ "epoch": 1.857210401891253,
+ "grad_norm": 2.567249059677124,
+ "learning_rate": 3.9436058172982395e-06,
+ "loss": 0.4767,
+ "step": 3928
+ },
+ {
+ "epoch": 1.8576832151300238,
+ "grad_norm": 2.6196048259735107,
+ "learning_rate": 3.943096460379283e-06,
+ "loss": 0.5345,
+ "step": 3929
+ },
+ {
+ "epoch": 1.8581560283687943,
+ "grad_norm": 2.5999555587768555,
+ "learning_rate": 3.942587013605037e-06,
+ "loss": 0.5482,
+ "step": 3930
+ },
+ {
+ "epoch": 1.858628841607565,
+ "grad_norm": 2.630387783050537,
+ "learning_rate": 3.942077477007224e-06,
+ "loss": 0.6023,
+ "step": 3931
+ },
+ {
+ "epoch": 1.8591016548463357,
+ "grad_norm": 2.543503761291504,
+ "learning_rate": 3.941567850617569e-06,
+ "loss": 0.5157,
+ "step": 3932
+ },
+ {
+ "epoch": 1.8595744680851065,
+ "grad_norm": 2.5109236240386963,
+ "learning_rate": 3.941058134467805e-06,
+ "loss": 0.4774,
+ "step": 3933
+ },
+ {
+ "epoch": 1.860047281323877,
+ "grad_norm": 2.5110230445861816,
+ "learning_rate": 3.94054832858967e-06,
+ "loss": 0.5064,
+ "step": 3934
+ },
+ {
+ "epoch": 1.8605200945626477,
+ "grad_norm": 2.4780776500701904,
+ "learning_rate": 3.940038433014908e-06,
+ "loss": 0.5216,
+ "step": 3935
+ },
+ {
+ "epoch": 1.8609929078014185,
+ "grad_norm": 2.4398856163024902,
+ "learning_rate": 3.939528447775266e-06,
+ "loss": 0.4958,
+ "step": 3936
+ },
+ {
+ "epoch": 1.8614657210401893,
+ "grad_norm": 2.449498176574707,
+ "learning_rate": 3.9390183729025e-06,
+ "loss": 0.5165,
+ "step": 3937
+ },
+ {
+ "epoch": 1.8619385342789598,
+ "grad_norm": 2.982544422149658,
+ "learning_rate": 3.938508208428371e-06,
+ "loss": 0.4803,
+ "step": 3938
+ },
+ {
+ "epoch": 1.8624113475177304,
+ "grad_norm": 2.6574015617370605,
+ "learning_rate": 3.937997954384641e-06,
+ "loss": 0.4797,
+ "step": 3939
+ },
+ {
+ "epoch": 1.8628841607565012,
+ "grad_norm": 2.7773542404174805,
+ "learning_rate": 3.937487610803086e-06,
+ "loss": 0.4843,
+ "step": 3940
+ },
+ {
+ "epoch": 1.863356973995272,
+ "grad_norm": 2.588937759399414,
+ "learning_rate": 3.9369771777154805e-06,
+ "loss": 0.5426,
+ "step": 3941
+ },
+ {
+ "epoch": 1.8638297872340426,
+ "grad_norm": 2.855442523956299,
+ "learning_rate": 3.936466655153607e-06,
+ "loss": 0.5443,
+ "step": 3942
+ },
+ {
+ "epoch": 1.8643026004728132,
+ "grad_norm": 2.554676055908203,
+ "learning_rate": 3.935956043149253e-06,
+ "loss": 0.5334,
+ "step": 3943
+ },
+ {
+ "epoch": 1.864775413711584,
+ "grad_norm": 2.901599884033203,
+ "learning_rate": 3.935445341734212e-06,
+ "loss": 0.5842,
+ "step": 3944
+ },
+ {
+ "epoch": 1.8652482269503547,
+ "grad_norm": 2.554485321044922,
+ "learning_rate": 3.934934550940285e-06,
+ "loss": 0.4941,
+ "step": 3945
+ },
+ {
+ "epoch": 1.8657210401891253,
+ "grad_norm": 2.357203245162964,
+ "learning_rate": 3.934423670799275e-06,
+ "loss": 0.4402,
+ "step": 3946
+ },
+ {
+ "epoch": 1.866193853427896,
+ "grad_norm": 2.7036049365997314,
+ "learning_rate": 3.933912701342993e-06,
+ "loss": 0.4966,
+ "step": 3947
+ },
+ {
+ "epoch": 1.8666666666666667,
+ "grad_norm": 2.7817211151123047,
+ "learning_rate": 3.933401642603255e-06,
+ "loss": 0.4908,
+ "step": 3948
+ },
+ {
+ "epoch": 1.8671394799054375,
+ "grad_norm": 2.439490795135498,
+ "learning_rate": 3.932890494611882e-06,
+ "loss": 0.4322,
+ "step": 3949
+ },
+ {
+ "epoch": 1.867612293144208,
+ "grad_norm": 3.187152147293091,
+ "learning_rate": 3.9323792574007e-06,
+ "loss": 0.501,
+ "step": 3950
+ },
+ {
+ "epoch": 1.8680851063829786,
+ "grad_norm": 2.405773401260376,
+ "learning_rate": 3.931867931001543e-06,
+ "loss": 0.4477,
+ "step": 3951
+ },
+ {
+ "epoch": 1.8685579196217494,
+ "grad_norm": 2.4922525882720947,
+ "learning_rate": 3.931356515446248e-06,
+ "loss": 0.5098,
+ "step": 3952
+ },
+ {
+ "epoch": 1.8690307328605202,
+ "grad_norm": 2.7781267166137695,
+ "learning_rate": 3.93084501076666e-06,
+ "loss": 0.5815,
+ "step": 3953
+ },
+ {
+ "epoch": 1.8695035460992908,
+ "grad_norm": 2.74621844291687,
+ "learning_rate": 3.930333416994626e-06,
+ "loss": 0.5605,
+ "step": 3954
+ },
+ {
+ "epoch": 1.8699763593380614,
+ "grad_norm": 2.5527689456939697,
+ "learning_rate": 3.929821734162004e-06,
+ "loss": 0.5141,
+ "step": 3955
+ },
+ {
+ "epoch": 1.8704491725768322,
+ "grad_norm": 2.5730628967285156,
+ "learning_rate": 3.92930996230065e-06,
+ "loss": 0.5446,
+ "step": 3956
+ },
+ {
+ "epoch": 1.870921985815603,
+ "grad_norm": 2.7053353786468506,
+ "learning_rate": 3.9287981014424334e-06,
+ "loss": 0.4722,
+ "step": 3957
+ },
+ {
+ "epoch": 1.8713947990543736,
+ "grad_norm": 2.7591893672943115,
+ "learning_rate": 3.928286151619224e-06,
+ "loss": 0.509,
+ "step": 3958
+ },
+ {
+ "epoch": 1.8718676122931441,
+ "grad_norm": 2.6233739852905273,
+ "learning_rate": 3.927774112862898e-06,
+ "loss": 0.5266,
+ "step": 3959
+ },
+ {
+ "epoch": 1.872340425531915,
+ "grad_norm": 2.7715370655059814,
+ "learning_rate": 3.9272619852053396e-06,
+ "loss": 0.5612,
+ "step": 3960
+ },
+ {
+ "epoch": 1.8728132387706857,
+ "grad_norm": 2.4815211296081543,
+ "learning_rate": 3.926749768678435e-06,
+ "loss": 0.5498,
+ "step": 3961
+ },
+ {
+ "epoch": 1.8732860520094563,
+ "grad_norm": 2.6819605827331543,
+ "learning_rate": 3.926237463314078e-06,
+ "loss": 0.5499,
+ "step": 3962
+ },
+ {
+ "epoch": 1.8737588652482269,
+ "grad_norm": 2.638664722442627,
+ "learning_rate": 3.925725069144168e-06,
+ "loss": 0.5429,
+ "step": 3963
+ },
+ {
+ "epoch": 1.8742316784869977,
+ "grad_norm": 2.527294874191284,
+ "learning_rate": 3.925212586200611e-06,
+ "loss": 0.5451,
+ "step": 3964
+ },
+ {
+ "epoch": 1.8747044917257685,
+ "grad_norm": 2.831638813018799,
+ "learning_rate": 3.924700014515315e-06,
+ "loss": 0.5276,
+ "step": 3965
+ },
+ {
+ "epoch": 1.875177304964539,
+ "grad_norm": 2.5906996726989746,
+ "learning_rate": 3.924187354120196e-06,
+ "loss": 0.5323,
+ "step": 3966
+ },
+ {
+ "epoch": 1.8756501182033096,
+ "grad_norm": 2.5482442378997803,
+ "learning_rate": 3.923674605047175e-06,
+ "loss": 0.4882,
+ "step": 3967
+ },
+ {
+ "epoch": 1.8761229314420804,
+ "grad_norm": 2.56402850151062,
+ "learning_rate": 3.923161767328179e-06,
+ "loss": 0.5111,
+ "step": 3968
+ },
+ {
+ "epoch": 1.8765957446808512,
+ "grad_norm": 3.223782539367676,
+ "learning_rate": 3.9226488409951405e-06,
+ "loss": 0.5829,
+ "step": 3969
+ },
+ {
+ "epoch": 1.8770685579196218,
+ "grad_norm": 2.665964365005493,
+ "learning_rate": 3.922135826079997e-06,
+ "loss": 0.4739,
+ "step": 3970
+ },
+ {
+ "epoch": 1.8775413711583924,
+ "grad_norm": 2.602696418762207,
+ "learning_rate": 3.921622722614691e-06,
+ "loss": 0.5199,
+ "step": 3971
+ },
+ {
+ "epoch": 1.8780141843971632,
+ "grad_norm": 2.5384418964385986,
+ "learning_rate": 3.921109530631172e-06,
+ "loss": 0.5086,
+ "step": 3972
+ },
+ {
+ "epoch": 1.878486997635934,
+ "grad_norm": 2.7961080074310303,
+ "learning_rate": 3.920596250161394e-06,
+ "loss": 0.5454,
+ "step": 3973
+ },
+ {
+ "epoch": 1.8789598108747045,
+ "grad_norm": 3.022007465362549,
+ "learning_rate": 3.920082881237317e-06,
+ "loss": 0.5537,
+ "step": 3974
+ },
+ {
+ "epoch": 1.8794326241134751,
+ "grad_norm": 2.699885129928589,
+ "learning_rate": 3.9195694238909045e-06,
+ "loss": 0.5274,
+ "step": 3975
+ },
+ {
+ "epoch": 1.879905437352246,
+ "grad_norm": 2.3994593620300293,
+ "learning_rate": 3.919055878154129e-06,
+ "loss": 0.4134,
+ "step": 3976
+ },
+ {
+ "epoch": 1.8803782505910167,
+ "grad_norm": 4.093045711517334,
+ "learning_rate": 3.918542244058967e-06,
+ "loss": 0.5305,
+ "step": 3977
+ },
+ {
+ "epoch": 1.8808510638297873,
+ "grad_norm": 3.011643171310425,
+ "learning_rate": 3.9180285216374e-06,
+ "loss": 0.5481,
+ "step": 3978
+ },
+ {
+ "epoch": 1.8813238770685579,
+ "grad_norm": 2.6426854133605957,
+ "learning_rate": 3.917514710921414e-06,
+ "loss": 0.5415,
+ "step": 3979
+ },
+ {
+ "epoch": 1.8817966903073287,
+ "grad_norm": 2.4379019737243652,
+ "learning_rate": 3.917000811943002e-06,
+ "loss": 0.4566,
+ "step": 3980
+ },
+ {
+ "epoch": 1.8822695035460995,
+ "grad_norm": 3.18522047996521,
+ "learning_rate": 3.9164868247341634e-06,
+ "loss": 0.6079,
+ "step": 3981
+ },
+ {
+ "epoch": 1.88274231678487,
+ "grad_norm": 2.6451141834259033,
+ "learning_rate": 3.915972749326903e-06,
+ "loss": 0.515,
+ "step": 3982
+ },
+ {
+ "epoch": 1.8832151300236406,
+ "grad_norm": 2.565598726272583,
+ "learning_rate": 3.915458585753226e-06,
+ "loss": 0.4799,
+ "step": 3983
+ },
+ {
+ "epoch": 1.8836879432624114,
+ "grad_norm": 2.711651563644409,
+ "learning_rate": 3.91494433404515e-06,
+ "loss": 0.5595,
+ "step": 3984
+ },
+ {
+ "epoch": 1.8841607565011822,
+ "grad_norm": 2.749328851699829,
+ "learning_rate": 3.914429994234695e-06,
+ "loss": 0.495,
+ "step": 3985
+ },
+ {
+ "epoch": 1.8846335697399526,
+ "grad_norm": 2.9492287635803223,
+ "learning_rate": 3.913915566353886e-06,
+ "loss": 0.5683,
+ "step": 3986
+ },
+ {
+ "epoch": 1.8851063829787233,
+ "grad_norm": 3.07747745513916,
+ "learning_rate": 3.913401050434756e-06,
+ "loss": 0.4953,
+ "step": 3987
+ },
+ {
+ "epoch": 1.8855791962174941,
+ "grad_norm": 2.8746345043182373,
+ "learning_rate": 3.912886446509338e-06,
+ "loss": 0.4752,
+ "step": 3988
+ },
+ {
+ "epoch": 1.8860520094562647,
+ "grad_norm": 2.772954225540161,
+ "learning_rate": 3.912371754609677e-06,
+ "loss": 0.5473,
+ "step": 3989
+ },
+ {
+ "epoch": 1.8865248226950353,
+ "grad_norm": 2.8906044960021973,
+ "learning_rate": 3.911856974767821e-06,
+ "loss": 0.5285,
+ "step": 3990
+ },
+ {
+ "epoch": 1.886997635933806,
+ "grad_norm": 2.8992726802825928,
+ "learning_rate": 3.9113421070158206e-06,
+ "loss": 0.571,
+ "step": 3991
+ },
+ {
+ "epoch": 1.887470449172577,
+ "grad_norm": 2.624662160873413,
+ "learning_rate": 3.910827151385737e-06,
+ "loss": 0.5183,
+ "step": 3992
+ },
+ {
+ "epoch": 1.8879432624113475,
+ "grad_norm": 2.4491732120513916,
+ "learning_rate": 3.910312107909632e-06,
+ "loss": 0.4205,
+ "step": 3993
+ },
+ {
+ "epoch": 1.888416075650118,
+ "grad_norm": 2.278259515762329,
+ "learning_rate": 3.909796976619575e-06,
+ "loss": 0.4464,
+ "step": 3994
+ },
+ {
+ "epoch": 1.8888888888888888,
+ "grad_norm": 2.6481523513793945,
+ "learning_rate": 3.909281757547644e-06,
+ "loss": 0.5023,
+ "step": 3995
+ },
+ {
+ "epoch": 1.8893617021276596,
+ "grad_norm": 2.6687493324279785,
+ "learning_rate": 3.908766450725917e-06,
+ "loss": 0.495,
+ "step": 3996
+ },
+ {
+ "epoch": 1.8898345153664302,
+ "grad_norm": 2.507525682449341,
+ "learning_rate": 3.908251056186481e-06,
+ "loss": 0.4155,
+ "step": 3997
+ },
+ {
+ "epoch": 1.8903073286052008,
+ "grad_norm": 2.7048323154449463,
+ "learning_rate": 3.907735573961426e-06,
+ "loss": 0.4601,
+ "step": 3998
+ },
+ {
+ "epoch": 1.8907801418439716,
+ "grad_norm": 2.6825389862060547,
+ "learning_rate": 3.907220004082848e-06,
+ "loss": 0.5067,
+ "step": 3999
+ },
+ {
+ "epoch": 1.8912529550827424,
+ "grad_norm": 2.775696039199829,
+ "learning_rate": 3.906704346582852e-06,
+ "loss": 0.5411,
+ "step": 4000
+ },
+ {
+ "epoch": 1.891725768321513,
+ "grad_norm": 2.4492077827453613,
+ "learning_rate": 3.906188601493545e-06,
+ "loss": 0.4931,
+ "step": 4001
+ },
+ {
+ "epoch": 1.8921985815602835,
+ "grad_norm": 2.320810556411743,
+ "learning_rate": 3.905672768847041e-06,
+ "loss": 0.4908,
+ "step": 4002
+ },
+ {
+ "epoch": 1.8926713947990543,
+ "grad_norm": 2.455162525177002,
+ "learning_rate": 3.905156848675455e-06,
+ "loss": 0.508,
+ "step": 4003
+ },
+ {
+ "epoch": 1.8931442080378251,
+ "grad_norm": 2.515921115875244,
+ "learning_rate": 3.904640841010915e-06,
+ "loss": 0.5318,
+ "step": 4004
+ },
+ {
+ "epoch": 1.8936170212765957,
+ "grad_norm": 2.7230770587921143,
+ "learning_rate": 3.904124745885548e-06,
+ "loss": 0.4793,
+ "step": 4005
+ },
+ {
+ "epoch": 1.8940898345153663,
+ "grad_norm": 2.519934892654419,
+ "learning_rate": 3.903608563331491e-06,
+ "loss": 0.5013,
+ "step": 4006
+ },
+ {
+ "epoch": 1.894562647754137,
+ "grad_norm": 2.719674587249756,
+ "learning_rate": 3.903092293380883e-06,
+ "loss": 0.516,
+ "step": 4007
+ },
+ {
+ "epoch": 1.8950354609929079,
+ "grad_norm": 3.2107343673706055,
+ "learning_rate": 3.902575936065869e-06,
+ "loss": 0.6297,
+ "step": 4008
+ },
+ {
+ "epoch": 1.8955082742316784,
+ "grad_norm": 2.9773149490356445,
+ "learning_rate": 3.902059491418603e-06,
+ "loss": 0.566,
+ "step": 4009
+ },
+ {
+ "epoch": 1.895981087470449,
+ "grad_norm": 2.6754770278930664,
+ "learning_rate": 3.90154295947124e-06,
+ "loss": 0.5187,
+ "step": 4010
+ },
+ {
+ "epoch": 1.8964539007092198,
+ "grad_norm": 2.457303762435913,
+ "learning_rate": 3.901026340255943e-06,
+ "loss": 0.5757,
+ "step": 4011
+ },
+ {
+ "epoch": 1.8969267139479906,
+ "grad_norm": 2.5944161415100098,
+ "learning_rate": 3.900509633804878e-06,
+ "loss": 0.5049,
+ "step": 4012
+ },
+ {
+ "epoch": 1.8973995271867612,
+ "grad_norm": 2.610445022583008,
+ "learning_rate": 3.89999284015022e-06,
+ "loss": 0.521,
+ "step": 4013
+ },
+ {
+ "epoch": 1.8978723404255318,
+ "grad_norm": 2.6949338912963867,
+ "learning_rate": 3.899475959324146e-06,
+ "loss": 0.5619,
+ "step": 4014
+ },
+ {
+ "epoch": 1.8983451536643026,
+ "grad_norm": 2.7889559268951416,
+ "learning_rate": 3.898958991358841e-06,
+ "loss": 0.5223,
+ "step": 4015
+ },
+ {
+ "epoch": 1.8988179669030734,
+ "grad_norm": 2.569265842437744,
+ "learning_rate": 3.898441936286493e-06,
+ "loss": 0.5724,
+ "step": 4016
+ },
+ {
+ "epoch": 1.899290780141844,
+ "grad_norm": 2.3567774295806885,
+ "learning_rate": 3.897924794139299e-06,
+ "loss": 0.4784,
+ "step": 4017
+ },
+ {
+ "epoch": 1.8997635933806145,
+ "grad_norm": 2.9176526069641113,
+ "learning_rate": 3.897407564949457e-06,
+ "loss": 0.646,
+ "step": 4018
+ },
+ {
+ "epoch": 1.9002364066193853,
+ "grad_norm": 2.7870090007781982,
+ "learning_rate": 3.896890248749174e-06,
+ "loss": 0.4922,
+ "step": 4019
+ },
+ {
+ "epoch": 1.900709219858156,
+ "grad_norm": 2.8310980796813965,
+ "learning_rate": 3.89637284557066e-06,
+ "loss": 0.4746,
+ "step": 4020
+ },
+ {
+ "epoch": 1.9011820330969267,
+ "grad_norm": 2.434915542602539,
+ "learning_rate": 3.895855355446131e-06,
+ "loss": 0.4537,
+ "step": 4021
+ },
+ {
+ "epoch": 1.9016548463356973,
+ "grad_norm": 3.0547034740448,
+ "learning_rate": 3.89533777840781e-06,
+ "loss": 0.6161,
+ "step": 4022
+ },
+ {
+ "epoch": 1.902127659574468,
+ "grad_norm": 3.416774272918701,
+ "learning_rate": 3.894820114487925e-06,
+ "loss": 0.5448,
+ "step": 4023
+ },
+ {
+ "epoch": 1.9026004728132389,
+ "grad_norm": 2.606951951980591,
+ "learning_rate": 3.894302363718707e-06,
+ "loss": 0.5501,
+ "step": 4024
+ },
+ {
+ "epoch": 1.9030732860520094,
+ "grad_norm": 3.082165002822876,
+ "learning_rate": 3.8937845261323945e-06,
+ "loss": 0.6035,
+ "step": 4025
+ },
+ {
+ "epoch": 1.90354609929078,
+ "grad_norm": 2.616093397140503,
+ "learning_rate": 3.893266601761231e-06,
+ "loss": 0.5294,
+ "step": 4026
+ },
+ {
+ "epoch": 1.9040189125295508,
+ "grad_norm": 2.7141637802124023,
+ "learning_rate": 3.8927485906374654e-06,
+ "loss": 0.5481,
+ "step": 4027
+ },
+ {
+ "epoch": 1.9044917257683216,
+ "grad_norm": 2.5129404067993164,
+ "learning_rate": 3.892230492793352e-06,
+ "loss": 0.4958,
+ "step": 4028
+ },
+ {
+ "epoch": 1.9049645390070922,
+ "grad_norm": 2.703403949737549,
+ "learning_rate": 3.891712308261151e-06,
+ "loss": 0.4852,
+ "step": 4029
+ },
+ {
+ "epoch": 1.9054373522458627,
+ "grad_norm": 2.881058931350708,
+ "learning_rate": 3.891194037073127e-06,
+ "loss": 0.4662,
+ "step": 4030
+ },
+ {
+ "epoch": 1.9059101654846335,
+ "grad_norm": 3.216769218444824,
+ "learning_rate": 3.8906756792615505e-06,
+ "loss": 0.5076,
+ "step": 4031
+ },
+ {
+ "epoch": 1.9063829787234043,
+ "grad_norm": 2.442265748977661,
+ "learning_rate": 3.890157234858697e-06,
+ "loss": 0.4748,
+ "step": 4032
+ },
+ {
+ "epoch": 1.906855791962175,
+ "grad_norm": 3.088672399520874,
+ "learning_rate": 3.889638703896849e-06,
+ "loss": 0.5729,
+ "step": 4033
+ },
+ {
+ "epoch": 1.9073286052009455,
+ "grad_norm": 2.9304986000061035,
+ "learning_rate": 3.889120086408291e-06,
+ "loss": 0.603,
+ "step": 4034
+ },
+ {
+ "epoch": 1.9078014184397163,
+ "grad_norm": 2.686093807220459,
+ "learning_rate": 3.888601382425318e-06,
+ "loss": 0.4978,
+ "step": 4035
+ },
+ {
+ "epoch": 1.908274231678487,
+ "grad_norm": 2.5668389797210693,
+ "learning_rate": 3.888082591980225e-06,
+ "loss": 0.5086,
+ "step": 4036
+ },
+ {
+ "epoch": 1.9087470449172577,
+ "grad_norm": 2.530996561050415,
+ "learning_rate": 3.887563715105315e-06,
+ "loss": 0.4678,
+ "step": 4037
+ },
+ {
+ "epoch": 1.9092198581560282,
+ "grad_norm": 3.043342351913452,
+ "learning_rate": 3.887044751832897e-06,
+ "loss": 0.5452,
+ "step": 4038
+ },
+ {
+ "epoch": 1.909692671394799,
+ "grad_norm": 2.799734115600586,
+ "learning_rate": 3.886525702195284e-06,
+ "loss": 0.5265,
+ "step": 4039
+ },
+ {
+ "epoch": 1.9101654846335698,
+ "grad_norm": 2.890022039413452,
+ "learning_rate": 3.886006566224796e-06,
+ "loss": 0.4634,
+ "step": 4040
+ },
+ {
+ "epoch": 1.9106382978723404,
+ "grad_norm": 2.6804237365722656,
+ "learning_rate": 3.8854873439537555e-06,
+ "loss": 0.5031,
+ "step": 4041
+ },
+ {
+ "epoch": 1.911111111111111,
+ "grad_norm": 2.43038272857666,
+ "learning_rate": 3.884968035414495e-06,
+ "loss": 0.5098,
+ "step": 4042
+ },
+ {
+ "epoch": 1.9115839243498818,
+ "grad_norm": 2.589583396911621,
+ "learning_rate": 3.884448640639346e-06,
+ "loss": 0.498,
+ "step": 4043
+ },
+ {
+ "epoch": 1.9120567375886526,
+ "grad_norm": 2.4565231800079346,
+ "learning_rate": 3.8839291596606524e-06,
+ "loss": 0.4318,
+ "step": 4044
+ },
+ {
+ "epoch": 1.9125295508274232,
+ "grad_norm": 2.66762638092041,
+ "learning_rate": 3.8834095925107575e-06,
+ "loss": 0.5441,
+ "step": 4045
+ },
+ {
+ "epoch": 1.9130023640661937,
+ "grad_norm": 2.7334461212158203,
+ "learning_rate": 3.882889939222013e-06,
+ "loss": 0.5209,
+ "step": 4046
+ },
+ {
+ "epoch": 1.9134751773049645,
+ "grad_norm": 2.6398537158966064,
+ "learning_rate": 3.8823701998267765e-06,
+ "loss": 0.4874,
+ "step": 4047
+ },
+ {
+ "epoch": 1.9139479905437353,
+ "grad_norm": 2.82405161857605,
+ "learning_rate": 3.881850374357409e-06,
+ "loss": 0.4519,
+ "step": 4048
+ },
+ {
+ "epoch": 1.914420803782506,
+ "grad_norm": 2.7552523612976074,
+ "learning_rate": 3.8813304628462776e-06,
+ "loss": 0.547,
+ "step": 4049
+ },
+ {
+ "epoch": 1.9148936170212765,
+ "grad_norm": 2.5287928581237793,
+ "learning_rate": 3.880810465325755e-06,
+ "loss": 0.5226,
+ "step": 4050
+ },
+ {
+ "epoch": 1.9153664302600473,
+ "grad_norm": 2.7597358226776123,
+ "learning_rate": 3.88029038182822e-06,
+ "loss": 0.5171,
+ "step": 4051
+ },
+ {
+ "epoch": 1.915839243498818,
+ "grad_norm": 2.563899278640747,
+ "learning_rate": 3.879770212386055e-06,
+ "loss": 0.4911,
+ "step": 4052
+ },
+ {
+ "epoch": 1.9163120567375886,
+ "grad_norm": 2.499404191970825,
+ "learning_rate": 3.879249957031649e-06,
+ "loss": 0.5072,
+ "step": 4053
+ },
+ {
+ "epoch": 1.9167848699763592,
+ "grad_norm": 2.817713499069214,
+ "learning_rate": 3.878729615797396e-06,
+ "loss": 0.5452,
+ "step": 4054
+ },
+ {
+ "epoch": 1.91725768321513,
+ "grad_norm": 2.7152490615844727,
+ "learning_rate": 3.878209188715696e-06,
+ "loss": 0.4917,
+ "step": 4055
+ },
+ {
+ "epoch": 1.9177304964539008,
+ "grad_norm": 2.384265661239624,
+ "learning_rate": 3.877688675818953e-06,
+ "loss": 0.4823,
+ "step": 4056
+ },
+ {
+ "epoch": 1.9182033096926714,
+ "grad_norm": 2.61059308052063,
+ "learning_rate": 3.877168077139577e-06,
+ "loss": 0.478,
+ "step": 4057
+ },
+ {
+ "epoch": 1.918676122931442,
+ "grad_norm": 2.6107938289642334,
+ "learning_rate": 3.8766473927099824e-06,
+ "loss": 0.5202,
+ "step": 4058
+ },
+ {
+ "epoch": 1.9191489361702128,
+ "grad_norm": 2.2339766025543213,
+ "learning_rate": 3.876126622562592e-06,
+ "loss": 0.547,
+ "step": 4059
+ },
+ {
+ "epoch": 1.9196217494089836,
+ "grad_norm": 2.4324610233306885,
+ "learning_rate": 3.8756057667298304e-06,
+ "loss": 0.5333,
+ "step": 4060
+ },
+ {
+ "epoch": 1.9200945626477541,
+ "grad_norm": 2.5521230697631836,
+ "learning_rate": 3.875084825244131e-06,
+ "loss": 0.5503,
+ "step": 4061
+ },
+ {
+ "epoch": 1.9205673758865247,
+ "grad_norm": 2.6985747814178467,
+ "learning_rate": 3.874563798137928e-06,
+ "loss": 0.4944,
+ "step": 4062
+ },
+ {
+ "epoch": 1.9210401891252955,
+ "grad_norm": 2.422332525253296,
+ "learning_rate": 3.874042685443664e-06,
+ "loss": 0.4807,
+ "step": 4063
+ },
+ {
+ "epoch": 1.9215130023640663,
+ "grad_norm": 2.914553165435791,
+ "learning_rate": 3.873521487193788e-06,
+ "loss": 0.4439,
+ "step": 4064
+ },
+ {
+ "epoch": 1.9219858156028369,
+ "grad_norm": 2.8098697662353516,
+ "learning_rate": 3.873000203420752e-06,
+ "loss": 0.5433,
+ "step": 4065
+ },
+ {
+ "epoch": 1.9224586288416075,
+ "grad_norm": 2.6124703884124756,
+ "learning_rate": 3.872478834157013e-06,
+ "loss": 0.4812,
+ "step": 4066
+ },
+ {
+ "epoch": 1.9229314420803783,
+ "grad_norm": 2.511059522628784,
+ "learning_rate": 3.871957379435035e-06,
+ "loss": 0.4666,
+ "step": 4067
+ },
+ {
+ "epoch": 1.923404255319149,
+ "grad_norm": 2.950542688369751,
+ "learning_rate": 3.871435839287287e-06,
+ "loss": 0.5687,
+ "step": 4068
+ },
+ {
+ "epoch": 1.9238770685579196,
+ "grad_norm": 2.4969422817230225,
+ "learning_rate": 3.870914213746243e-06,
+ "loss": 0.5235,
+ "step": 4069
+ },
+ {
+ "epoch": 1.9243498817966902,
+ "grad_norm": 2.512152910232544,
+ "learning_rate": 3.870392502844382e-06,
+ "loss": 0.4524,
+ "step": 4070
+ },
+ {
+ "epoch": 1.924822695035461,
+ "grad_norm": 3.0212557315826416,
+ "learning_rate": 3.86987070661419e-06,
+ "loss": 0.4868,
+ "step": 4071
+ },
+ {
+ "epoch": 1.9252955082742318,
+ "grad_norm": 2.8949966430664062,
+ "learning_rate": 3.869348825088154e-06,
+ "loss": 0.5556,
+ "step": 4072
+ },
+ {
+ "epoch": 1.9257683215130024,
+ "grad_norm": 2.402043581008911,
+ "learning_rate": 3.868826858298772e-06,
+ "loss": 0.5307,
+ "step": 4073
+ },
+ {
+ "epoch": 1.926241134751773,
+ "grad_norm": 2.980992078781128,
+ "learning_rate": 3.868304806278543e-06,
+ "loss": 0.6313,
+ "step": 4074
+ },
+ {
+ "epoch": 1.9267139479905437,
+ "grad_norm": 2.7140514850616455,
+ "learning_rate": 3.867782669059975e-06,
+ "loss": 0.5359,
+ "step": 4075
+ },
+ {
+ "epoch": 1.9271867612293145,
+ "grad_norm": 2.499631643295288,
+ "learning_rate": 3.867260446675577e-06,
+ "loss": 0.4873,
+ "step": 4076
+ },
+ {
+ "epoch": 1.9276595744680851,
+ "grad_norm": 2.915583610534668,
+ "learning_rate": 3.866738139157866e-06,
+ "loss": 0.5736,
+ "step": 4077
+ },
+ {
+ "epoch": 1.9281323877068557,
+ "grad_norm": 2.4231131076812744,
+ "learning_rate": 3.866215746539363e-06,
+ "loss": 0.5096,
+ "step": 4078
+ },
+ {
+ "epoch": 1.9286052009456265,
+ "grad_norm": 2.360074996948242,
+ "learning_rate": 3.865693268852599e-06,
+ "loss": 0.4907,
+ "step": 4079
+ },
+ {
+ "epoch": 1.9290780141843973,
+ "grad_norm": 2.5410032272338867,
+ "learning_rate": 3.865170706130101e-06,
+ "loss": 0.473,
+ "step": 4080
+ },
+ {
+ "epoch": 1.9295508274231679,
+ "grad_norm": 2.780090808868408,
+ "learning_rate": 3.86464805840441e-06,
+ "loss": 0.5213,
+ "step": 4081
+ },
+ {
+ "epoch": 1.9300236406619384,
+ "grad_norm": 2.7318382263183594,
+ "learning_rate": 3.864125325708068e-06,
+ "loss": 0.5617,
+ "step": 4082
+ },
+ {
+ "epoch": 1.9304964539007092,
+ "grad_norm": 2.76509165763855,
+ "learning_rate": 3.863602508073623e-06,
+ "loss": 0.52,
+ "step": 4083
+ },
+ {
+ "epoch": 1.93096926713948,
+ "grad_norm": 2.8041110038757324,
+ "learning_rate": 3.863079605533631e-06,
+ "loss": 0.5343,
+ "step": 4084
+ },
+ {
+ "epoch": 1.9314420803782506,
+ "grad_norm": 2.4462404251098633,
+ "learning_rate": 3.862556618120647e-06,
+ "loss": 0.4657,
+ "step": 4085
+ },
+ {
+ "epoch": 1.9319148936170212,
+ "grad_norm": 2.460864305496216,
+ "learning_rate": 3.862033545867238e-06,
+ "loss": 0.517,
+ "step": 4086
+ },
+ {
+ "epoch": 1.932387706855792,
+ "grad_norm": 2.6480276584625244,
+ "learning_rate": 3.8615103888059715e-06,
+ "loss": 0.4702,
+ "step": 4087
+ },
+ {
+ "epoch": 1.9328605200945628,
+ "grad_norm": 2.7175381183624268,
+ "learning_rate": 3.860987146969424e-06,
+ "loss": 0.5073,
+ "step": 4088
+ },
+ {
+ "epoch": 1.9333333333333333,
+ "grad_norm": 2.4963486194610596,
+ "learning_rate": 3.860463820390175e-06,
+ "loss": 0.4491,
+ "step": 4089
+ },
+ {
+ "epoch": 1.933806146572104,
+ "grad_norm": 2.548135757446289,
+ "learning_rate": 3.8599404091008075e-06,
+ "loss": 0.5134,
+ "step": 4090
+ },
+ {
+ "epoch": 1.9342789598108747,
+ "grad_norm": 2.8693668842315674,
+ "learning_rate": 3.859416913133916e-06,
+ "loss": 0.5467,
+ "step": 4091
+ },
+ {
+ "epoch": 1.9347517730496455,
+ "grad_norm": 2.711273670196533,
+ "learning_rate": 3.858893332522092e-06,
+ "loss": 0.6287,
+ "step": 4092
+ },
+ {
+ "epoch": 1.935224586288416,
+ "grad_norm": 2.8604533672332764,
+ "learning_rate": 3.858369667297941e-06,
+ "loss": 0.5661,
+ "step": 4093
+ },
+ {
+ "epoch": 1.9356973995271867,
+ "grad_norm": 2.936988353729248,
+ "learning_rate": 3.857845917494066e-06,
+ "loss": 0.5311,
+ "step": 4094
+ },
+ {
+ "epoch": 1.9361702127659575,
+ "grad_norm": 2.414093494415283,
+ "learning_rate": 3.857322083143079e-06,
+ "loss": 0.505,
+ "step": 4095
+ },
+ {
+ "epoch": 1.9366430260047283,
+ "grad_norm": 2.5528934001922607,
+ "learning_rate": 3.856798164277599e-06,
+ "loss": 0.4759,
+ "step": 4096
+ },
+ {
+ "epoch": 1.9371158392434988,
+ "grad_norm": 2.592893600463867,
+ "learning_rate": 3.8562741609302456e-06,
+ "loss": 0.4932,
+ "step": 4097
+ },
+ {
+ "epoch": 1.9375886524822694,
+ "grad_norm": 2.9619107246398926,
+ "learning_rate": 3.855750073133648e-06,
+ "loss": 0.5563,
+ "step": 4098
+ },
+ {
+ "epoch": 1.9380614657210402,
+ "grad_norm": 2.864889621734619,
+ "learning_rate": 3.855225900920438e-06,
+ "loss": 0.5069,
+ "step": 4099
+ },
+ {
+ "epoch": 1.938534278959811,
+ "grad_norm": 2.3951032161712646,
+ "learning_rate": 3.854701644323253e-06,
+ "loss": 0.4883,
+ "step": 4100
+ },
+ {
+ "epoch": 1.9390070921985816,
+ "grad_norm": 2.6339633464813232,
+ "learning_rate": 3.854177303374737e-06,
+ "loss": 0.5207,
+ "step": 4101
+ },
+ {
+ "epoch": 1.9394799054373522,
+ "grad_norm": 2.6435508728027344,
+ "learning_rate": 3.853652878107539e-06,
+ "loss": 0.4679,
+ "step": 4102
+ },
+ {
+ "epoch": 1.939952718676123,
+ "grad_norm": 2.4635629653930664,
+ "learning_rate": 3.853128368554311e-06,
+ "loss": 0.5639,
+ "step": 4103
+ },
+ {
+ "epoch": 1.9404255319148938,
+ "grad_norm": 2.664635419845581,
+ "learning_rate": 3.852603774747714e-06,
+ "loss": 0.5697,
+ "step": 4104
+ },
+ {
+ "epoch": 1.9408983451536643,
+ "grad_norm": 2.7020363807678223,
+ "learning_rate": 3.8520790967204095e-06,
+ "loss": 0.5462,
+ "step": 4105
+ },
+ {
+ "epoch": 1.941371158392435,
+ "grad_norm": 3.529282331466675,
+ "learning_rate": 3.851554334505069e-06,
+ "loss": 0.54,
+ "step": 4106
+ },
+ {
+ "epoch": 1.9418439716312057,
+ "grad_norm": 2.7125768661499023,
+ "learning_rate": 3.851029488134367e-06,
+ "loss": 0.5355,
+ "step": 4107
+ },
+ {
+ "epoch": 1.9423167848699765,
+ "grad_norm": 2.5226643085479736,
+ "learning_rate": 3.850504557640981e-06,
+ "loss": 0.5106,
+ "step": 4108
+ },
+ {
+ "epoch": 1.942789598108747,
+ "grad_norm": 2.834352731704712,
+ "learning_rate": 3.8499795430575995e-06,
+ "loss": 0.6069,
+ "step": 4109
+ },
+ {
+ "epoch": 1.9432624113475176,
+ "grad_norm": 2.8484177589416504,
+ "learning_rate": 3.849454444416911e-06,
+ "loss": 0.5542,
+ "step": 4110
+ },
+ {
+ "epoch": 1.9437352245862884,
+ "grad_norm": 2.402539014816284,
+ "learning_rate": 3.848929261751612e-06,
+ "loss": 0.47,
+ "step": 4111
+ },
+ {
+ "epoch": 1.9442080378250592,
+ "grad_norm": 2.7010042667388916,
+ "learning_rate": 3.848403995094402e-06,
+ "loss": 0.5263,
+ "step": 4112
+ },
+ {
+ "epoch": 1.9446808510638298,
+ "grad_norm": 2.441689968109131,
+ "learning_rate": 3.847878644477988e-06,
+ "loss": 0.5607,
+ "step": 4113
+ },
+ {
+ "epoch": 1.9451536643026004,
+ "grad_norm": 2.5994722843170166,
+ "learning_rate": 3.847353209935081e-06,
+ "loss": 0.5103,
+ "step": 4114
+ },
+ {
+ "epoch": 1.9456264775413712,
+ "grad_norm": 2.452242136001587,
+ "learning_rate": 3.8468276914983975e-06,
+ "loss": 0.4409,
+ "step": 4115
+ },
+ {
+ "epoch": 1.946099290780142,
+ "grad_norm": 2.421023368835449,
+ "learning_rate": 3.84630208920066e-06,
+ "loss": 0.4429,
+ "step": 4116
+ },
+ {
+ "epoch": 1.9465721040189126,
+ "grad_norm": 2.696399688720703,
+ "learning_rate": 3.8457764030745945e-06,
+ "loss": 0.5352,
+ "step": 4117
+ },
+ {
+ "epoch": 1.9470449172576831,
+ "grad_norm": 2.3963489532470703,
+ "learning_rate": 3.845250633152933e-06,
+ "loss": 0.4505,
+ "step": 4118
+ },
+ {
+ "epoch": 1.947517730496454,
+ "grad_norm": 2.610649585723877,
+ "learning_rate": 3.8447247794684135e-06,
+ "loss": 0.501,
+ "step": 4119
+ },
+ {
+ "epoch": 1.9479905437352247,
+ "grad_norm": 2.740412712097168,
+ "learning_rate": 3.8441988420537775e-06,
+ "loss": 0.5362,
+ "step": 4120
+ },
+ {
+ "epoch": 1.9484633569739953,
+ "grad_norm": 2.2614004611968994,
+ "learning_rate": 3.8436728209417755e-06,
+ "loss": 0.4199,
+ "step": 4121
+ },
+ {
+ "epoch": 1.9489361702127659,
+ "grad_norm": 3.0683481693267822,
+ "learning_rate": 3.843146716165158e-06,
+ "loss": 0.5248,
+ "step": 4122
+ },
+ {
+ "epoch": 1.9494089834515367,
+ "grad_norm": 3.005174398422241,
+ "learning_rate": 3.842620527756684e-06,
+ "loss": 0.5246,
+ "step": 4123
+ },
+ {
+ "epoch": 1.9498817966903075,
+ "grad_norm": 2.672896385192871,
+ "learning_rate": 3.842094255749117e-06,
+ "loss": 0.5586,
+ "step": 4124
+ },
+ {
+ "epoch": 1.950354609929078,
+ "grad_norm": 2.5481197834014893,
+ "learning_rate": 3.8415679001752255e-06,
+ "loss": 0.5061,
+ "step": 4125
+ },
+ {
+ "epoch": 1.9508274231678486,
+ "grad_norm": 2.515789270401001,
+ "learning_rate": 3.8410414610677835e-06,
+ "loss": 0.4645,
+ "step": 4126
+ },
+ {
+ "epoch": 1.9513002364066194,
+ "grad_norm": 2.7236077785491943,
+ "learning_rate": 3.84051493845957e-06,
+ "loss": 0.5623,
+ "step": 4127
+ },
+ {
+ "epoch": 1.9517730496453902,
+ "grad_norm": 2.6252009868621826,
+ "learning_rate": 3.839988332383369e-06,
+ "loss": 0.5078,
+ "step": 4128
+ },
+ {
+ "epoch": 1.9522458628841608,
+ "grad_norm": 2.719196081161499,
+ "learning_rate": 3.83946164287197e-06,
+ "loss": 0.5481,
+ "step": 4129
+ },
+ {
+ "epoch": 1.9527186761229314,
+ "grad_norm": 2.484163284301758,
+ "learning_rate": 3.838934869958169e-06,
+ "loss": 0.5332,
+ "step": 4130
+ },
+ {
+ "epoch": 1.9531914893617022,
+ "grad_norm": 2.615382671356201,
+ "learning_rate": 3.838408013674764e-06,
+ "loss": 0.4742,
+ "step": 4131
+ },
+ {
+ "epoch": 1.953664302600473,
+ "grad_norm": 2.735321044921875,
+ "learning_rate": 3.83788107405456e-06,
+ "loss": 0.421,
+ "step": 4132
+ },
+ {
+ "epoch": 1.9541371158392435,
+ "grad_norm": 2.892652750015259,
+ "learning_rate": 3.837354051130369e-06,
+ "loss": 0.5326,
+ "step": 4133
+ },
+ {
+ "epoch": 1.9546099290780141,
+ "grad_norm": 2.6800546646118164,
+ "learning_rate": 3.8368269449350055e-06,
+ "loss": 0.5041,
+ "step": 4134
+ },
+ {
+ "epoch": 1.955082742316785,
+ "grad_norm": 2.362470865249634,
+ "learning_rate": 3.836299755501289e-06,
+ "loss": 0.4697,
+ "step": 4135
+ },
+ {
+ "epoch": 1.9555555555555557,
+ "grad_norm": 2.3855135440826416,
+ "learning_rate": 3.835772482862047e-06,
+ "loss": 0.5148,
+ "step": 4136
+ },
+ {
+ "epoch": 1.9560283687943263,
+ "grad_norm": 2.3338418006896973,
+ "learning_rate": 3.83524512705011e-06,
+ "loss": 0.4643,
+ "step": 4137
+ },
+ {
+ "epoch": 1.9565011820330969,
+ "grad_norm": 2.261355400085449,
+ "learning_rate": 3.834717688098313e-06,
+ "loss": 0.5573,
+ "step": 4138
+ },
+ {
+ "epoch": 1.9569739952718677,
+ "grad_norm": 2.8166391849517822,
+ "learning_rate": 3.834190166039498e-06,
+ "loss": 0.4868,
+ "step": 4139
+ },
+ {
+ "epoch": 1.9574468085106385,
+ "grad_norm": 2.4155869483947754,
+ "learning_rate": 3.833662560906512e-06,
+ "loss": 0.4923,
+ "step": 4140
+ },
+ {
+ "epoch": 1.957919621749409,
+ "grad_norm": 2.3977696895599365,
+ "learning_rate": 3.833134872732206e-06,
+ "loss": 0.5106,
+ "step": 4141
+ },
+ {
+ "epoch": 1.9583924349881796,
+ "grad_norm": 2.9541378021240234,
+ "learning_rate": 3.832607101549438e-06,
+ "loss": 0.4683,
+ "step": 4142
+ },
+ {
+ "epoch": 1.9588652482269504,
+ "grad_norm": 2.5862700939178467,
+ "learning_rate": 3.832079247391068e-06,
+ "loss": 0.4453,
+ "step": 4143
+ },
+ {
+ "epoch": 1.9593380614657212,
+ "grad_norm": 2.7459371089935303,
+ "learning_rate": 3.8315513102899644e-06,
+ "loss": 0.5511,
+ "step": 4144
+ },
+ {
+ "epoch": 1.9598108747044918,
+ "grad_norm": 2.904869556427002,
+ "learning_rate": 3.831023290279e-06,
+ "loss": 0.5348,
+ "step": 4145
+ },
+ {
+ "epoch": 1.9602836879432624,
+ "grad_norm": 3.092846632003784,
+ "learning_rate": 3.830495187391051e-06,
+ "loss": 0.5664,
+ "step": 4146
+ },
+ {
+ "epoch": 1.9607565011820332,
+ "grad_norm": 3.2838528156280518,
+ "learning_rate": 3.829967001659001e-06,
+ "loss": 0.5115,
+ "step": 4147
+ },
+ {
+ "epoch": 1.961229314420804,
+ "grad_norm": 2.7799549102783203,
+ "learning_rate": 3.829438733115738e-06,
+ "loss": 0.5145,
+ "step": 4148
+ },
+ {
+ "epoch": 1.9617021276595743,
+ "grad_norm": 2.436084270477295,
+ "learning_rate": 3.828910381794154e-06,
+ "loss": 0.4718,
+ "step": 4149
+ },
+ {
+ "epoch": 1.962174940898345,
+ "grad_norm": 2.6662371158599854,
+ "learning_rate": 3.828381947727148e-06,
+ "loss": 0.6129,
+ "step": 4150
+ },
+ {
+ "epoch": 1.962647754137116,
+ "grad_norm": 2.937000036239624,
+ "learning_rate": 3.827853430947622e-06,
+ "loss": 0.522,
+ "step": 4151
+ },
+ {
+ "epoch": 1.9631205673758865,
+ "grad_norm": 2.5737369060516357,
+ "learning_rate": 3.827324831488486e-06,
+ "loss": 0.4916,
+ "step": 4152
+ },
+ {
+ "epoch": 1.963593380614657,
+ "grad_norm": 2.70232892036438,
+ "learning_rate": 3.826796149382653e-06,
+ "loss": 0.4726,
+ "step": 4153
+ },
+ {
+ "epoch": 1.9640661938534278,
+ "grad_norm": 2.6899707317352295,
+ "learning_rate": 3.826267384663042e-06,
+ "loss": 0.529,
+ "step": 4154
+ },
+ {
+ "epoch": 1.9645390070921986,
+ "grad_norm": 2.6142728328704834,
+ "learning_rate": 3.825738537362575e-06,
+ "loss": 0.4999,
+ "step": 4155
+ },
+ {
+ "epoch": 1.9650118203309692,
+ "grad_norm": 2.43949818611145,
+ "learning_rate": 3.825209607514183e-06,
+ "loss": 0.5035,
+ "step": 4156
+ },
+ {
+ "epoch": 1.9654846335697398,
+ "grad_norm": 2.3735458850860596,
+ "learning_rate": 3.824680595150801e-06,
+ "loss": 0.4779,
+ "step": 4157
+ },
+ {
+ "epoch": 1.9659574468085106,
+ "grad_norm": 2.444307565689087,
+ "learning_rate": 3.824151500305365e-06,
+ "loss": 0.4825,
+ "step": 4158
+ },
+ {
+ "epoch": 1.9664302600472814,
+ "grad_norm": 2.8219668865203857,
+ "learning_rate": 3.8236223230108224e-06,
+ "loss": 0.5354,
+ "step": 4159
+ },
+ {
+ "epoch": 1.966903073286052,
+ "grad_norm": 2.720721483230591,
+ "learning_rate": 3.823093063300121e-06,
+ "loss": 0.5064,
+ "step": 4160
+ },
+ {
+ "epoch": 1.9673758865248225,
+ "grad_norm": 2.324190616607666,
+ "learning_rate": 3.822563721206217e-06,
+ "loss": 0.5348,
+ "step": 4161
+ },
+ {
+ "epoch": 1.9678486997635933,
+ "grad_norm": 2.702155351638794,
+ "learning_rate": 3.8220342967620695e-06,
+ "loss": 0.5388,
+ "step": 4162
+ },
+ {
+ "epoch": 1.9683215130023641,
+ "grad_norm": 2.4956369400024414,
+ "learning_rate": 3.821504790000642e-06,
+ "loss": 0.5071,
+ "step": 4163
+ },
+ {
+ "epoch": 1.9687943262411347,
+ "grad_norm": 2.568039655685425,
+ "learning_rate": 3.820975200954906e-06,
+ "loss": 0.5133,
+ "step": 4164
+ },
+ {
+ "epoch": 1.9692671394799053,
+ "grad_norm": 2.810868978500366,
+ "learning_rate": 3.820445529657837e-06,
+ "loss": 0.4856,
+ "step": 4165
+ },
+ {
+ "epoch": 1.969739952718676,
+ "grad_norm": 2.66365647315979,
+ "learning_rate": 3.819915776142415e-06,
+ "loss": 0.5235,
+ "step": 4166
+ },
+ {
+ "epoch": 1.9702127659574469,
+ "grad_norm": 2.2982139587402344,
+ "learning_rate": 3.8193859404416265e-06,
+ "loss": 0.4361,
+ "step": 4167
+ },
+ {
+ "epoch": 1.9706855791962175,
+ "grad_norm": 2.585672378540039,
+ "learning_rate": 3.818856022588458e-06,
+ "loss": 0.4842,
+ "step": 4168
+ },
+ {
+ "epoch": 1.971158392434988,
+ "grad_norm": 2.57857346534729,
+ "learning_rate": 3.81832602261591e-06,
+ "loss": 0.5249,
+ "step": 4169
+ },
+ {
+ "epoch": 1.9716312056737588,
+ "grad_norm": 2.6947224140167236,
+ "learning_rate": 3.817795940556981e-06,
+ "loss": 0.5234,
+ "step": 4170
+ },
+ {
+ "epoch": 1.9721040189125296,
+ "grad_norm": 2.7453415393829346,
+ "learning_rate": 3.8172657764446764e-06,
+ "loss": 0.5219,
+ "step": 4171
+ },
+ {
+ "epoch": 1.9725768321513002,
+ "grad_norm": 8.424073219299316,
+ "learning_rate": 3.816735530312009e-06,
+ "loss": 0.5162,
+ "step": 4172
+ },
+ {
+ "epoch": 1.9730496453900708,
+ "grad_norm": 2.8229739665985107,
+ "learning_rate": 3.816205202191993e-06,
+ "loss": 0.4621,
+ "step": 4173
+ },
+ {
+ "epoch": 1.9735224586288416,
+ "grad_norm": 2.5969009399414062,
+ "learning_rate": 3.815674792117651e-06,
+ "loss": 0.5044,
+ "step": 4174
+ },
+ {
+ "epoch": 1.9739952718676124,
+ "grad_norm": 2.646024227142334,
+ "learning_rate": 3.815144300122009e-06,
+ "loss": 0.5094,
+ "step": 4175
+ },
+ {
+ "epoch": 1.974468085106383,
+ "grad_norm": 2.4950616359710693,
+ "learning_rate": 3.814613726238097e-06,
+ "loss": 0.4827,
+ "step": 4176
+ },
+ {
+ "epoch": 1.9749408983451535,
+ "grad_norm": 2.5636119842529297,
+ "learning_rate": 3.8140830704989535e-06,
+ "loss": 0.5241,
+ "step": 4177
+ },
+ {
+ "epoch": 1.9754137115839243,
+ "grad_norm": 2.7936553955078125,
+ "learning_rate": 3.813552332937619e-06,
+ "loss": 0.5344,
+ "step": 4178
+ },
+ {
+ "epoch": 1.9758865248226951,
+ "grad_norm": 2.8085341453552246,
+ "learning_rate": 3.8130215135871405e-06,
+ "loss": 0.5647,
+ "step": 4179
+ },
+ {
+ "epoch": 1.9763593380614657,
+ "grad_norm": 2.4776322841644287,
+ "learning_rate": 3.8124906124805694e-06,
+ "loss": 0.542,
+ "step": 4180
+ },
+ {
+ "epoch": 1.9768321513002363,
+ "grad_norm": 2.3227856159210205,
+ "learning_rate": 3.8119596296509635e-06,
+ "loss": 0.4618,
+ "step": 4181
+ },
+ {
+ "epoch": 1.977304964539007,
+ "grad_norm": 2.5157814025878906,
+ "learning_rate": 3.8114285651313848e-06,
+ "loss": 0.538,
+ "step": 4182
+ },
+ {
+ "epoch": 1.9777777777777779,
+ "grad_norm": 2.5630218982696533,
+ "learning_rate": 3.8108974189548987e-06,
+ "loss": 0.5254,
+ "step": 4183
+ },
+ {
+ "epoch": 1.9782505910165484,
+ "grad_norm": 2.703237533569336,
+ "learning_rate": 3.8103661911545787e-06,
+ "loss": 0.4859,
+ "step": 4184
+ },
+ {
+ "epoch": 1.978723404255319,
+ "grad_norm": 2.8808000087738037,
+ "learning_rate": 3.809834881763502e-06,
+ "loss": 0.5585,
+ "step": 4185
+ },
+ {
+ "epoch": 1.9791962174940898,
+ "grad_norm": 2.9047577381134033,
+ "learning_rate": 3.8093034908147507e-06,
+ "loss": 0.5022,
+ "step": 4186
+ },
+ {
+ "epoch": 1.9796690307328606,
+ "grad_norm": 2.7417640686035156,
+ "learning_rate": 3.8087720183414125e-06,
+ "loss": 0.5275,
+ "step": 4187
+ },
+ {
+ "epoch": 1.9801418439716312,
+ "grad_norm": 2.952012062072754,
+ "learning_rate": 3.8082404643765786e-06,
+ "loss": 0.543,
+ "step": 4188
+ },
+ {
+ "epoch": 1.9806146572104018,
+ "grad_norm": 2.538376569747925,
+ "learning_rate": 3.807708828953348e-06,
+ "loss": 0.4969,
+ "step": 4189
+ },
+ {
+ "epoch": 1.9810874704491725,
+ "grad_norm": 2.3476181030273438,
+ "learning_rate": 3.807177112104823e-06,
+ "loss": 0.4979,
+ "step": 4190
+ },
+ {
+ "epoch": 1.9815602836879433,
+ "grad_norm": 2.6480464935302734,
+ "learning_rate": 3.80664531386411e-06,
+ "loss": 0.4894,
+ "step": 4191
+ },
+ {
+ "epoch": 1.982033096926714,
+ "grad_norm": 2.792916774749756,
+ "learning_rate": 3.8061134342643235e-06,
+ "loss": 0.5468,
+ "step": 4192
+ },
+ {
+ "epoch": 1.9825059101654845,
+ "grad_norm": 2.368736743927002,
+ "learning_rate": 3.805581473338581e-06,
+ "loss": 0.4672,
+ "step": 4193
+ },
+ {
+ "epoch": 1.9829787234042553,
+ "grad_norm": 2.379084348678589,
+ "learning_rate": 3.8050494311200037e-06,
+ "loss": 0.4577,
+ "step": 4194
+ },
+ {
+ "epoch": 1.983451536643026,
+ "grad_norm": 2.722471237182617,
+ "learning_rate": 3.804517307641722e-06,
+ "loss": 0.4988,
+ "step": 4195
+ },
+ {
+ "epoch": 1.9839243498817967,
+ "grad_norm": 2.356649875640869,
+ "learning_rate": 3.8039851029368674e-06,
+ "loss": 0.4933,
+ "step": 4196
+ },
+ {
+ "epoch": 1.9843971631205672,
+ "grad_norm": 2.9182281494140625,
+ "learning_rate": 3.8034528170385776e-06,
+ "loss": 0.4873,
+ "step": 4197
+ },
+ {
+ "epoch": 1.984869976359338,
+ "grad_norm": 2.6232199668884277,
+ "learning_rate": 3.8029204499799976e-06,
+ "loss": 0.4425,
+ "step": 4198
+ },
+ {
+ "epoch": 1.9853427895981088,
+ "grad_norm": 2.667541980743408,
+ "learning_rate": 3.802388001794274e-06,
+ "loss": 0.5022,
+ "step": 4199
+ },
+ {
+ "epoch": 1.9858156028368794,
+ "grad_norm": 3.168470621109009,
+ "learning_rate": 3.8018554725145596e-06,
+ "loss": 0.5505,
+ "step": 4200
+ },
+ {
+ "epoch": 1.98628841607565,
+ "grad_norm": 2.716625452041626,
+ "learning_rate": 3.8013228621740132e-06,
+ "loss": 0.4937,
+ "step": 4201
+ },
+ {
+ "epoch": 1.9867612293144208,
+ "grad_norm": 2.3014442920684814,
+ "learning_rate": 3.800790170805799e-06,
+ "loss": 0.4734,
+ "step": 4202
+ },
+ {
+ "epoch": 1.9872340425531916,
+ "grad_norm": 2.9426841735839844,
+ "learning_rate": 3.8002573984430847e-06,
+ "loss": 0.4983,
+ "step": 4203
+ },
+ {
+ "epoch": 1.9877068557919622,
+ "grad_norm": 2.5598278045654297,
+ "learning_rate": 3.7997245451190435e-06,
+ "loss": 0.4834,
+ "step": 4204
+ },
+ {
+ "epoch": 1.9881796690307327,
+ "grad_norm": 2.86458420753479,
+ "learning_rate": 3.7991916108668538e-06,
+ "loss": 0.5613,
+ "step": 4205
+ },
+ {
+ "epoch": 1.9886524822695035,
+ "grad_norm": 2.842914342880249,
+ "learning_rate": 3.7986585957196997e-06,
+ "loss": 0.4951,
+ "step": 4206
+ },
+ {
+ "epoch": 1.9891252955082743,
+ "grad_norm": 3.1828150749206543,
+ "learning_rate": 3.7981254997107686e-06,
+ "loss": 0.5913,
+ "step": 4207
+ },
+ {
+ "epoch": 1.989598108747045,
+ "grad_norm": 2.5765931606292725,
+ "learning_rate": 3.7975923228732547e-06,
+ "loss": 0.5544,
+ "step": 4208
+ },
+ {
+ "epoch": 1.9900709219858155,
+ "grad_norm": 2.492234945297241,
+ "learning_rate": 3.797059065240357e-06,
+ "loss": 0.5046,
+ "step": 4209
+ },
+ {
+ "epoch": 1.9905437352245863,
+ "grad_norm": 2.870346784591675,
+ "learning_rate": 3.7965257268452795e-06,
+ "loss": 0.5354,
+ "step": 4210
+ },
+ {
+ "epoch": 1.991016548463357,
+ "grad_norm": 2.4989993572235107,
+ "learning_rate": 3.795992307721229e-06,
+ "loss": 0.4677,
+ "step": 4211
+ },
+ {
+ "epoch": 1.9914893617021276,
+ "grad_norm": 2.931114673614502,
+ "learning_rate": 3.7954588079014206e-06,
+ "loss": 0.5504,
+ "step": 4212
+ },
+ {
+ "epoch": 1.9919621749408982,
+ "grad_norm": 2.5247652530670166,
+ "learning_rate": 3.794925227419073e-06,
+ "loss": 0.4736,
+ "step": 4213
+ },
+ {
+ "epoch": 1.992434988179669,
+ "grad_norm": 2.6238436698913574,
+ "learning_rate": 3.794391566307409e-06,
+ "loss": 0.4591,
+ "step": 4214
+ },
+ {
+ "epoch": 1.9929078014184398,
+ "grad_norm": 2.654886245727539,
+ "learning_rate": 3.7938578245996584e-06,
+ "loss": 0.5149,
+ "step": 4215
+ },
+ {
+ "epoch": 1.9933806146572104,
+ "grad_norm": 2.509164810180664,
+ "learning_rate": 3.793324002329054e-06,
+ "loss": 0.4951,
+ "step": 4216
+ },
+ {
+ "epoch": 1.993853427895981,
+ "grad_norm": 2.909632921218872,
+ "learning_rate": 3.7927900995288345e-06,
+ "loss": 0.5131,
+ "step": 4217
+ },
+ {
+ "epoch": 1.9943262411347518,
+ "grad_norm": 2.4354615211486816,
+ "learning_rate": 3.7922561162322456e-06,
+ "loss": 0.4716,
+ "step": 4218
+ },
+ {
+ "epoch": 1.9947990543735226,
+ "grad_norm": 2.6514649391174316,
+ "learning_rate": 3.791722052472534e-06,
+ "loss": 0.5714,
+ "step": 4219
+ },
+ {
+ "epoch": 1.9952718676122931,
+ "grad_norm": 2.77089262008667,
+ "learning_rate": 3.791187908282954e-06,
+ "loss": 0.5736,
+ "step": 4220
+ },
+ {
+ "epoch": 1.9957446808510637,
+ "grad_norm": 2.7651021480560303,
+ "learning_rate": 3.7906536836967657e-06,
+ "loss": 0.4948,
+ "step": 4221
+ },
+ {
+ "epoch": 1.9962174940898345,
+ "grad_norm": 2.7536795139312744,
+ "learning_rate": 3.7901193787472306e-06,
+ "loss": 0.512,
+ "step": 4222
+ },
+ {
+ "epoch": 1.9966903073286053,
+ "grad_norm": 2.684893846511841,
+ "learning_rate": 3.78958499346762e-06,
+ "loss": 0.5118,
+ "step": 4223
+ },
+ {
+ "epoch": 1.9971631205673759,
+ "grad_norm": 2.7616753578186035,
+ "learning_rate": 3.7890505278912054e-06,
+ "loss": 0.4516,
+ "step": 4224
+ },
+ {
+ "epoch": 1.9976359338061465,
+ "grad_norm": 2.4731967449188232,
+ "learning_rate": 3.7885159820512666e-06,
+ "loss": 0.4736,
+ "step": 4225
+ },
+ {
+ "epoch": 1.9981087470449173,
+ "grad_norm": 2.366631031036377,
+ "learning_rate": 3.7879813559810884e-06,
+ "loss": 0.4999,
+ "step": 4226
+ },
+ {
+ "epoch": 1.998581560283688,
+ "grad_norm": 2.994624137878418,
+ "learning_rate": 3.7874466497139582e-06,
+ "loss": 0.5273,
+ "step": 4227
+ },
+ {
+ "epoch": 1.9990543735224586,
+ "grad_norm": 2.4499242305755615,
+ "learning_rate": 3.7869118632831712e-06,
+ "loss": 0.5761,
+ "step": 4228
+ },
+ {
+ "epoch": 1.9995271867612292,
+ "grad_norm": 2.3370113372802734,
+ "learning_rate": 3.7863769967220243e-06,
+ "loss": 0.4673,
+ "step": 4229
+ },
+ {
+ "epoch": 2.0,
+ "grad_norm": 3.1131203174591064,
+ "learning_rate": 3.7858420500638236e-06,
+ "loss": 0.5118,
+ "step": 4230
+ },
+ {
+ "epoch": 2.000472813238771,
+ "grad_norm": 2.2747561931610107,
+ "learning_rate": 3.785307023341876e-06,
+ "loss": 0.4166,
+ "step": 4231
+ },
+ {
+ "epoch": 2.000945626477541,
+ "grad_norm": 2.4347424507141113,
+ "learning_rate": 3.7847719165894963e-06,
+ "loss": 0.4161,
+ "step": 4232
+ },
+ {
+ "epoch": 2.001418439716312,
+ "grad_norm": 2.398805618286133,
+ "learning_rate": 3.784236729840003e-06,
+ "loss": 0.4652,
+ "step": 4233
+ },
+ {
+ "epoch": 2.0018912529550827,
+ "grad_norm": 2.1904916763305664,
+ "learning_rate": 3.783701463126719e-06,
+ "loss": 0.4554,
+ "step": 4234
+ },
+ {
+ "epoch": 2.0023640661938535,
+ "grad_norm": 2.237330913543701,
+ "learning_rate": 3.7831661164829735e-06,
+ "loss": 0.4471,
+ "step": 4235
+ },
+ {
+ "epoch": 2.002836879432624,
+ "grad_norm": 2.3656628131866455,
+ "learning_rate": 3.7826306899421016e-06,
+ "loss": 0.4052,
+ "step": 4236
+ },
+ {
+ "epoch": 2.0033096926713947,
+ "grad_norm": 2.615489959716797,
+ "learning_rate": 3.7820951835374405e-06,
+ "loss": 0.4847,
+ "step": 4237
+ },
+ {
+ "epoch": 2.0037825059101655,
+ "grad_norm": 2.453036308288574,
+ "learning_rate": 3.7815595973023347e-06,
+ "loss": 0.4672,
+ "step": 4238
+ },
+ {
+ "epoch": 2.0042553191489363,
+ "grad_norm": 2.537468671798706,
+ "learning_rate": 3.7810239312701306e-06,
+ "loss": 0.467,
+ "step": 4239
+ },
+ {
+ "epoch": 2.0047281323877066,
+ "grad_norm": 2.3321666717529297,
+ "learning_rate": 3.780488185474184e-06,
+ "loss": 0.3557,
+ "step": 4240
+ },
+ {
+ "epoch": 2.0052009456264774,
+ "grad_norm": 2.9051828384399414,
+ "learning_rate": 3.779952359947854e-06,
+ "loss": 0.5474,
+ "step": 4241
+ },
+ {
+ "epoch": 2.0056737588652482,
+ "grad_norm": 2.7458817958831787,
+ "learning_rate": 3.7794164547245015e-06,
+ "loss": 0.4659,
+ "step": 4242
+ },
+ {
+ "epoch": 2.006146572104019,
+ "grad_norm": 2.627046585083008,
+ "learning_rate": 3.778880469837497e-06,
+ "loss": 0.4179,
+ "step": 4243
+ },
+ {
+ "epoch": 2.0066193853427894,
+ "grad_norm": 2.4186174869537354,
+ "learning_rate": 3.7783444053202135e-06,
+ "loss": 0.3976,
+ "step": 4244
+ },
+ {
+ "epoch": 2.00709219858156,
+ "grad_norm": 3.109376907348633,
+ "learning_rate": 3.7778082612060296e-06,
+ "loss": 0.4095,
+ "step": 4245
+ },
+ {
+ "epoch": 2.007565011820331,
+ "grad_norm": 2.583376169204712,
+ "learning_rate": 3.7772720375283282e-06,
+ "loss": 0.4325,
+ "step": 4246
+ },
+ {
+ "epoch": 2.0080378250591018,
+ "grad_norm": 2.6199896335601807,
+ "learning_rate": 3.776735734320497e-06,
+ "loss": 0.4207,
+ "step": 4247
+ },
+ {
+ "epoch": 2.008510638297872,
+ "grad_norm": 2.545353651046753,
+ "learning_rate": 3.77619935161593e-06,
+ "loss": 0.4483,
+ "step": 4248
+ },
+ {
+ "epoch": 2.008983451536643,
+ "grad_norm": 2.770266056060791,
+ "learning_rate": 3.7756628894480263e-06,
+ "loss": 0.457,
+ "step": 4249
+ },
+ {
+ "epoch": 2.0094562647754137,
+ "grad_norm": 2.903254985809326,
+ "learning_rate": 3.7751263478501878e-06,
+ "loss": 0.4171,
+ "step": 4250
+ },
+ {
+ "epoch": 2.0099290780141845,
+ "grad_norm": 2.5576963424682617,
+ "learning_rate": 3.774589726855822e-06,
+ "loss": 0.3631,
+ "step": 4251
+ },
+ {
+ "epoch": 2.010401891252955,
+ "grad_norm": 3.7584285736083984,
+ "learning_rate": 3.7740530264983434e-06,
+ "loss": 0.4827,
+ "step": 4252
+ },
+ {
+ "epoch": 2.0108747044917257,
+ "grad_norm": 3.3116581439971924,
+ "learning_rate": 3.77351624681117e-06,
+ "loss": 0.5071,
+ "step": 4253
+ },
+ {
+ "epoch": 2.0113475177304965,
+ "grad_norm": 3.1370885372161865,
+ "learning_rate": 3.772979387827723e-06,
+ "loss": 0.4963,
+ "step": 4254
+ },
+ {
+ "epoch": 2.0118203309692673,
+ "grad_norm": 2.4832639694213867,
+ "learning_rate": 3.772442449581432e-06,
+ "loss": 0.4442,
+ "step": 4255
+ },
+ {
+ "epoch": 2.0122931442080376,
+ "grad_norm": 2.7645785808563232,
+ "learning_rate": 3.7719054321057293e-06,
+ "loss": 0.4572,
+ "step": 4256
+ },
+ {
+ "epoch": 2.0127659574468084,
+ "grad_norm": 2.7962236404418945,
+ "learning_rate": 3.7713683354340515e-06,
+ "loss": 0.4906,
+ "step": 4257
+ },
+ {
+ "epoch": 2.013238770685579,
+ "grad_norm": 2.647991895675659,
+ "learning_rate": 3.7708311595998425e-06,
+ "loss": 0.4027,
+ "step": 4258
+ },
+ {
+ "epoch": 2.01371158392435,
+ "grad_norm": 2.3780267238616943,
+ "learning_rate": 3.7702939046365504e-06,
+ "loss": 0.4285,
+ "step": 4259
+ },
+ {
+ "epoch": 2.0141843971631204,
+ "grad_norm": 2.5185933113098145,
+ "learning_rate": 3.7697565705776266e-06,
+ "loss": 0.4834,
+ "step": 4260
+ },
+ {
+ "epoch": 2.014657210401891,
+ "grad_norm": 2.432507276535034,
+ "learning_rate": 3.7692191574565294e-06,
+ "loss": 0.3695,
+ "step": 4261
+ },
+ {
+ "epoch": 2.015130023640662,
+ "grad_norm": 2.8010706901550293,
+ "learning_rate": 3.76868166530672e-06,
+ "loss": 0.478,
+ "step": 4262
+ },
+ {
+ "epoch": 2.0156028368794328,
+ "grad_norm": 2.32817006111145,
+ "learning_rate": 3.768144094161666e-06,
+ "loss": 0.4154,
+ "step": 4263
+ },
+ {
+ "epoch": 2.016075650118203,
+ "grad_norm": 3.062812328338623,
+ "learning_rate": 3.7676064440548405e-06,
+ "loss": 0.5015,
+ "step": 4264
+ },
+ {
+ "epoch": 2.016548463356974,
+ "grad_norm": 2.6129536628723145,
+ "learning_rate": 3.7670687150197194e-06,
+ "loss": 0.3843,
+ "step": 4265
+ },
+ {
+ "epoch": 2.0170212765957447,
+ "grad_norm": 2.838259696960449,
+ "learning_rate": 3.766530907089786e-06,
+ "loss": 0.4937,
+ "step": 4266
+ },
+ {
+ "epoch": 2.0174940898345155,
+ "grad_norm": 2.601203680038452,
+ "learning_rate": 3.7659930202985263e-06,
+ "loss": 0.4644,
+ "step": 4267
+ },
+ {
+ "epoch": 2.017966903073286,
+ "grad_norm": 2.5964133739471436,
+ "learning_rate": 3.7654550546794322e-06,
+ "loss": 0.4365,
+ "step": 4268
+ },
+ {
+ "epoch": 2.0184397163120567,
+ "grad_norm": 3.0028915405273438,
+ "learning_rate": 3.764917010266001e-06,
+ "loss": 0.434,
+ "step": 4269
+ },
+ {
+ "epoch": 2.0189125295508275,
+ "grad_norm": 2.719252586364746,
+ "learning_rate": 3.764378887091734e-06,
+ "loss": 0.4401,
+ "step": 4270
+ },
+ {
+ "epoch": 2.0193853427895982,
+ "grad_norm": 2.400254011154175,
+ "learning_rate": 3.7638406851901377e-06,
+ "loss": 0.4904,
+ "step": 4271
+ },
+ {
+ "epoch": 2.0198581560283686,
+ "grad_norm": 2.8015363216400146,
+ "learning_rate": 3.763302404594724e-06,
+ "loss": 0.4569,
+ "step": 4272
+ },
+ {
+ "epoch": 2.0203309692671394,
+ "grad_norm": 2.718416452407837,
+ "learning_rate": 3.762764045339009e-06,
+ "loss": 0.5124,
+ "step": 4273
+ },
+ {
+ "epoch": 2.02080378250591,
+ "grad_norm": 2.484049081802368,
+ "learning_rate": 3.762225607456514e-06,
+ "loss": 0.4255,
+ "step": 4274
+ },
+ {
+ "epoch": 2.021276595744681,
+ "grad_norm": 2.6377930641174316,
+ "learning_rate": 3.7616870909807645e-06,
+ "loss": 0.5044,
+ "step": 4275
+ },
+ {
+ "epoch": 2.0217494089834513,
+ "grad_norm": 2.8845038414001465,
+ "learning_rate": 3.7611484959452927e-06,
+ "loss": 0.4924,
+ "step": 4276
+ },
+ {
+ "epoch": 2.022222222222222,
+ "grad_norm": 2.5939974784851074,
+ "learning_rate": 3.7606098223836342e-06,
+ "loss": 0.4873,
+ "step": 4277
+ },
+ {
+ "epoch": 2.022695035460993,
+ "grad_norm": 2.499826431274414,
+ "learning_rate": 3.76007107032933e-06,
+ "loss": 0.4515,
+ "step": 4278
+ },
+ {
+ "epoch": 2.0231678486997637,
+ "grad_norm": 3.0318663120269775,
+ "learning_rate": 3.759532239815924e-06,
+ "loss": 0.4901,
+ "step": 4279
+ },
+ {
+ "epoch": 2.023640661938534,
+ "grad_norm": 2.857977867126465,
+ "learning_rate": 3.758993330876969e-06,
+ "loss": 0.4659,
+ "step": 4280
+ },
+ {
+ "epoch": 2.024113475177305,
+ "grad_norm": 2.47918438911438,
+ "learning_rate": 3.7584543435460196e-06,
+ "loss": 0.4323,
+ "step": 4281
+ },
+ {
+ "epoch": 2.0245862884160757,
+ "grad_norm": 2.6033785343170166,
+ "learning_rate": 3.757915277856637e-06,
+ "loss": 0.4437,
+ "step": 4282
+ },
+ {
+ "epoch": 2.0250591016548465,
+ "grad_norm": 2.799781322479248,
+ "learning_rate": 3.757376133842386e-06,
+ "loss": 0.4523,
+ "step": 4283
+ },
+ {
+ "epoch": 2.025531914893617,
+ "grad_norm": 2.6092529296875,
+ "learning_rate": 3.756836911536836e-06,
+ "loss": 0.3898,
+ "step": 4284
+ },
+ {
+ "epoch": 2.0260047281323876,
+ "grad_norm": 2.66229248046875,
+ "learning_rate": 3.7562976109735627e-06,
+ "loss": 0.4731,
+ "step": 4285
+ },
+ {
+ "epoch": 2.0264775413711584,
+ "grad_norm": 2.90142822265625,
+ "learning_rate": 3.7557582321861463e-06,
+ "loss": 0.4285,
+ "step": 4286
+ },
+ {
+ "epoch": 2.0269503546099292,
+ "grad_norm": 2.5138802528381348,
+ "learning_rate": 3.7552187752081707e-06,
+ "loss": 0.4467,
+ "step": 4287
+ },
+ {
+ "epoch": 2.0274231678486996,
+ "grad_norm": 3.0656235218048096,
+ "learning_rate": 3.754679240073226e-06,
+ "loss": 0.4718,
+ "step": 4288
+ },
+ {
+ "epoch": 2.0278959810874704,
+ "grad_norm": 2.9633383750915527,
+ "learning_rate": 3.754139626814907e-06,
+ "loss": 0.4741,
+ "step": 4289
+ },
+ {
+ "epoch": 2.028368794326241,
+ "grad_norm": 2.5925145149230957,
+ "learning_rate": 3.753599935466812e-06,
+ "loss": 0.4281,
+ "step": 4290
+ },
+ {
+ "epoch": 2.028841607565012,
+ "grad_norm": 2.837740659713745,
+ "learning_rate": 3.7530601660625456e-06,
+ "loss": 0.4757,
+ "step": 4291
+ },
+ {
+ "epoch": 2.0293144208037823,
+ "grad_norm": 2.3995790481567383,
+ "learning_rate": 3.752520318635718e-06,
+ "loss": 0.4148,
+ "step": 4292
+ },
+ {
+ "epoch": 2.029787234042553,
+ "grad_norm": 2.572601795196533,
+ "learning_rate": 3.7519803932199424e-06,
+ "loss": 0.4051,
+ "step": 4293
+ },
+ {
+ "epoch": 2.030260047281324,
+ "grad_norm": 2.6780295372009277,
+ "learning_rate": 3.751440389848837e-06,
+ "loss": 0.4626,
+ "step": 4294
+ },
+ {
+ "epoch": 2.0307328605200947,
+ "grad_norm": 2.8666839599609375,
+ "learning_rate": 3.7509003085560257e-06,
+ "loss": 0.4255,
+ "step": 4295
+ },
+ {
+ "epoch": 2.031205673758865,
+ "grad_norm": 2.4398207664489746,
+ "learning_rate": 3.750360149375138e-06,
+ "loss": 0.4235,
+ "step": 4296
+ },
+ {
+ "epoch": 2.031678486997636,
+ "grad_norm": 2.436840534210205,
+ "learning_rate": 3.7498199123398062e-06,
+ "loss": 0.3907,
+ "step": 4297
+ },
+ {
+ "epoch": 2.0321513002364067,
+ "grad_norm": 3.3945820331573486,
+ "learning_rate": 3.7492795974836683e-06,
+ "loss": 0.465,
+ "step": 4298
+ },
+ {
+ "epoch": 2.0326241134751775,
+ "grad_norm": 2.6693103313446045,
+ "learning_rate": 3.7487392048403678e-06,
+ "loss": 0.4948,
+ "step": 4299
+ },
+ {
+ "epoch": 2.033096926713948,
+ "grad_norm": 2.7642734050750732,
+ "learning_rate": 3.748198734443553e-06,
+ "loss": 0.4538,
+ "step": 4300
+ },
+ {
+ "epoch": 2.0335697399527186,
+ "grad_norm": 3.1436543464660645,
+ "learning_rate": 3.747658186326876e-06,
+ "loss": 0.5137,
+ "step": 4301
+ },
+ {
+ "epoch": 2.0340425531914894,
+ "grad_norm": 3.482678174972534,
+ "learning_rate": 3.7471175605239947e-06,
+ "loss": 0.4982,
+ "step": 4302
+ },
+ {
+ "epoch": 2.03451536643026,
+ "grad_norm": 2.712557077407837,
+ "learning_rate": 3.746576857068571e-06,
+ "loss": 0.4459,
+ "step": 4303
+ },
+ {
+ "epoch": 2.0349881796690306,
+ "grad_norm": 3.147440195083618,
+ "learning_rate": 3.7460360759942726e-06,
+ "loss": 0.5063,
+ "step": 4304
+ },
+ {
+ "epoch": 2.0354609929078014,
+ "grad_norm": 2.840672492980957,
+ "learning_rate": 3.7454952173347714e-06,
+ "loss": 0.5041,
+ "step": 4305
+ },
+ {
+ "epoch": 2.035933806146572,
+ "grad_norm": 2.584122657775879,
+ "learning_rate": 3.744954281123745e-06,
+ "loss": 0.4487,
+ "step": 4306
+ },
+ {
+ "epoch": 2.036406619385343,
+ "grad_norm": 2.9869542121887207,
+ "learning_rate": 3.7444132673948737e-06,
+ "loss": 0.479,
+ "step": 4307
+ },
+ {
+ "epoch": 2.0368794326241133,
+ "grad_norm": 2.478459358215332,
+ "learning_rate": 3.7438721761818446e-06,
+ "loss": 0.4636,
+ "step": 4308
+ },
+ {
+ "epoch": 2.037352245862884,
+ "grad_norm": 2.5524215698242188,
+ "learning_rate": 3.7433310075183504e-06,
+ "loss": 0.4601,
+ "step": 4309
+ },
+ {
+ "epoch": 2.037825059101655,
+ "grad_norm": 2.3709988594055176,
+ "learning_rate": 3.742789761438086e-06,
+ "loss": 0.4163,
+ "step": 4310
+ },
+ {
+ "epoch": 2.0382978723404257,
+ "grad_norm": 3.140355348587036,
+ "learning_rate": 3.742248437974752e-06,
+ "loss": 0.4433,
+ "step": 4311
+ },
+ {
+ "epoch": 2.038770685579196,
+ "grad_norm": 2.940948486328125,
+ "learning_rate": 3.741707037162055e-06,
+ "loss": 0.4299,
+ "step": 4312
+ },
+ {
+ "epoch": 2.039243498817967,
+ "grad_norm": 3.009157419204712,
+ "learning_rate": 3.7411655590337055e-06,
+ "loss": 0.463,
+ "step": 4313
+ },
+ {
+ "epoch": 2.0397163120567376,
+ "grad_norm": 2.672945737838745,
+ "learning_rate": 3.7406240036234185e-06,
+ "loss": 0.4696,
+ "step": 4314
+ },
+ {
+ "epoch": 2.0401891252955084,
+ "grad_norm": 2.745962142944336,
+ "learning_rate": 3.740082370964916e-06,
+ "loss": 0.4931,
+ "step": 4315
+ },
+ {
+ "epoch": 2.040661938534279,
+ "grad_norm": 2.3939316272735596,
+ "learning_rate": 3.7395406610919217e-06,
+ "loss": 0.4396,
+ "step": 4316
+ },
+ {
+ "epoch": 2.0411347517730496,
+ "grad_norm": 2.4364447593688965,
+ "learning_rate": 3.738998874038165e-06,
+ "loss": 0.4807,
+ "step": 4317
+ },
+ {
+ "epoch": 2.0416075650118204,
+ "grad_norm": 2.360489845275879,
+ "learning_rate": 3.738457009837381e-06,
+ "loss": 0.4426,
+ "step": 4318
+ },
+ {
+ "epoch": 2.042080378250591,
+ "grad_norm": 2.5494935512542725,
+ "learning_rate": 3.7379150685233108e-06,
+ "loss": 0.4189,
+ "step": 4319
+ },
+ {
+ "epoch": 2.0425531914893615,
+ "grad_norm": 2.635472059249878,
+ "learning_rate": 3.7373730501296963e-06,
+ "loss": 0.5014,
+ "step": 4320
+ },
+ {
+ "epoch": 2.0430260047281323,
+ "grad_norm": 2.4982943534851074,
+ "learning_rate": 3.7368309546902876e-06,
+ "loss": 0.4658,
+ "step": 4321
+ },
+ {
+ "epoch": 2.043498817966903,
+ "grad_norm": 2.692742109298706,
+ "learning_rate": 3.736288782238839e-06,
+ "loss": 0.4454,
+ "step": 4322
+ },
+ {
+ "epoch": 2.043971631205674,
+ "grad_norm": 2.6774091720581055,
+ "learning_rate": 3.7357465328091086e-06,
+ "loss": 0.5002,
+ "step": 4323
+ },
+ {
+ "epoch": 2.0444444444444443,
+ "grad_norm": 2.695138692855835,
+ "learning_rate": 3.735204206434861e-06,
+ "loss": 0.448,
+ "step": 4324
+ },
+ {
+ "epoch": 2.044917257683215,
+ "grad_norm": 2.5383570194244385,
+ "learning_rate": 3.7346618031498635e-06,
+ "loss": 0.4352,
+ "step": 4325
+ },
+ {
+ "epoch": 2.045390070921986,
+ "grad_norm": 2.267277240753174,
+ "learning_rate": 3.7341193229878886e-06,
+ "loss": 0.4162,
+ "step": 4326
+ },
+ {
+ "epoch": 2.0458628841607567,
+ "grad_norm": 2.6037328243255615,
+ "learning_rate": 3.733576765982715e-06,
+ "loss": 0.4471,
+ "step": 4327
+ },
+ {
+ "epoch": 2.046335697399527,
+ "grad_norm": 3.261385440826416,
+ "learning_rate": 3.7330341321681253e-06,
+ "loss": 0.4618,
+ "step": 4328
+ },
+ {
+ "epoch": 2.046808510638298,
+ "grad_norm": 2.440650463104248,
+ "learning_rate": 3.7324914215779072e-06,
+ "loss": 0.4476,
+ "step": 4329
+ },
+ {
+ "epoch": 2.0472813238770686,
+ "grad_norm": 2.5940682888031006,
+ "learning_rate": 3.731948634245853e-06,
+ "loss": 0.4389,
+ "step": 4330
+ },
+ {
+ "epoch": 2.0477541371158394,
+ "grad_norm": 2.7428150177001953,
+ "learning_rate": 3.7314057702057582e-06,
+ "loss": 0.4477,
+ "step": 4331
+ },
+ {
+ "epoch": 2.0482269503546098,
+ "grad_norm": 2.3546223640441895,
+ "learning_rate": 3.730862829491427e-06,
+ "loss": 0.4047,
+ "step": 4332
+ },
+ {
+ "epoch": 2.0486997635933806,
+ "grad_norm": 2.552422523498535,
+ "learning_rate": 3.7303198121366637e-06,
+ "loss": 0.4438,
+ "step": 4333
+ },
+ {
+ "epoch": 2.0491725768321514,
+ "grad_norm": 2.99226713180542,
+ "learning_rate": 3.729776718175281e-06,
+ "loss": 0.491,
+ "step": 4334
+ },
+ {
+ "epoch": 2.049645390070922,
+ "grad_norm": 3.2003321647644043,
+ "learning_rate": 3.7292335476410935e-06,
+ "loss": 0.5458,
+ "step": 4335
+ },
+ {
+ "epoch": 2.0501182033096925,
+ "grad_norm": 2.739847183227539,
+ "learning_rate": 3.7286903005679237e-06,
+ "loss": 0.4499,
+ "step": 4336
+ },
+ {
+ "epoch": 2.0505910165484633,
+ "grad_norm": 2.5917470455169678,
+ "learning_rate": 3.7281469769895963e-06,
+ "loss": 0.4714,
+ "step": 4337
+ },
+ {
+ "epoch": 2.051063829787234,
+ "grad_norm": 2.8029327392578125,
+ "learning_rate": 3.7276035769399422e-06,
+ "loss": 0.42,
+ "step": 4338
+ },
+ {
+ "epoch": 2.051536643026005,
+ "grad_norm": 2.484879493713379,
+ "learning_rate": 3.727060100452796e-06,
+ "loss": 0.4163,
+ "step": 4339
+ },
+ {
+ "epoch": 2.0520094562647753,
+ "grad_norm": 2.7126030921936035,
+ "learning_rate": 3.7265165475619973e-06,
+ "loss": 0.4112,
+ "step": 4340
+ },
+ {
+ "epoch": 2.052482269503546,
+ "grad_norm": 2.618267774581909,
+ "learning_rate": 3.7259729183013927e-06,
+ "loss": 0.4281,
+ "step": 4341
+ },
+ {
+ "epoch": 2.052955082742317,
+ "grad_norm": 2.703270673751831,
+ "learning_rate": 3.7254292127048293e-06,
+ "loss": 0.4437,
+ "step": 4342
+ },
+ {
+ "epoch": 2.0534278959810877,
+ "grad_norm": 2.429150104522705,
+ "learning_rate": 3.7248854308061623e-06,
+ "loss": 0.3971,
+ "step": 4343
+ },
+ {
+ "epoch": 2.053900709219858,
+ "grad_norm": 2.54354190826416,
+ "learning_rate": 3.7243415726392508e-06,
+ "loss": 0.4485,
+ "step": 4344
+ },
+ {
+ "epoch": 2.054373522458629,
+ "grad_norm": 2.9515016078948975,
+ "learning_rate": 3.723797638237957e-06,
+ "loss": 0.4386,
+ "step": 4345
+ },
+ {
+ "epoch": 2.0548463356973996,
+ "grad_norm": 2.9129958152770996,
+ "learning_rate": 3.7232536276361514e-06,
+ "loss": 0.4595,
+ "step": 4346
+ },
+ {
+ "epoch": 2.0553191489361704,
+ "grad_norm": 2.5397512912750244,
+ "learning_rate": 3.722709540867706e-06,
+ "loss": 0.3681,
+ "step": 4347
+ },
+ {
+ "epoch": 2.0557919621749408,
+ "grad_norm": 2.79884672164917,
+ "learning_rate": 3.722165377966499e-06,
+ "loss": 0.4576,
+ "step": 4348
+ },
+ {
+ "epoch": 2.0562647754137116,
+ "grad_norm": 2.669936180114746,
+ "learning_rate": 3.7216211389664137e-06,
+ "loss": 0.3692,
+ "step": 4349
+ },
+ {
+ "epoch": 2.0567375886524824,
+ "grad_norm": 2.512326240539551,
+ "learning_rate": 3.7210768239013355e-06,
+ "loss": 0.4554,
+ "step": 4350
+ },
+ {
+ "epoch": 2.057210401891253,
+ "grad_norm": 2.913693904876709,
+ "learning_rate": 3.7205324328051583e-06,
+ "loss": 0.5282,
+ "step": 4351
+ },
+ {
+ "epoch": 2.0576832151300235,
+ "grad_norm": 3.040891170501709,
+ "learning_rate": 3.719987965711778e-06,
+ "loss": 0.4778,
+ "step": 4352
+ },
+ {
+ "epoch": 2.0581560283687943,
+ "grad_norm": 2.7504117488861084,
+ "learning_rate": 3.7194434226550966e-06,
+ "loss": 0.4217,
+ "step": 4353
+ },
+ {
+ "epoch": 2.058628841607565,
+ "grad_norm": 2.5522971153259277,
+ "learning_rate": 3.718898803669021e-06,
+ "loss": 0.437,
+ "step": 4354
+ },
+ {
+ "epoch": 2.059101654846336,
+ "grad_norm": 2.8531908988952637,
+ "learning_rate": 3.718354108787461e-06,
+ "loss": 0.4251,
+ "step": 4355
+ },
+ {
+ "epoch": 2.0595744680851062,
+ "grad_norm": 2.5812065601348877,
+ "learning_rate": 3.7178093380443337e-06,
+ "loss": 0.4374,
+ "step": 4356
+ },
+ {
+ "epoch": 2.060047281323877,
+ "grad_norm": 2.627871513366699,
+ "learning_rate": 3.7172644914735583e-06,
+ "loss": 0.436,
+ "step": 4357
+ },
+ {
+ "epoch": 2.060520094562648,
+ "grad_norm": 2.7146239280700684,
+ "learning_rate": 3.7167195691090607e-06,
+ "loss": 0.4204,
+ "step": 4358
+ },
+ {
+ "epoch": 2.0609929078014186,
+ "grad_norm": 2.486483573913574,
+ "learning_rate": 3.7161745709847706e-06,
+ "loss": 0.4015,
+ "step": 4359
+ },
+ {
+ "epoch": 2.061465721040189,
+ "grad_norm": 2.866049289703369,
+ "learning_rate": 3.7156294971346226e-06,
+ "loss": 0.4087,
+ "step": 4360
+ },
+ {
+ "epoch": 2.06193853427896,
+ "grad_norm": 2.9345552921295166,
+ "learning_rate": 3.715084347592556e-06,
+ "loss": 0.5074,
+ "step": 4361
+ },
+ {
+ "epoch": 2.0624113475177306,
+ "grad_norm": 2.502455711364746,
+ "learning_rate": 3.7145391223925155e-06,
+ "loss": 0.469,
+ "step": 4362
+ },
+ {
+ "epoch": 2.0628841607565014,
+ "grad_norm": 2.6419875621795654,
+ "learning_rate": 3.713993821568449e-06,
+ "loss": 0.4493,
+ "step": 4363
+ },
+ {
+ "epoch": 2.0633569739952717,
+ "grad_norm": 3.812079429626465,
+ "learning_rate": 3.7134484451543114e-06,
+ "loss": 0.4764,
+ "step": 4364
+ },
+ {
+ "epoch": 2.0638297872340425,
+ "grad_norm": 2.581780195236206,
+ "learning_rate": 3.712902993184059e-06,
+ "loss": 0.3994,
+ "step": 4365
+ },
+ {
+ "epoch": 2.0643026004728133,
+ "grad_norm": 2.282508134841919,
+ "learning_rate": 3.712357465691656e-06,
+ "loss": 0.4252,
+ "step": 4366
+ },
+ {
+ "epoch": 2.064775413711584,
+ "grad_norm": 2.4727818965911865,
+ "learning_rate": 3.71181186271107e-06,
+ "loss": 0.4558,
+ "step": 4367
+ },
+ {
+ "epoch": 2.0652482269503545,
+ "grad_norm": 2.7661173343658447,
+ "learning_rate": 3.711266184276272e-06,
+ "loss": 0.505,
+ "step": 4368
+ },
+ {
+ "epoch": 2.0657210401891253,
+ "grad_norm": 2.6264543533325195,
+ "learning_rate": 3.71072043042124e-06,
+ "loss": 0.4297,
+ "step": 4369
+ },
+ {
+ "epoch": 2.066193853427896,
+ "grad_norm": 2.773699998855591,
+ "learning_rate": 3.7101746011799565e-06,
+ "loss": 0.4267,
+ "step": 4370
+ },
+ {
+ "epoch": 2.066666666666667,
+ "grad_norm": 2.686955213546753,
+ "learning_rate": 3.709628696586407e-06,
+ "loss": 0.4099,
+ "step": 4371
+ },
+ {
+ "epoch": 2.0671394799054372,
+ "grad_norm": 2.6066620349884033,
+ "learning_rate": 3.709082716674582e-06,
+ "loss": 0.4146,
+ "step": 4372
+ },
+ {
+ "epoch": 2.067612293144208,
+ "grad_norm": 2.7769250869750977,
+ "learning_rate": 3.7085366614784784e-06,
+ "loss": 0.4047,
+ "step": 4373
+ },
+ {
+ "epoch": 2.068085106382979,
+ "grad_norm": 2.4986939430236816,
+ "learning_rate": 3.7079905310320957e-06,
+ "loss": 0.4021,
+ "step": 4374
+ },
+ {
+ "epoch": 2.0685579196217496,
+ "grad_norm": 2.5456206798553467,
+ "learning_rate": 3.7074443253694402e-06,
+ "loss": 0.3569,
+ "step": 4375
+ },
+ {
+ "epoch": 2.06903073286052,
+ "grad_norm": 2.4079296588897705,
+ "learning_rate": 3.70689804452452e-06,
+ "loss": 0.4308,
+ "step": 4376
+ },
+ {
+ "epoch": 2.0695035460992908,
+ "grad_norm": 2.86014723777771,
+ "learning_rate": 3.7063516885313513e-06,
+ "loss": 0.4577,
+ "step": 4377
+ },
+ {
+ "epoch": 2.0699763593380616,
+ "grad_norm": 2.8025779724121094,
+ "learning_rate": 3.7058052574239523e-06,
+ "loss": 0.4615,
+ "step": 4378
+ },
+ {
+ "epoch": 2.0704491725768324,
+ "grad_norm": 2.902676820755005,
+ "learning_rate": 3.7052587512363475e-06,
+ "loss": 0.4765,
+ "step": 4379
+ },
+ {
+ "epoch": 2.0709219858156027,
+ "grad_norm": 2.814509391784668,
+ "learning_rate": 3.704712170002566e-06,
+ "loss": 0.434,
+ "step": 4380
+ },
+ {
+ "epoch": 2.0713947990543735,
+ "grad_norm": 2.7923502922058105,
+ "learning_rate": 3.704165513756639e-06,
+ "loss": 0.4626,
+ "step": 4381
+ },
+ {
+ "epoch": 2.0718676122931443,
+ "grad_norm": 2.6802031993865967,
+ "learning_rate": 3.703618782532606e-06,
+ "loss": 0.4835,
+ "step": 4382
+ },
+ {
+ "epoch": 2.072340425531915,
+ "grad_norm": 3.0963687896728516,
+ "learning_rate": 3.7030719763645085e-06,
+ "loss": 0.4813,
+ "step": 4383
+ },
+ {
+ "epoch": 2.0728132387706855,
+ "grad_norm": 2.5658695697784424,
+ "learning_rate": 3.7025250952863956e-06,
+ "loss": 0.4428,
+ "step": 4384
+ },
+ {
+ "epoch": 2.0732860520094563,
+ "grad_norm": 2.7738289833068848,
+ "learning_rate": 3.7019781393323167e-06,
+ "loss": 0.4376,
+ "step": 4385
+ },
+ {
+ "epoch": 2.073758865248227,
+ "grad_norm": 2.6446938514709473,
+ "learning_rate": 3.7014311085363303e-06,
+ "loss": 0.4208,
+ "step": 4386
+ },
+ {
+ "epoch": 2.0742316784869974,
+ "grad_norm": 2.7556118965148926,
+ "learning_rate": 3.7008840029324967e-06,
+ "loss": 0.3831,
+ "step": 4387
+ },
+ {
+ "epoch": 2.074704491725768,
+ "grad_norm": 2.573141574859619,
+ "learning_rate": 3.700336822554882e-06,
+ "loss": 0.4396,
+ "step": 4388
+ },
+ {
+ "epoch": 2.075177304964539,
+ "grad_norm": 2.762319803237915,
+ "learning_rate": 3.6997895674375566e-06,
+ "loss": 0.4579,
+ "step": 4389
+ },
+ {
+ "epoch": 2.07565011820331,
+ "grad_norm": 2.729780435562134,
+ "learning_rate": 3.699242237614596e-06,
+ "loss": 0.4262,
+ "step": 4390
+ },
+ {
+ "epoch": 2.0761229314420806,
+ "grad_norm": 2.657480001449585,
+ "learning_rate": 3.698694833120079e-06,
+ "loss": 0.4176,
+ "step": 4391
+ },
+ {
+ "epoch": 2.076595744680851,
+ "grad_norm": 2.8433303833007812,
+ "learning_rate": 3.6981473539880914e-06,
+ "loss": 0.457,
+ "step": 4392
+ },
+ {
+ "epoch": 2.0770685579196217,
+ "grad_norm": 2.819047212600708,
+ "learning_rate": 3.6975998002527225e-06,
+ "loss": 0.4244,
+ "step": 4393
+ },
+ {
+ "epoch": 2.0775413711583925,
+ "grad_norm": 2.6565003395080566,
+ "learning_rate": 3.697052171948064e-06,
+ "loss": 0.4384,
+ "step": 4394
+ },
+ {
+ "epoch": 2.078014184397163,
+ "grad_norm": 2.5795063972473145,
+ "learning_rate": 3.696504469108216e-06,
+ "loss": 0.4958,
+ "step": 4395
+ },
+ {
+ "epoch": 2.0784869976359337,
+ "grad_norm": 2.455730676651001,
+ "learning_rate": 3.6959566917672822e-06,
+ "loss": 0.4191,
+ "step": 4396
+ },
+ {
+ "epoch": 2.0789598108747045,
+ "grad_norm": 2.6706607341766357,
+ "learning_rate": 3.6954088399593684e-06,
+ "loss": 0.4709,
+ "step": 4397
+ },
+ {
+ "epoch": 2.0794326241134753,
+ "grad_norm": 2.3758466243743896,
+ "learning_rate": 3.694860913718589e-06,
+ "loss": 0.4231,
+ "step": 4398
+ },
+ {
+ "epoch": 2.079905437352246,
+ "grad_norm": 2.3488340377807617,
+ "learning_rate": 3.6943129130790583e-06,
+ "loss": 0.4321,
+ "step": 4399
+ },
+ {
+ "epoch": 2.0803782505910164,
+ "grad_norm": 2.6438148021698,
+ "learning_rate": 3.6937648380748996e-06,
+ "loss": 0.4907,
+ "step": 4400
+ },
+ {
+ "epoch": 2.0808510638297872,
+ "grad_norm": 2.9826784133911133,
+ "learning_rate": 3.6932166887402395e-06,
+ "loss": 0.4404,
+ "step": 4401
+ },
+ {
+ "epoch": 2.081323877068558,
+ "grad_norm": 2.5203495025634766,
+ "learning_rate": 3.6926684651092076e-06,
+ "loss": 0.4337,
+ "step": 4402
+ },
+ {
+ "epoch": 2.0817966903073284,
+ "grad_norm": 2.7704148292541504,
+ "learning_rate": 3.692120167215941e-06,
+ "loss": 0.4195,
+ "step": 4403
+ },
+ {
+ "epoch": 2.082269503546099,
+ "grad_norm": 2.879430055618286,
+ "learning_rate": 3.6915717950945782e-06,
+ "loss": 0.4498,
+ "step": 4404
+ },
+ {
+ "epoch": 2.08274231678487,
+ "grad_norm": 2.7659497261047363,
+ "learning_rate": 3.6910233487792655e-06,
+ "loss": 0.4017,
+ "step": 4405
+ },
+ {
+ "epoch": 2.083215130023641,
+ "grad_norm": 3.4017205238342285,
+ "learning_rate": 3.6904748283041503e-06,
+ "loss": 0.4733,
+ "step": 4406
+ },
+ {
+ "epoch": 2.083687943262411,
+ "grad_norm": 2.706223249435425,
+ "learning_rate": 3.6899262337033887e-06,
+ "loss": 0.4926,
+ "step": 4407
+ },
+ {
+ "epoch": 2.084160756501182,
+ "grad_norm": 2.644932508468628,
+ "learning_rate": 3.6893775650111372e-06,
+ "loss": 0.3904,
+ "step": 4408
+ },
+ {
+ "epoch": 2.0846335697399527,
+ "grad_norm": 2.666585683822632,
+ "learning_rate": 3.6888288222615603e-06,
+ "loss": 0.4698,
+ "step": 4409
+ },
+ {
+ "epoch": 2.0851063829787235,
+ "grad_norm": 3.0058486461639404,
+ "learning_rate": 3.688280005488826e-06,
+ "loss": 0.5291,
+ "step": 4410
+ },
+ {
+ "epoch": 2.085579196217494,
+ "grad_norm": 2.533088445663452,
+ "learning_rate": 3.687731114727105e-06,
+ "loss": 0.393,
+ "step": 4411
+ },
+ {
+ "epoch": 2.0860520094562647,
+ "grad_norm": 2.921687364578247,
+ "learning_rate": 3.6871821500105763e-06,
+ "loss": 0.4719,
+ "step": 4412
+ },
+ {
+ "epoch": 2.0865248226950355,
+ "grad_norm": 2.291804313659668,
+ "learning_rate": 3.686633111373421e-06,
+ "loss": 0.4105,
+ "step": 4413
+ },
+ {
+ "epoch": 2.0869976359338063,
+ "grad_norm": 2.496333122253418,
+ "learning_rate": 3.6860839988498255e-06,
+ "loss": 0.4704,
+ "step": 4414
+ },
+ {
+ "epoch": 2.0874704491725766,
+ "grad_norm": 2.8059427738189697,
+ "learning_rate": 3.6855348124739787e-06,
+ "loss": 0.4961,
+ "step": 4415
+ },
+ {
+ "epoch": 2.0879432624113474,
+ "grad_norm": 2.683922290802002,
+ "learning_rate": 3.6849855522800795e-06,
+ "loss": 0.4838,
+ "step": 4416
+ },
+ {
+ "epoch": 2.088416075650118,
+ "grad_norm": 2.694148540496826,
+ "learning_rate": 3.684436218302324e-06,
+ "loss": 0.4812,
+ "step": 4417
+ },
+ {
+ "epoch": 2.088888888888889,
+ "grad_norm": 2.724531888961792,
+ "learning_rate": 3.683886810574919e-06,
+ "loss": 0.4495,
+ "step": 4418
+ },
+ {
+ "epoch": 2.0893617021276594,
+ "grad_norm": 2.6176564693450928,
+ "learning_rate": 3.6833373291320746e-06,
+ "loss": 0.4698,
+ "step": 4419
+ },
+ {
+ "epoch": 2.08983451536643,
+ "grad_norm": 2.534116268157959,
+ "learning_rate": 3.6827877740080032e-06,
+ "loss": 0.3912,
+ "step": 4420
+ },
+ {
+ "epoch": 2.090307328605201,
+ "grad_norm": 2.5747432708740234,
+ "learning_rate": 3.682238145236924e-06,
+ "loss": 0.4072,
+ "step": 4421
+ },
+ {
+ "epoch": 2.0907801418439718,
+ "grad_norm": 2.5947659015655518,
+ "learning_rate": 3.6816884428530588e-06,
+ "loss": 0.4638,
+ "step": 4422
+ },
+ {
+ "epoch": 2.091252955082742,
+ "grad_norm": 2.811992883682251,
+ "learning_rate": 3.6811386668906353e-06,
+ "loss": 0.4345,
+ "step": 4423
+ },
+ {
+ "epoch": 2.091725768321513,
+ "grad_norm": 2.7482287883758545,
+ "learning_rate": 3.680588817383886e-06,
+ "loss": 0.4541,
+ "step": 4424
+ },
+ {
+ "epoch": 2.0921985815602837,
+ "grad_norm": 2.987131357192993,
+ "learning_rate": 3.6800388943670484e-06,
+ "loss": 0.4571,
+ "step": 4425
+ },
+ {
+ "epoch": 2.0926713947990545,
+ "grad_norm": 3.1918671131134033,
+ "learning_rate": 3.6794888978743637e-06,
+ "loss": 0.5722,
+ "step": 4426
+ },
+ {
+ "epoch": 2.093144208037825,
+ "grad_norm": 2.5654571056365967,
+ "learning_rate": 3.678938827940076e-06,
+ "loss": 0.4686,
+ "step": 4427
+ },
+ {
+ "epoch": 2.0936170212765957,
+ "grad_norm": 2.942084789276123,
+ "learning_rate": 3.6783886845984383e-06,
+ "loss": 0.4512,
+ "step": 4428
+ },
+ {
+ "epoch": 2.0940898345153665,
+ "grad_norm": 2.74847674369812,
+ "learning_rate": 3.677838467883703e-06,
+ "loss": 0.4506,
+ "step": 4429
+ },
+ {
+ "epoch": 2.0945626477541373,
+ "grad_norm": 2.7569334506988525,
+ "learning_rate": 3.6772881778301322e-06,
+ "loss": 0.502,
+ "step": 4430
+ },
+ {
+ "epoch": 2.0950354609929076,
+ "grad_norm": 2.969966173171997,
+ "learning_rate": 3.6767378144719884e-06,
+ "loss": 0.4772,
+ "step": 4431
+ },
+ {
+ "epoch": 2.0955082742316784,
+ "grad_norm": 2.773524522781372,
+ "learning_rate": 3.67618737784354e-06,
+ "loss": 0.5183,
+ "step": 4432
+ },
+ {
+ "epoch": 2.095981087470449,
+ "grad_norm": 2.6760106086730957,
+ "learning_rate": 3.6756368679790617e-06,
+ "loss": 0.4787,
+ "step": 4433
+ },
+ {
+ "epoch": 2.09645390070922,
+ "grad_norm": 2.8758978843688965,
+ "learning_rate": 3.6750862849128304e-06,
+ "loss": 0.4275,
+ "step": 4434
+ },
+ {
+ "epoch": 2.0969267139479904,
+ "grad_norm": 2.670509099960327,
+ "learning_rate": 3.6745356286791288e-06,
+ "loss": 0.4401,
+ "step": 4435
+ },
+ {
+ "epoch": 2.097399527186761,
+ "grad_norm": 2.8453969955444336,
+ "learning_rate": 3.673984899312244e-06,
+ "loss": 0.4303,
+ "step": 4436
+ },
+ {
+ "epoch": 2.097872340425532,
+ "grad_norm": 2.6212339401245117,
+ "learning_rate": 3.673434096846468e-06,
+ "loss": 0.4675,
+ "step": 4437
+ },
+ {
+ "epoch": 2.0983451536643027,
+ "grad_norm": 2.8211941719055176,
+ "learning_rate": 3.672883221316095e-06,
+ "loss": 0.4678,
+ "step": 4438
+ },
+ {
+ "epoch": 2.098817966903073,
+ "grad_norm": 2.4838058948516846,
+ "learning_rate": 3.672332272755427e-06,
+ "loss": 0.4128,
+ "step": 4439
+ },
+ {
+ "epoch": 2.099290780141844,
+ "grad_norm": 2.596660852432251,
+ "learning_rate": 3.671781251198769e-06,
+ "loss": 0.423,
+ "step": 4440
+ },
+ {
+ "epoch": 2.0997635933806147,
+ "grad_norm": 2.9979989528656006,
+ "learning_rate": 3.67123015668043e-06,
+ "loss": 0.4493,
+ "step": 4441
+ },
+ {
+ "epoch": 2.1002364066193855,
+ "grad_norm": 2.6232850551605225,
+ "learning_rate": 3.670678989234725e-06,
+ "loss": 0.4237,
+ "step": 4442
+ },
+ {
+ "epoch": 2.100709219858156,
+ "grad_norm": 2.575039863586426,
+ "learning_rate": 3.670127748895973e-06,
+ "loss": 0.4464,
+ "step": 4443
+ },
+ {
+ "epoch": 2.1011820330969266,
+ "grad_norm": 2.3381190299987793,
+ "learning_rate": 3.669576435698497e-06,
+ "loss": 0.4208,
+ "step": 4444
+ },
+ {
+ "epoch": 2.1016548463356974,
+ "grad_norm": 2.9645180702209473,
+ "learning_rate": 3.669025049676625e-06,
+ "loss": 0.5272,
+ "step": 4445
+ },
+ {
+ "epoch": 2.1021276595744682,
+ "grad_norm": 2.719320297241211,
+ "learning_rate": 3.668473590864689e-06,
+ "loss": 0.4485,
+ "step": 4446
+ },
+ {
+ "epoch": 2.1026004728132386,
+ "grad_norm": 2.8665547370910645,
+ "learning_rate": 3.6679220592970254e-06,
+ "loss": 0.4433,
+ "step": 4447
+ },
+ {
+ "epoch": 2.1030732860520094,
+ "grad_norm": 2.6922879219055176,
+ "learning_rate": 3.667370455007977e-06,
+ "loss": 0.502,
+ "step": 4448
+ },
+ {
+ "epoch": 2.10354609929078,
+ "grad_norm": 3.018228530883789,
+ "learning_rate": 3.6668187780318894e-06,
+ "loss": 0.4939,
+ "step": 4449
+ },
+ {
+ "epoch": 2.104018912529551,
+ "grad_norm": 3.187901735305786,
+ "learning_rate": 3.666267028403112e-06,
+ "loss": 0.4151,
+ "step": 4450
+ },
+ {
+ "epoch": 2.1044917257683213,
+ "grad_norm": 2.9521446228027344,
+ "learning_rate": 3.6657152061560012e-06,
+ "loss": 0.4343,
+ "step": 4451
+ },
+ {
+ "epoch": 2.104964539007092,
+ "grad_norm": 2.5125739574432373,
+ "learning_rate": 3.6651633113249164e-06,
+ "loss": 0.4071,
+ "step": 4452
+ },
+ {
+ "epoch": 2.105437352245863,
+ "grad_norm": 2.9164133071899414,
+ "learning_rate": 3.664611343944221e-06,
+ "loss": 0.4173,
+ "step": 4453
+ },
+ {
+ "epoch": 2.1059101654846337,
+ "grad_norm": 2.680893898010254,
+ "learning_rate": 3.6640593040482834e-06,
+ "loss": 0.4917,
+ "step": 4454
+ },
+ {
+ "epoch": 2.106382978723404,
+ "grad_norm": 2.6823534965515137,
+ "learning_rate": 3.6635071916714774e-06,
+ "loss": 0.4668,
+ "step": 4455
+ },
+ {
+ "epoch": 2.106855791962175,
+ "grad_norm": 2.6221907138824463,
+ "learning_rate": 3.6629550068481806e-06,
+ "loss": 0.4956,
+ "step": 4456
+ },
+ {
+ "epoch": 2.1073286052009457,
+ "grad_norm": 3.096370220184326,
+ "learning_rate": 3.6624027496127745e-06,
+ "loss": 0.3995,
+ "step": 4457
+ },
+ {
+ "epoch": 2.1078014184397165,
+ "grad_norm": 2.752885341644287,
+ "learning_rate": 3.661850419999647e-06,
+ "loss": 0.4838,
+ "step": 4458
+ },
+ {
+ "epoch": 2.108274231678487,
+ "grad_norm": 2.6806766986846924,
+ "learning_rate": 3.661298018043188e-06,
+ "loss": 0.4817,
+ "step": 4459
+ },
+ {
+ "epoch": 2.1087470449172576,
+ "grad_norm": 2.6317873001098633,
+ "learning_rate": 3.660745543777794e-06,
+ "loss": 0.4777,
+ "step": 4460
+ },
+ {
+ "epoch": 2.1092198581560284,
+ "grad_norm": 2.4939377307891846,
+ "learning_rate": 3.6601929972378634e-06,
+ "loss": 0.4525,
+ "step": 4461
+ },
+ {
+ "epoch": 2.109692671394799,
+ "grad_norm": 2.4902873039245605,
+ "learning_rate": 3.659640378457803e-06,
+ "loss": 0.4392,
+ "step": 4462
+ },
+ {
+ "epoch": 2.1101654846335696,
+ "grad_norm": 2.5082345008850098,
+ "learning_rate": 3.6590876874720216e-06,
+ "loss": 0.4224,
+ "step": 4463
+ },
+ {
+ "epoch": 2.1106382978723404,
+ "grad_norm": 2.658407211303711,
+ "learning_rate": 3.6585349243149313e-06,
+ "loss": 0.4316,
+ "step": 4464
+ },
+ {
+ "epoch": 2.111111111111111,
+ "grad_norm": 2.562883138656616,
+ "learning_rate": 3.6579820890209515e-06,
+ "loss": 0.4491,
+ "step": 4465
+ },
+ {
+ "epoch": 2.111583924349882,
+ "grad_norm": 2.5719261169433594,
+ "learning_rate": 3.657429181624505e-06,
+ "loss": 0.4406,
+ "step": 4466
+ },
+ {
+ "epoch": 2.1120567375886523,
+ "grad_norm": 2.8840596675872803,
+ "learning_rate": 3.6568762021600184e-06,
+ "loss": 0.4267,
+ "step": 4467
+ },
+ {
+ "epoch": 2.112529550827423,
+ "grad_norm": 2.660304546356201,
+ "learning_rate": 3.656323150661924e-06,
+ "loss": 0.4502,
+ "step": 4468
+ },
+ {
+ "epoch": 2.113002364066194,
+ "grad_norm": 2.610996961593628,
+ "learning_rate": 3.655770027164657e-06,
+ "loss": 0.3934,
+ "step": 4469
+ },
+ {
+ "epoch": 2.1134751773049647,
+ "grad_norm": 2.6000053882598877,
+ "learning_rate": 3.655216831702658e-06,
+ "loss": 0.4582,
+ "step": 4470
+ },
+ {
+ "epoch": 2.113947990543735,
+ "grad_norm": 2.73124098777771,
+ "learning_rate": 3.654663564310372e-06,
+ "loss": 0.4748,
+ "step": 4471
+ },
+ {
+ "epoch": 2.114420803782506,
+ "grad_norm": 2.711091995239258,
+ "learning_rate": 3.6541102250222495e-06,
+ "loss": 0.4145,
+ "step": 4472
+ },
+ {
+ "epoch": 2.1148936170212767,
+ "grad_norm": 2.655996561050415,
+ "learning_rate": 3.6535568138727438e-06,
+ "loss": 0.4407,
+ "step": 4473
+ },
+ {
+ "epoch": 2.1153664302600474,
+ "grad_norm": 2.7630865573883057,
+ "learning_rate": 3.653003330896313e-06,
+ "loss": 0.4298,
+ "step": 4474
+ },
+ {
+ "epoch": 2.115839243498818,
+ "grad_norm": 2.554415464401245,
+ "learning_rate": 3.6524497761274214e-06,
+ "loss": 0.44,
+ "step": 4475
+ },
+ {
+ "epoch": 2.1163120567375886,
+ "grad_norm": 2.790328025817871,
+ "learning_rate": 3.651896149600535e-06,
+ "loss": 0.5061,
+ "step": 4476
+ },
+ {
+ "epoch": 2.1167848699763594,
+ "grad_norm": 2.755267381668091,
+ "learning_rate": 3.651342451350127e-06,
+ "loss": 0.4588,
+ "step": 4477
+ },
+ {
+ "epoch": 2.11725768321513,
+ "grad_norm": 2.8936638832092285,
+ "learning_rate": 3.6507886814106722e-06,
+ "loss": 0.468,
+ "step": 4478
+ },
+ {
+ "epoch": 2.1177304964539005,
+ "grad_norm": 2.7394332885742188,
+ "learning_rate": 3.6502348398166525e-06,
+ "loss": 0.383,
+ "step": 4479
+ },
+ {
+ "epoch": 2.1182033096926713,
+ "grad_norm": 2.3359546661376953,
+ "learning_rate": 3.649680926602553e-06,
+ "loss": 0.3903,
+ "step": 4480
+ },
+ {
+ "epoch": 2.118676122931442,
+ "grad_norm": 3.102202892303467,
+ "learning_rate": 3.6491269418028637e-06,
+ "loss": 0.4525,
+ "step": 4481
+ },
+ {
+ "epoch": 2.119148936170213,
+ "grad_norm": 2.467970848083496,
+ "learning_rate": 3.648572885452078e-06,
+ "loss": 0.414,
+ "step": 4482
+ },
+ {
+ "epoch": 2.1196217494089833,
+ "grad_norm": 2.8984131813049316,
+ "learning_rate": 3.6480187575846952e-06,
+ "loss": 0.4571,
+ "step": 4483
+ },
+ {
+ "epoch": 2.120094562647754,
+ "grad_norm": 2.674834966659546,
+ "learning_rate": 3.6474645582352187e-06,
+ "loss": 0.455,
+ "step": 4484
+ },
+ {
+ "epoch": 2.120567375886525,
+ "grad_norm": 2.8713369369506836,
+ "learning_rate": 3.6469102874381552e-06,
+ "loss": 0.4567,
+ "step": 4485
+ },
+ {
+ "epoch": 2.1210401891252957,
+ "grad_norm": 3.174814462661743,
+ "learning_rate": 3.646355945228017e-06,
+ "loss": 0.5295,
+ "step": 4486
+ },
+ {
+ "epoch": 2.121513002364066,
+ "grad_norm": 2.6409823894500732,
+ "learning_rate": 3.6458015316393215e-06,
+ "loss": 0.4308,
+ "step": 4487
+ },
+ {
+ "epoch": 2.121985815602837,
+ "grad_norm": 2.4228954315185547,
+ "learning_rate": 3.645247046706588e-06,
+ "loss": 0.4042,
+ "step": 4488
+ },
+ {
+ "epoch": 2.1224586288416076,
+ "grad_norm": 2.553551435470581,
+ "learning_rate": 3.6446924904643427e-06,
+ "loss": 0.3925,
+ "step": 4489
+ },
+ {
+ "epoch": 2.1229314420803784,
+ "grad_norm": 2.8019237518310547,
+ "learning_rate": 3.6441378629471157e-06,
+ "loss": 0.4079,
+ "step": 4490
+ },
+ {
+ "epoch": 2.123404255319149,
+ "grad_norm": 2.993251085281372,
+ "learning_rate": 3.643583164189441e-06,
+ "loss": 0.4558,
+ "step": 4491
+ },
+ {
+ "epoch": 2.1238770685579196,
+ "grad_norm": 2.4531471729278564,
+ "learning_rate": 3.643028394225857e-06,
+ "loss": 0.4167,
+ "step": 4492
+ },
+ {
+ "epoch": 2.1243498817966904,
+ "grad_norm": 2.6827852725982666,
+ "learning_rate": 3.6424735530909065e-06,
+ "loss": 0.4311,
+ "step": 4493
+ },
+ {
+ "epoch": 2.124822695035461,
+ "grad_norm": 3.1232128143310547,
+ "learning_rate": 3.6419186408191377e-06,
+ "loss": 0.4537,
+ "step": 4494
+ },
+ {
+ "epoch": 2.1252955082742315,
+ "grad_norm": 2.816348075866699,
+ "learning_rate": 3.641363657445103e-06,
+ "loss": 0.4869,
+ "step": 4495
+ },
+ {
+ "epoch": 2.1257683215130023,
+ "grad_norm": 2.6269683837890625,
+ "learning_rate": 3.6408086030033575e-06,
+ "loss": 0.4066,
+ "step": 4496
+ },
+ {
+ "epoch": 2.126241134751773,
+ "grad_norm": 4.6375956535339355,
+ "learning_rate": 3.640253477528462e-06,
+ "loss": 0.4488,
+ "step": 4497
+ },
+ {
+ "epoch": 2.126713947990544,
+ "grad_norm": 3.020970582962036,
+ "learning_rate": 3.639698281054983e-06,
+ "loss": 0.4197,
+ "step": 4498
+ },
+ {
+ "epoch": 2.1271867612293143,
+ "grad_norm": 2.87904691696167,
+ "learning_rate": 3.6391430136174892e-06,
+ "loss": 0.4743,
+ "step": 4499
+ },
+ {
+ "epoch": 2.127659574468085,
+ "grad_norm": 2.719892978668213,
+ "learning_rate": 3.6385876752505554e-06,
+ "loss": 0.388,
+ "step": 4500
+ },
+ {
+ "epoch": 2.128132387706856,
+ "grad_norm": 2.7321808338165283,
+ "learning_rate": 3.638032265988759e-06,
+ "loss": 0.4857,
+ "step": 4501
+ },
+ {
+ "epoch": 2.1286052009456267,
+ "grad_norm": 2.700814723968506,
+ "learning_rate": 3.6374767858666836e-06,
+ "loss": 0.4819,
+ "step": 4502
+ },
+ {
+ "epoch": 2.129078014184397,
+ "grad_norm": 2.658423662185669,
+ "learning_rate": 3.6369212349189164e-06,
+ "loss": 0.4113,
+ "step": 4503
+ },
+ {
+ "epoch": 2.129550827423168,
+ "grad_norm": 2.673877716064453,
+ "learning_rate": 3.63636561318005e-06,
+ "loss": 0.3745,
+ "step": 4504
+ },
+ {
+ "epoch": 2.1300236406619386,
+ "grad_norm": 2.607758045196533,
+ "learning_rate": 3.6358099206846787e-06,
+ "loss": 0.4409,
+ "step": 4505
+ },
+ {
+ "epoch": 2.1304964539007094,
+ "grad_norm": 2.8117682933807373,
+ "learning_rate": 3.6352541574674044e-06,
+ "loss": 0.426,
+ "step": 4506
+ },
+ {
+ "epoch": 2.1309692671394798,
+ "grad_norm": 2.6970250606536865,
+ "learning_rate": 3.634698323562832e-06,
+ "loss": 0.4295,
+ "step": 4507
+ },
+ {
+ "epoch": 2.1314420803782506,
+ "grad_norm": 2.7133560180664062,
+ "learning_rate": 3.6341424190055696e-06,
+ "loss": 0.4443,
+ "step": 4508
+ },
+ {
+ "epoch": 2.1319148936170214,
+ "grad_norm": 2.57181715965271,
+ "learning_rate": 3.6335864438302328e-06,
+ "loss": 0.3995,
+ "step": 4509
+ },
+ {
+ "epoch": 2.132387706855792,
+ "grad_norm": 2.8618004322052,
+ "learning_rate": 3.633030398071438e-06,
+ "loss": 0.5075,
+ "step": 4510
+ },
+ {
+ "epoch": 2.1328605200945625,
+ "grad_norm": 2.7586729526519775,
+ "learning_rate": 3.6324742817638087e-06,
+ "loss": 0.4322,
+ "step": 4511
+ },
+ {
+ "epoch": 2.1333333333333333,
+ "grad_norm": 2.913256883621216,
+ "learning_rate": 3.631918094941972e-06,
+ "loss": 0.4708,
+ "step": 4512
+ },
+ {
+ "epoch": 2.133806146572104,
+ "grad_norm": 2.7715728282928467,
+ "learning_rate": 3.6313618376405585e-06,
+ "loss": 0.5194,
+ "step": 4513
+ },
+ {
+ "epoch": 2.134278959810875,
+ "grad_norm": 2.7986366748809814,
+ "learning_rate": 3.6308055098942042e-06,
+ "loss": 0.4419,
+ "step": 4514
+ },
+ {
+ "epoch": 2.1347517730496453,
+ "grad_norm": 3.043549060821533,
+ "learning_rate": 3.6302491117375492e-06,
+ "loss": 0.4441,
+ "step": 4515
+ },
+ {
+ "epoch": 2.135224586288416,
+ "grad_norm": 2.771761417388916,
+ "learning_rate": 3.629692643205238e-06,
+ "loss": 0.4752,
+ "step": 4516
+ },
+ {
+ "epoch": 2.135697399527187,
+ "grad_norm": 2.804941415786743,
+ "learning_rate": 3.6291361043319202e-06,
+ "loss": 0.4089,
+ "step": 4517
+ },
+ {
+ "epoch": 2.1361702127659576,
+ "grad_norm": 2.9897940158843994,
+ "learning_rate": 3.628579495152248e-06,
+ "loss": 0.4829,
+ "step": 4518
+ },
+ {
+ "epoch": 2.136643026004728,
+ "grad_norm": 2.9273486137390137,
+ "learning_rate": 3.6280228157008784e-06,
+ "loss": 0.4469,
+ "step": 4519
+ },
+ {
+ "epoch": 2.137115839243499,
+ "grad_norm": 2.584373950958252,
+ "learning_rate": 3.627466066012475e-06,
+ "loss": 0.4277,
+ "step": 4520
+ },
+ {
+ "epoch": 2.1375886524822696,
+ "grad_norm": 3.009333848953247,
+ "learning_rate": 3.626909246121703e-06,
+ "loss": 0.4025,
+ "step": 4521
+ },
+ {
+ "epoch": 2.1380614657210404,
+ "grad_norm": 2.634615659713745,
+ "learning_rate": 3.626352356063234e-06,
+ "loss": 0.4046,
+ "step": 4522
+ },
+ {
+ "epoch": 2.1385342789598107,
+ "grad_norm": 2.87310528755188,
+ "learning_rate": 3.625795395871743e-06,
+ "loss": 0.4426,
+ "step": 4523
+ },
+ {
+ "epoch": 2.1390070921985815,
+ "grad_norm": 2.94985032081604,
+ "learning_rate": 3.625238365581909e-06,
+ "loss": 0.445,
+ "step": 4524
+ },
+ {
+ "epoch": 2.1394799054373523,
+ "grad_norm": 2.470189332962036,
+ "learning_rate": 3.624681265228416e-06,
+ "loss": 0.4082,
+ "step": 4525
+ },
+ {
+ "epoch": 2.139952718676123,
+ "grad_norm": 2.5304040908813477,
+ "learning_rate": 3.624124094845952e-06,
+ "loss": 0.403,
+ "step": 4526
+ },
+ {
+ "epoch": 2.1404255319148935,
+ "grad_norm": 2.6148900985717773,
+ "learning_rate": 3.62356685446921e-06,
+ "loss": 0.3867,
+ "step": 4527
+ },
+ {
+ "epoch": 2.1408983451536643,
+ "grad_norm": 2.885549783706665,
+ "learning_rate": 3.623009544132886e-06,
+ "loss": 0.4706,
+ "step": 4528
+ },
+ {
+ "epoch": 2.141371158392435,
+ "grad_norm": 3.00490665435791,
+ "learning_rate": 3.6224521638716827e-06,
+ "loss": 0.4733,
+ "step": 4529
+ },
+ {
+ "epoch": 2.141843971631206,
+ "grad_norm": 2.925879716873169,
+ "learning_rate": 3.6218947137203043e-06,
+ "loss": 0.4581,
+ "step": 4530
+ },
+ {
+ "epoch": 2.1423167848699762,
+ "grad_norm": 3.10861873626709,
+ "learning_rate": 3.621337193713462e-06,
+ "loss": 0.4579,
+ "step": 4531
+ },
+ {
+ "epoch": 2.142789598108747,
+ "grad_norm": 2.7386577129364014,
+ "learning_rate": 3.6207796038858693e-06,
+ "loss": 0.4248,
+ "step": 4532
+ },
+ {
+ "epoch": 2.143262411347518,
+ "grad_norm": 2.601836681365967,
+ "learning_rate": 3.6202219442722453e-06,
+ "loss": 0.4928,
+ "step": 4533
+ },
+ {
+ "epoch": 2.1437352245862886,
+ "grad_norm": 2.598778247833252,
+ "learning_rate": 3.6196642149073123e-06,
+ "loss": 0.4415,
+ "step": 4534
+ },
+ {
+ "epoch": 2.144208037825059,
+ "grad_norm": 2.443995714187622,
+ "learning_rate": 3.619106415825798e-06,
+ "loss": 0.3917,
+ "step": 4535
+ },
+ {
+ "epoch": 2.1446808510638298,
+ "grad_norm": 2.84643816947937,
+ "learning_rate": 3.6185485470624354e-06,
+ "loss": 0.4162,
+ "step": 4536
+ },
+ {
+ "epoch": 2.1451536643026006,
+ "grad_norm": 2.4568188190460205,
+ "learning_rate": 3.617990608651959e-06,
+ "loss": 0.4298,
+ "step": 4537
+ },
+ {
+ "epoch": 2.145626477541371,
+ "grad_norm": 2.968804359436035,
+ "learning_rate": 3.61743260062911e-06,
+ "loss": 0.4696,
+ "step": 4538
+ },
+ {
+ "epoch": 2.1460992907801417,
+ "grad_norm": 2.629075288772583,
+ "learning_rate": 3.6168745230286327e-06,
+ "loss": 0.4234,
+ "step": 4539
+ },
+ {
+ "epoch": 2.1465721040189125,
+ "grad_norm": 2.7680578231811523,
+ "learning_rate": 3.6163163758852754e-06,
+ "loss": 0.4669,
+ "step": 4540
+ },
+ {
+ "epoch": 2.1470449172576833,
+ "grad_norm": 2.782825469970703,
+ "learning_rate": 3.615758159233793e-06,
+ "loss": 0.4552,
+ "step": 4541
+ },
+ {
+ "epoch": 2.147517730496454,
+ "grad_norm": 2.653047561645508,
+ "learning_rate": 3.615199873108942e-06,
+ "loss": 0.4393,
+ "step": 4542
+ },
+ {
+ "epoch": 2.1479905437352245,
+ "grad_norm": 2.4175806045532227,
+ "learning_rate": 3.6146415175454852e-06,
+ "loss": 0.4114,
+ "step": 4543
+ },
+ {
+ "epoch": 2.1484633569739953,
+ "grad_norm": 2.627943515777588,
+ "learning_rate": 3.614083092578189e-06,
+ "loss": 0.4215,
+ "step": 4544
+ },
+ {
+ "epoch": 2.148936170212766,
+ "grad_norm": 2.8934123516082764,
+ "learning_rate": 3.6135245982418227e-06,
+ "loss": 0.4815,
+ "step": 4545
+ },
+ {
+ "epoch": 2.1494089834515364,
+ "grad_norm": 2.8535244464874268,
+ "learning_rate": 3.612966034571164e-06,
+ "loss": 0.4683,
+ "step": 4546
+ },
+ {
+ "epoch": 2.149881796690307,
+ "grad_norm": 2.7826647758483887,
+ "learning_rate": 3.6124074016009893e-06,
+ "loss": 0.4351,
+ "step": 4547
+ },
+ {
+ "epoch": 2.150354609929078,
+ "grad_norm": 2.6906018257141113,
+ "learning_rate": 3.6118486993660834e-06,
+ "loss": 0.4585,
+ "step": 4548
+ },
+ {
+ "epoch": 2.150827423167849,
+ "grad_norm": 2.726766586303711,
+ "learning_rate": 3.6112899279012346e-06,
+ "loss": 0.4753,
+ "step": 4549
+ },
+ {
+ "epoch": 2.1513002364066196,
+ "grad_norm": 3.0193991661071777,
+ "learning_rate": 3.6107310872412348e-06,
+ "loss": 0.4827,
+ "step": 4550
+ },
+ {
+ "epoch": 2.15177304964539,
+ "grad_norm": 2.6788697242736816,
+ "learning_rate": 3.610172177420881e-06,
+ "loss": 0.4333,
+ "step": 4551
+ },
+ {
+ "epoch": 2.1522458628841608,
+ "grad_norm": 2.865410327911377,
+ "learning_rate": 3.609613198474973e-06,
+ "loss": 0.4569,
+ "step": 4552
+ },
+ {
+ "epoch": 2.1527186761229316,
+ "grad_norm": 2.9199366569519043,
+ "learning_rate": 3.609054150438317e-06,
+ "loss": 0.5097,
+ "step": 4553
+ },
+ {
+ "epoch": 2.153191489361702,
+ "grad_norm": 2.761035203933716,
+ "learning_rate": 3.6084950333457215e-06,
+ "loss": 0.5002,
+ "step": 4554
+ },
+ {
+ "epoch": 2.1536643026004727,
+ "grad_norm": 2.514223337173462,
+ "learning_rate": 3.607935847232002e-06,
+ "loss": 0.4171,
+ "step": 4555
+ },
+ {
+ "epoch": 2.1541371158392435,
+ "grad_norm": 2.5167524814605713,
+ "learning_rate": 3.6073765921319747e-06,
+ "loss": 0.4494,
+ "step": 4556
+ },
+ {
+ "epoch": 2.1546099290780143,
+ "grad_norm": 2.7540643215179443,
+ "learning_rate": 3.606817268080463e-06,
+ "loss": 0.4472,
+ "step": 4557
+ },
+ {
+ "epoch": 2.155082742316785,
+ "grad_norm": 2.7728664875030518,
+ "learning_rate": 3.6062578751122936e-06,
+ "loss": 0.4669,
+ "step": 4558
+ },
+ {
+ "epoch": 2.1555555555555554,
+ "grad_norm": 2.7788400650024414,
+ "learning_rate": 3.605698413262296e-06,
+ "loss": 0.4613,
+ "step": 4559
+ },
+ {
+ "epoch": 2.1560283687943262,
+ "grad_norm": 2.7811810970306396,
+ "learning_rate": 3.605138882565308e-06,
+ "loss": 0.4242,
+ "step": 4560
+ },
+ {
+ "epoch": 2.156501182033097,
+ "grad_norm": 2.7819995880126953,
+ "learning_rate": 3.6045792830561664e-06,
+ "loss": 0.443,
+ "step": 4561
+ },
+ {
+ "epoch": 2.1569739952718674,
+ "grad_norm": 2.671259641647339,
+ "learning_rate": 3.6040196147697166e-06,
+ "loss": 0.4336,
+ "step": 4562
+ },
+ {
+ "epoch": 2.157446808510638,
+ "grad_norm": 2.9296300411224365,
+ "learning_rate": 3.603459877740807e-06,
+ "loss": 0.479,
+ "step": 4563
+ },
+ {
+ "epoch": 2.157919621749409,
+ "grad_norm": 2.834937334060669,
+ "learning_rate": 3.602900072004289e-06,
+ "loss": 0.4603,
+ "step": 4564
+ },
+ {
+ "epoch": 2.15839243498818,
+ "grad_norm": 2.8434760570526123,
+ "learning_rate": 3.602340197595019e-06,
+ "loss": 0.4288,
+ "step": 4565
+ },
+ {
+ "epoch": 2.1588652482269506,
+ "grad_norm": 2.7245426177978516,
+ "learning_rate": 3.6017802545478593e-06,
+ "loss": 0.4194,
+ "step": 4566
+ },
+ {
+ "epoch": 2.159338061465721,
+ "grad_norm": 2.7795023918151855,
+ "learning_rate": 3.6012202428976735e-06,
+ "loss": 0.4481,
+ "step": 4567
+ },
+ {
+ "epoch": 2.1598108747044917,
+ "grad_norm": 2.9482083320617676,
+ "learning_rate": 3.6006601626793325e-06,
+ "loss": 0.468,
+ "step": 4568
+ },
+ {
+ "epoch": 2.1602836879432625,
+ "grad_norm": 2.9563326835632324,
+ "learning_rate": 3.6001000139277094e-06,
+ "loss": 0.4427,
+ "step": 4569
+ },
+ {
+ "epoch": 2.160756501182033,
+ "grad_norm": 2.7755916118621826,
+ "learning_rate": 3.599539796677682e-06,
+ "loss": 0.4258,
+ "step": 4570
+ },
+ {
+ "epoch": 2.1612293144208037,
+ "grad_norm": 2.961045265197754,
+ "learning_rate": 3.5989795109641333e-06,
+ "loss": 0.4645,
+ "step": 4571
+ },
+ {
+ "epoch": 2.1617021276595745,
+ "grad_norm": 3.0184407234191895,
+ "learning_rate": 3.5984191568219482e-06,
+ "loss": 0.4192,
+ "step": 4572
+ },
+ {
+ "epoch": 2.1621749408983453,
+ "grad_norm": 2.9811131954193115,
+ "learning_rate": 3.5978587342860192e-06,
+ "loss": 0.408,
+ "step": 4573
+ },
+ {
+ "epoch": 2.162647754137116,
+ "grad_norm": 2.9172329902648926,
+ "learning_rate": 3.597298243391242e-06,
+ "loss": 0.4528,
+ "step": 4574
+ },
+ {
+ "epoch": 2.1631205673758864,
+ "grad_norm": 2.7798452377319336,
+ "learning_rate": 3.596737684172513e-06,
+ "loss": 0.391,
+ "step": 4575
+ },
+ {
+ "epoch": 2.1635933806146572,
+ "grad_norm": 2.526277542114258,
+ "learning_rate": 3.596177056664738e-06,
+ "loss": 0.3699,
+ "step": 4576
+ },
+ {
+ "epoch": 2.164066193853428,
+ "grad_norm": 2.856269121170044,
+ "learning_rate": 3.5956163609028244e-06,
+ "loss": 0.4082,
+ "step": 4577
+ },
+ {
+ "epoch": 2.1645390070921984,
+ "grad_norm": 2.7681572437286377,
+ "learning_rate": 3.5950555969216845e-06,
+ "loss": 0.4064,
+ "step": 4578
+ },
+ {
+ "epoch": 2.165011820330969,
+ "grad_norm": 2.2924954891204834,
+ "learning_rate": 3.5944947647562333e-06,
+ "loss": 0.416,
+ "step": 4579
+ },
+ {
+ "epoch": 2.16548463356974,
+ "grad_norm": 2.439929485321045,
+ "learning_rate": 3.5939338644413936e-06,
+ "loss": 0.4476,
+ "step": 4580
+ },
+ {
+ "epoch": 2.1659574468085108,
+ "grad_norm": 2.786442518234253,
+ "learning_rate": 3.5933728960120877e-06,
+ "loss": 0.4525,
+ "step": 4581
+ },
+ {
+ "epoch": 2.166430260047281,
+ "grad_norm": 2.5910253524780273,
+ "learning_rate": 3.5928118595032465e-06,
+ "loss": 0.4441,
+ "step": 4582
+ },
+ {
+ "epoch": 2.166903073286052,
+ "grad_norm": 2.8144876956939697,
+ "learning_rate": 3.5922507549498024e-06,
+ "loss": 0.497,
+ "step": 4583
+ },
+ {
+ "epoch": 2.1673758865248227,
+ "grad_norm": 2.5714170932769775,
+ "learning_rate": 3.591689582386694e-06,
+ "loss": 0.4625,
+ "step": 4584
+ },
+ {
+ "epoch": 2.1678486997635935,
+ "grad_norm": 2.878187894821167,
+ "learning_rate": 3.591128341848861e-06,
+ "loss": 0.4835,
+ "step": 4585
+ },
+ {
+ "epoch": 2.168321513002364,
+ "grad_norm": 2.4946508407592773,
+ "learning_rate": 3.5905670333712504e-06,
+ "loss": 0.4278,
+ "step": 4586
+ },
+ {
+ "epoch": 2.1687943262411347,
+ "grad_norm": 2.9186196327209473,
+ "learning_rate": 3.590005656988814e-06,
+ "loss": 0.465,
+ "step": 4587
+ },
+ {
+ "epoch": 2.1692671394799055,
+ "grad_norm": 3.136807441711426,
+ "learning_rate": 3.5894442127365046e-06,
+ "loss": 0.4146,
+ "step": 4588
+ },
+ {
+ "epoch": 2.1697399527186763,
+ "grad_norm": 2.8106343746185303,
+ "learning_rate": 3.5888827006492804e-06,
+ "loss": 0.4737,
+ "step": 4589
+ },
+ {
+ "epoch": 2.1702127659574466,
+ "grad_norm": 2.874553680419922,
+ "learning_rate": 3.5883211207621047e-06,
+ "loss": 0.3962,
+ "step": 4590
+ },
+ {
+ "epoch": 2.1706855791962174,
+ "grad_norm": 2.7914116382598877,
+ "learning_rate": 3.587759473109946e-06,
+ "loss": 0.4705,
+ "step": 4591
+ },
+ {
+ "epoch": 2.171158392434988,
+ "grad_norm": 2.7273290157318115,
+ "learning_rate": 3.5871977577277745e-06,
+ "loss": 0.4827,
+ "step": 4592
+ },
+ {
+ "epoch": 2.171631205673759,
+ "grad_norm": 2.4167256355285645,
+ "learning_rate": 3.5866359746505653e-06,
+ "loss": 0.4181,
+ "step": 4593
+ },
+ {
+ "epoch": 2.1721040189125294,
+ "grad_norm": 2.8929779529571533,
+ "learning_rate": 3.586074123913299e-06,
+ "loss": 0.4006,
+ "step": 4594
+ },
+ {
+ "epoch": 2.1725768321513,
+ "grad_norm": 2.6996190547943115,
+ "learning_rate": 3.5855122055509593e-06,
+ "loss": 0.4792,
+ "step": 4595
+ },
+ {
+ "epoch": 2.173049645390071,
+ "grad_norm": 2.9341464042663574,
+ "learning_rate": 3.584950219598534e-06,
+ "loss": 0.3903,
+ "step": 4596
+ },
+ {
+ "epoch": 2.1735224586288417,
+ "grad_norm": 2.799330234527588,
+ "learning_rate": 3.5843881660910166e-06,
+ "loss": 0.4717,
+ "step": 4597
+ },
+ {
+ "epoch": 2.173995271867612,
+ "grad_norm": 2.5028693675994873,
+ "learning_rate": 3.5838260450634028e-06,
+ "loss": 0.4462,
+ "step": 4598
+ },
+ {
+ "epoch": 2.174468085106383,
+ "grad_norm": 2.5845541954040527,
+ "learning_rate": 3.583263856550693e-06,
+ "loss": 0.4327,
+ "step": 4599
+ },
+ {
+ "epoch": 2.1749408983451537,
+ "grad_norm": 2.4804906845092773,
+ "learning_rate": 3.5827016005878933e-06,
+ "loss": 0.4555,
+ "step": 4600
+ },
+ {
+ "epoch": 2.1754137115839245,
+ "grad_norm": 2.625746011734009,
+ "learning_rate": 3.5821392772100125e-06,
+ "loss": 0.455,
+ "step": 4601
+ },
+ {
+ "epoch": 2.175886524822695,
+ "grad_norm": 2.6230757236480713,
+ "learning_rate": 3.581576886452064e-06,
+ "loss": 0.4422,
+ "step": 4602
+ },
+ {
+ "epoch": 2.1763593380614656,
+ "grad_norm": 3.3104100227355957,
+ "learning_rate": 3.5810144283490656e-06,
+ "loss": 0.4212,
+ "step": 4603
+ },
+ {
+ "epoch": 2.1768321513002364,
+ "grad_norm": 2.6799755096435547,
+ "learning_rate": 3.5804519029360384e-06,
+ "loss": 0.4575,
+ "step": 4604
+ },
+ {
+ "epoch": 2.1773049645390072,
+ "grad_norm": 2.462216854095459,
+ "learning_rate": 3.5798893102480085e-06,
+ "loss": 0.4096,
+ "step": 4605
+ },
+ {
+ "epoch": 2.1777777777777776,
+ "grad_norm": 2.8600878715515137,
+ "learning_rate": 3.5793266503200074e-06,
+ "loss": 0.4798,
+ "step": 4606
+ },
+ {
+ "epoch": 2.1782505910165484,
+ "grad_norm": 2.935746431350708,
+ "learning_rate": 3.5787639231870673e-06,
+ "loss": 0.4021,
+ "step": 4607
+ },
+ {
+ "epoch": 2.178723404255319,
+ "grad_norm": 2.8655526638031006,
+ "learning_rate": 3.578201128884229e-06,
+ "loss": 0.4553,
+ "step": 4608
+ },
+ {
+ "epoch": 2.17919621749409,
+ "grad_norm": 3.219498634338379,
+ "learning_rate": 3.577638267446533e-06,
+ "loss": 0.4692,
+ "step": 4609
+ },
+ {
+ "epoch": 2.1796690307328603,
+ "grad_norm": 3.0449860095977783,
+ "learning_rate": 3.5770753389090283e-06,
+ "loss": 0.4675,
+ "step": 4610
+ },
+ {
+ "epoch": 2.180141843971631,
+ "grad_norm": 2.7045507431030273,
+ "learning_rate": 3.576512343306765e-06,
+ "loss": 0.4773,
+ "step": 4611
+ },
+ {
+ "epoch": 2.180614657210402,
+ "grad_norm": 2.601499557495117,
+ "learning_rate": 3.5759492806747985e-06,
+ "loss": 0.4112,
+ "step": 4612
+ },
+ {
+ "epoch": 2.1810874704491727,
+ "grad_norm": 2.987741470336914,
+ "learning_rate": 3.575386151048188e-06,
+ "loss": 0.4651,
+ "step": 4613
+ },
+ {
+ "epoch": 2.181560283687943,
+ "grad_norm": 2.961228847503662,
+ "learning_rate": 3.5748229544619973e-06,
+ "loss": 0.5116,
+ "step": 4614
+ },
+ {
+ "epoch": 2.182033096926714,
+ "grad_norm": 2.8008430004119873,
+ "learning_rate": 3.574259690951295e-06,
+ "loss": 0.4152,
+ "step": 4615
+ },
+ {
+ "epoch": 2.1825059101654847,
+ "grad_norm": 2.5429348945617676,
+ "learning_rate": 3.573696360551151e-06,
+ "loss": 0.4188,
+ "step": 4616
+ },
+ {
+ "epoch": 2.1829787234042555,
+ "grad_norm": 2.9566478729248047,
+ "learning_rate": 3.5731329632966428e-06,
+ "loss": 0.5156,
+ "step": 4617
+ },
+ {
+ "epoch": 2.183451536643026,
+ "grad_norm": 2.5302467346191406,
+ "learning_rate": 3.572569499222851e-06,
+ "loss": 0.4361,
+ "step": 4618
+ },
+ {
+ "epoch": 2.1839243498817966,
+ "grad_norm": 3.206803560256958,
+ "learning_rate": 3.5720059683648593e-06,
+ "loss": 0.5149,
+ "step": 4619
+ },
+ {
+ "epoch": 2.1843971631205674,
+ "grad_norm": 2.9432034492492676,
+ "learning_rate": 3.5714423707577573e-06,
+ "loss": 0.4411,
+ "step": 4620
+ },
+ {
+ "epoch": 2.184869976359338,
+ "grad_norm": 2.9412078857421875,
+ "learning_rate": 3.5708787064366358e-06,
+ "loss": 0.4372,
+ "step": 4621
+ },
+ {
+ "epoch": 2.1853427895981086,
+ "grad_norm": 3.1702330112457275,
+ "learning_rate": 3.5703149754365935e-06,
+ "loss": 0.4761,
+ "step": 4622
+ },
+ {
+ "epoch": 2.1858156028368794,
+ "grad_norm": 3.1240456104278564,
+ "learning_rate": 3.569751177792731e-06,
+ "loss": 0.4854,
+ "step": 4623
+ },
+ {
+ "epoch": 2.18628841607565,
+ "grad_norm": 2.7221994400024414,
+ "learning_rate": 3.5691873135401534e-06,
+ "loss": 0.4048,
+ "step": 4624
+ },
+ {
+ "epoch": 2.186761229314421,
+ "grad_norm": 2.74397873878479,
+ "learning_rate": 3.5686233827139695e-06,
+ "loss": 0.4747,
+ "step": 4625
+ },
+ {
+ "epoch": 2.1872340425531913,
+ "grad_norm": 2.7379889488220215,
+ "learning_rate": 3.5680593853492932e-06,
+ "loss": 0.4963,
+ "step": 4626
+ },
+ {
+ "epoch": 2.187706855791962,
+ "grad_norm": 3.040205478668213,
+ "learning_rate": 3.5674953214812435e-06,
+ "loss": 0.4917,
+ "step": 4627
+ },
+ {
+ "epoch": 2.188179669030733,
+ "grad_norm": 2.95302677154541,
+ "learning_rate": 3.56693119114494e-06,
+ "loss": 0.4758,
+ "step": 4628
+ },
+ {
+ "epoch": 2.1886524822695037,
+ "grad_norm": 2.5488312244415283,
+ "learning_rate": 3.56636699437551e-06,
+ "loss": 0.4057,
+ "step": 4629
+ },
+ {
+ "epoch": 2.189125295508274,
+ "grad_norm": 2.8379666805267334,
+ "learning_rate": 3.565802731208083e-06,
+ "loss": 0.4755,
+ "step": 4630
+ },
+ {
+ "epoch": 2.189598108747045,
+ "grad_norm": 2.8765869140625,
+ "learning_rate": 3.565238401677793e-06,
+ "loss": 0.4232,
+ "step": 4631
+ },
+ {
+ "epoch": 2.1900709219858157,
+ "grad_norm": 2.9091262817382812,
+ "learning_rate": 3.5646740058197784e-06,
+ "loss": 0.3874,
+ "step": 4632
+ },
+ {
+ "epoch": 2.1905437352245865,
+ "grad_norm": 2.7067387104034424,
+ "learning_rate": 3.5641095436691826e-06,
+ "loss": 0.4771,
+ "step": 4633
+ },
+ {
+ "epoch": 2.191016548463357,
+ "grad_norm": 2.403043508529663,
+ "learning_rate": 3.563545015261151e-06,
+ "loss": 0.4062,
+ "step": 4634
+ },
+ {
+ "epoch": 2.1914893617021276,
+ "grad_norm": 2.8059732913970947,
+ "learning_rate": 3.562980420630836e-06,
+ "loss": 0.4635,
+ "step": 4635
+ },
+ {
+ "epoch": 2.1919621749408984,
+ "grad_norm": 2.5467724800109863,
+ "learning_rate": 3.56241575981339e-06,
+ "loss": 0.4552,
+ "step": 4636
+ },
+ {
+ "epoch": 2.192434988179669,
+ "grad_norm": 2.651024103164673,
+ "learning_rate": 3.561851032843973e-06,
+ "loss": 0.38,
+ "step": 4637
+ },
+ {
+ "epoch": 2.1929078014184396,
+ "grad_norm": 2.5529849529266357,
+ "learning_rate": 3.5612862397577496e-06,
+ "loss": 0.4106,
+ "step": 4638
+ },
+ {
+ "epoch": 2.1933806146572103,
+ "grad_norm": 3.069258451461792,
+ "learning_rate": 3.5607213805898844e-06,
+ "loss": 0.461,
+ "step": 4639
+ },
+ {
+ "epoch": 2.193853427895981,
+ "grad_norm": 2.5652637481689453,
+ "learning_rate": 3.56015645537555e-06,
+ "loss": 0.4497,
+ "step": 4640
+ },
+ {
+ "epoch": 2.194326241134752,
+ "grad_norm": 2.699101209640503,
+ "learning_rate": 3.5595914641499224e-06,
+ "loss": 0.4887,
+ "step": 4641
+ },
+ {
+ "epoch": 2.1947990543735223,
+ "grad_norm": 2.9292235374450684,
+ "learning_rate": 3.5590264069481805e-06,
+ "loss": 0.4462,
+ "step": 4642
+ },
+ {
+ "epoch": 2.195271867612293,
+ "grad_norm": 2.6151106357574463,
+ "learning_rate": 3.5584612838055077e-06,
+ "loss": 0.4334,
+ "step": 4643
+ },
+ {
+ "epoch": 2.195744680851064,
+ "grad_norm": 2.895798444747925,
+ "learning_rate": 3.5578960947570923e-06,
+ "loss": 0.4448,
+ "step": 4644
+ },
+ {
+ "epoch": 2.1962174940898347,
+ "grad_norm": 2.627631425857544,
+ "learning_rate": 3.557330839838125e-06,
+ "loss": 0.436,
+ "step": 4645
+ },
+ {
+ "epoch": 2.196690307328605,
+ "grad_norm": 2.8803584575653076,
+ "learning_rate": 3.556765519083803e-06,
+ "loss": 0.4698,
+ "step": 4646
+ },
+ {
+ "epoch": 2.197163120567376,
+ "grad_norm": 2.436609983444214,
+ "learning_rate": 3.5562001325293265e-06,
+ "loss": 0.4043,
+ "step": 4647
+ },
+ {
+ "epoch": 2.1976359338061466,
+ "grad_norm": 2.5090718269348145,
+ "learning_rate": 3.5556346802098985e-06,
+ "loss": 0.4505,
+ "step": 4648
+ },
+ {
+ "epoch": 2.1981087470449174,
+ "grad_norm": 2.792783737182617,
+ "learning_rate": 3.5550691621607277e-06,
+ "loss": 0.43,
+ "step": 4649
+ },
+ {
+ "epoch": 2.198581560283688,
+ "grad_norm": 2.74153470993042,
+ "learning_rate": 3.554503578417026e-06,
+ "loss": 0.4496,
+ "step": 4650
+ },
+ {
+ "epoch": 2.1990543735224586,
+ "grad_norm": 3.0262627601623535,
+ "learning_rate": 3.5539379290140114e-06,
+ "loss": 0.4503,
+ "step": 4651
+ },
+ {
+ "epoch": 2.1995271867612294,
+ "grad_norm": 2.783811330795288,
+ "learning_rate": 3.553372213986903e-06,
+ "loss": 0.432,
+ "step": 4652
+ },
+ {
+ "epoch": 2.2,
+ "grad_norm": 3.091191053390503,
+ "learning_rate": 3.5528064333709255e-06,
+ "loss": 0.4658,
+ "step": 4653
+ },
+ {
+ "epoch": 2.2004728132387705,
+ "grad_norm": 2.814634084701538,
+ "learning_rate": 3.5522405872013076e-06,
+ "loss": 0.4473,
+ "step": 4654
+ },
+ {
+ "epoch": 2.2009456264775413,
+ "grad_norm": 2.6918299198150635,
+ "learning_rate": 3.5516746755132824e-06,
+ "loss": 0.5323,
+ "step": 4655
+ },
+ {
+ "epoch": 2.201418439716312,
+ "grad_norm": 2.9902455806732178,
+ "learning_rate": 3.5511086983420867e-06,
+ "loss": 0.5166,
+ "step": 4656
+ },
+ {
+ "epoch": 2.201891252955083,
+ "grad_norm": 2.932699203491211,
+ "learning_rate": 3.5505426557229616e-06,
+ "loss": 0.5197,
+ "step": 4657
+ },
+ {
+ "epoch": 2.2023640661938533,
+ "grad_norm": 2.585712432861328,
+ "learning_rate": 3.549976547691152e-06,
+ "loss": 0.425,
+ "step": 4658
+ },
+ {
+ "epoch": 2.202836879432624,
+ "grad_norm": 3.1019949913024902,
+ "learning_rate": 3.5494103742819065e-06,
+ "loss": 0.485,
+ "step": 4659
+ },
+ {
+ "epoch": 2.203309692671395,
+ "grad_norm": 2.3169195652008057,
+ "learning_rate": 3.548844135530478e-06,
+ "loss": 0.4064,
+ "step": 4660
+ },
+ {
+ "epoch": 2.2037825059101657,
+ "grad_norm": 2.779240846633911,
+ "learning_rate": 3.5482778314721257e-06,
+ "loss": 0.427,
+ "step": 4661
+ },
+ {
+ "epoch": 2.204255319148936,
+ "grad_norm": 2.765423059463501,
+ "learning_rate": 3.5477114621421078e-06,
+ "loss": 0.5125,
+ "step": 4662
+ },
+ {
+ "epoch": 2.204728132387707,
+ "grad_norm": 2.5590033531188965,
+ "learning_rate": 3.5471450275756913e-06,
+ "loss": 0.4009,
+ "step": 4663
+ },
+ {
+ "epoch": 2.2052009456264776,
+ "grad_norm": 2.706068515777588,
+ "learning_rate": 3.546578527808146e-06,
+ "loss": 0.4604,
+ "step": 4664
+ },
+ {
+ "epoch": 2.2056737588652484,
+ "grad_norm": 2.7995102405548096,
+ "learning_rate": 3.546011962874745e-06,
+ "loss": 0.4088,
+ "step": 4665
+ },
+ {
+ "epoch": 2.2061465721040188,
+ "grad_norm": 2.6369729042053223,
+ "learning_rate": 3.5454453328107656e-06,
+ "loss": 0.4634,
+ "step": 4666
+ },
+ {
+ "epoch": 2.2066193853427896,
+ "grad_norm": 3.1426475048065186,
+ "learning_rate": 3.54487863765149e-06,
+ "loss": 0.4761,
+ "step": 4667
+ },
+ {
+ "epoch": 2.2070921985815604,
+ "grad_norm": 2.7739460468292236,
+ "learning_rate": 3.5443118774322027e-06,
+ "loss": 0.467,
+ "step": 4668
+ },
+ {
+ "epoch": 2.207565011820331,
+ "grad_norm": 2.559105157852173,
+ "learning_rate": 3.5437450521881934e-06,
+ "loss": 0.4268,
+ "step": 4669
+ },
+ {
+ "epoch": 2.2080378250591015,
+ "grad_norm": 2.726593017578125,
+ "learning_rate": 3.543178161954758e-06,
+ "loss": 0.462,
+ "step": 4670
+ },
+ {
+ "epoch": 2.2085106382978723,
+ "grad_norm": 2.796109199523926,
+ "learning_rate": 3.5426112067671907e-06,
+ "loss": 0.4571,
+ "step": 4671
+ },
+ {
+ "epoch": 2.208983451536643,
+ "grad_norm": 2.7989072799682617,
+ "learning_rate": 3.5420441866607964e-06,
+ "loss": 0.4648,
+ "step": 4672
+ },
+ {
+ "epoch": 2.209456264775414,
+ "grad_norm": 2.6750967502593994,
+ "learning_rate": 3.5414771016708795e-06,
+ "loss": 0.4717,
+ "step": 4673
+ },
+ {
+ "epoch": 2.2099290780141843,
+ "grad_norm": 2.705659866333008,
+ "learning_rate": 3.5409099518327507e-06,
+ "loss": 0.4738,
+ "step": 4674
+ },
+ {
+ "epoch": 2.210401891252955,
+ "grad_norm": 2.79276442527771,
+ "learning_rate": 3.5403427371817234e-06,
+ "loss": 0.4625,
+ "step": 4675
+ },
+ {
+ "epoch": 2.210874704491726,
+ "grad_norm": 2.781339406967163,
+ "learning_rate": 3.539775457753115e-06,
+ "loss": 0.438,
+ "step": 4676
+ },
+ {
+ "epoch": 2.2113475177304966,
+ "grad_norm": 3.0088918209075928,
+ "learning_rate": 3.5392081135822488e-06,
+ "loss": 0.4776,
+ "step": 4677
+ },
+ {
+ "epoch": 2.211820330969267,
+ "grad_norm": 3.0291390419006348,
+ "learning_rate": 3.538640704704449e-06,
+ "loss": 0.4634,
+ "step": 4678
+ },
+ {
+ "epoch": 2.212293144208038,
+ "grad_norm": 2.967867374420166,
+ "learning_rate": 3.5380732311550477e-06,
+ "loss": 0.4776,
+ "step": 4679
+ },
+ {
+ "epoch": 2.2127659574468086,
+ "grad_norm": 2.6268832683563232,
+ "learning_rate": 3.5375056929693787e-06,
+ "loss": 0.4646,
+ "step": 4680
+ },
+ {
+ "epoch": 2.2132387706855794,
+ "grad_norm": 2.6688554286956787,
+ "learning_rate": 3.536938090182778e-06,
+ "loss": 0.3975,
+ "step": 4681
+ },
+ {
+ "epoch": 2.2137115839243497,
+ "grad_norm": 3.0079736709594727,
+ "learning_rate": 3.5363704228305906e-06,
+ "loss": 0.4724,
+ "step": 4682
+ },
+ {
+ "epoch": 2.2141843971631205,
+ "grad_norm": 2.4287586212158203,
+ "learning_rate": 3.535802690948161e-06,
+ "loss": 0.4371,
+ "step": 4683
+ },
+ {
+ "epoch": 2.2146572104018913,
+ "grad_norm": 2.960679531097412,
+ "learning_rate": 3.53523489457084e-06,
+ "loss": 0.4347,
+ "step": 4684
+ },
+ {
+ "epoch": 2.215130023640662,
+ "grad_norm": 2.9646008014678955,
+ "learning_rate": 3.5346670337339807e-06,
+ "loss": 0.4803,
+ "step": 4685
+ },
+ {
+ "epoch": 2.2156028368794325,
+ "grad_norm": 3.0518898963928223,
+ "learning_rate": 3.534099108472942e-06,
+ "loss": 0.4712,
+ "step": 4686
+ },
+ {
+ "epoch": 2.2160756501182033,
+ "grad_norm": 2.776681900024414,
+ "learning_rate": 3.533531118823086e-06,
+ "loss": 0.4347,
+ "step": 4687
+ },
+ {
+ "epoch": 2.216548463356974,
+ "grad_norm": 2.18019437789917,
+ "learning_rate": 3.53296306481978e-06,
+ "loss": 0.3551,
+ "step": 4688
+ },
+ {
+ "epoch": 2.217021276595745,
+ "grad_norm": 2.9400811195373535,
+ "learning_rate": 3.5323949464983937e-06,
+ "loss": 0.4912,
+ "step": 4689
+ },
+ {
+ "epoch": 2.2174940898345152,
+ "grad_norm": 2.798386812210083,
+ "learning_rate": 3.5318267638943e-06,
+ "loss": 0.3967,
+ "step": 4690
+ },
+ {
+ "epoch": 2.217966903073286,
+ "grad_norm": 2.5452775955200195,
+ "learning_rate": 3.531258517042879e-06,
+ "loss": 0.3773,
+ "step": 4691
+ },
+ {
+ "epoch": 2.218439716312057,
+ "grad_norm": 2.711137294769287,
+ "learning_rate": 3.5306902059795113e-06,
+ "loss": 0.4123,
+ "step": 4692
+ },
+ {
+ "epoch": 2.2189125295508276,
+ "grad_norm": 3.0022387504577637,
+ "learning_rate": 3.530121830739584e-06,
+ "loss": 0.4898,
+ "step": 4693
+ },
+ {
+ "epoch": 2.219385342789598,
+ "grad_norm": 2.871814250946045,
+ "learning_rate": 3.5295533913584877e-06,
+ "loss": 0.4497,
+ "step": 4694
+ },
+ {
+ "epoch": 2.219858156028369,
+ "grad_norm": 2.9782521724700928,
+ "learning_rate": 3.528984887871616e-06,
+ "loss": 0.4797,
+ "step": 4695
+ },
+ {
+ "epoch": 2.2203309692671396,
+ "grad_norm": 2.6896398067474365,
+ "learning_rate": 3.5284163203143673e-06,
+ "loss": 0.439,
+ "step": 4696
+ },
+ {
+ "epoch": 2.2208037825059104,
+ "grad_norm": 2.7898833751678467,
+ "learning_rate": 3.5278476887221436e-06,
+ "loss": 0.4656,
+ "step": 4697
+ },
+ {
+ "epoch": 2.2212765957446807,
+ "grad_norm": 2.800416946411133,
+ "learning_rate": 3.527278993130352e-06,
+ "loss": 0.4452,
+ "step": 4698
+ },
+ {
+ "epoch": 2.2217494089834515,
+ "grad_norm": 3.653228998184204,
+ "learning_rate": 3.526710233574401e-06,
+ "loss": 0.4189,
+ "step": 4699
+ },
+ {
+ "epoch": 2.2222222222222223,
+ "grad_norm": 2.856956958770752,
+ "learning_rate": 3.5261414100897064e-06,
+ "loss": 0.4298,
+ "step": 4700
+ },
+ {
+ "epoch": 2.222695035460993,
+ "grad_norm": 2.8576223850250244,
+ "learning_rate": 3.5255725227116854e-06,
+ "loss": 0.4425,
+ "step": 4701
+ },
+ {
+ "epoch": 2.2231678486997635,
+ "grad_norm": 3.1161351203918457,
+ "learning_rate": 3.5250035714757603e-06,
+ "loss": 0.4609,
+ "step": 4702
+ },
+ {
+ "epoch": 2.2236406619385343,
+ "grad_norm": 2.843379259109497,
+ "learning_rate": 3.5244345564173578e-06,
+ "loss": 0.3589,
+ "step": 4703
+ },
+ {
+ "epoch": 2.224113475177305,
+ "grad_norm": 2.877157211303711,
+ "learning_rate": 3.5238654775719068e-06,
+ "loss": 0.4591,
+ "step": 4704
+ },
+ {
+ "epoch": 2.2245862884160754,
+ "grad_norm": 3.488954782485962,
+ "learning_rate": 3.5232963349748424e-06,
+ "loss": 0.4836,
+ "step": 4705
+ },
+ {
+ "epoch": 2.225059101654846,
+ "grad_norm": 2.929037570953369,
+ "learning_rate": 3.5227271286616025e-06,
+ "loss": 0.5293,
+ "step": 4706
+ },
+ {
+ "epoch": 2.225531914893617,
+ "grad_norm": 2.6230576038360596,
+ "learning_rate": 3.5221578586676286e-06,
+ "loss": 0.4235,
+ "step": 4707
+ },
+ {
+ "epoch": 2.226004728132388,
+ "grad_norm": 2.529998302459717,
+ "learning_rate": 3.5215885250283664e-06,
+ "loss": 0.4369,
+ "step": 4708
+ },
+ {
+ "epoch": 2.2264775413711586,
+ "grad_norm": 2.817279577255249,
+ "learning_rate": 3.521019127779267e-06,
+ "loss": 0.481,
+ "step": 4709
+ },
+ {
+ "epoch": 2.226950354609929,
+ "grad_norm": 3.1513843536376953,
+ "learning_rate": 3.5204496669557833e-06,
+ "loss": 0.463,
+ "step": 4710
+ },
+ {
+ "epoch": 2.2274231678486998,
+ "grad_norm": 2.9403610229492188,
+ "learning_rate": 3.5198801425933725e-06,
+ "loss": 0.455,
+ "step": 4711
+ },
+ {
+ "epoch": 2.2278959810874706,
+ "grad_norm": 2.648346424102783,
+ "learning_rate": 3.5193105547274987e-06,
+ "loss": 0.4441,
+ "step": 4712
+ },
+ {
+ "epoch": 2.228368794326241,
+ "grad_norm": 2.791898727416992,
+ "learning_rate": 3.5187409033936252e-06,
+ "loss": 0.4682,
+ "step": 4713
+ },
+ {
+ "epoch": 2.2288416075650117,
+ "grad_norm": 2.8157432079315186,
+ "learning_rate": 3.5181711886272242e-06,
+ "loss": 0.4572,
+ "step": 4714
+ },
+ {
+ "epoch": 2.2293144208037825,
+ "grad_norm": 3.250319480895996,
+ "learning_rate": 3.5176014104637665e-06,
+ "loss": 0.4599,
+ "step": 4715
+ },
+ {
+ "epoch": 2.2297872340425533,
+ "grad_norm": 2.6747050285339355,
+ "learning_rate": 3.5170315689387307e-06,
+ "loss": 0.4328,
+ "step": 4716
+ },
+ {
+ "epoch": 2.230260047281324,
+ "grad_norm": 2.584094762802124,
+ "learning_rate": 3.5164616640875993e-06,
+ "loss": 0.4268,
+ "step": 4717
+ },
+ {
+ "epoch": 2.2307328605200945,
+ "grad_norm": 2.480710506439209,
+ "learning_rate": 3.5158916959458573e-06,
+ "loss": 0.438,
+ "step": 4718
+ },
+ {
+ "epoch": 2.2312056737588652,
+ "grad_norm": 2.9338483810424805,
+ "learning_rate": 3.515321664548993e-06,
+ "loss": 0.4937,
+ "step": 4719
+ },
+ {
+ "epoch": 2.231678486997636,
+ "grad_norm": 2.7880783081054688,
+ "learning_rate": 3.5147515699325013e-06,
+ "loss": 0.4624,
+ "step": 4720
+ },
+ {
+ "epoch": 2.2321513002364064,
+ "grad_norm": 2.740841865539551,
+ "learning_rate": 3.5141814121318797e-06,
+ "loss": 0.3689,
+ "step": 4721
+ },
+ {
+ "epoch": 2.232624113475177,
+ "grad_norm": 2.9541244506835938,
+ "learning_rate": 3.5136111911826277e-06,
+ "loss": 0.4092,
+ "step": 4722
+ },
+ {
+ "epoch": 2.233096926713948,
+ "grad_norm": 2.7205398082733154,
+ "learning_rate": 3.5130409071202515e-06,
+ "loss": 0.445,
+ "step": 4723
+ },
+ {
+ "epoch": 2.233569739952719,
+ "grad_norm": 2.563406229019165,
+ "learning_rate": 3.51247055998026e-06,
+ "loss": 0.4335,
+ "step": 4724
+ },
+ {
+ "epoch": 2.2340425531914896,
+ "grad_norm": 2.4249489307403564,
+ "learning_rate": 3.5119001497981666e-06,
+ "loss": 0.4671,
+ "step": 4725
+ },
+ {
+ "epoch": 2.23451536643026,
+ "grad_norm": 2.711630344390869,
+ "learning_rate": 3.5113296766094875e-06,
+ "loss": 0.4177,
+ "step": 4726
+ },
+ {
+ "epoch": 2.2349881796690307,
+ "grad_norm": 3.0257632732391357,
+ "learning_rate": 3.5107591404497443e-06,
+ "loss": 0.4976,
+ "step": 4727
+ },
+ {
+ "epoch": 2.2354609929078015,
+ "grad_norm": 2.717303991317749,
+ "learning_rate": 3.5101885413544614e-06,
+ "loss": 0.4621,
+ "step": 4728
+ },
+ {
+ "epoch": 2.235933806146572,
+ "grad_norm": 3.2846004962921143,
+ "learning_rate": 3.509617879359167e-06,
+ "loss": 0.4284,
+ "step": 4729
+ },
+ {
+ "epoch": 2.2364066193853427,
+ "grad_norm": 2.7217819690704346,
+ "learning_rate": 3.5090471544993953e-06,
+ "loss": 0.4247,
+ "step": 4730
+ },
+ {
+ "epoch": 2.2368794326241135,
+ "grad_norm": 2.5003223419189453,
+ "learning_rate": 3.5084763668106812e-06,
+ "loss": 0.4096,
+ "step": 4731
+ },
+ {
+ "epoch": 2.2373522458628843,
+ "grad_norm": 2.7312731742858887,
+ "learning_rate": 3.5079055163285658e-06,
+ "loss": 0.4741,
+ "step": 4732
+ },
+ {
+ "epoch": 2.237825059101655,
+ "grad_norm": 2.84940767288208,
+ "learning_rate": 3.5073346030885934e-06,
+ "loss": 0.4887,
+ "step": 4733
+ },
+ {
+ "epoch": 2.2382978723404254,
+ "grad_norm": 3.1188511848449707,
+ "learning_rate": 3.506763627126313e-06,
+ "loss": 0.5335,
+ "step": 4734
+ },
+ {
+ "epoch": 2.2387706855791962,
+ "grad_norm": 2.6741397380828857,
+ "learning_rate": 3.5061925884772753e-06,
+ "loss": 0.4137,
+ "step": 4735
+ },
+ {
+ "epoch": 2.239243498817967,
+ "grad_norm": 3.1542465686798096,
+ "learning_rate": 3.505621487177037e-06,
+ "loss": 0.5303,
+ "step": 4736
+ },
+ {
+ "epoch": 2.2397163120567374,
+ "grad_norm": 5.448268890380859,
+ "learning_rate": 3.505050323261159e-06,
+ "loss": 0.4995,
+ "step": 4737
+ },
+ {
+ "epoch": 2.240189125295508,
+ "grad_norm": 2.7317898273468018,
+ "learning_rate": 3.5044790967652037e-06,
+ "loss": 0.4595,
+ "step": 4738
+ },
+ {
+ "epoch": 2.240661938534279,
+ "grad_norm": 2.8135695457458496,
+ "learning_rate": 3.50390780772474e-06,
+ "loss": 0.4593,
+ "step": 4739
+ },
+ {
+ "epoch": 2.2411347517730498,
+ "grad_norm": 3.1391844749450684,
+ "learning_rate": 3.5033364561753393e-06,
+ "loss": 0.4902,
+ "step": 4740
+ },
+ {
+ "epoch": 2.24160756501182,
+ "grad_norm": 2.6383132934570312,
+ "learning_rate": 3.5027650421525762e-06,
+ "loss": 0.3832,
+ "step": 4741
+ },
+ {
+ "epoch": 2.242080378250591,
+ "grad_norm": 2.742546558380127,
+ "learning_rate": 3.5021935656920314e-06,
+ "loss": 0.4012,
+ "step": 4742
+ },
+ {
+ "epoch": 2.2425531914893617,
+ "grad_norm": 3.1243674755096436,
+ "learning_rate": 3.5016220268292873e-06,
+ "loss": 0.4271,
+ "step": 4743
+ },
+ {
+ "epoch": 2.2430260047281325,
+ "grad_norm": 2.794717788696289,
+ "learning_rate": 3.501050425599932e-06,
+ "loss": 0.4604,
+ "step": 4744
+ },
+ {
+ "epoch": 2.243498817966903,
+ "grad_norm": 2.8481621742248535,
+ "learning_rate": 3.5004787620395565e-06,
+ "loss": 0.4814,
+ "step": 4745
+ },
+ {
+ "epoch": 2.2439716312056737,
+ "grad_norm": 2.8842051029205322,
+ "learning_rate": 3.499907036183755e-06,
+ "loss": 0.4987,
+ "step": 4746
+ },
+ {
+ "epoch": 2.2444444444444445,
+ "grad_norm": 3.074805974960327,
+ "learning_rate": 3.4993352480681265e-06,
+ "loss": 0.4966,
+ "step": 4747
+ },
+ {
+ "epoch": 2.2449172576832153,
+ "grad_norm": 2.7204246520996094,
+ "learning_rate": 3.4987633977282742e-06,
+ "loss": 0.4,
+ "step": 4748
+ },
+ {
+ "epoch": 2.2453900709219856,
+ "grad_norm": 2.685884952545166,
+ "learning_rate": 3.4981914851998055e-06,
+ "loss": 0.4285,
+ "step": 4749
+ },
+ {
+ "epoch": 2.2458628841607564,
+ "grad_norm": 2.1666336059570312,
+ "learning_rate": 3.4976195105183287e-06,
+ "loss": 0.3756,
+ "step": 4750
+ },
+ {
+ "epoch": 2.246335697399527,
+ "grad_norm": 2.863006353378296,
+ "learning_rate": 3.49704747371946e-06,
+ "loss": 0.4535,
+ "step": 4751
+ },
+ {
+ "epoch": 2.246808510638298,
+ "grad_norm": 2.5558736324310303,
+ "learning_rate": 3.496475374838817e-06,
+ "loss": 0.4129,
+ "step": 4752
+ },
+ {
+ "epoch": 2.2472813238770684,
+ "grad_norm": 2.9780309200286865,
+ "learning_rate": 3.495903213912022e-06,
+ "loss": 0.4871,
+ "step": 4753
+ },
+ {
+ "epoch": 2.247754137115839,
+ "grad_norm": 2.951779365539551,
+ "learning_rate": 3.4953309909747e-06,
+ "loss": 0.5162,
+ "step": 4754
+ },
+ {
+ "epoch": 2.24822695035461,
+ "grad_norm": 2.7654693126678467,
+ "learning_rate": 3.4947587060624834e-06,
+ "loss": 0.4662,
+ "step": 4755
+ },
+ {
+ "epoch": 2.2486997635933808,
+ "grad_norm": 2.708247184753418,
+ "learning_rate": 3.494186359211002e-06,
+ "loss": 0.4279,
+ "step": 4756
+ },
+ {
+ "epoch": 2.249172576832151,
+ "grad_norm": 3.09916615486145,
+ "learning_rate": 3.4936139504558963e-06,
+ "loss": 0.4085,
+ "step": 4757
+ },
+ {
+ "epoch": 2.249645390070922,
+ "grad_norm": 2.913806200027466,
+ "learning_rate": 3.493041479832807e-06,
+ "loss": 0.4653,
+ "step": 4758
+ },
+ {
+ "epoch": 2.2501182033096927,
+ "grad_norm": 3.2903928756713867,
+ "learning_rate": 3.4924689473773787e-06,
+ "loss": 0.5167,
+ "step": 4759
+ },
+ {
+ "epoch": 2.2505910165484635,
+ "grad_norm": 3.1302902698516846,
+ "learning_rate": 3.4918963531252607e-06,
+ "loss": 0.5398,
+ "step": 4760
+ },
+ {
+ "epoch": 2.251063829787234,
+ "grad_norm": 2.8858273029327393,
+ "learning_rate": 3.4913236971121063e-06,
+ "loss": 0.4395,
+ "step": 4761
+ },
+ {
+ "epoch": 2.2515366430260046,
+ "grad_norm": 3.194521903991699,
+ "learning_rate": 3.4907509793735727e-06,
+ "loss": 0.5258,
+ "step": 4762
+ },
+ {
+ "epoch": 2.2520094562647754,
+ "grad_norm": 2.8640544414520264,
+ "learning_rate": 3.49017819994532e-06,
+ "loss": 0.4073,
+ "step": 4763
+ },
+ {
+ "epoch": 2.2524822695035462,
+ "grad_norm": 3.139995813369751,
+ "learning_rate": 3.489605358863011e-06,
+ "loss": 0.4653,
+ "step": 4764
+ },
+ {
+ "epoch": 2.2529550827423166,
+ "grad_norm": 2.6228537559509277,
+ "learning_rate": 3.489032456162317e-06,
+ "loss": 0.4546,
+ "step": 4765
+ },
+ {
+ "epoch": 2.2534278959810874,
+ "grad_norm": 2.8197672367095947,
+ "learning_rate": 3.4884594918789083e-06,
+ "loss": 0.479,
+ "step": 4766
+ },
+ {
+ "epoch": 2.253900709219858,
+ "grad_norm": 2.7839298248291016,
+ "learning_rate": 3.4878864660484612e-06,
+ "loss": 0.5081,
+ "step": 4767
+ },
+ {
+ "epoch": 2.254373522458629,
+ "grad_norm": 2.8630709648132324,
+ "learning_rate": 3.487313378706656e-06,
+ "loss": 0.4345,
+ "step": 4768
+ },
+ {
+ "epoch": 2.2548463356973993,
+ "grad_norm": 2.5661563873291016,
+ "learning_rate": 3.4867402298891755e-06,
+ "loss": 0.4266,
+ "step": 4769
+ },
+ {
+ "epoch": 2.25531914893617,
+ "grad_norm": 2.6274025440216064,
+ "learning_rate": 3.4861670196317084e-06,
+ "loss": 0.4645,
+ "step": 4770
+ },
+ {
+ "epoch": 2.255791962174941,
+ "grad_norm": 2.578702449798584,
+ "learning_rate": 3.485593747969944e-06,
+ "loss": 0.4242,
+ "step": 4771
+ },
+ {
+ "epoch": 2.2562647754137117,
+ "grad_norm": 2.322476625442505,
+ "learning_rate": 3.48502041493958e-06,
+ "loss": 0.3975,
+ "step": 4772
+ },
+ {
+ "epoch": 2.256737588652482,
+ "grad_norm": 2.8412630558013916,
+ "learning_rate": 3.484447020576313e-06,
+ "loss": 0.4276,
+ "step": 4773
+ },
+ {
+ "epoch": 2.257210401891253,
+ "grad_norm": 2.6090497970581055,
+ "learning_rate": 3.483873564915847e-06,
+ "loss": 0.429,
+ "step": 4774
+ },
+ {
+ "epoch": 2.2576832151300237,
+ "grad_norm": 2.692458152770996,
+ "learning_rate": 3.4833000479938877e-06,
+ "loss": 0.4211,
+ "step": 4775
+ },
+ {
+ "epoch": 2.2581560283687945,
+ "grad_norm": 2.5546815395355225,
+ "learning_rate": 3.482726469846146e-06,
+ "loss": 0.4751,
+ "step": 4776
+ },
+ {
+ "epoch": 2.258628841607565,
+ "grad_norm": 2.8409626483917236,
+ "learning_rate": 3.4821528305083376e-06,
+ "loss": 0.4821,
+ "step": 4777
+ },
+ {
+ "epoch": 2.2591016548463356,
+ "grad_norm": 2.722966432571411,
+ "learning_rate": 3.4815791300161785e-06,
+ "loss": 0.5029,
+ "step": 4778
+ },
+ {
+ "epoch": 2.2595744680851064,
+ "grad_norm": 2.691603899002075,
+ "learning_rate": 3.48100536840539e-06,
+ "loss": 0.4242,
+ "step": 4779
+ },
+ {
+ "epoch": 2.260047281323877,
+ "grad_norm": 2.64035964012146,
+ "learning_rate": 3.4804315457116992e-06,
+ "loss": 0.4033,
+ "step": 4780
+ },
+ {
+ "epoch": 2.2605200945626476,
+ "grad_norm": 2.758819580078125,
+ "learning_rate": 3.4798576619708357e-06,
+ "loss": 0.4321,
+ "step": 4781
+ },
+ {
+ "epoch": 2.2609929078014184,
+ "grad_norm": 2.8204405307769775,
+ "learning_rate": 3.4792837172185324e-06,
+ "loss": 0.4309,
+ "step": 4782
+ },
+ {
+ "epoch": 2.261465721040189,
+ "grad_norm": 2.529771327972412,
+ "learning_rate": 3.478709711490525e-06,
+ "loss": 0.4398,
+ "step": 4783
+ },
+ {
+ "epoch": 2.26193853427896,
+ "grad_norm": 2.8156251907348633,
+ "learning_rate": 3.4781356448225557e-06,
+ "loss": 0.447,
+ "step": 4784
+ },
+ {
+ "epoch": 2.2624113475177303,
+ "grad_norm": 2.689528703689575,
+ "learning_rate": 3.477561517250369e-06,
+ "loss": 0.3907,
+ "step": 4785
+ },
+ {
+ "epoch": 2.262884160756501,
+ "grad_norm": 2.9148027896881104,
+ "learning_rate": 3.476987328809713e-06,
+ "loss": 0.4287,
+ "step": 4786
+ },
+ {
+ "epoch": 2.263356973995272,
+ "grad_norm": 2.933021306991577,
+ "learning_rate": 3.4764130795363404e-06,
+ "loss": 0.4847,
+ "step": 4787
+ },
+ {
+ "epoch": 2.2638297872340427,
+ "grad_norm": 2.8559257984161377,
+ "learning_rate": 3.4758387694660064e-06,
+ "loss": 0.4554,
+ "step": 4788
+ },
+ {
+ "epoch": 2.264302600472813,
+ "grad_norm": 3.0355522632598877,
+ "learning_rate": 3.4752643986344707e-06,
+ "loss": 0.4286,
+ "step": 4789
+ },
+ {
+ "epoch": 2.264775413711584,
+ "grad_norm": 2.9768362045288086,
+ "learning_rate": 3.474689967077498e-06,
+ "loss": 0.4917,
+ "step": 4790
+ },
+ {
+ "epoch": 2.2652482269503547,
+ "grad_norm": 2.827971935272217,
+ "learning_rate": 3.474115474830855e-06,
+ "loss": 0.4542,
+ "step": 4791
+ },
+ {
+ "epoch": 2.2657210401891255,
+ "grad_norm": 2.559659719467163,
+ "learning_rate": 3.4735409219303123e-06,
+ "loss": 0.4168,
+ "step": 4792
+ },
+ {
+ "epoch": 2.266193853427896,
+ "grad_norm": 2.3172824382781982,
+ "learning_rate": 3.472966308411645e-06,
+ "loss": 0.3535,
+ "step": 4793
+ },
+ {
+ "epoch": 2.2666666666666666,
+ "grad_norm": 2.6779656410217285,
+ "learning_rate": 3.4723916343106327e-06,
+ "loss": 0.4599,
+ "step": 4794
+ },
+ {
+ "epoch": 2.2671394799054374,
+ "grad_norm": 2.55780291557312,
+ "learning_rate": 3.4718168996630573e-06,
+ "loss": 0.4185,
+ "step": 4795
+ },
+ {
+ "epoch": 2.267612293144208,
+ "grad_norm": 2.4929800033569336,
+ "learning_rate": 3.471242104504704e-06,
+ "loss": 0.4008,
+ "step": 4796
+ },
+ {
+ "epoch": 2.2680851063829786,
+ "grad_norm": 2.849475145339966,
+ "learning_rate": 3.4706672488713642e-06,
+ "loss": 0.396,
+ "step": 4797
+ },
+ {
+ "epoch": 2.2685579196217494,
+ "grad_norm": 2.4830739498138428,
+ "learning_rate": 3.4700923327988306e-06,
+ "loss": 0.4087,
+ "step": 4798
+ },
+ {
+ "epoch": 2.26903073286052,
+ "grad_norm": 3.2748119831085205,
+ "learning_rate": 3.469517356322901e-06,
+ "loss": 0.4496,
+ "step": 4799
+ },
+ {
+ "epoch": 2.269503546099291,
+ "grad_norm": 3.0440170764923096,
+ "learning_rate": 3.468942319479378e-06,
+ "loss": 0.4903,
+ "step": 4800
+ },
+ {
+ "epoch": 2.2699763593380613,
+ "grad_norm": 2.8200504779815674,
+ "learning_rate": 3.4683672223040645e-06,
+ "loss": 0.4588,
+ "step": 4801
+ },
+ {
+ "epoch": 2.270449172576832,
+ "grad_norm": 2.675206184387207,
+ "learning_rate": 3.4677920648327707e-06,
+ "loss": 0.4257,
+ "step": 4802
+ },
+ {
+ "epoch": 2.270921985815603,
+ "grad_norm": 2.862675905227661,
+ "learning_rate": 3.4672168471013084e-06,
+ "loss": 0.466,
+ "step": 4803
+ },
+ {
+ "epoch": 2.2713947990543737,
+ "grad_norm": 2.65663743019104,
+ "learning_rate": 3.4666415691454947e-06,
+ "loss": 0.4784,
+ "step": 4804
+ },
+ {
+ "epoch": 2.271867612293144,
+ "grad_norm": 2.5610506534576416,
+ "learning_rate": 3.4660662310011483e-06,
+ "loss": 0.4429,
+ "step": 4805
+ },
+ {
+ "epoch": 2.272340425531915,
+ "grad_norm": 2.6459643840789795,
+ "learning_rate": 3.465490832704094e-06,
+ "loss": 0.4345,
+ "step": 4806
+ },
+ {
+ "epoch": 2.2728132387706856,
+ "grad_norm": 2.426013469696045,
+ "learning_rate": 3.4649153742901585e-06,
+ "loss": 0.4533,
+ "step": 4807
+ },
+ {
+ "epoch": 2.2732860520094564,
+ "grad_norm": 2.6714842319488525,
+ "learning_rate": 3.4643398557951745e-06,
+ "loss": 0.4409,
+ "step": 4808
+ },
+ {
+ "epoch": 2.273758865248227,
+ "grad_norm": 2.703629493713379,
+ "learning_rate": 3.463764277254976e-06,
+ "loss": 0.3656,
+ "step": 4809
+ },
+ {
+ "epoch": 2.2742316784869976,
+ "grad_norm": 2.811753988265991,
+ "learning_rate": 3.4631886387054025e-06,
+ "loss": 0.4957,
+ "step": 4810
+ },
+ {
+ "epoch": 2.2747044917257684,
+ "grad_norm": 2.9469289779663086,
+ "learning_rate": 3.462612940182295e-06,
+ "loss": 0.4582,
+ "step": 4811
+ },
+ {
+ "epoch": 2.275177304964539,
+ "grad_norm": 2.6287801265716553,
+ "learning_rate": 3.462037181721501e-06,
+ "loss": 0.4072,
+ "step": 4812
+ },
+ {
+ "epoch": 2.2756501182033095,
+ "grad_norm": 2.7104952335357666,
+ "learning_rate": 3.46146136335887e-06,
+ "loss": 0.4998,
+ "step": 4813
+ },
+ {
+ "epoch": 2.2761229314420803,
+ "grad_norm": 3.170363187789917,
+ "learning_rate": 3.460885485130256e-06,
+ "loss": 0.4722,
+ "step": 4814
+ },
+ {
+ "epoch": 2.276595744680851,
+ "grad_norm": 2.7315151691436768,
+ "learning_rate": 3.460309547071516e-06,
+ "loss": 0.4482,
+ "step": 4815
+ },
+ {
+ "epoch": 2.277068557919622,
+ "grad_norm": 2.685988187789917,
+ "learning_rate": 3.4597335492185113e-06,
+ "loss": 0.4419,
+ "step": 4816
+ },
+ {
+ "epoch": 2.2775413711583923,
+ "grad_norm": 2.532790184020996,
+ "learning_rate": 3.459157491607107e-06,
+ "loss": 0.3961,
+ "step": 4817
+ },
+ {
+ "epoch": 2.278014184397163,
+ "grad_norm": 2.920729875564575,
+ "learning_rate": 3.458581374273171e-06,
+ "loss": 0.4767,
+ "step": 4818
+ },
+ {
+ "epoch": 2.278486997635934,
+ "grad_norm": 3.2481250762939453,
+ "learning_rate": 3.458005197252577e-06,
+ "loss": 0.4985,
+ "step": 4819
+ },
+ {
+ "epoch": 2.2789598108747047,
+ "grad_norm": 2.373809814453125,
+ "learning_rate": 3.4574289605811994e-06,
+ "loss": 0.4259,
+ "step": 4820
+ },
+ {
+ "epoch": 2.279432624113475,
+ "grad_norm": 2.7851033210754395,
+ "learning_rate": 3.4568526642949184e-06,
+ "loss": 0.4829,
+ "step": 4821
+ },
+ {
+ "epoch": 2.279905437352246,
+ "grad_norm": 2.9777133464813232,
+ "learning_rate": 3.456276308429618e-06,
+ "loss": 0.4896,
+ "step": 4822
+ },
+ {
+ "epoch": 2.2803782505910166,
+ "grad_norm": 2.7922022342681885,
+ "learning_rate": 3.4556998930211853e-06,
+ "loss": 0.4908,
+ "step": 4823
+ },
+ {
+ "epoch": 2.2808510638297874,
+ "grad_norm": 2.699180841445923,
+ "learning_rate": 3.4551234181055104e-06,
+ "loss": 0.4518,
+ "step": 4824
+ },
+ {
+ "epoch": 2.2813238770685578,
+ "grad_norm": 3.1200520992279053,
+ "learning_rate": 3.4545468837184885e-06,
+ "loss": 0.4877,
+ "step": 4825
+ },
+ {
+ "epoch": 2.2817966903073286,
+ "grad_norm": 2.56782603263855,
+ "learning_rate": 3.453970289896018e-06,
+ "loss": 0.4281,
+ "step": 4826
+ },
+ {
+ "epoch": 2.2822695035460994,
+ "grad_norm": 3.241356372833252,
+ "learning_rate": 3.4533936366740007e-06,
+ "loss": 0.4338,
+ "step": 4827
+ },
+ {
+ "epoch": 2.28274231678487,
+ "grad_norm": 3.560295343399048,
+ "learning_rate": 3.452816924088342e-06,
+ "loss": 0.4121,
+ "step": 4828
+ },
+ {
+ "epoch": 2.2832151300236405,
+ "grad_norm": 2.8512449264526367,
+ "learning_rate": 3.452240152174951e-06,
+ "loss": 0.4357,
+ "step": 4829
+ },
+ {
+ "epoch": 2.2836879432624113,
+ "grad_norm": 3.0332651138305664,
+ "learning_rate": 3.4516633209697408e-06,
+ "loss": 0.4985,
+ "step": 4830
+ },
+ {
+ "epoch": 2.284160756501182,
+ "grad_norm": 2.520930528640747,
+ "learning_rate": 3.451086430508629e-06,
+ "loss": 0.4021,
+ "step": 4831
+ },
+ {
+ "epoch": 2.284633569739953,
+ "grad_norm": 2.508227825164795,
+ "learning_rate": 3.4505094808275363e-06,
+ "loss": 0.3935,
+ "step": 4832
+ },
+ {
+ "epoch": 2.2851063829787233,
+ "grad_norm": 2.56752610206604,
+ "learning_rate": 3.449932471962385e-06,
+ "loss": 0.4689,
+ "step": 4833
+ },
+ {
+ "epoch": 2.285579196217494,
+ "grad_norm": 2.7757534980773926,
+ "learning_rate": 3.449355403949105e-06,
+ "loss": 0.4565,
+ "step": 4834
+ },
+ {
+ "epoch": 2.286052009456265,
+ "grad_norm": 3.364821195602417,
+ "learning_rate": 3.448778276823626e-06,
+ "loss": 0.4729,
+ "step": 4835
+ },
+ {
+ "epoch": 2.2865248226950357,
+ "grad_norm": 3.0045557022094727,
+ "learning_rate": 3.448201090621884e-06,
+ "loss": 0.4834,
+ "step": 4836
+ },
+ {
+ "epoch": 2.286997635933806,
+ "grad_norm": 2.9451794624328613,
+ "learning_rate": 3.4476238453798183e-06,
+ "loss": 0.489,
+ "step": 4837
+ },
+ {
+ "epoch": 2.287470449172577,
+ "grad_norm": 2.8307435512542725,
+ "learning_rate": 3.4470465411333708e-06,
+ "loss": 0.5079,
+ "step": 4838
+ },
+ {
+ "epoch": 2.2879432624113476,
+ "grad_norm": 2.7118136882781982,
+ "learning_rate": 3.4464691779184876e-06,
+ "loss": 0.4794,
+ "step": 4839
+ },
+ {
+ "epoch": 2.2884160756501184,
+ "grad_norm": 2.6724441051483154,
+ "learning_rate": 3.445891755771119e-06,
+ "loss": 0.4619,
+ "step": 4840
+ },
+ {
+ "epoch": 2.2888888888888888,
+ "grad_norm": 2.8161258697509766,
+ "learning_rate": 3.445314274727218e-06,
+ "loss": 0.4287,
+ "step": 4841
+ },
+ {
+ "epoch": 2.2893617021276595,
+ "grad_norm": 2.5681750774383545,
+ "learning_rate": 3.4447367348227433e-06,
+ "loss": 0.4167,
+ "step": 4842
+ },
+ {
+ "epoch": 2.2898345153664303,
+ "grad_norm": 2.8136284351348877,
+ "learning_rate": 3.444159136093654e-06,
+ "loss": 0.4195,
+ "step": 4843
+ },
+ {
+ "epoch": 2.290307328605201,
+ "grad_norm": 3.153651714324951,
+ "learning_rate": 3.443581478575915e-06,
+ "loss": 0.4821,
+ "step": 4844
+ },
+ {
+ "epoch": 2.2907801418439715,
+ "grad_norm": 2.980883836746216,
+ "learning_rate": 3.4430037623054953e-06,
+ "loss": 0.4627,
+ "step": 4845
+ },
+ {
+ "epoch": 2.2912529550827423,
+ "grad_norm": 2.786182403564453,
+ "learning_rate": 3.4424259873183664e-06,
+ "loss": 0.4342,
+ "step": 4846
+ },
+ {
+ "epoch": 2.291725768321513,
+ "grad_norm": 2.8938279151916504,
+ "learning_rate": 3.4418481536505026e-06,
+ "loss": 0.3997,
+ "step": 4847
+ },
+ {
+ "epoch": 2.2921985815602834,
+ "grad_norm": 2.5534510612487793,
+ "learning_rate": 3.4412702613378844e-06,
+ "loss": 0.3982,
+ "step": 4848
+ },
+ {
+ "epoch": 2.2926713947990542,
+ "grad_norm": 2.7907063961029053,
+ "learning_rate": 3.4406923104164956e-06,
+ "loss": 0.4484,
+ "step": 4849
+ },
+ {
+ "epoch": 2.293144208037825,
+ "grad_norm": 3.162702798843384,
+ "learning_rate": 3.4401143009223203e-06,
+ "loss": 0.4528,
+ "step": 4850
+ },
+ {
+ "epoch": 2.293617021276596,
+ "grad_norm": 2.4647393226623535,
+ "learning_rate": 3.4395362328913505e-06,
+ "loss": 0.3759,
+ "step": 4851
+ },
+ {
+ "epoch": 2.2940898345153666,
+ "grad_norm": 2.8219876289367676,
+ "learning_rate": 3.438958106359579e-06,
+ "loss": 0.4903,
+ "step": 4852
+ },
+ {
+ "epoch": 2.294562647754137,
+ "grad_norm": 2.827073097229004,
+ "learning_rate": 3.438379921363003e-06,
+ "loss": 0.4315,
+ "step": 4853
+ },
+ {
+ "epoch": 2.295035460992908,
+ "grad_norm": 2.472470283508301,
+ "learning_rate": 3.4378016779376244e-06,
+ "loss": 0.4478,
+ "step": 4854
+ },
+ {
+ "epoch": 2.2955082742316786,
+ "grad_norm": 3.3994734287261963,
+ "learning_rate": 3.4372233761194473e-06,
+ "loss": 0.5086,
+ "step": 4855
+ },
+ {
+ "epoch": 2.295981087470449,
+ "grad_norm": 3.030465602874756,
+ "learning_rate": 3.4366450159444796e-06,
+ "loss": 0.4159,
+ "step": 4856
+ },
+ {
+ "epoch": 2.2964539007092197,
+ "grad_norm": 2.5460705757141113,
+ "learning_rate": 3.4360665974487346e-06,
+ "loss": 0.4097,
+ "step": 4857
+ },
+ {
+ "epoch": 2.2969267139479905,
+ "grad_norm": 2.884469509124756,
+ "learning_rate": 3.4354881206682273e-06,
+ "loss": 0.4478,
+ "step": 4858
+ },
+ {
+ "epoch": 2.2973995271867613,
+ "grad_norm": 2.5139710903167725,
+ "learning_rate": 3.4349095856389765e-06,
+ "loss": 0.4286,
+ "step": 4859
+ },
+ {
+ "epoch": 2.297872340425532,
+ "grad_norm": 3.1628260612487793,
+ "learning_rate": 3.4343309923970053e-06,
+ "loss": 0.4617,
+ "step": 4860
+ },
+ {
+ "epoch": 2.2983451536643025,
+ "grad_norm": 2.6141695976257324,
+ "learning_rate": 3.4337523409783395e-06,
+ "loss": 0.3841,
+ "step": 4861
+ },
+ {
+ "epoch": 2.2988179669030733,
+ "grad_norm": 2.766834259033203,
+ "learning_rate": 3.43317363141901e-06,
+ "loss": 0.4484,
+ "step": 4862
+ },
+ {
+ "epoch": 2.299290780141844,
+ "grad_norm": 2.785491943359375,
+ "learning_rate": 3.4325948637550503e-06,
+ "loss": 0.4363,
+ "step": 4863
+ },
+ {
+ "epoch": 2.2997635933806144,
+ "grad_norm": 2.624929189682007,
+ "learning_rate": 3.4320160380224988e-06,
+ "loss": 0.4518,
+ "step": 4864
+ },
+ {
+ "epoch": 2.300236406619385,
+ "grad_norm": 2.895413398742676,
+ "learning_rate": 3.4314371542573944e-06,
+ "loss": 0.4745,
+ "step": 4865
+ },
+ {
+ "epoch": 2.300709219858156,
+ "grad_norm": 2.603816270828247,
+ "learning_rate": 3.430858212495783e-06,
+ "loss": 0.4444,
+ "step": 4866
+ },
+ {
+ "epoch": 2.301182033096927,
+ "grad_norm": 3.387360095977783,
+ "learning_rate": 3.4302792127737116e-06,
+ "loss": 0.4169,
+ "step": 4867
+ },
+ {
+ "epoch": 2.3016548463356976,
+ "grad_norm": 2.894054651260376,
+ "learning_rate": 3.4297001551272334e-06,
+ "loss": 0.4493,
+ "step": 4868
+ },
+ {
+ "epoch": 2.302127659574468,
+ "grad_norm": 3.0432028770446777,
+ "learning_rate": 3.4291210395924035e-06,
+ "loss": 0.4854,
+ "step": 4869
+ },
+ {
+ "epoch": 2.3026004728132388,
+ "grad_norm": 2.5144734382629395,
+ "learning_rate": 3.42854186620528e-06,
+ "loss": 0.4556,
+ "step": 4870
+ },
+ {
+ "epoch": 2.3030732860520096,
+ "grad_norm": 2.964812755584717,
+ "learning_rate": 3.427962635001926e-06,
+ "loss": 0.495,
+ "step": 4871
+ },
+ {
+ "epoch": 2.30354609929078,
+ "grad_norm": 2.9991118907928467,
+ "learning_rate": 3.4273833460184077e-06,
+ "loss": 0.4787,
+ "step": 4872
+ },
+ {
+ "epoch": 2.3040189125295507,
+ "grad_norm": 2.9424328804016113,
+ "learning_rate": 3.4268039992907955e-06,
+ "loss": 0.5006,
+ "step": 4873
+ },
+ {
+ "epoch": 2.3044917257683215,
+ "grad_norm": 2.792880058288574,
+ "learning_rate": 3.426224594855162e-06,
+ "loss": 0.4399,
+ "step": 4874
+ },
+ {
+ "epoch": 2.3049645390070923,
+ "grad_norm": 2.5308053493499756,
+ "learning_rate": 3.4256451327475838e-06,
+ "loss": 0.4843,
+ "step": 4875
+ },
+ {
+ "epoch": 2.305437352245863,
+ "grad_norm": 2.7937564849853516,
+ "learning_rate": 3.425065613004142e-06,
+ "loss": 0.4428,
+ "step": 4876
+ },
+ {
+ "epoch": 2.3059101654846335,
+ "grad_norm": 2.4231557846069336,
+ "learning_rate": 3.424486035660921e-06,
+ "loss": 0.4054,
+ "step": 4877
+ },
+ {
+ "epoch": 2.3063829787234043,
+ "grad_norm": 3.0622596740722656,
+ "learning_rate": 3.423906400754009e-06,
+ "loss": 0.4623,
+ "step": 4878
+ },
+ {
+ "epoch": 2.306855791962175,
+ "grad_norm": 2.6532933712005615,
+ "learning_rate": 3.4233267083194955e-06,
+ "loss": 0.4387,
+ "step": 4879
+ },
+ {
+ "epoch": 2.3073286052009454,
+ "grad_norm": 2.793325185775757,
+ "learning_rate": 3.422746958393477e-06,
+ "loss": 0.4047,
+ "step": 4880
+ },
+ {
+ "epoch": 2.307801418439716,
+ "grad_norm": 2.9178314208984375,
+ "learning_rate": 3.422167151012052e-06,
+ "loss": 0.4397,
+ "step": 4881
+ },
+ {
+ "epoch": 2.308274231678487,
+ "grad_norm": 3.463913917541504,
+ "learning_rate": 3.4215872862113214e-06,
+ "loss": 0.4347,
+ "step": 4882
+ },
+ {
+ "epoch": 2.308747044917258,
+ "grad_norm": 3.228403091430664,
+ "learning_rate": 3.421007364027392e-06,
+ "loss": 0.4405,
+ "step": 4883
+ },
+ {
+ "epoch": 2.3092198581560286,
+ "grad_norm": 2.896933078765869,
+ "learning_rate": 3.420427384496372e-06,
+ "loss": 0.4429,
+ "step": 4884
+ },
+ {
+ "epoch": 2.309692671394799,
+ "grad_norm": 2.5559937953948975,
+ "learning_rate": 3.4198473476543755e-06,
+ "loss": 0.4281,
+ "step": 4885
+ },
+ {
+ "epoch": 2.3101654846335697,
+ "grad_norm": 3.457918167114258,
+ "learning_rate": 3.419267253537517e-06,
+ "loss": 0.4495,
+ "step": 4886
+ },
+ {
+ "epoch": 2.3106382978723405,
+ "grad_norm": 2.6554839611053467,
+ "learning_rate": 3.418687102181918e-06,
+ "loss": 0.4682,
+ "step": 4887
+ },
+ {
+ "epoch": 2.311111111111111,
+ "grad_norm": 2.8171639442443848,
+ "learning_rate": 3.4181068936237024e-06,
+ "loss": 0.4184,
+ "step": 4888
+ },
+ {
+ "epoch": 2.3115839243498817,
+ "grad_norm": 2.9272499084472656,
+ "learning_rate": 3.4175266278989955e-06,
+ "loss": 0.5445,
+ "step": 4889
+ },
+ {
+ "epoch": 2.3120567375886525,
+ "grad_norm": 2.5928499698638916,
+ "learning_rate": 3.4169463050439284e-06,
+ "loss": 0.3808,
+ "step": 4890
+ },
+ {
+ "epoch": 2.3125295508274233,
+ "grad_norm": 2.6624577045440674,
+ "learning_rate": 3.4163659250946356e-06,
+ "loss": 0.4678,
+ "step": 4891
+ },
+ {
+ "epoch": 2.313002364066194,
+ "grad_norm": 2.666555643081665,
+ "learning_rate": 3.4157854880872553e-06,
+ "loss": 0.457,
+ "step": 4892
+ },
+ {
+ "epoch": 2.3134751773049644,
+ "grad_norm": 3.2987406253814697,
+ "learning_rate": 3.4152049940579278e-06,
+ "loss": 0.551,
+ "step": 4893
+ },
+ {
+ "epoch": 2.3139479905437352,
+ "grad_norm": 2.728119134902954,
+ "learning_rate": 3.414624443042799e-06,
+ "loss": 0.3935,
+ "step": 4894
+ },
+ {
+ "epoch": 2.314420803782506,
+ "grad_norm": 3.133005380630493,
+ "learning_rate": 3.4140438350780157e-06,
+ "loss": 0.4981,
+ "step": 4895
+ },
+ {
+ "epoch": 2.3148936170212764,
+ "grad_norm": 2.591252565383911,
+ "learning_rate": 3.4134631701997312e-06,
+ "loss": 0.4251,
+ "step": 4896
+ },
+ {
+ "epoch": 2.315366430260047,
+ "grad_norm": 3.007136344909668,
+ "learning_rate": 3.412882448444101e-06,
+ "loss": 0.4492,
+ "step": 4897
+ },
+ {
+ "epoch": 2.315839243498818,
+ "grad_norm": 2.6391026973724365,
+ "learning_rate": 3.412301669847284e-06,
+ "loss": 0.5151,
+ "step": 4898
+ },
+ {
+ "epoch": 2.3163120567375888,
+ "grad_norm": 7.453699111938477,
+ "learning_rate": 3.411720834445441e-06,
+ "loss": 0.4983,
+ "step": 4899
+ },
+ {
+ "epoch": 2.3167848699763596,
+ "grad_norm": 2.667712688446045,
+ "learning_rate": 3.41113994227474e-06,
+ "loss": 0.4581,
+ "step": 4900
+ },
+ {
+ "epoch": 2.31725768321513,
+ "grad_norm": 2.7727627754211426,
+ "learning_rate": 3.41055899337135e-06,
+ "loss": 0.4731,
+ "step": 4901
+ },
+ {
+ "epoch": 2.3177304964539007,
+ "grad_norm": 3.0096890926361084,
+ "learning_rate": 3.409977987771444e-06,
+ "loss": 0.4996,
+ "step": 4902
+ },
+ {
+ "epoch": 2.3182033096926715,
+ "grad_norm": 2.725830078125,
+ "learning_rate": 3.4093969255111993e-06,
+ "loss": 0.4544,
+ "step": 4903
+ },
+ {
+ "epoch": 2.318676122931442,
+ "grad_norm": 2.7596993446350098,
+ "learning_rate": 3.4088158066267945e-06,
+ "loss": 0.4846,
+ "step": 4904
+ },
+ {
+ "epoch": 2.3191489361702127,
+ "grad_norm": 2.702620029449463,
+ "learning_rate": 3.4082346311544156e-06,
+ "loss": 0.4849,
+ "step": 4905
+ },
+ {
+ "epoch": 2.3196217494089835,
+ "grad_norm": 2.725374460220337,
+ "learning_rate": 3.407653399130249e-06,
+ "loss": 0.4116,
+ "step": 4906
+ },
+ {
+ "epoch": 2.3200945626477543,
+ "grad_norm": 2.6770219802856445,
+ "learning_rate": 3.4070721105904847e-06,
+ "loss": 0.4606,
+ "step": 4907
+ },
+ {
+ "epoch": 2.320567375886525,
+ "grad_norm": 2.9249117374420166,
+ "learning_rate": 3.406490765571317e-06,
+ "loss": 0.461,
+ "step": 4908
+ },
+ {
+ "epoch": 2.3210401891252954,
+ "grad_norm": 2.7568278312683105,
+ "learning_rate": 3.405909364108944e-06,
+ "loss": 0.4065,
+ "step": 4909
+ },
+ {
+ "epoch": 2.321513002364066,
+ "grad_norm": 2.7231340408325195,
+ "learning_rate": 3.4053279062395676e-06,
+ "loss": 0.4173,
+ "step": 4910
+ },
+ {
+ "epoch": 2.321985815602837,
+ "grad_norm": 3.1401100158691406,
+ "learning_rate": 3.404746391999393e-06,
+ "loss": 0.4287,
+ "step": 4911
+ },
+ {
+ "epoch": 2.3224586288416074,
+ "grad_norm": 2.714853525161743,
+ "learning_rate": 3.404164821424627e-06,
+ "loss": 0.4552,
+ "step": 4912
+ },
+ {
+ "epoch": 2.322931442080378,
+ "grad_norm": 3.1509978771209717,
+ "learning_rate": 3.4035831945514825e-06,
+ "loss": 0.5296,
+ "step": 4913
+ },
+ {
+ "epoch": 2.323404255319149,
+ "grad_norm": 2.567194938659668,
+ "learning_rate": 3.403001511416174e-06,
+ "loss": 0.4306,
+ "step": 4914
+ },
+ {
+ "epoch": 2.3238770685579198,
+ "grad_norm": 2.7473888397216797,
+ "learning_rate": 3.402419772054922e-06,
+ "loss": 0.4009,
+ "step": 4915
+ },
+ {
+ "epoch": 2.3243498817966906,
+ "grad_norm": 2.8617780208587646,
+ "learning_rate": 3.401837976503947e-06,
+ "loss": 0.4545,
+ "step": 4916
+ },
+ {
+ "epoch": 2.324822695035461,
+ "grad_norm": 2.3650572299957275,
+ "learning_rate": 3.401256124799475e-06,
+ "loss": 0.4046,
+ "step": 4917
+ },
+ {
+ "epoch": 2.3252955082742317,
+ "grad_norm": 2.418407678604126,
+ "learning_rate": 3.4006742169777364e-06,
+ "loss": 0.4222,
+ "step": 4918
+ },
+ {
+ "epoch": 2.3257683215130025,
+ "grad_norm": 2.7232494354248047,
+ "learning_rate": 3.400092253074964e-06,
+ "loss": 0.4373,
+ "step": 4919
+ },
+ {
+ "epoch": 2.326241134751773,
+ "grad_norm": 2.702965497970581,
+ "learning_rate": 3.399510233127394e-06,
+ "loss": 0.437,
+ "step": 4920
+ },
+ {
+ "epoch": 2.3267139479905437,
+ "grad_norm": 2.8381760120391846,
+ "learning_rate": 3.3989281571712664e-06,
+ "loss": 0.4294,
+ "step": 4921
+ },
+ {
+ "epoch": 2.3271867612293144,
+ "grad_norm": 2.767131805419922,
+ "learning_rate": 3.398346025242823e-06,
+ "loss": 0.4673,
+ "step": 4922
+ },
+ {
+ "epoch": 2.3276595744680852,
+ "grad_norm": 2.5261805057525635,
+ "learning_rate": 3.3977638373783123e-06,
+ "loss": 0.4147,
+ "step": 4923
+ },
+ {
+ "epoch": 2.3281323877068556,
+ "grad_norm": 2.7176897525787354,
+ "learning_rate": 3.3971815936139836e-06,
+ "loss": 0.3885,
+ "step": 4924
+ },
+ {
+ "epoch": 2.3286052009456264,
+ "grad_norm": 2.849043130874634,
+ "learning_rate": 3.396599293986092e-06,
+ "loss": 0.4842,
+ "step": 4925
+ },
+ {
+ "epoch": 2.329078014184397,
+ "grad_norm": 2.550673484802246,
+ "learning_rate": 3.3960169385308927e-06,
+ "loss": 0.4049,
+ "step": 4926
+ },
+ {
+ "epoch": 2.329550827423168,
+ "grad_norm": 3.0821585655212402,
+ "learning_rate": 3.3954345272846477e-06,
+ "loss": 0.53,
+ "step": 4927
+ },
+ {
+ "epoch": 2.3300236406619383,
+ "grad_norm": 2.68658185005188,
+ "learning_rate": 3.3948520602836223e-06,
+ "loss": 0.4592,
+ "step": 4928
+ },
+ {
+ "epoch": 2.330496453900709,
+ "grad_norm": 2.7391903400421143,
+ "learning_rate": 3.394269537564082e-06,
+ "loss": 0.4773,
+ "step": 4929
+ },
+ {
+ "epoch": 2.33096926713948,
+ "grad_norm": 2.665114164352417,
+ "learning_rate": 3.393686959162299e-06,
+ "loss": 0.4671,
+ "step": 4930
+ },
+ {
+ "epoch": 2.3314420803782507,
+ "grad_norm": 2.6827399730682373,
+ "learning_rate": 3.3931043251145477e-06,
+ "loss": 0.4669,
+ "step": 4931
+ },
+ {
+ "epoch": 2.331914893617021,
+ "grad_norm": 3.1760666370391846,
+ "learning_rate": 3.392521635457106e-06,
+ "loss": 0.4729,
+ "step": 4932
+ },
+ {
+ "epoch": 2.332387706855792,
+ "grad_norm": 2.9686226844787598,
+ "learning_rate": 3.3919388902262555e-06,
+ "loss": 0.5017,
+ "step": 4933
+ },
+ {
+ "epoch": 2.3328605200945627,
+ "grad_norm": 2.471325397491455,
+ "learning_rate": 3.3913560894582818e-06,
+ "loss": 0.4195,
+ "step": 4934
+ },
+ {
+ "epoch": 2.3333333333333335,
+ "grad_norm": 2.4062955379486084,
+ "learning_rate": 3.3907732331894732e-06,
+ "loss": 0.3666,
+ "step": 4935
+ },
+ {
+ "epoch": 2.333806146572104,
+ "grad_norm": 2.6800320148468018,
+ "learning_rate": 3.3901903214561206e-06,
+ "loss": 0.4774,
+ "step": 4936
+ },
+ {
+ "epoch": 2.3342789598108746,
+ "grad_norm": 2.923741102218628,
+ "learning_rate": 3.389607354294521e-06,
+ "loss": 0.4546,
+ "step": 4937
+ },
+ {
+ "epoch": 2.3347517730496454,
+ "grad_norm": 3.0034096240997314,
+ "learning_rate": 3.3890243317409716e-06,
+ "loss": 0.5373,
+ "step": 4938
+ },
+ {
+ "epoch": 2.3352245862884162,
+ "grad_norm": 3.0757339000701904,
+ "learning_rate": 3.388441253831775e-06,
+ "loss": 0.4655,
+ "step": 4939
+ },
+ {
+ "epoch": 2.3356973995271866,
+ "grad_norm": 2.5352041721343994,
+ "learning_rate": 3.3878581206032373e-06,
+ "loss": 0.4391,
+ "step": 4940
+ },
+ {
+ "epoch": 2.3361702127659574,
+ "grad_norm": 2.9332237243652344,
+ "learning_rate": 3.3872749320916675e-06,
+ "loss": 0.4685,
+ "step": 4941
+ },
+ {
+ "epoch": 2.336643026004728,
+ "grad_norm": 2.4871222972869873,
+ "learning_rate": 3.386691688333379e-06,
+ "loss": 0.3952,
+ "step": 4942
+ },
+ {
+ "epoch": 2.337115839243499,
+ "grad_norm": 2.6384918689727783,
+ "learning_rate": 3.386108389364687e-06,
+ "loss": 0.4044,
+ "step": 4943
+ },
+ {
+ "epoch": 2.3375886524822693,
+ "grad_norm": 2.3545165061950684,
+ "learning_rate": 3.3855250352219102e-06,
+ "loss": 0.426,
+ "step": 4944
+ },
+ {
+ "epoch": 2.33806146572104,
+ "grad_norm": 2.972242593765259,
+ "learning_rate": 3.3849416259413735e-06,
+ "loss": 0.5033,
+ "step": 4945
+ },
+ {
+ "epoch": 2.338534278959811,
+ "grad_norm": 3.117351770401001,
+ "learning_rate": 3.384358161559401e-06,
+ "loss": 0.4695,
+ "step": 4946
+ },
+ {
+ "epoch": 2.3390070921985817,
+ "grad_norm": 2.888916492462158,
+ "learning_rate": 3.383774642112324e-06,
+ "loss": 0.437,
+ "step": 4947
+ },
+ {
+ "epoch": 2.339479905437352,
+ "grad_norm": 3.0677435398101807,
+ "learning_rate": 3.3831910676364753e-06,
+ "loss": 0.4293,
+ "step": 4948
+ },
+ {
+ "epoch": 2.339952718676123,
+ "grad_norm": 2.8571784496307373,
+ "learning_rate": 3.3826074381681916e-06,
+ "loss": 0.4574,
+ "step": 4949
+ },
+ {
+ "epoch": 2.3404255319148937,
+ "grad_norm": 2.907276153564453,
+ "learning_rate": 3.3820237537438127e-06,
+ "loss": 0.4731,
+ "step": 4950
+ },
+ {
+ "epoch": 2.3408983451536645,
+ "grad_norm": 2.923762559890747,
+ "learning_rate": 3.3814400143996823e-06,
+ "loss": 0.4648,
+ "step": 4951
+ },
+ {
+ "epoch": 2.341371158392435,
+ "grad_norm": 2.6206982135772705,
+ "learning_rate": 3.3808562201721473e-06,
+ "loss": 0.436,
+ "step": 4952
+ },
+ {
+ "epoch": 2.3418439716312056,
+ "grad_norm": 6.279088973999023,
+ "learning_rate": 3.380272371097558e-06,
+ "loss": 0.4461,
+ "step": 4953
+ },
+ {
+ "epoch": 2.3423167848699764,
+ "grad_norm": 2.785297155380249,
+ "learning_rate": 3.3796884672122684e-06,
+ "loss": 0.4619,
+ "step": 4954
+ },
+ {
+ "epoch": 2.342789598108747,
+ "grad_norm": 2.6241793632507324,
+ "learning_rate": 3.379104508552634e-06,
+ "loss": 0.4323,
+ "step": 4955
+ },
+ {
+ "epoch": 2.3432624113475176,
+ "grad_norm": 2.6052167415618896,
+ "learning_rate": 3.378520495155017e-06,
+ "loss": 0.3943,
+ "step": 4956
+ },
+ {
+ "epoch": 2.3437352245862884,
+ "grad_norm": 2.8247411251068115,
+ "learning_rate": 3.3779364270557818e-06,
+ "loss": 0.4689,
+ "step": 4957
+ },
+ {
+ "epoch": 2.344208037825059,
+ "grad_norm": 2.5348927974700928,
+ "learning_rate": 3.377352304291294e-06,
+ "loss": 0.4619,
+ "step": 4958
+ },
+ {
+ "epoch": 2.34468085106383,
+ "grad_norm": 2.906648874282837,
+ "learning_rate": 3.376768126897926e-06,
+ "loss": 0.5191,
+ "step": 4959
+ },
+ {
+ "epoch": 2.3451536643026003,
+ "grad_norm": 2.796870470046997,
+ "learning_rate": 3.3761838949120514e-06,
+ "loss": 0.4227,
+ "step": 4960
+ },
+ {
+ "epoch": 2.345626477541371,
+ "grad_norm": 2.789635419845581,
+ "learning_rate": 3.3755996083700464e-06,
+ "loss": 0.3927,
+ "step": 4961
+ },
+ {
+ "epoch": 2.346099290780142,
+ "grad_norm": 2.86641263961792,
+ "learning_rate": 3.375015267308295e-06,
+ "loss": 0.4097,
+ "step": 4962
+ },
+ {
+ "epoch": 2.3465721040189127,
+ "grad_norm": 2.8374414443969727,
+ "learning_rate": 3.374430871763178e-06,
+ "loss": 0.4566,
+ "step": 4963
+ },
+ {
+ "epoch": 2.347044917257683,
+ "grad_norm": 2.71951961517334,
+ "learning_rate": 3.3738464217710854e-06,
+ "loss": 0.4748,
+ "step": 4964
+ },
+ {
+ "epoch": 2.347517730496454,
+ "grad_norm": 2.6939785480499268,
+ "learning_rate": 3.373261917368408e-06,
+ "loss": 0.4499,
+ "step": 4965
+ },
+ {
+ "epoch": 2.3479905437352246,
+ "grad_norm": 2.862661600112915,
+ "learning_rate": 3.37267735859154e-06,
+ "loss": 0.415,
+ "step": 4966
+ },
+ {
+ "epoch": 2.3484633569739954,
+ "grad_norm": 2.3657119274139404,
+ "learning_rate": 3.3720927454768793e-06,
+ "loss": 0.4112,
+ "step": 4967
+ },
+ {
+ "epoch": 2.348936170212766,
+ "grad_norm": 3.701571464538574,
+ "learning_rate": 3.3715080780608277e-06,
+ "loss": 0.4735,
+ "step": 4968
+ },
+ {
+ "epoch": 2.3494089834515366,
+ "grad_norm": 2.894350528717041,
+ "learning_rate": 3.3709233563797895e-06,
+ "loss": 0.4278,
+ "step": 4969
+ },
+ {
+ "epoch": 2.3498817966903074,
+ "grad_norm": 3.0072877407073975,
+ "learning_rate": 3.3703385804701727e-06,
+ "loss": 0.4718,
+ "step": 4970
+ },
+ {
+ "epoch": 2.350354609929078,
+ "grad_norm": 2.9920408725738525,
+ "learning_rate": 3.369753750368389e-06,
+ "loss": 0.4636,
+ "step": 4971
+ },
+ {
+ "epoch": 2.3508274231678485,
+ "grad_norm": 2.381770372390747,
+ "learning_rate": 3.369168866110853e-06,
+ "loss": 0.3841,
+ "step": 4972
+ },
+ {
+ "epoch": 2.3513002364066193,
+ "grad_norm": 2.6195342540740967,
+ "learning_rate": 3.3685839277339825e-06,
+ "loss": 0.4422,
+ "step": 4973
+ },
+ {
+ "epoch": 2.35177304964539,
+ "grad_norm": 2.885852575302124,
+ "learning_rate": 3.3679989352741992e-06,
+ "loss": 0.4798,
+ "step": 4974
+ },
+ {
+ "epoch": 2.352245862884161,
+ "grad_norm": 2.820004940032959,
+ "learning_rate": 3.367413888767929e-06,
+ "loss": 0.4498,
+ "step": 4975
+ },
+ {
+ "epoch": 2.3527186761229313,
+ "grad_norm": 2.579680919647217,
+ "learning_rate": 3.366828788251599e-06,
+ "loss": 0.4894,
+ "step": 4976
+ },
+ {
+ "epoch": 2.353191489361702,
+ "grad_norm": 2.7509915828704834,
+ "learning_rate": 3.366243633761642e-06,
+ "loss": 0.4354,
+ "step": 4977
+ },
+ {
+ "epoch": 2.353664302600473,
+ "grad_norm": 3.061767339706421,
+ "learning_rate": 3.3656584253344917e-06,
+ "loss": 0.4651,
+ "step": 4978
+ },
+ {
+ "epoch": 2.3541371158392437,
+ "grad_norm": 2.6109485626220703,
+ "learning_rate": 3.365073163006587e-06,
+ "loss": 0.44,
+ "step": 4979
+ },
+ {
+ "epoch": 2.354609929078014,
+ "grad_norm": 3.4247376918792725,
+ "learning_rate": 3.36448784681437e-06,
+ "loss": 0.3993,
+ "step": 4980
+ },
+ {
+ "epoch": 2.355082742316785,
+ "grad_norm": 2.953695297241211,
+ "learning_rate": 3.363902476794285e-06,
+ "loss": 0.4763,
+ "step": 4981
+ },
+ {
+ "epoch": 2.3555555555555556,
+ "grad_norm": 2.836543083190918,
+ "learning_rate": 3.3633170529827806e-06,
+ "loss": 0.4755,
+ "step": 4982
+ },
+ {
+ "epoch": 2.3560283687943264,
+ "grad_norm": 2.944082021713257,
+ "learning_rate": 3.36273157541631e-06,
+ "loss": 0.472,
+ "step": 4983
+ },
+ {
+ "epoch": 2.3565011820330968,
+ "grad_norm": 2.891716957092285,
+ "learning_rate": 3.3621460441313262e-06,
+ "loss": 0.5259,
+ "step": 4984
+ },
+ {
+ "epoch": 2.3569739952718676,
+ "grad_norm": 2.8448829650878906,
+ "learning_rate": 3.3615604591642896e-06,
+ "loss": 0.4587,
+ "step": 4985
+ },
+ {
+ "epoch": 2.3574468085106384,
+ "grad_norm": 3.114393711090088,
+ "learning_rate": 3.36097482055166e-06,
+ "loss": 0.4352,
+ "step": 4986
+ },
+ {
+ "epoch": 2.357919621749409,
+ "grad_norm": 2.964851140975952,
+ "learning_rate": 3.360389128329904e-06,
+ "loss": 0.5015,
+ "step": 4987
+ },
+ {
+ "epoch": 2.3583924349881795,
+ "grad_norm": 2.4819815158843994,
+ "learning_rate": 3.3598033825354893e-06,
+ "loss": 0.3459,
+ "step": 4988
+ },
+ {
+ "epoch": 2.3588652482269503,
+ "grad_norm": 2.635754346847534,
+ "learning_rate": 3.359217583204889e-06,
+ "loss": 0.4367,
+ "step": 4989
+ },
+ {
+ "epoch": 2.359338061465721,
+ "grad_norm": 2.542482376098633,
+ "learning_rate": 3.358631730374576e-06,
+ "loss": 0.3978,
+ "step": 4990
+ },
+ {
+ "epoch": 2.359810874704492,
+ "grad_norm": 2.614018678665161,
+ "learning_rate": 3.358045824081031e-06,
+ "loss": 0.424,
+ "step": 4991
+ },
+ {
+ "epoch": 2.3602836879432623,
+ "grad_norm": 2.775373697280884,
+ "learning_rate": 3.3574598643607354e-06,
+ "loss": 0.4901,
+ "step": 4992
+ },
+ {
+ "epoch": 2.360756501182033,
+ "grad_norm": 3.091381311416626,
+ "learning_rate": 3.356873851250173e-06,
+ "loss": 0.4954,
+ "step": 4993
+ },
+ {
+ "epoch": 2.361229314420804,
+ "grad_norm": 2.440023422241211,
+ "learning_rate": 3.3562877847858337e-06,
+ "loss": 0.4053,
+ "step": 4994
+ },
+ {
+ "epoch": 2.3617021276595747,
+ "grad_norm": 2.8879518508911133,
+ "learning_rate": 3.3557016650042084e-06,
+ "loss": 0.4766,
+ "step": 4995
+ },
+ {
+ "epoch": 2.362174940898345,
+ "grad_norm": 3.1298391819000244,
+ "learning_rate": 3.355115491941793e-06,
+ "loss": 0.4743,
+ "step": 4996
+ },
+ {
+ "epoch": 2.362647754137116,
+ "grad_norm": 3.3325259685516357,
+ "learning_rate": 3.3545292656350845e-06,
+ "loss": 0.4703,
+ "step": 4997
+ },
+ {
+ "epoch": 2.3631205673758866,
+ "grad_norm": 2.7935359477996826,
+ "learning_rate": 3.353942986120587e-06,
+ "loss": 0.432,
+ "step": 4998
+ },
+ {
+ "epoch": 2.3635933806146574,
+ "grad_norm": 2.623624324798584,
+ "learning_rate": 3.3533566534348033e-06,
+ "loss": 0.4302,
+ "step": 4999
+ },
+ {
+ "epoch": 2.3640661938534278,
+ "grad_norm": 3.1467108726501465,
+ "learning_rate": 3.3527702676142426e-06,
+ "loss": 0.4661,
+ "step": 5000
+ },
+ {
+ "epoch": 2.3645390070921986,
+ "grad_norm": 2.5364840030670166,
+ "learning_rate": 3.352183828695418e-06,
+ "loss": 0.4134,
+ "step": 5001
+ },
+ {
+ "epoch": 2.3650118203309693,
+ "grad_norm": 3.002777338027954,
+ "learning_rate": 3.3515973367148415e-06,
+ "loss": 0.3771,
+ "step": 5002
+ },
+ {
+ "epoch": 2.36548463356974,
+ "grad_norm": 2.660043954849243,
+ "learning_rate": 3.3510107917090335e-06,
+ "loss": 0.4254,
+ "step": 5003
+ },
+ {
+ "epoch": 2.3659574468085105,
+ "grad_norm": 2.7041075229644775,
+ "learning_rate": 3.3504241937145148e-06,
+ "loss": 0.4651,
+ "step": 5004
+ },
+ {
+ "epoch": 2.3664302600472813,
+ "grad_norm": 2.7387280464172363,
+ "learning_rate": 3.349837542767811e-06,
+ "loss": 0.3874,
+ "step": 5005
+ },
+ {
+ "epoch": 2.366903073286052,
+ "grad_norm": 3.012188196182251,
+ "learning_rate": 3.349250838905449e-06,
+ "loss": 0.4508,
+ "step": 5006
+ },
+ {
+ "epoch": 2.3673758865248224,
+ "grad_norm": 2.3108484745025635,
+ "learning_rate": 3.3486640821639616e-06,
+ "loss": 0.3783,
+ "step": 5007
+ },
+ {
+ "epoch": 2.3678486997635932,
+ "grad_norm": 3.2188332080841064,
+ "learning_rate": 3.3480772725798837e-06,
+ "loss": 0.4879,
+ "step": 5008
+ },
+ {
+ "epoch": 2.368321513002364,
+ "grad_norm": 2.566087484359741,
+ "learning_rate": 3.3474904101897526e-06,
+ "loss": 0.3847,
+ "step": 5009
+ },
+ {
+ "epoch": 2.368794326241135,
+ "grad_norm": 2.5581698417663574,
+ "learning_rate": 3.3469034950301092e-06,
+ "loss": 0.4201,
+ "step": 5010
+ },
+ {
+ "epoch": 2.3692671394799056,
+ "grad_norm": 2.900296926498413,
+ "learning_rate": 3.3463165271374992e-06,
+ "loss": 0.4568,
+ "step": 5011
+ },
+ {
+ "epoch": 2.369739952718676,
+ "grad_norm": 2.8239312171936035,
+ "learning_rate": 3.34572950654847e-06,
+ "loss": 0.4583,
+ "step": 5012
+ },
+ {
+ "epoch": 2.370212765957447,
+ "grad_norm": 3.219465970993042,
+ "learning_rate": 3.3451424332995723e-06,
+ "loss": 0.5435,
+ "step": 5013
+ },
+ {
+ "epoch": 2.3706855791962176,
+ "grad_norm": 3.3111915588378906,
+ "learning_rate": 3.344555307427362e-06,
+ "loss": 0.435,
+ "step": 5014
+ },
+ {
+ "epoch": 2.371158392434988,
+ "grad_norm": 3.296668529510498,
+ "learning_rate": 3.3439681289683946e-06,
+ "loss": 0.4738,
+ "step": 5015
+ },
+ {
+ "epoch": 2.3716312056737587,
+ "grad_norm": 3.005722761154175,
+ "learning_rate": 3.343380897959234e-06,
+ "loss": 0.4267,
+ "step": 5016
+ },
+ {
+ "epoch": 2.3721040189125295,
+ "grad_norm": 2.7844085693359375,
+ "learning_rate": 3.3427936144364425e-06,
+ "loss": 0.4558,
+ "step": 5017
+ },
+ {
+ "epoch": 2.3725768321513003,
+ "grad_norm": 2.7532076835632324,
+ "learning_rate": 3.3422062784365884e-06,
+ "loss": 0.4144,
+ "step": 5018
+ },
+ {
+ "epoch": 2.373049645390071,
+ "grad_norm": 2.835764169692993,
+ "learning_rate": 3.3416188899962413e-06,
+ "loss": 0.4945,
+ "step": 5019
+ },
+ {
+ "epoch": 2.3735224586288415,
+ "grad_norm": 3.1513726711273193,
+ "learning_rate": 3.3410314491519767e-06,
+ "loss": 0.4971,
+ "step": 5020
+ },
+ {
+ "epoch": 2.3739952718676123,
+ "grad_norm": 3.0162220001220703,
+ "learning_rate": 3.3404439559403723e-06,
+ "loss": 0.4477,
+ "step": 5021
+ },
+ {
+ "epoch": 2.374468085106383,
+ "grad_norm": 2.676391363143921,
+ "learning_rate": 3.3398564103980073e-06,
+ "loss": 0.432,
+ "step": 5022
+ },
+ {
+ "epoch": 2.3749408983451534,
+ "grad_norm": 2.7806248664855957,
+ "learning_rate": 3.3392688125614663e-06,
+ "loss": 0.4818,
+ "step": 5023
+ },
+ {
+ "epoch": 2.3754137115839242,
+ "grad_norm": 2.968806505203247,
+ "learning_rate": 3.3386811624673373e-06,
+ "loss": 0.4893,
+ "step": 5024
+ },
+ {
+ "epoch": 2.375886524822695,
+ "grad_norm": 2.992684841156006,
+ "learning_rate": 3.3380934601522087e-06,
+ "loss": 0.4423,
+ "step": 5025
+ },
+ {
+ "epoch": 2.376359338061466,
+ "grad_norm": 2.578420639038086,
+ "learning_rate": 3.3375057056526762e-06,
+ "loss": 0.3682,
+ "step": 5026
+ },
+ {
+ "epoch": 2.3768321513002366,
+ "grad_norm": 2.7683115005493164,
+ "learning_rate": 3.336917899005335e-06,
+ "loss": 0.4038,
+ "step": 5027
+ },
+ {
+ "epoch": 2.377304964539007,
+ "grad_norm": 2.838812828063965,
+ "learning_rate": 3.336330040246786e-06,
+ "loss": 0.442,
+ "step": 5028
+ },
+ {
+ "epoch": 2.3777777777777778,
+ "grad_norm": 2.766136646270752,
+ "learning_rate": 3.335742129413633e-06,
+ "loss": 0.4745,
+ "step": 5029
+ },
+ {
+ "epoch": 2.3782505910165486,
+ "grad_norm": 2.862656593322754,
+ "learning_rate": 3.3351541665424812e-06,
+ "loss": 0.4324,
+ "step": 5030
+ },
+ {
+ "epoch": 2.378723404255319,
+ "grad_norm": 2.71425199508667,
+ "learning_rate": 3.3345661516699433e-06,
+ "loss": 0.4013,
+ "step": 5031
+ },
+ {
+ "epoch": 2.3791962174940897,
+ "grad_norm": 2.8404030799865723,
+ "learning_rate": 3.333978084832629e-06,
+ "loss": 0.5038,
+ "step": 5032
+ },
+ {
+ "epoch": 2.3796690307328605,
+ "grad_norm": 2.965851068496704,
+ "learning_rate": 3.3333899660671574e-06,
+ "loss": 0.4668,
+ "step": 5033
+ },
+ {
+ "epoch": 2.3801418439716313,
+ "grad_norm": 2.686452627182007,
+ "learning_rate": 3.3328017954101464e-06,
+ "loss": 0.4167,
+ "step": 5034
+ },
+ {
+ "epoch": 2.380614657210402,
+ "grad_norm": 2.8676156997680664,
+ "learning_rate": 3.3322135728982197e-06,
+ "loss": 0.4531,
+ "step": 5035
+ },
+ {
+ "epoch": 2.3810874704491725,
+ "grad_norm": 2.4456300735473633,
+ "learning_rate": 3.3316252985680026e-06,
+ "loss": 0.4173,
+ "step": 5036
+ },
+ {
+ "epoch": 2.3815602836879433,
+ "grad_norm": 2.5472559928894043,
+ "learning_rate": 3.331036972456124e-06,
+ "loss": 0.3926,
+ "step": 5037
+ },
+ {
+ "epoch": 2.382033096926714,
+ "grad_norm": 2.81900954246521,
+ "learning_rate": 3.330448594599218e-06,
+ "loss": 0.4785,
+ "step": 5038
+ },
+ {
+ "epoch": 2.3825059101654844,
+ "grad_norm": 3.0930590629577637,
+ "learning_rate": 3.329860165033919e-06,
+ "loss": 0.4587,
+ "step": 5039
+ },
+ {
+ "epoch": 2.382978723404255,
+ "grad_norm": 3.0553040504455566,
+ "learning_rate": 3.3292716837968673e-06,
+ "loss": 0.5285,
+ "step": 5040
+ },
+ {
+ "epoch": 2.383451536643026,
+ "grad_norm": 2.577580690383911,
+ "learning_rate": 3.328683150924704e-06,
+ "loss": 0.4184,
+ "step": 5041
+ },
+ {
+ "epoch": 2.383924349881797,
+ "grad_norm": 2.6430366039276123,
+ "learning_rate": 3.3280945664540735e-06,
+ "loss": 0.4636,
+ "step": 5042
+ },
+ {
+ "epoch": 2.3843971631205676,
+ "grad_norm": 3.228360891342163,
+ "learning_rate": 3.3275059304216255e-06,
+ "loss": 0.455,
+ "step": 5043
+ },
+ {
+ "epoch": 2.384869976359338,
+ "grad_norm": 2.776142120361328,
+ "learning_rate": 3.3269172428640125e-06,
+ "loss": 0.4785,
+ "step": 5044
+ },
+ {
+ "epoch": 2.3853427895981087,
+ "grad_norm": 2.755671739578247,
+ "learning_rate": 3.3263285038178882e-06,
+ "loss": 0.4625,
+ "step": 5045
+ },
+ {
+ "epoch": 2.3858156028368795,
+ "grad_norm": 3.061004400253296,
+ "learning_rate": 3.3257397133199114e-06,
+ "loss": 0.4641,
+ "step": 5046
+ },
+ {
+ "epoch": 2.38628841607565,
+ "grad_norm": 2.8391458988189697,
+ "learning_rate": 3.3251508714067432e-06,
+ "loss": 0.5003,
+ "step": 5047
+ },
+ {
+ "epoch": 2.3867612293144207,
+ "grad_norm": 2.390810966491699,
+ "learning_rate": 3.324561978115049e-06,
+ "loss": 0.4446,
+ "step": 5048
+ },
+ {
+ "epoch": 2.3872340425531915,
+ "grad_norm": 2.7760825157165527,
+ "learning_rate": 3.323973033481496e-06,
+ "loss": 0.4443,
+ "step": 5049
+ },
+ {
+ "epoch": 2.3877068557919623,
+ "grad_norm": 3.157893419265747,
+ "learning_rate": 3.3233840375427552e-06,
+ "loss": 0.4934,
+ "step": 5050
+ },
+ {
+ "epoch": 2.388179669030733,
+ "grad_norm": 2.7245349884033203,
+ "learning_rate": 3.3227949903355e-06,
+ "loss": 0.4254,
+ "step": 5051
+ },
+ {
+ "epoch": 2.3886524822695034,
+ "grad_norm": 2.6674044132232666,
+ "learning_rate": 3.322205891896409e-06,
+ "loss": 0.4116,
+ "step": 5052
+ },
+ {
+ "epoch": 2.3891252955082742,
+ "grad_norm": 3.1490554809570312,
+ "learning_rate": 3.3216167422621627e-06,
+ "loss": 0.4604,
+ "step": 5053
+ },
+ {
+ "epoch": 2.389598108747045,
+ "grad_norm": 2.725731134414673,
+ "learning_rate": 3.321027541469444e-06,
+ "loss": 0.4836,
+ "step": 5054
+ },
+ {
+ "epoch": 2.3900709219858154,
+ "grad_norm": 2.5378828048706055,
+ "learning_rate": 3.3204382895549407e-06,
+ "loss": 0.4228,
+ "step": 5055
+ },
+ {
+ "epoch": 2.390543735224586,
+ "grad_norm": 2.8191192150115967,
+ "learning_rate": 3.3198489865553427e-06,
+ "loss": 0.4371,
+ "step": 5056
+ },
+ {
+ "epoch": 2.391016548463357,
+ "grad_norm": 2.5676498413085938,
+ "learning_rate": 3.3192596325073433e-06,
+ "loss": 0.4463,
+ "step": 5057
+ },
+ {
+ "epoch": 2.391489361702128,
+ "grad_norm": 3.0846121311187744,
+ "learning_rate": 3.3186702274476397e-06,
+ "loss": 0.5049,
+ "step": 5058
+ },
+ {
+ "epoch": 2.3919621749408986,
+ "grad_norm": 2.6085152626037598,
+ "learning_rate": 3.3180807714129293e-06,
+ "loss": 0.4376,
+ "step": 5059
+ },
+ {
+ "epoch": 2.392434988179669,
+ "grad_norm": 3.0218591690063477,
+ "learning_rate": 3.3174912644399172e-06,
+ "loss": 0.4734,
+ "step": 5060
+ },
+ {
+ "epoch": 2.3929078014184397,
+ "grad_norm": 2.5904781818389893,
+ "learning_rate": 3.316901706565308e-06,
+ "loss": 0.4924,
+ "step": 5061
+ },
+ {
+ "epoch": 2.3933806146572105,
+ "grad_norm": 2.675478458404541,
+ "learning_rate": 3.3163120978258123e-06,
+ "loss": 0.4072,
+ "step": 5062
+ },
+ {
+ "epoch": 2.393853427895981,
+ "grad_norm": 2.7944445610046387,
+ "learning_rate": 3.3157224382581415e-06,
+ "loss": 0.4328,
+ "step": 5063
+ },
+ {
+ "epoch": 2.3943262411347517,
+ "grad_norm": 2.846224546432495,
+ "learning_rate": 3.315132727899012e-06,
+ "loss": 0.4447,
+ "step": 5064
+ },
+ {
+ "epoch": 2.3947990543735225,
+ "grad_norm": 2.6825828552246094,
+ "learning_rate": 3.3145429667851402e-06,
+ "loss": 0.4528,
+ "step": 5065
+ },
+ {
+ "epoch": 2.3952718676122933,
+ "grad_norm": 3.0305285453796387,
+ "learning_rate": 3.3139531549532505e-06,
+ "loss": 0.4538,
+ "step": 5066
+ },
+ {
+ "epoch": 2.395744680851064,
+ "grad_norm": 2.707540988922119,
+ "learning_rate": 3.313363292440067e-06,
+ "loss": 0.4412,
+ "step": 5067
+ },
+ {
+ "epoch": 2.3962174940898344,
+ "grad_norm": 3.0458385944366455,
+ "learning_rate": 3.3127733792823173e-06,
+ "loss": 0.4587,
+ "step": 5068
+ },
+ {
+ "epoch": 2.396690307328605,
+ "grad_norm": 2.7711992263793945,
+ "learning_rate": 3.312183415516733e-06,
+ "loss": 0.4157,
+ "step": 5069
+ },
+ {
+ "epoch": 2.397163120567376,
+ "grad_norm": 2.6953988075256348,
+ "learning_rate": 3.3115934011800494e-06,
+ "loss": 0.3828,
+ "step": 5070
+ },
+ {
+ "epoch": 2.3976359338061464,
+ "grad_norm": 3.033721923828125,
+ "learning_rate": 3.311003336309003e-06,
+ "loss": 0.5204,
+ "step": 5071
+ },
+ {
+ "epoch": 2.398108747044917,
+ "grad_norm": 2.6134517192840576,
+ "learning_rate": 3.3104132209403355e-06,
+ "loss": 0.4181,
+ "step": 5072
+ },
+ {
+ "epoch": 2.398581560283688,
+ "grad_norm": 2.8800251483917236,
+ "learning_rate": 3.30982305511079e-06,
+ "loss": 0.466,
+ "step": 5073
+ },
+ {
+ "epoch": 2.3990543735224588,
+ "grad_norm": 2.5043210983276367,
+ "learning_rate": 3.309232838857114e-06,
+ "loss": 0.4161,
+ "step": 5074
+ },
+ {
+ "epoch": 2.3995271867612296,
+ "grad_norm": 2.6577322483062744,
+ "learning_rate": 3.308642572216057e-06,
+ "loss": 0.465,
+ "step": 5075
+ },
+ {
+ "epoch": 2.4,
+ "grad_norm": 2.549098253250122,
+ "learning_rate": 3.3080522552243734e-06,
+ "loss": 0.4571,
+ "step": 5076
+ },
+ {
+ "epoch": 2.4004728132387707,
+ "grad_norm": 2.881958246231079,
+ "learning_rate": 3.3074618879188186e-06,
+ "loss": 0.4443,
+ "step": 5077
+ },
+ {
+ "epoch": 2.4009456264775415,
+ "grad_norm": 2.608397960662842,
+ "learning_rate": 3.3068714703361528e-06,
+ "loss": 0.3843,
+ "step": 5078
+ },
+ {
+ "epoch": 2.401418439716312,
+ "grad_norm": 2.8666789531707764,
+ "learning_rate": 3.306281002513139e-06,
+ "loss": 0.4857,
+ "step": 5079
+ },
+ {
+ "epoch": 2.4018912529550827,
+ "grad_norm": 2.9008588790893555,
+ "learning_rate": 3.3056904844865422e-06,
+ "loss": 0.4454,
+ "step": 5080
+ },
+ {
+ "epoch": 2.4023640661938535,
+ "grad_norm": 2.7446060180664062,
+ "learning_rate": 3.3050999162931315e-06,
+ "loss": 0.4522,
+ "step": 5081
+ },
+ {
+ "epoch": 2.4028368794326243,
+ "grad_norm": 2.787116765975952,
+ "learning_rate": 3.3045092979696804e-06,
+ "loss": 0.4714,
+ "step": 5082
+ },
+ {
+ "epoch": 2.403309692671395,
+ "grad_norm": 2.7494192123413086,
+ "learning_rate": 3.3039186295529613e-06,
+ "loss": 0.4107,
+ "step": 5083
+ },
+ {
+ "epoch": 2.4037825059101654,
+ "grad_norm": 2.733794927597046,
+ "learning_rate": 3.303327911079755e-06,
+ "loss": 0.4169,
+ "step": 5084
+ },
+ {
+ "epoch": 2.404255319148936,
+ "grad_norm": 2.7313334941864014,
+ "learning_rate": 3.3027371425868422e-06,
+ "loss": 0.4287,
+ "step": 5085
+ },
+ {
+ "epoch": 2.404728132387707,
+ "grad_norm": 2.7832977771759033,
+ "learning_rate": 3.3021463241110075e-06,
+ "loss": 0.5307,
+ "step": 5086
+ },
+ {
+ "epoch": 2.4052009456264773,
+ "grad_norm": 2.6615281105041504,
+ "learning_rate": 3.301555455689038e-06,
+ "loss": 0.4519,
+ "step": 5087
+ },
+ {
+ "epoch": 2.405673758865248,
+ "grad_norm": 2.343921422958374,
+ "learning_rate": 3.3009645373577264e-06,
+ "loss": 0.46,
+ "step": 5088
+ },
+ {
+ "epoch": 2.406146572104019,
+ "grad_norm": 2.6115355491638184,
+ "learning_rate": 3.300373569153864e-06,
+ "loss": 0.4782,
+ "step": 5089
+ },
+ {
+ "epoch": 2.4066193853427897,
+ "grad_norm": 2.730625629425049,
+ "learning_rate": 3.299782551114249e-06,
+ "loss": 0.4632,
+ "step": 5090
+ },
+ {
+ "epoch": 2.40709219858156,
+ "grad_norm": 2.4495043754577637,
+ "learning_rate": 3.2991914832756824e-06,
+ "loss": 0.4243,
+ "step": 5091
+ },
+ {
+ "epoch": 2.407565011820331,
+ "grad_norm": 2.8731648921966553,
+ "learning_rate": 3.2986003656749654e-06,
+ "loss": 0.4262,
+ "step": 5092
+ },
+ {
+ "epoch": 2.4080378250591017,
+ "grad_norm": 2.870342969894409,
+ "learning_rate": 3.2980091983489053e-06,
+ "loss": 0.4735,
+ "step": 5093
+ },
+ {
+ "epoch": 2.4085106382978725,
+ "grad_norm": 2.500786542892456,
+ "learning_rate": 3.297417981334312e-06,
+ "loss": 0.4007,
+ "step": 5094
+ },
+ {
+ "epoch": 2.408983451536643,
+ "grad_norm": 2.7787322998046875,
+ "learning_rate": 3.2968267146679978e-06,
+ "loss": 0.493,
+ "step": 5095
+ },
+ {
+ "epoch": 2.4094562647754136,
+ "grad_norm": 2.5229599475860596,
+ "learning_rate": 3.2962353983867783e-06,
+ "loss": 0.3676,
+ "step": 5096
+ },
+ {
+ "epoch": 2.4099290780141844,
+ "grad_norm": 3.1955904960632324,
+ "learning_rate": 3.2956440325274715e-06,
+ "loss": 0.4888,
+ "step": 5097
+ },
+ {
+ "epoch": 2.4104018912529552,
+ "grad_norm": 2.8580288887023926,
+ "learning_rate": 3.2950526171268995e-06,
+ "loss": 0.4892,
+ "step": 5098
+ },
+ {
+ "epoch": 2.4108747044917256,
+ "grad_norm": 2.6321749687194824,
+ "learning_rate": 3.294461152221887e-06,
+ "loss": 0.3823,
+ "step": 5099
+ },
+ {
+ "epoch": 2.4113475177304964,
+ "grad_norm": 2.881127119064331,
+ "learning_rate": 3.293869637849263e-06,
+ "loss": 0.4569,
+ "step": 5100
+ },
+ {
+ "epoch": 2.411820330969267,
+ "grad_norm": 2.7742316722869873,
+ "learning_rate": 3.293278074045857e-06,
+ "loss": 0.4445,
+ "step": 5101
+ },
+ {
+ "epoch": 2.412293144208038,
+ "grad_norm": 2.546701431274414,
+ "learning_rate": 3.2926864608485037e-06,
+ "loss": 0.3995,
+ "step": 5102
+ },
+ {
+ "epoch": 2.4127659574468083,
+ "grad_norm": 2.588226318359375,
+ "learning_rate": 3.292094798294041e-06,
+ "loss": 0.4081,
+ "step": 5103
+ },
+ {
+ "epoch": 2.413238770685579,
+ "grad_norm": 2.968689441680908,
+ "learning_rate": 3.2915030864193077e-06,
+ "loss": 0.4475,
+ "step": 5104
+ },
+ {
+ "epoch": 2.41371158392435,
+ "grad_norm": 2.9249184131622314,
+ "learning_rate": 3.290911325261148e-06,
+ "loss": 0.4763,
+ "step": 5105
+ },
+ {
+ "epoch": 2.4141843971631207,
+ "grad_norm": 2.817596673965454,
+ "learning_rate": 3.2903195148564083e-06,
+ "loss": 0.4451,
+ "step": 5106
+ },
+ {
+ "epoch": 2.414657210401891,
+ "grad_norm": 2.6465954780578613,
+ "learning_rate": 3.2897276552419377e-06,
+ "loss": 0.4665,
+ "step": 5107
+ },
+ {
+ "epoch": 2.415130023640662,
+ "grad_norm": 2.8613853454589844,
+ "learning_rate": 3.2891357464545885e-06,
+ "loss": 0.4398,
+ "step": 5108
+ },
+ {
+ "epoch": 2.4156028368794327,
+ "grad_norm": 2.756321907043457,
+ "learning_rate": 3.2885437885312175e-06,
+ "loss": 0.4634,
+ "step": 5109
+ },
+ {
+ "epoch": 2.4160756501182035,
+ "grad_norm": 2.8965282440185547,
+ "learning_rate": 3.287951781508682e-06,
+ "loss": 0.4319,
+ "step": 5110
+ },
+ {
+ "epoch": 2.416548463356974,
+ "grad_norm": 2.896756172180176,
+ "learning_rate": 3.287359725423844e-06,
+ "loss": 0.4771,
+ "step": 5111
+ },
+ {
+ "epoch": 2.4170212765957446,
+ "grad_norm": 2.952911376953125,
+ "learning_rate": 3.286767620313569e-06,
+ "loss": 0.5026,
+ "step": 5112
+ },
+ {
+ "epoch": 2.4174940898345154,
+ "grad_norm": 3.850515604019165,
+ "learning_rate": 3.2861754662147234e-06,
+ "loss": 0.4387,
+ "step": 5113
+ },
+ {
+ "epoch": 2.417966903073286,
+ "grad_norm": 3.0072689056396484,
+ "learning_rate": 3.2855832631641794e-06,
+ "loss": 0.4586,
+ "step": 5114
+ },
+ {
+ "epoch": 2.4184397163120566,
+ "grad_norm": 3.166790246963501,
+ "learning_rate": 3.2849910111988092e-06,
+ "loss": 0.4842,
+ "step": 5115
+ },
+ {
+ "epoch": 2.4189125295508274,
+ "grad_norm": 3.5397679805755615,
+ "learning_rate": 3.284398710355492e-06,
+ "loss": 0.5138,
+ "step": 5116
+ },
+ {
+ "epoch": 2.419385342789598,
+ "grad_norm": 2.779609441757202,
+ "learning_rate": 3.283806360671106e-06,
+ "loss": 0.4049,
+ "step": 5117
+ },
+ {
+ "epoch": 2.419858156028369,
+ "grad_norm": 2.5924575328826904,
+ "learning_rate": 3.283213962182535e-06,
+ "loss": 0.433,
+ "step": 5118
+ },
+ {
+ "epoch": 2.4203309692671393,
+ "grad_norm": 2.7429699897766113,
+ "learning_rate": 3.282621514926665e-06,
+ "loss": 0.4674,
+ "step": 5119
+ },
+ {
+ "epoch": 2.42080378250591,
+ "grad_norm": 2.8113889694213867,
+ "learning_rate": 3.2820290189403846e-06,
+ "loss": 0.3898,
+ "step": 5120
+ },
+ {
+ "epoch": 2.421276595744681,
+ "grad_norm": 2.867105722427368,
+ "learning_rate": 3.2814364742605863e-06,
+ "loss": 0.4439,
+ "step": 5121
+ },
+ {
+ "epoch": 2.4217494089834517,
+ "grad_norm": 2.428597927093506,
+ "learning_rate": 3.2808438809241654e-06,
+ "loss": 0.4339,
+ "step": 5122
+ },
+ {
+ "epoch": 2.422222222222222,
+ "grad_norm": 3.071735143661499,
+ "learning_rate": 3.2802512389680203e-06,
+ "loss": 0.4583,
+ "step": 5123
+ },
+ {
+ "epoch": 2.422695035460993,
+ "grad_norm": 3.046313762664795,
+ "learning_rate": 3.279658548429051e-06,
+ "loss": 0.5351,
+ "step": 5124
+ },
+ {
+ "epoch": 2.4231678486997636,
+ "grad_norm": 2.8412697315216064,
+ "learning_rate": 3.279065809344163e-06,
+ "loss": 0.5258,
+ "step": 5125
+ },
+ {
+ "epoch": 2.4236406619385344,
+ "grad_norm": 2.887169122695923,
+ "learning_rate": 3.278473021750263e-06,
+ "loss": 0.4568,
+ "step": 5126
+ },
+ {
+ "epoch": 2.424113475177305,
+ "grad_norm": 2.8316574096679688,
+ "learning_rate": 3.2778801856842624e-06,
+ "loss": 0.46,
+ "step": 5127
+ },
+ {
+ "epoch": 2.4245862884160756,
+ "grad_norm": 2.7660772800445557,
+ "learning_rate": 3.277287301183073e-06,
+ "loss": 0.4323,
+ "step": 5128
+ },
+ {
+ "epoch": 2.4250591016548464,
+ "grad_norm": 2.737682819366455,
+ "learning_rate": 3.276694368283611e-06,
+ "loss": 0.4296,
+ "step": 5129
+ },
+ {
+ "epoch": 2.425531914893617,
+ "grad_norm": 2.8807425498962402,
+ "learning_rate": 3.276101387022797e-06,
+ "loss": 0.4673,
+ "step": 5130
+ },
+ {
+ "epoch": 2.4260047281323875,
+ "grad_norm": 2.530526876449585,
+ "learning_rate": 3.275508357437552e-06,
+ "loss": 0.416,
+ "step": 5131
+ },
+ {
+ "epoch": 2.4264775413711583,
+ "grad_norm": 3.1189746856689453,
+ "learning_rate": 3.274915279564803e-06,
+ "loss": 0.4171,
+ "step": 5132
+ },
+ {
+ "epoch": 2.426950354609929,
+ "grad_norm": 2.6612462997436523,
+ "learning_rate": 3.274322153441477e-06,
+ "loss": 0.4104,
+ "step": 5133
+ },
+ {
+ "epoch": 2.4274231678487,
+ "grad_norm": 2.717973470687866,
+ "learning_rate": 3.2737289791045064e-06,
+ "loss": 0.479,
+ "step": 5134
+ },
+ {
+ "epoch": 2.4278959810874703,
+ "grad_norm": 2.764216661453247,
+ "learning_rate": 3.2731357565908247e-06,
+ "loss": 0.481,
+ "step": 5135
+ },
+ {
+ "epoch": 2.428368794326241,
+ "grad_norm": 2.5081393718719482,
+ "learning_rate": 3.272542485937369e-06,
+ "loss": 0.4592,
+ "step": 5136
+ },
+ {
+ "epoch": 2.428841607565012,
+ "grad_norm": 3.1380364894866943,
+ "learning_rate": 3.271949167181081e-06,
+ "loss": 0.4179,
+ "step": 5137
+ },
+ {
+ "epoch": 2.4293144208037827,
+ "grad_norm": 2.9275963306427,
+ "learning_rate": 3.2713558003589026e-06,
+ "loss": 0.5196,
+ "step": 5138
+ },
+ {
+ "epoch": 2.429787234042553,
+ "grad_norm": 2.8215506076812744,
+ "learning_rate": 3.270762385507781e-06,
+ "loss": 0.4081,
+ "step": 5139
+ },
+ {
+ "epoch": 2.430260047281324,
+ "grad_norm": 2.9185614585876465,
+ "learning_rate": 3.270168922664665e-06,
+ "loss": 0.4936,
+ "step": 5140
+ },
+ {
+ "epoch": 2.4307328605200946,
+ "grad_norm": 2.6507248878479004,
+ "learning_rate": 3.269575411866507e-06,
+ "loss": 0.4834,
+ "step": 5141
+ },
+ {
+ "epoch": 2.4312056737588654,
+ "grad_norm": 2.864741563796997,
+ "learning_rate": 3.2689818531502637e-06,
+ "loss": 0.4562,
+ "step": 5142
+ },
+ {
+ "epoch": 2.431678486997636,
+ "grad_norm": 2.806919813156128,
+ "learning_rate": 3.2683882465528917e-06,
+ "loss": 0.4645,
+ "step": 5143
+ },
+ {
+ "epoch": 2.4321513002364066,
+ "grad_norm": 2.733372211456299,
+ "learning_rate": 3.267794592111353e-06,
+ "loss": 0.4123,
+ "step": 5144
+ },
+ {
+ "epoch": 2.4326241134751774,
+ "grad_norm": 2.8005833625793457,
+ "learning_rate": 3.2672008898626116e-06,
+ "loss": 0.4343,
+ "step": 5145
+ },
+ {
+ "epoch": 2.433096926713948,
+ "grad_norm": 3.2339670658111572,
+ "learning_rate": 3.2666071398436354e-06,
+ "loss": 0.4017,
+ "step": 5146
+ },
+ {
+ "epoch": 2.4335697399527185,
+ "grad_norm": 2.510251760482788,
+ "learning_rate": 3.2660133420913932e-06,
+ "loss": 0.3882,
+ "step": 5147
+ },
+ {
+ "epoch": 2.4340425531914893,
+ "grad_norm": 3.5633628368377686,
+ "learning_rate": 3.26541949664286e-06,
+ "loss": 0.4766,
+ "step": 5148
+ },
+ {
+ "epoch": 2.43451536643026,
+ "grad_norm": 2.8246724605560303,
+ "learning_rate": 3.26482560353501e-06,
+ "loss": 0.3728,
+ "step": 5149
+ },
+ {
+ "epoch": 2.434988179669031,
+ "grad_norm": 2.4923641681671143,
+ "learning_rate": 3.264231662804823e-06,
+ "loss": 0.4346,
+ "step": 5150
+ },
+ {
+ "epoch": 2.4354609929078013,
+ "grad_norm": 3.180874824523926,
+ "learning_rate": 3.2636376744892827e-06,
+ "loss": 0.4351,
+ "step": 5151
+ },
+ {
+ "epoch": 2.435933806146572,
+ "grad_norm": 2.6933515071868896,
+ "learning_rate": 3.263043638625373e-06,
+ "loss": 0.4293,
+ "step": 5152
+ },
+ {
+ "epoch": 2.436406619385343,
+ "grad_norm": 2.584132194519043,
+ "learning_rate": 3.262449555250081e-06,
+ "loss": 0.4589,
+ "step": 5153
+ },
+ {
+ "epoch": 2.4368794326241137,
+ "grad_norm": 2.8103036880493164,
+ "learning_rate": 3.2618554244003985e-06,
+ "loss": 0.463,
+ "step": 5154
+ },
+ {
+ "epoch": 2.437352245862884,
+ "grad_norm": 2.809070587158203,
+ "learning_rate": 3.2612612461133197e-06,
+ "loss": 0.4629,
+ "step": 5155
+ },
+ {
+ "epoch": 2.437825059101655,
+ "grad_norm": 2.98148512840271,
+ "learning_rate": 3.2606670204258405e-06,
+ "loss": 0.451,
+ "step": 5156
+ },
+ {
+ "epoch": 2.4382978723404256,
+ "grad_norm": 2.691047191619873,
+ "learning_rate": 3.2600727473749614e-06,
+ "loss": 0.3878,
+ "step": 5157
+ },
+ {
+ "epoch": 2.4387706855791964,
+ "grad_norm": 2.900360345840454,
+ "learning_rate": 3.2594784269976856e-06,
+ "loss": 0.4216,
+ "step": 5158
+ },
+ {
+ "epoch": 2.4392434988179668,
+ "grad_norm": 2.8449952602386475,
+ "learning_rate": 3.258884059331019e-06,
+ "loss": 0.4268,
+ "step": 5159
+ },
+ {
+ "epoch": 2.4397163120567376,
+ "grad_norm": 2.7226388454437256,
+ "learning_rate": 3.258289644411969e-06,
+ "loss": 0.4381,
+ "step": 5160
+ },
+ {
+ "epoch": 2.4401891252955084,
+ "grad_norm": 2.513946056365967,
+ "learning_rate": 3.257695182277547e-06,
+ "loss": 0.4566,
+ "step": 5161
+ },
+ {
+ "epoch": 2.440661938534279,
+ "grad_norm": 2.9941394329071045,
+ "learning_rate": 3.2571006729647693e-06,
+ "loss": 0.4395,
+ "step": 5162
+ },
+ {
+ "epoch": 2.4411347517730495,
+ "grad_norm": 2.699094533920288,
+ "learning_rate": 3.2565061165106523e-06,
+ "loss": 0.4274,
+ "step": 5163
+ },
+ {
+ "epoch": 2.4416075650118203,
+ "grad_norm": 2.574193000793457,
+ "learning_rate": 3.255911512952216e-06,
+ "loss": 0.4187,
+ "step": 5164
+ },
+ {
+ "epoch": 2.442080378250591,
+ "grad_norm": 2.920766592025757,
+ "learning_rate": 3.2553168623264854e-06,
+ "loss": 0.4911,
+ "step": 5165
+ },
+ {
+ "epoch": 2.4425531914893615,
+ "grad_norm": 2.728421926498413,
+ "learning_rate": 3.2547221646704853e-06,
+ "loss": 0.4466,
+ "step": 5166
+ },
+ {
+ "epoch": 2.4430260047281322,
+ "grad_norm": 2.8171417713165283,
+ "learning_rate": 3.254127420021246e-06,
+ "loss": 0.4331,
+ "step": 5167
+ },
+ {
+ "epoch": 2.443498817966903,
+ "grad_norm": 2.4069135189056396,
+ "learning_rate": 3.2535326284157975e-06,
+ "loss": 0.389,
+ "step": 5168
+ },
+ {
+ "epoch": 2.443971631205674,
+ "grad_norm": 2.912405490875244,
+ "learning_rate": 3.2529377898911777e-06,
+ "loss": 0.4681,
+ "step": 5169
+ },
+ {
+ "epoch": 2.4444444444444446,
+ "grad_norm": 2.987558126449585,
+ "learning_rate": 3.2523429044844228e-06,
+ "loss": 0.4715,
+ "step": 5170
+ },
+ {
+ "epoch": 2.444917257683215,
+ "grad_norm": 2.5117199420928955,
+ "learning_rate": 3.251747972232574e-06,
+ "loss": 0.4531,
+ "step": 5171
+ },
+ {
+ "epoch": 2.445390070921986,
+ "grad_norm": 2.5405385494232178,
+ "learning_rate": 3.2511529931726752e-06,
+ "loss": 0.4323,
+ "step": 5172
+ },
+ {
+ "epoch": 2.4458628841607566,
+ "grad_norm": 2.989932060241699,
+ "learning_rate": 3.250557967341773e-06,
+ "loss": 0.4039,
+ "step": 5173
+ },
+ {
+ "epoch": 2.446335697399527,
+ "grad_norm": 2.6331627368927,
+ "learning_rate": 3.2499628947769186e-06,
+ "loss": 0.5147,
+ "step": 5174
+ },
+ {
+ "epoch": 2.4468085106382977,
+ "grad_norm": 2.71699857711792,
+ "learning_rate": 3.249367775515162e-06,
+ "loss": 0.3748,
+ "step": 5175
+ },
+ {
+ "epoch": 2.4472813238770685,
+ "grad_norm": 2.9508471488952637,
+ "learning_rate": 3.2487726095935606e-06,
+ "loss": 0.5145,
+ "step": 5176
+ },
+ {
+ "epoch": 2.4477541371158393,
+ "grad_norm": 2.8276431560516357,
+ "learning_rate": 3.2481773970491713e-06,
+ "loss": 0.4295,
+ "step": 5177
+ },
+ {
+ "epoch": 2.44822695035461,
+ "grad_norm": 2.5500540733337402,
+ "learning_rate": 3.2475821379190565e-06,
+ "loss": 0.4246,
+ "step": 5178
+ },
+ {
+ "epoch": 2.4486997635933805,
+ "grad_norm": 2.845641613006592,
+ "learning_rate": 3.246986832240281e-06,
+ "loss": 0.4211,
+ "step": 5179
+ },
+ {
+ "epoch": 2.4491725768321513,
+ "grad_norm": 3.1215856075286865,
+ "learning_rate": 3.2463914800499097e-06,
+ "loss": 0.4378,
+ "step": 5180
+ },
+ {
+ "epoch": 2.449645390070922,
+ "grad_norm": 2.4685606956481934,
+ "learning_rate": 3.2457960813850137e-06,
+ "loss": 0.4836,
+ "step": 5181
+ },
+ {
+ "epoch": 2.4501182033096924,
+ "grad_norm": 2.508028268814087,
+ "learning_rate": 3.245200636282666e-06,
+ "loss": 0.4377,
+ "step": 5182
+ },
+ {
+ "epoch": 2.4505910165484632,
+ "grad_norm": 2.899949312210083,
+ "learning_rate": 3.244605144779943e-06,
+ "loss": 0.501,
+ "step": 5183
+ },
+ {
+ "epoch": 2.451063829787234,
+ "grad_norm": 2.6494483947753906,
+ "learning_rate": 3.244009606913923e-06,
+ "loss": 0.4255,
+ "step": 5184
+ },
+ {
+ "epoch": 2.451536643026005,
+ "grad_norm": 2.4363760948181152,
+ "learning_rate": 3.243414022721686e-06,
+ "loss": 0.4402,
+ "step": 5185
+ },
+ {
+ "epoch": 2.4520094562647756,
+ "grad_norm": 2.4725022315979004,
+ "learning_rate": 3.242818392240317e-06,
+ "loss": 0.4388,
+ "step": 5186
+ },
+ {
+ "epoch": 2.452482269503546,
+ "grad_norm": 2.7010514736175537,
+ "learning_rate": 3.242222715506905e-06,
+ "loss": 0.4388,
+ "step": 5187
+ },
+ {
+ "epoch": 2.4529550827423168,
+ "grad_norm": 2.811464548110962,
+ "learning_rate": 3.241626992558539e-06,
+ "loss": 0.4634,
+ "step": 5188
+ },
+ {
+ "epoch": 2.4534278959810876,
+ "grad_norm": 2.6473052501678467,
+ "learning_rate": 3.2410312234323123e-06,
+ "loss": 0.4752,
+ "step": 5189
+ },
+ {
+ "epoch": 2.453900709219858,
+ "grad_norm": 2.5587213039398193,
+ "learning_rate": 3.24043540816532e-06,
+ "loss": 0.4458,
+ "step": 5190
+ },
+ {
+ "epoch": 2.4543735224586287,
+ "grad_norm": 2.6306557655334473,
+ "learning_rate": 3.239839546794662e-06,
+ "loss": 0.4081,
+ "step": 5191
+ },
+ {
+ "epoch": 2.4548463356973995,
+ "grad_norm": 2.4613633155822754,
+ "learning_rate": 3.23924363935744e-06,
+ "loss": 0.4165,
+ "step": 5192
+ },
+ {
+ "epoch": 2.4553191489361703,
+ "grad_norm": 2.7189204692840576,
+ "learning_rate": 3.238647685890757e-06,
+ "loss": 0.4822,
+ "step": 5193
+ },
+ {
+ "epoch": 2.455791962174941,
+ "grad_norm": 3.015977382659912,
+ "learning_rate": 3.238051686431722e-06,
+ "loss": 0.4964,
+ "step": 5194
+ },
+ {
+ "epoch": 2.4562647754137115,
+ "grad_norm": 2.8868937492370605,
+ "learning_rate": 3.2374556410174445e-06,
+ "loss": 0.4514,
+ "step": 5195
+ },
+ {
+ "epoch": 2.4567375886524823,
+ "grad_norm": 2.7959537506103516,
+ "learning_rate": 3.2368595496850375e-06,
+ "loss": 0.475,
+ "step": 5196
+ },
+ {
+ "epoch": 2.457210401891253,
+ "grad_norm": 3.0086777210235596,
+ "learning_rate": 3.2362634124716187e-06,
+ "loss": 0.4913,
+ "step": 5197
+ },
+ {
+ "epoch": 2.4576832151300234,
+ "grad_norm": 2.621335506439209,
+ "learning_rate": 3.2356672294143044e-06,
+ "loss": 0.4259,
+ "step": 5198
+ },
+ {
+ "epoch": 2.458156028368794,
+ "grad_norm": 3.1620380878448486,
+ "learning_rate": 3.235071000550218e-06,
+ "loss": 0.451,
+ "step": 5199
+ },
+ {
+ "epoch": 2.458628841607565,
+ "grad_norm": 2.7663278579711914,
+ "learning_rate": 3.234474725916484e-06,
+ "loss": 0.3854,
+ "step": 5200
+ },
+ {
+ "epoch": 2.459101654846336,
+ "grad_norm": 2.5187132358551025,
+ "learning_rate": 3.2338784055502288e-06,
+ "loss": 0.4068,
+ "step": 5201
+ },
+ {
+ "epoch": 2.4595744680851066,
+ "grad_norm": 2.6022701263427734,
+ "learning_rate": 3.233282039488583e-06,
+ "loss": 0.4484,
+ "step": 5202
+ },
+ {
+ "epoch": 2.460047281323877,
+ "grad_norm": 2.874750852584839,
+ "learning_rate": 3.2326856277686807e-06,
+ "loss": 0.45,
+ "step": 5203
+ },
+ {
+ "epoch": 2.4605200945626478,
+ "grad_norm": 2.671008586883545,
+ "learning_rate": 3.232089170427656e-06,
+ "loss": 0.4446,
+ "step": 5204
+ },
+ {
+ "epoch": 2.4609929078014185,
+ "grad_norm": 2.7365503311157227,
+ "learning_rate": 3.2314926675026498e-06,
+ "loss": 0.4402,
+ "step": 5205
+ },
+ {
+ "epoch": 2.461465721040189,
+ "grad_norm": 2.8163657188415527,
+ "learning_rate": 3.230896119030803e-06,
+ "loss": 0.3881,
+ "step": 5206
+ },
+ {
+ "epoch": 2.4619385342789597,
+ "grad_norm": 2.812433958053589,
+ "learning_rate": 3.2302995250492584e-06,
+ "loss": 0.4897,
+ "step": 5207
+ },
+ {
+ "epoch": 2.4624113475177305,
+ "grad_norm": 2.786033868789673,
+ "learning_rate": 3.2297028855951664e-06,
+ "loss": 0.4069,
+ "step": 5208
+ },
+ {
+ "epoch": 2.4628841607565013,
+ "grad_norm": 3.0247974395751953,
+ "learning_rate": 3.229106200705674e-06,
+ "loss": 0.4048,
+ "step": 5209
+ },
+ {
+ "epoch": 2.463356973995272,
+ "grad_norm": 3.3280487060546875,
+ "learning_rate": 3.2285094704179353e-06,
+ "loss": 0.5613,
+ "step": 5210
+ },
+ {
+ "epoch": 2.4638297872340424,
+ "grad_norm": 2.603219985961914,
+ "learning_rate": 3.2279126947691073e-06,
+ "loss": 0.432,
+ "step": 5211
+ },
+ {
+ "epoch": 2.4643026004728132,
+ "grad_norm": 3.1532180309295654,
+ "learning_rate": 3.2273158737963472e-06,
+ "loss": 0.4602,
+ "step": 5212
+ },
+ {
+ "epoch": 2.464775413711584,
+ "grad_norm": 2.7512969970703125,
+ "learning_rate": 3.2267190075368164e-06,
+ "loss": 0.5064,
+ "step": 5213
+ },
+ {
+ "epoch": 2.4652482269503544,
+ "grad_norm": 2.926992177963257,
+ "learning_rate": 3.22612209602768e-06,
+ "loss": 0.4753,
+ "step": 5214
+ },
+ {
+ "epoch": 2.465721040189125,
+ "grad_norm": 4.052840709686279,
+ "learning_rate": 3.2255251393061047e-06,
+ "loss": 0.5235,
+ "step": 5215
+ },
+ {
+ "epoch": 2.466193853427896,
+ "grad_norm": 2.8266959190368652,
+ "learning_rate": 3.2249281374092606e-06,
+ "loss": 0.3931,
+ "step": 5216
+ },
+ {
+ "epoch": 2.466666666666667,
+ "grad_norm": 2.564359426498413,
+ "learning_rate": 3.2243310903743196e-06,
+ "loss": 0.4146,
+ "step": 5217
+ },
+ {
+ "epoch": 2.4671394799054376,
+ "grad_norm": 2.387925148010254,
+ "learning_rate": 3.2237339982384576e-06,
+ "loss": 0.4142,
+ "step": 5218
+ },
+ {
+ "epoch": 2.467612293144208,
+ "grad_norm": 2.7045164108276367,
+ "learning_rate": 3.223136861038853e-06,
+ "loss": 0.4345,
+ "step": 5219
+ },
+ {
+ "epoch": 2.4680851063829787,
+ "grad_norm": 2.6963284015655518,
+ "learning_rate": 3.2225396788126872e-06,
+ "loss": 0.4243,
+ "step": 5220
+ },
+ {
+ "epoch": 2.4685579196217495,
+ "grad_norm": 2.8247268199920654,
+ "learning_rate": 3.221942451597144e-06,
+ "loss": 0.3919,
+ "step": 5221
+ },
+ {
+ "epoch": 2.46903073286052,
+ "grad_norm": 3.843836784362793,
+ "learning_rate": 3.2213451794294093e-06,
+ "loss": 0.4183,
+ "step": 5222
+ },
+ {
+ "epoch": 2.4695035460992907,
+ "grad_norm": 2.8579909801483154,
+ "learning_rate": 3.220747862346674e-06,
+ "loss": 0.4844,
+ "step": 5223
+ },
+ {
+ "epoch": 2.4699763593380615,
+ "grad_norm": 3.744027853012085,
+ "learning_rate": 3.2201505003861294e-06,
+ "loss": 0.4563,
+ "step": 5224
+ },
+ {
+ "epoch": 2.4704491725768323,
+ "grad_norm": 2.835108995437622,
+ "learning_rate": 3.219553093584971e-06,
+ "loss": 0.4394,
+ "step": 5225
+ },
+ {
+ "epoch": 2.470921985815603,
+ "grad_norm": 2.5681865215301514,
+ "learning_rate": 3.218955641980397e-06,
+ "loss": 0.3907,
+ "step": 5226
+ },
+ {
+ "epoch": 2.4713947990543734,
+ "grad_norm": 2.963172674179077,
+ "learning_rate": 3.2183581456096067e-06,
+ "loss": 0.5163,
+ "step": 5227
+ },
+ {
+ "epoch": 2.4718676122931442,
+ "grad_norm": 2.7840685844421387,
+ "learning_rate": 3.2177606045098047e-06,
+ "loss": 0.411,
+ "step": 5228
+ },
+ {
+ "epoch": 2.472340425531915,
+ "grad_norm": 2.7849979400634766,
+ "learning_rate": 3.2171630187181977e-06,
+ "loss": 0.4671,
+ "step": 5229
+ },
+ {
+ "epoch": 2.4728132387706854,
+ "grad_norm": 2.736406087875366,
+ "learning_rate": 3.216565388271994e-06,
+ "loss": 0.5225,
+ "step": 5230
+ },
+ {
+ "epoch": 2.473286052009456,
+ "grad_norm": 2.978271007537842,
+ "learning_rate": 3.215967713208406e-06,
+ "loss": 0.4668,
+ "step": 5231
+ },
+ {
+ "epoch": 2.473758865248227,
+ "grad_norm": 2.687560796737671,
+ "learning_rate": 3.2153699935646475e-06,
+ "loss": 0.4683,
+ "step": 5232
+ },
+ {
+ "epoch": 2.4742316784869978,
+ "grad_norm": 2.7096521854400635,
+ "learning_rate": 3.214772229377936e-06,
+ "loss": 0.4999,
+ "step": 5233
+ },
+ {
+ "epoch": 2.4747044917257686,
+ "grad_norm": 3.1861157417297363,
+ "learning_rate": 3.214174420685493e-06,
+ "loss": 0.4365,
+ "step": 5234
+ },
+ {
+ "epoch": 2.475177304964539,
+ "grad_norm": 2.623061418533325,
+ "learning_rate": 3.2135765675245394e-06,
+ "loss": 0.3717,
+ "step": 5235
+ },
+ {
+ "epoch": 2.4756501182033097,
+ "grad_norm": 2.680921792984009,
+ "learning_rate": 3.2129786699323016e-06,
+ "loss": 0.4688,
+ "step": 5236
+ },
+ {
+ "epoch": 2.4761229314420805,
+ "grad_norm": 2.80426025390625,
+ "learning_rate": 3.2123807279460096e-06,
+ "loss": 0.5043,
+ "step": 5237
+ },
+ {
+ "epoch": 2.476595744680851,
+ "grad_norm": 2.676156997680664,
+ "learning_rate": 3.211782741602893e-06,
+ "loss": 0.4486,
+ "step": 5238
+ },
+ {
+ "epoch": 2.4770685579196217,
+ "grad_norm": 2.700822591781616,
+ "learning_rate": 3.2111847109401855e-06,
+ "loss": 0.4097,
+ "step": 5239
+ },
+ {
+ "epoch": 2.4775413711583925,
+ "grad_norm": 2.735387086868286,
+ "learning_rate": 3.2105866359951254e-06,
+ "loss": 0.4357,
+ "step": 5240
+ },
+ {
+ "epoch": 2.4780141843971633,
+ "grad_norm": 2.961874485015869,
+ "learning_rate": 3.2099885168049507e-06,
+ "loss": 0.4942,
+ "step": 5241
+ },
+ {
+ "epoch": 2.478486997635934,
+ "grad_norm": 2.546588659286499,
+ "learning_rate": 3.209390353406904e-06,
+ "loss": 0.3852,
+ "step": 5242
+ },
+ {
+ "epoch": 2.4789598108747044,
+ "grad_norm": 2.6269772052764893,
+ "learning_rate": 3.208792145838231e-06,
+ "loss": 0.3935,
+ "step": 5243
+ },
+ {
+ "epoch": 2.479432624113475,
+ "grad_norm": 2.9009883403778076,
+ "learning_rate": 3.208193894136179e-06,
+ "loss": 0.4003,
+ "step": 5244
+ },
+ {
+ "epoch": 2.479905437352246,
+ "grad_norm": 2.772834300994873,
+ "learning_rate": 3.2075955983379982e-06,
+ "loss": 0.4742,
+ "step": 5245
+ },
+ {
+ "epoch": 2.4803782505910164,
+ "grad_norm": 2.728703737258911,
+ "learning_rate": 3.2069972584809423e-06,
+ "loss": 0.4405,
+ "step": 5246
+ },
+ {
+ "epoch": 2.480851063829787,
+ "grad_norm": 2.72868275642395,
+ "learning_rate": 3.206398874602268e-06,
+ "loss": 0.4714,
+ "step": 5247
+ },
+ {
+ "epoch": 2.481323877068558,
+ "grad_norm": 2.6804213523864746,
+ "learning_rate": 3.2058004467392323e-06,
+ "loss": 0.4106,
+ "step": 5248
+ },
+ {
+ "epoch": 2.4817966903073287,
+ "grad_norm": 2.6740739345550537,
+ "learning_rate": 3.205201974929098e-06,
+ "loss": 0.3855,
+ "step": 5249
+ },
+ {
+ "epoch": 2.482269503546099,
+ "grad_norm": 2.8131754398345947,
+ "learning_rate": 3.204603459209129e-06,
+ "loss": 0.418,
+ "step": 5250
+ },
+ {
+ "epoch": 2.48274231678487,
+ "grad_norm": 2.5242888927459717,
+ "learning_rate": 3.204004899616592e-06,
+ "loss": 0.4914,
+ "step": 5251
+ },
+ {
+ "epoch": 2.4832151300236407,
+ "grad_norm": 2.969191551208496,
+ "learning_rate": 3.2034062961887567e-06,
+ "loss": 0.4634,
+ "step": 5252
+ },
+ {
+ "epoch": 2.4836879432624115,
+ "grad_norm": 2.967968463897705,
+ "learning_rate": 3.2028076489628963e-06,
+ "loss": 0.456,
+ "step": 5253
+ },
+ {
+ "epoch": 2.484160756501182,
+ "grad_norm": 2.9006540775299072,
+ "learning_rate": 3.2022089579762845e-06,
+ "loss": 0.4203,
+ "step": 5254
+ },
+ {
+ "epoch": 2.4846335697399526,
+ "grad_norm": 2.6377336978912354,
+ "learning_rate": 3.2016102232662003e-06,
+ "loss": 0.4518,
+ "step": 5255
+ },
+ {
+ "epoch": 2.4851063829787234,
+ "grad_norm": 2.757749319076538,
+ "learning_rate": 3.201011444869925e-06,
+ "loss": 0.4314,
+ "step": 5256
+ },
+ {
+ "epoch": 2.4855791962174942,
+ "grad_norm": 2.571560859680176,
+ "learning_rate": 3.20041262282474e-06,
+ "loss": 0.427,
+ "step": 5257
+ },
+ {
+ "epoch": 2.4860520094562646,
+ "grad_norm": 3.1367194652557373,
+ "learning_rate": 3.1998137571679316e-06,
+ "loss": 0.4901,
+ "step": 5258
+ },
+ {
+ "epoch": 2.4865248226950354,
+ "grad_norm": 3.194042205810547,
+ "learning_rate": 3.1992148479367896e-06,
+ "loss": 0.466,
+ "step": 5259
+ },
+ {
+ "epoch": 2.486997635933806,
+ "grad_norm": 2.5546324253082275,
+ "learning_rate": 3.1986158951686052e-06,
+ "loss": 0.4182,
+ "step": 5260
+ },
+ {
+ "epoch": 2.487470449172577,
+ "grad_norm": 2.919783115386963,
+ "learning_rate": 3.198016898900672e-06,
+ "loss": 0.4234,
+ "step": 5261
+ },
+ {
+ "epoch": 2.4879432624113473,
+ "grad_norm": 2.865248918533325,
+ "learning_rate": 3.1974178591702877e-06,
+ "loss": 0.4291,
+ "step": 5262
+ },
+ {
+ "epoch": 2.488416075650118,
+ "grad_norm": 2.685737133026123,
+ "learning_rate": 3.196818776014752e-06,
+ "loss": 0.4548,
+ "step": 5263
+ },
+ {
+ "epoch": 2.488888888888889,
+ "grad_norm": 2.826974630355835,
+ "learning_rate": 3.196219649471365e-06,
+ "loss": 0.4152,
+ "step": 5264
+ },
+ {
+ "epoch": 2.4893617021276597,
+ "grad_norm": 2.764975070953369,
+ "learning_rate": 3.1956204795774336e-06,
+ "loss": 0.5209,
+ "step": 5265
+ },
+ {
+ "epoch": 2.48983451536643,
+ "grad_norm": 2.4184255599975586,
+ "learning_rate": 3.1950212663702662e-06,
+ "loss": 0.3969,
+ "step": 5266
+ },
+ {
+ "epoch": 2.490307328605201,
+ "grad_norm": 2.9361133575439453,
+ "learning_rate": 3.1944220098871713e-06,
+ "loss": 0.4589,
+ "step": 5267
+ },
+ {
+ "epoch": 2.4907801418439717,
+ "grad_norm": 2.377051830291748,
+ "learning_rate": 3.193822710165463e-06,
+ "loss": 0.4328,
+ "step": 5268
+ },
+ {
+ "epoch": 2.4912529550827425,
+ "grad_norm": 3.1302497386932373,
+ "learning_rate": 3.1932233672424563e-06,
+ "loss": 0.3918,
+ "step": 5269
+ },
+ {
+ "epoch": 2.491725768321513,
+ "grad_norm": 2.89577579498291,
+ "learning_rate": 3.192623981155471e-06,
+ "loss": 0.5004,
+ "step": 5270
+ },
+ {
+ "epoch": 2.4921985815602836,
+ "grad_norm": 2.7735235691070557,
+ "learning_rate": 3.1920245519418273e-06,
+ "loss": 0.4206,
+ "step": 5271
+ },
+ {
+ "epoch": 2.4926713947990544,
+ "grad_norm": 2.5424516201019287,
+ "learning_rate": 3.1914250796388493e-06,
+ "loss": 0.4419,
+ "step": 5272
+ },
+ {
+ "epoch": 2.493144208037825,
+ "grad_norm": 3.1216981410980225,
+ "learning_rate": 3.1908255642838628e-06,
+ "loss": 0.4552,
+ "step": 5273
+ },
+ {
+ "epoch": 2.4936170212765956,
+ "grad_norm": 3.044045925140381,
+ "learning_rate": 3.1902260059141978e-06,
+ "loss": 0.4967,
+ "step": 5274
+ },
+ {
+ "epoch": 2.4940898345153664,
+ "grad_norm": 2.5630741119384766,
+ "learning_rate": 3.189626404567186e-06,
+ "loss": 0.3908,
+ "step": 5275
+ },
+ {
+ "epoch": 2.494562647754137,
+ "grad_norm": 2.7177648544311523,
+ "learning_rate": 3.189026760280162e-06,
+ "loss": 0.4915,
+ "step": 5276
+ },
+ {
+ "epoch": 2.495035460992908,
+ "grad_norm": 2.653416395187378,
+ "learning_rate": 3.1884270730904632e-06,
+ "loss": 0.4633,
+ "step": 5277
+ },
+ {
+ "epoch": 2.4955082742316783,
+ "grad_norm": 3.7212321758270264,
+ "learning_rate": 3.1878273430354284e-06,
+ "loss": 0.4549,
+ "step": 5278
+ },
+ {
+ "epoch": 2.495981087470449,
+ "grad_norm": 2.4152729511260986,
+ "learning_rate": 3.187227570152402e-06,
+ "loss": 0.4674,
+ "step": 5279
+ },
+ {
+ "epoch": 2.49645390070922,
+ "grad_norm": 2.5354862213134766,
+ "learning_rate": 3.1866277544787284e-06,
+ "loss": 0.4135,
+ "step": 5280
+ },
+ {
+ "epoch": 2.4969267139479907,
+ "grad_norm": 3.1766583919525146,
+ "learning_rate": 3.186027896051754e-06,
+ "loss": 0.5656,
+ "step": 5281
+ },
+ {
+ "epoch": 2.497399527186761,
+ "grad_norm": 2.5636754035949707,
+ "learning_rate": 3.1854279949088313e-06,
+ "loss": 0.4138,
+ "step": 5282
+ },
+ {
+ "epoch": 2.497872340425532,
+ "grad_norm": 2.7615602016448975,
+ "learning_rate": 3.1848280510873124e-06,
+ "loss": 0.4936,
+ "step": 5283
+ },
+ {
+ "epoch": 2.4983451536643027,
+ "grad_norm": 2.964721918106079,
+ "learning_rate": 3.1842280646245543e-06,
+ "loss": 0.4865,
+ "step": 5284
+ },
+ {
+ "epoch": 2.4988179669030735,
+ "grad_norm": 2.6915178298950195,
+ "learning_rate": 3.1836280355579152e-06,
+ "loss": 0.4179,
+ "step": 5285
+ },
+ {
+ "epoch": 2.499290780141844,
+ "grad_norm": 2.820451259613037,
+ "learning_rate": 3.183027963924755e-06,
+ "loss": 0.4785,
+ "step": 5286
+ },
+ {
+ "epoch": 2.4997635933806146,
+ "grad_norm": 2.841719627380371,
+ "learning_rate": 3.1824278497624393e-06,
+ "loss": 0.4535,
+ "step": 5287
+ },
+ {
+ "epoch": 2.5002364066193854,
+ "grad_norm": 2.459167957305908,
+ "learning_rate": 3.181827693108333e-06,
+ "loss": 0.4353,
+ "step": 5288
+ },
+ {
+ "epoch": 2.500709219858156,
+ "grad_norm": 3.2538363933563232,
+ "learning_rate": 3.1812274939998066e-06,
+ "loss": 0.4037,
+ "step": 5289
+ },
+ {
+ "epoch": 2.5011820330969265,
+ "grad_norm": 2.6980504989624023,
+ "learning_rate": 3.180627252474231e-06,
+ "loss": 0.4181,
+ "step": 5290
+ },
+ {
+ "epoch": 2.5016548463356973,
+ "grad_norm": 2.9400012493133545,
+ "learning_rate": 3.1800269685689804e-06,
+ "loss": 0.4642,
+ "step": 5291
+ },
+ {
+ "epoch": 2.502127659574468,
+ "grad_norm": 2.7832958698272705,
+ "learning_rate": 3.1794266423214328e-06,
+ "loss": 0.3936,
+ "step": 5292
+ },
+ {
+ "epoch": 2.5026004728132385,
+ "grad_norm": 2.4017868041992188,
+ "learning_rate": 3.178826273768967e-06,
+ "loss": 0.3984,
+ "step": 5293
+ },
+ {
+ "epoch": 2.5030732860520093,
+ "grad_norm": 2.398120641708374,
+ "learning_rate": 3.1782258629489665e-06,
+ "loss": 0.4219,
+ "step": 5294
+ },
+ {
+ "epoch": 2.50354609929078,
+ "grad_norm": 2.973947763442993,
+ "learning_rate": 3.177625409898815e-06,
+ "loss": 0.4192,
+ "step": 5295
+ },
+ {
+ "epoch": 2.504018912529551,
+ "grad_norm": 3.1169888973236084,
+ "learning_rate": 3.1770249146559006e-06,
+ "loss": 0.5098,
+ "step": 5296
+ },
+ {
+ "epoch": 2.5044917257683217,
+ "grad_norm": 2.816964864730835,
+ "learning_rate": 3.1764243772576132e-06,
+ "loss": 0.4228,
+ "step": 5297
+ },
+ {
+ "epoch": 2.504964539007092,
+ "grad_norm": 2.5624163150787354,
+ "learning_rate": 3.1758237977413452e-06,
+ "loss": 0.4389,
+ "step": 5298
+ },
+ {
+ "epoch": 2.505437352245863,
+ "grad_norm": 2.7477777004241943,
+ "learning_rate": 3.175223176144494e-06,
+ "loss": 0.4564,
+ "step": 5299
+ },
+ {
+ "epoch": 2.5059101654846336,
+ "grad_norm": 3.1478309631347656,
+ "learning_rate": 3.174622512504456e-06,
+ "loss": 0.4859,
+ "step": 5300
+ },
+ {
+ "epoch": 2.506382978723404,
+ "grad_norm": 2.8400418758392334,
+ "learning_rate": 3.1740218068586315e-06,
+ "loss": 0.4476,
+ "step": 5301
+ },
+ {
+ "epoch": 2.506855791962175,
+ "grad_norm": 2.7097036838531494,
+ "learning_rate": 3.173421059244426e-06,
+ "loss": 0.4559,
+ "step": 5302
+ },
+ {
+ "epoch": 2.5073286052009456,
+ "grad_norm": 2.864760637283325,
+ "learning_rate": 3.172820269699243e-06,
+ "loss": 0.5124,
+ "step": 5303
+ },
+ {
+ "epoch": 2.5078014184397164,
+ "grad_norm": 2.877110004425049,
+ "learning_rate": 3.1722194382604926e-06,
+ "loss": 0.5083,
+ "step": 5304
+ },
+ {
+ "epoch": 2.508274231678487,
+ "grad_norm": 3.2369656562805176,
+ "learning_rate": 3.1716185649655844e-06,
+ "loss": 0.4894,
+ "step": 5305
+ },
+ {
+ "epoch": 2.5087470449172575,
+ "grad_norm": 2.7377753257751465,
+ "learning_rate": 3.171017649851934e-06,
+ "loss": 0.4324,
+ "step": 5306
+ },
+ {
+ "epoch": 2.5092198581560283,
+ "grad_norm": 2.883364200592041,
+ "learning_rate": 3.1704166929569564e-06,
+ "loss": 0.3731,
+ "step": 5307
+ },
+ {
+ "epoch": 2.509692671394799,
+ "grad_norm": 2.5724737644195557,
+ "learning_rate": 3.1698156943180716e-06,
+ "loss": 0.4768,
+ "step": 5308
+ },
+ {
+ "epoch": 2.5101654846335695,
+ "grad_norm": 2.7532460689544678,
+ "learning_rate": 3.1692146539727e-06,
+ "loss": 0.4385,
+ "step": 5309
+ },
+ {
+ "epoch": 2.5106382978723403,
+ "grad_norm": 2.786505699157715,
+ "learning_rate": 3.168613571958267e-06,
+ "loss": 0.4241,
+ "step": 5310
+ },
+ {
+ "epoch": 2.511111111111111,
+ "grad_norm": 3.1674118041992188,
+ "learning_rate": 3.1680124483121975e-06,
+ "loss": 0.4445,
+ "step": 5311
+ },
+ {
+ "epoch": 2.511583924349882,
+ "grad_norm": 2.7861545085906982,
+ "learning_rate": 3.167411283071923e-06,
+ "loss": 0.4264,
+ "step": 5312
+ },
+ {
+ "epoch": 2.5120567375886527,
+ "grad_norm": 2.7412493228912354,
+ "learning_rate": 3.1668100762748745e-06,
+ "loss": 0.4725,
+ "step": 5313
+ },
+ {
+ "epoch": 2.512529550827423,
+ "grad_norm": 2.710019588470459,
+ "learning_rate": 3.1662088279584858e-06,
+ "loss": 0.5207,
+ "step": 5314
+ },
+ {
+ "epoch": 2.513002364066194,
+ "grad_norm": 2.694812297821045,
+ "learning_rate": 3.165607538160194e-06,
+ "loss": 0.3666,
+ "step": 5315
+ },
+ {
+ "epoch": 2.5134751773049646,
+ "grad_norm": 2.4390623569488525,
+ "learning_rate": 3.1650062069174405e-06,
+ "loss": 0.4025,
+ "step": 5316
+ },
+ {
+ "epoch": 2.513947990543735,
+ "grad_norm": 3.055738925933838,
+ "learning_rate": 3.1644048342676663e-06,
+ "loss": 0.4288,
+ "step": 5317
+ },
+ {
+ "epoch": 2.5144208037825058,
+ "grad_norm": 3.065824508666992,
+ "learning_rate": 3.163803420248316e-06,
+ "loss": 0.4592,
+ "step": 5318
+ },
+ {
+ "epoch": 2.5148936170212766,
+ "grad_norm": 2.6011085510253906,
+ "learning_rate": 3.163201964896838e-06,
+ "loss": 0.4081,
+ "step": 5319
+ },
+ {
+ "epoch": 2.5153664302600474,
+ "grad_norm": 2.4833033084869385,
+ "learning_rate": 3.162600468250681e-06,
+ "loss": 0.4343,
+ "step": 5320
+ },
+ {
+ "epoch": 2.515839243498818,
+ "grad_norm": 2.9035534858703613,
+ "learning_rate": 3.161998930347299e-06,
+ "loss": 0.4972,
+ "step": 5321
+ },
+ {
+ "epoch": 2.5163120567375885,
+ "grad_norm": 2.788752317428589,
+ "learning_rate": 3.161397351224146e-06,
+ "loss": 0.4597,
+ "step": 5322
+ },
+ {
+ "epoch": 2.5167848699763593,
+ "grad_norm": 2.4344491958618164,
+ "learning_rate": 3.16079573091868e-06,
+ "loss": 0.359,
+ "step": 5323
+ },
+ {
+ "epoch": 2.51725768321513,
+ "grad_norm": 2.750150680541992,
+ "learning_rate": 3.160194069468361e-06,
+ "loss": 0.4596,
+ "step": 5324
+ },
+ {
+ "epoch": 2.5177304964539005,
+ "grad_norm": 2.826902389526367,
+ "learning_rate": 3.1595923669106526e-06,
+ "loss": 0.4377,
+ "step": 5325
+ },
+ {
+ "epoch": 2.5182033096926713,
+ "grad_norm": 2.554439067840576,
+ "learning_rate": 3.15899062328302e-06,
+ "loss": 0.4517,
+ "step": 5326
+ },
+ {
+ "epoch": 2.518676122931442,
+ "grad_norm": 3.0882742404937744,
+ "learning_rate": 3.158388838622931e-06,
+ "loss": 0.47,
+ "step": 5327
+ },
+ {
+ "epoch": 2.519148936170213,
+ "grad_norm": 2.918947696685791,
+ "learning_rate": 3.157787012967856e-06,
+ "loss": 0.522,
+ "step": 5328
+ },
+ {
+ "epoch": 2.5196217494089836,
+ "grad_norm": 2.8057637214660645,
+ "learning_rate": 3.1571851463552674e-06,
+ "loss": 0.4837,
+ "step": 5329
+ },
+ {
+ "epoch": 2.520094562647754,
+ "grad_norm": 2.66241455078125,
+ "learning_rate": 3.156583238822641e-06,
+ "loss": 0.3988,
+ "step": 5330
+ },
+ {
+ "epoch": 2.520567375886525,
+ "grad_norm": 2.9793803691864014,
+ "learning_rate": 3.155981290407456e-06,
+ "loss": 0.4737,
+ "step": 5331
+ },
+ {
+ "epoch": 2.5210401891252956,
+ "grad_norm": 2.847522258758545,
+ "learning_rate": 3.1553793011471924e-06,
+ "loss": 0.4394,
+ "step": 5332
+ },
+ {
+ "epoch": 2.521513002364066,
+ "grad_norm": 2.9561474323272705,
+ "learning_rate": 3.154777271079333e-06,
+ "loss": 0.47,
+ "step": 5333
+ },
+ {
+ "epoch": 2.5219858156028367,
+ "grad_norm": 2.8353018760681152,
+ "learning_rate": 3.154175200241365e-06,
+ "loss": 0.4015,
+ "step": 5334
+ },
+ {
+ "epoch": 2.5224586288416075,
+ "grad_norm": 2.609049081802368,
+ "learning_rate": 3.153573088670775e-06,
+ "loss": 0.4723,
+ "step": 5335
+ },
+ {
+ "epoch": 2.5229314420803783,
+ "grad_norm": 2.8538455963134766,
+ "learning_rate": 3.1529709364050556e-06,
+ "loss": 0.4665,
+ "step": 5336
+ },
+ {
+ "epoch": 2.523404255319149,
+ "grad_norm": 2.768310785293579,
+ "learning_rate": 3.1523687434816978e-06,
+ "loss": 0.4933,
+ "step": 5337
+ },
+ {
+ "epoch": 2.5238770685579195,
+ "grad_norm": 2.9300906658172607,
+ "learning_rate": 3.1517665099382e-06,
+ "loss": 0.4651,
+ "step": 5338
+ },
+ {
+ "epoch": 2.5243498817966903,
+ "grad_norm": 2.6984703540802,
+ "learning_rate": 3.1511642358120585e-06,
+ "loss": 0.4442,
+ "step": 5339
+ },
+ {
+ "epoch": 2.524822695035461,
+ "grad_norm": 2.8148467540740967,
+ "learning_rate": 3.1505619211407762e-06,
+ "loss": 0.4611,
+ "step": 5340
+ },
+ {
+ "epoch": 2.5252955082742314,
+ "grad_norm": 2.816436290740967,
+ "learning_rate": 3.1499595659618556e-06,
+ "loss": 0.5291,
+ "step": 5341
+ },
+ {
+ "epoch": 2.5257683215130022,
+ "grad_norm": 2.902805805206299,
+ "learning_rate": 3.149357170312802e-06,
+ "loss": 0.4394,
+ "step": 5342
+ },
+ {
+ "epoch": 2.526241134751773,
+ "grad_norm": 2.6443474292755127,
+ "learning_rate": 3.148754734231126e-06,
+ "loss": 0.4444,
+ "step": 5343
+ },
+ {
+ "epoch": 2.526713947990544,
+ "grad_norm": 2.6818583011627197,
+ "learning_rate": 3.148152257754336e-06,
+ "loss": 0.4256,
+ "step": 5344
+ },
+ {
+ "epoch": 2.5271867612293146,
+ "grad_norm": 2.5266945362091064,
+ "learning_rate": 3.1475497409199485e-06,
+ "loss": 0.4087,
+ "step": 5345
+ },
+ {
+ "epoch": 2.527659574468085,
+ "grad_norm": 2.6326711177825928,
+ "learning_rate": 3.146947183765477e-06,
+ "loss": 0.3842,
+ "step": 5346
+ },
+ {
+ "epoch": 2.5281323877068558,
+ "grad_norm": 3.122880697250366,
+ "learning_rate": 3.1463445863284413e-06,
+ "loss": 0.482,
+ "step": 5347
+ },
+ {
+ "epoch": 2.5286052009456266,
+ "grad_norm": 2.819258213043213,
+ "learning_rate": 3.145741948646362e-06,
+ "loss": 0.4628,
+ "step": 5348
+ },
+ {
+ "epoch": 2.529078014184397,
+ "grad_norm": 2.5842230319976807,
+ "learning_rate": 3.145139270756764e-06,
+ "loss": 0.4479,
+ "step": 5349
+ },
+ {
+ "epoch": 2.5295508274231677,
+ "grad_norm": 2.7257237434387207,
+ "learning_rate": 3.144536552697172e-06,
+ "loss": 0.473,
+ "step": 5350
+ },
+ {
+ "epoch": 2.5300236406619385,
+ "grad_norm": 2.6876981258392334,
+ "learning_rate": 3.143933794505115e-06,
+ "loss": 0.4615,
+ "step": 5351
+ },
+ {
+ "epoch": 2.5304964539007093,
+ "grad_norm": 2.7942895889282227,
+ "learning_rate": 3.143330996218124e-06,
+ "loss": 0.4982,
+ "step": 5352
+ },
+ {
+ "epoch": 2.53096926713948,
+ "grad_norm": 2.3150579929351807,
+ "learning_rate": 3.1427281578737327e-06,
+ "loss": 0.3905,
+ "step": 5353
+ },
+ {
+ "epoch": 2.5314420803782505,
+ "grad_norm": 2.7326138019561768,
+ "learning_rate": 3.142125279509478e-06,
+ "loss": 0.4076,
+ "step": 5354
+ },
+ {
+ "epoch": 2.5319148936170213,
+ "grad_norm": 2.46362566947937,
+ "learning_rate": 3.1415223611628976e-06,
+ "loss": 0.4043,
+ "step": 5355
+ },
+ {
+ "epoch": 2.532387706855792,
+ "grad_norm": 2.6670427322387695,
+ "learning_rate": 3.1409194028715323e-06,
+ "loss": 0.484,
+ "step": 5356
+ },
+ {
+ "epoch": 2.5328605200945624,
+ "grad_norm": 2.917771100997925,
+ "learning_rate": 3.140316404672926e-06,
+ "loss": 0.4539,
+ "step": 5357
+ },
+ {
+ "epoch": 2.533333333333333,
+ "grad_norm": 2.7964110374450684,
+ "learning_rate": 3.1397133666046254e-06,
+ "loss": 0.4706,
+ "step": 5358
+ },
+ {
+ "epoch": 2.533806146572104,
+ "grad_norm": 2.6481330394744873,
+ "learning_rate": 3.139110288704179e-06,
+ "loss": 0.4101,
+ "step": 5359
+ },
+ {
+ "epoch": 2.534278959810875,
+ "grad_norm": 2.859452962875366,
+ "learning_rate": 3.1385071710091365e-06,
+ "loss": 0.4842,
+ "step": 5360
+ },
+ {
+ "epoch": 2.5347517730496456,
+ "grad_norm": 2.686077356338501,
+ "learning_rate": 3.137904013557052e-06,
+ "loss": 0.4073,
+ "step": 5361
+ },
+ {
+ "epoch": 2.535224586288416,
+ "grad_norm": 3.7147045135498047,
+ "learning_rate": 3.137300816385482e-06,
+ "loss": 0.4536,
+ "step": 5362
+ },
+ {
+ "epoch": 2.5356973995271868,
+ "grad_norm": 2.51054048538208,
+ "learning_rate": 3.1366975795319856e-06,
+ "loss": 0.4171,
+ "step": 5363
+ },
+ {
+ "epoch": 2.5361702127659576,
+ "grad_norm": 3.043149471282959,
+ "learning_rate": 3.136094303034121e-06,
+ "loss": 0.5179,
+ "step": 5364
+ },
+ {
+ "epoch": 2.536643026004728,
+ "grad_norm": 2.398878812789917,
+ "learning_rate": 3.1354909869294548e-06,
+ "loss": 0.4144,
+ "step": 5365
+ },
+ {
+ "epoch": 2.5371158392434987,
+ "grad_norm": 2.969712257385254,
+ "learning_rate": 3.134887631255551e-06,
+ "loss": 0.3983,
+ "step": 5366
+ },
+ {
+ "epoch": 2.5375886524822695,
+ "grad_norm": 2.7707982063293457,
+ "learning_rate": 3.134284236049978e-06,
+ "loss": 0.4405,
+ "step": 5367
+ },
+ {
+ "epoch": 2.5380614657210403,
+ "grad_norm": 2.579742193222046,
+ "learning_rate": 3.1336808013503073e-06,
+ "loss": 0.4402,
+ "step": 5368
+ },
+ {
+ "epoch": 2.538534278959811,
+ "grad_norm": 2.6041927337646484,
+ "learning_rate": 3.1330773271941113e-06,
+ "loss": 0.396,
+ "step": 5369
+ },
+ {
+ "epoch": 2.5390070921985815,
+ "grad_norm": 2.7383856773376465,
+ "learning_rate": 3.1324738136189658e-06,
+ "loss": 0.4424,
+ "step": 5370
+ },
+ {
+ "epoch": 2.5394799054373522,
+ "grad_norm": 3.053644895553589,
+ "learning_rate": 3.13187026066245e-06,
+ "loss": 0.473,
+ "step": 5371
+ },
+ {
+ "epoch": 2.539952718676123,
+ "grad_norm": 2.684244155883789,
+ "learning_rate": 3.1312666683621428e-06,
+ "loss": 0.3963,
+ "step": 5372
+ },
+ {
+ "epoch": 2.5404255319148934,
+ "grad_norm": 2.6505017280578613,
+ "learning_rate": 3.130663036755629e-06,
+ "loss": 0.4292,
+ "step": 5373
+ },
+ {
+ "epoch": 2.540898345153664,
+ "grad_norm": 3.025965929031372,
+ "learning_rate": 3.1300593658804935e-06,
+ "loss": 0.4539,
+ "step": 5374
+ },
+ {
+ "epoch": 2.541371158392435,
+ "grad_norm": 2.72106671333313,
+ "learning_rate": 3.1294556557743237e-06,
+ "loss": 0.4519,
+ "step": 5375
+ },
+ {
+ "epoch": 2.541843971631206,
+ "grad_norm": 2.759995222091675,
+ "learning_rate": 3.12885190647471e-06,
+ "loss": 0.451,
+ "step": 5376
+ },
+ {
+ "epoch": 2.5423167848699766,
+ "grad_norm": 2.697950601577759,
+ "learning_rate": 3.1282481180192457e-06,
+ "loss": 0.4328,
+ "step": 5377
+ },
+ {
+ "epoch": 2.542789598108747,
+ "grad_norm": 2.6970415115356445,
+ "learning_rate": 3.127644290445526e-06,
+ "loss": 0.4489,
+ "step": 5378
+ },
+ {
+ "epoch": 2.5432624113475177,
+ "grad_norm": 2.5856997966766357,
+ "learning_rate": 3.127040423791148e-06,
+ "loss": 0.3848,
+ "step": 5379
+ },
+ {
+ "epoch": 2.5437352245862885,
+ "grad_norm": 2.9798166751861572,
+ "learning_rate": 3.1264365180937127e-06,
+ "loss": 0.5038,
+ "step": 5380
+ },
+ {
+ "epoch": 2.544208037825059,
+ "grad_norm": 3.413175106048584,
+ "learning_rate": 3.1258325733908224e-06,
+ "loss": 0.5247,
+ "step": 5381
+ },
+ {
+ "epoch": 2.5446808510638297,
+ "grad_norm": 2.838517904281616,
+ "learning_rate": 3.1252285897200818e-06,
+ "loss": 0.4652,
+ "step": 5382
+ },
+ {
+ "epoch": 2.5451536643026005,
+ "grad_norm": 2.8342528343200684,
+ "learning_rate": 3.1246245671190983e-06,
+ "loss": 0.4245,
+ "step": 5383
+ },
+ {
+ "epoch": 2.5456264775413713,
+ "grad_norm": 3.06026029586792,
+ "learning_rate": 3.124020505625482e-06,
+ "loss": 0.469,
+ "step": 5384
+ },
+ {
+ "epoch": 2.546099290780142,
+ "grad_norm": 2.633894681930542,
+ "learning_rate": 3.1234164052768452e-06,
+ "loss": 0.4509,
+ "step": 5385
+ },
+ {
+ "epoch": 2.5465721040189124,
+ "grad_norm": 2.634819984436035,
+ "learning_rate": 3.1228122661108023e-06,
+ "loss": 0.4879,
+ "step": 5386
+ },
+ {
+ "epoch": 2.5470449172576832,
+ "grad_norm": 3.9843504428863525,
+ "learning_rate": 3.1222080881649707e-06,
+ "loss": 0.4472,
+ "step": 5387
+ },
+ {
+ "epoch": 2.547517730496454,
+ "grad_norm": 2.5480258464813232,
+ "learning_rate": 3.1216038714769694e-06,
+ "loss": 0.4396,
+ "step": 5388
+ },
+ {
+ "epoch": 2.5479905437352244,
+ "grad_norm": 2.7461917400360107,
+ "learning_rate": 3.12099961608442e-06,
+ "loss": 0.4735,
+ "step": 5389
+ },
+ {
+ "epoch": 2.548463356973995,
+ "grad_norm": 3.167769193649292,
+ "learning_rate": 3.1203953220249493e-06,
+ "loss": 0.4196,
+ "step": 5390
+ },
+ {
+ "epoch": 2.548936170212766,
+ "grad_norm": 2.721696615219116,
+ "learning_rate": 3.1197909893361814e-06,
+ "loss": 0.4571,
+ "step": 5391
+ },
+ {
+ "epoch": 2.5494089834515368,
+ "grad_norm": 2.726668119430542,
+ "learning_rate": 3.1191866180557463e-06,
+ "loss": 0.4856,
+ "step": 5392
+ },
+ {
+ "epoch": 2.5498817966903076,
+ "grad_norm": 2.602205276489258,
+ "learning_rate": 3.1185822082212754e-06,
+ "loss": 0.4631,
+ "step": 5393
+ },
+ {
+ "epoch": 2.550354609929078,
+ "grad_norm": 2.7715859413146973,
+ "learning_rate": 3.1179777598704025e-06,
+ "loss": 0.4136,
+ "step": 5394
+ },
+ {
+ "epoch": 2.5508274231678487,
+ "grad_norm": 2.8081955909729004,
+ "learning_rate": 3.1173732730407647e-06,
+ "loss": 0.4963,
+ "step": 5395
+ },
+ {
+ "epoch": 2.5513002364066195,
+ "grad_norm": 2.946772336959839,
+ "learning_rate": 3.1167687477700006e-06,
+ "loss": 0.4443,
+ "step": 5396
+ },
+ {
+ "epoch": 2.55177304964539,
+ "grad_norm": 2.89345383644104,
+ "learning_rate": 3.1161641840957503e-06,
+ "loss": 0.4377,
+ "step": 5397
+ },
+ {
+ "epoch": 2.5522458628841607,
+ "grad_norm": 2.908317804336548,
+ "learning_rate": 3.115559582055659e-06,
+ "loss": 0.4702,
+ "step": 5398
+ },
+ {
+ "epoch": 2.5527186761229315,
+ "grad_norm": 2.554417848587036,
+ "learning_rate": 3.1149549416873704e-06,
+ "loss": 0.3738,
+ "step": 5399
+ },
+ {
+ "epoch": 2.5531914893617023,
+ "grad_norm": 2.3132457733154297,
+ "learning_rate": 3.1143502630285356e-06,
+ "loss": 0.4074,
+ "step": 5400
+ },
+ {
+ "epoch": 2.553664302600473,
+ "grad_norm": 2.751666784286499,
+ "learning_rate": 3.1137455461168026e-06,
+ "loss": 0.4697,
+ "step": 5401
+ },
+ {
+ "epoch": 2.5541371158392434,
+ "grad_norm": 2.7088871002197266,
+ "learning_rate": 3.113140790989826e-06,
+ "loss": 0.4754,
+ "step": 5402
+ },
+ {
+ "epoch": 2.554609929078014,
+ "grad_norm": 3.0633046627044678,
+ "learning_rate": 3.1125359976852605e-06,
+ "loss": 0.4874,
+ "step": 5403
+ },
+ {
+ "epoch": 2.555082742316785,
+ "grad_norm": 3.399456024169922,
+ "learning_rate": 3.111931166240764e-06,
+ "loss": 0.5529,
+ "step": 5404
+ },
+ {
+ "epoch": 2.5555555555555554,
+ "grad_norm": 2.7729690074920654,
+ "learning_rate": 3.1113262966939985e-06,
+ "loss": 0.4677,
+ "step": 5405
+ },
+ {
+ "epoch": 2.556028368794326,
+ "grad_norm": 2.81025767326355,
+ "learning_rate": 3.1107213890826244e-06,
+ "loss": 0.4954,
+ "step": 5406
+ },
+ {
+ "epoch": 2.556501182033097,
+ "grad_norm": 2.4837241172790527,
+ "learning_rate": 3.110116443444307e-06,
+ "loss": 0.3681,
+ "step": 5407
+ },
+ {
+ "epoch": 2.5569739952718678,
+ "grad_norm": 2.6406874656677246,
+ "learning_rate": 3.109511459816714e-06,
+ "loss": 0.4569,
+ "step": 5408
+ },
+ {
+ "epoch": 2.5574468085106385,
+ "grad_norm": 2.6093738079071045,
+ "learning_rate": 3.1089064382375155e-06,
+ "loss": 0.413,
+ "step": 5409
+ },
+ {
+ "epoch": 2.557919621749409,
+ "grad_norm": 2.6629011631011963,
+ "learning_rate": 3.108301378744383e-06,
+ "loss": 0.4286,
+ "step": 5410
+ },
+ {
+ "epoch": 2.5583924349881797,
+ "grad_norm": 2.694796323776245,
+ "learning_rate": 3.10769628137499e-06,
+ "loss": 0.4316,
+ "step": 5411
+ },
+ {
+ "epoch": 2.5588652482269505,
+ "grad_norm": 2.88023042678833,
+ "learning_rate": 3.107091146167015e-06,
+ "loss": 0.4378,
+ "step": 5412
+ },
+ {
+ "epoch": 2.559338061465721,
+ "grad_norm": 2.8804919719696045,
+ "learning_rate": 3.1064859731581365e-06,
+ "loss": 0.4971,
+ "step": 5413
+ },
+ {
+ "epoch": 2.5598108747044916,
+ "grad_norm": 2.850468397140503,
+ "learning_rate": 3.1058807623860353e-06,
+ "loss": 0.4686,
+ "step": 5414
+ },
+ {
+ "epoch": 2.5602836879432624,
+ "grad_norm": 3.0548019409179688,
+ "learning_rate": 3.1052755138883963e-06,
+ "loss": 0.4497,
+ "step": 5415
+ },
+ {
+ "epoch": 2.5607565011820332,
+ "grad_norm": 3.10168719291687,
+ "learning_rate": 3.1046702277029046e-06,
+ "loss": 0.569,
+ "step": 5416
+ },
+ {
+ "epoch": 2.561229314420804,
+ "grad_norm": 2.5887374877929688,
+ "learning_rate": 3.1040649038672494e-06,
+ "loss": 0.3812,
+ "step": 5417
+ },
+ {
+ "epoch": 2.5617021276595744,
+ "grad_norm": 2.9928438663482666,
+ "learning_rate": 3.1034595424191212e-06,
+ "loss": 0.4308,
+ "step": 5418
+ },
+ {
+ "epoch": 2.562174940898345,
+ "grad_norm": 2.7003073692321777,
+ "learning_rate": 3.102854143396214e-06,
+ "loss": 0.4967,
+ "step": 5419
+ },
+ {
+ "epoch": 2.562647754137116,
+ "grad_norm": 3.172868490219116,
+ "learning_rate": 3.102248706836222e-06,
+ "loss": 0.5311,
+ "step": 5420
+ },
+ {
+ "epoch": 2.5631205673758863,
+ "grad_norm": 3.0146191120147705,
+ "learning_rate": 3.101643232776844e-06,
+ "loss": 0.4714,
+ "step": 5421
+ },
+ {
+ "epoch": 2.563593380614657,
+ "grad_norm": 3.0683791637420654,
+ "learning_rate": 3.1010377212557806e-06,
+ "loss": 0.4047,
+ "step": 5422
+ },
+ {
+ "epoch": 2.564066193853428,
+ "grad_norm": 2.8260676860809326,
+ "learning_rate": 3.1004321723107334e-06,
+ "loss": 0.5282,
+ "step": 5423
+ },
+ {
+ "epoch": 2.5645390070921987,
+ "grad_norm": 3.0792388916015625,
+ "learning_rate": 3.0998265859794074e-06,
+ "loss": 0.5323,
+ "step": 5424
+ },
+ {
+ "epoch": 2.5650118203309695,
+ "grad_norm": 2.7332866191864014,
+ "learning_rate": 3.09922096229951e-06,
+ "loss": 0.4401,
+ "step": 5425
+ },
+ {
+ "epoch": 2.56548463356974,
+ "grad_norm": 2.9366047382354736,
+ "learning_rate": 3.098615301308751e-06,
+ "loss": 0.4495,
+ "step": 5426
+ },
+ {
+ "epoch": 2.5659574468085107,
+ "grad_norm": 2.982088565826416,
+ "learning_rate": 3.098009603044842e-06,
+ "loss": 0.495,
+ "step": 5427
+ },
+ {
+ "epoch": 2.5664302600472815,
+ "grad_norm": 3.1204755306243896,
+ "learning_rate": 3.0974038675454976e-06,
+ "loss": 0.4354,
+ "step": 5428
+ },
+ {
+ "epoch": 2.566903073286052,
+ "grad_norm": 2.835238218307495,
+ "learning_rate": 3.0967980948484333e-06,
+ "loss": 0.4161,
+ "step": 5429
+ },
+ {
+ "epoch": 2.5673758865248226,
+ "grad_norm": 2.8104958534240723,
+ "learning_rate": 3.096192284991369e-06,
+ "loss": 0.5045,
+ "step": 5430
+ },
+ {
+ "epoch": 2.5678486997635934,
+ "grad_norm": 3.1636080741882324,
+ "learning_rate": 3.0955864380120247e-06,
+ "loss": 0.4533,
+ "step": 5431
+ },
+ {
+ "epoch": 2.568321513002364,
+ "grad_norm": 2.980112314224243,
+ "learning_rate": 3.0949805539481247e-06,
+ "loss": 0.3998,
+ "step": 5432
+ },
+ {
+ "epoch": 2.568794326241135,
+ "grad_norm": 2.6379945278167725,
+ "learning_rate": 3.0943746328373953e-06,
+ "loss": 0.3785,
+ "step": 5433
+ },
+ {
+ "epoch": 2.5692671394799054,
+ "grad_norm": 2.780930757522583,
+ "learning_rate": 3.0937686747175627e-06,
+ "loss": 0.4801,
+ "step": 5434
+ },
+ {
+ "epoch": 2.569739952718676,
+ "grad_norm": 2.6608550548553467,
+ "learning_rate": 3.0931626796263585e-06,
+ "loss": 0.4047,
+ "step": 5435
+ },
+ {
+ "epoch": 2.570212765957447,
+ "grad_norm": 3.130584716796875,
+ "learning_rate": 3.0925566476015156e-06,
+ "loss": 0.5049,
+ "step": 5436
+ },
+ {
+ "epoch": 2.5706855791962173,
+ "grad_norm": 2.9699313640594482,
+ "learning_rate": 3.0919505786807687e-06,
+ "loss": 0.3847,
+ "step": 5437
+ },
+ {
+ "epoch": 2.571158392434988,
+ "grad_norm": 2.919260025024414,
+ "learning_rate": 3.091344472901855e-06,
+ "loss": 0.4631,
+ "step": 5438
+ },
+ {
+ "epoch": 2.571631205673759,
+ "grad_norm": 2.956587553024292,
+ "learning_rate": 3.0907383303025134e-06,
+ "loss": 0.4974,
+ "step": 5439
+ },
+ {
+ "epoch": 2.5721040189125297,
+ "grad_norm": 2.758542776107788,
+ "learning_rate": 3.090132150920486e-06,
+ "loss": 0.4785,
+ "step": 5440
+ },
+ {
+ "epoch": 2.5725768321513005,
+ "grad_norm": 2.678469657897949,
+ "learning_rate": 3.0895259347935175e-06,
+ "loss": 0.4453,
+ "step": 5441
+ },
+ {
+ "epoch": 2.573049645390071,
+ "grad_norm": 2.6508545875549316,
+ "learning_rate": 3.088919681959355e-06,
+ "loss": 0.4426,
+ "step": 5442
+ },
+ {
+ "epoch": 2.5735224586288417,
+ "grad_norm": 2.6156187057495117,
+ "learning_rate": 3.0883133924557453e-06,
+ "loss": 0.4445,
+ "step": 5443
+ },
+ {
+ "epoch": 2.5739952718676125,
+ "grad_norm": 2.484374761581421,
+ "learning_rate": 3.08770706632044e-06,
+ "loss": 0.4155,
+ "step": 5444
+ },
+ {
+ "epoch": 2.574468085106383,
+ "grad_norm": 2.7465295791625977,
+ "learning_rate": 3.087100703591193e-06,
+ "loss": 0.4085,
+ "step": 5445
+ },
+ {
+ "epoch": 2.5749408983451536,
+ "grad_norm": 2.771740198135376,
+ "learning_rate": 3.08649430430576e-06,
+ "loss": 0.4313,
+ "step": 5446
+ },
+ {
+ "epoch": 2.5754137115839244,
+ "grad_norm": 2.7480874061584473,
+ "learning_rate": 3.0858878685018984e-06,
+ "loss": 0.3471,
+ "step": 5447
+ },
+ {
+ "epoch": 2.575886524822695,
+ "grad_norm": 2.894913673400879,
+ "learning_rate": 3.085281396217368e-06,
+ "loss": 0.4888,
+ "step": 5448
+ },
+ {
+ "epoch": 2.576359338061466,
+ "grad_norm": 3.037628173828125,
+ "learning_rate": 3.0846748874899306e-06,
+ "loss": 0.3976,
+ "step": 5449
+ },
+ {
+ "epoch": 2.5768321513002364,
+ "grad_norm": 2.4811434745788574,
+ "learning_rate": 3.0840683423573526e-06,
+ "loss": 0.4822,
+ "step": 5450
+ },
+ {
+ "epoch": 2.577304964539007,
+ "grad_norm": 3.0078725814819336,
+ "learning_rate": 3.0834617608573998e-06,
+ "loss": 0.4999,
+ "step": 5451
+ },
+ {
+ "epoch": 2.5777777777777775,
+ "grad_norm": 3.174154043197632,
+ "learning_rate": 3.0828551430278413e-06,
+ "loss": 0.4626,
+ "step": 5452
+ },
+ {
+ "epoch": 2.5782505910165483,
+ "grad_norm": 2.8277535438537598,
+ "learning_rate": 3.082248488906449e-06,
+ "loss": 0.4633,
+ "step": 5453
+ },
+ {
+ "epoch": 2.578723404255319,
+ "grad_norm": 2.731767416000366,
+ "learning_rate": 3.0816417985309966e-06,
+ "loss": 0.4148,
+ "step": 5454
+ },
+ {
+ "epoch": 2.57919621749409,
+ "grad_norm": 2.5480549335479736,
+ "learning_rate": 3.0810350719392597e-06,
+ "loss": 0.4773,
+ "step": 5455
+ },
+ {
+ "epoch": 2.5796690307328607,
+ "grad_norm": 2.9755172729492188,
+ "learning_rate": 3.080428309169017e-06,
+ "loss": 0.5107,
+ "step": 5456
+ },
+ {
+ "epoch": 2.580141843971631,
+ "grad_norm": 2.6499290466308594,
+ "learning_rate": 3.079821510258048e-06,
+ "loss": 0.3982,
+ "step": 5457
+ },
+ {
+ "epoch": 2.580614657210402,
+ "grad_norm": 2.663214921951294,
+ "learning_rate": 3.079214675244136e-06,
+ "loss": 0.4419,
+ "step": 5458
+ },
+ {
+ "epoch": 2.5810874704491726,
+ "grad_norm": 2.595489263534546,
+ "learning_rate": 3.078607804165066e-06,
+ "loss": 0.3958,
+ "step": 5459
+ },
+ {
+ "epoch": 2.581560283687943,
+ "grad_norm": 3.031458854675293,
+ "learning_rate": 3.0780008970586255e-06,
+ "loss": 0.518,
+ "step": 5460
+ },
+ {
+ "epoch": 2.582033096926714,
+ "grad_norm": 2.827071189880371,
+ "learning_rate": 3.077393953962603e-06,
+ "loss": 0.4397,
+ "step": 5461
+ },
+ {
+ "epoch": 2.5825059101654846,
+ "grad_norm": 2.656111240386963,
+ "learning_rate": 3.0767869749147917e-06,
+ "loss": 0.4912,
+ "step": 5462
+ },
+ {
+ "epoch": 2.5829787234042554,
+ "grad_norm": 2.545365333557129,
+ "learning_rate": 3.076179959952984e-06,
+ "loss": 0.3991,
+ "step": 5463
+ },
+ {
+ "epoch": 2.583451536643026,
+ "grad_norm": 2.5794365406036377,
+ "learning_rate": 3.075572909114977e-06,
+ "loss": 0.4499,
+ "step": 5464
+ },
+ {
+ "epoch": 2.5839243498817965,
+ "grad_norm": 2.787140369415283,
+ "learning_rate": 3.074965822438568e-06,
+ "loss": 0.386,
+ "step": 5465
+ },
+ {
+ "epoch": 2.5843971631205673,
+ "grad_norm": 2.6406853199005127,
+ "learning_rate": 3.0743586999615594e-06,
+ "loss": 0.4853,
+ "step": 5466
+ },
+ {
+ "epoch": 2.584869976359338,
+ "grad_norm": 2.8082082271575928,
+ "learning_rate": 3.073751541721752e-06,
+ "loss": 0.4669,
+ "step": 5467
+ },
+ {
+ "epoch": 2.5853427895981085,
+ "grad_norm": 2.8808975219726562,
+ "learning_rate": 3.073144347756952e-06,
+ "loss": 0.4193,
+ "step": 5468
+ },
+ {
+ "epoch": 2.5858156028368793,
+ "grad_norm": 2.823352813720703,
+ "learning_rate": 3.072537118104968e-06,
+ "loss": 0.482,
+ "step": 5469
+ },
+ {
+ "epoch": 2.58628841607565,
+ "grad_norm": 2.6454555988311768,
+ "learning_rate": 3.0719298528036073e-06,
+ "loss": 0.4667,
+ "step": 5470
+ },
+ {
+ "epoch": 2.586761229314421,
+ "grad_norm": 2.871145486831665,
+ "learning_rate": 3.0713225518906826e-06,
+ "loss": 0.5125,
+ "step": 5471
+ },
+ {
+ "epoch": 2.5872340425531917,
+ "grad_norm": 3.1301417350769043,
+ "learning_rate": 3.070715215404007e-06,
+ "loss": 0.4827,
+ "step": 5472
+ },
+ {
+ "epoch": 2.587706855791962,
+ "grad_norm": 2.31062912940979,
+ "learning_rate": 3.070107843381398e-06,
+ "loss": 0.3954,
+ "step": 5473
+ },
+ {
+ "epoch": 2.588179669030733,
+ "grad_norm": 2.8366353511810303,
+ "learning_rate": 3.069500435860674e-06,
+ "loss": 0.4597,
+ "step": 5474
+ },
+ {
+ "epoch": 2.5886524822695036,
+ "grad_norm": 2.900143623352051,
+ "learning_rate": 3.068892992879654e-06,
+ "loss": 0.4294,
+ "step": 5475
+ },
+ {
+ "epoch": 2.589125295508274,
+ "grad_norm": 2.923313617706299,
+ "learning_rate": 3.0682855144761626e-06,
+ "loss": 0.505,
+ "step": 5476
+ },
+ {
+ "epoch": 2.5895981087470448,
+ "grad_norm": 2.726475954055786,
+ "learning_rate": 3.0676780006880242e-06,
+ "loss": 0.4208,
+ "step": 5477
+ },
+ {
+ "epoch": 2.5900709219858156,
+ "grad_norm": 4.115052223205566,
+ "learning_rate": 3.0670704515530654e-06,
+ "loss": 0.466,
+ "step": 5478
+ },
+ {
+ "epoch": 2.5905437352245864,
+ "grad_norm": 2.6018717288970947,
+ "learning_rate": 3.0664628671091163e-06,
+ "loss": 0.4697,
+ "step": 5479
+ },
+ {
+ "epoch": 2.591016548463357,
+ "grad_norm": 2.7393722534179688,
+ "learning_rate": 3.0658552473940085e-06,
+ "loss": 0.4618,
+ "step": 5480
+ },
+ {
+ "epoch": 2.5914893617021275,
+ "grad_norm": 2.8406929969787598,
+ "learning_rate": 3.065247592445575e-06,
+ "loss": 0.4806,
+ "step": 5481
+ },
+ {
+ "epoch": 2.5919621749408983,
+ "grad_norm": 2.9773001670837402,
+ "learning_rate": 3.0646399023016525e-06,
+ "loss": 0.4764,
+ "step": 5482
+ },
+ {
+ "epoch": 2.592434988179669,
+ "grad_norm": 3.374643325805664,
+ "learning_rate": 3.0640321770000804e-06,
+ "loss": 0.4481,
+ "step": 5483
+ },
+ {
+ "epoch": 2.5929078014184395,
+ "grad_norm": 2.5742013454437256,
+ "learning_rate": 3.0634244165786965e-06,
+ "loss": 0.432,
+ "step": 5484
+ },
+ {
+ "epoch": 2.5933806146572103,
+ "grad_norm": 2.9390289783477783,
+ "learning_rate": 3.062816621075346e-06,
+ "loss": 0.3941,
+ "step": 5485
+ },
+ {
+ "epoch": 2.593853427895981,
+ "grad_norm": 2.683414936065674,
+ "learning_rate": 3.062208790527871e-06,
+ "loss": 0.4268,
+ "step": 5486
+ },
+ {
+ "epoch": 2.594326241134752,
+ "grad_norm": 2.689647674560547,
+ "learning_rate": 3.06160092497412e-06,
+ "loss": 0.4569,
+ "step": 5487
+ },
+ {
+ "epoch": 2.5947990543735227,
+ "grad_norm": 3.1170310974121094,
+ "learning_rate": 3.060993024451943e-06,
+ "loss": 0.4387,
+ "step": 5488
+ },
+ {
+ "epoch": 2.595271867612293,
+ "grad_norm": 2.8732447624206543,
+ "learning_rate": 3.0603850889991894e-06,
+ "loss": 0.451,
+ "step": 5489
+ },
+ {
+ "epoch": 2.595744680851064,
+ "grad_norm": 3.0444157123565674,
+ "learning_rate": 3.0597771186537135e-06,
+ "loss": 0.4691,
+ "step": 5490
+ },
+ {
+ "epoch": 2.5962174940898346,
+ "grad_norm": 2.3791720867156982,
+ "learning_rate": 3.0591691134533714e-06,
+ "loss": 0.4771,
+ "step": 5491
+ },
+ {
+ "epoch": 2.596690307328605,
+ "grad_norm": 3.0677225589752197,
+ "learning_rate": 3.05856107343602e-06,
+ "loss": 0.459,
+ "step": 5492
+ },
+ {
+ "epoch": 2.5971631205673757,
+ "grad_norm": 3.1702635288238525,
+ "learning_rate": 3.05795299863952e-06,
+ "loss": 0.4816,
+ "step": 5493
+ },
+ {
+ "epoch": 2.5976359338061465,
+ "grad_norm": 2.964869499206543,
+ "learning_rate": 3.057344889101734e-06,
+ "loss": 0.4369,
+ "step": 5494
+ },
+ {
+ "epoch": 2.5981087470449173,
+ "grad_norm": 3.1333882808685303,
+ "learning_rate": 3.056736744860525e-06,
+ "loss": 0.4178,
+ "step": 5495
+ },
+ {
+ "epoch": 2.598581560283688,
+ "grad_norm": 2.4340405464172363,
+ "learning_rate": 3.05612856595376e-06,
+ "loss": 0.4359,
+ "step": 5496
+ },
+ {
+ "epoch": 2.5990543735224585,
+ "grad_norm": 2.638620615005493,
+ "learning_rate": 3.0555203524193083e-06,
+ "loss": 0.3915,
+ "step": 5497
+ },
+ {
+ "epoch": 2.5995271867612293,
+ "grad_norm": 2.8218815326690674,
+ "learning_rate": 3.054912104295039e-06,
+ "loss": 0.4684,
+ "step": 5498
+ },
+ {
+ "epoch": 2.6,
+ "grad_norm": 2.6696009635925293,
+ "learning_rate": 3.054303821618827e-06,
+ "loss": 0.4073,
+ "step": 5499
+ },
+ {
+ "epoch": 2.6004728132387704,
+ "grad_norm": 2.3880512714385986,
+ "learning_rate": 3.0536955044285465e-06,
+ "loss": 0.3576,
+ "step": 5500
+ },
+ {
+ "epoch": 2.6009456264775412,
+ "grad_norm": 2.762890100479126,
+ "learning_rate": 3.053087152762075e-06,
+ "loss": 0.3857,
+ "step": 5501
+ },
+ {
+ "epoch": 2.601418439716312,
+ "grad_norm": 2.729033946990967,
+ "learning_rate": 3.052478766657292e-06,
+ "loss": 0.3935,
+ "step": 5502
+ },
+ {
+ "epoch": 2.601891252955083,
+ "grad_norm": 2.630490303039551,
+ "learning_rate": 3.051870346152078e-06,
+ "loss": 0.3932,
+ "step": 5503
+ },
+ {
+ "epoch": 2.6023640661938536,
+ "grad_norm": 3.0335981845855713,
+ "learning_rate": 3.051261891284318e-06,
+ "loss": 0.4313,
+ "step": 5504
+ },
+ {
+ "epoch": 2.602836879432624,
+ "grad_norm": 2.969888687133789,
+ "learning_rate": 3.0506534020918963e-06,
+ "loss": 0.4698,
+ "step": 5505
+ },
+ {
+ "epoch": 2.603309692671395,
+ "grad_norm": 3.093996524810791,
+ "learning_rate": 3.050044878612703e-06,
+ "loss": 0.5338,
+ "step": 5506
+ },
+ {
+ "epoch": 2.6037825059101656,
+ "grad_norm": 2.759993314743042,
+ "learning_rate": 3.049436320884626e-06,
+ "loss": 0.4429,
+ "step": 5507
+ },
+ {
+ "epoch": 2.604255319148936,
+ "grad_norm": 2.979422092437744,
+ "learning_rate": 3.0488277289455587e-06,
+ "loss": 0.4489,
+ "step": 5508
+ },
+ {
+ "epoch": 2.6047281323877067,
+ "grad_norm": 2.8266701698303223,
+ "learning_rate": 3.048219102833396e-06,
+ "loss": 0.489,
+ "step": 5509
+ },
+ {
+ "epoch": 2.6052009456264775,
+ "grad_norm": 2.2582461833953857,
+ "learning_rate": 3.047610442586033e-06,
+ "loss": 0.3759,
+ "step": 5510
+ },
+ {
+ "epoch": 2.6056737588652483,
+ "grad_norm": 3.078152894973755,
+ "learning_rate": 3.0470017482413694e-06,
+ "loss": 0.5059,
+ "step": 5511
+ },
+ {
+ "epoch": 2.606146572104019,
+ "grad_norm": 2.7895498275756836,
+ "learning_rate": 3.0463930198373047e-06,
+ "loss": 0.4752,
+ "step": 5512
+ },
+ {
+ "epoch": 2.6066193853427895,
+ "grad_norm": 3.2307958602905273,
+ "learning_rate": 3.045784257411743e-06,
+ "loss": 0.4847,
+ "step": 5513
+ },
+ {
+ "epoch": 2.6070921985815603,
+ "grad_norm": 2.793661594390869,
+ "learning_rate": 3.0451754610025884e-06,
+ "loss": 0.4492,
+ "step": 5514
+ },
+ {
+ "epoch": 2.607565011820331,
+ "grad_norm": 2.4443132877349854,
+ "learning_rate": 3.0445666306477484e-06,
+ "loss": 0.4174,
+ "step": 5515
+ },
+ {
+ "epoch": 2.6080378250591014,
+ "grad_norm": 2.628769636154175,
+ "learning_rate": 3.0439577663851326e-06,
+ "loss": 0.3889,
+ "step": 5516
+ },
+ {
+ "epoch": 2.608510638297872,
+ "grad_norm": 2.9367563724517822,
+ "learning_rate": 3.0433488682526525e-06,
+ "loss": 0.437,
+ "step": 5517
+ },
+ {
+ "epoch": 2.608983451536643,
+ "grad_norm": 3.171353340148926,
+ "learning_rate": 3.04273993628822e-06,
+ "loss": 0.47,
+ "step": 5518
+ },
+ {
+ "epoch": 2.609456264775414,
+ "grad_norm": 2.856576442718506,
+ "learning_rate": 3.0421309705297513e-06,
+ "loss": 0.4797,
+ "step": 5519
+ },
+ {
+ "epoch": 2.6099290780141846,
+ "grad_norm": 2.4926068782806396,
+ "learning_rate": 3.041521971015165e-06,
+ "loss": 0.4294,
+ "step": 5520
+ },
+ {
+ "epoch": 2.610401891252955,
+ "grad_norm": 2.7897613048553467,
+ "learning_rate": 3.040912937782379e-06,
+ "loss": 0.4388,
+ "step": 5521
+ },
+ {
+ "epoch": 2.6108747044917258,
+ "grad_norm": 3.588188886642456,
+ "learning_rate": 3.0403038708693173e-06,
+ "loss": 0.4027,
+ "step": 5522
+ },
+ {
+ "epoch": 2.6113475177304966,
+ "grad_norm": 3.5394980907440186,
+ "learning_rate": 3.0396947703139017e-06,
+ "loss": 0.4866,
+ "step": 5523
+ },
+ {
+ "epoch": 2.611820330969267,
+ "grad_norm": 3.086865186691284,
+ "learning_rate": 3.03908563615406e-06,
+ "loss": 0.4344,
+ "step": 5524
+ },
+ {
+ "epoch": 2.6122931442080377,
+ "grad_norm": 2.649564504623413,
+ "learning_rate": 3.0384764684277194e-06,
+ "loss": 0.4571,
+ "step": 5525
+ },
+ {
+ "epoch": 2.6127659574468085,
+ "grad_norm": 2.945234775543213,
+ "learning_rate": 3.0378672671728105e-06,
+ "loss": 0.4885,
+ "step": 5526
+ },
+ {
+ "epoch": 2.6132387706855793,
+ "grad_norm": 2.625424861907959,
+ "learning_rate": 3.037258032427265e-06,
+ "loss": 0.4095,
+ "step": 5527
+ },
+ {
+ "epoch": 2.61371158392435,
+ "grad_norm": 2.7597248554229736,
+ "learning_rate": 3.0366487642290175e-06,
+ "loss": 0.4393,
+ "step": 5528
+ },
+ {
+ "epoch": 2.6141843971631205,
+ "grad_norm": 2.721189260482788,
+ "learning_rate": 3.0360394626160043e-06,
+ "loss": 0.3865,
+ "step": 5529
+ },
+ {
+ "epoch": 2.6146572104018913,
+ "grad_norm": 2.624056339263916,
+ "learning_rate": 3.0354301276261656e-06,
+ "loss": 0.4273,
+ "step": 5530
+ },
+ {
+ "epoch": 2.615130023640662,
+ "grad_norm": 2.7764177322387695,
+ "learning_rate": 3.034820759297439e-06,
+ "loss": 0.4756,
+ "step": 5531
+ },
+ {
+ "epoch": 2.6156028368794324,
+ "grad_norm": 3.0841729640960693,
+ "learning_rate": 3.0342113576677696e-06,
+ "loss": 0.4907,
+ "step": 5532
+ },
+ {
+ "epoch": 2.616075650118203,
+ "grad_norm": 2.678715705871582,
+ "learning_rate": 3.0336019227751017e-06,
+ "loss": 0.4478,
+ "step": 5533
+ },
+ {
+ "epoch": 2.616548463356974,
+ "grad_norm": 2.378679037094116,
+ "learning_rate": 3.032992454657382e-06,
+ "loss": 0.3678,
+ "step": 5534
+ },
+ {
+ "epoch": 2.617021276595745,
+ "grad_norm": 2.792079210281372,
+ "learning_rate": 3.0323829533525583e-06,
+ "loss": 0.4115,
+ "step": 5535
+ },
+ {
+ "epoch": 2.6174940898345156,
+ "grad_norm": 2.738133192062378,
+ "learning_rate": 3.0317734188985832e-06,
+ "loss": 0.4152,
+ "step": 5536
+ },
+ {
+ "epoch": 2.617966903073286,
+ "grad_norm": 2.6963796615600586,
+ "learning_rate": 3.0311638513334084e-06,
+ "loss": 0.4096,
+ "step": 5537
+ },
+ {
+ "epoch": 2.6184397163120567,
+ "grad_norm": 2.694145679473877,
+ "learning_rate": 3.03055425069499e-06,
+ "loss": 0.3793,
+ "step": 5538
+ },
+ {
+ "epoch": 2.6189125295508275,
+ "grad_norm": 2.762403964996338,
+ "learning_rate": 3.0299446170212855e-06,
+ "loss": 0.459,
+ "step": 5539
+ },
+ {
+ "epoch": 2.619385342789598,
+ "grad_norm": 2.804382562637329,
+ "learning_rate": 3.0293349503502522e-06,
+ "loss": 0.4853,
+ "step": 5540
+ },
+ {
+ "epoch": 2.6198581560283687,
+ "grad_norm": 2.7768518924713135,
+ "learning_rate": 3.0287252507198537e-06,
+ "loss": 0.4496,
+ "step": 5541
+ },
+ {
+ "epoch": 2.6203309692671395,
+ "grad_norm": 2.9075138568878174,
+ "learning_rate": 3.028115518168052e-06,
+ "loss": 0.4498,
+ "step": 5542
+ },
+ {
+ "epoch": 2.6208037825059103,
+ "grad_norm": 2.8966822624206543,
+ "learning_rate": 3.0275057527328126e-06,
+ "loss": 0.4434,
+ "step": 5543
+ },
+ {
+ "epoch": 2.621276595744681,
+ "grad_norm": 2.8140156269073486,
+ "learning_rate": 3.0268959544521027e-06,
+ "loss": 0.3935,
+ "step": 5544
+ },
+ {
+ "epoch": 2.6217494089834514,
+ "grad_norm": 2.8606276512145996,
+ "learning_rate": 3.0262861233638924e-06,
+ "loss": 0.4222,
+ "step": 5545
+ },
+ {
+ "epoch": 2.6222222222222222,
+ "grad_norm": 3.003610134124756,
+ "learning_rate": 3.0256762595061522e-06,
+ "loss": 0.428,
+ "step": 5546
+ },
+ {
+ "epoch": 2.622695035460993,
+ "grad_norm": 2.725907802581787,
+ "learning_rate": 3.025066362916857e-06,
+ "loss": 0.3975,
+ "step": 5547
+ },
+ {
+ "epoch": 2.6231678486997634,
+ "grad_norm": 2.5247902870178223,
+ "learning_rate": 3.024456433633982e-06,
+ "loss": 0.4584,
+ "step": 5548
+ },
+ {
+ "epoch": 2.623640661938534,
+ "grad_norm": 2.932798147201538,
+ "learning_rate": 3.0238464716955045e-06,
+ "loss": 0.4991,
+ "step": 5549
+ },
+ {
+ "epoch": 2.624113475177305,
+ "grad_norm": 2.693547010421753,
+ "learning_rate": 3.023236477139404e-06,
+ "loss": 0.4405,
+ "step": 5550
+ },
+ {
+ "epoch": 2.6245862884160758,
+ "grad_norm": 3.2600035667419434,
+ "learning_rate": 3.022626450003662e-06,
+ "loss": 0.4904,
+ "step": 5551
+ },
+ {
+ "epoch": 2.6250591016548466,
+ "grad_norm": 2.9471960067749023,
+ "learning_rate": 3.0220163903262627e-06,
+ "loss": 0.4487,
+ "step": 5552
+ },
+ {
+ "epoch": 2.625531914893617,
+ "grad_norm": 2.583944082260132,
+ "learning_rate": 3.0214062981451926e-06,
+ "loss": 0.3552,
+ "step": 5553
+ },
+ {
+ "epoch": 2.6260047281323877,
+ "grad_norm": 2.675062656402588,
+ "learning_rate": 3.0207961734984377e-06,
+ "loss": 0.4524,
+ "step": 5554
+ },
+ {
+ "epoch": 2.6264775413711585,
+ "grad_norm": 3.0126802921295166,
+ "learning_rate": 3.0201860164239887e-06,
+ "loss": 0.4124,
+ "step": 5555
+ },
+ {
+ "epoch": 2.626950354609929,
+ "grad_norm": 2.490734577178955,
+ "learning_rate": 3.019575826959838e-06,
+ "loss": 0.4095,
+ "step": 5556
+ },
+ {
+ "epoch": 2.6274231678486997,
+ "grad_norm": 2.72817063331604,
+ "learning_rate": 3.018965605143978e-06,
+ "loss": 0.4298,
+ "step": 5557
+ },
+ {
+ "epoch": 2.6278959810874705,
+ "grad_norm": 3.1298327445983887,
+ "learning_rate": 3.0183553510144064e-06,
+ "loss": 0.4961,
+ "step": 5558
+ },
+ {
+ "epoch": 2.6283687943262413,
+ "grad_norm": 3.2379956245422363,
+ "learning_rate": 3.0177450646091195e-06,
+ "loss": 0.4943,
+ "step": 5559
+ },
+ {
+ "epoch": 2.628841607565012,
+ "grad_norm": 2.5040571689605713,
+ "learning_rate": 3.017134745966117e-06,
+ "loss": 0.3701,
+ "step": 5560
+ },
+ {
+ "epoch": 2.6293144208037824,
+ "grad_norm": 3.047184944152832,
+ "learning_rate": 3.0165243951234025e-06,
+ "loss": 0.4587,
+ "step": 5561
+ },
+ {
+ "epoch": 2.629787234042553,
+ "grad_norm": 2.4926774501800537,
+ "learning_rate": 3.0159140121189783e-06,
+ "loss": 0.3723,
+ "step": 5562
+ },
+ {
+ "epoch": 2.630260047281324,
+ "grad_norm": 2.5434961318969727,
+ "learning_rate": 3.015303596990851e-06,
+ "loss": 0.4176,
+ "step": 5563
+ },
+ {
+ "epoch": 2.6307328605200944,
+ "grad_norm": 2.5117976665496826,
+ "learning_rate": 3.0146931497770284e-06,
+ "loss": 0.4218,
+ "step": 5564
+ },
+ {
+ "epoch": 2.631205673758865,
+ "grad_norm": 2.9408798217773438,
+ "learning_rate": 3.0140826705155196e-06,
+ "loss": 0.4473,
+ "step": 5565
+ },
+ {
+ "epoch": 2.631678486997636,
+ "grad_norm": 2.996422052383423,
+ "learning_rate": 3.0134721592443385e-06,
+ "loss": 0.4513,
+ "step": 5566
+ },
+ {
+ "epoch": 2.6321513002364068,
+ "grad_norm": 2.984356164932251,
+ "learning_rate": 3.0128616160014955e-06,
+ "loss": 0.4749,
+ "step": 5567
+ },
+ {
+ "epoch": 2.6326241134751776,
+ "grad_norm": 2.6075069904327393,
+ "learning_rate": 3.0122510408250095e-06,
+ "loss": 0.4707,
+ "step": 5568
+ },
+ {
+ "epoch": 2.633096926713948,
+ "grad_norm": 2.9463071823120117,
+ "learning_rate": 3.0116404337528972e-06,
+ "loss": 0.5125,
+ "step": 5569
+ },
+ {
+ "epoch": 2.6335697399527187,
+ "grad_norm": 2.98574161529541,
+ "learning_rate": 3.0110297948231787e-06,
+ "loss": 0.4487,
+ "step": 5570
+ },
+ {
+ "epoch": 2.6340425531914895,
+ "grad_norm": 2.6039397716522217,
+ "learning_rate": 3.010419124073876e-06,
+ "loss": 0.4516,
+ "step": 5571
+ },
+ {
+ "epoch": 2.63451536643026,
+ "grad_norm": 2.8480236530303955,
+ "learning_rate": 3.0098084215430124e-06,
+ "loss": 0.4962,
+ "step": 5572
+ },
+ {
+ "epoch": 2.6349881796690307,
+ "grad_norm": 2.527597427368164,
+ "learning_rate": 3.0091976872686133e-06,
+ "loss": 0.435,
+ "step": 5573
+ },
+ {
+ "epoch": 2.6354609929078014,
+ "grad_norm": 2.898303508758545,
+ "learning_rate": 3.0085869212887076e-06,
+ "loss": 0.4473,
+ "step": 5574
+ },
+ {
+ "epoch": 2.6359338061465722,
+ "grad_norm": 2.981414318084717,
+ "learning_rate": 3.007976123641324e-06,
+ "loss": 0.4203,
+ "step": 5575
+ },
+ {
+ "epoch": 2.636406619385343,
+ "grad_norm": 3.219064474105835,
+ "learning_rate": 3.0073652943644947e-06,
+ "loss": 0.4596,
+ "step": 5576
+ },
+ {
+ "epoch": 2.6368794326241134,
+ "grad_norm": 2.7287049293518066,
+ "learning_rate": 3.0067544334962532e-06,
+ "loss": 0.433,
+ "step": 5577
+ },
+ {
+ "epoch": 2.637352245862884,
+ "grad_norm": 2.6232664585113525,
+ "learning_rate": 3.0061435410746352e-06,
+ "loss": 0.4254,
+ "step": 5578
+ },
+ {
+ "epoch": 2.637825059101655,
+ "grad_norm": 2.908311605453491,
+ "learning_rate": 3.0055326171376788e-06,
+ "loss": 0.4349,
+ "step": 5579
+ },
+ {
+ "epoch": 2.6382978723404253,
+ "grad_norm": 2.8369064331054688,
+ "learning_rate": 3.0049216617234224e-06,
+ "loss": 0.4675,
+ "step": 5580
+ },
+ {
+ "epoch": 2.638770685579196,
+ "grad_norm": 2.659499406814575,
+ "learning_rate": 3.0043106748699085e-06,
+ "loss": 0.4073,
+ "step": 5581
+ },
+ {
+ "epoch": 2.639243498817967,
+ "grad_norm": 2.579765558242798,
+ "learning_rate": 3.00369965661518e-06,
+ "loss": 0.4536,
+ "step": 5582
+ },
+ {
+ "epoch": 2.6397163120567377,
+ "grad_norm": 3.572861909866333,
+ "learning_rate": 3.0030886069972827e-06,
+ "loss": 0.5227,
+ "step": 5583
+ },
+ {
+ "epoch": 2.6401891252955085,
+ "grad_norm": 2.6523196697235107,
+ "learning_rate": 3.002477526054263e-06,
+ "loss": 0.3846,
+ "step": 5584
+ },
+ {
+ "epoch": 2.640661938534279,
+ "grad_norm": 3.072181463241577,
+ "learning_rate": 3.001866413824173e-06,
+ "loss": 0.5399,
+ "step": 5585
+ },
+ {
+ "epoch": 2.6411347517730497,
+ "grad_norm": 2.7304325103759766,
+ "learning_rate": 3.0012552703450597e-06,
+ "loss": 0.4048,
+ "step": 5586
+ },
+ {
+ "epoch": 2.6416075650118205,
+ "grad_norm": 3.039491891860962,
+ "learning_rate": 3.0006440956549798e-06,
+ "loss": 0.5035,
+ "step": 5587
+ },
+ {
+ "epoch": 2.642080378250591,
+ "grad_norm": 2.7623798847198486,
+ "learning_rate": 3.000032889791988e-06,
+ "loss": 0.4369,
+ "step": 5588
+ },
+ {
+ "epoch": 2.6425531914893616,
+ "grad_norm": 3.391052722930908,
+ "learning_rate": 2.9994216527941394e-06,
+ "loss": 0.5308,
+ "step": 5589
+ },
+ {
+ "epoch": 2.6430260047281324,
+ "grad_norm": 3.0263915061950684,
+ "learning_rate": 2.9988103846994954e-06,
+ "loss": 0.4319,
+ "step": 5590
+ },
+ {
+ "epoch": 2.6434988179669032,
+ "grad_norm": 2.786607027053833,
+ "learning_rate": 2.998199085546115e-06,
+ "loss": 0.4695,
+ "step": 5591
+ },
+ {
+ "epoch": 2.643971631205674,
+ "grad_norm": 2.884674310684204,
+ "learning_rate": 2.9975877553720627e-06,
+ "loss": 0.4615,
+ "step": 5592
+ },
+ {
+ "epoch": 2.6444444444444444,
+ "grad_norm": 2.6100499629974365,
+ "learning_rate": 2.996976394215402e-06,
+ "loss": 0.4784,
+ "step": 5593
+ },
+ {
+ "epoch": 2.644917257683215,
+ "grad_norm": 2.6978676319122314,
+ "learning_rate": 2.9963650021142018e-06,
+ "loss": 0.3911,
+ "step": 5594
+ },
+ {
+ "epoch": 2.645390070921986,
+ "grad_norm": 2.8080835342407227,
+ "learning_rate": 2.9957535791065284e-06,
+ "loss": 0.4997,
+ "step": 5595
+ },
+ {
+ "epoch": 2.6458628841607563,
+ "grad_norm": 2.6639578342437744,
+ "learning_rate": 2.9951421252304537e-06,
+ "loss": 0.4066,
+ "step": 5596
+ },
+ {
+ "epoch": 2.646335697399527,
+ "grad_norm": 3.102456569671631,
+ "learning_rate": 2.9945306405240505e-06,
+ "loss": 0.5554,
+ "step": 5597
+ },
+ {
+ "epoch": 2.646808510638298,
+ "grad_norm": 2.6524150371551514,
+ "learning_rate": 2.993919125025392e-06,
+ "loss": 0.3881,
+ "step": 5598
+ },
+ {
+ "epoch": 2.6472813238770687,
+ "grad_norm": 2.926316499710083,
+ "learning_rate": 2.993307578772556e-06,
+ "loss": 0.4845,
+ "step": 5599
+ },
+ {
+ "epoch": 2.6477541371158395,
+ "grad_norm": 3.346550703048706,
+ "learning_rate": 2.9926960018036195e-06,
+ "loss": 0.4481,
+ "step": 5600
+ },
+ {
+ "epoch": 2.64822695035461,
+ "grad_norm": 2.6211020946502686,
+ "learning_rate": 2.9920843941566634e-06,
+ "loss": 0.4355,
+ "step": 5601
+ },
+ {
+ "epoch": 2.6486997635933807,
+ "grad_norm": 2.7479333877563477,
+ "learning_rate": 2.99147275586977e-06,
+ "loss": 0.4373,
+ "step": 5602
+ },
+ {
+ "epoch": 2.6491725768321515,
+ "grad_norm": 2.523385524749756,
+ "learning_rate": 2.9908610869810235e-06,
+ "loss": 0.4467,
+ "step": 5603
+ },
+ {
+ "epoch": 2.649645390070922,
+ "grad_norm": 2.93886137008667,
+ "learning_rate": 2.9902493875285086e-06,
+ "loss": 0.4956,
+ "step": 5604
+ },
+ {
+ "epoch": 2.6501182033096926,
+ "grad_norm": 2.7630443572998047,
+ "learning_rate": 2.989637657550315e-06,
+ "loss": 0.5012,
+ "step": 5605
+ },
+ {
+ "epoch": 2.6505910165484634,
+ "grad_norm": 2.6733906269073486,
+ "learning_rate": 2.989025897084531e-06,
+ "loss": 0.446,
+ "step": 5606
+ },
+ {
+ "epoch": 2.651063829787234,
+ "grad_norm": 2.8411107063293457,
+ "learning_rate": 2.9884141061692484e-06,
+ "loss": 0.4817,
+ "step": 5607
+ },
+ {
+ "epoch": 2.651536643026005,
+ "grad_norm": 2.8667192459106445,
+ "learning_rate": 2.987802284842562e-06,
+ "loss": 0.3909,
+ "step": 5608
+ },
+ {
+ "epoch": 2.6520094562647754,
+ "grad_norm": 3.4640755653381348,
+ "learning_rate": 2.987190433142565e-06,
+ "loss": 0.4379,
+ "step": 5609
+ },
+ {
+ "epoch": 2.652482269503546,
+ "grad_norm": 2.675121307373047,
+ "learning_rate": 2.9865785511073565e-06,
+ "loss": 0.4833,
+ "step": 5610
+ },
+ {
+ "epoch": 2.652955082742317,
+ "grad_norm": 2.4375529289245605,
+ "learning_rate": 2.9859666387750353e-06,
+ "loss": 0.3949,
+ "step": 5611
+ },
+ {
+ "epoch": 2.6534278959810873,
+ "grad_norm": 2.7312581539154053,
+ "learning_rate": 2.9853546961837026e-06,
+ "loss": 0.4546,
+ "step": 5612
+ },
+ {
+ "epoch": 2.653900709219858,
+ "grad_norm": 2.7695999145507812,
+ "learning_rate": 2.9847427233714617e-06,
+ "loss": 0.4696,
+ "step": 5613
+ },
+ {
+ "epoch": 2.654373522458629,
+ "grad_norm": 2.6313109397888184,
+ "learning_rate": 2.984130720376416e-06,
+ "loss": 0.4733,
+ "step": 5614
+ },
+ {
+ "epoch": 2.6548463356973997,
+ "grad_norm": 2.656864881515503,
+ "learning_rate": 2.9835186872366733e-06,
+ "loss": 0.3806,
+ "step": 5615
+ },
+ {
+ "epoch": 2.65531914893617,
+ "grad_norm": 2.720075845718384,
+ "learning_rate": 2.982906623990342e-06,
+ "loss": 0.4041,
+ "step": 5616
+ },
+ {
+ "epoch": 2.655791962174941,
+ "grad_norm": 2.6684951782226562,
+ "learning_rate": 2.9822945306755334e-06,
+ "loss": 0.4552,
+ "step": 5617
+ },
+ {
+ "epoch": 2.6562647754137116,
+ "grad_norm": 2.567751884460449,
+ "learning_rate": 2.9816824073303585e-06,
+ "loss": 0.465,
+ "step": 5618
+ },
+ {
+ "epoch": 2.656737588652482,
+ "grad_norm": 2.7490367889404297,
+ "learning_rate": 2.981070253992933e-06,
+ "loss": 0.4647,
+ "step": 5619
+ },
+ {
+ "epoch": 2.657210401891253,
+ "grad_norm": 2.548656463623047,
+ "learning_rate": 2.9804580707013715e-06,
+ "loss": 0.4226,
+ "step": 5620
+ },
+ {
+ "epoch": 2.6576832151300236,
+ "grad_norm": 2.5484731197357178,
+ "learning_rate": 2.9798458574937927e-06,
+ "loss": 0.382,
+ "step": 5621
+ },
+ {
+ "epoch": 2.6581560283687944,
+ "grad_norm": 2.7293949127197266,
+ "learning_rate": 2.979233614408317e-06,
+ "loss": 0.4418,
+ "step": 5622
+ },
+ {
+ "epoch": 2.658628841607565,
+ "grad_norm": 2.645036458969116,
+ "learning_rate": 2.9786213414830646e-06,
+ "loss": 0.414,
+ "step": 5623
+ },
+ {
+ "epoch": 2.6591016548463355,
+ "grad_norm": 2.5287609100341797,
+ "learning_rate": 2.9780090387561604e-06,
+ "loss": 0.3914,
+ "step": 5624
+ },
+ {
+ "epoch": 2.6595744680851063,
+ "grad_norm": 2.5570411682128906,
+ "learning_rate": 2.9773967062657293e-06,
+ "loss": 0.4431,
+ "step": 5625
+ },
+ {
+ "epoch": 2.660047281323877,
+ "grad_norm": 2.681749105453491,
+ "learning_rate": 2.9767843440498983e-06,
+ "loss": 0.4245,
+ "step": 5626
+ },
+ {
+ "epoch": 2.6605200945626475,
+ "grad_norm": 2.8629777431488037,
+ "learning_rate": 2.976171952146798e-06,
+ "loss": 0.4643,
+ "step": 5627
+ },
+ {
+ "epoch": 2.6609929078014183,
+ "grad_norm": 2.577148199081421,
+ "learning_rate": 2.9755595305945573e-06,
+ "loss": 0.43,
+ "step": 5628
+ },
+ {
+ "epoch": 2.661465721040189,
+ "grad_norm": 2.747218370437622,
+ "learning_rate": 2.97494707943131e-06,
+ "loss": 0.5194,
+ "step": 5629
+ },
+ {
+ "epoch": 2.66193853427896,
+ "grad_norm": 2.535604953765869,
+ "learning_rate": 2.9743345986951904e-06,
+ "loss": 0.4401,
+ "step": 5630
+ },
+ {
+ "epoch": 2.6624113475177307,
+ "grad_norm": 3.3341166973114014,
+ "learning_rate": 2.973722088424336e-06,
+ "loss": 0.4925,
+ "step": 5631
+ },
+ {
+ "epoch": 2.662884160756501,
+ "grad_norm": 2.9264349937438965,
+ "learning_rate": 2.973109548656884e-06,
+ "loss": 0.4787,
+ "step": 5632
+ },
+ {
+ "epoch": 2.663356973995272,
+ "grad_norm": 2.7132506370544434,
+ "learning_rate": 2.9724969794309742e-06,
+ "loss": 0.4138,
+ "step": 5633
+ },
+ {
+ "epoch": 2.6638297872340426,
+ "grad_norm": 2.7970192432403564,
+ "learning_rate": 2.9718843807847497e-06,
+ "loss": 0.4896,
+ "step": 5634
+ },
+ {
+ "epoch": 2.664302600472813,
+ "grad_norm": 2.610208749771118,
+ "learning_rate": 2.9712717527563545e-06,
+ "loss": 0.3997,
+ "step": 5635
+ },
+ {
+ "epoch": 2.6647754137115838,
+ "grad_norm": 3.5483577251434326,
+ "learning_rate": 2.9706590953839335e-06,
+ "loss": 0.5109,
+ "step": 5636
+ },
+ {
+ "epoch": 2.6652482269503546,
+ "grad_norm": 2.746933698654175,
+ "learning_rate": 2.9700464087056345e-06,
+ "loss": 0.4672,
+ "step": 5637
+ },
+ {
+ "epoch": 2.6657210401891254,
+ "grad_norm": 2.704436779022217,
+ "learning_rate": 2.969433692759607e-06,
+ "loss": 0.4402,
+ "step": 5638
+ },
+ {
+ "epoch": 2.666193853427896,
+ "grad_norm": 2.859520196914673,
+ "learning_rate": 2.9688209475840005e-06,
+ "loss": 0.4679,
+ "step": 5639
+ },
+ {
+ "epoch": 2.6666666666666665,
+ "grad_norm": 2.518580436706543,
+ "learning_rate": 2.968208173216971e-06,
+ "loss": 0.3772,
+ "step": 5640
+ },
+ {
+ "epoch": 2.6671394799054373,
+ "grad_norm": 2.7624926567077637,
+ "learning_rate": 2.967595369696671e-06,
+ "loss": 0.4753,
+ "step": 5641
+ },
+ {
+ "epoch": 2.667612293144208,
+ "grad_norm": 2.654003620147705,
+ "learning_rate": 2.966982537061257e-06,
+ "loss": 0.4583,
+ "step": 5642
+ },
+ {
+ "epoch": 2.6680851063829785,
+ "grad_norm": 2.8473968505859375,
+ "learning_rate": 2.966369675348888e-06,
+ "loss": 0.4623,
+ "step": 5643
+ },
+ {
+ "epoch": 2.6685579196217493,
+ "grad_norm": 2.5587947368621826,
+ "learning_rate": 2.9657567845977253e-06,
+ "loss": 0.4014,
+ "step": 5644
+ },
+ {
+ "epoch": 2.66903073286052,
+ "grad_norm": 2.572220802307129,
+ "learning_rate": 2.96514386484593e-06,
+ "loss": 0.4249,
+ "step": 5645
+ },
+ {
+ "epoch": 2.669503546099291,
+ "grad_norm": 2.7995707988739014,
+ "learning_rate": 2.964530916131665e-06,
+ "loss": 0.4575,
+ "step": 5646
+ },
+ {
+ "epoch": 2.6699763593380617,
+ "grad_norm": 2.8712687492370605,
+ "learning_rate": 2.963917938493097e-06,
+ "loss": 0.4353,
+ "step": 5647
+ },
+ {
+ "epoch": 2.670449172576832,
+ "grad_norm": 2.856473207473755,
+ "learning_rate": 2.963304931968393e-06,
+ "loss": 0.4345,
+ "step": 5648
+ },
+ {
+ "epoch": 2.670921985815603,
+ "grad_norm": 2.709198474884033,
+ "learning_rate": 2.9626918965957224e-06,
+ "loss": 0.4116,
+ "step": 5649
+ },
+ {
+ "epoch": 2.6713947990543736,
+ "grad_norm": 2.8144607543945312,
+ "learning_rate": 2.962078832413257e-06,
+ "loss": 0.4575,
+ "step": 5650
+ },
+ {
+ "epoch": 2.671867612293144,
+ "grad_norm": 3.131911039352417,
+ "learning_rate": 2.961465739459168e-06,
+ "loss": 0.4743,
+ "step": 5651
+ },
+ {
+ "epoch": 2.6723404255319148,
+ "grad_norm": 2.8487515449523926,
+ "learning_rate": 2.9608526177716316e-06,
+ "loss": 0.4314,
+ "step": 5652
+ },
+ {
+ "epoch": 2.6728132387706856,
+ "grad_norm": 2.613229751586914,
+ "learning_rate": 2.960239467388823e-06,
+ "loss": 0.4807,
+ "step": 5653
+ },
+ {
+ "epoch": 2.6732860520094563,
+ "grad_norm": 2.5049116611480713,
+ "learning_rate": 2.9596262883489213e-06,
+ "loss": 0.4708,
+ "step": 5654
+ },
+ {
+ "epoch": 2.673758865248227,
+ "grad_norm": 2.6347460746765137,
+ "learning_rate": 2.9590130806901052e-06,
+ "loss": 0.3689,
+ "step": 5655
+ },
+ {
+ "epoch": 2.6742316784869975,
+ "grad_norm": 3.3290371894836426,
+ "learning_rate": 2.9583998444505578e-06,
+ "loss": 0.4674,
+ "step": 5656
+ },
+ {
+ "epoch": 2.6747044917257683,
+ "grad_norm": 2.748403549194336,
+ "learning_rate": 2.957786579668462e-06,
+ "loss": 0.3852,
+ "step": 5657
+ },
+ {
+ "epoch": 2.675177304964539,
+ "grad_norm": 2.837573766708374,
+ "learning_rate": 2.957173286382003e-06,
+ "loss": 0.4541,
+ "step": 5658
+ },
+ {
+ "epoch": 2.6756501182033094,
+ "grad_norm": 3.0976510047912598,
+ "learning_rate": 2.9565599646293686e-06,
+ "loss": 0.4669,
+ "step": 5659
+ },
+ {
+ "epoch": 2.6761229314420802,
+ "grad_norm": 2.7059597969055176,
+ "learning_rate": 2.955946614448747e-06,
+ "loss": 0.3935,
+ "step": 5660
+ },
+ {
+ "epoch": 2.676595744680851,
+ "grad_norm": 2.6700541973114014,
+ "learning_rate": 2.9553332358783294e-06,
+ "loss": 0.4322,
+ "step": 5661
+ },
+ {
+ "epoch": 2.677068557919622,
+ "grad_norm": 2.9782698154449463,
+ "learning_rate": 2.9547198289563068e-06,
+ "loss": 0.4338,
+ "step": 5662
+ },
+ {
+ "epoch": 2.6775413711583926,
+ "grad_norm": 2.637876510620117,
+ "learning_rate": 2.9541063937208755e-06,
+ "loss": 0.4289,
+ "step": 5663
+ },
+ {
+ "epoch": 2.678014184397163,
+ "grad_norm": 3.421949863433838,
+ "learning_rate": 2.953492930210229e-06,
+ "loss": 0.5458,
+ "step": 5664
+ },
+ {
+ "epoch": 2.678486997635934,
+ "grad_norm": 2.8273842334747314,
+ "learning_rate": 2.952879438462567e-06,
+ "loss": 0.4529,
+ "step": 5665
+ },
+ {
+ "epoch": 2.6789598108747046,
+ "grad_norm": 2.9090168476104736,
+ "learning_rate": 2.9522659185160873e-06,
+ "loss": 0.444,
+ "step": 5666
+ },
+ {
+ "epoch": 2.679432624113475,
+ "grad_norm": 2.646710157394409,
+ "learning_rate": 2.9516523704089927e-06,
+ "loss": 0.4226,
+ "step": 5667
+ },
+ {
+ "epoch": 2.6799054373522457,
+ "grad_norm": 2.65915584564209,
+ "learning_rate": 2.951038794179486e-06,
+ "loss": 0.4307,
+ "step": 5668
+ },
+ {
+ "epoch": 2.6803782505910165,
+ "grad_norm": 3.004507303237915,
+ "learning_rate": 2.950425189865771e-06,
+ "loss": 0.4799,
+ "step": 5669
+ },
+ {
+ "epoch": 2.6808510638297873,
+ "grad_norm": 2.5210134983062744,
+ "learning_rate": 2.949811557506054e-06,
+ "loss": 0.3842,
+ "step": 5670
+ },
+ {
+ "epoch": 2.681323877068558,
+ "grad_norm": 2.8072893619537354,
+ "learning_rate": 2.9491978971385436e-06,
+ "loss": 0.435,
+ "step": 5671
+ },
+ {
+ "epoch": 2.6817966903073285,
+ "grad_norm": 2.5701990127563477,
+ "learning_rate": 2.9485842088014498e-06,
+ "loss": 0.4932,
+ "step": 5672
+ },
+ {
+ "epoch": 2.6822695035460993,
+ "grad_norm": 2.9368457794189453,
+ "learning_rate": 2.9479704925329854e-06,
+ "loss": 0.455,
+ "step": 5673
+ },
+ {
+ "epoch": 2.68274231678487,
+ "grad_norm": 2.8576247692108154,
+ "learning_rate": 2.947356748371362e-06,
+ "loss": 0.4254,
+ "step": 5674
+ },
+ {
+ "epoch": 2.6832151300236404,
+ "grad_norm": 2.8999195098876953,
+ "learning_rate": 2.946742976354795e-06,
+ "loss": 0.4159,
+ "step": 5675
+ },
+ {
+ "epoch": 2.6836879432624112,
+ "grad_norm": 2.8439736366271973,
+ "learning_rate": 2.946129176521502e-06,
+ "loss": 0.4035,
+ "step": 5676
+ },
+ {
+ "epoch": 2.684160756501182,
+ "grad_norm": 2.8525729179382324,
+ "learning_rate": 2.945515348909702e-06,
+ "loss": 0.4137,
+ "step": 5677
+ },
+ {
+ "epoch": 2.684633569739953,
+ "grad_norm": 2.6573562622070312,
+ "learning_rate": 2.9449014935576147e-06,
+ "loss": 0.4203,
+ "step": 5678
+ },
+ {
+ "epoch": 2.6851063829787236,
+ "grad_norm": 2.765794277191162,
+ "learning_rate": 2.9442876105034616e-06,
+ "loss": 0.5184,
+ "step": 5679
+ },
+ {
+ "epoch": 2.685579196217494,
+ "grad_norm": 2.694617748260498,
+ "learning_rate": 2.943673699785467e-06,
+ "loss": 0.417,
+ "step": 5680
+ },
+ {
+ "epoch": 2.6860520094562648,
+ "grad_norm": 2.740774393081665,
+ "learning_rate": 2.943059761441857e-06,
+ "loss": 0.4431,
+ "step": 5681
+ },
+ {
+ "epoch": 2.6865248226950356,
+ "grad_norm": 2.670642614364624,
+ "learning_rate": 2.942445795510859e-06,
+ "loss": 0.4298,
+ "step": 5682
+ },
+ {
+ "epoch": 2.686997635933806,
+ "grad_norm": 2.838907241821289,
+ "learning_rate": 2.9418318020307e-06,
+ "loss": 0.4529,
+ "step": 5683
+ },
+ {
+ "epoch": 2.6874704491725767,
+ "grad_norm": 2.562317371368408,
+ "learning_rate": 2.9412177810396135e-06,
+ "loss": 0.4251,
+ "step": 5684
+ },
+ {
+ "epoch": 2.6879432624113475,
+ "grad_norm": 2.5805928707122803,
+ "learning_rate": 2.9406037325758298e-06,
+ "loss": 0.4405,
+ "step": 5685
+ },
+ {
+ "epoch": 2.6884160756501183,
+ "grad_norm": 2.5701205730438232,
+ "learning_rate": 2.939989656677583e-06,
+ "loss": 0.4184,
+ "step": 5686
+ },
+ {
+ "epoch": 2.688888888888889,
+ "grad_norm": 2.7990400791168213,
+ "learning_rate": 2.939375553383111e-06,
+ "loss": 0.4866,
+ "step": 5687
+ },
+ {
+ "epoch": 2.6893617021276595,
+ "grad_norm": 3.063319206237793,
+ "learning_rate": 2.9387614227306487e-06,
+ "loss": 0.4202,
+ "step": 5688
+ },
+ {
+ "epoch": 2.6898345153664303,
+ "grad_norm": 3.0891315937042236,
+ "learning_rate": 2.938147264758437e-06,
+ "loss": 0.4344,
+ "step": 5689
+ },
+ {
+ "epoch": 2.690307328605201,
+ "grad_norm": 2.8982670307159424,
+ "learning_rate": 2.9375330795047165e-06,
+ "loss": 0.4548,
+ "step": 5690
+ },
+ {
+ "epoch": 2.6907801418439714,
+ "grad_norm": 2.7947235107421875,
+ "learning_rate": 2.9369188670077293e-06,
+ "loss": 0.5028,
+ "step": 5691
+ },
+ {
+ "epoch": 2.691252955082742,
+ "grad_norm": 3.1615960597991943,
+ "learning_rate": 2.9363046273057206e-06,
+ "loss": 0.4855,
+ "step": 5692
+ },
+ {
+ "epoch": 2.691725768321513,
+ "grad_norm": 2.669516086578369,
+ "learning_rate": 2.935690360436935e-06,
+ "loss": 0.3813,
+ "step": 5693
+ },
+ {
+ "epoch": 2.692198581560284,
+ "grad_norm": 2.8743274211883545,
+ "learning_rate": 2.935076066439622e-06,
+ "loss": 0.4302,
+ "step": 5694
+ },
+ {
+ "epoch": 2.6926713947990546,
+ "grad_norm": 2.6829612255096436,
+ "learning_rate": 2.9344617453520295e-06,
+ "loss": 0.4063,
+ "step": 5695
+ },
+ {
+ "epoch": 2.693144208037825,
+ "grad_norm": 2.776447057723999,
+ "learning_rate": 2.9338473972124097e-06,
+ "loss": 0.4921,
+ "step": 5696
+ },
+ {
+ "epoch": 2.6936170212765957,
+ "grad_norm": 2.7865772247314453,
+ "learning_rate": 2.9332330220590143e-06,
+ "loss": 0.4939,
+ "step": 5697
+ },
+ {
+ "epoch": 2.6940898345153665,
+ "grad_norm": 3.020526170730591,
+ "learning_rate": 2.932618619930098e-06,
+ "loss": 0.4839,
+ "step": 5698
+ },
+ {
+ "epoch": 2.694562647754137,
+ "grad_norm": 2.637057065963745,
+ "learning_rate": 2.932004190863918e-06,
+ "loss": 0.4343,
+ "step": 5699
+ },
+ {
+ "epoch": 2.6950354609929077,
+ "grad_norm": 2.7426512241363525,
+ "learning_rate": 2.9313897348987314e-06,
+ "loss": 0.3609,
+ "step": 5700
+ },
+ {
+ "epoch": 2.6955082742316785,
+ "grad_norm": 2.767186164855957,
+ "learning_rate": 2.9307752520727974e-06,
+ "loss": 0.3793,
+ "step": 5701
+ },
+ {
+ "epoch": 2.6959810874704493,
+ "grad_norm": 2.4791622161865234,
+ "learning_rate": 2.930160742424377e-06,
+ "loss": 0.4192,
+ "step": 5702
+ },
+ {
+ "epoch": 2.69645390070922,
+ "grad_norm": 2.661461591720581,
+ "learning_rate": 2.9295462059917336e-06,
+ "loss": 0.4758,
+ "step": 5703
+ },
+ {
+ "epoch": 2.6969267139479904,
+ "grad_norm": 2.896242380142212,
+ "learning_rate": 2.928931642813131e-06,
+ "loss": 0.42,
+ "step": 5704
+ },
+ {
+ "epoch": 2.6973995271867612,
+ "grad_norm": 2.783813238143921,
+ "learning_rate": 2.9283170529268366e-06,
+ "loss": 0.4726,
+ "step": 5705
+ },
+ {
+ "epoch": 2.697872340425532,
+ "grad_norm": 2.4347333908081055,
+ "learning_rate": 2.927702436371117e-06,
+ "loss": 0.4199,
+ "step": 5706
+ },
+ {
+ "epoch": 2.6983451536643024,
+ "grad_norm": 2.4643805027008057,
+ "learning_rate": 2.927087793184242e-06,
+ "loss": 0.3578,
+ "step": 5707
+ },
+ {
+ "epoch": 2.698817966903073,
+ "grad_norm": 2.6396660804748535,
+ "learning_rate": 2.9264731234044835e-06,
+ "loss": 0.4509,
+ "step": 5708
+ },
+ {
+ "epoch": 2.699290780141844,
+ "grad_norm": 2.7341182231903076,
+ "learning_rate": 2.925858427070113e-06,
+ "loss": 0.4331,
+ "step": 5709
+ },
+ {
+ "epoch": 2.699763593380615,
+ "grad_norm": 2.7578938007354736,
+ "learning_rate": 2.9252437042194058e-06,
+ "loss": 0.4508,
+ "step": 5710
+ },
+ {
+ "epoch": 2.7002364066193856,
+ "grad_norm": 2.557788133621216,
+ "learning_rate": 2.9246289548906375e-06,
+ "loss": 0.3775,
+ "step": 5711
+ },
+ {
+ "epoch": 2.700709219858156,
+ "grad_norm": 2.802851676940918,
+ "learning_rate": 2.924014179122086e-06,
+ "loss": 0.4518,
+ "step": 5712
+ },
+ {
+ "epoch": 2.7011820330969267,
+ "grad_norm": 2.4773001670837402,
+ "learning_rate": 2.9233993769520313e-06,
+ "loss": 0.4019,
+ "step": 5713
+ },
+ {
+ "epoch": 2.7016548463356975,
+ "grad_norm": 3.108971357345581,
+ "learning_rate": 2.922784548418754e-06,
+ "loss": 0.4715,
+ "step": 5714
+ },
+ {
+ "epoch": 2.702127659574468,
+ "grad_norm": 2.8596770763397217,
+ "learning_rate": 2.9221696935605366e-06,
+ "loss": 0.4361,
+ "step": 5715
+ },
+ {
+ "epoch": 2.7026004728132387,
+ "grad_norm": 2.570604085922241,
+ "learning_rate": 2.9215548124156633e-06,
+ "loss": 0.3982,
+ "step": 5716
+ },
+ {
+ "epoch": 2.7030732860520095,
+ "grad_norm": 2.3157799243927,
+ "learning_rate": 2.9209399050224206e-06,
+ "loss": 0.456,
+ "step": 5717
+ },
+ {
+ "epoch": 2.7035460992907803,
+ "grad_norm": 2.6865758895874023,
+ "learning_rate": 2.9203249714190952e-06,
+ "loss": 0.4441,
+ "step": 5718
+ },
+ {
+ "epoch": 2.704018912529551,
+ "grad_norm": 2.76723313331604,
+ "learning_rate": 2.919710011643978e-06,
+ "loss": 0.464,
+ "step": 5719
+ },
+ {
+ "epoch": 2.7044917257683214,
+ "grad_norm": 2.648792028427124,
+ "learning_rate": 2.9190950257353578e-06,
+ "loss": 0.3426,
+ "step": 5720
+ },
+ {
+ "epoch": 2.704964539007092,
+ "grad_norm": 2.878739833831787,
+ "learning_rate": 2.9184800137315276e-06,
+ "loss": 0.4431,
+ "step": 5721
+ },
+ {
+ "epoch": 2.705437352245863,
+ "grad_norm": 2.670567274093628,
+ "learning_rate": 2.917864975670783e-06,
+ "loss": 0.4347,
+ "step": 5722
+ },
+ {
+ "epoch": 2.7059101654846334,
+ "grad_norm": 2.7031569480895996,
+ "learning_rate": 2.9172499115914184e-06,
+ "loss": 0.4557,
+ "step": 5723
+ },
+ {
+ "epoch": 2.706382978723404,
+ "grad_norm": 2.5225696563720703,
+ "learning_rate": 2.9166348215317314e-06,
+ "loss": 0.4159,
+ "step": 5724
+ },
+ {
+ "epoch": 2.706855791962175,
+ "grad_norm": 2.8676085472106934,
+ "learning_rate": 2.916019705530021e-06,
+ "loss": 0.5018,
+ "step": 5725
+ },
+ {
+ "epoch": 2.7073286052009458,
+ "grad_norm": 2.576463460922241,
+ "learning_rate": 2.915404563624587e-06,
+ "loss": 0.4317,
+ "step": 5726
+ },
+ {
+ "epoch": 2.7078014184397166,
+ "grad_norm": 3.155565023422241,
+ "learning_rate": 2.9147893958537328e-06,
+ "loss": 0.5029,
+ "step": 5727
+ },
+ {
+ "epoch": 2.708274231678487,
+ "grad_norm": 2.604079008102417,
+ "learning_rate": 2.9141742022557622e-06,
+ "loss": 0.4324,
+ "step": 5728
+ },
+ {
+ "epoch": 2.7087470449172577,
+ "grad_norm": 2.6597228050231934,
+ "learning_rate": 2.913558982868979e-06,
+ "loss": 0.4335,
+ "step": 5729
+ },
+ {
+ "epoch": 2.7092198581560285,
+ "grad_norm": 2.811384439468384,
+ "learning_rate": 2.9129437377316923e-06,
+ "loss": 0.4031,
+ "step": 5730
+ },
+ {
+ "epoch": 2.709692671394799,
+ "grad_norm": 3.1041207313537598,
+ "learning_rate": 2.91232846688221e-06,
+ "loss": 0.481,
+ "step": 5731
+ },
+ {
+ "epoch": 2.7101654846335697,
+ "grad_norm": 2.5992188453674316,
+ "learning_rate": 2.9117131703588414e-06,
+ "loss": 0.4266,
+ "step": 5732
+ },
+ {
+ "epoch": 2.7106382978723405,
+ "grad_norm": 2.7726242542266846,
+ "learning_rate": 2.911097848199899e-06,
+ "loss": 0.4464,
+ "step": 5733
+ },
+ {
+ "epoch": 2.7111111111111112,
+ "grad_norm": 2.8683483600616455,
+ "learning_rate": 2.9104825004436966e-06,
+ "loss": 0.4248,
+ "step": 5734
+ },
+ {
+ "epoch": 2.711583924349882,
+ "grad_norm": 2.776386022567749,
+ "learning_rate": 2.9098671271285484e-06,
+ "loss": 0.4556,
+ "step": 5735
+ },
+ {
+ "epoch": 2.7120567375886524,
+ "grad_norm": 2.7612528800964355,
+ "learning_rate": 2.909251728292771e-06,
+ "loss": 0.455,
+ "step": 5736
+ },
+ {
+ "epoch": 2.712529550827423,
+ "grad_norm": 2.9223551750183105,
+ "learning_rate": 2.908636303974684e-06,
+ "loss": 0.4302,
+ "step": 5737
+ },
+ {
+ "epoch": 2.713002364066194,
+ "grad_norm": 2.898226022720337,
+ "learning_rate": 2.908020854212606e-06,
+ "loss": 0.4827,
+ "step": 5738
+ },
+ {
+ "epoch": 2.7134751773049643,
+ "grad_norm": 2.706361770629883,
+ "learning_rate": 2.9074053790448576e-06,
+ "loss": 0.4444,
+ "step": 5739
+ },
+ {
+ "epoch": 2.713947990543735,
+ "grad_norm": 2.8227248191833496,
+ "learning_rate": 2.9067898785097637e-06,
+ "loss": 0.4661,
+ "step": 5740
+ },
+ {
+ "epoch": 2.714420803782506,
+ "grad_norm": 2.597837448120117,
+ "learning_rate": 2.9061743526456474e-06,
+ "loss": 0.4646,
+ "step": 5741
+ },
+ {
+ "epoch": 2.7148936170212767,
+ "grad_norm": 2.5525131225585938,
+ "learning_rate": 2.9055588014908354e-06,
+ "loss": 0.4172,
+ "step": 5742
+ },
+ {
+ "epoch": 2.7153664302600475,
+ "grad_norm": 2.713071823120117,
+ "learning_rate": 2.904943225083655e-06,
+ "loss": 0.4893,
+ "step": 5743
+ },
+ {
+ "epoch": 2.715839243498818,
+ "grad_norm": 2.538623571395874,
+ "learning_rate": 2.9043276234624353e-06,
+ "loss": 0.3905,
+ "step": 5744
+ },
+ {
+ "epoch": 2.7163120567375887,
+ "grad_norm": 2.5190389156341553,
+ "learning_rate": 2.9037119966655076e-06,
+ "loss": 0.4318,
+ "step": 5745
+ },
+ {
+ "epoch": 2.7167848699763595,
+ "grad_norm": 2.6587612628936768,
+ "learning_rate": 2.903096344731204e-06,
+ "loss": 0.4153,
+ "step": 5746
+ },
+ {
+ "epoch": 2.71725768321513,
+ "grad_norm": 2.836731433868408,
+ "learning_rate": 2.902480667697859e-06,
+ "loss": 0.4779,
+ "step": 5747
+ },
+ {
+ "epoch": 2.7177304964539006,
+ "grad_norm": 2.8076045513153076,
+ "learning_rate": 2.9018649656038074e-06,
+ "loss": 0.5126,
+ "step": 5748
+ },
+ {
+ "epoch": 2.7182033096926714,
+ "grad_norm": 2.8930516242980957,
+ "learning_rate": 2.9012492384873865e-06,
+ "loss": 0.4561,
+ "step": 5749
+ },
+ {
+ "epoch": 2.7186761229314422,
+ "grad_norm": 2.7000370025634766,
+ "learning_rate": 2.9006334863869343e-06,
+ "loss": 0.4659,
+ "step": 5750
+ },
+ {
+ "epoch": 2.719148936170213,
+ "grad_norm": 2.927011251449585,
+ "learning_rate": 2.9000177093407926e-06,
+ "loss": 0.5123,
+ "step": 5751
+ },
+ {
+ "epoch": 2.7196217494089834,
+ "grad_norm": 3.0102779865264893,
+ "learning_rate": 2.8994019073873015e-06,
+ "loss": 0.3972,
+ "step": 5752
+ },
+ {
+ "epoch": 2.720094562647754,
+ "grad_norm": 2.778838634490967,
+ "learning_rate": 2.8987860805648054e-06,
+ "loss": 0.4922,
+ "step": 5753
+ },
+ {
+ "epoch": 2.720567375886525,
+ "grad_norm": 2.6150314807891846,
+ "learning_rate": 2.898170228911648e-06,
+ "loss": 0.4425,
+ "step": 5754
+ },
+ {
+ "epoch": 2.7210401891252953,
+ "grad_norm": 2.9329984188079834,
+ "learning_rate": 2.8975543524661777e-06,
+ "loss": 0.4872,
+ "step": 5755
+ },
+ {
+ "epoch": 2.721513002364066,
+ "grad_norm": 2.756803512573242,
+ "learning_rate": 2.8969384512667404e-06,
+ "loss": 0.4362,
+ "step": 5756
+ },
+ {
+ "epoch": 2.721985815602837,
+ "grad_norm": 2.600877285003662,
+ "learning_rate": 2.896322525351686e-06,
+ "loss": 0.4802,
+ "step": 5757
+ },
+ {
+ "epoch": 2.7224586288416077,
+ "grad_norm": 2.647069215774536,
+ "learning_rate": 2.8957065747593655e-06,
+ "loss": 0.4649,
+ "step": 5758
+ },
+ {
+ "epoch": 2.7229314420803785,
+ "grad_norm": 2.845388174057007,
+ "learning_rate": 2.895090599528132e-06,
+ "loss": 0.4533,
+ "step": 5759
+ },
+ {
+ "epoch": 2.723404255319149,
+ "grad_norm": 2.973881721496582,
+ "learning_rate": 2.8944745996963397e-06,
+ "loss": 0.4959,
+ "step": 5760
+ },
+ {
+ "epoch": 2.7238770685579197,
+ "grad_norm": 2.8995487689971924,
+ "learning_rate": 2.8938585753023435e-06,
+ "loss": 0.4597,
+ "step": 5761
+ },
+ {
+ "epoch": 2.7243498817966905,
+ "grad_norm": 2.903693437576294,
+ "learning_rate": 2.8932425263845004e-06,
+ "loss": 0.4521,
+ "step": 5762
+ },
+ {
+ "epoch": 2.724822695035461,
+ "grad_norm": 2.7609009742736816,
+ "learning_rate": 2.8926264529811702e-06,
+ "loss": 0.4399,
+ "step": 5763
+ },
+ {
+ "epoch": 2.7252955082742316,
+ "grad_norm": 2.788787603378296,
+ "learning_rate": 2.892010355130712e-06,
+ "loss": 0.4614,
+ "step": 5764
+ },
+ {
+ "epoch": 2.7257683215130024,
+ "grad_norm": 2.786498785018921,
+ "learning_rate": 2.8913942328714887e-06,
+ "loss": 0.4798,
+ "step": 5765
+ },
+ {
+ "epoch": 2.726241134751773,
+ "grad_norm": 2.9809393882751465,
+ "learning_rate": 2.8907780862418616e-06,
+ "loss": 0.5108,
+ "step": 5766
+ },
+ {
+ "epoch": 2.726713947990544,
+ "grad_norm": 2.6621177196502686,
+ "learning_rate": 2.8901619152801967e-06,
+ "loss": 0.4031,
+ "step": 5767
+ },
+ {
+ "epoch": 2.7271867612293144,
+ "grad_norm": 3.3092098236083984,
+ "learning_rate": 2.8895457200248607e-06,
+ "loss": 0.4671,
+ "step": 5768
+ },
+ {
+ "epoch": 2.727659574468085,
+ "grad_norm": 2.866306781768799,
+ "learning_rate": 2.8889295005142204e-06,
+ "loss": 0.4434,
+ "step": 5769
+ },
+ {
+ "epoch": 2.728132387706856,
+ "grad_norm": 2.6861231327056885,
+ "learning_rate": 2.888313256786646e-06,
+ "loss": 0.429,
+ "step": 5770
+ },
+ {
+ "epoch": 2.7286052009456263,
+ "grad_norm": 2.873180389404297,
+ "learning_rate": 2.8876969888805072e-06,
+ "loss": 0.4412,
+ "step": 5771
+ },
+ {
+ "epoch": 2.729078014184397,
+ "grad_norm": 2.511678695678711,
+ "learning_rate": 2.887080696834178e-06,
+ "loss": 0.4024,
+ "step": 5772
+ },
+ {
+ "epoch": 2.729550827423168,
+ "grad_norm": 2.6502726078033447,
+ "learning_rate": 2.88646438068603e-06,
+ "loss": 0.4357,
+ "step": 5773
+ },
+ {
+ "epoch": 2.7300236406619387,
+ "grad_norm": 2.7156145572662354,
+ "learning_rate": 2.8858480404744403e-06,
+ "loss": 0.4511,
+ "step": 5774
+ },
+ {
+ "epoch": 2.7304964539007095,
+ "grad_norm": 2.882582187652588,
+ "learning_rate": 2.8852316762377842e-06,
+ "loss": 0.4822,
+ "step": 5775
+ },
+ {
+ "epoch": 2.73096926713948,
+ "grad_norm": 2.7139666080474854,
+ "learning_rate": 2.8846152880144413e-06,
+ "loss": 0.4666,
+ "step": 5776
+ },
+ {
+ "epoch": 2.7314420803782506,
+ "grad_norm": 2.7453949451446533,
+ "learning_rate": 2.8839988758427907e-06,
+ "loss": 0.3927,
+ "step": 5777
+ },
+ {
+ "epoch": 2.731914893617021,
+ "grad_norm": 2.7859580516815186,
+ "learning_rate": 2.883382439761214e-06,
+ "loss": 0.4466,
+ "step": 5778
+ },
+ {
+ "epoch": 2.732387706855792,
+ "grad_norm": 2.695234537124634,
+ "learning_rate": 2.882765979808094e-06,
+ "loss": 0.4227,
+ "step": 5779
+ },
+ {
+ "epoch": 2.7328605200945626,
+ "grad_norm": 2.8081552982330322,
+ "learning_rate": 2.8821494960218148e-06,
+ "loss": 0.447,
+ "step": 5780
+ },
+ {
+ "epoch": 2.7333333333333334,
+ "grad_norm": 2.887643337249756,
+ "learning_rate": 2.881532988440762e-06,
+ "loss": 0.5018,
+ "step": 5781
+ },
+ {
+ "epoch": 2.733806146572104,
+ "grad_norm": 3.108212471008301,
+ "learning_rate": 2.8809164571033233e-06,
+ "loss": 0.4132,
+ "step": 5782
+ },
+ {
+ "epoch": 2.7342789598108745,
+ "grad_norm": 2.874328374862671,
+ "learning_rate": 2.880299902047886e-06,
+ "loss": 0.4618,
+ "step": 5783
+ },
+ {
+ "epoch": 2.7347517730496453,
+ "grad_norm": 3.089132308959961,
+ "learning_rate": 2.879683323312843e-06,
+ "loss": 0.4956,
+ "step": 5784
+ },
+ {
+ "epoch": 2.735224586288416,
+ "grad_norm": 2.5173206329345703,
+ "learning_rate": 2.879066720936583e-06,
+ "loss": 0.4087,
+ "step": 5785
+ },
+ {
+ "epoch": 2.7356973995271865,
+ "grad_norm": 2.6401286125183105,
+ "learning_rate": 2.8784500949575014e-06,
+ "loss": 0.3995,
+ "step": 5786
+ },
+ {
+ "epoch": 2.7361702127659573,
+ "grad_norm": 2.9371910095214844,
+ "learning_rate": 2.877833445413991e-06,
+ "loss": 0.5209,
+ "step": 5787
+ },
+ {
+ "epoch": 2.736643026004728,
+ "grad_norm": 3.218158006668091,
+ "learning_rate": 2.8772167723444498e-06,
+ "loss": 0.4275,
+ "step": 5788
+ },
+ {
+ "epoch": 2.737115839243499,
+ "grad_norm": 2.9072160720825195,
+ "learning_rate": 2.8766000757872736e-06,
+ "loss": 0.4244,
+ "step": 5789
+ },
+ {
+ "epoch": 2.7375886524822697,
+ "grad_norm": 3.0378096103668213,
+ "learning_rate": 2.8759833557808614e-06,
+ "loss": 0.507,
+ "step": 5790
+ },
+ {
+ "epoch": 2.73806146572104,
+ "grad_norm": 2.728353977203369,
+ "learning_rate": 2.8753666123636148e-06,
+ "loss": 0.413,
+ "step": 5791
+ },
+ {
+ "epoch": 2.738534278959811,
+ "grad_norm": 2.6869957447052,
+ "learning_rate": 2.874749845573935e-06,
+ "loss": 0.44,
+ "step": 5792
+ },
+ {
+ "epoch": 2.7390070921985816,
+ "grad_norm": 2.6381702423095703,
+ "learning_rate": 2.8741330554502263e-06,
+ "loss": 0.4708,
+ "step": 5793
+ },
+ {
+ "epoch": 2.739479905437352,
+ "grad_norm": 2.6944689750671387,
+ "learning_rate": 2.873516242030892e-06,
+ "loss": 0.4555,
+ "step": 5794
+ },
+ {
+ "epoch": 2.739952718676123,
+ "grad_norm": 3.168473243713379,
+ "learning_rate": 2.8728994053543396e-06,
+ "loss": 0.4538,
+ "step": 5795
+ },
+ {
+ "epoch": 2.7404255319148936,
+ "grad_norm": 2.7504515647888184,
+ "learning_rate": 2.872282545458976e-06,
+ "loss": 0.4628,
+ "step": 5796
+ },
+ {
+ "epoch": 2.7408983451536644,
+ "grad_norm": 2.896462917327881,
+ "learning_rate": 2.8716656623832114e-06,
+ "loss": 0.4946,
+ "step": 5797
+ },
+ {
+ "epoch": 2.741371158392435,
+ "grad_norm": 2.8053417205810547,
+ "learning_rate": 2.8710487561654547e-06,
+ "loss": 0.4893,
+ "step": 5798
+ },
+ {
+ "epoch": 2.7418439716312055,
+ "grad_norm": 2.63171124458313,
+ "learning_rate": 2.870431826844119e-06,
+ "loss": 0.4257,
+ "step": 5799
+ },
+ {
+ "epoch": 2.7423167848699763,
+ "grad_norm": 3.0963807106018066,
+ "learning_rate": 2.869814874457618e-06,
+ "loss": 0.5404,
+ "step": 5800
+ },
+ {
+ "epoch": 2.742789598108747,
+ "grad_norm": 2.591132164001465,
+ "learning_rate": 2.8691978990443664e-06,
+ "loss": 0.4015,
+ "step": 5801
+ },
+ {
+ "epoch": 2.7432624113475175,
+ "grad_norm": 3.0319552421569824,
+ "learning_rate": 2.8685809006427812e-06,
+ "loss": 0.4411,
+ "step": 5802
+ },
+ {
+ "epoch": 2.7437352245862883,
+ "grad_norm": 2.7791874408721924,
+ "learning_rate": 2.8679638792912784e-06,
+ "loss": 0.43,
+ "step": 5803
+ },
+ {
+ "epoch": 2.744208037825059,
+ "grad_norm": 3.530632495880127,
+ "learning_rate": 2.867346835028279e-06,
+ "loss": 0.4581,
+ "step": 5804
+ },
+ {
+ "epoch": 2.74468085106383,
+ "grad_norm": 3.2043099403381348,
+ "learning_rate": 2.8667297678922024e-06,
+ "loss": 0.4375,
+ "step": 5805
+ },
+ {
+ "epoch": 2.7451536643026007,
+ "grad_norm": 2.8442344665527344,
+ "learning_rate": 2.8661126779214716e-06,
+ "loss": 0.4059,
+ "step": 5806
+ },
+ {
+ "epoch": 2.745626477541371,
+ "grad_norm": 2.7561380863189697,
+ "learning_rate": 2.86549556515451e-06,
+ "loss": 0.4391,
+ "step": 5807
+ },
+ {
+ "epoch": 2.746099290780142,
+ "grad_norm": 3.229663848876953,
+ "learning_rate": 2.8648784296297418e-06,
+ "loss": 0.4579,
+ "step": 5808
+ },
+ {
+ "epoch": 2.7465721040189126,
+ "grad_norm": 2.8375027179718018,
+ "learning_rate": 2.864261271385593e-06,
+ "loss": 0.4566,
+ "step": 5809
+ },
+ {
+ "epoch": 2.747044917257683,
+ "grad_norm": 2.392998695373535,
+ "learning_rate": 2.863644090460493e-06,
+ "loss": 0.4123,
+ "step": 5810
+ },
+ {
+ "epoch": 2.7475177304964538,
+ "grad_norm": 2.707610607147217,
+ "learning_rate": 2.86302688689287e-06,
+ "loss": 0.4299,
+ "step": 5811
+ },
+ {
+ "epoch": 2.7479905437352246,
+ "grad_norm": 2.824042797088623,
+ "learning_rate": 2.8624096607211547e-06,
+ "loss": 0.3799,
+ "step": 5812
+ },
+ {
+ "epoch": 2.7484633569739954,
+ "grad_norm": 2.933102607727051,
+ "learning_rate": 2.861792411983779e-06,
+ "loss": 0.4416,
+ "step": 5813
+ },
+ {
+ "epoch": 2.748936170212766,
+ "grad_norm": 2.7000277042388916,
+ "learning_rate": 2.8611751407191757e-06,
+ "loss": 0.4346,
+ "step": 5814
+ },
+ {
+ "epoch": 2.7494089834515365,
+ "grad_norm": 2.6757142543792725,
+ "learning_rate": 2.86055784696578e-06,
+ "loss": 0.4559,
+ "step": 5815
+ },
+ {
+ "epoch": 2.7498817966903073,
+ "grad_norm": 2.5791053771972656,
+ "learning_rate": 2.8599405307620287e-06,
+ "loss": 0.4619,
+ "step": 5816
+ },
+ {
+ "epoch": 2.750354609929078,
+ "grad_norm": 3.0327374935150146,
+ "learning_rate": 2.859323192146359e-06,
+ "loss": 0.4163,
+ "step": 5817
+ },
+ {
+ "epoch": 2.7508274231678485,
+ "grad_norm": 2.5580220222473145,
+ "learning_rate": 2.8587058311572084e-06,
+ "loss": 0.4005,
+ "step": 5818
+ },
+ {
+ "epoch": 2.7513002364066192,
+ "grad_norm": 2.592179536819458,
+ "learning_rate": 2.85808844783302e-06,
+ "loss": 0.4404,
+ "step": 5819
+ },
+ {
+ "epoch": 2.75177304964539,
+ "grad_norm": 3.2779927253723145,
+ "learning_rate": 2.8574710422122342e-06,
+ "loss": 0.54,
+ "step": 5820
+ },
+ {
+ "epoch": 2.752245862884161,
+ "grad_norm": 2.4804370403289795,
+ "learning_rate": 2.8568536143332933e-06,
+ "loss": 0.4476,
+ "step": 5821
+ },
+ {
+ "epoch": 2.7527186761229316,
+ "grad_norm": 2.649477481842041,
+ "learning_rate": 2.8562361642346427e-06,
+ "loss": 0.4336,
+ "step": 5822
+ },
+ {
+ "epoch": 2.753191489361702,
+ "grad_norm": 3.138587474822998,
+ "learning_rate": 2.855618691954728e-06,
+ "loss": 0.5042,
+ "step": 5823
+ },
+ {
+ "epoch": 2.753664302600473,
+ "grad_norm": 2.75093412399292,
+ "learning_rate": 2.855001197531997e-06,
+ "loss": 0.4327,
+ "step": 5824
+ },
+ {
+ "epoch": 2.7541371158392436,
+ "grad_norm": 2.678809642791748,
+ "learning_rate": 2.854383681004898e-06,
+ "loss": 0.4409,
+ "step": 5825
+ },
+ {
+ "epoch": 2.754609929078014,
+ "grad_norm": 2.965386390686035,
+ "learning_rate": 2.853766142411881e-06,
+ "loss": 0.4716,
+ "step": 5826
+ },
+ {
+ "epoch": 2.7550827423167847,
+ "grad_norm": 2.6419436931610107,
+ "learning_rate": 2.853148581791398e-06,
+ "loss": 0.4367,
+ "step": 5827
+ },
+ {
+ "epoch": 2.7555555555555555,
+ "grad_norm": 3.205794095993042,
+ "learning_rate": 2.8525309991819004e-06,
+ "loss": 0.4869,
+ "step": 5828
+ },
+ {
+ "epoch": 2.7560283687943263,
+ "grad_norm": 3.041008472442627,
+ "learning_rate": 2.851913394621844e-06,
+ "loss": 0.5087,
+ "step": 5829
+ },
+ {
+ "epoch": 2.756501182033097,
+ "grad_norm": 2.6525566577911377,
+ "learning_rate": 2.851295768149684e-06,
+ "loss": 0.3951,
+ "step": 5830
+ },
+ {
+ "epoch": 2.7569739952718675,
+ "grad_norm": 2.732220411300659,
+ "learning_rate": 2.850678119803876e-06,
+ "loss": 0.4797,
+ "step": 5831
+ },
+ {
+ "epoch": 2.7574468085106383,
+ "grad_norm": 2.8965251445770264,
+ "learning_rate": 2.8500604496228797e-06,
+ "loss": 0.4938,
+ "step": 5832
+ },
+ {
+ "epoch": 2.757919621749409,
+ "grad_norm": 2.48020076751709,
+ "learning_rate": 2.849442757645154e-06,
+ "loss": 0.4172,
+ "step": 5833
+ },
+ {
+ "epoch": 2.7583924349881794,
+ "grad_norm": 2.4764912128448486,
+ "learning_rate": 2.8488250439091603e-06,
+ "loss": 0.4123,
+ "step": 5834
+ },
+ {
+ "epoch": 2.7588652482269502,
+ "grad_norm": 2.4547016620635986,
+ "learning_rate": 2.84820730845336e-06,
+ "loss": 0.4116,
+ "step": 5835
+ },
+ {
+ "epoch": 2.759338061465721,
+ "grad_norm": 2.55476975440979,
+ "learning_rate": 2.847589551316218e-06,
+ "loss": 0.4744,
+ "step": 5836
+ },
+ {
+ "epoch": 2.759810874704492,
+ "grad_norm": 2.3866238594055176,
+ "learning_rate": 2.846971772536199e-06,
+ "loss": 0.4406,
+ "step": 5837
+ },
+ {
+ "epoch": 2.7602836879432626,
+ "grad_norm": 2.855318784713745,
+ "learning_rate": 2.8463539721517687e-06,
+ "loss": 0.4517,
+ "step": 5838
+ },
+ {
+ "epoch": 2.760756501182033,
+ "grad_norm": 2.527198314666748,
+ "learning_rate": 2.8457361502013954e-06,
+ "loss": 0.3588,
+ "step": 5839
+ },
+ {
+ "epoch": 2.7612293144208038,
+ "grad_norm": 2.6761462688446045,
+ "learning_rate": 2.8451183067235476e-06,
+ "loss": 0.4192,
+ "step": 5840
+ },
+ {
+ "epoch": 2.7617021276595746,
+ "grad_norm": 2.5692319869995117,
+ "learning_rate": 2.8445004417566967e-06,
+ "loss": 0.4108,
+ "step": 5841
+ },
+ {
+ "epoch": 2.762174940898345,
+ "grad_norm": 2.5721096992492676,
+ "learning_rate": 2.8438825553393133e-06,
+ "loss": 0.3941,
+ "step": 5842
+ },
+ {
+ "epoch": 2.7626477541371157,
+ "grad_norm": 2.699430227279663,
+ "learning_rate": 2.843264647509872e-06,
+ "loss": 0.4418,
+ "step": 5843
+ },
+ {
+ "epoch": 2.7631205673758865,
+ "grad_norm": 2.6943318843841553,
+ "learning_rate": 2.842646718306846e-06,
+ "loss": 0.4505,
+ "step": 5844
+ },
+ {
+ "epoch": 2.7635933806146573,
+ "grad_norm": 2.661656379699707,
+ "learning_rate": 2.8420287677687107e-06,
+ "loss": 0.4413,
+ "step": 5845
+ },
+ {
+ "epoch": 2.764066193853428,
+ "grad_norm": 2.830467939376831,
+ "learning_rate": 2.8414107959339444e-06,
+ "loss": 0.5095,
+ "step": 5846
+ },
+ {
+ "epoch": 2.7645390070921985,
+ "grad_norm": 2.598053455352783,
+ "learning_rate": 2.840792802841024e-06,
+ "loss": 0.4029,
+ "step": 5847
+ },
+ {
+ "epoch": 2.7650118203309693,
+ "grad_norm": 2.641700029373169,
+ "learning_rate": 2.8401747885284316e-06,
+ "loss": 0.4237,
+ "step": 5848
+ },
+ {
+ "epoch": 2.76548463356974,
+ "grad_norm": 2.6672768592834473,
+ "learning_rate": 2.8395567530346454e-06,
+ "loss": 0.4181,
+ "step": 5849
+ },
+ {
+ "epoch": 2.7659574468085104,
+ "grad_norm": 2.5851705074310303,
+ "learning_rate": 2.838938696398149e-06,
+ "loss": 0.4165,
+ "step": 5850
+ },
+ {
+ "epoch": 2.766430260047281,
+ "grad_norm": 2.318120002746582,
+ "learning_rate": 2.8383206186574276e-06,
+ "loss": 0.3578,
+ "step": 5851
+ },
+ {
+ "epoch": 2.766903073286052,
+ "grad_norm": 2.6199793815612793,
+ "learning_rate": 2.8377025198509635e-06,
+ "loss": 0.4719,
+ "step": 5852
+ },
+ {
+ "epoch": 2.767375886524823,
+ "grad_norm": 2.7186086177825928,
+ "learning_rate": 2.837084400017245e-06,
+ "loss": 0.41,
+ "step": 5853
+ },
+ {
+ "epoch": 2.7678486997635936,
+ "grad_norm": 2.702514886856079,
+ "learning_rate": 2.8364662591947583e-06,
+ "loss": 0.4659,
+ "step": 5854
+ },
+ {
+ "epoch": 2.768321513002364,
+ "grad_norm": 2.612375259399414,
+ "learning_rate": 2.835848097421993e-06,
+ "loss": 0.4252,
+ "step": 5855
+ },
+ {
+ "epoch": 2.7687943262411348,
+ "grad_norm": 3.0127978324890137,
+ "learning_rate": 2.8352299147374394e-06,
+ "loss": 0.4084,
+ "step": 5856
+ },
+ {
+ "epoch": 2.7692671394799055,
+ "grad_norm": 2.6460049152374268,
+ "learning_rate": 2.83461171117959e-06,
+ "loss": 0.4035,
+ "step": 5857
+ },
+ {
+ "epoch": 2.769739952718676,
+ "grad_norm": 2.9844725131988525,
+ "learning_rate": 2.8339934867869357e-06,
+ "loss": 0.4912,
+ "step": 5858
+ },
+ {
+ "epoch": 2.7702127659574467,
+ "grad_norm": 2.731217861175537,
+ "learning_rate": 2.833375241597972e-06,
+ "loss": 0.4112,
+ "step": 5859
+ },
+ {
+ "epoch": 2.7706855791962175,
+ "grad_norm": 2.731194496154785,
+ "learning_rate": 2.832756975651193e-06,
+ "loss": 0.4516,
+ "step": 5860
+ },
+ {
+ "epoch": 2.7711583924349883,
+ "grad_norm": 3.0532076358795166,
+ "learning_rate": 2.8321386889850965e-06,
+ "loss": 0.3959,
+ "step": 5861
+ },
+ {
+ "epoch": 2.771631205673759,
+ "grad_norm": 3.5437800884246826,
+ "learning_rate": 2.831520381638181e-06,
+ "loss": 0.6055,
+ "step": 5862
+ },
+ {
+ "epoch": 2.7721040189125294,
+ "grad_norm": 2.4297714233398438,
+ "learning_rate": 2.830902053648944e-06,
+ "loss": 0.4038,
+ "step": 5863
+ },
+ {
+ "epoch": 2.7725768321513002,
+ "grad_norm": 2.696768045425415,
+ "learning_rate": 2.8302837050558876e-06,
+ "loss": 0.3983,
+ "step": 5864
+ },
+ {
+ "epoch": 2.773049645390071,
+ "grad_norm": 2.6574649810791016,
+ "learning_rate": 2.8296653358975122e-06,
+ "loss": 0.4937,
+ "step": 5865
+ },
+ {
+ "epoch": 2.7735224586288414,
+ "grad_norm": 2.9393341541290283,
+ "learning_rate": 2.8290469462123234e-06,
+ "loss": 0.4603,
+ "step": 5866
+ },
+ {
+ "epoch": 2.773995271867612,
+ "grad_norm": 2.7630696296691895,
+ "learning_rate": 2.828428536038824e-06,
+ "loss": 0.4663,
+ "step": 5867
+ },
+ {
+ "epoch": 2.774468085106383,
+ "grad_norm": 2.7354233264923096,
+ "learning_rate": 2.8278101054155183e-06,
+ "loss": 0.4444,
+ "step": 5868
+ },
+ {
+ "epoch": 2.774940898345154,
+ "grad_norm": 3.0489425659179688,
+ "learning_rate": 2.827191654380915e-06,
+ "loss": 0.4684,
+ "step": 5869
+ },
+ {
+ "epoch": 2.7754137115839246,
+ "grad_norm": 2.9602572917938232,
+ "learning_rate": 2.8265731829735226e-06,
+ "loss": 0.4571,
+ "step": 5870
+ },
+ {
+ "epoch": 2.775886524822695,
+ "grad_norm": 2.774132013320923,
+ "learning_rate": 2.825954691231851e-06,
+ "loss": 0.4458,
+ "step": 5871
+ },
+ {
+ "epoch": 2.7763593380614657,
+ "grad_norm": 2.696622133255005,
+ "learning_rate": 2.825336179194409e-06,
+ "loss": 0.4933,
+ "step": 5872
+ },
+ {
+ "epoch": 2.7768321513002365,
+ "grad_norm": 2.742184638977051,
+ "learning_rate": 2.8247176468997096e-06,
+ "loss": 0.4464,
+ "step": 5873
+ },
+ {
+ "epoch": 2.777304964539007,
+ "grad_norm": 2.7033183574676514,
+ "learning_rate": 2.824099094386266e-06,
+ "loss": 0.4369,
+ "step": 5874
+ },
+ {
+ "epoch": 2.7777777777777777,
+ "grad_norm": 2.7264044284820557,
+ "learning_rate": 2.8234805216925935e-06,
+ "loss": 0.4621,
+ "step": 5875
+ },
+ {
+ "epoch": 2.7782505910165485,
+ "grad_norm": 2.6417739391326904,
+ "learning_rate": 2.822861928857208e-06,
+ "loss": 0.4254,
+ "step": 5876
+ },
+ {
+ "epoch": 2.7787234042553193,
+ "grad_norm": 3.17209529876709,
+ "learning_rate": 2.8222433159186245e-06,
+ "loss": 0.5011,
+ "step": 5877
+ },
+ {
+ "epoch": 2.77919621749409,
+ "grad_norm": 3.1434381008148193,
+ "learning_rate": 2.8216246829153633e-06,
+ "loss": 0.4567,
+ "step": 5878
+ },
+ {
+ "epoch": 2.7796690307328604,
+ "grad_norm": 2.781608819961548,
+ "learning_rate": 2.821006029885943e-06,
+ "loss": 0.4723,
+ "step": 5879
+ },
+ {
+ "epoch": 2.780141843971631,
+ "grad_norm": 3.00079345703125,
+ "learning_rate": 2.820387356868885e-06,
+ "loss": 0.4796,
+ "step": 5880
+ },
+ {
+ "epoch": 2.780614657210402,
+ "grad_norm": 2.703555107116699,
+ "learning_rate": 2.819768663902712e-06,
+ "loss": 0.4577,
+ "step": 5881
+ },
+ {
+ "epoch": 2.7810874704491724,
+ "grad_norm": 2.5741801261901855,
+ "learning_rate": 2.8191499510259453e-06,
+ "loss": 0.4255,
+ "step": 5882
+ },
+ {
+ "epoch": 2.781560283687943,
+ "grad_norm": 2.9871208667755127,
+ "learning_rate": 2.8185312182771112e-06,
+ "loss": 0.4495,
+ "step": 5883
+ },
+ {
+ "epoch": 2.782033096926714,
+ "grad_norm": 2.525317668914795,
+ "learning_rate": 2.8179124656947343e-06,
+ "loss": 0.4428,
+ "step": 5884
+ },
+ {
+ "epoch": 2.7825059101654848,
+ "grad_norm": 2.525092840194702,
+ "learning_rate": 2.817293693317343e-06,
+ "loss": 0.4348,
+ "step": 5885
+ },
+ {
+ "epoch": 2.7829787234042556,
+ "grad_norm": 2.8485171794891357,
+ "learning_rate": 2.816674901183464e-06,
+ "loss": 0.4206,
+ "step": 5886
+ },
+ {
+ "epoch": 2.783451536643026,
+ "grad_norm": 2.6612746715545654,
+ "learning_rate": 2.8160560893316272e-06,
+ "loss": 0.396,
+ "step": 5887
+ },
+ {
+ "epoch": 2.7839243498817967,
+ "grad_norm": 2.7093865871429443,
+ "learning_rate": 2.815437257800364e-06,
+ "loss": 0.4468,
+ "step": 5888
+ },
+ {
+ "epoch": 2.7843971631205675,
+ "grad_norm": 2.6130900382995605,
+ "learning_rate": 2.814818406628206e-06,
+ "loss": 0.443,
+ "step": 5889
+ },
+ {
+ "epoch": 2.784869976359338,
+ "grad_norm": 2.8147552013397217,
+ "learning_rate": 2.8141995358536866e-06,
+ "loss": 0.4454,
+ "step": 5890
+ },
+ {
+ "epoch": 2.7853427895981087,
+ "grad_norm": 2.5621275901794434,
+ "learning_rate": 2.8135806455153395e-06,
+ "loss": 0.439,
+ "step": 5891
+ },
+ {
+ "epoch": 2.7858156028368795,
+ "grad_norm": 2.880228281021118,
+ "learning_rate": 2.812961735651701e-06,
+ "loss": 0.3895,
+ "step": 5892
+ },
+ {
+ "epoch": 2.7862884160756503,
+ "grad_norm": 2.5861377716064453,
+ "learning_rate": 2.8123428063013068e-06,
+ "loss": 0.4402,
+ "step": 5893
+ },
+ {
+ "epoch": 2.786761229314421,
+ "grad_norm": 2.9707765579223633,
+ "learning_rate": 2.811723857502696e-06,
+ "loss": 0.4461,
+ "step": 5894
+ },
+ {
+ "epoch": 2.7872340425531914,
+ "grad_norm": 2.923999309539795,
+ "learning_rate": 2.811104889294408e-06,
+ "loss": 0.4395,
+ "step": 5895
+ },
+ {
+ "epoch": 2.787706855791962,
+ "grad_norm": 2.846933603286743,
+ "learning_rate": 2.810485901714981e-06,
+ "loss": 0.5168,
+ "step": 5896
+ },
+ {
+ "epoch": 2.788179669030733,
+ "grad_norm": 4.1052350997924805,
+ "learning_rate": 2.8098668948029597e-06,
+ "loss": 0.5152,
+ "step": 5897
+ },
+ {
+ "epoch": 2.7886524822695034,
+ "grad_norm": 2.7391018867492676,
+ "learning_rate": 2.8092478685968856e-06,
+ "loss": 0.4515,
+ "step": 5898
+ },
+ {
+ "epoch": 2.789125295508274,
+ "grad_norm": 2.976088285446167,
+ "learning_rate": 2.8086288231353027e-06,
+ "loss": 0.5156,
+ "step": 5899
+ },
+ {
+ "epoch": 2.789598108747045,
+ "grad_norm": 2.6139633655548096,
+ "learning_rate": 2.8080097584567562e-06,
+ "loss": 0.4237,
+ "step": 5900
+ },
+ {
+ "epoch": 2.7900709219858157,
+ "grad_norm": 2.501654624938965,
+ "learning_rate": 2.807390674599792e-06,
+ "loss": 0.4349,
+ "step": 5901
+ },
+ {
+ "epoch": 2.7905437352245865,
+ "grad_norm": 2.8814525604248047,
+ "learning_rate": 2.8067715716029586e-06,
+ "loss": 0.4866,
+ "step": 5902
+ },
+ {
+ "epoch": 2.791016548463357,
+ "grad_norm": 2.7953200340270996,
+ "learning_rate": 2.8061524495048046e-06,
+ "loss": 0.3964,
+ "step": 5903
+ },
+ {
+ "epoch": 2.7914893617021277,
+ "grad_norm": 2.7362849712371826,
+ "learning_rate": 2.8055333083438808e-06,
+ "loss": 0.4181,
+ "step": 5904
+ },
+ {
+ "epoch": 2.7919621749408985,
+ "grad_norm": 2.9740512371063232,
+ "learning_rate": 2.8049141481587366e-06,
+ "loss": 0.4784,
+ "step": 5905
+ },
+ {
+ "epoch": 2.792434988179669,
+ "grad_norm": 2.595813274383545,
+ "learning_rate": 2.8042949689879262e-06,
+ "loss": 0.4421,
+ "step": 5906
+ },
+ {
+ "epoch": 2.7929078014184396,
+ "grad_norm": 2.886899948120117,
+ "learning_rate": 2.803675770870002e-06,
+ "loss": 0.4435,
+ "step": 5907
+ },
+ {
+ "epoch": 2.7933806146572104,
+ "grad_norm": 2.6057486534118652,
+ "learning_rate": 2.8030565538435196e-06,
+ "loss": 0.4472,
+ "step": 5908
+ },
+ {
+ "epoch": 2.7938534278959812,
+ "grad_norm": 2.7422802448272705,
+ "learning_rate": 2.802437317947034e-06,
+ "loss": 0.4799,
+ "step": 5909
+ },
+ {
+ "epoch": 2.794326241134752,
+ "grad_norm": 2.3904244899749756,
+ "learning_rate": 2.801818063219102e-06,
+ "loss": 0.4508,
+ "step": 5910
+ },
+ {
+ "epoch": 2.7947990543735224,
+ "grad_norm": 2.8434207439422607,
+ "learning_rate": 2.8011987896982835e-06,
+ "loss": 0.4473,
+ "step": 5911
+ },
+ {
+ "epoch": 2.795271867612293,
+ "grad_norm": 2.916088819503784,
+ "learning_rate": 2.8005794974231366e-06,
+ "loss": 0.464,
+ "step": 5912
+ },
+ {
+ "epoch": 2.795744680851064,
+ "grad_norm": 2.6483397483825684,
+ "learning_rate": 2.7999601864322236e-06,
+ "loss": 0.441,
+ "step": 5913
+ },
+ {
+ "epoch": 2.7962174940898343,
+ "grad_norm": 2.9287428855895996,
+ "learning_rate": 2.7993408567641033e-06,
+ "loss": 0.4551,
+ "step": 5914
+ },
+ {
+ "epoch": 2.796690307328605,
+ "grad_norm": 2.575024127960205,
+ "learning_rate": 2.798721508457342e-06,
+ "loss": 0.4494,
+ "step": 5915
+ },
+ {
+ "epoch": 2.797163120567376,
+ "grad_norm": 2.7156829833984375,
+ "learning_rate": 2.7981021415505015e-06,
+ "loss": 0.419,
+ "step": 5916
+ },
+ {
+ "epoch": 2.7976359338061467,
+ "grad_norm": 2.850553035736084,
+ "learning_rate": 2.7974827560821482e-06,
+ "loss": 0.4709,
+ "step": 5917
+ },
+ {
+ "epoch": 2.7981087470449175,
+ "grad_norm": 2.673846483230591,
+ "learning_rate": 2.796863352090847e-06,
+ "loss": 0.4224,
+ "step": 5918
+ },
+ {
+ "epoch": 2.798581560283688,
+ "grad_norm": 2.9093217849731445,
+ "learning_rate": 2.796243929615168e-06,
+ "loss": 0.468,
+ "step": 5919
+ },
+ {
+ "epoch": 2.7990543735224587,
+ "grad_norm": 2.4853813648223877,
+ "learning_rate": 2.7956244886936775e-06,
+ "loss": 0.4723,
+ "step": 5920
+ },
+ {
+ "epoch": 2.7995271867612295,
+ "grad_norm": 3.026428461074829,
+ "learning_rate": 2.795005029364946e-06,
+ "loss": 0.4721,
+ "step": 5921
+ },
+ {
+ "epoch": 2.8,
+ "grad_norm": 2.886295795440674,
+ "learning_rate": 2.794385551667546e-06,
+ "loss": 0.456,
+ "step": 5922
+ },
+ {
+ "epoch": 2.8004728132387706,
+ "grad_norm": 3.2260656356811523,
+ "learning_rate": 2.7937660556400486e-06,
+ "loss": 0.4499,
+ "step": 5923
+ },
+ {
+ "epoch": 2.8009456264775414,
+ "grad_norm": 2.7971982955932617,
+ "learning_rate": 2.793146541321027e-06,
+ "loss": 0.3982,
+ "step": 5924
+ },
+ {
+ "epoch": 2.801418439716312,
+ "grad_norm": 2.85461163520813,
+ "learning_rate": 2.7925270087490546e-06,
+ "loss": 0.4841,
+ "step": 5925
+ },
+ {
+ "epoch": 2.801891252955083,
+ "grad_norm": 3.0642316341400146,
+ "learning_rate": 2.7919074579627086e-06,
+ "loss": 0.4538,
+ "step": 5926
+ },
+ {
+ "epoch": 2.8023640661938534,
+ "grad_norm": 2.9053616523742676,
+ "learning_rate": 2.7912878890005657e-06,
+ "loss": 0.434,
+ "step": 5927
+ },
+ {
+ "epoch": 2.802836879432624,
+ "grad_norm": 2.7649240493774414,
+ "learning_rate": 2.7906683019012027e-06,
+ "loss": 0.414,
+ "step": 5928
+ },
+ {
+ "epoch": 2.803309692671395,
+ "grad_norm": 2.8717660903930664,
+ "learning_rate": 2.7900486967031987e-06,
+ "loss": 0.4337,
+ "step": 5929
+ },
+ {
+ "epoch": 2.8037825059101653,
+ "grad_norm": 2.6860995292663574,
+ "learning_rate": 2.789429073445135e-06,
+ "loss": 0.447,
+ "step": 5930
+ },
+ {
+ "epoch": 2.804255319148936,
+ "grad_norm": 2.67509126663208,
+ "learning_rate": 2.7888094321655918e-06,
+ "loss": 0.4955,
+ "step": 5931
+ },
+ {
+ "epoch": 2.804728132387707,
+ "grad_norm": 2.7426326274871826,
+ "learning_rate": 2.7881897729031514e-06,
+ "loss": 0.4564,
+ "step": 5932
+ },
+ {
+ "epoch": 2.8052009456264777,
+ "grad_norm": 2.7087252140045166,
+ "learning_rate": 2.7875700956963973e-06,
+ "loss": 0.4571,
+ "step": 5933
+ },
+ {
+ "epoch": 2.8056737588652485,
+ "grad_norm": 2.513526439666748,
+ "learning_rate": 2.7869504005839147e-06,
+ "loss": 0.4361,
+ "step": 5934
+ },
+ {
+ "epoch": 2.806146572104019,
+ "grad_norm": 3.2246084213256836,
+ "learning_rate": 2.7863306876042885e-06,
+ "loss": 0.4612,
+ "step": 5935
+ },
+ {
+ "epoch": 2.8066193853427897,
+ "grad_norm": 3.226325511932373,
+ "learning_rate": 2.7857109567961066e-06,
+ "loss": 0.4528,
+ "step": 5936
+ },
+ {
+ "epoch": 2.8070921985815604,
+ "grad_norm": 2.8861422538757324,
+ "learning_rate": 2.785091208197956e-06,
+ "loss": 0.5049,
+ "step": 5937
+ },
+ {
+ "epoch": 2.807565011820331,
+ "grad_norm": 2.76279616355896,
+ "learning_rate": 2.7844714418484257e-06,
+ "loss": 0.4714,
+ "step": 5938
+ },
+ {
+ "epoch": 2.8080378250591016,
+ "grad_norm": 2.9591920375823975,
+ "learning_rate": 2.7838516577861063e-06,
+ "loss": 0.4633,
+ "step": 5939
+ },
+ {
+ "epoch": 2.8085106382978724,
+ "grad_norm": 2.536916971206665,
+ "learning_rate": 2.7832318560495885e-06,
+ "loss": 0.4108,
+ "step": 5940
+ },
+ {
+ "epoch": 2.808983451536643,
+ "grad_norm": 3.2484991550445557,
+ "learning_rate": 2.7826120366774657e-06,
+ "loss": 0.4888,
+ "step": 5941
+ },
+ {
+ "epoch": 2.8094562647754135,
+ "grad_norm": 2.7129359245300293,
+ "learning_rate": 2.781992199708329e-06,
+ "loss": 0.4008,
+ "step": 5942
+ },
+ {
+ "epoch": 2.8099290780141843,
+ "grad_norm": 2.4176113605499268,
+ "learning_rate": 2.781372345180776e-06,
+ "loss": 0.3864,
+ "step": 5943
+ },
+ {
+ "epoch": 2.810401891252955,
+ "grad_norm": 2.6557252407073975,
+ "learning_rate": 2.7807524731334e-06,
+ "loss": 0.4295,
+ "step": 5944
+ },
+ {
+ "epoch": 2.8108747044917255,
+ "grad_norm": 2.9191324710845947,
+ "learning_rate": 2.7801325836047993e-06,
+ "loss": 0.4854,
+ "step": 5945
+ },
+ {
+ "epoch": 2.8113475177304963,
+ "grad_norm": 2.6325371265411377,
+ "learning_rate": 2.7795126766335705e-06,
+ "loss": 0.4332,
+ "step": 5946
+ },
+ {
+ "epoch": 2.811820330969267,
+ "grad_norm": 2.658337116241455,
+ "learning_rate": 2.778892752258314e-06,
+ "loss": 0.4276,
+ "step": 5947
+ },
+ {
+ "epoch": 2.812293144208038,
+ "grad_norm": 2.763782262802124,
+ "learning_rate": 2.778272810517627e-06,
+ "loss": 0.4246,
+ "step": 5948
+ },
+ {
+ "epoch": 2.8127659574468087,
+ "grad_norm": 2.407607078552246,
+ "learning_rate": 2.777652851450113e-06,
+ "loss": 0.3788,
+ "step": 5949
+ },
+ {
+ "epoch": 2.813238770685579,
+ "grad_norm": 3.0339951515197754,
+ "learning_rate": 2.7770328750943736e-06,
+ "loss": 0.477,
+ "step": 5950
+ },
+ {
+ "epoch": 2.81371158392435,
+ "grad_norm": 2.3475773334503174,
+ "learning_rate": 2.776412881489012e-06,
+ "loss": 0.4206,
+ "step": 5951
+ },
+ {
+ "epoch": 2.8141843971631206,
+ "grad_norm": 3.0455260276794434,
+ "learning_rate": 2.7757928706726318e-06,
+ "loss": 0.4301,
+ "step": 5952
+ },
+ {
+ "epoch": 2.814657210401891,
+ "grad_norm": 2.803920030593872,
+ "learning_rate": 2.7751728426838386e-06,
+ "loss": 0.3738,
+ "step": 5953
+ },
+ {
+ "epoch": 2.815130023640662,
+ "grad_norm": 3.1083319187164307,
+ "learning_rate": 2.77455279756124e-06,
+ "loss": 0.5365,
+ "step": 5954
+ },
+ {
+ "epoch": 2.8156028368794326,
+ "grad_norm": 3.180809497833252,
+ "learning_rate": 2.7739327353434427e-06,
+ "loss": 0.4789,
+ "step": 5955
+ },
+ {
+ "epoch": 2.8160756501182034,
+ "grad_norm": 2.975043773651123,
+ "learning_rate": 2.7733126560690543e-06,
+ "loss": 0.4798,
+ "step": 5956
+ },
+ {
+ "epoch": 2.816548463356974,
+ "grad_norm": 2.765475034713745,
+ "learning_rate": 2.772692559776685e-06,
+ "loss": 0.4206,
+ "step": 5957
+ },
+ {
+ "epoch": 2.8170212765957445,
+ "grad_norm": 2.48612380027771,
+ "learning_rate": 2.7720724465049463e-06,
+ "loss": 0.4234,
+ "step": 5958
+ },
+ {
+ "epoch": 2.8174940898345153,
+ "grad_norm": 2.7145729064941406,
+ "learning_rate": 2.77145231629245e-06,
+ "loss": 0.4713,
+ "step": 5959
+ },
+ {
+ "epoch": 2.817966903073286,
+ "grad_norm": 2.5993762016296387,
+ "learning_rate": 2.7708321691778074e-06,
+ "loss": 0.4144,
+ "step": 5960
+ },
+ {
+ "epoch": 2.8184397163120565,
+ "grad_norm": 3.0902538299560547,
+ "learning_rate": 2.770212005199633e-06,
+ "loss": 0.4822,
+ "step": 5961
+ },
+ {
+ "epoch": 2.8189125295508273,
+ "grad_norm": 2.849757671356201,
+ "learning_rate": 2.7695918243965424e-06,
+ "loss": 0.4449,
+ "step": 5962
+ },
+ {
+ "epoch": 2.819385342789598,
+ "grad_norm": 2.77148699760437,
+ "learning_rate": 2.768971626807151e-06,
+ "loss": 0.4448,
+ "step": 5963
+ },
+ {
+ "epoch": 2.819858156028369,
+ "grad_norm": 2.7865898609161377,
+ "learning_rate": 2.7683514124700757e-06,
+ "loss": 0.4944,
+ "step": 5964
+ },
+ {
+ "epoch": 2.8203309692671397,
+ "grad_norm": 2.9057955741882324,
+ "learning_rate": 2.767731181423934e-06,
+ "loss": 0.5074,
+ "step": 5965
+ },
+ {
+ "epoch": 2.82080378250591,
+ "grad_norm": 2.725837469100952,
+ "learning_rate": 2.7671109337073465e-06,
+ "loss": 0.4207,
+ "step": 5966
+ },
+ {
+ "epoch": 2.821276595744681,
+ "grad_norm": 3.078531265258789,
+ "learning_rate": 2.7664906693589315e-06,
+ "loss": 0.4835,
+ "step": 5967
+ },
+ {
+ "epoch": 2.8217494089834516,
+ "grad_norm": 2.8692002296447754,
+ "learning_rate": 2.765870388417312e-06,
+ "loss": 0.4284,
+ "step": 5968
+ },
+ {
+ "epoch": 2.822222222222222,
+ "grad_norm": 2.8519723415374756,
+ "learning_rate": 2.765250090921109e-06,
+ "loss": 0.541,
+ "step": 5969
+ },
+ {
+ "epoch": 2.8226950354609928,
+ "grad_norm": 3.2037532329559326,
+ "learning_rate": 2.7646297769089457e-06,
+ "loss": 0.4276,
+ "step": 5970
+ },
+ {
+ "epoch": 2.8231678486997636,
+ "grad_norm": 2.8637137413024902,
+ "learning_rate": 2.7640094464194468e-06,
+ "loss": 0.4904,
+ "step": 5971
+ },
+ {
+ "epoch": 2.8236406619385344,
+ "grad_norm": 2.681516408920288,
+ "learning_rate": 2.7633890994912372e-06,
+ "loss": 0.4942,
+ "step": 5972
+ },
+ {
+ "epoch": 2.824113475177305,
+ "grad_norm": 3.0035219192504883,
+ "learning_rate": 2.7627687361629434e-06,
+ "loss": 0.4556,
+ "step": 5973
+ },
+ {
+ "epoch": 2.8245862884160755,
+ "grad_norm": 2.8107759952545166,
+ "learning_rate": 2.7621483564731923e-06,
+ "loss": 0.4225,
+ "step": 5974
+ },
+ {
+ "epoch": 2.8250591016548463,
+ "grad_norm": 2.87276029586792,
+ "learning_rate": 2.7615279604606126e-06,
+ "loss": 0.5045,
+ "step": 5975
+ },
+ {
+ "epoch": 2.825531914893617,
+ "grad_norm": 2.687953233718872,
+ "learning_rate": 2.760907548163833e-06,
+ "loss": 0.4018,
+ "step": 5976
+ },
+ {
+ "epoch": 2.8260047281323875,
+ "grad_norm": 2.587979555130005,
+ "learning_rate": 2.760287119621486e-06,
+ "loss": 0.4407,
+ "step": 5977
+ },
+ {
+ "epoch": 2.8264775413711583,
+ "grad_norm": 2.805602550506592,
+ "learning_rate": 2.7596666748722e-06,
+ "loss": 0.4559,
+ "step": 5978
+ },
+ {
+ "epoch": 2.826950354609929,
+ "grad_norm": 2.320763111114502,
+ "learning_rate": 2.759046213954609e-06,
+ "loss": 0.3847,
+ "step": 5979
+ },
+ {
+ "epoch": 2.8274231678487,
+ "grad_norm": 2.6876401901245117,
+ "learning_rate": 2.758425736907347e-06,
+ "loss": 0.4528,
+ "step": 5980
+ },
+ {
+ "epoch": 2.8278959810874706,
+ "grad_norm": 2.6852915287017822,
+ "learning_rate": 2.757805243769046e-06,
+ "loss": 0.395,
+ "step": 5981
+ },
+ {
+ "epoch": 2.828368794326241,
+ "grad_norm": 2.808326005935669,
+ "learning_rate": 2.7571847345783447e-06,
+ "loss": 0.4647,
+ "step": 5982
+ },
+ {
+ "epoch": 2.828841607565012,
+ "grad_norm": 2.641479015350342,
+ "learning_rate": 2.7565642093738766e-06,
+ "loss": 0.3798,
+ "step": 5983
+ },
+ {
+ "epoch": 2.8293144208037826,
+ "grad_norm": 2.8066110610961914,
+ "learning_rate": 2.7559436681942803e-06,
+ "loss": 0.5072,
+ "step": 5984
+ },
+ {
+ "epoch": 2.829787234042553,
+ "grad_norm": 2.898375988006592,
+ "learning_rate": 2.7553231110781936e-06,
+ "loss": 0.5182,
+ "step": 5985
+ },
+ {
+ "epoch": 2.8302600472813237,
+ "grad_norm": 2.704890489578247,
+ "learning_rate": 2.7547025380642574e-06,
+ "loss": 0.3999,
+ "step": 5986
+ },
+ {
+ "epoch": 2.8307328605200945,
+ "grad_norm": 2.6024270057678223,
+ "learning_rate": 2.7540819491911106e-06,
+ "loss": 0.4302,
+ "step": 5987
+ },
+ {
+ "epoch": 2.8312056737588653,
+ "grad_norm": 2.8006081581115723,
+ "learning_rate": 2.7534613444973946e-06,
+ "loss": 0.4492,
+ "step": 5988
+ },
+ {
+ "epoch": 2.831678486997636,
+ "grad_norm": 2.9532058238983154,
+ "learning_rate": 2.752840724021752e-06,
+ "loss": 0.4552,
+ "step": 5989
+ },
+ {
+ "epoch": 2.8321513002364065,
+ "grad_norm": 3.1830217838287354,
+ "learning_rate": 2.7522200878028265e-06,
+ "loss": 0.5013,
+ "step": 5990
+ },
+ {
+ "epoch": 2.8326241134751773,
+ "grad_norm": 2.716176748275757,
+ "learning_rate": 2.7515994358792624e-06,
+ "loss": 0.4569,
+ "step": 5991
+ },
+ {
+ "epoch": 2.833096926713948,
+ "grad_norm": 2.6852715015411377,
+ "learning_rate": 2.7509787682897044e-06,
+ "loss": 0.4764,
+ "step": 5992
+ },
+ {
+ "epoch": 2.8335697399527184,
+ "grad_norm": 2.9383316040039062,
+ "learning_rate": 2.7503580850727985e-06,
+ "loss": 0.5205,
+ "step": 5993
+ },
+ {
+ "epoch": 2.8340425531914892,
+ "grad_norm": 2.703132152557373,
+ "learning_rate": 2.749737386267193e-06,
+ "loss": 0.4543,
+ "step": 5994
+ },
+ {
+ "epoch": 2.83451536643026,
+ "grad_norm": 2.4304885864257812,
+ "learning_rate": 2.7491166719115354e-06,
+ "loss": 0.4479,
+ "step": 5995
+ },
+ {
+ "epoch": 2.834988179669031,
+ "grad_norm": 2.975722551345825,
+ "learning_rate": 2.748495942044475e-06,
+ "loss": 0.4074,
+ "step": 5996
+ },
+ {
+ "epoch": 2.8354609929078016,
+ "grad_norm": 3.440208911895752,
+ "learning_rate": 2.7478751967046617e-06,
+ "loss": 0.4497,
+ "step": 5997
+ },
+ {
+ "epoch": 2.835933806146572,
+ "grad_norm": 2.734673261642456,
+ "learning_rate": 2.747254435930747e-06,
+ "loss": 0.437,
+ "step": 5998
+ },
+ {
+ "epoch": 2.8364066193853428,
+ "grad_norm": 3.1918959617614746,
+ "learning_rate": 2.7466336597613826e-06,
+ "loss": 0.4197,
+ "step": 5999
+ },
+ {
+ "epoch": 2.8368794326241136,
+ "grad_norm": 3.1440329551696777,
+ "learning_rate": 2.7460128682352216e-06,
+ "loss": 0.4425,
+ "step": 6000
+ },
+ {
+ "epoch": 2.837352245862884,
+ "grad_norm": 2.582993507385254,
+ "learning_rate": 2.7453920613909183e-06,
+ "loss": 0.4475,
+ "step": 6001
+ },
+ {
+ "epoch": 2.8378250591016547,
+ "grad_norm": 3.2682149410247803,
+ "learning_rate": 2.744771239267128e-06,
+ "loss": 0.4615,
+ "step": 6002
+ },
+ {
+ "epoch": 2.8382978723404255,
+ "grad_norm": 2.848477840423584,
+ "learning_rate": 2.7441504019025046e-06,
+ "loss": 0.4093,
+ "step": 6003
+ },
+ {
+ "epoch": 2.8387706855791963,
+ "grad_norm": 2.3582282066345215,
+ "learning_rate": 2.7435295493357067e-06,
+ "loss": 0.3911,
+ "step": 6004
+ },
+ {
+ "epoch": 2.839243498817967,
+ "grad_norm": 2.7707207202911377,
+ "learning_rate": 2.742908681605392e-06,
+ "loss": 0.4069,
+ "step": 6005
+ },
+ {
+ "epoch": 2.8397163120567375,
+ "grad_norm": 3.0763752460479736,
+ "learning_rate": 2.7422877987502183e-06,
+ "loss": 0.512,
+ "step": 6006
+ },
+ {
+ "epoch": 2.8401891252955083,
+ "grad_norm": 2.8027124404907227,
+ "learning_rate": 2.741666900808846e-06,
+ "loss": 0.4922,
+ "step": 6007
+ },
+ {
+ "epoch": 2.840661938534279,
+ "grad_norm": 2.487982988357544,
+ "learning_rate": 2.7410459878199353e-06,
+ "loss": 0.4368,
+ "step": 6008
+ },
+ {
+ "epoch": 2.8411347517730494,
+ "grad_norm": 2.8727993965148926,
+ "learning_rate": 2.7404250598221484e-06,
+ "loss": 0.4639,
+ "step": 6009
+ },
+ {
+ "epoch": 2.84160756501182,
+ "grad_norm": 2.5556678771972656,
+ "learning_rate": 2.739804116854147e-06,
+ "loss": 0.4217,
+ "step": 6010
+ },
+ {
+ "epoch": 2.842080378250591,
+ "grad_norm": 2.6306912899017334,
+ "learning_rate": 2.7391831589545948e-06,
+ "loss": 0.4816,
+ "step": 6011
+ },
+ {
+ "epoch": 2.842553191489362,
+ "grad_norm": 2.7340946197509766,
+ "learning_rate": 2.7385621861621557e-06,
+ "loss": 0.4113,
+ "step": 6012
+ },
+ {
+ "epoch": 2.8430260047281326,
+ "grad_norm": 2.834190607070923,
+ "learning_rate": 2.737941198515495e-06,
+ "loss": 0.4691,
+ "step": 6013
+ },
+ {
+ "epoch": 2.843498817966903,
+ "grad_norm": 2.7139697074890137,
+ "learning_rate": 2.737320196053281e-06,
+ "loss": 0.3798,
+ "step": 6014
+ },
+ {
+ "epoch": 2.8439716312056738,
+ "grad_norm": 2.7934985160827637,
+ "learning_rate": 2.736699178814177e-06,
+ "loss": 0.446,
+ "step": 6015
+ },
+ {
+ "epoch": 2.8444444444444446,
+ "grad_norm": 2.6941518783569336,
+ "learning_rate": 2.7360781468368534e-06,
+ "loss": 0.4787,
+ "step": 6016
+ },
+ {
+ "epoch": 2.844917257683215,
+ "grad_norm": 3.1530468463897705,
+ "learning_rate": 2.7354571001599792e-06,
+ "loss": 0.474,
+ "step": 6017
+ },
+ {
+ "epoch": 2.8453900709219857,
+ "grad_norm": 2.613875389099121,
+ "learning_rate": 2.7348360388222243e-06,
+ "loss": 0.4297,
+ "step": 6018
+ },
+ {
+ "epoch": 2.8458628841607565,
+ "grad_norm": 2.5481486320495605,
+ "learning_rate": 2.7342149628622587e-06,
+ "loss": 0.3762,
+ "step": 6019
+ },
+ {
+ "epoch": 2.8463356973995273,
+ "grad_norm": 2.6425609588623047,
+ "learning_rate": 2.7335938723187544e-06,
+ "loss": 0.4077,
+ "step": 6020
+ },
+ {
+ "epoch": 2.846808510638298,
+ "grad_norm": 2.6281731128692627,
+ "learning_rate": 2.7329727672303836e-06,
+ "loss": 0.466,
+ "step": 6021
+ },
+ {
+ "epoch": 2.8472813238770684,
+ "grad_norm": 2.8862180709838867,
+ "learning_rate": 2.7323516476358197e-06,
+ "loss": 0.4191,
+ "step": 6022
+ },
+ {
+ "epoch": 2.8477541371158392,
+ "grad_norm": 2.907731533050537,
+ "learning_rate": 2.7317305135737383e-06,
+ "loss": 0.4867,
+ "step": 6023
+ },
+ {
+ "epoch": 2.84822695035461,
+ "grad_norm": 2.825593948364258,
+ "learning_rate": 2.731109365082814e-06,
+ "loss": 0.4888,
+ "step": 6024
+ },
+ {
+ "epoch": 2.8486997635933804,
+ "grad_norm": 2.478163003921509,
+ "learning_rate": 2.730488202201722e-06,
+ "loss": 0.4714,
+ "step": 6025
+ },
+ {
+ "epoch": 2.849172576832151,
+ "grad_norm": 2.928899049758911,
+ "learning_rate": 2.7298670249691418e-06,
+ "loss": 0.4671,
+ "step": 6026
+ },
+ {
+ "epoch": 2.849645390070922,
+ "grad_norm": 2.778256893157959,
+ "learning_rate": 2.7292458334237488e-06,
+ "loss": 0.429,
+ "step": 6027
+ },
+ {
+ "epoch": 2.850118203309693,
+ "grad_norm": 3.0689055919647217,
+ "learning_rate": 2.7286246276042234e-06,
+ "loss": 0.4727,
+ "step": 6028
+ },
+ {
+ "epoch": 2.8505910165484636,
+ "grad_norm": 2.582066774368286,
+ "learning_rate": 2.7280034075492447e-06,
+ "loss": 0.4025,
+ "step": 6029
+ },
+ {
+ "epoch": 2.851063829787234,
+ "grad_norm": 3.6679015159606934,
+ "learning_rate": 2.7273821732974936e-06,
+ "loss": 0.4856,
+ "step": 6030
+ },
+ {
+ "epoch": 2.8515366430260047,
+ "grad_norm": 2.7222588062286377,
+ "learning_rate": 2.7267609248876516e-06,
+ "loss": 0.4255,
+ "step": 6031
+ },
+ {
+ "epoch": 2.8520094562647755,
+ "grad_norm": 2.455038547515869,
+ "learning_rate": 2.726139662358401e-06,
+ "loss": 0.4234,
+ "step": 6032
+ },
+ {
+ "epoch": 2.852482269503546,
+ "grad_norm": 2.8277318477630615,
+ "learning_rate": 2.7255183857484253e-06,
+ "loss": 0.4146,
+ "step": 6033
+ },
+ {
+ "epoch": 2.8529550827423167,
+ "grad_norm": 2.523615837097168,
+ "learning_rate": 2.724897095096409e-06,
+ "loss": 0.4227,
+ "step": 6034
+ },
+ {
+ "epoch": 2.8534278959810875,
+ "grad_norm": 3.353646755218506,
+ "learning_rate": 2.724275790441036e-06,
+ "loss": 0.5041,
+ "step": 6035
+ },
+ {
+ "epoch": 2.8539007092198583,
+ "grad_norm": 2.753981828689575,
+ "learning_rate": 2.7236544718209934e-06,
+ "loss": 0.4646,
+ "step": 6036
+ },
+ {
+ "epoch": 2.854373522458629,
+ "grad_norm": 2.954744577407837,
+ "learning_rate": 2.723033139274967e-06,
+ "loss": 0.5182,
+ "step": 6037
+ },
+ {
+ "epoch": 2.8548463356973994,
+ "grad_norm": 2.4814131259918213,
+ "learning_rate": 2.7224117928416462e-06,
+ "loss": 0.4626,
+ "step": 6038
+ },
+ {
+ "epoch": 2.8553191489361702,
+ "grad_norm": 2.7414886951446533,
+ "learning_rate": 2.721790432559717e-06,
+ "loss": 0.4111,
+ "step": 6039
+ },
+ {
+ "epoch": 2.855791962174941,
+ "grad_norm": 2.8743896484375,
+ "learning_rate": 2.7211690584678706e-06,
+ "loss": 0.4986,
+ "step": 6040
+ },
+ {
+ "epoch": 2.8562647754137114,
+ "grad_norm": 3.0691921710968018,
+ "learning_rate": 2.720547670604797e-06,
+ "loss": 0.4743,
+ "step": 6041
+ },
+ {
+ "epoch": 2.856737588652482,
+ "grad_norm": 2.7273411750793457,
+ "learning_rate": 2.7199262690091872e-06,
+ "loss": 0.4403,
+ "step": 6042
+ },
+ {
+ "epoch": 2.857210401891253,
+ "grad_norm": 2.8022944927215576,
+ "learning_rate": 2.7193048537197325e-06,
+ "loss": 0.4413,
+ "step": 6043
+ },
+ {
+ "epoch": 2.8576832151300238,
+ "grad_norm": 2.4883248805999756,
+ "learning_rate": 2.718683424775126e-06,
+ "loss": 0.4485,
+ "step": 6044
+ },
+ {
+ "epoch": 2.8581560283687946,
+ "grad_norm": 2.457249879837036,
+ "learning_rate": 2.718061982214062e-06,
+ "loss": 0.4167,
+ "step": 6045
+ },
+ {
+ "epoch": 2.858628841607565,
+ "grad_norm": 2.7210328578948975,
+ "learning_rate": 2.717440526075234e-06,
+ "loss": 0.4419,
+ "step": 6046
+ },
+ {
+ "epoch": 2.8591016548463357,
+ "grad_norm": 2.684483766555786,
+ "learning_rate": 2.7168190563973386e-06,
+ "loss": 0.4449,
+ "step": 6047
+ },
+ {
+ "epoch": 2.8595744680851065,
+ "grad_norm": 2.5305230617523193,
+ "learning_rate": 2.7161975732190706e-06,
+ "loss": 0.3829,
+ "step": 6048
+ },
+ {
+ "epoch": 2.860047281323877,
+ "grad_norm": 3.0284602642059326,
+ "learning_rate": 2.7155760765791278e-06,
+ "loss": 0.5164,
+ "step": 6049
+ },
+ {
+ "epoch": 2.8605200945626477,
+ "grad_norm": 3.154599189758301,
+ "learning_rate": 2.7149545665162085e-06,
+ "loss": 0.527,
+ "step": 6050
+ },
+ {
+ "epoch": 2.8609929078014185,
+ "grad_norm": 2.6798126697540283,
+ "learning_rate": 2.7143330430690113e-06,
+ "loss": 0.4379,
+ "step": 6051
+ },
+ {
+ "epoch": 2.8614657210401893,
+ "grad_norm": 2.9531302452087402,
+ "learning_rate": 2.7137115062762344e-06,
+ "loss": 0.4549,
+ "step": 6052
+ },
+ {
+ "epoch": 2.86193853427896,
+ "grad_norm": 2.779531240463257,
+ "learning_rate": 2.7130899561765787e-06,
+ "loss": 0.4037,
+ "step": 6053
+ },
+ {
+ "epoch": 2.8624113475177304,
+ "grad_norm": 2.786763906478882,
+ "learning_rate": 2.7124683928087466e-06,
+ "loss": 0.3986,
+ "step": 6054
+ },
+ {
+ "epoch": 2.862884160756501,
+ "grad_norm": 2.430415630340576,
+ "learning_rate": 2.7118468162114385e-06,
+ "loss": 0.4402,
+ "step": 6055
+ },
+ {
+ "epoch": 2.863356973995272,
+ "grad_norm": 3.027268409729004,
+ "learning_rate": 2.7112252264233596e-06,
+ "loss": 0.4737,
+ "step": 6056
+ },
+ {
+ "epoch": 2.8638297872340424,
+ "grad_norm": 3.024935483932495,
+ "learning_rate": 2.710603623483211e-06,
+ "loss": 0.3997,
+ "step": 6057
+ },
+ {
+ "epoch": 2.864302600472813,
+ "grad_norm": 2.8862195014953613,
+ "learning_rate": 2.7099820074296985e-06,
+ "loss": 0.4896,
+ "step": 6058
+ },
+ {
+ "epoch": 2.864775413711584,
+ "grad_norm": 2.595579147338867,
+ "learning_rate": 2.709360378301527e-06,
+ "loss": 0.4387,
+ "step": 6059
+ },
+ {
+ "epoch": 2.8652482269503547,
+ "grad_norm": 2.8046188354492188,
+ "learning_rate": 2.708738736137403e-06,
+ "loss": 0.4726,
+ "step": 6060
+ },
+ {
+ "epoch": 2.8657210401891255,
+ "grad_norm": 3.040304660797119,
+ "learning_rate": 2.708117080976033e-06,
+ "loss": 0.4642,
+ "step": 6061
+ },
+ {
+ "epoch": 2.866193853427896,
+ "grad_norm": 2.618128538131714,
+ "learning_rate": 2.7074954128561248e-06,
+ "loss": 0.3171,
+ "step": 6062
+ },
+ {
+ "epoch": 2.8666666666666667,
+ "grad_norm": 2.7966055870056152,
+ "learning_rate": 2.706873731816387e-06,
+ "loss": 0.4893,
+ "step": 6063
+ },
+ {
+ "epoch": 2.8671394799054375,
+ "grad_norm": 2.9198038578033447,
+ "learning_rate": 2.706252037895529e-06,
+ "loss": 0.4428,
+ "step": 6064
+ },
+ {
+ "epoch": 2.867612293144208,
+ "grad_norm": 2.417705774307251,
+ "learning_rate": 2.7056303311322617e-06,
+ "loss": 0.3704,
+ "step": 6065
+ },
+ {
+ "epoch": 2.8680851063829786,
+ "grad_norm": 3.143918752670288,
+ "learning_rate": 2.7050086115652953e-06,
+ "loss": 0.5247,
+ "step": 6066
+ },
+ {
+ "epoch": 2.8685579196217494,
+ "grad_norm": 2.620781183242798,
+ "learning_rate": 2.704386879233341e-06,
+ "loss": 0.4131,
+ "step": 6067
+ },
+ {
+ "epoch": 2.8690307328605202,
+ "grad_norm": 2.6929845809936523,
+ "learning_rate": 2.703765134175112e-06,
+ "loss": 0.4833,
+ "step": 6068
+ },
+ {
+ "epoch": 2.869503546099291,
+ "grad_norm": 2.695920944213867,
+ "learning_rate": 2.7031433764293214e-06,
+ "loss": 0.435,
+ "step": 6069
+ },
+ {
+ "epoch": 2.8699763593380614,
+ "grad_norm": 2.6184475421905518,
+ "learning_rate": 2.702521606034684e-06,
+ "loss": 0.3898,
+ "step": 6070
+ },
+ {
+ "epoch": 2.870449172576832,
+ "grad_norm": 3.130624532699585,
+ "learning_rate": 2.7018998230299136e-06,
+ "loss": 0.4934,
+ "step": 6071
+ },
+ {
+ "epoch": 2.870921985815603,
+ "grad_norm": 2.947936534881592,
+ "learning_rate": 2.701278027453727e-06,
+ "loss": 0.4167,
+ "step": 6072
+ },
+ {
+ "epoch": 2.8713947990543733,
+ "grad_norm": 2.389263391494751,
+ "learning_rate": 2.7006562193448406e-06,
+ "loss": 0.3854,
+ "step": 6073
+ },
+ {
+ "epoch": 2.871867612293144,
+ "grad_norm": 2.9040684700012207,
+ "learning_rate": 2.700034398741971e-06,
+ "loss": 0.4656,
+ "step": 6074
+ },
+ {
+ "epoch": 2.872340425531915,
+ "grad_norm": 2.8671910762786865,
+ "learning_rate": 2.6994125656838365e-06,
+ "loss": 0.4642,
+ "step": 6075
+ },
+ {
+ "epoch": 2.8728132387706857,
+ "grad_norm": 2.6957180500030518,
+ "learning_rate": 2.698790720209156e-06,
+ "loss": 0.4894,
+ "step": 6076
+ },
+ {
+ "epoch": 2.8732860520094565,
+ "grad_norm": 2.748342514038086,
+ "learning_rate": 2.698168862356648e-06,
+ "loss": 0.4552,
+ "step": 6077
+ },
+ {
+ "epoch": 2.873758865248227,
+ "grad_norm": 2.7459912300109863,
+ "learning_rate": 2.6975469921650344e-06,
+ "loss": 0.4244,
+ "step": 6078
+ },
+ {
+ "epoch": 2.8742316784869977,
+ "grad_norm": 2.515650987625122,
+ "learning_rate": 2.6969251096730366e-06,
+ "loss": 0.4178,
+ "step": 6079
+ },
+ {
+ "epoch": 2.8747044917257685,
+ "grad_norm": 2.747373342514038,
+ "learning_rate": 2.696303214919375e-06,
+ "loss": 0.4623,
+ "step": 6080
+ },
+ {
+ "epoch": 2.875177304964539,
+ "grad_norm": 2.72092604637146,
+ "learning_rate": 2.695681307942773e-06,
+ "loss": 0.4227,
+ "step": 6081
+ },
+ {
+ "epoch": 2.8756501182033096,
+ "grad_norm": 2.6925108432769775,
+ "learning_rate": 2.695059388781955e-06,
+ "loss": 0.3807,
+ "step": 6082
+ },
+ {
+ "epoch": 2.8761229314420804,
+ "grad_norm": 2.673546314239502,
+ "learning_rate": 2.6944374574756427e-06,
+ "loss": 0.424,
+ "step": 6083
+ },
+ {
+ "epoch": 2.876595744680851,
+ "grad_norm": 2.7018187046051025,
+ "learning_rate": 2.6938155140625636e-06,
+ "loss": 0.4367,
+ "step": 6084
+ },
+ {
+ "epoch": 2.877068557919622,
+ "grad_norm": 2.9420957565307617,
+ "learning_rate": 2.6931935585814416e-06,
+ "loss": 0.4223,
+ "step": 6085
+ },
+ {
+ "epoch": 2.8775413711583924,
+ "grad_norm": 2.6523385047912598,
+ "learning_rate": 2.6925715910710036e-06,
+ "loss": 0.4074,
+ "step": 6086
+ },
+ {
+ "epoch": 2.878014184397163,
+ "grad_norm": 2.6104063987731934,
+ "learning_rate": 2.691949611569978e-06,
+ "loss": 0.423,
+ "step": 6087
+ },
+ {
+ "epoch": 2.878486997635934,
+ "grad_norm": 2.6463685035705566,
+ "learning_rate": 2.691327620117091e-06,
+ "loss": 0.4354,
+ "step": 6088
+ },
+ {
+ "epoch": 2.8789598108747043,
+ "grad_norm": 2.5863583087921143,
+ "learning_rate": 2.6907056167510725e-06,
+ "loss": 0.4177,
+ "step": 6089
+ },
+ {
+ "epoch": 2.879432624113475,
+ "grad_norm": 2.6946942806243896,
+ "learning_rate": 2.690083601510651e-06,
+ "loss": 0.4176,
+ "step": 6090
+ },
+ {
+ "epoch": 2.879905437352246,
+ "grad_norm": 3.0649454593658447,
+ "learning_rate": 2.6894615744345575e-06,
+ "loss": 0.4827,
+ "step": 6091
+ },
+ {
+ "epoch": 2.8803782505910167,
+ "grad_norm": 2.6454906463623047,
+ "learning_rate": 2.6888395355615226e-06,
+ "loss": 0.4757,
+ "step": 6092
+ },
+ {
+ "epoch": 2.8808510638297875,
+ "grad_norm": 3.251805067062378,
+ "learning_rate": 2.688217484930278e-06,
+ "loss": 0.5651,
+ "step": 6093
+ },
+ {
+ "epoch": 2.881323877068558,
+ "grad_norm": 2.543999433517456,
+ "learning_rate": 2.687595422579555e-06,
+ "loss": 0.4196,
+ "step": 6094
+ },
+ {
+ "epoch": 2.8817966903073287,
+ "grad_norm": 3.1502909660339355,
+ "learning_rate": 2.686973348548088e-06,
+ "loss": 0.4376,
+ "step": 6095
+ },
+ {
+ "epoch": 2.8822695035460995,
+ "grad_norm": 2.7800376415252686,
+ "learning_rate": 2.686351262874611e-06,
+ "loss": 0.444,
+ "step": 6096
+ },
+ {
+ "epoch": 2.88274231678487,
+ "grad_norm": 3.1529603004455566,
+ "learning_rate": 2.685729165597858e-06,
+ "loss": 0.5137,
+ "step": 6097
+ },
+ {
+ "epoch": 2.8832151300236406,
+ "grad_norm": 2.6079602241516113,
+ "learning_rate": 2.685107056756564e-06,
+ "loss": 0.4213,
+ "step": 6098
+ },
+ {
+ "epoch": 2.8836879432624114,
+ "grad_norm": 2.8969249725341797,
+ "learning_rate": 2.6844849363894648e-06,
+ "loss": 0.4679,
+ "step": 6099
+ },
+ {
+ "epoch": 2.884160756501182,
+ "grad_norm": 2.5882437229156494,
+ "learning_rate": 2.6838628045352977e-06,
+ "loss": 0.3891,
+ "step": 6100
+ },
+ {
+ "epoch": 2.8846335697399526,
+ "grad_norm": 2.9458062648773193,
+ "learning_rate": 2.6832406612328007e-06,
+ "loss": 0.4802,
+ "step": 6101
+ },
+ {
+ "epoch": 2.8851063829787233,
+ "grad_norm": 2.8463058471679688,
+ "learning_rate": 2.6826185065207105e-06,
+ "loss": 0.4332,
+ "step": 6102
+ },
+ {
+ "epoch": 2.885579196217494,
+ "grad_norm": 2.8799285888671875,
+ "learning_rate": 2.6819963404377667e-06,
+ "loss": 0.4474,
+ "step": 6103
+ },
+ {
+ "epoch": 2.8860520094562645,
+ "grad_norm": 2.846860408782959,
+ "learning_rate": 2.681374163022709e-06,
+ "loss": 0.4317,
+ "step": 6104
+ },
+ {
+ "epoch": 2.8865248226950353,
+ "grad_norm": 2.7918877601623535,
+ "learning_rate": 2.6807519743142775e-06,
+ "loss": 0.4243,
+ "step": 6105
+ },
+ {
+ "epoch": 2.886997635933806,
+ "grad_norm": 2.9351487159729004,
+ "learning_rate": 2.6801297743512127e-06,
+ "loss": 0.5253,
+ "step": 6106
+ },
+ {
+ "epoch": 2.887470449172577,
+ "grad_norm": 2.9422426223754883,
+ "learning_rate": 2.6795075631722576e-06,
+ "loss": 0.4887,
+ "step": 6107
+ },
+ {
+ "epoch": 2.8879432624113477,
+ "grad_norm": 2.6837220191955566,
+ "learning_rate": 2.678885340816153e-06,
+ "loss": 0.4761,
+ "step": 6108
+ },
+ {
+ "epoch": 2.888416075650118,
+ "grad_norm": 2.6800777912139893,
+ "learning_rate": 2.6782631073216425e-06,
+ "loss": 0.4248,
+ "step": 6109
+ },
+ {
+ "epoch": 2.888888888888889,
+ "grad_norm": 2.9654436111450195,
+ "learning_rate": 2.6776408627274702e-06,
+ "loss": 0.487,
+ "step": 6110
+ },
+ {
+ "epoch": 2.8893617021276596,
+ "grad_norm": 2.7725181579589844,
+ "learning_rate": 2.6770186070723804e-06,
+ "loss": 0.4166,
+ "step": 6111
+ },
+ {
+ "epoch": 2.88983451536643,
+ "grad_norm": 2.6547815799713135,
+ "learning_rate": 2.676396340395118e-06,
+ "loss": 0.4039,
+ "step": 6112
+ },
+ {
+ "epoch": 2.890307328605201,
+ "grad_norm": 2.690997838973999,
+ "learning_rate": 2.6757740627344292e-06,
+ "loss": 0.4639,
+ "step": 6113
+ },
+ {
+ "epoch": 2.8907801418439716,
+ "grad_norm": 2.4693069458007812,
+ "learning_rate": 2.67515177412906e-06,
+ "loss": 0.4052,
+ "step": 6114
+ },
+ {
+ "epoch": 2.8912529550827424,
+ "grad_norm": 2.7137033939361572,
+ "learning_rate": 2.6745294746177576e-06,
+ "loss": 0.4442,
+ "step": 6115
+ },
+ {
+ "epoch": 2.891725768321513,
+ "grad_norm": 3.7417004108428955,
+ "learning_rate": 2.6739071642392712e-06,
+ "loss": 0.4809,
+ "step": 6116
+ },
+ {
+ "epoch": 2.8921985815602835,
+ "grad_norm": 2.707094669342041,
+ "learning_rate": 2.673284843032347e-06,
+ "loss": 0.411,
+ "step": 6117
+ },
+ {
+ "epoch": 2.8926713947990543,
+ "grad_norm": 2.7864158153533936,
+ "learning_rate": 2.672662511035736e-06,
+ "loss": 0.4939,
+ "step": 6118
+ },
+ {
+ "epoch": 2.893144208037825,
+ "grad_norm": 2.8753504753112793,
+ "learning_rate": 2.672040168288187e-06,
+ "loss": 0.4396,
+ "step": 6119
+ },
+ {
+ "epoch": 2.8936170212765955,
+ "grad_norm": 2.7581071853637695,
+ "learning_rate": 2.6714178148284516e-06,
+ "loss": 0.427,
+ "step": 6120
+ },
+ {
+ "epoch": 2.8940898345153663,
+ "grad_norm": 2.9754791259765625,
+ "learning_rate": 2.6707954506952803e-06,
+ "loss": 0.4255,
+ "step": 6121
+ },
+ {
+ "epoch": 2.894562647754137,
+ "grad_norm": 2.876939296722412,
+ "learning_rate": 2.670173075927426e-06,
+ "loss": 0.4699,
+ "step": 6122
+ },
+ {
+ "epoch": 2.895035460992908,
+ "grad_norm": 2.4875400066375732,
+ "learning_rate": 2.6695506905636397e-06,
+ "loss": 0.3568,
+ "step": 6123
+ },
+ {
+ "epoch": 2.8955082742316787,
+ "grad_norm": 2.703606128692627,
+ "learning_rate": 2.668928294642675e-06,
+ "loss": 0.3646,
+ "step": 6124
+ },
+ {
+ "epoch": 2.895981087470449,
+ "grad_norm": 2.8618338108062744,
+ "learning_rate": 2.6683058882032868e-06,
+ "loss": 0.378,
+ "step": 6125
+ },
+ {
+ "epoch": 2.89645390070922,
+ "grad_norm": 2.9756760597229004,
+ "learning_rate": 2.667683471284229e-06,
+ "loss": 0.4348,
+ "step": 6126
+ },
+ {
+ "epoch": 2.8969267139479906,
+ "grad_norm": 2.7861104011535645,
+ "learning_rate": 2.667061043924256e-06,
+ "loss": 0.4435,
+ "step": 6127
+ },
+ {
+ "epoch": 2.897399527186761,
+ "grad_norm": 2.7932238578796387,
+ "learning_rate": 2.6664386061621243e-06,
+ "loss": 0.4824,
+ "step": 6128
+ },
+ {
+ "epoch": 2.8978723404255318,
+ "grad_norm": 2.85483455657959,
+ "learning_rate": 2.6658161580365917e-06,
+ "loss": 0.4925,
+ "step": 6129
+ },
+ {
+ "epoch": 2.8983451536643026,
+ "grad_norm": 2.4242141246795654,
+ "learning_rate": 2.6651936995864136e-06,
+ "loss": 0.3466,
+ "step": 6130
+ },
+ {
+ "epoch": 2.8988179669030734,
+ "grad_norm": 3.385214328765869,
+ "learning_rate": 2.6645712308503473e-06,
+ "loss": 0.4751,
+ "step": 6131
+ },
+ {
+ "epoch": 2.899290780141844,
+ "grad_norm": 2.7109622955322266,
+ "learning_rate": 2.6639487518671525e-06,
+ "loss": 0.4469,
+ "step": 6132
+ },
+ {
+ "epoch": 2.8997635933806145,
+ "grad_norm": 2.6537814140319824,
+ "learning_rate": 2.6633262626755877e-06,
+ "loss": 0.4678,
+ "step": 6133
+ },
+ {
+ "epoch": 2.9002364066193853,
+ "grad_norm": 2.5992231369018555,
+ "learning_rate": 2.6627037633144124e-06,
+ "loss": 0.4206,
+ "step": 6134
+ },
+ {
+ "epoch": 2.900709219858156,
+ "grad_norm": 2.988940954208374,
+ "learning_rate": 2.6620812538223885e-06,
+ "loss": 0.4554,
+ "step": 6135
+ },
+ {
+ "epoch": 2.9011820330969265,
+ "grad_norm": 3.0678138732910156,
+ "learning_rate": 2.661458734238274e-06,
+ "loss": 0.4671,
+ "step": 6136
+ },
+ {
+ "epoch": 2.9016548463356973,
+ "grad_norm": 2.6902482509613037,
+ "learning_rate": 2.6608362046008335e-06,
+ "loss": 0.372,
+ "step": 6137
+ },
+ {
+ "epoch": 2.902127659574468,
+ "grad_norm": 3.031597375869751,
+ "learning_rate": 2.660213664948827e-06,
+ "loss": 0.4424,
+ "step": 6138
+ },
+ {
+ "epoch": 2.902600472813239,
+ "grad_norm": 2.8376755714416504,
+ "learning_rate": 2.6595911153210187e-06,
+ "loss": 0.4599,
+ "step": 6139
+ },
+ {
+ "epoch": 2.9030732860520096,
+ "grad_norm": 3.3164854049682617,
+ "learning_rate": 2.6589685557561707e-06,
+ "loss": 0.3897,
+ "step": 6140
+ },
+ {
+ "epoch": 2.90354609929078,
+ "grad_norm": 2.9535014629364014,
+ "learning_rate": 2.658345986293048e-06,
+ "loss": 0.4957,
+ "step": 6141
+ },
+ {
+ "epoch": 2.904018912529551,
+ "grad_norm": 2.821276903152466,
+ "learning_rate": 2.657723406970415e-06,
+ "loss": 0.4453,
+ "step": 6142
+ },
+ {
+ "epoch": 2.9044917257683216,
+ "grad_norm": 2.7314651012420654,
+ "learning_rate": 2.657100817827037e-06,
+ "loss": 0.4406,
+ "step": 6143
+ },
+ {
+ "epoch": 2.904964539007092,
+ "grad_norm": 2.9509520530700684,
+ "learning_rate": 2.6564782189016804e-06,
+ "loss": 0.4629,
+ "step": 6144
+ },
+ {
+ "epoch": 2.9054373522458627,
+ "grad_norm": 2.6234960556030273,
+ "learning_rate": 2.655855610233111e-06,
+ "loss": 0.4306,
+ "step": 6145
+ },
+ {
+ "epoch": 2.9059101654846335,
+ "grad_norm": 2.7209644317626953,
+ "learning_rate": 2.6552329918600962e-06,
+ "loss": 0.3643,
+ "step": 6146
+ },
+ {
+ "epoch": 2.9063829787234043,
+ "grad_norm": 2.9797747135162354,
+ "learning_rate": 2.654610363821404e-06,
+ "loss": 0.4616,
+ "step": 6147
+ },
+ {
+ "epoch": 2.906855791962175,
+ "grad_norm": 2.8179666996002197,
+ "learning_rate": 2.6539877261558016e-06,
+ "loss": 0.4526,
+ "step": 6148
+ },
+ {
+ "epoch": 2.9073286052009455,
+ "grad_norm": 2.7492244243621826,
+ "learning_rate": 2.653365078902059e-06,
+ "loss": 0.4862,
+ "step": 6149
+ },
+ {
+ "epoch": 2.9078014184397163,
+ "grad_norm": 3.0262451171875,
+ "learning_rate": 2.6527424220989457e-06,
+ "loss": 0.3728,
+ "step": 6150
+ },
+ {
+ "epoch": 2.908274231678487,
+ "grad_norm": 2.8092808723449707,
+ "learning_rate": 2.6521197557852315e-06,
+ "loss": 0.4668,
+ "step": 6151
+ },
+ {
+ "epoch": 2.9087470449172574,
+ "grad_norm": 2.915719985961914,
+ "learning_rate": 2.651497079999687e-06,
+ "loss": 0.5124,
+ "step": 6152
+ },
+ {
+ "epoch": 2.9092198581560282,
+ "grad_norm": 2.9794204235076904,
+ "learning_rate": 2.6508743947810834e-06,
+ "loss": 0.5207,
+ "step": 6153
+ },
+ {
+ "epoch": 2.909692671394799,
+ "grad_norm": 2.882453680038452,
+ "learning_rate": 2.650251700168193e-06,
+ "loss": 0.4382,
+ "step": 6154
+ },
+ {
+ "epoch": 2.91016548463357,
+ "grad_norm": 3.183680534362793,
+ "learning_rate": 2.6496289961997886e-06,
+ "loss": 0.5134,
+ "step": 6155
+ },
+ {
+ "epoch": 2.9106382978723406,
+ "grad_norm": 2.9374759197235107,
+ "learning_rate": 2.649006282914642e-06,
+ "loss": 0.4748,
+ "step": 6156
+ },
+ {
+ "epoch": 2.911111111111111,
+ "grad_norm": 2.8096041679382324,
+ "learning_rate": 2.648383560351527e-06,
+ "loss": 0.4672,
+ "step": 6157
+ },
+ {
+ "epoch": 2.911583924349882,
+ "grad_norm": 2.8799238204956055,
+ "learning_rate": 2.6477608285492196e-06,
+ "loss": 0.4679,
+ "step": 6158
+ },
+ {
+ "epoch": 2.9120567375886526,
+ "grad_norm": 2.689310073852539,
+ "learning_rate": 2.6471380875464923e-06,
+ "loss": 0.4069,
+ "step": 6159
+ },
+ {
+ "epoch": 2.912529550827423,
+ "grad_norm": 2.909323215484619,
+ "learning_rate": 2.6465153373821216e-06,
+ "loss": 0.4463,
+ "step": 6160
+ },
+ {
+ "epoch": 2.9130023640661937,
+ "grad_norm": 2.797724962234497,
+ "learning_rate": 2.6458925780948845e-06,
+ "loss": 0.4269,
+ "step": 6161
+ },
+ {
+ "epoch": 2.9134751773049645,
+ "grad_norm": 2.7533204555511475,
+ "learning_rate": 2.645269809723556e-06,
+ "loss": 0.453,
+ "step": 6162
+ },
+ {
+ "epoch": 2.9139479905437353,
+ "grad_norm": 2.6615989208221436,
+ "learning_rate": 2.6446470323069122e-06,
+ "loss": 0.3921,
+ "step": 6163
+ },
+ {
+ "epoch": 2.914420803782506,
+ "grad_norm": 3.0493314266204834,
+ "learning_rate": 2.644024245883733e-06,
+ "loss": 0.4779,
+ "step": 6164
+ },
+ {
+ "epoch": 2.9148936170212765,
+ "grad_norm": 2.649845600128174,
+ "learning_rate": 2.643401450492795e-06,
+ "loss": 0.454,
+ "step": 6165
+ },
+ {
+ "epoch": 2.9153664302600473,
+ "grad_norm": 2.7931838035583496,
+ "learning_rate": 2.642778646172877e-06,
+ "loss": 0.504,
+ "step": 6166
+ },
+ {
+ "epoch": 2.915839243498818,
+ "grad_norm": 2.9518136978149414,
+ "learning_rate": 2.64215583296276e-06,
+ "loss": 0.4767,
+ "step": 6167
+ },
+ {
+ "epoch": 2.9163120567375884,
+ "grad_norm": 2.6047427654266357,
+ "learning_rate": 2.6415330109012216e-06,
+ "loss": 0.4316,
+ "step": 6168
+ },
+ {
+ "epoch": 2.916784869976359,
+ "grad_norm": 2.7732112407684326,
+ "learning_rate": 2.640910180027044e-06,
+ "loss": 0.4213,
+ "step": 6169
+ },
+ {
+ "epoch": 2.91725768321513,
+ "grad_norm": 3.1157236099243164,
+ "learning_rate": 2.6402873403790068e-06,
+ "loss": 0.4559,
+ "step": 6170
+ },
+ {
+ "epoch": 2.917730496453901,
+ "grad_norm": 2.68424129486084,
+ "learning_rate": 2.6396644919958917e-06,
+ "loss": 0.3456,
+ "step": 6171
+ },
+ {
+ "epoch": 2.9182033096926716,
+ "grad_norm": 3.1093270778656006,
+ "learning_rate": 2.639041634916482e-06,
+ "loss": 0.4172,
+ "step": 6172
+ },
+ {
+ "epoch": 2.918676122931442,
+ "grad_norm": 2.9844655990600586,
+ "learning_rate": 2.6384187691795594e-06,
+ "loss": 0.4844,
+ "step": 6173
+ },
+ {
+ "epoch": 2.9191489361702128,
+ "grad_norm": 2.907151222229004,
+ "learning_rate": 2.637795894823906e-06,
+ "loss": 0.5126,
+ "step": 6174
+ },
+ {
+ "epoch": 2.9196217494089836,
+ "grad_norm": 2.804105520248413,
+ "learning_rate": 2.637173011888307e-06,
+ "loss": 0.3919,
+ "step": 6175
+ },
+ {
+ "epoch": 2.920094562647754,
+ "grad_norm": 2.8809266090393066,
+ "learning_rate": 2.636550120411547e-06,
+ "loss": 0.4468,
+ "step": 6176
+ },
+ {
+ "epoch": 2.9205673758865247,
+ "grad_norm": 2.686290979385376,
+ "learning_rate": 2.6359272204324087e-06,
+ "loss": 0.4352,
+ "step": 6177
+ },
+ {
+ "epoch": 2.9210401891252955,
+ "grad_norm": 2.448101758956909,
+ "learning_rate": 2.635304311989678e-06,
+ "loss": 0.4218,
+ "step": 6178
+ },
+ {
+ "epoch": 2.9215130023640663,
+ "grad_norm": 2.81024169921875,
+ "learning_rate": 2.6346813951221416e-06,
+ "loss": 0.5177,
+ "step": 6179
+ },
+ {
+ "epoch": 2.921985815602837,
+ "grad_norm": 2.7590086460113525,
+ "learning_rate": 2.6340584698685856e-06,
+ "loss": 0.3897,
+ "step": 6180
+ },
+ {
+ "epoch": 2.9224586288416075,
+ "grad_norm": 3.1226227283477783,
+ "learning_rate": 2.6334355362677965e-06,
+ "loss": 0.4595,
+ "step": 6181
+ },
+ {
+ "epoch": 2.9229314420803783,
+ "grad_norm": 2.673828125,
+ "learning_rate": 2.6328125943585607e-06,
+ "loss": 0.4932,
+ "step": 6182
+ },
+ {
+ "epoch": 2.923404255319149,
+ "grad_norm": 2.8297293186187744,
+ "learning_rate": 2.632189644179668e-06,
+ "loss": 0.3819,
+ "step": 6183
+ },
+ {
+ "epoch": 2.9238770685579194,
+ "grad_norm": 2.9661548137664795,
+ "learning_rate": 2.6315666857699056e-06,
+ "loss": 0.4419,
+ "step": 6184
+ },
+ {
+ "epoch": 2.92434988179669,
+ "grad_norm": 2.9745798110961914,
+ "learning_rate": 2.6309437191680627e-06,
+ "loss": 0.4423,
+ "step": 6185
+ },
+ {
+ "epoch": 2.924822695035461,
+ "grad_norm": 2.8351712226867676,
+ "learning_rate": 2.6303207444129285e-06,
+ "loss": 0.5043,
+ "step": 6186
+ },
+ {
+ "epoch": 2.925295508274232,
+ "grad_norm": 2.6442384719848633,
+ "learning_rate": 2.6296977615432927e-06,
+ "loss": 0.4431,
+ "step": 6187
+ },
+ {
+ "epoch": 2.9257683215130026,
+ "grad_norm": 2.4128029346466064,
+ "learning_rate": 2.6290747705979457e-06,
+ "loss": 0.3603,
+ "step": 6188
+ },
+ {
+ "epoch": 2.926241134751773,
+ "grad_norm": 2.730424642562866,
+ "learning_rate": 2.6284517716156786e-06,
+ "loss": 0.439,
+ "step": 6189
+ },
+ {
+ "epoch": 2.9267139479905437,
+ "grad_norm": 2.6215405464172363,
+ "learning_rate": 2.627828764635284e-06,
+ "loss": 0.4117,
+ "step": 6190
+ },
+ {
+ "epoch": 2.9271867612293145,
+ "grad_norm": 2.56585955619812,
+ "learning_rate": 2.627205749695552e-06,
+ "loss": 0.4404,
+ "step": 6191
+ },
+ {
+ "epoch": 2.927659574468085,
+ "grad_norm": 2.9587886333465576,
+ "learning_rate": 2.6265827268352763e-06,
+ "loss": 0.4295,
+ "step": 6192
+ },
+ {
+ "epoch": 2.9281323877068557,
+ "grad_norm": 2.6611828804016113,
+ "learning_rate": 2.625959696093249e-06,
+ "loss": 0.4441,
+ "step": 6193
+ },
+ {
+ "epoch": 2.9286052009456265,
+ "grad_norm": 2.4391369819641113,
+ "learning_rate": 2.6253366575082634e-06,
+ "loss": 0.4447,
+ "step": 6194
+ },
+ {
+ "epoch": 2.9290780141843973,
+ "grad_norm": 2.710763454437256,
+ "learning_rate": 2.6247136111191144e-06,
+ "loss": 0.4662,
+ "step": 6195
+ },
+ {
+ "epoch": 2.929550827423168,
+ "grad_norm": 2.770697593688965,
+ "learning_rate": 2.6240905569645952e-06,
+ "loss": 0.4263,
+ "step": 6196
+ },
+ {
+ "epoch": 2.9300236406619384,
+ "grad_norm": 2.5885732173919678,
+ "learning_rate": 2.623467495083501e-06,
+ "loss": 0.4303,
+ "step": 6197
+ },
+ {
+ "epoch": 2.9304964539007092,
+ "grad_norm": 2.5716748237609863,
+ "learning_rate": 2.6228444255146274e-06,
+ "loss": 0.3714,
+ "step": 6198
+ },
+ {
+ "epoch": 2.93096926713948,
+ "grad_norm": 3.0437910556793213,
+ "learning_rate": 2.6222213482967703e-06,
+ "loss": 0.4077,
+ "step": 6199
+ },
+ {
+ "epoch": 2.9314420803782504,
+ "grad_norm": 2.7861344814300537,
+ "learning_rate": 2.6215982634687253e-06,
+ "loss": 0.4157,
+ "step": 6200
+ },
+ {
+ "epoch": 2.931914893617021,
+ "grad_norm": 2.5265355110168457,
+ "learning_rate": 2.6209751710692905e-06,
+ "loss": 0.4586,
+ "step": 6201
+ },
+ {
+ "epoch": 2.932387706855792,
+ "grad_norm": 2.940112590789795,
+ "learning_rate": 2.6203520711372615e-06,
+ "loss": 0.4208,
+ "step": 6202
+ },
+ {
+ "epoch": 2.9328605200945628,
+ "grad_norm": 2.7124581336975098,
+ "learning_rate": 2.6197289637114363e-06,
+ "loss": 0.4173,
+ "step": 6203
+ },
+ {
+ "epoch": 2.9333333333333336,
+ "grad_norm": 2.818523406982422,
+ "learning_rate": 2.619105848830615e-06,
+ "loss": 0.4349,
+ "step": 6204
+ },
+ {
+ "epoch": 2.933806146572104,
+ "grad_norm": 2.7630393505096436,
+ "learning_rate": 2.6184827265335937e-06,
+ "loss": 0.5078,
+ "step": 6205
+ },
+ {
+ "epoch": 2.9342789598108747,
+ "grad_norm": 3.0554699897766113,
+ "learning_rate": 2.6178595968591726e-06,
+ "loss": 0.4712,
+ "step": 6206
+ },
+ {
+ "epoch": 2.9347517730496455,
+ "grad_norm": 2.721992254257202,
+ "learning_rate": 2.6172364598461507e-06,
+ "loss": 0.4847,
+ "step": 6207
+ },
+ {
+ "epoch": 2.935224586288416,
+ "grad_norm": 2.809663772583008,
+ "learning_rate": 2.6166133155333303e-06,
+ "loss": 0.4447,
+ "step": 6208
+ },
+ {
+ "epoch": 2.9356973995271867,
+ "grad_norm": 2.568394660949707,
+ "learning_rate": 2.6159901639595088e-06,
+ "loss": 0.4543,
+ "step": 6209
+ },
+ {
+ "epoch": 2.9361702127659575,
+ "grad_norm": 3.3670637607574463,
+ "learning_rate": 2.6153670051634884e-06,
+ "loss": 0.4901,
+ "step": 6210
+ },
+ {
+ "epoch": 2.9366430260047283,
+ "grad_norm": 3.082508087158203,
+ "learning_rate": 2.614743839184071e-06,
+ "loss": 0.4862,
+ "step": 6211
+ },
+ {
+ "epoch": 2.937115839243499,
+ "grad_norm": 2.692139148712158,
+ "learning_rate": 2.6141206660600566e-06,
+ "loss": 0.5199,
+ "step": 6212
+ },
+ {
+ "epoch": 2.9375886524822694,
+ "grad_norm": 3.231433391571045,
+ "learning_rate": 2.6134974858302504e-06,
+ "loss": 0.464,
+ "step": 6213
+ },
+ {
+ "epoch": 2.93806146572104,
+ "grad_norm": 3.224238157272339,
+ "learning_rate": 2.612874298533452e-06,
+ "loss": 0.4507,
+ "step": 6214
+ },
+ {
+ "epoch": 2.938534278959811,
+ "grad_norm": 2.812755584716797,
+ "learning_rate": 2.6122511042084663e-06,
+ "loss": 0.4527,
+ "step": 6215
+ },
+ {
+ "epoch": 2.9390070921985814,
+ "grad_norm": 2.837811231613159,
+ "learning_rate": 2.611627902894098e-06,
+ "loss": 0.4782,
+ "step": 6216
+ },
+ {
+ "epoch": 2.939479905437352,
+ "grad_norm": 3.093817710876465,
+ "learning_rate": 2.6110046946291476e-06,
+ "loss": 0.4933,
+ "step": 6217
+ },
+ {
+ "epoch": 2.939952718676123,
+ "grad_norm": 2.950119733810425,
+ "learning_rate": 2.6103814794524235e-06,
+ "loss": 0.4884,
+ "step": 6218
+ },
+ {
+ "epoch": 2.9404255319148938,
+ "grad_norm": 2.469681978225708,
+ "learning_rate": 2.6097582574027274e-06,
+ "loss": 0.4135,
+ "step": 6219
+ },
+ {
+ "epoch": 2.9408983451536646,
+ "grad_norm": 2.779238224029541,
+ "learning_rate": 2.609135028518866e-06,
+ "loss": 0.5165,
+ "step": 6220
+ },
+ {
+ "epoch": 2.941371158392435,
+ "grad_norm": 2.807705879211426,
+ "learning_rate": 2.608511792839645e-06,
+ "loss": 0.4046,
+ "step": 6221
+ },
+ {
+ "epoch": 2.9418439716312057,
+ "grad_norm": 2.6067750453948975,
+ "learning_rate": 2.607888550403871e-06,
+ "loss": 0.406,
+ "step": 6222
+ },
+ {
+ "epoch": 2.9423167848699765,
+ "grad_norm": 2.865766763687134,
+ "learning_rate": 2.607265301250349e-06,
+ "loss": 0.471,
+ "step": 6223
+ },
+ {
+ "epoch": 2.942789598108747,
+ "grad_norm": 2.977681875228882,
+ "learning_rate": 2.6066420454178876e-06,
+ "loss": 0.4666,
+ "step": 6224
+ },
+ {
+ "epoch": 2.9432624113475176,
+ "grad_norm": 2.870884418487549,
+ "learning_rate": 2.606018782945294e-06,
+ "loss": 0.4768,
+ "step": 6225
+ },
+ {
+ "epoch": 2.9437352245862884,
+ "grad_norm": 2.992851495742798,
+ "learning_rate": 2.6053955138713756e-06,
+ "loss": 0.4657,
+ "step": 6226
+ },
+ {
+ "epoch": 2.9442080378250592,
+ "grad_norm": 2.7279815673828125,
+ "learning_rate": 2.6047722382349406e-06,
+ "loss": 0.4087,
+ "step": 6227
+ },
+ {
+ "epoch": 2.94468085106383,
+ "grad_norm": 2.8587028980255127,
+ "learning_rate": 2.604148956074797e-06,
+ "loss": 0.4452,
+ "step": 6228
+ },
+ {
+ "epoch": 2.9451536643026004,
+ "grad_norm": 3.001694679260254,
+ "learning_rate": 2.6035256674297555e-06,
+ "loss": 0.4852,
+ "step": 6229
+ },
+ {
+ "epoch": 2.945626477541371,
+ "grad_norm": 2.858069896697998,
+ "learning_rate": 2.6029023723386237e-06,
+ "loss": 0.4281,
+ "step": 6230
+ },
+ {
+ "epoch": 2.946099290780142,
+ "grad_norm": 2.675856828689575,
+ "learning_rate": 2.602279070840213e-06,
+ "loss": 0.4545,
+ "step": 6231
+ },
+ {
+ "epoch": 2.9465721040189123,
+ "grad_norm": 2.530245065689087,
+ "learning_rate": 2.6016557629733334e-06,
+ "loss": 0.4619,
+ "step": 6232
+ },
+ {
+ "epoch": 2.947044917257683,
+ "grad_norm": 2.7533743381500244,
+ "learning_rate": 2.601032448776795e-06,
+ "loss": 0.4879,
+ "step": 6233
+ },
+ {
+ "epoch": 2.947517730496454,
+ "grad_norm": 3.130453109741211,
+ "learning_rate": 2.600409128289409e-06,
+ "loss": 0.4056,
+ "step": 6234
+ },
+ {
+ "epoch": 2.9479905437352247,
+ "grad_norm": 3.4736509323120117,
+ "learning_rate": 2.5997858015499867e-06,
+ "loss": 0.5063,
+ "step": 6235
+ },
+ {
+ "epoch": 2.9484633569739955,
+ "grad_norm": 2.871978282928467,
+ "learning_rate": 2.5991624685973406e-06,
+ "loss": 0.4562,
+ "step": 6236
+ },
+ {
+ "epoch": 2.948936170212766,
+ "grad_norm": 2.976503372192383,
+ "learning_rate": 2.5985391294702817e-06,
+ "loss": 0.5079,
+ "step": 6237
+ },
+ {
+ "epoch": 2.9494089834515367,
+ "grad_norm": 2.578122615814209,
+ "learning_rate": 2.597915784207623e-06,
+ "loss": 0.4069,
+ "step": 6238
+ },
+ {
+ "epoch": 2.9498817966903075,
+ "grad_norm": 2.885911226272583,
+ "learning_rate": 2.597292432848178e-06,
+ "loss": 0.4382,
+ "step": 6239
+ },
+ {
+ "epoch": 2.950354609929078,
+ "grad_norm": 2.9301681518554688,
+ "learning_rate": 2.5966690754307605e-06,
+ "loss": 0.4888,
+ "step": 6240
+ },
+ {
+ "epoch": 2.9508274231678486,
+ "grad_norm": 2.9912192821502686,
+ "learning_rate": 2.5960457119941834e-06,
+ "loss": 0.4699,
+ "step": 6241
+ },
+ {
+ "epoch": 2.9513002364066194,
+ "grad_norm": 2.6612601280212402,
+ "learning_rate": 2.5954223425772607e-06,
+ "loss": 0.3736,
+ "step": 6242
+ },
+ {
+ "epoch": 2.9517730496453902,
+ "grad_norm": 2.9325380325317383,
+ "learning_rate": 2.5947989672188067e-06,
+ "loss": 0.4771,
+ "step": 6243
+ },
+ {
+ "epoch": 2.952245862884161,
+ "grad_norm": 2.8143959045410156,
+ "learning_rate": 2.594175585957637e-06,
+ "loss": 0.5103,
+ "step": 6244
+ },
+ {
+ "epoch": 2.9527186761229314,
+ "grad_norm": 2.355078935623169,
+ "learning_rate": 2.5935521988325674e-06,
+ "loss": 0.44,
+ "step": 6245
+ },
+ {
+ "epoch": 2.953191489361702,
+ "grad_norm": 2.733156442642212,
+ "learning_rate": 2.5929288058824114e-06,
+ "loss": 0.4306,
+ "step": 6246
+ },
+ {
+ "epoch": 2.953664302600473,
+ "grad_norm": 3.182563304901123,
+ "learning_rate": 2.5923054071459865e-06,
+ "loss": 0.417,
+ "step": 6247
+ },
+ {
+ "epoch": 2.9541371158392433,
+ "grad_norm": 2.4162323474884033,
+ "learning_rate": 2.5916820026621094e-06,
+ "loss": 0.3802,
+ "step": 6248
+ },
+ {
+ "epoch": 2.954609929078014,
+ "grad_norm": 2.772706985473633,
+ "learning_rate": 2.591058592469595e-06,
+ "loss": 0.4654,
+ "step": 6249
+ },
+ {
+ "epoch": 2.955082742316785,
+ "grad_norm": 2.6011102199554443,
+ "learning_rate": 2.5904351766072616e-06,
+ "loss": 0.4619,
+ "step": 6250
+ },
+ {
+ "epoch": 2.9555555555555557,
+ "grad_norm": 2.5700361728668213,
+ "learning_rate": 2.589811755113926e-06,
+ "loss": 0.3991,
+ "step": 6251
+ },
+ {
+ "epoch": 2.9560283687943265,
+ "grad_norm": 2.6444971561431885,
+ "learning_rate": 2.589188328028407e-06,
+ "loss": 0.4388,
+ "step": 6252
+ },
+ {
+ "epoch": 2.956501182033097,
+ "grad_norm": 2.739567279815674,
+ "learning_rate": 2.588564895389521e-06,
+ "loss": 0.4193,
+ "step": 6253
+ },
+ {
+ "epoch": 2.9569739952718677,
+ "grad_norm": 2.7070045471191406,
+ "learning_rate": 2.5879414572360877e-06,
+ "loss": 0.4347,
+ "step": 6254
+ },
+ {
+ "epoch": 2.9574468085106385,
+ "grad_norm": 2.7811532020568848,
+ "learning_rate": 2.587318013606926e-06,
+ "loss": 0.43,
+ "step": 6255
+ },
+ {
+ "epoch": 2.957919621749409,
+ "grad_norm": 3.0036091804504395,
+ "learning_rate": 2.5866945645408537e-06,
+ "loss": 0.4855,
+ "step": 6256
+ },
+ {
+ "epoch": 2.9583924349881796,
+ "grad_norm": 2.948573112487793,
+ "learning_rate": 2.5860711100766918e-06,
+ "loss": 0.4594,
+ "step": 6257
+ },
+ {
+ "epoch": 2.9588652482269504,
+ "grad_norm": 2.6371593475341797,
+ "learning_rate": 2.5854476502532583e-06,
+ "loss": 0.446,
+ "step": 6258
+ },
+ {
+ "epoch": 2.959338061465721,
+ "grad_norm": 2.668677806854248,
+ "learning_rate": 2.5848241851093754e-06,
+ "loss": 0.3991,
+ "step": 6259
+ },
+ {
+ "epoch": 2.959810874704492,
+ "grad_norm": 3.1640663146972656,
+ "learning_rate": 2.5842007146838614e-06,
+ "loss": 0.5146,
+ "step": 6260
+ },
+ {
+ "epoch": 2.9602836879432624,
+ "grad_norm": 2.9412102699279785,
+ "learning_rate": 2.5835772390155382e-06,
+ "loss": 0.4798,
+ "step": 6261
+ },
+ {
+ "epoch": 2.960756501182033,
+ "grad_norm": 2.7674343585968018,
+ "learning_rate": 2.582953758143227e-06,
+ "loss": 0.4262,
+ "step": 6262
+ },
+ {
+ "epoch": 2.961229314420804,
+ "grad_norm": 3.5219457149505615,
+ "learning_rate": 2.582330272105749e-06,
+ "loss": 0.4905,
+ "step": 6263
+ },
+ {
+ "epoch": 2.9617021276595743,
+ "grad_norm": 2.4274468421936035,
+ "learning_rate": 2.5817067809419267e-06,
+ "loss": 0.4048,
+ "step": 6264
+ },
+ {
+ "epoch": 2.962174940898345,
+ "grad_norm": 2.6907944679260254,
+ "learning_rate": 2.5810832846905814e-06,
+ "loss": 0.388,
+ "step": 6265
+ },
+ {
+ "epoch": 2.962647754137116,
+ "grad_norm": 2.603151321411133,
+ "learning_rate": 2.5804597833905347e-06,
+ "loss": 0.4377,
+ "step": 6266
+ },
+ {
+ "epoch": 2.9631205673758867,
+ "grad_norm": 2.685837507247925,
+ "learning_rate": 2.57983627708061e-06,
+ "loss": 0.4409,
+ "step": 6267
+ },
+ {
+ "epoch": 2.963593380614657,
+ "grad_norm": 2.8281500339508057,
+ "learning_rate": 2.579212765799631e-06,
+ "loss": 0.4567,
+ "step": 6268
+ },
+ {
+ "epoch": 2.964066193853428,
+ "grad_norm": 2.6387875080108643,
+ "learning_rate": 2.57858924958642e-06,
+ "loss": 0.4061,
+ "step": 6269
+ },
+ {
+ "epoch": 2.9645390070921986,
+ "grad_norm": 2.64139986038208,
+ "learning_rate": 2.5779657284798017e-06,
+ "loss": 0.4539,
+ "step": 6270
+ },
+ {
+ "epoch": 2.965011820330969,
+ "grad_norm": 2.7384836673736572,
+ "learning_rate": 2.5773422025185983e-06,
+ "loss": 0.408,
+ "step": 6271
+ },
+ {
+ "epoch": 2.96548463356974,
+ "grad_norm": 2.262514352798462,
+ "learning_rate": 2.576718671741636e-06,
+ "loss": 0.3726,
+ "step": 6272
+ },
+ {
+ "epoch": 2.9659574468085106,
+ "grad_norm": 2.53800106048584,
+ "learning_rate": 2.5760951361877384e-06,
+ "loss": 0.4716,
+ "step": 6273
+ },
+ {
+ "epoch": 2.9664302600472814,
+ "grad_norm": 3.256701707839966,
+ "learning_rate": 2.57547159589573e-06,
+ "loss": 0.518,
+ "step": 6274
+ },
+ {
+ "epoch": 2.966903073286052,
+ "grad_norm": 2.9427342414855957,
+ "learning_rate": 2.574848050904436e-06,
+ "loss": 0.4255,
+ "step": 6275
+ },
+ {
+ "epoch": 2.9673758865248225,
+ "grad_norm": 2.5794098377227783,
+ "learning_rate": 2.574224501252682e-06,
+ "loss": 0.4412,
+ "step": 6276
+ },
+ {
+ "epoch": 2.9678486997635933,
+ "grad_norm": 2.5894877910614014,
+ "learning_rate": 2.573600946979294e-06,
+ "loss": 0.4356,
+ "step": 6277
+ },
+ {
+ "epoch": 2.968321513002364,
+ "grad_norm": 2.9597361087799072,
+ "learning_rate": 2.572977388123098e-06,
+ "loss": 0.4376,
+ "step": 6278
+ },
+ {
+ "epoch": 2.9687943262411345,
+ "grad_norm": 2.779303550720215,
+ "learning_rate": 2.5723538247229197e-06,
+ "loss": 0.3985,
+ "step": 6279
+ },
+ {
+ "epoch": 2.9692671394799053,
+ "grad_norm": 2.9173855781555176,
+ "learning_rate": 2.5717302568175866e-06,
+ "loss": 0.4581,
+ "step": 6280
+ },
+ {
+ "epoch": 2.969739952718676,
+ "grad_norm": 2.703721284866333,
+ "learning_rate": 2.5711066844459242e-06,
+ "loss": 0.3705,
+ "step": 6281
+ },
+ {
+ "epoch": 2.970212765957447,
+ "grad_norm": 2.5415029525756836,
+ "learning_rate": 2.5704831076467613e-06,
+ "loss": 0.4089,
+ "step": 6282
+ },
+ {
+ "epoch": 2.9706855791962177,
+ "grad_norm": 2.791780948638916,
+ "learning_rate": 2.5698595264589234e-06,
+ "loss": 0.4357,
+ "step": 6283
+ },
+ {
+ "epoch": 2.971158392434988,
+ "grad_norm": 2.887662887573242,
+ "learning_rate": 2.5692359409212392e-06,
+ "loss": 0.4093,
+ "step": 6284
+ },
+ {
+ "epoch": 2.971631205673759,
+ "grad_norm": 3.0309557914733887,
+ "learning_rate": 2.5686123510725364e-06,
+ "loss": 0.4461,
+ "step": 6285
+ },
+ {
+ "epoch": 2.9721040189125296,
+ "grad_norm": 2.6861515045166016,
+ "learning_rate": 2.5679887569516437e-06,
+ "loss": 0.4199,
+ "step": 6286
+ },
+ {
+ "epoch": 2.9725768321513,
+ "grad_norm": 2.7014012336730957,
+ "learning_rate": 2.5673651585973897e-06,
+ "loss": 0.4373,
+ "step": 6287
+ },
+ {
+ "epoch": 2.9730496453900708,
+ "grad_norm": 2.951265811920166,
+ "learning_rate": 2.5667415560486026e-06,
+ "loss": 0.4426,
+ "step": 6288
+ },
+ {
+ "epoch": 2.9735224586288416,
+ "grad_norm": 2.7664504051208496,
+ "learning_rate": 2.5661179493441106e-06,
+ "loss": 0.474,
+ "step": 6289
+ },
+ {
+ "epoch": 2.9739952718676124,
+ "grad_norm": 2.6081087589263916,
+ "learning_rate": 2.5654943385227445e-06,
+ "loss": 0.4058,
+ "step": 6290
+ },
+ {
+ "epoch": 2.974468085106383,
+ "grad_norm": 2.9416966438293457,
+ "learning_rate": 2.564870723623333e-06,
+ "loss": 0.506,
+ "step": 6291
+ },
+ {
+ "epoch": 2.9749408983451535,
+ "grad_norm": 2.9441659450531006,
+ "learning_rate": 2.564247104684706e-06,
+ "loss": 0.4505,
+ "step": 6292
+ },
+ {
+ "epoch": 2.9754137115839243,
+ "grad_norm": 2.7110862731933594,
+ "learning_rate": 2.563623481745693e-06,
+ "loss": 0.4493,
+ "step": 6293
+ },
+ {
+ "epoch": 2.975886524822695,
+ "grad_norm": 2.88459849357605,
+ "learning_rate": 2.562999854845125e-06,
+ "loss": 0.4462,
+ "step": 6294
+ },
+ {
+ "epoch": 2.9763593380614655,
+ "grad_norm": 3.0491793155670166,
+ "learning_rate": 2.5623762240218327e-06,
+ "loss": 0.4928,
+ "step": 6295
+ },
+ {
+ "epoch": 2.9768321513002363,
+ "grad_norm": 2.9475483894348145,
+ "learning_rate": 2.561752589314646e-06,
+ "loss": 0.4535,
+ "step": 6296
+ },
+ {
+ "epoch": 2.977304964539007,
+ "grad_norm": 2.879495859146118,
+ "learning_rate": 2.561128950762397e-06,
+ "loss": 0.4393,
+ "step": 6297
+ },
+ {
+ "epoch": 2.977777777777778,
+ "grad_norm": 2.8478336334228516,
+ "learning_rate": 2.560505308403916e-06,
+ "loss": 0.4363,
+ "step": 6298
+ },
+ {
+ "epoch": 2.9782505910165487,
+ "grad_norm": 2.5475094318389893,
+ "learning_rate": 2.5598816622780343e-06,
+ "loss": 0.3825,
+ "step": 6299
+ },
+ {
+ "epoch": 2.978723404255319,
+ "grad_norm": 2.85430908203125,
+ "learning_rate": 2.5592580124235838e-06,
+ "loss": 0.4226,
+ "step": 6300
+ },
+ {
+ "epoch": 2.97919621749409,
+ "grad_norm": 2.569775104522705,
+ "learning_rate": 2.5586343588793975e-06,
+ "loss": 0.4045,
+ "step": 6301
+ },
+ {
+ "epoch": 2.9796690307328606,
+ "grad_norm": 2.4482202529907227,
+ "learning_rate": 2.558010701684307e-06,
+ "loss": 0.4625,
+ "step": 6302
+ },
+ {
+ "epoch": 2.980141843971631,
+ "grad_norm": 2.9301230907440186,
+ "learning_rate": 2.5573870408771436e-06,
+ "loss": 0.4358,
+ "step": 6303
+ },
+ {
+ "epoch": 2.9806146572104018,
+ "grad_norm": 2.9865870475769043,
+ "learning_rate": 2.5567633764967416e-06,
+ "loss": 0.497,
+ "step": 6304
+ },
+ {
+ "epoch": 2.9810874704491725,
+ "grad_norm": 2.523524522781372,
+ "learning_rate": 2.556139708581933e-06,
+ "loss": 0.4141,
+ "step": 6305
+ },
+ {
+ "epoch": 2.9815602836879433,
+ "grad_norm": 2.8489344120025635,
+ "learning_rate": 2.5555160371715504e-06,
+ "loss": 0.4205,
+ "step": 6306
+ },
+ {
+ "epoch": 2.982033096926714,
+ "grad_norm": 2.417759895324707,
+ "learning_rate": 2.5548923623044274e-06,
+ "loss": 0.44,
+ "step": 6307
+ },
+ {
+ "epoch": 2.9825059101654845,
+ "grad_norm": 2.7626900672912598,
+ "learning_rate": 2.554268684019398e-06,
+ "loss": 0.4646,
+ "step": 6308
+ },
+ {
+ "epoch": 2.9829787234042553,
+ "grad_norm": 3.0916266441345215,
+ "learning_rate": 2.5536450023552956e-06,
+ "loss": 0.4443,
+ "step": 6309
+ },
+ {
+ "epoch": 2.983451536643026,
+ "grad_norm": 2.721992015838623,
+ "learning_rate": 2.5530213173509542e-06,
+ "loss": 0.4008,
+ "step": 6310
+ },
+ {
+ "epoch": 2.9839243498817964,
+ "grad_norm": 2.825334072113037,
+ "learning_rate": 2.552397629045208e-06,
+ "loss": 0.4513,
+ "step": 6311
+ },
+ {
+ "epoch": 2.9843971631205672,
+ "grad_norm": 2.912050485610962,
+ "learning_rate": 2.5517739374768915e-06,
+ "loss": 0.4104,
+ "step": 6312
+ },
+ {
+ "epoch": 2.984869976359338,
+ "grad_norm": 2.760650634765625,
+ "learning_rate": 2.551150242684838e-06,
+ "loss": 0.4372,
+ "step": 6313
+ },
+ {
+ "epoch": 2.985342789598109,
+ "grad_norm": 2.8926033973693848,
+ "learning_rate": 2.5505265447078838e-06,
+ "loss": 0.475,
+ "step": 6314
+ },
+ {
+ "epoch": 2.9858156028368796,
+ "grad_norm": 2.6279892921447754,
+ "learning_rate": 2.5499028435848633e-06,
+ "loss": 0.4589,
+ "step": 6315
+ },
+ {
+ "epoch": 2.98628841607565,
+ "grad_norm": 3.2147316932678223,
+ "learning_rate": 2.549279139354611e-06,
+ "loss": 0.4968,
+ "step": 6316
+ },
+ {
+ "epoch": 2.986761229314421,
+ "grad_norm": 2.4510674476623535,
+ "learning_rate": 2.5486554320559626e-06,
+ "loss": 0.4291,
+ "step": 6317
+ },
+ {
+ "epoch": 2.9872340425531916,
+ "grad_norm": 2.6919643878936768,
+ "learning_rate": 2.5480317217277544e-06,
+ "loss": 0.4704,
+ "step": 6318
+ },
+ {
+ "epoch": 2.987706855791962,
+ "grad_norm": 2.9832234382629395,
+ "learning_rate": 2.5474080084088215e-06,
+ "loss": 0.4129,
+ "step": 6319
+ },
+ {
+ "epoch": 2.9881796690307327,
+ "grad_norm": 2.893209218978882,
+ "learning_rate": 2.5467842921380004e-06,
+ "loss": 0.5099,
+ "step": 6320
+ },
+ {
+ "epoch": 2.9886524822695035,
+ "grad_norm": 2.6734580993652344,
+ "learning_rate": 2.5461605729541254e-06,
+ "loss": 0.4588,
+ "step": 6321
+ },
+ {
+ "epoch": 2.9891252955082743,
+ "grad_norm": 2.5591681003570557,
+ "learning_rate": 2.5455368508960343e-06,
+ "loss": 0.4162,
+ "step": 6322
+ },
+ {
+ "epoch": 2.989598108747045,
+ "grad_norm": 3.2619881629943848,
+ "learning_rate": 2.5449131260025626e-06,
+ "loss": 0.4412,
+ "step": 6323
+ },
+ {
+ "epoch": 2.9900709219858155,
+ "grad_norm": 2.897914409637451,
+ "learning_rate": 2.544289398312549e-06,
+ "loss": 0.5079,
+ "step": 6324
+ },
+ {
+ "epoch": 2.9905437352245863,
+ "grad_norm": 2.7891685962677,
+ "learning_rate": 2.5436656678648274e-06,
+ "loss": 0.42,
+ "step": 6325
+ },
+ {
+ "epoch": 2.991016548463357,
+ "grad_norm": 3.022341728210449,
+ "learning_rate": 2.5430419346982367e-06,
+ "loss": 0.4739,
+ "step": 6326
+ },
+ {
+ "epoch": 2.9914893617021274,
+ "grad_norm": 3.395775556564331,
+ "learning_rate": 2.542418198851614e-06,
+ "loss": 0.4822,
+ "step": 6327
+ },
+ {
+ "epoch": 2.9919621749408982,
+ "grad_norm": 3.0200490951538086,
+ "learning_rate": 2.541794460363795e-06,
+ "loss": 0.4755,
+ "step": 6328
+ },
+ {
+ "epoch": 2.992434988179669,
+ "grad_norm": 3.302020311355591,
+ "learning_rate": 2.541170719273619e-06,
+ "loss": 0.4603,
+ "step": 6329
+ },
+ {
+ "epoch": 2.99290780141844,
+ "grad_norm": 2.5985910892486572,
+ "learning_rate": 2.5405469756199226e-06,
+ "loss": 0.4475,
+ "step": 6330
+ },
+ {
+ "epoch": 2.9933806146572106,
+ "grad_norm": 2.9413928985595703,
+ "learning_rate": 2.5399232294415434e-06,
+ "loss": 0.4695,
+ "step": 6331
+ },
+ {
+ "epoch": 2.993853427895981,
+ "grad_norm": 2.942777156829834,
+ "learning_rate": 2.53929948077732e-06,
+ "loss": 0.4462,
+ "step": 6332
+ },
+ {
+ "epoch": 2.9943262411347518,
+ "grad_norm": 2.971120595932007,
+ "learning_rate": 2.53867572966609e-06,
+ "loss": 0.4546,
+ "step": 6333
+ },
+ {
+ "epoch": 2.9947990543735226,
+ "grad_norm": 2.8248138427734375,
+ "learning_rate": 2.5380519761466927e-06,
+ "loss": 0.453,
+ "step": 6334
+ },
+ {
+ "epoch": 2.995271867612293,
+ "grad_norm": 3.0819008350372314,
+ "learning_rate": 2.5374282202579647e-06,
+ "loss": 0.4774,
+ "step": 6335
+ },
+ {
+ "epoch": 2.9957446808510637,
+ "grad_norm": 2.742570161819458,
+ "learning_rate": 2.5368044620387466e-06,
+ "loss": 0.5059,
+ "step": 6336
+ },
+ {
+ "epoch": 2.9962174940898345,
+ "grad_norm": 2.9087419509887695,
+ "learning_rate": 2.5361807015278757e-06,
+ "loss": 0.3606,
+ "step": 6337
+ },
+ {
+ "epoch": 2.9966903073286053,
+ "grad_norm": 2.6887354850769043,
+ "learning_rate": 2.5355569387641908e-06,
+ "loss": 0.4247,
+ "step": 6338
+ },
+ {
+ "epoch": 2.997163120567376,
+ "grad_norm": 2.8516008853912354,
+ "learning_rate": 2.534933173786531e-06,
+ "loss": 0.4502,
+ "step": 6339
+ },
+ {
+ "epoch": 2.9976359338061465,
+ "grad_norm": 2.4463164806365967,
+ "learning_rate": 2.5343094066337366e-06,
+ "loss": 0.3883,
+ "step": 6340
+ },
+ {
+ "epoch": 2.9981087470449173,
+ "grad_norm": 2.87025785446167,
+ "learning_rate": 2.533685637344645e-06,
+ "loss": 0.4534,
+ "step": 6341
+ },
+ {
+ "epoch": 2.998581560283688,
+ "grad_norm": 3.0706169605255127,
+ "learning_rate": 2.5330618659580967e-06,
+ "loss": 0.5426,
+ "step": 6342
+ },
+ {
+ "epoch": 2.9990543735224584,
+ "grad_norm": 2.7185773849487305,
+ "learning_rate": 2.532438092512931e-06,
+ "loss": 0.497,
+ "step": 6343
+ },
+ {
+ "epoch": 2.999527186761229,
+ "grad_norm": 2.840207815170288,
+ "learning_rate": 2.531814317047988e-06,
+ "loss": 0.4073,
+ "step": 6344
+ },
+ {
+ "epoch": 3.0,
+ "grad_norm": 3.1592655181884766,
+ "learning_rate": 2.5311905396021063e-06,
+ "loss": 0.4728,
+ "step": 6345
+ },
+ {
+ "epoch": 3.000472813238771,
+ "grad_norm": 2.190042495727539,
+ "learning_rate": 2.530566760214127e-06,
+ "loss": 0.3588,
+ "step": 6346
+ },
+ {
+ "epoch": 3.000945626477541,
+ "grad_norm": 2.749516248703003,
+ "learning_rate": 2.5299429789228898e-06,
+ "loss": 0.3495,
+ "step": 6347
+ },
+ {
+ "epoch": 3.001418439716312,
+ "grad_norm": 2.6181938648223877,
+ "learning_rate": 2.5293191957672335e-06,
+ "loss": 0.3611,
+ "step": 6348
+ },
+ {
+ "epoch": 3.0018912529550827,
+ "grad_norm": 2.7235212326049805,
+ "learning_rate": 2.528695410786e-06,
+ "loss": 0.4173,
+ "step": 6349
+ },
+ {
+ "epoch": 3.0023640661938535,
+ "grad_norm": 2.5408031940460205,
+ "learning_rate": 2.528071624018029e-06,
+ "loss": 0.3651,
+ "step": 6350
+ },
+ {
+ "epoch": 3.002836879432624,
+ "grad_norm": 2.7824409008026123,
+ "learning_rate": 2.5274478355021615e-06,
+ "loss": 0.378,
+ "step": 6351
+ },
+ {
+ "epoch": 3.0033096926713947,
+ "grad_norm": 2.7671427726745605,
+ "learning_rate": 2.526824045277238e-06,
+ "loss": 0.446,
+ "step": 6352
+ },
+ {
+ "epoch": 3.0037825059101655,
+ "grad_norm": 2.6746346950531006,
+ "learning_rate": 2.526200253382098e-06,
+ "loss": 0.3831,
+ "step": 6353
+ },
+ {
+ "epoch": 3.0042553191489363,
+ "grad_norm": 2.437439441680908,
+ "learning_rate": 2.525576459855583e-06,
+ "loss": 0.352,
+ "step": 6354
+ },
+ {
+ "epoch": 3.0047281323877066,
+ "grad_norm": 2.7632546424865723,
+ "learning_rate": 2.5249526647365343e-06,
+ "loss": 0.4636,
+ "step": 6355
+ },
+ {
+ "epoch": 3.0052009456264774,
+ "grad_norm": 2.681955099105835,
+ "learning_rate": 2.524328868063793e-06,
+ "loss": 0.3978,
+ "step": 6356
+ },
+ {
+ "epoch": 3.0056737588652482,
+ "grad_norm": 2.9575345516204834,
+ "learning_rate": 2.523705069876199e-06,
+ "loss": 0.3803,
+ "step": 6357
+ },
+ {
+ "epoch": 3.006146572104019,
+ "grad_norm": 2.7368216514587402,
+ "learning_rate": 2.523081270212594e-06,
+ "loss": 0.3968,
+ "step": 6358
+ },
+ {
+ "epoch": 3.0066193853427894,
+ "grad_norm": 2.637592077255249,
+ "learning_rate": 2.522457469111821e-06,
+ "loss": 0.3629,
+ "step": 6359
+ },
+ {
+ "epoch": 3.00709219858156,
+ "grad_norm": 2.579331398010254,
+ "learning_rate": 2.5218336666127187e-06,
+ "loss": 0.4044,
+ "step": 6360
+ },
+ {
+ "epoch": 3.007565011820331,
+ "grad_norm": 3.014544725418091,
+ "learning_rate": 2.5212098627541296e-06,
+ "loss": 0.3518,
+ "step": 6361
+ },
+ {
+ "epoch": 3.0080378250591018,
+ "grad_norm": 2.5261058807373047,
+ "learning_rate": 2.520586057574896e-06,
+ "loss": 0.3763,
+ "step": 6362
+ },
+ {
+ "epoch": 3.008510638297872,
+ "grad_norm": 3.234910249710083,
+ "learning_rate": 2.519962251113858e-06,
+ "loss": 0.3691,
+ "step": 6363
+ },
+ {
+ "epoch": 3.008983451536643,
+ "grad_norm": 3.2930967807769775,
+ "learning_rate": 2.519338443409859e-06,
+ "loss": 0.4363,
+ "step": 6364
+ },
+ {
+ "epoch": 3.0094562647754137,
+ "grad_norm": 2.807910442352295,
+ "learning_rate": 2.51871463450174e-06,
+ "loss": 0.3984,
+ "step": 6365
+ },
+ {
+ "epoch": 3.0099290780141845,
+ "grad_norm": 3.1555075645446777,
+ "learning_rate": 2.518090824428342e-06,
+ "loss": 0.4006,
+ "step": 6366
+ },
+ {
+ "epoch": 3.010401891252955,
+ "grad_norm": 3.1793272495269775,
+ "learning_rate": 2.5174670132285084e-06,
+ "loss": 0.4966,
+ "step": 6367
+ },
+ {
+ "epoch": 3.0108747044917257,
+ "grad_norm": 2.7007548809051514,
+ "learning_rate": 2.5168432009410805e-06,
+ "loss": 0.3755,
+ "step": 6368
+ },
+ {
+ "epoch": 3.0113475177304965,
+ "grad_norm": 2.914792537689209,
+ "learning_rate": 2.5162193876048995e-06,
+ "loss": 0.39,
+ "step": 6369
+ },
+ {
+ "epoch": 3.0118203309692673,
+ "grad_norm": 2.935516119003296,
+ "learning_rate": 2.5155955732588093e-06,
+ "loss": 0.4045,
+ "step": 6370
+ },
+ {
+ "epoch": 3.0122931442080376,
+ "grad_norm": 2.8817989826202393,
+ "learning_rate": 2.5149717579416503e-06,
+ "loss": 0.3751,
+ "step": 6371
+ },
+ {
+ "epoch": 3.0127659574468084,
+ "grad_norm": 2.9181740283966064,
+ "learning_rate": 2.514347941692266e-06,
+ "loss": 0.3689,
+ "step": 6372
+ },
+ {
+ "epoch": 3.013238770685579,
+ "grad_norm": 3.052060604095459,
+ "learning_rate": 2.5137241245494982e-06,
+ "loss": 0.3874,
+ "step": 6373
+ },
+ {
+ "epoch": 3.01371158392435,
+ "grad_norm": 2.6931657791137695,
+ "learning_rate": 2.513100306552189e-06,
+ "loss": 0.3673,
+ "step": 6374
+ },
+ {
+ "epoch": 3.0141843971631204,
+ "grad_norm": 2.3422248363494873,
+ "learning_rate": 2.5124764877391824e-06,
+ "loss": 0.3753,
+ "step": 6375
+ },
+ {
+ "epoch": 3.014657210401891,
+ "grad_norm": 2.5826265811920166,
+ "learning_rate": 2.5118526681493186e-06,
+ "loss": 0.3661,
+ "step": 6376
+ },
+ {
+ "epoch": 3.015130023640662,
+ "grad_norm": 2.7407493591308594,
+ "learning_rate": 2.5112288478214415e-06,
+ "loss": 0.3887,
+ "step": 6377
+ },
+ {
+ "epoch": 3.0156028368794328,
+ "grad_norm": 2.7378315925598145,
+ "learning_rate": 2.510605026794393e-06,
+ "loss": 0.3623,
+ "step": 6378
+ },
+ {
+ "epoch": 3.016075650118203,
+ "grad_norm": 2.59541654586792,
+ "learning_rate": 2.5099812051070167e-06,
+ "loss": 0.3804,
+ "step": 6379
+ },
+ {
+ "epoch": 3.016548463356974,
+ "grad_norm": 3.1022770404815674,
+ "learning_rate": 2.509357382798154e-06,
+ "loss": 0.4092,
+ "step": 6380
+ },
+ {
+ "epoch": 3.0170212765957447,
+ "grad_norm": 2.521545648574829,
+ "learning_rate": 2.5087335599066476e-06,
+ "loss": 0.3509,
+ "step": 6381
+ },
+ {
+ "epoch": 3.0174940898345155,
+ "grad_norm": 2.949395179748535,
+ "learning_rate": 2.5081097364713407e-06,
+ "loss": 0.387,
+ "step": 6382
+ },
+ {
+ "epoch": 3.017966903073286,
+ "grad_norm": 2.4806487560272217,
+ "learning_rate": 2.507485912531077e-06,
+ "loss": 0.4004,
+ "step": 6383
+ },
+ {
+ "epoch": 3.0184397163120567,
+ "grad_norm": 2.6480894088745117,
+ "learning_rate": 2.506862088124698e-06,
+ "loss": 0.3366,
+ "step": 6384
+ },
+ {
+ "epoch": 3.0189125295508275,
+ "grad_norm": 2.62559175491333,
+ "learning_rate": 2.5062382632910463e-06,
+ "loss": 0.3676,
+ "step": 6385
+ },
+ {
+ "epoch": 3.0193853427895982,
+ "grad_norm": 2.694767951965332,
+ "learning_rate": 2.5056144380689657e-06,
+ "loss": 0.3438,
+ "step": 6386
+ },
+ {
+ "epoch": 3.0198581560283686,
+ "grad_norm": 2.808107614517212,
+ "learning_rate": 2.504990612497299e-06,
+ "loss": 0.3831,
+ "step": 6387
+ },
+ {
+ "epoch": 3.0203309692671394,
+ "grad_norm": 3.2392303943634033,
+ "learning_rate": 2.504366786614888e-06,
+ "loss": 0.3493,
+ "step": 6388
+ },
+ {
+ "epoch": 3.02080378250591,
+ "grad_norm": 2.6899030208587646,
+ "learning_rate": 2.5037429604605774e-06,
+ "loss": 0.3998,
+ "step": 6389
+ },
+ {
+ "epoch": 3.021276595744681,
+ "grad_norm": 2.5622799396514893,
+ "learning_rate": 2.503119134073208e-06,
+ "loss": 0.3443,
+ "step": 6390
+ },
+ {
+ "epoch": 3.0217494089834513,
+ "grad_norm": 2.716832399368286,
+ "learning_rate": 2.502495307491625e-06,
+ "loss": 0.4465,
+ "step": 6391
+ },
+ {
+ "epoch": 3.022222222222222,
+ "grad_norm": 2.8117692470550537,
+ "learning_rate": 2.501871480754669e-06,
+ "loss": 0.3513,
+ "step": 6392
+ },
+ {
+ "epoch": 3.022695035460993,
+ "grad_norm": 3.1260762214660645,
+ "learning_rate": 2.501247653901185e-06,
+ "loss": 0.4336,
+ "step": 6393
+ },
+ {
+ "epoch": 3.0231678486997637,
+ "grad_norm": 2.5076897144317627,
+ "learning_rate": 2.5006238269700137e-06,
+ "loss": 0.3437,
+ "step": 6394
+ },
+ {
+ "epoch": 3.023640661938534,
+ "grad_norm": 2.781937837600708,
+ "learning_rate": 2.5e-06,
+ "loss": 0.3583,
+ "step": 6395
+ },
+ {
+ "epoch": 3.024113475177305,
+ "grad_norm": 3.084050178527832,
+ "learning_rate": 2.499376173029987e-06,
+ "loss": 0.3785,
+ "step": 6396
+ },
+ {
+ "epoch": 3.0245862884160757,
+ "grad_norm": 3.2292473316192627,
+ "learning_rate": 2.498752346098816e-06,
+ "loss": 0.3858,
+ "step": 6397
+ },
+ {
+ "epoch": 3.0250591016548465,
+ "grad_norm": 2.738614797592163,
+ "learning_rate": 2.498128519245332e-06,
+ "loss": 0.4166,
+ "step": 6398
+ },
+ {
+ "epoch": 3.025531914893617,
+ "grad_norm": 2.940103054046631,
+ "learning_rate": 2.4975046925083764e-06,
+ "loss": 0.4117,
+ "step": 6399
+ },
+ {
+ "epoch": 3.0260047281323876,
+ "grad_norm": 2.5177032947540283,
+ "learning_rate": 2.4968808659267927e-06,
+ "loss": 0.3704,
+ "step": 6400
+ },
+ {
+ "epoch": 3.0264775413711584,
+ "grad_norm": 2.6969990730285645,
+ "learning_rate": 2.4962570395394243e-06,
+ "loss": 0.3721,
+ "step": 6401
+ },
+ {
+ "epoch": 3.0269503546099292,
+ "grad_norm": 2.9696028232574463,
+ "learning_rate": 2.495633213385112e-06,
+ "loss": 0.3934,
+ "step": 6402
+ },
+ {
+ "epoch": 3.0274231678486996,
+ "grad_norm": 3.4032552242279053,
+ "learning_rate": 2.495009387502702e-06,
+ "loss": 0.3877,
+ "step": 6403
+ },
+ {
+ "epoch": 3.0278959810874704,
+ "grad_norm": 2.6801865100860596,
+ "learning_rate": 2.4943855619310343e-06,
+ "loss": 0.3421,
+ "step": 6404
+ },
+ {
+ "epoch": 3.028368794326241,
+ "grad_norm": 2.827056884765625,
+ "learning_rate": 2.493761736708954e-06,
+ "loss": 0.3791,
+ "step": 6405
+ },
+ {
+ "epoch": 3.028841607565012,
+ "grad_norm": 2.6393566131591797,
+ "learning_rate": 2.4931379118753034e-06,
+ "loss": 0.3729,
+ "step": 6406
+ },
+ {
+ "epoch": 3.0293144208037823,
+ "grad_norm": 2.833519458770752,
+ "learning_rate": 2.4925140874689236e-06,
+ "loss": 0.3836,
+ "step": 6407
+ },
+ {
+ "epoch": 3.029787234042553,
+ "grad_norm": 2.8852169513702393,
+ "learning_rate": 2.4918902635286597e-06,
+ "loss": 0.4307,
+ "step": 6408
+ },
+ {
+ "epoch": 3.030260047281324,
+ "grad_norm": 2.7166404724121094,
+ "learning_rate": 2.491266440093354e-06,
+ "loss": 0.3825,
+ "step": 6409
+ },
+ {
+ "epoch": 3.0307328605200947,
+ "grad_norm": 2.5828018188476562,
+ "learning_rate": 2.4906426172018474e-06,
+ "loss": 0.3579,
+ "step": 6410
+ },
+ {
+ "epoch": 3.031205673758865,
+ "grad_norm": 2.915632724761963,
+ "learning_rate": 2.490018794892985e-06,
+ "loss": 0.4099,
+ "step": 6411
+ },
+ {
+ "epoch": 3.031678486997636,
+ "grad_norm": 2.7117249965667725,
+ "learning_rate": 2.489394973205607e-06,
+ "loss": 0.4063,
+ "step": 6412
+ },
+ {
+ "epoch": 3.0321513002364067,
+ "grad_norm": 2.3989102840423584,
+ "learning_rate": 2.488771152178559e-06,
+ "loss": 0.3377,
+ "step": 6413
+ },
+ {
+ "epoch": 3.0326241134751775,
+ "grad_norm": 2.6560115814208984,
+ "learning_rate": 2.488147331850682e-06,
+ "loss": 0.4072,
+ "step": 6414
+ },
+ {
+ "epoch": 3.033096926713948,
+ "grad_norm": 2.9466328620910645,
+ "learning_rate": 2.4875235122608184e-06,
+ "loss": 0.3559,
+ "step": 6415
+ },
+ {
+ "epoch": 3.0335697399527186,
+ "grad_norm": 2.765348196029663,
+ "learning_rate": 2.4868996934478114e-06,
+ "loss": 0.336,
+ "step": 6416
+ },
+ {
+ "epoch": 3.0340425531914894,
+ "grad_norm": 2.6021807193756104,
+ "learning_rate": 2.4862758754505017e-06,
+ "loss": 0.3861,
+ "step": 6417
+ },
+ {
+ "epoch": 3.03451536643026,
+ "grad_norm": 2.7293684482574463,
+ "learning_rate": 2.4856520583077344e-06,
+ "loss": 0.3926,
+ "step": 6418
+ },
+ {
+ "epoch": 3.0349881796690306,
+ "grad_norm": 2.9704763889312744,
+ "learning_rate": 2.485028242058351e-06,
+ "loss": 0.4303,
+ "step": 6419
+ },
+ {
+ "epoch": 3.0354609929078014,
+ "grad_norm": 3.385713815689087,
+ "learning_rate": 2.484404426741191e-06,
+ "loss": 0.44,
+ "step": 6420
+ },
+ {
+ "epoch": 3.035933806146572,
+ "grad_norm": 3.177983045578003,
+ "learning_rate": 2.4837806123951013e-06,
+ "loss": 0.4256,
+ "step": 6421
+ },
+ {
+ "epoch": 3.036406619385343,
+ "grad_norm": 2.6287200450897217,
+ "learning_rate": 2.4831567990589203e-06,
+ "loss": 0.3764,
+ "step": 6422
+ },
+ {
+ "epoch": 3.0368794326241133,
+ "grad_norm": 2.81823992729187,
+ "learning_rate": 2.4825329867714924e-06,
+ "loss": 0.3645,
+ "step": 6423
+ },
+ {
+ "epoch": 3.037352245862884,
+ "grad_norm": 3.1826934814453125,
+ "learning_rate": 2.4819091755716586e-06,
+ "loss": 0.3666,
+ "step": 6424
+ },
+ {
+ "epoch": 3.037825059101655,
+ "grad_norm": 3.0880346298217773,
+ "learning_rate": 2.481285365498261e-06,
+ "loss": 0.4339,
+ "step": 6425
+ },
+ {
+ "epoch": 3.0382978723404257,
+ "grad_norm": 3.1764965057373047,
+ "learning_rate": 2.480661556590142e-06,
+ "loss": 0.4804,
+ "step": 6426
+ },
+ {
+ "epoch": 3.038770685579196,
+ "grad_norm": 2.89469313621521,
+ "learning_rate": 2.480037748886142e-06,
+ "loss": 0.3875,
+ "step": 6427
+ },
+ {
+ "epoch": 3.039243498817967,
+ "grad_norm": 2.6043636798858643,
+ "learning_rate": 2.479413942425105e-06,
+ "loss": 0.3859,
+ "step": 6428
+ },
+ {
+ "epoch": 3.0397163120567376,
+ "grad_norm": 2.6570727825164795,
+ "learning_rate": 2.4787901372458712e-06,
+ "loss": 0.3508,
+ "step": 6429
+ },
+ {
+ "epoch": 3.0401891252955084,
+ "grad_norm": 2.914050579071045,
+ "learning_rate": 2.4781663333872825e-06,
+ "loss": 0.3904,
+ "step": 6430
+ },
+ {
+ "epoch": 3.040661938534279,
+ "grad_norm": 2.595606803894043,
+ "learning_rate": 2.47754253088818e-06,
+ "loss": 0.3753,
+ "step": 6431
+ },
+ {
+ "epoch": 3.0411347517730496,
+ "grad_norm": 2.68186616897583,
+ "learning_rate": 2.4769187297874065e-06,
+ "loss": 0.3545,
+ "step": 6432
+ },
+ {
+ "epoch": 3.0416075650118204,
+ "grad_norm": 2.956507921218872,
+ "learning_rate": 2.476294930123802e-06,
+ "loss": 0.3778,
+ "step": 6433
+ },
+ {
+ "epoch": 3.042080378250591,
+ "grad_norm": 2.8327226638793945,
+ "learning_rate": 2.475671131936209e-06,
+ "loss": 0.3205,
+ "step": 6434
+ },
+ {
+ "epoch": 3.0425531914893615,
+ "grad_norm": 2.594348430633545,
+ "learning_rate": 2.475047335263466e-06,
+ "loss": 0.3859,
+ "step": 6435
+ },
+ {
+ "epoch": 3.0430260047281323,
+ "grad_norm": 3.5030717849731445,
+ "learning_rate": 2.4744235401444177e-06,
+ "loss": 0.3611,
+ "step": 6436
+ },
+ {
+ "epoch": 3.043498817966903,
+ "grad_norm": 2.8478317260742188,
+ "learning_rate": 2.4737997466179034e-06,
+ "loss": 0.3927,
+ "step": 6437
+ },
+ {
+ "epoch": 3.043971631205674,
+ "grad_norm": 2.677827835083008,
+ "learning_rate": 2.4731759547227627e-06,
+ "loss": 0.3784,
+ "step": 6438
+ },
+ {
+ "epoch": 3.0444444444444443,
+ "grad_norm": 3.0059866905212402,
+ "learning_rate": 2.4725521644978393e-06,
+ "loss": 0.4279,
+ "step": 6439
+ },
+ {
+ "epoch": 3.044917257683215,
+ "grad_norm": 3.012500047683716,
+ "learning_rate": 2.4719283759819713e-06,
+ "loss": 0.4007,
+ "step": 6440
+ },
+ {
+ "epoch": 3.045390070921986,
+ "grad_norm": 2.758204936981201,
+ "learning_rate": 2.4713045892140007e-06,
+ "loss": 0.3668,
+ "step": 6441
+ },
+ {
+ "epoch": 3.0458628841607567,
+ "grad_norm": 2.9551615715026855,
+ "learning_rate": 2.4706808042327678e-06,
+ "loss": 0.3524,
+ "step": 6442
+ },
+ {
+ "epoch": 3.046335697399527,
+ "grad_norm": 2.8639965057373047,
+ "learning_rate": 2.4700570210771115e-06,
+ "loss": 0.3886,
+ "step": 6443
+ },
+ {
+ "epoch": 3.046808510638298,
+ "grad_norm": 2.718219757080078,
+ "learning_rate": 2.4694332397858738e-06,
+ "loss": 0.3693,
+ "step": 6444
+ },
+ {
+ "epoch": 3.0472813238770686,
+ "grad_norm": 3.050135612487793,
+ "learning_rate": 2.4688094603978933e-06,
+ "loss": 0.3979,
+ "step": 6445
+ },
+ {
+ "epoch": 3.0477541371158394,
+ "grad_norm": 2.786186456680298,
+ "learning_rate": 2.468185682952013e-06,
+ "loss": 0.3809,
+ "step": 6446
+ },
+ {
+ "epoch": 3.0482269503546098,
+ "grad_norm": 2.6462252140045166,
+ "learning_rate": 2.4675619074870697e-06,
+ "loss": 0.3746,
+ "step": 6447
+ },
+ {
+ "epoch": 3.0486997635933806,
+ "grad_norm": 2.984783887863159,
+ "learning_rate": 2.4669381340419037e-06,
+ "loss": 0.4092,
+ "step": 6448
+ },
+ {
+ "epoch": 3.0491725768321514,
+ "grad_norm": 2.936380624771118,
+ "learning_rate": 2.466314362655356e-06,
+ "loss": 0.4335,
+ "step": 6449
+ },
+ {
+ "epoch": 3.049645390070922,
+ "grad_norm": 2.730738639831543,
+ "learning_rate": 2.465690593366264e-06,
+ "loss": 0.364,
+ "step": 6450
+ },
+ {
+ "epoch": 3.0501182033096925,
+ "grad_norm": 2.7273590564727783,
+ "learning_rate": 2.4650668262134693e-06,
+ "loss": 0.3905,
+ "step": 6451
+ },
+ {
+ "epoch": 3.0505910165484633,
+ "grad_norm": 2.9588208198547363,
+ "learning_rate": 2.4644430612358105e-06,
+ "loss": 0.3936,
+ "step": 6452
+ },
+ {
+ "epoch": 3.051063829787234,
+ "grad_norm": 2.8721611499786377,
+ "learning_rate": 2.4638192984721247e-06,
+ "loss": 0.4279,
+ "step": 6453
+ },
+ {
+ "epoch": 3.051536643026005,
+ "grad_norm": 3.7179651260375977,
+ "learning_rate": 2.463195537961254e-06,
+ "loss": 0.427,
+ "step": 6454
+ },
+ {
+ "epoch": 3.0520094562647753,
+ "grad_norm": 2.651731491088867,
+ "learning_rate": 2.4625717797420353e-06,
+ "loss": 0.3471,
+ "step": 6455
+ },
+ {
+ "epoch": 3.052482269503546,
+ "grad_norm": 3.898737668991089,
+ "learning_rate": 2.4619480238533085e-06,
+ "loss": 0.4574,
+ "step": 6456
+ },
+ {
+ "epoch": 3.052955082742317,
+ "grad_norm": 2.916252374649048,
+ "learning_rate": 2.4613242703339108e-06,
+ "loss": 0.3622,
+ "step": 6457
+ },
+ {
+ "epoch": 3.0534278959810877,
+ "grad_norm": 3.122565507888794,
+ "learning_rate": 2.4607005192226806e-06,
+ "loss": 0.3954,
+ "step": 6458
+ },
+ {
+ "epoch": 3.053900709219858,
+ "grad_norm": 3.2377424240112305,
+ "learning_rate": 2.4600767705584575e-06,
+ "loss": 0.4082,
+ "step": 6459
+ },
+ {
+ "epoch": 3.054373522458629,
+ "grad_norm": 2.941102981567383,
+ "learning_rate": 2.459453024380079e-06,
+ "loss": 0.4324,
+ "step": 6460
+ },
+ {
+ "epoch": 3.0548463356973996,
+ "grad_norm": 2.964313507080078,
+ "learning_rate": 2.4588292807263816e-06,
+ "loss": 0.3037,
+ "step": 6461
+ },
+ {
+ "epoch": 3.0553191489361704,
+ "grad_norm": 2.824669599533081,
+ "learning_rate": 2.4582055396362055e-06,
+ "loss": 0.4076,
+ "step": 6462
+ },
+ {
+ "epoch": 3.0557919621749408,
+ "grad_norm": 2.7739884853363037,
+ "learning_rate": 2.457581801148387e-06,
+ "loss": 0.3615,
+ "step": 6463
+ },
+ {
+ "epoch": 3.0562647754137116,
+ "grad_norm": 3.2974464893341064,
+ "learning_rate": 2.456958065301764e-06,
+ "loss": 0.426,
+ "step": 6464
+ },
+ {
+ "epoch": 3.0567375886524824,
+ "grad_norm": 3.0801217555999756,
+ "learning_rate": 2.456334332135174e-06,
+ "loss": 0.3737,
+ "step": 6465
+ },
+ {
+ "epoch": 3.057210401891253,
+ "grad_norm": 2.788851022720337,
+ "learning_rate": 2.455710601687452e-06,
+ "loss": 0.4367,
+ "step": 6466
+ },
+ {
+ "epoch": 3.0576832151300235,
+ "grad_norm": 2.8078136444091797,
+ "learning_rate": 2.4550868739974378e-06,
+ "loss": 0.3796,
+ "step": 6467
+ },
+ {
+ "epoch": 3.0581560283687943,
+ "grad_norm": 2.9871349334716797,
+ "learning_rate": 2.4544631491039657e-06,
+ "loss": 0.3869,
+ "step": 6468
+ },
+ {
+ "epoch": 3.058628841607565,
+ "grad_norm": 2.9170174598693848,
+ "learning_rate": 2.453839427045875e-06,
+ "loss": 0.4591,
+ "step": 6469
+ },
+ {
+ "epoch": 3.059101654846336,
+ "grad_norm": 2.7316131591796875,
+ "learning_rate": 2.4532157078620013e-06,
+ "loss": 0.3723,
+ "step": 6470
+ },
+ {
+ "epoch": 3.0595744680851062,
+ "grad_norm": 3.047921657562256,
+ "learning_rate": 2.4525919915911793e-06,
+ "loss": 0.3804,
+ "step": 6471
+ },
+ {
+ "epoch": 3.060047281323877,
+ "grad_norm": 3.047934055328369,
+ "learning_rate": 2.4519682782722465e-06,
+ "loss": 0.3949,
+ "step": 6472
+ },
+ {
+ "epoch": 3.060520094562648,
+ "grad_norm": 2.4911186695098877,
+ "learning_rate": 2.4513445679440374e-06,
+ "loss": 0.3629,
+ "step": 6473
+ },
+ {
+ "epoch": 3.0609929078014186,
+ "grad_norm": 2.5353519916534424,
+ "learning_rate": 2.4507208606453895e-06,
+ "loss": 0.3417,
+ "step": 6474
+ },
+ {
+ "epoch": 3.061465721040189,
+ "grad_norm": 2.474622964859009,
+ "learning_rate": 2.4500971564151384e-06,
+ "loss": 0.3468,
+ "step": 6475
+ },
+ {
+ "epoch": 3.06193853427896,
+ "grad_norm": 2.7016963958740234,
+ "learning_rate": 2.4494734552921166e-06,
+ "loss": 0.3872,
+ "step": 6476
+ },
+ {
+ "epoch": 3.0624113475177306,
+ "grad_norm": 2.912144184112549,
+ "learning_rate": 2.4488497573151625e-06,
+ "loss": 0.3727,
+ "step": 6477
+ },
+ {
+ "epoch": 3.0628841607565014,
+ "grad_norm": 2.8234877586364746,
+ "learning_rate": 2.4482260625231093e-06,
+ "loss": 0.3472,
+ "step": 6478
+ },
+ {
+ "epoch": 3.0633569739952717,
+ "grad_norm": 2.6554179191589355,
+ "learning_rate": 2.447602370954793e-06,
+ "loss": 0.343,
+ "step": 6479
+ },
+ {
+ "epoch": 3.0638297872340425,
+ "grad_norm": 2.666419744491577,
+ "learning_rate": 2.446978682649047e-06,
+ "loss": 0.3932,
+ "step": 6480
+ },
+ {
+ "epoch": 3.0643026004728133,
+ "grad_norm": 2.968574285507202,
+ "learning_rate": 2.446354997644705e-06,
+ "loss": 0.4418,
+ "step": 6481
+ },
+ {
+ "epoch": 3.064775413711584,
+ "grad_norm": 2.692253589630127,
+ "learning_rate": 2.4457313159806028e-06,
+ "loss": 0.3141,
+ "step": 6482
+ },
+ {
+ "epoch": 3.0652482269503545,
+ "grad_norm": 2.5857295989990234,
+ "learning_rate": 2.445107637695574e-06,
+ "loss": 0.3392,
+ "step": 6483
+ },
+ {
+ "epoch": 3.0657210401891253,
+ "grad_norm": 3.2332825660705566,
+ "learning_rate": 2.4444839628284504e-06,
+ "loss": 0.4694,
+ "step": 6484
+ },
+ {
+ "epoch": 3.066193853427896,
+ "grad_norm": 2.7391014099121094,
+ "learning_rate": 2.4438602914180684e-06,
+ "loss": 0.3966,
+ "step": 6485
+ },
+ {
+ "epoch": 3.066666666666667,
+ "grad_norm": 2.7882139682769775,
+ "learning_rate": 2.4432366235032593e-06,
+ "loss": 0.3552,
+ "step": 6486
+ },
+ {
+ "epoch": 3.0671394799054372,
+ "grad_norm": 2.8907811641693115,
+ "learning_rate": 2.4426129591228573e-06,
+ "loss": 0.4478,
+ "step": 6487
+ },
+ {
+ "epoch": 3.067612293144208,
+ "grad_norm": 2.878929853439331,
+ "learning_rate": 2.4419892983156947e-06,
+ "loss": 0.3457,
+ "step": 6488
+ },
+ {
+ "epoch": 3.068085106382979,
+ "grad_norm": 2.7087442874908447,
+ "learning_rate": 2.441365641120603e-06,
+ "loss": 0.3491,
+ "step": 6489
+ },
+ {
+ "epoch": 3.0685579196217496,
+ "grad_norm": 3.2330431938171387,
+ "learning_rate": 2.4407419875764167e-06,
+ "loss": 0.3901,
+ "step": 6490
+ },
+ {
+ "epoch": 3.06903073286052,
+ "grad_norm": 3.0529370307922363,
+ "learning_rate": 2.440118337721966e-06,
+ "loss": 0.4059,
+ "step": 6491
+ },
+ {
+ "epoch": 3.0695035460992908,
+ "grad_norm": 2.4786794185638428,
+ "learning_rate": 2.439494691596085e-06,
+ "loss": 0.3153,
+ "step": 6492
+ },
+ {
+ "epoch": 3.0699763593380616,
+ "grad_norm": 2.956310510635376,
+ "learning_rate": 2.438871049237604e-06,
+ "loss": 0.3973,
+ "step": 6493
+ },
+ {
+ "epoch": 3.0704491725768324,
+ "grad_norm": 3.0816991329193115,
+ "learning_rate": 2.4382474106853543e-06,
+ "loss": 0.388,
+ "step": 6494
+ },
+ {
+ "epoch": 3.0709219858156027,
+ "grad_norm": 2.6103477478027344,
+ "learning_rate": 2.4376237759781686e-06,
+ "loss": 0.3656,
+ "step": 6495
+ },
+ {
+ "epoch": 3.0713947990543735,
+ "grad_norm": 2.974076271057129,
+ "learning_rate": 2.437000145154875e-06,
+ "loss": 0.3246,
+ "step": 6496
+ },
+ {
+ "epoch": 3.0718676122931443,
+ "grad_norm": 2.633605718612671,
+ "learning_rate": 2.4363765182543075e-06,
+ "loss": 0.3556,
+ "step": 6497
+ },
+ {
+ "epoch": 3.072340425531915,
+ "grad_norm": 2.49161434173584,
+ "learning_rate": 2.4357528953152953e-06,
+ "loss": 0.3506,
+ "step": 6498
+ },
+ {
+ "epoch": 3.0728132387706855,
+ "grad_norm": 2.6435935497283936,
+ "learning_rate": 2.4351292763766676e-06,
+ "loss": 0.3652,
+ "step": 6499
+ },
+ {
+ "epoch": 3.0732860520094563,
+ "grad_norm": 2.9710617065429688,
+ "learning_rate": 2.4345056614772563e-06,
+ "loss": 0.3713,
+ "step": 6500
+ },
+ {
+ "epoch": 3.073758865248227,
+ "grad_norm": 2.6947052478790283,
+ "learning_rate": 2.43388205065589e-06,
+ "loss": 0.378,
+ "step": 6501
+ },
+ {
+ "epoch": 3.0742316784869974,
+ "grad_norm": 2.9686238765716553,
+ "learning_rate": 2.433258443951398e-06,
+ "loss": 0.3936,
+ "step": 6502
+ },
+ {
+ "epoch": 3.074704491725768,
+ "grad_norm": 2.6008691787719727,
+ "learning_rate": 2.432634841402611e-06,
+ "loss": 0.3709,
+ "step": 6503
+ },
+ {
+ "epoch": 3.075177304964539,
+ "grad_norm": 2.595116376876831,
+ "learning_rate": 2.4320112430483563e-06,
+ "loss": 0.3884,
+ "step": 6504
+ },
+ {
+ "epoch": 3.07565011820331,
+ "grad_norm": 2.685241460800171,
+ "learning_rate": 2.431387648927464e-06,
+ "loss": 0.3751,
+ "step": 6505
+ },
+ {
+ "epoch": 3.0761229314420806,
+ "grad_norm": 2.8863797187805176,
+ "learning_rate": 2.430764059078762e-06,
+ "loss": 0.3765,
+ "step": 6506
+ },
+ {
+ "epoch": 3.076595744680851,
+ "grad_norm": 3.020766019821167,
+ "learning_rate": 2.430140473541077e-06,
+ "loss": 0.362,
+ "step": 6507
+ },
+ {
+ "epoch": 3.0770685579196217,
+ "grad_norm": 2.9521167278289795,
+ "learning_rate": 2.42951689235324e-06,
+ "loss": 0.41,
+ "step": 6508
+ },
+ {
+ "epoch": 3.0775413711583925,
+ "grad_norm": 2.5844924449920654,
+ "learning_rate": 2.4288933155540757e-06,
+ "loss": 0.3258,
+ "step": 6509
+ },
+ {
+ "epoch": 3.078014184397163,
+ "grad_norm": 3.052661657333374,
+ "learning_rate": 2.4282697431824138e-06,
+ "loss": 0.363,
+ "step": 6510
+ },
+ {
+ "epoch": 3.0784869976359337,
+ "grad_norm": 3.109342575073242,
+ "learning_rate": 2.427646175277081e-06,
+ "loss": 0.4105,
+ "step": 6511
+ },
+ {
+ "epoch": 3.0789598108747045,
+ "grad_norm": 3.3141326904296875,
+ "learning_rate": 2.427022611876903e-06,
+ "loss": 0.405,
+ "step": 6512
+ },
+ {
+ "epoch": 3.0794326241134753,
+ "grad_norm": 3.054673194885254,
+ "learning_rate": 2.426399053020707e-06,
+ "loss": 0.3532,
+ "step": 6513
+ },
+ {
+ "epoch": 3.079905437352246,
+ "grad_norm": 2.823489189147949,
+ "learning_rate": 2.425775498747318e-06,
+ "loss": 0.3762,
+ "step": 6514
+ },
+ {
+ "epoch": 3.0803782505910164,
+ "grad_norm": 2.6739792823791504,
+ "learning_rate": 2.425151949095565e-06,
+ "loss": 0.4044,
+ "step": 6515
+ },
+ {
+ "epoch": 3.0808510638297872,
+ "grad_norm": 2.7313177585601807,
+ "learning_rate": 2.4245284041042714e-06,
+ "loss": 0.3136,
+ "step": 6516
+ },
+ {
+ "epoch": 3.081323877068558,
+ "grad_norm": 3.1661181449890137,
+ "learning_rate": 2.4239048638122624e-06,
+ "loss": 0.44,
+ "step": 6517
+ },
+ {
+ "epoch": 3.0817966903073284,
+ "grad_norm": 3.326542377471924,
+ "learning_rate": 2.4232813282583647e-06,
+ "loss": 0.3798,
+ "step": 6518
+ },
+ {
+ "epoch": 3.082269503546099,
+ "grad_norm": 3.0194952487945557,
+ "learning_rate": 2.422657797481402e-06,
+ "loss": 0.423,
+ "step": 6519
+ },
+ {
+ "epoch": 3.08274231678487,
+ "grad_norm": 2.6704318523406982,
+ "learning_rate": 2.4220342715201995e-06,
+ "loss": 0.41,
+ "step": 6520
+ },
+ {
+ "epoch": 3.083215130023641,
+ "grad_norm": 3.057990312576294,
+ "learning_rate": 2.421410750413581e-06,
+ "loss": 0.4096,
+ "step": 6521
+ },
+ {
+ "epoch": 3.083687943262411,
+ "grad_norm": 2.6242079734802246,
+ "learning_rate": 2.4207872342003693e-06,
+ "loss": 0.3673,
+ "step": 6522
+ },
+ {
+ "epoch": 3.084160756501182,
+ "grad_norm": 2.933910846710205,
+ "learning_rate": 2.4201637229193904e-06,
+ "loss": 0.4018,
+ "step": 6523
+ },
+ {
+ "epoch": 3.0846335697399527,
+ "grad_norm": 2.6973681449890137,
+ "learning_rate": 2.4195402166094657e-06,
+ "loss": 0.3533,
+ "step": 6524
+ },
+ {
+ "epoch": 3.0851063829787235,
+ "grad_norm": 3.096013307571411,
+ "learning_rate": 2.4189167153094194e-06,
+ "loss": 0.3872,
+ "step": 6525
+ },
+ {
+ "epoch": 3.085579196217494,
+ "grad_norm": 3.0707414150238037,
+ "learning_rate": 2.4182932190580737e-06,
+ "loss": 0.3775,
+ "step": 6526
+ },
+ {
+ "epoch": 3.0860520094562647,
+ "grad_norm": 2.873190402984619,
+ "learning_rate": 2.417669727894251e-06,
+ "loss": 0.3144,
+ "step": 6527
+ },
+ {
+ "epoch": 3.0865248226950355,
+ "grad_norm": 2.316431999206543,
+ "learning_rate": 2.4170462418567732e-06,
+ "loss": 0.3238,
+ "step": 6528
+ },
+ {
+ "epoch": 3.0869976359338063,
+ "grad_norm": 2.3672494888305664,
+ "learning_rate": 2.4164227609844626e-06,
+ "loss": 0.3585,
+ "step": 6529
+ },
+ {
+ "epoch": 3.0874704491725766,
+ "grad_norm": 2.904538154602051,
+ "learning_rate": 2.415799285316139e-06,
+ "loss": 0.366,
+ "step": 6530
+ },
+ {
+ "epoch": 3.0879432624113474,
+ "grad_norm": 2.914602279663086,
+ "learning_rate": 2.415175814890626e-06,
+ "loss": 0.3793,
+ "step": 6531
+ },
+ {
+ "epoch": 3.088416075650118,
+ "grad_norm": 2.652005672454834,
+ "learning_rate": 2.4145523497467417e-06,
+ "loss": 0.362,
+ "step": 6532
+ },
+ {
+ "epoch": 3.088888888888889,
+ "grad_norm": 2.5137813091278076,
+ "learning_rate": 2.413928889923309e-06,
+ "loss": 0.2974,
+ "step": 6533
+ },
+ {
+ "epoch": 3.0893617021276594,
+ "grad_norm": 3.2166645526885986,
+ "learning_rate": 2.413305435459147e-06,
+ "loss": 0.4151,
+ "step": 6534
+ },
+ {
+ "epoch": 3.08983451536643,
+ "grad_norm": 3.0506820678710938,
+ "learning_rate": 2.412681986393075e-06,
+ "loss": 0.4223,
+ "step": 6535
+ },
+ {
+ "epoch": 3.090307328605201,
+ "grad_norm": 3.035275936126709,
+ "learning_rate": 2.412058542763913e-06,
+ "loss": 0.4841,
+ "step": 6536
+ },
+ {
+ "epoch": 3.0907801418439718,
+ "grad_norm": 3.3195009231567383,
+ "learning_rate": 2.4114351046104793e-06,
+ "loss": 0.4205,
+ "step": 6537
+ },
+ {
+ "epoch": 3.091252955082742,
+ "grad_norm": 2.8700361251831055,
+ "learning_rate": 2.410811671971594e-06,
+ "loss": 0.3704,
+ "step": 6538
+ },
+ {
+ "epoch": 3.091725768321513,
+ "grad_norm": 2.900595188140869,
+ "learning_rate": 2.410188244886075e-06,
+ "loss": 0.4184,
+ "step": 6539
+ },
+ {
+ "epoch": 3.0921985815602837,
+ "grad_norm": 2.88179349899292,
+ "learning_rate": 2.409564823392739e-06,
+ "loss": 0.4156,
+ "step": 6540
+ },
+ {
+ "epoch": 3.0926713947990545,
+ "grad_norm": 2.677568197250366,
+ "learning_rate": 2.408941407530406e-06,
+ "loss": 0.4084,
+ "step": 6541
+ },
+ {
+ "epoch": 3.093144208037825,
+ "grad_norm": 3.0236027240753174,
+ "learning_rate": 2.408317997337892e-06,
+ "loss": 0.4384,
+ "step": 6542
+ },
+ {
+ "epoch": 3.0936170212765957,
+ "grad_norm": 3.1708545684814453,
+ "learning_rate": 2.4076945928540143e-06,
+ "loss": 0.3876,
+ "step": 6543
+ },
+ {
+ "epoch": 3.0940898345153665,
+ "grad_norm": 3.248821973800659,
+ "learning_rate": 2.40707119411759e-06,
+ "loss": 0.3865,
+ "step": 6544
+ },
+ {
+ "epoch": 3.0945626477541373,
+ "grad_norm": 3.0961649417877197,
+ "learning_rate": 2.4064478011674334e-06,
+ "loss": 0.3982,
+ "step": 6545
+ },
+ {
+ "epoch": 3.0950354609929076,
+ "grad_norm": 3.1989805698394775,
+ "learning_rate": 2.4058244140423637e-06,
+ "loss": 0.4777,
+ "step": 6546
+ },
+ {
+ "epoch": 3.0955082742316784,
+ "grad_norm": 2.805640459060669,
+ "learning_rate": 2.4052010327811933e-06,
+ "loss": 0.3764,
+ "step": 6547
+ },
+ {
+ "epoch": 3.095981087470449,
+ "grad_norm": 2.7225050926208496,
+ "learning_rate": 2.40457765742274e-06,
+ "loss": 0.3286,
+ "step": 6548
+ },
+ {
+ "epoch": 3.09645390070922,
+ "grad_norm": 3.119915008544922,
+ "learning_rate": 2.4039542880058174e-06,
+ "loss": 0.4463,
+ "step": 6549
+ },
+ {
+ "epoch": 3.0969267139479904,
+ "grad_norm": 2.8503530025482178,
+ "learning_rate": 2.4033309245692403e-06,
+ "loss": 0.395,
+ "step": 6550
+ },
+ {
+ "epoch": 3.097399527186761,
+ "grad_norm": 2.947504758834839,
+ "learning_rate": 2.4027075671518225e-06,
+ "loss": 0.4024,
+ "step": 6551
+ },
+ {
+ "epoch": 3.097872340425532,
+ "grad_norm": 3.170905113220215,
+ "learning_rate": 2.402084215792377e-06,
+ "loss": 0.4302,
+ "step": 6552
+ },
+ {
+ "epoch": 3.0983451536643027,
+ "grad_norm": 2.910475492477417,
+ "learning_rate": 2.4014608705297195e-06,
+ "loss": 0.4037,
+ "step": 6553
+ },
+ {
+ "epoch": 3.098817966903073,
+ "grad_norm": 2.627511978149414,
+ "learning_rate": 2.400837531402661e-06,
+ "loss": 0.3972,
+ "step": 6554
+ },
+ {
+ "epoch": 3.099290780141844,
+ "grad_norm": 2.6485681533813477,
+ "learning_rate": 2.4002141984500133e-06,
+ "loss": 0.4044,
+ "step": 6555
+ },
+ {
+ "epoch": 3.0997635933806147,
+ "grad_norm": 2.930954694747925,
+ "learning_rate": 2.399590871710592e-06,
+ "loss": 0.4214,
+ "step": 6556
+ },
+ {
+ "epoch": 3.1002364066193855,
+ "grad_norm": 2.6014554500579834,
+ "learning_rate": 2.3989675512232063e-06,
+ "loss": 0.3493,
+ "step": 6557
+ },
+ {
+ "epoch": 3.100709219858156,
+ "grad_norm": 2.899001121520996,
+ "learning_rate": 2.398344237026667e-06,
+ "loss": 0.382,
+ "step": 6558
+ },
+ {
+ "epoch": 3.1011820330969266,
+ "grad_norm": 2.4698870182037354,
+ "learning_rate": 2.3977209291597876e-06,
+ "loss": 0.3558,
+ "step": 6559
+ },
+ {
+ "epoch": 3.1016548463356974,
+ "grad_norm": 3.2926251888275146,
+ "learning_rate": 2.3970976276613763e-06,
+ "loss": 0.4078,
+ "step": 6560
+ },
+ {
+ "epoch": 3.1021276595744682,
+ "grad_norm": 2.5306150913238525,
+ "learning_rate": 2.3964743325702454e-06,
+ "loss": 0.3657,
+ "step": 6561
+ },
+ {
+ "epoch": 3.1026004728132386,
+ "grad_norm": 2.727583408355713,
+ "learning_rate": 2.395851043925204e-06,
+ "loss": 0.3791,
+ "step": 6562
+ },
+ {
+ "epoch": 3.1030732860520094,
+ "grad_norm": 3.1403541564941406,
+ "learning_rate": 2.3952277617650602e-06,
+ "loss": 0.3934,
+ "step": 6563
+ },
+ {
+ "epoch": 3.10354609929078,
+ "grad_norm": 2.5816383361816406,
+ "learning_rate": 2.3946044861286256e-06,
+ "loss": 0.3703,
+ "step": 6564
+ },
+ {
+ "epoch": 3.104018912529551,
+ "grad_norm": 2.5742220878601074,
+ "learning_rate": 2.3939812170547067e-06,
+ "loss": 0.3628,
+ "step": 6565
+ },
+ {
+ "epoch": 3.1044917257683213,
+ "grad_norm": 2.7276530265808105,
+ "learning_rate": 2.393357954582113e-06,
+ "loss": 0.3789,
+ "step": 6566
+ },
+ {
+ "epoch": 3.104964539007092,
+ "grad_norm": 3.05595064163208,
+ "learning_rate": 2.3927346987496515e-06,
+ "loss": 0.3766,
+ "step": 6567
+ },
+ {
+ "epoch": 3.105437352245863,
+ "grad_norm": 2.786970615386963,
+ "learning_rate": 2.39211144959613e-06,
+ "loss": 0.3329,
+ "step": 6568
+ },
+ {
+ "epoch": 3.1059101654846337,
+ "grad_norm": 3.499018430709839,
+ "learning_rate": 2.391488207160356e-06,
+ "loss": 0.4175,
+ "step": 6569
+ },
+ {
+ "epoch": 3.106382978723404,
+ "grad_norm": 2.969735860824585,
+ "learning_rate": 2.3908649714811346e-06,
+ "loss": 0.3893,
+ "step": 6570
+ },
+ {
+ "epoch": 3.106855791962175,
+ "grad_norm": 3.1494929790496826,
+ "learning_rate": 2.3902417425972734e-06,
+ "loss": 0.4048,
+ "step": 6571
+ },
+ {
+ "epoch": 3.1073286052009457,
+ "grad_norm": 2.6393489837646484,
+ "learning_rate": 2.3896185205475782e-06,
+ "loss": 0.3216,
+ "step": 6572
+ },
+ {
+ "epoch": 3.1078014184397165,
+ "grad_norm": 3.6984152793884277,
+ "learning_rate": 2.3889953053708528e-06,
+ "loss": 0.3646,
+ "step": 6573
+ },
+ {
+ "epoch": 3.108274231678487,
+ "grad_norm": 3.518547534942627,
+ "learning_rate": 2.388372097105903e-06,
+ "loss": 0.3627,
+ "step": 6574
+ },
+ {
+ "epoch": 3.1087470449172576,
+ "grad_norm": 3.422043800354004,
+ "learning_rate": 2.3877488957915333e-06,
+ "loss": 0.4116,
+ "step": 6575
+ },
+ {
+ "epoch": 3.1092198581560284,
+ "grad_norm": 2.8088064193725586,
+ "learning_rate": 2.3871257014665486e-06,
+ "loss": 0.3477,
+ "step": 6576
+ },
+ {
+ "epoch": 3.109692671394799,
+ "grad_norm": 2.7877607345581055,
+ "learning_rate": 2.3865025141697513e-06,
+ "loss": 0.351,
+ "step": 6577
+ },
+ {
+ "epoch": 3.1101654846335696,
+ "grad_norm": 2.9446799755096436,
+ "learning_rate": 2.3858793339399433e-06,
+ "loss": 0.4025,
+ "step": 6578
+ },
+ {
+ "epoch": 3.1106382978723404,
+ "grad_norm": 2.886584758758545,
+ "learning_rate": 2.3852561608159304e-06,
+ "loss": 0.3765,
+ "step": 6579
+ },
+ {
+ "epoch": 3.111111111111111,
+ "grad_norm": 3.45711088180542,
+ "learning_rate": 2.384632994836513e-06,
+ "loss": 0.3744,
+ "step": 6580
+ },
+ {
+ "epoch": 3.111583924349882,
+ "grad_norm": 2.737441301345825,
+ "learning_rate": 2.3840098360404916e-06,
+ "loss": 0.4048,
+ "step": 6581
+ },
+ {
+ "epoch": 3.1120567375886523,
+ "grad_norm": 2.742567300796509,
+ "learning_rate": 2.383386684466671e-06,
+ "loss": 0.3717,
+ "step": 6582
+ },
+ {
+ "epoch": 3.112529550827423,
+ "grad_norm": 3.017970561981201,
+ "learning_rate": 2.382763540153849e-06,
+ "loss": 0.3922,
+ "step": 6583
+ },
+ {
+ "epoch": 3.113002364066194,
+ "grad_norm": 3.132004499435425,
+ "learning_rate": 2.3821404031408283e-06,
+ "loss": 0.3969,
+ "step": 6584
+ },
+ {
+ "epoch": 3.1134751773049647,
+ "grad_norm": 2.910820245742798,
+ "learning_rate": 2.3815172734664075e-06,
+ "loss": 0.4241,
+ "step": 6585
+ },
+ {
+ "epoch": 3.113947990543735,
+ "grad_norm": 3.0029842853546143,
+ "learning_rate": 2.380894151169386e-06,
+ "loss": 0.4007,
+ "step": 6586
+ },
+ {
+ "epoch": 3.114420803782506,
+ "grad_norm": 3.0309178829193115,
+ "learning_rate": 2.380271036288564e-06,
+ "loss": 0.3876,
+ "step": 6587
+ },
+ {
+ "epoch": 3.1148936170212767,
+ "grad_norm": 2.963204860687256,
+ "learning_rate": 2.379647928862739e-06,
+ "loss": 0.4017,
+ "step": 6588
+ },
+ {
+ "epoch": 3.1153664302600474,
+ "grad_norm": 3.0127944946289062,
+ "learning_rate": 2.3790248289307103e-06,
+ "loss": 0.3651,
+ "step": 6589
+ },
+ {
+ "epoch": 3.115839243498818,
+ "grad_norm": 2.557485580444336,
+ "learning_rate": 2.3784017365312755e-06,
+ "loss": 0.3419,
+ "step": 6590
+ },
+ {
+ "epoch": 3.1163120567375886,
+ "grad_norm": 2.8577969074249268,
+ "learning_rate": 2.3777786517032306e-06,
+ "loss": 0.372,
+ "step": 6591
+ },
+ {
+ "epoch": 3.1167848699763594,
+ "grad_norm": 2.450324058532715,
+ "learning_rate": 2.3771555744853735e-06,
+ "loss": 0.3442,
+ "step": 6592
+ },
+ {
+ "epoch": 3.11725768321513,
+ "grad_norm": 2.7939295768737793,
+ "learning_rate": 2.3765325049164996e-06,
+ "loss": 0.401,
+ "step": 6593
+ },
+ {
+ "epoch": 3.1177304964539005,
+ "grad_norm": 2.9690325260162354,
+ "learning_rate": 2.3759094430354056e-06,
+ "loss": 0.3962,
+ "step": 6594
+ },
+ {
+ "epoch": 3.1182033096926713,
+ "grad_norm": 2.7630631923675537,
+ "learning_rate": 2.375286388880887e-06,
+ "loss": 0.4126,
+ "step": 6595
+ },
+ {
+ "epoch": 3.118676122931442,
+ "grad_norm": 2.6259944438934326,
+ "learning_rate": 2.3746633424917366e-06,
+ "loss": 0.3285,
+ "step": 6596
+ },
+ {
+ "epoch": 3.119148936170213,
+ "grad_norm": 2.7107701301574707,
+ "learning_rate": 2.3740403039067516e-06,
+ "loss": 0.3636,
+ "step": 6597
+ },
+ {
+ "epoch": 3.1196217494089833,
+ "grad_norm": 2.985301971435547,
+ "learning_rate": 2.373417273164724e-06,
+ "loss": 0.3928,
+ "step": 6598
+ },
+ {
+ "epoch": 3.120094562647754,
+ "grad_norm": 3.2578976154327393,
+ "learning_rate": 2.3727942503044483e-06,
+ "loss": 0.3379,
+ "step": 6599
+ },
+ {
+ "epoch": 3.120567375886525,
+ "grad_norm": 3.1681406497955322,
+ "learning_rate": 2.372171235364717e-06,
+ "loss": 0.4023,
+ "step": 6600
+ },
+ {
+ "epoch": 3.1210401891252957,
+ "grad_norm": 3.120147705078125,
+ "learning_rate": 2.371548228384321e-06,
+ "loss": 0.4228,
+ "step": 6601
+ },
+ {
+ "epoch": 3.121513002364066,
+ "grad_norm": 2.7786099910736084,
+ "learning_rate": 2.3709252294020547e-06,
+ "loss": 0.4386,
+ "step": 6602
+ },
+ {
+ "epoch": 3.121985815602837,
+ "grad_norm": 2.698849678039551,
+ "learning_rate": 2.3703022384567086e-06,
+ "loss": 0.3861,
+ "step": 6603
+ },
+ {
+ "epoch": 3.1224586288416076,
+ "grad_norm": 2.7917959690093994,
+ "learning_rate": 2.3696792555870724e-06,
+ "loss": 0.3535,
+ "step": 6604
+ },
+ {
+ "epoch": 3.1229314420803784,
+ "grad_norm": 2.8249263763427734,
+ "learning_rate": 2.3690562808319385e-06,
+ "loss": 0.3415,
+ "step": 6605
+ },
+ {
+ "epoch": 3.123404255319149,
+ "grad_norm": 2.567458391189575,
+ "learning_rate": 2.368433314230095e-06,
+ "loss": 0.3827,
+ "step": 6606
+ },
+ {
+ "epoch": 3.1238770685579196,
+ "grad_norm": 2.9670443534851074,
+ "learning_rate": 2.3678103558203328e-06,
+ "loss": 0.4238,
+ "step": 6607
+ },
+ {
+ "epoch": 3.1243498817966904,
+ "grad_norm": 2.6893439292907715,
+ "learning_rate": 2.36718740564144e-06,
+ "loss": 0.3461,
+ "step": 6608
+ },
+ {
+ "epoch": 3.124822695035461,
+ "grad_norm": 3.2669708728790283,
+ "learning_rate": 2.3665644637322044e-06,
+ "loss": 0.3992,
+ "step": 6609
+ },
+ {
+ "epoch": 3.1252955082742315,
+ "grad_norm": 2.889340400695801,
+ "learning_rate": 2.3659415301314152e-06,
+ "loss": 0.3829,
+ "step": 6610
+ },
+ {
+ "epoch": 3.1257683215130023,
+ "grad_norm": 2.625603199005127,
+ "learning_rate": 2.3653186048778584e-06,
+ "loss": 0.3559,
+ "step": 6611
+ },
+ {
+ "epoch": 3.126241134751773,
+ "grad_norm": 2.8128650188446045,
+ "learning_rate": 2.3646956880103224e-06,
+ "loss": 0.4035,
+ "step": 6612
+ },
+ {
+ "epoch": 3.126713947990544,
+ "grad_norm": 3.1887412071228027,
+ "learning_rate": 2.3640727795675925e-06,
+ "loss": 0.3938,
+ "step": 6613
+ },
+ {
+ "epoch": 3.1271867612293143,
+ "grad_norm": 2.886514186859131,
+ "learning_rate": 2.363449879588454e-06,
+ "loss": 0.3504,
+ "step": 6614
+ },
+ {
+ "epoch": 3.127659574468085,
+ "grad_norm": 3.2149860858917236,
+ "learning_rate": 2.3628269881116937e-06,
+ "loss": 0.4137,
+ "step": 6615
+ },
+ {
+ "epoch": 3.128132387706856,
+ "grad_norm": 3.3155312538146973,
+ "learning_rate": 2.362204105176094e-06,
+ "loss": 0.3811,
+ "step": 6616
+ },
+ {
+ "epoch": 3.1286052009456267,
+ "grad_norm": 2.6228792667388916,
+ "learning_rate": 2.3615812308204415e-06,
+ "loss": 0.3511,
+ "step": 6617
+ },
+ {
+ "epoch": 3.129078014184397,
+ "grad_norm": 2.7686524391174316,
+ "learning_rate": 2.3609583650835187e-06,
+ "loss": 0.3722,
+ "step": 6618
+ },
+ {
+ "epoch": 3.129550827423168,
+ "grad_norm": 3.396368980407715,
+ "learning_rate": 2.3603355080041083e-06,
+ "loss": 0.4678,
+ "step": 6619
+ },
+ {
+ "epoch": 3.1300236406619386,
+ "grad_norm": 2.7329437732696533,
+ "learning_rate": 2.359712659620994e-06,
+ "loss": 0.3775,
+ "step": 6620
+ },
+ {
+ "epoch": 3.1304964539007094,
+ "grad_norm": 2.7633914947509766,
+ "learning_rate": 2.3590898199729567e-06,
+ "loss": 0.3306,
+ "step": 6621
+ },
+ {
+ "epoch": 3.1309692671394798,
+ "grad_norm": 3.020887613296509,
+ "learning_rate": 2.3584669890987792e-06,
+ "loss": 0.4121,
+ "step": 6622
+ },
+ {
+ "epoch": 3.1314420803782506,
+ "grad_norm": 2.8912103176116943,
+ "learning_rate": 2.3578441670372414e-06,
+ "loss": 0.4297,
+ "step": 6623
+ },
+ {
+ "epoch": 3.1319148936170214,
+ "grad_norm": 3.0654027462005615,
+ "learning_rate": 2.3572213538271234e-06,
+ "loss": 0.3856,
+ "step": 6624
+ },
+ {
+ "epoch": 3.132387706855792,
+ "grad_norm": 3.1126575469970703,
+ "learning_rate": 2.356598549507206e-06,
+ "loss": 0.3886,
+ "step": 6625
+ },
+ {
+ "epoch": 3.1328605200945625,
+ "grad_norm": 2.7066447734832764,
+ "learning_rate": 2.3559757541162687e-06,
+ "loss": 0.4212,
+ "step": 6626
+ },
+ {
+ "epoch": 3.1333333333333333,
+ "grad_norm": 2.876338243484497,
+ "learning_rate": 2.355352967693088e-06,
+ "loss": 0.3607,
+ "step": 6627
+ },
+ {
+ "epoch": 3.133806146572104,
+ "grad_norm": 2.9011716842651367,
+ "learning_rate": 2.3547301902764454e-06,
+ "loss": 0.428,
+ "step": 6628
+ },
+ {
+ "epoch": 3.134278959810875,
+ "grad_norm": 2.805656909942627,
+ "learning_rate": 2.3541074219051163e-06,
+ "loss": 0.4038,
+ "step": 6629
+ },
+ {
+ "epoch": 3.1347517730496453,
+ "grad_norm": 2.89546275138855,
+ "learning_rate": 2.353484662617879e-06,
+ "loss": 0.3798,
+ "step": 6630
+ },
+ {
+ "epoch": 3.135224586288416,
+ "grad_norm": 3.0290539264678955,
+ "learning_rate": 2.352861912453508e-06,
+ "loss": 0.3916,
+ "step": 6631
+ },
+ {
+ "epoch": 3.135697399527187,
+ "grad_norm": 2.848393440246582,
+ "learning_rate": 2.352239171450781e-06,
+ "loss": 0.3423,
+ "step": 6632
+ },
+ {
+ "epoch": 3.1361702127659576,
+ "grad_norm": 2.871372938156128,
+ "learning_rate": 2.3516164396484737e-06,
+ "loss": 0.3872,
+ "step": 6633
+ },
+ {
+ "epoch": 3.136643026004728,
+ "grad_norm": 3.120682716369629,
+ "learning_rate": 2.3509937170853585e-06,
+ "loss": 0.3952,
+ "step": 6634
+ },
+ {
+ "epoch": 3.137115839243499,
+ "grad_norm": 2.6936683654785156,
+ "learning_rate": 2.3503710038002127e-06,
+ "loss": 0.3643,
+ "step": 6635
+ },
+ {
+ "epoch": 3.1375886524822696,
+ "grad_norm": 3.749519109725952,
+ "learning_rate": 2.349748299831808e-06,
+ "loss": 0.4519,
+ "step": 6636
+ },
+ {
+ "epoch": 3.1380614657210404,
+ "grad_norm": 2.8034276962280273,
+ "learning_rate": 2.3491256052189175e-06,
+ "loss": 0.401,
+ "step": 6637
+ },
+ {
+ "epoch": 3.1385342789598107,
+ "grad_norm": 2.6201975345611572,
+ "learning_rate": 2.348502920000314e-06,
+ "loss": 0.3491,
+ "step": 6638
+ },
+ {
+ "epoch": 3.1390070921985815,
+ "grad_norm": 2.890552043914795,
+ "learning_rate": 2.347880244214769e-06,
+ "loss": 0.3439,
+ "step": 6639
+ },
+ {
+ "epoch": 3.1394799054373523,
+ "grad_norm": 2.899594306945801,
+ "learning_rate": 2.347257577901055e-06,
+ "loss": 0.3707,
+ "step": 6640
+ },
+ {
+ "epoch": 3.139952718676123,
+ "grad_norm": 2.8660130500793457,
+ "learning_rate": 2.346634921097942e-06,
+ "loss": 0.3582,
+ "step": 6641
+ },
+ {
+ "epoch": 3.1404255319148935,
+ "grad_norm": 2.9805452823638916,
+ "learning_rate": 2.346012273844199e-06,
+ "loss": 0.3466,
+ "step": 6642
+ },
+ {
+ "epoch": 3.1408983451536643,
+ "grad_norm": 3.162977457046509,
+ "learning_rate": 2.345389636178597e-06,
+ "loss": 0.3657,
+ "step": 6643
+ },
+ {
+ "epoch": 3.141371158392435,
+ "grad_norm": 2.838988780975342,
+ "learning_rate": 2.344767008139904e-06,
+ "loss": 0.3826,
+ "step": 6644
+ },
+ {
+ "epoch": 3.141843971631206,
+ "grad_norm": 3.8427252769470215,
+ "learning_rate": 2.3441443897668893e-06,
+ "loss": 0.3697,
+ "step": 6645
+ },
+ {
+ "epoch": 3.1423167848699762,
+ "grad_norm": 2.9233880043029785,
+ "learning_rate": 2.34352178109832e-06,
+ "loss": 0.3481,
+ "step": 6646
+ },
+ {
+ "epoch": 3.142789598108747,
+ "grad_norm": 2.5840606689453125,
+ "learning_rate": 2.342899182172963e-06,
+ "loss": 0.3746,
+ "step": 6647
+ },
+ {
+ "epoch": 3.143262411347518,
+ "grad_norm": 2.806793451309204,
+ "learning_rate": 2.3422765930295857e-06,
+ "loss": 0.419,
+ "step": 6648
+ },
+ {
+ "epoch": 3.1437352245862886,
+ "grad_norm": 2.803952693939209,
+ "learning_rate": 2.3416540137069522e-06,
+ "loss": 0.3965,
+ "step": 6649
+ },
+ {
+ "epoch": 3.144208037825059,
+ "grad_norm": 2.8416364192962646,
+ "learning_rate": 2.3410314442438297e-06,
+ "loss": 0.4317,
+ "step": 6650
+ },
+ {
+ "epoch": 3.1446808510638298,
+ "grad_norm": 2.9956440925598145,
+ "learning_rate": 2.3404088846789826e-06,
+ "loss": 0.4268,
+ "step": 6651
+ },
+ {
+ "epoch": 3.1451536643026006,
+ "grad_norm": 3.1649162769317627,
+ "learning_rate": 2.339786335051173e-06,
+ "loss": 0.4149,
+ "step": 6652
+ },
+ {
+ "epoch": 3.145626477541371,
+ "grad_norm": 2.909107208251953,
+ "learning_rate": 2.3391637953991673e-06,
+ "loss": 0.4085,
+ "step": 6653
+ },
+ {
+ "epoch": 3.1460992907801417,
+ "grad_norm": 2.416755199432373,
+ "learning_rate": 2.3385412657617264e-06,
+ "loss": 0.3585,
+ "step": 6654
+ },
+ {
+ "epoch": 3.1465721040189125,
+ "grad_norm": 3.1122629642486572,
+ "learning_rate": 2.3379187461776123e-06,
+ "loss": 0.3876,
+ "step": 6655
+ },
+ {
+ "epoch": 3.1470449172576833,
+ "grad_norm": 2.6854658126831055,
+ "learning_rate": 2.337296236685588e-06,
+ "loss": 0.3125,
+ "step": 6656
+ },
+ {
+ "epoch": 3.147517730496454,
+ "grad_norm": 2.779876708984375,
+ "learning_rate": 2.3366737373244127e-06,
+ "loss": 0.3688,
+ "step": 6657
+ },
+ {
+ "epoch": 3.1479905437352245,
+ "grad_norm": 3.1444761753082275,
+ "learning_rate": 2.3360512481328484e-06,
+ "loss": 0.4089,
+ "step": 6658
+ },
+ {
+ "epoch": 3.1484633569739953,
+ "grad_norm": 2.71445894241333,
+ "learning_rate": 2.335428769149654e-06,
+ "loss": 0.3532,
+ "step": 6659
+ },
+ {
+ "epoch": 3.148936170212766,
+ "grad_norm": 2.9788241386413574,
+ "learning_rate": 2.334806300413587e-06,
+ "loss": 0.4238,
+ "step": 6660
+ },
+ {
+ "epoch": 3.1494089834515364,
+ "grad_norm": 3.0118865966796875,
+ "learning_rate": 2.334183841963409e-06,
+ "loss": 0.4437,
+ "step": 6661
+ },
+ {
+ "epoch": 3.149881796690307,
+ "grad_norm": 3.2229537963867188,
+ "learning_rate": 2.3335613938378753e-06,
+ "loss": 0.3582,
+ "step": 6662
+ },
+ {
+ "epoch": 3.150354609929078,
+ "grad_norm": 2.734997034072876,
+ "learning_rate": 2.3329389560757447e-06,
+ "loss": 0.3737,
+ "step": 6663
+ },
+ {
+ "epoch": 3.150827423167849,
+ "grad_norm": 3.4746382236480713,
+ "learning_rate": 2.3323165287157724e-06,
+ "loss": 0.3516,
+ "step": 6664
+ },
+ {
+ "epoch": 3.1513002364066196,
+ "grad_norm": 2.9428153038024902,
+ "learning_rate": 2.3316941117967137e-06,
+ "loss": 0.3985,
+ "step": 6665
+ },
+ {
+ "epoch": 3.15177304964539,
+ "grad_norm": 2.6840944290161133,
+ "learning_rate": 2.3310717053573257e-06,
+ "loss": 0.3274,
+ "step": 6666
+ },
+ {
+ "epoch": 3.1522458628841608,
+ "grad_norm": 3.048335552215576,
+ "learning_rate": 2.3304493094363607e-06,
+ "loss": 0.4262,
+ "step": 6667
+ },
+ {
+ "epoch": 3.1527186761229316,
+ "grad_norm": 2.87381911277771,
+ "learning_rate": 2.329826924072575e-06,
+ "loss": 0.3867,
+ "step": 6668
+ },
+ {
+ "epoch": 3.153191489361702,
+ "grad_norm": 2.6236355304718018,
+ "learning_rate": 2.32920454930472e-06,
+ "loss": 0.3649,
+ "step": 6669
+ },
+ {
+ "epoch": 3.1536643026004727,
+ "grad_norm": 3.1326401233673096,
+ "learning_rate": 2.328582185171549e-06,
+ "loss": 0.3451,
+ "step": 6670
+ },
+ {
+ "epoch": 3.1541371158392435,
+ "grad_norm": 3.011826992034912,
+ "learning_rate": 2.327959831711814e-06,
+ "loss": 0.4118,
+ "step": 6671
+ },
+ {
+ "epoch": 3.1546099290780143,
+ "grad_norm": 2.834933280944824,
+ "learning_rate": 2.3273374889642646e-06,
+ "loss": 0.4378,
+ "step": 6672
+ },
+ {
+ "epoch": 3.155082742316785,
+ "grad_norm": 3.085756778717041,
+ "learning_rate": 2.326715156967654e-06,
+ "loss": 0.4389,
+ "step": 6673
+ },
+ {
+ "epoch": 3.1555555555555554,
+ "grad_norm": 2.7912232875823975,
+ "learning_rate": 2.3260928357607305e-06,
+ "loss": 0.3352,
+ "step": 6674
+ },
+ {
+ "epoch": 3.1560283687943262,
+ "grad_norm": 2.7643113136291504,
+ "learning_rate": 2.3254705253822424e-06,
+ "loss": 0.3449,
+ "step": 6675
+ },
+ {
+ "epoch": 3.156501182033097,
+ "grad_norm": 2.8984663486480713,
+ "learning_rate": 2.3248482258709405e-06,
+ "loss": 0.4231,
+ "step": 6676
+ },
+ {
+ "epoch": 3.1569739952718674,
+ "grad_norm": 3.214996814727783,
+ "learning_rate": 2.324225937265572e-06,
+ "loss": 0.4616,
+ "step": 6677
+ },
+ {
+ "epoch": 3.157446808510638,
+ "grad_norm": 2.58534836769104,
+ "learning_rate": 2.3236036596048827e-06,
+ "loss": 0.3264,
+ "step": 6678
+ },
+ {
+ "epoch": 3.157919621749409,
+ "grad_norm": 2.790714740753174,
+ "learning_rate": 2.322981392927621e-06,
+ "loss": 0.4086,
+ "step": 6679
+ },
+ {
+ "epoch": 3.15839243498818,
+ "grad_norm": 2.726029872894287,
+ "learning_rate": 2.32235913727253e-06,
+ "loss": 0.3344,
+ "step": 6680
+ },
+ {
+ "epoch": 3.1588652482269506,
+ "grad_norm": 2.8392906188964844,
+ "learning_rate": 2.3217368926783583e-06,
+ "loss": 0.3468,
+ "step": 6681
+ },
+ {
+ "epoch": 3.159338061465721,
+ "grad_norm": 2.9796900749206543,
+ "learning_rate": 2.321114659183848e-06,
+ "loss": 0.4051,
+ "step": 6682
+ },
+ {
+ "epoch": 3.1598108747044917,
+ "grad_norm": 3.0399303436279297,
+ "learning_rate": 2.320492436827743e-06,
+ "loss": 0.402,
+ "step": 6683
+ },
+ {
+ "epoch": 3.1602836879432625,
+ "grad_norm": 2.9295334815979004,
+ "learning_rate": 2.3198702256487877e-06,
+ "loss": 0.3975,
+ "step": 6684
+ },
+ {
+ "epoch": 3.160756501182033,
+ "grad_norm": 2.881552219390869,
+ "learning_rate": 2.319248025685723e-06,
+ "loss": 0.4342,
+ "step": 6685
+ },
+ {
+ "epoch": 3.1612293144208037,
+ "grad_norm": 3.0711705684661865,
+ "learning_rate": 2.3186258369772916e-06,
+ "loss": 0.3829,
+ "step": 6686
+ },
+ {
+ "epoch": 3.1617021276595745,
+ "grad_norm": 2.6614468097686768,
+ "learning_rate": 2.3180036595622345e-06,
+ "loss": 0.3473,
+ "step": 6687
+ },
+ {
+ "epoch": 3.1621749408983453,
+ "grad_norm": 3.0084400177001953,
+ "learning_rate": 2.3173814934792903e-06,
+ "loss": 0.4363,
+ "step": 6688
+ },
+ {
+ "epoch": 3.162647754137116,
+ "grad_norm": 2.9340786933898926,
+ "learning_rate": 2.3167593387672006e-06,
+ "loss": 0.4235,
+ "step": 6689
+ },
+ {
+ "epoch": 3.1631205673758864,
+ "grad_norm": 3.0765340328216553,
+ "learning_rate": 2.3161371954647023e-06,
+ "loss": 0.4601,
+ "step": 6690
+ },
+ {
+ "epoch": 3.1635933806146572,
+ "grad_norm": 2.816096067428589,
+ "learning_rate": 2.3155150636105356e-06,
+ "loss": 0.3764,
+ "step": 6691
+ },
+ {
+ "epoch": 3.164066193853428,
+ "grad_norm": 3.0476551055908203,
+ "learning_rate": 2.3148929432434372e-06,
+ "loss": 0.3956,
+ "step": 6692
+ },
+ {
+ "epoch": 3.1645390070921984,
+ "grad_norm": 2.628934860229492,
+ "learning_rate": 2.314270834402143e-06,
+ "loss": 0.3551,
+ "step": 6693
+ },
+ {
+ "epoch": 3.165011820330969,
+ "grad_norm": 3.3933539390563965,
+ "learning_rate": 2.31364873712539e-06,
+ "loss": 0.4523,
+ "step": 6694
+ },
+ {
+ "epoch": 3.16548463356974,
+ "grad_norm": 3.256176233291626,
+ "learning_rate": 2.313026651451912e-06,
+ "loss": 0.417,
+ "step": 6695
+ },
+ {
+ "epoch": 3.1659574468085108,
+ "grad_norm": 2.92926025390625,
+ "learning_rate": 2.312404577420445e-06,
+ "loss": 0.4365,
+ "step": 6696
+ },
+ {
+ "epoch": 3.166430260047281,
+ "grad_norm": 2.9514732360839844,
+ "learning_rate": 2.3117825150697233e-06,
+ "loss": 0.4632,
+ "step": 6697
+ },
+ {
+ "epoch": 3.166903073286052,
+ "grad_norm": 2.8635852336883545,
+ "learning_rate": 2.3111604644384778e-06,
+ "loss": 0.4018,
+ "step": 6698
+ },
+ {
+ "epoch": 3.1673758865248227,
+ "grad_norm": 2.5937020778656006,
+ "learning_rate": 2.3105384255654433e-06,
+ "loss": 0.3682,
+ "step": 6699
+ },
+ {
+ "epoch": 3.1678486997635935,
+ "grad_norm": 2.857851266860962,
+ "learning_rate": 2.3099163984893497e-06,
+ "loss": 0.3293,
+ "step": 6700
+ },
+ {
+ "epoch": 3.168321513002364,
+ "grad_norm": 2.5903947353363037,
+ "learning_rate": 2.3092943832489283e-06,
+ "loss": 0.3543,
+ "step": 6701
+ },
+ {
+ "epoch": 3.1687943262411347,
+ "grad_norm": 2.9783661365509033,
+ "learning_rate": 2.30867237988291e-06,
+ "loss": 0.3707,
+ "step": 6702
+ },
+ {
+ "epoch": 3.1692671394799055,
+ "grad_norm": 3.0133306980133057,
+ "learning_rate": 2.3080503884300225e-06,
+ "loss": 0.439,
+ "step": 6703
+ },
+ {
+ "epoch": 3.1697399527186763,
+ "grad_norm": 2.7119483947753906,
+ "learning_rate": 2.3074284089289968e-06,
+ "loss": 0.3956,
+ "step": 6704
+ },
+ {
+ "epoch": 3.1702127659574466,
+ "grad_norm": 3.0499672889709473,
+ "learning_rate": 2.3068064414185597e-06,
+ "loss": 0.434,
+ "step": 6705
+ },
+ {
+ "epoch": 3.1706855791962174,
+ "grad_norm": 2.862807512283325,
+ "learning_rate": 2.306184485937437e-06,
+ "loss": 0.3644,
+ "step": 6706
+ },
+ {
+ "epoch": 3.171158392434988,
+ "grad_norm": 2.9445149898529053,
+ "learning_rate": 2.305562542524358e-06,
+ "loss": 0.3894,
+ "step": 6707
+ },
+ {
+ "epoch": 3.171631205673759,
+ "grad_norm": 3.0442428588867188,
+ "learning_rate": 2.304940611218046e-06,
+ "loss": 0.3816,
+ "step": 6708
+ },
+ {
+ "epoch": 3.1721040189125294,
+ "grad_norm": 2.7101798057556152,
+ "learning_rate": 2.304318692057228e-06,
+ "loss": 0.3708,
+ "step": 6709
+ },
+ {
+ "epoch": 3.1725768321513,
+ "grad_norm": 2.7874515056610107,
+ "learning_rate": 2.303696785080626e-06,
+ "loss": 0.404,
+ "step": 6710
+ },
+ {
+ "epoch": 3.173049645390071,
+ "grad_norm": 3.0438833236694336,
+ "learning_rate": 2.303074890326964e-06,
+ "loss": 0.4342,
+ "step": 6711
+ },
+ {
+ "epoch": 3.1735224586288417,
+ "grad_norm": 2.6079208850860596,
+ "learning_rate": 2.302453007834966e-06,
+ "loss": 0.3725,
+ "step": 6712
+ },
+ {
+ "epoch": 3.173995271867612,
+ "grad_norm": 3.3353021144866943,
+ "learning_rate": 2.3018311376433523e-06,
+ "loss": 0.4372,
+ "step": 6713
+ },
+ {
+ "epoch": 3.174468085106383,
+ "grad_norm": 2.840771436691284,
+ "learning_rate": 2.3012092797908454e-06,
+ "loss": 0.3979,
+ "step": 6714
+ },
+ {
+ "epoch": 3.1749408983451537,
+ "grad_norm": 3.0474867820739746,
+ "learning_rate": 2.3005874343161648e-06,
+ "loss": 0.4077,
+ "step": 6715
+ },
+ {
+ "epoch": 3.1754137115839245,
+ "grad_norm": 2.849835157394409,
+ "learning_rate": 2.2999656012580296e-06,
+ "loss": 0.393,
+ "step": 6716
+ },
+ {
+ "epoch": 3.175886524822695,
+ "grad_norm": 2.6361217498779297,
+ "learning_rate": 2.29934378065516e-06,
+ "loss": 0.3894,
+ "step": 6717
+ },
+ {
+ "epoch": 3.1763593380614656,
+ "grad_norm": 3.139700174331665,
+ "learning_rate": 2.298721972546273e-06,
+ "loss": 0.36,
+ "step": 6718
+ },
+ {
+ "epoch": 3.1768321513002364,
+ "grad_norm": 2.987861156463623,
+ "learning_rate": 2.298100176970087e-06,
+ "loss": 0.4306,
+ "step": 6719
+ },
+ {
+ "epoch": 3.1773049645390072,
+ "grad_norm": 2.6403157711029053,
+ "learning_rate": 2.297478393965317e-06,
+ "loss": 0.3978,
+ "step": 6720
+ },
+ {
+ "epoch": 3.1777777777777776,
+ "grad_norm": 2.819519281387329,
+ "learning_rate": 2.296856623570679e-06,
+ "loss": 0.3467,
+ "step": 6721
+ },
+ {
+ "epoch": 3.1782505910165484,
+ "grad_norm": 2.7195916175842285,
+ "learning_rate": 2.296234865824889e-06,
+ "loss": 0.3685,
+ "step": 6722
+ },
+ {
+ "epoch": 3.178723404255319,
+ "grad_norm": 3.015488624572754,
+ "learning_rate": 2.2956131207666604e-06,
+ "loss": 0.3751,
+ "step": 6723
+ },
+ {
+ "epoch": 3.17919621749409,
+ "grad_norm": 2.9283792972564697,
+ "learning_rate": 2.2949913884347055e-06,
+ "loss": 0.3261,
+ "step": 6724
+ },
+ {
+ "epoch": 3.1796690307328603,
+ "grad_norm": 3.358991861343384,
+ "learning_rate": 2.294369668867739e-06,
+ "loss": 0.4505,
+ "step": 6725
+ },
+ {
+ "epoch": 3.180141843971631,
+ "grad_norm": 2.9143471717834473,
+ "learning_rate": 2.2937479621044712e-06,
+ "loss": 0.3612,
+ "step": 6726
+ },
+ {
+ "epoch": 3.180614657210402,
+ "grad_norm": 3.020519495010376,
+ "learning_rate": 2.2931262681836136e-06,
+ "loss": 0.4241,
+ "step": 6727
+ },
+ {
+ "epoch": 3.1810874704491727,
+ "grad_norm": 2.693737745285034,
+ "learning_rate": 2.2925045871438765e-06,
+ "loss": 0.366,
+ "step": 6728
+ },
+ {
+ "epoch": 3.181560283687943,
+ "grad_norm": 2.9427194595336914,
+ "learning_rate": 2.2918829190239677e-06,
+ "loss": 0.3741,
+ "step": 6729
+ },
+ {
+ "epoch": 3.182033096926714,
+ "grad_norm": 2.529383659362793,
+ "learning_rate": 2.291261263862598e-06,
+ "loss": 0.4469,
+ "step": 6730
+ },
+ {
+ "epoch": 3.1825059101654847,
+ "grad_norm": 3.0097804069519043,
+ "learning_rate": 2.290639621698473e-06,
+ "loss": 0.4167,
+ "step": 6731
+ },
+ {
+ "epoch": 3.1829787234042555,
+ "grad_norm": 2.7047014236450195,
+ "learning_rate": 2.290017992570302e-06,
+ "loss": 0.3615,
+ "step": 6732
+ },
+ {
+ "epoch": 3.183451536643026,
+ "grad_norm": 2.676964282989502,
+ "learning_rate": 2.2893963765167897e-06,
+ "loss": 0.3722,
+ "step": 6733
+ },
+ {
+ "epoch": 3.1839243498817966,
+ "grad_norm": 3.0529778003692627,
+ "learning_rate": 2.2887747735766413e-06,
+ "loss": 0.395,
+ "step": 6734
+ },
+ {
+ "epoch": 3.1843971631205674,
+ "grad_norm": 2.826725721359253,
+ "learning_rate": 2.288153183788562e-06,
+ "loss": 0.3713,
+ "step": 6735
+ },
+ {
+ "epoch": 3.184869976359338,
+ "grad_norm": 2.8689587116241455,
+ "learning_rate": 2.287531607191254e-06,
+ "loss": 0.4383,
+ "step": 6736
+ },
+ {
+ "epoch": 3.1853427895981086,
+ "grad_norm": 3.1835694313049316,
+ "learning_rate": 2.2869100438234217e-06,
+ "loss": 0.3908,
+ "step": 6737
+ },
+ {
+ "epoch": 3.1858156028368794,
+ "grad_norm": 3.227262020111084,
+ "learning_rate": 2.286288493723767e-06,
+ "loss": 0.3549,
+ "step": 6738
+ },
+ {
+ "epoch": 3.18628841607565,
+ "grad_norm": 2.7543468475341797,
+ "learning_rate": 2.2856669569309896e-06,
+ "loss": 0.351,
+ "step": 6739
+ },
+ {
+ "epoch": 3.186761229314421,
+ "grad_norm": 2.5381555557250977,
+ "learning_rate": 2.2850454334837923e-06,
+ "loss": 0.3473,
+ "step": 6740
+ },
+ {
+ "epoch": 3.1872340425531913,
+ "grad_norm": 2.785923957824707,
+ "learning_rate": 2.284423923420872e-06,
+ "loss": 0.4144,
+ "step": 6741
+ },
+ {
+ "epoch": 3.187706855791962,
+ "grad_norm": 2.583853006362915,
+ "learning_rate": 2.28380242678093e-06,
+ "loss": 0.3088,
+ "step": 6742
+ },
+ {
+ "epoch": 3.188179669030733,
+ "grad_norm": 2.604647159576416,
+ "learning_rate": 2.2831809436026627e-06,
+ "loss": 0.3474,
+ "step": 6743
+ },
+ {
+ "epoch": 3.1886524822695037,
+ "grad_norm": 6.13611364364624,
+ "learning_rate": 2.2825594739247662e-06,
+ "loss": 0.4089,
+ "step": 6744
+ },
+ {
+ "epoch": 3.189125295508274,
+ "grad_norm": 3.034011125564575,
+ "learning_rate": 2.281938017785939e-06,
+ "loss": 0.4569,
+ "step": 6745
+ },
+ {
+ "epoch": 3.189598108747045,
+ "grad_norm": 2.9352638721466064,
+ "learning_rate": 2.281316575224874e-06,
+ "loss": 0.4293,
+ "step": 6746
+ },
+ {
+ "epoch": 3.1900709219858157,
+ "grad_norm": 3.860957384109497,
+ "learning_rate": 2.280695146280268e-06,
+ "loss": 0.4082,
+ "step": 6747
+ },
+ {
+ "epoch": 3.1905437352245865,
+ "grad_norm": 2.8131468296051025,
+ "learning_rate": 2.280073730990814e-06,
+ "loss": 0.3194,
+ "step": 6748
+ },
+ {
+ "epoch": 3.191016548463357,
+ "grad_norm": 3.1310737133026123,
+ "learning_rate": 2.2794523293952033e-06,
+ "loss": 0.4454,
+ "step": 6749
+ },
+ {
+ "epoch": 3.1914893617021276,
+ "grad_norm": 3.065091133117676,
+ "learning_rate": 2.27883094153213e-06,
+ "loss": 0.3789,
+ "step": 6750
+ },
+ {
+ "epoch": 3.1919621749408984,
+ "grad_norm": 3.315216541290283,
+ "learning_rate": 2.278209567440284e-06,
+ "loss": 0.4037,
+ "step": 6751
+ },
+ {
+ "epoch": 3.192434988179669,
+ "grad_norm": 3.0228476524353027,
+ "learning_rate": 2.2775882071583546e-06,
+ "loss": 0.3652,
+ "step": 6752
+ },
+ {
+ "epoch": 3.1929078014184396,
+ "grad_norm": 3.703540802001953,
+ "learning_rate": 2.2769668607250336e-06,
+ "loss": 0.3477,
+ "step": 6753
+ },
+ {
+ "epoch": 3.1933806146572103,
+ "grad_norm": 2.952481508255005,
+ "learning_rate": 2.2763455281790065e-06,
+ "loss": 0.4026,
+ "step": 6754
+ },
+ {
+ "epoch": 3.193853427895981,
+ "grad_norm": 2.5798189640045166,
+ "learning_rate": 2.275724209558965e-06,
+ "loss": 0.3475,
+ "step": 6755
+ },
+ {
+ "epoch": 3.194326241134752,
+ "grad_norm": 2.599669933319092,
+ "learning_rate": 2.2751029049035923e-06,
+ "loss": 0.3499,
+ "step": 6756
+ },
+ {
+ "epoch": 3.1947990543735223,
+ "grad_norm": 3.0463781356811523,
+ "learning_rate": 2.2744816142515756e-06,
+ "loss": 0.3927,
+ "step": 6757
+ },
+ {
+ "epoch": 3.195271867612293,
+ "grad_norm": 3.134199380874634,
+ "learning_rate": 2.2738603376416003e-06,
+ "loss": 0.3957,
+ "step": 6758
+ },
+ {
+ "epoch": 3.195744680851064,
+ "grad_norm": 3.1326372623443604,
+ "learning_rate": 2.273239075112349e-06,
+ "loss": 0.4305,
+ "step": 6759
+ },
+ {
+ "epoch": 3.1962174940898347,
+ "grad_norm": 2.847128391265869,
+ "learning_rate": 2.2726178267025072e-06,
+ "loss": 0.3825,
+ "step": 6760
+ },
+ {
+ "epoch": 3.196690307328605,
+ "grad_norm": 2.697584629058838,
+ "learning_rate": 2.2719965924507566e-06,
+ "loss": 0.3517,
+ "step": 6761
+ },
+ {
+ "epoch": 3.197163120567376,
+ "grad_norm": 2.881446599960327,
+ "learning_rate": 2.271375372395777e-06,
+ "loss": 0.3791,
+ "step": 6762
+ },
+ {
+ "epoch": 3.1976359338061466,
+ "grad_norm": 3.085054874420166,
+ "learning_rate": 2.270754166576252e-06,
+ "loss": 0.4324,
+ "step": 6763
+ },
+ {
+ "epoch": 3.1981087470449174,
+ "grad_norm": 3.3494462966918945,
+ "learning_rate": 2.270132975030859e-06,
+ "loss": 0.4242,
+ "step": 6764
+ },
+ {
+ "epoch": 3.198581560283688,
+ "grad_norm": 2.8617660999298096,
+ "learning_rate": 2.2695117977982785e-06,
+ "loss": 0.3563,
+ "step": 6765
+ },
+ {
+ "epoch": 3.1990543735224586,
+ "grad_norm": 2.7437968254089355,
+ "learning_rate": 2.2688906349171873e-06,
+ "loss": 0.4042,
+ "step": 6766
+ },
+ {
+ "epoch": 3.1995271867612294,
+ "grad_norm": 3.1129143238067627,
+ "learning_rate": 2.268269486426262e-06,
+ "loss": 0.3761,
+ "step": 6767
+ },
+ {
+ "epoch": 3.2,
+ "grad_norm": 3.32441782951355,
+ "learning_rate": 2.2676483523641807e-06,
+ "loss": 0.4439,
+ "step": 6768
+ },
+ {
+ "epoch": 3.2004728132387705,
+ "grad_norm": 2.8744730949401855,
+ "learning_rate": 2.267027232769617e-06,
+ "loss": 0.4015,
+ "step": 6769
+ },
+ {
+ "epoch": 3.2009456264775413,
+ "grad_norm": 3.6283397674560547,
+ "learning_rate": 2.2664061276812465e-06,
+ "loss": 0.3634,
+ "step": 6770
+ },
+ {
+ "epoch": 3.201418439716312,
+ "grad_norm": 2.7826597690582275,
+ "learning_rate": 2.2657850371377426e-06,
+ "loss": 0.3178,
+ "step": 6771
+ },
+ {
+ "epoch": 3.201891252955083,
+ "grad_norm": 2.668173313140869,
+ "learning_rate": 2.265163961177776e-06,
+ "loss": 0.3662,
+ "step": 6772
+ },
+ {
+ "epoch": 3.2023640661938533,
+ "grad_norm": 2.868441104888916,
+ "learning_rate": 2.264542899840021e-06,
+ "loss": 0.4235,
+ "step": 6773
+ },
+ {
+ "epoch": 3.202836879432624,
+ "grad_norm": 3.2715935707092285,
+ "learning_rate": 2.263921853163147e-06,
+ "loss": 0.4741,
+ "step": 6774
+ },
+ {
+ "epoch": 3.203309692671395,
+ "grad_norm": 2.8647544384002686,
+ "learning_rate": 2.2633008211858233e-06,
+ "loss": 0.3885,
+ "step": 6775
+ },
+ {
+ "epoch": 3.2037825059101657,
+ "grad_norm": 3.070164680480957,
+ "learning_rate": 2.2626798039467207e-06,
+ "loss": 0.4191,
+ "step": 6776
+ },
+ {
+ "epoch": 3.204255319148936,
+ "grad_norm": 2.846686840057373,
+ "learning_rate": 2.262058801484505e-06,
+ "loss": 0.3619,
+ "step": 6777
+ },
+ {
+ "epoch": 3.204728132387707,
+ "grad_norm": 2.767031192779541,
+ "learning_rate": 2.261437813837845e-06,
+ "loss": 0.3248,
+ "step": 6778
+ },
+ {
+ "epoch": 3.2052009456264776,
+ "grad_norm": 2.6819260120391846,
+ "learning_rate": 2.2608168410454065e-06,
+ "loss": 0.3871,
+ "step": 6779
+ },
+ {
+ "epoch": 3.2056737588652484,
+ "grad_norm": 3.1176788806915283,
+ "learning_rate": 2.260195883145854e-06,
+ "loss": 0.3929,
+ "step": 6780
+ },
+ {
+ "epoch": 3.2061465721040188,
+ "grad_norm": 3.143209457397461,
+ "learning_rate": 2.2595749401778524e-06,
+ "loss": 0.4188,
+ "step": 6781
+ },
+ {
+ "epoch": 3.2066193853427896,
+ "grad_norm": 2.9685657024383545,
+ "learning_rate": 2.2589540121800647e-06,
+ "loss": 0.4049,
+ "step": 6782
+ },
+ {
+ "epoch": 3.2070921985815604,
+ "grad_norm": 2.6853368282318115,
+ "learning_rate": 2.258333099191155e-06,
+ "loss": 0.349,
+ "step": 6783
+ },
+ {
+ "epoch": 3.207565011820331,
+ "grad_norm": 2.8418309688568115,
+ "learning_rate": 2.257712201249783e-06,
+ "loss": 0.4121,
+ "step": 6784
+ },
+ {
+ "epoch": 3.2080378250591015,
+ "grad_norm": 2.9441449642181396,
+ "learning_rate": 2.2570913183946085e-06,
+ "loss": 0.3846,
+ "step": 6785
+ },
+ {
+ "epoch": 3.2085106382978723,
+ "grad_norm": 2.9956493377685547,
+ "learning_rate": 2.256470450664294e-06,
+ "loss": 0.3941,
+ "step": 6786
+ },
+ {
+ "epoch": 3.208983451536643,
+ "grad_norm": 3.1774401664733887,
+ "learning_rate": 2.255849598097496e-06,
+ "loss": 0.4252,
+ "step": 6787
+ },
+ {
+ "epoch": 3.209456264775414,
+ "grad_norm": 2.8948934078216553,
+ "learning_rate": 2.255228760732873e-06,
+ "loss": 0.3963,
+ "step": 6788
+ },
+ {
+ "epoch": 3.2099290780141843,
+ "grad_norm": 3.440021276473999,
+ "learning_rate": 2.2546079386090825e-06,
+ "loss": 0.3777,
+ "step": 6789
+ },
+ {
+ "epoch": 3.210401891252955,
+ "grad_norm": 3.1573195457458496,
+ "learning_rate": 2.253987131764779e-06,
+ "loss": 0.3896,
+ "step": 6790
+ },
+ {
+ "epoch": 3.210874704491726,
+ "grad_norm": 3.4218719005584717,
+ "learning_rate": 2.2533663402386183e-06,
+ "loss": 0.3979,
+ "step": 6791
+ },
+ {
+ "epoch": 3.2113475177304966,
+ "grad_norm": 3.3442487716674805,
+ "learning_rate": 2.252745564069253e-06,
+ "loss": 0.406,
+ "step": 6792
+ },
+ {
+ "epoch": 3.211820330969267,
+ "grad_norm": 2.6089327335357666,
+ "learning_rate": 2.2521248032953387e-06,
+ "loss": 0.3539,
+ "step": 6793
+ },
+ {
+ "epoch": 3.212293144208038,
+ "grad_norm": 3.8015971183776855,
+ "learning_rate": 2.251504057955526e-06,
+ "loss": 0.4184,
+ "step": 6794
+ },
+ {
+ "epoch": 3.2127659574468086,
+ "grad_norm": 3.797565460205078,
+ "learning_rate": 2.250883328088465e-06,
+ "loss": 0.3392,
+ "step": 6795
+ },
+ {
+ "epoch": 3.2132387706855794,
+ "grad_norm": 3.290762186050415,
+ "learning_rate": 2.2502626137328077e-06,
+ "loss": 0.3726,
+ "step": 6796
+ },
+ {
+ "epoch": 3.2137115839243497,
+ "grad_norm": 3.149158000946045,
+ "learning_rate": 2.2496419149272023e-06,
+ "loss": 0.3869,
+ "step": 6797
+ },
+ {
+ "epoch": 3.2141843971631205,
+ "grad_norm": 2.652902364730835,
+ "learning_rate": 2.2490212317102964e-06,
+ "loss": 0.3256,
+ "step": 6798
+ },
+ {
+ "epoch": 3.2146572104018913,
+ "grad_norm": 3.3039770126342773,
+ "learning_rate": 2.248400564120739e-06,
+ "loss": 0.4231,
+ "step": 6799
+ },
+ {
+ "epoch": 3.215130023640662,
+ "grad_norm": 3.0190038681030273,
+ "learning_rate": 2.247779912197174e-06,
+ "loss": 0.4319,
+ "step": 6800
+ },
+ {
+ "epoch": 3.2156028368794325,
+ "grad_norm": 2.861393690109253,
+ "learning_rate": 2.2471592759782485e-06,
+ "loss": 0.465,
+ "step": 6801
+ },
+ {
+ "epoch": 3.2160756501182033,
+ "grad_norm": 2.7796146869659424,
+ "learning_rate": 2.246538655502606e-06,
+ "loss": 0.3896,
+ "step": 6802
+ },
+ {
+ "epoch": 3.216548463356974,
+ "grad_norm": 3.1849005222320557,
+ "learning_rate": 2.24591805080889e-06,
+ "loss": 0.3782,
+ "step": 6803
+ },
+ {
+ "epoch": 3.217021276595745,
+ "grad_norm": 3.076164960861206,
+ "learning_rate": 2.2452974619357435e-06,
+ "loss": 0.4023,
+ "step": 6804
+ },
+ {
+ "epoch": 3.2174940898345152,
+ "grad_norm": 2.7006006240844727,
+ "learning_rate": 2.2446768889218064e-06,
+ "loss": 0.3902,
+ "step": 6805
+ },
+ {
+ "epoch": 3.217966903073286,
+ "grad_norm": 2.9310474395751953,
+ "learning_rate": 2.2440563318057205e-06,
+ "loss": 0.366,
+ "step": 6806
+ },
+ {
+ "epoch": 3.218439716312057,
+ "grad_norm": 3.057248592376709,
+ "learning_rate": 2.2434357906261246e-06,
+ "loss": 0.4042,
+ "step": 6807
+ },
+ {
+ "epoch": 3.2189125295508276,
+ "grad_norm": 3.3720197677612305,
+ "learning_rate": 2.242815265421656e-06,
+ "loss": 0.3816,
+ "step": 6808
+ },
+ {
+ "epoch": 3.219385342789598,
+ "grad_norm": 2.9626352787017822,
+ "learning_rate": 2.2421947562309545e-06,
+ "loss": 0.363,
+ "step": 6809
+ },
+ {
+ "epoch": 3.219858156028369,
+ "grad_norm": 2.7848782539367676,
+ "learning_rate": 2.2415742630926533e-06,
+ "loss": 0.3597,
+ "step": 6810
+ },
+ {
+ "epoch": 3.2203309692671396,
+ "grad_norm": 2.757319450378418,
+ "learning_rate": 2.2409537860453913e-06,
+ "loss": 0.3304,
+ "step": 6811
+ },
+ {
+ "epoch": 3.2208037825059104,
+ "grad_norm": 2.7765560150146484,
+ "learning_rate": 2.240333325127801e-06,
+ "loss": 0.3896,
+ "step": 6812
+ },
+ {
+ "epoch": 3.2212765957446807,
+ "grad_norm": 2.9882447719573975,
+ "learning_rate": 2.239712880378515e-06,
+ "loss": 0.4004,
+ "step": 6813
+ },
+ {
+ "epoch": 3.2217494089834515,
+ "grad_norm": 2.8551244735717773,
+ "learning_rate": 2.2390924518361673e-06,
+ "loss": 0.4167,
+ "step": 6814
+ },
+ {
+ "epoch": 3.2222222222222223,
+ "grad_norm": 2.8051679134368896,
+ "learning_rate": 2.2384720395393878e-06,
+ "loss": 0.3319,
+ "step": 6815
+ },
+ {
+ "epoch": 3.222695035460993,
+ "grad_norm": 3.1172873973846436,
+ "learning_rate": 2.2378516435268086e-06,
+ "loss": 0.379,
+ "step": 6816
+ },
+ {
+ "epoch": 3.2231678486997635,
+ "grad_norm": 3.0282177925109863,
+ "learning_rate": 2.237231263837058e-06,
+ "loss": 0.3855,
+ "step": 6817
+ },
+ {
+ "epoch": 3.2236406619385343,
+ "grad_norm": 2.7156803607940674,
+ "learning_rate": 2.236610900508763e-06,
+ "loss": 0.4062,
+ "step": 6818
+ },
+ {
+ "epoch": 3.224113475177305,
+ "grad_norm": 2.721327781677246,
+ "learning_rate": 2.235990553580554e-06,
+ "loss": 0.3726,
+ "step": 6819
+ },
+ {
+ "epoch": 3.2245862884160754,
+ "grad_norm": 2.881181240081787,
+ "learning_rate": 2.235370223091055e-06,
+ "loss": 0.421,
+ "step": 6820
+ },
+ {
+ "epoch": 3.225059101654846,
+ "grad_norm": 2.8074657917022705,
+ "learning_rate": 2.234749909078892e-06,
+ "loss": 0.3628,
+ "step": 6821
+ },
+ {
+ "epoch": 3.225531914893617,
+ "grad_norm": 2.8781638145446777,
+ "learning_rate": 2.234129611582689e-06,
+ "loss": 0.3857,
+ "step": 6822
+ },
+ {
+ "epoch": 3.226004728132388,
+ "grad_norm": 2.9473299980163574,
+ "learning_rate": 2.233509330641068e-06,
+ "loss": 0.4358,
+ "step": 6823
+ },
+ {
+ "epoch": 3.2264775413711586,
+ "grad_norm": 3.261209011077881,
+ "learning_rate": 2.2328890662926543e-06,
+ "loss": 0.4115,
+ "step": 6824
+ },
+ {
+ "epoch": 3.226950354609929,
+ "grad_norm": 3.2796943187713623,
+ "learning_rate": 2.232268818576067e-06,
+ "loss": 0.3846,
+ "step": 6825
+ },
+ {
+ "epoch": 3.2274231678486998,
+ "grad_norm": 3.1083059310913086,
+ "learning_rate": 2.2316485875299247e-06,
+ "loss": 0.3452,
+ "step": 6826
+ },
+ {
+ "epoch": 3.2278959810874706,
+ "grad_norm": 2.7947003841400146,
+ "learning_rate": 2.23102837319285e-06,
+ "loss": 0.3733,
+ "step": 6827
+ },
+ {
+ "epoch": 3.228368794326241,
+ "grad_norm": 2.792348861694336,
+ "learning_rate": 2.230408175603458e-06,
+ "loss": 0.411,
+ "step": 6828
+ },
+ {
+ "epoch": 3.2288416075650117,
+ "grad_norm": 2.8563876152038574,
+ "learning_rate": 2.229787994800368e-06,
+ "loss": 0.4303,
+ "step": 6829
+ },
+ {
+ "epoch": 3.2293144208037825,
+ "grad_norm": 2.9573659896850586,
+ "learning_rate": 2.2291678308221943e-06,
+ "loss": 0.4124,
+ "step": 6830
+ },
+ {
+ "epoch": 3.2297872340425533,
+ "grad_norm": 2.8554422855377197,
+ "learning_rate": 2.228547683707551e-06,
+ "loss": 0.3715,
+ "step": 6831
+ },
+ {
+ "epoch": 3.230260047281324,
+ "grad_norm": 2.9457242488861084,
+ "learning_rate": 2.227927553495054e-06,
+ "loss": 0.4339,
+ "step": 6832
+ },
+ {
+ "epoch": 3.2307328605200945,
+ "grad_norm": 2.799135684967041,
+ "learning_rate": 2.227307440223315e-06,
+ "loss": 0.3335,
+ "step": 6833
+ },
+ {
+ "epoch": 3.2312056737588652,
+ "grad_norm": 2.768529176712036,
+ "learning_rate": 2.2266873439309465e-06,
+ "loss": 0.3929,
+ "step": 6834
+ },
+ {
+ "epoch": 3.231678486997636,
+ "grad_norm": 3.124069929122925,
+ "learning_rate": 2.2260672646565585e-06,
+ "loss": 0.4205,
+ "step": 6835
+ },
+ {
+ "epoch": 3.2321513002364064,
+ "grad_norm": 2.8153982162475586,
+ "learning_rate": 2.2254472024387603e-06,
+ "loss": 0.3565,
+ "step": 6836
+ },
+ {
+ "epoch": 3.232624113475177,
+ "grad_norm": 3.1802141666412354,
+ "learning_rate": 2.224827157316162e-06,
+ "loss": 0.4614,
+ "step": 6837
+ },
+ {
+ "epoch": 3.233096926713948,
+ "grad_norm": 2.669651746749878,
+ "learning_rate": 2.2242071293273682e-06,
+ "loss": 0.3581,
+ "step": 6838
+ },
+ {
+ "epoch": 3.233569739952719,
+ "grad_norm": 3.073127269744873,
+ "learning_rate": 2.223587118510989e-06,
+ "loss": 0.3581,
+ "step": 6839
+ },
+ {
+ "epoch": 3.2340425531914896,
+ "grad_norm": 2.875955820083618,
+ "learning_rate": 2.222967124905627e-06,
+ "loss": 0.3905,
+ "step": 6840
+ },
+ {
+ "epoch": 3.23451536643026,
+ "grad_norm": 2.887744903564453,
+ "learning_rate": 2.2223471485498872e-06,
+ "loss": 0.4131,
+ "step": 6841
+ },
+ {
+ "epoch": 3.2349881796690307,
+ "grad_norm": 2.6957902908325195,
+ "learning_rate": 2.2217271894823735e-06,
+ "loss": 0.3631,
+ "step": 6842
+ },
+ {
+ "epoch": 3.2354609929078015,
+ "grad_norm": 2.7098400592803955,
+ "learning_rate": 2.221107247741688e-06,
+ "loss": 0.3959,
+ "step": 6843
+ },
+ {
+ "epoch": 3.235933806146572,
+ "grad_norm": 2.986271858215332,
+ "learning_rate": 2.22048732336643e-06,
+ "loss": 0.3515,
+ "step": 6844
+ },
+ {
+ "epoch": 3.2364066193853427,
+ "grad_norm": 3.0537121295928955,
+ "learning_rate": 2.2198674163952015e-06,
+ "loss": 0.438,
+ "step": 6845
+ },
+ {
+ "epoch": 3.2368794326241135,
+ "grad_norm": 2.8351151943206787,
+ "learning_rate": 2.2192475268666e-06,
+ "loss": 0.4069,
+ "step": 6846
+ },
+ {
+ "epoch": 3.2373522458628843,
+ "grad_norm": 2.6455280780792236,
+ "learning_rate": 2.218627654819225e-06,
+ "loss": 0.3626,
+ "step": 6847
+ },
+ {
+ "epoch": 3.237825059101655,
+ "grad_norm": 3.060352325439453,
+ "learning_rate": 2.2180078002916717e-06,
+ "loss": 0.3306,
+ "step": 6848
+ },
+ {
+ "epoch": 3.2382978723404254,
+ "grad_norm": 3.0178887844085693,
+ "learning_rate": 2.2173879633225355e-06,
+ "loss": 0.4111,
+ "step": 6849
+ },
+ {
+ "epoch": 3.2387706855791962,
+ "grad_norm": 2.895822763442993,
+ "learning_rate": 2.2167681439504123e-06,
+ "loss": 0.4053,
+ "step": 6850
+ },
+ {
+ "epoch": 3.239243498817967,
+ "grad_norm": 2.7295608520507812,
+ "learning_rate": 2.2161483422138945e-06,
+ "loss": 0.4021,
+ "step": 6851
+ },
+ {
+ "epoch": 3.2397163120567374,
+ "grad_norm": 3.1004912853240967,
+ "learning_rate": 2.2155285581515747e-06,
+ "loss": 0.3882,
+ "step": 6852
+ },
+ {
+ "epoch": 3.240189125295508,
+ "grad_norm": 2.927987813949585,
+ "learning_rate": 2.214908791802045e-06,
+ "loss": 0.4036,
+ "step": 6853
+ },
+ {
+ "epoch": 3.240661938534279,
+ "grad_norm": 3.1679599285125732,
+ "learning_rate": 2.2142890432038943e-06,
+ "loss": 0.3897,
+ "step": 6854
+ },
+ {
+ "epoch": 3.2411347517730498,
+ "grad_norm": 3.2094008922576904,
+ "learning_rate": 2.213669312395712e-06,
+ "loss": 0.4429,
+ "step": 6855
+ },
+ {
+ "epoch": 3.24160756501182,
+ "grad_norm": 4.637594223022461,
+ "learning_rate": 2.2130495994160857e-06,
+ "loss": 0.3708,
+ "step": 6856
+ },
+ {
+ "epoch": 3.242080378250591,
+ "grad_norm": 3.0063490867614746,
+ "learning_rate": 2.212429904303603e-06,
+ "loss": 0.3949,
+ "step": 6857
+ },
+ {
+ "epoch": 3.2425531914893617,
+ "grad_norm": 3.285444736480713,
+ "learning_rate": 2.21181022709685e-06,
+ "loss": 0.4236,
+ "step": 6858
+ },
+ {
+ "epoch": 3.2430260047281325,
+ "grad_norm": 3.02506422996521,
+ "learning_rate": 2.2111905678344086e-06,
+ "loss": 0.368,
+ "step": 6859
+ },
+ {
+ "epoch": 3.243498817966903,
+ "grad_norm": 2.9845006465911865,
+ "learning_rate": 2.2105709265548657e-06,
+ "loss": 0.4154,
+ "step": 6860
+ },
+ {
+ "epoch": 3.2439716312056737,
+ "grad_norm": 3.2537527084350586,
+ "learning_rate": 2.2099513032968013e-06,
+ "loss": 0.4385,
+ "step": 6861
+ },
+ {
+ "epoch": 3.2444444444444445,
+ "grad_norm": 2.8521063327789307,
+ "learning_rate": 2.2093316980987985e-06,
+ "loss": 0.384,
+ "step": 6862
+ },
+ {
+ "epoch": 3.2449172576832153,
+ "grad_norm": 3.186844825744629,
+ "learning_rate": 2.208712110999436e-06,
+ "loss": 0.4131,
+ "step": 6863
+ },
+ {
+ "epoch": 3.2453900709219856,
+ "grad_norm": 2.932058095932007,
+ "learning_rate": 2.208092542037292e-06,
+ "loss": 0.3341,
+ "step": 6864
+ },
+ {
+ "epoch": 3.2458628841607564,
+ "grad_norm": 3.0818707942962646,
+ "learning_rate": 2.2074729912509462e-06,
+ "loss": 0.4149,
+ "step": 6865
+ },
+ {
+ "epoch": 3.246335697399527,
+ "grad_norm": 2.9788503646850586,
+ "learning_rate": 2.2068534586789735e-06,
+ "loss": 0.3572,
+ "step": 6866
+ },
+ {
+ "epoch": 3.246808510638298,
+ "grad_norm": 2.84075665473938,
+ "learning_rate": 2.206233944359952e-06,
+ "loss": 0.3561,
+ "step": 6867
+ },
+ {
+ "epoch": 3.2472813238770684,
+ "grad_norm": 2.966459035873413,
+ "learning_rate": 2.2056144483324545e-06,
+ "loss": 0.3909,
+ "step": 6868
+ },
+ {
+ "epoch": 3.247754137115839,
+ "grad_norm": 2.892038106918335,
+ "learning_rate": 2.204994970635054e-06,
+ "loss": 0.3557,
+ "step": 6869
+ },
+ {
+ "epoch": 3.24822695035461,
+ "grad_norm": 2.7458810806274414,
+ "learning_rate": 2.2043755113063233e-06,
+ "loss": 0.3551,
+ "step": 6870
+ },
+ {
+ "epoch": 3.2486997635933808,
+ "grad_norm": 2.766803741455078,
+ "learning_rate": 2.2037560703848334e-06,
+ "loss": 0.3343,
+ "step": 6871
+ },
+ {
+ "epoch": 3.249172576832151,
+ "grad_norm": 2.9780561923980713,
+ "learning_rate": 2.2031366479091533e-06,
+ "loss": 0.4004,
+ "step": 6872
+ },
+ {
+ "epoch": 3.249645390070922,
+ "grad_norm": 2.8848516941070557,
+ "learning_rate": 2.202517243917853e-06,
+ "loss": 0.3467,
+ "step": 6873
+ },
+ {
+ "epoch": 3.2501182033096927,
+ "grad_norm": 2.9962213039398193,
+ "learning_rate": 2.201897858449499e-06,
+ "loss": 0.3796,
+ "step": 6874
+ },
+ {
+ "epoch": 3.2505910165484635,
+ "grad_norm": 2.838131904602051,
+ "learning_rate": 2.201278491542659e-06,
+ "loss": 0.3683,
+ "step": 6875
+ },
+ {
+ "epoch": 3.251063829787234,
+ "grad_norm": 3.0232505798339844,
+ "learning_rate": 2.200659143235897e-06,
+ "loss": 0.3793,
+ "step": 6876
+ },
+ {
+ "epoch": 3.2515366430260046,
+ "grad_norm": 3.0690126419067383,
+ "learning_rate": 2.2000398135677776e-06,
+ "loss": 0.417,
+ "step": 6877
+ },
+ {
+ "epoch": 3.2520094562647754,
+ "grad_norm": 3.1838719844818115,
+ "learning_rate": 2.1994205025768643e-06,
+ "loss": 0.4608,
+ "step": 6878
+ },
+ {
+ "epoch": 3.2524822695035462,
+ "grad_norm": 3.1187257766723633,
+ "learning_rate": 2.198801210301717e-06,
+ "loss": 0.3396,
+ "step": 6879
+ },
+ {
+ "epoch": 3.2529550827423166,
+ "grad_norm": 2.7608656883239746,
+ "learning_rate": 2.1981819367808984e-06,
+ "loss": 0.386,
+ "step": 6880
+ },
+ {
+ "epoch": 3.2534278959810874,
+ "grad_norm": 3.027456283569336,
+ "learning_rate": 2.197562682052968e-06,
+ "loss": 0.3941,
+ "step": 6881
+ },
+ {
+ "epoch": 3.253900709219858,
+ "grad_norm": 2.925515651702881,
+ "learning_rate": 2.1969434461564816e-06,
+ "loss": 0.3608,
+ "step": 6882
+ },
+ {
+ "epoch": 3.254373522458629,
+ "grad_norm": 2.946770668029785,
+ "learning_rate": 2.196324229129999e-06,
+ "loss": 0.4116,
+ "step": 6883
+ },
+ {
+ "epoch": 3.2548463356973993,
+ "grad_norm": 2.6497952938079834,
+ "learning_rate": 2.1957050310120746e-06,
+ "loss": 0.338,
+ "step": 6884
+ },
+ {
+ "epoch": 3.25531914893617,
+ "grad_norm": 2.6915128231048584,
+ "learning_rate": 2.195085851841264e-06,
+ "loss": 0.3372,
+ "step": 6885
+ },
+ {
+ "epoch": 3.255791962174941,
+ "grad_norm": 3.4022350311279297,
+ "learning_rate": 2.1944666916561205e-06,
+ "loss": 0.3844,
+ "step": 6886
+ },
+ {
+ "epoch": 3.2562647754137117,
+ "grad_norm": 2.7463366985321045,
+ "learning_rate": 2.1938475504951958e-06,
+ "loss": 0.3268,
+ "step": 6887
+ },
+ {
+ "epoch": 3.256737588652482,
+ "grad_norm": 2.828810691833496,
+ "learning_rate": 2.193228428397042e-06,
+ "loss": 0.3275,
+ "step": 6888
+ },
+ {
+ "epoch": 3.257210401891253,
+ "grad_norm": 3.4016268253326416,
+ "learning_rate": 2.192609325400208e-06,
+ "loss": 0.3916,
+ "step": 6889
+ },
+ {
+ "epoch": 3.2576832151300237,
+ "grad_norm": 2.4980733394622803,
+ "learning_rate": 2.191990241543245e-06,
+ "loss": 0.3636,
+ "step": 6890
+ },
+ {
+ "epoch": 3.2581560283687945,
+ "grad_norm": 3.0384702682495117,
+ "learning_rate": 2.191371176864698e-06,
+ "loss": 0.398,
+ "step": 6891
+ },
+ {
+ "epoch": 3.258628841607565,
+ "grad_norm": 2.8949527740478516,
+ "learning_rate": 2.190752131403115e-06,
+ "loss": 0.3919,
+ "step": 6892
+ },
+ {
+ "epoch": 3.2591016548463356,
+ "grad_norm": 2.765617609024048,
+ "learning_rate": 2.190133105197041e-06,
+ "loss": 0.3799,
+ "step": 6893
+ },
+ {
+ "epoch": 3.2595744680851064,
+ "grad_norm": 2.6149277687072754,
+ "learning_rate": 2.18951409828502e-06,
+ "loss": 0.3895,
+ "step": 6894
+ },
+ {
+ "epoch": 3.260047281323877,
+ "grad_norm": 2.9738945960998535,
+ "learning_rate": 2.1888951107055934e-06,
+ "loss": 0.3879,
+ "step": 6895
+ },
+ {
+ "epoch": 3.2605200945626476,
+ "grad_norm": 2.9438633918762207,
+ "learning_rate": 2.1882761424973053e-06,
+ "loss": 0.438,
+ "step": 6896
+ },
+ {
+ "epoch": 3.2609929078014184,
+ "grad_norm": 3.114243984222412,
+ "learning_rate": 2.1876571936986936e-06,
+ "loss": 0.4737,
+ "step": 6897
+ },
+ {
+ "epoch": 3.261465721040189,
+ "grad_norm": 3.017526388168335,
+ "learning_rate": 2.1870382643483e-06,
+ "loss": 0.4039,
+ "step": 6898
+ },
+ {
+ "epoch": 3.26193853427896,
+ "grad_norm": 3.1475703716278076,
+ "learning_rate": 2.1864193544846613e-06,
+ "loss": 0.3825,
+ "step": 6899
+ },
+ {
+ "epoch": 3.2624113475177303,
+ "grad_norm": 2.75502872467041,
+ "learning_rate": 2.1858004641463142e-06,
+ "loss": 0.3507,
+ "step": 6900
+ },
+ {
+ "epoch": 3.262884160756501,
+ "grad_norm": 3.0467209815979004,
+ "learning_rate": 2.1851815933717944e-06,
+ "loss": 0.3938,
+ "step": 6901
+ },
+ {
+ "epoch": 3.263356973995272,
+ "grad_norm": 2.993014097213745,
+ "learning_rate": 2.184562742199636e-06,
+ "loss": 0.3711,
+ "step": 6902
+ },
+ {
+ "epoch": 3.2638297872340427,
+ "grad_norm": 2.607309341430664,
+ "learning_rate": 2.183943910668373e-06,
+ "loss": 0.3689,
+ "step": 6903
+ },
+ {
+ "epoch": 3.264302600472813,
+ "grad_norm": 2.961653470993042,
+ "learning_rate": 2.1833250988165373e-06,
+ "loss": 0.3806,
+ "step": 6904
+ },
+ {
+ "epoch": 3.264775413711584,
+ "grad_norm": 2.8202552795410156,
+ "learning_rate": 2.1827063066826574e-06,
+ "loss": 0.391,
+ "step": 6905
+ },
+ {
+ "epoch": 3.2652482269503547,
+ "grad_norm": 3.032648801803589,
+ "learning_rate": 2.1820875343052666e-06,
+ "loss": 0.4011,
+ "step": 6906
+ },
+ {
+ "epoch": 3.2657210401891255,
+ "grad_norm": 2.8265180587768555,
+ "learning_rate": 2.1814687817228896e-06,
+ "loss": 0.3923,
+ "step": 6907
+ },
+ {
+ "epoch": 3.266193853427896,
+ "grad_norm": 3.1425564289093018,
+ "learning_rate": 2.1808500489740555e-06,
+ "loss": 0.4913,
+ "step": 6908
+ },
+ {
+ "epoch": 3.2666666666666666,
+ "grad_norm": 2.977809429168701,
+ "learning_rate": 2.18023133609729e-06,
+ "loss": 0.379,
+ "step": 6909
+ },
+ {
+ "epoch": 3.2671394799054374,
+ "grad_norm": 3.509551525115967,
+ "learning_rate": 2.1796126431311153e-06,
+ "loss": 0.4025,
+ "step": 6910
+ },
+ {
+ "epoch": 3.267612293144208,
+ "grad_norm": 2.9133846759796143,
+ "learning_rate": 2.178993970114058e-06,
+ "loss": 0.4209,
+ "step": 6911
+ },
+ {
+ "epoch": 3.2680851063829786,
+ "grad_norm": 2.945513963699341,
+ "learning_rate": 2.178375317084637e-06,
+ "loss": 0.3882,
+ "step": 6912
+ },
+ {
+ "epoch": 3.2685579196217494,
+ "grad_norm": 2.7868733406066895,
+ "learning_rate": 2.1777566840813763e-06,
+ "loss": 0.3456,
+ "step": 6913
+ },
+ {
+ "epoch": 3.26903073286052,
+ "grad_norm": 2.803220748901367,
+ "learning_rate": 2.1771380711427937e-06,
+ "loss": 0.3394,
+ "step": 6914
+ },
+ {
+ "epoch": 3.269503546099291,
+ "grad_norm": 3.1293554306030273,
+ "learning_rate": 2.176519478307407e-06,
+ "loss": 0.402,
+ "step": 6915
+ },
+ {
+ "epoch": 3.2699763593380613,
+ "grad_norm": 2.843971014022827,
+ "learning_rate": 2.1759009056137347e-06,
+ "loss": 0.3449,
+ "step": 6916
+ },
+ {
+ "epoch": 3.270449172576832,
+ "grad_norm": 2.9983274936676025,
+ "learning_rate": 2.1752823531002917e-06,
+ "loss": 0.4091,
+ "step": 6917
+ },
+ {
+ "epoch": 3.270921985815603,
+ "grad_norm": 2.686722993850708,
+ "learning_rate": 2.174663820805592e-06,
+ "loss": 0.4303,
+ "step": 6918
+ },
+ {
+ "epoch": 3.2713947990543737,
+ "grad_norm": 2.669349431991577,
+ "learning_rate": 2.1740453087681508e-06,
+ "loss": 0.3796,
+ "step": 6919
+ },
+ {
+ "epoch": 3.271867612293144,
+ "grad_norm": 2.992138624191284,
+ "learning_rate": 2.173426817026477e-06,
+ "loss": 0.4125,
+ "step": 6920
+ },
+ {
+ "epoch": 3.272340425531915,
+ "grad_norm": 3.332834243774414,
+ "learning_rate": 2.1728083456190852e-06,
+ "loss": 0.3885,
+ "step": 6921
+ },
+ {
+ "epoch": 3.2728132387706856,
+ "grad_norm": 2.869673013687134,
+ "learning_rate": 2.1721898945844825e-06,
+ "loss": 0.3941,
+ "step": 6922
+ },
+ {
+ "epoch": 3.2732860520094564,
+ "grad_norm": 2.804440975189209,
+ "learning_rate": 2.1715714639611774e-06,
+ "loss": 0.4007,
+ "step": 6923
+ },
+ {
+ "epoch": 3.273758865248227,
+ "grad_norm": 3.1751439571380615,
+ "learning_rate": 2.1709530537876774e-06,
+ "loss": 0.3981,
+ "step": 6924
+ },
+ {
+ "epoch": 3.2742316784869976,
+ "grad_norm": 2.6367175579071045,
+ "learning_rate": 2.1703346641024878e-06,
+ "loss": 0.3582,
+ "step": 6925
+ },
+ {
+ "epoch": 3.2747044917257684,
+ "grad_norm": 2.99164080619812,
+ "learning_rate": 2.1697162949441137e-06,
+ "loss": 0.3846,
+ "step": 6926
+ },
+ {
+ "epoch": 3.275177304964539,
+ "grad_norm": 3.3206982612609863,
+ "learning_rate": 2.169097946351057e-06,
+ "loss": 0.3689,
+ "step": 6927
+ },
+ {
+ "epoch": 3.2756501182033095,
+ "grad_norm": 2.927907943725586,
+ "learning_rate": 2.16847961836182e-06,
+ "loss": 0.3536,
+ "step": 6928
+ },
+ {
+ "epoch": 3.2761229314420803,
+ "grad_norm": 3.1950864791870117,
+ "learning_rate": 2.167861311014904e-06,
+ "loss": 0.4154,
+ "step": 6929
+ },
+ {
+ "epoch": 3.276595744680851,
+ "grad_norm": 2.888383388519287,
+ "learning_rate": 2.1672430243488073e-06,
+ "loss": 0.3702,
+ "step": 6930
+ },
+ {
+ "epoch": 3.277068557919622,
+ "grad_norm": 2.842287063598633,
+ "learning_rate": 2.166624758402029e-06,
+ "loss": 0.3623,
+ "step": 6931
+ },
+ {
+ "epoch": 3.2775413711583923,
+ "grad_norm": 2.84350323677063,
+ "learning_rate": 2.166006513213065e-06,
+ "loss": 0.3757,
+ "step": 6932
+ },
+ {
+ "epoch": 3.278014184397163,
+ "grad_norm": 3.105626344680786,
+ "learning_rate": 2.165388288820411e-06,
+ "loss": 0.3955,
+ "step": 6933
+ },
+ {
+ "epoch": 3.278486997635934,
+ "grad_norm": 3.273508071899414,
+ "learning_rate": 2.164770085262561e-06,
+ "loss": 0.4046,
+ "step": 6934
+ },
+ {
+ "epoch": 3.2789598108747047,
+ "grad_norm": 3.2530124187469482,
+ "learning_rate": 2.1641519025780066e-06,
+ "loss": 0.3141,
+ "step": 6935
+ },
+ {
+ "epoch": 3.279432624113475,
+ "grad_norm": 2.822849750518799,
+ "learning_rate": 2.163533740805242e-06,
+ "loss": 0.3973,
+ "step": 6936
+ },
+ {
+ "epoch": 3.279905437352246,
+ "grad_norm": 2.772097587585449,
+ "learning_rate": 2.162915599982756e-06,
+ "loss": 0.3606,
+ "step": 6937
+ },
+ {
+ "epoch": 3.2803782505910166,
+ "grad_norm": 3.150696039199829,
+ "learning_rate": 2.1622974801490365e-06,
+ "loss": 0.4709,
+ "step": 6938
+ },
+ {
+ "epoch": 3.2808510638297874,
+ "grad_norm": 3.2072134017944336,
+ "learning_rate": 2.1616793813425736e-06,
+ "loss": 0.3946,
+ "step": 6939
+ },
+ {
+ "epoch": 3.2813238770685578,
+ "grad_norm": 2.9922473430633545,
+ "learning_rate": 2.1610613036018515e-06,
+ "loss": 0.3263,
+ "step": 6940
+ },
+ {
+ "epoch": 3.2817966903073286,
+ "grad_norm": 2.7818009853363037,
+ "learning_rate": 2.1604432469653555e-06,
+ "loss": 0.3887,
+ "step": 6941
+ },
+ {
+ "epoch": 3.2822695035460994,
+ "grad_norm": 3.12998628616333,
+ "learning_rate": 2.15982521147157e-06,
+ "loss": 0.3522,
+ "step": 6942
+ },
+ {
+ "epoch": 3.28274231678487,
+ "grad_norm": 2.876678228378296,
+ "learning_rate": 2.159207197158976e-06,
+ "loss": 0.3643,
+ "step": 6943
+ },
+ {
+ "epoch": 3.2832151300236405,
+ "grad_norm": 2.825488805770874,
+ "learning_rate": 2.1585892040660565e-06,
+ "loss": 0.3223,
+ "step": 6944
+ },
+ {
+ "epoch": 3.2836879432624113,
+ "grad_norm": 2.8724498748779297,
+ "learning_rate": 2.1579712322312906e-06,
+ "loss": 0.3855,
+ "step": 6945
+ },
+ {
+ "epoch": 3.284160756501182,
+ "grad_norm": 2.841064691543579,
+ "learning_rate": 2.1573532816931547e-06,
+ "loss": 0.4106,
+ "step": 6946
+ },
+ {
+ "epoch": 3.284633569739953,
+ "grad_norm": 3.053391218185425,
+ "learning_rate": 2.1567353524901288e-06,
+ "loss": 0.4875,
+ "step": 6947
+ },
+ {
+ "epoch": 3.2851063829787233,
+ "grad_norm": 2.7294771671295166,
+ "learning_rate": 2.156117444660687e-06,
+ "loss": 0.3856,
+ "step": 6948
+ },
+ {
+ "epoch": 3.285579196217494,
+ "grad_norm": 3.0965659618377686,
+ "learning_rate": 2.155499558243304e-06,
+ "loss": 0.4104,
+ "step": 6949
+ },
+ {
+ "epoch": 3.286052009456265,
+ "grad_norm": 2.778923511505127,
+ "learning_rate": 2.1548816932764536e-06,
+ "loss": 0.3636,
+ "step": 6950
+ },
+ {
+ "epoch": 3.2865248226950357,
+ "grad_norm": 2.890679121017456,
+ "learning_rate": 2.1542638497986054e-06,
+ "loss": 0.4026,
+ "step": 6951
+ },
+ {
+ "epoch": 3.286997635933806,
+ "grad_norm": 3.0466806888580322,
+ "learning_rate": 2.1536460278482326e-06,
+ "loss": 0.3856,
+ "step": 6952
+ },
+ {
+ "epoch": 3.287470449172577,
+ "grad_norm": 3.1367077827453613,
+ "learning_rate": 2.1530282274638013e-06,
+ "loss": 0.3767,
+ "step": 6953
+ },
+ {
+ "epoch": 3.2879432624113476,
+ "grad_norm": 2.984694719314575,
+ "learning_rate": 2.1524104486837823e-06,
+ "loss": 0.4142,
+ "step": 6954
+ },
+ {
+ "epoch": 3.2884160756501184,
+ "grad_norm": 3.1542797088623047,
+ "learning_rate": 2.151792691546641e-06,
+ "loss": 0.4361,
+ "step": 6955
+ },
+ {
+ "epoch": 3.2888888888888888,
+ "grad_norm": 2.7306816577911377,
+ "learning_rate": 2.1511749560908405e-06,
+ "loss": 0.3692,
+ "step": 6956
+ },
+ {
+ "epoch": 3.2893617021276595,
+ "grad_norm": 3.6679904460906982,
+ "learning_rate": 2.150557242354847e-06,
+ "loss": 0.4496,
+ "step": 6957
+ },
+ {
+ "epoch": 3.2898345153664303,
+ "grad_norm": 3.2040863037109375,
+ "learning_rate": 2.1499395503771207e-06,
+ "loss": 0.3526,
+ "step": 6958
+ },
+ {
+ "epoch": 3.290307328605201,
+ "grad_norm": 3.2416043281555176,
+ "learning_rate": 2.1493218801961246e-06,
+ "loss": 0.3955,
+ "step": 6959
+ },
+ {
+ "epoch": 3.2907801418439715,
+ "grad_norm": 2.8164525032043457,
+ "learning_rate": 2.1487042318503174e-06,
+ "loss": 0.3727,
+ "step": 6960
+ },
+ {
+ "epoch": 3.2912529550827423,
+ "grad_norm": 2.5954513549804688,
+ "learning_rate": 2.148086605378156e-06,
+ "loss": 0.3315,
+ "step": 6961
+ },
+ {
+ "epoch": 3.291725768321513,
+ "grad_norm": 2.8068149089813232,
+ "learning_rate": 2.1474690008181e-06,
+ "loss": 0.3702,
+ "step": 6962
+ },
+ {
+ "epoch": 3.2921985815602834,
+ "grad_norm": 2.9063730239868164,
+ "learning_rate": 2.1468514182086025e-06,
+ "loss": 0.3357,
+ "step": 6963
+ },
+ {
+ "epoch": 3.2926713947990542,
+ "grad_norm": 2.7623207569122314,
+ "learning_rate": 2.1462338575881197e-06,
+ "loss": 0.381,
+ "step": 6964
+ },
+ {
+ "epoch": 3.293144208037825,
+ "grad_norm": 2.6818830966949463,
+ "learning_rate": 2.145616318995103e-06,
+ "loss": 0.3733,
+ "step": 6965
+ },
+ {
+ "epoch": 3.293617021276596,
+ "grad_norm": 2.7966864109039307,
+ "learning_rate": 2.1449988024680034e-06,
+ "loss": 0.3993,
+ "step": 6966
+ },
+ {
+ "epoch": 3.2940898345153666,
+ "grad_norm": 3.0644514560699463,
+ "learning_rate": 2.1443813080452728e-06,
+ "loss": 0.3541,
+ "step": 6967
+ },
+ {
+ "epoch": 3.294562647754137,
+ "grad_norm": 3.03204607963562,
+ "learning_rate": 2.1437638357653586e-06,
+ "loss": 0.3864,
+ "step": 6968
+ },
+ {
+ "epoch": 3.295035460992908,
+ "grad_norm": 2.980565071105957,
+ "learning_rate": 2.143146385666707e-06,
+ "loss": 0.36,
+ "step": 6969
+ },
+ {
+ "epoch": 3.2955082742316786,
+ "grad_norm": 3.1261661052703857,
+ "learning_rate": 2.1425289577877675e-06,
+ "loss": 0.4053,
+ "step": 6970
+ },
+ {
+ "epoch": 3.295981087470449,
+ "grad_norm": 3.0194897651672363,
+ "learning_rate": 2.1419115521669804e-06,
+ "loss": 0.4553,
+ "step": 6971
+ },
+ {
+ "epoch": 3.2964539007092197,
+ "grad_norm": 2.7620482444763184,
+ "learning_rate": 2.141294168842792e-06,
+ "loss": 0.3846,
+ "step": 6972
+ },
+ {
+ "epoch": 3.2969267139479905,
+ "grad_norm": 2.9575016498565674,
+ "learning_rate": 2.1406768078536427e-06,
+ "loss": 0.4415,
+ "step": 6973
+ },
+ {
+ "epoch": 3.2973995271867613,
+ "grad_norm": 3.17909574508667,
+ "learning_rate": 2.1400594692379717e-06,
+ "loss": 0.4514,
+ "step": 6974
+ },
+ {
+ "epoch": 3.297872340425532,
+ "grad_norm": 3.128613233566284,
+ "learning_rate": 2.1394421530342207e-06,
+ "loss": 0.3757,
+ "step": 6975
+ },
+ {
+ "epoch": 3.2983451536643025,
+ "grad_norm": 3.0247111320495605,
+ "learning_rate": 2.1388248592808243e-06,
+ "loss": 0.3881,
+ "step": 6976
+ },
+ {
+ "epoch": 3.2988179669030733,
+ "grad_norm": 2.8091228008270264,
+ "learning_rate": 2.1382075880162217e-06,
+ "loss": 0.3782,
+ "step": 6977
+ },
+ {
+ "epoch": 3.299290780141844,
+ "grad_norm": 2.985105514526367,
+ "learning_rate": 2.137590339278846e-06,
+ "loss": 0.3783,
+ "step": 6978
+ },
+ {
+ "epoch": 3.2997635933806144,
+ "grad_norm": 3.1862502098083496,
+ "learning_rate": 2.1369731131071304e-06,
+ "loss": 0.4776,
+ "step": 6979
+ },
+ {
+ "epoch": 3.300236406619385,
+ "grad_norm": 3.3138091564178467,
+ "learning_rate": 2.1363559095395075e-06,
+ "loss": 0.4056,
+ "step": 6980
+ },
+ {
+ "epoch": 3.300709219858156,
+ "grad_norm": 3.023695707321167,
+ "learning_rate": 2.135738728614407e-06,
+ "loss": 0.3716,
+ "step": 6981
+ },
+ {
+ "epoch": 3.301182033096927,
+ "grad_norm": 6.149252414703369,
+ "learning_rate": 2.135121570370259e-06,
+ "loss": 0.3713,
+ "step": 6982
+ },
+ {
+ "epoch": 3.3016548463356976,
+ "grad_norm": 2.689671754837036,
+ "learning_rate": 2.134504434845491e-06,
+ "loss": 0.3541,
+ "step": 6983
+ },
+ {
+ "epoch": 3.302127659574468,
+ "grad_norm": 3.241212844848633,
+ "learning_rate": 2.1338873220785284e-06,
+ "loss": 0.4328,
+ "step": 6984
+ },
+ {
+ "epoch": 3.3026004728132388,
+ "grad_norm": 3.6037068367004395,
+ "learning_rate": 2.133270232107798e-06,
+ "loss": 0.4091,
+ "step": 6985
+ },
+ {
+ "epoch": 3.3030732860520096,
+ "grad_norm": 3.300031900405884,
+ "learning_rate": 2.1326531649717216e-06,
+ "loss": 0.3742,
+ "step": 6986
+ },
+ {
+ "epoch": 3.30354609929078,
+ "grad_norm": 2.82257342338562,
+ "learning_rate": 2.1320361207087225e-06,
+ "loss": 0.3622,
+ "step": 6987
+ },
+ {
+ "epoch": 3.3040189125295507,
+ "grad_norm": 3.297513246536255,
+ "learning_rate": 2.1314190993572196e-06,
+ "loss": 0.4606,
+ "step": 6988
+ },
+ {
+ "epoch": 3.3044917257683215,
+ "grad_norm": 2.676440954208374,
+ "learning_rate": 2.130802100955634e-06,
+ "loss": 0.382,
+ "step": 6989
+ },
+ {
+ "epoch": 3.3049645390070923,
+ "grad_norm": 2.9548017978668213,
+ "learning_rate": 2.130185125542383e-06,
+ "loss": 0.3751,
+ "step": 6990
+ },
+ {
+ "epoch": 3.305437352245863,
+ "grad_norm": 2.800647020339966,
+ "learning_rate": 2.129568173155882e-06,
+ "loss": 0.3868,
+ "step": 6991
+ },
+ {
+ "epoch": 3.3059101654846335,
+ "grad_norm": 3.3789260387420654,
+ "learning_rate": 2.128951243834546e-06,
+ "loss": 0.4373,
+ "step": 6992
+ },
+ {
+ "epoch": 3.3063829787234043,
+ "grad_norm": 2.944807767868042,
+ "learning_rate": 2.12833433761679e-06,
+ "loss": 0.4205,
+ "step": 6993
+ },
+ {
+ "epoch": 3.306855791962175,
+ "grad_norm": 2.577975273132324,
+ "learning_rate": 2.127717454541025e-06,
+ "loss": 0.4197,
+ "step": 6994
+ },
+ {
+ "epoch": 3.3073286052009454,
+ "grad_norm": 3.0542666912078857,
+ "learning_rate": 2.127100594645661e-06,
+ "loss": 0.3811,
+ "step": 6995
+ },
+ {
+ "epoch": 3.307801418439716,
+ "grad_norm": 3.163015842437744,
+ "learning_rate": 2.1264837579691088e-06,
+ "loss": 0.415,
+ "step": 6996
+ },
+ {
+ "epoch": 3.308274231678487,
+ "grad_norm": 2.9161269664764404,
+ "learning_rate": 2.1258669445497746e-06,
+ "loss": 0.3714,
+ "step": 6997
+ },
+ {
+ "epoch": 3.308747044917258,
+ "grad_norm": 2.934483289718628,
+ "learning_rate": 2.1252501544260657e-06,
+ "loss": 0.4085,
+ "step": 6998
+ },
+ {
+ "epoch": 3.3092198581560286,
+ "grad_norm": 3.155613660812378,
+ "learning_rate": 2.1246333876363852e-06,
+ "loss": 0.4698,
+ "step": 6999
+ },
+ {
+ "epoch": 3.309692671394799,
+ "grad_norm": 2.648171901702881,
+ "learning_rate": 2.124016644219139e-06,
+ "loss": 0.3091,
+ "step": 7000
+ },
+ {
+ "epoch": 3.3101654846335697,
+ "grad_norm": 2.908219814300537,
+ "learning_rate": 2.123399924212728e-06,
+ "loss": 0.4063,
+ "step": 7001
+ },
+ {
+ "epoch": 3.3106382978723405,
+ "grad_norm": 3.138749361038208,
+ "learning_rate": 2.122783227655551e-06,
+ "loss": 0.4296,
+ "step": 7002
+ },
+ {
+ "epoch": 3.311111111111111,
+ "grad_norm": 3.044466018676758,
+ "learning_rate": 2.1221665545860094e-06,
+ "loss": 0.4424,
+ "step": 7003
+ },
+ {
+ "epoch": 3.3115839243498817,
+ "grad_norm": 2.6758792400360107,
+ "learning_rate": 2.121549905042499e-06,
+ "loss": 0.4073,
+ "step": 7004
+ },
+ {
+ "epoch": 3.3120567375886525,
+ "grad_norm": 2.8901989459991455,
+ "learning_rate": 2.1209332790634174e-06,
+ "loss": 0.3842,
+ "step": 7005
+ },
+ {
+ "epoch": 3.3125295508274233,
+ "grad_norm": 2.8179712295532227,
+ "learning_rate": 2.1203166766871582e-06,
+ "loss": 0.366,
+ "step": 7006
+ },
+ {
+ "epoch": 3.313002364066194,
+ "grad_norm": 2.6536550521850586,
+ "learning_rate": 2.1197000979521138e-06,
+ "loss": 0.3851,
+ "step": 7007
+ },
+ {
+ "epoch": 3.3134751773049644,
+ "grad_norm": 3.1277682781219482,
+ "learning_rate": 2.1190835428966775e-06,
+ "loss": 0.4249,
+ "step": 7008
+ },
+ {
+ "epoch": 3.3139479905437352,
+ "grad_norm": 2.924666166305542,
+ "learning_rate": 2.1184670115592383e-06,
+ "loss": 0.3873,
+ "step": 7009
+ },
+ {
+ "epoch": 3.314420803782506,
+ "grad_norm": 2.7921009063720703,
+ "learning_rate": 2.1178505039781856e-06,
+ "loss": 0.3754,
+ "step": 7010
+ },
+ {
+ "epoch": 3.3148936170212764,
+ "grad_norm": 2.5349879264831543,
+ "learning_rate": 2.1172340201919067e-06,
+ "loss": 0.3701,
+ "step": 7011
+ },
+ {
+ "epoch": 3.315366430260047,
+ "grad_norm": 2.849376678466797,
+ "learning_rate": 2.1166175602387866e-06,
+ "loss": 0.3963,
+ "step": 7012
+ },
+ {
+ "epoch": 3.315839243498818,
+ "grad_norm": 3.141280174255371,
+ "learning_rate": 2.11600112415721e-06,
+ "loss": 0.4158,
+ "step": 7013
+ },
+ {
+ "epoch": 3.3163120567375888,
+ "grad_norm": 2.922807455062866,
+ "learning_rate": 2.11538471198556e-06,
+ "loss": 0.3667,
+ "step": 7014
+ },
+ {
+ "epoch": 3.3167848699763596,
+ "grad_norm": 2.770400047302246,
+ "learning_rate": 2.114768323762216e-06,
+ "loss": 0.3674,
+ "step": 7015
+ },
+ {
+ "epoch": 3.31725768321513,
+ "grad_norm": 2.7706570625305176,
+ "learning_rate": 2.114151959525561e-06,
+ "loss": 0.3761,
+ "step": 7016
+ },
+ {
+ "epoch": 3.3177304964539007,
+ "grad_norm": 3.041755437850952,
+ "learning_rate": 2.1135356193139704e-06,
+ "loss": 0.4483,
+ "step": 7017
+ },
+ {
+ "epoch": 3.3182033096926715,
+ "grad_norm": 3.5757904052734375,
+ "learning_rate": 2.1129193031658227e-06,
+ "loss": 0.4094,
+ "step": 7018
+ },
+ {
+ "epoch": 3.318676122931442,
+ "grad_norm": 2.9292917251586914,
+ "learning_rate": 2.1123030111194936e-06,
+ "loss": 0.3514,
+ "step": 7019
+ },
+ {
+ "epoch": 3.3191489361702127,
+ "grad_norm": 3.1443874835968018,
+ "learning_rate": 2.111686743213355e-06,
+ "loss": 0.4098,
+ "step": 7020
+ },
+ {
+ "epoch": 3.3196217494089835,
+ "grad_norm": 2.9738030433654785,
+ "learning_rate": 2.1110704994857804e-06,
+ "loss": 0.3584,
+ "step": 7021
+ },
+ {
+ "epoch": 3.3200945626477543,
+ "grad_norm": 2.8961563110351562,
+ "learning_rate": 2.1104542799751397e-06,
+ "loss": 0.3736,
+ "step": 7022
+ },
+ {
+ "epoch": 3.320567375886525,
+ "grad_norm": 3.9264683723449707,
+ "learning_rate": 2.1098380847198037e-06,
+ "loss": 0.457,
+ "step": 7023
+ },
+ {
+ "epoch": 3.3210401891252954,
+ "grad_norm": 2.8742756843566895,
+ "learning_rate": 2.109221913758139e-06,
+ "loss": 0.4252,
+ "step": 7024
+ },
+ {
+ "epoch": 3.321513002364066,
+ "grad_norm": 3.7229559421539307,
+ "learning_rate": 2.108605767128512e-06,
+ "loss": 0.4451,
+ "step": 7025
+ },
+ {
+ "epoch": 3.321985815602837,
+ "grad_norm": 2.6417593955993652,
+ "learning_rate": 2.1079896448692884e-06,
+ "loss": 0.3658,
+ "step": 7026
+ },
+ {
+ "epoch": 3.3224586288416074,
+ "grad_norm": 2.8780412673950195,
+ "learning_rate": 2.10737354701883e-06,
+ "loss": 0.4225,
+ "step": 7027
+ },
+ {
+ "epoch": 3.322931442080378,
+ "grad_norm": 2.557816505432129,
+ "learning_rate": 2.1067574736155e-06,
+ "loss": 0.3812,
+ "step": 7028
+ },
+ {
+ "epoch": 3.323404255319149,
+ "grad_norm": 2.859062910079956,
+ "learning_rate": 2.106141424697658e-06,
+ "loss": 0.3629,
+ "step": 7029
+ },
+ {
+ "epoch": 3.3238770685579198,
+ "grad_norm": 2.4776878356933594,
+ "learning_rate": 2.1055254003036607e-06,
+ "loss": 0.3591,
+ "step": 7030
+ },
+ {
+ "epoch": 3.3243498817966906,
+ "grad_norm": 3.085066795349121,
+ "learning_rate": 2.1049094004718687e-06,
+ "loss": 0.4237,
+ "step": 7031
+ },
+ {
+ "epoch": 3.324822695035461,
+ "grad_norm": 2.862592935562134,
+ "learning_rate": 2.1042934252406345e-06,
+ "loss": 0.3185,
+ "step": 7032
+ },
+ {
+ "epoch": 3.3252955082742317,
+ "grad_norm": 2.965743064880371,
+ "learning_rate": 2.1036774746483145e-06,
+ "loss": 0.4058,
+ "step": 7033
+ },
+ {
+ "epoch": 3.3257683215130025,
+ "grad_norm": 2.7420589923858643,
+ "learning_rate": 2.103061548733261e-06,
+ "loss": 0.3566,
+ "step": 7034
+ },
+ {
+ "epoch": 3.326241134751773,
+ "grad_norm": 2.7824347019195557,
+ "learning_rate": 2.1024456475338235e-06,
+ "loss": 0.3553,
+ "step": 7035
+ },
+ {
+ "epoch": 3.3267139479905437,
+ "grad_norm": 3.0410704612731934,
+ "learning_rate": 2.1018297710883528e-06,
+ "loss": 0.3772,
+ "step": 7036
+ },
+ {
+ "epoch": 3.3271867612293144,
+ "grad_norm": 3.0811562538146973,
+ "learning_rate": 2.101213919435196e-06,
+ "loss": 0.3738,
+ "step": 7037
+ },
+ {
+ "epoch": 3.3276595744680852,
+ "grad_norm": 2.939445734024048,
+ "learning_rate": 2.100598092612699e-06,
+ "loss": 0.4107,
+ "step": 7038
+ },
+ {
+ "epoch": 3.3281323877068556,
+ "grad_norm": 3.05804705619812,
+ "learning_rate": 2.0999822906592086e-06,
+ "loss": 0.3972,
+ "step": 7039
+ },
+ {
+ "epoch": 3.3286052009456264,
+ "grad_norm": 2.803558111190796,
+ "learning_rate": 2.0993665136130657e-06,
+ "loss": 0.3487,
+ "step": 7040
+ },
+ {
+ "epoch": 3.329078014184397,
+ "grad_norm": 2.937675714492798,
+ "learning_rate": 2.0987507615126147e-06,
+ "loss": 0.4095,
+ "step": 7041
+ },
+ {
+ "epoch": 3.329550827423168,
+ "grad_norm": 2.853905439376831,
+ "learning_rate": 2.098135034396194e-06,
+ "loss": 0.3775,
+ "step": 7042
+ },
+ {
+ "epoch": 3.3300236406619383,
+ "grad_norm": 3.3520495891571045,
+ "learning_rate": 2.097519332302142e-06,
+ "loss": 0.4065,
+ "step": 7043
+ },
+ {
+ "epoch": 3.330496453900709,
+ "grad_norm": 2.8787078857421875,
+ "learning_rate": 2.096903655268797e-06,
+ "loss": 0.3452,
+ "step": 7044
+ },
+ {
+ "epoch": 3.33096926713948,
+ "grad_norm": 2.993896007537842,
+ "learning_rate": 2.096288003334493e-06,
+ "loss": 0.3814,
+ "step": 7045
+ },
+ {
+ "epoch": 3.3314420803782507,
+ "grad_norm": 3.5248336791992188,
+ "learning_rate": 2.0956723765375655e-06,
+ "loss": 0.3852,
+ "step": 7046
+ },
+ {
+ "epoch": 3.331914893617021,
+ "grad_norm": 3.2227890491485596,
+ "learning_rate": 2.0950567749163463e-06,
+ "loss": 0.3913,
+ "step": 7047
+ },
+ {
+ "epoch": 3.332387706855792,
+ "grad_norm": 3.390401601791382,
+ "learning_rate": 2.094441198509165e-06,
+ "loss": 0.3944,
+ "step": 7048
+ },
+ {
+ "epoch": 3.3328605200945627,
+ "grad_norm": 3.2057554721832275,
+ "learning_rate": 2.0938256473543534e-06,
+ "loss": 0.404,
+ "step": 7049
+ },
+ {
+ "epoch": 3.3333333333333335,
+ "grad_norm": 2.866708755493164,
+ "learning_rate": 2.0932101214902367e-06,
+ "loss": 0.4345,
+ "step": 7050
+ },
+ {
+ "epoch": 3.333806146572104,
+ "grad_norm": 3.4304039478302,
+ "learning_rate": 2.0925946209551428e-06,
+ "loss": 0.4209,
+ "step": 7051
+ },
+ {
+ "epoch": 3.3342789598108746,
+ "grad_norm": 3.996561288833618,
+ "learning_rate": 2.091979145787395e-06,
+ "loss": 0.4394,
+ "step": 7052
+ },
+ {
+ "epoch": 3.3347517730496454,
+ "grad_norm": 3.1932613849639893,
+ "learning_rate": 2.0913636960253166e-06,
+ "loss": 0.3837,
+ "step": 7053
+ },
+ {
+ "epoch": 3.3352245862884162,
+ "grad_norm": 2.908832311630249,
+ "learning_rate": 2.0907482717072293e-06,
+ "loss": 0.3526,
+ "step": 7054
+ },
+ {
+ "epoch": 3.3356973995271866,
+ "grad_norm": 2.7319607734680176,
+ "learning_rate": 2.090132872871452e-06,
+ "loss": 0.3686,
+ "step": 7055
+ },
+ {
+ "epoch": 3.3361702127659574,
+ "grad_norm": 2.9213504791259766,
+ "learning_rate": 2.0895174995563043e-06,
+ "loss": 0.4034,
+ "step": 7056
+ },
+ {
+ "epoch": 3.336643026004728,
+ "grad_norm": 2.8093936443328857,
+ "learning_rate": 2.0889021518001017e-06,
+ "loss": 0.4151,
+ "step": 7057
+ },
+ {
+ "epoch": 3.337115839243499,
+ "grad_norm": 3.1840829849243164,
+ "learning_rate": 2.0882868296411594e-06,
+ "loss": 0.3501,
+ "step": 7058
+ },
+ {
+ "epoch": 3.3375886524822693,
+ "grad_norm": 2.793567657470703,
+ "learning_rate": 2.087671533117791e-06,
+ "loss": 0.3911,
+ "step": 7059
+ },
+ {
+ "epoch": 3.33806146572104,
+ "grad_norm": 3.0820090770721436,
+ "learning_rate": 2.0870562622683077e-06,
+ "loss": 0.432,
+ "step": 7060
+ },
+ {
+ "epoch": 3.338534278959811,
+ "grad_norm": 2.774630546569824,
+ "learning_rate": 2.0864410171310213e-06,
+ "loss": 0.3434,
+ "step": 7061
+ },
+ {
+ "epoch": 3.3390070921985817,
+ "grad_norm": 2.70447039604187,
+ "learning_rate": 2.085825797744239e-06,
+ "loss": 0.3787,
+ "step": 7062
+ },
+ {
+ "epoch": 3.339479905437352,
+ "grad_norm": 3.1014437675476074,
+ "learning_rate": 2.0852106041462672e-06,
+ "loss": 0.4568,
+ "step": 7063
+ },
+ {
+ "epoch": 3.339952718676123,
+ "grad_norm": 3.312680244445801,
+ "learning_rate": 2.0845954363754133e-06,
+ "loss": 0.4285,
+ "step": 7064
+ },
+ {
+ "epoch": 3.3404255319148937,
+ "grad_norm": 2.7070534229278564,
+ "learning_rate": 2.0839802944699806e-06,
+ "loss": 0.4096,
+ "step": 7065
+ },
+ {
+ "epoch": 3.3408983451536645,
+ "grad_norm": 2.8172531127929688,
+ "learning_rate": 2.083365178468269e-06,
+ "loss": 0.3652,
+ "step": 7066
+ },
+ {
+ "epoch": 3.341371158392435,
+ "grad_norm": 2.896378517150879,
+ "learning_rate": 2.082750088408582e-06,
+ "loss": 0.3778,
+ "step": 7067
+ },
+ {
+ "epoch": 3.3418439716312056,
+ "grad_norm": 2.769805669784546,
+ "learning_rate": 2.0821350243292175e-06,
+ "loss": 0.3593,
+ "step": 7068
+ },
+ {
+ "epoch": 3.3423167848699764,
+ "grad_norm": 2.672520875930786,
+ "learning_rate": 2.0815199862684728e-06,
+ "loss": 0.3873,
+ "step": 7069
+ },
+ {
+ "epoch": 3.342789598108747,
+ "grad_norm": 2.841327428817749,
+ "learning_rate": 2.0809049742646435e-06,
+ "loss": 0.41,
+ "step": 7070
+ },
+ {
+ "epoch": 3.3432624113475176,
+ "grad_norm": 3.0540482997894287,
+ "learning_rate": 2.080289988356023e-06,
+ "loss": 0.32,
+ "step": 7071
+ },
+ {
+ "epoch": 3.3437352245862884,
+ "grad_norm": 3.471684217453003,
+ "learning_rate": 2.079675028580905e-06,
+ "loss": 0.3779,
+ "step": 7072
+ },
+ {
+ "epoch": 3.344208037825059,
+ "grad_norm": 2.8545875549316406,
+ "learning_rate": 2.07906009497758e-06,
+ "loss": 0.4645,
+ "step": 7073
+ },
+ {
+ "epoch": 3.34468085106383,
+ "grad_norm": 2.7771127223968506,
+ "learning_rate": 2.078445187584337e-06,
+ "loss": 0.3889,
+ "step": 7074
+ },
+ {
+ "epoch": 3.3451536643026003,
+ "grad_norm": 2.769188165664673,
+ "learning_rate": 2.0778303064394647e-06,
+ "loss": 0.3745,
+ "step": 7075
+ },
+ {
+ "epoch": 3.345626477541371,
+ "grad_norm": 2.739577531814575,
+ "learning_rate": 2.0772154515812467e-06,
+ "loss": 0.4402,
+ "step": 7076
+ },
+ {
+ "epoch": 3.346099290780142,
+ "grad_norm": 2.6124343872070312,
+ "learning_rate": 2.0766006230479696e-06,
+ "loss": 0.3595,
+ "step": 7077
+ },
+ {
+ "epoch": 3.3465721040189127,
+ "grad_norm": 2.7100563049316406,
+ "learning_rate": 2.0759858208779136e-06,
+ "loss": 0.3641,
+ "step": 7078
+ },
+ {
+ "epoch": 3.347044917257683,
+ "grad_norm": 2.8594000339508057,
+ "learning_rate": 2.075371045109363e-06,
+ "loss": 0.402,
+ "step": 7079
+ },
+ {
+ "epoch": 3.347517730496454,
+ "grad_norm": 3.2045278549194336,
+ "learning_rate": 2.0747562957805955e-06,
+ "loss": 0.4719,
+ "step": 7080
+ },
+ {
+ "epoch": 3.3479905437352246,
+ "grad_norm": 2.825594663619995,
+ "learning_rate": 2.0741415729298874e-06,
+ "loss": 0.4127,
+ "step": 7081
+ },
+ {
+ "epoch": 3.3484633569739954,
+ "grad_norm": 2.992403984069824,
+ "learning_rate": 2.0735268765955173e-06,
+ "loss": 0.3943,
+ "step": 7082
+ },
+ {
+ "epoch": 3.348936170212766,
+ "grad_norm": 3.0629165172576904,
+ "learning_rate": 2.072912206815758e-06,
+ "loss": 0.4132,
+ "step": 7083
+ },
+ {
+ "epoch": 3.3494089834515366,
+ "grad_norm": 2.7553658485412598,
+ "learning_rate": 2.0722975636288836e-06,
+ "loss": 0.3667,
+ "step": 7084
+ },
+ {
+ "epoch": 3.3498817966903074,
+ "grad_norm": 3.1556780338287354,
+ "learning_rate": 2.0716829470731647e-06,
+ "loss": 0.4383,
+ "step": 7085
+ },
+ {
+ "epoch": 3.350354609929078,
+ "grad_norm": 2.6693310737609863,
+ "learning_rate": 2.071068357186869e-06,
+ "loss": 0.363,
+ "step": 7086
+ },
+ {
+ "epoch": 3.3508274231678485,
+ "grad_norm": 2.738314628601074,
+ "learning_rate": 2.0704537940082673e-06,
+ "loss": 0.3493,
+ "step": 7087
+ },
+ {
+ "epoch": 3.3513002364066193,
+ "grad_norm": 3.2205989360809326,
+ "learning_rate": 2.069839257575624e-06,
+ "loss": 0.3802,
+ "step": 7088
+ },
+ {
+ "epoch": 3.35177304964539,
+ "grad_norm": 2.8969876766204834,
+ "learning_rate": 2.069224747927203e-06,
+ "loss": 0.3846,
+ "step": 7089
+ },
+ {
+ "epoch": 3.352245862884161,
+ "grad_norm": 2.833179473876953,
+ "learning_rate": 2.0686102651012694e-06,
+ "loss": 0.3892,
+ "step": 7090
+ },
+ {
+ "epoch": 3.3527186761229313,
+ "grad_norm": 3.303830623626709,
+ "learning_rate": 2.067995809136082e-06,
+ "loss": 0.4009,
+ "step": 7091
+ },
+ {
+ "epoch": 3.353191489361702,
+ "grad_norm": 3.3684141635894775,
+ "learning_rate": 2.0673813800699024e-06,
+ "loss": 0.4434,
+ "step": 7092
+ },
+ {
+ "epoch": 3.353664302600473,
+ "grad_norm": 2.6549112796783447,
+ "learning_rate": 2.066766977940987e-06,
+ "loss": 0.3941,
+ "step": 7093
+ },
+ {
+ "epoch": 3.3541371158392437,
+ "grad_norm": 2.852935314178467,
+ "learning_rate": 2.066152602787591e-06,
+ "loss": 0.4143,
+ "step": 7094
+ },
+ {
+ "epoch": 3.354609929078014,
+ "grad_norm": 2.9621706008911133,
+ "learning_rate": 2.0655382546479713e-06,
+ "loss": 0.4502,
+ "step": 7095
+ },
+ {
+ "epoch": 3.355082742316785,
+ "grad_norm": 3.2836413383483887,
+ "learning_rate": 2.064923933560378e-06,
+ "loss": 0.3993,
+ "step": 7096
+ },
+ {
+ "epoch": 3.3555555555555556,
+ "grad_norm": 2.8187968730926514,
+ "learning_rate": 2.0643096395630654e-06,
+ "loss": 0.3766,
+ "step": 7097
+ },
+ {
+ "epoch": 3.3560283687943264,
+ "grad_norm": 2.7965118885040283,
+ "learning_rate": 2.0636953726942803e-06,
+ "loss": 0.4258,
+ "step": 7098
+ },
+ {
+ "epoch": 3.3565011820330968,
+ "grad_norm": 3.002030611038208,
+ "learning_rate": 2.063081132992271e-06,
+ "loss": 0.3548,
+ "step": 7099
+ },
+ {
+ "epoch": 3.3569739952718676,
+ "grad_norm": 2.927603006362915,
+ "learning_rate": 2.0624669204952847e-06,
+ "loss": 0.3759,
+ "step": 7100
+ },
+ {
+ "epoch": 3.3574468085106384,
+ "grad_norm": 2.911393165588379,
+ "learning_rate": 2.061852735241563e-06,
+ "loss": 0.3599,
+ "step": 7101
+ },
+ {
+ "epoch": 3.357919621749409,
+ "grad_norm": 3.0596864223480225,
+ "learning_rate": 2.0612385772693517e-06,
+ "loss": 0.3557,
+ "step": 7102
+ },
+ {
+ "epoch": 3.3583924349881795,
+ "grad_norm": 2.7869808673858643,
+ "learning_rate": 2.0606244466168905e-06,
+ "loss": 0.3696,
+ "step": 7103
+ },
+ {
+ "epoch": 3.3588652482269503,
+ "grad_norm": 2.927715539932251,
+ "learning_rate": 2.060010343322417e-06,
+ "loss": 0.3309,
+ "step": 7104
+ },
+ {
+ "epoch": 3.359338061465721,
+ "grad_norm": 3.44653058052063,
+ "learning_rate": 2.059396267424171e-06,
+ "loss": 0.4453,
+ "step": 7105
+ },
+ {
+ "epoch": 3.359810874704492,
+ "grad_norm": 3.047652244567871,
+ "learning_rate": 2.0587822189603873e-06,
+ "loss": 0.3615,
+ "step": 7106
+ },
+ {
+ "epoch": 3.3602836879432623,
+ "grad_norm": 2.6640517711639404,
+ "learning_rate": 2.0581681979693002e-06,
+ "loss": 0.3716,
+ "step": 7107
+ },
+ {
+ "epoch": 3.360756501182033,
+ "grad_norm": 2.8253493309020996,
+ "learning_rate": 2.0575542044891424e-06,
+ "loss": 0.3485,
+ "step": 7108
+ },
+ {
+ "epoch": 3.361229314420804,
+ "grad_norm": 3.0512938499450684,
+ "learning_rate": 2.0569402385581433e-06,
+ "loss": 0.4582,
+ "step": 7109
+ },
+ {
+ "epoch": 3.3617021276595747,
+ "grad_norm": 2.935060739517212,
+ "learning_rate": 2.0563263002145333e-06,
+ "loss": 0.425,
+ "step": 7110
+ },
+ {
+ "epoch": 3.362174940898345,
+ "grad_norm": 3.2708780765533447,
+ "learning_rate": 2.0557123894965396e-06,
+ "loss": 0.4193,
+ "step": 7111
+ },
+ {
+ "epoch": 3.362647754137116,
+ "grad_norm": 2.758329391479492,
+ "learning_rate": 2.055098506442386e-06,
+ "loss": 0.3754,
+ "step": 7112
+ },
+ {
+ "epoch": 3.3631205673758866,
+ "grad_norm": 3.0359015464782715,
+ "learning_rate": 2.0544846510902987e-06,
+ "loss": 0.4207,
+ "step": 7113
+ },
+ {
+ "epoch": 3.3635933806146574,
+ "grad_norm": 3.096968412399292,
+ "learning_rate": 2.0538708234784983e-06,
+ "loss": 0.4303,
+ "step": 7114
+ },
+ {
+ "epoch": 3.3640661938534278,
+ "grad_norm": 3.0777673721313477,
+ "learning_rate": 2.053257023645206e-06,
+ "loss": 0.3904,
+ "step": 7115
+ },
+ {
+ "epoch": 3.3645390070921986,
+ "grad_norm": 2.9483232498168945,
+ "learning_rate": 2.0526432516286394e-06,
+ "loss": 0.3949,
+ "step": 7116
+ },
+ {
+ "epoch": 3.3650118203309693,
+ "grad_norm": 2.839067220687866,
+ "learning_rate": 2.0520295074670154e-06,
+ "loss": 0.3705,
+ "step": 7117
+ },
+ {
+ "epoch": 3.36548463356974,
+ "grad_norm": 3.0450778007507324,
+ "learning_rate": 2.0514157911985506e-06,
+ "loss": 0.3987,
+ "step": 7118
+ },
+ {
+ "epoch": 3.3659574468085105,
+ "grad_norm": 3.425318717956543,
+ "learning_rate": 2.0508021028614564e-06,
+ "loss": 0.3941,
+ "step": 7119
+ },
+ {
+ "epoch": 3.3664302600472813,
+ "grad_norm": 2.9509286880493164,
+ "learning_rate": 2.0501884424939465e-06,
+ "loss": 0.354,
+ "step": 7120
+ },
+ {
+ "epoch": 3.366903073286052,
+ "grad_norm": 2.799504518508911,
+ "learning_rate": 2.0495748101342303e-06,
+ "loss": 0.3891,
+ "step": 7121
+ },
+ {
+ "epoch": 3.3673758865248224,
+ "grad_norm": 2.9140994548797607,
+ "learning_rate": 2.048961205820515e-06,
+ "loss": 0.3638,
+ "step": 7122
+ },
+ {
+ "epoch": 3.3678486997635932,
+ "grad_norm": 2.8074216842651367,
+ "learning_rate": 2.0483476295910077e-06,
+ "loss": 0.3501,
+ "step": 7123
+ },
+ {
+ "epoch": 3.368321513002364,
+ "grad_norm": 2.770829677581787,
+ "learning_rate": 2.0477340814839126e-06,
+ "loss": 0.3774,
+ "step": 7124
+ },
+ {
+ "epoch": 3.368794326241135,
+ "grad_norm": 2.581655502319336,
+ "learning_rate": 2.047120561537434e-06,
+ "loss": 0.3523,
+ "step": 7125
+ },
+ {
+ "epoch": 3.3692671394799056,
+ "grad_norm": 3.4234209060668945,
+ "learning_rate": 2.046507069789772e-06,
+ "loss": 0.4191,
+ "step": 7126
+ },
+ {
+ "epoch": 3.369739952718676,
+ "grad_norm": 2.669860601425171,
+ "learning_rate": 2.045893606279126e-06,
+ "loss": 0.3542,
+ "step": 7127
+ },
+ {
+ "epoch": 3.370212765957447,
+ "grad_norm": 3.2426629066467285,
+ "learning_rate": 2.045280171043694e-06,
+ "loss": 0.4416,
+ "step": 7128
+ },
+ {
+ "epoch": 3.3706855791962176,
+ "grad_norm": 3.1318910121917725,
+ "learning_rate": 2.044666764121672e-06,
+ "loss": 0.3999,
+ "step": 7129
+ },
+ {
+ "epoch": 3.371158392434988,
+ "grad_norm": 2.7044012546539307,
+ "learning_rate": 2.044053385551254e-06,
+ "loss": 0.3907,
+ "step": 7130
+ },
+ {
+ "epoch": 3.3716312056737587,
+ "grad_norm": 2.9429895877838135,
+ "learning_rate": 2.0434400353706322e-06,
+ "loss": 0.3827,
+ "step": 7131
+ },
+ {
+ "epoch": 3.3721040189125295,
+ "grad_norm": 2.7258787155151367,
+ "learning_rate": 2.0428267136179973e-06,
+ "loss": 0.3688,
+ "step": 7132
+ },
+ {
+ "epoch": 3.3725768321513003,
+ "grad_norm": 2.765108108520508,
+ "learning_rate": 2.042213420331539e-06,
+ "loss": 0.4078,
+ "step": 7133
+ },
+ {
+ "epoch": 3.373049645390071,
+ "grad_norm": 3.2951347827911377,
+ "learning_rate": 2.0416001555494435e-06,
+ "loss": 0.4259,
+ "step": 7134
+ },
+ {
+ "epoch": 3.3735224586288415,
+ "grad_norm": 3.3917062282562256,
+ "learning_rate": 2.040986919309895e-06,
+ "loss": 0.5094,
+ "step": 7135
+ },
+ {
+ "epoch": 3.3739952718676123,
+ "grad_norm": 2.746434450149536,
+ "learning_rate": 2.04037371165108e-06,
+ "loss": 0.3513,
+ "step": 7136
+ },
+ {
+ "epoch": 3.374468085106383,
+ "grad_norm": 3.268731117248535,
+ "learning_rate": 2.0397605326111774e-06,
+ "loss": 0.3909,
+ "step": 7137
+ },
+ {
+ "epoch": 3.3749408983451534,
+ "grad_norm": 2.8498165607452393,
+ "learning_rate": 2.0391473822283692e-06,
+ "loss": 0.3657,
+ "step": 7138
+ },
+ {
+ "epoch": 3.3754137115839242,
+ "grad_norm": 2.855966567993164,
+ "learning_rate": 2.0385342605408325e-06,
+ "loss": 0.3927,
+ "step": 7139
+ },
+ {
+ "epoch": 3.375886524822695,
+ "grad_norm": 3.1839048862457275,
+ "learning_rate": 2.0379211675867438e-06,
+ "loss": 0.4476,
+ "step": 7140
+ },
+ {
+ "epoch": 3.376359338061466,
+ "grad_norm": 2.9379947185516357,
+ "learning_rate": 2.037308103404278e-06,
+ "loss": 0.3657,
+ "step": 7141
+ },
+ {
+ "epoch": 3.3768321513002366,
+ "grad_norm": 2.9251210689544678,
+ "learning_rate": 2.0366950680316073e-06,
+ "loss": 0.3975,
+ "step": 7142
+ },
+ {
+ "epoch": 3.377304964539007,
+ "grad_norm": 2.811885118484497,
+ "learning_rate": 2.036082061506904e-06,
+ "loss": 0.3064,
+ "step": 7143
+ },
+ {
+ "epoch": 3.3777777777777778,
+ "grad_norm": 2.755229949951172,
+ "learning_rate": 2.0354690838683363e-06,
+ "loss": 0.3328,
+ "step": 7144
+ },
+ {
+ "epoch": 3.3782505910165486,
+ "grad_norm": 3.006819725036621,
+ "learning_rate": 2.0348561351540706e-06,
+ "loss": 0.4168,
+ "step": 7145
+ },
+ {
+ "epoch": 3.378723404255319,
+ "grad_norm": 2.8788509368896484,
+ "learning_rate": 2.034243215402275e-06,
+ "loss": 0.4123,
+ "step": 7146
+ },
+ {
+ "epoch": 3.3791962174940897,
+ "grad_norm": 2.9732980728149414,
+ "learning_rate": 2.033630324651112e-06,
+ "loss": 0.3371,
+ "step": 7147
+ },
+ {
+ "epoch": 3.3796690307328605,
+ "grad_norm": 2.7731754779815674,
+ "learning_rate": 2.033017462938744e-06,
+ "loss": 0.382,
+ "step": 7148
+ },
+ {
+ "epoch": 3.3801418439716313,
+ "grad_norm": 2.766395092010498,
+ "learning_rate": 2.032404630303331e-06,
+ "loss": 0.3295,
+ "step": 7149
+ },
+ {
+ "epoch": 3.380614657210402,
+ "grad_norm": 3.197960138320923,
+ "learning_rate": 2.03179182678303e-06,
+ "loss": 0.354,
+ "step": 7150
+ },
+ {
+ "epoch": 3.3810874704491725,
+ "grad_norm": 3.048553228378296,
+ "learning_rate": 2.031179052416e-06,
+ "loss": 0.4027,
+ "step": 7151
+ },
+ {
+ "epoch": 3.3815602836879433,
+ "grad_norm": 3.1527998447418213,
+ "learning_rate": 2.0305663072403934e-06,
+ "loss": 0.4229,
+ "step": 7152
+ },
+ {
+ "epoch": 3.382033096926714,
+ "grad_norm": 3.0407028198242188,
+ "learning_rate": 2.029953591294366e-06,
+ "loss": 0.4254,
+ "step": 7153
+ },
+ {
+ "epoch": 3.3825059101654844,
+ "grad_norm": 2.7170357704162598,
+ "learning_rate": 2.0293409046160673e-06,
+ "loss": 0.3307,
+ "step": 7154
+ },
+ {
+ "epoch": 3.382978723404255,
+ "grad_norm": 3.0128726959228516,
+ "learning_rate": 2.028728247243646e-06,
+ "loss": 0.3873,
+ "step": 7155
+ },
+ {
+ "epoch": 3.383451536643026,
+ "grad_norm": 4.861877918243408,
+ "learning_rate": 2.0281156192152507e-06,
+ "loss": 0.4371,
+ "step": 7156
+ },
+ {
+ "epoch": 3.383924349881797,
+ "grad_norm": 2.890249252319336,
+ "learning_rate": 2.0275030205690257e-06,
+ "loss": 0.3899,
+ "step": 7157
+ },
+ {
+ "epoch": 3.3843971631205676,
+ "grad_norm": 3.0774779319763184,
+ "learning_rate": 2.026890451343117e-06,
+ "loss": 0.4151,
+ "step": 7158
+ },
+ {
+ "epoch": 3.384869976359338,
+ "grad_norm": 2.8705947399139404,
+ "learning_rate": 2.026277911575665e-06,
+ "loss": 0.4004,
+ "step": 7159
+ },
+ {
+ "epoch": 3.3853427895981087,
+ "grad_norm": 3.170760154724121,
+ "learning_rate": 2.0256654013048096e-06,
+ "loss": 0.4442,
+ "step": 7160
+ },
+ {
+ "epoch": 3.3858156028368795,
+ "grad_norm": 4.211156368255615,
+ "learning_rate": 2.0250529205686905e-06,
+ "loss": 0.4605,
+ "step": 7161
+ },
+ {
+ "epoch": 3.38628841607565,
+ "grad_norm": 2.513519287109375,
+ "learning_rate": 2.0244404694054435e-06,
+ "loss": 0.3506,
+ "step": 7162
+ },
+ {
+ "epoch": 3.3867612293144207,
+ "grad_norm": 3.1558821201324463,
+ "learning_rate": 2.023828047853203e-06,
+ "loss": 0.43,
+ "step": 7163
+ },
+ {
+ "epoch": 3.3872340425531915,
+ "grad_norm": 3.6770291328430176,
+ "learning_rate": 2.023215655950102e-06,
+ "loss": 0.3911,
+ "step": 7164
+ },
+ {
+ "epoch": 3.3877068557919623,
+ "grad_norm": 2.6544485092163086,
+ "learning_rate": 2.022603293734271e-06,
+ "loss": 0.3306,
+ "step": 7165
+ },
+ {
+ "epoch": 3.388179669030733,
+ "grad_norm": 3.34232759475708,
+ "learning_rate": 2.0219909612438405e-06,
+ "loss": 0.4233,
+ "step": 7166
+ },
+ {
+ "epoch": 3.3886524822695034,
+ "grad_norm": 3.388561725616455,
+ "learning_rate": 2.0213786585169363e-06,
+ "loss": 0.4171,
+ "step": 7167
+ },
+ {
+ "epoch": 3.3891252955082742,
+ "grad_norm": 2.8606953620910645,
+ "learning_rate": 2.020766385591684e-06,
+ "loss": 0.3864,
+ "step": 7168
+ },
+ {
+ "epoch": 3.389598108747045,
+ "grad_norm": 3.0135979652404785,
+ "learning_rate": 2.020154142506208e-06,
+ "loss": 0.3933,
+ "step": 7169
+ },
+ {
+ "epoch": 3.3900709219858154,
+ "grad_norm": 2.5003163814544678,
+ "learning_rate": 2.0195419292986294e-06,
+ "loss": 0.3852,
+ "step": 7170
+ },
+ {
+ "epoch": 3.390543735224586,
+ "grad_norm": 2.8591368198394775,
+ "learning_rate": 2.0189297460070685e-06,
+ "loss": 0.3962,
+ "step": 7171
+ },
+ {
+ "epoch": 3.391016548463357,
+ "grad_norm": 2.8830223083496094,
+ "learning_rate": 2.0183175926696427e-06,
+ "loss": 0.3632,
+ "step": 7172
+ },
+ {
+ "epoch": 3.391489361702128,
+ "grad_norm": 3.3904542922973633,
+ "learning_rate": 2.0177054693244674e-06,
+ "loss": 0.4284,
+ "step": 7173
+ },
+ {
+ "epoch": 3.3919621749408986,
+ "grad_norm": 3.0325920581817627,
+ "learning_rate": 2.0170933760096585e-06,
+ "loss": 0.4331,
+ "step": 7174
+ },
+ {
+ "epoch": 3.392434988179669,
+ "grad_norm": 2.60345196723938,
+ "learning_rate": 2.016481312763327e-06,
+ "loss": 0.4077,
+ "step": 7175
+ },
+ {
+ "epoch": 3.3929078014184397,
+ "grad_norm": 2.8146891593933105,
+ "learning_rate": 2.0158692796235845e-06,
+ "loss": 0.4224,
+ "step": 7176
+ },
+ {
+ "epoch": 3.3933806146572105,
+ "grad_norm": 2.8158490657806396,
+ "learning_rate": 2.0152572766285396e-06,
+ "loss": 0.3454,
+ "step": 7177
+ },
+ {
+ "epoch": 3.393853427895981,
+ "grad_norm": 3.2753400802612305,
+ "learning_rate": 2.0146453038162978e-06,
+ "loss": 0.3615,
+ "step": 7178
+ },
+ {
+ "epoch": 3.3943262411347517,
+ "grad_norm": 3.0527124404907227,
+ "learning_rate": 2.0140333612249655e-06,
+ "loss": 0.415,
+ "step": 7179
+ },
+ {
+ "epoch": 3.3947990543735225,
+ "grad_norm": 2.6813764572143555,
+ "learning_rate": 2.0134214488926435e-06,
+ "loss": 0.3391,
+ "step": 7180
+ },
+ {
+ "epoch": 3.3952718676122933,
+ "grad_norm": 2.809319496154785,
+ "learning_rate": 2.0128095668574356e-06,
+ "loss": 0.3123,
+ "step": 7181
+ },
+ {
+ "epoch": 3.395744680851064,
+ "grad_norm": 2.6619064807891846,
+ "learning_rate": 2.0121977151574396e-06,
+ "loss": 0.4222,
+ "step": 7182
+ },
+ {
+ "epoch": 3.3962174940898344,
+ "grad_norm": 2.9201200008392334,
+ "learning_rate": 2.0115858938307516e-06,
+ "loss": 0.3712,
+ "step": 7183
+ },
+ {
+ "epoch": 3.396690307328605,
+ "grad_norm": 3.2058637142181396,
+ "learning_rate": 2.0109741029154696e-06,
+ "loss": 0.4004,
+ "step": 7184
+ },
+ {
+ "epoch": 3.397163120567376,
+ "grad_norm": 2.821855306625366,
+ "learning_rate": 2.0103623424496862e-06,
+ "loss": 0.4053,
+ "step": 7185
+ },
+ {
+ "epoch": 3.3976359338061464,
+ "grad_norm": 3.0371549129486084,
+ "learning_rate": 2.009750612471492e-06,
+ "loss": 0.4246,
+ "step": 7186
+ },
+ {
+ "epoch": 3.398108747044917,
+ "grad_norm": 2.8827290534973145,
+ "learning_rate": 2.009138913018978e-06,
+ "loss": 0.3256,
+ "step": 7187
+ },
+ {
+ "epoch": 3.398581560283688,
+ "grad_norm": 3.168039560317993,
+ "learning_rate": 2.0085272441302305e-06,
+ "loss": 0.4233,
+ "step": 7188
+ },
+ {
+ "epoch": 3.3990543735224588,
+ "grad_norm": 3.259723663330078,
+ "learning_rate": 2.0079156058433374e-06,
+ "loss": 0.4168,
+ "step": 7189
+ },
+ {
+ "epoch": 3.3995271867612296,
+ "grad_norm": 2.456231117248535,
+ "learning_rate": 2.007303998196382e-06,
+ "loss": 0.3383,
+ "step": 7190
+ },
+ {
+ "epoch": 3.4,
+ "grad_norm": 2.735180377960205,
+ "learning_rate": 2.006692421227445e-06,
+ "loss": 0.3475,
+ "step": 7191
+ },
+ {
+ "epoch": 3.4004728132387707,
+ "grad_norm": 2.76263427734375,
+ "learning_rate": 2.006080874974609e-06,
+ "loss": 0.3651,
+ "step": 7192
+ },
+ {
+ "epoch": 3.4009456264775415,
+ "grad_norm": 3.36867094039917,
+ "learning_rate": 2.0054693594759504e-06,
+ "loss": 0.4479,
+ "step": 7193
+ },
+ {
+ "epoch": 3.401418439716312,
+ "grad_norm": 2.532167673110962,
+ "learning_rate": 2.004857874769547e-06,
+ "loss": 0.3818,
+ "step": 7194
+ },
+ {
+ "epoch": 3.4018912529550827,
+ "grad_norm": 2.8723537921905518,
+ "learning_rate": 2.0042464208934724e-06,
+ "loss": 0.3332,
+ "step": 7195
+ },
+ {
+ "epoch": 3.4023640661938535,
+ "grad_norm": 2.676460027694702,
+ "learning_rate": 2.0036349978857987e-06,
+ "loss": 0.3488,
+ "step": 7196
+ },
+ {
+ "epoch": 3.4028368794326243,
+ "grad_norm": 2.805851459503174,
+ "learning_rate": 2.0030236057845983e-06,
+ "loss": 0.3796,
+ "step": 7197
+ },
+ {
+ "epoch": 3.403309692671395,
+ "grad_norm": 2.688988447189331,
+ "learning_rate": 2.0024122446279377e-06,
+ "loss": 0.3707,
+ "step": 7198
+ },
+ {
+ "epoch": 3.4037825059101654,
+ "grad_norm": 3.118720293045044,
+ "learning_rate": 2.0018009144538853e-06,
+ "loss": 0.4064,
+ "step": 7199
+ },
+ {
+ "epoch": 3.404255319148936,
+ "grad_norm": 2.876507520675659,
+ "learning_rate": 2.001189615300506e-06,
+ "loss": 0.3543,
+ "step": 7200
+ },
+ {
+ "epoch": 3.404728132387707,
+ "grad_norm": 3.0043466091156006,
+ "learning_rate": 2.000578347205861e-06,
+ "loss": 0.3833,
+ "step": 7201
+ },
+ {
+ "epoch": 3.4052009456264773,
+ "grad_norm": 3.1057114601135254,
+ "learning_rate": 1.9999671102080133e-06,
+ "loss": 0.4154,
+ "step": 7202
+ },
+ {
+ "epoch": 3.405673758865248,
+ "grad_norm": 2.9791855812072754,
+ "learning_rate": 1.9993559043450202e-06,
+ "loss": 0.3865,
+ "step": 7203
+ },
+ {
+ "epoch": 3.406146572104019,
+ "grad_norm": 3.4403460025787354,
+ "learning_rate": 1.9987447296549407e-06,
+ "loss": 0.3883,
+ "step": 7204
+ },
+ {
+ "epoch": 3.4066193853427897,
+ "grad_norm": 2.9962027072906494,
+ "learning_rate": 1.998133586175829e-06,
+ "loss": 0.3796,
+ "step": 7205
+ },
+ {
+ "epoch": 3.40709219858156,
+ "grad_norm": 3.0613129138946533,
+ "learning_rate": 1.997522473945737e-06,
+ "loss": 0.3917,
+ "step": 7206
+ },
+ {
+ "epoch": 3.407565011820331,
+ "grad_norm": 3.065985679626465,
+ "learning_rate": 1.996911393002718e-06,
+ "loss": 0.3521,
+ "step": 7207
+ },
+ {
+ "epoch": 3.4080378250591017,
+ "grad_norm": 2.976177930831909,
+ "learning_rate": 1.996300343384821e-06,
+ "loss": 0.3852,
+ "step": 7208
+ },
+ {
+ "epoch": 3.4085106382978725,
+ "grad_norm": 3.3587961196899414,
+ "learning_rate": 1.995689325130092e-06,
+ "loss": 0.3947,
+ "step": 7209
+ },
+ {
+ "epoch": 3.408983451536643,
+ "grad_norm": 2.626983165740967,
+ "learning_rate": 1.995078338276578e-06,
+ "loss": 0.316,
+ "step": 7210
+ },
+ {
+ "epoch": 3.4094562647754136,
+ "grad_norm": 3.14713978767395,
+ "learning_rate": 1.9944673828623217e-06,
+ "loss": 0.4008,
+ "step": 7211
+ },
+ {
+ "epoch": 3.4099290780141844,
+ "grad_norm": 2.968918800354004,
+ "learning_rate": 1.993856458925365e-06,
+ "loss": 0.439,
+ "step": 7212
+ },
+ {
+ "epoch": 3.4104018912529552,
+ "grad_norm": 2.7724127769470215,
+ "learning_rate": 1.9932455665037476e-06,
+ "loss": 0.3941,
+ "step": 7213
+ },
+ {
+ "epoch": 3.4108747044917256,
+ "grad_norm": 2.963146448135376,
+ "learning_rate": 1.9926347056355057e-06,
+ "loss": 0.3893,
+ "step": 7214
+ },
+ {
+ "epoch": 3.4113475177304964,
+ "grad_norm": 2.791637420654297,
+ "learning_rate": 1.9920238763586765e-06,
+ "loss": 0.4068,
+ "step": 7215
+ },
+ {
+ "epoch": 3.411820330969267,
+ "grad_norm": 3.030275583267212,
+ "learning_rate": 1.9914130787112924e-06,
+ "loss": 0.3828,
+ "step": 7216
+ },
+ {
+ "epoch": 3.412293144208038,
+ "grad_norm": 3.113128900527954,
+ "learning_rate": 1.990802312731387e-06,
+ "loss": 0.3903,
+ "step": 7217
+ },
+ {
+ "epoch": 3.4127659574468083,
+ "grad_norm": 3.104170322418213,
+ "learning_rate": 1.9901915784569884e-06,
+ "loss": 0.4171,
+ "step": 7218
+ },
+ {
+ "epoch": 3.413238770685579,
+ "grad_norm": 3.1247572898864746,
+ "learning_rate": 1.989580875926125e-06,
+ "loss": 0.4022,
+ "step": 7219
+ },
+ {
+ "epoch": 3.41371158392435,
+ "grad_norm": 2.9487457275390625,
+ "learning_rate": 1.988970205176822e-06,
+ "loss": 0.3948,
+ "step": 7220
+ },
+ {
+ "epoch": 3.4141843971631207,
+ "grad_norm": 2.8763654232025146,
+ "learning_rate": 1.9883595662471028e-06,
+ "loss": 0.3588,
+ "step": 7221
+ },
+ {
+ "epoch": 3.414657210401891,
+ "grad_norm": 2.563152551651001,
+ "learning_rate": 1.987748959174991e-06,
+ "loss": 0.3509,
+ "step": 7222
+ },
+ {
+ "epoch": 3.415130023640662,
+ "grad_norm": 3.148759365081787,
+ "learning_rate": 1.9871383839985053e-06,
+ "loss": 0.4364,
+ "step": 7223
+ },
+ {
+ "epoch": 3.4156028368794327,
+ "grad_norm": 2.8187363147735596,
+ "learning_rate": 1.986527840755663e-06,
+ "loss": 0.3803,
+ "step": 7224
+ },
+ {
+ "epoch": 3.4160756501182035,
+ "grad_norm": 3.009376287460327,
+ "learning_rate": 1.985917329484481e-06,
+ "loss": 0.3841,
+ "step": 7225
+ },
+ {
+ "epoch": 3.416548463356974,
+ "grad_norm": 2.869291067123413,
+ "learning_rate": 1.985306850222972e-06,
+ "loss": 0.3877,
+ "step": 7226
+ },
+ {
+ "epoch": 3.4170212765957446,
+ "grad_norm": 3.108461856842041,
+ "learning_rate": 1.9846964030091497e-06,
+ "loss": 0.3767,
+ "step": 7227
+ },
+ {
+ "epoch": 3.4174940898345154,
+ "grad_norm": 3.096320629119873,
+ "learning_rate": 1.9840859878810226e-06,
+ "loss": 0.4603,
+ "step": 7228
+ },
+ {
+ "epoch": 3.417966903073286,
+ "grad_norm": 2.8519909381866455,
+ "learning_rate": 1.983475604876598e-06,
+ "loss": 0.3263,
+ "step": 7229
+ },
+ {
+ "epoch": 3.4184397163120566,
+ "grad_norm": 3.192051410675049,
+ "learning_rate": 1.9828652540338835e-06,
+ "loss": 0.4132,
+ "step": 7230
+ },
+ {
+ "epoch": 3.4189125295508274,
+ "grad_norm": 3.0398056507110596,
+ "learning_rate": 1.9822549353908817e-06,
+ "loss": 0.4038,
+ "step": 7231
+ },
+ {
+ "epoch": 3.419385342789598,
+ "grad_norm": 3.12247896194458,
+ "learning_rate": 1.9816446489855944e-06,
+ "loss": 0.409,
+ "step": 7232
+ },
+ {
+ "epoch": 3.419858156028369,
+ "grad_norm": 3.20316481590271,
+ "learning_rate": 1.9810343948560223e-06,
+ "loss": 0.4058,
+ "step": 7233
+ },
+ {
+ "epoch": 3.4203309692671393,
+ "grad_norm": 3.3397457599639893,
+ "learning_rate": 1.9804241730401625e-06,
+ "loss": 0.3657,
+ "step": 7234
+ },
+ {
+ "epoch": 3.42080378250591,
+ "grad_norm": 3.928691864013672,
+ "learning_rate": 1.979813983576012e-06,
+ "loss": 0.361,
+ "step": 7235
+ },
+ {
+ "epoch": 3.421276595744681,
+ "grad_norm": 3.5814051628112793,
+ "learning_rate": 1.9792038265015635e-06,
+ "loss": 0.3975,
+ "step": 7236
+ },
+ {
+ "epoch": 3.4217494089834517,
+ "grad_norm": 2.8578879833221436,
+ "learning_rate": 1.9785937018548086e-06,
+ "loss": 0.3915,
+ "step": 7237
+ },
+ {
+ "epoch": 3.422222222222222,
+ "grad_norm": 3.0343220233917236,
+ "learning_rate": 1.977983609673738e-06,
+ "loss": 0.3686,
+ "step": 7238
+ },
+ {
+ "epoch": 3.422695035460993,
+ "grad_norm": 3.2719056606292725,
+ "learning_rate": 1.977373549996338e-06,
+ "loss": 0.3905,
+ "step": 7239
+ },
+ {
+ "epoch": 3.4231678486997636,
+ "grad_norm": 2.6638169288635254,
+ "learning_rate": 1.976763522860597e-06,
+ "loss": 0.3631,
+ "step": 7240
+ },
+ {
+ "epoch": 3.4236406619385344,
+ "grad_norm": 2.7679927349090576,
+ "learning_rate": 1.9761535283044967e-06,
+ "loss": 0.377,
+ "step": 7241
+ },
+ {
+ "epoch": 3.424113475177305,
+ "grad_norm": 2.774540424346924,
+ "learning_rate": 1.975543566366019e-06,
+ "loss": 0.3509,
+ "step": 7242
+ },
+ {
+ "epoch": 3.4245862884160756,
+ "grad_norm": 2.811659336090088,
+ "learning_rate": 1.9749336370831438e-06,
+ "loss": 0.3835,
+ "step": 7243
+ },
+ {
+ "epoch": 3.4250591016548464,
+ "grad_norm": 2.8533360958099365,
+ "learning_rate": 1.9743237404938478e-06,
+ "loss": 0.3765,
+ "step": 7244
+ },
+ {
+ "epoch": 3.425531914893617,
+ "grad_norm": 2.712301015853882,
+ "learning_rate": 1.9737138766361084e-06,
+ "loss": 0.3797,
+ "step": 7245
+ },
+ {
+ "epoch": 3.4260047281323875,
+ "grad_norm": 2.9763426780700684,
+ "learning_rate": 1.9731040455478986e-06,
+ "loss": 0.4223,
+ "step": 7246
+ },
+ {
+ "epoch": 3.4264775413711583,
+ "grad_norm": 2.8802297115325928,
+ "learning_rate": 1.9724942472671882e-06,
+ "loss": 0.3666,
+ "step": 7247
+ },
+ {
+ "epoch": 3.426950354609929,
+ "grad_norm": 2.934107542037964,
+ "learning_rate": 1.9718844818319486e-06,
+ "loss": 0.3612,
+ "step": 7248
+ },
+ {
+ "epoch": 3.4274231678487,
+ "grad_norm": 3.0172696113586426,
+ "learning_rate": 1.9712747492801467e-06,
+ "loss": 0.3643,
+ "step": 7249
+ },
+ {
+ "epoch": 3.4278959810874703,
+ "grad_norm": 3.368419647216797,
+ "learning_rate": 1.970665049649748e-06,
+ "loss": 0.4511,
+ "step": 7250
+ },
+ {
+ "epoch": 3.428368794326241,
+ "grad_norm": 3.077819585800171,
+ "learning_rate": 1.9700553829787162e-06,
+ "loss": 0.4013,
+ "step": 7251
+ },
+ {
+ "epoch": 3.428841607565012,
+ "grad_norm": 2.690673828125,
+ "learning_rate": 1.96944574930501e-06,
+ "loss": 0.3776,
+ "step": 7252
+ },
+ {
+ "epoch": 3.4293144208037827,
+ "grad_norm": 3.1122169494628906,
+ "learning_rate": 1.9688361486665924e-06,
+ "loss": 0.3802,
+ "step": 7253
+ },
+ {
+ "epoch": 3.429787234042553,
+ "grad_norm": 2.9874207973480225,
+ "learning_rate": 1.968226581101417e-06,
+ "loss": 0.4492,
+ "step": 7254
+ },
+ {
+ "epoch": 3.430260047281324,
+ "grad_norm": 2.885493278503418,
+ "learning_rate": 1.967617046647442e-06,
+ "loss": 0.3958,
+ "step": 7255
+ },
+ {
+ "epoch": 3.4307328605200946,
+ "grad_norm": 2.953897476196289,
+ "learning_rate": 1.9670075453426195e-06,
+ "loss": 0.3973,
+ "step": 7256
+ },
+ {
+ "epoch": 3.4312056737588654,
+ "grad_norm": 2.685088634490967,
+ "learning_rate": 1.966398077224899e-06,
+ "loss": 0.393,
+ "step": 7257
+ },
+ {
+ "epoch": 3.431678486997636,
+ "grad_norm": 4.035208702087402,
+ "learning_rate": 1.9657886423322313e-06,
+ "loss": 0.4263,
+ "step": 7258
+ },
+ {
+ "epoch": 3.4321513002364066,
+ "grad_norm": 2.942042827606201,
+ "learning_rate": 1.965179240702562e-06,
+ "loss": 0.4319,
+ "step": 7259
+ },
+ {
+ "epoch": 3.4326241134751774,
+ "grad_norm": 3.0794999599456787,
+ "learning_rate": 1.9645698723738356e-06,
+ "loss": 0.4199,
+ "step": 7260
+ },
+ {
+ "epoch": 3.433096926713948,
+ "grad_norm": 3.0653584003448486,
+ "learning_rate": 1.963960537383996e-06,
+ "loss": 0.3723,
+ "step": 7261
+ },
+ {
+ "epoch": 3.4335697399527185,
+ "grad_norm": 3.1571545600891113,
+ "learning_rate": 1.963351235770983e-06,
+ "loss": 0.4211,
+ "step": 7262
+ },
+ {
+ "epoch": 3.4340425531914893,
+ "grad_norm": 2.6681735515594482,
+ "learning_rate": 1.962741967572736e-06,
+ "loss": 0.3333,
+ "step": 7263
+ },
+ {
+ "epoch": 3.43451536643026,
+ "grad_norm": 2.9747934341430664,
+ "learning_rate": 1.9621327328271907e-06,
+ "loss": 0.3896,
+ "step": 7264
+ },
+ {
+ "epoch": 3.434988179669031,
+ "grad_norm": 2.7994508743286133,
+ "learning_rate": 1.9615235315722814e-06,
+ "loss": 0.3642,
+ "step": 7265
+ },
+ {
+ "epoch": 3.4354609929078013,
+ "grad_norm": 2.933928966522217,
+ "learning_rate": 1.9609143638459405e-06,
+ "loss": 0.3955,
+ "step": 7266
+ },
+ {
+ "epoch": 3.435933806146572,
+ "grad_norm": 2.9577367305755615,
+ "learning_rate": 1.9603052296860983e-06,
+ "loss": 0.3437,
+ "step": 7267
+ },
+ {
+ "epoch": 3.436406619385343,
+ "grad_norm": 3.017282009124756,
+ "learning_rate": 1.959696129130684e-06,
+ "loss": 0.3784,
+ "step": 7268
+ },
+ {
+ "epoch": 3.4368794326241137,
+ "grad_norm": 3.2072815895080566,
+ "learning_rate": 1.959087062217622e-06,
+ "loss": 0.3901,
+ "step": 7269
+ },
+ {
+ "epoch": 3.437352245862884,
+ "grad_norm": 2.91153621673584,
+ "learning_rate": 1.9584780289848358e-06,
+ "loss": 0.4402,
+ "step": 7270
+ },
+ {
+ "epoch": 3.437825059101655,
+ "grad_norm": 2.846842050552368,
+ "learning_rate": 1.9578690294702495e-06,
+ "loss": 0.3804,
+ "step": 7271
+ },
+ {
+ "epoch": 3.4382978723404256,
+ "grad_norm": 3.0958521366119385,
+ "learning_rate": 1.957260063711781e-06,
+ "loss": 0.4103,
+ "step": 7272
+ },
+ {
+ "epoch": 3.4387706855791964,
+ "grad_norm": 2.9808530807495117,
+ "learning_rate": 1.9566511317473483e-06,
+ "loss": 0.4127,
+ "step": 7273
+ },
+ {
+ "epoch": 3.4392434988179668,
+ "grad_norm": 2.725851058959961,
+ "learning_rate": 1.9560422336148678e-06,
+ "loss": 0.3493,
+ "step": 7274
+ },
+ {
+ "epoch": 3.4397163120567376,
+ "grad_norm": 2.7861814498901367,
+ "learning_rate": 1.9554333693522515e-06,
+ "loss": 0.3703,
+ "step": 7275
+ },
+ {
+ "epoch": 3.4401891252955084,
+ "grad_norm": 3.128708839416504,
+ "learning_rate": 1.954824538997412e-06,
+ "loss": 0.3917,
+ "step": 7276
+ },
+ {
+ "epoch": 3.440661938534279,
+ "grad_norm": 3.117403268814087,
+ "learning_rate": 1.954215742588257e-06,
+ "loss": 0.3581,
+ "step": 7277
+ },
+ {
+ "epoch": 3.4411347517730495,
+ "grad_norm": 2.710076093673706,
+ "learning_rate": 1.9536069801626957e-06,
+ "loss": 0.3255,
+ "step": 7278
+ },
+ {
+ "epoch": 3.4416075650118203,
+ "grad_norm": 2.7732627391815186,
+ "learning_rate": 1.952998251758632e-06,
+ "loss": 0.375,
+ "step": 7279
+ },
+ {
+ "epoch": 3.442080378250591,
+ "grad_norm": 2.896050453186035,
+ "learning_rate": 1.9523895574139673e-06,
+ "loss": 0.4087,
+ "step": 7280
+ },
+ {
+ "epoch": 3.4425531914893615,
+ "grad_norm": 2.9051663875579834,
+ "learning_rate": 1.9517808971666048e-06,
+ "loss": 0.3423,
+ "step": 7281
+ },
+ {
+ "epoch": 3.4430260047281322,
+ "grad_norm": 3.0232038497924805,
+ "learning_rate": 1.9511722710544417e-06,
+ "loss": 0.364,
+ "step": 7282
+ },
+ {
+ "epoch": 3.443498817966903,
+ "grad_norm": 2.753870725631714,
+ "learning_rate": 1.9505636791153744e-06,
+ "loss": 0.3484,
+ "step": 7283
+ },
+ {
+ "epoch": 3.443971631205674,
+ "grad_norm": 2.944079637527466,
+ "learning_rate": 1.9499551213872983e-06,
+ "loss": 0.3354,
+ "step": 7284
+ },
+ {
+ "epoch": 3.4444444444444446,
+ "grad_norm": 3.1531970500946045,
+ "learning_rate": 1.949346597908104e-06,
+ "loss": 0.3394,
+ "step": 7285
+ },
+ {
+ "epoch": 3.444917257683215,
+ "grad_norm": 3.0357189178466797,
+ "learning_rate": 1.948738108715683e-06,
+ "loss": 0.4302,
+ "step": 7286
+ },
+ {
+ "epoch": 3.445390070921986,
+ "grad_norm": 3.3698086738586426,
+ "learning_rate": 1.948129653847923e-06,
+ "loss": 0.419,
+ "step": 7287
+ },
+ {
+ "epoch": 3.4458628841607566,
+ "grad_norm": 3.343132495880127,
+ "learning_rate": 1.947521233342709e-06,
+ "loss": 0.3895,
+ "step": 7288
+ },
+ {
+ "epoch": 3.446335697399527,
+ "grad_norm": 3.1905252933502197,
+ "learning_rate": 1.9469128472379257e-06,
+ "loss": 0.429,
+ "step": 7289
+ },
+ {
+ "epoch": 3.4468085106382977,
+ "grad_norm": 2.8517212867736816,
+ "learning_rate": 1.946304495571454e-06,
+ "loss": 0.3513,
+ "step": 7290
+ },
+ {
+ "epoch": 3.4472813238770685,
+ "grad_norm": 2.7713496685028076,
+ "learning_rate": 1.9456961783811735e-06,
+ "loss": 0.4331,
+ "step": 7291
+ },
+ {
+ "epoch": 3.4477541371158393,
+ "grad_norm": 2.8258652687072754,
+ "learning_rate": 1.945087895704962e-06,
+ "loss": 0.3539,
+ "step": 7292
+ },
+ {
+ "epoch": 3.44822695035461,
+ "grad_norm": 2.757322072982788,
+ "learning_rate": 1.9444796475806925e-06,
+ "loss": 0.3865,
+ "step": 7293
+ },
+ {
+ "epoch": 3.4486997635933805,
+ "grad_norm": 2.8410696983337402,
+ "learning_rate": 1.943871434046241e-06,
+ "loss": 0.3612,
+ "step": 7294
+ },
+ {
+ "epoch": 3.4491725768321513,
+ "grad_norm": 3.2297637462615967,
+ "learning_rate": 1.9432632551394753e-06,
+ "loss": 0.3956,
+ "step": 7295
+ },
+ {
+ "epoch": 3.449645390070922,
+ "grad_norm": 2.991351842880249,
+ "learning_rate": 1.9426551108982666e-06,
+ "loss": 0.3864,
+ "step": 7296
+ },
+ {
+ "epoch": 3.4501182033096924,
+ "grad_norm": 2.7942168712615967,
+ "learning_rate": 1.94204700136048e-06,
+ "loss": 0.4314,
+ "step": 7297
+ },
+ {
+ "epoch": 3.4505910165484632,
+ "grad_norm": 2.8188698291778564,
+ "learning_rate": 1.9414389265639805e-06,
+ "loss": 0.3585,
+ "step": 7298
+ },
+ {
+ "epoch": 3.451063829787234,
+ "grad_norm": 3.2826895713806152,
+ "learning_rate": 1.9408308865466295e-06,
+ "loss": 0.4614,
+ "step": 7299
+ },
+ {
+ "epoch": 3.451536643026005,
+ "grad_norm": 3.273867130279541,
+ "learning_rate": 1.9402228813462865e-06,
+ "loss": 0.3533,
+ "step": 7300
+ },
+ {
+ "epoch": 3.4520094562647756,
+ "grad_norm": 3.5334157943725586,
+ "learning_rate": 1.939614911000811e-06,
+ "loss": 0.4088,
+ "step": 7301
+ },
+ {
+ "epoch": 3.452482269503546,
+ "grad_norm": 2.983908176422119,
+ "learning_rate": 1.9390069755480583e-06,
+ "loss": 0.3725,
+ "step": 7302
+ },
+ {
+ "epoch": 3.4529550827423168,
+ "grad_norm": 2.893660306930542,
+ "learning_rate": 1.93839907502588e-06,
+ "loss": 0.3746,
+ "step": 7303
+ },
+ {
+ "epoch": 3.4534278959810876,
+ "grad_norm": 3.1762871742248535,
+ "learning_rate": 1.9377912094721295e-06,
+ "loss": 0.446,
+ "step": 7304
+ },
+ {
+ "epoch": 3.453900709219858,
+ "grad_norm": 3.3231537342071533,
+ "learning_rate": 1.9371833789246554e-06,
+ "loss": 0.4837,
+ "step": 7305
+ },
+ {
+ "epoch": 3.4543735224586287,
+ "grad_norm": 3.548333168029785,
+ "learning_rate": 1.936575583421304e-06,
+ "loss": 0.3911,
+ "step": 7306
+ },
+ {
+ "epoch": 3.4548463356973995,
+ "grad_norm": 3.0627071857452393,
+ "learning_rate": 1.9359678229999213e-06,
+ "loss": 0.3751,
+ "step": 7307
+ },
+ {
+ "epoch": 3.4553191489361703,
+ "grad_norm": 2.797663927078247,
+ "learning_rate": 1.9353600976983475e-06,
+ "loss": 0.41,
+ "step": 7308
+ },
+ {
+ "epoch": 3.455791962174941,
+ "grad_norm": 2.803269624710083,
+ "learning_rate": 1.9347524075544258e-06,
+ "loss": 0.3775,
+ "step": 7309
+ },
+ {
+ "epoch": 3.4562647754137115,
+ "grad_norm": 2.828010320663452,
+ "learning_rate": 1.934144752605993e-06,
+ "loss": 0.375,
+ "step": 7310
+ },
+ {
+ "epoch": 3.4567375886524823,
+ "grad_norm": 3.456477165222168,
+ "learning_rate": 1.933537132890884e-06,
+ "loss": 0.4764,
+ "step": 7311
+ },
+ {
+ "epoch": 3.457210401891253,
+ "grad_norm": 2.723670244216919,
+ "learning_rate": 1.9329295484469354e-06,
+ "loss": 0.3581,
+ "step": 7312
+ },
+ {
+ "epoch": 3.4576832151300234,
+ "grad_norm": 3.9723474979400635,
+ "learning_rate": 1.9323219993119766e-06,
+ "loss": 0.3951,
+ "step": 7313
+ },
+ {
+ "epoch": 3.458156028368794,
+ "grad_norm": 2.951300859451294,
+ "learning_rate": 1.931714485523838e-06,
+ "loss": 0.3865,
+ "step": 7314
+ },
+ {
+ "epoch": 3.458628841607565,
+ "grad_norm": 2.9265835285186768,
+ "learning_rate": 1.931107007120347e-06,
+ "loss": 0.3731,
+ "step": 7315
+ },
+ {
+ "epoch": 3.459101654846336,
+ "grad_norm": 3.271883249282837,
+ "learning_rate": 1.930499564139327e-06,
+ "loss": 0.3971,
+ "step": 7316
+ },
+ {
+ "epoch": 3.4595744680851066,
+ "grad_norm": 2.8716280460357666,
+ "learning_rate": 1.929892156618603e-06,
+ "loss": 0.3332,
+ "step": 7317
+ },
+ {
+ "epoch": 3.460047281323877,
+ "grad_norm": 2.9820191860198975,
+ "learning_rate": 1.929284784595993e-06,
+ "loss": 0.3907,
+ "step": 7318
+ },
+ {
+ "epoch": 3.4605200945626478,
+ "grad_norm": 3.313225269317627,
+ "learning_rate": 1.9286774481093183e-06,
+ "loss": 0.3678,
+ "step": 7319
+ },
+ {
+ "epoch": 3.4609929078014185,
+ "grad_norm": 3.365387439727783,
+ "learning_rate": 1.928070147196394e-06,
+ "loss": 0.4894,
+ "step": 7320
+ },
+ {
+ "epoch": 3.461465721040189,
+ "grad_norm": 3.1723599433898926,
+ "learning_rate": 1.927462881895033e-06,
+ "loss": 0.4607,
+ "step": 7321
+ },
+ {
+ "epoch": 3.4619385342789597,
+ "grad_norm": 2.7644999027252197,
+ "learning_rate": 1.9268556522430483e-06,
+ "loss": 0.3627,
+ "step": 7322
+ },
+ {
+ "epoch": 3.4624113475177305,
+ "grad_norm": 2.65572190284729,
+ "learning_rate": 1.9262484582782483e-06,
+ "loss": 0.3893,
+ "step": 7323
+ },
+ {
+ "epoch": 3.4628841607565013,
+ "grad_norm": 2.992037773132324,
+ "learning_rate": 1.9256413000384415e-06,
+ "loss": 0.4175,
+ "step": 7324
+ },
+ {
+ "epoch": 3.463356973995272,
+ "grad_norm": 3.020496368408203,
+ "learning_rate": 1.925034177561433e-06,
+ "loss": 0.42,
+ "step": 7325
+ },
+ {
+ "epoch": 3.4638297872340424,
+ "grad_norm": 2.780334234237671,
+ "learning_rate": 1.9244270908850236e-06,
+ "loss": 0.4195,
+ "step": 7326
+ },
+ {
+ "epoch": 3.4643026004728132,
+ "grad_norm": 2.863028049468994,
+ "learning_rate": 1.9238200400470166e-06,
+ "loss": 0.3706,
+ "step": 7327
+ },
+ {
+ "epoch": 3.464775413711584,
+ "grad_norm": 3.2766900062561035,
+ "learning_rate": 1.923213025085209e-06,
+ "loss": 0.4506,
+ "step": 7328
+ },
+ {
+ "epoch": 3.4652482269503544,
+ "grad_norm": 2.7300634384155273,
+ "learning_rate": 1.9226060460373975e-06,
+ "loss": 0.3463,
+ "step": 7329
+ },
+ {
+ "epoch": 3.465721040189125,
+ "grad_norm": 3.136104106903076,
+ "learning_rate": 1.921999102941376e-06,
+ "loss": 0.3839,
+ "step": 7330
+ },
+ {
+ "epoch": 3.466193853427896,
+ "grad_norm": 2.944932699203491,
+ "learning_rate": 1.921392195834934e-06,
+ "loss": 0.432,
+ "step": 7331
+ },
+ {
+ "epoch": 3.466666666666667,
+ "grad_norm": 3.428375005722046,
+ "learning_rate": 1.9207853247558647e-06,
+ "loss": 0.3407,
+ "step": 7332
+ },
+ {
+ "epoch": 3.4671394799054376,
+ "grad_norm": 3.3732450008392334,
+ "learning_rate": 1.9201784897419535e-06,
+ "loss": 0.361,
+ "step": 7333
+ },
+ {
+ "epoch": 3.467612293144208,
+ "grad_norm": 2.8291900157928467,
+ "learning_rate": 1.9195716908309836e-06,
+ "loss": 0.3805,
+ "step": 7334
+ },
+ {
+ "epoch": 3.4680851063829787,
+ "grad_norm": 3.3229610919952393,
+ "learning_rate": 1.9189649280607407e-06,
+ "loss": 0.3756,
+ "step": 7335
+ },
+ {
+ "epoch": 3.4685579196217495,
+ "grad_norm": 2.949416160583496,
+ "learning_rate": 1.918358201469004e-06,
+ "loss": 0.4316,
+ "step": 7336
+ },
+ {
+ "epoch": 3.46903073286052,
+ "grad_norm": 3.525501251220703,
+ "learning_rate": 1.9177515110935515e-06,
+ "loss": 0.4018,
+ "step": 7337
+ },
+ {
+ "epoch": 3.4695035460992907,
+ "grad_norm": 3.1439104080200195,
+ "learning_rate": 1.917144856972159e-06,
+ "loss": 0.4176,
+ "step": 7338
+ },
+ {
+ "epoch": 3.4699763593380615,
+ "grad_norm": 3.0022377967834473,
+ "learning_rate": 1.9165382391426006e-06,
+ "loss": 0.3962,
+ "step": 7339
+ },
+ {
+ "epoch": 3.4704491725768323,
+ "grad_norm": 3.2174794673919678,
+ "learning_rate": 1.9159316576426482e-06,
+ "loss": 0.441,
+ "step": 7340
+ },
+ {
+ "epoch": 3.470921985815603,
+ "grad_norm": 2.965123414993286,
+ "learning_rate": 1.9153251125100694e-06,
+ "loss": 0.4105,
+ "step": 7341
+ },
+ {
+ "epoch": 3.4713947990543734,
+ "grad_norm": 2.722904920578003,
+ "learning_rate": 1.9147186037826333e-06,
+ "loss": 0.4102,
+ "step": 7342
+ },
+ {
+ "epoch": 3.4718676122931442,
+ "grad_norm": 3.4894051551818848,
+ "learning_rate": 1.9141121314981033e-06,
+ "loss": 0.4225,
+ "step": 7343
+ },
+ {
+ "epoch": 3.472340425531915,
+ "grad_norm": 2.828497886657715,
+ "learning_rate": 1.913505695694241e-06,
+ "loss": 0.374,
+ "step": 7344
+ },
+ {
+ "epoch": 3.4728132387706854,
+ "grad_norm": 3.3046014308929443,
+ "learning_rate": 1.9128992964088077e-06,
+ "loss": 0.3568,
+ "step": 7345
+ },
+ {
+ "epoch": 3.473286052009456,
+ "grad_norm": 2.927281618118286,
+ "learning_rate": 1.9122929336795605e-06,
+ "loss": 0.4308,
+ "step": 7346
+ },
+ {
+ "epoch": 3.473758865248227,
+ "grad_norm": 2.9569990634918213,
+ "learning_rate": 1.911686607544256e-06,
+ "loss": 0.3226,
+ "step": 7347
+ },
+ {
+ "epoch": 3.4742316784869978,
+ "grad_norm": 3.1061038970947266,
+ "learning_rate": 1.9110803180406468e-06,
+ "loss": 0.4426,
+ "step": 7348
+ },
+ {
+ "epoch": 3.4747044917257686,
+ "grad_norm": 2.9609580039978027,
+ "learning_rate": 1.9104740652064825e-06,
+ "loss": 0.3835,
+ "step": 7349
+ },
+ {
+ "epoch": 3.475177304964539,
+ "grad_norm": 3.1547608375549316,
+ "learning_rate": 1.9098678490795147e-06,
+ "loss": 0.3814,
+ "step": 7350
+ },
+ {
+ "epoch": 3.4756501182033097,
+ "grad_norm": 2.869022846221924,
+ "learning_rate": 1.909261669697487e-06,
+ "loss": 0.4048,
+ "step": 7351
+ },
+ {
+ "epoch": 3.4761229314420805,
+ "grad_norm": 3.0565078258514404,
+ "learning_rate": 1.908655527098146e-06,
+ "loss": 0.3736,
+ "step": 7352
+ },
+ {
+ "epoch": 3.476595744680851,
+ "grad_norm": 2.893603563308716,
+ "learning_rate": 1.9080494213192317e-06,
+ "loss": 0.3906,
+ "step": 7353
+ },
+ {
+ "epoch": 3.4770685579196217,
+ "grad_norm": 2.818938732147217,
+ "learning_rate": 1.9074433523984844e-06,
+ "loss": 0.3958,
+ "step": 7354
+ },
+ {
+ "epoch": 3.4775413711583925,
+ "grad_norm": 2.675461769104004,
+ "learning_rate": 1.9068373203736419e-06,
+ "loss": 0.3371,
+ "step": 7355
+ },
+ {
+ "epoch": 3.4780141843971633,
+ "grad_norm": 2.5831551551818848,
+ "learning_rate": 1.9062313252824384e-06,
+ "loss": 0.3365,
+ "step": 7356
+ },
+ {
+ "epoch": 3.478486997635934,
+ "grad_norm": 3.299736738204956,
+ "learning_rate": 1.9056253671626054e-06,
+ "loss": 0.3923,
+ "step": 7357
+ },
+ {
+ "epoch": 3.4789598108747044,
+ "grad_norm": 2.508787155151367,
+ "learning_rate": 1.905019446051876e-06,
+ "loss": 0.3367,
+ "step": 7358
+ },
+ {
+ "epoch": 3.479432624113475,
+ "grad_norm": 2.980327606201172,
+ "learning_rate": 1.9044135619879753e-06,
+ "loss": 0.3842,
+ "step": 7359
+ },
+ {
+ "epoch": 3.479905437352246,
+ "grad_norm": 3.2114269733428955,
+ "learning_rate": 1.9038077150086317e-06,
+ "loss": 0.4625,
+ "step": 7360
+ },
+ {
+ "epoch": 3.4803782505910164,
+ "grad_norm": 3.2119715213775635,
+ "learning_rate": 1.9032019051515677e-06,
+ "loss": 0.4197,
+ "step": 7361
+ },
+ {
+ "epoch": 3.480851063829787,
+ "grad_norm": 3.2967300415039062,
+ "learning_rate": 1.9025961324545034e-06,
+ "loss": 0.4462,
+ "step": 7362
+ },
+ {
+ "epoch": 3.481323877068558,
+ "grad_norm": 3.132643461227417,
+ "learning_rate": 1.9019903969551589e-06,
+ "loss": 0.4355,
+ "step": 7363
+ },
+ {
+ "epoch": 3.4817966903073287,
+ "grad_norm": 2.9940602779388428,
+ "learning_rate": 1.9013846986912493e-06,
+ "loss": 0.3584,
+ "step": 7364
+ },
+ {
+ "epoch": 3.482269503546099,
+ "grad_norm": 2.901935577392578,
+ "learning_rate": 1.9007790377004907e-06,
+ "loss": 0.3987,
+ "step": 7365
+ },
+ {
+ "epoch": 3.48274231678487,
+ "grad_norm": 3.143404006958008,
+ "learning_rate": 1.9001734140205939e-06,
+ "loss": 0.3642,
+ "step": 7366
+ },
+ {
+ "epoch": 3.4832151300236407,
+ "grad_norm": 3.6701388359069824,
+ "learning_rate": 1.899567827689267e-06,
+ "loss": 0.4111,
+ "step": 7367
+ },
+ {
+ "epoch": 3.4836879432624115,
+ "grad_norm": 2.6821651458740234,
+ "learning_rate": 1.8989622787442202e-06,
+ "loss": 0.3827,
+ "step": 7368
+ },
+ {
+ "epoch": 3.484160756501182,
+ "grad_norm": 3.3916640281677246,
+ "learning_rate": 1.8983567672231562e-06,
+ "loss": 0.4214,
+ "step": 7369
+ },
+ {
+ "epoch": 3.4846335697399526,
+ "grad_norm": 3.3192927837371826,
+ "learning_rate": 1.8977512931637788e-06,
+ "loss": 0.4375,
+ "step": 7370
+ },
+ {
+ "epoch": 3.4851063829787234,
+ "grad_norm": 3.177884578704834,
+ "learning_rate": 1.8971458566037877e-06,
+ "loss": 0.3944,
+ "step": 7371
+ },
+ {
+ "epoch": 3.4855791962174942,
+ "grad_norm": 3.1554532051086426,
+ "learning_rate": 1.896540457580879e-06,
+ "loss": 0.3712,
+ "step": 7372
+ },
+ {
+ "epoch": 3.4860520094562646,
+ "grad_norm": 2.7773821353912354,
+ "learning_rate": 1.8959350961327516e-06,
+ "loss": 0.3534,
+ "step": 7373
+ },
+ {
+ "epoch": 3.4865248226950354,
+ "grad_norm": 2.4792168140411377,
+ "learning_rate": 1.8953297722970956e-06,
+ "loss": 0.3571,
+ "step": 7374
+ },
+ {
+ "epoch": 3.486997635933806,
+ "grad_norm": 2.676361083984375,
+ "learning_rate": 1.8947244861116044e-06,
+ "loss": 0.3365,
+ "step": 7375
+ },
+ {
+ "epoch": 3.487470449172577,
+ "grad_norm": 2.9340765476226807,
+ "learning_rate": 1.8941192376139655e-06,
+ "loss": 0.4656,
+ "step": 7376
+ },
+ {
+ "epoch": 3.4879432624113473,
+ "grad_norm": 3.7924742698669434,
+ "learning_rate": 1.8935140268418646e-06,
+ "loss": 0.3639,
+ "step": 7377
+ },
+ {
+ "epoch": 3.488416075650118,
+ "grad_norm": 2.798912286758423,
+ "learning_rate": 1.892908853832986e-06,
+ "loss": 0.3741,
+ "step": 7378
+ },
+ {
+ "epoch": 3.488888888888889,
+ "grad_norm": 3.1731197834014893,
+ "learning_rate": 1.8923037186250112e-06,
+ "loss": 0.4041,
+ "step": 7379
+ },
+ {
+ "epoch": 3.4893617021276597,
+ "grad_norm": 2.893725633621216,
+ "learning_rate": 1.8916986212556182e-06,
+ "loss": 0.3103,
+ "step": 7380
+ },
+ {
+ "epoch": 3.48983451536643,
+ "grad_norm": 3.2489001750946045,
+ "learning_rate": 1.891093561762486e-06,
+ "loss": 0.328,
+ "step": 7381
+ },
+ {
+ "epoch": 3.490307328605201,
+ "grad_norm": 2.8076415061950684,
+ "learning_rate": 1.8904885401832862e-06,
+ "loss": 0.426,
+ "step": 7382
+ },
+ {
+ "epoch": 3.4907801418439717,
+ "grad_norm": 3.076544761657715,
+ "learning_rate": 1.8898835565556938e-06,
+ "loss": 0.3664,
+ "step": 7383
+ },
+ {
+ "epoch": 3.4912529550827425,
+ "grad_norm": 2.7615935802459717,
+ "learning_rate": 1.8892786109173769e-06,
+ "loss": 0.3718,
+ "step": 7384
+ },
+ {
+ "epoch": 3.491725768321513,
+ "grad_norm": 2.9050116539001465,
+ "learning_rate": 1.8886737033060023e-06,
+ "loss": 0.3456,
+ "step": 7385
+ },
+ {
+ "epoch": 3.4921985815602836,
+ "grad_norm": 2.4928293228149414,
+ "learning_rate": 1.8880688337592366e-06,
+ "loss": 0.3487,
+ "step": 7386
+ },
+ {
+ "epoch": 3.4926713947990544,
+ "grad_norm": 2.773418426513672,
+ "learning_rate": 1.88746400231474e-06,
+ "loss": 0.3771,
+ "step": 7387
+ },
+ {
+ "epoch": 3.493144208037825,
+ "grad_norm": 2.7137296199798584,
+ "learning_rate": 1.886859209010175e-06,
+ "loss": 0.376,
+ "step": 7388
+ },
+ {
+ "epoch": 3.4936170212765956,
+ "grad_norm": 3.327976942062378,
+ "learning_rate": 1.886254453883199e-06,
+ "loss": 0.3481,
+ "step": 7389
+ },
+ {
+ "epoch": 3.4940898345153664,
+ "grad_norm": 3.8637235164642334,
+ "learning_rate": 1.8856497369714655e-06,
+ "loss": 0.3726,
+ "step": 7390
+ },
+ {
+ "epoch": 3.494562647754137,
+ "grad_norm": 3.1517951488494873,
+ "learning_rate": 1.88504505831263e-06,
+ "loss": 0.4459,
+ "step": 7391
+ },
+ {
+ "epoch": 3.495035460992908,
+ "grad_norm": 3.160130262374878,
+ "learning_rate": 1.884440417944342e-06,
+ "loss": 0.3918,
+ "step": 7392
+ },
+ {
+ "epoch": 3.4955082742316783,
+ "grad_norm": 2.6518726348876953,
+ "learning_rate": 1.8838358159042503e-06,
+ "loss": 0.3493,
+ "step": 7393
+ },
+ {
+ "epoch": 3.495981087470449,
+ "grad_norm": 2.7487380504608154,
+ "learning_rate": 1.8832312522300009e-06,
+ "loss": 0.3846,
+ "step": 7394
+ },
+ {
+ "epoch": 3.49645390070922,
+ "grad_norm": 3.062293291091919,
+ "learning_rate": 1.8826267269592355e-06,
+ "loss": 0.3792,
+ "step": 7395
+ },
+ {
+ "epoch": 3.4969267139479907,
+ "grad_norm": 3.3636794090270996,
+ "learning_rate": 1.8820222401295979e-06,
+ "loss": 0.4504,
+ "step": 7396
+ },
+ {
+ "epoch": 3.497399527186761,
+ "grad_norm": 3.230196237564087,
+ "learning_rate": 1.8814177917787246e-06,
+ "loss": 0.3953,
+ "step": 7397
+ },
+ {
+ "epoch": 3.497872340425532,
+ "grad_norm": 2.891002893447876,
+ "learning_rate": 1.8808133819442541e-06,
+ "loss": 0.3923,
+ "step": 7398
+ },
+ {
+ "epoch": 3.4983451536643027,
+ "grad_norm": 2.7478551864624023,
+ "learning_rate": 1.8802090106638196e-06,
+ "loss": 0.4115,
+ "step": 7399
+ },
+ {
+ "epoch": 3.4988179669030735,
+ "grad_norm": 3.0452797412872314,
+ "learning_rate": 1.8796046779750515e-06,
+ "loss": 0.4154,
+ "step": 7400
+ },
+ {
+ "epoch": 3.499290780141844,
+ "grad_norm": 3.0759124755859375,
+ "learning_rate": 1.87900038391558e-06,
+ "loss": 0.4277,
+ "step": 7401
+ },
+ {
+ "epoch": 3.4997635933806146,
+ "grad_norm": 2.7563929557800293,
+ "learning_rate": 1.8783961285230314e-06,
+ "loss": 0.3896,
+ "step": 7402
+ },
+ {
+ "epoch": 3.5002364066193854,
+ "grad_norm": 2.661916494369507,
+ "learning_rate": 1.87779191183503e-06,
+ "loss": 0.3625,
+ "step": 7403
+ },
+ {
+ "epoch": 3.500709219858156,
+ "grad_norm": 2.881241798400879,
+ "learning_rate": 1.877187733889199e-06,
+ "loss": 0.3724,
+ "step": 7404
+ },
+ {
+ "epoch": 3.5011820330969265,
+ "grad_norm": 3.2405693531036377,
+ "learning_rate": 1.8765835947231554e-06,
+ "loss": 0.3974,
+ "step": 7405
+ },
+ {
+ "epoch": 3.5016548463356973,
+ "grad_norm": 2.924288034439087,
+ "learning_rate": 1.8759794943745184e-06,
+ "loss": 0.3467,
+ "step": 7406
+ },
+ {
+ "epoch": 3.502127659574468,
+ "grad_norm": 3.031663656234741,
+ "learning_rate": 1.8753754328809027e-06,
+ "loss": 0.3995,
+ "step": 7407
+ },
+ {
+ "epoch": 3.5026004728132385,
+ "grad_norm": 3.028277635574341,
+ "learning_rate": 1.874771410279919e-06,
+ "loss": 0.3741,
+ "step": 7408
+ },
+ {
+ "epoch": 3.5030732860520093,
+ "grad_norm": 3.0211644172668457,
+ "learning_rate": 1.8741674266091782e-06,
+ "loss": 0.4018,
+ "step": 7409
+ },
+ {
+ "epoch": 3.50354609929078,
+ "grad_norm": 2.732234239578247,
+ "learning_rate": 1.8735634819062875e-06,
+ "loss": 0.313,
+ "step": 7410
+ },
+ {
+ "epoch": 3.504018912529551,
+ "grad_norm": 3.139596939086914,
+ "learning_rate": 1.8729595762088525e-06,
+ "loss": 0.4112,
+ "step": 7411
+ },
+ {
+ "epoch": 3.5044917257683217,
+ "grad_norm": 2.894230365753174,
+ "learning_rate": 1.8723557095544754e-06,
+ "loss": 0.3891,
+ "step": 7412
+ },
+ {
+ "epoch": 3.504964539007092,
+ "grad_norm": 2.850205659866333,
+ "learning_rate": 1.8717518819807547e-06,
+ "loss": 0.424,
+ "step": 7413
+ },
+ {
+ "epoch": 3.505437352245863,
+ "grad_norm": 3.047736644744873,
+ "learning_rate": 1.8711480935252907e-06,
+ "loss": 0.3757,
+ "step": 7414
+ },
+ {
+ "epoch": 3.5059101654846336,
+ "grad_norm": 3.0174455642700195,
+ "learning_rate": 1.8705443442256772e-06,
+ "loss": 0.3625,
+ "step": 7415
+ },
+ {
+ "epoch": 3.506382978723404,
+ "grad_norm": 2.840681552886963,
+ "learning_rate": 1.869940634119507e-06,
+ "loss": 0.3595,
+ "step": 7416
+ },
+ {
+ "epoch": 3.506855791962175,
+ "grad_norm": 3.067473888397217,
+ "learning_rate": 1.8693369632443713e-06,
+ "loss": 0.432,
+ "step": 7417
+ },
+ {
+ "epoch": 3.5073286052009456,
+ "grad_norm": 2.94655179977417,
+ "learning_rate": 1.8687333316378572e-06,
+ "loss": 0.4222,
+ "step": 7418
+ },
+ {
+ "epoch": 3.5078014184397164,
+ "grad_norm": 2.968548536300659,
+ "learning_rate": 1.868129739337551e-06,
+ "loss": 0.4098,
+ "step": 7419
+ },
+ {
+ "epoch": 3.508274231678487,
+ "grad_norm": 2.70094895362854,
+ "learning_rate": 1.867526186381034e-06,
+ "loss": 0.386,
+ "step": 7420
+ },
+ {
+ "epoch": 3.5087470449172575,
+ "grad_norm": 3.25897216796875,
+ "learning_rate": 1.8669226728058895e-06,
+ "loss": 0.4411,
+ "step": 7421
+ },
+ {
+ "epoch": 3.5092198581560283,
+ "grad_norm": 4.281215667724609,
+ "learning_rate": 1.866319198649694e-06,
+ "loss": 0.4011,
+ "step": 7422
+ },
+ {
+ "epoch": 3.509692671394799,
+ "grad_norm": 2.8394858837127686,
+ "learning_rate": 1.8657157639500223e-06,
+ "loss": 0.4162,
+ "step": 7423
+ },
+ {
+ "epoch": 3.5101654846335695,
+ "grad_norm": 2.732691764831543,
+ "learning_rate": 1.86511236874445e-06,
+ "loss": 0.3603,
+ "step": 7424
+ },
+ {
+ "epoch": 3.5106382978723403,
+ "grad_norm": 3.0152828693389893,
+ "learning_rate": 1.8645090130705463e-06,
+ "loss": 0.3811,
+ "step": 7425
+ },
+ {
+ "epoch": 3.511111111111111,
+ "grad_norm": 3.1762008666992188,
+ "learning_rate": 1.8639056969658793e-06,
+ "loss": 0.3985,
+ "step": 7426
+ },
+ {
+ "epoch": 3.511583924349882,
+ "grad_norm": 3.151123523712158,
+ "learning_rate": 1.863302420468016e-06,
+ "loss": 0.3582,
+ "step": 7427
+ },
+ {
+ "epoch": 3.5120567375886527,
+ "grad_norm": 2.738206386566162,
+ "learning_rate": 1.862699183614518e-06,
+ "loss": 0.3768,
+ "step": 7428
+ },
+ {
+ "epoch": 3.512529550827423,
+ "grad_norm": 3.235212564468384,
+ "learning_rate": 1.8620959864429487e-06,
+ "loss": 0.3964,
+ "step": 7429
+ },
+ {
+ "epoch": 3.513002364066194,
+ "grad_norm": 3.1113579273223877,
+ "learning_rate": 1.8614928289908648e-06,
+ "loss": 0.3979,
+ "step": 7430
+ },
+ {
+ "epoch": 3.5134751773049646,
+ "grad_norm": 2.6802520751953125,
+ "learning_rate": 1.860889711295822e-06,
+ "loss": 0.327,
+ "step": 7431
+ },
+ {
+ "epoch": 3.513947990543735,
+ "grad_norm": 2.9212403297424316,
+ "learning_rate": 1.860286633395375e-06,
+ "loss": 0.4104,
+ "step": 7432
+ },
+ {
+ "epoch": 3.5144208037825058,
+ "grad_norm": 2.868861198425293,
+ "learning_rate": 1.8596835953270742e-06,
+ "loss": 0.383,
+ "step": 7433
+ },
+ {
+ "epoch": 3.5148936170212766,
+ "grad_norm": 2.831655740737915,
+ "learning_rate": 1.8590805971284686e-06,
+ "loss": 0.3615,
+ "step": 7434
+ },
+ {
+ "epoch": 3.5153664302600474,
+ "grad_norm": 3.1540114879608154,
+ "learning_rate": 1.8584776388371039e-06,
+ "loss": 0.3914,
+ "step": 7435
+ },
+ {
+ "epoch": 3.515839243498818,
+ "grad_norm": 3.22031307220459,
+ "learning_rate": 1.8578747204905223e-06,
+ "loss": 0.4358,
+ "step": 7436
+ },
+ {
+ "epoch": 3.5163120567375885,
+ "grad_norm": 3.2922887802124023,
+ "learning_rate": 1.8572718421262677e-06,
+ "loss": 0.3894,
+ "step": 7437
+ },
+ {
+ "epoch": 3.5167848699763593,
+ "grad_norm": 2.936475992202759,
+ "learning_rate": 1.856669003781876e-06,
+ "loss": 0.3748,
+ "step": 7438
+ },
+ {
+ "epoch": 3.51725768321513,
+ "grad_norm": 3.4542860984802246,
+ "learning_rate": 1.8560662054948856e-06,
+ "loss": 0.3362,
+ "step": 7439
+ },
+ {
+ "epoch": 3.5177304964539005,
+ "grad_norm": 3.1532278060913086,
+ "learning_rate": 1.8554634473028288e-06,
+ "loss": 0.411,
+ "step": 7440
+ },
+ {
+ "epoch": 3.5182033096926713,
+ "grad_norm": 3.1678943634033203,
+ "learning_rate": 1.854860729243237e-06,
+ "loss": 0.4357,
+ "step": 7441
+ },
+ {
+ "epoch": 3.518676122931442,
+ "grad_norm": 2.608930826187134,
+ "learning_rate": 1.8542580513536385e-06,
+ "loss": 0.3851,
+ "step": 7442
+ },
+ {
+ "epoch": 3.519148936170213,
+ "grad_norm": 3.127915143966675,
+ "learning_rate": 1.853655413671559e-06,
+ "loss": 0.4227,
+ "step": 7443
+ },
+ {
+ "epoch": 3.5196217494089836,
+ "grad_norm": 3.0593245029449463,
+ "learning_rate": 1.8530528162345238e-06,
+ "loss": 0.4315,
+ "step": 7444
+ },
+ {
+ "epoch": 3.520094562647754,
+ "grad_norm": 2.7818729877471924,
+ "learning_rate": 1.852450259080053e-06,
+ "loss": 0.4018,
+ "step": 7445
+ },
+ {
+ "epoch": 3.520567375886525,
+ "grad_norm": 3.2635445594787598,
+ "learning_rate": 1.8518477422456639e-06,
+ "loss": 0.415,
+ "step": 7446
+ },
+ {
+ "epoch": 3.5210401891252956,
+ "grad_norm": 2.5713813304901123,
+ "learning_rate": 1.851245265768875e-06,
+ "loss": 0.3309,
+ "step": 7447
+ },
+ {
+ "epoch": 3.521513002364066,
+ "grad_norm": 2.6778969764709473,
+ "learning_rate": 1.8506428296871982e-06,
+ "loss": 0.3106,
+ "step": 7448
+ },
+ {
+ "epoch": 3.5219858156028367,
+ "grad_norm": 2.901095390319824,
+ "learning_rate": 1.8500404340381455e-06,
+ "loss": 0.3729,
+ "step": 7449
+ },
+ {
+ "epoch": 3.5224586288416075,
+ "grad_norm": 3.1000046730041504,
+ "learning_rate": 1.849438078859225e-06,
+ "loss": 0.438,
+ "step": 7450
+ },
+ {
+ "epoch": 3.5229314420803783,
+ "grad_norm": 2.901890993118286,
+ "learning_rate": 1.8488357641879417e-06,
+ "loss": 0.3934,
+ "step": 7451
+ },
+ {
+ "epoch": 3.523404255319149,
+ "grad_norm": 3.2212157249450684,
+ "learning_rate": 1.8482334900618009e-06,
+ "loss": 0.4359,
+ "step": 7452
+ },
+ {
+ "epoch": 3.5238770685579195,
+ "grad_norm": 3.3780901432037354,
+ "learning_rate": 1.847631256518303e-06,
+ "loss": 0.4022,
+ "step": 7453
+ },
+ {
+ "epoch": 3.5243498817966903,
+ "grad_norm": 2.9996445178985596,
+ "learning_rate": 1.847029063594945e-06,
+ "loss": 0.3989,
+ "step": 7454
+ },
+ {
+ "epoch": 3.524822695035461,
+ "grad_norm": 2.8581080436706543,
+ "learning_rate": 1.8464269113292255e-06,
+ "loss": 0.3401,
+ "step": 7455
+ },
+ {
+ "epoch": 3.5252955082742314,
+ "grad_norm": 2.9551661014556885,
+ "learning_rate": 1.8458247997586354e-06,
+ "loss": 0.4556,
+ "step": 7456
+ },
+ {
+ "epoch": 3.5257683215130022,
+ "grad_norm": 2.9672555923461914,
+ "learning_rate": 1.8452227289206672e-06,
+ "loss": 0.3575,
+ "step": 7457
+ },
+ {
+ "epoch": 3.526241134751773,
+ "grad_norm": 3.226273536682129,
+ "learning_rate": 1.8446206988528087e-06,
+ "loss": 0.3769,
+ "step": 7458
+ },
+ {
+ "epoch": 3.526713947990544,
+ "grad_norm": 2.994356155395508,
+ "learning_rate": 1.8440187095925443e-06,
+ "loss": 0.3653,
+ "step": 7459
+ },
+ {
+ "epoch": 3.5271867612293146,
+ "grad_norm": 2.489049196243286,
+ "learning_rate": 1.8434167611773595e-06,
+ "loss": 0.3454,
+ "step": 7460
+ },
+ {
+ "epoch": 3.527659574468085,
+ "grad_norm": 2.7897472381591797,
+ "learning_rate": 1.8428148536447333e-06,
+ "loss": 0.3526,
+ "step": 7461
+ },
+ {
+ "epoch": 3.5281323877068558,
+ "grad_norm": 2.947746992111206,
+ "learning_rate": 1.842212987032145e-06,
+ "loss": 0.3542,
+ "step": 7462
+ },
+ {
+ "epoch": 3.5286052009456266,
+ "grad_norm": 2.9303736686706543,
+ "learning_rate": 1.84161116137707e-06,
+ "loss": 0.3618,
+ "step": 7463
+ },
+ {
+ "epoch": 3.529078014184397,
+ "grad_norm": 2.81052827835083,
+ "learning_rate": 1.8410093767169807e-06,
+ "loss": 0.3833,
+ "step": 7464
+ },
+ {
+ "epoch": 3.5295508274231677,
+ "grad_norm": 3.4084126949310303,
+ "learning_rate": 1.840407633089348e-06,
+ "loss": 0.3868,
+ "step": 7465
+ },
+ {
+ "epoch": 3.5300236406619385,
+ "grad_norm": 2.8372802734375,
+ "learning_rate": 1.839805930531639e-06,
+ "loss": 0.3407,
+ "step": 7466
+ },
+ {
+ "epoch": 3.5304964539007093,
+ "grad_norm": 2.9218525886535645,
+ "learning_rate": 1.8392042690813205e-06,
+ "loss": 0.3772,
+ "step": 7467
+ },
+ {
+ "epoch": 3.53096926713948,
+ "grad_norm": 3.425274610519409,
+ "learning_rate": 1.8386026487758552e-06,
+ "loss": 0.3996,
+ "step": 7468
+ },
+ {
+ "epoch": 3.5314420803782505,
+ "grad_norm": 3.027423858642578,
+ "learning_rate": 1.8380010696527015e-06,
+ "loss": 0.3752,
+ "step": 7469
+ },
+ {
+ "epoch": 3.5319148936170213,
+ "grad_norm": 2.974896192550659,
+ "learning_rate": 1.8373995317493193e-06,
+ "loss": 0.3657,
+ "step": 7470
+ },
+ {
+ "epoch": 3.532387706855792,
+ "grad_norm": 2.837458610534668,
+ "learning_rate": 1.8367980351031628e-06,
+ "loss": 0.3949,
+ "step": 7471
+ },
+ {
+ "epoch": 3.5328605200945624,
+ "grad_norm": 2.8257288932800293,
+ "learning_rate": 1.8361965797516844e-06,
+ "loss": 0.3253,
+ "step": 7472
+ },
+ {
+ "epoch": 3.533333333333333,
+ "grad_norm": 2.8278095722198486,
+ "learning_rate": 1.8355951657323351e-06,
+ "loss": 0.3588,
+ "step": 7473
+ },
+ {
+ "epoch": 3.533806146572104,
+ "grad_norm": 2.641160249710083,
+ "learning_rate": 1.8349937930825601e-06,
+ "loss": 0.3423,
+ "step": 7474
+ },
+ {
+ "epoch": 3.534278959810875,
+ "grad_norm": 2.6909263134002686,
+ "learning_rate": 1.8343924618398065e-06,
+ "loss": 0.3973,
+ "step": 7475
+ },
+ {
+ "epoch": 3.5347517730496456,
+ "grad_norm": 3.0727429389953613,
+ "learning_rate": 1.8337911720415157e-06,
+ "loss": 0.4207,
+ "step": 7476
+ },
+ {
+ "epoch": 3.535224586288416,
+ "grad_norm": 3.218925714492798,
+ "learning_rate": 1.8331899237251265e-06,
+ "loss": 0.3955,
+ "step": 7477
+ },
+ {
+ "epoch": 3.5356973995271868,
+ "grad_norm": 3.163914918899536,
+ "learning_rate": 1.832588716928078e-06,
+ "loss": 0.4655,
+ "step": 7478
+ },
+ {
+ "epoch": 3.5361702127659576,
+ "grad_norm": 2.8622686862945557,
+ "learning_rate": 1.831987551687803e-06,
+ "loss": 0.4084,
+ "step": 7479
+ },
+ {
+ "epoch": 3.536643026004728,
+ "grad_norm": 2.8534188270568848,
+ "learning_rate": 1.831386428041734e-06,
+ "loss": 0.4144,
+ "step": 7480
+ },
+ {
+ "epoch": 3.5371158392434987,
+ "grad_norm": 2.8138554096221924,
+ "learning_rate": 1.8307853460273008e-06,
+ "loss": 0.3835,
+ "step": 7481
+ },
+ {
+ "epoch": 3.5375886524822695,
+ "grad_norm": 3.061960458755493,
+ "learning_rate": 1.830184305681929e-06,
+ "loss": 0.4128,
+ "step": 7482
+ },
+ {
+ "epoch": 3.5380614657210403,
+ "grad_norm": 2.8524835109710693,
+ "learning_rate": 1.8295833070430444e-06,
+ "loss": 0.3372,
+ "step": 7483
+ },
+ {
+ "epoch": 3.538534278959811,
+ "grad_norm": 3.2567028999328613,
+ "learning_rate": 1.8289823501480663e-06,
+ "loss": 0.4533,
+ "step": 7484
+ },
+ {
+ "epoch": 3.5390070921985815,
+ "grad_norm": 2.945634603500366,
+ "learning_rate": 1.8283814350344158e-06,
+ "loss": 0.3565,
+ "step": 7485
+ },
+ {
+ "epoch": 3.5394799054373522,
+ "grad_norm": 2.903287649154663,
+ "learning_rate": 1.8277805617395089e-06,
+ "loss": 0.349,
+ "step": 7486
+ },
+ {
+ "epoch": 3.539952718676123,
+ "grad_norm": 3.249272584915161,
+ "learning_rate": 1.827179730300757e-06,
+ "loss": 0.4076,
+ "step": 7487
+ },
+ {
+ "epoch": 3.5404255319148934,
+ "grad_norm": 2.9591739177703857,
+ "learning_rate": 1.8265789407555748e-06,
+ "loss": 0.3439,
+ "step": 7488
+ },
+ {
+ "epoch": 3.540898345153664,
+ "grad_norm": 3.8527538776397705,
+ "learning_rate": 1.8259781931413683e-06,
+ "loss": 0.4684,
+ "step": 7489
+ },
+ {
+ "epoch": 3.541371158392435,
+ "grad_norm": 2.7392261028289795,
+ "learning_rate": 1.8253774874955449e-06,
+ "loss": 0.3494,
+ "step": 7490
+ },
+ {
+ "epoch": 3.541843971631206,
+ "grad_norm": 2.880993127822876,
+ "learning_rate": 1.8247768238555069e-06,
+ "loss": 0.3546,
+ "step": 7491
+ },
+ {
+ "epoch": 3.5423167848699766,
+ "grad_norm": 2.9944894313812256,
+ "learning_rate": 1.8241762022586545e-06,
+ "loss": 0.3594,
+ "step": 7492
+ },
+ {
+ "epoch": 3.542789598108747,
+ "grad_norm": 3.0084292888641357,
+ "learning_rate": 1.8235756227423878e-06,
+ "loss": 0.408,
+ "step": 7493
+ },
+ {
+ "epoch": 3.5432624113475177,
+ "grad_norm": 2.75227689743042,
+ "learning_rate": 1.8229750853440998e-06,
+ "loss": 0.3515,
+ "step": 7494
+ },
+ {
+ "epoch": 3.5437352245862885,
+ "grad_norm": 3.041893243789673,
+ "learning_rate": 1.8223745901011856e-06,
+ "loss": 0.401,
+ "step": 7495
+ },
+ {
+ "epoch": 3.544208037825059,
+ "grad_norm": 2.8728370666503906,
+ "learning_rate": 1.8217741370510345e-06,
+ "loss": 0.3832,
+ "step": 7496
+ },
+ {
+ "epoch": 3.5446808510638297,
+ "grad_norm": 3.095460891723633,
+ "learning_rate": 1.8211737262310331e-06,
+ "loss": 0.3086,
+ "step": 7497
+ },
+ {
+ "epoch": 3.5451536643026005,
+ "grad_norm": 3.1869826316833496,
+ "learning_rate": 1.8205733576785678e-06,
+ "loss": 0.3666,
+ "step": 7498
+ },
+ {
+ "epoch": 3.5456264775413713,
+ "grad_norm": 3.307560443878174,
+ "learning_rate": 1.8199730314310204e-06,
+ "loss": 0.4489,
+ "step": 7499
+ },
+ {
+ "epoch": 3.546099290780142,
+ "grad_norm": 2.9531142711639404,
+ "learning_rate": 1.8193727475257697e-06,
+ "loss": 0.4017,
+ "step": 7500
+ },
+ {
+ "epoch": 3.5465721040189124,
+ "grad_norm": 3.2969162464141846,
+ "learning_rate": 1.8187725060001942e-06,
+ "loss": 0.4179,
+ "step": 7501
+ },
+ {
+ "epoch": 3.5470449172576832,
+ "grad_norm": 2.9434688091278076,
+ "learning_rate": 1.818172306891667e-06,
+ "loss": 0.3562,
+ "step": 7502
+ },
+ {
+ "epoch": 3.547517730496454,
+ "grad_norm": 3.070732355117798,
+ "learning_rate": 1.8175721502375616e-06,
+ "loss": 0.393,
+ "step": 7503
+ },
+ {
+ "epoch": 3.5479905437352244,
+ "grad_norm": 2.970898389816284,
+ "learning_rate": 1.8169720360752457e-06,
+ "loss": 0.4448,
+ "step": 7504
+ },
+ {
+ "epoch": 3.548463356973995,
+ "grad_norm": 2.7050931453704834,
+ "learning_rate": 1.8163719644420858e-06,
+ "loss": 0.3562,
+ "step": 7505
+ },
+ {
+ "epoch": 3.548936170212766,
+ "grad_norm": 3.0073063373565674,
+ "learning_rate": 1.8157719353754467e-06,
+ "loss": 0.4429,
+ "step": 7506
+ },
+ {
+ "epoch": 3.5494089834515368,
+ "grad_norm": 3.0240445137023926,
+ "learning_rate": 1.8151719489126874e-06,
+ "loss": 0.4073,
+ "step": 7507
+ },
+ {
+ "epoch": 3.5498817966903076,
+ "grad_norm": 3.558763265609741,
+ "learning_rate": 1.8145720050911695e-06,
+ "loss": 0.4025,
+ "step": 7508
+ },
+ {
+ "epoch": 3.550354609929078,
+ "grad_norm": 3.637258768081665,
+ "learning_rate": 1.8139721039482473e-06,
+ "loss": 0.5074,
+ "step": 7509
+ },
+ {
+ "epoch": 3.5508274231678487,
+ "grad_norm": 2.804719924926758,
+ "learning_rate": 1.8133722455212726e-06,
+ "loss": 0.3727,
+ "step": 7510
+ },
+ {
+ "epoch": 3.5513002364066195,
+ "grad_norm": 2.9034759998321533,
+ "learning_rate": 1.8127724298475984e-06,
+ "loss": 0.3642,
+ "step": 7511
+ },
+ {
+ "epoch": 3.55177304964539,
+ "grad_norm": 3.1304872035980225,
+ "learning_rate": 1.8121726569645714e-06,
+ "loss": 0.434,
+ "step": 7512
+ },
+ {
+ "epoch": 3.5522458628841607,
+ "grad_norm": 3.019956111907959,
+ "learning_rate": 1.8115729269095378e-06,
+ "loss": 0.4235,
+ "step": 7513
+ },
+ {
+ "epoch": 3.5527186761229315,
+ "grad_norm": 2.7984633445739746,
+ "learning_rate": 1.810973239719839e-06,
+ "loss": 0.3344,
+ "step": 7514
+ },
+ {
+ "epoch": 3.5531914893617023,
+ "grad_norm": 2.839709997177124,
+ "learning_rate": 1.8103735954328145e-06,
+ "loss": 0.3708,
+ "step": 7515
+ },
+ {
+ "epoch": 3.553664302600473,
+ "grad_norm": 2.766819477081299,
+ "learning_rate": 1.809773994085803e-06,
+ "loss": 0.3402,
+ "step": 7516
+ },
+ {
+ "epoch": 3.5541371158392434,
+ "grad_norm": 2.707942247390747,
+ "learning_rate": 1.8091744357161372e-06,
+ "loss": 0.4327,
+ "step": 7517
+ },
+ {
+ "epoch": 3.554609929078014,
+ "grad_norm": 3.512702465057373,
+ "learning_rate": 1.8085749203611516e-06,
+ "loss": 0.3965,
+ "step": 7518
+ },
+ {
+ "epoch": 3.555082742316785,
+ "grad_norm": 2.717024803161621,
+ "learning_rate": 1.8079754480581738e-06,
+ "loss": 0.3237,
+ "step": 7519
+ },
+ {
+ "epoch": 3.5555555555555554,
+ "grad_norm": 2.659001350402832,
+ "learning_rate": 1.8073760188445296e-06,
+ "loss": 0.3546,
+ "step": 7520
+ },
+ {
+ "epoch": 3.556028368794326,
+ "grad_norm": 2.615028142929077,
+ "learning_rate": 1.8067766327575445e-06,
+ "loss": 0.3232,
+ "step": 7521
+ },
+ {
+ "epoch": 3.556501182033097,
+ "grad_norm": 2.659428119659424,
+ "learning_rate": 1.8061772898345386e-06,
+ "loss": 0.3769,
+ "step": 7522
+ },
+ {
+ "epoch": 3.5569739952718678,
+ "grad_norm": 3.142369270324707,
+ "learning_rate": 1.8055779901128296e-06,
+ "loss": 0.4292,
+ "step": 7523
+ },
+ {
+ "epoch": 3.5574468085106385,
+ "grad_norm": 3.0832736492156982,
+ "learning_rate": 1.8049787336297352e-06,
+ "loss": 0.3871,
+ "step": 7524
+ },
+ {
+ "epoch": 3.557919621749409,
+ "grad_norm": 2.778411865234375,
+ "learning_rate": 1.8043795204225664e-06,
+ "loss": 0.3938,
+ "step": 7525
+ },
+ {
+ "epoch": 3.5583924349881797,
+ "grad_norm": 3.1651480197906494,
+ "learning_rate": 1.8037803505286355e-06,
+ "loss": 0.3315,
+ "step": 7526
+ },
+ {
+ "epoch": 3.5588652482269505,
+ "grad_norm": 3.266508102416992,
+ "learning_rate": 1.8031812239852498e-06,
+ "loss": 0.4156,
+ "step": 7527
+ },
+ {
+ "epoch": 3.559338061465721,
+ "grad_norm": 3.1345436573028564,
+ "learning_rate": 1.8025821408297127e-06,
+ "loss": 0.3813,
+ "step": 7528
+ },
+ {
+ "epoch": 3.5598108747044916,
+ "grad_norm": 3.1535425186157227,
+ "learning_rate": 1.8019831010993289e-06,
+ "loss": 0.3897,
+ "step": 7529
+ },
+ {
+ "epoch": 3.5602836879432624,
+ "grad_norm": 3.0934345722198486,
+ "learning_rate": 1.8013841048313952e-06,
+ "loss": 0.4074,
+ "step": 7530
+ },
+ {
+ "epoch": 3.5607565011820332,
+ "grad_norm": 3.224876642227173,
+ "learning_rate": 1.8007851520632108e-06,
+ "loss": 0.3969,
+ "step": 7531
+ },
+ {
+ "epoch": 3.561229314420804,
+ "grad_norm": 3.082303285598755,
+ "learning_rate": 1.8001862428320693e-06,
+ "loss": 0.3559,
+ "step": 7532
+ },
+ {
+ "epoch": 3.5617021276595744,
+ "grad_norm": 3.5289969444274902,
+ "learning_rate": 1.7995873771752608e-06,
+ "loss": 0.3961,
+ "step": 7533
+ },
+ {
+ "epoch": 3.562174940898345,
+ "grad_norm": 3.1893370151519775,
+ "learning_rate": 1.7989885551300762e-06,
+ "loss": 0.3721,
+ "step": 7534
+ },
+ {
+ "epoch": 3.562647754137116,
+ "grad_norm": 2.6911089420318604,
+ "learning_rate": 1.7983897767337999e-06,
+ "loss": 0.3801,
+ "step": 7535
+ },
+ {
+ "epoch": 3.5631205673758863,
+ "grad_norm": 3.0837483406066895,
+ "learning_rate": 1.797791042023716e-06,
+ "loss": 0.3886,
+ "step": 7536
+ },
+ {
+ "epoch": 3.563593380614657,
+ "grad_norm": 2.973459005355835,
+ "learning_rate": 1.7971923510371054e-06,
+ "loss": 0.438,
+ "step": 7537
+ },
+ {
+ "epoch": 3.564066193853428,
+ "grad_norm": 3.1537392139434814,
+ "learning_rate": 1.7965937038112435e-06,
+ "loss": 0.4022,
+ "step": 7538
+ },
+ {
+ "epoch": 3.5645390070921987,
+ "grad_norm": 3.2339680194854736,
+ "learning_rate": 1.795995100383409e-06,
+ "loss": 0.3883,
+ "step": 7539
+ },
+ {
+ "epoch": 3.5650118203309695,
+ "grad_norm": 2.5029079914093018,
+ "learning_rate": 1.7953965407908714e-06,
+ "loss": 0.3522,
+ "step": 7540
+ },
+ {
+ "epoch": 3.56548463356974,
+ "grad_norm": 3.1560211181640625,
+ "learning_rate": 1.7947980250709027e-06,
+ "loss": 0.4024,
+ "step": 7541
+ },
+ {
+ "epoch": 3.5659574468085107,
+ "grad_norm": 2.950477361679077,
+ "learning_rate": 1.7941995532607687e-06,
+ "loss": 0.3598,
+ "step": 7542
+ },
+ {
+ "epoch": 3.5664302600472815,
+ "grad_norm": 3.1263279914855957,
+ "learning_rate": 1.793601125397733e-06,
+ "loss": 0.3535,
+ "step": 7543
+ },
+ {
+ "epoch": 3.566903073286052,
+ "grad_norm": 2.986631393432617,
+ "learning_rate": 1.7930027415190587e-06,
+ "loss": 0.4251,
+ "step": 7544
+ },
+ {
+ "epoch": 3.5673758865248226,
+ "grad_norm": 2.6882247924804688,
+ "learning_rate": 1.7924044016620022e-06,
+ "loss": 0.3584,
+ "step": 7545
+ },
+ {
+ "epoch": 3.5678486997635934,
+ "grad_norm": 2.9358696937561035,
+ "learning_rate": 1.791806105863822e-06,
+ "loss": 0.3671,
+ "step": 7546
+ },
+ {
+ "epoch": 3.568321513002364,
+ "grad_norm": 2.774198055267334,
+ "learning_rate": 1.7912078541617704e-06,
+ "loss": 0.3505,
+ "step": 7547
+ },
+ {
+ "epoch": 3.568794326241135,
+ "grad_norm": 2.7384231090545654,
+ "learning_rate": 1.7906096465930964e-06,
+ "loss": 0.3992,
+ "step": 7548
+ },
+ {
+ "epoch": 3.5692671394799054,
+ "grad_norm": 2.8625354766845703,
+ "learning_rate": 1.7900114831950506e-06,
+ "loss": 0.3858,
+ "step": 7549
+ },
+ {
+ "epoch": 3.569739952718676,
+ "grad_norm": 2.737884044647217,
+ "learning_rate": 1.7894133640048761e-06,
+ "loss": 0.3973,
+ "step": 7550
+ },
+ {
+ "epoch": 3.570212765957447,
+ "grad_norm": 2.9817614555358887,
+ "learning_rate": 1.7888152890598154e-06,
+ "loss": 0.3613,
+ "step": 7551
+ },
+ {
+ "epoch": 3.5706855791962173,
+ "grad_norm": 2.760956287384033,
+ "learning_rate": 1.7882172583971081e-06,
+ "loss": 0.3645,
+ "step": 7552
+ },
+ {
+ "epoch": 3.571158392434988,
+ "grad_norm": 2.6867735385894775,
+ "learning_rate": 1.7876192720539908e-06,
+ "loss": 0.3771,
+ "step": 7553
+ },
+ {
+ "epoch": 3.571631205673759,
+ "grad_norm": 3.3362443447113037,
+ "learning_rate": 1.7870213300676986e-06,
+ "loss": 0.3989,
+ "step": 7554
+ },
+ {
+ "epoch": 3.5721040189125297,
+ "grad_norm": 2.8359227180480957,
+ "learning_rate": 1.7864234324754617e-06,
+ "loss": 0.3645,
+ "step": 7555
+ },
+ {
+ "epoch": 3.5725768321513005,
+ "grad_norm": 3.3070647716522217,
+ "learning_rate": 1.7858255793145076e-06,
+ "loss": 0.4128,
+ "step": 7556
+ },
+ {
+ "epoch": 3.573049645390071,
+ "grad_norm": 2.544879913330078,
+ "learning_rate": 1.7852277706220644e-06,
+ "loss": 0.3779,
+ "step": 7557
+ },
+ {
+ "epoch": 3.5735224586288417,
+ "grad_norm": 2.890796661376953,
+ "learning_rate": 1.7846300064353525e-06,
+ "loss": 0.373,
+ "step": 7558
+ },
+ {
+ "epoch": 3.5739952718676125,
+ "grad_norm": 2.9703400135040283,
+ "learning_rate": 1.7840322867915944e-06,
+ "loss": 0.3619,
+ "step": 7559
+ },
+ {
+ "epoch": 3.574468085106383,
+ "grad_norm": 3.0122430324554443,
+ "learning_rate": 1.7834346117280066e-06,
+ "loss": 0.4079,
+ "step": 7560
+ },
+ {
+ "epoch": 3.5749408983451536,
+ "grad_norm": 2.904963493347168,
+ "learning_rate": 1.7828369812818025e-06,
+ "loss": 0.38,
+ "step": 7561
+ },
+ {
+ "epoch": 3.5754137115839244,
+ "grad_norm": 3.0917439460754395,
+ "learning_rate": 1.7822393954901957e-06,
+ "loss": 0.383,
+ "step": 7562
+ },
+ {
+ "epoch": 3.575886524822695,
+ "grad_norm": 2.633920907974243,
+ "learning_rate": 1.7816418543903935e-06,
+ "loss": 0.3823,
+ "step": 7563
+ },
+ {
+ "epoch": 3.576359338061466,
+ "grad_norm": 2.9266390800476074,
+ "learning_rate": 1.781044358019604e-06,
+ "loss": 0.4642,
+ "step": 7564
+ },
+ {
+ "epoch": 3.5768321513002364,
+ "grad_norm": 2.878138780593872,
+ "learning_rate": 1.7804469064150299e-06,
+ "loss": 0.4056,
+ "step": 7565
+ },
+ {
+ "epoch": 3.577304964539007,
+ "grad_norm": 2.949370861053467,
+ "learning_rate": 1.7798494996138708e-06,
+ "loss": 0.3863,
+ "step": 7566
+ },
+ {
+ "epoch": 3.5777777777777775,
+ "grad_norm": 3.1444685459136963,
+ "learning_rate": 1.7792521376533264e-06,
+ "loss": 0.3611,
+ "step": 7567
+ },
+ {
+ "epoch": 3.5782505910165483,
+ "grad_norm": 3.0719716548919678,
+ "learning_rate": 1.7786548205705906e-06,
+ "loss": 0.3866,
+ "step": 7568
+ },
+ {
+ "epoch": 3.578723404255319,
+ "grad_norm": 3.155343770980835,
+ "learning_rate": 1.7780575484028566e-06,
+ "loss": 0.3896,
+ "step": 7569
+ },
+ {
+ "epoch": 3.57919621749409,
+ "grad_norm": 3.0447211265563965,
+ "learning_rate": 1.7774603211873138e-06,
+ "loss": 0.3713,
+ "step": 7570
+ },
+ {
+ "epoch": 3.5796690307328607,
+ "grad_norm": 2.8683619499206543,
+ "learning_rate": 1.7768631389611471e-06,
+ "loss": 0.3808,
+ "step": 7571
+ },
+ {
+ "epoch": 3.580141843971631,
+ "grad_norm": 3.1548070907592773,
+ "learning_rate": 1.776266001761543e-06,
+ "loss": 0.3714,
+ "step": 7572
+ },
+ {
+ "epoch": 3.580614657210402,
+ "grad_norm": 2.8699257373809814,
+ "learning_rate": 1.7756689096256816e-06,
+ "loss": 0.3694,
+ "step": 7573
+ },
+ {
+ "epoch": 3.5810874704491726,
+ "grad_norm": 2.834714412689209,
+ "learning_rate": 1.7750718625907398e-06,
+ "loss": 0.3935,
+ "step": 7574
+ },
+ {
+ "epoch": 3.581560283687943,
+ "grad_norm": 3.3828539848327637,
+ "learning_rate": 1.7744748606938957e-06,
+ "loss": 0.4783,
+ "step": 7575
+ },
+ {
+ "epoch": 3.582033096926714,
+ "grad_norm": 3.3892476558685303,
+ "learning_rate": 1.7738779039723202e-06,
+ "loss": 0.41,
+ "step": 7576
+ },
+ {
+ "epoch": 3.5825059101654846,
+ "grad_norm": 3.014289379119873,
+ "learning_rate": 1.7732809924631842e-06,
+ "loss": 0.3516,
+ "step": 7577
+ },
+ {
+ "epoch": 3.5829787234042554,
+ "grad_norm": 3.477212429046631,
+ "learning_rate": 1.772684126203654e-06,
+ "loss": 0.4144,
+ "step": 7578
+ },
+ {
+ "epoch": 3.583451536643026,
+ "grad_norm": 2.9156792163848877,
+ "learning_rate": 1.772087305230893e-06,
+ "loss": 0.3772,
+ "step": 7579
+ },
+ {
+ "epoch": 3.5839243498817965,
+ "grad_norm": 2.639169931411743,
+ "learning_rate": 1.7714905295820651e-06,
+ "loss": 0.3487,
+ "step": 7580
+ },
+ {
+ "epoch": 3.5843971631205673,
+ "grad_norm": 3.196894407272339,
+ "learning_rate": 1.7708937992943263e-06,
+ "loss": 0.4852,
+ "step": 7581
+ },
+ {
+ "epoch": 3.584869976359338,
+ "grad_norm": 2.9140779972076416,
+ "learning_rate": 1.7702971144048347e-06,
+ "loss": 0.3703,
+ "step": 7582
+ },
+ {
+ "epoch": 3.5853427895981085,
+ "grad_norm": 3.3844895362854004,
+ "learning_rate": 1.7697004749507418e-06,
+ "loss": 0.4227,
+ "step": 7583
+ },
+ {
+ "epoch": 3.5858156028368793,
+ "grad_norm": 3.080061912536621,
+ "learning_rate": 1.769103880969198e-06,
+ "loss": 0.4237,
+ "step": 7584
+ },
+ {
+ "epoch": 3.58628841607565,
+ "grad_norm": 3.037505865097046,
+ "learning_rate": 1.7685073324973506e-06,
+ "loss": 0.3902,
+ "step": 7585
+ },
+ {
+ "epoch": 3.586761229314421,
+ "grad_norm": 3.6563873291015625,
+ "learning_rate": 1.7679108295723436e-06,
+ "loss": 0.3956,
+ "step": 7586
+ },
+ {
+ "epoch": 3.5872340425531917,
+ "grad_norm": 3.158935546875,
+ "learning_rate": 1.76731437223132e-06,
+ "loss": 0.3898,
+ "step": 7587
+ },
+ {
+ "epoch": 3.587706855791962,
+ "grad_norm": 3.059199571609497,
+ "learning_rate": 1.7667179605114176e-06,
+ "loss": 0.4183,
+ "step": 7588
+ },
+ {
+ "epoch": 3.588179669030733,
+ "grad_norm": 2.8123233318328857,
+ "learning_rate": 1.7661215944497716e-06,
+ "loss": 0.3731,
+ "step": 7589
+ },
+ {
+ "epoch": 3.5886524822695036,
+ "grad_norm": 3.094287633895874,
+ "learning_rate": 1.7655252740835169e-06,
+ "loss": 0.4562,
+ "step": 7590
+ },
+ {
+ "epoch": 3.589125295508274,
+ "grad_norm": 2.886833667755127,
+ "learning_rate": 1.7649289994497822e-06,
+ "loss": 0.4178,
+ "step": 7591
+ },
+ {
+ "epoch": 3.5895981087470448,
+ "grad_norm": 3.3040647506713867,
+ "learning_rate": 1.764332770585696e-06,
+ "loss": 0.4311,
+ "step": 7592
+ },
+ {
+ "epoch": 3.5900709219858156,
+ "grad_norm": 2.7948951721191406,
+ "learning_rate": 1.7637365875283827e-06,
+ "loss": 0.3704,
+ "step": 7593
+ },
+ {
+ "epoch": 3.5905437352245864,
+ "grad_norm": 3.092221975326538,
+ "learning_rate": 1.7631404503149623e-06,
+ "loss": 0.4166,
+ "step": 7594
+ },
+ {
+ "epoch": 3.591016548463357,
+ "grad_norm": 3.6018600463867188,
+ "learning_rate": 1.7625443589825564e-06,
+ "loss": 0.4251,
+ "step": 7595
+ },
+ {
+ "epoch": 3.5914893617021275,
+ "grad_norm": 2.708017110824585,
+ "learning_rate": 1.7619483135682791e-06,
+ "loss": 0.3775,
+ "step": 7596
+ },
+ {
+ "epoch": 3.5919621749408983,
+ "grad_norm": 2.8069381713867188,
+ "learning_rate": 1.7613523141092438e-06,
+ "loss": 0.3929,
+ "step": 7597
+ },
+ {
+ "epoch": 3.592434988179669,
+ "grad_norm": 3.097787380218506,
+ "learning_rate": 1.7607563606425616e-06,
+ "loss": 0.3992,
+ "step": 7598
+ },
+ {
+ "epoch": 3.5929078014184395,
+ "grad_norm": 2.9691715240478516,
+ "learning_rate": 1.7601604532053385e-06,
+ "loss": 0.4001,
+ "step": 7599
+ },
+ {
+ "epoch": 3.5933806146572103,
+ "grad_norm": 2.5511624813079834,
+ "learning_rate": 1.7595645918346807e-06,
+ "loss": 0.3136,
+ "step": 7600
+ },
+ {
+ "epoch": 3.593853427895981,
+ "grad_norm": 2.4688427448272705,
+ "learning_rate": 1.7589687765676891e-06,
+ "loss": 0.3922,
+ "step": 7601
+ },
+ {
+ "epoch": 3.594326241134752,
+ "grad_norm": 3.004023790359497,
+ "learning_rate": 1.7583730074414613e-06,
+ "loss": 0.4203,
+ "step": 7602
+ },
+ {
+ "epoch": 3.5947990543735227,
+ "grad_norm": 2.902641773223877,
+ "learning_rate": 1.7577772844930957e-06,
+ "loss": 0.3855,
+ "step": 7603
+ },
+ {
+ "epoch": 3.595271867612293,
+ "grad_norm": 3.851375102996826,
+ "learning_rate": 1.7571816077596826e-06,
+ "loss": 0.3769,
+ "step": 7604
+ },
+ {
+ "epoch": 3.595744680851064,
+ "grad_norm": 3.03249192237854,
+ "learning_rate": 1.756585977278315e-06,
+ "loss": 0.3448,
+ "step": 7605
+ },
+ {
+ "epoch": 3.5962174940898346,
+ "grad_norm": 2.992363214492798,
+ "learning_rate": 1.7559903930860789e-06,
+ "loss": 0.3893,
+ "step": 7606
+ },
+ {
+ "epoch": 3.596690307328605,
+ "grad_norm": 2.9322855472564697,
+ "learning_rate": 1.7553948552200577e-06,
+ "loss": 0.4337,
+ "step": 7607
+ },
+ {
+ "epoch": 3.5971631205673757,
+ "grad_norm": 3.2564096450805664,
+ "learning_rate": 1.7547993637173347e-06,
+ "loss": 0.3943,
+ "step": 7608
+ },
+ {
+ "epoch": 3.5976359338061465,
+ "grad_norm": 2.9988484382629395,
+ "learning_rate": 1.7542039186149867e-06,
+ "loss": 0.3421,
+ "step": 7609
+ },
+ {
+ "epoch": 3.5981087470449173,
+ "grad_norm": 2.8188817501068115,
+ "learning_rate": 1.7536085199500914e-06,
+ "loss": 0.3657,
+ "step": 7610
+ },
+ {
+ "epoch": 3.598581560283688,
+ "grad_norm": 3.0583255290985107,
+ "learning_rate": 1.7530131677597206e-06,
+ "loss": 0.4036,
+ "step": 7611
+ },
+ {
+ "epoch": 3.5990543735224585,
+ "grad_norm": 2.8700921535491943,
+ "learning_rate": 1.7524178620809435e-06,
+ "loss": 0.3928,
+ "step": 7612
+ },
+ {
+ "epoch": 3.5995271867612293,
+ "grad_norm": 3.4497945308685303,
+ "learning_rate": 1.751822602950829e-06,
+ "loss": 0.3517,
+ "step": 7613
+ },
+ {
+ "epoch": 3.6,
+ "grad_norm": 3.334191083908081,
+ "learning_rate": 1.75122739040644e-06,
+ "loss": 0.3414,
+ "step": 7614
+ },
+ {
+ "epoch": 3.6004728132387704,
+ "grad_norm": 3.1435158252716064,
+ "learning_rate": 1.7506322244848387e-06,
+ "loss": 0.4075,
+ "step": 7615
+ },
+ {
+ "epoch": 3.6009456264775412,
+ "grad_norm": 3.178990125656128,
+ "learning_rate": 1.7500371052230824e-06,
+ "loss": 0.4688,
+ "step": 7616
+ },
+ {
+ "epoch": 3.601418439716312,
+ "grad_norm": 2.9292044639587402,
+ "learning_rate": 1.7494420326582267e-06,
+ "loss": 0.3882,
+ "step": 7617
+ },
+ {
+ "epoch": 3.601891252955083,
+ "grad_norm": 2.6899197101593018,
+ "learning_rate": 1.7488470068273256e-06,
+ "loss": 0.3916,
+ "step": 7618
+ },
+ {
+ "epoch": 3.6023640661938536,
+ "grad_norm": 2.8319191932678223,
+ "learning_rate": 1.7482520277674273e-06,
+ "loss": 0.3924,
+ "step": 7619
+ },
+ {
+ "epoch": 3.602836879432624,
+ "grad_norm": 2.74589204788208,
+ "learning_rate": 1.747657095515578e-06,
+ "loss": 0.2911,
+ "step": 7620
+ },
+ {
+ "epoch": 3.603309692671395,
+ "grad_norm": 2.857028007507324,
+ "learning_rate": 1.7470622101088233e-06,
+ "loss": 0.3618,
+ "step": 7621
+ },
+ {
+ "epoch": 3.6037825059101656,
+ "grad_norm": 3.3715617656707764,
+ "learning_rate": 1.746467371584203e-06,
+ "loss": 0.4186,
+ "step": 7622
+ },
+ {
+ "epoch": 3.604255319148936,
+ "grad_norm": 2.839526414871216,
+ "learning_rate": 1.745872579978755e-06,
+ "loss": 0.4088,
+ "step": 7623
+ },
+ {
+ "epoch": 3.6047281323877067,
+ "grad_norm": 3.7689156532287598,
+ "learning_rate": 1.7452778353295155e-06,
+ "loss": 0.4748,
+ "step": 7624
+ },
+ {
+ "epoch": 3.6052009456264775,
+ "grad_norm": 2.9345123767852783,
+ "learning_rate": 1.7446831376735152e-06,
+ "loss": 0.4117,
+ "step": 7625
+ },
+ {
+ "epoch": 3.6056737588652483,
+ "grad_norm": 2.7898924350738525,
+ "learning_rate": 1.7440884870477845e-06,
+ "loss": 0.3515,
+ "step": 7626
+ },
+ {
+ "epoch": 3.606146572104019,
+ "grad_norm": 3.4268569946289062,
+ "learning_rate": 1.7434938834893481e-06,
+ "loss": 0.4051,
+ "step": 7627
+ },
+ {
+ "epoch": 3.6066193853427895,
+ "grad_norm": 3.019066095352173,
+ "learning_rate": 1.7428993270352311e-06,
+ "loss": 0.4128,
+ "step": 7628
+ },
+ {
+ "epoch": 3.6070921985815603,
+ "grad_norm": 3.1277568340301514,
+ "learning_rate": 1.742304817722454e-06,
+ "loss": 0.37,
+ "step": 7629
+ },
+ {
+ "epoch": 3.607565011820331,
+ "grad_norm": 2.924818277359009,
+ "learning_rate": 1.7417103555880318e-06,
+ "loss": 0.3792,
+ "step": 7630
+ },
+ {
+ "epoch": 3.6080378250591014,
+ "grad_norm": 2.664699077606201,
+ "learning_rate": 1.7411159406689821e-06,
+ "loss": 0.3584,
+ "step": 7631
+ },
+ {
+ "epoch": 3.608510638297872,
+ "grad_norm": 3.223729133605957,
+ "learning_rate": 1.7405215730023144e-06,
+ "loss": 0.3956,
+ "step": 7632
+ },
+ {
+ "epoch": 3.608983451536643,
+ "grad_norm": 2.934225559234619,
+ "learning_rate": 1.7399272526250388e-06,
+ "loss": 0.4179,
+ "step": 7633
+ },
+ {
+ "epoch": 3.609456264775414,
+ "grad_norm": 2.833798885345459,
+ "learning_rate": 1.7393329795741603e-06,
+ "loss": 0.3283,
+ "step": 7634
+ },
+ {
+ "epoch": 3.6099290780141846,
+ "grad_norm": 3.008798837661743,
+ "learning_rate": 1.738738753886681e-06,
+ "loss": 0.3704,
+ "step": 7635
+ },
+ {
+ "epoch": 3.610401891252955,
+ "grad_norm": 2.8714520931243896,
+ "learning_rate": 1.7381445755996023e-06,
+ "loss": 0.3646,
+ "step": 7636
+ },
+ {
+ "epoch": 3.6108747044917258,
+ "grad_norm": 3.083554267883301,
+ "learning_rate": 1.7375504447499193e-06,
+ "loss": 0.3785,
+ "step": 7637
+ },
+ {
+ "epoch": 3.6113475177304966,
+ "grad_norm": 3.270347833633423,
+ "learning_rate": 1.7369563613746277e-06,
+ "loss": 0.4426,
+ "step": 7638
+ },
+ {
+ "epoch": 3.611820330969267,
+ "grad_norm": 2.7754862308502197,
+ "learning_rate": 1.7363623255107175e-06,
+ "loss": 0.3448,
+ "step": 7639
+ },
+ {
+ "epoch": 3.6122931442080377,
+ "grad_norm": 2.98140025138855,
+ "learning_rate": 1.7357683371951767e-06,
+ "loss": 0.4027,
+ "step": 7640
+ },
+ {
+ "epoch": 3.6127659574468085,
+ "grad_norm": 3.1640074253082275,
+ "learning_rate": 1.7351743964649908e-06,
+ "loss": 0.3913,
+ "step": 7641
+ },
+ {
+ "epoch": 3.6132387706855793,
+ "grad_norm": 2.758202075958252,
+ "learning_rate": 1.7345805033571417e-06,
+ "loss": 0.4148,
+ "step": 7642
+ },
+ {
+ "epoch": 3.61371158392435,
+ "grad_norm": 3.1030571460723877,
+ "learning_rate": 1.7339866579086074e-06,
+ "loss": 0.4002,
+ "step": 7643
+ },
+ {
+ "epoch": 3.6141843971631205,
+ "grad_norm": 3.2414135932922363,
+ "learning_rate": 1.733392860156366e-06,
+ "loss": 0.4732,
+ "step": 7644
+ },
+ {
+ "epoch": 3.6146572104018913,
+ "grad_norm": 2.8720390796661377,
+ "learning_rate": 1.7327991101373886e-06,
+ "loss": 0.4112,
+ "step": 7645
+ },
+ {
+ "epoch": 3.615130023640662,
+ "grad_norm": 3.0104875564575195,
+ "learning_rate": 1.7322054078886474e-06,
+ "loss": 0.3934,
+ "step": 7646
+ },
+ {
+ "epoch": 3.6156028368794324,
+ "grad_norm": 2.8615126609802246,
+ "learning_rate": 1.7316117534471091e-06,
+ "loss": 0.3437,
+ "step": 7647
+ },
+ {
+ "epoch": 3.616075650118203,
+ "grad_norm": 2.8283586502075195,
+ "learning_rate": 1.7310181468497369e-06,
+ "loss": 0.374,
+ "step": 7648
+ },
+ {
+ "epoch": 3.616548463356974,
+ "grad_norm": 3.2289321422576904,
+ "learning_rate": 1.7304245881334935e-06,
+ "loss": 0.3899,
+ "step": 7649
+ },
+ {
+ "epoch": 3.617021276595745,
+ "grad_norm": 3.126882791519165,
+ "learning_rate": 1.7298310773353356e-06,
+ "loss": 0.388,
+ "step": 7650
+ },
+ {
+ "epoch": 3.6174940898345156,
+ "grad_norm": 3.013657569885254,
+ "learning_rate": 1.7292376144922201e-06,
+ "loss": 0.379,
+ "step": 7651
+ },
+ {
+ "epoch": 3.617966903073286,
+ "grad_norm": 3.070192337036133,
+ "learning_rate": 1.7286441996410989e-06,
+ "loss": 0.3801,
+ "step": 7652
+ },
+ {
+ "epoch": 3.6184397163120567,
+ "grad_norm": 2.805380344390869,
+ "learning_rate": 1.7280508328189199e-06,
+ "loss": 0.3577,
+ "step": 7653
+ },
+ {
+ "epoch": 3.6189125295508275,
+ "grad_norm": 3.2853379249572754,
+ "learning_rate": 1.7274575140626318e-06,
+ "loss": 0.4168,
+ "step": 7654
+ },
+ {
+ "epoch": 3.619385342789598,
+ "grad_norm": 3.16316819190979,
+ "learning_rate": 1.7268642434091761e-06,
+ "loss": 0.425,
+ "step": 7655
+ },
+ {
+ "epoch": 3.6198581560283687,
+ "grad_norm": 3.2971179485321045,
+ "learning_rate": 1.7262710208954947e-06,
+ "loss": 0.3884,
+ "step": 7656
+ },
+ {
+ "epoch": 3.6203309692671395,
+ "grad_norm": 3.1823747158050537,
+ "learning_rate": 1.725677846558524e-06,
+ "loss": 0.3419,
+ "step": 7657
+ },
+ {
+ "epoch": 3.6208037825059103,
+ "grad_norm": 3.114654779434204,
+ "learning_rate": 1.7250847204351973e-06,
+ "loss": 0.3951,
+ "step": 7658
+ },
+ {
+ "epoch": 3.621276595744681,
+ "grad_norm": 3.0272440910339355,
+ "learning_rate": 1.7244916425624482e-06,
+ "loss": 0.4102,
+ "step": 7659
+ },
+ {
+ "epoch": 3.6217494089834514,
+ "grad_norm": 2.973611354827881,
+ "learning_rate": 1.7238986129772035e-06,
+ "loss": 0.3827,
+ "step": 7660
+ },
+ {
+ "epoch": 3.6222222222222222,
+ "grad_norm": 3.063713312149048,
+ "learning_rate": 1.7233056317163894e-06,
+ "loss": 0.3909,
+ "step": 7661
+ },
+ {
+ "epoch": 3.622695035460993,
+ "grad_norm": 3.203725576400757,
+ "learning_rate": 1.7227126988169283e-06,
+ "loss": 0.3933,
+ "step": 7662
+ },
+ {
+ "epoch": 3.6231678486997634,
+ "grad_norm": 2.945887327194214,
+ "learning_rate": 1.7221198143157386e-06,
+ "loss": 0.3722,
+ "step": 7663
+ },
+ {
+ "epoch": 3.623640661938534,
+ "grad_norm": 3.042691469192505,
+ "learning_rate": 1.7215269782497373e-06,
+ "loss": 0.4108,
+ "step": 7664
+ },
+ {
+ "epoch": 3.624113475177305,
+ "grad_norm": 2.8496763706207275,
+ "learning_rate": 1.720934190655837e-06,
+ "loss": 0.3867,
+ "step": 7665
+ },
+ {
+ "epoch": 3.6245862884160758,
+ "grad_norm": 2.7017154693603516,
+ "learning_rate": 1.7203414515709493e-06,
+ "loss": 0.3246,
+ "step": 7666
+ },
+ {
+ "epoch": 3.6250591016548466,
+ "grad_norm": 2.66630482673645,
+ "learning_rate": 1.7197487610319808e-06,
+ "loss": 0.365,
+ "step": 7667
+ },
+ {
+ "epoch": 3.625531914893617,
+ "grad_norm": 2.8724591732025146,
+ "learning_rate": 1.7191561190758348e-06,
+ "loss": 0.3361,
+ "step": 7668
+ },
+ {
+ "epoch": 3.6260047281323877,
+ "grad_norm": 3.1413803100585938,
+ "learning_rate": 1.7185635257394143e-06,
+ "loss": 0.3949,
+ "step": 7669
+ },
+ {
+ "epoch": 3.6264775413711585,
+ "grad_norm": 2.9866268634796143,
+ "learning_rate": 1.7179709810596163e-06,
+ "loss": 0.3728,
+ "step": 7670
+ },
+ {
+ "epoch": 3.626950354609929,
+ "grad_norm": 3.003497838973999,
+ "learning_rate": 1.717378485073336e-06,
+ "loss": 0.384,
+ "step": 7671
+ },
+ {
+ "epoch": 3.6274231678486997,
+ "grad_norm": 3.0043468475341797,
+ "learning_rate": 1.716786037817466e-06,
+ "loss": 0.3432,
+ "step": 7672
+ },
+ {
+ "epoch": 3.6278959810874705,
+ "grad_norm": 3.216550827026367,
+ "learning_rate": 1.7161936393288945e-06,
+ "loss": 0.3963,
+ "step": 7673
+ },
+ {
+ "epoch": 3.6283687943262413,
+ "grad_norm": 3.1091387271881104,
+ "learning_rate": 1.715601289644509e-06,
+ "loss": 0.4347,
+ "step": 7674
+ },
+ {
+ "epoch": 3.628841607565012,
+ "grad_norm": 3.2288286685943604,
+ "learning_rate": 1.7150089888011916e-06,
+ "loss": 0.4291,
+ "step": 7675
+ },
+ {
+ "epoch": 3.6293144208037824,
+ "grad_norm": 2.943941831588745,
+ "learning_rate": 1.7144167368358216e-06,
+ "loss": 0.3643,
+ "step": 7676
+ },
+ {
+ "epoch": 3.629787234042553,
+ "grad_norm": 2.819683313369751,
+ "learning_rate": 1.7138245337852774e-06,
+ "loss": 0.4051,
+ "step": 7677
+ },
+ {
+ "epoch": 3.630260047281324,
+ "grad_norm": 2.9988269805908203,
+ "learning_rate": 1.713232379686432e-06,
+ "loss": 0.4102,
+ "step": 7678
+ },
+ {
+ "epoch": 3.6307328605200944,
+ "grad_norm": 3.0041310787200928,
+ "learning_rate": 1.7126402745761566e-06,
+ "loss": 0.3854,
+ "step": 7679
+ },
+ {
+ "epoch": 3.631205673758865,
+ "grad_norm": 2.8700194358825684,
+ "learning_rate": 1.7120482184913192e-06,
+ "loss": 0.3441,
+ "step": 7680
+ },
+ {
+ "epoch": 3.631678486997636,
+ "grad_norm": 3.5275180339813232,
+ "learning_rate": 1.7114562114687833e-06,
+ "loss": 0.3808,
+ "step": 7681
+ },
+ {
+ "epoch": 3.6321513002364068,
+ "grad_norm": 3.182326078414917,
+ "learning_rate": 1.710864253545412e-06,
+ "loss": 0.4178,
+ "step": 7682
+ },
+ {
+ "epoch": 3.6326241134751776,
+ "grad_norm": 3.0514512062072754,
+ "learning_rate": 1.7102723447580627e-06,
+ "loss": 0.3527,
+ "step": 7683
+ },
+ {
+ "epoch": 3.633096926713948,
+ "grad_norm": 2.8293066024780273,
+ "learning_rate": 1.7096804851435922e-06,
+ "loss": 0.3723,
+ "step": 7684
+ },
+ {
+ "epoch": 3.6335697399527187,
+ "grad_norm": 2.9601097106933594,
+ "learning_rate": 1.709088674738853e-06,
+ "loss": 0.3704,
+ "step": 7685
+ },
+ {
+ "epoch": 3.6340425531914895,
+ "grad_norm": 2.8070995807647705,
+ "learning_rate": 1.7084969135806933e-06,
+ "loss": 0.346,
+ "step": 7686
+ },
+ {
+ "epoch": 3.63451536643026,
+ "grad_norm": 3.0162715911865234,
+ "learning_rate": 1.70790520170596e-06,
+ "loss": 0.39,
+ "step": 7687
+ },
+ {
+ "epoch": 3.6349881796690307,
+ "grad_norm": 3.018763780593872,
+ "learning_rate": 1.7073135391514967e-06,
+ "loss": 0.4621,
+ "step": 7688
+ },
+ {
+ "epoch": 3.6354609929078014,
+ "grad_norm": 2.963604688644409,
+ "learning_rate": 1.706721925954144e-06,
+ "loss": 0.339,
+ "step": 7689
+ },
+ {
+ "epoch": 3.6359338061465722,
+ "grad_norm": 2.8532896041870117,
+ "learning_rate": 1.7061303621507383e-06,
+ "loss": 0.3915,
+ "step": 7690
+ },
+ {
+ "epoch": 3.636406619385343,
+ "grad_norm": 3.248006820678711,
+ "learning_rate": 1.7055388477781133e-06,
+ "loss": 0.3712,
+ "step": 7691
+ },
+ {
+ "epoch": 3.6368794326241134,
+ "grad_norm": 3.2195777893066406,
+ "learning_rate": 1.7049473828731011e-06,
+ "loss": 0.4358,
+ "step": 7692
+ },
+ {
+ "epoch": 3.637352245862884,
+ "grad_norm": 2.7190768718719482,
+ "learning_rate": 1.7043559674725296e-06,
+ "loss": 0.341,
+ "step": 7693
+ },
+ {
+ "epoch": 3.637825059101655,
+ "grad_norm": 2.6047232151031494,
+ "learning_rate": 1.7037646016132223e-06,
+ "loss": 0.3513,
+ "step": 7694
+ },
+ {
+ "epoch": 3.6382978723404253,
+ "grad_norm": 3.0824201107025146,
+ "learning_rate": 1.7031732853320026e-06,
+ "loss": 0.4097,
+ "step": 7695
+ },
+ {
+ "epoch": 3.638770685579196,
+ "grad_norm": 2.845461130142212,
+ "learning_rate": 1.7025820186656883e-06,
+ "loss": 0.3395,
+ "step": 7696
+ },
+ {
+ "epoch": 3.639243498817967,
+ "grad_norm": 2.937863826751709,
+ "learning_rate": 1.7019908016510953e-06,
+ "loss": 0.395,
+ "step": 7697
+ },
+ {
+ "epoch": 3.6397163120567377,
+ "grad_norm": 3.349780559539795,
+ "learning_rate": 1.701399634325036e-06,
+ "loss": 0.3889,
+ "step": 7698
+ },
+ {
+ "epoch": 3.6401891252955085,
+ "grad_norm": 2.8527066707611084,
+ "learning_rate": 1.7008085167243187e-06,
+ "loss": 0.3753,
+ "step": 7699
+ },
+ {
+ "epoch": 3.640661938534279,
+ "grad_norm": 2.8112385272979736,
+ "learning_rate": 1.7002174488857517e-06,
+ "loss": 0.3912,
+ "step": 7700
+ },
+ {
+ "epoch": 3.6411347517730497,
+ "grad_norm": 2.731933832168579,
+ "learning_rate": 1.6996264308461363e-06,
+ "loss": 0.4142,
+ "step": 7701
+ },
+ {
+ "epoch": 3.6416075650118205,
+ "grad_norm": 3.70465350151062,
+ "learning_rate": 1.6990354626422744e-06,
+ "loss": 0.4089,
+ "step": 7702
+ },
+ {
+ "epoch": 3.642080378250591,
+ "grad_norm": 2.8656258583068848,
+ "learning_rate": 1.698444544310962e-06,
+ "loss": 0.3771,
+ "step": 7703
+ },
+ {
+ "epoch": 3.6425531914893616,
+ "grad_norm": 2.878830671310425,
+ "learning_rate": 1.697853675888993e-06,
+ "loss": 0.3754,
+ "step": 7704
+ },
+ {
+ "epoch": 3.6430260047281324,
+ "grad_norm": 3.440528154373169,
+ "learning_rate": 1.6972628574131586e-06,
+ "loss": 0.4543,
+ "step": 7705
+ },
+ {
+ "epoch": 3.6434988179669032,
+ "grad_norm": 2.70736026763916,
+ "learning_rate": 1.6966720889202451e-06,
+ "loss": 0.4049,
+ "step": 7706
+ },
+ {
+ "epoch": 3.643971631205674,
+ "grad_norm": 2.787992238998413,
+ "learning_rate": 1.6960813704470391e-06,
+ "loss": 0.3854,
+ "step": 7707
+ },
+ {
+ "epoch": 3.6444444444444444,
+ "grad_norm": 2.631490707397461,
+ "learning_rate": 1.6954907020303213e-06,
+ "loss": 0.3775,
+ "step": 7708
+ },
+ {
+ "epoch": 3.644917257683215,
+ "grad_norm": 3.052255392074585,
+ "learning_rate": 1.6949000837068685e-06,
+ "loss": 0.3873,
+ "step": 7709
+ },
+ {
+ "epoch": 3.645390070921986,
+ "grad_norm": 2.7443203926086426,
+ "learning_rate": 1.6943095155134586e-06,
+ "loss": 0.3362,
+ "step": 7710
+ },
+ {
+ "epoch": 3.6458628841607563,
+ "grad_norm": 2.931688070297241,
+ "learning_rate": 1.6937189974868618e-06,
+ "loss": 0.3839,
+ "step": 7711
+ },
+ {
+ "epoch": 3.646335697399527,
+ "grad_norm": 2.950242757797241,
+ "learning_rate": 1.6931285296638479e-06,
+ "loss": 0.3552,
+ "step": 7712
+ },
+ {
+ "epoch": 3.646808510638298,
+ "grad_norm": 2.940735340118408,
+ "learning_rate": 1.6925381120811823e-06,
+ "loss": 0.3881,
+ "step": 7713
+ },
+ {
+ "epoch": 3.6472813238770687,
+ "grad_norm": 2.771355390548706,
+ "learning_rate": 1.6919477447756273e-06,
+ "loss": 0.3578,
+ "step": 7714
+ },
+ {
+ "epoch": 3.6477541371158395,
+ "grad_norm": 2.919004201889038,
+ "learning_rate": 1.6913574277839435e-06,
+ "loss": 0.3971,
+ "step": 7715
+ },
+ {
+ "epoch": 3.64822695035461,
+ "grad_norm": 3.293705463409424,
+ "learning_rate": 1.6907671611428872e-06,
+ "loss": 0.422,
+ "step": 7716
+ },
+ {
+ "epoch": 3.6486997635933807,
+ "grad_norm": 2.744239091873169,
+ "learning_rate": 1.6901769448892103e-06,
+ "loss": 0.398,
+ "step": 7717
+ },
+ {
+ "epoch": 3.6491725768321515,
+ "grad_norm": 3.1726129055023193,
+ "learning_rate": 1.689586779059665e-06,
+ "loss": 0.39,
+ "step": 7718
+ },
+ {
+ "epoch": 3.649645390070922,
+ "grad_norm": 3.146743059158325,
+ "learning_rate": 1.688996663690997e-06,
+ "loss": 0.4059,
+ "step": 7719
+ },
+ {
+ "epoch": 3.6501182033096926,
+ "grad_norm": 2.941025495529175,
+ "learning_rate": 1.688406598819951e-06,
+ "loss": 0.3479,
+ "step": 7720
+ },
+ {
+ "epoch": 3.6505910165484634,
+ "grad_norm": 3.3480939865112305,
+ "learning_rate": 1.6878165844832679e-06,
+ "loss": 0.4141,
+ "step": 7721
+ },
+ {
+ "epoch": 3.651063829787234,
+ "grad_norm": 2.9145030975341797,
+ "learning_rate": 1.6872266207176833e-06,
+ "loss": 0.3497,
+ "step": 7722
+ },
+ {
+ "epoch": 3.651536643026005,
+ "grad_norm": 3.119502067565918,
+ "learning_rate": 1.686636707559934e-06,
+ "loss": 0.424,
+ "step": 7723
+ },
+ {
+ "epoch": 3.6520094562647754,
+ "grad_norm": 3.0867667198181152,
+ "learning_rate": 1.6860468450467497e-06,
+ "loss": 0.3998,
+ "step": 7724
+ },
+ {
+ "epoch": 3.652482269503546,
+ "grad_norm": 2.9128987789154053,
+ "learning_rate": 1.6854570332148602e-06,
+ "loss": 0.4043,
+ "step": 7725
+ },
+ {
+ "epoch": 3.652955082742317,
+ "grad_norm": 2.9973206520080566,
+ "learning_rate": 1.6848672721009896e-06,
+ "loss": 0.3395,
+ "step": 7726
+ },
+ {
+ "epoch": 3.6534278959810873,
+ "grad_norm": 2.824916124343872,
+ "learning_rate": 1.6842775617418591e-06,
+ "loss": 0.4102,
+ "step": 7727
+ },
+ {
+ "epoch": 3.653900709219858,
+ "grad_norm": 2.7984440326690674,
+ "learning_rate": 1.6836879021741887e-06,
+ "loss": 0.3823,
+ "step": 7728
+ },
+ {
+ "epoch": 3.654373522458629,
+ "grad_norm": 2.8412179946899414,
+ "learning_rate": 1.6830982934346917e-06,
+ "loss": 0.3755,
+ "step": 7729
+ },
+ {
+ "epoch": 3.6548463356973997,
+ "grad_norm": 3.1677138805389404,
+ "learning_rate": 1.6825087355600836e-06,
+ "loss": 0.4224,
+ "step": 7730
+ },
+ {
+ "epoch": 3.65531914893617,
+ "grad_norm": 3.097085475921631,
+ "learning_rate": 1.6819192285870718e-06,
+ "loss": 0.4103,
+ "step": 7731
+ },
+ {
+ "epoch": 3.655791962174941,
+ "grad_norm": 2.9802496433258057,
+ "learning_rate": 1.6813297725523613e-06,
+ "loss": 0.4297,
+ "step": 7732
+ },
+ {
+ "epoch": 3.6562647754137116,
+ "grad_norm": 3.0135059356689453,
+ "learning_rate": 1.680740367492657e-06,
+ "loss": 0.4526,
+ "step": 7733
+ },
+ {
+ "epoch": 3.656737588652482,
+ "grad_norm": 2.7776739597320557,
+ "learning_rate": 1.6801510134446575e-06,
+ "loss": 0.3924,
+ "step": 7734
+ },
+ {
+ "epoch": 3.657210401891253,
+ "grad_norm": 2.7500126361846924,
+ "learning_rate": 1.6795617104450595e-06,
+ "loss": 0.3785,
+ "step": 7735
+ },
+ {
+ "epoch": 3.6576832151300236,
+ "grad_norm": 3.494142770767212,
+ "learning_rate": 1.6789724585305566e-06,
+ "loss": 0.3483,
+ "step": 7736
+ },
+ {
+ "epoch": 3.6581560283687944,
+ "grad_norm": 3.055081605911255,
+ "learning_rate": 1.6783832577378377e-06,
+ "loss": 0.4481,
+ "step": 7737
+ },
+ {
+ "epoch": 3.658628841607565,
+ "grad_norm": 2.781412124633789,
+ "learning_rate": 1.6777941081035914e-06,
+ "loss": 0.3969,
+ "step": 7738
+ },
+ {
+ "epoch": 3.6591016548463355,
+ "grad_norm": 3.1672184467315674,
+ "learning_rate": 1.677205009664501e-06,
+ "loss": 0.3959,
+ "step": 7739
+ },
+ {
+ "epoch": 3.6595744680851063,
+ "grad_norm": 3.0597715377807617,
+ "learning_rate": 1.6766159624572458e-06,
+ "loss": 0.418,
+ "step": 7740
+ },
+ {
+ "epoch": 3.660047281323877,
+ "grad_norm": 3.2906267642974854,
+ "learning_rate": 1.676026966518505e-06,
+ "loss": 0.4335,
+ "step": 7741
+ },
+ {
+ "epoch": 3.6605200945626475,
+ "grad_norm": 3.2519290447235107,
+ "learning_rate": 1.6754380218849515e-06,
+ "loss": 0.3786,
+ "step": 7742
+ },
+ {
+ "epoch": 3.6609929078014183,
+ "grad_norm": 3.24716854095459,
+ "learning_rate": 1.6748491285932572e-06,
+ "loss": 0.3599,
+ "step": 7743
+ },
+ {
+ "epoch": 3.661465721040189,
+ "grad_norm": 3.2940993309020996,
+ "learning_rate": 1.6742602866800897e-06,
+ "loss": 0.3934,
+ "step": 7744
+ },
+ {
+ "epoch": 3.66193853427896,
+ "grad_norm": 2.917409896850586,
+ "learning_rate": 1.6736714961821124e-06,
+ "loss": 0.4197,
+ "step": 7745
+ },
+ {
+ "epoch": 3.6624113475177307,
+ "grad_norm": 3.005068063735962,
+ "learning_rate": 1.6730827571359887e-06,
+ "loss": 0.4239,
+ "step": 7746
+ },
+ {
+ "epoch": 3.662884160756501,
+ "grad_norm": 2.751880168914795,
+ "learning_rate": 1.6724940695783745e-06,
+ "loss": 0.4257,
+ "step": 7747
+ },
+ {
+ "epoch": 3.663356973995272,
+ "grad_norm": 3.090670585632324,
+ "learning_rate": 1.6719054335459273e-06,
+ "loss": 0.3686,
+ "step": 7748
+ },
+ {
+ "epoch": 3.6638297872340426,
+ "grad_norm": 3.250251293182373,
+ "learning_rate": 1.6713168490752974e-06,
+ "loss": 0.4249,
+ "step": 7749
+ },
+ {
+ "epoch": 3.664302600472813,
+ "grad_norm": 2.8662827014923096,
+ "learning_rate": 1.6707283162031335e-06,
+ "loss": 0.3692,
+ "step": 7750
+ },
+ {
+ "epoch": 3.6647754137115838,
+ "grad_norm": 2.8709118366241455,
+ "learning_rate": 1.6701398349660813e-06,
+ "loss": 0.3929,
+ "step": 7751
+ },
+ {
+ "epoch": 3.6652482269503546,
+ "grad_norm": 2.992035388946533,
+ "learning_rate": 1.6695514054007822e-06,
+ "loss": 0.4131,
+ "step": 7752
+ },
+ {
+ "epoch": 3.6657210401891254,
+ "grad_norm": 3.0427589416503906,
+ "learning_rate": 1.668963027543876e-06,
+ "loss": 0.387,
+ "step": 7753
+ },
+ {
+ "epoch": 3.666193853427896,
+ "grad_norm": 3.0147807598114014,
+ "learning_rate": 1.6683747014319987e-06,
+ "loss": 0.3648,
+ "step": 7754
+ },
+ {
+ "epoch": 3.6666666666666665,
+ "grad_norm": 2.5483829975128174,
+ "learning_rate": 1.6677864271017811e-06,
+ "loss": 0.3643,
+ "step": 7755
+ },
+ {
+ "epoch": 3.6671394799054373,
+ "grad_norm": 2.7661986351013184,
+ "learning_rate": 1.6671982045898544e-06,
+ "loss": 0.3731,
+ "step": 7756
+ },
+ {
+ "epoch": 3.667612293144208,
+ "grad_norm": 2.778036117553711,
+ "learning_rate": 1.666610033932843e-06,
+ "loss": 0.3744,
+ "step": 7757
+ },
+ {
+ "epoch": 3.6680851063829785,
+ "grad_norm": 2.9028329849243164,
+ "learning_rate": 1.6660219151673712e-06,
+ "loss": 0.4286,
+ "step": 7758
+ },
+ {
+ "epoch": 3.6685579196217493,
+ "grad_norm": 2.826687812805176,
+ "learning_rate": 1.6654338483300575e-06,
+ "loss": 0.318,
+ "step": 7759
+ },
+ {
+ "epoch": 3.66903073286052,
+ "grad_norm": 2.7063660621643066,
+ "learning_rate": 1.6648458334575186e-06,
+ "loss": 0.3351,
+ "step": 7760
+ },
+ {
+ "epoch": 3.669503546099291,
+ "grad_norm": 2.708361864089966,
+ "learning_rate": 1.664257870586368e-06,
+ "loss": 0.376,
+ "step": 7761
+ },
+ {
+ "epoch": 3.6699763593380617,
+ "grad_norm": 3.1139161586761475,
+ "learning_rate": 1.6636699597532141e-06,
+ "loss": 0.3572,
+ "step": 7762
+ },
+ {
+ "epoch": 3.670449172576832,
+ "grad_norm": 3.0858285427093506,
+ "learning_rate": 1.6630821009946658e-06,
+ "loss": 0.4204,
+ "step": 7763
+ },
+ {
+ "epoch": 3.670921985815603,
+ "grad_norm": 3.5593984127044678,
+ "learning_rate": 1.6624942943473252e-06,
+ "loss": 0.463,
+ "step": 7764
+ },
+ {
+ "epoch": 3.6713947990543736,
+ "grad_norm": 2.863851308822632,
+ "learning_rate": 1.6619065398477921e-06,
+ "loss": 0.4272,
+ "step": 7765
+ },
+ {
+ "epoch": 3.671867612293144,
+ "grad_norm": 2.833399772644043,
+ "learning_rate": 1.6613188375326638e-06,
+ "loss": 0.3509,
+ "step": 7766
+ },
+ {
+ "epoch": 3.6723404255319148,
+ "grad_norm": 2.988948345184326,
+ "learning_rate": 1.6607311874385346e-06,
+ "loss": 0.3572,
+ "step": 7767
+ },
+ {
+ "epoch": 3.6728132387706856,
+ "grad_norm": 2.7349398136138916,
+ "learning_rate": 1.6601435896019936e-06,
+ "loss": 0.3875,
+ "step": 7768
+ },
+ {
+ "epoch": 3.6732860520094563,
+ "grad_norm": 2.8544445037841797,
+ "learning_rate": 1.659556044059629e-06,
+ "loss": 0.4057,
+ "step": 7769
+ },
+ {
+ "epoch": 3.673758865248227,
+ "grad_norm": 3.0341904163360596,
+ "learning_rate": 1.6589685508480235e-06,
+ "loss": 0.3935,
+ "step": 7770
+ },
+ {
+ "epoch": 3.6742316784869975,
+ "grad_norm": 2.7495710849761963,
+ "learning_rate": 1.6583811100037595e-06,
+ "loss": 0.352,
+ "step": 7771
+ },
+ {
+ "epoch": 3.6747044917257683,
+ "grad_norm": 3.258525848388672,
+ "learning_rate": 1.6577937215634133e-06,
+ "loss": 0.4414,
+ "step": 7772
+ },
+ {
+ "epoch": 3.675177304964539,
+ "grad_norm": 3.0686328411102295,
+ "learning_rate": 1.657206385563558e-06,
+ "loss": 0.353,
+ "step": 7773
+ },
+ {
+ "epoch": 3.6756501182033094,
+ "grad_norm": 3.1168248653411865,
+ "learning_rate": 1.6566191020407668e-06,
+ "loss": 0.4064,
+ "step": 7774
+ },
+ {
+ "epoch": 3.6761229314420802,
+ "grad_norm": 2.7837352752685547,
+ "learning_rate": 1.6560318710316053e-06,
+ "loss": 0.3956,
+ "step": 7775
+ },
+ {
+ "epoch": 3.676595744680851,
+ "grad_norm": 3.1514039039611816,
+ "learning_rate": 1.6554446925726391e-06,
+ "loss": 0.4044,
+ "step": 7776
+ },
+ {
+ "epoch": 3.677068557919622,
+ "grad_norm": 3.010352611541748,
+ "learning_rate": 1.6548575667004285e-06,
+ "loss": 0.4162,
+ "step": 7777
+ },
+ {
+ "epoch": 3.6775413711583926,
+ "grad_norm": 3.1727633476257324,
+ "learning_rate": 1.6542704934515308e-06,
+ "loss": 0.411,
+ "step": 7778
+ },
+ {
+ "epoch": 3.678014184397163,
+ "grad_norm": 3.6771271228790283,
+ "learning_rate": 1.6536834728625018e-06,
+ "loss": 0.4562,
+ "step": 7779
+ },
+ {
+ "epoch": 3.678486997635934,
+ "grad_norm": 2.9793131351470947,
+ "learning_rate": 1.6530965049698908e-06,
+ "loss": 0.4039,
+ "step": 7780
+ },
+ {
+ "epoch": 3.6789598108747046,
+ "grad_norm": 3.193751096725464,
+ "learning_rate": 1.6525095898102478e-06,
+ "loss": 0.4064,
+ "step": 7781
+ },
+ {
+ "epoch": 3.679432624113475,
+ "grad_norm": 2.6643173694610596,
+ "learning_rate": 1.6519227274201169e-06,
+ "loss": 0.3731,
+ "step": 7782
+ },
+ {
+ "epoch": 3.6799054373522457,
+ "grad_norm": 3.4855685234069824,
+ "learning_rate": 1.6513359178360384e-06,
+ "loss": 0.3815,
+ "step": 7783
+ },
+ {
+ "epoch": 3.6803782505910165,
+ "grad_norm": 3.320537567138672,
+ "learning_rate": 1.6507491610945514e-06,
+ "loss": 0.4065,
+ "step": 7784
+ },
+ {
+ "epoch": 3.6808510638297873,
+ "grad_norm": 3.2793102264404297,
+ "learning_rate": 1.6501624572321895e-06,
+ "loss": 0.429,
+ "step": 7785
+ },
+ {
+ "epoch": 3.681323877068558,
+ "grad_norm": 2.8609631061553955,
+ "learning_rate": 1.6495758062854854e-06,
+ "loss": 0.3881,
+ "step": 7786
+ },
+ {
+ "epoch": 3.6817966903073285,
+ "grad_norm": 2.956533193588257,
+ "learning_rate": 1.6489892082909675e-06,
+ "loss": 0.3662,
+ "step": 7787
+ },
+ {
+ "epoch": 3.6822695035460993,
+ "grad_norm": 2.949092149734497,
+ "learning_rate": 1.6484026632851591e-06,
+ "loss": 0.4021,
+ "step": 7788
+ },
+ {
+ "epoch": 3.68274231678487,
+ "grad_norm": 2.802572250366211,
+ "learning_rate": 1.6478161713045831e-06,
+ "loss": 0.3862,
+ "step": 7789
+ },
+ {
+ "epoch": 3.6832151300236404,
+ "grad_norm": 3.0064797401428223,
+ "learning_rate": 1.6472297323857578e-06,
+ "loss": 0.3495,
+ "step": 7790
+ },
+ {
+ "epoch": 3.6836879432624112,
+ "grad_norm": 2.74037766456604,
+ "learning_rate": 1.646643346565197e-06,
+ "loss": 0.362,
+ "step": 7791
+ },
+ {
+ "epoch": 3.684160756501182,
+ "grad_norm": 3.423326253890991,
+ "learning_rate": 1.646057013879414e-06,
+ "loss": 0.4383,
+ "step": 7792
+ },
+ {
+ "epoch": 3.684633569739953,
+ "grad_norm": 2.7426443099975586,
+ "learning_rate": 1.645470734364915e-06,
+ "loss": 0.3741,
+ "step": 7793
+ },
+ {
+ "epoch": 3.6851063829787236,
+ "grad_norm": 2.879296064376831,
+ "learning_rate": 1.6448845080582077e-06,
+ "loss": 0.4003,
+ "step": 7794
+ },
+ {
+ "epoch": 3.685579196217494,
+ "grad_norm": 2.6821188926696777,
+ "learning_rate": 1.6442983349957924e-06,
+ "loss": 0.3545,
+ "step": 7795
+ },
+ {
+ "epoch": 3.6860520094562648,
+ "grad_norm": 2.714059829711914,
+ "learning_rate": 1.6437122152141665e-06,
+ "loss": 0.3702,
+ "step": 7796
+ },
+ {
+ "epoch": 3.6865248226950356,
+ "grad_norm": 2.593811511993408,
+ "learning_rate": 1.6431261487498274e-06,
+ "loss": 0.3512,
+ "step": 7797
+ },
+ {
+ "epoch": 3.686997635933806,
+ "grad_norm": 2.687533378601074,
+ "learning_rate": 1.6425401356392652e-06,
+ "loss": 0.36,
+ "step": 7798
+ },
+ {
+ "epoch": 3.6874704491725767,
+ "grad_norm": 3.1675431728363037,
+ "learning_rate": 1.6419541759189694e-06,
+ "loss": 0.3349,
+ "step": 7799
+ },
+ {
+ "epoch": 3.6879432624113475,
+ "grad_norm": 2.777310371398926,
+ "learning_rate": 1.6413682696254246e-06,
+ "loss": 0.317,
+ "step": 7800
+ },
+ {
+ "epoch": 3.6884160756501183,
+ "grad_norm": 3.0121655464172363,
+ "learning_rate": 1.640782416795112e-06,
+ "loss": 0.3612,
+ "step": 7801
+ },
+ {
+ "epoch": 3.688888888888889,
+ "grad_norm": 3.0532145500183105,
+ "learning_rate": 1.6401966174645113e-06,
+ "loss": 0.4065,
+ "step": 7802
+ },
+ {
+ "epoch": 3.6893617021276595,
+ "grad_norm": 2.8221664428710938,
+ "learning_rate": 1.6396108716700961e-06,
+ "loss": 0.3669,
+ "step": 7803
+ },
+ {
+ "epoch": 3.6898345153664303,
+ "grad_norm": 2.966357707977295,
+ "learning_rate": 1.6390251794483405e-06,
+ "loss": 0.391,
+ "step": 7804
+ },
+ {
+ "epoch": 3.690307328605201,
+ "grad_norm": 3.460252046585083,
+ "learning_rate": 1.6384395408357118e-06,
+ "loss": 0.429,
+ "step": 7805
+ },
+ {
+ "epoch": 3.6907801418439714,
+ "grad_norm": 2.8907718658447266,
+ "learning_rate": 1.637853955868674e-06,
+ "loss": 0.3761,
+ "step": 7806
+ },
+ {
+ "epoch": 3.691252955082742,
+ "grad_norm": 3.114612102508545,
+ "learning_rate": 1.6372684245836912e-06,
+ "loss": 0.4376,
+ "step": 7807
+ },
+ {
+ "epoch": 3.691725768321513,
+ "grad_norm": 2.9361326694488525,
+ "learning_rate": 1.6366829470172191e-06,
+ "loss": 0.3672,
+ "step": 7808
+ },
+ {
+ "epoch": 3.692198581560284,
+ "grad_norm": 3.2719476222991943,
+ "learning_rate": 1.6360975232057156e-06,
+ "loss": 0.4266,
+ "step": 7809
+ },
+ {
+ "epoch": 3.6926713947990546,
+ "grad_norm": 2.873952865600586,
+ "learning_rate": 1.635512153185631e-06,
+ "loss": 0.4056,
+ "step": 7810
+ },
+ {
+ "epoch": 3.693144208037825,
+ "grad_norm": 3.0273401737213135,
+ "learning_rate": 1.634926836993413e-06,
+ "loss": 0.3947,
+ "step": 7811
+ },
+ {
+ "epoch": 3.6936170212765957,
+ "grad_norm": 2.868738889694214,
+ "learning_rate": 1.634341574665509e-06,
+ "loss": 0.3935,
+ "step": 7812
+ },
+ {
+ "epoch": 3.6940898345153665,
+ "grad_norm": 3.3080437183380127,
+ "learning_rate": 1.6337563662383591e-06,
+ "loss": 0.3606,
+ "step": 7813
+ },
+ {
+ "epoch": 3.694562647754137,
+ "grad_norm": 2.8339016437530518,
+ "learning_rate": 1.6331712117484014e-06,
+ "loss": 0.4019,
+ "step": 7814
+ },
+ {
+ "epoch": 3.6950354609929077,
+ "grad_norm": 2.666815996170044,
+ "learning_rate": 1.6325861112320717e-06,
+ "loss": 0.3502,
+ "step": 7815
+ },
+ {
+ "epoch": 3.6955082742316785,
+ "grad_norm": 2.7624311447143555,
+ "learning_rate": 1.6320010647258008e-06,
+ "loss": 0.3481,
+ "step": 7816
+ },
+ {
+ "epoch": 3.6959810874704493,
+ "grad_norm": 2.7796332836151123,
+ "learning_rate": 1.6314160722660183e-06,
+ "loss": 0.3735,
+ "step": 7817
+ },
+ {
+ "epoch": 3.69645390070922,
+ "grad_norm": 2.954318046569824,
+ "learning_rate": 1.6308311338891484e-06,
+ "loss": 0.3933,
+ "step": 7818
+ },
+ {
+ "epoch": 3.6969267139479904,
+ "grad_norm": 2.821072816848755,
+ "learning_rate": 1.6302462496316115e-06,
+ "loss": 0.3437,
+ "step": 7819
+ },
+ {
+ "epoch": 3.6973995271867612,
+ "grad_norm": 3.436192750930786,
+ "learning_rate": 1.629661419529828e-06,
+ "loss": 0.4469,
+ "step": 7820
+ },
+ {
+ "epoch": 3.697872340425532,
+ "grad_norm": 3.1361067295074463,
+ "learning_rate": 1.629076643620211e-06,
+ "loss": 0.3887,
+ "step": 7821
+ },
+ {
+ "epoch": 3.6983451536643024,
+ "grad_norm": 3.355024576187134,
+ "learning_rate": 1.6284919219391732e-06,
+ "loss": 0.424,
+ "step": 7822
+ },
+ {
+ "epoch": 3.698817966903073,
+ "grad_norm": 2.7671639919281006,
+ "learning_rate": 1.6279072545231212e-06,
+ "loss": 0.3765,
+ "step": 7823
+ },
+ {
+ "epoch": 3.699290780141844,
+ "grad_norm": 2.9509360790252686,
+ "learning_rate": 1.6273226414084606e-06,
+ "loss": 0.4057,
+ "step": 7824
+ },
+ {
+ "epoch": 3.699763593380615,
+ "grad_norm": 2.9852921962738037,
+ "learning_rate": 1.6267380826315932e-06,
+ "loss": 0.4238,
+ "step": 7825
+ },
+ {
+ "epoch": 3.7002364066193856,
+ "grad_norm": 2.826594114303589,
+ "learning_rate": 1.626153578228915e-06,
+ "loss": 0.3958,
+ "step": 7826
+ },
+ {
+ "epoch": 3.700709219858156,
+ "grad_norm": 2.9103410243988037,
+ "learning_rate": 1.6255691282368228e-06,
+ "loss": 0.394,
+ "step": 7827
+ },
+ {
+ "epoch": 3.7011820330969267,
+ "grad_norm": 3.362992525100708,
+ "learning_rate": 1.6249847326917068e-06,
+ "loss": 0.4233,
+ "step": 7828
+ },
+ {
+ "epoch": 3.7016548463356975,
+ "grad_norm": 2.711280107498169,
+ "learning_rate": 1.624400391629954e-06,
+ "loss": 0.2977,
+ "step": 7829
+ },
+ {
+ "epoch": 3.702127659574468,
+ "grad_norm": 2.8354649543762207,
+ "learning_rate": 1.6238161050879497e-06,
+ "loss": 0.3549,
+ "step": 7830
+ },
+ {
+ "epoch": 3.7026004728132387,
+ "grad_norm": 3.096376895904541,
+ "learning_rate": 1.6232318731020743e-06,
+ "loss": 0.3486,
+ "step": 7831
+ },
+ {
+ "epoch": 3.7030732860520095,
+ "grad_norm": 2.918267250061035,
+ "learning_rate": 1.6226476957087064e-06,
+ "loss": 0.3659,
+ "step": 7832
+ },
+ {
+ "epoch": 3.7035460992907803,
+ "grad_norm": 2.705399513244629,
+ "learning_rate": 1.6220635729442195e-06,
+ "loss": 0.4301,
+ "step": 7833
+ },
+ {
+ "epoch": 3.704018912529551,
+ "grad_norm": 2.9192235469818115,
+ "learning_rate": 1.621479504844983e-06,
+ "loss": 0.3384,
+ "step": 7834
+ },
+ {
+ "epoch": 3.7044917257683214,
+ "grad_norm": 2.78623104095459,
+ "learning_rate": 1.6208954914473669e-06,
+ "loss": 0.3528,
+ "step": 7835
+ },
+ {
+ "epoch": 3.704964539007092,
+ "grad_norm": 3.0218069553375244,
+ "learning_rate": 1.6203115327877333e-06,
+ "loss": 0.3698,
+ "step": 7836
+ },
+ {
+ "epoch": 3.705437352245863,
+ "grad_norm": 3.019101619720459,
+ "learning_rate": 1.6197276289024422e-06,
+ "loss": 0.4398,
+ "step": 7837
+ },
+ {
+ "epoch": 3.7059101654846334,
+ "grad_norm": 2.9220848083496094,
+ "learning_rate": 1.6191437798278531e-06,
+ "loss": 0.3803,
+ "step": 7838
+ },
+ {
+ "epoch": 3.706382978723404,
+ "grad_norm": 3.2731969356536865,
+ "learning_rate": 1.6185599856003181e-06,
+ "loss": 0.4529,
+ "step": 7839
+ },
+ {
+ "epoch": 3.706855791962175,
+ "grad_norm": 2.85239577293396,
+ "learning_rate": 1.617976246256188e-06,
+ "loss": 0.3801,
+ "step": 7840
+ },
+ {
+ "epoch": 3.7073286052009458,
+ "grad_norm": 2.8250765800476074,
+ "learning_rate": 1.6173925618318092e-06,
+ "loss": 0.3267,
+ "step": 7841
+ },
+ {
+ "epoch": 3.7078014184397166,
+ "grad_norm": 2.9152321815490723,
+ "learning_rate": 1.616808932363525e-06,
+ "loss": 0.428,
+ "step": 7842
+ },
+ {
+ "epoch": 3.708274231678487,
+ "grad_norm": 2.912656545639038,
+ "learning_rate": 1.6162253578876766e-06,
+ "loss": 0.3802,
+ "step": 7843
+ },
+ {
+ "epoch": 3.7087470449172577,
+ "grad_norm": 3.0700762271881104,
+ "learning_rate": 1.6156418384405992e-06,
+ "loss": 0.377,
+ "step": 7844
+ },
+ {
+ "epoch": 3.7092198581560285,
+ "grad_norm": 2.873141050338745,
+ "learning_rate": 1.6150583740586274e-06,
+ "loss": 0.399,
+ "step": 7845
+ },
+ {
+ "epoch": 3.709692671394799,
+ "grad_norm": 2.899555206298828,
+ "learning_rate": 1.6144749647780906e-06,
+ "loss": 0.402,
+ "step": 7846
+ },
+ {
+ "epoch": 3.7101654846335697,
+ "grad_norm": 3.257697343826294,
+ "learning_rate": 1.6138916106353139e-06,
+ "loss": 0.4193,
+ "step": 7847
+ },
+ {
+ "epoch": 3.7106382978723405,
+ "grad_norm": 2.6879804134368896,
+ "learning_rate": 1.613308311666622e-06,
+ "loss": 0.3474,
+ "step": 7848
+ },
+ {
+ "epoch": 3.7111111111111112,
+ "grad_norm": 2.712491273880005,
+ "learning_rate": 1.6127250679083323e-06,
+ "loss": 0.3315,
+ "step": 7849
+ },
+ {
+ "epoch": 3.711583924349882,
+ "grad_norm": 2.9762673377990723,
+ "learning_rate": 1.6121418793967631e-06,
+ "loss": 0.3953,
+ "step": 7850
+ },
+ {
+ "epoch": 3.7120567375886524,
+ "grad_norm": 2.743668556213379,
+ "learning_rate": 1.6115587461682258e-06,
+ "loss": 0.381,
+ "step": 7851
+ },
+ {
+ "epoch": 3.712529550827423,
+ "grad_norm": 3.0545318126678467,
+ "learning_rate": 1.6109756682590288e-06,
+ "loss": 0.412,
+ "step": 7852
+ },
+ {
+ "epoch": 3.713002364066194,
+ "grad_norm": 3.0125906467437744,
+ "learning_rate": 1.61039264570548e-06,
+ "loss": 0.3931,
+ "step": 7853
+ },
+ {
+ "epoch": 3.7134751773049643,
+ "grad_norm": 2.809302806854248,
+ "learning_rate": 1.6098096785438794e-06,
+ "loss": 0.3943,
+ "step": 7854
+ },
+ {
+ "epoch": 3.713947990543735,
+ "grad_norm": 3.092452049255371,
+ "learning_rate": 1.6092267668105276e-06,
+ "loss": 0.3932,
+ "step": 7855
+ },
+ {
+ "epoch": 3.714420803782506,
+ "grad_norm": 2.9878969192504883,
+ "learning_rate": 1.608643910541719e-06,
+ "loss": 0.4289,
+ "step": 7856
+ },
+ {
+ "epoch": 3.7148936170212767,
+ "grad_norm": 2.693387508392334,
+ "learning_rate": 1.6080611097737444e-06,
+ "loss": 0.373,
+ "step": 7857
+ },
+ {
+ "epoch": 3.7153664302600475,
+ "grad_norm": 3.4097673892974854,
+ "learning_rate": 1.6074783645428945e-06,
+ "loss": 0.4487,
+ "step": 7858
+ },
+ {
+ "epoch": 3.715839243498818,
+ "grad_norm": 3.1466784477233887,
+ "learning_rate": 1.6068956748854525e-06,
+ "loss": 0.3648,
+ "step": 7859
+ },
+ {
+ "epoch": 3.7163120567375887,
+ "grad_norm": 3.062107563018799,
+ "learning_rate": 1.6063130408377015e-06,
+ "loss": 0.3899,
+ "step": 7860
+ },
+ {
+ "epoch": 3.7167848699763595,
+ "grad_norm": 3.2298364639282227,
+ "learning_rate": 1.6057304624359188e-06,
+ "loss": 0.4243,
+ "step": 7861
+ },
+ {
+ "epoch": 3.71725768321513,
+ "grad_norm": 3.0285773277282715,
+ "learning_rate": 1.6051479397163784e-06,
+ "loss": 0.3469,
+ "step": 7862
+ },
+ {
+ "epoch": 3.7177304964539006,
+ "grad_norm": 2.8438515663146973,
+ "learning_rate": 1.6045654727153525e-06,
+ "loss": 0.3363,
+ "step": 7863
+ },
+ {
+ "epoch": 3.7182033096926714,
+ "grad_norm": 3.1558034420013428,
+ "learning_rate": 1.6039830614691081e-06,
+ "loss": 0.4326,
+ "step": 7864
+ },
+ {
+ "epoch": 3.7186761229314422,
+ "grad_norm": 2.438640594482422,
+ "learning_rate": 1.603400706013909e-06,
+ "loss": 0.3647,
+ "step": 7865
+ },
+ {
+ "epoch": 3.719148936170213,
+ "grad_norm": 3.0443127155303955,
+ "learning_rate": 1.6028184063860168e-06,
+ "loss": 0.4295,
+ "step": 7866
+ },
+ {
+ "epoch": 3.7196217494089834,
+ "grad_norm": 2.8009512424468994,
+ "learning_rate": 1.602236162621688e-06,
+ "loss": 0.4055,
+ "step": 7867
+ },
+ {
+ "epoch": 3.720094562647754,
+ "grad_norm": 3.227698802947998,
+ "learning_rate": 1.6016539747571775e-06,
+ "loss": 0.4681,
+ "step": 7868
+ },
+ {
+ "epoch": 3.720567375886525,
+ "grad_norm": 2.8242595195770264,
+ "learning_rate": 1.601071842828735e-06,
+ "loss": 0.3707,
+ "step": 7869
+ },
+ {
+ "epoch": 3.7210401891252953,
+ "grad_norm": 3.602937698364258,
+ "learning_rate": 1.6004897668726067e-06,
+ "loss": 0.5201,
+ "step": 7870
+ },
+ {
+ "epoch": 3.721513002364066,
+ "grad_norm": 3.578422784805298,
+ "learning_rate": 1.599907746925037e-06,
+ "loss": 0.4514,
+ "step": 7871
+ },
+ {
+ "epoch": 3.721985815602837,
+ "grad_norm": 2.7365758419036865,
+ "learning_rate": 1.5993257830222635e-06,
+ "loss": 0.356,
+ "step": 7872
+ },
+ {
+ "epoch": 3.7224586288416077,
+ "grad_norm": 3.125636577606201,
+ "learning_rate": 1.5987438752005258e-06,
+ "loss": 0.4277,
+ "step": 7873
+ },
+ {
+ "epoch": 3.7229314420803785,
+ "grad_norm": 2.7157294750213623,
+ "learning_rate": 1.5981620234960549e-06,
+ "loss": 0.363,
+ "step": 7874
+ },
+ {
+ "epoch": 3.723404255319149,
+ "grad_norm": 2.90950083732605,
+ "learning_rate": 1.5975802279450793e-06,
+ "loss": 0.4027,
+ "step": 7875
+ },
+ {
+ "epoch": 3.7238770685579197,
+ "grad_norm": 2.659787178039551,
+ "learning_rate": 1.596998488583827e-06,
+ "loss": 0.3632,
+ "step": 7876
+ },
+ {
+ "epoch": 3.7243498817966905,
+ "grad_norm": 3.221623182296753,
+ "learning_rate": 1.5964168054485185e-06,
+ "loss": 0.4295,
+ "step": 7877
+ },
+ {
+ "epoch": 3.724822695035461,
+ "grad_norm": 2.6838672161102295,
+ "learning_rate": 1.595835178575374e-06,
+ "loss": 0.3413,
+ "step": 7878
+ },
+ {
+ "epoch": 3.7252955082742316,
+ "grad_norm": 2.804706335067749,
+ "learning_rate": 1.5952536080006084e-06,
+ "loss": 0.3801,
+ "step": 7879
+ },
+ {
+ "epoch": 3.7257683215130024,
+ "grad_norm": 2.7647509574890137,
+ "learning_rate": 1.5946720937604326e-06,
+ "loss": 0.3941,
+ "step": 7880
+ },
+ {
+ "epoch": 3.726241134751773,
+ "grad_norm": 2.8363754749298096,
+ "learning_rate": 1.5940906358910566e-06,
+ "loss": 0.3772,
+ "step": 7881
+ },
+ {
+ "epoch": 3.726713947990544,
+ "grad_norm": 3.4147698879241943,
+ "learning_rate": 1.5935092344286835e-06,
+ "loss": 0.399,
+ "step": 7882
+ },
+ {
+ "epoch": 3.7271867612293144,
+ "grad_norm": 2.984090805053711,
+ "learning_rate": 1.5929278894095162e-06,
+ "loss": 0.3373,
+ "step": 7883
+ },
+ {
+ "epoch": 3.727659574468085,
+ "grad_norm": 3.250173330307007,
+ "learning_rate": 1.5923466008697521e-06,
+ "loss": 0.3932,
+ "step": 7884
+ },
+ {
+ "epoch": 3.728132387706856,
+ "grad_norm": 3.2699649333953857,
+ "learning_rate": 1.5917653688455848e-06,
+ "loss": 0.4529,
+ "step": 7885
+ },
+ {
+ "epoch": 3.7286052009456263,
+ "grad_norm": 3.175934076309204,
+ "learning_rate": 1.591184193373206e-06,
+ "loss": 0.3726,
+ "step": 7886
+ },
+ {
+ "epoch": 3.729078014184397,
+ "grad_norm": 2.8128812313079834,
+ "learning_rate": 1.5906030744888024e-06,
+ "loss": 0.3648,
+ "step": 7887
+ },
+ {
+ "epoch": 3.729550827423168,
+ "grad_norm": 3.025012493133545,
+ "learning_rate": 1.5900220122285564e-06,
+ "loss": 0.3945,
+ "step": 7888
+ },
+ {
+ "epoch": 3.7300236406619387,
+ "grad_norm": 3.237680435180664,
+ "learning_rate": 1.5894410066286512e-06,
+ "loss": 0.3815,
+ "step": 7889
+ },
+ {
+ "epoch": 3.7304964539007095,
+ "grad_norm": 3.458033323287964,
+ "learning_rate": 1.5888600577252605e-06,
+ "loss": 0.4104,
+ "step": 7890
+ },
+ {
+ "epoch": 3.73096926713948,
+ "grad_norm": 2.718867540359497,
+ "learning_rate": 1.58827916555456e-06,
+ "loss": 0.3243,
+ "step": 7891
+ },
+ {
+ "epoch": 3.7314420803782506,
+ "grad_norm": 3.047157049179077,
+ "learning_rate": 1.5876983301527176e-06,
+ "loss": 0.3689,
+ "step": 7892
+ },
+ {
+ "epoch": 3.731914893617021,
+ "grad_norm": 3.2904715538024902,
+ "learning_rate": 1.5871175515558995e-06,
+ "loss": 0.4045,
+ "step": 7893
+ },
+ {
+ "epoch": 3.732387706855792,
+ "grad_norm": 2.956467866897583,
+ "learning_rate": 1.5865368298002692e-06,
+ "loss": 0.3806,
+ "step": 7894
+ },
+ {
+ "epoch": 3.7328605200945626,
+ "grad_norm": 3.3309173583984375,
+ "learning_rate": 1.5859561649219843e-06,
+ "loss": 0.4011,
+ "step": 7895
+ },
+ {
+ "epoch": 3.7333333333333334,
+ "grad_norm": 2.7853524684906006,
+ "learning_rate": 1.5853755569572018e-06,
+ "loss": 0.3239,
+ "step": 7896
+ },
+ {
+ "epoch": 3.733806146572104,
+ "grad_norm": 2.9832780361175537,
+ "learning_rate": 1.584795005942073e-06,
+ "loss": 0.4582,
+ "step": 7897
+ },
+ {
+ "epoch": 3.7342789598108745,
+ "grad_norm": 3.2866461277008057,
+ "learning_rate": 1.584214511912745e-06,
+ "loss": 0.3876,
+ "step": 7898
+ },
+ {
+ "epoch": 3.7347517730496453,
+ "grad_norm": 3.018526792526245,
+ "learning_rate": 1.5836340749053646e-06,
+ "loss": 0.3221,
+ "step": 7899
+ },
+ {
+ "epoch": 3.735224586288416,
+ "grad_norm": 2.9109885692596436,
+ "learning_rate": 1.583053694956072e-06,
+ "loss": 0.4225,
+ "step": 7900
+ },
+ {
+ "epoch": 3.7356973995271865,
+ "grad_norm": 3.104146718978882,
+ "learning_rate": 1.5824733721010051e-06,
+ "loss": 0.3843,
+ "step": 7901
+ },
+ {
+ "epoch": 3.7361702127659573,
+ "grad_norm": 3.0982813835144043,
+ "learning_rate": 1.5818931063762989e-06,
+ "loss": 0.4223,
+ "step": 7902
+ },
+ {
+ "epoch": 3.736643026004728,
+ "grad_norm": 2.7797579765319824,
+ "learning_rate": 1.5813128978180819e-06,
+ "loss": 0.3536,
+ "step": 7903
+ },
+ {
+ "epoch": 3.737115839243499,
+ "grad_norm": 2.870884656906128,
+ "learning_rate": 1.5807327464624835e-06,
+ "loss": 0.3053,
+ "step": 7904
+ },
+ {
+ "epoch": 3.7375886524822697,
+ "grad_norm": 2.896674633026123,
+ "learning_rate": 1.5801526523456251e-06,
+ "loss": 0.3806,
+ "step": 7905
+ },
+ {
+ "epoch": 3.73806146572104,
+ "grad_norm": 3.009662389755249,
+ "learning_rate": 1.5795726155036284e-06,
+ "loss": 0.3568,
+ "step": 7906
+ },
+ {
+ "epoch": 3.738534278959811,
+ "grad_norm": 2.6860599517822266,
+ "learning_rate": 1.578992635972609e-06,
+ "loss": 0.4392,
+ "step": 7907
+ },
+ {
+ "epoch": 3.7390070921985816,
+ "grad_norm": 2.9046099185943604,
+ "learning_rate": 1.578412713788679e-06,
+ "loss": 0.3756,
+ "step": 7908
+ },
+ {
+ "epoch": 3.739479905437352,
+ "grad_norm": 2.8035101890563965,
+ "learning_rate": 1.5778328489879488e-06,
+ "loss": 0.3576,
+ "step": 7909
+ },
+ {
+ "epoch": 3.739952718676123,
+ "grad_norm": 2.767514228820801,
+ "learning_rate": 1.5772530416065238e-06,
+ "loss": 0.4037,
+ "step": 7910
+ },
+ {
+ "epoch": 3.7404255319148936,
+ "grad_norm": 3.0867795944213867,
+ "learning_rate": 1.576673291680505e-06,
+ "loss": 0.4394,
+ "step": 7911
+ },
+ {
+ "epoch": 3.7408983451536644,
+ "grad_norm": 3.295976161956787,
+ "learning_rate": 1.5760935992459926e-06,
+ "loss": 0.3938,
+ "step": 7912
+ },
+ {
+ "epoch": 3.741371158392435,
+ "grad_norm": 2.725949287414551,
+ "learning_rate": 1.5755139643390794e-06,
+ "loss": 0.3633,
+ "step": 7913
+ },
+ {
+ "epoch": 3.7418439716312055,
+ "grad_norm": 3.0864083766937256,
+ "learning_rate": 1.5749343869958585e-06,
+ "loss": 0.3034,
+ "step": 7914
+ },
+ {
+ "epoch": 3.7423167848699763,
+ "grad_norm": 3.707273244857788,
+ "learning_rate": 1.5743548672524175e-06,
+ "loss": 0.4206,
+ "step": 7915
+ },
+ {
+ "epoch": 3.742789598108747,
+ "grad_norm": 2.9829516410827637,
+ "learning_rate": 1.573775405144839e-06,
+ "loss": 0.333,
+ "step": 7916
+ },
+ {
+ "epoch": 3.7432624113475175,
+ "grad_norm": 3.3303117752075195,
+ "learning_rate": 1.5731960007092056e-06,
+ "loss": 0.4558,
+ "step": 7917
+ },
+ {
+ "epoch": 3.7437352245862883,
+ "grad_norm": 2.63291335105896,
+ "learning_rate": 1.5726166539815925e-06,
+ "loss": 0.39,
+ "step": 7918
+ },
+ {
+ "epoch": 3.744208037825059,
+ "grad_norm": 3.0533673763275146,
+ "learning_rate": 1.572037364998075e-06,
+ "loss": 0.3586,
+ "step": 7919
+ },
+ {
+ "epoch": 3.74468085106383,
+ "grad_norm": 2.9185104370117188,
+ "learning_rate": 1.5714581337947216e-06,
+ "loss": 0.3809,
+ "step": 7920
+ },
+ {
+ "epoch": 3.7451536643026007,
+ "grad_norm": 3.1863298416137695,
+ "learning_rate": 1.5708789604075975e-06,
+ "loss": 0.4132,
+ "step": 7921
+ },
+ {
+ "epoch": 3.745626477541371,
+ "grad_norm": 3.2700514793395996,
+ "learning_rate": 1.5702998448727674e-06,
+ "loss": 0.4601,
+ "step": 7922
+ },
+ {
+ "epoch": 3.746099290780142,
+ "grad_norm": 3.4729206562042236,
+ "learning_rate": 1.5697207872262886e-06,
+ "loss": 0.4585,
+ "step": 7923
+ },
+ {
+ "epoch": 3.7465721040189126,
+ "grad_norm": 3.1432926654815674,
+ "learning_rate": 1.5691417875042182e-06,
+ "loss": 0.3128,
+ "step": 7924
+ },
+ {
+ "epoch": 3.747044917257683,
+ "grad_norm": 3.096121072769165,
+ "learning_rate": 1.5685628457426066e-06,
+ "loss": 0.3903,
+ "step": 7925
+ },
+ {
+ "epoch": 3.7475177304964538,
+ "grad_norm": 2.6897027492523193,
+ "learning_rate": 1.5679839619775023e-06,
+ "loss": 0.3707,
+ "step": 7926
+ },
+ {
+ "epoch": 3.7479905437352246,
+ "grad_norm": 2.8020687103271484,
+ "learning_rate": 1.5674051362449503e-06,
+ "loss": 0.3986,
+ "step": 7927
+ },
+ {
+ "epoch": 3.7484633569739954,
+ "grad_norm": 3.1278326511383057,
+ "learning_rate": 1.56682636858099e-06,
+ "loss": 0.3899,
+ "step": 7928
+ },
+ {
+ "epoch": 3.748936170212766,
+ "grad_norm": 2.907982587814331,
+ "learning_rate": 1.5662476590216613e-06,
+ "loss": 0.3422,
+ "step": 7929
+ },
+ {
+ "epoch": 3.7494089834515365,
+ "grad_norm": 3.1246347427368164,
+ "learning_rate": 1.5656690076029962e-06,
+ "loss": 0.452,
+ "step": 7930
+ },
+ {
+ "epoch": 3.7498817966903073,
+ "grad_norm": 2.9161367416381836,
+ "learning_rate": 1.565090414361024e-06,
+ "loss": 0.3284,
+ "step": 7931
+ },
+ {
+ "epoch": 3.750354609929078,
+ "grad_norm": 2.943183422088623,
+ "learning_rate": 1.564511879331773e-06,
+ "loss": 0.3478,
+ "step": 7932
+ },
+ {
+ "epoch": 3.7508274231678485,
+ "grad_norm": 3.2308566570281982,
+ "learning_rate": 1.563933402551266e-06,
+ "loss": 0.4143,
+ "step": 7933
+ },
+ {
+ "epoch": 3.7513002364066192,
+ "grad_norm": 2.6846251487731934,
+ "learning_rate": 1.5633549840555206e-06,
+ "loss": 0.3681,
+ "step": 7934
+ },
+ {
+ "epoch": 3.75177304964539,
+ "grad_norm": 3.0995283126831055,
+ "learning_rate": 1.562776623880554e-06,
+ "loss": 0.4642,
+ "step": 7935
+ },
+ {
+ "epoch": 3.752245862884161,
+ "grad_norm": 2.7406163215637207,
+ "learning_rate": 1.562198322062376e-06,
+ "loss": 0.3823,
+ "step": 7936
+ },
+ {
+ "epoch": 3.7527186761229316,
+ "grad_norm": 2.85732364654541,
+ "learning_rate": 1.5616200786369978e-06,
+ "loss": 0.3053,
+ "step": 7937
+ },
+ {
+ "epoch": 3.753191489361702,
+ "grad_norm": 2.812526226043701,
+ "learning_rate": 1.5610418936404223e-06,
+ "loss": 0.3944,
+ "step": 7938
+ },
+ {
+ "epoch": 3.753664302600473,
+ "grad_norm": 2.8886849880218506,
+ "learning_rate": 1.5604637671086499e-06,
+ "loss": 0.3936,
+ "step": 7939
+ },
+ {
+ "epoch": 3.7541371158392436,
+ "grad_norm": 2.831774950027466,
+ "learning_rate": 1.5598856990776801e-06,
+ "loss": 0.3146,
+ "step": 7940
+ },
+ {
+ "epoch": 3.754609929078014,
+ "grad_norm": 2.8853790760040283,
+ "learning_rate": 1.5593076895835052e-06,
+ "loss": 0.3286,
+ "step": 7941
+ },
+ {
+ "epoch": 3.7550827423167847,
+ "grad_norm": 3.2724483013153076,
+ "learning_rate": 1.5587297386621158e-06,
+ "loss": 0.3396,
+ "step": 7942
+ },
+ {
+ "epoch": 3.7555555555555555,
+ "grad_norm": 3.5077168941497803,
+ "learning_rate": 1.5581518463494983e-06,
+ "loss": 0.4528,
+ "step": 7943
+ },
+ {
+ "epoch": 3.7560283687943263,
+ "grad_norm": 3.031503915786743,
+ "learning_rate": 1.5575740126816346e-06,
+ "loss": 0.3803,
+ "step": 7944
+ },
+ {
+ "epoch": 3.756501182033097,
+ "grad_norm": 3.0939114093780518,
+ "learning_rate": 1.556996237694506e-06,
+ "loss": 0.3931,
+ "step": 7945
+ },
+ {
+ "epoch": 3.7569739952718675,
+ "grad_norm": 2.9404146671295166,
+ "learning_rate": 1.556418521424085e-06,
+ "loss": 0.3608,
+ "step": 7946
+ },
+ {
+ "epoch": 3.7574468085106383,
+ "grad_norm": 3.4363012313842773,
+ "learning_rate": 1.5558408639063465e-06,
+ "loss": 0.4335,
+ "step": 7947
+ },
+ {
+ "epoch": 3.757919621749409,
+ "grad_norm": 3.2819864749908447,
+ "learning_rate": 1.5552632651772575e-06,
+ "loss": 0.4147,
+ "step": 7948
+ },
+ {
+ "epoch": 3.7583924349881794,
+ "grad_norm": 2.917788505554199,
+ "learning_rate": 1.554685725272782e-06,
+ "loss": 0.3516,
+ "step": 7949
+ },
+ {
+ "epoch": 3.7588652482269502,
+ "grad_norm": 2.8425943851470947,
+ "learning_rate": 1.5541082442288818e-06,
+ "loss": 0.3596,
+ "step": 7950
+ },
+ {
+ "epoch": 3.759338061465721,
+ "grad_norm": 3.087005376815796,
+ "learning_rate": 1.5535308220815126e-06,
+ "loss": 0.3968,
+ "step": 7951
+ },
+ {
+ "epoch": 3.759810874704492,
+ "grad_norm": 2.743110179901123,
+ "learning_rate": 1.5529534588666298e-06,
+ "loss": 0.3802,
+ "step": 7952
+ },
+ {
+ "epoch": 3.7602836879432626,
+ "grad_norm": 2.914424180984497,
+ "learning_rate": 1.5523761546201825e-06,
+ "loss": 0.4055,
+ "step": 7953
+ },
+ {
+ "epoch": 3.760756501182033,
+ "grad_norm": 2.9691991806030273,
+ "learning_rate": 1.551798909378116e-06,
+ "loss": 0.3384,
+ "step": 7954
+ },
+ {
+ "epoch": 3.7612293144208038,
+ "grad_norm": 2.433657646179199,
+ "learning_rate": 1.5512217231763747e-06,
+ "loss": 0.3019,
+ "step": 7955
+ },
+ {
+ "epoch": 3.7617021276595746,
+ "grad_norm": 2.7904880046844482,
+ "learning_rate": 1.5506445960508957e-06,
+ "loss": 0.389,
+ "step": 7956
+ },
+ {
+ "epoch": 3.762174940898345,
+ "grad_norm": 2.9241607189178467,
+ "learning_rate": 1.5500675280376154e-06,
+ "loss": 0.4291,
+ "step": 7957
+ },
+ {
+ "epoch": 3.7626477541371157,
+ "grad_norm": 3.216491222381592,
+ "learning_rate": 1.549490519172465e-06,
+ "loss": 0.4065,
+ "step": 7958
+ },
+ {
+ "epoch": 3.7631205673758865,
+ "grad_norm": 2.8859689235687256,
+ "learning_rate": 1.548913569491371e-06,
+ "loss": 0.353,
+ "step": 7959
+ },
+ {
+ "epoch": 3.7635933806146573,
+ "grad_norm": 2.958773136138916,
+ "learning_rate": 1.5483366790302594e-06,
+ "loss": 0.3829,
+ "step": 7960
+ },
+ {
+ "epoch": 3.764066193853428,
+ "grad_norm": 2.868649482727051,
+ "learning_rate": 1.5477598478250505e-06,
+ "loss": 0.3591,
+ "step": 7961
+ },
+ {
+ "epoch": 3.7645390070921985,
+ "grad_norm": 2.6912996768951416,
+ "learning_rate": 1.5471830759116591e-06,
+ "loss": 0.3695,
+ "step": 7962
+ },
+ {
+ "epoch": 3.7650118203309693,
+ "grad_norm": 3.3318257331848145,
+ "learning_rate": 1.5466063633260004e-06,
+ "loss": 0.4126,
+ "step": 7963
+ },
+ {
+ "epoch": 3.76548463356974,
+ "grad_norm": 2.865525007247925,
+ "learning_rate": 1.5460297101039825e-06,
+ "loss": 0.4235,
+ "step": 7964
+ },
+ {
+ "epoch": 3.7659574468085104,
+ "grad_norm": 2.8639180660247803,
+ "learning_rate": 1.5454531162815123e-06,
+ "loss": 0.4392,
+ "step": 7965
+ },
+ {
+ "epoch": 3.766430260047281,
+ "grad_norm": 2.5752499103546143,
+ "learning_rate": 1.5448765818944902e-06,
+ "loss": 0.4113,
+ "step": 7966
+ },
+ {
+ "epoch": 3.766903073286052,
+ "grad_norm": 2.7622742652893066,
+ "learning_rate": 1.5443001069788155e-06,
+ "loss": 0.3785,
+ "step": 7967
+ },
+ {
+ "epoch": 3.767375886524823,
+ "grad_norm": 2.965579032897949,
+ "learning_rate": 1.5437236915703829e-06,
+ "loss": 0.335,
+ "step": 7968
+ },
+ {
+ "epoch": 3.7678486997635936,
+ "grad_norm": 3.0587408542633057,
+ "learning_rate": 1.5431473357050816e-06,
+ "loss": 0.4047,
+ "step": 7969
+ },
+ {
+ "epoch": 3.768321513002364,
+ "grad_norm": 3.2929413318634033,
+ "learning_rate": 1.5425710394188014e-06,
+ "loss": 0.4061,
+ "step": 7970
+ },
+ {
+ "epoch": 3.7687943262411348,
+ "grad_norm": 2.663043975830078,
+ "learning_rate": 1.541994802747424e-06,
+ "loss": 0.3478,
+ "step": 7971
+ },
+ {
+ "epoch": 3.7692671394799055,
+ "grad_norm": 3.0657591819763184,
+ "learning_rate": 1.5414186257268293e-06,
+ "loss": 0.3735,
+ "step": 7972
+ },
+ {
+ "epoch": 3.769739952718676,
+ "grad_norm": 2.963189125061035,
+ "learning_rate": 1.5408425083928939e-06,
+ "loss": 0.4743,
+ "step": 7973
+ },
+ {
+ "epoch": 3.7702127659574467,
+ "grad_norm": 3.1509387493133545,
+ "learning_rate": 1.540266450781489e-06,
+ "loss": 0.4164,
+ "step": 7974
+ },
+ {
+ "epoch": 3.7706855791962175,
+ "grad_norm": 3.4436306953430176,
+ "learning_rate": 1.539690452928485e-06,
+ "loss": 0.4583,
+ "step": 7975
+ },
+ {
+ "epoch": 3.7711583924349883,
+ "grad_norm": 3.1746156215667725,
+ "learning_rate": 1.5391145148697454e-06,
+ "loss": 0.4042,
+ "step": 7976
+ },
+ {
+ "epoch": 3.771631205673759,
+ "grad_norm": 3.531028985977173,
+ "learning_rate": 1.5385386366411304e-06,
+ "loss": 0.4304,
+ "step": 7977
+ },
+ {
+ "epoch": 3.7721040189125294,
+ "grad_norm": 2.867871046066284,
+ "learning_rate": 1.5379628182785e-06,
+ "loss": 0.4023,
+ "step": 7978
+ },
+ {
+ "epoch": 3.7725768321513002,
+ "grad_norm": 3.0504629611968994,
+ "learning_rate": 1.5373870598177051e-06,
+ "loss": 0.3785,
+ "step": 7979
+ },
+ {
+ "epoch": 3.773049645390071,
+ "grad_norm": 2.8188650608062744,
+ "learning_rate": 1.5368113612945983e-06,
+ "loss": 0.3808,
+ "step": 7980
+ },
+ {
+ "epoch": 3.7735224586288414,
+ "grad_norm": 3.0809133052825928,
+ "learning_rate": 1.5362357227450248e-06,
+ "loss": 0.3912,
+ "step": 7981
+ },
+ {
+ "epoch": 3.773995271867612,
+ "grad_norm": 3.223273277282715,
+ "learning_rate": 1.5356601442048257e-06,
+ "loss": 0.3802,
+ "step": 7982
+ },
+ {
+ "epoch": 3.774468085106383,
+ "grad_norm": 2.7513339519500732,
+ "learning_rate": 1.535084625709842e-06,
+ "loss": 0.3822,
+ "step": 7983
+ },
+ {
+ "epoch": 3.774940898345154,
+ "grad_norm": 3.085592031478882,
+ "learning_rate": 1.5345091672959074e-06,
+ "loss": 0.4348,
+ "step": 7984
+ },
+ {
+ "epoch": 3.7754137115839246,
+ "grad_norm": 3.315108299255371,
+ "learning_rate": 1.5339337689988525e-06,
+ "loss": 0.4196,
+ "step": 7985
+ },
+ {
+ "epoch": 3.775886524822695,
+ "grad_norm": 3.713372230529785,
+ "learning_rate": 1.533358430854507e-06,
+ "loss": 0.4292,
+ "step": 7986
+ },
+ {
+ "epoch": 3.7763593380614657,
+ "grad_norm": 2.7899155616760254,
+ "learning_rate": 1.532783152898692e-06,
+ "loss": 0.3874,
+ "step": 7987
+ },
+ {
+ "epoch": 3.7768321513002365,
+ "grad_norm": 2.918851852416992,
+ "learning_rate": 1.5322079351672297e-06,
+ "loss": 0.4073,
+ "step": 7988
+ },
+ {
+ "epoch": 3.777304964539007,
+ "grad_norm": 3.13395619392395,
+ "learning_rate": 1.5316327776959361e-06,
+ "loss": 0.3441,
+ "step": 7989
+ },
+ {
+ "epoch": 3.7777777777777777,
+ "grad_norm": 3.2320916652679443,
+ "learning_rate": 1.531057680520623e-06,
+ "loss": 0.372,
+ "step": 7990
+ },
+ {
+ "epoch": 3.7782505910165485,
+ "grad_norm": 3.1130621433258057,
+ "learning_rate": 1.5304826436770991e-06,
+ "loss": 0.3514,
+ "step": 7991
+ },
+ {
+ "epoch": 3.7787234042553193,
+ "grad_norm": 3.223207712173462,
+ "learning_rate": 1.5299076672011696e-06,
+ "loss": 0.44,
+ "step": 7992
+ },
+ {
+ "epoch": 3.77919621749409,
+ "grad_norm": 3.0757877826690674,
+ "learning_rate": 1.5293327511286366e-06,
+ "loss": 0.4051,
+ "step": 7993
+ },
+ {
+ "epoch": 3.7796690307328604,
+ "grad_norm": 2.936678409576416,
+ "learning_rate": 1.528757895495297e-06,
+ "loss": 0.3965,
+ "step": 7994
+ },
+ {
+ "epoch": 3.780141843971631,
+ "grad_norm": 2.993445873260498,
+ "learning_rate": 1.5281831003369435e-06,
+ "loss": 0.4222,
+ "step": 7995
+ },
+ {
+ "epoch": 3.780614657210402,
+ "grad_norm": 2.9140853881835938,
+ "learning_rate": 1.5276083656893679e-06,
+ "loss": 0.3662,
+ "step": 7996
+ },
+ {
+ "epoch": 3.7810874704491724,
+ "grad_norm": 3.2649893760681152,
+ "learning_rate": 1.5270336915883549e-06,
+ "loss": 0.4272,
+ "step": 7997
+ },
+ {
+ "epoch": 3.781560283687943,
+ "grad_norm": 3.0631372928619385,
+ "learning_rate": 1.5264590780696887e-06,
+ "loss": 0.4111,
+ "step": 7998
+ },
+ {
+ "epoch": 3.782033096926714,
+ "grad_norm": 2.791299343109131,
+ "learning_rate": 1.5258845251691463e-06,
+ "loss": 0.416,
+ "step": 7999
+ },
+ {
+ "epoch": 3.7825059101654848,
+ "grad_norm": 3.262294054031372,
+ "learning_rate": 1.5253100329225023e-06,
+ "loss": 0.4236,
+ "step": 8000
+ },
+ {
+ "epoch": 3.7829787234042556,
+ "grad_norm": 2.574486017227173,
+ "learning_rate": 1.5247356013655295e-06,
+ "loss": 0.4089,
+ "step": 8001
+ },
+ {
+ "epoch": 3.783451536643026,
+ "grad_norm": 3.1566531658172607,
+ "learning_rate": 1.5241612305339936e-06,
+ "loss": 0.3955,
+ "step": 8002
+ },
+ {
+ "epoch": 3.7839243498817967,
+ "grad_norm": 2.5845813751220703,
+ "learning_rate": 1.5235869204636602e-06,
+ "loss": 0.3672,
+ "step": 8003
+ },
+ {
+ "epoch": 3.7843971631205675,
+ "grad_norm": 2.877570629119873,
+ "learning_rate": 1.5230126711902876e-06,
+ "loss": 0.3919,
+ "step": 8004
+ },
+ {
+ "epoch": 3.784869976359338,
+ "grad_norm": 3.183061122894287,
+ "learning_rate": 1.5224384827496314e-06,
+ "loss": 0.3291,
+ "step": 8005
+ },
+ {
+ "epoch": 3.7853427895981087,
+ "grad_norm": 3.0778391361236572,
+ "learning_rate": 1.5218643551774451e-06,
+ "loss": 0.3571,
+ "step": 8006
+ },
+ {
+ "epoch": 3.7858156028368795,
+ "grad_norm": 3.2364399433135986,
+ "learning_rate": 1.5212902885094762e-06,
+ "loss": 0.4045,
+ "step": 8007
+ },
+ {
+ "epoch": 3.7862884160756503,
+ "grad_norm": 3.0571746826171875,
+ "learning_rate": 1.5207162827814687e-06,
+ "loss": 0.4181,
+ "step": 8008
+ },
+ {
+ "epoch": 3.786761229314421,
+ "grad_norm": 2.7215163707733154,
+ "learning_rate": 1.5201423380291652e-06,
+ "loss": 0.3328,
+ "step": 8009
+ },
+ {
+ "epoch": 3.7872340425531914,
+ "grad_norm": 3.0521233081817627,
+ "learning_rate": 1.5195684542883007e-06,
+ "loss": 0.4072,
+ "step": 8010
+ },
+ {
+ "epoch": 3.787706855791962,
+ "grad_norm": 2.541666269302368,
+ "learning_rate": 1.5189946315946104e-06,
+ "loss": 0.3293,
+ "step": 8011
+ },
+ {
+ "epoch": 3.788179669030733,
+ "grad_norm": 3.0041720867156982,
+ "learning_rate": 1.5184208699838232e-06,
+ "loss": 0.3998,
+ "step": 8012
+ },
+ {
+ "epoch": 3.7886524822695034,
+ "grad_norm": 3.0763001441955566,
+ "learning_rate": 1.5178471694916635e-06,
+ "loss": 0.38,
+ "step": 8013
+ },
+ {
+ "epoch": 3.789125295508274,
+ "grad_norm": 3.0788497924804688,
+ "learning_rate": 1.5172735301538544e-06,
+ "loss": 0.3986,
+ "step": 8014
+ },
+ {
+ "epoch": 3.789598108747045,
+ "grad_norm": 2.830225944519043,
+ "learning_rate": 1.5166999520061127e-06,
+ "loss": 0.3977,
+ "step": 8015
+ },
+ {
+ "epoch": 3.7900709219858157,
+ "grad_norm": 3.196078062057495,
+ "learning_rate": 1.5161264350841543e-06,
+ "loss": 0.4058,
+ "step": 8016
+ },
+ {
+ "epoch": 3.7905437352245865,
+ "grad_norm": 9.898200988769531,
+ "learning_rate": 1.5155529794236884e-06,
+ "loss": 0.3451,
+ "step": 8017
+ },
+ {
+ "epoch": 3.791016548463357,
+ "grad_norm": 3.0028066635131836,
+ "learning_rate": 1.514979585060421e-06,
+ "loss": 0.4029,
+ "step": 8018
+ },
+ {
+ "epoch": 3.7914893617021277,
+ "grad_norm": 2.984926223754883,
+ "learning_rate": 1.5144062520300562e-06,
+ "loss": 0.3995,
+ "step": 8019
+ },
+ {
+ "epoch": 3.7919621749408985,
+ "grad_norm": 2.938596487045288,
+ "learning_rate": 1.5138329803682925e-06,
+ "loss": 0.386,
+ "step": 8020
+ },
+ {
+ "epoch": 3.792434988179669,
+ "grad_norm": 2.992565393447876,
+ "learning_rate": 1.513259770110825e-06,
+ "loss": 0.3919,
+ "step": 8021
+ },
+ {
+ "epoch": 3.7929078014184396,
+ "grad_norm": 3.0182361602783203,
+ "learning_rate": 1.5126866212933453e-06,
+ "loss": 0.3506,
+ "step": 8022
+ },
+ {
+ "epoch": 3.7933806146572104,
+ "grad_norm": 3.2039108276367188,
+ "learning_rate": 1.5121135339515392e-06,
+ "loss": 0.3807,
+ "step": 8023
+ },
+ {
+ "epoch": 3.7938534278959812,
+ "grad_norm": 2.9290878772735596,
+ "learning_rate": 1.5115405081210927e-06,
+ "loss": 0.3596,
+ "step": 8024
+ },
+ {
+ "epoch": 3.794326241134752,
+ "grad_norm": 3.106152057647705,
+ "learning_rate": 1.510967543837683e-06,
+ "loss": 0.3703,
+ "step": 8025
+ },
+ {
+ "epoch": 3.7947990543735224,
+ "grad_norm": 2.9752190113067627,
+ "learning_rate": 1.510394641136989e-06,
+ "loss": 0.4049,
+ "step": 8026
+ },
+ {
+ "epoch": 3.795271867612293,
+ "grad_norm": 2.996206283569336,
+ "learning_rate": 1.5098218000546815e-06,
+ "loss": 0.4286,
+ "step": 8027
+ },
+ {
+ "epoch": 3.795744680851064,
+ "grad_norm": 2.9403493404388428,
+ "learning_rate": 1.5092490206264281e-06,
+ "loss": 0.3628,
+ "step": 8028
+ },
+ {
+ "epoch": 3.7962174940898343,
+ "grad_norm": 2.8101110458374023,
+ "learning_rate": 1.5086763028878943e-06,
+ "loss": 0.4016,
+ "step": 8029
+ },
+ {
+ "epoch": 3.796690307328605,
+ "grad_norm": 3.162264108657837,
+ "learning_rate": 1.5081036468747401e-06,
+ "loss": 0.4133,
+ "step": 8030
+ },
+ {
+ "epoch": 3.797163120567376,
+ "grad_norm": 2.6871988773345947,
+ "learning_rate": 1.5075310526226223e-06,
+ "loss": 0.3748,
+ "step": 8031
+ },
+ {
+ "epoch": 3.7976359338061467,
+ "grad_norm": 2.997924327850342,
+ "learning_rate": 1.5069585201671944e-06,
+ "loss": 0.4083,
+ "step": 8032
+ },
+ {
+ "epoch": 3.7981087470449175,
+ "grad_norm": 2.8266279697418213,
+ "learning_rate": 1.506386049544104e-06,
+ "loss": 0.4488,
+ "step": 8033
+ },
+ {
+ "epoch": 3.798581560283688,
+ "grad_norm": 2.7106378078460693,
+ "learning_rate": 1.5058136407889985e-06,
+ "loss": 0.363,
+ "step": 8034
+ },
+ {
+ "epoch": 3.7990543735224587,
+ "grad_norm": 2.8983304500579834,
+ "learning_rate": 1.5052412939375183e-06,
+ "loss": 0.4156,
+ "step": 8035
+ },
+ {
+ "epoch": 3.7995271867612295,
+ "grad_norm": 3.0333914756774902,
+ "learning_rate": 1.5046690090253001e-06,
+ "loss": 0.3694,
+ "step": 8036
+ },
+ {
+ "epoch": 3.8,
+ "grad_norm": 2.872662305831909,
+ "learning_rate": 1.5040967860879785e-06,
+ "loss": 0.3492,
+ "step": 8037
+ },
+ {
+ "epoch": 3.8004728132387706,
+ "grad_norm": 2.7279646396636963,
+ "learning_rate": 1.5035246251611835e-06,
+ "loss": 0.327,
+ "step": 8038
+ },
+ {
+ "epoch": 3.8009456264775414,
+ "grad_norm": 2.969326972961426,
+ "learning_rate": 1.5029525262805405e-06,
+ "loss": 0.3977,
+ "step": 8039
+ },
+ {
+ "epoch": 3.801418439716312,
+ "grad_norm": 3.073899745941162,
+ "learning_rate": 1.5023804894816723e-06,
+ "loss": 0.388,
+ "step": 8040
+ },
+ {
+ "epoch": 3.801891252955083,
+ "grad_norm": 3.026284694671631,
+ "learning_rate": 1.5018085148001953e-06,
+ "loss": 0.3761,
+ "step": 8041
+ },
+ {
+ "epoch": 3.8023640661938534,
+ "grad_norm": 3.0478618144989014,
+ "learning_rate": 1.5012366022717262e-06,
+ "loss": 0.4415,
+ "step": 8042
+ },
+ {
+ "epoch": 3.802836879432624,
+ "grad_norm": 2.801584005355835,
+ "learning_rate": 1.500664751931874e-06,
+ "loss": 0.4079,
+ "step": 8043
+ },
+ {
+ "epoch": 3.803309692671395,
+ "grad_norm": 3.4839112758636475,
+ "learning_rate": 1.5000929638162459e-06,
+ "loss": 0.4391,
+ "step": 8044
+ },
+ {
+ "epoch": 3.8037825059101653,
+ "grad_norm": 2.6945605278015137,
+ "learning_rate": 1.4995212379604446e-06,
+ "loss": 0.3564,
+ "step": 8045
+ },
+ {
+ "epoch": 3.804255319148936,
+ "grad_norm": 3.0870234966278076,
+ "learning_rate": 1.4989495744000687e-06,
+ "loss": 0.3801,
+ "step": 8046
+ },
+ {
+ "epoch": 3.804728132387707,
+ "grad_norm": 2.975332021713257,
+ "learning_rate": 1.4983779731707135e-06,
+ "loss": 0.3408,
+ "step": 8047
+ },
+ {
+ "epoch": 3.8052009456264777,
+ "grad_norm": 2.9920027256011963,
+ "learning_rate": 1.497806434307969e-06,
+ "loss": 0.3875,
+ "step": 8048
+ },
+ {
+ "epoch": 3.8056737588652485,
+ "grad_norm": 3.1974916458129883,
+ "learning_rate": 1.4972349578474244e-06,
+ "loss": 0.4492,
+ "step": 8049
+ },
+ {
+ "epoch": 3.806146572104019,
+ "grad_norm": 2.839503526687622,
+ "learning_rate": 1.4966635438246622e-06,
+ "loss": 0.3785,
+ "step": 8050
+ },
+ {
+ "epoch": 3.8066193853427897,
+ "grad_norm": 3.274502992630005,
+ "learning_rate": 1.4960921922752603e-06,
+ "loss": 0.4404,
+ "step": 8051
+ },
+ {
+ "epoch": 3.8070921985815604,
+ "grad_norm": 3.0852737426757812,
+ "learning_rate": 1.4955209032347967e-06,
+ "loss": 0.4047,
+ "step": 8052
+ },
+ {
+ "epoch": 3.807565011820331,
+ "grad_norm": 2.9251608848571777,
+ "learning_rate": 1.4949496767388417e-06,
+ "loss": 0.3654,
+ "step": 8053
+ },
+ {
+ "epoch": 3.8080378250591016,
+ "grad_norm": 2.518220901489258,
+ "learning_rate": 1.4943785128229635e-06,
+ "loss": 0.3157,
+ "step": 8054
+ },
+ {
+ "epoch": 3.8085106382978724,
+ "grad_norm": 3.3993279933929443,
+ "learning_rate": 1.4938074115227257e-06,
+ "loss": 0.4204,
+ "step": 8055
+ },
+ {
+ "epoch": 3.808983451536643,
+ "grad_norm": 3.2847096920013428,
+ "learning_rate": 1.4932363728736876e-06,
+ "loss": 0.339,
+ "step": 8056
+ },
+ {
+ "epoch": 3.8094562647754135,
+ "grad_norm": 2.7779417037963867,
+ "learning_rate": 1.492665396911407e-06,
+ "loss": 0.3538,
+ "step": 8057
+ },
+ {
+ "epoch": 3.8099290780141843,
+ "grad_norm": 2.958131790161133,
+ "learning_rate": 1.4920944836714353e-06,
+ "loss": 0.363,
+ "step": 8058
+ },
+ {
+ "epoch": 3.810401891252955,
+ "grad_norm": 3.1873440742492676,
+ "learning_rate": 1.491523633189319e-06,
+ "loss": 0.3785,
+ "step": 8059
+ },
+ {
+ "epoch": 3.8108747044917255,
+ "grad_norm": 3.132652759552002,
+ "learning_rate": 1.4909528455006055e-06,
+ "loss": 0.375,
+ "step": 8060
+ },
+ {
+ "epoch": 3.8113475177304963,
+ "grad_norm": 2.8598761558532715,
+ "learning_rate": 1.490382120640833e-06,
+ "loss": 0.4152,
+ "step": 8061
+ },
+ {
+ "epoch": 3.811820330969267,
+ "grad_norm": 3.115870952606201,
+ "learning_rate": 1.4898114586455399e-06,
+ "loss": 0.4609,
+ "step": 8062
+ },
+ {
+ "epoch": 3.812293144208038,
+ "grad_norm": 3.347944974899292,
+ "learning_rate": 1.4892408595502571e-06,
+ "loss": 0.3836,
+ "step": 8063
+ },
+ {
+ "epoch": 3.8127659574468087,
+ "grad_norm": 3.1747031211853027,
+ "learning_rate": 1.4886703233905132e-06,
+ "loss": 0.374,
+ "step": 8064
+ },
+ {
+ "epoch": 3.813238770685579,
+ "grad_norm": 2.945139169692993,
+ "learning_rate": 1.4880998502018345e-06,
+ "loss": 0.3652,
+ "step": 8065
+ },
+ {
+ "epoch": 3.81371158392435,
+ "grad_norm": 2.8911492824554443,
+ "learning_rate": 1.4875294400197403e-06,
+ "loss": 0.3683,
+ "step": 8066
+ },
+ {
+ "epoch": 3.8141843971631206,
+ "grad_norm": 3.080268383026123,
+ "learning_rate": 1.4869590928797491e-06,
+ "loss": 0.3919,
+ "step": 8067
+ },
+ {
+ "epoch": 3.814657210401891,
+ "grad_norm": 3.0834288597106934,
+ "learning_rate": 1.4863888088173734e-06,
+ "loss": 0.3988,
+ "step": 8068
+ },
+ {
+ "epoch": 3.815130023640662,
+ "grad_norm": 2.765702724456787,
+ "learning_rate": 1.4858185878681213e-06,
+ "loss": 0.3659,
+ "step": 8069
+ },
+ {
+ "epoch": 3.8156028368794326,
+ "grad_norm": 3.074059247970581,
+ "learning_rate": 1.4852484300674993e-06,
+ "loss": 0.3888,
+ "step": 8070
+ },
+ {
+ "epoch": 3.8160756501182034,
+ "grad_norm": 3.0009944438934326,
+ "learning_rate": 1.484678335451007e-06,
+ "loss": 0.417,
+ "step": 8071
+ },
+ {
+ "epoch": 3.816548463356974,
+ "grad_norm": 2.6661112308502197,
+ "learning_rate": 1.4841083040541438e-06,
+ "loss": 0.3544,
+ "step": 8072
+ },
+ {
+ "epoch": 3.8170212765957445,
+ "grad_norm": 2.7849514484405518,
+ "learning_rate": 1.4835383359124018e-06,
+ "loss": 0.3691,
+ "step": 8073
+ },
+ {
+ "epoch": 3.8174940898345153,
+ "grad_norm": 3.008070707321167,
+ "learning_rate": 1.4829684310612697e-06,
+ "loss": 0.4228,
+ "step": 8074
+ },
+ {
+ "epoch": 3.817966903073286,
+ "grad_norm": 2.649296998977661,
+ "learning_rate": 1.4823985895362348e-06,
+ "loss": 0.3642,
+ "step": 8075
+ },
+ {
+ "epoch": 3.8184397163120565,
+ "grad_norm": 2.6017661094665527,
+ "learning_rate": 1.4818288113727768e-06,
+ "loss": 0.3537,
+ "step": 8076
+ },
+ {
+ "epoch": 3.8189125295508273,
+ "grad_norm": 2.9071972370147705,
+ "learning_rate": 1.481259096606375e-06,
+ "loss": 0.3096,
+ "step": 8077
+ },
+ {
+ "epoch": 3.819385342789598,
+ "grad_norm": 3.0866518020629883,
+ "learning_rate": 1.4806894452725024e-06,
+ "loss": 0.4148,
+ "step": 8078
+ },
+ {
+ "epoch": 3.819858156028369,
+ "grad_norm": 3.2099499702453613,
+ "learning_rate": 1.4801198574066272e-06,
+ "loss": 0.4058,
+ "step": 8079
+ },
+ {
+ "epoch": 3.8203309692671397,
+ "grad_norm": 3.0204920768737793,
+ "learning_rate": 1.4795503330442176e-06,
+ "loss": 0.3427,
+ "step": 8080
+ },
+ {
+ "epoch": 3.82080378250591,
+ "grad_norm": 2.88667368888855,
+ "learning_rate": 1.478980872220734e-06,
+ "loss": 0.4075,
+ "step": 8081
+ },
+ {
+ "epoch": 3.821276595744681,
+ "grad_norm": 2.926673173904419,
+ "learning_rate": 1.4784114749716338e-06,
+ "loss": 0.3449,
+ "step": 8082
+ },
+ {
+ "epoch": 3.8217494089834516,
+ "grad_norm": 2.818936347961426,
+ "learning_rate": 1.4778421413323723e-06,
+ "loss": 0.3628,
+ "step": 8083
+ },
+ {
+ "epoch": 3.822222222222222,
+ "grad_norm": 2.960322380065918,
+ "learning_rate": 1.4772728713383983e-06,
+ "loss": 0.3669,
+ "step": 8084
+ },
+ {
+ "epoch": 3.8226950354609928,
+ "grad_norm": 2.940131902694702,
+ "learning_rate": 1.4767036650251584e-06,
+ "loss": 0.4357,
+ "step": 8085
+ },
+ {
+ "epoch": 3.8231678486997636,
+ "grad_norm": 2.9251785278320312,
+ "learning_rate": 1.4761345224280943e-06,
+ "loss": 0.4046,
+ "step": 8086
+ },
+ {
+ "epoch": 3.8236406619385344,
+ "grad_norm": 3.115590810775757,
+ "learning_rate": 1.475565443582643e-06,
+ "loss": 0.3712,
+ "step": 8087
+ },
+ {
+ "epoch": 3.824113475177305,
+ "grad_norm": 2.5968618392944336,
+ "learning_rate": 1.4749964285242408e-06,
+ "loss": 0.3432,
+ "step": 8088
+ },
+ {
+ "epoch": 3.8245862884160755,
+ "grad_norm": 3.195409059524536,
+ "learning_rate": 1.4744274772883148e-06,
+ "loss": 0.3717,
+ "step": 8089
+ },
+ {
+ "epoch": 3.8250591016548463,
+ "grad_norm": 2.8658018112182617,
+ "learning_rate": 1.4738585899102942e-06,
+ "loss": 0.3807,
+ "step": 8090
+ },
+ {
+ "epoch": 3.825531914893617,
+ "grad_norm": 2.9005510807037354,
+ "learning_rate": 1.4732897664255998e-06,
+ "loss": 0.3988,
+ "step": 8091
+ },
+ {
+ "epoch": 3.8260047281323875,
+ "grad_norm": 3.9155731201171875,
+ "learning_rate": 1.472721006869649e-06,
+ "loss": 0.3981,
+ "step": 8092
+ },
+ {
+ "epoch": 3.8264775413711583,
+ "grad_norm": 2.89312744140625,
+ "learning_rate": 1.4721523112778575e-06,
+ "loss": 0.3286,
+ "step": 8093
+ },
+ {
+ "epoch": 3.826950354609929,
+ "grad_norm": 3.006071090698242,
+ "learning_rate": 1.4715836796856332e-06,
+ "loss": 0.3901,
+ "step": 8094
+ },
+ {
+ "epoch": 3.8274231678487,
+ "grad_norm": 3.083411693572998,
+ "learning_rate": 1.4710151121283845e-06,
+ "loss": 0.3741,
+ "step": 8095
+ },
+ {
+ "epoch": 3.8278959810874706,
+ "grad_norm": 2.864989995956421,
+ "learning_rate": 1.4704466086415131e-06,
+ "loss": 0.3887,
+ "step": 8096
+ },
+ {
+ "epoch": 3.828368794326241,
+ "grad_norm": 2.4846417903900146,
+ "learning_rate": 1.4698781692604158e-06,
+ "loss": 0.33,
+ "step": 8097
+ },
+ {
+ "epoch": 3.828841607565012,
+ "grad_norm": 3.2497007846832275,
+ "learning_rate": 1.4693097940204893e-06,
+ "loss": 0.4011,
+ "step": 8098
+ },
+ {
+ "epoch": 3.8293144208037826,
+ "grad_norm": 3.0079777240753174,
+ "learning_rate": 1.4687414829571218e-06,
+ "loss": 0.4263,
+ "step": 8099
+ },
+ {
+ "epoch": 3.829787234042553,
+ "grad_norm": 2.8538410663604736,
+ "learning_rate": 1.4681732361057005e-06,
+ "loss": 0.3651,
+ "step": 8100
+ },
+ {
+ "epoch": 3.8302600472813237,
+ "grad_norm": 3.238163948059082,
+ "learning_rate": 1.4676050535016076e-06,
+ "loss": 0.392,
+ "step": 8101
+ },
+ {
+ "epoch": 3.8307328605200945,
+ "grad_norm": 2.9991304874420166,
+ "learning_rate": 1.46703693518022e-06,
+ "loss": 0.3643,
+ "step": 8102
+ },
+ {
+ "epoch": 3.8312056737588653,
+ "grad_norm": 2.9816839694976807,
+ "learning_rate": 1.466468881176914e-06,
+ "loss": 0.3803,
+ "step": 8103
+ },
+ {
+ "epoch": 3.831678486997636,
+ "grad_norm": 3.2009265422821045,
+ "learning_rate": 1.465900891527059e-06,
+ "loss": 0.3828,
+ "step": 8104
+ },
+ {
+ "epoch": 3.8321513002364065,
+ "grad_norm": 2.9479124546051025,
+ "learning_rate": 1.4653329662660201e-06,
+ "loss": 0.3683,
+ "step": 8105
+ },
+ {
+ "epoch": 3.8326241134751773,
+ "grad_norm": 2.938507080078125,
+ "learning_rate": 1.4647651054291614e-06,
+ "loss": 0.3703,
+ "step": 8106
+ },
+ {
+ "epoch": 3.833096926713948,
+ "grad_norm": 2.7777645587921143,
+ "learning_rate": 1.4641973090518397e-06,
+ "loss": 0.3982,
+ "step": 8107
+ },
+ {
+ "epoch": 3.8335697399527184,
+ "grad_norm": 3.2470149993896484,
+ "learning_rate": 1.4636295771694099e-06,
+ "loss": 0.3748,
+ "step": 8108
+ },
+ {
+ "epoch": 3.8340425531914892,
+ "grad_norm": 2.869310140609741,
+ "learning_rate": 1.4630619098172223e-06,
+ "loss": 0.3577,
+ "step": 8109
+ },
+ {
+ "epoch": 3.83451536643026,
+ "grad_norm": 3.1245369911193848,
+ "learning_rate": 1.4624943070306225e-06,
+ "loss": 0.4518,
+ "step": 8110
+ },
+ {
+ "epoch": 3.834988179669031,
+ "grad_norm": 3.0390701293945312,
+ "learning_rate": 1.4619267688449529e-06,
+ "loss": 0.5051,
+ "step": 8111
+ },
+ {
+ "epoch": 3.8354609929078016,
+ "grad_norm": 2.929943799972534,
+ "learning_rate": 1.4613592952955507e-06,
+ "loss": 0.4207,
+ "step": 8112
+ },
+ {
+ "epoch": 3.835933806146572,
+ "grad_norm": 3.17008376121521,
+ "learning_rate": 1.4607918864177523e-06,
+ "loss": 0.3836,
+ "step": 8113
+ },
+ {
+ "epoch": 3.8364066193853428,
+ "grad_norm": 3.0689237117767334,
+ "learning_rate": 1.460224542246886e-06,
+ "loss": 0.3413,
+ "step": 8114
+ },
+ {
+ "epoch": 3.8368794326241136,
+ "grad_norm": 2.9966423511505127,
+ "learning_rate": 1.4596572628182774e-06,
+ "loss": 0.4367,
+ "step": 8115
+ },
+ {
+ "epoch": 3.837352245862884,
+ "grad_norm": 3.0572052001953125,
+ "learning_rate": 1.45909004816725e-06,
+ "loss": 0.4089,
+ "step": 8116
+ },
+ {
+ "epoch": 3.8378250591016547,
+ "grad_norm": 2.911263942718506,
+ "learning_rate": 1.4585228983291203e-06,
+ "loss": 0.3848,
+ "step": 8117
+ },
+ {
+ "epoch": 3.8382978723404255,
+ "grad_norm": 2.9233853816986084,
+ "learning_rate": 1.4579558133392038e-06,
+ "loss": 0.4012,
+ "step": 8118
+ },
+ {
+ "epoch": 3.8387706855791963,
+ "grad_norm": 2.7813868522644043,
+ "learning_rate": 1.4573887932328097e-06,
+ "loss": 0.3898,
+ "step": 8119
+ },
+ {
+ "epoch": 3.839243498817967,
+ "grad_norm": 2.8727006912231445,
+ "learning_rate": 1.4568218380452436e-06,
+ "loss": 0.3965,
+ "step": 8120
+ },
+ {
+ "epoch": 3.8397163120567375,
+ "grad_norm": 3.0381174087524414,
+ "learning_rate": 1.4562549478118077e-06,
+ "loss": 0.4304,
+ "step": 8121
+ },
+ {
+ "epoch": 3.8401891252955083,
+ "grad_norm": 2.7406346797943115,
+ "learning_rate": 1.4556881225677982e-06,
+ "loss": 0.3636,
+ "step": 8122
+ },
+ {
+ "epoch": 3.840661938534279,
+ "grad_norm": 3.3900108337402344,
+ "learning_rate": 1.4551213623485111e-06,
+ "loss": 0.3863,
+ "step": 8123
+ },
+ {
+ "epoch": 3.8411347517730494,
+ "grad_norm": 2.885150909423828,
+ "learning_rate": 1.4545546671892354e-06,
+ "loss": 0.3679,
+ "step": 8124
+ },
+ {
+ "epoch": 3.84160756501182,
+ "grad_norm": 3.3361690044403076,
+ "learning_rate": 1.4539880371252555e-06,
+ "loss": 0.4333,
+ "step": 8125
+ },
+ {
+ "epoch": 3.842080378250591,
+ "grad_norm": 3.1547763347625732,
+ "learning_rate": 1.4534214721918545e-06,
+ "loss": 0.4477,
+ "step": 8126
+ },
+ {
+ "epoch": 3.842553191489362,
+ "grad_norm": 3.0337510108947754,
+ "learning_rate": 1.4528549724243095e-06,
+ "loss": 0.3647,
+ "step": 8127
+ },
+ {
+ "epoch": 3.8430260047281326,
+ "grad_norm": 2.8390069007873535,
+ "learning_rate": 1.452288537857893e-06,
+ "loss": 0.3698,
+ "step": 8128
+ },
+ {
+ "epoch": 3.843498817966903,
+ "grad_norm": 2.857513427734375,
+ "learning_rate": 1.451722168527876e-06,
+ "loss": 0.3842,
+ "step": 8129
+ },
+ {
+ "epoch": 3.8439716312056738,
+ "grad_norm": 3.015320062637329,
+ "learning_rate": 1.451155864469522e-06,
+ "loss": 0.4058,
+ "step": 8130
+ },
+ {
+ "epoch": 3.8444444444444446,
+ "grad_norm": 2.923957347869873,
+ "learning_rate": 1.450589625718094e-06,
+ "loss": 0.3976,
+ "step": 8131
+ },
+ {
+ "epoch": 3.844917257683215,
+ "grad_norm": 3.332338571548462,
+ "learning_rate": 1.4500234523088492e-06,
+ "loss": 0.4118,
+ "step": 8132
+ },
+ {
+ "epoch": 3.8453900709219857,
+ "grad_norm": 3.0403711795806885,
+ "learning_rate": 1.4494573442770381e-06,
+ "loss": 0.3715,
+ "step": 8133
+ },
+ {
+ "epoch": 3.8458628841607565,
+ "grad_norm": 3.2310287952423096,
+ "learning_rate": 1.4488913016579135e-06,
+ "loss": 0.4587,
+ "step": 8134
+ },
+ {
+ "epoch": 3.8463356973995273,
+ "grad_norm": 3.091282844543457,
+ "learning_rate": 1.448325324486718e-06,
+ "loss": 0.4234,
+ "step": 8135
+ },
+ {
+ "epoch": 3.846808510638298,
+ "grad_norm": 3.11161208152771,
+ "learning_rate": 1.4477594127986933e-06,
+ "loss": 0.4176,
+ "step": 8136
+ },
+ {
+ "epoch": 3.8472813238770684,
+ "grad_norm": 3.21042537689209,
+ "learning_rate": 1.4471935666290751e-06,
+ "loss": 0.4326,
+ "step": 8137
+ },
+ {
+ "epoch": 3.8477541371158392,
+ "grad_norm": 3.411543846130371,
+ "learning_rate": 1.4466277860130981e-06,
+ "loss": 0.4525,
+ "step": 8138
+ },
+ {
+ "epoch": 3.84822695035461,
+ "grad_norm": 3.0475308895111084,
+ "learning_rate": 1.4460620709859898e-06,
+ "loss": 0.3906,
+ "step": 8139
+ },
+ {
+ "epoch": 3.8486997635933804,
+ "grad_norm": 2.989367723464966,
+ "learning_rate": 1.4454964215829742e-06,
+ "loss": 0.3732,
+ "step": 8140
+ },
+ {
+ "epoch": 3.849172576832151,
+ "grad_norm": 2.8130393028259277,
+ "learning_rate": 1.4449308378392734e-06,
+ "loss": 0.3733,
+ "step": 8141
+ },
+ {
+ "epoch": 3.849645390070922,
+ "grad_norm": 12.2243013381958,
+ "learning_rate": 1.444365319790103e-06,
+ "loss": 0.3506,
+ "step": 8142
+ },
+ {
+ "epoch": 3.850118203309693,
+ "grad_norm": 3.075556516647339,
+ "learning_rate": 1.4437998674706743e-06,
+ "loss": 0.376,
+ "step": 8143
+ },
+ {
+ "epoch": 3.8505910165484636,
+ "grad_norm": 2.765650510787964,
+ "learning_rate": 1.4432344809161974e-06,
+ "loss": 0.3865,
+ "step": 8144
+ },
+ {
+ "epoch": 3.851063829787234,
+ "grad_norm": 3.171588897705078,
+ "learning_rate": 1.4426691601618747e-06,
+ "loss": 0.4391,
+ "step": 8145
+ },
+ {
+ "epoch": 3.8515366430260047,
+ "grad_norm": 2.8378992080688477,
+ "learning_rate": 1.4421039052429083e-06,
+ "loss": 0.3984,
+ "step": 8146
+ },
+ {
+ "epoch": 3.8520094562647755,
+ "grad_norm": 2.6588387489318848,
+ "learning_rate": 1.4415387161944929e-06,
+ "loss": 0.3961,
+ "step": 8147
+ },
+ {
+ "epoch": 3.852482269503546,
+ "grad_norm": 2.919325351715088,
+ "learning_rate": 1.4409735930518197e-06,
+ "loss": 0.4058,
+ "step": 8148
+ },
+ {
+ "epoch": 3.8529550827423167,
+ "grad_norm": 3.2239115238189697,
+ "learning_rate": 1.4404085358500778e-06,
+ "loss": 0.4018,
+ "step": 8149
+ },
+ {
+ "epoch": 3.8534278959810875,
+ "grad_norm": 3.2509875297546387,
+ "learning_rate": 1.4398435446244502e-06,
+ "loss": 0.4078,
+ "step": 8150
+ },
+ {
+ "epoch": 3.8539007092198583,
+ "grad_norm": 3.124782085418701,
+ "learning_rate": 1.4392786194101155e-06,
+ "loss": 0.4459,
+ "step": 8151
+ },
+ {
+ "epoch": 3.854373522458629,
+ "grad_norm": 2.924095392227173,
+ "learning_rate": 1.4387137602422512e-06,
+ "loss": 0.3686,
+ "step": 8152
+ },
+ {
+ "epoch": 3.8548463356973994,
+ "grad_norm": 2.9307191371917725,
+ "learning_rate": 1.4381489671560272e-06,
+ "loss": 0.4345,
+ "step": 8153
+ },
+ {
+ "epoch": 3.8553191489361702,
+ "grad_norm": 2.868488073348999,
+ "learning_rate": 1.4375842401866113e-06,
+ "loss": 0.366,
+ "step": 8154
+ },
+ {
+ "epoch": 3.855791962174941,
+ "grad_norm": 2.9893085956573486,
+ "learning_rate": 1.4370195793691661e-06,
+ "loss": 0.3401,
+ "step": 8155
+ },
+ {
+ "epoch": 3.8562647754137114,
+ "grad_norm": 3.0113472938537598,
+ "learning_rate": 1.4364549847388492e-06,
+ "loss": 0.4051,
+ "step": 8156
+ },
+ {
+ "epoch": 3.856737588652482,
+ "grad_norm": 3.4693121910095215,
+ "learning_rate": 1.4358904563308184e-06,
+ "loss": 0.4505,
+ "step": 8157
+ },
+ {
+ "epoch": 3.857210401891253,
+ "grad_norm": 2.9048118591308594,
+ "learning_rate": 1.4353259941802216e-06,
+ "loss": 0.3973,
+ "step": 8158
+ },
+ {
+ "epoch": 3.8576832151300238,
+ "grad_norm": 3.264910936355591,
+ "learning_rate": 1.434761598322208e-06,
+ "loss": 0.4317,
+ "step": 8159
+ },
+ {
+ "epoch": 3.8581560283687946,
+ "grad_norm": 2.973742723464966,
+ "learning_rate": 1.4341972687919186e-06,
+ "loss": 0.3896,
+ "step": 8160
+ },
+ {
+ "epoch": 3.858628841607565,
+ "grad_norm": 2.7802605628967285,
+ "learning_rate": 1.4336330056244906e-06,
+ "loss": 0.4063,
+ "step": 8161
+ },
+ {
+ "epoch": 3.8591016548463357,
+ "grad_norm": 3.1401731967926025,
+ "learning_rate": 1.433068808855061e-06,
+ "loss": 0.4068,
+ "step": 8162
+ },
+ {
+ "epoch": 3.8595744680851065,
+ "grad_norm": 3.132723331451416,
+ "learning_rate": 1.432504678518757e-06,
+ "loss": 0.4724,
+ "step": 8163
+ },
+ {
+ "epoch": 3.860047281323877,
+ "grad_norm": 2.94944167137146,
+ "learning_rate": 1.4319406146507068e-06,
+ "loss": 0.3666,
+ "step": 8164
+ },
+ {
+ "epoch": 3.8605200945626477,
+ "grad_norm": 2.972322463989258,
+ "learning_rate": 1.4313766172860311e-06,
+ "loss": 0.4226,
+ "step": 8165
+ },
+ {
+ "epoch": 3.8609929078014185,
+ "grad_norm": 2.9808123111724854,
+ "learning_rate": 1.430812686459847e-06,
+ "loss": 0.4079,
+ "step": 8166
+ },
+ {
+ "epoch": 3.8614657210401893,
+ "grad_norm": 2.9656291007995605,
+ "learning_rate": 1.4302488222072698e-06,
+ "loss": 0.3423,
+ "step": 8167
+ },
+ {
+ "epoch": 3.86193853427896,
+ "grad_norm": 2.886765241622925,
+ "learning_rate": 1.4296850245634073e-06,
+ "loss": 0.3577,
+ "step": 8168
+ },
+ {
+ "epoch": 3.8624113475177304,
+ "grad_norm": 3.0613043308258057,
+ "learning_rate": 1.4291212935633653e-06,
+ "loss": 0.4121,
+ "step": 8169
+ },
+ {
+ "epoch": 3.862884160756501,
+ "grad_norm": 2.842050313949585,
+ "learning_rate": 1.4285576292422445e-06,
+ "loss": 0.373,
+ "step": 8170
+ },
+ {
+ "epoch": 3.863356973995272,
+ "grad_norm": 3.0604517459869385,
+ "learning_rate": 1.4279940316351413e-06,
+ "loss": 0.3938,
+ "step": 8171
+ },
+ {
+ "epoch": 3.8638297872340424,
+ "grad_norm": 3.9742302894592285,
+ "learning_rate": 1.42743050077715e-06,
+ "loss": 0.4463,
+ "step": 8172
+ },
+ {
+ "epoch": 3.864302600472813,
+ "grad_norm": 2.8330607414245605,
+ "learning_rate": 1.4268670367033572e-06,
+ "loss": 0.4423,
+ "step": 8173
+ },
+ {
+ "epoch": 3.864775413711584,
+ "grad_norm": 2.953256607055664,
+ "learning_rate": 1.4263036394488497e-06,
+ "loss": 0.3553,
+ "step": 8174
+ },
+ {
+ "epoch": 3.8652482269503547,
+ "grad_norm": 2.865849018096924,
+ "learning_rate": 1.4257403090487065e-06,
+ "loss": 0.3348,
+ "step": 8175
+ },
+ {
+ "epoch": 3.8657210401891255,
+ "grad_norm": 2.712502956390381,
+ "learning_rate": 1.4251770455380027e-06,
+ "loss": 0.3896,
+ "step": 8176
+ },
+ {
+ "epoch": 3.866193853427896,
+ "grad_norm": 2.798898220062256,
+ "learning_rate": 1.4246138489518123e-06,
+ "loss": 0.4275,
+ "step": 8177
+ },
+ {
+ "epoch": 3.8666666666666667,
+ "grad_norm": 2.830899953842163,
+ "learning_rate": 1.4240507193252023e-06,
+ "loss": 0.3952,
+ "step": 8178
+ },
+ {
+ "epoch": 3.8671394799054375,
+ "grad_norm": 2.5789451599121094,
+ "learning_rate": 1.4234876566932348e-06,
+ "loss": 0.3483,
+ "step": 8179
+ },
+ {
+ "epoch": 3.867612293144208,
+ "grad_norm": 2.8513095378875732,
+ "learning_rate": 1.422924661090972e-06,
+ "loss": 0.3403,
+ "step": 8180
+ },
+ {
+ "epoch": 3.8680851063829786,
+ "grad_norm": 3.5031449794769287,
+ "learning_rate": 1.4223617325534664e-06,
+ "loss": 0.3964,
+ "step": 8181
+ },
+ {
+ "epoch": 3.8685579196217494,
+ "grad_norm": 2.7495479583740234,
+ "learning_rate": 1.4217988711157715e-06,
+ "loss": 0.3376,
+ "step": 8182
+ },
+ {
+ "epoch": 3.8690307328605202,
+ "grad_norm": 2.8609421253204346,
+ "learning_rate": 1.421236076812933e-06,
+ "loss": 0.3967,
+ "step": 8183
+ },
+ {
+ "epoch": 3.869503546099291,
+ "grad_norm": 3.0624637603759766,
+ "learning_rate": 1.420673349679994e-06,
+ "loss": 0.3764,
+ "step": 8184
+ },
+ {
+ "epoch": 3.8699763593380614,
+ "grad_norm": 3.3084404468536377,
+ "learning_rate": 1.4201106897519926e-06,
+ "loss": 0.4567,
+ "step": 8185
+ },
+ {
+ "epoch": 3.870449172576832,
+ "grad_norm": 3.164116382598877,
+ "learning_rate": 1.4195480970639624e-06,
+ "loss": 0.4217,
+ "step": 8186
+ },
+ {
+ "epoch": 3.870921985815603,
+ "grad_norm": 2.971390724182129,
+ "learning_rate": 1.4189855716509355e-06,
+ "loss": 0.3981,
+ "step": 8187
+ },
+ {
+ "epoch": 3.8713947990543733,
+ "grad_norm": 3.0537233352661133,
+ "learning_rate": 1.418423113547937e-06,
+ "loss": 0.4093,
+ "step": 8188
+ },
+ {
+ "epoch": 3.871867612293144,
+ "grad_norm": 3.698120594024658,
+ "learning_rate": 1.4178607227899877e-06,
+ "loss": 0.3158,
+ "step": 8189
+ },
+ {
+ "epoch": 3.872340425531915,
+ "grad_norm": 3.0320451259613037,
+ "learning_rate": 1.417298399412107e-06,
+ "loss": 0.3903,
+ "step": 8190
+ },
+ {
+ "epoch": 3.8728132387706857,
+ "grad_norm": 2.913296699523926,
+ "learning_rate": 1.4167361434493068e-06,
+ "loss": 0.3396,
+ "step": 8191
+ },
+ {
+ "epoch": 3.8732860520094565,
+ "grad_norm": 3.011906147003174,
+ "learning_rate": 1.4161739549365976e-06,
+ "loss": 0.3915,
+ "step": 8192
+ },
+ {
+ "epoch": 3.873758865248227,
+ "grad_norm": 3.2707724571228027,
+ "learning_rate": 1.4156118339089842e-06,
+ "loss": 0.4466,
+ "step": 8193
+ },
+ {
+ "epoch": 3.8742316784869977,
+ "grad_norm": 3.036747694015503,
+ "learning_rate": 1.4150497804014656e-06,
+ "loss": 0.4095,
+ "step": 8194
+ },
+ {
+ "epoch": 3.8747044917257685,
+ "grad_norm": 2.8851394653320312,
+ "learning_rate": 1.4144877944490411e-06,
+ "loss": 0.4235,
+ "step": 8195
+ },
+ {
+ "epoch": 3.875177304964539,
+ "grad_norm": 3.099785566329956,
+ "learning_rate": 1.4139258760867008e-06,
+ "loss": 0.4102,
+ "step": 8196
+ },
+ {
+ "epoch": 3.8756501182033096,
+ "grad_norm": 3.0752081871032715,
+ "learning_rate": 1.4133640253494347e-06,
+ "loss": 0.4165,
+ "step": 8197
+ },
+ {
+ "epoch": 3.8761229314420804,
+ "grad_norm": 2.842257261276245,
+ "learning_rate": 1.412802242272226e-06,
+ "loss": 0.3573,
+ "step": 8198
+ },
+ {
+ "epoch": 3.876595744680851,
+ "grad_norm": 2.93868350982666,
+ "learning_rate": 1.4122405268900547e-06,
+ "loss": 0.36,
+ "step": 8199
+ },
+ {
+ "epoch": 3.877068557919622,
+ "grad_norm": 2.674356460571289,
+ "learning_rate": 1.411678879237896e-06,
+ "loss": 0.3763,
+ "step": 8200
+ },
+ {
+ "epoch": 3.8775413711583924,
+ "grad_norm": 2.710617780685425,
+ "learning_rate": 1.411117299350721e-06,
+ "loss": 0.358,
+ "step": 8201
+ },
+ {
+ "epoch": 3.878014184397163,
+ "grad_norm": 3.0299410820007324,
+ "learning_rate": 1.4105557872634968e-06,
+ "loss": 0.3723,
+ "step": 8202
+ },
+ {
+ "epoch": 3.878486997635934,
+ "grad_norm": 3.1951241493225098,
+ "learning_rate": 1.4099943430111874e-06,
+ "loss": 0.4163,
+ "step": 8203
+ },
+ {
+ "epoch": 3.8789598108747043,
+ "grad_norm": 2.752410411834717,
+ "learning_rate": 1.4094329666287495e-06,
+ "loss": 0.3753,
+ "step": 8204
+ },
+ {
+ "epoch": 3.879432624113475,
+ "grad_norm": 3.1242496967315674,
+ "learning_rate": 1.40887165815114e-06,
+ "loss": 0.3694,
+ "step": 8205
+ },
+ {
+ "epoch": 3.879905437352246,
+ "grad_norm": 5.16750431060791,
+ "learning_rate": 1.4083104176133079e-06,
+ "loss": 0.3869,
+ "step": 8206
+ },
+ {
+ "epoch": 3.8803782505910167,
+ "grad_norm": 3.2995245456695557,
+ "learning_rate": 1.4077492450501978e-06,
+ "loss": 0.4194,
+ "step": 8207
+ },
+ {
+ "epoch": 3.8808510638297875,
+ "grad_norm": 3.506807804107666,
+ "learning_rate": 1.4071881404967541e-06,
+ "loss": 0.3873,
+ "step": 8208
+ },
+ {
+ "epoch": 3.881323877068558,
+ "grad_norm": 3.1201252937316895,
+ "learning_rate": 1.4066271039879123e-06,
+ "loss": 0.3625,
+ "step": 8209
+ },
+ {
+ "epoch": 3.8817966903073287,
+ "grad_norm": 2.870683193206787,
+ "learning_rate": 1.4060661355586073e-06,
+ "loss": 0.4039,
+ "step": 8210
+ },
+ {
+ "epoch": 3.8822695035460995,
+ "grad_norm": 3.177701234817505,
+ "learning_rate": 1.405505235243767e-06,
+ "loss": 0.3715,
+ "step": 8211
+ },
+ {
+ "epoch": 3.88274231678487,
+ "grad_norm": 3.0319771766662598,
+ "learning_rate": 1.4049444030783157e-06,
+ "loss": 0.3588,
+ "step": 8212
+ },
+ {
+ "epoch": 3.8832151300236406,
+ "grad_norm": 2.4598889350891113,
+ "learning_rate": 1.404383639097176e-06,
+ "loss": 0.2788,
+ "step": 8213
+ },
+ {
+ "epoch": 3.8836879432624114,
+ "grad_norm": 2.916987419128418,
+ "learning_rate": 1.4038229433352623e-06,
+ "loss": 0.4167,
+ "step": 8214
+ },
+ {
+ "epoch": 3.884160756501182,
+ "grad_norm": 3.005075216293335,
+ "learning_rate": 1.4032623158274872e-06,
+ "loss": 0.4251,
+ "step": 8215
+ },
+ {
+ "epoch": 3.8846335697399526,
+ "grad_norm": 3.1718621253967285,
+ "learning_rate": 1.4027017566087591e-06,
+ "loss": 0.383,
+ "step": 8216
+ },
+ {
+ "epoch": 3.8851063829787233,
+ "grad_norm": 2.954662322998047,
+ "learning_rate": 1.402141265713981e-06,
+ "loss": 0.373,
+ "step": 8217
+ },
+ {
+ "epoch": 3.885579196217494,
+ "grad_norm": 3.408008337020874,
+ "learning_rate": 1.4015808431780526e-06,
+ "loss": 0.4216,
+ "step": 8218
+ },
+ {
+ "epoch": 3.8860520094562645,
+ "grad_norm": 3.1599369049072266,
+ "learning_rate": 1.4010204890358675e-06,
+ "loss": 0.4544,
+ "step": 8219
+ },
+ {
+ "epoch": 3.8865248226950353,
+ "grad_norm": 2.8919107913970947,
+ "learning_rate": 1.4004602033223186e-06,
+ "loss": 0.3785,
+ "step": 8220
+ },
+ {
+ "epoch": 3.886997635933806,
+ "grad_norm": 3.522581100463867,
+ "learning_rate": 1.3998999860722918e-06,
+ "loss": 0.4276,
+ "step": 8221
+ },
+ {
+ "epoch": 3.887470449172577,
+ "grad_norm": 2.9278945922851562,
+ "learning_rate": 1.399339837320668e-06,
+ "loss": 0.409,
+ "step": 8222
+ },
+ {
+ "epoch": 3.8879432624113477,
+ "grad_norm": 3.032557725906372,
+ "learning_rate": 1.398779757102327e-06,
+ "loss": 0.3973,
+ "step": 8223
+ },
+ {
+ "epoch": 3.888416075650118,
+ "grad_norm": 2.843118667602539,
+ "learning_rate": 1.3982197454521423e-06,
+ "loss": 0.3418,
+ "step": 8224
+ },
+ {
+ "epoch": 3.888888888888889,
+ "grad_norm": 2.8620638847351074,
+ "learning_rate": 1.3976598024049815e-06,
+ "loss": 0.3751,
+ "step": 8225
+ },
+ {
+ "epoch": 3.8893617021276596,
+ "grad_norm": 2.532327175140381,
+ "learning_rate": 1.3970999279957124e-06,
+ "loss": 0.3541,
+ "step": 8226
+ },
+ {
+ "epoch": 3.88983451536643,
+ "grad_norm": 3.1074535846710205,
+ "learning_rate": 1.3965401222591935e-06,
+ "loss": 0.4706,
+ "step": 8227
+ },
+ {
+ "epoch": 3.890307328605201,
+ "grad_norm": 3.1558735370635986,
+ "learning_rate": 1.3959803852302839e-06,
+ "loss": 0.448,
+ "step": 8228
+ },
+ {
+ "epoch": 3.8907801418439716,
+ "grad_norm": 3.0862064361572266,
+ "learning_rate": 1.3954207169438344e-06,
+ "loss": 0.3308,
+ "step": 8229
+ },
+ {
+ "epoch": 3.8912529550827424,
+ "grad_norm": 2.9246280193328857,
+ "learning_rate": 1.3948611174346927e-06,
+ "loss": 0.3771,
+ "step": 8230
+ },
+ {
+ "epoch": 3.891725768321513,
+ "grad_norm": 2.7959492206573486,
+ "learning_rate": 1.394301586737704e-06,
+ "loss": 0.4248,
+ "step": 8231
+ },
+ {
+ "epoch": 3.8921985815602835,
+ "grad_norm": 2.787670373916626,
+ "learning_rate": 1.3937421248877075e-06,
+ "loss": 0.3416,
+ "step": 8232
+ },
+ {
+ "epoch": 3.8926713947990543,
+ "grad_norm": 3.0775792598724365,
+ "learning_rate": 1.393182731919538e-06,
+ "loss": 0.4345,
+ "step": 8233
+ },
+ {
+ "epoch": 3.893144208037825,
+ "grad_norm": 2.6338887214660645,
+ "learning_rate": 1.3926234078680268e-06,
+ "loss": 0.3995,
+ "step": 8234
+ },
+ {
+ "epoch": 3.8936170212765955,
+ "grad_norm": 2.9975900650024414,
+ "learning_rate": 1.392064152767999e-06,
+ "loss": 0.3997,
+ "step": 8235
+ },
+ {
+ "epoch": 3.8940898345153663,
+ "grad_norm": 2.8615779876708984,
+ "learning_rate": 1.3915049666542791e-06,
+ "loss": 0.3687,
+ "step": 8236
+ },
+ {
+ "epoch": 3.894562647754137,
+ "grad_norm": 3.0132436752319336,
+ "learning_rate": 1.3909458495616835e-06,
+ "loss": 0.4085,
+ "step": 8237
+ },
+ {
+ "epoch": 3.895035460992908,
+ "grad_norm": 3.141291379928589,
+ "learning_rate": 1.3903868015250278e-06,
+ "loss": 0.3903,
+ "step": 8238
+ },
+ {
+ "epoch": 3.8955082742316787,
+ "grad_norm": 2.6998603343963623,
+ "learning_rate": 1.3898278225791204e-06,
+ "loss": 0.3576,
+ "step": 8239
+ },
+ {
+ "epoch": 3.895981087470449,
+ "grad_norm": 3.212578535079956,
+ "learning_rate": 1.3892689127587656e-06,
+ "loss": 0.4321,
+ "step": 8240
+ },
+ {
+ "epoch": 3.89645390070922,
+ "grad_norm": 3.15732741355896,
+ "learning_rate": 1.3887100720987662e-06,
+ "loss": 0.4247,
+ "step": 8241
+ },
+ {
+ "epoch": 3.8969267139479906,
+ "grad_norm": 2.6001040935516357,
+ "learning_rate": 1.3881513006339168e-06,
+ "loss": 0.3376,
+ "step": 8242
+ },
+ {
+ "epoch": 3.897399527186761,
+ "grad_norm": 2.766188859939575,
+ "learning_rate": 1.3875925983990113e-06,
+ "loss": 0.3771,
+ "step": 8243
+ },
+ {
+ "epoch": 3.8978723404255318,
+ "grad_norm": 2.7471580505371094,
+ "learning_rate": 1.3870339654288372e-06,
+ "loss": 0.3311,
+ "step": 8244
+ },
+ {
+ "epoch": 3.8983451536643026,
+ "grad_norm": 3.577664375305176,
+ "learning_rate": 1.3864754017581769e-06,
+ "loss": 0.3725,
+ "step": 8245
+ },
+ {
+ "epoch": 3.8988179669030734,
+ "grad_norm": 2.8747243881225586,
+ "learning_rate": 1.3859169074218116e-06,
+ "loss": 0.3706,
+ "step": 8246
+ },
+ {
+ "epoch": 3.899290780141844,
+ "grad_norm": 2.5249671936035156,
+ "learning_rate": 1.3853584824545152e-06,
+ "loss": 0.3621,
+ "step": 8247
+ },
+ {
+ "epoch": 3.8997635933806145,
+ "grad_norm": 2.7290890216827393,
+ "learning_rate": 1.3848001268910589e-06,
+ "loss": 0.3209,
+ "step": 8248
+ },
+ {
+ "epoch": 3.9002364066193853,
+ "grad_norm": 3.0917534828186035,
+ "learning_rate": 1.3842418407662084e-06,
+ "loss": 0.3904,
+ "step": 8249
+ },
+ {
+ "epoch": 3.900709219858156,
+ "grad_norm": 3.099494695663452,
+ "learning_rate": 1.383683624114725e-06,
+ "loss": 0.3714,
+ "step": 8250
+ },
+ {
+ "epoch": 3.9011820330969265,
+ "grad_norm": 3.077505588531494,
+ "learning_rate": 1.3831254769713687e-06,
+ "loss": 0.4166,
+ "step": 8251
+ },
+ {
+ "epoch": 3.9016548463356973,
+ "grad_norm": 2.9983766078948975,
+ "learning_rate": 1.3825673993708915e-06,
+ "loss": 0.3909,
+ "step": 8252
+ },
+ {
+ "epoch": 3.902127659574468,
+ "grad_norm": 2.7958667278289795,
+ "learning_rate": 1.3820093913480415e-06,
+ "loss": 0.3966,
+ "step": 8253
+ },
+ {
+ "epoch": 3.902600472813239,
+ "grad_norm": 3.0938336849212646,
+ "learning_rate": 1.3814514529375656e-06,
+ "loss": 0.4118,
+ "step": 8254
+ },
+ {
+ "epoch": 3.9030732860520096,
+ "grad_norm": 3.2711637020111084,
+ "learning_rate": 1.3808935841742016e-06,
+ "loss": 0.4021,
+ "step": 8255
+ },
+ {
+ "epoch": 3.90354609929078,
+ "grad_norm": 3.23563814163208,
+ "learning_rate": 1.3803357850926885e-06,
+ "loss": 0.3679,
+ "step": 8256
+ },
+ {
+ "epoch": 3.904018912529551,
+ "grad_norm": 2.77942156791687,
+ "learning_rate": 1.3797780557277563e-06,
+ "loss": 0.3938,
+ "step": 8257
+ },
+ {
+ "epoch": 3.9044917257683216,
+ "grad_norm": 3.1273257732391357,
+ "learning_rate": 1.3792203961141313e-06,
+ "loss": 0.3579,
+ "step": 8258
+ },
+ {
+ "epoch": 3.904964539007092,
+ "grad_norm": 3.69164776802063,
+ "learning_rate": 1.378662806286539e-06,
+ "loss": 0.3712,
+ "step": 8259
+ },
+ {
+ "epoch": 3.9054373522458627,
+ "grad_norm": 2.8818306922912598,
+ "learning_rate": 1.3781052862796957e-06,
+ "loss": 0.3972,
+ "step": 8260
+ },
+ {
+ "epoch": 3.9059101654846335,
+ "grad_norm": 2.776651382446289,
+ "learning_rate": 1.377547836128318e-06,
+ "loss": 0.3605,
+ "step": 8261
+ },
+ {
+ "epoch": 3.9063829787234043,
+ "grad_norm": 3.1498706340789795,
+ "learning_rate": 1.376990455867115e-06,
+ "loss": 0.3995,
+ "step": 8262
+ },
+ {
+ "epoch": 3.906855791962175,
+ "grad_norm": 2.777390956878662,
+ "learning_rate": 1.3764331455307916e-06,
+ "loss": 0.3463,
+ "step": 8263
+ },
+ {
+ "epoch": 3.9073286052009455,
+ "grad_norm": 2.9953835010528564,
+ "learning_rate": 1.3758759051540496e-06,
+ "loss": 0.3881,
+ "step": 8264
+ },
+ {
+ "epoch": 3.9078014184397163,
+ "grad_norm": 3.737194538116455,
+ "learning_rate": 1.375318734771585e-06,
+ "loss": 0.4456,
+ "step": 8265
+ },
+ {
+ "epoch": 3.908274231678487,
+ "grad_norm": 3.1575849056243896,
+ "learning_rate": 1.374761634418092e-06,
+ "loss": 0.3613,
+ "step": 8266
+ },
+ {
+ "epoch": 3.9087470449172574,
+ "grad_norm": 3.140662908554077,
+ "learning_rate": 1.374204604128258e-06,
+ "loss": 0.4462,
+ "step": 8267
+ },
+ {
+ "epoch": 3.9092198581560282,
+ "grad_norm": 3.2106714248657227,
+ "learning_rate": 1.3736476439367663e-06,
+ "loss": 0.3801,
+ "step": 8268
+ },
+ {
+ "epoch": 3.909692671394799,
+ "grad_norm": 2.888345956802368,
+ "learning_rate": 1.3730907538782976e-06,
+ "loss": 0.4209,
+ "step": 8269
+ },
+ {
+ "epoch": 3.91016548463357,
+ "grad_norm": 2.8903355598449707,
+ "learning_rate": 1.3725339339875252e-06,
+ "loss": 0.3612,
+ "step": 8270
+ },
+ {
+ "epoch": 3.9106382978723406,
+ "grad_norm": 3.2661736011505127,
+ "learning_rate": 1.371977184299122e-06,
+ "loss": 0.4151,
+ "step": 8271
+ },
+ {
+ "epoch": 3.911111111111111,
+ "grad_norm": 3.1532459259033203,
+ "learning_rate": 1.3714205048477535e-06,
+ "loss": 0.3706,
+ "step": 8272
+ },
+ {
+ "epoch": 3.911583924349882,
+ "grad_norm": 2.907306432723999,
+ "learning_rate": 1.3708638956680804e-06,
+ "loss": 0.4113,
+ "step": 8273
+ },
+ {
+ "epoch": 3.9120567375886526,
+ "grad_norm": 2.7301599979400635,
+ "learning_rate": 1.3703073567947622e-06,
+ "loss": 0.355,
+ "step": 8274
+ },
+ {
+ "epoch": 3.912529550827423,
+ "grad_norm": 2.595625877380371,
+ "learning_rate": 1.3697508882624516e-06,
+ "loss": 0.3733,
+ "step": 8275
+ },
+ {
+ "epoch": 3.9130023640661937,
+ "grad_norm": 2.784294366836548,
+ "learning_rate": 1.369194490105796e-06,
+ "loss": 0.3366,
+ "step": 8276
+ },
+ {
+ "epoch": 3.9134751773049645,
+ "grad_norm": 3.0179800987243652,
+ "learning_rate": 1.3686381623594419e-06,
+ "loss": 0.3922,
+ "step": 8277
+ },
+ {
+ "epoch": 3.9139479905437353,
+ "grad_norm": 2.6641111373901367,
+ "learning_rate": 1.3680819050580291e-06,
+ "loss": 0.3324,
+ "step": 8278
+ },
+ {
+ "epoch": 3.914420803782506,
+ "grad_norm": 2.917741060256958,
+ "learning_rate": 1.3675257182361923e-06,
+ "loss": 0.3784,
+ "step": 8279
+ },
+ {
+ "epoch": 3.9148936170212765,
+ "grad_norm": 2.959599018096924,
+ "learning_rate": 1.3669696019285626e-06,
+ "loss": 0.3846,
+ "step": 8280
+ },
+ {
+ "epoch": 3.9153664302600473,
+ "grad_norm": 3.078824043273926,
+ "learning_rate": 1.3664135561697683e-06,
+ "loss": 0.4357,
+ "step": 8281
+ },
+ {
+ "epoch": 3.915839243498818,
+ "grad_norm": 3.0174930095672607,
+ "learning_rate": 1.3658575809944313e-06,
+ "loss": 0.3643,
+ "step": 8282
+ },
+ {
+ "epoch": 3.9163120567375884,
+ "grad_norm": 2.6805408000946045,
+ "learning_rate": 1.365301676437169e-06,
+ "loss": 0.3193,
+ "step": 8283
+ },
+ {
+ "epoch": 3.916784869976359,
+ "grad_norm": 2.6996054649353027,
+ "learning_rate": 1.3647458425325966e-06,
+ "loss": 0.3378,
+ "step": 8284
+ },
+ {
+ "epoch": 3.91725768321513,
+ "grad_norm": 2.7950546741485596,
+ "learning_rate": 1.3641900793153223e-06,
+ "loss": 0.3864,
+ "step": 8285
+ },
+ {
+ "epoch": 3.917730496453901,
+ "grad_norm": 2.9658634662628174,
+ "learning_rate": 1.363634386819951e-06,
+ "loss": 0.3452,
+ "step": 8286
+ },
+ {
+ "epoch": 3.9182033096926716,
+ "grad_norm": 3.0684404373168945,
+ "learning_rate": 1.363078765081084e-06,
+ "loss": 0.3278,
+ "step": 8287
+ },
+ {
+ "epoch": 3.918676122931442,
+ "grad_norm": 3.0293614864349365,
+ "learning_rate": 1.3625232141333164e-06,
+ "loss": 0.3827,
+ "step": 8288
+ },
+ {
+ "epoch": 3.9191489361702128,
+ "grad_norm": 2.9969890117645264,
+ "learning_rate": 1.3619677340112413e-06,
+ "loss": 0.3412,
+ "step": 8289
+ },
+ {
+ "epoch": 3.9196217494089836,
+ "grad_norm": 2.991654396057129,
+ "learning_rate": 1.3614123247494457e-06,
+ "loss": 0.3683,
+ "step": 8290
+ },
+ {
+ "epoch": 3.920094562647754,
+ "grad_norm": 3.032158374786377,
+ "learning_rate": 1.360856986382511e-06,
+ "loss": 0.421,
+ "step": 8291
+ },
+ {
+ "epoch": 3.9205673758865247,
+ "grad_norm": 3.1413731575012207,
+ "learning_rate": 1.3603017189450173e-06,
+ "loss": 0.3818,
+ "step": 8292
+ },
+ {
+ "epoch": 3.9210401891252955,
+ "grad_norm": 3.295527219772339,
+ "learning_rate": 1.3597465224715387e-06,
+ "loss": 0.4828,
+ "step": 8293
+ },
+ {
+ "epoch": 3.9215130023640663,
+ "grad_norm": 3.116053581237793,
+ "learning_rate": 1.359191396996643e-06,
+ "loss": 0.4108,
+ "step": 8294
+ },
+ {
+ "epoch": 3.921985815602837,
+ "grad_norm": 2.957446336746216,
+ "learning_rate": 1.3586363425548975e-06,
+ "loss": 0.3482,
+ "step": 8295
+ },
+ {
+ "epoch": 3.9224586288416075,
+ "grad_norm": 2.745471715927124,
+ "learning_rate": 1.3580813591808627e-06,
+ "loss": 0.4184,
+ "step": 8296
+ },
+ {
+ "epoch": 3.9229314420803783,
+ "grad_norm": 3.0920722484588623,
+ "learning_rate": 1.3575264469090943e-06,
+ "loss": 0.3826,
+ "step": 8297
+ },
+ {
+ "epoch": 3.923404255319149,
+ "grad_norm": 2.8719749450683594,
+ "learning_rate": 1.3569716057741444e-06,
+ "loss": 0.3953,
+ "step": 8298
+ },
+ {
+ "epoch": 3.9238770685579194,
+ "grad_norm": 3.1278762817382812,
+ "learning_rate": 1.3564168358105597e-06,
+ "loss": 0.3658,
+ "step": 8299
+ },
+ {
+ "epoch": 3.92434988179669,
+ "grad_norm": 2.7752785682678223,
+ "learning_rate": 1.3558621370528851e-06,
+ "loss": 0.3447,
+ "step": 8300
+ },
+ {
+ "epoch": 3.924822695035461,
+ "grad_norm": 2.948575735092163,
+ "learning_rate": 1.3553075095356575e-06,
+ "loss": 0.3803,
+ "step": 8301
+ },
+ {
+ "epoch": 3.925295508274232,
+ "grad_norm": 2.8164193630218506,
+ "learning_rate": 1.354752953293413e-06,
+ "loss": 0.3724,
+ "step": 8302
+ },
+ {
+ "epoch": 3.9257683215130026,
+ "grad_norm": 3.2431271076202393,
+ "learning_rate": 1.3541984683606798e-06,
+ "loss": 0.382,
+ "step": 8303
+ },
+ {
+ "epoch": 3.926241134751773,
+ "grad_norm": 2.8485286235809326,
+ "learning_rate": 1.353644054771983e-06,
+ "loss": 0.3632,
+ "step": 8304
+ },
+ {
+ "epoch": 3.9267139479905437,
+ "grad_norm": 3.334914445877075,
+ "learning_rate": 1.3530897125618456e-06,
+ "loss": 0.5286,
+ "step": 8305
+ },
+ {
+ "epoch": 3.9271867612293145,
+ "grad_norm": 3.3895132541656494,
+ "learning_rate": 1.3525354417647815e-06,
+ "loss": 0.3838,
+ "step": 8306
+ },
+ {
+ "epoch": 3.927659574468085,
+ "grad_norm": 3.141935110092163,
+ "learning_rate": 1.351981242415305e-06,
+ "loss": 0.3928,
+ "step": 8307
+ },
+ {
+ "epoch": 3.9281323877068557,
+ "grad_norm": 3.3013596534729004,
+ "learning_rate": 1.3514271145479225e-06,
+ "loss": 0.4046,
+ "step": 8308
+ },
+ {
+ "epoch": 3.9286052009456265,
+ "grad_norm": 2.8704745769500732,
+ "learning_rate": 1.3508730581971363e-06,
+ "loss": 0.3542,
+ "step": 8309
+ },
+ {
+ "epoch": 3.9290780141843973,
+ "grad_norm": 3.179405689239502,
+ "learning_rate": 1.3503190733974472e-06,
+ "loss": 0.3911,
+ "step": 8310
+ },
+ {
+ "epoch": 3.929550827423168,
+ "grad_norm": 3.1091885566711426,
+ "learning_rate": 1.3497651601833481e-06,
+ "loss": 0.3552,
+ "step": 8311
+ },
+ {
+ "epoch": 3.9300236406619384,
+ "grad_norm": 2.687678813934326,
+ "learning_rate": 1.3492113185893288e-06,
+ "loss": 0.3462,
+ "step": 8312
+ },
+ {
+ "epoch": 3.9304964539007092,
+ "grad_norm": 3.4954965114593506,
+ "learning_rate": 1.3486575486498749e-06,
+ "loss": 0.4358,
+ "step": 8313
+ },
+ {
+ "epoch": 3.93096926713948,
+ "grad_norm": 2.8652899265289307,
+ "learning_rate": 1.3481038503994652e-06,
+ "loss": 0.3434,
+ "step": 8314
+ },
+ {
+ "epoch": 3.9314420803782504,
+ "grad_norm": 3.927623748779297,
+ "learning_rate": 1.3475502238725797e-06,
+ "loss": 0.4662,
+ "step": 8315
+ },
+ {
+ "epoch": 3.931914893617021,
+ "grad_norm": 3.1166276931762695,
+ "learning_rate": 1.346996669103687e-06,
+ "loss": 0.3953,
+ "step": 8316
+ },
+ {
+ "epoch": 3.932387706855792,
+ "grad_norm": 3.140003204345703,
+ "learning_rate": 1.346443186127257e-06,
+ "loss": 0.3616,
+ "step": 8317
+ },
+ {
+ "epoch": 3.9328605200945628,
+ "grad_norm": 3.335466146469116,
+ "learning_rate": 1.3458897749777516e-06,
+ "loss": 0.3854,
+ "step": 8318
+ },
+ {
+ "epoch": 3.9333333333333336,
+ "grad_norm": 2.8305466175079346,
+ "learning_rate": 1.3453364356896282e-06,
+ "loss": 0.374,
+ "step": 8319
+ },
+ {
+ "epoch": 3.933806146572104,
+ "grad_norm": 2.9511806964874268,
+ "learning_rate": 1.344783168297343e-06,
+ "loss": 0.4235,
+ "step": 8320
+ },
+ {
+ "epoch": 3.9342789598108747,
+ "grad_norm": 3.1868233680725098,
+ "learning_rate": 1.3442299728353448e-06,
+ "loss": 0.4384,
+ "step": 8321
+ },
+ {
+ "epoch": 3.9347517730496455,
+ "grad_norm": 3.1358237266540527,
+ "learning_rate": 1.3436768493380766e-06,
+ "loss": 0.4011,
+ "step": 8322
+ },
+ {
+ "epoch": 3.935224586288416,
+ "grad_norm": 3.126192808151245,
+ "learning_rate": 1.343123797839982e-06,
+ "loss": 0.4061,
+ "step": 8323
+ },
+ {
+ "epoch": 3.9356973995271867,
+ "grad_norm": 2.9724647998809814,
+ "learning_rate": 1.3425708183754949e-06,
+ "loss": 0.3859,
+ "step": 8324
+ },
+ {
+ "epoch": 3.9361702127659575,
+ "grad_norm": 3.1526355743408203,
+ "learning_rate": 1.3420179109790485e-06,
+ "loss": 0.3543,
+ "step": 8325
+ },
+ {
+ "epoch": 3.9366430260047283,
+ "grad_norm": 3.1289172172546387,
+ "learning_rate": 1.3414650756850695e-06,
+ "loss": 0.3836,
+ "step": 8326
+ },
+ {
+ "epoch": 3.937115839243499,
+ "grad_norm": 2.851264715194702,
+ "learning_rate": 1.34091231252798e-06,
+ "loss": 0.3294,
+ "step": 8327
+ },
+ {
+ "epoch": 3.9375886524822694,
+ "grad_norm": 2.921872138977051,
+ "learning_rate": 1.3403596215421981e-06,
+ "loss": 0.3698,
+ "step": 8328
+ },
+ {
+ "epoch": 3.93806146572104,
+ "grad_norm": 2.947258234024048,
+ "learning_rate": 1.339807002762137e-06,
+ "loss": 0.3616,
+ "step": 8329
+ },
+ {
+ "epoch": 3.938534278959811,
+ "grad_norm": 3.011021375656128,
+ "learning_rate": 1.3392544562222077e-06,
+ "loss": 0.3387,
+ "step": 8330
+ },
+ {
+ "epoch": 3.9390070921985814,
+ "grad_norm": 3.5230746269226074,
+ "learning_rate": 1.3387019819568134e-06,
+ "loss": 0.4054,
+ "step": 8331
+ },
+ {
+ "epoch": 3.939479905437352,
+ "grad_norm": 3.120321035385132,
+ "learning_rate": 1.3381495800003536e-06,
+ "loss": 0.4389,
+ "step": 8332
+ },
+ {
+ "epoch": 3.939952718676123,
+ "grad_norm": 3.0090999603271484,
+ "learning_rate": 1.3375972503872259e-06,
+ "loss": 0.4158,
+ "step": 8333
+ },
+ {
+ "epoch": 3.9404255319148938,
+ "grad_norm": 3.4807989597320557,
+ "learning_rate": 1.3370449931518198e-06,
+ "loss": 0.4144,
+ "step": 8334
+ },
+ {
+ "epoch": 3.9408983451536646,
+ "grad_norm": 2.8535733222961426,
+ "learning_rate": 1.336492808328523e-06,
+ "loss": 0.4281,
+ "step": 8335
+ },
+ {
+ "epoch": 3.941371158392435,
+ "grad_norm": 2.9032745361328125,
+ "learning_rate": 1.3359406959517174e-06,
+ "loss": 0.3389,
+ "step": 8336
+ },
+ {
+ "epoch": 3.9418439716312057,
+ "grad_norm": 2.725823163986206,
+ "learning_rate": 1.3353886560557793e-06,
+ "loss": 0.369,
+ "step": 8337
+ },
+ {
+ "epoch": 3.9423167848699765,
+ "grad_norm": 3.1965179443359375,
+ "learning_rate": 1.3348366886750844e-06,
+ "loss": 0.4031,
+ "step": 8338
+ },
+ {
+ "epoch": 3.942789598108747,
+ "grad_norm": 2.6991076469421387,
+ "learning_rate": 1.3342847938439985e-06,
+ "loss": 0.3434,
+ "step": 8339
+ },
+ {
+ "epoch": 3.9432624113475176,
+ "grad_norm": 4.491400718688965,
+ "learning_rate": 1.3337329715968877e-06,
+ "loss": 0.4175,
+ "step": 8340
+ },
+ {
+ "epoch": 3.9437352245862884,
+ "grad_norm": 4.005452632904053,
+ "learning_rate": 1.3331812219681112e-06,
+ "loss": 0.4191,
+ "step": 8341
+ },
+ {
+ "epoch": 3.9442080378250592,
+ "grad_norm": 3.1575794219970703,
+ "learning_rate": 1.3326295449920238e-06,
+ "loss": 0.4135,
+ "step": 8342
+ },
+ {
+ "epoch": 3.94468085106383,
+ "grad_norm": 3.2383973598480225,
+ "learning_rate": 1.3320779407029755e-06,
+ "loss": 0.38,
+ "step": 8343
+ },
+ {
+ "epoch": 3.9451536643026004,
+ "grad_norm": 2.873703718185425,
+ "learning_rate": 1.3315264091353119e-06,
+ "loss": 0.4128,
+ "step": 8344
+ },
+ {
+ "epoch": 3.945626477541371,
+ "grad_norm": 2.947274923324585,
+ "learning_rate": 1.330974950323376e-06,
+ "loss": 0.3342,
+ "step": 8345
+ },
+ {
+ "epoch": 3.946099290780142,
+ "grad_norm": 3.2874088287353516,
+ "learning_rate": 1.330423564301504e-06,
+ "loss": 0.3849,
+ "step": 8346
+ },
+ {
+ "epoch": 3.9465721040189123,
+ "grad_norm": 2.885772466659546,
+ "learning_rate": 1.3298722511040275e-06,
+ "loss": 0.3562,
+ "step": 8347
+ },
+ {
+ "epoch": 3.947044917257683,
+ "grad_norm": 3.0031309127807617,
+ "learning_rate": 1.3293210107652753e-06,
+ "loss": 0.3593,
+ "step": 8348
+ },
+ {
+ "epoch": 3.947517730496454,
+ "grad_norm": 2.815854549407959,
+ "learning_rate": 1.3287698433195712e-06,
+ "loss": 0.3633,
+ "step": 8349
+ },
+ {
+ "epoch": 3.9479905437352247,
+ "grad_norm": 3.228415012359619,
+ "learning_rate": 1.328218748801232e-06,
+ "loss": 0.3869,
+ "step": 8350
+ },
+ {
+ "epoch": 3.9484633569739955,
+ "grad_norm": 2.8497684001922607,
+ "learning_rate": 1.3276677272445743e-06,
+ "loss": 0.3833,
+ "step": 8351
+ },
+ {
+ "epoch": 3.948936170212766,
+ "grad_norm": 3.2330706119537354,
+ "learning_rate": 1.3271167786839057e-06,
+ "loss": 0.4414,
+ "step": 8352
+ },
+ {
+ "epoch": 3.9494089834515367,
+ "grad_norm": 3.2179152965545654,
+ "learning_rate": 1.3265659031535332e-06,
+ "loss": 0.3821,
+ "step": 8353
+ },
+ {
+ "epoch": 3.9498817966903075,
+ "grad_norm": 3.0861377716064453,
+ "learning_rate": 1.3260151006877567e-06,
+ "loss": 0.4014,
+ "step": 8354
+ },
+ {
+ "epoch": 3.950354609929078,
+ "grad_norm": 3.118872880935669,
+ "learning_rate": 1.325464371320871e-06,
+ "loss": 0.3836,
+ "step": 8355
+ },
+ {
+ "epoch": 3.9508274231678486,
+ "grad_norm": 2.9787702560424805,
+ "learning_rate": 1.32491371508717e-06,
+ "loss": 0.3794,
+ "step": 8356
+ },
+ {
+ "epoch": 3.9513002364066194,
+ "grad_norm": 3.132089853286743,
+ "learning_rate": 1.3243631320209387e-06,
+ "loss": 0.3698,
+ "step": 8357
+ },
+ {
+ "epoch": 3.9517730496453902,
+ "grad_norm": 2.5304882526397705,
+ "learning_rate": 1.32381262215646e-06,
+ "loss": 0.3687,
+ "step": 8358
+ },
+ {
+ "epoch": 3.952245862884161,
+ "grad_norm": 2.9121861457824707,
+ "learning_rate": 1.3232621855280126e-06,
+ "loss": 0.3704,
+ "step": 8359
+ },
+ {
+ "epoch": 3.9527186761229314,
+ "grad_norm": 3.0885608196258545,
+ "learning_rate": 1.3227118221698688e-06,
+ "loss": 0.4303,
+ "step": 8360
+ },
+ {
+ "epoch": 3.953191489361702,
+ "grad_norm": 2.7274837493896484,
+ "learning_rate": 1.3221615321162979e-06,
+ "loss": 0.3556,
+ "step": 8361
+ },
+ {
+ "epoch": 3.953664302600473,
+ "grad_norm": 3.1329922676086426,
+ "learning_rate": 1.3216113154015625e-06,
+ "loss": 0.4042,
+ "step": 8362
+ },
+ {
+ "epoch": 3.9541371158392433,
+ "grad_norm": 2.937380313873291,
+ "learning_rate": 1.3210611720599243e-06,
+ "loss": 0.3358,
+ "step": 8363
+ },
+ {
+ "epoch": 3.954609929078014,
+ "grad_norm": 2.939194440841675,
+ "learning_rate": 1.3205111021256378e-06,
+ "loss": 0.3885,
+ "step": 8364
+ },
+ {
+ "epoch": 3.955082742316785,
+ "grad_norm": 2.9151997566223145,
+ "learning_rate": 1.3199611056329516e-06,
+ "loss": 0.4094,
+ "step": 8365
+ },
+ {
+ "epoch": 3.9555555555555557,
+ "grad_norm": 3.029733419418335,
+ "learning_rate": 1.3194111826161143e-06,
+ "loss": 0.3999,
+ "step": 8366
+ },
+ {
+ "epoch": 3.9560283687943265,
+ "grad_norm": 2.7899951934814453,
+ "learning_rate": 1.3188613331093653e-06,
+ "loss": 0.321,
+ "step": 8367
+ },
+ {
+ "epoch": 3.956501182033097,
+ "grad_norm": 3.1109507083892822,
+ "learning_rate": 1.3183115571469425e-06,
+ "loss": 0.4266,
+ "step": 8368
+ },
+ {
+ "epoch": 3.9569739952718677,
+ "grad_norm": 3.085594415664673,
+ "learning_rate": 1.3177618547630774e-06,
+ "loss": 0.4412,
+ "step": 8369
+ },
+ {
+ "epoch": 3.9574468085106385,
+ "grad_norm": 3.0980300903320312,
+ "learning_rate": 1.3172122259919968e-06,
+ "loss": 0.3385,
+ "step": 8370
+ },
+ {
+ "epoch": 3.957919621749409,
+ "grad_norm": 3.103438138961792,
+ "learning_rate": 1.3166626708679256e-06,
+ "loss": 0.3887,
+ "step": 8371
+ },
+ {
+ "epoch": 3.9583924349881796,
+ "grad_norm": 2.8235526084899902,
+ "learning_rate": 1.3161131894250812e-06,
+ "loss": 0.3759,
+ "step": 8372
+ },
+ {
+ "epoch": 3.9588652482269504,
+ "grad_norm": 2.8316404819488525,
+ "learning_rate": 1.3155637816976762e-06,
+ "loss": 0.3666,
+ "step": 8373
+ },
+ {
+ "epoch": 3.959338061465721,
+ "grad_norm": 2.7873756885528564,
+ "learning_rate": 1.3150144477199218e-06,
+ "loss": 0.3284,
+ "step": 8374
+ },
+ {
+ "epoch": 3.959810874704492,
+ "grad_norm": 3.355039119720459,
+ "learning_rate": 1.3144651875260218e-06,
+ "loss": 0.4197,
+ "step": 8375
+ },
+ {
+ "epoch": 3.9602836879432624,
+ "grad_norm": 3.477721929550171,
+ "learning_rate": 1.3139160011501761e-06,
+ "loss": 0.3298,
+ "step": 8376
+ },
+ {
+ "epoch": 3.960756501182033,
+ "grad_norm": 3.557152032852173,
+ "learning_rate": 1.3133668886265805e-06,
+ "loss": 0.3788,
+ "step": 8377
+ },
+ {
+ "epoch": 3.961229314420804,
+ "grad_norm": 3.06707763671875,
+ "learning_rate": 1.312817849989424e-06,
+ "loss": 0.3613,
+ "step": 8378
+ },
+ {
+ "epoch": 3.9617021276595743,
+ "grad_norm": 2.7702202796936035,
+ "learning_rate": 1.3122688852728956e-06,
+ "loss": 0.402,
+ "step": 8379
+ },
+ {
+ "epoch": 3.962174940898345,
+ "grad_norm": 2.8121016025543213,
+ "learning_rate": 1.3117199945111746e-06,
+ "loss": 0.3576,
+ "step": 8380
+ },
+ {
+ "epoch": 3.962647754137116,
+ "grad_norm": 2.809282064437866,
+ "learning_rate": 1.3111711777384403e-06,
+ "loss": 0.3741,
+ "step": 8381
+ },
+ {
+ "epoch": 3.9631205673758867,
+ "grad_norm": 3.1175687313079834,
+ "learning_rate": 1.3106224349888638e-06,
+ "loss": 0.3388,
+ "step": 8382
+ },
+ {
+ "epoch": 3.963593380614657,
+ "grad_norm": 2.930525064468384,
+ "learning_rate": 1.310073766296612e-06,
+ "loss": 0.3593,
+ "step": 8383
+ },
+ {
+ "epoch": 3.964066193853428,
+ "grad_norm": 3.0673177242279053,
+ "learning_rate": 1.3095251716958501e-06,
+ "loss": 0.402,
+ "step": 8384
+ },
+ {
+ "epoch": 3.9645390070921986,
+ "grad_norm": 2.9725706577301025,
+ "learning_rate": 1.3089766512207347e-06,
+ "loss": 0.3707,
+ "step": 8385
+ },
+ {
+ "epoch": 3.965011820330969,
+ "grad_norm": 2.9790916442871094,
+ "learning_rate": 1.3084282049054218e-06,
+ "loss": 0.3292,
+ "step": 8386
+ },
+ {
+ "epoch": 3.96548463356974,
+ "grad_norm": 3.257035493850708,
+ "learning_rate": 1.3078798327840598e-06,
+ "loss": 0.3753,
+ "step": 8387
+ },
+ {
+ "epoch": 3.9659574468085106,
+ "grad_norm": 3.0534379482269287,
+ "learning_rate": 1.307331534890792e-06,
+ "loss": 0.4134,
+ "step": 8388
+ },
+ {
+ "epoch": 3.9664302600472814,
+ "grad_norm": 2.919243812561035,
+ "learning_rate": 1.306783311259761e-06,
+ "loss": 0.4283,
+ "step": 8389
+ },
+ {
+ "epoch": 3.966903073286052,
+ "grad_norm": 2.7643322944641113,
+ "learning_rate": 1.306235161925101e-06,
+ "loss": 0.3454,
+ "step": 8390
+ },
+ {
+ "epoch": 3.9673758865248225,
+ "grad_norm": 3.0208916664123535,
+ "learning_rate": 1.3056870869209431e-06,
+ "loss": 0.385,
+ "step": 8391
+ },
+ {
+ "epoch": 3.9678486997635933,
+ "grad_norm": 2.8657243251800537,
+ "learning_rate": 1.3051390862814135e-06,
+ "loss": 0.3614,
+ "step": 8392
+ },
+ {
+ "epoch": 3.968321513002364,
+ "grad_norm": 3.2093591690063477,
+ "learning_rate": 1.3045911600406325e-06,
+ "loss": 0.3774,
+ "step": 8393
+ },
+ {
+ "epoch": 3.9687943262411345,
+ "grad_norm": 3.091618537902832,
+ "learning_rate": 1.3040433082327192e-06,
+ "loss": 0.4157,
+ "step": 8394
+ },
+ {
+ "epoch": 3.9692671394799053,
+ "grad_norm": 2.99763560295105,
+ "learning_rate": 1.3034955308917849e-06,
+ "loss": 0.4017,
+ "step": 8395
+ },
+ {
+ "epoch": 3.969739952718676,
+ "grad_norm": 3.063109874725342,
+ "learning_rate": 1.3029478280519364e-06,
+ "loss": 0.4568,
+ "step": 8396
+ },
+ {
+ "epoch": 3.970212765957447,
+ "grad_norm": 3.2660679817199707,
+ "learning_rate": 1.3024001997472791e-06,
+ "loss": 0.3999,
+ "step": 8397
+ },
+ {
+ "epoch": 3.9706855791962177,
+ "grad_norm": 2.860121250152588,
+ "learning_rate": 1.3018526460119088e-06,
+ "loss": 0.433,
+ "step": 8398
+ },
+ {
+ "epoch": 3.971158392434988,
+ "grad_norm": 3.1037673950195312,
+ "learning_rate": 1.3013051668799216e-06,
+ "loss": 0.4526,
+ "step": 8399
+ },
+ {
+ "epoch": 3.971631205673759,
+ "grad_norm": 2.9408578872680664,
+ "learning_rate": 1.3007577623854053e-06,
+ "loss": 0.3722,
+ "step": 8400
+ },
+ {
+ "epoch": 3.9721040189125296,
+ "grad_norm": 3.0684635639190674,
+ "learning_rate": 1.3002104325624436e-06,
+ "loss": 0.3789,
+ "step": 8401
+ },
+ {
+ "epoch": 3.9725768321513,
+ "grad_norm": 2.6469366550445557,
+ "learning_rate": 1.2996631774451187e-06,
+ "loss": 0.3409,
+ "step": 8402
+ },
+ {
+ "epoch": 3.9730496453900708,
+ "grad_norm": 3.3741610050201416,
+ "learning_rate": 1.2991159970675033e-06,
+ "loss": 0.3544,
+ "step": 8403
+ },
+ {
+ "epoch": 3.9735224586288416,
+ "grad_norm": 3.3716588020324707,
+ "learning_rate": 1.2985688914636701e-06,
+ "loss": 0.3747,
+ "step": 8404
+ },
+ {
+ "epoch": 3.9739952718676124,
+ "grad_norm": 3.000469923019409,
+ "learning_rate": 1.2980218606676837e-06,
+ "loss": 0.4506,
+ "step": 8405
+ },
+ {
+ "epoch": 3.974468085106383,
+ "grad_norm": 3.0139408111572266,
+ "learning_rate": 1.2974749047136057e-06,
+ "loss": 0.4156,
+ "step": 8406
+ },
+ {
+ "epoch": 3.9749408983451535,
+ "grad_norm": 2.9494218826293945,
+ "learning_rate": 1.2969280236354925e-06,
+ "loss": 0.3378,
+ "step": 8407
+ },
+ {
+ "epoch": 3.9754137115839243,
+ "grad_norm": 2.6061158180236816,
+ "learning_rate": 1.2963812174673948e-06,
+ "loss": 0.3887,
+ "step": 8408
+ },
+ {
+ "epoch": 3.975886524822695,
+ "grad_norm": 2.873987913131714,
+ "learning_rate": 1.295834486243362e-06,
+ "loss": 0.3202,
+ "step": 8409
+ },
+ {
+ "epoch": 3.9763593380614655,
+ "grad_norm": 3.0106539726257324,
+ "learning_rate": 1.2952878299974358e-06,
+ "loss": 0.4142,
+ "step": 8410
+ },
+ {
+ "epoch": 3.9768321513002363,
+ "grad_norm": 3.0011982917785645,
+ "learning_rate": 1.2947412487636527e-06,
+ "loss": 0.4121,
+ "step": 8411
+ },
+ {
+ "epoch": 3.977304964539007,
+ "grad_norm": 3.1321003437042236,
+ "learning_rate": 1.294194742576048e-06,
+ "loss": 0.4033,
+ "step": 8412
+ },
+ {
+ "epoch": 3.977777777777778,
+ "grad_norm": 2.812255382537842,
+ "learning_rate": 1.2936483114686487e-06,
+ "loss": 0.3414,
+ "step": 8413
+ },
+ {
+ "epoch": 3.9782505910165487,
+ "grad_norm": 2.9594221115112305,
+ "learning_rate": 1.2931019554754804e-06,
+ "loss": 0.3666,
+ "step": 8414
+ },
+ {
+ "epoch": 3.978723404255319,
+ "grad_norm": 3.119440793991089,
+ "learning_rate": 1.2925556746305612e-06,
+ "loss": 0.3902,
+ "step": 8415
+ },
+ {
+ "epoch": 3.97919621749409,
+ "grad_norm": 3.042102098464966,
+ "learning_rate": 1.2920094689679047e-06,
+ "loss": 0.344,
+ "step": 8416
+ },
+ {
+ "epoch": 3.9796690307328606,
+ "grad_norm": 2.8443872928619385,
+ "learning_rate": 1.2914633385215225e-06,
+ "loss": 0.372,
+ "step": 8417
+ },
+ {
+ "epoch": 3.980141843971631,
+ "grad_norm": 3.483201265335083,
+ "learning_rate": 1.2909172833254187e-06,
+ "loss": 0.4028,
+ "step": 8418
+ },
+ {
+ "epoch": 3.9806146572104018,
+ "grad_norm": 2.966996431350708,
+ "learning_rate": 1.2903713034135934e-06,
+ "loss": 0.3527,
+ "step": 8419
+ },
+ {
+ "epoch": 3.9810874704491725,
+ "grad_norm": 2.7813172340393066,
+ "learning_rate": 1.2898253988200437e-06,
+ "loss": 0.3873,
+ "step": 8420
+ },
+ {
+ "epoch": 3.9815602836879433,
+ "grad_norm": 3.24611234664917,
+ "learning_rate": 1.2892795695787602e-06,
+ "loss": 0.4783,
+ "step": 8421
+ },
+ {
+ "epoch": 3.982033096926714,
+ "grad_norm": 3.345573663711548,
+ "learning_rate": 1.2887338157237289e-06,
+ "loss": 0.4179,
+ "step": 8422
+ },
+ {
+ "epoch": 3.9825059101654845,
+ "grad_norm": 3.1726880073547363,
+ "learning_rate": 1.288188137288931e-06,
+ "loss": 0.3725,
+ "step": 8423
+ },
+ {
+ "epoch": 3.9829787234042553,
+ "grad_norm": 3.398966073989868,
+ "learning_rate": 1.2876425343083449e-06,
+ "loss": 0.4117,
+ "step": 8424
+ },
+ {
+ "epoch": 3.983451536643026,
+ "grad_norm": 2.615680456161499,
+ "learning_rate": 1.2870970068159423e-06,
+ "loss": 0.324,
+ "step": 8425
+ },
+ {
+ "epoch": 3.9839243498817964,
+ "grad_norm": 3.0505547523498535,
+ "learning_rate": 1.2865515548456893e-06,
+ "loss": 0.3698,
+ "step": 8426
+ },
+ {
+ "epoch": 3.9843971631205672,
+ "grad_norm": 3.077404260635376,
+ "learning_rate": 1.2860061784315514e-06,
+ "loss": 0.3592,
+ "step": 8427
+ },
+ {
+ "epoch": 3.984869976359338,
+ "grad_norm": 2.654080390930176,
+ "learning_rate": 1.2854608776074855e-06,
+ "loss": 0.3451,
+ "step": 8428
+ },
+ {
+ "epoch": 3.985342789598109,
+ "grad_norm": 3.1023523807525635,
+ "learning_rate": 1.284915652407444e-06,
+ "loss": 0.3809,
+ "step": 8429
+ },
+ {
+ "epoch": 3.9858156028368796,
+ "grad_norm": 3.0526652336120605,
+ "learning_rate": 1.2843705028653783e-06,
+ "loss": 0.3633,
+ "step": 8430
+ },
+ {
+ "epoch": 3.98628841607565,
+ "grad_norm": 2.7829604148864746,
+ "learning_rate": 1.2838254290152296e-06,
+ "loss": 0.3213,
+ "step": 8431
+ },
+ {
+ "epoch": 3.986761229314421,
+ "grad_norm": 3.2218687534332275,
+ "learning_rate": 1.28328043089094e-06,
+ "loss": 0.465,
+ "step": 8432
+ },
+ {
+ "epoch": 3.9872340425531916,
+ "grad_norm": 2.952998161315918,
+ "learning_rate": 1.2827355085264425e-06,
+ "loss": 0.4405,
+ "step": 8433
+ },
+ {
+ "epoch": 3.987706855791962,
+ "grad_norm": 2.81211519241333,
+ "learning_rate": 1.2821906619556667e-06,
+ "loss": 0.3444,
+ "step": 8434
+ },
+ {
+ "epoch": 3.9881796690307327,
+ "grad_norm": 3.1707375049591064,
+ "learning_rate": 1.281645891212539e-06,
+ "loss": 0.4019,
+ "step": 8435
+ },
+ {
+ "epoch": 3.9886524822695035,
+ "grad_norm": 2.791504383087158,
+ "learning_rate": 1.2811011963309788e-06,
+ "loss": 0.3606,
+ "step": 8436
+ },
+ {
+ "epoch": 3.9891252955082743,
+ "grad_norm": 2.954782247543335,
+ "learning_rate": 1.280556577344903e-06,
+ "loss": 0.3141,
+ "step": 8437
+ },
+ {
+ "epoch": 3.989598108747045,
+ "grad_norm": 2.718273878097534,
+ "learning_rate": 1.2800120342882223e-06,
+ "loss": 0.3715,
+ "step": 8438
+ },
+ {
+ "epoch": 3.9900709219858155,
+ "grad_norm": 3.2916250228881836,
+ "learning_rate": 1.2794675671948425e-06,
+ "loss": 0.4048,
+ "step": 8439
+ },
+ {
+ "epoch": 3.9905437352245863,
+ "grad_norm": 3.060060977935791,
+ "learning_rate": 1.2789231760986655e-06,
+ "loss": 0.4032,
+ "step": 8440
+ },
+ {
+ "epoch": 3.991016548463357,
+ "grad_norm": 2.8467273712158203,
+ "learning_rate": 1.2783788610335882e-06,
+ "loss": 0.4041,
+ "step": 8441
+ },
+ {
+ "epoch": 3.9914893617021274,
+ "grad_norm": 3.161790132522583,
+ "learning_rate": 1.2778346220335013e-06,
+ "loss": 0.4049,
+ "step": 8442
+ },
+ {
+ "epoch": 3.9919621749408982,
+ "grad_norm": 2.6512296199798584,
+ "learning_rate": 1.277290459132295e-06,
+ "loss": 0.3598,
+ "step": 8443
+ },
+ {
+ "epoch": 3.992434988179669,
+ "grad_norm": 2.792736291885376,
+ "learning_rate": 1.276746372363849e-06,
+ "loss": 0.3874,
+ "step": 8444
+ },
+ {
+ "epoch": 3.99290780141844,
+ "grad_norm": 2.887047052383423,
+ "learning_rate": 1.2762023617620433e-06,
+ "loss": 0.4255,
+ "step": 8445
+ },
+ {
+ "epoch": 3.9933806146572106,
+ "grad_norm": 3.0420780181884766,
+ "learning_rate": 1.275658427360751e-06,
+ "loss": 0.4489,
+ "step": 8446
+ },
+ {
+ "epoch": 3.993853427895981,
+ "grad_norm": 3.107618570327759,
+ "learning_rate": 1.2751145691938383e-06,
+ "loss": 0.4354,
+ "step": 8447
+ },
+ {
+ "epoch": 3.9943262411347518,
+ "grad_norm": 2.656224250793457,
+ "learning_rate": 1.2745707872951718e-06,
+ "loss": 0.4188,
+ "step": 8448
+ },
+ {
+ "epoch": 3.9947990543735226,
+ "grad_norm": 2.9895219802856445,
+ "learning_rate": 1.2740270816986079e-06,
+ "loss": 0.391,
+ "step": 8449
+ },
+ {
+ "epoch": 3.995271867612293,
+ "grad_norm": 2.919255018234253,
+ "learning_rate": 1.2734834524380025e-06,
+ "loss": 0.4058,
+ "step": 8450
+ },
+ {
+ "epoch": 3.9957446808510637,
+ "grad_norm": 3.4418535232543945,
+ "learning_rate": 1.2729398995472048e-06,
+ "loss": 0.3977,
+ "step": 8451
+ },
+ {
+ "epoch": 3.9962174940898345,
+ "grad_norm": 2.980224132537842,
+ "learning_rate": 1.272396423060058e-06,
+ "loss": 0.4417,
+ "step": 8452
+ },
+ {
+ "epoch": 3.9966903073286053,
+ "grad_norm": 3.6488101482391357,
+ "learning_rate": 1.2718530230104043e-06,
+ "loss": 0.4472,
+ "step": 8453
+ },
+ {
+ "epoch": 3.997163120567376,
+ "grad_norm": 2.725437641143799,
+ "learning_rate": 1.2713096994320774e-06,
+ "loss": 0.3125,
+ "step": 8454
+ },
+ {
+ "epoch": 3.9976359338061465,
+ "grad_norm": 3.453794002532959,
+ "learning_rate": 1.2707664523589076e-06,
+ "loss": 0.3792,
+ "step": 8455
+ },
+ {
+ "epoch": 3.9981087470449173,
+ "grad_norm": 2.8443076610565186,
+ "learning_rate": 1.270223281824721e-06,
+ "loss": 0.3627,
+ "step": 8456
+ },
+ {
+ "epoch": 3.998581560283688,
+ "grad_norm": 3.1851959228515625,
+ "learning_rate": 1.2696801878633372e-06,
+ "loss": 0.3745,
+ "step": 8457
+ },
+ {
+ "epoch": 3.9990543735224584,
+ "grad_norm": 2.897239923477173,
+ "learning_rate": 1.2691371705085743e-06,
+ "loss": 0.3817,
+ "step": 8458
+ },
+ {
+ "epoch": 3.999527186761229,
+ "grad_norm": 2.92111873626709,
+ "learning_rate": 1.2685942297942416e-06,
+ "loss": 0.3824,
+ "step": 8459
+ },
+ {
+ "epoch": 4.0,
+ "grad_norm": 2.9540340900421143,
+ "learning_rate": 1.268051365754148e-06,
+ "loss": 0.3656,
+ "step": 8460
+ },
+ {
+ "epoch": 4.000472813238771,
+ "grad_norm": 2.6815075874328613,
+ "learning_rate": 1.2675085784220936e-06,
+ "loss": 0.3296,
+ "step": 8461
+ },
+ {
+ "epoch": 4.000945626477542,
+ "grad_norm": 3.0823302268981934,
+ "learning_rate": 1.2669658678318747e-06,
+ "loss": 0.3918,
+ "step": 8462
+ },
+ {
+ "epoch": 4.001418439716312,
+ "grad_norm": 2.5451176166534424,
+ "learning_rate": 1.2664232340172855e-06,
+ "loss": 0.3528,
+ "step": 8463
+ },
+ {
+ "epoch": 4.001891252955082,
+ "grad_norm": 2.539541721343994,
+ "learning_rate": 1.2658806770121119e-06,
+ "loss": 0.3034,
+ "step": 8464
+ },
+ {
+ "epoch": 4.002364066193853,
+ "grad_norm": 2.5537798404693604,
+ "learning_rate": 1.2653381968501374e-06,
+ "loss": 0.2981,
+ "step": 8465
+ },
+ {
+ "epoch": 4.002836879432624,
+ "grad_norm": 2.6316089630126953,
+ "learning_rate": 1.26479579356514e-06,
+ "loss": 0.3328,
+ "step": 8466
+ },
+ {
+ "epoch": 4.003309692671395,
+ "grad_norm": 3.080700635910034,
+ "learning_rate": 1.2642534671908914e-06,
+ "loss": 0.3471,
+ "step": 8467
+ },
+ {
+ "epoch": 4.0037825059101655,
+ "grad_norm": 3.0111753940582275,
+ "learning_rate": 1.2637112177611614e-06,
+ "loss": 0.3147,
+ "step": 8468
+ },
+ {
+ "epoch": 4.004255319148936,
+ "grad_norm": 2.759606122970581,
+ "learning_rate": 1.2631690453097128e-06,
+ "loss": 0.2634,
+ "step": 8469
+ },
+ {
+ "epoch": 4.004728132387707,
+ "grad_norm": 2.862098217010498,
+ "learning_rate": 1.2626269498703048e-06,
+ "loss": 0.3333,
+ "step": 8470
+ },
+ {
+ "epoch": 4.005200945626478,
+ "grad_norm": 3.122239589691162,
+ "learning_rate": 1.262084931476691e-06,
+ "loss": 0.3311,
+ "step": 8471
+ },
+ {
+ "epoch": 4.005673758865248,
+ "grad_norm": 2.6428070068359375,
+ "learning_rate": 1.261542990162619e-06,
+ "loss": 0.3534,
+ "step": 8472
+ },
+ {
+ "epoch": 4.006146572104019,
+ "grad_norm": 3.2870724201202393,
+ "learning_rate": 1.261001125961836e-06,
+ "loss": 0.3373,
+ "step": 8473
+ },
+ {
+ "epoch": 4.006619385342789,
+ "grad_norm": 2.7675375938415527,
+ "learning_rate": 1.26045933890808e-06,
+ "loss": 0.3117,
+ "step": 8474
+ },
+ {
+ "epoch": 4.00709219858156,
+ "grad_norm": 2.811736583709717,
+ "learning_rate": 1.2599176290350844e-06,
+ "loss": 0.3087,
+ "step": 8475
+ },
+ {
+ "epoch": 4.007565011820331,
+ "grad_norm": 2.9146902561187744,
+ "learning_rate": 1.2593759963765817e-06,
+ "loss": 0.336,
+ "step": 8476
+ },
+ {
+ "epoch": 4.008037825059102,
+ "grad_norm": 3.074338674545288,
+ "learning_rate": 1.2588344409662945e-06,
+ "loss": 0.384,
+ "step": 8477
+ },
+ {
+ "epoch": 4.008510638297873,
+ "grad_norm": 3.5597734451293945,
+ "learning_rate": 1.2582929628379455e-06,
+ "loss": 0.4061,
+ "step": 8478
+ },
+ {
+ "epoch": 4.008983451536643,
+ "grad_norm": 3.0091497898101807,
+ "learning_rate": 1.2577515620252489e-06,
+ "loss": 0.3783,
+ "step": 8479
+ },
+ {
+ "epoch": 4.009456264775413,
+ "grad_norm": 2.9654228687286377,
+ "learning_rate": 1.2572102385619145e-06,
+ "loss": 0.3541,
+ "step": 8480
+ },
+ {
+ "epoch": 4.009929078014184,
+ "grad_norm": 3.140733242034912,
+ "learning_rate": 1.2566689924816502e-06,
+ "loss": 0.3706,
+ "step": 8481
+ },
+ {
+ "epoch": 4.010401891252955,
+ "grad_norm": 3.2180161476135254,
+ "learning_rate": 1.2561278238181548e-06,
+ "loss": 0.3573,
+ "step": 8482
+ },
+ {
+ "epoch": 4.010874704491726,
+ "grad_norm": 2.91209077835083,
+ "learning_rate": 1.2555867326051265e-06,
+ "loss": 0.3619,
+ "step": 8483
+ },
+ {
+ "epoch": 4.0113475177304965,
+ "grad_norm": 3.016916036605835,
+ "learning_rate": 1.255045718876256e-06,
+ "loss": 0.3866,
+ "step": 8484
+ },
+ {
+ "epoch": 4.011820330969267,
+ "grad_norm": 3.1008472442626953,
+ "learning_rate": 1.2545047826652294e-06,
+ "loss": 0.3352,
+ "step": 8485
+ },
+ {
+ "epoch": 4.012293144208038,
+ "grad_norm": 3.136124610900879,
+ "learning_rate": 1.2539639240057287e-06,
+ "loss": 0.3478,
+ "step": 8486
+ },
+ {
+ "epoch": 4.012765957446809,
+ "grad_norm": 3.0021555423736572,
+ "learning_rate": 1.2534231429314299e-06,
+ "loss": 0.3522,
+ "step": 8487
+ },
+ {
+ "epoch": 4.013238770685579,
+ "grad_norm": 3.2261948585510254,
+ "learning_rate": 1.2528824394760065e-06,
+ "loss": 0.3632,
+ "step": 8488
+ },
+ {
+ "epoch": 4.01371158392435,
+ "grad_norm": 3.0598134994506836,
+ "learning_rate": 1.2523418136731252e-06,
+ "loss": 0.3422,
+ "step": 8489
+ },
+ {
+ "epoch": 4.01418439716312,
+ "grad_norm": 2.9821391105651855,
+ "learning_rate": 1.2518012655564476e-06,
+ "loss": 0.3324,
+ "step": 8490
+ },
+ {
+ "epoch": 4.014657210401891,
+ "grad_norm": 2.583130359649658,
+ "learning_rate": 1.251260795159633e-06,
+ "loss": 0.3509,
+ "step": 8491
+ },
+ {
+ "epoch": 4.015130023640662,
+ "grad_norm": 3.3090853691101074,
+ "learning_rate": 1.2507204025163333e-06,
+ "loss": 0.3494,
+ "step": 8492
+ },
+ {
+ "epoch": 4.015602836879433,
+ "grad_norm": 2.6412856578826904,
+ "learning_rate": 1.250180087660195e-06,
+ "loss": 0.3419,
+ "step": 8493
+ },
+ {
+ "epoch": 4.0160756501182036,
+ "grad_norm": 2.729210615158081,
+ "learning_rate": 1.2496398506248634e-06,
+ "loss": 0.3591,
+ "step": 8494
+ },
+ {
+ "epoch": 4.016548463356974,
+ "grad_norm": 2.892150402069092,
+ "learning_rate": 1.2490996914439745e-06,
+ "loss": 0.3866,
+ "step": 8495
+ },
+ {
+ "epoch": 4.017021276595744,
+ "grad_norm": 3.1967804431915283,
+ "learning_rate": 1.2485596101511638e-06,
+ "loss": 0.358,
+ "step": 8496
+ },
+ {
+ "epoch": 4.017494089834515,
+ "grad_norm": 3.0190439224243164,
+ "learning_rate": 1.2480196067800588e-06,
+ "loss": 0.3723,
+ "step": 8497
+ },
+ {
+ "epoch": 4.017966903073286,
+ "grad_norm": 2.856370210647583,
+ "learning_rate": 1.2474796813642822e-06,
+ "loss": 0.3519,
+ "step": 8498
+ },
+ {
+ "epoch": 4.018439716312057,
+ "grad_norm": 2.979842185974121,
+ "learning_rate": 1.2469398339374546e-06,
+ "loss": 0.3483,
+ "step": 8499
+ },
+ {
+ "epoch": 4.0189125295508275,
+ "grad_norm": 3.0953211784362793,
+ "learning_rate": 1.246400064533189e-06,
+ "loss": 0.355,
+ "step": 8500
+ },
+ {
+ "epoch": 4.019385342789598,
+ "grad_norm": 3.342609167098999,
+ "learning_rate": 1.2458603731850938e-06,
+ "loss": 0.4258,
+ "step": 8501
+ },
+ {
+ "epoch": 4.019858156028369,
+ "grad_norm": 3.2789435386657715,
+ "learning_rate": 1.2453207599267747e-06,
+ "loss": 0.3653,
+ "step": 8502
+ },
+ {
+ "epoch": 4.02033096926714,
+ "grad_norm": 2.8867030143737793,
+ "learning_rate": 1.2447812247918303e-06,
+ "loss": 0.3128,
+ "step": 8503
+ },
+ {
+ "epoch": 4.02080378250591,
+ "grad_norm": 2.9467437267303467,
+ "learning_rate": 1.2442417678138552e-06,
+ "loss": 0.3149,
+ "step": 8504
+ },
+ {
+ "epoch": 4.0212765957446805,
+ "grad_norm": 2.6293485164642334,
+ "learning_rate": 1.2437023890264377e-06,
+ "loss": 0.2751,
+ "step": 8505
+ },
+ {
+ "epoch": 4.021749408983451,
+ "grad_norm": 2.9672160148620605,
+ "learning_rate": 1.2431630884631648e-06,
+ "loss": 0.3858,
+ "step": 8506
+ },
+ {
+ "epoch": 4.022222222222222,
+ "grad_norm": 3.0518734455108643,
+ "learning_rate": 1.2426238661576154e-06,
+ "loss": 0.3404,
+ "step": 8507
+ },
+ {
+ "epoch": 4.022695035460993,
+ "grad_norm": 2.829012632369995,
+ "learning_rate": 1.2420847221433633e-06,
+ "loss": 0.3211,
+ "step": 8508
+ },
+ {
+ "epoch": 4.023167848699764,
+ "grad_norm": 2.855806589126587,
+ "learning_rate": 1.2415456564539808e-06,
+ "loss": 0.3462,
+ "step": 8509
+ },
+ {
+ "epoch": 4.0236406619385345,
+ "grad_norm": 3.491786003112793,
+ "learning_rate": 1.2410066691230311e-06,
+ "loss": 0.3793,
+ "step": 8510
+ },
+ {
+ "epoch": 4.024113475177305,
+ "grad_norm": 2.9612972736358643,
+ "learning_rate": 1.2404677601840765e-06,
+ "loss": 0.3899,
+ "step": 8511
+ },
+ {
+ "epoch": 4.024586288416075,
+ "grad_norm": 2.949498176574707,
+ "learning_rate": 1.2399289296706718e-06,
+ "loss": 0.3655,
+ "step": 8512
+ },
+ {
+ "epoch": 4.025059101654846,
+ "grad_norm": 2.736524820327759,
+ "learning_rate": 1.2393901776163664e-06,
+ "loss": 0.318,
+ "step": 8513
+ },
+ {
+ "epoch": 4.025531914893617,
+ "grad_norm": 3.005297899246216,
+ "learning_rate": 1.2388515040547077e-06,
+ "loss": 0.3484,
+ "step": 8514
+ },
+ {
+ "epoch": 4.026004728132388,
+ "grad_norm": 2.9835290908813477,
+ "learning_rate": 1.2383129090192361e-06,
+ "loss": 0.3205,
+ "step": 8515
+ },
+ {
+ "epoch": 4.026477541371158,
+ "grad_norm": 3.1437056064605713,
+ "learning_rate": 1.2377743925434865e-06,
+ "loss": 0.3524,
+ "step": 8516
+ },
+ {
+ "epoch": 4.026950354609929,
+ "grad_norm": 3.0250096321105957,
+ "learning_rate": 1.2372359546609917e-06,
+ "loss": 0.3398,
+ "step": 8517
+ },
+ {
+ "epoch": 4.0274231678487,
+ "grad_norm": 3.109083890914917,
+ "learning_rate": 1.2366975954052767e-06,
+ "loss": 0.3317,
+ "step": 8518
+ },
+ {
+ "epoch": 4.027895981087471,
+ "grad_norm": 2.7713027000427246,
+ "learning_rate": 1.2361593148098634e-06,
+ "loss": 0.335,
+ "step": 8519
+ },
+ {
+ "epoch": 4.028368794326241,
+ "grad_norm": 2.9302117824554443,
+ "learning_rate": 1.2356211129082673e-06,
+ "loss": 0.3054,
+ "step": 8520
+ },
+ {
+ "epoch": 4.0288416075650115,
+ "grad_norm": 3.1805200576782227,
+ "learning_rate": 1.2350829897339996e-06,
+ "loss": 0.3219,
+ "step": 8521
+ },
+ {
+ "epoch": 4.029314420803782,
+ "grad_norm": 3.2687618732452393,
+ "learning_rate": 1.2345449453205688e-06,
+ "loss": 0.3966,
+ "step": 8522
+ },
+ {
+ "epoch": 4.029787234042553,
+ "grad_norm": 3.2010693550109863,
+ "learning_rate": 1.2340069797014741e-06,
+ "loss": 0.3547,
+ "step": 8523
+ },
+ {
+ "epoch": 4.030260047281324,
+ "grad_norm": 2.7061285972595215,
+ "learning_rate": 1.233469092910215e-06,
+ "loss": 0.2829,
+ "step": 8524
+ },
+ {
+ "epoch": 4.030732860520095,
+ "grad_norm": 3.1565401554107666,
+ "learning_rate": 1.2329312849802817e-06,
+ "loss": 0.3376,
+ "step": 8525
+ },
+ {
+ "epoch": 4.0312056737588655,
+ "grad_norm": 2.8864760398864746,
+ "learning_rate": 1.2323935559451603e-06,
+ "loss": 0.3946,
+ "step": 8526
+ },
+ {
+ "epoch": 4.031678486997636,
+ "grad_norm": 3.4621710777282715,
+ "learning_rate": 1.2318559058383348e-06,
+ "loss": 0.3859,
+ "step": 8527
+ },
+ {
+ "epoch": 4.032151300236406,
+ "grad_norm": 3.074201822280884,
+ "learning_rate": 1.2313183346932806e-06,
+ "loss": 0.3583,
+ "step": 8528
+ },
+ {
+ "epoch": 4.032624113475177,
+ "grad_norm": 3.1746935844421387,
+ "learning_rate": 1.2307808425434715e-06,
+ "loss": 0.3766,
+ "step": 8529
+ },
+ {
+ "epoch": 4.033096926713948,
+ "grad_norm": 3.327202081680298,
+ "learning_rate": 1.2302434294223738e-06,
+ "loss": 0.3556,
+ "step": 8530
+ },
+ {
+ "epoch": 4.033569739952719,
+ "grad_norm": 3.375643730163574,
+ "learning_rate": 1.2297060953634496e-06,
+ "loss": 0.3574,
+ "step": 8531
+ },
+ {
+ "epoch": 4.034042553191489,
+ "grad_norm": 2.8553316593170166,
+ "learning_rate": 1.2291688404001573e-06,
+ "loss": 0.2807,
+ "step": 8532
+ },
+ {
+ "epoch": 4.03451536643026,
+ "grad_norm": 3.439772367477417,
+ "learning_rate": 1.2286316645659492e-06,
+ "loss": 0.3519,
+ "step": 8533
+ },
+ {
+ "epoch": 4.034988179669031,
+ "grad_norm": 2.794694662094116,
+ "learning_rate": 1.2280945678942724e-06,
+ "loss": 0.3117,
+ "step": 8534
+ },
+ {
+ "epoch": 4.035460992907802,
+ "grad_norm": 2.9869043827056885,
+ "learning_rate": 1.2275575504185697e-06,
+ "loss": 0.3663,
+ "step": 8535
+ },
+ {
+ "epoch": 4.035933806146572,
+ "grad_norm": 2.711435317993164,
+ "learning_rate": 1.2270206121722777e-06,
+ "loss": 0.3547,
+ "step": 8536
+ },
+ {
+ "epoch": 4.0364066193853425,
+ "grad_norm": 2.843391180038452,
+ "learning_rate": 1.2264837531888317e-06,
+ "loss": 0.3124,
+ "step": 8537
+ },
+ {
+ "epoch": 4.036879432624113,
+ "grad_norm": 3.2082388401031494,
+ "learning_rate": 1.225946973501658e-06,
+ "loss": 0.3573,
+ "step": 8538
+ },
+ {
+ "epoch": 4.037352245862884,
+ "grad_norm": 2.799604654312134,
+ "learning_rate": 1.2254102731441786e-06,
+ "loss": 0.3234,
+ "step": 8539
+ },
+ {
+ "epoch": 4.037825059101655,
+ "grad_norm": 2.682777166366577,
+ "learning_rate": 1.2248736521498137e-06,
+ "loss": 0.3087,
+ "step": 8540
+ },
+ {
+ "epoch": 4.038297872340426,
+ "grad_norm": 2.8138248920440674,
+ "learning_rate": 1.2243371105519741e-06,
+ "loss": 0.3668,
+ "step": 8541
+ },
+ {
+ "epoch": 4.0387706855791965,
+ "grad_norm": 3.3388478755950928,
+ "learning_rate": 1.2238006483840702e-06,
+ "loss": 0.3294,
+ "step": 8542
+ },
+ {
+ "epoch": 4.039243498817967,
+ "grad_norm": 3.06247615814209,
+ "learning_rate": 1.2232642656795039e-06,
+ "loss": 0.3348,
+ "step": 8543
+ },
+ {
+ "epoch": 4.039716312056737,
+ "grad_norm": 2.742628335952759,
+ "learning_rate": 1.2227279624716724e-06,
+ "loss": 0.3427,
+ "step": 8544
+ },
+ {
+ "epoch": 4.040189125295508,
+ "grad_norm": 3.0785365104675293,
+ "learning_rate": 1.222191738793971e-06,
+ "loss": 0.3762,
+ "step": 8545
+ },
+ {
+ "epoch": 4.040661938534279,
+ "grad_norm": 3.0352790355682373,
+ "learning_rate": 1.2216555946797862e-06,
+ "loss": 0.3311,
+ "step": 8546
+ },
+ {
+ "epoch": 4.04113475177305,
+ "grad_norm": 3.1949729919433594,
+ "learning_rate": 1.2211195301625028e-06,
+ "loss": 0.3429,
+ "step": 8547
+ },
+ {
+ "epoch": 4.04160756501182,
+ "grad_norm": 3.214021921157837,
+ "learning_rate": 1.2205835452754989e-06,
+ "loss": 0.3528,
+ "step": 8548
+ },
+ {
+ "epoch": 4.042080378250591,
+ "grad_norm": 3.206296443939209,
+ "learning_rate": 1.2200476400521474e-06,
+ "loss": 0.3499,
+ "step": 8549
+ },
+ {
+ "epoch": 4.042553191489362,
+ "grad_norm": 3.0067825317382812,
+ "learning_rate": 1.2195118145258167e-06,
+ "loss": 0.3597,
+ "step": 8550
+ },
+ {
+ "epoch": 4.043026004728133,
+ "grad_norm": 2.7811057567596436,
+ "learning_rate": 1.21897606872987e-06,
+ "loss": 0.3268,
+ "step": 8551
+ },
+ {
+ "epoch": 4.043498817966903,
+ "grad_norm": 3.1679844856262207,
+ "learning_rate": 1.218440402697667e-06,
+ "loss": 0.4025,
+ "step": 8552
+ },
+ {
+ "epoch": 4.0439716312056735,
+ "grad_norm": 3.2010326385498047,
+ "learning_rate": 1.217904816462561e-06,
+ "loss": 0.3426,
+ "step": 8553
+ },
+ {
+ "epoch": 4.044444444444444,
+ "grad_norm": 3.381863832473755,
+ "learning_rate": 1.217369310057899e-06,
+ "loss": 0.3693,
+ "step": 8554
+ },
+ {
+ "epoch": 4.044917257683215,
+ "grad_norm": 3.471402168273926,
+ "learning_rate": 1.2168338835170267e-06,
+ "loss": 0.3977,
+ "step": 8555
+ },
+ {
+ "epoch": 4.045390070921986,
+ "grad_norm": 3.0549192428588867,
+ "learning_rate": 1.2162985368732813e-06,
+ "loss": 0.3262,
+ "step": 8556
+ },
+ {
+ "epoch": 4.045862884160757,
+ "grad_norm": 3.02451229095459,
+ "learning_rate": 1.215763270159998e-06,
+ "loss": 0.3408,
+ "step": 8557
+ },
+ {
+ "epoch": 4.0463356973995275,
+ "grad_norm": 3.1335513591766357,
+ "learning_rate": 1.215228083410505e-06,
+ "loss": 0.3275,
+ "step": 8558
+ },
+ {
+ "epoch": 4.046808510638298,
+ "grad_norm": 3.379655599594116,
+ "learning_rate": 1.2146929766581242e-06,
+ "loss": 0.3511,
+ "step": 8559
+ },
+ {
+ "epoch": 4.047281323877068,
+ "grad_norm": 3.210146903991699,
+ "learning_rate": 1.2141579499361772e-06,
+ "loss": 0.3607,
+ "step": 8560
+ },
+ {
+ "epoch": 4.047754137115839,
+ "grad_norm": 3.3693792819976807,
+ "learning_rate": 1.2136230032779753e-06,
+ "loss": 0.3642,
+ "step": 8561
+ },
+ {
+ "epoch": 4.04822695035461,
+ "grad_norm": 3.0397274494171143,
+ "learning_rate": 1.2130881367168292e-06,
+ "loss": 0.3376,
+ "step": 8562
+ },
+ {
+ "epoch": 4.048699763593381,
+ "grad_norm": 3.119372606277466,
+ "learning_rate": 1.212553350286042e-06,
+ "loss": 0.3581,
+ "step": 8563
+ },
+ {
+ "epoch": 4.049172576832151,
+ "grad_norm": 2.9431848526000977,
+ "learning_rate": 1.2120186440189124e-06,
+ "loss": 0.3453,
+ "step": 8564
+ },
+ {
+ "epoch": 4.049645390070922,
+ "grad_norm": 3.256748914718628,
+ "learning_rate": 1.2114840179487333e-06,
+ "loss": 0.3766,
+ "step": 8565
+ },
+ {
+ "epoch": 4.050118203309693,
+ "grad_norm": 2.792759656906128,
+ "learning_rate": 1.2109494721087953e-06,
+ "loss": 0.3396,
+ "step": 8566
+ },
+ {
+ "epoch": 4.050591016548464,
+ "grad_norm": 2.9790122509002686,
+ "learning_rate": 1.2104150065323813e-06,
+ "loss": 0.3631,
+ "step": 8567
+ },
+ {
+ "epoch": 4.051063829787234,
+ "grad_norm": 2.7998805046081543,
+ "learning_rate": 1.2098806212527705e-06,
+ "loss": 0.3442,
+ "step": 8568
+ },
+ {
+ "epoch": 4.0515366430260045,
+ "grad_norm": 3.1292848587036133,
+ "learning_rate": 1.2093463163032351e-06,
+ "loss": 0.3798,
+ "step": 8569
+ },
+ {
+ "epoch": 4.052009456264775,
+ "grad_norm": 3.156205892562866,
+ "learning_rate": 1.2088120917170465e-06,
+ "loss": 0.309,
+ "step": 8570
+ },
+ {
+ "epoch": 4.052482269503546,
+ "grad_norm": 2.8891193866729736,
+ "learning_rate": 1.208277947527467e-06,
+ "loss": 0.2989,
+ "step": 8571
+ },
+ {
+ "epoch": 4.052955082742317,
+ "grad_norm": 3.087719678878784,
+ "learning_rate": 1.2077438837677548e-06,
+ "loss": 0.3348,
+ "step": 8572
+ },
+ {
+ "epoch": 4.053427895981088,
+ "grad_norm": 3.345583915710449,
+ "learning_rate": 1.2072099004711657e-06,
+ "loss": 0.3395,
+ "step": 8573
+ },
+ {
+ "epoch": 4.0539007092198585,
+ "grad_norm": 2.9834377765655518,
+ "learning_rate": 1.2066759976709463e-06,
+ "loss": 0.3252,
+ "step": 8574
+ },
+ {
+ "epoch": 4.054373522458629,
+ "grad_norm": 3.0764353275299072,
+ "learning_rate": 1.2061421754003425e-06,
+ "loss": 0.3467,
+ "step": 8575
+ },
+ {
+ "epoch": 4.054846335697399,
+ "grad_norm": 3.332232713699341,
+ "learning_rate": 1.2056084336925919e-06,
+ "loss": 0.3448,
+ "step": 8576
+ },
+ {
+ "epoch": 4.05531914893617,
+ "grad_norm": 3.1885993480682373,
+ "learning_rate": 1.2050747725809275e-06,
+ "loss": 0.325,
+ "step": 8577
+ },
+ {
+ "epoch": 4.055791962174941,
+ "grad_norm": 3.2727091312408447,
+ "learning_rate": 1.2045411920985798e-06,
+ "loss": 0.3755,
+ "step": 8578
+ },
+ {
+ "epoch": 4.0562647754137116,
+ "grad_norm": 3.0687687397003174,
+ "learning_rate": 1.2040076922787708e-06,
+ "loss": 0.2791,
+ "step": 8579
+ },
+ {
+ "epoch": 4.056737588652482,
+ "grad_norm": 3.2538771629333496,
+ "learning_rate": 1.2034742731547211e-06,
+ "loss": 0.3409,
+ "step": 8580
+ },
+ {
+ "epoch": 4.057210401891253,
+ "grad_norm": 3.237423896789551,
+ "learning_rate": 1.2029409347596429e-06,
+ "loss": 0.3803,
+ "step": 8581
+ },
+ {
+ "epoch": 4.057683215130024,
+ "grad_norm": 3.3347854614257812,
+ "learning_rate": 1.2024076771267457e-06,
+ "loss": 0.3123,
+ "step": 8582
+ },
+ {
+ "epoch": 4.058156028368795,
+ "grad_norm": 3.1294021606445312,
+ "learning_rate": 1.2018745002892327e-06,
+ "loss": 0.33,
+ "step": 8583
+ },
+ {
+ "epoch": 4.058628841607565,
+ "grad_norm": 2.9440014362335205,
+ "learning_rate": 1.2013414042803013e-06,
+ "loss": 0.3698,
+ "step": 8584
+ },
+ {
+ "epoch": 4.0591016548463354,
+ "grad_norm": 3.602764129638672,
+ "learning_rate": 1.200808389133147e-06,
+ "loss": 0.3733,
+ "step": 8585
+ },
+ {
+ "epoch": 4.059574468085106,
+ "grad_norm": 3.2689952850341797,
+ "learning_rate": 1.2002754548809578e-06,
+ "loss": 0.3188,
+ "step": 8586
+ },
+ {
+ "epoch": 4.060047281323877,
+ "grad_norm": 3.15454363822937,
+ "learning_rate": 1.199742601556916e-06,
+ "loss": 0.3493,
+ "step": 8587
+ },
+ {
+ "epoch": 4.060520094562648,
+ "grad_norm": 2.843860387802124,
+ "learning_rate": 1.1992098291942016e-06,
+ "loss": 0.3277,
+ "step": 8588
+ },
+ {
+ "epoch": 4.060992907801419,
+ "grad_norm": 3.0749056339263916,
+ "learning_rate": 1.1986771378259876e-06,
+ "loss": 0.3465,
+ "step": 8589
+ },
+ {
+ "epoch": 4.061465721040189,
+ "grad_norm": 3.3339948654174805,
+ "learning_rate": 1.1981445274854412e-06,
+ "loss": 0.3507,
+ "step": 8590
+ },
+ {
+ "epoch": 4.06193853427896,
+ "grad_norm": 2.7992780208587646,
+ "learning_rate": 1.1976119982057275e-06,
+ "loss": 0.302,
+ "step": 8591
+ },
+ {
+ "epoch": 4.06241134751773,
+ "grad_norm": 3.0862269401550293,
+ "learning_rate": 1.1970795500200028e-06,
+ "loss": 0.3365,
+ "step": 8592
+ },
+ {
+ "epoch": 4.062884160756501,
+ "grad_norm": 3.263456106185913,
+ "learning_rate": 1.1965471829614222e-06,
+ "loss": 0.3764,
+ "step": 8593
+ },
+ {
+ "epoch": 4.063356973995272,
+ "grad_norm": 3.0682623386383057,
+ "learning_rate": 1.1960148970631332e-06,
+ "loss": 0.3488,
+ "step": 8594
+ },
+ {
+ "epoch": 4.0638297872340425,
+ "grad_norm": 2.8910646438598633,
+ "learning_rate": 1.195482692358278e-06,
+ "loss": 0.3224,
+ "step": 8595
+ },
+ {
+ "epoch": 4.064302600472813,
+ "grad_norm": 3.170072555541992,
+ "learning_rate": 1.1949505688799961e-06,
+ "loss": 0.3058,
+ "step": 8596
+ },
+ {
+ "epoch": 4.064775413711584,
+ "grad_norm": 3.018674373626709,
+ "learning_rate": 1.19441852666142e-06,
+ "loss": 0.3824,
+ "step": 8597
+ },
+ {
+ "epoch": 4.065248226950355,
+ "grad_norm": 3.0038044452667236,
+ "learning_rate": 1.1938865657356773e-06,
+ "loss": 0.3657,
+ "step": 8598
+ },
+ {
+ "epoch": 4.065721040189126,
+ "grad_norm": 3.248204469680786,
+ "learning_rate": 1.193354686135891e-06,
+ "loss": 0.3305,
+ "step": 8599
+ },
+ {
+ "epoch": 4.066193853427896,
+ "grad_norm": 3.144714832305908,
+ "learning_rate": 1.192822887895178e-06,
+ "loss": 0.3395,
+ "step": 8600
+ },
+ {
+ "epoch": 4.066666666666666,
+ "grad_norm": 2.9457240104675293,
+ "learning_rate": 1.1922911710466531e-06,
+ "loss": 0.3288,
+ "step": 8601
+ },
+ {
+ "epoch": 4.067139479905437,
+ "grad_norm": 3.1602869033813477,
+ "learning_rate": 1.1917595356234218e-06,
+ "loss": 0.3713,
+ "step": 8602
+ },
+ {
+ "epoch": 4.067612293144208,
+ "grad_norm": 3.0820837020874023,
+ "learning_rate": 1.1912279816585888e-06,
+ "loss": 0.2987,
+ "step": 8603
+ },
+ {
+ "epoch": 4.068085106382979,
+ "grad_norm": 3.0366809368133545,
+ "learning_rate": 1.1906965091852502e-06,
+ "loss": 0.4151,
+ "step": 8604
+ },
+ {
+ "epoch": 4.06855791962175,
+ "grad_norm": 3.229402780532837,
+ "learning_rate": 1.190165118236498e-06,
+ "loss": 0.321,
+ "step": 8605
+ },
+ {
+ "epoch": 4.06903073286052,
+ "grad_norm": 2.832232713699341,
+ "learning_rate": 1.1896338088454217e-06,
+ "loss": 0.3551,
+ "step": 8606
+ },
+ {
+ "epoch": 4.069503546099291,
+ "grad_norm": 3.5618600845336914,
+ "learning_rate": 1.1891025810451012e-06,
+ "loss": 0.3704,
+ "step": 8607
+ },
+ {
+ "epoch": 4.069976359338061,
+ "grad_norm": 3.287827491760254,
+ "learning_rate": 1.1885714348686158e-06,
+ "loss": 0.3469,
+ "step": 8608
+ },
+ {
+ "epoch": 4.070449172576832,
+ "grad_norm": 3.468825101852417,
+ "learning_rate": 1.188040370349037e-06,
+ "loss": 0.3687,
+ "step": 8609
+ },
+ {
+ "epoch": 4.070921985815603,
+ "grad_norm": 3.2931180000305176,
+ "learning_rate": 1.1875093875194302e-06,
+ "loss": 0.3832,
+ "step": 8610
+ },
+ {
+ "epoch": 4.0713947990543735,
+ "grad_norm": 2.9613003730773926,
+ "learning_rate": 1.18697848641286e-06,
+ "loss": 0.3314,
+ "step": 8611
+ },
+ {
+ "epoch": 4.071867612293144,
+ "grad_norm": 3.1507649421691895,
+ "learning_rate": 1.1864476670623816e-06,
+ "loss": 0.3153,
+ "step": 8612
+ },
+ {
+ "epoch": 4.072340425531915,
+ "grad_norm": 2.844064950942993,
+ "learning_rate": 1.1859169295010478e-06,
+ "loss": 0.3566,
+ "step": 8613
+ },
+ {
+ "epoch": 4.072813238770686,
+ "grad_norm": 3.227264881134033,
+ "learning_rate": 1.1853862737619042e-06,
+ "loss": 0.3717,
+ "step": 8614
+ },
+ {
+ "epoch": 4.073286052009456,
+ "grad_norm": 2.9416239261627197,
+ "learning_rate": 1.1848556998779922e-06,
+ "loss": 0.3438,
+ "step": 8615
+ },
+ {
+ "epoch": 4.073758865248227,
+ "grad_norm": 4.1662492752075195,
+ "learning_rate": 1.18432520788235e-06,
+ "loss": 0.362,
+ "step": 8616
+ },
+ {
+ "epoch": 4.074231678486997,
+ "grad_norm": 3.47951602935791,
+ "learning_rate": 1.183794797808008e-06,
+ "loss": 0.3672,
+ "step": 8617
+ },
+ {
+ "epoch": 4.074704491725768,
+ "grad_norm": 2.998969793319702,
+ "learning_rate": 1.1832644696879919e-06,
+ "loss": 0.3281,
+ "step": 8618
+ },
+ {
+ "epoch": 4.075177304964539,
+ "grad_norm": 2.956167221069336,
+ "learning_rate": 1.182734223555324e-06,
+ "loss": 0.3059,
+ "step": 8619
+ },
+ {
+ "epoch": 4.07565011820331,
+ "grad_norm": 3.447821855545044,
+ "learning_rate": 1.1822040594430195e-06,
+ "loss": 0.333,
+ "step": 8620
+ },
+ {
+ "epoch": 4.076122931442081,
+ "grad_norm": 3.072972059249878,
+ "learning_rate": 1.1816739773840905e-06,
+ "loss": 0.3737,
+ "step": 8621
+ },
+ {
+ "epoch": 4.076595744680851,
+ "grad_norm": 3.142913341522217,
+ "learning_rate": 1.1811439774115424e-06,
+ "loss": 0.3697,
+ "step": 8622
+ },
+ {
+ "epoch": 4.077068557919622,
+ "grad_norm": 3.4997763633728027,
+ "learning_rate": 1.1806140595583745e-06,
+ "loss": 0.4177,
+ "step": 8623
+ },
+ {
+ "epoch": 4.077541371158392,
+ "grad_norm": 3.032951831817627,
+ "learning_rate": 1.1800842238575853e-06,
+ "loss": 0.351,
+ "step": 8624
+ },
+ {
+ "epoch": 4.078014184397163,
+ "grad_norm": 2.8878438472747803,
+ "learning_rate": 1.1795544703421625e-06,
+ "loss": 0.3409,
+ "step": 8625
+ },
+ {
+ "epoch": 4.078486997635934,
+ "grad_norm": 2.931614637374878,
+ "learning_rate": 1.1790247990450936e-06,
+ "loss": 0.3416,
+ "step": 8626
+ },
+ {
+ "epoch": 4.0789598108747045,
+ "grad_norm": 3.1719822883605957,
+ "learning_rate": 1.1784952099993586e-06,
+ "loss": 0.3574,
+ "step": 8627
+ },
+ {
+ "epoch": 4.079432624113475,
+ "grad_norm": 2.960068464279175,
+ "learning_rate": 1.1779657032379322e-06,
+ "loss": 0.3557,
+ "step": 8628
+ },
+ {
+ "epoch": 4.079905437352246,
+ "grad_norm": 3.1410937309265137,
+ "learning_rate": 1.1774362787937843e-06,
+ "loss": 0.3839,
+ "step": 8629
+ },
+ {
+ "epoch": 4.080378250591017,
+ "grad_norm": 3.596153736114502,
+ "learning_rate": 1.1769069366998793e-06,
+ "loss": 0.3135,
+ "step": 8630
+ },
+ {
+ "epoch": 4.080851063829787,
+ "grad_norm": 3.385826587677002,
+ "learning_rate": 1.1763776769891786e-06,
+ "loss": 0.3624,
+ "step": 8631
+ },
+ {
+ "epoch": 4.081323877068558,
+ "grad_norm": 3.2531018257141113,
+ "learning_rate": 1.175848499694636e-06,
+ "loss": 0.3593,
+ "step": 8632
+ },
+ {
+ "epoch": 4.081796690307328,
+ "grad_norm": 3.3864004611968994,
+ "learning_rate": 1.1753194048492004e-06,
+ "loss": 0.3929,
+ "step": 8633
+ },
+ {
+ "epoch": 4.082269503546099,
+ "grad_norm": 2.8734285831451416,
+ "learning_rate": 1.1747903924858175e-06,
+ "loss": 0.3145,
+ "step": 8634
+ },
+ {
+ "epoch": 4.08274231678487,
+ "grad_norm": 3.3261659145355225,
+ "learning_rate": 1.174261462637426e-06,
+ "loss": 0.3351,
+ "step": 8635
+ },
+ {
+ "epoch": 4.083215130023641,
+ "grad_norm": 3.413990020751953,
+ "learning_rate": 1.1737326153369594e-06,
+ "loss": 0.3984,
+ "step": 8636
+ },
+ {
+ "epoch": 4.083687943262412,
+ "grad_norm": 3.311741590499878,
+ "learning_rate": 1.1732038506173481e-06,
+ "loss": 0.3716,
+ "step": 8637
+ },
+ {
+ "epoch": 4.084160756501182,
+ "grad_norm": 3.691573143005371,
+ "learning_rate": 1.1726751685115142e-06,
+ "loss": 0.3542,
+ "step": 8638
+ },
+ {
+ "epoch": 4.084633569739952,
+ "grad_norm": 3.1951167583465576,
+ "learning_rate": 1.1721465690523784e-06,
+ "loss": 0.3683,
+ "step": 8639
+ },
+ {
+ "epoch": 4.085106382978723,
+ "grad_norm": 3.1731514930725098,
+ "learning_rate": 1.1716180522728534e-06,
+ "loss": 0.3552,
+ "step": 8640
+ },
+ {
+ "epoch": 4.085579196217494,
+ "grad_norm": 3.1588845252990723,
+ "learning_rate": 1.1710896182058465e-06,
+ "loss": 0.3908,
+ "step": 8641
+ },
+ {
+ "epoch": 4.086052009456265,
+ "grad_norm": 3.6902294158935547,
+ "learning_rate": 1.1705612668842628e-06,
+ "loss": 0.4099,
+ "step": 8642
+ },
+ {
+ "epoch": 4.0865248226950355,
+ "grad_norm": 4.56397819519043,
+ "learning_rate": 1.1700329983409988e-06,
+ "loss": 0.3456,
+ "step": 8643
+ },
+ {
+ "epoch": 4.086997635933806,
+ "grad_norm": 2.924715995788574,
+ "learning_rate": 1.1695048126089492e-06,
+ "loss": 0.3885,
+ "step": 8644
+ },
+ {
+ "epoch": 4.087470449172577,
+ "grad_norm": 3.537550687789917,
+ "learning_rate": 1.1689767097210009e-06,
+ "loss": 0.3551,
+ "step": 8645
+ },
+ {
+ "epoch": 4.087943262411348,
+ "grad_norm": 3.0198440551757812,
+ "learning_rate": 1.1684486897100364e-06,
+ "loss": 0.3448,
+ "step": 8646
+ },
+ {
+ "epoch": 4.088416075650118,
+ "grad_norm": 3.448965072631836,
+ "learning_rate": 1.1679207526089334e-06,
+ "loss": 0.3252,
+ "step": 8647
+ },
+ {
+ "epoch": 4.088888888888889,
+ "grad_norm": 3.057326078414917,
+ "learning_rate": 1.167392898450563e-06,
+ "loss": 0.3231,
+ "step": 8648
+ },
+ {
+ "epoch": 4.089361702127659,
+ "grad_norm": 3.0788655281066895,
+ "learning_rate": 1.1668651272677948e-06,
+ "loss": 0.3273,
+ "step": 8649
+ },
+ {
+ "epoch": 4.08983451536643,
+ "grad_norm": 2.9126291275024414,
+ "learning_rate": 1.1663374390934893e-06,
+ "loss": 0.3162,
+ "step": 8650
+ },
+ {
+ "epoch": 4.090307328605201,
+ "grad_norm": 3.278874635696411,
+ "learning_rate": 1.1658098339605027e-06,
+ "loss": 0.3123,
+ "step": 8651
+ },
+ {
+ "epoch": 4.090780141843972,
+ "grad_norm": 2.8490889072418213,
+ "learning_rate": 1.1652823119016882e-06,
+ "loss": 0.3408,
+ "step": 8652
+ },
+ {
+ "epoch": 4.091252955082743,
+ "grad_norm": 3.0473995208740234,
+ "learning_rate": 1.164754872949891e-06,
+ "loss": 0.3349,
+ "step": 8653
+ },
+ {
+ "epoch": 4.091725768321513,
+ "grad_norm": 2.9052987098693848,
+ "learning_rate": 1.1642275171379535e-06,
+ "loss": 0.3113,
+ "step": 8654
+ },
+ {
+ "epoch": 4.092198581560283,
+ "grad_norm": 3.1060919761657715,
+ "learning_rate": 1.1637002444987116e-06,
+ "loss": 0.3139,
+ "step": 8655
+ },
+ {
+ "epoch": 4.092671394799054,
+ "grad_norm": 3.172394275665283,
+ "learning_rate": 1.163173055064995e-06,
+ "loss": 0.3555,
+ "step": 8656
+ },
+ {
+ "epoch": 4.093144208037825,
+ "grad_norm": 3.36523699760437,
+ "learning_rate": 1.1626459488696313e-06,
+ "loss": 0.3835,
+ "step": 8657
+ },
+ {
+ "epoch": 4.093617021276596,
+ "grad_norm": 2.9513938426971436,
+ "learning_rate": 1.1621189259454393e-06,
+ "loss": 0.3432,
+ "step": 8658
+ },
+ {
+ "epoch": 4.0940898345153665,
+ "grad_norm": 2.8415515422821045,
+ "learning_rate": 1.1615919863252365e-06,
+ "loss": 0.3494,
+ "step": 8659
+ },
+ {
+ "epoch": 4.094562647754137,
+ "grad_norm": 3.3759984970092773,
+ "learning_rate": 1.1610651300418315e-06,
+ "loss": 0.3519,
+ "step": 8660
+ },
+ {
+ "epoch": 4.095035460992908,
+ "grad_norm": 3.1927380561828613,
+ "learning_rate": 1.1605383571280304e-06,
+ "loss": 0.3675,
+ "step": 8661
+ },
+ {
+ "epoch": 4.095508274231679,
+ "grad_norm": 2.800658941268921,
+ "learning_rate": 1.1600116676166321e-06,
+ "loss": 0.3291,
+ "step": 8662
+ },
+ {
+ "epoch": 4.095981087470449,
+ "grad_norm": 2.9948630332946777,
+ "learning_rate": 1.1594850615404316e-06,
+ "loss": 0.332,
+ "step": 8663
+ },
+ {
+ "epoch": 4.0964539007092196,
+ "grad_norm": 3.032003879547119,
+ "learning_rate": 1.1589585389322176e-06,
+ "loss": 0.3583,
+ "step": 8664
+ },
+ {
+ "epoch": 4.09692671394799,
+ "grad_norm": 2.9765310287475586,
+ "learning_rate": 1.1584320998247757e-06,
+ "loss": 0.3296,
+ "step": 8665
+ },
+ {
+ "epoch": 4.097399527186761,
+ "grad_norm": 3.049954414367676,
+ "learning_rate": 1.1579057442508838e-06,
+ "loss": 0.4007,
+ "step": 8666
+ },
+ {
+ "epoch": 4.097872340425532,
+ "grad_norm": 3.3874928951263428,
+ "learning_rate": 1.1573794722433168e-06,
+ "loss": 0.3856,
+ "step": 8667
+ },
+ {
+ "epoch": 4.098345153664303,
+ "grad_norm": 2.56701397895813,
+ "learning_rate": 1.1568532838348432e-06,
+ "loss": 0.3291,
+ "step": 8668
+ },
+ {
+ "epoch": 4.0988179669030735,
+ "grad_norm": 2.956408739089966,
+ "learning_rate": 1.1563271790582247e-06,
+ "loss": 0.3538,
+ "step": 8669
+ },
+ {
+ "epoch": 4.099290780141844,
+ "grad_norm": 3.827467679977417,
+ "learning_rate": 1.1558011579462225e-06,
+ "loss": 0.3764,
+ "step": 8670
+ },
+ {
+ "epoch": 4.099763593380614,
+ "grad_norm": 3.3271424770355225,
+ "learning_rate": 1.1552752205315867e-06,
+ "loss": 0.339,
+ "step": 8671
+ },
+ {
+ "epoch": 4.100236406619385,
+ "grad_norm": 3.0050785541534424,
+ "learning_rate": 1.1547493668470675e-06,
+ "loss": 0.3764,
+ "step": 8672
+ },
+ {
+ "epoch": 4.100709219858156,
+ "grad_norm": 2.6030385494232178,
+ "learning_rate": 1.1542235969254065e-06,
+ "loss": 0.3507,
+ "step": 8673
+ },
+ {
+ "epoch": 4.101182033096927,
+ "grad_norm": 3.081695556640625,
+ "learning_rate": 1.1536979107993402e-06,
+ "loss": 0.3386,
+ "step": 8674
+ },
+ {
+ "epoch": 4.101654846335697,
+ "grad_norm": 2.7685163021087646,
+ "learning_rate": 1.1531723085016025e-06,
+ "loss": 0.3456,
+ "step": 8675
+ },
+ {
+ "epoch": 4.102127659574468,
+ "grad_norm": 3.037252902984619,
+ "learning_rate": 1.1526467900649195e-06,
+ "loss": 0.3489,
+ "step": 8676
+ },
+ {
+ "epoch": 4.102600472813239,
+ "grad_norm": 2.9675045013427734,
+ "learning_rate": 1.1521213555220129e-06,
+ "loss": 0.3638,
+ "step": 8677
+ },
+ {
+ "epoch": 4.10307328605201,
+ "grad_norm": 3.3377575874328613,
+ "learning_rate": 1.1515960049055994e-06,
+ "loss": 0.3557,
+ "step": 8678
+ },
+ {
+ "epoch": 4.10354609929078,
+ "grad_norm": 2.7452030181884766,
+ "learning_rate": 1.1510707382483888e-06,
+ "loss": 0.285,
+ "step": 8679
+ },
+ {
+ "epoch": 4.1040189125295505,
+ "grad_norm": 2.9602560997009277,
+ "learning_rate": 1.1505455555830897e-06,
+ "loss": 0.321,
+ "step": 8680
+ },
+ {
+ "epoch": 4.104491725768321,
+ "grad_norm": 3.122945547103882,
+ "learning_rate": 1.1500204569424007e-06,
+ "loss": 0.375,
+ "step": 8681
+ },
+ {
+ "epoch": 4.104964539007092,
+ "grad_norm": 3.334885835647583,
+ "learning_rate": 1.149495442359019e-06,
+ "loss": 0.3399,
+ "step": 8682
+ },
+ {
+ "epoch": 4.105437352245863,
+ "grad_norm": 3.3663594722747803,
+ "learning_rate": 1.1489705118656346e-06,
+ "loss": 0.3482,
+ "step": 8683
+ },
+ {
+ "epoch": 4.105910165484634,
+ "grad_norm": 2.9761641025543213,
+ "learning_rate": 1.1484456654949313e-06,
+ "loss": 0.3317,
+ "step": 8684
+ },
+ {
+ "epoch": 4.1063829787234045,
+ "grad_norm": 3.486905574798584,
+ "learning_rate": 1.147920903279591e-06,
+ "loss": 0.3737,
+ "step": 8685
+ },
+ {
+ "epoch": 4.106855791962175,
+ "grad_norm": 3.4820523262023926,
+ "learning_rate": 1.1473962252522875e-06,
+ "loss": 0.4135,
+ "step": 8686
+ },
+ {
+ "epoch": 4.107328605200945,
+ "grad_norm": 3.314117431640625,
+ "learning_rate": 1.146871631445689e-06,
+ "loss": 0.369,
+ "step": 8687
+ },
+ {
+ "epoch": 4.107801418439716,
+ "grad_norm": 2.9497411251068115,
+ "learning_rate": 1.1463471218924615e-06,
+ "loss": 0.3233,
+ "step": 8688
+ },
+ {
+ "epoch": 4.108274231678487,
+ "grad_norm": 3.1337075233459473,
+ "learning_rate": 1.1458226966252624e-06,
+ "loss": 0.401,
+ "step": 8689
+ },
+ {
+ "epoch": 4.108747044917258,
+ "grad_norm": 3.0163166522979736,
+ "learning_rate": 1.1452983556767473e-06,
+ "loss": 0.3812,
+ "step": 8690
+ },
+ {
+ "epoch": 4.109219858156028,
+ "grad_norm": 2.976491928100586,
+ "learning_rate": 1.1447740990795629e-06,
+ "loss": 0.3508,
+ "step": 8691
+ },
+ {
+ "epoch": 4.109692671394799,
+ "grad_norm": 3.2449910640716553,
+ "learning_rate": 1.144249926866353e-06,
+ "loss": 0.3056,
+ "step": 8692
+ },
+ {
+ "epoch": 4.11016548463357,
+ "grad_norm": 2.562558650970459,
+ "learning_rate": 1.1437258390697553e-06,
+ "loss": 0.2878,
+ "step": 8693
+ },
+ {
+ "epoch": 4.110638297872341,
+ "grad_norm": 3.1823108196258545,
+ "learning_rate": 1.1432018357224017e-06,
+ "loss": 0.2849,
+ "step": 8694
+ },
+ {
+ "epoch": 4.111111111111111,
+ "grad_norm": 2.9045653343200684,
+ "learning_rate": 1.1426779168569217e-06,
+ "loss": 0.3264,
+ "step": 8695
+ },
+ {
+ "epoch": 4.1115839243498815,
+ "grad_norm": 2.7991254329681396,
+ "learning_rate": 1.1421540825059355e-06,
+ "loss": 0.3427,
+ "step": 8696
+ },
+ {
+ "epoch": 4.112056737588652,
+ "grad_norm": 2.9184927940368652,
+ "learning_rate": 1.14163033270206e-06,
+ "loss": 0.3073,
+ "step": 8697
+ },
+ {
+ "epoch": 4.112529550827423,
+ "grad_norm": 3.189335584640503,
+ "learning_rate": 1.1411066674779084e-06,
+ "loss": 0.3836,
+ "step": 8698
+ },
+ {
+ "epoch": 4.113002364066194,
+ "grad_norm": 2.899711847305298,
+ "learning_rate": 1.140583086866085e-06,
+ "loss": 0.3378,
+ "step": 8699
+ },
+ {
+ "epoch": 4.113475177304965,
+ "grad_norm": 3.167665481567383,
+ "learning_rate": 1.1400595908991927e-06,
+ "loss": 0.3273,
+ "step": 8700
+ },
+ {
+ "epoch": 4.1139479905437355,
+ "grad_norm": 3.2930212020874023,
+ "learning_rate": 1.1395361796098268e-06,
+ "loss": 0.3221,
+ "step": 8701
+ },
+ {
+ "epoch": 4.114420803782506,
+ "grad_norm": 3.0603861808776855,
+ "learning_rate": 1.1390128530305764e-06,
+ "loss": 0.3371,
+ "step": 8702
+ },
+ {
+ "epoch": 4.114893617021276,
+ "grad_norm": 3.6339457035064697,
+ "learning_rate": 1.1384896111940289e-06,
+ "loss": 0.3986,
+ "step": 8703
+ },
+ {
+ "epoch": 4.115366430260047,
+ "grad_norm": 2.975799322128296,
+ "learning_rate": 1.1379664541327623e-06,
+ "loss": 0.3021,
+ "step": 8704
+ },
+ {
+ "epoch": 4.115839243498818,
+ "grad_norm": 2.9100987911224365,
+ "learning_rate": 1.1374433818793534e-06,
+ "loss": 0.3473,
+ "step": 8705
+ },
+ {
+ "epoch": 4.116312056737589,
+ "grad_norm": 2.9515233039855957,
+ "learning_rate": 1.1369203944663704e-06,
+ "loss": 0.3004,
+ "step": 8706
+ },
+ {
+ "epoch": 4.116784869976359,
+ "grad_norm": 3.283583879470825,
+ "learning_rate": 1.1363974919263774e-06,
+ "loss": 0.401,
+ "step": 8707
+ },
+ {
+ "epoch": 4.11725768321513,
+ "grad_norm": 3.307530641555786,
+ "learning_rate": 1.1358746742919325e-06,
+ "loss": 0.322,
+ "step": 8708
+ },
+ {
+ "epoch": 4.117730496453901,
+ "grad_norm": 3.6834614276885986,
+ "learning_rate": 1.135351941595591e-06,
+ "loss": 0.3703,
+ "step": 8709
+ },
+ {
+ "epoch": 4.118203309692672,
+ "grad_norm": 3.0829904079437256,
+ "learning_rate": 1.1348292938699e-06,
+ "loss": 0.3283,
+ "step": 8710
+ },
+ {
+ "epoch": 4.118676122931442,
+ "grad_norm": 2.914794921875,
+ "learning_rate": 1.1343067311474033e-06,
+ "loss": 0.337,
+ "step": 8711
+ },
+ {
+ "epoch": 4.1191489361702125,
+ "grad_norm": 3.550536870956421,
+ "learning_rate": 1.1337842534606368e-06,
+ "loss": 0.3752,
+ "step": 8712
+ },
+ {
+ "epoch": 4.119621749408983,
+ "grad_norm": 3.337012767791748,
+ "learning_rate": 1.1332618608421353e-06,
+ "loss": 0.3604,
+ "step": 8713
+ },
+ {
+ "epoch": 4.120094562647754,
+ "grad_norm": 2.7749485969543457,
+ "learning_rate": 1.1327395533244248e-06,
+ "loss": 0.3712,
+ "step": 8714
+ },
+ {
+ "epoch": 4.120567375886525,
+ "grad_norm": 3.571261405944824,
+ "learning_rate": 1.1322173309400258e-06,
+ "loss": 0.4148,
+ "step": 8715
+ },
+ {
+ "epoch": 4.121040189125296,
+ "grad_norm": 3.264871597290039,
+ "learning_rate": 1.1316951937214573e-06,
+ "loss": 0.3229,
+ "step": 8716
+ },
+ {
+ "epoch": 4.1215130023640665,
+ "grad_norm": 2.974625825881958,
+ "learning_rate": 1.131173141701228e-06,
+ "loss": 0.3372,
+ "step": 8717
+ },
+ {
+ "epoch": 4.121985815602837,
+ "grad_norm": 3.18060302734375,
+ "learning_rate": 1.1306511749118466e-06,
+ "loss": 0.4041,
+ "step": 8718
+ },
+ {
+ "epoch": 4.122458628841607,
+ "grad_norm": 2.7793190479278564,
+ "learning_rate": 1.1301292933858115e-06,
+ "loss": 0.3329,
+ "step": 8719
+ },
+ {
+ "epoch": 4.122931442080378,
+ "grad_norm": 3.0883100032806396,
+ "learning_rate": 1.1296074971556179e-06,
+ "loss": 0.3999,
+ "step": 8720
+ },
+ {
+ "epoch": 4.123404255319149,
+ "grad_norm": 2.984799385070801,
+ "learning_rate": 1.1290857862537573e-06,
+ "loss": 0.3432,
+ "step": 8721
+ },
+ {
+ "epoch": 4.12387706855792,
+ "grad_norm": 3.0691094398498535,
+ "learning_rate": 1.1285641607127127e-06,
+ "loss": 0.3043,
+ "step": 8722
+ },
+ {
+ "epoch": 4.12434988179669,
+ "grad_norm": 3.2218985557556152,
+ "learning_rate": 1.128042620564965e-06,
+ "loss": 0.363,
+ "step": 8723
+ },
+ {
+ "epoch": 4.124822695035461,
+ "grad_norm": 2.951098918914795,
+ "learning_rate": 1.1275211658429877e-06,
+ "loss": 0.3459,
+ "step": 8724
+ },
+ {
+ "epoch": 4.125295508274232,
+ "grad_norm": 3.038513660430908,
+ "learning_rate": 1.1269997965792493e-06,
+ "loss": 0.3073,
+ "step": 8725
+ },
+ {
+ "epoch": 4.125768321513003,
+ "grad_norm": 2.7548015117645264,
+ "learning_rate": 1.1264785128062129e-06,
+ "loss": 0.3587,
+ "step": 8726
+ },
+ {
+ "epoch": 4.126241134751773,
+ "grad_norm": 3.618379592895508,
+ "learning_rate": 1.125957314556336e-06,
+ "loss": 0.4009,
+ "step": 8727
+ },
+ {
+ "epoch": 4.1267139479905435,
+ "grad_norm": 3.264702320098877,
+ "learning_rate": 1.1254362018620728e-06,
+ "loss": 0.3684,
+ "step": 8728
+ },
+ {
+ "epoch": 4.127186761229314,
+ "grad_norm": 3.209995746612549,
+ "learning_rate": 1.1249151747558704e-06,
+ "loss": 0.3796,
+ "step": 8729
+ },
+ {
+ "epoch": 4.127659574468085,
+ "grad_norm": 3.164973735809326,
+ "learning_rate": 1.1243942332701693e-06,
+ "loss": 0.3147,
+ "step": 8730
+ },
+ {
+ "epoch": 4.128132387706856,
+ "grad_norm": 3.309659957885742,
+ "learning_rate": 1.1238733774374087e-06,
+ "loss": 0.308,
+ "step": 8731
+ },
+ {
+ "epoch": 4.128605200945627,
+ "grad_norm": 3.138901710510254,
+ "learning_rate": 1.1233526072900184e-06,
+ "loss": 0.3721,
+ "step": 8732
+ },
+ {
+ "epoch": 4.1290780141843975,
+ "grad_norm": 3.5710649490356445,
+ "learning_rate": 1.122831922860424e-06,
+ "loss": 0.3872,
+ "step": 8733
+ },
+ {
+ "epoch": 4.129550827423168,
+ "grad_norm": 3.192469835281372,
+ "learning_rate": 1.1223113241810482e-06,
+ "loss": 0.349,
+ "step": 8734
+ },
+ {
+ "epoch": 4.130023640661938,
+ "grad_norm": 2.9302608966827393,
+ "learning_rate": 1.121790811284304e-06,
+ "loss": 0.3207,
+ "step": 8735
+ },
+ {
+ "epoch": 4.130496453900709,
+ "grad_norm": 3.022963047027588,
+ "learning_rate": 1.121270384202604e-06,
+ "loss": 0.3487,
+ "step": 8736
+ },
+ {
+ "epoch": 4.13096926713948,
+ "grad_norm": 3.0473732948303223,
+ "learning_rate": 1.1207500429683513e-06,
+ "loss": 0.3083,
+ "step": 8737
+ },
+ {
+ "epoch": 4.131442080378251,
+ "grad_norm": 2.9411537647247314,
+ "learning_rate": 1.1202297876139448e-06,
+ "loss": 0.3077,
+ "step": 8738
+ },
+ {
+ "epoch": 4.131914893617021,
+ "grad_norm": 2.9274520874023438,
+ "learning_rate": 1.1197096181717804e-06,
+ "loss": 0.3071,
+ "step": 8739
+ },
+ {
+ "epoch": 4.132387706855792,
+ "grad_norm": 2.79213285446167,
+ "learning_rate": 1.1191895346742454e-06,
+ "loss": 0.3346,
+ "step": 8740
+ },
+ {
+ "epoch": 4.132860520094563,
+ "grad_norm": 3.2763726711273193,
+ "learning_rate": 1.1186695371537235e-06,
+ "loss": 0.3753,
+ "step": 8741
+ },
+ {
+ "epoch": 4.133333333333334,
+ "grad_norm": 3.245525598526001,
+ "learning_rate": 1.1181496256425927e-06,
+ "loss": 0.3586,
+ "step": 8742
+ },
+ {
+ "epoch": 4.133806146572104,
+ "grad_norm": 3.557176351547241,
+ "learning_rate": 1.1176298001732244e-06,
+ "loss": 0.3547,
+ "step": 8743
+ },
+ {
+ "epoch": 4.1342789598108745,
+ "grad_norm": 3.674633741378784,
+ "learning_rate": 1.117110060777988e-06,
+ "loss": 0.3994,
+ "step": 8744
+ },
+ {
+ "epoch": 4.134751773049645,
+ "grad_norm": 3.168025016784668,
+ "learning_rate": 1.1165904074892433e-06,
+ "loss": 0.3568,
+ "step": 8745
+ },
+ {
+ "epoch": 4.135224586288416,
+ "grad_norm": 2.9492177963256836,
+ "learning_rate": 1.1160708403393488e-06,
+ "loss": 0.3257,
+ "step": 8746
+ },
+ {
+ "epoch": 4.135697399527187,
+ "grad_norm": 3.139941930770874,
+ "learning_rate": 1.1155513593606548e-06,
+ "loss": 0.3464,
+ "step": 8747
+ },
+ {
+ "epoch": 4.136170212765958,
+ "grad_norm": 3.1875250339508057,
+ "learning_rate": 1.115031964585506e-06,
+ "loss": 0.3154,
+ "step": 8748
+ },
+ {
+ "epoch": 4.136643026004728,
+ "grad_norm": 3.0219457149505615,
+ "learning_rate": 1.1145126560462447e-06,
+ "loss": 0.3433,
+ "step": 8749
+ },
+ {
+ "epoch": 4.137115839243499,
+ "grad_norm": 2.992807149887085,
+ "learning_rate": 1.1139934337752046e-06,
+ "loss": 0.3127,
+ "step": 8750
+ },
+ {
+ "epoch": 4.137588652482269,
+ "grad_norm": 3.354733943939209,
+ "learning_rate": 1.1134742978047163e-06,
+ "loss": 0.3166,
+ "step": 8751
+ },
+ {
+ "epoch": 4.13806146572104,
+ "grad_norm": 3.1885886192321777,
+ "learning_rate": 1.1129552481671042e-06,
+ "loss": 0.3872,
+ "step": 8752
+ },
+ {
+ "epoch": 4.138534278959811,
+ "grad_norm": 2.8869078159332275,
+ "learning_rate": 1.1124362848946858e-06,
+ "loss": 0.3218,
+ "step": 8753
+ },
+ {
+ "epoch": 4.1390070921985815,
+ "grad_norm": 3.818469285964966,
+ "learning_rate": 1.1119174080197762e-06,
+ "loss": 0.3442,
+ "step": 8754
+ },
+ {
+ "epoch": 4.139479905437352,
+ "grad_norm": 3.2445592880249023,
+ "learning_rate": 1.1113986175746833e-06,
+ "loss": 0.3858,
+ "step": 8755
+ },
+ {
+ "epoch": 4.139952718676123,
+ "grad_norm": 2.654083490371704,
+ "learning_rate": 1.1108799135917098e-06,
+ "loss": 0.3023,
+ "step": 8756
+ },
+ {
+ "epoch": 4.140425531914894,
+ "grad_norm": 3.129635810852051,
+ "learning_rate": 1.1103612961031527e-06,
+ "loss": 0.3179,
+ "step": 8757
+ },
+ {
+ "epoch": 4.140898345153665,
+ "grad_norm": 2.8118138313293457,
+ "learning_rate": 1.1098427651413035e-06,
+ "loss": 0.3374,
+ "step": 8758
+ },
+ {
+ "epoch": 4.141371158392435,
+ "grad_norm": 3.104051113128662,
+ "learning_rate": 1.1093243207384506e-06,
+ "loss": 0.3202,
+ "step": 8759
+ },
+ {
+ "epoch": 4.141843971631205,
+ "grad_norm": 3.12392520904541,
+ "learning_rate": 1.1088059629268744e-06,
+ "loss": 0.3567,
+ "step": 8760
+ },
+ {
+ "epoch": 4.142316784869976,
+ "grad_norm": 3.467481851577759,
+ "learning_rate": 1.1082876917388497e-06,
+ "loss": 0.4148,
+ "step": 8761
+ },
+ {
+ "epoch": 4.142789598108747,
+ "grad_norm": 3.1120564937591553,
+ "learning_rate": 1.1077695072066488e-06,
+ "loss": 0.3838,
+ "step": 8762
+ },
+ {
+ "epoch": 4.143262411347518,
+ "grad_norm": 3.028073310852051,
+ "learning_rate": 1.107251409362535e-06,
+ "loss": 0.3703,
+ "step": 8763
+ },
+ {
+ "epoch": 4.143735224586289,
+ "grad_norm": 3.091510057449341,
+ "learning_rate": 1.1067333982387699e-06,
+ "loss": 0.3695,
+ "step": 8764
+ },
+ {
+ "epoch": 4.144208037825059,
+ "grad_norm": 3.9426586627960205,
+ "learning_rate": 1.1062154738676067e-06,
+ "loss": 0.3934,
+ "step": 8765
+ },
+ {
+ "epoch": 4.14468085106383,
+ "grad_norm": 2.923741102218628,
+ "learning_rate": 1.1056976362812939e-06,
+ "loss": 0.3679,
+ "step": 8766
+ },
+ {
+ "epoch": 4.1451536643026,
+ "grad_norm": 3.1010327339172363,
+ "learning_rate": 1.1051798855120757e-06,
+ "loss": 0.3314,
+ "step": 8767
+ },
+ {
+ "epoch": 4.145626477541371,
+ "grad_norm": 2.9165778160095215,
+ "learning_rate": 1.1046622215921896e-06,
+ "loss": 0.3473,
+ "step": 8768
+ },
+ {
+ "epoch": 4.146099290780142,
+ "grad_norm": 2.8494462966918945,
+ "learning_rate": 1.1041446445538692e-06,
+ "loss": 0.3226,
+ "step": 8769
+ },
+ {
+ "epoch": 4.1465721040189125,
+ "grad_norm": 2.53379225730896,
+ "learning_rate": 1.1036271544293412e-06,
+ "loss": 0.3123,
+ "step": 8770
+ },
+ {
+ "epoch": 4.147044917257683,
+ "grad_norm": 3.0433695316314697,
+ "learning_rate": 1.1031097512508274e-06,
+ "loss": 0.37,
+ "step": 8771
+ },
+ {
+ "epoch": 4.147517730496454,
+ "grad_norm": 3.418458938598633,
+ "learning_rate": 1.1025924350505431e-06,
+ "loss": 0.3266,
+ "step": 8772
+ },
+ {
+ "epoch": 4.147990543735225,
+ "grad_norm": 2.843733787536621,
+ "learning_rate": 1.1020752058607017e-06,
+ "loss": 0.3548,
+ "step": 8773
+ },
+ {
+ "epoch": 4.148463356973995,
+ "grad_norm": 3.122965097427368,
+ "learning_rate": 1.1015580637135073e-06,
+ "loss": 0.3214,
+ "step": 8774
+ },
+ {
+ "epoch": 4.148936170212766,
+ "grad_norm": 3.0042455196380615,
+ "learning_rate": 1.1010410086411601e-06,
+ "loss": 0.3395,
+ "step": 8775
+ },
+ {
+ "epoch": 4.149408983451536,
+ "grad_norm": 2.841426372528076,
+ "learning_rate": 1.1005240406758546e-06,
+ "loss": 0.3381,
+ "step": 8776
+ },
+ {
+ "epoch": 4.149881796690307,
+ "grad_norm": 2.8241262435913086,
+ "learning_rate": 1.100007159849781e-06,
+ "loss": 0.3504,
+ "step": 8777
+ },
+ {
+ "epoch": 4.150354609929078,
+ "grad_norm": 2.5685677528381348,
+ "learning_rate": 1.0994903661951223e-06,
+ "loss": 0.309,
+ "step": 8778
+ },
+ {
+ "epoch": 4.150827423167849,
+ "grad_norm": 3.197665214538574,
+ "learning_rate": 1.0989736597440581e-06,
+ "loss": 0.3722,
+ "step": 8779
+ },
+ {
+ "epoch": 4.15130023640662,
+ "grad_norm": 3.1483469009399414,
+ "learning_rate": 1.098457040528761e-06,
+ "loss": 0.3301,
+ "step": 8780
+ },
+ {
+ "epoch": 4.15177304964539,
+ "grad_norm": 2.8838415145874023,
+ "learning_rate": 1.0979405085813972e-06,
+ "loss": 0.3212,
+ "step": 8781
+ },
+ {
+ "epoch": 4.152245862884161,
+ "grad_norm": 3.1998705863952637,
+ "learning_rate": 1.0974240639341312e-06,
+ "loss": 0.3557,
+ "step": 8782
+ },
+ {
+ "epoch": 4.152718676122931,
+ "grad_norm": 2.9004411697387695,
+ "learning_rate": 1.0969077066191187e-06,
+ "loss": 0.351,
+ "step": 8783
+ },
+ {
+ "epoch": 4.153191489361702,
+ "grad_norm": 3.036574125289917,
+ "learning_rate": 1.0963914366685096e-06,
+ "loss": 0.3762,
+ "step": 8784
+ },
+ {
+ "epoch": 4.153664302600473,
+ "grad_norm": 3.6683623790740967,
+ "learning_rate": 1.0958752541144523e-06,
+ "loss": 0.3938,
+ "step": 8785
+ },
+ {
+ "epoch": 4.1541371158392435,
+ "grad_norm": 2.922271490097046,
+ "learning_rate": 1.0953591589890852e-06,
+ "loss": 0.3375,
+ "step": 8786
+ },
+ {
+ "epoch": 4.154609929078014,
+ "grad_norm": 3.1750547885894775,
+ "learning_rate": 1.094843151324545e-06,
+ "loss": 0.3455,
+ "step": 8787
+ },
+ {
+ "epoch": 4.155082742316785,
+ "grad_norm": 2.7836148738861084,
+ "learning_rate": 1.0943272311529602e-06,
+ "loss": 0.3359,
+ "step": 8788
+ },
+ {
+ "epoch": 4.155555555555556,
+ "grad_norm": 3.4582557678222656,
+ "learning_rate": 1.0938113985064553e-06,
+ "loss": 0.3358,
+ "step": 8789
+ },
+ {
+ "epoch": 4.156028368794326,
+ "grad_norm": 3.0436923503875732,
+ "learning_rate": 1.0932956534171483e-06,
+ "loss": 0.3531,
+ "step": 8790
+ },
+ {
+ "epoch": 4.156501182033097,
+ "grad_norm": 3.1420092582702637,
+ "learning_rate": 1.092779995917152e-06,
+ "loss": 0.3917,
+ "step": 8791
+ },
+ {
+ "epoch": 4.156973995271867,
+ "grad_norm": 2.9556260108947754,
+ "learning_rate": 1.0922644260385756e-06,
+ "loss": 0.3259,
+ "step": 8792
+ },
+ {
+ "epoch": 4.157446808510638,
+ "grad_norm": 2.8876030445098877,
+ "learning_rate": 1.091748943813521e-06,
+ "loss": 0.3447,
+ "step": 8793
+ },
+ {
+ "epoch": 4.157919621749409,
+ "grad_norm": 3.039207696914673,
+ "learning_rate": 1.0912335492740836e-06,
+ "loss": 0.3216,
+ "step": 8794
+ },
+ {
+ "epoch": 4.15839243498818,
+ "grad_norm": 2.852355480194092,
+ "learning_rate": 1.0907182424523568e-06,
+ "loss": 0.2906,
+ "step": 8795
+ },
+ {
+ "epoch": 4.158865248226951,
+ "grad_norm": 2.931675434112549,
+ "learning_rate": 1.0902030233804245e-06,
+ "loss": 0.3124,
+ "step": 8796
+ },
+ {
+ "epoch": 4.159338061465721,
+ "grad_norm": 3.010590076446533,
+ "learning_rate": 1.0896878920903691e-06,
+ "loss": 0.283,
+ "step": 8797
+ },
+ {
+ "epoch": 4.159810874704492,
+ "grad_norm": 3.093153953552246,
+ "learning_rate": 1.0891728486142648e-06,
+ "loss": 0.3269,
+ "step": 8798
+ },
+ {
+ "epoch": 4.160283687943262,
+ "grad_norm": 2.93019437789917,
+ "learning_rate": 1.0886578929841798e-06,
+ "loss": 0.3261,
+ "step": 8799
+ },
+ {
+ "epoch": 4.160756501182033,
+ "grad_norm": 3.176790475845337,
+ "learning_rate": 1.0881430252321803e-06,
+ "loss": 0.2805,
+ "step": 8800
+ },
+ {
+ "epoch": 4.161229314420804,
+ "grad_norm": 3.215359687805176,
+ "learning_rate": 1.0876282453903228e-06,
+ "loss": 0.4022,
+ "step": 8801
+ },
+ {
+ "epoch": 4.1617021276595745,
+ "grad_norm": 3.3343284130096436,
+ "learning_rate": 1.0871135534906623e-06,
+ "loss": 0.3469,
+ "step": 8802
+ },
+ {
+ "epoch": 4.162174940898345,
+ "grad_norm": 3.030043363571167,
+ "learning_rate": 1.0865989495652456e-06,
+ "loss": 0.3548,
+ "step": 8803
+ },
+ {
+ "epoch": 4.162647754137116,
+ "grad_norm": 2.9456260204315186,
+ "learning_rate": 1.0860844336461146e-06,
+ "loss": 0.3356,
+ "step": 8804
+ },
+ {
+ "epoch": 4.163120567375887,
+ "grad_norm": 2.9399044513702393,
+ "learning_rate": 1.0855700057653063e-06,
+ "loss": 0.362,
+ "step": 8805
+ },
+ {
+ "epoch": 4.163593380614657,
+ "grad_norm": 3.3188061714172363,
+ "learning_rate": 1.0850556659548513e-06,
+ "loss": 0.3866,
+ "step": 8806
+ },
+ {
+ "epoch": 4.164066193853428,
+ "grad_norm": 3.1601030826568604,
+ "learning_rate": 1.084541414246775e-06,
+ "loss": 0.3662,
+ "step": 8807
+ },
+ {
+ "epoch": 4.164539007092198,
+ "grad_norm": 3.0458695888519287,
+ "learning_rate": 1.0840272506730993e-06,
+ "loss": 0.3318,
+ "step": 8808
+ },
+ {
+ "epoch": 4.165011820330969,
+ "grad_norm": 3.056387186050415,
+ "learning_rate": 1.0835131752658365e-06,
+ "loss": 0.3538,
+ "step": 8809
+ },
+ {
+ "epoch": 4.16548463356974,
+ "grad_norm": 2.9833531379699707,
+ "learning_rate": 1.0829991880569984e-06,
+ "loss": 0.3088,
+ "step": 8810
+ },
+ {
+ "epoch": 4.165957446808511,
+ "grad_norm": 3.325438976287842,
+ "learning_rate": 1.0824852890785876e-06,
+ "loss": 0.3524,
+ "step": 8811
+ },
+ {
+ "epoch": 4.166430260047282,
+ "grad_norm": 2.781290054321289,
+ "learning_rate": 1.0819714783626009e-06,
+ "loss": 0.3925,
+ "step": 8812
+ },
+ {
+ "epoch": 4.166903073286052,
+ "grad_norm": 16.3265323638916,
+ "learning_rate": 1.0814577559410336e-06,
+ "loss": 0.4248,
+ "step": 8813
+ },
+ {
+ "epoch": 4.167375886524822,
+ "grad_norm": 2.906619071960449,
+ "learning_rate": 1.0809441218458708e-06,
+ "loss": 0.3904,
+ "step": 8814
+ },
+ {
+ "epoch": 4.167848699763593,
+ "grad_norm": 2.7133800983428955,
+ "learning_rate": 1.0804305761090957e-06,
+ "loss": 0.2855,
+ "step": 8815
+ },
+ {
+ "epoch": 4.168321513002364,
+ "grad_norm": 3.252946376800537,
+ "learning_rate": 1.0799171187626844e-06,
+ "loss": 0.3285,
+ "step": 8816
+ },
+ {
+ "epoch": 4.168794326241135,
+ "grad_norm": 3.0832788944244385,
+ "learning_rate": 1.0794037498386062e-06,
+ "loss": 0.3175,
+ "step": 8817
+ },
+ {
+ "epoch": 4.1692671394799055,
+ "grad_norm": 3.046424150466919,
+ "learning_rate": 1.0788904693688284e-06,
+ "loss": 0.3545,
+ "step": 8818
+ },
+ {
+ "epoch": 4.169739952718676,
+ "grad_norm": 3.643488645553589,
+ "learning_rate": 1.0783772773853095e-06,
+ "loss": 0.3889,
+ "step": 8819
+ },
+ {
+ "epoch": 4.170212765957447,
+ "grad_norm": 3.433997392654419,
+ "learning_rate": 1.077864173920004e-06,
+ "loss": 0.311,
+ "step": 8820
+ },
+ {
+ "epoch": 4.170685579196218,
+ "grad_norm": 3.287684679031372,
+ "learning_rate": 1.0773511590048605e-06,
+ "loss": 0.3708,
+ "step": 8821
+ },
+ {
+ "epoch": 4.171158392434988,
+ "grad_norm": 3.5546534061431885,
+ "learning_rate": 1.0768382326718212e-06,
+ "loss": 0.3845,
+ "step": 8822
+ },
+ {
+ "epoch": 4.171631205673759,
+ "grad_norm": 3.2245540618896484,
+ "learning_rate": 1.076325394952826e-06,
+ "loss": 0.4412,
+ "step": 8823
+ },
+ {
+ "epoch": 4.172104018912529,
+ "grad_norm": 3.199784994125366,
+ "learning_rate": 1.0758126458798046e-06,
+ "loss": 0.3635,
+ "step": 8824
+ },
+ {
+ "epoch": 4.1725768321513,
+ "grad_norm": 2.961003303527832,
+ "learning_rate": 1.075299985484686e-06,
+ "loss": 0.3167,
+ "step": 8825
+ },
+ {
+ "epoch": 4.173049645390071,
+ "grad_norm": 2.8316452503204346,
+ "learning_rate": 1.07478741379939e-06,
+ "loss": 0.3342,
+ "step": 8826
+ },
+ {
+ "epoch": 4.173522458628842,
+ "grad_norm": 3.0721595287323,
+ "learning_rate": 1.0742749308558316e-06,
+ "loss": 0.3642,
+ "step": 8827
+ },
+ {
+ "epoch": 4.1739952718676125,
+ "grad_norm": 3.001324415206909,
+ "learning_rate": 1.0737625366859225e-06,
+ "loss": 0.3479,
+ "step": 8828
+ },
+ {
+ "epoch": 4.174468085106383,
+ "grad_norm": 3.199108839035034,
+ "learning_rate": 1.0732502313215665e-06,
+ "loss": 0.3434,
+ "step": 8829
+ },
+ {
+ "epoch": 4.174940898345153,
+ "grad_norm": 3.602139472961426,
+ "learning_rate": 1.072738014794661e-06,
+ "loss": 0.401,
+ "step": 8830
+ },
+ {
+ "epoch": 4.175413711583924,
+ "grad_norm": 3.2303357124328613,
+ "learning_rate": 1.0722258871371025e-06,
+ "loss": 0.3603,
+ "step": 8831
+ },
+ {
+ "epoch": 4.175886524822695,
+ "grad_norm": 3.138611316680908,
+ "learning_rate": 1.0717138483807766e-06,
+ "loss": 0.3481,
+ "step": 8832
+ },
+ {
+ "epoch": 4.176359338061466,
+ "grad_norm": 3.059134006500244,
+ "learning_rate": 1.071201898557567e-06,
+ "loss": 0.357,
+ "step": 8833
+ },
+ {
+ "epoch": 4.176832151300236,
+ "grad_norm": 3.237121820449829,
+ "learning_rate": 1.0706900376993501e-06,
+ "loss": 0.3424,
+ "step": 8834
+ },
+ {
+ "epoch": 4.177304964539007,
+ "grad_norm": 3.1065425872802734,
+ "learning_rate": 1.0701782658379974e-06,
+ "loss": 0.3506,
+ "step": 8835
+ },
+ {
+ "epoch": 4.177777777777778,
+ "grad_norm": 2.9971365928649902,
+ "learning_rate": 1.0696665830053743e-06,
+ "loss": 0.3205,
+ "step": 8836
+ },
+ {
+ "epoch": 4.178250591016549,
+ "grad_norm": 3.2898313999176025,
+ "learning_rate": 1.0691549892333406e-06,
+ "loss": 0.3297,
+ "step": 8837
+ },
+ {
+ "epoch": 4.178723404255319,
+ "grad_norm": 3.166144609451294,
+ "learning_rate": 1.0686434845537525e-06,
+ "loss": 0.3097,
+ "step": 8838
+ },
+ {
+ "epoch": 4.1791962174940895,
+ "grad_norm": 2.9629571437835693,
+ "learning_rate": 1.0681320689984581e-06,
+ "loss": 0.3709,
+ "step": 8839
+ },
+ {
+ "epoch": 4.17966903073286,
+ "grad_norm": 3.2954351902008057,
+ "learning_rate": 1.0676207425993004e-06,
+ "loss": 0.3448,
+ "step": 8840
+ },
+ {
+ "epoch": 4.180141843971631,
+ "grad_norm": 2.8537824153900146,
+ "learning_rate": 1.0671095053881194e-06,
+ "loss": 0.3069,
+ "step": 8841
+ },
+ {
+ "epoch": 4.180614657210402,
+ "grad_norm": 3.382916212081909,
+ "learning_rate": 1.0665983573967453e-06,
+ "loss": 0.3909,
+ "step": 8842
+ },
+ {
+ "epoch": 4.181087470449173,
+ "grad_norm": 3.4717860221862793,
+ "learning_rate": 1.0660872986570072e-06,
+ "loss": 0.3641,
+ "step": 8843
+ },
+ {
+ "epoch": 4.1815602836879435,
+ "grad_norm": 3.088916778564453,
+ "learning_rate": 1.0655763292007256e-06,
+ "loss": 0.3184,
+ "step": 8844
+ },
+ {
+ "epoch": 4.182033096926714,
+ "grad_norm": 2.8693177700042725,
+ "learning_rate": 1.065065449059715e-06,
+ "loss": 0.3486,
+ "step": 8845
+ },
+ {
+ "epoch": 4.182505910165484,
+ "grad_norm": 3.162811517715454,
+ "learning_rate": 1.0645546582657881e-06,
+ "loss": 0.3559,
+ "step": 8846
+ },
+ {
+ "epoch": 4.182978723404255,
+ "grad_norm": 3.8519816398620605,
+ "learning_rate": 1.0640439568507475e-06,
+ "loss": 0.4159,
+ "step": 8847
+ },
+ {
+ "epoch": 4.183451536643026,
+ "grad_norm": 2.9316959381103516,
+ "learning_rate": 1.063533344846394e-06,
+ "loss": 0.34,
+ "step": 8848
+ },
+ {
+ "epoch": 4.183924349881797,
+ "grad_norm": 3.018986463546753,
+ "learning_rate": 1.0630228222845205e-06,
+ "loss": 0.3378,
+ "step": 8849
+ },
+ {
+ "epoch": 4.184397163120567,
+ "grad_norm": 2.949428081512451,
+ "learning_rate": 1.062512389196914e-06,
+ "loss": 0.3634,
+ "step": 8850
+ },
+ {
+ "epoch": 4.184869976359338,
+ "grad_norm": 3.3298749923706055,
+ "learning_rate": 1.0620020456153585e-06,
+ "loss": 0.3067,
+ "step": 8851
+ },
+ {
+ "epoch": 4.185342789598109,
+ "grad_norm": 3.0566864013671875,
+ "learning_rate": 1.0614917915716302e-06,
+ "loss": 0.3534,
+ "step": 8852
+ },
+ {
+ "epoch": 4.18581560283688,
+ "grad_norm": 3.156620979309082,
+ "learning_rate": 1.0609816270975007e-06,
+ "loss": 0.3684,
+ "step": 8853
+ },
+ {
+ "epoch": 4.18628841607565,
+ "grad_norm": 3.0776474475860596,
+ "learning_rate": 1.0604715522247352e-06,
+ "loss": 0.3616,
+ "step": 8854
+ },
+ {
+ "epoch": 4.1867612293144205,
+ "grad_norm": 3.1254587173461914,
+ "learning_rate": 1.059961566985093e-06,
+ "loss": 0.3455,
+ "step": 8855
+ },
+ {
+ "epoch": 4.187234042553191,
+ "grad_norm": 2.8769783973693848,
+ "learning_rate": 1.0594516714103306e-06,
+ "loss": 0.2754,
+ "step": 8856
+ },
+ {
+ "epoch": 4.187706855791962,
+ "grad_norm": 3.461308240890503,
+ "learning_rate": 1.0589418655321962e-06,
+ "loss": 0.3744,
+ "step": 8857
+ },
+ {
+ "epoch": 4.188179669030733,
+ "grad_norm": 3.3546712398529053,
+ "learning_rate": 1.0584321493824317e-06,
+ "loss": 0.4116,
+ "step": 8858
+ },
+ {
+ "epoch": 4.188652482269504,
+ "grad_norm": 3.233792543411255,
+ "learning_rate": 1.0579225229927775e-06,
+ "loss": 0.3591,
+ "step": 8859
+ },
+ {
+ "epoch": 4.1891252955082745,
+ "grad_norm": 3.295444965362549,
+ "learning_rate": 1.0574129863949633e-06,
+ "loss": 0.3179,
+ "step": 8860
+ },
+ {
+ "epoch": 4.189598108747045,
+ "grad_norm": 3.403062105178833,
+ "learning_rate": 1.0569035396207178e-06,
+ "loss": 0.3948,
+ "step": 8861
+ },
+ {
+ "epoch": 4.190070921985815,
+ "grad_norm": 2.901970148086548,
+ "learning_rate": 1.0563941827017613e-06,
+ "loss": 0.3537,
+ "step": 8862
+ },
+ {
+ "epoch": 4.190543735224586,
+ "grad_norm": 3.1239142417907715,
+ "learning_rate": 1.0558849156698078e-06,
+ "loss": 0.3764,
+ "step": 8863
+ },
+ {
+ "epoch": 4.191016548463357,
+ "grad_norm": 2.8480169773101807,
+ "learning_rate": 1.0553757385565694e-06,
+ "loss": 0.3085,
+ "step": 8864
+ },
+ {
+ "epoch": 4.191489361702128,
+ "grad_norm": 3.0914061069488525,
+ "learning_rate": 1.0548666513937487e-06,
+ "loss": 0.3003,
+ "step": 8865
+ },
+ {
+ "epoch": 4.191962174940898,
+ "grad_norm": 2.9875683784484863,
+ "learning_rate": 1.0543576542130452e-06,
+ "loss": 0.3178,
+ "step": 8866
+ },
+ {
+ "epoch": 4.192434988179669,
+ "grad_norm": 2.952052354812622,
+ "learning_rate": 1.053848747046152e-06,
+ "loss": 0.3221,
+ "step": 8867
+ },
+ {
+ "epoch": 4.19290780141844,
+ "grad_norm": 3.2211997509002686,
+ "learning_rate": 1.0533399299247559e-06,
+ "loss": 0.3698,
+ "step": 8868
+ },
+ {
+ "epoch": 4.193380614657211,
+ "grad_norm": 3.2954046726226807,
+ "learning_rate": 1.0528312028805392e-06,
+ "loss": 0.3697,
+ "step": 8869
+ },
+ {
+ "epoch": 4.193853427895981,
+ "grad_norm": 2.978306293487549,
+ "learning_rate": 1.0523225659451768e-06,
+ "loss": 0.3358,
+ "step": 8870
+ },
+ {
+ "epoch": 4.1943262411347515,
+ "grad_norm": 3.3803653717041016,
+ "learning_rate": 1.0518140191503415e-06,
+ "loss": 0.3851,
+ "step": 8871
+ },
+ {
+ "epoch": 4.194799054373522,
+ "grad_norm": 3.282294273376465,
+ "learning_rate": 1.051305562527697e-06,
+ "loss": 0.4518,
+ "step": 8872
+ },
+ {
+ "epoch": 4.195271867612293,
+ "grad_norm": 2.950310468673706,
+ "learning_rate": 1.0507971961089017e-06,
+ "loss": 0.3045,
+ "step": 8873
+ },
+ {
+ "epoch": 4.195744680851064,
+ "grad_norm": 3.4069037437438965,
+ "learning_rate": 1.0502889199256114e-06,
+ "loss": 0.3832,
+ "step": 8874
+ },
+ {
+ "epoch": 4.196217494089835,
+ "grad_norm": 3.1440858840942383,
+ "learning_rate": 1.0497807340094722e-06,
+ "loss": 0.2958,
+ "step": 8875
+ },
+ {
+ "epoch": 4.1966903073286055,
+ "grad_norm": 3.050755262374878,
+ "learning_rate": 1.049272638392129e-06,
+ "loss": 0.3494,
+ "step": 8876
+ },
+ {
+ "epoch": 4.197163120567376,
+ "grad_norm": 2.908078670501709,
+ "learning_rate": 1.0487646331052171e-06,
+ "loss": 0.349,
+ "step": 8877
+ },
+ {
+ "epoch": 4.197635933806146,
+ "grad_norm": 3.2089946269989014,
+ "learning_rate": 1.048256718180367e-06,
+ "loss": 0.3507,
+ "step": 8878
+ },
+ {
+ "epoch": 4.198108747044917,
+ "grad_norm": 2.984745740890503,
+ "learning_rate": 1.0477488936492067e-06,
+ "loss": 0.3252,
+ "step": 8879
+ },
+ {
+ "epoch": 4.198581560283688,
+ "grad_norm": 2.9207515716552734,
+ "learning_rate": 1.0472411595433545e-06,
+ "loss": 0.3192,
+ "step": 8880
+ },
+ {
+ "epoch": 4.199054373522459,
+ "grad_norm": 3.0090811252593994,
+ "learning_rate": 1.0467335158944242e-06,
+ "loss": 0.3827,
+ "step": 8881
+ },
+ {
+ "epoch": 4.199527186761229,
+ "grad_norm": 3.2763171195983887,
+ "learning_rate": 1.0462259627340265e-06,
+ "loss": 0.3481,
+ "step": 8882
+ },
+ {
+ "epoch": 4.2,
+ "grad_norm": 3.068268299102783,
+ "learning_rate": 1.0457185000937636e-06,
+ "loss": 0.3926,
+ "step": 8883
+ },
+ {
+ "epoch": 4.200472813238771,
+ "grad_norm": 2.6999998092651367,
+ "learning_rate": 1.0452111280052326e-06,
+ "loss": 0.2884,
+ "step": 8884
+ },
+ {
+ "epoch": 4.200945626477542,
+ "grad_norm": 3.1187727451324463,
+ "learning_rate": 1.044703846500026e-06,
+ "loss": 0.3797,
+ "step": 8885
+ },
+ {
+ "epoch": 4.201418439716312,
+ "grad_norm": 2.7876172065734863,
+ "learning_rate": 1.0441966556097283e-06,
+ "loss": 0.3284,
+ "step": 8886
+ },
+ {
+ "epoch": 4.2018912529550825,
+ "grad_norm": 2.973261833190918,
+ "learning_rate": 1.0436895553659224e-06,
+ "loss": 0.2845,
+ "step": 8887
+ },
+ {
+ "epoch": 4.202364066193853,
+ "grad_norm": 3.496096611022949,
+ "learning_rate": 1.0431825458001811e-06,
+ "loss": 0.3341,
+ "step": 8888
+ },
+ {
+ "epoch": 4.202836879432624,
+ "grad_norm": 3.370410680770874,
+ "learning_rate": 1.0426756269440761e-06,
+ "loss": 0.3459,
+ "step": 8889
+ },
+ {
+ "epoch": 4.203309692671395,
+ "grad_norm": 2.864126682281494,
+ "learning_rate": 1.0421687988291693e-06,
+ "loss": 0.3195,
+ "step": 8890
+ },
+ {
+ "epoch": 4.203782505910166,
+ "grad_norm": 3.3575501441955566,
+ "learning_rate": 1.0416620614870181e-06,
+ "loss": 0.3424,
+ "step": 8891
+ },
+ {
+ "epoch": 4.2042553191489365,
+ "grad_norm": 3.4441967010498047,
+ "learning_rate": 1.0411554149491766e-06,
+ "loss": 0.3677,
+ "step": 8892
+ },
+ {
+ "epoch": 4.204728132387707,
+ "grad_norm": 3.014472007751465,
+ "learning_rate": 1.0406488592471898e-06,
+ "loss": 0.3004,
+ "step": 8893
+ },
+ {
+ "epoch": 4.205200945626477,
+ "grad_norm": 3.1186721324920654,
+ "learning_rate": 1.0401423944126002e-06,
+ "loss": 0.4182,
+ "step": 8894
+ },
+ {
+ "epoch": 4.205673758865248,
+ "grad_norm": 3.166337013244629,
+ "learning_rate": 1.0396360204769426e-06,
+ "loss": 0.3303,
+ "step": 8895
+ },
+ {
+ "epoch": 4.206146572104019,
+ "grad_norm": 3.081855058670044,
+ "learning_rate": 1.0391297374717454e-06,
+ "loss": 0.3096,
+ "step": 8896
+ },
+ {
+ "epoch": 4.20661938534279,
+ "grad_norm": 3.0924830436706543,
+ "learning_rate": 1.0386235454285348e-06,
+ "loss": 0.3238,
+ "step": 8897
+ },
+ {
+ "epoch": 4.20709219858156,
+ "grad_norm": 3.043519973754883,
+ "learning_rate": 1.0381174443788277e-06,
+ "loss": 0.3322,
+ "step": 8898
+ },
+ {
+ "epoch": 4.207565011820331,
+ "grad_norm": 3.160785675048828,
+ "learning_rate": 1.0376114343541377e-06,
+ "loss": 0.3244,
+ "step": 8899
+ },
+ {
+ "epoch": 4.208037825059102,
+ "grad_norm": 2.9988417625427246,
+ "learning_rate": 1.037105515385971e-06,
+ "loss": 0.3386,
+ "step": 8900
+ },
+ {
+ "epoch": 4.208510638297873,
+ "grad_norm": 2.981959342956543,
+ "learning_rate": 1.0365996875058284e-06,
+ "loss": 0.3412,
+ "step": 8901
+ },
+ {
+ "epoch": 4.208983451536643,
+ "grad_norm": 3.144815683364868,
+ "learning_rate": 1.0360939507452075e-06,
+ "loss": 0.3716,
+ "step": 8902
+ },
+ {
+ "epoch": 4.2094562647754135,
+ "grad_norm": 2.9644055366516113,
+ "learning_rate": 1.0355883051355972e-06,
+ "loss": 0.3488,
+ "step": 8903
+ },
+ {
+ "epoch": 4.209929078014184,
+ "grad_norm": 3.3212029933929443,
+ "learning_rate": 1.035082750708481e-06,
+ "loss": 0.3048,
+ "step": 8904
+ },
+ {
+ "epoch": 4.210401891252955,
+ "grad_norm": 2.82843279838562,
+ "learning_rate": 1.034577287495339e-06,
+ "loss": 0.3141,
+ "step": 8905
+ },
+ {
+ "epoch": 4.210874704491726,
+ "grad_norm": 3.040215253829956,
+ "learning_rate": 1.034071915527643e-06,
+ "loss": 0.3517,
+ "step": 8906
+ },
+ {
+ "epoch": 4.211347517730497,
+ "grad_norm": 2.850985288619995,
+ "learning_rate": 1.033566634836862e-06,
+ "loss": 0.3556,
+ "step": 8907
+ },
+ {
+ "epoch": 4.2118203309692674,
+ "grad_norm": 3.522962808609009,
+ "learning_rate": 1.0330614454544564e-06,
+ "loss": 0.3432,
+ "step": 8908
+ },
+ {
+ "epoch": 4.212293144208038,
+ "grad_norm": 3.0228631496429443,
+ "learning_rate": 1.032556347411881e-06,
+ "loss": 0.3165,
+ "step": 8909
+ },
+ {
+ "epoch": 4.212765957446808,
+ "grad_norm": 3.275134563446045,
+ "learning_rate": 1.0320513407405886e-06,
+ "loss": 0.413,
+ "step": 8910
+ },
+ {
+ "epoch": 4.213238770685579,
+ "grad_norm": 2.850020408630371,
+ "learning_rate": 1.0315464254720213e-06,
+ "loss": 0.3051,
+ "step": 8911
+ },
+ {
+ "epoch": 4.21371158392435,
+ "grad_norm": 3.153916597366333,
+ "learning_rate": 1.0310416016376203e-06,
+ "loss": 0.2973,
+ "step": 8912
+ },
+ {
+ "epoch": 4.2141843971631205,
+ "grad_norm": 3.423772096633911,
+ "learning_rate": 1.0305368692688175e-06,
+ "loss": 0.302,
+ "step": 8913
+ },
+ {
+ "epoch": 4.214657210401891,
+ "grad_norm": 3.420687198638916,
+ "learning_rate": 1.0300322283970404e-06,
+ "loss": 0.3732,
+ "step": 8914
+ },
+ {
+ "epoch": 4.215130023640662,
+ "grad_norm": 3.2490479946136475,
+ "learning_rate": 1.02952767905371e-06,
+ "loss": 0.3793,
+ "step": 8915
+ },
+ {
+ "epoch": 4.215602836879433,
+ "grad_norm": 3.3043079376220703,
+ "learning_rate": 1.0290232212702438e-06,
+ "loss": 0.3472,
+ "step": 8916
+ },
+ {
+ "epoch": 4.216075650118204,
+ "grad_norm": 3.152435779571533,
+ "learning_rate": 1.0285188550780516e-06,
+ "loss": 0.3617,
+ "step": 8917
+ },
+ {
+ "epoch": 4.216548463356974,
+ "grad_norm": 3.311063766479492,
+ "learning_rate": 1.0280145805085384e-06,
+ "loss": 0.3681,
+ "step": 8918
+ },
+ {
+ "epoch": 4.217021276595744,
+ "grad_norm": 3.1113057136535645,
+ "learning_rate": 1.0275103975931016e-06,
+ "loss": 0.3526,
+ "step": 8919
+ },
+ {
+ "epoch": 4.217494089834515,
+ "grad_norm": 2.7904412746429443,
+ "learning_rate": 1.0270063063631369e-06,
+ "loss": 0.3125,
+ "step": 8920
+ },
+ {
+ "epoch": 4.217966903073286,
+ "grad_norm": 3.3566761016845703,
+ "learning_rate": 1.0265023068500293e-06,
+ "loss": 0.3305,
+ "step": 8921
+ },
+ {
+ "epoch": 4.218439716312057,
+ "grad_norm": 2.97943115234375,
+ "learning_rate": 1.0259983990851633e-06,
+ "loss": 0.3277,
+ "step": 8922
+ },
+ {
+ "epoch": 4.218912529550828,
+ "grad_norm": 3.1507925987243652,
+ "learning_rate": 1.0254945830999134e-06,
+ "loss": 0.385,
+ "step": 8923
+ },
+ {
+ "epoch": 4.219385342789598,
+ "grad_norm": 2.632859706878662,
+ "learning_rate": 1.0249908589256493e-06,
+ "loss": 0.2889,
+ "step": 8924
+ },
+ {
+ "epoch": 4.219858156028369,
+ "grad_norm": 2.9816136360168457,
+ "learning_rate": 1.0244872265937378e-06,
+ "loss": 0.2838,
+ "step": 8925
+ },
+ {
+ "epoch": 4.220330969267139,
+ "grad_norm": 2.751431465148926,
+ "learning_rate": 1.0239836861355369e-06,
+ "loss": 0.3069,
+ "step": 8926
+ },
+ {
+ "epoch": 4.22080378250591,
+ "grad_norm": 3.3390228748321533,
+ "learning_rate": 1.0234802375823985e-06,
+ "loss": 0.3074,
+ "step": 8927
+ },
+ {
+ "epoch": 4.221276595744681,
+ "grad_norm": 3.345242500305176,
+ "learning_rate": 1.0229768809656726e-06,
+ "loss": 0.3603,
+ "step": 8928
+ },
+ {
+ "epoch": 4.2217494089834515,
+ "grad_norm": 3.0684640407562256,
+ "learning_rate": 1.0224736163166984e-06,
+ "loss": 0.3343,
+ "step": 8929
+ },
+ {
+ "epoch": 4.222222222222222,
+ "grad_norm": 3.2813572883605957,
+ "learning_rate": 1.0219704436668146e-06,
+ "loss": 0.3173,
+ "step": 8930
+ },
+ {
+ "epoch": 4.222695035460993,
+ "grad_norm": 3.135668992996216,
+ "learning_rate": 1.0214673630473504e-06,
+ "loss": 0.3608,
+ "step": 8931
+ },
+ {
+ "epoch": 4.223167848699764,
+ "grad_norm": 2.8798727989196777,
+ "learning_rate": 1.0209643744896303e-06,
+ "loss": 0.3317,
+ "step": 8932
+ },
+ {
+ "epoch": 4.223640661938534,
+ "grad_norm": 3.2659590244293213,
+ "learning_rate": 1.0204614780249731e-06,
+ "loss": 0.3017,
+ "step": 8933
+ },
+ {
+ "epoch": 4.224113475177305,
+ "grad_norm": 2.988126516342163,
+ "learning_rate": 1.0199586736846911e-06,
+ "loss": 0.3555,
+ "step": 8934
+ },
+ {
+ "epoch": 4.224586288416075,
+ "grad_norm": 3.3775575160980225,
+ "learning_rate": 1.0194559615000937e-06,
+ "loss": 0.3966,
+ "step": 8935
+ },
+ {
+ "epoch": 4.225059101654846,
+ "grad_norm": 3.1004798412323,
+ "learning_rate": 1.0189533415024817e-06,
+ "loss": 0.3192,
+ "step": 8936
+ },
+ {
+ "epoch": 4.225531914893617,
+ "grad_norm": 3.1722211837768555,
+ "learning_rate": 1.0184508137231498e-06,
+ "loss": 0.3075,
+ "step": 8937
+ },
+ {
+ "epoch": 4.226004728132388,
+ "grad_norm": 3.0679538249969482,
+ "learning_rate": 1.0179483781933903e-06,
+ "loss": 0.3475,
+ "step": 8938
+ },
+ {
+ "epoch": 4.226477541371159,
+ "grad_norm": 3.079246759414673,
+ "learning_rate": 1.0174460349444857e-06,
+ "loss": 0.4054,
+ "step": 8939
+ },
+ {
+ "epoch": 4.226950354609929,
+ "grad_norm": 3.308229684829712,
+ "learning_rate": 1.0169437840077169e-06,
+ "loss": 0.3455,
+ "step": 8940
+ },
+ {
+ "epoch": 4.2274231678487,
+ "grad_norm": 3.363147258758545,
+ "learning_rate": 1.0164416254143552e-06,
+ "loss": 0.3538,
+ "step": 8941
+ },
+ {
+ "epoch": 4.22789598108747,
+ "grad_norm": 2.7227768898010254,
+ "learning_rate": 1.0159395591956677e-06,
+ "loss": 0.3206,
+ "step": 8942
+ },
+ {
+ "epoch": 4.228368794326241,
+ "grad_norm": 3.0010764598846436,
+ "learning_rate": 1.0154375853829175e-06,
+ "loss": 0.3593,
+ "step": 8943
+ },
+ {
+ "epoch": 4.228841607565012,
+ "grad_norm": 3.0478785037994385,
+ "learning_rate": 1.0149357040073581e-06,
+ "loss": 0.3808,
+ "step": 8944
+ },
+ {
+ "epoch": 4.2293144208037825,
+ "grad_norm": 2.804421901702881,
+ "learning_rate": 1.0144339151002416e-06,
+ "loss": 0.3633,
+ "step": 8945
+ },
+ {
+ "epoch": 4.229787234042553,
+ "grad_norm": 3.563140630722046,
+ "learning_rate": 1.013932218692811e-06,
+ "loss": 0.2843,
+ "step": 8946
+ },
+ {
+ "epoch": 4.230260047281324,
+ "grad_norm": 3.1959750652313232,
+ "learning_rate": 1.0134306148163051e-06,
+ "loss": 0.377,
+ "step": 8947
+ },
+ {
+ "epoch": 4.230732860520095,
+ "grad_norm": 3.0841214656829834,
+ "learning_rate": 1.0129291035019565e-06,
+ "loss": 0.3234,
+ "step": 8948
+ },
+ {
+ "epoch": 4.231205673758865,
+ "grad_norm": 3.3893179893493652,
+ "learning_rate": 1.0124276847809911e-06,
+ "loss": 0.3823,
+ "step": 8949
+ },
+ {
+ "epoch": 4.231678486997636,
+ "grad_norm": 3.2250518798828125,
+ "learning_rate": 1.0119263586846316e-06,
+ "loss": 0.35,
+ "step": 8950
+ },
+ {
+ "epoch": 4.232151300236406,
+ "grad_norm": 3.287285566329956,
+ "learning_rate": 1.0114251252440928e-06,
+ "loss": 0.3306,
+ "step": 8951
+ },
+ {
+ "epoch": 4.232624113475177,
+ "grad_norm": 3.5018274784088135,
+ "learning_rate": 1.0109239844905836e-06,
+ "loss": 0.378,
+ "step": 8952
+ },
+ {
+ "epoch": 4.233096926713948,
+ "grad_norm": 3.224838972091675,
+ "learning_rate": 1.0104229364553093e-06,
+ "loss": 0.3379,
+ "step": 8953
+ },
+ {
+ "epoch": 4.233569739952719,
+ "grad_norm": 3.2302494049072266,
+ "learning_rate": 1.0099219811694668e-06,
+ "loss": 0.358,
+ "step": 8954
+ },
+ {
+ "epoch": 4.23404255319149,
+ "grad_norm": 3.098205804824829,
+ "learning_rate": 1.0094211186642483e-06,
+ "loss": 0.3669,
+ "step": 8955
+ },
+ {
+ "epoch": 4.23451536643026,
+ "grad_norm": 3.0045907497406006,
+ "learning_rate": 1.0089203489708415e-06,
+ "loss": 0.3293,
+ "step": 8956
+ },
+ {
+ "epoch": 4.234988179669031,
+ "grad_norm": 3.245818853378296,
+ "learning_rate": 1.0084196721204254e-06,
+ "loss": 0.3365,
+ "step": 8957
+ },
+ {
+ "epoch": 4.235460992907801,
+ "grad_norm": 2.8547208309173584,
+ "learning_rate": 1.007919088144177e-06,
+ "loss": 0.3072,
+ "step": 8958
+ },
+ {
+ "epoch": 4.235933806146572,
+ "grad_norm": 3.2914109230041504,
+ "learning_rate": 1.0074185970732642e-06,
+ "loss": 0.3734,
+ "step": 8959
+ },
+ {
+ "epoch": 4.236406619385343,
+ "grad_norm": 2.527096748352051,
+ "learning_rate": 1.0069181989388496e-06,
+ "loss": 0.3091,
+ "step": 8960
+ },
+ {
+ "epoch": 4.2368794326241135,
+ "grad_norm": 2.921369791030884,
+ "learning_rate": 1.006417893772093e-06,
+ "loss": 0.3162,
+ "step": 8961
+ },
+ {
+ "epoch": 4.237352245862884,
+ "grad_norm": 2.8698911666870117,
+ "learning_rate": 1.005917681604145e-06,
+ "loss": 0.3818,
+ "step": 8962
+ },
+ {
+ "epoch": 4.237825059101655,
+ "grad_norm": 2.958021402359009,
+ "learning_rate": 1.0054175624661514e-06,
+ "loss": 0.2934,
+ "step": 8963
+ },
+ {
+ "epoch": 4.238297872340426,
+ "grad_norm": 3.0945863723754883,
+ "learning_rate": 1.0049175363892527e-06,
+ "loss": 0.384,
+ "step": 8964
+ },
+ {
+ "epoch": 4.238770685579196,
+ "grad_norm": 2.890333890914917,
+ "learning_rate": 1.0044176034045822e-06,
+ "loss": 0.3312,
+ "step": 8965
+ },
+ {
+ "epoch": 4.239243498817967,
+ "grad_norm": 3.460975408554077,
+ "learning_rate": 1.0039177635432706e-06,
+ "loss": 0.4015,
+ "step": 8966
+ },
+ {
+ "epoch": 4.239716312056737,
+ "grad_norm": 3.5411946773529053,
+ "learning_rate": 1.003418016836439e-06,
+ "loss": 0.3307,
+ "step": 8967
+ },
+ {
+ "epoch": 4.240189125295508,
+ "grad_norm": 3.3310446739196777,
+ "learning_rate": 1.0029183633152061e-06,
+ "loss": 0.3154,
+ "step": 8968
+ },
+ {
+ "epoch": 4.240661938534279,
+ "grad_norm": 3.121110677719116,
+ "learning_rate": 1.0024188030106822e-06,
+ "loss": 0.3827,
+ "step": 8969
+ },
+ {
+ "epoch": 4.24113475177305,
+ "grad_norm": 3.421278715133667,
+ "learning_rate": 1.0019193359539717e-06,
+ "loss": 0.3677,
+ "step": 8970
+ },
+ {
+ "epoch": 4.241607565011821,
+ "grad_norm": 3.1790332794189453,
+ "learning_rate": 1.0014199621761761e-06,
+ "loss": 0.3219,
+ "step": 8971
+ },
+ {
+ "epoch": 4.242080378250591,
+ "grad_norm": 3.238412380218506,
+ "learning_rate": 1.0009206817083878e-06,
+ "loss": 0.3296,
+ "step": 8972
+ },
+ {
+ "epoch": 4.242553191489361,
+ "grad_norm": 3.0206923484802246,
+ "learning_rate": 1.0004214945816959e-06,
+ "loss": 0.3769,
+ "step": 8973
+ },
+ {
+ "epoch": 4.243026004728132,
+ "grad_norm": 3.2117667198181152,
+ "learning_rate": 9.999224008271822e-07,
+ "loss": 0.3876,
+ "step": 8974
+ },
+ {
+ "epoch": 4.243498817966903,
+ "grad_norm": 2.849250316619873,
+ "learning_rate": 9.99423400475922e-07,
+ "loss": 0.3267,
+ "step": 8975
+ },
+ {
+ "epoch": 4.243971631205674,
+ "grad_norm": 3.084845542907715,
+ "learning_rate": 9.989244935589878e-07,
+ "loss": 0.3074,
+ "step": 8976
+ },
+ {
+ "epoch": 4.2444444444444445,
+ "grad_norm": 3.0177342891693115,
+ "learning_rate": 9.984256801074434e-07,
+ "loss": 0.3524,
+ "step": 8977
+ },
+ {
+ "epoch": 4.244917257683215,
+ "grad_norm": 3.196692943572998,
+ "learning_rate": 9.979269601523477e-07,
+ "loss": 0.3943,
+ "step": 8978
+ },
+ {
+ "epoch": 4.245390070921986,
+ "grad_norm": 2.849760055541992,
+ "learning_rate": 9.97428333724753e-07,
+ "loss": 0.3145,
+ "step": 8979
+ },
+ {
+ "epoch": 4.245862884160757,
+ "grad_norm": 3.003265857696533,
+ "learning_rate": 9.969298008557083e-07,
+ "loss": 0.3393,
+ "step": 8980
+ },
+ {
+ "epoch": 4.246335697399527,
+ "grad_norm": 2.925597667694092,
+ "learning_rate": 9.96431361576254e-07,
+ "loss": 0.331,
+ "step": 8981
+ },
+ {
+ "epoch": 4.246808510638298,
+ "grad_norm": 2.87599515914917,
+ "learning_rate": 9.959330159174257e-07,
+ "loss": 0.3102,
+ "step": 8982
+ },
+ {
+ "epoch": 4.247281323877068,
+ "grad_norm": 2.841588020324707,
+ "learning_rate": 9.954347639102528e-07,
+ "loss": 0.3261,
+ "step": 8983
+ },
+ {
+ "epoch": 4.247754137115839,
+ "grad_norm": 3.14918851852417,
+ "learning_rate": 9.949366055857605e-07,
+ "loss": 0.3366,
+ "step": 8984
+ },
+ {
+ "epoch": 4.24822695035461,
+ "grad_norm": 3.113927125930786,
+ "learning_rate": 9.944385409749654e-07,
+ "loss": 0.3532,
+ "step": 8985
+ },
+ {
+ "epoch": 4.248699763593381,
+ "grad_norm": 3.0749151706695557,
+ "learning_rate": 9.939405701088818e-07,
+ "loss": 0.3659,
+ "step": 8986
+ },
+ {
+ "epoch": 4.2491725768321515,
+ "grad_norm": 2.831846237182617,
+ "learning_rate": 9.934426930185145e-07,
+ "loss": 0.2965,
+ "step": 8987
+ },
+ {
+ "epoch": 4.249645390070922,
+ "grad_norm": 3.0280253887176514,
+ "learning_rate": 9.929449097348642e-07,
+ "loss": 0.3144,
+ "step": 8988
+ },
+ {
+ "epoch": 4.250118203309692,
+ "grad_norm": 3.250284433364868,
+ "learning_rate": 9.924472202889267e-07,
+ "loss": 0.3414,
+ "step": 8989
+ },
+ {
+ "epoch": 4.250591016548463,
+ "grad_norm": 3.582306146621704,
+ "learning_rate": 9.9194962471169e-07,
+ "loss": 0.3593,
+ "step": 8990
+ },
+ {
+ "epoch": 4.251063829787234,
+ "grad_norm": 2.8985490798950195,
+ "learning_rate": 9.914521230341382e-07,
+ "loss": 0.2948,
+ "step": 8991
+ },
+ {
+ "epoch": 4.251536643026005,
+ "grad_norm": 3.399209499359131,
+ "learning_rate": 9.909547152872476e-07,
+ "loss": 0.3942,
+ "step": 8992
+ },
+ {
+ "epoch": 4.2520094562647754,
+ "grad_norm": 3.344658613204956,
+ "learning_rate": 9.904574015019895e-07,
+ "loss": 0.3649,
+ "step": 8993
+ },
+ {
+ "epoch": 4.252482269503546,
+ "grad_norm": 3.057995319366455,
+ "learning_rate": 9.899601817093305e-07,
+ "loss": 0.317,
+ "step": 8994
+ },
+ {
+ "epoch": 4.252955082742317,
+ "grad_norm": 3.4610090255737305,
+ "learning_rate": 9.894630559402296e-07,
+ "loss": 0.3235,
+ "step": 8995
+ },
+ {
+ "epoch": 4.253427895981088,
+ "grad_norm": 3.6014657020568848,
+ "learning_rate": 9.889660242256407e-07,
+ "loss": 0.3057,
+ "step": 8996
+ },
+ {
+ "epoch": 4.253900709219858,
+ "grad_norm": 2.850391149520874,
+ "learning_rate": 9.884690865965118e-07,
+ "loss": 0.3584,
+ "step": 8997
+ },
+ {
+ "epoch": 4.2543735224586285,
+ "grad_norm": 3.100820541381836,
+ "learning_rate": 9.879722430837844e-07,
+ "loss": 0.3802,
+ "step": 8998
+ },
+ {
+ "epoch": 4.254846335697399,
+ "grad_norm": 3.1044704914093018,
+ "learning_rate": 9.874754937183962e-07,
+ "loss": 0.3293,
+ "step": 8999
+ },
+ {
+ "epoch": 4.25531914893617,
+ "grad_norm": 2.750356912612915,
+ "learning_rate": 9.869788385312764e-07,
+ "loss": 0.3218,
+ "step": 9000
+ },
+ {
+ "epoch": 4.255791962174941,
+ "grad_norm": 3.1008687019348145,
+ "learning_rate": 9.864822775533494e-07,
+ "loss": 0.3316,
+ "step": 9001
+ },
+ {
+ "epoch": 4.256264775413712,
+ "grad_norm": 3.2051985263824463,
+ "learning_rate": 9.859858108155351e-07,
+ "loss": 0.3661,
+ "step": 9002
+ },
+ {
+ "epoch": 4.2567375886524825,
+ "grad_norm": 3.1303839683532715,
+ "learning_rate": 9.854894383487448e-07,
+ "loss": 0.3683,
+ "step": 9003
+ },
+ {
+ "epoch": 4.257210401891253,
+ "grad_norm": 3.0718302726745605,
+ "learning_rate": 9.84993160183887e-07,
+ "loss": 0.3284,
+ "step": 9004
+ },
+ {
+ "epoch": 4.257683215130023,
+ "grad_norm": 2.9759013652801514,
+ "learning_rate": 9.844969763518625e-07,
+ "loss": 0.3465,
+ "step": 9005
+ },
+ {
+ "epoch": 4.258156028368794,
+ "grad_norm": 3.1965582370758057,
+ "learning_rate": 9.840008868835647e-07,
+ "loss": 0.3593,
+ "step": 9006
+ },
+ {
+ "epoch": 4.258628841607565,
+ "grad_norm": 3.2931249141693115,
+ "learning_rate": 9.835048918098853e-07,
+ "loss": 0.3631,
+ "step": 9007
+ },
+ {
+ "epoch": 4.259101654846336,
+ "grad_norm": 3.070627450942993,
+ "learning_rate": 9.830089911617054e-07,
+ "loss": 0.3541,
+ "step": 9008
+ },
+ {
+ "epoch": 4.259574468085106,
+ "grad_norm": 3.209110736846924,
+ "learning_rate": 9.825131849699051e-07,
+ "loss": 0.344,
+ "step": 9009
+ },
+ {
+ "epoch": 4.260047281323877,
+ "grad_norm": 3.2239089012145996,
+ "learning_rate": 9.820174732653545e-07,
+ "loss": 0.3469,
+ "step": 9010
+ },
+ {
+ "epoch": 4.260520094562648,
+ "grad_norm": 3.0812292098999023,
+ "learning_rate": 9.815218560789199e-07,
+ "loss": 0.2898,
+ "step": 9011
+ },
+ {
+ "epoch": 4.260992907801419,
+ "grad_norm": 3.1709752082824707,
+ "learning_rate": 9.81026333441461e-07,
+ "loss": 0.381,
+ "step": 9012
+ },
+ {
+ "epoch": 4.261465721040189,
+ "grad_norm": 3.1551907062530518,
+ "learning_rate": 9.805309053838308e-07,
+ "loss": 0.2959,
+ "step": 9013
+ },
+ {
+ "epoch": 4.2619385342789595,
+ "grad_norm": 3.3751494884490967,
+ "learning_rate": 9.800355719368793e-07,
+ "loss": 0.3806,
+ "step": 9014
+ },
+ {
+ "epoch": 4.26241134751773,
+ "grad_norm": 3.2392799854278564,
+ "learning_rate": 9.795403331314479e-07,
+ "loss": 0.3006,
+ "step": 9015
+ },
+ {
+ "epoch": 4.262884160756501,
+ "grad_norm": 3.1428463459014893,
+ "learning_rate": 9.790451889983724e-07,
+ "loss": 0.3212,
+ "step": 9016
+ },
+ {
+ "epoch": 4.263356973995272,
+ "grad_norm": 3.353379726409912,
+ "learning_rate": 9.785501395684844e-07,
+ "loss": 0.3555,
+ "step": 9017
+ },
+ {
+ "epoch": 4.263829787234043,
+ "grad_norm": 3.3555281162261963,
+ "learning_rate": 9.780551848726068e-07,
+ "loss": 0.3729,
+ "step": 9018
+ },
+ {
+ "epoch": 4.2643026004728135,
+ "grad_norm": 3.0275049209594727,
+ "learning_rate": 9.775603249415606e-07,
+ "loss": 0.3579,
+ "step": 9019
+ },
+ {
+ "epoch": 4.264775413711584,
+ "grad_norm": 3.2631473541259766,
+ "learning_rate": 9.770655598061569e-07,
+ "loss": 0.3755,
+ "step": 9020
+ },
+ {
+ "epoch": 4.265248226950354,
+ "grad_norm": 2.9419705867767334,
+ "learning_rate": 9.76570889497202e-07,
+ "loss": 0.28,
+ "step": 9021
+ },
+ {
+ "epoch": 4.265721040189125,
+ "grad_norm": 2.931673288345337,
+ "learning_rate": 9.76076314045499e-07,
+ "loss": 0.3497,
+ "step": 9022
+ },
+ {
+ "epoch": 4.266193853427896,
+ "grad_norm": 3.218503952026367,
+ "learning_rate": 9.755818334818416e-07,
+ "loss": 0.3775,
+ "step": 9023
+ },
+ {
+ "epoch": 4.266666666666667,
+ "grad_norm": 3.1422977447509766,
+ "learning_rate": 9.750874478370181e-07,
+ "loss": 0.3957,
+ "step": 9024
+ },
+ {
+ "epoch": 4.267139479905437,
+ "grad_norm": 3.066502571105957,
+ "learning_rate": 9.745931571418134e-07,
+ "loss": 0.3361,
+ "step": 9025
+ },
+ {
+ "epoch": 4.267612293144208,
+ "grad_norm": 3.186897039413452,
+ "learning_rate": 9.740989614270044e-07,
+ "loss": 0.3794,
+ "step": 9026
+ },
+ {
+ "epoch": 4.268085106382979,
+ "grad_norm": 3.2698588371276855,
+ "learning_rate": 9.736048607233623e-07,
+ "loss": 0.3595,
+ "step": 9027
+ },
+ {
+ "epoch": 4.26855791962175,
+ "grad_norm": 2.9609718322753906,
+ "learning_rate": 9.731108550616523e-07,
+ "loss": 0.3387,
+ "step": 9028
+ },
+ {
+ "epoch": 4.26903073286052,
+ "grad_norm": 3.10768985748291,
+ "learning_rate": 9.72616944472633e-07,
+ "loss": 0.387,
+ "step": 9029
+ },
+ {
+ "epoch": 4.2695035460992905,
+ "grad_norm": 2.8060896396636963,
+ "learning_rate": 9.721231289870602e-07,
+ "loss": 0.3132,
+ "step": 9030
+ },
+ {
+ "epoch": 4.269976359338061,
+ "grad_norm": 3.0502681732177734,
+ "learning_rate": 9.716294086356801e-07,
+ "loss": 0.3246,
+ "step": 9031
+ },
+ {
+ "epoch": 4.270449172576832,
+ "grad_norm": 2.8298611640930176,
+ "learning_rate": 9.711357834492356e-07,
+ "loss": 0.2958,
+ "step": 9032
+ },
+ {
+ "epoch": 4.270921985815603,
+ "grad_norm": 2.693819761276245,
+ "learning_rate": 9.70642253458462e-07,
+ "loss": 0.325,
+ "step": 9033
+ },
+ {
+ "epoch": 4.271394799054374,
+ "grad_norm": 2.8179452419281006,
+ "learning_rate": 9.701488186940885e-07,
+ "loss": 0.3252,
+ "step": 9034
+ },
+ {
+ "epoch": 4.2718676122931445,
+ "grad_norm": 2.9885077476501465,
+ "learning_rate": 9.696554791868406e-07,
+ "loss": 0.3234,
+ "step": 9035
+ },
+ {
+ "epoch": 4.272340425531915,
+ "grad_norm": 4.8119378089904785,
+ "learning_rate": 9.691622349674349e-07,
+ "loss": 0.3814,
+ "step": 9036
+ },
+ {
+ "epoch": 4.272813238770685,
+ "grad_norm": 3.971498966217041,
+ "learning_rate": 9.68669086066585e-07,
+ "loss": 0.3684,
+ "step": 9037
+ },
+ {
+ "epoch": 4.273286052009456,
+ "grad_norm": 3.0153439044952393,
+ "learning_rate": 9.681760325149967e-07,
+ "loss": 0.3449,
+ "step": 9038
+ },
+ {
+ "epoch": 4.273758865248227,
+ "grad_norm": 3.4421799182891846,
+ "learning_rate": 9.676830743433688e-07,
+ "loss": 0.3578,
+ "step": 9039
+ },
+ {
+ "epoch": 4.274231678486998,
+ "grad_norm": 3.2896533012390137,
+ "learning_rate": 9.67190211582398e-07,
+ "loss": 0.3421,
+ "step": 9040
+ },
+ {
+ "epoch": 4.274704491725768,
+ "grad_norm": 3.388833522796631,
+ "learning_rate": 9.666974442627717e-07,
+ "loss": 0.4089,
+ "step": 9041
+ },
+ {
+ "epoch": 4.275177304964539,
+ "grad_norm": 3.1000685691833496,
+ "learning_rate": 9.662047724151718e-07,
+ "loss": 0.4046,
+ "step": 9042
+ },
+ {
+ "epoch": 4.27565011820331,
+ "grad_norm": 3.5651235580444336,
+ "learning_rate": 9.657121960702753e-07,
+ "loss": 0.4275,
+ "step": 9043
+ },
+ {
+ "epoch": 4.276122931442081,
+ "grad_norm": 2.944434881210327,
+ "learning_rate": 9.65219715258752e-07,
+ "loss": 0.3395,
+ "step": 9044
+ },
+ {
+ "epoch": 4.276595744680851,
+ "grad_norm": 2.7315311431884766,
+ "learning_rate": 9.64727330011268e-07,
+ "loss": 0.3305,
+ "step": 9045
+ },
+ {
+ "epoch": 4.2770685579196215,
+ "grad_norm": 3.423567533493042,
+ "learning_rate": 9.642350403584805e-07,
+ "loss": 0.3605,
+ "step": 9046
+ },
+ {
+ "epoch": 4.277541371158392,
+ "grad_norm": 3.239745616912842,
+ "learning_rate": 9.637428463310435e-07,
+ "loss": 0.3519,
+ "step": 9047
+ },
+ {
+ "epoch": 4.278014184397163,
+ "grad_norm": 3.388700008392334,
+ "learning_rate": 9.632507479596035e-07,
+ "loss": 0.359,
+ "step": 9048
+ },
+ {
+ "epoch": 4.278486997635934,
+ "grad_norm": 3.3524253368377686,
+ "learning_rate": 9.627587452747996e-07,
+ "loss": 0.3381,
+ "step": 9049
+ },
+ {
+ "epoch": 4.278959810874705,
+ "grad_norm": 3.1089365482330322,
+ "learning_rate": 9.622668383072695e-07,
+ "loss": 0.3143,
+ "step": 9050
+ },
+ {
+ "epoch": 4.2794326241134755,
+ "grad_norm": 3.3477213382720947,
+ "learning_rate": 9.617750270876402e-07,
+ "loss": 0.3788,
+ "step": 9051
+ },
+ {
+ "epoch": 4.279905437352246,
+ "grad_norm": 2.934818983078003,
+ "learning_rate": 9.612833116465342e-07,
+ "loss": 0.3589,
+ "step": 9052
+ },
+ {
+ "epoch": 4.280378250591016,
+ "grad_norm": 3.125014305114746,
+ "learning_rate": 9.607916920145704e-07,
+ "loss": 0.3181,
+ "step": 9053
+ },
+ {
+ "epoch": 4.280851063829787,
+ "grad_norm": 3.5860400199890137,
+ "learning_rate": 9.60300168222358e-07,
+ "loss": 0.3597,
+ "step": 9054
+ },
+ {
+ "epoch": 4.281323877068558,
+ "grad_norm": 3.1414008140563965,
+ "learning_rate": 9.598087403005032e-07,
+ "loss": 0.3186,
+ "step": 9055
+ },
+ {
+ "epoch": 4.281796690307329,
+ "grad_norm": 2.841228723526001,
+ "learning_rate": 9.593174082796046e-07,
+ "loss": 0.3547,
+ "step": 9056
+ },
+ {
+ "epoch": 4.282269503546099,
+ "grad_norm": 3.1145405769348145,
+ "learning_rate": 9.588261721902547e-07,
+ "loss": 0.3317,
+ "step": 9057
+ },
+ {
+ "epoch": 4.28274231678487,
+ "grad_norm": 2.9518024921417236,
+ "learning_rate": 9.58335032063042e-07,
+ "loss": 0.3723,
+ "step": 9058
+ },
+ {
+ "epoch": 4.283215130023641,
+ "grad_norm": 2.887479782104492,
+ "learning_rate": 9.578439879285467e-07,
+ "loss": 0.3288,
+ "step": 9059
+ },
+ {
+ "epoch": 4.283687943262412,
+ "grad_norm": 3.253427267074585,
+ "learning_rate": 9.573530398173444e-07,
+ "loss": 0.326,
+ "step": 9060
+ },
+ {
+ "epoch": 4.284160756501182,
+ "grad_norm": 3.0442020893096924,
+ "learning_rate": 9.568621877600038e-07,
+ "loss": 0.3807,
+ "step": 9061
+ },
+ {
+ "epoch": 4.2846335697399525,
+ "grad_norm": 2.928743600845337,
+ "learning_rate": 9.563714317870877e-07,
+ "loss": 0.2977,
+ "step": 9062
+ },
+ {
+ "epoch": 4.285106382978723,
+ "grad_norm": 3.2095022201538086,
+ "learning_rate": 9.558807719291543e-07,
+ "loss": 0.3571,
+ "step": 9063
+ },
+ {
+ "epoch": 4.285579196217494,
+ "grad_norm": 3.3752429485321045,
+ "learning_rate": 9.55390208216754e-07,
+ "loss": 0.3928,
+ "step": 9064
+ },
+ {
+ "epoch": 4.286052009456265,
+ "grad_norm": 3.125702381134033,
+ "learning_rate": 9.548997406804333e-07,
+ "loss": 0.3334,
+ "step": 9065
+ },
+ {
+ "epoch": 4.286524822695036,
+ "grad_norm": 3.058772563934326,
+ "learning_rate": 9.544093693507308e-07,
+ "loss": 0.3184,
+ "step": 9066
+ },
+ {
+ "epoch": 4.2869976359338064,
+ "grad_norm": 3.1085948944091797,
+ "learning_rate": 9.539190942581785e-07,
+ "loss": 0.3626,
+ "step": 9067
+ },
+ {
+ "epoch": 4.287470449172577,
+ "grad_norm": 2.8497378826141357,
+ "learning_rate": 9.53428915433306e-07,
+ "loss": 0.3451,
+ "step": 9068
+ },
+ {
+ "epoch": 4.287943262411347,
+ "grad_norm": 3.411508798599243,
+ "learning_rate": 9.529388329066325e-07,
+ "loss": 0.3608,
+ "step": 9069
+ },
+ {
+ "epoch": 4.288416075650118,
+ "grad_norm": 3.1312575340270996,
+ "learning_rate": 9.524488467086751e-07,
+ "loss": 0.3419,
+ "step": 9070
+ },
+ {
+ "epoch": 4.288888888888889,
+ "grad_norm": 3.1531126499176025,
+ "learning_rate": 9.519589568699419e-07,
+ "loss": 0.3261,
+ "step": 9071
+ },
+ {
+ "epoch": 4.2893617021276595,
+ "grad_norm": 2.8852546215057373,
+ "learning_rate": 9.514691634209361e-07,
+ "loss": 0.3258,
+ "step": 9072
+ },
+ {
+ "epoch": 4.28983451536643,
+ "grad_norm": 3.0486297607421875,
+ "learning_rate": 9.50979466392156e-07,
+ "loss": 0.3207,
+ "step": 9073
+ },
+ {
+ "epoch": 4.290307328605201,
+ "grad_norm": 3.017788887023926,
+ "learning_rate": 9.504898658140924e-07,
+ "loss": 0.3288,
+ "step": 9074
+ },
+ {
+ "epoch": 4.290780141843972,
+ "grad_norm": 3.24040150642395,
+ "learning_rate": 9.500003617172302e-07,
+ "loss": 0.2847,
+ "step": 9075
+ },
+ {
+ "epoch": 4.291252955082742,
+ "grad_norm": 3.6793692111968994,
+ "learning_rate": 9.49510954132049e-07,
+ "loss": 0.425,
+ "step": 9076
+ },
+ {
+ "epoch": 4.291725768321513,
+ "grad_norm": 2.7292215824127197,
+ "learning_rate": 9.490216430890215e-07,
+ "loss": 0.3208,
+ "step": 9077
+ },
+ {
+ "epoch": 4.292198581560283,
+ "grad_norm": 2.650388479232788,
+ "learning_rate": 9.485324286186159e-07,
+ "loss": 0.2842,
+ "step": 9078
+ },
+ {
+ "epoch": 4.292671394799054,
+ "grad_norm": 3.1459171772003174,
+ "learning_rate": 9.480433107512932e-07,
+ "loss": 0.3287,
+ "step": 9079
+ },
+ {
+ "epoch": 4.293144208037825,
+ "grad_norm": 3.1777186393737793,
+ "learning_rate": 9.475542895175074e-07,
+ "loss": 0.3385,
+ "step": 9080
+ },
+ {
+ "epoch": 4.293617021276596,
+ "grad_norm": 3.5608465671539307,
+ "learning_rate": 9.470653649477096e-07,
+ "loss": 0.3574,
+ "step": 9081
+ },
+ {
+ "epoch": 4.294089834515367,
+ "grad_norm": 2.58306884765625,
+ "learning_rate": 9.465765370723415e-07,
+ "loss": 0.3156,
+ "step": 9082
+ },
+ {
+ "epoch": 4.294562647754137,
+ "grad_norm": 3.3265857696533203,
+ "learning_rate": 9.460878059218415e-07,
+ "loss": 0.3678,
+ "step": 9083
+ },
+ {
+ "epoch": 4.295035460992908,
+ "grad_norm": 3.259326696395874,
+ "learning_rate": 9.455991715266403e-07,
+ "loss": 0.3675,
+ "step": 9084
+ },
+ {
+ "epoch": 4.295508274231678,
+ "grad_norm": 3.430608034133911,
+ "learning_rate": 9.451106339171618e-07,
+ "loss": 0.3147,
+ "step": 9085
+ },
+ {
+ "epoch": 4.295981087470449,
+ "grad_norm": 3.2896342277526855,
+ "learning_rate": 9.44622193123827e-07,
+ "loss": 0.3482,
+ "step": 9086
+ },
+ {
+ "epoch": 4.29645390070922,
+ "grad_norm": 2.9680557250976562,
+ "learning_rate": 9.441338491770474e-07,
+ "loss": 0.3504,
+ "step": 9087
+ },
+ {
+ "epoch": 4.2969267139479905,
+ "grad_norm": 2.9656941890716553,
+ "learning_rate": 9.436456021072313e-07,
+ "loss": 0.3782,
+ "step": 9088
+ },
+ {
+ "epoch": 4.297399527186761,
+ "grad_norm": 3.463456630706787,
+ "learning_rate": 9.431574519447794e-07,
+ "loss": 0.3517,
+ "step": 9089
+ },
+ {
+ "epoch": 4.297872340425532,
+ "grad_norm": 3.3658525943756104,
+ "learning_rate": 9.426693987200864e-07,
+ "loss": 0.3535,
+ "step": 9090
+ },
+ {
+ "epoch": 4.298345153664303,
+ "grad_norm": 3.087533712387085,
+ "learning_rate": 9.421814424635414e-07,
+ "loss": 0.3007,
+ "step": 9091
+ },
+ {
+ "epoch": 4.298817966903073,
+ "grad_norm": 3.4596481323242188,
+ "learning_rate": 9.41693583205526e-07,
+ "loss": 0.3797,
+ "step": 9092
+ },
+ {
+ "epoch": 4.299290780141844,
+ "grad_norm": 3.647507667541504,
+ "learning_rate": 9.412058209764191e-07,
+ "loss": 0.3803,
+ "step": 9093
+ },
+ {
+ "epoch": 4.299763593380614,
+ "grad_norm": 2.9130196571350098,
+ "learning_rate": 9.407181558065909e-07,
+ "loss": 0.32,
+ "step": 9094
+ },
+ {
+ "epoch": 4.300236406619385,
+ "grad_norm": 3.2562668323516846,
+ "learning_rate": 9.402305877264048e-07,
+ "loss": 0.4103,
+ "step": 9095
+ },
+ {
+ "epoch": 4.300709219858156,
+ "grad_norm": 3.1416616439819336,
+ "learning_rate": 9.397431167662216e-07,
+ "loss": 0.3498,
+ "step": 9096
+ },
+ {
+ "epoch": 4.301182033096927,
+ "grad_norm": 2.9540042877197266,
+ "learning_rate": 9.392557429563929e-07,
+ "loss": 0.3073,
+ "step": 9097
+ },
+ {
+ "epoch": 4.301654846335698,
+ "grad_norm": 3.0450825691223145,
+ "learning_rate": 9.387684663272645e-07,
+ "loss": 0.3295,
+ "step": 9098
+ },
+ {
+ "epoch": 4.302127659574468,
+ "grad_norm": 3.1060359477996826,
+ "learning_rate": 9.38281286909179e-07,
+ "loss": 0.3582,
+ "step": 9099
+ },
+ {
+ "epoch": 4.302600472813239,
+ "grad_norm": 2.901136636734009,
+ "learning_rate": 9.377942047324687e-07,
+ "loss": 0.3162,
+ "step": 9100
+ },
+ {
+ "epoch": 4.303073286052009,
+ "grad_norm": 3.5618929862976074,
+ "learning_rate": 9.373072198274641e-07,
+ "loss": 0.2917,
+ "step": 9101
+ },
+ {
+ "epoch": 4.30354609929078,
+ "grad_norm": 3.0853395462036133,
+ "learning_rate": 9.368203322244871e-07,
+ "loss": 0.3124,
+ "step": 9102
+ },
+ {
+ "epoch": 4.304018912529551,
+ "grad_norm": 3.00398588180542,
+ "learning_rate": 9.363335419538524e-07,
+ "loss": 0.3167,
+ "step": 9103
+ },
+ {
+ "epoch": 4.3044917257683215,
+ "grad_norm": 3.4705588817596436,
+ "learning_rate": 9.358468490458725e-07,
+ "loss": 0.3188,
+ "step": 9104
+ },
+ {
+ "epoch": 4.304964539007092,
+ "grad_norm": 2.948302745819092,
+ "learning_rate": 9.353602535308509e-07,
+ "loss": 0.2739,
+ "step": 9105
+ },
+ {
+ "epoch": 4.305437352245863,
+ "grad_norm": 3.4512269496917725,
+ "learning_rate": 9.348737554390852e-07,
+ "loss": 0.3256,
+ "step": 9106
+ },
+ {
+ "epoch": 4.305910165484634,
+ "grad_norm": 2.9979147911071777,
+ "learning_rate": 9.343873548008684e-07,
+ "loss": 0.3184,
+ "step": 9107
+ },
+ {
+ "epoch": 4.306382978723404,
+ "grad_norm": 3.1008479595184326,
+ "learning_rate": 9.339010516464847e-07,
+ "loss": 0.3251,
+ "step": 9108
+ },
+ {
+ "epoch": 4.306855791962175,
+ "grad_norm": 2.86930775642395,
+ "learning_rate": 9.334148460062165e-07,
+ "loss": 0.3322,
+ "step": 9109
+ },
+ {
+ "epoch": 4.307328605200945,
+ "grad_norm": 3.2068963050842285,
+ "learning_rate": 9.329287379103355e-07,
+ "loss": 0.3845,
+ "step": 9110
+ },
+ {
+ "epoch": 4.307801418439716,
+ "grad_norm": 3.567309856414795,
+ "learning_rate": 9.324427273891115e-07,
+ "loss": 0.4037,
+ "step": 9111
+ },
+ {
+ "epoch": 4.308274231678487,
+ "grad_norm": 3.2064783573150635,
+ "learning_rate": 9.319568144728056e-07,
+ "loss": 0.3481,
+ "step": 9112
+ },
+ {
+ "epoch": 4.308747044917258,
+ "grad_norm": 3.2492294311523438,
+ "learning_rate": 9.314709991916721e-07,
+ "loss": 0.3657,
+ "step": 9113
+ },
+ {
+ "epoch": 4.309219858156029,
+ "grad_norm": 2.990755081176758,
+ "learning_rate": 9.309852815759626e-07,
+ "loss": 0.3582,
+ "step": 9114
+ },
+ {
+ "epoch": 4.309692671394799,
+ "grad_norm": 3.3375513553619385,
+ "learning_rate": 9.304996616559187e-07,
+ "loss": 0.3657,
+ "step": 9115
+ },
+ {
+ "epoch": 4.31016548463357,
+ "grad_norm": 2.945552349090576,
+ "learning_rate": 9.300141394617798e-07,
+ "loss": 0.3075,
+ "step": 9116
+ },
+ {
+ "epoch": 4.31063829787234,
+ "grad_norm": 3.5318517684936523,
+ "learning_rate": 9.295287150237764e-07,
+ "loss": 0.39,
+ "step": 9117
+ },
+ {
+ "epoch": 4.311111111111111,
+ "grad_norm": 3.452049732208252,
+ "learning_rate": 9.290433883721326e-07,
+ "loss": 0.3821,
+ "step": 9118
+ },
+ {
+ "epoch": 4.311583924349882,
+ "grad_norm": 3.0762388706207275,
+ "learning_rate": 9.285581595370693e-07,
+ "loss": 0.316,
+ "step": 9119
+ },
+ {
+ "epoch": 4.3120567375886525,
+ "grad_norm": 4.13551664352417,
+ "learning_rate": 9.28073028548799e-07,
+ "loss": 0.3661,
+ "step": 9120
+ },
+ {
+ "epoch": 4.312529550827423,
+ "grad_norm": 3.1915719509124756,
+ "learning_rate": 9.275879954375286e-07,
+ "loss": 0.371,
+ "step": 9121
+ },
+ {
+ "epoch": 4.313002364066194,
+ "grad_norm": 3.118861198425293,
+ "learning_rate": 9.271030602334577e-07,
+ "loss": 0.3943,
+ "step": 9122
+ },
+ {
+ "epoch": 4.313475177304965,
+ "grad_norm": 3.042757987976074,
+ "learning_rate": 9.266182229667836e-07,
+ "loss": 0.3779,
+ "step": 9123
+ },
+ {
+ "epoch": 4.313947990543735,
+ "grad_norm": 2.949110746383667,
+ "learning_rate": 9.261334836676933e-07,
+ "loss": 0.3721,
+ "step": 9124
+ },
+ {
+ "epoch": 4.314420803782506,
+ "grad_norm": 2.982090950012207,
+ "learning_rate": 9.256488423663701e-07,
+ "loss": 0.2865,
+ "step": 9125
+ },
+ {
+ "epoch": 4.314893617021276,
+ "grad_norm": 3.6527535915374756,
+ "learning_rate": 9.25164299092989e-07,
+ "loss": 0.3689,
+ "step": 9126
+ },
+ {
+ "epoch": 4.315366430260047,
+ "grad_norm": 3.3310744762420654,
+ "learning_rate": 9.246798538777227e-07,
+ "loss": 0.3198,
+ "step": 9127
+ },
+ {
+ "epoch": 4.315839243498818,
+ "grad_norm": 2.8298583030700684,
+ "learning_rate": 9.241955067507332e-07,
+ "loss": 0.3711,
+ "step": 9128
+ },
+ {
+ "epoch": 4.316312056737589,
+ "grad_norm": 3.636894702911377,
+ "learning_rate": 9.237112577421809e-07,
+ "loss": 0.374,
+ "step": 9129
+ },
+ {
+ "epoch": 4.31678486997636,
+ "grad_norm": 2.896251678466797,
+ "learning_rate": 9.232271068822166e-07,
+ "loss": 0.3372,
+ "step": 9130
+ },
+ {
+ "epoch": 4.31725768321513,
+ "grad_norm": 3.2836971282958984,
+ "learning_rate": 9.227430542009854e-07,
+ "loss": 0.3584,
+ "step": 9131
+ },
+ {
+ "epoch": 4.317730496453901,
+ "grad_norm": 2.9452571868896484,
+ "learning_rate": 9.222590997286293e-07,
+ "loss": 0.3658,
+ "step": 9132
+ },
+ {
+ "epoch": 4.318203309692671,
+ "grad_norm": 2.88613224029541,
+ "learning_rate": 9.217752434952801e-07,
+ "loss": 0.3221,
+ "step": 9133
+ },
+ {
+ "epoch": 4.318676122931442,
+ "grad_norm": 2.7794570922851562,
+ "learning_rate": 9.212914855310667e-07,
+ "loss": 0.3142,
+ "step": 9134
+ },
+ {
+ "epoch": 4.319148936170213,
+ "grad_norm": 3.0195112228393555,
+ "learning_rate": 9.208078258661102e-07,
+ "loss": 0.3039,
+ "step": 9135
+ },
+ {
+ "epoch": 4.3196217494089835,
+ "grad_norm": 3.5178396701812744,
+ "learning_rate": 9.203242645305253e-07,
+ "loss": 0.3912,
+ "step": 9136
+ },
+ {
+ "epoch": 4.320094562647754,
+ "grad_norm": 3.145413875579834,
+ "learning_rate": 9.198408015544222e-07,
+ "loss": 0.3045,
+ "step": 9137
+ },
+ {
+ "epoch": 4.320567375886525,
+ "grad_norm": 3.151193380355835,
+ "learning_rate": 9.193574369679037e-07,
+ "loss": 0.341,
+ "step": 9138
+ },
+ {
+ "epoch": 4.321040189125296,
+ "grad_norm": 3.248255968093872,
+ "learning_rate": 9.188741708010668e-07,
+ "loss": 0.4344,
+ "step": 9139
+ },
+ {
+ "epoch": 4.321513002364066,
+ "grad_norm": 2.953218460083008,
+ "learning_rate": 9.183910030840021e-07,
+ "loss": 0.343,
+ "step": 9140
+ },
+ {
+ "epoch": 4.321985815602837,
+ "grad_norm": 3.1873161792755127,
+ "learning_rate": 9.179079338467936e-07,
+ "loss": 0.3082,
+ "step": 9141
+ },
+ {
+ "epoch": 4.322458628841607,
+ "grad_norm": 3.2587013244628906,
+ "learning_rate": 9.174249631195218e-07,
+ "loss": 0.3855,
+ "step": 9142
+ },
+ {
+ "epoch": 4.322931442080378,
+ "grad_norm": 2.956145763397217,
+ "learning_rate": 9.169420909322573e-07,
+ "loss": 0.3156,
+ "step": 9143
+ },
+ {
+ "epoch": 4.323404255319149,
+ "grad_norm": 3.1664650440216064,
+ "learning_rate": 9.164593173150683e-07,
+ "loss": 0.352,
+ "step": 9144
+ },
+ {
+ "epoch": 4.32387706855792,
+ "grad_norm": 3.2792744636535645,
+ "learning_rate": 9.159766422980138e-07,
+ "loss": 0.3963,
+ "step": 9145
+ },
+ {
+ "epoch": 4.3243498817966906,
+ "grad_norm": 3.1249687671661377,
+ "learning_rate": 9.154940659111472e-07,
+ "loss": 0.3405,
+ "step": 9146
+ },
+ {
+ "epoch": 4.324822695035461,
+ "grad_norm": 3.128340244293213,
+ "learning_rate": 9.150115881845181e-07,
+ "loss": 0.2733,
+ "step": 9147
+ },
+ {
+ "epoch": 4.325295508274232,
+ "grad_norm": 3.1790847778320312,
+ "learning_rate": 9.145292091481675e-07,
+ "loss": 0.3676,
+ "step": 9148
+ },
+ {
+ "epoch": 4.325768321513002,
+ "grad_norm": 2.874678134918213,
+ "learning_rate": 9.1404692883213e-07,
+ "loss": 0.2937,
+ "step": 9149
+ },
+ {
+ "epoch": 4.326241134751773,
+ "grad_norm": 3.102196216583252,
+ "learning_rate": 9.135647472664369e-07,
+ "loss": 0.2772,
+ "step": 9150
+ },
+ {
+ "epoch": 4.326713947990544,
+ "grad_norm": 3.0027546882629395,
+ "learning_rate": 9.130826644811099e-07,
+ "loss": 0.3171,
+ "step": 9151
+ },
+ {
+ "epoch": 4.3271867612293144,
+ "grad_norm": 2.750152587890625,
+ "learning_rate": 9.126006805061679e-07,
+ "loss": 0.3689,
+ "step": 9152
+ },
+ {
+ "epoch": 4.327659574468085,
+ "grad_norm": 3.251054525375366,
+ "learning_rate": 9.12118795371621e-07,
+ "loss": 0.3463,
+ "step": 9153
+ },
+ {
+ "epoch": 4.328132387706856,
+ "grad_norm": 2.8849353790283203,
+ "learning_rate": 9.116370091074738e-07,
+ "loss": 0.3,
+ "step": 9154
+ },
+ {
+ "epoch": 4.328605200945627,
+ "grad_norm": 3.4823720455169678,
+ "learning_rate": 9.111553217437255e-07,
+ "loss": 0.336,
+ "step": 9155
+ },
+ {
+ "epoch": 4.329078014184397,
+ "grad_norm": 2.8170886039733887,
+ "learning_rate": 9.106737333103677e-07,
+ "loss": 0.3237,
+ "step": 9156
+ },
+ {
+ "epoch": 4.3295508274231675,
+ "grad_norm": 3.095379114151001,
+ "learning_rate": 9.101922438373881e-07,
+ "loss": 0.3438,
+ "step": 9157
+ },
+ {
+ "epoch": 4.330023640661938,
+ "grad_norm": 3.1764986515045166,
+ "learning_rate": 9.097108533547667e-07,
+ "loss": 0.3174,
+ "step": 9158
+ },
+ {
+ "epoch": 4.330496453900709,
+ "grad_norm": 3.3972036838531494,
+ "learning_rate": 9.092295618924763e-07,
+ "loss": 0.3118,
+ "step": 9159
+ },
+ {
+ "epoch": 4.33096926713948,
+ "grad_norm": 3.112926959991455,
+ "learning_rate": 9.087483694804863e-07,
+ "loss": 0.3521,
+ "step": 9160
+ },
+ {
+ "epoch": 4.331442080378251,
+ "grad_norm": 3.395550012588501,
+ "learning_rate": 9.082672761487573e-07,
+ "loss": 0.3423,
+ "step": 9161
+ },
+ {
+ "epoch": 4.3319148936170215,
+ "grad_norm": 3.486910343170166,
+ "learning_rate": 9.077862819272465e-07,
+ "loss": 0.3655,
+ "step": 9162
+ },
+ {
+ "epoch": 4.332387706855792,
+ "grad_norm": 3.0986499786376953,
+ "learning_rate": 9.07305386845902e-07,
+ "loss": 0.2865,
+ "step": 9163
+ },
+ {
+ "epoch": 4.332860520094562,
+ "grad_norm": 2.962139844894409,
+ "learning_rate": 9.068245909346665e-07,
+ "loss": 0.315,
+ "step": 9164
+ },
+ {
+ "epoch": 4.333333333333333,
+ "grad_norm": 3.0887413024902344,
+ "learning_rate": 9.063438942234787e-07,
+ "loss": 0.3207,
+ "step": 9165
+ },
+ {
+ "epoch": 4.333806146572104,
+ "grad_norm": 2.909770965576172,
+ "learning_rate": 9.058632967422678e-07,
+ "loss": 0.3221,
+ "step": 9166
+ },
+ {
+ "epoch": 4.334278959810875,
+ "grad_norm": 3.1872079372406006,
+ "learning_rate": 9.053827985209604e-07,
+ "loss": 0.3856,
+ "step": 9167
+ },
+ {
+ "epoch": 4.334751773049645,
+ "grad_norm": 3.196985960006714,
+ "learning_rate": 9.049023995894738e-07,
+ "loss": 0.2994,
+ "step": 9168
+ },
+ {
+ "epoch": 4.335224586288416,
+ "grad_norm": 3.3150243759155273,
+ "learning_rate": 9.044220999777204e-07,
+ "loss": 0.3619,
+ "step": 9169
+ },
+ {
+ "epoch": 4.335697399527187,
+ "grad_norm": 3.1662707328796387,
+ "learning_rate": 9.039418997156066e-07,
+ "loss": 0.3347,
+ "step": 9170
+ },
+ {
+ "epoch": 4.336170212765958,
+ "grad_norm": 3.6789329051971436,
+ "learning_rate": 9.034617988330318e-07,
+ "loss": 0.4106,
+ "step": 9171
+ },
+ {
+ "epoch": 4.336643026004728,
+ "grad_norm": 3.5192553997039795,
+ "learning_rate": 9.029817973598898e-07,
+ "loss": 0.3561,
+ "step": 9172
+ },
+ {
+ "epoch": 4.3371158392434985,
+ "grad_norm": 3.4309239387512207,
+ "learning_rate": 9.025018953260692e-07,
+ "loss": 0.3739,
+ "step": 9173
+ },
+ {
+ "epoch": 4.337588652482269,
+ "grad_norm": 3.1840806007385254,
+ "learning_rate": 9.020220927614498e-07,
+ "loss": 0.3479,
+ "step": 9174
+ },
+ {
+ "epoch": 4.33806146572104,
+ "grad_norm": 3.6679139137268066,
+ "learning_rate": 9.015423896959088e-07,
+ "loss": 0.3739,
+ "step": 9175
+ },
+ {
+ "epoch": 4.338534278959811,
+ "grad_norm": 3.125296115875244,
+ "learning_rate": 9.010627861593143e-07,
+ "loss": 0.3046,
+ "step": 9176
+ },
+ {
+ "epoch": 4.339007092198582,
+ "grad_norm": 3.0710368156433105,
+ "learning_rate": 9.005832821815278e-07,
+ "loss": 0.322,
+ "step": 9177
+ },
+ {
+ "epoch": 4.3394799054373525,
+ "grad_norm": 3.068833351135254,
+ "learning_rate": 9.001038777924082e-07,
+ "loss": 0.3229,
+ "step": 9178
+ },
+ {
+ "epoch": 4.339952718676123,
+ "grad_norm": 3.2746002674102783,
+ "learning_rate": 8.996245730218037e-07,
+ "loss": 0.369,
+ "step": 9179
+ },
+ {
+ "epoch": 4.340425531914893,
+ "grad_norm": 3.1138477325439453,
+ "learning_rate": 8.991453678995607e-07,
+ "loss": 0.3456,
+ "step": 9180
+ },
+ {
+ "epoch": 4.340898345153664,
+ "grad_norm": 3.2195467948913574,
+ "learning_rate": 8.986662624555159e-07,
+ "loss": 0.377,
+ "step": 9181
+ },
+ {
+ "epoch": 4.341371158392435,
+ "grad_norm": 3.1197304725646973,
+ "learning_rate": 8.981872567195008e-07,
+ "loss": 0.3007,
+ "step": 9182
+ },
+ {
+ "epoch": 4.341843971631206,
+ "grad_norm": 3.295881748199463,
+ "learning_rate": 8.977083507213418e-07,
+ "loss": 0.4007,
+ "step": 9183
+ },
+ {
+ "epoch": 4.342316784869976,
+ "grad_norm": 3.8013954162597656,
+ "learning_rate": 8.972295444908582e-07,
+ "loss": 0.4322,
+ "step": 9184
+ },
+ {
+ "epoch": 4.342789598108747,
+ "grad_norm": 3.133434295654297,
+ "learning_rate": 8.967508380578633e-07,
+ "loss": 0.3379,
+ "step": 9185
+ },
+ {
+ "epoch": 4.343262411347518,
+ "grad_norm": 3.0942039489746094,
+ "learning_rate": 8.962722314521625e-07,
+ "loss": 0.3753,
+ "step": 9186
+ },
+ {
+ "epoch": 4.343735224586289,
+ "grad_norm": 2.8691020011901855,
+ "learning_rate": 8.957937247035583e-07,
+ "loss": 0.3003,
+ "step": 9187
+ },
+ {
+ "epoch": 4.344208037825059,
+ "grad_norm": 2.8353092670440674,
+ "learning_rate": 8.95315317841845e-07,
+ "loss": 0.3068,
+ "step": 9188
+ },
+ {
+ "epoch": 4.3446808510638295,
+ "grad_norm": 3.071207046508789,
+ "learning_rate": 8.948370108968097e-07,
+ "loss": 0.3147,
+ "step": 9189
+ },
+ {
+ "epoch": 4.3451536643026,
+ "grad_norm": 3.3605904579162598,
+ "learning_rate": 8.943588038982359e-07,
+ "loss": 0.3216,
+ "step": 9190
+ },
+ {
+ "epoch": 4.345626477541371,
+ "grad_norm": 3.0702717304229736,
+ "learning_rate": 8.93880696875899e-07,
+ "loss": 0.3507,
+ "step": 9191
+ },
+ {
+ "epoch": 4.346099290780142,
+ "grad_norm": 3.261456251144409,
+ "learning_rate": 8.934026898595675e-07,
+ "loss": 0.3677,
+ "step": 9192
+ },
+ {
+ "epoch": 4.346572104018913,
+ "grad_norm": 3.0827512741088867,
+ "learning_rate": 8.929247828790066e-07,
+ "loss": 0.3786,
+ "step": 9193
+ },
+ {
+ "epoch": 4.3470449172576835,
+ "grad_norm": 3.488949775695801,
+ "learning_rate": 8.924469759639728e-07,
+ "loss": 0.3685,
+ "step": 9194
+ },
+ {
+ "epoch": 4.347517730496454,
+ "grad_norm": 2.8565423488616943,
+ "learning_rate": 8.919692691442162e-07,
+ "loss": 0.3449,
+ "step": 9195
+ },
+ {
+ "epoch": 4.347990543735224,
+ "grad_norm": 4.654722213745117,
+ "learning_rate": 8.914916624494829e-07,
+ "loss": 0.344,
+ "step": 9196
+ },
+ {
+ "epoch": 4.348463356973995,
+ "grad_norm": 3.256714344024658,
+ "learning_rate": 8.910141559095098e-07,
+ "loss": 0.3487,
+ "step": 9197
+ },
+ {
+ "epoch": 4.348936170212766,
+ "grad_norm": 3.0921413898468018,
+ "learning_rate": 8.90536749554031e-07,
+ "loss": 0.3171,
+ "step": 9198
+ },
+ {
+ "epoch": 4.349408983451537,
+ "grad_norm": 3.1129112243652344,
+ "learning_rate": 8.900594434127712e-07,
+ "loss": 0.3501,
+ "step": 9199
+ },
+ {
+ "epoch": 4.349881796690307,
+ "grad_norm": 3.077688217163086,
+ "learning_rate": 8.8958223751545e-07,
+ "loss": 0.2781,
+ "step": 9200
+ },
+ {
+ "epoch": 4.350354609929078,
+ "grad_norm": 3.2839295864105225,
+ "learning_rate": 8.891051318917821e-07,
+ "loss": 0.3699,
+ "step": 9201
+ },
+ {
+ "epoch": 4.350827423167849,
+ "grad_norm": 3.0370850563049316,
+ "learning_rate": 8.886281265714741e-07,
+ "loss": 0.3344,
+ "step": 9202
+ },
+ {
+ "epoch": 4.35130023640662,
+ "grad_norm": 3.439702033996582,
+ "learning_rate": 8.88151221584227e-07,
+ "loss": 0.3865,
+ "step": 9203
+ },
+ {
+ "epoch": 4.35177304964539,
+ "grad_norm": 3.133317470550537,
+ "learning_rate": 8.876744169597357e-07,
+ "loss": 0.3352,
+ "step": 9204
+ },
+ {
+ "epoch": 4.3522458628841605,
+ "grad_norm": 3.2529115676879883,
+ "learning_rate": 8.871977127276876e-07,
+ "loss": 0.3708,
+ "step": 9205
+ },
+ {
+ "epoch": 4.352718676122931,
+ "grad_norm": 3.2149887084960938,
+ "learning_rate": 8.867211089177669e-07,
+ "loss": 0.3298,
+ "step": 9206
+ },
+ {
+ "epoch": 4.353191489361702,
+ "grad_norm": 2.778116464614868,
+ "learning_rate": 8.86244605559648e-07,
+ "loss": 0.3319,
+ "step": 9207
+ },
+ {
+ "epoch": 4.353664302600473,
+ "grad_norm": 3.206336736679077,
+ "learning_rate": 8.85768202683002e-07,
+ "loss": 0.3793,
+ "step": 9208
+ },
+ {
+ "epoch": 4.354137115839244,
+ "grad_norm": 3.4236080646514893,
+ "learning_rate": 8.852919003174921e-07,
+ "loss": 0.3341,
+ "step": 9209
+ },
+ {
+ "epoch": 4.3546099290780145,
+ "grad_norm": 3.049886703491211,
+ "learning_rate": 8.848156984927742e-07,
+ "loss": 0.3153,
+ "step": 9210
+ },
+ {
+ "epoch": 4.355082742316785,
+ "grad_norm": 4.048248291015625,
+ "learning_rate": 8.843395972385013e-07,
+ "loss": 0.3857,
+ "step": 9211
+ },
+ {
+ "epoch": 4.355555555555555,
+ "grad_norm": 3.3379292488098145,
+ "learning_rate": 8.838635965843165e-07,
+ "loss": 0.3167,
+ "step": 9212
+ },
+ {
+ "epoch": 4.356028368794326,
+ "grad_norm": 2.963364839553833,
+ "learning_rate": 8.833876965598598e-07,
+ "loss": 0.3427,
+ "step": 9213
+ },
+ {
+ "epoch": 4.356501182033097,
+ "grad_norm": 3.1309237480163574,
+ "learning_rate": 8.829118971947625e-07,
+ "loss": 0.3694,
+ "step": 9214
+ },
+ {
+ "epoch": 4.356973995271868,
+ "grad_norm": 3.4728028774261475,
+ "learning_rate": 8.824361985186497e-07,
+ "loss": 0.3769,
+ "step": 9215
+ },
+ {
+ "epoch": 4.357446808510638,
+ "grad_norm": 2.7183408737182617,
+ "learning_rate": 8.819606005611431e-07,
+ "loss": 0.3053,
+ "step": 9216
+ },
+ {
+ "epoch": 4.357919621749409,
+ "grad_norm": 3.34867262840271,
+ "learning_rate": 8.814851033518549e-07,
+ "loss": 0.3316,
+ "step": 9217
+ },
+ {
+ "epoch": 4.35839243498818,
+ "grad_norm": 3.288097858428955,
+ "learning_rate": 8.810097069203924e-07,
+ "loss": 0.362,
+ "step": 9218
+ },
+ {
+ "epoch": 4.358865248226951,
+ "grad_norm": 3.2768566608428955,
+ "learning_rate": 8.805344112963563e-07,
+ "loss": 0.3762,
+ "step": 9219
+ },
+ {
+ "epoch": 4.359338061465721,
+ "grad_norm": 2.730982542037964,
+ "learning_rate": 8.800592165093405e-07,
+ "loss": 0.3184,
+ "step": 9220
+ },
+ {
+ "epoch": 4.3598108747044915,
+ "grad_norm": 3.2347333431243896,
+ "learning_rate": 8.795841225889348e-07,
+ "loss": 0.3414,
+ "step": 9221
+ },
+ {
+ "epoch": 4.360283687943262,
+ "grad_norm": 2.8792049884796143,
+ "learning_rate": 8.791091295647208e-07,
+ "loss": 0.3312,
+ "step": 9222
+ },
+ {
+ "epoch": 4.360756501182033,
+ "grad_norm": 3.2037971019744873,
+ "learning_rate": 8.786342374662726e-07,
+ "loss": 0.3772,
+ "step": 9223
+ },
+ {
+ "epoch": 4.361229314420804,
+ "grad_norm": 3.765244245529175,
+ "learning_rate": 8.781594463231621e-07,
+ "loss": 0.3724,
+ "step": 9224
+ },
+ {
+ "epoch": 4.361702127659575,
+ "grad_norm": 3.085339069366455,
+ "learning_rate": 8.776847561649504e-07,
+ "loss": 0.3468,
+ "step": 9225
+ },
+ {
+ "epoch": 4.3621749408983455,
+ "grad_norm": 2.8031229972839355,
+ "learning_rate": 8.772101670211963e-07,
+ "loss": 0.3219,
+ "step": 9226
+ },
+ {
+ "epoch": 4.362647754137116,
+ "grad_norm": 2.667694091796875,
+ "learning_rate": 8.76735678921449e-07,
+ "loss": 0.3381,
+ "step": 9227
+ },
+ {
+ "epoch": 4.363120567375886,
+ "grad_norm": 2.898273229598999,
+ "learning_rate": 8.762612918952526e-07,
+ "loss": 0.3526,
+ "step": 9228
+ },
+ {
+ "epoch": 4.363593380614657,
+ "grad_norm": 3.1458849906921387,
+ "learning_rate": 8.757870059721465e-07,
+ "loss": 0.3516,
+ "step": 9229
+ },
+ {
+ "epoch": 4.364066193853428,
+ "grad_norm": 3.1719279289245605,
+ "learning_rate": 8.753128211816609e-07,
+ "loss": 0.328,
+ "step": 9230
+ },
+ {
+ "epoch": 4.3645390070921986,
+ "grad_norm": 3.0799217224121094,
+ "learning_rate": 8.748387375533224e-07,
+ "loss": 0.2802,
+ "step": 9231
+ },
+ {
+ "epoch": 4.365011820330969,
+ "grad_norm": 3.1218812465667725,
+ "learning_rate": 8.743647551166498e-07,
+ "loss": 0.3264,
+ "step": 9232
+ },
+ {
+ "epoch": 4.36548463356974,
+ "grad_norm": 3.231175184249878,
+ "learning_rate": 8.738908739011556e-07,
+ "loss": 0.3192,
+ "step": 9233
+ },
+ {
+ "epoch": 4.365957446808511,
+ "grad_norm": 3.088284730911255,
+ "learning_rate": 8.734170939363465e-07,
+ "loss": 0.3569,
+ "step": 9234
+ },
+ {
+ "epoch": 4.366430260047281,
+ "grad_norm": 3.2510828971862793,
+ "learning_rate": 8.729434152517217e-07,
+ "loss": 0.3977,
+ "step": 9235
+ },
+ {
+ "epoch": 4.366903073286052,
+ "grad_norm": 3.435762405395508,
+ "learning_rate": 8.724698378767768e-07,
+ "loss": 0.3201,
+ "step": 9236
+ },
+ {
+ "epoch": 4.3673758865248224,
+ "grad_norm": 3.6876676082611084,
+ "learning_rate": 8.719963618409985e-07,
+ "loss": 0.381,
+ "step": 9237
+ },
+ {
+ "epoch": 4.367848699763593,
+ "grad_norm": 2.7620339393615723,
+ "learning_rate": 8.715229871738676e-07,
+ "loss": 0.2939,
+ "step": 9238
+ },
+ {
+ "epoch": 4.368321513002364,
+ "grad_norm": 3.412893056869507,
+ "learning_rate": 8.710497139048604e-07,
+ "loss": 0.3592,
+ "step": 9239
+ },
+ {
+ "epoch": 4.368794326241135,
+ "grad_norm": 3.2498574256896973,
+ "learning_rate": 8.705765420634446e-07,
+ "loss": 0.4054,
+ "step": 9240
+ },
+ {
+ "epoch": 4.369267139479906,
+ "grad_norm": 3.138425827026367,
+ "learning_rate": 8.701034716790821e-07,
+ "loss": 0.3609,
+ "step": 9241
+ },
+ {
+ "epoch": 4.369739952718676,
+ "grad_norm": 2.7645158767700195,
+ "learning_rate": 8.696305027812301e-07,
+ "loss": 0.3085,
+ "step": 9242
+ },
+ {
+ "epoch": 4.370212765957447,
+ "grad_norm": 3.5948917865753174,
+ "learning_rate": 8.691576353993372e-07,
+ "loss": 0.3846,
+ "step": 9243
+ },
+ {
+ "epoch": 4.370685579196217,
+ "grad_norm": 3.2185158729553223,
+ "learning_rate": 8.68684869562848e-07,
+ "loss": 0.3516,
+ "step": 9244
+ },
+ {
+ "epoch": 4.371158392434988,
+ "grad_norm": 3.057281494140625,
+ "learning_rate": 8.68212205301199e-07,
+ "loss": 0.3197,
+ "step": 9245
+ },
+ {
+ "epoch": 4.371631205673759,
+ "grad_norm": 2.9788076877593994,
+ "learning_rate": 8.677396426438198e-07,
+ "loss": 0.3283,
+ "step": 9246
+ },
+ {
+ "epoch": 4.3721040189125295,
+ "grad_norm": 2.9246625900268555,
+ "learning_rate": 8.672671816201366e-07,
+ "loss": 0.3482,
+ "step": 9247
+ },
+ {
+ "epoch": 4.3725768321513,
+ "grad_norm": 2.9994964599609375,
+ "learning_rate": 8.667948222595671e-07,
+ "loss": 0.3802,
+ "step": 9248
+ },
+ {
+ "epoch": 4.373049645390071,
+ "grad_norm": 2.692626476287842,
+ "learning_rate": 8.663225645915222e-07,
+ "loss": 0.3045,
+ "step": 9249
+ },
+ {
+ "epoch": 4.373522458628842,
+ "grad_norm": 2.794236660003662,
+ "learning_rate": 8.658504086454078e-07,
+ "loss": 0.3056,
+ "step": 9250
+ },
+ {
+ "epoch": 4.373995271867612,
+ "grad_norm": 3.020534038543701,
+ "learning_rate": 8.653783544506222e-07,
+ "loss": 0.3341,
+ "step": 9251
+ },
+ {
+ "epoch": 4.374468085106383,
+ "grad_norm": 3.2142958641052246,
+ "learning_rate": 8.649064020365596e-07,
+ "loss": 0.3435,
+ "step": 9252
+ },
+ {
+ "epoch": 4.374940898345153,
+ "grad_norm": 3.3818624019622803,
+ "learning_rate": 8.644345514326049e-07,
+ "loss": 0.3744,
+ "step": 9253
+ },
+ {
+ "epoch": 4.375413711583924,
+ "grad_norm": 3.1566405296325684,
+ "learning_rate": 8.639628026681399e-07,
+ "loss": 0.3568,
+ "step": 9254
+ },
+ {
+ "epoch": 4.375886524822695,
+ "grad_norm": 3.3773083686828613,
+ "learning_rate": 8.63491155772537e-07,
+ "loss": 0.3523,
+ "step": 9255
+ },
+ {
+ "epoch": 4.376359338061466,
+ "grad_norm": 3.0850939750671387,
+ "learning_rate": 8.630196107751634e-07,
+ "loss": 0.3356,
+ "step": 9256
+ },
+ {
+ "epoch": 4.376832151300237,
+ "grad_norm": 3.361496686935425,
+ "learning_rate": 8.625481677053815e-07,
+ "loss": 0.3619,
+ "step": 9257
+ },
+ {
+ "epoch": 4.377304964539007,
+ "grad_norm": 3.026015043258667,
+ "learning_rate": 8.620768265925444e-07,
+ "loss": 0.3476,
+ "step": 9258
+ },
+ {
+ "epoch": 4.377777777777778,
+ "grad_norm": 3.142747640609741,
+ "learning_rate": 8.61605587466002e-07,
+ "loss": 0.3391,
+ "step": 9259
+ },
+ {
+ "epoch": 4.378250591016548,
+ "grad_norm": 3.0910356044769287,
+ "learning_rate": 8.611344503550956e-07,
+ "loss": 0.3201,
+ "step": 9260
+ },
+ {
+ "epoch": 4.378723404255319,
+ "grad_norm": 3.4462292194366455,
+ "learning_rate": 8.606634152891599e-07,
+ "loss": 0.4075,
+ "step": 9261
+ },
+ {
+ "epoch": 4.37919621749409,
+ "grad_norm": 2.984248638153076,
+ "learning_rate": 8.601924822975258e-07,
+ "loss": 0.3415,
+ "step": 9262
+ },
+ {
+ "epoch": 4.3796690307328605,
+ "grad_norm": 2.944971799850464,
+ "learning_rate": 8.597216514095155e-07,
+ "loss": 0.3163,
+ "step": 9263
+ },
+ {
+ "epoch": 4.380141843971631,
+ "grad_norm": 3.1562247276306152,
+ "learning_rate": 8.592509226544457e-07,
+ "loss": 0.3093,
+ "step": 9264
+ },
+ {
+ "epoch": 4.380614657210402,
+ "grad_norm": 2.911339282989502,
+ "learning_rate": 8.587802960616254e-07,
+ "loss": 0.3287,
+ "step": 9265
+ },
+ {
+ "epoch": 4.381087470449173,
+ "grad_norm": 3.5560295581817627,
+ "learning_rate": 8.583097716603605e-07,
+ "loss": 0.3763,
+ "step": 9266
+ },
+ {
+ "epoch": 4.381560283687943,
+ "grad_norm": 3.35855770111084,
+ "learning_rate": 8.578393494799478e-07,
+ "loss": 0.3703,
+ "step": 9267
+ },
+ {
+ "epoch": 4.382033096926714,
+ "grad_norm": 3.0229954719543457,
+ "learning_rate": 8.573690295496778e-07,
+ "loss": 0.3421,
+ "step": 9268
+ },
+ {
+ "epoch": 4.382505910165484,
+ "grad_norm": 3.0842833518981934,
+ "learning_rate": 8.568988118988348e-07,
+ "loss": 0.3473,
+ "step": 9269
+ },
+ {
+ "epoch": 4.382978723404255,
+ "grad_norm": 3.2471694946289062,
+ "learning_rate": 8.564286965566989e-07,
+ "loss": 0.3025,
+ "step": 9270
+ },
+ {
+ "epoch": 4.383451536643026,
+ "grad_norm": 3.4435837268829346,
+ "learning_rate": 8.559586835525404e-07,
+ "loss": 0.394,
+ "step": 9271
+ },
+ {
+ "epoch": 4.383924349881797,
+ "grad_norm": 3.4572243690490723,
+ "learning_rate": 8.554887729156267e-07,
+ "loss": 0.3745,
+ "step": 9272
+ },
+ {
+ "epoch": 4.384397163120568,
+ "grad_norm": 3.3646514415740967,
+ "learning_rate": 8.550189646752161e-07,
+ "loss": 0.308,
+ "step": 9273
+ },
+ {
+ "epoch": 4.384869976359338,
+ "grad_norm": 2.794933319091797,
+ "learning_rate": 8.545492588605606e-07,
+ "loss": 0.3039,
+ "step": 9274
+ },
+ {
+ "epoch": 4.385342789598109,
+ "grad_norm": 2.969306707382202,
+ "learning_rate": 8.540796555009084e-07,
+ "loss": 0.3815,
+ "step": 9275
+ },
+ {
+ "epoch": 4.385815602836879,
+ "grad_norm": 2.9203877449035645,
+ "learning_rate": 8.536101546254982e-07,
+ "loss": 0.3143,
+ "step": 9276
+ },
+ {
+ "epoch": 4.38628841607565,
+ "grad_norm": 3.451172113418579,
+ "learning_rate": 8.531407562635655e-07,
+ "loss": 0.3673,
+ "step": 9277
+ },
+ {
+ "epoch": 4.386761229314421,
+ "grad_norm": 3.1196818351745605,
+ "learning_rate": 8.526714604443365e-07,
+ "loss": 0.3449,
+ "step": 9278
+ },
+ {
+ "epoch": 4.3872340425531915,
+ "grad_norm": 3.0087406635284424,
+ "learning_rate": 8.522022671970312e-07,
+ "loss": 0.2898,
+ "step": 9279
+ },
+ {
+ "epoch": 4.387706855791962,
+ "grad_norm": 2.885667085647583,
+ "learning_rate": 8.517331765508666e-07,
+ "loss": 0.3119,
+ "step": 9280
+ },
+ {
+ "epoch": 4.388179669030733,
+ "grad_norm": 3.115769624710083,
+ "learning_rate": 8.512641885350494e-07,
+ "loss": 0.3662,
+ "step": 9281
+ },
+ {
+ "epoch": 4.388652482269504,
+ "grad_norm": 2.935692071914673,
+ "learning_rate": 8.507953031787818e-07,
+ "loss": 0.2957,
+ "step": 9282
+ },
+ {
+ "epoch": 4.389125295508274,
+ "grad_norm": 2.96824312210083,
+ "learning_rate": 8.503265205112593e-07,
+ "loss": 0.301,
+ "step": 9283
+ },
+ {
+ "epoch": 4.389598108747045,
+ "grad_norm": 2.8329155445098877,
+ "learning_rate": 8.498578405616697e-07,
+ "loss": 0.3289,
+ "step": 9284
+ },
+ {
+ "epoch": 4.390070921985815,
+ "grad_norm": 3.3063509464263916,
+ "learning_rate": 8.493892633591976e-07,
+ "loss": 0.371,
+ "step": 9285
+ },
+ {
+ "epoch": 4.390543735224586,
+ "grad_norm": 3.036324977874756,
+ "learning_rate": 8.489207889330175e-07,
+ "loss": 0.3111,
+ "step": 9286
+ },
+ {
+ "epoch": 4.391016548463357,
+ "grad_norm": 3.221714496612549,
+ "learning_rate": 8.48452417312301e-07,
+ "loss": 0.3034,
+ "step": 9287
+ },
+ {
+ "epoch": 4.391489361702128,
+ "grad_norm": 2.956813097000122,
+ "learning_rate": 8.479841485262108e-07,
+ "loss": 0.2826,
+ "step": 9288
+ },
+ {
+ "epoch": 4.391962174940899,
+ "grad_norm": 3.3818461894989014,
+ "learning_rate": 8.475159826039028e-07,
+ "loss": 0.3701,
+ "step": 9289
+ },
+ {
+ "epoch": 4.392434988179669,
+ "grad_norm": 3.1623525619506836,
+ "learning_rate": 8.470479195745293e-07,
+ "loss": 0.3405,
+ "step": 9290
+ },
+ {
+ "epoch": 4.39290780141844,
+ "grad_norm": 3.8068127632141113,
+ "learning_rate": 8.465799594672342e-07,
+ "loss": 0.3498,
+ "step": 9291
+ },
+ {
+ "epoch": 4.39338061465721,
+ "grad_norm": 3.042862892150879,
+ "learning_rate": 8.461121023111541e-07,
+ "loss": 0.3025,
+ "step": 9292
+ },
+ {
+ "epoch": 4.393853427895981,
+ "grad_norm": 3.0237231254577637,
+ "learning_rate": 8.456443481354221e-07,
+ "loss": 0.3351,
+ "step": 9293
+ },
+ {
+ "epoch": 4.394326241134752,
+ "grad_norm": 3.233386754989624,
+ "learning_rate": 8.451766969691614e-07,
+ "loss": 0.3314,
+ "step": 9294
+ },
+ {
+ "epoch": 4.3947990543735225,
+ "grad_norm": 2.922518014907837,
+ "learning_rate": 8.447091488414924e-07,
+ "loss": 0.2876,
+ "step": 9295
+ },
+ {
+ "epoch": 4.395271867612293,
+ "grad_norm": 3.2621119022369385,
+ "learning_rate": 8.442417037815268e-07,
+ "loss": 0.3731,
+ "step": 9296
+ },
+ {
+ "epoch": 4.395744680851064,
+ "grad_norm": 3.4238440990448,
+ "learning_rate": 8.437743618183697e-07,
+ "loss": 0.3563,
+ "step": 9297
+ },
+ {
+ "epoch": 4.396217494089835,
+ "grad_norm": 3.607088804244995,
+ "learning_rate": 8.43307122981121e-07,
+ "loss": 0.315,
+ "step": 9298
+ },
+ {
+ "epoch": 4.396690307328605,
+ "grad_norm": 3.0737040042877197,
+ "learning_rate": 8.428399872988724e-07,
+ "loss": 0.2699,
+ "step": 9299
+ },
+ {
+ "epoch": 4.397163120567376,
+ "grad_norm": 3.2364611625671387,
+ "learning_rate": 8.423729548007123e-07,
+ "loss": 0.3054,
+ "step": 9300
+ },
+ {
+ "epoch": 4.397635933806146,
+ "grad_norm": 3.505194664001465,
+ "learning_rate": 8.419060255157199e-07,
+ "loss": 0.3377,
+ "step": 9301
+ },
+ {
+ "epoch": 4.398108747044917,
+ "grad_norm": 3.337815523147583,
+ "learning_rate": 8.414391994729676e-07,
+ "loss": 0.3709,
+ "step": 9302
+ },
+ {
+ "epoch": 4.398581560283688,
+ "grad_norm": 3.310739040374756,
+ "learning_rate": 8.409724767015248e-07,
+ "loss": 0.3125,
+ "step": 9303
+ },
+ {
+ "epoch": 4.399054373522459,
+ "grad_norm": 2.9035723209381104,
+ "learning_rate": 8.405058572304506e-07,
+ "loss": 0.3294,
+ "step": 9304
+ },
+ {
+ "epoch": 4.39952718676123,
+ "grad_norm": 3.162543535232544,
+ "learning_rate": 8.400393410888008e-07,
+ "loss": 0.3563,
+ "step": 9305
+ },
+ {
+ "epoch": 4.4,
+ "grad_norm": 3.320204973220825,
+ "learning_rate": 8.395729283056222e-07,
+ "loss": 0.3681,
+ "step": 9306
+ },
+ {
+ "epoch": 4.400472813238771,
+ "grad_norm": 3.2953343391418457,
+ "learning_rate": 8.391066189099562e-07,
+ "loss": 0.3574,
+ "step": 9307
+ },
+ {
+ "epoch": 4.400945626477541,
+ "grad_norm": 3.2041780948638916,
+ "learning_rate": 8.386404129308387e-07,
+ "loss": 0.3787,
+ "step": 9308
+ },
+ {
+ "epoch": 4.401418439716312,
+ "grad_norm": 3.493856906890869,
+ "learning_rate": 8.381743103972973e-07,
+ "loss": 0.3678,
+ "step": 9309
+ },
+ {
+ "epoch": 4.401891252955083,
+ "grad_norm": 3.3114027976989746,
+ "learning_rate": 8.377083113383553e-07,
+ "loss": 0.3853,
+ "step": 9310
+ },
+ {
+ "epoch": 4.4023640661938535,
+ "grad_norm": 3.148033857345581,
+ "learning_rate": 8.372424157830281e-07,
+ "loss": 0.41,
+ "step": 9311
+ },
+ {
+ "epoch": 4.402836879432624,
+ "grad_norm": 3.1810758113861084,
+ "learning_rate": 8.367766237603245e-07,
+ "loss": 0.3536,
+ "step": 9312
+ },
+ {
+ "epoch": 4.403309692671395,
+ "grad_norm": 3.110158920288086,
+ "learning_rate": 8.363109352992474e-07,
+ "loss": 0.3453,
+ "step": 9313
+ },
+ {
+ "epoch": 4.403782505910166,
+ "grad_norm": 3.140287399291992,
+ "learning_rate": 8.358453504287934e-07,
+ "loss": 0.3617,
+ "step": 9314
+ },
+ {
+ "epoch": 4.404255319148936,
+ "grad_norm": 3.0819156169891357,
+ "learning_rate": 8.353798691779516e-07,
+ "loss": 0.3033,
+ "step": 9315
+ },
+ {
+ "epoch": 4.4047281323877066,
+ "grad_norm": 3.167506217956543,
+ "learning_rate": 8.349144915757071e-07,
+ "loss": 0.325,
+ "step": 9316
+ },
+ {
+ "epoch": 4.405200945626477,
+ "grad_norm": 3.0423221588134766,
+ "learning_rate": 8.34449217651035e-07,
+ "loss": 0.3213,
+ "step": 9317
+ },
+ {
+ "epoch": 4.405673758865248,
+ "grad_norm": 3.4442083835601807,
+ "learning_rate": 8.339840474329078e-07,
+ "loss": 0.3493,
+ "step": 9318
+ },
+ {
+ "epoch": 4.406146572104019,
+ "grad_norm": 3.2931764125823975,
+ "learning_rate": 8.335189809502886e-07,
+ "loss": 0.3947,
+ "step": 9319
+ },
+ {
+ "epoch": 4.40661938534279,
+ "grad_norm": 3.217146396636963,
+ "learning_rate": 8.330540182321345e-07,
+ "loss": 0.3622,
+ "step": 9320
+ },
+ {
+ "epoch": 4.4070921985815605,
+ "grad_norm": 3.0024805068969727,
+ "learning_rate": 8.325891593073981e-07,
+ "loss": 0.3333,
+ "step": 9321
+ },
+ {
+ "epoch": 4.407565011820331,
+ "grad_norm": 2.9128856658935547,
+ "learning_rate": 8.321244042050225e-07,
+ "loss": 0.3069,
+ "step": 9322
+ },
+ {
+ "epoch": 4.408037825059101,
+ "grad_norm": 3.1456804275512695,
+ "learning_rate": 8.316597529539477e-07,
+ "loss": 0.3749,
+ "step": 9323
+ },
+ {
+ "epoch": 4.408510638297872,
+ "grad_norm": 3.093379497528076,
+ "learning_rate": 8.31195205583105e-07,
+ "loss": 0.3899,
+ "step": 9324
+ },
+ {
+ "epoch": 4.408983451536643,
+ "grad_norm": 2.95357608795166,
+ "learning_rate": 8.307307621214181e-07,
+ "loss": 0.3525,
+ "step": 9325
+ },
+ {
+ "epoch": 4.409456264775414,
+ "grad_norm": 3.0832929611206055,
+ "learning_rate": 8.30266422597808e-07,
+ "loss": 0.3976,
+ "step": 9326
+ },
+ {
+ "epoch": 4.409929078014184,
+ "grad_norm": 3.203678846359253,
+ "learning_rate": 8.298021870411862e-07,
+ "loss": 0.3954,
+ "step": 9327
+ },
+ {
+ "epoch": 4.410401891252955,
+ "grad_norm": 3.0880157947540283,
+ "learning_rate": 8.293380554804586e-07,
+ "loss": 0.3153,
+ "step": 9328
+ },
+ {
+ "epoch": 4.410874704491726,
+ "grad_norm": 2.9109299182891846,
+ "learning_rate": 8.28874027944524e-07,
+ "loss": 0.3526,
+ "step": 9329
+ },
+ {
+ "epoch": 4.411347517730497,
+ "grad_norm": 3.4241647720336914,
+ "learning_rate": 8.284101044622767e-07,
+ "loss": 0.3288,
+ "step": 9330
+ },
+ {
+ "epoch": 4.411820330969267,
+ "grad_norm": 3.110163450241089,
+ "learning_rate": 8.279462850626024e-07,
+ "loss": 0.314,
+ "step": 9331
+ },
+ {
+ "epoch": 4.4122931442080375,
+ "grad_norm": 3.024353504180908,
+ "learning_rate": 8.274825697743805e-07,
+ "loss": 0.3107,
+ "step": 9332
+ },
+ {
+ "epoch": 4.412765957446808,
+ "grad_norm": 2.8271758556365967,
+ "learning_rate": 8.270189586264859e-07,
+ "loss": 0.3339,
+ "step": 9333
+ },
+ {
+ "epoch": 4.413238770685579,
+ "grad_norm": 3.179032325744629,
+ "learning_rate": 8.265554516477853e-07,
+ "loss": 0.3365,
+ "step": 9334
+ },
+ {
+ "epoch": 4.41371158392435,
+ "grad_norm": 2.9746336936950684,
+ "learning_rate": 8.260920488671376e-07,
+ "loss": 0.3127,
+ "step": 9335
+ },
+ {
+ "epoch": 4.414184397163121,
+ "grad_norm": 3.247529983520508,
+ "learning_rate": 8.256287503133992e-07,
+ "loss": 0.3251,
+ "step": 9336
+ },
+ {
+ "epoch": 4.4146572104018915,
+ "grad_norm": 3.3317253589630127,
+ "learning_rate": 8.251655560154168e-07,
+ "loss": 0.3246,
+ "step": 9337
+ },
+ {
+ "epoch": 4.415130023640662,
+ "grad_norm": 2.9890010356903076,
+ "learning_rate": 8.247024660020303e-07,
+ "loss": 0.3317,
+ "step": 9338
+ },
+ {
+ "epoch": 4.415602836879432,
+ "grad_norm": 3.3956406116485596,
+ "learning_rate": 8.242394803020759e-07,
+ "loss": 0.4055,
+ "step": 9339
+ },
+ {
+ "epoch": 4.416075650118203,
+ "grad_norm": 2.9918906688690186,
+ "learning_rate": 8.237765989443805e-07,
+ "loss": 0.3415,
+ "step": 9340
+ },
+ {
+ "epoch": 4.416548463356974,
+ "grad_norm": 3.4310927391052246,
+ "learning_rate": 8.233138219577671e-07,
+ "loss": 0.3654,
+ "step": 9341
+ },
+ {
+ "epoch": 4.417021276595745,
+ "grad_norm": 3.207947254180908,
+ "learning_rate": 8.2285114937105e-07,
+ "loss": 0.3773,
+ "step": 9342
+ },
+ {
+ "epoch": 4.417494089834515,
+ "grad_norm": 3.202953338623047,
+ "learning_rate": 8.223885812130367e-07,
+ "loss": 0.3476,
+ "step": 9343
+ },
+ {
+ "epoch": 4.417966903073286,
+ "grad_norm": 3.160951614379883,
+ "learning_rate": 8.219261175125315e-07,
+ "loss": 0.3583,
+ "step": 9344
+ },
+ {
+ "epoch": 4.418439716312057,
+ "grad_norm": 2.892636775970459,
+ "learning_rate": 8.214637582983284e-07,
+ "loss": 0.3241,
+ "step": 9345
+ },
+ {
+ "epoch": 4.418912529550828,
+ "grad_norm": 2.830085277557373,
+ "learning_rate": 8.210015035992172e-07,
+ "loss": 0.2921,
+ "step": 9346
+ },
+ {
+ "epoch": 4.419385342789598,
+ "grad_norm": 3.2846477031707764,
+ "learning_rate": 8.205393534439801e-07,
+ "loss": 0.4281,
+ "step": 9347
+ },
+ {
+ "epoch": 4.4198581560283685,
+ "grad_norm": 2.6153810024261475,
+ "learning_rate": 8.200773078613924e-07,
+ "loss": 0.2848,
+ "step": 9348
+ },
+ {
+ "epoch": 4.420330969267139,
+ "grad_norm": 3.0541396141052246,
+ "learning_rate": 8.196153668802253e-07,
+ "loss": 0.3619,
+ "step": 9349
+ },
+ {
+ "epoch": 4.42080378250591,
+ "grad_norm": 3.516235589981079,
+ "learning_rate": 8.191535305292406e-07,
+ "loss": 0.3996,
+ "step": 9350
+ },
+ {
+ "epoch": 4.421276595744681,
+ "grad_norm": 3.2205963134765625,
+ "learning_rate": 8.186917988371956e-07,
+ "loss": 0.3219,
+ "step": 9351
+ },
+ {
+ "epoch": 4.421749408983452,
+ "grad_norm": 3.2431082725524902,
+ "learning_rate": 8.1823017183284e-07,
+ "loss": 0.3033,
+ "step": 9352
+ },
+ {
+ "epoch": 4.4222222222222225,
+ "grad_norm": 3.337085485458374,
+ "learning_rate": 8.177686495449166e-07,
+ "loss": 0.3467,
+ "step": 9353
+ },
+ {
+ "epoch": 4.422695035460993,
+ "grad_norm": 3.2539291381835938,
+ "learning_rate": 8.173072320021641e-07,
+ "loss": 0.3565,
+ "step": 9354
+ },
+ {
+ "epoch": 4.423167848699763,
+ "grad_norm": 3.069993734359741,
+ "learning_rate": 8.168459192333105e-07,
+ "loss": 0.3627,
+ "step": 9355
+ },
+ {
+ "epoch": 4.423640661938534,
+ "grad_norm": 3.068195104598999,
+ "learning_rate": 8.163847112670826e-07,
+ "loss": 0.2945,
+ "step": 9356
+ },
+ {
+ "epoch": 4.424113475177305,
+ "grad_norm": 3.273607015609741,
+ "learning_rate": 8.159236081321959e-07,
+ "loss": 0.3451,
+ "step": 9357
+ },
+ {
+ "epoch": 4.424586288416076,
+ "grad_norm": 3.355647325515747,
+ "learning_rate": 8.154626098573607e-07,
+ "loss": 0.3469,
+ "step": 9358
+ },
+ {
+ "epoch": 4.425059101654846,
+ "grad_norm": 2.792948007583618,
+ "learning_rate": 8.150017164712831e-07,
+ "loss": 0.3393,
+ "step": 9359
+ },
+ {
+ "epoch": 4.425531914893617,
+ "grad_norm": 3.031167507171631,
+ "learning_rate": 8.145409280026607e-07,
+ "loss": 0.2924,
+ "step": 9360
+ },
+ {
+ "epoch": 4.426004728132388,
+ "grad_norm": 3.0713601112365723,
+ "learning_rate": 8.140802444801835e-07,
+ "loss": 0.3221,
+ "step": 9361
+ },
+ {
+ "epoch": 4.426477541371159,
+ "grad_norm": 3.225785493850708,
+ "learning_rate": 8.136196659325374e-07,
+ "loss": 0.3626,
+ "step": 9362
+ },
+ {
+ "epoch": 4.426950354609929,
+ "grad_norm": 2.9779045581817627,
+ "learning_rate": 8.131591923883991e-07,
+ "loss": 0.3329,
+ "step": 9363
+ },
+ {
+ "epoch": 4.4274231678486995,
+ "grad_norm": 3.534536600112915,
+ "learning_rate": 8.126988238764422e-07,
+ "loss": 0.4029,
+ "step": 9364
+ },
+ {
+ "epoch": 4.42789598108747,
+ "grad_norm": 3.4237616062164307,
+ "learning_rate": 8.122385604253311e-07,
+ "loss": 0.3763,
+ "step": 9365
+ },
+ {
+ "epoch": 4.428368794326241,
+ "grad_norm": 2.8711681365966797,
+ "learning_rate": 8.117784020637231e-07,
+ "loss": 0.3141,
+ "step": 9366
+ },
+ {
+ "epoch": 4.428841607565012,
+ "grad_norm": 3.0277621746063232,
+ "learning_rate": 8.113183488202725e-07,
+ "loss": 0.3848,
+ "step": 9367
+ },
+ {
+ "epoch": 4.429314420803783,
+ "grad_norm": 3.1275761127471924,
+ "learning_rate": 8.108584007236226e-07,
+ "loss": 0.3343,
+ "step": 9368
+ },
+ {
+ "epoch": 4.4297872340425535,
+ "grad_norm": 3.2320117950439453,
+ "learning_rate": 8.103985578024143e-07,
+ "loss": 0.3211,
+ "step": 9369
+ },
+ {
+ "epoch": 4.430260047281324,
+ "grad_norm": 3.258829355239868,
+ "learning_rate": 8.099388200852792e-07,
+ "loss": 0.3473,
+ "step": 9370
+ },
+ {
+ "epoch": 4.430732860520094,
+ "grad_norm": 2.9609436988830566,
+ "learning_rate": 8.094791876008423e-07,
+ "loss": 0.318,
+ "step": 9371
+ },
+ {
+ "epoch": 4.431205673758865,
+ "grad_norm": 3.128053665161133,
+ "learning_rate": 8.090196603777245e-07,
+ "loss": 0.3372,
+ "step": 9372
+ },
+ {
+ "epoch": 4.431678486997636,
+ "grad_norm": 3.013979196548462,
+ "learning_rate": 8.085602384445368e-07,
+ "loss": 0.3098,
+ "step": 9373
+ },
+ {
+ "epoch": 4.432151300236407,
+ "grad_norm": 3.603433132171631,
+ "learning_rate": 8.081009218298871e-07,
+ "loss": 0.4016,
+ "step": 9374
+ },
+ {
+ "epoch": 4.432624113475177,
+ "grad_norm": 2.687730312347412,
+ "learning_rate": 8.076417105623743e-07,
+ "loss": 0.3173,
+ "step": 9375
+ },
+ {
+ "epoch": 4.433096926713948,
+ "grad_norm": 3.3575692176818848,
+ "learning_rate": 8.071826046705913e-07,
+ "loss": 0.3173,
+ "step": 9376
+ },
+ {
+ "epoch": 4.433569739952719,
+ "grad_norm": 3.3599679470062256,
+ "learning_rate": 8.06723604183125e-07,
+ "loss": 0.3466,
+ "step": 9377
+ },
+ {
+ "epoch": 4.43404255319149,
+ "grad_norm": 3.101696014404297,
+ "learning_rate": 8.062647091285542e-07,
+ "loss": 0.3322,
+ "step": 9378
+ },
+ {
+ "epoch": 4.43451536643026,
+ "grad_norm": 3.0010359287261963,
+ "learning_rate": 8.05805919535454e-07,
+ "loss": 0.3863,
+ "step": 9379
+ },
+ {
+ "epoch": 4.4349881796690305,
+ "grad_norm": 3.1428821086883545,
+ "learning_rate": 8.053472354323902e-07,
+ "loss": 0.3402,
+ "step": 9380
+ },
+ {
+ "epoch": 4.435460992907801,
+ "grad_norm": 3.416954517364502,
+ "learning_rate": 8.048886568479222e-07,
+ "loss": 0.3637,
+ "step": 9381
+ },
+ {
+ "epoch": 4.435933806146572,
+ "grad_norm": 3.015092611312866,
+ "learning_rate": 8.044301838106059e-07,
+ "loss": 0.3371,
+ "step": 9382
+ },
+ {
+ "epoch": 4.436406619385343,
+ "grad_norm": 2.6680097579956055,
+ "learning_rate": 8.039718163489862e-07,
+ "loss": 0.3148,
+ "step": 9383
+ },
+ {
+ "epoch": 4.436879432624114,
+ "grad_norm": 3.098219633102417,
+ "learning_rate": 8.035135544916056e-07,
+ "loss": 0.3348,
+ "step": 9384
+ },
+ {
+ "epoch": 4.4373522458628845,
+ "grad_norm": 3.5400390625,
+ "learning_rate": 8.030553982669969e-07,
+ "loss": 0.4211,
+ "step": 9385
+ },
+ {
+ "epoch": 4.437825059101655,
+ "grad_norm": 3.6074001789093018,
+ "learning_rate": 8.025973477036872e-07,
+ "loss": 0.3179,
+ "step": 9386
+ },
+ {
+ "epoch": 4.438297872340425,
+ "grad_norm": 3.111982583999634,
+ "learning_rate": 8.021394028301982e-07,
+ "loss": 0.3414,
+ "step": 9387
+ },
+ {
+ "epoch": 4.438770685579196,
+ "grad_norm": 3.5494184494018555,
+ "learning_rate": 8.016815636750439e-07,
+ "loss": 0.3847,
+ "step": 9388
+ },
+ {
+ "epoch": 4.439243498817967,
+ "grad_norm": 3.3602912425994873,
+ "learning_rate": 8.012238302667308e-07,
+ "loss": 0.3293,
+ "step": 9389
+ },
+ {
+ "epoch": 4.439716312056738,
+ "grad_norm": 3.1263039112091064,
+ "learning_rate": 8.007662026337617e-07,
+ "loss": 0.3675,
+ "step": 9390
+ },
+ {
+ "epoch": 4.440189125295508,
+ "grad_norm": 3.239863157272339,
+ "learning_rate": 8.003086808046304e-07,
+ "loss": 0.3445,
+ "step": 9391
+ },
+ {
+ "epoch": 4.440661938534279,
+ "grad_norm": 2.8646275997161865,
+ "learning_rate": 7.998512648078244e-07,
+ "loss": 0.2715,
+ "step": 9392
+ },
+ {
+ "epoch": 4.44113475177305,
+ "grad_norm": 2.9777262210845947,
+ "learning_rate": 7.993939546718255e-07,
+ "loss": 0.3143,
+ "step": 9393
+ },
+ {
+ "epoch": 4.441607565011821,
+ "grad_norm": 3.5436604022979736,
+ "learning_rate": 7.98936750425107e-07,
+ "loss": 0.406,
+ "step": 9394
+ },
+ {
+ "epoch": 4.442080378250591,
+ "grad_norm": 3.1395277976989746,
+ "learning_rate": 7.984796520961391e-07,
+ "loss": 0.3893,
+ "step": 9395
+ },
+ {
+ "epoch": 4.4425531914893615,
+ "grad_norm": 3.6379475593566895,
+ "learning_rate": 7.980226597133814e-07,
+ "loss": 0.3878,
+ "step": 9396
+ },
+ {
+ "epoch": 4.443026004728132,
+ "grad_norm": 3.3866498470306396,
+ "learning_rate": 7.975657733052908e-07,
+ "loss": 0.3399,
+ "step": 9397
+ },
+ {
+ "epoch": 4.443498817966903,
+ "grad_norm": 2.9472098350524902,
+ "learning_rate": 7.971089929003142e-07,
+ "loss": 0.3436,
+ "step": 9398
+ },
+ {
+ "epoch": 4.443971631205674,
+ "grad_norm": 3.314652442932129,
+ "learning_rate": 7.966523185268929e-07,
+ "loss": 0.305,
+ "step": 9399
+ },
+ {
+ "epoch": 4.444444444444445,
+ "grad_norm": 3.8230092525482178,
+ "learning_rate": 7.961957502134638e-07,
+ "loss": 0.3632,
+ "step": 9400
+ },
+ {
+ "epoch": 4.444917257683215,
+ "grad_norm": 3.088292360305786,
+ "learning_rate": 7.957392879884534e-07,
+ "loss": 0.3373,
+ "step": 9401
+ },
+ {
+ "epoch": 4.445390070921986,
+ "grad_norm": 3.1412665843963623,
+ "learning_rate": 7.952829318802854e-07,
+ "loss": 0.3703,
+ "step": 9402
+ },
+ {
+ "epoch": 4.445862884160756,
+ "grad_norm": 3.464963674545288,
+ "learning_rate": 7.948266819173745e-07,
+ "loss": 0.3485,
+ "step": 9403
+ },
+ {
+ "epoch": 4.446335697399527,
+ "grad_norm": 3.2092626094818115,
+ "learning_rate": 7.943705381281281e-07,
+ "loss": 0.341,
+ "step": 9404
+ },
+ {
+ "epoch": 4.446808510638298,
+ "grad_norm": 2.5458641052246094,
+ "learning_rate": 7.939145005409502e-07,
+ "loss": 0.3059,
+ "step": 9405
+ },
+ {
+ "epoch": 4.4472813238770685,
+ "grad_norm": 4.1484150886535645,
+ "learning_rate": 7.934585691842353e-07,
+ "loss": 0.3405,
+ "step": 9406
+ },
+ {
+ "epoch": 4.447754137115839,
+ "grad_norm": 2.9794130325317383,
+ "learning_rate": 7.930027440863716e-07,
+ "loss": 0.359,
+ "step": 9407
+ },
+ {
+ "epoch": 4.44822695035461,
+ "grad_norm": 2.951674222946167,
+ "learning_rate": 7.92547025275743e-07,
+ "loss": 0.3087,
+ "step": 9408
+ },
+ {
+ "epoch": 4.448699763593381,
+ "grad_norm": 3.2857377529144287,
+ "learning_rate": 7.920914127807241e-07,
+ "loss": 0.3327,
+ "step": 9409
+ },
+ {
+ "epoch": 4.449172576832151,
+ "grad_norm": 3.4455840587615967,
+ "learning_rate": 7.916359066296839e-07,
+ "loss": 0.3731,
+ "step": 9410
+ },
+ {
+ "epoch": 4.449645390070922,
+ "grad_norm": 3.392779588699341,
+ "learning_rate": 7.911805068509848e-07,
+ "loss": 0.3559,
+ "step": 9411
+ },
+ {
+ "epoch": 4.450118203309692,
+ "grad_norm": 3.3362300395965576,
+ "learning_rate": 7.90725213472982e-07,
+ "loss": 0.333,
+ "step": 9412
+ },
+ {
+ "epoch": 4.450591016548463,
+ "grad_norm": 3.0671608448028564,
+ "learning_rate": 7.902700265240259e-07,
+ "loss": 0.3342,
+ "step": 9413
+ },
+ {
+ "epoch": 4.451063829787234,
+ "grad_norm": 2.8350744247436523,
+ "learning_rate": 7.898149460324575e-07,
+ "loss": 0.3157,
+ "step": 9414
+ },
+ {
+ "epoch": 4.451536643026005,
+ "grad_norm": 2.932446241378784,
+ "learning_rate": 7.893599720266143e-07,
+ "loss": 0.379,
+ "step": 9415
+ },
+ {
+ "epoch": 4.452009456264776,
+ "grad_norm": 3.354112386703491,
+ "learning_rate": 7.889051045348245e-07,
+ "loss": 0.3001,
+ "step": 9416
+ },
+ {
+ "epoch": 4.452482269503546,
+ "grad_norm": 3.068276882171631,
+ "learning_rate": 7.884503435854104e-07,
+ "loss": 0.3466,
+ "step": 9417
+ },
+ {
+ "epoch": 4.452955082742317,
+ "grad_norm": 2.833534002304077,
+ "learning_rate": 7.879956892066892e-07,
+ "loss": 0.3278,
+ "step": 9418
+ },
+ {
+ "epoch": 4.453427895981087,
+ "grad_norm": 2.9622433185577393,
+ "learning_rate": 7.875411414269687e-07,
+ "loss": 0.3725,
+ "step": 9419
+ },
+ {
+ "epoch": 4.453900709219858,
+ "grad_norm": 3.2055954933166504,
+ "learning_rate": 7.870867002745533e-07,
+ "loss": 0.3215,
+ "step": 9420
+ },
+ {
+ "epoch": 4.454373522458629,
+ "grad_norm": 2.877063274383545,
+ "learning_rate": 7.86632365777738e-07,
+ "loss": 0.2845,
+ "step": 9421
+ },
+ {
+ "epoch": 4.4548463356973995,
+ "grad_norm": 3.2809367179870605,
+ "learning_rate": 7.861781379648117e-07,
+ "loss": 0.39,
+ "step": 9422
+ },
+ {
+ "epoch": 4.45531914893617,
+ "grad_norm": 3.404816150665283,
+ "learning_rate": 7.857240168640587e-07,
+ "loss": 0.3003,
+ "step": 9423
+ },
+ {
+ "epoch": 4.455791962174941,
+ "grad_norm": 3.367253303527832,
+ "learning_rate": 7.85270002503754e-07,
+ "loss": 0.3414,
+ "step": 9424
+ },
+ {
+ "epoch": 4.456264775413712,
+ "grad_norm": 3.1247670650482178,
+ "learning_rate": 7.848160949121678e-07,
+ "loss": 0.2922,
+ "step": 9425
+ },
+ {
+ "epoch": 4.456737588652482,
+ "grad_norm": 3.474435806274414,
+ "learning_rate": 7.843622941175624e-07,
+ "loss": 0.3601,
+ "step": 9426
+ },
+ {
+ "epoch": 4.457210401891253,
+ "grad_norm": 3.0552384853363037,
+ "learning_rate": 7.839086001481933e-07,
+ "loss": 0.3905,
+ "step": 9427
+ },
+ {
+ "epoch": 4.457683215130023,
+ "grad_norm": 3.3532586097717285,
+ "learning_rate": 7.834550130323115e-07,
+ "loss": 0.3783,
+ "step": 9428
+ },
+ {
+ "epoch": 4.458156028368794,
+ "grad_norm": 3.7321903705596924,
+ "learning_rate": 7.830015327981585e-07,
+ "loss": 0.3765,
+ "step": 9429
+ },
+ {
+ "epoch": 4.458628841607565,
+ "grad_norm": 3.070158004760742,
+ "learning_rate": 7.82548159473972e-07,
+ "loss": 0.3279,
+ "step": 9430
+ },
+ {
+ "epoch": 4.459101654846336,
+ "grad_norm": 3.498399257659912,
+ "learning_rate": 7.820948930879807e-07,
+ "loss": 0.3864,
+ "step": 9431
+ },
+ {
+ "epoch": 4.459574468085107,
+ "grad_norm": 3.0352776050567627,
+ "learning_rate": 7.816417336684071e-07,
+ "loss": 0.2963,
+ "step": 9432
+ },
+ {
+ "epoch": 4.460047281323877,
+ "grad_norm": 3.190154790878296,
+ "learning_rate": 7.811886812434686e-07,
+ "loss": 0.354,
+ "step": 9433
+ },
+ {
+ "epoch": 4.460520094562648,
+ "grad_norm": 3.1933085918426514,
+ "learning_rate": 7.807357358413742e-07,
+ "loss": 0.3613,
+ "step": 9434
+ },
+ {
+ "epoch": 4.460992907801418,
+ "grad_norm": 4.0385637283325195,
+ "learning_rate": 7.80282897490326e-07,
+ "loss": 0.3257,
+ "step": 9435
+ },
+ {
+ "epoch": 4.461465721040189,
+ "grad_norm": 3.365485191345215,
+ "learning_rate": 7.798301662185218e-07,
+ "loss": 0.3093,
+ "step": 9436
+ },
+ {
+ "epoch": 4.46193853427896,
+ "grad_norm": 3.5345213413238525,
+ "learning_rate": 7.793775420541497e-07,
+ "loss": 0.3262,
+ "step": 9437
+ },
+ {
+ "epoch": 4.4624113475177305,
+ "grad_norm": 3.2894418239593506,
+ "learning_rate": 7.789250250253941e-07,
+ "loss": 0.3417,
+ "step": 9438
+ },
+ {
+ "epoch": 4.462884160756501,
+ "grad_norm": 2.972001791000366,
+ "learning_rate": 7.784726151604305e-07,
+ "loss": 0.3396,
+ "step": 9439
+ },
+ {
+ "epoch": 4.463356973995272,
+ "grad_norm": 3.161794424057007,
+ "learning_rate": 7.780203124874283e-07,
+ "loss": 0.3583,
+ "step": 9440
+ },
+ {
+ "epoch": 4.463829787234043,
+ "grad_norm": 3.0976521968841553,
+ "learning_rate": 7.775681170345508e-07,
+ "loss": 0.3743,
+ "step": 9441
+ },
+ {
+ "epoch": 4.464302600472813,
+ "grad_norm": 3.1454756259918213,
+ "learning_rate": 7.771160288299534e-07,
+ "loss": 0.3483,
+ "step": 9442
+ },
+ {
+ "epoch": 4.464775413711584,
+ "grad_norm": 3.467618942260742,
+ "learning_rate": 7.766640479017868e-07,
+ "loss": 0.3253,
+ "step": 9443
+ },
+ {
+ "epoch": 4.465248226950354,
+ "grad_norm": 3.3349552154541016,
+ "learning_rate": 7.762121742781933e-07,
+ "loss": 0.3579,
+ "step": 9444
+ },
+ {
+ "epoch": 4.465721040189125,
+ "grad_norm": 3.442701578140259,
+ "learning_rate": 7.757604079873085e-07,
+ "loss": 0.3854,
+ "step": 9445
+ },
+ {
+ "epoch": 4.466193853427896,
+ "grad_norm": 3.3095569610595703,
+ "learning_rate": 7.753087490572633e-07,
+ "loss": 0.3385,
+ "step": 9446
+ },
+ {
+ "epoch": 4.466666666666667,
+ "grad_norm": 3.0978634357452393,
+ "learning_rate": 7.748571975161786e-07,
+ "loss": 0.3511,
+ "step": 9447
+ },
+ {
+ "epoch": 4.467139479905438,
+ "grad_norm": 2.9801225662231445,
+ "learning_rate": 7.744057533921731e-07,
+ "loss": 0.3239,
+ "step": 9448
+ },
+ {
+ "epoch": 4.467612293144208,
+ "grad_norm": 3.116586923599243,
+ "learning_rate": 7.739544167133545e-07,
+ "loss": 0.3786,
+ "step": 9449
+ },
+ {
+ "epoch": 4.468085106382979,
+ "grad_norm": 3.2235381603240967,
+ "learning_rate": 7.73503187507825e-07,
+ "loss": 0.3285,
+ "step": 9450
+ },
+ {
+ "epoch": 4.468557919621749,
+ "grad_norm": 3.175649404525757,
+ "learning_rate": 7.730520658036825e-07,
+ "loss": 0.315,
+ "step": 9451
+ },
+ {
+ "epoch": 4.46903073286052,
+ "grad_norm": 3.013848066329956,
+ "learning_rate": 7.726010516290144e-07,
+ "loss": 0.3533,
+ "step": 9452
+ },
+ {
+ "epoch": 4.469503546099291,
+ "grad_norm": 2.87581467628479,
+ "learning_rate": 7.721501450119057e-07,
+ "loss": 0.2948,
+ "step": 9453
+ },
+ {
+ "epoch": 4.4699763593380615,
+ "grad_norm": 3.504119873046875,
+ "learning_rate": 7.716993459804306e-07,
+ "loss": 0.354,
+ "step": 9454
+ },
+ {
+ "epoch": 4.470449172576832,
+ "grad_norm": 3.2914042472839355,
+ "learning_rate": 7.712486545626591e-07,
+ "loss": 0.3724,
+ "step": 9455
+ },
+ {
+ "epoch": 4.470921985815603,
+ "grad_norm": 3.007551908493042,
+ "learning_rate": 7.707980707866533e-07,
+ "loss": 0.3923,
+ "step": 9456
+ },
+ {
+ "epoch": 4.471394799054374,
+ "grad_norm": 3.2758076190948486,
+ "learning_rate": 7.703475946804687e-07,
+ "loss": 0.4092,
+ "step": 9457
+ },
+ {
+ "epoch": 4.471867612293144,
+ "grad_norm": 3.265875816345215,
+ "learning_rate": 7.698972262721557e-07,
+ "loss": 0.4249,
+ "step": 9458
+ },
+ {
+ "epoch": 4.472340425531915,
+ "grad_norm": 3.0962677001953125,
+ "learning_rate": 7.694469655897565e-07,
+ "loss": 0.3273,
+ "step": 9459
+ },
+ {
+ "epoch": 4.472813238770685,
+ "grad_norm": 3.2247416973114014,
+ "learning_rate": 7.689968126613053e-07,
+ "loss": 0.366,
+ "step": 9460
+ },
+ {
+ "epoch": 4.473286052009456,
+ "grad_norm": 3.326211929321289,
+ "learning_rate": 7.685467675148334e-07,
+ "loss": 0.4044,
+ "step": 9461
+ },
+ {
+ "epoch": 4.473758865248227,
+ "grad_norm": 2.9795444011688232,
+ "learning_rate": 7.68096830178362e-07,
+ "loss": 0.3763,
+ "step": 9462
+ },
+ {
+ "epoch": 4.474231678486998,
+ "grad_norm": 3.0721724033355713,
+ "learning_rate": 7.676470006799061e-07,
+ "loss": 0.3586,
+ "step": 9463
+ },
+ {
+ "epoch": 4.474704491725769,
+ "grad_norm": 3.1191349029541016,
+ "learning_rate": 7.67197279047476e-07,
+ "loss": 0.3111,
+ "step": 9464
+ },
+ {
+ "epoch": 4.475177304964539,
+ "grad_norm": 3.2980053424835205,
+ "learning_rate": 7.667476653090727e-07,
+ "loss": 0.3413,
+ "step": 9465
+ },
+ {
+ "epoch": 4.47565011820331,
+ "grad_norm": 3.159794807434082,
+ "learning_rate": 7.662981594926927e-07,
+ "loss": 0.3559,
+ "step": 9466
+ },
+ {
+ "epoch": 4.47612293144208,
+ "grad_norm": 2.9250876903533936,
+ "learning_rate": 7.658487616263244e-07,
+ "loss": 0.3582,
+ "step": 9467
+ },
+ {
+ "epoch": 4.476595744680851,
+ "grad_norm": 2.915234088897705,
+ "learning_rate": 7.65399471737949e-07,
+ "loss": 0.3466,
+ "step": 9468
+ },
+ {
+ "epoch": 4.477068557919622,
+ "grad_norm": 3.2557425498962402,
+ "learning_rate": 7.649502898555431e-07,
+ "loss": 0.3772,
+ "step": 9469
+ },
+ {
+ "epoch": 4.4775413711583925,
+ "grad_norm": 3.1906673908233643,
+ "learning_rate": 7.645012160070748e-07,
+ "loss": 0.3379,
+ "step": 9470
+ },
+ {
+ "epoch": 4.478014184397163,
+ "grad_norm": 3.1513144969940186,
+ "learning_rate": 7.640522502205056e-07,
+ "loss": 0.3529,
+ "step": 9471
+ },
+ {
+ "epoch": 4.478486997635934,
+ "grad_norm": 3.1969199180603027,
+ "learning_rate": 7.636033925237904e-07,
+ "loss": 0.363,
+ "step": 9472
+ },
+ {
+ "epoch": 4.478959810874705,
+ "grad_norm": 3.4546799659729004,
+ "learning_rate": 7.631546429448785e-07,
+ "loss": 0.2944,
+ "step": 9473
+ },
+ {
+ "epoch": 4.479432624113475,
+ "grad_norm": 3.102057456970215,
+ "learning_rate": 7.627060015117116e-07,
+ "loss": 0.3309,
+ "step": 9474
+ },
+ {
+ "epoch": 4.479905437352246,
+ "grad_norm": 3.125751495361328,
+ "learning_rate": 7.622574682522232e-07,
+ "loss": 0.3576,
+ "step": 9475
+ },
+ {
+ "epoch": 4.480378250591016,
+ "grad_norm": 3.071798324584961,
+ "learning_rate": 7.618090431943432e-07,
+ "loss": 0.3298,
+ "step": 9476
+ },
+ {
+ "epoch": 4.480851063829787,
+ "grad_norm": 3.0060672760009766,
+ "learning_rate": 7.613607263659922e-07,
+ "loss": 0.3528,
+ "step": 9477
+ },
+ {
+ "epoch": 4.481323877068558,
+ "grad_norm": 3.254667043685913,
+ "learning_rate": 7.609125177950846e-07,
+ "loss": 0.3701,
+ "step": 9478
+ },
+ {
+ "epoch": 4.481796690307329,
+ "grad_norm": 3.282247304916382,
+ "learning_rate": 7.604644175095293e-07,
+ "loss": 0.4025,
+ "step": 9479
+ },
+ {
+ "epoch": 4.4822695035460995,
+ "grad_norm": 3.231097936630249,
+ "learning_rate": 7.600164255372266e-07,
+ "loss": 0.3395,
+ "step": 9480
+ },
+ {
+ "epoch": 4.48274231678487,
+ "grad_norm": 3.08368182182312,
+ "learning_rate": 7.595685419060722e-07,
+ "loss": 0.3356,
+ "step": 9481
+ },
+ {
+ "epoch": 4.48321513002364,
+ "grad_norm": 3.0406503677368164,
+ "learning_rate": 7.591207666439532e-07,
+ "loss": 0.2851,
+ "step": 9482
+ },
+ {
+ "epoch": 4.483687943262411,
+ "grad_norm": 3.021157741546631,
+ "learning_rate": 7.586730997787495e-07,
+ "loss": 0.3691,
+ "step": 9483
+ },
+ {
+ "epoch": 4.484160756501182,
+ "grad_norm": 2.8793535232543945,
+ "learning_rate": 7.582255413383375e-07,
+ "loss": 0.332,
+ "step": 9484
+ },
+ {
+ "epoch": 4.484633569739953,
+ "grad_norm": 3.319021224975586,
+ "learning_rate": 7.577780913505833e-07,
+ "loss": 0.3489,
+ "step": 9485
+ },
+ {
+ "epoch": 4.485106382978723,
+ "grad_norm": 3.0267672538757324,
+ "learning_rate": 7.573307498433472e-07,
+ "loss": 0.2989,
+ "step": 9486
+ },
+ {
+ "epoch": 4.485579196217494,
+ "grad_norm": 2.8953561782836914,
+ "learning_rate": 7.568835168444849e-07,
+ "loss": 0.306,
+ "step": 9487
+ },
+ {
+ "epoch": 4.486052009456265,
+ "grad_norm": 3.0559732913970947,
+ "learning_rate": 7.564363923818424e-07,
+ "loss": 0.3122,
+ "step": 9488
+ },
+ {
+ "epoch": 4.486524822695036,
+ "grad_norm": 3.369352340698242,
+ "learning_rate": 7.559893764832607e-07,
+ "loss": 0.3999,
+ "step": 9489
+ },
+ {
+ "epoch": 4.486997635933806,
+ "grad_norm": 3.3339598178863525,
+ "learning_rate": 7.555424691765731e-07,
+ "loss": 0.3566,
+ "step": 9490
+ },
+ {
+ "epoch": 4.4874704491725765,
+ "grad_norm": 3.6563758850097656,
+ "learning_rate": 7.550956704896062e-07,
+ "loss": 0.3686,
+ "step": 9491
+ },
+ {
+ "epoch": 4.487943262411347,
+ "grad_norm": 3.492706537246704,
+ "learning_rate": 7.546489804501811e-07,
+ "loss": 0.3382,
+ "step": 9492
+ },
+ {
+ "epoch": 4.488416075650118,
+ "grad_norm": 3.1645941734313965,
+ "learning_rate": 7.542023990861106e-07,
+ "loss": 0.3583,
+ "step": 9493
+ },
+ {
+ "epoch": 4.488888888888889,
+ "grad_norm": 2.9827258586883545,
+ "learning_rate": 7.537559264252021e-07,
+ "loss": 0.3243,
+ "step": 9494
+ },
+ {
+ "epoch": 4.48936170212766,
+ "grad_norm": 3.2876698970794678,
+ "learning_rate": 7.533095624952547e-07,
+ "loss": 0.373,
+ "step": 9495
+ },
+ {
+ "epoch": 4.4898345153664305,
+ "grad_norm": 3.110868453979492,
+ "learning_rate": 7.528633073240616e-07,
+ "loss": 0.351,
+ "step": 9496
+ },
+ {
+ "epoch": 4.490307328605201,
+ "grad_norm": 3.3962604999542236,
+ "learning_rate": 7.524171609394099e-07,
+ "loss": 0.3686,
+ "step": 9497
+ },
+ {
+ "epoch": 4.490780141843971,
+ "grad_norm": 3.272610902786255,
+ "learning_rate": 7.519711233690777e-07,
+ "loss": 0.3667,
+ "step": 9498
+ },
+ {
+ "epoch": 4.491252955082742,
+ "grad_norm": 3.0907654762268066,
+ "learning_rate": 7.515251946408398e-07,
+ "loss": 0.3247,
+ "step": 9499
+ },
+ {
+ "epoch": 4.491725768321513,
+ "grad_norm": 4.225870609283447,
+ "learning_rate": 7.510793747824613e-07,
+ "loss": 0.3636,
+ "step": 9500
+ },
+ {
+ "epoch": 4.492198581560284,
+ "grad_norm": 3.1911606788635254,
+ "learning_rate": 7.506336638217004e-07,
+ "loss": 0.3661,
+ "step": 9501
+ },
+ {
+ "epoch": 4.492671394799054,
+ "grad_norm": 2.907573699951172,
+ "learning_rate": 7.501880617863114e-07,
+ "loss": 0.3285,
+ "step": 9502
+ },
+ {
+ "epoch": 4.493144208037825,
+ "grad_norm": 3.388460397720337,
+ "learning_rate": 7.497425687040388e-07,
+ "loss": 0.3354,
+ "step": 9503
+ },
+ {
+ "epoch": 4.493617021276596,
+ "grad_norm": 2.7236225605010986,
+ "learning_rate": 7.49297184602622e-07,
+ "loss": 0.3389,
+ "step": 9504
+ },
+ {
+ "epoch": 4.494089834515367,
+ "grad_norm": 3.1962947845458984,
+ "learning_rate": 7.488519095097929e-07,
+ "loss": 0.3377,
+ "step": 9505
+ },
+ {
+ "epoch": 4.494562647754137,
+ "grad_norm": 2.936845541000366,
+ "learning_rate": 7.484067434532763e-07,
+ "loss": 0.2893,
+ "step": 9506
+ },
+ {
+ "epoch": 4.4950354609929075,
+ "grad_norm": 2.8567588329315186,
+ "learning_rate": 7.47961686460792e-07,
+ "loss": 0.3026,
+ "step": 9507
+ },
+ {
+ "epoch": 4.495508274231678,
+ "grad_norm": 3.3522651195526123,
+ "learning_rate": 7.475167385600507e-07,
+ "loss": 0.3517,
+ "step": 9508
+ },
+ {
+ "epoch": 4.495981087470449,
+ "grad_norm": 3.338757276535034,
+ "learning_rate": 7.470718997787572e-07,
+ "loss": 0.3224,
+ "step": 9509
+ },
+ {
+ "epoch": 4.49645390070922,
+ "grad_norm": 3.1484947204589844,
+ "learning_rate": 7.466271701446107e-07,
+ "loss": 0.3872,
+ "step": 9510
+ },
+ {
+ "epoch": 4.496926713947991,
+ "grad_norm": 3.3275411128997803,
+ "learning_rate": 7.461825496853012e-07,
+ "loss": 0.3287,
+ "step": 9511
+ },
+ {
+ "epoch": 4.4973995271867615,
+ "grad_norm": 3.101416826248169,
+ "learning_rate": 7.457380384285151e-07,
+ "loss": 0.3223,
+ "step": 9512
+ },
+ {
+ "epoch": 4.497872340425532,
+ "grad_norm": 2.761810779571533,
+ "learning_rate": 7.45293636401929e-07,
+ "loss": 0.3264,
+ "step": 9513
+ },
+ {
+ "epoch": 4.498345153664302,
+ "grad_norm": 3.215078592300415,
+ "learning_rate": 7.448493436332132e-07,
+ "loss": 0.3548,
+ "step": 9514
+ },
+ {
+ "epoch": 4.498817966903073,
+ "grad_norm": 3.00111722946167,
+ "learning_rate": 7.444051601500335e-07,
+ "loss": 0.3271,
+ "step": 9515
+ },
+ {
+ "epoch": 4.499290780141844,
+ "grad_norm": 3.2428977489471436,
+ "learning_rate": 7.439610859800456e-07,
+ "loss": 0.3024,
+ "step": 9516
+ },
+ {
+ "epoch": 4.499763593380615,
+ "grad_norm": 2.7977585792541504,
+ "learning_rate": 7.435171211509018e-07,
+ "loss": 0.334,
+ "step": 9517
+ },
+ {
+ "epoch": 4.500236406619385,
+ "grad_norm": 3.273468494415283,
+ "learning_rate": 7.430732656902447e-07,
+ "loss": 0.337,
+ "step": 9518
+ },
+ {
+ "epoch": 4.500709219858156,
+ "grad_norm": 4.663364410400391,
+ "learning_rate": 7.426295196257116e-07,
+ "loss": 0.3492,
+ "step": 9519
+ },
+ {
+ "epoch": 4.501182033096927,
+ "grad_norm": 3.1147210597991943,
+ "learning_rate": 7.421858829849327e-07,
+ "loss": 0.3273,
+ "step": 9520
+ },
+ {
+ "epoch": 4.501654846335697,
+ "grad_norm": 3.1411445140838623,
+ "learning_rate": 7.4174235579553e-07,
+ "loss": 0.3413,
+ "step": 9521
+ },
+ {
+ "epoch": 4.502127659574468,
+ "grad_norm": 3.0361053943634033,
+ "learning_rate": 7.412989380851218e-07,
+ "loss": 0.2908,
+ "step": 9522
+ },
+ {
+ "epoch": 4.5026004728132385,
+ "grad_norm": 3.4923086166381836,
+ "learning_rate": 7.408556298813172e-07,
+ "loss": 0.3659,
+ "step": 9523
+ },
+ {
+ "epoch": 4.503073286052009,
+ "grad_norm": 3.6827056407928467,
+ "learning_rate": 7.40412431211718e-07,
+ "loss": 0.3485,
+ "step": 9524
+ },
+ {
+ "epoch": 4.50354609929078,
+ "grad_norm": 3.257322311401367,
+ "learning_rate": 7.399693421039219e-07,
+ "loss": 0.3592,
+ "step": 9525
+ },
+ {
+ "epoch": 4.504018912529551,
+ "grad_norm": 3.515291929244995,
+ "learning_rate": 7.395263625855167e-07,
+ "loss": 0.3662,
+ "step": 9526
+ },
+ {
+ "epoch": 4.504491725768322,
+ "grad_norm": 2.899764060974121,
+ "learning_rate": 7.390834926840865e-07,
+ "loss": 0.3564,
+ "step": 9527
+ },
+ {
+ "epoch": 4.5049645390070925,
+ "grad_norm": 3.2578322887420654,
+ "learning_rate": 7.386407324272055e-07,
+ "loss": 0.3074,
+ "step": 9528
+ },
+ {
+ "epoch": 4.505437352245863,
+ "grad_norm": 3.3826515674591064,
+ "learning_rate": 7.381980818424419e-07,
+ "loss": 0.3669,
+ "step": 9529
+ },
+ {
+ "epoch": 4.505910165484633,
+ "grad_norm": 3.470733404159546,
+ "learning_rate": 7.377555409573594e-07,
+ "loss": 0.3905,
+ "step": 9530
+ },
+ {
+ "epoch": 4.506382978723404,
+ "grad_norm": 3.228917121887207,
+ "learning_rate": 7.373131097995123e-07,
+ "loss": 0.3336,
+ "step": 9531
+ },
+ {
+ "epoch": 4.506855791962175,
+ "grad_norm": 3.2193191051483154,
+ "learning_rate": 7.368707883964476e-07,
+ "loss": 0.3285,
+ "step": 9532
+ },
+ {
+ "epoch": 4.507328605200946,
+ "grad_norm": 3.19169020652771,
+ "learning_rate": 7.36428576775709e-07,
+ "loss": 0.4022,
+ "step": 9533
+ },
+ {
+ "epoch": 4.507801418439716,
+ "grad_norm": 3.1887755393981934,
+ "learning_rate": 7.359864749648296e-07,
+ "loss": 0.3749,
+ "step": 9534
+ },
+ {
+ "epoch": 4.508274231678487,
+ "grad_norm": 3.574314832687378,
+ "learning_rate": 7.355444829913375e-07,
+ "loss": 0.3549,
+ "step": 9535
+ },
+ {
+ "epoch": 4.508747044917258,
+ "grad_norm": 3.3482754230499268,
+ "learning_rate": 7.351026008827527e-07,
+ "loss": 0.311,
+ "step": 9536
+ },
+ {
+ "epoch": 4.509219858156028,
+ "grad_norm": 3.5075576305389404,
+ "learning_rate": 7.34660828666591e-07,
+ "loss": 0.3575,
+ "step": 9537
+ },
+ {
+ "epoch": 4.509692671394799,
+ "grad_norm": 3.233328104019165,
+ "learning_rate": 7.342191663703588e-07,
+ "loss": 0.3087,
+ "step": 9538
+ },
+ {
+ "epoch": 4.5101654846335695,
+ "grad_norm": 3.3704137802124023,
+ "learning_rate": 7.337776140215555e-07,
+ "loss": 0.356,
+ "step": 9539
+ },
+ {
+ "epoch": 4.51063829787234,
+ "grad_norm": 4.084654331207275,
+ "learning_rate": 7.333361716476761e-07,
+ "loss": 0.3382,
+ "step": 9540
+ },
+ {
+ "epoch": 4.511111111111111,
+ "grad_norm": 2.985344886779785,
+ "learning_rate": 7.32894839276207e-07,
+ "loss": 0.3476,
+ "step": 9541
+ },
+ {
+ "epoch": 4.511583924349882,
+ "grad_norm": 3.405877113342285,
+ "learning_rate": 7.324536169346269e-07,
+ "loss": 0.3402,
+ "step": 9542
+ },
+ {
+ "epoch": 4.512056737588653,
+ "grad_norm": 3.0168516635894775,
+ "learning_rate": 7.320125046504103e-07,
+ "loss": 0.2936,
+ "step": 9543
+ },
+ {
+ "epoch": 4.5125295508274235,
+ "grad_norm": 2.9846513271331787,
+ "learning_rate": 7.315715024510219e-07,
+ "loss": 0.3165,
+ "step": 9544
+ },
+ {
+ "epoch": 4.513002364066194,
+ "grad_norm": 3.1661694049835205,
+ "learning_rate": 7.311306103639224e-07,
+ "loss": 0.3372,
+ "step": 9545
+ },
+ {
+ "epoch": 4.513475177304964,
+ "grad_norm": 3.3390371799468994,
+ "learning_rate": 7.306898284165637e-07,
+ "loss": 0.3427,
+ "step": 9546
+ },
+ {
+ "epoch": 4.513947990543735,
+ "grad_norm": 3.1360137462615967,
+ "learning_rate": 7.302491566363904e-07,
+ "loss": 0.3786,
+ "step": 9547
+ },
+ {
+ "epoch": 4.514420803782506,
+ "grad_norm": 3.3442773818969727,
+ "learning_rate": 7.298085950508427e-07,
+ "loss": 0.3516,
+ "step": 9548
+ },
+ {
+ "epoch": 4.514893617021277,
+ "grad_norm": 3.1403257846832275,
+ "learning_rate": 7.293681436873518e-07,
+ "loss": 0.32,
+ "step": 9549
+ },
+ {
+ "epoch": 4.515366430260047,
+ "grad_norm": 5.084080696105957,
+ "learning_rate": 7.289278025733417e-07,
+ "loss": 0.3036,
+ "step": 9550
+ },
+ {
+ "epoch": 4.515839243498818,
+ "grad_norm": 3.055558919906616,
+ "learning_rate": 7.284875717362322e-07,
+ "loss": 0.3625,
+ "step": 9551
+ },
+ {
+ "epoch": 4.516312056737589,
+ "grad_norm": 3.3781931400299072,
+ "learning_rate": 7.280474512034338e-07,
+ "loss": 0.3759,
+ "step": 9552
+ },
+ {
+ "epoch": 4.516784869976359,
+ "grad_norm": 3.5266852378845215,
+ "learning_rate": 7.27607441002351e-07,
+ "loss": 0.4057,
+ "step": 9553
+ },
+ {
+ "epoch": 4.51725768321513,
+ "grad_norm": 3.290174961090088,
+ "learning_rate": 7.271675411603802e-07,
+ "loss": 0.3471,
+ "step": 9554
+ },
+ {
+ "epoch": 4.5177304964539005,
+ "grad_norm": 3.399919033050537,
+ "learning_rate": 7.267277517049137e-07,
+ "loss": 0.394,
+ "step": 9555
+ },
+ {
+ "epoch": 4.518203309692671,
+ "grad_norm": 3.4410784244537354,
+ "learning_rate": 7.262880726633348e-07,
+ "loss": 0.4351,
+ "step": 9556
+ },
+ {
+ "epoch": 4.518676122931442,
+ "grad_norm": 2.7317543029785156,
+ "learning_rate": 7.258485040630192e-07,
+ "loss": 0.3039,
+ "step": 9557
+ },
+ {
+ "epoch": 4.519148936170213,
+ "grad_norm": 3.769446849822998,
+ "learning_rate": 7.254090459313384e-07,
+ "loss": 0.4061,
+ "step": 9558
+ },
+ {
+ "epoch": 4.519621749408984,
+ "grad_norm": 3.5625245571136475,
+ "learning_rate": 7.249696982956553e-07,
+ "loss": 0.4148,
+ "step": 9559
+ },
+ {
+ "epoch": 4.520094562647754,
+ "grad_norm": 3.1534764766693115,
+ "learning_rate": 7.245304611833248e-07,
+ "loss": 0.3369,
+ "step": 9560
+ },
+ {
+ "epoch": 4.520567375886525,
+ "grad_norm": 3.179197311401367,
+ "learning_rate": 7.240913346216982e-07,
+ "loss": 0.2798,
+ "step": 9561
+ },
+ {
+ "epoch": 4.521040189125295,
+ "grad_norm": 3.2608659267425537,
+ "learning_rate": 7.236523186381162e-07,
+ "loss": 0.3513,
+ "step": 9562
+ },
+ {
+ "epoch": 4.521513002364066,
+ "grad_norm": 3.055513381958008,
+ "learning_rate": 7.232134132599158e-07,
+ "loss": 0.3098,
+ "step": 9563
+ },
+ {
+ "epoch": 4.521985815602837,
+ "grad_norm": 3.159937620162964,
+ "learning_rate": 7.227746185144258e-07,
+ "loss": 0.3234,
+ "step": 9564
+ },
+ {
+ "epoch": 4.5224586288416075,
+ "grad_norm": 3.176802635192871,
+ "learning_rate": 7.22335934428966e-07,
+ "loss": 0.3547,
+ "step": 9565
+ },
+ {
+ "epoch": 4.522931442080378,
+ "grad_norm": 3.476203680038452,
+ "learning_rate": 7.218973610308538e-07,
+ "loss": 0.3659,
+ "step": 9566
+ },
+ {
+ "epoch": 4.523404255319149,
+ "grad_norm": 3.277595043182373,
+ "learning_rate": 7.214588983473964e-07,
+ "loss": 0.3448,
+ "step": 9567
+ },
+ {
+ "epoch": 4.52387706855792,
+ "grad_norm": 3.0068325996398926,
+ "learning_rate": 7.210205464058944e-07,
+ "loss": 0.3341,
+ "step": 9568
+ },
+ {
+ "epoch": 4.52434988179669,
+ "grad_norm": 3.3836655616760254,
+ "learning_rate": 7.205823052336425e-07,
+ "loss": 0.3253,
+ "step": 9569
+ },
+ {
+ "epoch": 4.524822695035461,
+ "grad_norm": 3.4170496463775635,
+ "learning_rate": 7.201441748579271e-07,
+ "loss": 0.3754,
+ "step": 9570
+ },
+ {
+ "epoch": 4.525295508274231,
+ "grad_norm": 3.5432863235473633,
+ "learning_rate": 7.197061553060303e-07,
+ "loss": 0.3265,
+ "step": 9571
+ },
+ {
+ "epoch": 4.525768321513002,
+ "grad_norm": 2.900520086288452,
+ "learning_rate": 7.192682466052243e-07,
+ "loss": 0.2961,
+ "step": 9572
+ },
+ {
+ "epoch": 4.526241134751773,
+ "grad_norm": 3.028733491897583,
+ "learning_rate": 7.188304487827768e-07,
+ "loss": 0.347,
+ "step": 9573
+ },
+ {
+ "epoch": 4.526713947990544,
+ "grad_norm": 2.8739330768585205,
+ "learning_rate": 7.183927618659473e-07,
+ "loss": 0.3265,
+ "step": 9574
+ },
+ {
+ "epoch": 4.527186761229315,
+ "grad_norm": 3.4727251529693604,
+ "learning_rate": 7.179551858819873e-07,
+ "loss": 0.3882,
+ "step": 9575
+ },
+ {
+ "epoch": 4.527659574468085,
+ "grad_norm": 2.950634002685547,
+ "learning_rate": 7.175177208581449e-07,
+ "loss": 0.2699,
+ "step": 9576
+ },
+ {
+ "epoch": 4.528132387706856,
+ "grad_norm": 3.035752773284912,
+ "learning_rate": 7.170803668216572e-07,
+ "loss": 0.3939,
+ "step": 9577
+ },
+ {
+ "epoch": 4.528605200945626,
+ "grad_norm": 2.9155373573303223,
+ "learning_rate": 7.166431237997579e-07,
+ "loss": 0.3112,
+ "step": 9578
+ },
+ {
+ "epoch": 4.529078014184397,
+ "grad_norm": 3.2655560970306396,
+ "learning_rate": 7.162059918196715e-07,
+ "loss": 0.3605,
+ "step": 9579
+ },
+ {
+ "epoch": 4.529550827423168,
+ "grad_norm": 3.0889620780944824,
+ "learning_rate": 7.157689709086157e-07,
+ "loss": 0.3333,
+ "step": 9580
+ },
+ {
+ "epoch": 4.5300236406619385,
+ "grad_norm": 3.193974256515503,
+ "learning_rate": 7.153320610938031e-07,
+ "loss": 0.3206,
+ "step": 9581
+ },
+ {
+ "epoch": 4.530496453900709,
+ "grad_norm": 3.0833280086517334,
+ "learning_rate": 7.148952624024374e-07,
+ "loss": 0.3483,
+ "step": 9582
+ },
+ {
+ "epoch": 4.53096926713948,
+ "grad_norm": 2.8866562843322754,
+ "learning_rate": 7.144585748617163e-07,
+ "loss": 0.3147,
+ "step": 9583
+ },
+ {
+ "epoch": 4.531442080378251,
+ "grad_norm": 3.2411928176879883,
+ "learning_rate": 7.140219984988305e-07,
+ "loss": 0.336,
+ "step": 9584
+ },
+ {
+ "epoch": 4.531914893617021,
+ "grad_norm": 3.0993618965148926,
+ "learning_rate": 7.13585533340963e-07,
+ "loss": 0.3599,
+ "step": 9585
+ },
+ {
+ "epoch": 4.532387706855792,
+ "grad_norm": 3.361176013946533,
+ "learning_rate": 7.131491794152917e-07,
+ "loss": 0.3448,
+ "step": 9586
+ },
+ {
+ "epoch": 4.532860520094562,
+ "grad_norm": 3.0916879177093506,
+ "learning_rate": 7.12712936748986e-07,
+ "loss": 0.3479,
+ "step": 9587
+ },
+ {
+ "epoch": 4.533333333333333,
+ "grad_norm": 3.254135847091675,
+ "learning_rate": 7.122768053692078e-07,
+ "loss": 0.3536,
+ "step": 9588
+ },
+ {
+ "epoch": 4.533806146572104,
+ "grad_norm": 3.120321035385132,
+ "learning_rate": 7.118407853031148e-07,
+ "loss": 0.3604,
+ "step": 9589
+ },
+ {
+ "epoch": 4.534278959810875,
+ "grad_norm": 3.0456507205963135,
+ "learning_rate": 7.114048765778544e-07,
+ "loss": 0.3473,
+ "step": 9590
+ },
+ {
+ "epoch": 4.534751773049646,
+ "grad_norm": 3.7177469730377197,
+ "learning_rate": 7.109690792205704e-07,
+ "loss": 0.374,
+ "step": 9591
+ },
+ {
+ "epoch": 4.535224586288416,
+ "grad_norm": 3.2694458961486816,
+ "learning_rate": 7.105333932583972e-07,
+ "loss": 0.4206,
+ "step": 9592
+ },
+ {
+ "epoch": 4.535697399527187,
+ "grad_norm": 3.506195068359375,
+ "learning_rate": 7.100978187184624e-07,
+ "loss": 0.3483,
+ "step": 9593
+ },
+ {
+ "epoch": 4.536170212765957,
+ "grad_norm": 3.569413661956787,
+ "learning_rate": 7.096623556278887e-07,
+ "loss": 0.3389,
+ "step": 9594
+ },
+ {
+ "epoch": 4.536643026004728,
+ "grad_norm": 3.2686502933502197,
+ "learning_rate": 7.092270040137886e-07,
+ "loss": 0.3571,
+ "step": 9595
+ },
+ {
+ "epoch": 4.537115839243499,
+ "grad_norm": 3.2042582035064697,
+ "learning_rate": 7.087917639032718e-07,
+ "loss": 0.3742,
+ "step": 9596
+ },
+ {
+ "epoch": 4.5375886524822695,
+ "grad_norm": 3.014989137649536,
+ "learning_rate": 7.083566353234375e-07,
+ "loss": 0.3294,
+ "step": 9597
+ },
+ {
+ "epoch": 4.53806146572104,
+ "grad_norm": 3.4535064697265625,
+ "learning_rate": 7.079216183013793e-07,
+ "loss": 0.3434,
+ "step": 9598
+ },
+ {
+ "epoch": 4.538534278959811,
+ "grad_norm": 3.123633623123169,
+ "learning_rate": 7.074867128641841e-07,
+ "loss": 0.357,
+ "step": 9599
+ },
+ {
+ "epoch": 4.539007092198582,
+ "grad_norm": 3.0646567344665527,
+ "learning_rate": 7.070519190389305e-07,
+ "loss": 0.3488,
+ "step": 9600
+ },
+ {
+ "epoch": 4.539479905437352,
+ "grad_norm": 2.951892852783203,
+ "learning_rate": 7.066172368526927e-07,
+ "loss": 0.329,
+ "step": 9601
+ },
+ {
+ "epoch": 4.539952718676123,
+ "grad_norm": 2.8071751594543457,
+ "learning_rate": 7.061826663325361e-07,
+ "loss": 0.2788,
+ "step": 9602
+ },
+ {
+ "epoch": 4.540425531914893,
+ "grad_norm": 3.9670250415802,
+ "learning_rate": 7.057482075055183e-07,
+ "loss": 0.3776,
+ "step": 9603
+ },
+ {
+ "epoch": 4.540898345153664,
+ "grad_norm": 2.683743476867676,
+ "learning_rate": 7.053138603986928e-07,
+ "loss": 0.3044,
+ "step": 9604
+ },
+ {
+ "epoch": 4.541371158392435,
+ "grad_norm": 2.9766221046447754,
+ "learning_rate": 7.048796250391038e-07,
+ "loss": 0.3542,
+ "step": 9605
+ },
+ {
+ "epoch": 4.541843971631206,
+ "grad_norm": 3.1156277656555176,
+ "learning_rate": 7.044455014537882e-07,
+ "loss": 0.3401,
+ "step": 9606
+ },
+ {
+ "epoch": 4.542316784869977,
+ "grad_norm": 2.8444416522979736,
+ "learning_rate": 7.040114896697789e-07,
+ "loss": 0.3437,
+ "step": 9607
+ },
+ {
+ "epoch": 4.542789598108747,
+ "grad_norm": 2.9964232444763184,
+ "learning_rate": 7.035775897140984e-07,
+ "loss": 0.331,
+ "step": 9608
+ },
+ {
+ "epoch": 4.543262411347518,
+ "grad_norm": 3.511500597000122,
+ "learning_rate": 7.031438016137648e-07,
+ "loss": 0.3685,
+ "step": 9609
+ },
+ {
+ "epoch": 4.543735224586288,
+ "grad_norm": 3.541271686553955,
+ "learning_rate": 7.027101253957877e-07,
+ "loss": 0.3945,
+ "step": 9610
+ },
+ {
+ "epoch": 4.544208037825059,
+ "grad_norm": 3.1483919620513916,
+ "learning_rate": 7.022765610871696e-07,
+ "loss": 0.3681,
+ "step": 9611
+ },
+ {
+ "epoch": 4.54468085106383,
+ "grad_norm": 2.908977508544922,
+ "learning_rate": 7.01843108714908e-07,
+ "loss": 0.3468,
+ "step": 9612
+ },
+ {
+ "epoch": 4.5451536643026005,
+ "grad_norm": 3.3107962608337402,
+ "learning_rate": 7.014097683059912e-07,
+ "loss": 0.419,
+ "step": 9613
+ },
+ {
+ "epoch": 4.545626477541371,
+ "grad_norm": 3.5597898960113525,
+ "learning_rate": 7.009765398874008e-07,
+ "loss": 0.3238,
+ "step": 9614
+ },
+ {
+ "epoch": 4.546099290780142,
+ "grad_norm": 3.091235399246216,
+ "learning_rate": 7.005434234861136e-07,
+ "loss": 0.3632,
+ "step": 9615
+ },
+ {
+ "epoch": 4.546572104018913,
+ "grad_norm": 3.279076099395752,
+ "learning_rate": 7.001104191290972e-07,
+ "loss": 0.4006,
+ "step": 9616
+ },
+ {
+ "epoch": 4.547044917257683,
+ "grad_norm": 3.3877902030944824,
+ "learning_rate": 6.996775268433126e-07,
+ "loss": 0.4183,
+ "step": 9617
+ },
+ {
+ "epoch": 4.547517730496454,
+ "grad_norm": 2.979999542236328,
+ "learning_rate": 6.992447466557134e-07,
+ "loss": 0.2921,
+ "step": 9618
+ },
+ {
+ "epoch": 4.547990543735224,
+ "grad_norm": 3.196361780166626,
+ "learning_rate": 6.988120785932484e-07,
+ "loss": 0.3352,
+ "step": 9619
+ },
+ {
+ "epoch": 4.548463356973995,
+ "grad_norm": 3.3237528800964355,
+ "learning_rate": 6.983795226828577e-07,
+ "loss": 0.3487,
+ "step": 9620
+ },
+ {
+ "epoch": 4.548936170212766,
+ "grad_norm": 3.0740649700164795,
+ "learning_rate": 6.979470789514731e-07,
+ "loss": 0.3497,
+ "step": 9621
+ },
+ {
+ "epoch": 4.549408983451537,
+ "grad_norm": 3.3443479537963867,
+ "learning_rate": 6.97514747426023e-07,
+ "loss": 0.3752,
+ "step": 9622
+ },
+ {
+ "epoch": 4.549881796690308,
+ "grad_norm": 3.450427293777466,
+ "learning_rate": 6.970825281334254e-07,
+ "loss": 0.3981,
+ "step": 9623
+ },
+ {
+ "epoch": 4.550354609929078,
+ "grad_norm": 3.4733047485351562,
+ "learning_rate": 6.966504211005937e-07,
+ "loss": 0.3953,
+ "step": 9624
+ },
+ {
+ "epoch": 4.550827423167849,
+ "grad_norm": 3.1651546955108643,
+ "learning_rate": 6.962184263544328e-07,
+ "loss": 0.3012,
+ "step": 9625
+ },
+ {
+ "epoch": 4.551300236406619,
+ "grad_norm": 2.9222865104675293,
+ "learning_rate": 6.957865439218405e-07,
+ "loss": 0.2774,
+ "step": 9626
+ },
+ {
+ "epoch": 4.55177304964539,
+ "grad_norm": 2.972437620162964,
+ "learning_rate": 6.953547738297095e-07,
+ "loss": 0.3478,
+ "step": 9627
+ },
+ {
+ "epoch": 4.552245862884161,
+ "grad_norm": 3.2741193771362305,
+ "learning_rate": 6.949231161049239e-07,
+ "loss": 0.355,
+ "step": 9628
+ },
+ {
+ "epoch": 4.5527186761229315,
+ "grad_norm": 2.8715150356292725,
+ "learning_rate": 6.9449157077436e-07,
+ "loss": 0.3055,
+ "step": 9629
+ },
+ {
+ "epoch": 4.553191489361702,
+ "grad_norm": 4.50998592376709,
+ "learning_rate": 6.940601378648895e-07,
+ "loss": 0.3732,
+ "step": 9630
+ },
+ {
+ "epoch": 4.553664302600473,
+ "grad_norm": 2.9277849197387695,
+ "learning_rate": 6.936288174033757e-07,
+ "loss": 0.3367,
+ "step": 9631
+ },
+ {
+ "epoch": 4.554137115839244,
+ "grad_norm": 3.169978380203247,
+ "learning_rate": 6.931976094166746e-07,
+ "loss": 0.3529,
+ "step": 9632
+ },
+ {
+ "epoch": 4.554609929078014,
+ "grad_norm": 2.9629712104797363,
+ "learning_rate": 6.927665139316359e-07,
+ "loss": 0.3416,
+ "step": 9633
+ },
+ {
+ "epoch": 4.555082742316785,
+ "grad_norm": 3.1368603706359863,
+ "learning_rate": 6.923355309751012e-07,
+ "loss": 0.3267,
+ "step": 9634
+ },
+ {
+ "epoch": 4.555555555555555,
+ "grad_norm": 2.9895052909851074,
+ "learning_rate": 6.919046605739071e-07,
+ "loss": 0.3411,
+ "step": 9635
+ },
+ {
+ "epoch": 4.556028368794326,
+ "grad_norm": 3.1592509746551514,
+ "learning_rate": 6.914739027548809e-07,
+ "loss": 0.3488,
+ "step": 9636
+ },
+ {
+ "epoch": 4.556501182033097,
+ "grad_norm": 3.0848731994628906,
+ "learning_rate": 6.910432575448456e-07,
+ "loss": 0.3732,
+ "step": 9637
+ },
+ {
+ "epoch": 4.556973995271868,
+ "grad_norm": 3.1475934982299805,
+ "learning_rate": 6.906127249706143e-07,
+ "loss": 0.3525,
+ "step": 9638
+ },
+ {
+ "epoch": 4.5574468085106385,
+ "grad_norm": 2.9435455799102783,
+ "learning_rate": 6.90182305058994e-07,
+ "loss": 0.3155,
+ "step": 9639
+ },
+ {
+ "epoch": 4.557919621749409,
+ "grad_norm": 3.4412894248962402,
+ "learning_rate": 6.897519978367867e-07,
+ "loss": 0.3511,
+ "step": 9640
+ },
+ {
+ "epoch": 4.55839243498818,
+ "grad_norm": 3.3600406646728516,
+ "learning_rate": 6.893218033307838e-07,
+ "loss": 0.4311,
+ "step": 9641
+ },
+ {
+ "epoch": 4.55886524822695,
+ "grad_norm": 3.35927414894104,
+ "learning_rate": 6.888917215677734e-07,
+ "loss": 0.387,
+ "step": 9642
+ },
+ {
+ "epoch": 4.559338061465721,
+ "grad_norm": 3.2481210231781006,
+ "learning_rate": 6.884617525745343e-07,
+ "loss": 0.3456,
+ "step": 9643
+ },
+ {
+ "epoch": 4.559810874704492,
+ "grad_norm": 3.661160469055176,
+ "learning_rate": 6.880318963778374e-07,
+ "loss": 0.4276,
+ "step": 9644
+ },
+ {
+ "epoch": 4.560283687943262,
+ "grad_norm": 3.038726806640625,
+ "learning_rate": 6.876021530044502e-07,
+ "loss": 0.3288,
+ "step": 9645
+ },
+ {
+ "epoch": 4.560756501182033,
+ "grad_norm": 3.0502963066101074,
+ "learning_rate": 6.871725224811296e-07,
+ "loss": 0.3334,
+ "step": 9646
+ },
+ {
+ "epoch": 4.561229314420804,
+ "grad_norm": 3.1810805797576904,
+ "learning_rate": 6.867430048346268e-07,
+ "loss": 0.3335,
+ "step": 9647
+ },
+ {
+ "epoch": 4.561702127659575,
+ "grad_norm": 3.028670072555542,
+ "learning_rate": 6.863136000916864e-07,
+ "loss": 0.3235,
+ "step": 9648
+ },
+ {
+ "epoch": 4.562174940898345,
+ "grad_norm": 2.805989980697632,
+ "learning_rate": 6.858843082790447e-07,
+ "loss": 0.3201,
+ "step": 9649
+ },
+ {
+ "epoch": 4.5626477541371155,
+ "grad_norm": 3.0792744159698486,
+ "learning_rate": 6.854551294234333e-07,
+ "loss": 0.3757,
+ "step": 9650
+ },
+ {
+ "epoch": 4.563120567375886,
+ "grad_norm": 3.115539312362671,
+ "learning_rate": 6.850260635515735e-07,
+ "loss": 0.3311,
+ "step": 9651
+ },
+ {
+ "epoch": 4.563593380614657,
+ "grad_norm": 3.003520965576172,
+ "learning_rate": 6.845971106901831e-07,
+ "loss": 0.35,
+ "step": 9652
+ },
+ {
+ "epoch": 4.564066193853428,
+ "grad_norm": 2.954759359359741,
+ "learning_rate": 6.841682708659702e-07,
+ "loss": 0.279,
+ "step": 9653
+ },
+ {
+ "epoch": 4.564539007092199,
+ "grad_norm": 3.1510894298553467,
+ "learning_rate": 6.83739544105636e-07,
+ "loss": 0.333,
+ "step": 9654
+ },
+ {
+ "epoch": 4.5650118203309695,
+ "grad_norm": 3.3958635330200195,
+ "learning_rate": 6.833109304358776e-07,
+ "loss": 0.3668,
+ "step": 9655
+ },
+ {
+ "epoch": 4.56548463356974,
+ "grad_norm": 3.493522882461548,
+ "learning_rate": 6.828824298833811e-07,
+ "loss": 0.3522,
+ "step": 9656
+ },
+ {
+ "epoch": 4.565957446808511,
+ "grad_norm": 3.217268705368042,
+ "learning_rate": 6.824540424748275e-07,
+ "loss": 0.4066,
+ "step": 9657
+ },
+ {
+ "epoch": 4.566430260047281,
+ "grad_norm": 3.148505210876465,
+ "learning_rate": 6.820257682368914e-07,
+ "loss": 0.3252,
+ "step": 9658
+ },
+ {
+ "epoch": 4.566903073286052,
+ "grad_norm": 3.070316791534424,
+ "learning_rate": 6.815976071962385e-07,
+ "loss": 0.3362,
+ "step": 9659
+ },
+ {
+ "epoch": 4.567375886524823,
+ "grad_norm": 3.0421791076660156,
+ "learning_rate": 6.811695593795301e-07,
+ "loss": 0.3894,
+ "step": 9660
+ },
+ {
+ "epoch": 4.567848699763593,
+ "grad_norm": 2.9165565967559814,
+ "learning_rate": 6.807416248134177e-07,
+ "loss": 0.3147,
+ "step": 9661
+ },
+ {
+ "epoch": 4.568321513002364,
+ "grad_norm": 3.361647129058838,
+ "learning_rate": 6.803138035245471e-07,
+ "loss": 0.3346,
+ "step": 9662
+ },
+ {
+ "epoch": 4.568794326241135,
+ "grad_norm": 3.0013155937194824,
+ "learning_rate": 6.79886095539557e-07,
+ "loss": 0.3476,
+ "step": 9663
+ },
+ {
+ "epoch": 4.569267139479906,
+ "grad_norm": 2.9030165672302246,
+ "learning_rate": 6.794585008850779e-07,
+ "loss": 0.3118,
+ "step": 9664
+ },
+ {
+ "epoch": 4.569739952718676,
+ "grad_norm": 3.229907989501953,
+ "learning_rate": 6.790310195877361e-07,
+ "loss": 0.3257,
+ "step": 9665
+ },
+ {
+ "epoch": 4.5702127659574465,
+ "grad_norm": 3.4075570106506348,
+ "learning_rate": 6.786036516741479e-07,
+ "loss": 0.3545,
+ "step": 9666
+ },
+ {
+ "epoch": 4.570685579196217,
+ "grad_norm": 2.9831581115722656,
+ "learning_rate": 6.781763971709229e-07,
+ "loss": 0.3173,
+ "step": 9667
+ },
+ {
+ "epoch": 4.571158392434988,
+ "grad_norm": 3.8512840270996094,
+ "learning_rate": 6.777492561046659e-07,
+ "loss": 0.288,
+ "step": 9668
+ },
+ {
+ "epoch": 4.571631205673759,
+ "grad_norm": 3.3054401874542236,
+ "learning_rate": 6.773222285019718e-07,
+ "loss": 0.369,
+ "step": 9669
+ },
+ {
+ "epoch": 4.57210401891253,
+ "grad_norm": 2.9155004024505615,
+ "learning_rate": 6.768953143894308e-07,
+ "loss": 0.3334,
+ "step": 9670
+ },
+ {
+ "epoch": 4.5725768321513005,
+ "grad_norm": 3.60557222366333,
+ "learning_rate": 6.764685137936247e-07,
+ "loss": 0.4094,
+ "step": 9671
+ },
+ {
+ "epoch": 4.573049645390071,
+ "grad_norm": 3.271256446838379,
+ "learning_rate": 6.760418267411275e-07,
+ "loss": 0.3646,
+ "step": 9672
+ },
+ {
+ "epoch": 4.573522458628842,
+ "grad_norm": 2.970238447189331,
+ "learning_rate": 6.756152532585086e-07,
+ "loss": 0.34,
+ "step": 9673
+ },
+ {
+ "epoch": 4.573995271867612,
+ "grad_norm": 3.412712574005127,
+ "learning_rate": 6.751887933723277e-07,
+ "loss": 0.3674,
+ "step": 9674
+ },
+ {
+ "epoch": 4.574468085106383,
+ "grad_norm": 2.9984517097473145,
+ "learning_rate": 6.747624471091396e-07,
+ "loss": 0.3579,
+ "step": 9675
+ },
+ {
+ "epoch": 4.574940898345154,
+ "grad_norm": 2.863788366317749,
+ "learning_rate": 6.743362144954907e-07,
+ "loss": 0.3234,
+ "step": 9676
+ },
+ {
+ "epoch": 4.575413711583924,
+ "grad_norm": 3.313793897628784,
+ "learning_rate": 6.739100955579203e-07,
+ "loss": 0.334,
+ "step": 9677
+ },
+ {
+ "epoch": 4.575886524822695,
+ "grad_norm": 3.5350630283355713,
+ "learning_rate": 6.734840903229611e-07,
+ "loss": 0.3682,
+ "step": 9678
+ },
+ {
+ "epoch": 4.576359338061466,
+ "grad_norm": 3.531888723373413,
+ "learning_rate": 6.730581988171378e-07,
+ "loss": 0.3434,
+ "step": 9679
+ },
+ {
+ "epoch": 4.576832151300237,
+ "grad_norm": 3.358574867248535,
+ "learning_rate": 6.726324210669702e-07,
+ "loss": 0.3751,
+ "step": 9680
+ },
+ {
+ "epoch": 4.577304964539007,
+ "grad_norm": 2.9723873138427734,
+ "learning_rate": 6.722067570989691e-07,
+ "loss": 0.3077,
+ "step": 9681
+ },
+ {
+ "epoch": 4.5777777777777775,
+ "grad_norm": 3.2287187576293945,
+ "learning_rate": 6.717812069396379e-07,
+ "loss": 0.3493,
+ "step": 9682
+ },
+ {
+ "epoch": 4.578250591016548,
+ "grad_norm": 2.9089417457580566,
+ "learning_rate": 6.71355770615475e-07,
+ "loss": 0.324,
+ "step": 9683
+ },
+ {
+ "epoch": 4.578723404255319,
+ "grad_norm": 3.2894415855407715,
+ "learning_rate": 6.709304481529703e-07,
+ "loss": 0.3066,
+ "step": 9684
+ },
+ {
+ "epoch": 4.57919621749409,
+ "grad_norm": 3.1914620399475098,
+ "learning_rate": 6.705052395786052e-07,
+ "loss": 0.3453,
+ "step": 9685
+ },
+ {
+ "epoch": 4.579669030732861,
+ "grad_norm": 3.1095924377441406,
+ "learning_rate": 6.700801449188577e-07,
+ "loss": 0.3678,
+ "step": 9686
+ },
+ {
+ "epoch": 4.5801418439716315,
+ "grad_norm": 3.416944980621338,
+ "learning_rate": 6.696551642001948e-07,
+ "loss": 0.3754,
+ "step": 9687
+ },
+ {
+ "epoch": 4.580614657210402,
+ "grad_norm": 3.7102952003479004,
+ "learning_rate": 6.692302974490797e-07,
+ "loss": 0.3723,
+ "step": 9688
+ },
+ {
+ "epoch": 4.581087470449172,
+ "grad_norm": 3.296607494354248,
+ "learning_rate": 6.688055446919664e-07,
+ "loss": 0.3607,
+ "step": 9689
+ },
+ {
+ "epoch": 4.581560283687943,
+ "grad_norm": 3.4449238777160645,
+ "learning_rate": 6.683809059553014e-07,
+ "loss": 0.3102,
+ "step": 9690
+ },
+ {
+ "epoch": 4.582033096926714,
+ "grad_norm": 3.202671766281128,
+ "learning_rate": 6.679563812655268e-07,
+ "loss": 0.3535,
+ "step": 9691
+ },
+ {
+ "epoch": 4.582505910165485,
+ "grad_norm": 3.25919771194458,
+ "learning_rate": 6.675319706490744e-07,
+ "loss": 0.3778,
+ "step": 9692
+ },
+ {
+ "epoch": 4.582978723404255,
+ "grad_norm": 3.316021680831909,
+ "learning_rate": 6.671076741323718e-07,
+ "loss": 0.2943,
+ "step": 9693
+ },
+ {
+ "epoch": 4.583451536643026,
+ "grad_norm": 3.2375826835632324,
+ "learning_rate": 6.666834917418371e-07,
+ "loss": 0.3249,
+ "step": 9694
+ },
+ {
+ "epoch": 4.583924349881797,
+ "grad_norm": 2.875436782836914,
+ "learning_rate": 6.662594235038827e-07,
+ "loss": 0.2991,
+ "step": 9695
+ },
+ {
+ "epoch": 4.584397163120567,
+ "grad_norm": 3.75874924659729,
+ "learning_rate": 6.658354694449134e-07,
+ "loss": 0.3718,
+ "step": 9696
+ },
+ {
+ "epoch": 4.584869976359338,
+ "grad_norm": 3.060943126678467,
+ "learning_rate": 6.65411629591326e-07,
+ "loss": 0.3183,
+ "step": 9697
+ },
+ {
+ "epoch": 4.5853427895981085,
+ "grad_norm": 3.024336576461792,
+ "learning_rate": 6.649879039695126e-07,
+ "loss": 0.3118,
+ "step": 9698
+ },
+ {
+ "epoch": 4.585815602836879,
+ "grad_norm": 3.3640875816345215,
+ "learning_rate": 6.645642926058562e-07,
+ "loss": 0.3408,
+ "step": 9699
+ },
+ {
+ "epoch": 4.58628841607565,
+ "grad_norm": 2.8885910511016846,
+ "learning_rate": 6.641407955267326e-07,
+ "loss": 0.3304,
+ "step": 9700
+ },
+ {
+ "epoch": 4.586761229314421,
+ "grad_norm": 3.8225393295288086,
+ "learning_rate": 6.637174127585122e-07,
+ "loss": 0.3469,
+ "step": 9701
+ },
+ {
+ "epoch": 4.587234042553192,
+ "grad_norm": 3.0624778270721436,
+ "learning_rate": 6.632941443275567e-07,
+ "loss": 0.3177,
+ "step": 9702
+ },
+ {
+ "epoch": 4.5877068557919625,
+ "grad_norm": 3.1422903537750244,
+ "learning_rate": 6.628709902602204e-07,
+ "loss": 0.3205,
+ "step": 9703
+ },
+ {
+ "epoch": 4.588179669030733,
+ "grad_norm": 3.1315362453460693,
+ "learning_rate": 6.62447950582853e-07,
+ "loss": 0.3443,
+ "step": 9704
+ },
+ {
+ "epoch": 4.588652482269503,
+ "grad_norm": 3.096041202545166,
+ "learning_rate": 6.62025025321793e-07,
+ "loss": 0.3567,
+ "step": 9705
+ },
+ {
+ "epoch": 4.589125295508274,
+ "grad_norm": 3.225820302963257,
+ "learning_rate": 6.616022145033766e-07,
+ "loss": 0.3873,
+ "step": 9706
+ },
+ {
+ "epoch": 4.589598108747045,
+ "grad_norm": 3.3879058361053467,
+ "learning_rate": 6.611795181539288e-07,
+ "loss": 0.4379,
+ "step": 9707
+ },
+ {
+ "epoch": 4.590070921985816,
+ "grad_norm": 3.508265733718872,
+ "learning_rate": 6.60756936299769e-07,
+ "loss": 0.3182,
+ "step": 9708
+ },
+ {
+ "epoch": 4.590543735224586,
+ "grad_norm": 3.278857946395874,
+ "learning_rate": 6.603344689672106e-07,
+ "loss": 0.3508,
+ "step": 9709
+ },
+ {
+ "epoch": 4.591016548463357,
+ "grad_norm": 2.9961371421813965,
+ "learning_rate": 6.599121161825581e-07,
+ "loss": 0.3178,
+ "step": 9710
+ },
+ {
+ "epoch": 4.591489361702128,
+ "grad_norm": 3.413717269897461,
+ "learning_rate": 6.594898779721092e-07,
+ "loss": 0.363,
+ "step": 9711
+ },
+ {
+ "epoch": 4.591962174940898,
+ "grad_norm": 3.2014074325561523,
+ "learning_rate": 6.590677543621557e-07,
+ "loss": 0.392,
+ "step": 9712
+ },
+ {
+ "epoch": 4.592434988179669,
+ "grad_norm": 3.0421640872955322,
+ "learning_rate": 6.586457453789802e-07,
+ "loss": 0.3119,
+ "step": 9713
+ },
+ {
+ "epoch": 4.5929078014184395,
+ "grad_norm": 3.0515928268432617,
+ "learning_rate": 6.582238510488604e-07,
+ "loss": 0.3021,
+ "step": 9714
+ },
+ {
+ "epoch": 4.59338061465721,
+ "grad_norm": 3.0824668407440186,
+ "learning_rate": 6.578020713980648e-07,
+ "loss": 0.3551,
+ "step": 9715
+ },
+ {
+ "epoch": 4.593853427895981,
+ "grad_norm": 3.0002171993255615,
+ "learning_rate": 6.573804064528574e-07,
+ "loss": 0.3691,
+ "step": 9716
+ },
+ {
+ "epoch": 4.594326241134752,
+ "grad_norm": 3.0174765586853027,
+ "learning_rate": 6.569588562394924e-07,
+ "loss": 0.3289,
+ "step": 9717
+ },
+ {
+ "epoch": 4.594799054373523,
+ "grad_norm": 4.098819732666016,
+ "learning_rate": 6.565374207842171e-07,
+ "loss": 0.3637,
+ "step": 9718
+ },
+ {
+ "epoch": 4.5952718676122934,
+ "grad_norm": 3.396275281906128,
+ "learning_rate": 6.561161001132737e-07,
+ "loss": 0.348,
+ "step": 9719
+ },
+ {
+ "epoch": 4.595744680851064,
+ "grad_norm": 3.6430864334106445,
+ "learning_rate": 6.556948942528952e-07,
+ "loss": 0.3543,
+ "step": 9720
+ },
+ {
+ "epoch": 4.596217494089834,
+ "grad_norm": 3.170236587524414,
+ "learning_rate": 6.552738032293093e-07,
+ "loss": 0.3565,
+ "step": 9721
+ },
+ {
+ "epoch": 4.596690307328605,
+ "grad_norm": 3.402683734893799,
+ "learning_rate": 6.548528270687349e-07,
+ "loss": 0.4001,
+ "step": 9722
+ },
+ {
+ "epoch": 4.597163120567376,
+ "grad_norm": 2.861463785171509,
+ "learning_rate": 6.544319657973833e-07,
+ "loss": 0.3436,
+ "step": 9723
+ },
+ {
+ "epoch": 4.5976359338061465,
+ "grad_norm": 3.209259510040283,
+ "learning_rate": 6.540112194414613e-07,
+ "loss": 0.3317,
+ "step": 9724
+ },
+ {
+ "epoch": 4.598108747044917,
+ "grad_norm": 3.099533796310425,
+ "learning_rate": 6.535905880271662e-07,
+ "loss": 0.3416,
+ "step": 9725
+ },
+ {
+ "epoch": 4.598581560283688,
+ "grad_norm": 3.3558053970336914,
+ "learning_rate": 6.531700715806891e-07,
+ "loss": 0.3567,
+ "step": 9726
+ },
+ {
+ "epoch": 4.599054373522459,
+ "grad_norm": 3.1330227851867676,
+ "learning_rate": 6.527496701282135e-07,
+ "loss": 0.3025,
+ "step": 9727
+ },
+ {
+ "epoch": 4.599527186761229,
+ "grad_norm": 3.140184164047241,
+ "learning_rate": 6.523293836959152e-07,
+ "loss": 0.3195,
+ "step": 9728
+ },
+ {
+ "epoch": 4.6,
+ "grad_norm": 3.246844530105591,
+ "learning_rate": 6.519092123099652e-07,
+ "loss": 0.3367,
+ "step": 9729
+ },
+ {
+ "epoch": 4.60047281323877,
+ "grad_norm": 3.1590709686279297,
+ "learning_rate": 6.51489155996525e-07,
+ "loss": 0.367,
+ "step": 9730
+ },
+ {
+ "epoch": 4.600945626477541,
+ "grad_norm": 3.122746467590332,
+ "learning_rate": 6.510692147817488e-07,
+ "loss": 0.3401,
+ "step": 9731
+ },
+ {
+ "epoch": 4.601418439716312,
+ "grad_norm": 3.0418715476989746,
+ "learning_rate": 6.506493886917859e-07,
+ "loss": 0.2958,
+ "step": 9732
+ },
+ {
+ "epoch": 4.601891252955083,
+ "grad_norm": 3.06303334236145,
+ "learning_rate": 6.502296777527756e-07,
+ "loss": 0.3459,
+ "step": 9733
+ },
+ {
+ "epoch": 4.602364066193854,
+ "grad_norm": 3.0807206630706787,
+ "learning_rate": 6.498100819908532e-07,
+ "loss": 0.3473,
+ "step": 9734
+ },
+ {
+ "epoch": 4.602836879432624,
+ "grad_norm": 3.790008306503296,
+ "learning_rate": 6.493906014321441e-07,
+ "loss": 0.3541,
+ "step": 9735
+ },
+ {
+ "epoch": 4.603309692671395,
+ "grad_norm": 3.3040049076080322,
+ "learning_rate": 6.489712361027667e-07,
+ "loss": 0.3317,
+ "step": 9736
+ },
+ {
+ "epoch": 4.603782505910165,
+ "grad_norm": 3.3145735263824463,
+ "learning_rate": 6.485519860288347e-07,
+ "loss": 0.3351,
+ "step": 9737
+ },
+ {
+ "epoch": 4.604255319148936,
+ "grad_norm": 3.1374423503875732,
+ "learning_rate": 6.481328512364515e-07,
+ "loss": 0.3544,
+ "step": 9738
+ },
+ {
+ "epoch": 4.604728132387707,
+ "grad_norm": 3.3598453998565674,
+ "learning_rate": 6.477138317517162e-07,
+ "loss": 0.4219,
+ "step": 9739
+ },
+ {
+ "epoch": 4.6052009456264775,
+ "grad_norm": 3.227466583251953,
+ "learning_rate": 6.472949276007187e-07,
+ "loss": 0.3179,
+ "step": 9740
+ },
+ {
+ "epoch": 4.605673758865248,
+ "grad_norm": 2.9815897941589355,
+ "learning_rate": 6.46876138809542e-07,
+ "loss": 0.3753,
+ "step": 9741
+ },
+ {
+ "epoch": 4.606146572104019,
+ "grad_norm": 3.072967290878296,
+ "learning_rate": 6.464574654042624e-07,
+ "loss": 0.3288,
+ "step": 9742
+ },
+ {
+ "epoch": 4.60661938534279,
+ "grad_norm": 3.695613145828247,
+ "learning_rate": 6.460389074109482e-07,
+ "loss": 0.3305,
+ "step": 9743
+ },
+ {
+ "epoch": 4.60709219858156,
+ "grad_norm": 3.205684185028076,
+ "learning_rate": 6.456204648556628e-07,
+ "loss": 0.3305,
+ "step": 9744
+ },
+ {
+ "epoch": 4.607565011820331,
+ "grad_norm": 3.216615915298462,
+ "learning_rate": 6.452021377644596e-07,
+ "loss": 0.3416,
+ "step": 9745
+ },
+ {
+ "epoch": 4.608037825059101,
+ "grad_norm": 3.2224013805389404,
+ "learning_rate": 6.447839261633856e-07,
+ "loss": 0.3773,
+ "step": 9746
+ },
+ {
+ "epoch": 4.608510638297872,
+ "grad_norm": 3.2811145782470703,
+ "learning_rate": 6.443658300784824e-07,
+ "loss": 0.3292,
+ "step": 9747
+ },
+ {
+ "epoch": 4.608983451536643,
+ "grad_norm": 3.8610804080963135,
+ "learning_rate": 6.439478495357815e-07,
+ "loss": 0.3975,
+ "step": 9748
+ },
+ {
+ "epoch": 4.609456264775414,
+ "grad_norm": 3.2154266834259033,
+ "learning_rate": 6.435299845613102e-07,
+ "loss": 0.3367,
+ "step": 9749
+ },
+ {
+ "epoch": 4.609929078014185,
+ "grad_norm": 3.18072509765625,
+ "learning_rate": 6.431122351810862e-07,
+ "loss": 0.3972,
+ "step": 9750
+ },
+ {
+ "epoch": 4.610401891252955,
+ "grad_norm": 3.513521194458008,
+ "learning_rate": 6.426946014211205e-07,
+ "loss": 0.374,
+ "step": 9751
+ },
+ {
+ "epoch": 4.610874704491726,
+ "grad_norm": 3.2900753021240234,
+ "learning_rate": 6.422770833074188e-07,
+ "loss": 0.3823,
+ "step": 9752
+ },
+ {
+ "epoch": 4.611347517730496,
+ "grad_norm": 2.791400194168091,
+ "learning_rate": 6.418596808659772e-07,
+ "loss": 0.3187,
+ "step": 9753
+ },
+ {
+ "epoch": 4.611820330969267,
+ "grad_norm": 3.042336940765381,
+ "learning_rate": 6.414423941227846e-07,
+ "loss": 0.3832,
+ "step": 9754
+ },
+ {
+ "epoch": 4.612293144208038,
+ "grad_norm": 3.130197286605835,
+ "learning_rate": 6.410252231038255e-07,
+ "loss": 0.3152,
+ "step": 9755
+ },
+ {
+ "epoch": 4.6127659574468085,
+ "grad_norm": 3.28125262260437,
+ "learning_rate": 6.406081678350745e-07,
+ "loss": 0.3082,
+ "step": 9756
+ },
+ {
+ "epoch": 4.613238770685579,
+ "grad_norm": 3.5695526599884033,
+ "learning_rate": 6.401912283424988e-07,
+ "loss": 0.4303,
+ "step": 9757
+ },
+ {
+ "epoch": 4.61371158392435,
+ "grad_norm": 2.9045464992523193,
+ "learning_rate": 6.397744046520612e-07,
+ "loss": 0.3392,
+ "step": 9758
+ },
+ {
+ "epoch": 4.614184397163121,
+ "grad_norm": 3.4325780868530273,
+ "learning_rate": 6.393576967897145e-07,
+ "loss": 0.3446,
+ "step": 9759
+ },
+ {
+ "epoch": 4.614657210401891,
+ "grad_norm": 3.1146414279937744,
+ "learning_rate": 6.389411047814053e-07,
+ "loss": 0.3444,
+ "step": 9760
+ },
+ {
+ "epoch": 4.615130023640662,
+ "grad_norm": 3.9922995567321777,
+ "learning_rate": 6.385246286530722e-07,
+ "loss": 0.3431,
+ "step": 9761
+ },
+ {
+ "epoch": 4.615602836879432,
+ "grad_norm": 2.868818759918213,
+ "learning_rate": 6.381082684306491e-07,
+ "loss": 0.2819,
+ "step": 9762
+ },
+ {
+ "epoch": 4.616075650118203,
+ "grad_norm": 3.1957287788391113,
+ "learning_rate": 6.376920241400597e-07,
+ "loss": 0.315,
+ "step": 9763
+ },
+ {
+ "epoch": 4.616548463356974,
+ "grad_norm": 3.327913999557495,
+ "learning_rate": 6.372758958072215e-07,
+ "loss": 0.3224,
+ "step": 9764
+ },
+ {
+ "epoch": 4.617021276595745,
+ "grad_norm": 3.2451798915863037,
+ "learning_rate": 6.368598834580461e-07,
+ "loss": 0.3219,
+ "step": 9765
+ },
+ {
+ "epoch": 4.617494089834516,
+ "grad_norm": 3.328977346420288,
+ "learning_rate": 6.364439871184355e-07,
+ "loss": 0.3123,
+ "step": 9766
+ },
+ {
+ "epoch": 4.617966903073286,
+ "grad_norm": 2.929624557495117,
+ "learning_rate": 6.36028206814287e-07,
+ "loss": 0.3137,
+ "step": 9767
+ },
+ {
+ "epoch": 4.618439716312057,
+ "grad_norm": 3.2356855869293213,
+ "learning_rate": 6.356125425714888e-07,
+ "loss": 0.3672,
+ "step": 9768
+ },
+ {
+ "epoch": 4.618912529550827,
+ "grad_norm": 3.099452018737793,
+ "learning_rate": 6.351969944159217e-07,
+ "loss": 0.3875,
+ "step": 9769
+ },
+ {
+ "epoch": 4.619385342789598,
+ "grad_norm": 4.037657260894775,
+ "learning_rate": 6.347815623734616e-07,
+ "loss": 0.3984,
+ "step": 9770
+ },
+ {
+ "epoch": 4.619858156028369,
+ "grad_norm": 3.350639581680298,
+ "learning_rate": 6.343662464699743e-07,
+ "loss": 0.3325,
+ "step": 9771
+ },
+ {
+ "epoch": 4.6203309692671395,
+ "grad_norm": 3.3933796882629395,
+ "learning_rate": 6.339510467313206e-07,
+ "loss": 0.3922,
+ "step": 9772
+ },
+ {
+ "epoch": 4.62080378250591,
+ "grad_norm": 2.8599045276641846,
+ "learning_rate": 6.335359631833532e-07,
+ "loss": 0.2677,
+ "step": 9773
+ },
+ {
+ "epoch": 4.621276595744681,
+ "grad_norm": 3.0792534351348877,
+ "learning_rate": 6.331209958519172e-07,
+ "loss": 0.3784,
+ "step": 9774
+ },
+ {
+ "epoch": 4.621749408983452,
+ "grad_norm": 3.1678860187530518,
+ "learning_rate": 6.327061447628507e-07,
+ "loss": 0.3698,
+ "step": 9775
+ },
+ {
+ "epoch": 4.622222222222222,
+ "grad_norm": 3.500584602355957,
+ "learning_rate": 6.322914099419846e-07,
+ "loss": 0.281,
+ "step": 9776
+ },
+ {
+ "epoch": 4.622695035460993,
+ "grad_norm": 3.089900016784668,
+ "learning_rate": 6.318767914151422e-07,
+ "loss": 0.3202,
+ "step": 9777
+ },
+ {
+ "epoch": 4.623167848699763,
+ "grad_norm": 3.353118896484375,
+ "learning_rate": 6.31462289208141e-07,
+ "loss": 0.3584,
+ "step": 9778
+ },
+ {
+ "epoch": 4.623640661938534,
+ "grad_norm": 3.1742143630981445,
+ "learning_rate": 6.310479033467893e-07,
+ "loss": 0.3309,
+ "step": 9779
+ },
+ {
+ "epoch": 4.624113475177305,
+ "grad_norm": 3.5430498123168945,
+ "learning_rate": 6.306336338568903e-07,
+ "loss": 0.3972,
+ "step": 9780
+ },
+ {
+ "epoch": 4.624586288416076,
+ "grad_norm": 3.141406774520874,
+ "learning_rate": 6.302194807642379e-07,
+ "loss": 0.3875,
+ "step": 9781
+ },
+ {
+ "epoch": 4.625059101654847,
+ "grad_norm": 3.1661601066589355,
+ "learning_rate": 6.298054440946188e-07,
+ "loss": 0.3969,
+ "step": 9782
+ },
+ {
+ "epoch": 4.625531914893617,
+ "grad_norm": 2.9834651947021484,
+ "learning_rate": 6.293915238738149e-07,
+ "loss": 0.3357,
+ "step": 9783
+ },
+ {
+ "epoch": 4.626004728132388,
+ "grad_norm": 3.497030258178711,
+ "learning_rate": 6.289777201275979e-07,
+ "loss": 0.3683,
+ "step": 9784
+ },
+ {
+ "epoch": 4.626477541371158,
+ "grad_norm": 3.519390106201172,
+ "learning_rate": 6.285640328817347e-07,
+ "loss": 0.3647,
+ "step": 9785
+ },
+ {
+ "epoch": 4.626950354609929,
+ "grad_norm": 3.0032200813293457,
+ "learning_rate": 6.281504621619833e-07,
+ "loss": 0.2854,
+ "step": 9786
+ },
+ {
+ "epoch": 4.6274231678487,
+ "grad_norm": 2.9891152381896973,
+ "learning_rate": 6.277370079940939e-07,
+ "loss": 0.3771,
+ "step": 9787
+ },
+ {
+ "epoch": 4.6278959810874705,
+ "grad_norm": 3.379671812057495,
+ "learning_rate": 6.273236704038122e-07,
+ "loss": 0.3916,
+ "step": 9788
+ },
+ {
+ "epoch": 4.628368794326241,
+ "grad_norm": 2.9964048862457275,
+ "learning_rate": 6.26910449416874e-07,
+ "loss": 0.3618,
+ "step": 9789
+ },
+ {
+ "epoch": 4.628841607565012,
+ "grad_norm": 3.0143628120422363,
+ "learning_rate": 6.264973450590089e-07,
+ "loss": 0.336,
+ "step": 9790
+ },
+ {
+ "epoch": 4.629314420803783,
+ "grad_norm": 2.956737756729126,
+ "learning_rate": 6.260843573559392e-07,
+ "loss": 0.3657,
+ "step": 9791
+ },
+ {
+ "epoch": 4.629787234042553,
+ "grad_norm": 3.057551145553589,
+ "learning_rate": 6.256714863333787e-07,
+ "loss": 0.3475,
+ "step": 9792
+ },
+ {
+ "epoch": 4.630260047281324,
+ "grad_norm": 3.9289608001708984,
+ "learning_rate": 6.25258732017037e-07,
+ "loss": 0.3679,
+ "step": 9793
+ },
+ {
+ "epoch": 4.630732860520094,
+ "grad_norm": 3.8519062995910645,
+ "learning_rate": 6.248460944326129e-07,
+ "loss": 0.4182,
+ "step": 9794
+ },
+ {
+ "epoch": 4.631205673758865,
+ "grad_norm": 3.6360673904418945,
+ "learning_rate": 6.244335736058007e-07,
+ "loss": 0.3836,
+ "step": 9795
+ },
+ {
+ "epoch": 4.631678486997636,
+ "grad_norm": 3.1905548572540283,
+ "learning_rate": 6.240211695622861e-07,
+ "loss": 0.357,
+ "step": 9796
+ },
+ {
+ "epoch": 4.632151300236407,
+ "grad_norm": 3.3542017936706543,
+ "learning_rate": 6.236088823277465e-07,
+ "loss": 0.3191,
+ "step": 9797
+ },
+ {
+ "epoch": 4.6326241134751776,
+ "grad_norm": 3.453275442123413,
+ "learning_rate": 6.231967119278546e-07,
+ "loss": 0.3346,
+ "step": 9798
+ },
+ {
+ "epoch": 4.633096926713948,
+ "grad_norm": 3.559972047805786,
+ "learning_rate": 6.227846583882741e-07,
+ "loss": 0.365,
+ "step": 9799
+ },
+ {
+ "epoch": 4.633569739952719,
+ "grad_norm": 2.795891046524048,
+ "learning_rate": 6.223727217346606e-07,
+ "loss": 0.3346,
+ "step": 9800
+ },
+ {
+ "epoch": 4.634042553191489,
+ "grad_norm": 3.176762342453003,
+ "learning_rate": 6.219609019926653e-07,
+ "loss": 0.3692,
+ "step": 9801
+ },
+ {
+ "epoch": 4.63451536643026,
+ "grad_norm": 3.490229845046997,
+ "learning_rate": 6.215491991879294e-07,
+ "loss": 0.3334,
+ "step": 9802
+ },
+ {
+ "epoch": 4.634988179669031,
+ "grad_norm": 3.27502179145813,
+ "learning_rate": 6.211376133460884e-07,
+ "loss": 0.3484,
+ "step": 9803
+ },
+ {
+ "epoch": 4.6354609929078014,
+ "grad_norm": 2.9768311977386475,
+ "learning_rate": 6.207261444927698e-07,
+ "loss": 0.3342,
+ "step": 9804
+ },
+ {
+ "epoch": 4.635933806146572,
+ "grad_norm": 3.1726930141448975,
+ "learning_rate": 6.203147926535938e-07,
+ "loss": 0.3187,
+ "step": 9805
+ },
+ {
+ "epoch": 4.636406619385343,
+ "grad_norm": 3.1797916889190674,
+ "learning_rate": 6.199035578541737e-07,
+ "loss": 0.3418,
+ "step": 9806
+ },
+ {
+ "epoch": 4.636879432624114,
+ "grad_norm": 3.1262030601501465,
+ "learning_rate": 6.194924401201141e-07,
+ "loss": 0.3099,
+ "step": 9807
+ },
+ {
+ "epoch": 4.637352245862884,
+ "grad_norm": 3.556866407394409,
+ "learning_rate": 6.190814394770153e-07,
+ "loss": 0.2879,
+ "step": 9808
+ },
+ {
+ "epoch": 4.6378250591016545,
+ "grad_norm": 3.508984327316284,
+ "learning_rate": 6.186705559504678e-07,
+ "loss": 0.3414,
+ "step": 9809
+ },
+ {
+ "epoch": 4.638297872340425,
+ "grad_norm": 3.266221761703491,
+ "learning_rate": 6.182597895660544e-07,
+ "loss": 0.3281,
+ "step": 9810
+ },
+ {
+ "epoch": 4.638770685579196,
+ "grad_norm": 3.3781862258911133,
+ "learning_rate": 6.178491403493537e-07,
+ "loss": 0.3583,
+ "step": 9811
+ },
+ {
+ "epoch": 4.639243498817967,
+ "grad_norm": 3.4480984210968018,
+ "learning_rate": 6.174386083259329e-07,
+ "loss": 0.3704,
+ "step": 9812
+ },
+ {
+ "epoch": 4.639716312056738,
+ "grad_norm": 3.2882535457611084,
+ "learning_rate": 6.170281935213563e-07,
+ "loss": 0.3515,
+ "step": 9813
+ },
+ {
+ "epoch": 4.6401891252955085,
+ "grad_norm": 2.852627992630005,
+ "learning_rate": 6.166178959611774e-07,
+ "loss": 0.3266,
+ "step": 9814
+ },
+ {
+ "epoch": 4.640661938534279,
+ "grad_norm": 3.5469841957092285,
+ "learning_rate": 6.162077156709431e-07,
+ "loss": 0.3374,
+ "step": 9815
+ },
+ {
+ "epoch": 4.64113475177305,
+ "grad_norm": 3.343583345413208,
+ "learning_rate": 6.157976526761947e-07,
+ "loss": 0.3084,
+ "step": 9816
+ },
+ {
+ "epoch": 4.64160756501182,
+ "grad_norm": 3.028337001800537,
+ "learning_rate": 6.153877070024639e-07,
+ "loss": 0.3083,
+ "step": 9817
+ },
+ {
+ "epoch": 4.642080378250591,
+ "grad_norm": 3.1543455123901367,
+ "learning_rate": 6.149778786752775e-07,
+ "loss": 0.3273,
+ "step": 9818
+ },
+ {
+ "epoch": 4.642553191489362,
+ "grad_norm": 3.2126576900482178,
+ "learning_rate": 6.145681677201529e-07,
+ "loss": 0.3107,
+ "step": 9819
+ },
+ {
+ "epoch": 4.643026004728132,
+ "grad_norm": 3.4443142414093018,
+ "learning_rate": 6.141585741626014e-07,
+ "loss": 0.3193,
+ "step": 9820
+ },
+ {
+ "epoch": 4.643498817966903,
+ "grad_norm": 3.1558680534362793,
+ "learning_rate": 6.137490980281255e-07,
+ "loss": 0.3855,
+ "step": 9821
+ },
+ {
+ "epoch": 4.643971631205674,
+ "grad_norm": 3.370654821395874,
+ "learning_rate": 6.133397393422228e-07,
+ "loss": 0.309,
+ "step": 9822
+ },
+ {
+ "epoch": 4.644444444444445,
+ "grad_norm": 3.0980682373046875,
+ "learning_rate": 6.129304981303822e-07,
+ "loss": 0.2784,
+ "step": 9823
+ },
+ {
+ "epoch": 4.644917257683215,
+ "grad_norm": 3.102229356765747,
+ "learning_rate": 6.125213744180844e-07,
+ "loss": 0.3064,
+ "step": 9824
+ },
+ {
+ "epoch": 4.6453900709219855,
+ "grad_norm": 2.9737658500671387,
+ "learning_rate": 6.121123682308039e-07,
+ "loss": 0.2926,
+ "step": 9825
+ },
+ {
+ "epoch": 4.645862884160756,
+ "grad_norm": 3.3927671909332275,
+ "learning_rate": 6.117034795940089e-07,
+ "loss": 0.404,
+ "step": 9826
+ },
+ {
+ "epoch": 4.646335697399527,
+ "grad_norm": 2.885082721710205,
+ "learning_rate": 6.112947085331581e-07,
+ "loss": 0.3375,
+ "step": 9827
+ },
+ {
+ "epoch": 4.646808510638298,
+ "grad_norm": 2.9711341857910156,
+ "learning_rate": 6.108860550737034e-07,
+ "loss": 0.3051,
+ "step": 9828
+ },
+ {
+ "epoch": 4.647281323877069,
+ "grad_norm": 3.1437952518463135,
+ "learning_rate": 6.104775192410911e-07,
+ "loss": 0.3408,
+ "step": 9829
+ },
+ {
+ "epoch": 4.6477541371158395,
+ "grad_norm": 3.055950164794922,
+ "learning_rate": 6.100691010607579e-07,
+ "loss": 0.316,
+ "step": 9830
+ },
+ {
+ "epoch": 4.64822695035461,
+ "grad_norm": 3.515423536300659,
+ "learning_rate": 6.096608005581353e-07,
+ "loss": 0.3994,
+ "step": 9831
+ },
+ {
+ "epoch": 4.648699763593381,
+ "grad_norm": 3.1165153980255127,
+ "learning_rate": 6.092526177586455e-07,
+ "loss": 0.3908,
+ "step": 9832
+ },
+ {
+ "epoch": 4.649172576832151,
+ "grad_norm": 3.504673719406128,
+ "learning_rate": 6.088445526877043e-07,
+ "loss": 0.3328,
+ "step": 9833
+ },
+ {
+ "epoch": 4.649645390070922,
+ "grad_norm": 3.4175243377685547,
+ "learning_rate": 6.084366053707208e-07,
+ "loss": 0.3234,
+ "step": 9834
+ },
+ {
+ "epoch": 4.650118203309693,
+ "grad_norm": 3.14725661277771,
+ "learning_rate": 6.080287758330946e-07,
+ "loss": 0.3118,
+ "step": 9835
+ },
+ {
+ "epoch": 4.650591016548463,
+ "grad_norm": 3.7654550075531006,
+ "learning_rate": 6.076210641002217e-07,
+ "loss": 0.4177,
+ "step": 9836
+ },
+ {
+ "epoch": 4.651063829787234,
+ "grad_norm": 3.188804864883423,
+ "learning_rate": 6.072134701974871e-07,
+ "loss": 0.3468,
+ "step": 9837
+ },
+ {
+ "epoch": 4.651536643026005,
+ "grad_norm": 3.2176342010498047,
+ "learning_rate": 6.068059941502702e-07,
+ "loss": 0.3486,
+ "step": 9838
+ },
+ {
+ "epoch": 4.652009456264776,
+ "grad_norm": 4.188257217407227,
+ "learning_rate": 6.063986359839424e-07,
+ "loss": 0.3973,
+ "step": 9839
+ },
+ {
+ "epoch": 4.652482269503546,
+ "grad_norm": 3.206559896469116,
+ "learning_rate": 6.059913957238678e-07,
+ "loss": 0.3088,
+ "step": 9840
+ },
+ {
+ "epoch": 4.6529550827423165,
+ "grad_norm": 3.033918857574463,
+ "learning_rate": 6.055842733954048e-07,
+ "loss": 0.3331,
+ "step": 9841
+ },
+ {
+ "epoch": 4.653427895981087,
+ "grad_norm": 3.2453384399414062,
+ "learning_rate": 6.051772690239022e-07,
+ "loss": 0.323,
+ "step": 9842
+ },
+ {
+ "epoch": 4.653900709219858,
+ "grad_norm": 3.001999855041504,
+ "learning_rate": 6.047703826347017e-07,
+ "loss": 0.3763,
+ "step": 9843
+ },
+ {
+ "epoch": 4.654373522458629,
+ "grad_norm": 3.845486640930176,
+ "learning_rate": 6.043636142531401e-07,
+ "loss": 0.3595,
+ "step": 9844
+ },
+ {
+ "epoch": 4.6548463356974,
+ "grad_norm": 3.347628593444824,
+ "learning_rate": 6.039569639045434e-07,
+ "loss": 0.3388,
+ "step": 9845
+ },
+ {
+ "epoch": 4.6553191489361705,
+ "grad_norm": 3.1889400482177734,
+ "learning_rate": 6.035504316142333e-07,
+ "loss": 0.335,
+ "step": 9846
+ },
+ {
+ "epoch": 4.655791962174941,
+ "grad_norm": 3.3385977745056152,
+ "learning_rate": 6.031440174075221e-07,
+ "loss": 0.3985,
+ "step": 9847
+ },
+ {
+ "epoch": 4.656264775413711,
+ "grad_norm": 2.849853277206421,
+ "learning_rate": 6.027377213097146e-07,
+ "loss": 0.3604,
+ "step": 9848
+ },
+ {
+ "epoch": 4.656737588652482,
+ "grad_norm": 3.243053436279297,
+ "learning_rate": 6.02331543346111e-07,
+ "loss": 0.3257,
+ "step": 9849
+ },
+ {
+ "epoch": 4.657210401891253,
+ "grad_norm": 3.344167709350586,
+ "learning_rate": 6.01925483542001e-07,
+ "loss": 0.3511,
+ "step": 9850
+ },
+ {
+ "epoch": 4.657683215130024,
+ "grad_norm": 2.9741430282592773,
+ "learning_rate": 6.015195419226677e-07,
+ "loss": 0.3303,
+ "step": 9851
+ },
+ {
+ "epoch": 4.658156028368794,
+ "grad_norm": 3.0257937908172607,
+ "learning_rate": 6.011137185133883e-07,
+ "loss": 0.3716,
+ "step": 9852
+ },
+ {
+ "epoch": 4.658628841607565,
+ "grad_norm": 3.5770089626312256,
+ "learning_rate": 6.007080133394316e-07,
+ "loss": 0.3258,
+ "step": 9853
+ },
+ {
+ "epoch": 4.659101654846336,
+ "grad_norm": 3.363703489303589,
+ "learning_rate": 6.003024264260587e-07,
+ "loss": 0.3924,
+ "step": 9854
+ },
+ {
+ "epoch": 4.659574468085106,
+ "grad_norm": 3.3533787727355957,
+ "learning_rate": 5.998969577985239e-07,
+ "loss": 0.3242,
+ "step": 9855
+ },
+ {
+ "epoch": 4.660047281323877,
+ "grad_norm": 2.7335259914398193,
+ "learning_rate": 5.994916074820731e-07,
+ "loss": 0.3269,
+ "step": 9856
+ },
+ {
+ "epoch": 4.6605200945626475,
+ "grad_norm": 3.7654764652252197,
+ "learning_rate": 5.990863755019471e-07,
+ "loss": 0.4,
+ "step": 9857
+ },
+ {
+ "epoch": 4.660992907801418,
+ "grad_norm": 3.372542381286621,
+ "learning_rate": 5.986812618833765e-07,
+ "loss": 0.3423,
+ "step": 9858
+ },
+ {
+ "epoch": 4.661465721040189,
+ "grad_norm": 2.797814130783081,
+ "learning_rate": 5.982762666515873e-07,
+ "loss": 0.3228,
+ "step": 9859
+ },
+ {
+ "epoch": 4.66193853427896,
+ "grad_norm": 3.0121023654937744,
+ "learning_rate": 5.978713898317964e-07,
+ "loss": 0.3063,
+ "step": 9860
+ },
+ {
+ "epoch": 4.662411347517731,
+ "grad_norm": 3.052292823791504,
+ "learning_rate": 5.974666314492126e-07,
+ "loss": 0.3029,
+ "step": 9861
+ },
+ {
+ "epoch": 4.6628841607565015,
+ "grad_norm": 2.990906238555908,
+ "learning_rate": 5.970619915290399e-07,
+ "loss": 0.3788,
+ "step": 9862
+ },
+ {
+ "epoch": 4.663356973995272,
+ "grad_norm": 3.214334726333618,
+ "learning_rate": 5.966574700964722e-07,
+ "loss": 0.3138,
+ "step": 9863
+ },
+ {
+ "epoch": 4.663829787234042,
+ "grad_norm": 3.5982940196990967,
+ "learning_rate": 5.962530671766989e-07,
+ "loss": 0.3685,
+ "step": 9864
+ },
+ {
+ "epoch": 4.664302600472813,
+ "grad_norm": 3.2522151470184326,
+ "learning_rate": 5.958487827948991e-07,
+ "loss": 0.3086,
+ "step": 9865
+ },
+ {
+ "epoch": 4.664775413711584,
+ "grad_norm": 3.070181131362915,
+ "learning_rate": 5.954446169762457e-07,
+ "loss": 0.3534,
+ "step": 9866
+ },
+ {
+ "epoch": 4.665248226950355,
+ "grad_norm": 3.3051350116729736,
+ "learning_rate": 5.950405697459055e-07,
+ "loss": 0.3871,
+ "step": 9867
+ },
+ {
+ "epoch": 4.665721040189125,
+ "grad_norm": 2.8587753772735596,
+ "learning_rate": 5.946366411290358e-07,
+ "loss": 0.3157,
+ "step": 9868
+ },
+ {
+ "epoch": 4.666193853427896,
+ "grad_norm": 3.154926061630249,
+ "learning_rate": 5.942328311507878e-07,
+ "loss": 0.2967,
+ "step": 9869
+ },
+ {
+ "epoch": 4.666666666666667,
+ "grad_norm": 3.0322320461273193,
+ "learning_rate": 5.938291398363049e-07,
+ "loss": 0.323,
+ "step": 9870
+ },
+ {
+ "epoch": 4.667139479905437,
+ "grad_norm": 3.0678954124450684,
+ "learning_rate": 5.934255672107222e-07,
+ "loss": 0.3337,
+ "step": 9871
+ },
+ {
+ "epoch": 4.667612293144208,
+ "grad_norm": 3.4822635650634766,
+ "learning_rate": 5.930221132991704e-07,
+ "loss": 0.4052,
+ "step": 9872
+ },
+ {
+ "epoch": 4.6680851063829785,
+ "grad_norm": 3.26842999458313,
+ "learning_rate": 5.926187781267695e-07,
+ "loss": 0.3501,
+ "step": 9873
+ },
+ {
+ "epoch": 4.668557919621749,
+ "grad_norm": 2.911407709121704,
+ "learning_rate": 5.922155617186332e-07,
+ "loss": 0.3544,
+ "step": 9874
+ },
+ {
+ "epoch": 4.66903073286052,
+ "grad_norm": 3.1876001358032227,
+ "learning_rate": 5.91812464099869e-07,
+ "loss": 0.3389,
+ "step": 9875
+ },
+ {
+ "epoch": 4.669503546099291,
+ "grad_norm": 3.4954607486724854,
+ "learning_rate": 5.914094852955749e-07,
+ "loss": 0.3461,
+ "step": 9876
+ },
+ {
+ "epoch": 4.669976359338062,
+ "grad_norm": 3.8845367431640625,
+ "learning_rate": 5.910066253308439e-07,
+ "loss": 0.3868,
+ "step": 9877
+ },
+ {
+ "epoch": 4.6704491725768325,
+ "grad_norm": 3.18038272857666,
+ "learning_rate": 5.906038842307598e-07,
+ "loss": 0.3311,
+ "step": 9878
+ },
+ {
+ "epoch": 4.670921985815603,
+ "grad_norm": 3.5944042205810547,
+ "learning_rate": 5.902012620203984e-07,
+ "loss": 0.3246,
+ "step": 9879
+ },
+ {
+ "epoch": 4.671394799054373,
+ "grad_norm": 2.980142116546631,
+ "learning_rate": 5.897987587248311e-07,
+ "loss": 0.3361,
+ "step": 9880
+ },
+ {
+ "epoch": 4.671867612293144,
+ "grad_norm": 4.3120269775390625,
+ "learning_rate": 5.893963743691183e-07,
+ "loss": 0.3213,
+ "step": 9881
+ },
+ {
+ "epoch": 4.672340425531915,
+ "grad_norm": 3.42366361618042,
+ "learning_rate": 5.889941089783163e-07,
+ "loss": 0.3515,
+ "step": 9882
+ },
+ {
+ "epoch": 4.6728132387706856,
+ "grad_norm": 2.910720109939575,
+ "learning_rate": 5.885919625774716e-07,
+ "loss": 0.3417,
+ "step": 9883
+ },
+ {
+ "epoch": 4.673286052009456,
+ "grad_norm": 3.122042179107666,
+ "learning_rate": 5.881899351916242e-07,
+ "loss": 0.3714,
+ "step": 9884
+ },
+ {
+ "epoch": 4.673758865248227,
+ "grad_norm": 3.0564188957214355,
+ "learning_rate": 5.877880268458064e-07,
+ "loss": 0.3146,
+ "step": 9885
+ },
+ {
+ "epoch": 4.674231678486998,
+ "grad_norm": 3.303421974182129,
+ "learning_rate": 5.873862375650427e-07,
+ "loss": 0.3476,
+ "step": 9886
+ },
+ {
+ "epoch": 4.674704491725768,
+ "grad_norm": 3.3057096004486084,
+ "learning_rate": 5.869845673743521e-07,
+ "loss": 0.3237,
+ "step": 9887
+ },
+ {
+ "epoch": 4.675177304964539,
+ "grad_norm": 3.1843838691711426,
+ "learning_rate": 5.865830162987443e-07,
+ "loss": 0.3789,
+ "step": 9888
+ },
+ {
+ "epoch": 4.6756501182033094,
+ "grad_norm": 2.865844964981079,
+ "learning_rate": 5.861815843632213e-07,
+ "loss": 0.3173,
+ "step": 9889
+ },
+ {
+ "epoch": 4.67612293144208,
+ "grad_norm": 2.986262083053589,
+ "learning_rate": 5.857802715927796e-07,
+ "loss": 0.2697,
+ "step": 9890
+ },
+ {
+ "epoch": 4.676595744680851,
+ "grad_norm": 3.2936089038848877,
+ "learning_rate": 5.853790780124063e-07,
+ "loss": 0.3839,
+ "step": 9891
+ },
+ {
+ "epoch": 4.677068557919622,
+ "grad_norm": 2.7130303382873535,
+ "learning_rate": 5.849780036470831e-07,
+ "loss": 0.3004,
+ "step": 9892
+ },
+ {
+ "epoch": 4.677541371158393,
+ "grad_norm": 3.0076770782470703,
+ "learning_rate": 5.845770485217827e-07,
+ "loss": 0.3467,
+ "step": 9893
+ },
+ {
+ "epoch": 4.678014184397163,
+ "grad_norm": 3.5340375900268555,
+ "learning_rate": 5.841762126614697e-07,
+ "loss": 0.3759,
+ "step": 9894
+ },
+ {
+ "epoch": 4.678486997635934,
+ "grad_norm": 3.0034375190734863,
+ "learning_rate": 5.837754960911041e-07,
+ "loss": 0.3099,
+ "step": 9895
+ },
+ {
+ "epoch": 4.678959810874704,
+ "grad_norm": 3.576899766921997,
+ "learning_rate": 5.833748988356358e-07,
+ "loss": 0.3612,
+ "step": 9896
+ },
+ {
+ "epoch": 4.679432624113475,
+ "grad_norm": 3.0961546897888184,
+ "learning_rate": 5.829744209200077e-07,
+ "loss": 0.3098,
+ "step": 9897
+ },
+ {
+ "epoch": 4.679905437352246,
+ "grad_norm": 3.1387925148010254,
+ "learning_rate": 5.825740623691576e-07,
+ "loss": 0.3538,
+ "step": 9898
+ },
+ {
+ "epoch": 4.6803782505910165,
+ "grad_norm": 3.4131572246551514,
+ "learning_rate": 5.821738232080127e-07,
+ "loss": 0.3984,
+ "step": 9899
+ },
+ {
+ "epoch": 4.680851063829787,
+ "grad_norm": 3.1346065998077393,
+ "learning_rate": 5.817737034614934e-07,
+ "loss": 0.3585,
+ "step": 9900
+ },
+ {
+ "epoch": 4.681323877068558,
+ "grad_norm": 3.148144483566284,
+ "learning_rate": 5.813737031545155e-07,
+ "loss": 0.3774,
+ "step": 9901
+ },
+ {
+ "epoch": 4.681796690307329,
+ "grad_norm": 3.2461299896240234,
+ "learning_rate": 5.809738223119843e-07,
+ "loss": 0.3181,
+ "step": 9902
+ },
+ {
+ "epoch": 4.682269503546099,
+ "grad_norm": 3.1998214721679688,
+ "learning_rate": 5.805740609587981e-07,
+ "loss": 0.3452,
+ "step": 9903
+ },
+ {
+ "epoch": 4.68274231678487,
+ "grad_norm": 2.897399425506592,
+ "learning_rate": 5.801744191198483e-07,
+ "loss": 0.3247,
+ "step": 9904
+ },
+ {
+ "epoch": 4.68321513002364,
+ "grad_norm": 2.921877384185791,
+ "learning_rate": 5.797748968200198e-07,
+ "loss": 0.2842,
+ "step": 9905
+ },
+ {
+ "epoch": 4.683687943262411,
+ "grad_norm": 3.17667818069458,
+ "learning_rate": 5.793754940841887e-07,
+ "loss": 0.3218,
+ "step": 9906
+ },
+ {
+ "epoch": 4.684160756501182,
+ "grad_norm": 3.499068260192871,
+ "learning_rate": 5.78976210937223e-07,
+ "loss": 0.3352,
+ "step": 9907
+ },
+ {
+ "epoch": 4.684633569739953,
+ "grad_norm": 3.2782368659973145,
+ "learning_rate": 5.785770474039859e-07,
+ "loss": 0.3671,
+ "step": 9908
+ },
+ {
+ "epoch": 4.685106382978724,
+ "grad_norm": 3.089757204055786,
+ "learning_rate": 5.781780035093304e-07,
+ "loss": 0.3613,
+ "step": 9909
+ },
+ {
+ "epoch": 4.685579196217494,
+ "grad_norm": 3.082561492919922,
+ "learning_rate": 5.77779079278104e-07,
+ "loss": 0.3351,
+ "step": 9910
+ },
+ {
+ "epoch": 4.686052009456265,
+ "grad_norm": 3.6009864807128906,
+ "learning_rate": 5.773802747351462e-07,
+ "loss": 0.3545,
+ "step": 9911
+ },
+ {
+ "epoch": 4.686524822695035,
+ "grad_norm": 3.488717555999756,
+ "learning_rate": 5.769815899052872e-07,
+ "loss": 0.3926,
+ "step": 9912
+ },
+ {
+ "epoch": 4.686997635933806,
+ "grad_norm": 3.5619056224823,
+ "learning_rate": 5.765830248133531e-07,
+ "loss": 0.4171,
+ "step": 9913
+ },
+ {
+ "epoch": 4.687470449172577,
+ "grad_norm": 3.30653977394104,
+ "learning_rate": 5.761845794841594e-07,
+ "loss": 0.3713,
+ "step": 9914
+ },
+ {
+ "epoch": 4.6879432624113475,
+ "grad_norm": 2.8256847858428955,
+ "learning_rate": 5.757862539425171e-07,
+ "loss": 0.2633,
+ "step": 9915
+ },
+ {
+ "epoch": 4.688416075650118,
+ "grad_norm": 3.0387041568756104,
+ "learning_rate": 5.753880482132274e-07,
+ "loss": 0.3169,
+ "step": 9916
+ },
+ {
+ "epoch": 4.688888888888889,
+ "grad_norm": 3.2312963008880615,
+ "learning_rate": 5.749899623210845e-07,
+ "loss": 0.3238,
+ "step": 9917
+ },
+ {
+ "epoch": 4.68936170212766,
+ "grad_norm": 3.077155351638794,
+ "learning_rate": 5.74591996290876e-07,
+ "loss": 0.3101,
+ "step": 9918
+ },
+ {
+ "epoch": 4.68983451536643,
+ "grad_norm": 3.461580991744995,
+ "learning_rate": 5.741941501473811e-07,
+ "loss": 0.3756,
+ "step": 9919
+ },
+ {
+ "epoch": 4.690307328605201,
+ "grad_norm": 3.8845605850219727,
+ "learning_rate": 5.737964239153712e-07,
+ "loss": 0.3747,
+ "step": 9920
+ },
+ {
+ "epoch": 4.690780141843971,
+ "grad_norm": 3.1688292026519775,
+ "learning_rate": 5.733988176196129e-07,
+ "loss": 0.3663,
+ "step": 9921
+ },
+ {
+ "epoch": 4.691252955082742,
+ "grad_norm": 3.2730917930603027,
+ "learning_rate": 5.730013312848614e-07,
+ "loss": 0.3697,
+ "step": 9922
+ },
+ {
+ "epoch": 4.691725768321513,
+ "grad_norm": 3.093761682510376,
+ "learning_rate": 5.726039649358681e-07,
+ "loss": 0.3215,
+ "step": 9923
+ },
+ {
+ "epoch": 4.692198581560284,
+ "grad_norm": 3.1679420471191406,
+ "learning_rate": 5.722067185973746e-07,
+ "loss": 0.3019,
+ "step": 9924
+ },
+ {
+ "epoch": 4.692671394799055,
+ "grad_norm": 3.4821531772613525,
+ "learning_rate": 5.718095922941147e-07,
+ "loss": 0.3659,
+ "step": 9925
+ },
+ {
+ "epoch": 4.693144208037825,
+ "grad_norm": 2.985276699066162,
+ "learning_rate": 5.714125860508177e-07,
+ "loss": 0.3293,
+ "step": 9926
+ },
+ {
+ "epoch": 4.693617021276596,
+ "grad_norm": 3.171663999557495,
+ "learning_rate": 5.710156998922015e-07,
+ "loss": 0.3647,
+ "step": 9927
+ },
+ {
+ "epoch": 4.694089834515366,
+ "grad_norm": 3.3699564933776855,
+ "learning_rate": 5.706189338429798e-07,
+ "loss": 0.4021,
+ "step": 9928
+ },
+ {
+ "epoch": 4.694562647754137,
+ "grad_norm": 3.0827202796936035,
+ "learning_rate": 5.702222879278571e-07,
+ "loss": 0.329,
+ "step": 9929
+ },
+ {
+ "epoch": 4.695035460992908,
+ "grad_norm": 3.5798332691192627,
+ "learning_rate": 5.698257621715303e-07,
+ "loss": 0.3777,
+ "step": 9930
+ },
+ {
+ "epoch": 4.6955082742316785,
+ "grad_norm": 2.741230010986328,
+ "learning_rate": 5.6942935659869e-07,
+ "loss": 0.31,
+ "step": 9931
+ },
+ {
+ "epoch": 4.695981087470449,
+ "grad_norm": 2.9929327964782715,
+ "learning_rate": 5.690330712340187e-07,
+ "loss": 0.3132,
+ "step": 9932
+ },
+ {
+ "epoch": 4.69645390070922,
+ "grad_norm": 3.062685489654541,
+ "learning_rate": 5.68636906102191e-07,
+ "loss": 0.3204,
+ "step": 9933
+ },
+ {
+ "epoch": 4.696926713947991,
+ "grad_norm": 3.166281223297119,
+ "learning_rate": 5.682408612278742e-07,
+ "loss": 0.3444,
+ "step": 9934
+ },
+ {
+ "epoch": 4.697399527186761,
+ "grad_norm": 3.0413401126861572,
+ "learning_rate": 5.678449366357278e-07,
+ "loss": 0.3506,
+ "step": 9935
+ },
+ {
+ "epoch": 4.697872340425532,
+ "grad_norm": 3.7843124866485596,
+ "learning_rate": 5.674491323504059e-07,
+ "loss": 0.3349,
+ "step": 9936
+ },
+ {
+ "epoch": 4.698345153664302,
+ "grad_norm": 2.9070212841033936,
+ "learning_rate": 5.670534483965514e-07,
+ "loss": 0.2954,
+ "step": 9937
+ },
+ {
+ "epoch": 4.698817966903073,
+ "grad_norm": 2.924229383468628,
+ "learning_rate": 5.666578847988041e-07,
+ "loss": 0.3392,
+ "step": 9938
+ },
+ {
+ "epoch": 4.699290780141844,
+ "grad_norm": 3.1302332878112793,
+ "learning_rate": 5.662624415817924e-07,
+ "loss": 0.3198,
+ "step": 9939
+ },
+ {
+ "epoch": 4.699763593380615,
+ "grad_norm": 3.163005828857422,
+ "learning_rate": 5.65867118770139e-07,
+ "loss": 0.3817,
+ "step": 9940
+ },
+ {
+ "epoch": 4.700236406619386,
+ "grad_norm": 3.4002792835235596,
+ "learning_rate": 5.654719163884598e-07,
+ "loss": 0.3961,
+ "step": 9941
+ },
+ {
+ "epoch": 4.700709219858156,
+ "grad_norm": 3.9756014347076416,
+ "learning_rate": 5.650768344613616e-07,
+ "loss": 0.4011,
+ "step": 9942
+ },
+ {
+ "epoch": 4.701182033096927,
+ "grad_norm": 3.118243455886841,
+ "learning_rate": 5.64681873013444e-07,
+ "loss": 0.3675,
+ "step": 9943
+ },
+ {
+ "epoch": 4.701654846335697,
+ "grad_norm": 3.0520825386047363,
+ "learning_rate": 5.642870320693005e-07,
+ "loss": 0.2782,
+ "step": 9944
+ },
+ {
+ "epoch": 4.702127659574468,
+ "grad_norm": 3.380565643310547,
+ "learning_rate": 5.638923116535152e-07,
+ "loss": 0.3632,
+ "step": 9945
+ },
+ {
+ "epoch": 4.702600472813239,
+ "grad_norm": 3.2340569496154785,
+ "learning_rate": 5.634977117906668e-07,
+ "loss": 0.3754,
+ "step": 9946
+ },
+ {
+ "epoch": 4.7030732860520095,
+ "grad_norm": 3.0068717002868652,
+ "learning_rate": 5.631032325053243e-07,
+ "loss": 0.3879,
+ "step": 9947
+ },
+ {
+ "epoch": 4.70354609929078,
+ "grad_norm": 3.4717891216278076,
+ "learning_rate": 5.627088738220507e-07,
+ "loss": 0.4053,
+ "step": 9948
+ },
+ {
+ "epoch": 4.704018912529551,
+ "grad_norm": 4.362999439239502,
+ "learning_rate": 5.623146357654008e-07,
+ "loss": 0.3115,
+ "step": 9949
+ },
+ {
+ "epoch": 4.704491725768322,
+ "grad_norm": 3.2190041542053223,
+ "learning_rate": 5.619205183599211e-07,
+ "loss": 0.3267,
+ "step": 9950
+ },
+ {
+ "epoch": 4.704964539007092,
+ "grad_norm": 3.413800001144409,
+ "learning_rate": 5.615265216301532e-07,
+ "loss": 0.4012,
+ "step": 9951
+ },
+ {
+ "epoch": 4.705437352245863,
+ "grad_norm": 3.5244312286376953,
+ "learning_rate": 5.611326456006291e-07,
+ "loss": 0.3484,
+ "step": 9952
+ },
+ {
+ "epoch": 4.705910165484633,
+ "grad_norm": 3.055433511734009,
+ "learning_rate": 5.607388902958727e-07,
+ "loss": 0.3637,
+ "step": 9953
+ },
+ {
+ "epoch": 4.706382978723404,
+ "grad_norm": 3.4459595680236816,
+ "learning_rate": 5.603452557404029e-07,
+ "loss": 0.3632,
+ "step": 9954
+ },
+ {
+ "epoch": 4.706855791962175,
+ "grad_norm": 2.972321033477783,
+ "learning_rate": 5.59951741958728e-07,
+ "loss": 0.3411,
+ "step": 9955
+ },
+ {
+ "epoch": 4.707328605200946,
+ "grad_norm": 3.2460532188415527,
+ "learning_rate": 5.595583489753523e-07,
+ "loss": 0.3779,
+ "step": 9956
+ },
+ {
+ "epoch": 4.707801418439717,
+ "grad_norm": 3.514521837234497,
+ "learning_rate": 5.591650768147694e-07,
+ "loss": 0.3313,
+ "step": 9957
+ },
+ {
+ "epoch": 4.708274231678487,
+ "grad_norm": 2.8473336696624756,
+ "learning_rate": 5.587719255014662e-07,
+ "loss": 0.3078,
+ "step": 9958
+ },
+ {
+ "epoch": 4.708747044917258,
+ "grad_norm": 3.309263229370117,
+ "learning_rate": 5.583788950599239e-07,
+ "loss": 0.3905,
+ "step": 9959
+ },
+ {
+ "epoch": 4.709219858156028,
+ "grad_norm": 3.435980796813965,
+ "learning_rate": 5.579859855146133e-07,
+ "loss": 0.3507,
+ "step": 9960
+ },
+ {
+ "epoch": 4.709692671394799,
+ "grad_norm": 3.0237598419189453,
+ "learning_rate": 5.575931968900006e-07,
+ "loss": 0.3349,
+ "step": 9961
+ },
+ {
+ "epoch": 4.71016548463357,
+ "grad_norm": 3.6978237628936768,
+ "learning_rate": 5.572005292105426e-07,
+ "loss": 0.3672,
+ "step": 9962
+ },
+ {
+ "epoch": 4.7106382978723405,
+ "grad_norm": 3.3029704093933105,
+ "learning_rate": 5.568079825006883e-07,
+ "loss": 0.3438,
+ "step": 9963
+ },
+ {
+ "epoch": 4.711111111111111,
+ "grad_norm": 2.9121241569519043,
+ "learning_rate": 5.5641555678488e-07,
+ "loss": 0.3299,
+ "step": 9964
+ },
+ {
+ "epoch": 4.711583924349882,
+ "grad_norm": 3.2730703353881836,
+ "learning_rate": 5.56023252087553e-07,
+ "loss": 0.3572,
+ "step": 9965
+ },
+ {
+ "epoch": 4.712056737588653,
+ "grad_norm": 3.316593885421753,
+ "learning_rate": 5.556310684331343e-07,
+ "loss": 0.3139,
+ "step": 9966
+ },
+ {
+ "epoch": 4.712529550827423,
+ "grad_norm": 3.1281843185424805,
+ "learning_rate": 5.552390058460427e-07,
+ "loss": 0.3362,
+ "step": 9967
+ },
+ {
+ "epoch": 4.7130023640661936,
+ "grad_norm": 3.3069980144500732,
+ "learning_rate": 5.548470643506904e-07,
+ "loss": 0.3839,
+ "step": 9968
+ },
+ {
+ "epoch": 4.713475177304964,
+ "grad_norm": 4.4018354415893555,
+ "learning_rate": 5.544552439714826e-07,
+ "loss": 0.2954,
+ "step": 9969
+ },
+ {
+ "epoch": 4.713947990543735,
+ "grad_norm": 2.797149658203125,
+ "learning_rate": 5.540635447328161e-07,
+ "loss": 0.3253,
+ "step": 9970
+ },
+ {
+ "epoch": 4.714420803782506,
+ "grad_norm": 3.0065677165985107,
+ "learning_rate": 5.536719666590792e-07,
+ "loss": 0.3376,
+ "step": 9971
+ },
+ {
+ "epoch": 4.714893617021277,
+ "grad_norm": 3.1383140087127686,
+ "learning_rate": 5.532805097746552e-07,
+ "loss": 0.3444,
+ "step": 9972
+ },
+ {
+ "epoch": 4.7153664302600475,
+ "grad_norm": 2.983229398727417,
+ "learning_rate": 5.528891741039169e-07,
+ "loss": 0.3173,
+ "step": 9973
+ },
+ {
+ "epoch": 4.715839243498818,
+ "grad_norm": 3.119361162185669,
+ "learning_rate": 5.524979596712326e-07,
+ "loss": 0.3829,
+ "step": 9974
+ },
+ {
+ "epoch": 4.716312056737589,
+ "grad_norm": 3.4099128246307373,
+ "learning_rate": 5.52106866500961e-07,
+ "loss": 0.3363,
+ "step": 9975
+ },
+ {
+ "epoch": 4.716784869976359,
+ "grad_norm": 2.818964719772339,
+ "learning_rate": 5.517158946174528e-07,
+ "loss": 0.321,
+ "step": 9976
+ },
+ {
+ "epoch": 4.71725768321513,
+ "grad_norm": 3.4968421459198,
+ "learning_rate": 5.513250440450538e-07,
+ "loss": 0.3973,
+ "step": 9977
+ },
+ {
+ "epoch": 4.717730496453901,
+ "grad_norm": 3.3777382373809814,
+ "learning_rate": 5.509343148080987e-07,
+ "loss": 0.3607,
+ "step": 9978
+ },
+ {
+ "epoch": 4.718203309692671,
+ "grad_norm": 2.95882511138916,
+ "learning_rate": 5.50543706930918e-07,
+ "loss": 0.3483,
+ "step": 9979
+ },
+ {
+ "epoch": 4.718676122931442,
+ "grad_norm": 2.8768858909606934,
+ "learning_rate": 5.501532204378327e-07,
+ "loss": 0.3488,
+ "step": 9980
+ },
+ {
+ "epoch": 4.719148936170213,
+ "grad_norm": 2.9310572147369385,
+ "learning_rate": 5.497628553531565e-07,
+ "loss": 0.3174,
+ "step": 9981
+ },
+ {
+ "epoch": 4.719621749408984,
+ "grad_norm": 3.1057486534118652,
+ "learning_rate": 5.493726117011957e-07,
+ "loss": 0.346,
+ "step": 9982
+ },
+ {
+ "epoch": 4.720094562647754,
+ "grad_norm": 3.681593418121338,
+ "learning_rate": 5.489824895062487e-07,
+ "loss": 0.3371,
+ "step": 9983
+ },
+ {
+ "epoch": 4.7205673758865245,
+ "grad_norm": 3.0641729831695557,
+ "learning_rate": 5.485924887926075e-07,
+ "loss": 0.3614,
+ "step": 9984
+ },
+ {
+ "epoch": 4.721040189125295,
+ "grad_norm": 3.2925705909729004,
+ "learning_rate": 5.482026095845555e-07,
+ "loss": 0.3023,
+ "step": 9985
+ },
+ {
+ "epoch": 4.721513002364066,
+ "grad_norm": 2.963693141937256,
+ "learning_rate": 5.47812851906368e-07,
+ "loss": 0.3706,
+ "step": 9986
+ },
+ {
+ "epoch": 4.721985815602837,
+ "grad_norm": 3.187870740890503,
+ "learning_rate": 5.474232157823147e-07,
+ "loss": 0.3332,
+ "step": 9987
+ },
+ {
+ "epoch": 4.722458628841608,
+ "grad_norm": 3.9346799850463867,
+ "learning_rate": 5.470337012366556e-07,
+ "loss": 0.3738,
+ "step": 9988
+ },
+ {
+ "epoch": 4.7229314420803785,
+ "grad_norm": 3.385035753250122,
+ "learning_rate": 5.466443082936446e-07,
+ "loss": 0.3194,
+ "step": 9989
+ },
+ {
+ "epoch": 4.723404255319149,
+ "grad_norm": 3.0829477310180664,
+ "learning_rate": 5.462550369775277e-07,
+ "loss": 0.2877,
+ "step": 9990
+ },
+ {
+ "epoch": 4.72387706855792,
+ "grad_norm": 2.8730506896972656,
+ "learning_rate": 5.458658873125419e-07,
+ "loss": 0.3352,
+ "step": 9991
+ },
+ {
+ "epoch": 4.72434988179669,
+ "grad_norm": 3.198498249053955,
+ "learning_rate": 5.454768593229193e-07,
+ "loss": 0.3697,
+ "step": 9992
+ },
+ {
+ "epoch": 4.724822695035461,
+ "grad_norm": 3.37144136428833,
+ "learning_rate": 5.450879530328824e-07,
+ "loss": 0.4245,
+ "step": 9993
+ },
+ {
+ "epoch": 4.725295508274232,
+ "grad_norm": 3.6235079765319824,
+ "learning_rate": 5.446991684666461e-07,
+ "loss": 0.3707,
+ "step": 9994
+ },
+ {
+ "epoch": 4.725768321513002,
+ "grad_norm": 3.5587494373321533,
+ "learning_rate": 5.443105056484194e-07,
+ "loss": 0.3297,
+ "step": 9995
+ },
+ {
+ "epoch": 4.726241134751773,
+ "grad_norm": 3.5308549404144287,
+ "learning_rate": 5.439219646024018e-07,
+ "loss": 0.3521,
+ "step": 9996
+ },
+ {
+ "epoch": 4.726713947990544,
+ "grad_norm": 3.16542649269104,
+ "learning_rate": 5.435335453527868e-07,
+ "loss": 0.3499,
+ "step": 9997
+ },
+ {
+ "epoch": 4.727186761229315,
+ "grad_norm": 3.2565104961395264,
+ "learning_rate": 5.431452479237586e-07,
+ "loss": 0.338,
+ "step": 9998
+ },
+ {
+ "epoch": 4.727659574468085,
+ "grad_norm": 3.371232032775879,
+ "learning_rate": 5.427570723394951e-07,
+ "loss": 0.3641,
+ "step": 9999
+ },
+ {
+ "epoch": 4.7281323877068555,
+ "grad_norm": 2.9784507751464844,
+ "learning_rate": 5.423690186241668e-07,
+ "loss": 0.3667,
+ "step": 10000
+ },
+ {
+ "epoch": 4.728605200945626,
+ "grad_norm": 3.0877480506896973,
+ "learning_rate": 5.419810868019351e-07,
+ "loss": 0.3098,
+ "step": 10001
+ },
+ {
+ "epoch": 4.729078014184397,
+ "grad_norm": 4.132823467254639,
+ "learning_rate": 5.415932768969562e-07,
+ "loss": 0.3712,
+ "step": 10002
+ },
+ {
+ "epoch": 4.729550827423168,
+ "grad_norm": 2.8105905055999756,
+ "learning_rate": 5.412055889333767e-07,
+ "loss": 0.2829,
+ "step": 10003
+ },
+ {
+ "epoch": 4.730023640661939,
+ "grad_norm": 3.543795585632324,
+ "learning_rate": 5.408180229353352e-07,
+ "loss": 0.3101,
+ "step": 10004
+ },
+ {
+ "epoch": 4.7304964539007095,
+ "grad_norm": 3.307525157928467,
+ "learning_rate": 5.404305789269657e-07,
+ "loss": 0.3585,
+ "step": 10005
+ },
+ {
+ "epoch": 4.73096926713948,
+ "grad_norm": 3.0976414680480957,
+ "learning_rate": 5.400432569323905e-07,
+ "loss": 0.3202,
+ "step": 10006
+ },
+ {
+ "epoch": 4.73144208037825,
+ "grad_norm": 3.0249791145324707,
+ "learning_rate": 5.396560569757284e-07,
+ "loss": 0.3468,
+ "step": 10007
+ },
+ {
+ "epoch": 4.731914893617021,
+ "grad_norm": 3.0199971199035645,
+ "learning_rate": 5.392689790810879e-07,
+ "loss": 0.3483,
+ "step": 10008
+ },
+ {
+ "epoch": 4.732387706855792,
+ "grad_norm": 3.177297592163086,
+ "learning_rate": 5.388820232725697e-07,
+ "loss": 0.3333,
+ "step": 10009
+ },
+ {
+ "epoch": 4.732860520094563,
+ "grad_norm": 3.247121572494507,
+ "learning_rate": 5.384951895742693e-07,
+ "loss": 0.2881,
+ "step": 10010
+ },
+ {
+ "epoch": 4.733333333333333,
+ "grad_norm": 3.513106346130371,
+ "learning_rate": 5.381084780102727e-07,
+ "loss": 0.3786,
+ "step": 10011
+ },
+ {
+ "epoch": 4.733806146572104,
+ "grad_norm": 2.8936305046081543,
+ "learning_rate": 5.377218886046584e-07,
+ "loss": 0.3174,
+ "step": 10012
+ },
+ {
+ "epoch": 4.734278959810875,
+ "grad_norm": 3.1088016033172607,
+ "learning_rate": 5.373354213814977e-07,
+ "loss": 0.3108,
+ "step": 10013
+ },
+ {
+ "epoch": 4.734751773049645,
+ "grad_norm": 2.693617343902588,
+ "learning_rate": 5.369490763648539e-07,
+ "loss": 0.3441,
+ "step": 10014
+ },
+ {
+ "epoch": 4.735224586288416,
+ "grad_norm": 3.4399259090423584,
+ "learning_rate": 5.365628535787837e-07,
+ "loss": 0.3937,
+ "step": 10015
+ },
+ {
+ "epoch": 4.7356973995271865,
+ "grad_norm": 3.28714919090271,
+ "learning_rate": 5.361767530473355e-07,
+ "loss": 0.2993,
+ "step": 10016
+ },
+ {
+ "epoch": 4.736170212765957,
+ "grad_norm": 3.1407346725463867,
+ "learning_rate": 5.35790774794549e-07,
+ "loss": 0.3605,
+ "step": 10017
+ },
+ {
+ "epoch": 4.736643026004728,
+ "grad_norm": 3.464386224746704,
+ "learning_rate": 5.354049188444588e-07,
+ "loss": 0.382,
+ "step": 10018
+ },
+ {
+ "epoch": 4.737115839243499,
+ "grad_norm": 3.303809881210327,
+ "learning_rate": 5.350191852210889e-07,
+ "loss": 0.3438,
+ "step": 10019
+ },
+ {
+ "epoch": 4.73758865248227,
+ "grad_norm": 3.3727755546569824,
+ "learning_rate": 5.346335739484593e-07,
+ "loss": 0.3524,
+ "step": 10020
+ },
+ {
+ "epoch": 4.7380614657210405,
+ "grad_norm": 3.125762939453125,
+ "learning_rate": 5.342480850505788e-07,
+ "loss": 0.3762,
+ "step": 10021
+ },
+ {
+ "epoch": 4.738534278959811,
+ "grad_norm": 3.32598876953125,
+ "learning_rate": 5.3386271855145e-07,
+ "loss": 0.345,
+ "step": 10022
+ },
+ {
+ "epoch": 4.739007092198581,
+ "grad_norm": 2.889338970184326,
+ "learning_rate": 5.334774744750692e-07,
+ "loss": 0.3245,
+ "step": 10023
+ },
+ {
+ "epoch": 4.739479905437352,
+ "grad_norm": 3.3369252681732178,
+ "learning_rate": 5.330923528454223e-07,
+ "loss": 0.366,
+ "step": 10024
+ },
+ {
+ "epoch": 4.739952718676123,
+ "grad_norm": 3.008836269378662,
+ "learning_rate": 5.327073536864908e-07,
+ "loss": 0.358,
+ "step": 10025
+ },
+ {
+ "epoch": 4.740425531914894,
+ "grad_norm": 3.1076738834381104,
+ "learning_rate": 5.323224770222457e-07,
+ "loss": 0.3398,
+ "step": 10026
+ },
+ {
+ "epoch": 4.740898345153664,
+ "grad_norm": 3.269164800643921,
+ "learning_rate": 5.319377228766523e-07,
+ "loss": 0.3364,
+ "step": 10027
+ },
+ {
+ "epoch": 4.741371158392435,
+ "grad_norm": 3.3928871154785156,
+ "learning_rate": 5.315530912736671e-07,
+ "loss": 0.3376,
+ "step": 10028
+ },
+ {
+ "epoch": 4.741843971631206,
+ "grad_norm": 2.7413101196289062,
+ "learning_rate": 5.31168582237239e-07,
+ "loss": 0.3551,
+ "step": 10029
+ },
+ {
+ "epoch": 4.742316784869976,
+ "grad_norm": 2.837280035018921,
+ "learning_rate": 5.307841957913104e-07,
+ "loss": 0.316,
+ "step": 10030
+ },
+ {
+ "epoch": 4.742789598108747,
+ "grad_norm": 3.140482187271118,
+ "learning_rate": 5.303999319598158e-07,
+ "loss": 0.3951,
+ "step": 10031
+ },
+ {
+ "epoch": 4.7432624113475175,
+ "grad_norm": 2.978053331375122,
+ "learning_rate": 5.3001579076668e-07,
+ "loss": 0.3328,
+ "step": 10032
+ },
+ {
+ "epoch": 4.743735224586288,
+ "grad_norm": 3.3469338417053223,
+ "learning_rate": 5.296317722358235e-07,
+ "loss": 0.328,
+ "step": 10033
+ },
+ {
+ "epoch": 4.744208037825059,
+ "grad_norm": 3.1574513912200928,
+ "learning_rate": 5.29247876391156e-07,
+ "loss": 0.3375,
+ "step": 10034
+ },
+ {
+ "epoch": 4.74468085106383,
+ "grad_norm": 2.9314582347869873,
+ "learning_rate": 5.288641032565825e-07,
+ "loss": 0.3025,
+ "step": 10035
+ },
+ {
+ "epoch": 4.745153664302601,
+ "grad_norm": 3.298856258392334,
+ "learning_rate": 5.284804528559981e-07,
+ "loss": 0.3071,
+ "step": 10036
+ },
+ {
+ "epoch": 4.7456264775413715,
+ "grad_norm": 3.489758014678955,
+ "learning_rate": 5.280969252132903e-07,
+ "loss": 0.3392,
+ "step": 10037
+ },
+ {
+ "epoch": 4.746099290780142,
+ "grad_norm": 3.1727964878082275,
+ "learning_rate": 5.277135203523412e-07,
+ "loss": 0.3472,
+ "step": 10038
+ },
+ {
+ "epoch": 4.746572104018912,
+ "grad_norm": 3.267204761505127,
+ "learning_rate": 5.27330238297023e-07,
+ "loss": 0.3555,
+ "step": 10039
+ },
+ {
+ "epoch": 4.747044917257683,
+ "grad_norm": 3.376077175140381,
+ "learning_rate": 5.269470790712003e-07,
+ "loss": 0.4018,
+ "step": 10040
+ },
+ {
+ "epoch": 4.747517730496454,
+ "grad_norm": 3.2389678955078125,
+ "learning_rate": 5.265640426987321e-07,
+ "loss": 0.3742,
+ "step": 10041
+ },
+ {
+ "epoch": 4.7479905437352246,
+ "grad_norm": 3.0280439853668213,
+ "learning_rate": 5.261811292034668e-07,
+ "loss": 0.3254,
+ "step": 10042
+ },
+ {
+ "epoch": 4.748463356973995,
+ "grad_norm": 3.1756322383880615,
+ "learning_rate": 5.257983386092486e-07,
+ "loss": 0.3434,
+ "step": 10043
+ },
+ {
+ "epoch": 4.748936170212766,
+ "grad_norm": 3.220245599746704,
+ "learning_rate": 5.254156709399111e-07,
+ "loss": 0.3795,
+ "step": 10044
+ },
+ {
+ "epoch": 4.749408983451537,
+ "grad_norm": 3.4887516498565674,
+ "learning_rate": 5.250331262192815e-07,
+ "loss": 0.353,
+ "step": 10045
+ },
+ {
+ "epoch": 4.749881796690307,
+ "grad_norm": 3.1106226444244385,
+ "learning_rate": 5.246507044711791e-07,
+ "loss": 0.3329,
+ "step": 10046
+ },
+ {
+ "epoch": 4.750354609929078,
+ "grad_norm": 3.0493836402893066,
+ "learning_rate": 5.24268405719415e-07,
+ "loss": 0.3372,
+ "step": 10047
+ },
+ {
+ "epoch": 4.7508274231678485,
+ "grad_norm": 3.0885660648345947,
+ "learning_rate": 5.238862299877948e-07,
+ "loss": 0.3583,
+ "step": 10048
+ },
+ {
+ "epoch": 4.751300236406619,
+ "grad_norm": 3.194566011428833,
+ "learning_rate": 5.23504177300114e-07,
+ "loss": 0.3886,
+ "step": 10049
+ },
+ {
+ "epoch": 4.75177304964539,
+ "grad_norm": 2.9062368869781494,
+ "learning_rate": 5.231222476801606e-07,
+ "loss": 0.3267,
+ "step": 10050
+ },
+ {
+ "epoch": 4.752245862884161,
+ "grad_norm": 2.9814155101776123,
+ "learning_rate": 5.227404411517173e-07,
+ "loss": 0.3817,
+ "step": 10051
+ },
+ {
+ "epoch": 4.752718676122932,
+ "grad_norm": 3.526301383972168,
+ "learning_rate": 5.22358757738556e-07,
+ "loss": 0.3405,
+ "step": 10052
+ },
+ {
+ "epoch": 4.753191489361702,
+ "grad_norm": 3.2342031002044678,
+ "learning_rate": 5.219771974644439e-07,
+ "loss": 0.3429,
+ "step": 10053
+ },
+ {
+ "epoch": 4.753664302600473,
+ "grad_norm": 3.0213656425476074,
+ "learning_rate": 5.215957603531383e-07,
+ "loss": 0.3482,
+ "step": 10054
+ },
+ {
+ "epoch": 4.754137115839243,
+ "grad_norm": 3.566260576248169,
+ "learning_rate": 5.212144464283889e-07,
+ "loss": 0.3633,
+ "step": 10055
+ },
+ {
+ "epoch": 4.754609929078014,
+ "grad_norm": 3.3363420963287354,
+ "learning_rate": 5.208332557139398e-07,
+ "loss": 0.3528,
+ "step": 10056
+ },
+ {
+ "epoch": 4.755082742316785,
+ "grad_norm": 3.3407959938049316,
+ "learning_rate": 5.204521882335251e-07,
+ "loss": 0.3219,
+ "step": 10057
+ },
+ {
+ "epoch": 4.7555555555555555,
+ "grad_norm": 2.9756882190704346,
+ "learning_rate": 5.200712440108729e-07,
+ "loss": 0.3141,
+ "step": 10058
+ },
+ {
+ "epoch": 4.756028368794326,
+ "grad_norm": 3.7191832065582275,
+ "learning_rate": 5.19690423069703e-07,
+ "loss": 0.3657,
+ "step": 10059
+ },
+ {
+ "epoch": 4.756501182033097,
+ "grad_norm": 3.175494432449341,
+ "learning_rate": 5.193097254337268e-07,
+ "loss": 0.2922,
+ "step": 10060
+ },
+ {
+ "epoch": 4.756973995271868,
+ "grad_norm": 2.9288907051086426,
+ "learning_rate": 5.189291511266489e-07,
+ "loss": 0.3097,
+ "step": 10061
+ },
+ {
+ "epoch": 4.757446808510638,
+ "grad_norm": 3.1014389991760254,
+ "learning_rate": 5.185487001721656e-07,
+ "loss": 0.3443,
+ "step": 10062
+ },
+ {
+ "epoch": 4.757919621749409,
+ "grad_norm": 3.3224666118621826,
+ "learning_rate": 5.181683725939668e-07,
+ "loss": 0.3408,
+ "step": 10063
+ },
+ {
+ "epoch": 4.758392434988179,
+ "grad_norm": 3.248089075088501,
+ "learning_rate": 5.177881684157335e-07,
+ "loss": 0.366,
+ "step": 10064
+ },
+ {
+ "epoch": 4.75886524822695,
+ "grad_norm": 3.3183906078338623,
+ "learning_rate": 5.174080876611385e-07,
+ "loss": 0.3774,
+ "step": 10065
+ },
+ {
+ "epoch": 4.759338061465721,
+ "grad_norm": 3.1653311252593994,
+ "learning_rate": 5.17028130353849e-07,
+ "loss": 0.3208,
+ "step": 10066
+ },
+ {
+ "epoch": 4.759810874704492,
+ "grad_norm": 2.9300882816314697,
+ "learning_rate": 5.166482965175229e-07,
+ "loss": 0.3494,
+ "step": 10067
+ },
+ {
+ "epoch": 4.760283687943263,
+ "grad_norm": 3.504225254058838,
+ "learning_rate": 5.162685861758099e-07,
+ "loss": 0.3777,
+ "step": 10068
+ },
+ {
+ "epoch": 4.760756501182033,
+ "grad_norm": 3.3933908939361572,
+ "learning_rate": 5.158889993523544e-07,
+ "loss": 0.3575,
+ "step": 10069
+ },
+ {
+ "epoch": 4.761229314420804,
+ "grad_norm": 4.30021333694458,
+ "learning_rate": 5.155095360707901e-07,
+ "loss": 0.3435,
+ "step": 10070
+ },
+ {
+ "epoch": 4.761702127659574,
+ "grad_norm": 3.226658582687378,
+ "learning_rate": 5.151301963547462e-07,
+ "loss": 0.3473,
+ "step": 10071
+ },
+ {
+ "epoch": 4.762174940898345,
+ "grad_norm": 3.222884178161621,
+ "learning_rate": 5.14750980227841e-07,
+ "loss": 0.314,
+ "step": 10072
+ },
+ {
+ "epoch": 4.762647754137116,
+ "grad_norm": 3.077139377593994,
+ "learning_rate": 5.143718877136872e-07,
+ "loss": 0.2929,
+ "step": 10073
+ },
+ {
+ "epoch": 4.7631205673758865,
+ "grad_norm": 2.9789531230926514,
+ "learning_rate": 5.139929188358894e-07,
+ "loss": 0.3594,
+ "step": 10074
+ },
+ {
+ "epoch": 4.763593380614657,
+ "grad_norm": 3.558417797088623,
+ "learning_rate": 5.136140736180445e-07,
+ "loss": 0.356,
+ "step": 10075
+ },
+ {
+ "epoch": 4.764066193853428,
+ "grad_norm": 2.8887953758239746,
+ "learning_rate": 5.13235352083741e-07,
+ "loss": 0.2957,
+ "step": 10076
+ },
+ {
+ "epoch": 4.764539007092199,
+ "grad_norm": 3.187857151031494,
+ "learning_rate": 5.128567542565605e-07,
+ "loss": 0.3879,
+ "step": 10077
+ },
+ {
+ "epoch": 4.765011820330969,
+ "grad_norm": 3.761465072631836,
+ "learning_rate": 5.124782801600758e-07,
+ "loss": 0.3163,
+ "step": 10078
+ },
+ {
+ "epoch": 4.76548463356974,
+ "grad_norm": 3.4338560104370117,
+ "learning_rate": 5.120999298178541e-07,
+ "loss": 0.3924,
+ "step": 10079
+ },
+ {
+ "epoch": 4.76595744680851,
+ "grad_norm": 2.8551666736602783,
+ "learning_rate": 5.117217032534528e-07,
+ "loss": 0.329,
+ "step": 10080
+ },
+ {
+ "epoch": 4.766430260047281,
+ "grad_norm": 3.4713878631591797,
+ "learning_rate": 5.113436004904232e-07,
+ "loss": 0.3802,
+ "step": 10081
+ },
+ {
+ "epoch": 4.766903073286052,
+ "grad_norm": 3.1913888454437256,
+ "learning_rate": 5.109656215523076e-07,
+ "loss": 0.3273,
+ "step": 10082
+ },
+ {
+ "epoch": 4.767375886524823,
+ "grad_norm": 2.8070812225341797,
+ "learning_rate": 5.105877664626402e-07,
+ "loss": 0.3398,
+ "step": 10083
+ },
+ {
+ "epoch": 4.767848699763594,
+ "grad_norm": 3.316321849822998,
+ "learning_rate": 5.102100352449502e-07,
+ "loss": 0.3649,
+ "step": 10084
+ },
+ {
+ "epoch": 4.768321513002364,
+ "grad_norm": 3.3555870056152344,
+ "learning_rate": 5.098324279227557e-07,
+ "loss": 0.333,
+ "step": 10085
+ },
+ {
+ "epoch": 4.768794326241135,
+ "grad_norm": 3.0964810848236084,
+ "learning_rate": 5.094549445195699e-07,
+ "loss": 0.3384,
+ "step": 10086
+ },
+ {
+ "epoch": 4.769267139479905,
+ "grad_norm": 3.0406007766723633,
+ "learning_rate": 5.090775850588963e-07,
+ "loss": 0.3582,
+ "step": 10087
+ },
+ {
+ "epoch": 4.769739952718676,
+ "grad_norm": 2.934340238571167,
+ "learning_rate": 5.087003495642309e-07,
+ "loss": 0.3306,
+ "step": 10088
+ },
+ {
+ "epoch": 4.770212765957447,
+ "grad_norm": 3.441734552383423,
+ "learning_rate": 5.083232380590641e-07,
+ "loss": 0.386,
+ "step": 10089
+ },
+ {
+ "epoch": 4.7706855791962175,
+ "grad_norm": 3.176483631134033,
+ "learning_rate": 5.079462505668758e-07,
+ "loss": 0.3516,
+ "step": 10090
+ },
+ {
+ "epoch": 4.771158392434988,
+ "grad_norm": 3.1490824222564697,
+ "learning_rate": 5.075693871111395e-07,
+ "loss": 0.3233,
+ "step": 10091
+ },
+ {
+ "epoch": 4.771631205673759,
+ "grad_norm": 3.300335645675659,
+ "learning_rate": 5.07192647715321e-07,
+ "loss": 0.2975,
+ "step": 10092
+ },
+ {
+ "epoch": 4.77210401891253,
+ "grad_norm": 3.199085235595703,
+ "learning_rate": 5.068160324028776e-07,
+ "loss": 0.3468,
+ "step": 10093
+ },
+ {
+ "epoch": 4.7725768321513,
+ "grad_norm": 3.4611270427703857,
+ "learning_rate": 5.064395411972605e-07,
+ "loss": 0.3319,
+ "step": 10094
+ },
+ {
+ "epoch": 4.773049645390071,
+ "grad_norm": 3.0549957752227783,
+ "learning_rate": 5.060631741219119e-07,
+ "loss": 0.3542,
+ "step": 10095
+ },
+ {
+ "epoch": 4.773522458628841,
+ "grad_norm": 3.085744619369507,
+ "learning_rate": 5.056869312002655e-07,
+ "loss": 0.3611,
+ "step": 10096
+ },
+ {
+ "epoch": 4.773995271867612,
+ "grad_norm": 3.4383676052093506,
+ "learning_rate": 5.053108124557496e-07,
+ "loss": 0.3606,
+ "step": 10097
+ },
+ {
+ "epoch": 4.774468085106383,
+ "grad_norm": 2.8119592666625977,
+ "learning_rate": 5.049348179117825e-07,
+ "loss": 0.3192,
+ "step": 10098
+ },
+ {
+ "epoch": 4.774940898345154,
+ "grad_norm": 2.8554961681365967,
+ "learning_rate": 5.045589475917767e-07,
+ "loss": 0.321,
+ "step": 10099
+ },
+ {
+ "epoch": 4.775413711583925,
+ "grad_norm": 3.612732410430908,
+ "learning_rate": 5.041832015191356e-07,
+ "loss": 0.3385,
+ "step": 10100
+ },
+ {
+ "epoch": 4.775886524822695,
+ "grad_norm": 3.432650327682495,
+ "learning_rate": 5.038075797172543e-07,
+ "loss": 0.3494,
+ "step": 10101
+ },
+ {
+ "epoch": 4.776359338061466,
+ "grad_norm": 3.241612672805786,
+ "learning_rate": 5.034320822095228e-07,
+ "loss": 0.3377,
+ "step": 10102
+ },
+ {
+ "epoch": 4.776832151300236,
+ "grad_norm": 3.5062692165374756,
+ "learning_rate": 5.030567090193203e-07,
+ "loss": 0.4038,
+ "step": 10103
+ },
+ {
+ "epoch": 4.777304964539007,
+ "grad_norm": 2.9015917778015137,
+ "learning_rate": 5.026814601700205e-07,
+ "loss": 0.2987,
+ "step": 10104
+ },
+ {
+ "epoch": 4.777777777777778,
+ "grad_norm": 3.0691189765930176,
+ "learning_rate": 5.023063356849886e-07,
+ "loss": 0.3725,
+ "step": 10105
+ },
+ {
+ "epoch": 4.7782505910165485,
+ "grad_norm": 3.1556789875030518,
+ "learning_rate": 5.019313355875813e-07,
+ "loss": 0.3554,
+ "step": 10106
+ },
+ {
+ "epoch": 4.778723404255319,
+ "grad_norm": 2.84529447555542,
+ "learning_rate": 5.01556459901148e-07,
+ "loss": 0.3369,
+ "step": 10107
+ },
+ {
+ "epoch": 4.77919621749409,
+ "grad_norm": 3.322565793991089,
+ "learning_rate": 5.011817086490315e-07,
+ "loss": 0.3641,
+ "step": 10108
+ },
+ {
+ "epoch": 4.779669030732861,
+ "grad_norm": 3.011988639831543,
+ "learning_rate": 5.008070818545654e-07,
+ "loss": 0.3153,
+ "step": 10109
+ },
+ {
+ "epoch": 4.780141843971631,
+ "grad_norm": 2.937770128250122,
+ "learning_rate": 5.004325795410764e-07,
+ "loss": 0.3235,
+ "step": 10110
+ },
+ {
+ "epoch": 4.780614657210402,
+ "grad_norm": 3.0186142921447754,
+ "learning_rate": 5.00058201731882e-07,
+ "loss": 0.3443,
+ "step": 10111
+ },
+ {
+ "epoch": 4.781087470449172,
+ "grad_norm": 3.1810684204101562,
+ "learning_rate": 4.996839484502946e-07,
+ "loss": 0.3511,
+ "step": 10112
+ },
+ {
+ "epoch": 4.781560283687943,
+ "grad_norm": 3.5470240116119385,
+ "learning_rate": 4.993098197196167e-07,
+ "loss": 0.4096,
+ "step": 10113
+ },
+ {
+ "epoch": 4.782033096926714,
+ "grad_norm": 3.1422345638275146,
+ "learning_rate": 4.989358155631427e-07,
+ "loss": 0.3566,
+ "step": 10114
+ },
+ {
+ "epoch": 4.782505910165485,
+ "grad_norm": 3.3392271995544434,
+ "learning_rate": 4.985619360041619e-07,
+ "loss": 0.3278,
+ "step": 10115
+ },
+ {
+ "epoch": 4.782978723404256,
+ "grad_norm": 3.020026206970215,
+ "learning_rate": 4.981881810659525e-07,
+ "loss": 0.3349,
+ "step": 10116
+ },
+ {
+ "epoch": 4.783451536643026,
+ "grad_norm": 3.061652660369873,
+ "learning_rate": 4.97814550771788e-07,
+ "loss": 0.3275,
+ "step": 10117
+ },
+ {
+ "epoch": 4.783924349881797,
+ "grad_norm": 3.5875346660614014,
+ "learning_rate": 4.974410451449321e-07,
+ "loss": 0.3694,
+ "step": 10118
+ },
+ {
+ "epoch": 4.784397163120567,
+ "grad_norm": 3.848348379135132,
+ "learning_rate": 4.970676642086408e-07,
+ "loss": 0.3539,
+ "step": 10119
+ },
+ {
+ "epoch": 4.784869976359338,
+ "grad_norm": 3.237959146499634,
+ "learning_rate": 4.966944079861641e-07,
+ "loss": 0.3468,
+ "step": 10120
+ },
+ {
+ "epoch": 4.785342789598109,
+ "grad_norm": 3.4829745292663574,
+ "learning_rate": 4.96321276500742e-07,
+ "loss": 0.348,
+ "step": 10121
+ },
+ {
+ "epoch": 4.7858156028368795,
+ "grad_norm": 3.29961895942688,
+ "learning_rate": 4.959482697756085e-07,
+ "loss": 0.3499,
+ "step": 10122
+ },
+ {
+ "epoch": 4.78628841607565,
+ "grad_norm": 3.291260242462158,
+ "learning_rate": 4.955753878339886e-07,
+ "loss": 0.3525,
+ "step": 10123
+ },
+ {
+ "epoch": 4.786761229314421,
+ "grad_norm": 3.543893575668335,
+ "learning_rate": 4.952026306991004e-07,
+ "loss": 0.4274,
+ "step": 10124
+ },
+ {
+ "epoch": 4.787234042553192,
+ "grad_norm": 3.551354169845581,
+ "learning_rate": 4.948299983941534e-07,
+ "loss": 0.3116,
+ "step": 10125
+ },
+ {
+ "epoch": 4.787706855791962,
+ "grad_norm": 3.1988296508789062,
+ "learning_rate": 4.944574909423497e-07,
+ "loss": 0.3273,
+ "step": 10126
+ },
+ {
+ "epoch": 4.7881796690307326,
+ "grad_norm": 2.8899428844451904,
+ "learning_rate": 4.940851083668843e-07,
+ "loss": 0.3518,
+ "step": 10127
+ },
+ {
+ "epoch": 4.788652482269503,
+ "grad_norm": 3.279688835144043,
+ "learning_rate": 4.937128506909439e-07,
+ "loss": 0.3735,
+ "step": 10128
+ },
+ {
+ "epoch": 4.789125295508274,
+ "grad_norm": 3.0784502029418945,
+ "learning_rate": 4.933407179377059e-07,
+ "loss": 0.327,
+ "step": 10129
+ },
+ {
+ "epoch": 4.789598108747045,
+ "grad_norm": 3.390169858932495,
+ "learning_rate": 4.929687101303435e-07,
+ "loss": 0.3895,
+ "step": 10130
+ },
+ {
+ "epoch": 4.790070921985816,
+ "grad_norm": 3.72928524017334,
+ "learning_rate": 4.925968272920181e-07,
+ "loss": 0.3598,
+ "step": 10131
+ },
+ {
+ "epoch": 4.7905437352245865,
+ "grad_norm": 3.3786826133728027,
+ "learning_rate": 4.922250694458866e-07,
+ "loss": 0.363,
+ "step": 10132
+ },
+ {
+ "epoch": 4.791016548463357,
+ "grad_norm": 3.086150884628296,
+ "learning_rate": 4.918534366150965e-07,
+ "loss": 0.2877,
+ "step": 10133
+ },
+ {
+ "epoch": 4.791489361702128,
+ "grad_norm": 3.3568673133850098,
+ "learning_rate": 4.914819288227865e-07,
+ "loss": 0.3153,
+ "step": 10134
+ },
+ {
+ "epoch": 4.791962174940898,
+ "grad_norm": 3.294382095336914,
+ "learning_rate": 4.911105460920904e-07,
+ "loss": 0.3327,
+ "step": 10135
+ },
+ {
+ "epoch": 4.792434988179669,
+ "grad_norm": 3.0562479496002197,
+ "learning_rate": 4.907392884461321e-07,
+ "loss": 0.3368,
+ "step": 10136
+ },
+ {
+ "epoch": 4.79290780141844,
+ "grad_norm": 2.928912878036499,
+ "learning_rate": 4.90368155908027e-07,
+ "loss": 0.295,
+ "step": 10137
+ },
+ {
+ "epoch": 4.79338061465721,
+ "grad_norm": 3.0252797603607178,
+ "learning_rate": 4.899971485008858e-07,
+ "loss": 0.2985,
+ "step": 10138
+ },
+ {
+ "epoch": 4.793853427895981,
+ "grad_norm": 2.830035924911499,
+ "learning_rate": 4.896262662478085e-07,
+ "loss": 0.3518,
+ "step": 10139
+ },
+ {
+ "epoch": 4.794326241134752,
+ "grad_norm": 3.042524576187134,
+ "learning_rate": 4.892555091718884e-07,
+ "loss": 0.2871,
+ "step": 10140
+ },
+ {
+ "epoch": 4.794799054373523,
+ "grad_norm": 2.920741558074951,
+ "learning_rate": 4.888848772962107e-07,
+ "loss": 0.3234,
+ "step": 10141
+ },
+ {
+ "epoch": 4.795271867612293,
+ "grad_norm": 3.3935956954956055,
+ "learning_rate": 4.885143706438527e-07,
+ "loss": 0.3612,
+ "step": 10142
+ },
+ {
+ "epoch": 4.7957446808510635,
+ "grad_norm": 3.1501455307006836,
+ "learning_rate": 4.881439892378853e-07,
+ "loss": 0.3239,
+ "step": 10143
+ },
+ {
+ "epoch": 4.796217494089834,
+ "grad_norm": 3.233794927597046,
+ "learning_rate": 4.877737331013696e-07,
+ "loss": 0.3185,
+ "step": 10144
+ },
+ {
+ "epoch": 4.796690307328605,
+ "grad_norm": 3.1155240535736084,
+ "learning_rate": 4.874036022573605e-07,
+ "loss": 0.3128,
+ "step": 10145
+ },
+ {
+ "epoch": 4.797163120567376,
+ "grad_norm": 3.313546895980835,
+ "learning_rate": 4.870335967289042e-07,
+ "loss": 0.3136,
+ "step": 10146
+ },
+ {
+ "epoch": 4.797635933806147,
+ "grad_norm": 5.024696350097656,
+ "learning_rate": 4.866637165390387e-07,
+ "loss": 0.4032,
+ "step": 10147
+ },
+ {
+ "epoch": 4.7981087470449175,
+ "grad_norm": 3.109086275100708,
+ "learning_rate": 4.862939617107959e-07,
+ "loss": 0.3822,
+ "step": 10148
+ },
+ {
+ "epoch": 4.798581560283688,
+ "grad_norm": 3.144777536392212,
+ "learning_rate": 4.859243322671978e-07,
+ "loss": 0.3362,
+ "step": 10149
+ },
+ {
+ "epoch": 4.799054373522459,
+ "grad_norm": 3.402974843978882,
+ "learning_rate": 4.855548282312605e-07,
+ "loss": 0.3803,
+ "step": 10150
+ },
+ {
+ "epoch": 4.799527186761229,
+ "grad_norm": 2.6077685356140137,
+ "learning_rate": 4.851854496259911e-07,
+ "loss": 0.3043,
+ "step": 10151
+ },
+ {
+ "epoch": 4.8,
+ "grad_norm": 3.38386607170105,
+ "learning_rate": 4.848161964743883e-07,
+ "loss": 0.3129,
+ "step": 10152
+ },
+ {
+ "epoch": 4.800472813238771,
+ "grad_norm": 3.193723440170288,
+ "learning_rate": 4.844470687994454e-07,
+ "loss": 0.3544,
+ "step": 10153
+ },
+ {
+ "epoch": 4.800945626477541,
+ "grad_norm": 2.9620895385742188,
+ "learning_rate": 4.840780666241457e-07,
+ "loss": 0.3376,
+ "step": 10154
+ },
+ {
+ "epoch": 4.801418439716312,
+ "grad_norm": 4.192742824554443,
+ "learning_rate": 4.83709189971465e-07,
+ "loss": 0.3325,
+ "step": 10155
+ },
+ {
+ "epoch": 4.801891252955083,
+ "grad_norm": 2.996617555618286,
+ "learning_rate": 4.83340438864372e-07,
+ "loss": 0.3237,
+ "step": 10156
+ },
+ {
+ "epoch": 4.802364066193854,
+ "grad_norm": 3.255037307739258,
+ "learning_rate": 4.829718133258263e-07,
+ "loss": 0.3575,
+ "step": 10157
+ },
+ {
+ "epoch": 4.802836879432624,
+ "grad_norm": 3.1065316200256348,
+ "learning_rate": 4.826033133787822e-07,
+ "loss": 0.347,
+ "step": 10158
+ },
+ {
+ "epoch": 4.8033096926713945,
+ "grad_norm": 3.321096420288086,
+ "learning_rate": 4.822349390461831e-07,
+ "loss": 0.3628,
+ "step": 10159
+ },
+ {
+ "epoch": 4.803782505910165,
+ "grad_norm": 3.549182653427124,
+ "learning_rate": 4.818666903509672e-07,
+ "loss": 0.3539,
+ "step": 10160
+ },
+ {
+ "epoch": 4.804255319148936,
+ "grad_norm": 2.9063286781311035,
+ "learning_rate": 4.814985673160633e-07,
+ "loss": 0.2956,
+ "step": 10161
+ },
+ {
+ "epoch": 4.804728132387707,
+ "grad_norm": 3.1669399738311768,
+ "learning_rate": 4.81130569964392e-07,
+ "loss": 0.3263,
+ "step": 10162
+ },
+ {
+ "epoch": 4.805200945626478,
+ "grad_norm": 3.667128562927246,
+ "learning_rate": 4.807626983188684e-07,
+ "loss": 0.3514,
+ "step": 10163
+ },
+ {
+ "epoch": 4.8056737588652485,
+ "grad_norm": 3.1469576358795166,
+ "learning_rate": 4.803949524023976e-07,
+ "loss": 0.3273,
+ "step": 10164
+ },
+ {
+ "epoch": 4.806146572104019,
+ "grad_norm": 3.6988110542297363,
+ "learning_rate": 4.800273322378768e-07,
+ "loss": 0.3293,
+ "step": 10165
+ },
+ {
+ "epoch": 4.80661938534279,
+ "grad_norm": 3.6419219970703125,
+ "learning_rate": 4.79659837848197e-07,
+ "loss": 0.3696,
+ "step": 10166
+ },
+ {
+ "epoch": 4.80709219858156,
+ "grad_norm": 3.4860944747924805,
+ "learning_rate": 4.792924692562398e-07,
+ "loss": 0.3372,
+ "step": 10167
+ },
+ {
+ "epoch": 4.807565011820331,
+ "grad_norm": 2.879600763320923,
+ "learning_rate": 4.789252264848806e-07,
+ "loss": 0.3192,
+ "step": 10168
+ },
+ {
+ "epoch": 4.808037825059102,
+ "grad_norm": 3.4475104808807373,
+ "learning_rate": 4.785581095569855e-07,
+ "loss": 0.3285,
+ "step": 10169
+ },
+ {
+ "epoch": 4.808510638297872,
+ "grad_norm": 3.528397560119629,
+ "learning_rate": 4.78191118495413e-07,
+ "loss": 0.3612,
+ "step": 10170
+ },
+ {
+ "epoch": 4.808983451536643,
+ "grad_norm": 3.056796073913574,
+ "learning_rate": 4.778242533230138e-07,
+ "loss": 0.3077,
+ "step": 10171
+ },
+ {
+ "epoch": 4.809456264775414,
+ "grad_norm": 3.302171230316162,
+ "learning_rate": 4.774575140626317e-07,
+ "loss": 0.2963,
+ "step": 10172
+ },
+ {
+ "epoch": 4.809929078014184,
+ "grad_norm": 3.1446237564086914,
+ "learning_rate": 4.770909007371016e-07,
+ "loss": 0.3438,
+ "step": 10173
+ },
+ {
+ "epoch": 4.810401891252955,
+ "grad_norm": 2.917919635772705,
+ "learning_rate": 4.767244133692511e-07,
+ "loss": 0.3353,
+ "step": 10174
+ },
+ {
+ "epoch": 4.8108747044917255,
+ "grad_norm": 3.0808987617492676,
+ "learning_rate": 4.763580519818989e-07,
+ "loss": 0.3574,
+ "step": 10175
+ },
+ {
+ "epoch": 4.811347517730496,
+ "grad_norm": 3.2861616611480713,
+ "learning_rate": 4.75991816597858e-07,
+ "loss": 0.3891,
+ "step": 10176
+ },
+ {
+ "epoch": 4.811820330969267,
+ "grad_norm": 2.835925340652466,
+ "learning_rate": 4.7562570723993116e-07,
+ "loss": 0.3785,
+ "step": 10177
+ },
+ {
+ "epoch": 4.812293144208038,
+ "grad_norm": 3.5441393852233887,
+ "learning_rate": 4.7525972393091534e-07,
+ "loss": 0.3914,
+ "step": 10178
+ },
+ {
+ "epoch": 4.812765957446809,
+ "grad_norm": 3.94022798538208,
+ "learning_rate": 4.748938666935984e-07,
+ "loss": 0.3564,
+ "step": 10179
+ },
+ {
+ "epoch": 4.8132387706855795,
+ "grad_norm": 3.1686532497406006,
+ "learning_rate": 4.7452813555076e-07,
+ "loss": 0.3348,
+ "step": 10180
+ },
+ {
+ "epoch": 4.81371158392435,
+ "grad_norm": 3.534032106399536,
+ "learning_rate": 4.7416253052517374e-07,
+ "loss": 0.3165,
+ "step": 10181
+ },
+ {
+ "epoch": 4.81418439716312,
+ "grad_norm": 3.1169021129608154,
+ "learning_rate": 4.7379705163960317e-07,
+ "loss": 0.3515,
+ "step": 10182
+ },
+ {
+ "epoch": 4.814657210401891,
+ "grad_norm": 3.564509391784668,
+ "learning_rate": 4.7343169891680585e-07,
+ "loss": 0.4195,
+ "step": 10183
+ },
+ {
+ "epoch": 4.815130023640662,
+ "grad_norm": 3.2813005447387695,
+ "learning_rate": 4.7306647237953085e-07,
+ "loss": 0.3574,
+ "step": 10184
+ },
+ {
+ "epoch": 4.815602836879433,
+ "grad_norm": 3.053349018096924,
+ "learning_rate": 4.727013720505177e-07,
+ "loss": 0.3792,
+ "step": 10185
+ },
+ {
+ "epoch": 4.816075650118203,
+ "grad_norm": 3.069258689880371,
+ "learning_rate": 4.723363979525017e-07,
+ "loss": 0.3377,
+ "step": 10186
+ },
+ {
+ "epoch": 4.816548463356974,
+ "grad_norm": 2.999802350997925,
+ "learning_rate": 4.71971550108207e-07,
+ "loss": 0.3232,
+ "step": 10187
+ },
+ {
+ "epoch": 4.817021276595745,
+ "grad_norm": 2.941810131072998,
+ "learning_rate": 4.7160682854035107e-07,
+ "loss": 0.342,
+ "step": 10188
+ },
+ {
+ "epoch": 4.817494089834515,
+ "grad_norm": 3.407975196838379,
+ "learning_rate": 4.71242233271644e-07,
+ "loss": 0.3697,
+ "step": 10189
+ },
+ {
+ "epoch": 4.817966903073286,
+ "grad_norm": 3.148359537124634,
+ "learning_rate": 4.708777643247864e-07,
+ "loss": 0.3297,
+ "step": 10190
+ },
+ {
+ "epoch": 4.8184397163120565,
+ "grad_norm": 2.9067797660827637,
+ "learning_rate": 4.7051342172247354e-07,
+ "loss": 0.2646,
+ "step": 10191
+ },
+ {
+ "epoch": 4.818912529550827,
+ "grad_norm": 3.4185385704040527,
+ "learning_rate": 4.70149205487391e-07,
+ "loss": 0.3296,
+ "step": 10192
+ },
+ {
+ "epoch": 4.819385342789598,
+ "grad_norm": 3.363966464996338,
+ "learning_rate": 4.697851156422162e-07,
+ "loss": 0.3744,
+ "step": 10193
+ },
+ {
+ "epoch": 4.819858156028369,
+ "grad_norm": 2.944939613342285,
+ "learning_rate": 4.6942115220962067e-07,
+ "loss": 0.3311,
+ "step": 10194
+ },
+ {
+ "epoch": 4.82033096926714,
+ "grad_norm": 3.2023603916168213,
+ "learning_rate": 4.6905731521226544e-07,
+ "loss": 0.3114,
+ "step": 10195
+ },
+ {
+ "epoch": 4.8208037825059105,
+ "grad_norm": 2.9747812747955322,
+ "learning_rate": 4.686936046728063e-07,
+ "loss": 0.2891,
+ "step": 10196
+ },
+ {
+ "epoch": 4.821276595744681,
+ "grad_norm": 3.6693246364593506,
+ "learning_rate": 4.6833002061388965e-07,
+ "loss": 0.3758,
+ "step": 10197
+ },
+ {
+ "epoch": 4.821749408983451,
+ "grad_norm": 3.4812891483306885,
+ "learning_rate": 4.679665630581534e-07,
+ "loss": 0.3274,
+ "step": 10198
+ },
+ {
+ "epoch": 4.822222222222222,
+ "grad_norm": 2.888956308364868,
+ "learning_rate": 4.676032320282295e-07,
+ "loss": 0.3304,
+ "step": 10199
+ },
+ {
+ "epoch": 4.822695035460993,
+ "grad_norm": 3.2659964561462402,
+ "learning_rate": 4.6724002754674006e-07,
+ "loss": 0.3267,
+ "step": 10200
+ },
+ {
+ "epoch": 4.823167848699764,
+ "grad_norm": 3.2733213901519775,
+ "learning_rate": 4.6687694963630127e-07,
+ "loss": 0.3067,
+ "step": 10201
+ },
+ {
+ "epoch": 4.823640661938534,
+ "grad_norm": 3.0957846641540527,
+ "learning_rate": 4.6651399831951995e-07,
+ "loss": 0.3586,
+ "step": 10202
+ },
+ {
+ "epoch": 4.824113475177305,
+ "grad_norm": 2.9597535133361816,
+ "learning_rate": 4.6615117361899526e-07,
+ "loss": 0.3409,
+ "step": 10203
+ },
+ {
+ "epoch": 4.824586288416076,
+ "grad_norm": 3.0622851848602295,
+ "learning_rate": 4.657884755573189e-07,
+ "loss": 0.3112,
+ "step": 10204
+ },
+ {
+ "epoch": 4.825059101654846,
+ "grad_norm": 3.088568925857544,
+ "learning_rate": 4.6542590415707355e-07,
+ "loss": 0.3161,
+ "step": 10205
+ },
+ {
+ "epoch": 4.825531914893617,
+ "grad_norm": 3.2927064895629883,
+ "learning_rate": 4.650634594408368e-07,
+ "loss": 0.3368,
+ "step": 10206
+ },
+ {
+ "epoch": 4.8260047281323875,
+ "grad_norm": 2.9728758335113525,
+ "learning_rate": 4.647011414311753e-07,
+ "loss": 0.3615,
+ "step": 10207
+ },
+ {
+ "epoch": 4.826477541371158,
+ "grad_norm": 3.301173686981201,
+ "learning_rate": 4.643389501506487e-07,
+ "loss": 0.3597,
+ "step": 10208
+ },
+ {
+ "epoch": 4.826950354609929,
+ "grad_norm": 3.421177864074707,
+ "learning_rate": 4.639768856218102e-07,
+ "loss": 0.3087,
+ "step": 10209
+ },
+ {
+ "epoch": 4.8274231678487,
+ "grad_norm": 3.1131463050842285,
+ "learning_rate": 4.636149478672031e-07,
+ "loss": 0.3776,
+ "step": 10210
+ },
+ {
+ "epoch": 4.827895981087471,
+ "grad_norm": 3.5807228088378906,
+ "learning_rate": 4.6325313690936347e-07,
+ "loss": 0.3556,
+ "step": 10211
+ },
+ {
+ "epoch": 4.828368794326241,
+ "grad_norm": 3.2873311042785645,
+ "learning_rate": 4.6289145277082085e-07,
+ "loss": 0.377,
+ "step": 10212
+ },
+ {
+ "epoch": 4.828841607565012,
+ "grad_norm": 3.502228021621704,
+ "learning_rate": 4.6252989547409423e-07,
+ "loss": 0.3699,
+ "step": 10213
+ },
+ {
+ "epoch": 4.829314420803782,
+ "grad_norm": 3.8895792961120605,
+ "learning_rate": 4.621684650416977e-07,
+ "loss": 0.3594,
+ "step": 10214
+ },
+ {
+ "epoch": 4.829787234042553,
+ "grad_norm": 3.11706805229187,
+ "learning_rate": 4.6180716149613505e-07,
+ "loss": 0.3404,
+ "step": 10215
+ },
+ {
+ "epoch": 4.830260047281324,
+ "grad_norm": 3.174584150314331,
+ "learning_rate": 4.614459848599029e-07,
+ "loss": 0.3684,
+ "step": 10216
+ },
+ {
+ "epoch": 4.8307328605200945,
+ "grad_norm": 3.0028135776519775,
+ "learning_rate": 4.610849351554908e-07,
+ "loss": 0.3505,
+ "step": 10217
+ },
+ {
+ "epoch": 4.831205673758865,
+ "grad_norm": 3.053354024887085,
+ "learning_rate": 4.6072401240537965e-07,
+ "loss": 0.392,
+ "step": 10218
+ },
+ {
+ "epoch": 4.831678486997636,
+ "grad_norm": 3.2726800441741943,
+ "learning_rate": 4.603632166320424e-07,
+ "loss": 0.3506,
+ "step": 10219
+ },
+ {
+ "epoch": 4.832151300236407,
+ "grad_norm": 3.5746219158172607,
+ "learning_rate": 4.600025478579437e-07,
+ "loss": 0.3585,
+ "step": 10220
+ },
+ {
+ "epoch": 4.832624113475177,
+ "grad_norm": 3.0742499828338623,
+ "learning_rate": 4.596420061055409e-07,
+ "loss": 0.332,
+ "step": 10221
+ },
+ {
+ "epoch": 4.833096926713948,
+ "grad_norm": 3.161022186279297,
+ "learning_rate": 4.5928159139728426e-07,
+ "loss": 0.3303,
+ "step": 10222
+ },
+ {
+ "epoch": 4.833569739952718,
+ "grad_norm": 3.2312185764312744,
+ "learning_rate": 4.5892130375561395e-07,
+ "loss": 0.3532,
+ "step": 10223
+ },
+ {
+ "epoch": 4.834042553191489,
+ "grad_norm": 3.501893997192383,
+ "learning_rate": 4.585611432029649e-07,
+ "loss": 0.3862,
+ "step": 10224
+ },
+ {
+ "epoch": 4.83451536643026,
+ "grad_norm": 3.2005560398101807,
+ "learning_rate": 4.5820110976176194e-07,
+ "loss": 0.3626,
+ "step": 10225
+ },
+ {
+ "epoch": 4.834988179669031,
+ "grad_norm": 3.4039556980133057,
+ "learning_rate": 4.578412034544225e-07,
+ "loss": 0.3789,
+ "step": 10226
+ },
+ {
+ "epoch": 4.835460992907802,
+ "grad_norm": 3.4461448192596436,
+ "learning_rate": 4.574814243033571e-07,
+ "loss": 0.3714,
+ "step": 10227
+ },
+ {
+ "epoch": 4.835933806146572,
+ "grad_norm": 3.435886859893799,
+ "learning_rate": 4.571217723309665e-07,
+ "loss": 0.3015,
+ "step": 10228
+ },
+ {
+ "epoch": 4.836406619385343,
+ "grad_norm": 3.612645149230957,
+ "learning_rate": 4.567622475596462e-07,
+ "loss": 0.3738,
+ "step": 10229
+ },
+ {
+ "epoch": 4.836879432624113,
+ "grad_norm": 3.1911067962646484,
+ "learning_rate": 4.564028500117815e-07,
+ "loss": 0.2994,
+ "step": 10230
+ },
+ {
+ "epoch": 4.837352245862884,
+ "grad_norm": 2.9745163917541504,
+ "learning_rate": 4.5604357970974956e-07,
+ "loss": 0.3353,
+ "step": 10231
+ },
+ {
+ "epoch": 4.837825059101655,
+ "grad_norm": 3.4999606609344482,
+ "learning_rate": 4.556844366759222e-07,
+ "loss": 0.3796,
+ "step": 10232
+ },
+ {
+ "epoch": 4.8382978723404255,
+ "grad_norm": 2.8130152225494385,
+ "learning_rate": 4.553254209326607e-07,
+ "loss": 0.2964,
+ "step": 10233
+ },
+ {
+ "epoch": 4.838770685579196,
+ "grad_norm": 3.5461673736572266,
+ "learning_rate": 4.5496653250232005e-07,
+ "loss": 0.3626,
+ "step": 10234
+ },
+ {
+ "epoch": 4.839243498817967,
+ "grad_norm": 3.3498404026031494,
+ "learning_rate": 4.546077714072458e-07,
+ "loss": 0.2982,
+ "step": 10235
+ },
+ {
+ "epoch": 4.839716312056738,
+ "grad_norm": 2.8942501544952393,
+ "learning_rate": 4.5424913766977635e-07,
+ "loss": 0.2447,
+ "step": 10236
+ },
+ {
+ "epoch": 4.840189125295508,
+ "grad_norm": 3.3506743907928467,
+ "learning_rate": 4.5389063131224346e-07,
+ "loss": 0.2908,
+ "step": 10237
+ },
+ {
+ "epoch": 4.840661938534279,
+ "grad_norm": 3.058872699737549,
+ "learning_rate": 4.535322523569691e-07,
+ "loss": 0.3275,
+ "step": 10238
+ },
+ {
+ "epoch": 4.841134751773049,
+ "grad_norm": 3.0573856830596924,
+ "learning_rate": 4.5317400082626696e-07,
+ "loss": 0.3096,
+ "step": 10239
+ },
+ {
+ "epoch": 4.84160756501182,
+ "grad_norm": 3.3260257244110107,
+ "learning_rate": 4.5281587674244563e-07,
+ "loss": 0.3334,
+ "step": 10240
+ },
+ {
+ "epoch": 4.842080378250591,
+ "grad_norm": 3.265740156173706,
+ "learning_rate": 4.5245788012780234e-07,
+ "loss": 0.3698,
+ "step": 10241
+ },
+ {
+ "epoch": 4.842553191489362,
+ "grad_norm": 3.4116036891937256,
+ "learning_rate": 4.521000110046292e-07,
+ "loss": 0.4159,
+ "step": 10242
+ },
+ {
+ "epoch": 4.843026004728133,
+ "grad_norm": 3.3263189792633057,
+ "learning_rate": 4.5174226939520865e-07,
+ "loss": 0.3579,
+ "step": 10243
+ },
+ {
+ "epoch": 4.843498817966903,
+ "grad_norm": 3.4223177433013916,
+ "learning_rate": 4.5138465532181514e-07,
+ "loss": 0.3539,
+ "step": 10244
+ },
+ {
+ "epoch": 4.843971631205674,
+ "grad_norm": 3.481016159057617,
+ "learning_rate": 4.5102716880671665e-07,
+ "loss": 0.3527,
+ "step": 10245
+ },
+ {
+ "epoch": 4.844444444444444,
+ "grad_norm": 2.830122232437134,
+ "learning_rate": 4.5066980987217124e-07,
+ "loss": 0.3339,
+ "step": 10246
+ },
+ {
+ "epoch": 4.844917257683215,
+ "grad_norm": 2.895792007446289,
+ "learning_rate": 4.5031257854043163e-07,
+ "loss": 0.3056,
+ "step": 10247
+ },
+ {
+ "epoch": 4.845390070921986,
+ "grad_norm": 2.9748036861419678,
+ "learning_rate": 4.499554748337398e-07,
+ "loss": 0.2794,
+ "step": 10248
+ },
+ {
+ "epoch": 4.8458628841607565,
+ "grad_norm": 3.223539113998413,
+ "learning_rate": 4.49598498774331e-07,
+ "loss": 0.3756,
+ "step": 10249
+ },
+ {
+ "epoch": 4.846335697399527,
+ "grad_norm": 3.491365432739258,
+ "learning_rate": 4.492416503844335e-07,
+ "loss": 0.378,
+ "step": 10250
+ },
+ {
+ "epoch": 4.846808510638298,
+ "grad_norm": 2.7236695289611816,
+ "learning_rate": 4.48884929686266e-07,
+ "loss": 0.297,
+ "step": 10251
+ },
+ {
+ "epoch": 4.847281323877069,
+ "grad_norm": 3.3814051151275635,
+ "learning_rate": 4.4852833670204045e-07,
+ "loss": 0.2923,
+ "step": 10252
+ },
+ {
+ "epoch": 4.847754137115839,
+ "grad_norm": 3.168334722518921,
+ "learning_rate": 4.4817187145395956e-07,
+ "loss": 0.3455,
+ "step": 10253
+ },
+ {
+ "epoch": 4.84822695035461,
+ "grad_norm": 3.0346829891204834,
+ "learning_rate": 4.4781553396421873e-07,
+ "loss": 0.3416,
+ "step": 10254
+ },
+ {
+ "epoch": 4.84869976359338,
+ "grad_norm": 3.1232426166534424,
+ "learning_rate": 4.4745932425500657e-07,
+ "loss": 0.3494,
+ "step": 10255
+ },
+ {
+ "epoch": 4.849172576832151,
+ "grad_norm": 3.0737383365631104,
+ "learning_rate": 4.471032423485017e-07,
+ "loss": 0.3246,
+ "step": 10256
+ },
+ {
+ "epoch": 4.849645390070922,
+ "grad_norm": 3.421461582183838,
+ "learning_rate": 4.467472882668769e-07,
+ "loss": 0.3807,
+ "step": 10257
+ },
+ {
+ "epoch": 4.850118203309693,
+ "grad_norm": 3.3846490383148193,
+ "learning_rate": 4.463914620322951e-07,
+ "loss": 0.3695,
+ "step": 10258
+ },
+ {
+ "epoch": 4.850591016548464,
+ "grad_norm": 4.0876007080078125,
+ "learning_rate": 4.460357636669116e-07,
+ "loss": 0.3913,
+ "step": 10259
+ },
+ {
+ "epoch": 4.851063829787234,
+ "grad_norm": 3.2078847885131836,
+ "learning_rate": 4.456801931928753e-07,
+ "loss": 0.3424,
+ "step": 10260
+ },
+ {
+ "epoch": 4.851536643026005,
+ "grad_norm": 3.4163241386413574,
+ "learning_rate": 4.453247506323255e-07,
+ "loss": 0.3907,
+ "step": 10261
+ },
+ {
+ "epoch": 4.852009456264775,
+ "grad_norm": 2.989793539047241,
+ "learning_rate": 4.449694360073931e-07,
+ "loss": 0.3313,
+ "step": 10262
+ },
+ {
+ "epoch": 4.852482269503546,
+ "grad_norm": 3.291537284851074,
+ "learning_rate": 4.446142493402039e-07,
+ "loss": 0.3594,
+ "step": 10263
+ },
+ {
+ "epoch": 4.852955082742317,
+ "grad_norm": 3.6327221393585205,
+ "learning_rate": 4.4425919065287204e-07,
+ "loss": 0.3844,
+ "step": 10264
+ },
+ {
+ "epoch": 4.8534278959810875,
+ "grad_norm": 3.486333131790161,
+ "learning_rate": 4.439042599675067e-07,
+ "loss": 0.3666,
+ "step": 10265
+ },
+ {
+ "epoch": 4.853900709219858,
+ "grad_norm": 3.7585315704345703,
+ "learning_rate": 4.435494573062074e-07,
+ "loss": 0.3287,
+ "step": 10266
+ },
+ {
+ "epoch": 4.854373522458629,
+ "grad_norm": 3.3496108055114746,
+ "learning_rate": 4.4319478269106625e-07,
+ "loss": 0.4021,
+ "step": 10267
+ },
+ {
+ "epoch": 4.8548463356974,
+ "grad_norm": 3.4681267738342285,
+ "learning_rate": 4.428402361441672e-07,
+ "loss": 0.3119,
+ "step": 10268
+ },
+ {
+ "epoch": 4.85531914893617,
+ "grad_norm": 2.9935829639434814,
+ "learning_rate": 4.4248581768758567e-07,
+ "loss": 0.305,
+ "step": 10269
+ },
+ {
+ "epoch": 4.855791962174941,
+ "grad_norm": 3.5839056968688965,
+ "learning_rate": 4.42131527343391e-07,
+ "loss": 0.4095,
+ "step": 10270
+ },
+ {
+ "epoch": 4.856264775413711,
+ "grad_norm": 3.088690757751465,
+ "learning_rate": 4.4177736513364237e-07,
+ "loss": 0.3391,
+ "step": 10271
+ },
+ {
+ "epoch": 4.856737588652482,
+ "grad_norm": 3.2721431255340576,
+ "learning_rate": 4.414233310803917e-07,
+ "loss": 0.3741,
+ "step": 10272
+ },
+ {
+ "epoch": 4.857210401891253,
+ "grad_norm": 3.108041524887085,
+ "learning_rate": 4.4106942520568437e-07,
+ "loss": 0.4041,
+ "step": 10273
+ },
+ {
+ "epoch": 4.857683215130024,
+ "grad_norm": 3.0035696029663086,
+ "learning_rate": 4.407156475315549e-07,
+ "loss": 0.3408,
+ "step": 10274
+ },
+ {
+ "epoch": 4.858156028368795,
+ "grad_norm": 3.0572783946990967,
+ "learning_rate": 4.4036199808003334e-07,
+ "loss": 0.3207,
+ "step": 10275
+ },
+ {
+ "epoch": 4.858628841607565,
+ "grad_norm": 3.1695926189422607,
+ "learning_rate": 4.4000847687313857e-07,
+ "loss": 0.3605,
+ "step": 10276
+ },
+ {
+ "epoch": 4.859101654846336,
+ "grad_norm": 3.690382957458496,
+ "learning_rate": 4.396550839328828e-07,
+ "loss": 0.4076,
+ "step": 10277
+ },
+ {
+ "epoch": 4.859574468085106,
+ "grad_norm": 3.271988868713379,
+ "learning_rate": 4.393018192812712e-07,
+ "loss": 0.4169,
+ "step": 10278
+ },
+ {
+ "epoch": 4.860047281323877,
+ "grad_norm": 2.8622982501983643,
+ "learning_rate": 4.389486829402986e-07,
+ "loss": 0.3114,
+ "step": 10279
+ },
+ {
+ "epoch": 4.860520094562648,
+ "grad_norm": 3.3875632286071777,
+ "learning_rate": 4.385956749319548e-07,
+ "loss": 0.3664,
+ "step": 10280
+ },
+ {
+ "epoch": 4.8609929078014185,
+ "grad_norm": 2.98962664604187,
+ "learning_rate": 4.382427952782195e-07,
+ "loss": 0.314,
+ "step": 10281
+ },
+ {
+ "epoch": 4.861465721040189,
+ "grad_norm": 2.899529457092285,
+ "learning_rate": 4.3789004400106473e-07,
+ "loss": 0.3588,
+ "step": 10282
+ },
+ {
+ "epoch": 4.86193853427896,
+ "grad_norm": 3.11767578125,
+ "learning_rate": 4.3753742112245476e-07,
+ "loss": 0.3311,
+ "step": 10283
+ },
+ {
+ "epoch": 4.862411347517731,
+ "grad_norm": 2.9610254764556885,
+ "learning_rate": 4.3718492666434576e-07,
+ "loss": 0.3234,
+ "step": 10284
+ },
+ {
+ "epoch": 4.862884160756501,
+ "grad_norm": 2.9350297451019287,
+ "learning_rate": 4.368325606486859e-07,
+ "loss": 0.3086,
+ "step": 10285
+ },
+ {
+ "epoch": 4.863356973995272,
+ "grad_norm": 3.0126571655273438,
+ "learning_rate": 4.3648032309741626e-07,
+ "loss": 0.3033,
+ "step": 10286
+ },
+ {
+ "epoch": 4.863829787234042,
+ "grad_norm": 3.0580496788024902,
+ "learning_rate": 4.3612821403246795e-07,
+ "loss": 0.3631,
+ "step": 10287
+ },
+ {
+ "epoch": 4.864302600472813,
+ "grad_norm": 2.9186129570007324,
+ "learning_rate": 4.3577623347576676e-07,
+ "loss": 0.3449,
+ "step": 10288
+ },
+ {
+ "epoch": 4.864775413711584,
+ "grad_norm": 3.146562099456787,
+ "learning_rate": 4.354243814492282e-07,
+ "loss": 0.369,
+ "step": 10289
+ },
+ {
+ "epoch": 4.865248226950355,
+ "grad_norm": 2.646812915802002,
+ "learning_rate": 4.350726579747597e-07,
+ "loss": 0.331,
+ "step": 10290
+ },
+ {
+ "epoch": 4.8657210401891255,
+ "grad_norm": 3.2851274013519287,
+ "learning_rate": 4.3472106307426293e-07,
+ "loss": 0.3445,
+ "step": 10291
+ },
+ {
+ "epoch": 4.866193853427896,
+ "grad_norm": 3.144446849822998,
+ "learning_rate": 4.34369596769629e-07,
+ "loss": 0.3687,
+ "step": 10292
+ },
+ {
+ "epoch": 4.866666666666667,
+ "grad_norm": 3.01517915725708,
+ "learning_rate": 4.3401825908274353e-07,
+ "loss": 0.3282,
+ "step": 10293
+ },
+ {
+ "epoch": 4.867139479905437,
+ "grad_norm": 3.171759605407715,
+ "learning_rate": 4.33667050035482e-07,
+ "loss": 0.351,
+ "step": 10294
+ },
+ {
+ "epoch": 4.867612293144208,
+ "grad_norm": 3.5374269485473633,
+ "learning_rate": 4.333159696497119e-07,
+ "loss": 0.3586,
+ "step": 10295
+ },
+ {
+ "epoch": 4.868085106382979,
+ "grad_norm": 3.4506356716156006,
+ "learning_rate": 4.3296501794729494e-07,
+ "loss": 0.4076,
+ "step": 10296
+ },
+ {
+ "epoch": 4.868557919621749,
+ "grad_norm": 3.348048448562622,
+ "learning_rate": 4.326141949500826e-07,
+ "loss": 0.3256,
+ "step": 10297
+ },
+ {
+ "epoch": 4.86903073286052,
+ "grad_norm": 3.235438108444214,
+ "learning_rate": 4.322635006799192e-07,
+ "loss": 0.3215,
+ "step": 10298
+ },
+ {
+ "epoch": 4.869503546099291,
+ "grad_norm": 3.2025554180145264,
+ "learning_rate": 4.319129351586407e-07,
+ "loss": 0.335,
+ "step": 10299
+ },
+ {
+ "epoch": 4.869976359338062,
+ "grad_norm": 3.0318121910095215,
+ "learning_rate": 4.315624984080749e-07,
+ "loss": 0.3304,
+ "step": 10300
+ },
+ {
+ "epoch": 4.870449172576832,
+ "grad_norm": 2.9115359783172607,
+ "learning_rate": 4.312121904500433e-07,
+ "loss": 0.3459,
+ "step": 10301
+ },
+ {
+ "epoch": 4.8709219858156025,
+ "grad_norm": 3.41164493560791,
+ "learning_rate": 4.3086201130635633e-07,
+ "loss": 0.3846,
+ "step": 10302
+ },
+ {
+ "epoch": 4.871394799054373,
+ "grad_norm": 3.5832016468048096,
+ "learning_rate": 4.305119609988198e-07,
+ "loss": 0.3422,
+ "step": 10303
+ },
+ {
+ "epoch": 4.871867612293144,
+ "grad_norm": 3.5244979858398438,
+ "learning_rate": 4.30162039549229e-07,
+ "loss": 0.3862,
+ "step": 10304
+ },
+ {
+ "epoch": 4.872340425531915,
+ "grad_norm": 3.0881710052490234,
+ "learning_rate": 4.298122469793714e-07,
+ "loss": 0.358,
+ "step": 10305
+ },
+ {
+ "epoch": 4.872813238770686,
+ "grad_norm": 3.3237557411193848,
+ "learning_rate": 4.294625833110283e-07,
+ "loss": 0.3742,
+ "step": 10306
+ },
+ {
+ "epoch": 4.8732860520094565,
+ "grad_norm": 3.1959686279296875,
+ "learning_rate": 4.291130485659711e-07,
+ "loss": 0.3426,
+ "step": 10307
+ },
+ {
+ "epoch": 4.873758865248227,
+ "grad_norm": 3.1890714168548584,
+ "learning_rate": 4.2876364276596333e-07,
+ "loss": 0.3131,
+ "step": 10308
+ },
+ {
+ "epoch": 4.874231678486998,
+ "grad_norm": 2.9387660026550293,
+ "learning_rate": 4.284143659327619e-07,
+ "loss": 0.3227,
+ "step": 10309
+ },
+ {
+ "epoch": 4.874704491725768,
+ "grad_norm": 3.6868603229522705,
+ "learning_rate": 4.2806521808811367e-07,
+ "loss": 0.3159,
+ "step": 10310
+ },
+ {
+ "epoch": 4.875177304964539,
+ "grad_norm": 3.1396310329437256,
+ "learning_rate": 4.277161992537596e-07,
+ "loss": 0.3757,
+ "step": 10311
+ },
+ {
+ "epoch": 4.87565011820331,
+ "grad_norm": 3.4745748043060303,
+ "learning_rate": 4.273673094514313e-07,
+ "loss": 0.347,
+ "step": 10312
+ },
+ {
+ "epoch": 4.87612293144208,
+ "grad_norm": 3.1869146823883057,
+ "learning_rate": 4.270185487028525e-07,
+ "loss": 0.3364,
+ "step": 10313
+ },
+ {
+ "epoch": 4.876595744680851,
+ "grad_norm": 2.8646297454833984,
+ "learning_rate": 4.2666991702973807e-07,
+ "loss": 0.2987,
+ "step": 10314
+ },
+ {
+ "epoch": 4.877068557919622,
+ "grad_norm": 3.3483452796936035,
+ "learning_rate": 4.263214144537975e-07,
+ "loss": 0.307,
+ "step": 10315
+ },
+ {
+ "epoch": 4.877541371158393,
+ "grad_norm": 2.8557562828063965,
+ "learning_rate": 4.259730409967294e-07,
+ "loss": 0.3406,
+ "step": 10316
+ },
+ {
+ "epoch": 4.878014184397163,
+ "grad_norm": 3.351121664047241,
+ "learning_rate": 4.256247966802257e-07,
+ "loss": 0.3571,
+ "step": 10317
+ },
+ {
+ "epoch": 4.8784869976359335,
+ "grad_norm": 3.1691417694091797,
+ "learning_rate": 4.252766815259696e-07,
+ "loss": 0.3686,
+ "step": 10318
+ },
+ {
+ "epoch": 4.878959810874704,
+ "grad_norm": 2.957632303237915,
+ "learning_rate": 4.249286955556378e-07,
+ "loss": 0.3055,
+ "step": 10319
+ },
+ {
+ "epoch": 4.879432624113475,
+ "grad_norm": 3.234708070755005,
+ "learning_rate": 4.2458083879089645e-07,
+ "loss": 0.3733,
+ "step": 10320
+ },
+ {
+ "epoch": 4.879905437352246,
+ "grad_norm": 3.469207525253296,
+ "learning_rate": 4.242331112534065e-07,
+ "loss": 0.3758,
+ "step": 10321
+ },
+ {
+ "epoch": 4.880378250591017,
+ "grad_norm": 3.2442891597747803,
+ "learning_rate": 4.2388551296481896e-07,
+ "loss": 0.3515,
+ "step": 10322
+ },
+ {
+ "epoch": 4.8808510638297875,
+ "grad_norm": 3.3709537982940674,
+ "learning_rate": 4.235380439467762e-07,
+ "loss": 0.421,
+ "step": 10323
+ },
+ {
+ "epoch": 4.881323877068558,
+ "grad_norm": 2.730891227722168,
+ "learning_rate": 4.231907042209149e-07,
+ "loss": 0.3105,
+ "step": 10324
+ },
+ {
+ "epoch": 4.881796690307329,
+ "grad_norm": 3.6933813095092773,
+ "learning_rate": 4.228434938088616e-07,
+ "loss": 0.338,
+ "step": 10325
+ },
+ {
+ "epoch": 4.882269503546099,
+ "grad_norm": 3.2480294704437256,
+ "learning_rate": 4.224964127322362e-07,
+ "loss": 0.3695,
+ "step": 10326
+ },
+ {
+ "epoch": 4.88274231678487,
+ "grad_norm": 3.229762554168701,
+ "learning_rate": 4.2214946101264976e-07,
+ "loss": 0.3768,
+ "step": 10327
+ },
+ {
+ "epoch": 4.883215130023641,
+ "grad_norm": 3.3844475746154785,
+ "learning_rate": 4.218026386717047e-07,
+ "loss": 0.3441,
+ "step": 10328
+ },
+ {
+ "epoch": 4.883687943262411,
+ "grad_norm": 3.159759283065796,
+ "learning_rate": 4.2145594573099745e-07,
+ "loss": 0.3459,
+ "step": 10329
+ },
+ {
+ "epoch": 4.884160756501182,
+ "grad_norm": 3.5672366619110107,
+ "learning_rate": 4.21109382212114e-07,
+ "loss": 0.3908,
+ "step": 10330
+ },
+ {
+ "epoch": 4.884633569739953,
+ "grad_norm": 3.2481353282928467,
+ "learning_rate": 4.2076294813663405e-07,
+ "loss": 0.3778,
+ "step": 10331
+ },
+ {
+ "epoch": 4.885106382978723,
+ "grad_norm": 3.3311941623687744,
+ "learning_rate": 4.2041664352612785e-07,
+ "loss": 0.3171,
+ "step": 10332
+ },
+ {
+ "epoch": 4.885579196217494,
+ "grad_norm": 3.4712841510772705,
+ "learning_rate": 4.2007046840215783e-07,
+ "loss": 0.3858,
+ "step": 10333
+ },
+ {
+ "epoch": 4.8860520094562645,
+ "grad_norm": 3.1591062545776367,
+ "learning_rate": 4.197244227862804e-07,
+ "loss": 0.327,
+ "step": 10334
+ },
+ {
+ "epoch": 4.886524822695035,
+ "grad_norm": 3.400400400161743,
+ "learning_rate": 4.1937850670004136e-07,
+ "loss": 0.3231,
+ "step": 10335
+ },
+ {
+ "epoch": 4.886997635933806,
+ "grad_norm": 2.9156908988952637,
+ "learning_rate": 4.190327201649788e-07,
+ "loss": 0.2834,
+ "step": 10336
+ },
+ {
+ "epoch": 4.887470449172577,
+ "grad_norm": 3.0125153064727783,
+ "learning_rate": 4.1868706320262467e-07,
+ "loss": 0.3143,
+ "step": 10337
+ },
+ {
+ "epoch": 4.887943262411348,
+ "grad_norm": 2.656107187271118,
+ "learning_rate": 4.183415358345003e-07,
+ "loss": 0.3348,
+ "step": 10338
+ },
+ {
+ "epoch": 4.8884160756501185,
+ "grad_norm": 3.0910565853118896,
+ "learning_rate": 4.17996138082121e-07,
+ "loss": 0.3212,
+ "step": 10339
+ },
+ {
+ "epoch": 4.888888888888889,
+ "grad_norm": 3.1303164958953857,
+ "learning_rate": 4.1765086996699315e-07,
+ "loss": 0.3573,
+ "step": 10340
+ },
+ {
+ "epoch": 4.889361702127659,
+ "grad_norm": 3.504901885986328,
+ "learning_rate": 4.173057315106141e-07,
+ "loss": 0.3912,
+ "step": 10341
+ },
+ {
+ "epoch": 4.88983451536643,
+ "grad_norm": 2.994338035583496,
+ "learning_rate": 4.1696072273447547e-07,
+ "loss": 0.3896,
+ "step": 10342
+ },
+ {
+ "epoch": 4.890307328605201,
+ "grad_norm": 3.0409624576568604,
+ "learning_rate": 4.1661584366005814e-07,
+ "loss": 0.3109,
+ "step": 10343
+ },
+ {
+ "epoch": 4.890780141843972,
+ "grad_norm": 3.479952096939087,
+ "learning_rate": 4.1627109430883743e-07,
+ "loss": 0.3265,
+ "step": 10344
+ },
+ {
+ "epoch": 4.891252955082742,
+ "grad_norm": 3.0288894176483154,
+ "learning_rate": 4.159264747022787e-07,
+ "loss": 0.3345,
+ "step": 10345
+ },
+ {
+ "epoch": 4.891725768321513,
+ "grad_norm": 3.7433063983917236,
+ "learning_rate": 4.1558198486184005e-07,
+ "loss": 0.3888,
+ "step": 10346
+ },
+ {
+ "epoch": 4.892198581560284,
+ "grad_norm": 3.431964635848999,
+ "learning_rate": 4.152376248089715e-07,
+ "loss": 0.3062,
+ "step": 10347
+ },
+ {
+ "epoch": 4.892671394799054,
+ "grad_norm": 3.3993113040924072,
+ "learning_rate": 4.1489339456511376e-07,
+ "loss": 0.3955,
+ "step": 10348
+ },
+ {
+ "epoch": 4.893144208037825,
+ "grad_norm": 3.09287428855896,
+ "learning_rate": 4.145492941517024e-07,
+ "loss": 0.2857,
+ "step": 10349
+ },
+ {
+ "epoch": 4.8936170212765955,
+ "grad_norm": 3.355915069580078,
+ "learning_rate": 4.1420532359016166e-07,
+ "loss": 0.3403,
+ "step": 10350
+ },
+ {
+ "epoch": 4.894089834515366,
+ "grad_norm": 4.00920295715332,
+ "learning_rate": 4.1386148290190915e-07,
+ "loss": 0.3455,
+ "step": 10351
+ },
+ {
+ "epoch": 4.894562647754137,
+ "grad_norm": 3.408311605453491,
+ "learning_rate": 4.1351777210835524e-07,
+ "loss": 0.3606,
+ "step": 10352
+ },
+ {
+ "epoch": 4.895035460992908,
+ "grad_norm": 3.031616449356079,
+ "learning_rate": 4.1317419123090007e-07,
+ "loss": 0.3696,
+ "step": 10353
+ },
+ {
+ "epoch": 4.895508274231679,
+ "grad_norm": 3.555751085281372,
+ "learning_rate": 4.1283074029093814e-07,
+ "loss": 0.3197,
+ "step": 10354
+ },
+ {
+ "epoch": 4.8959810874704495,
+ "grad_norm": 3.3839752674102783,
+ "learning_rate": 4.124874193098541e-07,
+ "loss": 0.3744,
+ "step": 10355
+ },
+ {
+ "epoch": 4.89645390070922,
+ "grad_norm": 3.514296531677246,
+ "learning_rate": 4.1214422830902406e-07,
+ "loss": 0.29,
+ "step": 10356
+ },
+ {
+ "epoch": 4.89692671394799,
+ "grad_norm": 3.056325674057007,
+ "learning_rate": 4.1180116730981905e-07,
+ "loss": 0.371,
+ "step": 10357
+ },
+ {
+ "epoch": 4.897399527186761,
+ "grad_norm": 3.7567055225372314,
+ "learning_rate": 4.1145823633359865e-07,
+ "loss": 0.4105,
+ "step": 10358
+ },
+ {
+ "epoch": 4.897872340425532,
+ "grad_norm": 3.0050766468048096,
+ "learning_rate": 4.111154354017152e-07,
+ "loss": 0.3262,
+ "step": 10359
+ },
+ {
+ "epoch": 4.898345153664303,
+ "grad_norm": 3.2767333984375,
+ "learning_rate": 4.1077276453551476e-07,
+ "loss": 0.3253,
+ "step": 10360
+ },
+ {
+ "epoch": 4.898817966903073,
+ "grad_norm": 4.133147239685059,
+ "learning_rate": 4.1043022375633347e-07,
+ "loss": 0.4549,
+ "step": 10361
+ },
+ {
+ "epoch": 4.899290780141844,
+ "grad_norm": 3.372962236404419,
+ "learning_rate": 4.1008781308549934e-07,
+ "loss": 0.315,
+ "step": 10362
+ },
+ {
+ "epoch": 4.899763593380615,
+ "grad_norm": 3.4167628288269043,
+ "learning_rate": 4.0974553254433335e-07,
+ "loss": 0.3832,
+ "step": 10363
+ },
+ {
+ "epoch": 4.900236406619385,
+ "grad_norm": 3.103311061859131,
+ "learning_rate": 4.094033821541468e-07,
+ "loss": 0.3347,
+ "step": 10364
+ },
+ {
+ "epoch": 4.900709219858156,
+ "grad_norm": 2.95872163772583,
+ "learning_rate": 4.0906136193624547e-07,
+ "loss": 0.2861,
+ "step": 10365
+ },
+ {
+ "epoch": 4.9011820330969265,
+ "grad_norm": 3.1035397052764893,
+ "learning_rate": 4.087194719119239e-07,
+ "loss": 0.3089,
+ "step": 10366
+ },
+ {
+ "epoch": 4.901654846335697,
+ "grad_norm": 3.0228095054626465,
+ "learning_rate": 4.083777121024715e-07,
+ "loss": 0.38,
+ "step": 10367
+ },
+ {
+ "epoch": 4.902127659574468,
+ "grad_norm": 3.3528707027435303,
+ "learning_rate": 4.080360825291674e-07,
+ "loss": 0.3614,
+ "step": 10368
+ },
+ {
+ "epoch": 4.902600472813239,
+ "grad_norm": 3.5866968631744385,
+ "learning_rate": 4.076945832132828e-07,
+ "loss": 0.3751,
+ "step": 10369
+ },
+ {
+ "epoch": 4.90307328605201,
+ "grad_norm": 3.388880729675293,
+ "learning_rate": 4.0735321417608276e-07,
+ "loss": 0.3358,
+ "step": 10370
+ },
+ {
+ "epoch": 4.9035460992907804,
+ "grad_norm": 3.5489447116851807,
+ "learning_rate": 4.070119754388213e-07,
+ "loss": 0.3437,
+ "step": 10371
+ },
+ {
+ "epoch": 4.904018912529551,
+ "grad_norm": 2.8234825134277344,
+ "learning_rate": 4.0667086702274733e-07,
+ "loss": 0.3164,
+ "step": 10372
+ },
+ {
+ "epoch": 4.904491725768321,
+ "grad_norm": 3.337445020675659,
+ "learning_rate": 4.0632988894909965e-07,
+ "loss": 0.3213,
+ "step": 10373
+ },
+ {
+ "epoch": 4.904964539007092,
+ "grad_norm": 3.639477491378784,
+ "learning_rate": 4.0598904123910847e-07,
+ "loss": 0.3642,
+ "step": 10374
+ },
+ {
+ "epoch": 4.905437352245863,
+ "grad_norm": 3.101829767227173,
+ "learning_rate": 4.0564832391399857e-07,
+ "loss": 0.3415,
+ "step": 10375
+ },
+ {
+ "epoch": 4.9059101654846335,
+ "grad_norm": 2.8291256427764893,
+ "learning_rate": 4.05307736994984e-07,
+ "loss": 0.3014,
+ "step": 10376
+ },
+ {
+ "epoch": 4.906382978723404,
+ "grad_norm": 2.8689401149749756,
+ "learning_rate": 4.049672805032717e-07,
+ "loss": 0.3151,
+ "step": 10377
+ },
+ {
+ "epoch": 4.906855791962175,
+ "grad_norm": 3.468038320541382,
+ "learning_rate": 4.046269544600598e-07,
+ "loss": 0.3956,
+ "step": 10378
+ },
+ {
+ "epoch": 4.907328605200946,
+ "grad_norm": 3.5246312618255615,
+ "learning_rate": 4.042867588865401e-07,
+ "loss": 0.3003,
+ "step": 10379
+ },
+ {
+ "epoch": 4.907801418439716,
+ "grad_norm": 3.273010730743408,
+ "learning_rate": 4.039466938038944e-07,
+ "loss": 0.3036,
+ "step": 10380
+ },
+ {
+ "epoch": 4.908274231678487,
+ "grad_norm": 3.064718008041382,
+ "learning_rate": 4.0360675923329733e-07,
+ "loss": 0.3353,
+ "step": 10381
+ },
+ {
+ "epoch": 4.908747044917257,
+ "grad_norm": 3.413242816925049,
+ "learning_rate": 4.032669551959142e-07,
+ "loss": 0.354,
+ "step": 10382
+ },
+ {
+ "epoch": 4.909219858156028,
+ "grad_norm": 3.136293411254883,
+ "learning_rate": 4.029272817129046e-07,
+ "loss": 0.331,
+ "step": 10383
+ },
+ {
+ "epoch": 4.909692671394799,
+ "grad_norm": 3.0966274738311768,
+ "learning_rate": 4.025877388054172e-07,
+ "loss": 0.2878,
+ "step": 10384
+ },
+ {
+ "epoch": 4.91016548463357,
+ "grad_norm": 3.334113836288452,
+ "learning_rate": 4.022483264945948e-07,
+ "loss": 0.3276,
+ "step": 10385
+ },
+ {
+ "epoch": 4.910638297872341,
+ "grad_norm": 3.2662229537963867,
+ "learning_rate": 4.019090448015711e-07,
+ "loss": 0.3265,
+ "step": 10386
+ },
+ {
+ "epoch": 4.911111111111111,
+ "grad_norm": 3.134220838546753,
+ "learning_rate": 4.0156989374747047e-07,
+ "loss": 0.3684,
+ "step": 10387
+ },
+ {
+ "epoch": 4.911583924349882,
+ "grad_norm": 3.803694725036621,
+ "learning_rate": 4.012308733534118e-07,
+ "loss": 0.3394,
+ "step": 10388
+ },
+ {
+ "epoch": 4.912056737588652,
+ "grad_norm": 2.788388252258301,
+ "learning_rate": 4.008919836405034e-07,
+ "loss": 0.2835,
+ "step": 10389
+ },
+ {
+ "epoch": 4.912529550827423,
+ "grad_norm": 3.3408966064453125,
+ "learning_rate": 4.005532246298474e-07,
+ "loss": 0.3694,
+ "step": 10390
+ },
+ {
+ "epoch": 4.913002364066194,
+ "grad_norm": 2.913114547729492,
+ "learning_rate": 4.0021459634253605e-07,
+ "loss": 0.3456,
+ "step": 10391
+ },
+ {
+ "epoch": 4.9134751773049645,
+ "grad_norm": 3.778111457824707,
+ "learning_rate": 3.9987609879965414e-07,
+ "loss": 0.3887,
+ "step": 10392
+ },
+ {
+ "epoch": 4.913947990543735,
+ "grad_norm": 2.871978282928467,
+ "learning_rate": 3.995377320222796e-07,
+ "loss": 0.28,
+ "step": 10393
+ },
+ {
+ "epoch": 4.914420803782506,
+ "grad_norm": 3.5189783573150635,
+ "learning_rate": 3.9919949603147987e-07,
+ "loss": 0.3802,
+ "step": 10394
+ },
+ {
+ "epoch": 4.914893617021277,
+ "grad_norm": 3.381014585494995,
+ "learning_rate": 3.9886139084831607e-07,
+ "loss": 0.3661,
+ "step": 10395
+ },
+ {
+ "epoch": 4.915366430260047,
+ "grad_norm": 2.908207654953003,
+ "learning_rate": 3.9852341649384006e-07,
+ "loss": 0.3228,
+ "step": 10396
+ },
+ {
+ "epoch": 4.915839243498818,
+ "grad_norm": 3.4134814739227295,
+ "learning_rate": 3.981855729890957e-07,
+ "loss": 0.3149,
+ "step": 10397
+ },
+ {
+ "epoch": 4.916312056737588,
+ "grad_norm": 4.496891975402832,
+ "learning_rate": 3.9784786035512004e-07,
+ "loss": 0.3516,
+ "step": 10398
+ },
+ {
+ "epoch": 4.916784869976359,
+ "grad_norm": 3.2910919189453125,
+ "learning_rate": 3.975102786129398e-07,
+ "loss": 0.3329,
+ "step": 10399
+ },
+ {
+ "epoch": 4.91725768321513,
+ "grad_norm": 3.6607260704040527,
+ "learning_rate": 3.97172827783576e-07,
+ "loss": 0.3878,
+ "step": 10400
+ },
+ {
+ "epoch": 4.917730496453901,
+ "grad_norm": 3.1500742435455322,
+ "learning_rate": 3.9683550788803983e-07,
+ "loss": 0.3323,
+ "step": 10401
+ },
+ {
+ "epoch": 4.918203309692672,
+ "grad_norm": 3.263714075088501,
+ "learning_rate": 3.964983189473337e-07,
+ "loss": 0.352,
+ "step": 10402
+ },
+ {
+ "epoch": 4.918676122931442,
+ "grad_norm": 3.433868408203125,
+ "learning_rate": 3.961612609824542e-07,
+ "loss": 0.3308,
+ "step": 10403
+ },
+ {
+ "epoch": 4.919148936170213,
+ "grad_norm": 3.3086423873901367,
+ "learning_rate": 3.95824334014388e-07,
+ "loss": 0.3641,
+ "step": 10404
+ },
+ {
+ "epoch": 4.919621749408983,
+ "grad_norm": 3.2854621410369873,
+ "learning_rate": 3.954875380641135e-07,
+ "loss": 0.3405,
+ "step": 10405
+ },
+ {
+ "epoch": 4.920094562647754,
+ "grad_norm": 3.1408650875091553,
+ "learning_rate": 3.9515087315260244e-07,
+ "loss": 0.3103,
+ "step": 10406
+ },
+ {
+ "epoch": 4.920567375886525,
+ "grad_norm": 2.9340312480926514,
+ "learning_rate": 3.948143393008164e-07,
+ "loss": 0.3405,
+ "step": 10407
+ },
+ {
+ "epoch": 4.9210401891252955,
+ "grad_norm": 3.525876522064209,
+ "learning_rate": 3.944779365297113e-07,
+ "loss": 0.3464,
+ "step": 10408
+ },
+ {
+ "epoch": 4.921513002364066,
+ "grad_norm": 3.26991605758667,
+ "learning_rate": 3.9414166486023253e-07,
+ "loss": 0.3529,
+ "step": 10409
+ },
+ {
+ "epoch": 4.921985815602837,
+ "grad_norm": 2.7669694423675537,
+ "learning_rate": 3.938055243133182e-07,
+ "loss": 0.3242,
+ "step": 10410
+ },
+ {
+ "epoch": 4.922458628841608,
+ "grad_norm": 2.8268136978149414,
+ "learning_rate": 3.934695149098988e-07,
+ "loss": 0.3086,
+ "step": 10411
+ },
+ {
+ "epoch": 4.922931442080378,
+ "grad_norm": 3.119053602218628,
+ "learning_rate": 3.931336366708952e-07,
+ "loss": 0.3065,
+ "step": 10412
+ },
+ {
+ "epoch": 4.923404255319149,
+ "grad_norm": 3.1537275314331055,
+ "learning_rate": 3.9279788961722215e-07,
+ "loss": 0.3325,
+ "step": 10413
+ },
+ {
+ "epoch": 4.923877068557919,
+ "grad_norm": 3.1365256309509277,
+ "learning_rate": 3.9246227376978476e-07,
+ "loss": 0.4139,
+ "step": 10414
+ },
+ {
+ "epoch": 4.92434988179669,
+ "grad_norm": 3.3495218753814697,
+ "learning_rate": 3.921267891494798e-07,
+ "loss": 0.3463,
+ "step": 10415
+ },
+ {
+ "epoch": 4.924822695035461,
+ "grad_norm": 3.2402634620666504,
+ "learning_rate": 3.9179143577719736e-07,
+ "loss": 0.3499,
+ "step": 10416
+ },
+ {
+ "epoch": 4.925295508274232,
+ "grad_norm": 2.986429452896118,
+ "learning_rate": 3.914562136738176e-07,
+ "loss": 0.3326,
+ "step": 10417
+ },
+ {
+ "epoch": 4.925768321513003,
+ "grad_norm": 3.1276674270629883,
+ "learning_rate": 3.9112112286021407e-07,
+ "loss": 0.3087,
+ "step": 10418
+ },
+ {
+ "epoch": 4.926241134751773,
+ "grad_norm": 3.1767871379852295,
+ "learning_rate": 3.9078616335725126e-07,
+ "loss": 0.3804,
+ "step": 10419
+ },
+ {
+ "epoch": 4.926713947990544,
+ "grad_norm": 3.1657216548919678,
+ "learning_rate": 3.904513351857847e-07,
+ "loss": 0.333,
+ "step": 10420
+ },
+ {
+ "epoch": 4.927186761229314,
+ "grad_norm": 3.005009174346924,
+ "learning_rate": 3.901166383666641e-07,
+ "loss": 0.3583,
+ "step": 10421
+ },
+ {
+ "epoch": 4.927659574468085,
+ "grad_norm": 2.900146722793579,
+ "learning_rate": 3.897820729207283e-07,
+ "loss": 0.3061,
+ "step": 10422
+ },
+ {
+ "epoch": 4.928132387706856,
+ "grad_norm": 3.2418317794799805,
+ "learning_rate": 3.8944763886881037e-07,
+ "loss": 0.3822,
+ "step": 10423
+ },
+ {
+ "epoch": 4.9286052009456265,
+ "grad_norm": 3.1222848892211914,
+ "learning_rate": 3.8911333623173344e-07,
+ "loss": 0.3167,
+ "step": 10424
+ },
+ {
+ "epoch": 4.929078014184397,
+ "grad_norm": 2.727388858795166,
+ "learning_rate": 3.8877916503031325e-07,
+ "loss": 0.2977,
+ "step": 10425
+ },
+ {
+ "epoch": 4.929550827423168,
+ "grad_norm": 3.190159797668457,
+ "learning_rate": 3.884451252853569e-07,
+ "loss": 0.3399,
+ "step": 10426
+ },
+ {
+ "epoch": 4.930023640661939,
+ "grad_norm": 3.253791570663452,
+ "learning_rate": 3.8811121701766373e-07,
+ "loss": 0.3806,
+ "step": 10427
+ },
+ {
+ "epoch": 4.930496453900709,
+ "grad_norm": 3.4284887313842773,
+ "learning_rate": 3.8777744024802414e-07,
+ "loss": 0.3437,
+ "step": 10428
+ },
+ {
+ "epoch": 4.93096926713948,
+ "grad_norm": 3.3665032386779785,
+ "learning_rate": 3.874437949972221e-07,
+ "loss": 0.375,
+ "step": 10429
+ },
+ {
+ "epoch": 4.93144208037825,
+ "grad_norm": 3.1024677753448486,
+ "learning_rate": 3.8711028128603084e-07,
+ "loss": 0.3493,
+ "step": 10430
+ },
+ {
+ "epoch": 4.931914893617021,
+ "grad_norm": 3.599743604660034,
+ "learning_rate": 3.867768991352186e-07,
+ "loss": 0.3852,
+ "step": 10431
+ },
+ {
+ "epoch": 4.932387706855792,
+ "grad_norm": 3.3676376342773438,
+ "learning_rate": 3.8644364856554236e-07,
+ "loss": 0.3489,
+ "step": 10432
+ },
+ {
+ "epoch": 4.932860520094563,
+ "grad_norm": 3.2186801433563232,
+ "learning_rate": 3.861105295977521e-07,
+ "loss": 0.3413,
+ "step": 10433
+ },
+ {
+ "epoch": 4.933333333333334,
+ "grad_norm": 3.3672704696655273,
+ "learning_rate": 3.8577754225259055e-07,
+ "loss": 0.3392,
+ "step": 10434
+ },
+ {
+ "epoch": 4.933806146572104,
+ "grad_norm": 3.4285950660705566,
+ "learning_rate": 3.854446865507902e-07,
+ "loss": 0.3188,
+ "step": 10435
+ },
+ {
+ "epoch": 4.934278959810875,
+ "grad_norm": 3.187617063522339,
+ "learning_rate": 3.8511196251307783e-07,
+ "loss": 0.3258,
+ "step": 10436
+ },
+ {
+ "epoch": 4.934751773049645,
+ "grad_norm": 2.9744882583618164,
+ "learning_rate": 3.847793701601699e-07,
+ "loss": 0.3879,
+ "step": 10437
+ },
+ {
+ "epoch": 4.935224586288416,
+ "grad_norm": 2.9745848178863525,
+ "learning_rate": 3.844469095127751e-07,
+ "loss": 0.3178,
+ "step": 10438
+ },
+ {
+ "epoch": 4.935697399527187,
+ "grad_norm": 3.7419471740722656,
+ "learning_rate": 3.841145805915955e-07,
+ "loss": 0.3167,
+ "step": 10439
+ },
+ {
+ "epoch": 4.9361702127659575,
+ "grad_norm": 4.295339107513428,
+ "learning_rate": 3.837823834173232e-07,
+ "loss": 0.3209,
+ "step": 10440
+ },
+ {
+ "epoch": 4.936643026004728,
+ "grad_norm": 3.4612984657287598,
+ "learning_rate": 3.8345031801064217e-07,
+ "loss": 0.3351,
+ "step": 10441
+ },
+ {
+ "epoch": 4.937115839243499,
+ "grad_norm": 3.0626909732818604,
+ "learning_rate": 3.8311838439222953e-07,
+ "loss": 0.3395,
+ "step": 10442
+ },
+ {
+ "epoch": 4.93758865248227,
+ "grad_norm": 3.606682538986206,
+ "learning_rate": 3.827865825827518e-07,
+ "loss": 0.4112,
+ "step": 10443
+ },
+ {
+ "epoch": 4.93806146572104,
+ "grad_norm": 3.3908627033233643,
+ "learning_rate": 3.8245491260287064e-07,
+ "loss": 0.3367,
+ "step": 10444
+ },
+ {
+ "epoch": 4.938534278959811,
+ "grad_norm": 2.8598084449768066,
+ "learning_rate": 3.821233744732364e-07,
+ "loss": 0.3408,
+ "step": 10445
+ },
+ {
+ "epoch": 4.939007092198581,
+ "grad_norm": 3.207010269165039,
+ "learning_rate": 3.8179196821449354e-07,
+ "loss": 0.3301,
+ "step": 10446
+ },
+ {
+ "epoch": 4.939479905437352,
+ "grad_norm": 3.018414258956909,
+ "learning_rate": 3.8146069384727674e-07,
+ "loss": 0.3622,
+ "step": 10447
+ },
+ {
+ "epoch": 4.939952718676123,
+ "grad_norm": 3.399415969848633,
+ "learning_rate": 3.811295513922125e-07,
+ "loss": 0.3525,
+ "step": 10448
+ },
+ {
+ "epoch": 4.940425531914894,
+ "grad_norm": 3.175705671310425,
+ "learning_rate": 3.807985408699208e-07,
+ "loss": 0.322,
+ "step": 10449
+ },
+ {
+ "epoch": 4.9408983451536646,
+ "grad_norm": 2.906064033508301,
+ "learning_rate": 3.804676623010109e-07,
+ "loss": 0.3246,
+ "step": 10450
+ },
+ {
+ "epoch": 4.941371158392435,
+ "grad_norm": 3.1224400997161865,
+ "learning_rate": 3.8013691570608634e-07,
+ "loss": 0.3607,
+ "step": 10451
+ },
+ {
+ "epoch": 4.941843971631206,
+ "grad_norm": 3.4386677742004395,
+ "learning_rate": 3.7980630110574067e-07,
+ "loss": 0.3315,
+ "step": 10452
+ },
+ {
+ "epoch": 4.942316784869976,
+ "grad_norm": 3.432509183883667,
+ "learning_rate": 3.794758185205594e-07,
+ "loss": 0.3713,
+ "step": 10453
+ },
+ {
+ "epoch": 4.942789598108747,
+ "grad_norm": 3.314802646636963,
+ "learning_rate": 3.7914546797112097e-07,
+ "loss": 0.3587,
+ "step": 10454
+ },
+ {
+ "epoch": 4.943262411347518,
+ "grad_norm": 2.9151065349578857,
+ "learning_rate": 3.788152494779948e-07,
+ "loss": 0.367,
+ "step": 10455
+ },
+ {
+ "epoch": 4.9437352245862884,
+ "grad_norm": 3.3444712162017822,
+ "learning_rate": 3.784851630617414e-07,
+ "loss": 0.4009,
+ "step": 10456
+ },
+ {
+ "epoch": 4.944208037825059,
+ "grad_norm": 3.2677152156829834,
+ "learning_rate": 3.7815520874291494e-07,
+ "loss": 0.3553,
+ "step": 10457
+ },
+ {
+ "epoch": 4.94468085106383,
+ "grad_norm": 3.2326159477233887,
+ "learning_rate": 3.7782538654205946e-07,
+ "loss": 0.4008,
+ "step": 10458
+ },
+ {
+ "epoch": 4.945153664302601,
+ "grad_norm": 3.3304033279418945,
+ "learning_rate": 3.774956964797119e-07,
+ "loss": 0.3591,
+ "step": 10459
+ },
+ {
+ "epoch": 4.945626477541371,
+ "grad_norm": 3.038605213165283,
+ "learning_rate": 3.7716613857640026e-07,
+ "loss": 0.2907,
+ "step": 10460
+ },
+ {
+ "epoch": 4.9460992907801415,
+ "grad_norm": 3.016227960586548,
+ "learning_rate": 3.768367128526443e-07,
+ "loss": 0.2898,
+ "step": 10461
+ },
+ {
+ "epoch": 4.946572104018912,
+ "grad_norm": 3.354973316192627,
+ "learning_rate": 3.76507419328957e-07,
+ "loss": 0.3498,
+ "step": 10462
+ },
+ {
+ "epoch": 4.947044917257683,
+ "grad_norm": 3.5561892986297607,
+ "learning_rate": 3.761782580258408e-07,
+ "loss": 0.4384,
+ "step": 10463
+ },
+ {
+ "epoch": 4.947517730496454,
+ "grad_norm": 3.2498281002044678,
+ "learning_rate": 3.7584922896379244e-07,
+ "loss": 0.3289,
+ "step": 10464
+ },
+ {
+ "epoch": 4.947990543735225,
+ "grad_norm": 3.250598907470703,
+ "learning_rate": 3.755203321632986e-07,
+ "loss": 0.4104,
+ "step": 10465
+ },
+ {
+ "epoch": 4.9484633569739955,
+ "grad_norm": 2.8788363933563232,
+ "learning_rate": 3.7519156764483727e-07,
+ "loss": 0.2896,
+ "step": 10466
+ },
+ {
+ "epoch": 4.948936170212766,
+ "grad_norm": 3.068180561065674,
+ "learning_rate": 3.7486293542888075e-07,
+ "loss": 0.3346,
+ "step": 10467
+ },
+ {
+ "epoch": 4.949408983451537,
+ "grad_norm": 3.4533181190490723,
+ "learning_rate": 3.7453443553589043e-07,
+ "loss": 0.3917,
+ "step": 10468
+ },
+ {
+ "epoch": 4.949881796690307,
+ "grad_norm": 2.8812358379364014,
+ "learning_rate": 3.7420606798632104e-07,
+ "loss": 0.3276,
+ "step": 10469
+ },
+ {
+ "epoch": 4.950354609929078,
+ "grad_norm": 3.0952184200286865,
+ "learning_rate": 3.7387783280061875e-07,
+ "loss": 0.3261,
+ "step": 10470
+ },
+ {
+ "epoch": 4.950827423167849,
+ "grad_norm": 3.2409560680389404,
+ "learning_rate": 3.735497299992205e-07,
+ "loss": 0.3504,
+ "step": 10471
+ },
+ {
+ "epoch": 4.951300236406619,
+ "grad_norm": 3.3790557384490967,
+ "learning_rate": 3.73221759602557e-07,
+ "loss": 0.3316,
+ "step": 10472
+ },
+ {
+ "epoch": 4.95177304964539,
+ "grad_norm": 3.2161364555358887,
+ "learning_rate": 3.728939216310487e-07,
+ "loss": 0.3364,
+ "step": 10473
+ },
+ {
+ "epoch": 4.952245862884161,
+ "grad_norm": 3.3514342308044434,
+ "learning_rate": 3.7256621610510884e-07,
+ "loss": 0.3912,
+ "step": 10474
+ },
+ {
+ "epoch": 4.952718676122932,
+ "grad_norm": 2.7333486080169678,
+ "learning_rate": 3.722386430451422e-07,
+ "loss": 0.3145,
+ "step": 10475
+ },
+ {
+ "epoch": 4.953191489361702,
+ "grad_norm": 3.104905128479004,
+ "learning_rate": 3.719112024715449e-07,
+ "loss": 0.3599,
+ "step": 10476
+ },
+ {
+ "epoch": 4.9536643026004725,
+ "grad_norm": 3.16666579246521,
+ "learning_rate": 3.715838944047059e-07,
+ "loss": 0.3462,
+ "step": 10477
+ },
+ {
+ "epoch": 4.954137115839243,
+ "grad_norm": 3.078171491622925,
+ "learning_rate": 3.7125671886500514e-07,
+ "loss": 0.3119,
+ "step": 10478
+ },
+ {
+ "epoch": 4.954609929078014,
+ "grad_norm": 3.261456251144409,
+ "learning_rate": 3.709296758728137e-07,
+ "loss": 0.3959,
+ "step": 10479
+ },
+ {
+ "epoch": 4.955082742316785,
+ "grad_norm": 3.0302278995513916,
+ "learning_rate": 3.706027654484962e-07,
+ "loss": 0.3526,
+ "step": 10480
+ },
+ {
+ "epoch": 4.955555555555556,
+ "grad_norm": 3.175342559814453,
+ "learning_rate": 3.702759876124068e-07,
+ "loss": 0.3237,
+ "step": 10481
+ },
+ {
+ "epoch": 4.9560283687943265,
+ "grad_norm": 3.4779844284057617,
+ "learning_rate": 3.699493423848938e-07,
+ "loss": 0.3075,
+ "step": 10482
+ },
+ {
+ "epoch": 4.956501182033097,
+ "grad_norm": 2.809904098510742,
+ "learning_rate": 3.69622829786295e-07,
+ "loss": 0.3238,
+ "step": 10483
+ },
+ {
+ "epoch": 4.956973995271868,
+ "grad_norm": 3.092604875564575,
+ "learning_rate": 3.692964498369406e-07,
+ "loss": 0.3344,
+ "step": 10484
+ },
+ {
+ "epoch": 4.957446808510638,
+ "grad_norm": 3.477560520172119,
+ "learning_rate": 3.689702025571543e-07,
+ "loss": 0.3525,
+ "step": 10485
+ },
+ {
+ "epoch": 4.957919621749409,
+ "grad_norm": 4.119097709655762,
+ "learning_rate": 3.6864408796724815e-07,
+ "loss": 0.3953,
+ "step": 10486
+ },
+ {
+ "epoch": 4.95839243498818,
+ "grad_norm": 3.1418824195861816,
+ "learning_rate": 3.6831810608752986e-07,
+ "loss": 0.3689,
+ "step": 10487
+ },
+ {
+ "epoch": 4.95886524822695,
+ "grad_norm": 3.1947824954986572,
+ "learning_rate": 3.6799225693829596e-07,
+ "loss": 0.3427,
+ "step": 10488
+ },
+ {
+ "epoch": 4.959338061465721,
+ "grad_norm": 3.196894884109497,
+ "learning_rate": 3.6766654053983554e-07,
+ "loss": 0.3138,
+ "step": 10489
+ },
+ {
+ "epoch": 4.959810874704492,
+ "grad_norm": 2.9747161865234375,
+ "learning_rate": 3.6734095691242975e-07,
+ "loss": 0.3336,
+ "step": 10490
+ },
+ {
+ "epoch": 4.960283687943263,
+ "grad_norm": 3.2788970470428467,
+ "learning_rate": 3.670155060763503e-07,
+ "loss": 0.3418,
+ "step": 10491
+ },
+ {
+ "epoch": 4.960756501182033,
+ "grad_norm": 3.1619482040405273,
+ "learning_rate": 3.6669018805186335e-07,
+ "loss": 0.3173,
+ "step": 10492
+ },
+ {
+ "epoch": 4.9612293144208035,
+ "grad_norm": 2.9894869327545166,
+ "learning_rate": 3.6636500285922386e-07,
+ "loss": 0.3057,
+ "step": 10493
+ },
+ {
+ "epoch": 4.961702127659574,
+ "grad_norm": 3.1162378787994385,
+ "learning_rate": 3.660399505186793e-07,
+ "loss": 0.3404,
+ "step": 10494
+ },
+ {
+ "epoch": 4.962174940898345,
+ "grad_norm": 2.811485528945923,
+ "learning_rate": 3.657150310504706e-07,
+ "loss": 0.3199,
+ "step": 10495
+ },
+ {
+ "epoch": 4.962647754137116,
+ "grad_norm": 2.8914854526519775,
+ "learning_rate": 3.653902444748278e-07,
+ "loss": 0.3666,
+ "step": 10496
+ },
+ {
+ "epoch": 4.963120567375887,
+ "grad_norm": 4.075942516326904,
+ "learning_rate": 3.6506559081197517e-07,
+ "loss": 0.3737,
+ "step": 10497
+ },
+ {
+ "epoch": 4.9635933806146575,
+ "grad_norm": 4.395053863525391,
+ "learning_rate": 3.647410700821266e-07,
+ "loss": 0.338,
+ "step": 10498
+ },
+ {
+ "epoch": 4.964066193853428,
+ "grad_norm": 2.89145565032959,
+ "learning_rate": 3.644166823054884e-07,
+ "loss": 0.2893,
+ "step": 10499
+ },
+ {
+ "epoch": 4.964539007092198,
+ "grad_norm": 2.8189663887023926,
+ "learning_rate": 3.640924275022595e-07,
+ "loss": 0.289,
+ "step": 10500
+ },
+ {
+ "epoch": 4.965011820330969,
+ "grad_norm": 3.0912365913391113,
+ "learning_rate": 3.6376830569262946e-07,
+ "loss": 0.3474,
+ "step": 10501
+ },
+ {
+ "epoch": 4.96548463356974,
+ "grad_norm": 3.3087918758392334,
+ "learning_rate": 3.634443168967797e-07,
+ "loss": 0.3104,
+ "step": 10502
+ },
+ {
+ "epoch": 4.965957446808511,
+ "grad_norm": 2.855022430419922,
+ "learning_rate": 3.6312046113488403e-07,
+ "loss": 0.3328,
+ "step": 10503
+ },
+ {
+ "epoch": 4.966430260047281,
+ "grad_norm": 3.5445404052734375,
+ "learning_rate": 3.627967384271072e-07,
+ "loss": 0.322,
+ "step": 10504
+ },
+ {
+ "epoch": 4.966903073286052,
+ "grad_norm": 3.526319742202759,
+ "learning_rate": 3.624731487936065e-07,
+ "loss": 0.3264,
+ "step": 10505
+ },
+ {
+ "epoch": 4.967375886524823,
+ "grad_norm": 3.521204948425293,
+ "learning_rate": 3.621496922545298e-07,
+ "loss": 0.369,
+ "step": 10506
+ },
+ {
+ "epoch": 4.967848699763593,
+ "grad_norm": 2.8956806659698486,
+ "learning_rate": 3.618263688300172e-07,
+ "loss": 0.3396,
+ "step": 10507
+ },
+ {
+ "epoch": 4.968321513002364,
+ "grad_norm": 3.155200958251953,
+ "learning_rate": 3.615031785402015e-07,
+ "loss": 0.354,
+ "step": 10508
+ },
+ {
+ "epoch": 4.9687943262411345,
+ "grad_norm": 3.2896533012390137,
+ "learning_rate": 3.611801214052052e-07,
+ "loss": 0.3034,
+ "step": 10509
+ },
+ {
+ "epoch": 4.969267139479905,
+ "grad_norm": 3.0860259532928467,
+ "learning_rate": 3.608571974451447e-07,
+ "loss": 0.3354,
+ "step": 10510
+ },
+ {
+ "epoch": 4.969739952718676,
+ "grad_norm": 3.3194656372070312,
+ "learning_rate": 3.6053440668012697e-07,
+ "loss": 0.3714,
+ "step": 10511
+ },
+ {
+ "epoch": 4.970212765957447,
+ "grad_norm": 2.9831063747406006,
+ "learning_rate": 3.602117491302498e-07,
+ "loss": 0.311,
+ "step": 10512
+ },
+ {
+ "epoch": 4.970685579196218,
+ "grad_norm": 3.175940752029419,
+ "learning_rate": 3.59889224815605e-07,
+ "loss": 0.3658,
+ "step": 10513
+ },
+ {
+ "epoch": 4.9711583924349885,
+ "grad_norm": 3.051496982574463,
+ "learning_rate": 3.5956683375627324e-07,
+ "loss": 0.3458,
+ "step": 10514
+ },
+ {
+ "epoch": 4.971631205673759,
+ "grad_norm": 3.0264453887939453,
+ "learning_rate": 3.592445759723298e-07,
+ "loss": 0.2843,
+ "step": 10515
+ },
+ {
+ "epoch": 4.972104018912529,
+ "grad_norm": 3.404376745223999,
+ "learning_rate": 3.589224514838399e-07,
+ "loss": 0.366,
+ "step": 10516
+ },
+ {
+ "epoch": 4.9725768321513,
+ "grad_norm": 3.640212297439575,
+ "learning_rate": 3.586004603108598e-07,
+ "loss": 0.3248,
+ "step": 10517
+ },
+ {
+ "epoch": 4.973049645390071,
+ "grad_norm": 3.0829873085021973,
+ "learning_rate": 3.5827860247344e-07,
+ "loss": 0.3613,
+ "step": 10518
+ },
+ {
+ "epoch": 4.973522458628842,
+ "grad_norm": 3.6157045364379883,
+ "learning_rate": 3.5795687799162064e-07,
+ "loss": 0.3599,
+ "step": 10519
+ },
+ {
+ "epoch": 4.973995271867612,
+ "grad_norm": 3.150632619857788,
+ "learning_rate": 3.576352868854335e-07,
+ "loss": 0.3242,
+ "step": 10520
+ },
+ {
+ "epoch": 4.974468085106383,
+ "grad_norm": 3.04829740524292,
+ "learning_rate": 3.5731382917490286e-07,
+ "loss": 0.3819,
+ "step": 10521
+ },
+ {
+ "epoch": 4.974940898345154,
+ "grad_norm": 3.216092348098755,
+ "learning_rate": 3.5699250488004516e-07,
+ "loss": 0.3538,
+ "step": 10522
+ },
+ {
+ "epoch": 4.975413711583924,
+ "grad_norm": 3.36538028717041,
+ "learning_rate": 3.5667131402086717e-07,
+ "loss": 0.3381,
+ "step": 10523
+ },
+ {
+ "epoch": 4.975886524822695,
+ "grad_norm": 3.3398420810699463,
+ "learning_rate": 3.563502566173685e-07,
+ "loss": 0.3085,
+ "step": 10524
+ },
+ {
+ "epoch": 4.9763593380614655,
+ "grad_norm": 3.10583233833313,
+ "learning_rate": 3.5602933268953893e-07,
+ "loss": 0.3023,
+ "step": 10525
+ },
+ {
+ "epoch": 4.976832151300236,
+ "grad_norm": 3.422929525375366,
+ "learning_rate": 3.557085422573625e-07,
+ "loss": 0.3319,
+ "step": 10526
+ },
+ {
+ "epoch": 4.977304964539007,
+ "grad_norm": 3.7357773780822754,
+ "learning_rate": 3.5538788534081214e-07,
+ "loss": 0.3762,
+ "step": 10527
+ },
+ {
+ "epoch": 4.977777777777778,
+ "grad_norm": 3.0172133445739746,
+ "learning_rate": 3.550673619598549e-07,
+ "loss": 0.3292,
+ "step": 10528
+ },
+ {
+ "epoch": 4.978250591016549,
+ "grad_norm": 3.2497189044952393,
+ "learning_rate": 3.5474697213444763e-07,
+ "loss": 0.3292,
+ "step": 10529
+ },
+ {
+ "epoch": 4.9787234042553195,
+ "grad_norm": 2.8510115146636963,
+ "learning_rate": 3.544267158845394e-07,
+ "loss": 0.3717,
+ "step": 10530
+ },
+ {
+ "epoch": 4.97919621749409,
+ "grad_norm": 3.2559750080108643,
+ "learning_rate": 3.541065932300719e-07,
+ "loss": 0.3656,
+ "step": 10531
+ },
+ {
+ "epoch": 4.97966903073286,
+ "grad_norm": 3.3215935230255127,
+ "learning_rate": 3.537866041909768e-07,
+ "loss": 0.364,
+ "step": 10532
+ },
+ {
+ "epoch": 4.980141843971631,
+ "grad_norm": 3.4923696517944336,
+ "learning_rate": 3.5346674878717954e-07,
+ "loss": 0.3464,
+ "step": 10533
+ },
+ {
+ "epoch": 4.980614657210402,
+ "grad_norm": 3.5320425033569336,
+ "learning_rate": 3.531470270385959e-07,
+ "loss": 0.3506,
+ "step": 10534
+ },
+ {
+ "epoch": 4.9810874704491725,
+ "grad_norm": 3.290199041366577,
+ "learning_rate": 3.528274389651323e-07,
+ "loss": 0.4092,
+ "step": 10535
+ },
+ {
+ "epoch": 4.981560283687943,
+ "grad_norm": 3.108628034591675,
+ "learning_rate": 3.5250798458668966e-07,
+ "loss": 0.3522,
+ "step": 10536
+ },
+ {
+ "epoch": 4.982033096926714,
+ "grad_norm": 3.3015148639678955,
+ "learning_rate": 3.521886639231584e-07,
+ "loss": 0.3609,
+ "step": 10537
+ },
+ {
+ "epoch": 4.982505910165485,
+ "grad_norm": 3.506431818008423,
+ "learning_rate": 3.518694769944211e-07,
+ "loss": 0.3458,
+ "step": 10538
+ },
+ {
+ "epoch": 4.982978723404255,
+ "grad_norm": 3.560453414916992,
+ "learning_rate": 3.5155042382035236e-07,
+ "loss": 0.3803,
+ "step": 10539
+ },
+ {
+ "epoch": 4.983451536643026,
+ "grad_norm": 3.1382486820220947,
+ "learning_rate": 3.5123150442081757e-07,
+ "loss": 0.3209,
+ "step": 10540
+ },
+ {
+ "epoch": 4.9839243498817964,
+ "grad_norm": 4.326927661895752,
+ "learning_rate": 3.5091271881567523e-07,
+ "loss": 0.3649,
+ "step": 10541
+ },
+ {
+ "epoch": 4.984397163120567,
+ "grad_norm": 3.0951757431030273,
+ "learning_rate": 3.50594067024774e-07,
+ "loss": 0.3808,
+ "step": 10542
+ },
+ {
+ "epoch": 4.984869976359338,
+ "grad_norm": 3.264277458190918,
+ "learning_rate": 3.5027554906795574e-07,
+ "loss": 0.3408,
+ "step": 10543
+ },
+ {
+ "epoch": 4.985342789598109,
+ "grad_norm": 3.3679237365722656,
+ "learning_rate": 3.4995716496505293e-07,
+ "loss": 0.3746,
+ "step": 10544
+ },
+ {
+ "epoch": 4.98581560283688,
+ "grad_norm": 3.489201545715332,
+ "learning_rate": 3.496389147358892e-07,
+ "loss": 0.3725,
+ "step": 10545
+ },
+ {
+ "epoch": 4.98628841607565,
+ "grad_norm": 2.8233766555786133,
+ "learning_rate": 3.4932079840028193e-07,
+ "loss": 0.3178,
+ "step": 10546
+ },
+ {
+ "epoch": 4.986761229314421,
+ "grad_norm": 3.1723084449768066,
+ "learning_rate": 3.490028159780373e-07,
+ "loss": 0.348,
+ "step": 10547
+ },
+ {
+ "epoch": 4.987234042553191,
+ "grad_norm": 3.2631607055664062,
+ "learning_rate": 3.4868496748895616e-07,
+ "loss": 0.3608,
+ "step": 10548
+ },
+ {
+ "epoch": 4.987706855791962,
+ "grad_norm": 3.4170608520507812,
+ "learning_rate": 3.483672529528287e-07,
+ "loss": 0.3819,
+ "step": 10549
+ },
+ {
+ "epoch": 4.988179669030733,
+ "grad_norm": 3.002686023712158,
+ "learning_rate": 3.480496723894375e-07,
+ "loss": 0.2695,
+ "step": 10550
+ },
+ {
+ "epoch": 4.9886524822695035,
+ "grad_norm": 3.051232099533081,
+ "learning_rate": 3.4773222581855753e-07,
+ "loss": 0.3638,
+ "step": 10551
+ },
+ {
+ "epoch": 4.989125295508274,
+ "grad_norm": 2.959977149963379,
+ "learning_rate": 3.474149132599544e-07,
+ "loss": 0.3338,
+ "step": 10552
+ },
+ {
+ "epoch": 4.989598108747045,
+ "grad_norm": 2.925457000732422,
+ "learning_rate": 3.470977347333859e-07,
+ "loss": 0.3212,
+ "step": 10553
+ },
+ {
+ "epoch": 4.990070921985816,
+ "grad_norm": 3.0996408462524414,
+ "learning_rate": 3.4678069025860154e-07,
+ "loss": 0.3447,
+ "step": 10554
+ },
+ {
+ "epoch": 4.990543735224586,
+ "grad_norm": 2.8487865924835205,
+ "learning_rate": 3.4646377985534106e-07,
+ "loss": 0.3434,
+ "step": 10555
+ },
+ {
+ "epoch": 4.991016548463357,
+ "grad_norm": 2.8337016105651855,
+ "learning_rate": 3.461470035433387e-07,
+ "loss": 0.342,
+ "step": 10556
+ },
+ {
+ "epoch": 4.991489361702127,
+ "grad_norm": 2.9243876934051514,
+ "learning_rate": 3.4583036134231805e-07,
+ "loss": 0.3256,
+ "step": 10557
+ },
+ {
+ "epoch": 4.991962174940898,
+ "grad_norm": 3.2548747062683105,
+ "learning_rate": 3.455138532719948e-07,
+ "loss": 0.3313,
+ "step": 10558
+ },
+ {
+ "epoch": 4.992434988179669,
+ "grad_norm": 3.03932523727417,
+ "learning_rate": 3.451974793520771e-07,
+ "loss": 0.3854,
+ "step": 10559
+ },
+ {
+ "epoch": 4.99290780141844,
+ "grad_norm": 3.4757370948791504,
+ "learning_rate": 3.44881239602263e-07,
+ "loss": 0.3909,
+ "step": 10560
+ },
+ {
+ "epoch": 4.993380614657211,
+ "grad_norm": 2.9729294776916504,
+ "learning_rate": 3.4456513404224513e-07,
+ "loss": 0.3645,
+ "step": 10561
+ },
+ {
+ "epoch": 4.993853427895981,
+ "grad_norm": 3.2144060134887695,
+ "learning_rate": 3.4424916269170495e-07,
+ "loss": 0.3236,
+ "step": 10562
+ },
+ {
+ "epoch": 4.994326241134752,
+ "grad_norm": 3.742386817932129,
+ "learning_rate": 3.4393332557031615e-07,
+ "loss": 0.332,
+ "step": 10563
+ },
+ {
+ "epoch": 4.994799054373522,
+ "grad_norm": 3.2569401264190674,
+ "learning_rate": 3.4361762269774557e-07,
+ "loss": 0.3774,
+ "step": 10564
+ },
+ {
+ "epoch": 4.995271867612293,
+ "grad_norm": 2.91739821434021,
+ "learning_rate": 3.433020540936499e-07,
+ "loss": 0.3061,
+ "step": 10565
+ },
+ {
+ "epoch": 4.995744680851064,
+ "grad_norm": 3.534137487411499,
+ "learning_rate": 3.429866197776788e-07,
+ "loss": 0.357,
+ "step": 10566
+ },
+ {
+ "epoch": 4.9962174940898345,
+ "grad_norm": 3.215837001800537,
+ "learning_rate": 3.4267131976947284e-07,
+ "loss": 0.3395,
+ "step": 10567
+ },
+ {
+ "epoch": 4.996690307328605,
+ "grad_norm": 3.294857978820801,
+ "learning_rate": 3.4235615408866384e-07,
+ "loss": 0.3273,
+ "step": 10568
+ },
+ {
+ "epoch": 4.997163120567376,
+ "grad_norm": 3.519171953201294,
+ "learning_rate": 3.4204112275487646e-07,
+ "loss": 0.3712,
+ "step": 10569
+ },
+ {
+ "epoch": 4.997635933806147,
+ "grad_norm": 3.037527084350586,
+ "learning_rate": 3.4172622578772544e-07,
+ "loss": 0.2949,
+ "step": 10570
+ },
+ {
+ "epoch": 4.998108747044917,
+ "grad_norm": 3.309682846069336,
+ "learning_rate": 3.4141146320681913e-07,
+ "loss": 0.3068,
+ "step": 10571
+ },
+ {
+ "epoch": 4.998581560283688,
+ "grad_norm": 3.2197179794311523,
+ "learning_rate": 3.410968350317559e-07,
+ "loss": 0.3725,
+ "step": 10572
+ },
+ {
+ "epoch": 4.999054373522458,
+ "grad_norm": 3.0465641021728516,
+ "learning_rate": 3.4078234128212537e-07,
+ "loss": 0.3505,
+ "step": 10573
+ },
+ {
+ "epoch": 4.999527186761229,
+ "grad_norm": 3.066941022872925,
+ "learning_rate": 3.404679819775114e-07,
+ "loss": 0.3435,
+ "step": 10574
+ },
+ {
+ "epoch": 5.0,
+ "grad_norm": 3.3947532176971436,
+ "learning_rate": 3.401537571374869e-07,
+ "loss": 0.3344,
+ "step": 10575
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 12690,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 6,
+ "save_steps": 2115,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 2.6712206559066194e+19,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-10575/training_args.bin b/checkpoint-10575/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3885e6a27b204c834ff6fbc6f3a5a4c7f69fe2cc
--- /dev/null
+++ b/checkpoint-10575/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:51188d498a3fe1bf9ddf234271786a31e25e89b3aa68721a97f8264e1013c9b6
+size 8056
diff --git a/checkpoint-10575/zero_to_fp32.py b/checkpoint-10575/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8
--- /dev/null
+++ b/checkpoint-10575/zero_to_fp32.py
@@ -0,0 +1,604 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
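+# note (illustrative, assuming the single-file output shown above): the consolidated file is a
+# plain torch-serialized state_dict, so it can typically be reloaded later with
+# `model.load_state_dict(torch.load("pytorch_model.bin"))`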
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# While this script doesn't use DeepSpeed to recover data, the checkpoints are pickled with
+# DeepSpeed data structures, so DeepSpeed has to be available in the current Python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    buffers: dict
+    param_shapes: dict
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict
+    frozen_param_fragments: dict
+
+
+debug = 0
+
+# load to cpu
+device = torch.device('cpu')
+
+
+def atoi(text):
+ return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+ '''
+ alist.sort(key=natural_keys) sorts in human order
+ http://nedbatchelder.com/blog/200712/human_sorting.html
+ (See Toothy's implementation in the comments)
+ '''
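+    # e.g. sorted(["rank10_states.pt", "rank2_states.pt"], key=natural_keys) yields
+    # ["rank2_states.pt", "rank10_states.pt"] (numeric rather than lexicographic order; illustrative filenames)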
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+ if not os.path.isdir(checkpoint_dir):
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+ # there should be only one file
+ if zero_stage <= 2:
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+ elif zero_stage == 3:
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+ if not os.path.exists(file):
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+ return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+ # XXX: need to test that this simple glob rule works for multi-node setup too
+ ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+ if len(ckpt_files) == 0:
+ raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+ return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+ zero_model_states = []
+ for file in files:
+ state_dict = torch.load(file, map_location=device)
+
+ if BUFFER_NAMES not in state_dict:
+ raise ValueError(f"{file} is not a model state checkpoint")
+ buffer_names = state_dict[BUFFER_NAMES]
+ if debug:
+ print("Found buffers:", buffer_names)
+
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
+ buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+ param_shapes = state_dict[PARAM_SHAPES]
+
+ # collect parameters that are included in param_shapes
+ param_names = []
+ for s in param_shapes:
+ for name in s.keys():
+ param_names.append(name)
+
+ # update with frozen parameters
+ frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+ if frozen_param_shapes is not None:
+ if debug:
+ print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+ param_names += list(frozen_param_shapes.keys())
+
+ # handle shared params
+ shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+ ds_version = state_dict.get(DS_VERSION, None)
+
+ frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+ z_model_state = zero_model_state(buffers=buffers,
+ param_shapes=param_shapes,
+ shared_params=shared_params,
+ ds_version=ds_version,
+ frozen_param_shapes=frozen_param_shapes,
+ frozen_param_fragments=frozen_param_fragments)
+ zero_model_states.append(z_model_state)
+
+ return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+
+ total_files = len(files)
+ state_dicts = []
+ for f in files:
+ state_dict = torch.load(f, map_location=device)
+        # immediately discard the two potentially huge optimizer states, since we only care about the fp32 master weights
+ # and also handle the case where it was already removed by another helper script
+ state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+ state_dicts.append(state_dict)
+
+    if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
+ # use the max of the partition_count to get the dp world_size.
+
+    if isinstance(world_size, list):
+ world_size = max(world_size)
+
+ if world_size != total_files:
+ raise ValueError(
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+ )
+
+ # the groups are named differently in each stage
+ if zero_stage <= 2:
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+ elif zero_stage == 3:
+ fp32_groups_key = FP32_FLAT_GROUPS
+ else:
+ raise ValueError(f"unknown zero stage {zero_stage}")
+
+ if zero_stage <= 2:
+ fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+ elif zero_stage == 3:
+ # if there is more than one param group, there will be multiple flattened tensors - one
+ # flattened tensor per group - for simplicity merge them into a single tensor
+ #
+ # XXX: could make the script more memory efficient for when there are multiple groups - it
+ # will require matching the sub-lists of param_shapes for each param group flattened tensor
+
+ fp32_flat_groups = [
+ torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+ ]
+
+ return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+ """
+ Returns fp32 state_dict reconstructed from ds checkpoint
+
+ Args:
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+        - ``exclude_frozen_parameters``: if True, frozen (non-trainable) parameters are left out of the returned state_dict
+
+ """
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+ optim_files = get_optim_files(ds_checkpoint_dir)
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+ print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+ model_files = get_model_state_files(ds_checkpoint_dir)
+
+ zero_model_states = parse_model_states(model_files)
+ print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+ if zero_stage <= 2:
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+ elif zero_stage == 3:
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+ return
+
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+ frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+ if debug:
+ num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+ print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+ wanted_params = len(frozen_param_shapes)
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+ avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+ print(f'Frozen params: Have {avail_numel} numels to process.')
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+ total_params = 0
+ total_numel = 0
+ for name, shape in frozen_param_shapes.items():
+ total_params += 1
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+
+ state_dict[name] = frozen_param_fragments[name]
+
+ if debug:
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+ attr = getattr(obj, fn, None)
+ return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+ param_shapes = zero_model_states[0].param_shapes
+
+ # Reconstruction protocol:
+ #
+ # XXX: document this
+
+ if debug:
+ for i in range(world_size):
+ for j in range(len(fp32_flat_groups[0])):
+ print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+ # XXX: memory usage doubles here (zero2)
+ num_param_groups = len(fp32_flat_groups[0])
+ merged_single_partition_of_fp32_groups = []
+ for i in range(num_param_groups):
+ merged_partitions = [sd[i] for sd in fp32_flat_groups]
+ full_single_fp32_vector = torch.cat(merged_partitions, 0)
+ merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+ avail_numel = sum(
+ [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+ if debug:
+ wanted_params = sum([len(shapes) for shapes in param_shapes])
+ wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+ # not asserting if there is a mismatch due to possible padding
+ print(f"Have {avail_numel} numels to process.")
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+ # params
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # an out-of-core computing solution
+ total_numel = 0
+ total_params = 0
+ for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+ offset = 0
+ avail_numel = full_single_fp32_vector.numel()
+ for name, shape in shapes.items():
+
+ unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+ total_numel += unpartitioned_numel
+ total_params += 1
+
+ if debug:
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+ state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+ offset += unpartitioned_numel
+
+ # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+ # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+ # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+ # live optimizer object, so we are checking that the numbers are within the right range
+ align_to = 2 * world_size
+
+ def zero2_align(x):
+ return align_to * math.ceil(x / align_to)
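+        # e.g. with world_size=2 (align_to=4), zero2_align(10) == 12 and zero2_align(12) == 12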
+
+ if debug:
+ print(f"original offset={offset}, avail_numel={avail_numel}")
+
+ offset = zero2_align(offset)
+ avail_numel = zero2_align(avail_numel)
+
+ if debug:
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
+
+ # Sanity check
+ if offset != avail_numel:
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+ print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters):
+ state_dict = OrderedDict()
+
+ # buffers
+ buffers = zero_model_states[0].buffers
+ state_dict.update(buffers)
+ if debug:
+ print(f"added {len(buffers)} buffers")
+
+ if not exclude_frozen_parameters:
+ _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+ _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+ # recover shared parameters
+ for pair in zero_model_states[0].shared_params:
+ if pair[1] in state_dict:
+ state_dict[pair[0]] = state_dict[pair[1]]
+
+ return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
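+    # Each rank stores ceil(unpartitioned_numel / world_size) elements; the final slot is padded.
+    # e.g. unpartitioned_numel=10, world_size=4 -> partitioned_numel=3, padding_numel=2
+    # (4 ranks * 3 elements = 12 slots = 10 real values + 2 padding values)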
+ remainder = unpartitioned_numel % world_size
+ padding_numel = (world_size - remainder) if remainder else 0
+ partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+ return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+ return
+
+ if debug:
+ for i in range(world_size):
+ num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+ print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+ wanted_params = len(frozen_param_shapes)
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+ avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+ print(f'Frozen params: Have {avail_numel} numels to process.')
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+ total_params = 0
+ total_numel = 0
+ for name, shape in zero_model_states[0].frozen_param_shapes.items():
+ total_params += 1
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+
+ param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+ state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+ if debug:
+ print(
+ f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+ )
+
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+ param_shapes = zero_model_states[0].param_shapes
+ avail_numel = fp32_flat_groups[0].numel() * world_size
+ # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+ # param, re-consolidating each param, while dealing with padding if any
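+    # Illustrative example (not from the original script): a 10-element param with world_size=4 is
+    # stored as a 3-element slice on each rank; torch.cat of the four slices gives 12 elements, and
+    # narrow(0, 0, 10) drops the 2 trailing padding elements before view() restores the shape.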
+
+ # merge list of dicts, preserving order
+ param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+ if debug:
+ for i in range(world_size):
+ print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+ wanted_params = len(param_shapes)
+ wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+ # not asserting if there is a mismatch due to possible padding
+ avail_numel = fp32_flat_groups[0].numel() * world_size
+ print(f"Trainable params: Have {avail_numel} numels to process.")
+ print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+ # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # an out-of-core computing solution
+ offset = 0
+ total_numel = 0
+ total_params = 0
+ for name, shape in param_shapes.items():
+
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+ total_params += 1
+
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+ if debug:
+ print(
+ f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+ )
+
+ # XXX: memory usage doubles here
+ state_dict[name] = torch.cat(
+ tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+ 0).narrow(0, 0, unpartitioned_numel).view(shape)
+ offset += partitioned_numel
+
+ offset *= world_size
+
+ # Sanity check
+ if offset != avail_numel:
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+ print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters):
+ state_dict = OrderedDict()
+
+ # buffers
+ buffers = zero_model_states[0].buffers
+ state_dict.update(buffers)
+ if debug:
+ print(f"added {len(buffers)} buffers")
+
+ if not exclude_frozen_parameters:
+ _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+ _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+ # recover shared parameters
+ for pair in zero_model_states[0].shared_params:
+ if pair[1] in state_dict:
+ state_dict[pair[0]] = state_dict[pair[1]]
+
+ return state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
+ """
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+ ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+ via a model hub.
+
+ Args:
+ - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided, will attempt to load the tag from the 'latest' file, e.g., ``global_step14``
+ - ``exclude_frozen_parameters``: exclude frozen parameters
+
+ Returns:
+ - pytorch ``state_dict``
+
+    Note: this approach may not work if your application doesn't have sufficient free CPU memory, in
+    which case you may need to use the offline approach using the ``zero_to_fp32.py`` script that is
+    saved with the checkpoint.
+
+ A typical usage might be ::
+
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+ # do the training and checkpoint saving
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+ model = model.cpu() # move to cpu
+ model.load_state_dict(state_dict)
+ # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application, i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+ If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+ """
+ if tag is None:
+ latest_path = os.path.join(checkpoint_dir, 'latest')
+ if os.path.isfile(latest_path):
+ with open(latest_path, 'r') as fd:
+ tag = fd.read().strip()
+ else:
+ raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+ ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+ if not os.path.isdir(ds_checkpoint_dir):
+ raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+ return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
+ """
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+ loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+ Args:
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+ - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided, will attempt to load the tag from the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+ - ``exclude_frozen_parameters``: exclude frozen parameters
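+
+    A typical usage might be (paths are illustrative) ::
+
+        from deepspeed.utils.zero_to_fp32 import convert_zero_checkpoint_to_fp32_state_dict
+        # consolidate the sharded ZeRO checkpoint into a single fp32 file
+        convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, "pytorch_model.bin")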
+ """
+
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
+ print(f"Saving fp32 state dict to {output_file}")
+ torch.save(state_dict, output_file)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+ """
+    1. Move the provided model to cpu
+ 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+ 3. Load it into the provided model
+
+ Args:
+ - ``model``: the model object to update
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided, will attempt to load the tag from the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+ Returns:
+        - ``model``: modified model
+
+ Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough, use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+ conveniently placed for you in the checkpoint folder.
+
+ A typical usage might be ::
+
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+ model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+ # submit to model hub or save the model to share with others
+
+    Note that once this was run, the ``model`` will no longer be usable in the deepspeed context
+ of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+ """
+    logger.info("Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info("Overwriting model with fp32 weights")
+ model = model.cpu()
+ model.load_state_dict(state_dict, strict=False)
+
+ return model
+
+
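+# Example invocation (hypothetical paths; -t may be omitted when a 'latest' file is present):
+#   python zero_to_fp32.py checkpoint-12690 checkpoint-12690/pytorch_model.bin -t global_step12690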
+if __name__ == "__main__":
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("checkpoint_dir",
+ type=str,
+ help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+ parser.add_argument(
+ "output_file",
+ type=str,
+ help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+ parser.add_argument("-t",
+ "--tag",
+ type=str,
+ default=None,
+ help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+ parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+ parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+ args = parser.parse_args()
+
+ debug = args.debug
+
+ convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+ args.output_file,
+ tag=args.tag,
+ exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/checkpoint-12690/README.md b/checkpoint-12690/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..049d467664ca6172b7ffbe6ba60b3eac7479cac4
--- /dev/null
+++ b/checkpoint-12690/README.md
@@ -0,0 +1,202 @@
+---
+base_model: meta-llama/Llama-3.1-8B
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.14.0
\ No newline at end of file
diff --git a/checkpoint-12690/adapter_config.json b/checkpoint-12690/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..38890ca21f7e1854f7350049a9113a9eec8a443a
--- /dev/null
+++ b/checkpoint-12690/adapter_config.json
@@ -0,0 +1,40 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "meta-llama/Llama-3.1-8B",
+ "bias": "none",
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 512,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": [
+ "embed_tokens",
+ "lm_head"
+ ],
+ "peft_type": "LORA",
+ "r": 256,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "v_proj",
+ "q_proj",
+ "gate_proj",
+ "k_proj",
+ "down_proj",
+ "up_proj",
+ "o_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-12690/adapter_model.safetensors b/checkpoint-12690/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a86dd37eb82fb1d46b236ea471b6db3ec90b904d
--- /dev/null
+++ b/checkpoint-12690/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1f07f4ee71740fb0bcf54ffe982c86734f08671b6b0cb55e4add89e30c744f1b
+size 3443586272
diff --git a/checkpoint-12690/global_step12690/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-12690/global_step12690/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..887a519261f8f876fc4769c8738a1e205fa73ddd
--- /dev/null
+++ b/checkpoint-12690/global_step12690/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:04f19a180841f418255309732702e6882ca7ea020dc9a5f21223f035c03e2a0d
+size 20661195036
diff --git a/checkpoint-12690/global_step12690/mp_rank_00_model_states.pt b/checkpoint-12690/global_step12690/mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..160501f67104b890930394ef272f22d85dca9d2d
--- /dev/null
+++ b/checkpoint-12690/global_step12690/mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dc733ade2590ed396f26e68b03ae28b0ac8492cc6c22c4a5ebada21090755b37
+size 3555326841
diff --git a/checkpoint-12690/latest b/checkpoint-12690/latest
new file mode 100644
index 0000000000000000000000000000000000000000..44725bf699ccd6f2b841ff307fc5f70037c9c745
--- /dev/null
+++ b/checkpoint-12690/latest
@@ -0,0 +1 @@
+global_step12690
\ No newline at end of file
diff --git a/checkpoint-12690/rng_state.pth b/checkpoint-12690/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..59692a9331087e66e22d50634c54db6e81349d0b
--- /dev/null
+++ b/checkpoint-12690/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:76d547af609f90786b6fc88240881d5ac35a1e7d00f6657cc6b00880840e6bef
+size 14244
diff --git a/checkpoint-12690/scheduler.pt b/checkpoint-12690/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..d5de7097cc8d80d2c14fd06e8fe60f0ba95e6906
--- /dev/null
+++ b/checkpoint-12690/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b6865648a0e7d98218d3202f30defe46013d7294dd42f0bff20e0d0463dab454
+size 1064
diff --git a/checkpoint-12690/special_tokens_map.json b/checkpoint-12690/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..e5b39b6305d89284b04934011c68dbb26bf588ca
--- /dev/null
+++ b/checkpoint-12690/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-12690/tokenizer.json b/checkpoint-12690/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2
--- /dev/null
+++ b/checkpoint-12690/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
diff --git a/checkpoint-12690/tokenizer_config.json b/checkpoint-12690/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..81dd14db6632ad5b35b9d447732e37ac074873a5
--- /dev/null
+++ b/checkpoint-12690/tokenizer_config.json
@@ -0,0 +1,2063 @@
+{
+ "added_tokens_decoder": {
+ "128000": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128001": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128002": {
+ "content": "<|reserved_special_token_0|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128003": {
+ "content": "<|reserved_special_token_1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128004": {
+ "content": "<|finetune_right_pad_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128005": {
+ "content": "<|reserved_special_token_2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128006": {
+ "content": "<|start_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128007": {
+ "content": "<|end_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128008": {
+ "content": "<|eom_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128009": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128010": {
+ "content": "<|python_tag|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128011": {
+ "content": "<|reserved_special_token_3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128012": {
+ "content": "<|reserved_special_token_4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128013": {
+ "content": "<|reserved_special_token_5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128014": {
+ "content": "<|reserved_special_token_6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128015": {
+ "content": "<|reserved_special_token_7|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128016": {
+ "content": "<|reserved_special_token_8|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128017": {
+ "content": "<|reserved_special_token_9|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128018": {
+ "content": "<|reserved_special_token_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128019": {
+ "content": "<|reserved_special_token_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128020": {
+ "content": "<|reserved_special_token_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128021": {
+ "content": "<|reserved_special_token_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128022": {
+ "content": "<|reserved_special_token_14|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128023": {
+ "content": "<|reserved_special_token_15|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128024": {
+ "content": "<|reserved_special_token_16|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128025": {
+ "content": "<|reserved_special_token_17|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128026": {
+ "content": "<|reserved_special_token_18|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128027": {
+ "content": "<|reserved_special_token_19|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128028": {
+ "content": "<|reserved_special_token_20|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128029": {
+ "content": "<|reserved_special_token_21|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128030": {
+ "content": "<|reserved_special_token_22|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128031": {
+ "content": "<|reserved_special_token_23|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128032": {
+ "content": "<|reserved_special_token_24|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128033": {
+ "content": "<|reserved_special_token_25|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128034": {
+ "content": "<|reserved_special_token_26|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128035": {
+ "content": "<|reserved_special_token_27|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128036": {
+ "content": "<|reserved_special_token_28|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128037": {
+ "content": "<|reserved_special_token_29|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128038": {
+ "content": "<|reserved_special_token_30|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128039": {
+ "content": "<|reserved_special_token_31|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128040": {
+ "content": "<|reserved_special_token_32|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128041": {
+ "content": "<|reserved_special_token_33|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128042": {
+ "content": "<|reserved_special_token_34|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128043": {
+ "content": "<|reserved_special_token_35|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128044": {
+ "content": "<|reserved_special_token_36|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128045": {
+ "content": "<|reserved_special_token_37|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128046": {
+ "content": "<|reserved_special_token_38|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128047": {
+ "content": "<|reserved_special_token_39|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128048": {
+ "content": "<|reserved_special_token_40|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128049": {
+ "content": "<|reserved_special_token_41|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128050": {
+ "content": "<|reserved_special_token_42|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128051": {
+ "content": "<|reserved_special_token_43|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128052": {
+ "content": "<|reserved_special_token_44|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128053": {
+ "content": "<|reserved_special_token_45|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128054": {
+ "content": "<|reserved_special_token_46|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128055": {
+ "content": "<|reserved_special_token_47|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128056": {
+ "content": "<|reserved_special_token_48|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128057": {
+ "content": "<|reserved_special_token_49|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128058": {
+ "content": "<|reserved_special_token_50|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128059": {
+ "content": "<|reserved_special_token_51|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128060": {
+ "content": "<|reserved_special_token_52|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128061": {
+ "content": "<|reserved_special_token_53|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128062": {
+ "content": "<|reserved_special_token_54|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128063": {
+ "content": "<|reserved_special_token_55|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128064": {
+ "content": "<|reserved_special_token_56|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128065": {
+ "content": "<|reserved_special_token_57|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128066": {
+ "content": "<|reserved_special_token_58|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128067": {
+ "content": "<|reserved_special_token_59|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128068": {
+ "content": "<|reserved_special_token_60|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128069": {
+ "content": "<|reserved_special_token_61|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128070": {
+ "content": "<|reserved_special_token_62|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128071": {
+ "content": "<|reserved_special_token_63|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128072": {
+ "content": "<|reserved_special_token_64|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128073": {
+ "content": "<|reserved_special_token_65|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128074": {
+ "content": "<|reserved_special_token_66|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128075": {
+ "content": "<|reserved_special_token_67|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128076": {
+ "content": "<|reserved_special_token_68|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128077": {
+ "content": "<|reserved_special_token_69|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128078": {
+ "content": "<|reserved_special_token_70|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128079": {
+ "content": "<|reserved_special_token_71|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128080": {
+ "content": "<|reserved_special_token_72|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128081": {
+ "content": "<|reserved_special_token_73|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128082": {
+ "content": "<|reserved_special_token_74|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128083": {
+ "content": "<|reserved_special_token_75|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128084": {
+ "content": "<|reserved_special_token_76|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128085": {
+ "content": "<|reserved_special_token_77|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128086": {
+ "content": "<|reserved_special_token_78|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128087": {
+ "content": "<|reserved_special_token_79|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128088": {
+ "content": "<|reserved_special_token_80|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128089": {
+ "content": "<|reserved_special_token_81|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128090": {
+ "content": "<|reserved_special_token_82|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128091": {
+ "content": "<|reserved_special_token_83|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128092": {
+ "content": "<|reserved_special_token_84|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128093": {
+ "content": "<|reserved_special_token_85|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128094": {
+ "content": "<|reserved_special_token_86|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128095": {
+ "content": "<|reserved_special_token_87|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128096": {
+ "content": "<|reserved_special_token_88|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128097": {
+ "content": "<|reserved_special_token_89|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128098": {
+ "content": "<|reserved_special_token_90|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128099": {
+ "content": "<|reserved_special_token_91|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128100": {
+ "content": "<|reserved_special_token_92|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128101": {
+ "content": "<|reserved_special_token_93|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128102": {
+ "content": "<|reserved_special_token_94|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128103": {
+ "content": "<|reserved_special_token_95|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128104": {
+ "content": "<|reserved_special_token_96|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128105": {
+ "content": "<|reserved_special_token_97|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128106": {
+ "content": "<|reserved_special_token_98|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128107": {
+ "content": "<|reserved_special_token_99|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128108": {
+ "content": "<|reserved_special_token_100|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128109": {
+ "content": "<|reserved_special_token_101|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128110": {
+ "content": "<|reserved_special_token_102|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128111": {
+ "content": "<|reserved_special_token_103|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128112": {
+ "content": "<|reserved_special_token_104|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128113": {
+ "content": "<|reserved_special_token_105|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128114": {
+ "content": "<|reserved_special_token_106|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128115": {
+ "content": "<|reserved_special_token_107|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128116": {
+ "content": "<|reserved_special_token_108|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128117": {
+ "content": "<|reserved_special_token_109|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128118": {
+ "content": "<|reserved_special_token_110|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128119": {
+ "content": "<|reserved_special_token_111|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128120": {
+ "content": "<|reserved_special_token_112|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128121": {
+ "content": "<|reserved_special_token_113|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128122": {
+ "content": "<|reserved_special_token_114|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128123": {
+ "content": "<|reserved_special_token_115|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128124": {
+ "content": "<|reserved_special_token_116|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128125": {
+ "content": "<|reserved_special_token_117|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128126": {
+ "content": "<|reserved_special_token_118|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128127": {
+ "content": "<|reserved_special_token_119|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128128": {
+ "content": "<|reserved_special_token_120|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128129": {
+ "content": "<|reserved_special_token_121|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128130": {
+ "content": "<|reserved_special_token_122|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128131": {
+ "content": "<|reserved_special_token_123|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128132": {
+ "content": "<|reserved_special_token_124|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128133": {
+ "content": "<|reserved_special_token_125|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128134": {
+ "content": "<|reserved_special_token_126|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128135": {
+ "content": "<|reserved_special_token_127|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128136": {
+ "content": "<|reserved_special_token_128|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128137": {
+ "content": "<|reserved_special_token_129|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128138": {
+ "content": "<|reserved_special_token_130|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128139": {
+ "content": "<|reserved_special_token_131|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128140": {
+ "content": "<|reserved_special_token_132|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128141": {
+ "content": "<|reserved_special_token_133|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128142": {
+ "content": "<|reserved_special_token_134|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128143": {
+ "content": "<|reserved_special_token_135|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128144": {
+ "content": "<|reserved_special_token_136|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128145": {
+ "content": "<|reserved_special_token_137|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128146": {
+ "content": "<|reserved_special_token_138|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128147": {
+ "content": "<|reserved_special_token_139|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128148": {
+ "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128149": {
+ "content": "<|reserved_special_token_141|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128150": {
+ "content": "<|reserved_special_token_142|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128151": {
+ "content": "<|reserved_special_token_143|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128152": {
+ "content": "<|reserved_special_token_144|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128153": {
+ "content": "<|reserved_special_token_145|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128154": {
+ "content": "<|reserved_special_token_146|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128155": {
+ "content": "<|reserved_special_token_147|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128156": {
+ "content": "<|reserved_special_token_148|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128157": {
+ "content": "<|reserved_special_token_149|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128158": {
+ "content": "<|reserved_special_token_150|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128159": {
+ "content": "<|reserved_special_token_151|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128160": {
+ "content": "<|reserved_special_token_152|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128161": {
+ "content": "<|reserved_special_token_153|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128162": {
+ "content": "<|reserved_special_token_154|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128163": {
+ "content": "<|reserved_special_token_155|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128164": {
+ "content": "<|reserved_special_token_156|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128165": {
+ "content": "<|reserved_special_token_157|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128166": {
+ "content": "<|reserved_special_token_158|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128167": {
+ "content": "<|reserved_special_token_159|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128168": {
+ "content": "<|reserved_special_token_160|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128169": {
+ "content": "<|reserved_special_token_161|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128170": {
+ "content": "<|reserved_special_token_162|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128171": {
+ "content": "<|reserved_special_token_163|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128172": {
+ "content": "<|reserved_special_token_164|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128173": {
+ "content": "<|reserved_special_token_165|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128174": {
+ "content": "<|reserved_special_token_166|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128175": {
+ "content": "<|reserved_special_token_167|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128176": {
+ "content": "<|reserved_special_token_168|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128177": {
+ "content": "<|reserved_special_token_169|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128178": {
+ "content": "<|reserved_special_token_170|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128179": {
+ "content": "<|reserved_special_token_171|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128180": {
+ "content": "<|reserved_special_token_172|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128181": {
+ "content": "<|reserved_special_token_173|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128182": {
+ "content": "<|reserved_special_token_174|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128183": {
+ "content": "<|reserved_special_token_175|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128184": {
+ "content": "<|reserved_special_token_176|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128185": {
+ "content": "<|reserved_special_token_177|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128186": {
+ "content": "<|reserved_special_token_178|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128187": {
+ "content": "<|reserved_special_token_179|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128188": {
+ "content": "<|reserved_special_token_180|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128189": {
+ "content": "<|reserved_special_token_181|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128190": {
+ "content": "<|reserved_special_token_182|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128191": {
+ "content": "<|reserved_special_token_183|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128192": {
+ "content": "<|reserved_special_token_184|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128193": {
+ "content": "<|reserved_special_token_185|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128194": {
+ "content": "<|reserved_special_token_186|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128195": {
+ "content": "<|reserved_special_token_187|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128196": {
+ "content": "<|reserved_special_token_188|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128197": {
+ "content": "<|reserved_special_token_189|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128198": {
+ "content": "<|reserved_special_token_190|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128199": {
+ "content": "<|reserved_special_token_191|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128200": {
+ "content": "<|reserved_special_token_192|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128201": {
+ "content": "<|reserved_special_token_193|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128202": {
+ "content": "<|reserved_special_token_194|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128203": {
+ "content": "<|reserved_special_token_195|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128204": {
+ "content": "<|reserved_special_token_196|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128205": {
+ "content": "<|reserved_special_token_197|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128206": {
+ "content": "<|reserved_special_token_198|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128207": {
+ "content": "<|reserved_special_token_199|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128208": {
+ "content": "<|reserved_special_token_200|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128209": {
+ "content": "<|reserved_special_token_201|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128210": {
+ "content": "<|reserved_special_token_202|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128211": {
+ "content": "<|reserved_special_token_203|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128212": {
+ "content": "<|reserved_special_token_204|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128213": {
+ "content": "<|reserved_special_token_205|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128214": {
+ "content": "<|reserved_special_token_206|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128215": {
+ "content": "<|reserved_special_token_207|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128216": {
+ "content": "<|reserved_special_token_208|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128217": {
+ "content": "<|reserved_special_token_209|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128218": {
+ "content": "<|reserved_special_token_210|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128219": {
+ "content": "<|reserved_special_token_211|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128220": {
+ "content": "<|reserved_special_token_212|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128221": {
+ "content": "<|reserved_special_token_213|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128222": {
+ "content": "<|reserved_special_token_214|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128223": {
+ "content": "<|reserved_special_token_215|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128224": {
+ "content": "<|reserved_special_token_216|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128225": {
+ "content": "<|reserved_special_token_217|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128226": {
+ "content": "<|reserved_special_token_218|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128227": {
+ "content": "<|reserved_special_token_219|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128228": {
+ "content": "<|reserved_special_token_220|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128229": {
+ "content": "<|reserved_special_token_221|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128230": {
+ "content": "<|reserved_special_token_222|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128231": {
+ "content": "<|reserved_special_token_223|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128232": {
+ "content": "<|reserved_special_token_224|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128233": {
+ "content": "<|reserved_special_token_225|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128234": {
+ "content": "<|reserved_special_token_226|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128235": {
+ "content": "<|reserved_special_token_227|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128236": {
+ "content": "<|reserved_special_token_228|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128237": {
+ "content": "<|reserved_special_token_229|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128238": {
+ "content": "<|reserved_special_token_230|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128239": {
+ "content": "<|reserved_special_token_231|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128240": {
+ "content": "<|reserved_special_token_232|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128241": {
+ "content": "<|reserved_special_token_233|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128242": {
+ "content": "<|reserved_special_token_234|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128243": {
+ "content": "<|reserved_special_token_235|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128244": {
+ "content": "<|reserved_special_token_236|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128245": {
+ "content": "<|reserved_special_token_237|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128246": {
+ "content": "<|reserved_special_token_238|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128247": {
+ "content": "<|reserved_special_token_239|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128248": {
+ "content": "<|reserved_special_token_240|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128249": {
+ "content": "<|reserved_special_token_241|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128250": {
+ "content": "<|reserved_special_token_242|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128251": {
+ "content": "<|reserved_special_token_243|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128252": {
+ "content": "<|reserved_special_token_244|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128253": {
+ "content": "<|reserved_special_token_245|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128254": {
+ "content": "<|reserved_special_token_246|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128255": {
+ "content": "<|reserved_special_token_247|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|begin_of_text|>",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|end_of_text|>",
+ "extra_special_tokens": {},
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 131072,
+ "pad_token": "<|end_of_text|>",
+ "tokenizer_class": "PreTrainedTokenizer"
+}
diff --git a/checkpoint-12690/trainer_state.json b/checkpoint-12690/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..28fbbd8d7dbf8f2d88e36569a6088362ca746929
--- /dev/null
+++ b/checkpoint-12690/trainer_state.json
@@ -0,0 +1,88863 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 6.0,
+ "eval_steps": 500,
+ "global_step": 12690,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.00047281323877068556,
+ "grad_norm": 5.163570880889893,
+ "learning_rate": 5.0000000000000004e-08,
+ "loss": 1.4628,
+ "step": 1
+ },
+ {
+ "epoch": 0.0009456264775413711,
+ "grad_norm": 6.298020839691162,
+ "learning_rate": 1.0000000000000001e-07,
+ "loss": 1.5003,
+ "step": 2
+ },
+ {
+ "epoch": 0.0014184397163120568,
+ "grad_norm": 5.853623390197754,
+ "learning_rate": 1.5000000000000002e-07,
+ "loss": 1.4495,
+ "step": 3
+ },
+ {
+ "epoch": 0.0018912529550827422,
+ "grad_norm": 5.456025123596191,
+ "learning_rate": 2.0000000000000002e-07,
+ "loss": 1.3798,
+ "step": 4
+ },
+ {
+ "epoch": 0.002364066193853428,
+ "grad_norm": 5.757407188415527,
+ "learning_rate": 2.5000000000000004e-07,
+ "loss": 1.4515,
+ "step": 5
+ },
+ {
+ "epoch": 0.0028368794326241137,
+ "grad_norm": 5.872277736663818,
+ "learning_rate": 3.0000000000000004e-07,
+ "loss": 1.4424,
+ "step": 6
+ },
+ {
+ "epoch": 0.003309692671394799,
+ "grad_norm": 6.7816009521484375,
+ "learning_rate": 3.5000000000000004e-07,
+ "loss": 1.4004,
+ "step": 7
+ },
+ {
+ "epoch": 0.0037825059101654845,
+ "grad_norm": 6.229667663574219,
+ "learning_rate": 4.0000000000000003e-07,
+ "loss": 1.4494,
+ "step": 8
+ },
+ {
+ "epoch": 0.00425531914893617,
+ "grad_norm": 5.336202621459961,
+ "learning_rate": 4.5000000000000003e-07,
+ "loss": 1.3916,
+ "step": 9
+ },
+ {
+ "epoch": 0.004728132387706856,
+ "grad_norm": 5.589445114135742,
+ "learning_rate": 5.000000000000001e-07,
+ "loss": 1.2318,
+ "step": 10
+ },
+ {
+ "epoch": 0.005200945626477541,
+ "grad_norm": 5.720539569854736,
+ "learning_rate": 5.5e-07,
+ "loss": 1.4367,
+ "step": 11
+ },
+ {
+ "epoch": 0.005673758865248227,
+ "grad_norm": 5.913913726806641,
+ "learning_rate": 6.000000000000001e-07,
+ "loss": 1.342,
+ "step": 12
+ },
+ {
+ "epoch": 0.006146572104018913,
+ "grad_norm": 5.899744987487793,
+ "learning_rate": 6.5e-07,
+ "loss": 1.4307,
+ "step": 13
+ },
+ {
+ "epoch": 0.006619385342789598,
+ "grad_norm": 5.571037292480469,
+ "learning_rate": 7.000000000000001e-07,
+ "loss": 1.3372,
+ "step": 14
+ },
+ {
+ "epoch": 0.0070921985815602835,
+ "grad_norm": 5.480010509490967,
+ "learning_rate": 7.5e-07,
+ "loss": 1.3923,
+ "step": 15
+ },
+ {
+ "epoch": 0.007565011820330969,
+ "grad_norm": 5.254702091217041,
+ "learning_rate": 8.000000000000001e-07,
+ "loss": 1.2928,
+ "step": 16
+ },
+ {
+ "epoch": 0.008037825059101654,
+ "grad_norm": 6.090312480926514,
+ "learning_rate": 8.500000000000001e-07,
+ "loss": 1.4984,
+ "step": 17
+ },
+ {
+ "epoch": 0.00851063829787234,
+ "grad_norm": 5.689319610595703,
+ "learning_rate": 9.000000000000001e-07,
+ "loss": 1.4108,
+ "step": 18
+ },
+ {
+ "epoch": 0.008983451536643027,
+ "grad_norm": 5.386685848236084,
+ "learning_rate": 9.500000000000001e-07,
+ "loss": 1.425,
+ "step": 19
+ },
+ {
+ "epoch": 0.009456264775413711,
+ "grad_norm": 6.451584815979004,
+ "learning_rate": 1.0000000000000002e-06,
+ "loss": 1.5507,
+ "step": 20
+ },
+ {
+ "epoch": 0.009929078014184398,
+ "grad_norm": 5.37647008895874,
+ "learning_rate": 1.0500000000000001e-06,
+ "loss": 1.4109,
+ "step": 21
+ },
+ {
+ "epoch": 0.010401891252955082,
+ "grad_norm": 4.716553211212158,
+ "learning_rate": 1.1e-06,
+ "loss": 1.2028,
+ "step": 22
+ },
+ {
+ "epoch": 0.010874704491725768,
+ "grad_norm": 4.950989723205566,
+ "learning_rate": 1.1500000000000002e-06,
+ "loss": 1.3043,
+ "step": 23
+ },
+ {
+ "epoch": 0.011347517730496455,
+ "grad_norm": 4.688975811004639,
+ "learning_rate": 1.2000000000000002e-06,
+ "loss": 1.2708,
+ "step": 24
+ },
+ {
+ "epoch": 0.01182033096926714,
+ "grad_norm": 4.905868053436279,
+ "learning_rate": 1.25e-06,
+ "loss": 1.3268,
+ "step": 25
+ },
+ {
+ "epoch": 0.012293144208037825,
+ "grad_norm": 4.503395080566406,
+ "learning_rate": 1.3e-06,
+ "loss": 1.1799,
+ "step": 26
+ },
+ {
+ "epoch": 0.01276595744680851,
+ "grad_norm": 4.77382230758667,
+ "learning_rate": 1.3500000000000002e-06,
+ "loss": 1.3882,
+ "step": 27
+ },
+ {
+ "epoch": 0.013238770685579196,
+ "grad_norm": 4.734329700469971,
+ "learning_rate": 1.4000000000000001e-06,
+ "loss": 1.3476,
+ "step": 28
+ },
+ {
+ "epoch": 0.013711583924349883,
+ "grad_norm": 4.775066375732422,
+ "learning_rate": 1.45e-06,
+ "loss": 1.2429,
+ "step": 29
+ },
+ {
+ "epoch": 0.014184397163120567,
+ "grad_norm": 4.978334426879883,
+ "learning_rate": 1.5e-06,
+ "loss": 1.2119,
+ "step": 30
+ },
+ {
+ "epoch": 0.014657210401891253,
+ "grad_norm": 4.506785869598389,
+ "learning_rate": 1.5500000000000002e-06,
+ "loss": 1.3157,
+ "step": 31
+ },
+ {
+ "epoch": 0.015130023640661938,
+ "grad_norm": 4.007757186889648,
+ "learning_rate": 1.6000000000000001e-06,
+ "loss": 1.1451,
+ "step": 32
+ },
+ {
+ "epoch": 0.015602836879432624,
+ "grad_norm": 3.6621618270874023,
+ "learning_rate": 1.6500000000000003e-06,
+ "loss": 1.093,
+ "step": 33
+ },
+ {
+ "epoch": 0.01607565011820331,
+ "grad_norm": 3.8733766078948975,
+ "learning_rate": 1.7000000000000002e-06,
+ "loss": 1.2289,
+ "step": 34
+ },
+ {
+ "epoch": 0.016548463356973995,
+ "grad_norm": 4.3391900062561035,
+ "learning_rate": 1.75e-06,
+ "loss": 1.1453,
+ "step": 35
+ },
+ {
+ "epoch": 0.01702127659574468,
+ "grad_norm": 3.287623643875122,
+ "learning_rate": 1.8000000000000001e-06,
+ "loss": 1.0257,
+ "step": 36
+ },
+ {
+ "epoch": 0.017494089834515367,
+ "grad_norm": 3.591721773147583,
+ "learning_rate": 1.85e-06,
+ "loss": 0.9976,
+ "step": 37
+ },
+ {
+ "epoch": 0.017966903073286054,
+ "grad_norm": 4.028271675109863,
+ "learning_rate": 1.9000000000000002e-06,
+ "loss": 1.0773,
+ "step": 38
+ },
+ {
+ "epoch": 0.018439716312056736,
+ "grad_norm": 3.3543951511383057,
+ "learning_rate": 1.9500000000000004e-06,
+ "loss": 1.1677,
+ "step": 39
+ },
+ {
+ "epoch": 0.018912529550827423,
+ "grad_norm": 3.807624340057373,
+ "learning_rate": 2.0000000000000003e-06,
+ "loss": 1.1232,
+ "step": 40
+ },
+ {
+ "epoch": 0.01938534278959811,
+ "grad_norm": 4.242797374725342,
+ "learning_rate": 2.05e-06,
+ "loss": 1.1819,
+ "step": 41
+ },
+ {
+ "epoch": 0.019858156028368795,
+ "grad_norm": 3.4574992656707764,
+ "learning_rate": 2.1000000000000002e-06,
+ "loss": 0.9878,
+ "step": 42
+ },
+ {
+ "epoch": 0.02033096926713948,
+ "grad_norm": 3.906695604324341,
+ "learning_rate": 2.15e-06,
+ "loss": 1.0592,
+ "step": 43
+ },
+ {
+ "epoch": 0.020803782505910164,
+ "grad_norm": 3.7543163299560547,
+ "learning_rate": 2.2e-06,
+ "loss": 1.0309,
+ "step": 44
+ },
+ {
+ "epoch": 0.02127659574468085,
+ "grad_norm": 3.3777148723602295,
+ "learning_rate": 2.25e-06,
+ "loss": 1.0664,
+ "step": 45
+ },
+ {
+ "epoch": 0.021749408983451537,
+ "grad_norm": 3.6003634929656982,
+ "learning_rate": 2.3000000000000004e-06,
+ "loss": 1.0482,
+ "step": 46
+ },
+ {
+ "epoch": 0.022222222222222223,
+ "grad_norm": 3.3961377143859863,
+ "learning_rate": 2.35e-06,
+ "loss": 1.0252,
+ "step": 47
+ },
+ {
+ "epoch": 0.02269503546099291,
+ "grad_norm": 3.1601035594940186,
+ "learning_rate": 2.4000000000000003e-06,
+ "loss": 1.0435,
+ "step": 48
+ },
+ {
+ "epoch": 0.023167848699763592,
+ "grad_norm": 3.4192967414855957,
+ "learning_rate": 2.4500000000000003e-06,
+ "loss": 1.0935,
+ "step": 49
+ },
+ {
+ "epoch": 0.02364066193853428,
+ "grad_norm": 3.1225922107696533,
+ "learning_rate": 2.5e-06,
+ "loss": 0.8988,
+ "step": 50
+ },
+ {
+ "epoch": 0.024113475177304965,
+ "grad_norm": 3.1423380374908447,
+ "learning_rate": 2.55e-06,
+ "loss": 1.0159,
+ "step": 51
+ },
+ {
+ "epoch": 0.02458628841607565,
+ "grad_norm": 3.4782402515411377,
+ "learning_rate": 2.6e-06,
+ "loss": 1.0231,
+ "step": 52
+ },
+ {
+ "epoch": 0.025059101654846337,
+ "grad_norm": 3.8362693786621094,
+ "learning_rate": 2.6500000000000005e-06,
+ "loss": 1.0725,
+ "step": 53
+ },
+ {
+ "epoch": 0.02553191489361702,
+ "grad_norm": 3.033294916152954,
+ "learning_rate": 2.7000000000000004e-06,
+ "loss": 0.9377,
+ "step": 54
+ },
+ {
+ "epoch": 0.026004728132387706,
+ "grad_norm": 3.849741220474243,
+ "learning_rate": 2.7500000000000004e-06,
+ "loss": 1.0046,
+ "step": 55
+ },
+ {
+ "epoch": 0.026477541371158392,
+ "grad_norm": 3.141876220703125,
+ "learning_rate": 2.8000000000000003e-06,
+ "loss": 0.9226,
+ "step": 56
+ },
+ {
+ "epoch": 0.02695035460992908,
+ "grad_norm": 2.773594856262207,
+ "learning_rate": 2.85e-06,
+ "loss": 0.8662,
+ "step": 57
+ },
+ {
+ "epoch": 0.027423167848699765,
+ "grad_norm": 3.1460225582122803,
+ "learning_rate": 2.9e-06,
+ "loss": 0.9304,
+ "step": 58
+ },
+ {
+ "epoch": 0.027895981087470448,
+ "grad_norm": 3.293583631515503,
+ "learning_rate": 2.95e-06,
+ "loss": 1.0374,
+ "step": 59
+ },
+ {
+ "epoch": 0.028368794326241134,
+ "grad_norm": 3.8190863132476807,
+ "learning_rate": 3e-06,
+ "loss": 0.971,
+ "step": 60
+ },
+ {
+ "epoch": 0.02884160756501182,
+ "grad_norm": 3.4566776752471924,
+ "learning_rate": 3.05e-06,
+ "loss": 0.9631,
+ "step": 61
+ },
+ {
+ "epoch": 0.029314420803782507,
+ "grad_norm": 3.355741500854492,
+ "learning_rate": 3.1000000000000004e-06,
+ "loss": 1.0097,
+ "step": 62
+ },
+ {
+ "epoch": 0.029787234042553193,
+ "grad_norm": 3.29746675491333,
+ "learning_rate": 3.1500000000000003e-06,
+ "loss": 0.9459,
+ "step": 63
+ },
+ {
+ "epoch": 0.030260047281323876,
+ "grad_norm": 3.3122968673706055,
+ "learning_rate": 3.2000000000000003e-06,
+ "loss": 0.8594,
+ "step": 64
+ },
+ {
+ "epoch": 0.030732860520094562,
+ "grad_norm": 3.477701187133789,
+ "learning_rate": 3.2500000000000002e-06,
+ "loss": 0.9197,
+ "step": 65
+ },
+ {
+ "epoch": 0.031205673758865248,
+ "grad_norm": 3.3363406658172607,
+ "learning_rate": 3.3000000000000006e-06,
+ "loss": 0.9478,
+ "step": 66
+ },
+ {
+ "epoch": 0.03167848699763593,
+ "grad_norm": 4.143295764923096,
+ "learning_rate": 3.3500000000000005e-06,
+ "loss": 1.0534,
+ "step": 67
+ },
+ {
+ "epoch": 0.03215130023640662,
+ "grad_norm": 3.2363274097442627,
+ "learning_rate": 3.4000000000000005e-06,
+ "loss": 0.9454,
+ "step": 68
+ },
+ {
+ "epoch": 0.032624113475177303,
+ "grad_norm": 3.198746681213379,
+ "learning_rate": 3.45e-06,
+ "loss": 0.9388,
+ "step": 69
+ },
+ {
+ "epoch": 0.03309692671394799,
+ "grad_norm": 3.5751023292541504,
+ "learning_rate": 3.5e-06,
+ "loss": 0.9444,
+ "step": 70
+ },
+ {
+ "epoch": 0.033569739952718676,
+ "grad_norm": 3.1745729446411133,
+ "learning_rate": 3.5500000000000003e-06,
+ "loss": 0.8683,
+ "step": 71
+ },
+ {
+ "epoch": 0.03404255319148936,
+ "grad_norm": 3.3210883140563965,
+ "learning_rate": 3.6000000000000003e-06,
+ "loss": 0.8811,
+ "step": 72
+ },
+ {
+ "epoch": 0.03451536643026005,
+ "grad_norm": 3.2502429485321045,
+ "learning_rate": 3.65e-06,
+ "loss": 1.0012,
+ "step": 73
+ },
+ {
+ "epoch": 0.034988179669030735,
+ "grad_norm": 3.44598126411438,
+ "learning_rate": 3.7e-06,
+ "loss": 0.9217,
+ "step": 74
+ },
+ {
+ "epoch": 0.03546099290780142,
+ "grad_norm": 3.439117431640625,
+ "learning_rate": 3.7500000000000005e-06,
+ "loss": 0.8976,
+ "step": 75
+ },
+ {
+ "epoch": 0.03593380614657211,
+ "grad_norm": 3.523627758026123,
+ "learning_rate": 3.8000000000000005e-06,
+ "loss": 0.8996,
+ "step": 76
+ },
+ {
+ "epoch": 0.03640661938534279,
+ "grad_norm": 3.3716015815734863,
+ "learning_rate": 3.85e-06,
+ "loss": 0.9061,
+ "step": 77
+ },
+ {
+ "epoch": 0.03687943262411347,
+ "grad_norm": 3.33518385887146,
+ "learning_rate": 3.900000000000001e-06,
+ "loss": 0.9371,
+ "step": 78
+ },
+ {
+ "epoch": 0.03735224586288416,
+ "grad_norm": 3.833829879760742,
+ "learning_rate": 3.95e-06,
+ "loss": 0.9669,
+ "step": 79
+ },
+ {
+ "epoch": 0.037825059101654845,
+ "grad_norm": 3.260446786880493,
+ "learning_rate": 4.000000000000001e-06,
+ "loss": 0.9449,
+ "step": 80
+ },
+ {
+ "epoch": 0.03829787234042553,
+ "grad_norm": 3.532451629638672,
+ "learning_rate": 4.05e-06,
+ "loss": 0.897,
+ "step": 81
+ },
+ {
+ "epoch": 0.03877068557919622,
+ "grad_norm": 3.1156492233276367,
+ "learning_rate": 4.1e-06,
+ "loss": 0.8463,
+ "step": 82
+ },
+ {
+ "epoch": 0.039243498817966904,
+ "grad_norm": 2.8801751136779785,
+ "learning_rate": 4.15e-06,
+ "loss": 0.8616,
+ "step": 83
+ },
+ {
+ "epoch": 0.03971631205673759,
+ "grad_norm": 3.072476863861084,
+ "learning_rate": 4.2000000000000004e-06,
+ "loss": 0.8387,
+ "step": 84
+ },
+ {
+ "epoch": 0.04018912529550828,
+ "grad_norm": 2.9601376056671143,
+ "learning_rate": 4.25e-06,
+ "loss": 0.8538,
+ "step": 85
+ },
+ {
+ "epoch": 0.04066193853427896,
+ "grad_norm": 3.521664619445801,
+ "learning_rate": 4.3e-06,
+ "loss": 0.8894,
+ "step": 86
+ },
+ {
+ "epoch": 0.04113475177304964,
+ "grad_norm": 3.2670981884002686,
+ "learning_rate": 4.350000000000001e-06,
+ "loss": 0.8387,
+ "step": 87
+ },
+ {
+ "epoch": 0.04160756501182033,
+ "grad_norm": 3.422089099884033,
+ "learning_rate": 4.4e-06,
+ "loss": 0.7728,
+ "step": 88
+ },
+ {
+ "epoch": 0.042080378250591015,
+ "grad_norm": 3.414034128189087,
+ "learning_rate": 4.450000000000001e-06,
+ "loss": 0.7968,
+ "step": 89
+ },
+ {
+ "epoch": 0.0425531914893617,
+ "grad_norm": 4.234285354614258,
+ "learning_rate": 4.5e-06,
+ "loss": 0.8502,
+ "step": 90
+ },
+ {
+ "epoch": 0.04302600472813239,
+ "grad_norm": 3.1446919441223145,
+ "learning_rate": 4.5500000000000005e-06,
+ "loss": 0.8236,
+ "step": 91
+ },
+ {
+ "epoch": 0.043498817966903074,
+ "grad_norm": 3.683443307876587,
+ "learning_rate": 4.600000000000001e-06,
+ "loss": 0.9792,
+ "step": 92
+ },
+ {
+ "epoch": 0.04397163120567376,
+ "grad_norm": 3.664219617843628,
+ "learning_rate": 4.65e-06,
+ "loss": 0.8743,
+ "step": 93
+ },
+ {
+ "epoch": 0.044444444444444446,
+ "grad_norm": 3.369479179382324,
+ "learning_rate": 4.7e-06,
+ "loss": 0.8741,
+ "step": 94
+ },
+ {
+ "epoch": 0.04491725768321513,
+ "grad_norm": 3.694949150085449,
+ "learning_rate": 4.75e-06,
+ "loss": 0.7574,
+ "step": 95
+ },
+ {
+ "epoch": 0.04539007092198582,
+ "grad_norm": 3.5144498348236084,
+ "learning_rate": 4.800000000000001e-06,
+ "loss": 0.9934,
+ "step": 96
+ },
+ {
+ "epoch": 0.0458628841607565,
+ "grad_norm": 3.164451837539673,
+ "learning_rate": 4.85e-06,
+ "loss": 0.7463,
+ "step": 97
+ },
+ {
+ "epoch": 0.046335697399527184,
+ "grad_norm": 3.222785472869873,
+ "learning_rate": 4.9000000000000005e-06,
+ "loss": 0.7698,
+ "step": 98
+ },
+ {
+ "epoch": 0.04680851063829787,
+ "grad_norm": 2.9129555225372314,
+ "learning_rate": 4.95e-06,
+ "loss": 0.7856,
+ "step": 99
+ },
+ {
+ "epoch": 0.04728132387706856,
+ "grad_norm": 3.5061235427856445,
+ "learning_rate": 5e-06,
+ "loss": 0.8588,
+ "step": 100
+ },
+ {
+ "epoch": 0.04775413711583924,
+ "grad_norm": 3.2805044651031494,
+ "learning_rate": 4.999999922167982e-06,
+ "loss": 0.7643,
+ "step": 101
+ },
+ {
+ "epoch": 0.04822695035460993,
+ "grad_norm": 3.5461678504943848,
+ "learning_rate": 4.999999688671929e-06,
+ "loss": 0.8253,
+ "step": 102
+ },
+ {
+ "epoch": 0.048699763593380616,
+ "grad_norm": 3.2238264083862305,
+ "learning_rate": 4.99999929951186e-06,
+ "loss": 0.7622,
+ "step": 103
+ },
+ {
+ "epoch": 0.0491725768321513,
+ "grad_norm": 3.818955898284912,
+ "learning_rate": 4.999998754687795e-06,
+ "loss": 0.8471,
+ "step": 104
+ },
+ {
+ "epoch": 0.04964539007092199,
+ "grad_norm": 3.1252424716949463,
+ "learning_rate": 4.99999805419977e-06,
+ "loss": 0.8409,
+ "step": 105
+ },
+ {
+ "epoch": 0.050118203309692674,
+ "grad_norm": 3.604283571243286,
+ "learning_rate": 4.999997198047828e-06,
+ "loss": 0.9027,
+ "step": 106
+ },
+ {
+ "epoch": 0.050591016548463354,
+ "grad_norm": 3.6752424240112305,
+ "learning_rate": 4.999996186232023e-06,
+ "loss": 0.9336,
+ "step": 107
+ },
+ {
+ "epoch": 0.05106382978723404,
+ "grad_norm": 3.517557144165039,
+ "learning_rate": 4.9999950187524184e-06,
+ "loss": 0.8351,
+ "step": 108
+ },
+ {
+ "epoch": 0.051536643026004726,
+ "grad_norm": 3.427285671234131,
+ "learning_rate": 4.999993695609085e-06,
+ "loss": 0.8457,
+ "step": 109
+ },
+ {
+ "epoch": 0.05200945626477541,
+ "grad_norm": 3.2792510986328125,
+ "learning_rate": 4.999992216802107e-06,
+ "loss": 0.8391,
+ "step": 110
+ },
+ {
+ "epoch": 0.0524822695035461,
+ "grad_norm": 3.581094741821289,
+ "learning_rate": 4.999990582331576e-06,
+ "loss": 0.7533,
+ "step": 111
+ },
+ {
+ "epoch": 0.052955082742316785,
+ "grad_norm": 3.1667377948760986,
+ "learning_rate": 4.999988792197593e-06,
+ "loss": 0.9562,
+ "step": 112
+ },
+ {
+ "epoch": 0.05342789598108747,
+ "grad_norm": 3.3609890937805176,
+ "learning_rate": 4.99998684640027e-06,
+ "loss": 0.8181,
+ "step": 113
+ },
+ {
+ "epoch": 0.05390070921985816,
+ "grad_norm": 3.260627269744873,
+ "learning_rate": 4.999984744939729e-06,
+ "loss": 0.8012,
+ "step": 114
+ },
+ {
+ "epoch": 0.054373522458628844,
+ "grad_norm": 3.4535653591156006,
+ "learning_rate": 4.9999824878160985e-06,
+ "loss": 0.919,
+ "step": 115
+ },
+ {
+ "epoch": 0.05484633569739953,
+ "grad_norm": 3.4880740642547607,
+ "learning_rate": 4.999980075029522e-06,
+ "loss": 0.8114,
+ "step": 116
+ },
+ {
+ "epoch": 0.05531914893617021,
+ "grad_norm": 3.2546932697296143,
+ "learning_rate": 4.999977506580147e-06,
+ "loss": 0.8274,
+ "step": 117
+ },
+ {
+ "epoch": 0.055791962174940896,
+ "grad_norm": 3.2762744426727295,
+ "learning_rate": 4.999974782468136e-06,
+ "loss": 0.9018,
+ "step": 118
+ },
+ {
+ "epoch": 0.05626477541371158,
+ "grad_norm": 3.42825984954834,
+ "learning_rate": 4.999971902693657e-06,
+ "loss": 0.8262,
+ "step": 119
+ },
+ {
+ "epoch": 0.05673758865248227,
+ "grad_norm": 3.082496404647827,
+ "learning_rate": 4.99996886725689e-06,
+ "loss": 0.8181,
+ "step": 120
+ },
+ {
+ "epoch": 0.057210401891252954,
+ "grad_norm": 3.322869300842285,
+ "learning_rate": 4.9999656761580225e-06,
+ "loss": 0.8382,
+ "step": 121
+ },
+ {
+ "epoch": 0.05768321513002364,
+ "grad_norm": 3.6365339756011963,
+ "learning_rate": 4.9999623293972555e-06,
+ "loss": 0.7489,
+ "step": 122
+ },
+ {
+ "epoch": 0.05815602836879433,
+ "grad_norm": 3.376352548599243,
+ "learning_rate": 4.999958826974796e-06,
+ "loss": 0.9012,
+ "step": 123
+ },
+ {
+ "epoch": 0.05862884160756501,
+ "grad_norm": 3.49088716506958,
+ "learning_rate": 4.999955168890862e-06,
+ "loss": 0.8999,
+ "step": 124
+ },
+ {
+ "epoch": 0.0591016548463357,
+ "grad_norm": 3.3265068531036377,
+ "learning_rate": 4.999951355145682e-06,
+ "loss": 0.8161,
+ "step": 125
+ },
+ {
+ "epoch": 0.059574468085106386,
+ "grad_norm": 3.697282314300537,
+ "learning_rate": 4.999947385739493e-06,
+ "loss": 0.9623,
+ "step": 126
+ },
+ {
+ "epoch": 0.06004728132387707,
+ "grad_norm": 2.7901928424835205,
+ "learning_rate": 4.999943260672542e-06,
+ "loss": 0.7371,
+ "step": 127
+ },
+ {
+ "epoch": 0.06052009456264775,
+ "grad_norm": 3.110319137573242,
+ "learning_rate": 4.999938979945086e-06,
+ "loss": 0.715,
+ "step": 128
+ },
+ {
+ "epoch": 0.06099290780141844,
+ "grad_norm": 3.2211520671844482,
+ "learning_rate": 4.999934543557392e-06,
+ "loss": 0.8888,
+ "step": 129
+ },
+ {
+ "epoch": 0.061465721040189124,
+ "grad_norm": 3.2466187477111816,
+ "learning_rate": 4.999929951509735e-06,
+ "loss": 0.9389,
+ "step": 130
+ },
+ {
+ "epoch": 0.06193853427895981,
+ "grad_norm": 3.3574399948120117,
+ "learning_rate": 4.999925203802403e-06,
+ "loss": 0.8263,
+ "step": 131
+ },
+ {
+ "epoch": 0.062411347517730496,
+ "grad_norm": 3.275601625442505,
+ "learning_rate": 4.99992030043569e-06,
+ "loss": 0.8338,
+ "step": 132
+ },
+ {
+ "epoch": 0.06288416075650118,
+ "grad_norm": 3.6011312007904053,
+ "learning_rate": 4.999915241409902e-06,
+ "loss": 0.8351,
+ "step": 133
+ },
+ {
+ "epoch": 0.06335697399527186,
+ "grad_norm": 2.969011068344116,
+ "learning_rate": 4.999910026725352e-06,
+ "loss": 0.79,
+ "step": 134
+ },
+ {
+ "epoch": 0.06382978723404255,
+ "grad_norm": 3.690784454345703,
+ "learning_rate": 4.999904656382369e-06,
+ "loss": 0.8209,
+ "step": 135
+ },
+ {
+ "epoch": 0.06430260047281323,
+ "grad_norm": 3.3363115787506104,
+ "learning_rate": 4.999899130381283e-06,
+ "loss": 0.858,
+ "step": 136
+ },
+ {
+ "epoch": 0.06477541371158392,
+ "grad_norm": 3.206881523132324,
+ "learning_rate": 4.9998934487224405e-06,
+ "loss": 0.834,
+ "step": 137
+ },
+ {
+ "epoch": 0.06524822695035461,
+ "grad_norm": 2.773146152496338,
+ "learning_rate": 4.999887611406195e-06,
+ "loss": 0.7576,
+ "step": 138
+ },
+ {
+ "epoch": 0.0657210401891253,
+ "grad_norm": 3.307725667953491,
+ "learning_rate": 4.999881618432908e-06,
+ "loss": 0.7487,
+ "step": 139
+ },
+ {
+ "epoch": 0.06619385342789598,
+ "grad_norm": 4.273657321929932,
+ "learning_rate": 4.999875469802956e-06,
+ "loss": 0.8176,
+ "step": 140
+ },
+ {
+ "epoch": 0.06666666666666667,
+ "grad_norm": 3.0898005962371826,
+ "learning_rate": 4.999869165516719e-06,
+ "loss": 0.7578,
+ "step": 141
+ },
+ {
+ "epoch": 0.06713947990543735,
+ "grad_norm": 3.25150990486145,
+ "learning_rate": 4.9998627055745915e-06,
+ "loss": 0.7873,
+ "step": 142
+ },
+ {
+ "epoch": 0.06761229314420804,
+ "grad_norm": 2.9705755710601807,
+ "learning_rate": 4.999856089976974e-06,
+ "loss": 0.6473,
+ "step": 143
+ },
+ {
+ "epoch": 0.06808510638297872,
+ "grad_norm": 3.5658507347106934,
+ "learning_rate": 4.9998493187242804e-06,
+ "loss": 0.855,
+ "step": 144
+ },
+ {
+ "epoch": 0.06855791962174941,
+ "grad_norm": 3.3994076251983643,
+ "learning_rate": 4.99984239181693e-06,
+ "loss": 0.7926,
+ "step": 145
+ },
+ {
+ "epoch": 0.0690307328605201,
+ "grad_norm": 2.8266260623931885,
+ "learning_rate": 4.999835309255357e-06,
+ "loss": 0.7564,
+ "step": 146
+ },
+ {
+ "epoch": 0.06950354609929078,
+ "grad_norm": 3.1143875122070312,
+ "learning_rate": 4.999828071039999e-06,
+ "loss": 0.8398,
+ "step": 147
+ },
+ {
+ "epoch": 0.06997635933806147,
+ "grad_norm": 2.9364278316497803,
+ "learning_rate": 4.99982067717131e-06,
+ "loss": 0.7381,
+ "step": 148
+ },
+ {
+ "epoch": 0.07044917257683216,
+ "grad_norm": 3.4155616760253906,
+ "learning_rate": 4.999813127649748e-06,
+ "loss": 0.7933,
+ "step": 149
+ },
+ {
+ "epoch": 0.07092198581560284,
+ "grad_norm": 4.371236324310303,
+ "learning_rate": 4.999805422475784e-06,
+ "loss": 0.8292,
+ "step": 150
+ },
+ {
+ "epoch": 0.07139479905437353,
+ "grad_norm": 3.3967185020446777,
+ "learning_rate": 4.999797561649897e-06,
+ "loss": 0.8712,
+ "step": 151
+ },
+ {
+ "epoch": 0.07186761229314421,
+ "grad_norm": 3.343303680419922,
+ "learning_rate": 4.999789545172578e-06,
+ "loss": 0.8177,
+ "step": 152
+ },
+ {
+ "epoch": 0.07234042553191489,
+ "grad_norm": 3.040235757827759,
+ "learning_rate": 4.999781373044325e-06,
+ "loss": 0.7379,
+ "step": 153
+ },
+ {
+ "epoch": 0.07281323877068557,
+ "grad_norm": 3.4069204330444336,
+ "learning_rate": 4.999773045265647e-06,
+ "loss": 0.7939,
+ "step": 154
+ },
+ {
+ "epoch": 0.07328605200945626,
+ "grad_norm": 3.1939475536346436,
+ "learning_rate": 4.999764561837063e-06,
+ "loss": 0.8037,
+ "step": 155
+ },
+ {
+ "epoch": 0.07375886524822695,
+ "grad_norm": 4.452004909515381,
+ "learning_rate": 4.999755922759101e-06,
+ "loss": 0.8421,
+ "step": 156
+ },
+ {
+ "epoch": 0.07423167848699763,
+ "grad_norm": 3.2031240463256836,
+ "learning_rate": 4.999747128032298e-06,
+ "loss": 0.794,
+ "step": 157
+ },
+ {
+ "epoch": 0.07470449172576832,
+ "grad_norm": 3.175920009613037,
+ "learning_rate": 4.999738177657203e-06,
+ "loss": 0.759,
+ "step": 158
+ },
+ {
+ "epoch": 0.075177304964539,
+ "grad_norm": 3.7679688930511475,
+ "learning_rate": 4.9997290716343725e-06,
+ "loss": 0.8174,
+ "step": 159
+ },
+ {
+ "epoch": 0.07565011820330969,
+ "grad_norm": 3.7020037174224854,
+ "learning_rate": 4.999719809964373e-06,
+ "loss": 0.7116,
+ "step": 160
+ },
+ {
+ "epoch": 0.07612293144208038,
+ "grad_norm": 4.357471942901611,
+ "learning_rate": 4.999710392647783e-06,
+ "loss": 0.7649,
+ "step": 161
+ },
+ {
+ "epoch": 0.07659574468085106,
+ "grad_norm": 3.3439087867736816,
+ "learning_rate": 4.999700819685187e-06,
+ "loss": 0.7907,
+ "step": 162
+ },
+ {
+ "epoch": 0.07706855791962175,
+ "grad_norm": 3.210815191268921,
+ "learning_rate": 4.999691091077182e-06,
+ "loss": 0.8446,
+ "step": 163
+ },
+ {
+ "epoch": 0.07754137115839244,
+ "grad_norm": 3.1029553413391113,
+ "learning_rate": 4.9996812068243735e-06,
+ "loss": 0.7232,
+ "step": 164
+ },
+ {
+ "epoch": 0.07801418439716312,
+ "grad_norm": 2.9389400482177734,
+ "learning_rate": 4.999671166927378e-06,
+ "loss": 0.7413,
+ "step": 165
+ },
+ {
+ "epoch": 0.07848699763593381,
+ "grad_norm": 3.7062697410583496,
+ "learning_rate": 4.9996609713868185e-06,
+ "loss": 0.8773,
+ "step": 166
+ },
+ {
+ "epoch": 0.0789598108747045,
+ "grad_norm": 3.2768924236297607,
+ "learning_rate": 4.999650620203332e-06,
+ "loss": 0.8046,
+ "step": 167
+ },
+ {
+ "epoch": 0.07943262411347518,
+ "grad_norm": 3.380373001098633,
+ "learning_rate": 4.999640113377561e-06,
+ "loss": 0.7529,
+ "step": 168
+ },
+ {
+ "epoch": 0.07990543735224587,
+ "grad_norm": 3.520022392272949,
+ "learning_rate": 4.999629450910162e-06,
+ "loss": 0.7352,
+ "step": 169
+ },
+ {
+ "epoch": 0.08037825059101655,
+ "grad_norm": 3.43269419670105,
+ "learning_rate": 4.999618632801796e-06,
+ "loss": 0.9371,
+ "step": 170
+ },
+ {
+ "epoch": 0.08085106382978724,
+ "grad_norm": 3.555877923965454,
+ "learning_rate": 4.99960765905314e-06,
+ "loss": 0.8276,
+ "step": 171
+ },
+ {
+ "epoch": 0.08132387706855793,
+ "grad_norm": 3.597050189971924,
+ "learning_rate": 4.999596529664874e-06,
+ "loss": 0.8164,
+ "step": 172
+ },
+ {
+ "epoch": 0.0817966903073286,
+ "grad_norm": 3.2002956867218018,
+ "learning_rate": 4.999585244637693e-06,
+ "loss": 0.7824,
+ "step": 173
+ },
+ {
+ "epoch": 0.08226950354609928,
+ "grad_norm": 3.527275562286377,
+ "learning_rate": 4.999573803972299e-06,
+ "loss": 0.8033,
+ "step": 174
+ },
+ {
+ "epoch": 0.08274231678486997,
+ "grad_norm": 3.5184452533721924,
+ "learning_rate": 4.999562207669405e-06,
+ "loss": 0.724,
+ "step": 175
+ },
+ {
+ "epoch": 0.08321513002364066,
+ "grad_norm": 3.6635067462921143,
+ "learning_rate": 4.999550455729732e-06,
+ "loss": 0.819,
+ "step": 176
+ },
+ {
+ "epoch": 0.08368794326241134,
+ "grad_norm": 3.192399740219116,
+ "learning_rate": 4.999538548154012e-06,
+ "loss": 0.7999,
+ "step": 177
+ },
+ {
+ "epoch": 0.08416075650118203,
+ "grad_norm": 3.0946953296661377,
+ "learning_rate": 4.999526484942988e-06,
+ "loss": 0.7367,
+ "step": 178
+ },
+ {
+ "epoch": 0.08463356973995272,
+ "grad_norm": 2.847198009490967,
+ "learning_rate": 4.99951426609741e-06,
+ "loss": 0.7536,
+ "step": 179
+ },
+ {
+ "epoch": 0.0851063829787234,
+ "grad_norm": 2.7674827575683594,
+ "learning_rate": 4.999501891618037e-06,
+ "loss": 0.701,
+ "step": 180
+ },
+ {
+ "epoch": 0.08557919621749409,
+ "grad_norm": 3.357933521270752,
+ "learning_rate": 4.999489361505643e-06,
+ "loss": 0.8331,
+ "step": 181
+ },
+ {
+ "epoch": 0.08605200945626477,
+ "grad_norm": 3.1464426517486572,
+ "learning_rate": 4.999476675761004e-06,
+ "loss": 0.7931,
+ "step": 182
+ },
+ {
+ "epoch": 0.08652482269503546,
+ "grad_norm": 3.310697078704834,
+ "learning_rate": 4.999463834384915e-06,
+ "loss": 0.753,
+ "step": 183
+ },
+ {
+ "epoch": 0.08699763593380615,
+ "grad_norm": 2.9794881343841553,
+ "learning_rate": 4.999450837378171e-06,
+ "loss": 0.7091,
+ "step": 184
+ },
+ {
+ "epoch": 0.08747044917257683,
+ "grad_norm": 3.0776889324188232,
+ "learning_rate": 4.999437684741584e-06,
+ "loss": 0.7226,
+ "step": 185
+ },
+ {
+ "epoch": 0.08794326241134752,
+ "grad_norm": 3.6657519340515137,
+ "learning_rate": 4.999424376475972e-06,
+ "loss": 0.845,
+ "step": 186
+ },
+ {
+ "epoch": 0.0884160756501182,
+ "grad_norm": 3.872718572616577,
+ "learning_rate": 4.999410912582164e-06,
+ "loss": 0.812,
+ "step": 187
+ },
+ {
+ "epoch": 0.08888888888888889,
+ "grad_norm": 2.9184508323669434,
+ "learning_rate": 4.9993972930609976e-06,
+ "loss": 0.6823,
+ "step": 188
+ },
+ {
+ "epoch": 0.08936170212765958,
+ "grad_norm": 3.5567142963409424,
+ "learning_rate": 4.999383517913321e-06,
+ "loss": 0.7614,
+ "step": 189
+ },
+ {
+ "epoch": 0.08983451536643026,
+ "grad_norm": 3.3688533306121826,
+ "learning_rate": 4.999369587139992e-06,
+ "loss": 0.858,
+ "step": 190
+ },
+ {
+ "epoch": 0.09030732860520095,
+ "grad_norm": 2.893223524093628,
+ "learning_rate": 4.99935550074188e-06,
+ "loss": 0.6761,
+ "step": 191
+ },
+ {
+ "epoch": 0.09078014184397164,
+ "grad_norm": 3.400225877761841,
+ "learning_rate": 4.999341258719859e-06,
+ "loss": 0.7531,
+ "step": 192
+ },
+ {
+ "epoch": 0.09125295508274232,
+ "grad_norm": 3.6167714595794678,
+ "learning_rate": 4.999326861074817e-06,
+ "loss": 0.8164,
+ "step": 193
+ },
+ {
+ "epoch": 0.091725768321513,
+ "grad_norm": 4.325016498565674,
+ "learning_rate": 4.9993123078076506e-06,
+ "loss": 0.7069,
+ "step": 194
+ },
+ {
+ "epoch": 0.09219858156028368,
+ "grad_norm": 3.195317029953003,
+ "learning_rate": 4.999297598919266e-06,
+ "loss": 0.726,
+ "step": 195
+ },
+ {
+ "epoch": 0.09267139479905437,
+ "grad_norm": 3.146530866622925,
+ "learning_rate": 4.999282734410579e-06,
+ "loss": 0.7888,
+ "step": 196
+ },
+ {
+ "epoch": 0.09314420803782505,
+ "grad_norm": 3.5166752338409424,
+ "learning_rate": 4.999267714282515e-06,
+ "loss": 0.8473,
+ "step": 197
+ },
+ {
+ "epoch": 0.09361702127659574,
+ "grad_norm": 3.3140196800231934,
+ "learning_rate": 4.99925253853601e-06,
+ "loss": 0.7233,
+ "step": 198
+ },
+ {
+ "epoch": 0.09408983451536643,
+ "grad_norm": 3.0318164825439453,
+ "learning_rate": 4.999237207172008e-06,
+ "loss": 0.7543,
+ "step": 199
+ },
+ {
+ "epoch": 0.09456264775413711,
+ "grad_norm": 3.662214756011963,
+ "learning_rate": 4.999221720191464e-06,
+ "loss": 0.7783,
+ "step": 200
+ },
+ {
+ "epoch": 0.0950354609929078,
+ "grad_norm": 3.452078104019165,
+ "learning_rate": 4.9992060775953425e-06,
+ "loss": 0.7868,
+ "step": 201
+ },
+ {
+ "epoch": 0.09550827423167849,
+ "grad_norm": 3.4051287174224854,
+ "learning_rate": 4.999190279384617e-06,
+ "loss": 0.7849,
+ "step": 202
+ },
+ {
+ "epoch": 0.09598108747044917,
+ "grad_norm": 3.1377196311950684,
+ "learning_rate": 4.999174325560271e-06,
+ "loss": 0.8364,
+ "step": 203
+ },
+ {
+ "epoch": 0.09645390070921986,
+ "grad_norm": 3.129473924636841,
+ "learning_rate": 4.999158216123299e-06,
+ "loss": 0.7458,
+ "step": 204
+ },
+ {
+ "epoch": 0.09692671394799054,
+ "grad_norm": 3.169548749923706,
+ "learning_rate": 4.999141951074703e-06,
+ "loss": 0.7256,
+ "step": 205
+ },
+ {
+ "epoch": 0.09739952718676123,
+ "grad_norm": 3.186009168624878,
+ "learning_rate": 4.999125530415495e-06,
+ "loss": 0.783,
+ "step": 206
+ },
+ {
+ "epoch": 0.09787234042553192,
+ "grad_norm": 3.0995123386383057,
+ "learning_rate": 4.9991089541467e-06,
+ "loss": 0.7519,
+ "step": 207
+ },
+ {
+ "epoch": 0.0983451536643026,
+ "grad_norm": 3.1854088306427,
+ "learning_rate": 4.999092222269348e-06,
+ "loss": 0.7444,
+ "step": 208
+ },
+ {
+ "epoch": 0.09881796690307329,
+ "grad_norm": 3.1512246131896973,
+ "learning_rate": 4.999075334784482e-06,
+ "loss": 0.7882,
+ "step": 209
+ },
+ {
+ "epoch": 0.09929078014184398,
+ "grad_norm": 3.6199698448181152,
+ "learning_rate": 4.999058291693153e-06,
+ "loss": 0.8048,
+ "step": 210
+ },
+ {
+ "epoch": 0.09976359338061466,
+ "grad_norm": 2.956907272338867,
+ "learning_rate": 4.999041092996422e-06,
+ "loss": 0.7663,
+ "step": 211
+ },
+ {
+ "epoch": 0.10023640661938535,
+ "grad_norm": 3.3493971824645996,
+ "learning_rate": 4.99902373869536e-06,
+ "loss": 0.7639,
+ "step": 212
+ },
+ {
+ "epoch": 0.10070921985815603,
+ "grad_norm": 3.144812822341919,
+ "learning_rate": 4.9990062287910475e-06,
+ "loss": 0.7953,
+ "step": 213
+ },
+ {
+ "epoch": 0.10118203309692671,
+ "grad_norm": 3.5986971855163574,
+ "learning_rate": 4.998988563284576e-06,
+ "loss": 0.8297,
+ "step": 214
+ },
+ {
+ "epoch": 0.1016548463356974,
+ "grad_norm": 3.447584867477417,
+ "learning_rate": 4.998970742177044e-06,
+ "loss": 0.808,
+ "step": 215
+ },
+ {
+ "epoch": 0.10212765957446808,
+ "grad_norm": 3.791353940963745,
+ "learning_rate": 4.998952765469562e-06,
+ "loss": 0.8005,
+ "step": 216
+ },
+ {
+ "epoch": 0.10260047281323877,
+ "grad_norm": 3.4490807056427,
+ "learning_rate": 4.998934633163247e-06,
+ "loss": 0.8135,
+ "step": 217
+ },
+ {
+ "epoch": 0.10307328605200945,
+ "grad_norm": 3.1053314208984375,
+ "learning_rate": 4.998916345259232e-06,
+ "loss": 0.7888,
+ "step": 218
+ },
+ {
+ "epoch": 0.10354609929078014,
+ "grad_norm": 3.407862663269043,
+ "learning_rate": 4.9988979017586514e-06,
+ "loss": 0.7099,
+ "step": 219
+ },
+ {
+ "epoch": 0.10401891252955082,
+ "grad_norm": 3.116656541824341,
+ "learning_rate": 4.998879302662658e-06,
+ "loss": 0.8344,
+ "step": 220
+ },
+ {
+ "epoch": 0.10449172576832151,
+ "grad_norm": 3.339264154434204,
+ "learning_rate": 4.998860547972406e-06,
+ "loss": 0.8496,
+ "step": 221
+ },
+ {
+ "epoch": 0.1049645390070922,
+ "grad_norm": 3.251892566680908,
+ "learning_rate": 4.998841637689066e-06,
+ "loss": 0.7455,
+ "step": 222
+ },
+ {
+ "epoch": 0.10543735224586288,
+ "grad_norm": 4.098135471343994,
+ "learning_rate": 4.998822571813814e-06,
+ "loss": 0.7772,
+ "step": 223
+ },
+ {
+ "epoch": 0.10591016548463357,
+ "grad_norm": 3.9871134757995605,
+ "learning_rate": 4.998803350347837e-06,
+ "loss": 0.8261,
+ "step": 224
+ },
+ {
+ "epoch": 0.10638297872340426,
+ "grad_norm": 3.2822303771972656,
+ "learning_rate": 4.998783973292333e-06,
+ "loss": 0.8623,
+ "step": 225
+ },
+ {
+ "epoch": 0.10685579196217494,
+ "grad_norm": 3.0356857776641846,
+ "learning_rate": 4.998764440648507e-06,
+ "loss": 0.7426,
+ "step": 226
+ },
+ {
+ "epoch": 0.10732860520094563,
+ "grad_norm": 2.8932785987854004,
+ "learning_rate": 4.998744752417576e-06,
+ "loss": 0.6741,
+ "step": 227
+ },
+ {
+ "epoch": 0.10780141843971631,
+ "grad_norm": 3.085820436477661,
+ "learning_rate": 4.998724908600767e-06,
+ "loss": 0.6549,
+ "step": 228
+ },
+ {
+ "epoch": 0.108274231678487,
+ "grad_norm": 3.135829210281372,
+ "learning_rate": 4.998704909199314e-06,
+ "loss": 0.6702,
+ "step": 229
+ },
+ {
+ "epoch": 0.10874704491725769,
+ "grad_norm": 5.016134262084961,
+ "learning_rate": 4.9986847542144625e-06,
+ "loss": 0.7852,
+ "step": 230
+ },
+ {
+ "epoch": 0.10921985815602837,
+ "grad_norm": 3.9056200981140137,
+ "learning_rate": 4.998664443647468e-06,
+ "loss": 0.9654,
+ "step": 231
+ },
+ {
+ "epoch": 0.10969267139479906,
+ "grad_norm": 3.0880749225616455,
+ "learning_rate": 4.998643977499595e-06,
+ "loss": 0.7579,
+ "step": 232
+ },
+ {
+ "epoch": 0.11016548463356975,
+ "grad_norm": 3.6893601417541504,
+ "learning_rate": 4.998623355772118e-06,
+ "loss": 0.713,
+ "step": 233
+ },
+ {
+ "epoch": 0.11063829787234042,
+ "grad_norm": 4.181536674499512,
+ "learning_rate": 4.998602578466319e-06,
+ "loss": 0.7331,
+ "step": 234
+ },
+ {
+ "epoch": 0.1111111111111111,
+ "grad_norm": 3.036386728286743,
+ "learning_rate": 4.998581645583496e-06,
+ "loss": 0.7115,
+ "step": 235
+ },
+ {
+ "epoch": 0.11158392434988179,
+ "grad_norm": 3.6333255767822266,
+ "learning_rate": 4.998560557124948e-06,
+ "loss": 0.7544,
+ "step": 236
+ },
+ {
+ "epoch": 0.11205673758865248,
+ "grad_norm": 2.926417827606201,
+ "learning_rate": 4.9985393130919915e-06,
+ "loss": 0.715,
+ "step": 237
+ },
+ {
+ "epoch": 0.11252955082742316,
+ "grad_norm": 2.969158172607422,
+ "learning_rate": 4.998517913485946e-06,
+ "loss": 0.7304,
+ "step": 238
+ },
+ {
+ "epoch": 0.11300236406619385,
+ "grad_norm": 3.5254971981048584,
+ "learning_rate": 4.9984963583081466e-06,
+ "loss": 0.7725,
+ "step": 239
+ },
+ {
+ "epoch": 0.11347517730496454,
+ "grad_norm": 3.7840335369110107,
+ "learning_rate": 4.998474647559936e-06,
+ "loss": 0.8685,
+ "step": 240
+ },
+ {
+ "epoch": 0.11394799054373522,
+ "grad_norm": 3.0333125591278076,
+ "learning_rate": 4.9984527812426625e-06,
+ "loss": 0.7793,
+ "step": 241
+ },
+ {
+ "epoch": 0.11442080378250591,
+ "grad_norm": 3.290159225463867,
+ "learning_rate": 4.99843075935769e-06,
+ "loss": 0.7158,
+ "step": 242
+ },
+ {
+ "epoch": 0.1148936170212766,
+ "grad_norm": 3.3935494422912598,
+ "learning_rate": 4.99840858190639e-06,
+ "loss": 0.7643,
+ "step": 243
+ },
+ {
+ "epoch": 0.11536643026004728,
+ "grad_norm": 3.333965539932251,
+ "learning_rate": 4.998386248890142e-06,
+ "loss": 0.7255,
+ "step": 244
+ },
+ {
+ "epoch": 0.11583924349881797,
+ "grad_norm": 2.8129613399505615,
+ "learning_rate": 4.998363760310339e-06,
+ "loss": 0.768,
+ "step": 245
+ },
+ {
+ "epoch": 0.11631205673758865,
+ "grad_norm": 2.8678107261657715,
+ "learning_rate": 4.998341116168378e-06,
+ "loss": 0.7403,
+ "step": 246
+ },
+ {
+ "epoch": 0.11678486997635934,
+ "grad_norm": 2.8898239135742188,
+ "learning_rate": 4.998318316465672e-06,
+ "loss": 0.6844,
+ "step": 247
+ },
+ {
+ "epoch": 0.11725768321513003,
+ "grad_norm": 3.139777898788452,
+ "learning_rate": 4.998295361203637e-06,
+ "loss": 0.7936,
+ "step": 248
+ },
+ {
+ "epoch": 0.11773049645390071,
+ "grad_norm": 3.393721103668213,
+ "learning_rate": 4.998272250383707e-06,
+ "loss": 0.8173,
+ "step": 249
+ },
+ {
+ "epoch": 0.1182033096926714,
+ "grad_norm": 3.240973949432373,
+ "learning_rate": 4.998248984007318e-06,
+ "loss": 0.8252,
+ "step": 250
+ },
+ {
+ "epoch": 0.11867612293144209,
+ "grad_norm": 3.384855031967163,
+ "learning_rate": 4.998225562075918e-06,
+ "loss": 0.7244,
+ "step": 251
+ },
+ {
+ "epoch": 0.11914893617021277,
+ "grad_norm": 3.1881816387176514,
+ "learning_rate": 4.9982019845909675e-06,
+ "loss": 0.6818,
+ "step": 252
+ },
+ {
+ "epoch": 0.11962174940898346,
+ "grad_norm": 2.888364553451538,
+ "learning_rate": 4.998178251553934e-06,
+ "loss": 0.6753,
+ "step": 253
+ },
+ {
+ "epoch": 0.12009456264775414,
+ "grad_norm": 3.630093812942505,
+ "learning_rate": 4.9981543629662944e-06,
+ "loss": 0.7995,
+ "step": 254
+ },
+ {
+ "epoch": 0.12056737588652482,
+ "grad_norm": 2.9820947647094727,
+ "learning_rate": 4.998130318829537e-06,
+ "loss": 0.7478,
+ "step": 255
+ },
+ {
+ "epoch": 0.1210401891252955,
+ "grad_norm": 2.7094738483428955,
+ "learning_rate": 4.998106119145159e-06,
+ "loss": 0.7237,
+ "step": 256
+ },
+ {
+ "epoch": 0.12151300236406619,
+ "grad_norm": 3.1808104515075684,
+ "learning_rate": 4.9980817639146665e-06,
+ "loss": 0.7915,
+ "step": 257
+ },
+ {
+ "epoch": 0.12198581560283688,
+ "grad_norm": 3.1661291122436523,
+ "learning_rate": 4.998057253139575e-06,
+ "loss": 0.8053,
+ "step": 258
+ },
+ {
+ "epoch": 0.12245862884160756,
+ "grad_norm": 3.528749942779541,
+ "learning_rate": 4.998032586821413e-06,
+ "loss": 0.7946,
+ "step": 259
+ },
+ {
+ "epoch": 0.12293144208037825,
+ "grad_norm": 3.125964879989624,
+ "learning_rate": 4.998007764961716e-06,
+ "loss": 0.7569,
+ "step": 260
+ },
+ {
+ "epoch": 0.12340425531914893,
+ "grad_norm": 3.0778942108154297,
+ "learning_rate": 4.997982787562029e-06,
+ "loss": 0.7184,
+ "step": 261
+ },
+ {
+ "epoch": 0.12387706855791962,
+ "grad_norm": 3.3531930446624756,
+ "learning_rate": 4.997957654623906e-06,
+ "loss": 0.7586,
+ "step": 262
+ },
+ {
+ "epoch": 0.1243498817966903,
+ "grad_norm": 3.229278564453125,
+ "learning_rate": 4.997932366148913e-06,
+ "loss": 0.6092,
+ "step": 263
+ },
+ {
+ "epoch": 0.12482269503546099,
+ "grad_norm": 3.7286155223846436,
+ "learning_rate": 4.997906922138626e-06,
+ "loss": 0.7965,
+ "step": 264
+ },
+ {
+ "epoch": 0.12529550827423167,
+ "grad_norm": 3.300311803817749,
+ "learning_rate": 4.997881322594628e-06,
+ "loss": 0.7665,
+ "step": 265
+ },
+ {
+ "epoch": 0.12576832151300235,
+ "grad_norm": 3.411482572555542,
+ "learning_rate": 4.9978555675185115e-06,
+ "loss": 0.7253,
+ "step": 266
+ },
+ {
+ "epoch": 0.12624113475177304,
+ "grad_norm": 3.0884511470794678,
+ "learning_rate": 4.9978296569118825e-06,
+ "loss": 0.659,
+ "step": 267
+ },
+ {
+ "epoch": 0.12671394799054372,
+ "grad_norm": 3.0652925968170166,
+ "learning_rate": 4.9978035907763535e-06,
+ "loss": 0.6739,
+ "step": 268
+ },
+ {
+ "epoch": 0.1271867612293144,
+ "grad_norm": 3.280555009841919,
+ "learning_rate": 4.997777369113547e-06,
+ "loss": 0.8003,
+ "step": 269
+ },
+ {
+ "epoch": 0.1276595744680851,
+ "grad_norm": 2.980860948562622,
+ "learning_rate": 4.997750991925096e-06,
+ "loss": 0.7097,
+ "step": 270
+ },
+ {
+ "epoch": 0.12813238770685578,
+ "grad_norm": 3.301760673522949,
+ "learning_rate": 4.997724459212644e-06,
+ "loss": 0.7894,
+ "step": 271
+ },
+ {
+ "epoch": 0.12860520094562647,
+ "grad_norm": 2.9584903717041016,
+ "learning_rate": 4.997697770977841e-06,
+ "loss": 0.733,
+ "step": 272
+ },
+ {
+ "epoch": 0.12907801418439716,
+ "grad_norm": 3.5632214546203613,
+ "learning_rate": 4.99767092722235e-06,
+ "loss": 0.7228,
+ "step": 273
+ },
+ {
+ "epoch": 0.12955082742316784,
+ "grad_norm": 3.5900983810424805,
+ "learning_rate": 4.997643927947843e-06,
+ "loss": 0.7634,
+ "step": 274
+ },
+ {
+ "epoch": 0.13002364066193853,
+ "grad_norm": 3.332650661468506,
+ "learning_rate": 4.997616773156e-06,
+ "loss": 0.797,
+ "step": 275
+ },
+ {
+ "epoch": 0.13049645390070921,
+ "grad_norm": 3.1094167232513428,
+ "learning_rate": 4.997589462848512e-06,
+ "loss": 0.7849,
+ "step": 276
+ },
+ {
+ "epoch": 0.1309692671394799,
+ "grad_norm": 3.5359463691711426,
+ "learning_rate": 4.99756199702708e-06,
+ "loss": 0.6871,
+ "step": 277
+ },
+ {
+ "epoch": 0.1314420803782506,
+ "grad_norm": 3.190441846847534,
+ "learning_rate": 4.997534375693414e-06,
+ "loss": 0.6883,
+ "step": 278
+ },
+ {
+ "epoch": 0.13191489361702127,
+ "grad_norm": 3.063518762588501,
+ "learning_rate": 4.997506598849234e-06,
+ "loss": 0.7586,
+ "step": 279
+ },
+ {
+ "epoch": 0.13238770685579196,
+ "grad_norm": 3.4112050533294678,
+ "learning_rate": 4.997478666496269e-06,
+ "loss": 0.796,
+ "step": 280
+ },
+ {
+ "epoch": 0.13286052009456265,
+ "grad_norm": 3.231886386871338,
+ "learning_rate": 4.997450578636259e-06,
+ "loss": 0.7714,
+ "step": 281
+ },
+ {
+ "epoch": 0.13333333333333333,
+ "grad_norm": 3.279425621032715,
+ "learning_rate": 4.9974223352709515e-06,
+ "loss": 0.7793,
+ "step": 282
+ },
+ {
+ "epoch": 0.13380614657210402,
+ "grad_norm": 3.2154316902160645,
+ "learning_rate": 4.9973939364021075e-06,
+ "loss": 0.791,
+ "step": 283
+ },
+ {
+ "epoch": 0.1342789598108747,
+ "grad_norm": 3.2090768814086914,
+ "learning_rate": 4.9973653820314925e-06,
+ "loss": 0.6433,
+ "step": 284
+ },
+ {
+ "epoch": 0.1347517730496454,
+ "grad_norm": 3.1712026596069336,
+ "learning_rate": 4.997336672160886e-06,
+ "loss": 0.8128,
+ "step": 285
+ },
+ {
+ "epoch": 0.13522458628841608,
+ "grad_norm": 2.929229497909546,
+ "learning_rate": 4.997307806792076e-06,
+ "loss": 0.7594,
+ "step": 286
+ },
+ {
+ "epoch": 0.13569739952718676,
+ "grad_norm": 3.0363314151763916,
+ "learning_rate": 4.997278785926859e-06,
+ "loss": 0.7336,
+ "step": 287
+ },
+ {
+ "epoch": 0.13617021276595745,
+ "grad_norm": 3.1352357864379883,
+ "learning_rate": 4.997249609567042e-06,
+ "loss": 0.7225,
+ "step": 288
+ },
+ {
+ "epoch": 0.13664302600472814,
+ "grad_norm": 3.3171157836914062,
+ "learning_rate": 4.997220277714442e-06,
+ "loss": 0.7777,
+ "step": 289
+ },
+ {
+ "epoch": 0.13711583924349882,
+ "grad_norm": 3.050717353820801,
+ "learning_rate": 4.997190790370885e-06,
+ "loss": 0.6836,
+ "step": 290
+ },
+ {
+ "epoch": 0.1375886524822695,
+ "grad_norm": 3.0297694206237793,
+ "learning_rate": 4.997161147538208e-06,
+ "loss": 0.6883,
+ "step": 291
+ },
+ {
+ "epoch": 0.1380614657210402,
+ "grad_norm": 3.0566554069519043,
+ "learning_rate": 4.997131349218256e-06,
+ "loss": 0.6674,
+ "step": 292
+ },
+ {
+ "epoch": 0.13853427895981088,
+ "grad_norm": 3.799111843109131,
+ "learning_rate": 4.997101395412885e-06,
+ "loss": 0.8256,
+ "step": 293
+ },
+ {
+ "epoch": 0.13900709219858157,
+ "grad_norm": 3.1394248008728027,
+ "learning_rate": 4.9970712861239576e-06,
+ "loss": 0.7306,
+ "step": 294
+ },
+ {
+ "epoch": 0.13947990543735225,
+ "grad_norm": 3.0605666637420654,
+ "learning_rate": 4.997041021353352e-06,
+ "loss": 0.7212,
+ "step": 295
+ },
+ {
+ "epoch": 0.13995271867612294,
+ "grad_norm": 3.8813397884368896,
+ "learning_rate": 4.997010601102951e-06,
+ "loss": 0.769,
+ "step": 296
+ },
+ {
+ "epoch": 0.14042553191489363,
+ "grad_norm": 3.0514819622039795,
+ "learning_rate": 4.996980025374649e-06,
+ "loss": 0.7422,
+ "step": 297
+ },
+ {
+ "epoch": 0.1408983451536643,
+ "grad_norm": 2.9544146060943604,
+ "learning_rate": 4.99694929417035e-06,
+ "loss": 0.6912,
+ "step": 298
+ },
+ {
+ "epoch": 0.141371158392435,
+ "grad_norm": 3.2635602951049805,
+ "learning_rate": 4.996918407491966e-06,
+ "loss": 0.7395,
+ "step": 299
+ },
+ {
+ "epoch": 0.14184397163120568,
+ "grad_norm": 3.373882532119751,
+ "learning_rate": 4.996887365341423e-06,
+ "loss": 0.7799,
+ "step": 300
+ },
+ {
+ "epoch": 0.14231678486997637,
+ "grad_norm": 3.001128673553467,
+ "learning_rate": 4.996856167720652e-06,
+ "loss": 0.7168,
+ "step": 301
+ },
+ {
+ "epoch": 0.14278959810874706,
+ "grad_norm": 3.1026835441589355,
+ "learning_rate": 4.996824814631595e-06,
+ "loss": 0.7492,
+ "step": 302
+ },
+ {
+ "epoch": 0.14326241134751774,
+ "grad_norm": 3.41947603225708,
+ "learning_rate": 4.996793306076205e-06,
+ "loss": 0.6659,
+ "step": 303
+ },
+ {
+ "epoch": 0.14373522458628843,
+ "grad_norm": 3.2272400856018066,
+ "learning_rate": 4.996761642056444e-06,
+ "loss": 0.7184,
+ "step": 304
+ },
+ {
+ "epoch": 0.14420803782505912,
+ "grad_norm": 2.9488935470581055,
+ "learning_rate": 4.996729822574284e-06,
+ "loss": 0.7451,
+ "step": 305
+ },
+ {
+ "epoch": 0.14468085106382977,
+ "grad_norm": 3.268231153488159,
+ "learning_rate": 4.9966978476317065e-06,
+ "loss": 0.7798,
+ "step": 306
+ },
+ {
+ "epoch": 0.14515366430260046,
+ "grad_norm": 3.9086556434631348,
+ "learning_rate": 4.996665717230701e-06,
+ "loss": 0.7871,
+ "step": 307
+ },
+ {
+ "epoch": 0.14562647754137115,
+ "grad_norm": 3.3483879566192627,
+ "learning_rate": 4.996633431373269e-06,
+ "loss": 0.7415,
+ "step": 308
+ },
+ {
+ "epoch": 0.14609929078014183,
+ "grad_norm": 2.839400053024292,
+ "learning_rate": 4.99660099006142e-06,
+ "loss": 0.7192,
+ "step": 309
+ },
+ {
+ "epoch": 0.14657210401891252,
+ "grad_norm": 3.177302598953247,
+ "learning_rate": 4.996568393297175e-06,
+ "loss": 0.755,
+ "step": 310
+ },
+ {
+ "epoch": 0.1470449172576832,
+ "grad_norm": 3.5477044582366943,
+ "learning_rate": 4.996535641082563e-06,
+ "loss": 0.7531,
+ "step": 311
+ },
+ {
+ "epoch": 0.1475177304964539,
+ "grad_norm": 3.418576717376709,
+ "learning_rate": 4.996502733419624e-06,
+ "loss": 0.8009,
+ "step": 312
+ },
+ {
+ "epoch": 0.14799054373522458,
+ "grad_norm": 3.711341619491577,
+ "learning_rate": 4.996469670310407e-06,
+ "loss": 0.7362,
+ "step": 313
+ },
+ {
+ "epoch": 0.14846335697399526,
+ "grad_norm": 3.2419373989105225,
+ "learning_rate": 4.99643645175697e-06,
+ "loss": 0.7761,
+ "step": 314
+ },
+ {
+ "epoch": 0.14893617021276595,
+ "grad_norm": 3.121858835220337,
+ "learning_rate": 4.996403077761381e-06,
+ "loss": 0.6495,
+ "step": 315
+ },
+ {
+ "epoch": 0.14940898345153664,
+ "grad_norm": 3.123054265975952,
+ "learning_rate": 4.996369548325719e-06,
+ "loss": 0.7444,
+ "step": 316
+ },
+ {
+ "epoch": 0.14988179669030732,
+ "grad_norm": 2.780880928039551,
+ "learning_rate": 4.996335863452072e-06,
+ "loss": 0.672,
+ "step": 317
+ },
+ {
+ "epoch": 0.150354609929078,
+ "grad_norm": 3.3738629817962646,
+ "learning_rate": 4.996302023142536e-06,
+ "loss": 0.7972,
+ "step": 318
+ },
+ {
+ "epoch": 0.1508274231678487,
+ "grad_norm": 3.4874777793884277,
+ "learning_rate": 4.99626802739922e-06,
+ "loss": 0.8252,
+ "step": 319
+ },
+ {
+ "epoch": 0.15130023640661938,
+ "grad_norm": 3.7074787616729736,
+ "learning_rate": 4.9962338762242395e-06,
+ "loss": 0.8216,
+ "step": 320
+ },
+ {
+ "epoch": 0.15177304964539007,
+ "grad_norm": 3.281912326812744,
+ "learning_rate": 4.996199569619721e-06,
+ "loss": 0.8175,
+ "step": 321
+ },
+ {
+ "epoch": 0.15224586288416075,
+ "grad_norm": 2.9485340118408203,
+ "learning_rate": 4.996165107587801e-06,
+ "loss": 0.707,
+ "step": 322
+ },
+ {
+ "epoch": 0.15271867612293144,
+ "grad_norm": 3.3757646083831787,
+ "learning_rate": 4.996130490130625e-06,
+ "loss": 0.7955,
+ "step": 323
+ },
+ {
+ "epoch": 0.15319148936170213,
+ "grad_norm": 2.962181568145752,
+ "learning_rate": 4.996095717250349e-06,
+ "loss": 0.7067,
+ "step": 324
+ },
+ {
+ "epoch": 0.1536643026004728,
+ "grad_norm": 3.114272356033325,
+ "learning_rate": 4.996060788949136e-06,
+ "loss": 0.7486,
+ "step": 325
+ },
+ {
+ "epoch": 0.1541371158392435,
+ "grad_norm": 3.0621590614318848,
+ "learning_rate": 4.996025705229165e-06,
+ "loss": 0.6547,
+ "step": 326
+ },
+ {
+ "epoch": 0.15460992907801419,
+ "grad_norm": 2.8745882511138916,
+ "learning_rate": 4.995990466092616e-06,
+ "loss": 0.6435,
+ "step": 327
+ },
+ {
+ "epoch": 0.15508274231678487,
+ "grad_norm": 2.90841007232666,
+ "learning_rate": 4.995955071541686e-06,
+ "loss": 0.7331,
+ "step": 328
+ },
+ {
+ "epoch": 0.15555555555555556,
+ "grad_norm": 2.694580316543579,
+ "learning_rate": 4.9959195215785784e-06,
+ "loss": 0.6731,
+ "step": 329
+ },
+ {
+ "epoch": 0.15602836879432624,
+ "grad_norm": 3.158083438873291,
+ "learning_rate": 4.995883816205507e-06,
+ "loss": 0.7257,
+ "step": 330
+ },
+ {
+ "epoch": 0.15650118203309693,
+ "grad_norm": 3.3234715461730957,
+ "learning_rate": 4.995847955424694e-06,
+ "loss": 0.7389,
+ "step": 331
+ },
+ {
+ "epoch": 0.15697399527186762,
+ "grad_norm": 2.9406495094299316,
+ "learning_rate": 4.995811939238373e-06,
+ "loss": 0.643,
+ "step": 332
+ },
+ {
+ "epoch": 0.1574468085106383,
+ "grad_norm": 3.3191726207733154,
+ "learning_rate": 4.995775767648785e-06,
+ "loss": 0.7879,
+ "step": 333
+ },
+ {
+ "epoch": 0.157919621749409,
+ "grad_norm": 3.711925745010376,
+ "learning_rate": 4.995739440658185e-06,
+ "loss": 0.7586,
+ "step": 334
+ },
+ {
+ "epoch": 0.15839243498817968,
+ "grad_norm": 9.573421478271484,
+ "learning_rate": 4.995702958268833e-06,
+ "loss": 0.7842,
+ "step": 335
+ },
+ {
+ "epoch": 0.15886524822695036,
+ "grad_norm": 3.4154508113861084,
+ "learning_rate": 4.995666320483001e-06,
+ "loss": 0.6735,
+ "step": 336
+ },
+ {
+ "epoch": 0.15933806146572105,
+ "grad_norm": 3.4169859886169434,
+ "learning_rate": 4.995629527302971e-06,
+ "loss": 0.741,
+ "step": 337
+ },
+ {
+ "epoch": 0.15981087470449173,
+ "grad_norm": 3.287503242492676,
+ "learning_rate": 4.9955925787310335e-06,
+ "loss": 0.7139,
+ "step": 338
+ },
+ {
+ "epoch": 0.16028368794326242,
+ "grad_norm": 3.288409471511841,
+ "learning_rate": 4.995555474769488e-06,
+ "loss": 0.7636,
+ "step": 339
+ },
+ {
+ "epoch": 0.1607565011820331,
+ "grad_norm": 2.8021693229675293,
+ "learning_rate": 4.995518215420646e-06,
+ "loss": 0.5883,
+ "step": 340
+ },
+ {
+ "epoch": 0.1612293144208038,
+ "grad_norm": 2.7038564682006836,
+ "learning_rate": 4.995480800686827e-06,
+ "loss": 0.657,
+ "step": 341
+ },
+ {
+ "epoch": 0.16170212765957448,
+ "grad_norm": 3.2370235919952393,
+ "learning_rate": 4.9954432305703615e-06,
+ "loss": 0.6999,
+ "step": 342
+ },
+ {
+ "epoch": 0.16217494089834517,
+ "grad_norm": 2.8666412830352783,
+ "learning_rate": 4.995405505073588e-06,
+ "loss": 0.7199,
+ "step": 343
+ },
+ {
+ "epoch": 0.16264775413711585,
+ "grad_norm": 3.6467232704162598,
+ "learning_rate": 4.995367624198856e-06,
+ "loss": 0.7317,
+ "step": 344
+ },
+ {
+ "epoch": 0.16312056737588654,
+ "grad_norm": 2.7576327323913574,
+ "learning_rate": 4.9953295879485246e-06,
+ "loss": 0.647,
+ "step": 345
+ },
+ {
+ "epoch": 0.1635933806146572,
+ "grad_norm": 2.922232151031494,
+ "learning_rate": 4.995291396324959e-06,
+ "loss": 0.6686,
+ "step": 346
+ },
+ {
+ "epoch": 0.16406619385342788,
+ "grad_norm": 2.8693501949310303,
+ "learning_rate": 4.995253049330542e-06,
+ "loss": 0.6756,
+ "step": 347
+ },
+ {
+ "epoch": 0.16453900709219857,
+ "grad_norm": 3.671865701675415,
+ "learning_rate": 4.995214546967658e-06,
+ "loss": 0.7347,
+ "step": 348
+ },
+ {
+ "epoch": 0.16501182033096926,
+ "grad_norm": 3.024219274520874,
+ "learning_rate": 4.995175889238706e-06,
+ "loss": 0.7547,
+ "step": 349
+ },
+ {
+ "epoch": 0.16548463356973994,
+ "grad_norm": 2.8470778465270996,
+ "learning_rate": 4.995137076146091e-06,
+ "loss": 0.6764,
+ "step": 350
+ },
+ {
+ "epoch": 0.16595744680851063,
+ "grad_norm": 2.905057907104492,
+ "learning_rate": 4.9950981076922324e-06,
+ "loss": 0.6814,
+ "step": 351
+ },
+ {
+ "epoch": 0.16643026004728131,
+ "grad_norm": 3.504377841949463,
+ "learning_rate": 4.995058983879555e-06,
+ "loss": 0.7145,
+ "step": 352
+ },
+ {
+ "epoch": 0.166903073286052,
+ "grad_norm": 3.0029661655426025,
+ "learning_rate": 4.995019704710495e-06,
+ "loss": 0.7114,
+ "step": 353
+ },
+ {
+ "epoch": 0.1673758865248227,
+ "grad_norm": 2.8666274547576904,
+ "learning_rate": 4.994980270187499e-06,
+ "loss": 0.7416,
+ "step": 354
+ },
+ {
+ "epoch": 0.16784869976359337,
+ "grad_norm": 3.1644718647003174,
+ "learning_rate": 4.994940680313021e-06,
+ "loss": 0.661,
+ "step": 355
+ },
+ {
+ "epoch": 0.16832151300236406,
+ "grad_norm": 3.050391674041748,
+ "learning_rate": 4.994900935089527e-06,
+ "loss": 0.7243,
+ "step": 356
+ },
+ {
+ "epoch": 0.16879432624113475,
+ "grad_norm": 2.985466480255127,
+ "learning_rate": 4.994861034519491e-06,
+ "loss": 0.6917,
+ "step": 357
+ },
+ {
+ "epoch": 0.16926713947990543,
+ "grad_norm": 2.909342050552368,
+ "learning_rate": 4.9948209786053995e-06,
+ "loss": 0.6636,
+ "step": 358
+ },
+ {
+ "epoch": 0.16973995271867612,
+ "grad_norm": 3.2214784622192383,
+ "learning_rate": 4.9947807673497435e-06,
+ "loss": 0.7903,
+ "step": 359
+ },
+ {
+ "epoch": 0.1702127659574468,
+ "grad_norm": 2.5654983520507812,
+ "learning_rate": 4.994740400755029e-06,
+ "loss": 0.6129,
+ "step": 360
+ },
+ {
+ "epoch": 0.1706855791962175,
+ "grad_norm": 3.775646448135376,
+ "learning_rate": 4.99469987882377e-06,
+ "loss": 0.7145,
+ "step": 361
+ },
+ {
+ "epoch": 0.17115839243498818,
+ "grad_norm": 2.8965413570404053,
+ "learning_rate": 4.994659201558487e-06,
+ "loss": 0.7177,
+ "step": 362
+ },
+ {
+ "epoch": 0.17163120567375886,
+ "grad_norm": 3.485597848892212,
+ "learning_rate": 4.9946183689617146e-06,
+ "loss": 0.8107,
+ "step": 363
+ },
+ {
+ "epoch": 0.17210401891252955,
+ "grad_norm": 3.277839183807373,
+ "learning_rate": 4.994577381035995e-06,
+ "loss": 0.691,
+ "step": 364
+ },
+ {
+ "epoch": 0.17257683215130024,
+ "grad_norm": 2.8807685375213623,
+ "learning_rate": 4.99453623778388e-06,
+ "loss": 0.7627,
+ "step": 365
+ },
+ {
+ "epoch": 0.17304964539007092,
+ "grad_norm": 3.0659940242767334,
+ "learning_rate": 4.994494939207932e-06,
+ "loss": 0.6858,
+ "step": 366
+ },
+ {
+ "epoch": 0.1735224586288416,
+ "grad_norm": 3.0881855487823486,
+ "learning_rate": 4.994453485310723e-06,
+ "loss": 0.8212,
+ "step": 367
+ },
+ {
+ "epoch": 0.1739952718676123,
+ "grad_norm": 2.7199201583862305,
+ "learning_rate": 4.994411876094832e-06,
+ "loss": 0.6516,
+ "step": 368
+ },
+ {
+ "epoch": 0.17446808510638298,
+ "grad_norm": 2.955889940261841,
+ "learning_rate": 4.994370111562851e-06,
+ "loss": 0.6579,
+ "step": 369
+ },
+ {
+ "epoch": 0.17494089834515367,
+ "grad_norm": 3.1321663856506348,
+ "learning_rate": 4.994328191717382e-06,
+ "loss": 0.6891,
+ "step": 370
+ },
+ {
+ "epoch": 0.17541371158392435,
+ "grad_norm": 3.0560388565063477,
+ "learning_rate": 4.994286116561034e-06,
+ "loss": 0.7243,
+ "step": 371
+ },
+ {
+ "epoch": 0.17588652482269504,
+ "grad_norm": 3.1560704708099365,
+ "learning_rate": 4.994243886096425e-06,
+ "loss": 0.7262,
+ "step": 372
+ },
+ {
+ "epoch": 0.17635933806146573,
+ "grad_norm": 2.913541316986084,
+ "learning_rate": 4.994201500326187e-06,
+ "loss": 0.7318,
+ "step": 373
+ },
+ {
+ "epoch": 0.1768321513002364,
+ "grad_norm": 3.098376512527466,
+ "learning_rate": 4.994158959252958e-06,
+ "loss": 0.6419,
+ "step": 374
+ },
+ {
+ "epoch": 0.1773049645390071,
+ "grad_norm": 2.977508544921875,
+ "learning_rate": 4.994116262879387e-06,
+ "loss": 0.6709,
+ "step": 375
+ },
+ {
+ "epoch": 0.17777777777777778,
+ "grad_norm": 3.168186902999878,
+ "learning_rate": 4.994073411208133e-06,
+ "loss": 0.6608,
+ "step": 376
+ },
+ {
+ "epoch": 0.17825059101654847,
+ "grad_norm": 3.436844825744629,
+ "learning_rate": 4.994030404241864e-06,
+ "loss": 0.7227,
+ "step": 377
+ },
+ {
+ "epoch": 0.17872340425531916,
+ "grad_norm": 2.8998289108276367,
+ "learning_rate": 4.993987241983258e-06,
+ "loss": 0.6512,
+ "step": 378
+ },
+ {
+ "epoch": 0.17919621749408984,
+ "grad_norm": 3.407191514968872,
+ "learning_rate": 4.993943924435002e-06,
+ "loss": 0.616,
+ "step": 379
+ },
+ {
+ "epoch": 0.17966903073286053,
+ "grad_norm": 3.744858741760254,
+ "learning_rate": 4.993900451599793e-06,
+ "loss": 0.8599,
+ "step": 380
+ },
+ {
+ "epoch": 0.18014184397163122,
+ "grad_norm": 3.486283779144287,
+ "learning_rate": 4.993856823480338e-06,
+ "loss": 0.6634,
+ "step": 381
+ },
+ {
+ "epoch": 0.1806146572104019,
+ "grad_norm": 2.895719051361084,
+ "learning_rate": 4.993813040079355e-06,
+ "loss": 0.6972,
+ "step": 382
+ },
+ {
+ "epoch": 0.1810874704491726,
+ "grad_norm": 2.814133882522583,
+ "learning_rate": 4.993769101399569e-06,
+ "loss": 0.6271,
+ "step": 383
+ },
+ {
+ "epoch": 0.18156028368794327,
+ "grad_norm": 2.8609800338745117,
+ "learning_rate": 4.993725007443715e-06,
+ "loss": 0.6481,
+ "step": 384
+ },
+ {
+ "epoch": 0.18203309692671396,
+ "grad_norm": 3.2829644680023193,
+ "learning_rate": 4.99368075821454e-06,
+ "loss": 0.7999,
+ "step": 385
+ },
+ {
+ "epoch": 0.18250591016548465,
+ "grad_norm": 3.1417458057403564,
+ "learning_rate": 4.993636353714798e-06,
+ "loss": 0.6972,
+ "step": 386
+ },
+ {
+ "epoch": 0.1829787234042553,
+ "grad_norm": 3.0679385662078857,
+ "learning_rate": 4.993591793947256e-06,
+ "loss": 0.667,
+ "step": 387
+ },
+ {
+ "epoch": 0.183451536643026,
+ "grad_norm": 3.1387410163879395,
+ "learning_rate": 4.993547078914686e-06,
+ "loss": 0.7618,
+ "step": 388
+ },
+ {
+ "epoch": 0.18392434988179668,
+ "grad_norm": 2.9181406497955322,
+ "learning_rate": 4.993502208619872e-06,
+ "loss": 0.7391,
+ "step": 389
+ },
+ {
+ "epoch": 0.18439716312056736,
+ "grad_norm": 2.8952157497406006,
+ "learning_rate": 4.993457183065611e-06,
+ "loss": 0.6988,
+ "step": 390
+ },
+ {
+ "epoch": 0.18486997635933805,
+ "grad_norm": 3.2274813652038574,
+ "learning_rate": 4.993412002254704e-06,
+ "loss": 0.688,
+ "step": 391
+ },
+ {
+ "epoch": 0.18534278959810874,
+ "grad_norm": 3.4693779945373535,
+ "learning_rate": 4.993366666189965e-06,
+ "loss": 0.6634,
+ "step": 392
+ },
+ {
+ "epoch": 0.18581560283687942,
+ "grad_norm": 3.5358526706695557,
+ "learning_rate": 4.993321174874217e-06,
+ "loss": 0.7343,
+ "step": 393
+ },
+ {
+ "epoch": 0.1862884160756501,
+ "grad_norm": 3.013338088989258,
+ "learning_rate": 4.993275528310292e-06,
+ "loss": 0.7579,
+ "step": 394
+ },
+ {
+ "epoch": 0.1867612293144208,
+ "grad_norm": 2.694772720336914,
+ "learning_rate": 4.993229726501033e-06,
+ "loss": 0.718,
+ "step": 395
+ },
+ {
+ "epoch": 0.18723404255319148,
+ "grad_norm": 3.070612907409668,
+ "learning_rate": 4.9931837694492915e-06,
+ "loss": 0.6438,
+ "step": 396
+ },
+ {
+ "epoch": 0.18770685579196217,
+ "grad_norm": 2.9193027019500732,
+ "learning_rate": 4.993137657157928e-06,
+ "loss": 0.6788,
+ "step": 397
+ },
+ {
+ "epoch": 0.18817966903073285,
+ "grad_norm": 3.047682046890259,
+ "learning_rate": 4.993091389629816e-06,
+ "loss": 0.6826,
+ "step": 398
+ },
+ {
+ "epoch": 0.18865248226950354,
+ "grad_norm": 2.9629905223846436,
+ "learning_rate": 4.993044966867834e-06,
+ "loss": 0.7196,
+ "step": 399
+ },
+ {
+ "epoch": 0.18912529550827423,
+ "grad_norm": 3.0692050457000732,
+ "learning_rate": 4.992998388874874e-06,
+ "loss": 0.7015,
+ "step": 400
+ },
+ {
+ "epoch": 0.1895981087470449,
+ "grad_norm": 3.5427212715148926,
+ "learning_rate": 4.992951655653836e-06,
+ "loss": 0.8292,
+ "step": 401
+ },
+ {
+ "epoch": 0.1900709219858156,
+ "grad_norm": 2.643526554107666,
+ "learning_rate": 4.992904767207629e-06,
+ "loss": 0.624,
+ "step": 402
+ },
+ {
+ "epoch": 0.19054373522458629,
+ "grad_norm": 3.1185996532440186,
+ "learning_rate": 4.992857723539173e-06,
+ "loss": 0.7354,
+ "step": 403
+ },
+ {
+ "epoch": 0.19101654846335697,
+ "grad_norm": 3.006856679916382,
+ "learning_rate": 4.992810524651398e-06,
+ "loss": 0.7752,
+ "step": 404
+ },
+ {
+ "epoch": 0.19148936170212766,
+ "grad_norm": 2.9913275241851807,
+ "learning_rate": 4.9927631705472425e-06,
+ "loss": 0.7306,
+ "step": 405
+ },
+ {
+ "epoch": 0.19196217494089834,
+ "grad_norm": 2.6794071197509766,
+ "learning_rate": 4.992715661229655e-06,
+ "loss": 0.6136,
+ "step": 406
+ },
+ {
+ "epoch": 0.19243498817966903,
+ "grad_norm": 3.5933966636657715,
+ "learning_rate": 4.992667996701593e-06,
+ "loss": 0.7024,
+ "step": 407
+ },
+ {
+ "epoch": 0.19290780141843972,
+ "grad_norm": 2.862187623977661,
+ "learning_rate": 4.992620176966025e-06,
+ "loss": 0.692,
+ "step": 408
+ },
+ {
+ "epoch": 0.1933806146572104,
+ "grad_norm": 3.076845407485962,
+ "learning_rate": 4.9925722020259286e-06,
+ "loss": 0.7475,
+ "step": 409
+ },
+ {
+ "epoch": 0.1938534278959811,
+ "grad_norm": 3.372919797897339,
+ "learning_rate": 4.9925240718842895e-06,
+ "loss": 0.6886,
+ "step": 410
+ },
+ {
+ "epoch": 0.19432624113475178,
+ "grad_norm": 2.922977924346924,
+ "learning_rate": 4.992475786544108e-06,
+ "loss": 0.7049,
+ "step": 411
+ },
+ {
+ "epoch": 0.19479905437352246,
+ "grad_norm": 2.908034324645996,
+ "learning_rate": 4.992427346008387e-06,
+ "loss": 0.6498,
+ "step": 412
+ },
+ {
+ "epoch": 0.19527186761229315,
+ "grad_norm": 3.096723794937134,
+ "learning_rate": 4.992378750280144e-06,
+ "loss": 0.7151,
+ "step": 413
+ },
+ {
+ "epoch": 0.19574468085106383,
+ "grad_norm": 2.895237684249878,
+ "learning_rate": 4.992329999362405e-06,
+ "loss": 0.7277,
+ "step": 414
+ },
+ {
+ "epoch": 0.19621749408983452,
+ "grad_norm": 2.718230724334717,
+ "learning_rate": 4.9922810932582065e-06,
+ "loss": 0.6375,
+ "step": 415
+ },
+ {
+ "epoch": 0.1966903073286052,
+ "grad_norm": 3.187743663787842,
+ "learning_rate": 4.992232031970592e-06,
+ "loss": 0.6528,
+ "step": 416
+ },
+ {
+ "epoch": 0.1971631205673759,
+ "grad_norm": 2.996406316757202,
+ "learning_rate": 4.992182815502616e-06,
+ "loss": 0.6552,
+ "step": 417
+ },
+ {
+ "epoch": 0.19763593380614658,
+ "grad_norm": 3.301084041595459,
+ "learning_rate": 4.992133443857345e-06,
+ "loss": 0.7061,
+ "step": 418
+ },
+ {
+ "epoch": 0.19810874704491727,
+ "grad_norm": 3.7874677181243896,
+ "learning_rate": 4.992083917037853e-06,
+ "loss": 0.7859,
+ "step": 419
+ },
+ {
+ "epoch": 0.19858156028368795,
+ "grad_norm": 3.124253511428833,
+ "learning_rate": 4.992034235047222e-06,
+ "loss": 0.7615,
+ "step": 420
+ },
+ {
+ "epoch": 0.19905437352245864,
+ "grad_norm": 3.0488970279693604,
+ "learning_rate": 4.991984397888546e-06,
+ "loss": 0.6916,
+ "step": 421
+ },
+ {
+ "epoch": 0.19952718676122932,
+ "grad_norm": 3.1241321563720703,
+ "learning_rate": 4.991934405564929e-06,
+ "loss": 0.7055,
+ "step": 422
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 3.396632432937622,
+ "learning_rate": 4.991884258079484e-06,
+ "loss": 0.7675,
+ "step": 423
+ },
+ {
+ "epoch": 0.2004728132387707,
+ "grad_norm": 3.7776873111724854,
+ "learning_rate": 4.9918339554353316e-06,
+ "loss": 0.7371,
+ "step": 424
+ },
+ {
+ "epoch": 0.20094562647754138,
+ "grad_norm": 3.3356032371520996,
+ "learning_rate": 4.991783497635606e-06,
+ "loss": 0.6778,
+ "step": 425
+ },
+ {
+ "epoch": 0.20141843971631207,
+ "grad_norm": 2.988856792449951,
+ "learning_rate": 4.9917328846834474e-06,
+ "loss": 0.6795,
+ "step": 426
+ },
+ {
+ "epoch": 0.20189125295508276,
+ "grad_norm": 3.264183282852173,
+ "learning_rate": 4.99168211658201e-06,
+ "loss": 0.7707,
+ "step": 427
+ },
+ {
+ "epoch": 0.20236406619385341,
+ "grad_norm": 3.878068208694458,
+ "learning_rate": 4.991631193334451e-06,
+ "loss": 0.857,
+ "step": 428
+ },
+ {
+ "epoch": 0.2028368794326241,
+ "grad_norm": 3.6377553939819336,
+ "learning_rate": 4.991580114943943e-06,
+ "loss": 0.8033,
+ "step": 429
+ },
+ {
+ "epoch": 0.2033096926713948,
+ "grad_norm": 2.95393967628479,
+ "learning_rate": 4.991528881413667e-06,
+ "loss": 0.6809,
+ "step": 430
+ },
+ {
+ "epoch": 0.20378250591016547,
+ "grad_norm": 3.058704376220703,
+ "learning_rate": 4.9914774927468125e-06,
+ "loss": 0.6664,
+ "step": 431
+ },
+ {
+ "epoch": 0.20425531914893616,
+ "grad_norm": 2.7783217430114746,
+ "learning_rate": 4.9914259489465795e-06,
+ "loss": 0.6478,
+ "step": 432
+ },
+ {
+ "epoch": 0.20472813238770685,
+ "grad_norm": 2.4825217723846436,
+ "learning_rate": 4.991374250016177e-06,
+ "loss": 0.6598,
+ "step": 433
+ },
+ {
+ "epoch": 0.20520094562647753,
+ "grad_norm": 2.8753600120544434,
+ "learning_rate": 4.991322395958824e-06,
+ "loss": 0.6947,
+ "step": 434
+ },
+ {
+ "epoch": 0.20567375886524822,
+ "grad_norm": 3.2339367866516113,
+ "learning_rate": 4.99127038677775e-06,
+ "loss": 0.8201,
+ "step": 435
+ },
+ {
+ "epoch": 0.2061465721040189,
+ "grad_norm": 2.9065537452697754,
+ "learning_rate": 4.991218222476193e-06,
+ "loss": 0.6679,
+ "step": 436
+ },
+ {
+ "epoch": 0.2066193853427896,
+ "grad_norm": 3.283228874206543,
+ "learning_rate": 4.991165903057401e-06,
+ "loss": 0.8039,
+ "step": 437
+ },
+ {
+ "epoch": 0.20709219858156028,
+ "grad_norm": 3.429872751235962,
+ "learning_rate": 4.991113428524631e-06,
+ "loss": 0.7392,
+ "step": 438
+ },
+ {
+ "epoch": 0.20756501182033096,
+ "grad_norm": 3.118943452835083,
+ "learning_rate": 4.991060798881152e-06,
+ "loss": 0.6794,
+ "step": 439
+ },
+ {
+ "epoch": 0.20803782505910165,
+ "grad_norm": 3.395970106124878,
+ "learning_rate": 4.99100801413024e-06,
+ "loss": 0.6862,
+ "step": 440
+ },
+ {
+ "epoch": 0.20851063829787234,
+ "grad_norm": 2.869191884994507,
+ "learning_rate": 4.99095507427518e-06,
+ "loss": 0.6076,
+ "step": 441
+ },
+ {
+ "epoch": 0.20898345153664302,
+ "grad_norm": 3.1934661865234375,
+ "learning_rate": 4.990901979319272e-06,
+ "loss": 0.6927,
+ "step": 442
+ },
+ {
+ "epoch": 0.2094562647754137,
+ "grad_norm": 2.9068603515625,
+ "learning_rate": 4.990848729265819e-06,
+ "loss": 0.6864,
+ "step": 443
+ },
+ {
+ "epoch": 0.2099290780141844,
+ "grad_norm": 3.0535948276519775,
+ "learning_rate": 4.9907953241181375e-06,
+ "loss": 0.6396,
+ "step": 444
+ },
+ {
+ "epoch": 0.21040189125295508,
+ "grad_norm": 2.871511459350586,
+ "learning_rate": 4.990741763879554e-06,
+ "loss": 0.6743,
+ "step": 445
+ },
+ {
+ "epoch": 0.21087470449172577,
+ "grad_norm": 2.9184393882751465,
+ "learning_rate": 4.9906880485534015e-06,
+ "loss": 0.6786,
+ "step": 446
+ },
+ {
+ "epoch": 0.21134751773049645,
+ "grad_norm": 3.0628271102905273,
+ "learning_rate": 4.990634178143026e-06,
+ "loss": 0.6326,
+ "step": 447
+ },
+ {
+ "epoch": 0.21182033096926714,
+ "grad_norm": 3.7878305912017822,
+ "learning_rate": 4.990580152651782e-06,
+ "loss": 0.7944,
+ "step": 448
+ },
+ {
+ "epoch": 0.21229314420803783,
+ "grad_norm": 2.8577189445495605,
+ "learning_rate": 4.990525972083031e-06,
+ "loss": 0.71,
+ "step": 449
+ },
+ {
+ "epoch": 0.2127659574468085,
+ "grad_norm": 3.307769775390625,
+ "learning_rate": 4.99047163644015e-06,
+ "loss": 0.6893,
+ "step": 450
+ },
+ {
+ "epoch": 0.2132387706855792,
+ "grad_norm": 2.7391717433929443,
+ "learning_rate": 4.990417145726519e-06,
+ "loss": 0.712,
+ "step": 451
+ },
+ {
+ "epoch": 0.21371158392434988,
+ "grad_norm": 2.938044786453247,
+ "learning_rate": 4.990362499945534e-06,
+ "loss": 0.7516,
+ "step": 452
+ },
+ {
+ "epoch": 0.21418439716312057,
+ "grad_norm": 2.7831056118011475,
+ "learning_rate": 4.990307699100595e-06,
+ "loss": 0.6168,
+ "step": 453
+ },
+ {
+ "epoch": 0.21465721040189126,
+ "grad_norm": 2.907977342605591,
+ "learning_rate": 4.990252743195116e-06,
+ "loss": 0.6706,
+ "step": 454
+ },
+ {
+ "epoch": 0.21513002364066194,
+ "grad_norm": 3.7882161140441895,
+ "learning_rate": 4.990197632232517e-06,
+ "loss": 0.6847,
+ "step": 455
+ },
+ {
+ "epoch": 0.21560283687943263,
+ "grad_norm": 2.899716854095459,
+ "learning_rate": 4.990142366216232e-06,
+ "loss": 0.6699,
+ "step": 456
+ },
+ {
+ "epoch": 0.21607565011820332,
+ "grad_norm": 2.907003879547119,
+ "learning_rate": 4.990086945149701e-06,
+ "loss": 0.6864,
+ "step": 457
+ },
+ {
+ "epoch": 0.216548463356974,
+ "grad_norm": 3.2407333850860596,
+ "learning_rate": 4.9900313690363736e-06,
+ "loss": 0.692,
+ "step": 458
+ },
+ {
+ "epoch": 0.2170212765957447,
+ "grad_norm": 2.9055583477020264,
+ "learning_rate": 4.989975637879712e-06,
+ "loss": 0.7113,
+ "step": 459
+ },
+ {
+ "epoch": 0.21749408983451538,
+ "grad_norm": 2.9836206436157227,
+ "learning_rate": 4.989919751683184e-06,
+ "loss": 0.6673,
+ "step": 460
+ },
+ {
+ "epoch": 0.21796690307328606,
+ "grad_norm": 3.371035575866699,
+ "learning_rate": 4.989863710450273e-06,
+ "loss": 0.7181,
+ "step": 461
+ },
+ {
+ "epoch": 0.21843971631205675,
+ "grad_norm": 2.9636635780334473,
+ "learning_rate": 4.989807514184465e-06,
+ "loss": 0.6082,
+ "step": 462
+ },
+ {
+ "epoch": 0.21891252955082743,
+ "grad_norm": 2.9634664058685303,
+ "learning_rate": 4.9897511628892615e-06,
+ "loss": 0.7086,
+ "step": 463
+ },
+ {
+ "epoch": 0.21938534278959812,
+ "grad_norm": 3.154763698577881,
+ "learning_rate": 4.98969465656817e-06,
+ "loss": 0.7027,
+ "step": 464
+ },
+ {
+ "epoch": 0.2198581560283688,
+ "grad_norm": 2.9959890842437744,
+ "learning_rate": 4.98963799522471e-06,
+ "loss": 0.6498,
+ "step": 465
+ },
+ {
+ "epoch": 0.2203309692671395,
+ "grad_norm": 3.5470590591430664,
+ "learning_rate": 4.989581178862408e-06,
+ "loss": 0.7199,
+ "step": 466
+ },
+ {
+ "epoch": 0.22080378250591018,
+ "grad_norm": 7.1873369216918945,
+ "learning_rate": 4.989524207484802e-06,
+ "loss": 0.6676,
+ "step": 467
+ },
+ {
+ "epoch": 0.22127659574468084,
+ "grad_norm": 3.1099541187286377,
+ "learning_rate": 4.98946708109544e-06,
+ "loss": 0.6785,
+ "step": 468
+ },
+ {
+ "epoch": 0.22174940898345152,
+ "grad_norm": 2.830991506576538,
+ "learning_rate": 4.9894097996978795e-06,
+ "loss": 0.6456,
+ "step": 469
+ },
+ {
+ "epoch": 0.2222222222222222,
+ "grad_norm": 3.0212316513061523,
+ "learning_rate": 4.989352363295687e-06,
+ "loss": 0.6048,
+ "step": 470
+ },
+ {
+ "epoch": 0.2226950354609929,
+ "grad_norm": 3.18776798248291,
+ "learning_rate": 4.989294771892437e-06,
+ "loss": 0.7078,
+ "step": 471
+ },
+ {
+ "epoch": 0.22316784869976358,
+ "grad_norm": 2.9972598552703857,
+ "learning_rate": 4.989237025491717e-06,
+ "loss": 0.7082,
+ "step": 472
+ },
+ {
+ "epoch": 0.22364066193853427,
+ "grad_norm": 3.4935688972473145,
+ "learning_rate": 4.989179124097123e-06,
+ "loss": 0.8199,
+ "step": 473
+ },
+ {
+ "epoch": 0.22411347517730495,
+ "grad_norm": 2.6485543251037598,
+ "learning_rate": 4.9891210677122595e-06,
+ "loss": 0.6371,
+ "step": 474
+ },
+ {
+ "epoch": 0.22458628841607564,
+ "grad_norm": 2.969233512878418,
+ "learning_rate": 4.989062856340742e-06,
+ "loss": 0.6879,
+ "step": 475
+ },
+ {
+ "epoch": 0.22505910165484633,
+ "grad_norm": 2.881875514984131,
+ "learning_rate": 4.989004489986194e-06,
+ "loss": 0.7415,
+ "step": 476
+ },
+ {
+ "epoch": 0.225531914893617,
+ "grad_norm": 2.624540090560913,
+ "learning_rate": 4.98894596865225e-06,
+ "loss": 0.6522,
+ "step": 477
+ },
+ {
+ "epoch": 0.2260047281323877,
+ "grad_norm": 3.61075496673584,
+ "learning_rate": 4.988887292342555e-06,
+ "loss": 0.7109,
+ "step": 478
+ },
+ {
+ "epoch": 0.2264775413711584,
+ "grad_norm": 2.9368972778320312,
+ "learning_rate": 4.988828461060762e-06,
+ "loss": 0.6843,
+ "step": 479
+ },
+ {
+ "epoch": 0.22695035460992907,
+ "grad_norm": 3.0670197010040283,
+ "learning_rate": 4.988769474810533e-06,
+ "loss": 0.6807,
+ "step": 480
+ },
+ {
+ "epoch": 0.22742316784869976,
+ "grad_norm": 2.9662792682647705,
+ "learning_rate": 4.988710333595542e-06,
+ "loss": 0.6796,
+ "step": 481
+ },
+ {
+ "epoch": 0.22789598108747045,
+ "grad_norm": 2.971235752105713,
+ "learning_rate": 4.988651037419472e-06,
+ "loss": 0.696,
+ "step": 482
+ },
+ {
+ "epoch": 0.22836879432624113,
+ "grad_norm": 2.931884527206421,
+ "learning_rate": 4.988591586286013e-06,
+ "loss": 0.7323,
+ "step": 483
+ },
+ {
+ "epoch": 0.22884160756501182,
+ "grad_norm": 2.8114213943481445,
+ "learning_rate": 4.988531980198868e-06,
+ "loss": 0.6584,
+ "step": 484
+ },
+ {
+ "epoch": 0.2293144208037825,
+ "grad_norm": 3.2785916328430176,
+ "learning_rate": 4.98847221916175e-06,
+ "loss": 0.7514,
+ "step": 485
+ },
+ {
+ "epoch": 0.2297872340425532,
+ "grad_norm": 3.0520215034484863,
+ "learning_rate": 4.988412303178377e-06,
+ "loss": 0.7564,
+ "step": 486
+ },
+ {
+ "epoch": 0.23026004728132388,
+ "grad_norm": 3.181002616882324,
+ "learning_rate": 4.988352232252483e-06,
+ "loss": 0.6768,
+ "step": 487
+ },
+ {
+ "epoch": 0.23073286052009456,
+ "grad_norm": 3.4953625202178955,
+ "learning_rate": 4.988292006387805e-06,
+ "loss": 0.7143,
+ "step": 488
+ },
+ {
+ "epoch": 0.23120567375886525,
+ "grad_norm": 3.326571226119995,
+ "learning_rate": 4.988231625588096e-06,
+ "loss": 0.7318,
+ "step": 489
+ },
+ {
+ "epoch": 0.23167848699763594,
+ "grad_norm": 3.09614634513855,
+ "learning_rate": 4.988171089857113e-06,
+ "loss": 0.6574,
+ "step": 490
+ },
+ {
+ "epoch": 0.23215130023640662,
+ "grad_norm": 2.7439446449279785,
+ "learning_rate": 4.9881103991986265e-06,
+ "loss": 0.6637,
+ "step": 491
+ },
+ {
+ "epoch": 0.2326241134751773,
+ "grad_norm": 3.0681190490722656,
+ "learning_rate": 4.988049553616416e-06,
+ "loss": 0.6326,
+ "step": 492
+ },
+ {
+ "epoch": 0.233096926713948,
+ "grad_norm": 3.0757341384887695,
+ "learning_rate": 4.98798855311427e-06,
+ "loss": 0.695,
+ "step": 493
+ },
+ {
+ "epoch": 0.23356973995271868,
+ "grad_norm": 2.8637635707855225,
+ "learning_rate": 4.987927397695985e-06,
+ "loss": 0.6598,
+ "step": 494
+ },
+ {
+ "epoch": 0.23404255319148937,
+ "grad_norm": 3.3641068935394287,
+ "learning_rate": 4.9878660873653715e-06,
+ "loss": 0.7435,
+ "step": 495
+ },
+ {
+ "epoch": 0.23451536643026005,
+ "grad_norm": 3.5025596618652344,
+ "learning_rate": 4.987804622126245e-06,
+ "loss": 0.735,
+ "step": 496
+ },
+ {
+ "epoch": 0.23498817966903074,
+ "grad_norm": 2.9298837184906006,
+ "learning_rate": 4.987743001982434e-06,
+ "loss": 0.7063,
+ "step": 497
+ },
+ {
+ "epoch": 0.23546099290780143,
+ "grad_norm": 2.70358943939209,
+ "learning_rate": 4.987681226937774e-06,
+ "loss": 0.6799,
+ "step": 498
+ },
+ {
+ "epoch": 0.2359338061465721,
+ "grad_norm": 3.027871608734131,
+ "learning_rate": 4.9876192969961125e-06,
+ "loss": 0.6881,
+ "step": 499
+ },
+ {
+ "epoch": 0.2364066193853428,
+ "grad_norm": 3.362306594848633,
+ "learning_rate": 4.987557212161304e-06,
+ "loss": 0.7906,
+ "step": 500
+ },
+ {
+ "epoch": 0.23687943262411348,
+ "grad_norm": 3.3136050701141357,
+ "learning_rate": 4.987494972437217e-06,
+ "loss": 0.6878,
+ "step": 501
+ },
+ {
+ "epoch": 0.23735224586288417,
+ "grad_norm": 3.017089605331421,
+ "learning_rate": 4.9874325778277255e-06,
+ "loss": 0.7279,
+ "step": 502
+ },
+ {
+ "epoch": 0.23782505910165486,
+ "grad_norm": 2.8300516605377197,
+ "learning_rate": 4.987370028336714e-06,
+ "loss": 0.6864,
+ "step": 503
+ },
+ {
+ "epoch": 0.23829787234042554,
+ "grad_norm": 3.201860189437866,
+ "learning_rate": 4.987307323968077e-06,
+ "loss": 0.7531,
+ "step": 504
+ },
+ {
+ "epoch": 0.23877068557919623,
+ "grad_norm": 2.685396194458008,
+ "learning_rate": 4.987244464725721e-06,
+ "loss": 0.5849,
+ "step": 505
+ },
+ {
+ "epoch": 0.23924349881796692,
+ "grad_norm": 2.8715312480926514,
+ "learning_rate": 4.987181450613557e-06,
+ "loss": 0.675,
+ "step": 506
+ },
+ {
+ "epoch": 0.2397163120567376,
+ "grad_norm": 2.813908815383911,
+ "learning_rate": 4.987118281635511e-06,
+ "loss": 0.6841,
+ "step": 507
+ },
+ {
+ "epoch": 0.2401891252955083,
+ "grad_norm": 3.2738473415374756,
+ "learning_rate": 4.987054957795514e-06,
+ "loss": 0.7158,
+ "step": 508
+ },
+ {
+ "epoch": 0.24066193853427895,
+ "grad_norm": 2.896134376525879,
+ "learning_rate": 4.986991479097511e-06,
+ "loss": 0.7542,
+ "step": 509
+ },
+ {
+ "epoch": 0.24113475177304963,
+ "grad_norm": 3.0390403270721436,
+ "learning_rate": 4.986927845545454e-06,
+ "loss": 0.6733,
+ "step": 510
+ },
+ {
+ "epoch": 0.24160756501182032,
+ "grad_norm": 3.0300254821777344,
+ "learning_rate": 4.9868640571433044e-06,
+ "loss": 0.722,
+ "step": 511
+ },
+ {
+ "epoch": 0.242080378250591,
+ "grad_norm": 3.3037352561950684,
+ "learning_rate": 4.986800113895035e-06,
+ "loss": 0.6811,
+ "step": 512
+ },
+ {
+ "epoch": 0.2425531914893617,
+ "grad_norm": 3.0358474254608154,
+ "learning_rate": 4.986736015804627e-06,
+ "loss": 0.7348,
+ "step": 513
+ },
+ {
+ "epoch": 0.24302600472813238,
+ "grad_norm": 3.108792304992676,
+ "learning_rate": 4.986671762876071e-06,
+ "loss": 0.6096,
+ "step": 514
+ },
+ {
+ "epoch": 0.24349881796690306,
+ "grad_norm": 3.1316237449645996,
+ "learning_rate": 4.986607355113367e-06,
+ "loss": 0.6357,
+ "step": 515
+ },
+ {
+ "epoch": 0.24397163120567375,
+ "grad_norm": 3.3095219135284424,
+ "learning_rate": 4.986542792520528e-06,
+ "loss": 0.7515,
+ "step": 516
+ },
+ {
+ "epoch": 0.24444444444444444,
+ "grad_norm": 3.4775984287261963,
+ "learning_rate": 4.986478075101572e-06,
+ "loss": 0.7104,
+ "step": 517
+ },
+ {
+ "epoch": 0.24491725768321512,
+ "grad_norm": 3.341708183288574,
+ "learning_rate": 4.986413202860528e-06,
+ "loss": 0.7339,
+ "step": 518
+ },
+ {
+ "epoch": 0.2453900709219858,
+ "grad_norm": 2.9646966457366943,
+ "learning_rate": 4.986348175801438e-06,
+ "loss": 0.6032,
+ "step": 519
+ },
+ {
+ "epoch": 0.2458628841607565,
+ "grad_norm": 3.1853902339935303,
+ "learning_rate": 4.986282993928349e-06,
+ "loss": 0.6925,
+ "step": 520
+ },
+ {
+ "epoch": 0.24633569739952718,
+ "grad_norm": 3.286909818649292,
+ "learning_rate": 4.98621765724532e-06,
+ "loss": 0.7447,
+ "step": 521
+ },
+ {
+ "epoch": 0.24680851063829787,
+ "grad_norm": 3.2255051136016846,
+ "learning_rate": 4.986152165756419e-06,
+ "loss": 0.7747,
+ "step": 522
+ },
+ {
+ "epoch": 0.24728132387706855,
+ "grad_norm": 3.002352237701416,
+ "learning_rate": 4.986086519465724e-06,
+ "loss": 0.6472,
+ "step": 523
+ },
+ {
+ "epoch": 0.24775413711583924,
+ "grad_norm": 3.4738974571228027,
+ "learning_rate": 4.986020718377322e-06,
+ "loss": 0.7381,
+ "step": 524
+ },
+ {
+ "epoch": 0.24822695035460993,
+ "grad_norm": 3.4470200538635254,
+ "learning_rate": 4.985954762495312e-06,
+ "loss": 0.6878,
+ "step": 525
+ },
+ {
+ "epoch": 0.2486997635933806,
+ "grad_norm": 2.9219350814819336,
+ "learning_rate": 4.985888651823799e-06,
+ "loss": 0.6317,
+ "step": 526
+ },
+ {
+ "epoch": 0.2491725768321513,
+ "grad_norm": 3.061767101287842,
+ "learning_rate": 4.985822386366899e-06,
+ "loss": 0.6842,
+ "step": 527
+ },
+ {
+ "epoch": 0.24964539007092199,
+ "grad_norm": 3.0291247367858887,
+ "learning_rate": 4.985755966128742e-06,
+ "loss": 0.6852,
+ "step": 528
+ },
+ {
+ "epoch": 0.25011820330969264,
+ "grad_norm": 2.964280843734741,
+ "learning_rate": 4.985689391113457e-06,
+ "loss": 0.7738,
+ "step": 529
+ },
+ {
+ "epoch": 0.25059101654846333,
+ "grad_norm": 3.058302164077759,
+ "learning_rate": 4.9856226613251955e-06,
+ "loss": 0.6677,
+ "step": 530
+ },
+ {
+ "epoch": 0.251063829787234,
+ "grad_norm": 3.345141649246216,
+ "learning_rate": 4.985555776768109e-06,
+ "loss": 0.7837,
+ "step": 531
+ },
+ {
+ "epoch": 0.2515366430260047,
+ "grad_norm": 3.565031051635742,
+ "learning_rate": 4.9854887374463636e-06,
+ "loss": 0.7231,
+ "step": 532
+ },
+ {
+ "epoch": 0.2520094562647754,
+ "grad_norm": 2.7953789234161377,
+ "learning_rate": 4.985421543364132e-06,
+ "loss": 0.6102,
+ "step": 533
+ },
+ {
+ "epoch": 0.2524822695035461,
+ "grad_norm": 2.887606620788574,
+ "learning_rate": 4.9853541945256e-06,
+ "loss": 0.6289,
+ "step": 534
+ },
+ {
+ "epoch": 0.25295508274231676,
+ "grad_norm": 3.1480495929718018,
+ "learning_rate": 4.985286690934961e-06,
+ "loss": 0.6348,
+ "step": 535
+ },
+ {
+ "epoch": 0.25342789598108745,
+ "grad_norm": 2.8912761211395264,
+ "learning_rate": 4.985219032596416e-06,
+ "loss": 0.595,
+ "step": 536
+ },
+ {
+ "epoch": 0.25390070921985813,
+ "grad_norm": 2.947936534881592,
+ "learning_rate": 4.98515121951418e-06,
+ "loss": 0.6196,
+ "step": 537
+ },
+ {
+ "epoch": 0.2543735224586288,
+ "grad_norm": 3.1085827350616455,
+ "learning_rate": 4.985083251692474e-06,
+ "loss": 0.6387,
+ "step": 538
+ },
+ {
+ "epoch": 0.2548463356973995,
+ "grad_norm": 3.1688334941864014,
+ "learning_rate": 4.985015129135531e-06,
+ "loss": 0.7055,
+ "step": 539
+ },
+ {
+ "epoch": 0.2553191489361702,
+ "grad_norm": 3.075042963027954,
+ "learning_rate": 4.984946851847593e-06,
+ "loss": 0.7515,
+ "step": 540
+ },
+ {
+ "epoch": 0.2557919621749409,
+ "grad_norm": 3.1933093070983887,
+ "learning_rate": 4.98487841983291e-06,
+ "loss": 0.7054,
+ "step": 541
+ },
+ {
+ "epoch": 0.25626477541371157,
+ "grad_norm": 3.043473958969116,
+ "learning_rate": 4.984809833095744e-06,
+ "loss": 0.6281,
+ "step": 542
+ },
+ {
+ "epoch": 0.25673758865248225,
+ "grad_norm": 3.0532584190368652,
+ "learning_rate": 4.9847410916403645e-06,
+ "loss": 0.6155,
+ "step": 543
+ },
+ {
+ "epoch": 0.25721040189125294,
+ "grad_norm": 3.608480215072632,
+ "learning_rate": 4.984672195471053e-06,
+ "loss": 0.7363,
+ "step": 544
+ },
+ {
+ "epoch": 0.2576832151300236,
+ "grad_norm": 2.7491862773895264,
+ "learning_rate": 4.9846031445921e-06,
+ "loss": 0.6594,
+ "step": 545
+ },
+ {
+ "epoch": 0.2581560283687943,
+ "grad_norm": 2.8602418899536133,
+ "learning_rate": 4.984533939007802e-06,
+ "loss": 0.6742,
+ "step": 546
+ },
+ {
+ "epoch": 0.258628841607565,
+ "grad_norm": 3.1782007217407227,
+ "learning_rate": 4.98446457872247e-06,
+ "loss": 0.731,
+ "step": 547
+ },
+ {
+ "epoch": 0.2591016548463357,
+ "grad_norm": 2.796147584915161,
+ "learning_rate": 4.984395063740423e-06,
+ "loss": 0.6617,
+ "step": 548
+ },
+ {
+ "epoch": 0.25957446808510637,
+ "grad_norm": 2.8392202854156494,
+ "learning_rate": 4.984325394065991e-06,
+ "loss": 0.6753,
+ "step": 549
+ },
+ {
+ "epoch": 0.26004728132387706,
+ "grad_norm": 3.134672164916992,
+ "learning_rate": 4.984255569703508e-06,
+ "loss": 0.7222,
+ "step": 550
+ },
+ {
+ "epoch": 0.26052009456264774,
+ "grad_norm": 2.734330177307129,
+ "learning_rate": 4.984185590657325e-06,
+ "loss": 0.6098,
+ "step": 551
+ },
+ {
+ "epoch": 0.26099290780141843,
+ "grad_norm": 3.739010810852051,
+ "learning_rate": 4.984115456931798e-06,
+ "loss": 0.7457,
+ "step": 552
+ },
+ {
+ "epoch": 0.2614657210401891,
+ "grad_norm": 2.8412528038024902,
+ "learning_rate": 4.9840451685312925e-06,
+ "loss": 0.6972,
+ "step": 553
+ },
+ {
+ "epoch": 0.2619385342789598,
+ "grad_norm": 3.017395496368408,
+ "learning_rate": 4.983974725460188e-06,
+ "loss": 0.6887,
+ "step": 554
+ },
+ {
+ "epoch": 0.2624113475177305,
+ "grad_norm": 3.2746949195861816,
+ "learning_rate": 4.98390412772287e-06,
+ "loss": 0.7047,
+ "step": 555
+ },
+ {
+ "epoch": 0.2628841607565012,
+ "grad_norm": 3.1561965942382812,
+ "learning_rate": 4.983833375323732e-06,
+ "loss": 0.7726,
+ "step": 556
+ },
+ {
+ "epoch": 0.26335697399527186,
+ "grad_norm": 3.2367217540740967,
+ "learning_rate": 4.9837624682671816e-06,
+ "loss": 0.6348,
+ "step": 557
+ },
+ {
+ "epoch": 0.26382978723404255,
+ "grad_norm": 2.8195858001708984,
+ "learning_rate": 4.983691406557633e-06,
+ "loss": 0.6387,
+ "step": 558
+ },
+ {
+ "epoch": 0.26430260047281323,
+ "grad_norm": 3.349820852279663,
+ "learning_rate": 4.983620190199511e-06,
+ "loss": 0.6776,
+ "step": 559
+ },
+ {
+ "epoch": 0.2647754137115839,
+ "grad_norm": 2.8025588989257812,
+ "learning_rate": 4.98354881919725e-06,
+ "loss": 0.6512,
+ "step": 560
+ },
+ {
+ "epoch": 0.2652482269503546,
+ "grad_norm": 2.9125499725341797,
+ "learning_rate": 4.983477293555295e-06,
+ "loss": 0.7024,
+ "step": 561
+ },
+ {
+ "epoch": 0.2657210401891253,
+ "grad_norm": 3.3479275703430176,
+ "learning_rate": 4.983405613278098e-06,
+ "loss": 0.688,
+ "step": 562
+ },
+ {
+ "epoch": 0.266193853427896,
+ "grad_norm": 3.123971462249756,
+ "learning_rate": 4.983333778370123e-06,
+ "loss": 0.6743,
+ "step": 563
+ },
+ {
+ "epoch": 0.26666666666666666,
+ "grad_norm": 2.891625165939331,
+ "learning_rate": 4.983261788835843e-06,
+ "loss": 0.5971,
+ "step": 564
+ },
+ {
+ "epoch": 0.26713947990543735,
+ "grad_norm": 3.5066864490509033,
+ "learning_rate": 4.98318964467974e-06,
+ "loss": 0.6958,
+ "step": 565
+ },
+ {
+ "epoch": 0.26761229314420804,
+ "grad_norm": 2.570547342300415,
+ "learning_rate": 4.983117345906306e-06,
+ "loss": 0.609,
+ "step": 566
+ },
+ {
+ "epoch": 0.2680851063829787,
+ "grad_norm": 3.005106210708618,
+ "learning_rate": 4.983044892520044e-06,
+ "loss": 0.6791,
+ "step": 567
+ },
+ {
+ "epoch": 0.2685579196217494,
+ "grad_norm": 3.429675340652466,
+ "learning_rate": 4.982972284525463e-06,
+ "loss": 0.6625,
+ "step": 568
+ },
+ {
+ "epoch": 0.2690307328605201,
+ "grad_norm": 3.825657367706299,
+ "learning_rate": 4.982899521927086e-06,
+ "loss": 0.6368,
+ "step": 569
+ },
+ {
+ "epoch": 0.2695035460992908,
+ "grad_norm": 2.8699095249176025,
+ "learning_rate": 4.982826604729443e-06,
+ "loss": 0.6425,
+ "step": 570
+ },
+ {
+ "epoch": 0.26997635933806147,
+ "grad_norm": 3.1688714027404785,
+ "learning_rate": 4.982753532937074e-06,
+ "loss": 0.6904,
+ "step": 571
+ },
+ {
+ "epoch": 0.27044917257683215,
+ "grad_norm": 3.3889992237091064,
+ "learning_rate": 4.98268030655453e-06,
+ "loss": 0.7575,
+ "step": 572
+ },
+ {
+ "epoch": 0.27092198581560284,
+ "grad_norm": 3.108315944671631,
+ "learning_rate": 4.982606925586367e-06,
+ "loss": 0.6648,
+ "step": 573
+ },
+ {
+ "epoch": 0.2713947990543735,
+ "grad_norm": 3.209831953048706,
+ "learning_rate": 4.982533390037159e-06,
+ "loss": 0.657,
+ "step": 574
+ },
+ {
+ "epoch": 0.2718676122931442,
+ "grad_norm": 3.1740927696228027,
+ "learning_rate": 4.982459699911482e-06,
+ "loss": 0.7262,
+ "step": 575
+ },
+ {
+ "epoch": 0.2723404255319149,
+ "grad_norm": 3.0190417766571045,
+ "learning_rate": 4.982385855213924e-06,
+ "loss": 0.6368,
+ "step": 576
+ },
+ {
+ "epoch": 0.2728132387706856,
+ "grad_norm": 3.05049467086792,
+ "learning_rate": 4.982311855949084e-06,
+ "loss": 0.72,
+ "step": 577
+ },
+ {
+ "epoch": 0.27328605200945627,
+ "grad_norm": 2.984816551208496,
+ "learning_rate": 4.98223770212157e-06,
+ "loss": 0.6856,
+ "step": 578
+ },
+ {
+ "epoch": 0.27375886524822696,
+ "grad_norm": 2.744969606399536,
+ "learning_rate": 4.982163393735998e-06,
+ "loss": 0.6023,
+ "step": 579
+ },
+ {
+ "epoch": 0.27423167848699764,
+ "grad_norm": 3.170564889907837,
+ "learning_rate": 4.982088930796996e-06,
+ "loss": 0.6678,
+ "step": 580
+ },
+ {
+ "epoch": 0.27470449172576833,
+ "grad_norm": 2.8686118125915527,
+ "learning_rate": 4.982014313309199e-06,
+ "loss": 0.6157,
+ "step": 581
+ },
+ {
+ "epoch": 0.275177304964539,
+ "grad_norm": 2.8768694400787354,
+ "learning_rate": 4.981939541277254e-06,
+ "loss": 0.6566,
+ "step": 582
+ },
+ {
+ "epoch": 0.2756501182033097,
+ "grad_norm": 2.621481418609619,
+ "learning_rate": 4.981864614705818e-06,
+ "loss": 0.7372,
+ "step": 583
+ },
+ {
+ "epoch": 0.2761229314420804,
+ "grad_norm": 3.527374267578125,
+ "learning_rate": 4.981789533599554e-06,
+ "loss": 0.6485,
+ "step": 584
+ },
+ {
+ "epoch": 0.2765957446808511,
+ "grad_norm": 3.3141074180603027,
+ "learning_rate": 4.981714297963138e-06,
+ "loss": 0.6816,
+ "step": 585
+ },
+ {
+ "epoch": 0.27706855791962176,
+ "grad_norm": 2.9247069358825684,
+ "learning_rate": 4.981638907801255e-06,
+ "loss": 0.7217,
+ "step": 586
+ },
+ {
+ "epoch": 0.27754137115839245,
+ "grad_norm": 2.875236749649048,
+ "learning_rate": 4.981563363118599e-06,
+ "loss": 0.6662,
+ "step": 587
+ },
+ {
+ "epoch": 0.27801418439716313,
+ "grad_norm": 2.9540364742279053,
+ "learning_rate": 4.981487663919874e-06,
+ "loss": 0.7225,
+ "step": 588
+ },
+ {
+ "epoch": 0.2784869976359338,
+ "grad_norm": 2.90889310836792,
+ "learning_rate": 4.981411810209793e-06,
+ "loss": 0.6054,
+ "step": 589
+ },
+ {
+ "epoch": 0.2789598108747045,
+ "grad_norm": 2.8541409969329834,
+ "learning_rate": 4.981335801993078e-06,
+ "loss": 0.6539,
+ "step": 590
+ },
+ {
+ "epoch": 0.2794326241134752,
+ "grad_norm": 3.1600730419158936,
+ "learning_rate": 4.981259639274465e-06,
+ "loss": 0.6415,
+ "step": 591
+ },
+ {
+ "epoch": 0.2799054373522459,
+ "grad_norm": 3.569376230239868,
+ "learning_rate": 4.981183322058693e-06,
+ "loss": 0.6944,
+ "step": 592
+ },
+ {
+ "epoch": 0.28037825059101656,
+ "grad_norm": 3.067667007446289,
+ "learning_rate": 4.981106850350515e-06,
+ "loss": 0.7378,
+ "step": 593
+ },
+ {
+ "epoch": 0.28085106382978725,
+ "grad_norm": 3.082073450088501,
+ "learning_rate": 4.981030224154693e-06,
+ "loss": 0.693,
+ "step": 594
+ },
+ {
+ "epoch": 0.28132387706855794,
+ "grad_norm": 2.902932643890381,
+ "learning_rate": 4.980953443475998e-06,
+ "loss": 0.6549,
+ "step": 595
+ },
+ {
+ "epoch": 0.2817966903073286,
+ "grad_norm": 2.6821181774139404,
+ "learning_rate": 4.980876508319211e-06,
+ "loss": 0.6231,
+ "step": 596
+ },
+ {
+ "epoch": 0.2822695035460993,
+ "grad_norm": 3.1747355461120605,
+ "learning_rate": 4.9807994186891215e-06,
+ "loss": 0.6826,
+ "step": 597
+ },
+ {
+ "epoch": 0.28274231678487,
+ "grad_norm": 2.6975860595703125,
+ "learning_rate": 4.980722174590531e-06,
+ "loss": 0.6669,
+ "step": 598
+ },
+ {
+ "epoch": 0.2832151300236407,
+ "grad_norm": 2.924285650253296,
+ "learning_rate": 4.9806447760282486e-06,
+ "loss": 0.689,
+ "step": 599
+ },
+ {
+ "epoch": 0.28368794326241137,
+ "grad_norm": 2.941417694091797,
+ "learning_rate": 4.980567223007093e-06,
+ "loss": 0.6672,
+ "step": 600
+ },
+ {
+ "epoch": 0.28416075650118205,
+ "grad_norm": 2.8582186698913574,
+ "learning_rate": 4.980489515531892e-06,
+ "loss": 0.6229,
+ "step": 601
+ },
+ {
+ "epoch": 0.28463356973995274,
+ "grad_norm": 2.6462013721466064,
+ "learning_rate": 4.9804116536074865e-06,
+ "loss": 0.606,
+ "step": 602
+ },
+ {
+ "epoch": 0.2851063829787234,
+ "grad_norm": 2.9029998779296875,
+ "learning_rate": 4.980333637238723e-06,
+ "loss": 0.5915,
+ "step": 603
+ },
+ {
+ "epoch": 0.2855791962174941,
+ "grad_norm": 3.9359042644500732,
+ "learning_rate": 4.980255466430462e-06,
+ "loss": 0.7035,
+ "step": 604
+ },
+ {
+ "epoch": 0.2860520094562648,
+ "grad_norm": 3.200524091720581,
+ "learning_rate": 4.980177141187566e-06,
+ "loss": 0.7156,
+ "step": 605
+ },
+ {
+ "epoch": 0.2865248226950355,
+ "grad_norm": 3.1708686351776123,
+ "learning_rate": 4.980098661514916e-06,
+ "loss": 0.746,
+ "step": 606
+ },
+ {
+ "epoch": 0.28699763593380617,
+ "grad_norm": 2.8926830291748047,
+ "learning_rate": 4.980020027417397e-06,
+ "loss": 0.6282,
+ "step": 607
+ },
+ {
+ "epoch": 0.28747044917257686,
+ "grad_norm": 3.0526294708251953,
+ "learning_rate": 4.979941238899906e-06,
+ "loss": 0.6594,
+ "step": 608
+ },
+ {
+ "epoch": 0.28794326241134754,
+ "grad_norm": 2.9869306087493896,
+ "learning_rate": 4.9798622959673486e-06,
+ "loss": 0.7771,
+ "step": 609
+ },
+ {
+ "epoch": 0.28841607565011823,
+ "grad_norm": 2.7894513607025146,
+ "learning_rate": 4.979783198624638e-06,
+ "loss": 0.6819,
+ "step": 610
+ },
+ {
+ "epoch": 0.28888888888888886,
+ "grad_norm": 2.958575963973999,
+ "learning_rate": 4.9797039468767025e-06,
+ "loss": 0.6474,
+ "step": 611
+ },
+ {
+ "epoch": 0.28936170212765955,
+ "grad_norm": 3.423748016357422,
+ "learning_rate": 4.979624540728475e-06,
+ "loss": 0.7389,
+ "step": 612
+ },
+ {
+ "epoch": 0.28983451536643023,
+ "grad_norm": 2.9641635417938232,
+ "learning_rate": 4.9795449801849e-06,
+ "loss": 0.6005,
+ "step": 613
+ },
+ {
+ "epoch": 0.2903073286052009,
+ "grad_norm": 3.02274227142334,
+ "learning_rate": 4.979465265250933e-06,
+ "loss": 0.6358,
+ "step": 614
+ },
+ {
+ "epoch": 0.2907801418439716,
+ "grad_norm": 3.0562758445739746,
+ "learning_rate": 4.979385395931534e-06,
+ "loss": 0.6313,
+ "step": 615
+ },
+ {
+ "epoch": 0.2912529550827423,
+ "grad_norm": 3.301816701889038,
+ "learning_rate": 4.97930537223168e-06,
+ "loss": 0.7264,
+ "step": 616
+ },
+ {
+ "epoch": 0.291725768321513,
+ "grad_norm": 2.975360870361328,
+ "learning_rate": 4.979225194156351e-06,
+ "loss": 0.613,
+ "step": 617
+ },
+ {
+ "epoch": 0.29219858156028367,
+ "grad_norm": 2.9245030879974365,
+ "learning_rate": 4.97914486171054e-06,
+ "loss": 0.6646,
+ "step": 618
+ },
+ {
+ "epoch": 0.29267139479905435,
+ "grad_norm": 3.1336188316345215,
+ "learning_rate": 4.979064374899249e-06,
+ "loss": 0.6421,
+ "step": 619
+ },
+ {
+ "epoch": 0.29314420803782504,
+ "grad_norm": 3.6298763751983643,
+ "learning_rate": 4.978983733727491e-06,
+ "loss": 0.6433,
+ "step": 620
+ },
+ {
+ "epoch": 0.2936170212765957,
+ "grad_norm": 2.919597625732422,
+ "learning_rate": 4.9789029382002845e-06,
+ "loss": 0.6288,
+ "step": 621
+ },
+ {
+ "epoch": 0.2940898345153664,
+ "grad_norm": 3.2206127643585205,
+ "learning_rate": 4.978821988322662e-06,
+ "loss": 0.7102,
+ "step": 622
+ },
+ {
+ "epoch": 0.2945626477541371,
+ "grad_norm": 3.1767101287841797,
+ "learning_rate": 4.978740884099664e-06,
+ "loss": 0.6722,
+ "step": 623
+ },
+ {
+ "epoch": 0.2950354609929078,
+ "grad_norm": 3.3425452709198,
+ "learning_rate": 4.97865962553634e-06,
+ "loss": 0.6492,
+ "step": 624
+ },
+ {
+ "epoch": 0.29550827423167847,
+ "grad_norm": 3.0408358573913574,
+ "learning_rate": 4.97857821263775e-06,
+ "loss": 0.6522,
+ "step": 625
+ },
+ {
+ "epoch": 0.29598108747044916,
+ "grad_norm": 2.8144783973693848,
+ "learning_rate": 4.978496645408963e-06,
+ "loss": 0.7237,
+ "step": 626
+ },
+ {
+ "epoch": 0.29645390070921984,
+ "grad_norm": 3.7010560035705566,
+ "learning_rate": 4.978414923855057e-06,
+ "loss": 0.7509,
+ "step": 627
+ },
+ {
+ "epoch": 0.29692671394799053,
+ "grad_norm": 2.9438371658325195,
+ "learning_rate": 4.978333047981122e-06,
+ "loss": 0.6244,
+ "step": 628
+ },
+ {
+ "epoch": 0.2973995271867612,
+ "grad_norm": 3.285982370376587,
+ "learning_rate": 4.978251017792255e-06,
+ "loss": 0.7553,
+ "step": 629
+ },
+ {
+ "epoch": 0.2978723404255319,
+ "grad_norm": 3.7021138668060303,
+ "learning_rate": 4.978168833293564e-06,
+ "loss": 0.7859,
+ "step": 630
+ },
+ {
+ "epoch": 0.2983451536643026,
+ "grad_norm": 3.481858730316162,
+ "learning_rate": 4.9780864944901654e-06,
+ "loss": 0.7146,
+ "step": 631
+ },
+ {
+ "epoch": 0.2988179669030733,
+ "grad_norm": 3.693824529647827,
+ "learning_rate": 4.978004001387188e-06,
+ "loss": 0.6608,
+ "step": 632
+ },
+ {
+ "epoch": 0.29929078014184396,
+ "grad_norm": 3.0069146156311035,
+ "learning_rate": 4.9779213539897665e-06,
+ "loss": 0.6506,
+ "step": 633
+ },
+ {
+ "epoch": 0.29976359338061465,
+ "grad_norm": 3.037644147872925,
+ "learning_rate": 4.977838552303048e-06,
+ "loss": 0.6487,
+ "step": 634
+ },
+ {
+ "epoch": 0.30023640661938533,
+ "grad_norm": 3.018554449081421,
+ "learning_rate": 4.977755596332188e-06,
+ "loss": 0.6128,
+ "step": 635
+ },
+ {
+ "epoch": 0.300709219858156,
+ "grad_norm": 3.000312089920044,
+ "learning_rate": 4.977672486082351e-06,
+ "loss": 0.6431,
+ "step": 636
+ },
+ {
+ "epoch": 0.3011820330969267,
+ "grad_norm": 2.836803913116455,
+ "learning_rate": 4.977589221558713e-06,
+ "loss": 0.5914,
+ "step": 637
+ },
+ {
+ "epoch": 0.3016548463356974,
+ "grad_norm": 3.080469846725464,
+ "learning_rate": 4.977505802766457e-06,
+ "loss": 0.7265,
+ "step": 638
+ },
+ {
+ "epoch": 0.3021276595744681,
+ "grad_norm": 3.2245471477508545,
+ "learning_rate": 4.97742222971078e-06,
+ "loss": 0.6895,
+ "step": 639
+ },
+ {
+ "epoch": 0.30260047281323876,
+ "grad_norm": 3.559006452560425,
+ "learning_rate": 4.977338502396882e-06,
+ "loss": 0.7439,
+ "step": 640
+ },
+ {
+ "epoch": 0.30307328605200945,
+ "grad_norm": 2.9116289615631104,
+ "learning_rate": 4.9772546208299795e-06,
+ "loss": 0.6907,
+ "step": 641
+ },
+ {
+ "epoch": 0.30354609929078014,
+ "grad_norm": 3.3645524978637695,
+ "learning_rate": 4.977170585015295e-06,
+ "loss": 0.6983,
+ "step": 642
+ },
+ {
+ "epoch": 0.3040189125295508,
+ "grad_norm": 3.080148458480835,
+ "learning_rate": 4.977086394958058e-06,
+ "loss": 0.7016,
+ "step": 643
+ },
+ {
+ "epoch": 0.3044917257683215,
+ "grad_norm": 2.9276750087738037,
+ "learning_rate": 4.977002050663515e-06,
+ "loss": 0.6509,
+ "step": 644
+ },
+ {
+ "epoch": 0.3049645390070922,
+ "grad_norm": 3.183609962463379,
+ "learning_rate": 4.976917552136914e-06,
+ "loss": 0.6814,
+ "step": 645
+ },
+ {
+ "epoch": 0.3054373522458629,
+ "grad_norm": 3.0980000495910645,
+ "learning_rate": 4.976832899383519e-06,
+ "loss": 0.6319,
+ "step": 646
+ },
+ {
+ "epoch": 0.30591016548463357,
+ "grad_norm": 3.211376190185547,
+ "learning_rate": 4.9767480924086e-06,
+ "loss": 0.6365,
+ "step": 647
+ },
+ {
+ "epoch": 0.30638297872340425,
+ "grad_norm": 3.214430093765259,
+ "learning_rate": 4.976663131217437e-06,
+ "loss": 0.6006,
+ "step": 648
+ },
+ {
+ "epoch": 0.30685579196217494,
+ "grad_norm": 3.0914318561553955,
+ "learning_rate": 4.976578015815321e-06,
+ "loss": 0.7162,
+ "step": 649
+ },
+ {
+ "epoch": 0.3073286052009456,
+ "grad_norm": 2.7644500732421875,
+ "learning_rate": 4.976492746207551e-06,
+ "loss": 0.6045,
+ "step": 650
+ },
+ {
+ "epoch": 0.3078014184397163,
+ "grad_norm": 3.1913280487060547,
+ "learning_rate": 4.9764073223994374e-06,
+ "loss": 0.6796,
+ "step": 651
+ },
+ {
+ "epoch": 0.308274231678487,
+ "grad_norm": 2.8919692039489746,
+ "learning_rate": 4.976321744396299e-06,
+ "loss": 0.6683,
+ "step": 652
+ },
+ {
+ "epoch": 0.3087470449172577,
+ "grad_norm": 2.862234115600586,
+ "learning_rate": 4.976236012203463e-06,
+ "loss": 0.6631,
+ "step": 653
+ },
+ {
+ "epoch": 0.30921985815602837,
+ "grad_norm": 2.9708092212677,
+ "learning_rate": 4.976150125826268e-06,
+ "loss": 0.6326,
+ "step": 654
+ },
+ {
+ "epoch": 0.30969267139479906,
+ "grad_norm": 2.892465353012085,
+ "learning_rate": 4.976064085270063e-06,
+ "loss": 0.6574,
+ "step": 655
+ },
+ {
+ "epoch": 0.31016548463356974,
+ "grad_norm": 3.9215126037597656,
+ "learning_rate": 4.975977890540205e-06,
+ "loss": 0.7351,
+ "step": 656
+ },
+ {
+ "epoch": 0.31063829787234043,
+ "grad_norm": 2.9544081687927246,
+ "learning_rate": 4.975891541642059e-06,
+ "loss": 0.7264,
+ "step": 657
+ },
+ {
+ "epoch": 0.3111111111111111,
+ "grad_norm": 2.995035409927368,
+ "learning_rate": 4.975805038581005e-06,
+ "loss": 0.7405,
+ "step": 658
+ },
+ {
+ "epoch": 0.3115839243498818,
+ "grad_norm": 2.9653120040893555,
+ "learning_rate": 4.975718381362427e-06,
+ "loss": 0.679,
+ "step": 659
+ },
+ {
+ "epoch": 0.3120567375886525,
+ "grad_norm": 2.93976092338562,
+ "learning_rate": 4.9756315699917205e-06,
+ "loss": 0.627,
+ "step": 660
+ },
+ {
+ "epoch": 0.3125295508274232,
+ "grad_norm": 3.106522560119629,
+ "learning_rate": 4.9755446044742915e-06,
+ "loss": 0.6329,
+ "step": 661
+ },
+ {
+ "epoch": 0.31300236406619386,
+ "grad_norm": 3.0238280296325684,
+ "learning_rate": 4.975457484815554e-06,
+ "loss": 0.6643,
+ "step": 662
+ },
+ {
+ "epoch": 0.31347517730496455,
+ "grad_norm": 2.943528175354004,
+ "learning_rate": 4.9753702110209356e-06,
+ "loss": 0.668,
+ "step": 663
+ },
+ {
+ "epoch": 0.31394799054373523,
+ "grad_norm": 2.6840121746063232,
+ "learning_rate": 4.9752827830958676e-06,
+ "loss": 0.5482,
+ "step": 664
+ },
+ {
+ "epoch": 0.3144208037825059,
+ "grad_norm": 2.823875904083252,
+ "learning_rate": 4.975195201045794e-06,
+ "loss": 0.7017,
+ "step": 665
+ },
+ {
+ "epoch": 0.3148936170212766,
+ "grad_norm": 3.148181200027466,
+ "learning_rate": 4.975107464876168e-06,
+ "loss": 0.747,
+ "step": 666
+ },
+ {
+ "epoch": 0.3153664302600473,
+ "grad_norm": 2.630584478378296,
+ "learning_rate": 4.9750195745924545e-06,
+ "loss": 0.5987,
+ "step": 667
+ },
+ {
+ "epoch": 0.315839243498818,
+ "grad_norm": 3.075866460800171,
+ "learning_rate": 4.974931530200124e-06,
+ "loss": 0.664,
+ "step": 668
+ },
+ {
+ "epoch": 0.31631205673758866,
+ "grad_norm": 2.947197914123535,
+ "learning_rate": 4.974843331704659e-06,
+ "loss": 0.631,
+ "step": 669
+ },
+ {
+ "epoch": 0.31678486997635935,
+ "grad_norm": 3.519646644592285,
+ "learning_rate": 4.974754979111552e-06,
+ "loss": 0.7154,
+ "step": 670
+ },
+ {
+ "epoch": 0.31725768321513004,
+ "grad_norm": 2.8687186241149902,
+ "learning_rate": 4.974666472426305e-06,
+ "loss": 0.6366,
+ "step": 671
+ },
+ {
+ "epoch": 0.3177304964539007,
+ "grad_norm": 2.6966612339019775,
+ "learning_rate": 4.974577811654426e-06,
+ "loss": 0.7112,
+ "step": 672
+ },
+ {
+ "epoch": 0.3182033096926714,
+ "grad_norm": 3.1390228271484375,
+ "learning_rate": 4.974488996801439e-06,
+ "loss": 0.6882,
+ "step": 673
+ },
+ {
+ "epoch": 0.3186761229314421,
+ "grad_norm": 3.4667599201202393,
+ "learning_rate": 4.974400027872871e-06,
+ "loss": 0.7153,
+ "step": 674
+ },
+ {
+ "epoch": 0.3191489361702128,
+ "grad_norm": 2.9632184505462646,
+ "learning_rate": 4.974310904874265e-06,
+ "loss": 0.7081,
+ "step": 675
+ },
+ {
+ "epoch": 0.31962174940898347,
+ "grad_norm": 3.46150279045105,
+ "learning_rate": 4.9742216278111666e-06,
+ "loss": 0.6242,
+ "step": 676
+ },
+ {
+ "epoch": 0.32009456264775416,
+ "grad_norm": 3.380403757095337,
+ "learning_rate": 4.974132196689137e-06,
+ "loss": 0.6863,
+ "step": 677
+ },
+ {
+ "epoch": 0.32056737588652484,
+ "grad_norm": 3.4279606342315674,
+ "learning_rate": 4.974042611513746e-06,
+ "loss": 0.6388,
+ "step": 678
+ },
+ {
+ "epoch": 0.3210401891252955,
+ "grad_norm": 2.634523391723633,
+ "learning_rate": 4.973952872290568e-06,
+ "loss": 0.6038,
+ "step": 679
+ },
+ {
+ "epoch": 0.3215130023640662,
+ "grad_norm": 3.19693922996521,
+ "learning_rate": 4.973862979025194e-06,
+ "loss": 0.6383,
+ "step": 680
+ },
+ {
+ "epoch": 0.3219858156028369,
+ "grad_norm": 3.437692165374756,
+ "learning_rate": 4.973772931723218e-06,
+ "loss": 0.7288,
+ "step": 681
+ },
+ {
+ "epoch": 0.3224586288416076,
+ "grad_norm": 2.506301164627075,
+ "learning_rate": 4.97368273039025e-06,
+ "loss": 0.5707,
+ "step": 682
+ },
+ {
+ "epoch": 0.3229314420803783,
+ "grad_norm": 3.0942845344543457,
+ "learning_rate": 4.9735923750319044e-06,
+ "loss": 0.6348,
+ "step": 683
+ },
+ {
+ "epoch": 0.32340425531914896,
+ "grad_norm": 3.0889835357666016,
+ "learning_rate": 4.973501865653809e-06,
+ "loss": 0.6697,
+ "step": 684
+ },
+ {
+ "epoch": 0.32387706855791965,
+ "grad_norm": 3.0391931533813477,
+ "learning_rate": 4.973411202261598e-06,
+ "loss": 0.7091,
+ "step": 685
+ },
+ {
+ "epoch": 0.32434988179669033,
+ "grad_norm": 3.0333497524261475,
+ "learning_rate": 4.973320384860917e-06,
+ "loss": 0.6403,
+ "step": 686
+ },
+ {
+ "epoch": 0.324822695035461,
+ "grad_norm": 2.9714622497558594,
+ "learning_rate": 4.973229413457421e-06,
+ "loss": 0.6977,
+ "step": 687
+ },
+ {
+ "epoch": 0.3252955082742317,
+ "grad_norm": 3.057558298110962,
+ "learning_rate": 4.973138288056774e-06,
+ "loss": 0.7236,
+ "step": 688
+ },
+ {
+ "epoch": 0.3257683215130024,
+ "grad_norm": 2.921093463897705,
+ "learning_rate": 4.97304700866465e-06,
+ "loss": 0.576,
+ "step": 689
+ },
+ {
+ "epoch": 0.3262411347517731,
+ "grad_norm": 3.0287256240844727,
+ "learning_rate": 4.972955575286732e-06,
+ "loss": 0.7077,
+ "step": 690
+ },
+ {
+ "epoch": 0.32671394799054376,
+ "grad_norm": 2.8621346950531006,
+ "learning_rate": 4.972863987928716e-06,
+ "loss": 0.6952,
+ "step": 691
+ },
+ {
+ "epoch": 0.3271867612293144,
+ "grad_norm": 2.631359100341797,
+ "learning_rate": 4.9727722465963006e-06,
+ "loss": 0.6931,
+ "step": 692
+ },
+ {
+ "epoch": 0.3276595744680851,
+ "grad_norm": 2.8484320640563965,
+ "learning_rate": 4.972680351295201e-06,
+ "loss": 0.6292,
+ "step": 693
+ },
+ {
+ "epoch": 0.32813238770685577,
+ "grad_norm": 2.593001365661621,
+ "learning_rate": 4.972588302031138e-06,
+ "loss": 0.5942,
+ "step": 694
+ },
+ {
+ "epoch": 0.32860520094562645,
+ "grad_norm": 2.6321065425872803,
+ "learning_rate": 4.972496098809844e-06,
+ "loss": 0.65,
+ "step": 695
+ },
+ {
+ "epoch": 0.32907801418439714,
+ "grad_norm": 3.2516732215881348,
+ "learning_rate": 4.972403741637059e-06,
+ "loss": 0.7385,
+ "step": 696
+ },
+ {
+ "epoch": 0.3295508274231678,
+ "grad_norm": 3.180854320526123,
+ "learning_rate": 4.972311230518535e-06,
+ "loss": 0.6569,
+ "step": 697
+ },
+ {
+ "epoch": 0.3300236406619385,
+ "grad_norm": 4.161016941070557,
+ "learning_rate": 4.972218565460031e-06,
+ "loss": 0.6416,
+ "step": 698
+ },
+ {
+ "epoch": 0.3304964539007092,
+ "grad_norm": 3.153897762298584,
+ "learning_rate": 4.972125746467317e-06,
+ "loss": 0.7196,
+ "step": 699
+ },
+ {
+ "epoch": 0.3309692671394799,
+ "grad_norm": 2.9595556259155273,
+ "learning_rate": 4.972032773546173e-06,
+ "loss": 0.7093,
+ "step": 700
+ },
+ {
+ "epoch": 0.33144208037825057,
+ "grad_norm": 3.1086833477020264,
+ "learning_rate": 4.9719396467023875e-06,
+ "loss": 0.6963,
+ "step": 701
+ },
+ {
+ "epoch": 0.33191489361702126,
+ "grad_norm": 2.958921432495117,
+ "learning_rate": 4.971846365941759e-06,
+ "loss": 0.6518,
+ "step": 702
+ },
+ {
+ "epoch": 0.33238770685579194,
+ "grad_norm": 2.8745479583740234,
+ "learning_rate": 4.971752931270096e-06,
+ "loss": 0.696,
+ "step": 703
+ },
+ {
+ "epoch": 0.33286052009456263,
+ "grad_norm": 3.224358558654785,
+ "learning_rate": 4.971659342693217e-06,
+ "loss": 0.6769,
+ "step": 704
+ },
+ {
+ "epoch": 0.3333333333333333,
+ "grad_norm": 2.696319580078125,
+ "learning_rate": 4.9715656002169486e-06,
+ "loss": 0.6833,
+ "step": 705
+ },
+ {
+ "epoch": 0.333806146572104,
+ "grad_norm": 2.9283502101898193,
+ "learning_rate": 4.971471703847127e-06,
+ "loss": 0.6784,
+ "step": 706
+ },
+ {
+ "epoch": 0.3342789598108747,
+ "grad_norm": 2.654914140701294,
+ "learning_rate": 4.9713776535896e-06,
+ "loss": 0.6337,
+ "step": 707
+ },
+ {
+ "epoch": 0.3347517730496454,
+ "grad_norm": 3.041555643081665,
+ "learning_rate": 4.971283449450224e-06,
+ "loss": 0.6227,
+ "step": 708
+ },
+ {
+ "epoch": 0.33522458628841606,
+ "grad_norm": 2.893008232116699,
+ "learning_rate": 4.971189091434863e-06,
+ "loss": 0.655,
+ "step": 709
+ },
+ {
+ "epoch": 0.33569739952718675,
+ "grad_norm": 2.8806653022766113,
+ "learning_rate": 4.971094579549393e-06,
+ "loss": 0.7077,
+ "step": 710
+ },
+ {
+ "epoch": 0.33617021276595743,
+ "grad_norm": 3.4830048084259033,
+ "learning_rate": 4.9709999137996986e-06,
+ "loss": 0.7461,
+ "step": 711
+ },
+ {
+ "epoch": 0.3366430260047281,
+ "grad_norm": 3.155444860458374,
+ "learning_rate": 4.970905094191674e-06,
+ "loss": 0.652,
+ "step": 712
+ },
+ {
+ "epoch": 0.3371158392434988,
+ "grad_norm": 2.7608706951141357,
+ "learning_rate": 4.970810120731225e-06,
+ "loss": 0.684,
+ "step": 713
+ },
+ {
+ "epoch": 0.3375886524822695,
+ "grad_norm": 2.8209474086761475,
+ "learning_rate": 4.970714993424265e-06,
+ "loss": 0.6009,
+ "step": 714
+ },
+ {
+ "epoch": 0.3380614657210402,
+ "grad_norm": 3.6532654762268066,
+ "learning_rate": 4.9706197122767145e-06,
+ "loss": 0.702,
+ "step": 715
+ },
+ {
+ "epoch": 0.33853427895981086,
+ "grad_norm": 2.6276566982269287,
+ "learning_rate": 4.970524277294508e-06,
+ "loss": 0.6338,
+ "step": 716
+ },
+ {
+ "epoch": 0.33900709219858155,
+ "grad_norm": 3.509871482849121,
+ "learning_rate": 4.970428688483589e-06,
+ "loss": 0.6853,
+ "step": 717
+ },
+ {
+ "epoch": 0.33947990543735224,
+ "grad_norm": 5.332682132720947,
+ "learning_rate": 4.970332945849906e-06,
+ "loss": 0.6684,
+ "step": 718
+ },
+ {
+ "epoch": 0.3399527186761229,
+ "grad_norm": 2.718801975250244,
+ "learning_rate": 4.970237049399424e-06,
+ "loss": 0.6676,
+ "step": 719
+ },
+ {
+ "epoch": 0.3404255319148936,
+ "grad_norm": 3.891003131866455,
+ "learning_rate": 4.970140999138112e-06,
+ "loss": 0.7043,
+ "step": 720
+ },
+ {
+ "epoch": 0.3408983451536643,
+ "grad_norm": 2.8863155841827393,
+ "learning_rate": 4.970044795071951e-06,
+ "loss": 0.6563,
+ "step": 721
+ },
+ {
+ "epoch": 0.341371158392435,
+ "grad_norm": 3.2527518272399902,
+ "learning_rate": 4.969948437206932e-06,
+ "loss": 0.7244,
+ "step": 722
+ },
+ {
+ "epoch": 0.34184397163120567,
+ "grad_norm": 2.9726758003234863,
+ "learning_rate": 4.969851925549054e-06,
+ "loss": 0.6548,
+ "step": 723
+ },
+ {
+ "epoch": 0.34231678486997635,
+ "grad_norm": 3.118309497833252,
+ "learning_rate": 4.969755260104327e-06,
+ "loss": 0.7293,
+ "step": 724
+ },
+ {
+ "epoch": 0.34278959810874704,
+ "grad_norm": 3.373068332672119,
+ "learning_rate": 4.969658440878769e-06,
+ "loss": 0.6444,
+ "step": 725
+ },
+ {
+ "epoch": 0.3432624113475177,
+ "grad_norm": 2.7157437801361084,
+ "learning_rate": 4.969561467878409e-06,
+ "loss": 0.642,
+ "step": 726
+ },
+ {
+ "epoch": 0.3437352245862884,
+ "grad_norm": 2.58929705619812,
+ "learning_rate": 4.969464341109285e-06,
+ "loss": 0.6165,
+ "step": 727
+ },
+ {
+ "epoch": 0.3442080378250591,
+ "grad_norm": 2.8811306953430176,
+ "learning_rate": 4.969367060577445e-06,
+ "loss": 0.7127,
+ "step": 728
+ },
+ {
+ "epoch": 0.3446808510638298,
+ "grad_norm": 3.494358539581299,
+ "learning_rate": 4.969269626288946e-06,
+ "loss": 0.7103,
+ "step": 729
+ },
+ {
+ "epoch": 0.34515366430260047,
+ "grad_norm": 2.9753928184509277,
+ "learning_rate": 4.969172038249855e-06,
+ "loss": 0.6911,
+ "step": 730
+ },
+ {
+ "epoch": 0.34562647754137116,
+ "grad_norm": 3.2885913848876953,
+ "learning_rate": 4.969074296466247e-06,
+ "loss": 0.6968,
+ "step": 731
+ },
+ {
+ "epoch": 0.34609929078014184,
+ "grad_norm": 2.7564568519592285,
+ "learning_rate": 4.968976400944211e-06,
+ "loss": 0.6843,
+ "step": 732
+ },
+ {
+ "epoch": 0.34657210401891253,
+ "grad_norm": 2.9255006313323975,
+ "learning_rate": 4.96887835168984e-06,
+ "loss": 0.6024,
+ "step": 733
+ },
+ {
+ "epoch": 0.3470449172576832,
+ "grad_norm": 3.1808290481567383,
+ "learning_rate": 4.968780148709239e-06,
+ "loss": 0.7377,
+ "step": 734
+ },
+ {
+ "epoch": 0.3475177304964539,
+ "grad_norm": 2.956666946411133,
+ "learning_rate": 4.968681792008523e-06,
+ "loss": 0.65,
+ "step": 735
+ },
+ {
+ "epoch": 0.3479905437352246,
+ "grad_norm": 2.9631855487823486,
+ "learning_rate": 4.9685832815938175e-06,
+ "loss": 0.677,
+ "step": 736
+ },
+ {
+ "epoch": 0.3484633569739953,
+ "grad_norm": 2.501917600631714,
+ "learning_rate": 4.968484617471256e-06,
+ "loss": 0.6282,
+ "step": 737
+ },
+ {
+ "epoch": 0.34893617021276596,
+ "grad_norm": 2.750779628753662,
+ "learning_rate": 4.968385799646981e-06,
+ "loss": 0.6507,
+ "step": 738
+ },
+ {
+ "epoch": 0.34940898345153665,
+ "grad_norm": 2.872300624847412,
+ "learning_rate": 4.968286828127146e-06,
+ "loss": 0.5949,
+ "step": 739
+ },
+ {
+ "epoch": 0.34988179669030733,
+ "grad_norm": 2.6316142082214355,
+ "learning_rate": 4.9681877029179124e-06,
+ "loss": 0.6328,
+ "step": 740
+ },
+ {
+ "epoch": 0.350354609929078,
+ "grad_norm": 3.244364023208618,
+ "learning_rate": 4.968088424025454e-06,
+ "loss": 0.7393,
+ "step": 741
+ },
+ {
+ "epoch": 0.3508274231678487,
+ "grad_norm": 2.620465040206909,
+ "learning_rate": 4.967988991455951e-06,
+ "loss": 0.6797,
+ "step": 742
+ },
+ {
+ "epoch": 0.3513002364066194,
+ "grad_norm": 2.854513645172119,
+ "learning_rate": 4.967889405215596e-06,
+ "loss": 0.6368,
+ "step": 743
+ },
+ {
+ "epoch": 0.3517730496453901,
+ "grad_norm": 2.579854726791382,
+ "learning_rate": 4.9677896653105886e-06,
+ "loss": 0.6489,
+ "step": 744
+ },
+ {
+ "epoch": 0.35224586288416077,
+ "grad_norm": 3.0697381496429443,
+ "learning_rate": 4.96768977174714e-06,
+ "loss": 0.6313,
+ "step": 745
+ },
+ {
+ "epoch": 0.35271867612293145,
+ "grad_norm": 3.369338035583496,
+ "learning_rate": 4.96758972453147e-06,
+ "loss": 0.7416,
+ "step": 746
+ },
+ {
+ "epoch": 0.35319148936170214,
+ "grad_norm": 2.836221933364868,
+ "learning_rate": 4.967489523669807e-06,
+ "loss": 0.6422,
+ "step": 747
+ },
+ {
+ "epoch": 0.3536643026004728,
+ "grad_norm": 2.929579496383667,
+ "learning_rate": 4.967389169168392e-06,
+ "loss": 0.6482,
+ "step": 748
+ },
+ {
+ "epoch": 0.3541371158392435,
+ "grad_norm": 2.9243831634521484,
+ "learning_rate": 4.967288661033472e-06,
+ "loss": 0.5813,
+ "step": 749
+ },
+ {
+ "epoch": 0.3546099290780142,
+ "grad_norm": 3.7555336952209473,
+ "learning_rate": 4.967187999271306e-06,
+ "loss": 0.6501,
+ "step": 750
+ },
+ {
+ "epoch": 0.3550827423167849,
+ "grad_norm": 3.4279143810272217,
+ "learning_rate": 4.9670871838881615e-06,
+ "loss": 0.6326,
+ "step": 751
+ },
+ {
+ "epoch": 0.35555555555555557,
+ "grad_norm": 2.875066041946411,
+ "learning_rate": 4.9669862148903166e-06,
+ "loss": 0.664,
+ "step": 752
+ },
+ {
+ "epoch": 0.35602836879432626,
+ "grad_norm": 3.130394697189331,
+ "learning_rate": 4.966885092284057e-06,
+ "loss": 0.706,
+ "step": 753
+ },
+ {
+ "epoch": 0.35650118203309694,
+ "grad_norm": 2.9606287479400635,
+ "learning_rate": 4.96678381607568e-06,
+ "loss": 0.693,
+ "step": 754
+ },
+ {
+ "epoch": 0.35697399527186763,
+ "grad_norm": 3.0584909915924072,
+ "learning_rate": 4.966682386271491e-06,
+ "loss": 0.6034,
+ "step": 755
+ },
+ {
+ "epoch": 0.3574468085106383,
+ "grad_norm": 2.8215200901031494,
+ "learning_rate": 4.966580802877805e-06,
+ "loss": 0.6217,
+ "step": 756
+ },
+ {
+ "epoch": 0.357919621749409,
+ "grad_norm": 2.7348055839538574,
+ "learning_rate": 4.966479065900949e-06,
+ "loss": 0.6194,
+ "step": 757
+ },
+ {
+ "epoch": 0.3583924349881797,
+ "grad_norm": 3.2347466945648193,
+ "learning_rate": 4.966377175347257e-06,
+ "loss": 0.6377,
+ "step": 758
+ },
+ {
+ "epoch": 0.3588652482269504,
+ "grad_norm": 3.311845302581787,
+ "learning_rate": 4.966275131223072e-06,
+ "loss": 0.6234,
+ "step": 759
+ },
+ {
+ "epoch": 0.35933806146572106,
+ "grad_norm": 3.0384368896484375,
+ "learning_rate": 4.96617293353475e-06,
+ "loss": 0.609,
+ "step": 760
+ },
+ {
+ "epoch": 0.35981087470449175,
+ "grad_norm": 3.516854763031006,
+ "learning_rate": 4.966070582288653e-06,
+ "loss": 0.6627,
+ "step": 761
+ },
+ {
+ "epoch": 0.36028368794326243,
+ "grad_norm": 3.2425215244293213,
+ "learning_rate": 4.9659680774911534e-06,
+ "loss": 0.7355,
+ "step": 762
+ },
+ {
+ "epoch": 0.3607565011820331,
+ "grad_norm": 3.2665750980377197,
+ "learning_rate": 4.965865419148636e-06,
+ "loss": 0.6787,
+ "step": 763
+ },
+ {
+ "epoch": 0.3612293144208038,
+ "grad_norm": 2.729428291320801,
+ "learning_rate": 4.96576260726749e-06,
+ "loss": 0.6272,
+ "step": 764
+ },
+ {
+ "epoch": 0.3617021276595745,
+ "grad_norm": 3.299969434738159,
+ "learning_rate": 4.965659641854119e-06,
+ "loss": 0.6552,
+ "step": 765
+ },
+ {
+ "epoch": 0.3621749408983452,
+ "grad_norm": 2.7090916633605957,
+ "learning_rate": 4.965556522914934e-06,
+ "loss": 0.6661,
+ "step": 766
+ },
+ {
+ "epoch": 0.36264775413711586,
+ "grad_norm": 2.488846778869629,
+ "learning_rate": 4.965453250456355e-06,
+ "loss": 0.5821,
+ "step": 767
+ },
+ {
+ "epoch": 0.36312056737588655,
+ "grad_norm": 2.5267233848571777,
+ "learning_rate": 4.965349824484813e-06,
+ "loss": 0.5593,
+ "step": 768
+ },
+ {
+ "epoch": 0.36359338061465724,
+ "grad_norm": 3.0646679401397705,
+ "learning_rate": 4.965246245006748e-06,
+ "loss": 0.6341,
+ "step": 769
+ },
+ {
+ "epoch": 0.3640661938534279,
+ "grad_norm": 2.9877712726593018,
+ "learning_rate": 4.965142512028609e-06,
+ "loss": 0.7202,
+ "step": 770
+ },
+ {
+ "epoch": 0.3645390070921986,
+ "grad_norm": 3.7494113445281982,
+ "learning_rate": 4.965038625556854e-06,
+ "loss": 0.7643,
+ "step": 771
+ },
+ {
+ "epoch": 0.3650118203309693,
+ "grad_norm": 2.8382890224456787,
+ "learning_rate": 4.964934585597954e-06,
+ "loss": 0.6522,
+ "step": 772
+ },
+ {
+ "epoch": 0.3654846335697399,
+ "grad_norm": 3.091655731201172,
+ "learning_rate": 4.9648303921583854e-06,
+ "loss": 0.7117,
+ "step": 773
+ },
+ {
+ "epoch": 0.3659574468085106,
+ "grad_norm": 3.0608325004577637,
+ "learning_rate": 4.964726045244635e-06,
+ "loss": 0.6538,
+ "step": 774
+ },
+ {
+ "epoch": 0.3664302600472813,
+ "grad_norm": 2.8492867946624756,
+ "learning_rate": 4.964621544863203e-06,
+ "loss": 0.6079,
+ "step": 775
+ },
+ {
+ "epoch": 0.366903073286052,
+ "grad_norm": 3.0669894218444824,
+ "learning_rate": 4.964516891020594e-06,
+ "loss": 0.6223,
+ "step": 776
+ },
+ {
+ "epoch": 0.36737588652482267,
+ "grad_norm": 3.089984893798828,
+ "learning_rate": 4.964412083723325e-06,
+ "loss": 0.671,
+ "step": 777
+ },
+ {
+ "epoch": 0.36784869976359336,
+ "grad_norm": 2.905242443084717,
+ "learning_rate": 4.964307122977921e-06,
+ "loss": 0.62,
+ "step": 778
+ },
+ {
+ "epoch": 0.36832151300236404,
+ "grad_norm": 3.954436779022217,
+ "learning_rate": 4.964202008790918e-06,
+ "loss": 0.6535,
+ "step": 779
+ },
+ {
+ "epoch": 0.36879432624113473,
+ "grad_norm": 2.6026058197021484,
+ "learning_rate": 4.9640967411688615e-06,
+ "loss": 0.5865,
+ "step": 780
+ },
+ {
+ "epoch": 0.3692671394799054,
+ "grad_norm": 2.9876346588134766,
+ "learning_rate": 4.963991320118306e-06,
+ "loss": 0.6698,
+ "step": 781
+ },
+ {
+ "epoch": 0.3697399527186761,
+ "grad_norm": 2.9411263465881348,
+ "learning_rate": 4.963885745645815e-06,
+ "loss": 0.6173,
+ "step": 782
+ },
+ {
+ "epoch": 0.3702127659574468,
+ "grad_norm": 2.5679805278778076,
+ "learning_rate": 4.963780017757962e-06,
+ "loss": 0.6285,
+ "step": 783
+ },
+ {
+ "epoch": 0.3706855791962175,
+ "grad_norm": 3.3100640773773193,
+ "learning_rate": 4.963674136461332e-06,
+ "loss": 0.5968,
+ "step": 784
+ },
+ {
+ "epoch": 0.37115839243498816,
+ "grad_norm": 3.1293699741363525,
+ "learning_rate": 4.963568101762515e-06,
+ "loss": 0.697,
+ "step": 785
+ },
+ {
+ "epoch": 0.37163120567375885,
+ "grad_norm": 3.043853759765625,
+ "learning_rate": 4.963461913668115e-06,
+ "loss": 0.5881,
+ "step": 786
+ },
+ {
+ "epoch": 0.37210401891252953,
+ "grad_norm": 3.07351016998291,
+ "learning_rate": 4.963355572184744e-06,
+ "loss": 0.6307,
+ "step": 787
+ },
+ {
+ "epoch": 0.3725768321513002,
+ "grad_norm": 2.7381317615509033,
+ "learning_rate": 4.9632490773190225e-06,
+ "loss": 0.716,
+ "step": 788
+ },
+ {
+ "epoch": 0.3730496453900709,
+ "grad_norm": 2.892221450805664,
+ "learning_rate": 4.963142429077582e-06,
+ "loss": 0.6867,
+ "step": 789
+ },
+ {
+ "epoch": 0.3735224586288416,
+ "grad_norm": 3.133122205734253,
+ "learning_rate": 4.963035627467064e-06,
+ "loss": 0.659,
+ "step": 790
+ },
+ {
+ "epoch": 0.3739952718676123,
+ "grad_norm": 3.032599925994873,
+ "learning_rate": 4.962928672494116e-06,
+ "loss": 0.6848,
+ "step": 791
+ },
+ {
+ "epoch": 0.37446808510638296,
+ "grad_norm": 3.0076355934143066,
+ "learning_rate": 4.9628215641654e-06,
+ "loss": 0.6549,
+ "step": 792
+ },
+ {
+ "epoch": 0.37494089834515365,
+ "grad_norm": 2.8904454708099365,
+ "learning_rate": 4.962714302487585e-06,
+ "loss": 0.6484,
+ "step": 793
+ },
+ {
+ "epoch": 0.37541371158392434,
+ "grad_norm": 2.881364107131958,
+ "learning_rate": 4.9626068874673486e-06,
+ "loss": 0.721,
+ "step": 794
+ },
+ {
+ "epoch": 0.375886524822695,
+ "grad_norm": 3.11668062210083,
+ "learning_rate": 4.962499319111379e-06,
+ "loss": 0.7824,
+ "step": 795
+ },
+ {
+ "epoch": 0.3763593380614657,
+ "grad_norm": 2.9201436042785645,
+ "learning_rate": 4.962391597426374e-06,
+ "loss": 0.6911,
+ "step": 796
+ },
+ {
+ "epoch": 0.3768321513002364,
+ "grad_norm": 2.926598072052002,
+ "learning_rate": 4.962283722419043e-06,
+ "loss": 0.6715,
+ "step": 797
+ },
+ {
+ "epoch": 0.3773049645390071,
+ "grad_norm": 2.7267675399780273,
+ "learning_rate": 4.962175694096101e-06,
+ "loss": 0.6111,
+ "step": 798
+ },
+ {
+ "epoch": 0.37777777777777777,
+ "grad_norm": 3.194031000137329,
+ "learning_rate": 4.962067512464275e-06,
+ "loss": 0.6558,
+ "step": 799
+ },
+ {
+ "epoch": 0.37825059101654845,
+ "grad_norm": 2.6249136924743652,
+ "learning_rate": 4.9619591775303e-06,
+ "loss": 0.6166,
+ "step": 800
+ },
+ {
+ "epoch": 0.37872340425531914,
+ "grad_norm": 2.6356167793273926,
+ "learning_rate": 4.961850689300923e-06,
+ "loss": 0.6112,
+ "step": 801
+ },
+ {
+ "epoch": 0.3791962174940898,
+ "grad_norm": 3.030724287033081,
+ "learning_rate": 4.961742047782898e-06,
+ "loss": 0.6511,
+ "step": 802
+ },
+ {
+ "epoch": 0.3796690307328605,
+ "grad_norm": 3.4987757205963135,
+ "learning_rate": 4.96163325298299e-06,
+ "loss": 0.5888,
+ "step": 803
+ },
+ {
+ "epoch": 0.3801418439716312,
+ "grad_norm": 3.0371780395507812,
+ "learning_rate": 4.961524304907974e-06,
+ "loss": 0.6385,
+ "step": 804
+ },
+ {
+ "epoch": 0.3806146572104019,
+ "grad_norm": 3.302570104598999,
+ "learning_rate": 4.961415203564632e-06,
+ "loss": 0.6515,
+ "step": 805
+ },
+ {
+ "epoch": 0.38108747044917257,
+ "grad_norm": 2.7597038745880127,
+ "learning_rate": 4.961305948959759e-06,
+ "loss": 0.6126,
+ "step": 806
+ },
+ {
+ "epoch": 0.38156028368794326,
+ "grad_norm": 2.789811849594116,
+ "learning_rate": 4.9611965411001575e-06,
+ "loss": 0.6601,
+ "step": 807
+ },
+ {
+ "epoch": 0.38203309692671394,
+ "grad_norm": 3.0403921604156494,
+ "learning_rate": 4.961086979992639e-06,
+ "loss": 0.6947,
+ "step": 808
+ },
+ {
+ "epoch": 0.38250591016548463,
+ "grad_norm": 3.2139980792999268,
+ "learning_rate": 4.960977265644026e-06,
+ "loss": 0.6876,
+ "step": 809
+ },
+ {
+ "epoch": 0.3829787234042553,
+ "grad_norm": 2.918515205383301,
+ "learning_rate": 4.960867398061149e-06,
+ "loss": 0.5997,
+ "step": 810
+ },
+ {
+ "epoch": 0.383451536643026,
+ "grad_norm": 3.197636604309082,
+ "learning_rate": 4.9607573772508495e-06,
+ "loss": 0.5754,
+ "step": 811
+ },
+ {
+ "epoch": 0.3839243498817967,
+ "grad_norm": 2.8848466873168945,
+ "learning_rate": 4.960647203219979e-06,
+ "loss": 0.6424,
+ "step": 812
+ },
+ {
+ "epoch": 0.3843971631205674,
+ "grad_norm": 3.4810187816619873,
+ "learning_rate": 4.960536875975397e-06,
+ "loss": 0.6851,
+ "step": 813
+ },
+ {
+ "epoch": 0.38486997635933806,
+ "grad_norm": 3.713934898376465,
+ "learning_rate": 4.960426395523972e-06,
+ "loss": 0.6122,
+ "step": 814
+ },
+ {
+ "epoch": 0.38534278959810875,
+ "grad_norm": 2.862600803375244,
+ "learning_rate": 4.960315761872585e-06,
+ "loss": 0.6493,
+ "step": 815
+ },
+ {
+ "epoch": 0.38581560283687943,
+ "grad_norm": 3.133882522583008,
+ "learning_rate": 4.960204975028123e-06,
+ "loss": 0.7535,
+ "step": 816
+ },
+ {
+ "epoch": 0.3862884160756501,
+ "grad_norm": 3.1526732444763184,
+ "learning_rate": 4.960094034997485e-06,
+ "loss": 0.6512,
+ "step": 817
+ },
+ {
+ "epoch": 0.3867612293144208,
+ "grad_norm": 2.7213544845581055,
+ "learning_rate": 4.959982941787579e-06,
+ "loss": 0.6121,
+ "step": 818
+ },
+ {
+ "epoch": 0.3872340425531915,
+ "grad_norm": 3.4935851097106934,
+ "learning_rate": 4.9598716954053214e-06,
+ "loss": 0.7852,
+ "step": 819
+ },
+ {
+ "epoch": 0.3877068557919622,
+ "grad_norm": 2.691016435623169,
+ "learning_rate": 4.9597602958576395e-06,
+ "loss": 0.6861,
+ "step": 820
+ },
+ {
+ "epoch": 0.38817966903073287,
+ "grad_norm": 2.8621015548706055,
+ "learning_rate": 4.959648743151469e-06,
+ "loss": 0.6262,
+ "step": 821
+ },
+ {
+ "epoch": 0.38865248226950355,
+ "grad_norm": 3.3887462615966797,
+ "learning_rate": 4.959537037293758e-06,
+ "loss": 0.7103,
+ "step": 822
+ },
+ {
+ "epoch": 0.38912529550827424,
+ "grad_norm": 2.7565438747406006,
+ "learning_rate": 4.95942517829146e-06,
+ "loss": 0.6471,
+ "step": 823
+ },
+ {
+ "epoch": 0.3895981087470449,
+ "grad_norm": 2.7920358180999756,
+ "learning_rate": 4.959313166151541e-06,
+ "loss": 0.6239,
+ "step": 824
+ },
+ {
+ "epoch": 0.3900709219858156,
+ "grad_norm": 3.18904185295105,
+ "learning_rate": 4.959201000880973e-06,
+ "loss": 0.7461,
+ "step": 825
+ },
+ {
+ "epoch": 0.3905437352245863,
+ "grad_norm": 2.727872371673584,
+ "learning_rate": 4.959088682486743e-06,
+ "loss": 0.6333,
+ "step": 826
+ },
+ {
+ "epoch": 0.391016548463357,
+ "grad_norm": 2.906378746032715,
+ "learning_rate": 4.958976210975844e-06,
+ "loss": 0.7547,
+ "step": 827
+ },
+ {
+ "epoch": 0.39148936170212767,
+ "grad_norm": 2.96482515335083,
+ "learning_rate": 4.958863586355278e-06,
+ "loss": 0.6312,
+ "step": 828
+ },
+ {
+ "epoch": 0.39196217494089836,
+ "grad_norm": 3.2890889644622803,
+ "learning_rate": 4.958750808632059e-06,
+ "loss": 0.6943,
+ "step": 829
+ },
+ {
+ "epoch": 0.39243498817966904,
+ "grad_norm": 2.7004311084747314,
+ "learning_rate": 4.958637877813207e-06,
+ "loss": 0.5918,
+ "step": 830
+ },
+ {
+ "epoch": 0.39290780141843973,
+ "grad_norm": 2.7487950325012207,
+ "learning_rate": 4.9585247939057566e-06,
+ "loss": 0.6201,
+ "step": 831
+ },
+ {
+ "epoch": 0.3933806146572104,
+ "grad_norm": 2.7873897552490234,
+ "learning_rate": 4.958411556916747e-06,
+ "loss": 0.6268,
+ "step": 832
+ },
+ {
+ "epoch": 0.3938534278959811,
+ "grad_norm": 2.8501343727111816,
+ "learning_rate": 4.958298166853229e-06,
+ "loss": 0.7119,
+ "step": 833
+ },
+ {
+ "epoch": 0.3943262411347518,
+ "grad_norm": 3.0391547679901123,
+ "learning_rate": 4.958184623722265e-06,
+ "loss": 0.6375,
+ "step": 834
+ },
+ {
+ "epoch": 0.3947990543735225,
+ "grad_norm": 2.850520133972168,
+ "learning_rate": 4.958070927530922e-06,
+ "loss": 0.5962,
+ "step": 835
+ },
+ {
+ "epoch": 0.39527186761229316,
+ "grad_norm": 3.351914644241333,
+ "learning_rate": 4.957957078286281e-06,
+ "loss": 0.7247,
+ "step": 836
+ },
+ {
+ "epoch": 0.39574468085106385,
+ "grad_norm": 2.9559543132781982,
+ "learning_rate": 4.957843075995431e-06,
+ "loss": 0.6571,
+ "step": 837
+ },
+ {
+ "epoch": 0.39621749408983453,
+ "grad_norm": 3.225785255432129,
+ "learning_rate": 4.95772892066547e-06,
+ "loss": 0.7074,
+ "step": 838
+ },
+ {
+ "epoch": 0.3966903073286052,
+ "grad_norm": 2.7842373847961426,
+ "learning_rate": 4.957614612303505e-06,
+ "loss": 0.6469,
+ "step": 839
+ },
+ {
+ "epoch": 0.3971631205673759,
+ "grad_norm": 4.249724864959717,
+ "learning_rate": 4.957500150916655e-06,
+ "loss": 0.741,
+ "step": 840
+ },
+ {
+ "epoch": 0.3976359338061466,
+ "grad_norm": 3.138221263885498,
+ "learning_rate": 4.957385536512046e-06,
+ "loss": 0.6676,
+ "step": 841
+ },
+ {
+ "epoch": 0.3981087470449173,
+ "grad_norm": 3.456423759460449,
+ "learning_rate": 4.957270769096816e-06,
+ "loss": 0.6877,
+ "step": 842
+ },
+ {
+ "epoch": 0.39858156028368796,
+ "grad_norm": 2.8676278591156006,
+ "learning_rate": 4.957155848678109e-06,
+ "loss": 0.5986,
+ "step": 843
+ },
+ {
+ "epoch": 0.39905437352245865,
+ "grad_norm": 2.705324411392212,
+ "learning_rate": 4.957040775263082e-06,
+ "loss": 0.6356,
+ "step": 844
+ },
+ {
+ "epoch": 0.39952718676122934,
+ "grad_norm": 3.0767486095428467,
+ "learning_rate": 4.9569255488589e-06,
+ "loss": 0.6844,
+ "step": 845
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 2.7787704467773438,
+ "learning_rate": 4.956810169472736e-06,
+ "loss": 0.6641,
+ "step": 846
+ },
+ {
+ "epoch": 0.4004728132387707,
+ "grad_norm": 2.584277868270874,
+ "learning_rate": 4.956694637111777e-06,
+ "loss": 0.6256,
+ "step": 847
+ },
+ {
+ "epoch": 0.4009456264775414,
+ "grad_norm": 2.751641273498535,
+ "learning_rate": 4.956578951783215e-06,
+ "loss": 0.5954,
+ "step": 848
+ },
+ {
+ "epoch": 0.4014184397163121,
+ "grad_norm": 3.0181658267974854,
+ "learning_rate": 4.956463113494253e-06,
+ "loss": 0.6569,
+ "step": 849
+ },
+ {
+ "epoch": 0.40189125295508277,
+ "grad_norm": 3.0933220386505127,
+ "learning_rate": 4.956347122252104e-06,
+ "loss": 0.6248,
+ "step": 850
+ },
+ {
+ "epoch": 0.40236406619385345,
+ "grad_norm": 3.3767428398132324,
+ "learning_rate": 4.956230978063991e-06,
+ "loss": 0.719,
+ "step": 851
+ },
+ {
+ "epoch": 0.40283687943262414,
+ "grad_norm": 3.7666573524475098,
+ "learning_rate": 4.956114680937145e-06,
+ "loss": 0.6467,
+ "step": 852
+ },
+ {
+ "epoch": 0.4033096926713948,
+ "grad_norm": 2.9836843013763428,
+ "learning_rate": 4.955998230878808e-06,
+ "loss": 0.6993,
+ "step": 853
+ },
+ {
+ "epoch": 0.4037825059101655,
+ "grad_norm": 2.981497049331665,
+ "learning_rate": 4.955881627896229e-06,
+ "loss": 0.6578,
+ "step": 854
+ },
+ {
+ "epoch": 0.40425531914893614,
+ "grad_norm": 3.1369056701660156,
+ "learning_rate": 4.955764871996672e-06,
+ "loss": 0.6763,
+ "step": 855
+ },
+ {
+ "epoch": 0.40472813238770683,
+ "grad_norm": 2.7675817012786865,
+ "learning_rate": 4.9556479631874036e-06,
+ "loss": 0.6488,
+ "step": 856
+ },
+ {
+ "epoch": 0.4052009456264775,
+ "grad_norm": 3.035334825515747,
+ "learning_rate": 4.9555309014757034e-06,
+ "loss": 0.7076,
+ "step": 857
+ },
+ {
+ "epoch": 0.4056737588652482,
+ "grad_norm": 3.493704319000244,
+ "learning_rate": 4.955413686868862e-06,
+ "loss": 0.6773,
+ "step": 858
+ },
+ {
+ "epoch": 0.4061465721040189,
+ "grad_norm": 3.245487928390503,
+ "learning_rate": 4.9552963193741765e-06,
+ "loss": 0.6915,
+ "step": 859
+ },
+ {
+ "epoch": 0.4066193853427896,
+ "grad_norm": 3.189969539642334,
+ "learning_rate": 4.955178798998956e-06,
+ "loss": 0.7318,
+ "step": 860
+ },
+ {
+ "epoch": 0.40709219858156026,
+ "grad_norm": 2.7987146377563477,
+ "learning_rate": 4.955061125750517e-06,
+ "loss": 0.6162,
+ "step": 861
+ },
+ {
+ "epoch": 0.40756501182033095,
+ "grad_norm": 3.020118474960327,
+ "learning_rate": 4.954943299636187e-06,
+ "loss": 0.6678,
+ "step": 862
+ },
+ {
+ "epoch": 0.40803782505910163,
+ "grad_norm": 2.715463876724243,
+ "learning_rate": 4.954825320663302e-06,
+ "loss": 0.668,
+ "step": 863
+ },
+ {
+ "epoch": 0.4085106382978723,
+ "grad_norm": 2.595050096511841,
+ "learning_rate": 4.9547071888392085e-06,
+ "loss": 0.6557,
+ "step": 864
+ },
+ {
+ "epoch": 0.408983451536643,
+ "grad_norm": 3.131596088409424,
+ "learning_rate": 4.954588904171261e-06,
+ "loss": 0.6548,
+ "step": 865
+ },
+ {
+ "epoch": 0.4094562647754137,
+ "grad_norm": 2.5742313861846924,
+ "learning_rate": 4.954470466666827e-06,
+ "loss": 0.6592,
+ "step": 866
+ },
+ {
+ "epoch": 0.4099290780141844,
+ "grad_norm": 2.8612802028656006,
+ "learning_rate": 4.9543518763332785e-06,
+ "loss": 0.5391,
+ "step": 867
+ },
+ {
+ "epoch": 0.41040189125295506,
+ "grad_norm": 2.8973186016082764,
+ "learning_rate": 4.954233133178001e-06,
+ "loss": 0.6649,
+ "step": 868
+ },
+ {
+ "epoch": 0.41087470449172575,
+ "grad_norm": 2.802525043487549,
+ "learning_rate": 4.954114237208388e-06,
+ "loss": 0.6212,
+ "step": 869
+ },
+ {
+ "epoch": 0.41134751773049644,
+ "grad_norm": 2.5919506549835205,
+ "learning_rate": 4.953995188431843e-06,
+ "loss": 0.6596,
+ "step": 870
+ },
+ {
+ "epoch": 0.4118203309692671,
+ "grad_norm": 3.139169454574585,
+ "learning_rate": 4.953875986855777e-06,
+ "loss": 0.6799,
+ "step": 871
+ },
+ {
+ "epoch": 0.4122931442080378,
+ "grad_norm": 3.99727725982666,
+ "learning_rate": 4.953756632487614e-06,
+ "loss": 0.6519,
+ "step": 872
+ },
+ {
+ "epoch": 0.4127659574468085,
+ "grad_norm": 3.238706350326538,
+ "learning_rate": 4.953637125334784e-06,
+ "loss": 0.7361,
+ "step": 873
+ },
+ {
+ "epoch": 0.4132387706855792,
+ "grad_norm": 2.780019998550415,
+ "learning_rate": 4.9535174654047295e-06,
+ "loss": 0.6406,
+ "step": 874
+ },
+ {
+ "epoch": 0.41371158392434987,
+ "grad_norm": 2.7629551887512207,
+ "learning_rate": 4.953397652704901e-06,
+ "loss": 0.6131,
+ "step": 875
+ },
+ {
+ "epoch": 0.41418439716312055,
+ "grad_norm": 2.8008246421813965,
+ "learning_rate": 4.9532776872427585e-06,
+ "loss": 0.6464,
+ "step": 876
+ },
+ {
+ "epoch": 0.41465721040189124,
+ "grad_norm": 3.0970115661621094,
+ "learning_rate": 4.953157569025772e-06,
+ "loss": 0.7066,
+ "step": 877
+ },
+ {
+ "epoch": 0.4151300236406619,
+ "grad_norm": 2.8375589847564697,
+ "learning_rate": 4.9530372980614195e-06,
+ "loss": 0.6551,
+ "step": 878
+ },
+ {
+ "epoch": 0.4156028368794326,
+ "grad_norm": 2.718843936920166,
+ "learning_rate": 4.952916874357191e-06,
+ "loss": 0.5947,
+ "step": 879
+ },
+ {
+ "epoch": 0.4160756501182033,
+ "grad_norm": 2.7104697227478027,
+ "learning_rate": 4.952796297920585e-06,
+ "loss": 0.6708,
+ "step": 880
+ },
+ {
+ "epoch": 0.416548463356974,
+ "grad_norm": 2.8223445415496826,
+ "learning_rate": 4.952675568759108e-06,
+ "loss": 0.6214,
+ "step": 881
+ },
+ {
+ "epoch": 0.41702127659574467,
+ "grad_norm": 2.6598153114318848,
+ "learning_rate": 4.952554686880279e-06,
+ "loss": 0.6116,
+ "step": 882
+ },
+ {
+ "epoch": 0.41749408983451536,
+ "grad_norm": 2.8639824390411377,
+ "learning_rate": 4.952433652291623e-06,
+ "loss": 0.5971,
+ "step": 883
+ },
+ {
+ "epoch": 0.41796690307328604,
+ "grad_norm": 2.9578304290771484,
+ "learning_rate": 4.952312465000677e-06,
+ "loss": 0.6785,
+ "step": 884
+ },
+ {
+ "epoch": 0.41843971631205673,
+ "grad_norm": 2.872144937515259,
+ "learning_rate": 4.952191125014987e-06,
+ "loss": 0.6772,
+ "step": 885
+ },
+ {
+ "epoch": 0.4189125295508274,
+ "grad_norm": 2.7513675689697266,
+ "learning_rate": 4.952069632342108e-06,
+ "loss": 0.702,
+ "step": 886
+ },
+ {
+ "epoch": 0.4193853427895981,
+ "grad_norm": 2.9275078773498535,
+ "learning_rate": 4.951947986989606e-06,
+ "loss": 0.589,
+ "step": 887
+ },
+ {
+ "epoch": 0.4198581560283688,
+ "grad_norm": 2.740549325942993,
+ "learning_rate": 4.951826188965053e-06,
+ "loss": 0.5942,
+ "step": 888
+ },
+ {
+ "epoch": 0.4203309692671395,
+ "grad_norm": 2.92452073097229,
+ "learning_rate": 4.951704238276035e-06,
+ "loss": 0.6819,
+ "step": 889
+ },
+ {
+ "epoch": 0.42080378250591016,
+ "grad_norm": 2.842491865158081,
+ "learning_rate": 4.951582134930144e-06,
+ "loss": 0.6304,
+ "step": 890
+ },
+ {
+ "epoch": 0.42127659574468085,
+ "grad_norm": 2.613478422164917,
+ "learning_rate": 4.951459878934983e-06,
+ "loss": 0.6912,
+ "step": 891
+ },
+ {
+ "epoch": 0.42174940898345153,
+ "grad_norm": 3.2408607006073,
+ "learning_rate": 4.951337470298165e-06,
+ "loss": 0.6755,
+ "step": 892
+ },
+ {
+ "epoch": 0.4222222222222222,
+ "grad_norm": 3.1022439002990723,
+ "learning_rate": 4.9512149090273125e-06,
+ "loss": 0.6138,
+ "step": 893
+ },
+ {
+ "epoch": 0.4226950354609929,
+ "grad_norm": 2.6418895721435547,
+ "learning_rate": 4.951092195130055e-06,
+ "loss": 0.639,
+ "step": 894
+ },
+ {
+ "epoch": 0.4231678486997636,
+ "grad_norm": 3.010744333267212,
+ "learning_rate": 4.950969328614035e-06,
+ "loss": 0.7102,
+ "step": 895
+ },
+ {
+ "epoch": 0.4236406619385343,
+ "grad_norm": 2.673292636871338,
+ "learning_rate": 4.950846309486901e-06,
+ "loss": 0.5676,
+ "step": 896
+ },
+ {
+ "epoch": 0.42411347517730497,
+ "grad_norm": 3.6974737644195557,
+ "learning_rate": 4.950723137756314e-06,
+ "loss": 0.5722,
+ "step": 897
+ },
+ {
+ "epoch": 0.42458628841607565,
+ "grad_norm": 3.69028902053833,
+ "learning_rate": 4.9505998134299435e-06,
+ "loss": 0.6337,
+ "step": 898
+ },
+ {
+ "epoch": 0.42505910165484634,
+ "grad_norm": 3.2136125564575195,
+ "learning_rate": 4.950476336515469e-06,
+ "loss": 0.6469,
+ "step": 899
+ },
+ {
+ "epoch": 0.425531914893617,
+ "grad_norm": 2.7396016120910645,
+ "learning_rate": 4.950352707020577e-06,
+ "loss": 0.6656,
+ "step": 900
+ },
+ {
+ "epoch": 0.4260047281323877,
+ "grad_norm": 2.825416088104248,
+ "learning_rate": 4.950228924952967e-06,
+ "loss": 0.6298,
+ "step": 901
+ },
+ {
+ "epoch": 0.4264775413711584,
+ "grad_norm": 3.401658535003662,
+ "learning_rate": 4.950104990320345e-06,
+ "loss": 0.778,
+ "step": 902
+ },
+ {
+ "epoch": 0.4269503546099291,
+ "grad_norm": 2.7002272605895996,
+ "learning_rate": 4.9499809031304294e-06,
+ "loss": 0.6536,
+ "step": 903
+ },
+ {
+ "epoch": 0.42742316784869977,
+ "grad_norm": 2.62386417388916,
+ "learning_rate": 4.949856663390945e-06,
+ "loss": 0.6629,
+ "step": 904
+ },
+ {
+ "epoch": 0.42789598108747046,
+ "grad_norm": 2.584247589111328,
+ "learning_rate": 4.94973227110963e-06,
+ "loss": 0.5813,
+ "step": 905
+ },
+ {
+ "epoch": 0.42836879432624114,
+ "grad_norm": 3.4365768432617188,
+ "learning_rate": 4.9496077262942265e-06,
+ "loss": 0.7648,
+ "step": 906
+ },
+ {
+ "epoch": 0.42884160756501183,
+ "grad_norm": 2.8993639945983887,
+ "learning_rate": 4.949483028952492e-06,
+ "loss": 0.6696,
+ "step": 907
+ },
+ {
+ "epoch": 0.4293144208037825,
+ "grad_norm": 2.922809362411499,
+ "learning_rate": 4.94935817909219e-06,
+ "loss": 0.6892,
+ "step": 908
+ },
+ {
+ "epoch": 0.4297872340425532,
+ "grad_norm": 2.85478138923645,
+ "learning_rate": 4.9492331767210944e-06,
+ "loss": 0.536,
+ "step": 909
+ },
+ {
+ "epoch": 0.4302600472813239,
+ "grad_norm": 2.8639259338378906,
+ "learning_rate": 4.949108021846988e-06,
+ "loss": 0.634,
+ "step": 910
+ },
+ {
+ "epoch": 0.4307328605200946,
+ "grad_norm": 3.0533697605133057,
+ "learning_rate": 4.948982714477664e-06,
+ "loss": 0.6318,
+ "step": 911
+ },
+ {
+ "epoch": 0.43120567375886526,
+ "grad_norm": 2.331674814224243,
+ "learning_rate": 4.9488572546209255e-06,
+ "loss": 0.6562,
+ "step": 912
+ },
+ {
+ "epoch": 0.43167848699763595,
+ "grad_norm": 3.0154623985290527,
+ "learning_rate": 4.9487316422845835e-06,
+ "loss": 0.6675,
+ "step": 913
+ },
+ {
+ "epoch": 0.43215130023640663,
+ "grad_norm": 2.7354514598846436,
+ "learning_rate": 4.948605877476459e-06,
+ "loss": 0.6012,
+ "step": 914
+ },
+ {
+ "epoch": 0.4326241134751773,
+ "grad_norm": 2.863736629486084,
+ "learning_rate": 4.948479960204383e-06,
+ "loss": 0.6062,
+ "step": 915
+ },
+ {
+ "epoch": 0.433096926713948,
+ "grad_norm": 3.01998233795166,
+ "learning_rate": 4.948353890476197e-06,
+ "loss": 0.6749,
+ "step": 916
+ },
+ {
+ "epoch": 0.4335697399527187,
+ "grad_norm": 2.7550456523895264,
+ "learning_rate": 4.94822766829975e-06,
+ "loss": 0.6507,
+ "step": 917
+ },
+ {
+ "epoch": 0.4340425531914894,
+ "grad_norm": 3.370572805404663,
+ "learning_rate": 4.948101293682901e-06,
+ "loss": 0.714,
+ "step": 918
+ },
+ {
+ "epoch": 0.43451536643026006,
+ "grad_norm": 2.9736790657043457,
+ "learning_rate": 4.947974766633519e-06,
+ "loss": 0.729,
+ "step": 919
+ },
+ {
+ "epoch": 0.43498817966903075,
+ "grad_norm": 3.1036548614501953,
+ "learning_rate": 4.947848087159483e-06,
+ "loss": 0.7547,
+ "step": 920
+ },
+ {
+ "epoch": 0.43546099290780144,
+ "grad_norm": 2.895094871520996,
+ "learning_rate": 4.947721255268679e-06,
+ "loss": 0.6089,
+ "step": 921
+ },
+ {
+ "epoch": 0.4359338061465721,
+ "grad_norm": 2.798476219177246,
+ "learning_rate": 4.947594270969005e-06,
+ "loss": 0.5432,
+ "step": 922
+ },
+ {
+ "epoch": 0.4364066193853428,
+ "grad_norm": 2.7675702571868896,
+ "learning_rate": 4.94746713426837e-06,
+ "loss": 0.5693,
+ "step": 923
+ },
+ {
+ "epoch": 0.4368794326241135,
+ "grad_norm": 2.6851553916931152,
+ "learning_rate": 4.947339845174687e-06,
+ "loss": 0.6503,
+ "step": 924
+ },
+ {
+ "epoch": 0.4373522458628842,
+ "grad_norm": 2.909635543823242,
+ "learning_rate": 4.947212403695883e-06,
+ "loss": 0.6494,
+ "step": 925
+ },
+ {
+ "epoch": 0.43782505910165487,
+ "grad_norm": 2.604526996612549,
+ "learning_rate": 4.947084809839894e-06,
+ "loss": 0.6349,
+ "step": 926
+ },
+ {
+ "epoch": 0.43829787234042555,
+ "grad_norm": 3.118149518966675,
+ "learning_rate": 4.946957063614664e-06,
+ "loss": 0.6219,
+ "step": 927
+ },
+ {
+ "epoch": 0.43877068557919624,
+ "grad_norm": 2.7452616691589355,
+ "learning_rate": 4.9468291650281465e-06,
+ "loss": 0.6096,
+ "step": 928
+ },
+ {
+ "epoch": 0.4392434988179669,
+ "grad_norm": 3.30098819732666,
+ "learning_rate": 4.946701114088307e-06,
+ "loss": 0.6277,
+ "step": 929
+ },
+ {
+ "epoch": 0.4397163120567376,
+ "grad_norm": 2.789482593536377,
+ "learning_rate": 4.946572910803116e-06,
+ "loss": 0.7,
+ "step": 930
+ },
+ {
+ "epoch": 0.4401891252955083,
+ "grad_norm": 2.7283935546875,
+ "learning_rate": 4.946444555180559e-06,
+ "loss": 0.5375,
+ "step": 931
+ },
+ {
+ "epoch": 0.440661938534279,
+ "grad_norm": 3.101304054260254,
+ "learning_rate": 4.946316047228627e-06,
+ "loss": 0.6131,
+ "step": 932
+ },
+ {
+ "epoch": 0.44113475177304967,
+ "grad_norm": 3.573908805847168,
+ "learning_rate": 4.946187386955321e-06,
+ "loss": 0.7073,
+ "step": 933
+ },
+ {
+ "epoch": 0.44160756501182036,
+ "grad_norm": 3.214979648590088,
+ "learning_rate": 4.946058574368653e-06,
+ "loss": 0.6508,
+ "step": 934
+ },
+ {
+ "epoch": 0.44208037825059104,
+ "grad_norm": 3.145082712173462,
+ "learning_rate": 4.945929609476643e-06,
+ "loss": 0.64,
+ "step": 935
+ },
+ {
+ "epoch": 0.4425531914893617,
+ "grad_norm": 2.991780996322632,
+ "learning_rate": 4.945800492287321e-06,
+ "loss": 0.6315,
+ "step": 936
+ },
+ {
+ "epoch": 0.44302600472813236,
+ "grad_norm": 3.2441139221191406,
+ "learning_rate": 4.945671222808727e-06,
+ "loss": 0.7144,
+ "step": 937
+ },
+ {
+ "epoch": 0.44349881796690305,
+ "grad_norm": 2.9397029876708984,
+ "learning_rate": 4.94554180104891e-06,
+ "loss": 0.6818,
+ "step": 938
+ },
+ {
+ "epoch": 0.44397163120567373,
+ "grad_norm": 3.2471461296081543,
+ "learning_rate": 4.945412227015929e-06,
+ "loss": 0.6921,
+ "step": 939
+ },
+ {
+ "epoch": 0.4444444444444444,
+ "grad_norm": 3.0882487297058105,
+ "learning_rate": 4.945282500717851e-06,
+ "loss": 0.718,
+ "step": 940
+ },
+ {
+ "epoch": 0.4449172576832151,
+ "grad_norm": 2.6035783290863037,
+ "learning_rate": 4.945152622162753e-06,
+ "loss": 0.621,
+ "step": 941
+ },
+ {
+ "epoch": 0.4453900709219858,
+ "grad_norm": 2.83659029006958,
+ "learning_rate": 4.945022591358724e-06,
+ "loss": 0.6403,
+ "step": 942
+ },
+ {
+ "epoch": 0.4458628841607565,
+ "grad_norm": 2.824463129043579,
+ "learning_rate": 4.944892408313859e-06,
+ "loss": 0.6594,
+ "step": 943
+ },
+ {
+ "epoch": 0.44633569739952716,
+ "grad_norm": 2.753735065460205,
+ "learning_rate": 4.9447620730362645e-06,
+ "loss": 0.6116,
+ "step": 944
+ },
+ {
+ "epoch": 0.44680851063829785,
+ "grad_norm": 3.0659725666046143,
+ "learning_rate": 4.944631585534056e-06,
+ "loss": 0.5983,
+ "step": 945
+ },
+ {
+ "epoch": 0.44728132387706854,
+ "grad_norm": 2.969113349914551,
+ "learning_rate": 4.944500945815357e-06,
+ "loss": 0.6859,
+ "step": 946
+ },
+ {
+ "epoch": 0.4477541371158392,
+ "grad_norm": 2.810303211212158,
+ "learning_rate": 4.944370153888303e-06,
+ "loss": 0.7025,
+ "step": 947
+ },
+ {
+ "epoch": 0.4482269503546099,
+ "grad_norm": 3.027721643447876,
+ "learning_rate": 4.944239209761038e-06,
+ "loss": 0.7268,
+ "step": 948
+ },
+ {
+ "epoch": 0.4486997635933806,
+ "grad_norm": 2.661503314971924,
+ "learning_rate": 4.944108113441716e-06,
+ "loss": 0.6702,
+ "step": 949
+ },
+ {
+ "epoch": 0.4491725768321513,
+ "grad_norm": 2.738591432571411,
+ "learning_rate": 4.943976864938498e-06,
+ "loss": 0.6728,
+ "step": 950
+ },
+ {
+ "epoch": 0.44964539007092197,
+ "grad_norm": 3.447505474090576,
+ "learning_rate": 4.943845464259557e-06,
+ "loss": 0.6586,
+ "step": 951
+ },
+ {
+ "epoch": 0.45011820330969265,
+ "grad_norm": 3.0968854427337646,
+ "learning_rate": 4.943713911413075e-06,
+ "loss": 0.7666,
+ "step": 952
+ },
+ {
+ "epoch": 0.45059101654846334,
+ "grad_norm": 2.4113779067993164,
+ "learning_rate": 4.943582206407244e-06,
+ "loss": 0.6173,
+ "step": 953
+ },
+ {
+ "epoch": 0.451063829787234,
+ "grad_norm": 2.6357979774475098,
+ "learning_rate": 4.943450349250263e-06,
+ "loss": 0.5589,
+ "step": 954
+ },
+ {
+ "epoch": 0.4515366430260047,
+ "grad_norm": 2.9182233810424805,
+ "learning_rate": 4.9433183399503425e-06,
+ "loss": 0.6252,
+ "step": 955
+ },
+ {
+ "epoch": 0.4520094562647754,
+ "grad_norm": 2.832740306854248,
+ "learning_rate": 4.943186178515703e-06,
+ "loss": 0.6882,
+ "step": 956
+ },
+ {
+ "epoch": 0.4524822695035461,
+ "grad_norm": 2.9508981704711914,
+ "learning_rate": 4.943053864954574e-06,
+ "loss": 0.5722,
+ "step": 957
+ },
+ {
+ "epoch": 0.4529550827423168,
+ "grad_norm": 3.044729471206665,
+ "learning_rate": 4.9429213992751925e-06,
+ "loss": 0.6772,
+ "step": 958
+ },
+ {
+ "epoch": 0.45342789598108746,
+ "grad_norm": 2.606003522872925,
+ "learning_rate": 4.9427887814858075e-06,
+ "loss": 0.6445,
+ "step": 959
+ },
+ {
+ "epoch": 0.45390070921985815,
+ "grad_norm": 2.4634225368499756,
+ "learning_rate": 4.942656011594676e-06,
+ "loss": 0.6151,
+ "step": 960
+ },
+ {
+ "epoch": 0.45437352245862883,
+ "grad_norm": 2.8872334957122803,
+ "learning_rate": 4.942523089610066e-06,
+ "loss": 0.6255,
+ "step": 961
+ },
+ {
+ "epoch": 0.4548463356973995,
+ "grad_norm": 2.870605707168579,
+ "learning_rate": 4.942390015540253e-06,
+ "loss": 0.7481,
+ "step": 962
+ },
+ {
+ "epoch": 0.4553191489361702,
+ "grad_norm": 2.952680826187134,
+ "learning_rate": 4.942256789393524e-06,
+ "loss": 0.5556,
+ "step": 963
+ },
+ {
+ "epoch": 0.4557919621749409,
+ "grad_norm": 2.623680353164673,
+ "learning_rate": 4.9421234111781725e-06,
+ "loss": 0.6115,
+ "step": 964
+ },
+ {
+ "epoch": 0.4562647754137116,
+ "grad_norm": 2.6933600902557373,
+ "learning_rate": 4.941989880902505e-06,
+ "loss": 0.6102,
+ "step": 965
+ },
+ {
+ "epoch": 0.45673758865248226,
+ "grad_norm": 2.6047189235687256,
+ "learning_rate": 4.941856198574836e-06,
+ "loss": 0.612,
+ "step": 966
+ },
+ {
+ "epoch": 0.45721040189125295,
+ "grad_norm": 2.779186725616455,
+ "learning_rate": 4.9417223642034885e-06,
+ "loss": 0.5424,
+ "step": 967
+ },
+ {
+ "epoch": 0.45768321513002364,
+ "grad_norm": 2.6177165508270264,
+ "learning_rate": 4.941588377796795e-06,
+ "loss": 0.4661,
+ "step": 968
+ },
+ {
+ "epoch": 0.4581560283687943,
+ "grad_norm": 2.959676742553711,
+ "learning_rate": 4.941454239363101e-06,
+ "loss": 0.6966,
+ "step": 969
+ },
+ {
+ "epoch": 0.458628841607565,
+ "grad_norm": 2.9788379669189453,
+ "learning_rate": 4.941319948910756e-06,
+ "loss": 0.6181,
+ "step": 970
+ },
+ {
+ "epoch": 0.4591016548463357,
+ "grad_norm": 4.642750263214111,
+ "learning_rate": 4.941185506448122e-06,
+ "loss": 0.5602,
+ "step": 971
+ },
+ {
+ "epoch": 0.4595744680851064,
+ "grad_norm": 2.793002128601074,
+ "learning_rate": 4.941050911983572e-06,
+ "loss": 0.602,
+ "step": 972
+ },
+ {
+ "epoch": 0.46004728132387707,
+ "grad_norm": 2.6833035945892334,
+ "learning_rate": 4.9409161655254845e-06,
+ "loss": 0.5549,
+ "step": 973
+ },
+ {
+ "epoch": 0.46052009456264775,
+ "grad_norm": 3.905032157897949,
+ "learning_rate": 4.94078126708225e-06,
+ "loss": 0.6335,
+ "step": 974
+ },
+ {
+ "epoch": 0.46099290780141844,
+ "grad_norm": 2.922609329223633,
+ "learning_rate": 4.94064621666227e-06,
+ "loss": 0.5839,
+ "step": 975
+ },
+ {
+ "epoch": 0.4614657210401891,
+ "grad_norm": 2.8277416229248047,
+ "learning_rate": 4.940511014273952e-06,
+ "loss": 0.629,
+ "step": 976
+ },
+ {
+ "epoch": 0.4619385342789598,
+ "grad_norm": 3.07511043548584,
+ "learning_rate": 4.940375659925714e-06,
+ "loss": 0.7058,
+ "step": 977
+ },
+ {
+ "epoch": 0.4624113475177305,
+ "grad_norm": 3.65043044090271,
+ "learning_rate": 4.940240153625984e-06,
+ "loss": 0.7174,
+ "step": 978
+ },
+ {
+ "epoch": 0.4628841607565012,
+ "grad_norm": 2.755167245864868,
+ "learning_rate": 4.9401044953832e-06,
+ "loss": 0.6548,
+ "step": 979
+ },
+ {
+ "epoch": 0.46335697399527187,
+ "grad_norm": 2.9881057739257812,
+ "learning_rate": 4.939968685205808e-06,
+ "loss": 0.6245,
+ "step": 980
+ },
+ {
+ "epoch": 0.46382978723404256,
+ "grad_norm": 2.9484212398529053,
+ "learning_rate": 4.939832723102266e-06,
+ "loss": 0.655,
+ "step": 981
+ },
+ {
+ "epoch": 0.46430260047281324,
+ "grad_norm": 2.898918628692627,
+ "learning_rate": 4.939696609081038e-06,
+ "loss": 0.6178,
+ "step": 982
+ },
+ {
+ "epoch": 0.46477541371158393,
+ "grad_norm": 2.7052435874938965,
+ "learning_rate": 4.9395603431506e-06,
+ "loss": 0.6393,
+ "step": 983
+ },
+ {
+ "epoch": 0.4652482269503546,
+ "grad_norm": 2.5610013008117676,
+ "learning_rate": 4.939423925319436e-06,
+ "loss": 0.4847,
+ "step": 984
+ },
+ {
+ "epoch": 0.4657210401891253,
+ "grad_norm": 3.229083299636841,
+ "learning_rate": 4.939287355596042e-06,
+ "loss": 0.6473,
+ "step": 985
+ },
+ {
+ "epoch": 0.466193853427896,
+ "grad_norm": 2.907097816467285,
+ "learning_rate": 4.9391506339889195e-06,
+ "loss": 0.652,
+ "step": 986
+ },
+ {
+ "epoch": 0.4666666666666667,
+ "grad_norm": 2.6929478645324707,
+ "learning_rate": 4.939013760506582e-06,
+ "loss": 0.6175,
+ "step": 987
+ },
+ {
+ "epoch": 0.46713947990543736,
+ "grad_norm": 3.414813280105591,
+ "learning_rate": 4.938876735157554e-06,
+ "loss": 0.7597,
+ "step": 988
+ },
+ {
+ "epoch": 0.46761229314420805,
+ "grad_norm": 3.297360420227051,
+ "learning_rate": 4.938739557950365e-06,
+ "loss": 0.6824,
+ "step": 989
+ },
+ {
+ "epoch": 0.46808510638297873,
+ "grad_norm": 3.083155393600464,
+ "learning_rate": 4.938602228893557e-06,
+ "loss": 0.6505,
+ "step": 990
+ },
+ {
+ "epoch": 0.4685579196217494,
+ "grad_norm": 2.9781153202056885,
+ "learning_rate": 4.938464747995681e-06,
+ "loss": 0.666,
+ "step": 991
+ },
+ {
+ "epoch": 0.4690307328605201,
+ "grad_norm": 3.1494534015655518,
+ "learning_rate": 4.9383271152652975e-06,
+ "loss": 0.6422,
+ "step": 992
+ },
+ {
+ "epoch": 0.4695035460992908,
+ "grad_norm": 2.547868490219116,
+ "learning_rate": 4.938189330710976e-06,
+ "loss": 0.5766,
+ "step": 993
+ },
+ {
+ "epoch": 0.4699763593380615,
+ "grad_norm": 2.684736967086792,
+ "learning_rate": 4.938051394341297e-06,
+ "loss": 0.6407,
+ "step": 994
+ },
+ {
+ "epoch": 0.47044917257683216,
+ "grad_norm": 2.9619693756103516,
+ "learning_rate": 4.937913306164847e-06,
+ "loss": 0.6936,
+ "step": 995
+ },
+ {
+ "epoch": 0.47092198581560285,
+ "grad_norm": 2.9698498249053955,
+ "learning_rate": 4.937775066190227e-06,
+ "loss": 0.6464,
+ "step": 996
+ },
+ {
+ "epoch": 0.47139479905437354,
+ "grad_norm": 3.121049642562866,
+ "learning_rate": 4.937636674426042e-06,
+ "loss": 0.6383,
+ "step": 997
+ },
+ {
+ "epoch": 0.4718676122931442,
+ "grad_norm": 3.113672971725464,
+ "learning_rate": 4.93749813088091e-06,
+ "loss": 0.6892,
+ "step": 998
+ },
+ {
+ "epoch": 0.4723404255319149,
+ "grad_norm": 3.126113176345825,
+ "learning_rate": 4.937359435563458e-06,
+ "loss": 0.6728,
+ "step": 999
+ },
+ {
+ "epoch": 0.4728132387706856,
+ "grad_norm": 3.353966236114502,
+ "learning_rate": 4.937220588482321e-06,
+ "loss": 0.6041,
+ "step": 1000
+ },
+ {
+ "epoch": 0.4732860520094563,
+ "grad_norm": 2.8860628604888916,
+ "learning_rate": 4.937081589646144e-06,
+ "loss": 0.6798,
+ "step": 1001
+ },
+ {
+ "epoch": 0.47375886524822697,
+ "grad_norm": 3.0510590076446533,
+ "learning_rate": 4.936942439063584e-06,
+ "loss": 0.5841,
+ "step": 1002
+ },
+ {
+ "epoch": 0.47423167848699765,
+ "grad_norm": 2.6998369693756104,
+ "learning_rate": 4.936803136743303e-06,
+ "loss": 0.6403,
+ "step": 1003
+ },
+ {
+ "epoch": 0.47470449172576834,
+ "grad_norm": 2.875347137451172,
+ "learning_rate": 4.9366636826939765e-06,
+ "loss": 0.5811,
+ "step": 1004
+ },
+ {
+ "epoch": 0.475177304964539,
+ "grad_norm": 2.9122262001037598,
+ "learning_rate": 4.936524076924287e-06,
+ "loss": 0.6852,
+ "step": 1005
+ },
+ {
+ "epoch": 0.4756501182033097,
+ "grad_norm": 2.5167057514190674,
+ "learning_rate": 4.9363843194429265e-06,
+ "loss": 0.5367,
+ "step": 1006
+ },
+ {
+ "epoch": 0.4761229314420804,
+ "grad_norm": 2.5745551586151123,
+ "learning_rate": 4.9362444102585985e-06,
+ "loss": 0.6241,
+ "step": 1007
+ },
+ {
+ "epoch": 0.4765957446808511,
+ "grad_norm": 2.5024216175079346,
+ "learning_rate": 4.9361043493800125e-06,
+ "loss": 0.6133,
+ "step": 1008
+ },
+ {
+ "epoch": 0.47706855791962177,
+ "grad_norm": 2.7281384468078613,
+ "learning_rate": 4.935964136815892e-06,
+ "loss": 0.6834,
+ "step": 1009
+ },
+ {
+ "epoch": 0.47754137115839246,
+ "grad_norm": 3.0118913650512695,
+ "learning_rate": 4.935823772574965e-06,
+ "loss": 0.6922,
+ "step": 1010
+ },
+ {
+ "epoch": 0.47801418439716314,
+ "grad_norm": 3.016216993331909,
+ "learning_rate": 4.935683256665973e-06,
+ "loss": 0.6653,
+ "step": 1011
+ },
+ {
+ "epoch": 0.47848699763593383,
+ "grad_norm": 2.9526784420013428,
+ "learning_rate": 4.9355425890976636e-06,
+ "loss": 0.6423,
+ "step": 1012
+ },
+ {
+ "epoch": 0.4789598108747045,
+ "grad_norm": 6.222797393798828,
+ "learning_rate": 4.9354017698787985e-06,
+ "loss": 0.5884,
+ "step": 1013
+ },
+ {
+ "epoch": 0.4794326241134752,
+ "grad_norm": 2.6553597450256348,
+ "learning_rate": 4.935260799018143e-06,
+ "loss": 0.6624,
+ "step": 1014
+ },
+ {
+ "epoch": 0.4799054373522459,
+ "grad_norm": 3.0942065715789795,
+ "learning_rate": 4.935119676524475e-06,
+ "loss": 0.6623,
+ "step": 1015
+ },
+ {
+ "epoch": 0.4803782505910166,
+ "grad_norm": 2.626359224319458,
+ "learning_rate": 4.934978402406585e-06,
+ "loss": 0.6195,
+ "step": 1016
+ },
+ {
+ "epoch": 0.4808510638297872,
+ "grad_norm": 2.7954699993133545,
+ "learning_rate": 4.934836976673265e-06,
+ "loss": 0.5545,
+ "step": 1017
+ },
+ {
+ "epoch": 0.4813238770685579,
+ "grad_norm": 2.913557291030884,
+ "learning_rate": 4.934695399333324e-06,
+ "loss": 0.6288,
+ "step": 1018
+ },
+ {
+ "epoch": 0.4817966903073286,
+ "grad_norm": 3.1043739318847656,
+ "learning_rate": 4.9345536703955746e-06,
+ "loss": 0.6771,
+ "step": 1019
+ },
+ {
+ "epoch": 0.48226950354609927,
+ "grad_norm": 2.789357900619507,
+ "learning_rate": 4.934411789868845e-06,
+ "loss": 0.6227,
+ "step": 1020
+ },
+ {
+ "epoch": 0.48274231678486995,
+ "grad_norm": 2.480609655380249,
+ "learning_rate": 4.934269757761967e-06,
+ "loss": 0.5779,
+ "step": 1021
+ },
+ {
+ "epoch": 0.48321513002364064,
+ "grad_norm": 2.7946252822875977,
+ "learning_rate": 4.934127574083785e-06,
+ "loss": 0.6166,
+ "step": 1022
+ },
+ {
+ "epoch": 0.4836879432624113,
+ "grad_norm": 3.0670509338378906,
+ "learning_rate": 4.933985238843153e-06,
+ "loss": 0.7766,
+ "step": 1023
+ },
+ {
+ "epoch": 0.484160756501182,
+ "grad_norm": 2.8567559719085693,
+ "learning_rate": 4.933842752048932e-06,
+ "loss": 0.5088,
+ "step": 1024
+ },
+ {
+ "epoch": 0.4846335697399527,
+ "grad_norm": 2.5674657821655273,
+ "learning_rate": 4.933700113709996e-06,
+ "loss": 0.6036,
+ "step": 1025
+ },
+ {
+ "epoch": 0.4851063829787234,
+ "grad_norm": 2.782339096069336,
+ "learning_rate": 4.933557323835224e-06,
+ "loss": 0.5335,
+ "step": 1026
+ },
+ {
+ "epoch": 0.48557919621749407,
+ "grad_norm": 2.6334071159362793,
+ "learning_rate": 4.93341438243351e-06,
+ "loss": 0.6327,
+ "step": 1027
+ },
+ {
+ "epoch": 0.48605200945626476,
+ "grad_norm": 3.0853965282440186,
+ "learning_rate": 4.933271289513751e-06,
+ "loss": 0.7102,
+ "step": 1028
+ },
+ {
+ "epoch": 0.48652482269503544,
+ "grad_norm": 2.619997501373291,
+ "learning_rate": 4.933128045084859e-06,
+ "loss": 0.6138,
+ "step": 1029
+ },
+ {
+ "epoch": 0.48699763593380613,
+ "grad_norm": 2.8316116333007812,
+ "learning_rate": 4.932984649155753e-06,
+ "loss": 0.6346,
+ "step": 1030
+ },
+ {
+ "epoch": 0.4874704491725768,
+ "grad_norm": 3.153486490249634,
+ "learning_rate": 4.932841101735361e-06,
+ "loss": 0.7626,
+ "step": 1031
+ },
+ {
+ "epoch": 0.4879432624113475,
+ "grad_norm": 3.1831274032592773,
+ "learning_rate": 4.9326974028326214e-06,
+ "loss": 0.6607,
+ "step": 1032
+ },
+ {
+ "epoch": 0.4884160756501182,
+ "grad_norm": 2.791078567504883,
+ "learning_rate": 4.932553552456481e-06,
+ "loss": 0.6141,
+ "step": 1033
+ },
+ {
+ "epoch": 0.4888888888888889,
+ "grad_norm": 2.627263307571411,
+ "learning_rate": 4.932409550615898e-06,
+ "loss": 0.6777,
+ "step": 1034
+ },
+ {
+ "epoch": 0.48936170212765956,
+ "grad_norm": 2.8550007343292236,
+ "learning_rate": 4.932265397319838e-06,
+ "loss": 0.6379,
+ "step": 1035
+ },
+ {
+ "epoch": 0.48983451536643025,
+ "grad_norm": 4.505824089050293,
+ "learning_rate": 4.932121092577276e-06,
+ "loss": 0.5892,
+ "step": 1036
+ },
+ {
+ "epoch": 0.49030732860520093,
+ "grad_norm": 3.100191116333008,
+ "learning_rate": 4.931976636397199e-06,
+ "loss": 0.6443,
+ "step": 1037
+ },
+ {
+ "epoch": 0.4907801418439716,
+ "grad_norm": 2.921494245529175,
+ "learning_rate": 4.9318320287886e-06,
+ "loss": 0.6821,
+ "step": 1038
+ },
+ {
+ "epoch": 0.4912529550827423,
+ "grad_norm": 4.577807903289795,
+ "learning_rate": 4.931687269760485e-06,
+ "loss": 0.5946,
+ "step": 1039
+ },
+ {
+ "epoch": 0.491725768321513,
+ "grad_norm": 2.7347636222839355,
+ "learning_rate": 4.931542359321865e-06,
+ "loss": 0.5689,
+ "step": 1040
+ },
+ {
+ "epoch": 0.4921985815602837,
+ "grad_norm": 2.5289158821105957,
+ "learning_rate": 4.931397297481765e-06,
+ "loss": 0.5632,
+ "step": 1041
+ },
+ {
+ "epoch": 0.49267139479905436,
+ "grad_norm": 3.3518471717834473,
+ "learning_rate": 4.9312520842492165e-06,
+ "loss": 0.6349,
+ "step": 1042
+ },
+ {
+ "epoch": 0.49314420803782505,
+ "grad_norm": 3.0469748973846436,
+ "learning_rate": 4.931106719633261e-06,
+ "loss": 0.5734,
+ "step": 1043
+ },
+ {
+ "epoch": 0.49361702127659574,
+ "grad_norm": 3.104682445526123,
+ "learning_rate": 4.930961203642951e-06,
+ "loss": 0.6101,
+ "step": 1044
+ },
+ {
+ "epoch": 0.4940898345153664,
+ "grad_norm": 2.776705503463745,
+ "learning_rate": 4.930815536287346e-06,
+ "loss": 0.6397,
+ "step": 1045
+ },
+ {
+ "epoch": 0.4945626477541371,
+ "grad_norm": 2.760380983352661,
+ "learning_rate": 4.930669717575516e-06,
+ "loss": 0.668,
+ "step": 1046
+ },
+ {
+ "epoch": 0.4950354609929078,
+ "grad_norm": 2.70084547996521,
+ "learning_rate": 4.930523747516541e-06,
+ "loss": 0.5729,
+ "step": 1047
+ },
+ {
+ "epoch": 0.4955082742316785,
+ "grad_norm": 2.7319583892822266,
+ "learning_rate": 4.930377626119511e-06,
+ "loss": 0.6258,
+ "step": 1048
+ },
+ {
+ "epoch": 0.49598108747044917,
+ "grad_norm": 3.2515223026275635,
+ "learning_rate": 4.930231353393521e-06,
+ "loss": 0.7412,
+ "step": 1049
+ },
+ {
+ "epoch": 0.49645390070921985,
+ "grad_norm": 3.0646486282348633,
+ "learning_rate": 4.930084929347682e-06,
+ "loss": 0.5809,
+ "step": 1050
+ },
+ {
+ "epoch": 0.49692671394799054,
+ "grad_norm": 3.1621921062469482,
+ "learning_rate": 4.9299383539911096e-06,
+ "loss": 0.6282,
+ "step": 1051
+ },
+ {
+ "epoch": 0.4973995271867612,
+ "grad_norm": 2.864713191986084,
+ "learning_rate": 4.929791627332931e-06,
+ "loss": 0.6263,
+ "step": 1052
+ },
+ {
+ "epoch": 0.4978723404255319,
+ "grad_norm": 3.181016683578491,
+ "learning_rate": 4.929644749382283e-06,
+ "loss": 0.5697,
+ "step": 1053
+ },
+ {
+ "epoch": 0.4983451536643026,
+ "grad_norm": 2.9064836502075195,
+ "learning_rate": 4.929497720148309e-06,
+ "loss": 0.6161,
+ "step": 1054
+ },
+ {
+ "epoch": 0.4988179669030733,
+ "grad_norm": 3.058112859725952,
+ "learning_rate": 4.9293505396401655e-06,
+ "loss": 0.6477,
+ "step": 1055
+ },
+ {
+ "epoch": 0.49929078014184397,
+ "grad_norm": 2.5227596759796143,
+ "learning_rate": 4.929203207867016e-06,
+ "loss": 0.5819,
+ "step": 1056
+ },
+ {
+ "epoch": 0.49976359338061466,
+ "grad_norm": 3.386862277984619,
+ "learning_rate": 4.929055724838035e-06,
+ "loss": 0.7342,
+ "step": 1057
+ },
+ {
+ "epoch": 0.5002364066193853,
+ "grad_norm": 3.368346929550171,
+ "learning_rate": 4.928908090562404e-06,
+ "loss": 0.6622,
+ "step": 1058
+ },
+ {
+ "epoch": 0.500709219858156,
+ "grad_norm": 2.9108314514160156,
+ "learning_rate": 4.928760305049317e-06,
+ "loss": 0.6598,
+ "step": 1059
+ },
+ {
+ "epoch": 0.5011820330969267,
+ "grad_norm": 2.822305917739868,
+ "learning_rate": 4.928612368307977e-06,
+ "loss": 0.5841,
+ "step": 1060
+ },
+ {
+ "epoch": 0.5016548463356973,
+ "grad_norm": 2.689131259918213,
+ "learning_rate": 4.928464280347592e-06,
+ "loss": 0.6631,
+ "step": 1061
+ },
+ {
+ "epoch": 0.502127659574468,
+ "grad_norm": 3.337214946746826,
+ "learning_rate": 4.9283160411773864e-06,
+ "loss": 0.6105,
+ "step": 1062
+ },
+ {
+ "epoch": 0.5026004728132387,
+ "grad_norm": 3.035911798477173,
+ "learning_rate": 4.928167650806588e-06,
+ "loss": 0.6981,
+ "step": 1063
+ },
+ {
+ "epoch": 0.5030732860520094,
+ "grad_norm": 2.8820855617523193,
+ "learning_rate": 4.9280191092444375e-06,
+ "loss": 0.6408,
+ "step": 1064
+ },
+ {
+ "epoch": 0.5035460992907801,
+ "grad_norm": 3.080432415008545,
+ "learning_rate": 4.927870416500183e-06,
+ "loss": 0.6398,
+ "step": 1065
+ },
+ {
+ "epoch": 0.5040189125295508,
+ "grad_norm": 2.761612892150879,
+ "learning_rate": 4.927721572583084e-06,
+ "loss": 0.6126,
+ "step": 1066
+ },
+ {
+ "epoch": 0.5044917257683215,
+ "grad_norm": 2.8561882972717285,
+ "learning_rate": 4.927572577502408e-06,
+ "loss": 0.584,
+ "step": 1067
+ },
+ {
+ "epoch": 0.5049645390070922,
+ "grad_norm": 3.3386311531066895,
+ "learning_rate": 4.927423431267432e-06,
+ "loss": 0.6666,
+ "step": 1068
+ },
+ {
+ "epoch": 0.5054373522458628,
+ "grad_norm": 2.632906675338745,
+ "learning_rate": 4.927274133887443e-06,
+ "loss": 0.632,
+ "step": 1069
+ },
+ {
+ "epoch": 0.5059101654846335,
+ "grad_norm": 2.8737308979034424,
+ "learning_rate": 4.927124685371737e-06,
+ "loss": 0.6051,
+ "step": 1070
+ },
+ {
+ "epoch": 0.5063829787234042,
+ "grad_norm": 3.042222738265991,
+ "learning_rate": 4.926975085729619e-06,
+ "loss": 0.6954,
+ "step": 1071
+ },
+ {
+ "epoch": 0.5068557919621749,
+ "grad_norm": 3.3341481685638428,
+ "learning_rate": 4.926825334970404e-06,
+ "loss": 0.7148,
+ "step": 1072
+ },
+ {
+ "epoch": 0.5073286052009456,
+ "grad_norm": 2.7415387630462646,
+ "learning_rate": 4.926675433103418e-06,
+ "loss": 0.5456,
+ "step": 1073
+ },
+ {
+ "epoch": 0.5078014184397163,
+ "grad_norm": 2.7545325756073,
+ "learning_rate": 4.926525380137993e-06,
+ "loss": 0.6213,
+ "step": 1074
+ },
+ {
+ "epoch": 0.508274231678487,
+ "grad_norm": 2.9153690338134766,
+ "learning_rate": 4.926375176083472e-06,
+ "loss": 0.6466,
+ "step": 1075
+ },
+ {
+ "epoch": 0.5087470449172576,
+ "grad_norm": 4.210638523101807,
+ "learning_rate": 4.926224820949209e-06,
+ "loss": 0.6192,
+ "step": 1076
+ },
+ {
+ "epoch": 0.5092198581560283,
+ "grad_norm": 2.4357898235321045,
+ "learning_rate": 4.926074314744565e-06,
+ "loss": 0.594,
+ "step": 1077
+ },
+ {
+ "epoch": 0.509692671394799,
+ "grad_norm": 2.8004701137542725,
+ "learning_rate": 4.92592365747891e-06,
+ "loss": 0.6276,
+ "step": 1078
+ },
+ {
+ "epoch": 0.5101654846335697,
+ "grad_norm": 2.920675039291382,
+ "learning_rate": 4.925772849161628e-06,
+ "loss": 0.6043,
+ "step": 1079
+ },
+ {
+ "epoch": 0.5106382978723404,
+ "grad_norm": 2.791555404663086,
+ "learning_rate": 4.9256218898021055e-06,
+ "loss": 0.6837,
+ "step": 1080
+ },
+ {
+ "epoch": 0.5111111111111111,
+ "grad_norm": 3.1702463626861572,
+ "learning_rate": 4.925470779409746e-06,
+ "loss": 0.668,
+ "step": 1081
+ },
+ {
+ "epoch": 0.5115839243498818,
+ "grad_norm": 2.7149479389190674,
+ "learning_rate": 4.925319517993955e-06,
+ "loss": 0.5842,
+ "step": 1082
+ },
+ {
+ "epoch": 0.5120567375886524,
+ "grad_norm": 2.916311025619507,
+ "learning_rate": 4.925168105564153e-06,
+ "loss": 0.6893,
+ "step": 1083
+ },
+ {
+ "epoch": 0.5125295508274231,
+ "grad_norm": 2.917654514312744,
+ "learning_rate": 4.925016542129767e-06,
+ "loss": 0.6513,
+ "step": 1084
+ },
+ {
+ "epoch": 0.5130023640661938,
+ "grad_norm": 2.5568928718566895,
+ "learning_rate": 4.924864827700234e-06,
+ "loss": 0.6177,
+ "step": 1085
+ },
+ {
+ "epoch": 0.5134751773049645,
+ "grad_norm": 2.816720485687256,
+ "learning_rate": 4.924712962285001e-06,
+ "loss": 0.5833,
+ "step": 1086
+ },
+ {
+ "epoch": 0.5139479905437352,
+ "grad_norm": 2.6989188194274902,
+ "learning_rate": 4.9245609458935235e-06,
+ "loss": 0.6332,
+ "step": 1087
+ },
+ {
+ "epoch": 0.5144208037825059,
+ "grad_norm": 2.959599494934082,
+ "learning_rate": 4.924408778535268e-06,
+ "loss": 0.626,
+ "step": 1088
+ },
+ {
+ "epoch": 0.5148936170212766,
+ "grad_norm": 2.872814416885376,
+ "learning_rate": 4.924256460219708e-06,
+ "loss": 0.6407,
+ "step": 1089
+ },
+ {
+ "epoch": 0.5153664302600472,
+ "grad_norm": 2.6989097595214844,
+ "learning_rate": 4.924103990956329e-06,
+ "loss": 0.6391,
+ "step": 1090
+ },
+ {
+ "epoch": 0.5158392434988179,
+ "grad_norm": 2.986492156982422,
+ "learning_rate": 4.9239513707546235e-06,
+ "loss": 0.6911,
+ "step": 1091
+ },
+ {
+ "epoch": 0.5163120567375886,
+ "grad_norm": 3.069920301437378,
+ "learning_rate": 4.9237985996240954e-06,
+ "loss": 0.671,
+ "step": 1092
+ },
+ {
+ "epoch": 0.5167848699763593,
+ "grad_norm": 2.8214917182922363,
+ "learning_rate": 4.9236456775742555e-06,
+ "loss": 0.5885,
+ "step": 1093
+ },
+ {
+ "epoch": 0.51725768321513,
+ "grad_norm": 2.9416961669921875,
+ "learning_rate": 4.923492604614627e-06,
+ "loss": 0.6293,
+ "step": 1094
+ },
+ {
+ "epoch": 0.5177304964539007,
+ "grad_norm": 2.761780023574829,
+ "learning_rate": 4.923339380754741e-06,
+ "loss": 0.649,
+ "step": 1095
+ },
+ {
+ "epoch": 0.5182033096926714,
+ "grad_norm": 2.7648792266845703,
+ "learning_rate": 4.923186006004138e-06,
+ "loss": 0.5906,
+ "step": 1096
+ },
+ {
+ "epoch": 0.518676122931442,
+ "grad_norm": 3.5535428524017334,
+ "learning_rate": 4.923032480372367e-06,
+ "loss": 0.7138,
+ "step": 1097
+ },
+ {
+ "epoch": 0.5191489361702127,
+ "grad_norm": 2.6252479553222656,
+ "learning_rate": 4.922878803868988e-06,
+ "loss": 0.5499,
+ "step": 1098
+ },
+ {
+ "epoch": 0.5196217494089834,
+ "grad_norm": 2.901002883911133,
+ "learning_rate": 4.9227249765035715e-06,
+ "loss": 0.6991,
+ "step": 1099
+ },
+ {
+ "epoch": 0.5200945626477541,
+ "grad_norm": 2.621877431869507,
+ "learning_rate": 4.9225709982856925e-06,
+ "loss": 0.6269,
+ "step": 1100
+ },
+ {
+ "epoch": 0.5205673758865248,
+ "grad_norm": 2.872483015060425,
+ "learning_rate": 4.92241686922494e-06,
+ "loss": 0.6657,
+ "step": 1101
+ },
+ {
+ "epoch": 0.5210401891252955,
+ "grad_norm": 2.730447769165039,
+ "learning_rate": 4.922262589330912e-06,
+ "loss": 0.6061,
+ "step": 1102
+ },
+ {
+ "epoch": 0.5215130023640662,
+ "grad_norm": 2.646247386932373,
+ "learning_rate": 4.922108158613213e-06,
+ "loss": 0.5923,
+ "step": 1103
+ },
+ {
+ "epoch": 0.5219858156028369,
+ "grad_norm": 2.6488895416259766,
+ "learning_rate": 4.92195357708146e-06,
+ "loss": 0.6293,
+ "step": 1104
+ },
+ {
+ "epoch": 0.5224586288416075,
+ "grad_norm": 2.756338357925415,
+ "learning_rate": 4.921798844745278e-06,
+ "loss": 0.6374,
+ "step": 1105
+ },
+ {
+ "epoch": 0.5229314420803782,
+ "grad_norm": 3.1441280841827393,
+ "learning_rate": 4.921643961614301e-06,
+ "loss": 0.6652,
+ "step": 1106
+ },
+ {
+ "epoch": 0.5234042553191489,
+ "grad_norm": 3.050002098083496,
+ "learning_rate": 4.921488927698172e-06,
+ "loss": 0.6809,
+ "step": 1107
+ },
+ {
+ "epoch": 0.5238770685579196,
+ "grad_norm": 2.71750807762146,
+ "learning_rate": 4.921333743006547e-06,
+ "loss": 0.6266,
+ "step": 1108
+ },
+ {
+ "epoch": 0.5243498817966903,
+ "grad_norm": 2.8439245223999023,
+ "learning_rate": 4.921178407549086e-06,
+ "loss": 0.5663,
+ "step": 1109
+ },
+ {
+ "epoch": 0.524822695035461,
+ "grad_norm": 3.0722241401672363,
+ "learning_rate": 4.921022921335464e-06,
+ "loss": 0.6791,
+ "step": 1110
+ },
+ {
+ "epoch": 0.5252955082742317,
+ "grad_norm": 3.4381656646728516,
+ "learning_rate": 4.920867284375358e-06,
+ "loss": 0.6687,
+ "step": 1111
+ },
+ {
+ "epoch": 0.5257683215130023,
+ "grad_norm": 2.819812774658203,
+ "learning_rate": 4.920711496678463e-06,
+ "loss": 0.6299,
+ "step": 1112
+ },
+ {
+ "epoch": 0.526241134751773,
+ "grad_norm": 3.6587414741516113,
+ "learning_rate": 4.9205555582544765e-06,
+ "loss": 0.7392,
+ "step": 1113
+ },
+ {
+ "epoch": 0.5267139479905437,
+ "grad_norm": 2.774296522140503,
+ "learning_rate": 4.920399469113109e-06,
+ "loss": 0.6652,
+ "step": 1114
+ },
+ {
+ "epoch": 0.5271867612293144,
+ "grad_norm": 2.7480580806732178,
+ "learning_rate": 4.920243229264081e-06,
+ "loss": 0.596,
+ "step": 1115
+ },
+ {
+ "epoch": 0.5276595744680851,
+ "grad_norm": 3.213057518005371,
+ "learning_rate": 4.920086838717119e-06,
+ "loss": 0.6986,
+ "step": 1116
+ },
+ {
+ "epoch": 0.5281323877068558,
+ "grad_norm": 2.940546989440918,
+ "learning_rate": 4.919930297481962e-06,
+ "loss": 0.6481,
+ "step": 1117
+ },
+ {
+ "epoch": 0.5286052009456265,
+ "grad_norm": 2.5970494747161865,
+ "learning_rate": 4.9197736055683555e-06,
+ "loss": 0.5658,
+ "step": 1118
+ },
+ {
+ "epoch": 0.5290780141843971,
+ "grad_norm": 4.49385404586792,
+ "learning_rate": 4.919616762986057e-06,
+ "loss": 0.605,
+ "step": 1119
+ },
+ {
+ "epoch": 0.5295508274231678,
+ "grad_norm": 2.971857786178589,
+ "learning_rate": 4.919459769744833e-06,
+ "loss": 0.6539,
+ "step": 1120
+ },
+ {
+ "epoch": 0.5300236406619385,
+ "grad_norm": 2.6192965507507324,
+ "learning_rate": 4.919302625854457e-06,
+ "loss": 0.6226,
+ "step": 1121
+ },
+ {
+ "epoch": 0.5304964539007092,
+ "grad_norm": 2.665088176727295,
+ "learning_rate": 4.919145331324716e-06,
+ "loss": 0.6647,
+ "step": 1122
+ },
+ {
+ "epoch": 0.5309692671394799,
+ "grad_norm": 2.612126111984253,
+ "learning_rate": 4.918987886165403e-06,
+ "loss": 0.6965,
+ "step": 1123
+ },
+ {
+ "epoch": 0.5314420803782506,
+ "grad_norm": 3.80017352104187,
+ "learning_rate": 4.9188302903863205e-06,
+ "loss": 0.7396,
+ "step": 1124
+ },
+ {
+ "epoch": 0.5319148936170213,
+ "grad_norm": 2.781752824783325,
+ "learning_rate": 4.918672543997282e-06,
+ "loss": 0.5985,
+ "step": 1125
+ },
+ {
+ "epoch": 0.532387706855792,
+ "grad_norm": 2.6067914962768555,
+ "learning_rate": 4.91851464700811e-06,
+ "loss": 0.6159,
+ "step": 1126
+ },
+ {
+ "epoch": 0.5328605200945626,
+ "grad_norm": 2.670807123184204,
+ "learning_rate": 4.918356599428636e-06,
+ "loss": 0.5958,
+ "step": 1127
+ },
+ {
+ "epoch": 0.5333333333333333,
+ "grad_norm": 2.608611822128296,
+ "learning_rate": 4.9181984012687e-06,
+ "loss": 0.5768,
+ "step": 1128
+ },
+ {
+ "epoch": 0.533806146572104,
+ "grad_norm": 2.586764097213745,
+ "learning_rate": 4.918040052538154e-06,
+ "loss": 0.661,
+ "step": 1129
+ },
+ {
+ "epoch": 0.5342789598108747,
+ "grad_norm": 3.1317451000213623,
+ "learning_rate": 4.917881553246856e-06,
+ "loss": 0.6626,
+ "step": 1130
+ },
+ {
+ "epoch": 0.5347517730496454,
+ "grad_norm": 2.7135281562805176,
+ "learning_rate": 4.917722903404676e-06,
+ "loss": 0.6572,
+ "step": 1131
+ },
+ {
+ "epoch": 0.5352245862884161,
+ "grad_norm": 3.4546358585357666,
+ "learning_rate": 4.917564103021493e-06,
+ "loss": 0.5597,
+ "step": 1132
+ },
+ {
+ "epoch": 0.5356973995271868,
+ "grad_norm": 3.0943493843078613,
+ "learning_rate": 4.917405152107193e-06,
+ "loss": 0.7258,
+ "step": 1133
+ },
+ {
+ "epoch": 0.5361702127659574,
+ "grad_norm": 2.6069352626800537,
+ "learning_rate": 4.917246050671674e-06,
+ "loss": 0.6209,
+ "step": 1134
+ },
+ {
+ "epoch": 0.5366430260047281,
+ "grad_norm": 2.584883689880371,
+ "learning_rate": 4.917086798724844e-06,
+ "loss": 0.658,
+ "step": 1135
+ },
+ {
+ "epoch": 0.5371158392434988,
+ "grad_norm": 3.001976490020752,
+ "learning_rate": 4.9169273962766166e-06,
+ "loss": 0.6306,
+ "step": 1136
+ },
+ {
+ "epoch": 0.5375886524822695,
+ "grad_norm": 2.5013928413391113,
+ "learning_rate": 4.916767843336918e-06,
+ "loss": 0.572,
+ "step": 1137
+ },
+ {
+ "epoch": 0.5380614657210402,
+ "grad_norm": 2.9114553928375244,
+ "learning_rate": 4.916608139915684e-06,
+ "loss": 0.5841,
+ "step": 1138
+ },
+ {
+ "epoch": 0.5385342789598109,
+ "grad_norm": 2.8878467082977295,
+ "learning_rate": 4.9164482860228564e-06,
+ "loss": 0.6654,
+ "step": 1139
+ },
+ {
+ "epoch": 0.5390070921985816,
+ "grad_norm": 2.9827866554260254,
+ "learning_rate": 4.91628828166839e-06,
+ "loss": 0.6674,
+ "step": 1140
+ },
+ {
+ "epoch": 0.5394799054373522,
+ "grad_norm": 3.8696281909942627,
+ "learning_rate": 4.916128126862248e-06,
+ "loss": 0.6241,
+ "step": 1141
+ },
+ {
+ "epoch": 0.5399527186761229,
+ "grad_norm": 2.9556291103363037,
+ "learning_rate": 4.915967821614402e-06,
+ "loss": 0.6478,
+ "step": 1142
+ },
+ {
+ "epoch": 0.5404255319148936,
+ "grad_norm": 2.392942428588867,
+ "learning_rate": 4.915807365934834e-06,
+ "loss": 0.6097,
+ "step": 1143
+ },
+ {
+ "epoch": 0.5408983451536643,
+ "grad_norm": 3.032235860824585,
+ "learning_rate": 4.915646759833534e-06,
+ "loss": 0.7193,
+ "step": 1144
+ },
+ {
+ "epoch": 0.541371158392435,
+ "grad_norm": 2.840416193008423,
+ "learning_rate": 4.915486003320501e-06,
+ "loss": 0.5506,
+ "step": 1145
+ },
+ {
+ "epoch": 0.5418439716312057,
+ "grad_norm": 2.5438895225524902,
+ "learning_rate": 4.915325096405747e-06,
+ "loss": 0.6487,
+ "step": 1146
+ },
+ {
+ "epoch": 0.5423167848699764,
+ "grad_norm": 2.544334650039673,
+ "learning_rate": 4.9151640390992905e-06,
+ "loss": 0.6168,
+ "step": 1147
+ },
+ {
+ "epoch": 0.542789598108747,
+ "grad_norm": 2.8535678386688232,
+ "learning_rate": 4.91500283141116e-06,
+ "loss": 0.678,
+ "step": 1148
+ },
+ {
+ "epoch": 0.5432624113475177,
+ "grad_norm": 2.8086955547332764,
+ "learning_rate": 4.9148414733513915e-06,
+ "loss": 0.6473,
+ "step": 1149
+ },
+ {
+ "epoch": 0.5437352245862884,
+ "grad_norm": 2.4709885120391846,
+ "learning_rate": 4.914679964930034e-06,
+ "loss": 0.6797,
+ "step": 1150
+ },
+ {
+ "epoch": 0.5442080378250591,
+ "grad_norm": 2.8546934127807617,
+ "learning_rate": 4.9145183061571435e-06,
+ "loss": 0.6247,
+ "step": 1151
+ },
+ {
+ "epoch": 0.5446808510638298,
+ "grad_norm": 2.991184711456299,
+ "learning_rate": 4.9143564970427844e-06,
+ "loss": 0.5977,
+ "step": 1152
+ },
+ {
+ "epoch": 0.5451536643026005,
+ "grad_norm": 3.011216402053833,
+ "learning_rate": 4.914194537597033e-06,
+ "loss": 0.7005,
+ "step": 1153
+ },
+ {
+ "epoch": 0.5456264775413712,
+ "grad_norm": 2.807521343231201,
+ "learning_rate": 4.9140324278299744e-06,
+ "loss": 0.5412,
+ "step": 1154
+ },
+ {
+ "epoch": 0.5460992907801419,
+ "grad_norm": 3.0401229858398438,
+ "learning_rate": 4.913870167751701e-06,
+ "loss": 0.6394,
+ "step": 1155
+ },
+ {
+ "epoch": 0.5465721040189125,
+ "grad_norm": 2.853914976119995,
+ "learning_rate": 4.913707757372317e-06,
+ "loss": 0.6745,
+ "step": 1156
+ },
+ {
+ "epoch": 0.5470449172576832,
+ "grad_norm": 4.505620956420898,
+ "learning_rate": 4.913545196701935e-06,
+ "loss": 0.6668,
+ "step": 1157
+ },
+ {
+ "epoch": 0.5475177304964539,
+ "grad_norm": 3.0505781173706055,
+ "learning_rate": 4.913382485750676e-06,
+ "loss": 0.6926,
+ "step": 1158
+ },
+ {
+ "epoch": 0.5479905437352246,
+ "grad_norm": 2.798435688018799,
+ "learning_rate": 4.913219624528672e-06,
+ "loss": 0.605,
+ "step": 1159
+ },
+ {
+ "epoch": 0.5484633569739953,
+ "grad_norm": 2.7814908027648926,
+ "learning_rate": 4.913056613046065e-06,
+ "loss": 0.6678,
+ "step": 1160
+ },
+ {
+ "epoch": 0.548936170212766,
+ "grad_norm": 3.2089321613311768,
+ "learning_rate": 4.9128934513130025e-06,
+ "loss": 0.5995,
+ "step": 1161
+ },
+ {
+ "epoch": 0.5494089834515367,
+ "grad_norm": 2.7699952125549316,
+ "learning_rate": 4.9127301393396455e-06,
+ "loss": 0.7062,
+ "step": 1162
+ },
+ {
+ "epoch": 0.5498817966903073,
+ "grad_norm": 2.859368324279785,
+ "learning_rate": 4.912566677136162e-06,
+ "loss": 0.6063,
+ "step": 1163
+ },
+ {
+ "epoch": 0.550354609929078,
+ "grad_norm": 2.727334499359131,
+ "learning_rate": 4.91240306471273e-06,
+ "loss": 0.6848,
+ "step": 1164
+ },
+ {
+ "epoch": 0.5508274231678487,
+ "grad_norm": 2.6017510890960693,
+ "learning_rate": 4.912239302079537e-06,
+ "loss": 0.5808,
+ "step": 1165
+ },
+ {
+ "epoch": 0.5513002364066194,
+ "grad_norm": 3.539583206176758,
+ "learning_rate": 4.912075389246781e-06,
+ "loss": 0.7053,
+ "step": 1166
+ },
+ {
+ "epoch": 0.5517730496453901,
+ "grad_norm": 2.918280601501465,
+ "learning_rate": 4.911911326224666e-06,
+ "loss": 0.5904,
+ "step": 1167
+ },
+ {
+ "epoch": 0.5522458628841608,
+ "grad_norm": 3.0067362785339355,
+ "learning_rate": 4.9117471130234095e-06,
+ "loss": 0.6392,
+ "step": 1168
+ },
+ {
+ "epoch": 0.5527186761229315,
+ "grad_norm": 2.4374797344207764,
+ "learning_rate": 4.911582749653236e-06,
+ "loss": 0.5793,
+ "step": 1169
+ },
+ {
+ "epoch": 0.5531914893617021,
+ "grad_norm": 3.121182918548584,
+ "learning_rate": 4.911418236124378e-06,
+ "loss": 0.6636,
+ "step": 1170
+ },
+ {
+ "epoch": 0.5536643026004728,
+ "grad_norm": 3.1289851665496826,
+ "learning_rate": 4.91125357244708e-06,
+ "loss": 0.656,
+ "step": 1171
+ },
+ {
+ "epoch": 0.5541371158392435,
+ "grad_norm": 2.7034592628479004,
+ "learning_rate": 4.911088758631596e-06,
+ "loss": 0.6001,
+ "step": 1172
+ },
+ {
+ "epoch": 0.5546099290780142,
+ "grad_norm": 2.710146188735962,
+ "learning_rate": 4.910923794688187e-06,
+ "loss": 0.6007,
+ "step": 1173
+ },
+ {
+ "epoch": 0.5550827423167849,
+ "grad_norm": 2.5424487590789795,
+ "learning_rate": 4.910758680627124e-06,
+ "loss": 0.5193,
+ "step": 1174
+ },
+ {
+ "epoch": 0.5555555555555556,
+ "grad_norm": 2.615893602371216,
+ "learning_rate": 4.91059341645869e-06,
+ "loss": 0.5525,
+ "step": 1175
+ },
+ {
+ "epoch": 0.5560283687943263,
+ "grad_norm": 3.3179728984832764,
+ "learning_rate": 4.910428002193174e-06,
+ "loss": 0.7285,
+ "step": 1176
+ },
+ {
+ "epoch": 0.556501182033097,
+ "grad_norm": 2.7234175205230713,
+ "learning_rate": 4.910262437840875e-06,
+ "loss": 0.574,
+ "step": 1177
+ },
+ {
+ "epoch": 0.5569739952718676,
+ "grad_norm": 3.0416605472564697,
+ "learning_rate": 4.9100967234121034e-06,
+ "loss": 0.5623,
+ "step": 1178
+ },
+ {
+ "epoch": 0.5574468085106383,
+ "grad_norm": 3.067786455154419,
+ "learning_rate": 4.909930858917177e-06,
+ "loss": 0.6491,
+ "step": 1179
+ },
+ {
+ "epoch": 0.557919621749409,
+ "grad_norm": 3.0037379264831543,
+ "learning_rate": 4.909764844366422e-06,
+ "loss": 0.5696,
+ "step": 1180
+ },
+ {
+ "epoch": 0.5583924349881797,
+ "grad_norm": 2.966179609298706,
+ "learning_rate": 4.909598679770178e-06,
+ "loss": 0.6042,
+ "step": 1181
+ },
+ {
+ "epoch": 0.5588652482269504,
+ "grad_norm": 2.6000657081604004,
+ "learning_rate": 4.909432365138789e-06,
+ "loss": 0.5883,
+ "step": 1182
+ },
+ {
+ "epoch": 0.5593380614657211,
+ "grad_norm": 2.6794495582580566,
+ "learning_rate": 4.909265900482612e-06,
+ "loss": 0.6809,
+ "step": 1183
+ },
+ {
+ "epoch": 0.5598108747044918,
+ "grad_norm": 2.6765122413635254,
+ "learning_rate": 4.9090992858120115e-06,
+ "loss": 0.6601,
+ "step": 1184
+ },
+ {
+ "epoch": 0.5602836879432624,
+ "grad_norm": 2.6051928997039795,
+ "learning_rate": 4.908932521137363e-06,
+ "loss": 0.5946,
+ "step": 1185
+ },
+ {
+ "epoch": 0.5607565011820331,
+ "grad_norm": 3.0405542850494385,
+ "learning_rate": 4.908765606469048e-06,
+ "loss": 0.6998,
+ "step": 1186
+ },
+ {
+ "epoch": 0.5612293144208038,
+ "grad_norm": 2.7975668907165527,
+ "learning_rate": 4.908598541817462e-06,
+ "loss": 0.6218,
+ "step": 1187
+ },
+ {
+ "epoch": 0.5617021276595745,
+ "grad_norm": 2.5367627143859863,
+ "learning_rate": 4.908431327193005e-06,
+ "loss": 0.6354,
+ "step": 1188
+ },
+ {
+ "epoch": 0.5621749408983452,
+ "grad_norm": 3.7939631938934326,
+ "learning_rate": 4.908263962606091e-06,
+ "loss": 0.6376,
+ "step": 1189
+ },
+ {
+ "epoch": 0.5626477541371159,
+ "grad_norm": 2.864079475402832,
+ "learning_rate": 4.908096448067139e-06,
+ "loss": 0.5485,
+ "step": 1190
+ },
+ {
+ "epoch": 0.5631205673758866,
+ "grad_norm": 2.7855563163757324,
+ "learning_rate": 4.9079287835865804e-06,
+ "loss": 0.6645,
+ "step": 1191
+ },
+ {
+ "epoch": 0.5635933806146572,
+ "grad_norm": 2.6156625747680664,
+ "learning_rate": 4.9077609691748556e-06,
+ "loss": 0.5751,
+ "step": 1192
+ },
+ {
+ "epoch": 0.5640661938534279,
+ "grad_norm": 3.0475659370422363,
+ "learning_rate": 4.907593004842412e-06,
+ "loss": 0.6739,
+ "step": 1193
+ },
+ {
+ "epoch": 0.5645390070921986,
+ "grad_norm": 2.9176738262176514,
+ "learning_rate": 4.9074248905997104e-06,
+ "loss": 0.6493,
+ "step": 1194
+ },
+ {
+ "epoch": 0.5650118203309693,
+ "grad_norm": 2.6168384552001953,
+ "learning_rate": 4.907256626457216e-06,
+ "loss": 0.6154,
+ "step": 1195
+ },
+ {
+ "epoch": 0.56548463356974,
+ "grad_norm": 2.893980026245117,
+ "learning_rate": 4.907088212425408e-06,
+ "loss": 0.5808,
+ "step": 1196
+ },
+ {
+ "epoch": 0.5659574468085107,
+ "grad_norm": 3.3832836151123047,
+ "learning_rate": 4.90691964851477e-06,
+ "loss": 0.7888,
+ "step": 1197
+ },
+ {
+ "epoch": 0.5664302600472814,
+ "grad_norm": 3.088932752609253,
+ "learning_rate": 4.906750934735801e-06,
+ "loss": 0.6516,
+ "step": 1198
+ },
+ {
+ "epoch": 0.566903073286052,
+ "grad_norm": 2.494471549987793,
+ "learning_rate": 4.906582071099004e-06,
+ "loss": 0.6286,
+ "step": 1199
+ },
+ {
+ "epoch": 0.5673758865248227,
+ "grad_norm": 2.716550588607788,
+ "learning_rate": 4.906413057614895e-06,
+ "loss": 0.5939,
+ "step": 1200
+ },
+ {
+ "epoch": 0.5678486997635934,
+ "grad_norm": 2.5821073055267334,
+ "learning_rate": 4.906243894293995e-06,
+ "loss": 0.6668,
+ "step": 1201
+ },
+ {
+ "epoch": 0.5683215130023641,
+ "grad_norm": 3.651787042617798,
+ "learning_rate": 4.90607458114684e-06,
+ "loss": 0.6124,
+ "step": 1202
+ },
+ {
+ "epoch": 0.5687943262411348,
+ "grad_norm": 2.7567858695983887,
+ "learning_rate": 4.9059051181839705e-06,
+ "loss": 0.6656,
+ "step": 1203
+ },
+ {
+ "epoch": 0.5692671394799055,
+ "grad_norm": 2.8067586421966553,
+ "learning_rate": 4.90573550541594e-06,
+ "loss": 0.6306,
+ "step": 1204
+ },
+ {
+ "epoch": 0.5697399527186762,
+ "grad_norm": 2.6136393547058105,
+ "learning_rate": 4.905565742853307e-06,
+ "loss": 0.5992,
+ "step": 1205
+ },
+ {
+ "epoch": 0.5702127659574469,
+ "grad_norm": 2.899049758911133,
+ "learning_rate": 4.905395830506644e-06,
+ "loss": 0.621,
+ "step": 1206
+ },
+ {
+ "epoch": 0.5706855791962175,
+ "grad_norm": 3.036583185195923,
+ "learning_rate": 4.9052257683865294e-06,
+ "loss": 0.652,
+ "step": 1207
+ },
+ {
+ "epoch": 0.5711583924349882,
+ "grad_norm": 2.7947216033935547,
+ "learning_rate": 4.905055556503553e-06,
+ "loss": 0.6636,
+ "step": 1208
+ },
+ {
+ "epoch": 0.5716312056737589,
+ "grad_norm": 3.1646955013275146,
+ "learning_rate": 4.9048851948683135e-06,
+ "loss": 0.6376,
+ "step": 1209
+ },
+ {
+ "epoch": 0.5721040189125296,
+ "grad_norm": 2.8175766468048096,
+ "learning_rate": 4.904714683491417e-06,
+ "loss": 0.5929,
+ "step": 1210
+ },
+ {
+ "epoch": 0.5725768321513003,
+ "grad_norm": 2.923923969268799,
+ "learning_rate": 4.904544022383483e-06,
+ "loss": 0.6633,
+ "step": 1211
+ },
+ {
+ "epoch": 0.573049645390071,
+ "grad_norm": 2.7471134662628174,
+ "learning_rate": 4.9043732115551356e-06,
+ "loss": 0.6551,
+ "step": 1212
+ },
+ {
+ "epoch": 0.5735224586288417,
+ "grad_norm": 2.8660807609558105,
+ "learning_rate": 4.90420225101701e-06,
+ "loss": 0.6423,
+ "step": 1213
+ },
+ {
+ "epoch": 0.5739952718676123,
+ "grad_norm": 2.769247531890869,
+ "learning_rate": 4.904031140779754e-06,
+ "loss": 0.5982,
+ "step": 1214
+ },
+ {
+ "epoch": 0.574468085106383,
+ "grad_norm": 2.9043145179748535,
+ "learning_rate": 4.90385988085402e-06,
+ "loss": 0.5843,
+ "step": 1215
+ },
+ {
+ "epoch": 0.5749408983451537,
+ "grad_norm": 2.6639609336853027,
+ "learning_rate": 4.903688471250471e-06,
+ "loss": 0.5858,
+ "step": 1216
+ },
+ {
+ "epoch": 0.5754137115839244,
+ "grad_norm": 2.6967573165893555,
+ "learning_rate": 4.903516911979781e-06,
+ "loss": 0.5755,
+ "step": 1217
+ },
+ {
+ "epoch": 0.5758865248226951,
+ "grad_norm": 2.8865857124328613,
+ "learning_rate": 4.903345203052633e-06,
+ "loss": 0.6051,
+ "step": 1218
+ },
+ {
+ "epoch": 0.5763593380614658,
+ "grad_norm": 2.381979465484619,
+ "learning_rate": 4.903173344479717e-06,
+ "loss": 0.5727,
+ "step": 1219
+ },
+ {
+ "epoch": 0.5768321513002365,
+ "grad_norm": 2.7717981338500977,
+ "learning_rate": 4.903001336271734e-06,
+ "loss": 0.6406,
+ "step": 1220
+ },
+ {
+ "epoch": 0.577304964539007,
+ "grad_norm": 2.6431570053100586,
+ "learning_rate": 4.902829178439395e-06,
+ "loss": 0.6226,
+ "step": 1221
+ },
+ {
+ "epoch": 0.5777777777777777,
+ "grad_norm": 2.8090415000915527,
+ "learning_rate": 4.902656870993419e-06,
+ "loss": 0.5761,
+ "step": 1222
+ },
+ {
+ "epoch": 0.5782505910165484,
+ "grad_norm": 2.4769368171691895,
+ "learning_rate": 4.902484413944535e-06,
+ "loss": 0.5602,
+ "step": 1223
+ },
+ {
+ "epoch": 0.5787234042553191,
+ "grad_norm": 2.693316698074341,
+ "learning_rate": 4.902311807303481e-06,
+ "loss": 0.5222,
+ "step": 1224
+ },
+ {
+ "epoch": 0.5791962174940898,
+ "grad_norm": 2.7623913288116455,
+ "learning_rate": 4.902139051081004e-06,
+ "loss": 0.6978,
+ "step": 1225
+ },
+ {
+ "epoch": 0.5796690307328605,
+ "grad_norm": 2.6133766174316406,
+ "learning_rate": 4.901966145287863e-06,
+ "loss": 0.5802,
+ "step": 1226
+ },
+ {
+ "epoch": 0.5801418439716312,
+ "grad_norm": 2.7345972061157227,
+ "learning_rate": 4.901793089934821e-06,
+ "loss": 0.6294,
+ "step": 1227
+ },
+ {
+ "epoch": 0.5806146572104018,
+ "grad_norm": 2.7545835971832275,
+ "learning_rate": 4.9016198850326555e-06,
+ "loss": 0.6085,
+ "step": 1228
+ },
+ {
+ "epoch": 0.5810874704491725,
+ "grad_norm": 2.6947758197784424,
+ "learning_rate": 4.90144653059215e-06,
+ "loss": 0.6025,
+ "step": 1229
+ },
+ {
+ "epoch": 0.5815602836879432,
+ "grad_norm": 2.692967414855957,
+ "learning_rate": 4.901273026624099e-06,
+ "loss": 0.5715,
+ "step": 1230
+ },
+ {
+ "epoch": 0.5820330969267139,
+ "grad_norm": 2.78347110748291,
+ "learning_rate": 4.901099373139307e-06,
+ "loss": 0.6063,
+ "step": 1231
+ },
+ {
+ "epoch": 0.5825059101654846,
+ "grad_norm": 2.346496343612671,
+ "learning_rate": 4.900925570148585e-06,
+ "loss": 0.5869,
+ "step": 1232
+ },
+ {
+ "epoch": 0.5829787234042553,
+ "grad_norm": 2.606639862060547,
+ "learning_rate": 4.900751617662755e-06,
+ "loss": 0.6197,
+ "step": 1233
+ },
+ {
+ "epoch": 0.583451536643026,
+ "grad_norm": 2.5825929641723633,
+ "learning_rate": 4.900577515692649e-06,
+ "loss": 0.6721,
+ "step": 1234
+ },
+ {
+ "epoch": 0.5839243498817966,
+ "grad_norm": 2.731349468231201,
+ "learning_rate": 4.900403264249107e-06,
+ "loss": 0.6273,
+ "step": 1235
+ },
+ {
+ "epoch": 0.5843971631205673,
+ "grad_norm": 3.2133874893188477,
+ "learning_rate": 4.90022886334298e-06,
+ "loss": 0.6231,
+ "step": 1236
+ },
+ {
+ "epoch": 0.584869976359338,
+ "grad_norm": 2.9213852882385254,
+ "learning_rate": 4.900054312985127e-06,
+ "loss": 0.6677,
+ "step": 1237
+ },
+ {
+ "epoch": 0.5853427895981087,
+ "grad_norm": 2.815425157546997,
+ "learning_rate": 4.899879613186414e-06,
+ "loss": 0.6405,
+ "step": 1238
+ },
+ {
+ "epoch": 0.5858156028368794,
+ "grad_norm": 2.730782985687256,
+ "learning_rate": 4.899704763957721e-06,
+ "loss": 0.6233,
+ "step": 1239
+ },
+ {
+ "epoch": 0.5862884160756501,
+ "grad_norm": 2.6432766914367676,
+ "learning_rate": 4.899529765309936e-06,
+ "loss": 0.6267,
+ "step": 1240
+ },
+ {
+ "epoch": 0.5867612293144208,
+ "grad_norm": 2.616215229034424,
+ "learning_rate": 4.899354617253953e-06,
+ "loss": 0.6268,
+ "step": 1241
+ },
+ {
+ "epoch": 0.5872340425531914,
+ "grad_norm": 2.7630255222320557,
+ "learning_rate": 4.899179319800679e-06,
+ "loss": 0.6348,
+ "step": 1242
+ },
+ {
+ "epoch": 0.5877068557919621,
+ "grad_norm": 2.785095453262329,
+ "learning_rate": 4.899003872961029e-06,
+ "loss": 0.5839,
+ "step": 1243
+ },
+ {
+ "epoch": 0.5881796690307328,
+ "grad_norm": 2.9050328731536865,
+ "learning_rate": 4.898828276745927e-06,
+ "loss": 0.651,
+ "step": 1244
+ },
+ {
+ "epoch": 0.5886524822695035,
+ "grad_norm": 2.958092212677002,
+ "learning_rate": 4.8986525311663065e-06,
+ "loss": 0.6395,
+ "step": 1245
+ },
+ {
+ "epoch": 0.5891252955082742,
+ "grad_norm": 2.952310800552368,
+ "learning_rate": 4.898476636233111e-06,
+ "loss": 0.6731,
+ "step": 1246
+ },
+ {
+ "epoch": 0.5895981087470449,
+ "grad_norm": 2.9876346588134766,
+ "learning_rate": 4.898300591957293e-06,
+ "loss": 0.7015,
+ "step": 1247
+ },
+ {
+ "epoch": 0.5900709219858156,
+ "grad_norm": 2.8941752910614014,
+ "learning_rate": 4.898124398349813e-06,
+ "loss": 0.6452,
+ "step": 1248
+ },
+ {
+ "epoch": 0.5905437352245863,
+ "grad_norm": 2.9809536933898926,
+ "learning_rate": 4.897948055421642e-06,
+ "loss": 0.5736,
+ "step": 1249
+ },
+ {
+ "epoch": 0.5910165484633569,
+ "grad_norm": 2.927046775817871,
+ "learning_rate": 4.897771563183761e-06,
+ "loss": 0.5918,
+ "step": 1250
+ },
+ {
+ "epoch": 0.5914893617021276,
+ "grad_norm": 2.865020275115967,
+ "learning_rate": 4.897594921647158e-06,
+ "loss": 0.6924,
+ "step": 1251
+ },
+ {
+ "epoch": 0.5919621749408983,
+ "grad_norm": 2.7406699657440186,
+ "learning_rate": 4.897418130822832e-06,
+ "loss": 0.509,
+ "step": 1252
+ },
+ {
+ "epoch": 0.592434988179669,
+ "grad_norm": 2.781606912612915,
+ "learning_rate": 4.897241190721791e-06,
+ "loss": 0.5555,
+ "step": 1253
+ },
+ {
+ "epoch": 0.5929078014184397,
+ "grad_norm": 2.79209303855896,
+ "learning_rate": 4.8970641013550535e-06,
+ "loss": 0.6722,
+ "step": 1254
+ },
+ {
+ "epoch": 0.5933806146572104,
+ "grad_norm": 3.0672268867492676,
+ "learning_rate": 4.896886862733645e-06,
+ "loss": 0.6366,
+ "step": 1255
+ },
+ {
+ "epoch": 0.5938534278959811,
+ "grad_norm": 2.7456953525543213,
+ "learning_rate": 4.896709474868602e-06,
+ "loss": 0.6246,
+ "step": 1256
+ },
+ {
+ "epoch": 0.5943262411347517,
+ "grad_norm": 3.6731202602386475,
+ "learning_rate": 4.896531937770968e-06,
+ "loss": 0.668,
+ "step": 1257
+ },
+ {
+ "epoch": 0.5947990543735224,
+ "grad_norm": 2.6056087017059326,
+ "learning_rate": 4.8963542514518e-06,
+ "loss": 0.5815,
+ "step": 1258
+ },
+ {
+ "epoch": 0.5952718676122931,
+ "grad_norm": 2.719698905944824,
+ "learning_rate": 4.89617641592216e-06,
+ "loss": 0.6058,
+ "step": 1259
+ },
+ {
+ "epoch": 0.5957446808510638,
+ "grad_norm": 2.625838279724121,
+ "learning_rate": 4.895998431193121e-06,
+ "loss": 0.6143,
+ "step": 1260
+ },
+ {
+ "epoch": 0.5962174940898345,
+ "grad_norm": 2.7166085243225098,
+ "learning_rate": 4.895820297275767e-06,
+ "loss": 0.5187,
+ "step": 1261
+ },
+ {
+ "epoch": 0.5966903073286052,
+ "grad_norm": 2.7544102668762207,
+ "learning_rate": 4.8956420141811875e-06,
+ "loss": 0.5928,
+ "step": 1262
+ },
+ {
+ "epoch": 0.5971631205673759,
+ "grad_norm": 2.6678333282470703,
+ "learning_rate": 4.895463581920484e-06,
+ "loss": 0.611,
+ "step": 1263
+ },
+ {
+ "epoch": 0.5976359338061465,
+ "grad_norm": 2.853384494781494,
+ "learning_rate": 4.895285000504768e-06,
+ "loss": 0.642,
+ "step": 1264
+ },
+ {
+ "epoch": 0.5981087470449172,
+ "grad_norm": 2.637852430343628,
+ "learning_rate": 4.895106269945158e-06,
+ "loss": 0.6308,
+ "step": 1265
+ },
+ {
+ "epoch": 0.5985815602836879,
+ "grad_norm": 2.9880387783050537,
+ "learning_rate": 4.8949273902527826e-06,
+ "loss": 0.5781,
+ "step": 1266
+ },
+ {
+ "epoch": 0.5990543735224586,
+ "grad_norm": 3.5984015464782715,
+ "learning_rate": 4.89474836143878e-06,
+ "loss": 0.5865,
+ "step": 1267
+ },
+ {
+ "epoch": 0.5995271867612293,
+ "grad_norm": 2.719855546951294,
+ "learning_rate": 4.8945691835142975e-06,
+ "loss": 0.6393,
+ "step": 1268
+ },
+ {
+ "epoch": 0.6,
+ "grad_norm": 2.7885141372680664,
+ "learning_rate": 4.894389856490492e-06,
+ "loss": 0.66,
+ "step": 1269
+ },
+ {
+ "epoch": 0.6004728132387707,
+ "grad_norm": 2.698819875717163,
+ "learning_rate": 4.894210380378529e-06,
+ "loss": 0.6144,
+ "step": 1270
+ },
+ {
+ "epoch": 0.6009456264775414,
+ "grad_norm": 2.278045654296875,
+ "learning_rate": 4.894030755189584e-06,
+ "loss": 0.5609,
+ "step": 1271
+ },
+ {
+ "epoch": 0.601418439716312,
+ "grad_norm": 2.8729357719421387,
+ "learning_rate": 4.893850980934841e-06,
+ "loss": 0.6715,
+ "step": 1272
+ },
+ {
+ "epoch": 0.6018912529550827,
+ "grad_norm": 2.8541221618652344,
+ "learning_rate": 4.893671057625495e-06,
+ "loss": 0.6787,
+ "step": 1273
+ },
+ {
+ "epoch": 0.6023640661938534,
+ "grad_norm": 2.4561476707458496,
+ "learning_rate": 4.893490985272748e-06,
+ "loss": 0.6331,
+ "step": 1274
+ },
+ {
+ "epoch": 0.6028368794326241,
+ "grad_norm": 2.565739154815674,
+ "learning_rate": 4.893310763887812e-06,
+ "loss": 0.587,
+ "step": 1275
+ },
+ {
+ "epoch": 0.6033096926713948,
+ "grad_norm": 2.384951591491699,
+ "learning_rate": 4.8931303934819095e-06,
+ "loss": 0.5358,
+ "step": 1276
+ },
+ {
+ "epoch": 0.6037825059101655,
+ "grad_norm": 2.380808115005493,
+ "learning_rate": 4.89294987406627e-06,
+ "loss": 0.5402,
+ "step": 1277
+ },
+ {
+ "epoch": 0.6042553191489362,
+ "grad_norm": 2.764815092086792,
+ "learning_rate": 4.892769205652136e-06,
+ "loss": 0.6103,
+ "step": 1278
+ },
+ {
+ "epoch": 0.6047281323877068,
+ "grad_norm": 2.463350296020508,
+ "learning_rate": 4.892588388250754e-06,
+ "loss": 0.5937,
+ "step": 1279
+ },
+ {
+ "epoch": 0.6052009456264775,
+ "grad_norm": 3.099689245223999,
+ "learning_rate": 4.8924074218733855e-06,
+ "loss": 0.6354,
+ "step": 1280
+ },
+ {
+ "epoch": 0.6056737588652482,
+ "grad_norm": 2.804450035095215,
+ "learning_rate": 4.892226306531297e-06,
+ "loss": 0.6595,
+ "step": 1281
+ },
+ {
+ "epoch": 0.6061465721040189,
+ "grad_norm": 3.1559767723083496,
+ "learning_rate": 4.892045042235765e-06,
+ "loss": 0.6664,
+ "step": 1282
+ },
+ {
+ "epoch": 0.6066193853427896,
+ "grad_norm": 2.844341993331909,
+ "learning_rate": 4.891863628998079e-06,
+ "loss": 0.7454,
+ "step": 1283
+ },
+ {
+ "epoch": 0.6070921985815603,
+ "grad_norm": 2.686602830886841,
+ "learning_rate": 4.891682066829532e-06,
+ "loss": 0.6755,
+ "step": 1284
+ },
+ {
+ "epoch": 0.607565011820331,
+ "grad_norm": 2.736457347869873,
+ "learning_rate": 4.8915003557414285e-06,
+ "loss": 0.6305,
+ "step": 1285
+ },
+ {
+ "epoch": 0.6080378250591016,
+ "grad_norm": 2.661362409591675,
+ "learning_rate": 4.891318495745086e-06,
+ "loss": 0.5958,
+ "step": 1286
+ },
+ {
+ "epoch": 0.6085106382978723,
+ "grad_norm": 2.707348108291626,
+ "learning_rate": 4.8911364868518255e-06,
+ "loss": 0.5824,
+ "step": 1287
+ },
+ {
+ "epoch": 0.608983451536643,
+ "grad_norm": 2.9798858165740967,
+ "learning_rate": 4.890954329072981e-06,
+ "loss": 0.5981,
+ "step": 1288
+ },
+ {
+ "epoch": 0.6094562647754137,
+ "grad_norm": 2.6285455226898193,
+ "learning_rate": 4.890772022419895e-06,
+ "loss": 0.6194,
+ "step": 1289
+ },
+ {
+ "epoch": 0.6099290780141844,
+ "grad_norm": 2.9254322052001953,
+ "learning_rate": 4.890589566903917e-06,
+ "loss": 0.6002,
+ "step": 1290
+ },
+ {
+ "epoch": 0.6104018912529551,
+ "grad_norm": 2.6458325386047363,
+ "learning_rate": 4.89040696253641e-06,
+ "loss": 0.5457,
+ "step": 1291
+ },
+ {
+ "epoch": 0.6108747044917258,
+ "grad_norm": 2.508242607116699,
+ "learning_rate": 4.890224209328743e-06,
+ "loss": 0.6168,
+ "step": 1292
+ },
+ {
+ "epoch": 0.6113475177304964,
+ "grad_norm": 3.034785509109497,
+ "learning_rate": 4.890041307292296e-06,
+ "loss": 0.664,
+ "step": 1293
+ },
+ {
+ "epoch": 0.6118203309692671,
+ "grad_norm": 3.52469539642334,
+ "learning_rate": 4.889858256438455e-06,
+ "loss": 0.7301,
+ "step": 1294
+ },
+ {
+ "epoch": 0.6122931442080378,
+ "grad_norm": 2.9145348072052,
+ "learning_rate": 4.889675056778622e-06,
+ "loss": 0.6494,
+ "step": 1295
+ },
+ {
+ "epoch": 0.6127659574468085,
+ "grad_norm": 2.831829071044922,
+ "learning_rate": 4.8894917083242e-06,
+ "loss": 0.6064,
+ "step": 1296
+ },
+ {
+ "epoch": 0.6132387706855792,
+ "grad_norm": 2.6883130073547363,
+ "learning_rate": 4.889308211086608e-06,
+ "loss": 0.5642,
+ "step": 1297
+ },
+ {
+ "epoch": 0.6137115839243499,
+ "grad_norm": 3.0605485439300537,
+ "learning_rate": 4.889124565077269e-06,
+ "loss": 0.6695,
+ "step": 1298
+ },
+ {
+ "epoch": 0.6141843971631206,
+ "grad_norm": 3.44062876701355,
+ "learning_rate": 4.88894077030762e-06,
+ "loss": 0.6415,
+ "step": 1299
+ },
+ {
+ "epoch": 0.6146572104018913,
+ "grad_norm": 2.5970818996429443,
+ "learning_rate": 4.888756826789105e-06,
+ "loss": 0.6518,
+ "step": 1300
+ },
+ {
+ "epoch": 0.6151300236406619,
+ "grad_norm": 4.2233567237854,
+ "learning_rate": 4.8885727345331755e-06,
+ "loss": 0.6555,
+ "step": 1301
+ },
+ {
+ "epoch": 0.6156028368794326,
+ "grad_norm": 2.645385503768921,
+ "learning_rate": 4.888388493551297e-06,
+ "loss": 0.6762,
+ "step": 1302
+ },
+ {
+ "epoch": 0.6160756501182033,
+ "grad_norm": 2.907954454421997,
+ "learning_rate": 4.8882041038549385e-06,
+ "loss": 0.6526,
+ "step": 1303
+ },
+ {
+ "epoch": 0.616548463356974,
+ "grad_norm": 2.482771873474121,
+ "learning_rate": 4.888019565455583e-06,
+ "loss": 0.628,
+ "step": 1304
+ },
+ {
+ "epoch": 0.6170212765957447,
+ "grad_norm": 2.7165915966033936,
+ "learning_rate": 4.88783487836472e-06,
+ "loss": 0.5743,
+ "step": 1305
+ },
+ {
+ "epoch": 0.6174940898345154,
+ "grad_norm": 3.095627546310425,
+ "learning_rate": 4.88765004259385e-06,
+ "loss": 0.627,
+ "step": 1306
+ },
+ {
+ "epoch": 0.6179669030732861,
+ "grad_norm": 2.5018465518951416,
+ "learning_rate": 4.8874650581544805e-06,
+ "loss": 0.5215,
+ "step": 1307
+ },
+ {
+ "epoch": 0.6184397163120567,
+ "grad_norm": 3.094337224960327,
+ "learning_rate": 4.8872799250581316e-06,
+ "loss": 0.6979,
+ "step": 1308
+ },
+ {
+ "epoch": 0.6189125295508274,
+ "grad_norm": 3.1002209186553955,
+ "learning_rate": 4.887094643316329e-06,
+ "loss": 0.6565,
+ "step": 1309
+ },
+ {
+ "epoch": 0.6193853427895981,
+ "grad_norm": 2.551431894302368,
+ "learning_rate": 4.88690921294061e-06,
+ "loss": 0.5748,
+ "step": 1310
+ },
+ {
+ "epoch": 0.6198581560283688,
+ "grad_norm": 2.8282904624938965,
+ "learning_rate": 4.886723633942521e-06,
+ "loss": 0.676,
+ "step": 1311
+ },
+ {
+ "epoch": 0.6203309692671395,
+ "grad_norm": 2.8887810707092285,
+ "learning_rate": 4.886537906333617e-06,
+ "loss": 0.5971,
+ "step": 1312
+ },
+ {
+ "epoch": 0.6208037825059102,
+ "grad_norm": 2.9989118576049805,
+ "learning_rate": 4.886352030125462e-06,
+ "loss": 0.6341,
+ "step": 1313
+ },
+ {
+ "epoch": 0.6212765957446809,
+ "grad_norm": 2.8042776584625244,
+ "learning_rate": 4.886166005329629e-06,
+ "loss": 0.6578,
+ "step": 1314
+ },
+ {
+ "epoch": 0.6217494089834515,
+ "grad_norm": 2.4980967044830322,
+ "learning_rate": 4.8859798319577026e-06,
+ "loss": 0.6711,
+ "step": 1315
+ },
+ {
+ "epoch": 0.6222222222222222,
+ "grad_norm": 2.762369155883789,
+ "learning_rate": 4.885793510021274e-06,
+ "loss": 0.5747,
+ "step": 1316
+ },
+ {
+ "epoch": 0.6226950354609929,
+ "grad_norm": 3.136327028274536,
+ "learning_rate": 4.885607039531945e-06,
+ "loss": 0.7544,
+ "step": 1317
+ },
+ {
+ "epoch": 0.6231678486997636,
+ "grad_norm": 2.8736963272094727,
+ "learning_rate": 4.885420420501327e-06,
+ "loss": 0.6603,
+ "step": 1318
+ },
+ {
+ "epoch": 0.6236406619385343,
+ "grad_norm": 2.766237497329712,
+ "learning_rate": 4.885233652941039e-06,
+ "loss": 0.581,
+ "step": 1319
+ },
+ {
+ "epoch": 0.624113475177305,
+ "grad_norm": 2.4740939140319824,
+ "learning_rate": 4.88504673686271e-06,
+ "loss": 0.6335,
+ "step": 1320
+ },
+ {
+ "epoch": 0.6245862884160757,
+ "grad_norm": 3.324795961380005,
+ "learning_rate": 4.884859672277978e-06,
+ "loss": 0.6019,
+ "step": 1321
+ },
+ {
+ "epoch": 0.6250591016548463,
+ "grad_norm": 3.521327257156372,
+ "learning_rate": 4.884672459198493e-06,
+ "loss": 0.6104,
+ "step": 1322
+ },
+ {
+ "epoch": 0.625531914893617,
+ "grad_norm": 2.7728071212768555,
+ "learning_rate": 4.884485097635909e-06,
+ "loss": 0.6714,
+ "step": 1323
+ },
+ {
+ "epoch": 0.6260047281323877,
+ "grad_norm": 3.0738155841827393,
+ "learning_rate": 4.884297587601895e-06,
+ "loss": 0.604,
+ "step": 1324
+ },
+ {
+ "epoch": 0.6264775413711584,
+ "grad_norm": 2.719240427017212,
+ "learning_rate": 4.884109929108124e-06,
+ "loss": 0.6795,
+ "step": 1325
+ },
+ {
+ "epoch": 0.6269503546099291,
+ "grad_norm": 2.4108200073242188,
+ "learning_rate": 4.883922122166282e-06,
+ "loss": 0.5846,
+ "step": 1326
+ },
+ {
+ "epoch": 0.6274231678486998,
+ "grad_norm": 2.393899917602539,
+ "learning_rate": 4.883734166788063e-06,
+ "loss": 0.6188,
+ "step": 1327
+ },
+ {
+ "epoch": 0.6278959810874705,
+ "grad_norm": 4.555255889892578,
+ "learning_rate": 4.883546062985169e-06,
+ "loss": 0.5962,
+ "step": 1328
+ },
+ {
+ "epoch": 0.6283687943262412,
+ "grad_norm": 2.571075439453125,
+ "learning_rate": 4.883357810769315e-06,
+ "loss": 0.6165,
+ "step": 1329
+ },
+ {
+ "epoch": 0.6288416075650118,
+ "grad_norm": 2.553115129470825,
+ "learning_rate": 4.8831694101522185e-06,
+ "loss": 0.6787,
+ "step": 1330
+ },
+ {
+ "epoch": 0.6293144208037825,
+ "grad_norm": 3.2564642429351807,
+ "learning_rate": 4.882980861145614e-06,
+ "loss": 0.659,
+ "step": 1331
+ },
+ {
+ "epoch": 0.6297872340425532,
+ "grad_norm": 2.535216808319092,
+ "learning_rate": 4.882792163761241e-06,
+ "loss": 0.6176,
+ "step": 1332
+ },
+ {
+ "epoch": 0.6302600472813239,
+ "grad_norm": 3.097921848297119,
+ "learning_rate": 4.882603318010847e-06,
+ "loss": 0.6822,
+ "step": 1333
+ },
+ {
+ "epoch": 0.6307328605200946,
+ "grad_norm": 2.8135175704956055,
+ "learning_rate": 4.882414323906192e-06,
+ "loss": 0.6782,
+ "step": 1334
+ },
+ {
+ "epoch": 0.6312056737588653,
+ "grad_norm": 2.724634885787964,
+ "learning_rate": 4.882225181459044e-06,
+ "loss": 0.6545,
+ "step": 1335
+ },
+ {
+ "epoch": 0.631678486997636,
+ "grad_norm": 2.9585227966308594,
+ "learning_rate": 4.882035890681179e-06,
+ "loss": 0.6218,
+ "step": 1336
+ },
+ {
+ "epoch": 0.6321513002364066,
+ "grad_norm": 2.6952011585235596,
+ "learning_rate": 4.881846451584385e-06,
+ "loss": 0.6,
+ "step": 1337
+ },
+ {
+ "epoch": 0.6326241134751773,
+ "grad_norm": 3.1400704383850098,
+ "learning_rate": 4.881656864180455e-06,
+ "loss": 0.6687,
+ "step": 1338
+ },
+ {
+ "epoch": 0.633096926713948,
+ "grad_norm": 2.8382487297058105,
+ "learning_rate": 4.881467128481197e-06,
+ "loss": 0.574,
+ "step": 1339
+ },
+ {
+ "epoch": 0.6335697399527187,
+ "grad_norm": 2.8520095348358154,
+ "learning_rate": 4.881277244498422e-06,
+ "loss": 0.6582,
+ "step": 1340
+ },
+ {
+ "epoch": 0.6340425531914894,
+ "grad_norm": 2.703498363494873,
+ "learning_rate": 4.881087212243956e-06,
+ "loss": 0.7224,
+ "step": 1341
+ },
+ {
+ "epoch": 0.6345153664302601,
+ "grad_norm": 3.697205066680908,
+ "learning_rate": 4.880897031729629e-06,
+ "loss": 0.6582,
+ "step": 1342
+ },
+ {
+ "epoch": 0.6349881796690308,
+ "grad_norm": 2.7625808715820312,
+ "learning_rate": 4.880706702967284e-06,
+ "loss": 0.574,
+ "step": 1343
+ },
+ {
+ "epoch": 0.6354609929078014,
+ "grad_norm": 2.949984073638916,
+ "learning_rate": 4.880516225968771e-06,
+ "loss": 0.66,
+ "step": 1344
+ },
+ {
+ "epoch": 0.6359338061465721,
+ "grad_norm": 2.548269748687744,
+ "learning_rate": 4.8803256007459525e-06,
+ "loss": 0.642,
+ "step": 1345
+ },
+ {
+ "epoch": 0.6364066193853428,
+ "grad_norm": 2.5102174282073975,
+ "learning_rate": 4.8801348273106945e-06,
+ "loss": 0.6238,
+ "step": 1346
+ },
+ {
+ "epoch": 0.6368794326241135,
+ "grad_norm": 2.9847946166992188,
+ "learning_rate": 4.8799439056748786e-06,
+ "loss": 0.5416,
+ "step": 1347
+ },
+ {
+ "epoch": 0.6373522458628842,
+ "grad_norm": 2.8711049556732178,
+ "learning_rate": 4.879752835850391e-06,
+ "loss": 0.6427,
+ "step": 1348
+ },
+ {
+ "epoch": 0.6378250591016549,
+ "grad_norm": 2.7901716232299805,
+ "learning_rate": 4.879561617849129e-06,
+ "loss": 0.6026,
+ "step": 1349
+ },
+ {
+ "epoch": 0.6382978723404256,
+ "grad_norm": 2.659778356552124,
+ "learning_rate": 4.879370251682999e-06,
+ "loss": 0.6623,
+ "step": 1350
+ },
+ {
+ "epoch": 0.6387706855791963,
+ "grad_norm": 3.224386692047119,
+ "learning_rate": 4.879178737363917e-06,
+ "loss": 0.6485,
+ "step": 1351
+ },
+ {
+ "epoch": 0.6392434988179669,
+ "grad_norm": 2.6385605335235596,
+ "learning_rate": 4.8789870749038076e-06,
+ "loss": 0.5866,
+ "step": 1352
+ },
+ {
+ "epoch": 0.6397163120567376,
+ "grad_norm": 2.807713270187378,
+ "learning_rate": 4.8787952643146045e-06,
+ "loss": 0.6537,
+ "step": 1353
+ },
+ {
+ "epoch": 0.6401891252955083,
+ "grad_norm": 2.5689280033111572,
+ "learning_rate": 4.878603305608251e-06,
+ "loss": 0.6216,
+ "step": 1354
+ },
+ {
+ "epoch": 0.640661938534279,
+ "grad_norm": 2.7347843647003174,
+ "learning_rate": 4.8784111987967e-06,
+ "loss": 0.6318,
+ "step": 1355
+ },
+ {
+ "epoch": 0.6411347517730497,
+ "grad_norm": 2.5210378170013428,
+ "learning_rate": 4.878218943891911e-06,
+ "loss": 0.5472,
+ "step": 1356
+ },
+ {
+ "epoch": 0.6416075650118204,
+ "grad_norm": 2.866785764694214,
+ "learning_rate": 4.878026540905858e-06,
+ "loss": 0.7108,
+ "step": 1357
+ },
+ {
+ "epoch": 0.642080378250591,
+ "grad_norm": 2.923314332962036,
+ "learning_rate": 4.877833989850519e-06,
+ "loss": 0.5557,
+ "step": 1358
+ },
+ {
+ "epoch": 0.6425531914893617,
+ "grad_norm": 2.925463914871216,
+ "learning_rate": 4.8776412907378845e-06,
+ "loss": 0.6382,
+ "step": 1359
+ },
+ {
+ "epoch": 0.6430260047281324,
+ "grad_norm": 2.909644365310669,
+ "learning_rate": 4.877448443579952e-06,
+ "loss": 0.5603,
+ "step": 1360
+ },
+ {
+ "epoch": 0.6434988179669031,
+ "grad_norm": 3.501148223876953,
+ "learning_rate": 4.8772554483887306e-06,
+ "loss": 0.6722,
+ "step": 1361
+ },
+ {
+ "epoch": 0.6439716312056738,
+ "grad_norm": 2.823765516281128,
+ "learning_rate": 4.877062305176235e-06,
+ "loss": 0.6408,
+ "step": 1362
+ },
+ {
+ "epoch": 0.6444444444444445,
+ "grad_norm": 2.9807584285736084,
+ "learning_rate": 4.8768690139544935e-06,
+ "loss": 0.5984,
+ "step": 1363
+ },
+ {
+ "epoch": 0.6449172576832152,
+ "grad_norm": 2.8411378860473633,
+ "learning_rate": 4.8766755747355405e-06,
+ "loss": 0.6231,
+ "step": 1364
+ },
+ {
+ "epoch": 0.6453900709219859,
+ "grad_norm": 3.158952236175537,
+ "learning_rate": 4.8764819875314215e-06,
+ "loss": 0.6441,
+ "step": 1365
+ },
+ {
+ "epoch": 0.6458628841607565,
+ "grad_norm": 2.9614369869232178,
+ "learning_rate": 4.876288252354189e-06,
+ "loss": 0.6308,
+ "step": 1366
+ },
+ {
+ "epoch": 0.6463356973995272,
+ "grad_norm": 3.073805570602417,
+ "learning_rate": 4.876094369215907e-06,
+ "loss": 0.6046,
+ "step": 1367
+ },
+ {
+ "epoch": 0.6468085106382979,
+ "grad_norm": 2.719189405441284,
+ "learning_rate": 4.875900338128648e-06,
+ "loss": 0.6082,
+ "step": 1368
+ },
+ {
+ "epoch": 0.6472813238770686,
+ "grad_norm": 2.676726818084717,
+ "learning_rate": 4.8757061591044914e-06,
+ "loss": 0.6344,
+ "step": 1369
+ },
+ {
+ "epoch": 0.6477541371158393,
+ "grad_norm": 2.955256938934326,
+ "learning_rate": 4.87551183215553e-06,
+ "loss": 0.6506,
+ "step": 1370
+ },
+ {
+ "epoch": 0.64822695035461,
+ "grad_norm": 2.5672218799591064,
+ "learning_rate": 4.875317357293864e-06,
+ "loss": 0.5284,
+ "step": 1371
+ },
+ {
+ "epoch": 0.6486997635933807,
+ "grad_norm": 2.5860238075256348,
+ "learning_rate": 4.875122734531602e-06,
+ "loss": 0.667,
+ "step": 1372
+ },
+ {
+ "epoch": 0.6491725768321513,
+ "grad_norm": 3.1037003993988037,
+ "learning_rate": 4.8749279638808605e-06,
+ "loss": 0.6902,
+ "step": 1373
+ },
+ {
+ "epoch": 0.649645390070922,
+ "grad_norm": 2.7715282440185547,
+ "learning_rate": 4.874733045353769e-06,
+ "loss": 0.6291,
+ "step": 1374
+ },
+ {
+ "epoch": 0.6501182033096927,
+ "grad_norm": 2.527071475982666,
+ "learning_rate": 4.874537978962463e-06,
+ "loss": 0.5565,
+ "step": 1375
+ },
+ {
+ "epoch": 0.6505910165484634,
+ "grad_norm": 2.722092628479004,
+ "learning_rate": 4.874342764719091e-06,
+ "loss": 0.5724,
+ "step": 1376
+ },
+ {
+ "epoch": 0.6510638297872341,
+ "grad_norm": 2.6342411041259766,
+ "learning_rate": 4.874147402635805e-06,
+ "loss": 0.6308,
+ "step": 1377
+ },
+ {
+ "epoch": 0.6515366430260048,
+ "grad_norm": 2.3850719928741455,
+ "learning_rate": 4.8739518927247695e-06,
+ "loss": 0.5692,
+ "step": 1378
+ },
+ {
+ "epoch": 0.6520094562647755,
+ "grad_norm": 2.9787259101867676,
+ "learning_rate": 4.873756234998161e-06,
+ "loss": 0.6953,
+ "step": 1379
+ },
+ {
+ "epoch": 0.6524822695035462,
+ "grad_norm": 2.634141683578491,
+ "learning_rate": 4.873560429468159e-06,
+ "loss": 0.6077,
+ "step": 1380
+ },
+ {
+ "epoch": 0.6529550827423168,
+ "grad_norm": 2.803046941757202,
+ "learning_rate": 4.873364476146958e-06,
+ "loss": 0.6657,
+ "step": 1381
+ },
+ {
+ "epoch": 0.6534278959810875,
+ "grad_norm": 2.762827157974243,
+ "learning_rate": 4.8731683750467574e-06,
+ "loss": 0.6061,
+ "step": 1382
+ },
+ {
+ "epoch": 0.6539007092198581,
+ "grad_norm": 2.6654391288757324,
+ "learning_rate": 4.872972126179768e-06,
+ "loss": 0.6387,
+ "step": 1383
+ },
+ {
+ "epoch": 0.6543735224586288,
+ "grad_norm": 2.4363625049591064,
+ "learning_rate": 4.872775729558209e-06,
+ "loss": 0.5623,
+ "step": 1384
+ },
+ {
+ "epoch": 0.6548463356973995,
+ "grad_norm": 2.528959035873413,
+ "learning_rate": 4.87257918519431e-06,
+ "loss": 0.5609,
+ "step": 1385
+ },
+ {
+ "epoch": 0.6553191489361702,
+ "grad_norm": 2.718383312225342,
+ "learning_rate": 4.872382493100309e-06,
+ "loss": 0.5575,
+ "step": 1386
+ },
+ {
+ "epoch": 0.6557919621749408,
+ "grad_norm": 2.660841226577759,
+ "learning_rate": 4.872185653288453e-06,
+ "loss": 0.6106,
+ "step": 1387
+ },
+ {
+ "epoch": 0.6562647754137115,
+ "grad_norm": 2.508753538131714,
+ "learning_rate": 4.871988665770997e-06,
+ "loss": 0.5705,
+ "step": 1388
+ },
+ {
+ "epoch": 0.6567375886524822,
+ "grad_norm": 2.5134334564208984,
+ "learning_rate": 4.871791530560208e-06,
+ "loss": 0.5592,
+ "step": 1389
+ },
+ {
+ "epoch": 0.6572104018912529,
+ "grad_norm": 2.7475597858428955,
+ "learning_rate": 4.871594247668361e-06,
+ "loss": 0.6277,
+ "step": 1390
+ },
+ {
+ "epoch": 0.6576832151300236,
+ "grad_norm": 2.793616533279419,
+ "learning_rate": 4.871396817107739e-06,
+ "loss": 0.595,
+ "step": 1391
+ },
+ {
+ "epoch": 0.6581560283687943,
+ "grad_norm": 2.8285086154937744,
+ "learning_rate": 4.871199238890635e-06,
+ "loss": 0.6094,
+ "step": 1392
+ },
+ {
+ "epoch": 0.658628841607565,
+ "grad_norm": 2.74124813079834,
+ "learning_rate": 4.871001513029352e-06,
+ "loss": 0.6296,
+ "step": 1393
+ },
+ {
+ "epoch": 0.6591016548463356,
+ "grad_norm": 2.761237621307373,
+ "learning_rate": 4.870803639536202e-06,
+ "loss": 0.5702,
+ "step": 1394
+ },
+ {
+ "epoch": 0.6595744680851063,
+ "grad_norm": 2.761038064956665,
+ "learning_rate": 4.870605618423504e-06,
+ "loss": 0.6195,
+ "step": 1395
+ },
+ {
+ "epoch": 0.660047281323877,
+ "grad_norm": 2.8812482357025146,
+ "learning_rate": 4.870407449703589e-06,
+ "loss": 0.616,
+ "step": 1396
+ },
+ {
+ "epoch": 0.6605200945626477,
+ "grad_norm": 2.9966578483581543,
+ "learning_rate": 4.870209133388797e-06,
+ "loss": 0.6547,
+ "step": 1397
+ },
+ {
+ "epoch": 0.6609929078014184,
+ "grad_norm": 2.7969017028808594,
+ "learning_rate": 4.870010669491474e-06,
+ "loss": 0.5762,
+ "step": 1398
+ },
+ {
+ "epoch": 0.6614657210401891,
+ "grad_norm": 2.557783842086792,
+ "learning_rate": 4.86981205802398e-06,
+ "loss": 0.6184,
+ "step": 1399
+ },
+ {
+ "epoch": 0.6619385342789598,
+ "grad_norm": 2.5393927097320557,
+ "learning_rate": 4.86961329899868e-06,
+ "loss": 0.5953,
+ "step": 1400
+ },
+ {
+ "epoch": 0.6624113475177305,
+ "grad_norm": 2.7745981216430664,
+ "learning_rate": 4.86941439242795e-06,
+ "loss": 0.5967,
+ "step": 1401
+ },
+ {
+ "epoch": 0.6628841607565011,
+ "grad_norm": 2.650381326675415,
+ "learning_rate": 4.869215338324176e-06,
+ "loss": 0.5667,
+ "step": 1402
+ },
+ {
+ "epoch": 0.6633569739952718,
+ "grad_norm": 2.583169937133789,
+ "learning_rate": 4.869016136699751e-06,
+ "loss": 0.549,
+ "step": 1403
+ },
+ {
+ "epoch": 0.6638297872340425,
+ "grad_norm": 2.984978437423706,
+ "learning_rate": 4.868816787567079e-06,
+ "loss": 0.5931,
+ "step": 1404
+ },
+ {
+ "epoch": 0.6643026004728132,
+ "grad_norm": 3.1947181224823,
+ "learning_rate": 4.868617290938573e-06,
+ "loss": 0.5473,
+ "step": 1405
+ },
+ {
+ "epoch": 0.6647754137115839,
+ "grad_norm": 2.562927007675171,
+ "learning_rate": 4.868417646826654e-06,
+ "loss": 0.6878,
+ "step": 1406
+ },
+ {
+ "epoch": 0.6652482269503546,
+ "grad_norm": 2.8741261959075928,
+ "learning_rate": 4.868217855243754e-06,
+ "loss": 0.6312,
+ "step": 1407
+ },
+ {
+ "epoch": 0.6657210401891253,
+ "grad_norm": 2.9834797382354736,
+ "learning_rate": 4.868017916202312e-06,
+ "loss": 0.5624,
+ "step": 1408
+ },
+ {
+ "epoch": 0.6661938534278959,
+ "grad_norm": 2.6935982704162598,
+ "learning_rate": 4.8678178297147785e-06,
+ "loss": 0.5857,
+ "step": 1409
+ },
+ {
+ "epoch": 0.6666666666666666,
+ "grad_norm": 2.8200576305389404,
+ "learning_rate": 4.86761759579361e-06,
+ "loss": 0.6153,
+ "step": 1410
+ },
+ {
+ "epoch": 0.6671394799054373,
+ "grad_norm": 2.831425189971924,
+ "learning_rate": 4.867417214451276e-06,
+ "loss": 0.6495,
+ "step": 1411
+ },
+ {
+ "epoch": 0.667612293144208,
+ "grad_norm": 2.733565092086792,
+ "learning_rate": 4.867216685700253e-06,
+ "loss": 0.6036,
+ "step": 1412
+ },
+ {
+ "epoch": 0.6680851063829787,
+ "grad_norm": 3.0609400272369385,
+ "learning_rate": 4.867016009553027e-06,
+ "loss": 0.6773,
+ "step": 1413
+ },
+ {
+ "epoch": 0.6685579196217494,
+ "grad_norm": 2.665452241897583,
+ "learning_rate": 4.866815186022093e-06,
+ "loss": 0.6256,
+ "step": 1414
+ },
+ {
+ "epoch": 0.6690307328605201,
+ "grad_norm": 2.9480721950531006,
+ "learning_rate": 4.866614215119956e-06,
+ "loss": 0.535,
+ "step": 1415
+ },
+ {
+ "epoch": 0.6695035460992907,
+ "grad_norm": 2.5514180660247803,
+ "learning_rate": 4.866413096859128e-06,
+ "loss": 0.6588,
+ "step": 1416
+ },
+ {
+ "epoch": 0.6699763593380614,
+ "grad_norm": 3.3442373275756836,
+ "learning_rate": 4.866211831252134e-06,
+ "loss": 0.5754,
+ "step": 1417
+ },
+ {
+ "epoch": 0.6704491725768321,
+ "grad_norm": 2.521467685699463,
+ "learning_rate": 4.866010418311504e-06,
+ "loss": 0.5546,
+ "step": 1418
+ },
+ {
+ "epoch": 0.6709219858156028,
+ "grad_norm": 2.930706262588501,
+ "learning_rate": 4.865808858049781e-06,
+ "loss": 0.589,
+ "step": 1419
+ },
+ {
+ "epoch": 0.6713947990543735,
+ "grad_norm": 2.6298375129699707,
+ "learning_rate": 4.865607150479513e-06,
+ "loss": 0.5915,
+ "step": 1420
+ },
+ {
+ "epoch": 0.6718676122931442,
+ "grad_norm": 2.9554293155670166,
+ "learning_rate": 4.8654052956132615e-06,
+ "loss": 0.6654,
+ "step": 1421
+ },
+ {
+ "epoch": 0.6723404255319149,
+ "grad_norm": 3.2706902027130127,
+ "learning_rate": 4.865203293463593e-06,
+ "loss": 0.7115,
+ "step": 1422
+ },
+ {
+ "epoch": 0.6728132387706856,
+ "grad_norm": 3.041539430618286,
+ "learning_rate": 4.865001144043088e-06,
+ "loss": 0.5818,
+ "step": 1423
+ },
+ {
+ "epoch": 0.6732860520094562,
+ "grad_norm": 3.1314544677734375,
+ "learning_rate": 4.864798847364331e-06,
+ "loss": 0.5822,
+ "step": 1424
+ },
+ {
+ "epoch": 0.6737588652482269,
+ "grad_norm": 2.5301461219787598,
+ "learning_rate": 4.86459640343992e-06,
+ "loss": 0.5525,
+ "step": 1425
+ },
+ {
+ "epoch": 0.6742316784869976,
+ "grad_norm": 2.809295892715454,
+ "learning_rate": 4.864393812282458e-06,
+ "loss": 0.6768,
+ "step": 1426
+ },
+ {
+ "epoch": 0.6747044917257683,
+ "grad_norm": 2.794664144515991,
+ "learning_rate": 4.864191073904562e-06,
+ "loss": 0.5793,
+ "step": 1427
+ },
+ {
+ "epoch": 0.675177304964539,
+ "grad_norm": 2.7771105766296387,
+ "learning_rate": 4.863988188318854e-06,
+ "loss": 0.6453,
+ "step": 1428
+ },
+ {
+ "epoch": 0.6756501182033097,
+ "grad_norm": 2.6431946754455566,
+ "learning_rate": 4.863785155537967e-06,
+ "loss": 0.5877,
+ "step": 1429
+ },
+ {
+ "epoch": 0.6761229314420804,
+ "grad_norm": 2.951353073120117,
+ "learning_rate": 4.863581975574544e-06,
+ "loss": 0.6793,
+ "step": 1430
+ },
+ {
+ "epoch": 0.676595744680851,
+ "grad_norm": 3.1336071491241455,
+ "learning_rate": 4.863378648441235e-06,
+ "loss": 0.6695,
+ "step": 1431
+ },
+ {
+ "epoch": 0.6770685579196217,
+ "grad_norm": 2.735982656478882,
+ "learning_rate": 4.8631751741507e-06,
+ "loss": 0.5239,
+ "step": 1432
+ },
+ {
+ "epoch": 0.6775413711583924,
+ "grad_norm": 2.7085206508636475,
+ "learning_rate": 4.862971552715611e-06,
+ "loss": 0.6837,
+ "step": 1433
+ },
+ {
+ "epoch": 0.6780141843971631,
+ "grad_norm": 3.136528730392456,
+ "learning_rate": 4.8627677841486436e-06,
+ "loss": 0.683,
+ "step": 1434
+ },
+ {
+ "epoch": 0.6784869976359338,
+ "grad_norm": 2.7879369258880615,
+ "learning_rate": 4.862563868462486e-06,
+ "loss": 0.608,
+ "step": 1435
+ },
+ {
+ "epoch": 0.6789598108747045,
+ "grad_norm": 2.7937729358673096,
+ "learning_rate": 4.862359805669837e-06,
+ "loss": 0.6131,
+ "step": 1436
+ },
+ {
+ "epoch": 0.6794326241134752,
+ "grad_norm": 2.5988364219665527,
+ "learning_rate": 4.862155595783401e-06,
+ "loss": 0.6303,
+ "step": 1437
+ },
+ {
+ "epoch": 0.6799054373522458,
+ "grad_norm": 3.251070499420166,
+ "learning_rate": 4.861951238815894e-06,
+ "loss": 0.7246,
+ "step": 1438
+ },
+ {
+ "epoch": 0.6803782505910165,
+ "grad_norm": 2.646759271621704,
+ "learning_rate": 4.861746734780039e-06,
+ "loss": 0.6313,
+ "step": 1439
+ },
+ {
+ "epoch": 0.6808510638297872,
+ "grad_norm": 2.773866891860962,
+ "learning_rate": 4.861542083688573e-06,
+ "loss": 0.6463,
+ "step": 1440
+ },
+ {
+ "epoch": 0.6813238770685579,
+ "grad_norm": 2.759965658187866,
+ "learning_rate": 4.861337285554235e-06,
+ "loss": 0.5428,
+ "step": 1441
+ },
+ {
+ "epoch": 0.6817966903073286,
+ "grad_norm": 3.3250818252563477,
+ "learning_rate": 4.861132340389779e-06,
+ "loss": 0.6522,
+ "step": 1442
+ },
+ {
+ "epoch": 0.6822695035460993,
+ "grad_norm": 2.661797523498535,
+ "learning_rate": 4.860927248207965e-06,
+ "loss": 0.5871,
+ "step": 1443
+ },
+ {
+ "epoch": 0.68274231678487,
+ "grad_norm": 2.706289052963257,
+ "learning_rate": 4.860722009021563e-06,
+ "loss": 0.6651,
+ "step": 1444
+ },
+ {
+ "epoch": 0.6832151300236406,
+ "grad_norm": 2.8459298610687256,
+ "learning_rate": 4.860516622843354e-06,
+ "loss": 0.5827,
+ "step": 1445
+ },
+ {
+ "epoch": 0.6836879432624113,
+ "grad_norm": 3.1041831970214844,
+ "learning_rate": 4.860311089686125e-06,
+ "loss": 0.6727,
+ "step": 1446
+ },
+ {
+ "epoch": 0.684160756501182,
+ "grad_norm": 2.9382801055908203,
+ "learning_rate": 4.8601054095626746e-06,
+ "loss": 0.6002,
+ "step": 1447
+ },
+ {
+ "epoch": 0.6846335697399527,
+ "grad_norm": 2.782475471496582,
+ "learning_rate": 4.859899582485808e-06,
+ "loss": 0.6951,
+ "step": 1448
+ },
+ {
+ "epoch": 0.6851063829787234,
+ "grad_norm": 3.313894510269165,
+ "learning_rate": 4.859693608468343e-06,
+ "loss": 0.6363,
+ "step": 1449
+ },
+ {
+ "epoch": 0.6855791962174941,
+ "grad_norm": 3.1639695167541504,
+ "learning_rate": 4.8594874875231045e-06,
+ "loss": 0.7002,
+ "step": 1450
+ },
+ {
+ "epoch": 0.6860520094562648,
+ "grad_norm": 2.6762218475341797,
+ "learning_rate": 4.859281219662926e-06,
+ "loss": 0.6246,
+ "step": 1451
+ },
+ {
+ "epoch": 0.6865248226950355,
+ "grad_norm": 2.8368663787841797,
+ "learning_rate": 4.85907480490065e-06,
+ "loss": 0.5906,
+ "step": 1452
+ },
+ {
+ "epoch": 0.6869976359338061,
+ "grad_norm": 2.887373208999634,
+ "learning_rate": 4.858868243249131e-06,
+ "loss": 0.5931,
+ "step": 1453
+ },
+ {
+ "epoch": 0.6874704491725768,
+ "grad_norm": 2.8115322589874268,
+ "learning_rate": 4.858661534721229e-06,
+ "loss": 0.6337,
+ "step": 1454
+ },
+ {
+ "epoch": 0.6879432624113475,
+ "grad_norm": 2.8470499515533447,
+ "learning_rate": 4.8584546793298174e-06,
+ "loss": 0.632,
+ "step": 1455
+ },
+ {
+ "epoch": 0.6884160756501182,
+ "grad_norm": 2.8229613304138184,
+ "learning_rate": 4.8582476770877725e-06,
+ "loss": 0.6494,
+ "step": 1456
+ },
+ {
+ "epoch": 0.6888888888888889,
+ "grad_norm": 2.4235479831695557,
+ "learning_rate": 4.858040528007987e-06,
+ "loss": 0.5709,
+ "step": 1457
+ },
+ {
+ "epoch": 0.6893617021276596,
+ "grad_norm": 2.9348199367523193,
+ "learning_rate": 4.857833232103356e-06,
+ "loss": 0.5404,
+ "step": 1458
+ },
+ {
+ "epoch": 0.6898345153664303,
+ "grad_norm": 2.8274219036102295,
+ "learning_rate": 4.857625789386789e-06,
+ "loss": 0.701,
+ "step": 1459
+ },
+ {
+ "epoch": 0.6903073286052009,
+ "grad_norm": 3.136929988861084,
+ "learning_rate": 4.857418199871203e-06,
+ "loss": 0.6971,
+ "step": 1460
+ },
+ {
+ "epoch": 0.6907801418439716,
+ "grad_norm": 2.8987185955047607,
+ "learning_rate": 4.8572104635695214e-06,
+ "loss": 0.6613,
+ "step": 1461
+ },
+ {
+ "epoch": 0.6912529550827423,
+ "grad_norm": 2.5073442459106445,
+ "learning_rate": 4.857002580494681e-06,
+ "loss": 0.6032,
+ "step": 1462
+ },
+ {
+ "epoch": 0.691725768321513,
+ "grad_norm": 2.7019522190093994,
+ "learning_rate": 4.856794550659625e-06,
+ "loss": 0.567,
+ "step": 1463
+ },
+ {
+ "epoch": 0.6921985815602837,
+ "grad_norm": 2.4795594215393066,
+ "learning_rate": 4.8565863740773054e-06,
+ "loss": 0.5777,
+ "step": 1464
+ },
+ {
+ "epoch": 0.6926713947990544,
+ "grad_norm": 3.032506227493286,
+ "learning_rate": 4.856378050760687e-06,
+ "loss": 0.607,
+ "step": 1465
+ },
+ {
+ "epoch": 0.6931442080378251,
+ "grad_norm": 3.052091121673584,
+ "learning_rate": 4.85616958072274e-06,
+ "loss": 0.591,
+ "step": 1466
+ },
+ {
+ "epoch": 0.6936170212765957,
+ "grad_norm": 2.704831838607788,
+ "learning_rate": 4.855960963976443e-06,
+ "loss": 0.6528,
+ "step": 1467
+ },
+ {
+ "epoch": 0.6940898345153664,
+ "grad_norm": 2.680995225906372,
+ "learning_rate": 4.855752200534788e-06,
+ "loss": 0.6294,
+ "step": 1468
+ },
+ {
+ "epoch": 0.6945626477541371,
+ "grad_norm": 2.3948659896850586,
+ "learning_rate": 4.855543290410774e-06,
+ "loss": 0.6091,
+ "step": 1469
+ },
+ {
+ "epoch": 0.6950354609929078,
+ "grad_norm": 2.6407411098480225,
+ "learning_rate": 4.855334233617407e-06,
+ "loss": 0.5572,
+ "step": 1470
+ },
+ {
+ "epoch": 0.6955082742316785,
+ "grad_norm": 2.5526835918426514,
+ "learning_rate": 4.8551250301677064e-06,
+ "loss": 0.5432,
+ "step": 1471
+ },
+ {
+ "epoch": 0.6959810874704492,
+ "grad_norm": 3.1237430572509766,
+ "learning_rate": 4.8549156800746965e-06,
+ "loss": 0.5944,
+ "step": 1472
+ },
+ {
+ "epoch": 0.6964539007092199,
+ "grad_norm": 2.8112540245056152,
+ "learning_rate": 4.854706183351412e-06,
+ "loss": 0.604,
+ "step": 1473
+ },
+ {
+ "epoch": 0.6969267139479906,
+ "grad_norm": 2.664644479751587,
+ "learning_rate": 4.8544965400109e-06,
+ "loss": 0.5647,
+ "step": 1474
+ },
+ {
+ "epoch": 0.6973995271867612,
+ "grad_norm": 3.26310133934021,
+ "learning_rate": 4.854286750066212e-06,
+ "loss": 0.6999,
+ "step": 1475
+ },
+ {
+ "epoch": 0.6978723404255319,
+ "grad_norm": 2.9717442989349365,
+ "learning_rate": 4.8540768135304115e-06,
+ "loss": 0.6655,
+ "step": 1476
+ },
+ {
+ "epoch": 0.6983451536643026,
+ "grad_norm": 2.5302982330322266,
+ "learning_rate": 4.85386673041657e-06,
+ "loss": 0.6384,
+ "step": 1477
+ },
+ {
+ "epoch": 0.6988179669030733,
+ "grad_norm": 2.864877700805664,
+ "learning_rate": 4.853656500737769e-06,
+ "loss": 0.6834,
+ "step": 1478
+ },
+ {
+ "epoch": 0.699290780141844,
+ "grad_norm": 2.5522031784057617,
+ "learning_rate": 4.853446124507098e-06,
+ "loss": 0.5929,
+ "step": 1479
+ },
+ {
+ "epoch": 0.6997635933806147,
+ "grad_norm": 3.096477746963501,
+ "learning_rate": 4.853235601737656e-06,
+ "loss": 0.5737,
+ "step": 1480
+ },
+ {
+ "epoch": 0.7002364066193854,
+ "grad_norm": 2.884779214859009,
+ "learning_rate": 4.853024932442552e-06,
+ "loss": 0.6362,
+ "step": 1481
+ },
+ {
+ "epoch": 0.700709219858156,
+ "grad_norm": 3.368558406829834,
+ "learning_rate": 4.852814116634903e-06,
+ "loss": 0.6721,
+ "step": 1482
+ },
+ {
+ "epoch": 0.7011820330969267,
+ "grad_norm": 2.742414951324463,
+ "learning_rate": 4.852603154327837e-06,
+ "loss": 0.6212,
+ "step": 1483
+ },
+ {
+ "epoch": 0.7016548463356974,
+ "grad_norm": 2.53454852104187,
+ "learning_rate": 4.8523920455344864e-06,
+ "loss": 0.6675,
+ "step": 1484
+ },
+ {
+ "epoch": 0.7021276595744681,
+ "grad_norm": 2.9354238510131836,
+ "learning_rate": 4.852180790267999e-06,
+ "loss": 0.6692,
+ "step": 1485
+ },
+ {
+ "epoch": 0.7026004728132388,
+ "grad_norm": 2.585070848464966,
+ "learning_rate": 4.8519693885415274e-06,
+ "loss": 0.6215,
+ "step": 1486
+ },
+ {
+ "epoch": 0.7030732860520095,
+ "grad_norm": 2.9047999382019043,
+ "learning_rate": 4.851757840368235e-06,
+ "loss": 0.6231,
+ "step": 1487
+ },
+ {
+ "epoch": 0.7035460992907802,
+ "grad_norm": 3.0930933952331543,
+ "learning_rate": 4.851546145761295e-06,
+ "loss": 0.7267,
+ "step": 1488
+ },
+ {
+ "epoch": 0.7040189125295508,
+ "grad_norm": 3.0224719047546387,
+ "learning_rate": 4.8513343047338875e-06,
+ "loss": 0.6293,
+ "step": 1489
+ },
+ {
+ "epoch": 0.7044917257683215,
+ "grad_norm": 2.5758471488952637,
+ "learning_rate": 4.851122317299203e-06,
+ "loss": 0.5855,
+ "step": 1490
+ },
+ {
+ "epoch": 0.7049645390070922,
+ "grad_norm": 2.579272508621216,
+ "learning_rate": 4.850910183470441e-06,
+ "loss": 0.582,
+ "step": 1491
+ },
+ {
+ "epoch": 0.7054373522458629,
+ "grad_norm": 2.8148300647735596,
+ "learning_rate": 4.85069790326081e-06,
+ "loss": 0.6396,
+ "step": 1492
+ },
+ {
+ "epoch": 0.7059101654846336,
+ "grad_norm": 2.6380527019500732,
+ "learning_rate": 4.850485476683528e-06,
+ "loss": 0.6114,
+ "step": 1493
+ },
+ {
+ "epoch": 0.7063829787234043,
+ "grad_norm": 2.7736263275146484,
+ "learning_rate": 4.850272903751823e-06,
+ "loss": 0.6683,
+ "step": 1494
+ },
+ {
+ "epoch": 0.706855791962175,
+ "grad_norm": 3.1958179473876953,
+ "learning_rate": 4.8500601844789285e-06,
+ "loss": 0.6265,
+ "step": 1495
+ },
+ {
+ "epoch": 0.7073286052009456,
+ "grad_norm": 3.783212423324585,
+ "learning_rate": 4.8498473188780916e-06,
+ "loss": 0.6078,
+ "step": 1496
+ },
+ {
+ "epoch": 0.7078014184397163,
+ "grad_norm": 2.6656646728515625,
+ "learning_rate": 4.849634306962566e-06,
+ "loss": 0.5756,
+ "step": 1497
+ },
+ {
+ "epoch": 0.708274231678487,
+ "grad_norm": 2.757141590118408,
+ "learning_rate": 4.849421148745615e-06,
+ "loss": 0.5596,
+ "step": 1498
+ },
+ {
+ "epoch": 0.7087470449172577,
+ "grad_norm": 3.0391886234283447,
+ "learning_rate": 4.849207844240511e-06,
+ "loss": 0.5293,
+ "step": 1499
+ },
+ {
+ "epoch": 0.7092198581560284,
+ "grad_norm": 2.981912851333618,
+ "learning_rate": 4.848994393460535e-06,
+ "loss": 0.598,
+ "step": 1500
+ },
+ {
+ "epoch": 0.7096926713947991,
+ "grad_norm": 2.5470798015594482,
+ "learning_rate": 4.848780796418978e-06,
+ "loss": 0.6266,
+ "step": 1501
+ },
+ {
+ "epoch": 0.7101654846335698,
+ "grad_norm": 2.8394415378570557,
+ "learning_rate": 4.8485670531291415e-06,
+ "loss": 0.6844,
+ "step": 1502
+ },
+ {
+ "epoch": 0.7106382978723405,
+ "grad_norm": 3.2023508548736572,
+ "learning_rate": 4.848353163604331e-06,
+ "loss": 0.6134,
+ "step": 1503
+ },
+ {
+ "epoch": 0.7111111111111111,
+ "grad_norm": 2.98245906829834,
+ "learning_rate": 4.848139127857867e-06,
+ "loss": 0.7084,
+ "step": 1504
+ },
+ {
+ "epoch": 0.7115839243498818,
+ "grad_norm": 2.5917441844940186,
+ "learning_rate": 4.847924945903076e-06,
+ "loss": 0.5676,
+ "step": 1505
+ },
+ {
+ "epoch": 0.7120567375886525,
+ "grad_norm": 2.8736681938171387,
+ "learning_rate": 4.847710617753294e-06,
+ "loss": 0.6304,
+ "step": 1506
+ },
+ {
+ "epoch": 0.7125295508274232,
+ "grad_norm": 2.7832682132720947,
+ "learning_rate": 4.847496143421866e-06,
+ "loss": 0.5705,
+ "step": 1507
+ },
+ {
+ "epoch": 0.7130023640661939,
+ "grad_norm": 2.480560779571533,
+ "learning_rate": 4.847281522922147e-06,
+ "loss": 0.5595,
+ "step": 1508
+ },
+ {
+ "epoch": 0.7134751773049646,
+ "grad_norm": 2.357675313949585,
+ "learning_rate": 4.847066756267499e-06,
+ "loss": 0.5065,
+ "step": 1509
+ },
+ {
+ "epoch": 0.7139479905437353,
+ "grad_norm": 2.632669448852539,
+ "learning_rate": 4.846851843471296e-06,
+ "loss": 0.6949,
+ "step": 1510
+ },
+ {
+ "epoch": 0.7144208037825059,
+ "grad_norm": 2.7691073417663574,
+ "learning_rate": 4.84663678454692e-06,
+ "loss": 0.6638,
+ "step": 1511
+ },
+ {
+ "epoch": 0.7148936170212766,
+ "grad_norm": 2.5647685527801514,
+ "learning_rate": 4.846421579507761e-06,
+ "loss": 0.6098,
+ "step": 1512
+ },
+ {
+ "epoch": 0.7153664302600473,
+ "grad_norm": 2.476701021194458,
+ "learning_rate": 4.846206228367218e-06,
+ "loss": 0.592,
+ "step": 1513
+ },
+ {
+ "epoch": 0.715839243498818,
+ "grad_norm": 2.805727958679199,
+ "learning_rate": 4.845990731138702e-06,
+ "loss": 0.5466,
+ "step": 1514
+ },
+ {
+ "epoch": 0.7163120567375887,
+ "grad_norm": 2.551392078399658,
+ "learning_rate": 4.84577508783563e-06,
+ "loss": 0.6039,
+ "step": 1515
+ },
+ {
+ "epoch": 0.7167848699763594,
+ "grad_norm": 2.6861350536346436,
+ "learning_rate": 4.845559298471429e-06,
+ "loss": 0.6427,
+ "step": 1516
+ },
+ {
+ "epoch": 0.7172576832151301,
+ "grad_norm": 3.1908371448516846,
+ "learning_rate": 4.845343363059535e-06,
+ "loss": 0.5447,
+ "step": 1517
+ },
+ {
+ "epoch": 0.7177304964539007,
+ "grad_norm": 2.9021761417388916,
+ "learning_rate": 4.845127281613394e-06,
+ "loss": 0.5836,
+ "step": 1518
+ },
+ {
+ "epoch": 0.7182033096926714,
+ "grad_norm": 2.476670742034912,
+ "learning_rate": 4.844911054146461e-06,
+ "loss": 0.5863,
+ "step": 1519
+ },
+ {
+ "epoch": 0.7186761229314421,
+ "grad_norm": 2.662935495376587,
+ "learning_rate": 4.844694680672198e-06,
+ "loss": 0.5678,
+ "step": 1520
+ },
+ {
+ "epoch": 0.7191489361702128,
+ "grad_norm": 2.677896738052368,
+ "learning_rate": 4.844478161204079e-06,
+ "loss": 0.6195,
+ "step": 1521
+ },
+ {
+ "epoch": 0.7196217494089835,
+ "grad_norm": 2.781921863555908,
+ "learning_rate": 4.844261495755585e-06,
+ "loss": 0.643,
+ "step": 1522
+ },
+ {
+ "epoch": 0.7200945626477542,
+ "grad_norm": 3.0157392024993896,
+ "learning_rate": 4.844044684340206e-06,
+ "loss": 0.7559,
+ "step": 1523
+ },
+ {
+ "epoch": 0.7205673758865249,
+ "grad_norm": 2.8109354972839355,
+ "learning_rate": 4.843827726971444e-06,
+ "loss": 0.6264,
+ "step": 1524
+ },
+ {
+ "epoch": 0.7210401891252955,
+ "grad_norm": 3.0953569412231445,
+ "learning_rate": 4.8436106236628064e-06,
+ "loss": 0.6429,
+ "step": 1525
+ },
+ {
+ "epoch": 0.7215130023640662,
+ "grad_norm": 2.6850643157958984,
+ "learning_rate": 4.843393374427812e-06,
+ "loss": 0.6598,
+ "step": 1526
+ },
+ {
+ "epoch": 0.7219858156028369,
+ "grad_norm": 3.043480634689331,
+ "learning_rate": 4.8431759792799874e-06,
+ "loss": 0.6331,
+ "step": 1527
+ },
+ {
+ "epoch": 0.7224586288416076,
+ "grad_norm": 2.723870038986206,
+ "learning_rate": 4.842958438232868e-06,
+ "loss": 0.6259,
+ "step": 1528
+ },
+ {
+ "epoch": 0.7229314420803783,
+ "grad_norm": 2.822492837905884,
+ "learning_rate": 4.842740751300002e-06,
+ "loss": 0.6554,
+ "step": 1529
+ },
+ {
+ "epoch": 0.723404255319149,
+ "grad_norm": 2.7866315841674805,
+ "learning_rate": 4.842522918494941e-06,
+ "loss": 0.6991,
+ "step": 1530
+ },
+ {
+ "epoch": 0.7238770685579197,
+ "grad_norm": 2.8881826400756836,
+ "learning_rate": 4.84230493983125e-06,
+ "loss": 0.5876,
+ "step": 1531
+ },
+ {
+ "epoch": 0.7243498817966904,
+ "grad_norm": 2.7456939220428467,
+ "learning_rate": 4.8420868153225e-06,
+ "loss": 0.6188,
+ "step": 1532
+ },
+ {
+ "epoch": 0.724822695035461,
+ "grad_norm": 3.0257532596588135,
+ "learning_rate": 4.841868544982274e-06,
+ "loss": 0.63,
+ "step": 1533
+ },
+ {
+ "epoch": 0.7252955082742317,
+ "grad_norm": 3.1581954956054688,
+ "learning_rate": 4.841650128824164e-06,
+ "loss": 0.7214,
+ "step": 1534
+ },
+ {
+ "epoch": 0.7257683215130024,
+ "grad_norm": 2.9174306392669678,
+ "learning_rate": 4.841431566861767e-06,
+ "loss": 0.704,
+ "step": 1535
+ },
+ {
+ "epoch": 0.7262411347517731,
+ "grad_norm": 2.5019054412841797,
+ "learning_rate": 4.8412128591086935e-06,
+ "loss": 0.6298,
+ "step": 1536
+ },
+ {
+ "epoch": 0.7267139479905438,
+ "grad_norm": 2.724285125732422,
+ "learning_rate": 4.840994005578562e-06,
+ "loss": 0.6289,
+ "step": 1537
+ },
+ {
+ "epoch": 0.7271867612293145,
+ "grad_norm": 2.5882341861724854,
+ "learning_rate": 4.840775006284998e-06,
+ "loss": 0.6355,
+ "step": 1538
+ },
+ {
+ "epoch": 0.7276595744680852,
+ "grad_norm": 3.1281991004943848,
+ "learning_rate": 4.840555861241638e-06,
+ "loss": 0.5551,
+ "step": 1539
+ },
+ {
+ "epoch": 0.7281323877068558,
+ "grad_norm": 2.6064817905426025,
+ "learning_rate": 4.840336570462127e-06,
+ "loss": 0.5543,
+ "step": 1540
+ },
+ {
+ "epoch": 0.7286052009456265,
+ "grad_norm": 2.67112398147583,
+ "learning_rate": 4.840117133960122e-06,
+ "loss": 0.6044,
+ "step": 1541
+ },
+ {
+ "epoch": 0.7290780141843972,
+ "grad_norm": 2.838022232055664,
+ "learning_rate": 4.839897551749282e-06,
+ "loss": 0.6814,
+ "step": 1542
+ },
+ {
+ "epoch": 0.7295508274231679,
+ "grad_norm": 2.8897151947021484,
+ "learning_rate": 4.839677823843283e-06,
+ "loss": 0.593,
+ "step": 1543
+ },
+ {
+ "epoch": 0.7300236406619386,
+ "grad_norm": 2.9238014221191406,
+ "learning_rate": 4.839457950255805e-06,
+ "loss": 0.5544,
+ "step": 1544
+ },
+ {
+ "epoch": 0.7304964539007093,
+ "grad_norm": 3.016876459121704,
+ "learning_rate": 4.839237931000538e-06,
+ "loss": 0.6099,
+ "step": 1545
+ },
+ {
+ "epoch": 0.7309692671394799,
+ "grad_norm": 2.9415392875671387,
+ "learning_rate": 4.839017766091182e-06,
+ "loss": 0.6413,
+ "step": 1546
+ },
+ {
+ "epoch": 0.7314420803782505,
+ "grad_norm": 2.658067226409912,
+ "learning_rate": 4.838797455541446e-06,
+ "loss": 0.6534,
+ "step": 1547
+ },
+ {
+ "epoch": 0.7319148936170212,
+ "grad_norm": 2.460358142852783,
+ "learning_rate": 4.838576999365049e-06,
+ "loss": 0.5307,
+ "step": 1548
+ },
+ {
+ "epoch": 0.7323877068557919,
+ "grad_norm": 2.5818674564361572,
+ "learning_rate": 4.838356397575716e-06,
+ "loss": 0.6265,
+ "step": 1549
+ },
+ {
+ "epoch": 0.7328605200945626,
+ "grad_norm": 3.009197473526001,
+ "learning_rate": 4.838135650187183e-06,
+ "loss": 0.6957,
+ "step": 1550
+ },
+ {
+ "epoch": 0.7333333333333333,
+ "grad_norm": 2.738543748855591,
+ "learning_rate": 4.837914757213196e-06,
+ "loss": 0.646,
+ "step": 1551
+ },
+ {
+ "epoch": 0.733806146572104,
+ "grad_norm": 2.8208494186401367,
+ "learning_rate": 4.837693718667508e-06,
+ "loss": 0.5936,
+ "step": 1552
+ },
+ {
+ "epoch": 0.7342789598108747,
+ "grad_norm": 3.1574649810791016,
+ "learning_rate": 4.837472534563883e-06,
+ "loss": 0.6455,
+ "step": 1553
+ },
+ {
+ "epoch": 0.7347517730496453,
+ "grad_norm": 2.6737420558929443,
+ "learning_rate": 4.837251204916093e-06,
+ "loss": 0.5921,
+ "step": 1554
+ },
+ {
+ "epoch": 0.735224586288416,
+ "grad_norm": 2.424983024597168,
+ "learning_rate": 4.837029729737918e-06,
+ "loss": 0.6346,
+ "step": 1555
+ },
+ {
+ "epoch": 0.7356973995271867,
+ "grad_norm": 2.5163493156433105,
+ "learning_rate": 4.836808109043151e-06,
+ "loss": 0.6061,
+ "step": 1556
+ },
+ {
+ "epoch": 0.7361702127659574,
+ "grad_norm": 2.8377044200897217,
+ "learning_rate": 4.836586342845588e-06,
+ "loss": 0.611,
+ "step": 1557
+ },
+ {
+ "epoch": 0.7366430260047281,
+ "grad_norm": 2.5929181575775146,
+ "learning_rate": 4.83636443115904e-06,
+ "loss": 0.5496,
+ "step": 1558
+ },
+ {
+ "epoch": 0.7371158392434988,
+ "grad_norm": 2.5017223358154297,
+ "learning_rate": 4.836142373997323e-06,
+ "loss": 0.6235,
+ "step": 1559
+ },
+ {
+ "epoch": 0.7375886524822695,
+ "grad_norm": 2.822500228881836,
+ "learning_rate": 4.835920171374265e-06,
+ "loss": 0.6147,
+ "step": 1560
+ },
+ {
+ "epoch": 0.7380614657210401,
+ "grad_norm": 2.7234230041503906,
+ "learning_rate": 4.8356978233037e-06,
+ "loss": 0.6228,
+ "step": 1561
+ },
+ {
+ "epoch": 0.7385342789598108,
+ "grad_norm": 2.9565515518188477,
+ "learning_rate": 4.835475329799472e-06,
+ "loss": 0.5728,
+ "step": 1562
+ },
+ {
+ "epoch": 0.7390070921985815,
+ "grad_norm": 2.4356038570404053,
+ "learning_rate": 4.835252690875438e-06,
+ "loss": 0.6723,
+ "step": 1563
+ },
+ {
+ "epoch": 0.7394799054373522,
+ "grad_norm": 2.765913248062134,
+ "learning_rate": 4.835029906545458e-06,
+ "loss": 0.5805,
+ "step": 1564
+ },
+ {
+ "epoch": 0.7399527186761229,
+ "grad_norm": 2.4481914043426514,
+ "learning_rate": 4.834806976823405e-06,
+ "loss": 0.599,
+ "step": 1565
+ },
+ {
+ "epoch": 0.7404255319148936,
+ "grad_norm": 2.620779514312744,
+ "learning_rate": 4.834583901723158e-06,
+ "loss": 0.63,
+ "step": 1566
+ },
+ {
+ "epoch": 0.7408983451536643,
+ "grad_norm": 2.654426097869873,
+ "learning_rate": 4.83436068125861e-06,
+ "loss": 0.6544,
+ "step": 1567
+ },
+ {
+ "epoch": 0.741371158392435,
+ "grad_norm": 2.589623212814331,
+ "learning_rate": 4.834137315443656e-06,
+ "loss": 0.5596,
+ "step": 1568
+ },
+ {
+ "epoch": 0.7418439716312056,
+ "grad_norm": 2.572883129119873,
+ "learning_rate": 4.833913804292209e-06,
+ "loss": 0.5974,
+ "step": 1569
+ },
+ {
+ "epoch": 0.7423167848699763,
+ "grad_norm": 2.8744914531707764,
+ "learning_rate": 4.833690147818181e-06,
+ "loss": 0.5364,
+ "step": 1570
+ },
+ {
+ "epoch": 0.742789598108747,
+ "grad_norm": 2.9800851345062256,
+ "learning_rate": 4.833466346035502e-06,
+ "loss": 0.6287,
+ "step": 1571
+ },
+ {
+ "epoch": 0.7432624113475177,
+ "grad_norm": 2.627784490585327,
+ "learning_rate": 4.833242398958105e-06,
+ "loss": 0.621,
+ "step": 1572
+ },
+ {
+ "epoch": 0.7437352245862884,
+ "grad_norm": 2.5187721252441406,
+ "learning_rate": 4.833018306599933e-06,
+ "loss": 0.5901,
+ "step": 1573
+ },
+ {
+ "epoch": 0.7442080378250591,
+ "grad_norm": 2.4843688011169434,
+ "learning_rate": 4.832794068974944e-06,
+ "loss": 0.6336,
+ "step": 1574
+ },
+ {
+ "epoch": 0.7446808510638298,
+ "grad_norm": 2.774911880493164,
+ "learning_rate": 4.832569686097096e-06,
+ "loss": 0.6091,
+ "step": 1575
+ },
+ {
+ "epoch": 0.7451536643026004,
+ "grad_norm": 3.2562527656555176,
+ "learning_rate": 4.8323451579803615e-06,
+ "loss": 0.7686,
+ "step": 1576
+ },
+ {
+ "epoch": 0.7456264775413711,
+ "grad_norm": 2.799570083618164,
+ "learning_rate": 4.832120484638721e-06,
+ "loss": 0.6233,
+ "step": 1577
+ },
+ {
+ "epoch": 0.7460992907801418,
+ "grad_norm": 2.661893367767334,
+ "learning_rate": 4.831895666086164e-06,
+ "loss": 0.5841,
+ "step": 1578
+ },
+ {
+ "epoch": 0.7465721040189125,
+ "grad_norm": 3.0382652282714844,
+ "learning_rate": 4.831670702336689e-06,
+ "loss": 0.5769,
+ "step": 1579
+ },
+ {
+ "epoch": 0.7470449172576832,
+ "grad_norm": 2.676398515701294,
+ "learning_rate": 4.831445593404304e-06,
+ "loss": 0.619,
+ "step": 1580
+ },
+ {
+ "epoch": 0.7475177304964539,
+ "grad_norm": 2.717916965484619,
+ "learning_rate": 4.831220339303024e-06,
+ "loss": 0.5787,
+ "step": 1581
+ },
+ {
+ "epoch": 0.7479905437352246,
+ "grad_norm": 2.3918066024780273,
+ "learning_rate": 4.830994940046876e-06,
+ "loss": 0.5108,
+ "step": 1582
+ },
+ {
+ "epoch": 0.7484633569739952,
+ "grad_norm": 2.709144115447998,
+ "learning_rate": 4.830769395649895e-06,
+ "loss": 0.6875,
+ "step": 1583
+ },
+ {
+ "epoch": 0.7489361702127659,
+ "grad_norm": 2.8711116313934326,
+ "learning_rate": 4.830543706126123e-06,
+ "loss": 0.6745,
+ "step": 1584
+ },
+ {
+ "epoch": 0.7494089834515366,
+ "grad_norm": 2.612339496612549,
+ "learning_rate": 4.830317871489614e-06,
+ "loss": 0.5738,
+ "step": 1585
+ },
+ {
+ "epoch": 0.7498817966903073,
+ "grad_norm": 2.4355857372283936,
+ "learning_rate": 4.830091891754429e-06,
+ "loss": 0.5907,
+ "step": 1586
+ },
+ {
+ "epoch": 0.750354609929078,
+ "grad_norm": 2.676051378250122,
+ "learning_rate": 4.829865766934638e-06,
+ "loss": 0.6628,
+ "step": 1587
+ },
+ {
+ "epoch": 0.7508274231678487,
+ "grad_norm": 2.66489839553833,
+ "learning_rate": 4.829639497044323e-06,
+ "loss": 0.5984,
+ "step": 1588
+ },
+ {
+ "epoch": 0.7513002364066194,
+ "grad_norm": 2.5358035564422607,
+ "learning_rate": 4.829413082097572e-06,
+ "loss": 0.5867,
+ "step": 1589
+ },
+ {
+ "epoch": 0.75177304964539,
+ "grad_norm": 2.6530144214630127,
+ "learning_rate": 4.8291865221084815e-06,
+ "loss": 0.5917,
+ "step": 1590
+ },
+ {
+ "epoch": 0.7522458628841607,
+ "grad_norm": 2.5160958766937256,
+ "learning_rate": 4.82895981709116e-06,
+ "loss": 0.6347,
+ "step": 1591
+ },
+ {
+ "epoch": 0.7527186761229314,
+ "grad_norm": 2.61592698097229,
+ "learning_rate": 4.8287329670597225e-06,
+ "loss": 0.5472,
+ "step": 1592
+ },
+ {
+ "epoch": 0.7531914893617021,
+ "grad_norm": 2.7528622150421143,
+ "learning_rate": 4.828505972028296e-06,
+ "loss": 0.5842,
+ "step": 1593
+ },
+ {
+ "epoch": 0.7536643026004728,
+ "grad_norm": 2.8154072761535645,
+ "learning_rate": 4.828278832011011e-06,
+ "loss": 0.5757,
+ "step": 1594
+ },
+ {
+ "epoch": 0.7541371158392435,
+ "grad_norm": 3.118515729904175,
+ "learning_rate": 4.828051547022013e-06,
+ "loss": 0.6472,
+ "step": 1595
+ },
+ {
+ "epoch": 0.7546099290780142,
+ "grad_norm": 2.452033758163452,
+ "learning_rate": 4.827824117075453e-06,
+ "loss": 0.5571,
+ "step": 1596
+ },
+ {
+ "epoch": 0.7550827423167848,
+ "grad_norm": 2.984388828277588,
+ "learning_rate": 4.827596542185492e-06,
+ "loss": 0.6656,
+ "step": 1597
+ },
+ {
+ "epoch": 0.7555555555555555,
+ "grad_norm": 2.61356782913208,
+ "learning_rate": 4.8273688223663014e-06,
+ "loss": 0.6444,
+ "step": 1598
+ },
+ {
+ "epoch": 0.7560283687943262,
+ "grad_norm": 2.8967196941375732,
+ "learning_rate": 4.8271409576320595e-06,
+ "loss": 0.6457,
+ "step": 1599
+ },
+ {
+ "epoch": 0.7565011820330969,
+ "grad_norm": 2.852367639541626,
+ "learning_rate": 4.826912947996954e-06,
+ "loss": 0.5629,
+ "step": 1600
+ },
+ {
+ "epoch": 0.7569739952718676,
+ "grad_norm": 2.905280590057373,
+ "learning_rate": 4.826684793475182e-06,
+ "loss": 0.6245,
+ "step": 1601
+ },
+ {
+ "epoch": 0.7574468085106383,
+ "grad_norm": 2.6156530380249023,
+ "learning_rate": 4.826456494080951e-06,
+ "loss": 0.5869,
+ "step": 1602
+ },
+ {
+ "epoch": 0.757919621749409,
+ "grad_norm": 2.6490228176116943,
+ "learning_rate": 4.826228049828475e-06,
+ "loss": 0.5461,
+ "step": 1603
+ },
+ {
+ "epoch": 0.7583924349881797,
+ "grad_norm": 2.9626693725585938,
+ "learning_rate": 4.825999460731978e-06,
+ "loss": 0.6842,
+ "step": 1604
+ },
+ {
+ "epoch": 0.7588652482269503,
+ "grad_norm": 2.6866023540496826,
+ "learning_rate": 4.825770726805695e-06,
+ "loss": 0.5726,
+ "step": 1605
+ },
+ {
+ "epoch": 0.759338061465721,
+ "grad_norm": 2.5525858402252197,
+ "learning_rate": 4.825541848063866e-06,
+ "loss": 0.6061,
+ "step": 1606
+ },
+ {
+ "epoch": 0.7598108747044917,
+ "grad_norm": 2.703977584838867,
+ "learning_rate": 4.825312824520743e-06,
+ "loss": 0.6726,
+ "step": 1607
+ },
+ {
+ "epoch": 0.7602836879432624,
+ "grad_norm": 2.856534957885742,
+ "learning_rate": 4.825083656190588e-06,
+ "loss": 0.625,
+ "step": 1608
+ },
+ {
+ "epoch": 0.7607565011820331,
+ "grad_norm": 2.8564887046813965,
+ "learning_rate": 4.824854343087668e-06,
+ "loss": 0.7251,
+ "step": 1609
+ },
+ {
+ "epoch": 0.7612293144208038,
+ "grad_norm": 2.327650308609009,
+ "learning_rate": 4.824624885226262e-06,
+ "loss": 0.526,
+ "step": 1610
+ },
+ {
+ "epoch": 0.7617021276595745,
+ "grad_norm": 3.0025737285614014,
+ "learning_rate": 4.824395282620659e-06,
+ "loss": 0.6043,
+ "step": 1611
+ },
+ {
+ "epoch": 0.7621749408983451,
+ "grad_norm": 2.5441737174987793,
+ "learning_rate": 4.824165535285152e-06,
+ "loss": 0.6276,
+ "step": 1612
+ },
+ {
+ "epoch": 0.7626477541371158,
+ "grad_norm": 2.4177372455596924,
+ "learning_rate": 4.823935643234049e-06,
+ "loss": 0.6419,
+ "step": 1613
+ },
+ {
+ "epoch": 0.7631205673758865,
+ "grad_norm": 2.9210550785064697,
+ "learning_rate": 4.823705606481664e-06,
+ "loss": 0.5663,
+ "step": 1614
+ },
+ {
+ "epoch": 0.7635933806146572,
+ "grad_norm": 2.6353724002838135,
+ "learning_rate": 4.82347542504232e-06,
+ "loss": 0.5669,
+ "step": 1615
+ },
+ {
+ "epoch": 0.7640661938534279,
+ "grad_norm": 2.419081926345825,
+ "learning_rate": 4.823245098930349e-06,
+ "loss": 0.5777,
+ "step": 1616
+ },
+ {
+ "epoch": 0.7645390070921986,
+ "grad_norm": 2.5077571868896484,
+ "learning_rate": 4.823014628160093e-06,
+ "loss": 0.5924,
+ "step": 1617
+ },
+ {
+ "epoch": 0.7650118203309693,
+ "grad_norm": 2.816056251525879,
+ "learning_rate": 4.822784012745902e-06,
+ "loss": 0.7273,
+ "step": 1618
+ },
+ {
+ "epoch": 0.76548463356974,
+ "grad_norm": 2.7163147926330566,
+ "learning_rate": 4.8225532527021366e-06,
+ "loss": 0.5545,
+ "step": 1619
+ },
+ {
+ "epoch": 0.7659574468085106,
+ "grad_norm": 2.4784302711486816,
+ "learning_rate": 4.822322348043164e-06,
+ "loss": 0.556,
+ "step": 1620
+ },
+ {
+ "epoch": 0.7664302600472813,
+ "grad_norm": 2.712467670440674,
+ "learning_rate": 4.822091298783361e-06,
+ "loss": 0.6501,
+ "step": 1621
+ },
+ {
+ "epoch": 0.766903073286052,
+ "grad_norm": 2.7217724323272705,
+ "learning_rate": 4.821860104937115e-06,
+ "loss": 0.5989,
+ "step": 1622
+ },
+ {
+ "epoch": 0.7673758865248227,
+ "grad_norm": 2.5622854232788086,
+ "learning_rate": 4.821628766518821e-06,
+ "loss": 0.5263,
+ "step": 1623
+ },
+ {
+ "epoch": 0.7678486997635934,
+ "grad_norm": 3.230923891067505,
+ "learning_rate": 4.821397283542884e-06,
+ "loss": 0.6707,
+ "step": 1624
+ },
+ {
+ "epoch": 0.7683215130023641,
+ "grad_norm": 2.37929105758667,
+ "learning_rate": 4.821165656023718e-06,
+ "loss": 0.6124,
+ "step": 1625
+ },
+ {
+ "epoch": 0.7687943262411348,
+ "grad_norm": 2.9811325073242188,
+ "learning_rate": 4.820933883975745e-06,
+ "loss": 0.6435,
+ "step": 1626
+ },
+ {
+ "epoch": 0.7692671394799054,
+ "grad_norm": 2.887380838394165,
+ "learning_rate": 4.820701967413395e-06,
+ "loss": 0.621,
+ "step": 1627
+ },
+ {
+ "epoch": 0.7697399527186761,
+ "grad_norm": 2.6762876510620117,
+ "learning_rate": 4.820469906351109e-06,
+ "loss": 0.5713,
+ "step": 1628
+ },
+ {
+ "epoch": 0.7702127659574468,
+ "grad_norm": 2.7347512245178223,
+ "learning_rate": 4.820237700803337e-06,
+ "loss": 0.6136,
+ "step": 1629
+ },
+ {
+ "epoch": 0.7706855791962175,
+ "grad_norm": 2.7244746685028076,
+ "learning_rate": 4.820005350784539e-06,
+ "loss": 0.5816,
+ "step": 1630
+ },
+ {
+ "epoch": 0.7711583924349882,
+ "grad_norm": 2.9293999671936035,
+ "learning_rate": 4.8197728563091795e-06,
+ "loss": 0.6649,
+ "step": 1631
+ },
+ {
+ "epoch": 0.7716312056737589,
+ "grad_norm": 2.4402127265930176,
+ "learning_rate": 4.819540217391736e-06,
+ "loss": 0.6481,
+ "step": 1632
+ },
+ {
+ "epoch": 0.7721040189125296,
+ "grad_norm": 3.083941698074341,
+ "learning_rate": 4.819307434046694e-06,
+ "loss": 0.6951,
+ "step": 1633
+ },
+ {
+ "epoch": 0.7725768321513002,
+ "grad_norm": 2.544952392578125,
+ "learning_rate": 4.819074506288548e-06,
+ "loss": 0.539,
+ "step": 1634
+ },
+ {
+ "epoch": 0.7730496453900709,
+ "grad_norm": 2.7791268825531006,
+ "learning_rate": 4.818841434131801e-06,
+ "loss": 0.5827,
+ "step": 1635
+ },
+ {
+ "epoch": 0.7735224586288416,
+ "grad_norm": 2.7349796295166016,
+ "learning_rate": 4.818608217590967e-06,
+ "loss": 0.5584,
+ "step": 1636
+ },
+ {
+ "epoch": 0.7739952718676123,
+ "grad_norm": 2.637652635574341,
+ "learning_rate": 4.818374856680565e-06,
+ "loss": 0.6386,
+ "step": 1637
+ },
+ {
+ "epoch": 0.774468085106383,
+ "grad_norm": 2.9821584224700928,
+ "learning_rate": 4.818141351415127e-06,
+ "loss": 0.6734,
+ "step": 1638
+ },
+ {
+ "epoch": 0.7749408983451537,
+ "grad_norm": 2.992938995361328,
+ "learning_rate": 4.817907701809192e-06,
+ "loss": 0.5899,
+ "step": 1639
+ },
+ {
+ "epoch": 0.7754137115839244,
+ "grad_norm": 4.35719633102417,
+ "learning_rate": 4.8176739078773076e-06,
+ "loss": 0.6281,
+ "step": 1640
+ },
+ {
+ "epoch": 0.775886524822695,
+ "grad_norm": 2.838146209716797,
+ "learning_rate": 4.8174399696340315e-06,
+ "loss": 0.5766,
+ "step": 1641
+ },
+ {
+ "epoch": 0.7763593380614657,
+ "grad_norm": 3.3116989135742188,
+ "learning_rate": 4.81720588709393e-06,
+ "loss": 0.6409,
+ "step": 1642
+ },
+ {
+ "epoch": 0.7768321513002364,
+ "grad_norm": 2.9843590259552,
+ "learning_rate": 4.816971660271579e-06,
+ "loss": 0.6108,
+ "step": 1643
+ },
+ {
+ "epoch": 0.7773049645390071,
+ "grad_norm": 2.843770742416382,
+ "learning_rate": 4.816737289181562e-06,
+ "loss": 0.6053,
+ "step": 1644
+ },
+ {
+ "epoch": 0.7777777777777778,
+ "grad_norm": 2.7608556747436523,
+ "learning_rate": 4.816502773838473e-06,
+ "loss": 0.5854,
+ "step": 1645
+ },
+ {
+ "epoch": 0.7782505910165485,
+ "grad_norm": 3.343682289123535,
+ "learning_rate": 4.816268114256914e-06,
+ "loss": 0.6329,
+ "step": 1646
+ },
+ {
+ "epoch": 0.7787234042553192,
+ "grad_norm": 2.769768476486206,
+ "learning_rate": 4.816033310451496e-06,
+ "loss": 0.6242,
+ "step": 1647
+ },
+ {
+ "epoch": 0.7791962174940898,
+ "grad_norm": 2.989851713180542,
+ "learning_rate": 4.815798362436838e-06,
+ "loss": 0.6493,
+ "step": 1648
+ },
+ {
+ "epoch": 0.7796690307328605,
+ "grad_norm": 3.170736312866211,
+ "learning_rate": 4.8155632702275716e-06,
+ "loss": 0.6341,
+ "step": 1649
+ },
+ {
+ "epoch": 0.7801418439716312,
+ "grad_norm": 2.7372522354125977,
+ "learning_rate": 4.815328033838334e-06,
+ "loss": 0.5445,
+ "step": 1650
+ },
+ {
+ "epoch": 0.7806146572104019,
+ "grad_norm": 2.6947238445281982,
+ "learning_rate": 4.8150926532837715e-06,
+ "loss": 0.6437,
+ "step": 1651
+ },
+ {
+ "epoch": 0.7810874704491726,
+ "grad_norm": 2.472323179244995,
+ "learning_rate": 4.81485712857854e-06,
+ "loss": 0.5751,
+ "step": 1652
+ },
+ {
+ "epoch": 0.7815602836879433,
+ "grad_norm": 2.791114091873169,
+ "learning_rate": 4.814621459737308e-06,
+ "loss": 0.5996,
+ "step": 1653
+ },
+ {
+ "epoch": 0.782033096926714,
+ "grad_norm": 3.1957521438598633,
+ "learning_rate": 4.814385646774745e-06,
+ "loss": 0.5803,
+ "step": 1654
+ },
+ {
+ "epoch": 0.7825059101654847,
+ "grad_norm": 2.4120798110961914,
+ "learning_rate": 4.8141496897055364e-06,
+ "loss": 0.5814,
+ "step": 1655
+ },
+ {
+ "epoch": 0.7829787234042553,
+ "grad_norm": 2.9262423515319824,
+ "learning_rate": 4.813913588544374e-06,
+ "loss": 0.6292,
+ "step": 1656
+ },
+ {
+ "epoch": 0.783451536643026,
+ "grad_norm": 2.8251047134399414,
+ "learning_rate": 4.813677343305959e-06,
+ "loss": 0.6787,
+ "step": 1657
+ },
+ {
+ "epoch": 0.7839243498817967,
+ "grad_norm": 2.931659698486328,
+ "learning_rate": 4.8134409540050005e-06,
+ "loss": 0.6163,
+ "step": 1658
+ },
+ {
+ "epoch": 0.7843971631205674,
+ "grad_norm": 2.7160706520080566,
+ "learning_rate": 4.813204420656219e-06,
+ "loss": 0.6831,
+ "step": 1659
+ },
+ {
+ "epoch": 0.7848699763593381,
+ "grad_norm": 3.2134454250335693,
+ "learning_rate": 4.81296774327434e-06,
+ "loss": 0.6002,
+ "step": 1660
+ },
+ {
+ "epoch": 0.7853427895981088,
+ "grad_norm": 2.4002513885498047,
+ "learning_rate": 4.812730921874103e-06,
+ "loss": 0.5488,
+ "step": 1661
+ },
+ {
+ "epoch": 0.7858156028368795,
+ "grad_norm": 2.5559282302856445,
+ "learning_rate": 4.812493956470251e-06,
+ "loss": 0.5802,
+ "step": 1662
+ },
+ {
+ "epoch": 0.7862884160756501,
+ "grad_norm": 2.57478404045105,
+ "learning_rate": 4.812256847077541e-06,
+ "loss": 0.646,
+ "step": 1663
+ },
+ {
+ "epoch": 0.7867612293144208,
+ "grad_norm": 2.811851978302002,
+ "learning_rate": 4.812019593710736e-06,
+ "loss": 0.6245,
+ "step": 1664
+ },
+ {
+ "epoch": 0.7872340425531915,
+ "grad_norm": 2.5228829383850098,
+ "learning_rate": 4.811782196384609e-06,
+ "loss": 0.5949,
+ "step": 1665
+ },
+ {
+ "epoch": 0.7877068557919622,
+ "grad_norm": 2.744096040725708,
+ "learning_rate": 4.8115446551139415e-06,
+ "loss": 0.6006,
+ "step": 1666
+ },
+ {
+ "epoch": 0.7881796690307329,
+ "grad_norm": 3.129242420196533,
+ "learning_rate": 4.811306969913524e-06,
+ "loss": 0.7251,
+ "step": 1667
+ },
+ {
+ "epoch": 0.7886524822695036,
+ "grad_norm": 2.7855660915374756,
+ "learning_rate": 4.811069140798156e-06,
+ "loss": 0.6534,
+ "step": 1668
+ },
+ {
+ "epoch": 0.7891252955082743,
+ "grad_norm": 2.836603879928589,
+ "learning_rate": 4.810831167782647e-06,
+ "loss": 0.6661,
+ "step": 1669
+ },
+ {
+ "epoch": 0.789598108747045,
+ "grad_norm": 2.5339887142181396,
+ "learning_rate": 4.810593050881813e-06,
+ "loss": 0.5354,
+ "step": 1670
+ },
+ {
+ "epoch": 0.7900709219858156,
+ "grad_norm": 2.9553709030151367,
+ "learning_rate": 4.810354790110482e-06,
+ "loss": 0.6001,
+ "step": 1671
+ },
+ {
+ "epoch": 0.7905437352245863,
+ "grad_norm": 2.6581788063049316,
+ "learning_rate": 4.8101163854834885e-06,
+ "loss": 0.6802,
+ "step": 1672
+ },
+ {
+ "epoch": 0.791016548463357,
+ "grad_norm": 3.2002551555633545,
+ "learning_rate": 4.809877837015677e-06,
+ "loss": 0.6641,
+ "step": 1673
+ },
+ {
+ "epoch": 0.7914893617021277,
+ "grad_norm": 2.918792963027954,
+ "learning_rate": 4.809639144721902e-06,
+ "loss": 0.6758,
+ "step": 1674
+ },
+ {
+ "epoch": 0.7919621749408984,
+ "grad_norm": 2.7993946075439453,
+ "learning_rate": 4.8094003086170245e-06,
+ "loss": 0.5889,
+ "step": 1675
+ },
+ {
+ "epoch": 0.7924349881796691,
+ "grad_norm": 2.3698952198028564,
+ "learning_rate": 4.809161328715916e-06,
+ "loss": 0.6244,
+ "step": 1676
+ },
+ {
+ "epoch": 0.7929078014184398,
+ "grad_norm": 2.8891594409942627,
+ "learning_rate": 4.808922205033458e-06,
+ "loss": 0.5835,
+ "step": 1677
+ },
+ {
+ "epoch": 0.7933806146572104,
+ "grad_norm": 2.838345766067505,
+ "learning_rate": 4.808682937584537e-06,
+ "loss": 0.6907,
+ "step": 1678
+ },
+ {
+ "epoch": 0.7938534278959811,
+ "grad_norm": 2.8443174362182617,
+ "learning_rate": 4.808443526384053e-06,
+ "loss": 0.6692,
+ "step": 1679
+ },
+ {
+ "epoch": 0.7943262411347518,
+ "grad_norm": 2.7355034351348877,
+ "learning_rate": 4.808203971446913e-06,
+ "loss": 0.5799,
+ "step": 1680
+ },
+ {
+ "epoch": 0.7947990543735225,
+ "grad_norm": 2.7108020782470703,
+ "learning_rate": 4.807964272788033e-06,
+ "loss": 0.652,
+ "step": 1681
+ },
+ {
+ "epoch": 0.7952718676122932,
+ "grad_norm": 2.397650957107544,
+ "learning_rate": 4.807724430422338e-06,
+ "loss": 0.5418,
+ "step": 1682
+ },
+ {
+ "epoch": 0.7957446808510639,
+ "grad_norm": 2.4981582164764404,
+ "learning_rate": 4.807484444364762e-06,
+ "loss": 0.5731,
+ "step": 1683
+ },
+ {
+ "epoch": 0.7962174940898346,
+ "grad_norm": 2.7943713665008545,
+ "learning_rate": 4.8072443146302475e-06,
+ "loss": 0.5913,
+ "step": 1684
+ },
+ {
+ "epoch": 0.7966903073286052,
+ "grad_norm": 2.5691423416137695,
+ "learning_rate": 4.807004041233746e-06,
+ "loss": 0.6475,
+ "step": 1685
+ },
+ {
+ "epoch": 0.7971631205673759,
+ "grad_norm": 3.2367498874664307,
+ "learning_rate": 4.8067636241902195e-06,
+ "loss": 0.675,
+ "step": 1686
+ },
+ {
+ "epoch": 0.7976359338061466,
+ "grad_norm": 3.000595808029175,
+ "learning_rate": 4.806523063514637e-06,
+ "loss": 0.5481,
+ "step": 1687
+ },
+ {
+ "epoch": 0.7981087470449173,
+ "grad_norm": 2.702014207839966,
+ "learning_rate": 4.806282359221976e-06,
+ "loss": 0.5993,
+ "step": 1688
+ },
+ {
+ "epoch": 0.798581560283688,
+ "grad_norm": 2.383671998977661,
+ "learning_rate": 4.806041511327226e-06,
+ "loss": 0.562,
+ "step": 1689
+ },
+ {
+ "epoch": 0.7990543735224587,
+ "grad_norm": 2.6965041160583496,
+ "learning_rate": 4.8058005198453834e-06,
+ "loss": 0.5955,
+ "step": 1690
+ },
+ {
+ "epoch": 0.7995271867612294,
+ "grad_norm": 2.5906765460968018,
+ "learning_rate": 4.805559384791453e-06,
+ "loss": 0.5151,
+ "step": 1691
+ },
+ {
+ "epoch": 0.8,
+ "grad_norm": 2.5454652309417725,
+ "learning_rate": 4.8053181061804475e-06,
+ "loss": 0.5843,
+ "step": 1692
+ },
+ {
+ "epoch": 0.8004728132387707,
+ "grad_norm": 2.661343812942505,
+ "learning_rate": 4.8050766840273935e-06,
+ "loss": 0.5995,
+ "step": 1693
+ },
+ {
+ "epoch": 0.8009456264775414,
+ "grad_norm": 2.5635924339294434,
+ "learning_rate": 4.8048351183473215e-06,
+ "loss": 0.5676,
+ "step": 1694
+ },
+ {
+ "epoch": 0.8014184397163121,
+ "grad_norm": 2.5936667919158936,
+ "learning_rate": 4.804593409155274e-06,
+ "loss": 0.6291,
+ "step": 1695
+ },
+ {
+ "epoch": 0.8018912529550828,
+ "grad_norm": 2.6902432441711426,
+ "learning_rate": 4.804351556466299e-06,
+ "loss": 0.6114,
+ "step": 1696
+ },
+ {
+ "epoch": 0.8023640661938535,
+ "grad_norm": 2.7764673233032227,
+ "learning_rate": 4.804109560295457e-06,
+ "loss": 0.5768,
+ "step": 1697
+ },
+ {
+ "epoch": 0.8028368794326242,
+ "grad_norm": 2.9587221145629883,
+ "learning_rate": 4.803867420657816e-06,
+ "loss": 0.6048,
+ "step": 1698
+ },
+ {
+ "epoch": 0.8033096926713948,
+ "grad_norm": 2.9238998889923096,
+ "learning_rate": 4.803625137568453e-06,
+ "loss": 0.6329,
+ "step": 1699
+ },
+ {
+ "epoch": 0.8037825059101655,
+ "grad_norm": 2.70473313331604,
+ "learning_rate": 4.803382711042455e-06,
+ "loss": 0.5427,
+ "step": 1700
+ },
+ {
+ "epoch": 0.8042553191489362,
+ "grad_norm": 3.1604979038238525,
+ "learning_rate": 4.803140141094914e-06,
+ "loss": 0.626,
+ "step": 1701
+ },
+ {
+ "epoch": 0.8047281323877069,
+ "grad_norm": 2.9567699432373047,
+ "learning_rate": 4.802897427740936e-06,
+ "loss": 0.5319,
+ "step": 1702
+ },
+ {
+ "epoch": 0.8052009456264776,
+ "grad_norm": 2.90983247756958,
+ "learning_rate": 4.802654570995632e-06,
+ "loss": 0.586,
+ "step": 1703
+ },
+ {
+ "epoch": 0.8056737588652483,
+ "grad_norm": 2.783480167388916,
+ "learning_rate": 4.8024115708741255e-06,
+ "loss": 0.5773,
+ "step": 1704
+ },
+ {
+ "epoch": 0.806146572104019,
+ "grad_norm": 3.3307793140411377,
+ "learning_rate": 4.802168427391547e-06,
+ "loss": 0.6257,
+ "step": 1705
+ },
+ {
+ "epoch": 0.8066193853427897,
+ "grad_norm": 3.0475001335144043,
+ "learning_rate": 4.801925140563034e-06,
+ "loss": 0.6612,
+ "step": 1706
+ },
+ {
+ "epoch": 0.8070921985815603,
+ "grad_norm": 2.8278894424438477,
+ "learning_rate": 4.8016817104037375e-06,
+ "loss": 0.6449,
+ "step": 1707
+ },
+ {
+ "epoch": 0.807565011820331,
+ "grad_norm": 2.760244369506836,
+ "learning_rate": 4.801438136928812e-06,
+ "loss": 0.7007,
+ "step": 1708
+ },
+ {
+ "epoch": 0.8080378250591016,
+ "grad_norm": 2.827634572982788,
+ "learning_rate": 4.801194420153427e-06,
+ "loss": 0.6418,
+ "step": 1709
+ },
+ {
+ "epoch": 0.8085106382978723,
+ "grad_norm": 2.8655009269714355,
+ "learning_rate": 4.800950560092754e-06,
+ "loss": 0.6231,
+ "step": 1710
+ },
+ {
+ "epoch": 0.808983451536643,
+ "grad_norm": 2.738112688064575,
+ "learning_rate": 4.800706556761981e-06,
+ "loss": 0.6463,
+ "step": 1711
+ },
+ {
+ "epoch": 0.8094562647754137,
+ "grad_norm": 2.4781179428100586,
+ "learning_rate": 4.800462410176296e-06,
+ "loss": 0.5365,
+ "step": 1712
+ },
+ {
+ "epoch": 0.8099290780141843,
+ "grad_norm": 2.6049838066101074,
+ "learning_rate": 4.800218120350906e-06,
+ "loss": 0.6035,
+ "step": 1713
+ },
+ {
+ "epoch": 0.810401891252955,
+ "grad_norm": 2.9089980125427246,
+ "learning_rate": 4.79997368730102e-06,
+ "loss": 0.5828,
+ "step": 1714
+ },
+ {
+ "epoch": 0.8108747044917257,
+ "grad_norm": 2.831871747970581,
+ "learning_rate": 4.799729111041857e-06,
+ "loss": 0.5953,
+ "step": 1715
+ },
+ {
+ "epoch": 0.8113475177304964,
+ "grad_norm": 2.5611300468444824,
+ "learning_rate": 4.799484391588647e-06,
+ "loss": 0.6302,
+ "step": 1716
+ },
+ {
+ "epoch": 0.8118203309692671,
+ "grad_norm": 2.744070053100586,
+ "learning_rate": 4.799239528956625e-06,
+ "loss": 0.5561,
+ "step": 1717
+ },
+ {
+ "epoch": 0.8122931442080378,
+ "grad_norm": 2.7344231605529785,
+ "learning_rate": 4.798994523161041e-06,
+ "loss": 0.6317,
+ "step": 1718
+ },
+ {
+ "epoch": 0.8127659574468085,
+ "grad_norm": 2.3420889377593994,
+ "learning_rate": 4.798749374217149e-06,
+ "loss": 0.5415,
+ "step": 1719
+ },
+ {
+ "epoch": 0.8132387706855791,
+ "grad_norm": 2.57384991645813,
+ "learning_rate": 4.798504082140212e-06,
+ "loss": 0.6383,
+ "step": 1720
+ },
+ {
+ "epoch": 0.8137115839243498,
+ "grad_norm": 2.8819844722747803,
+ "learning_rate": 4.798258646945505e-06,
+ "loss": 0.6355,
+ "step": 1721
+ },
+ {
+ "epoch": 0.8141843971631205,
+ "grad_norm": 2.908123254776001,
+ "learning_rate": 4.79801306864831e-06,
+ "loss": 0.701,
+ "step": 1722
+ },
+ {
+ "epoch": 0.8146572104018912,
+ "grad_norm": 2.6500701904296875,
+ "learning_rate": 4.797767347263917e-06,
+ "loss": 0.6152,
+ "step": 1723
+ },
+ {
+ "epoch": 0.8151300236406619,
+ "grad_norm": 2.5513017177581787,
+ "learning_rate": 4.797521482807628e-06,
+ "loss": 0.6241,
+ "step": 1724
+ },
+ {
+ "epoch": 0.8156028368794326,
+ "grad_norm": 2.6239185333251953,
+ "learning_rate": 4.7972754752947495e-06,
+ "loss": 0.6072,
+ "step": 1725
+ },
+ {
+ "epoch": 0.8160756501182033,
+ "grad_norm": 2.673436403274536,
+ "learning_rate": 4.797029324740601e-06,
+ "loss": 0.5802,
+ "step": 1726
+ },
+ {
+ "epoch": 0.816548463356974,
+ "grad_norm": 2.533831834793091,
+ "learning_rate": 4.796783031160508e-06,
+ "loss": 0.5566,
+ "step": 1727
+ },
+ {
+ "epoch": 0.8170212765957446,
+ "grad_norm": 2.9806582927703857,
+ "learning_rate": 4.796536594569807e-06,
+ "loss": 0.6945,
+ "step": 1728
+ },
+ {
+ "epoch": 0.8174940898345153,
+ "grad_norm": 2.7093560695648193,
+ "learning_rate": 4.796290014983842e-06,
+ "loss": 0.7143,
+ "step": 1729
+ },
+ {
+ "epoch": 0.817966903073286,
+ "grad_norm": 2.814507246017456,
+ "learning_rate": 4.796043292417967e-06,
+ "loss": 0.6122,
+ "step": 1730
+ },
+ {
+ "epoch": 0.8184397163120567,
+ "grad_norm": 2.537156820297241,
+ "learning_rate": 4.795796426887543e-06,
+ "loss": 0.6229,
+ "step": 1731
+ },
+ {
+ "epoch": 0.8189125295508274,
+ "grad_norm": 2.4878013134002686,
+ "learning_rate": 4.795549418407944e-06,
+ "loss": 0.5442,
+ "step": 1732
+ },
+ {
+ "epoch": 0.8193853427895981,
+ "grad_norm": 2.839383363723755,
+ "learning_rate": 4.795302266994548e-06,
+ "loss": 0.6717,
+ "step": 1733
+ },
+ {
+ "epoch": 0.8198581560283688,
+ "grad_norm": 3.1981801986694336,
+ "learning_rate": 4.795054972662744e-06,
+ "loss": 0.6596,
+ "step": 1734
+ },
+ {
+ "epoch": 0.8203309692671394,
+ "grad_norm": 2.781730890274048,
+ "learning_rate": 4.79480753542793e-06,
+ "loss": 0.5845,
+ "step": 1735
+ },
+ {
+ "epoch": 0.8208037825059101,
+ "grad_norm": 2.689948558807373,
+ "learning_rate": 4.794559955305513e-06,
+ "loss": 0.5928,
+ "step": 1736
+ },
+ {
+ "epoch": 0.8212765957446808,
+ "grad_norm": 2.7267637252807617,
+ "learning_rate": 4.7943122323109105e-06,
+ "loss": 0.5224,
+ "step": 1737
+ },
+ {
+ "epoch": 0.8217494089834515,
+ "grad_norm": 2.4346601963043213,
+ "learning_rate": 4.794064366459544e-06,
+ "loss": 0.6431,
+ "step": 1738
+ },
+ {
+ "epoch": 0.8222222222222222,
+ "grad_norm": 2.7440176010131836,
+ "learning_rate": 4.793816357766849e-06,
+ "loss": 0.6083,
+ "step": 1739
+ },
+ {
+ "epoch": 0.8226950354609929,
+ "grad_norm": 2.6558027267456055,
+ "learning_rate": 4.793568206248268e-06,
+ "loss": 0.698,
+ "step": 1740
+ },
+ {
+ "epoch": 0.8231678486997636,
+ "grad_norm": 2.591658353805542,
+ "learning_rate": 4.793319911919251e-06,
+ "loss": 0.6601,
+ "step": 1741
+ },
+ {
+ "epoch": 0.8236406619385342,
+ "grad_norm": 2.5431172847747803,
+ "learning_rate": 4.79307147479526e-06,
+ "loss": 0.5917,
+ "step": 1742
+ },
+ {
+ "epoch": 0.8241134751773049,
+ "grad_norm": 2.7335588932037354,
+ "learning_rate": 4.792822894891762e-06,
+ "loss": 0.5925,
+ "step": 1743
+ },
+ {
+ "epoch": 0.8245862884160756,
+ "grad_norm": 2.2500839233398438,
+ "learning_rate": 4.792574172224237e-06,
+ "loss": 0.4984,
+ "step": 1744
+ },
+ {
+ "epoch": 0.8250591016548463,
+ "grad_norm": 2.691343069076538,
+ "learning_rate": 4.79232530680817e-06,
+ "loss": 0.6262,
+ "step": 1745
+ },
+ {
+ "epoch": 0.825531914893617,
+ "grad_norm": 2.612204074859619,
+ "learning_rate": 4.792076298659058e-06,
+ "loss": 0.5822,
+ "step": 1746
+ },
+ {
+ "epoch": 0.8260047281323877,
+ "grad_norm": 3.0163519382476807,
+ "learning_rate": 4.791827147792406e-06,
+ "loss": 0.6263,
+ "step": 1747
+ },
+ {
+ "epoch": 0.8264775413711584,
+ "grad_norm": 2.742183208465576,
+ "learning_rate": 4.791577854223727e-06,
+ "loss": 0.6628,
+ "step": 1748
+ },
+ {
+ "epoch": 0.826950354609929,
+ "grad_norm": 2.872213840484619,
+ "learning_rate": 4.791328417968542e-06,
+ "loss": 0.6332,
+ "step": 1749
+ },
+ {
+ "epoch": 0.8274231678486997,
+ "grad_norm": 2.725006580352783,
+ "learning_rate": 4.7910788390423844e-06,
+ "loss": 0.6266,
+ "step": 1750
+ },
+ {
+ "epoch": 0.8278959810874704,
+ "grad_norm": 3.0366697311401367,
+ "learning_rate": 4.790829117460793e-06,
+ "loss": 0.6403,
+ "step": 1751
+ },
+ {
+ "epoch": 0.8283687943262411,
+ "grad_norm": 2.594881772994995,
+ "learning_rate": 4.790579253239318e-06,
+ "loss": 0.521,
+ "step": 1752
+ },
+ {
+ "epoch": 0.8288416075650118,
+ "grad_norm": 2.4496347904205322,
+ "learning_rate": 4.790329246393517e-06,
+ "loss": 0.54,
+ "step": 1753
+ },
+ {
+ "epoch": 0.8293144208037825,
+ "grad_norm": 3.102278470993042,
+ "learning_rate": 4.790079096938956e-06,
+ "loss": 0.6142,
+ "step": 1754
+ },
+ {
+ "epoch": 0.8297872340425532,
+ "grad_norm": 2.4645912647247314,
+ "learning_rate": 4.789828804891212e-06,
+ "loss": 0.5212,
+ "step": 1755
+ },
+ {
+ "epoch": 0.8302600472813239,
+ "grad_norm": 2.7482516765594482,
+ "learning_rate": 4.789578370265868e-06,
+ "loss": 0.6712,
+ "step": 1756
+ },
+ {
+ "epoch": 0.8307328605200945,
+ "grad_norm": 2.61360502243042,
+ "learning_rate": 4.7893277930785195e-06,
+ "loss": 0.6367,
+ "step": 1757
+ },
+ {
+ "epoch": 0.8312056737588652,
+ "grad_norm": 2.79028058052063,
+ "learning_rate": 4.789077073344767e-06,
+ "loss": 0.5099,
+ "step": 1758
+ },
+ {
+ "epoch": 0.8316784869976359,
+ "grad_norm": 2.647662401199341,
+ "learning_rate": 4.788826211080222e-06,
+ "loss": 0.6698,
+ "step": 1759
+ },
+ {
+ "epoch": 0.8321513002364066,
+ "grad_norm": 3.0214831829071045,
+ "learning_rate": 4.7885752063005055e-06,
+ "loss": 0.6121,
+ "step": 1760
+ },
+ {
+ "epoch": 0.8326241134751773,
+ "grad_norm": 2.8244032859802246,
+ "learning_rate": 4.788324059021247e-06,
+ "loss": 0.6921,
+ "step": 1761
+ },
+ {
+ "epoch": 0.833096926713948,
+ "grad_norm": 3.1501076221466064,
+ "learning_rate": 4.788072769258082e-06,
+ "loss": 0.6872,
+ "step": 1762
+ },
+ {
+ "epoch": 0.8335697399527187,
+ "grad_norm": 2.6989903450012207,
+ "learning_rate": 4.7878213370266594e-06,
+ "loss": 0.5884,
+ "step": 1763
+ },
+ {
+ "epoch": 0.8340425531914893,
+ "grad_norm": 2.6982665061950684,
+ "learning_rate": 4.787569762342633e-06,
+ "loss": 0.6112,
+ "step": 1764
+ },
+ {
+ "epoch": 0.83451536643026,
+ "grad_norm": 2.6918323040008545,
+ "learning_rate": 4.7873180452216685e-06,
+ "loss": 0.5315,
+ "step": 1765
+ },
+ {
+ "epoch": 0.8349881796690307,
+ "grad_norm": 2.5494401454925537,
+ "learning_rate": 4.78706618567944e-06,
+ "loss": 0.5909,
+ "step": 1766
+ },
+ {
+ "epoch": 0.8354609929078014,
+ "grad_norm": 2.7532095909118652,
+ "learning_rate": 4.786814183731627e-06,
+ "loss": 0.5566,
+ "step": 1767
+ },
+ {
+ "epoch": 0.8359338061465721,
+ "grad_norm": 2.550865888595581,
+ "learning_rate": 4.786562039393923e-06,
+ "loss": 0.555,
+ "step": 1768
+ },
+ {
+ "epoch": 0.8364066193853428,
+ "grad_norm": 2.4477791786193848,
+ "learning_rate": 4.786309752682028e-06,
+ "loss": 0.5844,
+ "step": 1769
+ },
+ {
+ "epoch": 0.8368794326241135,
+ "grad_norm": 2.6982262134552,
+ "learning_rate": 4.7860573236116485e-06,
+ "loss": 0.6136,
+ "step": 1770
+ },
+ {
+ "epoch": 0.8373522458628841,
+ "grad_norm": 2.456263542175293,
+ "learning_rate": 4.785804752198503e-06,
+ "loss": 0.5055,
+ "step": 1771
+ },
+ {
+ "epoch": 0.8378250591016548,
+ "grad_norm": 2.428544521331787,
+ "learning_rate": 4.78555203845832e-06,
+ "loss": 0.5859,
+ "step": 1772
+ },
+ {
+ "epoch": 0.8382978723404255,
+ "grad_norm": 2.1782307624816895,
+ "learning_rate": 4.785299182406833e-06,
+ "loss": 0.5325,
+ "step": 1773
+ },
+ {
+ "epoch": 0.8387706855791962,
+ "grad_norm": 3.137956142425537,
+ "learning_rate": 4.785046184059786e-06,
+ "loss": 0.6097,
+ "step": 1774
+ },
+ {
+ "epoch": 0.8392434988179669,
+ "grad_norm": 2.6269001960754395,
+ "learning_rate": 4.7847930434329336e-06,
+ "loss": 0.5972,
+ "step": 1775
+ },
+ {
+ "epoch": 0.8397163120567376,
+ "grad_norm": 2.732659339904785,
+ "learning_rate": 4.784539760542037e-06,
+ "loss": 0.6054,
+ "step": 1776
+ },
+ {
+ "epoch": 0.8401891252955083,
+ "grad_norm": 2.5346736907958984,
+ "learning_rate": 4.784286335402866e-06,
+ "loss": 0.5521,
+ "step": 1777
+ },
+ {
+ "epoch": 0.840661938534279,
+ "grad_norm": 3.1420228481292725,
+ "learning_rate": 4.784032768031202e-06,
+ "loss": 0.6165,
+ "step": 1778
+ },
+ {
+ "epoch": 0.8411347517730496,
+ "grad_norm": 3.073793411254883,
+ "learning_rate": 4.783779058442831e-06,
+ "loss": 0.6414,
+ "step": 1779
+ },
+ {
+ "epoch": 0.8416075650118203,
+ "grad_norm": 2.6621336936950684,
+ "learning_rate": 4.783525206653554e-06,
+ "loss": 0.5836,
+ "step": 1780
+ },
+ {
+ "epoch": 0.842080378250591,
+ "grad_norm": 2.7029049396514893,
+ "learning_rate": 4.7832712126791745e-06,
+ "loss": 0.5897,
+ "step": 1781
+ },
+ {
+ "epoch": 0.8425531914893617,
+ "grad_norm": 2.4733822345733643,
+ "learning_rate": 4.783017076535509e-06,
+ "loss": 0.5913,
+ "step": 1782
+ },
+ {
+ "epoch": 0.8430260047281324,
+ "grad_norm": 2.8119473457336426,
+ "learning_rate": 4.782762798238381e-06,
+ "loss": 0.6105,
+ "step": 1783
+ },
+ {
+ "epoch": 0.8434988179669031,
+ "grad_norm": 2.5290818214416504,
+ "learning_rate": 4.782508377803622e-06,
+ "loss": 0.6119,
+ "step": 1784
+ },
+ {
+ "epoch": 0.8439716312056738,
+ "grad_norm": 3.193472385406494,
+ "learning_rate": 4.782253815247076e-06,
+ "loss": 0.6665,
+ "step": 1785
+ },
+ {
+ "epoch": 0.8444444444444444,
+ "grad_norm": 3.206759452819824,
+ "learning_rate": 4.781999110584592e-06,
+ "loss": 0.6012,
+ "step": 1786
+ },
+ {
+ "epoch": 0.8449172576832151,
+ "grad_norm": 2.6227457523345947,
+ "learning_rate": 4.781744263832029e-06,
+ "loss": 0.5845,
+ "step": 1787
+ },
+ {
+ "epoch": 0.8453900709219858,
+ "grad_norm": 2.838365316390991,
+ "learning_rate": 4.781489275005257e-06,
+ "loss": 0.5695,
+ "step": 1788
+ },
+ {
+ "epoch": 0.8458628841607565,
+ "grad_norm": 2.8348326683044434,
+ "learning_rate": 4.78123414412015e-06,
+ "loss": 0.6136,
+ "step": 1789
+ },
+ {
+ "epoch": 0.8463356973995272,
+ "grad_norm": 2.5698344707489014,
+ "learning_rate": 4.780978871192597e-06,
+ "loss": 0.6576,
+ "step": 1790
+ },
+ {
+ "epoch": 0.8468085106382979,
+ "grad_norm": 2.5198330879211426,
+ "learning_rate": 4.780723456238492e-06,
+ "loss": 0.5521,
+ "step": 1791
+ },
+ {
+ "epoch": 0.8472813238770686,
+ "grad_norm": 3.001325845718384,
+ "learning_rate": 4.780467899273737e-06,
+ "loss": 0.6075,
+ "step": 1792
+ },
+ {
+ "epoch": 0.8477541371158392,
+ "grad_norm": 2.7732746601104736,
+ "learning_rate": 4.780212200314247e-06,
+ "loss": 0.6245,
+ "step": 1793
+ },
+ {
+ "epoch": 0.8482269503546099,
+ "grad_norm": 2.6950337886810303,
+ "learning_rate": 4.77995635937594e-06,
+ "loss": 0.5723,
+ "step": 1794
+ },
+ {
+ "epoch": 0.8486997635933806,
+ "grad_norm": 2.82051420211792,
+ "learning_rate": 4.779700376474749e-06,
+ "loss": 0.6184,
+ "step": 1795
+ },
+ {
+ "epoch": 0.8491725768321513,
+ "grad_norm": 2.757791757583618,
+ "learning_rate": 4.779444251626611e-06,
+ "loss": 0.608,
+ "step": 1796
+ },
+ {
+ "epoch": 0.849645390070922,
+ "grad_norm": 2.394108533859253,
+ "learning_rate": 4.779187984847475e-06,
+ "loss": 0.6174,
+ "step": 1797
+ },
+ {
+ "epoch": 0.8501182033096927,
+ "grad_norm": 2.427562713623047,
+ "learning_rate": 4.778931576153296e-06,
+ "loss": 0.5618,
+ "step": 1798
+ },
+ {
+ "epoch": 0.8505910165484634,
+ "grad_norm": 2.891268491744995,
+ "learning_rate": 4.778675025560042e-06,
+ "loss": 0.6865,
+ "step": 1799
+ },
+ {
+ "epoch": 0.851063829787234,
+ "grad_norm": 2.665534257888794,
+ "learning_rate": 4.778418333083685e-06,
+ "loss": 0.5852,
+ "step": 1800
+ },
+ {
+ "epoch": 0.8515366430260047,
+ "grad_norm": 2.5492889881134033,
+ "learning_rate": 4.7781614987402095e-06,
+ "loss": 0.5161,
+ "step": 1801
+ },
+ {
+ "epoch": 0.8520094562647754,
+ "grad_norm": 2.400177001953125,
+ "learning_rate": 4.777904522545607e-06,
+ "loss": 0.5128,
+ "step": 1802
+ },
+ {
+ "epoch": 0.8524822695035461,
+ "grad_norm": 2.3949809074401855,
+ "learning_rate": 4.777647404515878e-06,
+ "loss": 0.571,
+ "step": 1803
+ },
+ {
+ "epoch": 0.8529550827423168,
+ "grad_norm": 2.3624472618103027,
+ "learning_rate": 4.7773901446670325e-06,
+ "loss": 0.5486,
+ "step": 1804
+ },
+ {
+ "epoch": 0.8534278959810875,
+ "grad_norm": 2.711366891860962,
+ "learning_rate": 4.7771327430150885e-06,
+ "loss": 0.5667,
+ "step": 1805
+ },
+ {
+ "epoch": 0.8539007092198582,
+ "grad_norm": 2.7681493759155273,
+ "learning_rate": 4.776875199576073e-06,
+ "loss": 0.5686,
+ "step": 1806
+ },
+ {
+ "epoch": 0.8543735224586289,
+ "grad_norm": 3.0369436740875244,
+ "learning_rate": 4.776617514366023e-06,
+ "loss": 0.6635,
+ "step": 1807
+ },
+ {
+ "epoch": 0.8548463356973995,
+ "grad_norm": 2.919649600982666,
+ "learning_rate": 4.776359687400983e-06,
+ "loss": 0.5749,
+ "step": 1808
+ },
+ {
+ "epoch": 0.8553191489361702,
+ "grad_norm": 2.7986185550689697,
+ "learning_rate": 4.776101718697007e-06,
+ "loss": 0.559,
+ "step": 1809
+ },
+ {
+ "epoch": 0.8557919621749409,
+ "grad_norm": 2.5951223373413086,
+ "learning_rate": 4.775843608270158e-06,
+ "loss": 0.5654,
+ "step": 1810
+ },
+ {
+ "epoch": 0.8562647754137116,
+ "grad_norm": 2.674138069152832,
+ "learning_rate": 4.775585356136505e-06,
+ "loss": 0.5286,
+ "step": 1811
+ },
+ {
+ "epoch": 0.8567375886524823,
+ "grad_norm": 3.045437812805176,
+ "learning_rate": 4.775326962312131e-06,
+ "loss": 0.6185,
+ "step": 1812
+ },
+ {
+ "epoch": 0.857210401891253,
+ "grad_norm": 2.6145293712615967,
+ "learning_rate": 4.775068426813124e-06,
+ "loss": 0.6075,
+ "step": 1813
+ },
+ {
+ "epoch": 0.8576832151300237,
+ "grad_norm": 2.6320106983184814,
+ "learning_rate": 4.7748097496555824e-06,
+ "loss": 0.561,
+ "step": 1814
+ },
+ {
+ "epoch": 0.8581560283687943,
+ "grad_norm": 2.5038623809814453,
+ "learning_rate": 4.774550930855612e-06,
+ "loss": 0.593,
+ "step": 1815
+ },
+ {
+ "epoch": 0.858628841607565,
+ "grad_norm": 2.8168089389801025,
+ "learning_rate": 4.774291970429329e-06,
+ "loss": 0.5196,
+ "step": 1816
+ },
+ {
+ "epoch": 0.8591016548463357,
+ "grad_norm": 2.778130292892456,
+ "learning_rate": 4.774032868392858e-06,
+ "loss": 0.5984,
+ "step": 1817
+ },
+ {
+ "epoch": 0.8595744680851064,
+ "grad_norm": 2.536458730697632,
+ "learning_rate": 4.7737736247623305e-06,
+ "loss": 0.568,
+ "step": 1818
+ },
+ {
+ "epoch": 0.8600472813238771,
+ "grad_norm": 2.6669719219207764,
+ "learning_rate": 4.77351423955389e-06,
+ "loss": 0.6233,
+ "step": 1819
+ },
+ {
+ "epoch": 0.8605200945626478,
+ "grad_norm": 2.578242540359497,
+ "learning_rate": 4.773254712783687e-06,
+ "loss": 0.579,
+ "step": 1820
+ },
+ {
+ "epoch": 0.8609929078014185,
+ "grad_norm": 2.816664457321167,
+ "learning_rate": 4.772995044467881e-06,
+ "loss": 0.6635,
+ "step": 1821
+ },
+ {
+ "epoch": 0.8614657210401891,
+ "grad_norm": 3.1111979484558105,
+ "learning_rate": 4.77273523462264e-06,
+ "loss": 0.6372,
+ "step": 1822
+ },
+ {
+ "epoch": 0.8619385342789598,
+ "grad_norm": 2.764552354812622,
+ "learning_rate": 4.772475283264142e-06,
+ "loss": 0.6216,
+ "step": 1823
+ },
+ {
+ "epoch": 0.8624113475177305,
+ "grad_norm": 2.9126830101013184,
+ "learning_rate": 4.772215190408572e-06,
+ "loss": 0.6396,
+ "step": 1824
+ },
+ {
+ "epoch": 0.8628841607565012,
+ "grad_norm": 2.7502307891845703,
+ "learning_rate": 4.7719549560721264e-06,
+ "loss": 0.6186,
+ "step": 1825
+ },
+ {
+ "epoch": 0.8633569739952719,
+ "grad_norm": 2.6279006004333496,
+ "learning_rate": 4.771694580271007e-06,
+ "loss": 0.5557,
+ "step": 1826
+ },
+ {
+ "epoch": 0.8638297872340426,
+ "grad_norm": 2.996563196182251,
+ "learning_rate": 4.7714340630214276e-06,
+ "loss": 0.6259,
+ "step": 1827
+ },
+ {
+ "epoch": 0.8643026004728133,
+ "grad_norm": 3.231323480606079,
+ "learning_rate": 4.771173404339609e-06,
+ "loss": 0.5473,
+ "step": 1828
+ },
+ {
+ "epoch": 0.864775413711584,
+ "grad_norm": 3.143519878387451,
+ "learning_rate": 4.770912604241781e-06,
+ "loss": 0.593,
+ "step": 1829
+ },
+ {
+ "epoch": 0.8652482269503546,
+ "grad_norm": 2.515484094619751,
+ "learning_rate": 4.770651662744184e-06,
+ "loss": 0.538,
+ "step": 1830
+ },
+ {
+ "epoch": 0.8657210401891253,
+ "grad_norm": 2.629058837890625,
+ "learning_rate": 4.770390579863064e-06,
+ "loss": 0.5745,
+ "step": 1831
+ },
+ {
+ "epoch": 0.866193853427896,
+ "grad_norm": 2.5826802253723145,
+ "learning_rate": 4.770129355614677e-06,
+ "loss": 0.6397,
+ "step": 1832
+ },
+ {
+ "epoch": 0.8666666666666667,
+ "grad_norm": 2.954623222351074,
+ "learning_rate": 4.769867990015289e-06,
+ "loss": 0.6106,
+ "step": 1833
+ },
+ {
+ "epoch": 0.8671394799054374,
+ "grad_norm": 2.742192268371582,
+ "learning_rate": 4.769606483081175e-06,
+ "loss": 0.6902,
+ "step": 1834
+ },
+ {
+ "epoch": 0.8676122931442081,
+ "grad_norm": 2.2619097232818604,
+ "learning_rate": 4.769344834828618e-06,
+ "loss": 0.5414,
+ "step": 1835
+ },
+ {
+ "epoch": 0.8680851063829788,
+ "grad_norm": 2.7384188175201416,
+ "learning_rate": 4.769083045273908e-06,
+ "loss": 0.5787,
+ "step": 1836
+ },
+ {
+ "epoch": 0.8685579196217494,
+ "grad_norm": 2.6734485626220703,
+ "learning_rate": 4.768821114433346e-06,
+ "loss": 0.5923,
+ "step": 1837
+ },
+ {
+ "epoch": 0.8690307328605201,
+ "grad_norm": 2.286140203475952,
+ "learning_rate": 4.768559042323243e-06,
+ "loss": 0.5822,
+ "step": 1838
+ },
+ {
+ "epoch": 0.8695035460992908,
+ "grad_norm": 3.0243725776672363,
+ "learning_rate": 4.768296828959915e-06,
+ "loss": 0.6623,
+ "step": 1839
+ },
+ {
+ "epoch": 0.8699763593380615,
+ "grad_norm": 2.4026312828063965,
+ "learning_rate": 4.768034474359689e-06,
+ "loss": 0.5554,
+ "step": 1840
+ },
+ {
+ "epoch": 0.8704491725768322,
+ "grad_norm": 2.7469029426574707,
+ "learning_rate": 4.767771978538903e-06,
+ "loss": 0.6316,
+ "step": 1841
+ },
+ {
+ "epoch": 0.8709219858156029,
+ "grad_norm": 2.729659080505371,
+ "learning_rate": 4.767509341513899e-06,
+ "loss": 0.5807,
+ "step": 1842
+ },
+ {
+ "epoch": 0.8713947990543736,
+ "grad_norm": 2.5336945056915283,
+ "learning_rate": 4.76724656330103e-06,
+ "loss": 0.6109,
+ "step": 1843
+ },
+ {
+ "epoch": 0.8718676122931442,
+ "grad_norm": 2.519880533218384,
+ "learning_rate": 4.76698364391666e-06,
+ "loss": 0.5313,
+ "step": 1844
+ },
+ {
+ "epoch": 0.8723404255319149,
+ "grad_norm": 2.698862075805664,
+ "learning_rate": 4.766720583377159e-06,
+ "loss": 0.5953,
+ "step": 1845
+ },
+ {
+ "epoch": 0.8728132387706856,
+ "grad_norm": 3.0195560455322266,
+ "learning_rate": 4.766457381698907e-06,
+ "loss": 0.5965,
+ "step": 1846
+ },
+ {
+ "epoch": 0.8732860520094563,
+ "grad_norm": 2.5972697734832764,
+ "learning_rate": 4.766194038898291e-06,
+ "loss": 0.6014,
+ "step": 1847
+ },
+ {
+ "epoch": 0.873758865248227,
+ "grad_norm": 2.7132294178009033,
+ "learning_rate": 4.76593055499171e-06,
+ "loss": 0.5638,
+ "step": 1848
+ },
+ {
+ "epoch": 0.8742316784869977,
+ "grad_norm": 2.7134575843811035,
+ "learning_rate": 4.765666929995568e-06,
+ "loss": 0.52,
+ "step": 1849
+ },
+ {
+ "epoch": 0.8747044917257684,
+ "grad_norm": 2.3804993629455566,
+ "learning_rate": 4.765403163926282e-06,
+ "loss": 0.5435,
+ "step": 1850
+ },
+ {
+ "epoch": 0.875177304964539,
+ "grad_norm": 2.8782761096954346,
+ "learning_rate": 4.765139256800274e-06,
+ "loss": 0.5843,
+ "step": 1851
+ },
+ {
+ "epoch": 0.8756501182033097,
+ "grad_norm": 2.836209774017334,
+ "learning_rate": 4.764875208633977e-06,
+ "loss": 0.6667,
+ "step": 1852
+ },
+ {
+ "epoch": 0.8761229314420804,
+ "grad_norm": 2.608851194381714,
+ "learning_rate": 4.764611019443831e-06,
+ "loss": 0.5436,
+ "step": 1853
+ },
+ {
+ "epoch": 0.8765957446808511,
+ "grad_norm": 2.788738965988159,
+ "learning_rate": 4.764346689246288e-06,
+ "loss": 0.7331,
+ "step": 1854
+ },
+ {
+ "epoch": 0.8770685579196218,
+ "grad_norm": 2.524277687072754,
+ "learning_rate": 4.764082218057805e-06,
+ "loss": 0.5067,
+ "step": 1855
+ },
+ {
+ "epoch": 0.8775413711583925,
+ "grad_norm": 3.7559316158294678,
+ "learning_rate": 4.763817605894851e-06,
+ "loss": 0.6809,
+ "step": 1856
+ },
+ {
+ "epoch": 0.8780141843971632,
+ "grad_norm": 2.9070613384246826,
+ "learning_rate": 4.763552852773899e-06,
+ "loss": 0.5913,
+ "step": 1857
+ },
+ {
+ "epoch": 0.8784869976359339,
+ "grad_norm": 2.7050609588623047,
+ "learning_rate": 4.7632879587114386e-06,
+ "loss": 0.6074,
+ "step": 1858
+ },
+ {
+ "epoch": 0.8789598108747045,
+ "grad_norm": 2.891134262084961,
+ "learning_rate": 4.76302292372396e-06,
+ "loss": 0.5939,
+ "step": 1859
+ },
+ {
+ "epoch": 0.8794326241134752,
+ "grad_norm": 2.8581702709198,
+ "learning_rate": 4.762757747827968e-06,
+ "loss": 0.5972,
+ "step": 1860
+ },
+ {
+ "epoch": 0.8799054373522459,
+ "grad_norm": 2.8266196250915527,
+ "learning_rate": 4.762492431039971e-06,
+ "loss": 0.5993,
+ "step": 1861
+ },
+ {
+ "epoch": 0.8803782505910166,
+ "grad_norm": 2.4853954315185547,
+ "learning_rate": 4.762226973376493e-06,
+ "loss": 0.6388,
+ "step": 1862
+ },
+ {
+ "epoch": 0.8808510638297873,
+ "grad_norm": 3.2212886810302734,
+ "learning_rate": 4.761961374854059e-06,
+ "loss": 0.6698,
+ "step": 1863
+ },
+ {
+ "epoch": 0.881323877068558,
+ "grad_norm": 3.1254501342773438,
+ "learning_rate": 4.761695635489211e-06,
+ "loss": 0.5263,
+ "step": 1864
+ },
+ {
+ "epoch": 0.8817966903073287,
+ "grad_norm": 2.6891462802886963,
+ "learning_rate": 4.761429755298491e-06,
+ "loss": 0.5359,
+ "step": 1865
+ },
+ {
+ "epoch": 0.8822695035460993,
+ "grad_norm": 2.8557538986206055,
+ "learning_rate": 4.761163734298457e-06,
+ "loss": 0.5933,
+ "step": 1866
+ },
+ {
+ "epoch": 0.88274231678487,
+ "grad_norm": 2.53548264503479,
+ "learning_rate": 4.7608975725056724e-06,
+ "loss": 0.6397,
+ "step": 1867
+ },
+ {
+ "epoch": 0.8832151300236407,
+ "grad_norm": 3.0237956047058105,
+ "learning_rate": 4.76063126993671e-06,
+ "loss": 0.6845,
+ "step": 1868
+ },
+ {
+ "epoch": 0.8836879432624114,
+ "grad_norm": 3.222886800765991,
+ "learning_rate": 4.76036482660815e-06,
+ "loss": 0.6055,
+ "step": 1869
+ },
+ {
+ "epoch": 0.8841607565011821,
+ "grad_norm": 3.1867551803588867,
+ "learning_rate": 4.760098242536584e-06,
+ "loss": 0.6592,
+ "step": 1870
+ },
+ {
+ "epoch": 0.8846335697399527,
+ "grad_norm": 2.782209873199463,
+ "learning_rate": 4.7598315177386115e-06,
+ "loss": 0.5833,
+ "step": 1871
+ },
+ {
+ "epoch": 0.8851063829787233,
+ "grad_norm": 2.899871587753296,
+ "learning_rate": 4.759564652230838e-06,
+ "loss": 0.6129,
+ "step": 1872
+ },
+ {
+ "epoch": 0.885579196217494,
+ "grad_norm": 2.5690579414367676,
+ "learning_rate": 4.759297646029882e-06,
+ "loss": 0.5827,
+ "step": 1873
+ },
+ {
+ "epoch": 0.8860520094562647,
+ "grad_norm": 2.666130304336548,
+ "learning_rate": 4.759030499152368e-06,
+ "loss": 0.5272,
+ "step": 1874
+ },
+ {
+ "epoch": 0.8865248226950354,
+ "grad_norm": 2.7030911445617676,
+ "learning_rate": 4.758763211614932e-06,
+ "loss": 0.6415,
+ "step": 1875
+ },
+ {
+ "epoch": 0.8869976359338061,
+ "grad_norm": 2.717512845993042,
+ "learning_rate": 4.7584957834342135e-06,
+ "loss": 0.5827,
+ "step": 1876
+ },
+ {
+ "epoch": 0.8874704491725768,
+ "grad_norm": 2.665823459625244,
+ "learning_rate": 4.758228214626867e-06,
+ "loss": 0.6209,
+ "step": 1877
+ },
+ {
+ "epoch": 0.8879432624113475,
+ "grad_norm": 2.636653184890747,
+ "learning_rate": 4.75796050520955e-06,
+ "loss": 0.6413,
+ "step": 1878
+ },
+ {
+ "epoch": 0.8884160756501182,
+ "grad_norm": 2.585115671157837,
+ "learning_rate": 4.7576926551989345e-06,
+ "loss": 0.5518,
+ "step": 1879
+ },
+ {
+ "epoch": 0.8888888888888888,
+ "grad_norm": 2.808526039123535,
+ "learning_rate": 4.757424664611697e-06,
+ "loss": 0.5717,
+ "step": 1880
+ },
+ {
+ "epoch": 0.8893617021276595,
+ "grad_norm": 3.5957939624786377,
+ "learning_rate": 4.757156533464524e-06,
+ "loss": 0.6323,
+ "step": 1881
+ },
+ {
+ "epoch": 0.8898345153664302,
+ "grad_norm": 2.5003883838653564,
+ "learning_rate": 4.756888261774111e-06,
+ "loss": 0.5937,
+ "step": 1882
+ },
+ {
+ "epoch": 0.8903073286052009,
+ "grad_norm": 2.749061346054077,
+ "learning_rate": 4.756619849557161e-06,
+ "loss": 0.6642,
+ "step": 1883
+ },
+ {
+ "epoch": 0.8907801418439716,
+ "grad_norm": 2.6757891178131104,
+ "learning_rate": 4.756351296830389e-06,
+ "loss": 0.5887,
+ "step": 1884
+ },
+ {
+ "epoch": 0.8912529550827423,
+ "grad_norm": 2.811925172805786,
+ "learning_rate": 4.756082603610516e-06,
+ "loss": 0.6571,
+ "step": 1885
+ },
+ {
+ "epoch": 0.891725768321513,
+ "grad_norm": 2.5054616928100586,
+ "learning_rate": 4.755813769914271e-06,
+ "loss": 0.6312,
+ "step": 1886
+ },
+ {
+ "epoch": 0.8921985815602836,
+ "grad_norm": 2.7518467903137207,
+ "learning_rate": 4.755544795758395e-06,
+ "loss": 0.6685,
+ "step": 1887
+ },
+ {
+ "epoch": 0.8926713947990543,
+ "grad_norm": 2.7527287006378174,
+ "learning_rate": 4.755275681159634e-06,
+ "loss": 0.5886,
+ "step": 1888
+ },
+ {
+ "epoch": 0.893144208037825,
+ "grad_norm": 2.6162452697753906,
+ "learning_rate": 4.755006426134745e-06,
+ "loss": 0.546,
+ "step": 1889
+ },
+ {
+ "epoch": 0.8936170212765957,
+ "grad_norm": 2.4016737937927246,
+ "learning_rate": 4.754737030700495e-06,
+ "loss": 0.5726,
+ "step": 1890
+ },
+ {
+ "epoch": 0.8940898345153664,
+ "grad_norm": 2.528327703475952,
+ "learning_rate": 4.754467494873656e-06,
+ "loss": 0.5682,
+ "step": 1891
+ },
+ {
+ "epoch": 0.8945626477541371,
+ "grad_norm": 2.3139286041259766,
+ "learning_rate": 4.7541978186710115e-06,
+ "loss": 0.6108,
+ "step": 1892
+ },
+ {
+ "epoch": 0.8950354609929078,
+ "grad_norm": 2.7269136905670166,
+ "learning_rate": 4.753928002109354e-06,
+ "loss": 0.5875,
+ "step": 1893
+ },
+ {
+ "epoch": 0.8955082742316784,
+ "grad_norm": 4.425495147705078,
+ "learning_rate": 4.753658045205482e-06,
+ "loss": 0.5572,
+ "step": 1894
+ },
+ {
+ "epoch": 0.8959810874704491,
+ "grad_norm": 2.535409927368164,
+ "learning_rate": 4.753387947976206e-06,
+ "loss": 0.5868,
+ "step": 1895
+ },
+ {
+ "epoch": 0.8964539007092198,
+ "grad_norm": 2.722458600997925,
+ "learning_rate": 4.753117710438343e-06,
+ "loss": 0.5935,
+ "step": 1896
+ },
+ {
+ "epoch": 0.8969267139479905,
+ "grad_norm": 2.743861436843872,
+ "learning_rate": 4.75284733260872e-06,
+ "loss": 0.572,
+ "step": 1897
+ },
+ {
+ "epoch": 0.8973995271867612,
+ "grad_norm": 2.60640549659729,
+ "learning_rate": 4.752576814504173e-06,
+ "loss": 0.567,
+ "step": 1898
+ },
+ {
+ "epoch": 0.8978723404255319,
+ "grad_norm": 2.7486042976379395,
+ "learning_rate": 4.7523061561415435e-06,
+ "loss": 0.5768,
+ "step": 1899
+ },
+ {
+ "epoch": 0.8983451536643026,
+ "grad_norm": 3.8410251140594482,
+ "learning_rate": 4.752035357537686e-06,
+ "loss": 0.6034,
+ "step": 1900
+ },
+ {
+ "epoch": 0.8988179669030733,
+ "grad_norm": 3.0935890674591064,
+ "learning_rate": 4.751764418709462e-06,
+ "loss": 0.5644,
+ "step": 1901
+ },
+ {
+ "epoch": 0.8992907801418439,
+ "grad_norm": 2.7989892959594727,
+ "learning_rate": 4.751493339673742e-06,
+ "loss": 0.656,
+ "step": 1902
+ },
+ {
+ "epoch": 0.8997635933806146,
+ "grad_norm": 3.6940557956695557,
+ "learning_rate": 4.751222120447403e-06,
+ "loss": 0.6632,
+ "step": 1903
+ },
+ {
+ "epoch": 0.9002364066193853,
+ "grad_norm": 2.3428797721862793,
+ "learning_rate": 4.750950761047335e-06,
+ "loss": 0.4485,
+ "step": 1904
+ },
+ {
+ "epoch": 0.900709219858156,
+ "grad_norm": 2.622544050216675,
+ "learning_rate": 4.750679261490432e-06,
+ "loss": 0.5857,
+ "step": 1905
+ },
+ {
+ "epoch": 0.9011820330969267,
+ "grad_norm": 2.4911322593688965,
+ "learning_rate": 4.750407621793601e-06,
+ "loss": 0.5618,
+ "step": 1906
+ },
+ {
+ "epoch": 0.9016548463356974,
+ "grad_norm": 2.6434662342071533,
+ "learning_rate": 4.750135841973755e-06,
+ "loss": 0.6057,
+ "step": 1907
+ },
+ {
+ "epoch": 0.902127659574468,
+ "grad_norm": 3.115443706512451,
+ "learning_rate": 4.749863922047817e-06,
+ "loss": 0.6064,
+ "step": 1908
+ },
+ {
+ "epoch": 0.9026004728132387,
+ "grad_norm": 2.5671091079711914,
+ "learning_rate": 4.749591862032718e-06,
+ "loss": 0.5625,
+ "step": 1909
+ },
+ {
+ "epoch": 0.9030732860520094,
+ "grad_norm": 3.2008655071258545,
+ "learning_rate": 4.749319661945398e-06,
+ "loss": 0.5547,
+ "step": 1910
+ },
+ {
+ "epoch": 0.9035460992907801,
+ "grad_norm": 2.905987024307251,
+ "learning_rate": 4.749047321802805e-06,
+ "loss": 0.6033,
+ "step": 1911
+ },
+ {
+ "epoch": 0.9040189125295508,
+ "grad_norm": 3.1456053256988525,
+ "learning_rate": 4.748774841621897e-06,
+ "loss": 0.5651,
+ "step": 1912
+ },
+ {
+ "epoch": 0.9044917257683215,
+ "grad_norm": 2.8116416931152344,
+ "learning_rate": 4.748502221419641e-06,
+ "loss": 0.5853,
+ "step": 1913
+ },
+ {
+ "epoch": 0.9049645390070922,
+ "grad_norm": 3.123835325241089,
+ "learning_rate": 4.748229461213011e-06,
+ "loss": 0.5427,
+ "step": 1914
+ },
+ {
+ "epoch": 0.9054373522458629,
+ "grad_norm": 2.4750146865844727,
+ "learning_rate": 4.747956561018989e-06,
+ "loss": 0.6517,
+ "step": 1915
+ },
+ {
+ "epoch": 0.9059101654846335,
+ "grad_norm": 2.6174299716949463,
+ "learning_rate": 4.7476835208545705e-06,
+ "loss": 0.6119,
+ "step": 1916
+ },
+ {
+ "epoch": 0.9063829787234042,
+ "grad_norm": 2.7390382289886475,
+ "learning_rate": 4.747410340736755e-06,
+ "loss": 0.5664,
+ "step": 1917
+ },
+ {
+ "epoch": 0.9068557919621749,
+ "grad_norm": 2.7940444946289062,
+ "learning_rate": 4.747137020682552e-06,
+ "loss": 0.5628,
+ "step": 1918
+ },
+ {
+ "epoch": 0.9073286052009456,
+ "grad_norm": 2.477365016937256,
+ "learning_rate": 4.7468635607089795e-06,
+ "loss": 0.5261,
+ "step": 1919
+ },
+ {
+ "epoch": 0.9078014184397163,
+ "grad_norm": 2.7016685009002686,
+ "learning_rate": 4.746589960833066e-06,
+ "loss": 0.5576,
+ "step": 1920
+ },
+ {
+ "epoch": 0.908274231678487,
+ "grad_norm": 2.8806519508361816,
+ "learning_rate": 4.746316221071846e-06,
+ "loss": 0.5925,
+ "step": 1921
+ },
+ {
+ "epoch": 0.9087470449172577,
+ "grad_norm": 3.0315234661102295,
+ "learning_rate": 4.746042341442365e-06,
+ "loss": 0.6142,
+ "step": 1922
+ },
+ {
+ "epoch": 0.9092198581560283,
+ "grad_norm": 4.2446160316467285,
+ "learning_rate": 4.745768321961676e-06,
+ "loss": 0.5352,
+ "step": 1923
+ },
+ {
+ "epoch": 0.909692671394799,
+ "grad_norm": 2.6517012119293213,
+ "learning_rate": 4.745494162646841e-06,
+ "loss": 0.6118,
+ "step": 1924
+ },
+ {
+ "epoch": 0.9101654846335697,
+ "grad_norm": 2.774900197982788,
+ "learning_rate": 4.7452198635149304e-06,
+ "loss": 0.572,
+ "step": 1925
+ },
+ {
+ "epoch": 0.9106382978723404,
+ "grad_norm": 3.0133683681488037,
+ "learning_rate": 4.744945424583024e-06,
+ "loss": 0.5897,
+ "step": 1926
+ },
+ {
+ "epoch": 0.9111111111111111,
+ "grad_norm": 2.7344839572906494,
+ "learning_rate": 4.744670845868211e-06,
+ "loss": 0.6207,
+ "step": 1927
+ },
+ {
+ "epoch": 0.9115839243498818,
+ "grad_norm": 2.636578321456909,
+ "learning_rate": 4.744396127387586e-06,
+ "loss": 0.6687,
+ "step": 1928
+ },
+ {
+ "epoch": 0.9120567375886525,
+ "grad_norm": 2.8663458824157715,
+ "learning_rate": 4.744121269158255e-06,
+ "loss": 0.5002,
+ "step": 1929
+ },
+ {
+ "epoch": 0.9125295508274232,
+ "grad_norm": 2.661079168319702,
+ "learning_rate": 4.743846271197333e-06,
+ "loss": 0.5848,
+ "step": 1930
+ },
+ {
+ "epoch": 0.9130023640661938,
+ "grad_norm": 2.881256341934204,
+ "learning_rate": 4.743571133521943e-06,
+ "loss": 0.5911,
+ "step": 1931
+ },
+ {
+ "epoch": 0.9134751773049645,
+ "grad_norm": 2.5540573596954346,
+ "learning_rate": 4.743295856149217e-06,
+ "loss": 0.5647,
+ "step": 1932
+ },
+ {
+ "epoch": 0.9139479905437352,
+ "grad_norm": 2.7060387134552,
+ "learning_rate": 4.743020439096293e-06,
+ "loss": 0.6267,
+ "step": 1933
+ },
+ {
+ "epoch": 0.9144208037825059,
+ "grad_norm": 2.694481372833252,
+ "learning_rate": 4.742744882380323e-06,
+ "loss": 0.6283,
+ "step": 1934
+ },
+ {
+ "epoch": 0.9148936170212766,
+ "grad_norm": 2.711555242538452,
+ "learning_rate": 4.7424691860184625e-06,
+ "loss": 0.5784,
+ "step": 1935
+ },
+ {
+ "epoch": 0.9153664302600473,
+ "grad_norm": 2.9077224731445312,
+ "learning_rate": 4.742193350027879e-06,
+ "loss": 0.5948,
+ "step": 1936
+ },
+ {
+ "epoch": 0.915839243498818,
+ "grad_norm": 2.9824187755584717,
+ "learning_rate": 4.7419173744257476e-06,
+ "loss": 0.6115,
+ "step": 1937
+ },
+ {
+ "epoch": 0.9163120567375886,
+ "grad_norm": 2.5127830505371094,
+ "learning_rate": 4.7416412592292515e-06,
+ "loss": 0.5803,
+ "step": 1938
+ },
+ {
+ "epoch": 0.9167848699763593,
+ "grad_norm": 3.1307175159454346,
+ "learning_rate": 4.741365004455583e-06,
+ "loss": 0.5657,
+ "step": 1939
+ },
+ {
+ "epoch": 0.91725768321513,
+ "grad_norm": 2.8205273151397705,
+ "learning_rate": 4.741088610121944e-06,
+ "loss": 0.6145,
+ "step": 1940
+ },
+ {
+ "epoch": 0.9177304964539007,
+ "grad_norm": 2.6119720935821533,
+ "learning_rate": 4.7408120762455444e-06,
+ "loss": 0.6058,
+ "step": 1941
+ },
+ {
+ "epoch": 0.9182033096926714,
+ "grad_norm": 2.421276092529297,
+ "learning_rate": 4.7405354028436025e-06,
+ "loss": 0.5973,
+ "step": 1942
+ },
+ {
+ "epoch": 0.9186761229314421,
+ "grad_norm": 2.9846808910369873,
+ "learning_rate": 4.740258589933346e-06,
+ "loss": 0.6892,
+ "step": 1943
+ },
+ {
+ "epoch": 0.9191489361702128,
+ "grad_norm": 2.6899871826171875,
+ "learning_rate": 4.739981637532009e-06,
+ "loss": 0.5705,
+ "step": 1944
+ },
+ {
+ "epoch": 0.9196217494089834,
+ "grad_norm": 2.8636131286621094,
+ "learning_rate": 4.739704545656839e-06,
+ "loss": 0.5775,
+ "step": 1945
+ },
+ {
+ "epoch": 0.9200945626477541,
+ "grad_norm": 2.7659449577331543,
+ "learning_rate": 4.739427314325087e-06,
+ "loss": 0.5823,
+ "step": 1946
+ },
+ {
+ "epoch": 0.9205673758865248,
+ "grad_norm": 4.71295166015625,
+ "learning_rate": 4.739149943554016e-06,
+ "loss": 0.5601,
+ "step": 1947
+ },
+ {
+ "epoch": 0.9210401891252955,
+ "grad_norm": 2.642636775970459,
+ "learning_rate": 4.738872433360896e-06,
+ "loss": 0.5278,
+ "step": 1948
+ },
+ {
+ "epoch": 0.9215130023640662,
+ "grad_norm": 2.4658217430114746,
+ "learning_rate": 4.7385947837630065e-06,
+ "loss": 0.6392,
+ "step": 1949
+ },
+ {
+ "epoch": 0.9219858156028369,
+ "grad_norm": 2.851602792739868,
+ "learning_rate": 4.738316994777636e-06,
+ "loss": 0.6164,
+ "step": 1950
+ },
+ {
+ "epoch": 0.9224586288416076,
+ "grad_norm": 2.394226551055908,
+ "learning_rate": 4.738039066422081e-06,
+ "loss": 0.5556,
+ "step": 1951
+ },
+ {
+ "epoch": 0.9229314420803783,
+ "grad_norm": 2.7985100746154785,
+ "learning_rate": 4.737760998713647e-06,
+ "loss": 0.5799,
+ "step": 1952
+ },
+ {
+ "epoch": 0.9234042553191489,
+ "grad_norm": 2.5974674224853516,
+ "learning_rate": 4.737482791669648e-06,
+ "loss": 0.6984,
+ "step": 1953
+ },
+ {
+ "epoch": 0.9238770685579196,
+ "grad_norm": 2.707636594772339,
+ "learning_rate": 4.737204445307406e-06,
+ "loss": 0.5548,
+ "step": 1954
+ },
+ {
+ "epoch": 0.9243498817966903,
+ "grad_norm": 2.7882707118988037,
+ "learning_rate": 4.736925959644254e-06,
+ "loss": 0.6026,
+ "step": 1955
+ },
+ {
+ "epoch": 0.924822695035461,
+ "grad_norm": 2.474482774734497,
+ "learning_rate": 4.7366473346975304e-06,
+ "loss": 0.5832,
+ "step": 1956
+ },
+ {
+ "epoch": 0.9252955082742317,
+ "grad_norm": 2.6196324825286865,
+ "learning_rate": 4.736368570484585e-06,
+ "loss": 0.5861,
+ "step": 1957
+ },
+ {
+ "epoch": 0.9257683215130024,
+ "grad_norm": 2.826864004135132,
+ "learning_rate": 4.736089667022775e-06,
+ "loss": 0.6173,
+ "step": 1958
+ },
+ {
+ "epoch": 0.926241134751773,
+ "grad_norm": 2.414473056793213,
+ "learning_rate": 4.735810624329466e-06,
+ "loss": 0.5753,
+ "step": 1959
+ },
+ {
+ "epoch": 0.9267139479905437,
+ "grad_norm": 2.8037970066070557,
+ "learning_rate": 4.7355314424220335e-06,
+ "loss": 0.6207,
+ "step": 1960
+ },
+ {
+ "epoch": 0.9271867612293144,
+ "grad_norm": 2.645458698272705,
+ "learning_rate": 4.735252121317861e-06,
+ "loss": 0.5959,
+ "step": 1961
+ },
+ {
+ "epoch": 0.9276595744680851,
+ "grad_norm": 2.7983884811401367,
+ "learning_rate": 4.734972661034339e-06,
+ "loss": 0.5696,
+ "step": 1962
+ },
+ {
+ "epoch": 0.9281323877068558,
+ "grad_norm": 3.0568997859954834,
+ "learning_rate": 4.73469306158887e-06,
+ "loss": 0.6194,
+ "step": 1963
+ },
+ {
+ "epoch": 0.9286052009456265,
+ "grad_norm": 2.7205135822296143,
+ "learning_rate": 4.734413322998863e-06,
+ "loss": 0.5292,
+ "step": 1964
+ },
+ {
+ "epoch": 0.9290780141843972,
+ "grad_norm": 3.3168489933013916,
+ "learning_rate": 4.734133445281735e-06,
+ "loss": 0.5654,
+ "step": 1965
+ },
+ {
+ "epoch": 0.9295508274231679,
+ "grad_norm": 3.0095653533935547,
+ "learning_rate": 4.733853428454916e-06,
+ "loss": 0.6508,
+ "step": 1966
+ },
+ {
+ "epoch": 0.9300236406619385,
+ "grad_norm": 2.7726712226867676,
+ "learning_rate": 4.733573272535838e-06,
+ "loss": 0.644,
+ "step": 1967
+ },
+ {
+ "epoch": 0.9304964539007092,
+ "grad_norm": 2.474397659301758,
+ "learning_rate": 4.7332929775419456e-06,
+ "loss": 0.5479,
+ "step": 1968
+ },
+ {
+ "epoch": 0.9309692671394799,
+ "grad_norm": 2.4518635272979736,
+ "learning_rate": 4.733012543490693e-06,
+ "loss": 0.6,
+ "step": 1969
+ },
+ {
+ "epoch": 0.9314420803782506,
+ "grad_norm": 2.9292192459106445,
+ "learning_rate": 4.73273197039954e-06,
+ "loss": 0.6647,
+ "step": 1970
+ },
+ {
+ "epoch": 0.9319148936170213,
+ "grad_norm": 2.425004720687866,
+ "learning_rate": 4.732451258285958e-06,
+ "loss": 0.6338,
+ "step": 1971
+ },
+ {
+ "epoch": 0.932387706855792,
+ "grad_norm": 2.904479503631592,
+ "learning_rate": 4.7321704071674255e-06,
+ "loss": 0.5923,
+ "step": 1972
+ },
+ {
+ "epoch": 0.9328605200945627,
+ "grad_norm": 2.477085590362549,
+ "learning_rate": 4.731889417061428e-06,
+ "loss": 0.5984,
+ "step": 1973
+ },
+ {
+ "epoch": 0.9333333333333333,
+ "grad_norm": 2.585240364074707,
+ "learning_rate": 4.731608287985465e-06,
+ "loss": 0.558,
+ "step": 1974
+ },
+ {
+ "epoch": 0.933806146572104,
+ "grad_norm": 2.658714532852173,
+ "learning_rate": 4.731327019957039e-06,
+ "loss": 0.5567,
+ "step": 1975
+ },
+ {
+ "epoch": 0.9342789598108747,
+ "grad_norm": 2.7593026161193848,
+ "learning_rate": 4.731045612993662e-06,
+ "loss": 0.5772,
+ "step": 1976
+ },
+ {
+ "epoch": 0.9347517730496454,
+ "grad_norm": 2.4386026859283447,
+ "learning_rate": 4.7307640671128585e-06,
+ "loss": 0.6199,
+ "step": 1977
+ },
+ {
+ "epoch": 0.9352245862884161,
+ "grad_norm": 2.681910514831543,
+ "learning_rate": 4.730482382332158e-06,
+ "loss": 0.5971,
+ "step": 1978
+ },
+ {
+ "epoch": 0.9356973995271868,
+ "grad_norm": 3.7593860626220703,
+ "learning_rate": 4.7302005586691e-06,
+ "loss": 0.6346,
+ "step": 1979
+ },
+ {
+ "epoch": 0.9361702127659575,
+ "grad_norm": 2.5789096355438232,
+ "learning_rate": 4.729918596141232e-06,
+ "loss": 0.5684,
+ "step": 1980
+ },
+ {
+ "epoch": 0.9366430260047282,
+ "grad_norm": 3.0607335567474365,
+ "learning_rate": 4.729636494766111e-06,
+ "loss": 0.6223,
+ "step": 1981
+ },
+ {
+ "epoch": 0.9371158392434988,
+ "grad_norm": 2.906643867492676,
+ "learning_rate": 4.729354254561303e-06,
+ "loss": 0.6513,
+ "step": 1982
+ },
+ {
+ "epoch": 0.9375886524822695,
+ "grad_norm": 3.192430019378662,
+ "learning_rate": 4.7290718755443795e-06,
+ "loss": 0.5095,
+ "step": 1983
+ },
+ {
+ "epoch": 0.9380614657210402,
+ "grad_norm": 2.661536931991577,
+ "learning_rate": 4.7287893577329255e-06,
+ "loss": 0.5525,
+ "step": 1984
+ },
+ {
+ "epoch": 0.9385342789598109,
+ "grad_norm": 2.8436734676361084,
+ "learning_rate": 4.728506701144531e-06,
+ "loss": 0.6323,
+ "step": 1985
+ },
+ {
+ "epoch": 0.9390070921985816,
+ "grad_norm": 2.75544810295105,
+ "learning_rate": 4.728223905796796e-06,
+ "loss": 0.6018,
+ "step": 1986
+ },
+ {
+ "epoch": 0.9394799054373523,
+ "grad_norm": 3.0652759075164795,
+ "learning_rate": 4.727940971707329e-06,
+ "loss": 0.62,
+ "step": 1987
+ },
+ {
+ "epoch": 0.939952718676123,
+ "grad_norm": 2.802567720413208,
+ "learning_rate": 4.727657898893747e-06,
+ "loss": 0.5809,
+ "step": 1988
+ },
+ {
+ "epoch": 0.9404255319148936,
+ "grad_norm": 2.6208512783050537,
+ "learning_rate": 4.7273746873736745e-06,
+ "loss": 0.5762,
+ "step": 1989
+ },
+ {
+ "epoch": 0.9408983451536643,
+ "grad_norm": 2.5901873111724854,
+ "learning_rate": 4.727091337164748e-06,
+ "loss": 0.6111,
+ "step": 1990
+ },
+ {
+ "epoch": 0.941371158392435,
+ "grad_norm": 3.002347707748413,
+ "learning_rate": 4.726807848284609e-06,
+ "loss": 0.6419,
+ "step": 1991
+ },
+ {
+ "epoch": 0.9418439716312057,
+ "grad_norm": 2.522151470184326,
+ "learning_rate": 4.72652422075091e-06,
+ "loss": 0.642,
+ "step": 1992
+ },
+ {
+ "epoch": 0.9423167848699764,
+ "grad_norm": 2.5571532249450684,
+ "learning_rate": 4.726240454581311e-06,
+ "loss": 0.5729,
+ "step": 1993
+ },
+ {
+ "epoch": 0.9427895981087471,
+ "grad_norm": 2.7704918384552,
+ "learning_rate": 4.72595654979348e-06,
+ "loss": 0.6816,
+ "step": 1994
+ },
+ {
+ "epoch": 0.9432624113475178,
+ "grad_norm": 2.517040491104126,
+ "learning_rate": 4.7256725064050955e-06,
+ "loss": 0.5782,
+ "step": 1995
+ },
+ {
+ "epoch": 0.9437352245862884,
+ "grad_norm": 2.613955020904541,
+ "learning_rate": 4.725388324433843e-06,
+ "loss": 0.6291,
+ "step": 1996
+ },
+ {
+ "epoch": 0.9442080378250591,
+ "grad_norm": 2.848891258239746,
+ "learning_rate": 4.725104003897418e-06,
+ "loss": 0.6544,
+ "step": 1997
+ },
+ {
+ "epoch": 0.9446808510638298,
+ "grad_norm": 3.0162429809570312,
+ "learning_rate": 4.724819544813523e-06,
+ "loss": 0.6301,
+ "step": 1998
+ },
+ {
+ "epoch": 0.9451536643026005,
+ "grad_norm": 2.613614559173584,
+ "learning_rate": 4.72453494719987e-06,
+ "loss": 0.5829,
+ "step": 1999
+ },
+ {
+ "epoch": 0.9456264775413712,
+ "grad_norm": 2.4838767051696777,
+ "learning_rate": 4.724250211074182e-06,
+ "loss": 0.6042,
+ "step": 2000
+ },
+ {
+ "epoch": 0.9460992907801419,
+ "grad_norm": 2.526470899581909,
+ "learning_rate": 4.723965336454185e-06,
+ "loss": 0.6167,
+ "step": 2001
+ },
+ {
+ "epoch": 0.9465721040189126,
+ "grad_norm": 2.504506826400757,
+ "learning_rate": 4.723680323357618e-06,
+ "loss": 0.6061,
+ "step": 2002
+ },
+ {
+ "epoch": 0.9470449172576832,
+ "grad_norm": 3.0547544956207275,
+ "learning_rate": 4.723395171802228e-06,
+ "loss": 0.6619,
+ "step": 2003
+ },
+ {
+ "epoch": 0.9475177304964539,
+ "grad_norm": 2.8692407608032227,
+ "learning_rate": 4.723109881805771e-06,
+ "loss": 0.5985,
+ "step": 2004
+ },
+ {
+ "epoch": 0.9479905437352246,
+ "grad_norm": 2.7929654121398926,
+ "learning_rate": 4.7228244533860094e-06,
+ "loss": 0.5869,
+ "step": 2005
+ },
+ {
+ "epoch": 0.9484633569739953,
+ "grad_norm": 2.764869451522827,
+ "learning_rate": 4.7225388865607146e-06,
+ "loss": 0.6288,
+ "step": 2006
+ },
+ {
+ "epoch": 0.948936170212766,
+ "grad_norm": 2.7656404972076416,
+ "learning_rate": 4.722253181347671e-06,
+ "loss": 0.5831,
+ "step": 2007
+ },
+ {
+ "epoch": 0.9494089834515367,
+ "grad_norm": 2.6698336601257324,
+ "learning_rate": 4.7219673377646635e-06,
+ "loss": 0.6087,
+ "step": 2008
+ },
+ {
+ "epoch": 0.9498817966903074,
+ "grad_norm": 2.524935722351074,
+ "learning_rate": 4.7216813558294946e-06,
+ "loss": 0.5675,
+ "step": 2009
+ },
+ {
+ "epoch": 0.950354609929078,
+ "grad_norm": 2.5998785495758057,
+ "learning_rate": 4.721395235559969e-06,
+ "loss": 0.5667,
+ "step": 2010
+ },
+ {
+ "epoch": 0.9508274231678487,
+ "grad_norm": 2.758021354675293,
+ "learning_rate": 4.721108976973902e-06,
+ "loss": 0.4931,
+ "step": 2011
+ },
+ {
+ "epoch": 0.9513002364066194,
+ "grad_norm": 2.767695903778076,
+ "learning_rate": 4.72082258008912e-06,
+ "loss": 0.5778,
+ "step": 2012
+ },
+ {
+ "epoch": 0.9517730496453901,
+ "grad_norm": 2.982314348220825,
+ "learning_rate": 4.720536044923453e-06,
+ "loss": 0.6096,
+ "step": 2013
+ },
+ {
+ "epoch": 0.9522458628841608,
+ "grad_norm": 2.7608799934387207,
+ "learning_rate": 4.720249371494743e-06,
+ "loss": 0.6242,
+ "step": 2014
+ },
+ {
+ "epoch": 0.9527186761229315,
+ "grad_norm": 2.60054349899292,
+ "learning_rate": 4.71996255982084e-06,
+ "loss": 0.6249,
+ "step": 2015
+ },
+ {
+ "epoch": 0.9531914893617022,
+ "grad_norm": 2.654355764389038,
+ "learning_rate": 4.719675609919603e-06,
+ "loss": 0.6327,
+ "step": 2016
+ },
+ {
+ "epoch": 0.9536643026004729,
+ "grad_norm": 2.589404582977295,
+ "learning_rate": 4.719388521808899e-06,
+ "loss": 0.6357,
+ "step": 2017
+ },
+ {
+ "epoch": 0.9541371158392435,
+ "grad_norm": 2.8016581535339355,
+ "learning_rate": 4.719101295506603e-06,
+ "loss": 0.5901,
+ "step": 2018
+ },
+ {
+ "epoch": 0.9546099290780142,
+ "grad_norm": 3.1408045291900635,
+ "learning_rate": 4.7188139310306e-06,
+ "loss": 0.598,
+ "step": 2019
+ },
+ {
+ "epoch": 0.9550827423167849,
+ "grad_norm": 2.7432665824890137,
+ "learning_rate": 4.718526428398783e-06,
+ "loss": 0.5508,
+ "step": 2020
+ },
+ {
+ "epoch": 0.9555555555555556,
+ "grad_norm": 2.947800874710083,
+ "learning_rate": 4.718238787629053e-06,
+ "loss": 0.6439,
+ "step": 2021
+ },
+ {
+ "epoch": 0.9560283687943263,
+ "grad_norm": 2.50828218460083,
+ "learning_rate": 4.71795100873932e-06,
+ "loss": 0.5441,
+ "step": 2022
+ },
+ {
+ "epoch": 0.956501182033097,
+ "grad_norm": 2.8558974266052246,
+ "learning_rate": 4.717663091747503e-06,
+ "loss": 0.5416,
+ "step": 2023
+ },
+ {
+ "epoch": 0.9569739952718677,
+ "grad_norm": 2.4803316593170166,
+ "learning_rate": 4.71737503667153e-06,
+ "loss": 0.5317,
+ "step": 2024
+ },
+ {
+ "epoch": 0.9574468085106383,
+ "grad_norm": 4.36754035949707,
+ "learning_rate": 4.717086843529336e-06,
+ "loss": 0.5808,
+ "step": 2025
+ },
+ {
+ "epoch": 0.957919621749409,
+ "grad_norm": 2.730185031890869,
+ "learning_rate": 4.7167985123388665e-06,
+ "loss": 0.5257,
+ "step": 2026
+ },
+ {
+ "epoch": 0.9583924349881797,
+ "grad_norm": 2.8136069774627686,
+ "learning_rate": 4.716510043118074e-06,
+ "loss": 0.5836,
+ "step": 2027
+ },
+ {
+ "epoch": 0.9588652482269504,
+ "grad_norm": 2.793975353240967,
+ "learning_rate": 4.71622143588492e-06,
+ "loss": 0.5706,
+ "step": 2028
+ },
+ {
+ "epoch": 0.9593380614657211,
+ "grad_norm": 2.3883821964263916,
+ "learning_rate": 4.7159326906573745e-06,
+ "loss": 0.5291,
+ "step": 2029
+ },
+ {
+ "epoch": 0.9598108747044918,
+ "grad_norm": 2.6135976314544678,
+ "learning_rate": 4.715643807453417e-06,
+ "loss": 0.6199,
+ "step": 2030
+ },
+ {
+ "epoch": 0.9602836879432625,
+ "grad_norm": 2.6245670318603516,
+ "learning_rate": 4.715354786291035e-06,
+ "loss": 0.5585,
+ "step": 2031
+ },
+ {
+ "epoch": 0.9607565011820332,
+ "grad_norm": 2.7870967388153076,
+ "learning_rate": 4.715065627188225e-06,
+ "loss": 0.6196,
+ "step": 2032
+ },
+ {
+ "epoch": 0.9612293144208038,
+ "grad_norm": 2.6983911991119385,
+ "learning_rate": 4.714776330162991e-06,
+ "loss": 0.6424,
+ "step": 2033
+ },
+ {
+ "epoch": 0.9617021276595744,
+ "grad_norm": 2.3221919536590576,
+ "learning_rate": 4.7144868952333465e-06,
+ "loss": 0.568,
+ "step": 2034
+ },
+ {
+ "epoch": 0.9621749408983451,
+ "grad_norm": 2.9408178329467773,
+ "learning_rate": 4.714197322417314e-06,
+ "loss": 0.6175,
+ "step": 2035
+ },
+ {
+ "epoch": 0.9626477541371158,
+ "grad_norm": 2.404057264328003,
+ "learning_rate": 4.713907611732921e-06,
+ "loss": 0.4943,
+ "step": 2036
+ },
+ {
+ "epoch": 0.9631205673758865,
+ "grad_norm": 3.547607660293579,
+ "learning_rate": 4.71361776319821e-06,
+ "loss": 0.5488,
+ "step": 2037
+ },
+ {
+ "epoch": 0.9635933806146572,
+ "grad_norm": 2.679614543914795,
+ "learning_rate": 4.713327776831227e-06,
+ "loss": 0.6234,
+ "step": 2038
+ },
+ {
+ "epoch": 0.9640661938534278,
+ "grad_norm": 2.526914119720459,
+ "learning_rate": 4.7130376526500286e-06,
+ "loss": 0.5891,
+ "step": 2039
+ },
+ {
+ "epoch": 0.9645390070921985,
+ "grad_norm": 2.6953470706939697,
+ "learning_rate": 4.71274739067268e-06,
+ "loss": 0.69,
+ "step": 2040
+ },
+ {
+ "epoch": 0.9650118203309692,
+ "grad_norm": 2.546660900115967,
+ "learning_rate": 4.712456990917254e-06,
+ "loss": 0.6185,
+ "step": 2041
+ },
+ {
+ "epoch": 0.9654846335697399,
+ "grad_norm": 3.3920490741729736,
+ "learning_rate": 4.712166453401832e-06,
+ "loss": 0.587,
+ "step": 2042
+ },
+ {
+ "epoch": 0.9659574468085106,
+ "grad_norm": 2.5961573123931885,
+ "learning_rate": 4.711875778144504e-06,
+ "loss": 0.6105,
+ "step": 2043
+ },
+ {
+ "epoch": 0.9664302600472813,
+ "grad_norm": 2.5111498832702637,
+ "learning_rate": 4.711584965163372e-06,
+ "loss": 0.5533,
+ "step": 2044
+ },
+ {
+ "epoch": 0.966903073286052,
+ "grad_norm": 2.4878132343292236,
+ "learning_rate": 4.7112940144765405e-06,
+ "loss": 0.5604,
+ "step": 2045
+ },
+ {
+ "epoch": 0.9673758865248226,
+ "grad_norm": 2.5714077949523926,
+ "learning_rate": 4.711002926102128e-06,
+ "loss": 0.5794,
+ "step": 2046
+ },
+ {
+ "epoch": 0.9678486997635933,
+ "grad_norm": 2.7069091796875,
+ "learning_rate": 4.710711700058257e-06,
+ "loss": 0.594,
+ "step": 2047
+ },
+ {
+ "epoch": 0.968321513002364,
+ "grad_norm": 2.8104631900787354,
+ "learning_rate": 4.710420336363063e-06,
+ "loss": 0.6247,
+ "step": 2048
+ },
+ {
+ "epoch": 0.9687943262411347,
+ "grad_norm": 2.8464386463165283,
+ "learning_rate": 4.7101288350346865e-06,
+ "loss": 0.6162,
+ "step": 2049
+ },
+ {
+ "epoch": 0.9692671394799054,
+ "grad_norm": 2.7187976837158203,
+ "learning_rate": 4.709837196091279e-06,
+ "loss": 0.6109,
+ "step": 2050
+ },
+ {
+ "epoch": 0.9697399527186761,
+ "grad_norm": 2.556734085083008,
+ "learning_rate": 4.709545419550999e-06,
+ "loss": 0.6297,
+ "step": 2051
+ },
+ {
+ "epoch": 0.9702127659574468,
+ "grad_norm": 2.937195062637329,
+ "learning_rate": 4.709253505432014e-06,
+ "loss": 0.6862,
+ "step": 2052
+ },
+ {
+ "epoch": 0.9706855791962175,
+ "grad_norm": 2.792175531387329,
+ "learning_rate": 4.7089614537525015e-06,
+ "loss": 0.6105,
+ "step": 2053
+ },
+ {
+ "epoch": 0.9711583924349881,
+ "grad_norm": 2.625636100769043,
+ "learning_rate": 4.708669264530644e-06,
+ "loss": 0.5849,
+ "step": 2054
+ },
+ {
+ "epoch": 0.9716312056737588,
+ "grad_norm": 2.6752610206604004,
+ "learning_rate": 4.708376937784637e-06,
+ "loss": 0.5949,
+ "step": 2055
+ },
+ {
+ "epoch": 0.9721040189125295,
+ "grad_norm": 2.6072793006896973,
+ "learning_rate": 4.708084473532681e-06,
+ "loss": 0.5776,
+ "step": 2056
+ },
+ {
+ "epoch": 0.9725768321513002,
+ "grad_norm": 2.728632926940918,
+ "learning_rate": 4.707791871792988e-06,
+ "loss": 0.6352,
+ "step": 2057
+ },
+ {
+ "epoch": 0.9730496453900709,
+ "grad_norm": 2.5841758251190186,
+ "learning_rate": 4.707499132583775e-06,
+ "loss": 0.5488,
+ "step": 2058
+ },
+ {
+ "epoch": 0.9735224586288416,
+ "grad_norm": 2.8464293479919434,
+ "learning_rate": 4.707206255923271e-06,
+ "loss": 0.7051,
+ "step": 2059
+ },
+ {
+ "epoch": 0.9739952718676123,
+ "grad_norm": 2.547297239303589,
+ "learning_rate": 4.706913241829712e-06,
+ "loss": 0.5937,
+ "step": 2060
+ },
+ {
+ "epoch": 0.9744680851063829,
+ "grad_norm": 2.6572306156158447,
+ "learning_rate": 4.706620090321341e-06,
+ "loss": 0.6041,
+ "step": 2061
+ },
+ {
+ "epoch": 0.9749408983451536,
+ "grad_norm": 2.3262805938720703,
+ "learning_rate": 4.706326801416414e-06,
+ "loss": 0.5144,
+ "step": 2062
+ },
+ {
+ "epoch": 0.9754137115839243,
+ "grad_norm": 2.9693965911865234,
+ "learning_rate": 4.706033375133191e-06,
+ "loss": 0.551,
+ "step": 2063
+ },
+ {
+ "epoch": 0.975886524822695,
+ "grad_norm": 2.5993731021881104,
+ "learning_rate": 4.7057398114899435e-06,
+ "loss": 0.6143,
+ "step": 2064
+ },
+ {
+ "epoch": 0.9763593380614657,
+ "grad_norm": 2.453336477279663,
+ "learning_rate": 4.70544611050495e-06,
+ "loss": 0.6093,
+ "step": 2065
+ },
+ {
+ "epoch": 0.9768321513002364,
+ "grad_norm": 2.898629665374756,
+ "learning_rate": 4.705152272196497e-06,
+ "loss": 0.6007,
+ "step": 2066
+ },
+ {
+ "epoch": 0.9773049645390071,
+ "grad_norm": 2.7990612983703613,
+ "learning_rate": 4.7048582965828815e-06,
+ "loss": 0.6687,
+ "step": 2067
+ },
+ {
+ "epoch": 0.9777777777777777,
+ "grad_norm": 2.635284423828125,
+ "learning_rate": 4.704564183682408e-06,
+ "loss": 0.5564,
+ "step": 2068
+ },
+ {
+ "epoch": 0.9782505910165484,
+ "grad_norm": 3.014547109603882,
+ "learning_rate": 4.704269933513389e-06,
+ "loss": 0.6084,
+ "step": 2069
+ },
+ {
+ "epoch": 0.9787234042553191,
+ "grad_norm": 2.659357786178589,
+ "learning_rate": 4.703975546094147e-06,
+ "loss": 0.6031,
+ "step": 2070
+ },
+ {
+ "epoch": 0.9791962174940898,
+ "grad_norm": 2.326932668685913,
+ "learning_rate": 4.703681021443013e-06,
+ "loss": 0.5859,
+ "step": 2071
+ },
+ {
+ "epoch": 0.9796690307328605,
+ "grad_norm": 2.958803653717041,
+ "learning_rate": 4.7033863595783235e-06,
+ "loss": 0.5586,
+ "step": 2072
+ },
+ {
+ "epoch": 0.9801418439716312,
+ "grad_norm": 2.921386957168579,
+ "learning_rate": 4.703091560518427e-06,
+ "loss": 0.6126,
+ "step": 2073
+ },
+ {
+ "epoch": 0.9806146572104019,
+ "grad_norm": 2.6500775814056396,
+ "learning_rate": 4.702796624281679e-06,
+ "loss": 0.5678,
+ "step": 2074
+ },
+ {
+ "epoch": 0.9810874704491725,
+ "grad_norm": 2.7740228176116943,
+ "learning_rate": 4.702501550886445e-06,
+ "loss": 0.6067,
+ "step": 2075
+ },
+ {
+ "epoch": 0.9815602836879432,
+ "grad_norm": 2.3296213150024414,
+ "learning_rate": 4.702206340351096e-06,
+ "loss": 0.5247,
+ "step": 2076
+ },
+ {
+ "epoch": 0.9820330969267139,
+ "grad_norm": 2.748300790786743,
+ "learning_rate": 4.701910992694016e-06,
+ "loss": 0.5197,
+ "step": 2077
+ },
+ {
+ "epoch": 0.9825059101654846,
+ "grad_norm": 2.250985622406006,
+ "learning_rate": 4.7016155079335926e-06,
+ "loss": 0.5214,
+ "step": 2078
+ },
+ {
+ "epoch": 0.9829787234042553,
+ "grad_norm": 2.389845848083496,
+ "learning_rate": 4.701319886088226e-06,
+ "loss": 0.519,
+ "step": 2079
+ },
+ {
+ "epoch": 0.983451536643026,
+ "grad_norm": 2.818220853805542,
+ "learning_rate": 4.701024127176322e-06,
+ "loss": 0.607,
+ "step": 2080
+ },
+ {
+ "epoch": 0.9839243498817967,
+ "grad_norm": 3.4058034420013428,
+ "learning_rate": 4.700728231216297e-06,
+ "loss": 0.5711,
+ "step": 2081
+ },
+ {
+ "epoch": 0.9843971631205674,
+ "grad_norm": 2.5297787189483643,
+ "learning_rate": 4.700432198226575e-06,
+ "loss": 0.5979,
+ "step": 2082
+ },
+ {
+ "epoch": 0.984869976359338,
+ "grad_norm": 3.0548105239868164,
+ "learning_rate": 4.7001360282255885e-06,
+ "loss": 0.6041,
+ "step": 2083
+ },
+ {
+ "epoch": 0.9853427895981087,
+ "grad_norm": 2.8983733654022217,
+ "learning_rate": 4.699839721231779e-06,
+ "loss": 0.5926,
+ "step": 2084
+ },
+ {
+ "epoch": 0.9858156028368794,
+ "grad_norm": 3.2717764377593994,
+ "learning_rate": 4.699543277263596e-06,
+ "loss": 0.6477,
+ "step": 2085
+ },
+ {
+ "epoch": 0.9862884160756501,
+ "grad_norm": 3.03729248046875,
+ "learning_rate": 4.699246696339497e-06,
+ "loss": 0.6786,
+ "step": 2086
+ },
+ {
+ "epoch": 0.9867612293144208,
+ "grad_norm": 2.852301597595215,
+ "learning_rate": 4.698949978477951e-06,
+ "loss": 0.6565,
+ "step": 2087
+ },
+ {
+ "epoch": 0.9872340425531915,
+ "grad_norm": 2.843485116958618,
+ "learning_rate": 4.698653123697431e-06,
+ "loss": 0.6627,
+ "step": 2088
+ },
+ {
+ "epoch": 0.9877068557919622,
+ "grad_norm": 2.6315064430236816,
+ "learning_rate": 4.698356132016423e-06,
+ "loss": 0.6577,
+ "step": 2089
+ },
+ {
+ "epoch": 0.9881796690307328,
+ "grad_norm": 2.7482151985168457,
+ "learning_rate": 4.698059003453417e-06,
+ "loss": 0.5514,
+ "step": 2090
+ },
+ {
+ "epoch": 0.9886524822695035,
+ "grad_norm": 2.826673746109009,
+ "learning_rate": 4.6977617380269145e-06,
+ "loss": 0.565,
+ "step": 2091
+ },
+ {
+ "epoch": 0.9891252955082742,
+ "grad_norm": 3.0273752212524414,
+ "learning_rate": 4.697464335755427e-06,
+ "loss": 0.6331,
+ "step": 2092
+ },
+ {
+ "epoch": 0.9895981087470449,
+ "grad_norm": 2.7551653385162354,
+ "learning_rate": 4.6971667966574695e-06,
+ "loss": 0.6486,
+ "step": 2093
+ },
+ {
+ "epoch": 0.9900709219858156,
+ "grad_norm": 2.656299114227295,
+ "learning_rate": 4.696869120751571e-06,
+ "loss": 0.6562,
+ "step": 2094
+ },
+ {
+ "epoch": 0.9905437352245863,
+ "grad_norm": 2.785322904586792,
+ "learning_rate": 4.696571308056265e-06,
+ "loss": 0.5892,
+ "step": 2095
+ },
+ {
+ "epoch": 0.991016548463357,
+ "grad_norm": 2.9334635734558105,
+ "learning_rate": 4.696273358590095e-06,
+ "loss": 0.6346,
+ "step": 2096
+ },
+ {
+ "epoch": 0.9914893617021276,
+ "grad_norm": 2.7944300174713135,
+ "learning_rate": 4.695975272371613e-06,
+ "loss": 0.5859,
+ "step": 2097
+ },
+ {
+ "epoch": 0.9919621749408983,
+ "grad_norm": 2.5416972637176514,
+ "learning_rate": 4.695677049419381e-06,
+ "loss": 0.5658,
+ "step": 2098
+ },
+ {
+ "epoch": 0.992434988179669,
+ "grad_norm": 2.4056856632232666,
+ "learning_rate": 4.695378689751966e-06,
+ "loss": 0.5121,
+ "step": 2099
+ },
+ {
+ "epoch": 0.9929078014184397,
+ "grad_norm": 2.614548683166504,
+ "learning_rate": 4.695080193387948e-06,
+ "loss": 0.5961,
+ "step": 2100
+ },
+ {
+ "epoch": 0.9933806146572104,
+ "grad_norm": 2.8966517448425293,
+ "learning_rate": 4.69478156034591e-06,
+ "loss": 0.5985,
+ "step": 2101
+ },
+ {
+ "epoch": 0.9938534278959811,
+ "grad_norm": 2.9514098167419434,
+ "learning_rate": 4.694482790644448e-06,
+ "loss": 0.5677,
+ "step": 2102
+ },
+ {
+ "epoch": 0.9943262411347518,
+ "grad_norm": 2.4326791763305664,
+ "learning_rate": 4.694183884302165e-06,
+ "loss": 0.5698,
+ "step": 2103
+ },
+ {
+ "epoch": 0.9947990543735225,
+ "grad_norm": 2.9242892265319824,
+ "learning_rate": 4.6938848413376735e-06,
+ "loss": 0.6245,
+ "step": 2104
+ },
+ {
+ "epoch": 0.9952718676122931,
+ "grad_norm": 2.9134104251861572,
+ "learning_rate": 4.693585661769593e-06,
+ "loss": 0.6164,
+ "step": 2105
+ },
+ {
+ "epoch": 0.9957446808510638,
+ "grad_norm": 2.472564458847046,
+ "learning_rate": 4.693286345616551e-06,
+ "loss": 0.5616,
+ "step": 2106
+ },
+ {
+ "epoch": 0.9962174940898345,
+ "grad_norm": 3.2456448078155518,
+ "learning_rate": 4.692986892897186e-06,
+ "loss": 0.6977,
+ "step": 2107
+ },
+ {
+ "epoch": 0.9966903073286052,
+ "grad_norm": 3.4032769203186035,
+ "learning_rate": 4.692687303630143e-06,
+ "loss": 0.643,
+ "step": 2108
+ },
+ {
+ "epoch": 0.9971631205673759,
+ "grad_norm": 2.722200870513916,
+ "learning_rate": 4.692387577834076e-06,
+ "loss": 0.5873,
+ "step": 2109
+ },
+ {
+ "epoch": 0.9976359338061466,
+ "grad_norm": 2.687532663345337,
+ "learning_rate": 4.692087715527648e-06,
+ "loss": 0.5423,
+ "step": 2110
+ },
+ {
+ "epoch": 0.9981087470449173,
+ "grad_norm": 2.578613042831421,
+ "learning_rate": 4.6917877167295305e-06,
+ "loss": 0.5689,
+ "step": 2111
+ },
+ {
+ "epoch": 0.9985815602836879,
+ "grad_norm": 3.1806094646453857,
+ "learning_rate": 4.691487581458402e-06,
+ "loss": 0.6133,
+ "step": 2112
+ },
+ {
+ "epoch": 0.9990543735224586,
+ "grad_norm": 2.4449520111083984,
+ "learning_rate": 4.691187309732952e-06,
+ "loss": 0.5841,
+ "step": 2113
+ },
+ {
+ "epoch": 0.9995271867612293,
+ "grad_norm": 2.908749580383301,
+ "learning_rate": 4.690886901571875e-06,
+ "loss": 0.534,
+ "step": 2114
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 4.019968032836914,
+ "learning_rate": 4.6905863569938785e-06,
+ "loss": 0.596,
+ "step": 2115
+ },
+ {
+ "epoch": 1.0004728132387706,
+ "grad_norm": 2.4319307804107666,
+ "learning_rate": 4.690285676017675e-06,
+ "loss": 0.4973,
+ "step": 2116
+ },
+ {
+ "epoch": 1.0009456264775414,
+ "grad_norm": 2.6366477012634277,
+ "learning_rate": 4.689984858661986e-06,
+ "loss": 0.5682,
+ "step": 2117
+ },
+ {
+ "epoch": 1.001418439716312,
+ "grad_norm": 2.815114974975586,
+ "learning_rate": 4.689683904945542e-06,
+ "loss": 0.5616,
+ "step": 2118
+ },
+ {
+ "epoch": 1.0018912529550827,
+ "grad_norm": 2.6680490970611572,
+ "learning_rate": 4.689382814887084e-06,
+ "loss": 0.5161,
+ "step": 2119
+ },
+ {
+ "epoch": 1.0023640661938533,
+ "grad_norm": 2.7406351566314697,
+ "learning_rate": 4.689081588505358e-06,
+ "loss": 0.4937,
+ "step": 2120
+ },
+ {
+ "epoch": 1.0028368794326241,
+ "grad_norm": 2.2832298278808594,
+ "learning_rate": 4.68878022581912e-06,
+ "loss": 0.4986,
+ "step": 2121
+ },
+ {
+ "epoch": 1.0033096926713947,
+ "grad_norm": 2.5525307655334473,
+ "learning_rate": 4.688478726847136e-06,
+ "loss": 0.4909,
+ "step": 2122
+ },
+ {
+ "epoch": 1.0037825059101655,
+ "grad_norm": 2.9843199253082275,
+ "learning_rate": 4.688177091608176e-06,
+ "loss": 0.6046,
+ "step": 2123
+ },
+ {
+ "epoch": 1.004255319148936,
+ "grad_norm": 2.5231106281280518,
+ "learning_rate": 4.687875320121024e-06,
+ "loss": 0.5423,
+ "step": 2124
+ },
+ {
+ "epoch": 1.0047281323877069,
+ "grad_norm": 2.567599058151245,
+ "learning_rate": 4.68757341240447e-06,
+ "loss": 0.5092,
+ "step": 2125
+ },
+ {
+ "epoch": 1.0052009456264774,
+ "grad_norm": 2.768111228942871,
+ "learning_rate": 4.687271368477311e-06,
+ "loss": 0.5175,
+ "step": 2126
+ },
+ {
+ "epoch": 1.0056737588652482,
+ "grad_norm": 2.7223286628723145,
+ "learning_rate": 4.686969188358355e-06,
+ "loss": 0.5412,
+ "step": 2127
+ },
+ {
+ "epoch": 1.0061465721040188,
+ "grad_norm": 2.488299608230591,
+ "learning_rate": 4.686666872066418e-06,
+ "loss": 0.5288,
+ "step": 2128
+ },
+ {
+ "epoch": 1.0066193853427896,
+ "grad_norm": 2.882981777191162,
+ "learning_rate": 4.6863644196203215e-06,
+ "loss": 0.6117,
+ "step": 2129
+ },
+ {
+ "epoch": 1.0070921985815602,
+ "grad_norm": 3.0019447803497314,
+ "learning_rate": 4.686061831038901e-06,
+ "loss": 0.5308,
+ "step": 2130
+ },
+ {
+ "epoch": 1.007565011820331,
+ "grad_norm": 3.0056138038635254,
+ "learning_rate": 4.685759106340996e-06,
+ "loss": 0.5833,
+ "step": 2131
+ },
+ {
+ "epoch": 1.0080378250591016,
+ "grad_norm": 2.5709075927734375,
+ "learning_rate": 4.685456245545454e-06,
+ "loss": 0.5071,
+ "step": 2132
+ },
+ {
+ "epoch": 1.0085106382978724,
+ "grad_norm": 2.4641504287719727,
+ "learning_rate": 4.685153248671136e-06,
+ "loss": 0.4813,
+ "step": 2133
+ },
+ {
+ "epoch": 1.008983451536643,
+ "grad_norm": 2.374413013458252,
+ "learning_rate": 4.684850115736906e-06,
+ "loss": 0.5179,
+ "step": 2134
+ },
+ {
+ "epoch": 1.0094562647754137,
+ "grad_norm": 2.6504571437835693,
+ "learning_rate": 4.684546846761641e-06,
+ "loss": 0.437,
+ "step": 2135
+ },
+ {
+ "epoch": 1.0099290780141843,
+ "grad_norm": 2.5977871417999268,
+ "learning_rate": 4.684243441764221e-06,
+ "loss": 0.497,
+ "step": 2136
+ },
+ {
+ "epoch": 1.010401891252955,
+ "grad_norm": 2.4950785636901855,
+ "learning_rate": 4.683939900763541e-06,
+ "loss": 0.5624,
+ "step": 2137
+ },
+ {
+ "epoch": 1.0108747044917257,
+ "grad_norm": 3.065718412399292,
+ "learning_rate": 4.6836362237785e-06,
+ "loss": 0.512,
+ "step": 2138
+ },
+ {
+ "epoch": 1.0113475177304965,
+ "grad_norm": 2.7419207096099854,
+ "learning_rate": 4.6833324108280045e-06,
+ "loss": 0.5585,
+ "step": 2139
+ },
+ {
+ "epoch": 1.011820330969267,
+ "grad_norm": 2.623610496520996,
+ "learning_rate": 4.6830284619309744e-06,
+ "loss": 0.5163,
+ "step": 2140
+ },
+ {
+ "epoch": 1.0122931442080378,
+ "grad_norm": 2.774322986602783,
+ "learning_rate": 4.682724377106334e-06,
+ "loss": 0.527,
+ "step": 2141
+ },
+ {
+ "epoch": 1.0127659574468084,
+ "grad_norm": 2.959935188293457,
+ "learning_rate": 4.682420156373017e-06,
+ "loss": 0.6166,
+ "step": 2142
+ },
+ {
+ "epoch": 1.0132387706855792,
+ "grad_norm": 2.584026336669922,
+ "learning_rate": 4.682115799749968e-06,
+ "loss": 0.5086,
+ "step": 2143
+ },
+ {
+ "epoch": 1.0137115839243498,
+ "grad_norm": 2.6039700508117676,
+ "learning_rate": 4.6818113072561346e-06,
+ "loss": 0.49,
+ "step": 2144
+ },
+ {
+ "epoch": 1.0141843971631206,
+ "grad_norm": 2.466381072998047,
+ "learning_rate": 4.681506678910479e-06,
+ "loss": 0.4959,
+ "step": 2145
+ },
+ {
+ "epoch": 1.0146572104018912,
+ "grad_norm": 2.432636260986328,
+ "learning_rate": 4.681201914731969e-06,
+ "loss": 0.5057,
+ "step": 2146
+ },
+ {
+ "epoch": 1.015130023640662,
+ "grad_norm": 2.6134090423583984,
+ "learning_rate": 4.680897014739579e-06,
+ "loss": 0.4874,
+ "step": 2147
+ },
+ {
+ "epoch": 1.0156028368794325,
+ "grad_norm": 2.774481773376465,
+ "learning_rate": 4.680591978952295e-06,
+ "loss": 0.4967,
+ "step": 2148
+ },
+ {
+ "epoch": 1.0160756501182033,
+ "grad_norm": 2.66050124168396,
+ "learning_rate": 4.68028680738911e-06,
+ "loss": 0.4932,
+ "step": 2149
+ },
+ {
+ "epoch": 1.016548463356974,
+ "grad_norm": 3.020594835281372,
+ "learning_rate": 4.679981500069026e-06,
+ "loss": 0.5788,
+ "step": 2150
+ },
+ {
+ "epoch": 1.0170212765957447,
+ "grad_norm": 2.697758436203003,
+ "learning_rate": 4.679676057011053e-06,
+ "loss": 0.5441,
+ "step": 2151
+ },
+ {
+ "epoch": 1.0174940898345153,
+ "grad_norm": 6.986445903778076,
+ "learning_rate": 4.679370478234209e-06,
+ "loss": 0.6483,
+ "step": 2152
+ },
+ {
+ "epoch": 1.017966903073286,
+ "grad_norm": 2.6637115478515625,
+ "learning_rate": 4.679064763757522e-06,
+ "loss": 0.5859,
+ "step": 2153
+ },
+ {
+ "epoch": 1.0184397163120567,
+ "grad_norm": 2.7501862049102783,
+ "learning_rate": 4.678758913600027e-06,
+ "loss": 0.5745,
+ "step": 2154
+ },
+ {
+ "epoch": 1.0189125295508275,
+ "grad_norm": 2.7959372997283936,
+ "learning_rate": 4.678452927780768e-06,
+ "loss": 0.5076,
+ "step": 2155
+ },
+ {
+ "epoch": 1.019385342789598,
+ "grad_norm": 2.4377388954162598,
+ "learning_rate": 4.678146806318798e-06,
+ "loss": 0.5061,
+ "step": 2156
+ },
+ {
+ "epoch": 1.0198581560283688,
+ "grad_norm": 2.5478947162628174,
+ "learning_rate": 4.677840549233176e-06,
+ "loss": 0.4941,
+ "step": 2157
+ },
+ {
+ "epoch": 1.0203309692671394,
+ "grad_norm": 3.0956528186798096,
+ "learning_rate": 4.677534156542973e-06,
+ "loss": 0.5879,
+ "step": 2158
+ },
+ {
+ "epoch": 1.0208037825059102,
+ "grad_norm": 2.5247607231140137,
+ "learning_rate": 4.6772276282672666e-06,
+ "loss": 0.5532,
+ "step": 2159
+ },
+ {
+ "epoch": 1.0212765957446808,
+ "grad_norm": 3.1972787380218506,
+ "learning_rate": 4.676920964425143e-06,
+ "loss": 0.6081,
+ "step": 2160
+ },
+ {
+ "epoch": 1.0217494089834516,
+ "grad_norm": 2.6173388957977295,
+ "learning_rate": 4.6766141650356955e-06,
+ "loss": 0.5001,
+ "step": 2161
+ },
+ {
+ "epoch": 1.0222222222222221,
+ "grad_norm": 2.9914398193359375,
+ "learning_rate": 4.676307230118029e-06,
+ "loss": 0.5566,
+ "step": 2162
+ },
+ {
+ "epoch": 1.022695035460993,
+ "grad_norm": 2.8011834621429443,
+ "learning_rate": 4.676000159691254e-06,
+ "loss": 0.4909,
+ "step": 2163
+ },
+ {
+ "epoch": 1.0231678486997635,
+ "grad_norm": 2.6049559116363525,
+ "learning_rate": 4.67569295377449e-06,
+ "loss": 0.5018,
+ "step": 2164
+ },
+ {
+ "epoch": 1.0236406619385343,
+ "grad_norm": 2.8175013065338135,
+ "learning_rate": 4.675385612386866e-06,
+ "loss": 0.5309,
+ "step": 2165
+ },
+ {
+ "epoch": 1.0241134751773049,
+ "grad_norm": 2.854696750640869,
+ "learning_rate": 4.675078135547519e-06,
+ "loss": 0.5627,
+ "step": 2166
+ },
+ {
+ "epoch": 1.0245862884160757,
+ "grad_norm": 3.1856436729431152,
+ "learning_rate": 4.674770523275594e-06,
+ "loss": 0.5475,
+ "step": 2167
+ },
+ {
+ "epoch": 1.0250591016548463,
+ "grad_norm": 2.8289129734039307,
+ "learning_rate": 4.674462775590244e-06,
+ "loss": 0.5878,
+ "step": 2168
+ },
+ {
+ "epoch": 1.025531914893617,
+ "grad_norm": 2.8824517726898193,
+ "learning_rate": 4.6741548925106325e-06,
+ "loss": 0.4392,
+ "step": 2169
+ },
+ {
+ "epoch": 1.0260047281323876,
+ "grad_norm": 2.7044589519500732,
+ "learning_rate": 4.673846874055928e-06,
+ "loss": 0.5264,
+ "step": 2170
+ },
+ {
+ "epoch": 1.0264775413711584,
+ "grad_norm": 2.575035810470581,
+ "learning_rate": 4.673538720245312e-06,
+ "loss": 0.4615,
+ "step": 2171
+ },
+ {
+ "epoch": 1.026950354609929,
+ "grad_norm": 2.48168683052063,
+ "learning_rate": 4.67323043109797e-06,
+ "loss": 0.4404,
+ "step": 2172
+ },
+ {
+ "epoch": 1.0274231678486998,
+ "grad_norm": 2.926593065261841,
+ "learning_rate": 4.672922006633098e-06,
+ "loss": 0.54,
+ "step": 2173
+ },
+ {
+ "epoch": 1.0278959810874704,
+ "grad_norm": 2.4610698223114014,
+ "learning_rate": 4.672613446869901e-06,
+ "loss": 0.5555,
+ "step": 2174
+ },
+ {
+ "epoch": 1.0283687943262412,
+ "grad_norm": 3.026901960372925,
+ "learning_rate": 4.672304751827592e-06,
+ "loss": 0.62,
+ "step": 2175
+ },
+ {
+ "epoch": 1.0288416075650118,
+ "grad_norm": 2.3946213722229004,
+ "learning_rate": 4.671995921525391e-06,
+ "loss": 0.5228,
+ "step": 2176
+ },
+ {
+ "epoch": 1.0293144208037825,
+ "grad_norm": 2.985020399093628,
+ "learning_rate": 4.671686955982528e-06,
+ "loss": 0.6256,
+ "step": 2177
+ },
+ {
+ "epoch": 1.0297872340425531,
+ "grad_norm": 3.0910139083862305,
+ "learning_rate": 4.671377855218239e-06,
+ "loss": 0.5893,
+ "step": 2178
+ },
+ {
+ "epoch": 1.030260047281324,
+ "grad_norm": 2.507805109024048,
+ "learning_rate": 4.6710686192517744e-06,
+ "loss": 0.5329,
+ "step": 2179
+ },
+ {
+ "epoch": 1.0307328605200945,
+ "grad_norm": 2.4514641761779785,
+ "learning_rate": 4.670759248102386e-06,
+ "loss": 0.4585,
+ "step": 2180
+ },
+ {
+ "epoch": 1.0312056737588653,
+ "grad_norm": 2.742838144302368,
+ "learning_rate": 4.670449741789337e-06,
+ "loss": 0.6255,
+ "step": 2181
+ },
+ {
+ "epoch": 1.0316784869976359,
+ "grad_norm": 2.374349594116211,
+ "learning_rate": 4.670140100331901e-06,
+ "loss": 0.5049,
+ "step": 2182
+ },
+ {
+ "epoch": 1.0321513002364067,
+ "grad_norm": 2.78894305229187,
+ "learning_rate": 4.669830323749356e-06,
+ "loss": 0.6061,
+ "step": 2183
+ },
+ {
+ "epoch": 1.0326241134751772,
+ "grad_norm": 2.7195091247558594,
+ "learning_rate": 4.6695204120609905e-06,
+ "loss": 0.592,
+ "step": 2184
+ },
+ {
+ "epoch": 1.033096926713948,
+ "grad_norm": 2.824411630630493,
+ "learning_rate": 4.6692103652861035e-06,
+ "loss": 0.5666,
+ "step": 2185
+ },
+ {
+ "epoch": 1.0335697399527186,
+ "grad_norm": 2.4981014728546143,
+ "learning_rate": 4.6689001834439975e-06,
+ "loss": 0.5045,
+ "step": 2186
+ },
+ {
+ "epoch": 1.0340425531914894,
+ "grad_norm": 2.7375214099884033,
+ "learning_rate": 4.668589866553988e-06,
+ "loss": 0.5305,
+ "step": 2187
+ },
+ {
+ "epoch": 1.03451536643026,
+ "grad_norm": 2.625345468521118,
+ "learning_rate": 4.668279414635396e-06,
+ "loss": 0.4819,
+ "step": 2188
+ },
+ {
+ "epoch": 1.0349881796690308,
+ "grad_norm": 2.60479736328125,
+ "learning_rate": 4.667968827707553e-06,
+ "loss": 0.55,
+ "step": 2189
+ },
+ {
+ "epoch": 1.0354609929078014,
+ "grad_norm": 2.642014741897583,
+ "learning_rate": 4.667658105789797e-06,
+ "loss": 0.5264,
+ "step": 2190
+ },
+ {
+ "epoch": 1.0359338061465722,
+ "grad_norm": 2.5439083576202393,
+ "learning_rate": 4.667347248901476e-06,
+ "loss": 0.4657,
+ "step": 2191
+ },
+ {
+ "epoch": 1.0364066193853427,
+ "grad_norm": 2.5537586212158203,
+ "learning_rate": 4.667036257061945e-06,
+ "loss": 0.527,
+ "step": 2192
+ },
+ {
+ "epoch": 1.0368794326241135,
+ "grad_norm": 2.595466375350952,
+ "learning_rate": 4.666725130290569e-06,
+ "loss": 0.5336,
+ "step": 2193
+ },
+ {
+ "epoch": 1.037352245862884,
+ "grad_norm": 3.5106313228607178,
+ "learning_rate": 4.666413868606719e-06,
+ "loss": 0.5176,
+ "step": 2194
+ },
+ {
+ "epoch": 1.037825059101655,
+ "grad_norm": 2.931553363800049,
+ "learning_rate": 4.666102472029778e-06,
+ "loss": 0.549,
+ "step": 2195
+ },
+ {
+ "epoch": 1.0382978723404255,
+ "grad_norm": 2.4325125217437744,
+ "learning_rate": 4.665790940579133e-06,
+ "loss": 0.5095,
+ "step": 2196
+ },
+ {
+ "epoch": 1.0387706855791963,
+ "grad_norm": 2.708477258682251,
+ "learning_rate": 4.665479274274184e-06,
+ "loss": 0.5264,
+ "step": 2197
+ },
+ {
+ "epoch": 1.0392434988179668,
+ "grad_norm": 2.905977487564087,
+ "learning_rate": 4.665167473134335e-06,
+ "loss": 0.5575,
+ "step": 2198
+ },
+ {
+ "epoch": 1.0397163120567376,
+ "grad_norm": 2.428938865661621,
+ "learning_rate": 4.664855537179003e-06,
+ "loss": 0.5099,
+ "step": 2199
+ },
+ {
+ "epoch": 1.0401891252955082,
+ "grad_norm": 2.8432137966156006,
+ "learning_rate": 4.6645434664276075e-06,
+ "loss": 0.5331,
+ "step": 2200
+ },
+ {
+ "epoch": 1.040661938534279,
+ "grad_norm": 2.5185136795043945,
+ "learning_rate": 4.6642312608995825e-06,
+ "loss": 0.5217,
+ "step": 2201
+ },
+ {
+ "epoch": 1.0411347517730496,
+ "grad_norm": 2.556607723236084,
+ "learning_rate": 4.663918920614366e-06,
+ "loss": 0.4431,
+ "step": 2202
+ },
+ {
+ "epoch": 1.0416075650118204,
+ "grad_norm": 3.1271166801452637,
+ "learning_rate": 4.663606445591407e-06,
+ "loss": 0.5398,
+ "step": 2203
+ },
+ {
+ "epoch": 1.042080378250591,
+ "grad_norm": 2.573680877685547,
+ "learning_rate": 4.663293835850162e-06,
+ "loss": 0.4713,
+ "step": 2204
+ },
+ {
+ "epoch": 1.0425531914893618,
+ "grad_norm": 2.5230324268341064,
+ "learning_rate": 4.662981091410096e-06,
+ "loss": 0.5571,
+ "step": 2205
+ },
+ {
+ "epoch": 1.0430260047281323,
+ "grad_norm": 2.552182912826538,
+ "learning_rate": 4.662668212290681e-06,
+ "loss": 0.5173,
+ "step": 2206
+ },
+ {
+ "epoch": 1.0434988179669031,
+ "grad_norm": 2.832345724105835,
+ "learning_rate": 4.6623551985113995e-06,
+ "loss": 0.525,
+ "step": 2207
+ },
+ {
+ "epoch": 1.0439716312056737,
+ "grad_norm": 2.9729080200195312,
+ "learning_rate": 4.6620420500917416e-06,
+ "loss": 0.6308,
+ "step": 2208
+ },
+ {
+ "epoch": 1.0444444444444445,
+ "grad_norm": 2.618187665939331,
+ "learning_rate": 4.661728767051206e-06,
+ "loss": 0.4942,
+ "step": 2209
+ },
+ {
+ "epoch": 1.044917257683215,
+ "grad_norm": 2.515566349029541,
+ "learning_rate": 4.661415349409299e-06,
+ "loss": 0.5229,
+ "step": 2210
+ },
+ {
+ "epoch": 1.0453900709219859,
+ "grad_norm": 2.8651459217071533,
+ "learning_rate": 4.6611017971855356e-06,
+ "loss": 0.5029,
+ "step": 2211
+ },
+ {
+ "epoch": 1.0458628841607565,
+ "grad_norm": 2.502405881881714,
+ "learning_rate": 4.660788110399439e-06,
+ "loss": 0.4732,
+ "step": 2212
+ },
+ {
+ "epoch": 1.0463356973995273,
+ "grad_norm": 2.540668249130249,
+ "learning_rate": 4.660474289070541e-06,
+ "loss": 0.547,
+ "step": 2213
+ },
+ {
+ "epoch": 1.0468085106382978,
+ "grad_norm": 2.803469181060791,
+ "learning_rate": 4.660160333218384e-06,
+ "loss": 0.5441,
+ "step": 2214
+ },
+ {
+ "epoch": 1.0472813238770686,
+ "grad_norm": 3.233325481414795,
+ "learning_rate": 4.659846242862514e-06,
+ "loss": 0.4457,
+ "step": 2215
+ },
+ {
+ "epoch": 1.0477541371158392,
+ "grad_norm": 2.549548387527466,
+ "learning_rate": 4.659532018022489e-06,
+ "loss": 0.5684,
+ "step": 2216
+ },
+ {
+ "epoch": 1.04822695035461,
+ "grad_norm": 2.6112852096557617,
+ "learning_rate": 4.659217658717875e-06,
+ "loss": 0.5323,
+ "step": 2217
+ },
+ {
+ "epoch": 1.0486997635933806,
+ "grad_norm": 2.347418785095215,
+ "learning_rate": 4.658903164968245e-06,
+ "loss": 0.5349,
+ "step": 2218
+ },
+ {
+ "epoch": 1.0491725768321514,
+ "grad_norm": 2.695502281188965,
+ "learning_rate": 4.658588536793182e-06,
+ "loss": 0.4883,
+ "step": 2219
+ },
+ {
+ "epoch": 1.049645390070922,
+ "grad_norm": 2.7575674057006836,
+ "learning_rate": 4.658273774212275e-06,
+ "loss": 0.5517,
+ "step": 2220
+ },
+ {
+ "epoch": 1.0501182033096927,
+ "grad_norm": 2.787855386734009,
+ "learning_rate": 4.6579588772451245e-06,
+ "loss": 0.5744,
+ "step": 2221
+ },
+ {
+ "epoch": 1.0505910165484633,
+ "grad_norm": 3.0699398517608643,
+ "learning_rate": 4.657643845911337e-06,
+ "loss": 0.5258,
+ "step": 2222
+ },
+ {
+ "epoch": 1.0510638297872341,
+ "grad_norm": 2.652040719985962,
+ "learning_rate": 4.657328680230527e-06,
+ "loss": 0.5141,
+ "step": 2223
+ },
+ {
+ "epoch": 1.0515366430260047,
+ "grad_norm": 2.6896369457244873,
+ "learning_rate": 4.657013380222322e-06,
+ "loss": 0.5139,
+ "step": 2224
+ },
+ {
+ "epoch": 1.0520094562647755,
+ "grad_norm": 2.551839590072632,
+ "learning_rate": 4.65669794590635e-06,
+ "loss": 0.5099,
+ "step": 2225
+ },
+ {
+ "epoch": 1.052482269503546,
+ "grad_norm": 2.8543262481689453,
+ "learning_rate": 4.656382377302255e-06,
+ "loss": 0.6085,
+ "step": 2226
+ },
+ {
+ "epoch": 1.0529550827423169,
+ "grad_norm": 2.871469259262085,
+ "learning_rate": 4.656066674429685e-06,
+ "loss": 0.6108,
+ "step": 2227
+ },
+ {
+ "epoch": 1.0534278959810874,
+ "grad_norm": 2.4840824604034424,
+ "learning_rate": 4.655750837308296e-06,
+ "loss": 0.4994,
+ "step": 2228
+ },
+ {
+ "epoch": 1.0539007092198582,
+ "grad_norm": 2.5203280448913574,
+ "learning_rate": 4.6554348659577555e-06,
+ "loss": 0.4928,
+ "step": 2229
+ },
+ {
+ "epoch": 1.0543735224586288,
+ "grad_norm": 2.9327683448791504,
+ "learning_rate": 4.655118760397737e-06,
+ "loss": 0.6324,
+ "step": 2230
+ },
+ {
+ "epoch": 1.0548463356973996,
+ "grad_norm": 2.6766855716705322,
+ "learning_rate": 4.654802520647924e-06,
+ "loss": 0.5178,
+ "step": 2231
+ },
+ {
+ "epoch": 1.0553191489361702,
+ "grad_norm": 2.8438873291015625,
+ "learning_rate": 4.654486146728006e-06,
+ "loss": 0.509,
+ "step": 2232
+ },
+ {
+ "epoch": 1.055791962174941,
+ "grad_norm": 2.538661241531372,
+ "learning_rate": 4.6541696386576826e-06,
+ "loss": 0.5463,
+ "step": 2233
+ },
+ {
+ "epoch": 1.0562647754137116,
+ "grad_norm": 2.829030990600586,
+ "learning_rate": 4.653852996456662e-06,
+ "loss": 0.5404,
+ "step": 2234
+ },
+ {
+ "epoch": 1.0567375886524824,
+ "grad_norm": 2.5657269954681396,
+ "learning_rate": 4.653536220144659e-06,
+ "loss": 0.5479,
+ "step": 2235
+ },
+ {
+ "epoch": 1.057210401891253,
+ "grad_norm": 2.6641297340393066,
+ "learning_rate": 4.653219309741399e-06,
+ "loss": 0.5503,
+ "step": 2236
+ },
+ {
+ "epoch": 1.0576832151300237,
+ "grad_norm": 2.966350555419922,
+ "learning_rate": 4.652902265266615e-06,
+ "loss": 0.6404,
+ "step": 2237
+ },
+ {
+ "epoch": 1.0581560283687943,
+ "grad_norm": 2.462430000305176,
+ "learning_rate": 4.6525850867400455e-06,
+ "loss": 0.4885,
+ "step": 2238
+ },
+ {
+ "epoch": 1.058628841607565,
+ "grad_norm": 2.1791880130767822,
+ "learning_rate": 4.652267774181443e-06,
+ "loss": 0.4405,
+ "step": 2239
+ },
+ {
+ "epoch": 1.0591016548463357,
+ "grad_norm": 2.5473732948303223,
+ "learning_rate": 4.651950327610563e-06,
+ "loss": 0.5295,
+ "step": 2240
+ },
+ {
+ "epoch": 1.0595744680851065,
+ "grad_norm": 2.70904803276062,
+ "learning_rate": 4.651632747047172e-06,
+ "loss": 0.5169,
+ "step": 2241
+ },
+ {
+ "epoch": 1.060047281323877,
+ "grad_norm": 3.8442928791046143,
+ "learning_rate": 4.651315032511045e-06,
+ "loss": 0.5473,
+ "step": 2242
+ },
+ {
+ "epoch": 1.0605200945626478,
+ "grad_norm": 2.8613383769989014,
+ "learning_rate": 4.650997184021963e-06,
+ "loss": 0.5445,
+ "step": 2243
+ },
+ {
+ "epoch": 1.0609929078014184,
+ "grad_norm": 2.5995829105377197,
+ "learning_rate": 4.6506792015997184e-06,
+ "loss": 0.5525,
+ "step": 2244
+ },
+ {
+ "epoch": 1.0614657210401892,
+ "grad_norm": 2.5465996265411377,
+ "learning_rate": 4.650361085264111e-06,
+ "loss": 0.5093,
+ "step": 2245
+ },
+ {
+ "epoch": 1.0619385342789598,
+ "grad_norm": 2.46553111076355,
+ "learning_rate": 4.650042835034948e-06,
+ "loss": 0.5375,
+ "step": 2246
+ },
+ {
+ "epoch": 1.0624113475177306,
+ "grad_norm": 2.6907830238342285,
+ "learning_rate": 4.649724450932045e-06,
+ "loss": 0.572,
+ "step": 2247
+ },
+ {
+ "epoch": 1.0628841607565012,
+ "grad_norm": 3.0671346187591553,
+ "learning_rate": 4.649405932975226e-06,
+ "loss": 0.4974,
+ "step": 2248
+ },
+ {
+ "epoch": 1.063356973995272,
+ "grad_norm": 2.5392491817474365,
+ "learning_rate": 4.649087281184325e-06,
+ "loss": 0.524,
+ "step": 2249
+ },
+ {
+ "epoch": 1.0638297872340425,
+ "grad_norm": 2.7498562335968018,
+ "learning_rate": 4.648768495579183e-06,
+ "loss": 0.5801,
+ "step": 2250
+ },
+ {
+ "epoch": 1.0643026004728133,
+ "grad_norm": 2.8536248207092285,
+ "learning_rate": 4.648449576179649e-06,
+ "loss": 0.5384,
+ "step": 2251
+ },
+ {
+ "epoch": 1.064775413711584,
+ "grad_norm": 2.7062792778015137,
+ "learning_rate": 4.64813052300558e-06,
+ "loss": 0.5262,
+ "step": 2252
+ },
+ {
+ "epoch": 1.0652482269503547,
+ "grad_norm": 2.798650026321411,
+ "learning_rate": 4.647811336076841e-06,
+ "loss": 0.5719,
+ "step": 2253
+ },
+ {
+ "epoch": 1.0657210401891253,
+ "grad_norm": 2.9793951511383057,
+ "learning_rate": 4.647492015413311e-06,
+ "loss": 0.5377,
+ "step": 2254
+ },
+ {
+ "epoch": 1.066193853427896,
+ "grad_norm": 2.572129011154175,
+ "learning_rate": 4.647172561034868e-06,
+ "loss": 0.4791,
+ "step": 2255
+ },
+ {
+ "epoch": 1.0666666666666667,
+ "grad_norm": 3.7490930557250977,
+ "learning_rate": 4.646852972961405e-06,
+ "loss": 0.5423,
+ "step": 2256
+ },
+ {
+ "epoch": 1.0671394799054374,
+ "grad_norm": 2.626255750656128,
+ "learning_rate": 4.646533251212821e-06,
+ "loss": 0.5558,
+ "step": 2257
+ },
+ {
+ "epoch": 1.067612293144208,
+ "grad_norm": 2.8408126831054688,
+ "learning_rate": 4.646213395809023e-06,
+ "loss": 0.55,
+ "step": 2258
+ },
+ {
+ "epoch": 1.0680851063829788,
+ "grad_norm": 3.255606174468994,
+ "learning_rate": 4.645893406769929e-06,
+ "loss": 0.547,
+ "step": 2259
+ },
+ {
+ "epoch": 1.0685579196217494,
+ "grad_norm": 2.4352102279663086,
+ "learning_rate": 4.645573284115461e-06,
+ "loss": 0.4898,
+ "step": 2260
+ },
+ {
+ "epoch": 1.0690307328605202,
+ "grad_norm": 2.408634662628174,
+ "learning_rate": 4.6452530278655535e-06,
+ "loss": 0.5264,
+ "step": 2261
+ },
+ {
+ "epoch": 1.0695035460992908,
+ "grad_norm": 2.4220449924468994,
+ "learning_rate": 4.644932638040146e-06,
+ "loss": 0.5166,
+ "step": 2262
+ },
+ {
+ "epoch": 1.0699763593380616,
+ "grad_norm": 2.9188082218170166,
+ "learning_rate": 4.644612114659188e-06,
+ "loss": 0.5611,
+ "step": 2263
+ },
+ {
+ "epoch": 1.0704491725768321,
+ "grad_norm": 2.906557083129883,
+ "learning_rate": 4.644291457742638e-06,
+ "loss": 0.5515,
+ "step": 2264
+ },
+ {
+ "epoch": 1.070921985815603,
+ "grad_norm": 2.9039015769958496,
+ "learning_rate": 4.643970667310462e-06,
+ "loss": 0.5732,
+ "step": 2265
+ },
+ {
+ "epoch": 1.0713947990543735,
+ "grad_norm": 2.9985480308532715,
+ "learning_rate": 4.643649743382632e-06,
+ "loss": 0.563,
+ "step": 2266
+ },
+ {
+ "epoch": 1.0718676122931443,
+ "grad_norm": 2.5780906677246094,
+ "learning_rate": 4.6433286859791335e-06,
+ "loss": 0.502,
+ "step": 2267
+ },
+ {
+ "epoch": 1.0723404255319149,
+ "grad_norm": 2.590209722518921,
+ "learning_rate": 4.643007495119955e-06,
+ "loss": 0.4995,
+ "step": 2268
+ },
+ {
+ "epoch": 1.0728132387706855,
+ "grad_norm": 2.378894805908203,
+ "learning_rate": 4.642686170825097e-06,
+ "loss": 0.4886,
+ "step": 2269
+ },
+ {
+ "epoch": 1.0732860520094563,
+ "grad_norm": 2.6826229095458984,
+ "learning_rate": 4.642364713114567e-06,
+ "loss": 0.465,
+ "step": 2270
+ },
+ {
+ "epoch": 1.073758865248227,
+ "grad_norm": 2.627819538116455,
+ "learning_rate": 4.64204312200838e-06,
+ "loss": 0.4954,
+ "step": 2271
+ },
+ {
+ "epoch": 1.0742316784869976,
+ "grad_norm": 2.993021249771118,
+ "learning_rate": 4.641721397526561e-06,
+ "loss": 0.5073,
+ "step": 2272
+ },
+ {
+ "epoch": 1.0747044917257682,
+ "grad_norm": 2.719052791595459,
+ "learning_rate": 4.64139953968914e-06,
+ "loss": 0.538,
+ "step": 2273
+ },
+ {
+ "epoch": 1.075177304964539,
+ "grad_norm": 2.729252576828003,
+ "learning_rate": 4.6410775485161605e-06,
+ "loss": 0.552,
+ "step": 2274
+ },
+ {
+ "epoch": 1.0756501182033098,
+ "grad_norm": 2.924142599105835,
+ "learning_rate": 4.640755424027671e-06,
+ "loss": 0.522,
+ "step": 2275
+ },
+ {
+ "epoch": 1.0761229314420804,
+ "grad_norm": 3.329162120819092,
+ "learning_rate": 4.640433166243728e-06,
+ "loss": 0.5965,
+ "step": 2276
+ },
+ {
+ "epoch": 1.076595744680851,
+ "grad_norm": 2.9810245037078857,
+ "learning_rate": 4.640110775184396e-06,
+ "loss": 0.5653,
+ "step": 2277
+ },
+ {
+ "epoch": 1.0770685579196217,
+ "grad_norm": 2.61772084236145,
+ "learning_rate": 4.639788250869751e-06,
+ "loss": 0.5382,
+ "step": 2278
+ },
+ {
+ "epoch": 1.0775413711583925,
+ "grad_norm": 2.741225004196167,
+ "learning_rate": 4.639465593319874e-06,
+ "loss": 0.4866,
+ "step": 2279
+ },
+ {
+ "epoch": 1.0780141843971631,
+ "grad_norm": 2.7945218086242676,
+ "learning_rate": 4.639142802554856e-06,
+ "loss": 0.4711,
+ "step": 2280
+ },
+ {
+ "epoch": 1.0784869976359337,
+ "grad_norm": 2.4282329082489014,
+ "learning_rate": 4.638819878594795e-06,
+ "loss": 0.4911,
+ "step": 2281
+ },
+ {
+ "epoch": 1.0789598108747045,
+ "grad_norm": 2.551741361618042,
+ "learning_rate": 4.638496821459799e-06,
+ "loss": 0.453,
+ "step": 2282
+ },
+ {
+ "epoch": 1.0794326241134753,
+ "grad_norm": 2.5622754096984863,
+ "learning_rate": 4.638173631169983e-06,
+ "loss": 0.5983,
+ "step": 2283
+ },
+ {
+ "epoch": 1.0799054373522459,
+ "grad_norm": 2.7748284339904785,
+ "learning_rate": 4.6378503077454715e-06,
+ "loss": 0.5143,
+ "step": 2284
+ },
+ {
+ "epoch": 1.0803782505910164,
+ "grad_norm": 2.7693238258361816,
+ "learning_rate": 4.637526851206394e-06,
+ "loss": 0.5929,
+ "step": 2285
+ },
+ {
+ "epoch": 1.0808510638297872,
+ "grad_norm": 2.705548048019409,
+ "learning_rate": 4.637203261572893e-06,
+ "loss": 0.5577,
+ "step": 2286
+ },
+ {
+ "epoch": 1.081323877068558,
+ "grad_norm": 2.739307165145874,
+ "learning_rate": 4.636879538865117e-06,
+ "loss": 0.5676,
+ "step": 2287
+ },
+ {
+ "epoch": 1.0817966903073286,
+ "grad_norm": 2.514059543609619,
+ "learning_rate": 4.636555683103221e-06,
+ "loss": 0.5001,
+ "step": 2288
+ },
+ {
+ "epoch": 1.0822695035460992,
+ "grad_norm": 2.7166874408721924,
+ "learning_rate": 4.636231694307372e-06,
+ "loss": 0.5411,
+ "step": 2289
+ },
+ {
+ "epoch": 1.08274231678487,
+ "grad_norm": 2.7661683559417725,
+ "learning_rate": 4.635907572497741e-06,
+ "loss": 0.6353,
+ "step": 2290
+ },
+ {
+ "epoch": 1.0832151300236406,
+ "grad_norm": 2.598381996154785,
+ "learning_rate": 4.635583317694512e-06,
+ "loss": 0.5213,
+ "step": 2291
+ },
+ {
+ "epoch": 1.0836879432624114,
+ "grad_norm": 2.821491003036499,
+ "learning_rate": 4.6352589299178744e-06,
+ "loss": 0.6172,
+ "step": 2292
+ },
+ {
+ "epoch": 1.084160756501182,
+ "grad_norm": 2.5422823429107666,
+ "learning_rate": 4.634934409188025e-06,
+ "loss": 0.5245,
+ "step": 2293
+ },
+ {
+ "epoch": 1.0846335697399527,
+ "grad_norm": 2.8264620304107666,
+ "learning_rate": 4.634609755525173e-06,
+ "loss": 0.5004,
+ "step": 2294
+ },
+ {
+ "epoch": 1.0851063829787233,
+ "grad_norm": 2.3286643028259277,
+ "learning_rate": 4.63428496894953e-06,
+ "loss": 0.4561,
+ "step": 2295
+ },
+ {
+ "epoch": 1.085579196217494,
+ "grad_norm": 2.462005376815796,
+ "learning_rate": 4.633960049481321e-06,
+ "loss": 0.4948,
+ "step": 2296
+ },
+ {
+ "epoch": 1.0860520094562647,
+ "grad_norm": 2.760258913040161,
+ "learning_rate": 4.633634997140777e-06,
+ "loss": 0.5407,
+ "step": 2297
+ },
+ {
+ "epoch": 1.0865248226950355,
+ "grad_norm": 3.0234217643737793,
+ "learning_rate": 4.633309811948138e-06,
+ "loss": 0.4914,
+ "step": 2298
+ },
+ {
+ "epoch": 1.086997635933806,
+ "grad_norm": 2.8380849361419678,
+ "learning_rate": 4.63298449392365e-06,
+ "loss": 0.5562,
+ "step": 2299
+ },
+ {
+ "epoch": 1.0874704491725768,
+ "grad_norm": 2.6201648712158203,
+ "learning_rate": 4.632659043087572e-06,
+ "loss": 0.5882,
+ "step": 2300
+ },
+ {
+ "epoch": 1.0879432624113474,
+ "grad_norm": 2.586339235305786,
+ "learning_rate": 4.632333459460165e-06,
+ "loss": 0.4991,
+ "step": 2301
+ },
+ {
+ "epoch": 1.0884160756501182,
+ "grad_norm": 2.500115394592285,
+ "learning_rate": 4.632007743061705e-06,
+ "loss": 0.552,
+ "step": 2302
+ },
+ {
+ "epoch": 1.0888888888888888,
+ "grad_norm": 2.816390037536621,
+ "learning_rate": 4.63168189391247e-06,
+ "loss": 0.5301,
+ "step": 2303
+ },
+ {
+ "epoch": 1.0893617021276596,
+ "grad_norm": 2.975400924682617,
+ "learning_rate": 4.631355912032753e-06,
+ "loss": 0.6056,
+ "step": 2304
+ },
+ {
+ "epoch": 1.0898345153664302,
+ "grad_norm": 2.747985363006592,
+ "learning_rate": 4.631029797442846e-06,
+ "loss": 0.5335,
+ "step": 2305
+ },
+ {
+ "epoch": 1.090307328605201,
+ "grad_norm": 2.609281539916992,
+ "learning_rate": 4.630703550163059e-06,
+ "loss": 0.5189,
+ "step": 2306
+ },
+ {
+ "epoch": 1.0907801418439715,
+ "grad_norm": 2.624131202697754,
+ "learning_rate": 4.630377170213705e-06,
+ "loss": 0.5646,
+ "step": 2307
+ },
+ {
+ "epoch": 1.0912529550827423,
+ "grad_norm": 2.6186959743499756,
+ "learning_rate": 4.630050657615107e-06,
+ "loss": 0.5187,
+ "step": 2308
+ },
+ {
+ "epoch": 1.091725768321513,
+ "grad_norm": 2.9961764812469482,
+ "learning_rate": 4.629724012387594e-06,
+ "loss": 0.6207,
+ "step": 2309
+ },
+ {
+ "epoch": 1.0921985815602837,
+ "grad_norm": 2.665799140930176,
+ "learning_rate": 4.629397234551505e-06,
+ "loss": 0.5046,
+ "step": 2310
+ },
+ {
+ "epoch": 1.0926713947990543,
+ "grad_norm": 2.6154725551605225,
+ "learning_rate": 4.629070324127187e-06,
+ "loss": 0.5553,
+ "step": 2311
+ },
+ {
+ "epoch": 1.093144208037825,
+ "grad_norm": 2.702967643737793,
+ "learning_rate": 4.628743281134996e-06,
+ "loss": 0.5159,
+ "step": 2312
+ },
+ {
+ "epoch": 1.0936170212765957,
+ "grad_norm": 2.578080177307129,
+ "learning_rate": 4.628416105595295e-06,
+ "loss": 0.4934,
+ "step": 2313
+ },
+ {
+ "epoch": 1.0940898345153665,
+ "grad_norm": 2.8763060569763184,
+ "learning_rate": 4.628088797528456e-06,
+ "loss": 0.5404,
+ "step": 2314
+ },
+ {
+ "epoch": 1.094562647754137,
+ "grad_norm": 2.5301198959350586,
+ "learning_rate": 4.6277613569548585e-06,
+ "loss": 0.524,
+ "step": 2315
+ },
+ {
+ "epoch": 1.0950354609929078,
+ "grad_norm": 2.559903144836426,
+ "learning_rate": 4.627433783894892e-06,
+ "loss": 0.5177,
+ "step": 2316
+ },
+ {
+ "epoch": 1.0955082742316784,
+ "grad_norm": 2.430863380432129,
+ "learning_rate": 4.627106078368952e-06,
+ "loss": 0.5368,
+ "step": 2317
+ },
+ {
+ "epoch": 1.0959810874704492,
+ "grad_norm": 2.687567949295044,
+ "learning_rate": 4.626778240397444e-06,
+ "loss": 0.5385,
+ "step": 2318
+ },
+ {
+ "epoch": 1.0964539007092198,
+ "grad_norm": 3.053466558456421,
+ "learning_rate": 4.62645027000078e-06,
+ "loss": 0.5814,
+ "step": 2319
+ },
+ {
+ "epoch": 1.0969267139479906,
+ "grad_norm": 2.4612979888916016,
+ "learning_rate": 4.6261221671993815e-06,
+ "loss": 0.5069,
+ "step": 2320
+ },
+ {
+ "epoch": 1.0973995271867611,
+ "grad_norm": 2.6153628826141357,
+ "learning_rate": 4.625793932013679e-06,
+ "loss": 0.5422,
+ "step": 2321
+ },
+ {
+ "epoch": 1.097872340425532,
+ "grad_norm": 2.8918874263763428,
+ "learning_rate": 4.62546556446411e-06,
+ "loss": 0.5326,
+ "step": 2322
+ },
+ {
+ "epoch": 1.0983451536643025,
+ "grad_norm": 3.62565279006958,
+ "learning_rate": 4.625137064571119e-06,
+ "loss": 0.5164,
+ "step": 2323
+ },
+ {
+ "epoch": 1.0988179669030733,
+ "grad_norm": 2.4285085201263428,
+ "learning_rate": 4.624808432355164e-06,
+ "loss": 0.5084,
+ "step": 2324
+ },
+ {
+ "epoch": 1.099290780141844,
+ "grad_norm": 2.593979835510254,
+ "learning_rate": 4.624479667836702e-06,
+ "loss": 0.4986,
+ "step": 2325
+ },
+ {
+ "epoch": 1.0997635933806147,
+ "grad_norm": 2.490752935409546,
+ "learning_rate": 4.624150771036208e-06,
+ "loss": 0.5296,
+ "step": 2326
+ },
+ {
+ "epoch": 1.1002364066193853,
+ "grad_norm": 2.67694091796875,
+ "learning_rate": 4.6238217419741595e-06,
+ "loss": 0.5229,
+ "step": 2327
+ },
+ {
+ "epoch": 1.100709219858156,
+ "grad_norm": 2.594147205352783,
+ "learning_rate": 4.623492580671044e-06,
+ "loss": 0.4916,
+ "step": 2328
+ },
+ {
+ "epoch": 1.1011820330969266,
+ "grad_norm": 2.943472385406494,
+ "learning_rate": 4.623163287147356e-06,
+ "loss": 0.5591,
+ "step": 2329
+ },
+ {
+ "epoch": 1.1016548463356974,
+ "grad_norm": 2.569410562515259,
+ "learning_rate": 4.622833861423601e-06,
+ "loss": 0.4648,
+ "step": 2330
+ },
+ {
+ "epoch": 1.102127659574468,
+ "grad_norm": 2.5490405559539795,
+ "learning_rate": 4.6225043035202886e-06,
+ "loss": 0.5493,
+ "step": 2331
+ },
+ {
+ "epoch": 1.1026004728132388,
+ "grad_norm": 2.5964598655700684,
+ "learning_rate": 4.622174613457941e-06,
+ "loss": 0.5358,
+ "step": 2332
+ },
+ {
+ "epoch": 1.1030732860520094,
+ "grad_norm": 2.6456820964813232,
+ "learning_rate": 4.621844791257085e-06,
+ "loss": 0.5864,
+ "step": 2333
+ },
+ {
+ "epoch": 1.1035460992907802,
+ "grad_norm": 2.861180067062378,
+ "learning_rate": 4.621514836938259e-06,
+ "loss": 0.6064,
+ "step": 2334
+ },
+ {
+ "epoch": 1.1040189125295508,
+ "grad_norm": 2.8199548721313477,
+ "learning_rate": 4.621184750522005e-06,
+ "loss": 0.5244,
+ "step": 2335
+ },
+ {
+ "epoch": 1.1044917257683216,
+ "grad_norm": 2.7398853302001953,
+ "learning_rate": 4.6208545320288795e-06,
+ "loss": 0.5496,
+ "step": 2336
+ },
+ {
+ "epoch": 1.1049645390070921,
+ "grad_norm": 2.7941031455993652,
+ "learning_rate": 4.620524181479441e-06,
+ "loss": 0.5496,
+ "step": 2337
+ },
+ {
+ "epoch": 1.105437352245863,
+ "grad_norm": 2.973785161972046,
+ "learning_rate": 4.620193698894259e-06,
+ "loss": 0.5492,
+ "step": 2338
+ },
+ {
+ "epoch": 1.1059101654846335,
+ "grad_norm": 2.650355815887451,
+ "learning_rate": 4.6198630842939144e-06,
+ "loss": 0.5392,
+ "step": 2339
+ },
+ {
+ "epoch": 1.1063829787234043,
+ "grad_norm": 2.9092214107513428,
+ "learning_rate": 4.61953233769899e-06,
+ "loss": 0.5305,
+ "step": 2340
+ },
+ {
+ "epoch": 1.1068557919621749,
+ "grad_norm": 2.6329731941223145,
+ "learning_rate": 4.61920145913008e-06,
+ "loss": 0.5031,
+ "step": 2341
+ },
+ {
+ "epoch": 1.1073286052009457,
+ "grad_norm": 2.7214207649230957,
+ "learning_rate": 4.618870448607788e-06,
+ "loss": 0.5536,
+ "step": 2342
+ },
+ {
+ "epoch": 1.1078014184397162,
+ "grad_norm": 2.873119592666626,
+ "learning_rate": 4.618539306152724e-06,
+ "loss": 0.4531,
+ "step": 2343
+ },
+ {
+ "epoch": 1.108274231678487,
+ "grad_norm": 2.701042413711548,
+ "learning_rate": 4.618208031785507e-06,
+ "loss": 0.5217,
+ "step": 2344
+ },
+ {
+ "epoch": 1.1087470449172576,
+ "grad_norm": 2.7189881801605225,
+ "learning_rate": 4.6178766255267635e-06,
+ "loss": 0.6205,
+ "step": 2345
+ },
+ {
+ "epoch": 1.1092198581560284,
+ "grad_norm": 2.546382188796997,
+ "learning_rate": 4.61754508739713e-06,
+ "loss": 0.5475,
+ "step": 2346
+ },
+ {
+ "epoch": 1.109692671394799,
+ "grad_norm": 2.8429276943206787,
+ "learning_rate": 4.617213417417249e-06,
+ "loss": 0.4809,
+ "step": 2347
+ },
+ {
+ "epoch": 1.1101654846335698,
+ "grad_norm": 2.9515812397003174,
+ "learning_rate": 4.616881615607772e-06,
+ "loss": 0.5067,
+ "step": 2348
+ },
+ {
+ "epoch": 1.1106382978723404,
+ "grad_norm": 2.5910723209381104,
+ "learning_rate": 4.616549681989358e-06,
+ "loss": 0.5368,
+ "step": 2349
+ },
+ {
+ "epoch": 1.1111111111111112,
+ "grad_norm": 2.80855655670166,
+ "learning_rate": 4.616217616582678e-06,
+ "loss": 0.5827,
+ "step": 2350
+ },
+ {
+ "epoch": 1.1115839243498817,
+ "grad_norm": 2.604383945465088,
+ "learning_rate": 4.6158854194084044e-06,
+ "loss": 0.5716,
+ "step": 2351
+ },
+ {
+ "epoch": 1.1120567375886525,
+ "grad_norm": 3.0585904121398926,
+ "learning_rate": 4.6155530904872246e-06,
+ "loss": 0.4998,
+ "step": 2352
+ },
+ {
+ "epoch": 1.112529550827423,
+ "grad_norm": 2.660961627960205,
+ "learning_rate": 4.61522062983983e-06,
+ "loss": 0.4533,
+ "step": 2353
+ },
+ {
+ "epoch": 1.113002364066194,
+ "grad_norm": 2.8042070865631104,
+ "learning_rate": 4.614888037486923e-06,
+ "loss": 0.5592,
+ "step": 2354
+ },
+ {
+ "epoch": 1.1134751773049645,
+ "grad_norm": 2.681664228439331,
+ "learning_rate": 4.61455531344921e-06,
+ "loss": 0.5439,
+ "step": 2355
+ },
+ {
+ "epoch": 1.1139479905437353,
+ "grad_norm": 2.905054807662964,
+ "learning_rate": 4.61422245774741e-06,
+ "loss": 0.5497,
+ "step": 2356
+ },
+ {
+ "epoch": 1.1144208037825059,
+ "grad_norm": 2.7979753017425537,
+ "learning_rate": 4.6138894704022484e-06,
+ "loss": 0.5374,
+ "step": 2357
+ },
+ {
+ "epoch": 1.1148936170212767,
+ "grad_norm": 2.965611696243286,
+ "learning_rate": 4.613556351434458e-06,
+ "loss": 0.5145,
+ "step": 2358
+ },
+ {
+ "epoch": 1.1153664302600472,
+ "grad_norm": 2.583134889602661,
+ "learning_rate": 4.613223100864782e-06,
+ "loss": 0.535,
+ "step": 2359
+ },
+ {
+ "epoch": 1.115839243498818,
+ "grad_norm": 2.5979621410369873,
+ "learning_rate": 4.61288971871397e-06,
+ "loss": 0.5514,
+ "step": 2360
+ },
+ {
+ "epoch": 1.1163120567375886,
+ "grad_norm": 3.0117669105529785,
+ "learning_rate": 4.612556205002779e-06,
+ "loss": 0.5266,
+ "step": 2361
+ },
+ {
+ "epoch": 1.1167848699763594,
+ "grad_norm": 2.425133466720581,
+ "learning_rate": 4.612222559751976e-06,
+ "loss": 0.4838,
+ "step": 2362
+ },
+ {
+ "epoch": 1.11725768321513,
+ "grad_norm": 2.5102691650390625,
+ "learning_rate": 4.611888782982337e-06,
+ "loss": 0.3947,
+ "step": 2363
+ },
+ {
+ "epoch": 1.1177304964539008,
+ "grad_norm": 3.0327367782592773,
+ "learning_rate": 4.611554874714645e-06,
+ "loss": 0.5753,
+ "step": 2364
+ },
+ {
+ "epoch": 1.1182033096926713,
+ "grad_norm": 2.4561009407043457,
+ "learning_rate": 4.6112208349696875e-06,
+ "loss": 0.5054,
+ "step": 2365
+ },
+ {
+ "epoch": 1.1186761229314421,
+ "grad_norm": 3.3898050785064697,
+ "learning_rate": 4.610886663768267e-06,
+ "loss": 0.5946,
+ "step": 2366
+ },
+ {
+ "epoch": 1.1191489361702127,
+ "grad_norm": 2.8112242221832275,
+ "learning_rate": 4.61055236113119e-06,
+ "loss": 0.5475,
+ "step": 2367
+ },
+ {
+ "epoch": 1.1196217494089835,
+ "grad_norm": 3.152946710586548,
+ "learning_rate": 4.610217927079272e-06,
+ "loss": 0.5165,
+ "step": 2368
+ },
+ {
+ "epoch": 1.120094562647754,
+ "grad_norm": 2.7847867012023926,
+ "learning_rate": 4.609883361633336e-06,
+ "loss": 0.5533,
+ "step": 2369
+ },
+ {
+ "epoch": 1.1205673758865249,
+ "grad_norm": 2.6376686096191406,
+ "learning_rate": 4.6095486648142155e-06,
+ "loss": 0.4942,
+ "step": 2370
+ },
+ {
+ "epoch": 1.1210401891252955,
+ "grad_norm": 3.123072862625122,
+ "learning_rate": 4.609213836642749e-06,
+ "loss": 0.616,
+ "step": 2371
+ },
+ {
+ "epoch": 1.1215130023640663,
+ "grad_norm": 2.802694320678711,
+ "learning_rate": 4.608878877139786e-06,
+ "loss": 0.5323,
+ "step": 2372
+ },
+ {
+ "epoch": 1.1219858156028368,
+ "grad_norm": 2.3567938804626465,
+ "learning_rate": 4.6085437863261825e-06,
+ "loss": 0.4822,
+ "step": 2373
+ },
+ {
+ "epoch": 1.1224586288416076,
+ "grad_norm": 2.553112030029297,
+ "learning_rate": 4.608208564222804e-06,
+ "loss": 0.5447,
+ "step": 2374
+ },
+ {
+ "epoch": 1.1229314420803782,
+ "grad_norm": 3.0020132064819336,
+ "learning_rate": 4.607873210850521e-06,
+ "loss": 0.6486,
+ "step": 2375
+ },
+ {
+ "epoch": 1.123404255319149,
+ "grad_norm": 2.832442045211792,
+ "learning_rate": 4.607537726230216e-06,
+ "loss": 0.5257,
+ "step": 2376
+ },
+ {
+ "epoch": 1.1238770685579196,
+ "grad_norm": 2.471527099609375,
+ "learning_rate": 4.607202110382778e-06,
+ "loss": 0.4816,
+ "step": 2377
+ },
+ {
+ "epoch": 1.1243498817966904,
+ "grad_norm": 2.4232118129730225,
+ "learning_rate": 4.606866363329105e-06,
+ "loss": 0.5533,
+ "step": 2378
+ },
+ {
+ "epoch": 1.124822695035461,
+ "grad_norm": 2.477506637573242,
+ "learning_rate": 4.6065304850901025e-06,
+ "loss": 0.5223,
+ "step": 2379
+ },
+ {
+ "epoch": 1.1252955082742317,
+ "grad_norm": 3.54127836227417,
+ "learning_rate": 4.6061944756866824e-06,
+ "loss": 0.6514,
+ "step": 2380
+ },
+ {
+ "epoch": 1.1257683215130023,
+ "grad_norm": 2.5148677825927734,
+ "learning_rate": 4.605858335139768e-06,
+ "loss": 0.4864,
+ "step": 2381
+ },
+ {
+ "epoch": 1.1262411347517731,
+ "grad_norm": 2.8363659381866455,
+ "learning_rate": 4.605522063470289e-06,
+ "loss": 0.5034,
+ "step": 2382
+ },
+ {
+ "epoch": 1.1267139479905437,
+ "grad_norm": 2.4996654987335205,
+ "learning_rate": 4.605185660699184e-06,
+ "loss": 0.4126,
+ "step": 2383
+ },
+ {
+ "epoch": 1.1271867612293145,
+ "grad_norm": 2.352543830871582,
+ "learning_rate": 4.604849126847398e-06,
+ "loss": 0.5224,
+ "step": 2384
+ },
+ {
+ "epoch": 1.127659574468085,
+ "grad_norm": 2.60101056098938,
+ "learning_rate": 4.6045124619358875e-06,
+ "loss": 0.4867,
+ "step": 2385
+ },
+ {
+ "epoch": 1.1281323877068559,
+ "grad_norm": 2.9471068382263184,
+ "learning_rate": 4.604175665985613e-06,
+ "loss": 0.6474,
+ "step": 2386
+ },
+ {
+ "epoch": 1.1286052009456264,
+ "grad_norm": 2.5933351516723633,
+ "learning_rate": 4.603838739017546e-06,
+ "loss": 0.5081,
+ "step": 2387
+ },
+ {
+ "epoch": 1.1290780141843972,
+ "grad_norm": 2.3740346431732178,
+ "learning_rate": 4.6035016810526665e-06,
+ "loss": 0.4438,
+ "step": 2388
+ },
+ {
+ "epoch": 1.1295508274231678,
+ "grad_norm": 2.675020217895508,
+ "learning_rate": 4.6031644921119614e-06,
+ "loss": 0.4968,
+ "step": 2389
+ },
+ {
+ "epoch": 1.1300236406619386,
+ "grad_norm": 2.599472999572754,
+ "learning_rate": 4.602827172216424e-06,
+ "loss": 0.5131,
+ "step": 2390
+ },
+ {
+ "epoch": 1.1304964539007092,
+ "grad_norm": 2.8176097869873047,
+ "learning_rate": 4.602489721387061e-06,
+ "loss": 0.5549,
+ "step": 2391
+ },
+ {
+ "epoch": 1.13096926713948,
+ "grad_norm": 2.466914176940918,
+ "learning_rate": 4.602152139644881e-06,
+ "loss": 0.5052,
+ "step": 2392
+ },
+ {
+ "epoch": 1.1314420803782506,
+ "grad_norm": 2.8938796520233154,
+ "learning_rate": 4.601814427010905e-06,
+ "loss": 0.6181,
+ "step": 2393
+ },
+ {
+ "epoch": 1.1319148936170214,
+ "grad_norm": 2.7390825748443604,
+ "learning_rate": 4.601476583506161e-06,
+ "loss": 0.5178,
+ "step": 2394
+ },
+ {
+ "epoch": 1.132387706855792,
+ "grad_norm": 3.180112838745117,
+ "learning_rate": 4.601138609151685e-06,
+ "loss": 0.6071,
+ "step": 2395
+ },
+ {
+ "epoch": 1.1328605200945627,
+ "grad_norm": 2.9282350540161133,
+ "learning_rate": 4.600800503968521e-06,
+ "loss": 0.5557,
+ "step": 2396
+ },
+ {
+ "epoch": 1.1333333333333333,
+ "grad_norm": 2.6689717769622803,
+ "learning_rate": 4.6004622679777215e-06,
+ "loss": 0.4679,
+ "step": 2397
+ },
+ {
+ "epoch": 1.133806146572104,
+ "grad_norm": 2.651582956314087,
+ "learning_rate": 4.600123901200347e-06,
+ "loss": 0.4907,
+ "step": 2398
+ },
+ {
+ "epoch": 1.1342789598108747,
+ "grad_norm": 2.5702924728393555,
+ "learning_rate": 4.599785403657464e-06,
+ "loss": 0.4919,
+ "step": 2399
+ },
+ {
+ "epoch": 1.1347517730496455,
+ "grad_norm": 2.636812448501587,
+ "learning_rate": 4.599446775370153e-06,
+ "loss": 0.5091,
+ "step": 2400
+ },
+ {
+ "epoch": 1.135224586288416,
+ "grad_norm": 2.5965442657470703,
+ "learning_rate": 4.599108016359497e-06,
+ "loss": 0.5035,
+ "step": 2401
+ },
+ {
+ "epoch": 1.1356973995271868,
+ "grad_norm": 2.689732313156128,
+ "learning_rate": 4.5987691266465885e-06,
+ "loss": 0.5307,
+ "step": 2402
+ },
+ {
+ "epoch": 1.1361702127659574,
+ "grad_norm": 2.7256956100463867,
+ "learning_rate": 4.59843010625253e-06,
+ "loss": 0.5066,
+ "step": 2403
+ },
+ {
+ "epoch": 1.1366430260047282,
+ "grad_norm": 2.726020574569702,
+ "learning_rate": 4.59809095519843e-06,
+ "loss": 0.4805,
+ "step": 2404
+ },
+ {
+ "epoch": 1.1371158392434988,
+ "grad_norm": 2.703339099884033,
+ "learning_rate": 4.597751673505406e-06,
+ "loss": 0.4992,
+ "step": 2405
+ },
+ {
+ "epoch": 1.1375886524822696,
+ "grad_norm": 2.54455304145813,
+ "learning_rate": 4.5974122611945835e-06,
+ "loss": 0.5251,
+ "step": 2406
+ },
+ {
+ "epoch": 1.1380614657210402,
+ "grad_norm": 2.623507022857666,
+ "learning_rate": 4.597072718287096e-06,
+ "loss": 0.4831,
+ "step": 2407
+ },
+ {
+ "epoch": 1.138534278959811,
+ "grad_norm": 2.653590202331543,
+ "learning_rate": 4.596733044804086e-06,
+ "loss": 0.5646,
+ "step": 2408
+ },
+ {
+ "epoch": 1.1390070921985815,
+ "grad_norm": 2.8230600357055664,
+ "learning_rate": 4.5963932407667035e-06,
+ "loss": 0.514,
+ "step": 2409
+ },
+ {
+ "epoch": 1.1394799054373523,
+ "grad_norm": 2.6077451705932617,
+ "learning_rate": 4.5960533061961065e-06,
+ "loss": 0.4713,
+ "step": 2410
+ },
+ {
+ "epoch": 1.139952718676123,
+ "grad_norm": 2.3945798873901367,
+ "learning_rate": 4.595713241113461e-06,
+ "loss": 0.466,
+ "step": 2411
+ },
+ {
+ "epoch": 1.1404255319148937,
+ "grad_norm": 2.8100006580352783,
+ "learning_rate": 4.595373045539941e-06,
+ "loss": 0.5365,
+ "step": 2412
+ },
+ {
+ "epoch": 1.1408983451536643,
+ "grad_norm": 2.6825881004333496,
+ "learning_rate": 4.59503271949673e-06,
+ "loss": 0.4457,
+ "step": 2413
+ },
+ {
+ "epoch": 1.141371158392435,
+ "grad_norm": 2.969435691833496,
+ "learning_rate": 4.594692263005016e-06,
+ "loss": 0.5459,
+ "step": 2414
+ },
+ {
+ "epoch": 1.1418439716312057,
+ "grad_norm": 2.4103164672851562,
+ "learning_rate": 4.594351676086002e-06,
+ "loss": 0.4573,
+ "step": 2415
+ },
+ {
+ "epoch": 1.1423167848699765,
+ "grad_norm": 2.9450128078460693,
+ "learning_rate": 4.594010958760892e-06,
+ "loss": 0.5529,
+ "step": 2416
+ },
+ {
+ "epoch": 1.142789598108747,
+ "grad_norm": 2.6416335105895996,
+ "learning_rate": 4.593670111050901e-06,
+ "loss": 0.5153,
+ "step": 2417
+ },
+ {
+ "epoch": 1.1432624113475178,
+ "grad_norm": 2.473177194595337,
+ "learning_rate": 4.593329132977253e-06,
+ "loss": 0.4962,
+ "step": 2418
+ },
+ {
+ "epoch": 1.1437352245862884,
+ "grad_norm": 2.4494502544403076,
+ "learning_rate": 4.592988024561179e-06,
+ "loss": 0.5182,
+ "step": 2419
+ },
+ {
+ "epoch": 1.1442080378250592,
+ "grad_norm": 2.773930311203003,
+ "learning_rate": 4.592646785823918e-06,
+ "loss": 0.4442,
+ "step": 2420
+ },
+ {
+ "epoch": 1.1446808510638298,
+ "grad_norm": 2.4733314514160156,
+ "learning_rate": 4.592305416786718e-06,
+ "loss": 0.5106,
+ "step": 2421
+ },
+ {
+ "epoch": 1.1451536643026006,
+ "grad_norm": 2.6870038509368896,
+ "learning_rate": 4.591963917470834e-06,
+ "loss": 0.5316,
+ "step": 2422
+ },
+ {
+ "epoch": 1.1456264775413711,
+ "grad_norm": 2.8989531993865967,
+ "learning_rate": 4.591622287897529e-06,
+ "loss": 0.5906,
+ "step": 2423
+ },
+ {
+ "epoch": 1.1460992907801417,
+ "grad_norm": 2.6349124908447266,
+ "learning_rate": 4.591280528088077e-06,
+ "loss": 0.6225,
+ "step": 2424
+ },
+ {
+ "epoch": 1.1465721040189125,
+ "grad_norm": 3.19022274017334,
+ "learning_rate": 4.5909386380637555e-06,
+ "loss": 0.555,
+ "step": 2425
+ },
+ {
+ "epoch": 1.1470449172576833,
+ "grad_norm": 3.1473541259765625,
+ "learning_rate": 4.5905966178458535e-06,
+ "loss": 0.537,
+ "step": 2426
+ },
+ {
+ "epoch": 1.147517730496454,
+ "grad_norm": 2.6996145248413086,
+ "learning_rate": 4.590254467455667e-06,
+ "loss": 0.565,
+ "step": 2427
+ },
+ {
+ "epoch": 1.1479905437352245,
+ "grad_norm": 2.830188274383545,
+ "learning_rate": 4.5899121869145015e-06,
+ "loss": 0.6773,
+ "step": 2428
+ },
+ {
+ "epoch": 1.1484633569739953,
+ "grad_norm": 2.4937260150909424,
+ "learning_rate": 4.589569776243667e-06,
+ "loss": 0.5484,
+ "step": 2429
+ },
+ {
+ "epoch": 1.148936170212766,
+ "grad_norm": 2.54011869430542,
+ "learning_rate": 4.589227235464486e-06,
+ "loss": 0.5307,
+ "step": 2430
+ },
+ {
+ "epoch": 1.1494089834515366,
+ "grad_norm": 2.8764214515686035,
+ "learning_rate": 4.5888845645982845e-06,
+ "loss": 0.5296,
+ "step": 2431
+ },
+ {
+ "epoch": 1.1498817966903072,
+ "grad_norm": 2.637033462524414,
+ "learning_rate": 4.588541763666402e-06,
+ "loss": 0.5975,
+ "step": 2432
+ },
+ {
+ "epoch": 1.150354609929078,
+ "grad_norm": 2.8534255027770996,
+ "learning_rate": 4.5881988326901815e-06,
+ "loss": 0.5431,
+ "step": 2433
+ },
+ {
+ "epoch": 1.1508274231678488,
+ "grad_norm": 2.8546559810638428,
+ "learning_rate": 4.587855771690976e-06,
+ "loss": 0.469,
+ "step": 2434
+ },
+ {
+ "epoch": 1.1513002364066194,
+ "grad_norm": 2.9084973335266113,
+ "learning_rate": 4.587512580690146e-06,
+ "loss": 0.5566,
+ "step": 2435
+ },
+ {
+ "epoch": 1.15177304964539,
+ "grad_norm": 3.0993130207061768,
+ "learning_rate": 4.587169259709063e-06,
+ "loss": 0.5612,
+ "step": 2436
+ },
+ {
+ "epoch": 1.1522458628841608,
+ "grad_norm": 10.847400665283203,
+ "learning_rate": 4.5868258087691e-06,
+ "loss": 0.4678,
+ "step": 2437
+ },
+ {
+ "epoch": 1.1527186761229316,
+ "grad_norm": 2.6648571491241455,
+ "learning_rate": 4.586482227891645e-06,
+ "loss": 0.5951,
+ "step": 2438
+ },
+ {
+ "epoch": 1.1531914893617021,
+ "grad_norm": 2.529043197631836,
+ "learning_rate": 4.586138517098091e-06,
+ "loss": 0.5048,
+ "step": 2439
+ },
+ {
+ "epoch": 1.1536643026004727,
+ "grad_norm": 2.833904504776001,
+ "learning_rate": 4.585794676409839e-06,
+ "loss": 0.536,
+ "step": 2440
+ },
+ {
+ "epoch": 1.1541371158392435,
+ "grad_norm": 3.507657766342163,
+ "learning_rate": 4.585450705848298e-06,
+ "loss": 0.5954,
+ "step": 2441
+ },
+ {
+ "epoch": 1.1546099290780143,
+ "grad_norm": 2.6108388900756836,
+ "learning_rate": 4.585106605434887e-06,
+ "loss": 0.5684,
+ "step": 2442
+ },
+ {
+ "epoch": 1.1550827423167849,
+ "grad_norm": 2.490708589553833,
+ "learning_rate": 4.58476237519103e-06,
+ "loss": 0.4678,
+ "step": 2443
+ },
+ {
+ "epoch": 1.1555555555555554,
+ "grad_norm": 2.8192343711853027,
+ "learning_rate": 4.584418015138161e-06,
+ "loss": 0.5291,
+ "step": 2444
+ },
+ {
+ "epoch": 1.1560283687943262,
+ "grad_norm": 3.0878679752349854,
+ "learning_rate": 4.584073525297722e-06,
+ "loss": 0.5691,
+ "step": 2445
+ },
+ {
+ "epoch": 1.156501182033097,
+ "grad_norm": 3.1444318294525146,
+ "learning_rate": 4.583728905691163e-06,
+ "loss": 0.5643,
+ "step": 2446
+ },
+ {
+ "epoch": 1.1569739952718676,
+ "grad_norm": 3.02382230758667,
+ "learning_rate": 4.583384156339942e-06,
+ "loss": 0.6008,
+ "step": 2447
+ },
+ {
+ "epoch": 1.1574468085106382,
+ "grad_norm": 2.5942490100860596,
+ "learning_rate": 4.583039277265525e-06,
+ "loss": 0.5105,
+ "step": 2448
+ },
+ {
+ "epoch": 1.157919621749409,
+ "grad_norm": 2.938608407974243,
+ "learning_rate": 4.582694268489386e-06,
+ "loss": 0.5123,
+ "step": 2449
+ },
+ {
+ "epoch": 1.1583924349881798,
+ "grad_norm": 2.4622268676757812,
+ "learning_rate": 4.5823491300330075e-06,
+ "loss": 0.4538,
+ "step": 2450
+ },
+ {
+ "epoch": 1.1588652482269504,
+ "grad_norm": 2.4380505084991455,
+ "learning_rate": 4.5820038619178795e-06,
+ "loss": 0.4682,
+ "step": 2451
+ },
+ {
+ "epoch": 1.159338061465721,
+ "grad_norm": 2.479896068572998,
+ "learning_rate": 4.581658464165501e-06,
+ "loss": 0.4877,
+ "step": 2452
+ },
+ {
+ "epoch": 1.1598108747044917,
+ "grad_norm": 2.3373546600341797,
+ "learning_rate": 4.5813129367973765e-06,
+ "loss": 0.445,
+ "step": 2453
+ },
+ {
+ "epoch": 1.1602836879432625,
+ "grad_norm": 2.8586013317108154,
+ "learning_rate": 4.5809672798350214e-06,
+ "loss": 0.5232,
+ "step": 2454
+ },
+ {
+ "epoch": 1.160756501182033,
+ "grad_norm": 3.2302439212799072,
+ "learning_rate": 4.5806214932999595e-06,
+ "loss": 0.5336,
+ "step": 2455
+ },
+ {
+ "epoch": 1.1612293144208037,
+ "grad_norm": 3.1005783081054688,
+ "learning_rate": 4.580275577213721e-06,
+ "loss": 0.5123,
+ "step": 2456
+ },
+ {
+ "epoch": 1.1617021276595745,
+ "grad_norm": 2.7131073474884033,
+ "learning_rate": 4.579929531597842e-06,
+ "loss": 0.5648,
+ "step": 2457
+ },
+ {
+ "epoch": 1.1621749408983453,
+ "grad_norm": 2.5067050457000732,
+ "learning_rate": 4.579583356473874e-06,
+ "loss": 0.5324,
+ "step": 2458
+ },
+ {
+ "epoch": 1.1626477541371159,
+ "grad_norm": 2.7870543003082275,
+ "learning_rate": 4.579237051863366e-06,
+ "loss": 0.5094,
+ "step": 2459
+ },
+ {
+ "epoch": 1.1631205673758864,
+ "grad_norm": 2.739196300506592,
+ "learning_rate": 4.578890617787887e-06,
+ "loss": 0.5103,
+ "step": 2460
+ },
+ {
+ "epoch": 1.1635933806146572,
+ "grad_norm": 2.7108185291290283,
+ "learning_rate": 4.578544054269003e-06,
+ "loss": 0.533,
+ "step": 2461
+ },
+ {
+ "epoch": 1.1640661938534278,
+ "grad_norm": 3.028005361557007,
+ "learning_rate": 4.578197361328295e-06,
+ "loss": 0.636,
+ "step": 2462
+ },
+ {
+ "epoch": 1.1645390070921986,
+ "grad_norm": 2.4855129718780518,
+ "learning_rate": 4.5778505389873505e-06,
+ "loss": 0.501,
+ "step": 2463
+ },
+ {
+ "epoch": 1.1650118203309692,
+ "grad_norm": 2.6314198970794678,
+ "learning_rate": 4.577503587267764e-06,
+ "loss": 0.5812,
+ "step": 2464
+ },
+ {
+ "epoch": 1.16548463356974,
+ "grad_norm": 2.4209671020507812,
+ "learning_rate": 4.5771565061911385e-06,
+ "loss": 0.5168,
+ "step": 2465
+ },
+ {
+ "epoch": 1.1659574468085105,
+ "grad_norm": 2.526388645172119,
+ "learning_rate": 4.576809295779085e-06,
+ "loss": 0.5047,
+ "step": 2466
+ },
+ {
+ "epoch": 1.1664302600472813,
+ "grad_norm": 2.8278191089630127,
+ "learning_rate": 4.576461956053224e-06,
+ "loss": 0.4759,
+ "step": 2467
+ },
+ {
+ "epoch": 1.166903073286052,
+ "grad_norm": 2.7862167358398438,
+ "learning_rate": 4.576114487035182e-06,
+ "loss": 0.5492,
+ "step": 2468
+ },
+ {
+ "epoch": 1.1673758865248227,
+ "grad_norm": 2.6303019523620605,
+ "learning_rate": 4.575766888746594e-06,
+ "loss": 0.5538,
+ "step": 2469
+ },
+ {
+ "epoch": 1.1678486997635933,
+ "grad_norm": 2.613104820251465,
+ "learning_rate": 4.5754191612091034e-06,
+ "loss": 0.5114,
+ "step": 2470
+ },
+ {
+ "epoch": 1.168321513002364,
+ "grad_norm": 2.653958320617676,
+ "learning_rate": 4.5750713044443625e-06,
+ "loss": 0.5858,
+ "step": 2471
+ },
+ {
+ "epoch": 1.1687943262411347,
+ "grad_norm": 3.1143975257873535,
+ "learning_rate": 4.574723318474031e-06,
+ "loss": 0.5193,
+ "step": 2472
+ },
+ {
+ "epoch": 1.1692671394799055,
+ "grad_norm": 3.05454421043396,
+ "learning_rate": 4.574375203319775e-06,
+ "loss": 0.464,
+ "step": 2473
+ },
+ {
+ "epoch": 1.169739952718676,
+ "grad_norm": 2.66626238822937,
+ "learning_rate": 4.574026959003272e-06,
+ "loss": 0.4988,
+ "step": 2474
+ },
+ {
+ "epoch": 1.1702127659574468,
+ "grad_norm": 2.8871963024139404,
+ "learning_rate": 4.573678585546203e-06,
+ "loss": 0.5557,
+ "step": 2475
+ },
+ {
+ "epoch": 1.1706855791962174,
+ "grad_norm": 2.592949628829956,
+ "learning_rate": 4.573330082970262e-06,
+ "loss": 0.5178,
+ "step": 2476
+ },
+ {
+ "epoch": 1.1711583924349882,
+ "grad_norm": 2.9111456871032715,
+ "learning_rate": 4.572981451297148e-06,
+ "loss": 0.5712,
+ "step": 2477
+ },
+ {
+ "epoch": 1.1716312056737588,
+ "grad_norm": 2.8152248859405518,
+ "learning_rate": 4.57263269054857e-06,
+ "loss": 0.5548,
+ "step": 2478
+ },
+ {
+ "epoch": 1.1721040189125296,
+ "grad_norm": 3.0292418003082275,
+ "learning_rate": 4.572283800746241e-06,
+ "loss": 0.5937,
+ "step": 2479
+ },
+ {
+ "epoch": 1.1725768321513002,
+ "grad_norm": 3.454618215560913,
+ "learning_rate": 4.571934781911886e-06,
+ "loss": 0.5537,
+ "step": 2480
+ },
+ {
+ "epoch": 1.173049645390071,
+ "grad_norm": 2.7817866802215576,
+ "learning_rate": 4.571585634067239e-06,
+ "loss": 0.5649,
+ "step": 2481
+ },
+ {
+ "epoch": 1.1735224586288415,
+ "grad_norm": 2.7989349365234375,
+ "learning_rate": 4.571236357234037e-06,
+ "loss": 0.5448,
+ "step": 2482
+ },
+ {
+ "epoch": 1.1739952718676123,
+ "grad_norm": 2.8863933086395264,
+ "learning_rate": 4.57088695143403e-06,
+ "loss": 0.63,
+ "step": 2483
+ },
+ {
+ "epoch": 1.174468085106383,
+ "grad_norm": 2.5738039016723633,
+ "learning_rate": 4.570537416688972e-06,
+ "loss": 0.4702,
+ "step": 2484
+ },
+ {
+ "epoch": 1.1749408983451537,
+ "grad_norm": 3.003643274307251,
+ "learning_rate": 4.570187753020629e-06,
+ "loss": 0.5918,
+ "step": 2485
+ },
+ {
+ "epoch": 1.1754137115839243,
+ "grad_norm": 2.8619167804718018,
+ "learning_rate": 4.569837960450772e-06,
+ "loss": 0.5268,
+ "step": 2486
+ },
+ {
+ "epoch": 1.175886524822695,
+ "grad_norm": 2.876077175140381,
+ "learning_rate": 4.569488039001181e-06,
+ "loss": 0.4915,
+ "step": 2487
+ },
+ {
+ "epoch": 1.1763593380614656,
+ "grad_norm": 3.407115936279297,
+ "learning_rate": 4.569137988693644e-06,
+ "loss": 0.5761,
+ "step": 2488
+ },
+ {
+ "epoch": 1.1768321513002364,
+ "grad_norm": 2.7292826175689697,
+ "learning_rate": 4.568787809549958e-06,
+ "loss": 0.541,
+ "step": 2489
+ },
+ {
+ "epoch": 1.177304964539007,
+ "grad_norm": 2.8805999755859375,
+ "learning_rate": 4.568437501591926e-06,
+ "loss": 0.6223,
+ "step": 2490
+ },
+ {
+ "epoch": 1.1777777777777778,
+ "grad_norm": 2.9264373779296875,
+ "learning_rate": 4.56808706484136e-06,
+ "loss": 0.6081,
+ "step": 2491
+ },
+ {
+ "epoch": 1.1782505910165484,
+ "grad_norm": 2.5167033672332764,
+ "learning_rate": 4.567736499320082e-06,
+ "loss": 0.5393,
+ "step": 2492
+ },
+ {
+ "epoch": 1.1787234042553192,
+ "grad_norm": 3.4647862911224365,
+ "learning_rate": 4.567385805049918e-06,
+ "loss": 0.4826,
+ "step": 2493
+ },
+ {
+ "epoch": 1.1791962174940898,
+ "grad_norm": 2.9824202060699463,
+ "learning_rate": 4.5670349820527055e-06,
+ "loss": 0.541,
+ "step": 2494
+ },
+ {
+ "epoch": 1.1796690307328606,
+ "grad_norm": 2.997105836868286,
+ "learning_rate": 4.5666840303502885e-06,
+ "loss": 0.5771,
+ "step": 2495
+ },
+ {
+ "epoch": 1.1801418439716311,
+ "grad_norm": 2.8728017807006836,
+ "learning_rate": 4.56633294996452e-06,
+ "loss": 0.4877,
+ "step": 2496
+ },
+ {
+ "epoch": 1.180614657210402,
+ "grad_norm": 2.626498222351074,
+ "learning_rate": 4.5659817409172565e-06,
+ "loss": 0.5296,
+ "step": 2497
+ },
+ {
+ "epoch": 1.1810874704491725,
+ "grad_norm": 2.87037992477417,
+ "learning_rate": 4.565630403230371e-06,
+ "loss": 0.539,
+ "step": 2498
+ },
+ {
+ "epoch": 1.1815602836879433,
+ "grad_norm": 2.5719685554504395,
+ "learning_rate": 4.5652789369257375e-06,
+ "loss": 0.5653,
+ "step": 2499
+ },
+ {
+ "epoch": 1.1820330969267139,
+ "grad_norm": 2.4842135906219482,
+ "learning_rate": 4.56492734202524e-06,
+ "loss": 0.515,
+ "step": 2500
+ },
+ {
+ "epoch": 1.1825059101654847,
+ "grad_norm": 2.640951156616211,
+ "learning_rate": 4.564575618550773e-06,
+ "loss": 0.5601,
+ "step": 2501
+ },
+ {
+ "epoch": 1.1829787234042553,
+ "grad_norm": 2.624394655227661,
+ "learning_rate": 4.564223766524234e-06,
+ "loss": 0.5551,
+ "step": 2502
+ },
+ {
+ "epoch": 1.183451536643026,
+ "grad_norm": 3.014537811279297,
+ "learning_rate": 4.563871785967533e-06,
+ "loss": 0.5212,
+ "step": 2503
+ },
+ {
+ "epoch": 1.1839243498817966,
+ "grad_norm": 2.8756890296936035,
+ "learning_rate": 4.563519676902585e-06,
+ "loss": 0.5132,
+ "step": 2504
+ },
+ {
+ "epoch": 1.1843971631205674,
+ "grad_norm": 2.636781692504883,
+ "learning_rate": 4.5631674393513145e-06,
+ "loss": 0.5323,
+ "step": 2505
+ },
+ {
+ "epoch": 1.184869976359338,
+ "grad_norm": 2.7233786582946777,
+ "learning_rate": 4.562815073335655e-06,
+ "loss": 0.5608,
+ "step": 2506
+ },
+ {
+ "epoch": 1.1853427895981088,
+ "grad_norm": 2.7158713340759277,
+ "learning_rate": 4.562462578877546e-06,
+ "loss": 0.5373,
+ "step": 2507
+ },
+ {
+ "epoch": 1.1858156028368794,
+ "grad_norm": 2.9754762649536133,
+ "learning_rate": 4.562109955998936e-06,
+ "loss": 0.5712,
+ "step": 2508
+ },
+ {
+ "epoch": 1.1862884160756502,
+ "grad_norm": 2.8815054893493652,
+ "learning_rate": 4.561757204721781e-06,
+ "loss": 0.6126,
+ "step": 2509
+ },
+ {
+ "epoch": 1.1867612293144207,
+ "grad_norm": 2.866319417953491,
+ "learning_rate": 4.561404325068045e-06,
+ "loss": 0.506,
+ "step": 2510
+ },
+ {
+ "epoch": 1.1872340425531915,
+ "grad_norm": 2.6187376976013184,
+ "learning_rate": 4.561051317059701e-06,
+ "loss": 0.4674,
+ "step": 2511
+ },
+ {
+ "epoch": 1.1877068557919621,
+ "grad_norm": 2.642552137374878,
+ "learning_rate": 4.560698180718729e-06,
+ "loss": 0.4793,
+ "step": 2512
+ },
+ {
+ "epoch": 1.188179669030733,
+ "grad_norm": 2.7815041542053223,
+ "learning_rate": 4.560344916067117e-06,
+ "loss": 0.5034,
+ "step": 2513
+ },
+ {
+ "epoch": 1.1886524822695035,
+ "grad_norm": 2.70853590965271,
+ "learning_rate": 4.559991523126862e-06,
+ "loss": 0.4811,
+ "step": 2514
+ },
+ {
+ "epoch": 1.1891252955082743,
+ "grad_norm": 2.7049436569213867,
+ "learning_rate": 4.559638001919967e-06,
+ "loss": 0.547,
+ "step": 2515
+ },
+ {
+ "epoch": 1.1895981087470449,
+ "grad_norm": 2.766773223876953,
+ "learning_rate": 4.559284352468445e-06,
+ "loss": 0.5362,
+ "step": 2516
+ },
+ {
+ "epoch": 1.1900709219858157,
+ "grad_norm": 3.0064334869384766,
+ "learning_rate": 4.558930574794316e-06,
+ "loss": 0.5915,
+ "step": 2517
+ },
+ {
+ "epoch": 1.1905437352245862,
+ "grad_norm": 2.4899885654449463,
+ "learning_rate": 4.558576668919609e-06,
+ "loss": 0.4379,
+ "step": 2518
+ },
+ {
+ "epoch": 1.191016548463357,
+ "grad_norm": 2.925963878631592,
+ "learning_rate": 4.558222634866358e-06,
+ "loss": 0.5389,
+ "step": 2519
+ },
+ {
+ "epoch": 1.1914893617021276,
+ "grad_norm": 6.087667465209961,
+ "learning_rate": 4.55786847265661e-06,
+ "loss": 0.4777,
+ "step": 2520
+ },
+ {
+ "epoch": 1.1919621749408984,
+ "grad_norm": 2.4560582637786865,
+ "learning_rate": 4.5575141823124145e-06,
+ "loss": 0.5576,
+ "step": 2521
+ },
+ {
+ "epoch": 1.192434988179669,
+ "grad_norm": 3.184252977371216,
+ "learning_rate": 4.557159763855834e-06,
+ "loss": 0.5151,
+ "step": 2522
+ },
+ {
+ "epoch": 1.1929078014184398,
+ "grad_norm": 2.359722137451172,
+ "learning_rate": 4.556805217308935e-06,
+ "loss": 0.478,
+ "step": 2523
+ },
+ {
+ "epoch": 1.1933806146572103,
+ "grad_norm": 3.0821568965911865,
+ "learning_rate": 4.5564505426937935e-06,
+ "loss": 0.5784,
+ "step": 2524
+ },
+ {
+ "epoch": 1.1938534278959811,
+ "grad_norm": 2.9905128479003906,
+ "learning_rate": 4.5560957400324936e-06,
+ "loss": 0.6087,
+ "step": 2525
+ },
+ {
+ "epoch": 1.1943262411347517,
+ "grad_norm": 2.462102174758911,
+ "learning_rate": 4.555740809347128e-06,
+ "loss": 0.4739,
+ "step": 2526
+ },
+ {
+ "epoch": 1.1947990543735225,
+ "grad_norm": 2.7931067943573,
+ "learning_rate": 4.555385750659796e-06,
+ "loss": 0.4961,
+ "step": 2527
+ },
+ {
+ "epoch": 1.195271867612293,
+ "grad_norm": 2.660320997238159,
+ "learning_rate": 4.555030563992607e-06,
+ "loss": 0.487,
+ "step": 2528
+ },
+ {
+ "epoch": 1.195744680851064,
+ "grad_norm": 2.8135557174682617,
+ "learning_rate": 4.554675249367675e-06,
+ "loss": 0.5269,
+ "step": 2529
+ },
+ {
+ "epoch": 1.1962174940898345,
+ "grad_norm": 2.661933422088623,
+ "learning_rate": 4.554319806807126e-06,
+ "loss": 0.4723,
+ "step": 2530
+ },
+ {
+ "epoch": 1.1966903073286053,
+ "grad_norm": 2.568176507949829,
+ "learning_rate": 4.553964236333089e-06,
+ "loss": 0.5258,
+ "step": 2531
+ },
+ {
+ "epoch": 1.1971631205673758,
+ "grad_norm": 2.6890947818756104,
+ "learning_rate": 4.553608537967705e-06,
+ "loss": 0.4965,
+ "step": 2532
+ },
+ {
+ "epoch": 1.1976359338061466,
+ "grad_norm": 3.133470058441162,
+ "learning_rate": 4.553252711733124e-06,
+ "loss": 0.5423,
+ "step": 2533
+ },
+ {
+ "epoch": 1.1981087470449172,
+ "grad_norm": 2.7086687088012695,
+ "learning_rate": 4.552896757651498e-06,
+ "loss": 0.5326,
+ "step": 2534
+ },
+ {
+ "epoch": 1.198581560283688,
+ "grad_norm": 2.8411715030670166,
+ "learning_rate": 4.552540675744994e-06,
+ "loss": 0.5793,
+ "step": 2535
+ },
+ {
+ "epoch": 1.1990543735224586,
+ "grad_norm": 3.041077136993408,
+ "learning_rate": 4.552184466035782e-06,
+ "loss": 0.5068,
+ "step": 2536
+ },
+ {
+ "epoch": 1.1995271867612294,
+ "grad_norm": 2.5921192169189453,
+ "learning_rate": 4.551828128546041e-06,
+ "loss": 0.5189,
+ "step": 2537
+ },
+ {
+ "epoch": 1.2,
+ "grad_norm": 2.923305034637451,
+ "learning_rate": 4.5514716632979605e-06,
+ "loss": 0.516,
+ "step": 2538
+ },
+ {
+ "epoch": 1.2004728132387708,
+ "grad_norm": 2.7083024978637695,
+ "learning_rate": 4.551115070313734e-06,
+ "loss": 0.4825,
+ "step": 2539
+ },
+ {
+ "epoch": 1.2009456264775413,
+ "grad_norm": 2.746842384338379,
+ "learning_rate": 4.550758349615567e-06,
+ "loss": 0.5691,
+ "step": 2540
+ },
+ {
+ "epoch": 1.2014184397163121,
+ "grad_norm": 2.6596429347991943,
+ "learning_rate": 4.550401501225669e-06,
+ "loss": 0.5983,
+ "step": 2541
+ },
+ {
+ "epoch": 1.2018912529550827,
+ "grad_norm": 2.9057931900024414,
+ "learning_rate": 4.550044525166261e-06,
+ "loss": 0.5069,
+ "step": 2542
+ },
+ {
+ "epoch": 1.2023640661938535,
+ "grad_norm": 2.6139039993286133,
+ "learning_rate": 4.5496874214595686e-06,
+ "loss": 0.5102,
+ "step": 2543
+ },
+ {
+ "epoch": 1.202836879432624,
+ "grad_norm": 2.630286455154419,
+ "learning_rate": 4.5493301901278285e-06,
+ "loss": 0.4902,
+ "step": 2544
+ },
+ {
+ "epoch": 1.2033096926713949,
+ "grad_norm": 2.639174222946167,
+ "learning_rate": 4.548972831193284e-06,
+ "loss": 0.4566,
+ "step": 2545
+ },
+ {
+ "epoch": 1.2037825059101654,
+ "grad_norm": 2.9569664001464844,
+ "learning_rate": 4.548615344678186e-06,
+ "loss": 0.5636,
+ "step": 2546
+ },
+ {
+ "epoch": 1.2042553191489362,
+ "grad_norm": 2.981734037399292,
+ "learning_rate": 4.5482577306047924e-06,
+ "loss": 0.4884,
+ "step": 2547
+ },
+ {
+ "epoch": 1.2047281323877068,
+ "grad_norm": 2.6760342121124268,
+ "learning_rate": 4.547899988995371e-06,
+ "loss": 0.5426,
+ "step": 2548
+ },
+ {
+ "epoch": 1.2052009456264776,
+ "grad_norm": 2.825805902481079,
+ "learning_rate": 4.547542119872198e-06,
+ "loss": 0.4989,
+ "step": 2549
+ },
+ {
+ "epoch": 1.2056737588652482,
+ "grad_norm": 2.856426954269409,
+ "learning_rate": 4.547184123257555e-06,
+ "loss": 0.5734,
+ "step": 2550
+ },
+ {
+ "epoch": 1.206146572104019,
+ "grad_norm": 2.555682420730591,
+ "learning_rate": 4.5468259991737334e-06,
+ "loss": 0.5299,
+ "step": 2551
+ },
+ {
+ "epoch": 1.2066193853427896,
+ "grad_norm": 2.6324024200439453,
+ "learning_rate": 4.546467747643032e-06,
+ "loss": 0.5906,
+ "step": 2552
+ },
+ {
+ "epoch": 1.2070921985815604,
+ "grad_norm": 3.4145350456237793,
+ "learning_rate": 4.546109368687757e-06,
+ "loss": 0.5153,
+ "step": 2553
+ },
+ {
+ "epoch": 1.207565011820331,
+ "grad_norm": 2.658691644668579,
+ "learning_rate": 4.545750862330225e-06,
+ "loss": 0.5759,
+ "step": 2554
+ },
+ {
+ "epoch": 1.2080378250591017,
+ "grad_norm": 3.162605047225952,
+ "learning_rate": 4.545392228592755e-06,
+ "loss": 0.5379,
+ "step": 2555
+ },
+ {
+ "epoch": 1.2085106382978723,
+ "grad_norm": 2.8631198406219482,
+ "learning_rate": 4.545033467497681e-06,
+ "loss": 0.5959,
+ "step": 2556
+ },
+ {
+ "epoch": 1.208983451536643,
+ "grad_norm": 2.457109212875366,
+ "learning_rate": 4.54467457906734e-06,
+ "loss": 0.4864,
+ "step": 2557
+ },
+ {
+ "epoch": 1.2094562647754137,
+ "grad_norm": 2.5307061672210693,
+ "learning_rate": 4.544315563324078e-06,
+ "loss": 0.5308,
+ "step": 2558
+ },
+ {
+ "epoch": 1.2099290780141845,
+ "grad_norm": 2.8482773303985596,
+ "learning_rate": 4.543956420290251e-06,
+ "loss": 0.5126,
+ "step": 2559
+ },
+ {
+ "epoch": 1.210401891252955,
+ "grad_norm": 2.4990832805633545,
+ "learning_rate": 4.5435971499882195e-06,
+ "loss": 0.4534,
+ "step": 2560
+ },
+ {
+ "epoch": 1.2108747044917259,
+ "grad_norm": 2.6292665004730225,
+ "learning_rate": 4.543237752440354e-06,
+ "loss": 0.4434,
+ "step": 2561
+ },
+ {
+ "epoch": 1.2113475177304964,
+ "grad_norm": 2.865983247756958,
+ "learning_rate": 4.542878227669033e-06,
+ "loss": 0.5667,
+ "step": 2562
+ },
+ {
+ "epoch": 1.2118203309692672,
+ "grad_norm": 2.745614528656006,
+ "learning_rate": 4.542518575696644e-06,
+ "loss": 0.4724,
+ "step": 2563
+ },
+ {
+ "epoch": 1.2122931442080378,
+ "grad_norm": 2.8562581539154053,
+ "learning_rate": 4.5421587965455785e-06,
+ "loss": 0.5405,
+ "step": 2564
+ },
+ {
+ "epoch": 1.2127659574468086,
+ "grad_norm": 2.6670095920562744,
+ "learning_rate": 4.5417988902382385e-06,
+ "loss": 0.5432,
+ "step": 2565
+ },
+ {
+ "epoch": 1.2132387706855792,
+ "grad_norm": 2.9320743083953857,
+ "learning_rate": 4.541438856797036e-06,
+ "loss": 0.5862,
+ "step": 2566
+ },
+ {
+ "epoch": 1.21371158392435,
+ "grad_norm": 2.577505588531494,
+ "learning_rate": 4.541078696244386e-06,
+ "loss": 0.4742,
+ "step": 2567
+ },
+ {
+ "epoch": 1.2141843971631205,
+ "grad_norm": 3.4476120471954346,
+ "learning_rate": 4.540718408602717e-06,
+ "loss": 0.5903,
+ "step": 2568
+ },
+ {
+ "epoch": 1.2146572104018913,
+ "grad_norm": 2.816210985183716,
+ "learning_rate": 4.540357993894459e-06,
+ "loss": 0.5033,
+ "step": 2569
+ },
+ {
+ "epoch": 1.215130023640662,
+ "grad_norm": 3.0806639194488525,
+ "learning_rate": 4.539997452142058e-06,
+ "loss": 0.6064,
+ "step": 2570
+ },
+ {
+ "epoch": 1.2156028368794327,
+ "grad_norm": 2.563060760498047,
+ "learning_rate": 4.5396367833679586e-06,
+ "loss": 0.5597,
+ "step": 2571
+ },
+ {
+ "epoch": 1.2160756501182033,
+ "grad_norm": 3.1014397144317627,
+ "learning_rate": 4.5392759875946215e-06,
+ "loss": 0.54,
+ "step": 2572
+ },
+ {
+ "epoch": 1.216548463356974,
+ "grad_norm": 3.124190330505371,
+ "learning_rate": 4.53891506484451e-06,
+ "loss": 0.5122,
+ "step": 2573
+ },
+ {
+ "epoch": 1.2170212765957447,
+ "grad_norm": 2.6688716411590576,
+ "learning_rate": 4.538554015140097e-06,
+ "loss": 0.5615,
+ "step": 2574
+ },
+ {
+ "epoch": 1.2174940898345155,
+ "grad_norm": 2.775543689727783,
+ "learning_rate": 4.538192838503866e-06,
+ "loss": 0.496,
+ "step": 2575
+ },
+ {
+ "epoch": 1.217966903073286,
+ "grad_norm": 2.7877283096313477,
+ "learning_rate": 4.537831534958303e-06,
+ "loss": 0.4995,
+ "step": 2576
+ },
+ {
+ "epoch": 1.2184397163120568,
+ "grad_norm": 2.824810028076172,
+ "learning_rate": 4.537470104525906e-06,
+ "loss": 0.5481,
+ "step": 2577
+ },
+ {
+ "epoch": 1.2189125295508274,
+ "grad_norm": 2.801269292831421,
+ "learning_rate": 4.53710854722918e-06,
+ "loss": 0.5628,
+ "step": 2578
+ },
+ {
+ "epoch": 1.2193853427895982,
+ "grad_norm": 2.7780683040618896,
+ "learning_rate": 4.536746863090637e-06,
+ "loss": 0.4845,
+ "step": 2579
+ },
+ {
+ "epoch": 1.2198581560283688,
+ "grad_norm": 2.536010265350342,
+ "learning_rate": 4.536385052132798e-06,
+ "loss": 0.4771,
+ "step": 2580
+ },
+ {
+ "epoch": 1.2203309692671396,
+ "grad_norm": 2.768775701522827,
+ "learning_rate": 4.536023114378191e-06,
+ "loss": 0.5366,
+ "step": 2581
+ },
+ {
+ "epoch": 1.2208037825059102,
+ "grad_norm": 2.658125877380371,
+ "learning_rate": 4.535661049849352e-06,
+ "loss": 0.524,
+ "step": 2582
+ },
+ {
+ "epoch": 1.2212765957446807,
+ "grad_norm": 2.558696746826172,
+ "learning_rate": 4.535298858568825e-06,
+ "loss": 0.5482,
+ "step": 2583
+ },
+ {
+ "epoch": 1.2217494089834515,
+ "grad_norm": 2.5284535884857178,
+ "learning_rate": 4.534936540559164e-06,
+ "loss": 0.4454,
+ "step": 2584
+ },
+ {
+ "epoch": 1.2222222222222223,
+ "grad_norm": 7.617330074310303,
+ "learning_rate": 4.534574095842927e-06,
+ "loss": 0.5615,
+ "step": 2585
+ },
+ {
+ "epoch": 1.222695035460993,
+ "grad_norm": 2.9120311737060547,
+ "learning_rate": 4.534211524442682e-06,
+ "loss": 0.5624,
+ "step": 2586
+ },
+ {
+ "epoch": 1.2231678486997635,
+ "grad_norm": 2.5004289150238037,
+ "learning_rate": 4.533848826381005e-06,
+ "loss": 0.4743,
+ "step": 2587
+ },
+ {
+ "epoch": 1.2236406619385343,
+ "grad_norm": 2.8395533561706543,
+ "learning_rate": 4.53348600168048e-06,
+ "loss": 0.4457,
+ "step": 2588
+ },
+ {
+ "epoch": 1.224113475177305,
+ "grad_norm": 2.832211494445801,
+ "learning_rate": 4.533123050363699e-06,
+ "loss": 0.5559,
+ "step": 2589
+ },
+ {
+ "epoch": 1.2245862884160756,
+ "grad_norm": 2.6318583488464355,
+ "learning_rate": 4.53275997245326e-06,
+ "loss": 0.5281,
+ "step": 2590
+ },
+ {
+ "epoch": 1.2250591016548462,
+ "grad_norm": 3.0509233474731445,
+ "learning_rate": 4.532396767971771e-06,
+ "loss": 0.6003,
+ "step": 2591
+ },
+ {
+ "epoch": 1.225531914893617,
+ "grad_norm": 2.6863620281219482,
+ "learning_rate": 4.532033436941847e-06,
+ "loss": 0.5219,
+ "step": 2592
+ },
+ {
+ "epoch": 1.2260047281323878,
+ "grad_norm": 2.401463747024536,
+ "learning_rate": 4.5316699793861104e-06,
+ "loss": 0.5994,
+ "step": 2593
+ },
+ {
+ "epoch": 1.2264775413711584,
+ "grad_norm": 2.613517999649048,
+ "learning_rate": 4.531306395327194e-06,
+ "loss": 0.5785,
+ "step": 2594
+ },
+ {
+ "epoch": 1.226950354609929,
+ "grad_norm": 2.5016374588012695,
+ "learning_rate": 4.530942684787735e-06,
+ "loss": 0.5695,
+ "step": 2595
+ },
+ {
+ "epoch": 1.2274231678486998,
+ "grad_norm": 2.576464891433716,
+ "learning_rate": 4.53057884779038e-06,
+ "loss": 0.4427,
+ "step": 2596
+ },
+ {
+ "epoch": 1.2278959810874706,
+ "grad_norm": 2.5688700675964355,
+ "learning_rate": 4.530214884357785e-06,
+ "loss": 0.4966,
+ "step": 2597
+ },
+ {
+ "epoch": 1.2283687943262411,
+ "grad_norm": 3.179013729095459,
+ "learning_rate": 4.52985079451261e-06,
+ "loss": 0.5239,
+ "step": 2598
+ },
+ {
+ "epoch": 1.2288416075650117,
+ "grad_norm": 2.6015284061431885,
+ "learning_rate": 4.529486578277527e-06,
+ "loss": 0.5135,
+ "step": 2599
+ },
+ {
+ "epoch": 1.2293144208037825,
+ "grad_norm": 2.3029589653015137,
+ "learning_rate": 4.529122235675214e-06,
+ "loss": 0.4044,
+ "step": 2600
+ },
+ {
+ "epoch": 1.2297872340425533,
+ "grad_norm": 2.994093656539917,
+ "learning_rate": 4.528757766728357e-06,
+ "loss": 0.5419,
+ "step": 2601
+ },
+ {
+ "epoch": 1.2302600472813239,
+ "grad_norm": 2.6297390460968018,
+ "learning_rate": 4.52839317145965e-06,
+ "loss": 0.488,
+ "step": 2602
+ },
+ {
+ "epoch": 1.2307328605200945,
+ "grad_norm": 2.4814043045043945,
+ "learning_rate": 4.528028449891793e-06,
+ "loss": 0.4917,
+ "step": 2603
+ },
+ {
+ "epoch": 1.2312056737588652,
+ "grad_norm": 3.6052863597869873,
+ "learning_rate": 4.527663602047499e-06,
+ "loss": 0.5301,
+ "step": 2604
+ },
+ {
+ "epoch": 1.231678486997636,
+ "grad_norm": 2.6984751224517822,
+ "learning_rate": 4.5272986279494825e-06,
+ "loss": 0.5253,
+ "step": 2605
+ },
+ {
+ "epoch": 1.2321513002364066,
+ "grad_norm": 2.514000415802002,
+ "learning_rate": 4.526933527620469e-06,
+ "loss": 0.5661,
+ "step": 2606
+ },
+ {
+ "epoch": 1.2326241134751772,
+ "grad_norm": 2.890921115875244,
+ "learning_rate": 4.526568301083195e-06,
+ "loss": 0.5585,
+ "step": 2607
+ },
+ {
+ "epoch": 1.233096926713948,
+ "grad_norm": 2.6390011310577393,
+ "learning_rate": 4.526202948360397e-06,
+ "loss": 0.5168,
+ "step": 2608
+ },
+ {
+ "epoch": 1.2335697399527188,
+ "grad_norm": 2.7370636463165283,
+ "learning_rate": 4.5258374694748266e-06,
+ "loss": 0.5453,
+ "step": 2609
+ },
+ {
+ "epoch": 1.2340425531914894,
+ "grad_norm": 2.8203976154327393,
+ "learning_rate": 4.52547186444924e-06,
+ "loss": 0.5763,
+ "step": 2610
+ },
+ {
+ "epoch": 1.23451536643026,
+ "grad_norm": 2.7567849159240723,
+ "learning_rate": 4.5251061333064025e-06,
+ "loss": 0.5194,
+ "step": 2611
+ },
+ {
+ "epoch": 1.2349881796690307,
+ "grad_norm": 2.767519474029541,
+ "learning_rate": 4.524740276069085e-06,
+ "loss": 0.5355,
+ "step": 2612
+ },
+ {
+ "epoch": 1.2354609929078015,
+ "grad_norm": 3.072035312652588,
+ "learning_rate": 4.5243742927600695e-06,
+ "loss": 0.5391,
+ "step": 2613
+ },
+ {
+ "epoch": 1.2359338061465721,
+ "grad_norm": 2.5957462787628174,
+ "learning_rate": 4.524008183402143e-06,
+ "loss": 0.5645,
+ "step": 2614
+ },
+ {
+ "epoch": 1.2364066193853427,
+ "grad_norm": 2.774897575378418,
+ "learning_rate": 4.523641948018101e-06,
+ "loss": 0.5576,
+ "step": 2615
+ },
+ {
+ "epoch": 1.2368794326241135,
+ "grad_norm": 2.635887622833252,
+ "learning_rate": 4.5232755866307496e-06,
+ "loss": 0.5254,
+ "step": 2616
+ },
+ {
+ "epoch": 1.2373522458628843,
+ "grad_norm": 2.4860997200012207,
+ "learning_rate": 4.522909099262899e-06,
+ "loss": 0.4692,
+ "step": 2617
+ },
+ {
+ "epoch": 1.2378250591016549,
+ "grad_norm": 2.595513105392456,
+ "learning_rate": 4.522542485937369e-06,
+ "loss": 0.5166,
+ "step": 2618
+ },
+ {
+ "epoch": 1.2382978723404254,
+ "grad_norm": 2.961474895477295,
+ "learning_rate": 4.522175746676986e-06,
+ "loss": 0.5455,
+ "step": 2619
+ },
+ {
+ "epoch": 1.2387706855791962,
+ "grad_norm": 2.813889741897583,
+ "learning_rate": 4.521808881504588e-06,
+ "loss": 0.5249,
+ "step": 2620
+ },
+ {
+ "epoch": 1.239243498817967,
+ "grad_norm": 2.8434813022613525,
+ "learning_rate": 4.521441890443015e-06,
+ "loss": 0.472,
+ "step": 2621
+ },
+ {
+ "epoch": 1.2397163120567376,
+ "grad_norm": 2.4264845848083496,
+ "learning_rate": 4.521074773515119e-06,
+ "loss": 0.4783,
+ "step": 2622
+ },
+ {
+ "epoch": 1.2401891252955082,
+ "grad_norm": 2.615169048309326,
+ "learning_rate": 4.520707530743761e-06,
+ "loss": 0.5324,
+ "step": 2623
+ },
+ {
+ "epoch": 1.240661938534279,
+ "grad_norm": 2.6772537231445312,
+ "learning_rate": 4.520340162151803e-06,
+ "loss": 0.5224,
+ "step": 2624
+ },
+ {
+ "epoch": 1.2411347517730495,
+ "grad_norm": 2.683393955230713,
+ "learning_rate": 4.519972667762124e-06,
+ "loss": 0.4863,
+ "step": 2625
+ },
+ {
+ "epoch": 1.2416075650118203,
+ "grad_norm": 3.0335750579833984,
+ "learning_rate": 4.519605047597603e-06,
+ "loss": 0.544,
+ "step": 2626
+ },
+ {
+ "epoch": 1.242080378250591,
+ "grad_norm": 2.8694353103637695,
+ "learning_rate": 4.519237301681132e-06,
+ "loss": 0.5576,
+ "step": 2627
+ },
+ {
+ "epoch": 1.2425531914893617,
+ "grad_norm": 3.217808246612549,
+ "learning_rate": 4.518869430035609e-06,
+ "loss": 0.5459,
+ "step": 2628
+ },
+ {
+ "epoch": 1.2430260047281323,
+ "grad_norm": 2.7700083255767822,
+ "learning_rate": 4.518501432683937e-06,
+ "loss": 0.5579,
+ "step": 2629
+ },
+ {
+ "epoch": 1.243498817966903,
+ "grad_norm": 2.4759175777435303,
+ "learning_rate": 4.5181333096490335e-06,
+ "loss": 0.5049,
+ "step": 2630
+ },
+ {
+ "epoch": 1.2439716312056737,
+ "grad_norm": 2.8652584552764893,
+ "learning_rate": 4.517765060953818e-06,
+ "loss": 0.5366,
+ "step": 2631
+ },
+ {
+ "epoch": 1.2444444444444445,
+ "grad_norm": 2.776334524154663,
+ "learning_rate": 4.517396686621218e-06,
+ "loss": 0.5677,
+ "step": 2632
+ },
+ {
+ "epoch": 1.244917257683215,
+ "grad_norm": 2.676708221435547,
+ "learning_rate": 4.517028186674174e-06,
+ "loss": 0.5055,
+ "step": 2633
+ },
+ {
+ "epoch": 1.2453900709219858,
+ "grad_norm": 2.6851537227630615,
+ "learning_rate": 4.516659561135629e-06,
+ "loss": 0.5537,
+ "step": 2634
+ },
+ {
+ "epoch": 1.2458628841607564,
+ "grad_norm": 2.619971513748169,
+ "learning_rate": 4.516290810028536e-06,
+ "loss": 0.5765,
+ "step": 2635
+ },
+ {
+ "epoch": 1.2463356973995272,
+ "grad_norm": 2.7302334308624268,
+ "learning_rate": 4.515921933375855e-06,
+ "loss": 0.5611,
+ "step": 2636
+ },
+ {
+ "epoch": 1.2468085106382978,
+ "grad_norm": 2.5005829334259033,
+ "learning_rate": 4.5155529312005554e-06,
+ "loss": 0.442,
+ "step": 2637
+ },
+ {
+ "epoch": 1.2472813238770686,
+ "grad_norm": 2.713587522506714,
+ "learning_rate": 4.515183803525612e-06,
+ "loss": 0.5023,
+ "step": 2638
+ },
+ {
+ "epoch": 1.2477541371158392,
+ "grad_norm": 2.5146236419677734,
+ "learning_rate": 4.514814550374009e-06,
+ "loss": 0.5195,
+ "step": 2639
+ },
+ {
+ "epoch": 1.24822695035461,
+ "grad_norm": 2.761060953140259,
+ "learning_rate": 4.51444517176874e-06,
+ "loss": 0.5138,
+ "step": 2640
+ },
+ {
+ "epoch": 1.2486997635933805,
+ "grad_norm": 3.082329273223877,
+ "learning_rate": 4.5140756677328026e-06,
+ "loss": 0.6105,
+ "step": 2641
+ },
+ {
+ "epoch": 1.2491725768321513,
+ "grad_norm": 2.6933493614196777,
+ "learning_rate": 4.513706038289205e-06,
+ "loss": 0.5185,
+ "step": 2642
+ },
+ {
+ "epoch": 1.249645390070922,
+ "grad_norm": 2.515856981277466,
+ "learning_rate": 4.513336283460962e-06,
+ "loss": 0.5375,
+ "step": 2643
+ },
+ {
+ "epoch": 1.2501182033096927,
+ "grad_norm": 2.8553731441497803,
+ "learning_rate": 4.512966403271096e-06,
+ "loss": 0.5582,
+ "step": 2644
+ },
+ {
+ "epoch": 1.2505910165484633,
+ "grad_norm": 2.640880823135376,
+ "learning_rate": 4.5125963977426405e-06,
+ "loss": 0.5125,
+ "step": 2645
+ },
+ {
+ "epoch": 1.251063829787234,
+ "grad_norm": 2.9845943450927734,
+ "learning_rate": 4.512226266898631e-06,
+ "loss": 0.4749,
+ "step": 2646
+ },
+ {
+ "epoch": 1.2515366430260046,
+ "grad_norm": 2.5131032466888428,
+ "learning_rate": 4.511856010762116e-06,
+ "loss": 0.4764,
+ "step": 2647
+ },
+ {
+ "epoch": 1.2520094562647754,
+ "grad_norm": 2.370638370513916,
+ "learning_rate": 4.511485629356148e-06,
+ "loss": 0.5153,
+ "step": 2648
+ },
+ {
+ "epoch": 1.252482269503546,
+ "grad_norm": 2.912461996078491,
+ "learning_rate": 4.511115122703791e-06,
+ "loss": 0.6117,
+ "step": 2649
+ },
+ {
+ "epoch": 1.2529550827423168,
+ "grad_norm": 2.7308082580566406,
+ "learning_rate": 4.510744490828113e-06,
+ "loss": 0.5076,
+ "step": 2650
+ },
+ {
+ "epoch": 1.2534278959810874,
+ "grad_norm": 2.8524296283721924,
+ "learning_rate": 4.510373733752193e-06,
+ "loss": 0.542,
+ "step": 2651
+ },
+ {
+ "epoch": 1.2539007092198582,
+ "grad_norm": 2.799377202987671,
+ "learning_rate": 4.5100028514991145e-06,
+ "loss": 0.486,
+ "step": 2652
+ },
+ {
+ "epoch": 1.2543735224586288,
+ "grad_norm": 2.7248027324676514,
+ "learning_rate": 4.509631844091973e-06,
+ "loss": 0.4972,
+ "step": 2653
+ },
+ {
+ "epoch": 1.2548463356973996,
+ "grad_norm": 2.8041458129882812,
+ "learning_rate": 4.5092607115538686e-06,
+ "loss": 0.588,
+ "step": 2654
+ },
+ {
+ "epoch": 1.2553191489361701,
+ "grad_norm": 2.679417133331299,
+ "learning_rate": 4.50888945390791e-06,
+ "loss": 0.4639,
+ "step": 2655
+ },
+ {
+ "epoch": 1.255791962174941,
+ "grad_norm": 3.1049270629882812,
+ "learning_rate": 4.508518071177214e-06,
+ "loss": 0.5857,
+ "step": 2656
+ },
+ {
+ "epoch": 1.2562647754137115,
+ "grad_norm": 2.8590362071990967,
+ "learning_rate": 4.508146563384904e-06,
+ "loss": 0.5451,
+ "step": 2657
+ },
+ {
+ "epoch": 1.2567375886524823,
+ "grad_norm": 2.9774081707000732,
+ "learning_rate": 4.507774930554114e-06,
+ "loss": 0.5493,
+ "step": 2658
+ },
+ {
+ "epoch": 1.2572104018912529,
+ "grad_norm": 2.617643356323242,
+ "learning_rate": 4.507403172707983e-06,
+ "loss": 0.5472,
+ "step": 2659
+ },
+ {
+ "epoch": 1.2576832151300237,
+ "grad_norm": 2.9195587635040283,
+ "learning_rate": 4.507031289869658e-06,
+ "loss": 0.5403,
+ "step": 2660
+ },
+ {
+ "epoch": 1.2581560283687943,
+ "grad_norm": 2.706089496612549,
+ "learning_rate": 4.506659282062295e-06,
+ "loss": 0.4899,
+ "step": 2661
+ },
+ {
+ "epoch": 1.258628841607565,
+ "grad_norm": 2.8229358196258545,
+ "learning_rate": 4.506287149309057e-06,
+ "loss": 0.5336,
+ "step": 2662
+ },
+ {
+ "epoch": 1.2591016548463356,
+ "grad_norm": 2.5295674800872803,
+ "learning_rate": 4.505914891633117e-06,
+ "loss": 0.4806,
+ "step": 2663
+ },
+ {
+ "epoch": 1.2595744680851064,
+ "grad_norm": 3.098208427429199,
+ "learning_rate": 4.505542509057651e-06,
+ "loss": 0.6039,
+ "step": 2664
+ },
+ {
+ "epoch": 1.260047281323877,
+ "grad_norm": 2.5118041038513184,
+ "learning_rate": 4.5051700016058475e-06,
+ "loss": 0.5279,
+ "step": 2665
+ },
+ {
+ "epoch": 1.2605200945626478,
+ "grad_norm": 2.6901369094848633,
+ "learning_rate": 4.5047973693009005e-06,
+ "loss": 0.5515,
+ "step": 2666
+ },
+ {
+ "epoch": 1.2609929078014184,
+ "grad_norm": 2.5622377395629883,
+ "learning_rate": 4.504424612166012e-06,
+ "loss": 0.5405,
+ "step": 2667
+ },
+ {
+ "epoch": 1.2614657210401892,
+ "grad_norm": 2.685751438140869,
+ "learning_rate": 4.5040517302243915e-06,
+ "loss": 0.5797,
+ "step": 2668
+ },
+ {
+ "epoch": 1.2619385342789597,
+ "grad_norm": 2.8525350093841553,
+ "learning_rate": 4.503678723499259e-06,
+ "loss": 0.5561,
+ "step": 2669
+ },
+ {
+ "epoch": 1.2624113475177305,
+ "grad_norm": 2.803386926651001,
+ "learning_rate": 4.503305592013836e-06,
+ "loss": 0.5376,
+ "step": 2670
+ },
+ {
+ "epoch": 1.2628841607565011,
+ "grad_norm": 2.78633189201355,
+ "learning_rate": 4.502932335791359e-06,
+ "loss": 0.4739,
+ "step": 2671
+ },
+ {
+ "epoch": 1.263356973995272,
+ "grad_norm": 2.8337297439575195,
+ "learning_rate": 4.502558954855069e-06,
+ "loss": 0.5406,
+ "step": 2672
+ },
+ {
+ "epoch": 1.2638297872340425,
+ "grad_norm": 2.610275983810425,
+ "learning_rate": 4.502185449228213e-06,
+ "loss": 0.5343,
+ "step": 2673
+ },
+ {
+ "epoch": 1.2643026004728133,
+ "grad_norm": 2.7842252254486084,
+ "learning_rate": 4.501811818934048e-06,
+ "loss": 0.532,
+ "step": 2674
+ },
+ {
+ "epoch": 1.2647754137115839,
+ "grad_norm": 2.4472389221191406,
+ "learning_rate": 4.501438063995839e-06,
+ "loss": 0.4976,
+ "step": 2675
+ },
+ {
+ "epoch": 1.2652482269503547,
+ "grad_norm": 3.076580762863159,
+ "learning_rate": 4.501064184436858e-06,
+ "loss": 0.507,
+ "step": 2676
+ },
+ {
+ "epoch": 1.2657210401891252,
+ "grad_norm": 2.5952908992767334,
+ "learning_rate": 4.500690180280384e-06,
+ "loss": 0.5498,
+ "step": 2677
+ },
+ {
+ "epoch": 1.266193853427896,
+ "grad_norm": 2.476943016052246,
+ "learning_rate": 4.500316051549706e-06,
+ "loss": 0.557,
+ "step": 2678
+ },
+ {
+ "epoch": 1.2666666666666666,
+ "grad_norm": 2.730579376220703,
+ "learning_rate": 4.499941798268118e-06,
+ "loss": 0.4975,
+ "step": 2679
+ },
+ {
+ "epoch": 1.2671394799054374,
+ "grad_norm": 2.7916698455810547,
+ "learning_rate": 4.499567420458924e-06,
+ "loss": 0.5673,
+ "step": 2680
+ },
+ {
+ "epoch": 1.267612293144208,
+ "grad_norm": 2.4249091148376465,
+ "learning_rate": 4.4991929181454355e-06,
+ "loss": 0.4836,
+ "step": 2681
+ },
+ {
+ "epoch": 1.2680851063829788,
+ "grad_norm": 2.661911725997925,
+ "learning_rate": 4.498818291350969e-06,
+ "loss": 0.5332,
+ "step": 2682
+ },
+ {
+ "epoch": 1.2685579196217494,
+ "grad_norm": 2.693657875061035,
+ "learning_rate": 4.498443540098852e-06,
+ "loss": 0.5257,
+ "step": 2683
+ },
+ {
+ "epoch": 1.2690307328605201,
+ "grad_norm": 2.609386682510376,
+ "learning_rate": 4.4980686644124195e-06,
+ "loss": 0.4918,
+ "step": 2684
+ },
+ {
+ "epoch": 1.2695035460992907,
+ "grad_norm": 3.2104930877685547,
+ "learning_rate": 4.4976936643150124e-06,
+ "loss": 0.6097,
+ "step": 2685
+ },
+ {
+ "epoch": 1.2699763593380615,
+ "grad_norm": 2.707860231399536,
+ "learning_rate": 4.49731853982998e-06,
+ "loss": 0.5109,
+ "step": 2686
+ },
+ {
+ "epoch": 1.270449172576832,
+ "grad_norm": 3.5046379566192627,
+ "learning_rate": 4.49694329098068e-06,
+ "loss": 0.5883,
+ "step": 2687
+ },
+ {
+ "epoch": 1.270921985815603,
+ "grad_norm": 2.5362324714660645,
+ "learning_rate": 4.496567917790477e-06,
+ "loss": 0.5301,
+ "step": 2688
+ },
+ {
+ "epoch": 1.2713947990543735,
+ "grad_norm": 2.7095518112182617,
+ "learning_rate": 4.496192420282746e-06,
+ "loss": 0.4772,
+ "step": 2689
+ },
+ {
+ "epoch": 1.2718676122931443,
+ "grad_norm": 2.416433095932007,
+ "learning_rate": 4.495816798480865e-06,
+ "loss": 0.5012,
+ "step": 2690
+ },
+ {
+ "epoch": 1.2723404255319148,
+ "grad_norm": 2.5362391471862793,
+ "learning_rate": 4.495441052408224e-06,
+ "loss": 0.5197,
+ "step": 2691
+ },
+ {
+ "epoch": 1.2728132387706856,
+ "grad_norm": 2.9093947410583496,
+ "learning_rate": 4.495065182088218e-06,
+ "loss": 0.4893,
+ "step": 2692
+ },
+ {
+ "epoch": 1.2732860520094562,
+ "grad_norm": 2.520470142364502,
+ "learning_rate": 4.494689187544251e-06,
+ "loss": 0.5072,
+ "step": 2693
+ },
+ {
+ "epoch": 1.273758865248227,
+ "grad_norm": 2.4385125637054443,
+ "learning_rate": 4.494313068799735e-06,
+ "loss": 0.4923,
+ "step": 2694
+ },
+ {
+ "epoch": 1.2742316784869976,
+ "grad_norm": 2.636852502822876,
+ "learning_rate": 4.493936825878089e-06,
+ "loss": 0.5409,
+ "step": 2695
+ },
+ {
+ "epoch": 1.2747044917257684,
+ "grad_norm": 2.7027053833007812,
+ "learning_rate": 4.493560458802741e-06,
+ "loss": 0.5906,
+ "step": 2696
+ },
+ {
+ "epoch": 1.275177304964539,
+ "grad_norm": 2.58752179145813,
+ "learning_rate": 4.493183967597123e-06,
+ "loss": 0.5292,
+ "step": 2697
+ },
+ {
+ "epoch": 1.2756501182033098,
+ "grad_norm": 2.7658379077911377,
+ "learning_rate": 4.49280735228468e-06,
+ "loss": 0.5613,
+ "step": 2698
+ },
+ {
+ "epoch": 1.2761229314420803,
+ "grad_norm": 3.272688388824463,
+ "learning_rate": 4.492430612888861e-06,
+ "loss": 0.5654,
+ "step": 2699
+ },
+ {
+ "epoch": 1.2765957446808511,
+ "grad_norm": 2.806819438934326,
+ "learning_rate": 4.492053749433125e-06,
+ "loss": 0.5388,
+ "step": 2700
+ },
+ {
+ "epoch": 1.2770685579196217,
+ "grad_norm": 2.879727602005005,
+ "learning_rate": 4.491676761940936e-06,
+ "loss": 0.5033,
+ "step": 2701
+ },
+ {
+ "epoch": 1.2775413711583925,
+ "grad_norm": 2.733347177505493,
+ "learning_rate": 4.4912996504357695e-06,
+ "loss": 0.5113,
+ "step": 2702
+ },
+ {
+ "epoch": 1.278014184397163,
+ "grad_norm": 2.7431252002716064,
+ "learning_rate": 4.490922414941104e-06,
+ "loss": 0.5417,
+ "step": 2703
+ },
+ {
+ "epoch": 1.2784869976359339,
+ "grad_norm": 2.9287240505218506,
+ "learning_rate": 4.490545055480431e-06,
+ "loss": 0.5875,
+ "step": 2704
+ },
+ {
+ "epoch": 1.2789598108747045,
+ "grad_norm": 2.576775550842285,
+ "learning_rate": 4.490167572077244e-06,
+ "loss": 0.5176,
+ "step": 2705
+ },
+ {
+ "epoch": 1.2794326241134752,
+ "grad_norm": 2.4335594177246094,
+ "learning_rate": 4.4897899647550505e-06,
+ "loss": 0.4749,
+ "step": 2706
+ },
+ {
+ "epoch": 1.2799054373522458,
+ "grad_norm": 2.6798062324523926,
+ "learning_rate": 4.489412233537361e-06,
+ "loss": 0.5439,
+ "step": 2707
+ },
+ {
+ "epoch": 1.2803782505910166,
+ "grad_norm": 2.8440675735473633,
+ "learning_rate": 4.489034378447693e-06,
+ "loss": 0.552,
+ "step": 2708
+ },
+ {
+ "epoch": 1.2808510638297872,
+ "grad_norm": 2.9059503078460693,
+ "learning_rate": 4.488656399509577e-06,
+ "loss": 0.5667,
+ "step": 2709
+ },
+ {
+ "epoch": 1.281323877068558,
+ "grad_norm": 2.7415006160736084,
+ "learning_rate": 4.488278296746548e-06,
+ "loss": 0.5676,
+ "step": 2710
+ },
+ {
+ "epoch": 1.2817966903073286,
+ "grad_norm": 2.4584875106811523,
+ "learning_rate": 4.487900070182147e-06,
+ "loss": 0.4787,
+ "step": 2711
+ },
+ {
+ "epoch": 1.2822695035460994,
+ "grad_norm": 2.990940809249878,
+ "learning_rate": 4.487521719839924e-06,
+ "loss": 0.5239,
+ "step": 2712
+ },
+ {
+ "epoch": 1.28274231678487,
+ "grad_norm": 3.075201988220215,
+ "learning_rate": 4.487143245743441e-06,
+ "loss": 0.5103,
+ "step": 2713
+ },
+ {
+ "epoch": 1.2832151300236407,
+ "grad_norm": 2.543341875076294,
+ "learning_rate": 4.486764647916259e-06,
+ "loss": 0.5475,
+ "step": 2714
+ },
+ {
+ "epoch": 1.2836879432624113,
+ "grad_norm": 2.9927213191986084,
+ "learning_rate": 4.486385926381957e-06,
+ "loss": 0.4923,
+ "step": 2715
+ },
+ {
+ "epoch": 1.284160756501182,
+ "grad_norm": 2.4220657348632812,
+ "learning_rate": 4.486007081164111e-06,
+ "loss": 0.543,
+ "step": 2716
+ },
+ {
+ "epoch": 1.2846335697399527,
+ "grad_norm": 2.468214988708496,
+ "learning_rate": 4.4856281122863134e-06,
+ "loss": 0.5248,
+ "step": 2717
+ },
+ {
+ "epoch": 1.2851063829787235,
+ "grad_norm": 2.633711099624634,
+ "learning_rate": 4.48524901977216e-06,
+ "loss": 0.4764,
+ "step": 2718
+ },
+ {
+ "epoch": 1.285579196217494,
+ "grad_norm": 2.8399546146392822,
+ "learning_rate": 4.484869803645254e-06,
+ "loss": 0.5503,
+ "step": 2719
+ },
+ {
+ "epoch": 1.2860520094562649,
+ "grad_norm": 2.769063949584961,
+ "learning_rate": 4.484490463929209e-06,
+ "loss": 0.5468,
+ "step": 2720
+ },
+ {
+ "epoch": 1.2865248226950354,
+ "grad_norm": 2.617863893508911,
+ "learning_rate": 4.4841110006476465e-06,
+ "loss": 0.5906,
+ "step": 2721
+ },
+ {
+ "epoch": 1.2869976359338062,
+ "grad_norm": 2.7639541625976562,
+ "learning_rate": 4.4837314138241905e-06,
+ "loss": 0.552,
+ "step": 2722
+ },
+ {
+ "epoch": 1.2874704491725768,
+ "grad_norm": 2.7711129188537598,
+ "learning_rate": 4.483351703482478e-06,
+ "loss": 0.5229,
+ "step": 2723
+ },
+ {
+ "epoch": 1.2879432624113476,
+ "grad_norm": 2.611205577850342,
+ "learning_rate": 4.482971869646152e-06,
+ "loss": 0.5055,
+ "step": 2724
+ },
+ {
+ "epoch": 1.2884160756501182,
+ "grad_norm": 2.8602211475372314,
+ "learning_rate": 4.482591912338862e-06,
+ "loss": 0.5561,
+ "step": 2725
+ },
+ {
+ "epoch": 1.2888888888888888,
+ "grad_norm": 2.5882298946380615,
+ "learning_rate": 4.4822118315842675e-06,
+ "loss": 0.5555,
+ "step": 2726
+ },
+ {
+ "epoch": 1.2893617021276595,
+ "grad_norm": 2.7533531188964844,
+ "learning_rate": 4.481831627406033e-06,
+ "loss": 0.5346,
+ "step": 2727
+ },
+ {
+ "epoch": 1.2898345153664303,
+ "grad_norm": 2.4296958446502686,
+ "learning_rate": 4.481451299827835e-06,
+ "loss": 0.4915,
+ "step": 2728
+ },
+ {
+ "epoch": 1.290307328605201,
+ "grad_norm": 2.4403445720672607,
+ "learning_rate": 4.481070848873352e-06,
+ "loss": 0.5648,
+ "step": 2729
+ },
+ {
+ "epoch": 1.2907801418439715,
+ "grad_norm": 2.473224401473999,
+ "learning_rate": 4.480690274566274e-06,
+ "loss": 0.4849,
+ "step": 2730
+ },
+ {
+ "epoch": 1.2912529550827423,
+ "grad_norm": 2.637899875640869,
+ "learning_rate": 4.480309576930297e-06,
+ "loss": 0.4968,
+ "step": 2731
+ },
+ {
+ "epoch": 1.291725768321513,
+ "grad_norm": 2.7156927585601807,
+ "learning_rate": 4.479928755989127e-06,
+ "loss": 0.4759,
+ "step": 2732
+ },
+ {
+ "epoch": 1.2921985815602837,
+ "grad_norm": 2.632786989212036,
+ "learning_rate": 4.479547811766475e-06,
+ "loss": 0.5468,
+ "step": 2733
+ },
+ {
+ "epoch": 1.2926713947990542,
+ "grad_norm": 2.529218912124634,
+ "learning_rate": 4.479166744286061e-06,
+ "loss": 0.4852,
+ "step": 2734
+ },
+ {
+ "epoch": 1.293144208037825,
+ "grad_norm": 2.561978340148926,
+ "learning_rate": 4.4787855535716115e-06,
+ "loss": 0.546,
+ "step": 2735
+ },
+ {
+ "epoch": 1.2936170212765958,
+ "grad_norm": 2.3684909343719482,
+ "learning_rate": 4.478404239646862e-06,
+ "loss": 0.5369,
+ "step": 2736
+ },
+ {
+ "epoch": 1.2940898345153664,
+ "grad_norm": 2.8940367698669434,
+ "learning_rate": 4.4780228025355566e-06,
+ "loss": 0.568,
+ "step": 2737
+ },
+ {
+ "epoch": 1.294562647754137,
+ "grad_norm": 2.6950316429138184,
+ "learning_rate": 4.477641242261445e-06,
+ "loss": 0.4576,
+ "step": 2738
+ },
+ {
+ "epoch": 1.2950354609929078,
+ "grad_norm": 2.4211716651916504,
+ "learning_rate": 4.4772595588482835e-06,
+ "loss": 0.4341,
+ "step": 2739
+ },
+ {
+ "epoch": 1.2955082742316786,
+ "grad_norm": 3.141097068786621,
+ "learning_rate": 4.47687775231984e-06,
+ "loss": 0.5944,
+ "step": 2740
+ },
+ {
+ "epoch": 1.2959810874704492,
+ "grad_norm": 3.077522039413452,
+ "learning_rate": 4.476495822699887e-06,
+ "loss": 0.5786,
+ "step": 2741
+ },
+ {
+ "epoch": 1.2964539007092197,
+ "grad_norm": 2.708139419555664,
+ "learning_rate": 4.476113770012206e-06,
+ "loss": 0.5014,
+ "step": 2742
+ },
+ {
+ "epoch": 1.2969267139479905,
+ "grad_norm": 2.7572035789489746,
+ "learning_rate": 4.475731594280586e-06,
+ "loss": 0.594,
+ "step": 2743
+ },
+ {
+ "epoch": 1.2973995271867613,
+ "grad_norm": 2.673126459121704,
+ "learning_rate": 4.475349295528822e-06,
+ "loss": 0.5317,
+ "step": 2744
+ },
+ {
+ "epoch": 1.297872340425532,
+ "grad_norm": 2.6757819652557373,
+ "learning_rate": 4.4749668737807195e-06,
+ "loss": 0.5614,
+ "step": 2745
+ },
+ {
+ "epoch": 1.2983451536643025,
+ "grad_norm": 2.7077620029449463,
+ "learning_rate": 4.47458432906009e-06,
+ "loss": 0.4916,
+ "step": 2746
+ },
+ {
+ "epoch": 1.2988179669030733,
+ "grad_norm": 2.446570873260498,
+ "learning_rate": 4.474201661390752e-06,
+ "loss": 0.5005,
+ "step": 2747
+ },
+ {
+ "epoch": 1.299290780141844,
+ "grad_norm": 2.642695665359497,
+ "learning_rate": 4.473818870796533e-06,
+ "loss": 0.5048,
+ "step": 2748
+ },
+ {
+ "epoch": 1.2997635933806146,
+ "grad_norm": 2.519824743270874,
+ "learning_rate": 4.4734359573012686e-06,
+ "loss": 0.5131,
+ "step": 2749
+ },
+ {
+ "epoch": 1.3002364066193852,
+ "grad_norm": 2.5901925563812256,
+ "learning_rate": 4.4730529209287995e-06,
+ "loss": 0.4582,
+ "step": 2750
+ },
+ {
+ "epoch": 1.300709219858156,
+ "grad_norm": 2.6789121627807617,
+ "learning_rate": 4.472669761702978e-06,
+ "loss": 0.5685,
+ "step": 2751
+ },
+ {
+ "epoch": 1.3011820330969268,
+ "grad_norm": 2.408003807067871,
+ "learning_rate": 4.472286479647659e-06,
+ "loss": 0.4329,
+ "step": 2752
+ },
+ {
+ "epoch": 1.3016548463356974,
+ "grad_norm": 2.681403398513794,
+ "learning_rate": 4.47190307478671e-06,
+ "loss": 0.4853,
+ "step": 2753
+ },
+ {
+ "epoch": 1.302127659574468,
+ "grad_norm": 2.9923183917999268,
+ "learning_rate": 4.4715195471440025e-06,
+ "loss": 0.5184,
+ "step": 2754
+ },
+ {
+ "epoch": 1.3026004728132388,
+ "grad_norm": 2.5100321769714355,
+ "learning_rate": 4.471135896743418e-06,
+ "loss": 0.5148,
+ "step": 2755
+ },
+ {
+ "epoch": 1.3030732860520096,
+ "grad_norm": 2.267881393432617,
+ "learning_rate": 4.4707521236088444e-06,
+ "loss": 0.5028,
+ "step": 2756
+ },
+ {
+ "epoch": 1.3035460992907801,
+ "grad_norm": 2.7779829502105713,
+ "learning_rate": 4.4703682277641775e-06,
+ "loss": 0.5724,
+ "step": 2757
+ },
+ {
+ "epoch": 1.3040189125295507,
+ "grad_norm": 2.4262194633483887,
+ "learning_rate": 4.4699842092333205e-06,
+ "loss": 0.5341,
+ "step": 2758
+ },
+ {
+ "epoch": 1.3044917257683215,
+ "grad_norm": 2.8682050704956055,
+ "learning_rate": 4.469600068040185e-06,
+ "loss": 0.6114,
+ "step": 2759
+ },
+ {
+ "epoch": 1.3049645390070923,
+ "grad_norm": 2.647853374481201,
+ "learning_rate": 4.46921580420869e-06,
+ "loss": 0.5107,
+ "step": 2760
+ },
+ {
+ "epoch": 1.3054373522458629,
+ "grad_norm": 2.561998128890991,
+ "learning_rate": 4.468831417762762e-06,
+ "loss": 0.6019,
+ "step": 2761
+ },
+ {
+ "epoch": 1.3059101654846335,
+ "grad_norm": 2.763425350189209,
+ "learning_rate": 4.468446908726334e-06,
+ "loss": 0.572,
+ "step": 2762
+ },
+ {
+ "epoch": 1.3063829787234043,
+ "grad_norm": 2.7052934169769287,
+ "learning_rate": 4.468062277123348e-06,
+ "loss": 0.4876,
+ "step": 2763
+ },
+ {
+ "epoch": 1.306855791962175,
+ "grad_norm": 2.997845411300659,
+ "learning_rate": 4.467677522977755e-06,
+ "loss": 0.5683,
+ "step": 2764
+ },
+ {
+ "epoch": 1.3073286052009456,
+ "grad_norm": 2.503129005432129,
+ "learning_rate": 4.46729264631351e-06,
+ "loss": 0.4951,
+ "step": 2765
+ },
+ {
+ "epoch": 1.3078014184397162,
+ "grad_norm": 2.617492437362671,
+ "learning_rate": 4.466907647154578e-06,
+ "loss": 0.5054,
+ "step": 2766
+ },
+ {
+ "epoch": 1.308274231678487,
+ "grad_norm": 2.934967279434204,
+ "learning_rate": 4.4665225255249315e-06,
+ "loss": 0.5299,
+ "step": 2767
+ },
+ {
+ "epoch": 1.3087470449172578,
+ "grad_norm": 2.787252187728882,
+ "learning_rate": 4.46613728144855e-06,
+ "loss": 0.4652,
+ "step": 2768
+ },
+ {
+ "epoch": 1.3092198581560284,
+ "grad_norm": 2.567439556121826,
+ "learning_rate": 4.465751914949422e-06,
+ "loss": 0.538,
+ "step": 2769
+ },
+ {
+ "epoch": 1.309692671394799,
+ "grad_norm": 2.6386024951934814,
+ "learning_rate": 4.4653664260515416e-06,
+ "loss": 0.464,
+ "step": 2770
+ },
+ {
+ "epoch": 1.3101654846335697,
+ "grad_norm": 2.966848134994507,
+ "learning_rate": 4.464980814778912e-06,
+ "loss": 0.4889,
+ "step": 2771
+ },
+ {
+ "epoch": 1.3106382978723405,
+ "grad_norm": 2.571256637573242,
+ "learning_rate": 4.464595081155542e-06,
+ "loss": 0.4979,
+ "step": 2772
+ },
+ {
+ "epoch": 1.3111111111111111,
+ "grad_norm": 2.774203062057495,
+ "learning_rate": 4.4642092252054515e-06,
+ "loss": 0.5366,
+ "step": 2773
+ },
+ {
+ "epoch": 1.3115839243498817,
+ "grad_norm": 2.682969331741333,
+ "learning_rate": 4.463823246952666e-06,
+ "loss": 0.5118,
+ "step": 2774
+ },
+ {
+ "epoch": 1.3120567375886525,
+ "grad_norm": 2.4873905181884766,
+ "learning_rate": 4.463437146421217e-06,
+ "loss": 0.5548,
+ "step": 2775
+ },
+ {
+ "epoch": 1.3125295508274233,
+ "grad_norm": 2.6769661903381348,
+ "learning_rate": 4.463050923635147e-06,
+ "loss": 0.5023,
+ "step": 2776
+ },
+ {
+ "epoch": 1.3130023640661939,
+ "grad_norm": 2.7190892696380615,
+ "learning_rate": 4.462664578618503e-06,
+ "loss": 0.5546,
+ "step": 2777
+ },
+ {
+ "epoch": 1.3134751773049644,
+ "grad_norm": 2.8193624019622803,
+ "learning_rate": 4.462278111395343e-06,
+ "loss": 0.5265,
+ "step": 2778
+ },
+ {
+ "epoch": 1.3139479905437352,
+ "grad_norm": 2.7324538230895996,
+ "learning_rate": 4.461891521989728e-06,
+ "loss": 0.5449,
+ "step": 2779
+ },
+ {
+ "epoch": 1.314420803782506,
+ "grad_norm": 2.87320876121521,
+ "learning_rate": 4.4615048104257305e-06,
+ "loss": 0.5367,
+ "step": 2780
+ },
+ {
+ "epoch": 1.3148936170212766,
+ "grad_norm": 2.6777031421661377,
+ "learning_rate": 4.4611179767274306e-06,
+ "loss": 0.5026,
+ "step": 2781
+ },
+ {
+ "epoch": 1.3153664302600472,
+ "grad_norm": 3.714524269104004,
+ "learning_rate": 4.460731020918913e-06,
+ "loss": 0.569,
+ "step": 2782
+ },
+ {
+ "epoch": 1.315839243498818,
+ "grad_norm": 2.7493600845336914,
+ "learning_rate": 4.460343943024273e-06,
+ "loss": 0.5826,
+ "step": 2783
+ },
+ {
+ "epoch": 1.3163120567375888,
+ "grad_norm": 2.6544079780578613,
+ "learning_rate": 4.459956743067609e-06,
+ "loss": 0.5399,
+ "step": 2784
+ },
+ {
+ "epoch": 1.3167848699763594,
+ "grad_norm": 2.4338037967681885,
+ "learning_rate": 4.459569421073036e-06,
+ "loss": 0.5186,
+ "step": 2785
+ },
+ {
+ "epoch": 1.31725768321513,
+ "grad_norm": 2.9312374591827393,
+ "learning_rate": 4.459181977064665e-06,
+ "loss": 0.5571,
+ "step": 2786
+ },
+ {
+ "epoch": 1.3177304964539007,
+ "grad_norm": 2.5988922119140625,
+ "learning_rate": 4.458794411066624e-06,
+ "loss": 0.5926,
+ "step": 2787
+ },
+ {
+ "epoch": 1.3182033096926715,
+ "grad_norm": 2.5193772315979004,
+ "learning_rate": 4.458406723103044e-06,
+ "loss": 0.5243,
+ "step": 2788
+ },
+ {
+ "epoch": 1.318676122931442,
+ "grad_norm": 2.8653743267059326,
+ "learning_rate": 4.458018913198066e-06,
+ "loss": 0.5421,
+ "step": 2789
+ },
+ {
+ "epoch": 1.3191489361702127,
+ "grad_norm": 2.486245632171631,
+ "learning_rate": 4.457630981375834e-06,
+ "loss": 0.4862,
+ "step": 2790
+ },
+ {
+ "epoch": 1.3196217494089835,
+ "grad_norm": 3.155435800552368,
+ "learning_rate": 4.457242927660506e-06,
+ "loss": 0.5386,
+ "step": 2791
+ },
+ {
+ "epoch": 1.3200945626477543,
+ "grad_norm": 3.102023124694824,
+ "learning_rate": 4.456854752076242e-06,
+ "loss": 0.5527,
+ "step": 2792
+ },
+ {
+ "epoch": 1.3205673758865248,
+ "grad_norm": 2.7995986938476562,
+ "learning_rate": 4.456466454647215e-06,
+ "loss": 0.4364,
+ "step": 2793
+ },
+ {
+ "epoch": 1.3210401891252954,
+ "grad_norm": 2.8328311443328857,
+ "learning_rate": 4.456078035397599e-06,
+ "loss": 0.5516,
+ "step": 2794
+ },
+ {
+ "epoch": 1.3215130023640662,
+ "grad_norm": 2.606161594390869,
+ "learning_rate": 4.455689494351581e-06,
+ "loss": 0.5042,
+ "step": 2795
+ },
+ {
+ "epoch": 1.321985815602837,
+ "grad_norm": 2.6344757080078125,
+ "learning_rate": 4.455300831533354e-06,
+ "loss": 0.4807,
+ "step": 2796
+ },
+ {
+ "epoch": 1.3224586288416076,
+ "grad_norm": 2.8539786338806152,
+ "learning_rate": 4.454912046967118e-06,
+ "loss": 0.4694,
+ "step": 2797
+ },
+ {
+ "epoch": 1.3229314420803782,
+ "grad_norm": 2.849066734313965,
+ "learning_rate": 4.454523140677081e-06,
+ "loss": 0.5037,
+ "step": 2798
+ },
+ {
+ "epoch": 1.323404255319149,
+ "grad_norm": 2.6803371906280518,
+ "learning_rate": 4.454134112687458e-06,
+ "loss": 0.4959,
+ "step": 2799
+ },
+ {
+ "epoch": 1.3238770685579198,
+ "grad_norm": 3.0546066761016846,
+ "learning_rate": 4.453744963022473e-06,
+ "loss": 0.5935,
+ "step": 2800
+ },
+ {
+ "epoch": 1.3243498817966903,
+ "grad_norm": 2.625602960586548,
+ "learning_rate": 4.453355691706356e-06,
+ "loss": 0.5349,
+ "step": 2801
+ },
+ {
+ "epoch": 1.324822695035461,
+ "grad_norm": 2.7568554878234863,
+ "learning_rate": 4.452966298763345e-06,
+ "loss": 0.5012,
+ "step": 2802
+ },
+ {
+ "epoch": 1.3252955082742317,
+ "grad_norm": 2.940427303314209,
+ "learning_rate": 4.452576784217686e-06,
+ "loss": 0.5246,
+ "step": 2803
+ },
+ {
+ "epoch": 1.3257683215130025,
+ "grad_norm": 2.5485289096832275,
+ "learning_rate": 4.452187148093633e-06,
+ "loss": 0.5282,
+ "step": 2804
+ },
+ {
+ "epoch": 1.326241134751773,
+ "grad_norm": 2.8152987957000732,
+ "learning_rate": 4.4517973904154455e-06,
+ "loss": 0.5468,
+ "step": 2805
+ },
+ {
+ "epoch": 1.3267139479905437,
+ "grad_norm": 2.9399688243865967,
+ "learning_rate": 4.451407511207393e-06,
+ "loss": 0.5586,
+ "step": 2806
+ },
+ {
+ "epoch": 1.3271867612293144,
+ "grad_norm": 2.3870036602020264,
+ "learning_rate": 4.451017510493751e-06,
+ "loss": 0.4807,
+ "step": 2807
+ },
+ {
+ "epoch": 1.327659574468085,
+ "grad_norm": 3.4667887687683105,
+ "learning_rate": 4.450627388298805e-06,
+ "loss": 0.5571,
+ "step": 2808
+ },
+ {
+ "epoch": 1.3281323877068558,
+ "grad_norm": 2.685986042022705,
+ "learning_rate": 4.450237144646844e-06,
+ "loss": 0.5525,
+ "step": 2809
+ },
+ {
+ "epoch": 1.3286052009456264,
+ "grad_norm": 2.8529131412506104,
+ "learning_rate": 4.449846779562168e-06,
+ "loss": 0.491,
+ "step": 2810
+ },
+ {
+ "epoch": 1.3290780141843972,
+ "grad_norm": 2.7360332012176514,
+ "learning_rate": 4.449456293069082e-06,
+ "loss": 0.5574,
+ "step": 2811
+ },
+ {
+ "epoch": 1.3295508274231678,
+ "grad_norm": 2.4656026363372803,
+ "learning_rate": 4.4490656851919015e-06,
+ "loss": 0.4678,
+ "step": 2812
+ },
+ {
+ "epoch": 1.3300236406619386,
+ "grad_norm": 2.602651357650757,
+ "learning_rate": 4.448674955954947e-06,
+ "loss": 0.5118,
+ "step": 2813
+ },
+ {
+ "epoch": 1.3304964539007091,
+ "grad_norm": 3.0129756927490234,
+ "learning_rate": 4.448284105382548e-06,
+ "loss": 0.6136,
+ "step": 2814
+ },
+ {
+ "epoch": 1.33096926713948,
+ "grad_norm": 2.8499927520751953,
+ "learning_rate": 4.447893133499039e-06,
+ "loss": 0.5286,
+ "step": 2815
+ },
+ {
+ "epoch": 1.3314420803782505,
+ "grad_norm": 2.8320744037628174,
+ "learning_rate": 4.447502040328767e-06,
+ "loss": 0.5186,
+ "step": 2816
+ },
+ {
+ "epoch": 1.3319148936170213,
+ "grad_norm": 2.499950885772705,
+ "learning_rate": 4.447110825896084e-06,
+ "loss": 0.5338,
+ "step": 2817
+ },
+ {
+ "epoch": 1.3323877068557919,
+ "grad_norm": 2.530895233154297,
+ "learning_rate": 4.446719490225346e-06,
+ "loss": 0.5151,
+ "step": 2818
+ },
+ {
+ "epoch": 1.3328605200945627,
+ "grad_norm": 2.5276098251342773,
+ "learning_rate": 4.446328033340921e-06,
+ "loss": 0.5424,
+ "step": 2819
+ },
+ {
+ "epoch": 1.3333333333333333,
+ "grad_norm": 2.90218186378479,
+ "learning_rate": 4.4459364552671845e-06,
+ "loss": 0.5747,
+ "step": 2820
+ },
+ {
+ "epoch": 1.333806146572104,
+ "grad_norm": 2.500943183898926,
+ "learning_rate": 4.445544756028518e-06,
+ "loss": 0.5459,
+ "step": 2821
+ },
+ {
+ "epoch": 1.3342789598108746,
+ "grad_norm": 2.960374355316162,
+ "learning_rate": 4.44515293564931e-06,
+ "loss": 0.6092,
+ "step": 2822
+ },
+ {
+ "epoch": 1.3347517730496454,
+ "grad_norm": 2.813671827316284,
+ "learning_rate": 4.444760994153958e-06,
+ "loss": 0.5536,
+ "step": 2823
+ },
+ {
+ "epoch": 1.335224586288416,
+ "grad_norm": 2.7147483825683594,
+ "learning_rate": 4.444368931566867e-06,
+ "loss": 0.5291,
+ "step": 2824
+ },
+ {
+ "epoch": 1.3356973995271868,
+ "grad_norm": 2.710101842880249,
+ "learning_rate": 4.443976747912447e-06,
+ "loss": 0.5138,
+ "step": 2825
+ },
+ {
+ "epoch": 1.3361702127659574,
+ "grad_norm": 2.711419105529785,
+ "learning_rate": 4.443584443215121e-06,
+ "loss": 0.5223,
+ "step": 2826
+ },
+ {
+ "epoch": 1.3366430260047282,
+ "grad_norm": 2.887472152709961,
+ "learning_rate": 4.443192017499313e-06,
+ "loss": 0.5464,
+ "step": 2827
+ },
+ {
+ "epoch": 1.3371158392434987,
+ "grad_norm": 2.8867223262786865,
+ "learning_rate": 4.4427994707894585e-06,
+ "loss": 0.5748,
+ "step": 2828
+ },
+ {
+ "epoch": 1.3375886524822695,
+ "grad_norm": 2.407247543334961,
+ "learning_rate": 4.44240680311e-06,
+ "loss": 0.4727,
+ "step": 2829
+ },
+ {
+ "epoch": 1.3380614657210401,
+ "grad_norm": 2.578420877456665,
+ "learning_rate": 4.4420140144853865e-06,
+ "loss": 0.5129,
+ "step": 2830
+ },
+ {
+ "epoch": 1.338534278959811,
+ "grad_norm": 2.884373426437378,
+ "learning_rate": 4.441621104940077e-06,
+ "loss": 0.5366,
+ "step": 2831
+ },
+ {
+ "epoch": 1.3390070921985815,
+ "grad_norm": 2.8652374744415283,
+ "learning_rate": 4.441228074498534e-06,
+ "loss": 0.5045,
+ "step": 2832
+ },
+ {
+ "epoch": 1.3394799054373523,
+ "grad_norm": 2.5380210876464844,
+ "learning_rate": 4.440834923185231e-06,
+ "loss": 0.509,
+ "step": 2833
+ },
+ {
+ "epoch": 1.3399527186761229,
+ "grad_norm": 2.415734052658081,
+ "learning_rate": 4.440441651024648e-06,
+ "loss": 0.5066,
+ "step": 2834
+ },
+ {
+ "epoch": 1.3404255319148937,
+ "grad_norm": 2.503051996231079,
+ "learning_rate": 4.440048258041272e-06,
+ "loss": 0.5118,
+ "step": 2835
+ },
+ {
+ "epoch": 1.3408983451536642,
+ "grad_norm": 3.351001024246216,
+ "learning_rate": 4.439654744259598e-06,
+ "loss": 0.5758,
+ "step": 2836
+ },
+ {
+ "epoch": 1.341371158392435,
+ "grad_norm": 2.7368781566619873,
+ "learning_rate": 4.439261109704129e-06,
+ "loss": 0.5674,
+ "step": 2837
+ },
+ {
+ "epoch": 1.3418439716312056,
+ "grad_norm": 3.008199453353882,
+ "learning_rate": 4.438867354399372e-06,
+ "loss": 0.5891,
+ "step": 2838
+ },
+ {
+ "epoch": 1.3423167848699764,
+ "grad_norm": 2.538907766342163,
+ "learning_rate": 4.438473478369847e-06,
+ "loss": 0.5102,
+ "step": 2839
+ },
+ {
+ "epoch": 1.342789598108747,
+ "grad_norm": 2.7169063091278076,
+ "learning_rate": 4.438079481640079e-06,
+ "loss": 0.6131,
+ "step": 2840
+ },
+ {
+ "epoch": 1.3432624113475178,
+ "grad_norm": 2.7411608695983887,
+ "learning_rate": 4.437685364234601e-06,
+ "loss": 0.5337,
+ "step": 2841
+ },
+ {
+ "epoch": 1.3437352245862884,
+ "grad_norm": 3.2374939918518066,
+ "learning_rate": 4.43729112617795e-06,
+ "loss": 0.5401,
+ "step": 2842
+ },
+ {
+ "epoch": 1.3442080378250592,
+ "grad_norm": 2.4712226390838623,
+ "learning_rate": 4.436896767494676e-06,
+ "loss": 0.5365,
+ "step": 2843
+ },
+ {
+ "epoch": 1.3446808510638297,
+ "grad_norm": 2.661619186401367,
+ "learning_rate": 4.436502288209334e-06,
+ "loss": 0.4919,
+ "step": 2844
+ },
+ {
+ "epoch": 1.3451536643026005,
+ "grad_norm": 2.5943779945373535,
+ "learning_rate": 4.4361076883464845e-06,
+ "loss": 0.5253,
+ "step": 2845
+ },
+ {
+ "epoch": 1.345626477541371,
+ "grad_norm": 2.672297477722168,
+ "learning_rate": 4.4357129679307e-06,
+ "loss": 0.541,
+ "step": 2846
+ },
+ {
+ "epoch": 1.346099290780142,
+ "grad_norm": 2.6830925941467285,
+ "learning_rate": 4.435318126986557e-06,
+ "loss": 0.5641,
+ "step": 2847
+ },
+ {
+ "epoch": 1.3465721040189125,
+ "grad_norm": 2.7394626140594482,
+ "learning_rate": 4.434923165538639e-06,
+ "loss": 0.5591,
+ "step": 2848
+ },
+ {
+ "epoch": 1.3470449172576833,
+ "grad_norm": 2.9656317234039307,
+ "learning_rate": 4.434528083611541e-06,
+ "loss": 0.515,
+ "step": 2849
+ },
+ {
+ "epoch": 1.3475177304964538,
+ "grad_norm": 3.30155086517334,
+ "learning_rate": 4.434132881229861e-06,
+ "loss": 0.5871,
+ "step": 2850
+ },
+ {
+ "epoch": 1.3479905437352246,
+ "grad_norm": 2.6222476959228516,
+ "learning_rate": 4.433737558418209e-06,
+ "loss": 0.5143,
+ "step": 2851
+ },
+ {
+ "epoch": 1.3484633569739952,
+ "grad_norm": 2.903158187866211,
+ "learning_rate": 4.4333421152011965e-06,
+ "loss": 0.4484,
+ "step": 2852
+ },
+ {
+ "epoch": 1.348936170212766,
+ "grad_norm": 2.863116979598999,
+ "learning_rate": 4.432946551603449e-06,
+ "loss": 0.5213,
+ "step": 2853
+ },
+ {
+ "epoch": 1.3494089834515366,
+ "grad_norm": 2.8253962993621826,
+ "learning_rate": 4.432550867649596e-06,
+ "loss": 0.5713,
+ "step": 2854
+ },
+ {
+ "epoch": 1.3498817966903074,
+ "grad_norm": 2.652493953704834,
+ "learning_rate": 4.432155063364273e-06,
+ "loss": 0.5559,
+ "step": 2855
+ },
+ {
+ "epoch": 1.350354609929078,
+ "grad_norm": 2.4289376735687256,
+ "learning_rate": 4.431759138772127e-06,
+ "loss": 0.5122,
+ "step": 2856
+ },
+ {
+ "epoch": 1.3508274231678488,
+ "grad_norm": 2.6329853534698486,
+ "learning_rate": 4.43136309389781e-06,
+ "loss": 0.5332,
+ "step": 2857
+ },
+ {
+ "epoch": 1.3513002364066193,
+ "grad_norm": 2.431103229522705,
+ "learning_rate": 4.430966928765982e-06,
+ "loss": 0.4863,
+ "step": 2858
+ },
+ {
+ "epoch": 1.3517730496453901,
+ "grad_norm": 2.7529025077819824,
+ "learning_rate": 4.4305706434013106e-06,
+ "loss": 0.5263,
+ "step": 2859
+ },
+ {
+ "epoch": 1.3522458628841607,
+ "grad_norm": 2.884605646133423,
+ "learning_rate": 4.43017423782847e-06,
+ "loss": 0.564,
+ "step": 2860
+ },
+ {
+ "epoch": 1.3527186761229315,
+ "grad_norm": 3.027771234512329,
+ "learning_rate": 4.4297777120721435e-06,
+ "loss": 0.5846,
+ "step": 2861
+ },
+ {
+ "epoch": 1.353191489361702,
+ "grad_norm": 3.0140626430511475,
+ "learning_rate": 4.4293810661570205e-06,
+ "loss": 0.6621,
+ "step": 2862
+ },
+ {
+ "epoch": 1.3536643026004729,
+ "grad_norm": 2.721799612045288,
+ "learning_rate": 4.428984300107799e-06,
+ "loss": 0.5566,
+ "step": 2863
+ },
+ {
+ "epoch": 1.3541371158392435,
+ "grad_norm": 3.0016496181488037,
+ "learning_rate": 4.428587413949183e-06,
+ "loss": 0.5525,
+ "step": 2864
+ },
+ {
+ "epoch": 1.3546099290780143,
+ "grad_norm": 2.77138614654541,
+ "learning_rate": 4.428190407705886e-06,
+ "loss": 0.6016,
+ "step": 2865
+ },
+ {
+ "epoch": 1.3550827423167848,
+ "grad_norm": 2.9783477783203125,
+ "learning_rate": 4.427793281402627e-06,
+ "loss": 0.5556,
+ "step": 2866
+ },
+ {
+ "epoch": 1.3555555555555556,
+ "grad_norm": 2.2490382194519043,
+ "learning_rate": 4.427396035064132e-06,
+ "loss": 0.5138,
+ "step": 2867
+ },
+ {
+ "epoch": 1.3560283687943262,
+ "grad_norm": 2.442225217819214,
+ "learning_rate": 4.426998668715139e-06,
+ "loss": 0.4843,
+ "step": 2868
+ },
+ {
+ "epoch": 1.356501182033097,
+ "grad_norm": 2.74040150642395,
+ "learning_rate": 4.426601182380388e-06,
+ "loss": 0.54,
+ "step": 2869
+ },
+ {
+ "epoch": 1.3569739952718676,
+ "grad_norm": 2.4434332847595215,
+ "learning_rate": 4.426203576084629e-06,
+ "loss": 0.5199,
+ "step": 2870
+ },
+ {
+ "epoch": 1.3574468085106384,
+ "grad_norm": 2.6380388736724854,
+ "learning_rate": 4.42580584985262e-06,
+ "loss": 0.5049,
+ "step": 2871
+ },
+ {
+ "epoch": 1.357919621749409,
+ "grad_norm": 2.7324254512786865,
+ "learning_rate": 4.425408003709125e-06,
+ "loss": 0.5036,
+ "step": 2872
+ },
+ {
+ "epoch": 1.3583924349881797,
+ "grad_norm": 2.661012649536133,
+ "learning_rate": 4.425010037678916e-06,
+ "loss": 0.4965,
+ "step": 2873
+ },
+ {
+ "epoch": 1.3588652482269503,
+ "grad_norm": 2.5380208492279053,
+ "learning_rate": 4.424611951786773e-06,
+ "loss": 0.4293,
+ "step": 2874
+ },
+ {
+ "epoch": 1.3593380614657211,
+ "grad_norm": 2.6060714721679688,
+ "learning_rate": 4.424213746057483e-06,
+ "loss": 0.5335,
+ "step": 2875
+ },
+ {
+ "epoch": 1.3598108747044917,
+ "grad_norm": 2.98282527923584,
+ "learning_rate": 4.423815420515841e-06,
+ "loss": 0.5626,
+ "step": 2876
+ },
+ {
+ "epoch": 1.3602836879432625,
+ "grad_norm": 2.779371500015259,
+ "learning_rate": 4.423416975186647e-06,
+ "loss": 0.5353,
+ "step": 2877
+ },
+ {
+ "epoch": 1.360756501182033,
+ "grad_norm": 2.8033530712127686,
+ "learning_rate": 4.423018410094713e-06,
+ "loss": 0.538,
+ "step": 2878
+ },
+ {
+ "epoch": 1.3612293144208039,
+ "grad_norm": 3.225177764892578,
+ "learning_rate": 4.422619725264855e-06,
+ "loss": 0.5441,
+ "step": 2879
+ },
+ {
+ "epoch": 1.3617021276595744,
+ "grad_norm": 2.959135055541992,
+ "learning_rate": 4.422220920721896e-06,
+ "loss": 0.5293,
+ "step": 2880
+ },
+ {
+ "epoch": 1.3621749408983452,
+ "grad_norm": 2.5558884143829346,
+ "learning_rate": 4.4218219964906704e-06,
+ "loss": 0.442,
+ "step": 2881
+ },
+ {
+ "epoch": 1.3626477541371158,
+ "grad_norm": 2.694899797439575,
+ "learning_rate": 4.421422952596015e-06,
+ "loss": 0.5318,
+ "step": 2882
+ },
+ {
+ "epoch": 1.3631205673758866,
+ "grad_norm": 2.7909531593322754,
+ "learning_rate": 4.421023789062777e-06,
+ "loss": 0.6648,
+ "step": 2883
+ },
+ {
+ "epoch": 1.3635933806146572,
+ "grad_norm": 2.421995162963867,
+ "learning_rate": 4.420624505915813e-06,
+ "loss": 0.4644,
+ "step": 2884
+ },
+ {
+ "epoch": 1.364066193853428,
+ "grad_norm": 2.5876688957214355,
+ "learning_rate": 4.420225103179981e-06,
+ "loss": 0.5743,
+ "step": 2885
+ },
+ {
+ "epoch": 1.3645390070921986,
+ "grad_norm": 2.89341139793396,
+ "learning_rate": 4.419825580880152e-06,
+ "loss": 0.5454,
+ "step": 2886
+ },
+ {
+ "epoch": 1.3650118203309693,
+ "grad_norm": 2.534708261489868,
+ "learning_rate": 4.419425939041203e-06,
+ "loss": 0.5572,
+ "step": 2887
+ },
+ {
+ "epoch": 1.36548463356974,
+ "grad_norm": 2.6052141189575195,
+ "learning_rate": 4.419026177688017e-06,
+ "loss": 0.4763,
+ "step": 2888
+ },
+ {
+ "epoch": 1.3659574468085105,
+ "grad_norm": 2.723720073699951,
+ "learning_rate": 4.4186262968454854e-06,
+ "loss": 0.5659,
+ "step": 2889
+ },
+ {
+ "epoch": 1.3664302600472813,
+ "grad_norm": 2.8909599781036377,
+ "learning_rate": 4.418226296538507e-06,
+ "loss": 0.4996,
+ "step": 2890
+ },
+ {
+ "epoch": 1.366903073286052,
+ "grad_norm": 2.551375389099121,
+ "learning_rate": 4.417826176791988e-06,
+ "loss": 0.5259,
+ "step": 2891
+ },
+ {
+ "epoch": 1.3673758865248227,
+ "grad_norm": 3.360267162322998,
+ "learning_rate": 4.417425937630843e-06,
+ "loss": 0.5381,
+ "step": 2892
+ },
+ {
+ "epoch": 1.3678486997635932,
+ "grad_norm": 2.7611942291259766,
+ "learning_rate": 4.417025579079992e-06,
+ "loss": 0.6022,
+ "step": 2893
+ },
+ {
+ "epoch": 1.368321513002364,
+ "grad_norm": 2.5931224822998047,
+ "learning_rate": 4.416625101164365e-06,
+ "loss": 0.5102,
+ "step": 2894
+ },
+ {
+ "epoch": 1.3687943262411348,
+ "grad_norm": 2.5888102054595947,
+ "learning_rate": 4.416224503908897e-06,
+ "loss": 0.4955,
+ "step": 2895
+ },
+ {
+ "epoch": 1.3692671394799054,
+ "grad_norm": 2.6262896060943604,
+ "learning_rate": 4.41582378733853e-06,
+ "loss": 0.5101,
+ "step": 2896
+ },
+ {
+ "epoch": 1.369739952718676,
+ "grad_norm": 3.339170217514038,
+ "learning_rate": 4.415422951478218e-06,
+ "loss": 0.4939,
+ "step": 2897
+ },
+ {
+ "epoch": 1.3702127659574468,
+ "grad_norm": 2.940866708755493,
+ "learning_rate": 4.415021996352917e-06,
+ "loss": 0.5157,
+ "step": 2898
+ },
+ {
+ "epoch": 1.3706855791962176,
+ "grad_norm": 2.7423818111419678,
+ "learning_rate": 4.414620921987594e-06,
+ "loss": 0.5308,
+ "step": 2899
+ },
+ {
+ "epoch": 1.3711583924349882,
+ "grad_norm": 2.7177040576934814,
+ "learning_rate": 4.414219728407221e-06,
+ "loss": 0.5429,
+ "step": 2900
+ },
+ {
+ "epoch": 1.3716312056737587,
+ "grad_norm": 2.560774087905884,
+ "learning_rate": 4.4138184156367794e-06,
+ "loss": 0.5266,
+ "step": 2901
+ },
+ {
+ "epoch": 1.3721040189125295,
+ "grad_norm": 2.5649116039276123,
+ "learning_rate": 4.413416983701256e-06,
+ "loss": 0.4718,
+ "step": 2902
+ },
+ {
+ "epoch": 1.3725768321513003,
+ "grad_norm": 2.8547167778015137,
+ "learning_rate": 4.413015432625648e-06,
+ "loss": 0.5129,
+ "step": 2903
+ },
+ {
+ "epoch": 1.373049645390071,
+ "grad_norm": 2.5413618087768555,
+ "learning_rate": 4.412613762434958e-06,
+ "loss": 0.5738,
+ "step": 2904
+ },
+ {
+ "epoch": 1.3735224586288415,
+ "grad_norm": 3.3252241611480713,
+ "learning_rate": 4.412211973154195e-06,
+ "loss": 0.5639,
+ "step": 2905
+ },
+ {
+ "epoch": 1.3739952718676123,
+ "grad_norm": 2.869102954864502,
+ "learning_rate": 4.411810064808376e-06,
+ "loss": 0.5384,
+ "step": 2906
+ },
+ {
+ "epoch": 1.374468085106383,
+ "grad_norm": 2.703199863433838,
+ "learning_rate": 4.411408037422529e-06,
+ "loss": 0.5742,
+ "step": 2907
+ },
+ {
+ "epoch": 1.3749408983451537,
+ "grad_norm": 2.685450792312622,
+ "learning_rate": 4.411005891021684e-06,
+ "loss": 0.5121,
+ "step": 2908
+ },
+ {
+ "epoch": 1.3754137115839242,
+ "grad_norm": 2.9572203159332275,
+ "learning_rate": 4.410603625630882e-06,
+ "loss": 0.5444,
+ "step": 2909
+ },
+ {
+ "epoch": 1.375886524822695,
+ "grad_norm": 2.707002878189087,
+ "learning_rate": 4.410201241275169e-06,
+ "loss": 0.5125,
+ "step": 2910
+ },
+ {
+ "epoch": 1.3763593380614658,
+ "grad_norm": 3.0158939361572266,
+ "learning_rate": 4.409798737979602e-06,
+ "loss": 0.5299,
+ "step": 2911
+ },
+ {
+ "epoch": 1.3768321513002364,
+ "grad_norm": 2.7932698726654053,
+ "learning_rate": 4.4093961157692415e-06,
+ "loss": 0.5437,
+ "step": 2912
+ },
+ {
+ "epoch": 1.377304964539007,
+ "grad_norm": 2.459510326385498,
+ "learning_rate": 4.408993374669156e-06,
+ "loss": 0.5548,
+ "step": 2913
+ },
+ {
+ "epoch": 1.3777777777777778,
+ "grad_norm": 2.7500696182250977,
+ "learning_rate": 4.408590514704425e-06,
+ "loss": 0.5186,
+ "step": 2914
+ },
+ {
+ "epoch": 1.3782505910165486,
+ "grad_norm": 2.7824268341064453,
+ "learning_rate": 4.4081875359001315e-06,
+ "loss": 0.4762,
+ "step": 2915
+ },
+ {
+ "epoch": 1.3787234042553191,
+ "grad_norm": 2.4202158451080322,
+ "learning_rate": 4.4077844382813675e-06,
+ "loss": 0.5005,
+ "step": 2916
+ },
+ {
+ "epoch": 1.3791962174940897,
+ "grad_norm": 2.5566670894622803,
+ "learning_rate": 4.4073812218732316e-06,
+ "loss": 0.5377,
+ "step": 2917
+ },
+ {
+ "epoch": 1.3796690307328605,
+ "grad_norm": 3.400874376296997,
+ "learning_rate": 4.406977886700831e-06,
+ "loss": 0.6637,
+ "step": 2918
+ },
+ {
+ "epoch": 1.3801418439716313,
+ "grad_norm": 2.8187878131866455,
+ "learning_rate": 4.406574432789278e-06,
+ "loss": 0.5033,
+ "step": 2919
+ },
+ {
+ "epoch": 1.3806146572104019,
+ "grad_norm": 2.5578041076660156,
+ "learning_rate": 4.406170860163697e-06,
+ "loss": 0.5293,
+ "step": 2920
+ },
+ {
+ "epoch": 1.3810874704491725,
+ "grad_norm": 2.6709718704223633,
+ "learning_rate": 4.405767168849213e-06,
+ "loss": 0.5144,
+ "step": 2921
+ },
+ {
+ "epoch": 1.3815602836879433,
+ "grad_norm": 3.049365997314453,
+ "learning_rate": 4.405363358870965e-06,
+ "loss": 0.4894,
+ "step": 2922
+ },
+ {
+ "epoch": 1.382033096926714,
+ "grad_norm": 2.5569891929626465,
+ "learning_rate": 4.404959430254095e-06,
+ "loss": 0.4929,
+ "step": 2923
+ },
+ {
+ "epoch": 1.3825059101654846,
+ "grad_norm": 2.8288230895996094,
+ "learning_rate": 4.404555383023754e-06,
+ "loss": 0.5438,
+ "step": 2924
+ },
+ {
+ "epoch": 1.3829787234042552,
+ "grad_norm": 2.8363358974456787,
+ "learning_rate": 4.404151217205102e-06,
+ "loss": 0.545,
+ "step": 2925
+ },
+ {
+ "epoch": 1.383451536643026,
+ "grad_norm": 2.720972776412964,
+ "learning_rate": 4.403746932823302e-06,
+ "loss": 0.5732,
+ "step": 2926
+ },
+ {
+ "epoch": 1.3839243498817968,
+ "grad_norm": 2.728043794631958,
+ "learning_rate": 4.403342529903528e-06,
+ "loss": 0.4944,
+ "step": 2927
+ },
+ {
+ "epoch": 1.3843971631205674,
+ "grad_norm": 2.4366135597229004,
+ "learning_rate": 4.402938008470961e-06,
+ "loss": 0.4441,
+ "step": 2928
+ },
+ {
+ "epoch": 1.384869976359338,
+ "grad_norm": 2.858454704284668,
+ "learning_rate": 4.402533368550788e-06,
+ "loss": 0.5359,
+ "step": 2929
+ },
+ {
+ "epoch": 1.3853427895981087,
+ "grad_norm": 2.805795907974243,
+ "learning_rate": 4.402128610168205e-06,
+ "loss": 0.4954,
+ "step": 2930
+ },
+ {
+ "epoch": 1.3858156028368795,
+ "grad_norm": 3.3514177799224854,
+ "learning_rate": 4.401723733348413e-06,
+ "loss": 0.579,
+ "step": 2931
+ },
+ {
+ "epoch": 1.3862884160756501,
+ "grad_norm": 2.6255125999450684,
+ "learning_rate": 4.401318738116624e-06,
+ "loss": 0.5002,
+ "step": 2932
+ },
+ {
+ "epoch": 1.3867612293144207,
+ "grad_norm": 2.3480796813964844,
+ "learning_rate": 4.400913624498054e-06,
+ "loss": 0.4688,
+ "step": 2933
+ },
+ {
+ "epoch": 1.3872340425531915,
+ "grad_norm": 2.710165023803711,
+ "learning_rate": 4.400508392517927e-06,
+ "loss": 0.5099,
+ "step": 2934
+ },
+ {
+ "epoch": 1.3877068557919623,
+ "grad_norm": 2.5820295810699463,
+ "learning_rate": 4.400103042201477e-06,
+ "loss": 0.512,
+ "step": 2935
+ },
+ {
+ "epoch": 1.3881796690307329,
+ "grad_norm": 2.750596523284912,
+ "learning_rate": 4.399697573573942e-06,
+ "loss": 0.463,
+ "step": 2936
+ },
+ {
+ "epoch": 1.3886524822695034,
+ "grad_norm": 3.497537612915039,
+ "learning_rate": 4.399291986660569e-06,
+ "loss": 0.5676,
+ "step": 2937
+ },
+ {
+ "epoch": 1.3891252955082742,
+ "grad_norm": 2.4046003818511963,
+ "learning_rate": 4.398886281486612e-06,
+ "loss": 0.5408,
+ "step": 2938
+ },
+ {
+ "epoch": 1.389598108747045,
+ "grad_norm": 2.941606283187866,
+ "learning_rate": 4.398480458077332e-06,
+ "loss": 0.5734,
+ "step": 2939
+ },
+ {
+ "epoch": 1.3900709219858156,
+ "grad_norm": 3.030214309692383,
+ "learning_rate": 4.398074516458e-06,
+ "loss": 0.5353,
+ "step": 2940
+ },
+ {
+ "epoch": 1.3905437352245862,
+ "grad_norm": 2.9991626739501953,
+ "learning_rate": 4.397668456653889e-06,
+ "loss": 0.5989,
+ "step": 2941
+ },
+ {
+ "epoch": 1.391016548463357,
+ "grad_norm": 4.163141250610352,
+ "learning_rate": 4.397262278690285e-06,
+ "loss": 0.5436,
+ "step": 2942
+ },
+ {
+ "epoch": 1.3914893617021278,
+ "grad_norm": 2.6576037406921387,
+ "learning_rate": 4.396855982592478e-06,
+ "loss": 0.5206,
+ "step": 2943
+ },
+ {
+ "epoch": 1.3919621749408984,
+ "grad_norm": 2.7729203701019287,
+ "learning_rate": 4.396449568385768e-06,
+ "loss": 0.5403,
+ "step": 2944
+ },
+ {
+ "epoch": 1.392434988179669,
+ "grad_norm": 2.4560446739196777,
+ "learning_rate": 4.396043036095457e-06,
+ "loss": 0.4924,
+ "step": 2945
+ },
+ {
+ "epoch": 1.3929078014184397,
+ "grad_norm": 2.6370556354522705,
+ "learning_rate": 4.39563638574686e-06,
+ "loss": 0.5543,
+ "step": 2946
+ },
+ {
+ "epoch": 1.3933806146572105,
+ "grad_norm": 2.593914270401001,
+ "learning_rate": 4.395229617365298e-06,
+ "loss": 0.5133,
+ "step": 2947
+ },
+ {
+ "epoch": 1.393853427895981,
+ "grad_norm": 2.3583998680114746,
+ "learning_rate": 4.394822730976099e-06,
+ "loss": 0.4436,
+ "step": 2948
+ },
+ {
+ "epoch": 1.3943262411347517,
+ "grad_norm": 3.2768537998199463,
+ "learning_rate": 4.394415726604596e-06,
+ "loss": 0.5489,
+ "step": 2949
+ },
+ {
+ "epoch": 1.3947990543735225,
+ "grad_norm": 2.88662052154541,
+ "learning_rate": 4.394008604276133e-06,
+ "loss": 0.5194,
+ "step": 2950
+ },
+ {
+ "epoch": 1.3952718676122933,
+ "grad_norm": 2.46610426902771,
+ "learning_rate": 4.393601364016059e-06,
+ "loss": 0.5255,
+ "step": 2951
+ },
+ {
+ "epoch": 1.3957446808510638,
+ "grad_norm": 3.122509241104126,
+ "learning_rate": 4.393194005849731e-06,
+ "loss": 0.6046,
+ "step": 2952
+ },
+ {
+ "epoch": 1.3962174940898344,
+ "grad_norm": 2.724926471710205,
+ "learning_rate": 4.392786529802513e-06,
+ "loss": 0.4958,
+ "step": 2953
+ },
+ {
+ "epoch": 1.3966903073286052,
+ "grad_norm": 2.491485595703125,
+ "learning_rate": 4.3923789358997785e-06,
+ "loss": 0.5209,
+ "step": 2954
+ },
+ {
+ "epoch": 1.397163120567376,
+ "grad_norm": 2.61110520362854,
+ "learning_rate": 4.3919712241669056e-06,
+ "loss": 0.5202,
+ "step": 2955
+ },
+ {
+ "epoch": 1.3976359338061466,
+ "grad_norm": 2.3814501762390137,
+ "learning_rate": 4.39156339462928e-06,
+ "loss": 0.4966,
+ "step": 2956
+ },
+ {
+ "epoch": 1.3981087470449172,
+ "grad_norm": 2.762498617172241,
+ "learning_rate": 4.391155447312296e-06,
+ "loss": 0.6025,
+ "step": 2957
+ },
+ {
+ "epoch": 1.398581560283688,
+ "grad_norm": 2.964975595474243,
+ "learning_rate": 4.390747382241355e-06,
+ "loss": 0.4845,
+ "step": 2958
+ },
+ {
+ "epoch": 1.3990543735224588,
+ "grad_norm": 3.0117249488830566,
+ "learning_rate": 4.3903391994418655e-06,
+ "loss": 0.5326,
+ "step": 2959
+ },
+ {
+ "epoch": 1.3995271867612293,
+ "grad_norm": 2.578626871109009,
+ "learning_rate": 4.389930898939243e-06,
+ "loss": 0.5271,
+ "step": 2960
+ },
+ {
+ "epoch": 1.4,
+ "grad_norm": 2.747441053390503,
+ "learning_rate": 4.38952248075891e-06,
+ "loss": 0.5553,
+ "step": 2961
+ },
+ {
+ "epoch": 1.4004728132387707,
+ "grad_norm": 2.8273086547851562,
+ "learning_rate": 4.389113944926297e-06,
+ "loss": 0.5475,
+ "step": 2962
+ },
+ {
+ "epoch": 1.4009456264775415,
+ "grad_norm": 2.55238676071167,
+ "learning_rate": 4.388705291466843e-06,
+ "loss": 0.4864,
+ "step": 2963
+ },
+ {
+ "epoch": 1.401418439716312,
+ "grad_norm": 2.597214460372925,
+ "learning_rate": 4.388296520405992e-06,
+ "loss": 0.4845,
+ "step": 2964
+ },
+ {
+ "epoch": 1.4018912529550827,
+ "grad_norm": 2.608962297439575,
+ "learning_rate": 4.387887631769196e-06,
+ "loss": 0.5544,
+ "step": 2965
+ },
+ {
+ "epoch": 1.4023640661938535,
+ "grad_norm": 2.2754876613616943,
+ "learning_rate": 4.3874786255819165e-06,
+ "loss": 0.5045,
+ "step": 2966
+ },
+ {
+ "epoch": 1.4028368794326243,
+ "grad_norm": 2.9900264739990234,
+ "learning_rate": 4.387069501869618e-06,
+ "loss": 0.562,
+ "step": 2967
+ },
+ {
+ "epoch": 1.4033096926713948,
+ "grad_norm": 2.8069417476654053,
+ "learning_rate": 4.386660260657778e-06,
+ "loss": 0.5284,
+ "step": 2968
+ },
+ {
+ "epoch": 1.4037825059101654,
+ "grad_norm": 2.68894624710083,
+ "learning_rate": 4.386250901971875e-06,
+ "loss": 0.5879,
+ "step": 2969
+ },
+ {
+ "epoch": 1.4042553191489362,
+ "grad_norm": 2.614485025405884,
+ "learning_rate": 4.385841425837399e-06,
+ "loss": 0.4771,
+ "step": 2970
+ },
+ {
+ "epoch": 1.4047281323877068,
+ "grad_norm": 2.487950325012207,
+ "learning_rate": 4.385431832279848e-06,
+ "loss": 0.5552,
+ "step": 2971
+ },
+ {
+ "epoch": 1.4052009456264776,
+ "grad_norm": 2.5098392963409424,
+ "learning_rate": 4.385022121324723e-06,
+ "loss": 0.5267,
+ "step": 2972
+ },
+ {
+ "epoch": 1.4056737588652481,
+ "grad_norm": 2.825838565826416,
+ "learning_rate": 4.384612292997537e-06,
+ "loss": 0.5336,
+ "step": 2973
+ },
+ {
+ "epoch": 1.406146572104019,
+ "grad_norm": 2.898188829421997,
+ "learning_rate": 4.384202347323806e-06,
+ "loss": 0.5685,
+ "step": 2974
+ },
+ {
+ "epoch": 1.4066193853427895,
+ "grad_norm": 2.8722569942474365,
+ "learning_rate": 4.383792284329057e-06,
+ "loss": 0.5977,
+ "step": 2975
+ },
+ {
+ "epoch": 1.4070921985815603,
+ "grad_norm": 2.832951307296753,
+ "learning_rate": 4.3833821040388235e-06,
+ "loss": 0.5766,
+ "step": 2976
+ },
+ {
+ "epoch": 1.407565011820331,
+ "grad_norm": 2.7353670597076416,
+ "learning_rate": 4.3829718064786446e-06,
+ "loss": 0.5461,
+ "step": 2977
+ },
+ {
+ "epoch": 1.4080378250591017,
+ "grad_norm": 2.6050429344177246,
+ "learning_rate": 4.3825613916740675e-06,
+ "loss": 0.5501,
+ "step": 2978
+ },
+ {
+ "epoch": 1.4085106382978723,
+ "grad_norm": 2.79719877243042,
+ "learning_rate": 4.382150859650647e-06,
+ "loss": 0.502,
+ "step": 2979
+ },
+ {
+ "epoch": 1.408983451536643,
+ "grad_norm": 2.5538079738616943,
+ "learning_rate": 4.381740210433946e-06,
+ "loss": 0.4762,
+ "step": 2980
+ },
+ {
+ "epoch": 1.4094562647754136,
+ "grad_norm": 2.7256062030792236,
+ "learning_rate": 4.381329444049533e-06,
+ "loss": 0.4692,
+ "step": 2981
+ },
+ {
+ "epoch": 1.4099290780141844,
+ "grad_norm": 2.7778146266937256,
+ "learning_rate": 4.3809185605229855e-06,
+ "loss": 0.5366,
+ "step": 2982
+ },
+ {
+ "epoch": 1.410401891252955,
+ "grad_norm": 2.6289451122283936,
+ "learning_rate": 4.380507559879887e-06,
+ "loss": 0.5412,
+ "step": 2983
+ },
+ {
+ "epoch": 1.4108747044917258,
+ "grad_norm": 2.697204828262329,
+ "learning_rate": 4.380096442145827e-06,
+ "loss": 0.5065,
+ "step": 2984
+ },
+ {
+ "epoch": 1.4113475177304964,
+ "grad_norm": 2.4709219932556152,
+ "learning_rate": 4.379685207346407e-06,
+ "loss": 0.568,
+ "step": 2985
+ },
+ {
+ "epoch": 1.4118203309692672,
+ "grad_norm": 2.9740655422210693,
+ "learning_rate": 4.379273855507231e-06,
+ "loss": 0.5512,
+ "step": 2986
+ },
+ {
+ "epoch": 1.4122931442080378,
+ "grad_norm": 3.0090627670288086,
+ "learning_rate": 4.378862386653911e-06,
+ "loss": 0.5459,
+ "step": 2987
+ },
+ {
+ "epoch": 1.4127659574468086,
+ "grad_norm": 2.8835368156433105,
+ "learning_rate": 4.378450800812071e-06,
+ "loss": 0.5357,
+ "step": 2988
+ },
+ {
+ "epoch": 1.4132387706855791,
+ "grad_norm": 2.558824062347412,
+ "learning_rate": 4.378039098007335e-06,
+ "loss": 0.536,
+ "step": 2989
+ },
+ {
+ "epoch": 1.41371158392435,
+ "grad_norm": 2.5572092533111572,
+ "learning_rate": 4.377627278265339e-06,
+ "loss": 0.5183,
+ "step": 2990
+ },
+ {
+ "epoch": 1.4141843971631205,
+ "grad_norm": 2.7356579303741455,
+ "learning_rate": 4.377215341611727e-06,
+ "loss": 0.5087,
+ "step": 2991
+ },
+ {
+ "epoch": 1.4146572104018913,
+ "grad_norm": 2.7541024684906006,
+ "learning_rate": 4.376803288072146e-06,
+ "loss": 0.4509,
+ "step": 2992
+ },
+ {
+ "epoch": 1.4151300236406619,
+ "grad_norm": 2.7548446655273438,
+ "learning_rate": 4.376391117672254e-06,
+ "loss": 0.5532,
+ "step": 2993
+ },
+ {
+ "epoch": 1.4156028368794327,
+ "grad_norm": 2.9107465744018555,
+ "learning_rate": 4.375978830437715e-06,
+ "loss": 0.5719,
+ "step": 2994
+ },
+ {
+ "epoch": 1.4160756501182032,
+ "grad_norm": 2.7077393531799316,
+ "learning_rate": 4.3755664263942e-06,
+ "loss": 0.5084,
+ "step": 2995
+ },
+ {
+ "epoch": 1.416548463356974,
+ "grad_norm": 2.764209270477295,
+ "learning_rate": 4.375153905567388e-06,
+ "loss": 0.5976,
+ "step": 2996
+ },
+ {
+ "epoch": 1.4170212765957446,
+ "grad_norm": 2.7792932987213135,
+ "learning_rate": 4.374741267982964e-06,
+ "loss": 0.5358,
+ "step": 2997
+ },
+ {
+ "epoch": 1.4174940898345154,
+ "grad_norm": 2.459212064743042,
+ "learning_rate": 4.374328513666622e-06,
+ "loss": 0.5181,
+ "step": 2998
+ },
+ {
+ "epoch": 1.417966903073286,
+ "grad_norm": 2.548546552658081,
+ "learning_rate": 4.373915642644062e-06,
+ "loss": 0.528,
+ "step": 2999
+ },
+ {
+ "epoch": 1.4184397163120568,
+ "grad_norm": 2.998138189315796,
+ "learning_rate": 4.373502654940992e-06,
+ "loss": 0.5233,
+ "step": 3000
+ },
+ {
+ "epoch": 1.4189125295508274,
+ "grad_norm": 2.604341983795166,
+ "learning_rate": 4.373089550583126e-06,
+ "loss": 0.5274,
+ "step": 3001
+ },
+ {
+ "epoch": 1.4193853427895982,
+ "grad_norm": 2.6792588233947754,
+ "learning_rate": 4.372676329596188e-06,
+ "loss": 0.5061,
+ "step": 3002
+ },
+ {
+ "epoch": 1.4198581560283687,
+ "grad_norm": 2.5182368755340576,
+ "learning_rate": 4.372262992005906e-06,
+ "loss": 0.541,
+ "step": 3003
+ },
+ {
+ "epoch": 1.4203309692671395,
+ "grad_norm": 2.690718173980713,
+ "learning_rate": 4.371849537838018e-06,
+ "loss": 0.5308,
+ "step": 3004
+ },
+ {
+ "epoch": 1.42080378250591,
+ "grad_norm": 2.6797590255737305,
+ "learning_rate": 4.371435967118266e-06,
+ "loss": 0.5728,
+ "step": 3005
+ },
+ {
+ "epoch": 1.421276595744681,
+ "grad_norm": 2.847900152206421,
+ "learning_rate": 4.371022279872403e-06,
+ "loss": 0.5053,
+ "step": 3006
+ },
+ {
+ "epoch": 1.4217494089834515,
+ "grad_norm": 2.497810125350952,
+ "learning_rate": 4.370608476126186e-06,
+ "loss": 0.5057,
+ "step": 3007
+ },
+ {
+ "epoch": 1.4222222222222223,
+ "grad_norm": 2.5259225368499756,
+ "learning_rate": 4.370194555905382e-06,
+ "loss": 0.5508,
+ "step": 3008
+ },
+ {
+ "epoch": 1.4226950354609929,
+ "grad_norm": 2.774118423461914,
+ "learning_rate": 4.369780519235763e-06,
+ "loss": 0.5419,
+ "step": 3009
+ },
+ {
+ "epoch": 1.4231678486997636,
+ "grad_norm": 2.2764663696289062,
+ "learning_rate": 4.369366366143111e-06,
+ "loss": 0.5032,
+ "step": 3010
+ },
+ {
+ "epoch": 1.4236406619385342,
+ "grad_norm": 2.736347198486328,
+ "learning_rate": 4.368952096653211e-06,
+ "loss": 0.5184,
+ "step": 3011
+ },
+ {
+ "epoch": 1.424113475177305,
+ "grad_norm": 2.476762056350708,
+ "learning_rate": 4.36853771079186e-06,
+ "loss": 0.5331,
+ "step": 3012
+ },
+ {
+ "epoch": 1.4245862884160756,
+ "grad_norm": 2.8006162643432617,
+ "learning_rate": 4.3681232085848585e-06,
+ "loss": 0.5331,
+ "step": 3013
+ },
+ {
+ "epoch": 1.4250591016548464,
+ "grad_norm": 2.509143590927124,
+ "learning_rate": 4.367708590058016e-06,
+ "loss": 0.5127,
+ "step": 3014
+ },
+ {
+ "epoch": 1.425531914893617,
+ "grad_norm": 3.030137538909912,
+ "learning_rate": 4.3672938552371505e-06,
+ "loss": 0.5555,
+ "step": 3015
+ },
+ {
+ "epoch": 1.4260047281323878,
+ "grad_norm": 3.0536904335021973,
+ "learning_rate": 4.3668790041480835e-06,
+ "loss": 0.5241,
+ "step": 3016
+ },
+ {
+ "epoch": 1.4264775413711583,
+ "grad_norm": 2.6400439739227295,
+ "learning_rate": 4.366464036816647e-06,
+ "loss": 0.4946,
+ "step": 3017
+ },
+ {
+ "epoch": 1.4269503546099291,
+ "grad_norm": 2.7302589416503906,
+ "learning_rate": 4.366048953268679e-06,
+ "loss": 0.5105,
+ "step": 3018
+ },
+ {
+ "epoch": 1.4274231678486997,
+ "grad_norm": 2.504549264907837,
+ "learning_rate": 4.365633753530026e-06,
+ "loss": 0.4844,
+ "step": 3019
+ },
+ {
+ "epoch": 1.4278959810874705,
+ "grad_norm": 2.3872320652008057,
+ "learning_rate": 4.365218437626539e-06,
+ "loss": 0.4402,
+ "step": 3020
+ },
+ {
+ "epoch": 1.428368794326241,
+ "grad_norm": 2.531649351119995,
+ "learning_rate": 4.364803005584078e-06,
+ "loss": 0.4913,
+ "step": 3021
+ },
+ {
+ "epoch": 1.4288416075650119,
+ "grad_norm": 2.4683783054351807,
+ "learning_rate": 4.364387457428512e-06,
+ "loss": 0.515,
+ "step": 3022
+ },
+ {
+ "epoch": 1.4293144208037825,
+ "grad_norm": 2.632336378097534,
+ "learning_rate": 4.363971793185713e-06,
+ "loss": 0.5398,
+ "step": 3023
+ },
+ {
+ "epoch": 1.4297872340425533,
+ "grad_norm": 2.7456719875335693,
+ "learning_rate": 4.363556012881565e-06,
+ "loss": 0.5254,
+ "step": 3024
+ },
+ {
+ "epoch": 1.4302600472813238,
+ "grad_norm": 2.607177972793579,
+ "learning_rate": 4.363140116541955e-06,
+ "loss": 0.5266,
+ "step": 3025
+ },
+ {
+ "epoch": 1.4307328605200946,
+ "grad_norm": 2.640127420425415,
+ "learning_rate": 4.3627241041927796e-06,
+ "loss": 0.5157,
+ "step": 3026
+ },
+ {
+ "epoch": 1.4312056737588652,
+ "grad_norm": 2.4210736751556396,
+ "learning_rate": 4.362307975859941e-06,
+ "loss": 0.4599,
+ "step": 3027
+ },
+ {
+ "epoch": 1.431678486997636,
+ "grad_norm": 2.6007790565490723,
+ "learning_rate": 4.361891731569352e-06,
+ "loss": 0.5298,
+ "step": 3028
+ },
+ {
+ "epoch": 1.4321513002364066,
+ "grad_norm": 2.5352046489715576,
+ "learning_rate": 4.361475371346928e-06,
+ "loss": 0.5128,
+ "step": 3029
+ },
+ {
+ "epoch": 1.4326241134751774,
+ "grad_norm": 2.4204049110412598,
+ "learning_rate": 4.361058895218596e-06,
+ "loss": 0.4669,
+ "step": 3030
+ },
+ {
+ "epoch": 1.433096926713948,
+ "grad_norm": 2.525240182876587,
+ "learning_rate": 4.360642303210286e-06,
+ "loss": 0.4925,
+ "step": 3031
+ },
+ {
+ "epoch": 1.4335697399527187,
+ "grad_norm": 2.839646339416504,
+ "learning_rate": 4.360225595347939e-06,
+ "loss": 0.5868,
+ "step": 3032
+ },
+ {
+ "epoch": 1.4340425531914893,
+ "grad_norm": 2.5043296813964844,
+ "learning_rate": 4.359808771657501e-06,
+ "loss": 0.4951,
+ "step": 3033
+ },
+ {
+ "epoch": 1.4345153664302601,
+ "grad_norm": 2.9082300662994385,
+ "learning_rate": 4.359391832164927e-06,
+ "loss": 0.5259,
+ "step": 3034
+ },
+ {
+ "epoch": 1.4349881796690307,
+ "grad_norm": 2.6651999950408936,
+ "learning_rate": 4.3589747768961745e-06,
+ "loss": 0.537,
+ "step": 3035
+ },
+ {
+ "epoch": 1.4354609929078015,
+ "grad_norm": 2.577077865600586,
+ "learning_rate": 4.358557605877216e-06,
+ "loss": 0.5186,
+ "step": 3036
+ },
+ {
+ "epoch": 1.435933806146572,
+ "grad_norm": 2.7445287704467773,
+ "learning_rate": 4.3581403191340236e-06,
+ "loss": 0.5573,
+ "step": 3037
+ },
+ {
+ "epoch": 1.4364066193853429,
+ "grad_norm": 2.502086639404297,
+ "learning_rate": 4.357722916692582e-06,
+ "loss": 0.5039,
+ "step": 3038
+ },
+ {
+ "epoch": 1.4368794326241134,
+ "grad_norm": 2.4476163387298584,
+ "learning_rate": 4.357305398578879e-06,
+ "loss": 0.5638,
+ "step": 3039
+ },
+ {
+ "epoch": 1.4373522458628842,
+ "grad_norm": 2.7705588340759277,
+ "learning_rate": 4.356887764818915e-06,
+ "loss": 0.5485,
+ "step": 3040
+ },
+ {
+ "epoch": 1.4378250591016548,
+ "grad_norm": 2.498225450515747,
+ "learning_rate": 4.356470015438691e-06,
+ "loss": 0.5486,
+ "step": 3041
+ },
+ {
+ "epoch": 1.4382978723404256,
+ "grad_norm": 2.394320011138916,
+ "learning_rate": 4.356052150464219e-06,
+ "loss": 0.512,
+ "step": 3042
+ },
+ {
+ "epoch": 1.4387706855791962,
+ "grad_norm": 2.8725767135620117,
+ "learning_rate": 4.3556341699215185e-06,
+ "loss": 0.5202,
+ "step": 3043
+ },
+ {
+ "epoch": 1.439243498817967,
+ "grad_norm": 3.1707918643951416,
+ "learning_rate": 4.355216073836615e-06,
+ "loss": 0.5229,
+ "step": 3044
+ },
+ {
+ "epoch": 1.4397163120567376,
+ "grad_norm": 2.532578468322754,
+ "learning_rate": 4.3547978622355415e-06,
+ "loss": 0.4569,
+ "step": 3045
+ },
+ {
+ "epoch": 1.4401891252955084,
+ "grad_norm": 3.0111029148101807,
+ "learning_rate": 4.354379535144338e-06,
+ "loss": 0.5801,
+ "step": 3046
+ },
+ {
+ "epoch": 1.440661938534279,
+ "grad_norm": 2.9554224014282227,
+ "learning_rate": 4.353961092589052e-06,
+ "loss": 0.5968,
+ "step": 3047
+ },
+ {
+ "epoch": 1.4411347517730497,
+ "grad_norm": 2.7562637329101562,
+ "learning_rate": 4.353542534595738e-06,
+ "loss": 0.5005,
+ "step": 3048
+ },
+ {
+ "epoch": 1.4416075650118203,
+ "grad_norm": 3.083254337310791,
+ "learning_rate": 4.3531238611904595e-06,
+ "loss": 0.5389,
+ "step": 3049
+ },
+ {
+ "epoch": 1.442080378250591,
+ "grad_norm": 2.7778005599975586,
+ "learning_rate": 4.352705072399282e-06,
+ "loss": 0.5342,
+ "step": 3050
+ },
+ {
+ "epoch": 1.4425531914893617,
+ "grad_norm": 2.6673996448516846,
+ "learning_rate": 4.3522861682482845e-06,
+ "loss": 0.5213,
+ "step": 3051
+ },
+ {
+ "epoch": 1.4430260047281322,
+ "grad_norm": 2.637605905532837,
+ "learning_rate": 4.351867148763548e-06,
+ "loss": 0.4893,
+ "step": 3052
+ },
+ {
+ "epoch": 1.443498817966903,
+ "grad_norm": 2.834469795227051,
+ "learning_rate": 4.351448013971166e-06,
+ "loss": 0.5391,
+ "step": 3053
+ },
+ {
+ "epoch": 1.4439716312056738,
+ "grad_norm": 2.824153184890747,
+ "learning_rate": 4.351028763897234e-06,
+ "loss": 0.6403,
+ "step": 3054
+ },
+ {
+ "epoch": 1.4444444444444444,
+ "grad_norm": 2.558966875076294,
+ "learning_rate": 4.350609398567857e-06,
+ "loss": 0.4912,
+ "step": 3055
+ },
+ {
+ "epoch": 1.444917257683215,
+ "grad_norm": 2.281726360321045,
+ "learning_rate": 4.3501899180091475e-06,
+ "loss": 0.4655,
+ "step": 3056
+ },
+ {
+ "epoch": 1.4453900709219858,
+ "grad_norm": 2.499472141265869,
+ "learning_rate": 4.349770322247225e-06,
+ "loss": 0.4878,
+ "step": 3057
+ },
+ {
+ "epoch": 1.4458628841607566,
+ "grad_norm": 2.578615188598633,
+ "learning_rate": 4.349350611308215e-06,
+ "loss": 0.4855,
+ "step": 3058
+ },
+ {
+ "epoch": 1.4463356973995272,
+ "grad_norm": 2.7111165523529053,
+ "learning_rate": 4.348930785218252e-06,
+ "loss": 0.5415,
+ "step": 3059
+ },
+ {
+ "epoch": 1.4468085106382977,
+ "grad_norm": 2.8081610202789307,
+ "learning_rate": 4.348510844003476e-06,
+ "loss": 0.4881,
+ "step": 3060
+ },
+ {
+ "epoch": 1.4472813238770685,
+ "grad_norm": 2.9439868927001953,
+ "learning_rate": 4.348090787690036e-06,
+ "loss": 0.5485,
+ "step": 3061
+ },
+ {
+ "epoch": 1.4477541371158393,
+ "grad_norm": 2.592532157897949,
+ "learning_rate": 4.347670616304085e-06,
+ "loss": 0.4912,
+ "step": 3062
+ },
+ {
+ "epoch": 1.44822695035461,
+ "grad_norm": 2.960592746734619,
+ "learning_rate": 4.347250329871787e-06,
+ "loss": 0.5473,
+ "step": 3063
+ },
+ {
+ "epoch": 1.4486997635933805,
+ "grad_norm": 2.5786688327789307,
+ "learning_rate": 4.3468299284193116e-06,
+ "loss": 0.5348,
+ "step": 3064
+ },
+ {
+ "epoch": 1.4491725768321513,
+ "grad_norm": 2.6084046363830566,
+ "learning_rate": 4.346409411972834e-06,
+ "loss": 0.527,
+ "step": 3065
+ },
+ {
+ "epoch": 1.449645390070922,
+ "grad_norm": 2.489748239517212,
+ "learning_rate": 4.3459887805585385e-06,
+ "loss": 0.4943,
+ "step": 3066
+ },
+ {
+ "epoch": 1.4501182033096927,
+ "grad_norm": 2.452131986618042,
+ "learning_rate": 4.345568034202617e-06,
+ "loss": 0.4886,
+ "step": 3067
+ },
+ {
+ "epoch": 1.4505910165484632,
+ "grad_norm": 2.4034671783447266,
+ "learning_rate": 4.345147172931266e-06,
+ "loss": 0.4689,
+ "step": 3068
+ },
+ {
+ "epoch": 1.451063829787234,
+ "grad_norm": 2.6045448780059814,
+ "learning_rate": 4.344726196770691e-06,
+ "loss": 0.5842,
+ "step": 3069
+ },
+ {
+ "epoch": 1.4515366430260048,
+ "grad_norm": 2.697593927383423,
+ "learning_rate": 4.3443051057471045e-06,
+ "loss": 0.5358,
+ "step": 3070
+ },
+ {
+ "epoch": 1.4520094562647754,
+ "grad_norm": 2.6080820560455322,
+ "learning_rate": 4.343883899886727e-06,
+ "loss": 0.5361,
+ "step": 3071
+ },
+ {
+ "epoch": 1.452482269503546,
+ "grad_norm": 2.4605307579040527,
+ "learning_rate": 4.343462579215783e-06,
+ "loss": 0.4941,
+ "step": 3072
+ },
+ {
+ "epoch": 1.4529550827423168,
+ "grad_norm": 2.8025355339050293,
+ "learning_rate": 4.343041143760509e-06,
+ "loss": 0.5116,
+ "step": 3073
+ },
+ {
+ "epoch": 1.4534278959810876,
+ "grad_norm": 2.432515859603882,
+ "learning_rate": 4.3426195935471434e-06,
+ "loss": 0.4991,
+ "step": 3074
+ },
+ {
+ "epoch": 1.4539007092198581,
+ "grad_norm": 2.5838661193847656,
+ "learning_rate": 4.342197928601935e-06,
+ "loss": 0.4994,
+ "step": 3075
+ },
+ {
+ "epoch": 1.4543735224586287,
+ "grad_norm": 2.421692371368408,
+ "learning_rate": 4.341776148951141e-06,
+ "loss": 0.4945,
+ "step": 3076
+ },
+ {
+ "epoch": 1.4548463356973995,
+ "grad_norm": 2.5354676246643066,
+ "learning_rate": 4.341354254621021e-06,
+ "loss": 0.4859,
+ "step": 3077
+ },
+ {
+ "epoch": 1.4553191489361703,
+ "grad_norm": 2.7316789627075195,
+ "learning_rate": 4.340932245637846e-06,
+ "loss": 0.5136,
+ "step": 3078
+ },
+ {
+ "epoch": 1.455791962174941,
+ "grad_norm": 3.5903496742248535,
+ "learning_rate": 4.340510122027891e-06,
+ "loss": 0.6451,
+ "step": 3079
+ },
+ {
+ "epoch": 1.4562647754137115,
+ "grad_norm": 2.95190167427063,
+ "learning_rate": 4.340087883817442e-06,
+ "loss": 0.6354,
+ "step": 3080
+ },
+ {
+ "epoch": 1.4567375886524823,
+ "grad_norm": 2.8659214973449707,
+ "learning_rate": 4.339665531032789e-06,
+ "loss": 0.5514,
+ "step": 3081
+ },
+ {
+ "epoch": 1.457210401891253,
+ "grad_norm": 2.5681674480438232,
+ "learning_rate": 4.339243063700231e-06,
+ "loss": 0.5135,
+ "step": 3082
+ },
+ {
+ "epoch": 1.4576832151300236,
+ "grad_norm": 2.7353906631469727,
+ "learning_rate": 4.338820481846072e-06,
+ "loss": 0.4608,
+ "step": 3083
+ },
+ {
+ "epoch": 1.4581560283687942,
+ "grad_norm": 2.6116466522216797,
+ "learning_rate": 4.3383977854966245e-06,
+ "loss": 0.4924,
+ "step": 3084
+ },
+ {
+ "epoch": 1.458628841607565,
+ "grad_norm": 2.6676487922668457,
+ "learning_rate": 4.337974974678207e-06,
+ "loss": 0.5747,
+ "step": 3085
+ },
+ {
+ "epoch": 1.4591016548463358,
+ "grad_norm": 2.909031629562378,
+ "learning_rate": 4.337552049417147e-06,
+ "loss": 0.4618,
+ "step": 3086
+ },
+ {
+ "epoch": 1.4595744680851064,
+ "grad_norm": 2.7614190578460693,
+ "learning_rate": 4.33712900973978e-06,
+ "loss": 0.5154,
+ "step": 3087
+ },
+ {
+ "epoch": 1.460047281323877,
+ "grad_norm": 2.452188014984131,
+ "learning_rate": 4.336705855672444e-06,
+ "loss": 0.542,
+ "step": 3088
+ },
+ {
+ "epoch": 1.4605200945626478,
+ "grad_norm": 3.0004117488861084,
+ "learning_rate": 4.336282587241488e-06,
+ "loss": 0.5857,
+ "step": 3089
+ },
+ {
+ "epoch": 1.4609929078014185,
+ "grad_norm": 2.870783567428589,
+ "learning_rate": 4.335859204473268e-06,
+ "loss": 0.5506,
+ "step": 3090
+ },
+ {
+ "epoch": 1.4614657210401891,
+ "grad_norm": 3.1078689098358154,
+ "learning_rate": 4.335435707394145e-06,
+ "loss": 0.5138,
+ "step": 3091
+ },
+ {
+ "epoch": 1.4619385342789597,
+ "grad_norm": 2.8516197204589844,
+ "learning_rate": 4.335012096030488e-06,
+ "loss": 0.5842,
+ "step": 3092
+ },
+ {
+ "epoch": 1.4624113475177305,
+ "grad_norm": 2.615922212600708,
+ "learning_rate": 4.334588370408675e-06,
+ "loss": 0.4896,
+ "step": 3093
+ },
+ {
+ "epoch": 1.4628841607565013,
+ "grad_norm": 3.1911802291870117,
+ "learning_rate": 4.334164530555088e-06,
+ "loss": 0.4974,
+ "step": 3094
+ },
+ {
+ "epoch": 1.4633569739952719,
+ "grad_norm": 3.075051784515381,
+ "learning_rate": 4.3337405764961186e-06,
+ "loss": 0.567,
+ "step": 3095
+ },
+ {
+ "epoch": 1.4638297872340424,
+ "grad_norm": 2.550625801086426,
+ "learning_rate": 4.333316508258163e-06,
+ "loss": 0.4887,
+ "step": 3096
+ },
+ {
+ "epoch": 1.4643026004728132,
+ "grad_norm": 2.3986475467681885,
+ "learning_rate": 4.332892325867629e-06,
+ "loss": 0.5047,
+ "step": 3097
+ },
+ {
+ "epoch": 1.464775413711584,
+ "grad_norm": 2.5045125484466553,
+ "learning_rate": 4.332468029350926e-06,
+ "loss": 0.4721,
+ "step": 3098
+ },
+ {
+ "epoch": 1.4652482269503546,
+ "grad_norm": 2.347365617752075,
+ "learning_rate": 4.332043618734474e-06,
+ "loss": 0.4913,
+ "step": 3099
+ },
+ {
+ "epoch": 1.4657210401891252,
+ "grad_norm": 2.459928512573242,
+ "learning_rate": 4.331619094044699e-06,
+ "loss": 0.523,
+ "step": 3100
+ },
+ {
+ "epoch": 1.466193853427896,
+ "grad_norm": 2.5771310329437256,
+ "learning_rate": 4.331194455308035e-06,
+ "loss": 0.593,
+ "step": 3101
+ },
+ {
+ "epoch": 1.4666666666666668,
+ "grad_norm": 3.1351823806762695,
+ "learning_rate": 4.330769702550921e-06,
+ "loss": 0.5852,
+ "step": 3102
+ },
+ {
+ "epoch": 1.4671394799054374,
+ "grad_norm": 2.589817523956299,
+ "learning_rate": 4.330344835799806e-06,
+ "loss": 0.508,
+ "step": 3103
+ },
+ {
+ "epoch": 1.467612293144208,
+ "grad_norm": 3.1140341758728027,
+ "learning_rate": 4.329919855081144e-06,
+ "loss": 0.469,
+ "step": 3104
+ },
+ {
+ "epoch": 1.4680851063829787,
+ "grad_norm": 2.8186635971069336,
+ "learning_rate": 4.329494760421396e-06,
+ "loss": 0.5088,
+ "step": 3105
+ },
+ {
+ "epoch": 1.4685579196217495,
+ "grad_norm": 2.676077365875244,
+ "learning_rate": 4.329069551847031e-06,
+ "loss": 0.52,
+ "step": 3106
+ },
+ {
+ "epoch": 1.46903073286052,
+ "grad_norm": 2.5543313026428223,
+ "learning_rate": 4.328644229384526e-06,
+ "loss": 0.5066,
+ "step": 3107
+ },
+ {
+ "epoch": 1.4695035460992907,
+ "grad_norm": 2.8176217079162598,
+ "learning_rate": 4.328218793060362e-06,
+ "loss": 0.6404,
+ "step": 3108
+ },
+ {
+ "epoch": 1.4699763593380615,
+ "grad_norm": 2.485217332839966,
+ "learning_rate": 4.3277932429010314e-06,
+ "loss": 0.4578,
+ "step": 3109
+ },
+ {
+ "epoch": 1.4704491725768323,
+ "grad_norm": 2.6741621494293213,
+ "learning_rate": 4.327367578933031e-06,
+ "loss": 0.5068,
+ "step": 3110
+ },
+ {
+ "epoch": 1.4709219858156029,
+ "grad_norm": 2.377242088317871,
+ "learning_rate": 4.326941801182863e-06,
+ "loss": 0.5249,
+ "step": 3111
+ },
+ {
+ "epoch": 1.4713947990543734,
+ "grad_norm": 2.790046215057373,
+ "learning_rate": 4.32651590967704e-06,
+ "loss": 0.5532,
+ "step": 3112
+ },
+ {
+ "epoch": 1.4718676122931442,
+ "grad_norm": 2.78019642829895,
+ "learning_rate": 4.326089904442081e-06,
+ "loss": 0.5362,
+ "step": 3113
+ },
+ {
+ "epoch": 1.472340425531915,
+ "grad_norm": 2.5661380290985107,
+ "learning_rate": 4.32566378550451e-06,
+ "loss": 0.5041,
+ "step": 3114
+ },
+ {
+ "epoch": 1.4728132387706856,
+ "grad_norm": 2.522153615951538,
+ "learning_rate": 4.3252375528908605e-06,
+ "loss": 0.5074,
+ "step": 3115
+ },
+ {
+ "epoch": 1.4732860520094562,
+ "grad_norm": 2.874688148498535,
+ "learning_rate": 4.3248112066276725e-06,
+ "loss": 0.59,
+ "step": 3116
+ },
+ {
+ "epoch": 1.473758865248227,
+ "grad_norm": 3.067866802215576,
+ "learning_rate": 4.324384746741492e-06,
+ "loss": 0.5924,
+ "step": 3117
+ },
+ {
+ "epoch": 1.4742316784869978,
+ "grad_norm": 3.359463930130005,
+ "learning_rate": 4.323958173258873e-06,
+ "loss": 0.6346,
+ "step": 3118
+ },
+ {
+ "epoch": 1.4747044917257683,
+ "grad_norm": 2.193024158477783,
+ "learning_rate": 4.323531486206376e-06,
+ "loss": 0.4594,
+ "step": 3119
+ },
+ {
+ "epoch": 1.475177304964539,
+ "grad_norm": 2.886889934539795,
+ "learning_rate": 4.323104685610569e-06,
+ "loss": 0.523,
+ "step": 3120
+ },
+ {
+ "epoch": 1.4756501182033097,
+ "grad_norm": 2.7558681964874268,
+ "learning_rate": 4.322677771498028e-06,
+ "loss": 0.5387,
+ "step": 3121
+ },
+ {
+ "epoch": 1.4761229314420805,
+ "grad_norm": 2.639277935028076,
+ "learning_rate": 4.322250743895335e-06,
+ "loss": 0.5599,
+ "step": 3122
+ },
+ {
+ "epoch": 1.476595744680851,
+ "grad_norm": 2.786198616027832,
+ "learning_rate": 4.321823602829078e-06,
+ "loss": 0.5405,
+ "step": 3123
+ },
+ {
+ "epoch": 1.4770685579196217,
+ "grad_norm": 2.582315683364868,
+ "learning_rate": 4.321396348325853e-06,
+ "loss": 0.4452,
+ "step": 3124
+ },
+ {
+ "epoch": 1.4775413711583925,
+ "grad_norm": 2.8574297428131104,
+ "learning_rate": 4.320968980412265e-06,
+ "loss": 0.4846,
+ "step": 3125
+ },
+ {
+ "epoch": 1.4780141843971633,
+ "grad_norm": 2.705281972885132,
+ "learning_rate": 4.320541499114922e-06,
+ "loss": 0.5548,
+ "step": 3126
+ },
+ {
+ "epoch": 1.4784869976359338,
+ "grad_norm": 2.3152754306793213,
+ "learning_rate": 4.320113904460444e-06,
+ "loss": 0.5216,
+ "step": 3127
+ },
+ {
+ "epoch": 1.4789598108747044,
+ "grad_norm": 3.230764150619507,
+ "learning_rate": 4.319686196475453e-06,
+ "loss": 0.6192,
+ "step": 3128
+ },
+ {
+ "epoch": 1.4794326241134752,
+ "grad_norm": 2.463380813598633,
+ "learning_rate": 4.319258375186583e-06,
+ "loss": 0.4872,
+ "step": 3129
+ },
+ {
+ "epoch": 1.479905437352246,
+ "grad_norm": 2.8477656841278076,
+ "learning_rate": 4.31883044062047e-06,
+ "loss": 0.5371,
+ "step": 3130
+ },
+ {
+ "epoch": 1.4803782505910166,
+ "grad_norm": 2.393911123275757,
+ "learning_rate": 4.318402392803762e-06,
+ "loss": 0.5334,
+ "step": 3131
+ },
+ {
+ "epoch": 1.4808510638297872,
+ "grad_norm": 2.6113736629486084,
+ "learning_rate": 4.317974231763109e-06,
+ "loss": 0.5572,
+ "step": 3132
+ },
+ {
+ "epoch": 1.481323877068558,
+ "grad_norm": 2.3941731452941895,
+ "learning_rate": 4.317545957525173e-06,
+ "loss": 0.4849,
+ "step": 3133
+ },
+ {
+ "epoch": 1.4817966903073285,
+ "grad_norm": 2.9536755084991455,
+ "learning_rate": 4.317117570116619e-06,
+ "loss": 0.6058,
+ "step": 3134
+ },
+ {
+ "epoch": 1.4822695035460993,
+ "grad_norm": 2.595754623413086,
+ "learning_rate": 4.316689069564123e-06,
+ "loss": 0.5193,
+ "step": 3135
+ },
+ {
+ "epoch": 1.48274231678487,
+ "grad_norm": 2.569833993911743,
+ "learning_rate": 4.316260455894364e-06,
+ "loss": 0.543,
+ "step": 3136
+ },
+ {
+ "epoch": 1.4832151300236407,
+ "grad_norm": 2.5137455463409424,
+ "learning_rate": 4.315831729134031e-06,
+ "loss": 0.5415,
+ "step": 3137
+ },
+ {
+ "epoch": 1.4836879432624113,
+ "grad_norm": 2.5582292079925537,
+ "learning_rate": 4.3154028893098176e-06,
+ "loss": 0.5338,
+ "step": 3138
+ },
+ {
+ "epoch": 1.484160756501182,
+ "grad_norm": 2.666426181793213,
+ "learning_rate": 4.3149739364484265e-06,
+ "loss": 0.5435,
+ "step": 3139
+ },
+ {
+ "epoch": 1.4846335697399526,
+ "grad_norm": 2.790851354598999,
+ "learning_rate": 4.314544870576568e-06,
+ "loss": 0.5746,
+ "step": 3140
+ },
+ {
+ "epoch": 1.4851063829787234,
+ "grad_norm": 2.620326042175293,
+ "learning_rate": 4.314115691720956e-06,
+ "loss": 0.5076,
+ "step": 3141
+ },
+ {
+ "epoch": 1.485579196217494,
+ "grad_norm": 3.075674533843994,
+ "learning_rate": 4.313686399908314e-06,
+ "loss": 0.5486,
+ "step": 3142
+ },
+ {
+ "epoch": 1.4860520094562648,
+ "grad_norm": 3.1347315311431885,
+ "learning_rate": 4.3132569951653745e-06,
+ "loss": 0.531,
+ "step": 3143
+ },
+ {
+ "epoch": 1.4865248226950354,
+ "grad_norm": 2.5783653259277344,
+ "learning_rate": 4.312827477518871e-06,
+ "loss": 0.5818,
+ "step": 3144
+ },
+ {
+ "epoch": 1.4869976359338062,
+ "grad_norm": 3.0247137546539307,
+ "learning_rate": 4.3123978469955505e-06,
+ "loss": 0.5347,
+ "step": 3145
+ },
+ {
+ "epoch": 1.4874704491725768,
+ "grad_norm": 2.4789345264434814,
+ "learning_rate": 4.311968103622163e-06,
+ "loss": 0.5,
+ "step": 3146
+ },
+ {
+ "epoch": 1.4879432624113476,
+ "grad_norm": 2.663341522216797,
+ "learning_rate": 4.311538247425466e-06,
+ "loss": 0.4825,
+ "step": 3147
+ },
+ {
+ "epoch": 1.4884160756501181,
+ "grad_norm": 2.633711099624634,
+ "learning_rate": 4.311108278432226e-06,
+ "loss": 0.5244,
+ "step": 3148
+ },
+ {
+ "epoch": 1.488888888888889,
+ "grad_norm": 2.51312518119812,
+ "learning_rate": 4.310678196669216e-06,
+ "loss": 0.513,
+ "step": 3149
+ },
+ {
+ "epoch": 1.4893617021276595,
+ "grad_norm": 2.5263755321502686,
+ "learning_rate": 4.310248002163214e-06,
+ "loss": 0.5236,
+ "step": 3150
+ },
+ {
+ "epoch": 1.4898345153664303,
+ "grad_norm": 2.559216260910034,
+ "learning_rate": 4.309817694941007e-06,
+ "loss": 0.5107,
+ "step": 3151
+ },
+ {
+ "epoch": 1.4903073286052009,
+ "grad_norm": 2.5023303031921387,
+ "learning_rate": 4.309387275029386e-06,
+ "loss": 0.4685,
+ "step": 3152
+ },
+ {
+ "epoch": 1.4907801418439717,
+ "grad_norm": 3.0314254760742188,
+ "learning_rate": 4.308956742455155e-06,
+ "loss": 0.5462,
+ "step": 3153
+ },
+ {
+ "epoch": 1.4912529550827422,
+ "grad_norm": 2.675295114517212,
+ "learning_rate": 4.308526097245119e-06,
+ "loss": 0.5398,
+ "step": 3154
+ },
+ {
+ "epoch": 1.491725768321513,
+ "grad_norm": 2.6613399982452393,
+ "learning_rate": 4.308095339426094e-06,
+ "loss": 0.5376,
+ "step": 3155
+ },
+ {
+ "epoch": 1.4921985815602836,
+ "grad_norm": 2.58937668800354,
+ "learning_rate": 4.307664469024899e-06,
+ "loss": 0.5385,
+ "step": 3156
+ },
+ {
+ "epoch": 1.4926713947990544,
+ "grad_norm": 2.583631992340088,
+ "learning_rate": 4.3072334860683655e-06,
+ "loss": 0.4927,
+ "step": 3157
+ },
+ {
+ "epoch": 1.493144208037825,
+ "grad_norm": 2.5889222621917725,
+ "learning_rate": 4.306802390583327e-06,
+ "loss": 0.47,
+ "step": 3158
+ },
+ {
+ "epoch": 1.4936170212765958,
+ "grad_norm": 2.9362716674804688,
+ "learning_rate": 4.3063711825966244e-06,
+ "loss": 0.4902,
+ "step": 3159
+ },
+ {
+ "epoch": 1.4940898345153664,
+ "grad_norm": 2.5385425090789795,
+ "learning_rate": 4.305939862135111e-06,
+ "loss": 0.5396,
+ "step": 3160
+ },
+ {
+ "epoch": 1.4945626477541372,
+ "grad_norm": 2.776326894760132,
+ "learning_rate": 4.305508429225641e-06,
+ "loss": 0.5169,
+ "step": 3161
+ },
+ {
+ "epoch": 1.4950354609929077,
+ "grad_norm": 2.575063467025757,
+ "learning_rate": 4.305076883895076e-06,
+ "loss": 0.4938,
+ "step": 3162
+ },
+ {
+ "epoch": 1.4955082742316785,
+ "grad_norm": 2.7552313804626465,
+ "learning_rate": 4.304645226170291e-06,
+ "loss": 0.6211,
+ "step": 3163
+ },
+ {
+ "epoch": 1.4959810874704491,
+ "grad_norm": 2.57149338722229,
+ "learning_rate": 4.30421345607816e-06,
+ "loss": 0.5241,
+ "step": 3164
+ },
+ {
+ "epoch": 1.49645390070922,
+ "grad_norm": 2.8142426013946533,
+ "learning_rate": 4.303781573645568e-06,
+ "loss": 0.5699,
+ "step": 3165
+ },
+ {
+ "epoch": 1.4969267139479905,
+ "grad_norm": 2.6344845294952393,
+ "learning_rate": 4.303349578899407e-06,
+ "loss": 0.5049,
+ "step": 3166
+ },
+ {
+ "epoch": 1.4973995271867613,
+ "grad_norm": 2.554410934448242,
+ "learning_rate": 4.302917471866575e-06,
+ "loss": 0.4404,
+ "step": 3167
+ },
+ {
+ "epoch": 1.4978723404255319,
+ "grad_norm": 2.896240711212158,
+ "learning_rate": 4.302485252573978e-06,
+ "loss": 0.602,
+ "step": 3168
+ },
+ {
+ "epoch": 1.4983451536643027,
+ "grad_norm": 2.4044477939605713,
+ "learning_rate": 4.302052921048527e-06,
+ "loss": 0.4857,
+ "step": 3169
+ },
+ {
+ "epoch": 1.4988179669030732,
+ "grad_norm": 2.7447879314422607,
+ "learning_rate": 4.301620477317144e-06,
+ "loss": 0.5438,
+ "step": 3170
+ },
+ {
+ "epoch": 1.499290780141844,
+ "grad_norm": 2.851820945739746,
+ "learning_rate": 4.301187921406752e-06,
+ "loss": 0.5245,
+ "step": 3171
+ },
+ {
+ "epoch": 1.4997635933806146,
+ "grad_norm": 3.247114419937134,
+ "learning_rate": 4.300755253344287e-06,
+ "loss": 0.504,
+ "step": 3172
+ },
+ {
+ "epoch": 1.5002364066193854,
+ "grad_norm": 3.117490291595459,
+ "learning_rate": 4.300322473156688e-06,
+ "loss": 0.4627,
+ "step": 3173
+ },
+ {
+ "epoch": 1.500709219858156,
+ "grad_norm": 2.558319330215454,
+ "learning_rate": 4.299889580870904e-06,
+ "loss": 0.5721,
+ "step": 3174
+ },
+ {
+ "epoch": 1.5011820330969265,
+ "grad_norm": 2.8983113765716553,
+ "learning_rate": 4.2994565765138865e-06,
+ "loss": 0.5257,
+ "step": 3175
+ },
+ {
+ "epoch": 1.5016548463356973,
+ "grad_norm": 2.744056463241577,
+ "learning_rate": 4.299023460112599e-06,
+ "loss": 0.4892,
+ "step": 3176
+ },
+ {
+ "epoch": 1.5021276595744681,
+ "grad_norm": 2.5506751537323,
+ "learning_rate": 4.29859023169401e-06,
+ "loss": 0.4933,
+ "step": 3177
+ },
+ {
+ "epoch": 1.5026004728132387,
+ "grad_norm": 2.842615842819214,
+ "learning_rate": 4.298156891285092e-06,
+ "loss": 0.6124,
+ "step": 3178
+ },
+ {
+ "epoch": 1.5030732860520093,
+ "grad_norm": 2.5355329513549805,
+ "learning_rate": 4.2977234389128305e-06,
+ "loss": 0.641,
+ "step": 3179
+ },
+ {
+ "epoch": 1.50354609929078,
+ "grad_norm": 2.674781084060669,
+ "learning_rate": 4.297289874604213e-06,
+ "loss": 0.475,
+ "step": 3180
+ },
+ {
+ "epoch": 1.5040189125295509,
+ "grad_norm": 2.6845548152923584,
+ "learning_rate": 4.296856198386235e-06,
+ "loss": 0.5328,
+ "step": 3181
+ },
+ {
+ "epoch": 1.5044917257683215,
+ "grad_norm": 2.9686241149902344,
+ "learning_rate": 4.296422410285902e-06,
+ "loss": 0.6216,
+ "step": 3182
+ },
+ {
+ "epoch": 1.504964539007092,
+ "grad_norm": 2.5095980167388916,
+ "learning_rate": 4.295988510330222e-06,
+ "loss": 0.4993,
+ "step": 3183
+ },
+ {
+ "epoch": 1.5054373522458628,
+ "grad_norm": 2.4906392097473145,
+ "learning_rate": 4.2955544985462125e-06,
+ "loss": 0.4795,
+ "step": 3184
+ },
+ {
+ "epoch": 1.5059101654846336,
+ "grad_norm": 2.5593366622924805,
+ "learning_rate": 4.295120374960897e-06,
+ "loss": 0.5527,
+ "step": 3185
+ },
+ {
+ "epoch": 1.5063829787234042,
+ "grad_norm": 2.691495180130005,
+ "learning_rate": 4.294686139601308e-06,
+ "loss": 0.5646,
+ "step": 3186
+ },
+ {
+ "epoch": 1.5068557919621748,
+ "grad_norm": 2.74320387840271,
+ "learning_rate": 4.294251792494483e-06,
+ "loss": 0.6149,
+ "step": 3187
+ },
+ {
+ "epoch": 1.5073286052009456,
+ "grad_norm": 2.8827052116394043,
+ "learning_rate": 4.293817333667465e-06,
+ "loss": 0.5414,
+ "step": 3188
+ },
+ {
+ "epoch": 1.5078014184397164,
+ "grad_norm": 2.5652425289154053,
+ "learning_rate": 4.293382763147308e-06,
+ "loss": 0.5006,
+ "step": 3189
+ },
+ {
+ "epoch": 1.508274231678487,
+ "grad_norm": 2.729295253753662,
+ "learning_rate": 4.29294808096107e-06,
+ "loss": 0.522,
+ "step": 3190
+ },
+ {
+ "epoch": 1.5087470449172575,
+ "grad_norm": 2.348118305206299,
+ "learning_rate": 4.292513287135817e-06,
+ "loss": 0.4125,
+ "step": 3191
+ },
+ {
+ "epoch": 1.5092198581560283,
+ "grad_norm": 2.809551954269409,
+ "learning_rate": 4.292078381698621e-06,
+ "loss": 0.5577,
+ "step": 3192
+ },
+ {
+ "epoch": 1.5096926713947991,
+ "grad_norm": 2.6925361156463623,
+ "learning_rate": 4.291643364676563e-06,
+ "loss": 0.62,
+ "step": 3193
+ },
+ {
+ "epoch": 1.5101654846335697,
+ "grad_norm": 2.4200620651245117,
+ "learning_rate": 4.291208236096729e-06,
+ "loss": 0.5464,
+ "step": 3194
+ },
+ {
+ "epoch": 1.5106382978723403,
+ "grad_norm": 2.5659191608428955,
+ "learning_rate": 4.290772995986211e-06,
+ "loss": 0.5402,
+ "step": 3195
+ },
+ {
+ "epoch": 1.511111111111111,
+ "grad_norm": 2.3877315521240234,
+ "learning_rate": 4.290337644372113e-06,
+ "loss": 0.463,
+ "step": 3196
+ },
+ {
+ "epoch": 1.5115839243498819,
+ "grad_norm": 2.7063233852386475,
+ "learning_rate": 4.289902181281538e-06,
+ "loss": 0.5253,
+ "step": 3197
+ },
+ {
+ "epoch": 1.5120567375886524,
+ "grad_norm": 2.56788969039917,
+ "learning_rate": 4.289466606741603e-06,
+ "loss": 0.5012,
+ "step": 3198
+ },
+ {
+ "epoch": 1.512529550827423,
+ "grad_norm": 2.637164831161499,
+ "learning_rate": 4.28903092077943e-06,
+ "loss": 0.5236,
+ "step": 3199
+ },
+ {
+ "epoch": 1.5130023640661938,
+ "grad_norm": 2.767526865005493,
+ "learning_rate": 4.288595123422146e-06,
+ "loss": 0.5832,
+ "step": 3200
+ },
+ {
+ "epoch": 1.5134751773049646,
+ "grad_norm": 2.33365535736084,
+ "learning_rate": 4.2881592146968866e-06,
+ "loss": 0.4548,
+ "step": 3201
+ },
+ {
+ "epoch": 1.5139479905437352,
+ "grad_norm": 2.544189453125,
+ "learning_rate": 4.287723194630793e-06,
+ "loss": 0.5115,
+ "step": 3202
+ },
+ {
+ "epoch": 1.5144208037825058,
+ "grad_norm": 2.588793992996216,
+ "learning_rate": 4.2872870632510155e-06,
+ "loss": 0.4766,
+ "step": 3203
+ },
+ {
+ "epoch": 1.5148936170212766,
+ "grad_norm": 2.5382184982299805,
+ "learning_rate": 4.286850820584709e-06,
+ "loss": 0.5401,
+ "step": 3204
+ },
+ {
+ "epoch": 1.5153664302600474,
+ "grad_norm": 2.597930669784546,
+ "learning_rate": 4.286414466659038e-06,
+ "loss": 0.5346,
+ "step": 3205
+ },
+ {
+ "epoch": 1.515839243498818,
+ "grad_norm": 2.8522393703460693,
+ "learning_rate": 4.28597800150117e-06,
+ "loss": 0.486,
+ "step": 3206
+ },
+ {
+ "epoch": 1.5163120567375885,
+ "grad_norm": 2.4801454544067383,
+ "learning_rate": 4.285541425138285e-06,
+ "loss": 0.5162,
+ "step": 3207
+ },
+ {
+ "epoch": 1.5167848699763593,
+ "grad_norm": 2.353665351867676,
+ "learning_rate": 4.285104737597563e-06,
+ "loss": 0.5066,
+ "step": 3208
+ },
+ {
+ "epoch": 1.51725768321513,
+ "grad_norm": 2.767976760864258,
+ "learning_rate": 4.2846679389061975e-06,
+ "loss": 0.5331,
+ "step": 3209
+ },
+ {
+ "epoch": 1.5177304964539007,
+ "grad_norm": 2.9307682514190674,
+ "learning_rate": 4.284231029091385e-06,
+ "loss": 0.5291,
+ "step": 3210
+ },
+ {
+ "epoch": 1.5182033096926713,
+ "grad_norm": 2.39719820022583,
+ "learning_rate": 4.283794008180329e-06,
+ "loss": 0.4759,
+ "step": 3211
+ },
+ {
+ "epoch": 1.518676122931442,
+ "grad_norm": 2.452244758605957,
+ "learning_rate": 4.283356876200242e-06,
+ "loss": 0.4283,
+ "step": 3212
+ },
+ {
+ "epoch": 1.5191489361702128,
+ "grad_norm": 2.4911608695983887,
+ "learning_rate": 4.282919633178343e-06,
+ "loss": 0.4812,
+ "step": 3213
+ },
+ {
+ "epoch": 1.5196217494089834,
+ "grad_norm": 2.5813944339752197,
+ "learning_rate": 4.282482279141856e-06,
+ "loss": 0.4911,
+ "step": 3214
+ },
+ {
+ "epoch": 1.520094562647754,
+ "grad_norm": 2.503542184829712,
+ "learning_rate": 4.282044814118013e-06,
+ "loss": 0.4969,
+ "step": 3215
+ },
+ {
+ "epoch": 1.5205673758865248,
+ "grad_norm": 2.5090713500976562,
+ "learning_rate": 4.281607238134053e-06,
+ "loss": 0.5293,
+ "step": 3216
+ },
+ {
+ "epoch": 1.5210401891252956,
+ "grad_norm": 2.425994396209717,
+ "learning_rate": 4.281169551217223e-06,
+ "loss": 0.5365,
+ "step": 3217
+ },
+ {
+ "epoch": 1.5215130023640662,
+ "grad_norm": 2.637655258178711,
+ "learning_rate": 4.2807317533947765e-06,
+ "loss": 0.5589,
+ "step": 3218
+ },
+ {
+ "epoch": 1.5219858156028367,
+ "grad_norm": 2.9335296154022217,
+ "learning_rate": 4.28029384469397e-06,
+ "loss": 0.6071,
+ "step": 3219
+ },
+ {
+ "epoch": 1.5224586288416075,
+ "grad_norm": 2.898683547973633,
+ "learning_rate": 4.279855825142073e-06,
+ "loss": 0.5392,
+ "step": 3220
+ },
+ {
+ "epoch": 1.5229314420803783,
+ "grad_norm": 2.613914966583252,
+ "learning_rate": 4.279417694766359e-06,
+ "loss": 0.4968,
+ "step": 3221
+ },
+ {
+ "epoch": 1.523404255319149,
+ "grad_norm": 2.500682830810547,
+ "learning_rate": 4.278979453594106e-06,
+ "loss": 0.471,
+ "step": 3222
+ },
+ {
+ "epoch": 1.5238770685579195,
+ "grad_norm": 2.5269598960876465,
+ "learning_rate": 4.278541101652605e-06,
+ "loss": 0.471,
+ "step": 3223
+ },
+ {
+ "epoch": 1.5243498817966903,
+ "grad_norm": 2.8153114318847656,
+ "learning_rate": 4.2781026389691465e-06,
+ "loss": 0.5742,
+ "step": 3224
+ },
+ {
+ "epoch": 1.524822695035461,
+ "grad_norm": 2.5648019313812256,
+ "learning_rate": 4.277664065571034e-06,
+ "loss": 0.5315,
+ "step": 3225
+ },
+ {
+ "epoch": 1.5252955082742317,
+ "grad_norm": 2.778355836868286,
+ "learning_rate": 4.277225381485575e-06,
+ "loss": 0.5543,
+ "step": 3226
+ },
+ {
+ "epoch": 1.5257683215130022,
+ "grad_norm": 2.6736745834350586,
+ "learning_rate": 4.2767865867400846e-06,
+ "loss": 0.4947,
+ "step": 3227
+ },
+ {
+ "epoch": 1.526241134751773,
+ "grad_norm": 2.9560294151306152,
+ "learning_rate": 4.276347681361884e-06,
+ "loss": 0.5835,
+ "step": 3228
+ },
+ {
+ "epoch": 1.5267139479905438,
+ "grad_norm": 2.5580296516418457,
+ "learning_rate": 4.275908665378302e-06,
+ "loss": 0.4751,
+ "step": 3229
+ },
+ {
+ "epoch": 1.5271867612293144,
+ "grad_norm": 3.0705175399780273,
+ "learning_rate": 4.2754695388166755e-06,
+ "loss": 0.5327,
+ "step": 3230
+ },
+ {
+ "epoch": 1.527659574468085,
+ "grad_norm": 2.664652109146118,
+ "learning_rate": 4.275030301704346e-06,
+ "loss": 0.4934,
+ "step": 3231
+ },
+ {
+ "epoch": 1.5281323877068558,
+ "grad_norm": 2.308499813079834,
+ "learning_rate": 4.274590954068663e-06,
+ "loss": 0.4412,
+ "step": 3232
+ },
+ {
+ "epoch": 1.5286052009456266,
+ "grad_norm": 2.871189594268799,
+ "learning_rate": 4.2741514959369815e-06,
+ "loss": 0.5001,
+ "step": 3233
+ },
+ {
+ "epoch": 1.5290780141843971,
+ "grad_norm": 2.5274453163146973,
+ "learning_rate": 4.273711927336666e-06,
+ "loss": 0.4938,
+ "step": 3234
+ },
+ {
+ "epoch": 1.5295508274231677,
+ "grad_norm": 2.8848133087158203,
+ "learning_rate": 4.273272248295087e-06,
+ "loss": 0.5397,
+ "step": 3235
+ },
+ {
+ "epoch": 1.5300236406619385,
+ "grad_norm": 2.3927090167999268,
+ "learning_rate": 4.27283245883962e-06,
+ "loss": 0.5497,
+ "step": 3236
+ },
+ {
+ "epoch": 1.5304964539007093,
+ "grad_norm": 2.5413873195648193,
+ "learning_rate": 4.27239255899765e-06,
+ "loss": 0.5108,
+ "step": 3237
+ },
+ {
+ "epoch": 1.53096926713948,
+ "grad_norm": 2.7692389488220215,
+ "learning_rate": 4.271952548796567e-06,
+ "loss": 0.5768,
+ "step": 3238
+ },
+ {
+ "epoch": 1.5314420803782505,
+ "grad_norm": 2.4621126651763916,
+ "learning_rate": 4.271512428263768e-06,
+ "loss": 0.4698,
+ "step": 3239
+ },
+ {
+ "epoch": 1.5319148936170213,
+ "grad_norm": 2.6423375606536865,
+ "learning_rate": 4.271072197426659e-06,
+ "loss": 0.4929,
+ "step": 3240
+ },
+ {
+ "epoch": 1.532387706855792,
+ "grad_norm": 2.7097692489624023,
+ "learning_rate": 4.270631856312649e-06,
+ "loss": 0.4836,
+ "step": 3241
+ },
+ {
+ "epoch": 1.5328605200945626,
+ "grad_norm": 2.545706272125244,
+ "learning_rate": 4.270191404949158e-06,
+ "loss": 0.4636,
+ "step": 3242
+ },
+ {
+ "epoch": 1.5333333333333332,
+ "grad_norm": 3.138781785964966,
+ "learning_rate": 4.26975084336361e-06,
+ "loss": 0.5988,
+ "step": 3243
+ },
+ {
+ "epoch": 1.533806146572104,
+ "grad_norm": 2.492715835571289,
+ "learning_rate": 4.269310171583438e-06,
+ "loss": 0.5095,
+ "step": 3244
+ },
+ {
+ "epoch": 1.5342789598108748,
+ "grad_norm": 2.5705838203430176,
+ "learning_rate": 4.268869389636077e-06,
+ "loss": 0.4818,
+ "step": 3245
+ },
+ {
+ "epoch": 1.5347517730496454,
+ "grad_norm": 2.7633554935455322,
+ "learning_rate": 4.268428497548979e-06,
+ "loss": 0.547,
+ "step": 3246
+ },
+ {
+ "epoch": 1.535224586288416,
+ "grad_norm": 2.654528856277466,
+ "learning_rate": 4.2679874953495905e-06,
+ "loss": 0.5261,
+ "step": 3247
+ },
+ {
+ "epoch": 1.5356973995271868,
+ "grad_norm": 2.5039751529693604,
+ "learning_rate": 4.2675463830653744e-06,
+ "loss": 0.4941,
+ "step": 3248
+ },
+ {
+ "epoch": 1.5361702127659576,
+ "grad_norm": 2.897268295288086,
+ "learning_rate": 4.267105160723794e-06,
+ "loss": 0.5404,
+ "step": 3249
+ },
+ {
+ "epoch": 1.5366430260047281,
+ "grad_norm": 2.500732421875,
+ "learning_rate": 4.266663828352324e-06,
+ "loss": 0.5375,
+ "step": 3250
+ },
+ {
+ "epoch": 1.5371158392434987,
+ "grad_norm": 2.6310064792633057,
+ "learning_rate": 4.266222385978444e-06,
+ "loss": 0.5217,
+ "step": 3251
+ },
+ {
+ "epoch": 1.5375886524822695,
+ "grad_norm": 2.7440476417541504,
+ "learning_rate": 4.265780833629642e-06,
+ "loss": 0.5419,
+ "step": 3252
+ },
+ {
+ "epoch": 1.5380614657210403,
+ "grad_norm": 2.7037577629089355,
+ "learning_rate": 4.2653391713334095e-06,
+ "loss": 0.5634,
+ "step": 3253
+ },
+ {
+ "epoch": 1.5385342789598109,
+ "grad_norm": 2.548525810241699,
+ "learning_rate": 4.264897399117248e-06,
+ "loss": 0.535,
+ "step": 3254
+ },
+ {
+ "epoch": 1.5390070921985815,
+ "grad_norm": 2.6127355098724365,
+ "learning_rate": 4.264455517008663e-06,
+ "loss": 0.4619,
+ "step": 3255
+ },
+ {
+ "epoch": 1.5394799054373522,
+ "grad_norm": 2.5597004890441895,
+ "learning_rate": 4.264013525035171e-06,
+ "loss": 0.4477,
+ "step": 3256
+ },
+ {
+ "epoch": 1.539952718676123,
+ "grad_norm": 2.642432689666748,
+ "learning_rate": 4.263571423224292e-06,
+ "loss": 0.4749,
+ "step": 3257
+ },
+ {
+ "epoch": 1.5404255319148936,
+ "grad_norm": 2.5121877193450928,
+ "learning_rate": 4.2631292116035526e-06,
+ "loss": 0.4693,
+ "step": 3258
+ },
+ {
+ "epoch": 1.5408983451536642,
+ "grad_norm": 2.390292167663574,
+ "learning_rate": 4.262686890200489e-06,
+ "loss": 0.4872,
+ "step": 3259
+ },
+ {
+ "epoch": 1.541371158392435,
+ "grad_norm": 2.5898337364196777,
+ "learning_rate": 4.2622444590426405e-06,
+ "loss": 0.5193,
+ "step": 3260
+ },
+ {
+ "epoch": 1.5418439716312058,
+ "grad_norm": 2.508821487426758,
+ "learning_rate": 4.261801918157558e-06,
+ "loss": 0.511,
+ "step": 3261
+ },
+ {
+ "epoch": 1.5423167848699764,
+ "grad_norm": 2.6992101669311523,
+ "learning_rate": 4.261359267572795e-06,
+ "loss": 0.5069,
+ "step": 3262
+ },
+ {
+ "epoch": 1.542789598108747,
+ "grad_norm": 2.6011030673980713,
+ "learning_rate": 4.2609165073159145e-06,
+ "loss": 0.5887,
+ "step": 3263
+ },
+ {
+ "epoch": 1.5432624113475177,
+ "grad_norm": 2.887053966522217,
+ "learning_rate": 4.260473637414483e-06,
+ "loss": 0.5556,
+ "step": 3264
+ },
+ {
+ "epoch": 1.5437352245862885,
+ "grad_norm": 2.6433887481689453,
+ "learning_rate": 4.260030657896079e-06,
+ "loss": 0.4728,
+ "step": 3265
+ },
+ {
+ "epoch": 1.544208037825059,
+ "grad_norm": 2.6134607791900635,
+ "learning_rate": 4.259587568788282e-06,
+ "loss": 0.483,
+ "step": 3266
+ },
+ {
+ "epoch": 1.5446808510638297,
+ "grad_norm": 2.5308640003204346,
+ "learning_rate": 4.259144370118684e-06,
+ "loss": 0.5115,
+ "step": 3267
+ },
+ {
+ "epoch": 1.5451536643026005,
+ "grad_norm": 2.8256733417510986,
+ "learning_rate": 4.258701061914879e-06,
+ "loss": 0.5414,
+ "step": 3268
+ },
+ {
+ "epoch": 1.5456264775413713,
+ "grad_norm": 2.8648319244384766,
+ "learning_rate": 4.258257644204471e-06,
+ "loss": 0.5695,
+ "step": 3269
+ },
+ {
+ "epoch": 1.5460992907801419,
+ "grad_norm": 2.8568081855773926,
+ "learning_rate": 4.257814117015069e-06,
+ "loss": 0.5264,
+ "step": 3270
+ },
+ {
+ "epoch": 1.5465721040189124,
+ "grad_norm": 2.6065011024475098,
+ "learning_rate": 4.257370480374289e-06,
+ "loss": 0.5646,
+ "step": 3271
+ },
+ {
+ "epoch": 1.5470449172576832,
+ "grad_norm": 2.7840216159820557,
+ "learning_rate": 4.256926734309756e-06,
+ "loss": 0.5191,
+ "step": 3272
+ },
+ {
+ "epoch": 1.547517730496454,
+ "grad_norm": 2.85906982421875,
+ "learning_rate": 4.256482878849099e-06,
+ "loss": 0.5911,
+ "step": 3273
+ },
+ {
+ "epoch": 1.5479905437352246,
+ "grad_norm": 2.916029930114746,
+ "learning_rate": 4.256038914019954e-06,
+ "loss": 0.5589,
+ "step": 3274
+ },
+ {
+ "epoch": 1.5484633569739952,
+ "grad_norm": 2.6748716831207275,
+ "learning_rate": 4.255594839849967e-06,
+ "loss": 0.5323,
+ "step": 3275
+ },
+ {
+ "epoch": 1.548936170212766,
+ "grad_norm": 2.717212200164795,
+ "learning_rate": 4.255150656366787e-06,
+ "loss": 0.453,
+ "step": 3276
+ },
+ {
+ "epoch": 1.5494089834515368,
+ "grad_norm": 2.4974849224090576,
+ "learning_rate": 4.254706363598072e-06,
+ "loss": 0.4516,
+ "step": 3277
+ },
+ {
+ "epoch": 1.5498817966903073,
+ "grad_norm": 2.648151397705078,
+ "learning_rate": 4.254261961571485e-06,
+ "loss": 0.5452,
+ "step": 3278
+ },
+ {
+ "epoch": 1.550354609929078,
+ "grad_norm": 2.932905435562134,
+ "learning_rate": 4.253817450314699e-06,
+ "loss": 0.4813,
+ "step": 3279
+ },
+ {
+ "epoch": 1.5508274231678487,
+ "grad_norm": 2.862912178039551,
+ "learning_rate": 4.25337282985539e-06,
+ "loss": 0.5689,
+ "step": 3280
+ },
+ {
+ "epoch": 1.5513002364066195,
+ "grad_norm": 2.532156467437744,
+ "learning_rate": 4.2529281002212436e-06,
+ "loss": 0.485,
+ "step": 3281
+ },
+ {
+ "epoch": 1.55177304964539,
+ "grad_norm": 2.583299160003662,
+ "learning_rate": 4.25248326143995e-06,
+ "loss": 0.4661,
+ "step": 3282
+ },
+ {
+ "epoch": 1.5522458628841607,
+ "grad_norm": 2.5790653228759766,
+ "learning_rate": 4.252038313539209e-06,
+ "loss": 0.5455,
+ "step": 3283
+ },
+ {
+ "epoch": 1.5527186761229315,
+ "grad_norm": 2.872864007949829,
+ "learning_rate": 4.251593256546724e-06,
+ "loss": 0.5317,
+ "step": 3284
+ },
+ {
+ "epoch": 1.5531914893617023,
+ "grad_norm": 3.0382463932037354,
+ "learning_rate": 4.251148090490208e-06,
+ "loss": 0.5131,
+ "step": 3285
+ },
+ {
+ "epoch": 1.5536643026004728,
+ "grad_norm": 2.574399709701538,
+ "learning_rate": 4.250702815397379e-06,
+ "loss": 0.5399,
+ "step": 3286
+ },
+ {
+ "epoch": 1.5541371158392434,
+ "grad_norm": 2.9784770011901855,
+ "learning_rate": 4.250257431295962e-06,
+ "loss": 0.5209,
+ "step": 3287
+ },
+ {
+ "epoch": 1.5546099290780142,
+ "grad_norm": 2.6482062339782715,
+ "learning_rate": 4.249811938213689e-06,
+ "loss": 0.5416,
+ "step": 3288
+ },
+ {
+ "epoch": 1.555082742316785,
+ "grad_norm": 2.82142972946167,
+ "learning_rate": 4.2493663361783e-06,
+ "loss": 0.594,
+ "step": 3289
+ },
+ {
+ "epoch": 1.5555555555555556,
+ "grad_norm": 2.815595865249634,
+ "learning_rate": 4.24892062521754e-06,
+ "loss": 0.5381,
+ "step": 3290
+ },
+ {
+ "epoch": 1.5560283687943262,
+ "grad_norm": 2.689764976501465,
+ "learning_rate": 4.248474805359161e-06,
+ "loss": 0.5141,
+ "step": 3291
+ },
+ {
+ "epoch": 1.556501182033097,
+ "grad_norm": 2.7718515396118164,
+ "learning_rate": 4.248028876630922e-06,
+ "loss": 0.5324,
+ "step": 3292
+ },
+ {
+ "epoch": 1.5569739952718678,
+ "grad_norm": 3.0196774005889893,
+ "learning_rate": 4.247582839060591e-06,
+ "loss": 0.4971,
+ "step": 3293
+ },
+ {
+ "epoch": 1.5574468085106383,
+ "grad_norm": 2.608475923538208,
+ "learning_rate": 4.247136692675939e-06,
+ "loss": 0.5795,
+ "step": 3294
+ },
+ {
+ "epoch": 1.557919621749409,
+ "grad_norm": 2.4912326335906982,
+ "learning_rate": 4.246690437504746e-06,
+ "loss": 0.5348,
+ "step": 3295
+ },
+ {
+ "epoch": 1.5583924349881797,
+ "grad_norm": 2.519303560256958,
+ "learning_rate": 4.246244073574799e-06,
+ "loss": 0.4953,
+ "step": 3296
+ },
+ {
+ "epoch": 1.5588652482269505,
+ "grad_norm": 2.5667171478271484,
+ "learning_rate": 4.24579760091389e-06,
+ "loss": 0.5353,
+ "step": 3297
+ },
+ {
+ "epoch": 1.559338061465721,
+ "grad_norm": 2.8835761547088623,
+ "learning_rate": 4.24535101954982e-06,
+ "loss": 0.578,
+ "step": 3298
+ },
+ {
+ "epoch": 1.5598108747044916,
+ "grad_norm": 3.0506930351257324,
+ "learning_rate": 4.244904329510395e-06,
+ "loss": 0.6418,
+ "step": 3299
+ },
+ {
+ "epoch": 1.5602836879432624,
+ "grad_norm": 2.579446315765381,
+ "learning_rate": 4.244457530823428e-06,
+ "loss": 0.5027,
+ "step": 3300
+ },
+ {
+ "epoch": 1.5607565011820332,
+ "grad_norm": 2.72012996673584,
+ "learning_rate": 4.24401062351674e-06,
+ "loss": 0.5438,
+ "step": 3301
+ },
+ {
+ "epoch": 1.5612293144208038,
+ "grad_norm": 2.527007818222046,
+ "learning_rate": 4.243563607618158e-06,
+ "loss": 0.5303,
+ "step": 3302
+ },
+ {
+ "epoch": 1.5617021276595744,
+ "grad_norm": 2.4415159225463867,
+ "learning_rate": 4.243116483155516e-06,
+ "loss": 0.4893,
+ "step": 3303
+ },
+ {
+ "epoch": 1.5621749408983452,
+ "grad_norm": 2.462256669998169,
+ "learning_rate": 4.242669250156653e-06,
+ "loss": 0.5671,
+ "step": 3304
+ },
+ {
+ "epoch": 1.562647754137116,
+ "grad_norm": 2.479865074157715,
+ "learning_rate": 4.242221908649418e-06,
+ "loss": 0.5038,
+ "step": 3305
+ },
+ {
+ "epoch": 1.5631205673758866,
+ "grad_norm": 2.74670672416687,
+ "learning_rate": 4.241774458661662e-06,
+ "loss": 0.5689,
+ "step": 3306
+ },
+ {
+ "epoch": 1.5635933806146571,
+ "grad_norm": 2.55938982963562,
+ "learning_rate": 4.24132690022125e-06,
+ "loss": 0.492,
+ "step": 3307
+ },
+ {
+ "epoch": 1.564066193853428,
+ "grad_norm": 2.634956121444702,
+ "learning_rate": 4.240879233356048e-06,
+ "loss": 0.503,
+ "step": 3308
+ },
+ {
+ "epoch": 1.5645390070921987,
+ "grad_norm": 2.381775140762329,
+ "learning_rate": 4.240431458093928e-06,
+ "loss": 0.4939,
+ "step": 3309
+ },
+ {
+ "epoch": 1.5650118203309693,
+ "grad_norm": 2.8176610469818115,
+ "learning_rate": 4.239983574462774e-06,
+ "loss": 0.5609,
+ "step": 3310
+ },
+ {
+ "epoch": 1.5654846335697399,
+ "grad_norm": 3.0268442630767822,
+ "learning_rate": 4.239535582490471e-06,
+ "loss": 0.5427,
+ "step": 3311
+ },
+ {
+ "epoch": 1.5659574468085107,
+ "grad_norm": 2.5881481170654297,
+ "learning_rate": 4.239087482204916e-06,
+ "loss": 0.5538,
+ "step": 3312
+ },
+ {
+ "epoch": 1.5664302600472815,
+ "grad_norm": 2.5317704677581787,
+ "learning_rate": 4.238639273634008e-06,
+ "loss": 0.4915,
+ "step": 3313
+ },
+ {
+ "epoch": 1.566903073286052,
+ "grad_norm": 2.9608731269836426,
+ "learning_rate": 4.238190956805658e-06,
+ "loss": 0.564,
+ "step": 3314
+ },
+ {
+ "epoch": 1.5673758865248226,
+ "grad_norm": 3.022686243057251,
+ "learning_rate": 4.237742531747777e-06,
+ "loss": 0.5503,
+ "step": 3315
+ },
+ {
+ "epoch": 1.5678486997635934,
+ "grad_norm": 2.763622283935547,
+ "learning_rate": 4.23729399848829e-06,
+ "loss": 0.5241,
+ "step": 3316
+ },
+ {
+ "epoch": 1.5683215130023642,
+ "grad_norm": 2.6112794876098633,
+ "learning_rate": 4.236845357055122e-06,
+ "loss": 0.4919,
+ "step": 3317
+ },
+ {
+ "epoch": 1.5687943262411348,
+ "grad_norm": 2.649829149246216,
+ "learning_rate": 4.23639660747621e-06,
+ "loss": 0.5472,
+ "step": 3318
+ },
+ {
+ "epoch": 1.5692671394799054,
+ "grad_norm": 2.8888115882873535,
+ "learning_rate": 4.2359477497794955e-06,
+ "loss": 0.5077,
+ "step": 3319
+ },
+ {
+ "epoch": 1.5697399527186762,
+ "grad_norm": 2.5666911602020264,
+ "learning_rate": 4.235498783992927e-06,
+ "loss": 0.5365,
+ "step": 3320
+ },
+ {
+ "epoch": 1.570212765957447,
+ "grad_norm": 2.448758363723755,
+ "learning_rate": 4.2350497101444575e-06,
+ "loss": 0.5043,
+ "step": 3321
+ },
+ {
+ "epoch": 1.5706855791962175,
+ "grad_norm": 2.595207691192627,
+ "learning_rate": 4.234600528262052e-06,
+ "loss": 0.5303,
+ "step": 3322
+ },
+ {
+ "epoch": 1.5711583924349881,
+ "grad_norm": 2.7814228534698486,
+ "learning_rate": 4.234151238373676e-06,
+ "loss": 0.4521,
+ "step": 3323
+ },
+ {
+ "epoch": 1.571631205673759,
+ "grad_norm": 2.781538724899292,
+ "learning_rate": 4.233701840507308e-06,
+ "loss": 0.5193,
+ "step": 3324
+ },
+ {
+ "epoch": 1.5721040189125297,
+ "grad_norm": 2.771907329559326,
+ "learning_rate": 4.233252334690928e-06,
+ "loss": 0.497,
+ "step": 3325
+ },
+ {
+ "epoch": 1.5725768321513003,
+ "grad_norm": 2.5557498931884766,
+ "learning_rate": 4.232802720952525e-06,
+ "loss": 0.4913,
+ "step": 3326
+ },
+ {
+ "epoch": 1.5730496453900709,
+ "grad_norm": 2.478267192840576,
+ "learning_rate": 4.232352999320094e-06,
+ "loss": 0.4967,
+ "step": 3327
+ },
+ {
+ "epoch": 1.5735224586288417,
+ "grad_norm": 3.1548502445220947,
+ "learning_rate": 4.231903169821639e-06,
+ "loss": 0.5009,
+ "step": 3328
+ },
+ {
+ "epoch": 1.5739952718676125,
+ "grad_norm": 2.634824275970459,
+ "learning_rate": 4.231453232485168e-06,
+ "loss": 0.5223,
+ "step": 3329
+ },
+ {
+ "epoch": 1.574468085106383,
+ "grad_norm": 2.579102039337158,
+ "learning_rate": 4.231003187338695e-06,
+ "loss": 0.5513,
+ "step": 3330
+ },
+ {
+ "epoch": 1.5749408983451536,
+ "grad_norm": 2.8477070331573486,
+ "learning_rate": 4.230553034410245e-06,
+ "loss": 0.561,
+ "step": 3331
+ },
+ {
+ "epoch": 1.5754137115839244,
+ "grad_norm": 2.6714725494384766,
+ "learning_rate": 4.2301027737278446e-06,
+ "loss": 0.4687,
+ "step": 3332
+ },
+ {
+ "epoch": 1.5758865248226952,
+ "grad_norm": 2.6562764644622803,
+ "learning_rate": 4.229652405319532e-06,
+ "loss": 0.5925,
+ "step": 3333
+ },
+ {
+ "epoch": 1.5763593380614658,
+ "grad_norm": 2.750946283340454,
+ "learning_rate": 4.229201929213348e-06,
+ "loss": 0.4748,
+ "step": 3334
+ },
+ {
+ "epoch": 1.5768321513002364,
+ "grad_norm": 2.760470151901245,
+ "learning_rate": 4.228751345437342e-06,
+ "loss": 0.5989,
+ "step": 3335
+ },
+ {
+ "epoch": 1.5773049645390071,
+ "grad_norm": 3.1451845169067383,
+ "learning_rate": 4.2283006540195706e-06,
+ "loss": 0.562,
+ "step": 3336
+ },
+ {
+ "epoch": 1.5777777777777777,
+ "grad_norm": 2.563011407852173,
+ "learning_rate": 4.227849854988095e-06,
+ "loss": 0.5473,
+ "step": 3337
+ },
+ {
+ "epoch": 1.5782505910165483,
+ "grad_norm": 2.310469388961792,
+ "learning_rate": 4.2273989483709856e-06,
+ "loss": 0.5033,
+ "step": 3338
+ },
+ {
+ "epoch": 1.578723404255319,
+ "grad_norm": 2.677978754043579,
+ "learning_rate": 4.226947934196318e-06,
+ "loss": 0.5291,
+ "step": 3339
+ },
+ {
+ "epoch": 1.57919621749409,
+ "grad_norm": 3.0423545837402344,
+ "learning_rate": 4.226496812492176e-06,
+ "loss": 0.5201,
+ "step": 3340
+ },
+ {
+ "epoch": 1.5796690307328605,
+ "grad_norm": 2.357513904571533,
+ "learning_rate": 4.226045583286647e-06,
+ "loss": 0.4421,
+ "step": 3341
+ },
+ {
+ "epoch": 1.580141843971631,
+ "grad_norm": 2.719860315322876,
+ "learning_rate": 4.225594246607828e-06,
+ "loss": 0.4855,
+ "step": 3342
+ },
+ {
+ "epoch": 1.5806146572104018,
+ "grad_norm": 3.2645058631896973,
+ "learning_rate": 4.2251428024838215e-06,
+ "loss": 0.6654,
+ "step": 3343
+ },
+ {
+ "epoch": 1.5810874704491726,
+ "grad_norm": 2.2997004985809326,
+ "learning_rate": 4.224691250942737e-06,
+ "loss": 0.4565,
+ "step": 3344
+ },
+ {
+ "epoch": 1.5815602836879432,
+ "grad_norm": 2.8103034496307373,
+ "learning_rate": 4.2242395920126926e-06,
+ "loss": 0.5543,
+ "step": 3345
+ },
+ {
+ "epoch": 1.5820330969267138,
+ "grad_norm": 2.720254898071289,
+ "learning_rate": 4.223787825721808e-06,
+ "loss": 0.5028,
+ "step": 3346
+ },
+ {
+ "epoch": 1.5825059101654846,
+ "grad_norm": 2.735544204711914,
+ "learning_rate": 4.223335952098214e-06,
+ "loss": 0.5169,
+ "step": 3347
+ },
+ {
+ "epoch": 1.5829787234042554,
+ "grad_norm": 2.784254550933838,
+ "learning_rate": 4.222883971170047e-06,
+ "loss": 0.4989,
+ "step": 3348
+ },
+ {
+ "epoch": 1.583451536643026,
+ "grad_norm": 2.7192094326019287,
+ "learning_rate": 4.22243188296545e-06,
+ "loss": 0.502,
+ "step": 3349
+ },
+ {
+ "epoch": 1.5839243498817965,
+ "grad_norm": 2.716501474380493,
+ "learning_rate": 4.221979687512573e-06,
+ "loss": 0.5687,
+ "step": 3350
+ },
+ {
+ "epoch": 1.5843971631205673,
+ "grad_norm": 2.8420114517211914,
+ "learning_rate": 4.22152738483957e-06,
+ "loss": 0.5903,
+ "step": 3351
+ },
+ {
+ "epoch": 1.5848699763593381,
+ "grad_norm": 2.734872579574585,
+ "learning_rate": 4.2210749749746065e-06,
+ "loss": 0.5397,
+ "step": 3352
+ },
+ {
+ "epoch": 1.5853427895981087,
+ "grad_norm": 2.4343836307525635,
+ "learning_rate": 4.220622457945851e-06,
+ "loss": 0.436,
+ "step": 3353
+ },
+ {
+ "epoch": 1.5858156028368793,
+ "grad_norm": 2.728177547454834,
+ "learning_rate": 4.2201698337814785e-06,
+ "loss": 0.5703,
+ "step": 3354
+ },
+ {
+ "epoch": 1.58628841607565,
+ "grad_norm": 2.502098560333252,
+ "learning_rate": 4.219717102509674e-06,
+ "loss": 0.5275,
+ "step": 3355
+ },
+ {
+ "epoch": 1.5867612293144209,
+ "grad_norm": 2.6595494747161865,
+ "learning_rate": 4.219264264158627e-06,
+ "loss": 0.4659,
+ "step": 3356
+ },
+ {
+ "epoch": 1.5872340425531914,
+ "grad_norm": 2.5307185649871826,
+ "learning_rate": 4.218811318756532e-06,
+ "loss": 0.5048,
+ "step": 3357
+ },
+ {
+ "epoch": 1.587706855791962,
+ "grad_norm": 2.9300129413604736,
+ "learning_rate": 4.218358266331593e-06,
+ "loss": 0.5137,
+ "step": 3358
+ },
+ {
+ "epoch": 1.5881796690307328,
+ "grad_norm": 2.686586618423462,
+ "learning_rate": 4.21790510691202e-06,
+ "loss": 0.4529,
+ "step": 3359
+ },
+ {
+ "epoch": 1.5886524822695036,
+ "grad_norm": 2.9981517791748047,
+ "learning_rate": 4.217451840526029e-06,
+ "loss": 0.6054,
+ "step": 3360
+ },
+ {
+ "epoch": 1.5891252955082742,
+ "grad_norm": 2.6943674087524414,
+ "learning_rate": 4.216998467201841e-06,
+ "loss": 0.5153,
+ "step": 3361
+ },
+ {
+ "epoch": 1.5895981087470448,
+ "grad_norm": 2.707084894180298,
+ "learning_rate": 4.216544986967689e-06,
+ "loss": 0.5235,
+ "step": 3362
+ },
+ {
+ "epoch": 1.5900709219858156,
+ "grad_norm": 2.6553728580474854,
+ "learning_rate": 4.216091399851808e-06,
+ "loss": 0.5275,
+ "step": 3363
+ },
+ {
+ "epoch": 1.5905437352245864,
+ "grad_norm": 2.9136953353881836,
+ "learning_rate": 4.215637705882439e-06,
+ "loss": 0.5834,
+ "step": 3364
+ },
+ {
+ "epoch": 1.591016548463357,
+ "grad_norm": 2.7647159099578857,
+ "learning_rate": 4.2151839050878325e-06,
+ "loss": 0.5641,
+ "step": 3365
+ },
+ {
+ "epoch": 1.5914893617021275,
+ "grad_norm": 2.4556827545166016,
+ "learning_rate": 4.214729997496246e-06,
+ "loss": 0.5636,
+ "step": 3366
+ },
+ {
+ "epoch": 1.5919621749408983,
+ "grad_norm": 2.6111652851104736,
+ "learning_rate": 4.2142759831359414e-06,
+ "loss": 0.5097,
+ "step": 3367
+ },
+ {
+ "epoch": 1.592434988179669,
+ "grad_norm": 2.4886903762817383,
+ "learning_rate": 4.213821862035189e-06,
+ "loss": 0.531,
+ "step": 3368
+ },
+ {
+ "epoch": 1.5929078014184397,
+ "grad_norm": 2.5245840549468994,
+ "learning_rate": 4.213367634222263e-06,
+ "loss": 0.5085,
+ "step": 3369
+ },
+ {
+ "epoch": 1.5933806146572103,
+ "grad_norm": 2.970214605331421,
+ "learning_rate": 4.212913299725447e-06,
+ "loss": 0.5851,
+ "step": 3370
+ },
+ {
+ "epoch": 1.593853427895981,
+ "grad_norm": 2.5433361530303955,
+ "learning_rate": 4.212458858573032e-06,
+ "loss": 0.48,
+ "step": 3371
+ },
+ {
+ "epoch": 1.5943262411347519,
+ "grad_norm": 2.3550102710723877,
+ "learning_rate": 4.212004310793312e-06,
+ "loss": 0.4405,
+ "step": 3372
+ },
+ {
+ "epoch": 1.5947990543735224,
+ "grad_norm": 2.4824719429016113,
+ "learning_rate": 4.2115496564145896e-06,
+ "loss": 0.4634,
+ "step": 3373
+ },
+ {
+ "epoch": 1.595271867612293,
+ "grad_norm": 2.4751930236816406,
+ "learning_rate": 4.211094895465176e-06,
+ "loss": 0.5662,
+ "step": 3374
+ },
+ {
+ "epoch": 1.5957446808510638,
+ "grad_norm": 2.4193356037139893,
+ "learning_rate": 4.210640027973386e-06,
+ "loss": 0.4441,
+ "step": 3375
+ },
+ {
+ "epoch": 1.5962174940898346,
+ "grad_norm": 2.4477498531341553,
+ "learning_rate": 4.210185053967543e-06,
+ "loss": 0.5205,
+ "step": 3376
+ },
+ {
+ "epoch": 1.5966903073286052,
+ "grad_norm": 2.7954161167144775,
+ "learning_rate": 4.209729973475976e-06,
+ "loss": 0.4951,
+ "step": 3377
+ },
+ {
+ "epoch": 1.5971631205673757,
+ "grad_norm": 3.1907570362091064,
+ "learning_rate": 4.209274786527019e-06,
+ "loss": 0.6024,
+ "step": 3378
+ },
+ {
+ "epoch": 1.5976359338061465,
+ "grad_norm": 2.485245704650879,
+ "learning_rate": 4.2088194931490165e-06,
+ "loss": 0.5652,
+ "step": 3379
+ },
+ {
+ "epoch": 1.5981087470449173,
+ "grad_norm": 2.589310884475708,
+ "learning_rate": 4.208364093370317e-06,
+ "loss": 0.5085,
+ "step": 3380
+ },
+ {
+ "epoch": 1.598581560283688,
+ "grad_norm": 2.8941214084625244,
+ "learning_rate": 4.207908587219276e-06,
+ "loss": 0.53,
+ "step": 3381
+ },
+ {
+ "epoch": 1.5990543735224585,
+ "grad_norm": 2.480509042739868,
+ "learning_rate": 4.207452974724258e-06,
+ "loss": 0.4543,
+ "step": 3382
+ },
+ {
+ "epoch": 1.5995271867612293,
+ "grad_norm": 2.7884905338287354,
+ "learning_rate": 4.206997255913629e-06,
+ "loss": 0.5483,
+ "step": 3383
+ },
+ {
+ "epoch": 1.6,
+ "grad_norm": 2.7976696491241455,
+ "learning_rate": 4.206541430815766e-06,
+ "loss": 0.4734,
+ "step": 3384
+ },
+ {
+ "epoch": 1.6004728132387707,
+ "grad_norm": 2.5463132858276367,
+ "learning_rate": 4.206085499459051e-06,
+ "loss": 0.4931,
+ "step": 3385
+ },
+ {
+ "epoch": 1.6009456264775412,
+ "grad_norm": 2.8384251594543457,
+ "learning_rate": 4.205629461871871e-06,
+ "loss": 0.5066,
+ "step": 3386
+ },
+ {
+ "epoch": 1.601418439716312,
+ "grad_norm": 2.8578574657440186,
+ "learning_rate": 4.205173318082626e-06,
+ "loss": 0.458,
+ "step": 3387
+ },
+ {
+ "epoch": 1.6018912529550828,
+ "grad_norm": 2.7779932022094727,
+ "learning_rate": 4.204717068119715e-06,
+ "loss": 0.5293,
+ "step": 3388
+ },
+ {
+ "epoch": 1.6023640661938534,
+ "grad_norm": 2.9123778343200684,
+ "learning_rate": 4.204260712011546e-06,
+ "loss": 0.4866,
+ "step": 3389
+ },
+ {
+ "epoch": 1.602836879432624,
+ "grad_norm": 2.757922887802124,
+ "learning_rate": 4.203804249786537e-06,
+ "loss": 0.4925,
+ "step": 3390
+ },
+ {
+ "epoch": 1.6033096926713948,
+ "grad_norm": 3.287733316421509,
+ "learning_rate": 4.203347681473107e-06,
+ "loss": 0.6694,
+ "step": 3391
+ },
+ {
+ "epoch": 1.6037825059101656,
+ "grad_norm": 3.2117912769317627,
+ "learning_rate": 4.202891007099687e-06,
+ "loss": 0.5269,
+ "step": 3392
+ },
+ {
+ "epoch": 1.6042553191489362,
+ "grad_norm": 2.8489456176757812,
+ "learning_rate": 4.20243422669471e-06,
+ "loss": 0.5073,
+ "step": 3393
+ },
+ {
+ "epoch": 1.6047281323877067,
+ "grad_norm": 2.7660224437713623,
+ "learning_rate": 4.201977340286619e-06,
+ "loss": 0.5014,
+ "step": 3394
+ },
+ {
+ "epoch": 1.6052009456264775,
+ "grad_norm": 2.68182110786438,
+ "learning_rate": 4.201520347903862e-06,
+ "loss": 0.4542,
+ "step": 3395
+ },
+ {
+ "epoch": 1.6056737588652483,
+ "grad_norm": 2.7546045780181885,
+ "learning_rate": 4.2010632495748934e-06,
+ "loss": 0.516,
+ "step": 3396
+ },
+ {
+ "epoch": 1.606146572104019,
+ "grad_norm": 2.744668483734131,
+ "learning_rate": 4.200606045328176e-06,
+ "loss": 0.5243,
+ "step": 3397
+ },
+ {
+ "epoch": 1.6066193853427895,
+ "grad_norm": 2.935343027114868,
+ "learning_rate": 4.200148735192177e-06,
+ "loss": 0.5624,
+ "step": 3398
+ },
+ {
+ "epoch": 1.6070921985815603,
+ "grad_norm": 2.7392852306365967,
+ "learning_rate": 4.19969131919537e-06,
+ "loss": 0.5796,
+ "step": 3399
+ },
+ {
+ "epoch": 1.607565011820331,
+ "grad_norm": 2.864750385284424,
+ "learning_rate": 4.199233797366239e-06,
+ "loss": 0.549,
+ "step": 3400
+ },
+ {
+ "epoch": 1.6080378250591016,
+ "grad_norm": 2.684157371520996,
+ "learning_rate": 4.198776169733269e-06,
+ "loss": 0.5532,
+ "step": 3401
+ },
+ {
+ "epoch": 1.6085106382978722,
+ "grad_norm": 2.4717135429382324,
+ "learning_rate": 4.198318436324957e-06,
+ "loss": 0.5174,
+ "step": 3402
+ },
+ {
+ "epoch": 1.608983451536643,
+ "grad_norm": 2.640242338180542,
+ "learning_rate": 4.197860597169802e-06,
+ "loss": 0.5117,
+ "step": 3403
+ },
+ {
+ "epoch": 1.6094562647754138,
+ "grad_norm": 2.4957473278045654,
+ "learning_rate": 4.197402652296313e-06,
+ "loss": 0.474,
+ "step": 3404
+ },
+ {
+ "epoch": 1.6099290780141844,
+ "grad_norm": 2.416138172149658,
+ "learning_rate": 4.196944601733004e-06,
+ "loss": 0.4858,
+ "step": 3405
+ },
+ {
+ "epoch": 1.610401891252955,
+ "grad_norm": 2.4498109817504883,
+ "learning_rate": 4.196486445508395e-06,
+ "loss": 0.5048,
+ "step": 3406
+ },
+ {
+ "epoch": 1.6108747044917258,
+ "grad_norm": 2.415895938873291,
+ "learning_rate": 4.196028183651014e-06,
+ "loss": 0.4745,
+ "step": 3407
+ },
+ {
+ "epoch": 1.6113475177304966,
+ "grad_norm": 2.843665838241577,
+ "learning_rate": 4.195569816189395e-06,
+ "loss": 0.5219,
+ "step": 3408
+ },
+ {
+ "epoch": 1.6118203309692671,
+ "grad_norm": 2.608579158782959,
+ "learning_rate": 4.195111343152079e-06,
+ "loss": 0.4941,
+ "step": 3409
+ },
+ {
+ "epoch": 1.6122931442080377,
+ "grad_norm": 2.643789529800415,
+ "learning_rate": 4.194652764567611e-06,
+ "loss": 0.515,
+ "step": 3410
+ },
+ {
+ "epoch": 1.6127659574468085,
+ "grad_norm": 2.8099429607391357,
+ "learning_rate": 4.194194080464547e-06,
+ "loss": 0.4935,
+ "step": 3411
+ },
+ {
+ "epoch": 1.6132387706855793,
+ "grad_norm": 2.595628261566162,
+ "learning_rate": 4.193735290871446e-06,
+ "loss": 0.5571,
+ "step": 3412
+ },
+ {
+ "epoch": 1.6137115839243499,
+ "grad_norm": 2.7903778553009033,
+ "learning_rate": 4.193276395816876e-06,
+ "loss": 0.5228,
+ "step": 3413
+ },
+ {
+ "epoch": 1.6141843971631205,
+ "grad_norm": 2.83910870552063,
+ "learning_rate": 4.192817395329409e-06,
+ "loss": 0.6124,
+ "step": 3414
+ },
+ {
+ "epoch": 1.6146572104018913,
+ "grad_norm": 2.6155734062194824,
+ "learning_rate": 4.192358289437626e-06,
+ "loss": 0.552,
+ "step": 3415
+ },
+ {
+ "epoch": 1.615130023640662,
+ "grad_norm": 2.795832872390747,
+ "learning_rate": 4.191899078170113e-06,
+ "loss": 0.5561,
+ "step": 3416
+ },
+ {
+ "epoch": 1.6156028368794326,
+ "grad_norm": 2.3402161598205566,
+ "learning_rate": 4.191439761555464e-06,
+ "loss": 0.4889,
+ "step": 3417
+ },
+ {
+ "epoch": 1.6160756501182032,
+ "grad_norm": 3.1183433532714844,
+ "learning_rate": 4.190980339622276e-06,
+ "loss": 0.5337,
+ "step": 3418
+ },
+ {
+ "epoch": 1.616548463356974,
+ "grad_norm": 2.6262872219085693,
+ "learning_rate": 4.190520812399158e-06,
+ "loss": 0.525,
+ "step": 3419
+ },
+ {
+ "epoch": 1.6170212765957448,
+ "grad_norm": 2.578340530395508,
+ "learning_rate": 4.190061179914722e-06,
+ "loss": 0.4975,
+ "step": 3420
+ },
+ {
+ "epoch": 1.6174940898345154,
+ "grad_norm": 3.19482159614563,
+ "learning_rate": 4.189601442197586e-06,
+ "loss": 0.5832,
+ "step": 3421
+ },
+ {
+ "epoch": 1.617966903073286,
+ "grad_norm": 2.6398792266845703,
+ "learning_rate": 4.189141599276378e-06,
+ "loss": 0.4676,
+ "step": 3422
+ },
+ {
+ "epoch": 1.6184397163120567,
+ "grad_norm": 2.624865770339966,
+ "learning_rate": 4.1886816511797275e-06,
+ "loss": 0.4507,
+ "step": 3423
+ },
+ {
+ "epoch": 1.6189125295508275,
+ "grad_norm": 2.4136857986450195,
+ "learning_rate": 4.1882215979362775e-06,
+ "loss": 0.4616,
+ "step": 3424
+ },
+ {
+ "epoch": 1.6193853427895981,
+ "grad_norm": 2.6906614303588867,
+ "learning_rate": 4.18776143957467e-06,
+ "loss": 0.5142,
+ "step": 3425
+ },
+ {
+ "epoch": 1.6198581560283687,
+ "grad_norm": 2.5149154663085938,
+ "learning_rate": 4.187301176123558e-06,
+ "loss": 0.5252,
+ "step": 3426
+ },
+ {
+ "epoch": 1.6203309692671395,
+ "grad_norm": 2.677405834197998,
+ "learning_rate": 4.186840807611602e-06,
+ "loss": 0.4635,
+ "step": 3427
+ },
+ {
+ "epoch": 1.6208037825059103,
+ "grad_norm": 2.7164649963378906,
+ "learning_rate": 4.186380334067464e-06,
+ "loss": 0.5634,
+ "step": 3428
+ },
+ {
+ "epoch": 1.6212765957446809,
+ "grad_norm": 2.8299832344055176,
+ "learning_rate": 4.185919755519817e-06,
+ "loss": 0.5166,
+ "step": 3429
+ },
+ {
+ "epoch": 1.6217494089834514,
+ "grad_norm": 2.465848207473755,
+ "learning_rate": 4.18545907199734e-06,
+ "loss": 0.4696,
+ "step": 3430
+ },
+ {
+ "epoch": 1.6222222222222222,
+ "grad_norm": 2.407616376876831,
+ "learning_rate": 4.1849982835287175e-06,
+ "loss": 0.5111,
+ "step": 3431
+ },
+ {
+ "epoch": 1.622695035460993,
+ "grad_norm": 2.452146291732788,
+ "learning_rate": 4.184537390142639e-06,
+ "loss": 0.4574,
+ "step": 3432
+ },
+ {
+ "epoch": 1.6231678486997636,
+ "grad_norm": 2.653071165084839,
+ "learning_rate": 4.1840763918678055e-06,
+ "loss": 0.5611,
+ "step": 3433
+ },
+ {
+ "epoch": 1.6236406619385342,
+ "grad_norm": 2.5920350551605225,
+ "learning_rate": 4.183615288732919e-06,
+ "loss": 0.5437,
+ "step": 3434
+ },
+ {
+ "epoch": 1.624113475177305,
+ "grad_norm": 2.782900810241699,
+ "learning_rate": 4.18315408076669e-06,
+ "loss": 0.5824,
+ "step": 3435
+ },
+ {
+ "epoch": 1.6245862884160758,
+ "grad_norm": 2.8769774436950684,
+ "learning_rate": 4.1826927679978365e-06,
+ "loss": 0.5271,
+ "step": 3436
+ },
+ {
+ "epoch": 1.6250591016548463,
+ "grad_norm": 2.488598585128784,
+ "learning_rate": 4.182231350455084e-06,
+ "loss": 0.4684,
+ "step": 3437
+ },
+ {
+ "epoch": 1.625531914893617,
+ "grad_norm": 2.6472036838531494,
+ "learning_rate": 4.181769828167161e-06,
+ "loss": 0.5372,
+ "step": 3438
+ },
+ {
+ "epoch": 1.6260047281323877,
+ "grad_norm": 2.6498794555664062,
+ "learning_rate": 4.1813082011628045e-06,
+ "loss": 0.4805,
+ "step": 3439
+ },
+ {
+ "epoch": 1.6264775413711585,
+ "grad_norm": 2.5386533737182617,
+ "learning_rate": 4.1808464694707595e-06,
+ "loss": 0.5015,
+ "step": 3440
+ },
+ {
+ "epoch": 1.626950354609929,
+ "grad_norm": 2.8812551498413086,
+ "learning_rate": 4.180384633119775e-06,
+ "loss": 0.5225,
+ "step": 3441
+ },
+ {
+ "epoch": 1.6274231678486997,
+ "grad_norm": 2.870124578475952,
+ "learning_rate": 4.179922692138609e-06,
+ "loss": 0.537,
+ "step": 3442
+ },
+ {
+ "epoch": 1.6278959810874705,
+ "grad_norm": 2.5759785175323486,
+ "learning_rate": 4.179460646556021e-06,
+ "loss": 0.5142,
+ "step": 3443
+ },
+ {
+ "epoch": 1.6283687943262413,
+ "grad_norm": 2.629347324371338,
+ "learning_rate": 4.1789984964007836e-06,
+ "loss": 0.5007,
+ "step": 3444
+ },
+ {
+ "epoch": 1.6288416075650118,
+ "grad_norm": 2.751128673553467,
+ "learning_rate": 4.178536241701672e-06,
+ "loss": 0.5677,
+ "step": 3445
+ },
+ {
+ "epoch": 1.6293144208037824,
+ "grad_norm": 2.7582364082336426,
+ "learning_rate": 4.178073882487469e-06,
+ "loss": 0.499,
+ "step": 3446
+ },
+ {
+ "epoch": 1.6297872340425532,
+ "grad_norm": 3.136711359024048,
+ "learning_rate": 4.177611418786963e-06,
+ "loss": 0.5294,
+ "step": 3447
+ },
+ {
+ "epoch": 1.630260047281324,
+ "grad_norm": 2.7363100051879883,
+ "learning_rate": 4.17714885062895e-06,
+ "loss": 0.5264,
+ "step": 3448
+ },
+ {
+ "epoch": 1.6307328605200946,
+ "grad_norm": 2.7305946350097656,
+ "learning_rate": 4.176686178042233e-06,
+ "loss": 0.5235,
+ "step": 3449
+ },
+ {
+ "epoch": 1.6312056737588652,
+ "grad_norm": 2.6500556468963623,
+ "learning_rate": 4.176223401055619e-06,
+ "loss": 0.5463,
+ "step": 3450
+ },
+ {
+ "epoch": 1.631678486997636,
+ "grad_norm": 2.756321907043457,
+ "learning_rate": 4.175760519697924e-06,
+ "loss": 0.545,
+ "step": 3451
+ },
+ {
+ "epoch": 1.6321513002364068,
+ "grad_norm": 2.6234960556030273,
+ "learning_rate": 4.17529753399797e-06,
+ "loss": 0.4927,
+ "step": 3452
+ },
+ {
+ "epoch": 1.6326241134751773,
+ "grad_norm": 2.6358842849731445,
+ "learning_rate": 4.174834443984584e-06,
+ "loss": 0.5445,
+ "step": 3453
+ },
+ {
+ "epoch": 1.633096926713948,
+ "grad_norm": 2.541147470474243,
+ "learning_rate": 4.174371249686601e-06,
+ "loss": 0.4691,
+ "step": 3454
+ },
+ {
+ "epoch": 1.6335697399527187,
+ "grad_norm": 2.566981077194214,
+ "learning_rate": 4.173907951132863e-06,
+ "loss": 0.4932,
+ "step": 3455
+ },
+ {
+ "epoch": 1.6340425531914895,
+ "grad_norm": 2.670940399169922,
+ "learning_rate": 4.173444548352216e-06,
+ "loss": 0.4979,
+ "step": 3456
+ },
+ {
+ "epoch": 1.63451536643026,
+ "grad_norm": 2.5440268516540527,
+ "learning_rate": 4.172981041373515e-06,
+ "loss": 0.4716,
+ "step": 3457
+ },
+ {
+ "epoch": 1.6349881796690307,
+ "grad_norm": 2.3801631927490234,
+ "learning_rate": 4.17251743022562e-06,
+ "loss": 0.5126,
+ "step": 3458
+ },
+ {
+ "epoch": 1.6354609929078014,
+ "grad_norm": 2.5051121711730957,
+ "learning_rate": 4.1720537149373985e-06,
+ "loss": 0.4964,
+ "step": 3459
+ },
+ {
+ "epoch": 1.6359338061465722,
+ "grad_norm": 3.5521697998046875,
+ "learning_rate": 4.171589895537724e-06,
+ "loss": 0.5447,
+ "step": 3460
+ },
+ {
+ "epoch": 1.6364066193853428,
+ "grad_norm": 2.6041572093963623,
+ "learning_rate": 4.171125972055477e-06,
+ "loss": 0.4637,
+ "step": 3461
+ },
+ {
+ "epoch": 1.6368794326241134,
+ "grad_norm": 2.2297258377075195,
+ "learning_rate": 4.170661944519543e-06,
+ "loss": 0.4702,
+ "step": 3462
+ },
+ {
+ "epoch": 1.6373522458628842,
+ "grad_norm": 2.6764535903930664,
+ "learning_rate": 4.170197812958815e-06,
+ "loss": 0.5111,
+ "step": 3463
+ },
+ {
+ "epoch": 1.637825059101655,
+ "grad_norm": 2.86892032623291,
+ "learning_rate": 4.169733577402193e-06,
+ "loss": 0.5437,
+ "step": 3464
+ },
+ {
+ "epoch": 1.6382978723404256,
+ "grad_norm": 2.9007070064544678,
+ "learning_rate": 4.1692692378785825e-06,
+ "loss": 0.5425,
+ "step": 3465
+ },
+ {
+ "epoch": 1.6387706855791961,
+ "grad_norm": 2.5902905464172363,
+ "learning_rate": 4.168804794416896e-06,
+ "loss": 0.5252,
+ "step": 3466
+ },
+ {
+ "epoch": 1.639243498817967,
+ "grad_norm": 2.821183681488037,
+ "learning_rate": 4.168340247046053e-06,
+ "loss": 0.5265,
+ "step": 3467
+ },
+ {
+ "epoch": 1.6397163120567377,
+ "grad_norm": 2.7928314208984375,
+ "learning_rate": 4.167875595794978e-06,
+ "loss": 0.5151,
+ "step": 3468
+ },
+ {
+ "epoch": 1.6401891252955083,
+ "grad_norm": 2.3130412101745605,
+ "learning_rate": 4.167410840692603e-06,
+ "loss": 0.4941,
+ "step": 3469
+ },
+ {
+ "epoch": 1.6406619385342789,
+ "grad_norm": 2.6078619956970215,
+ "learning_rate": 4.1669459817678655e-06,
+ "loss": 0.493,
+ "step": 3470
+ },
+ {
+ "epoch": 1.6411347517730497,
+ "grad_norm": 2.5335731506347656,
+ "learning_rate": 4.166481019049712e-06,
+ "loss": 0.4969,
+ "step": 3471
+ },
+ {
+ "epoch": 1.6416075650118205,
+ "grad_norm": 2.8181469440460205,
+ "learning_rate": 4.166015952567093e-06,
+ "loss": 0.5062,
+ "step": 3472
+ },
+ {
+ "epoch": 1.642080378250591,
+ "grad_norm": 2.7256782054901123,
+ "learning_rate": 4.165550782348966e-06,
+ "loss": 0.5397,
+ "step": 3473
+ },
+ {
+ "epoch": 1.6425531914893616,
+ "grad_norm": 2.284345865249634,
+ "learning_rate": 4.1650855084242946e-06,
+ "loss": 0.4448,
+ "step": 3474
+ },
+ {
+ "epoch": 1.6430260047281324,
+ "grad_norm": 3.0383145809173584,
+ "learning_rate": 4.164620130822049e-06,
+ "loss": 0.5873,
+ "step": 3475
+ },
+ {
+ "epoch": 1.6434988179669032,
+ "grad_norm": 2.754448652267456,
+ "learning_rate": 4.1641546495712085e-06,
+ "loss": 0.4852,
+ "step": 3476
+ },
+ {
+ "epoch": 1.6439716312056738,
+ "grad_norm": 2.6820101737976074,
+ "learning_rate": 4.1636890647007535e-06,
+ "loss": 0.5325,
+ "step": 3477
+ },
+ {
+ "epoch": 1.6444444444444444,
+ "grad_norm": 2.6396398544311523,
+ "learning_rate": 4.163223376239676e-06,
+ "loss": 0.466,
+ "step": 3478
+ },
+ {
+ "epoch": 1.6449172576832152,
+ "grad_norm": 2.395049810409546,
+ "learning_rate": 4.162757584216972e-06,
+ "loss": 0.4531,
+ "step": 3479
+ },
+ {
+ "epoch": 1.645390070921986,
+ "grad_norm": 2.596670627593994,
+ "learning_rate": 4.162291688661645e-06,
+ "loss": 0.5207,
+ "step": 3480
+ },
+ {
+ "epoch": 1.6458628841607565,
+ "grad_norm": 2.4391872882843018,
+ "learning_rate": 4.161825689602703e-06,
+ "loss": 0.5133,
+ "step": 3481
+ },
+ {
+ "epoch": 1.6463356973995271,
+ "grad_norm": 2.6169841289520264,
+ "learning_rate": 4.161359587069162e-06,
+ "loss": 0.5096,
+ "step": 3482
+ },
+ {
+ "epoch": 1.646808510638298,
+ "grad_norm": 2.634089946746826,
+ "learning_rate": 4.1608933810900445e-06,
+ "loss": 0.4921,
+ "step": 3483
+ },
+ {
+ "epoch": 1.6472813238770687,
+ "grad_norm": 2.815877914428711,
+ "learning_rate": 4.160427071694379e-06,
+ "loss": 0.5045,
+ "step": 3484
+ },
+ {
+ "epoch": 1.6477541371158393,
+ "grad_norm": 2.417525053024292,
+ "learning_rate": 4.159960658911199e-06,
+ "loss": 0.4997,
+ "step": 3485
+ },
+ {
+ "epoch": 1.6482269503546099,
+ "grad_norm": 2.5713605880737305,
+ "learning_rate": 4.15949414276955e-06,
+ "loss": 0.5246,
+ "step": 3486
+ },
+ {
+ "epoch": 1.6486997635933807,
+ "grad_norm": 3.49833607673645,
+ "learning_rate": 4.159027523298475e-06,
+ "loss": 0.4901,
+ "step": 3487
+ },
+ {
+ "epoch": 1.6491725768321515,
+ "grad_norm": 2.985464334487915,
+ "learning_rate": 4.158560800527033e-06,
+ "loss": 0.5726,
+ "step": 3488
+ },
+ {
+ "epoch": 1.649645390070922,
+ "grad_norm": 2.72745680809021,
+ "learning_rate": 4.158093974484282e-06,
+ "loss": 0.5119,
+ "step": 3489
+ },
+ {
+ "epoch": 1.6501182033096926,
+ "grad_norm": 2.4885571002960205,
+ "learning_rate": 4.157627045199289e-06,
+ "loss": 0.4838,
+ "step": 3490
+ },
+ {
+ "epoch": 1.6505910165484634,
+ "grad_norm": 2.7622628211975098,
+ "learning_rate": 4.157160012701128e-06,
+ "loss": 0.5269,
+ "step": 3491
+ },
+ {
+ "epoch": 1.6510638297872342,
+ "grad_norm": 2.615122079849243,
+ "learning_rate": 4.156692877018879e-06,
+ "loss": 0.5501,
+ "step": 3492
+ },
+ {
+ "epoch": 1.6515366430260048,
+ "grad_norm": 2.827753782272339,
+ "learning_rate": 4.156225638181631e-06,
+ "loss": 0.5452,
+ "step": 3493
+ },
+ {
+ "epoch": 1.6520094562647754,
+ "grad_norm": 2.724820137023926,
+ "learning_rate": 4.155758296218474e-06,
+ "loss": 0.5155,
+ "step": 3494
+ },
+ {
+ "epoch": 1.6524822695035462,
+ "grad_norm": 2.5806174278259277,
+ "learning_rate": 4.155290851158508e-06,
+ "loss": 0.5292,
+ "step": 3495
+ },
+ {
+ "epoch": 1.652955082742317,
+ "grad_norm": 2.5655179023742676,
+ "learning_rate": 4.154823303030838e-06,
+ "loss": 0.4959,
+ "step": 3496
+ },
+ {
+ "epoch": 1.6534278959810875,
+ "grad_norm": 2.656548261642456,
+ "learning_rate": 4.154355651864579e-06,
+ "loss": 0.5703,
+ "step": 3497
+ },
+ {
+ "epoch": 1.653900709219858,
+ "grad_norm": 2.9085004329681396,
+ "learning_rate": 4.153887897688847e-06,
+ "loss": 0.5061,
+ "step": 3498
+ },
+ {
+ "epoch": 1.654373522458629,
+ "grad_norm": 2.608010768890381,
+ "learning_rate": 4.1534200405327665e-06,
+ "loss": 0.5165,
+ "step": 3499
+ },
+ {
+ "epoch": 1.6548463356973995,
+ "grad_norm": 2.600463628768921,
+ "learning_rate": 4.152952080425471e-06,
+ "loss": 0.4946,
+ "step": 3500
+ },
+ {
+ "epoch": 1.65531914893617,
+ "grad_norm": 2.5561563968658447,
+ "learning_rate": 4.152484017396098e-06,
+ "loss": 0.4804,
+ "step": 3501
+ },
+ {
+ "epoch": 1.6557919621749408,
+ "grad_norm": 2.788594961166382,
+ "learning_rate": 4.152015851473791e-06,
+ "loss": 0.5635,
+ "step": 3502
+ },
+ {
+ "epoch": 1.6562647754137116,
+ "grad_norm": 2.693302631378174,
+ "learning_rate": 4.151547582687699e-06,
+ "loss": 0.5139,
+ "step": 3503
+ },
+ {
+ "epoch": 1.6567375886524822,
+ "grad_norm": 2.7887485027313232,
+ "learning_rate": 4.1510792110669825e-06,
+ "loss": 0.4952,
+ "step": 3504
+ },
+ {
+ "epoch": 1.6572104018912528,
+ "grad_norm": 2.8982298374176025,
+ "learning_rate": 4.150610736640803e-06,
+ "loss": 0.4136,
+ "step": 3505
+ },
+ {
+ "epoch": 1.6576832151300236,
+ "grad_norm": 2.7569408416748047,
+ "learning_rate": 4.150142159438331e-06,
+ "loss": 0.5272,
+ "step": 3506
+ },
+ {
+ "epoch": 1.6581560283687944,
+ "grad_norm": 2.531648874282837,
+ "learning_rate": 4.149673479488742e-06,
+ "loss": 0.5016,
+ "step": 3507
+ },
+ {
+ "epoch": 1.658628841607565,
+ "grad_norm": 2.7706353664398193,
+ "learning_rate": 4.149204696821219e-06,
+ "loss": 0.5512,
+ "step": 3508
+ },
+ {
+ "epoch": 1.6591016548463355,
+ "grad_norm": 2.7307450771331787,
+ "learning_rate": 4.148735811464951e-06,
+ "loss": 0.4968,
+ "step": 3509
+ },
+ {
+ "epoch": 1.6595744680851063,
+ "grad_norm": 3.0097429752349854,
+ "learning_rate": 4.1482668234491335e-06,
+ "loss": 0.4797,
+ "step": 3510
+ },
+ {
+ "epoch": 1.6600472813238771,
+ "grad_norm": 2.6045308113098145,
+ "learning_rate": 4.147797732802969e-06,
+ "loss": 0.5496,
+ "step": 3511
+ },
+ {
+ "epoch": 1.6605200945626477,
+ "grad_norm": 2.702061176300049,
+ "learning_rate": 4.147328539555664e-06,
+ "loss": 0.5302,
+ "step": 3512
+ },
+ {
+ "epoch": 1.6609929078014183,
+ "grad_norm": 3.3724892139434814,
+ "learning_rate": 4.1468592437364356e-06,
+ "loss": 0.5124,
+ "step": 3513
+ },
+ {
+ "epoch": 1.661465721040189,
+ "grad_norm": 2.5117242336273193,
+ "learning_rate": 4.146389845374502e-06,
+ "loss": 0.4953,
+ "step": 3514
+ },
+ {
+ "epoch": 1.6619385342789599,
+ "grad_norm": 2.86547589302063,
+ "learning_rate": 4.145920344499092e-06,
+ "loss": 0.5337,
+ "step": 3515
+ },
+ {
+ "epoch": 1.6624113475177305,
+ "grad_norm": 2.745149850845337,
+ "learning_rate": 4.14545074113944e-06,
+ "loss": 0.5187,
+ "step": 3516
+ },
+ {
+ "epoch": 1.662884160756501,
+ "grad_norm": 2.5560994148254395,
+ "learning_rate": 4.1449810353247855e-06,
+ "loss": 0.5183,
+ "step": 3517
+ },
+ {
+ "epoch": 1.6633569739952718,
+ "grad_norm": 2.2318122386932373,
+ "learning_rate": 4.144511227084374e-06,
+ "loss": 0.4452,
+ "step": 3518
+ },
+ {
+ "epoch": 1.6638297872340426,
+ "grad_norm": 2.6980903148651123,
+ "learning_rate": 4.14404131644746e-06,
+ "loss": 0.4974,
+ "step": 3519
+ },
+ {
+ "epoch": 1.6643026004728132,
+ "grad_norm": 2.6875357627868652,
+ "learning_rate": 4.1435713034433025e-06,
+ "loss": 0.4582,
+ "step": 3520
+ },
+ {
+ "epoch": 1.6647754137115838,
+ "grad_norm": 2.9430019855499268,
+ "learning_rate": 4.143101188101166e-06,
+ "loss": 0.5004,
+ "step": 3521
+ },
+ {
+ "epoch": 1.6652482269503546,
+ "grad_norm": 2.4447221755981445,
+ "learning_rate": 4.142630970450323e-06,
+ "loss": 0.5436,
+ "step": 3522
+ },
+ {
+ "epoch": 1.6657210401891254,
+ "grad_norm": 2.571023941040039,
+ "learning_rate": 4.142160650520053e-06,
+ "loss": 0.5307,
+ "step": 3523
+ },
+ {
+ "epoch": 1.666193853427896,
+ "grad_norm": 2.9725306034088135,
+ "learning_rate": 4.14169022833964e-06,
+ "loss": 0.5918,
+ "step": 3524
+ },
+ {
+ "epoch": 1.6666666666666665,
+ "grad_norm": 2.5958926677703857,
+ "learning_rate": 4.141219703938375e-06,
+ "loss": 0.5036,
+ "step": 3525
+ },
+ {
+ "epoch": 1.6671394799054373,
+ "grad_norm": 2.935788631439209,
+ "learning_rate": 4.140749077345556e-06,
+ "loss": 0.5773,
+ "step": 3526
+ },
+ {
+ "epoch": 1.6676122931442081,
+ "grad_norm": 2.5460526943206787,
+ "learning_rate": 4.140278348590485e-06,
+ "loss": 0.4762,
+ "step": 3527
+ },
+ {
+ "epoch": 1.6680851063829787,
+ "grad_norm": 2.5729143619537354,
+ "learning_rate": 4.139807517702475e-06,
+ "loss": 0.5515,
+ "step": 3528
+ },
+ {
+ "epoch": 1.6685579196217493,
+ "grad_norm": 2.4377381801605225,
+ "learning_rate": 4.13933658471084e-06,
+ "loss": 0.5383,
+ "step": 3529
+ },
+ {
+ "epoch": 1.66903073286052,
+ "grad_norm": 2.6284425258636475,
+ "learning_rate": 4.138865549644905e-06,
+ "loss": 0.5396,
+ "step": 3530
+ },
+ {
+ "epoch": 1.6695035460992909,
+ "grad_norm": 2.857250928878784,
+ "learning_rate": 4.138394412533998e-06,
+ "loss": 0.5861,
+ "step": 3531
+ },
+ {
+ "epoch": 1.6699763593380614,
+ "grad_norm": 2.9226012229919434,
+ "learning_rate": 4.137923173407456e-06,
+ "loss": 0.5262,
+ "step": 3532
+ },
+ {
+ "epoch": 1.670449172576832,
+ "grad_norm": 4.839131832122803,
+ "learning_rate": 4.137451832294619e-06,
+ "loss": 0.651,
+ "step": 3533
+ },
+ {
+ "epoch": 1.6709219858156028,
+ "grad_norm": 2.4727771282196045,
+ "learning_rate": 4.1369803892248375e-06,
+ "loss": 0.5149,
+ "step": 3534
+ },
+ {
+ "epoch": 1.6713947990543736,
+ "grad_norm": 2.5391688346862793,
+ "learning_rate": 4.1365088442274635e-06,
+ "loss": 0.4907,
+ "step": 3535
+ },
+ {
+ "epoch": 1.6718676122931442,
+ "grad_norm": 2.5168209075927734,
+ "learning_rate": 4.136037197331862e-06,
+ "loss": 0.5091,
+ "step": 3536
+ },
+ {
+ "epoch": 1.6723404255319148,
+ "grad_norm": 2.6278600692749023,
+ "learning_rate": 4.135565448567396e-06,
+ "loss": 0.4357,
+ "step": 3537
+ },
+ {
+ "epoch": 1.6728132387706856,
+ "grad_norm": 2.835184097290039,
+ "learning_rate": 4.135093597963441e-06,
+ "loss": 0.4786,
+ "step": 3538
+ },
+ {
+ "epoch": 1.6732860520094563,
+ "grad_norm": 2.385328531265259,
+ "learning_rate": 4.134621645549379e-06,
+ "loss": 0.4849,
+ "step": 3539
+ },
+ {
+ "epoch": 1.673758865248227,
+ "grad_norm": 2.6504149436950684,
+ "learning_rate": 4.134149591354593e-06,
+ "loss": 0.6037,
+ "step": 3540
+ },
+ {
+ "epoch": 1.6742316784869975,
+ "grad_norm": 2.945634126663208,
+ "learning_rate": 4.1336774354084786e-06,
+ "loss": 0.532,
+ "step": 3541
+ },
+ {
+ "epoch": 1.6747044917257683,
+ "grad_norm": 2.8373215198516846,
+ "learning_rate": 4.133205177740434e-06,
+ "loss": 0.5138,
+ "step": 3542
+ },
+ {
+ "epoch": 1.675177304964539,
+ "grad_norm": 2.6616621017456055,
+ "learning_rate": 4.1327328183798634e-06,
+ "loss": 0.5543,
+ "step": 3543
+ },
+ {
+ "epoch": 1.6756501182033097,
+ "grad_norm": 3.0843071937561035,
+ "learning_rate": 4.13226035735618e-06,
+ "loss": 0.6585,
+ "step": 3544
+ },
+ {
+ "epoch": 1.6761229314420802,
+ "grad_norm": 2.2214272022247314,
+ "learning_rate": 4.131787794698802e-06,
+ "loss": 0.5413,
+ "step": 3545
+ },
+ {
+ "epoch": 1.676595744680851,
+ "grad_norm": 2.4515018463134766,
+ "learning_rate": 4.131315130437152e-06,
+ "loss": 0.4966,
+ "step": 3546
+ },
+ {
+ "epoch": 1.6770685579196218,
+ "grad_norm": 2.647414207458496,
+ "learning_rate": 4.130842364600663e-06,
+ "loss": 0.5401,
+ "step": 3547
+ },
+ {
+ "epoch": 1.6775413711583924,
+ "grad_norm": 2.648941993713379,
+ "learning_rate": 4.13036949721877e-06,
+ "loss": 0.4796,
+ "step": 3548
+ },
+ {
+ "epoch": 1.678014184397163,
+ "grad_norm": 2.7835679054260254,
+ "learning_rate": 4.129896528320919e-06,
+ "loss": 0.5653,
+ "step": 3549
+ },
+ {
+ "epoch": 1.6784869976359338,
+ "grad_norm": 2.995964288711548,
+ "learning_rate": 4.129423457936556e-06,
+ "loss": 0.4999,
+ "step": 3550
+ },
+ {
+ "epoch": 1.6789598108747046,
+ "grad_norm": 2.5980007648468018,
+ "learning_rate": 4.1289502860951405e-06,
+ "loss": 0.5177,
+ "step": 3551
+ },
+ {
+ "epoch": 1.6794326241134752,
+ "grad_norm": 2.442254066467285,
+ "learning_rate": 4.128477012826133e-06,
+ "loss": 0.5062,
+ "step": 3552
+ },
+ {
+ "epoch": 1.6799054373522457,
+ "grad_norm": 2.3007538318634033,
+ "learning_rate": 4.1280036381590025e-06,
+ "loss": 0.5029,
+ "step": 3553
+ },
+ {
+ "epoch": 1.6803782505910165,
+ "grad_norm": 2.4169347286224365,
+ "learning_rate": 4.1275301621232245e-06,
+ "loss": 0.515,
+ "step": 3554
+ },
+ {
+ "epoch": 1.6808510638297873,
+ "grad_norm": 2.6456379890441895,
+ "learning_rate": 4.127056584748279e-06,
+ "loss": 0.5343,
+ "step": 3555
+ },
+ {
+ "epoch": 1.681323877068558,
+ "grad_norm": 2.6406595706939697,
+ "learning_rate": 4.1265829060636546e-06,
+ "loss": 0.5047,
+ "step": 3556
+ },
+ {
+ "epoch": 1.6817966903073285,
+ "grad_norm": 2.9344475269317627,
+ "learning_rate": 4.126109126098846e-06,
+ "loss": 0.5501,
+ "step": 3557
+ },
+ {
+ "epoch": 1.6822695035460993,
+ "grad_norm": 2.3292455673217773,
+ "learning_rate": 4.125635244883351e-06,
+ "loss": 0.463,
+ "step": 3558
+ },
+ {
+ "epoch": 1.68274231678487,
+ "grad_norm": 2.4150657653808594,
+ "learning_rate": 4.125161262446677e-06,
+ "loss": 0.4802,
+ "step": 3559
+ },
+ {
+ "epoch": 1.6832151300236406,
+ "grad_norm": 2.604292392730713,
+ "learning_rate": 4.124687178818339e-06,
+ "loss": 0.5683,
+ "step": 3560
+ },
+ {
+ "epoch": 1.6836879432624112,
+ "grad_norm": 2.5676791667938232,
+ "learning_rate": 4.1242129940278544e-06,
+ "loss": 0.5519,
+ "step": 3561
+ },
+ {
+ "epoch": 1.684160756501182,
+ "grad_norm": 3.078514814376831,
+ "learning_rate": 4.123738708104748e-06,
+ "loss": 0.5194,
+ "step": 3562
+ },
+ {
+ "epoch": 1.6846335697399528,
+ "grad_norm": 2.893577814102173,
+ "learning_rate": 4.123264321078552e-06,
+ "loss": 0.5107,
+ "step": 3563
+ },
+ {
+ "epoch": 1.6851063829787234,
+ "grad_norm": 2.772413730621338,
+ "learning_rate": 4.122789832978804e-06,
+ "loss": 0.6147,
+ "step": 3564
+ },
+ {
+ "epoch": 1.685579196217494,
+ "grad_norm": 2.5804643630981445,
+ "learning_rate": 4.12231524383505e-06,
+ "loss": 0.5057,
+ "step": 3565
+ },
+ {
+ "epoch": 1.6860520094562648,
+ "grad_norm": 2.599571466445923,
+ "learning_rate": 4.121840553676839e-06,
+ "loss": 0.5591,
+ "step": 3566
+ },
+ {
+ "epoch": 1.6865248226950356,
+ "grad_norm": 2.9124577045440674,
+ "learning_rate": 4.1213657625337275e-06,
+ "loss": 0.565,
+ "step": 3567
+ },
+ {
+ "epoch": 1.6869976359338061,
+ "grad_norm": 2.6582155227661133,
+ "learning_rate": 4.120890870435281e-06,
+ "loss": 0.4607,
+ "step": 3568
+ },
+ {
+ "epoch": 1.6874704491725767,
+ "grad_norm": 2.929227590560913,
+ "learning_rate": 4.120415877411066e-06,
+ "loss": 0.5705,
+ "step": 3569
+ },
+ {
+ "epoch": 1.6879432624113475,
+ "grad_norm": 2.4443247318267822,
+ "learning_rate": 4.11994078349066e-06,
+ "loss": 0.4592,
+ "step": 3570
+ },
+ {
+ "epoch": 1.6884160756501183,
+ "grad_norm": 2.4799163341522217,
+ "learning_rate": 4.119465588703645e-06,
+ "loss": 0.5361,
+ "step": 3571
+ },
+ {
+ "epoch": 1.6888888888888889,
+ "grad_norm": 2.9408936500549316,
+ "learning_rate": 4.1189902930796085e-06,
+ "loss": 0.5347,
+ "step": 3572
+ },
+ {
+ "epoch": 1.6893617021276595,
+ "grad_norm": 3.3348076343536377,
+ "learning_rate": 4.118514896648146e-06,
+ "loss": 0.5612,
+ "step": 3573
+ },
+ {
+ "epoch": 1.6898345153664303,
+ "grad_norm": 2.764889717102051,
+ "learning_rate": 4.118039399438857e-06,
+ "loss": 0.4745,
+ "step": 3574
+ },
+ {
+ "epoch": 1.690307328605201,
+ "grad_norm": 2.7023751735687256,
+ "learning_rate": 4.11756380148135e-06,
+ "loss": 0.5106,
+ "step": 3575
+ },
+ {
+ "epoch": 1.6907801418439716,
+ "grad_norm": 2.8816208839416504,
+ "learning_rate": 4.117088102805238e-06,
+ "loss": 0.6016,
+ "step": 3576
+ },
+ {
+ "epoch": 1.6912529550827422,
+ "grad_norm": 2.215733289718628,
+ "learning_rate": 4.11661230344014e-06,
+ "loss": 0.4404,
+ "step": 3577
+ },
+ {
+ "epoch": 1.691725768321513,
+ "grad_norm": 2.8190999031066895,
+ "learning_rate": 4.116136403415683e-06,
+ "loss": 0.5038,
+ "step": 3578
+ },
+ {
+ "epoch": 1.6921985815602838,
+ "grad_norm": 2.616424083709717,
+ "learning_rate": 4.115660402761499e-06,
+ "loss": 0.5493,
+ "step": 3579
+ },
+ {
+ "epoch": 1.6926713947990544,
+ "grad_norm": 2.7738113403320312,
+ "learning_rate": 4.115184301507226e-06,
+ "loss": 0.5416,
+ "step": 3580
+ },
+ {
+ "epoch": 1.693144208037825,
+ "grad_norm": 2.4793593883514404,
+ "learning_rate": 4.114708099682509e-06,
+ "loss": 0.4526,
+ "step": 3581
+ },
+ {
+ "epoch": 1.6936170212765957,
+ "grad_norm": 2.390652894973755,
+ "learning_rate": 4.114231797316999e-06,
+ "loss": 0.4908,
+ "step": 3582
+ },
+ {
+ "epoch": 1.6940898345153665,
+ "grad_norm": 2.513197660446167,
+ "learning_rate": 4.113755394440352e-06,
+ "loss": 0.4738,
+ "step": 3583
+ },
+ {
+ "epoch": 1.6945626477541371,
+ "grad_norm": 2.504497766494751,
+ "learning_rate": 4.113278891082234e-06,
+ "loss": 0.4661,
+ "step": 3584
+ },
+ {
+ "epoch": 1.6950354609929077,
+ "grad_norm": 2.4966917037963867,
+ "learning_rate": 4.112802287272314e-06,
+ "loss": 0.4979,
+ "step": 3585
+ },
+ {
+ "epoch": 1.6955082742316785,
+ "grad_norm": 2.3129689693450928,
+ "learning_rate": 4.112325583040265e-06,
+ "loss": 0.4933,
+ "step": 3586
+ },
+ {
+ "epoch": 1.6959810874704493,
+ "grad_norm": 2.822136878967285,
+ "learning_rate": 4.111848778415774e-06,
+ "loss": 0.5087,
+ "step": 3587
+ },
+ {
+ "epoch": 1.6964539007092199,
+ "grad_norm": 2.5181210041046143,
+ "learning_rate": 4.111371873428527e-06,
+ "loss": 0.4836,
+ "step": 3588
+ },
+ {
+ "epoch": 1.6969267139479904,
+ "grad_norm": 2.7564687728881836,
+ "learning_rate": 4.110894868108218e-06,
+ "loss": 0.5224,
+ "step": 3589
+ },
+ {
+ "epoch": 1.6973995271867612,
+ "grad_norm": 2.424421787261963,
+ "learning_rate": 4.11041776248455e-06,
+ "loss": 0.4552,
+ "step": 3590
+ },
+ {
+ "epoch": 1.697872340425532,
+ "grad_norm": 2.7013823986053467,
+ "learning_rate": 4.10994055658723e-06,
+ "loss": 0.5535,
+ "step": 3591
+ },
+ {
+ "epoch": 1.6983451536643026,
+ "grad_norm": 2.5660946369171143,
+ "learning_rate": 4.10946325044597e-06,
+ "loss": 0.5351,
+ "step": 3592
+ },
+ {
+ "epoch": 1.6988179669030732,
+ "grad_norm": 2.5598108768463135,
+ "learning_rate": 4.10898584409049e-06,
+ "loss": 0.5246,
+ "step": 3593
+ },
+ {
+ "epoch": 1.699290780141844,
+ "grad_norm": 2.6318907737731934,
+ "learning_rate": 4.108508337550518e-06,
+ "loss": 0.5002,
+ "step": 3594
+ },
+ {
+ "epoch": 1.6997635933806148,
+ "grad_norm": 2.527099132537842,
+ "learning_rate": 4.108030730855784e-06,
+ "loss": 0.5366,
+ "step": 3595
+ },
+ {
+ "epoch": 1.7002364066193854,
+ "grad_norm": 2.8629603385925293,
+ "learning_rate": 4.107553024036029e-06,
+ "loss": 0.5742,
+ "step": 3596
+ },
+ {
+ "epoch": 1.700709219858156,
+ "grad_norm": 2.8084018230438232,
+ "learning_rate": 4.107075217120994e-06,
+ "loss": 0.5618,
+ "step": 3597
+ },
+ {
+ "epoch": 1.7011820330969267,
+ "grad_norm": 3.6470065116882324,
+ "learning_rate": 4.1065973101404325e-06,
+ "loss": 0.508,
+ "step": 3598
+ },
+ {
+ "epoch": 1.7016548463356975,
+ "grad_norm": 3.0332422256469727,
+ "learning_rate": 4.106119303124102e-06,
+ "loss": 0.51,
+ "step": 3599
+ },
+ {
+ "epoch": 1.702127659574468,
+ "grad_norm": 2.4887590408325195,
+ "learning_rate": 4.105641196101765e-06,
+ "loss": 0.5109,
+ "step": 3600
+ },
+ {
+ "epoch": 1.7026004728132387,
+ "grad_norm": 2.6102066040039062,
+ "learning_rate": 4.105162989103191e-06,
+ "loss": 0.5278,
+ "step": 3601
+ },
+ {
+ "epoch": 1.7030732860520095,
+ "grad_norm": 2.771578073501587,
+ "learning_rate": 4.104684682158156e-06,
+ "loss": 0.498,
+ "step": 3602
+ },
+ {
+ "epoch": 1.7035460992907803,
+ "grad_norm": 2.5452702045440674,
+ "learning_rate": 4.1042062752964425e-06,
+ "loss": 0.4939,
+ "step": 3603
+ },
+ {
+ "epoch": 1.7040189125295508,
+ "grad_norm": 2.4287021160125732,
+ "learning_rate": 4.103727768547838e-06,
+ "loss": 0.4819,
+ "step": 3604
+ },
+ {
+ "epoch": 1.7044917257683214,
+ "grad_norm": 2.412280321121216,
+ "learning_rate": 4.103249161942138e-06,
+ "loss": 0.5196,
+ "step": 3605
+ },
+ {
+ "epoch": 1.7049645390070922,
+ "grad_norm": 2.8850717544555664,
+ "learning_rate": 4.102770455509142e-06,
+ "loss": 0.5724,
+ "step": 3606
+ },
+ {
+ "epoch": 1.705437352245863,
+ "grad_norm": 2.7979609966278076,
+ "learning_rate": 4.102291649278659e-06,
+ "loss": 0.5295,
+ "step": 3607
+ },
+ {
+ "epoch": 1.7059101654846336,
+ "grad_norm": 2.762238025665283,
+ "learning_rate": 4.1018127432805e-06,
+ "loss": 0.5166,
+ "step": 3608
+ },
+ {
+ "epoch": 1.7063829787234042,
+ "grad_norm": 2.921586513519287,
+ "learning_rate": 4.101333737544485e-06,
+ "loss": 0.5607,
+ "step": 3609
+ },
+ {
+ "epoch": 1.706855791962175,
+ "grad_norm": 3.001929998397827,
+ "learning_rate": 4.100854632100439e-06,
+ "loss": 0.6255,
+ "step": 3610
+ },
+ {
+ "epoch": 1.7073286052009458,
+ "grad_norm": 2.752713918685913,
+ "learning_rate": 4.100375426978196e-06,
+ "loss": 0.5732,
+ "step": 3611
+ },
+ {
+ "epoch": 1.7078014184397163,
+ "grad_norm": 2.6496472358703613,
+ "learning_rate": 4.099896122207593e-06,
+ "loss": 0.5138,
+ "step": 3612
+ },
+ {
+ "epoch": 1.708274231678487,
+ "grad_norm": 3.0079452991485596,
+ "learning_rate": 4.099416717818473e-06,
+ "loss": 0.5746,
+ "step": 3613
+ },
+ {
+ "epoch": 1.7087470449172577,
+ "grad_norm": 2.5762360095977783,
+ "learning_rate": 4.098937213840687e-06,
+ "loss": 0.5308,
+ "step": 3614
+ },
+ {
+ "epoch": 1.7092198581560285,
+ "grad_norm": 2.6026158332824707,
+ "learning_rate": 4.098457610304092e-06,
+ "loss": 0.4857,
+ "step": 3615
+ },
+ {
+ "epoch": 1.709692671394799,
+ "grad_norm": 2.587583541870117,
+ "learning_rate": 4.097977907238551e-06,
+ "loss": 0.4591,
+ "step": 3616
+ },
+ {
+ "epoch": 1.7101654846335697,
+ "grad_norm": 2.6996991634368896,
+ "learning_rate": 4.097498104673932e-06,
+ "loss": 0.5298,
+ "step": 3617
+ },
+ {
+ "epoch": 1.7106382978723405,
+ "grad_norm": 2.600029945373535,
+ "learning_rate": 4.097018202640111e-06,
+ "loss": 0.4726,
+ "step": 3618
+ },
+ {
+ "epoch": 1.7111111111111112,
+ "grad_norm": 2.8261220455169678,
+ "learning_rate": 4.096538201166969e-06,
+ "loss": 0.5242,
+ "step": 3619
+ },
+ {
+ "epoch": 1.7115839243498818,
+ "grad_norm": 3.053027629852295,
+ "learning_rate": 4.096058100284394e-06,
+ "loss": 0.5568,
+ "step": 3620
+ },
+ {
+ "epoch": 1.7120567375886524,
+ "grad_norm": 2.9638442993164062,
+ "learning_rate": 4.0955779000222805e-06,
+ "loss": 0.5325,
+ "step": 3621
+ },
+ {
+ "epoch": 1.7125295508274232,
+ "grad_norm": 2.731095790863037,
+ "learning_rate": 4.095097600410527e-06,
+ "loss": 0.4733,
+ "step": 3622
+ },
+ {
+ "epoch": 1.713002364066194,
+ "grad_norm": 2.632490873336792,
+ "learning_rate": 4.09461720147904e-06,
+ "loss": 0.5253,
+ "step": 3623
+ },
+ {
+ "epoch": 1.7134751773049646,
+ "grad_norm": 2.847689390182495,
+ "learning_rate": 4.094136703257732e-06,
+ "loss": 0.57,
+ "step": 3624
+ },
+ {
+ "epoch": 1.7139479905437351,
+ "grad_norm": 3.1078696250915527,
+ "learning_rate": 4.0936561057765215e-06,
+ "loss": 0.5368,
+ "step": 3625
+ },
+ {
+ "epoch": 1.714420803782506,
+ "grad_norm": 2.696349620819092,
+ "learning_rate": 4.0931754090653334e-06,
+ "loss": 0.491,
+ "step": 3626
+ },
+ {
+ "epoch": 1.7148936170212767,
+ "grad_norm": 2.712958812713623,
+ "learning_rate": 4.092694613154099e-06,
+ "loss": 0.5768,
+ "step": 3627
+ },
+ {
+ "epoch": 1.7153664302600473,
+ "grad_norm": 2.5421478748321533,
+ "learning_rate": 4.092213718072754e-06,
+ "loss": 0.4839,
+ "step": 3628
+ },
+ {
+ "epoch": 1.715839243498818,
+ "grad_norm": 2.5176162719726562,
+ "learning_rate": 4.091732723851243e-06,
+ "loss": 0.5049,
+ "step": 3629
+ },
+ {
+ "epoch": 1.7163120567375887,
+ "grad_norm": 2.642185926437378,
+ "learning_rate": 4.091251630519514e-06,
+ "loss": 0.589,
+ "step": 3630
+ },
+ {
+ "epoch": 1.7167848699763595,
+ "grad_norm": 2.587348461151123,
+ "learning_rate": 4.0907704381075245e-06,
+ "loss": 0.5281,
+ "step": 3631
+ },
+ {
+ "epoch": 1.71725768321513,
+ "grad_norm": 2.4628195762634277,
+ "learning_rate": 4.090289146645234e-06,
+ "loss": 0.5592,
+ "step": 3632
+ },
+ {
+ "epoch": 1.7177304964539006,
+ "grad_norm": 2.2751028537750244,
+ "learning_rate": 4.0898077561626125e-06,
+ "loss": 0.502,
+ "step": 3633
+ },
+ {
+ "epoch": 1.7182033096926714,
+ "grad_norm": 2.7712769508361816,
+ "learning_rate": 4.089326266689632e-06,
+ "loss": 0.5143,
+ "step": 3634
+ },
+ {
+ "epoch": 1.7186761229314422,
+ "grad_norm": 2.5297727584838867,
+ "learning_rate": 4.088844678256275e-06,
+ "loss": 0.5035,
+ "step": 3635
+ },
+ {
+ "epoch": 1.7191489361702128,
+ "grad_norm": 2.739130735397339,
+ "learning_rate": 4.088362990892527e-06,
+ "loss": 0.5959,
+ "step": 3636
+ },
+ {
+ "epoch": 1.7196217494089834,
+ "grad_norm": 2.3708314895629883,
+ "learning_rate": 4.08788120462838e-06,
+ "loss": 0.4796,
+ "step": 3637
+ },
+ {
+ "epoch": 1.7200945626477542,
+ "grad_norm": 2.7664241790771484,
+ "learning_rate": 4.087399319493832e-06,
+ "loss": 0.6052,
+ "step": 3638
+ },
+ {
+ "epoch": 1.720567375886525,
+ "grad_norm": 2.5900204181671143,
+ "learning_rate": 4.0869173355188895e-06,
+ "loss": 0.4955,
+ "step": 3639
+ },
+ {
+ "epoch": 1.7210401891252955,
+ "grad_norm": 2.6771862506866455,
+ "learning_rate": 4.0864352527335635e-06,
+ "loss": 0.4889,
+ "step": 3640
+ },
+ {
+ "epoch": 1.7215130023640661,
+ "grad_norm": 2.888479471206665,
+ "learning_rate": 4.085953071167871e-06,
+ "loss": 0.5719,
+ "step": 3641
+ },
+ {
+ "epoch": 1.721985815602837,
+ "grad_norm": 2.5967187881469727,
+ "learning_rate": 4.085470790851833e-06,
+ "loss": 0.4959,
+ "step": 3642
+ },
+ {
+ "epoch": 1.7224586288416077,
+ "grad_norm": 2.5317695140838623,
+ "learning_rate": 4.084988411815483e-06,
+ "loss": 0.4596,
+ "step": 3643
+ },
+ {
+ "epoch": 1.7229314420803783,
+ "grad_norm": 2.6531455516815186,
+ "learning_rate": 4.084505934088853e-06,
+ "loss": 0.5346,
+ "step": 3644
+ },
+ {
+ "epoch": 1.7234042553191489,
+ "grad_norm": 2.6525208950042725,
+ "learning_rate": 4.084023357701987e-06,
+ "loss": 0.5178,
+ "step": 3645
+ },
+ {
+ "epoch": 1.7238770685579197,
+ "grad_norm": 2.461954116821289,
+ "learning_rate": 4.083540682684932e-06,
+ "loss": 0.4802,
+ "step": 3646
+ },
+ {
+ "epoch": 1.7243498817966905,
+ "grad_norm": 2.794696807861328,
+ "learning_rate": 4.083057909067743e-06,
+ "loss": 0.5148,
+ "step": 3647
+ },
+ {
+ "epoch": 1.724822695035461,
+ "grad_norm": 2.867572546005249,
+ "learning_rate": 4.082575036880479e-06,
+ "loss": 0.5352,
+ "step": 3648
+ },
+ {
+ "epoch": 1.7252955082742316,
+ "grad_norm": 2.642820358276367,
+ "learning_rate": 4.082092066153207e-06,
+ "loss": 0.4652,
+ "step": 3649
+ },
+ {
+ "epoch": 1.7257683215130024,
+ "grad_norm": 2.782142400741577,
+ "learning_rate": 4.081608996915999e-06,
+ "loss": 0.5591,
+ "step": 3650
+ },
+ {
+ "epoch": 1.7262411347517732,
+ "grad_norm": 2.327331304550171,
+ "learning_rate": 4.081125829198934e-06,
+ "loss": 0.4339,
+ "step": 3651
+ },
+ {
+ "epoch": 1.7267139479905438,
+ "grad_norm": 2.7959988117218018,
+ "learning_rate": 4.0806425630320965e-06,
+ "loss": 0.5783,
+ "step": 3652
+ },
+ {
+ "epoch": 1.7271867612293144,
+ "grad_norm": 2.595053195953369,
+ "learning_rate": 4.080159198445578e-06,
+ "loss": 0.4602,
+ "step": 3653
+ },
+ {
+ "epoch": 1.7276595744680852,
+ "grad_norm": 3.0968129634857178,
+ "learning_rate": 4.079675735469475e-06,
+ "loss": 0.5775,
+ "step": 3654
+ },
+ {
+ "epoch": 1.728132387706856,
+ "grad_norm": 2.628044605255127,
+ "learning_rate": 4.07919217413389e-06,
+ "loss": 0.486,
+ "step": 3655
+ },
+ {
+ "epoch": 1.7286052009456265,
+ "grad_norm": 2.782799005508423,
+ "learning_rate": 4.078708514468933e-06,
+ "loss": 0.5282,
+ "step": 3656
+ },
+ {
+ "epoch": 1.729078014184397,
+ "grad_norm": 2.655365467071533,
+ "learning_rate": 4.0782247565047205e-06,
+ "loss": 0.4873,
+ "step": 3657
+ },
+ {
+ "epoch": 1.729550827423168,
+ "grad_norm": 2.9461584091186523,
+ "learning_rate": 4.077740900271371e-06,
+ "loss": 0.548,
+ "step": 3658
+ },
+ {
+ "epoch": 1.7300236406619387,
+ "grad_norm": 2.5094761848449707,
+ "learning_rate": 4.077256945799015e-06,
+ "loss": 0.5437,
+ "step": 3659
+ },
+ {
+ "epoch": 1.7304964539007093,
+ "grad_norm": 2.555793285369873,
+ "learning_rate": 4.0767728931177845e-06,
+ "loss": 0.5268,
+ "step": 3660
+ },
+ {
+ "epoch": 1.7309692671394799,
+ "grad_norm": 2.4433486461639404,
+ "learning_rate": 4.07628874225782e-06,
+ "loss": 0.5211,
+ "step": 3661
+ },
+ {
+ "epoch": 1.7314420803782506,
+ "grad_norm": 2.365206003189087,
+ "learning_rate": 4.075804493249267e-06,
+ "loss": 0.5084,
+ "step": 3662
+ },
+ {
+ "epoch": 1.7319148936170212,
+ "grad_norm": 2.514305830001831,
+ "learning_rate": 4.075320146122278e-06,
+ "loss": 0.4693,
+ "step": 3663
+ },
+ {
+ "epoch": 1.7323877068557918,
+ "grad_norm": 2.9270083904266357,
+ "learning_rate": 4.074835700907012e-06,
+ "loss": 0.5724,
+ "step": 3664
+ },
+ {
+ "epoch": 1.7328605200945626,
+ "grad_norm": 2.938692569732666,
+ "learning_rate": 4.0743511576336315e-06,
+ "loss": 0.5361,
+ "step": 3665
+ },
+ {
+ "epoch": 1.7333333333333334,
+ "grad_norm": 3.1978867053985596,
+ "learning_rate": 4.073866516332307e-06,
+ "loss": 0.6277,
+ "step": 3666
+ },
+ {
+ "epoch": 1.733806146572104,
+ "grad_norm": 2.3477370738983154,
+ "learning_rate": 4.073381777033217e-06,
+ "loss": 0.5139,
+ "step": 3667
+ },
+ {
+ "epoch": 1.7342789598108745,
+ "grad_norm": 2.5954184532165527,
+ "learning_rate": 4.072896939766543e-06,
+ "loss": 0.537,
+ "step": 3668
+ },
+ {
+ "epoch": 1.7347517730496453,
+ "grad_norm": 2.8999998569488525,
+ "learning_rate": 4.072412004562472e-06,
+ "loss": 0.5486,
+ "step": 3669
+ },
+ {
+ "epoch": 1.7352245862884161,
+ "grad_norm": 2.7320556640625,
+ "learning_rate": 4.071926971451201e-06,
+ "loss": 0.6025,
+ "step": 3670
+ },
+ {
+ "epoch": 1.7356973995271867,
+ "grad_norm": 2.499234676361084,
+ "learning_rate": 4.0714418404629304e-06,
+ "loss": 0.456,
+ "step": 3671
+ },
+ {
+ "epoch": 1.7361702127659573,
+ "grad_norm": 2.485924243927002,
+ "learning_rate": 4.070956611627867e-06,
+ "loss": 0.5097,
+ "step": 3672
+ },
+ {
+ "epoch": 1.736643026004728,
+ "grad_norm": 2.513723373413086,
+ "learning_rate": 4.070471284976225e-06,
+ "loss": 0.4744,
+ "step": 3673
+ },
+ {
+ "epoch": 1.7371158392434989,
+ "grad_norm": 2.281977653503418,
+ "learning_rate": 4.06998586053822e-06,
+ "loss": 0.5124,
+ "step": 3674
+ },
+ {
+ "epoch": 1.7375886524822695,
+ "grad_norm": 2.3683905601501465,
+ "learning_rate": 4.069500338344081e-06,
+ "loss": 0.4816,
+ "step": 3675
+ },
+ {
+ "epoch": 1.73806146572104,
+ "grad_norm": 2.5635924339294434,
+ "learning_rate": 4.069014718424038e-06,
+ "loss": 0.5665,
+ "step": 3676
+ },
+ {
+ "epoch": 1.7385342789598108,
+ "grad_norm": 2.7308456897735596,
+ "learning_rate": 4.068529000808328e-06,
+ "loss": 0.534,
+ "step": 3677
+ },
+ {
+ "epoch": 1.7390070921985816,
+ "grad_norm": 2.788452625274658,
+ "learning_rate": 4.068043185527196e-06,
+ "loss": 0.5609,
+ "step": 3678
+ },
+ {
+ "epoch": 1.7394799054373522,
+ "grad_norm": 2.832368850708008,
+ "learning_rate": 4.067557272610889e-06,
+ "loss": 0.553,
+ "step": 3679
+ },
+ {
+ "epoch": 1.7399527186761228,
+ "grad_norm": 2.9987435340881348,
+ "learning_rate": 4.067071262089665e-06,
+ "loss": 0.5,
+ "step": 3680
+ },
+ {
+ "epoch": 1.7404255319148936,
+ "grad_norm": 3.04913067817688,
+ "learning_rate": 4.066585153993785e-06,
+ "loss": 0.5158,
+ "step": 3681
+ },
+ {
+ "epoch": 1.7408983451536644,
+ "grad_norm": 2.5177130699157715,
+ "learning_rate": 4.066098948353516e-06,
+ "loss": 0.4508,
+ "step": 3682
+ },
+ {
+ "epoch": 1.741371158392435,
+ "grad_norm": 2.8991222381591797,
+ "learning_rate": 4.065612645199133e-06,
+ "loss": 0.5268,
+ "step": 3683
+ },
+ {
+ "epoch": 1.7418439716312055,
+ "grad_norm": 2.4928159713745117,
+ "learning_rate": 4.0651262445609156e-06,
+ "loss": 0.5024,
+ "step": 3684
+ },
+ {
+ "epoch": 1.7423167848699763,
+ "grad_norm": 2.9737319946289062,
+ "learning_rate": 4.06463974646915e-06,
+ "loss": 0.5429,
+ "step": 3685
+ },
+ {
+ "epoch": 1.7427895981087471,
+ "grad_norm": 2.6485493183135986,
+ "learning_rate": 4.064153150954128e-06,
+ "loss": 0.5619,
+ "step": 3686
+ },
+ {
+ "epoch": 1.7432624113475177,
+ "grad_norm": 2.564861297607422,
+ "learning_rate": 4.063666458046148e-06,
+ "loss": 0.4878,
+ "step": 3687
+ },
+ {
+ "epoch": 1.7437352245862883,
+ "grad_norm": 2.6048383712768555,
+ "learning_rate": 4.063179667775514e-06,
+ "loss": 0.4836,
+ "step": 3688
+ },
+ {
+ "epoch": 1.744208037825059,
+ "grad_norm": 2.751638650894165,
+ "learning_rate": 4.062692780172536e-06,
+ "loss": 0.5558,
+ "step": 3689
+ },
+ {
+ "epoch": 1.7446808510638299,
+ "grad_norm": 3.3866634368896484,
+ "learning_rate": 4.062205795267531e-06,
+ "loss": 0.4825,
+ "step": 3690
+ },
+ {
+ "epoch": 1.7451536643026004,
+ "grad_norm": 3.0112249851226807,
+ "learning_rate": 4.061718713090822e-06,
+ "loss": 0.5732,
+ "step": 3691
+ },
+ {
+ "epoch": 1.745626477541371,
+ "grad_norm": 2.5889365673065186,
+ "learning_rate": 4.061231533672736e-06,
+ "loss": 0.483,
+ "step": 3692
+ },
+ {
+ "epoch": 1.7460992907801418,
+ "grad_norm": 2.624598979949951,
+ "learning_rate": 4.0607442570436085e-06,
+ "loss": 0.5706,
+ "step": 3693
+ },
+ {
+ "epoch": 1.7465721040189126,
+ "grad_norm": 2.9219250679016113,
+ "learning_rate": 4.060256883233779e-06,
+ "loss": 0.5153,
+ "step": 3694
+ },
+ {
+ "epoch": 1.7470449172576832,
+ "grad_norm": 3.2219252586364746,
+ "learning_rate": 4.059769412273595e-06,
+ "loss": 0.5184,
+ "step": 3695
+ },
+ {
+ "epoch": 1.7475177304964538,
+ "grad_norm": 2.890697956085205,
+ "learning_rate": 4.05928184419341e-06,
+ "loss": 0.5312,
+ "step": 3696
+ },
+ {
+ "epoch": 1.7479905437352246,
+ "grad_norm": 2.673809289932251,
+ "learning_rate": 4.0587941790235816e-06,
+ "loss": 0.4893,
+ "step": 3697
+ },
+ {
+ "epoch": 1.7484633569739954,
+ "grad_norm": 2.5339348316192627,
+ "learning_rate": 4.058306416794474e-06,
+ "loss": 0.5115,
+ "step": 3698
+ },
+ {
+ "epoch": 1.748936170212766,
+ "grad_norm": 2.6525840759277344,
+ "learning_rate": 4.05781855753646e-06,
+ "loss": 0.5256,
+ "step": 3699
+ },
+ {
+ "epoch": 1.7494089834515365,
+ "grad_norm": 2.7868754863739014,
+ "learning_rate": 4.057330601279914e-06,
+ "loss": 0.5227,
+ "step": 3700
+ },
+ {
+ "epoch": 1.7498817966903073,
+ "grad_norm": 3.1629884243011475,
+ "learning_rate": 4.056842548055221e-06,
+ "loss": 0.5617,
+ "step": 3701
+ },
+ {
+ "epoch": 1.750354609929078,
+ "grad_norm": 2.9350688457489014,
+ "learning_rate": 4.056354397892769e-06,
+ "loss": 0.4753,
+ "step": 3702
+ },
+ {
+ "epoch": 1.7508274231678487,
+ "grad_norm": 2.9688615798950195,
+ "learning_rate": 4.0558661508229525e-06,
+ "loss": 0.596,
+ "step": 3703
+ },
+ {
+ "epoch": 1.7513002364066192,
+ "grad_norm": 2.802205801010132,
+ "learning_rate": 4.055377806876174e-06,
+ "loss": 0.5793,
+ "step": 3704
+ },
+ {
+ "epoch": 1.75177304964539,
+ "grad_norm": 2.4933416843414307,
+ "learning_rate": 4.054889366082839e-06,
+ "loss": 0.4824,
+ "step": 3705
+ },
+ {
+ "epoch": 1.7522458628841608,
+ "grad_norm": 3.7904608249664307,
+ "learning_rate": 4.054400828473361e-06,
+ "loss": 0.5124,
+ "step": 3706
+ },
+ {
+ "epoch": 1.7527186761229314,
+ "grad_norm": 2.694838762283325,
+ "learning_rate": 4.053912194078159e-06,
+ "loss": 0.5604,
+ "step": 3707
+ },
+ {
+ "epoch": 1.753191489361702,
+ "grad_norm": 2.3721256256103516,
+ "learning_rate": 4.053423462927659e-06,
+ "loss": 0.4978,
+ "step": 3708
+ },
+ {
+ "epoch": 1.7536643026004728,
+ "grad_norm": 2.718512773513794,
+ "learning_rate": 4.052934635052292e-06,
+ "loss": 0.5029,
+ "step": 3709
+ },
+ {
+ "epoch": 1.7541371158392436,
+ "grad_norm": 3.061558246612549,
+ "learning_rate": 4.052445710482493e-06,
+ "loss": 0.4886,
+ "step": 3710
+ },
+ {
+ "epoch": 1.7546099290780142,
+ "grad_norm": 3.0490729808807373,
+ "learning_rate": 4.051956689248709e-06,
+ "loss": 0.5363,
+ "step": 3711
+ },
+ {
+ "epoch": 1.7550827423167847,
+ "grad_norm": 2.611661672592163,
+ "learning_rate": 4.051467571381385e-06,
+ "loss": 0.5397,
+ "step": 3712
+ },
+ {
+ "epoch": 1.7555555555555555,
+ "grad_norm": 2.7829177379608154,
+ "learning_rate": 4.050978356910979e-06,
+ "loss": 0.4973,
+ "step": 3713
+ },
+ {
+ "epoch": 1.7560283687943263,
+ "grad_norm": 2.6228256225585938,
+ "learning_rate": 4.0504890458679525e-06,
+ "loss": 0.4551,
+ "step": 3714
+ },
+ {
+ "epoch": 1.756501182033097,
+ "grad_norm": 2.6801326274871826,
+ "learning_rate": 4.049999638282771e-06,
+ "loss": 0.5581,
+ "step": 3715
+ },
+ {
+ "epoch": 1.7569739952718675,
+ "grad_norm": 2.4476819038391113,
+ "learning_rate": 4.049510134185908e-06,
+ "loss": 0.5226,
+ "step": 3716
+ },
+ {
+ "epoch": 1.7574468085106383,
+ "grad_norm": 2.5661075115203857,
+ "learning_rate": 4.049020533607844e-06,
+ "loss": 0.5163,
+ "step": 3717
+ },
+ {
+ "epoch": 1.757919621749409,
+ "grad_norm": 2.3923349380493164,
+ "learning_rate": 4.048530836579065e-06,
+ "loss": 0.5076,
+ "step": 3718
+ },
+ {
+ "epoch": 1.7583924349881797,
+ "grad_norm": 2.8204405307769775,
+ "learning_rate": 4.0480410431300585e-06,
+ "loss": 0.5883,
+ "step": 3719
+ },
+ {
+ "epoch": 1.7588652482269502,
+ "grad_norm": 2.323107957839966,
+ "learning_rate": 4.047551153291325e-06,
+ "loss": 0.5116,
+ "step": 3720
+ },
+ {
+ "epoch": 1.759338061465721,
+ "grad_norm": 2.8306009769439697,
+ "learning_rate": 4.047061167093368e-06,
+ "loss": 0.5094,
+ "step": 3721
+ },
+ {
+ "epoch": 1.7598108747044918,
+ "grad_norm": 2.568765640258789,
+ "learning_rate": 4.046571084566695e-06,
+ "loss": 0.4725,
+ "step": 3722
+ },
+ {
+ "epoch": 1.7602836879432624,
+ "grad_norm": 2.7212061882019043,
+ "learning_rate": 4.046080905741822e-06,
+ "loss": 0.4741,
+ "step": 3723
+ },
+ {
+ "epoch": 1.760756501182033,
+ "grad_norm": 2.802917003631592,
+ "learning_rate": 4.04559063064927e-06,
+ "loss": 0.5691,
+ "step": 3724
+ },
+ {
+ "epoch": 1.7612293144208038,
+ "grad_norm": 3.1044139862060547,
+ "learning_rate": 4.0451002593195675e-06,
+ "loss": 0.5472,
+ "step": 3725
+ },
+ {
+ "epoch": 1.7617021276595746,
+ "grad_norm": 2.5855562686920166,
+ "learning_rate": 4.044609791783246e-06,
+ "loss": 0.4852,
+ "step": 3726
+ },
+ {
+ "epoch": 1.7621749408983451,
+ "grad_norm": 2.6235129833221436,
+ "learning_rate": 4.0441192280708465e-06,
+ "loss": 0.5269,
+ "step": 3727
+ },
+ {
+ "epoch": 1.7626477541371157,
+ "grad_norm": 3.535630464553833,
+ "learning_rate": 4.043628568212914e-06,
+ "loss": 0.5266,
+ "step": 3728
+ },
+ {
+ "epoch": 1.7631205673758865,
+ "grad_norm": 2.7783355712890625,
+ "learning_rate": 4.043137812239998e-06,
+ "loss": 0.5609,
+ "step": 3729
+ },
+ {
+ "epoch": 1.7635933806146573,
+ "grad_norm": 2.9344944953918457,
+ "learning_rate": 4.042646960182657e-06,
+ "loss": 0.5056,
+ "step": 3730
+ },
+ {
+ "epoch": 1.7640661938534279,
+ "grad_norm": 2.6205739974975586,
+ "learning_rate": 4.042156012071453e-06,
+ "loss": 0.4914,
+ "step": 3731
+ },
+ {
+ "epoch": 1.7645390070921985,
+ "grad_norm": 2.8004493713378906,
+ "learning_rate": 4.041664967936958e-06,
+ "loss": 0.4901,
+ "step": 3732
+ },
+ {
+ "epoch": 1.7650118203309693,
+ "grad_norm": 2.944589138031006,
+ "learning_rate": 4.041173827809745e-06,
+ "loss": 0.5572,
+ "step": 3733
+ },
+ {
+ "epoch": 1.76548463356974,
+ "grad_norm": 2.5021605491638184,
+ "learning_rate": 4.040682591720397e-06,
+ "loss": 0.4637,
+ "step": 3734
+ },
+ {
+ "epoch": 1.7659574468085106,
+ "grad_norm": 2.448030948638916,
+ "learning_rate": 4.040191259699497e-06,
+ "loss": 0.4785,
+ "step": 3735
+ },
+ {
+ "epoch": 1.7664302600472812,
+ "grad_norm": 2.7171032428741455,
+ "learning_rate": 4.039699831777643e-06,
+ "loss": 0.4919,
+ "step": 3736
+ },
+ {
+ "epoch": 1.766903073286052,
+ "grad_norm": 2.453118324279785,
+ "learning_rate": 4.03920830798543e-06,
+ "loss": 0.4326,
+ "step": 3737
+ },
+ {
+ "epoch": 1.7673758865248228,
+ "grad_norm": 3.112877368927002,
+ "learning_rate": 4.038716688353466e-06,
+ "loss": 0.5375,
+ "step": 3738
+ },
+ {
+ "epoch": 1.7678486997635934,
+ "grad_norm": 2.742239236831665,
+ "learning_rate": 4.038224972912361e-06,
+ "loss": 0.5267,
+ "step": 3739
+ },
+ {
+ "epoch": 1.768321513002364,
+ "grad_norm": 2.544785737991333,
+ "learning_rate": 4.037733161692731e-06,
+ "loss": 0.5032,
+ "step": 3740
+ },
+ {
+ "epoch": 1.7687943262411348,
+ "grad_norm": 2.4639062881469727,
+ "learning_rate": 4.037241254725201e-06,
+ "loss": 0.5532,
+ "step": 3741
+ },
+ {
+ "epoch": 1.7692671394799055,
+ "grad_norm": 2.866290330886841,
+ "learning_rate": 4.036749252040398e-06,
+ "loss": 0.5503,
+ "step": 3742
+ },
+ {
+ "epoch": 1.7697399527186761,
+ "grad_norm": 2.3466262817382812,
+ "learning_rate": 4.0362571536689575e-06,
+ "loss": 0.5286,
+ "step": 3743
+ },
+ {
+ "epoch": 1.7702127659574467,
+ "grad_norm": 2.246464967727661,
+ "learning_rate": 4.03576495964152e-06,
+ "loss": 0.4656,
+ "step": 3744
+ },
+ {
+ "epoch": 1.7706855791962175,
+ "grad_norm": 2.667558431625366,
+ "learning_rate": 4.035272669988733e-06,
+ "loss": 0.5205,
+ "step": 3745
+ },
+ {
+ "epoch": 1.7711583924349883,
+ "grad_norm": 2.974666118621826,
+ "learning_rate": 4.034780284741249e-06,
+ "loss": 0.6007,
+ "step": 3746
+ },
+ {
+ "epoch": 1.7716312056737589,
+ "grad_norm": 2.7164433002471924,
+ "learning_rate": 4.034287803929726e-06,
+ "loss": 0.4913,
+ "step": 3747
+ },
+ {
+ "epoch": 1.7721040189125294,
+ "grad_norm": 2.5923962593078613,
+ "learning_rate": 4.033795227584829e-06,
+ "loss": 0.5275,
+ "step": 3748
+ },
+ {
+ "epoch": 1.7725768321513002,
+ "grad_norm": 2.606027126312256,
+ "learning_rate": 4.033302555737229e-06,
+ "loss": 0.4869,
+ "step": 3749
+ },
+ {
+ "epoch": 1.773049645390071,
+ "grad_norm": 3.0110089778900146,
+ "learning_rate": 4.032809788417602e-06,
+ "loss": 0.4956,
+ "step": 3750
+ },
+ {
+ "epoch": 1.7735224586288416,
+ "grad_norm": 3.004598617553711,
+ "learning_rate": 4.032316925656632e-06,
+ "loss": 0.5159,
+ "step": 3751
+ },
+ {
+ "epoch": 1.7739952718676122,
+ "grad_norm": 2.731539249420166,
+ "learning_rate": 4.031823967485005e-06,
+ "loss": 0.5237,
+ "step": 3752
+ },
+ {
+ "epoch": 1.774468085106383,
+ "grad_norm": 2.7466373443603516,
+ "learning_rate": 4.0313309139334155e-06,
+ "loss": 0.4948,
+ "step": 3753
+ },
+ {
+ "epoch": 1.7749408983451538,
+ "grad_norm": 2.8596460819244385,
+ "learning_rate": 4.030837765032565e-06,
+ "loss": 0.5016,
+ "step": 3754
+ },
+ {
+ "epoch": 1.7754137115839244,
+ "grad_norm": 3.2886788845062256,
+ "learning_rate": 4.03034452081316e-06,
+ "loss": 0.5377,
+ "step": 3755
+ },
+ {
+ "epoch": 1.775886524822695,
+ "grad_norm": 2.5629258155822754,
+ "learning_rate": 4.029851181305912e-06,
+ "loss": 0.519,
+ "step": 3756
+ },
+ {
+ "epoch": 1.7763593380614657,
+ "grad_norm": 2.5988714694976807,
+ "learning_rate": 4.029357746541539e-06,
+ "loss": 0.5521,
+ "step": 3757
+ },
+ {
+ "epoch": 1.7768321513002365,
+ "grad_norm": 2.987884759902954,
+ "learning_rate": 4.028864216550765e-06,
+ "loss": 0.6225,
+ "step": 3758
+ },
+ {
+ "epoch": 1.777304964539007,
+ "grad_norm": 2.6875851154327393,
+ "learning_rate": 4.02837059136432e-06,
+ "loss": 0.5321,
+ "step": 3759
+ },
+ {
+ "epoch": 1.7777777777777777,
+ "grad_norm": 2.6414570808410645,
+ "learning_rate": 4.02787687101294e-06,
+ "loss": 0.4831,
+ "step": 3760
+ },
+ {
+ "epoch": 1.7782505910165485,
+ "grad_norm": 2.581475019454956,
+ "learning_rate": 4.027383055527368e-06,
+ "loss": 0.5204,
+ "step": 3761
+ },
+ {
+ "epoch": 1.7787234042553193,
+ "grad_norm": 2.811298131942749,
+ "learning_rate": 4.026889144938349e-06,
+ "loss": 0.5486,
+ "step": 3762
+ },
+ {
+ "epoch": 1.7791962174940898,
+ "grad_norm": 3.1589081287384033,
+ "learning_rate": 4.026395139276639e-06,
+ "loss": 0.4979,
+ "step": 3763
+ },
+ {
+ "epoch": 1.7796690307328604,
+ "grad_norm": 2.3773093223571777,
+ "learning_rate": 4.025901038572996e-06,
+ "loss": 0.503,
+ "step": 3764
+ },
+ {
+ "epoch": 1.7801418439716312,
+ "grad_norm": 2.962541341781616,
+ "learning_rate": 4.025406842858187e-06,
+ "loss": 0.4613,
+ "step": 3765
+ },
+ {
+ "epoch": 1.780614657210402,
+ "grad_norm": 2.603092908859253,
+ "learning_rate": 4.024912552162982e-06,
+ "loss": 0.5142,
+ "step": 3766
+ },
+ {
+ "epoch": 1.7810874704491726,
+ "grad_norm": 2.648927927017212,
+ "learning_rate": 4.024418166518159e-06,
+ "loss": 0.4491,
+ "step": 3767
+ },
+ {
+ "epoch": 1.7815602836879432,
+ "grad_norm": 3.3239917755126953,
+ "learning_rate": 4.023923685954502e-06,
+ "loss": 0.6272,
+ "step": 3768
+ },
+ {
+ "epoch": 1.782033096926714,
+ "grad_norm": 2.672821283340454,
+ "learning_rate": 4.023429110502798e-06,
+ "loss": 0.5171,
+ "step": 3769
+ },
+ {
+ "epoch": 1.7825059101654848,
+ "grad_norm": 2.364332437515259,
+ "learning_rate": 4.022934440193844e-06,
+ "loss": 0.4513,
+ "step": 3770
+ },
+ {
+ "epoch": 1.7829787234042553,
+ "grad_norm": 3.03108549118042,
+ "learning_rate": 4.022439675058441e-06,
+ "loss": 0.4324,
+ "step": 3771
+ },
+ {
+ "epoch": 1.783451536643026,
+ "grad_norm": 2.647557020187378,
+ "learning_rate": 4.021944815127393e-06,
+ "loss": 0.5162,
+ "step": 3772
+ },
+ {
+ "epoch": 1.7839243498817967,
+ "grad_norm": 2.4111907482147217,
+ "learning_rate": 4.021449860431517e-06,
+ "loss": 0.4712,
+ "step": 3773
+ },
+ {
+ "epoch": 1.7843971631205675,
+ "grad_norm": 2.796175718307495,
+ "learning_rate": 4.020954811001629e-06,
+ "loss": 0.5131,
+ "step": 3774
+ },
+ {
+ "epoch": 1.784869976359338,
+ "grad_norm": 2.4594924449920654,
+ "learning_rate": 4.020459666868553e-06,
+ "loss": 0.4739,
+ "step": 3775
+ },
+ {
+ "epoch": 1.7853427895981087,
+ "grad_norm": 2.5735671520233154,
+ "learning_rate": 4.0199644280631215e-06,
+ "loss": 0.4716,
+ "step": 3776
+ },
+ {
+ "epoch": 1.7858156028368795,
+ "grad_norm": 2.419990062713623,
+ "learning_rate": 4.01946909461617e-06,
+ "loss": 0.4866,
+ "step": 3777
+ },
+ {
+ "epoch": 1.7862884160756503,
+ "grad_norm": 2.5597951412200928,
+ "learning_rate": 4.01897366655854e-06,
+ "loss": 0.5569,
+ "step": 3778
+ },
+ {
+ "epoch": 1.7867612293144208,
+ "grad_norm": 2.462383985519409,
+ "learning_rate": 4.018478143921081e-06,
+ "loss": 0.4588,
+ "step": 3779
+ },
+ {
+ "epoch": 1.7872340425531914,
+ "grad_norm": 2.536701202392578,
+ "learning_rate": 4.017982526734646e-06,
+ "loss": 0.5278,
+ "step": 3780
+ },
+ {
+ "epoch": 1.7877068557919622,
+ "grad_norm": 2.691077470779419,
+ "learning_rate": 4.017486815030095e-06,
+ "loss": 0.4815,
+ "step": 3781
+ },
+ {
+ "epoch": 1.788179669030733,
+ "grad_norm": 2.4277288913726807,
+ "learning_rate": 4.016991008838294e-06,
+ "loss": 0.4877,
+ "step": 3782
+ },
+ {
+ "epoch": 1.7886524822695036,
+ "grad_norm": 2.6740009784698486,
+ "learning_rate": 4.016495108190115e-06,
+ "loss": 0.572,
+ "step": 3783
+ },
+ {
+ "epoch": 1.7891252955082741,
+ "grad_norm": 3.179232120513916,
+ "learning_rate": 4.0159991131164355e-06,
+ "loss": 0.4821,
+ "step": 3784
+ },
+ {
+ "epoch": 1.789598108747045,
+ "grad_norm": 3.2747793197631836,
+ "learning_rate": 4.015503023648138e-06,
+ "loss": 0.5517,
+ "step": 3785
+ },
+ {
+ "epoch": 1.7900709219858157,
+ "grad_norm": 2.671367645263672,
+ "learning_rate": 4.015006839816113e-06,
+ "loss": 0.5158,
+ "step": 3786
+ },
+ {
+ "epoch": 1.7905437352245863,
+ "grad_norm": 2.6600193977355957,
+ "learning_rate": 4.014510561651256e-06,
+ "loss": 0.535,
+ "step": 3787
+ },
+ {
+ "epoch": 1.791016548463357,
+ "grad_norm": 2.481509208679199,
+ "learning_rate": 4.014014189184466e-06,
+ "loss": 0.5596,
+ "step": 3788
+ },
+ {
+ "epoch": 1.7914893617021277,
+ "grad_norm": 2.759816884994507,
+ "learning_rate": 4.013517722446652e-06,
+ "loss": 0.5201,
+ "step": 3789
+ },
+ {
+ "epoch": 1.7919621749408985,
+ "grad_norm": 2.6913561820983887,
+ "learning_rate": 4.013021161468724e-06,
+ "loss": 0.5758,
+ "step": 3790
+ },
+ {
+ "epoch": 1.792434988179669,
+ "grad_norm": 2.775087594985962,
+ "learning_rate": 4.0125245062816044e-06,
+ "loss": 0.499,
+ "step": 3791
+ },
+ {
+ "epoch": 1.7929078014184396,
+ "grad_norm": 2.6134777069091797,
+ "learning_rate": 4.012027756916216e-06,
+ "loss": 0.5659,
+ "step": 3792
+ },
+ {
+ "epoch": 1.7933806146572104,
+ "grad_norm": 2.7109756469726562,
+ "learning_rate": 4.0115309134034895e-06,
+ "loss": 0.5337,
+ "step": 3793
+ },
+ {
+ "epoch": 1.7938534278959812,
+ "grad_norm": 2.5389950275421143,
+ "learning_rate": 4.0110339757743595e-06,
+ "loss": 0.4501,
+ "step": 3794
+ },
+ {
+ "epoch": 1.7943262411347518,
+ "grad_norm": 2.634648561477661,
+ "learning_rate": 4.010536944059771e-06,
+ "loss": 0.4411,
+ "step": 3795
+ },
+ {
+ "epoch": 1.7947990543735224,
+ "grad_norm": 2.527070999145508,
+ "learning_rate": 4.0100398182906695e-06,
+ "loss": 0.5145,
+ "step": 3796
+ },
+ {
+ "epoch": 1.7952718676122932,
+ "grad_norm": 2.62988543510437,
+ "learning_rate": 4.0095425984980105e-06,
+ "loss": 0.4981,
+ "step": 3797
+ },
+ {
+ "epoch": 1.795744680851064,
+ "grad_norm": 2.6032519340515137,
+ "learning_rate": 4.009045284712752e-06,
+ "loss": 0.453,
+ "step": 3798
+ },
+ {
+ "epoch": 1.7962174940898346,
+ "grad_norm": 2.735173463821411,
+ "learning_rate": 4.008547876965863e-06,
+ "loss": 0.5925,
+ "step": 3799
+ },
+ {
+ "epoch": 1.7966903073286051,
+ "grad_norm": 2.6296730041503906,
+ "learning_rate": 4.00805037528831e-06,
+ "loss": 0.5651,
+ "step": 3800
+ },
+ {
+ "epoch": 1.797163120567376,
+ "grad_norm": 2.641214370727539,
+ "learning_rate": 4.0075527797110735e-06,
+ "loss": 0.4973,
+ "step": 3801
+ },
+ {
+ "epoch": 1.7976359338061467,
+ "grad_norm": 2.6104819774627686,
+ "learning_rate": 4.007055090265136e-06,
+ "loss": 0.4432,
+ "step": 3802
+ },
+ {
+ "epoch": 1.7981087470449173,
+ "grad_norm": 2.8200619220733643,
+ "learning_rate": 4.0065573069814865e-06,
+ "loss": 0.4899,
+ "step": 3803
+ },
+ {
+ "epoch": 1.7985815602836879,
+ "grad_norm": 2.982354164123535,
+ "learning_rate": 4.006059429891119e-06,
+ "loss": 0.5488,
+ "step": 3804
+ },
+ {
+ "epoch": 1.7990543735224587,
+ "grad_norm": 2.7561678886413574,
+ "learning_rate": 4.005561459025034e-06,
+ "loss": 0.5637,
+ "step": 3805
+ },
+ {
+ "epoch": 1.7995271867612295,
+ "grad_norm": 2.702212333679199,
+ "learning_rate": 4.005063394414241e-06,
+ "loss": 0.4804,
+ "step": 3806
+ },
+ {
+ "epoch": 1.8,
+ "grad_norm": 2.8655319213867188,
+ "learning_rate": 4.004565236089748e-06,
+ "loss": 0.5759,
+ "step": 3807
+ },
+ {
+ "epoch": 1.8004728132387706,
+ "grad_norm": 2.703676223754883,
+ "learning_rate": 4.0040669840825756e-06,
+ "loss": 0.4728,
+ "step": 3808
+ },
+ {
+ "epoch": 1.8009456264775414,
+ "grad_norm": 2.802645683288574,
+ "learning_rate": 4.003568638423747e-06,
+ "loss": 0.5421,
+ "step": 3809
+ },
+ {
+ "epoch": 1.8014184397163122,
+ "grad_norm": 2.4723124504089355,
+ "learning_rate": 4.003070199144292e-06,
+ "loss": 0.4944,
+ "step": 3810
+ },
+ {
+ "epoch": 1.8018912529550828,
+ "grad_norm": 2.4889068603515625,
+ "learning_rate": 4.0025716662752475e-06,
+ "loss": 0.4774,
+ "step": 3811
+ },
+ {
+ "epoch": 1.8023640661938534,
+ "grad_norm": 2.5408077239990234,
+ "learning_rate": 4.002073039847653e-06,
+ "loss": 0.5233,
+ "step": 3812
+ },
+ {
+ "epoch": 1.8028368794326242,
+ "grad_norm": 2.734602689743042,
+ "learning_rate": 4.001574319892557e-06,
+ "loss": 0.5403,
+ "step": 3813
+ },
+ {
+ "epoch": 1.803309692671395,
+ "grad_norm": 3.3786163330078125,
+ "learning_rate": 4.001075506441012e-06,
+ "loss": 0.6969,
+ "step": 3814
+ },
+ {
+ "epoch": 1.8037825059101655,
+ "grad_norm": 2.7375378608703613,
+ "learning_rate": 4.000576599524078e-06,
+ "loss": 0.4907,
+ "step": 3815
+ },
+ {
+ "epoch": 1.804255319148936,
+ "grad_norm": 3.041804075241089,
+ "learning_rate": 4.000077599172818e-06,
+ "loss": 0.6021,
+ "step": 3816
+ },
+ {
+ "epoch": 1.804728132387707,
+ "grad_norm": 2.697599411010742,
+ "learning_rate": 3.999578505418305e-06,
+ "loss": 0.4743,
+ "step": 3817
+ },
+ {
+ "epoch": 1.8052009456264777,
+ "grad_norm": 2.276921272277832,
+ "learning_rate": 3.999079318291612e-06,
+ "loss": 0.4885,
+ "step": 3818
+ },
+ {
+ "epoch": 1.8056737588652483,
+ "grad_norm": 2.4896953105926514,
+ "learning_rate": 3.998580037823825e-06,
+ "loss": 0.503,
+ "step": 3819
+ },
+ {
+ "epoch": 1.8061465721040189,
+ "grad_norm": 2.6232175827026367,
+ "learning_rate": 3.998080664046029e-06,
+ "loss": 0.5058,
+ "step": 3820
+ },
+ {
+ "epoch": 1.8066193853427897,
+ "grad_norm": 2.695861339569092,
+ "learning_rate": 3.997581196989319e-06,
+ "loss": 0.4949,
+ "step": 3821
+ },
+ {
+ "epoch": 1.8070921985815604,
+ "grad_norm": 2.912886142730713,
+ "learning_rate": 3.997081636684795e-06,
+ "loss": 0.4971,
+ "step": 3822
+ },
+ {
+ "epoch": 1.807565011820331,
+ "grad_norm": 2.876500368118286,
+ "learning_rate": 3.996581983163561e-06,
+ "loss": 0.5584,
+ "step": 3823
+ },
+ {
+ "epoch": 1.8080378250591016,
+ "grad_norm": 2.857069730758667,
+ "learning_rate": 3.99608223645673e-06,
+ "loss": 0.5457,
+ "step": 3824
+ },
+ {
+ "epoch": 1.8085106382978724,
+ "grad_norm": 2.486743211746216,
+ "learning_rate": 3.995582396595419e-06,
+ "loss": 0.5291,
+ "step": 3825
+ },
+ {
+ "epoch": 1.808983451536643,
+ "grad_norm": 2.509441375732422,
+ "learning_rate": 3.9950824636107486e-06,
+ "loss": 0.4747,
+ "step": 3826
+ },
+ {
+ "epoch": 1.8094562647754135,
+ "grad_norm": 2.931394100189209,
+ "learning_rate": 3.99458243753385e-06,
+ "loss": 0.5116,
+ "step": 3827
+ },
+ {
+ "epoch": 1.8099290780141843,
+ "grad_norm": 2.4868650436401367,
+ "learning_rate": 3.994082318395856e-06,
+ "loss": 0.4671,
+ "step": 3828
+ },
+ {
+ "epoch": 1.8104018912529551,
+ "grad_norm": 2.5554752349853516,
+ "learning_rate": 3.993582106227907e-06,
+ "loss": 0.4969,
+ "step": 3829
+ },
+ {
+ "epoch": 1.8108747044917257,
+ "grad_norm": 2.8367133140563965,
+ "learning_rate": 3.99308180106115e-06,
+ "loss": 0.5507,
+ "step": 3830
+ },
+ {
+ "epoch": 1.8113475177304963,
+ "grad_norm": 2.68245792388916,
+ "learning_rate": 3.992581402926737e-06,
+ "loss": 0.5115,
+ "step": 3831
+ },
+ {
+ "epoch": 1.811820330969267,
+ "grad_norm": 2.406674385070801,
+ "learning_rate": 3.992080911855824e-06,
+ "loss": 0.545,
+ "step": 3832
+ },
+ {
+ "epoch": 1.8122931442080379,
+ "grad_norm": 2.5003464221954346,
+ "learning_rate": 3.991580327879575e-06,
+ "loss": 0.4331,
+ "step": 3833
+ },
+ {
+ "epoch": 1.8127659574468085,
+ "grad_norm": 2.49320912361145,
+ "learning_rate": 3.99107965102916e-06,
+ "loss": 0.5118,
+ "step": 3834
+ },
+ {
+ "epoch": 1.813238770685579,
+ "grad_norm": 2.6183295249938965,
+ "learning_rate": 3.990578881335752e-06,
+ "loss": 0.5286,
+ "step": 3835
+ },
+ {
+ "epoch": 1.8137115839243498,
+ "grad_norm": 3.1999518871307373,
+ "learning_rate": 3.990078018830534e-06,
+ "loss": 0.5048,
+ "step": 3836
+ },
+ {
+ "epoch": 1.8141843971631206,
+ "grad_norm": 2.4351117610931396,
+ "learning_rate": 3.9895770635446915e-06,
+ "loss": 0.514,
+ "step": 3837
+ },
+ {
+ "epoch": 1.8146572104018912,
+ "grad_norm": 2.6859259605407715,
+ "learning_rate": 3.989076015509416e-06,
+ "loss": 0.5575,
+ "step": 3838
+ },
+ {
+ "epoch": 1.8151300236406618,
+ "grad_norm": 2.790421962738037,
+ "learning_rate": 3.988574874755909e-06,
+ "loss": 0.5467,
+ "step": 3839
+ },
+ {
+ "epoch": 1.8156028368794326,
+ "grad_norm": 2.5202765464782715,
+ "learning_rate": 3.988073641315369e-06,
+ "loss": 0.5229,
+ "step": 3840
+ },
+ {
+ "epoch": 1.8160756501182034,
+ "grad_norm": 2.623652219772339,
+ "learning_rate": 3.987572315219009e-06,
+ "loss": 0.509,
+ "step": 3841
+ },
+ {
+ "epoch": 1.816548463356974,
+ "grad_norm": 2.6038360595703125,
+ "learning_rate": 3.987070896498044e-06,
+ "loss": 0.5304,
+ "step": 3842
+ },
+ {
+ "epoch": 1.8170212765957445,
+ "grad_norm": 2.9378011226654053,
+ "learning_rate": 3.9865693851836955e-06,
+ "loss": 0.5845,
+ "step": 3843
+ },
+ {
+ "epoch": 1.8174940898345153,
+ "grad_norm": 2.4061124324798584,
+ "learning_rate": 3.98606778130719e-06,
+ "loss": 0.4333,
+ "step": 3844
+ },
+ {
+ "epoch": 1.8179669030732861,
+ "grad_norm": 2.483489751815796,
+ "learning_rate": 3.985566084899759e-06,
+ "loss": 0.4827,
+ "step": 3845
+ },
+ {
+ "epoch": 1.8184397163120567,
+ "grad_norm": 2.7774932384490967,
+ "learning_rate": 3.985064295992642e-06,
+ "loss": 0.5016,
+ "step": 3846
+ },
+ {
+ "epoch": 1.8189125295508273,
+ "grad_norm": 2.5936765670776367,
+ "learning_rate": 3.984562414617083e-06,
+ "loss": 0.4448,
+ "step": 3847
+ },
+ {
+ "epoch": 1.819385342789598,
+ "grad_norm": 2.8608627319335938,
+ "learning_rate": 3.9840604408043325e-06,
+ "loss": 0.5735,
+ "step": 3848
+ },
+ {
+ "epoch": 1.8198581560283689,
+ "grad_norm": 2.6212472915649414,
+ "learning_rate": 3.983558374585646e-06,
+ "loss": 0.5091,
+ "step": 3849
+ },
+ {
+ "epoch": 1.8203309692671394,
+ "grad_norm": 2.832460641860962,
+ "learning_rate": 3.983056215992284e-06,
+ "loss": 0.5169,
+ "step": 3850
+ },
+ {
+ "epoch": 1.82080378250591,
+ "grad_norm": 2.5293610095977783,
+ "learning_rate": 3.982553965055514e-06,
+ "loss": 0.4708,
+ "step": 3851
+ },
+ {
+ "epoch": 1.8212765957446808,
+ "grad_norm": 2.9362871646881104,
+ "learning_rate": 3.982051621806611e-06,
+ "loss": 0.575,
+ "step": 3852
+ },
+ {
+ "epoch": 1.8217494089834516,
+ "grad_norm": 2.69073486328125,
+ "learning_rate": 3.98154918627685e-06,
+ "loss": 0.5278,
+ "step": 3853
+ },
+ {
+ "epoch": 1.8222222222222222,
+ "grad_norm": 2.6711034774780273,
+ "learning_rate": 3.98104665849752e-06,
+ "loss": 0.4918,
+ "step": 3854
+ },
+ {
+ "epoch": 1.8226950354609928,
+ "grad_norm": 2.571110963821411,
+ "learning_rate": 3.980544038499907e-06,
+ "loss": 0.5234,
+ "step": 3855
+ },
+ {
+ "epoch": 1.8231678486997636,
+ "grad_norm": 3.2603371143341064,
+ "learning_rate": 3.980041326315309e-06,
+ "loss": 0.5996,
+ "step": 3856
+ },
+ {
+ "epoch": 1.8236406619385344,
+ "grad_norm": 2.8472323417663574,
+ "learning_rate": 3.979538521975028e-06,
+ "loss": 0.4769,
+ "step": 3857
+ },
+ {
+ "epoch": 1.824113475177305,
+ "grad_norm": 2.6714751720428467,
+ "learning_rate": 3.979035625510371e-06,
+ "loss": 0.4826,
+ "step": 3858
+ },
+ {
+ "epoch": 1.8245862884160755,
+ "grad_norm": 2.6816468238830566,
+ "learning_rate": 3.97853263695265e-06,
+ "loss": 0.5127,
+ "step": 3859
+ },
+ {
+ "epoch": 1.8250591016548463,
+ "grad_norm": 2.6464123725891113,
+ "learning_rate": 3.978029556333185e-06,
+ "loss": 0.4925,
+ "step": 3860
+ },
+ {
+ "epoch": 1.825531914893617,
+ "grad_norm": 2.5317227840423584,
+ "learning_rate": 3.977526383683301e-06,
+ "loss": 0.4765,
+ "step": 3861
+ },
+ {
+ "epoch": 1.8260047281323877,
+ "grad_norm": 2.5052425861358643,
+ "learning_rate": 3.977023119034328e-06,
+ "loss": 0.4804,
+ "step": 3862
+ },
+ {
+ "epoch": 1.8264775413711583,
+ "grad_norm": 2.7022836208343506,
+ "learning_rate": 3.976519762417602e-06,
+ "loss": 0.4824,
+ "step": 3863
+ },
+ {
+ "epoch": 1.826950354609929,
+ "grad_norm": 2.7445900440216064,
+ "learning_rate": 3.976016313864464e-06,
+ "loss": 0.5698,
+ "step": 3864
+ },
+ {
+ "epoch": 1.8274231678486998,
+ "grad_norm": 2.442518711090088,
+ "learning_rate": 3.975512773406262e-06,
+ "loss": 0.5133,
+ "step": 3865
+ },
+ {
+ "epoch": 1.8278959810874704,
+ "grad_norm": 2.4100050926208496,
+ "learning_rate": 3.975009141074351e-06,
+ "loss": 0.5044,
+ "step": 3866
+ },
+ {
+ "epoch": 1.828368794326241,
+ "grad_norm": 2.9507648944854736,
+ "learning_rate": 3.974505416900088e-06,
+ "loss": 0.5367,
+ "step": 3867
+ },
+ {
+ "epoch": 1.8288416075650118,
+ "grad_norm": 2.5662600994110107,
+ "learning_rate": 3.974001600914837e-06,
+ "loss": 0.5878,
+ "step": 3868
+ },
+ {
+ "epoch": 1.8293144208037826,
+ "grad_norm": 2.4306657314300537,
+ "learning_rate": 3.973497693149971e-06,
+ "loss": 0.4647,
+ "step": 3869
+ },
+ {
+ "epoch": 1.8297872340425532,
+ "grad_norm": 2.974686622619629,
+ "learning_rate": 3.972993693636864e-06,
+ "loss": 0.4911,
+ "step": 3870
+ },
+ {
+ "epoch": 1.8302600472813237,
+ "grad_norm": 2.5711987018585205,
+ "learning_rate": 3.972489602406899e-06,
+ "loss": 0.5089,
+ "step": 3871
+ },
+ {
+ "epoch": 1.8307328605200945,
+ "grad_norm": 3.259617328643799,
+ "learning_rate": 3.971985419491463e-06,
+ "loss": 0.5966,
+ "step": 3872
+ },
+ {
+ "epoch": 1.8312056737588653,
+ "grad_norm": 2.7437000274658203,
+ "learning_rate": 3.971481144921949e-06,
+ "loss": 0.5097,
+ "step": 3873
+ },
+ {
+ "epoch": 1.831678486997636,
+ "grad_norm": 2.9597461223602295,
+ "learning_rate": 3.970976778729757e-06,
+ "loss": 0.5672,
+ "step": 3874
+ },
+ {
+ "epoch": 1.8321513002364065,
+ "grad_norm": 2.5775723457336426,
+ "learning_rate": 3.970472320946291e-06,
+ "loss": 0.4749,
+ "step": 3875
+ },
+ {
+ "epoch": 1.8326241134751773,
+ "grad_norm": 2.7381200790405273,
+ "learning_rate": 3.969967771602961e-06,
+ "loss": 0.5255,
+ "step": 3876
+ },
+ {
+ "epoch": 1.833096926713948,
+ "grad_norm": 2.651698350906372,
+ "learning_rate": 3.969463130731183e-06,
+ "loss": 0.5098,
+ "step": 3877
+ },
+ {
+ "epoch": 1.8335697399527187,
+ "grad_norm": 2.7277021408081055,
+ "learning_rate": 3.968958398362381e-06,
+ "loss": 0.5251,
+ "step": 3878
+ },
+ {
+ "epoch": 1.8340425531914892,
+ "grad_norm": 2.5184953212738037,
+ "learning_rate": 3.968453574527978e-06,
+ "loss": 0.5086,
+ "step": 3879
+ },
+ {
+ "epoch": 1.83451536643026,
+ "grad_norm": 2.8227882385253906,
+ "learning_rate": 3.967948659259412e-06,
+ "loss": 0.5742,
+ "step": 3880
+ },
+ {
+ "epoch": 1.8349881796690308,
+ "grad_norm": 2.547922134399414,
+ "learning_rate": 3.967443652588119e-06,
+ "loss": 0.5411,
+ "step": 3881
+ },
+ {
+ "epoch": 1.8354609929078014,
+ "grad_norm": 2.6572835445404053,
+ "learning_rate": 3.966938554545545e-06,
+ "loss": 0.4854,
+ "step": 3882
+ },
+ {
+ "epoch": 1.835933806146572,
+ "grad_norm": 2.9416658878326416,
+ "learning_rate": 3.966433365163139e-06,
+ "loss": 0.5236,
+ "step": 3883
+ },
+ {
+ "epoch": 1.8364066193853428,
+ "grad_norm": 2.344325304031372,
+ "learning_rate": 3.965928084472357e-06,
+ "loss": 0.4916,
+ "step": 3884
+ },
+ {
+ "epoch": 1.8368794326241136,
+ "grad_norm": 2.890418291091919,
+ "learning_rate": 3.965422712504662e-06,
+ "loss": 0.5287,
+ "step": 3885
+ },
+ {
+ "epoch": 1.8373522458628841,
+ "grad_norm": 2.6063363552093506,
+ "learning_rate": 3.96491724929152e-06,
+ "loss": 0.4842,
+ "step": 3886
+ },
+ {
+ "epoch": 1.8378250591016547,
+ "grad_norm": 2.5582427978515625,
+ "learning_rate": 3.964411694864404e-06,
+ "loss": 0.4768,
+ "step": 3887
+ },
+ {
+ "epoch": 1.8382978723404255,
+ "grad_norm": 2.84356951713562,
+ "learning_rate": 3.963906049254793e-06,
+ "loss": 0.5284,
+ "step": 3888
+ },
+ {
+ "epoch": 1.8387706855791963,
+ "grad_norm": 2.7048516273498535,
+ "learning_rate": 3.963400312494172e-06,
+ "loss": 0.5271,
+ "step": 3889
+ },
+ {
+ "epoch": 1.839243498817967,
+ "grad_norm": 2.5401699542999268,
+ "learning_rate": 3.962894484614031e-06,
+ "loss": 0.4734,
+ "step": 3890
+ },
+ {
+ "epoch": 1.8397163120567375,
+ "grad_norm": 2.208256244659424,
+ "learning_rate": 3.962388565645864e-06,
+ "loss": 0.4113,
+ "step": 3891
+ },
+ {
+ "epoch": 1.8401891252955083,
+ "grad_norm": 2.775139331817627,
+ "learning_rate": 3.961882555621173e-06,
+ "loss": 0.5172,
+ "step": 3892
+ },
+ {
+ "epoch": 1.840661938534279,
+ "grad_norm": 2.7540855407714844,
+ "learning_rate": 3.961376454571466e-06,
+ "loss": 0.5252,
+ "step": 3893
+ },
+ {
+ "epoch": 1.8411347517730496,
+ "grad_norm": 2.6731574535369873,
+ "learning_rate": 3.960870262528255e-06,
+ "loss": 0.4495,
+ "step": 3894
+ },
+ {
+ "epoch": 1.8416075650118202,
+ "grad_norm": 2.791492223739624,
+ "learning_rate": 3.960363979523058e-06,
+ "loss": 0.5457,
+ "step": 3895
+ },
+ {
+ "epoch": 1.842080378250591,
+ "grad_norm": 2.9280290603637695,
+ "learning_rate": 3.959857605587401e-06,
+ "loss": 0.5373,
+ "step": 3896
+ },
+ {
+ "epoch": 1.8425531914893618,
+ "grad_norm": 2.5652217864990234,
+ "learning_rate": 3.95935114075281e-06,
+ "loss": 0.5191,
+ "step": 3897
+ },
+ {
+ "epoch": 1.8430260047281324,
+ "grad_norm": 2.7297749519348145,
+ "learning_rate": 3.958844585050824e-06,
+ "loss": 0.5366,
+ "step": 3898
+ },
+ {
+ "epoch": 1.843498817966903,
+ "grad_norm": 2.5302982330322266,
+ "learning_rate": 3.958337938512983e-06,
+ "loss": 0.569,
+ "step": 3899
+ },
+ {
+ "epoch": 1.8439716312056738,
+ "grad_norm": 2.644777297973633,
+ "learning_rate": 3.957831201170832e-06,
+ "loss": 0.521,
+ "step": 3900
+ },
+ {
+ "epoch": 1.8444444444444446,
+ "grad_norm": 2.8375515937805176,
+ "learning_rate": 3.957324373055925e-06,
+ "loss": 0.573,
+ "step": 3901
+ },
+ {
+ "epoch": 1.8449172576832151,
+ "grad_norm": 2.512296676635742,
+ "learning_rate": 3.956817454199819e-06,
+ "loss": 0.5081,
+ "step": 3902
+ },
+ {
+ "epoch": 1.8453900709219857,
+ "grad_norm": 2.3662109375,
+ "learning_rate": 3.956310444634079e-06,
+ "loss": 0.4989,
+ "step": 3903
+ },
+ {
+ "epoch": 1.8458628841607565,
+ "grad_norm": 2.6849682331085205,
+ "learning_rate": 3.955803344390272e-06,
+ "loss": 0.5459,
+ "step": 3904
+ },
+ {
+ "epoch": 1.8463356973995273,
+ "grad_norm": 2.8364317417144775,
+ "learning_rate": 3.9552961534999756e-06,
+ "loss": 0.5704,
+ "step": 3905
+ },
+ {
+ "epoch": 1.8468085106382979,
+ "grad_norm": 2.6006948947906494,
+ "learning_rate": 3.954788871994768e-06,
+ "loss": 0.5696,
+ "step": 3906
+ },
+ {
+ "epoch": 1.8472813238770684,
+ "grad_norm": 2.558300018310547,
+ "learning_rate": 3.9542814999062375e-06,
+ "loss": 0.5047,
+ "step": 3907
+ },
+ {
+ "epoch": 1.8477541371158392,
+ "grad_norm": 2.6343321800231934,
+ "learning_rate": 3.953774037265974e-06,
+ "loss": 0.525,
+ "step": 3908
+ },
+ {
+ "epoch": 1.84822695035461,
+ "grad_norm": 2.5050008296966553,
+ "learning_rate": 3.953266484105576e-06,
+ "loss": 0.4867,
+ "step": 3909
+ },
+ {
+ "epoch": 1.8486997635933806,
+ "grad_norm": 2.3775103092193604,
+ "learning_rate": 3.952758840456647e-06,
+ "loss": 0.4349,
+ "step": 3910
+ },
+ {
+ "epoch": 1.8491725768321512,
+ "grad_norm": 2.508376359939575,
+ "learning_rate": 3.952251106350794e-06,
+ "loss": 0.539,
+ "step": 3911
+ },
+ {
+ "epoch": 1.849645390070922,
+ "grad_norm": 2.7403106689453125,
+ "learning_rate": 3.951743281819633e-06,
+ "loss": 0.4478,
+ "step": 3912
+ },
+ {
+ "epoch": 1.8501182033096928,
+ "grad_norm": 2.5332062244415283,
+ "learning_rate": 3.951235366894784e-06,
+ "loss": 0.4658,
+ "step": 3913
+ },
+ {
+ "epoch": 1.8505910165484634,
+ "grad_norm": 3.0137248039245605,
+ "learning_rate": 3.950727361607872e-06,
+ "loss": 0.5047,
+ "step": 3914
+ },
+ {
+ "epoch": 1.851063829787234,
+ "grad_norm": 2.5820653438568115,
+ "learning_rate": 3.950219265990528e-06,
+ "loss": 0.542,
+ "step": 3915
+ },
+ {
+ "epoch": 1.8515366430260047,
+ "grad_norm": 2.555133819580078,
+ "learning_rate": 3.949711080074389e-06,
+ "loss": 0.5253,
+ "step": 3916
+ },
+ {
+ "epoch": 1.8520094562647755,
+ "grad_norm": 2.876882791519165,
+ "learning_rate": 3.949202803891099e-06,
+ "loss": 0.5242,
+ "step": 3917
+ },
+ {
+ "epoch": 1.852482269503546,
+ "grad_norm": 2.5929203033447266,
+ "learning_rate": 3.948694437472305e-06,
+ "loss": 0.5358,
+ "step": 3918
+ },
+ {
+ "epoch": 1.8529550827423167,
+ "grad_norm": 2.468513250350952,
+ "learning_rate": 3.948185980849659e-06,
+ "loss": 0.5119,
+ "step": 3919
+ },
+ {
+ "epoch": 1.8534278959810875,
+ "grad_norm": 2.9259560108184814,
+ "learning_rate": 3.947677434054824e-06,
+ "loss": 0.4756,
+ "step": 3920
+ },
+ {
+ "epoch": 1.8539007092198583,
+ "grad_norm": 2.5247011184692383,
+ "learning_rate": 3.947168797119462e-06,
+ "loss": 0.4627,
+ "step": 3921
+ },
+ {
+ "epoch": 1.8543735224586289,
+ "grad_norm": 2.7396671772003174,
+ "learning_rate": 3.946660070075245e-06,
+ "loss": 0.5013,
+ "step": 3922
+ },
+ {
+ "epoch": 1.8548463356973994,
+ "grad_norm": 2.7059738636016846,
+ "learning_rate": 3.946151252953849e-06,
+ "loss": 0.5875,
+ "step": 3923
+ },
+ {
+ "epoch": 1.8553191489361702,
+ "grad_norm": 2.5638437271118164,
+ "learning_rate": 3.945642345786955e-06,
+ "loss": 0.5063,
+ "step": 3924
+ },
+ {
+ "epoch": 1.855791962174941,
+ "grad_norm": 2.6647839546203613,
+ "learning_rate": 3.945133348606251e-06,
+ "loss": 0.5421,
+ "step": 3925
+ },
+ {
+ "epoch": 1.8562647754137116,
+ "grad_norm": 3.7235286235809326,
+ "learning_rate": 3.944624261443431e-06,
+ "loss": 0.5958,
+ "step": 3926
+ },
+ {
+ "epoch": 1.8567375886524822,
+ "grad_norm": 2.769984245300293,
+ "learning_rate": 3.944115084330192e-06,
+ "loss": 0.5678,
+ "step": 3927
+ },
+ {
+ "epoch": 1.857210401891253,
+ "grad_norm": 2.567249059677124,
+ "learning_rate": 3.9436058172982395e-06,
+ "loss": 0.4767,
+ "step": 3928
+ },
+ {
+ "epoch": 1.8576832151300238,
+ "grad_norm": 2.6196048259735107,
+ "learning_rate": 3.943096460379283e-06,
+ "loss": 0.5345,
+ "step": 3929
+ },
+ {
+ "epoch": 1.8581560283687943,
+ "grad_norm": 2.5999555587768555,
+ "learning_rate": 3.942587013605037e-06,
+ "loss": 0.5482,
+ "step": 3930
+ },
+ {
+ "epoch": 1.858628841607565,
+ "grad_norm": 2.630387783050537,
+ "learning_rate": 3.942077477007224e-06,
+ "loss": 0.6023,
+ "step": 3931
+ },
+ {
+ "epoch": 1.8591016548463357,
+ "grad_norm": 2.543503761291504,
+ "learning_rate": 3.941567850617569e-06,
+ "loss": 0.5157,
+ "step": 3932
+ },
+ {
+ "epoch": 1.8595744680851065,
+ "grad_norm": 2.5109236240386963,
+ "learning_rate": 3.941058134467805e-06,
+ "loss": 0.4774,
+ "step": 3933
+ },
+ {
+ "epoch": 1.860047281323877,
+ "grad_norm": 2.5110230445861816,
+ "learning_rate": 3.94054832858967e-06,
+ "loss": 0.5064,
+ "step": 3934
+ },
+ {
+ "epoch": 1.8605200945626477,
+ "grad_norm": 2.4780776500701904,
+ "learning_rate": 3.940038433014908e-06,
+ "loss": 0.5216,
+ "step": 3935
+ },
+ {
+ "epoch": 1.8609929078014185,
+ "grad_norm": 2.4398856163024902,
+ "learning_rate": 3.939528447775266e-06,
+ "loss": 0.4958,
+ "step": 3936
+ },
+ {
+ "epoch": 1.8614657210401893,
+ "grad_norm": 2.449498176574707,
+ "learning_rate": 3.9390183729025e-06,
+ "loss": 0.5165,
+ "step": 3937
+ },
+ {
+ "epoch": 1.8619385342789598,
+ "grad_norm": 2.982544422149658,
+ "learning_rate": 3.938508208428371e-06,
+ "loss": 0.4803,
+ "step": 3938
+ },
+ {
+ "epoch": 1.8624113475177304,
+ "grad_norm": 2.6574015617370605,
+ "learning_rate": 3.937997954384641e-06,
+ "loss": 0.4797,
+ "step": 3939
+ },
+ {
+ "epoch": 1.8628841607565012,
+ "grad_norm": 2.7773542404174805,
+ "learning_rate": 3.937487610803086e-06,
+ "loss": 0.4843,
+ "step": 3940
+ },
+ {
+ "epoch": 1.863356973995272,
+ "grad_norm": 2.588937759399414,
+ "learning_rate": 3.9369771777154805e-06,
+ "loss": 0.5426,
+ "step": 3941
+ },
+ {
+ "epoch": 1.8638297872340426,
+ "grad_norm": 2.855442523956299,
+ "learning_rate": 3.936466655153607e-06,
+ "loss": 0.5443,
+ "step": 3942
+ },
+ {
+ "epoch": 1.8643026004728132,
+ "grad_norm": 2.554676055908203,
+ "learning_rate": 3.935956043149253e-06,
+ "loss": 0.5334,
+ "step": 3943
+ },
+ {
+ "epoch": 1.864775413711584,
+ "grad_norm": 2.901599884033203,
+ "learning_rate": 3.935445341734212e-06,
+ "loss": 0.5842,
+ "step": 3944
+ },
+ {
+ "epoch": 1.8652482269503547,
+ "grad_norm": 2.554485321044922,
+ "learning_rate": 3.934934550940285e-06,
+ "loss": 0.4941,
+ "step": 3945
+ },
+ {
+ "epoch": 1.8657210401891253,
+ "grad_norm": 2.357203245162964,
+ "learning_rate": 3.934423670799275e-06,
+ "loss": 0.4402,
+ "step": 3946
+ },
+ {
+ "epoch": 1.866193853427896,
+ "grad_norm": 2.7036049365997314,
+ "learning_rate": 3.933912701342993e-06,
+ "loss": 0.4966,
+ "step": 3947
+ },
+ {
+ "epoch": 1.8666666666666667,
+ "grad_norm": 2.7817211151123047,
+ "learning_rate": 3.933401642603255e-06,
+ "loss": 0.4908,
+ "step": 3948
+ },
+ {
+ "epoch": 1.8671394799054375,
+ "grad_norm": 2.439490795135498,
+ "learning_rate": 3.932890494611882e-06,
+ "loss": 0.4322,
+ "step": 3949
+ },
+ {
+ "epoch": 1.867612293144208,
+ "grad_norm": 3.187152147293091,
+ "learning_rate": 3.9323792574007e-06,
+ "loss": 0.501,
+ "step": 3950
+ },
+ {
+ "epoch": 1.8680851063829786,
+ "grad_norm": 2.405773401260376,
+ "learning_rate": 3.931867931001543e-06,
+ "loss": 0.4477,
+ "step": 3951
+ },
+ {
+ "epoch": 1.8685579196217494,
+ "grad_norm": 2.4922525882720947,
+ "learning_rate": 3.931356515446248e-06,
+ "loss": 0.5098,
+ "step": 3952
+ },
+ {
+ "epoch": 1.8690307328605202,
+ "grad_norm": 2.7781267166137695,
+ "learning_rate": 3.93084501076666e-06,
+ "loss": 0.5815,
+ "step": 3953
+ },
+ {
+ "epoch": 1.8695035460992908,
+ "grad_norm": 2.74621844291687,
+ "learning_rate": 3.930333416994626e-06,
+ "loss": 0.5605,
+ "step": 3954
+ },
+ {
+ "epoch": 1.8699763593380614,
+ "grad_norm": 2.5527689456939697,
+ "learning_rate": 3.929821734162004e-06,
+ "loss": 0.5141,
+ "step": 3955
+ },
+ {
+ "epoch": 1.8704491725768322,
+ "grad_norm": 2.5730628967285156,
+ "learning_rate": 3.92930996230065e-06,
+ "loss": 0.5446,
+ "step": 3956
+ },
+ {
+ "epoch": 1.870921985815603,
+ "grad_norm": 2.7053353786468506,
+ "learning_rate": 3.9287981014424334e-06,
+ "loss": 0.4722,
+ "step": 3957
+ },
+ {
+ "epoch": 1.8713947990543736,
+ "grad_norm": 2.7591893672943115,
+ "learning_rate": 3.928286151619224e-06,
+ "loss": 0.509,
+ "step": 3958
+ },
+ {
+ "epoch": 1.8718676122931441,
+ "grad_norm": 2.6233739852905273,
+ "learning_rate": 3.927774112862898e-06,
+ "loss": 0.5266,
+ "step": 3959
+ },
+ {
+ "epoch": 1.872340425531915,
+ "grad_norm": 2.7715370655059814,
+ "learning_rate": 3.9272619852053396e-06,
+ "loss": 0.5612,
+ "step": 3960
+ },
+ {
+ "epoch": 1.8728132387706857,
+ "grad_norm": 2.4815211296081543,
+ "learning_rate": 3.926749768678435e-06,
+ "loss": 0.5498,
+ "step": 3961
+ },
+ {
+ "epoch": 1.8732860520094563,
+ "grad_norm": 2.6819605827331543,
+ "learning_rate": 3.926237463314078e-06,
+ "loss": 0.5499,
+ "step": 3962
+ },
+ {
+ "epoch": 1.8737588652482269,
+ "grad_norm": 2.638664722442627,
+ "learning_rate": 3.925725069144168e-06,
+ "loss": 0.5429,
+ "step": 3963
+ },
+ {
+ "epoch": 1.8742316784869977,
+ "grad_norm": 2.527294874191284,
+ "learning_rate": 3.925212586200611e-06,
+ "loss": 0.5451,
+ "step": 3964
+ },
+ {
+ "epoch": 1.8747044917257685,
+ "grad_norm": 2.831638813018799,
+ "learning_rate": 3.924700014515315e-06,
+ "loss": 0.5276,
+ "step": 3965
+ },
+ {
+ "epoch": 1.875177304964539,
+ "grad_norm": 2.5906996726989746,
+ "learning_rate": 3.924187354120196e-06,
+ "loss": 0.5323,
+ "step": 3966
+ },
+ {
+ "epoch": 1.8756501182033096,
+ "grad_norm": 2.5482442378997803,
+ "learning_rate": 3.923674605047175e-06,
+ "loss": 0.4882,
+ "step": 3967
+ },
+ {
+ "epoch": 1.8761229314420804,
+ "grad_norm": 2.56402850151062,
+ "learning_rate": 3.923161767328179e-06,
+ "loss": 0.5111,
+ "step": 3968
+ },
+ {
+ "epoch": 1.8765957446808512,
+ "grad_norm": 3.223782539367676,
+ "learning_rate": 3.9226488409951405e-06,
+ "loss": 0.5829,
+ "step": 3969
+ },
+ {
+ "epoch": 1.8770685579196218,
+ "grad_norm": 2.665964365005493,
+ "learning_rate": 3.922135826079997e-06,
+ "loss": 0.4739,
+ "step": 3970
+ },
+ {
+ "epoch": 1.8775413711583924,
+ "grad_norm": 2.602696418762207,
+ "learning_rate": 3.921622722614691e-06,
+ "loss": 0.5199,
+ "step": 3971
+ },
+ {
+ "epoch": 1.8780141843971632,
+ "grad_norm": 2.5384418964385986,
+ "learning_rate": 3.921109530631172e-06,
+ "loss": 0.5086,
+ "step": 3972
+ },
+ {
+ "epoch": 1.878486997635934,
+ "grad_norm": 2.7961080074310303,
+ "learning_rate": 3.920596250161394e-06,
+ "loss": 0.5454,
+ "step": 3973
+ },
+ {
+ "epoch": 1.8789598108747045,
+ "grad_norm": 3.022007465362549,
+ "learning_rate": 3.920082881237317e-06,
+ "loss": 0.5537,
+ "step": 3974
+ },
+ {
+ "epoch": 1.8794326241134751,
+ "grad_norm": 2.699885129928589,
+ "learning_rate": 3.9195694238909045e-06,
+ "loss": 0.5274,
+ "step": 3975
+ },
+ {
+ "epoch": 1.879905437352246,
+ "grad_norm": 2.3994593620300293,
+ "learning_rate": 3.919055878154129e-06,
+ "loss": 0.4134,
+ "step": 3976
+ },
+ {
+ "epoch": 1.8803782505910167,
+ "grad_norm": 4.093045711517334,
+ "learning_rate": 3.918542244058967e-06,
+ "loss": 0.5305,
+ "step": 3977
+ },
+ {
+ "epoch": 1.8808510638297873,
+ "grad_norm": 3.011643171310425,
+ "learning_rate": 3.9180285216374e-06,
+ "loss": 0.5481,
+ "step": 3978
+ },
+ {
+ "epoch": 1.8813238770685579,
+ "grad_norm": 2.6426854133605957,
+ "learning_rate": 3.917514710921414e-06,
+ "loss": 0.5415,
+ "step": 3979
+ },
+ {
+ "epoch": 1.8817966903073287,
+ "grad_norm": 2.4379019737243652,
+ "learning_rate": 3.917000811943002e-06,
+ "loss": 0.4566,
+ "step": 3980
+ },
+ {
+ "epoch": 1.8822695035460995,
+ "grad_norm": 3.18522047996521,
+ "learning_rate": 3.9164868247341634e-06,
+ "loss": 0.6079,
+ "step": 3981
+ },
+ {
+ "epoch": 1.88274231678487,
+ "grad_norm": 2.6451141834259033,
+ "learning_rate": 3.915972749326903e-06,
+ "loss": 0.515,
+ "step": 3982
+ },
+ {
+ "epoch": 1.8832151300236406,
+ "grad_norm": 2.565598726272583,
+ "learning_rate": 3.915458585753226e-06,
+ "loss": 0.4799,
+ "step": 3983
+ },
+ {
+ "epoch": 1.8836879432624114,
+ "grad_norm": 2.711651563644409,
+ "learning_rate": 3.91494433404515e-06,
+ "loss": 0.5595,
+ "step": 3984
+ },
+ {
+ "epoch": 1.8841607565011822,
+ "grad_norm": 2.749328851699829,
+ "learning_rate": 3.914429994234695e-06,
+ "loss": 0.495,
+ "step": 3985
+ },
+ {
+ "epoch": 1.8846335697399526,
+ "grad_norm": 2.9492287635803223,
+ "learning_rate": 3.913915566353886e-06,
+ "loss": 0.5683,
+ "step": 3986
+ },
+ {
+ "epoch": 1.8851063829787233,
+ "grad_norm": 3.07747745513916,
+ "learning_rate": 3.913401050434756e-06,
+ "loss": 0.4953,
+ "step": 3987
+ },
+ {
+ "epoch": 1.8855791962174941,
+ "grad_norm": 2.8746345043182373,
+ "learning_rate": 3.912886446509338e-06,
+ "loss": 0.4752,
+ "step": 3988
+ },
+ {
+ "epoch": 1.8860520094562647,
+ "grad_norm": 2.772954225540161,
+ "learning_rate": 3.912371754609677e-06,
+ "loss": 0.5473,
+ "step": 3989
+ },
+ {
+ "epoch": 1.8865248226950353,
+ "grad_norm": 2.8906044960021973,
+ "learning_rate": 3.911856974767821e-06,
+ "loss": 0.5285,
+ "step": 3990
+ },
+ {
+ "epoch": 1.886997635933806,
+ "grad_norm": 2.8992726802825928,
+ "learning_rate": 3.9113421070158206e-06,
+ "loss": 0.571,
+ "step": 3991
+ },
+ {
+ "epoch": 1.887470449172577,
+ "grad_norm": 2.624662160873413,
+ "learning_rate": 3.910827151385737e-06,
+ "loss": 0.5183,
+ "step": 3992
+ },
+ {
+ "epoch": 1.8879432624113475,
+ "grad_norm": 2.4491732120513916,
+ "learning_rate": 3.910312107909632e-06,
+ "loss": 0.4205,
+ "step": 3993
+ },
+ {
+ "epoch": 1.888416075650118,
+ "grad_norm": 2.278259515762329,
+ "learning_rate": 3.909796976619575e-06,
+ "loss": 0.4464,
+ "step": 3994
+ },
+ {
+ "epoch": 1.8888888888888888,
+ "grad_norm": 2.6481523513793945,
+ "learning_rate": 3.909281757547644e-06,
+ "loss": 0.5023,
+ "step": 3995
+ },
+ {
+ "epoch": 1.8893617021276596,
+ "grad_norm": 2.6687493324279785,
+ "learning_rate": 3.908766450725917e-06,
+ "loss": 0.495,
+ "step": 3996
+ },
+ {
+ "epoch": 1.8898345153664302,
+ "grad_norm": 2.507525682449341,
+ "learning_rate": 3.908251056186481e-06,
+ "loss": 0.4155,
+ "step": 3997
+ },
+ {
+ "epoch": 1.8903073286052008,
+ "grad_norm": 2.7048323154449463,
+ "learning_rate": 3.907735573961426e-06,
+ "loss": 0.4601,
+ "step": 3998
+ },
+ {
+ "epoch": 1.8907801418439716,
+ "grad_norm": 2.6825389862060547,
+ "learning_rate": 3.907220004082848e-06,
+ "loss": 0.5067,
+ "step": 3999
+ },
+ {
+ "epoch": 1.8912529550827424,
+ "grad_norm": 2.775696039199829,
+ "learning_rate": 3.906704346582852e-06,
+ "loss": 0.5411,
+ "step": 4000
+ },
+ {
+ "epoch": 1.891725768321513,
+ "grad_norm": 2.4492077827453613,
+ "learning_rate": 3.906188601493545e-06,
+ "loss": 0.4931,
+ "step": 4001
+ },
+ {
+ "epoch": 1.8921985815602835,
+ "grad_norm": 2.320810556411743,
+ "learning_rate": 3.905672768847041e-06,
+ "loss": 0.4908,
+ "step": 4002
+ },
+ {
+ "epoch": 1.8926713947990543,
+ "grad_norm": 2.455162525177002,
+ "learning_rate": 3.905156848675455e-06,
+ "loss": 0.508,
+ "step": 4003
+ },
+ {
+ "epoch": 1.8931442080378251,
+ "grad_norm": 2.515921115875244,
+ "learning_rate": 3.904640841010915e-06,
+ "loss": 0.5318,
+ "step": 4004
+ },
+ {
+ "epoch": 1.8936170212765957,
+ "grad_norm": 2.7230770587921143,
+ "learning_rate": 3.904124745885548e-06,
+ "loss": 0.4793,
+ "step": 4005
+ },
+ {
+ "epoch": 1.8940898345153663,
+ "grad_norm": 2.519934892654419,
+ "learning_rate": 3.903608563331491e-06,
+ "loss": 0.5013,
+ "step": 4006
+ },
+ {
+ "epoch": 1.894562647754137,
+ "grad_norm": 2.719674587249756,
+ "learning_rate": 3.903092293380883e-06,
+ "loss": 0.516,
+ "step": 4007
+ },
+ {
+ "epoch": 1.8950354609929079,
+ "grad_norm": 3.2107343673706055,
+ "learning_rate": 3.902575936065869e-06,
+ "loss": 0.6297,
+ "step": 4008
+ },
+ {
+ "epoch": 1.8955082742316784,
+ "grad_norm": 2.9773149490356445,
+ "learning_rate": 3.902059491418603e-06,
+ "loss": 0.566,
+ "step": 4009
+ },
+ {
+ "epoch": 1.895981087470449,
+ "grad_norm": 2.6754770278930664,
+ "learning_rate": 3.90154295947124e-06,
+ "loss": 0.5187,
+ "step": 4010
+ },
+ {
+ "epoch": 1.8964539007092198,
+ "grad_norm": 2.457303762435913,
+ "learning_rate": 3.901026340255943e-06,
+ "loss": 0.5757,
+ "step": 4011
+ },
+ {
+ "epoch": 1.8969267139479906,
+ "grad_norm": 2.5944161415100098,
+ "learning_rate": 3.900509633804878e-06,
+ "loss": 0.5049,
+ "step": 4012
+ },
+ {
+ "epoch": 1.8973995271867612,
+ "grad_norm": 2.610445022583008,
+ "learning_rate": 3.89999284015022e-06,
+ "loss": 0.521,
+ "step": 4013
+ },
+ {
+ "epoch": 1.8978723404255318,
+ "grad_norm": 2.6949338912963867,
+ "learning_rate": 3.899475959324146e-06,
+ "loss": 0.5619,
+ "step": 4014
+ },
+ {
+ "epoch": 1.8983451536643026,
+ "grad_norm": 2.7889559268951416,
+ "learning_rate": 3.898958991358841e-06,
+ "loss": 0.5223,
+ "step": 4015
+ },
+ {
+ "epoch": 1.8988179669030734,
+ "grad_norm": 2.569265842437744,
+ "learning_rate": 3.898441936286493e-06,
+ "loss": 0.5724,
+ "step": 4016
+ },
+ {
+ "epoch": 1.899290780141844,
+ "grad_norm": 2.3567774295806885,
+ "learning_rate": 3.897924794139299e-06,
+ "loss": 0.4784,
+ "step": 4017
+ },
+ {
+ "epoch": 1.8997635933806145,
+ "grad_norm": 2.9176526069641113,
+ "learning_rate": 3.897407564949457e-06,
+ "loss": 0.646,
+ "step": 4018
+ },
+ {
+ "epoch": 1.9002364066193853,
+ "grad_norm": 2.7870090007781982,
+ "learning_rate": 3.896890248749174e-06,
+ "loss": 0.4922,
+ "step": 4019
+ },
+ {
+ "epoch": 1.900709219858156,
+ "grad_norm": 2.8310980796813965,
+ "learning_rate": 3.89637284557066e-06,
+ "loss": 0.4746,
+ "step": 4020
+ },
+ {
+ "epoch": 1.9011820330969267,
+ "grad_norm": 2.434915542602539,
+ "learning_rate": 3.895855355446131e-06,
+ "loss": 0.4537,
+ "step": 4021
+ },
+ {
+ "epoch": 1.9016548463356973,
+ "grad_norm": 3.0547034740448,
+ "learning_rate": 3.89533777840781e-06,
+ "loss": 0.6161,
+ "step": 4022
+ },
+ {
+ "epoch": 1.902127659574468,
+ "grad_norm": 3.416774272918701,
+ "learning_rate": 3.894820114487925e-06,
+ "loss": 0.5448,
+ "step": 4023
+ },
+ {
+ "epoch": 1.9026004728132389,
+ "grad_norm": 2.606951951980591,
+ "learning_rate": 3.894302363718707e-06,
+ "loss": 0.5501,
+ "step": 4024
+ },
+ {
+ "epoch": 1.9030732860520094,
+ "grad_norm": 3.082165002822876,
+ "learning_rate": 3.8937845261323945e-06,
+ "loss": 0.6035,
+ "step": 4025
+ },
+ {
+ "epoch": 1.90354609929078,
+ "grad_norm": 2.616093397140503,
+ "learning_rate": 3.893266601761231e-06,
+ "loss": 0.5294,
+ "step": 4026
+ },
+ {
+ "epoch": 1.9040189125295508,
+ "grad_norm": 2.7141637802124023,
+ "learning_rate": 3.8927485906374654e-06,
+ "loss": 0.5481,
+ "step": 4027
+ },
+ {
+ "epoch": 1.9044917257683216,
+ "grad_norm": 2.5129404067993164,
+ "learning_rate": 3.892230492793352e-06,
+ "loss": 0.4958,
+ "step": 4028
+ },
+ {
+ "epoch": 1.9049645390070922,
+ "grad_norm": 2.703403949737549,
+ "learning_rate": 3.891712308261151e-06,
+ "loss": 0.4852,
+ "step": 4029
+ },
+ {
+ "epoch": 1.9054373522458627,
+ "grad_norm": 2.881058931350708,
+ "learning_rate": 3.891194037073127e-06,
+ "loss": 0.4662,
+ "step": 4030
+ },
+ {
+ "epoch": 1.9059101654846335,
+ "grad_norm": 3.216769218444824,
+ "learning_rate": 3.8906756792615505e-06,
+ "loss": 0.5076,
+ "step": 4031
+ },
+ {
+ "epoch": 1.9063829787234043,
+ "grad_norm": 2.442265748977661,
+ "learning_rate": 3.890157234858697e-06,
+ "loss": 0.4748,
+ "step": 4032
+ },
+ {
+ "epoch": 1.906855791962175,
+ "grad_norm": 3.088672399520874,
+ "learning_rate": 3.889638703896849e-06,
+ "loss": 0.5729,
+ "step": 4033
+ },
+ {
+ "epoch": 1.9073286052009455,
+ "grad_norm": 2.9304986000061035,
+ "learning_rate": 3.889120086408291e-06,
+ "loss": 0.603,
+ "step": 4034
+ },
+ {
+ "epoch": 1.9078014184397163,
+ "grad_norm": 2.686093807220459,
+ "learning_rate": 3.888601382425318e-06,
+ "loss": 0.4978,
+ "step": 4035
+ },
+ {
+ "epoch": 1.908274231678487,
+ "grad_norm": 2.5668389797210693,
+ "learning_rate": 3.888082591980225e-06,
+ "loss": 0.5086,
+ "step": 4036
+ },
+ {
+ "epoch": 1.9087470449172577,
+ "grad_norm": 2.530996561050415,
+ "learning_rate": 3.887563715105315e-06,
+ "loss": 0.4678,
+ "step": 4037
+ },
+ {
+ "epoch": 1.9092198581560282,
+ "grad_norm": 3.043342351913452,
+ "learning_rate": 3.887044751832897e-06,
+ "loss": 0.5452,
+ "step": 4038
+ },
+ {
+ "epoch": 1.909692671394799,
+ "grad_norm": 2.799734115600586,
+ "learning_rate": 3.886525702195284e-06,
+ "loss": 0.5265,
+ "step": 4039
+ },
+ {
+ "epoch": 1.9101654846335698,
+ "grad_norm": 2.890022039413452,
+ "learning_rate": 3.886006566224796e-06,
+ "loss": 0.4634,
+ "step": 4040
+ },
+ {
+ "epoch": 1.9106382978723404,
+ "grad_norm": 2.6804237365722656,
+ "learning_rate": 3.8854873439537555e-06,
+ "loss": 0.5031,
+ "step": 4041
+ },
+ {
+ "epoch": 1.911111111111111,
+ "grad_norm": 2.43038272857666,
+ "learning_rate": 3.884968035414495e-06,
+ "loss": 0.5098,
+ "step": 4042
+ },
+ {
+ "epoch": 1.9115839243498818,
+ "grad_norm": 2.589583396911621,
+ "learning_rate": 3.884448640639346e-06,
+ "loss": 0.498,
+ "step": 4043
+ },
+ {
+ "epoch": 1.9120567375886526,
+ "grad_norm": 2.4565231800079346,
+ "learning_rate": 3.8839291596606524e-06,
+ "loss": 0.4318,
+ "step": 4044
+ },
+ {
+ "epoch": 1.9125295508274232,
+ "grad_norm": 2.66762638092041,
+ "learning_rate": 3.8834095925107575e-06,
+ "loss": 0.5441,
+ "step": 4045
+ },
+ {
+ "epoch": 1.9130023640661937,
+ "grad_norm": 2.7334461212158203,
+ "learning_rate": 3.882889939222013e-06,
+ "loss": 0.5209,
+ "step": 4046
+ },
+ {
+ "epoch": 1.9134751773049645,
+ "grad_norm": 2.6398537158966064,
+ "learning_rate": 3.8823701998267765e-06,
+ "loss": 0.4874,
+ "step": 4047
+ },
+ {
+ "epoch": 1.9139479905437353,
+ "grad_norm": 2.82405161857605,
+ "learning_rate": 3.881850374357409e-06,
+ "loss": 0.4519,
+ "step": 4048
+ },
+ {
+ "epoch": 1.914420803782506,
+ "grad_norm": 2.7552523612976074,
+ "learning_rate": 3.8813304628462776e-06,
+ "loss": 0.547,
+ "step": 4049
+ },
+ {
+ "epoch": 1.9148936170212765,
+ "grad_norm": 2.5287928581237793,
+ "learning_rate": 3.880810465325755e-06,
+ "loss": 0.5226,
+ "step": 4050
+ },
+ {
+ "epoch": 1.9153664302600473,
+ "grad_norm": 2.7597358226776123,
+ "learning_rate": 3.88029038182822e-06,
+ "loss": 0.5171,
+ "step": 4051
+ },
+ {
+ "epoch": 1.915839243498818,
+ "grad_norm": 2.563899278640747,
+ "learning_rate": 3.879770212386055e-06,
+ "loss": 0.4911,
+ "step": 4052
+ },
+ {
+ "epoch": 1.9163120567375886,
+ "grad_norm": 2.499404191970825,
+ "learning_rate": 3.879249957031649e-06,
+ "loss": 0.5072,
+ "step": 4053
+ },
+ {
+ "epoch": 1.9167848699763592,
+ "grad_norm": 2.817713499069214,
+ "learning_rate": 3.878729615797396e-06,
+ "loss": 0.5452,
+ "step": 4054
+ },
+ {
+ "epoch": 1.91725768321513,
+ "grad_norm": 2.7152490615844727,
+ "learning_rate": 3.878209188715696e-06,
+ "loss": 0.4917,
+ "step": 4055
+ },
+ {
+ "epoch": 1.9177304964539008,
+ "grad_norm": 2.384265661239624,
+ "learning_rate": 3.877688675818953e-06,
+ "loss": 0.4823,
+ "step": 4056
+ },
+ {
+ "epoch": 1.9182033096926714,
+ "grad_norm": 2.61059308052063,
+ "learning_rate": 3.877168077139577e-06,
+ "loss": 0.478,
+ "step": 4057
+ },
+ {
+ "epoch": 1.918676122931442,
+ "grad_norm": 2.6107938289642334,
+ "learning_rate": 3.8766473927099824e-06,
+ "loss": 0.5202,
+ "step": 4058
+ },
+ {
+ "epoch": 1.9191489361702128,
+ "grad_norm": 2.2339766025543213,
+ "learning_rate": 3.876126622562592e-06,
+ "loss": 0.547,
+ "step": 4059
+ },
+ {
+ "epoch": 1.9196217494089836,
+ "grad_norm": 2.4324610233306885,
+ "learning_rate": 3.8756057667298304e-06,
+ "loss": 0.5333,
+ "step": 4060
+ },
+ {
+ "epoch": 1.9200945626477541,
+ "grad_norm": 2.5521230697631836,
+ "learning_rate": 3.875084825244131e-06,
+ "loss": 0.5503,
+ "step": 4061
+ },
+ {
+ "epoch": 1.9205673758865247,
+ "grad_norm": 2.6985747814178467,
+ "learning_rate": 3.874563798137928e-06,
+ "loss": 0.4944,
+ "step": 4062
+ },
+ {
+ "epoch": 1.9210401891252955,
+ "grad_norm": 2.422332525253296,
+ "learning_rate": 3.874042685443664e-06,
+ "loss": 0.4807,
+ "step": 4063
+ },
+ {
+ "epoch": 1.9215130023640663,
+ "grad_norm": 2.914553165435791,
+ "learning_rate": 3.873521487193788e-06,
+ "loss": 0.4439,
+ "step": 4064
+ },
+ {
+ "epoch": 1.9219858156028369,
+ "grad_norm": 2.8098697662353516,
+ "learning_rate": 3.873000203420752e-06,
+ "loss": 0.5433,
+ "step": 4065
+ },
+ {
+ "epoch": 1.9224586288416075,
+ "grad_norm": 2.6124703884124756,
+ "learning_rate": 3.872478834157013e-06,
+ "loss": 0.4812,
+ "step": 4066
+ },
+ {
+ "epoch": 1.9229314420803783,
+ "grad_norm": 2.511059522628784,
+ "learning_rate": 3.871957379435035e-06,
+ "loss": 0.4666,
+ "step": 4067
+ },
+ {
+ "epoch": 1.923404255319149,
+ "grad_norm": 2.950542688369751,
+ "learning_rate": 3.871435839287287e-06,
+ "loss": 0.5687,
+ "step": 4068
+ },
+ {
+ "epoch": 1.9238770685579196,
+ "grad_norm": 2.4969422817230225,
+ "learning_rate": 3.870914213746243e-06,
+ "loss": 0.5235,
+ "step": 4069
+ },
+ {
+ "epoch": 1.9243498817966902,
+ "grad_norm": 2.512152910232544,
+ "learning_rate": 3.870392502844382e-06,
+ "loss": 0.4524,
+ "step": 4070
+ },
+ {
+ "epoch": 1.924822695035461,
+ "grad_norm": 3.0212557315826416,
+ "learning_rate": 3.86987070661419e-06,
+ "loss": 0.4868,
+ "step": 4071
+ },
+ {
+ "epoch": 1.9252955082742318,
+ "grad_norm": 2.8949966430664062,
+ "learning_rate": 3.869348825088154e-06,
+ "loss": 0.5556,
+ "step": 4072
+ },
+ {
+ "epoch": 1.9257683215130024,
+ "grad_norm": 2.402043581008911,
+ "learning_rate": 3.868826858298772e-06,
+ "loss": 0.5307,
+ "step": 4073
+ },
+ {
+ "epoch": 1.926241134751773,
+ "grad_norm": 2.980992078781128,
+ "learning_rate": 3.868304806278543e-06,
+ "loss": 0.6313,
+ "step": 4074
+ },
+ {
+ "epoch": 1.9267139479905437,
+ "grad_norm": 2.7140514850616455,
+ "learning_rate": 3.867782669059975e-06,
+ "loss": 0.5359,
+ "step": 4075
+ },
+ {
+ "epoch": 1.9271867612293145,
+ "grad_norm": 2.499631643295288,
+ "learning_rate": 3.867260446675577e-06,
+ "loss": 0.4873,
+ "step": 4076
+ },
+ {
+ "epoch": 1.9276595744680851,
+ "grad_norm": 2.915583610534668,
+ "learning_rate": 3.866738139157866e-06,
+ "loss": 0.5736,
+ "step": 4077
+ },
+ {
+ "epoch": 1.9281323877068557,
+ "grad_norm": 2.4231131076812744,
+ "learning_rate": 3.866215746539363e-06,
+ "loss": 0.5096,
+ "step": 4078
+ },
+ {
+ "epoch": 1.9286052009456265,
+ "grad_norm": 2.360074996948242,
+ "learning_rate": 3.865693268852599e-06,
+ "loss": 0.4907,
+ "step": 4079
+ },
+ {
+ "epoch": 1.9290780141843973,
+ "grad_norm": 2.5410032272338867,
+ "learning_rate": 3.865170706130101e-06,
+ "loss": 0.473,
+ "step": 4080
+ },
+ {
+ "epoch": 1.9295508274231679,
+ "grad_norm": 2.780090808868408,
+ "learning_rate": 3.86464805840441e-06,
+ "loss": 0.5213,
+ "step": 4081
+ },
+ {
+ "epoch": 1.9300236406619384,
+ "grad_norm": 2.7318382263183594,
+ "learning_rate": 3.864125325708068e-06,
+ "loss": 0.5617,
+ "step": 4082
+ },
+ {
+ "epoch": 1.9304964539007092,
+ "grad_norm": 2.76509165763855,
+ "learning_rate": 3.863602508073623e-06,
+ "loss": 0.52,
+ "step": 4083
+ },
+ {
+ "epoch": 1.93096926713948,
+ "grad_norm": 2.8041110038757324,
+ "learning_rate": 3.863079605533631e-06,
+ "loss": 0.5343,
+ "step": 4084
+ },
+ {
+ "epoch": 1.9314420803782506,
+ "grad_norm": 2.4462404251098633,
+ "learning_rate": 3.862556618120647e-06,
+ "loss": 0.4657,
+ "step": 4085
+ },
+ {
+ "epoch": 1.9319148936170212,
+ "grad_norm": 2.460864305496216,
+ "learning_rate": 3.862033545867238e-06,
+ "loss": 0.517,
+ "step": 4086
+ },
+ {
+ "epoch": 1.932387706855792,
+ "grad_norm": 2.6480276584625244,
+ "learning_rate": 3.8615103888059715e-06,
+ "loss": 0.4702,
+ "step": 4087
+ },
+ {
+ "epoch": 1.9328605200945628,
+ "grad_norm": 2.7175381183624268,
+ "learning_rate": 3.860987146969424e-06,
+ "loss": 0.5073,
+ "step": 4088
+ },
+ {
+ "epoch": 1.9333333333333333,
+ "grad_norm": 2.4963486194610596,
+ "learning_rate": 3.860463820390175e-06,
+ "loss": 0.4491,
+ "step": 4089
+ },
+ {
+ "epoch": 1.933806146572104,
+ "grad_norm": 2.548135757446289,
+ "learning_rate": 3.8599404091008075e-06,
+ "loss": 0.5134,
+ "step": 4090
+ },
+ {
+ "epoch": 1.9342789598108747,
+ "grad_norm": 2.8693668842315674,
+ "learning_rate": 3.859416913133916e-06,
+ "loss": 0.5467,
+ "step": 4091
+ },
+ {
+ "epoch": 1.9347517730496455,
+ "grad_norm": 2.711273670196533,
+ "learning_rate": 3.858893332522092e-06,
+ "loss": 0.6287,
+ "step": 4092
+ },
+ {
+ "epoch": 1.935224586288416,
+ "grad_norm": 2.8604533672332764,
+ "learning_rate": 3.858369667297941e-06,
+ "loss": 0.5661,
+ "step": 4093
+ },
+ {
+ "epoch": 1.9356973995271867,
+ "grad_norm": 2.936988353729248,
+ "learning_rate": 3.857845917494066e-06,
+ "loss": 0.5311,
+ "step": 4094
+ },
+ {
+ "epoch": 1.9361702127659575,
+ "grad_norm": 2.414093494415283,
+ "learning_rate": 3.857322083143079e-06,
+ "loss": 0.505,
+ "step": 4095
+ },
+ {
+ "epoch": 1.9366430260047283,
+ "grad_norm": 2.5528934001922607,
+ "learning_rate": 3.856798164277599e-06,
+ "loss": 0.4759,
+ "step": 4096
+ },
+ {
+ "epoch": 1.9371158392434988,
+ "grad_norm": 2.592893600463867,
+ "learning_rate": 3.8562741609302456e-06,
+ "loss": 0.4932,
+ "step": 4097
+ },
+ {
+ "epoch": 1.9375886524822694,
+ "grad_norm": 2.9619107246398926,
+ "learning_rate": 3.855750073133648e-06,
+ "loss": 0.5563,
+ "step": 4098
+ },
+ {
+ "epoch": 1.9380614657210402,
+ "grad_norm": 2.864889621734619,
+ "learning_rate": 3.855225900920438e-06,
+ "loss": 0.5069,
+ "step": 4099
+ },
+ {
+ "epoch": 1.938534278959811,
+ "grad_norm": 2.3951032161712646,
+ "learning_rate": 3.854701644323253e-06,
+ "loss": 0.4883,
+ "step": 4100
+ },
+ {
+ "epoch": 1.9390070921985816,
+ "grad_norm": 2.6339633464813232,
+ "learning_rate": 3.854177303374737e-06,
+ "loss": 0.5207,
+ "step": 4101
+ },
+ {
+ "epoch": 1.9394799054373522,
+ "grad_norm": 2.6435508728027344,
+ "learning_rate": 3.853652878107539e-06,
+ "loss": 0.4679,
+ "step": 4102
+ },
+ {
+ "epoch": 1.939952718676123,
+ "grad_norm": 2.4635629653930664,
+ "learning_rate": 3.853128368554311e-06,
+ "loss": 0.5639,
+ "step": 4103
+ },
+ {
+ "epoch": 1.9404255319148938,
+ "grad_norm": 2.664635419845581,
+ "learning_rate": 3.852603774747714e-06,
+ "loss": 0.5697,
+ "step": 4104
+ },
+ {
+ "epoch": 1.9408983451536643,
+ "grad_norm": 2.7020363807678223,
+ "learning_rate": 3.8520790967204095e-06,
+ "loss": 0.5462,
+ "step": 4105
+ },
+ {
+ "epoch": 1.941371158392435,
+ "grad_norm": 3.529282331466675,
+ "learning_rate": 3.851554334505069e-06,
+ "loss": 0.54,
+ "step": 4106
+ },
+ {
+ "epoch": 1.9418439716312057,
+ "grad_norm": 2.7125768661499023,
+ "learning_rate": 3.851029488134367e-06,
+ "loss": 0.5355,
+ "step": 4107
+ },
+ {
+ "epoch": 1.9423167848699765,
+ "grad_norm": 2.5226643085479736,
+ "learning_rate": 3.850504557640981e-06,
+ "loss": 0.5106,
+ "step": 4108
+ },
+ {
+ "epoch": 1.942789598108747,
+ "grad_norm": 2.834352731704712,
+ "learning_rate": 3.8499795430575995e-06,
+ "loss": 0.6069,
+ "step": 4109
+ },
+ {
+ "epoch": 1.9432624113475176,
+ "grad_norm": 2.8484177589416504,
+ "learning_rate": 3.849454444416911e-06,
+ "loss": 0.5542,
+ "step": 4110
+ },
+ {
+ "epoch": 1.9437352245862884,
+ "grad_norm": 2.402539014816284,
+ "learning_rate": 3.848929261751612e-06,
+ "loss": 0.47,
+ "step": 4111
+ },
+ {
+ "epoch": 1.9442080378250592,
+ "grad_norm": 2.7010042667388916,
+ "learning_rate": 3.848403995094402e-06,
+ "loss": 0.5263,
+ "step": 4112
+ },
+ {
+ "epoch": 1.9446808510638298,
+ "grad_norm": 2.441689968109131,
+ "learning_rate": 3.847878644477988e-06,
+ "loss": 0.5607,
+ "step": 4113
+ },
+ {
+ "epoch": 1.9451536643026004,
+ "grad_norm": 2.5994722843170166,
+ "learning_rate": 3.847353209935081e-06,
+ "loss": 0.5103,
+ "step": 4114
+ },
+ {
+ "epoch": 1.9456264775413712,
+ "grad_norm": 2.452242136001587,
+ "learning_rate": 3.8468276914983975e-06,
+ "loss": 0.4409,
+ "step": 4115
+ },
+ {
+ "epoch": 1.946099290780142,
+ "grad_norm": 2.421023368835449,
+ "learning_rate": 3.84630208920066e-06,
+ "loss": 0.4429,
+ "step": 4116
+ },
+ {
+ "epoch": 1.9465721040189126,
+ "grad_norm": 2.696399688720703,
+ "learning_rate": 3.8457764030745945e-06,
+ "loss": 0.5352,
+ "step": 4117
+ },
+ {
+ "epoch": 1.9470449172576831,
+ "grad_norm": 2.3963489532470703,
+ "learning_rate": 3.845250633152933e-06,
+ "loss": 0.4505,
+ "step": 4118
+ },
+ {
+ "epoch": 1.947517730496454,
+ "grad_norm": 2.610649585723877,
+ "learning_rate": 3.8447247794684135e-06,
+ "loss": 0.501,
+ "step": 4119
+ },
+ {
+ "epoch": 1.9479905437352247,
+ "grad_norm": 2.740412712097168,
+ "learning_rate": 3.8441988420537775e-06,
+ "loss": 0.5362,
+ "step": 4120
+ },
+ {
+ "epoch": 1.9484633569739953,
+ "grad_norm": 2.2614004611968994,
+ "learning_rate": 3.8436728209417755e-06,
+ "loss": 0.4199,
+ "step": 4121
+ },
+ {
+ "epoch": 1.9489361702127659,
+ "grad_norm": 3.0683481693267822,
+ "learning_rate": 3.843146716165158e-06,
+ "loss": 0.5248,
+ "step": 4122
+ },
+ {
+ "epoch": 1.9494089834515367,
+ "grad_norm": 3.005174398422241,
+ "learning_rate": 3.842620527756684e-06,
+ "loss": 0.5246,
+ "step": 4123
+ },
+ {
+ "epoch": 1.9498817966903075,
+ "grad_norm": 2.672896385192871,
+ "learning_rate": 3.842094255749117e-06,
+ "loss": 0.5586,
+ "step": 4124
+ },
+ {
+ "epoch": 1.950354609929078,
+ "grad_norm": 2.5481197834014893,
+ "learning_rate": 3.8415679001752255e-06,
+ "loss": 0.5061,
+ "step": 4125
+ },
+ {
+ "epoch": 1.9508274231678486,
+ "grad_norm": 2.515789270401001,
+ "learning_rate": 3.8410414610677835e-06,
+ "loss": 0.4645,
+ "step": 4126
+ },
+ {
+ "epoch": 1.9513002364066194,
+ "grad_norm": 2.7236077785491943,
+ "learning_rate": 3.84051493845957e-06,
+ "loss": 0.5623,
+ "step": 4127
+ },
+ {
+ "epoch": 1.9517730496453902,
+ "grad_norm": 2.6252009868621826,
+ "learning_rate": 3.839988332383369e-06,
+ "loss": 0.5078,
+ "step": 4128
+ },
+ {
+ "epoch": 1.9522458628841608,
+ "grad_norm": 2.719196081161499,
+ "learning_rate": 3.83946164287197e-06,
+ "loss": 0.5481,
+ "step": 4129
+ },
+ {
+ "epoch": 1.9527186761229314,
+ "grad_norm": 2.484163284301758,
+ "learning_rate": 3.838934869958169e-06,
+ "loss": 0.5332,
+ "step": 4130
+ },
+ {
+ "epoch": 1.9531914893617022,
+ "grad_norm": 2.615382671356201,
+ "learning_rate": 3.838408013674764e-06,
+ "loss": 0.4742,
+ "step": 4131
+ },
+ {
+ "epoch": 1.953664302600473,
+ "grad_norm": 2.735321044921875,
+ "learning_rate": 3.83788107405456e-06,
+ "loss": 0.421,
+ "step": 4132
+ },
+ {
+ "epoch": 1.9541371158392435,
+ "grad_norm": 2.892652750015259,
+ "learning_rate": 3.837354051130369e-06,
+ "loss": 0.5326,
+ "step": 4133
+ },
+ {
+ "epoch": 1.9546099290780141,
+ "grad_norm": 2.6800546646118164,
+ "learning_rate": 3.8368269449350055e-06,
+ "loss": 0.5041,
+ "step": 4134
+ },
+ {
+ "epoch": 1.955082742316785,
+ "grad_norm": 2.362470865249634,
+ "learning_rate": 3.836299755501289e-06,
+ "loss": 0.4697,
+ "step": 4135
+ },
+ {
+ "epoch": 1.9555555555555557,
+ "grad_norm": 2.3855135440826416,
+ "learning_rate": 3.835772482862047e-06,
+ "loss": 0.5148,
+ "step": 4136
+ },
+ {
+ "epoch": 1.9560283687943263,
+ "grad_norm": 2.3338418006896973,
+ "learning_rate": 3.83524512705011e-06,
+ "loss": 0.4643,
+ "step": 4137
+ },
+ {
+ "epoch": 1.9565011820330969,
+ "grad_norm": 2.261355400085449,
+ "learning_rate": 3.834717688098313e-06,
+ "loss": 0.5573,
+ "step": 4138
+ },
+ {
+ "epoch": 1.9569739952718677,
+ "grad_norm": 2.8166391849517822,
+ "learning_rate": 3.834190166039498e-06,
+ "loss": 0.4868,
+ "step": 4139
+ },
+ {
+ "epoch": 1.9574468085106385,
+ "grad_norm": 2.4155869483947754,
+ "learning_rate": 3.833662560906512e-06,
+ "loss": 0.4923,
+ "step": 4140
+ },
+ {
+ "epoch": 1.957919621749409,
+ "grad_norm": 2.3977696895599365,
+ "learning_rate": 3.833134872732206e-06,
+ "loss": 0.5106,
+ "step": 4141
+ },
+ {
+ "epoch": 1.9583924349881796,
+ "grad_norm": 2.9541378021240234,
+ "learning_rate": 3.832607101549438e-06,
+ "loss": 0.4683,
+ "step": 4142
+ },
+ {
+ "epoch": 1.9588652482269504,
+ "grad_norm": 2.5862700939178467,
+ "learning_rate": 3.832079247391068e-06,
+ "loss": 0.4453,
+ "step": 4143
+ },
+ {
+ "epoch": 1.9593380614657212,
+ "grad_norm": 2.7459371089935303,
+ "learning_rate": 3.8315513102899644e-06,
+ "loss": 0.5511,
+ "step": 4144
+ },
+ {
+ "epoch": 1.9598108747044918,
+ "grad_norm": 2.904869556427002,
+ "learning_rate": 3.831023290279e-06,
+ "loss": 0.5348,
+ "step": 4145
+ },
+ {
+ "epoch": 1.9602836879432624,
+ "grad_norm": 3.092846632003784,
+ "learning_rate": 3.830495187391051e-06,
+ "loss": 0.5664,
+ "step": 4146
+ },
+ {
+ "epoch": 1.9607565011820332,
+ "grad_norm": 3.2838528156280518,
+ "learning_rate": 3.829967001659001e-06,
+ "loss": 0.5115,
+ "step": 4147
+ },
+ {
+ "epoch": 1.961229314420804,
+ "grad_norm": 2.7799549102783203,
+ "learning_rate": 3.829438733115738e-06,
+ "loss": 0.5145,
+ "step": 4148
+ },
+ {
+ "epoch": 1.9617021276595743,
+ "grad_norm": 2.436084270477295,
+ "learning_rate": 3.828910381794154e-06,
+ "loss": 0.4718,
+ "step": 4149
+ },
+ {
+ "epoch": 1.962174940898345,
+ "grad_norm": 2.6662371158599854,
+ "learning_rate": 3.828381947727148e-06,
+ "loss": 0.6129,
+ "step": 4150
+ },
+ {
+ "epoch": 1.962647754137116,
+ "grad_norm": 2.937000036239624,
+ "learning_rate": 3.827853430947622e-06,
+ "loss": 0.522,
+ "step": 4151
+ },
+ {
+ "epoch": 1.9631205673758865,
+ "grad_norm": 2.5737369060516357,
+ "learning_rate": 3.827324831488486e-06,
+ "loss": 0.4916,
+ "step": 4152
+ },
+ {
+ "epoch": 1.963593380614657,
+ "grad_norm": 2.70232892036438,
+ "learning_rate": 3.826796149382653e-06,
+ "loss": 0.4726,
+ "step": 4153
+ },
+ {
+ "epoch": 1.9640661938534278,
+ "grad_norm": 2.6899707317352295,
+ "learning_rate": 3.826267384663042e-06,
+ "loss": 0.529,
+ "step": 4154
+ },
+ {
+ "epoch": 1.9645390070921986,
+ "grad_norm": 2.6142728328704834,
+ "learning_rate": 3.825738537362575e-06,
+ "loss": 0.4999,
+ "step": 4155
+ },
+ {
+ "epoch": 1.9650118203309692,
+ "grad_norm": 2.43949818611145,
+ "learning_rate": 3.825209607514183e-06,
+ "loss": 0.5035,
+ "step": 4156
+ },
+ {
+ "epoch": 1.9654846335697398,
+ "grad_norm": 2.3735458850860596,
+ "learning_rate": 3.824680595150801e-06,
+ "loss": 0.4779,
+ "step": 4157
+ },
+ {
+ "epoch": 1.9659574468085106,
+ "grad_norm": 2.444307565689087,
+ "learning_rate": 3.824151500305365e-06,
+ "loss": 0.4825,
+ "step": 4158
+ },
+ {
+ "epoch": 1.9664302600472814,
+ "grad_norm": 2.8219668865203857,
+ "learning_rate": 3.8236223230108224e-06,
+ "loss": 0.5354,
+ "step": 4159
+ },
+ {
+ "epoch": 1.966903073286052,
+ "grad_norm": 2.720721483230591,
+ "learning_rate": 3.823093063300121e-06,
+ "loss": 0.5064,
+ "step": 4160
+ },
+ {
+ "epoch": 1.9673758865248225,
+ "grad_norm": 2.324190616607666,
+ "learning_rate": 3.822563721206217e-06,
+ "loss": 0.5348,
+ "step": 4161
+ },
+ {
+ "epoch": 1.9678486997635933,
+ "grad_norm": 2.702155351638794,
+ "learning_rate": 3.8220342967620695e-06,
+ "loss": 0.5388,
+ "step": 4162
+ },
+ {
+ "epoch": 1.9683215130023641,
+ "grad_norm": 2.4956369400024414,
+ "learning_rate": 3.821504790000642e-06,
+ "loss": 0.5071,
+ "step": 4163
+ },
+ {
+ "epoch": 1.9687943262411347,
+ "grad_norm": 2.568039655685425,
+ "learning_rate": 3.820975200954906e-06,
+ "loss": 0.5133,
+ "step": 4164
+ },
+ {
+ "epoch": 1.9692671394799053,
+ "grad_norm": 2.810868978500366,
+ "learning_rate": 3.820445529657837e-06,
+ "loss": 0.4856,
+ "step": 4165
+ },
+ {
+ "epoch": 1.969739952718676,
+ "grad_norm": 2.66365647315979,
+ "learning_rate": 3.819915776142415e-06,
+ "loss": 0.5235,
+ "step": 4166
+ },
+ {
+ "epoch": 1.9702127659574469,
+ "grad_norm": 2.2982139587402344,
+ "learning_rate": 3.8193859404416265e-06,
+ "loss": 0.4361,
+ "step": 4167
+ },
+ {
+ "epoch": 1.9706855791962175,
+ "grad_norm": 2.585672378540039,
+ "learning_rate": 3.818856022588458e-06,
+ "loss": 0.4842,
+ "step": 4168
+ },
+ {
+ "epoch": 1.971158392434988,
+ "grad_norm": 2.57857346534729,
+ "learning_rate": 3.81832602261591e-06,
+ "loss": 0.5249,
+ "step": 4169
+ },
+ {
+ "epoch": 1.9716312056737588,
+ "grad_norm": 2.6947224140167236,
+ "learning_rate": 3.817795940556981e-06,
+ "loss": 0.5234,
+ "step": 4170
+ },
+ {
+ "epoch": 1.9721040189125296,
+ "grad_norm": 2.7453415393829346,
+ "learning_rate": 3.8172657764446764e-06,
+ "loss": 0.5219,
+ "step": 4171
+ },
+ {
+ "epoch": 1.9725768321513002,
+ "grad_norm": 8.424073219299316,
+ "learning_rate": 3.816735530312009e-06,
+ "loss": 0.5162,
+ "step": 4172
+ },
+ {
+ "epoch": 1.9730496453900708,
+ "grad_norm": 2.8229739665985107,
+ "learning_rate": 3.816205202191993e-06,
+ "loss": 0.4621,
+ "step": 4173
+ },
+ {
+ "epoch": 1.9735224586288416,
+ "grad_norm": 2.5969009399414062,
+ "learning_rate": 3.815674792117651e-06,
+ "loss": 0.5044,
+ "step": 4174
+ },
+ {
+ "epoch": 1.9739952718676124,
+ "grad_norm": 2.646024227142334,
+ "learning_rate": 3.815144300122009e-06,
+ "loss": 0.5094,
+ "step": 4175
+ },
+ {
+ "epoch": 1.974468085106383,
+ "grad_norm": 2.4950616359710693,
+ "learning_rate": 3.814613726238097e-06,
+ "loss": 0.4827,
+ "step": 4176
+ },
+ {
+ "epoch": 1.9749408983451535,
+ "grad_norm": 2.5636119842529297,
+ "learning_rate": 3.8140830704989535e-06,
+ "loss": 0.5241,
+ "step": 4177
+ },
+ {
+ "epoch": 1.9754137115839243,
+ "grad_norm": 2.7936553955078125,
+ "learning_rate": 3.813552332937619e-06,
+ "loss": 0.5344,
+ "step": 4178
+ },
+ {
+ "epoch": 1.9758865248226951,
+ "grad_norm": 2.8085341453552246,
+ "learning_rate": 3.8130215135871405e-06,
+ "loss": 0.5647,
+ "step": 4179
+ },
+ {
+ "epoch": 1.9763593380614657,
+ "grad_norm": 2.4776322841644287,
+ "learning_rate": 3.8124906124805694e-06,
+ "loss": 0.542,
+ "step": 4180
+ },
+ {
+ "epoch": 1.9768321513002363,
+ "grad_norm": 2.3227856159210205,
+ "learning_rate": 3.8119596296509635e-06,
+ "loss": 0.4618,
+ "step": 4181
+ },
+ {
+ "epoch": 1.977304964539007,
+ "grad_norm": 2.5157814025878906,
+ "learning_rate": 3.8114285651313848e-06,
+ "loss": 0.538,
+ "step": 4182
+ },
+ {
+ "epoch": 1.9777777777777779,
+ "grad_norm": 2.5630218982696533,
+ "learning_rate": 3.8108974189548987e-06,
+ "loss": 0.5254,
+ "step": 4183
+ },
+ {
+ "epoch": 1.9782505910165484,
+ "grad_norm": 2.703237533569336,
+ "learning_rate": 3.8103661911545787e-06,
+ "loss": 0.4859,
+ "step": 4184
+ },
+ {
+ "epoch": 1.978723404255319,
+ "grad_norm": 2.8808000087738037,
+ "learning_rate": 3.809834881763502e-06,
+ "loss": 0.5585,
+ "step": 4185
+ },
+ {
+ "epoch": 1.9791962174940898,
+ "grad_norm": 2.9047577381134033,
+ "learning_rate": 3.8093034908147507e-06,
+ "loss": 0.5022,
+ "step": 4186
+ },
+ {
+ "epoch": 1.9796690307328606,
+ "grad_norm": 2.7417640686035156,
+ "learning_rate": 3.8087720183414125e-06,
+ "loss": 0.5275,
+ "step": 4187
+ },
+ {
+ "epoch": 1.9801418439716312,
+ "grad_norm": 2.952012062072754,
+ "learning_rate": 3.8082404643765786e-06,
+ "loss": 0.543,
+ "step": 4188
+ },
+ {
+ "epoch": 1.9806146572104018,
+ "grad_norm": 2.538376569747925,
+ "learning_rate": 3.807708828953348e-06,
+ "loss": 0.4969,
+ "step": 4189
+ },
+ {
+ "epoch": 1.9810874704491725,
+ "grad_norm": 2.3476181030273438,
+ "learning_rate": 3.807177112104823e-06,
+ "loss": 0.4979,
+ "step": 4190
+ },
+ {
+ "epoch": 1.9815602836879433,
+ "grad_norm": 2.6480464935302734,
+ "learning_rate": 3.80664531386411e-06,
+ "loss": 0.4894,
+ "step": 4191
+ },
+ {
+ "epoch": 1.982033096926714,
+ "grad_norm": 2.792916774749756,
+ "learning_rate": 3.8061134342643235e-06,
+ "loss": 0.5468,
+ "step": 4192
+ },
+ {
+ "epoch": 1.9825059101654845,
+ "grad_norm": 2.368736743927002,
+ "learning_rate": 3.805581473338581e-06,
+ "loss": 0.4672,
+ "step": 4193
+ },
+ {
+ "epoch": 1.9829787234042553,
+ "grad_norm": 2.379084348678589,
+ "learning_rate": 3.8050494311200037e-06,
+ "loss": 0.4577,
+ "step": 4194
+ },
+ {
+ "epoch": 1.983451536643026,
+ "grad_norm": 2.722471237182617,
+ "learning_rate": 3.804517307641722e-06,
+ "loss": 0.4988,
+ "step": 4195
+ },
+ {
+ "epoch": 1.9839243498817967,
+ "grad_norm": 2.356649875640869,
+ "learning_rate": 3.8039851029368674e-06,
+ "loss": 0.4933,
+ "step": 4196
+ },
+ {
+ "epoch": 1.9843971631205672,
+ "grad_norm": 2.9182281494140625,
+ "learning_rate": 3.8034528170385776e-06,
+ "loss": 0.4873,
+ "step": 4197
+ },
+ {
+ "epoch": 1.984869976359338,
+ "grad_norm": 2.6232199668884277,
+ "learning_rate": 3.8029204499799976e-06,
+ "loss": 0.4425,
+ "step": 4198
+ },
+ {
+ "epoch": 1.9853427895981088,
+ "grad_norm": 2.667541980743408,
+ "learning_rate": 3.802388001794274e-06,
+ "loss": 0.5022,
+ "step": 4199
+ },
+ {
+ "epoch": 1.9858156028368794,
+ "grad_norm": 3.168470621109009,
+ "learning_rate": 3.8018554725145596e-06,
+ "loss": 0.5505,
+ "step": 4200
+ },
+ {
+ "epoch": 1.98628841607565,
+ "grad_norm": 2.716625452041626,
+ "learning_rate": 3.8013228621740132e-06,
+ "loss": 0.4937,
+ "step": 4201
+ },
+ {
+ "epoch": 1.9867612293144208,
+ "grad_norm": 2.3014442920684814,
+ "learning_rate": 3.800790170805799e-06,
+ "loss": 0.4734,
+ "step": 4202
+ },
+ {
+ "epoch": 1.9872340425531916,
+ "grad_norm": 2.9426841735839844,
+ "learning_rate": 3.8002573984430847e-06,
+ "loss": 0.4983,
+ "step": 4203
+ },
+ {
+ "epoch": 1.9877068557919622,
+ "grad_norm": 2.5598278045654297,
+ "learning_rate": 3.7997245451190435e-06,
+ "loss": 0.4834,
+ "step": 4204
+ },
+ {
+ "epoch": 1.9881796690307327,
+ "grad_norm": 2.86458420753479,
+ "learning_rate": 3.7991916108668538e-06,
+ "loss": 0.5613,
+ "step": 4205
+ },
+ {
+ "epoch": 1.9886524822695035,
+ "grad_norm": 2.842914342880249,
+ "learning_rate": 3.7986585957196997e-06,
+ "loss": 0.4951,
+ "step": 4206
+ },
+ {
+ "epoch": 1.9891252955082743,
+ "grad_norm": 3.1828150749206543,
+ "learning_rate": 3.7981254997107686e-06,
+ "loss": 0.5913,
+ "step": 4207
+ },
+ {
+ "epoch": 1.989598108747045,
+ "grad_norm": 2.5765931606292725,
+ "learning_rate": 3.7975923228732547e-06,
+ "loss": 0.5544,
+ "step": 4208
+ },
+ {
+ "epoch": 1.9900709219858155,
+ "grad_norm": 2.492234945297241,
+ "learning_rate": 3.797059065240357e-06,
+ "loss": 0.5046,
+ "step": 4209
+ },
+ {
+ "epoch": 1.9905437352245863,
+ "grad_norm": 2.870346784591675,
+ "learning_rate": 3.7965257268452795e-06,
+ "loss": 0.5354,
+ "step": 4210
+ },
+ {
+ "epoch": 1.991016548463357,
+ "grad_norm": 2.4989993572235107,
+ "learning_rate": 3.795992307721229e-06,
+ "loss": 0.4677,
+ "step": 4211
+ },
+ {
+ "epoch": 1.9914893617021276,
+ "grad_norm": 2.931114673614502,
+ "learning_rate": 3.7954588079014206e-06,
+ "loss": 0.5504,
+ "step": 4212
+ },
+ {
+ "epoch": 1.9919621749408982,
+ "grad_norm": 2.5247652530670166,
+ "learning_rate": 3.794925227419073e-06,
+ "loss": 0.4736,
+ "step": 4213
+ },
+ {
+ "epoch": 1.992434988179669,
+ "grad_norm": 2.6238436698913574,
+ "learning_rate": 3.794391566307409e-06,
+ "loss": 0.4591,
+ "step": 4214
+ },
+ {
+ "epoch": 1.9929078014184398,
+ "grad_norm": 2.654886245727539,
+ "learning_rate": 3.7938578245996584e-06,
+ "loss": 0.5149,
+ "step": 4215
+ },
+ {
+ "epoch": 1.9933806146572104,
+ "grad_norm": 2.509164810180664,
+ "learning_rate": 3.793324002329054e-06,
+ "loss": 0.4951,
+ "step": 4216
+ },
+ {
+ "epoch": 1.993853427895981,
+ "grad_norm": 2.909632921218872,
+ "learning_rate": 3.7927900995288345e-06,
+ "loss": 0.5131,
+ "step": 4217
+ },
+ {
+ "epoch": 1.9943262411347518,
+ "grad_norm": 2.4354615211486816,
+ "learning_rate": 3.7922561162322456e-06,
+ "loss": 0.4716,
+ "step": 4218
+ },
+ {
+ "epoch": 1.9947990543735226,
+ "grad_norm": 2.6514649391174316,
+ "learning_rate": 3.791722052472534e-06,
+ "loss": 0.5714,
+ "step": 4219
+ },
+ {
+ "epoch": 1.9952718676122931,
+ "grad_norm": 2.77089262008667,
+ "learning_rate": 3.791187908282954e-06,
+ "loss": 0.5736,
+ "step": 4220
+ },
+ {
+ "epoch": 1.9957446808510637,
+ "grad_norm": 2.7651021480560303,
+ "learning_rate": 3.7906536836967657e-06,
+ "loss": 0.4948,
+ "step": 4221
+ },
+ {
+ "epoch": 1.9962174940898345,
+ "grad_norm": 2.7536795139312744,
+ "learning_rate": 3.7901193787472306e-06,
+ "loss": 0.512,
+ "step": 4222
+ },
+ {
+ "epoch": 1.9966903073286053,
+ "grad_norm": 2.684893846511841,
+ "learning_rate": 3.78958499346762e-06,
+ "loss": 0.5118,
+ "step": 4223
+ },
+ {
+ "epoch": 1.9971631205673759,
+ "grad_norm": 2.7616753578186035,
+ "learning_rate": 3.7890505278912054e-06,
+ "loss": 0.4516,
+ "step": 4224
+ },
+ {
+ "epoch": 1.9976359338061465,
+ "grad_norm": 2.4731967449188232,
+ "learning_rate": 3.7885159820512666e-06,
+ "loss": 0.4736,
+ "step": 4225
+ },
+ {
+ "epoch": 1.9981087470449173,
+ "grad_norm": 2.366631031036377,
+ "learning_rate": 3.7879813559810884e-06,
+ "loss": 0.4999,
+ "step": 4226
+ },
+ {
+ "epoch": 1.998581560283688,
+ "grad_norm": 2.994624137878418,
+ "learning_rate": 3.7874466497139582e-06,
+ "loss": 0.5273,
+ "step": 4227
+ },
+ {
+ "epoch": 1.9990543735224586,
+ "grad_norm": 2.4499242305755615,
+ "learning_rate": 3.7869118632831712e-06,
+ "loss": 0.5761,
+ "step": 4228
+ },
+ {
+ "epoch": 1.9995271867612292,
+ "grad_norm": 2.3370113372802734,
+ "learning_rate": 3.7863769967220243e-06,
+ "loss": 0.4673,
+ "step": 4229
+ },
+ {
+ "epoch": 2.0,
+ "grad_norm": 3.1131203174591064,
+ "learning_rate": 3.7858420500638236e-06,
+ "loss": 0.5118,
+ "step": 4230
+ },
+ {
+ "epoch": 2.000472813238771,
+ "grad_norm": 2.2747561931610107,
+ "learning_rate": 3.785307023341876e-06,
+ "loss": 0.4166,
+ "step": 4231
+ },
+ {
+ "epoch": 2.000945626477541,
+ "grad_norm": 2.4347424507141113,
+ "learning_rate": 3.7847719165894963e-06,
+ "loss": 0.4161,
+ "step": 4232
+ },
+ {
+ "epoch": 2.001418439716312,
+ "grad_norm": 2.398805618286133,
+ "learning_rate": 3.784236729840003e-06,
+ "loss": 0.4652,
+ "step": 4233
+ },
+ {
+ "epoch": 2.0018912529550827,
+ "grad_norm": 2.1904916763305664,
+ "learning_rate": 3.783701463126719e-06,
+ "loss": 0.4554,
+ "step": 4234
+ },
+ {
+ "epoch": 2.0023640661938535,
+ "grad_norm": 2.237330913543701,
+ "learning_rate": 3.7831661164829735e-06,
+ "loss": 0.4471,
+ "step": 4235
+ },
+ {
+ "epoch": 2.002836879432624,
+ "grad_norm": 2.3656628131866455,
+ "learning_rate": 3.7826306899421016e-06,
+ "loss": 0.4052,
+ "step": 4236
+ },
+ {
+ "epoch": 2.0033096926713947,
+ "grad_norm": 2.615489959716797,
+ "learning_rate": 3.7820951835374405e-06,
+ "loss": 0.4847,
+ "step": 4237
+ },
+ {
+ "epoch": 2.0037825059101655,
+ "grad_norm": 2.453036308288574,
+ "learning_rate": 3.7815595973023347e-06,
+ "loss": 0.4672,
+ "step": 4238
+ },
+ {
+ "epoch": 2.0042553191489363,
+ "grad_norm": 2.537468671798706,
+ "learning_rate": 3.7810239312701306e-06,
+ "loss": 0.467,
+ "step": 4239
+ },
+ {
+ "epoch": 2.0047281323877066,
+ "grad_norm": 2.3321666717529297,
+ "learning_rate": 3.780488185474184e-06,
+ "loss": 0.3557,
+ "step": 4240
+ },
+ {
+ "epoch": 2.0052009456264774,
+ "grad_norm": 2.9051828384399414,
+ "learning_rate": 3.779952359947854e-06,
+ "loss": 0.5474,
+ "step": 4241
+ },
+ {
+ "epoch": 2.0056737588652482,
+ "grad_norm": 2.7458817958831787,
+ "learning_rate": 3.7794164547245015e-06,
+ "loss": 0.4659,
+ "step": 4242
+ },
+ {
+ "epoch": 2.006146572104019,
+ "grad_norm": 2.627046585083008,
+ "learning_rate": 3.778880469837497e-06,
+ "loss": 0.4179,
+ "step": 4243
+ },
+ {
+ "epoch": 2.0066193853427894,
+ "grad_norm": 2.4186174869537354,
+ "learning_rate": 3.7783444053202135e-06,
+ "loss": 0.3976,
+ "step": 4244
+ },
+ {
+ "epoch": 2.00709219858156,
+ "grad_norm": 3.109376907348633,
+ "learning_rate": 3.7778082612060296e-06,
+ "loss": 0.4095,
+ "step": 4245
+ },
+ {
+ "epoch": 2.007565011820331,
+ "grad_norm": 2.583376169204712,
+ "learning_rate": 3.7772720375283282e-06,
+ "loss": 0.4325,
+ "step": 4246
+ },
+ {
+ "epoch": 2.0080378250591018,
+ "grad_norm": 2.6199896335601807,
+ "learning_rate": 3.776735734320497e-06,
+ "loss": 0.4207,
+ "step": 4247
+ },
+ {
+ "epoch": 2.008510638297872,
+ "grad_norm": 2.545353651046753,
+ "learning_rate": 3.77619935161593e-06,
+ "loss": 0.4483,
+ "step": 4248
+ },
+ {
+ "epoch": 2.008983451536643,
+ "grad_norm": 2.770266056060791,
+ "learning_rate": 3.7756628894480263e-06,
+ "loss": 0.457,
+ "step": 4249
+ },
+ {
+ "epoch": 2.0094562647754137,
+ "grad_norm": 2.903254985809326,
+ "learning_rate": 3.7751263478501878e-06,
+ "loss": 0.4171,
+ "step": 4250
+ },
+ {
+ "epoch": 2.0099290780141845,
+ "grad_norm": 2.5576963424682617,
+ "learning_rate": 3.774589726855822e-06,
+ "loss": 0.3631,
+ "step": 4251
+ },
+ {
+ "epoch": 2.010401891252955,
+ "grad_norm": 3.7584285736083984,
+ "learning_rate": 3.7740530264983434e-06,
+ "loss": 0.4827,
+ "step": 4252
+ },
+ {
+ "epoch": 2.0108747044917257,
+ "grad_norm": 3.3116581439971924,
+ "learning_rate": 3.77351624681117e-06,
+ "loss": 0.5071,
+ "step": 4253
+ },
+ {
+ "epoch": 2.0113475177304965,
+ "grad_norm": 3.1370885372161865,
+ "learning_rate": 3.772979387827723e-06,
+ "loss": 0.4963,
+ "step": 4254
+ },
+ {
+ "epoch": 2.0118203309692673,
+ "grad_norm": 2.4832639694213867,
+ "learning_rate": 3.772442449581432e-06,
+ "loss": 0.4442,
+ "step": 4255
+ },
+ {
+ "epoch": 2.0122931442080376,
+ "grad_norm": 2.7645785808563232,
+ "learning_rate": 3.7719054321057293e-06,
+ "loss": 0.4572,
+ "step": 4256
+ },
+ {
+ "epoch": 2.0127659574468084,
+ "grad_norm": 2.7962236404418945,
+ "learning_rate": 3.7713683354340515e-06,
+ "loss": 0.4906,
+ "step": 4257
+ },
+ {
+ "epoch": 2.013238770685579,
+ "grad_norm": 2.647991895675659,
+ "learning_rate": 3.7708311595998425e-06,
+ "loss": 0.4027,
+ "step": 4258
+ },
+ {
+ "epoch": 2.01371158392435,
+ "grad_norm": 2.3780267238616943,
+ "learning_rate": 3.7702939046365504e-06,
+ "loss": 0.4285,
+ "step": 4259
+ },
+ {
+ "epoch": 2.0141843971631204,
+ "grad_norm": 2.5185933113098145,
+ "learning_rate": 3.7697565705776266e-06,
+ "loss": 0.4834,
+ "step": 4260
+ },
+ {
+ "epoch": 2.014657210401891,
+ "grad_norm": 2.432507276535034,
+ "learning_rate": 3.7692191574565294e-06,
+ "loss": 0.3695,
+ "step": 4261
+ },
+ {
+ "epoch": 2.015130023640662,
+ "grad_norm": 2.8010706901550293,
+ "learning_rate": 3.76868166530672e-06,
+ "loss": 0.478,
+ "step": 4262
+ },
+ {
+ "epoch": 2.0156028368794328,
+ "grad_norm": 2.32817006111145,
+ "learning_rate": 3.768144094161666e-06,
+ "loss": 0.4154,
+ "step": 4263
+ },
+ {
+ "epoch": 2.016075650118203,
+ "grad_norm": 3.062812328338623,
+ "learning_rate": 3.7676064440548405e-06,
+ "loss": 0.5015,
+ "step": 4264
+ },
+ {
+ "epoch": 2.016548463356974,
+ "grad_norm": 2.6129536628723145,
+ "learning_rate": 3.7670687150197194e-06,
+ "loss": 0.3843,
+ "step": 4265
+ },
+ {
+ "epoch": 2.0170212765957447,
+ "grad_norm": 2.838259696960449,
+ "learning_rate": 3.766530907089786e-06,
+ "loss": 0.4937,
+ "step": 4266
+ },
+ {
+ "epoch": 2.0174940898345155,
+ "grad_norm": 2.601203680038452,
+ "learning_rate": 3.7659930202985263e-06,
+ "loss": 0.4644,
+ "step": 4267
+ },
+ {
+ "epoch": 2.017966903073286,
+ "grad_norm": 2.5964133739471436,
+ "learning_rate": 3.7654550546794322e-06,
+ "loss": 0.4365,
+ "step": 4268
+ },
+ {
+ "epoch": 2.0184397163120567,
+ "grad_norm": 3.0028915405273438,
+ "learning_rate": 3.764917010266001e-06,
+ "loss": 0.434,
+ "step": 4269
+ },
+ {
+ "epoch": 2.0189125295508275,
+ "grad_norm": 2.719252586364746,
+ "learning_rate": 3.764378887091734e-06,
+ "loss": 0.4401,
+ "step": 4270
+ },
+ {
+ "epoch": 2.0193853427895982,
+ "grad_norm": 2.400254011154175,
+ "learning_rate": 3.7638406851901377e-06,
+ "loss": 0.4904,
+ "step": 4271
+ },
+ {
+ "epoch": 2.0198581560283686,
+ "grad_norm": 2.8015363216400146,
+ "learning_rate": 3.763302404594724e-06,
+ "loss": 0.4569,
+ "step": 4272
+ },
+ {
+ "epoch": 2.0203309692671394,
+ "grad_norm": 2.718416452407837,
+ "learning_rate": 3.762764045339009e-06,
+ "loss": 0.5124,
+ "step": 4273
+ },
+ {
+ "epoch": 2.02080378250591,
+ "grad_norm": 2.484049081802368,
+ "learning_rate": 3.762225607456514e-06,
+ "loss": 0.4255,
+ "step": 4274
+ },
+ {
+ "epoch": 2.021276595744681,
+ "grad_norm": 2.6377930641174316,
+ "learning_rate": 3.7616870909807645e-06,
+ "loss": 0.5044,
+ "step": 4275
+ },
+ {
+ "epoch": 2.0217494089834513,
+ "grad_norm": 2.8845038414001465,
+ "learning_rate": 3.7611484959452927e-06,
+ "loss": 0.4924,
+ "step": 4276
+ },
+ {
+ "epoch": 2.022222222222222,
+ "grad_norm": 2.5939974784851074,
+ "learning_rate": 3.7606098223836342e-06,
+ "loss": 0.4873,
+ "step": 4277
+ },
+ {
+ "epoch": 2.022695035460993,
+ "grad_norm": 2.499826431274414,
+ "learning_rate": 3.76007107032933e-06,
+ "loss": 0.4515,
+ "step": 4278
+ },
+ {
+ "epoch": 2.0231678486997637,
+ "grad_norm": 3.0318663120269775,
+ "learning_rate": 3.759532239815924e-06,
+ "loss": 0.4901,
+ "step": 4279
+ },
+ {
+ "epoch": 2.023640661938534,
+ "grad_norm": 2.857977867126465,
+ "learning_rate": 3.758993330876969e-06,
+ "loss": 0.4659,
+ "step": 4280
+ },
+ {
+ "epoch": 2.024113475177305,
+ "grad_norm": 2.47918438911438,
+ "learning_rate": 3.7584543435460196e-06,
+ "loss": 0.4323,
+ "step": 4281
+ },
+ {
+ "epoch": 2.0245862884160757,
+ "grad_norm": 2.6033785343170166,
+ "learning_rate": 3.757915277856637e-06,
+ "loss": 0.4437,
+ "step": 4282
+ },
+ {
+ "epoch": 2.0250591016548465,
+ "grad_norm": 2.799781322479248,
+ "learning_rate": 3.757376133842386e-06,
+ "loss": 0.4523,
+ "step": 4283
+ },
+ {
+ "epoch": 2.025531914893617,
+ "grad_norm": 2.6092529296875,
+ "learning_rate": 3.756836911536836e-06,
+ "loss": 0.3898,
+ "step": 4284
+ },
+ {
+ "epoch": 2.0260047281323876,
+ "grad_norm": 2.66229248046875,
+ "learning_rate": 3.7562976109735627e-06,
+ "loss": 0.4731,
+ "step": 4285
+ },
+ {
+ "epoch": 2.0264775413711584,
+ "grad_norm": 2.90142822265625,
+ "learning_rate": 3.7557582321861463e-06,
+ "loss": 0.4285,
+ "step": 4286
+ },
+ {
+ "epoch": 2.0269503546099292,
+ "grad_norm": 2.5138802528381348,
+ "learning_rate": 3.7552187752081707e-06,
+ "loss": 0.4467,
+ "step": 4287
+ },
+ {
+ "epoch": 2.0274231678486996,
+ "grad_norm": 3.0656235218048096,
+ "learning_rate": 3.754679240073226e-06,
+ "loss": 0.4718,
+ "step": 4288
+ },
+ {
+ "epoch": 2.0278959810874704,
+ "grad_norm": 2.9633383750915527,
+ "learning_rate": 3.754139626814907e-06,
+ "loss": 0.4741,
+ "step": 4289
+ },
+ {
+ "epoch": 2.028368794326241,
+ "grad_norm": 2.5925145149230957,
+ "learning_rate": 3.753599935466812e-06,
+ "loss": 0.4281,
+ "step": 4290
+ },
+ {
+ "epoch": 2.028841607565012,
+ "grad_norm": 2.837740659713745,
+ "learning_rate": 3.7530601660625456e-06,
+ "loss": 0.4757,
+ "step": 4291
+ },
+ {
+ "epoch": 2.0293144208037823,
+ "grad_norm": 2.3995790481567383,
+ "learning_rate": 3.752520318635718e-06,
+ "loss": 0.4148,
+ "step": 4292
+ },
+ {
+ "epoch": 2.029787234042553,
+ "grad_norm": 2.572601795196533,
+ "learning_rate": 3.7519803932199424e-06,
+ "loss": 0.4051,
+ "step": 4293
+ },
+ {
+ "epoch": 2.030260047281324,
+ "grad_norm": 2.6780295372009277,
+ "learning_rate": 3.751440389848837e-06,
+ "loss": 0.4626,
+ "step": 4294
+ },
+ {
+ "epoch": 2.0307328605200947,
+ "grad_norm": 2.8666839599609375,
+ "learning_rate": 3.7509003085560257e-06,
+ "loss": 0.4255,
+ "step": 4295
+ },
+ {
+ "epoch": 2.031205673758865,
+ "grad_norm": 2.4398207664489746,
+ "learning_rate": 3.750360149375138e-06,
+ "loss": 0.4235,
+ "step": 4296
+ },
+ {
+ "epoch": 2.031678486997636,
+ "grad_norm": 2.436840534210205,
+ "learning_rate": 3.7498199123398062e-06,
+ "loss": 0.3907,
+ "step": 4297
+ },
+ {
+ "epoch": 2.0321513002364067,
+ "grad_norm": 3.3945820331573486,
+ "learning_rate": 3.7492795974836683e-06,
+ "loss": 0.465,
+ "step": 4298
+ },
+ {
+ "epoch": 2.0326241134751775,
+ "grad_norm": 2.6693103313446045,
+ "learning_rate": 3.7487392048403678e-06,
+ "loss": 0.4948,
+ "step": 4299
+ },
+ {
+ "epoch": 2.033096926713948,
+ "grad_norm": 2.7642734050750732,
+ "learning_rate": 3.748198734443553e-06,
+ "loss": 0.4538,
+ "step": 4300
+ },
+ {
+ "epoch": 2.0335697399527186,
+ "grad_norm": 3.1436543464660645,
+ "learning_rate": 3.747658186326876e-06,
+ "loss": 0.5137,
+ "step": 4301
+ },
+ {
+ "epoch": 2.0340425531914894,
+ "grad_norm": 3.482678174972534,
+ "learning_rate": 3.7471175605239947e-06,
+ "loss": 0.4982,
+ "step": 4302
+ },
+ {
+ "epoch": 2.03451536643026,
+ "grad_norm": 2.712557077407837,
+ "learning_rate": 3.746576857068571e-06,
+ "loss": 0.4459,
+ "step": 4303
+ },
+ {
+ "epoch": 2.0349881796690306,
+ "grad_norm": 3.147440195083618,
+ "learning_rate": 3.7460360759942726e-06,
+ "loss": 0.5063,
+ "step": 4304
+ },
+ {
+ "epoch": 2.0354609929078014,
+ "grad_norm": 2.840672492980957,
+ "learning_rate": 3.7454952173347714e-06,
+ "loss": 0.5041,
+ "step": 4305
+ },
+ {
+ "epoch": 2.035933806146572,
+ "grad_norm": 2.584122657775879,
+ "learning_rate": 3.744954281123745e-06,
+ "loss": 0.4487,
+ "step": 4306
+ },
+ {
+ "epoch": 2.036406619385343,
+ "grad_norm": 2.9869542121887207,
+ "learning_rate": 3.7444132673948737e-06,
+ "loss": 0.479,
+ "step": 4307
+ },
+ {
+ "epoch": 2.0368794326241133,
+ "grad_norm": 2.478459358215332,
+ "learning_rate": 3.7438721761818446e-06,
+ "loss": 0.4636,
+ "step": 4308
+ },
+ {
+ "epoch": 2.037352245862884,
+ "grad_norm": 2.5524215698242188,
+ "learning_rate": 3.7433310075183504e-06,
+ "loss": 0.4601,
+ "step": 4309
+ },
+ {
+ "epoch": 2.037825059101655,
+ "grad_norm": 2.3709988594055176,
+ "learning_rate": 3.742789761438086e-06,
+ "loss": 0.4163,
+ "step": 4310
+ },
+ {
+ "epoch": 2.0382978723404257,
+ "grad_norm": 3.140355348587036,
+ "learning_rate": 3.742248437974752e-06,
+ "loss": 0.4433,
+ "step": 4311
+ },
+ {
+ "epoch": 2.038770685579196,
+ "grad_norm": 2.940948486328125,
+ "learning_rate": 3.741707037162055e-06,
+ "loss": 0.4299,
+ "step": 4312
+ },
+ {
+ "epoch": 2.039243498817967,
+ "grad_norm": 3.009157419204712,
+ "learning_rate": 3.7411655590337055e-06,
+ "loss": 0.463,
+ "step": 4313
+ },
+ {
+ "epoch": 2.0397163120567376,
+ "grad_norm": 2.672945737838745,
+ "learning_rate": 3.7406240036234185e-06,
+ "loss": 0.4696,
+ "step": 4314
+ },
+ {
+ "epoch": 2.0401891252955084,
+ "grad_norm": 2.745962142944336,
+ "learning_rate": 3.740082370964916e-06,
+ "loss": 0.4931,
+ "step": 4315
+ },
+ {
+ "epoch": 2.040661938534279,
+ "grad_norm": 2.3939316272735596,
+ "learning_rate": 3.7395406610919217e-06,
+ "loss": 0.4396,
+ "step": 4316
+ },
+ {
+ "epoch": 2.0411347517730496,
+ "grad_norm": 2.4364447593688965,
+ "learning_rate": 3.738998874038165e-06,
+ "loss": 0.4807,
+ "step": 4317
+ },
+ {
+ "epoch": 2.0416075650118204,
+ "grad_norm": 2.360489845275879,
+ "learning_rate": 3.738457009837381e-06,
+ "loss": 0.4426,
+ "step": 4318
+ },
+ {
+ "epoch": 2.042080378250591,
+ "grad_norm": 2.5494935512542725,
+ "learning_rate": 3.7379150685233108e-06,
+ "loss": 0.4189,
+ "step": 4319
+ },
+ {
+ "epoch": 2.0425531914893615,
+ "grad_norm": 2.635472059249878,
+ "learning_rate": 3.7373730501296963e-06,
+ "loss": 0.5014,
+ "step": 4320
+ },
+ {
+ "epoch": 2.0430260047281323,
+ "grad_norm": 2.4982943534851074,
+ "learning_rate": 3.7368309546902876e-06,
+ "loss": 0.4658,
+ "step": 4321
+ },
+ {
+ "epoch": 2.043498817966903,
+ "grad_norm": 2.692742109298706,
+ "learning_rate": 3.736288782238839e-06,
+ "loss": 0.4454,
+ "step": 4322
+ },
+ {
+ "epoch": 2.043971631205674,
+ "grad_norm": 2.6774091720581055,
+ "learning_rate": 3.7357465328091086e-06,
+ "loss": 0.5002,
+ "step": 4323
+ },
+ {
+ "epoch": 2.0444444444444443,
+ "grad_norm": 2.695138692855835,
+ "learning_rate": 3.735204206434861e-06,
+ "loss": 0.448,
+ "step": 4324
+ },
+ {
+ "epoch": 2.044917257683215,
+ "grad_norm": 2.5383570194244385,
+ "learning_rate": 3.7346618031498635e-06,
+ "loss": 0.4352,
+ "step": 4325
+ },
+ {
+ "epoch": 2.045390070921986,
+ "grad_norm": 2.267277240753174,
+ "learning_rate": 3.7341193229878886e-06,
+ "loss": 0.4162,
+ "step": 4326
+ },
+ {
+ "epoch": 2.0458628841607567,
+ "grad_norm": 2.6037328243255615,
+ "learning_rate": 3.733576765982715e-06,
+ "loss": 0.4471,
+ "step": 4327
+ },
+ {
+ "epoch": 2.046335697399527,
+ "grad_norm": 3.261385440826416,
+ "learning_rate": 3.7330341321681253e-06,
+ "loss": 0.4618,
+ "step": 4328
+ },
+ {
+ "epoch": 2.046808510638298,
+ "grad_norm": 2.440650463104248,
+ "learning_rate": 3.7324914215779072e-06,
+ "loss": 0.4476,
+ "step": 4329
+ },
+ {
+ "epoch": 2.0472813238770686,
+ "grad_norm": 2.5940682888031006,
+ "learning_rate": 3.731948634245853e-06,
+ "loss": 0.4389,
+ "step": 4330
+ },
+ {
+ "epoch": 2.0477541371158394,
+ "grad_norm": 2.7428150177001953,
+ "learning_rate": 3.7314057702057582e-06,
+ "loss": 0.4477,
+ "step": 4331
+ },
+ {
+ "epoch": 2.0482269503546098,
+ "grad_norm": 2.3546223640441895,
+ "learning_rate": 3.730862829491427e-06,
+ "loss": 0.4047,
+ "step": 4332
+ },
+ {
+ "epoch": 2.0486997635933806,
+ "grad_norm": 2.552422523498535,
+ "learning_rate": 3.7303198121366637e-06,
+ "loss": 0.4438,
+ "step": 4333
+ },
+ {
+ "epoch": 2.0491725768321514,
+ "grad_norm": 2.99226713180542,
+ "learning_rate": 3.729776718175281e-06,
+ "loss": 0.491,
+ "step": 4334
+ },
+ {
+ "epoch": 2.049645390070922,
+ "grad_norm": 3.2003321647644043,
+ "learning_rate": 3.7292335476410935e-06,
+ "loss": 0.5458,
+ "step": 4335
+ },
+ {
+ "epoch": 2.0501182033096925,
+ "grad_norm": 2.739847183227539,
+ "learning_rate": 3.7286903005679237e-06,
+ "loss": 0.4499,
+ "step": 4336
+ },
+ {
+ "epoch": 2.0505910165484633,
+ "grad_norm": 2.5917470455169678,
+ "learning_rate": 3.7281469769895963e-06,
+ "loss": 0.4714,
+ "step": 4337
+ },
+ {
+ "epoch": 2.051063829787234,
+ "grad_norm": 2.8029327392578125,
+ "learning_rate": 3.7276035769399422e-06,
+ "loss": 0.42,
+ "step": 4338
+ },
+ {
+ "epoch": 2.051536643026005,
+ "grad_norm": 2.484879493713379,
+ "learning_rate": 3.727060100452796e-06,
+ "loss": 0.4163,
+ "step": 4339
+ },
+ {
+ "epoch": 2.0520094562647753,
+ "grad_norm": 2.7126030921936035,
+ "learning_rate": 3.7265165475619973e-06,
+ "loss": 0.4112,
+ "step": 4340
+ },
+ {
+ "epoch": 2.052482269503546,
+ "grad_norm": 2.618267774581909,
+ "learning_rate": 3.7259729183013927e-06,
+ "loss": 0.4281,
+ "step": 4341
+ },
+ {
+ "epoch": 2.052955082742317,
+ "grad_norm": 2.703270673751831,
+ "learning_rate": 3.7254292127048293e-06,
+ "loss": 0.4437,
+ "step": 4342
+ },
+ {
+ "epoch": 2.0534278959810877,
+ "grad_norm": 2.429150104522705,
+ "learning_rate": 3.7248854308061623e-06,
+ "loss": 0.3971,
+ "step": 4343
+ },
+ {
+ "epoch": 2.053900709219858,
+ "grad_norm": 2.54354190826416,
+ "learning_rate": 3.7243415726392508e-06,
+ "loss": 0.4485,
+ "step": 4344
+ },
+ {
+ "epoch": 2.054373522458629,
+ "grad_norm": 2.9515016078948975,
+ "learning_rate": 3.723797638237957e-06,
+ "loss": 0.4386,
+ "step": 4345
+ },
+ {
+ "epoch": 2.0548463356973996,
+ "grad_norm": 2.9129958152770996,
+ "learning_rate": 3.7232536276361514e-06,
+ "loss": 0.4595,
+ "step": 4346
+ },
+ {
+ "epoch": 2.0553191489361704,
+ "grad_norm": 2.5397512912750244,
+ "learning_rate": 3.722709540867706e-06,
+ "loss": 0.3681,
+ "step": 4347
+ },
+ {
+ "epoch": 2.0557919621749408,
+ "grad_norm": 2.79884672164917,
+ "learning_rate": 3.722165377966499e-06,
+ "loss": 0.4576,
+ "step": 4348
+ },
+ {
+ "epoch": 2.0562647754137116,
+ "grad_norm": 2.669936180114746,
+ "learning_rate": 3.7216211389664137e-06,
+ "loss": 0.3692,
+ "step": 4349
+ },
+ {
+ "epoch": 2.0567375886524824,
+ "grad_norm": 2.512326240539551,
+ "learning_rate": 3.7210768239013355e-06,
+ "loss": 0.4554,
+ "step": 4350
+ },
+ {
+ "epoch": 2.057210401891253,
+ "grad_norm": 2.913693904876709,
+ "learning_rate": 3.7205324328051583e-06,
+ "loss": 0.5282,
+ "step": 4351
+ },
+ {
+ "epoch": 2.0576832151300235,
+ "grad_norm": 3.040891170501709,
+ "learning_rate": 3.719987965711778e-06,
+ "loss": 0.4778,
+ "step": 4352
+ },
+ {
+ "epoch": 2.0581560283687943,
+ "grad_norm": 2.7504117488861084,
+ "learning_rate": 3.7194434226550966e-06,
+ "loss": 0.4217,
+ "step": 4353
+ },
+ {
+ "epoch": 2.058628841607565,
+ "grad_norm": 2.5522971153259277,
+ "learning_rate": 3.718898803669021e-06,
+ "loss": 0.437,
+ "step": 4354
+ },
+ {
+ "epoch": 2.059101654846336,
+ "grad_norm": 2.8531908988952637,
+ "learning_rate": 3.718354108787461e-06,
+ "loss": 0.4251,
+ "step": 4355
+ },
+ {
+ "epoch": 2.0595744680851062,
+ "grad_norm": 2.5812065601348877,
+ "learning_rate": 3.7178093380443337e-06,
+ "loss": 0.4374,
+ "step": 4356
+ },
+ {
+ "epoch": 2.060047281323877,
+ "grad_norm": 2.627871513366699,
+ "learning_rate": 3.7172644914735583e-06,
+ "loss": 0.436,
+ "step": 4357
+ },
+ {
+ "epoch": 2.060520094562648,
+ "grad_norm": 2.7146239280700684,
+ "learning_rate": 3.7167195691090607e-06,
+ "loss": 0.4204,
+ "step": 4358
+ },
+ {
+ "epoch": 2.0609929078014186,
+ "grad_norm": 2.486483573913574,
+ "learning_rate": 3.7161745709847706e-06,
+ "loss": 0.4015,
+ "step": 4359
+ },
+ {
+ "epoch": 2.061465721040189,
+ "grad_norm": 2.866049289703369,
+ "learning_rate": 3.7156294971346226e-06,
+ "loss": 0.4087,
+ "step": 4360
+ },
+ {
+ "epoch": 2.06193853427896,
+ "grad_norm": 2.9345552921295166,
+ "learning_rate": 3.715084347592556e-06,
+ "loss": 0.5074,
+ "step": 4361
+ },
+ {
+ "epoch": 2.0624113475177306,
+ "grad_norm": 2.502455711364746,
+ "learning_rate": 3.7145391223925155e-06,
+ "loss": 0.469,
+ "step": 4362
+ },
+ {
+ "epoch": 2.0628841607565014,
+ "grad_norm": 2.6419875621795654,
+ "learning_rate": 3.713993821568449e-06,
+ "loss": 0.4493,
+ "step": 4363
+ },
+ {
+ "epoch": 2.0633569739952717,
+ "grad_norm": 3.812079429626465,
+ "learning_rate": 3.7134484451543114e-06,
+ "loss": 0.4764,
+ "step": 4364
+ },
+ {
+ "epoch": 2.0638297872340425,
+ "grad_norm": 2.581780195236206,
+ "learning_rate": 3.712902993184059e-06,
+ "loss": 0.3994,
+ "step": 4365
+ },
+ {
+ "epoch": 2.0643026004728133,
+ "grad_norm": 2.282508134841919,
+ "learning_rate": 3.712357465691656e-06,
+ "loss": 0.4252,
+ "step": 4366
+ },
+ {
+ "epoch": 2.064775413711584,
+ "grad_norm": 2.4727818965911865,
+ "learning_rate": 3.71181186271107e-06,
+ "loss": 0.4558,
+ "step": 4367
+ },
+ {
+ "epoch": 2.0652482269503545,
+ "grad_norm": 2.7661173343658447,
+ "learning_rate": 3.711266184276272e-06,
+ "loss": 0.505,
+ "step": 4368
+ },
+ {
+ "epoch": 2.0657210401891253,
+ "grad_norm": 2.6264543533325195,
+ "learning_rate": 3.71072043042124e-06,
+ "loss": 0.4297,
+ "step": 4369
+ },
+ {
+ "epoch": 2.066193853427896,
+ "grad_norm": 2.773699998855591,
+ "learning_rate": 3.7101746011799565e-06,
+ "loss": 0.4267,
+ "step": 4370
+ },
+ {
+ "epoch": 2.066666666666667,
+ "grad_norm": 2.686955213546753,
+ "learning_rate": 3.709628696586407e-06,
+ "loss": 0.4099,
+ "step": 4371
+ },
+ {
+ "epoch": 2.0671394799054372,
+ "grad_norm": 2.6066620349884033,
+ "learning_rate": 3.709082716674582e-06,
+ "loss": 0.4146,
+ "step": 4372
+ },
+ {
+ "epoch": 2.067612293144208,
+ "grad_norm": 2.7769250869750977,
+ "learning_rate": 3.7085366614784784e-06,
+ "loss": 0.4047,
+ "step": 4373
+ },
+ {
+ "epoch": 2.068085106382979,
+ "grad_norm": 2.4986939430236816,
+ "learning_rate": 3.7079905310320957e-06,
+ "loss": 0.4021,
+ "step": 4374
+ },
+ {
+ "epoch": 2.0685579196217496,
+ "grad_norm": 2.5456206798553467,
+ "learning_rate": 3.7074443253694402e-06,
+ "loss": 0.3569,
+ "step": 4375
+ },
+ {
+ "epoch": 2.06903073286052,
+ "grad_norm": 2.4079296588897705,
+ "learning_rate": 3.70689804452452e-06,
+ "loss": 0.4308,
+ "step": 4376
+ },
+ {
+ "epoch": 2.0695035460992908,
+ "grad_norm": 2.86014723777771,
+ "learning_rate": 3.7063516885313513e-06,
+ "loss": 0.4577,
+ "step": 4377
+ },
+ {
+ "epoch": 2.0699763593380616,
+ "grad_norm": 2.8025779724121094,
+ "learning_rate": 3.7058052574239523e-06,
+ "loss": 0.4615,
+ "step": 4378
+ },
+ {
+ "epoch": 2.0704491725768324,
+ "grad_norm": 2.902676820755005,
+ "learning_rate": 3.7052587512363475e-06,
+ "loss": 0.4765,
+ "step": 4379
+ },
+ {
+ "epoch": 2.0709219858156027,
+ "grad_norm": 2.814509391784668,
+ "learning_rate": 3.704712170002566e-06,
+ "loss": 0.434,
+ "step": 4380
+ },
+ {
+ "epoch": 2.0713947990543735,
+ "grad_norm": 2.7923502922058105,
+ "learning_rate": 3.704165513756639e-06,
+ "loss": 0.4626,
+ "step": 4381
+ },
+ {
+ "epoch": 2.0718676122931443,
+ "grad_norm": 2.6802031993865967,
+ "learning_rate": 3.703618782532606e-06,
+ "loss": 0.4835,
+ "step": 4382
+ },
+ {
+ "epoch": 2.072340425531915,
+ "grad_norm": 3.0963687896728516,
+ "learning_rate": 3.7030719763645085e-06,
+ "loss": 0.4813,
+ "step": 4383
+ },
+ {
+ "epoch": 2.0728132387706855,
+ "grad_norm": 2.5658695697784424,
+ "learning_rate": 3.7025250952863956e-06,
+ "loss": 0.4428,
+ "step": 4384
+ },
+ {
+ "epoch": 2.0732860520094563,
+ "grad_norm": 2.7738289833068848,
+ "learning_rate": 3.7019781393323167e-06,
+ "loss": 0.4376,
+ "step": 4385
+ },
+ {
+ "epoch": 2.073758865248227,
+ "grad_norm": 2.6446938514709473,
+ "learning_rate": 3.7014311085363303e-06,
+ "loss": 0.4208,
+ "step": 4386
+ },
+ {
+ "epoch": 2.0742316784869974,
+ "grad_norm": 2.7556118965148926,
+ "learning_rate": 3.7008840029324967e-06,
+ "loss": 0.3831,
+ "step": 4387
+ },
+ {
+ "epoch": 2.074704491725768,
+ "grad_norm": 2.573141574859619,
+ "learning_rate": 3.700336822554882e-06,
+ "loss": 0.4396,
+ "step": 4388
+ },
+ {
+ "epoch": 2.075177304964539,
+ "grad_norm": 2.762319803237915,
+ "learning_rate": 3.6997895674375566e-06,
+ "loss": 0.4579,
+ "step": 4389
+ },
+ {
+ "epoch": 2.07565011820331,
+ "grad_norm": 2.729780435562134,
+ "learning_rate": 3.699242237614596e-06,
+ "loss": 0.4262,
+ "step": 4390
+ },
+ {
+ "epoch": 2.0761229314420806,
+ "grad_norm": 2.657480001449585,
+ "learning_rate": 3.698694833120079e-06,
+ "loss": 0.4176,
+ "step": 4391
+ },
+ {
+ "epoch": 2.076595744680851,
+ "grad_norm": 2.8433303833007812,
+ "learning_rate": 3.6981473539880914e-06,
+ "loss": 0.457,
+ "step": 4392
+ },
+ {
+ "epoch": 2.0770685579196217,
+ "grad_norm": 2.819047212600708,
+ "learning_rate": 3.6975998002527225e-06,
+ "loss": 0.4244,
+ "step": 4393
+ },
+ {
+ "epoch": 2.0775413711583925,
+ "grad_norm": 2.6565003395080566,
+ "learning_rate": 3.697052171948064e-06,
+ "loss": 0.4384,
+ "step": 4394
+ },
+ {
+ "epoch": 2.078014184397163,
+ "grad_norm": 2.5795063972473145,
+ "learning_rate": 3.696504469108216e-06,
+ "loss": 0.4958,
+ "step": 4395
+ },
+ {
+ "epoch": 2.0784869976359337,
+ "grad_norm": 2.455730676651001,
+ "learning_rate": 3.6959566917672822e-06,
+ "loss": 0.4191,
+ "step": 4396
+ },
+ {
+ "epoch": 2.0789598108747045,
+ "grad_norm": 2.6706607341766357,
+ "learning_rate": 3.6954088399593684e-06,
+ "loss": 0.4709,
+ "step": 4397
+ },
+ {
+ "epoch": 2.0794326241134753,
+ "grad_norm": 2.3758466243743896,
+ "learning_rate": 3.694860913718589e-06,
+ "loss": 0.4231,
+ "step": 4398
+ },
+ {
+ "epoch": 2.079905437352246,
+ "grad_norm": 2.3488340377807617,
+ "learning_rate": 3.6943129130790583e-06,
+ "loss": 0.4321,
+ "step": 4399
+ },
+ {
+ "epoch": 2.0803782505910164,
+ "grad_norm": 2.6438148021698,
+ "learning_rate": 3.6937648380748996e-06,
+ "loss": 0.4907,
+ "step": 4400
+ },
+ {
+ "epoch": 2.0808510638297872,
+ "grad_norm": 2.9826784133911133,
+ "learning_rate": 3.6932166887402395e-06,
+ "loss": 0.4404,
+ "step": 4401
+ },
+ {
+ "epoch": 2.081323877068558,
+ "grad_norm": 2.5203495025634766,
+ "learning_rate": 3.6926684651092076e-06,
+ "loss": 0.4337,
+ "step": 4402
+ },
+ {
+ "epoch": 2.0817966903073284,
+ "grad_norm": 2.7704148292541504,
+ "learning_rate": 3.692120167215941e-06,
+ "loss": 0.4195,
+ "step": 4403
+ },
+ {
+ "epoch": 2.082269503546099,
+ "grad_norm": 2.879430055618286,
+ "learning_rate": 3.6915717950945782e-06,
+ "loss": 0.4498,
+ "step": 4404
+ },
+ {
+ "epoch": 2.08274231678487,
+ "grad_norm": 2.7659497261047363,
+ "learning_rate": 3.6910233487792655e-06,
+ "loss": 0.4017,
+ "step": 4405
+ },
+ {
+ "epoch": 2.083215130023641,
+ "grad_norm": 3.4017205238342285,
+ "learning_rate": 3.6904748283041503e-06,
+ "loss": 0.4733,
+ "step": 4406
+ },
+ {
+ "epoch": 2.083687943262411,
+ "grad_norm": 2.706223249435425,
+ "learning_rate": 3.6899262337033887e-06,
+ "loss": 0.4926,
+ "step": 4407
+ },
+ {
+ "epoch": 2.084160756501182,
+ "grad_norm": 2.644932508468628,
+ "learning_rate": 3.6893775650111372e-06,
+ "loss": 0.3904,
+ "step": 4408
+ },
+ {
+ "epoch": 2.0846335697399527,
+ "grad_norm": 2.666585683822632,
+ "learning_rate": 3.6888288222615603e-06,
+ "loss": 0.4698,
+ "step": 4409
+ },
+ {
+ "epoch": 2.0851063829787235,
+ "grad_norm": 3.0058486461639404,
+ "learning_rate": 3.688280005488826e-06,
+ "loss": 0.5291,
+ "step": 4410
+ },
+ {
+ "epoch": 2.085579196217494,
+ "grad_norm": 2.533088445663452,
+ "learning_rate": 3.687731114727105e-06,
+ "loss": 0.393,
+ "step": 4411
+ },
+ {
+ "epoch": 2.0860520094562647,
+ "grad_norm": 2.921687364578247,
+ "learning_rate": 3.6871821500105763e-06,
+ "loss": 0.4719,
+ "step": 4412
+ },
+ {
+ "epoch": 2.0865248226950355,
+ "grad_norm": 2.291804313659668,
+ "learning_rate": 3.686633111373421e-06,
+ "loss": 0.4105,
+ "step": 4413
+ },
+ {
+ "epoch": 2.0869976359338063,
+ "grad_norm": 2.496333122253418,
+ "learning_rate": 3.6860839988498255e-06,
+ "loss": 0.4704,
+ "step": 4414
+ },
+ {
+ "epoch": 2.0874704491725766,
+ "grad_norm": 2.8059427738189697,
+ "learning_rate": 3.6855348124739787e-06,
+ "loss": 0.4961,
+ "step": 4415
+ },
+ {
+ "epoch": 2.0879432624113474,
+ "grad_norm": 2.683922290802002,
+ "learning_rate": 3.6849855522800795e-06,
+ "loss": 0.4838,
+ "step": 4416
+ },
+ {
+ "epoch": 2.088416075650118,
+ "grad_norm": 2.694148540496826,
+ "learning_rate": 3.684436218302324e-06,
+ "loss": 0.4812,
+ "step": 4417
+ },
+ {
+ "epoch": 2.088888888888889,
+ "grad_norm": 2.724531888961792,
+ "learning_rate": 3.683886810574919e-06,
+ "loss": 0.4495,
+ "step": 4418
+ },
+ {
+ "epoch": 2.0893617021276594,
+ "grad_norm": 2.6176564693450928,
+ "learning_rate": 3.6833373291320746e-06,
+ "loss": 0.4698,
+ "step": 4419
+ },
+ {
+ "epoch": 2.08983451536643,
+ "grad_norm": 2.534116268157959,
+ "learning_rate": 3.6827877740080032e-06,
+ "loss": 0.3912,
+ "step": 4420
+ },
+ {
+ "epoch": 2.090307328605201,
+ "grad_norm": 2.5747432708740234,
+ "learning_rate": 3.682238145236924e-06,
+ "loss": 0.4072,
+ "step": 4421
+ },
+ {
+ "epoch": 2.0907801418439718,
+ "grad_norm": 2.5947659015655518,
+ "learning_rate": 3.6816884428530588e-06,
+ "loss": 0.4638,
+ "step": 4422
+ },
+ {
+ "epoch": 2.091252955082742,
+ "grad_norm": 2.811992883682251,
+ "learning_rate": 3.6811386668906353e-06,
+ "loss": 0.4345,
+ "step": 4423
+ },
+ {
+ "epoch": 2.091725768321513,
+ "grad_norm": 2.7482287883758545,
+ "learning_rate": 3.680588817383886e-06,
+ "loss": 0.4541,
+ "step": 4424
+ },
+ {
+ "epoch": 2.0921985815602837,
+ "grad_norm": 2.987131357192993,
+ "learning_rate": 3.6800388943670484e-06,
+ "loss": 0.4571,
+ "step": 4425
+ },
+ {
+ "epoch": 2.0926713947990545,
+ "grad_norm": 3.1918671131134033,
+ "learning_rate": 3.6794888978743637e-06,
+ "loss": 0.5722,
+ "step": 4426
+ },
+ {
+ "epoch": 2.093144208037825,
+ "grad_norm": 2.5654571056365967,
+ "learning_rate": 3.678938827940076e-06,
+ "loss": 0.4686,
+ "step": 4427
+ },
+ {
+ "epoch": 2.0936170212765957,
+ "grad_norm": 2.942084789276123,
+ "learning_rate": 3.6783886845984383e-06,
+ "loss": 0.4512,
+ "step": 4428
+ },
+ {
+ "epoch": 2.0940898345153665,
+ "grad_norm": 2.74847674369812,
+ "learning_rate": 3.677838467883703e-06,
+ "loss": 0.4506,
+ "step": 4429
+ },
+ {
+ "epoch": 2.0945626477541373,
+ "grad_norm": 2.7569334506988525,
+ "learning_rate": 3.6772881778301322e-06,
+ "loss": 0.502,
+ "step": 4430
+ },
+ {
+ "epoch": 2.0950354609929076,
+ "grad_norm": 2.969966173171997,
+ "learning_rate": 3.6767378144719884e-06,
+ "loss": 0.4772,
+ "step": 4431
+ },
+ {
+ "epoch": 2.0955082742316784,
+ "grad_norm": 2.773524522781372,
+ "learning_rate": 3.67618737784354e-06,
+ "loss": 0.5183,
+ "step": 4432
+ },
+ {
+ "epoch": 2.095981087470449,
+ "grad_norm": 2.6760106086730957,
+ "learning_rate": 3.6756368679790617e-06,
+ "loss": 0.4787,
+ "step": 4433
+ },
+ {
+ "epoch": 2.09645390070922,
+ "grad_norm": 2.8758978843688965,
+ "learning_rate": 3.6750862849128304e-06,
+ "loss": 0.4275,
+ "step": 4434
+ },
+ {
+ "epoch": 2.0969267139479904,
+ "grad_norm": 2.670509099960327,
+ "learning_rate": 3.6745356286791288e-06,
+ "loss": 0.4401,
+ "step": 4435
+ },
+ {
+ "epoch": 2.097399527186761,
+ "grad_norm": 2.8453969955444336,
+ "learning_rate": 3.673984899312244e-06,
+ "loss": 0.4303,
+ "step": 4436
+ },
+ {
+ "epoch": 2.097872340425532,
+ "grad_norm": 2.6212339401245117,
+ "learning_rate": 3.673434096846468e-06,
+ "loss": 0.4675,
+ "step": 4437
+ },
+ {
+ "epoch": 2.0983451536643027,
+ "grad_norm": 2.8211941719055176,
+ "learning_rate": 3.672883221316095e-06,
+ "loss": 0.4678,
+ "step": 4438
+ },
+ {
+ "epoch": 2.098817966903073,
+ "grad_norm": 2.4838058948516846,
+ "learning_rate": 3.672332272755427e-06,
+ "loss": 0.4128,
+ "step": 4439
+ },
+ {
+ "epoch": 2.099290780141844,
+ "grad_norm": 2.596660852432251,
+ "learning_rate": 3.671781251198769e-06,
+ "loss": 0.423,
+ "step": 4440
+ },
+ {
+ "epoch": 2.0997635933806147,
+ "grad_norm": 2.9979989528656006,
+ "learning_rate": 3.67123015668043e-06,
+ "loss": 0.4493,
+ "step": 4441
+ },
+ {
+ "epoch": 2.1002364066193855,
+ "grad_norm": 2.6232850551605225,
+ "learning_rate": 3.670678989234725e-06,
+ "loss": 0.4237,
+ "step": 4442
+ },
+ {
+ "epoch": 2.100709219858156,
+ "grad_norm": 2.575039863586426,
+ "learning_rate": 3.670127748895973e-06,
+ "loss": 0.4464,
+ "step": 4443
+ },
+ {
+ "epoch": 2.1011820330969266,
+ "grad_norm": 2.3381190299987793,
+ "learning_rate": 3.669576435698497e-06,
+ "loss": 0.4208,
+ "step": 4444
+ },
+ {
+ "epoch": 2.1016548463356974,
+ "grad_norm": 2.9645180702209473,
+ "learning_rate": 3.669025049676625e-06,
+ "loss": 0.5272,
+ "step": 4445
+ },
+ {
+ "epoch": 2.1021276595744682,
+ "grad_norm": 2.719320297241211,
+ "learning_rate": 3.668473590864689e-06,
+ "loss": 0.4485,
+ "step": 4446
+ },
+ {
+ "epoch": 2.1026004728132386,
+ "grad_norm": 2.8665547370910645,
+ "learning_rate": 3.6679220592970254e-06,
+ "loss": 0.4433,
+ "step": 4447
+ },
+ {
+ "epoch": 2.1030732860520094,
+ "grad_norm": 2.6922879219055176,
+ "learning_rate": 3.667370455007977e-06,
+ "loss": 0.502,
+ "step": 4448
+ },
+ {
+ "epoch": 2.10354609929078,
+ "grad_norm": 3.018228530883789,
+ "learning_rate": 3.6668187780318894e-06,
+ "loss": 0.4939,
+ "step": 4449
+ },
+ {
+ "epoch": 2.104018912529551,
+ "grad_norm": 3.187901735305786,
+ "learning_rate": 3.666267028403112e-06,
+ "loss": 0.4151,
+ "step": 4450
+ },
+ {
+ "epoch": 2.1044917257683213,
+ "grad_norm": 2.9521446228027344,
+ "learning_rate": 3.6657152061560012e-06,
+ "loss": 0.4343,
+ "step": 4451
+ },
+ {
+ "epoch": 2.104964539007092,
+ "grad_norm": 2.5125739574432373,
+ "learning_rate": 3.6651633113249164e-06,
+ "loss": 0.4071,
+ "step": 4452
+ },
+ {
+ "epoch": 2.105437352245863,
+ "grad_norm": 2.9164133071899414,
+ "learning_rate": 3.664611343944221e-06,
+ "loss": 0.4173,
+ "step": 4453
+ },
+ {
+ "epoch": 2.1059101654846337,
+ "grad_norm": 2.680893898010254,
+ "learning_rate": 3.6640593040482834e-06,
+ "loss": 0.4917,
+ "step": 4454
+ },
+ {
+ "epoch": 2.106382978723404,
+ "grad_norm": 2.6823534965515137,
+ "learning_rate": 3.6635071916714774e-06,
+ "loss": 0.4668,
+ "step": 4455
+ },
+ {
+ "epoch": 2.106855791962175,
+ "grad_norm": 2.6221907138824463,
+ "learning_rate": 3.6629550068481806e-06,
+ "loss": 0.4956,
+ "step": 4456
+ },
+ {
+ "epoch": 2.1073286052009457,
+ "grad_norm": 3.096370220184326,
+ "learning_rate": 3.6624027496127745e-06,
+ "loss": 0.3995,
+ "step": 4457
+ },
+ {
+ "epoch": 2.1078014184397165,
+ "grad_norm": 2.752885341644287,
+ "learning_rate": 3.661850419999647e-06,
+ "loss": 0.4838,
+ "step": 4458
+ },
+ {
+ "epoch": 2.108274231678487,
+ "grad_norm": 2.6806766986846924,
+ "learning_rate": 3.661298018043188e-06,
+ "loss": 0.4817,
+ "step": 4459
+ },
+ {
+ "epoch": 2.1087470449172576,
+ "grad_norm": 2.6317873001098633,
+ "learning_rate": 3.660745543777794e-06,
+ "loss": 0.4777,
+ "step": 4460
+ },
+ {
+ "epoch": 2.1092198581560284,
+ "grad_norm": 2.4939377307891846,
+ "learning_rate": 3.6601929972378634e-06,
+ "loss": 0.4525,
+ "step": 4461
+ },
+ {
+ "epoch": 2.109692671394799,
+ "grad_norm": 2.4902873039245605,
+ "learning_rate": 3.659640378457803e-06,
+ "loss": 0.4392,
+ "step": 4462
+ },
+ {
+ "epoch": 2.1101654846335696,
+ "grad_norm": 2.5082345008850098,
+ "learning_rate": 3.6590876874720216e-06,
+ "loss": 0.4224,
+ "step": 4463
+ },
+ {
+ "epoch": 2.1106382978723404,
+ "grad_norm": 2.658407211303711,
+ "learning_rate": 3.6585349243149313e-06,
+ "loss": 0.4316,
+ "step": 4464
+ },
+ {
+ "epoch": 2.111111111111111,
+ "grad_norm": 2.562883138656616,
+ "learning_rate": 3.6579820890209515e-06,
+ "loss": 0.4491,
+ "step": 4465
+ },
+ {
+ "epoch": 2.111583924349882,
+ "grad_norm": 2.5719261169433594,
+ "learning_rate": 3.657429181624505e-06,
+ "loss": 0.4406,
+ "step": 4466
+ },
+ {
+ "epoch": 2.1120567375886523,
+ "grad_norm": 2.8840596675872803,
+ "learning_rate": 3.6568762021600184e-06,
+ "loss": 0.4267,
+ "step": 4467
+ },
+ {
+ "epoch": 2.112529550827423,
+ "grad_norm": 2.660304546356201,
+ "learning_rate": 3.656323150661924e-06,
+ "loss": 0.4502,
+ "step": 4468
+ },
+ {
+ "epoch": 2.113002364066194,
+ "grad_norm": 2.610996961593628,
+ "learning_rate": 3.655770027164657e-06,
+ "loss": 0.3934,
+ "step": 4469
+ },
+ {
+ "epoch": 2.1134751773049647,
+ "grad_norm": 2.6000053882598877,
+ "learning_rate": 3.655216831702658e-06,
+ "loss": 0.4582,
+ "step": 4470
+ },
+ {
+ "epoch": 2.113947990543735,
+ "grad_norm": 2.73124098777771,
+ "learning_rate": 3.654663564310372e-06,
+ "loss": 0.4748,
+ "step": 4471
+ },
+ {
+ "epoch": 2.114420803782506,
+ "grad_norm": 2.711091995239258,
+ "learning_rate": 3.6541102250222495e-06,
+ "loss": 0.4145,
+ "step": 4472
+ },
+ {
+ "epoch": 2.1148936170212767,
+ "grad_norm": 2.655996561050415,
+ "learning_rate": 3.6535568138727438e-06,
+ "loss": 0.4407,
+ "step": 4473
+ },
+ {
+ "epoch": 2.1153664302600474,
+ "grad_norm": 2.7630865573883057,
+ "learning_rate": 3.653003330896313e-06,
+ "loss": 0.4298,
+ "step": 4474
+ },
+ {
+ "epoch": 2.115839243498818,
+ "grad_norm": 2.554415464401245,
+ "learning_rate": 3.6524497761274214e-06,
+ "loss": 0.44,
+ "step": 4475
+ },
+ {
+ "epoch": 2.1163120567375886,
+ "grad_norm": 2.790328025817871,
+ "learning_rate": 3.651896149600535e-06,
+ "loss": 0.5061,
+ "step": 4476
+ },
+ {
+ "epoch": 2.1167848699763594,
+ "grad_norm": 2.755267381668091,
+ "learning_rate": 3.651342451350127e-06,
+ "loss": 0.4588,
+ "step": 4477
+ },
+ {
+ "epoch": 2.11725768321513,
+ "grad_norm": 2.8936638832092285,
+ "learning_rate": 3.6507886814106722e-06,
+ "loss": 0.468,
+ "step": 4478
+ },
+ {
+ "epoch": 2.1177304964539005,
+ "grad_norm": 2.7394332885742188,
+ "learning_rate": 3.6502348398166525e-06,
+ "loss": 0.383,
+ "step": 4479
+ },
+ {
+ "epoch": 2.1182033096926713,
+ "grad_norm": 2.3359546661376953,
+ "learning_rate": 3.649680926602553e-06,
+ "loss": 0.3903,
+ "step": 4480
+ },
+ {
+ "epoch": 2.118676122931442,
+ "grad_norm": 3.102202892303467,
+ "learning_rate": 3.6491269418028637e-06,
+ "loss": 0.4525,
+ "step": 4481
+ },
+ {
+ "epoch": 2.119148936170213,
+ "grad_norm": 2.467970848083496,
+ "learning_rate": 3.648572885452078e-06,
+ "loss": 0.414,
+ "step": 4482
+ },
+ {
+ "epoch": 2.1196217494089833,
+ "grad_norm": 2.8984131813049316,
+ "learning_rate": 3.6480187575846952e-06,
+ "loss": 0.4571,
+ "step": 4483
+ },
+ {
+ "epoch": 2.120094562647754,
+ "grad_norm": 2.674834966659546,
+ "learning_rate": 3.6474645582352187e-06,
+ "loss": 0.455,
+ "step": 4484
+ },
+ {
+ "epoch": 2.120567375886525,
+ "grad_norm": 2.8713369369506836,
+ "learning_rate": 3.6469102874381552e-06,
+ "loss": 0.4567,
+ "step": 4485
+ },
+ {
+ "epoch": 2.1210401891252957,
+ "grad_norm": 3.174814462661743,
+ "learning_rate": 3.646355945228017e-06,
+ "loss": 0.5295,
+ "step": 4486
+ },
+ {
+ "epoch": 2.121513002364066,
+ "grad_norm": 2.6409823894500732,
+ "learning_rate": 3.6458015316393215e-06,
+ "loss": 0.4308,
+ "step": 4487
+ },
+ {
+ "epoch": 2.121985815602837,
+ "grad_norm": 2.4228954315185547,
+ "learning_rate": 3.645247046706588e-06,
+ "loss": 0.4042,
+ "step": 4488
+ },
+ {
+ "epoch": 2.1224586288416076,
+ "grad_norm": 2.553551435470581,
+ "learning_rate": 3.6446924904643427e-06,
+ "loss": 0.3925,
+ "step": 4489
+ },
+ {
+ "epoch": 2.1229314420803784,
+ "grad_norm": 2.8019237518310547,
+ "learning_rate": 3.6441378629471157e-06,
+ "loss": 0.4079,
+ "step": 4490
+ },
+ {
+ "epoch": 2.123404255319149,
+ "grad_norm": 2.993251085281372,
+ "learning_rate": 3.643583164189441e-06,
+ "loss": 0.4558,
+ "step": 4491
+ },
+ {
+ "epoch": 2.1238770685579196,
+ "grad_norm": 2.4531471729278564,
+ "learning_rate": 3.643028394225857e-06,
+ "loss": 0.4167,
+ "step": 4492
+ },
+ {
+ "epoch": 2.1243498817966904,
+ "grad_norm": 2.6827852725982666,
+ "learning_rate": 3.6424735530909065e-06,
+ "loss": 0.4311,
+ "step": 4493
+ },
+ {
+ "epoch": 2.124822695035461,
+ "grad_norm": 3.1232128143310547,
+ "learning_rate": 3.6419186408191377e-06,
+ "loss": 0.4537,
+ "step": 4494
+ },
+ {
+ "epoch": 2.1252955082742315,
+ "grad_norm": 2.816348075866699,
+ "learning_rate": 3.641363657445103e-06,
+ "loss": 0.4869,
+ "step": 4495
+ },
+ {
+ "epoch": 2.1257683215130023,
+ "grad_norm": 2.6269683837890625,
+ "learning_rate": 3.6408086030033575e-06,
+ "loss": 0.4066,
+ "step": 4496
+ },
+ {
+ "epoch": 2.126241134751773,
+ "grad_norm": 4.6375956535339355,
+ "learning_rate": 3.640253477528462e-06,
+ "loss": 0.4488,
+ "step": 4497
+ },
+ {
+ "epoch": 2.126713947990544,
+ "grad_norm": 3.020970582962036,
+ "learning_rate": 3.639698281054983e-06,
+ "loss": 0.4197,
+ "step": 4498
+ },
+ {
+ "epoch": 2.1271867612293143,
+ "grad_norm": 2.87904691696167,
+ "learning_rate": 3.6391430136174892e-06,
+ "loss": 0.4743,
+ "step": 4499
+ },
+ {
+ "epoch": 2.127659574468085,
+ "grad_norm": 2.719892978668213,
+ "learning_rate": 3.6385876752505554e-06,
+ "loss": 0.388,
+ "step": 4500
+ },
+ {
+ "epoch": 2.128132387706856,
+ "grad_norm": 2.7321808338165283,
+ "learning_rate": 3.638032265988759e-06,
+ "loss": 0.4857,
+ "step": 4501
+ },
+ {
+ "epoch": 2.1286052009456267,
+ "grad_norm": 2.700814723968506,
+ "learning_rate": 3.6374767858666836e-06,
+ "loss": 0.4819,
+ "step": 4502
+ },
+ {
+ "epoch": 2.129078014184397,
+ "grad_norm": 2.658423662185669,
+ "learning_rate": 3.6369212349189164e-06,
+ "loss": 0.4113,
+ "step": 4503
+ },
+ {
+ "epoch": 2.129550827423168,
+ "grad_norm": 2.673877716064453,
+ "learning_rate": 3.63636561318005e-06,
+ "loss": 0.3745,
+ "step": 4504
+ },
+ {
+ "epoch": 2.1300236406619386,
+ "grad_norm": 2.607758045196533,
+ "learning_rate": 3.6358099206846787e-06,
+ "loss": 0.4409,
+ "step": 4505
+ },
+ {
+ "epoch": 2.1304964539007094,
+ "grad_norm": 2.8117682933807373,
+ "learning_rate": 3.6352541574674044e-06,
+ "loss": 0.426,
+ "step": 4506
+ },
+ {
+ "epoch": 2.1309692671394798,
+ "grad_norm": 2.6970250606536865,
+ "learning_rate": 3.634698323562832e-06,
+ "loss": 0.4295,
+ "step": 4507
+ },
+ {
+ "epoch": 2.1314420803782506,
+ "grad_norm": 2.7133560180664062,
+ "learning_rate": 3.6341424190055696e-06,
+ "loss": 0.4443,
+ "step": 4508
+ },
+ {
+ "epoch": 2.1319148936170214,
+ "grad_norm": 2.57181715965271,
+ "learning_rate": 3.6335864438302328e-06,
+ "loss": 0.3995,
+ "step": 4509
+ },
+ {
+ "epoch": 2.132387706855792,
+ "grad_norm": 2.8618004322052,
+ "learning_rate": 3.633030398071438e-06,
+ "loss": 0.5075,
+ "step": 4510
+ },
+ {
+ "epoch": 2.1328605200945625,
+ "grad_norm": 2.7586729526519775,
+ "learning_rate": 3.6324742817638087e-06,
+ "loss": 0.4322,
+ "step": 4511
+ },
+ {
+ "epoch": 2.1333333333333333,
+ "grad_norm": 2.913256883621216,
+ "learning_rate": 3.631918094941972e-06,
+ "loss": 0.4708,
+ "step": 4512
+ },
+ {
+ "epoch": 2.133806146572104,
+ "grad_norm": 2.7715728282928467,
+ "learning_rate": 3.6313618376405585e-06,
+ "loss": 0.5194,
+ "step": 4513
+ },
+ {
+ "epoch": 2.134278959810875,
+ "grad_norm": 2.7986366748809814,
+ "learning_rate": 3.6308055098942042e-06,
+ "loss": 0.4419,
+ "step": 4514
+ },
+ {
+ "epoch": 2.1347517730496453,
+ "grad_norm": 3.043549060821533,
+ "learning_rate": 3.6302491117375492e-06,
+ "loss": 0.4441,
+ "step": 4515
+ },
+ {
+ "epoch": 2.135224586288416,
+ "grad_norm": 2.771761417388916,
+ "learning_rate": 3.629692643205238e-06,
+ "loss": 0.4752,
+ "step": 4516
+ },
+ {
+ "epoch": 2.135697399527187,
+ "grad_norm": 2.804941415786743,
+ "learning_rate": 3.6291361043319202e-06,
+ "loss": 0.4089,
+ "step": 4517
+ },
+ {
+ "epoch": 2.1361702127659576,
+ "grad_norm": 2.9897940158843994,
+ "learning_rate": 3.628579495152248e-06,
+ "loss": 0.4829,
+ "step": 4518
+ },
+ {
+ "epoch": 2.136643026004728,
+ "grad_norm": 2.9273486137390137,
+ "learning_rate": 3.6280228157008784e-06,
+ "loss": 0.4469,
+ "step": 4519
+ },
+ {
+ "epoch": 2.137115839243499,
+ "grad_norm": 2.584373950958252,
+ "learning_rate": 3.627466066012475e-06,
+ "loss": 0.4277,
+ "step": 4520
+ },
+ {
+ "epoch": 2.1375886524822696,
+ "grad_norm": 3.009333848953247,
+ "learning_rate": 3.626909246121703e-06,
+ "loss": 0.4025,
+ "step": 4521
+ },
+ {
+ "epoch": 2.1380614657210404,
+ "grad_norm": 2.634615659713745,
+ "learning_rate": 3.626352356063234e-06,
+ "loss": 0.4046,
+ "step": 4522
+ },
+ {
+ "epoch": 2.1385342789598107,
+ "grad_norm": 2.87310528755188,
+ "learning_rate": 3.625795395871743e-06,
+ "loss": 0.4426,
+ "step": 4523
+ },
+ {
+ "epoch": 2.1390070921985815,
+ "grad_norm": 2.94985032081604,
+ "learning_rate": 3.625238365581909e-06,
+ "loss": 0.445,
+ "step": 4524
+ },
+ {
+ "epoch": 2.1394799054373523,
+ "grad_norm": 2.470189332962036,
+ "learning_rate": 3.624681265228416e-06,
+ "loss": 0.4082,
+ "step": 4525
+ },
+ {
+ "epoch": 2.139952718676123,
+ "grad_norm": 2.5304040908813477,
+ "learning_rate": 3.624124094845952e-06,
+ "loss": 0.403,
+ "step": 4526
+ },
+ {
+ "epoch": 2.1404255319148935,
+ "grad_norm": 2.6148900985717773,
+ "learning_rate": 3.62356685446921e-06,
+ "loss": 0.3867,
+ "step": 4527
+ },
+ {
+ "epoch": 2.1408983451536643,
+ "grad_norm": 2.885549783706665,
+ "learning_rate": 3.623009544132886e-06,
+ "loss": 0.4706,
+ "step": 4528
+ },
+ {
+ "epoch": 2.141371158392435,
+ "grad_norm": 3.00490665435791,
+ "learning_rate": 3.6224521638716827e-06,
+ "loss": 0.4733,
+ "step": 4529
+ },
+ {
+ "epoch": 2.141843971631206,
+ "grad_norm": 2.925879716873169,
+ "learning_rate": 3.6218947137203043e-06,
+ "loss": 0.4581,
+ "step": 4530
+ },
+ {
+ "epoch": 2.1423167848699762,
+ "grad_norm": 3.10861873626709,
+ "learning_rate": 3.621337193713462e-06,
+ "loss": 0.4579,
+ "step": 4531
+ },
+ {
+ "epoch": 2.142789598108747,
+ "grad_norm": 2.7386577129364014,
+ "learning_rate": 3.6207796038858693e-06,
+ "loss": 0.4248,
+ "step": 4532
+ },
+ {
+ "epoch": 2.143262411347518,
+ "grad_norm": 2.601836681365967,
+ "learning_rate": 3.6202219442722453e-06,
+ "loss": 0.4928,
+ "step": 4533
+ },
+ {
+ "epoch": 2.1437352245862886,
+ "grad_norm": 2.598778247833252,
+ "learning_rate": 3.6196642149073123e-06,
+ "loss": 0.4415,
+ "step": 4534
+ },
+ {
+ "epoch": 2.144208037825059,
+ "grad_norm": 2.443995714187622,
+ "learning_rate": 3.619106415825798e-06,
+ "loss": 0.3917,
+ "step": 4535
+ },
+ {
+ "epoch": 2.1446808510638298,
+ "grad_norm": 2.84643816947937,
+ "learning_rate": 3.6185485470624354e-06,
+ "loss": 0.4162,
+ "step": 4536
+ },
+ {
+ "epoch": 2.1451536643026006,
+ "grad_norm": 2.4568188190460205,
+ "learning_rate": 3.617990608651959e-06,
+ "loss": 0.4298,
+ "step": 4537
+ },
+ {
+ "epoch": 2.145626477541371,
+ "grad_norm": 2.968804359436035,
+ "learning_rate": 3.61743260062911e-06,
+ "loss": 0.4696,
+ "step": 4538
+ },
+ {
+ "epoch": 2.1460992907801417,
+ "grad_norm": 2.629075288772583,
+ "learning_rate": 3.6168745230286327e-06,
+ "loss": 0.4234,
+ "step": 4539
+ },
+ {
+ "epoch": 2.1465721040189125,
+ "grad_norm": 2.7680578231811523,
+ "learning_rate": 3.6163163758852754e-06,
+ "loss": 0.4669,
+ "step": 4540
+ },
+ {
+ "epoch": 2.1470449172576833,
+ "grad_norm": 2.782825469970703,
+ "learning_rate": 3.615758159233793e-06,
+ "loss": 0.4552,
+ "step": 4541
+ },
+ {
+ "epoch": 2.147517730496454,
+ "grad_norm": 2.653047561645508,
+ "learning_rate": 3.615199873108942e-06,
+ "loss": 0.4393,
+ "step": 4542
+ },
+ {
+ "epoch": 2.1479905437352245,
+ "grad_norm": 2.4175806045532227,
+ "learning_rate": 3.6146415175454852e-06,
+ "loss": 0.4114,
+ "step": 4543
+ },
+ {
+ "epoch": 2.1484633569739953,
+ "grad_norm": 2.627943515777588,
+ "learning_rate": 3.614083092578189e-06,
+ "loss": 0.4215,
+ "step": 4544
+ },
+ {
+ "epoch": 2.148936170212766,
+ "grad_norm": 2.8934123516082764,
+ "learning_rate": 3.6135245982418227e-06,
+ "loss": 0.4815,
+ "step": 4545
+ },
+ {
+ "epoch": 2.1494089834515364,
+ "grad_norm": 2.8535244464874268,
+ "learning_rate": 3.612966034571164e-06,
+ "loss": 0.4683,
+ "step": 4546
+ },
+ {
+ "epoch": 2.149881796690307,
+ "grad_norm": 2.7826647758483887,
+ "learning_rate": 3.6124074016009893e-06,
+ "loss": 0.4351,
+ "step": 4547
+ },
+ {
+ "epoch": 2.150354609929078,
+ "grad_norm": 2.6906018257141113,
+ "learning_rate": 3.6118486993660834e-06,
+ "loss": 0.4585,
+ "step": 4548
+ },
+ {
+ "epoch": 2.150827423167849,
+ "grad_norm": 2.726766586303711,
+ "learning_rate": 3.6112899279012346e-06,
+ "loss": 0.4753,
+ "step": 4549
+ },
+ {
+ "epoch": 2.1513002364066196,
+ "grad_norm": 3.0193991661071777,
+ "learning_rate": 3.6107310872412348e-06,
+ "loss": 0.4827,
+ "step": 4550
+ },
+ {
+ "epoch": 2.15177304964539,
+ "grad_norm": 2.6788697242736816,
+ "learning_rate": 3.610172177420881e-06,
+ "loss": 0.4333,
+ "step": 4551
+ },
+ {
+ "epoch": 2.1522458628841608,
+ "grad_norm": 2.865410327911377,
+ "learning_rate": 3.609613198474973e-06,
+ "loss": 0.4569,
+ "step": 4552
+ },
+ {
+ "epoch": 2.1527186761229316,
+ "grad_norm": 2.9199366569519043,
+ "learning_rate": 3.609054150438317e-06,
+ "loss": 0.5097,
+ "step": 4553
+ },
+ {
+ "epoch": 2.153191489361702,
+ "grad_norm": 2.761035203933716,
+ "learning_rate": 3.6084950333457215e-06,
+ "loss": 0.5002,
+ "step": 4554
+ },
+ {
+ "epoch": 2.1536643026004727,
+ "grad_norm": 2.514223337173462,
+ "learning_rate": 3.607935847232002e-06,
+ "loss": 0.4171,
+ "step": 4555
+ },
+ {
+ "epoch": 2.1541371158392435,
+ "grad_norm": 2.5167524814605713,
+ "learning_rate": 3.6073765921319747e-06,
+ "loss": 0.4494,
+ "step": 4556
+ },
+ {
+ "epoch": 2.1546099290780143,
+ "grad_norm": 2.7540643215179443,
+ "learning_rate": 3.606817268080463e-06,
+ "loss": 0.4472,
+ "step": 4557
+ },
+ {
+ "epoch": 2.155082742316785,
+ "grad_norm": 2.7728664875030518,
+ "learning_rate": 3.6062578751122936e-06,
+ "loss": 0.4669,
+ "step": 4558
+ },
+ {
+ "epoch": 2.1555555555555554,
+ "grad_norm": 2.7788400650024414,
+ "learning_rate": 3.605698413262296e-06,
+ "loss": 0.4613,
+ "step": 4559
+ },
+ {
+ "epoch": 2.1560283687943262,
+ "grad_norm": 2.7811810970306396,
+ "learning_rate": 3.605138882565308e-06,
+ "loss": 0.4242,
+ "step": 4560
+ },
+ {
+ "epoch": 2.156501182033097,
+ "grad_norm": 2.7819995880126953,
+ "learning_rate": 3.6045792830561664e-06,
+ "loss": 0.443,
+ "step": 4561
+ },
+ {
+ "epoch": 2.1569739952718674,
+ "grad_norm": 2.671259641647339,
+ "learning_rate": 3.6040196147697166e-06,
+ "loss": 0.4336,
+ "step": 4562
+ },
+ {
+ "epoch": 2.157446808510638,
+ "grad_norm": 2.9296300411224365,
+ "learning_rate": 3.603459877740807e-06,
+ "loss": 0.479,
+ "step": 4563
+ },
+ {
+ "epoch": 2.157919621749409,
+ "grad_norm": 2.834937334060669,
+ "learning_rate": 3.602900072004289e-06,
+ "loss": 0.4603,
+ "step": 4564
+ },
+ {
+ "epoch": 2.15839243498818,
+ "grad_norm": 2.8434760570526123,
+ "learning_rate": 3.602340197595019e-06,
+ "loss": 0.4288,
+ "step": 4565
+ },
+ {
+ "epoch": 2.1588652482269506,
+ "grad_norm": 2.7245426177978516,
+ "learning_rate": 3.6017802545478593e-06,
+ "loss": 0.4194,
+ "step": 4566
+ },
+ {
+ "epoch": 2.159338061465721,
+ "grad_norm": 2.7795023918151855,
+ "learning_rate": 3.6012202428976735e-06,
+ "loss": 0.4481,
+ "step": 4567
+ },
+ {
+ "epoch": 2.1598108747044917,
+ "grad_norm": 2.9482083320617676,
+ "learning_rate": 3.6006601626793325e-06,
+ "loss": 0.468,
+ "step": 4568
+ },
+ {
+ "epoch": 2.1602836879432625,
+ "grad_norm": 2.9563326835632324,
+ "learning_rate": 3.6001000139277094e-06,
+ "loss": 0.4427,
+ "step": 4569
+ },
+ {
+ "epoch": 2.160756501182033,
+ "grad_norm": 2.7755916118621826,
+ "learning_rate": 3.599539796677682e-06,
+ "loss": 0.4258,
+ "step": 4570
+ },
+ {
+ "epoch": 2.1612293144208037,
+ "grad_norm": 2.961045265197754,
+ "learning_rate": 3.5989795109641333e-06,
+ "loss": 0.4645,
+ "step": 4571
+ },
+ {
+ "epoch": 2.1617021276595745,
+ "grad_norm": 3.0184407234191895,
+ "learning_rate": 3.5984191568219482e-06,
+ "loss": 0.4192,
+ "step": 4572
+ },
+ {
+ "epoch": 2.1621749408983453,
+ "grad_norm": 2.9811131954193115,
+ "learning_rate": 3.5978587342860192e-06,
+ "loss": 0.408,
+ "step": 4573
+ },
+ {
+ "epoch": 2.162647754137116,
+ "grad_norm": 2.9172329902648926,
+ "learning_rate": 3.597298243391242e-06,
+ "loss": 0.4528,
+ "step": 4574
+ },
+ {
+ "epoch": 2.1631205673758864,
+ "grad_norm": 2.7798452377319336,
+ "learning_rate": 3.596737684172513e-06,
+ "loss": 0.391,
+ "step": 4575
+ },
+ {
+ "epoch": 2.1635933806146572,
+ "grad_norm": 2.526277542114258,
+ "learning_rate": 3.596177056664738e-06,
+ "loss": 0.3699,
+ "step": 4576
+ },
+ {
+ "epoch": 2.164066193853428,
+ "grad_norm": 2.856269121170044,
+ "learning_rate": 3.5956163609028244e-06,
+ "loss": 0.4082,
+ "step": 4577
+ },
+ {
+ "epoch": 2.1645390070921984,
+ "grad_norm": 2.7681572437286377,
+ "learning_rate": 3.5950555969216845e-06,
+ "loss": 0.4064,
+ "step": 4578
+ },
+ {
+ "epoch": 2.165011820330969,
+ "grad_norm": 2.2924954891204834,
+ "learning_rate": 3.5944947647562333e-06,
+ "loss": 0.416,
+ "step": 4579
+ },
+ {
+ "epoch": 2.16548463356974,
+ "grad_norm": 2.439929485321045,
+ "learning_rate": 3.5939338644413936e-06,
+ "loss": 0.4476,
+ "step": 4580
+ },
+ {
+ "epoch": 2.1659574468085108,
+ "grad_norm": 2.786442518234253,
+ "learning_rate": 3.5933728960120877e-06,
+ "loss": 0.4525,
+ "step": 4581
+ },
+ {
+ "epoch": 2.166430260047281,
+ "grad_norm": 2.5910253524780273,
+ "learning_rate": 3.5928118595032465e-06,
+ "loss": 0.4441,
+ "step": 4582
+ },
+ {
+ "epoch": 2.166903073286052,
+ "grad_norm": 2.8144876956939697,
+ "learning_rate": 3.5922507549498024e-06,
+ "loss": 0.497,
+ "step": 4583
+ },
+ {
+ "epoch": 2.1673758865248227,
+ "grad_norm": 2.5714170932769775,
+ "learning_rate": 3.591689582386694e-06,
+ "loss": 0.4625,
+ "step": 4584
+ },
+ {
+ "epoch": 2.1678486997635935,
+ "grad_norm": 2.878187894821167,
+ "learning_rate": 3.591128341848861e-06,
+ "loss": 0.4835,
+ "step": 4585
+ },
+ {
+ "epoch": 2.168321513002364,
+ "grad_norm": 2.4946508407592773,
+ "learning_rate": 3.5905670333712504e-06,
+ "loss": 0.4278,
+ "step": 4586
+ },
+ {
+ "epoch": 2.1687943262411347,
+ "grad_norm": 2.9186196327209473,
+ "learning_rate": 3.590005656988814e-06,
+ "loss": 0.465,
+ "step": 4587
+ },
+ {
+ "epoch": 2.1692671394799055,
+ "grad_norm": 3.136807441711426,
+ "learning_rate": 3.5894442127365046e-06,
+ "loss": 0.4146,
+ "step": 4588
+ },
+ {
+ "epoch": 2.1697399527186763,
+ "grad_norm": 2.8106343746185303,
+ "learning_rate": 3.5888827006492804e-06,
+ "loss": 0.4737,
+ "step": 4589
+ },
+ {
+ "epoch": 2.1702127659574466,
+ "grad_norm": 2.874553680419922,
+ "learning_rate": 3.5883211207621047e-06,
+ "loss": 0.3962,
+ "step": 4590
+ },
+ {
+ "epoch": 2.1706855791962174,
+ "grad_norm": 2.7914116382598877,
+ "learning_rate": 3.587759473109946e-06,
+ "loss": 0.4705,
+ "step": 4591
+ },
+ {
+ "epoch": 2.171158392434988,
+ "grad_norm": 2.7273290157318115,
+ "learning_rate": 3.5871977577277745e-06,
+ "loss": 0.4827,
+ "step": 4592
+ },
+ {
+ "epoch": 2.171631205673759,
+ "grad_norm": 2.4167256355285645,
+ "learning_rate": 3.5866359746505653e-06,
+ "loss": 0.4181,
+ "step": 4593
+ },
+ {
+ "epoch": 2.1721040189125294,
+ "grad_norm": 2.8929779529571533,
+ "learning_rate": 3.586074123913299e-06,
+ "loss": 0.4006,
+ "step": 4594
+ },
+ {
+ "epoch": 2.1725768321513,
+ "grad_norm": 2.6996190547943115,
+ "learning_rate": 3.5855122055509593e-06,
+ "loss": 0.4792,
+ "step": 4595
+ },
+ {
+ "epoch": 2.173049645390071,
+ "grad_norm": 2.9341464042663574,
+ "learning_rate": 3.584950219598534e-06,
+ "loss": 0.3903,
+ "step": 4596
+ },
+ {
+ "epoch": 2.1735224586288417,
+ "grad_norm": 2.799330234527588,
+ "learning_rate": 3.5843881660910166e-06,
+ "loss": 0.4717,
+ "step": 4597
+ },
+ {
+ "epoch": 2.173995271867612,
+ "grad_norm": 2.5028693675994873,
+ "learning_rate": 3.5838260450634028e-06,
+ "loss": 0.4462,
+ "step": 4598
+ },
+ {
+ "epoch": 2.174468085106383,
+ "grad_norm": 2.5845541954040527,
+ "learning_rate": 3.583263856550693e-06,
+ "loss": 0.4327,
+ "step": 4599
+ },
+ {
+ "epoch": 2.1749408983451537,
+ "grad_norm": 2.4804906845092773,
+ "learning_rate": 3.5827016005878933e-06,
+ "loss": 0.4555,
+ "step": 4600
+ },
+ {
+ "epoch": 2.1754137115839245,
+ "grad_norm": 2.625746011734009,
+ "learning_rate": 3.5821392772100125e-06,
+ "loss": 0.455,
+ "step": 4601
+ },
+ {
+ "epoch": 2.175886524822695,
+ "grad_norm": 2.6230757236480713,
+ "learning_rate": 3.581576886452064e-06,
+ "loss": 0.4422,
+ "step": 4602
+ },
+ {
+ "epoch": 2.1763593380614656,
+ "grad_norm": 3.3104100227355957,
+ "learning_rate": 3.5810144283490656e-06,
+ "loss": 0.4212,
+ "step": 4603
+ },
+ {
+ "epoch": 2.1768321513002364,
+ "grad_norm": 2.6799755096435547,
+ "learning_rate": 3.5804519029360384e-06,
+ "loss": 0.4575,
+ "step": 4604
+ },
+ {
+ "epoch": 2.1773049645390072,
+ "grad_norm": 2.462216854095459,
+ "learning_rate": 3.5798893102480085e-06,
+ "loss": 0.4096,
+ "step": 4605
+ },
+ {
+ "epoch": 2.1777777777777776,
+ "grad_norm": 2.8600878715515137,
+ "learning_rate": 3.5793266503200074e-06,
+ "loss": 0.4798,
+ "step": 4606
+ },
+ {
+ "epoch": 2.1782505910165484,
+ "grad_norm": 2.935746431350708,
+ "learning_rate": 3.5787639231870673e-06,
+ "loss": 0.4021,
+ "step": 4607
+ },
+ {
+ "epoch": 2.178723404255319,
+ "grad_norm": 2.8655526638031006,
+ "learning_rate": 3.578201128884229e-06,
+ "loss": 0.4553,
+ "step": 4608
+ },
+ {
+ "epoch": 2.17919621749409,
+ "grad_norm": 3.219498634338379,
+ "learning_rate": 3.577638267446533e-06,
+ "loss": 0.4692,
+ "step": 4609
+ },
+ {
+ "epoch": 2.1796690307328603,
+ "grad_norm": 3.0449860095977783,
+ "learning_rate": 3.5770753389090283e-06,
+ "loss": 0.4675,
+ "step": 4610
+ },
+ {
+ "epoch": 2.180141843971631,
+ "grad_norm": 2.7045507431030273,
+ "learning_rate": 3.576512343306765e-06,
+ "loss": 0.4773,
+ "step": 4611
+ },
+ {
+ "epoch": 2.180614657210402,
+ "grad_norm": 2.601499557495117,
+ "learning_rate": 3.5759492806747985e-06,
+ "loss": 0.4112,
+ "step": 4612
+ },
+ {
+ "epoch": 2.1810874704491727,
+ "grad_norm": 2.987741470336914,
+ "learning_rate": 3.575386151048188e-06,
+ "loss": 0.4651,
+ "step": 4613
+ },
+ {
+ "epoch": 2.181560283687943,
+ "grad_norm": 2.961228847503662,
+ "learning_rate": 3.5748229544619973e-06,
+ "loss": 0.5116,
+ "step": 4614
+ },
+ {
+ "epoch": 2.182033096926714,
+ "grad_norm": 2.8008430004119873,
+ "learning_rate": 3.574259690951295e-06,
+ "loss": 0.4152,
+ "step": 4615
+ },
+ {
+ "epoch": 2.1825059101654847,
+ "grad_norm": 2.5429348945617676,
+ "learning_rate": 3.573696360551151e-06,
+ "loss": 0.4188,
+ "step": 4616
+ },
+ {
+ "epoch": 2.1829787234042555,
+ "grad_norm": 2.9566478729248047,
+ "learning_rate": 3.5731329632966428e-06,
+ "loss": 0.5156,
+ "step": 4617
+ },
+ {
+ "epoch": 2.183451536643026,
+ "grad_norm": 2.5302467346191406,
+ "learning_rate": 3.572569499222851e-06,
+ "loss": 0.4361,
+ "step": 4618
+ },
+ {
+ "epoch": 2.1839243498817966,
+ "grad_norm": 3.206803560256958,
+ "learning_rate": 3.5720059683648593e-06,
+ "loss": 0.5149,
+ "step": 4619
+ },
+ {
+ "epoch": 2.1843971631205674,
+ "grad_norm": 2.9432034492492676,
+ "learning_rate": 3.5714423707577573e-06,
+ "loss": 0.4411,
+ "step": 4620
+ },
+ {
+ "epoch": 2.184869976359338,
+ "grad_norm": 2.9412078857421875,
+ "learning_rate": 3.5708787064366358e-06,
+ "loss": 0.4372,
+ "step": 4621
+ },
+ {
+ "epoch": 2.1853427895981086,
+ "grad_norm": 3.1702330112457275,
+ "learning_rate": 3.5703149754365935e-06,
+ "loss": 0.4761,
+ "step": 4622
+ },
+ {
+ "epoch": 2.1858156028368794,
+ "grad_norm": 3.1240456104278564,
+ "learning_rate": 3.569751177792731e-06,
+ "loss": 0.4854,
+ "step": 4623
+ },
+ {
+ "epoch": 2.18628841607565,
+ "grad_norm": 2.7221994400024414,
+ "learning_rate": 3.5691873135401534e-06,
+ "loss": 0.4048,
+ "step": 4624
+ },
+ {
+ "epoch": 2.186761229314421,
+ "grad_norm": 2.74397873878479,
+ "learning_rate": 3.5686233827139695e-06,
+ "loss": 0.4747,
+ "step": 4625
+ },
+ {
+ "epoch": 2.1872340425531913,
+ "grad_norm": 2.7379889488220215,
+ "learning_rate": 3.5680593853492932e-06,
+ "loss": 0.4963,
+ "step": 4626
+ },
+ {
+ "epoch": 2.187706855791962,
+ "grad_norm": 3.040205478668213,
+ "learning_rate": 3.5674953214812435e-06,
+ "loss": 0.4917,
+ "step": 4627
+ },
+ {
+ "epoch": 2.188179669030733,
+ "grad_norm": 2.95302677154541,
+ "learning_rate": 3.56693119114494e-06,
+ "loss": 0.4758,
+ "step": 4628
+ },
+ {
+ "epoch": 2.1886524822695037,
+ "grad_norm": 2.5488312244415283,
+ "learning_rate": 3.56636699437551e-06,
+ "loss": 0.4057,
+ "step": 4629
+ },
+ {
+ "epoch": 2.189125295508274,
+ "grad_norm": 2.8379666805267334,
+ "learning_rate": 3.565802731208083e-06,
+ "loss": 0.4755,
+ "step": 4630
+ },
+ {
+ "epoch": 2.189598108747045,
+ "grad_norm": 2.8765869140625,
+ "learning_rate": 3.565238401677793e-06,
+ "loss": 0.4232,
+ "step": 4631
+ },
+ {
+ "epoch": 2.1900709219858157,
+ "grad_norm": 2.9091262817382812,
+ "learning_rate": 3.5646740058197784e-06,
+ "loss": 0.3874,
+ "step": 4632
+ },
+ {
+ "epoch": 2.1905437352245865,
+ "grad_norm": 2.7067387104034424,
+ "learning_rate": 3.5641095436691826e-06,
+ "loss": 0.4771,
+ "step": 4633
+ },
+ {
+ "epoch": 2.191016548463357,
+ "grad_norm": 2.403043508529663,
+ "learning_rate": 3.563545015261151e-06,
+ "loss": 0.4062,
+ "step": 4634
+ },
+ {
+ "epoch": 2.1914893617021276,
+ "grad_norm": 2.8059732913970947,
+ "learning_rate": 3.562980420630836e-06,
+ "loss": 0.4635,
+ "step": 4635
+ },
+ {
+ "epoch": 2.1919621749408984,
+ "grad_norm": 2.5467724800109863,
+ "learning_rate": 3.56241575981339e-06,
+ "loss": 0.4552,
+ "step": 4636
+ },
+ {
+ "epoch": 2.192434988179669,
+ "grad_norm": 2.651024103164673,
+ "learning_rate": 3.561851032843973e-06,
+ "loss": 0.38,
+ "step": 4637
+ },
+ {
+ "epoch": 2.1929078014184396,
+ "grad_norm": 2.5529849529266357,
+ "learning_rate": 3.5612862397577496e-06,
+ "loss": 0.4106,
+ "step": 4638
+ },
+ {
+ "epoch": 2.1933806146572103,
+ "grad_norm": 3.069258451461792,
+ "learning_rate": 3.5607213805898844e-06,
+ "loss": 0.461,
+ "step": 4639
+ },
+ {
+ "epoch": 2.193853427895981,
+ "grad_norm": 2.5652637481689453,
+ "learning_rate": 3.56015645537555e-06,
+ "loss": 0.4497,
+ "step": 4640
+ },
+ {
+ "epoch": 2.194326241134752,
+ "grad_norm": 2.699101209640503,
+ "learning_rate": 3.5595914641499224e-06,
+ "loss": 0.4887,
+ "step": 4641
+ },
+ {
+ "epoch": 2.1947990543735223,
+ "grad_norm": 2.9292235374450684,
+ "learning_rate": 3.5590264069481805e-06,
+ "loss": 0.4462,
+ "step": 4642
+ },
+ {
+ "epoch": 2.195271867612293,
+ "grad_norm": 2.6151106357574463,
+ "learning_rate": 3.5584612838055077e-06,
+ "loss": 0.4334,
+ "step": 4643
+ },
+ {
+ "epoch": 2.195744680851064,
+ "grad_norm": 2.895798444747925,
+ "learning_rate": 3.5578960947570923e-06,
+ "loss": 0.4448,
+ "step": 4644
+ },
+ {
+ "epoch": 2.1962174940898347,
+ "grad_norm": 2.627631425857544,
+ "learning_rate": 3.557330839838125e-06,
+ "loss": 0.436,
+ "step": 4645
+ },
+ {
+ "epoch": 2.196690307328605,
+ "grad_norm": 2.8803584575653076,
+ "learning_rate": 3.556765519083803e-06,
+ "loss": 0.4698,
+ "step": 4646
+ },
+ {
+ "epoch": 2.197163120567376,
+ "grad_norm": 2.436609983444214,
+ "learning_rate": 3.5562001325293265e-06,
+ "loss": 0.4043,
+ "step": 4647
+ },
+ {
+ "epoch": 2.1976359338061466,
+ "grad_norm": 2.5090718269348145,
+ "learning_rate": 3.5556346802098985e-06,
+ "loss": 0.4505,
+ "step": 4648
+ },
+ {
+ "epoch": 2.1981087470449174,
+ "grad_norm": 2.792783737182617,
+ "learning_rate": 3.5550691621607277e-06,
+ "loss": 0.43,
+ "step": 4649
+ },
+ {
+ "epoch": 2.198581560283688,
+ "grad_norm": 2.74153470993042,
+ "learning_rate": 3.554503578417026e-06,
+ "loss": 0.4496,
+ "step": 4650
+ },
+ {
+ "epoch": 2.1990543735224586,
+ "grad_norm": 3.0262627601623535,
+ "learning_rate": 3.5539379290140114e-06,
+ "loss": 0.4503,
+ "step": 4651
+ },
+ {
+ "epoch": 2.1995271867612294,
+ "grad_norm": 2.783811330795288,
+ "learning_rate": 3.553372213986903e-06,
+ "loss": 0.432,
+ "step": 4652
+ },
+ {
+ "epoch": 2.2,
+ "grad_norm": 3.091191053390503,
+ "learning_rate": 3.5528064333709255e-06,
+ "loss": 0.4658,
+ "step": 4653
+ },
+ {
+ "epoch": 2.2004728132387705,
+ "grad_norm": 2.814634084701538,
+ "learning_rate": 3.5522405872013076e-06,
+ "loss": 0.4473,
+ "step": 4654
+ },
+ {
+ "epoch": 2.2009456264775413,
+ "grad_norm": 2.6918299198150635,
+ "learning_rate": 3.5516746755132824e-06,
+ "loss": 0.5323,
+ "step": 4655
+ },
+ {
+ "epoch": 2.201418439716312,
+ "grad_norm": 2.9902455806732178,
+ "learning_rate": 3.5511086983420867e-06,
+ "loss": 0.5166,
+ "step": 4656
+ },
+ {
+ "epoch": 2.201891252955083,
+ "grad_norm": 2.932699203491211,
+ "learning_rate": 3.5505426557229616e-06,
+ "loss": 0.5197,
+ "step": 4657
+ },
+ {
+ "epoch": 2.2023640661938533,
+ "grad_norm": 2.585712432861328,
+ "learning_rate": 3.549976547691152e-06,
+ "loss": 0.425,
+ "step": 4658
+ },
+ {
+ "epoch": 2.202836879432624,
+ "grad_norm": 3.1019949913024902,
+ "learning_rate": 3.5494103742819065e-06,
+ "loss": 0.485,
+ "step": 4659
+ },
+ {
+ "epoch": 2.203309692671395,
+ "grad_norm": 2.3169195652008057,
+ "learning_rate": 3.548844135530478e-06,
+ "loss": 0.4064,
+ "step": 4660
+ },
+ {
+ "epoch": 2.2037825059101657,
+ "grad_norm": 2.779240846633911,
+ "learning_rate": 3.5482778314721257e-06,
+ "loss": 0.427,
+ "step": 4661
+ },
+ {
+ "epoch": 2.204255319148936,
+ "grad_norm": 2.765423059463501,
+ "learning_rate": 3.5477114621421078e-06,
+ "loss": 0.5125,
+ "step": 4662
+ },
+ {
+ "epoch": 2.204728132387707,
+ "grad_norm": 2.5590033531188965,
+ "learning_rate": 3.5471450275756913e-06,
+ "loss": 0.4009,
+ "step": 4663
+ },
+ {
+ "epoch": 2.2052009456264776,
+ "grad_norm": 2.706068515777588,
+ "learning_rate": 3.546578527808146e-06,
+ "loss": 0.4604,
+ "step": 4664
+ },
+ {
+ "epoch": 2.2056737588652484,
+ "grad_norm": 2.7995102405548096,
+ "learning_rate": 3.546011962874745e-06,
+ "loss": 0.4088,
+ "step": 4665
+ },
+ {
+ "epoch": 2.2061465721040188,
+ "grad_norm": 2.6369729042053223,
+ "learning_rate": 3.5454453328107656e-06,
+ "loss": 0.4634,
+ "step": 4666
+ },
+ {
+ "epoch": 2.2066193853427896,
+ "grad_norm": 3.1426475048065186,
+ "learning_rate": 3.54487863765149e-06,
+ "loss": 0.4761,
+ "step": 4667
+ },
+ {
+ "epoch": 2.2070921985815604,
+ "grad_norm": 2.7739460468292236,
+ "learning_rate": 3.5443118774322027e-06,
+ "loss": 0.467,
+ "step": 4668
+ },
+ {
+ "epoch": 2.207565011820331,
+ "grad_norm": 2.559105157852173,
+ "learning_rate": 3.5437450521881934e-06,
+ "loss": 0.4268,
+ "step": 4669
+ },
+ {
+ "epoch": 2.2080378250591015,
+ "grad_norm": 2.726593017578125,
+ "learning_rate": 3.543178161954758e-06,
+ "loss": 0.462,
+ "step": 4670
+ },
+ {
+ "epoch": 2.2085106382978723,
+ "grad_norm": 2.796109199523926,
+ "learning_rate": 3.5426112067671907e-06,
+ "loss": 0.4571,
+ "step": 4671
+ },
+ {
+ "epoch": 2.208983451536643,
+ "grad_norm": 2.7989072799682617,
+ "learning_rate": 3.5420441866607964e-06,
+ "loss": 0.4648,
+ "step": 4672
+ },
+ {
+ "epoch": 2.209456264775414,
+ "grad_norm": 2.6750967502593994,
+ "learning_rate": 3.5414771016708795e-06,
+ "loss": 0.4717,
+ "step": 4673
+ },
+ {
+ "epoch": 2.2099290780141843,
+ "grad_norm": 2.705659866333008,
+ "learning_rate": 3.5409099518327507e-06,
+ "loss": 0.4738,
+ "step": 4674
+ },
+ {
+ "epoch": 2.210401891252955,
+ "grad_norm": 2.79276442527771,
+ "learning_rate": 3.5403427371817234e-06,
+ "loss": 0.4625,
+ "step": 4675
+ },
+ {
+ "epoch": 2.210874704491726,
+ "grad_norm": 2.781339406967163,
+ "learning_rate": 3.539775457753115e-06,
+ "loss": 0.438,
+ "step": 4676
+ },
+ {
+ "epoch": 2.2113475177304966,
+ "grad_norm": 3.0088918209075928,
+ "learning_rate": 3.5392081135822488e-06,
+ "loss": 0.4776,
+ "step": 4677
+ },
+ {
+ "epoch": 2.211820330969267,
+ "grad_norm": 3.0291390419006348,
+ "learning_rate": 3.538640704704449e-06,
+ "loss": 0.4634,
+ "step": 4678
+ },
+ {
+ "epoch": 2.212293144208038,
+ "grad_norm": 2.967867374420166,
+ "learning_rate": 3.5380732311550477e-06,
+ "loss": 0.4776,
+ "step": 4679
+ },
+ {
+ "epoch": 2.2127659574468086,
+ "grad_norm": 2.6268832683563232,
+ "learning_rate": 3.5375056929693787e-06,
+ "loss": 0.4646,
+ "step": 4680
+ },
+ {
+ "epoch": 2.2132387706855794,
+ "grad_norm": 2.6688554286956787,
+ "learning_rate": 3.536938090182778e-06,
+ "loss": 0.3975,
+ "step": 4681
+ },
+ {
+ "epoch": 2.2137115839243497,
+ "grad_norm": 3.0079736709594727,
+ "learning_rate": 3.5363704228305906e-06,
+ "loss": 0.4724,
+ "step": 4682
+ },
+ {
+ "epoch": 2.2141843971631205,
+ "grad_norm": 2.4287586212158203,
+ "learning_rate": 3.535802690948161e-06,
+ "loss": 0.4371,
+ "step": 4683
+ },
+ {
+ "epoch": 2.2146572104018913,
+ "grad_norm": 2.960679531097412,
+ "learning_rate": 3.53523489457084e-06,
+ "loss": 0.4347,
+ "step": 4684
+ },
+ {
+ "epoch": 2.215130023640662,
+ "grad_norm": 2.9646008014678955,
+ "learning_rate": 3.5346670337339807e-06,
+ "loss": 0.4803,
+ "step": 4685
+ },
+ {
+ "epoch": 2.2156028368794325,
+ "grad_norm": 3.0518898963928223,
+ "learning_rate": 3.534099108472942e-06,
+ "loss": 0.4712,
+ "step": 4686
+ },
+ {
+ "epoch": 2.2160756501182033,
+ "grad_norm": 2.776681900024414,
+ "learning_rate": 3.533531118823086e-06,
+ "loss": 0.4347,
+ "step": 4687
+ },
+ {
+ "epoch": 2.216548463356974,
+ "grad_norm": 2.18019437789917,
+ "learning_rate": 3.53296306481978e-06,
+ "loss": 0.3551,
+ "step": 4688
+ },
+ {
+ "epoch": 2.217021276595745,
+ "grad_norm": 2.9400811195373535,
+ "learning_rate": 3.5323949464983937e-06,
+ "loss": 0.4912,
+ "step": 4689
+ },
+ {
+ "epoch": 2.2174940898345152,
+ "grad_norm": 2.798386812210083,
+ "learning_rate": 3.5318267638943e-06,
+ "loss": 0.3967,
+ "step": 4690
+ },
+ {
+ "epoch": 2.217966903073286,
+ "grad_norm": 2.5452775955200195,
+ "learning_rate": 3.531258517042879e-06,
+ "loss": 0.3773,
+ "step": 4691
+ },
+ {
+ "epoch": 2.218439716312057,
+ "grad_norm": 2.711137294769287,
+ "learning_rate": 3.5306902059795113e-06,
+ "loss": 0.4123,
+ "step": 4692
+ },
+ {
+ "epoch": 2.2189125295508276,
+ "grad_norm": 3.0022387504577637,
+ "learning_rate": 3.530121830739584e-06,
+ "loss": 0.4898,
+ "step": 4693
+ },
+ {
+ "epoch": 2.219385342789598,
+ "grad_norm": 2.871814250946045,
+ "learning_rate": 3.5295533913584877e-06,
+ "loss": 0.4497,
+ "step": 4694
+ },
+ {
+ "epoch": 2.219858156028369,
+ "grad_norm": 2.9782521724700928,
+ "learning_rate": 3.528984887871616e-06,
+ "loss": 0.4797,
+ "step": 4695
+ },
+ {
+ "epoch": 2.2203309692671396,
+ "grad_norm": 2.6896398067474365,
+ "learning_rate": 3.5284163203143673e-06,
+ "loss": 0.439,
+ "step": 4696
+ },
+ {
+ "epoch": 2.2208037825059104,
+ "grad_norm": 2.7898833751678467,
+ "learning_rate": 3.5278476887221436e-06,
+ "loss": 0.4656,
+ "step": 4697
+ },
+ {
+ "epoch": 2.2212765957446807,
+ "grad_norm": 2.800416946411133,
+ "learning_rate": 3.527278993130352e-06,
+ "loss": 0.4452,
+ "step": 4698
+ },
+ {
+ "epoch": 2.2217494089834515,
+ "grad_norm": 3.653228998184204,
+ "learning_rate": 3.526710233574401e-06,
+ "loss": 0.4189,
+ "step": 4699
+ },
+ {
+ "epoch": 2.2222222222222223,
+ "grad_norm": 2.856956958770752,
+ "learning_rate": 3.5261414100897064e-06,
+ "loss": 0.4298,
+ "step": 4700
+ },
+ {
+ "epoch": 2.222695035460993,
+ "grad_norm": 2.8576223850250244,
+ "learning_rate": 3.5255725227116854e-06,
+ "loss": 0.4425,
+ "step": 4701
+ },
+ {
+ "epoch": 2.2231678486997635,
+ "grad_norm": 3.1161351203918457,
+ "learning_rate": 3.5250035714757603e-06,
+ "loss": 0.4609,
+ "step": 4702
+ },
+ {
+ "epoch": 2.2236406619385343,
+ "grad_norm": 2.843379259109497,
+ "learning_rate": 3.5244345564173578e-06,
+ "loss": 0.3589,
+ "step": 4703
+ },
+ {
+ "epoch": 2.224113475177305,
+ "grad_norm": 2.877157211303711,
+ "learning_rate": 3.5238654775719068e-06,
+ "loss": 0.4591,
+ "step": 4704
+ },
+ {
+ "epoch": 2.2245862884160754,
+ "grad_norm": 3.488954782485962,
+ "learning_rate": 3.5232963349748424e-06,
+ "loss": 0.4836,
+ "step": 4705
+ },
+ {
+ "epoch": 2.225059101654846,
+ "grad_norm": 2.929037570953369,
+ "learning_rate": 3.5227271286616025e-06,
+ "loss": 0.5293,
+ "step": 4706
+ },
+ {
+ "epoch": 2.225531914893617,
+ "grad_norm": 2.6230576038360596,
+ "learning_rate": 3.5221578586676286e-06,
+ "loss": 0.4235,
+ "step": 4707
+ },
+ {
+ "epoch": 2.226004728132388,
+ "grad_norm": 2.529998302459717,
+ "learning_rate": 3.5215885250283664e-06,
+ "loss": 0.4369,
+ "step": 4708
+ },
+ {
+ "epoch": 2.2264775413711586,
+ "grad_norm": 2.817279577255249,
+ "learning_rate": 3.521019127779267e-06,
+ "loss": 0.481,
+ "step": 4709
+ },
+ {
+ "epoch": 2.226950354609929,
+ "grad_norm": 3.1513843536376953,
+ "learning_rate": 3.5204496669557833e-06,
+ "loss": 0.463,
+ "step": 4710
+ },
+ {
+ "epoch": 2.2274231678486998,
+ "grad_norm": 2.9403610229492188,
+ "learning_rate": 3.5198801425933725e-06,
+ "loss": 0.455,
+ "step": 4711
+ },
+ {
+ "epoch": 2.2278959810874706,
+ "grad_norm": 2.648346424102783,
+ "learning_rate": 3.5193105547274987e-06,
+ "loss": 0.4441,
+ "step": 4712
+ },
+ {
+ "epoch": 2.228368794326241,
+ "grad_norm": 2.791898727416992,
+ "learning_rate": 3.5187409033936252e-06,
+ "loss": 0.4682,
+ "step": 4713
+ },
+ {
+ "epoch": 2.2288416075650117,
+ "grad_norm": 2.8157432079315186,
+ "learning_rate": 3.5181711886272242e-06,
+ "loss": 0.4572,
+ "step": 4714
+ },
+ {
+ "epoch": 2.2293144208037825,
+ "grad_norm": 3.250319480895996,
+ "learning_rate": 3.5176014104637665e-06,
+ "loss": 0.4599,
+ "step": 4715
+ },
+ {
+ "epoch": 2.2297872340425533,
+ "grad_norm": 2.6747050285339355,
+ "learning_rate": 3.5170315689387307e-06,
+ "loss": 0.4328,
+ "step": 4716
+ },
+ {
+ "epoch": 2.230260047281324,
+ "grad_norm": 2.584094762802124,
+ "learning_rate": 3.5164616640875993e-06,
+ "loss": 0.4268,
+ "step": 4717
+ },
+ {
+ "epoch": 2.2307328605200945,
+ "grad_norm": 2.480710506439209,
+ "learning_rate": 3.5158916959458573e-06,
+ "loss": 0.438,
+ "step": 4718
+ },
+ {
+ "epoch": 2.2312056737588652,
+ "grad_norm": 2.9338483810424805,
+ "learning_rate": 3.515321664548993e-06,
+ "loss": 0.4937,
+ "step": 4719
+ },
+ {
+ "epoch": 2.231678486997636,
+ "grad_norm": 2.7880783081054688,
+ "learning_rate": 3.5147515699325013e-06,
+ "loss": 0.4624,
+ "step": 4720
+ },
+ {
+ "epoch": 2.2321513002364064,
+ "grad_norm": 2.740841865539551,
+ "learning_rate": 3.5141814121318797e-06,
+ "loss": 0.3689,
+ "step": 4721
+ },
+ {
+ "epoch": 2.232624113475177,
+ "grad_norm": 2.9541244506835938,
+ "learning_rate": 3.5136111911826277e-06,
+ "loss": 0.4092,
+ "step": 4722
+ },
+ {
+ "epoch": 2.233096926713948,
+ "grad_norm": 2.7205398082733154,
+ "learning_rate": 3.5130409071202515e-06,
+ "loss": 0.445,
+ "step": 4723
+ },
+ {
+ "epoch": 2.233569739952719,
+ "grad_norm": 2.563406229019165,
+ "learning_rate": 3.51247055998026e-06,
+ "loss": 0.4335,
+ "step": 4724
+ },
+ {
+ "epoch": 2.2340425531914896,
+ "grad_norm": 2.4249489307403564,
+ "learning_rate": 3.5119001497981666e-06,
+ "loss": 0.4671,
+ "step": 4725
+ },
+ {
+ "epoch": 2.23451536643026,
+ "grad_norm": 2.711630344390869,
+ "learning_rate": 3.5113296766094875e-06,
+ "loss": 0.4177,
+ "step": 4726
+ },
+ {
+ "epoch": 2.2349881796690307,
+ "grad_norm": 3.0257632732391357,
+ "learning_rate": 3.5107591404497443e-06,
+ "loss": 0.4976,
+ "step": 4727
+ },
+ {
+ "epoch": 2.2354609929078015,
+ "grad_norm": 2.717303991317749,
+ "learning_rate": 3.5101885413544614e-06,
+ "loss": 0.4621,
+ "step": 4728
+ },
+ {
+ "epoch": 2.235933806146572,
+ "grad_norm": 3.2846004962921143,
+ "learning_rate": 3.509617879359167e-06,
+ "loss": 0.4284,
+ "step": 4729
+ },
+ {
+ "epoch": 2.2364066193853427,
+ "grad_norm": 2.7217819690704346,
+ "learning_rate": 3.5090471544993953e-06,
+ "loss": 0.4247,
+ "step": 4730
+ },
+ {
+ "epoch": 2.2368794326241135,
+ "grad_norm": 2.5003223419189453,
+ "learning_rate": 3.5084763668106812e-06,
+ "loss": 0.4096,
+ "step": 4731
+ },
+ {
+ "epoch": 2.2373522458628843,
+ "grad_norm": 2.7312731742858887,
+ "learning_rate": 3.5079055163285658e-06,
+ "loss": 0.4741,
+ "step": 4732
+ },
+ {
+ "epoch": 2.237825059101655,
+ "grad_norm": 2.84940767288208,
+ "learning_rate": 3.5073346030885934e-06,
+ "loss": 0.4887,
+ "step": 4733
+ },
+ {
+ "epoch": 2.2382978723404254,
+ "grad_norm": 3.1188511848449707,
+ "learning_rate": 3.506763627126313e-06,
+ "loss": 0.5335,
+ "step": 4734
+ },
+ {
+ "epoch": 2.2387706855791962,
+ "grad_norm": 2.6741397380828857,
+ "learning_rate": 3.5061925884772753e-06,
+ "loss": 0.4137,
+ "step": 4735
+ },
+ {
+ "epoch": 2.239243498817967,
+ "grad_norm": 3.1542465686798096,
+ "learning_rate": 3.505621487177037e-06,
+ "loss": 0.5303,
+ "step": 4736
+ },
+ {
+ "epoch": 2.2397163120567374,
+ "grad_norm": 5.448268890380859,
+ "learning_rate": 3.505050323261159e-06,
+ "loss": 0.4995,
+ "step": 4737
+ },
+ {
+ "epoch": 2.240189125295508,
+ "grad_norm": 2.7317898273468018,
+ "learning_rate": 3.5044790967652037e-06,
+ "loss": 0.4595,
+ "step": 4738
+ },
+ {
+ "epoch": 2.240661938534279,
+ "grad_norm": 2.8135695457458496,
+ "learning_rate": 3.50390780772474e-06,
+ "loss": 0.4593,
+ "step": 4739
+ },
+ {
+ "epoch": 2.2411347517730498,
+ "grad_norm": 3.1391844749450684,
+ "learning_rate": 3.5033364561753393e-06,
+ "loss": 0.4902,
+ "step": 4740
+ },
+ {
+ "epoch": 2.24160756501182,
+ "grad_norm": 2.6383132934570312,
+ "learning_rate": 3.5027650421525762e-06,
+ "loss": 0.3832,
+ "step": 4741
+ },
+ {
+ "epoch": 2.242080378250591,
+ "grad_norm": 2.742546558380127,
+ "learning_rate": 3.5021935656920314e-06,
+ "loss": 0.4012,
+ "step": 4742
+ },
+ {
+ "epoch": 2.2425531914893617,
+ "grad_norm": 3.1243674755096436,
+ "learning_rate": 3.5016220268292873e-06,
+ "loss": 0.4271,
+ "step": 4743
+ },
+ {
+ "epoch": 2.2430260047281325,
+ "grad_norm": 2.794717788696289,
+ "learning_rate": 3.501050425599932e-06,
+ "loss": 0.4604,
+ "step": 4744
+ },
+ {
+ "epoch": 2.243498817966903,
+ "grad_norm": 2.8481621742248535,
+ "learning_rate": 3.5004787620395565e-06,
+ "loss": 0.4814,
+ "step": 4745
+ },
+ {
+ "epoch": 2.2439716312056737,
+ "grad_norm": 2.8842051029205322,
+ "learning_rate": 3.499907036183755e-06,
+ "loss": 0.4987,
+ "step": 4746
+ },
+ {
+ "epoch": 2.2444444444444445,
+ "grad_norm": 3.074805974960327,
+ "learning_rate": 3.4993352480681265e-06,
+ "loss": 0.4966,
+ "step": 4747
+ },
+ {
+ "epoch": 2.2449172576832153,
+ "grad_norm": 2.7204246520996094,
+ "learning_rate": 3.4987633977282742e-06,
+ "loss": 0.4,
+ "step": 4748
+ },
+ {
+ "epoch": 2.2453900709219856,
+ "grad_norm": 2.685884952545166,
+ "learning_rate": 3.4981914851998055e-06,
+ "loss": 0.4285,
+ "step": 4749
+ },
+ {
+ "epoch": 2.2458628841607564,
+ "grad_norm": 2.1666336059570312,
+ "learning_rate": 3.4976195105183287e-06,
+ "loss": 0.3756,
+ "step": 4750
+ },
+ {
+ "epoch": 2.246335697399527,
+ "grad_norm": 2.863006353378296,
+ "learning_rate": 3.49704747371946e-06,
+ "loss": 0.4535,
+ "step": 4751
+ },
+ {
+ "epoch": 2.246808510638298,
+ "grad_norm": 2.5558736324310303,
+ "learning_rate": 3.496475374838817e-06,
+ "loss": 0.4129,
+ "step": 4752
+ },
+ {
+ "epoch": 2.2472813238770684,
+ "grad_norm": 2.9780309200286865,
+ "learning_rate": 3.495903213912022e-06,
+ "loss": 0.4871,
+ "step": 4753
+ },
+ {
+ "epoch": 2.247754137115839,
+ "grad_norm": 2.951779365539551,
+ "learning_rate": 3.4953309909747e-06,
+ "loss": 0.5162,
+ "step": 4754
+ },
+ {
+ "epoch": 2.24822695035461,
+ "grad_norm": 2.7654693126678467,
+ "learning_rate": 3.4947587060624834e-06,
+ "loss": 0.4662,
+ "step": 4755
+ },
+ {
+ "epoch": 2.2486997635933808,
+ "grad_norm": 2.708247184753418,
+ "learning_rate": 3.494186359211002e-06,
+ "loss": 0.4279,
+ "step": 4756
+ },
+ {
+ "epoch": 2.249172576832151,
+ "grad_norm": 3.09916615486145,
+ "learning_rate": 3.4936139504558963e-06,
+ "loss": 0.4085,
+ "step": 4757
+ },
+ {
+ "epoch": 2.249645390070922,
+ "grad_norm": 2.913806200027466,
+ "learning_rate": 3.493041479832807e-06,
+ "loss": 0.4653,
+ "step": 4758
+ },
+ {
+ "epoch": 2.2501182033096927,
+ "grad_norm": 3.2903928756713867,
+ "learning_rate": 3.4924689473773787e-06,
+ "loss": 0.5167,
+ "step": 4759
+ },
+ {
+ "epoch": 2.2505910165484635,
+ "grad_norm": 3.1302902698516846,
+ "learning_rate": 3.4918963531252607e-06,
+ "loss": 0.5398,
+ "step": 4760
+ },
+ {
+ "epoch": 2.251063829787234,
+ "grad_norm": 2.8858273029327393,
+ "learning_rate": 3.4913236971121063e-06,
+ "loss": 0.4395,
+ "step": 4761
+ },
+ {
+ "epoch": 2.2515366430260046,
+ "grad_norm": 3.194521903991699,
+ "learning_rate": 3.4907509793735727e-06,
+ "loss": 0.5258,
+ "step": 4762
+ },
+ {
+ "epoch": 2.2520094562647754,
+ "grad_norm": 2.8640544414520264,
+ "learning_rate": 3.49017819994532e-06,
+ "loss": 0.4073,
+ "step": 4763
+ },
+ {
+ "epoch": 2.2524822695035462,
+ "grad_norm": 3.139995813369751,
+ "learning_rate": 3.489605358863011e-06,
+ "loss": 0.4653,
+ "step": 4764
+ },
+ {
+ "epoch": 2.2529550827423166,
+ "grad_norm": 2.6228537559509277,
+ "learning_rate": 3.489032456162317e-06,
+ "loss": 0.4546,
+ "step": 4765
+ },
+ {
+ "epoch": 2.2534278959810874,
+ "grad_norm": 2.8197672367095947,
+ "learning_rate": 3.4884594918789083e-06,
+ "loss": 0.479,
+ "step": 4766
+ },
+ {
+ "epoch": 2.253900709219858,
+ "grad_norm": 2.7839298248291016,
+ "learning_rate": 3.4878864660484612e-06,
+ "loss": 0.5081,
+ "step": 4767
+ },
+ {
+ "epoch": 2.254373522458629,
+ "grad_norm": 2.8630709648132324,
+ "learning_rate": 3.487313378706656e-06,
+ "loss": 0.4345,
+ "step": 4768
+ },
+ {
+ "epoch": 2.2548463356973993,
+ "grad_norm": 2.5661563873291016,
+ "learning_rate": 3.4867402298891755e-06,
+ "loss": 0.4266,
+ "step": 4769
+ },
+ {
+ "epoch": 2.25531914893617,
+ "grad_norm": 2.6274025440216064,
+ "learning_rate": 3.4861670196317084e-06,
+ "loss": 0.4645,
+ "step": 4770
+ },
+ {
+ "epoch": 2.255791962174941,
+ "grad_norm": 2.578702449798584,
+ "learning_rate": 3.485593747969944e-06,
+ "loss": 0.4242,
+ "step": 4771
+ },
+ {
+ "epoch": 2.2562647754137117,
+ "grad_norm": 2.322476625442505,
+ "learning_rate": 3.48502041493958e-06,
+ "loss": 0.3975,
+ "step": 4772
+ },
+ {
+ "epoch": 2.256737588652482,
+ "grad_norm": 2.8412630558013916,
+ "learning_rate": 3.484447020576313e-06,
+ "loss": 0.4276,
+ "step": 4773
+ },
+ {
+ "epoch": 2.257210401891253,
+ "grad_norm": 2.6090497970581055,
+ "learning_rate": 3.483873564915847e-06,
+ "loss": 0.429,
+ "step": 4774
+ },
+ {
+ "epoch": 2.2576832151300237,
+ "grad_norm": 2.692458152770996,
+ "learning_rate": 3.4833000479938877e-06,
+ "loss": 0.4211,
+ "step": 4775
+ },
+ {
+ "epoch": 2.2581560283687945,
+ "grad_norm": 2.5546815395355225,
+ "learning_rate": 3.482726469846146e-06,
+ "loss": 0.4751,
+ "step": 4776
+ },
+ {
+ "epoch": 2.258628841607565,
+ "grad_norm": 2.8409626483917236,
+ "learning_rate": 3.4821528305083376e-06,
+ "loss": 0.4821,
+ "step": 4777
+ },
+ {
+ "epoch": 2.2591016548463356,
+ "grad_norm": 2.722966432571411,
+ "learning_rate": 3.4815791300161785e-06,
+ "loss": 0.5029,
+ "step": 4778
+ },
+ {
+ "epoch": 2.2595744680851064,
+ "grad_norm": 2.691603899002075,
+ "learning_rate": 3.48100536840539e-06,
+ "loss": 0.4242,
+ "step": 4779
+ },
+ {
+ "epoch": 2.260047281323877,
+ "grad_norm": 2.64035964012146,
+ "learning_rate": 3.4804315457116992e-06,
+ "loss": 0.4033,
+ "step": 4780
+ },
+ {
+ "epoch": 2.2605200945626476,
+ "grad_norm": 2.758819580078125,
+ "learning_rate": 3.4798576619708357e-06,
+ "loss": 0.4321,
+ "step": 4781
+ },
+ {
+ "epoch": 2.2609929078014184,
+ "grad_norm": 2.8204405307769775,
+ "learning_rate": 3.4792837172185324e-06,
+ "loss": 0.4309,
+ "step": 4782
+ },
+ {
+ "epoch": 2.261465721040189,
+ "grad_norm": 2.529771327972412,
+ "learning_rate": 3.478709711490525e-06,
+ "loss": 0.4398,
+ "step": 4783
+ },
+ {
+ "epoch": 2.26193853427896,
+ "grad_norm": 2.8156251907348633,
+ "learning_rate": 3.4781356448225557e-06,
+ "loss": 0.447,
+ "step": 4784
+ },
+ {
+ "epoch": 2.2624113475177303,
+ "grad_norm": 2.689528703689575,
+ "learning_rate": 3.477561517250369e-06,
+ "loss": 0.3907,
+ "step": 4785
+ },
+ {
+ "epoch": 2.262884160756501,
+ "grad_norm": 2.9148027896881104,
+ "learning_rate": 3.476987328809713e-06,
+ "loss": 0.4287,
+ "step": 4786
+ },
+ {
+ "epoch": 2.263356973995272,
+ "grad_norm": 2.933021306991577,
+ "learning_rate": 3.4764130795363404e-06,
+ "loss": 0.4847,
+ "step": 4787
+ },
+ {
+ "epoch": 2.2638297872340427,
+ "grad_norm": 2.8559257984161377,
+ "learning_rate": 3.4758387694660064e-06,
+ "loss": 0.4554,
+ "step": 4788
+ },
+ {
+ "epoch": 2.264302600472813,
+ "grad_norm": 3.0355522632598877,
+ "learning_rate": 3.4752643986344707e-06,
+ "loss": 0.4286,
+ "step": 4789
+ },
+ {
+ "epoch": 2.264775413711584,
+ "grad_norm": 2.9768362045288086,
+ "learning_rate": 3.474689967077498e-06,
+ "loss": 0.4917,
+ "step": 4790
+ },
+ {
+ "epoch": 2.2652482269503547,
+ "grad_norm": 2.827971935272217,
+ "learning_rate": 3.474115474830855e-06,
+ "loss": 0.4542,
+ "step": 4791
+ },
+ {
+ "epoch": 2.2657210401891255,
+ "grad_norm": 2.559659719467163,
+ "learning_rate": 3.4735409219303123e-06,
+ "loss": 0.4168,
+ "step": 4792
+ },
+ {
+ "epoch": 2.266193853427896,
+ "grad_norm": 2.3172824382781982,
+ "learning_rate": 3.472966308411645e-06,
+ "loss": 0.3535,
+ "step": 4793
+ },
+ {
+ "epoch": 2.2666666666666666,
+ "grad_norm": 2.6779656410217285,
+ "learning_rate": 3.4723916343106327e-06,
+ "loss": 0.4599,
+ "step": 4794
+ },
+ {
+ "epoch": 2.2671394799054374,
+ "grad_norm": 2.55780291557312,
+ "learning_rate": 3.4718168996630573e-06,
+ "loss": 0.4185,
+ "step": 4795
+ },
+ {
+ "epoch": 2.267612293144208,
+ "grad_norm": 2.4929800033569336,
+ "learning_rate": 3.471242104504704e-06,
+ "loss": 0.4008,
+ "step": 4796
+ },
+ {
+ "epoch": 2.2680851063829786,
+ "grad_norm": 2.849475145339966,
+ "learning_rate": 3.4706672488713642e-06,
+ "loss": 0.396,
+ "step": 4797
+ },
+ {
+ "epoch": 2.2685579196217494,
+ "grad_norm": 2.4830739498138428,
+ "learning_rate": 3.4700923327988306e-06,
+ "loss": 0.4087,
+ "step": 4798
+ },
+ {
+ "epoch": 2.26903073286052,
+ "grad_norm": 3.2748119831085205,
+ "learning_rate": 3.469517356322901e-06,
+ "loss": 0.4496,
+ "step": 4799
+ },
+ {
+ "epoch": 2.269503546099291,
+ "grad_norm": 3.0440170764923096,
+ "learning_rate": 3.468942319479378e-06,
+ "loss": 0.4903,
+ "step": 4800
+ },
+ {
+ "epoch": 2.2699763593380613,
+ "grad_norm": 2.8200504779815674,
+ "learning_rate": 3.4683672223040645e-06,
+ "loss": 0.4588,
+ "step": 4801
+ },
+ {
+ "epoch": 2.270449172576832,
+ "grad_norm": 2.675206184387207,
+ "learning_rate": 3.4677920648327707e-06,
+ "loss": 0.4257,
+ "step": 4802
+ },
+ {
+ "epoch": 2.270921985815603,
+ "grad_norm": 2.862675905227661,
+ "learning_rate": 3.4672168471013084e-06,
+ "loss": 0.466,
+ "step": 4803
+ },
+ {
+ "epoch": 2.2713947990543737,
+ "grad_norm": 2.65663743019104,
+ "learning_rate": 3.4666415691454947e-06,
+ "loss": 0.4784,
+ "step": 4804
+ },
+ {
+ "epoch": 2.271867612293144,
+ "grad_norm": 2.5610506534576416,
+ "learning_rate": 3.4660662310011483e-06,
+ "loss": 0.4429,
+ "step": 4805
+ },
+ {
+ "epoch": 2.272340425531915,
+ "grad_norm": 2.6459643840789795,
+ "learning_rate": 3.465490832704094e-06,
+ "loss": 0.4345,
+ "step": 4806
+ },
+ {
+ "epoch": 2.2728132387706856,
+ "grad_norm": 2.426013469696045,
+ "learning_rate": 3.4649153742901585e-06,
+ "loss": 0.4533,
+ "step": 4807
+ },
+ {
+ "epoch": 2.2732860520094564,
+ "grad_norm": 2.6714842319488525,
+ "learning_rate": 3.4643398557951745e-06,
+ "loss": 0.4409,
+ "step": 4808
+ },
+ {
+ "epoch": 2.273758865248227,
+ "grad_norm": 2.703629493713379,
+ "learning_rate": 3.463764277254976e-06,
+ "loss": 0.3656,
+ "step": 4809
+ },
+ {
+ "epoch": 2.2742316784869976,
+ "grad_norm": 2.811753988265991,
+ "learning_rate": 3.4631886387054025e-06,
+ "loss": 0.4957,
+ "step": 4810
+ },
+ {
+ "epoch": 2.2747044917257684,
+ "grad_norm": 2.9469289779663086,
+ "learning_rate": 3.462612940182295e-06,
+ "loss": 0.4582,
+ "step": 4811
+ },
+ {
+ "epoch": 2.275177304964539,
+ "grad_norm": 2.6287801265716553,
+ "learning_rate": 3.462037181721501e-06,
+ "loss": 0.4072,
+ "step": 4812
+ },
+ {
+ "epoch": 2.2756501182033095,
+ "grad_norm": 2.7104952335357666,
+ "learning_rate": 3.46146136335887e-06,
+ "loss": 0.4998,
+ "step": 4813
+ },
+ {
+ "epoch": 2.2761229314420803,
+ "grad_norm": 3.170363187789917,
+ "learning_rate": 3.460885485130256e-06,
+ "loss": 0.4722,
+ "step": 4814
+ },
+ {
+ "epoch": 2.276595744680851,
+ "grad_norm": 2.7315151691436768,
+ "learning_rate": 3.460309547071516e-06,
+ "loss": 0.4482,
+ "step": 4815
+ },
+ {
+ "epoch": 2.277068557919622,
+ "grad_norm": 2.685988187789917,
+ "learning_rate": 3.4597335492185113e-06,
+ "loss": 0.4419,
+ "step": 4816
+ },
+ {
+ "epoch": 2.2775413711583923,
+ "grad_norm": 2.532790184020996,
+ "learning_rate": 3.459157491607107e-06,
+ "loss": 0.3961,
+ "step": 4817
+ },
+ {
+ "epoch": 2.278014184397163,
+ "grad_norm": 2.920729875564575,
+ "learning_rate": 3.458581374273171e-06,
+ "loss": 0.4767,
+ "step": 4818
+ },
+ {
+ "epoch": 2.278486997635934,
+ "grad_norm": 3.2481250762939453,
+ "learning_rate": 3.458005197252577e-06,
+ "loss": 0.4985,
+ "step": 4819
+ },
+ {
+ "epoch": 2.2789598108747047,
+ "grad_norm": 2.373809814453125,
+ "learning_rate": 3.4574289605811994e-06,
+ "loss": 0.4259,
+ "step": 4820
+ },
+ {
+ "epoch": 2.279432624113475,
+ "grad_norm": 2.7851033210754395,
+ "learning_rate": 3.4568526642949184e-06,
+ "loss": 0.4829,
+ "step": 4821
+ },
+ {
+ "epoch": 2.279905437352246,
+ "grad_norm": 2.9777133464813232,
+ "learning_rate": 3.456276308429618e-06,
+ "loss": 0.4896,
+ "step": 4822
+ },
+ {
+ "epoch": 2.2803782505910166,
+ "grad_norm": 2.7922022342681885,
+ "learning_rate": 3.4556998930211853e-06,
+ "loss": 0.4908,
+ "step": 4823
+ },
+ {
+ "epoch": 2.2808510638297874,
+ "grad_norm": 2.699180841445923,
+ "learning_rate": 3.4551234181055104e-06,
+ "loss": 0.4518,
+ "step": 4824
+ },
+ {
+ "epoch": 2.2813238770685578,
+ "grad_norm": 3.1200520992279053,
+ "learning_rate": 3.4545468837184885e-06,
+ "loss": 0.4877,
+ "step": 4825
+ },
+ {
+ "epoch": 2.2817966903073286,
+ "grad_norm": 2.56782603263855,
+ "learning_rate": 3.453970289896018e-06,
+ "loss": 0.4281,
+ "step": 4826
+ },
+ {
+ "epoch": 2.2822695035460994,
+ "grad_norm": 3.241356372833252,
+ "learning_rate": 3.4533936366740007e-06,
+ "loss": 0.4338,
+ "step": 4827
+ },
+ {
+ "epoch": 2.28274231678487,
+ "grad_norm": 3.560295343399048,
+ "learning_rate": 3.452816924088342e-06,
+ "loss": 0.4121,
+ "step": 4828
+ },
+ {
+ "epoch": 2.2832151300236405,
+ "grad_norm": 2.8512449264526367,
+ "learning_rate": 3.452240152174951e-06,
+ "loss": 0.4357,
+ "step": 4829
+ },
+ {
+ "epoch": 2.2836879432624113,
+ "grad_norm": 3.0332651138305664,
+ "learning_rate": 3.4516633209697408e-06,
+ "loss": 0.4985,
+ "step": 4830
+ },
+ {
+ "epoch": 2.284160756501182,
+ "grad_norm": 2.520930528640747,
+ "learning_rate": 3.451086430508629e-06,
+ "loss": 0.4021,
+ "step": 4831
+ },
+ {
+ "epoch": 2.284633569739953,
+ "grad_norm": 2.508227825164795,
+ "learning_rate": 3.4505094808275363e-06,
+ "loss": 0.3935,
+ "step": 4832
+ },
+ {
+ "epoch": 2.2851063829787233,
+ "grad_norm": 2.56752610206604,
+ "learning_rate": 3.449932471962385e-06,
+ "loss": 0.4689,
+ "step": 4833
+ },
+ {
+ "epoch": 2.285579196217494,
+ "grad_norm": 2.7757534980773926,
+ "learning_rate": 3.449355403949105e-06,
+ "loss": 0.4565,
+ "step": 4834
+ },
+ {
+ "epoch": 2.286052009456265,
+ "grad_norm": 3.364821195602417,
+ "learning_rate": 3.448778276823626e-06,
+ "loss": 0.4729,
+ "step": 4835
+ },
+ {
+ "epoch": 2.2865248226950357,
+ "grad_norm": 3.0045557022094727,
+ "learning_rate": 3.448201090621884e-06,
+ "loss": 0.4834,
+ "step": 4836
+ },
+ {
+ "epoch": 2.286997635933806,
+ "grad_norm": 2.9451794624328613,
+ "learning_rate": 3.4476238453798183e-06,
+ "loss": 0.489,
+ "step": 4837
+ },
+ {
+ "epoch": 2.287470449172577,
+ "grad_norm": 2.8307435512542725,
+ "learning_rate": 3.4470465411333708e-06,
+ "loss": 0.5079,
+ "step": 4838
+ },
+ {
+ "epoch": 2.2879432624113476,
+ "grad_norm": 2.7118136882781982,
+ "learning_rate": 3.4464691779184876e-06,
+ "loss": 0.4794,
+ "step": 4839
+ },
+ {
+ "epoch": 2.2884160756501184,
+ "grad_norm": 2.6724441051483154,
+ "learning_rate": 3.445891755771119e-06,
+ "loss": 0.4619,
+ "step": 4840
+ },
+ {
+ "epoch": 2.2888888888888888,
+ "grad_norm": 2.8161258697509766,
+ "learning_rate": 3.445314274727218e-06,
+ "loss": 0.4287,
+ "step": 4841
+ },
+ {
+ "epoch": 2.2893617021276595,
+ "grad_norm": 2.5681750774383545,
+ "learning_rate": 3.4447367348227433e-06,
+ "loss": 0.4167,
+ "step": 4842
+ },
+ {
+ "epoch": 2.2898345153664303,
+ "grad_norm": 2.8136284351348877,
+ "learning_rate": 3.444159136093654e-06,
+ "loss": 0.4195,
+ "step": 4843
+ },
+ {
+ "epoch": 2.290307328605201,
+ "grad_norm": 3.153651714324951,
+ "learning_rate": 3.443581478575915e-06,
+ "loss": 0.4821,
+ "step": 4844
+ },
+ {
+ "epoch": 2.2907801418439715,
+ "grad_norm": 2.980883836746216,
+ "learning_rate": 3.4430037623054953e-06,
+ "loss": 0.4627,
+ "step": 4845
+ },
+ {
+ "epoch": 2.2912529550827423,
+ "grad_norm": 2.786182403564453,
+ "learning_rate": 3.4424259873183664e-06,
+ "loss": 0.4342,
+ "step": 4846
+ },
+ {
+ "epoch": 2.291725768321513,
+ "grad_norm": 2.8938279151916504,
+ "learning_rate": 3.4418481536505026e-06,
+ "loss": 0.3997,
+ "step": 4847
+ },
+ {
+ "epoch": 2.2921985815602834,
+ "grad_norm": 2.5534510612487793,
+ "learning_rate": 3.4412702613378844e-06,
+ "loss": 0.3982,
+ "step": 4848
+ },
+ {
+ "epoch": 2.2926713947990542,
+ "grad_norm": 2.7907063961029053,
+ "learning_rate": 3.4406923104164956e-06,
+ "loss": 0.4484,
+ "step": 4849
+ },
+ {
+ "epoch": 2.293144208037825,
+ "grad_norm": 3.162702798843384,
+ "learning_rate": 3.4401143009223203e-06,
+ "loss": 0.4528,
+ "step": 4850
+ },
+ {
+ "epoch": 2.293617021276596,
+ "grad_norm": 2.4647393226623535,
+ "learning_rate": 3.4395362328913505e-06,
+ "loss": 0.3759,
+ "step": 4851
+ },
+ {
+ "epoch": 2.2940898345153666,
+ "grad_norm": 2.8219876289367676,
+ "learning_rate": 3.438958106359579e-06,
+ "loss": 0.4903,
+ "step": 4852
+ },
+ {
+ "epoch": 2.294562647754137,
+ "grad_norm": 2.827073097229004,
+ "learning_rate": 3.438379921363003e-06,
+ "loss": 0.4315,
+ "step": 4853
+ },
+ {
+ "epoch": 2.295035460992908,
+ "grad_norm": 2.472470283508301,
+ "learning_rate": 3.4378016779376244e-06,
+ "loss": 0.4478,
+ "step": 4854
+ },
+ {
+ "epoch": 2.2955082742316786,
+ "grad_norm": 3.3994734287261963,
+ "learning_rate": 3.4372233761194473e-06,
+ "loss": 0.5086,
+ "step": 4855
+ },
+ {
+ "epoch": 2.295981087470449,
+ "grad_norm": 3.030465602874756,
+ "learning_rate": 3.4366450159444796e-06,
+ "loss": 0.4159,
+ "step": 4856
+ },
+ {
+ "epoch": 2.2964539007092197,
+ "grad_norm": 2.5460705757141113,
+ "learning_rate": 3.4360665974487346e-06,
+ "loss": 0.4097,
+ "step": 4857
+ },
+ {
+ "epoch": 2.2969267139479905,
+ "grad_norm": 2.884469509124756,
+ "learning_rate": 3.4354881206682273e-06,
+ "loss": 0.4478,
+ "step": 4858
+ },
+ {
+ "epoch": 2.2973995271867613,
+ "grad_norm": 2.5139710903167725,
+ "learning_rate": 3.4349095856389765e-06,
+ "loss": 0.4286,
+ "step": 4859
+ },
+ {
+ "epoch": 2.297872340425532,
+ "grad_norm": 3.1628260612487793,
+ "learning_rate": 3.4343309923970053e-06,
+ "loss": 0.4617,
+ "step": 4860
+ },
+ {
+ "epoch": 2.2983451536643025,
+ "grad_norm": 2.6141695976257324,
+ "learning_rate": 3.4337523409783395e-06,
+ "loss": 0.3841,
+ "step": 4861
+ },
+ {
+ "epoch": 2.2988179669030733,
+ "grad_norm": 2.766834259033203,
+ "learning_rate": 3.43317363141901e-06,
+ "loss": 0.4484,
+ "step": 4862
+ },
+ {
+ "epoch": 2.299290780141844,
+ "grad_norm": 2.785491943359375,
+ "learning_rate": 3.4325948637550503e-06,
+ "loss": 0.4363,
+ "step": 4863
+ },
+ {
+ "epoch": 2.2997635933806144,
+ "grad_norm": 2.624929189682007,
+ "learning_rate": 3.4320160380224988e-06,
+ "loss": 0.4518,
+ "step": 4864
+ },
+ {
+ "epoch": 2.300236406619385,
+ "grad_norm": 2.895413398742676,
+ "learning_rate": 3.4314371542573944e-06,
+ "loss": 0.4745,
+ "step": 4865
+ },
+ {
+ "epoch": 2.300709219858156,
+ "grad_norm": 2.603816270828247,
+ "learning_rate": 3.430858212495783e-06,
+ "loss": 0.4444,
+ "step": 4866
+ },
+ {
+ "epoch": 2.301182033096927,
+ "grad_norm": 3.387360095977783,
+ "learning_rate": 3.4302792127737116e-06,
+ "loss": 0.4169,
+ "step": 4867
+ },
+ {
+ "epoch": 2.3016548463356976,
+ "grad_norm": 2.894054651260376,
+ "learning_rate": 3.4297001551272334e-06,
+ "loss": 0.4493,
+ "step": 4868
+ },
+ {
+ "epoch": 2.302127659574468,
+ "grad_norm": 3.0432028770446777,
+ "learning_rate": 3.4291210395924035e-06,
+ "loss": 0.4854,
+ "step": 4869
+ },
+ {
+ "epoch": 2.3026004728132388,
+ "grad_norm": 2.5144734382629395,
+ "learning_rate": 3.42854186620528e-06,
+ "loss": 0.4556,
+ "step": 4870
+ },
+ {
+ "epoch": 2.3030732860520096,
+ "grad_norm": 2.964812755584717,
+ "learning_rate": 3.427962635001926e-06,
+ "loss": 0.495,
+ "step": 4871
+ },
+ {
+ "epoch": 2.30354609929078,
+ "grad_norm": 2.9991118907928467,
+ "learning_rate": 3.4273833460184077e-06,
+ "loss": 0.4787,
+ "step": 4872
+ },
+ {
+ "epoch": 2.3040189125295507,
+ "grad_norm": 2.9424328804016113,
+ "learning_rate": 3.4268039992907955e-06,
+ "loss": 0.5006,
+ "step": 4873
+ },
+ {
+ "epoch": 2.3044917257683215,
+ "grad_norm": 2.792880058288574,
+ "learning_rate": 3.426224594855162e-06,
+ "loss": 0.4399,
+ "step": 4874
+ },
+ {
+ "epoch": 2.3049645390070923,
+ "grad_norm": 2.5308053493499756,
+ "learning_rate": 3.4256451327475838e-06,
+ "loss": 0.4843,
+ "step": 4875
+ },
+ {
+ "epoch": 2.305437352245863,
+ "grad_norm": 2.7937564849853516,
+ "learning_rate": 3.425065613004142e-06,
+ "loss": 0.4428,
+ "step": 4876
+ },
+ {
+ "epoch": 2.3059101654846335,
+ "grad_norm": 2.4231557846069336,
+ "learning_rate": 3.424486035660921e-06,
+ "loss": 0.4054,
+ "step": 4877
+ },
+ {
+ "epoch": 2.3063829787234043,
+ "grad_norm": 3.0622596740722656,
+ "learning_rate": 3.423906400754009e-06,
+ "loss": 0.4623,
+ "step": 4878
+ },
+ {
+ "epoch": 2.306855791962175,
+ "grad_norm": 2.6532933712005615,
+ "learning_rate": 3.4233267083194955e-06,
+ "loss": 0.4387,
+ "step": 4879
+ },
+ {
+ "epoch": 2.3073286052009454,
+ "grad_norm": 2.793325185775757,
+ "learning_rate": 3.422746958393477e-06,
+ "loss": 0.4047,
+ "step": 4880
+ },
+ {
+ "epoch": 2.307801418439716,
+ "grad_norm": 2.9178314208984375,
+ "learning_rate": 3.422167151012052e-06,
+ "loss": 0.4397,
+ "step": 4881
+ },
+ {
+ "epoch": 2.308274231678487,
+ "grad_norm": 3.463913917541504,
+ "learning_rate": 3.4215872862113214e-06,
+ "loss": 0.4347,
+ "step": 4882
+ },
+ {
+ "epoch": 2.308747044917258,
+ "grad_norm": 3.228403091430664,
+ "learning_rate": 3.421007364027392e-06,
+ "loss": 0.4405,
+ "step": 4883
+ },
+ {
+ "epoch": 2.3092198581560286,
+ "grad_norm": 2.896933078765869,
+ "learning_rate": 3.420427384496372e-06,
+ "loss": 0.4429,
+ "step": 4884
+ },
+ {
+ "epoch": 2.309692671394799,
+ "grad_norm": 2.5559937953948975,
+ "learning_rate": 3.4198473476543755e-06,
+ "loss": 0.4281,
+ "step": 4885
+ },
+ {
+ "epoch": 2.3101654846335697,
+ "grad_norm": 3.457918167114258,
+ "learning_rate": 3.419267253537517e-06,
+ "loss": 0.4495,
+ "step": 4886
+ },
+ {
+ "epoch": 2.3106382978723405,
+ "grad_norm": 2.6554839611053467,
+ "learning_rate": 3.418687102181918e-06,
+ "loss": 0.4682,
+ "step": 4887
+ },
+ {
+ "epoch": 2.311111111111111,
+ "grad_norm": 2.8171639442443848,
+ "learning_rate": 3.4181068936237024e-06,
+ "loss": 0.4184,
+ "step": 4888
+ },
+ {
+ "epoch": 2.3115839243498817,
+ "grad_norm": 2.9272499084472656,
+ "learning_rate": 3.4175266278989955e-06,
+ "loss": 0.5445,
+ "step": 4889
+ },
+ {
+ "epoch": 2.3120567375886525,
+ "grad_norm": 2.5928499698638916,
+ "learning_rate": 3.4169463050439284e-06,
+ "loss": 0.3808,
+ "step": 4890
+ },
+ {
+ "epoch": 2.3125295508274233,
+ "grad_norm": 2.6624577045440674,
+ "learning_rate": 3.4163659250946356e-06,
+ "loss": 0.4678,
+ "step": 4891
+ },
+ {
+ "epoch": 2.313002364066194,
+ "grad_norm": 2.666555643081665,
+ "learning_rate": 3.4157854880872553e-06,
+ "loss": 0.457,
+ "step": 4892
+ },
+ {
+ "epoch": 2.3134751773049644,
+ "grad_norm": 3.2987406253814697,
+ "learning_rate": 3.4152049940579278e-06,
+ "loss": 0.551,
+ "step": 4893
+ },
+ {
+ "epoch": 2.3139479905437352,
+ "grad_norm": 2.728119134902954,
+ "learning_rate": 3.414624443042799e-06,
+ "loss": 0.3935,
+ "step": 4894
+ },
+ {
+ "epoch": 2.314420803782506,
+ "grad_norm": 3.133005380630493,
+ "learning_rate": 3.4140438350780157e-06,
+ "loss": 0.4981,
+ "step": 4895
+ },
+ {
+ "epoch": 2.3148936170212764,
+ "grad_norm": 2.591252565383911,
+ "learning_rate": 3.4134631701997312e-06,
+ "loss": 0.4251,
+ "step": 4896
+ },
+ {
+ "epoch": 2.315366430260047,
+ "grad_norm": 3.007136344909668,
+ "learning_rate": 3.412882448444101e-06,
+ "loss": 0.4492,
+ "step": 4897
+ },
+ {
+ "epoch": 2.315839243498818,
+ "grad_norm": 2.6391026973724365,
+ "learning_rate": 3.412301669847284e-06,
+ "loss": 0.5151,
+ "step": 4898
+ },
+ {
+ "epoch": 2.3163120567375888,
+ "grad_norm": 7.453699111938477,
+ "learning_rate": 3.411720834445441e-06,
+ "loss": 0.4983,
+ "step": 4899
+ },
+ {
+ "epoch": 2.3167848699763596,
+ "grad_norm": 2.667712688446045,
+ "learning_rate": 3.41113994227474e-06,
+ "loss": 0.4581,
+ "step": 4900
+ },
+ {
+ "epoch": 2.31725768321513,
+ "grad_norm": 2.7727627754211426,
+ "learning_rate": 3.41055899337135e-06,
+ "loss": 0.4731,
+ "step": 4901
+ },
+ {
+ "epoch": 2.3177304964539007,
+ "grad_norm": 3.0096890926361084,
+ "learning_rate": 3.409977987771444e-06,
+ "loss": 0.4996,
+ "step": 4902
+ },
+ {
+ "epoch": 2.3182033096926715,
+ "grad_norm": 2.725830078125,
+ "learning_rate": 3.4093969255111993e-06,
+ "loss": 0.4544,
+ "step": 4903
+ },
+ {
+ "epoch": 2.318676122931442,
+ "grad_norm": 2.7596993446350098,
+ "learning_rate": 3.4088158066267945e-06,
+ "loss": 0.4846,
+ "step": 4904
+ },
+ {
+ "epoch": 2.3191489361702127,
+ "grad_norm": 2.702620029449463,
+ "learning_rate": 3.4082346311544156e-06,
+ "loss": 0.4849,
+ "step": 4905
+ },
+ {
+ "epoch": 2.3196217494089835,
+ "grad_norm": 2.725374460220337,
+ "learning_rate": 3.407653399130249e-06,
+ "loss": 0.4116,
+ "step": 4906
+ },
+ {
+ "epoch": 2.3200945626477543,
+ "grad_norm": 2.6770219802856445,
+ "learning_rate": 3.4070721105904847e-06,
+ "loss": 0.4606,
+ "step": 4907
+ },
+ {
+ "epoch": 2.320567375886525,
+ "grad_norm": 2.9249117374420166,
+ "learning_rate": 3.406490765571317e-06,
+ "loss": 0.461,
+ "step": 4908
+ },
+ {
+ "epoch": 2.3210401891252954,
+ "grad_norm": 2.7568278312683105,
+ "learning_rate": 3.405909364108944e-06,
+ "loss": 0.4065,
+ "step": 4909
+ },
+ {
+ "epoch": 2.321513002364066,
+ "grad_norm": 2.7231340408325195,
+ "learning_rate": 3.4053279062395676e-06,
+ "loss": 0.4173,
+ "step": 4910
+ },
+ {
+ "epoch": 2.321985815602837,
+ "grad_norm": 3.1401100158691406,
+ "learning_rate": 3.404746391999393e-06,
+ "loss": 0.4287,
+ "step": 4911
+ },
+ {
+ "epoch": 2.3224586288416074,
+ "grad_norm": 2.714853525161743,
+ "learning_rate": 3.404164821424627e-06,
+ "loss": 0.4552,
+ "step": 4912
+ },
+ {
+ "epoch": 2.322931442080378,
+ "grad_norm": 3.1509978771209717,
+ "learning_rate": 3.4035831945514825e-06,
+ "loss": 0.5296,
+ "step": 4913
+ },
+ {
+ "epoch": 2.323404255319149,
+ "grad_norm": 2.567194938659668,
+ "learning_rate": 3.403001511416174e-06,
+ "loss": 0.4306,
+ "step": 4914
+ },
+ {
+ "epoch": 2.3238770685579198,
+ "grad_norm": 2.7473888397216797,
+ "learning_rate": 3.402419772054922e-06,
+ "loss": 0.4009,
+ "step": 4915
+ },
+ {
+ "epoch": 2.3243498817966906,
+ "grad_norm": 2.8617780208587646,
+ "learning_rate": 3.401837976503947e-06,
+ "loss": 0.4545,
+ "step": 4916
+ },
+ {
+ "epoch": 2.324822695035461,
+ "grad_norm": 2.3650572299957275,
+ "learning_rate": 3.401256124799475e-06,
+ "loss": 0.4046,
+ "step": 4917
+ },
+ {
+ "epoch": 2.3252955082742317,
+ "grad_norm": 2.418407678604126,
+ "learning_rate": 3.4006742169777364e-06,
+ "loss": 0.4222,
+ "step": 4918
+ },
+ {
+ "epoch": 2.3257683215130025,
+ "grad_norm": 2.7232494354248047,
+ "learning_rate": 3.400092253074964e-06,
+ "loss": 0.4373,
+ "step": 4919
+ },
+ {
+ "epoch": 2.326241134751773,
+ "grad_norm": 2.702965497970581,
+ "learning_rate": 3.399510233127394e-06,
+ "loss": 0.437,
+ "step": 4920
+ },
+ {
+ "epoch": 2.3267139479905437,
+ "grad_norm": 2.8381760120391846,
+ "learning_rate": 3.3989281571712664e-06,
+ "loss": 0.4294,
+ "step": 4921
+ },
+ {
+ "epoch": 2.3271867612293144,
+ "grad_norm": 2.767131805419922,
+ "learning_rate": 3.398346025242823e-06,
+ "loss": 0.4673,
+ "step": 4922
+ },
+ {
+ "epoch": 2.3276595744680852,
+ "grad_norm": 2.5261805057525635,
+ "learning_rate": 3.3977638373783123e-06,
+ "loss": 0.4147,
+ "step": 4923
+ },
+ {
+ "epoch": 2.3281323877068556,
+ "grad_norm": 2.7176897525787354,
+ "learning_rate": 3.3971815936139836e-06,
+ "loss": 0.3885,
+ "step": 4924
+ },
+ {
+ "epoch": 2.3286052009456264,
+ "grad_norm": 2.849043130874634,
+ "learning_rate": 3.396599293986092e-06,
+ "loss": 0.4842,
+ "step": 4925
+ },
+ {
+ "epoch": 2.329078014184397,
+ "grad_norm": 2.550673484802246,
+ "learning_rate": 3.3960169385308927e-06,
+ "loss": 0.4049,
+ "step": 4926
+ },
+ {
+ "epoch": 2.329550827423168,
+ "grad_norm": 3.0821585655212402,
+ "learning_rate": 3.3954345272846477e-06,
+ "loss": 0.53,
+ "step": 4927
+ },
+ {
+ "epoch": 2.3300236406619383,
+ "grad_norm": 2.68658185005188,
+ "learning_rate": 3.3948520602836223e-06,
+ "loss": 0.4592,
+ "step": 4928
+ },
+ {
+ "epoch": 2.330496453900709,
+ "grad_norm": 2.7391903400421143,
+ "learning_rate": 3.394269537564082e-06,
+ "loss": 0.4773,
+ "step": 4929
+ },
+ {
+ "epoch": 2.33096926713948,
+ "grad_norm": 2.665114164352417,
+ "learning_rate": 3.393686959162299e-06,
+ "loss": 0.4671,
+ "step": 4930
+ },
+ {
+ "epoch": 2.3314420803782507,
+ "grad_norm": 2.6827399730682373,
+ "learning_rate": 3.3931043251145477e-06,
+ "loss": 0.4669,
+ "step": 4931
+ },
+ {
+ "epoch": 2.331914893617021,
+ "grad_norm": 3.1760666370391846,
+ "learning_rate": 3.392521635457106e-06,
+ "loss": 0.4729,
+ "step": 4932
+ },
+ {
+ "epoch": 2.332387706855792,
+ "grad_norm": 2.9686226844787598,
+ "learning_rate": 3.3919388902262555e-06,
+ "loss": 0.5017,
+ "step": 4933
+ },
+ {
+ "epoch": 2.3328605200945627,
+ "grad_norm": 2.471325397491455,
+ "learning_rate": 3.3913560894582818e-06,
+ "loss": 0.4195,
+ "step": 4934
+ },
+ {
+ "epoch": 2.3333333333333335,
+ "grad_norm": 2.4062955379486084,
+ "learning_rate": 3.3907732331894732e-06,
+ "loss": 0.3666,
+ "step": 4935
+ },
+ {
+ "epoch": 2.333806146572104,
+ "grad_norm": 2.6800320148468018,
+ "learning_rate": 3.3901903214561206e-06,
+ "loss": 0.4774,
+ "step": 4936
+ },
+ {
+ "epoch": 2.3342789598108746,
+ "grad_norm": 2.923741102218628,
+ "learning_rate": 3.389607354294521e-06,
+ "loss": 0.4546,
+ "step": 4937
+ },
+ {
+ "epoch": 2.3347517730496454,
+ "grad_norm": 3.0034096240997314,
+ "learning_rate": 3.3890243317409716e-06,
+ "loss": 0.5373,
+ "step": 4938
+ },
+ {
+ "epoch": 2.3352245862884162,
+ "grad_norm": 3.0757339000701904,
+ "learning_rate": 3.388441253831775e-06,
+ "loss": 0.4655,
+ "step": 4939
+ },
+ {
+ "epoch": 2.3356973995271866,
+ "grad_norm": 2.5352041721343994,
+ "learning_rate": 3.3878581206032373e-06,
+ "loss": 0.4391,
+ "step": 4940
+ },
+ {
+ "epoch": 2.3361702127659574,
+ "grad_norm": 2.9332237243652344,
+ "learning_rate": 3.3872749320916675e-06,
+ "loss": 0.4685,
+ "step": 4941
+ },
+ {
+ "epoch": 2.336643026004728,
+ "grad_norm": 2.4871222972869873,
+ "learning_rate": 3.386691688333379e-06,
+ "loss": 0.3952,
+ "step": 4942
+ },
+ {
+ "epoch": 2.337115839243499,
+ "grad_norm": 2.6384918689727783,
+ "learning_rate": 3.386108389364687e-06,
+ "loss": 0.4044,
+ "step": 4943
+ },
+ {
+ "epoch": 2.3375886524822693,
+ "grad_norm": 2.3545165061950684,
+ "learning_rate": 3.3855250352219102e-06,
+ "loss": 0.426,
+ "step": 4944
+ },
+ {
+ "epoch": 2.33806146572104,
+ "grad_norm": 2.972242593765259,
+ "learning_rate": 3.3849416259413735e-06,
+ "loss": 0.5033,
+ "step": 4945
+ },
+ {
+ "epoch": 2.338534278959811,
+ "grad_norm": 3.117351770401001,
+ "learning_rate": 3.384358161559401e-06,
+ "loss": 0.4695,
+ "step": 4946
+ },
+ {
+ "epoch": 2.3390070921985817,
+ "grad_norm": 2.888916492462158,
+ "learning_rate": 3.383774642112324e-06,
+ "loss": 0.437,
+ "step": 4947
+ },
+ {
+ "epoch": 2.339479905437352,
+ "grad_norm": 3.0677435398101807,
+ "learning_rate": 3.3831910676364753e-06,
+ "loss": 0.4293,
+ "step": 4948
+ },
+ {
+ "epoch": 2.339952718676123,
+ "grad_norm": 2.8571784496307373,
+ "learning_rate": 3.3826074381681916e-06,
+ "loss": 0.4574,
+ "step": 4949
+ },
+ {
+ "epoch": 2.3404255319148937,
+ "grad_norm": 2.907276153564453,
+ "learning_rate": 3.3820237537438127e-06,
+ "loss": 0.4731,
+ "step": 4950
+ },
+ {
+ "epoch": 2.3408983451536645,
+ "grad_norm": 2.923762559890747,
+ "learning_rate": 3.3814400143996823e-06,
+ "loss": 0.4648,
+ "step": 4951
+ },
+ {
+ "epoch": 2.341371158392435,
+ "grad_norm": 2.6206982135772705,
+ "learning_rate": 3.3808562201721473e-06,
+ "loss": 0.436,
+ "step": 4952
+ },
+ {
+ "epoch": 2.3418439716312056,
+ "grad_norm": 6.279088973999023,
+ "learning_rate": 3.380272371097558e-06,
+ "loss": 0.4461,
+ "step": 4953
+ },
+ {
+ "epoch": 2.3423167848699764,
+ "grad_norm": 2.785297155380249,
+ "learning_rate": 3.3796884672122684e-06,
+ "loss": 0.4619,
+ "step": 4954
+ },
+ {
+ "epoch": 2.342789598108747,
+ "grad_norm": 2.6241793632507324,
+ "learning_rate": 3.379104508552634e-06,
+ "loss": 0.4323,
+ "step": 4955
+ },
+ {
+ "epoch": 2.3432624113475176,
+ "grad_norm": 2.6052167415618896,
+ "learning_rate": 3.378520495155017e-06,
+ "loss": 0.3943,
+ "step": 4956
+ },
+ {
+ "epoch": 2.3437352245862884,
+ "grad_norm": 2.8247411251068115,
+ "learning_rate": 3.3779364270557818e-06,
+ "loss": 0.4689,
+ "step": 4957
+ },
+ {
+ "epoch": 2.344208037825059,
+ "grad_norm": 2.5348927974700928,
+ "learning_rate": 3.377352304291294e-06,
+ "loss": 0.4619,
+ "step": 4958
+ },
+ {
+ "epoch": 2.34468085106383,
+ "grad_norm": 2.906648874282837,
+ "learning_rate": 3.376768126897926e-06,
+ "loss": 0.5191,
+ "step": 4959
+ },
+ {
+ "epoch": 2.3451536643026003,
+ "grad_norm": 2.796870470046997,
+ "learning_rate": 3.3761838949120514e-06,
+ "loss": 0.4227,
+ "step": 4960
+ },
+ {
+ "epoch": 2.345626477541371,
+ "grad_norm": 2.789635419845581,
+ "learning_rate": 3.3755996083700464e-06,
+ "loss": 0.3927,
+ "step": 4961
+ },
+ {
+ "epoch": 2.346099290780142,
+ "grad_norm": 2.86641263961792,
+ "learning_rate": 3.375015267308295e-06,
+ "loss": 0.4097,
+ "step": 4962
+ },
+ {
+ "epoch": 2.3465721040189127,
+ "grad_norm": 2.8374414443969727,
+ "learning_rate": 3.374430871763178e-06,
+ "loss": 0.4566,
+ "step": 4963
+ },
+ {
+ "epoch": 2.347044917257683,
+ "grad_norm": 2.71951961517334,
+ "learning_rate": 3.3738464217710854e-06,
+ "loss": 0.4748,
+ "step": 4964
+ },
+ {
+ "epoch": 2.347517730496454,
+ "grad_norm": 2.6939785480499268,
+ "learning_rate": 3.373261917368408e-06,
+ "loss": 0.4499,
+ "step": 4965
+ },
+ {
+ "epoch": 2.3479905437352246,
+ "grad_norm": 2.862661600112915,
+ "learning_rate": 3.37267735859154e-06,
+ "loss": 0.415,
+ "step": 4966
+ },
+ {
+ "epoch": 2.3484633569739954,
+ "grad_norm": 2.3657119274139404,
+ "learning_rate": 3.3720927454768793e-06,
+ "loss": 0.4112,
+ "step": 4967
+ },
+ {
+ "epoch": 2.348936170212766,
+ "grad_norm": 3.701571464538574,
+ "learning_rate": 3.3715080780608277e-06,
+ "loss": 0.4735,
+ "step": 4968
+ },
+ {
+ "epoch": 2.3494089834515366,
+ "grad_norm": 2.894350528717041,
+ "learning_rate": 3.3709233563797895e-06,
+ "loss": 0.4278,
+ "step": 4969
+ },
+ {
+ "epoch": 2.3498817966903074,
+ "grad_norm": 3.0072877407073975,
+ "learning_rate": 3.3703385804701727e-06,
+ "loss": 0.4718,
+ "step": 4970
+ },
+ {
+ "epoch": 2.350354609929078,
+ "grad_norm": 2.9920408725738525,
+ "learning_rate": 3.369753750368389e-06,
+ "loss": 0.4636,
+ "step": 4971
+ },
+ {
+ "epoch": 2.3508274231678485,
+ "grad_norm": 2.381770372390747,
+ "learning_rate": 3.369168866110853e-06,
+ "loss": 0.3841,
+ "step": 4972
+ },
+ {
+ "epoch": 2.3513002364066193,
+ "grad_norm": 2.6195342540740967,
+ "learning_rate": 3.3685839277339825e-06,
+ "loss": 0.4422,
+ "step": 4973
+ },
+ {
+ "epoch": 2.35177304964539,
+ "grad_norm": 2.885852575302124,
+ "learning_rate": 3.3679989352741992e-06,
+ "loss": 0.4798,
+ "step": 4974
+ },
+ {
+ "epoch": 2.352245862884161,
+ "grad_norm": 2.820004940032959,
+ "learning_rate": 3.367413888767929e-06,
+ "loss": 0.4498,
+ "step": 4975
+ },
+ {
+ "epoch": 2.3527186761229313,
+ "grad_norm": 2.579680919647217,
+ "learning_rate": 3.366828788251599e-06,
+ "loss": 0.4894,
+ "step": 4976
+ },
+ {
+ "epoch": 2.353191489361702,
+ "grad_norm": 2.7509915828704834,
+ "learning_rate": 3.366243633761642e-06,
+ "loss": 0.4354,
+ "step": 4977
+ },
+ {
+ "epoch": 2.353664302600473,
+ "grad_norm": 3.061767339706421,
+ "learning_rate": 3.3656584253344917e-06,
+ "loss": 0.4651,
+ "step": 4978
+ },
+ {
+ "epoch": 2.3541371158392437,
+ "grad_norm": 2.6109485626220703,
+ "learning_rate": 3.365073163006587e-06,
+ "loss": 0.44,
+ "step": 4979
+ },
+ {
+ "epoch": 2.354609929078014,
+ "grad_norm": 3.4247376918792725,
+ "learning_rate": 3.36448784681437e-06,
+ "loss": 0.3993,
+ "step": 4980
+ },
+ {
+ "epoch": 2.355082742316785,
+ "grad_norm": 2.953695297241211,
+ "learning_rate": 3.363902476794285e-06,
+ "loss": 0.4763,
+ "step": 4981
+ },
+ {
+ "epoch": 2.3555555555555556,
+ "grad_norm": 2.836543083190918,
+ "learning_rate": 3.3633170529827806e-06,
+ "loss": 0.4755,
+ "step": 4982
+ },
+ {
+ "epoch": 2.3560283687943264,
+ "grad_norm": 2.944082021713257,
+ "learning_rate": 3.36273157541631e-06,
+ "loss": 0.472,
+ "step": 4983
+ },
+ {
+ "epoch": 2.3565011820330968,
+ "grad_norm": 2.891716957092285,
+ "learning_rate": 3.3621460441313262e-06,
+ "loss": 0.5259,
+ "step": 4984
+ },
+ {
+ "epoch": 2.3569739952718676,
+ "grad_norm": 2.8448829650878906,
+ "learning_rate": 3.3615604591642896e-06,
+ "loss": 0.4587,
+ "step": 4985
+ },
+ {
+ "epoch": 2.3574468085106384,
+ "grad_norm": 3.114393711090088,
+ "learning_rate": 3.36097482055166e-06,
+ "loss": 0.4352,
+ "step": 4986
+ },
+ {
+ "epoch": 2.357919621749409,
+ "grad_norm": 2.964851140975952,
+ "learning_rate": 3.360389128329904e-06,
+ "loss": 0.5015,
+ "step": 4987
+ },
+ {
+ "epoch": 2.3583924349881795,
+ "grad_norm": 2.4819815158843994,
+ "learning_rate": 3.3598033825354893e-06,
+ "loss": 0.3459,
+ "step": 4988
+ },
+ {
+ "epoch": 2.3588652482269503,
+ "grad_norm": 2.635754346847534,
+ "learning_rate": 3.359217583204889e-06,
+ "loss": 0.4367,
+ "step": 4989
+ },
+ {
+ "epoch": 2.359338061465721,
+ "grad_norm": 2.542482376098633,
+ "learning_rate": 3.358631730374576e-06,
+ "loss": 0.3978,
+ "step": 4990
+ },
+ {
+ "epoch": 2.359810874704492,
+ "grad_norm": 2.614018678665161,
+ "learning_rate": 3.358045824081031e-06,
+ "loss": 0.424,
+ "step": 4991
+ },
+ {
+ "epoch": 2.3602836879432623,
+ "grad_norm": 2.775373697280884,
+ "learning_rate": 3.3574598643607354e-06,
+ "loss": 0.4901,
+ "step": 4992
+ },
+ {
+ "epoch": 2.360756501182033,
+ "grad_norm": 3.091381311416626,
+ "learning_rate": 3.356873851250173e-06,
+ "loss": 0.4954,
+ "step": 4993
+ },
+ {
+ "epoch": 2.361229314420804,
+ "grad_norm": 2.440023422241211,
+ "learning_rate": 3.3562877847858337e-06,
+ "loss": 0.4053,
+ "step": 4994
+ },
+ {
+ "epoch": 2.3617021276595747,
+ "grad_norm": 2.8879518508911133,
+ "learning_rate": 3.3557016650042084e-06,
+ "loss": 0.4766,
+ "step": 4995
+ },
+ {
+ "epoch": 2.362174940898345,
+ "grad_norm": 3.1298391819000244,
+ "learning_rate": 3.355115491941793e-06,
+ "loss": 0.4743,
+ "step": 4996
+ },
+ {
+ "epoch": 2.362647754137116,
+ "grad_norm": 3.3325259685516357,
+ "learning_rate": 3.3545292656350845e-06,
+ "loss": 0.4703,
+ "step": 4997
+ },
+ {
+ "epoch": 2.3631205673758866,
+ "grad_norm": 2.7935359477996826,
+ "learning_rate": 3.353942986120587e-06,
+ "loss": 0.432,
+ "step": 4998
+ },
+ {
+ "epoch": 2.3635933806146574,
+ "grad_norm": 2.623624324798584,
+ "learning_rate": 3.3533566534348033e-06,
+ "loss": 0.4302,
+ "step": 4999
+ },
+ {
+ "epoch": 2.3640661938534278,
+ "grad_norm": 3.1467108726501465,
+ "learning_rate": 3.3527702676142426e-06,
+ "loss": 0.4661,
+ "step": 5000
+ },
+ {
+ "epoch": 2.3645390070921986,
+ "grad_norm": 2.5364840030670166,
+ "learning_rate": 3.352183828695418e-06,
+ "loss": 0.4134,
+ "step": 5001
+ },
+ {
+ "epoch": 2.3650118203309693,
+ "grad_norm": 3.002777338027954,
+ "learning_rate": 3.3515973367148415e-06,
+ "loss": 0.3771,
+ "step": 5002
+ },
+ {
+ "epoch": 2.36548463356974,
+ "grad_norm": 2.660043954849243,
+ "learning_rate": 3.3510107917090335e-06,
+ "loss": 0.4254,
+ "step": 5003
+ },
+ {
+ "epoch": 2.3659574468085105,
+ "grad_norm": 2.7041075229644775,
+ "learning_rate": 3.3504241937145148e-06,
+ "loss": 0.4651,
+ "step": 5004
+ },
+ {
+ "epoch": 2.3664302600472813,
+ "grad_norm": 2.7387280464172363,
+ "learning_rate": 3.349837542767811e-06,
+ "loss": 0.3874,
+ "step": 5005
+ },
+ {
+ "epoch": 2.366903073286052,
+ "grad_norm": 3.012188196182251,
+ "learning_rate": 3.349250838905449e-06,
+ "loss": 0.4508,
+ "step": 5006
+ },
+ {
+ "epoch": 2.3673758865248224,
+ "grad_norm": 2.3108484745025635,
+ "learning_rate": 3.3486640821639616e-06,
+ "loss": 0.3783,
+ "step": 5007
+ },
+ {
+ "epoch": 2.3678486997635932,
+ "grad_norm": 3.2188332080841064,
+ "learning_rate": 3.3480772725798837e-06,
+ "loss": 0.4879,
+ "step": 5008
+ },
+ {
+ "epoch": 2.368321513002364,
+ "grad_norm": 2.566087484359741,
+ "learning_rate": 3.3474904101897526e-06,
+ "loss": 0.3847,
+ "step": 5009
+ },
+ {
+ "epoch": 2.368794326241135,
+ "grad_norm": 2.5581698417663574,
+ "learning_rate": 3.3469034950301092e-06,
+ "loss": 0.4201,
+ "step": 5010
+ },
+ {
+ "epoch": 2.3692671394799056,
+ "grad_norm": 2.900296926498413,
+ "learning_rate": 3.3463165271374992e-06,
+ "loss": 0.4568,
+ "step": 5011
+ },
+ {
+ "epoch": 2.369739952718676,
+ "grad_norm": 2.8239312171936035,
+ "learning_rate": 3.34572950654847e-06,
+ "loss": 0.4583,
+ "step": 5012
+ },
+ {
+ "epoch": 2.370212765957447,
+ "grad_norm": 3.219465970993042,
+ "learning_rate": 3.3451424332995723e-06,
+ "loss": 0.5435,
+ "step": 5013
+ },
+ {
+ "epoch": 2.3706855791962176,
+ "grad_norm": 3.3111915588378906,
+ "learning_rate": 3.344555307427362e-06,
+ "loss": 0.435,
+ "step": 5014
+ },
+ {
+ "epoch": 2.371158392434988,
+ "grad_norm": 3.296668529510498,
+ "learning_rate": 3.3439681289683946e-06,
+ "loss": 0.4738,
+ "step": 5015
+ },
+ {
+ "epoch": 2.3716312056737587,
+ "grad_norm": 3.005722761154175,
+ "learning_rate": 3.343380897959234e-06,
+ "loss": 0.4267,
+ "step": 5016
+ },
+ {
+ "epoch": 2.3721040189125295,
+ "grad_norm": 2.7844085693359375,
+ "learning_rate": 3.3427936144364425e-06,
+ "loss": 0.4558,
+ "step": 5017
+ },
+ {
+ "epoch": 2.3725768321513003,
+ "grad_norm": 2.7532076835632324,
+ "learning_rate": 3.3422062784365884e-06,
+ "loss": 0.4144,
+ "step": 5018
+ },
+ {
+ "epoch": 2.373049645390071,
+ "grad_norm": 2.835764169692993,
+ "learning_rate": 3.3416188899962413e-06,
+ "loss": 0.4945,
+ "step": 5019
+ },
+ {
+ "epoch": 2.3735224586288415,
+ "grad_norm": 3.1513726711273193,
+ "learning_rate": 3.3410314491519767e-06,
+ "loss": 0.4971,
+ "step": 5020
+ },
+ {
+ "epoch": 2.3739952718676123,
+ "grad_norm": 3.0162220001220703,
+ "learning_rate": 3.3404439559403723e-06,
+ "loss": 0.4477,
+ "step": 5021
+ },
+ {
+ "epoch": 2.374468085106383,
+ "grad_norm": 2.676391363143921,
+ "learning_rate": 3.3398564103980073e-06,
+ "loss": 0.432,
+ "step": 5022
+ },
+ {
+ "epoch": 2.3749408983451534,
+ "grad_norm": 2.7806248664855957,
+ "learning_rate": 3.3392688125614663e-06,
+ "loss": 0.4818,
+ "step": 5023
+ },
+ {
+ "epoch": 2.3754137115839242,
+ "grad_norm": 2.968806505203247,
+ "learning_rate": 3.3386811624673373e-06,
+ "loss": 0.4893,
+ "step": 5024
+ },
+ {
+ "epoch": 2.375886524822695,
+ "grad_norm": 2.992684841156006,
+ "learning_rate": 3.3380934601522087e-06,
+ "loss": 0.4423,
+ "step": 5025
+ },
+ {
+ "epoch": 2.376359338061466,
+ "grad_norm": 2.578420639038086,
+ "learning_rate": 3.3375057056526762e-06,
+ "loss": 0.3682,
+ "step": 5026
+ },
+ {
+ "epoch": 2.3768321513002366,
+ "grad_norm": 2.7683115005493164,
+ "learning_rate": 3.336917899005335e-06,
+ "loss": 0.4038,
+ "step": 5027
+ },
+ {
+ "epoch": 2.377304964539007,
+ "grad_norm": 2.838812828063965,
+ "learning_rate": 3.336330040246786e-06,
+ "loss": 0.442,
+ "step": 5028
+ },
+ {
+ "epoch": 2.3777777777777778,
+ "grad_norm": 2.766136646270752,
+ "learning_rate": 3.335742129413633e-06,
+ "loss": 0.4745,
+ "step": 5029
+ },
+ {
+ "epoch": 2.3782505910165486,
+ "grad_norm": 2.862656593322754,
+ "learning_rate": 3.3351541665424812e-06,
+ "loss": 0.4324,
+ "step": 5030
+ },
+ {
+ "epoch": 2.378723404255319,
+ "grad_norm": 2.71425199508667,
+ "learning_rate": 3.3345661516699433e-06,
+ "loss": 0.4013,
+ "step": 5031
+ },
+ {
+ "epoch": 2.3791962174940897,
+ "grad_norm": 2.8404030799865723,
+ "learning_rate": 3.333978084832629e-06,
+ "loss": 0.5038,
+ "step": 5032
+ },
+ {
+ "epoch": 2.3796690307328605,
+ "grad_norm": 2.965851068496704,
+ "learning_rate": 3.3333899660671574e-06,
+ "loss": 0.4668,
+ "step": 5033
+ },
+ {
+ "epoch": 2.3801418439716313,
+ "grad_norm": 2.686452627182007,
+ "learning_rate": 3.3328017954101464e-06,
+ "loss": 0.4167,
+ "step": 5034
+ },
+ {
+ "epoch": 2.380614657210402,
+ "grad_norm": 2.8676156997680664,
+ "learning_rate": 3.3322135728982197e-06,
+ "loss": 0.4531,
+ "step": 5035
+ },
+ {
+ "epoch": 2.3810874704491725,
+ "grad_norm": 2.4456300735473633,
+ "learning_rate": 3.3316252985680026e-06,
+ "loss": 0.4173,
+ "step": 5036
+ },
+ {
+ "epoch": 2.3815602836879433,
+ "grad_norm": 2.5472559928894043,
+ "learning_rate": 3.331036972456124e-06,
+ "loss": 0.3926,
+ "step": 5037
+ },
+ {
+ "epoch": 2.382033096926714,
+ "grad_norm": 2.81900954246521,
+ "learning_rate": 3.330448594599218e-06,
+ "loss": 0.4785,
+ "step": 5038
+ },
+ {
+ "epoch": 2.3825059101654844,
+ "grad_norm": 3.0930590629577637,
+ "learning_rate": 3.329860165033919e-06,
+ "loss": 0.4587,
+ "step": 5039
+ },
+ {
+ "epoch": 2.382978723404255,
+ "grad_norm": 3.0553040504455566,
+ "learning_rate": 3.3292716837968673e-06,
+ "loss": 0.5285,
+ "step": 5040
+ },
+ {
+ "epoch": 2.383451536643026,
+ "grad_norm": 2.577580690383911,
+ "learning_rate": 3.328683150924704e-06,
+ "loss": 0.4184,
+ "step": 5041
+ },
+ {
+ "epoch": 2.383924349881797,
+ "grad_norm": 2.6430366039276123,
+ "learning_rate": 3.3280945664540735e-06,
+ "loss": 0.4636,
+ "step": 5042
+ },
+ {
+ "epoch": 2.3843971631205676,
+ "grad_norm": 3.228360891342163,
+ "learning_rate": 3.3275059304216255e-06,
+ "loss": 0.455,
+ "step": 5043
+ },
+ {
+ "epoch": 2.384869976359338,
+ "grad_norm": 2.776142120361328,
+ "learning_rate": 3.3269172428640125e-06,
+ "loss": 0.4785,
+ "step": 5044
+ },
+ {
+ "epoch": 2.3853427895981087,
+ "grad_norm": 2.755671739578247,
+ "learning_rate": 3.3263285038178882e-06,
+ "loss": 0.4625,
+ "step": 5045
+ },
+ {
+ "epoch": 2.3858156028368795,
+ "grad_norm": 3.061004400253296,
+ "learning_rate": 3.3257397133199114e-06,
+ "loss": 0.4641,
+ "step": 5046
+ },
+ {
+ "epoch": 2.38628841607565,
+ "grad_norm": 2.8391458988189697,
+ "learning_rate": 3.3251508714067432e-06,
+ "loss": 0.5003,
+ "step": 5047
+ },
+ {
+ "epoch": 2.3867612293144207,
+ "grad_norm": 2.390810966491699,
+ "learning_rate": 3.324561978115049e-06,
+ "loss": 0.4446,
+ "step": 5048
+ },
+ {
+ "epoch": 2.3872340425531915,
+ "grad_norm": 2.7760825157165527,
+ "learning_rate": 3.323973033481496e-06,
+ "loss": 0.4443,
+ "step": 5049
+ },
+ {
+ "epoch": 2.3877068557919623,
+ "grad_norm": 3.157893419265747,
+ "learning_rate": 3.3233840375427552e-06,
+ "loss": 0.4934,
+ "step": 5050
+ },
+ {
+ "epoch": 2.388179669030733,
+ "grad_norm": 2.7245349884033203,
+ "learning_rate": 3.3227949903355e-06,
+ "loss": 0.4254,
+ "step": 5051
+ },
+ {
+ "epoch": 2.3886524822695034,
+ "grad_norm": 2.6674044132232666,
+ "learning_rate": 3.322205891896409e-06,
+ "loss": 0.4116,
+ "step": 5052
+ },
+ {
+ "epoch": 2.3891252955082742,
+ "grad_norm": 3.1490554809570312,
+ "learning_rate": 3.3216167422621627e-06,
+ "loss": 0.4604,
+ "step": 5053
+ },
+ {
+ "epoch": 2.389598108747045,
+ "grad_norm": 2.725731134414673,
+ "learning_rate": 3.321027541469444e-06,
+ "loss": 0.4836,
+ "step": 5054
+ },
+ {
+ "epoch": 2.3900709219858154,
+ "grad_norm": 2.5378828048706055,
+ "learning_rate": 3.3204382895549407e-06,
+ "loss": 0.4228,
+ "step": 5055
+ },
+ {
+ "epoch": 2.390543735224586,
+ "grad_norm": 2.8191192150115967,
+ "learning_rate": 3.3198489865553427e-06,
+ "loss": 0.4371,
+ "step": 5056
+ },
+ {
+ "epoch": 2.391016548463357,
+ "grad_norm": 2.5676498413085938,
+ "learning_rate": 3.3192596325073433e-06,
+ "loss": 0.4463,
+ "step": 5057
+ },
+ {
+ "epoch": 2.391489361702128,
+ "grad_norm": 3.0846121311187744,
+ "learning_rate": 3.3186702274476397e-06,
+ "loss": 0.5049,
+ "step": 5058
+ },
+ {
+ "epoch": 2.3919621749408986,
+ "grad_norm": 2.6085152626037598,
+ "learning_rate": 3.3180807714129293e-06,
+ "loss": 0.4376,
+ "step": 5059
+ },
+ {
+ "epoch": 2.392434988179669,
+ "grad_norm": 3.0218591690063477,
+ "learning_rate": 3.3174912644399172e-06,
+ "loss": 0.4734,
+ "step": 5060
+ },
+ {
+ "epoch": 2.3929078014184397,
+ "grad_norm": 2.5904781818389893,
+ "learning_rate": 3.316901706565308e-06,
+ "loss": 0.4924,
+ "step": 5061
+ },
+ {
+ "epoch": 2.3933806146572105,
+ "grad_norm": 2.675478458404541,
+ "learning_rate": 3.3163120978258123e-06,
+ "loss": 0.4072,
+ "step": 5062
+ },
+ {
+ "epoch": 2.393853427895981,
+ "grad_norm": 2.7944445610046387,
+ "learning_rate": 3.3157224382581415e-06,
+ "loss": 0.4328,
+ "step": 5063
+ },
+ {
+ "epoch": 2.3943262411347517,
+ "grad_norm": 2.846224546432495,
+ "learning_rate": 3.315132727899012e-06,
+ "loss": 0.4447,
+ "step": 5064
+ },
+ {
+ "epoch": 2.3947990543735225,
+ "grad_norm": 2.6825828552246094,
+ "learning_rate": 3.3145429667851402e-06,
+ "loss": 0.4528,
+ "step": 5065
+ },
+ {
+ "epoch": 2.3952718676122933,
+ "grad_norm": 3.0305285453796387,
+ "learning_rate": 3.3139531549532505e-06,
+ "loss": 0.4538,
+ "step": 5066
+ },
+ {
+ "epoch": 2.395744680851064,
+ "grad_norm": 2.707540988922119,
+ "learning_rate": 3.313363292440067e-06,
+ "loss": 0.4412,
+ "step": 5067
+ },
+ {
+ "epoch": 2.3962174940898344,
+ "grad_norm": 3.0458385944366455,
+ "learning_rate": 3.3127733792823173e-06,
+ "loss": 0.4587,
+ "step": 5068
+ },
+ {
+ "epoch": 2.396690307328605,
+ "grad_norm": 2.7711992263793945,
+ "learning_rate": 3.312183415516733e-06,
+ "loss": 0.4157,
+ "step": 5069
+ },
+ {
+ "epoch": 2.397163120567376,
+ "grad_norm": 2.6953988075256348,
+ "learning_rate": 3.3115934011800494e-06,
+ "loss": 0.3828,
+ "step": 5070
+ },
+ {
+ "epoch": 2.3976359338061464,
+ "grad_norm": 3.033721923828125,
+ "learning_rate": 3.311003336309003e-06,
+ "loss": 0.5204,
+ "step": 5071
+ },
+ {
+ "epoch": 2.398108747044917,
+ "grad_norm": 2.6134517192840576,
+ "learning_rate": 3.3104132209403355e-06,
+ "loss": 0.4181,
+ "step": 5072
+ },
+ {
+ "epoch": 2.398581560283688,
+ "grad_norm": 2.8800251483917236,
+ "learning_rate": 3.30982305511079e-06,
+ "loss": 0.466,
+ "step": 5073
+ },
+ {
+ "epoch": 2.3990543735224588,
+ "grad_norm": 2.5043210983276367,
+ "learning_rate": 3.309232838857114e-06,
+ "loss": 0.4161,
+ "step": 5074
+ },
+ {
+ "epoch": 2.3995271867612296,
+ "grad_norm": 2.6577322483062744,
+ "learning_rate": 3.308642572216057e-06,
+ "loss": 0.465,
+ "step": 5075
+ },
+ {
+ "epoch": 2.4,
+ "grad_norm": 2.549098253250122,
+ "learning_rate": 3.3080522552243734e-06,
+ "loss": 0.4571,
+ "step": 5076
+ },
+ {
+ "epoch": 2.4004728132387707,
+ "grad_norm": 2.881958246231079,
+ "learning_rate": 3.3074618879188186e-06,
+ "loss": 0.4443,
+ "step": 5077
+ },
+ {
+ "epoch": 2.4009456264775415,
+ "grad_norm": 2.608397960662842,
+ "learning_rate": 3.3068714703361528e-06,
+ "loss": 0.3843,
+ "step": 5078
+ },
+ {
+ "epoch": 2.401418439716312,
+ "grad_norm": 2.8666789531707764,
+ "learning_rate": 3.306281002513139e-06,
+ "loss": 0.4857,
+ "step": 5079
+ },
+ {
+ "epoch": 2.4018912529550827,
+ "grad_norm": 2.9008588790893555,
+ "learning_rate": 3.3056904844865422e-06,
+ "loss": 0.4454,
+ "step": 5080
+ },
+ {
+ "epoch": 2.4023640661938535,
+ "grad_norm": 2.7446060180664062,
+ "learning_rate": 3.3050999162931315e-06,
+ "loss": 0.4522,
+ "step": 5081
+ },
+ {
+ "epoch": 2.4028368794326243,
+ "grad_norm": 2.787116765975952,
+ "learning_rate": 3.3045092979696804e-06,
+ "loss": 0.4714,
+ "step": 5082
+ },
+ {
+ "epoch": 2.403309692671395,
+ "grad_norm": 2.7494192123413086,
+ "learning_rate": 3.3039186295529613e-06,
+ "loss": 0.4107,
+ "step": 5083
+ },
+ {
+ "epoch": 2.4037825059101654,
+ "grad_norm": 2.733794927597046,
+ "learning_rate": 3.303327911079755e-06,
+ "loss": 0.4169,
+ "step": 5084
+ },
+ {
+ "epoch": 2.404255319148936,
+ "grad_norm": 2.7313334941864014,
+ "learning_rate": 3.3027371425868422e-06,
+ "loss": 0.4287,
+ "step": 5085
+ },
+ {
+ "epoch": 2.404728132387707,
+ "grad_norm": 2.7832977771759033,
+ "learning_rate": 3.3021463241110075e-06,
+ "loss": 0.5307,
+ "step": 5086
+ },
+ {
+ "epoch": 2.4052009456264773,
+ "grad_norm": 2.6615281105041504,
+ "learning_rate": 3.301555455689038e-06,
+ "loss": 0.4519,
+ "step": 5087
+ },
+ {
+ "epoch": 2.405673758865248,
+ "grad_norm": 2.343921422958374,
+ "learning_rate": 3.3009645373577264e-06,
+ "loss": 0.46,
+ "step": 5088
+ },
+ {
+ "epoch": 2.406146572104019,
+ "grad_norm": 2.6115355491638184,
+ "learning_rate": 3.300373569153864e-06,
+ "loss": 0.4782,
+ "step": 5089
+ },
+ {
+ "epoch": 2.4066193853427897,
+ "grad_norm": 2.730625629425049,
+ "learning_rate": 3.299782551114249e-06,
+ "loss": 0.4632,
+ "step": 5090
+ },
+ {
+ "epoch": 2.40709219858156,
+ "grad_norm": 2.4495043754577637,
+ "learning_rate": 3.2991914832756824e-06,
+ "loss": 0.4243,
+ "step": 5091
+ },
+ {
+ "epoch": 2.407565011820331,
+ "grad_norm": 2.8731648921966553,
+ "learning_rate": 3.2986003656749654e-06,
+ "loss": 0.4262,
+ "step": 5092
+ },
+ {
+ "epoch": 2.4080378250591017,
+ "grad_norm": 2.870342969894409,
+ "learning_rate": 3.2980091983489053e-06,
+ "loss": 0.4735,
+ "step": 5093
+ },
+ {
+ "epoch": 2.4085106382978725,
+ "grad_norm": 2.500786542892456,
+ "learning_rate": 3.297417981334312e-06,
+ "loss": 0.4007,
+ "step": 5094
+ },
+ {
+ "epoch": 2.408983451536643,
+ "grad_norm": 2.7787322998046875,
+ "learning_rate": 3.2968267146679978e-06,
+ "loss": 0.493,
+ "step": 5095
+ },
+ {
+ "epoch": 2.4094562647754136,
+ "grad_norm": 2.5229599475860596,
+ "learning_rate": 3.2962353983867783e-06,
+ "loss": 0.3676,
+ "step": 5096
+ },
+ {
+ "epoch": 2.4099290780141844,
+ "grad_norm": 3.1955904960632324,
+ "learning_rate": 3.2956440325274715e-06,
+ "loss": 0.4888,
+ "step": 5097
+ },
+ {
+ "epoch": 2.4104018912529552,
+ "grad_norm": 2.8580288887023926,
+ "learning_rate": 3.2950526171268995e-06,
+ "loss": 0.4892,
+ "step": 5098
+ },
+ {
+ "epoch": 2.4108747044917256,
+ "grad_norm": 2.6321749687194824,
+ "learning_rate": 3.294461152221887e-06,
+ "loss": 0.3823,
+ "step": 5099
+ },
+ {
+ "epoch": 2.4113475177304964,
+ "grad_norm": 2.881127119064331,
+ "learning_rate": 3.293869637849263e-06,
+ "loss": 0.4569,
+ "step": 5100
+ },
+ {
+ "epoch": 2.411820330969267,
+ "grad_norm": 2.7742316722869873,
+ "learning_rate": 3.293278074045857e-06,
+ "loss": 0.4445,
+ "step": 5101
+ },
+ {
+ "epoch": 2.412293144208038,
+ "grad_norm": 2.546701431274414,
+ "learning_rate": 3.2926864608485037e-06,
+ "loss": 0.3995,
+ "step": 5102
+ },
+ {
+ "epoch": 2.4127659574468083,
+ "grad_norm": 2.588226318359375,
+ "learning_rate": 3.292094798294041e-06,
+ "loss": 0.4081,
+ "step": 5103
+ },
+ {
+ "epoch": 2.413238770685579,
+ "grad_norm": 2.968689441680908,
+ "learning_rate": 3.2915030864193077e-06,
+ "loss": 0.4475,
+ "step": 5104
+ },
+ {
+ "epoch": 2.41371158392435,
+ "grad_norm": 2.9249184131622314,
+ "learning_rate": 3.290911325261148e-06,
+ "loss": 0.4763,
+ "step": 5105
+ },
+ {
+ "epoch": 2.4141843971631207,
+ "grad_norm": 2.817596673965454,
+ "learning_rate": 3.2903195148564083e-06,
+ "loss": 0.4451,
+ "step": 5106
+ },
+ {
+ "epoch": 2.414657210401891,
+ "grad_norm": 2.6465954780578613,
+ "learning_rate": 3.2897276552419377e-06,
+ "loss": 0.4665,
+ "step": 5107
+ },
+ {
+ "epoch": 2.415130023640662,
+ "grad_norm": 2.8613853454589844,
+ "learning_rate": 3.2891357464545885e-06,
+ "loss": 0.4398,
+ "step": 5108
+ },
+ {
+ "epoch": 2.4156028368794327,
+ "grad_norm": 2.756321907043457,
+ "learning_rate": 3.2885437885312175e-06,
+ "loss": 0.4634,
+ "step": 5109
+ },
+ {
+ "epoch": 2.4160756501182035,
+ "grad_norm": 2.8965282440185547,
+ "learning_rate": 3.287951781508682e-06,
+ "loss": 0.4319,
+ "step": 5110
+ },
+ {
+ "epoch": 2.416548463356974,
+ "grad_norm": 2.896756172180176,
+ "learning_rate": 3.287359725423844e-06,
+ "loss": 0.4771,
+ "step": 5111
+ },
+ {
+ "epoch": 2.4170212765957446,
+ "grad_norm": 2.952911376953125,
+ "learning_rate": 3.286767620313569e-06,
+ "loss": 0.5026,
+ "step": 5112
+ },
+ {
+ "epoch": 2.4174940898345154,
+ "grad_norm": 3.850515604019165,
+ "learning_rate": 3.2861754662147234e-06,
+ "loss": 0.4387,
+ "step": 5113
+ },
+ {
+ "epoch": 2.417966903073286,
+ "grad_norm": 3.0072689056396484,
+ "learning_rate": 3.2855832631641794e-06,
+ "loss": 0.4586,
+ "step": 5114
+ },
+ {
+ "epoch": 2.4184397163120566,
+ "grad_norm": 3.166790246963501,
+ "learning_rate": 3.2849910111988092e-06,
+ "loss": 0.4842,
+ "step": 5115
+ },
+ {
+ "epoch": 2.4189125295508274,
+ "grad_norm": 3.5397679805755615,
+ "learning_rate": 3.284398710355492e-06,
+ "loss": 0.5138,
+ "step": 5116
+ },
+ {
+ "epoch": 2.419385342789598,
+ "grad_norm": 2.779609441757202,
+ "learning_rate": 3.283806360671106e-06,
+ "loss": 0.4049,
+ "step": 5117
+ },
+ {
+ "epoch": 2.419858156028369,
+ "grad_norm": 2.5924575328826904,
+ "learning_rate": 3.283213962182535e-06,
+ "loss": 0.433,
+ "step": 5118
+ },
+ {
+ "epoch": 2.4203309692671393,
+ "grad_norm": 2.7429699897766113,
+ "learning_rate": 3.282621514926665e-06,
+ "loss": 0.4674,
+ "step": 5119
+ },
+ {
+ "epoch": 2.42080378250591,
+ "grad_norm": 2.8113889694213867,
+ "learning_rate": 3.2820290189403846e-06,
+ "loss": 0.3898,
+ "step": 5120
+ },
+ {
+ "epoch": 2.421276595744681,
+ "grad_norm": 2.867105722427368,
+ "learning_rate": 3.2814364742605863e-06,
+ "loss": 0.4439,
+ "step": 5121
+ },
+ {
+ "epoch": 2.4217494089834517,
+ "grad_norm": 2.428597927093506,
+ "learning_rate": 3.2808438809241654e-06,
+ "loss": 0.4339,
+ "step": 5122
+ },
+ {
+ "epoch": 2.422222222222222,
+ "grad_norm": 3.071735143661499,
+ "learning_rate": 3.2802512389680203e-06,
+ "loss": 0.4583,
+ "step": 5123
+ },
+ {
+ "epoch": 2.422695035460993,
+ "grad_norm": 3.046313762664795,
+ "learning_rate": 3.279658548429051e-06,
+ "loss": 0.5351,
+ "step": 5124
+ },
+ {
+ "epoch": 2.4231678486997636,
+ "grad_norm": 2.8412697315216064,
+ "learning_rate": 3.279065809344163e-06,
+ "loss": 0.5258,
+ "step": 5125
+ },
+ {
+ "epoch": 2.4236406619385344,
+ "grad_norm": 2.887169122695923,
+ "learning_rate": 3.278473021750263e-06,
+ "loss": 0.4568,
+ "step": 5126
+ },
+ {
+ "epoch": 2.424113475177305,
+ "grad_norm": 2.8316574096679688,
+ "learning_rate": 3.2778801856842624e-06,
+ "loss": 0.46,
+ "step": 5127
+ },
+ {
+ "epoch": 2.4245862884160756,
+ "grad_norm": 2.7660772800445557,
+ "learning_rate": 3.277287301183073e-06,
+ "loss": 0.4323,
+ "step": 5128
+ },
+ {
+ "epoch": 2.4250591016548464,
+ "grad_norm": 2.737682819366455,
+ "learning_rate": 3.276694368283611e-06,
+ "loss": 0.4296,
+ "step": 5129
+ },
+ {
+ "epoch": 2.425531914893617,
+ "grad_norm": 2.8807425498962402,
+ "learning_rate": 3.276101387022797e-06,
+ "loss": 0.4673,
+ "step": 5130
+ },
+ {
+ "epoch": 2.4260047281323875,
+ "grad_norm": 2.530526876449585,
+ "learning_rate": 3.275508357437552e-06,
+ "loss": 0.416,
+ "step": 5131
+ },
+ {
+ "epoch": 2.4264775413711583,
+ "grad_norm": 3.1189746856689453,
+ "learning_rate": 3.274915279564803e-06,
+ "loss": 0.4171,
+ "step": 5132
+ },
+ {
+ "epoch": 2.426950354609929,
+ "grad_norm": 2.6612462997436523,
+ "learning_rate": 3.274322153441477e-06,
+ "loss": 0.4104,
+ "step": 5133
+ },
+ {
+ "epoch": 2.4274231678487,
+ "grad_norm": 2.717973470687866,
+ "learning_rate": 3.2737289791045064e-06,
+ "loss": 0.479,
+ "step": 5134
+ },
+ {
+ "epoch": 2.4278959810874703,
+ "grad_norm": 2.764216661453247,
+ "learning_rate": 3.2731357565908247e-06,
+ "loss": 0.481,
+ "step": 5135
+ },
+ {
+ "epoch": 2.428368794326241,
+ "grad_norm": 2.5081393718719482,
+ "learning_rate": 3.272542485937369e-06,
+ "loss": 0.4592,
+ "step": 5136
+ },
+ {
+ "epoch": 2.428841607565012,
+ "grad_norm": 3.1380364894866943,
+ "learning_rate": 3.271949167181081e-06,
+ "loss": 0.4179,
+ "step": 5137
+ },
+ {
+ "epoch": 2.4293144208037827,
+ "grad_norm": 2.9275963306427,
+ "learning_rate": 3.2713558003589026e-06,
+ "loss": 0.5196,
+ "step": 5138
+ },
+ {
+ "epoch": 2.429787234042553,
+ "grad_norm": 2.8215506076812744,
+ "learning_rate": 3.270762385507781e-06,
+ "loss": 0.4081,
+ "step": 5139
+ },
+ {
+ "epoch": 2.430260047281324,
+ "grad_norm": 2.9185614585876465,
+ "learning_rate": 3.270168922664665e-06,
+ "loss": 0.4936,
+ "step": 5140
+ },
+ {
+ "epoch": 2.4307328605200946,
+ "grad_norm": 2.6507248878479004,
+ "learning_rate": 3.269575411866507e-06,
+ "loss": 0.4834,
+ "step": 5141
+ },
+ {
+ "epoch": 2.4312056737588654,
+ "grad_norm": 2.864741563796997,
+ "learning_rate": 3.2689818531502637e-06,
+ "loss": 0.4562,
+ "step": 5142
+ },
+ {
+ "epoch": 2.431678486997636,
+ "grad_norm": 2.806919813156128,
+ "learning_rate": 3.2683882465528917e-06,
+ "loss": 0.4645,
+ "step": 5143
+ },
+ {
+ "epoch": 2.4321513002364066,
+ "grad_norm": 2.733372211456299,
+ "learning_rate": 3.267794592111353e-06,
+ "loss": 0.4123,
+ "step": 5144
+ },
+ {
+ "epoch": 2.4326241134751774,
+ "grad_norm": 2.8005833625793457,
+ "learning_rate": 3.2672008898626116e-06,
+ "loss": 0.4343,
+ "step": 5145
+ },
+ {
+ "epoch": 2.433096926713948,
+ "grad_norm": 3.2339670658111572,
+ "learning_rate": 3.2666071398436354e-06,
+ "loss": 0.4017,
+ "step": 5146
+ },
+ {
+ "epoch": 2.4335697399527185,
+ "grad_norm": 2.510251760482788,
+ "learning_rate": 3.2660133420913932e-06,
+ "loss": 0.3882,
+ "step": 5147
+ },
+ {
+ "epoch": 2.4340425531914893,
+ "grad_norm": 3.5633628368377686,
+ "learning_rate": 3.26541949664286e-06,
+ "loss": 0.4766,
+ "step": 5148
+ },
+ {
+ "epoch": 2.43451536643026,
+ "grad_norm": 2.8246724605560303,
+ "learning_rate": 3.26482560353501e-06,
+ "loss": 0.3728,
+ "step": 5149
+ },
+ {
+ "epoch": 2.434988179669031,
+ "grad_norm": 2.4923641681671143,
+ "learning_rate": 3.264231662804823e-06,
+ "loss": 0.4346,
+ "step": 5150
+ },
+ {
+ "epoch": 2.4354609929078013,
+ "grad_norm": 3.180874824523926,
+ "learning_rate": 3.2636376744892827e-06,
+ "loss": 0.4351,
+ "step": 5151
+ },
+ {
+ "epoch": 2.435933806146572,
+ "grad_norm": 2.6933515071868896,
+ "learning_rate": 3.263043638625373e-06,
+ "loss": 0.4293,
+ "step": 5152
+ },
+ {
+ "epoch": 2.436406619385343,
+ "grad_norm": 2.584132194519043,
+ "learning_rate": 3.262449555250081e-06,
+ "loss": 0.4589,
+ "step": 5153
+ },
+ {
+ "epoch": 2.4368794326241137,
+ "grad_norm": 2.8103036880493164,
+ "learning_rate": 3.2618554244003985e-06,
+ "loss": 0.463,
+ "step": 5154
+ },
+ {
+ "epoch": 2.437352245862884,
+ "grad_norm": 2.809070587158203,
+ "learning_rate": 3.2612612461133197e-06,
+ "loss": 0.4629,
+ "step": 5155
+ },
+ {
+ "epoch": 2.437825059101655,
+ "grad_norm": 2.98148512840271,
+ "learning_rate": 3.2606670204258405e-06,
+ "loss": 0.451,
+ "step": 5156
+ },
+ {
+ "epoch": 2.4382978723404256,
+ "grad_norm": 2.691047191619873,
+ "learning_rate": 3.2600727473749614e-06,
+ "loss": 0.3878,
+ "step": 5157
+ },
+ {
+ "epoch": 2.4387706855791964,
+ "grad_norm": 2.900360345840454,
+ "learning_rate": 3.2594784269976856e-06,
+ "loss": 0.4216,
+ "step": 5158
+ },
+ {
+ "epoch": 2.4392434988179668,
+ "grad_norm": 2.8449952602386475,
+ "learning_rate": 3.258884059331019e-06,
+ "loss": 0.4268,
+ "step": 5159
+ },
+ {
+ "epoch": 2.4397163120567376,
+ "grad_norm": 2.7226388454437256,
+ "learning_rate": 3.258289644411969e-06,
+ "loss": 0.4381,
+ "step": 5160
+ },
+ {
+ "epoch": 2.4401891252955084,
+ "grad_norm": 2.513946056365967,
+ "learning_rate": 3.257695182277547e-06,
+ "loss": 0.4566,
+ "step": 5161
+ },
+ {
+ "epoch": 2.440661938534279,
+ "grad_norm": 2.9941394329071045,
+ "learning_rate": 3.2571006729647693e-06,
+ "loss": 0.4395,
+ "step": 5162
+ },
+ {
+ "epoch": 2.4411347517730495,
+ "grad_norm": 2.699094533920288,
+ "learning_rate": 3.2565061165106523e-06,
+ "loss": 0.4274,
+ "step": 5163
+ },
+ {
+ "epoch": 2.4416075650118203,
+ "grad_norm": 2.574193000793457,
+ "learning_rate": 3.255911512952216e-06,
+ "loss": 0.4187,
+ "step": 5164
+ },
+ {
+ "epoch": 2.442080378250591,
+ "grad_norm": 2.920766592025757,
+ "learning_rate": 3.2553168623264854e-06,
+ "loss": 0.4911,
+ "step": 5165
+ },
+ {
+ "epoch": 2.4425531914893615,
+ "grad_norm": 2.728421926498413,
+ "learning_rate": 3.2547221646704853e-06,
+ "loss": 0.4466,
+ "step": 5166
+ },
+ {
+ "epoch": 2.4430260047281322,
+ "grad_norm": 2.8171417713165283,
+ "learning_rate": 3.254127420021246e-06,
+ "loss": 0.4331,
+ "step": 5167
+ },
+ {
+ "epoch": 2.443498817966903,
+ "grad_norm": 2.4069135189056396,
+ "learning_rate": 3.2535326284157975e-06,
+ "loss": 0.389,
+ "step": 5168
+ },
+ {
+ "epoch": 2.443971631205674,
+ "grad_norm": 2.912405490875244,
+ "learning_rate": 3.2529377898911777e-06,
+ "loss": 0.4681,
+ "step": 5169
+ },
+ {
+ "epoch": 2.4444444444444446,
+ "grad_norm": 2.987558126449585,
+ "learning_rate": 3.2523429044844228e-06,
+ "loss": 0.4715,
+ "step": 5170
+ },
+ {
+ "epoch": 2.444917257683215,
+ "grad_norm": 2.5117199420928955,
+ "learning_rate": 3.251747972232574e-06,
+ "loss": 0.4531,
+ "step": 5171
+ },
+ {
+ "epoch": 2.445390070921986,
+ "grad_norm": 2.5405385494232178,
+ "learning_rate": 3.2511529931726752e-06,
+ "loss": 0.4323,
+ "step": 5172
+ },
+ {
+ "epoch": 2.4458628841607566,
+ "grad_norm": 2.989932060241699,
+ "learning_rate": 3.250557967341773e-06,
+ "loss": 0.4039,
+ "step": 5173
+ },
+ {
+ "epoch": 2.446335697399527,
+ "grad_norm": 2.6331627368927,
+ "learning_rate": 3.2499628947769186e-06,
+ "loss": 0.5147,
+ "step": 5174
+ },
+ {
+ "epoch": 2.4468085106382977,
+ "grad_norm": 2.71699857711792,
+ "learning_rate": 3.249367775515162e-06,
+ "loss": 0.3748,
+ "step": 5175
+ },
+ {
+ "epoch": 2.4472813238770685,
+ "grad_norm": 2.9508471488952637,
+ "learning_rate": 3.2487726095935606e-06,
+ "loss": 0.5145,
+ "step": 5176
+ },
+ {
+ "epoch": 2.4477541371158393,
+ "grad_norm": 2.8276431560516357,
+ "learning_rate": 3.2481773970491713e-06,
+ "loss": 0.4295,
+ "step": 5177
+ },
+ {
+ "epoch": 2.44822695035461,
+ "grad_norm": 2.5500540733337402,
+ "learning_rate": 3.2475821379190565e-06,
+ "loss": 0.4246,
+ "step": 5178
+ },
+ {
+ "epoch": 2.4486997635933805,
+ "grad_norm": 2.845641613006592,
+ "learning_rate": 3.246986832240281e-06,
+ "loss": 0.4211,
+ "step": 5179
+ },
+ {
+ "epoch": 2.4491725768321513,
+ "grad_norm": 3.1215856075286865,
+ "learning_rate": 3.2463914800499097e-06,
+ "loss": 0.4378,
+ "step": 5180
+ },
+ {
+ "epoch": 2.449645390070922,
+ "grad_norm": 2.4685606956481934,
+ "learning_rate": 3.2457960813850137e-06,
+ "loss": 0.4836,
+ "step": 5181
+ },
+ {
+ "epoch": 2.4501182033096924,
+ "grad_norm": 2.508028268814087,
+ "learning_rate": 3.245200636282666e-06,
+ "loss": 0.4377,
+ "step": 5182
+ },
+ {
+ "epoch": 2.4505910165484632,
+ "grad_norm": 2.899949312210083,
+ "learning_rate": 3.244605144779943e-06,
+ "loss": 0.501,
+ "step": 5183
+ },
+ {
+ "epoch": 2.451063829787234,
+ "grad_norm": 2.6494483947753906,
+ "learning_rate": 3.244009606913923e-06,
+ "loss": 0.4255,
+ "step": 5184
+ },
+ {
+ "epoch": 2.451536643026005,
+ "grad_norm": 2.4363760948181152,
+ "learning_rate": 3.243414022721686e-06,
+ "loss": 0.4402,
+ "step": 5185
+ },
+ {
+ "epoch": 2.4520094562647756,
+ "grad_norm": 2.4725022315979004,
+ "learning_rate": 3.242818392240317e-06,
+ "loss": 0.4388,
+ "step": 5186
+ },
+ {
+ "epoch": 2.452482269503546,
+ "grad_norm": 2.7010514736175537,
+ "learning_rate": 3.242222715506905e-06,
+ "loss": 0.4388,
+ "step": 5187
+ },
+ {
+ "epoch": 2.4529550827423168,
+ "grad_norm": 2.811464548110962,
+ "learning_rate": 3.241626992558539e-06,
+ "loss": 0.4634,
+ "step": 5188
+ },
+ {
+ "epoch": 2.4534278959810876,
+ "grad_norm": 2.6473052501678467,
+ "learning_rate": 3.2410312234323123e-06,
+ "loss": 0.4752,
+ "step": 5189
+ },
+ {
+ "epoch": 2.453900709219858,
+ "grad_norm": 2.5587213039398193,
+ "learning_rate": 3.24043540816532e-06,
+ "loss": 0.4458,
+ "step": 5190
+ },
+ {
+ "epoch": 2.4543735224586287,
+ "grad_norm": 2.6306557655334473,
+ "learning_rate": 3.239839546794662e-06,
+ "loss": 0.4081,
+ "step": 5191
+ },
+ {
+ "epoch": 2.4548463356973995,
+ "grad_norm": 2.4613633155822754,
+ "learning_rate": 3.23924363935744e-06,
+ "loss": 0.4165,
+ "step": 5192
+ },
+ {
+ "epoch": 2.4553191489361703,
+ "grad_norm": 2.7189204692840576,
+ "learning_rate": 3.238647685890757e-06,
+ "loss": 0.4822,
+ "step": 5193
+ },
+ {
+ "epoch": 2.455791962174941,
+ "grad_norm": 3.015977382659912,
+ "learning_rate": 3.238051686431722e-06,
+ "loss": 0.4964,
+ "step": 5194
+ },
+ {
+ "epoch": 2.4562647754137115,
+ "grad_norm": 2.8868937492370605,
+ "learning_rate": 3.2374556410174445e-06,
+ "loss": 0.4514,
+ "step": 5195
+ },
+ {
+ "epoch": 2.4567375886524823,
+ "grad_norm": 2.7959537506103516,
+ "learning_rate": 3.2368595496850375e-06,
+ "loss": 0.475,
+ "step": 5196
+ },
+ {
+ "epoch": 2.457210401891253,
+ "grad_norm": 3.0086777210235596,
+ "learning_rate": 3.2362634124716187e-06,
+ "loss": 0.4913,
+ "step": 5197
+ },
+ {
+ "epoch": 2.4576832151300234,
+ "grad_norm": 2.621335506439209,
+ "learning_rate": 3.2356672294143044e-06,
+ "loss": 0.4259,
+ "step": 5198
+ },
+ {
+ "epoch": 2.458156028368794,
+ "grad_norm": 3.1620380878448486,
+ "learning_rate": 3.235071000550218e-06,
+ "loss": 0.451,
+ "step": 5199
+ },
+ {
+ "epoch": 2.458628841607565,
+ "grad_norm": 2.7663278579711914,
+ "learning_rate": 3.234474725916484e-06,
+ "loss": 0.3854,
+ "step": 5200
+ },
+ {
+ "epoch": 2.459101654846336,
+ "grad_norm": 2.5187132358551025,
+ "learning_rate": 3.2338784055502288e-06,
+ "loss": 0.4068,
+ "step": 5201
+ },
+ {
+ "epoch": 2.4595744680851066,
+ "grad_norm": 2.6022701263427734,
+ "learning_rate": 3.233282039488583e-06,
+ "loss": 0.4484,
+ "step": 5202
+ },
+ {
+ "epoch": 2.460047281323877,
+ "grad_norm": 2.874750852584839,
+ "learning_rate": 3.2326856277686807e-06,
+ "loss": 0.45,
+ "step": 5203
+ },
+ {
+ "epoch": 2.4605200945626478,
+ "grad_norm": 2.671008586883545,
+ "learning_rate": 3.232089170427656e-06,
+ "loss": 0.4446,
+ "step": 5204
+ },
+ {
+ "epoch": 2.4609929078014185,
+ "grad_norm": 2.7365503311157227,
+ "learning_rate": 3.2314926675026498e-06,
+ "loss": 0.4402,
+ "step": 5205
+ },
+ {
+ "epoch": 2.461465721040189,
+ "grad_norm": 2.8163657188415527,
+ "learning_rate": 3.230896119030803e-06,
+ "loss": 0.3881,
+ "step": 5206
+ },
+ {
+ "epoch": 2.4619385342789597,
+ "grad_norm": 2.812433958053589,
+ "learning_rate": 3.2302995250492584e-06,
+ "loss": 0.4897,
+ "step": 5207
+ },
+ {
+ "epoch": 2.4624113475177305,
+ "grad_norm": 2.786033868789673,
+ "learning_rate": 3.2297028855951664e-06,
+ "loss": 0.4069,
+ "step": 5208
+ },
+ {
+ "epoch": 2.4628841607565013,
+ "grad_norm": 3.0247974395751953,
+ "learning_rate": 3.229106200705674e-06,
+ "loss": 0.4048,
+ "step": 5209
+ },
+ {
+ "epoch": 2.463356973995272,
+ "grad_norm": 3.3280487060546875,
+ "learning_rate": 3.2285094704179353e-06,
+ "loss": 0.5613,
+ "step": 5210
+ },
+ {
+ "epoch": 2.4638297872340424,
+ "grad_norm": 2.603219985961914,
+ "learning_rate": 3.2279126947691073e-06,
+ "loss": 0.432,
+ "step": 5211
+ },
+ {
+ "epoch": 2.4643026004728132,
+ "grad_norm": 3.1532180309295654,
+ "learning_rate": 3.2273158737963472e-06,
+ "loss": 0.4602,
+ "step": 5212
+ },
+ {
+ "epoch": 2.464775413711584,
+ "grad_norm": 2.7512969970703125,
+ "learning_rate": 3.2267190075368164e-06,
+ "loss": 0.5064,
+ "step": 5213
+ },
+ {
+ "epoch": 2.4652482269503544,
+ "grad_norm": 2.926992177963257,
+ "learning_rate": 3.22612209602768e-06,
+ "loss": 0.4753,
+ "step": 5214
+ },
+ {
+ "epoch": 2.465721040189125,
+ "grad_norm": 4.052840709686279,
+ "learning_rate": 3.2255251393061047e-06,
+ "loss": 0.5235,
+ "step": 5215
+ },
+ {
+ "epoch": 2.466193853427896,
+ "grad_norm": 2.8266959190368652,
+ "learning_rate": 3.2249281374092606e-06,
+ "loss": 0.3931,
+ "step": 5216
+ },
+ {
+ "epoch": 2.466666666666667,
+ "grad_norm": 2.564359426498413,
+ "learning_rate": 3.2243310903743196e-06,
+ "loss": 0.4146,
+ "step": 5217
+ },
+ {
+ "epoch": 2.4671394799054376,
+ "grad_norm": 2.387925148010254,
+ "learning_rate": 3.2237339982384576e-06,
+ "loss": 0.4142,
+ "step": 5218
+ },
+ {
+ "epoch": 2.467612293144208,
+ "grad_norm": 2.7045164108276367,
+ "learning_rate": 3.223136861038853e-06,
+ "loss": 0.4345,
+ "step": 5219
+ },
+ {
+ "epoch": 2.4680851063829787,
+ "grad_norm": 2.6963284015655518,
+ "learning_rate": 3.2225396788126872e-06,
+ "loss": 0.4243,
+ "step": 5220
+ },
+ {
+ "epoch": 2.4685579196217495,
+ "grad_norm": 2.8247268199920654,
+ "learning_rate": 3.221942451597144e-06,
+ "loss": 0.3919,
+ "step": 5221
+ },
+ {
+ "epoch": 2.46903073286052,
+ "grad_norm": 3.843836784362793,
+ "learning_rate": 3.2213451794294093e-06,
+ "loss": 0.4183,
+ "step": 5222
+ },
+ {
+ "epoch": 2.4695035460992907,
+ "grad_norm": 2.8579909801483154,
+ "learning_rate": 3.220747862346674e-06,
+ "loss": 0.4844,
+ "step": 5223
+ },
+ {
+ "epoch": 2.4699763593380615,
+ "grad_norm": 3.744027853012085,
+ "learning_rate": 3.2201505003861294e-06,
+ "loss": 0.4563,
+ "step": 5224
+ },
+ {
+ "epoch": 2.4704491725768323,
+ "grad_norm": 2.835108995437622,
+ "learning_rate": 3.219553093584971e-06,
+ "loss": 0.4394,
+ "step": 5225
+ },
+ {
+ "epoch": 2.470921985815603,
+ "grad_norm": 2.5681865215301514,
+ "learning_rate": 3.218955641980397e-06,
+ "loss": 0.3907,
+ "step": 5226
+ },
+ {
+ "epoch": 2.4713947990543734,
+ "grad_norm": 2.963172674179077,
+ "learning_rate": 3.2183581456096067e-06,
+ "loss": 0.5163,
+ "step": 5227
+ },
+ {
+ "epoch": 2.4718676122931442,
+ "grad_norm": 2.7840685844421387,
+ "learning_rate": 3.2177606045098047e-06,
+ "loss": 0.411,
+ "step": 5228
+ },
+ {
+ "epoch": 2.472340425531915,
+ "grad_norm": 2.7849979400634766,
+ "learning_rate": 3.2171630187181977e-06,
+ "loss": 0.4671,
+ "step": 5229
+ },
+ {
+ "epoch": 2.4728132387706854,
+ "grad_norm": 2.736406087875366,
+ "learning_rate": 3.216565388271994e-06,
+ "loss": 0.5225,
+ "step": 5230
+ },
+ {
+ "epoch": 2.473286052009456,
+ "grad_norm": 2.978271007537842,
+ "learning_rate": 3.215967713208406e-06,
+ "loss": 0.4668,
+ "step": 5231
+ },
+ {
+ "epoch": 2.473758865248227,
+ "grad_norm": 2.687560796737671,
+ "learning_rate": 3.2153699935646475e-06,
+ "loss": 0.4683,
+ "step": 5232
+ },
+ {
+ "epoch": 2.4742316784869978,
+ "grad_norm": 2.7096521854400635,
+ "learning_rate": 3.214772229377936e-06,
+ "loss": 0.4999,
+ "step": 5233
+ },
+ {
+ "epoch": 2.4747044917257686,
+ "grad_norm": 3.1861157417297363,
+ "learning_rate": 3.214174420685493e-06,
+ "loss": 0.4365,
+ "step": 5234
+ },
+ {
+ "epoch": 2.475177304964539,
+ "grad_norm": 2.623061418533325,
+ "learning_rate": 3.2135765675245394e-06,
+ "loss": 0.3717,
+ "step": 5235
+ },
+ {
+ "epoch": 2.4756501182033097,
+ "grad_norm": 2.680921792984009,
+ "learning_rate": 3.2129786699323016e-06,
+ "loss": 0.4688,
+ "step": 5236
+ },
+ {
+ "epoch": 2.4761229314420805,
+ "grad_norm": 2.80426025390625,
+ "learning_rate": 3.2123807279460096e-06,
+ "loss": 0.5043,
+ "step": 5237
+ },
+ {
+ "epoch": 2.476595744680851,
+ "grad_norm": 2.676156997680664,
+ "learning_rate": 3.211782741602893e-06,
+ "loss": 0.4486,
+ "step": 5238
+ },
+ {
+ "epoch": 2.4770685579196217,
+ "grad_norm": 2.700822591781616,
+ "learning_rate": 3.2111847109401855e-06,
+ "loss": 0.4097,
+ "step": 5239
+ },
+ {
+ "epoch": 2.4775413711583925,
+ "grad_norm": 2.735387086868286,
+ "learning_rate": 3.2105866359951254e-06,
+ "loss": 0.4357,
+ "step": 5240
+ },
+ {
+ "epoch": 2.4780141843971633,
+ "grad_norm": 2.961874485015869,
+ "learning_rate": 3.2099885168049507e-06,
+ "loss": 0.4942,
+ "step": 5241
+ },
+ {
+ "epoch": 2.478486997635934,
+ "grad_norm": 2.546588659286499,
+ "learning_rate": 3.209390353406904e-06,
+ "loss": 0.3852,
+ "step": 5242
+ },
+ {
+ "epoch": 2.4789598108747044,
+ "grad_norm": 2.6269772052764893,
+ "learning_rate": 3.208792145838231e-06,
+ "loss": 0.3935,
+ "step": 5243
+ },
+ {
+ "epoch": 2.479432624113475,
+ "grad_norm": 2.9009883403778076,
+ "learning_rate": 3.208193894136179e-06,
+ "loss": 0.4003,
+ "step": 5244
+ },
+ {
+ "epoch": 2.479905437352246,
+ "grad_norm": 2.772834300994873,
+ "learning_rate": 3.2075955983379982e-06,
+ "loss": 0.4742,
+ "step": 5245
+ },
+ {
+ "epoch": 2.4803782505910164,
+ "grad_norm": 2.728703737258911,
+ "learning_rate": 3.2069972584809423e-06,
+ "loss": 0.4405,
+ "step": 5246
+ },
+ {
+ "epoch": 2.480851063829787,
+ "grad_norm": 2.72868275642395,
+ "learning_rate": 3.206398874602268e-06,
+ "loss": 0.4714,
+ "step": 5247
+ },
+ {
+ "epoch": 2.481323877068558,
+ "grad_norm": 2.6804213523864746,
+ "learning_rate": 3.2058004467392323e-06,
+ "loss": 0.4106,
+ "step": 5248
+ },
+ {
+ "epoch": 2.4817966903073287,
+ "grad_norm": 2.6740739345550537,
+ "learning_rate": 3.205201974929098e-06,
+ "loss": 0.3855,
+ "step": 5249
+ },
+ {
+ "epoch": 2.482269503546099,
+ "grad_norm": 2.8131754398345947,
+ "learning_rate": 3.204603459209129e-06,
+ "loss": 0.418,
+ "step": 5250
+ },
+ {
+ "epoch": 2.48274231678487,
+ "grad_norm": 2.5242888927459717,
+ "learning_rate": 3.204004899616592e-06,
+ "loss": 0.4914,
+ "step": 5251
+ },
+ {
+ "epoch": 2.4832151300236407,
+ "grad_norm": 2.969191551208496,
+ "learning_rate": 3.2034062961887567e-06,
+ "loss": 0.4634,
+ "step": 5252
+ },
+ {
+ "epoch": 2.4836879432624115,
+ "grad_norm": 2.967968463897705,
+ "learning_rate": 3.2028076489628963e-06,
+ "loss": 0.456,
+ "step": 5253
+ },
+ {
+ "epoch": 2.484160756501182,
+ "grad_norm": 2.9006540775299072,
+ "learning_rate": 3.2022089579762845e-06,
+ "loss": 0.4203,
+ "step": 5254
+ },
+ {
+ "epoch": 2.4846335697399526,
+ "grad_norm": 2.6377336978912354,
+ "learning_rate": 3.2016102232662003e-06,
+ "loss": 0.4518,
+ "step": 5255
+ },
+ {
+ "epoch": 2.4851063829787234,
+ "grad_norm": 2.757749319076538,
+ "learning_rate": 3.201011444869925e-06,
+ "loss": 0.4314,
+ "step": 5256
+ },
+ {
+ "epoch": 2.4855791962174942,
+ "grad_norm": 2.571560859680176,
+ "learning_rate": 3.20041262282474e-06,
+ "loss": 0.427,
+ "step": 5257
+ },
+ {
+ "epoch": 2.4860520094562646,
+ "grad_norm": 3.1367194652557373,
+ "learning_rate": 3.1998137571679316e-06,
+ "loss": 0.4901,
+ "step": 5258
+ },
+ {
+ "epoch": 2.4865248226950354,
+ "grad_norm": 3.194042205810547,
+ "learning_rate": 3.1992148479367896e-06,
+ "loss": 0.466,
+ "step": 5259
+ },
+ {
+ "epoch": 2.486997635933806,
+ "grad_norm": 2.5546324253082275,
+ "learning_rate": 3.1986158951686052e-06,
+ "loss": 0.4182,
+ "step": 5260
+ },
+ {
+ "epoch": 2.487470449172577,
+ "grad_norm": 2.919783115386963,
+ "learning_rate": 3.198016898900672e-06,
+ "loss": 0.4234,
+ "step": 5261
+ },
+ {
+ "epoch": 2.4879432624113473,
+ "grad_norm": 2.865248918533325,
+ "learning_rate": 3.1974178591702877e-06,
+ "loss": 0.4291,
+ "step": 5262
+ },
+ {
+ "epoch": 2.488416075650118,
+ "grad_norm": 2.685737133026123,
+ "learning_rate": 3.196818776014752e-06,
+ "loss": 0.4548,
+ "step": 5263
+ },
+ {
+ "epoch": 2.488888888888889,
+ "grad_norm": 2.826974630355835,
+ "learning_rate": 3.196219649471365e-06,
+ "loss": 0.4152,
+ "step": 5264
+ },
+ {
+ "epoch": 2.4893617021276597,
+ "grad_norm": 2.764975070953369,
+ "learning_rate": 3.1956204795774336e-06,
+ "loss": 0.5209,
+ "step": 5265
+ },
+ {
+ "epoch": 2.48983451536643,
+ "grad_norm": 2.4184255599975586,
+ "learning_rate": 3.1950212663702662e-06,
+ "loss": 0.3969,
+ "step": 5266
+ },
+ {
+ "epoch": 2.490307328605201,
+ "grad_norm": 2.9361133575439453,
+ "learning_rate": 3.1944220098871713e-06,
+ "loss": 0.4589,
+ "step": 5267
+ },
+ {
+ "epoch": 2.4907801418439717,
+ "grad_norm": 2.377051830291748,
+ "learning_rate": 3.193822710165463e-06,
+ "loss": 0.4328,
+ "step": 5268
+ },
+ {
+ "epoch": 2.4912529550827425,
+ "grad_norm": 3.1302497386932373,
+ "learning_rate": 3.1932233672424563e-06,
+ "loss": 0.3918,
+ "step": 5269
+ },
+ {
+ "epoch": 2.491725768321513,
+ "grad_norm": 2.89577579498291,
+ "learning_rate": 3.192623981155471e-06,
+ "loss": 0.5004,
+ "step": 5270
+ },
+ {
+ "epoch": 2.4921985815602836,
+ "grad_norm": 2.7735235691070557,
+ "learning_rate": 3.1920245519418273e-06,
+ "loss": 0.4206,
+ "step": 5271
+ },
+ {
+ "epoch": 2.4926713947990544,
+ "grad_norm": 2.5424516201019287,
+ "learning_rate": 3.1914250796388493e-06,
+ "loss": 0.4419,
+ "step": 5272
+ },
+ {
+ "epoch": 2.493144208037825,
+ "grad_norm": 3.1216981410980225,
+ "learning_rate": 3.1908255642838628e-06,
+ "loss": 0.4552,
+ "step": 5273
+ },
+ {
+ "epoch": 2.4936170212765956,
+ "grad_norm": 3.044045925140381,
+ "learning_rate": 3.1902260059141978e-06,
+ "loss": 0.4967,
+ "step": 5274
+ },
+ {
+ "epoch": 2.4940898345153664,
+ "grad_norm": 2.5630741119384766,
+ "learning_rate": 3.189626404567186e-06,
+ "loss": 0.3908,
+ "step": 5275
+ },
+ {
+ "epoch": 2.494562647754137,
+ "grad_norm": 2.7177648544311523,
+ "learning_rate": 3.189026760280162e-06,
+ "loss": 0.4915,
+ "step": 5276
+ },
+ {
+ "epoch": 2.495035460992908,
+ "grad_norm": 2.653416395187378,
+ "learning_rate": 3.1884270730904632e-06,
+ "loss": 0.4633,
+ "step": 5277
+ },
+ {
+ "epoch": 2.4955082742316783,
+ "grad_norm": 3.7212321758270264,
+ "learning_rate": 3.1878273430354284e-06,
+ "loss": 0.4549,
+ "step": 5278
+ },
+ {
+ "epoch": 2.495981087470449,
+ "grad_norm": 2.4152729511260986,
+ "learning_rate": 3.187227570152402e-06,
+ "loss": 0.4674,
+ "step": 5279
+ },
+ {
+ "epoch": 2.49645390070922,
+ "grad_norm": 2.5354862213134766,
+ "learning_rate": 3.1866277544787284e-06,
+ "loss": 0.4135,
+ "step": 5280
+ },
+ {
+ "epoch": 2.4969267139479907,
+ "grad_norm": 3.1766583919525146,
+ "learning_rate": 3.186027896051754e-06,
+ "loss": 0.5656,
+ "step": 5281
+ },
+ {
+ "epoch": 2.497399527186761,
+ "grad_norm": 2.5636754035949707,
+ "learning_rate": 3.1854279949088313e-06,
+ "loss": 0.4138,
+ "step": 5282
+ },
+ {
+ "epoch": 2.497872340425532,
+ "grad_norm": 2.7615602016448975,
+ "learning_rate": 3.1848280510873124e-06,
+ "loss": 0.4936,
+ "step": 5283
+ },
+ {
+ "epoch": 2.4983451536643027,
+ "grad_norm": 2.964721918106079,
+ "learning_rate": 3.1842280646245543e-06,
+ "loss": 0.4865,
+ "step": 5284
+ },
+ {
+ "epoch": 2.4988179669030735,
+ "grad_norm": 2.6915178298950195,
+ "learning_rate": 3.1836280355579152e-06,
+ "loss": 0.4179,
+ "step": 5285
+ },
+ {
+ "epoch": 2.499290780141844,
+ "grad_norm": 2.820451259613037,
+ "learning_rate": 3.183027963924755e-06,
+ "loss": 0.4785,
+ "step": 5286
+ },
+ {
+ "epoch": 2.4997635933806146,
+ "grad_norm": 2.841719627380371,
+ "learning_rate": 3.1824278497624393e-06,
+ "loss": 0.4535,
+ "step": 5287
+ },
+ {
+ "epoch": 2.5002364066193854,
+ "grad_norm": 2.459167957305908,
+ "learning_rate": 3.181827693108333e-06,
+ "loss": 0.4353,
+ "step": 5288
+ },
+ {
+ "epoch": 2.500709219858156,
+ "grad_norm": 3.2538363933563232,
+ "learning_rate": 3.1812274939998066e-06,
+ "loss": 0.4037,
+ "step": 5289
+ },
+ {
+ "epoch": 2.5011820330969265,
+ "grad_norm": 2.6980504989624023,
+ "learning_rate": 3.180627252474231e-06,
+ "loss": 0.4181,
+ "step": 5290
+ },
+ {
+ "epoch": 2.5016548463356973,
+ "grad_norm": 2.9400012493133545,
+ "learning_rate": 3.1800269685689804e-06,
+ "loss": 0.4642,
+ "step": 5291
+ },
+ {
+ "epoch": 2.502127659574468,
+ "grad_norm": 2.7832958698272705,
+ "learning_rate": 3.1794266423214328e-06,
+ "loss": 0.3936,
+ "step": 5292
+ },
+ {
+ "epoch": 2.5026004728132385,
+ "grad_norm": 2.4017868041992188,
+ "learning_rate": 3.178826273768967e-06,
+ "loss": 0.3984,
+ "step": 5293
+ },
+ {
+ "epoch": 2.5030732860520093,
+ "grad_norm": 2.398120641708374,
+ "learning_rate": 3.1782258629489665e-06,
+ "loss": 0.4219,
+ "step": 5294
+ },
+ {
+ "epoch": 2.50354609929078,
+ "grad_norm": 2.973947763442993,
+ "learning_rate": 3.177625409898815e-06,
+ "loss": 0.4192,
+ "step": 5295
+ },
+ {
+ "epoch": 2.504018912529551,
+ "grad_norm": 3.1169888973236084,
+ "learning_rate": 3.1770249146559006e-06,
+ "loss": 0.5098,
+ "step": 5296
+ },
+ {
+ "epoch": 2.5044917257683217,
+ "grad_norm": 2.816964864730835,
+ "learning_rate": 3.1764243772576132e-06,
+ "loss": 0.4228,
+ "step": 5297
+ },
+ {
+ "epoch": 2.504964539007092,
+ "grad_norm": 2.5624163150787354,
+ "learning_rate": 3.1758237977413452e-06,
+ "loss": 0.4389,
+ "step": 5298
+ },
+ {
+ "epoch": 2.505437352245863,
+ "grad_norm": 2.7477777004241943,
+ "learning_rate": 3.175223176144494e-06,
+ "loss": 0.4564,
+ "step": 5299
+ },
+ {
+ "epoch": 2.5059101654846336,
+ "grad_norm": 3.1478309631347656,
+ "learning_rate": 3.174622512504456e-06,
+ "loss": 0.4859,
+ "step": 5300
+ },
+ {
+ "epoch": 2.506382978723404,
+ "grad_norm": 2.8400418758392334,
+ "learning_rate": 3.1740218068586315e-06,
+ "loss": 0.4476,
+ "step": 5301
+ },
+ {
+ "epoch": 2.506855791962175,
+ "grad_norm": 2.7097036838531494,
+ "learning_rate": 3.173421059244426e-06,
+ "loss": 0.4559,
+ "step": 5302
+ },
+ {
+ "epoch": 2.5073286052009456,
+ "grad_norm": 2.864760637283325,
+ "learning_rate": 3.172820269699243e-06,
+ "loss": 0.5124,
+ "step": 5303
+ },
+ {
+ "epoch": 2.5078014184397164,
+ "grad_norm": 2.877110004425049,
+ "learning_rate": 3.1722194382604926e-06,
+ "loss": 0.5083,
+ "step": 5304
+ },
+ {
+ "epoch": 2.508274231678487,
+ "grad_norm": 3.2369656562805176,
+ "learning_rate": 3.1716185649655844e-06,
+ "loss": 0.4894,
+ "step": 5305
+ },
+ {
+ "epoch": 2.5087470449172575,
+ "grad_norm": 2.7377753257751465,
+ "learning_rate": 3.171017649851934e-06,
+ "loss": 0.4324,
+ "step": 5306
+ },
+ {
+ "epoch": 2.5092198581560283,
+ "grad_norm": 2.883364200592041,
+ "learning_rate": 3.1704166929569564e-06,
+ "loss": 0.3731,
+ "step": 5307
+ },
+ {
+ "epoch": 2.509692671394799,
+ "grad_norm": 2.5724737644195557,
+ "learning_rate": 3.1698156943180716e-06,
+ "loss": 0.4768,
+ "step": 5308
+ },
+ {
+ "epoch": 2.5101654846335695,
+ "grad_norm": 2.7532460689544678,
+ "learning_rate": 3.1692146539727e-06,
+ "loss": 0.4385,
+ "step": 5309
+ },
+ {
+ "epoch": 2.5106382978723403,
+ "grad_norm": 2.786505699157715,
+ "learning_rate": 3.168613571958267e-06,
+ "loss": 0.4241,
+ "step": 5310
+ },
+ {
+ "epoch": 2.511111111111111,
+ "grad_norm": 3.1674118041992188,
+ "learning_rate": 3.1680124483121975e-06,
+ "loss": 0.4445,
+ "step": 5311
+ },
+ {
+ "epoch": 2.511583924349882,
+ "grad_norm": 2.7861545085906982,
+ "learning_rate": 3.167411283071923e-06,
+ "loss": 0.4264,
+ "step": 5312
+ },
+ {
+ "epoch": 2.5120567375886527,
+ "grad_norm": 2.7412493228912354,
+ "learning_rate": 3.1668100762748745e-06,
+ "loss": 0.4725,
+ "step": 5313
+ },
+ {
+ "epoch": 2.512529550827423,
+ "grad_norm": 2.710019588470459,
+ "learning_rate": 3.1662088279584858e-06,
+ "loss": 0.5207,
+ "step": 5314
+ },
+ {
+ "epoch": 2.513002364066194,
+ "grad_norm": 2.694812297821045,
+ "learning_rate": 3.165607538160194e-06,
+ "loss": 0.3666,
+ "step": 5315
+ },
+ {
+ "epoch": 2.5134751773049646,
+ "grad_norm": 2.4390623569488525,
+ "learning_rate": 3.1650062069174405e-06,
+ "loss": 0.4025,
+ "step": 5316
+ },
+ {
+ "epoch": 2.513947990543735,
+ "grad_norm": 3.055738925933838,
+ "learning_rate": 3.1644048342676663e-06,
+ "loss": 0.4288,
+ "step": 5317
+ },
+ {
+ "epoch": 2.5144208037825058,
+ "grad_norm": 3.065824508666992,
+ "learning_rate": 3.163803420248316e-06,
+ "loss": 0.4592,
+ "step": 5318
+ },
+ {
+ "epoch": 2.5148936170212766,
+ "grad_norm": 2.6011085510253906,
+ "learning_rate": 3.163201964896838e-06,
+ "loss": 0.4081,
+ "step": 5319
+ },
+ {
+ "epoch": 2.5153664302600474,
+ "grad_norm": 2.4833033084869385,
+ "learning_rate": 3.162600468250681e-06,
+ "loss": 0.4343,
+ "step": 5320
+ },
+ {
+ "epoch": 2.515839243498818,
+ "grad_norm": 2.9035534858703613,
+ "learning_rate": 3.161998930347299e-06,
+ "loss": 0.4972,
+ "step": 5321
+ },
+ {
+ "epoch": 2.5163120567375885,
+ "grad_norm": 2.788752317428589,
+ "learning_rate": 3.161397351224146e-06,
+ "loss": 0.4597,
+ "step": 5322
+ },
+ {
+ "epoch": 2.5167848699763593,
+ "grad_norm": 2.4344491958618164,
+ "learning_rate": 3.16079573091868e-06,
+ "loss": 0.359,
+ "step": 5323
+ },
+ {
+ "epoch": 2.51725768321513,
+ "grad_norm": 2.750150680541992,
+ "learning_rate": 3.160194069468361e-06,
+ "loss": 0.4596,
+ "step": 5324
+ },
+ {
+ "epoch": 2.5177304964539005,
+ "grad_norm": 2.826902389526367,
+ "learning_rate": 3.1595923669106526e-06,
+ "loss": 0.4377,
+ "step": 5325
+ },
+ {
+ "epoch": 2.5182033096926713,
+ "grad_norm": 2.554439067840576,
+ "learning_rate": 3.15899062328302e-06,
+ "loss": 0.4517,
+ "step": 5326
+ },
+ {
+ "epoch": 2.518676122931442,
+ "grad_norm": 3.0882742404937744,
+ "learning_rate": 3.158388838622931e-06,
+ "loss": 0.47,
+ "step": 5327
+ },
+ {
+ "epoch": 2.519148936170213,
+ "grad_norm": 2.918947696685791,
+ "learning_rate": 3.157787012967856e-06,
+ "loss": 0.522,
+ "step": 5328
+ },
+ {
+ "epoch": 2.5196217494089836,
+ "grad_norm": 2.8057637214660645,
+ "learning_rate": 3.1571851463552674e-06,
+ "loss": 0.4837,
+ "step": 5329
+ },
+ {
+ "epoch": 2.520094562647754,
+ "grad_norm": 2.66241455078125,
+ "learning_rate": 3.156583238822641e-06,
+ "loss": 0.3988,
+ "step": 5330
+ },
+ {
+ "epoch": 2.520567375886525,
+ "grad_norm": 2.9793803691864014,
+ "learning_rate": 3.155981290407456e-06,
+ "loss": 0.4737,
+ "step": 5331
+ },
+ {
+ "epoch": 2.5210401891252956,
+ "grad_norm": 2.847522258758545,
+ "learning_rate": 3.1553793011471924e-06,
+ "loss": 0.4394,
+ "step": 5332
+ },
+ {
+ "epoch": 2.521513002364066,
+ "grad_norm": 2.9561474323272705,
+ "learning_rate": 3.154777271079333e-06,
+ "loss": 0.47,
+ "step": 5333
+ },
+ {
+ "epoch": 2.5219858156028367,
+ "grad_norm": 2.8353018760681152,
+ "learning_rate": 3.154175200241365e-06,
+ "loss": 0.4015,
+ "step": 5334
+ },
+ {
+ "epoch": 2.5224586288416075,
+ "grad_norm": 2.609049081802368,
+ "learning_rate": 3.153573088670775e-06,
+ "loss": 0.4723,
+ "step": 5335
+ },
+ {
+ "epoch": 2.5229314420803783,
+ "grad_norm": 2.8538455963134766,
+ "learning_rate": 3.1529709364050556e-06,
+ "loss": 0.4665,
+ "step": 5336
+ },
+ {
+ "epoch": 2.523404255319149,
+ "grad_norm": 2.768310785293579,
+ "learning_rate": 3.1523687434816978e-06,
+ "loss": 0.4933,
+ "step": 5337
+ },
+ {
+ "epoch": 2.5238770685579195,
+ "grad_norm": 2.9300906658172607,
+ "learning_rate": 3.1517665099382e-06,
+ "loss": 0.4651,
+ "step": 5338
+ },
+ {
+ "epoch": 2.5243498817966903,
+ "grad_norm": 2.6984703540802,
+ "learning_rate": 3.1511642358120585e-06,
+ "loss": 0.4442,
+ "step": 5339
+ },
+ {
+ "epoch": 2.524822695035461,
+ "grad_norm": 2.8148467540740967,
+ "learning_rate": 3.1505619211407762e-06,
+ "loss": 0.4611,
+ "step": 5340
+ },
+ {
+ "epoch": 2.5252955082742314,
+ "grad_norm": 2.816436290740967,
+ "learning_rate": 3.1499595659618556e-06,
+ "loss": 0.5291,
+ "step": 5341
+ },
+ {
+ "epoch": 2.5257683215130022,
+ "grad_norm": 2.902805805206299,
+ "learning_rate": 3.149357170312802e-06,
+ "loss": 0.4394,
+ "step": 5342
+ },
+ {
+ "epoch": 2.526241134751773,
+ "grad_norm": 2.6443474292755127,
+ "learning_rate": 3.148754734231126e-06,
+ "loss": 0.4444,
+ "step": 5343
+ },
+ {
+ "epoch": 2.526713947990544,
+ "grad_norm": 2.6818583011627197,
+ "learning_rate": 3.148152257754336e-06,
+ "loss": 0.4256,
+ "step": 5344
+ },
+ {
+ "epoch": 2.5271867612293146,
+ "grad_norm": 2.5266945362091064,
+ "learning_rate": 3.1475497409199485e-06,
+ "loss": 0.4087,
+ "step": 5345
+ },
+ {
+ "epoch": 2.527659574468085,
+ "grad_norm": 2.6326711177825928,
+ "learning_rate": 3.146947183765477e-06,
+ "loss": 0.3842,
+ "step": 5346
+ },
+ {
+ "epoch": 2.5281323877068558,
+ "grad_norm": 3.122880697250366,
+ "learning_rate": 3.1463445863284413e-06,
+ "loss": 0.482,
+ "step": 5347
+ },
+ {
+ "epoch": 2.5286052009456266,
+ "grad_norm": 2.819258213043213,
+ "learning_rate": 3.145741948646362e-06,
+ "loss": 0.4628,
+ "step": 5348
+ },
+ {
+ "epoch": 2.529078014184397,
+ "grad_norm": 2.5842230319976807,
+ "learning_rate": 3.145139270756764e-06,
+ "loss": 0.4479,
+ "step": 5349
+ },
+ {
+ "epoch": 2.5295508274231677,
+ "grad_norm": 2.7257237434387207,
+ "learning_rate": 3.144536552697172e-06,
+ "loss": 0.473,
+ "step": 5350
+ },
+ {
+ "epoch": 2.5300236406619385,
+ "grad_norm": 2.6876981258392334,
+ "learning_rate": 3.143933794505115e-06,
+ "loss": 0.4615,
+ "step": 5351
+ },
+ {
+ "epoch": 2.5304964539007093,
+ "grad_norm": 2.7942895889282227,
+ "learning_rate": 3.143330996218124e-06,
+ "loss": 0.4982,
+ "step": 5352
+ },
+ {
+ "epoch": 2.53096926713948,
+ "grad_norm": 2.3150579929351807,
+ "learning_rate": 3.1427281578737327e-06,
+ "loss": 0.3905,
+ "step": 5353
+ },
+ {
+ "epoch": 2.5314420803782505,
+ "grad_norm": 2.7326138019561768,
+ "learning_rate": 3.142125279509478e-06,
+ "loss": 0.4076,
+ "step": 5354
+ },
+ {
+ "epoch": 2.5319148936170213,
+ "grad_norm": 2.46362566947937,
+ "learning_rate": 3.1415223611628976e-06,
+ "loss": 0.4043,
+ "step": 5355
+ },
+ {
+ "epoch": 2.532387706855792,
+ "grad_norm": 2.6670427322387695,
+ "learning_rate": 3.1409194028715323e-06,
+ "loss": 0.484,
+ "step": 5356
+ },
+ {
+ "epoch": 2.5328605200945624,
+ "grad_norm": 2.917771100997925,
+ "learning_rate": 3.140316404672926e-06,
+ "loss": 0.4539,
+ "step": 5357
+ },
+ {
+ "epoch": 2.533333333333333,
+ "grad_norm": 2.7964110374450684,
+ "learning_rate": 3.1397133666046254e-06,
+ "loss": 0.4706,
+ "step": 5358
+ },
+ {
+ "epoch": 2.533806146572104,
+ "grad_norm": 2.6481330394744873,
+ "learning_rate": 3.139110288704179e-06,
+ "loss": 0.4101,
+ "step": 5359
+ },
+ {
+ "epoch": 2.534278959810875,
+ "grad_norm": 2.859452962875366,
+ "learning_rate": 3.1385071710091365e-06,
+ "loss": 0.4842,
+ "step": 5360
+ },
+ {
+ "epoch": 2.5347517730496456,
+ "grad_norm": 2.686077356338501,
+ "learning_rate": 3.137904013557052e-06,
+ "loss": 0.4073,
+ "step": 5361
+ },
+ {
+ "epoch": 2.535224586288416,
+ "grad_norm": 3.7147045135498047,
+ "learning_rate": 3.137300816385482e-06,
+ "loss": 0.4536,
+ "step": 5362
+ },
+ {
+ "epoch": 2.5356973995271868,
+ "grad_norm": 2.51054048538208,
+ "learning_rate": 3.1366975795319856e-06,
+ "loss": 0.4171,
+ "step": 5363
+ },
+ {
+ "epoch": 2.5361702127659576,
+ "grad_norm": 3.043149471282959,
+ "learning_rate": 3.136094303034121e-06,
+ "loss": 0.5179,
+ "step": 5364
+ },
+ {
+ "epoch": 2.536643026004728,
+ "grad_norm": 2.398878812789917,
+ "learning_rate": 3.1354909869294548e-06,
+ "loss": 0.4144,
+ "step": 5365
+ },
+ {
+ "epoch": 2.5371158392434987,
+ "grad_norm": 2.969712257385254,
+ "learning_rate": 3.134887631255551e-06,
+ "loss": 0.3983,
+ "step": 5366
+ },
+ {
+ "epoch": 2.5375886524822695,
+ "grad_norm": 2.7707982063293457,
+ "learning_rate": 3.134284236049978e-06,
+ "loss": 0.4405,
+ "step": 5367
+ },
+ {
+ "epoch": 2.5380614657210403,
+ "grad_norm": 2.579742193222046,
+ "learning_rate": 3.1336808013503073e-06,
+ "loss": 0.4402,
+ "step": 5368
+ },
+ {
+ "epoch": 2.538534278959811,
+ "grad_norm": 2.6041927337646484,
+ "learning_rate": 3.1330773271941113e-06,
+ "loss": 0.396,
+ "step": 5369
+ },
+ {
+ "epoch": 2.5390070921985815,
+ "grad_norm": 2.7383856773376465,
+ "learning_rate": 3.1324738136189658e-06,
+ "loss": 0.4424,
+ "step": 5370
+ },
+ {
+ "epoch": 2.5394799054373522,
+ "grad_norm": 3.053644895553589,
+ "learning_rate": 3.13187026066245e-06,
+ "loss": 0.473,
+ "step": 5371
+ },
+ {
+ "epoch": 2.539952718676123,
+ "grad_norm": 2.684244155883789,
+ "learning_rate": 3.1312666683621428e-06,
+ "loss": 0.3963,
+ "step": 5372
+ },
+ {
+ "epoch": 2.5404255319148934,
+ "grad_norm": 2.6505017280578613,
+ "learning_rate": 3.130663036755629e-06,
+ "loss": 0.4292,
+ "step": 5373
+ },
+ {
+ "epoch": 2.540898345153664,
+ "grad_norm": 3.025965929031372,
+ "learning_rate": 3.1300593658804935e-06,
+ "loss": 0.4539,
+ "step": 5374
+ },
+ {
+ "epoch": 2.541371158392435,
+ "grad_norm": 2.72106671333313,
+ "learning_rate": 3.1294556557743237e-06,
+ "loss": 0.4519,
+ "step": 5375
+ },
+ {
+ "epoch": 2.541843971631206,
+ "grad_norm": 2.759995222091675,
+ "learning_rate": 3.12885190647471e-06,
+ "loss": 0.451,
+ "step": 5376
+ },
+ {
+ "epoch": 2.5423167848699766,
+ "grad_norm": 2.697950601577759,
+ "learning_rate": 3.1282481180192457e-06,
+ "loss": 0.4328,
+ "step": 5377
+ },
+ {
+ "epoch": 2.542789598108747,
+ "grad_norm": 2.6970415115356445,
+ "learning_rate": 3.127644290445526e-06,
+ "loss": 0.4489,
+ "step": 5378
+ },
+ {
+ "epoch": 2.5432624113475177,
+ "grad_norm": 2.5856997966766357,
+ "learning_rate": 3.127040423791148e-06,
+ "loss": 0.3848,
+ "step": 5379
+ },
+ {
+ "epoch": 2.5437352245862885,
+ "grad_norm": 2.9798166751861572,
+ "learning_rate": 3.1264365180937127e-06,
+ "loss": 0.5038,
+ "step": 5380
+ },
+ {
+ "epoch": 2.544208037825059,
+ "grad_norm": 3.413175106048584,
+ "learning_rate": 3.1258325733908224e-06,
+ "loss": 0.5247,
+ "step": 5381
+ },
+ {
+ "epoch": 2.5446808510638297,
+ "grad_norm": 2.838517904281616,
+ "learning_rate": 3.1252285897200818e-06,
+ "loss": 0.4652,
+ "step": 5382
+ },
+ {
+ "epoch": 2.5451536643026005,
+ "grad_norm": 2.8342528343200684,
+ "learning_rate": 3.1246245671190983e-06,
+ "loss": 0.4245,
+ "step": 5383
+ },
+ {
+ "epoch": 2.5456264775413713,
+ "grad_norm": 3.06026029586792,
+ "learning_rate": 3.124020505625482e-06,
+ "loss": 0.469,
+ "step": 5384
+ },
+ {
+ "epoch": 2.546099290780142,
+ "grad_norm": 2.633894681930542,
+ "learning_rate": 3.1234164052768452e-06,
+ "loss": 0.4509,
+ "step": 5385
+ },
+ {
+ "epoch": 2.5465721040189124,
+ "grad_norm": 2.634819984436035,
+ "learning_rate": 3.1228122661108023e-06,
+ "loss": 0.4879,
+ "step": 5386
+ },
+ {
+ "epoch": 2.5470449172576832,
+ "grad_norm": 3.9843504428863525,
+ "learning_rate": 3.1222080881649707e-06,
+ "loss": 0.4472,
+ "step": 5387
+ },
+ {
+ "epoch": 2.547517730496454,
+ "grad_norm": 2.5480258464813232,
+ "learning_rate": 3.1216038714769694e-06,
+ "loss": 0.4396,
+ "step": 5388
+ },
+ {
+ "epoch": 2.5479905437352244,
+ "grad_norm": 2.7461917400360107,
+ "learning_rate": 3.12099961608442e-06,
+ "loss": 0.4735,
+ "step": 5389
+ },
+ {
+ "epoch": 2.548463356973995,
+ "grad_norm": 3.167769193649292,
+ "learning_rate": 3.1203953220249493e-06,
+ "loss": 0.4196,
+ "step": 5390
+ },
+ {
+ "epoch": 2.548936170212766,
+ "grad_norm": 2.721696615219116,
+ "learning_rate": 3.1197909893361814e-06,
+ "loss": 0.4571,
+ "step": 5391
+ },
+ {
+ "epoch": 2.5494089834515368,
+ "grad_norm": 2.726668119430542,
+ "learning_rate": 3.1191866180557463e-06,
+ "loss": 0.4856,
+ "step": 5392
+ },
+ {
+ "epoch": 2.5498817966903076,
+ "grad_norm": 2.602205276489258,
+ "learning_rate": 3.1185822082212754e-06,
+ "loss": 0.4631,
+ "step": 5393
+ },
+ {
+ "epoch": 2.550354609929078,
+ "grad_norm": 2.7715859413146973,
+ "learning_rate": 3.1179777598704025e-06,
+ "loss": 0.4136,
+ "step": 5394
+ },
+ {
+ "epoch": 2.5508274231678487,
+ "grad_norm": 2.8081955909729004,
+ "learning_rate": 3.1173732730407647e-06,
+ "loss": 0.4963,
+ "step": 5395
+ },
+ {
+ "epoch": 2.5513002364066195,
+ "grad_norm": 2.946772336959839,
+ "learning_rate": 3.1167687477700006e-06,
+ "loss": 0.4443,
+ "step": 5396
+ },
+ {
+ "epoch": 2.55177304964539,
+ "grad_norm": 2.89345383644104,
+ "learning_rate": 3.1161641840957503e-06,
+ "loss": 0.4377,
+ "step": 5397
+ },
+ {
+ "epoch": 2.5522458628841607,
+ "grad_norm": 2.908317804336548,
+ "learning_rate": 3.115559582055659e-06,
+ "loss": 0.4702,
+ "step": 5398
+ },
+ {
+ "epoch": 2.5527186761229315,
+ "grad_norm": 2.554417848587036,
+ "learning_rate": 3.1149549416873704e-06,
+ "loss": 0.3738,
+ "step": 5399
+ },
+ {
+ "epoch": 2.5531914893617023,
+ "grad_norm": 2.3132457733154297,
+ "learning_rate": 3.1143502630285356e-06,
+ "loss": 0.4074,
+ "step": 5400
+ },
+ {
+ "epoch": 2.553664302600473,
+ "grad_norm": 2.751666784286499,
+ "learning_rate": 3.1137455461168026e-06,
+ "loss": 0.4697,
+ "step": 5401
+ },
+ {
+ "epoch": 2.5541371158392434,
+ "grad_norm": 2.7088871002197266,
+ "learning_rate": 3.113140790989826e-06,
+ "loss": 0.4754,
+ "step": 5402
+ },
+ {
+ "epoch": 2.554609929078014,
+ "grad_norm": 3.0633046627044678,
+ "learning_rate": 3.1125359976852605e-06,
+ "loss": 0.4874,
+ "step": 5403
+ },
+ {
+ "epoch": 2.555082742316785,
+ "grad_norm": 3.399456024169922,
+ "learning_rate": 3.111931166240764e-06,
+ "loss": 0.5529,
+ "step": 5404
+ },
+ {
+ "epoch": 2.5555555555555554,
+ "grad_norm": 2.7729690074920654,
+ "learning_rate": 3.1113262966939985e-06,
+ "loss": 0.4677,
+ "step": 5405
+ },
+ {
+ "epoch": 2.556028368794326,
+ "grad_norm": 2.81025767326355,
+ "learning_rate": 3.1107213890826244e-06,
+ "loss": 0.4954,
+ "step": 5406
+ },
+ {
+ "epoch": 2.556501182033097,
+ "grad_norm": 2.4837241172790527,
+ "learning_rate": 3.110116443444307e-06,
+ "loss": 0.3681,
+ "step": 5407
+ },
+ {
+ "epoch": 2.5569739952718678,
+ "grad_norm": 2.6406874656677246,
+ "learning_rate": 3.109511459816714e-06,
+ "loss": 0.4569,
+ "step": 5408
+ },
+ {
+ "epoch": 2.5574468085106385,
+ "grad_norm": 2.6093738079071045,
+ "learning_rate": 3.1089064382375155e-06,
+ "loss": 0.413,
+ "step": 5409
+ },
+ {
+ "epoch": 2.557919621749409,
+ "grad_norm": 2.6629011631011963,
+ "learning_rate": 3.108301378744383e-06,
+ "loss": 0.4286,
+ "step": 5410
+ },
+ {
+ "epoch": 2.5583924349881797,
+ "grad_norm": 2.694796323776245,
+ "learning_rate": 3.10769628137499e-06,
+ "loss": 0.4316,
+ "step": 5411
+ },
+ {
+ "epoch": 2.5588652482269505,
+ "grad_norm": 2.88023042678833,
+ "learning_rate": 3.107091146167015e-06,
+ "loss": 0.4378,
+ "step": 5412
+ },
+ {
+ "epoch": 2.559338061465721,
+ "grad_norm": 2.8804919719696045,
+ "learning_rate": 3.1064859731581365e-06,
+ "loss": 0.4971,
+ "step": 5413
+ },
+ {
+ "epoch": 2.5598108747044916,
+ "grad_norm": 2.850468397140503,
+ "learning_rate": 3.1058807623860353e-06,
+ "loss": 0.4686,
+ "step": 5414
+ },
+ {
+ "epoch": 2.5602836879432624,
+ "grad_norm": 3.0548019409179688,
+ "learning_rate": 3.1052755138883963e-06,
+ "loss": 0.4497,
+ "step": 5415
+ },
+ {
+ "epoch": 2.5607565011820332,
+ "grad_norm": 3.10168719291687,
+ "learning_rate": 3.1046702277029046e-06,
+ "loss": 0.569,
+ "step": 5416
+ },
+ {
+ "epoch": 2.561229314420804,
+ "grad_norm": 2.5887374877929688,
+ "learning_rate": 3.1040649038672494e-06,
+ "loss": 0.3812,
+ "step": 5417
+ },
+ {
+ "epoch": 2.5617021276595744,
+ "grad_norm": 2.9928438663482666,
+ "learning_rate": 3.1034595424191212e-06,
+ "loss": 0.4308,
+ "step": 5418
+ },
+ {
+ "epoch": 2.562174940898345,
+ "grad_norm": 2.7003073692321777,
+ "learning_rate": 3.102854143396214e-06,
+ "loss": 0.4967,
+ "step": 5419
+ },
+ {
+ "epoch": 2.562647754137116,
+ "grad_norm": 3.172868490219116,
+ "learning_rate": 3.102248706836222e-06,
+ "loss": 0.5311,
+ "step": 5420
+ },
+ {
+ "epoch": 2.5631205673758863,
+ "grad_norm": 3.0146191120147705,
+ "learning_rate": 3.101643232776844e-06,
+ "loss": 0.4714,
+ "step": 5421
+ },
+ {
+ "epoch": 2.563593380614657,
+ "grad_norm": 3.0683791637420654,
+ "learning_rate": 3.1010377212557806e-06,
+ "loss": 0.4047,
+ "step": 5422
+ },
+ {
+ "epoch": 2.564066193853428,
+ "grad_norm": 2.8260676860809326,
+ "learning_rate": 3.1004321723107334e-06,
+ "loss": 0.5282,
+ "step": 5423
+ },
+ {
+ "epoch": 2.5645390070921987,
+ "grad_norm": 3.0792388916015625,
+ "learning_rate": 3.0998265859794074e-06,
+ "loss": 0.5323,
+ "step": 5424
+ },
+ {
+ "epoch": 2.5650118203309695,
+ "grad_norm": 2.7332866191864014,
+ "learning_rate": 3.09922096229951e-06,
+ "loss": 0.4401,
+ "step": 5425
+ },
+ {
+ "epoch": 2.56548463356974,
+ "grad_norm": 2.9366047382354736,
+ "learning_rate": 3.098615301308751e-06,
+ "loss": 0.4495,
+ "step": 5426
+ },
+ {
+ "epoch": 2.5659574468085107,
+ "grad_norm": 2.982088565826416,
+ "learning_rate": 3.098009603044842e-06,
+ "loss": 0.495,
+ "step": 5427
+ },
+ {
+ "epoch": 2.5664302600472815,
+ "grad_norm": 3.1204755306243896,
+ "learning_rate": 3.0974038675454976e-06,
+ "loss": 0.4354,
+ "step": 5428
+ },
+ {
+ "epoch": 2.566903073286052,
+ "grad_norm": 2.835238218307495,
+ "learning_rate": 3.0967980948484333e-06,
+ "loss": 0.4161,
+ "step": 5429
+ },
+ {
+ "epoch": 2.5673758865248226,
+ "grad_norm": 2.8104958534240723,
+ "learning_rate": 3.096192284991369e-06,
+ "loss": 0.5045,
+ "step": 5430
+ },
+ {
+ "epoch": 2.5678486997635934,
+ "grad_norm": 3.1636080741882324,
+ "learning_rate": 3.0955864380120247e-06,
+ "loss": 0.4533,
+ "step": 5431
+ },
+ {
+ "epoch": 2.568321513002364,
+ "grad_norm": 2.980112314224243,
+ "learning_rate": 3.0949805539481247e-06,
+ "loss": 0.3998,
+ "step": 5432
+ },
+ {
+ "epoch": 2.568794326241135,
+ "grad_norm": 2.6379945278167725,
+ "learning_rate": 3.0943746328373953e-06,
+ "loss": 0.3785,
+ "step": 5433
+ },
+ {
+ "epoch": 2.5692671394799054,
+ "grad_norm": 2.780930757522583,
+ "learning_rate": 3.0937686747175627e-06,
+ "loss": 0.4801,
+ "step": 5434
+ },
+ {
+ "epoch": 2.569739952718676,
+ "grad_norm": 2.6608550548553467,
+ "learning_rate": 3.0931626796263585e-06,
+ "loss": 0.4047,
+ "step": 5435
+ },
+ {
+ "epoch": 2.570212765957447,
+ "grad_norm": 3.130584716796875,
+ "learning_rate": 3.0925566476015156e-06,
+ "loss": 0.5049,
+ "step": 5436
+ },
+ {
+ "epoch": 2.5706855791962173,
+ "grad_norm": 2.9699313640594482,
+ "learning_rate": 3.0919505786807687e-06,
+ "loss": 0.3847,
+ "step": 5437
+ },
+ {
+ "epoch": 2.571158392434988,
+ "grad_norm": 2.919260025024414,
+ "learning_rate": 3.091344472901855e-06,
+ "loss": 0.4631,
+ "step": 5438
+ },
+ {
+ "epoch": 2.571631205673759,
+ "grad_norm": 2.956587553024292,
+ "learning_rate": 3.0907383303025134e-06,
+ "loss": 0.4974,
+ "step": 5439
+ },
+ {
+ "epoch": 2.5721040189125297,
+ "grad_norm": 2.758542776107788,
+ "learning_rate": 3.090132150920486e-06,
+ "loss": 0.4785,
+ "step": 5440
+ },
+ {
+ "epoch": 2.5725768321513005,
+ "grad_norm": 2.678469657897949,
+ "learning_rate": 3.0895259347935175e-06,
+ "loss": 0.4453,
+ "step": 5441
+ },
+ {
+ "epoch": 2.573049645390071,
+ "grad_norm": 2.6508545875549316,
+ "learning_rate": 3.088919681959355e-06,
+ "loss": 0.4426,
+ "step": 5442
+ },
+ {
+ "epoch": 2.5735224586288417,
+ "grad_norm": 2.6156187057495117,
+ "learning_rate": 3.0883133924557453e-06,
+ "loss": 0.4445,
+ "step": 5443
+ },
+ {
+ "epoch": 2.5739952718676125,
+ "grad_norm": 2.484374761581421,
+ "learning_rate": 3.08770706632044e-06,
+ "loss": 0.4155,
+ "step": 5444
+ },
+ {
+ "epoch": 2.574468085106383,
+ "grad_norm": 2.7465295791625977,
+ "learning_rate": 3.087100703591193e-06,
+ "loss": 0.4085,
+ "step": 5445
+ },
+ {
+ "epoch": 2.5749408983451536,
+ "grad_norm": 2.771740198135376,
+ "learning_rate": 3.08649430430576e-06,
+ "loss": 0.4313,
+ "step": 5446
+ },
+ {
+ "epoch": 2.5754137115839244,
+ "grad_norm": 2.7480874061584473,
+ "learning_rate": 3.0858878685018984e-06,
+ "loss": 0.3471,
+ "step": 5447
+ },
+ {
+ "epoch": 2.575886524822695,
+ "grad_norm": 2.894913673400879,
+ "learning_rate": 3.085281396217368e-06,
+ "loss": 0.4888,
+ "step": 5448
+ },
+ {
+ "epoch": 2.576359338061466,
+ "grad_norm": 3.037628173828125,
+ "learning_rate": 3.0846748874899306e-06,
+ "loss": 0.3976,
+ "step": 5449
+ },
+ {
+ "epoch": 2.5768321513002364,
+ "grad_norm": 2.4811434745788574,
+ "learning_rate": 3.0840683423573526e-06,
+ "loss": 0.4822,
+ "step": 5450
+ },
+ {
+ "epoch": 2.577304964539007,
+ "grad_norm": 3.0078725814819336,
+ "learning_rate": 3.0834617608573998e-06,
+ "loss": 0.4999,
+ "step": 5451
+ },
+ {
+ "epoch": 2.5777777777777775,
+ "grad_norm": 3.174154043197632,
+ "learning_rate": 3.0828551430278413e-06,
+ "loss": 0.4626,
+ "step": 5452
+ },
+ {
+ "epoch": 2.5782505910165483,
+ "grad_norm": 2.8277535438537598,
+ "learning_rate": 3.082248488906449e-06,
+ "loss": 0.4633,
+ "step": 5453
+ },
+ {
+ "epoch": 2.578723404255319,
+ "grad_norm": 2.731767416000366,
+ "learning_rate": 3.0816417985309966e-06,
+ "loss": 0.4148,
+ "step": 5454
+ },
+ {
+ "epoch": 2.57919621749409,
+ "grad_norm": 2.5480549335479736,
+ "learning_rate": 3.0810350719392597e-06,
+ "loss": 0.4773,
+ "step": 5455
+ },
+ {
+ "epoch": 2.5796690307328607,
+ "grad_norm": 2.9755172729492188,
+ "learning_rate": 3.080428309169017e-06,
+ "loss": 0.5107,
+ "step": 5456
+ },
+ {
+ "epoch": 2.580141843971631,
+ "grad_norm": 2.6499290466308594,
+ "learning_rate": 3.079821510258048e-06,
+ "loss": 0.3982,
+ "step": 5457
+ },
+ {
+ "epoch": 2.580614657210402,
+ "grad_norm": 2.663214921951294,
+ "learning_rate": 3.079214675244136e-06,
+ "loss": 0.4419,
+ "step": 5458
+ },
+ {
+ "epoch": 2.5810874704491726,
+ "grad_norm": 2.595489263534546,
+ "learning_rate": 3.078607804165066e-06,
+ "loss": 0.3958,
+ "step": 5459
+ },
+ {
+ "epoch": 2.581560283687943,
+ "grad_norm": 3.031458854675293,
+ "learning_rate": 3.0780008970586255e-06,
+ "loss": 0.518,
+ "step": 5460
+ },
+ {
+ "epoch": 2.582033096926714,
+ "grad_norm": 2.827071189880371,
+ "learning_rate": 3.077393953962603e-06,
+ "loss": 0.4397,
+ "step": 5461
+ },
+ {
+ "epoch": 2.5825059101654846,
+ "grad_norm": 2.656111240386963,
+ "learning_rate": 3.0767869749147917e-06,
+ "loss": 0.4912,
+ "step": 5462
+ },
+ {
+ "epoch": 2.5829787234042554,
+ "grad_norm": 2.545365333557129,
+ "learning_rate": 3.076179959952984e-06,
+ "loss": 0.3991,
+ "step": 5463
+ },
+ {
+ "epoch": 2.583451536643026,
+ "grad_norm": 2.5794365406036377,
+ "learning_rate": 3.075572909114977e-06,
+ "loss": 0.4499,
+ "step": 5464
+ },
+ {
+ "epoch": 2.5839243498817965,
+ "grad_norm": 2.787140369415283,
+ "learning_rate": 3.074965822438568e-06,
+ "loss": 0.386,
+ "step": 5465
+ },
+ {
+ "epoch": 2.5843971631205673,
+ "grad_norm": 2.6406853199005127,
+ "learning_rate": 3.0743586999615594e-06,
+ "loss": 0.4853,
+ "step": 5466
+ },
+ {
+ "epoch": 2.584869976359338,
+ "grad_norm": 2.8082082271575928,
+ "learning_rate": 3.073751541721752e-06,
+ "loss": 0.4669,
+ "step": 5467
+ },
+ {
+ "epoch": 2.5853427895981085,
+ "grad_norm": 2.8808975219726562,
+ "learning_rate": 3.073144347756952e-06,
+ "loss": 0.4193,
+ "step": 5468
+ },
+ {
+ "epoch": 2.5858156028368793,
+ "grad_norm": 2.823352813720703,
+ "learning_rate": 3.072537118104968e-06,
+ "loss": 0.482,
+ "step": 5469
+ },
+ {
+ "epoch": 2.58628841607565,
+ "grad_norm": 2.6454555988311768,
+ "learning_rate": 3.0719298528036073e-06,
+ "loss": 0.4667,
+ "step": 5470
+ },
+ {
+ "epoch": 2.586761229314421,
+ "grad_norm": 2.871145486831665,
+ "learning_rate": 3.0713225518906826e-06,
+ "loss": 0.5125,
+ "step": 5471
+ },
+ {
+ "epoch": 2.5872340425531917,
+ "grad_norm": 3.1301417350769043,
+ "learning_rate": 3.070715215404007e-06,
+ "loss": 0.4827,
+ "step": 5472
+ },
+ {
+ "epoch": 2.587706855791962,
+ "grad_norm": 2.31062912940979,
+ "learning_rate": 3.070107843381398e-06,
+ "loss": 0.3954,
+ "step": 5473
+ },
+ {
+ "epoch": 2.588179669030733,
+ "grad_norm": 2.8366353511810303,
+ "learning_rate": 3.069500435860674e-06,
+ "loss": 0.4597,
+ "step": 5474
+ },
+ {
+ "epoch": 2.5886524822695036,
+ "grad_norm": 2.900143623352051,
+ "learning_rate": 3.068892992879654e-06,
+ "loss": 0.4294,
+ "step": 5475
+ },
+ {
+ "epoch": 2.589125295508274,
+ "grad_norm": 2.923313617706299,
+ "learning_rate": 3.0682855144761626e-06,
+ "loss": 0.505,
+ "step": 5476
+ },
+ {
+ "epoch": 2.5895981087470448,
+ "grad_norm": 2.726475954055786,
+ "learning_rate": 3.0676780006880242e-06,
+ "loss": 0.4208,
+ "step": 5477
+ },
+ {
+ "epoch": 2.5900709219858156,
+ "grad_norm": 4.115052223205566,
+ "learning_rate": 3.0670704515530654e-06,
+ "loss": 0.466,
+ "step": 5478
+ },
+ {
+ "epoch": 2.5905437352245864,
+ "grad_norm": 2.6018717288970947,
+ "learning_rate": 3.0664628671091163e-06,
+ "loss": 0.4697,
+ "step": 5479
+ },
+ {
+ "epoch": 2.591016548463357,
+ "grad_norm": 2.7393722534179688,
+ "learning_rate": 3.0658552473940085e-06,
+ "loss": 0.4618,
+ "step": 5480
+ },
+ {
+ "epoch": 2.5914893617021275,
+ "grad_norm": 2.8406929969787598,
+ "learning_rate": 3.065247592445575e-06,
+ "loss": 0.4806,
+ "step": 5481
+ },
+ {
+ "epoch": 2.5919621749408983,
+ "grad_norm": 2.9773001670837402,
+ "learning_rate": 3.0646399023016525e-06,
+ "loss": 0.4764,
+ "step": 5482
+ },
+ {
+ "epoch": 2.592434988179669,
+ "grad_norm": 3.374643325805664,
+ "learning_rate": 3.0640321770000804e-06,
+ "loss": 0.4481,
+ "step": 5483
+ },
+ {
+ "epoch": 2.5929078014184395,
+ "grad_norm": 2.5742013454437256,
+ "learning_rate": 3.0634244165786965e-06,
+ "loss": 0.432,
+ "step": 5484
+ },
+ {
+ "epoch": 2.5933806146572103,
+ "grad_norm": 2.9390289783477783,
+ "learning_rate": 3.062816621075346e-06,
+ "loss": 0.3941,
+ "step": 5485
+ },
+ {
+ "epoch": 2.593853427895981,
+ "grad_norm": 2.683414936065674,
+ "learning_rate": 3.062208790527871e-06,
+ "loss": 0.4268,
+ "step": 5486
+ },
+ {
+ "epoch": 2.594326241134752,
+ "grad_norm": 2.689647674560547,
+ "learning_rate": 3.06160092497412e-06,
+ "loss": 0.4569,
+ "step": 5487
+ },
+ {
+ "epoch": 2.5947990543735227,
+ "grad_norm": 3.1170310974121094,
+ "learning_rate": 3.060993024451943e-06,
+ "loss": 0.4387,
+ "step": 5488
+ },
+ {
+ "epoch": 2.595271867612293,
+ "grad_norm": 2.8732447624206543,
+ "learning_rate": 3.0603850889991894e-06,
+ "loss": 0.451,
+ "step": 5489
+ },
+ {
+ "epoch": 2.595744680851064,
+ "grad_norm": 3.0444157123565674,
+ "learning_rate": 3.0597771186537135e-06,
+ "loss": 0.4691,
+ "step": 5490
+ },
+ {
+ "epoch": 2.5962174940898346,
+ "grad_norm": 2.3791720867156982,
+ "learning_rate": 3.0591691134533714e-06,
+ "loss": 0.4771,
+ "step": 5491
+ },
+ {
+ "epoch": 2.596690307328605,
+ "grad_norm": 3.0677225589752197,
+ "learning_rate": 3.05856107343602e-06,
+ "loss": 0.459,
+ "step": 5492
+ },
+ {
+ "epoch": 2.5971631205673757,
+ "grad_norm": 3.1702635288238525,
+ "learning_rate": 3.05795299863952e-06,
+ "loss": 0.4816,
+ "step": 5493
+ },
+ {
+ "epoch": 2.5976359338061465,
+ "grad_norm": 2.964869499206543,
+ "learning_rate": 3.057344889101734e-06,
+ "loss": 0.4369,
+ "step": 5494
+ },
+ {
+ "epoch": 2.5981087470449173,
+ "grad_norm": 3.1333882808685303,
+ "learning_rate": 3.056736744860525e-06,
+ "loss": 0.4178,
+ "step": 5495
+ },
+ {
+ "epoch": 2.598581560283688,
+ "grad_norm": 2.4340405464172363,
+ "learning_rate": 3.05612856595376e-06,
+ "loss": 0.4359,
+ "step": 5496
+ },
+ {
+ "epoch": 2.5990543735224585,
+ "grad_norm": 2.638620615005493,
+ "learning_rate": 3.0555203524193083e-06,
+ "loss": 0.3915,
+ "step": 5497
+ },
+ {
+ "epoch": 2.5995271867612293,
+ "grad_norm": 2.8218815326690674,
+ "learning_rate": 3.054912104295039e-06,
+ "loss": 0.4684,
+ "step": 5498
+ },
+ {
+ "epoch": 2.6,
+ "grad_norm": 2.6696009635925293,
+ "learning_rate": 3.054303821618827e-06,
+ "loss": 0.4073,
+ "step": 5499
+ },
+ {
+ "epoch": 2.6004728132387704,
+ "grad_norm": 2.3880512714385986,
+ "learning_rate": 3.0536955044285465e-06,
+ "loss": 0.3576,
+ "step": 5500
+ },
+ {
+ "epoch": 2.6009456264775412,
+ "grad_norm": 2.762890100479126,
+ "learning_rate": 3.053087152762075e-06,
+ "loss": 0.3857,
+ "step": 5501
+ },
+ {
+ "epoch": 2.601418439716312,
+ "grad_norm": 2.729033946990967,
+ "learning_rate": 3.052478766657292e-06,
+ "loss": 0.3935,
+ "step": 5502
+ },
+ {
+ "epoch": 2.601891252955083,
+ "grad_norm": 2.630490303039551,
+ "learning_rate": 3.051870346152078e-06,
+ "loss": 0.3932,
+ "step": 5503
+ },
+ {
+ "epoch": 2.6023640661938536,
+ "grad_norm": 3.0335981845855713,
+ "learning_rate": 3.051261891284318e-06,
+ "loss": 0.4313,
+ "step": 5504
+ },
+ {
+ "epoch": 2.602836879432624,
+ "grad_norm": 2.969888687133789,
+ "learning_rate": 3.0506534020918963e-06,
+ "loss": 0.4698,
+ "step": 5505
+ },
+ {
+ "epoch": 2.603309692671395,
+ "grad_norm": 3.093996524810791,
+ "learning_rate": 3.050044878612703e-06,
+ "loss": 0.5338,
+ "step": 5506
+ },
+ {
+ "epoch": 2.6037825059101656,
+ "grad_norm": 2.759993314743042,
+ "learning_rate": 3.049436320884626e-06,
+ "loss": 0.4429,
+ "step": 5507
+ },
+ {
+ "epoch": 2.604255319148936,
+ "grad_norm": 2.979422092437744,
+ "learning_rate": 3.0488277289455587e-06,
+ "loss": 0.4489,
+ "step": 5508
+ },
+ {
+ "epoch": 2.6047281323877067,
+ "grad_norm": 2.8266701698303223,
+ "learning_rate": 3.048219102833396e-06,
+ "loss": 0.489,
+ "step": 5509
+ },
+ {
+ "epoch": 2.6052009456264775,
+ "grad_norm": 2.2582461833953857,
+ "learning_rate": 3.047610442586033e-06,
+ "loss": 0.3759,
+ "step": 5510
+ },
+ {
+ "epoch": 2.6056737588652483,
+ "grad_norm": 3.078152894973755,
+ "learning_rate": 3.0470017482413694e-06,
+ "loss": 0.5059,
+ "step": 5511
+ },
+ {
+ "epoch": 2.606146572104019,
+ "grad_norm": 2.7895498275756836,
+ "learning_rate": 3.0463930198373047e-06,
+ "loss": 0.4752,
+ "step": 5512
+ },
+ {
+ "epoch": 2.6066193853427895,
+ "grad_norm": 3.2307958602905273,
+ "learning_rate": 3.045784257411743e-06,
+ "loss": 0.4847,
+ "step": 5513
+ },
+ {
+ "epoch": 2.6070921985815603,
+ "grad_norm": 2.793661594390869,
+ "learning_rate": 3.0451754610025884e-06,
+ "loss": 0.4492,
+ "step": 5514
+ },
+ {
+ "epoch": 2.607565011820331,
+ "grad_norm": 2.4443132877349854,
+ "learning_rate": 3.0445666306477484e-06,
+ "loss": 0.4174,
+ "step": 5515
+ },
+ {
+ "epoch": 2.6080378250591014,
+ "grad_norm": 2.628769636154175,
+ "learning_rate": 3.0439577663851326e-06,
+ "loss": 0.3889,
+ "step": 5516
+ },
+ {
+ "epoch": 2.608510638297872,
+ "grad_norm": 2.9367563724517822,
+ "learning_rate": 3.0433488682526525e-06,
+ "loss": 0.437,
+ "step": 5517
+ },
+ {
+ "epoch": 2.608983451536643,
+ "grad_norm": 3.171353340148926,
+ "learning_rate": 3.04273993628822e-06,
+ "loss": 0.47,
+ "step": 5518
+ },
+ {
+ "epoch": 2.609456264775414,
+ "grad_norm": 2.856576442718506,
+ "learning_rate": 3.0421309705297513e-06,
+ "loss": 0.4797,
+ "step": 5519
+ },
+ {
+ "epoch": 2.6099290780141846,
+ "grad_norm": 2.4926068782806396,
+ "learning_rate": 3.041521971015165e-06,
+ "loss": 0.4294,
+ "step": 5520
+ },
+ {
+ "epoch": 2.610401891252955,
+ "grad_norm": 2.7897613048553467,
+ "learning_rate": 3.040912937782379e-06,
+ "loss": 0.4388,
+ "step": 5521
+ },
+ {
+ "epoch": 2.6108747044917258,
+ "grad_norm": 3.588188886642456,
+ "learning_rate": 3.0403038708693173e-06,
+ "loss": 0.4027,
+ "step": 5522
+ },
+ {
+ "epoch": 2.6113475177304966,
+ "grad_norm": 3.5394980907440186,
+ "learning_rate": 3.0396947703139017e-06,
+ "loss": 0.4866,
+ "step": 5523
+ },
+ {
+ "epoch": 2.611820330969267,
+ "grad_norm": 3.086865186691284,
+ "learning_rate": 3.03908563615406e-06,
+ "loss": 0.4344,
+ "step": 5524
+ },
+ {
+ "epoch": 2.6122931442080377,
+ "grad_norm": 2.649564504623413,
+ "learning_rate": 3.0384764684277194e-06,
+ "loss": 0.4571,
+ "step": 5525
+ },
+ {
+ "epoch": 2.6127659574468085,
+ "grad_norm": 2.945234775543213,
+ "learning_rate": 3.0378672671728105e-06,
+ "loss": 0.4885,
+ "step": 5526
+ },
+ {
+ "epoch": 2.6132387706855793,
+ "grad_norm": 2.625424861907959,
+ "learning_rate": 3.037258032427265e-06,
+ "loss": 0.4095,
+ "step": 5527
+ },
+ {
+ "epoch": 2.61371158392435,
+ "grad_norm": 2.7597248554229736,
+ "learning_rate": 3.0366487642290175e-06,
+ "loss": 0.4393,
+ "step": 5528
+ },
+ {
+ "epoch": 2.6141843971631205,
+ "grad_norm": 2.721189260482788,
+ "learning_rate": 3.0360394626160043e-06,
+ "loss": 0.3865,
+ "step": 5529
+ },
+ {
+ "epoch": 2.6146572104018913,
+ "grad_norm": 2.624056339263916,
+ "learning_rate": 3.0354301276261656e-06,
+ "loss": 0.4273,
+ "step": 5530
+ },
+ {
+ "epoch": 2.615130023640662,
+ "grad_norm": 2.7764177322387695,
+ "learning_rate": 3.034820759297439e-06,
+ "loss": 0.4756,
+ "step": 5531
+ },
+ {
+ "epoch": 2.6156028368794324,
+ "grad_norm": 3.0841729640960693,
+ "learning_rate": 3.0342113576677696e-06,
+ "loss": 0.4907,
+ "step": 5532
+ },
+ {
+ "epoch": 2.616075650118203,
+ "grad_norm": 2.678715705871582,
+ "learning_rate": 3.0336019227751017e-06,
+ "loss": 0.4478,
+ "step": 5533
+ },
+ {
+ "epoch": 2.616548463356974,
+ "grad_norm": 2.378679037094116,
+ "learning_rate": 3.032992454657382e-06,
+ "loss": 0.3678,
+ "step": 5534
+ },
+ {
+ "epoch": 2.617021276595745,
+ "grad_norm": 2.792079210281372,
+ "learning_rate": 3.0323829533525583e-06,
+ "loss": 0.4115,
+ "step": 5535
+ },
+ {
+ "epoch": 2.6174940898345156,
+ "grad_norm": 2.738133192062378,
+ "learning_rate": 3.0317734188985832e-06,
+ "loss": 0.4152,
+ "step": 5536
+ },
+ {
+ "epoch": 2.617966903073286,
+ "grad_norm": 2.6963796615600586,
+ "learning_rate": 3.0311638513334084e-06,
+ "loss": 0.4096,
+ "step": 5537
+ },
+ {
+ "epoch": 2.6184397163120567,
+ "grad_norm": 2.694145679473877,
+ "learning_rate": 3.03055425069499e-06,
+ "loss": 0.3793,
+ "step": 5538
+ },
+ {
+ "epoch": 2.6189125295508275,
+ "grad_norm": 2.762403964996338,
+ "learning_rate": 3.0299446170212855e-06,
+ "loss": 0.459,
+ "step": 5539
+ },
+ {
+ "epoch": 2.619385342789598,
+ "grad_norm": 2.804382562637329,
+ "learning_rate": 3.0293349503502522e-06,
+ "loss": 0.4853,
+ "step": 5540
+ },
+ {
+ "epoch": 2.6198581560283687,
+ "grad_norm": 2.7768518924713135,
+ "learning_rate": 3.0287252507198537e-06,
+ "loss": 0.4496,
+ "step": 5541
+ },
+ {
+ "epoch": 2.6203309692671395,
+ "grad_norm": 2.9075138568878174,
+ "learning_rate": 3.028115518168052e-06,
+ "loss": 0.4498,
+ "step": 5542
+ },
+ {
+ "epoch": 2.6208037825059103,
+ "grad_norm": 2.8966822624206543,
+ "learning_rate": 3.0275057527328126e-06,
+ "loss": 0.4434,
+ "step": 5543
+ },
+ {
+ "epoch": 2.621276595744681,
+ "grad_norm": 2.8140156269073486,
+ "learning_rate": 3.0268959544521027e-06,
+ "loss": 0.3935,
+ "step": 5544
+ },
+ {
+ "epoch": 2.6217494089834514,
+ "grad_norm": 2.8606276512145996,
+ "learning_rate": 3.0262861233638924e-06,
+ "loss": 0.4222,
+ "step": 5545
+ },
+ {
+ "epoch": 2.6222222222222222,
+ "grad_norm": 3.003610134124756,
+ "learning_rate": 3.0256762595061522e-06,
+ "loss": 0.428,
+ "step": 5546
+ },
+ {
+ "epoch": 2.622695035460993,
+ "grad_norm": 2.725907802581787,
+ "learning_rate": 3.025066362916857e-06,
+ "loss": 0.3975,
+ "step": 5547
+ },
+ {
+ "epoch": 2.6231678486997634,
+ "grad_norm": 2.5247902870178223,
+ "learning_rate": 3.024456433633982e-06,
+ "loss": 0.4584,
+ "step": 5548
+ },
+ {
+ "epoch": 2.623640661938534,
+ "grad_norm": 2.932798147201538,
+ "learning_rate": 3.0238464716955045e-06,
+ "loss": 0.4991,
+ "step": 5549
+ },
+ {
+ "epoch": 2.624113475177305,
+ "grad_norm": 2.693547010421753,
+ "learning_rate": 3.023236477139404e-06,
+ "loss": 0.4405,
+ "step": 5550
+ },
+ {
+ "epoch": 2.6245862884160758,
+ "grad_norm": 3.2600035667419434,
+ "learning_rate": 3.022626450003662e-06,
+ "loss": 0.4904,
+ "step": 5551
+ },
+ {
+ "epoch": 2.6250591016548466,
+ "grad_norm": 2.9471960067749023,
+ "learning_rate": 3.0220163903262627e-06,
+ "loss": 0.4487,
+ "step": 5552
+ },
+ {
+ "epoch": 2.625531914893617,
+ "grad_norm": 2.583944082260132,
+ "learning_rate": 3.0214062981451926e-06,
+ "loss": 0.3552,
+ "step": 5553
+ },
+ {
+ "epoch": 2.6260047281323877,
+ "grad_norm": 2.675062656402588,
+ "learning_rate": 3.0207961734984377e-06,
+ "loss": 0.4524,
+ "step": 5554
+ },
+ {
+ "epoch": 2.6264775413711585,
+ "grad_norm": 3.0126802921295166,
+ "learning_rate": 3.0201860164239887e-06,
+ "loss": 0.4124,
+ "step": 5555
+ },
+ {
+ "epoch": 2.626950354609929,
+ "grad_norm": 2.490734577178955,
+ "learning_rate": 3.019575826959838e-06,
+ "loss": 0.4095,
+ "step": 5556
+ },
+ {
+ "epoch": 2.6274231678486997,
+ "grad_norm": 2.72817063331604,
+ "learning_rate": 3.018965605143978e-06,
+ "loss": 0.4298,
+ "step": 5557
+ },
+ {
+ "epoch": 2.6278959810874705,
+ "grad_norm": 3.1298327445983887,
+ "learning_rate": 3.0183553510144064e-06,
+ "loss": 0.4961,
+ "step": 5558
+ },
+ {
+ "epoch": 2.6283687943262413,
+ "grad_norm": 3.2379956245422363,
+ "learning_rate": 3.0177450646091195e-06,
+ "loss": 0.4943,
+ "step": 5559
+ },
+ {
+ "epoch": 2.628841607565012,
+ "grad_norm": 2.5040571689605713,
+ "learning_rate": 3.017134745966117e-06,
+ "loss": 0.3701,
+ "step": 5560
+ },
+ {
+ "epoch": 2.6293144208037824,
+ "grad_norm": 3.047184944152832,
+ "learning_rate": 3.0165243951234025e-06,
+ "loss": 0.4587,
+ "step": 5561
+ },
+ {
+ "epoch": 2.629787234042553,
+ "grad_norm": 2.4926774501800537,
+ "learning_rate": 3.0159140121189783e-06,
+ "loss": 0.3723,
+ "step": 5562
+ },
+ {
+ "epoch": 2.630260047281324,
+ "grad_norm": 2.5434961318969727,
+ "learning_rate": 3.015303596990851e-06,
+ "loss": 0.4176,
+ "step": 5563
+ },
+ {
+ "epoch": 2.6307328605200944,
+ "grad_norm": 2.5117976665496826,
+ "learning_rate": 3.0146931497770284e-06,
+ "loss": 0.4218,
+ "step": 5564
+ },
+ {
+ "epoch": 2.631205673758865,
+ "grad_norm": 2.9408798217773438,
+ "learning_rate": 3.0140826705155196e-06,
+ "loss": 0.4473,
+ "step": 5565
+ },
+ {
+ "epoch": 2.631678486997636,
+ "grad_norm": 2.996422052383423,
+ "learning_rate": 3.0134721592443385e-06,
+ "loss": 0.4513,
+ "step": 5566
+ },
+ {
+ "epoch": 2.6321513002364068,
+ "grad_norm": 2.984356164932251,
+ "learning_rate": 3.0128616160014955e-06,
+ "loss": 0.4749,
+ "step": 5567
+ },
+ {
+ "epoch": 2.6326241134751776,
+ "grad_norm": 2.6075069904327393,
+ "learning_rate": 3.0122510408250095e-06,
+ "loss": 0.4707,
+ "step": 5568
+ },
+ {
+ "epoch": 2.633096926713948,
+ "grad_norm": 2.9463071823120117,
+ "learning_rate": 3.0116404337528972e-06,
+ "loss": 0.5125,
+ "step": 5569
+ },
+ {
+ "epoch": 2.6335697399527187,
+ "grad_norm": 2.98574161529541,
+ "learning_rate": 3.0110297948231787e-06,
+ "loss": 0.4487,
+ "step": 5570
+ },
+ {
+ "epoch": 2.6340425531914895,
+ "grad_norm": 2.6039397716522217,
+ "learning_rate": 3.010419124073876e-06,
+ "loss": 0.4516,
+ "step": 5571
+ },
+ {
+ "epoch": 2.63451536643026,
+ "grad_norm": 2.8480236530303955,
+ "learning_rate": 3.0098084215430124e-06,
+ "loss": 0.4962,
+ "step": 5572
+ },
+ {
+ "epoch": 2.6349881796690307,
+ "grad_norm": 2.527597427368164,
+ "learning_rate": 3.0091976872686133e-06,
+ "loss": 0.435,
+ "step": 5573
+ },
+ {
+ "epoch": 2.6354609929078014,
+ "grad_norm": 2.898303508758545,
+ "learning_rate": 3.0085869212887076e-06,
+ "loss": 0.4473,
+ "step": 5574
+ },
+ {
+ "epoch": 2.6359338061465722,
+ "grad_norm": 2.981414318084717,
+ "learning_rate": 3.007976123641324e-06,
+ "loss": 0.4203,
+ "step": 5575
+ },
+ {
+ "epoch": 2.636406619385343,
+ "grad_norm": 3.219064474105835,
+ "learning_rate": 3.0073652943644947e-06,
+ "loss": 0.4596,
+ "step": 5576
+ },
+ {
+ "epoch": 2.6368794326241134,
+ "grad_norm": 2.7287049293518066,
+ "learning_rate": 3.0067544334962532e-06,
+ "loss": 0.433,
+ "step": 5577
+ },
+ {
+ "epoch": 2.637352245862884,
+ "grad_norm": 2.6232664585113525,
+ "learning_rate": 3.0061435410746352e-06,
+ "loss": 0.4254,
+ "step": 5578
+ },
+ {
+ "epoch": 2.637825059101655,
+ "grad_norm": 2.908311605453491,
+ "learning_rate": 3.0055326171376788e-06,
+ "loss": 0.4349,
+ "step": 5579
+ },
+ {
+ "epoch": 2.6382978723404253,
+ "grad_norm": 2.8369064331054688,
+ "learning_rate": 3.0049216617234224e-06,
+ "loss": 0.4675,
+ "step": 5580
+ },
+ {
+ "epoch": 2.638770685579196,
+ "grad_norm": 2.659499406814575,
+ "learning_rate": 3.0043106748699085e-06,
+ "loss": 0.4073,
+ "step": 5581
+ },
+ {
+ "epoch": 2.639243498817967,
+ "grad_norm": 2.579765558242798,
+ "learning_rate": 3.00369965661518e-06,
+ "loss": 0.4536,
+ "step": 5582
+ },
+ {
+ "epoch": 2.6397163120567377,
+ "grad_norm": 3.572861909866333,
+ "learning_rate": 3.0030886069972827e-06,
+ "loss": 0.5227,
+ "step": 5583
+ },
+ {
+ "epoch": 2.6401891252955085,
+ "grad_norm": 2.6523196697235107,
+ "learning_rate": 3.002477526054263e-06,
+ "loss": 0.3846,
+ "step": 5584
+ },
+ {
+ "epoch": 2.640661938534279,
+ "grad_norm": 3.072181463241577,
+ "learning_rate": 3.001866413824173e-06,
+ "loss": 0.5399,
+ "step": 5585
+ },
+ {
+ "epoch": 2.6411347517730497,
+ "grad_norm": 2.7304325103759766,
+ "learning_rate": 3.0012552703450597e-06,
+ "loss": 0.4048,
+ "step": 5586
+ },
+ {
+ "epoch": 2.6416075650118205,
+ "grad_norm": 3.039491891860962,
+ "learning_rate": 3.0006440956549798e-06,
+ "loss": 0.5035,
+ "step": 5587
+ },
+ {
+ "epoch": 2.642080378250591,
+ "grad_norm": 2.7623798847198486,
+ "learning_rate": 3.000032889791988e-06,
+ "loss": 0.4369,
+ "step": 5588
+ },
+ {
+ "epoch": 2.6425531914893616,
+ "grad_norm": 3.391052722930908,
+ "learning_rate": 2.9994216527941394e-06,
+ "loss": 0.5308,
+ "step": 5589
+ },
+ {
+ "epoch": 2.6430260047281324,
+ "grad_norm": 3.0263915061950684,
+ "learning_rate": 2.9988103846994954e-06,
+ "loss": 0.4319,
+ "step": 5590
+ },
+ {
+ "epoch": 2.6434988179669032,
+ "grad_norm": 2.786607027053833,
+ "learning_rate": 2.998199085546115e-06,
+ "loss": 0.4695,
+ "step": 5591
+ },
+ {
+ "epoch": 2.643971631205674,
+ "grad_norm": 2.884674310684204,
+ "learning_rate": 2.9975877553720627e-06,
+ "loss": 0.4615,
+ "step": 5592
+ },
+ {
+ "epoch": 2.6444444444444444,
+ "grad_norm": 2.6100499629974365,
+ "learning_rate": 2.996976394215402e-06,
+ "loss": 0.4784,
+ "step": 5593
+ },
+ {
+ "epoch": 2.644917257683215,
+ "grad_norm": 2.6978676319122314,
+ "learning_rate": 2.9963650021142018e-06,
+ "loss": 0.3911,
+ "step": 5594
+ },
+ {
+ "epoch": 2.645390070921986,
+ "grad_norm": 2.8080835342407227,
+ "learning_rate": 2.9957535791065284e-06,
+ "loss": 0.4997,
+ "step": 5595
+ },
+ {
+ "epoch": 2.6458628841607563,
+ "grad_norm": 2.6639578342437744,
+ "learning_rate": 2.9951421252304537e-06,
+ "loss": 0.4066,
+ "step": 5596
+ },
+ {
+ "epoch": 2.646335697399527,
+ "grad_norm": 3.102456569671631,
+ "learning_rate": 2.9945306405240505e-06,
+ "loss": 0.5554,
+ "step": 5597
+ },
+ {
+ "epoch": 2.646808510638298,
+ "grad_norm": 2.6524150371551514,
+ "learning_rate": 2.993919125025392e-06,
+ "loss": 0.3881,
+ "step": 5598
+ },
+ {
+ "epoch": 2.6472813238770687,
+ "grad_norm": 2.926316499710083,
+ "learning_rate": 2.993307578772556e-06,
+ "loss": 0.4845,
+ "step": 5599
+ },
+ {
+ "epoch": 2.6477541371158395,
+ "grad_norm": 3.346550703048706,
+ "learning_rate": 2.9926960018036195e-06,
+ "loss": 0.4481,
+ "step": 5600
+ },
+ {
+ "epoch": 2.64822695035461,
+ "grad_norm": 2.6211020946502686,
+ "learning_rate": 2.9920843941566634e-06,
+ "loss": 0.4355,
+ "step": 5601
+ },
+ {
+ "epoch": 2.6486997635933807,
+ "grad_norm": 2.7479333877563477,
+ "learning_rate": 2.99147275586977e-06,
+ "loss": 0.4373,
+ "step": 5602
+ },
+ {
+ "epoch": 2.6491725768321515,
+ "grad_norm": 2.523385524749756,
+ "learning_rate": 2.9908610869810235e-06,
+ "loss": 0.4467,
+ "step": 5603
+ },
+ {
+ "epoch": 2.649645390070922,
+ "grad_norm": 2.93886137008667,
+ "learning_rate": 2.9902493875285086e-06,
+ "loss": 0.4956,
+ "step": 5604
+ },
+ {
+ "epoch": 2.6501182033096926,
+ "grad_norm": 2.7630443572998047,
+ "learning_rate": 2.989637657550315e-06,
+ "loss": 0.5012,
+ "step": 5605
+ },
+ {
+ "epoch": 2.6505910165484634,
+ "grad_norm": 2.6733906269073486,
+ "learning_rate": 2.989025897084531e-06,
+ "loss": 0.446,
+ "step": 5606
+ },
+ {
+ "epoch": 2.651063829787234,
+ "grad_norm": 2.8411107063293457,
+ "learning_rate": 2.9884141061692484e-06,
+ "loss": 0.4817,
+ "step": 5607
+ },
+ {
+ "epoch": 2.651536643026005,
+ "grad_norm": 2.8667192459106445,
+ "learning_rate": 2.987802284842562e-06,
+ "loss": 0.3909,
+ "step": 5608
+ },
+ {
+ "epoch": 2.6520094562647754,
+ "grad_norm": 3.4640755653381348,
+ "learning_rate": 2.987190433142565e-06,
+ "loss": 0.4379,
+ "step": 5609
+ },
+ {
+ "epoch": 2.652482269503546,
+ "grad_norm": 2.675121307373047,
+ "learning_rate": 2.9865785511073565e-06,
+ "loss": 0.4833,
+ "step": 5610
+ },
+ {
+ "epoch": 2.652955082742317,
+ "grad_norm": 2.4375529289245605,
+ "learning_rate": 2.9859666387750353e-06,
+ "loss": 0.3949,
+ "step": 5611
+ },
+ {
+ "epoch": 2.6534278959810873,
+ "grad_norm": 2.7312581539154053,
+ "learning_rate": 2.9853546961837026e-06,
+ "loss": 0.4546,
+ "step": 5612
+ },
+ {
+ "epoch": 2.653900709219858,
+ "grad_norm": 2.7695999145507812,
+ "learning_rate": 2.9847427233714617e-06,
+ "loss": 0.4696,
+ "step": 5613
+ },
+ {
+ "epoch": 2.654373522458629,
+ "grad_norm": 2.6313109397888184,
+ "learning_rate": 2.984130720376416e-06,
+ "loss": 0.4733,
+ "step": 5614
+ },
+ {
+ "epoch": 2.6548463356973997,
+ "grad_norm": 2.656864881515503,
+ "learning_rate": 2.9835186872366733e-06,
+ "loss": 0.3806,
+ "step": 5615
+ },
+ {
+ "epoch": 2.65531914893617,
+ "grad_norm": 2.720075845718384,
+ "learning_rate": 2.982906623990342e-06,
+ "loss": 0.4041,
+ "step": 5616
+ },
+ {
+ "epoch": 2.655791962174941,
+ "grad_norm": 2.6684951782226562,
+ "learning_rate": 2.9822945306755334e-06,
+ "loss": 0.4552,
+ "step": 5617
+ },
+ {
+ "epoch": 2.6562647754137116,
+ "grad_norm": 2.567751884460449,
+ "learning_rate": 2.9816824073303585e-06,
+ "loss": 0.465,
+ "step": 5618
+ },
+ {
+ "epoch": 2.656737588652482,
+ "grad_norm": 2.7490367889404297,
+ "learning_rate": 2.981070253992933e-06,
+ "loss": 0.4647,
+ "step": 5619
+ },
+ {
+ "epoch": 2.657210401891253,
+ "grad_norm": 2.548656463623047,
+ "learning_rate": 2.9804580707013715e-06,
+ "loss": 0.4226,
+ "step": 5620
+ },
+ {
+ "epoch": 2.6576832151300236,
+ "grad_norm": 2.5484731197357178,
+ "learning_rate": 2.9798458574937927e-06,
+ "loss": 0.382,
+ "step": 5621
+ },
+ {
+ "epoch": 2.6581560283687944,
+ "grad_norm": 2.7293949127197266,
+ "learning_rate": 2.979233614408317e-06,
+ "loss": 0.4418,
+ "step": 5622
+ },
+ {
+ "epoch": 2.658628841607565,
+ "grad_norm": 2.645036458969116,
+ "learning_rate": 2.9786213414830646e-06,
+ "loss": 0.414,
+ "step": 5623
+ },
+ {
+ "epoch": 2.6591016548463355,
+ "grad_norm": 2.5287609100341797,
+ "learning_rate": 2.9780090387561604e-06,
+ "loss": 0.3914,
+ "step": 5624
+ },
+ {
+ "epoch": 2.6595744680851063,
+ "grad_norm": 2.5570411682128906,
+ "learning_rate": 2.9773967062657293e-06,
+ "loss": 0.4431,
+ "step": 5625
+ },
+ {
+ "epoch": 2.660047281323877,
+ "grad_norm": 2.681749105453491,
+ "learning_rate": 2.9767843440498983e-06,
+ "loss": 0.4245,
+ "step": 5626
+ },
+ {
+ "epoch": 2.6605200945626475,
+ "grad_norm": 2.8629777431488037,
+ "learning_rate": 2.976171952146798e-06,
+ "loss": 0.4643,
+ "step": 5627
+ },
+ {
+ "epoch": 2.6609929078014183,
+ "grad_norm": 2.577148199081421,
+ "learning_rate": 2.9755595305945573e-06,
+ "loss": 0.43,
+ "step": 5628
+ },
+ {
+ "epoch": 2.661465721040189,
+ "grad_norm": 2.747218370437622,
+ "learning_rate": 2.97494707943131e-06,
+ "loss": 0.5194,
+ "step": 5629
+ },
+ {
+ "epoch": 2.66193853427896,
+ "grad_norm": 2.535604953765869,
+ "learning_rate": 2.9743345986951904e-06,
+ "loss": 0.4401,
+ "step": 5630
+ },
+ {
+ "epoch": 2.6624113475177307,
+ "grad_norm": 3.3341166973114014,
+ "learning_rate": 2.973722088424336e-06,
+ "loss": 0.4925,
+ "step": 5631
+ },
+ {
+ "epoch": 2.662884160756501,
+ "grad_norm": 2.9264349937438965,
+ "learning_rate": 2.973109548656884e-06,
+ "loss": 0.4787,
+ "step": 5632
+ },
+ {
+ "epoch": 2.663356973995272,
+ "grad_norm": 2.7132506370544434,
+ "learning_rate": 2.9724969794309742e-06,
+ "loss": 0.4138,
+ "step": 5633
+ },
+ {
+ "epoch": 2.6638297872340426,
+ "grad_norm": 2.7970192432403564,
+ "learning_rate": 2.9718843807847497e-06,
+ "loss": 0.4896,
+ "step": 5634
+ },
+ {
+ "epoch": 2.664302600472813,
+ "grad_norm": 2.610208749771118,
+ "learning_rate": 2.9712717527563545e-06,
+ "loss": 0.3997,
+ "step": 5635
+ },
+ {
+ "epoch": 2.6647754137115838,
+ "grad_norm": 3.5483577251434326,
+ "learning_rate": 2.9706590953839335e-06,
+ "loss": 0.5109,
+ "step": 5636
+ },
+ {
+ "epoch": 2.6652482269503546,
+ "grad_norm": 2.746933698654175,
+ "learning_rate": 2.9700464087056345e-06,
+ "loss": 0.4672,
+ "step": 5637
+ },
+ {
+ "epoch": 2.6657210401891254,
+ "grad_norm": 2.704436779022217,
+ "learning_rate": 2.969433692759607e-06,
+ "loss": 0.4402,
+ "step": 5638
+ },
+ {
+ "epoch": 2.666193853427896,
+ "grad_norm": 2.859520196914673,
+ "learning_rate": 2.9688209475840005e-06,
+ "loss": 0.4679,
+ "step": 5639
+ },
+ {
+ "epoch": 2.6666666666666665,
+ "grad_norm": 2.518580436706543,
+ "learning_rate": 2.968208173216971e-06,
+ "loss": 0.3772,
+ "step": 5640
+ },
+ {
+ "epoch": 2.6671394799054373,
+ "grad_norm": 2.7624926567077637,
+ "learning_rate": 2.967595369696671e-06,
+ "loss": 0.4753,
+ "step": 5641
+ },
+ {
+ "epoch": 2.667612293144208,
+ "grad_norm": 2.654003620147705,
+ "learning_rate": 2.966982537061257e-06,
+ "loss": 0.4583,
+ "step": 5642
+ },
+ {
+ "epoch": 2.6680851063829785,
+ "grad_norm": 2.8473968505859375,
+ "learning_rate": 2.966369675348888e-06,
+ "loss": 0.4623,
+ "step": 5643
+ },
+ {
+ "epoch": 2.6685579196217493,
+ "grad_norm": 2.5587947368621826,
+ "learning_rate": 2.9657567845977253e-06,
+ "loss": 0.4014,
+ "step": 5644
+ },
+ {
+ "epoch": 2.66903073286052,
+ "grad_norm": 2.572220802307129,
+ "learning_rate": 2.96514386484593e-06,
+ "loss": 0.4249,
+ "step": 5645
+ },
+ {
+ "epoch": 2.669503546099291,
+ "grad_norm": 2.7995707988739014,
+ "learning_rate": 2.964530916131665e-06,
+ "loss": 0.4575,
+ "step": 5646
+ },
+ {
+ "epoch": 2.6699763593380617,
+ "grad_norm": 2.8712687492370605,
+ "learning_rate": 2.963917938493097e-06,
+ "loss": 0.4353,
+ "step": 5647
+ },
+ {
+ "epoch": 2.670449172576832,
+ "grad_norm": 2.856473207473755,
+ "learning_rate": 2.963304931968393e-06,
+ "loss": 0.4345,
+ "step": 5648
+ },
+ {
+ "epoch": 2.670921985815603,
+ "grad_norm": 2.709198474884033,
+ "learning_rate": 2.9626918965957224e-06,
+ "loss": 0.4116,
+ "step": 5649
+ },
+ {
+ "epoch": 2.6713947990543736,
+ "grad_norm": 2.8144607543945312,
+ "learning_rate": 2.962078832413257e-06,
+ "loss": 0.4575,
+ "step": 5650
+ },
+ {
+ "epoch": 2.671867612293144,
+ "grad_norm": 3.131911039352417,
+ "learning_rate": 2.961465739459168e-06,
+ "loss": 0.4743,
+ "step": 5651
+ },
+ {
+ "epoch": 2.6723404255319148,
+ "grad_norm": 2.8487515449523926,
+ "learning_rate": 2.9608526177716316e-06,
+ "loss": 0.4314,
+ "step": 5652
+ },
+ {
+ "epoch": 2.6728132387706856,
+ "grad_norm": 2.613229751586914,
+ "learning_rate": 2.960239467388823e-06,
+ "loss": 0.4807,
+ "step": 5653
+ },
+ {
+ "epoch": 2.6732860520094563,
+ "grad_norm": 2.5049116611480713,
+ "learning_rate": 2.9596262883489213e-06,
+ "loss": 0.4708,
+ "step": 5654
+ },
+ {
+ "epoch": 2.673758865248227,
+ "grad_norm": 2.6347460746765137,
+ "learning_rate": 2.9590130806901052e-06,
+ "loss": 0.3689,
+ "step": 5655
+ },
+ {
+ "epoch": 2.6742316784869975,
+ "grad_norm": 3.3290371894836426,
+ "learning_rate": 2.9583998444505578e-06,
+ "loss": 0.4674,
+ "step": 5656
+ },
+ {
+ "epoch": 2.6747044917257683,
+ "grad_norm": 2.748403549194336,
+ "learning_rate": 2.957786579668462e-06,
+ "loss": 0.3852,
+ "step": 5657
+ },
+ {
+ "epoch": 2.675177304964539,
+ "grad_norm": 2.837573766708374,
+ "learning_rate": 2.957173286382003e-06,
+ "loss": 0.4541,
+ "step": 5658
+ },
+ {
+ "epoch": 2.6756501182033094,
+ "grad_norm": 3.0976510047912598,
+ "learning_rate": 2.9565599646293686e-06,
+ "loss": 0.4669,
+ "step": 5659
+ },
+ {
+ "epoch": 2.6761229314420802,
+ "grad_norm": 2.7059597969055176,
+ "learning_rate": 2.955946614448747e-06,
+ "loss": 0.3935,
+ "step": 5660
+ },
+ {
+ "epoch": 2.676595744680851,
+ "grad_norm": 2.6700541973114014,
+ "learning_rate": 2.9553332358783294e-06,
+ "loss": 0.4322,
+ "step": 5661
+ },
+ {
+ "epoch": 2.677068557919622,
+ "grad_norm": 2.9782698154449463,
+ "learning_rate": 2.9547198289563068e-06,
+ "loss": 0.4338,
+ "step": 5662
+ },
+ {
+ "epoch": 2.6775413711583926,
+ "grad_norm": 2.637876510620117,
+ "learning_rate": 2.9541063937208755e-06,
+ "loss": 0.4289,
+ "step": 5663
+ },
+ {
+ "epoch": 2.678014184397163,
+ "grad_norm": 3.421949863433838,
+ "learning_rate": 2.953492930210229e-06,
+ "loss": 0.5458,
+ "step": 5664
+ },
+ {
+ "epoch": 2.678486997635934,
+ "grad_norm": 2.8273842334747314,
+ "learning_rate": 2.952879438462567e-06,
+ "loss": 0.4529,
+ "step": 5665
+ },
+ {
+ "epoch": 2.6789598108747046,
+ "grad_norm": 2.9090168476104736,
+ "learning_rate": 2.9522659185160873e-06,
+ "loss": 0.444,
+ "step": 5666
+ },
+ {
+ "epoch": 2.679432624113475,
+ "grad_norm": 2.646710157394409,
+ "learning_rate": 2.9516523704089927e-06,
+ "loss": 0.4226,
+ "step": 5667
+ },
+ {
+ "epoch": 2.6799054373522457,
+ "grad_norm": 2.65915584564209,
+ "learning_rate": 2.951038794179486e-06,
+ "loss": 0.4307,
+ "step": 5668
+ },
+ {
+ "epoch": 2.6803782505910165,
+ "grad_norm": 3.004507303237915,
+ "learning_rate": 2.950425189865771e-06,
+ "loss": 0.4799,
+ "step": 5669
+ },
+ {
+ "epoch": 2.6808510638297873,
+ "grad_norm": 2.5210134983062744,
+ "learning_rate": 2.949811557506054e-06,
+ "loss": 0.3842,
+ "step": 5670
+ },
+ {
+ "epoch": 2.681323877068558,
+ "grad_norm": 2.8072893619537354,
+ "learning_rate": 2.9491978971385436e-06,
+ "loss": 0.435,
+ "step": 5671
+ },
+ {
+ "epoch": 2.6817966903073285,
+ "grad_norm": 2.5701990127563477,
+ "learning_rate": 2.9485842088014498e-06,
+ "loss": 0.4932,
+ "step": 5672
+ },
+ {
+ "epoch": 2.6822695035460993,
+ "grad_norm": 2.9368457794189453,
+ "learning_rate": 2.9479704925329854e-06,
+ "loss": 0.455,
+ "step": 5673
+ },
+ {
+ "epoch": 2.68274231678487,
+ "grad_norm": 2.8576247692108154,
+ "learning_rate": 2.947356748371362e-06,
+ "loss": 0.4254,
+ "step": 5674
+ },
+ {
+ "epoch": 2.6832151300236404,
+ "grad_norm": 2.8999195098876953,
+ "learning_rate": 2.946742976354795e-06,
+ "loss": 0.4159,
+ "step": 5675
+ },
+ {
+ "epoch": 2.6836879432624112,
+ "grad_norm": 2.8439736366271973,
+ "learning_rate": 2.946129176521502e-06,
+ "loss": 0.4035,
+ "step": 5676
+ },
+ {
+ "epoch": 2.684160756501182,
+ "grad_norm": 2.8525729179382324,
+ "learning_rate": 2.945515348909702e-06,
+ "loss": 0.4137,
+ "step": 5677
+ },
+ {
+ "epoch": 2.684633569739953,
+ "grad_norm": 2.6573562622070312,
+ "learning_rate": 2.9449014935576147e-06,
+ "loss": 0.4203,
+ "step": 5678
+ },
+ {
+ "epoch": 2.6851063829787236,
+ "grad_norm": 2.765794277191162,
+ "learning_rate": 2.9442876105034616e-06,
+ "loss": 0.5184,
+ "step": 5679
+ },
+ {
+ "epoch": 2.685579196217494,
+ "grad_norm": 2.694617748260498,
+ "learning_rate": 2.943673699785467e-06,
+ "loss": 0.417,
+ "step": 5680
+ },
+ {
+ "epoch": 2.6860520094562648,
+ "grad_norm": 2.740774393081665,
+ "learning_rate": 2.943059761441857e-06,
+ "loss": 0.4431,
+ "step": 5681
+ },
+ {
+ "epoch": 2.6865248226950356,
+ "grad_norm": 2.670642614364624,
+ "learning_rate": 2.942445795510859e-06,
+ "loss": 0.4298,
+ "step": 5682
+ },
+ {
+ "epoch": 2.686997635933806,
+ "grad_norm": 2.838907241821289,
+ "learning_rate": 2.9418318020307e-06,
+ "loss": 0.4529,
+ "step": 5683
+ },
+ {
+ "epoch": 2.6874704491725767,
+ "grad_norm": 2.562317371368408,
+ "learning_rate": 2.9412177810396135e-06,
+ "loss": 0.4251,
+ "step": 5684
+ },
+ {
+ "epoch": 2.6879432624113475,
+ "grad_norm": 2.5805928707122803,
+ "learning_rate": 2.9406037325758298e-06,
+ "loss": 0.4405,
+ "step": 5685
+ },
+ {
+ "epoch": 2.6884160756501183,
+ "grad_norm": 2.5701205730438232,
+ "learning_rate": 2.939989656677583e-06,
+ "loss": 0.4184,
+ "step": 5686
+ },
+ {
+ "epoch": 2.688888888888889,
+ "grad_norm": 2.7990400791168213,
+ "learning_rate": 2.939375553383111e-06,
+ "loss": 0.4866,
+ "step": 5687
+ },
+ {
+ "epoch": 2.6893617021276595,
+ "grad_norm": 3.063319206237793,
+ "learning_rate": 2.9387614227306487e-06,
+ "loss": 0.4202,
+ "step": 5688
+ },
+ {
+ "epoch": 2.6898345153664303,
+ "grad_norm": 3.0891315937042236,
+ "learning_rate": 2.938147264758437e-06,
+ "loss": 0.4344,
+ "step": 5689
+ },
+ {
+ "epoch": 2.690307328605201,
+ "grad_norm": 2.8982670307159424,
+ "learning_rate": 2.9375330795047165e-06,
+ "loss": 0.4548,
+ "step": 5690
+ },
+ {
+ "epoch": 2.6907801418439714,
+ "grad_norm": 2.7947235107421875,
+ "learning_rate": 2.9369188670077293e-06,
+ "loss": 0.5028,
+ "step": 5691
+ },
+ {
+ "epoch": 2.691252955082742,
+ "grad_norm": 3.1615960597991943,
+ "learning_rate": 2.9363046273057206e-06,
+ "loss": 0.4855,
+ "step": 5692
+ },
+ {
+ "epoch": 2.691725768321513,
+ "grad_norm": 2.669516086578369,
+ "learning_rate": 2.935690360436935e-06,
+ "loss": 0.3813,
+ "step": 5693
+ },
+ {
+ "epoch": 2.692198581560284,
+ "grad_norm": 2.8743274211883545,
+ "learning_rate": 2.935076066439622e-06,
+ "loss": 0.4302,
+ "step": 5694
+ },
+ {
+ "epoch": 2.6926713947990546,
+ "grad_norm": 2.6829612255096436,
+ "learning_rate": 2.9344617453520295e-06,
+ "loss": 0.4063,
+ "step": 5695
+ },
+ {
+ "epoch": 2.693144208037825,
+ "grad_norm": 2.776447057723999,
+ "learning_rate": 2.9338473972124097e-06,
+ "loss": 0.4921,
+ "step": 5696
+ },
+ {
+ "epoch": 2.6936170212765957,
+ "grad_norm": 2.7865772247314453,
+ "learning_rate": 2.9332330220590143e-06,
+ "loss": 0.4939,
+ "step": 5697
+ },
+ {
+ "epoch": 2.6940898345153665,
+ "grad_norm": 3.020526170730591,
+ "learning_rate": 2.932618619930098e-06,
+ "loss": 0.4839,
+ "step": 5698
+ },
+ {
+ "epoch": 2.694562647754137,
+ "grad_norm": 2.637057065963745,
+ "learning_rate": 2.932004190863918e-06,
+ "loss": 0.4343,
+ "step": 5699
+ },
+ {
+ "epoch": 2.6950354609929077,
+ "grad_norm": 2.7426512241363525,
+ "learning_rate": 2.9313897348987314e-06,
+ "loss": 0.3609,
+ "step": 5700
+ },
+ {
+ "epoch": 2.6955082742316785,
+ "grad_norm": 2.767186164855957,
+ "learning_rate": 2.9307752520727974e-06,
+ "loss": 0.3793,
+ "step": 5701
+ },
+ {
+ "epoch": 2.6959810874704493,
+ "grad_norm": 2.4791622161865234,
+ "learning_rate": 2.930160742424377e-06,
+ "loss": 0.4192,
+ "step": 5702
+ },
+ {
+ "epoch": 2.69645390070922,
+ "grad_norm": 2.661461591720581,
+ "learning_rate": 2.9295462059917336e-06,
+ "loss": 0.4758,
+ "step": 5703
+ },
+ {
+ "epoch": 2.6969267139479904,
+ "grad_norm": 2.896242380142212,
+ "learning_rate": 2.928931642813131e-06,
+ "loss": 0.42,
+ "step": 5704
+ },
+ {
+ "epoch": 2.6973995271867612,
+ "grad_norm": 2.783813238143921,
+ "learning_rate": 2.9283170529268366e-06,
+ "loss": 0.4726,
+ "step": 5705
+ },
+ {
+ "epoch": 2.697872340425532,
+ "grad_norm": 2.4347333908081055,
+ "learning_rate": 2.927702436371117e-06,
+ "loss": 0.4199,
+ "step": 5706
+ },
+ {
+ "epoch": 2.6983451536643024,
+ "grad_norm": 2.4643805027008057,
+ "learning_rate": 2.927087793184242e-06,
+ "loss": 0.3578,
+ "step": 5707
+ },
+ {
+ "epoch": 2.698817966903073,
+ "grad_norm": 2.6396660804748535,
+ "learning_rate": 2.9264731234044835e-06,
+ "loss": 0.4509,
+ "step": 5708
+ },
+ {
+ "epoch": 2.699290780141844,
+ "grad_norm": 2.7341182231903076,
+ "learning_rate": 2.925858427070113e-06,
+ "loss": 0.4331,
+ "step": 5709
+ },
+ {
+ "epoch": 2.699763593380615,
+ "grad_norm": 2.7578938007354736,
+ "learning_rate": 2.9252437042194058e-06,
+ "loss": 0.4508,
+ "step": 5710
+ },
+ {
+ "epoch": 2.7002364066193856,
+ "grad_norm": 2.557788133621216,
+ "learning_rate": 2.9246289548906375e-06,
+ "loss": 0.3775,
+ "step": 5711
+ },
+ {
+ "epoch": 2.700709219858156,
+ "grad_norm": 2.802851676940918,
+ "learning_rate": 2.924014179122086e-06,
+ "loss": 0.4518,
+ "step": 5712
+ },
+ {
+ "epoch": 2.7011820330969267,
+ "grad_norm": 2.4773001670837402,
+ "learning_rate": 2.9233993769520313e-06,
+ "loss": 0.4019,
+ "step": 5713
+ },
+ {
+ "epoch": 2.7016548463356975,
+ "grad_norm": 3.108971357345581,
+ "learning_rate": 2.922784548418754e-06,
+ "loss": 0.4715,
+ "step": 5714
+ },
+ {
+ "epoch": 2.702127659574468,
+ "grad_norm": 2.8596770763397217,
+ "learning_rate": 2.9221696935605366e-06,
+ "loss": 0.4361,
+ "step": 5715
+ },
+ {
+ "epoch": 2.7026004728132387,
+ "grad_norm": 2.570604085922241,
+ "learning_rate": 2.9215548124156633e-06,
+ "loss": 0.3982,
+ "step": 5716
+ },
+ {
+ "epoch": 2.7030732860520095,
+ "grad_norm": 2.3157799243927,
+ "learning_rate": 2.9209399050224206e-06,
+ "loss": 0.456,
+ "step": 5717
+ },
+ {
+ "epoch": 2.7035460992907803,
+ "grad_norm": 2.6865758895874023,
+ "learning_rate": 2.9203249714190952e-06,
+ "loss": 0.4441,
+ "step": 5718
+ },
+ {
+ "epoch": 2.704018912529551,
+ "grad_norm": 2.76723313331604,
+ "learning_rate": 2.919710011643978e-06,
+ "loss": 0.464,
+ "step": 5719
+ },
+ {
+ "epoch": 2.7044917257683214,
+ "grad_norm": 2.648792028427124,
+ "learning_rate": 2.9190950257353578e-06,
+ "loss": 0.3426,
+ "step": 5720
+ },
+ {
+ "epoch": 2.704964539007092,
+ "grad_norm": 2.878739833831787,
+ "learning_rate": 2.9184800137315276e-06,
+ "loss": 0.4431,
+ "step": 5721
+ },
+ {
+ "epoch": 2.705437352245863,
+ "grad_norm": 2.670567274093628,
+ "learning_rate": 2.917864975670783e-06,
+ "loss": 0.4347,
+ "step": 5722
+ },
+ {
+ "epoch": 2.7059101654846334,
+ "grad_norm": 2.7031569480895996,
+ "learning_rate": 2.9172499115914184e-06,
+ "loss": 0.4557,
+ "step": 5723
+ },
+ {
+ "epoch": 2.706382978723404,
+ "grad_norm": 2.5225696563720703,
+ "learning_rate": 2.9166348215317314e-06,
+ "loss": 0.4159,
+ "step": 5724
+ },
+ {
+ "epoch": 2.706855791962175,
+ "grad_norm": 2.8676085472106934,
+ "learning_rate": 2.916019705530021e-06,
+ "loss": 0.5018,
+ "step": 5725
+ },
+ {
+ "epoch": 2.7073286052009458,
+ "grad_norm": 2.576463460922241,
+ "learning_rate": 2.915404563624587e-06,
+ "loss": 0.4317,
+ "step": 5726
+ },
+ {
+ "epoch": 2.7078014184397166,
+ "grad_norm": 3.155565023422241,
+ "learning_rate": 2.9147893958537328e-06,
+ "loss": 0.5029,
+ "step": 5727
+ },
+ {
+ "epoch": 2.708274231678487,
+ "grad_norm": 2.604079008102417,
+ "learning_rate": 2.9141742022557622e-06,
+ "loss": 0.4324,
+ "step": 5728
+ },
+ {
+ "epoch": 2.7087470449172577,
+ "grad_norm": 2.6597228050231934,
+ "learning_rate": 2.913558982868979e-06,
+ "loss": 0.4335,
+ "step": 5729
+ },
+ {
+ "epoch": 2.7092198581560285,
+ "grad_norm": 2.811384439468384,
+ "learning_rate": 2.9129437377316923e-06,
+ "loss": 0.4031,
+ "step": 5730
+ },
+ {
+ "epoch": 2.709692671394799,
+ "grad_norm": 3.1041207313537598,
+ "learning_rate": 2.91232846688221e-06,
+ "loss": 0.481,
+ "step": 5731
+ },
+ {
+ "epoch": 2.7101654846335697,
+ "grad_norm": 2.5992188453674316,
+ "learning_rate": 2.9117131703588414e-06,
+ "loss": 0.4266,
+ "step": 5732
+ },
+ {
+ "epoch": 2.7106382978723405,
+ "grad_norm": 2.7726242542266846,
+ "learning_rate": 2.911097848199899e-06,
+ "loss": 0.4464,
+ "step": 5733
+ },
+ {
+ "epoch": 2.7111111111111112,
+ "grad_norm": 2.8683483600616455,
+ "learning_rate": 2.9104825004436966e-06,
+ "loss": 0.4248,
+ "step": 5734
+ },
+ {
+ "epoch": 2.711583924349882,
+ "grad_norm": 2.776386022567749,
+ "learning_rate": 2.9098671271285484e-06,
+ "loss": 0.4556,
+ "step": 5735
+ },
+ {
+ "epoch": 2.7120567375886524,
+ "grad_norm": 2.7612528800964355,
+ "learning_rate": 2.909251728292771e-06,
+ "loss": 0.455,
+ "step": 5736
+ },
+ {
+ "epoch": 2.712529550827423,
+ "grad_norm": 2.9223551750183105,
+ "learning_rate": 2.908636303974684e-06,
+ "loss": 0.4302,
+ "step": 5737
+ },
+ {
+ "epoch": 2.713002364066194,
+ "grad_norm": 2.898226022720337,
+ "learning_rate": 2.908020854212606e-06,
+ "loss": 0.4827,
+ "step": 5738
+ },
+ {
+ "epoch": 2.7134751773049643,
+ "grad_norm": 2.706361770629883,
+ "learning_rate": 2.9074053790448576e-06,
+ "loss": 0.4444,
+ "step": 5739
+ },
+ {
+ "epoch": 2.713947990543735,
+ "grad_norm": 2.8227248191833496,
+ "learning_rate": 2.9067898785097637e-06,
+ "loss": 0.4661,
+ "step": 5740
+ },
+ {
+ "epoch": 2.714420803782506,
+ "grad_norm": 2.597837448120117,
+ "learning_rate": 2.9061743526456474e-06,
+ "loss": 0.4646,
+ "step": 5741
+ },
+ {
+ "epoch": 2.7148936170212767,
+ "grad_norm": 2.5525131225585938,
+ "learning_rate": 2.9055588014908354e-06,
+ "loss": 0.4172,
+ "step": 5742
+ },
+ {
+ "epoch": 2.7153664302600475,
+ "grad_norm": 2.713071823120117,
+ "learning_rate": 2.904943225083655e-06,
+ "loss": 0.4893,
+ "step": 5743
+ },
+ {
+ "epoch": 2.715839243498818,
+ "grad_norm": 2.538623571395874,
+ "learning_rate": 2.9043276234624353e-06,
+ "loss": 0.3905,
+ "step": 5744
+ },
+ {
+ "epoch": 2.7163120567375887,
+ "grad_norm": 2.5190389156341553,
+ "learning_rate": 2.9037119966655076e-06,
+ "loss": 0.4318,
+ "step": 5745
+ },
+ {
+ "epoch": 2.7167848699763595,
+ "grad_norm": 2.6587612628936768,
+ "learning_rate": 2.903096344731204e-06,
+ "loss": 0.4153,
+ "step": 5746
+ },
+ {
+ "epoch": 2.71725768321513,
+ "grad_norm": 2.836731433868408,
+ "learning_rate": 2.902480667697859e-06,
+ "loss": 0.4779,
+ "step": 5747
+ },
+ {
+ "epoch": 2.7177304964539006,
+ "grad_norm": 2.8076045513153076,
+ "learning_rate": 2.9018649656038074e-06,
+ "loss": 0.5126,
+ "step": 5748
+ },
+ {
+ "epoch": 2.7182033096926714,
+ "grad_norm": 2.8930516242980957,
+ "learning_rate": 2.9012492384873865e-06,
+ "loss": 0.4561,
+ "step": 5749
+ },
+ {
+ "epoch": 2.7186761229314422,
+ "grad_norm": 2.7000370025634766,
+ "learning_rate": 2.9006334863869343e-06,
+ "loss": 0.4659,
+ "step": 5750
+ },
+ {
+ "epoch": 2.719148936170213,
+ "grad_norm": 2.927011251449585,
+ "learning_rate": 2.9000177093407926e-06,
+ "loss": 0.5123,
+ "step": 5751
+ },
+ {
+ "epoch": 2.7196217494089834,
+ "grad_norm": 3.0102779865264893,
+ "learning_rate": 2.8994019073873015e-06,
+ "loss": 0.3972,
+ "step": 5752
+ },
+ {
+ "epoch": 2.720094562647754,
+ "grad_norm": 2.778838634490967,
+ "learning_rate": 2.8987860805648054e-06,
+ "loss": 0.4922,
+ "step": 5753
+ },
+ {
+ "epoch": 2.720567375886525,
+ "grad_norm": 2.6150314807891846,
+ "learning_rate": 2.898170228911648e-06,
+ "loss": 0.4425,
+ "step": 5754
+ },
+ {
+ "epoch": 2.7210401891252953,
+ "grad_norm": 2.9329984188079834,
+ "learning_rate": 2.8975543524661777e-06,
+ "loss": 0.4872,
+ "step": 5755
+ },
+ {
+ "epoch": 2.721513002364066,
+ "grad_norm": 2.756803512573242,
+ "learning_rate": 2.8969384512667404e-06,
+ "loss": 0.4362,
+ "step": 5756
+ },
+ {
+ "epoch": 2.721985815602837,
+ "grad_norm": 2.600877285003662,
+ "learning_rate": 2.896322525351686e-06,
+ "loss": 0.4802,
+ "step": 5757
+ },
+ {
+ "epoch": 2.7224586288416077,
+ "grad_norm": 2.647069215774536,
+ "learning_rate": 2.8957065747593655e-06,
+ "loss": 0.4649,
+ "step": 5758
+ },
+ {
+ "epoch": 2.7229314420803785,
+ "grad_norm": 2.845388174057007,
+ "learning_rate": 2.895090599528132e-06,
+ "loss": 0.4533,
+ "step": 5759
+ },
+ {
+ "epoch": 2.723404255319149,
+ "grad_norm": 2.973881721496582,
+ "learning_rate": 2.8944745996963397e-06,
+ "loss": 0.4959,
+ "step": 5760
+ },
+ {
+ "epoch": 2.7238770685579197,
+ "grad_norm": 2.8995487689971924,
+ "learning_rate": 2.8938585753023435e-06,
+ "loss": 0.4597,
+ "step": 5761
+ },
+ {
+ "epoch": 2.7243498817966905,
+ "grad_norm": 2.903693437576294,
+ "learning_rate": 2.8932425263845004e-06,
+ "loss": 0.4521,
+ "step": 5762
+ },
+ {
+ "epoch": 2.724822695035461,
+ "grad_norm": 2.7609009742736816,
+ "learning_rate": 2.8926264529811702e-06,
+ "loss": 0.4399,
+ "step": 5763
+ },
+ {
+ "epoch": 2.7252955082742316,
+ "grad_norm": 2.788787603378296,
+ "learning_rate": 2.892010355130712e-06,
+ "loss": 0.4614,
+ "step": 5764
+ },
+ {
+ "epoch": 2.7257683215130024,
+ "grad_norm": 2.786498785018921,
+ "learning_rate": 2.8913942328714887e-06,
+ "loss": 0.4798,
+ "step": 5765
+ },
+ {
+ "epoch": 2.726241134751773,
+ "grad_norm": 2.9809393882751465,
+ "learning_rate": 2.8907780862418616e-06,
+ "loss": 0.5108,
+ "step": 5766
+ },
+ {
+ "epoch": 2.726713947990544,
+ "grad_norm": 2.6621177196502686,
+ "learning_rate": 2.8901619152801967e-06,
+ "loss": 0.4031,
+ "step": 5767
+ },
+ {
+ "epoch": 2.7271867612293144,
+ "grad_norm": 3.3092098236083984,
+ "learning_rate": 2.8895457200248607e-06,
+ "loss": 0.4671,
+ "step": 5768
+ },
+ {
+ "epoch": 2.727659574468085,
+ "grad_norm": 2.866306781768799,
+ "learning_rate": 2.8889295005142204e-06,
+ "loss": 0.4434,
+ "step": 5769
+ },
+ {
+ "epoch": 2.728132387706856,
+ "grad_norm": 2.6861231327056885,
+ "learning_rate": 2.888313256786646e-06,
+ "loss": 0.429,
+ "step": 5770
+ },
+ {
+ "epoch": 2.7286052009456263,
+ "grad_norm": 2.873180389404297,
+ "learning_rate": 2.8876969888805072e-06,
+ "loss": 0.4412,
+ "step": 5771
+ },
+ {
+ "epoch": 2.729078014184397,
+ "grad_norm": 2.511678695678711,
+ "learning_rate": 2.887080696834178e-06,
+ "loss": 0.4024,
+ "step": 5772
+ },
+ {
+ "epoch": 2.729550827423168,
+ "grad_norm": 2.6502726078033447,
+ "learning_rate": 2.88646438068603e-06,
+ "loss": 0.4357,
+ "step": 5773
+ },
+ {
+ "epoch": 2.7300236406619387,
+ "grad_norm": 2.7156145572662354,
+ "learning_rate": 2.8858480404744403e-06,
+ "loss": 0.4511,
+ "step": 5774
+ },
+ {
+ "epoch": 2.7304964539007095,
+ "grad_norm": 2.882582187652588,
+ "learning_rate": 2.8852316762377842e-06,
+ "loss": 0.4822,
+ "step": 5775
+ },
+ {
+ "epoch": 2.73096926713948,
+ "grad_norm": 2.7139666080474854,
+ "learning_rate": 2.8846152880144413e-06,
+ "loss": 0.4666,
+ "step": 5776
+ },
+ {
+ "epoch": 2.7314420803782506,
+ "grad_norm": 2.7453949451446533,
+ "learning_rate": 2.8839988758427907e-06,
+ "loss": 0.3927,
+ "step": 5777
+ },
+ {
+ "epoch": 2.731914893617021,
+ "grad_norm": 2.7859580516815186,
+ "learning_rate": 2.883382439761214e-06,
+ "loss": 0.4466,
+ "step": 5778
+ },
+ {
+ "epoch": 2.732387706855792,
+ "grad_norm": 2.695234537124634,
+ "learning_rate": 2.882765979808094e-06,
+ "loss": 0.4227,
+ "step": 5779
+ },
+ {
+ "epoch": 2.7328605200945626,
+ "grad_norm": 2.8081552982330322,
+ "learning_rate": 2.8821494960218148e-06,
+ "loss": 0.447,
+ "step": 5780
+ },
+ {
+ "epoch": 2.7333333333333334,
+ "grad_norm": 2.887643337249756,
+ "learning_rate": 2.881532988440762e-06,
+ "loss": 0.5018,
+ "step": 5781
+ },
+ {
+ "epoch": 2.733806146572104,
+ "grad_norm": 3.108212471008301,
+ "learning_rate": 2.8809164571033233e-06,
+ "loss": 0.4132,
+ "step": 5782
+ },
+ {
+ "epoch": 2.7342789598108745,
+ "grad_norm": 2.874328374862671,
+ "learning_rate": 2.880299902047886e-06,
+ "loss": 0.4618,
+ "step": 5783
+ },
+ {
+ "epoch": 2.7347517730496453,
+ "grad_norm": 3.089132308959961,
+ "learning_rate": 2.879683323312843e-06,
+ "loss": 0.4956,
+ "step": 5784
+ },
+ {
+ "epoch": 2.735224586288416,
+ "grad_norm": 2.5173206329345703,
+ "learning_rate": 2.879066720936583e-06,
+ "loss": 0.4087,
+ "step": 5785
+ },
+ {
+ "epoch": 2.7356973995271865,
+ "grad_norm": 2.6401286125183105,
+ "learning_rate": 2.8784500949575014e-06,
+ "loss": 0.3995,
+ "step": 5786
+ },
+ {
+ "epoch": 2.7361702127659573,
+ "grad_norm": 2.9371910095214844,
+ "learning_rate": 2.877833445413991e-06,
+ "loss": 0.5209,
+ "step": 5787
+ },
+ {
+ "epoch": 2.736643026004728,
+ "grad_norm": 3.218158006668091,
+ "learning_rate": 2.8772167723444498e-06,
+ "loss": 0.4275,
+ "step": 5788
+ },
+ {
+ "epoch": 2.737115839243499,
+ "grad_norm": 2.9072160720825195,
+ "learning_rate": 2.8766000757872736e-06,
+ "loss": 0.4244,
+ "step": 5789
+ },
+ {
+ "epoch": 2.7375886524822697,
+ "grad_norm": 3.0378096103668213,
+ "learning_rate": 2.8759833557808614e-06,
+ "loss": 0.507,
+ "step": 5790
+ },
+ {
+ "epoch": 2.73806146572104,
+ "grad_norm": 2.728353977203369,
+ "learning_rate": 2.8753666123636148e-06,
+ "loss": 0.413,
+ "step": 5791
+ },
+ {
+ "epoch": 2.738534278959811,
+ "grad_norm": 2.6869957447052,
+ "learning_rate": 2.874749845573935e-06,
+ "loss": 0.44,
+ "step": 5792
+ },
+ {
+ "epoch": 2.7390070921985816,
+ "grad_norm": 2.6381702423095703,
+ "learning_rate": 2.8741330554502263e-06,
+ "loss": 0.4708,
+ "step": 5793
+ },
+ {
+ "epoch": 2.739479905437352,
+ "grad_norm": 2.6944689750671387,
+ "learning_rate": 2.873516242030892e-06,
+ "loss": 0.4555,
+ "step": 5794
+ },
+ {
+ "epoch": 2.739952718676123,
+ "grad_norm": 3.168473243713379,
+ "learning_rate": 2.8728994053543396e-06,
+ "loss": 0.4538,
+ "step": 5795
+ },
+ {
+ "epoch": 2.7404255319148936,
+ "grad_norm": 2.7504515647888184,
+ "learning_rate": 2.872282545458976e-06,
+ "loss": 0.4628,
+ "step": 5796
+ },
+ {
+ "epoch": 2.7408983451536644,
+ "grad_norm": 2.896462917327881,
+ "learning_rate": 2.8716656623832114e-06,
+ "loss": 0.4946,
+ "step": 5797
+ },
+ {
+ "epoch": 2.741371158392435,
+ "grad_norm": 2.8053417205810547,
+ "learning_rate": 2.8710487561654547e-06,
+ "loss": 0.4893,
+ "step": 5798
+ },
+ {
+ "epoch": 2.7418439716312055,
+ "grad_norm": 2.63171124458313,
+ "learning_rate": 2.870431826844119e-06,
+ "loss": 0.4257,
+ "step": 5799
+ },
+ {
+ "epoch": 2.7423167848699763,
+ "grad_norm": 3.0963807106018066,
+ "learning_rate": 2.869814874457618e-06,
+ "loss": 0.5404,
+ "step": 5800
+ },
+ {
+ "epoch": 2.742789598108747,
+ "grad_norm": 2.591132164001465,
+ "learning_rate": 2.8691978990443664e-06,
+ "loss": 0.4015,
+ "step": 5801
+ },
+ {
+ "epoch": 2.7432624113475175,
+ "grad_norm": 3.0319552421569824,
+ "learning_rate": 2.8685809006427812e-06,
+ "loss": 0.4411,
+ "step": 5802
+ },
+ {
+ "epoch": 2.7437352245862883,
+ "grad_norm": 2.7791874408721924,
+ "learning_rate": 2.8679638792912784e-06,
+ "loss": 0.43,
+ "step": 5803
+ },
+ {
+ "epoch": 2.744208037825059,
+ "grad_norm": 3.530632495880127,
+ "learning_rate": 2.867346835028279e-06,
+ "loss": 0.4581,
+ "step": 5804
+ },
+ {
+ "epoch": 2.74468085106383,
+ "grad_norm": 3.2043099403381348,
+ "learning_rate": 2.8667297678922024e-06,
+ "loss": 0.4375,
+ "step": 5805
+ },
+ {
+ "epoch": 2.7451536643026007,
+ "grad_norm": 2.8442344665527344,
+ "learning_rate": 2.8661126779214716e-06,
+ "loss": 0.4059,
+ "step": 5806
+ },
+ {
+ "epoch": 2.745626477541371,
+ "grad_norm": 2.7561380863189697,
+ "learning_rate": 2.86549556515451e-06,
+ "loss": 0.4391,
+ "step": 5807
+ },
+ {
+ "epoch": 2.746099290780142,
+ "grad_norm": 3.229663848876953,
+ "learning_rate": 2.8648784296297418e-06,
+ "loss": 0.4579,
+ "step": 5808
+ },
+ {
+ "epoch": 2.7465721040189126,
+ "grad_norm": 2.8375027179718018,
+ "learning_rate": 2.864261271385593e-06,
+ "loss": 0.4566,
+ "step": 5809
+ },
+ {
+ "epoch": 2.747044917257683,
+ "grad_norm": 2.392998695373535,
+ "learning_rate": 2.863644090460493e-06,
+ "loss": 0.4123,
+ "step": 5810
+ },
+ {
+ "epoch": 2.7475177304964538,
+ "grad_norm": 2.707610607147217,
+ "learning_rate": 2.86302688689287e-06,
+ "loss": 0.4299,
+ "step": 5811
+ },
+ {
+ "epoch": 2.7479905437352246,
+ "grad_norm": 2.824042797088623,
+ "learning_rate": 2.8624096607211547e-06,
+ "loss": 0.3799,
+ "step": 5812
+ },
+ {
+ "epoch": 2.7484633569739954,
+ "grad_norm": 2.933102607727051,
+ "learning_rate": 2.861792411983779e-06,
+ "loss": 0.4416,
+ "step": 5813
+ },
+ {
+ "epoch": 2.748936170212766,
+ "grad_norm": 2.7000277042388916,
+ "learning_rate": 2.8611751407191757e-06,
+ "loss": 0.4346,
+ "step": 5814
+ },
+ {
+ "epoch": 2.7494089834515365,
+ "grad_norm": 2.6757142543792725,
+ "learning_rate": 2.86055784696578e-06,
+ "loss": 0.4559,
+ "step": 5815
+ },
+ {
+ "epoch": 2.7498817966903073,
+ "grad_norm": 2.5791053771972656,
+ "learning_rate": 2.8599405307620287e-06,
+ "loss": 0.4619,
+ "step": 5816
+ },
+ {
+ "epoch": 2.750354609929078,
+ "grad_norm": 3.0327374935150146,
+ "learning_rate": 2.859323192146359e-06,
+ "loss": 0.4163,
+ "step": 5817
+ },
+ {
+ "epoch": 2.7508274231678485,
+ "grad_norm": 2.5580220222473145,
+ "learning_rate": 2.8587058311572084e-06,
+ "loss": 0.4005,
+ "step": 5818
+ },
+ {
+ "epoch": 2.7513002364066192,
+ "grad_norm": 2.592179536819458,
+ "learning_rate": 2.85808844783302e-06,
+ "loss": 0.4404,
+ "step": 5819
+ },
+ {
+ "epoch": 2.75177304964539,
+ "grad_norm": 3.2779927253723145,
+ "learning_rate": 2.8574710422122342e-06,
+ "loss": 0.54,
+ "step": 5820
+ },
+ {
+ "epoch": 2.752245862884161,
+ "grad_norm": 2.4804370403289795,
+ "learning_rate": 2.8568536143332933e-06,
+ "loss": 0.4476,
+ "step": 5821
+ },
+ {
+ "epoch": 2.7527186761229316,
+ "grad_norm": 2.649477481842041,
+ "learning_rate": 2.8562361642346427e-06,
+ "loss": 0.4336,
+ "step": 5822
+ },
+ {
+ "epoch": 2.753191489361702,
+ "grad_norm": 3.138587474822998,
+ "learning_rate": 2.855618691954728e-06,
+ "loss": 0.5042,
+ "step": 5823
+ },
+ {
+ "epoch": 2.753664302600473,
+ "grad_norm": 2.75093412399292,
+ "learning_rate": 2.855001197531997e-06,
+ "loss": 0.4327,
+ "step": 5824
+ },
+ {
+ "epoch": 2.7541371158392436,
+ "grad_norm": 2.678809642791748,
+ "learning_rate": 2.854383681004898e-06,
+ "loss": 0.4409,
+ "step": 5825
+ },
+ {
+ "epoch": 2.754609929078014,
+ "grad_norm": 2.965386390686035,
+ "learning_rate": 2.853766142411881e-06,
+ "loss": 0.4716,
+ "step": 5826
+ },
+ {
+ "epoch": 2.7550827423167847,
+ "grad_norm": 2.6419436931610107,
+ "learning_rate": 2.853148581791398e-06,
+ "loss": 0.4367,
+ "step": 5827
+ },
+ {
+ "epoch": 2.7555555555555555,
+ "grad_norm": 3.205794095993042,
+ "learning_rate": 2.8525309991819004e-06,
+ "loss": 0.4869,
+ "step": 5828
+ },
+ {
+ "epoch": 2.7560283687943263,
+ "grad_norm": 3.041008472442627,
+ "learning_rate": 2.851913394621844e-06,
+ "loss": 0.5087,
+ "step": 5829
+ },
+ {
+ "epoch": 2.756501182033097,
+ "grad_norm": 2.6525566577911377,
+ "learning_rate": 2.851295768149684e-06,
+ "loss": 0.3951,
+ "step": 5830
+ },
+ {
+ "epoch": 2.7569739952718675,
+ "grad_norm": 2.732220411300659,
+ "learning_rate": 2.850678119803876e-06,
+ "loss": 0.4797,
+ "step": 5831
+ },
+ {
+ "epoch": 2.7574468085106383,
+ "grad_norm": 2.8965251445770264,
+ "learning_rate": 2.8500604496228797e-06,
+ "loss": 0.4938,
+ "step": 5832
+ },
+ {
+ "epoch": 2.757919621749409,
+ "grad_norm": 2.48020076751709,
+ "learning_rate": 2.849442757645154e-06,
+ "loss": 0.4172,
+ "step": 5833
+ },
+ {
+ "epoch": 2.7583924349881794,
+ "grad_norm": 2.4764912128448486,
+ "learning_rate": 2.8488250439091603e-06,
+ "loss": 0.4123,
+ "step": 5834
+ },
+ {
+ "epoch": 2.7588652482269502,
+ "grad_norm": 2.4547016620635986,
+ "learning_rate": 2.84820730845336e-06,
+ "loss": 0.4116,
+ "step": 5835
+ },
+ {
+ "epoch": 2.759338061465721,
+ "grad_norm": 2.55476975440979,
+ "learning_rate": 2.847589551316218e-06,
+ "loss": 0.4744,
+ "step": 5836
+ },
+ {
+ "epoch": 2.759810874704492,
+ "grad_norm": 2.3866238594055176,
+ "learning_rate": 2.846971772536199e-06,
+ "loss": 0.4406,
+ "step": 5837
+ },
+ {
+ "epoch": 2.7602836879432626,
+ "grad_norm": 2.855318784713745,
+ "learning_rate": 2.8463539721517687e-06,
+ "loss": 0.4517,
+ "step": 5838
+ },
+ {
+ "epoch": 2.760756501182033,
+ "grad_norm": 2.527198314666748,
+ "learning_rate": 2.8457361502013954e-06,
+ "loss": 0.3588,
+ "step": 5839
+ },
+ {
+ "epoch": 2.7612293144208038,
+ "grad_norm": 2.6761462688446045,
+ "learning_rate": 2.8451183067235476e-06,
+ "loss": 0.4192,
+ "step": 5840
+ },
+ {
+ "epoch": 2.7617021276595746,
+ "grad_norm": 2.5692319869995117,
+ "learning_rate": 2.8445004417566967e-06,
+ "loss": 0.4108,
+ "step": 5841
+ },
+ {
+ "epoch": 2.762174940898345,
+ "grad_norm": 2.5721096992492676,
+ "learning_rate": 2.8438825553393133e-06,
+ "loss": 0.3941,
+ "step": 5842
+ },
+ {
+ "epoch": 2.7626477541371157,
+ "grad_norm": 2.699430227279663,
+ "learning_rate": 2.843264647509872e-06,
+ "loss": 0.4418,
+ "step": 5843
+ },
+ {
+ "epoch": 2.7631205673758865,
+ "grad_norm": 2.6943318843841553,
+ "learning_rate": 2.842646718306846e-06,
+ "loss": 0.4505,
+ "step": 5844
+ },
+ {
+ "epoch": 2.7635933806146573,
+ "grad_norm": 2.661656379699707,
+ "learning_rate": 2.8420287677687107e-06,
+ "loss": 0.4413,
+ "step": 5845
+ },
+ {
+ "epoch": 2.764066193853428,
+ "grad_norm": 2.830467939376831,
+ "learning_rate": 2.8414107959339444e-06,
+ "loss": 0.5095,
+ "step": 5846
+ },
+ {
+ "epoch": 2.7645390070921985,
+ "grad_norm": 2.598053455352783,
+ "learning_rate": 2.840792802841024e-06,
+ "loss": 0.4029,
+ "step": 5847
+ },
+ {
+ "epoch": 2.7650118203309693,
+ "grad_norm": 2.641700029373169,
+ "learning_rate": 2.8401747885284316e-06,
+ "loss": 0.4237,
+ "step": 5848
+ },
+ {
+ "epoch": 2.76548463356974,
+ "grad_norm": 2.6672768592834473,
+ "learning_rate": 2.8395567530346454e-06,
+ "loss": 0.4181,
+ "step": 5849
+ },
+ {
+ "epoch": 2.7659574468085104,
+ "grad_norm": 2.5851705074310303,
+ "learning_rate": 2.838938696398149e-06,
+ "loss": 0.4165,
+ "step": 5850
+ },
+ {
+ "epoch": 2.766430260047281,
+ "grad_norm": 2.318120002746582,
+ "learning_rate": 2.8383206186574276e-06,
+ "loss": 0.3578,
+ "step": 5851
+ },
+ {
+ "epoch": 2.766903073286052,
+ "grad_norm": 2.6199793815612793,
+ "learning_rate": 2.8377025198509635e-06,
+ "loss": 0.4719,
+ "step": 5852
+ },
+ {
+ "epoch": 2.767375886524823,
+ "grad_norm": 2.7186086177825928,
+ "learning_rate": 2.837084400017245e-06,
+ "loss": 0.41,
+ "step": 5853
+ },
+ {
+ "epoch": 2.7678486997635936,
+ "grad_norm": 2.702514886856079,
+ "learning_rate": 2.8364662591947583e-06,
+ "loss": 0.4659,
+ "step": 5854
+ },
+ {
+ "epoch": 2.768321513002364,
+ "grad_norm": 2.612375259399414,
+ "learning_rate": 2.835848097421993e-06,
+ "loss": 0.4252,
+ "step": 5855
+ },
+ {
+ "epoch": 2.7687943262411348,
+ "grad_norm": 3.0127978324890137,
+ "learning_rate": 2.8352299147374394e-06,
+ "loss": 0.4084,
+ "step": 5856
+ },
+ {
+ "epoch": 2.7692671394799055,
+ "grad_norm": 2.6460049152374268,
+ "learning_rate": 2.83461171117959e-06,
+ "loss": 0.4035,
+ "step": 5857
+ },
+ {
+ "epoch": 2.769739952718676,
+ "grad_norm": 2.9844725131988525,
+ "learning_rate": 2.8339934867869357e-06,
+ "loss": 0.4912,
+ "step": 5858
+ },
+ {
+ "epoch": 2.7702127659574467,
+ "grad_norm": 2.731217861175537,
+ "learning_rate": 2.833375241597972e-06,
+ "loss": 0.4112,
+ "step": 5859
+ },
+ {
+ "epoch": 2.7706855791962175,
+ "grad_norm": 2.731194496154785,
+ "learning_rate": 2.832756975651193e-06,
+ "loss": 0.4516,
+ "step": 5860
+ },
+ {
+ "epoch": 2.7711583924349883,
+ "grad_norm": 3.0532076358795166,
+ "learning_rate": 2.8321386889850965e-06,
+ "loss": 0.3959,
+ "step": 5861
+ },
+ {
+ "epoch": 2.771631205673759,
+ "grad_norm": 3.5437800884246826,
+ "learning_rate": 2.831520381638181e-06,
+ "loss": 0.6055,
+ "step": 5862
+ },
+ {
+ "epoch": 2.7721040189125294,
+ "grad_norm": 2.4297714233398438,
+ "learning_rate": 2.830902053648944e-06,
+ "loss": 0.4038,
+ "step": 5863
+ },
+ {
+ "epoch": 2.7725768321513002,
+ "grad_norm": 2.696768045425415,
+ "learning_rate": 2.8302837050558876e-06,
+ "loss": 0.3983,
+ "step": 5864
+ },
+ {
+ "epoch": 2.773049645390071,
+ "grad_norm": 2.6574649810791016,
+ "learning_rate": 2.8296653358975122e-06,
+ "loss": 0.4937,
+ "step": 5865
+ },
+ {
+ "epoch": 2.7735224586288414,
+ "grad_norm": 2.9393341541290283,
+ "learning_rate": 2.8290469462123234e-06,
+ "loss": 0.4603,
+ "step": 5866
+ },
+ {
+ "epoch": 2.773995271867612,
+ "grad_norm": 2.7630696296691895,
+ "learning_rate": 2.828428536038824e-06,
+ "loss": 0.4663,
+ "step": 5867
+ },
+ {
+ "epoch": 2.774468085106383,
+ "grad_norm": 2.7354233264923096,
+ "learning_rate": 2.8278101054155183e-06,
+ "loss": 0.4444,
+ "step": 5868
+ },
+ {
+ "epoch": 2.774940898345154,
+ "grad_norm": 3.0489425659179688,
+ "learning_rate": 2.827191654380915e-06,
+ "loss": 0.4684,
+ "step": 5869
+ },
+ {
+ "epoch": 2.7754137115839246,
+ "grad_norm": 2.9602572917938232,
+ "learning_rate": 2.8265731829735226e-06,
+ "loss": 0.4571,
+ "step": 5870
+ },
+ {
+ "epoch": 2.775886524822695,
+ "grad_norm": 2.774132013320923,
+ "learning_rate": 2.825954691231851e-06,
+ "loss": 0.4458,
+ "step": 5871
+ },
+ {
+ "epoch": 2.7763593380614657,
+ "grad_norm": 2.696622133255005,
+ "learning_rate": 2.825336179194409e-06,
+ "loss": 0.4933,
+ "step": 5872
+ },
+ {
+ "epoch": 2.7768321513002365,
+ "grad_norm": 2.742184638977051,
+ "learning_rate": 2.8247176468997096e-06,
+ "loss": 0.4464,
+ "step": 5873
+ },
+ {
+ "epoch": 2.777304964539007,
+ "grad_norm": 2.7033183574676514,
+ "learning_rate": 2.824099094386266e-06,
+ "loss": 0.4369,
+ "step": 5874
+ },
+ {
+ "epoch": 2.7777777777777777,
+ "grad_norm": 2.7264044284820557,
+ "learning_rate": 2.8234805216925935e-06,
+ "loss": 0.4621,
+ "step": 5875
+ },
+ {
+ "epoch": 2.7782505910165485,
+ "grad_norm": 2.6417739391326904,
+ "learning_rate": 2.822861928857208e-06,
+ "loss": 0.4254,
+ "step": 5876
+ },
+ {
+ "epoch": 2.7787234042553193,
+ "grad_norm": 3.17209529876709,
+ "learning_rate": 2.8222433159186245e-06,
+ "loss": 0.5011,
+ "step": 5877
+ },
+ {
+ "epoch": 2.77919621749409,
+ "grad_norm": 3.1434381008148193,
+ "learning_rate": 2.8216246829153633e-06,
+ "loss": 0.4567,
+ "step": 5878
+ },
+ {
+ "epoch": 2.7796690307328604,
+ "grad_norm": 2.781608819961548,
+ "learning_rate": 2.821006029885943e-06,
+ "loss": 0.4723,
+ "step": 5879
+ },
+ {
+ "epoch": 2.780141843971631,
+ "grad_norm": 3.00079345703125,
+ "learning_rate": 2.820387356868885e-06,
+ "loss": 0.4796,
+ "step": 5880
+ },
+ {
+ "epoch": 2.780614657210402,
+ "grad_norm": 2.703555107116699,
+ "learning_rate": 2.819768663902712e-06,
+ "loss": 0.4577,
+ "step": 5881
+ },
+ {
+ "epoch": 2.7810874704491724,
+ "grad_norm": 2.5741801261901855,
+ "learning_rate": 2.8191499510259453e-06,
+ "loss": 0.4255,
+ "step": 5882
+ },
+ {
+ "epoch": 2.781560283687943,
+ "grad_norm": 2.9871208667755127,
+ "learning_rate": 2.8185312182771112e-06,
+ "loss": 0.4495,
+ "step": 5883
+ },
+ {
+ "epoch": 2.782033096926714,
+ "grad_norm": 2.525317668914795,
+ "learning_rate": 2.8179124656947343e-06,
+ "loss": 0.4428,
+ "step": 5884
+ },
+ {
+ "epoch": 2.7825059101654848,
+ "grad_norm": 2.525092840194702,
+ "learning_rate": 2.817293693317343e-06,
+ "loss": 0.4348,
+ "step": 5885
+ },
+ {
+ "epoch": 2.7829787234042556,
+ "grad_norm": 2.8485171794891357,
+ "learning_rate": 2.816674901183464e-06,
+ "loss": 0.4206,
+ "step": 5886
+ },
+ {
+ "epoch": 2.783451536643026,
+ "grad_norm": 2.6612746715545654,
+ "learning_rate": 2.8160560893316272e-06,
+ "loss": 0.396,
+ "step": 5887
+ },
+ {
+ "epoch": 2.7839243498817967,
+ "grad_norm": 2.7093865871429443,
+ "learning_rate": 2.815437257800364e-06,
+ "loss": 0.4468,
+ "step": 5888
+ },
+ {
+ "epoch": 2.7843971631205675,
+ "grad_norm": 2.6130900382995605,
+ "learning_rate": 2.814818406628206e-06,
+ "loss": 0.443,
+ "step": 5889
+ },
+ {
+ "epoch": 2.784869976359338,
+ "grad_norm": 2.8147552013397217,
+ "learning_rate": 2.8141995358536866e-06,
+ "loss": 0.4454,
+ "step": 5890
+ },
+ {
+ "epoch": 2.7853427895981087,
+ "grad_norm": 2.5621275901794434,
+ "learning_rate": 2.8135806455153395e-06,
+ "loss": 0.439,
+ "step": 5891
+ },
+ {
+ "epoch": 2.7858156028368795,
+ "grad_norm": 2.880228281021118,
+ "learning_rate": 2.812961735651701e-06,
+ "loss": 0.3895,
+ "step": 5892
+ },
+ {
+ "epoch": 2.7862884160756503,
+ "grad_norm": 2.5861377716064453,
+ "learning_rate": 2.8123428063013068e-06,
+ "loss": 0.4402,
+ "step": 5893
+ },
+ {
+ "epoch": 2.786761229314421,
+ "grad_norm": 2.9707765579223633,
+ "learning_rate": 2.811723857502696e-06,
+ "loss": 0.4461,
+ "step": 5894
+ },
+ {
+ "epoch": 2.7872340425531914,
+ "grad_norm": 2.923999309539795,
+ "learning_rate": 2.811104889294408e-06,
+ "loss": 0.4395,
+ "step": 5895
+ },
+ {
+ "epoch": 2.787706855791962,
+ "grad_norm": 2.846933603286743,
+ "learning_rate": 2.810485901714981e-06,
+ "loss": 0.5168,
+ "step": 5896
+ },
+ {
+ "epoch": 2.788179669030733,
+ "grad_norm": 4.1052350997924805,
+ "learning_rate": 2.8098668948029597e-06,
+ "loss": 0.5152,
+ "step": 5897
+ },
+ {
+ "epoch": 2.7886524822695034,
+ "grad_norm": 2.7391018867492676,
+ "learning_rate": 2.8092478685968856e-06,
+ "loss": 0.4515,
+ "step": 5898
+ },
+ {
+ "epoch": 2.789125295508274,
+ "grad_norm": 2.976088285446167,
+ "learning_rate": 2.8086288231353027e-06,
+ "loss": 0.5156,
+ "step": 5899
+ },
+ {
+ "epoch": 2.789598108747045,
+ "grad_norm": 2.6139633655548096,
+ "learning_rate": 2.8080097584567562e-06,
+ "loss": 0.4237,
+ "step": 5900
+ },
+ {
+ "epoch": 2.7900709219858157,
+ "grad_norm": 2.501654624938965,
+ "learning_rate": 2.807390674599792e-06,
+ "loss": 0.4349,
+ "step": 5901
+ },
+ {
+ "epoch": 2.7905437352245865,
+ "grad_norm": 2.8814525604248047,
+ "learning_rate": 2.8067715716029586e-06,
+ "loss": 0.4866,
+ "step": 5902
+ },
+ {
+ "epoch": 2.791016548463357,
+ "grad_norm": 2.7953200340270996,
+ "learning_rate": 2.8061524495048046e-06,
+ "loss": 0.3964,
+ "step": 5903
+ },
+ {
+ "epoch": 2.7914893617021277,
+ "grad_norm": 2.7362849712371826,
+ "learning_rate": 2.8055333083438808e-06,
+ "loss": 0.4181,
+ "step": 5904
+ },
+ {
+ "epoch": 2.7919621749408985,
+ "grad_norm": 2.9740512371063232,
+ "learning_rate": 2.8049141481587366e-06,
+ "loss": 0.4784,
+ "step": 5905
+ },
+ {
+ "epoch": 2.792434988179669,
+ "grad_norm": 2.595813274383545,
+ "learning_rate": 2.8042949689879262e-06,
+ "loss": 0.4421,
+ "step": 5906
+ },
+ {
+ "epoch": 2.7929078014184396,
+ "grad_norm": 2.886899948120117,
+ "learning_rate": 2.803675770870002e-06,
+ "loss": 0.4435,
+ "step": 5907
+ },
+ {
+ "epoch": 2.7933806146572104,
+ "grad_norm": 2.6057486534118652,
+ "learning_rate": 2.8030565538435196e-06,
+ "loss": 0.4472,
+ "step": 5908
+ },
+ {
+ "epoch": 2.7938534278959812,
+ "grad_norm": 2.7422802448272705,
+ "learning_rate": 2.802437317947034e-06,
+ "loss": 0.4799,
+ "step": 5909
+ },
+ {
+ "epoch": 2.794326241134752,
+ "grad_norm": 2.3904244899749756,
+ "learning_rate": 2.801818063219102e-06,
+ "loss": 0.4508,
+ "step": 5910
+ },
+ {
+ "epoch": 2.7947990543735224,
+ "grad_norm": 2.8434207439422607,
+ "learning_rate": 2.8011987896982835e-06,
+ "loss": 0.4473,
+ "step": 5911
+ },
+ {
+ "epoch": 2.795271867612293,
+ "grad_norm": 2.916088819503784,
+ "learning_rate": 2.8005794974231366e-06,
+ "loss": 0.464,
+ "step": 5912
+ },
+ {
+ "epoch": 2.795744680851064,
+ "grad_norm": 2.6483397483825684,
+ "learning_rate": 2.7999601864322236e-06,
+ "loss": 0.441,
+ "step": 5913
+ },
+ {
+ "epoch": 2.7962174940898343,
+ "grad_norm": 2.9287428855895996,
+ "learning_rate": 2.7993408567641033e-06,
+ "loss": 0.4551,
+ "step": 5914
+ },
+ {
+ "epoch": 2.796690307328605,
+ "grad_norm": 2.575024127960205,
+ "learning_rate": 2.798721508457342e-06,
+ "loss": 0.4494,
+ "step": 5915
+ },
+ {
+ "epoch": 2.797163120567376,
+ "grad_norm": 2.7156829833984375,
+ "learning_rate": 2.7981021415505015e-06,
+ "loss": 0.419,
+ "step": 5916
+ },
+ {
+ "epoch": 2.7976359338061467,
+ "grad_norm": 2.850553035736084,
+ "learning_rate": 2.7974827560821482e-06,
+ "loss": 0.4709,
+ "step": 5917
+ },
+ {
+ "epoch": 2.7981087470449175,
+ "grad_norm": 2.673846483230591,
+ "learning_rate": 2.796863352090847e-06,
+ "loss": 0.4224,
+ "step": 5918
+ },
+ {
+ "epoch": 2.798581560283688,
+ "grad_norm": 2.9093217849731445,
+ "learning_rate": 2.796243929615168e-06,
+ "loss": 0.468,
+ "step": 5919
+ },
+ {
+ "epoch": 2.7990543735224587,
+ "grad_norm": 2.4853813648223877,
+ "learning_rate": 2.7956244886936775e-06,
+ "loss": 0.4723,
+ "step": 5920
+ },
+ {
+ "epoch": 2.7995271867612295,
+ "grad_norm": 3.026428461074829,
+ "learning_rate": 2.795005029364946e-06,
+ "loss": 0.4721,
+ "step": 5921
+ },
+ {
+ "epoch": 2.8,
+ "grad_norm": 2.886295795440674,
+ "learning_rate": 2.794385551667546e-06,
+ "loss": 0.456,
+ "step": 5922
+ },
+ {
+ "epoch": 2.8004728132387706,
+ "grad_norm": 3.2260656356811523,
+ "learning_rate": 2.7937660556400486e-06,
+ "loss": 0.4499,
+ "step": 5923
+ },
+ {
+ "epoch": 2.8009456264775414,
+ "grad_norm": 2.7971982955932617,
+ "learning_rate": 2.793146541321027e-06,
+ "loss": 0.3982,
+ "step": 5924
+ },
+ {
+ "epoch": 2.801418439716312,
+ "grad_norm": 2.85461163520813,
+ "learning_rate": 2.7925270087490546e-06,
+ "loss": 0.4841,
+ "step": 5925
+ },
+ {
+ "epoch": 2.801891252955083,
+ "grad_norm": 3.0642316341400146,
+ "learning_rate": 2.7919074579627086e-06,
+ "loss": 0.4538,
+ "step": 5926
+ },
+ {
+ "epoch": 2.8023640661938534,
+ "grad_norm": 2.9053616523742676,
+ "learning_rate": 2.7912878890005657e-06,
+ "loss": 0.434,
+ "step": 5927
+ },
+ {
+ "epoch": 2.802836879432624,
+ "grad_norm": 2.7649240493774414,
+ "learning_rate": 2.7906683019012027e-06,
+ "loss": 0.414,
+ "step": 5928
+ },
+ {
+ "epoch": 2.803309692671395,
+ "grad_norm": 2.8717660903930664,
+ "learning_rate": 2.7900486967031987e-06,
+ "loss": 0.4337,
+ "step": 5929
+ },
+ {
+ "epoch": 2.8037825059101653,
+ "grad_norm": 2.6860995292663574,
+ "learning_rate": 2.789429073445135e-06,
+ "loss": 0.447,
+ "step": 5930
+ },
+ {
+ "epoch": 2.804255319148936,
+ "grad_norm": 2.67509126663208,
+ "learning_rate": 2.7888094321655918e-06,
+ "loss": 0.4955,
+ "step": 5931
+ },
+ {
+ "epoch": 2.804728132387707,
+ "grad_norm": 2.7426326274871826,
+ "learning_rate": 2.7881897729031514e-06,
+ "loss": 0.4564,
+ "step": 5932
+ },
+ {
+ "epoch": 2.8052009456264777,
+ "grad_norm": 2.7087252140045166,
+ "learning_rate": 2.7875700956963973e-06,
+ "loss": 0.4571,
+ "step": 5933
+ },
+ {
+ "epoch": 2.8056737588652485,
+ "grad_norm": 2.513526439666748,
+ "learning_rate": 2.7869504005839147e-06,
+ "loss": 0.4361,
+ "step": 5934
+ },
+ {
+ "epoch": 2.806146572104019,
+ "grad_norm": 3.2246084213256836,
+ "learning_rate": 2.7863306876042885e-06,
+ "loss": 0.4612,
+ "step": 5935
+ },
+ {
+ "epoch": 2.8066193853427897,
+ "grad_norm": 3.226325511932373,
+ "learning_rate": 2.7857109567961066e-06,
+ "loss": 0.4528,
+ "step": 5936
+ },
+ {
+ "epoch": 2.8070921985815604,
+ "grad_norm": 2.8861422538757324,
+ "learning_rate": 2.785091208197956e-06,
+ "loss": 0.5049,
+ "step": 5937
+ },
+ {
+ "epoch": 2.807565011820331,
+ "grad_norm": 2.76279616355896,
+ "learning_rate": 2.7844714418484257e-06,
+ "loss": 0.4714,
+ "step": 5938
+ },
+ {
+ "epoch": 2.8080378250591016,
+ "grad_norm": 2.9591920375823975,
+ "learning_rate": 2.7838516577861063e-06,
+ "loss": 0.4633,
+ "step": 5939
+ },
+ {
+ "epoch": 2.8085106382978724,
+ "grad_norm": 2.536916971206665,
+ "learning_rate": 2.7832318560495885e-06,
+ "loss": 0.4108,
+ "step": 5940
+ },
+ {
+ "epoch": 2.808983451536643,
+ "grad_norm": 3.2484991550445557,
+ "learning_rate": 2.7826120366774657e-06,
+ "loss": 0.4888,
+ "step": 5941
+ },
+ {
+ "epoch": 2.8094562647754135,
+ "grad_norm": 2.7129359245300293,
+ "learning_rate": 2.781992199708329e-06,
+ "loss": 0.4008,
+ "step": 5942
+ },
+ {
+ "epoch": 2.8099290780141843,
+ "grad_norm": 2.4176113605499268,
+ "learning_rate": 2.781372345180776e-06,
+ "loss": 0.3864,
+ "step": 5943
+ },
+ {
+ "epoch": 2.810401891252955,
+ "grad_norm": 2.6557252407073975,
+ "learning_rate": 2.7807524731334e-06,
+ "loss": 0.4295,
+ "step": 5944
+ },
+ {
+ "epoch": 2.8108747044917255,
+ "grad_norm": 2.9191324710845947,
+ "learning_rate": 2.7801325836047993e-06,
+ "loss": 0.4854,
+ "step": 5945
+ },
+ {
+ "epoch": 2.8113475177304963,
+ "grad_norm": 2.6325371265411377,
+ "learning_rate": 2.7795126766335705e-06,
+ "loss": 0.4332,
+ "step": 5946
+ },
+ {
+ "epoch": 2.811820330969267,
+ "grad_norm": 2.658337116241455,
+ "learning_rate": 2.778892752258314e-06,
+ "loss": 0.4276,
+ "step": 5947
+ },
+ {
+ "epoch": 2.812293144208038,
+ "grad_norm": 2.763782262802124,
+ "learning_rate": 2.778272810517627e-06,
+ "loss": 0.4246,
+ "step": 5948
+ },
+ {
+ "epoch": 2.8127659574468087,
+ "grad_norm": 2.407607078552246,
+ "learning_rate": 2.777652851450113e-06,
+ "loss": 0.3788,
+ "step": 5949
+ },
+ {
+ "epoch": 2.813238770685579,
+ "grad_norm": 3.0339951515197754,
+ "learning_rate": 2.7770328750943736e-06,
+ "loss": 0.477,
+ "step": 5950
+ },
+ {
+ "epoch": 2.81371158392435,
+ "grad_norm": 2.3475773334503174,
+ "learning_rate": 2.776412881489012e-06,
+ "loss": 0.4206,
+ "step": 5951
+ },
+ {
+ "epoch": 2.8141843971631206,
+ "grad_norm": 3.0455260276794434,
+ "learning_rate": 2.7757928706726318e-06,
+ "loss": 0.4301,
+ "step": 5952
+ },
+ {
+ "epoch": 2.814657210401891,
+ "grad_norm": 2.803920030593872,
+ "learning_rate": 2.7751728426838386e-06,
+ "loss": 0.3738,
+ "step": 5953
+ },
+ {
+ "epoch": 2.815130023640662,
+ "grad_norm": 3.1083319187164307,
+ "learning_rate": 2.77455279756124e-06,
+ "loss": 0.5365,
+ "step": 5954
+ },
+ {
+ "epoch": 2.8156028368794326,
+ "grad_norm": 3.180809497833252,
+ "learning_rate": 2.7739327353434427e-06,
+ "loss": 0.4789,
+ "step": 5955
+ },
+ {
+ "epoch": 2.8160756501182034,
+ "grad_norm": 2.975043773651123,
+ "learning_rate": 2.7733126560690543e-06,
+ "loss": 0.4798,
+ "step": 5956
+ },
+ {
+ "epoch": 2.816548463356974,
+ "grad_norm": 2.765475034713745,
+ "learning_rate": 2.772692559776685e-06,
+ "loss": 0.4206,
+ "step": 5957
+ },
+ {
+ "epoch": 2.8170212765957445,
+ "grad_norm": 2.48612380027771,
+ "learning_rate": 2.7720724465049463e-06,
+ "loss": 0.4234,
+ "step": 5958
+ },
+ {
+ "epoch": 2.8174940898345153,
+ "grad_norm": 2.7145729064941406,
+ "learning_rate": 2.77145231629245e-06,
+ "loss": 0.4713,
+ "step": 5959
+ },
+ {
+ "epoch": 2.817966903073286,
+ "grad_norm": 2.5993762016296387,
+ "learning_rate": 2.7708321691778074e-06,
+ "loss": 0.4144,
+ "step": 5960
+ },
+ {
+ "epoch": 2.8184397163120565,
+ "grad_norm": 3.0902538299560547,
+ "learning_rate": 2.770212005199633e-06,
+ "loss": 0.4822,
+ "step": 5961
+ },
+ {
+ "epoch": 2.8189125295508273,
+ "grad_norm": 2.849757671356201,
+ "learning_rate": 2.7695918243965424e-06,
+ "loss": 0.4449,
+ "step": 5962
+ },
+ {
+ "epoch": 2.819385342789598,
+ "grad_norm": 2.77148699760437,
+ "learning_rate": 2.768971626807151e-06,
+ "loss": 0.4448,
+ "step": 5963
+ },
+ {
+ "epoch": 2.819858156028369,
+ "grad_norm": 2.7865898609161377,
+ "learning_rate": 2.7683514124700757e-06,
+ "loss": 0.4944,
+ "step": 5964
+ },
+ {
+ "epoch": 2.8203309692671397,
+ "grad_norm": 2.9057955741882324,
+ "learning_rate": 2.767731181423934e-06,
+ "loss": 0.5074,
+ "step": 5965
+ },
+ {
+ "epoch": 2.82080378250591,
+ "grad_norm": 2.725837469100952,
+ "learning_rate": 2.7671109337073465e-06,
+ "loss": 0.4207,
+ "step": 5966
+ },
+ {
+ "epoch": 2.821276595744681,
+ "grad_norm": 3.078531265258789,
+ "learning_rate": 2.7664906693589315e-06,
+ "loss": 0.4835,
+ "step": 5967
+ },
+ {
+ "epoch": 2.8217494089834516,
+ "grad_norm": 2.8692002296447754,
+ "learning_rate": 2.765870388417312e-06,
+ "loss": 0.4284,
+ "step": 5968
+ },
+ {
+ "epoch": 2.822222222222222,
+ "grad_norm": 2.8519723415374756,
+ "learning_rate": 2.765250090921109e-06,
+ "loss": 0.541,
+ "step": 5969
+ },
+ {
+ "epoch": 2.8226950354609928,
+ "grad_norm": 3.2037532329559326,
+ "learning_rate": 2.7646297769089457e-06,
+ "loss": 0.4276,
+ "step": 5970
+ },
+ {
+ "epoch": 2.8231678486997636,
+ "grad_norm": 2.8637137413024902,
+ "learning_rate": 2.7640094464194468e-06,
+ "loss": 0.4904,
+ "step": 5971
+ },
+ {
+ "epoch": 2.8236406619385344,
+ "grad_norm": 2.681516408920288,
+ "learning_rate": 2.7633890994912372e-06,
+ "loss": 0.4942,
+ "step": 5972
+ },
+ {
+ "epoch": 2.824113475177305,
+ "grad_norm": 3.0035219192504883,
+ "learning_rate": 2.7627687361629434e-06,
+ "loss": 0.4556,
+ "step": 5973
+ },
+ {
+ "epoch": 2.8245862884160755,
+ "grad_norm": 2.8107759952545166,
+ "learning_rate": 2.7621483564731923e-06,
+ "loss": 0.4225,
+ "step": 5974
+ },
+ {
+ "epoch": 2.8250591016548463,
+ "grad_norm": 2.87276029586792,
+ "learning_rate": 2.7615279604606126e-06,
+ "loss": 0.5045,
+ "step": 5975
+ },
+ {
+ "epoch": 2.825531914893617,
+ "grad_norm": 2.687953233718872,
+ "learning_rate": 2.760907548163833e-06,
+ "loss": 0.4018,
+ "step": 5976
+ },
+ {
+ "epoch": 2.8260047281323875,
+ "grad_norm": 2.587979555130005,
+ "learning_rate": 2.760287119621486e-06,
+ "loss": 0.4407,
+ "step": 5977
+ },
+ {
+ "epoch": 2.8264775413711583,
+ "grad_norm": 2.805602550506592,
+ "learning_rate": 2.7596666748722e-06,
+ "loss": 0.4559,
+ "step": 5978
+ },
+ {
+ "epoch": 2.826950354609929,
+ "grad_norm": 2.320763111114502,
+ "learning_rate": 2.759046213954609e-06,
+ "loss": 0.3847,
+ "step": 5979
+ },
+ {
+ "epoch": 2.8274231678487,
+ "grad_norm": 2.6876401901245117,
+ "learning_rate": 2.758425736907347e-06,
+ "loss": 0.4528,
+ "step": 5980
+ },
+ {
+ "epoch": 2.8278959810874706,
+ "grad_norm": 2.6852915287017822,
+ "learning_rate": 2.757805243769046e-06,
+ "loss": 0.395,
+ "step": 5981
+ },
+ {
+ "epoch": 2.828368794326241,
+ "grad_norm": 2.808326005935669,
+ "learning_rate": 2.7571847345783447e-06,
+ "loss": 0.4647,
+ "step": 5982
+ },
+ {
+ "epoch": 2.828841607565012,
+ "grad_norm": 2.641479015350342,
+ "learning_rate": 2.7565642093738766e-06,
+ "loss": 0.3798,
+ "step": 5983
+ },
+ {
+ "epoch": 2.8293144208037826,
+ "grad_norm": 2.8066110610961914,
+ "learning_rate": 2.7559436681942803e-06,
+ "loss": 0.5072,
+ "step": 5984
+ },
+ {
+ "epoch": 2.829787234042553,
+ "grad_norm": 2.898375988006592,
+ "learning_rate": 2.7553231110781936e-06,
+ "loss": 0.5182,
+ "step": 5985
+ },
+ {
+ "epoch": 2.8302600472813237,
+ "grad_norm": 2.704890489578247,
+ "learning_rate": 2.7547025380642574e-06,
+ "loss": 0.3999,
+ "step": 5986
+ },
+ {
+ "epoch": 2.8307328605200945,
+ "grad_norm": 2.6024270057678223,
+ "learning_rate": 2.7540819491911106e-06,
+ "loss": 0.4302,
+ "step": 5987
+ },
+ {
+ "epoch": 2.8312056737588653,
+ "grad_norm": 2.8006081581115723,
+ "learning_rate": 2.7534613444973946e-06,
+ "loss": 0.4492,
+ "step": 5988
+ },
+ {
+ "epoch": 2.831678486997636,
+ "grad_norm": 2.9532058238983154,
+ "learning_rate": 2.752840724021752e-06,
+ "loss": 0.4552,
+ "step": 5989
+ },
+ {
+ "epoch": 2.8321513002364065,
+ "grad_norm": 3.1830217838287354,
+ "learning_rate": 2.7522200878028265e-06,
+ "loss": 0.5013,
+ "step": 5990
+ },
+ {
+ "epoch": 2.8326241134751773,
+ "grad_norm": 2.716176748275757,
+ "learning_rate": 2.7515994358792624e-06,
+ "loss": 0.4569,
+ "step": 5991
+ },
+ {
+ "epoch": 2.833096926713948,
+ "grad_norm": 2.6852715015411377,
+ "learning_rate": 2.7509787682897044e-06,
+ "loss": 0.4764,
+ "step": 5992
+ },
+ {
+ "epoch": 2.8335697399527184,
+ "grad_norm": 2.9383316040039062,
+ "learning_rate": 2.7503580850727985e-06,
+ "loss": 0.5205,
+ "step": 5993
+ },
+ {
+ "epoch": 2.8340425531914892,
+ "grad_norm": 2.703132152557373,
+ "learning_rate": 2.749737386267193e-06,
+ "loss": 0.4543,
+ "step": 5994
+ },
+ {
+ "epoch": 2.83451536643026,
+ "grad_norm": 2.4304885864257812,
+ "learning_rate": 2.7491166719115354e-06,
+ "loss": 0.4479,
+ "step": 5995
+ },
+ {
+ "epoch": 2.834988179669031,
+ "grad_norm": 2.975722551345825,
+ "learning_rate": 2.748495942044475e-06,
+ "loss": 0.4074,
+ "step": 5996
+ },
+ {
+ "epoch": 2.8354609929078016,
+ "grad_norm": 3.440208911895752,
+ "learning_rate": 2.7478751967046617e-06,
+ "loss": 0.4497,
+ "step": 5997
+ },
+ {
+ "epoch": 2.835933806146572,
+ "grad_norm": 2.734673261642456,
+ "learning_rate": 2.747254435930747e-06,
+ "loss": 0.437,
+ "step": 5998
+ },
+ {
+ "epoch": 2.8364066193853428,
+ "grad_norm": 3.1918959617614746,
+ "learning_rate": 2.7466336597613826e-06,
+ "loss": 0.4197,
+ "step": 5999
+ },
+ {
+ "epoch": 2.8368794326241136,
+ "grad_norm": 3.1440329551696777,
+ "learning_rate": 2.7460128682352216e-06,
+ "loss": 0.4425,
+ "step": 6000
+ },
+ {
+ "epoch": 2.837352245862884,
+ "grad_norm": 2.582993507385254,
+ "learning_rate": 2.7453920613909183e-06,
+ "loss": 0.4475,
+ "step": 6001
+ },
+ {
+ "epoch": 2.8378250591016547,
+ "grad_norm": 3.2682149410247803,
+ "learning_rate": 2.744771239267128e-06,
+ "loss": 0.4615,
+ "step": 6002
+ },
+ {
+ "epoch": 2.8382978723404255,
+ "grad_norm": 2.848477840423584,
+ "learning_rate": 2.7441504019025046e-06,
+ "loss": 0.4093,
+ "step": 6003
+ },
+ {
+ "epoch": 2.8387706855791963,
+ "grad_norm": 2.3582282066345215,
+ "learning_rate": 2.7435295493357067e-06,
+ "loss": 0.3911,
+ "step": 6004
+ },
+ {
+ "epoch": 2.839243498817967,
+ "grad_norm": 2.7707207202911377,
+ "learning_rate": 2.742908681605392e-06,
+ "loss": 0.4069,
+ "step": 6005
+ },
+ {
+ "epoch": 2.8397163120567375,
+ "grad_norm": 3.0763752460479736,
+ "learning_rate": 2.7422877987502183e-06,
+ "loss": 0.512,
+ "step": 6006
+ },
+ {
+ "epoch": 2.8401891252955083,
+ "grad_norm": 2.8027124404907227,
+ "learning_rate": 2.741666900808846e-06,
+ "loss": 0.4922,
+ "step": 6007
+ },
+ {
+ "epoch": 2.840661938534279,
+ "grad_norm": 2.487982988357544,
+ "learning_rate": 2.7410459878199353e-06,
+ "loss": 0.4368,
+ "step": 6008
+ },
+ {
+ "epoch": 2.8411347517730494,
+ "grad_norm": 2.8727993965148926,
+ "learning_rate": 2.7404250598221484e-06,
+ "loss": 0.4639,
+ "step": 6009
+ },
+ {
+ "epoch": 2.84160756501182,
+ "grad_norm": 2.5556678771972656,
+ "learning_rate": 2.739804116854147e-06,
+ "loss": 0.4217,
+ "step": 6010
+ },
+ {
+ "epoch": 2.842080378250591,
+ "grad_norm": 2.6306912899017334,
+ "learning_rate": 2.7391831589545948e-06,
+ "loss": 0.4816,
+ "step": 6011
+ },
+ {
+ "epoch": 2.842553191489362,
+ "grad_norm": 2.7340946197509766,
+ "learning_rate": 2.7385621861621557e-06,
+ "loss": 0.4113,
+ "step": 6012
+ },
+ {
+ "epoch": 2.8430260047281326,
+ "grad_norm": 2.834190607070923,
+ "learning_rate": 2.737941198515495e-06,
+ "loss": 0.4691,
+ "step": 6013
+ },
+ {
+ "epoch": 2.843498817966903,
+ "grad_norm": 2.7139697074890137,
+ "learning_rate": 2.737320196053281e-06,
+ "loss": 0.3798,
+ "step": 6014
+ },
+ {
+ "epoch": 2.8439716312056738,
+ "grad_norm": 2.7934985160827637,
+ "learning_rate": 2.736699178814177e-06,
+ "loss": 0.446,
+ "step": 6015
+ },
+ {
+ "epoch": 2.8444444444444446,
+ "grad_norm": 2.6941518783569336,
+ "learning_rate": 2.7360781468368534e-06,
+ "loss": 0.4787,
+ "step": 6016
+ },
+ {
+ "epoch": 2.844917257683215,
+ "grad_norm": 3.1530468463897705,
+ "learning_rate": 2.7354571001599792e-06,
+ "loss": 0.474,
+ "step": 6017
+ },
+ {
+ "epoch": 2.8453900709219857,
+ "grad_norm": 2.613875389099121,
+ "learning_rate": 2.7348360388222243e-06,
+ "loss": 0.4297,
+ "step": 6018
+ },
+ {
+ "epoch": 2.8458628841607565,
+ "grad_norm": 2.5481486320495605,
+ "learning_rate": 2.7342149628622587e-06,
+ "loss": 0.3762,
+ "step": 6019
+ },
+ {
+ "epoch": 2.8463356973995273,
+ "grad_norm": 2.6425609588623047,
+ "learning_rate": 2.7335938723187544e-06,
+ "loss": 0.4077,
+ "step": 6020
+ },
+ {
+ "epoch": 2.846808510638298,
+ "grad_norm": 2.6281731128692627,
+ "learning_rate": 2.7329727672303836e-06,
+ "loss": 0.466,
+ "step": 6021
+ },
+ {
+ "epoch": 2.8472813238770684,
+ "grad_norm": 2.8862180709838867,
+ "learning_rate": 2.7323516476358197e-06,
+ "loss": 0.4191,
+ "step": 6022
+ },
+ {
+ "epoch": 2.8477541371158392,
+ "grad_norm": 2.907731533050537,
+ "learning_rate": 2.7317305135737383e-06,
+ "loss": 0.4867,
+ "step": 6023
+ },
+ {
+ "epoch": 2.84822695035461,
+ "grad_norm": 2.825593948364258,
+ "learning_rate": 2.731109365082814e-06,
+ "loss": 0.4888,
+ "step": 6024
+ },
+ {
+ "epoch": 2.8486997635933804,
+ "grad_norm": 2.478163003921509,
+ "learning_rate": 2.730488202201722e-06,
+ "loss": 0.4714,
+ "step": 6025
+ },
+ {
+ "epoch": 2.849172576832151,
+ "grad_norm": 2.928899049758911,
+ "learning_rate": 2.7298670249691418e-06,
+ "loss": 0.4671,
+ "step": 6026
+ },
+ {
+ "epoch": 2.849645390070922,
+ "grad_norm": 2.778256893157959,
+ "learning_rate": 2.7292458334237488e-06,
+ "loss": 0.429,
+ "step": 6027
+ },
+ {
+ "epoch": 2.850118203309693,
+ "grad_norm": 3.0689055919647217,
+ "learning_rate": 2.7286246276042234e-06,
+ "loss": 0.4727,
+ "step": 6028
+ },
+ {
+ "epoch": 2.8505910165484636,
+ "grad_norm": 2.582066774368286,
+ "learning_rate": 2.7280034075492447e-06,
+ "loss": 0.4025,
+ "step": 6029
+ },
+ {
+ "epoch": 2.851063829787234,
+ "grad_norm": 3.6679015159606934,
+ "learning_rate": 2.7273821732974936e-06,
+ "loss": 0.4856,
+ "step": 6030
+ },
+ {
+ "epoch": 2.8515366430260047,
+ "grad_norm": 2.7222588062286377,
+ "learning_rate": 2.7267609248876516e-06,
+ "loss": 0.4255,
+ "step": 6031
+ },
+ {
+ "epoch": 2.8520094562647755,
+ "grad_norm": 2.455038547515869,
+ "learning_rate": 2.726139662358401e-06,
+ "loss": 0.4234,
+ "step": 6032
+ },
+ {
+ "epoch": 2.852482269503546,
+ "grad_norm": 2.8277318477630615,
+ "learning_rate": 2.7255183857484253e-06,
+ "loss": 0.4146,
+ "step": 6033
+ },
+ {
+ "epoch": 2.8529550827423167,
+ "grad_norm": 2.523615837097168,
+ "learning_rate": 2.724897095096409e-06,
+ "loss": 0.4227,
+ "step": 6034
+ },
+ {
+ "epoch": 2.8534278959810875,
+ "grad_norm": 3.353646755218506,
+ "learning_rate": 2.724275790441036e-06,
+ "loss": 0.5041,
+ "step": 6035
+ },
+ {
+ "epoch": 2.8539007092198583,
+ "grad_norm": 2.753981828689575,
+ "learning_rate": 2.7236544718209934e-06,
+ "loss": 0.4646,
+ "step": 6036
+ },
+ {
+ "epoch": 2.854373522458629,
+ "grad_norm": 2.954744577407837,
+ "learning_rate": 2.723033139274967e-06,
+ "loss": 0.5182,
+ "step": 6037
+ },
+ {
+ "epoch": 2.8548463356973994,
+ "grad_norm": 2.4814131259918213,
+ "learning_rate": 2.7224117928416462e-06,
+ "loss": 0.4626,
+ "step": 6038
+ },
+ {
+ "epoch": 2.8553191489361702,
+ "grad_norm": 2.7414886951446533,
+ "learning_rate": 2.721790432559717e-06,
+ "loss": 0.4111,
+ "step": 6039
+ },
+ {
+ "epoch": 2.855791962174941,
+ "grad_norm": 2.8743896484375,
+ "learning_rate": 2.7211690584678706e-06,
+ "loss": 0.4986,
+ "step": 6040
+ },
+ {
+ "epoch": 2.8562647754137114,
+ "grad_norm": 3.0691921710968018,
+ "learning_rate": 2.720547670604797e-06,
+ "loss": 0.4743,
+ "step": 6041
+ },
+ {
+ "epoch": 2.856737588652482,
+ "grad_norm": 2.7273411750793457,
+ "learning_rate": 2.7199262690091872e-06,
+ "loss": 0.4403,
+ "step": 6042
+ },
+ {
+ "epoch": 2.857210401891253,
+ "grad_norm": 2.8022944927215576,
+ "learning_rate": 2.7193048537197325e-06,
+ "loss": 0.4413,
+ "step": 6043
+ },
+ {
+ "epoch": 2.8576832151300238,
+ "grad_norm": 2.4883248805999756,
+ "learning_rate": 2.718683424775126e-06,
+ "loss": 0.4485,
+ "step": 6044
+ },
+ {
+ "epoch": 2.8581560283687946,
+ "grad_norm": 2.457249879837036,
+ "learning_rate": 2.718061982214062e-06,
+ "loss": 0.4167,
+ "step": 6045
+ },
+ {
+ "epoch": 2.858628841607565,
+ "grad_norm": 2.7210328578948975,
+ "learning_rate": 2.717440526075234e-06,
+ "loss": 0.4419,
+ "step": 6046
+ },
+ {
+ "epoch": 2.8591016548463357,
+ "grad_norm": 2.684483766555786,
+ "learning_rate": 2.7168190563973386e-06,
+ "loss": 0.4449,
+ "step": 6047
+ },
+ {
+ "epoch": 2.8595744680851065,
+ "grad_norm": 2.5305230617523193,
+ "learning_rate": 2.7161975732190706e-06,
+ "loss": 0.3829,
+ "step": 6048
+ },
+ {
+ "epoch": 2.860047281323877,
+ "grad_norm": 3.0284602642059326,
+ "learning_rate": 2.7155760765791278e-06,
+ "loss": 0.5164,
+ "step": 6049
+ },
+ {
+ "epoch": 2.8605200945626477,
+ "grad_norm": 3.154599189758301,
+ "learning_rate": 2.7149545665162085e-06,
+ "loss": 0.527,
+ "step": 6050
+ },
+ {
+ "epoch": 2.8609929078014185,
+ "grad_norm": 2.6798126697540283,
+ "learning_rate": 2.7143330430690113e-06,
+ "loss": 0.4379,
+ "step": 6051
+ },
+ {
+ "epoch": 2.8614657210401893,
+ "grad_norm": 2.9531302452087402,
+ "learning_rate": 2.7137115062762344e-06,
+ "loss": 0.4549,
+ "step": 6052
+ },
+ {
+ "epoch": 2.86193853427896,
+ "grad_norm": 2.779531240463257,
+ "learning_rate": 2.7130899561765787e-06,
+ "loss": 0.4037,
+ "step": 6053
+ },
+ {
+ "epoch": 2.8624113475177304,
+ "grad_norm": 2.786763906478882,
+ "learning_rate": 2.7124683928087466e-06,
+ "loss": 0.3986,
+ "step": 6054
+ },
+ {
+ "epoch": 2.862884160756501,
+ "grad_norm": 2.430415630340576,
+ "learning_rate": 2.7118468162114385e-06,
+ "loss": 0.4402,
+ "step": 6055
+ },
+ {
+ "epoch": 2.863356973995272,
+ "grad_norm": 3.027268409729004,
+ "learning_rate": 2.7112252264233596e-06,
+ "loss": 0.4737,
+ "step": 6056
+ },
+ {
+ "epoch": 2.8638297872340424,
+ "grad_norm": 3.024935483932495,
+ "learning_rate": 2.710603623483211e-06,
+ "loss": 0.3997,
+ "step": 6057
+ },
+ {
+ "epoch": 2.864302600472813,
+ "grad_norm": 2.8862195014953613,
+ "learning_rate": 2.7099820074296985e-06,
+ "loss": 0.4896,
+ "step": 6058
+ },
+ {
+ "epoch": 2.864775413711584,
+ "grad_norm": 2.595579147338867,
+ "learning_rate": 2.709360378301527e-06,
+ "loss": 0.4387,
+ "step": 6059
+ },
+ {
+ "epoch": 2.8652482269503547,
+ "grad_norm": 2.8046188354492188,
+ "learning_rate": 2.708738736137403e-06,
+ "loss": 0.4726,
+ "step": 6060
+ },
+ {
+ "epoch": 2.8657210401891255,
+ "grad_norm": 3.040304660797119,
+ "learning_rate": 2.708117080976033e-06,
+ "loss": 0.4642,
+ "step": 6061
+ },
+ {
+ "epoch": 2.866193853427896,
+ "grad_norm": 2.618128538131714,
+ "learning_rate": 2.7074954128561248e-06,
+ "loss": 0.3171,
+ "step": 6062
+ },
+ {
+ "epoch": 2.8666666666666667,
+ "grad_norm": 2.7966055870056152,
+ "learning_rate": 2.706873731816387e-06,
+ "loss": 0.4893,
+ "step": 6063
+ },
+ {
+ "epoch": 2.8671394799054375,
+ "grad_norm": 2.9198038578033447,
+ "learning_rate": 2.706252037895529e-06,
+ "loss": 0.4428,
+ "step": 6064
+ },
+ {
+ "epoch": 2.867612293144208,
+ "grad_norm": 2.417705774307251,
+ "learning_rate": 2.7056303311322617e-06,
+ "loss": 0.3704,
+ "step": 6065
+ },
+ {
+ "epoch": 2.8680851063829786,
+ "grad_norm": 3.143918752670288,
+ "learning_rate": 2.7050086115652953e-06,
+ "loss": 0.5247,
+ "step": 6066
+ },
+ {
+ "epoch": 2.8685579196217494,
+ "grad_norm": 2.620781183242798,
+ "learning_rate": 2.704386879233341e-06,
+ "loss": 0.4131,
+ "step": 6067
+ },
+ {
+ "epoch": 2.8690307328605202,
+ "grad_norm": 2.6929845809936523,
+ "learning_rate": 2.703765134175112e-06,
+ "loss": 0.4833,
+ "step": 6068
+ },
+ {
+ "epoch": 2.869503546099291,
+ "grad_norm": 2.695920944213867,
+ "learning_rate": 2.7031433764293214e-06,
+ "loss": 0.435,
+ "step": 6069
+ },
+ {
+ "epoch": 2.8699763593380614,
+ "grad_norm": 2.6184475421905518,
+ "learning_rate": 2.702521606034684e-06,
+ "loss": 0.3898,
+ "step": 6070
+ },
+ {
+ "epoch": 2.870449172576832,
+ "grad_norm": 3.130624532699585,
+ "learning_rate": 2.7018998230299136e-06,
+ "loss": 0.4934,
+ "step": 6071
+ },
+ {
+ "epoch": 2.870921985815603,
+ "grad_norm": 2.947936534881592,
+ "learning_rate": 2.701278027453727e-06,
+ "loss": 0.4167,
+ "step": 6072
+ },
+ {
+ "epoch": 2.8713947990543733,
+ "grad_norm": 2.389263391494751,
+ "learning_rate": 2.7006562193448406e-06,
+ "loss": 0.3854,
+ "step": 6073
+ },
+ {
+ "epoch": 2.871867612293144,
+ "grad_norm": 2.9040684700012207,
+ "learning_rate": 2.700034398741971e-06,
+ "loss": 0.4656,
+ "step": 6074
+ },
+ {
+ "epoch": 2.872340425531915,
+ "grad_norm": 2.8671910762786865,
+ "learning_rate": 2.6994125656838365e-06,
+ "loss": 0.4642,
+ "step": 6075
+ },
+ {
+ "epoch": 2.8728132387706857,
+ "grad_norm": 2.6957180500030518,
+ "learning_rate": 2.698790720209156e-06,
+ "loss": 0.4894,
+ "step": 6076
+ },
+ {
+ "epoch": 2.8732860520094565,
+ "grad_norm": 2.748342514038086,
+ "learning_rate": 2.698168862356648e-06,
+ "loss": 0.4552,
+ "step": 6077
+ },
+ {
+ "epoch": 2.873758865248227,
+ "grad_norm": 2.7459912300109863,
+ "learning_rate": 2.6975469921650344e-06,
+ "loss": 0.4244,
+ "step": 6078
+ },
+ {
+ "epoch": 2.8742316784869977,
+ "grad_norm": 2.515650987625122,
+ "learning_rate": 2.6969251096730366e-06,
+ "loss": 0.4178,
+ "step": 6079
+ },
+ {
+ "epoch": 2.8747044917257685,
+ "grad_norm": 2.747373342514038,
+ "learning_rate": 2.696303214919375e-06,
+ "loss": 0.4623,
+ "step": 6080
+ },
+ {
+ "epoch": 2.875177304964539,
+ "grad_norm": 2.72092604637146,
+ "learning_rate": 2.695681307942773e-06,
+ "loss": 0.4227,
+ "step": 6081
+ },
+ {
+ "epoch": 2.8756501182033096,
+ "grad_norm": 2.6925108432769775,
+ "learning_rate": 2.695059388781955e-06,
+ "loss": 0.3807,
+ "step": 6082
+ },
+ {
+ "epoch": 2.8761229314420804,
+ "grad_norm": 2.673546314239502,
+ "learning_rate": 2.6944374574756427e-06,
+ "loss": 0.424,
+ "step": 6083
+ },
+ {
+ "epoch": 2.876595744680851,
+ "grad_norm": 2.7018187046051025,
+ "learning_rate": 2.6938155140625636e-06,
+ "loss": 0.4367,
+ "step": 6084
+ },
+ {
+ "epoch": 2.877068557919622,
+ "grad_norm": 2.9420957565307617,
+ "learning_rate": 2.6931935585814416e-06,
+ "loss": 0.4223,
+ "step": 6085
+ },
+ {
+ "epoch": 2.8775413711583924,
+ "grad_norm": 2.6523385047912598,
+ "learning_rate": 2.6925715910710036e-06,
+ "loss": 0.4074,
+ "step": 6086
+ },
+ {
+ "epoch": 2.878014184397163,
+ "grad_norm": 2.6104063987731934,
+ "learning_rate": 2.691949611569978e-06,
+ "loss": 0.423,
+ "step": 6087
+ },
+ {
+ "epoch": 2.878486997635934,
+ "grad_norm": 2.6463685035705566,
+ "learning_rate": 2.691327620117091e-06,
+ "loss": 0.4354,
+ "step": 6088
+ },
+ {
+ "epoch": 2.8789598108747043,
+ "grad_norm": 2.5863583087921143,
+ "learning_rate": 2.6907056167510725e-06,
+ "loss": 0.4177,
+ "step": 6089
+ },
+ {
+ "epoch": 2.879432624113475,
+ "grad_norm": 2.6946942806243896,
+ "learning_rate": 2.690083601510651e-06,
+ "loss": 0.4176,
+ "step": 6090
+ },
+ {
+ "epoch": 2.879905437352246,
+ "grad_norm": 3.0649454593658447,
+ "learning_rate": 2.6894615744345575e-06,
+ "loss": 0.4827,
+ "step": 6091
+ },
+ {
+ "epoch": 2.8803782505910167,
+ "grad_norm": 2.6454906463623047,
+ "learning_rate": 2.6888395355615226e-06,
+ "loss": 0.4757,
+ "step": 6092
+ },
+ {
+ "epoch": 2.8808510638297875,
+ "grad_norm": 3.251805067062378,
+ "learning_rate": 2.688217484930278e-06,
+ "loss": 0.5651,
+ "step": 6093
+ },
+ {
+ "epoch": 2.881323877068558,
+ "grad_norm": 2.543999433517456,
+ "learning_rate": 2.687595422579555e-06,
+ "loss": 0.4196,
+ "step": 6094
+ },
+ {
+ "epoch": 2.8817966903073287,
+ "grad_norm": 3.1502909660339355,
+ "learning_rate": 2.686973348548088e-06,
+ "loss": 0.4376,
+ "step": 6095
+ },
+ {
+ "epoch": 2.8822695035460995,
+ "grad_norm": 2.7800376415252686,
+ "learning_rate": 2.686351262874611e-06,
+ "loss": 0.444,
+ "step": 6096
+ },
+ {
+ "epoch": 2.88274231678487,
+ "grad_norm": 3.1529603004455566,
+ "learning_rate": 2.685729165597858e-06,
+ "loss": 0.5137,
+ "step": 6097
+ },
+ {
+ "epoch": 2.8832151300236406,
+ "grad_norm": 2.6079602241516113,
+ "learning_rate": 2.685107056756564e-06,
+ "loss": 0.4213,
+ "step": 6098
+ },
+ {
+ "epoch": 2.8836879432624114,
+ "grad_norm": 2.8969249725341797,
+ "learning_rate": 2.6844849363894648e-06,
+ "loss": 0.4679,
+ "step": 6099
+ },
+ {
+ "epoch": 2.884160756501182,
+ "grad_norm": 2.5882437229156494,
+ "learning_rate": 2.6838628045352977e-06,
+ "loss": 0.3891,
+ "step": 6100
+ },
+ {
+ "epoch": 2.8846335697399526,
+ "grad_norm": 2.9458062648773193,
+ "learning_rate": 2.6832406612328007e-06,
+ "loss": 0.4802,
+ "step": 6101
+ },
+ {
+ "epoch": 2.8851063829787233,
+ "grad_norm": 2.8463058471679688,
+ "learning_rate": 2.6826185065207105e-06,
+ "loss": 0.4332,
+ "step": 6102
+ },
+ {
+ "epoch": 2.885579196217494,
+ "grad_norm": 2.8799285888671875,
+ "learning_rate": 2.6819963404377667e-06,
+ "loss": 0.4474,
+ "step": 6103
+ },
+ {
+ "epoch": 2.8860520094562645,
+ "grad_norm": 2.846860408782959,
+ "learning_rate": 2.681374163022709e-06,
+ "loss": 0.4317,
+ "step": 6104
+ },
+ {
+ "epoch": 2.8865248226950353,
+ "grad_norm": 2.7918877601623535,
+ "learning_rate": 2.6807519743142775e-06,
+ "loss": 0.4243,
+ "step": 6105
+ },
+ {
+ "epoch": 2.886997635933806,
+ "grad_norm": 2.9351487159729004,
+ "learning_rate": 2.6801297743512127e-06,
+ "loss": 0.5253,
+ "step": 6106
+ },
+ {
+ "epoch": 2.887470449172577,
+ "grad_norm": 2.9422426223754883,
+ "learning_rate": 2.6795075631722576e-06,
+ "loss": 0.4887,
+ "step": 6107
+ },
+ {
+ "epoch": 2.8879432624113477,
+ "grad_norm": 2.6837220191955566,
+ "learning_rate": 2.678885340816153e-06,
+ "loss": 0.4761,
+ "step": 6108
+ },
+ {
+ "epoch": 2.888416075650118,
+ "grad_norm": 2.6800777912139893,
+ "learning_rate": 2.6782631073216425e-06,
+ "loss": 0.4248,
+ "step": 6109
+ },
+ {
+ "epoch": 2.888888888888889,
+ "grad_norm": 2.9654436111450195,
+ "learning_rate": 2.6776408627274702e-06,
+ "loss": 0.487,
+ "step": 6110
+ },
+ {
+ "epoch": 2.8893617021276596,
+ "grad_norm": 2.7725181579589844,
+ "learning_rate": 2.6770186070723804e-06,
+ "loss": 0.4166,
+ "step": 6111
+ },
+ {
+ "epoch": 2.88983451536643,
+ "grad_norm": 2.6547815799713135,
+ "learning_rate": 2.676396340395118e-06,
+ "loss": 0.4039,
+ "step": 6112
+ },
+ {
+ "epoch": 2.890307328605201,
+ "grad_norm": 2.690997838973999,
+ "learning_rate": 2.6757740627344292e-06,
+ "loss": 0.4639,
+ "step": 6113
+ },
+ {
+ "epoch": 2.8907801418439716,
+ "grad_norm": 2.4693069458007812,
+ "learning_rate": 2.67515177412906e-06,
+ "loss": 0.4052,
+ "step": 6114
+ },
+ {
+ "epoch": 2.8912529550827424,
+ "grad_norm": 2.7137033939361572,
+ "learning_rate": 2.6745294746177576e-06,
+ "loss": 0.4442,
+ "step": 6115
+ },
+ {
+ "epoch": 2.891725768321513,
+ "grad_norm": 3.7417004108428955,
+ "learning_rate": 2.6739071642392712e-06,
+ "loss": 0.4809,
+ "step": 6116
+ },
+ {
+ "epoch": 2.8921985815602835,
+ "grad_norm": 2.707094669342041,
+ "learning_rate": 2.673284843032347e-06,
+ "loss": 0.411,
+ "step": 6117
+ },
+ {
+ "epoch": 2.8926713947990543,
+ "grad_norm": 2.7864158153533936,
+ "learning_rate": 2.672662511035736e-06,
+ "loss": 0.4939,
+ "step": 6118
+ },
+ {
+ "epoch": 2.893144208037825,
+ "grad_norm": 2.8753504753112793,
+ "learning_rate": 2.672040168288187e-06,
+ "loss": 0.4396,
+ "step": 6119
+ },
+ {
+ "epoch": 2.8936170212765955,
+ "grad_norm": 2.7581071853637695,
+ "learning_rate": 2.6714178148284516e-06,
+ "loss": 0.427,
+ "step": 6120
+ },
+ {
+ "epoch": 2.8940898345153663,
+ "grad_norm": 2.9754791259765625,
+ "learning_rate": 2.6707954506952803e-06,
+ "loss": 0.4255,
+ "step": 6121
+ },
+ {
+ "epoch": 2.894562647754137,
+ "grad_norm": 2.876939296722412,
+ "learning_rate": 2.670173075927426e-06,
+ "loss": 0.4699,
+ "step": 6122
+ },
+ {
+ "epoch": 2.895035460992908,
+ "grad_norm": 2.4875400066375732,
+ "learning_rate": 2.6695506905636397e-06,
+ "loss": 0.3568,
+ "step": 6123
+ },
+ {
+ "epoch": 2.8955082742316787,
+ "grad_norm": 2.703606128692627,
+ "learning_rate": 2.668928294642675e-06,
+ "loss": 0.3646,
+ "step": 6124
+ },
+ {
+ "epoch": 2.895981087470449,
+ "grad_norm": 2.8618338108062744,
+ "learning_rate": 2.6683058882032868e-06,
+ "loss": 0.378,
+ "step": 6125
+ },
+ {
+ "epoch": 2.89645390070922,
+ "grad_norm": 2.9756760597229004,
+ "learning_rate": 2.667683471284229e-06,
+ "loss": 0.4348,
+ "step": 6126
+ },
+ {
+ "epoch": 2.8969267139479906,
+ "grad_norm": 2.7861104011535645,
+ "learning_rate": 2.667061043924256e-06,
+ "loss": 0.4435,
+ "step": 6127
+ },
+ {
+ "epoch": 2.897399527186761,
+ "grad_norm": 2.7932238578796387,
+ "learning_rate": 2.6664386061621243e-06,
+ "loss": 0.4824,
+ "step": 6128
+ },
+ {
+ "epoch": 2.8978723404255318,
+ "grad_norm": 2.85483455657959,
+ "learning_rate": 2.6658161580365917e-06,
+ "loss": 0.4925,
+ "step": 6129
+ },
+ {
+ "epoch": 2.8983451536643026,
+ "grad_norm": 2.4242141246795654,
+ "learning_rate": 2.6651936995864136e-06,
+ "loss": 0.3466,
+ "step": 6130
+ },
+ {
+ "epoch": 2.8988179669030734,
+ "grad_norm": 3.385214328765869,
+ "learning_rate": 2.6645712308503473e-06,
+ "loss": 0.4751,
+ "step": 6131
+ },
+ {
+ "epoch": 2.899290780141844,
+ "grad_norm": 2.7109622955322266,
+ "learning_rate": 2.6639487518671525e-06,
+ "loss": 0.4469,
+ "step": 6132
+ },
+ {
+ "epoch": 2.8997635933806145,
+ "grad_norm": 2.6537814140319824,
+ "learning_rate": 2.6633262626755877e-06,
+ "loss": 0.4678,
+ "step": 6133
+ },
+ {
+ "epoch": 2.9002364066193853,
+ "grad_norm": 2.5992231369018555,
+ "learning_rate": 2.6627037633144124e-06,
+ "loss": 0.4206,
+ "step": 6134
+ },
+ {
+ "epoch": 2.900709219858156,
+ "grad_norm": 2.988940954208374,
+ "learning_rate": 2.6620812538223885e-06,
+ "loss": 0.4554,
+ "step": 6135
+ },
+ {
+ "epoch": 2.9011820330969265,
+ "grad_norm": 3.0678138732910156,
+ "learning_rate": 2.661458734238274e-06,
+ "loss": 0.4671,
+ "step": 6136
+ },
+ {
+ "epoch": 2.9016548463356973,
+ "grad_norm": 2.6902482509613037,
+ "learning_rate": 2.6608362046008335e-06,
+ "loss": 0.372,
+ "step": 6137
+ },
+ {
+ "epoch": 2.902127659574468,
+ "grad_norm": 3.031597375869751,
+ "learning_rate": 2.660213664948827e-06,
+ "loss": 0.4424,
+ "step": 6138
+ },
+ {
+ "epoch": 2.902600472813239,
+ "grad_norm": 2.8376755714416504,
+ "learning_rate": 2.6595911153210187e-06,
+ "loss": 0.4599,
+ "step": 6139
+ },
+ {
+ "epoch": 2.9030732860520096,
+ "grad_norm": 3.3164854049682617,
+ "learning_rate": 2.6589685557561707e-06,
+ "loss": 0.3897,
+ "step": 6140
+ },
+ {
+ "epoch": 2.90354609929078,
+ "grad_norm": 2.9535014629364014,
+ "learning_rate": 2.658345986293048e-06,
+ "loss": 0.4957,
+ "step": 6141
+ },
+ {
+ "epoch": 2.904018912529551,
+ "grad_norm": 2.821276903152466,
+ "learning_rate": 2.657723406970415e-06,
+ "loss": 0.4453,
+ "step": 6142
+ },
+ {
+ "epoch": 2.9044917257683216,
+ "grad_norm": 2.7314651012420654,
+ "learning_rate": 2.657100817827037e-06,
+ "loss": 0.4406,
+ "step": 6143
+ },
+ {
+ "epoch": 2.904964539007092,
+ "grad_norm": 2.9509520530700684,
+ "learning_rate": 2.6564782189016804e-06,
+ "loss": 0.4629,
+ "step": 6144
+ },
+ {
+ "epoch": 2.9054373522458627,
+ "grad_norm": 2.6234960556030273,
+ "learning_rate": 2.655855610233111e-06,
+ "loss": 0.4306,
+ "step": 6145
+ },
+ {
+ "epoch": 2.9059101654846335,
+ "grad_norm": 2.7209644317626953,
+ "learning_rate": 2.6552329918600962e-06,
+ "loss": 0.3643,
+ "step": 6146
+ },
+ {
+ "epoch": 2.9063829787234043,
+ "grad_norm": 2.9797747135162354,
+ "learning_rate": 2.654610363821404e-06,
+ "loss": 0.4616,
+ "step": 6147
+ },
+ {
+ "epoch": 2.906855791962175,
+ "grad_norm": 2.8179666996002197,
+ "learning_rate": 2.6539877261558016e-06,
+ "loss": 0.4526,
+ "step": 6148
+ },
+ {
+ "epoch": 2.9073286052009455,
+ "grad_norm": 2.7492244243621826,
+ "learning_rate": 2.653365078902059e-06,
+ "loss": 0.4862,
+ "step": 6149
+ },
+ {
+ "epoch": 2.9078014184397163,
+ "grad_norm": 3.0262451171875,
+ "learning_rate": 2.6527424220989457e-06,
+ "loss": 0.3728,
+ "step": 6150
+ },
+ {
+ "epoch": 2.908274231678487,
+ "grad_norm": 2.8092808723449707,
+ "learning_rate": 2.6521197557852315e-06,
+ "loss": 0.4668,
+ "step": 6151
+ },
+ {
+ "epoch": 2.9087470449172574,
+ "grad_norm": 2.915719985961914,
+ "learning_rate": 2.651497079999687e-06,
+ "loss": 0.5124,
+ "step": 6152
+ },
+ {
+ "epoch": 2.9092198581560282,
+ "grad_norm": 2.9794204235076904,
+ "learning_rate": 2.6508743947810834e-06,
+ "loss": 0.5207,
+ "step": 6153
+ },
+ {
+ "epoch": 2.909692671394799,
+ "grad_norm": 2.882453680038452,
+ "learning_rate": 2.650251700168193e-06,
+ "loss": 0.4382,
+ "step": 6154
+ },
+ {
+ "epoch": 2.91016548463357,
+ "grad_norm": 3.183680534362793,
+ "learning_rate": 2.6496289961997886e-06,
+ "loss": 0.5134,
+ "step": 6155
+ },
+ {
+ "epoch": 2.9106382978723406,
+ "grad_norm": 2.9374759197235107,
+ "learning_rate": 2.649006282914642e-06,
+ "loss": 0.4748,
+ "step": 6156
+ },
+ {
+ "epoch": 2.911111111111111,
+ "grad_norm": 2.8096041679382324,
+ "learning_rate": 2.648383560351527e-06,
+ "loss": 0.4672,
+ "step": 6157
+ },
+ {
+ "epoch": 2.911583924349882,
+ "grad_norm": 2.8799238204956055,
+ "learning_rate": 2.6477608285492196e-06,
+ "loss": 0.4679,
+ "step": 6158
+ },
+ {
+ "epoch": 2.9120567375886526,
+ "grad_norm": 2.689310073852539,
+ "learning_rate": 2.6471380875464923e-06,
+ "loss": 0.4069,
+ "step": 6159
+ },
+ {
+ "epoch": 2.912529550827423,
+ "grad_norm": 2.909323215484619,
+ "learning_rate": 2.6465153373821216e-06,
+ "loss": 0.4463,
+ "step": 6160
+ },
+ {
+ "epoch": 2.9130023640661937,
+ "grad_norm": 2.797724962234497,
+ "learning_rate": 2.6458925780948845e-06,
+ "loss": 0.4269,
+ "step": 6161
+ },
+ {
+ "epoch": 2.9134751773049645,
+ "grad_norm": 2.7533204555511475,
+ "learning_rate": 2.645269809723556e-06,
+ "loss": 0.453,
+ "step": 6162
+ },
+ {
+ "epoch": 2.9139479905437353,
+ "grad_norm": 2.6615989208221436,
+ "learning_rate": 2.6446470323069122e-06,
+ "loss": 0.3921,
+ "step": 6163
+ },
+ {
+ "epoch": 2.914420803782506,
+ "grad_norm": 3.0493314266204834,
+ "learning_rate": 2.644024245883733e-06,
+ "loss": 0.4779,
+ "step": 6164
+ },
+ {
+ "epoch": 2.9148936170212765,
+ "grad_norm": 2.649845600128174,
+ "learning_rate": 2.643401450492795e-06,
+ "loss": 0.454,
+ "step": 6165
+ },
+ {
+ "epoch": 2.9153664302600473,
+ "grad_norm": 2.7931838035583496,
+ "learning_rate": 2.642778646172877e-06,
+ "loss": 0.504,
+ "step": 6166
+ },
+ {
+ "epoch": 2.915839243498818,
+ "grad_norm": 2.9518136978149414,
+ "learning_rate": 2.64215583296276e-06,
+ "loss": 0.4767,
+ "step": 6167
+ },
+ {
+ "epoch": 2.9163120567375884,
+ "grad_norm": 2.6047427654266357,
+ "learning_rate": 2.6415330109012216e-06,
+ "loss": 0.4316,
+ "step": 6168
+ },
+ {
+ "epoch": 2.916784869976359,
+ "grad_norm": 2.7732112407684326,
+ "learning_rate": 2.640910180027044e-06,
+ "loss": 0.4213,
+ "step": 6169
+ },
+ {
+ "epoch": 2.91725768321513,
+ "grad_norm": 3.1157236099243164,
+ "learning_rate": 2.6402873403790068e-06,
+ "loss": 0.4559,
+ "step": 6170
+ },
+ {
+ "epoch": 2.917730496453901,
+ "grad_norm": 2.68424129486084,
+ "learning_rate": 2.6396644919958917e-06,
+ "loss": 0.3456,
+ "step": 6171
+ },
+ {
+ "epoch": 2.9182033096926716,
+ "grad_norm": 3.1093270778656006,
+ "learning_rate": 2.639041634916482e-06,
+ "loss": 0.4172,
+ "step": 6172
+ },
+ {
+ "epoch": 2.918676122931442,
+ "grad_norm": 2.9844655990600586,
+ "learning_rate": 2.6384187691795594e-06,
+ "loss": 0.4844,
+ "step": 6173
+ },
+ {
+ "epoch": 2.9191489361702128,
+ "grad_norm": 2.907151222229004,
+ "learning_rate": 2.637795894823906e-06,
+ "loss": 0.5126,
+ "step": 6174
+ },
+ {
+ "epoch": 2.9196217494089836,
+ "grad_norm": 2.804105520248413,
+ "learning_rate": 2.637173011888307e-06,
+ "loss": 0.3919,
+ "step": 6175
+ },
+ {
+ "epoch": 2.920094562647754,
+ "grad_norm": 2.8809266090393066,
+ "learning_rate": 2.636550120411547e-06,
+ "loss": 0.4468,
+ "step": 6176
+ },
+ {
+ "epoch": 2.9205673758865247,
+ "grad_norm": 2.686290979385376,
+ "learning_rate": 2.6359272204324087e-06,
+ "loss": 0.4352,
+ "step": 6177
+ },
+ {
+ "epoch": 2.9210401891252955,
+ "grad_norm": 2.448101758956909,
+ "learning_rate": 2.635304311989678e-06,
+ "loss": 0.4218,
+ "step": 6178
+ },
+ {
+ "epoch": 2.9215130023640663,
+ "grad_norm": 2.81024169921875,
+ "learning_rate": 2.6346813951221416e-06,
+ "loss": 0.5177,
+ "step": 6179
+ },
+ {
+ "epoch": 2.921985815602837,
+ "grad_norm": 2.7590086460113525,
+ "learning_rate": 2.6340584698685856e-06,
+ "loss": 0.3897,
+ "step": 6180
+ },
+ {
+ "epoch": 2.9224586288416075,
+ "grad_norm": 3.1226227283477783,
+ "learning_rate": 2.6334355362677965e-06,
+ "loss": 0.4595,
+ "step": 6181
+ },
+ {
+ "epoch": 2.9229314420803783,
+ "grad_norm": 2.673828125,
+ "learning_rate": 2.6328125943585607e-06,
+ "loss": 0.4932,
+ "step": 6182
+ },
+ {
+ "epoch": 2.923404255319149,
+ "grad_norm": 2.8297293186187744,
+ "learning_rate": 2.632189644179668e-06,
+ "loss": 0.3819,
+ "step": 6183
+ },
+ {
+ "epoch": 2.9238770685579194,
+ "grad_norm": 2.9661548137664795,
+ "learning_rate": 2.6315666857699056e-06,
+ "loss": 0.4419,
+ "step": 6184
+ },
+ {
+ "epoch": 2.92434988179669,
+ "grad_norm": 2.9745798110961914,
+ "learning_rate": 2.6309437191680627e-06,
+ "loss": 0.4423,
+ "step": 6185
+ },
+ {
+ "epoch": 2.924822695035461,
+ "grad_norm": 2.8351712226867676,
+ "learning_rate": 2.6303207444129285e-06,
+ "loss": 0.5043,
+ "step": 6186
+ },
+ {
+ "epoch": 2.925295508274232,
+ "grad_norm": 2.6442384719848633,
+ "learning_rate": 2.6296977615432927e-06,
+ "loss": 0.4431,
+ "step": 6187
+ },
+ {
+ "epoch": 2.9257683215130026,
+ "grad_norm": 2.4128029346466064,
+ "learning_rate": 2.6290747705979457e-06,
+ "loss": 0.3603,
+ "step": 6188
+ },
+ {
+ "epoch": 2.926241134751773,
+ "grad_norm": 2.730424642562866,
+ "learning_rate": 2.6284517716156786e-06,
+ "loss": 0.439,
+ "step": 6189
+ },
+ {
+ "epoch": 2.9267139479905437,
+ "grad_norm": 2.6215405464172363,
+ "learning_rate": 2.627828764635284e-06,
+ "loss": 0.4117,
+ "step": 6190
+ },
+ {
+ "epoch": 2.9271867612293145,
+ "grad_norm": 2.56585955619812,
+ "learning_rate": 2.627205749695552e-06,
+ "loss": 0.4404,
+ "step": 6191
+ },
+ {
+ "epoch": 2.927659574468085,
+ "grad_norm": 2.9587886333465576,
+ "learning_rate": 2.6265827268352763e-06,
+ "loss": 0.4295,
+ "step": 6192
+ },
+ {
+ "epoch": 2.9281323877068557,
+ "grad_norm": 2.6611828804016113,
+ "learning_rate": 2.625959696093249e-06,
+ "loss": 0.4441,
+ "step": 6193
+ },
+ {
+ "epoch": 2.9286052009456265,
+ "grad_norm": 2.4391369819641113,
+ "learning_rate": 2.6253366575082634e-06,
+ "loss": 0.4447,
+ "step": 6194
+ },
+ {
+ "epoch": 2.9290780141843973,
+ "grad_norm": 2.710763454437256,
+ "learning_rate": 2.6247136111191144e-06,
+ "loss": 0.4662,
+ "step": 6195
+ },
+ {
+ "epoch": 2.929550827423168,
+ "grad_norm": 2.770697593688965,
+ "learning_rate": 2.6240905569645952e-06,
+ "loss": 0.4263,
+ "step": 6196
+ },
+ {
+ "epoch": 2.9300236406619384,
+ "grad_norm": 2.5885732173919678,
+ "learning_rate": 2.623467495083501e-06,
+ "loss": 0.4303,
+ "step": 6197
+ },
+ {
+ "epoch": 2.9304964539007092,
+ "grad_norm": 2.5716748237609863,
+ "learning_rate": 2.6228444255146274e-06,
+ "loss": 0.3714,
+ "step": 6198
+ },
+ {
+ "epoch": 2.93096926713948,
+ "grad_norm": 3.0437910556793213,
+ "learning_rate": 2.6222213482967703e-06,
+ "loss": 0.4077,
+ "step": 6199
+ },
+ {
+ "epoch": 2.9314420803782504,
+ "grad_norm": 2.7861344814300537,
+ "learning_rate": 2.6215982634687253e-06,
+ "loss": 0.4157,
+ "step": 6200
+ },
+ {
+ "epoch": 2.931914893617021,
+ "grad_norm": 2.5265355110168457,
+ "learning_rate": 2.6209751710692905e-06,
+ "loss": 0.4586,
+ "step": 6201
+ },
+ {
+ "epoch": 2.932387706855792,
+ "grad_norm": 2.940112590789795,
+ "learning_rate": 2.6203520711372615e-06,
+ "loss": 0.4208,
+ "step": 6202
+ },
+ {
+ "epoch": 2.9328605200945628,
+ "grad_norm": 2.7124581336975098,
+ "learning_rate": 2.6197289637114363e-06,
+ "loss": 0.4173,
+ "step": 6203
+ },
+ {
+ "epoch": 2.9333333333333336,
+ "grad_norm": 2.818523406982422,
+ "learning_rate": 2.619105848830615e-06,
+ "loss": 0.4349,
+ "step": 6204
+ },
+ {
+ "epoch": 2.933806146572104,
+ "grad_norm": 2.7630393505096436,
+ "learning_rate": 2.6184827265335937e-06,
+ "loss": 0.5078,
+ "step": 6205
+ },
+ {
+ "epoch": 2.9342789598108747,
+ "grad_norm": 3.0554699897766113,
+ "learning_rate": 2.6178595968591726e-06,
+ "loss": 0.4712,
+ "step": 6206
+ },
+ {
+ "epoch": 2.9347517730496455,
+ "grad_norm": 2.721992254257202,
+ "learning_rate": 2.6172364598461507e-06,
+ "loss": 0.4847,
+ "step": 6207
+ },
+ {
+ "epoch": 2.935224586288416,
+ "grad_norm": 2.809663772583008,
+ "learning_rate": 2.6166133155333303e-06,
+ "loss": 0.4447,
+ "step": 6208
+ },
+ {
+ "epoch": 2.9356973995271867,
+ "grad_norm": 2.568394660949707,
+ "learning_rate": 2.6159901639595088e-06,
+ "loss": 0.4543,
+ "step": 6209
+ },
+ {
+ "epoch": 2.9361702127659575,
+ "grad_norm": 3.3670637607574463,
+ "learning_rate": 2.6153670051634884e-06,
+ "loss": 0.4901,
+ "step": 6210
+ },
+ {
+ "epoch": 2.9366430260047283,
+ "grad_norm": 3.082508087158203,
+ "learning_rate": 2.614743839184071e-06,
+ "loss": 0.4862,
+ "step": 6211
+ },
+ {
+ "epoch": 2.937115839243499,
+ "grad_norm": 2.692139148712158,
+ "learning_rate": 2.6141206660600566e-06,
+ "loss": 0.5199,
+ "step": 6212
+ },
+ {
+ "epoch": 2.9375886524822694,
+ "grad_norm": 3.231433391571045,
+ "learning_rate": 2.6134974858302504e-06,
+ "loss": 0.464,
+ "step": 6213
+ },
+ {
+ "epoch": 2.93806146572104,
+ "grad_norm": 3.224238157272339,
+ "learning_rate": 2.612874298533452e-06,
+ "loss": 0.4507,
+ "step": 6214
+ },
+ {
+ "epoch": 2.938534278959811,
+ "grad_norm": 2.812755584716797,
+ "learning_rate": 2.6122511042084663e-06,
+ "loss": 0.4527,
+ "step": 6215
+ },
+ {
+ "epoch": 2.9390070921985814,
+ "grad_norm": 2.837811231613159,
+ "learning_rate": 2.611627902894098e-06,
+ "loss": 0.4782,
+ "step": 6216
+ },
+ {
+ "epoch": 2.939479905437352,
+ "grad_norm": 3.093817710876465,
+ "learning_rate": 2.6110046946291476e-06,
+ "loss": 0.4933,
+ "step": 6217
+ },
+ {
+ "epoch": 2.939952718676123,
+ "grad_norm": 2.950119733810425,
+ "learning_rate": 2.6103814794524235e-06,
+ "loss": 0.4884,
+ "step": 6218
+ },
+ {
+ "epoch": 2.9404255319148938,
+ "grad_norm": 2.469681978225708,
+ "learning_rate": 2.6097582574027274e-06,
+ "loss": 0.4135,
+ "step": 6219
+ },
+ {
+ "epoch": 2.9408983451536646,
+ "grad_norm": 2.779238224029541,
+ "learning_rate": 2.609135028518866e-06,
+ "loss": 0.5165,
+ "step": 6220
+ },
+ {
+ "epoch": 2.941371158392435,
+ "grad_norm": 2.807705879211426,
+ "learning_rate": 2.608511792839645e-06,
+ "loss": 0.4046,
+ "step": 6221
+ },
+ {
+ "epoch": 2.9418439716312057,
+ "grad_norm": 2.6067750453948975,
+ "learning_rate": 2.607888550403871e-06,
+ "loss": 0.406,
+ "step": 6222
+ },
+ {
+ "epoch": 2.9423167848699765,
+ "grad_norm": 2.865766763687134,
+ "learning_rate": 2.607265301250349e-06,
+ "loss": 0.471,
+ "step": 6223
+ },
+ {
+ "epoch": 2.942789598108747,
+ "grad_norm": 2.977681875228882,
+ "learning_rate": 2.6066420454178876e-06,
+ "loss": 0.4666,
+ "step": 6224
+ },
+ {
+ "epoch": 2.9432624113475176,
+ "grad_norm": 2.870884418487549,
+ "learning_rate": 2.606018782945294e-06,
+ "loss": 0.4768,
+ "step": 6225
+ },
+ {
+ "epoch": 2.9437352245862884,
+ "grad_norm": 2.992851495742798,
+ "learning_rate": 2.6053955138713756e-06,
+ "loss": 0.4657,
+ "step": 6226
+ },
+ {
+ "epoch": 2.9442080378250592,
+ "grad_norm": 2.7279815673828125,
+ "learning_rate": 2.6047722382349406e-06,
+ "loss": 0.4087,
+ "step": 6227
+ },
+ {
+ "epoch": 2.94468085106383,
+ "grad_norm": 2.8587028980255127,
+ "learning_rate": 2.604148956074797e-06,
+ "loss": 0.4452,
+ "step": 6228
+ },
+ {
+ "epoch": 2.9451536643026004,
+ "grad_norm": 3.001694679260254,
+ "learning_rate": 2.6035256674297555e-06,
+ "loss": 0.4852,
+ "step": 6229
+ },
+ {
+ "epoch": 2.945626477541371,
+ "grad_norm": 2.858069896697998,
+ "learning_rate": 2.6029023723386237e-06,
+ "loss": 0.4281,
+ "step": 6230
+ },
+ {
+ "epoch": 2.946099290780142,
+ "grad_norm": 2.675856828689575,
+ "learning_rate": 2.602279070840213e-06,
+ "loss": 0.4545,
+ "step": 6231
+ },
+ {
+ "epoch": 2.9465721040189123,
+ "grad_norm": 2.530245065689087,
+ "learning_rate": 2.6016557629733334e-06,
+ "loss": 0.4619,
+ "step": 6232
+ },
+ {
+ "epoch": 2.947044917257683,
+ "grad_norm": 2.7533743381500244,
+ "learning_rate": 2.601032448776795e-06,
+ "loss": 0.4879,
+ "step": 6233
+ },
+ {
+ "epoch": 2.947517730496454,
+ "grad_norm": 3.130453109741211,
+ "learning_rate": 2.600409128289409e-06,
+ "loss": 0.4056,
+ "step": 6234
+ },
+ {
+ "epoch": 2.9479905437352247,
+ "grad_norm": 3.4736509323120117,
+ "learning_rate": 2.5997858015499867e-06,
+ "loss": 0.5063,
+ "step": 6235
+ },
+ {
+ "epoch": 2.9484633569739955,
+ "grad_norm": 2.871978282928467,
+ "learning_rate": 2.5991624685973406e-06,
+ "loss": 0.4562,
+ "step": 6236
+ },
+ {
+ "epoch": 2.948936170212766,
+ "grad_norm": 2.976503372192383,
+ "learning_rate": 2.5985391294702817e-06,
+ "loss": 0.5079,
+ "step": 6237
+ },
+ {
+ "epoch": 2.9494089834515367,
+ "grad_norm": 2.578122615814209,
+ "learning_rate": 2.597915784207623e-06,
+ "loss": 0.4069,
+ "step": 6238
+ },
+ {
+ "epoch": 2.9498817966903075,
+ "grad_norm": 2.885911226272583,
+ "learning_rate": 2.597292432848178e-06,
+ "loss": 0.4382,
+ "step": 6239
+ },
+ {
+ "epoch": 2.950354609929078,
+ "grad_norm": 2.9301681518554688,
+ "learning_rate": 2.5966690754307605e-06,
+ "loss": 0.4888,
+ "step": 6240
+ },
+ {
+ "epoch": 2.9508274231678486,
+ "grad_norm": 2.9912192821502686,
+ "learning_rate": 2.5960457119941834e-06,
+ "loss": 0.4699,
+ "step": 6241
+ },
+ {
+ "epoch": 2.9513002364066194,
+ "grad_norm": 2.6612601280212402,
+ "learning_rate": 2.5954223425772607e-06,
+ "loss": 0.3736,
+ "step": 6242
+ },
+ {
+ "epoch": 2.9517730496453902,
+ "grad_norm": 2.9325380325317383,
+ "learning_rate": 2.5947989672188067e-06,
+ "loss": 0.4771,
+ "step": 6243
+ },
+ {
+ "epoch": 2.952245862884161,
+ "grad_norm": 2.8143959045410156,
+ "learning_rate": 2.594175585957637e-06,
+ "loss": 0.5103,
+ "step": 6244
+ },
+ {
+ "epoch": 2.9527186761229314,
+ "grad_norm": 2.355078935623169,
+ "learning_rate": 2.5935521988325674e-06,
+ "loss": 0.44,
+ "step": 6245
+ },
+ {
+ "epoch": 2.953191489361702,
+ "grad_norm": 2.733156442642212,
+ "learning_rate": 2.5929288058824114e-06,
+ "loss": 0.4306,
+ "step": 6246
+ },
+ {
+ "epoch": 2.953664302600473,
+ "grad_norm": 3.182563304901123,
+ "learning_rate": 2.5923054071459865e-06,
+ "loss": 0.417,
+ "step": 6247
+ },
+ {
+ "epoch": 2.9541371158392433,
+ "grad_norm": 2.4162323474884033,
+ "learning_rate": 2.5916820026621094e-06,
+ "loss": 0.3802,
+ "step": 6248
+ },
+ {
+ "epoch": 2.954609929078014,
+ "grad_norm": 2.772706985473633,
+ "learning_rate": 2.591058592469595e-06,
+ "loss": 0.4654,
+ "step": 6249
+ },
+ {
+ "epoch": 2.955082742316785,
+ "grad_norm": 2.6011102199554443,
+ "learning_rate": 2.5904351766072616e-06,
+ "loss": 0.4619,
+ "step": 6250
+ },
+ {
+ "epoch": 2.9555555555555557,
+ "grad_norm": 2.5700361728668213,
+ "learning_rate": 2.589811755113926e-06,
+ "loss": 0.3991,
+ "step": 6251
+ },
+ {
+ "epoch": 2.9560283687943265,
+ "grad_norm": 2.6444971561431885,
+ "learning_rate": 2.589188328028407e-06,
+ "loss": 0.4388,
+ "step": 6252
+ },
+ {
+ "epoch": 2.956501182033097,
+ "grad_norm": 2.739567279815674,
+ "learning_rate": 2.588564895389521e-06,
+ "loss": 0.4193,
+ "step": 6253
+ },
+ {
+ "epoch": 2.9569739952718677,
+ "grad_norm": 2.7070045471191406,
+ "learning_rate": 2.5879414572360877e-06,
+ "loss": 0.4347,
+ "step": 6254
+ },
+ {
+ "epoch": 2.9574468085106385,
+ "grad_norm": 2.7811532020568848,
+ "learning_rate": 2.587318013606926e-06,
+ "loss": 0.43,
+ "step": 6255
+ },
+ {
+ "epoch": 2.957919621749409,
+ "grad_norm": 3.0036091804504395,
+ "learning_rate": 2.5866945645408537e-06,
+ "loss": 0.4855,
+ "step": 6256
+ },
+ {
+ "epoch": 2.9583924349881796,
+ "grad_norm": 2.948573112487793,
+ "learning_rate": 2.5860711100766918e-06,
+ "loss": 0.4594,
+ "step": 6257
+ },
+ {
+ "epoch": 2.9588652482269504,
+ "grad_norm": 2.6371593475341797,
+ "learning_rate": 2.5854476502532583e-06,
+ "loss": 0.446,
+ "step": 6258
+ },
+ {
+ "epoch": 2.959338061465721,
+ "grad_norm": 2.668677806854248,
+ "learning_rate": 2.5848241851093754e-06,
+ "loss": 0.3991,
+ "step": 6259
+ },
+ {
+ "epoch": 2.959810874704492,
+ "grad_norm": 3.1640663146972656,
+ "learning_rate": 2.5842007146838614e-06,
+ "loss": 0.5146,
+ "step": 6260
+ },
+ {
+ "epoch": 2.9602836879432624,
+ "grad_norm": 2.9412102699279785,
+ "learning_rate": 2.5835772390155382e-06,
+ "loss": 0.4798,
+ "step": 6261
+ },
+ {
+ "epoch": 2.960756501182033,
+ "grad_norm": 2.7674343585968018,
+ "learning_rate": 2.582953758143227e-06,
+ "loss": 0.4262,
+ "step": 6262
+ },
+ {
+ "epoch": 2.961229314420804,
+ "grad_norm": 3.5219457149505615,
+ "learning_rate": 2.582330272105749e-06,
+ "loss": 0.4905,
+ "step": 6263
+ },
+ {
+ "epoch": 2.9617021276595743,
+ "grad_norm": 2.4274468421936035,
+ "learning_rate": 2.5817067809419267e-06,
+ "loss": 0.4048,
+ "step": 6264
+ },
+ {
+ "epoch": 2.962174940898345,
+ "grad_norm": 2.6907944679260254,
+ "learning_rate": 2.5810832846905814e-06,
+ "loss": 0.388,
+ "step": 6265
+ },
+ {
+ "epoch": 2.962647754137116,
+ "grad_norm": 2.603151321411133,
+ "learning_rate": 2.5804597833905347e-06,
+ "loss": 0.4377,
+ "step": 6266
+ },
+ {
+ "epoch": 2.9631205673758867,
+ "grad_norm": 2.685837507247925,
+ "learning_rate": 2.57983627708061e-06,
+ "loss": 0.4409,
+ "step": 6267
+ },
+ {
+ "epoch": 2.963593380614657,
+ "grad_norm": 2.8281500339508057,
+ "learning_rate": 2.579212765799631e-06,
+ "loss": 0.4567,
+ "step": 6268
+ },
+ {
+ "epoch": 2.964066193853428,
+ "grad_norm": 2.6387875080108643,
+ "learning_rate": 2.57858924958642e-06,
+ "loss": 0.4061,
+ "step": 6269
+ },
+ {
+ "epoch": 2.9645390070921986,
+ "grad_norm": 2.64139986038208,
+ "learning_rate": 2.5779657284798017e-06,
+ "loss": 0.4539,
+ "step": 6270
+ },
+ {
+ "epoch": 2.965011820330969,
+ "grad_norm": 2.7384836673736572,
+ "learning_rate": 2.5773422025185983e-06,
+ "loss": 0.408,
+ "step": 6271
+ },
+ {
+ "epoch": 2.96548463356974,
+ "grad_norm": 2.262514352798462,
+ "learning_rate": 2.576718671741636e-06,
+ "loss": 0.3726,
+ "step": 6272
+ },
+ {
+ "epoch": 2.9659574468085106,
+ "grad_norm": 2.53800106048584,
+ "learning_rate": 2.5760951361877384e-06,
+ "loss": 0.4716,
+ "step": 6273
+ },
+ {
+ "epoch": 2.9664302600472814,
+ "grad_norm": 3.256701707839966,
+ "learning_rate": 2.57547159589573e-06,
+ "loss": 0.518,
+ "step": 6274
+ },
+ {
+ "epoch": 2.966903073286052,
+ "grad_norm": 2.9427342414855957,
+ "learning_rate": 2.574848050904436e-06,
+ "loss": 0.4255,
+ "step": 6275
+ },
+ {
+ "epoch": 2.9673758865248225,
+ "grad_norm": 2.5794098377227783,
+ "learning_rate": 2.574224501252682e-06,
+ "loss": 0.4412,
+ "step": 6276
+ },
+ {
+ "epoch": 2.9678486997635933,
+ "grad_norm": 2.5894877910614014,
+ "learning_rate": 2.573600946979294e-06,
+ "loss": 0.4356,
+ "step": 6277
+ },
+ {
+ "epoch": 2.968321513002364,
+ "grad_norm": 2.9597361087799072,
+ "learning_rate": 2.572977388123098e-06,
+ "loss": 0.4376,
+ "step": 6278
+ },
+ {
+ "epoch": 2.9687943262411345,
+ "grad_norm": 2.779303550720215,
+ "learning_rate": 2.5723538247229197e-06,
+ "loss": 0.3985,
+ "step": 6279
+ },
+ {
+ "epoch": 2.9692671394799053,
+ "grad_norm": 2.9173855781555176,
+ "learning_rate": 2.5717302568175866e-06,
+ "loss": 0.4581,
+ "step": 6280
+ },
+ {
+ "epoch": 2.969739952718676,
+ "grad_norm": 2.703721284866333,
+ "learning_rate": 2.5711066844459242e-06,
+ "loss": 0.3705,
+ "step": 6281
+ },
+ {
+ "epoch": 2.970212765957447,
+ "grad_norm": 2.5415029525756836,
+ "learning_rate": 2.5704831076467613e-06,
+ "loss": 0.4089,
+ "step": 6282
+ },
+ {
+ "epoch": 2.9706855791962177,
+ "grad_norm": 2.791780948638916,
+ "learning_rate": 2.5698595264589234e-06,
+ "loss": 0.4357,
+ "step": 6283
+ },
+ {
+ "epoch": 2.971158392434988,
+ "grad_norm": 2.887662887573242,
+ "learning_rate": 2.5692359409212392e-06,
+ "loss": 0.4093,
+ "step": 6284
+ },
+ {
+ "epoch": 2.971631205673759,
+ "grad_norm": 3.0309557914733887,
+ "learning_rate": 2.5686123510725364e-06,
+ "loss": 0.4461,
+ "step": 6285
+ },
+ {
+ "epoch": 2.9721040189125296,
+ "grad_norm": 2.6861515045166016,
+ "learning_rate": 2.5679887569516437e-06,
+ "loss": 0.4199,
+ "step": 6286
+ },
+ {
+ "epoch": 2.9725768321513,
+ "grad_norm": 2.7014012336730957,
+ "learning_rate": 2.5673651585973897e-06,
+ "loss": 0.4373,
+ "step": 6287
+ },
+ {
+ "epoch": 2.9730496453900708,
+ "grad_norm": 2.951265811920166,
+ "learning_rate": 2.5667415560486026e-06,
+ "loss": 0.4426,
+ "step": 6288
+ },
+ {
+ "epoch": 2.9735224586288416,
+ "grad_norm": 2.7664504051208496,
+ "learning_rate": 2.5661179493441106e-06,
+ "loss": 0.474,
+ "step": 6289
+ },
+ {
+ "epoch": 2.9739952718676124,
+ "grad_norm": 2.6081087589263916,
+ "learning_rate": 2.5654943385227445e-06,
+ "loss": 0.4058,
+ "step": 6290
+ },
+ {
+ "epoch": 2.974468085106383,
+ "grad_norm": 2.9416966438293457,
+ "learning_rate": 2.564870723623333e-06,
+ "loss": 0.506,
+ "step": 6291
+ },
+ {
+ "epoch": 2.9749408983451535,
+ "grad_norm": 2.9441659450531006,
+ "learning_rate": 2.564247104684706e-06,
+ "loss": 0.4505,
+ "step": 6292
+ },
+ {
+ "epoch": 2.9754137115839243,
+ "grad_norm": 2.7110862731933594,
+ "learning_rate": 2.563623481745693e-06,
+ "loss": 0.4493,
+ "step": 6293
+ },
+ {
+ "epoch": 2.975886524822695,
+ "grad_norm": 2.88459849357605,
+ "learning_rate": 2.562999854845125e-06,
+ "loss": 0.4462,
+ "step": 6294
+ },
+ {
+ "epoch": 2.9763593380614655,
+ "grad_norm": 3.0491793155670166,
+ "learning_rate": 2.5623762240218327e-06,
+ "loss": 0.4928,
+ "step": 6295
+ },
+ {
+ "epoch": 2.9768321513002363,
+ "grad_norm": 2.9475483894348145,
+ "learning_rate": 2.561752589314646e-06,
+ "loss": 0.4535,
+ "step": 6296
+ },
+ {
+ "epoch": 2.977304964539007,
+ "grad_norm": 2.879495859146118,
+ "learning_rate": 2.561128950762397e-06,
+ "loss": 0.4393,
+ "step": 6297
+ },
+ {
+ "epoch": 2.977777777777778,
+ "grad_norm": 2.8478336334228516,
+ "learning_rate": 2.560505308403916e-06,
+ "loss": 0.4363,
+ "step": 6298
+ },
+ {
+ "epoch": 2.9782505910165487,
+ "grad_norm": 2.5475094318389893,
+ "learning_rate": 2.5598816622780343e-06,
+ "loss": 0.3825,
+ "step": 6299
+ },
+ {
+ "epoch": 2.978723404255319,
+ "grad_norm": 2.85430908203125,
+ "learning_rate": 2.5592580124235838e-06,
+ "loss": 0.4226,
+ "step": 6300
+ },
+ {
+ "epoch": 2.97919621749409,
+ "grad_norm": 2.569775104522705,
+ "learning_rate": 2.5586343588793975e-06,
+ "loss": 0.4045,
+ "step": 6301
+ },
+ {
+ "epoch": 2.9796690307328606,
+ "grad_norm": 2.4482202529907227,
+ "learning_rate": 2.558010701684307e-06,
+ "loss": 0.4625,
+ "step": 6302
+ },
+ {
+ "epoch": 2.980141843971631,
+ "grad_norm": 2.9301230907440186,
+ "learning_rate": 2.5573870408771436e-06,
+ "loss": 0.4358,
+ "step": 6303
+ },
+ {
+ "epoch": 2.9806146572104018,
+ "grad_norm": 2.9865870475769043,
+ "learning_rate": 2.5567633764967416e-06,
+ "loss": 0.497,
+ "step": 6304
+ },
+ {
+ "epoch": 2.9810874704491725,
+ "grad_norm": 2.523524522781372,
+ "learning_rate": 2.556139708581933e-06,
+ "loss": 0.4141,
+ "step": 6305
+ },
+ {
+ "epoch": 2.9815602836879433,
+ "grad_norm": 2.8489344120025635,
+ "learning_rate": 2.5555160371715504e-06,
+ "loss": 0.4205,
+ "step": 6306
+ },
+ {
+ "epoch": 2.982033096926714,
+ "grad_norm": 2.417759895324707,
+ "learning_rate": 2.5548923623044274e-06,
+ "loss": 0.44,
+ "step": 6307
+ },
+ {
+ "epoch": 2.9825059101654845,
+ "grad_norm": 2.7626900672912598,
+ "learning_rate": 2.554268684019398e-06,
+ "loss": 0.4646,
+ "step": 6308
+ },
+ {
+ "epoch": 2.9829787234042553,
+ "grad_norm": 3.0916266441345215,
+ "learning_rate": 2.5536450023552956e-06,
+ "loss": 0.4443,
+ "step": 6309
+ },
+ {
+ "epoch": 2.983451536643026,
+ "grad_norm": 2.721992015838623,
+ "learning_rate": 2.5530213173509542e-06,
+ "loss": 0.4008,
+ "step": 6310
+ },
+ {
+ "epoch": 2.9839243498817964,
+ "grad_norm": 2.825334072113037,
+ "learning_rate": 2.552397629045208e-06,
+ "loss": 0.4513,
+ "step": 6311
+ },
+ {
+ "epoch": 2.9843971631205672,
+ "grad_norm": 2.912050485610962,
+ "learning_rate": 2.5517739374768915e-06,
+ "loss": 0.4104,
+ "step": 6312
+ },
+ {
+ "epoch": 2.984869976359338,
+ "grad_norm": 2.760650634765625,
+ "learning_rate": 2.551150242684838e-06,
+ "loss": 0.4372,
+ "step": 6313
+ },
+ {
+ "epoch": 2.985342789598109,
+ "grad_norm": 2.8926033973693848,
+ "learning_rate": 2.5505265447078838e-06,
+ "loss": 0.475,
+ "step": 6314
+ },
+ {
+ "epoch": 2.9858156028368796,
+ "grad_norm": 2.6279892921447754,
+ "learning_rate": 2.5499028435848633e-06,
+ "loss": 0.4589,
+ "step": 6315
+ },
+ {
+ "epoch": 2.98628841607565,
+ "grad_norm": 3.2147316932678223,
+ "learning_rate": 2.549279139354611e-06,
+ "loss": 0.4968,
+ "step": 6316
+ },
+ {
+ "epoch": 2.986761229314421,
+ "grad_norm": 2.4510674476623535,
+ "learning_rate": 2.5486554320559626e-06,
+ "loss": 0.4291,
+ "step": 6317
+ },
+ {
+ "epoch": 2.9872340425531916,
+ "grad_norm": 2.6919643878936768,
+ "learning_rate": 2.5480317217277544e-06,
+ "loss": 0.4704,
+ "step": 6318
+ },
+ {
+ "epoch": 2.987706855791962,
+ "grad_norm": 2.9832234382629395,
+ "learning_rate": 2.5474080084088215e-06,
+ "loss": 0.4129,
+ "step": 6319
+ },
+ {
+ "epoch": 2.9881796690307327,
+ "grad_norm": 2.893209218978882,
+ "learning_rate": 2.5467842921380004e-06,
+ "loss": 0.5099,
+ "step": 6320
+ },
+ {
+ "epoch": 2.9886524822695035,
+ "grad_norm": 2.6734580993652344,
+ "learning_rate": 2.5461605729541254e-06,
+ "loss": 0.4588,
+ "step": 6321
+ },
+ {
+ "epoch": 2.9891252955082743,
+ "grad_norm": 2.5591681003570557,
+ "learning_rate": 2.5455368508960343e-06,
+ "loss": 0.4162,
+ "step": 6322
+ },
+ {
+ "epoch": 2.989598108747045,
+ "grad_norm": 3.2619881629943848,
+ "learning_rate": 2.5449131260025626e-06,
+ "loss": 0.4412,
+ "step": 6323
+ },
+ {
+ "epoch": 2.9900709219858155,
+ "grad_norm": 2.897914409637451,
+ "learning_rate": 2.544289398312549e-06,
+ "loss": 0.5079,
+ "step": 6324
+ },
+ {
+ "epoch": 2.9905437352245863,
+ "grad_norm": 2.7891685962677,
+ "learning_rate": 2.5436656678648274e-06,
+ "loss": 0.42,
+ "step": 6325
+ },
+ {
+ "epoch": 2.991016548463357,
+ "grad_norm": 3.022341728210449,
+ "learning_rate": 2.5430419346982367e-06,
+ "loss": 0.4739,
+ "step": 6326
+ },
+ {
+ "epoch": 2.9914893617021274,
+ "grad_norm": 3.395775556564331,
+ "learning_rate": 2.542418198851614e-06,
+ "loss": 0.4822,
+ "step": 6327
+ },
+ {
+ "epoch": 2.9919621749408982,
+ "grad_norm": 3.0200490951538086,
+ "learning_rate": 2.541794460363795e-06,
+ "loss": 0.4755,
+ "step": 6328
+ },
+ {
+ "epoch": 2.992434988179669,
+ "grad_norm": 3.302020311355591,
+ "learning_rate": 2.541170719273619e-06,
+ "loss": 0.4603,
+ "step": 6329
+ },
+ {
+ "epoch": 2.99290780141844,
+ "grad_norm": 2.5985910892486572,
+ "learning_rate": 2.5405469756199226e-06,
+ "loss": 0.4475,
+ "step": 6330
+ },
+ {
+ "epoch": 2.9933806146572106,
+ "grad_norm": 2.9413928985595703,
+ "learning_rate": 2.5399232294415434e-06,
+ "loss": 0.4695,
+ "step": 6331
+ },
+ {
+ "epoch": 2.993853427895981,
+ "grad_norm": 2.942777156829834,
+ "learning_rate": 2.53929948077732e-06,
+ "loss": 0.4462,
+ "step": 6332
+ },
+ {
+ "epoch": 2.9943262411347518,
+ "grad_norm": 2.971120595932007,
+ "learning_rate": 2.53867572966609e-06,
+ "loss": 0.4546,
+ "step": 6333
+ },
+ {
+ "epoch": 2.9947990543735226,
+ "grad_norm": 2.8248138427734375,
+ "learning_rate": 2.5380519761466927e-06,
+ "loss": 0.453,
+ "step": 6334
+ },
+ {
+ "epoch": 2.995271867612293,
+ "grad_norm": 3.0819008350372314,
+ "learning_rate": 2.5374282202579647e-06,
+ "loss": 0.4774,
+ "step": 6335
+ },
+ {
+ "epoch": 2.9957446808510637,
+ "grad_norm": 2.742570161819458,
+ "learning_rate": 2.5368044620387466e-06,
+ "loss": 0.5059,
+ "step": 6336
+ },
+ {
+ "epoch": 2.9962174940898345,
+ "grad_norm": 2.9087419509887695,
+ "learning_rate": 2.5361807015278757e-06,
+ "loss": 0.3606,
+ "step": 6337
+ },
+ {
+ "epoch": 2.9966903073286053,
+ "grad_norm": 2.6887354850769043,
+ "learning_rate": 2.5355569387641908e-06,
+ "loss": 0.4247,
+ "step": 6338
+ },
+ {
+ "epoch": 2.997163120567376,
+ "grad_norm": 2.8516008853912354,
+ "learning_rate": 2.534933173786531e-06,
+ "loss": 0.4502,
+ "step": 6339
+ },
+ {
+ "epoch": 2.9976359338061465,
+ "grad_norm": 2.4463164806365967,
+ "learning_rate": 2.5343094066337366e-06,
+ "loss": 0.3883,
+ "step": 6340
+ },
+ {
+ "epoch": 2.9981087470449173,
+ "grad_norm": 2.87025785446167,
+ "learning_rate": 2.533685637344645e-06,
+ "loss": 0.4534,
+ "step": 6341
+ },
+ {
+ "epoch": 2.998581560283688,
+ "grad_norm": 3.0706169605255127,
+ "learning_rate": 2.5330618659580967e-06,
+ "loss": 0.5426,
+ "step": 6342
+ },
+ {
+ "epoch": 2.9990543735224584,
+ "grad_norm": 2.7185773849487305,
+ "learning_rate": 2.532438092512931e-06,
+ "loss": 0.497,
+ "step": 6343
+ },
+ {
+ "epoch": 2.999527186761229,
+ "grad_norm": 2.840207815170288,
+ "learning_rate": 2.531814317047988e-06,
+ "loss": 0.4073,
+ "step": 6344
+ },
+ {
+ "epoch": 3.0,
+ "grad_norm": 3.1592655181884766,
+ "learning_rate": 2.5311905396021063e-06,
+ "loss": 0.4728,
+ "step": 6345
+ },
+ {
+ "epoch": 3.000472813238771,
+ "grad_norm": 2.190042495727539,
+ "learning_rate": 2.530566760214127e-06,
+ "loss": 0.3588,
+ "step": 6346
+ },
+ {
+ "epoch": 3.000945626477541,
+ "grad_norm": 2.749516248703003,
+ "learning_rate": 2.5299429789228898e-06,
+ "loss": 0.3495,
+ "step": 6347
+ },
+ {
+ "epoch": 3.001418439716312,
+ "grad_norm": 2.6181938648223877,
+ "learning_rate": 2.5293191957672335e-06,
+ "loss": 0.3611,
+ "step": 6348
+ },
+ {
+ "epoch": 3.0018912529550827,
+ "grad_norm": 2.7235212326049805,
+ "learning_rate": 2.528695410786e-06,
+ "loss": 0.4173,
+ "step": 6349
+ },
+ {
+ "epoch": 3.0023640661938535,
+ "grad_norm": 2.5408031940460205,
+ "learning_rate": 2.528071624018029e-06,
+ "loss": 0.3651,
+ "step": 6350
+ },
+ {
+ "epoch": 3.002836879432624,
+ "grad_norm": 2.7824409008026123,
+ "learning_rate": 2.5274478355021615e-06,
+ "loss": 0.378,
+ "step": 6351
+ },
+ {
+ "epoch": 3.0033096926713947,
+ "grad_norm": 2.7671427726745605,
+ "learning_rate": 2.526824045277238e-06,
+ "loss": 0.446,
+ "step": 6352
+ },
+ {
+ "epoch": 3.0037825059101655,
+ "grad_norm": 2.6746346950531006,
+ "learning_rate": 2.526200253382098e-06,
+ "loss": 0.3831,
+ "step": 6353
+ },
+ {
+ "epoch": 3.0042553191489363,
+ "grad_norm": 2.437439441680908,
+ "learning_rate": 2.525576459855583e-06,
+ "loss": 0.352,
+ "step": 6354
+ },
+ {
+ "epoch": 3.0047281323877066,
+ "grad_norm": 2.7632546424865723,
+ "learning_rate": 2.5249526647365343e-06,
+ "loss": 0.4636,
+ "step": 6355
+ },
+ {
+ "epoch": 3.0052009456264774,
+ "grad_norm": 2.681955099105835,
+ "learning_rate": 2.524328868063793e-06,
+ "loss": 0.3978,
+ "step": 6356
+ },
+ {
+ "epoch": 3.0056737588652482,
+ "grad_norm": 2.9575345516204834,
+ "learning_rate": 2.523705069876199e-06,
+ "loss": 0.3803,
+ "step": 6357
+ },
+ {
+ "epoch": 3.006146572104019,
+ "grad_norm": 2.7368216514587402,
+ "learning_rate": 2.523081270212594e-06,
+ "loss": 0.3968,
+ "step": 6358
+ },
+ {
+ "epoch": 3.0066193853427894,
+ "grad_norm": 2.637592077255249,
+ "learning_rate": 2.522457469111821e-06,
+ "loss": 0.3629,
+ "step": 6359
+ },
+ {
+ "epoch": 3.00709219858156,
+ "grad_norm": 2.579331398010254,
+ "learning_rate": 2.5218336666127187e-06,
+ "loss": 0.4044,
+ "step": 6360
+ },
+ {
+ "epoch": 3.007565011820331,
+ "grad_norm": 3.014544725418091,
+ "learning_rate": 2.5212098627541296e-06,
+ "loss": 0.3518,
+ "step": 6361
+ },
+ {
+ "epoch": 3.0080378250591018,
+ "grad_norm": 2.5261058807373047,
+ "learning_rate": 2.520586057574896e-06,
+ "loss": 0.3763,
+ "step": 6362
+ },
+ {
+ "epoch": 3.008510638297872,
+ "grad_norm": 3.234910249710083,
+ "learning_rate": 2.519962251113858e-06,
+ "loss": 0.3691,
+ "step": 6363
+ },
+ {
+ "epoch": 3.008983451536643,
+ "grad_norm": 3.2930967807769775,
+ "learning_rate": 2.519338443409859e-06,
+ "loss": 0.4363,
+ "step": 6364
+ },
+ {
+ "epoch": 3.0094562647754137,
+ "grad_norm": 2.807910442352295,
+ "learning_rate": 2.51871463450174e-06,
+ "loss": 0.3984,
+ "step": 6365
+ },
+ {
+ "epoch": 3.0099290780141845,
+ "grad_norm": 3.1555075645446777,
+ "learning_rate": 2.518090824428342e-06,
+ "loss": 0.4006,
+ "step": 6366
+ },
+ {
+ "epoch": 3.010401891252955,
+ "grad_norm": 3.1793272495269775,
+ "learning_rate": 2.5174670132285084e-06,
+ "loss": 0.4966,
+ "step": 6367
+ },
+ {
+ "epoch": 3.0108747044917257,
+ "grad_norm": 2.7007548809051514,
+ "learning_rate": 2.5168432009410805e-06,
+ "loss": 0.3755,
+ "step": 6368
+ },
+ {
+ "epoch": 3.0113475177304965,
+ "grad_norm": 2.914792537689209,
+ "learning_rate": 2.5162193876048995e-06,
+ "loss": 0.39,
+ "step": 6369
+ },
+ {
+ "epoch": 3.0118203309692673,
+ "grad_norm": 2.935516119003296,
+ "learning_rate": 2.5155955732588093e-06,
+ "loss": 0.4045,
+ "step": 6370
+ },
+ {
+ "epoch": 3.0122931442080376,
+ "grad_norm": 2.8817989826202393,
+ "learning_rate": 2.5149717579416503e-06,
+ "loss": 0.3751,
+ "step": 6371
+ },
+ {
+ "epoch": 3.0127659574468084,
+ "grad_norm": 2.9181740283966064,
+ "learning_rate": 2.514347941692266e-06,
+ "loss": 0.3689,
+ "step": 6372
+ },
+ {
+ "epoch": 3.013238770685579,
+ "grad_norm": 3.052060604095459,
+ "learning_rate": 2.5137241245494982e-06,
+ "loss": 0.3874,
+ "step": 6373
+ },
+ {
+ "epoch": 3.01371158392435,
+ "grad_norm": 2.6931657791137695,
+ "learning_rate": 2.513100306552189e-06,
+ "loss": 0.3673,
+ "step": 6374
+ },
+ {
+ "epoch": 3.0141843971631204,
+ "grad_norm": 2.3422248363494873,
+ "learning_rate": 2.5124764877391824e-06,
+ "loss": 0.3753,
+ "step": 6375
+ },
+ {
+ "epoch": 3.014657210401891,
+ "grad_norm": 2.5826265811920166,
+ "learning_rate": 2.5118526681493186e-06,
+ "loss": 0.3661,
+ "step": 6376
+ },
+ {
+ "epoch": 3.015130023640662,
+ "grad_norm": 2.7407493591308594,
+ "learning_rate": 2.5112288478214415e-06,
+ "loss": 0.3887,
+ "step": 6377
+ },
+ {
+ "epoch": 3.0156028368794328,
+ "grad_norm": 2.7378315925598145,
+ "learning_rate": 2.510605026794393e-06,
+ "loss": 0.3623,
+ "step": 6378
+ },
+ {
+ "epoch": 3.016075650118203,
+ "grad_norm": 2.59541654586792,
+ "learning_rate": 2.5099812051070167e-06,
+ "loss": 0.3804,
+ "step": 6379
+ },
+ {
+ "epoch": 3.016548463356974,
+ "grad_norm": 3.1022770404815674,
+ "learning_rate": 2.509357382798154e-06,
+ "loss": 0.4092,
+ "step": 6380
+ },
+ {
+ "epoch": 3.0170212765957447,
+ "grad_norm": 2.521545648574829,
+ "learning_rate": 2.5087335599066476e-06,
+ "loss": 0.3509,
+ "step": 6381
+ },
+ {
+ "epoch": 3.0174940898345155,
+ "grad_norm": 2.949395179748535,
+ "learning_rate": 2.5081097364713407e-06,
+ "loss": 0.387,
+ "step": 6382
+ },
+ {
+ "epoch": 3.017966903073286,
+ "grad_norm": 2.4806487560272217,
+ "learning_rate": 2.507485912531077e-06,
+ "loss": 0.4004,
+ "step": 6383
+ },
+ {
+ "epoch": 3.0184397163120567,
+ "grad_norm": 2.6480894088745117,
+ "learning_rate": 2.506862088124698e-06,
+ "loss": 0.3366,
+ "step": 6384
+ },
+ {
+ "epoch": 3.0189125295508275,
+ "grad_norm": 2.62559175491333,
+ "learning_rate": 2.5062382632910463e-06,
+ "loss": 0.3676,
+ "step": 6385
+ },
+ {
+ "epoch": 3.0193853427895982,
+ "grad_norm": 2.694767951965332,
+ "learning_rate": 2.5056144380689657e-06,
+ "loss": 0.3438,
+ "step": 6386
+ },
+ {
+ "epoch": 3.0198581560283686,
+ "grad_norm": 2.808107614517212,
+ "learning_rate": 2.504990612497299e-06,
+ "loss": 0.3831,
+ "step": 6387
+ },
+ {
+ "epoch": 3.0203309692671394,
+ "grad_norm": 3.2392303943634033,
+ "learning_rate": 2.504366786614888e-06,
+ "loss": 0.3493,
+ "step": 6388
+ },
+ {
+ "epoch": 3.02080378250591,
+ "grad_norm": 2.6899030208587646,
+ "learning_rate": 2.5037429604605774e-06,
+ "loss": 0.3998,
+ "step": 6389
+ },
+ {
+ "epoch": 3.021276595744681,
+ "grad_norm": 2.5622799396514893,
+ "learning_rate": 2.503119134073208e-06,
+ "loss": 0.3443,
+ "step": 6390
+ },
+ {
+ "epoch": 3.0217494089834513,
+ "grad_norm": 2.716832399368286,
+ "learning_rate": 2.502495307491625e-06,
+ "loss": 0.4465,
+ "step": 6391
+ },
+ {
+ "epoch": 3.022222222222222,
+ "grad_norm": 2.8117692470550537,
+ "learning_rate": 2.501871480754669e-06,
+ "loss": 0.3513,
+ "step": 6392
+ },
+ {
+ "epoch": 3.022695035460993,
+ "grad_norm": 3.1260762214660645,
+ "learning_rate": 2.501247653901185e-06,
+ "loss": 0.4336,
+ "step": 6393
+ },
+ {
+ "epoch": 3.0231678486997637,
+ "grad_norm": 2.5076897144317627,
+ "learning_rate": 2.5006238269700137e-06,
+ "loss": 0.3437,
+ "step": 6394
+ },
+ {
+ "epoch": 3.023640661938534,
+ "grad_norm": 2.781937837600708,
+ "learning_rate": 2.5e-06,
+ "loss": 0.3583,
+ "step": 6395
+ },
+ {
+ "epoch": 3.024113475177305,
+ "grad_norm": 3.084050178527832,
+ "learning_rate": 2.499376173029987e-06,
+ "loss": 0.3785,
+ "step": 6396
+ },
+ {
+ "epoch": 3.0245862884160757,
+ "grad_norm": 3.2292473316192627,
+ "learning_rate": 2.498752346098816e-06,
+ "loss": 0.3858,
+ "step": 6397
+ },
+ {
+ "epoch": 3.0250591016548465,
+ "grad_norm": 2.738614797592163,
+ "learning_rate": 2.498128519245332e-06,
+ "loss": 0.4166,
+ "step": 6398
+ },
+ {
+ "epoch": 3.025531914893617,
+ "grad_norm": 2.940103054046631,
+ "learning_rate": 2.4975046925083764e-06,
+ "loss": 0.4117,
+ "step": 6399
+ },
+ {
+ "epoch": 3.0260047281323876,
+ "grad_norm": 2.5177032947540283,
+ "learning_rate": 2.4968808659267927e-06,
+ "loss": 0.3704,
+ "step": 6400
+ },
+ {
+ "epoch": 3.0264775413711584,
+ "grad_norm": 2.6969990730285645,
+ "learning_rate": 2.4962570395394243e-06,
+ "loss": 0.3721,
+ "step": 6401
+ },
+ {
+ "epoch": 3.0269503546099292,
+ "grad_norm": 2.9696028232574463,
+ "learning_rate": 2.495633213385112e-06,
+ "loss": 0.3934,
+ "step": 6402
+ },
+ {
+ "epoch": 3.0274231678486996,
+ "grad_norm": 3.4032552242279053,
+ "learning_rate": 2.495009387502702e-06,
+ "loss": 0.3877,
+ "step": 6403
+ },
+ {
+ "epoch": 3.0278959810874704,
+ "grad_norm": 2.6801865100860596,
+ "learning_rate": 2.4943855619310343e-06,
+ "loss": 0.3421,
+ "step": 6404
+ },
+ {
+ "epoch": 3.028368794326241,
+ "grad_norm": 2.827056884765625,
+ "learning_rate": 2.493761736708954e-06,
+ "loss": 0.3791,
+ "step": 6405
+ },
+ {
+ "epoch": 3.028841607565012,
+ "grad_norm": 2.6393566131591797,
+ "learning_rate": 2.4931379118753034e-06,
+ "loss": 0.3729,
+ "step": 6406
+ },
+ {
+ "epoch": 3.0293144208037823,
+ "grad_norm": 2.833519458770752,
+ "learning_rate": 2.4925140874689236e-06,
+ "loss": 0.3836,
+ "step": 6407
+ },
+ {
+ "epoch": 3.029787234042553,
+ "grad_norm": 2.8852169513702393,
+ "learning_rate": 2.4918902635286597e-06,
+ "loss": 0.4307,
+ "step": 6408
+ },
+ {
+ "epoch": 3.030260047281324,
+ "grad_norm": 2.7166404724121094,
+ "learning_rate": 2.491266440093354e-06,
+ "loss": 0.3825,
+ "step": 6409
+ },
+ {
+ "epoch": 3.0307328605200947,
+ "grad_norm": 2.5828018188476562,
+ "learning_rate": 2.4906426172018474e-06,
+ "loss": 0.3579,
+ "step": 6410
+ },
+ {
+ "epoch": 3.031205673758865,
+ "grad_norm": 2.915632724761963,
+ "learning_rate": 2.490018794892985e-06,
+ "loss": 0.4099,
+ "step": 6411
+ },
+ {
+ "epoch": 3.031678486997636,
+ "grad_norm": 2.7117249965667725,
+ "learning_rate": 2.489394973205607e-06,
+ "loss": 0.4063,
+ "step": 6412
+ },
+ {
+ "epoch": 3.0321513002364067,
+ "grad_norm": 2.3989102840423584,
+ "learning_rate": 2.488771152178559e-06,
+ "loss": 0.3377,
+ "step": 6413
+ },
+ {
+ "epoch": 3.0326241134751775,
+ "grad_norm": 2.6560115814208984,
+ "learning_rate": 2.488147331850682e-06,
+ "loss": 0.4072,
+ "step": 6414
+ },
+ {
+ "epoch": 3.033096926713948,
+ "grad_norm": 2.9466328620910645,
+ "learning_rate": 2.4875235122608184e-06,
+ "loss": 0.3559,
+ "step": 6415
+ },
+ {
+ "epoch": 3.0335697399527186,
+ "grad_norm": 2.765348196029663,
+ "learning_rate": 2.4868996934478114e-06,
+ "loss": 0.336,
+ "step": 6416
+ },
+ {
+ "epoch": 3.0340425531914894,
+ "grad_norm": 2.6021807193756104,
+ "learning_rate": 2.4862758754505017e-06,
+ "loss": 0.3861,
+ "step": 6417
+ },
+ {
+ "epoch": 3.03451536643026,
+ "grad_norm": 2.7293684482574463,
+ "learning_rate": 2.4856520583077344e-06,
+ "loss": 0.3926,
+ "step": 6418
+ },
+ {
+ "epoch": 3.0349881796690306,
+ "grad_norm": 2.9704763889312744,
+ "learning_rate": 2.485028242058351e-06,
+ "loss": 0.4303,
+ "step": 6419
+ },
+ {
+ "epoch": 3.0354609929078014,
+ "grad_norm": 3.385713815689087,
+ "learning_rate": 2.484404426741191e-06,
+ "loss": 0.44,
+ "step": 6420
+ },
+ {
+ "epoch": 3.035933806146572,
+ "grad_norm": 3.177983045578003,
+ "learning_rate": 2.4837806123951013e-06,
+ "loss": 0.4256,
+ "step": 6421
+ },
+ {
+ "epoch": 3.036406619385343,
+ "grad_norm": 2.6287200450897217,
+ "learning_rate": 2.4831567990589203e-06,
+ "loss": 0.3764,
+ "step": 6422
+ },
+ {
+ "epoch": 3.0368794326241133,
+ "grad_norm": 2.81823992729187,
+ "learning_rate": 2.4825329867714924e-06,
+ "loss": 0.3645,
+ "step": 6423
+ },
+ {
+ "epoch": 3.037352245862884,
+ "grad_norm": 3.1826934814453125,
+ "learning_rate": 2.4819091755716586e-06,
+ "loss": 0.3666,
+ "step": 6424
+ },
+ {
+ "epoch": 3.037825059101655,
+ "grad_norm": 3.0880346298217773,
+ "learning_rate": 2.481285365498261e-06,
+ "loss": 0.4339,
+ "step": 6425
+ },
+ {
+ "epoch": 3.0382978723404257,
+ "grad_norm": 3.1764965057373047,
+ "learning_rate": 2.480661556590142e-06,
+ "loss": 0.4804,
+ "step": 6426
+ },
+ {
+ "epoch": 3.038770685579196,
+ "grad_norm": 2.89469313621521,
+ "learning_rate": 2.480037748886142e-06,
+ "loss": 0.3875,
+ "step": 6427
+ },
+ {
+ "epoch": 3.039243498817967,
+ "grad_norm": 2.6043636798858643,
+ "learning_rate": 2.479413942425105e-06,
+ "loss": 0.3859,
+ "step": 6428
+ },
+ {
+ "epoch": 3.0397163120567376,
+ "grad_norm": 2.6570727825164795,
+ "learning_rate": 2.4787901372458712e-06,
+ "loss": 0.3508,
+ "step": 6429
+ },
+ {
+ "epoch": 3.0401891252955084,
+ "grad_norm": 2.914050579071045,
+ "learning_rate": 2.4781663333872825e-06,
+ "loss": 0.3904,
+ "step": 6430
+ },
+ {
+ "epoch": 3.040661938534279,
+ "grad_norm": 2.595606803894043,
+ "learning_rate": 2.47754253088818e-06,
+ "loss": 0.3753,
+ "step": 6431
+ },
+ {
+ "epoch": 3.0411347517730496,
+ "grad_norm": 2.68186616897583,
+ "learning_rate": 2.4769187297874065e-06,
+ "loss": 0.3545,
+ "step": 6432
+ },
+ {
+ "epoch": 3.0416075650118204,
+ "grad_norm": 2.956507921218872,
+ "learning_rate": 2.476294930123802e-06,
+ "loss": 0.3778,
+ "step": 6433
+ },
+ {
+ "epoch": 3.042080378250591,
+ "grad_norm": 2.8327226638793945,
+ "learning_rate": 2.475671131936209e-06,
+ "loss": 0.3205,
+ "step": 6434
+ },
+ {
+ "epoch": 3.0425531914893615,
+ "grad_norm": 2.594348430633545,
+ "learning_rate": 2.475047335263466e-06,
+ "loss": 0.3859,
+ "step": 6435
+ },
+ {
+ "epoch": 3.0430260047281323,
+ "grad_norm": 3.5030717849731445,
+ "learning_rate": 2.4744235401444177e-06,
+ "loss": 0.3611,
+ "step": 6436
+ },
+ {
+ "epoch": 3.043498817966903,
+ "grad_norm": 2.8478317260742188,
+ "learning_rate": 2.4737997466179034e-06,
+ "loss": 0.3927,
+ "step": 6437
+ },
+ {
+ "epoch": 3.043971631205674,
+ "grad_norm": 2.677827835083008,
+ "learning_rate": 2.4731759547227627e-06,
+ "loss": 0.3784,
+ "step": 6438
+ },
+ {
+ "epoch": 3.0444444444444443,
+ "grad_norm": 3.0059866905212402,
+ "learning_rate": 2.4725521644978393e-06,
+ "loss": 0.4279,
+ "step": 6439
+ },
+ {
+ "epoch": 3.044917257683215,
+ "grad_norm": 3.012500047683716,
+ "learning_rate": 2.4719283759819713e-06,
+ "loss": 0.4007,
+ "step": 6440
+ },
+ {
+ "epoch": 3.045390070921986,
+ "grad_norm": 2.758204936981201,
+ "learning_rate": 2.4713045892140007e-06,
+ "loss": 0.3668,
+ "step": 6441
+ },
+ {
+ "epoch": 3.0458628841607567,
+ "grad_norm": 2.9551615715026855,
+ "learning_rate": 2.4706808042327678e-06,
+ "loss": 0.3524,
+ "step": 6442
+ },
+ {
+ "epoch": 3.046335697399527,
+ "grad_norm": 2.8639965057373047,
+ "learning_rate": 2.4700570210771115e-06,
+ "loss": 0.3886,
+ "step": 6443
+ },
+ {
+ "epoch": 3.046808510638298,
+ "grad_norm": 2.718219757080078,
+ "learning_rate": 2.4694332397858738e-06,
+ "loss": 0.3693,
+ "step": 6444
+ },
+ {
+ "epoch": 3.0472813238770686,
+ "grad_norm": 3.050135612487793,
+ "learning_rate": 2.4688094603978933e-06,
+ "loss": 0.3979,
+ "step": 6445
+ },
+ {
+ "epoch": 3.0477541371158394,
+ "grad_norm": 2.786186456680298,
+ "learning_rate": 2.468185682952013e-06,
+ "loss": 0.3809,
+ "step": 6446
+ },
+ {
+ "epoch": 3.0482269503546098,
+ "grad_norm": 2.6462252140045166,
+ "learning_rate": 2.4675619074870697e-06,
+ "loss": 0.3746,
+ "step": 6447
+ },
+ {
+ "epoch": 3.0486997635933806,
+ "grad_norm": 2.984783887863159,
+ "learning_rate": 2.4669381340419037e-06,
+ "loss": 0.4092,
+ "step": 6448
+ },
+ {
+ "epoch": 3.0491725768321514,
+ "grad_norm": 2.936380624771118,
+ "learning_rate": 2.466314362655356e-06,
+ "loss": 0.4335,
+ "step": 6449
+ },
+ {
+ "epoch": 3.049645390070922,
+ "grad_norm": 2.730738639831543,
+ "learning_rate": 2.465690593366264e-06,
+ "loss": 0.364,
+ "step": 6450
+ },
+ {
+ "epoch": 3.0501182033096925,
+ "grad_norm": 2.7273590564727783,
+ "learning_rate": 2.4650668262134693e-06,
+ "loss": 0.3905,
+ "step": 6451
+ },
+ {
+ "epoch": 3.0505910165484633,
+ "grad_norm": 2.9588208198547363,
+ "learning_rate": 2.4644430612358105e-06,
+ "loss": 0.3936,
+ "step": 6452
+ },
+ {
+ "epoch": 3.051063829787234,
+ "grad_norm": 2.8721611499786377,
+ "learning_rate": 2.4638192984721247e-06,
+ "loss": 0.4279,
+ "step": 6453
+ },
+ {
+ "epoch": 3.051536643026005,
+ "grad_norm": 3.7179651260375977,
+ "learning_rate": 2.463195537961254e-06,
+ "loss": 0.427,
+ "step": 6454
+ },
+ {
+ "epoch": 3.0520094562647753,
+ "grad_norm": 2.651731491088867,
+ "learning_rate": 2.4625717797420353e-06,
+ "loss": 0.3471,
+ "step": 6455
+ },
+ {
+ "epoch": 3.052482269503546,
+ "grad_norm": 3.898737668991089,
+ "learning_rate": 2.4619480238533085e-06,
+ "loss": 0.4574,
+ "step": 6456
+ },
+ {
+ "epoch": 3.052955082742317,
+ "grad_norm": 2.916252374649048,
+ "learning_rate": 2.4613242703339108e-06,
+ "loss": 0.3622,
+ "step": 6457
+ },
+ {
+ "epoch": 3.0534278959810877,
+ "grad_norm": 3.122565507888794,
+ "learning_rate": 2.4607005192226806e-06,
+ "loss": 0.3954,
+ "step": 6458
+ },
+ {
+ "epoch": 3.053900709219858,
+ "grad_norm": 3.2377424240112305,
+ "learning_rate": 2.4600767705584575e-06,
+ "loss": 0.4082,
+ "step": 6459
+ },
+ {
+ "epoch": 3.054373522458629,
+ "grad_norm": 2.941102981567383,
+ "learning_rate": 2.459453024380079e-06,
+ "loss": 0.4324,
+ "step": 6460
+ },
+ {
+ "epoch": 3.0548463356973996,
+ "grad_norm": 2.964313507080078,
+ "learning_rate": 2.4588292807263816e-06,
+ "loss": 0.3037,
+ "step": 6461
+ },
+ {
+ "epoch": 3.0553191489361704,
+ "grad_norm": 2.824669599533081,
+ "learning_rate": 2.4582055396362055e-06,
+ "loss": 0.4076,
+ "step": 6462
+ },
+ {
+ "epoch": 3.0557919621749408,
+ "grad_norm": 2.7739884853363037,
+ "learning_rate": 2.457581801148387e-06,
+ "loss": 0.3615,
+ "step": 6463
+ },
+ {
+ "epoch": 3.0562647754137116,
+ "grad_norm": 3.2974464893341064,
+ "learning_rate": 2.456958065301764e-06,
+ "loss": 0.426,
+ "step": 6464
+ },
+ {
+ "epoch": 3.0567375886524824,
+ "grad_norm": 3.0801217555999756,
+ "learning_rate": 2.456334332135174e-06,
+ "loss": 0.3737,
+ "step": 6465
+ },
+ {
+ "epoch": 3.057210401891253,
+ "grad_norm": 2.788851022720337,
+ "learning_rate": 2.455710601687452e-06,
+ "loss": 0.4367,
+ "step": 6466
+ },
+ {
+ "epoch": 3.0576832151300235,
+ "grad_norm": 2.8078136444091797,
+ "learning_rate": 2.4550868739974378e-06,
+ "loss": 0.3796,
+ "step": 6467
+ },
+ {
+ "epoch": 3.0581560283687943,
+ "grad_norm": 2.9871349334716797,
+ "learning_rate": 2.4544631491039657e-06,
+ "loss": 0.3869,
+ "step": 6468
+ },
+ {
+ "epoch": 3.058628841607565,
+ "grad_norm": 2.9170174598693848,
+ "learning_rate": 2.453839427045875e-06,
+ "loss": 0.4591,
+ "step": 6469
+ },
+ {
+ "epoch": 3.059101654846336,
+ "grad_norm": 2.7316131591796875,
+ "learning_rate": 2.4532157078620013e-06,
+ "loss": 0.3723,
+ "step": 6470
+ },
+ {
+ "epoch": 3.0595744680851062,
+ "grad_norm": 3.047921657562256,
+ "learning_rate": 2.4525919915911793e-06,
+ "loss": 0.3804,
+ "step": 6471
+ },
+ {
+ "epoch": 3.060047281323877,
+ "grad_norm": 3.047934055328369,
+ "learning_rate": 2.4519682782722465e-06,
+ "loss": 0.3949,
+ "step": 6472
+ },
+ {
+ "epoch": 3.060520094562648,
+ "grad_norm": 2.4911186695098877,
+ "learning_rate": 2.4513445679440374e-06,
+ "loss": 0.3629,
+ "step": 6473
+ },
+ {
+ "epoch": 3.0609929078014186,
+ "grad_norm": 2.5353519916534424,
+ "learning_rate": 2.4507208606453895e-06,
+ "loss": 0.3417,
+ "step": 6474
+ },
+ {
+ "epoch": 3.061465721040189,
+ "grad_norm": 2.474622964859009,
+ "learning_rate": 2.4500971564151384e-06,
+ "loss": 0.3468,
+ "step": 6475
+ },
+ {
+ "epoch": 3.06193853427896,
+ "grad_norm": 2.7016963958740234,
+ "learning_rate": 2.4494734552921166e-06,
+ "loss": 0.3872,
+ "step": 6476
+ },
+ {
+ "epoch": 3.0624113475177306,
+ "grad_norm": 2.912144184112549,
+ "learning_rate": 2.4488497573151625e-06,
+ "loss": 0.3727,
+ "step": 6477
+ },
+ {
+ "epoch": 3.0628841607565014,
+ "grad_norm": 2.8234877586364746,
+ "learning_rate": 2.4482260625231093e-06,
+ "loss": 0.3472,
+ "step": 6478
+ },
+ {
+ "epoch": 3.0633569739952717,
+ "grad_norm": 2.6554179191589355,
+ "learning_rate": 2.447602370954793e-06,
+ "loss": 0.343,
+ "step": 6479
+ },
+ {
+ "epoch": 3.0638297872340425,
+ "grad_norm": 2.666419744491577,
+ "learning_rate": 2.446978682649047e-06,
+ "loss": 0.3932,
+ "step": 6480
+ },
+ {
+ "epoch": 3.0643026004728133,
+ "grad_norm": 2.968574285507202,
+ "learning_rate": 2.446354997644705e-06,
+ "loss": 0.4418,
+ "step": 6481
+ },
+ {
+ "epoch": 3.064775413711584,
+ "grad_norm": 2.692253589630127,
+ "learning_rate": 2.4457313159806028e-06,
+ "loss": 0.3141,
+ "step": 6482
+ },
+ {
+ "epoch": 3.0652482269503545,
+ "grad_norm": 2.5857295989990234,
+ "learning_rate": 2.445107637695574e-06,
+ "loss": 0.3392,
+ "step": 6483
+ },
+ {
+ "epoch": 3.0657210401891253,
+ "grad_norm": 3.2332825660705566,
+ "learning_rate": 2.4444839628284504e-06,
+ "loss": 0.4694,
+ "step": 6484
+ },
+ {
+ "epoch": 3.066193853427896,
+ "grad_norm": 2.7391014099121094,
+ "learning_rate": 2.4438602914180684e-06,
+ "loss": 0.3966,
+ "step": 6485
+ },
+ {
+ "epoch": 3.066666666666667,
+ "grad_norm": 2.7882139682769775,
+ "learning_rate": 2.4432366235032593e-06,
+ "loss": 0.3552,
+ "step": 6486
+ },
+ {
+ "epoch": 3.0671394799054372,
+ "grad_norm": 2.8907811641693115,
+ "learning_rate": 2.4426129591228573e-06,
+ "loss": 0.4478,
+ "step": 6487
+ },
+ {
+ "epoch": 3.067612293144208,
+ "grad_norm": 2.878929853439331,
+ "learning_rate": 2.4419892983156947e-06,
+ "loss": 0.3457,
+ "step": 6488
+ },
+ {
+ "epoch": 3.068085106382979,
+ "grad_norm": 2.7087442874908447,
+ "learning_rate": 2.441365641120603e-06,
+ "loss": 0.3491,
+ "step": 6489
+ },
+ {
+ "epoch": 3.0685579196217496,
+ "grad_norm": 3.2330431938171387,
+ "learning_rate": 2.4407419875764167e-06,
+ "loss": 0.3901,
+ "step": 6490
+ },
+ {
+ "epoch": 3.06903073286052,
+ "grad_norm": 3.0529370307922363,
+ "learning_rate": 2.440118337721966e-06,
+ "loss": 0.4059,
+ "step": 6491
+ },
+ {
+ "epoch": 3.0695035460992908,
+ "grad_norm": 2.4786794185638428,
+ "learning_rate": 2.439494691596085e-06,
+ "loss": 0.3153,
+ "step": 6492
+ },
+ {
+ "epoch": 3.0699763593380616,
+ "grad_norm": 2.956310510635376,
+ "learning_rate": 2.438871049237604e-06,
+ "loss": 0.3973,
+ "step": 6493
+ },
+ {
+ "epoch": 3.0704491725768324,
+ "grad_norm": 3.0816991329193115,
+ "learning_rate": 2.4382474106853543e-06,
+ "loss": 0.388,
+ "step": 6494
+ },
+ {
+ "epoch": 3.0709219858156027,
+ "grad_norm": 2.6103477478027344,
+ "learning_rate": 2.4376237759781686e-06,
+ "loss": 0.3656,
+ "step": 6495
+ },
+ {
+ "epoch": 3.0713947990543735,
+ "grad_norm": 2.974076271057129,
+ "learning_rate": 2.437000145154875e-06,
+ "loss": 0.3246,
+ "step": 6496
+ },
+ {
+ "epoch": 3.0718676122931443,
+ "grad_norm": 2.633605718612671,
+ "learning_rate": 2.4363765182543075e-06,
+ "loss": 0.3556,
+ "step": 6497
+ },
+ {
+ "epoch": 3.072340425531915,
+ "grad_norm": 2.49161434173584,
+ "learning_rate": 2.4357528953152953e-06,
+ "loss": 0.3506,
+ "step": 6498
+ },
+ {
+ "epoch": 3.0728132387706855,
+ "grad_norm": 2.6435935497283936,
+ "learning_rate": 2.4351292763766676e-06,
+ "loss": 0.3652,
+ "step": 6499
+ },
+ {
+ "epoch": 3.0732860520094563,
+ "grad_norm": 2.9710617065429688,
+ "learning_rate": 2.4345056614772563e-06,
+ "loss": 0.3713,
+ "step": 6500
+ },
+ {
+ "epoch": 3.073758865248227,
+ "grad_norm": 2.6947052478790283,
+ "learning_rate": 2.43388205065589e-06,
+ "loss": 0.378,
+ "step": 6501
+ },
+ {
+ "epoch": 3.0742316784869974,
+ "grad_norm": 2.9686238765716553,
+ "learning_rate": 2.433258443951398e-06,
+ "loss": 0.3936,
+ "step": 6502
+ },
+ {
+ "epoch": 3.074704491725768,
+ "grad_norm": 2.6008691787719727,
+ "learning_rate": 2.432634841402611e-06,
+ "loss": 0.3709,
+ "step": 6503
+ },
+ {
+ "epoch": 3.075177304964539,
+ "grad_norm": 2.595116376876831,
+ "learning_rate": 2.4320112430483563e-06,
+ "loss": 0.3884,
+ "step": 6504
+ },
+ {
+ "epoch": 3.07565011820331,
+ "grad_norm": 2.685241460800171,
+ "learning_rate": 2.431387648927464e-06,
+ "loss": 0.3751,
+ "step": 6505
+ },
+ {
+ "epoch": 3.0761229314420806,
+ "grad_norm": 2.8863797187805176,
+ "learning_rate": 2.430764059078762e-06,
+ "loss": 0.3765,
+ "step": 6506
+ },
+ {
+ "epoch": 3.076595744680851,
+ "grad_norm": 3.020766019821167,
+ "learning_rate": 2.430140473541077e-06,
+ "loss": 0.362,
+ "step": 6507
+ },
+ {
+ "epoch": 3.0770685579196217,
+ "grad_norm": 2.9521167278289795,
+ "learning_rate": 2.42951689235324e-06,
+ "loss": 0.41,
+ "step": 6508
+ },
+ {
+ "epoch": 3.0775413711583925,
+ "grad_norm": 2.5844924449920654,
+ "learning_rate": 2.4288933155540757e-06,
+ "loss": 0.3258,
+ "step": 6509
+ },
+ {
+ "epoch": 3.078014184397163,
+ "grad_norm": 3.052661657333374,
+ "learning_rate": 2.4282697431824138e-06,
+ "loss": 0.363,
+ "step": 6510
+ },
+ {
+ "epoch": 3.0784869976359337,
+ "grad_norm": 3.109342575073242,
+ "learning_rate": 2.427646175277081e-06,
+ "loss": 0.4105,
+ "step": 6511
+ },
+ {
+ "epoch": 3.0789598108747045,
+ "grad_norm": 3.3141326904296875,
+ "learning_rate": 2.427022611876903e-06,
+ "loss": 0.405,
+ "step": 6512
+ },
+ {
+ "epoch": 3.0794326241134753,
+ "grad_norm": 3.054673194885254,
+ "learning_rate": 2.426399053020707e-06,
+ "loss": 0.3532,
+ "step": 6513
+ },
+ {
+ "epoch": 3.079905437352246,
+ "grad_norm": 2.823489189147949,
+ "learning_rate": 2.425775498747318e-06,
+ "loss": 0.3762,
+ "step": 6514
+ },
+ {
+ "epoch": 3.0803782505910164,
+ "grad_norm": 2.6739792823791504,
+ "learning_rate": 2.425151949095565e-06,
+ "loss": 0.4044,
+ "step": 6515
+ },
+ {
+ "epoch": 3.0808510638297872,
+ "grad_norm": 2.7313177585601807,
+ "learning_rate": 2.4245284041042714e-06,
+ "loss": 0.3136,
+ "step": 6516
+ },
+ {
+ "epoch": 3.081323877068558,
+ "grad_norm": 3.1661181449890137,
+ "learning_rate": 2.4239048638122624e-06,
+ "loss": 0.44,
+ "step": 6517
+ },
+ {
+ "epoch": 3.0817966903073284,
+ "grad_norm": 3.326542377471924,
+ "learning_rate": 2.4232813282583647e-06,
+ "loss": 0.3798,
+ "step": 6518
+ },
+ {
+ "epoch": 3.082269503546099,
+ "grad_norm": 3.0194952487945557,
+ "learning_rate": 2.422657797481402e-06,
+ "loss": 0.423,
+ "step": 6519
+ },
+ {
+ "epoch": 3.08274231678487,
+ "grad_norm": 2.6704318523406982,
+ "learning_rate": 2.4220342715201995e-06,
+ "loss": 0.41,
+ "step": 6520
+ },
+ {
+ "epoch": 3.083215130023641,
+ "grad_norm": 3.057990312576294,
+ "learning_rate": 2.421410750413581e-06,
+ "loss": 0.4096,
+ "step": 6521
+ },
+ {
+ "epoch": 3.083687943262411,
+ "grad_norm": 2.6242079734802246,
+ "learning_rate": 2.4207872342003693e-06,
+ "loss": 0.3673,
+ "step": 6522
+ },
+ {
+ "epoch": 3.084160756501182,
+ "grad_norm": 2.933910846710205,
+ "learning_rate": 2.4201637229193904e-06,
+ "loss": 0.4018,
+ "step": 6523
+ },
+ {
+ "epoch": 3.0846335697399527,
+ "grad_norm": 2.6973681449890137,
+ "learning_rate": 2.4195402166094657e-06,
+ "loss": 0.3533,
+ "step": 6524
+ },
+ {
+ "epoch": 3.0851063829787235,
+ "grad_norm": 3.096013307571411,
+ "learning_rate": 2.4189167153094194e-06,
+ "loss": 0.3872,
+ "step": 6525
+ },
+ {
+ "epoch": 3.085579196217494,
+ "grad_norm": 3.0707414150238037,
+ "learning_rate": 2.4182932190580737e-06,
+ "loss": 0.3775,
+ "step": 6526
+ },
+ {
+ "epoch": 3.0860520094562647,
+ "grad_norm": 2.873190402984619,
+ "learning_rate": 2.417669727894251e-06,
+ "loss": 0.3144,
+ "step": 6527
+ },
+ {
+ "epoch": 3.0865248226950355,
+ "grad_norm": 2.316431999206543,
+ "learning_rate": 2.4170462418567732e-06,
+ "loss": 0.3238,
+ "step": 6528
+ },
+ {
+ "epoch": 3.0869976359338063,
+ "grad_norm": 2.3672494888305664,
+ "learning_rate": 2.4164227609844626e-06,
+ "loss": 0.3585,
+ "step": 6529
+ },
+ {
+ "epoch": 3.0874704491725766,
+ "grad_norm": 2.904538154602051,
+ "learning_rate": 2.415799285316139e-06,
+ "loss": 0.366,
+ "step": 6530
+ },
+ {
+ "epoch": 3.0879432624113474,
+ "grad_norm": 2.914602279663086,
+ "learning_rate": 2.415175814890626e-06,
+ "loss": 0.3793,
+ "step": 6531
+ },
+ {
+ "epoch": 3.088416075650118,
+ "grad_norm": 2.652005672454834,
+ "learning_rate": 2.4145523497467417e-06,
+ "loss": 0.362,
+ "step": 6532
+ },
+ {
+ "epoch": 3.088888888888889,
+ "grad_norm": 2.5137813091278076,
+ "learning_rate": 2.413928889923309e-06,
+ "loss": 0.2974,
+ "step": 6533
+ },
+ {
+ "epoch": 3.0893617021276594,
+ "grad_norm": 3.2166645526885986,
+ "learning_rate": 2.413305435459147e-06,
+ "loss": 0.4151,
+ "step": 6534
+ },
+ {
+ "epoch": 3.08983451536643,
+ "grad_norm": 3.0506820678710938,
+ "learning_rate": 2.412681986393075e-06,
+ "loss": 0.4223,
+ "step": 6535
+ },
+ {
+ "epoch": 3.090307328605201,
+ "grad_norm": 3.035275936126709,
+ "learning_rate": 2.412058542763913e-06,
+ "loss": 0.4841,
+ "step": 6536
+ },
+ {
+ "epoch": 3.0907801418439718,
+ "grad_norm": 3.3195009231567383,
+ "learning_rate": 2.4114351046104793e-06,
+ "loss": 0.4205,
+ "step": 6537
+ },
+ {
+ "epoch": 3.091252955082742,
+ "grad_norm": 2.8700361251831055,
+ "learning_rate": 2.410811671971594e-06,
+ "loss": 0.3704,
+ "step": 6538
+ },
+ {
+ "epoch": 3.091725768321513,
+ "grad_norm": 2.900595188140869,
+ "learning_rate": 2.410188244886075e-06,
+ "loss": 0.4184,
+ "step": 6539
+ },
+ {
+ "epoch": 3.0921985815602837,
+ "grad_norm": 2.88179349899292,
+ "learning_rate": 2.409564823392739e-06,
+ "loss": 0.4156,
+ "step": 6540
+ },
+ {
+ "epoch": 3.0926713947990545,
+ "grad_norm": 2.677568197250366,
+ "learning_rate": 2.408941407530406e-06,
+ "loss": 0.4084,
+ "step": 6541
+ },
+ {
+ "epoch": 3.093144208037825,
+ "grad_norm": 3.0236027240753174,
+ "learning_rate": 2.408317997337892e-06,
+ "loss": 0.4384,
+ "step": 6542
+ },
+ {
+ "epoch": 3.0936170212765957,
+ "grad_norm": 3.1708545684814453,
+ "learning_rate": 2.4076945928540143e-06,
+ "loss": 0.3876,
+ "step": 6543
+ },
+ {
+ "epoch": 3.0940898345153665,
+ "grad_norm": 3.248821973800659,
+ "learning_rate": 2.40707119411759e-06,
+ "loss": 0.3865,
+ "step": 6544
+ },
+ {
+ "epoch": 3.0945626477541373,
+ "grad_norm": 3.0961649417877197,
+ "learning_rate": 2.4064478011674334e-06,
+ "loss": 0.3982,
+ "step": 6545
+ },
+ {
+ "epoch": 3.0950354609929076,
+ "grad_norm": 3.1989805698394775,
+ "learning_rate": 2.4058244140423637e-06,
+ "loss": 0.4777,
+ "step": 6546
+ },
+ {
+ "epoch": 3.0955082742316784,
+ "grad_norm": 2.805640459060669,
+ "learning_rate": 2.4052010327811933e-06,
+ "loss": 0.3764,
+ "step": 6547
+ },
+ {
+ "epoch": 3.095981087470449,
+ "grad_norm": 2.7225050926208496,
+ "learning_rate": 2.40457765742274e-06,
+ "loss": 0.3286,
+ "step": 6548
+ },
+ {
+ "epoch": 3.09645390070922,
+ "grad_norm": 3.119915008544922,
+ "learning_rate": 2.4039542880058174e-06,
+ "loss": 0.4463,
+ "step": 6549
+ },
+ {
+ "epoch": 3.0969267139479904,
+ "grad_norm": 2.8503530025482178,
+ "learning_rate": 2.4033309245692403e-06,
+ "loss": 0.395,
+ "step": 6550
+ },
+ {
+ "epoch": 3.097399527186761,
+ "grad_norm": 2.947504758834839,
+ "learning_rate": 2.4027075671518225e-06,
+ "loss": 0.4024,
+ "step": 6551
+ },
+ {
+ "epoch": 3.097872340425532,
+ "grad_norm": 3.170905113220215,
+ "learning_rate": 2.402084215792377e-06,
+ "loss": 0.4302,
+ "step": 6552
+ },
+ {
+ "epoch": 3.0983451536643027,
+ "grad_norm": 2.910475492477417,
+ "learning_rate": 2.4014608705297195e-06,
+ "loss": 0.4037,
+ "step": 6553
+ },
+ {
+ "epoch": 3.098817966903073,
+ "grad_norm": 2.627511978149414,
+ "learning_rate": 2.400837531402661e-06,
+ "loss": 0.3972,
+ "step": 6554
+ },
+ {
+ "epoch": 3.099290780141844,
+ "grad_norm": 2.6485681533813477,
+ "learning_rate": 2.4002141984500133e-06,
+ "loss": 0.4044,
+ "step": 6555
+ },
+ {
+ "epoch": 3.0997635933806147,
+ "grad_norm": 2.930954694747925,
+ "learning_rate": 2.399590871710592e-06,
+ "loss": 0.4214,
+ "step": 6556
+ },
+ {
+ "epoch": 3.1002364066193855,
+ "grad_norm": 2.6014554500579834,
+ "learning_rate": 2.3989675512232063e-06,
+ "loss": 0.3493,
+ "step": 6557
+ },
+ {
+ "epoch": 3.100709219858156,
+ "grad_norm": 2.899001121520996,
+ "learning_rate": 2.398344237026667e-06,
+ "loss": 0.382,
+ "step": 6558
+ },
+ {
+ "epoch": 3.1011820330969266,
+ "grad_norm": 2.4698870182037354,
+ "learning_rate": 2.3977209291597876e-06,
+ "loss": 0.3558,
+ "step": 6559
+ },
+ {
+ "epoch": 3.1016548463356974,
+ "grad_norm": 3.2926251888275146,
+ "learning_rate": 2.3970976276613763e-06,
+ "loss": 0.4078,
+ "step": 6560
+ },
+ {
+ "epoch": 3.1021276595744682,
+ "grad_norm": 2.5306150913238525,
+ "learning_rate": 2.3964743325702454e-06,
+ "loss": 0.3657,
+ "step": 6561
+ },
+ {
+ "epoch": 3.1026004728132386,
+ "grad_norm": 2.727583408355713,
+ "learning_rate": 2.395851043925204e-06,
+ "loss": 0.3791,
+ "step": 6562
+ },
+ {
+ "epoch": 3.1030732860520094,
+ "grad_norm": 3.1403541564941406,
+ "learning_rate": 2.3952277617650602e-06,
+ "loss": 0.3934,
+ "step": 6563
+ },
+ {
+ "epoch": 3.10354609929078,
+ "grad_norm": 2.5816383361816406,
+ "learning_rate": 2.3946044861286256e-06,
+ "loss": 0.3703,
+ "step": 6564
+ },
+ {
+ "epoch": 3.104018912529551,
+ "grad_norm": 2.5742220878601074,
+ "learning_rate": 2.3939812170547067e-06,
+ "loss": 0.3628,
+ "step": 6565
+ },
+ {
+ "epoch": 3.1044917257683213,
+ "grad_norm": 2.7276530265808105,
+ "learning_rate": 2.393357954582113e-06,
+ "loss": 0.3789,
+ "step": 6566
+ },
+ {
+ "epoch": 3.104964539007092,
+ "grad_norm": 3.05595064163208,
+ "learning_rate": 2.3927346987496515e-06,
+ "loss": 0.3766,
+ "step": 6567
+ },
+ {
+ "epoch": 3.105437352245863,
+ "grad_norm": 2.786970615386963,
+ "learning_rate": 2.39211144959613e-06,
+ "loss": 0.3329,
+ "step": 6568
+ },
+ {
+ "epoch": 3.1059101654846337,
+ "grad_norm": 3.499018430709839,
+ "learning_rate": 2.391488207160356e-06,
+ "loss": 0.4175,
+ "step": 6569
+ },
+ {
+ "epoch": 3.106382978723404,
+ "grad_norm": 2.969735860824585,
+ "learning_rate": 2.3908649714811346e-06,
+ "loss": 0.3893,
+ "step": 6570
+ },
+ {
+ "epoch": 3.106855791962175,
+ "grad_norm": 3.1494929790496826,
+ "learning_rate": 2.3902417425972734e-06,
+ "loss": 0.4048,
+ "step": 6571
+ },
+ {
+ "epoch": 3.1073286052009457,
+ "grad_norm": 2.6393489837646484,
+ "learning_rate": 2.3896185205475782e-06,
+ "loss": 0.3216,
+ "step": 6572
+ },
+ {
+ "epoch": 3.1078014184397165,
+ "grad_norm": 3.6984152793884277,
+ "learning_rate": 2.3889953053708528e-06,
+ "loss": 0.3646,
+ "step": 6573
+ },
+ {
+ "epoch": 3.108274231678487,
+ "grad_norm": 3.518547534942627,
+ "learning_rate": 2.388372097105903e-06,
+ "loss": 0.3627,
+ "step": 6574
+ },
+ {
+ "epoch": 3.1087470449172576,
+ "grad_norm": 3.422043800354004,
+ "learning_rate": 2.3877488957915333e-06,
+ "loss": 0.4116,
+ "step": 6575
+ },
+ {
+ "epoch": 3.1092198581560284,
+ "grad_norm": 2.8088064193725586,
+ "learning_rate": 2.3871257014665486e-06,
+ "loss": 0.3477,
+ "step": 6576
+ },
+ {
+ "epoch": 3.109692671394799,
+ "grad_norm": 2.7877607345581055,
+ "learning_rate": 2.3865025141697513e-06,
+ "loss": 0.351,
+ "step": 6577
+ },
+ {
+ "epoch": 3.1101654846335696,
+ "grad_norm": 2.9446799755096436,
+ "learning_rate": 2.3858793339399433e-06,
+ "loss": 0.4025,
+ "step": 6578
+ },
+ {
+ "epoch": 3.1106382978723404,
+ "grad_norm": 2.886584758758545,
+ "learning_rate": 2.3852561608159304e-06,
+ "loss": 0.3765,
+ "step": 6579
+ },
+ {
+ "epoch": 3.111111111111111,
+ "grad_norm": 3.45711088180542,
+ "learning_rate": 2.384632994836513e-06,
+ "loss": 0.3744,
+ "step": 6580
+ },
+ {
+ "epoch": 3.111583924349882,
+ "grad_norm": 2.737441301345825,
+ "learning_rate": 2.3840098360404916e-06,
+ "loss": 0.4048,
+ "step": 6581
+ },
+ {
+ "epoch": 3.1120567375886523,
+ "grad_norm": 2.742567300796509,
+ "learning_rate": 2.383386684466671e-06,
+ "loss": 0.3717,
+ "step": 6582
+ },
+ {
+ "epoch": 3.112529550827423,
+ "grad_norm": 3.017970561981201,
+ "learning_rate": 2.382763540153849e-06,
+ "loss": 0.3922,
+ "step": 6583
+ },
+ {
+ "epoch": 3.113002364066194,
+ "grad_norm": 3.132004499435425,
+ "learning_rate": 2.3821404031408283e-06,
+ "loss": 0.3969,
+ "step": 6584
+ },
+ {
+ "epoch": 3.1134751773049647,
+ "grad_norm": 2.910820245742798,
+ "learning_rate": 2.3815172734664075e-06,
+ "loss": 0.4241,
+ "step": 6585
+ },
+ {
+ "epoch": 3.113947990543735,
+ "grad_norm": 3.0029842853546143,
+ "learning_rate": 2.380894151169386e-06,
+ "loss": 0.4007,
+ "step": 6586
+ },
+ {
+ "epoch": 3.114420803782506,
+ "grad_norm": 3.0309178829193115,
+ "learning_rate": 2.380271036288564e-06,
+ "loss": 0.3876,
+ "step": 6587
+ },
+ {
+ "epoch": 3.1148936170212767,
+ "grad_norm": 2.963204860687256,
+ "learning_rate": 2.379647928862739e-06,
+ "loss": 0.4017,
+ "step": 6588
+ },
+ {
+ "epoch": 3.1153664302600474,
+ "grad_norm": 3.0127944946289062,
+ "learning_rate": 2.3790248289307103e-06,
+ "loss": 0.3651,
+ "step": 6589
+ },
+ {
+ "epoch": 3.115839243498818,
+ "grad_norm": 2.557485580444336,
+ "learning_rate": 2.3784017365312755e-06,
+ "loss": 0.3419,
+ "step": 6590
+ },
+ {
+ "epoch": 3.1163120567375886,
+ "grad_norm": 2.8577969074249268,
+ "learning_rate": 2.3777786517032306e-06,
+ "loss": 0.372,
+ "step": 6591
+ },
+ {
+ "epoch": 3.1167848699763594,
+ "grad_norm": 2.450324058532715,
+ "learning_rate": 2.3771555744853735e-06,
+ "loss": 0.3442,
+ "step": 6592
+ },
+ {
+ "epoch": 3.11725768321513,
+ "grad_norm": 2.7939295768737793,
+ "learning_rate": 2.3765325049164996e-06,
+ "loss": 0.401,
+ "step": 6593
+ },
+ {
+ "epoch": 3.1177304964539005,
+ "grad_norm": 2.9690325260162354,
+ "learning_rate": 2.3759094430354056e-06,
+ "loss": 0.3962,
+ "step": 6594
+ },
+ {
+ "epoch": 3.1182033096926713,
+ "grad_norm": 2.7630631923675537,
+ "learning_rate": 2.375286388880887e-06,
+ "loss": 0.4126,
+ "step": 6595
+ },
+ {
+ "epoch": 3.118676122931442,
+ "grad_norm": 2.6259944438934326,
+ "learning_rate": 2.3746633424917366e-06,
+ "loss": 0.3285,
+ "step": 6596
+ },
+ {
+ "epoch": 3.119148936170213,
+ "grad_norm": 2.7107701301574707,
+ "learning_rate": 2.3740403039067516e-06,
+ "loss": 0.3636,
+ "step": 6597
+ },
+ {
+ "epoch": 3.1196217494089833,
+ "grad_norm": 2.985301971435547,
+ "learning_rate": 2.373417273164724e-06,
+ "loss": 0.3928,
+ "step": 6598
+ },
+ {
+ "epoch": 3.120094562647754,
+ "grad_norm": 3.2578976154327393,
+ "learning_rate": 2.3727942503044483e-06,
+ "loss": 0.3379,
+ "step": 6599
+ },
+ {
+ "epoch": 3.120567375886525,
+ "grad_norm": 3.1681406497955322,
+ "learning_rate": 2.372171235364717e-06,
+ "loss": 0.4023,
+ "step": 6600
+ },
+ {
+ "epoch": 3.1210401891252957,
+ "grad_norm": 3.120147705078125,
+ "learning_rate": 2.371548228384321e-06,
+ "loss": 0.4228,
+ "step": 6601
+ },
+ {
+ "epoch": 3.121513002364066,
+ "grad_norm": 2.7786099910736084,
+ "learning_rate": 2.3709252294020547e-06,
+ "loss": 0.4386,
+ "step": 6602
+ },
+ {
+ "epoch": 3.121985815602837,
+ "grad_norm": 2.698849678039551,
+ "learning_rate": 2.3703022384567086e-06,
+ "loss": 0.3861,
+ "step": 6603
+ },
+ {
+ "epoch": 3.1224586288416076,
+ "grad_norm": 2.7917959690093994,
+ "learning_rate": 2.3696792555870724e-06,
+ "loss": 0.3535,
+ "step": 6604
+ },
+ {
+ "epoch": 3.1229314420803784,
+ "grad_norm": 2.8249263763427734,
+ "learning_rate": 2.3690562808319385e-06,
+ "loss": 0.3415,
+ "step": 6605
+ },
+ {
+ "epoch": 3.123404255319149,
+ "grad_norm": 2.567458391189575,
+ "learning_rate": 2.368433314230095e-06,
+ "loss": 0.3827,
+ "step": 6606
+ },
+ {
+ "epoch": 3.1238770685579196,
+ "grad_norm": 2.9670443534851074,
+ "learning_rate": 2.3678103558203328e-06,
+ "loss": 0.4238,
+ "step": 6607
+ },
+ {
+ "epoch": 3.1243498817966904,
+ "grad_norm": 2.6893439292907715,
+ "learning_rate": 2.36718740564144e-06,
+ "loss": 0.3461,
+ "step": 6608
+ },
+ {
+ "epoch": 3.124822695035461,
+ "grad_norm": 3.2669708728790283,
+ "learning_rate": 2.3665644637322044e-06,
+ "loss": 0.3992,
+ "step": 6609
+ },
+ {
+ "epoch": 3.1252955082742315,
+ "grad_norm": 2.889340400695801,
+ "learning_rate": 2.3659415301314152e-06,
+ "loss": 0.3829,
+ "step": 6610
+ },
+ {
+ "epoch": 3.1257683215130023,
+ "grad_norm": 2.625603199005127,
+ "learning_rate": 2.3653186048778584e-06,
+ "loss": 0.3559,
+ "step": 6611
+ },
+ {
+ "epoch": 3.126241134751773,
+ "grad_norm": 2.8128650188446045,
+ "learning_rate": 2.3646956880103224e-06,
+ "loss": 0.4035,
+ "step": 6612
+ },
+ {
+ "epoch": 3.126713947990544,
+ "grad_norm": 3.1887412071228027,
+ "learning_rate": 2.3640727795675925e-06,
+ "loss": 0.3938,
+ "step": 6613
+ },
+ {
+ "epoch": 3.1271867612293143,
+ "grad_norm": 2.886514186859131,
+ "learning_rate": 2.363449879588454e-06,
+ "loss": 0.3504,
+ "step": 6614
+ },
+ {
+ "epoch": 3.127659574468085,
+ "grad_norm": 3.2149860858917236,
+ "learning_rate": 2.3628269881116937e-06,
+ "loss": 0.4137,
+ "step": 6615
+ },
+ {
+ "epoch": 3.128132387706856,
+ "grad_norm": 3.3155312538146973,
+ "learning_rate": 2.362204105176094e-06,
+ "loss": 0.3811,
+ "step": 6616
+ },
+ {
+ "epoch": 3.1286052009456267,
+ "grad_norm": 2.6228792667388916,
+ "learning_rate": 2.3615812308204415e-06,
+ "loss": 0.3511,
+ "step": 6617
+ },
+ {
+ "epoch": 3.129078014184397,
+ "grad_norm": 2.7686524391174316,
+ "learning_rate": 2.3609583650835187e-06,
+ "loss": 0.3722,
+ "step": 6618
+ },
+ {
+ "epoch": 3.129550827423168,
+ "grad_norm": 3.396368980407715,
+ "learning_rate": 2.3603355080041083e-06,
+ "loss": 0.4678,
+ "step": 6619
+ },
+ {
+ "epoch": 3.1300236406619386,
+ "grad_norm": 2.7329437732696533,
+ "learning_rate": 2.359712659620994e-06,
+ "loss": 0.3775,
+ "step": 6620
+ },
+ {
+ "epoch": 3.1304964539007094,
+ "grad_norm": 2.7633914947509766,
+ "learning_rate": 2.3590898199729567e-06,
+ "loss": 0.3306,
+ "step": 6621
+ },
+ {
+ "epoch": 3.1309692671394798,
+ "grad_norm": 3.020887613296509,
+ "learning_rate": 2.3584669890987792e-06,
+ "loss": 0.4121,
+ "step": 6622
+ },
+ {
+ "epoch": 3.1314420803782506,
+ "grad_norm": 2.8912103176116943,
+ "learning_rate": 2.3578441670372414e-06,
+ "loss": 0.4297,
+ "step": 6623
+ },
+ {
+ "epoch": 3.1319148936170214,
+ "grad_norm": 3.0654027462005615,
+ "learning_rate": 2.3572213538271234e-06,
+ "loss": 0.3856,
+ "step": 6624
+ },
+ {
+ "epoch": 3.132387706855792,
+ "grad_norm": 3.1126575469970703,
+ "learning_rate": 2.356598549507206e-06,
+ "loss": 0.3886,
+ "step": 6625
+ },
+ {
+ "epoch": 3.1328605200945625,
+ "grad_norm": 2.7066447734832764,
+ "learning_rate": 2.3559757541162687e-06,
+ "loss": 0.4212,
+ "step": 6626
+ },
+ {
+ "epoch": 3.1333333333333333,
+ "grad_norm": 2.876338243484497,
+ "learning_rate": 2.355352967693088e-06,
+ "loss": 0.3607,
+ "step": 6627
+ },
+ {
+ "epoch": 3.133806146572104,
+ "grad_norm": 2.9011716842651367,
+ "learning_rate": 2.3547301902764454e-06,
+ "loss": 0.428,
+ "step": 6628
+ },
+ {
+ "epoch": 3.134278959810875,
+ "grad_norm": 2.805656909942627,
+ "learning_rate": 2.3541074219051163e-06,
+ "loss": 0.4038,
+ "step": 6629
+ },
+ {
+ "epoch": 3.1347517730496453,
+ "grad_norm": 2.89546275138855,
+ "learning_rate": 2.353484662617879e-06,
+ "loss": 0.3798,
+ "step": 6630
+ },
+ {
+ "epoch": 3.135224586288416,
+ "grad_norm": 3.0290539264678955,
+ "learning_rate": 2.352861912453508e-06,
+ "loss": 0.3916,
+ "step": 6631
+ },
+ {
+ "epoch": 3.135697399527187,
+ "grad_norm": 2.848393440246582,
+ "learning_rate": 2.352239171450781e-06,
+ "loss": 0.3423,
+ "step": 6632
+ },
+ {
+ "epoch": 3.1361702127659576,
+ "grad_norm": 2.871372938156128,
+ "learning_rate": 2.3516164396484737e-06,
+ "loss": 0.3872,
+ "step": 6633
+ },
+ {
+ "epoch": 3.136643026004728,
+ "grad_norm": 3.120682716369629,
+ "learning_rate": 2.3509937170853585e-06,
+ "loss": 0.3952,
+ "step": 6634
+ },
+ {
+ "epoch": 3.137115839243499,
+ "grad_norm": 2.6936683654785156,
+ "learning_rate": 2.3503710038002127e-06,
+ "loss": 0.3643,
+ "step": 6635
+ },
+ {
+ "epoch": 3.1375886524822696,
+ "grad_norm": 3.749519109725952,
+ "learning_rate": 2.349748299831808e-06,
+ "loss": 0.4519,
+ "step": 6636
+ },
+ {
+ "epoch": 3.1380614657210404,
+ "grad_norm": 2.8034276962280273,
+ "learning_rate": 2.3491256052189175e-06,
+ "loss": 0.401,
+ "step": 6637
+ },
+ {
+ "epoch": 3.1385342789598107,
+ "grad_norm": 2.6201975345611572,
+ "learning_rate": 2.348502920000314e-06,
+ "loss": 0.3491,
+ "step": 6638
+ },
+ {
+ "epoch": 3.1390070921985815,
+ "grad_norm": 2.890552043914795,
+ "learning_rate": 2.347880244214769e-06,
+ "loss": 0.3439,
+ "step": 6639
+ },
+ {
+ "epoch": 3.1394799054373523,
+ "grad_norm": 2.899594306945801,
+ "learning_rate": 2.347257577901055e-06,
+ "loss": 0.3707,
+ "step": 6640
+ },
+ {
+ "epoch": 3.139952718676123,
+ "grad_norm": 2.8660130500793457,
+ "learning_rate": 2.346634921097942e-06,
+ "loss": 0.3582,
+ "step": 6641
+ },
+ {
+ "epoch": 3.1404255319148935,
+ "grad_norm": 2.9805452823638916,
+ "learning_rate": 2.346012273844199e-06,
+ "loss": 0.3466,
+ "step": 6642
+ },
+ {
+ "epoch": 3.1408983451536643,
+ "grad_norm": 3.162977457046509,
+ "learning_rate": 2.345389636178597e-06,
+ "loss": 0.3657,
+ "step": 6643
+ },
+ {
+ "epoch": 3.141371158392435,
+ "grad_norm": 2.838988780975342,
+ "learning_rate": 2.344767008139904e-06,
+ "loss": 0.3826,
+ "step": 6644
+ },
+ {
+ "epoch": 3.141843971631206,
+ "grad_norm": 3.8427252769470215,
+ "learning_rate": 2.3441443897668893e-06,
+ "loss": 0.3697,
+ "step": 6645
+ },
+ {
+ "epoch": 3.1423167848699762,
+ "grad_norm": 2.9233880043029785,
+ "learning_rate": 2.34352178109832e-06,
+ "loss": 0.3481,
+ "step": 6646
+ },
+ {
+ "epoch": 3.142789598108747,
+ "grad_norm": 2.5840606689453125,
+ "learning_rate": 2.342899182172963e-06,
+ "loss": 0.3746,
+ "step": 6647
+ },
+ {
+ "epoch": 3.143262411347518,
+ "grad_norm": 2.806793451309204,
+ "learning_rate": 2.3422765930295857e-06,
+ "loss": 0.419,
+ "step": 6648
+ },
+ {
+ "epoch": 3.1437352245862886,
+ "grad_norm": 2.803952693939209,
+ "learning_rate": 2.3416540137069522e-06,
+ "loss": 0.3965,
+ "step": 6649
+ },
+ {
+ "epoch": 3.144208037825059,
+ "grad_norm": 2.8416364192962646,
+ "learning_rate": 2.3410314442438297e-06,
+ "loss": 0.4317,
+ "step": 6650
+ },
+ {
+ "epoch": 3.1446808510638298,
+ "grad_norm": 2.9956440925598145,
+ "learning_rate": 2.3404088846789826e-06,
+ "loss": 0.4268,
+ "step": 6651
+ },
+ {
+ "epoch": 3.1451536643026006,
+ "grad_norm": 3.1649162769317627,
+ "learning_rate": 2.339786335051173e-06,
+ "loss": 0.4149,
+ "step": 6652
+ },
+ {
+ "epoch": 3.145626477541371,
+ "grad_norm": 2.909107208251953,
+ "learning_rate": 2.3391637953991673e-06,
+ "loss": 0.4085,
+ "step": 6653
+ },
+ {
+ "epoch": 3.1460992907801417,
+ "grad_norm": 2.416755199432373,
+ "learning_rate": 2.3385412657617264e-06,
+ "loss": 0.3585,
+ "step": 6654
+ },
+ {
+ "epoch": 3.1465721040189125,
+ "grad_norm": 3.1122629642486572,
+ "learning_rate": 2.3379187461776123e-06,
+ "loss": 0.3876,
+ "step": 6655
+ },
+ {
+ "epoch": 3.1470449172576833,
+ "grad_norm": 2.6854658126831055,
+ "learning_rate": 2.337296236685588e-06,
+ "loss": 0.3125,
+ "step": 6656
+ },
+ {
+ "epoch": 3.147517730496454,
+ "grad_norm": 2.779876708984375,
+ "learning_rate": 2.3366737373244127e-06,
+ "loss": 0.3688,
+ "step": 6657
+ },
+ {
+ "epoch": 3.1479905437352245,
+ "grad_norm": 3.1444761753082275,
+ "learning_rate": 2.3360512481328484e-06,
+ "loss": 0.4089,
+ "step": 6658
+ },
+ {
+ "epoch": 3.1484633569739953,
+ "grad_norm": 2.71445894241333,
+ "learning_rate": 2.335428769149654e-06,
+ "loss": 0.3532,
+ "step": 6659
+ },
+ {
+ "epoch": 3.148936170212766,
+ "grad_norm": 2.9788241386413574,
+ "learning_rate": 2.334806300413587e-06,
+ "loss": 0.4238,
+ "step": 6660
+ },
+ {
+ "epoch": 3.1494089834515364,
+ "grad_norm": 3.0118865966796875,
+ "learning_rate": 2.334183841963409e-06,
+ "loss": 0.4437,
+ "step": 6661
+ },
+ {
+ "epoch": 3.149881796690307,
+ "grad_norm": 3.2229537963867188,
+ "learning_rate": 2.3335613938378753e-06,
+ "loss": 0.3582,
+ "step": 6662
+ },
+ {
+ "epoch": 3.150354609929078,
+ "grad_norm": 2.734997034072876,
+ "learning_rate": 2.3329389560757447e-06,
+ "loss": 0.3737,
+ "step": 6663
+ },
+ {
+ "epoch": 3.150827423167849,
+ "grad_norm": 3.4746382236480713,
+ "learning_rate": 2.3323165287157724e-06,
+ "loss": 0.3516,
+ "step": 6664
+ },
+ {
+ "epoch": 3.1513002364066196,
+ "grad_norm": 2.9428153038024902,
+ "learning_rate": 2.3316941117967137e-06,
+ "loss": 0.3985,
+ "step": 6665
+ },
+ {
+ "epoch": 3.15177304964539,
+ "grad_norm": 2.6840944290161133,
+ "learning_rate": 2.3310717053573257e-06,
+ "loss": 0.3274,
+ "step": 6666
+ },
+ {
+ "epoch": 3.1522458628841608,
+ "grad_norm": 3.048335552215576,
+ "learning_rate": 2.3304493094363607e-06,
+ "loss": 0.4262,
+ "step": 6667
+ },
+ {
+ "epoch": 3.1527186761229316,
+ "grad_norm": 2.87381911277771,
+ "learning_rate": 2.329826924072575e-06,
+ "loss": 0.3867,
+ "step": 6668
+ },
+ {
+ "epoch": 3.153191489361702,
+ "grad_norm": 2.6236355304718018,
+ "learning_rate": 2.32920454930472e-06,
+ "loss": 0.3649,
+ "step": 6669
+ },
+ {
+ "epoch": 3.1536643026004727,
+ "grad_norm": 3.1326401233673096,
+ "learning_rate": 2.328582185171549e-06,
+ "loss": 0.3451,
+ "step": 6670
+ },
+ {
+ "epoch": 3.1541371158392435,
+ "grad_norm": 3.011826992034912,
+ "learning_rate": 2.327959831711814e-06,
+ "loss": 0.4118,
+ "step": 6671
+ },
+ {
+ "epoch": 3.1546099290780143,
+ "grad_norm": 2.834933280944824,
+ "learning_rate": 2.3273374889642646e-06,
+ "loss": 0.4378,
+ "step": 6672
+ },
+ {
+ "epoch": 3.155082742316785,
+ "grad_norm": 3.085756778717041,
+ "learning_rate": 2.326715156967654e-06,
+ "loss": 0.4389,
+ "step": 6673
+ },
+ {
+ "epoch": 3.1555555555555554,
+ "grad_norm": 2.7912232875823975,
+ "learning_rate": 2.3260928357607305e-06,
+ "loss": 0.3352,
+ "step": 6674
+ },
+ {
+ "epoch": 3.1560283687943262,
+ "grad_norm": 2.7643113136291504,
+ "learning_rate": 2.3254705253822424e-06,
+ "loss": 0.3449,
+ "step": 6675
+ },
+ {
+ "epoch": 3.156501182033097,
+ "grad_norm": 2.8984663486480713,
+ "learning_rate": 2.3248482258709405e-06,
+ "loss": 0.4231,
+ "step": 6676
+ },
+ {
+ "epoch": 3.1569739952718674,
+ "grad_norm": 3.214996814727783,
+ "learning_rate": 2.324225937265572e-06,
+ "loss": 0.4616,
+ "step": 6677
+ },
+ {
+ "epoch": 3.157446808510638,
+ "grad_norm": 2.58534836769104,
+ "learning_rate": 2.3236036596048827e-06,
+ "loss": 0.3264,
+ "step": 6678
+ },
+ {
+ "epoch": 3.157919621749409,
+ "grad_norm": 2.790714740753174,
+ "learning_rate": 2.322981392927621e-06,
+ "loss": 0.4086,
+ "step": 6679
+ },
+ {
+ "epoch": 3.15839243498818,
+ "grad_norm": 2.726029872894287,
+ "learning_rate": 2.32235913727253e-06,
+ "loss": 0.3344,
+ "step": 6680
+ },
+ {
+ "epoch": 3.1588652482269506,
+ "grad_norm": 2.8392906188964844,
+ "learning_rate": 2.3217368926783583e-06,
+ "loss": 0.3468,
+ "step": 6681
+ },
+ {
+ "epoch": 3.159338061465721,
+ "grad_norm": 2.9796900749206543,
+ "learning_rate": 2.321114659183848e-06,
+ "loss": 0.4051,
+ "step": 6682
+ },
+ {
+ "epoch": 3.1598108747044917,
+ "grad_norm": 3.0399303436279297,
+ "learning_rate": 2.320492436827743e-06,
+ "loss": 0.402,
+ "step": 6683
+ },
+ {
+ "epoch": 3.1602836879432625,
+ "grad_norm": 2.9295334815979004,
+ "learning_rate": 2.3198702256487877e-06,
+ "loss": 0.3975,
+ "step": 6684
+ },
+ {
+ "epoch": 3.160756501182033,
+ "grad_norm": 2.881552219390869,
+ "learning_rate": 2.319248025685723e-06,
+ "loss": 0.4342,
+ "step": 6685
+ },
+ {
+ "epoch": 3.1612293144208037,
+ "grad_norm": 3.0711705684661865,
+ "learning_rate": 2.3186258369772916e-06,
+ "loss": 0.3829,
+ "step": 6686
+ },
+ {
+ "epoch": 3.1617021276595745,
+ "grad_norm": 2.6614468097686768,
+ "learning_rate": 2.3180036595622345e-06,
+ "loss": 0.3473,
+ "step": 6687
+ },
+ {
+ "epoch": 3.1621749408983453,
+ "grad_norm": 3.0084400177001953,
+ "learning_rate": 2.3173814934792903e-06,
+ "loss": 0.4363,
+ "step": 6688
+ },
+ {
+ "epoch": 3.162647754137116,
+ "grad_norm": 2.9340786933898926,
+ "learning_rate": 2.3167593387672006e-06,
+ "loss": 0.4235,
+ "step": 6689
+ },
+ {
+ "epoch": 3.1631205673758864,
+ "grad_norm": 3.0765340328216553,
+ "learning_rate": 2.3161371954647023e-06,
+ "loss": 0.4601,
+ "step": 6690
+ },
+ {
+ "epoch": 3.1635933806146572,
+ "grad_norm": 2.816096067428589,
+ "learning_rate": 2.3155150636105356e-06,
+ "loss": 0.3764,
+ "step": 6691
+ },
+ {
+ "epoch": 3.164066193853428,
+ "grad_norm": 3.0476551055908203,
+ "learning_rate": 2.3148929432434372e-06,
+ "loss": 0.3956,
+ "step": 6692
+ },
+ {
+ "epoch": 3.1645390070921984,
+ "grad_norm": 2.628934860229492,
+ "learning_rate": 2.314270834402143e-06,
+ "loss": 0.3551,
+ "step": 6693
+ },
+ {
+ "epoch": 3.165011820330969,
+ "grad_norm": 3.3933539390563965,
+ "learning_rate": 2.31364873712539e-06,
+ "loss": 0.4523,
+ "step": 6694
+ },
+ {
+ "epoch": 3.16548463356974,
+ "grad_norm": 3.256176233291626,
+ "learning_rate": 2.313026651451912e-06,
+ "loss": 0.417,
+ "step": 6695
+ },
+ {
+ "epoch": 3.1659574468085108,
+ "grad_norm": 2.92926025390625,
+ "learning_rate": 2.312404577420445e-06,
+ "loss": 0.4365,
+ "step": 6696
+ },
+ {
+ "epoch": 3.166430260047281,
+ "grad_norm": 2.9514732360839844,
+ "learning_rate": 2.3117825150697233e-06,
+ "loss": 0.4632,
+ "step": 6697
+ },
+ {
+ "epoch": 3.166903073286052,
+ "grad_norm": 2.8635852336883545,
+ "learning_rate": 2.3111604644384778e-06,
+ "loss": 0.4018,
+ "step": 6698
+ },
+ {
+ "epoch": 3.1673758865248227,
+ "grad_norm": 2.5937020778656006,
+ "learning_rate": 2.3105384255654433e-06,
+ "loss": 0.3682,
+ "step": 6699
+ },
+ {
+ "epoch": 3.1678486997635935,
+ "grad_norm": 2.857851266860962,
+ "learning_rate": 2.3099163984893497e-06,
+ "loss": 0.3293,
+ "step": 6700
+ },
+ {
+ "epoch": 3.168321513002364,
+ "grad_norm": 2.5903947353363037,
+ "learning_rate": 2.3092943832489283e-06,
+ "loss": 0.3543,
+ "step": 6701
+ },
+ {
+ "epoch": 3.1687943262411347,
+ "grad_norm": 2.9783661365509033,
+ "learning_rate": 2.30867237988291e-06,
+ "loss": 0.3707,
+ "step": 6702
+ },
+ {
+ "epoch": 3.1692671394799055,
+ "grad_norm": 3.0133306980133057,
+ "learning_rate": 2.3080503884300225e-06,
+ "loss": 0.439,
+ "step": 6703
+ },
+ {
+ "epoch": 3.1697399527186763,
+ "grad_norm": 2.7119483947753906,
+ "learning_rate": 2.3074284089289968e-06,
+ "loss": 0.3956,
+ "step": 6704
+ },
+ {
+ "epoch": 3.1702127659574466,
+ "grad_norm": 3.0499672889709473,
+ "learning_rate": 2.3068064414185597e-06,
+ "loss": 0.434,
+ "step": 6705
+ },
+ {
+ "epoch": 3.1706855791962174,
+ "grad_norm": 2.862807512283325,
+ "learning_rate": 2.306184485937437e-06,
+ "loss": 0.3644,
+ "step": 6706
+ },
+ {
+ "epoch": 3.171158392434988,
+ "grad_norm": 2.9445149898529053,
+ "learning_rate": 2.305562542524358e-06,
+ "loss": 0.3894,
+ "step": 6707
+ },
+ {
+ "epoch": 3.171631205673759,
+ "grad_norm": 3.0442428588867188,
+ "learning_rate": 2.304940611218046e-06,
+ "loss": 0.3816,
+ "step": 6708
+ },
+ {
+ "epoch": 3.1721040189125294,
+ "grad_norm": 2.7101798057556152,
+ "learning_rate": 2.304318692057228e-06,
+ "loss": 0.3708,
+ "step": 6709
+ },
+ {
+ "epoch": 3.1725768321513,
+ "grad_norm": 2.7874515056610107,
+ "learning_rate": 2.303696785080626e-06,
+ "loss": 0.404,
+ "step": 6710
+ },
+ {
+ "epoch": 3.173049645390071,
+ "grad_norm": 3.0438833236694336,
+ "learning_rate": 2.303074890326964e-06,
+ "loss": 0.4342,
+ "step": 6711
+ },
+ {
+ "epoch": 3.1735224586288417,
+ "grad_norm": 2.6079208850860596,
+ "learning_rate": 2.302453007834966e-06,
+ "loss": 0.3725,
+ "step": 6712
+ },
+ {
+ "epoch": 3.173995271867612,
+ "grad_norm": 3.3353021144866943,
+ "learning_rate": 2.3018311376433523e-06,
+ "loss": 0.4372,
+ "step": 6713
+ },
+ {
+ "epoch": 3.174468085106383,
+ "grad_norm": 2.840771436691284,
+ "learning_rate": 2.3012092797908454e-06,
+ "loss": 0.3979,
+ "step": 6714
+ },
+ {
+ "epoch": 3.1749408983451537,
+ "grad_norm": 3.0474867820739746,
+ "learning_rate": 2.3005874343161648e-06,
+ "loss": 0.4077,
+ "step": 6715
+ },
+ {
+ "epoch": 3.1754137115839245,
+ "grad_norm": 2.849835157394409,
+ "learning_rate": 2.2999656012580296e-06,
+ "loss": 0.393,
+ "step": 6716
+ },
+ {
+ "epoch": 3.175886524822695,
+ "grad_norm": 2.6361217498779297,
+ "learning_rate": 2.29934378065516e-06,
+ "loss": 0.3894,
+ "step": 6717
+ },
+ {
+ "epoch": 3.1763593380614656,
+ "grad_norm": 3.139700174331665,
+ "learning_rate": 2.298721972546273e-06,
+ "loss": 0.36,
+ "step": 6718
+ },
+ {
+ "epoch": 3.1768321513002364,
+ "grad_norm": 2.987861156463623,
+ "learning_rate": 2.298100176970087e-06,
+ "loss": 0.4306,
+ "step": 6719
+ },
+ {
+ "epoch": 3.1773049645390072,
+ "grad_norm": 2.6403157711029053,
+ "learning_rate": 2.297478393965317e-06,
+ "loss": 0.3978,
+ "step": 6720
+ },
+ {
+ "epoch": 3.1777777777777776,
+ "grad_norm": 2.819519281387329,
+ "learning_rate": 2.296856623570679e-06,
+ "loss": 0.3467,
+ "step": 6721
+ },
+ {
+ "epoch": 3.1782505910165484,
+ "grad_norm": 2.7195916175842285,
+ "learning_rate": 2.296234865824889e-06,
+ "loss": 0.3685,
+ "step": 6722
+ },
+ {
+ "epoch": 3.178723404255319,
+ "grad_norm": 3.015488624572754,
+ "learning_rate": 2.2956131207666604e-06,
+ "loss": 0.3751,
+ "step": 6723
+ },
+ {
+ "epoch": 3.17919621749409,
+ "grad_norm": 2.9283792972564697,
+ "learning_rate": 2.2949913884347055e-06,
+ "loss": 0.3261,
+ "step": 6724
+ },
+ {
+ "epoch": 3.1796690307328603,
+ "grad_norm": 3.358991861343384,
+ "learning_rate": 2.294369668867739e-06,
+ "loss": 0.4505,
+ "step": 6725
+ },
+ {
+ "epoch": 3.180141843971631,
+ "grad_norm": 2.9143471717834473,
+ "learning_rate": 2.2937479621044712e-06,
+ "loss": 0.3612,
+ "step": 6726
+ },
+ {
+ "epoch": 3.180614657210402,
+ "grad_norm": 3.020519495010376,
+ "learning_rate": 2.2931262681836136e-06,
+ "loss": 0.4241,
+ "step": 6727
+ },
+ {
+ "epoch": 3.1810874704491727,
+ "grad_norm": 2.693737745285034,
+ "learning_rate": 2.2925045871438765e-06,
+ "loss": 0.366,
+ "step": 6728
+ },
+ {
+ "epoch": 3.181560283687943,
+ "grad_norm": 2.9427194595336914,
+ "learning_rate": 2.2918829190239677e-06,
+ "loss": 0.3741,
+ "step": 6729
+ },
+ {
+ "epoch": 3.182033096926714,
+ "grad_norm": 2.529383659362793,
+ "learning_rate": 2.291261263862598e-06,
+ "loss": 0.4469,
+ "step": 6730
+ },
+ {
+ "epoch": 3.1825059101654847,
+ "grad_norm": 3.0097804069519043,
+ "learning_rate": 2.290639621698473e-06,
+ "loss": 0.4167,
+ "step": 6731
+ },
+ {
+ "epoch": 3.1829787234042555,
+ "grad_norm": 2.7047014236450195,
+ "learning_rate": 2.290017992570302e-06,
+ "loss": 0.3615,
+ "step": 6732
+ },
+ {
+ "epoch": 3.183451536643026,
+ "grad_norm": 2.676964282989502,
+ "learning_rate": 2.2893963765167897e-06,
+ "loss": 0.3722,
+ "step": 6733
+ },
+ {
+ "epoch": 3.1839243498817966,
+ "grad_norm": 3.0529778003692627,
+ "learning_rate": 2.2887747735766413e-06,
+ "loss": 0.395,
+ "step": 6734
+ },
+ {
+ "epoch": 3.1843971631205674,
+ "grad_norm": 2.826725721359253,
+ "learning_rate": 2.288153183788562e-06,
+ "loss": 0.3713,
+ "step": 6735
+ },
+ {
+ "epoch": 3.184869976359338,
+ "grad_norm": 2.8689587116241455,
+ "learning_rate": 2.287531607191254e-06,
+ "loss": 0.4383,
+ "step": 6736
+ },
+ {
+ "epoch": 3.1853427895981086,
+ "grad_norm": 3.1835694313049316,
+ "learning_rate": 2.2869100438234217e-06,
+ "loss": 0.3908,
+ "step": 6737
+ },
+ {
+ "epoch": 3.1858156028368794,
+ "grad_norm": 3.227262020111084,
+ "learning_rate": 2.286288493723767e-06,
+ "loss": 0.3549,
+ "step": 6738
+ },
+ {
+ "epoch": 3.18628841607565,
+ "grad_norm": 2.7543468475341797,
+ "learning_rate": 2.2856669569309896e-06,
+ "loss": 0.351,
+ "step": 6739
+ },
+ {
+ "epoch": 3.186761229314421,
+ "grad_norm": 2.5381555557250977,
+ "learning_rate": 2.2850454334837923e-06,
+ "loss": 0.3473,
+ "step": 6740
+ },
+ {
+ "epoch": 3.1872340425531913,
+ "grad_norm": 2.785923957824707,
+ "learning_rate": 2.284423923420872e-06,
+ "loss": 0.4144,
+ "step": 6741
+ },
+ {
+ "epoch": 3.187706855791962,
+ "grad_norm": 2.583853006362915,
+ "learning_rate": 2.28380242678093e-06,
+ "loss": 0.3088,
+ "step": 6742
+ },
+ {
+ "epoch": 3.188179669030733,
+ "grad_norm": 2.604647159576416,
+ "learning_rate": 2.2831809436026627e-06,
+ "loss": 0.3474,
+ "step": 6743
+ },
+ {
+ "epoch": 3.1886524822695037,
+ "grad_norm": 6.13611364364624,
+ "learning_rate": 2.2825594739247662e-06,
+ "loss": 0.4089,
+ "step": 6744
+ },
+ {
+ "epoch": 3.189125295508274,
+ "grad_norm": 3.034011125564575,
+ "learning_rate": 2.281938017785939e-06,
+ "loss": 0.4569,
+ "step": 6745
+ },
+ {
+ "epoch": 3.189598108747045,
+ "grad_norm": 2.9352638721466064,
+ "learning_rate": 2.281316575224874e-06,
+ "loss": 0.4293,
+ "step": 6746
+ },
+ {
+ "epoch": 3.1900709219858157,
+ "grad_norm": 3.860957384109497,
+ "learning_rate": 2.280695146280268e-06,
+ "loss": 0.4082,
+ "step": 6747
+ },
+ {
+ "epoch": 3.1905437352245865,
+ "grad_norm": 2.8131468296051025,
+ "learning_rate": 2.280073730990814e-06,
+ "loss": 0.3194,
+ "step": 6748
+ },
+ {
+ "epoch": 3.191016548463357,
+ "grad_norm": 3.1310737133026123,
+ "learning_rate": 2.2794523293952033e-06,
+ "loss": 0.4454,
+ "step": 6749
+ },
+ {
+ "epoch": 3.1914893617021276,
+ "grad_norm": 3.065091133117676,
+ "learning_rate": 2.27883094153213e-06,
+ "loss": 0.3789,
+ "step": 6750
+ },
+ {
+ "epoch": 3.1919621749408984,
+ "grad_norm": 3.315216541290283,
+ "learning_rate": 2.278209567440284e-06,
+ "loss": 0.4037,
+ "step": 6751
+ },
+ {
+ "epoch": 3.192434988179669,
+ "grad_norm": 3.0228476524353027,
+ "learning_rate": 2.2775882071583546e-06,
+ "loss": 0.3652,
+ "step": 6752
+ },
+ {
+ "epoch": 3.1929078014184396,
+ "grad_norm": 3.703540802001953,
+ "learning_rate": 2.2769668607250336e-06,
+ "loss": 0.3477,
+ "step": 6753
+ },
+ {
+ "epoch": 3.1933806146572103,
+ "grad_norm": 2.952481508255005,
+ "learning_rate": 2.2763455281790065e-06,
+ "loss": 0.4026,
+ "step": 6754
+ },
+ {
+ "epoch": 3.193853427895981,
+ "grad_norm": 2.5798189640045166,
+ "learning_rate": 2.275724209558965e-06,
+ "loss": 0.3475,
+ "step": 6755
+ },
+ {
+ "epoch": 3.194326241134752,
+ "grad_norm": 2.599669933319092,
+ "learning_rate": 2.2751029049035923e-06,
+ "loss": 0.3499,
+ "step": 6756
+ },
+ {
+ "epoch": 3.1947990543735223,
+ "grad_norm": 3.0463781356811523,
+ "learning_rate": 2.2744816142515756e-06,
+ "loss": 0.3927,
+ "step": 6757
+ },
+ {
+ "epoch": 3.195271867612293,
+ "grad_norm": 3.134199380874634,
+ "learning_rate": 2.2738603376416003e-06,
+ "loss": 0.3957,
+ "step": 6758
+ },
+ {
+ "epoch": 3.195744680851064,
+ "grad_norm": 3.1326372623443604,
+ "learning_rate": 2.273239075112349e-06,
+ "loss": 0.4305,
+ "step": 6759
+ },
+ {
+ "epoch": 3.1962174940898347,
+ "grad_norm": 2.847128391265869,
+ "learning_rate": 2.2726178267025072e-06,
+ "loss": 0.3825,
+ "step": 6760
+ },
+ {
+ "epoch": 3.196690307328605,
+ "grad_norm": 2.697584629058838,
+ "learning_rate": 2.2719965924507566e-06,
+ "loss": 0.3517,
+ "step": 6761
+ },
+ {
+ "epoch": 3.197163120567376,
+ "grad_norm": 2.881446599960327,
+ "learning_rate": 2.271375372395777e-06,
+ "loss": 0.3791,
+ "step": 6762
+ },
+ {
+ "epoch": 3.1976359338061466,
+ "grad_norm": 3.085054874420166,
+ "learning_rate": 2.270754166576252e-06,
+ "loss": 0.4324,
+ "step": 6763
+ },
+ {
+ "epoch": 3.1981087470449174,
+ "grad_norm": 3.3494462966918945,
+ "learning_rate": 2.270132975030859e-06,
+ "loss": 0.4242,
+ "step": 6764
+ },
+ {
+ "epoch": 3.198581560283688,
+ "grad_norm": 2.8617660999298096,
+ "learning_rate": 2.2695117977982785e-06,
+ "loss": 0.3563,
+ "step": 6765
+ },
+ {
+ "epoch": 3.1990543735224586,
+ "grad_norm": 2.7437968254089355,
+ "learning_rate": 2.2688906349171873e-06,
+ "loss": 0.4042,
+ "step": 6766
+ },
+ {
+ "epoch": 3.1995271867612294,
+ "grad_norm": 3.1129143238067627,
+ "learning_rate": 2.268269486426262e-06,
+ "loss": 0.3761,
+ "step": 6767
+ },
+ {
+ "epoch": 3.2,
+ "grad_norm": 3.32441782951355,
+ "learning_rate": 2.2676483523641807e-06,
+ "loss": 0.4439,
+ "step": 6768
+ },
+ {
+ "epoch": 3.2004728132387705,
+ "grad_norm": 2.8744730949401855,
+ "learning_rate": 2.267027232769617e-06,
+ "loss": 0.4015,
+ "step": 6769
+ },
+ {
+ "epoch": 3.2009456264775413,
+ "grad_norm": 3.6283397674560547,
+ "learning_rate": 2.2664061276812465e-06,
+ "loss": 0.3634,
+ "step": 6770
+ },
+ {
+ "epoch": 3.201418439716312,
+ "grad_norm": 2.7826597690582275,
+ "learning_rate": 2.2657850371377426e-06,
+ "loss": 0.3178,
+ "step": 6771
+ },
+ {
+ "epoch": 3.201891252955083,
+ "grad_norm": 2.668173313140869,
+ "learning_rate": 2.265163961177776e-06,
+ "loss": 0.3662,
+ "step": 6772
+ },
+ {
+ "epoch": 3.2023640661938533,
+ "grad_norm": 2.868441104888916,
+ "learning_rate": 2.264542899840021e-06,
+ "loss": 0.4235,
+ "step": 6773
+ },
+ {
+ "epoch": 3.202836879432624,
+ "grad_norm": 3.2715935707092285,
+ "learning_rate": 2.263921853163147e-06,
+ "loss": 0.4741,
+ "step": 6774
+ },
+ {
+ "epoch": 3.203309692671395,
+ "grad_norm": 2.8647544384002686,
+ "learning_rate": 2.2633008211858233e-06,
+ "loss": 0.3885,
+ "step": 6775
+ },
+ {
+ "epoch": 3.2037825059101657,
+ "grad_norm": 3.070164680480957,
+ "learning_rate": 2.2626798039467207e-06,
+ "loss": 0.4191,
+ "step": 6776
+ },
+ {
+ "epoch": 3.204255319148936,
+ "grad_norm": 2.846686840057373,
+ "learning_rate": 2.262058801484505e-06,
+ "loss": 0.3619,
+ "step": 6777
+ },
+ {
+ "epoch": 3.204728132387707,
+ "grad_norm": 2.767031192779541,
+ "learning_rate": 2.261437813837845e-06,
+ "loss": 0.3248,
+ "step": 6778
+ },
+ {
+ "epoch": 3.2052009456264776,
+ "grad_norm": 2.6819260120391846,
+ "learning_rate": 2.2608168410454065e-06,
+ "loss": 0.3871,
+ "step": 6779
+ },
+ {
+ "epoch": 3.2056737588652484,
+ "grad_norm": 3.1176788806915283,
+ "learning_rate": 2.260195883145854e-06,
+ "loss": 0.3929,
+ "step": 6780
+ },
+ {
+ "epoch": 3.2061465721040188,
+ "grad_norm": 3.143209457397461,
+ "learning_rate": 2.2595749401778524e-06,
+ "loss": 0.4188,
+ "step": 6781
+ },
+ {
+ "epoch": 3.2066193853427896,
+ "grad_norm": 2.9685657024383545,
+ "learning_rate": 2.2589540121800647e-06,
+ "loss": 0.4049,
+ "step": 6782
+ },
+ {
+ "epoch": 3.2070921985815604,
+ "grad_norm": 2.6853368282318115,
+ "learning_rate": 2.258333099191155e-06,
+ "loss": 0.349,
+ "step": 6783
+ },
+ {
+ "epoch": 3.207565011820331,
+ "grad_norm": 2.8418309688568115,
+ "learning_rate": 2.257712201249783e-06,
+ "loss": 0.4121,
+ "step": 6784
+ },
+ {
+ "epoch": 3.2080378250591015,
+ "grad_norm": 2.9441449642181396,
+ "learning_rate": 2.2570913183946085e-06,
+ "loss": 0.3846,
+ "step": 6785
+ },
+ {
+ "epoch": 3.2085106382978723,
+ "grad_norm": 2.9956493377685547,
+ "learning_rate": 2.256470450664294e-06,
+ "loss": 0.3941,
+ "step": 6786
+ },
+ {
+ "epoch": 3.208983451536643,
+ "grad_norm": 3.1774401664733887,
+ "learning_rate": 2.255849598097496e-06,
+ "loss": 0.4252,
+ "step": 6787
+ },
+ {
+ "epoch": 3.209456264775414,
+ "grad_norm": 2.8948934078216553,
+ "learning_rate": 2.255228760732873e-06,
+ "loss": 0.3963,
+ "step": 6788
+ },
+ {
+ "epoch": 3.2099290780141843,
+ "grad_norm": 3.440021276473999,
+ "learning_rate": 2.2546079386090825e-06,
+ "loss": 0.3777,
+ "step": 6789
+ },
+ {
+ "epoch": 3.210401891252955,
+ "grad_norm": 3.1573195457458496,
+ "learning_rate": 2.253987131764779e-06,
+ "loss": 0.3896,
+ "step": 6790
+ },
+ {
+ "epoch": 3.210874704491726,
+ "grad_norm": 3.4218719005584717,
+ "learning_rate": 2.2533663402386183e-06,
+ "loss": 0.3979,
+ "step": 6791
+ },
+ {
+ "epoch": 3.2113475177304966,
+ "grad_norm": 3.3442487716674805,
+ "learning_rate": 2.252745564069253e-06,
+ "loss": 0.406,
+ "step": 6792
+ },
+ {
+ "epoch": 3.211820330969267,
+ "grad_norm": 2.6089327335357666,
+ "learning_rate": 2.2521248032953387e-06,
+ "loss": 0.3539,
+ "step": 6793
+ },
+ {
+ "epoch": 3.212293144208038,
+ "grad_norm": 3.8015971183776855,
+ "learning_rate": 2.251504057955526e-06,
+ "loss": 0.4184,
+ "step": 6794
+ },
+ {
+ "epoch": 3.2127659574468086,
+ "grad_norm": 3.797565460205078,
+ "learning_rate": 2.250883328088465e-06,
+ "loss": 0.3392,
+ "step": 6795
+ },
+ {
+ "epoch": 3.2132387706855794,
+ "grad_norm": 3.290762186050415,
+ "learning_rate": 2.2502626137328077e-06,
+ "loss": 0.3726,
+ "step": 6796
+ },
+ {
+ "epoch": 3.2137115839243497,
+ "grad_norm": 3.149158000946045,
+ "learning_rate": 2.2496419149272023e-06,
+ "loss": 0.3869,
+ "step": 6797
+ },
+ {
+ "epoch": 3.2141843971631205,
+ "grad_norm": 2.652902364730835,
+ "learning_rate": 2.2490212317102964e-06,
+ "loss": 0.3256,
+ "step": 6798
+ },
+ {
+ "epoch": 3.2146572104018913,
+ "grad_norm": 3.3039770126342773,
+ "learning_rate": 2.248400564120739e-06,
+ "loss": 0.4231,
+ "step": 6799
+ },
+ {
+ "epoch": 3.215130023640662,
+ "grad_norm": 3.0190038681030273,
+ "learning_rate": 2.247779912197174e-06,
+ "loss": 0.4319,
+ "step": 6800
+ },
+ {
+ "epoch": 3.2156028368794325,
+ "grad_norm": 2.861393690109253,
+ "learning_rate": 2.2471592759782485e-06,
+ "loss": 0.465,
+ "step": 6801
+ },
+ {
+ "epoch": 3.2160756501182033,
+ "grad_norm": 2.7796146869659424,
+ "learning_rate": 2.246538655502606e-06,
+ "loss": 0.3896,
+ "step": 6802
+ },
+ {
+ "epoch": 3.216548463356974,
+ "grad_norm": 3.1849005222320557,
+ "learning_rate": 2.24591805080889e-06,
+ "loss": 0.3782,
+ "step": 6803
+ },
+ {
+ "epoch": 3.217021276595745,
+ "grad_norm": 3.076164960861206,
+ "learning_rate": 2.2452974619357435e-06,
+ "loss": 0.4023,
+ "step": 6804
+ },
+ {
+ "epoch": 3.2174940898345152,
+ "grad_norm": 2.7006006240844727,
+ "learning_rate": 2.2446768889218064e-06,
+ "loss": 0.3902,
+ "step": 6805
+ },
+ {
+ "epoch": 3.217966903073286,
+ "grad_norm": 2.9310474395751953,
+ "learning_rate": 2.2440563318057205e-06,
+ "loss": 0.366,
+ "step": 6806
+ },
+ {
+ "epoch": 3.218439716312057,
+ "grad_norm": 3.057248592376709,
+ "learning_rate": 2.2434357906261246e-06,
+ "loss": 0.4042,
+ "step": 6807
+ },
+ {
+ "epoch": 3.2189125295508276,
+ "grad_norm": 3.3720197677612305,
+ "learning_rate": 2.242815265421656e-06,
+ "loss": 0.3816,
+ "step": 6808
+ },
+ {
+ "epoch": 3.219385342789598,
+ "grad_norm": 2.9626352787017822,
+ "learning_rate": 2.2421947562309545e-06,
+ "loss": 0.363,
+ "step": 6809
+ },
+ {
+ "epoch": 3.219858156028369,
+ "grad_norm": 2.7848782539367676,
+ "learning_rate": 2.2415742630926533e-06,
+ "loss": 0.3597,
+ "step": 6810
+ },
+ {
+ "epoch": 3.2203309692671396,
+ "grad_norm": 2.757319450378418,
+ "learning_rate": 2.2409537860453913e-06,
+ "loss": 0.3304,
+ "step": 6811
+ },
+ {
+ "epoch": 3.2208037825059104,
+ "grad_norm": 2.7765560150146484,
+ "learning_rate": 2.240333325127801e-06,
+ "loss": 0.3896,
+ "step": 6812
+ },
+ {
+ "epoch": 3.2212765957446807,
+ "grad_norm": 2.9882447719573975,
+ "learning_rate": 2.239712880378515e-06,
+ "loss": 0.4004,
+ "step": 6813
+ },
+ {
+ "epoch": 3.2217494089834515,
+ "grad_norm": 2.8551244735717773,
+ "learning_rate": 2.2390924518361673e-06,
+ "loss": 0.4167,
+ "step": 6814
+ },
+ {
+ "epoch": 3.2222222222222223,
+ "grad_norm": 2.8051679134368896,
+ "learning_rate": 2.2384720395393878e-06,
+ "loss": 0.3319,
+ "step": 6815
+ },
+ {
+ "epoch": 3.222695035460993,
+ "grad_norm": 3.1172873973846436,
+ "learning_rate": 2.2378516435268086e-06,
+ "loss": 0.379,
+ "step": 6816
+ },
+ {
+ "epoch": 3.2231678486997635,
+ "grad_norm": 3.0282177925109863,
+ "learning_rate": 2.237231263837058e-06,
+ "loss": 0.3855,
+ "step": 6817
+ },
+ {
+ "epoch": 3.2236406619385343,
+ "grad_norm": 2.7156803607940674,
+ "learning_rate": 2.236610900508763e-06,
+ "loss": 0.4062,
+ "step": 6818
+ },
+ {
+ "epoch": 3.224113475177305,
+ "grad_norm": 2.721327781677246,
+ "learning_rate": 2.235990553580554e-06,
+ "loss": 0.3726,
+ "step": 6819
+ },
+ {
+ "epoch": 3.2245862884160754,
+ "grad_norm": 2.881181240081787,
+ "learning_rate": 2.235370223091055e-06,
+ "loss": 0.421,
+ "step": 6820
+ },
+ {
+ "epoch": 3.225059101654846,
+ "grad_norm": 2.8074657917022705,
+ "learning_rate": 2.234749909078892e-06,
+ "loss": 0.3628,
+ "step": 6821
+ },
+ {
+ "epoch": 3.225531914893617,
+ "grad_norm": 2.8781638145446777,
+ "learning_rate": 2.234129611582689e-06,
+ "loss": 0.3857,
+ "step": 6822
+ },
+ {
+ "epoch": 3.226004728132388,
+ "grad_norm": 2.9473299980163574,
+ "learning_rate": 2.233509330641068e-06,
+ "loss": 0.4358,
+ "step": 6823
+ },
+ {
+ "epoch": 3.2264775413711586,
+ "grad_norm": 3.261209011077881,
+ "learning_rate": 2.2328890662926543e-06,
+ "loss": 0.4115,
+ "step": 6824
+ },
+ {
+ "epoch": 3.226950354609929,
+ "grad_norm": 3.2796943187713623,
+ "learning_rate": 2.232268818576067e-06,
+ "loss": 0.3846,
+ "step": 6825
+ },
+ {
+ "epoch": 3.2274231678486998,
+ "grad_norm": 3.1083059310913086,
+ "learning_rate": 2.2316485875299247e-06,
+ "loss": 0.3452,
+ "step": 6826
+ },
+ {
+ "epoch": 3.2278959810874706,
+ "grad_norm": 2.7947003841400146,
+ "learning_rate": 2.23102837319285e-06,
+ "loss": 0.3733,
+ "step": 6827
+ },
+ {
+ "epoch": 3.228368794326241,
+ "grad_norm": 2.792348861694336,
+ "learning_rate": 2.230408175603458e-06,
+ "loss": 0.411,
+ "step": 6828
+ },
+ {
+ "epoch": 3.2288416075650117,
+ "grad_norm": 2.8563876152038574,
+ "learning_rate": 2.229787994800368e-06,
+ "loss": 0.4303,
+ "step": 6829
+ },
+ {
+ "epoch": 3.2293144208037825,
+ "grad_norm": 2.9573659896850586,
+ "learning_rate": 2.2291678308221943e-06,
+ "loss": 0.4124,
+ "step": 6830
+ },
+ {
+ "epoch": 3.2297872340425533,
+ "grad_norm": 2.8554422855377197,
+ "learning_rate": 2.228547683707551e-06,
+ "loss": 0.3715,
+ "step": 6831
+ },
+ {
+ "epoch": 3.230260047281324,
+ "grad_norm": 2.9457242488861084,
+ "learning_rate": 2.227927553495054e-06,
+ "loss": 0.4339,
+ "step": 6832
+ },
+ {
+ "epoch": 3.2307328605200945,
+ "grad_norm": 2.799135684967041,
+ "learning_rate": 2.227307440223315e-06,
+ "loss": 0.3335,
+ "step": 6833
+ },
+ {
+ "epoch": 3.2312056737588652,
+ "grad_norm": 2.768529176712036,
+ "learning_rate": 2.2266873439309465e-06,
+ "loss": 0.3929,
+ "step": 6834
+ },
+ {
+ "epoch": 3.231678486997636,
+ "grad_norm": 3.124069929122925,
+ "learning_rate": 2.2260672646565585e-06,
+ "loss": 0.4205,
+ "step": 6835
+ },
+ {
+ "epoch": 3.2321513002364064,
+ "grad_norm": 2.8153982162475586,
+ "learning_rate": 2.2254472024387603e-06,
+ "loss": 0.3565,
+ "step": 6836
+ },
+ {
+ "epoch": 3.232624113475177,
+ "grad_norm": 3.1802141666412354,
+ "learning_rate": 2.224827157316162e-06,
+ "loss": 0.4614,
+ "step": 6837
+ },
+ {
+ "epoch": 3.233096926713948,
+ "grad_norm": 2.669651746749878,
+ "learning_rate": 2.2242071293273682e-06,
+ "loss": 0.3581,
+ "step": 6838
+ },
+ {
+ "epoch": 3.233569739952719,
+ "grad_norm": 3.073127269744873,
+ "learning_rate": 2.223587118510989e-06,
+ "loss": 0.3581,
+ "step": 6839
+ },
+ {
+ "epoch": 3.2340425531914896,
+ "grad_norm": 2.875955820083618,
+ "learning_rate": 2.222967124905627e-06,
+ "loss": 0.3905,
+ "step": 6840
+ },
+ {
+ "epoch": 3.23451536643026,
+ "grad_norm": 2.887744903564453,
+ "learning_rate": 2.2223471485498872e-06,
+ "loss": 0.4131,
+ "step": 6841
+ },
+ {
+ "epoch": 3.2349881796690307,
+ "grad_norm": 2.6957902908325195,
+ "learning_rate": 2.2217271894823735e-06,
+ "loss": 0.3631,
+ "step": 6842
+ },
+ {
+ "epoch": 3.2354609929078015,
+ "grad_norm": 2.7098400592803955,
+ "learning_rate": 2.221107247741688e-06,
+ "loss": 0.3959,
+ "step": 6843
+ },
+ {
+ "epoch": 3.235933806146572,
+ "grad_norm": 2.986271858215332,
+ "learning_rate": 2.22048732336643e-06,
+ "loss": 0.3515,
+ "step": 6844
+ },
+ {
+ "epoch": 3.2364066193853427,
+ "grad_norm": 3.0537121295928955,
+ "learning_rate": 2.2198674163952015e-06,
+ "loss": 0.438,
+ "step": 6845
+ },
+ {
+ "epoch": 3.2368794326241135,
+ "grad_norm": 2.8351151943206787,
+ "learning_rate": 2.2192475268666e-06,
+ "loss": 0.4069,
+ "step": 6846
+ },
+ {
+ "epoch": 3.2373522458628843,
+ "grad_norm": 2.6455280780792236,
+ "learning_rate": 2.218627654819225e-06,
+ "loss": 0.3626,
+ "step": 6847
+ },
+ {
+ "epoch": 3.237825059101655,
+ "grad_norm": 3.060352325439453,
+ "learning_rate": 2.2180078002916717e-06,
+ "loss": 0.3306,
+ "step": 6848
+ },
+ {
+ "epoch": 3.2382978723404254,
+ "grad_norm": 3.0178887844085693,
+ "learning_rate": 2.2173879633225355e-06,
+ "loss": 0.4111,
+ "step": 6849
+ },
+ {
+ "epoch": 3.2387706855791962,
+ "grad_norm": 2.895822763442993,
+ "learning_rate": 2.2167681439504123e-06,
+ "loss": 0.4053,
+ "step": 6850
+ },
+ {
+ "epoch": 3.239243498817967,
+ "grad_norm": 2.7295608520507812,
+ "learning_rate": 2.2161483422138945e-06,
+ "loss": 0.4021,
+ "step": 6851
+ },
+ {
+ "epoch": 3.2397163120567374,
+ "grad_norm": 3.1004912853240967,
+ "learning_rate": 2.2155285581515747e-06,
+ "loss": 0.3882,
+ "step": 6852
+ },
+ {
+ "epoch": 3.240189125295508,
+ "grad_norm": 2.927987813949585,
+ "learning_rate": 2.214908791802045e-06,
+ "loss": 0.4036,
+ "step": 6853
+ },
+ {
+ "epoch": 3.240661938534279,
+ "grad_norm": 3.1679599285125732,
+ "learning_rate": 2.2142890432038943e-06,
+ "loss": 0.3897,
+ "step": 6854
+ },
+ {
+ "epoch": 3.2411347517730498,
+ "grad_norm": 3.2094008922576904,
+ "learning_rate": 2.213669312395712e-06,
+ "loss": 0.4429,
+ "step": 6855
+ },
+ {
+ "epoch": 3.24160756501182,
+ "grad_norm": 4.637594223022461,
+ "learning_rate": 2.2130495994160857e-06,
+ "loss": 0.3708,
+ "step": 6856
+ },
+ {
+ "epoch": 3.242080378250591,
+ "grad_norm": 3.0063490867614746,
+ "learning_rate": 2.212429904303603e-06,
+ "loss": 0.3949,
+ "step": 6857
+ },
+ {
+ "epoch": 3.2425531914893617,
+ "grad_norm": 3.285444736480713,
+ "learning_rate": 2.21181022709685e-06,
+ "loss": 0.4236,
+ "step": 6858
+ },
+ {
+ "epoch": 3.2430260047281325,
+ "grad_norm": 3.02506422996521,
+ "learning_rate": 2.2111905678344086e-06,
+ "loss": 0.368,
+ "step": 6859
+ },
+ {
+ "epoch": 3.243498817966903,
+ "grad_norm": 2.9845006465911865,
+ "learning_rate": 2.2105709265548657e-06,
+ "loss": 0.4154,
+ "step": 6860
+ },
+ {
+ "epoch": 3.2439716312056737,
+ "grad_norm": 3.2537527084350586,
+ "learning_rate": 2.2099513032968013e-06,
+ "loss": 0.4385,
+ "step": 6861
+ },
+ {
+ "epoch": 3.2444444444444445,
+ "grad_norm": 2.8521063327789307,
+ "learning_rate": 2.2093316980987985e-06,
+ "loss": 0.384,
+ "step": 6862
+ },
+ {
+ "epoch": 3.2449172576832153,
+ "grad_norm": 3.186844825744629,
+ "learning_rate": 2.208712110999436e-06,
+ "loss": 0.4131,
+ "step": 6863
+ },
+ {
+ "epoch": 3.2453900709219856,
+ "grad_norm": 2.932058095932007,
+ "learning_rate": 2.208092542037292e-06,
+ "loss": 0.3341,
+ "step": 6864
+ },
+ {
+ "epoch": 3.2458628841607564,
+ "grad_norm": 3.0818707942962646,
+ "learning_rate": 2.2074729912509462e-06,
+ "loss": 0.4149,
+ "step": 6865
+ },
+ {
+ "epoch": 3.246335697399527,
+ "grad_norm": 2.9788503646850586,
+ "learning_rate": 2.2068534586789735e-06,
+ "loss": 0.3572,
+ "step": 6866
+ },
+ {
+ "epoch": 3.246808510638298,
+ "grad_norm": 2.84075665473938,
+ "learning_rate": 2.206233944359952e-06,
+ "loss": 0.3561,
+ "step": 6867
+ },
+ {
+ "epoch": 3.2472813238770684,
+ "grad_norm": 2.966459035873413,
+ "learning_rate": 2.2056144483324545e-06,
+ "loss": 0.3909,
+ "step": 6868
+ },
+ {
+ "epoch": 3.247754137115839,
+ "grad_norm": 2.892038106918335,
+ "learning_rate": 2.204994970635054e-06,
+ "loss": 0.3557,
+ "step": 6869
+ },
+ {
+ "epoch": 3.24822695035461,
+ "grad_norm": 2.7458810806274414,
+ "learning_rate": 2.2043755113063233e-06,
+ "loss": 0.3551,
+ "step": 6870
+ },
+ {
+ "epoch": 3.2486997635933808,
+ "grad_norm": 2.766803741455078,
+ "learning_rate": 2.2037560703848334e-06,
+ "loss": 0.3343,
+ "step": 6871
+ },
+ {
+ "epoch": 3.249172576832151,
+ "grad_norm": 2.9780561923980713,
+ "learning_rate": 2.2031366479091533e-06,
+ "loss": 0.4004,
+ "step": 6872
+ },
+ {
+ "epoch": 3.249645390070922,
+ "grad_norm": 2.8848516941070557,
+ "learning_rate": 2.202517243917853e-06,
+ "loss": 0.3467,
+ "step": 6873
+ },
+ {
+ "epoch": 3.2501182033096927,
+ "grad_norm": 2.9962213039398193,
+ "learning_rate": 2.201897858449499e-06,
+ "loss": 0.3796,
+ "step": 6874
+ },
+ {
+ "epoch": 3.2505910165484635,
+ "grad_norm": 2.838131904602051,
+ "learning_rate": 2.201278491542659e-06,
+ "loss": 0.3683,
+ "step": 6875
+ },
+ {
+ "epoch": 3.251063829787234,
+ "grad_norm": 3.0232505798339844,
+ "learning_rate": 2.200659143235897e-06,
+ "loss": 0.3793,
+ "step": 6876
+ },
+ {
+ "epoch": 3.2515366430260046,
+ "grad_norm": 3.0690126419067383,
+ "learning_rate": 2.2000398135677776e-06,
+ "loss": 0.417,
+ "step": 6877
+ },
+ {
+ "epoch": 3.2520094562647754,
+ "grad_norm": 3.1838719844818115,
+ "learning_rate": 2.1994205025768643e-06,
+ "loss": 0.4608,
+ "step": 6878
+ },
+ {
+ "epoch": 3.2524822695035462,
+ "grad_norm": 3.1187257766723633,
+ "learning_rate": 2.198801210301717e-06,
+ "loss": 0.3396,
+ "step": 6879
+ },
+ {
+ "epoch": 3.2529550827423166,
+ "grad_norm": 2.7608656883239746,
+ "learning_rate": 2.1981819367808984e-06,
+ "loss": 0.386,
+ "step": 6880
+ },
+ {
+ "epoch": 3.2534278959810874,
+ "grad_norm": 3.027456283569336,
+ "learning_rate": 2.197562682052968e-06,
+ "loss": 0.3941,
+ "step": 6881
+ },
+ {
+ "epoch": 3.253900709219858,
+ "grad_norm": 2.925515651702881,
+ "learning_rate": 2.1969434461564816e-06,
+ "loss": 0.3608,
+ "step": 6882
+ },
+ {
+ "epoch": 3.254373522458629,
+ "grad_norm": 2.946770668029785,
+ "learning_rate": 2.196324229129999e-06,
+ "loss": 0.4116,
+ "step": 6883
+ },
+ {
+ "epoch": 3.2548463356973993,
+ "grad_norm": 2.6497952938079834,
+ "learning_rate": 2.1957050310120746e-06,
+ "loss": 0.338,
+ "step": 6884
+ },
+ {
+ "epoch": 3.25531914893617,
+ "grad_norm": 2.6915128231048584,
+ "learning_rate": 2.195085851841264e-06,
+ "loss": 0.3372,
+ "step": 6885
+ },
+ {
+ "epoch": 3.255791962174941,
+ "grad_norm": 3.4022350311279297,
+ "learning_rate": 2.1944666916561205e-06,
+ "loss": 0.3844,
+ "step": 6886
+ },
+ {
+ "epoch": 3.2562647754137117,
+ "grad_norm": 2.7463366985321045,
+ "learning_rate": 2.1938475504951958e-06,
+ "loss": 0.3268,
+ "step": 6887
+ },
+ {
+ "epoch": 3.256737588652482,
+ "grad_norm": 2.828810691833496,
+ "learning_rate": 2.193228428397042e-06,
+ "loss": 0.3275,
+ "step": 6888
+ },
+ {
+ "epoch": 3.257210401891253,
+ "grad_norm": 3.4016268253326416,
+ "learning_rate": 2.192609325400208e-06,
+ "loss": 0.3916,
+ "step": 6889
+ },
+ {
+ "epoch": 3.2576832151300237,
+ "grad_norm": 2.4980733394622803,
+ "learning_rate": 2.191990241543245e-06,
+ "loss": 0.3636,
+ "step": 6890
+ },
+ {
+ "epoch": 3.2581560283687945,
+ "grad_norm": 3.0384702682495117,
+ "learning_rate": 2.191371176864698e-06,
+ "loss": 0.398,
+ "step": 6891
+ },
+ {
+ "epoch": 3.258628841607565,
+ "grad_norm": 2.8949527740478516,
+ "learning_rate": 2.190752131403115e-06,
+ "loss": 0.3919,
+ "step": 6892
+ },
+ {
+ "epoch": 3.2591016548463356,
+ "grad_norm": 2.765617609024048,
+ "learning_rate": 2.190133105197041e-06,
+ "loss": 0.3799,
+ "step": 6893
+ },
+ {
+ "epoch": 3.2595744680851064,
+ "grad_norm": 2.6149277687072754,
+ "learning_rate": 2.18951409828502e-06,
+ "loss": 0.3895,
+ "step": 6894
+ },
+ {
+ "epoch": 3.260047281323877,
+ "grad_norm": 2.9738945960998535,
+ "learning_rate": 2.1888951107055934e-06,
+ "loss": 0.3879,
+ "step": 6895
+ },
+ {
+ "epoch": 3.2605200945626476,
+ "grad_norm": 2.9438633918762207,
+ "learning_rate": 2.1882761424973053e-06,
+ "loss": 0.438,
+ "step": 6896
+ },
+ {
+ "epoch": 3.2609929078014184,
+ "grad_norm": 3.114243984222412,
+ "learning_rate": 2.1876571936986936e-06,
+ "loss": 0.4737,
+ "step": 6897
+ },
+ {
+ "epoch": 3.261465721040189,
+ "grad_norm": 3.017526388168335,
+ "learning_rate": 2.1870382643483e-06,
+ "loss": 0.4039,
+ "step": 6898
+ },
+ {
+ "epoch": 3.26193853427896,
+ "grad_norm": 3.1475703716278076,
+ "learning_rate": 2.1864193544846613e-06,
+ "loss": 0.3825,
+ "step": 6899
+ },
+ {
+ "epoch": 3.2624113475177303,
+ "grad_norm": 2.75502872467041,
+ "learning_rate": 2.1858004641463142e-06,
+ "loss": 0.3507,
+ "step": 6900
+ },
+ {
+ "epoch": 3.262884160756501,
+ "grad_norm": 3.0467209815979004,
+ "learning_rate": 2.1851815933717944e-06,
+ "loss": 0.3938,
+ "step": 6901
+ },
+ {
+ "epoch": 3.263356973995272,
+ "grad_norm": 2.993014097213745,
+ "learning_rate": 2.184562742199636e-06,
+ "loss": 0.3711,
+ "step": 6902
+ },
+ {
+ "epoch": 3.2638297872340427,
+ "grad_norm": 2.607309341430664,
+ "learning_rate": 2.183943910668373e-06,
+ "loss": 0.3689,
+ "step": 6903
+ },
+ {
+ "epoch": 3.264302600472813,
+ "grad_norm": 2.961653470993042,
+ "learning_rate": 2.1833250988165373e-06,
+ "loss": 0.3806,
+ "step": 6904
+ },
+ {
+ "epoch": 3.264775413711584,
+ "grad_norm": 2.8202552795410156,
+ "learning_rate": 2.1827063066826574e-06,
+ "loss": 0.391,
+ "step": 6905
+ },
+ {
+ "epoch": 3.2652482269503547,
+ "grad_norm": 3.032648801803589,
+ "learning_rate": 2.1820875343052666e-06,
+ "loss": 0.4011,
+ "step": 6906
+ },
+ {
+ "epoch": 3.2657210401891255,
+ "grad_norm": 2.8265180587768555,
+ "learning_rate": 2.1814687817228896e-06,
+ "loss": 0.3923,
+ "step": 6907
+ },
+ {
+ "epoch": 3.266193853427896,
+ "grad_norm": 3.1425564289093018,
+ "learning_rate": 2.1808500489740555e-06,
+ "loss": 0.4913,
+ "step": 6908
+ },
+ {
+ "epoch": 3.2666666666666666,
+ "grad_norm": 2.977809429168701,
+ "learning_rate": 2.18023133609729e-06,
+ "loss": 0.379,
+ "step": 6909
+ },
+ {
+ "epoch": 3.2671394799054374,
+ "grad_norm": 3.509551525115967,
+ "learning_rate": 2.1796126431311153e-06,
+ "loss": 0.4025,
+ "step": 6910
+ },
+ {
+ "epoch": 3.267612293144208,
+ "grad_norm": 2.9133846759796143,
+ "learning_rate": 2.178993970114058e-06,
+ "loss": 0.4209,
+ "step": 6911
+ },
+ {
+ "epoch": 3.2680851063829786,
+ "grad_norm": 2.945513963699341,
+ "learning_rate": 2.178375317084637e-06,
+ "loss": 0.3882,
+ "step": 6912
+ },
+ {
+ "epoch": 3.2685579196217494,
+ "grad_norm": 2.7868733406066895,
+ "learning_rate": 2.1777566840813763e-06,
+ "loss": 0.3456,
+ "step": 6913
+ },
+ {
+ "epoch": 3.26903073286052,
+ "grad_norm": 2.803220748901367,
+ "learning_rate": 2.1771380711427937e-06,
+ "loss": 0.3394,
+ "step": 6914
+ },
+ {
+ "epoch": 3.269503546099291,
+ "grad_norm": 3.1293554306030273,
+ "learning_rate": 2.176519478307407e-06,
+ "loss": 0.402,
+ "step": 6915
+ },
+ {
+ "epoch": 3.2699763593380613,
+ "grad_norm": 2.843971014022827,
+ "learning_rate": 2.1759009056137347e-06,
+ "loss": 0.3449,
+ "step": 6916
+ },
+ {
+ "epoch": 3.270449172576832,
+ "grad_norm": 2.9983274936676025,
+ "learning_rate": 2.1752823531002917e-06,
+ "loss": 0.4091,
+ "step": 6917
+ },
+ {
+ "epoch": 3.270921985815603,
+ "grad_norm": 2.686722993850708,
+ "learning_rate": 2.174663820805592e-06,
+ "loss": 0.4303,
+ "step": 6918
+ },
+ {
+ "epoch": 3.2713947990543737,
+ "grad_norm": 2.669349431991577,
+ "learning_rate": 2.1740453087681508e-06,
+ "loss": 0.3796,
+ "step": 6919
+ },
+ {
+ "epoch": 3.271867612293144,
+ "grad_norm": 2.992138624191284,
+ "learning_rate": 2.173426817026477e-06,
+ "loss": 0.4125,
+ "step": 6920
+ },
+ {
+ "epoch": 3.272340425531915,
+ "grad_norm": 3.332834243774414,
+ "learning_rate": 2.1728083456190852e-06,
+ "loss": 0.3885,
+ "step": 6921
+ },
+ {
+ "epoch": 3.2728132387706856,
+ "grad_norm": 2.869673013687134,
+ "learning_rate": 2.1721898945844825e-06,
+ "loss": 0.3941,
+ "step": 6922
+ },
+ {
+ "epoch": 3.2732860520094564,
+ "grad_norm": 2.804440975189209,
+ "learning_rate": 2.1715714639611774e-06,
+ "loss": 0.4007,
+ "step": 6923
+ },
+ {
+ "epoch": 3.273758865248227,
+ "grad_norm": 3.1751439571380615,
+ "learning_rate": 2.1709530537876774e-06,
+ "loss": 0.3981,
+ "step": 6924
+ },
+ {
+ "epoch": 3.2742316784869976,
+ "grad_norm": 2.6367175579071045,
+ "learning_rate": 2.1703346641024878e-06,
+ "loss": 0.3582,
+ "step": 6925
+ },
+ {
+ "epoch": 3.2747044917257684,
+ "grad_norm": 2.99164080619812,
+ "learning_rate": 2.1697162949441137e-06,
+ "loss": 0.3846,
+ "step": 6926
+ },
+ {
+ "epoch": 3.275177304964539,
+ "grad_norm": 3.3206982612609863,
+ "learning_rate": 2.169097946351057e-06,
+ "loss": 0.3689,
+ "step": 6927
+ },
+ {
+ "epoch": 3.2756501182033095,
+ "grad_norm": 2.927907943725586,
+ "learning_rate": 2.16847961836182e-06,
+ "loss": 0.3536,
+ "step": 6928
+ },
+ {
+ "epoch": 3.2761229314420803,
+ "grad_norm": 3.1950864791870117,
+ "learning_rate": 2.167861311014904e-06,
+ "loss": 0.4154,
+ "step": 6929
+ },
+ {
+ "epoch": 3.276595744680851,
+ "grad_norm": 2.888383388519287,
+ "learning_rate": 2.1672430243488073e-06,
+ "loss": 0.3702,
+ "step": 6930
+ },
+ {
+ "epoch": 3.277068557919622,
+ "grad_norm": 2.842287063598633,
+ "learning_rate": 2.166624758402029e-06,
+ "loss": 0.3623,
+ "step": 6931
+ },
+ {
+ "epoch": 3.2775413711583923,
+ "grad_norm": 2.84350323677063,
+ "learning_rate": 2.166006513213065e-06,
+ "loss": 0.3757,
+ "step": 6932
+ },
+ {
+ "epoch": 3.278014184397163,
+ "grad_norm": 3.105626344680786,
+ "learning_rate": 2.165388288820411e-06,
+ "loss": 0.3955,
+ "step": 6933
+ },
+ {
+ "epoch": 3.278486997635934,
+ "grad_norm": 3.273508071899414,
+ "learning_rate": 2.164770085262561e-06,
+ "loss": 0.4046,
+ "step": 6934
+ },
+ {
+ "epoch": 3.2789598108747047,
+ "grad_norm": 3.2530124187469482,
+ "learning_rate": 2.1641519025780066e-06,
+ "loss": 0.3141,
+ "step": 6935
+ },
+ {
+ "epoch": 3.279432624113475,
+ "grad_norm": 2.822849750518799,
+ "learning_rate": 2.163533740805242e-06,
+ "loss": 0.3973,
+ "step": 6936
+ },
+ {
+ "epoch": 3.279905437352246,
+ "grad_norm": 2.772097587585449,
+ "learning_rate": 2.162915599982756e-06,
+ "loss": 0.3606,
+ "step": 6937
+ },
+ {
+ "epoch": 3.2803782505910166,
+ "grad_norm": 3.150696039199829,
+ "learning_rate": 2.1622974801490365e-06,
+ "loss": 0.4709,
+ "step": 6938
+ },
+ {
+ "epoch": 3.2808510638297874,
+ "grad_norm": 3.2072134017944336,
+ "learning_rate": 2.1616793813425736e-06,
+ "loss": 0.3946,
+ "step": 6939
+ },
+ {
+ "epoch": 3.2813238770685578,
+ "grad_norm": 2.9922473430633545,
+ "learning_rate": 2.1610613036018515e-06,
+ "loss": 0.3263,
+ "step": 6940
+ },
+ {
+ "epoch": 3.2817966903073286,
+ "grad_norm": 2.7818009853363037,
+ "learning_rate": 2.1604432469653555e-06,
+ "loss": 0.3887,
+ "step": 6941
+ },
+ {
+ "epoch": 3.2822695035460994,
+ "grad_norm": 3.12998628616333,
+ "learning_rate": 2.15982521147157e-06,
+ "loss": 0.3522,
+ "step": 6942
+ },
+ {
+ "epoch": 3.28274231678487,
+ "grad_norm": 2.876678228378296,
+ "learning_rate": 2.159207197158976e-06,
+ "loss": 0.3643,
+ "step": 6943
+ },
+ {
+ "epoch": 3.2832151300236405,
+ "grad_norm": 2.825488805770874,
+ "learning_rate": 2.1585892040660565e-06,
+ "loss": 0.3223,
+ "step": 6944
+ },
+ {
+ "epoch": 3.2836879432624113,
+ "grad_norm": 2.8724498748779297,
+ "learning_rate": 2.1579712322312906e-06,
+ "loss": 0.3855,
+ "step": 6945
+ },
+ {
+ "epoch": 3.284160756501182,
+ "grad_norm": 2.841064691543579,
+ "learning_rate": 2.1573532816931547e-06,
+ "loss": 0.4106,
+ "step": 6946
+ },
+ {
+ "epoch": 3.284633569739953,
+ "grad_norm": 3.053391218185425,
+ "learning_rate": 2.1567353524901288e-06,
+ "loss": 0.4875,
+ "step": 6947
+ },
+ {
+ "epoch": 3.2851063829787233,
+ "grad_norm": 2.7294771671295166,
+ "learning_rate": 2.156117444660687e-06,
+ "loss": 0.3856,
+ "step": 6948
+ },
+ {
+ "epoch": 3.285579196217494,
+ "grad_norm": 3.0965659618377686,
+ "learning_rate": 2.155499558243304e-06,
+ "loss": 0.4104,
+ "step": 6949
+ },
+ {
+ "epoch": 3.286052009456265,
+ "grad_norm": 2.778923511505127,
+ "learning_rate": 2.1548816932764536e-06,
+ "loss": 0.3636,
+ "step": 6950
+ },
+ {
+ "epoch": 3.2865248226950357,
+ "grad_norm": 2.890679121017456,
+ "learning_rate": 2.1542638497986054e-06,
+ "loss": 0.4026,
+ "step": 6951
+ },
+ {
+ "epoch": 3.286997635933806,
+ "grad_norm": 3.0466806888580322,
+ "learning_rate": 2.1536460278482326e-06,
+ "loss": 0.3856,
+ "step": 6952
+ },
+ {
+ "epoch": 3.287470449172577,
+ "grad_norm": 3.1367077827453613,
+ "learning_rate": 2.1530282274638013e-06,
+ "loss": 0.3767,
+ "step": 6953
+ },
+ {
+ "epoch": 3.2879432624113476,
+ "grad_norm": 2.984694719314575,
+ "learning_rate": 2.1524104486837823e-06,
+ "loss": 0.4142,
+ "step": 6954
+ },
+ {
+ "epoch": 3.2884160756501184,
+ "grad_norm": 3.1542797088623047,
+ "learning_rate": 2.151792691546641e-06,
+ "loss": 0.4361,
+ "step": 6955
+ },
+ {
+ "epoch": 3.2888888888888888,
+ "grad_norm": 2.7306816577911377,
+ "learning_rate": 2.1511749560908405e-06,
+ "loss": 0.3692,
+ "step": 6956
+ },
+ {
+ "epoch": 3.2893617021276595,
+ "grad_norm": 3.6679904460906982,
+ "learning_rate": 2.150557242354847e-06,
+ "loss": 0.4496,
+ "step": 6957
+ },
+ {
+ "epoch": 3.2898345153664303,
+ "grad_norm": 3.2040863037109375,
+ "learning_rate": 2.1499395503771207e-06,
+ "loss": 0.3526,
+ "step": 6958
+ },
+ {
+ "epoch": 3.290307328605201,
+ "grad_norm": 3.2416043281555176,
+ "learning_rate": 2.1493218801961246e-06,
+ "loss": 0.3955,
+ "step": 6959
+ },
+ {
+ "epoch": 3.2907801418439715,
+ "grad_norm": 2.8164525032043457,
+ "learning_rate": 2.1487042318503174e-06,
+ "loss": 0.3727,
+ "step": 6960
+ },
+ {
+ "epoch": 3.2912529550827423,
+ "grad_norm": 2.5954513549804688,
+ "learning_rate": 2.148086605378156e-06,
+ "loss": 0.3315,
+ "step": 6961
+ },
+ {
+ "epoch": 3.291725768321513,
+ "grad_norm": 2.8068149089813232,
+ "learning_rate": 2.1474690008181e-06,
+ "loss": 0.3702,
+ "step": 6962
+ },
+ {
+ "epoch": 3.2921985815602834,
+ "grad_norm": 2.9063730239868164,
+ "learning_rate": 2.1468514182086025e-06,
+ "loss": 0.3357,
+ "step": 6963
+ },
+ {
+ "epoch": 3.2926713947990542,
+ "grad_norm": 2.7623207569122314,
+ "learning_rate": 2.1462338575881197e-06,
+ "loss": 0.381,
+ "step": 6964
+ },
+ {
+ "epoch": 3.293144208037825,
+ "grad_norm": 2.6818830966949463,
+ "learning_rate": 2.145616318995103e-06,
+ "loss": 0.3733,
+ "step": 6965
+ },
+ {
+ "epoch": 3.293617021276596,
+ "grad_norm": 2.7966864109039307,
+ "learning_rate": 2.1449988024680034e-06,
+ "loss": 0.3993,
+ "step": 6966
+ },
+ {
+ "epoch": 3.2940898345153666,
+ "grad_norm": 3.0644514560699463,
+ "learning_rate": 2.1443813080452728e-06,
+ "loss": 0.3541,
+ "step": 6967
+ },
+ {
+ "epoch": 3.294562647754137,
+ "grad_norm": 3.03204607963562,
+ "learning_rate": 2.1437638357653586e-06,
+ "loss": 0.3864,
+ "step": 6968
+ },
+ {
+ "epoch": 3.295035460992908,
+ "grad_norm": 2.980565071105957,
+ "learning_rate": 2.143146385666707e-06,
+ "loss": 0.36,
+ "step": 6969
+ },
+ {
+ "epoch": 3.2955082742316786,
+ "grad_norm": 3.1261661052703857,
+ "learning_rate": 2.1425289577877675e-06,
+ "loss": 0.4053,
+ "step": 6970
+ },
+ {
+ "epoch": 3.295981087470449,
+ "grad_norm": 3.0194897651672363,
+ "learning_rate": 2.1419115521669804e-06,
+ "loss": 0.4553,
+ "step": 6971
+ },
+ {
+ "epoch": 3.2964539007092197,
+ "grad_norm": 2.7620482444763184,
+ "learning_rate": 2.141294168842792e-06,
+ "loss": 0.3846,
+ "step": 6972
+ },
+ {
+ "epoch": 3.2969267139479905,
+ "grad_norm": 2.9575016498565674,
+ "learning_rate": 2.1406768078536427e-06,
+ "loss": 0.4415,
+ "step": 6973
+ },
+ {
+ "epoch": 3.2973995271867613,
+ "grad_norm": 3.17909574508667,
+ "learning_rate": 2.1400594692379717e-06,
+ "loss": 0.4514,
+ "step": 6974
+ },
+ {
+ "epoch": 3.297872340425532,
+ "grad_norm": 3.128613233566284,
+ "learning_rate": 2.1394421530342207e-06,
+ "loss": 0.3757,
+ "step": 6975
+ },
+ {
+ "epoch": 3.2983451536643025,
+ "grad_norm": 3.0247111320495605,
+ "learning_rate": 2.1388248592808243e-06,
+ "loss": 0.3881,
+ "step": 6976
+ },
+ {
+ "epoch": 3.2988179669030733,
+ "grad_norm": 2.8091228008270264,
+ "learning_rate": 2.1382075880162217e-06,
+ "loss": 0.3782,
+ "step": 6977
+ },
+ {
+ "epoch": 3.299290780141844,
+ "grad_norm": 2.985105514526367,
+ "learning_rate": 2.137590339278846e-06,
+ "loss": 0.3783,
+ "step": 6978
+ },
+ {
+ "epoch": 3.2997635933806144,
+ "grad_norm": 3.1862502098083496,
+ "learning_rate": 2.1369731131071304e-06,
+ "loss": 0.4776,
+ "step": 6979
+ },
+ {
+ "epoch": 3.300236406619385,
+ "grad_norm": 3.3138091564178467,
+ "learning_rate": 2.1363559095395075e-06,
+ "loss": 0.4056,
+ "step": 6980
+ },
+ {
+ "epoch": 3.300709219858156,
+ "grad_norm": 3.023695707321167,
+ "learning_rate": 2.135738728614407e-06,
+ "loss": 0.3716,
+ "step": 6981
+ },
+ {
+ "epoch": 3.301182033096927,
+ "grad_norm": 6.149252414703369,
+ "learning_rate": 2.135121570370259e-06,
+ "loss": 0.3713,
+ "step": 6982
+ },
+ {
+ "epoch": 3.3016548463356976,
+ "grad_norm": 2.689671754837036,
+ "learning_rate": 2.134504434845491e-06,
+ "loss": 0.3541,
+ "step": 6983
+ },
+ {
+ "epoch": 3.302127659574468,
+ "grad_norm": 3.241212844848633,
+ "learning_rate": 2.1338873220785284e-06,
+ "loss": 0.4328,
+ "step": 6984
+ },
+ {
+ "epoch": 3.3026004728132388,
+ "grad_norm": 3.6037068367004395,
+ "learning_rate": 2.133270232107798e-06,
+ "loss": 0.4091,
+ "step": 6985
+ },
+ {
+ "epoch": 3.3030732860520096,
+ "grad_norm": 3.300031900405884,
+ "learning_rate": 2.1326531649717216e-06,
+ "loss": 0.3742,
+ "step": 6986
+ },
+ {
+ "epoch": 3.30354609929078,
+ "grad_norm": 2.82257342338562,
+ "learning_rate": 2.1320361207087225e-06,
+ "loss": 0.3622,
+ "step": 6987
+ },
+ {
+ "epoch": 3.3040189125295507,
+ "grad_norm": 3.297513246536255,
+ "learning_rate": 2.1314190993572196e-06,
+ "loss": 0.4606,
+ "step": 6988
+ },
+ {
+ "epoch": 3.3044917257683215,
+ "grad_norm": 2.676440954208374,
+ "learning_rate": 2.130802100955634e-06,
+ "loss": 0.382,
+ "step": 6989
+ },
+ {
+ "epoch": 3.3049645390070923,
+ "grad_norm": 2.9548017978668213,
+ "learning_rate": 2.130185125542383e-06,
+ "loss": 0.3751,
+ "step": 6990
+ },
+ {
+ "epoch": 3.305437352245863,
+ "grad_norm": 2.800647020339966,
+ "learning_rate": 2.129568173155882e-06,
+ "loss": 0.3868,
+ "step": 6991
+ },
+ {
+ "epoch": 3.3059101654846335,
+ "grad_norm": 3.3789260387420654,
+ "learning_rate": 2.128951243834546e-06,
+ "loss": 0.4373,
+ "step": 6992
+ },
+ {
+ "epoch": 3.3063829787234043,
+ "grad_norm": 2.944807767868042,
+ "learning_rate": 2.12833433761679e-06,
+ "loss": 0.4205,
+ "step": 6993
+ },
+ {
+ "epoch": 3.306855791962175,
+ "grad_norm": 2.577975273132324,
+ "learning_rate": 2.127717454541025e-06,
+ "loss": 0.4197,
+ "step": 6994
+ },
+ {
+ "epoch": 3.3073286052009454,
+ "grad_norm": 3.0542666912078857,
+ "learning_rate": 2.127100594645661e-06,
+ "loss": 0.3811,
+ "step": 6995
+ },
+ {
+ "epoch": 3.307801418439716,
+ "grad_norm": 3.163015842437744,
+ "learning_rate": 2.1264837579691088e-06,
+ "loss": 0.415,
+ "step": 6996
+ },
+ {
+ "epoch": 3.308274231678487,
+ "grad_norm": 2.9161269664764404,
+ "learning_rate": 2.1258669445497746e-06,
+ "loss": 0.3714,
+ "step": 6997
+ },
+ {
+ "epoch": 3.308747044917258,
+ "grad_norm": 2.934483289718628,
+ "learning_rate": 2.1252501544260657e-06,
+ "loss": 0.4085,
+ "step": 6998
+ },
+ {
+ "epoch": 3.3092198581560286,
+ "grad_norm": 3.155613660812378,
+ "learning_rate": 2.1246333876363852e-06,
+ "loss": 0.4698,
+ "step": 6999
+ },
+ {
+ "epoch": 3.309692671394799,
+ "grad_norm": 2.648171901702881,
+ "learning_rate": 2.124016644219139e-06,
+ "loss": 0.3091,
+ "step": 7000
+ },
+ {
+ "epoch": 3.3101654846335697,
+ "grad_norm": 2.908219814300537,
+ "learning_rate": 2.123399924212728e-06,
+ "loss": 0.4063,
+ "step": 7001
+ },
+ {
+ "epoch": 3.3106382978723405,
+ "grad_norm": 3.138749361038208,
+ "learning_rate": 2.122783227655551e-06,
+ "loss": 0.4296,
+ "step": 7002
+ },
+ {
+ "epoch": 3.311111111111111,
+ "grad_norm": 3.044466018676758,
+ "learning_rate": 2.1221665545860094e-06,
+ "loss": 0.4424,
+ "step": 7003
+ },
+ {
+ "epoch": 3.3115839243498817,
+ "grad_norm": 2.6758792400360107,
+ "learning_rate": 2.121549905042499e-06,
+ "loss": 0.4073,
+ "step": 7004
+ },
+ {
+ "epoch": 3.3120567375886525,
+ "grad_norm": 2.8901989459991455,
+ "learning_rate": 2.1209332790634174e-06,
+ "loss": 0.3842,
+ "step": 7005
+ },
+ {
+ "epoch": 3.3125295508274233,
+ "grad_norm": 2.8179712295532227,
+ "learning_rate": 2.1203166766871582e-06,
+ "loss": 0.366,
+ "step": 7006
+ },
+ {
+ "epoch": 3.313002364066194,
+ "grad_norm": 2.6536550521850586,
+ "learning_rate": 2.1197000979521138e-06,
+ "loss": 0.3851,
+ "step": 7007
+ },
+ {
+ "epoch": 3.3134751773049644,
+ "grad_norm": 3.1277682781219482,
+ "learning_rate": 2.1190835428966775e-06,
+ "loss": 0.4249,
+ "step": 7008
+ },
+ {
+ "epoch": 3.3139479905437352,
+ "grad_norm": 2.924666166305542,
+ "learning_rate": 2.1184670115592383e-06,
+ "loss": 0.3873,
+ "step": 7009
+ },
+ {
+ "epoch": 3.314420803782506,
+ "grad_norm": 2.7921009063720703,
+ "learning_rate": 2.1178505039781856e-06,
+ "loss": 0.3754,
+ "step": 7010
+ },
+ {
+ "epoch": 3.3148936170212764,
+ "grad_norm": 2.5349879264831543,
+ "learning_rate": 2.1172340201919067e-06,
+ "loss": 0.3701,
+ "step": 7011
+ },
+ {
+ "epoch": 3.315366430260047,
+ "grad_norm": 2.849376678466797,
+ "learning_rate": 2.1166175602387866e-06,
+ "loss": 0.3963,
+ "step": 7012
+ },
+ {
+ "epoch": 3.315839243498818,
+ "grad_norm": 3.141280174255371,
+ "learning_rate": 2.11600112415721e-06,
+ "loss": 0.4158,
+ "step": 7013
+ },
+ {
+ "epoch": 3.3163120567375888,
+ "grad_norm": 2.922807455062866,
+ "learning_rate": 2.11538471198556e-06,
+ "loss": 0.3667,
+ "step": 7014
+ },
+ {
+ "epoch": 3.3167848699763596,
+ "grad_norm": 2.770400047302246,
+ "learning_rate": 2.114768323762216e-06,
+ "loss": 0.3674,
+ "step": 7015
+ },
+ {
+ "epoch": 3.31725768321513,
+ "grad_norm": 2.7706570625305176,
+ "learning_rate": 2.114151959525561e-06,
+ "loss": 0.3761,
+ "step": 7016
+ },
+ {
+ "epoch": 3.3177304964539007,
+ "grad_norm": 3.041755437850952,
+ "learning_rate": 2.1135356193139704e-06,
+ "loss": 0.4483,
+ "step": 7017
+ },
+ {
+ "epoch": 3.3182033096926715,
+ "grad_norm": 3.5757904052734375,
+ "learning_rate": 2.1129193031658227e-06,
+ "loss": 0.4094,
+ "step": 7018
+ },
+ {
+ "epoch": 3.318676122931442,
+ "grad_norm": 2.9292917251586914,
+ "learning_rate": 2.1123030111194936e-06,
+ "loss": 0.3514,
+ "step": 7019
+ },
+ {
+ "epoch": 3.3191489361702127,
+ "grad_norm": 3.1443874835968018,
+ "learning_rate": 2.111686743213355e-06,
+ "loss": 0.4098,
+ "step": 7020
+ },
+ {
+ "epoch": 3.3196217494089835,
+ "grad_norm": 2.9738030433654785,
+ "learning_rate": 2.1110704994857804e-06,
+ "loss": 0.3584,
+ "step": 7021
+ },
+ {
+ "epoch": 3.3200945626477543,
+ "grad_norm": 2.8961563110351562,
+ "learning_rate": 2.1104542799751397e-06,
+ "loss": 0.3736,
+ "step": 7022
+ },
+ {
+ "epoch": 3.320567375886525,
+ "grad_norm": 3.9264683723449707,
+ "learning_rate": 2.1098380847198037e-06,
+ "loss": 0.457,
+ "step": 7023
+ },
+ {
+ "epoch": 3.3210401891252954,
+ "grad_norm": 2.8742756843566895,
+ "learning_rate": 2.109221913758139e-06,
+ "loss": 0.4252,
+ "step": 7024
+ },
+ {
+ "epoch": 3.321513002364066,
+ "grad_norm": 3.7229559421539307,
+ "learning_rate": 2.108605767128512e-06,
+ "loss": 0.4451,
+ "step": 7025
+ },
+ {
+ "epoch": 3.321985815602837,
+ "grad_norm": 2.6417593955993652,
+ "learning_rate": 2.1079896448692884e-06,
+ "loss": 0.3658,
+ "step": 7026
+ },
+ {
+ "epoch": 3.3224586288416074,
+ "grad_norm": 2.8780412673950195,
+ "learning_rate": 2.10737354701883e-06,
+ "loss": 0.4225,
+ "step": 7027
+ },
+ {
+ "epoch": 3.322931442080378,
+ "grad_norm": 2.557816505432129,
+ "learning_rate": 2.1067574736155e-06,
+ "loss": 0.3812,
+ "step": 7028
+ },
+ {
+ "epoch": 3.323404255319149,
+ "grad_norm": 2.859062910079956,
+ "learning_rate": 2.106141424697658e-06,
+ "loss": 0.3629,
+ "step": 7029
+ },
+ {
+ "epoch": 3.3238770685579198,
+ "grad_norm": 2.4776878356933594,
+ "learning_rate": 2.1055254003036607e-06,
+ "loss": 0.3591,
+ "step": 7030
+ },
+ {
+ "epoch": 3.3243498817966906,
+ "grad_norm": 3.085066795349121,
+ "learning_rate": 2.1049094004718687e-06,
+ "loss": 0.4237,
+ "step": 7031
+ },
+ {
+ "epoch": 3.324822695035461,
+ "grad_norm": 2.862592935562134,
+ "learning_rate": 2.1042934252406345e-06,
+ "loss": 0.3185,
+ "step": 7032
+ },
+ {
+ "epoch": 3.3252955082742317,
+ "grad_norm": 2.965743064880371,
+ "learning_rate": 2.1036774746483145e-06,
+ "loss": 0.4058,
+ "step": 7033
+ },
+ {
+ "epoch": 3.3257683215130025,
+ "grad_norm": 2.7420589923858643,
+ "learning_rate": 2.103061548733261e-06,
+ "loss": 0.3566,
+ "step": 7034
+ },
+ {
+ "epoch": 3.326241134751773,
+ "grad_norm": 2.7824347019195557,
+ "learning_rate": 2.1024456475338235e-06,
+ "loss": 0.3553,
+ "step": 7035
+ },
+ {
+ "epoch": 3.3267139479905437,
+ "grad_norm": 3.0410704612731934,
+ "learning_rate": 2.1018297710883528e-06,
+ "loss": 0.3772,
+ "step": 7036
+ },
+ {
+ "epoch": 3.3271867612293144,
+ "grad_norm": 3.0811562538146973,
+ "learning_rate": 2.101213919435196e-06,
+ "loss": 0.3738,
+ "step": 7037
+ },
+ {
+ "epoch": 3.3276595744680852,
+ "grad_norm": 2.939445734024048,
+ "learning_rate": 2.100598092612699e-06,
+ "loss": 0.4107,
+ "step": 7038
+ },
+ {
+ "epoch": 3.3281323877068556,
+ "grad_norm": 3.05804705619812,
+ "learning_rate": 2.0999822906592086e-06,
+ "loss": 0.3972,
+ "step": 7039
+ },
+ {
+ "epoch": 3.3286052009456264,
+ "grad_norm": 2.803558111190796,
+ "learning_rate": 2.0993665136130657e-06,
+ "loss": 0.3487,
+ "step": 7040
+ },
+ {
+ "epoch": 3.329078014184397,
+ "grad_norm": 2.937675714492798,
+ "learning_rate": 2.0987507615126147e-06,
+ "loss": 0.4095,
+ "step": 7041
+ },
+ {
+ "epoch": 3.329550827423168,
+ "grad_norm": 2.853905439376831,
+ "learning_rate": 2.098135034396194e-06,
+ "loss": 0.3775,
+ "step": 7042
+ },
+ {
+ "epoch": 3.3300236406619383,
+ "grad_norm": 3.3520495891571045,
+ "learning_rate": 2.097519332302142e-06,
+ "loss": 0.4065,
+ "step": 7043
+ },
+ {
+ "epoch": 3.330496453900709,
+ "grad_norm": 2.8787078857421875,
+ "learning_rate": 2.096903655268797e-06,
+ "loss": 0.3452,
+ "step": 7044
+ },
+ {
+ "epoch": 3.33096926713948,
+ "grad_norm": 2.993896007537842,
+ "learning_rate": 2.096288003334493e-06,
+ "loss": 0.3814,
+ "step": 7045
+ },
+ {
+ "epoch": 3.3314420803782507,
+ "grad_norm": 3.5248336791992188,
+ "learning_rate": 2.0956723765375655e-06,
+ "loss": 0.3852,
+ "step": 7046
+ },
+ {
+ "epoch": 3.331914893617021,
+ "grad_norm": 3.2227890491485596,
+ "learning_rate": 2.0950567749163463e-06,
+ "loss": 0.3913,
+ "step": 7047
+ },
+ {
+ "epoch": 3.332387706855792,
+ "grad_norm": 3.390401601791382,
+ "learning_rate": 2.094441198509165e-06,
+ "loss": 0.3944,
+ "step": 7048
+ },
+ {
+ "epoch": 3.3328605200945627,
+ "grad_norm": 3.2057554721832275,
+ "learning_rate": 2.0938256473543534e-06,
+ "loss": 0.404,
+ "step": 7049
+ },
+ {
+ "epoch": 3.3333333333333335,
+ "grad_norm": 2.866708755493164,
+ "learning_rate": 2.0932101214902367e-06,
+ "loss": 0.4345,
+ "step": 7050
+ },
+ {
+ "epoch": 3.333806146572104,
+ "grad_norm": 3.4304039478302,
+ "learning_rate": 2.0925946209551428e-06,
+ "loss": 0.4209,
+ "step": 7051
+ },
+ {
+ "epoch": 3.3342789598108746,
+ "grad_norm": 3.996561288833618,
+ "learning_rate": 2.091979145787395e-06,
+ "loss": 0.4394,
+ "step": 7052
+ },
+ {
+ "epoch": 3.3347517730496454,
+ "grad_norm": 3.1932613849639893,
+ "learning_rate": 2.0913636960253166e-06,
+ "loss": 0.3837,
+ "step": 7053
+ },
+ {
+ "epoch": 3.3352245862884162,
+ "grad_norm": 2.908832311630249,
+ "learning_rate": 2.0907482717072293e-06,
+ "loss": 0.3526,
+ "step": 7054
+ },
+ {
+ "epoch": 3.3356973995271866,
+ "grad_norm": 2.7319607734680176,
+ "learning_rate": 2.090132872871452e-06,
+ "loss": 0.3686,
+ "step": 7055
+ },
+ {
+ "epoch": 3.3361702127659574,
+ "grad_norm": 2.9213504791259766,
+ "learning_rate": 2.0895174995563043e-06,
+ "loss": 0.4034,
+ "step": 7056
+ },
+ {
+ "epoch": 3.336643026004728,
+ "grad_norm": 2.8093936443328857,
+ "learning_rate": 2.0889021518001017e-06,
+ "loss": 0.4151,
+ "step": 7057
+ },
+ {
+ "epoch": 3.337115839243499,
+ "grad_norm": 3.1840829849243164,
+ "learning_rate": 2.0882868296411594e-06,
+ "loss": 0.3501,
+ "step": 7058
+ },
+ {
+ "epoch": 3.3375886524822693,
+ "grad_norm": 2.793567657470703,
+ "learning_rate": 2.087671533117791e-06,
+ "loss": 0.3911,
+ "step": 7059
+ },
+ {
+ "epoch": 3.33806146572104,
+ "grad_norm": 3.0820090770721436,
+ "learning_rate": 2.0870562622683077e-06,
+ "loss": 0.432,
+ "step": 7060
+ },
+ {
+ "epoch": 3.338534278959811,
+ "grad_norm": 2.774630546569824,
+ "learning_rate": 2.0864410171310213e-06,
+ "loss": 0.3434,
+ "step": 7061
+ },
+ {
+ "epoch": 3.3390070921985817,
+ "grad_norm": 2.70447039604187,
+ "learning_rate": 2.085825797744239e-06,
+ "loss": 0.3787,
+ "step": 7062
+ },
+ {
+ "epoch": 3.339479905437352,
+ "grad_norm": 3.1014437675476074,
+ "learning_rate": 2.0852106041462672e-06,
+ "loss": 0.4568,
+ "step": 7063
+ },
+ {
+ "epoch": 3.339952718676123,
+ "grad_norm": 3.312680244445801,
+ "learning_rate": 2.0845954363754133e-06,
+ "loss": 0.4285,
+ "step": 7064
+ },
+ {
+ "epoch": 3.3404255319148937,
+ "grad_norm": 2.7070534229278564,
+ "learning_rate": 2.0839802944699806e-06,
+ "loss": 0.4096,
+ "step": 7065
+ },
+ {
+ "epoch": 3.3408983451536645,
+ "grad_norm": 2.8172531127929688,
+ "learning_rate": 2.083365178468269e-06,
+ "loss": 0.3652,
+ "step": 7066
+ },
+ {
+ "epoch": 3.341371158392435,
+ "grad_norm": 2.896378517150879,
+ "learning_rate": 2.082750088408582e-06,
+ "loss": 0.3778,
+ "step": 7067
+ },
+ {
+ "epoch": 3.3418439716312056,
+ "grad_norm": 2.769805669784546,
+ "learning_rate": 2.0821350243292175e-06,
+ "loss": 0.3593,
+ "step": 7068
+ },
+ {
+ "epoch": 3.3423167848699764,
+ "grad_norm": 2.672520875930786,
+ "learning_rate": 2.0815199862684728e-06,
+ "loss": 0.3873,
+ "step": 7069
+ },
+ {
+ "epoch": 3.342789598108747,
+ "grad_norm": 2.841327428817749,
+ "learning_rate": 2.0809049742646435e-06,
+ "loss": 0.41,
+ "step": 7070
+ },
+ {
+ "epoch": 3.3432624113475176,
+ "grad_norm": 3.0540482997894287,
+ "learning_rate": 2.080289988356023e-06,
+ "loss": 0.32,
+ "step": 7071
+ },
+ {
+ "epoch": 3.3437352245862884,
+ "grad_norm": 3.471684217453003,
+ "learning_rate": 2.079675028580905e-06,
+ "loss": 0.3779,
+ "step": 7072
+ },
+ {
+ "epoch": 3.344208037825059,
+ "grad_norm": 2.8545875549316406,
+ "learning_rate": 2.07906009497758e-06,
+ "loss": 0.4645,
+ "step": 7073
+ },
+ {
+ "epoch": 3.34468085106383,
+ "grad_norm": 2.7771127223968506,
+ "learning_rate": 2.078445187584337e-06,
+ "loss": 0.3889,
+ "step": 7074
+ },
+ {
+ "epoch": 3.3451536643026003,
+ "grad_norm": 2.769188165664673,
+ "learning_rate": 2.0778303064394647e-06,
+ "loss": 0.3745,
+ "step": 7075
+ },
+ {
+ "epoch": 3.345626477541371,
+ "grad_norm": 2.739577531814575,
+ "learning_rate": 2.0772154515812467e-06,
+ "loss": 0.4402,
+ "step": 7076
+ },
+ {
+ "epoch": 3.346099290780142,
+ "grad_norm": 2.6124343872070312,
+ "learning_rate": 2.0766006230479696e-06,
+ "loss": 0.3595,
+ "step": 7077
+ },
+ {
+ "epoch": 3.3465721040189127,
+ "grad_norm": 2.7100563049316406,
+ "learning_rate": 2.0759858208779136e-06,
+ "loss": 0.3641,
+ "step": 7078
+ },
+ {
+ "epoch": 3.347044917257683,
+ "grad_norm": 2.8594000339508057,
+ "learning_rate": 2.075371045109363e-06,
+ "loss": 0.402,
+ "step": 7079
+ },
+ {
+ "epoch": 3.347517730496454,
+ "grad_norm": 3.2045278549194336,
+ "learning_rate": 2.0747562957805955e-06,
+ "loss": 0.4719,
+ "step": 7080
+ },
+ {
+ "epoch": 3.3479905437352246,
+ "grad_norm": 2.825594663619995,
+ "learning_rate": 2.0741415729298874e-06,
+ "loss": 0.4127,
+ "step": 7081
+ },
+ {
+ "epoch": 3.3484633569739954,
+ "grad_norm": 2.992403984069824,
+ "learning_rate": 2.0735268765955173e-06,
+ "loss": 0.3943,
+ "step": 7082
+ },
+ {
+ "epoch": 3.348936170212766,
+ "grad_norm": 3.0629165172576904,
+ "learning_rate": 2.072912206815758e-06,
+ "loss": 0.4132,
+ "step": 7083
+ },
+ {
+ "epoch": 3.3494089834515366,
+ "grad_norm": 2.7553658485412598,
+ "learning_rate": 2.0722975636288836e-06,
+ "loss": 0.3667,
+ "step": 7084
+ },
+ {
+ "epoch": 3.3498817966903074,
+ "grad_norm": 3.1556780338287354,
+ "learning_rate": 2.0716829470731647e-06,
+ "loss": 0.4383,
+ "step": 7085
+ },
+ {
+ "epoch": 3.350354609929078,
+ "grad_norm": 2.6693310737609863,
+ "learning_rate": 2.071068357186869e-06,
+ "loss": 0.363,
+ "step": 7086
+ },
+ {
+ "epoch": 3.3508274231678485,
+ "grad_norm": 2.738314628601074,
+ "learning_rate": 2.0704537940082673e-06,
+ "loss": 0.3493,
+ "step": 7087
+ },
+ {
+ "epoch": 3.3513002364066193,
+ "grad_norm": 3.2205989360809326,
+ "learning_rate": 2.069839257575624e-06,
+ "loss": 0.3802,
+ "step": 7088
+ },
+ {
+ "epoch": 3.35177304964539,
+ "grad_norm": 2.8969876766204834,
+ "learning_rate": 2.069224747927203e-06,
+ "loss": 0.3846,
+ "step": 7089
+ },
+ {
+ "epoch": 3.352245862884161,
+ "grad_norm": 2.833179473876953,
+ "learning_rate": 2.0686102651012694e-06,
+ "loss": 0.3892,
+ "step": 7090
+ },
+ {
+ "epoch": 3.3527186761229313,
+ "grad_norm": 3.303830623626709,
+ "learning_rate": 2.067995809136082e-06,
+ "loss": 0.4009,
+ "step": 7091
+ },
+ {
+ "epoch": 3.353191489361702,
+ "grad_norm": 3.3684141635894775,
+ "learning_rate": 2.0673813800699024e-06,
+ "loss": 0.4434,
+ "step": 7092
+ },
+ {
+ "epoch": 3.353664302600473,
+ "grad_norm": 2.6549112796783447,
+ "learning_rate": 2.066766977940987e-06,
+ "loss": 0.3941,
+ "step": 7093
+ },
+ {
+ "epoch": 3.3541371158392437,
+ "grad_norm": 2.852935314178467,
+ "learning_rate": 2.066152602787591e-06,
+ "loss": 0.4143,
+ "step": 7094
+ },
+ {
+ "epoch": 3.354609929078014,
+ "grad_norm": 2.9621706008911133,
+ "learning_rate": 2.0655382546479713e-06,
+ "loss": 0.4502,
+ "step": 7095
+ },
+ {
+ "epoch": 3.355082742316785,
+ "grad_norm": 3.2836413383483887,
+ "learning_rate": 2.064923933560378e-06,
+ "loss": 0.3993,
+ "step": 7096
+ },
+ {
+ "epoch": 3.3555555555555556,
+ "grad_norm": 2.8187968730926514,
+ "learning_rate": 2.0643096395630654e-06,
+ "loss": 0.3766,
+ "step": 7097
+ },
+ {
+ "epoch": 3.3560283687943264,
+ "grad_norm": 2.7965118885040283,
+ "learning_rate": 2.0636953726942803e-06,
+ "loss": 0.4258,
+ "step": 7098
+ },
+ {
+ "epoch": 3.3565011820330968,
+ "grad_norm": 3.002030611038208,
+ "learning_rate": 2.063081132992271e-06,
+ "loss": 0.3548,
+ "step": 7099
+ },
+ {
+ "epoch": 3.3569739952718676,
+ "grad_norm": 2.927603006362915,
+ "learning_rate": 2.0624669204952847e-06,
+ "loss": 0.3759,
+ "step": 7100
+ },
+ {
+ "epoch": 3.3574468085106384,
+ "grad_norm": 2.911393165588379,
+ "learning_rate": 2.061852735241563e-06,
+ "loss": 0.3599,
+ "step": 7101
+ },
+ {
+ "epoch": 3.357919621749409,
+ "grad_norm": 3.0596864223480225,
+ "learning_rate": 2.0612385772693517e-06,
+ "loss": 0.3557,
+ "step": 7102
+ },
+ {
+ "epoch": 3.3583924349881795,
+ "grad_norm": 2.7869808673858643,
+ "learning_rate": 2.0606244466168905e-06,
+ "loss": 0.3696,
+ "step": 7103
+ },
+ {
+ "epoch": 3.3588652482269503,
+ "grad_norm": 2.927715539932251,
+ "learning_rate": 2.060010343322417e-06,
+ "loss": 0.3309,
+ "step": 7104
+ },
+ {
+ "epoch": 3.359338061465721,
+ "grad_norm": 3.44653058052063,
+ "learning_rate": 2.059396267424171e-06,
+ "loss": 0.4453,
+ "step": 7105
+ },
+ {
+ "epoch": 3.359810874704492,
+ "grad_norm": 3.047652244567871,
+ "learning_rate": 2.0587822189603873e-06,
+ "loss": 0.3615,
+ "step": 7106
+ },
+ {
+ "epoch": 3.3602836879432623,
+ "grad_norm": 2.6640517711639404,
+ "learning_rate": 2.0581681979693002e-06,
+ "loss": 0.3716,
+ "step": 7107
+ },
+ {
+ "epoch": 3.360756501182033,
+ "grad_norm": 2.8253493309020996,
+ "learning_rate": 2.0575542044891424e-06,
+ "loss": 0.3485,
+ "step": 7108
+ },
+ {
+ "epoch": 3.361229314420804,
+ "grad_norm": 3.0512938499450684,
+ "learning_rate": 2.0569402385581433e-06,
+ "loss": 0.4582,
+ "step": 7109
+ },
+ {
+ "epoch": 3.3617021276595747,
+ "grad_norm": 2.935060739517212,
+ "learning_rate": 2.0563263002145333e-06,
+ "loss": 0.425,
+ "step": 7110
+ },
+ {
+ "epoch": 3.362174940898345,
+ "grad_norm": 3.2708780765533447,
+ "learning_rate": 2.0557123894965396e-06,
+ "loss": 0.4193,
+ "step": 7111
+ },
+ {
+ "epoch": 3.362647754137116,
+ "grad_norm": 2.758329391479492,
+ "learning_rate": 2.055098506442386e-06,
+ "loss": 0.3754,
+ "step": 7112
+ },
+ {
+ "epoch": 3.3631205673758866,
+ "grad_norm": 3.0359015464782715,
+ "learning_rate": 2.0544846510902987e-06,
+ "loss": 0.4207,
+ "step": 7113
+ },
+ {
+ "epoch": 3.3635933806146574,
+ "grad_norm": 3.096968412399292,
+ "learning_rate": 2.0538708234784983e-06,
+ "loss": 0.4303,
+ "step": 7114
+ },
+ {
+ "epoch": 3.3640661938534278,
+ "grad_norm": 3.0777673721313477,
+ "learning_rate": 2.053257023645206e-06,
+ "loss": 0.3904,
+ "step": 7115
+ },
+ {
+ "epoch": 3.3645390070921986,
+ "grad_norm": 2.9483232498168945,
+ "learning_rate": 2.0526432516286394e-06,
+ "loss": 0.3949,
+ "step": 7116
+ },
+ {
+ "epoch": 3.3650118203309693,
+ "grad_norm": 2.839067220687866,
+ "learning_rate": 2.0520295074670154e-06,
+ "loss": 0.3705,
+ "step": 7117
+ },
+ {
+ "epoch": 3.36548463356974,
+ "grad_norm": 3.0450778007507324,
+ "learning_rate": 2.0514157911985506e-06,
+ "loss": 0.3987,
+ "step": 7118
+ },
+ {
+ "epoch": 3.3659574468085105,
+ "grad_norm": 3.425318717956543,
+ "learning_rate": 2.0508021028614564e-06,
+ "loss": 0.3941,
+ "step": 7119
+ },
+ {
+ "epoch": 3.3664302600472813,
+ "grad_norm": 2.9509286880493164,
+ "learning_rate": 2.0501884424939465e-06,
+ "loss": 0.354,
+ "step": 7120
+ },
+ {
+ "epoch": 3.366903073286052,
+ "grad_norm": 2.799504518508911,
+ "learning_rate": 2.0495748101342303e-06,
+ "loss": 0.3891,
+ "step": 7121
+ },
+ {
+ "epoch": 3.3673758865248224,
+ "grad_norm": 2.9140994548797607,
+ "learning_rate": 2.048961205820515e-06,
+ "loss": 0.3638,
+ "step": 7122
+ },
+ {
+ "epoch": 3.3678486997635932,
+ "grad_norm": 2.8074216842651367,
+ "learning_rate": 2.0483476295910077e-06,
+ "loss": 0.3501,
+ "step": 7123
+ },
+ {
+ "epoch": 3.368321513002364,
+ "grad_norm": 2.770829677581787,
+ "learning_rate": 2.0477340814839126e-06,
+ "loss": 0.3774,
+ "step": 7124
+ },
+ {
+ "epoch": 3.368794326241135,
+ "grad_norm": 2.581655502319336,
+ "learning_rate": 2.047120561537434e-06,
+ "loss": 0.3523,
+ "step": 7125
+ },
+ {
+ "epoch": 3.3692671394799056,
+ "grad_norm": 3.4234209060668945,
+ "learning_rate": 2.046507069789772e-06,
+ "loss": 0.4191,
+ "step": 7126
+ },
+ {
+ "epoch": 3.369739952718676,
+ "grad_norm": 2.669860601425171,
+ "learning_rate": 2.045893606279126e-06,
+ "loss": 0.3542,
+ "step": 7127
+ },
+ {
+ "epoch": 3.370212765957447,
+ "grad_norm": 3.2426629066467285,
+ "learning_rate": 2.045280171043694e-06,
+ "loss": 0.4416,
+ "step": 7128
+ },
+ {
+ "epoch": 3.3706855791962176,
+ "grad_norm": 3.1318910121917725,
+ "learning_rate": 2.044666764121672e-06,
+ "loss": 0.3999,
+ "step": 7129
+ },
+ {
+ "epoch": 3.371158392434988,
+ "grad_norm": 2.7044012546539307,
+ "learning_rate": 2.044053385551254e-06,
+ "loss": 0.3907,
+ "step": 7130
+ },
+ {
+ "epoch": 3.3716312056737587,
+ "grad_norm": 2.9429895877838135,
+ "learning_rate": 2.0434400353706322e-06,
+ "loss": 0.3827,
+ "step": 7131
+ },
+ {
+ "epoch": 3.3721040189125295,
+ "grad_norm": 2.7258787155151367,
+ "learning_rate": 2.0428267136179973e-06,
+ "loss": 0.3688,
+ "step": 7132
+ },
+ {
+ "epoch": 3.3725768321513003,
+ "grad_norm": 2.765108108520508,
+ "learning_rate": 2.042213420331539e-06,
+ "loss": 0.4078,
+ "step": 7133
+ },
+ {
+ "epoch": 3.373049645390071,
+ "grad_norm": 3.2951347827911377,
+ "learning_rate": 2.0416001555494435e-06,
+ "loss": 0.4259,
+ "step": 7134
+ },
+ {
+ "epoch": 3.3735224586288415,
+ "grad_norm": 3.3917062282562256,
+ "learning_rate": 2.040986919309895e-06,
+ "loss": 0.5094,
+ "step": 7135
+ },
+ {
+ "epoch": 3.3739952718676123,
+ "grad_norm": 2.746434450149536,
+ "learning_rate": 2.04037371165108e-06,
+ "loss": 0.3513,
+ "step": 7136
+ },
+ {
+ "epoch": 3.374468085106383,
+ "grad_norm": 3.268731117248535,
+ "learning_rate": 2.0397605326111774e-06,
+ "loss": 0.3909,
+ "step": 7137
+ },
+ {
+ "epoch": 3.3749408983451534,
+ "grad_norm": 2.8498165607452393,
+ "learning_rate": 2.0391473822283692e-06,
+ "loss": 0.3657,
+ "step": 7138
+ },
+ {
+ "epoch": 3.3754137115839242,
+ "grad_norm": 2.855966567993164,
+ "learning_rate": 2.0385342605408325e-06,
+ "loss": 0.3927,
+ "step": 7139
+ },
+ {
+ "epoch": 3.375886524822695,
+ "grad_norm": 3.1839048862457275,
+ "learning_rate": 2.0379211675867438e-06,
+ "loss": 0.4476,
+ "step": 7140
+ },
+ {
+ "epoch": 3.376359338061466,
+ "grad_norm": 2.9379947185516357,
+ "learning_rate": 2.037308103404278e-06,
+ "loss": 0.3657,
+ "step": 7141
+ },
+ {
+ "epoch": 3.3768321513002366,
+ "grad_norm": 2.9251210689544678,
+ "learning_rate": 2.0366950680316073e-06,
+ "loss": 0.3975,
+ "step": 7142
+ },
+ {
+ "epoch": 3.377304964539007,
+ "grad_norm": 2.811885118484497,
+ "learning_rate": 2.036082061506904e-06,
+ "loss": 0.3064,
+ "step": 7143
+ },
+ {
+ "epoch": 3.3777777777777778,
+ "grad_norm": 2.755229949951172,
+ "learning_rate": 2.0354690838683363e-06,
+ "loss": 0.3328,
+ "step": 7144
+ },
+ {
+ "epoch": 3.3782505910165486,
+ "grad_norm": 3.006819725036621,
+ "learning_rate": 2.0348561351540706e-06,
+ "loss": 0.4168,
+ "step": 7145
+ },
+ {
+ "epoch": 3.378723404255319,
+ "grad_norm": 2.8788509368896484,
+ "learning_rate": 2.034243215402275e-06,
+ "loss": 0.4123,
+ "step": 7146
+ },
+ {
+ "epoch": 3.3791962174940897,
+ "grad_norm": 2.9732980728149414,
+ "learning_rate": 2.033630324651112e-06,
+ "loss": 0.3371,
+ "step": 7147
+ },
+ {
+ "epoch": 3.3796690307328605,
+ "grad_norm": 2.7731754779815674,
+ "learning_rate": 2.033017462938744e-06,
+ "loss": 0.382,
+ "step": 7148
+ },
+ {
+ "epoch": 3.3801418439716313,
+ "grad_norm": 2.766395092010498,
+ "learning_rate": 2.032404630303331e-06,
+ "loss": 0.3295,
+ "step": 7149
+ },
+ {
+ "epoch": 3.380614657210402,
+ "grad_norm": 3.197960138320923,
+ "learning_rate": 2.03179182678303e-06,
+ "loss": 0.354,
+ "step": 7150
+ },
+ {
+ "epoch": 3.3810874704491725,
+ "grad_norm": 3.048553228378296,
+ "learning_rate": 2.031179052416e-06,
+ "loss": 0.4027,
+ "step": 7151
+ },
+ {
+ "epoch": 3.3815602836879433,
+ "grad_norm": 3.1527998447418213,
+ "learning_rate": 2.0305663072403934e-06,
+ "loss": 0.4229,
+ "step": 7152
+ },
+ {
+ "epoch": 3.382033096926714,
+ "grad_norm": 3.0407028198242188,
+ "learning_rate": 2.029953591294366e-06,
+ "loss": 0.4254,
+ "step": 7153
+ },
+ {
+ "epoch": 3.3825059101654844,
+ "grad_norm": 2.7170357704162598,
+ "learning_rate": 2.0293409046160673e-06,
+ "loss": 0.3307,
+ "step": 7154
+ },
+ {
+ "epoch": 3.382978723404255,
+ "grad_norm": 3.0128726959228516,
+ "learning_rate": 2.028728247243646e-06,
+ "loss": 0.3873,
+ "step": 7155
+ },
+ {
+ "epoch": 3.383451536643026,
+ "grad_norm": 4.861877918243408,
+ "learning_rate": 2.0281156192152507e-06,
+ "loss": 0.4371,
+ "step": 7156
+ },
+ {
+ "epoch": 3.383924349881797,
+ "grad_norm": 2.890249252319336,
+ "learning_rate": 2.0275030205690257e-06,
+ "loss": 0.3899,
+ "step": 7157
+ },
+ {
+ "epoch": 3.3843971631205676,
+ "grad_norm": 3.0774779319763184,
+ "learning_rate": 2.026890451343117e-06,
+ "loss": 0.4151,
+ "step": 7158
+ },
+ {
+ "epoch": 3.384869976359338,
+ "grad_norm": 2.8705947399139404,
+ "learning_rate": 2.026277911575665e-06,
+ "loss": 0.4004,
+ "step": 7159
+ },
+ {
+ "epoch": 3.3853427895981087,
+ "grad_norm": 3.170760154724121,
+ "learning_rate": 2.0256654013048096e-06,
+ "loss": 0.4442,
+ "step": 7160
+ },
+ {
+ "epoch": 3.3858156028368795,
+ "grad_norm": 4.211156368255615,
+ "learning_rate": 2.0250529205686905e-06,
+ "loss": 0.4605,
+ "step": 7161
+ },
+ {
+ "epoch": 3.38628841607565,
+ "grad_norm": 2.513519287109375,
+ "learning_rate": 2.0244404694054435e-06,
+ "loss": 0.3506,
+ "step": 7162
+ },
+ {
+ "epoch": 3.3867612293144207,
+ "grad_norm": 3.1558821201324463,
+ "learning_rate": 2.023828047853203e-06,
+ "loss": 0.43,
+ "step": 7163
+ },
+ {
+ "epoch": 3.3872340425531915,
+ "grad_norm": 3.6770291328430176,
+ "learning_rate": 2.023215655950102e-06,
+ "loss": 0.3911,
+ "step": 7164
+ },
+ {
+ "epoch": 3.3877068557919623,
+ "grad_norm": 2.6544485092163086,
+ "learning_rate": 2.022603293734271e-06,
+ "loss": 0.3306,
+ "step": 7165
+ },
+ {
+ "epoch": 3.388179669030733,
+ "grad_norm": 3.34232759475708,
+ "learning_rate": 2.0219909612438405e-06,
+ "loss": 0.4233,
+ "step": 7166
+ },
+ {
+ "epoch": 3.3886524822695034,
+ "grad_norm": 3.388561725616455,
+ "learning_rate": 2.0213786585169363e-06,
+ "loss": 0.4171,
+ "step": 7167
+ },
+ {
+ "epoch": 3.3891252955082742,
+ "grad_norm": 2.8606953620910645,
+ "learning_rate": 2.020766385591684e-06,
+ "loss": 0.3864,
+ "step": 7168
+ },
+ {
+ "epoch": 3.389598108747045,
+ "grad_norm": 3.0135979652404785,
+ "learning_rate": 2.020154142506208e-06,
+ "loss": 0.3933,
+ "step": 7169
+ },
+ {
+ "epoch": 3.3900709219858154,
+ "grad_norm": 2.5003163814544678,
+ "learning_rate": 2.0195419292986294e-06,
+ "loss": 0.3852,
+ "step": 7170
+ },
+ {
+ "epoch": 3.390543735224586,
+ "grad_norm": 2.8591368198394775,
+ "learning_rate": 2.0189297460070685e-06,
+ "loss": 0.3962,
+ "step": 7171
+ },
+ {
+ "epoch": 3.391016548463357,
+ "grad_norm": 2.8830223083496094,
+ "learning_rate": 2.0183175926696427e-06,
+ "loss": 0.3632,
+ "step": 7172
+ },
+ {
+ "epoch": 3.391489361702128,
+ "grad_norm": 3.3904542922973633,
+ "learning_rate": 2.0177054693244674e-06,
+ "loss": 0.4284,
+ "step": 7173
+ },
+ {
+ "epoch": 3.3919621749408986,
+ "grad_norm": 3.0325920581817627,
+ "learning_rate": 2.0170933760096585e-06,
+ "loss": 0.4331,
+ "step": 7174
+ },
+ {
+ "epoch": 3.392434988179669,
+ "grad_norm": 2.60345196723938,
+ "learning_rate": 2.016481312763327e-06,
+ "loss": 0.4077,
+ "step": 7175
+ },
+ {
+ "epoch": 3.3929078014184397,
+ "grad_norm": 2.8146891593933105,
+ "learning_rate": 2.0158692796235845e-06,
+ "loss": 0.4224,
+ "step": 7176
+ },
+ {
+ "epoch": 3.3933806146572105,
+ "grad_norm": 2.8158490657806396,
+ "learning_rate": 2.0152572766285396e-06,
+ "loss": 0.3454,
+ "step": 7177
+ },
+ {
+ "epoch": 3.393853427895981,
+ "grad_norm": 3.2753400802612305,
+ "learning_rate": 2.0146453038162978e-06,
+ "loss": 0.3615,
+ "step": 7178
+ },
+ {
+ "epoch": 3.3943262411347517,
+ "grad_norm": 3.0527124404907227,
+ "learning_rate": 2.0140333612249655e-06,
+ "loss": 0.415,
+ "step": 7179
+ },
+ {
+ "epoch": 3.3947990543735225,
+ "grad_norm": 2.6813764572143555,
+ "learning_rate": 2.0134214488926435e-06,
+ "loss": 0.3391,
+ "step": 7180
+ },
+ {
+ "epoch": 3.3952718676122933,
+ "grad_norm": 2.809319496154785,
+ "learning_rate": 2.0128095668574356e-06,
+ "loss": 0.3123,
+ "step": 7181
+ },
+ {
+ "epoch": 3.395744680851064,
+ "grad_norm": 2.6619064807891846,
+ "learning_rate": 2.0121977151574396e-06,
+ "loss": 0.4222,
+ "step": 7182
+ },
+ {
+ "epoch": 3.3962174940898344,
+ "grad_norm": 2.9201200008392334,
+ "learning_rate": 2.0115858938307516e-06,
+ "loss": 0.3712,
+ "step": 7183
+ },
+ {
+ "epoch": 3.396690307328605,
+ "grad_norm": 3.2058637142181396,
+ "learning_rate": 2.0109741029154696e-06,
+ "loss": 0.4004,
+ "step": 7184
+ },
+ {
+ "epoch": 3.397163120567376,
+ "grad_norm": 2.821855306625366,
+ "learning_rate": 2.0103623424496862e-06,
+ "loss": 0.4053,
+ "step": 7185
+ },
+ {
+ "epoch": 3.3976359338061464,
+ "grad_norm": 3.0371549129486084,
+ "learning_rate": 2.009750612471492e-06,
+ "loss": 0.4246,
+ "step": 7186
+ },
+ {
+ "epoch": 3.398108747044917,
+ "grad_norm": 2.8827290534973145,
+ "learning_rate": 2.009138913018978e-06,
+ "loss": 0.3256,
+ "step": 7187
+ },
+ {
+ "epoch": 3.398581560283688,
+ "grad_norm": 3.168039560317993,
+ "learning_rate": 2.0085272441302305e-06,
+ "loss": 0.4233,
+ "step": 7188
+ },
+ {
+ "epoch": 3.3990543735224588,
+ "grad_norm": 3.259723663330078,
+ "learning_rate": 2.0079156058433374e-06,
+ "loss": 0.4168,
+ "step": 7189
+ },
+ {
+ "epoch": 3.3995271867612296,
+ "grad_norm": 2.456231117248535,
+ "learning_rate": 2.007303998196382e-06,
+ "loss": 0.3383,
+ "step": 7190
+ },
+ {
+ "epoch": 3.4,
+ "grad_norm": 2.735180377960205,
+ "learning_rate": 2.006692421227445e-06,
+ "loss": 0.3475,
+ "step": 7191
+ },
+ {
+ "epoch": 3.4004728132387707,
+ "grad_norm": 2.76263427734375,
+ "learning_rate": 2.006080874974609e-06,
+ "loss": 0.3651,
+ "step": 7192
+ },
+ {
+ "epoch": 3.4009456264775415,
+ "grad_norm": 3.36867094039917,
+ "learning_rate": 2.0054693594759504e-06,
+ "loss": 0.4479,
+ "step": 7193
+ },
+ {
+ "epoch": 3.401418439716312,
+ "grad_norm": 2.532167673110962,
+ "learning_rate": 2.004857874769547e-06,
+ "loss": 0.3818,
+ "step": 7194
+ },
+ {
+ "epoch": 3.4018912529550827,
+ "grad_norm": 2.8723537921905518,
+ "learning_rate": 2.0042464208934724e-06,
+ "loss": 0.3332,
+ "step": 7195
+ },
+ {
+ "epoch": 3.4023640661938535,
+ "grad_norm": 2.676460027694702,
+ "learning_rate": 2.0036349978857987e-06,
+ "loss": 0.3488,
+ "step": 7196
+ },
+ {
+ "epoch": 3.4028368794326243,
+ "grad_norm": 2.805851459503174,
+ "learning_rate": 2.0030236057845983e-06,
+ "loss": 0.3796,
+ "step": 7197
+ },
+ {
+ "epoch": 3.403309692671395,
+ "grad_norm": 2.688988447189331,
+ "learning_rate": 2.0024122446279377e-06,
+ "loss": 0.3707,
+ "step": 7198
+ },
+ {
+ "epoch": 3.4037825059101654,
+ "grad_norm": 3.118720293045044,
+ "learning_rate": 2.0018009144538853e-06,
+ "loss": 0.4064,
+ "step": 7199
+ },
+ {
+ "epoch": 3.404255319148936,
+ "grad_norm": 2.876507520675659,
+ "learning_rate": 2.001189615300506e-06,
+ "loss": 0.3543,
+ "step": 7200
+ },
+ {
+ "epoch": 3.404728132387707,
+ "grad_norm": 3.0043466091156006,
+ "learning_rate": 2.000578347205861e-06,
+ "loss": 0.3833,
+ "step": 7201
+ },
+ {
+ "epoch": 3.4052009456264773,
+ "grad_norm": 3.1057114601135254,
+ "learning_rate": 1.9999671102080133e-06,
+ "loss": 0.4154,
+ "step": 7202
+ },
+ {
+ "epoch": 3.405673758865248,
+ "grad_norm": 2.9791855812072754,
+ "learning_rate": 1.9993559043450202e-06,
+ "loss": 0.3865,
+ "step": 7203
+ },
+ {
+ "epoch": 3.406146572104019,
+ "grad_norm": 3.4403460025787354,
+ "learning_rate": 1.9987447296549407e-06,
+ "loss": 0.3883,
+ "step": 7204
+ },
+ {
+ "epoch": 3.4066193853427897,
+ "grad_norm": 2.9962027072906494,
+ "learning_rate": 1.998133586175829e-06,
+ "loss": 0.3796,
+ "step": 7205
+ },
+ {
+ "epoch": 3.40709219858156,
+ "grad_norm": 3.0613129138946533,
+ "learning_rate": 1.997522473945737e-06,
+ "loss": 0.3917,
+ "step": 7206
+ },
+ {
+ "epoch": 3.407565011820331,
+ "grad_norm": 3.065985679626465,
+ "learning_rate": 1.996911393002718e-06,
+ "loss": 0.3521,
+ "step": 7207
+ },
+ {
+ "epoch": 3.4080378250591017,
+ "grad_norm": 2.976177930831909,
+ "learning_rate": 1.996300343384821e-06,
+ "loss": 0.3852,
+ "step": 7208
+ },
+ {
+ "epoch": 3.4085106382978725,
+ "grad_norm": 3.3587961196899414,
+ "learning_rate": 1.995689325130092e-06,
+ "loss": 0.3947,
+ "step": 7209
+ },
+ {
+ "epoch": 3.408983451536643,
+ "grad_norm": 2.626983165740967,
+ "learning_rate": 1.995078338276578e-06,
+ "loss": 0.316,
+ "step": 7210
+ },
+ {
+ "epoch": 3.4094562647754136,
+ "grad_norm": 3.14713978767395,
+ "learning_rate": 1.9944673828623217e-06,
+ "loss": 0.4008,
+ "step": 7211
+ },
+ {
+ "epoch": 3.4099290780141844,
+ "grad_norm": 2.968918800354004,
+ "learning_rate": 1.993856458925365e-06,
+ "loss": 0.439,
+ "step": 7212
+ },
+ {
+ "epoch": 3.4104018912529552,
+ "grad_norm": 2.7724127769470215,
+ "learning_rate": 1.9932455665037476e-06,
+ "loss": 0.3941,
+ "step": 7213
+ },
+ {
+ "epoch": 3.4108747044917256,
+ "grad_norm": 2.963146448135376,
+ "learning_rate": 1.9926347056355057e-06,
+ "loss": 0.3893,
+ "step": 7214
+ },
+ {
+ "epoch": 3.4113475177304964,
+ "grad_norm": 2.791637420654297,
+ "learning_rate": 1.9920238763586765e-06,
+ "loss": 0.4068,
+ "step": 7215
+ },
+ {
+ "epoch": 3.411820330969267,
+ "grad_norm": 3.030275583267212,
+ "learning_rate": 1.9914130787112924e-06,
+ "loss": 0.3828,
+ "step": 7216
+ },
+ {
+ "epoch": 3.412293144208038,
+ "grad_norm": 3.113128900527954,
+ "learning_rate": 1.990802312731387e-06,
+ "loss": 0.3903,
+ "step": 7217
+ },
+ {
+ "epoch": 3.4127659574468083,
+ "grad_norm": 3.104170322418213,
+ "learning_rate": 1.9901915784569884e-06,
+ "loss": 0.4171,
+ "step": 7218
+ },
+ {
+ "epoch": 3.413238770685579,
+ "grad_norm": 3.1247572898864746,
+ "learning_rate": 1.989580875926125e-06,
+ "loss": 0.4022,
+ "step": 7219
+ },
+ {
+ "epoch": 3.41371158392435,
+ "grad_norm": 2.9487457275390625,
+ "learning_rate": 1.988970205176822e-06,
+ "loss": 0.3948,
+ "step": 7220
+ },
+ {
+ "epoch": 3.4141843971631207,
+ "grad_norm": 2.8763654232025146,
+ "learning_rate": 1.9883595662471028e-06,
+ "loss": 0.3588,
+ "step": 7221
+ },
+ {
+ "epoch": 3.414657210401891,
+ "grad_norm": 2.563152551651001,
+ "learning_rate": 1.987748959174991e-06,
+ "loss": 0.3509,
+ "step": 7222
+ },
+ {
+ "epoch": 3.415130023640662,
+ "grad_norm": 3.148759365081787,
+ "learning_rate": 1.9871383839985053e-06,
+ "loss": 0.4364,
+ "step": 7223
+ },
+ {
+ "epoch": 3.4156028368794327,
+ "grad_norm": 2.8187363147735596,
+ "learning_rate": 1.986527840755663e-06,
+ "loss": 0.3803,
+ "step": 7224
+ },
+ {
+ "epoch": 3.4160756501182035,
+ "grad_norm": 3.009376287460327,
+ "learning_rate": 1.985917329484481e-06,
+ "loss": 0.3841,
+ "step": 7225
+ },
+ {
+ "epoch": 3.416548463356974,
+ "grad_norm": 2.869291067123413,
+ "learning_rate": 1.985306850222972e-06,
+ "loss": 0.3877,
+ "step": 7226
+ },
+ {
+ "epoch": 3.4170212765957446,
+ "grad_norm": 3.108461856842041,
+ "learning_rate": 1.9846964030091497e-06,
+ "loss": 0.3767,
+ "step": 7227
+ },
+ {
+ "epoch": 3.4174940898345154,
+ "grad_norm": 3.096320629119873,
+ "learning_rate": 1.9840859878810226e-06,
+ "loss": 0.4603,
+ "step": 7228
+ },
+ {
+ "epoch": 3.417966903073286,
+ "grad_norm": 2.8519909381866455,
+ "learning_rate": 1.983475604876598e-06,
+ "loss": 0.3263,
+ "step": 7229
+ },
+ {
+ "epoch": 3.4184397163120566,
+ "grad_norm": 3.192051410675049,
+ "learning_rate": 1.9828652540338835e-06,
+ "loss": 0.4132,
+ "step": 7230
+ },
+ {
+ "epoch": 3.4189125295508274,
+ "grad_norm": 3.0398056507110596,
+ "learning_rate": 1.9822549353908817e-06,
+ "loss": 0.4038,
+ "step": 7231
+ },
+ {
+ "epoch": 3.419385342789598,
+ "grad_norm": 3.12247896194458,
+ "learning_rate": 1.9816446489855944e-06,
+ "loss": 0.409,
+ "step": 7232
+ },
+ {
+ "epoch": 3.419858156028369,
+ "grad_norm": 3.20316481590271,
+ "learning_rate": 1.9810343948560223e-06,
+ "loss": 0.4058,
+ "step": 7233
+ },
+ {
+ "epoch": 3.4203309692671393,
+ "grad_norm": 3.3397457599639893,
+ "learning_rate": 1.9804241730401625e-06,
+ "loss": 0.3657,
+ "step": 7234
+ },
+ {
+ "epoch": 3.42080378250591,
+ "grad_norm": 3.928691864013672,
+ "learning_rate": 1.979813983576012e-06,
+ "loss": 0.361,
+ "step": 7235
+ },
+ {
+ "epoch": 3.421276595744681,
+ "grad_norm": 3.5814051628112793,
+ "learning_rate": 1.9792038265015635e-06,
+ "loss": 0.3975,
+ "step": 7236
+ },
+ {
+ "epoch": 3.4217494089834517,
+ "grad_norm": 2.8578879833221436,
+ "learning_rate": 1.9785937018548086e-06,
+ "loss": 0.3915,
+ "step": 7237
+ },
+ {
+ "epoch": 3.422222222222222,
+ "grad_norm": 3.0343220233917236,
+ "learning_rate": 1.977983609673738e-06,
+ "loss": 0.3686,
+ "step": 7238
+ },
+ {
+ "epoch": 3.422695035460993,
+ "grad_norm": 3.2719056606292725,
+ "learning_rate": 1.977373549996338e-06,
+ "loss": 0.3905,
+ "step": 7239
+ },
+ {
+ "epoch": 3.4231678486997636,
+ "grad_norm": 2.6638169288635254,
+ "learning_rate": 1.976763522860597e-06,
+ "loss": 0.3631,
+ "step": 7240
+ },
+ {
+ "epoch": 3.4236406619385344,
+ "grad_norm": 2.7679927349090576,
+ "learning_rate": 1.9761535283044967e-06,
+ "loss": 0.377,
+ "step": 7241
+ },
+ {
+ "epoch": 3.424113475177305,
+ "grad_norm": 2.774540424346924,
+ "learning_rate": 1.975543566366019e-06,
+ "loss": 0.3509,
+ "step": 7242
+ },
+ {
+ "epoch": 3.4245862884160756,
+ "grad_norm": 2.811659336090088,
+ "learning_rate": 1.9749336370831438e-06,
+ "loss": 0.3835,
+ "step": 7243
+ },
+ {
+ "epoch": 3.4250591016548464,
+ "grad_norm": 2.8533360958099365,
+ "learning_rate": 1.9743237404938478e-06,
+ "loss": 0.3765,
+ "step": 7244
+ },
+ {
+ "epoch": 3.425531914893617,
+ "grad_norm": 2.712301015853882,
+ "learning_rate": 1.9737138766361084e-06,
+ "loss": 0.3797,
+ "step": 7245
+ },
+ {
+ "epoch": 3.4260047281323875,
+ "grad_norm": 2.9763426780700684,
+ "learning_rate": 1.9731040455478986e-06,
+ "loss": 0.4223,
+ "step": 7246
+ },
+ {
+ "epoch": 3.4264775413711583,
+ "grad_norm": 2.8802297115325928,
+ "learning_rate": 1.9724942472671882e-06,
+ "loss": 0.3666,
+ "step": 7247
+ },
+ {
+ "epoch": 3.426950354609929,
+ "grad_norm": 2.934107542037964,
+ "learning_rate": 1.9718844818319486e-06,
+ "loss": 0.3612,
+ "step": 7248
+ },
+ {
+ "epoch": 3.4274231678487,
+ "grad_norm": 3.0172696113586426,
+ "learning_rate": 1.9712747492801467e-06,
+ "loss": 0.3643,
+ "step": 7249
+ },
+ {
+ "epoch": 3.4278959810874703,
+ "grad_norm": 3.368419647216797,
+ "learning_rate": 1.970665049649748e-06,
+ "loss": 0.4511,
+ "step": 7250
+ },
+ {
+ "epoch": 3.428368794326241,
+ "grad_norm": 3.077819585800171,
+ "learning_rate": 1.9700553829787162e-06,
+ "loss": 0.4013,
+ "step": 7251
+ },
+ {
+ "epoch": 3.428841607565012,
+ "grad_norm": 2.690673828125,
+ "learning_rate": 1.96944574930501e-06,
+ "loss": 0.3776,
+ "step": 7252
+ },
+ {
+ "epoch": 3.4293144208037827,
+ "grad_norm": 3.1122169494628906,
+ "learning_rate": 1.9688361486665924e-06,
+ "loss": 0.3802,
+ "step": 7253
+ },
+ {
+ "epoch": 3.429787234042553,
+ "grad_norm": 2.9874207973480225,
+ "learning_rate": 1.968226581101417e-06,
+ "loss": 0.4492,
+ "step": 7254
+ },
+ {
+ "epoch": 3.430260047281324,
+ "grad_norm": 2.885493278503418,
+ "learning_rate": 1.967617046647442e-06,
+ "loss": 0.3958,
+ "step": 7255
+ },
+ {
+ "epoch": 3.4307328605200946,
+ "grad_norm": 2.953897476196289,
+ "learning_rate": 1.9670075453426195e-06,
+ "loss": 0.3973,
+ "step": 7256
+ },
+ {
+ "epoch": 3.4312056737588654,
+ "grad_norm": 2.685088634490967,
+ "learning_rate": 1.966398077224899e-06,
+ "loss": 0.393,
+ "step": 7257
+ },
+ {
+ "epoch": 3.431678486997636,
+ "grad_norm": 4.035208702087402,
+ "learning_rate": 1.9657886423322313e-06,
+ "loss": 0.4263,
+ "step": 7258
+ },
+ {
+ "epoch": 3.4321513002364066,
+ "grad_norm": 2.942042827606201,
+ "learning_rate": 1.965179240702562e-06,
+ "loss": 0.4319,
+ "step": 7259
+ },
+ {
+ "epoch": 3.4326241134751774,
+ "grad_norm": 3.0794999599456787,
+ "learning_rate": 1.9645698723738356e-06,
+ "loss": 0.4199,
+ "step": 7260
+ },
+ {
+ "epoch": 3.433096926713948,
+ "grad_norm": 3.0653584003448486,
+ "learning_rate": 1.963960537383996e-06,
+ "loss": 0.3723,
+ "step": 7261
+ },
+ {
+ "epoch": 3.4335697399527185,
+ "grad_norm": 3.1571545600891113,
+ "learning_rate": 1.963351235770983e-06,
+ "loss": 0.4211,
+ "step": 7262
+ },
+ {
+ "epoch": 3.4340425531914893,
+ "grad_norm": 2.6681735515594482,
+ "learning_rate": 1.962741967572736e-06,
+ "loss": 0.3333,
+ "step": 7263
+ },
+ {
+ "epoch": 3.43451536643026,
+ "grad_norm": 2.9747934341430664,
+ "learning_rate": 1.9621327328271907e-06,
+ "loss": 0.3896,
+ "step": 7264
+ },
+ {
+ "epoch": 3.434988179669031,
+ "grad_norm": 2.7994508743286133,
+ "learning_rate": 1.9615235315722814e-06,
+ "loss": 0.3642,
+ "step": 7265
+ },
+ {
+ "epoch": 3.4354609929078013,
+ "grad_norm": 2.933928966522217,
+ "learning_rate": 1.9609143638459405e-06,
+ "loss": 0.3955,
+ "step": 7266
+ },
+ {
+ "epoch": 3.435933806146572,
+ "grad_norm": 2.9577367305755615,
+ "learning_rate": 1.9603052296860983e-06,
+ "loss": 0.3437,
+ "step": 7267
+ },
+ {
+ "epoch": 3.436406619385343,
+ "grad_norm": 3.017282009124756,
+ "learning_rate": 1.959696129130684e-06,
+ "loss": 0.3784,
+ "step": 7268
+ },
+ {
+ "epoch": 3.4368794326241137,
+ "grad_norm": 3.2072815895080566,
+ "learning_rate": 1.959087062217622e-06,
+ "loss": 0.3901,
+ "step": 7269
+ },
+ {
+ "epoch": 3.437352245862884,
+ "grad_norm": 2.91153621673584,
+ "learning_rate": 1.9584780289848358e-06,
+ "loss": 0.4402,
+ "step": 7270
+ },
+ {
+ "epoch": 3.437825059101655,
+ "grad_norm": 2.846842050552368,
+ "learning_rate": 1.9578690294702495e-06,
+ "loss": 0.3804,
+ "step": 7271
+ },
+ {
+ "epoch": 3.4382978723404256,
+ "grad_norm": 3.0958521366119385,
+ "learning_rate": 1.957260063711781e-06,
+ "loss": 0.4103,
+ "step": 7272
+ },
+ {
+ "epoch": 3.4387706855791964,
+ "grad_norm": 2.9808530807495117,
+ "learning_rate": 1.9566511317473483e-06,
+ "loss": 0.4127,
+ "step": 7273
+ },
+ {
+ "epoch": 3.4392434988179668,
+ "grad_norm": 2.725851058959961,
+ "learning_rate": 1.9560422336148678e-06,
+ "loss": 0.3493,
+ "step": 7274
+ },
+ {
+ "epoch": 3.4397163120567376,
+ "grad_norm": 2.7861814498901367,
+ "learning_rate": 1.9554333693522515e-06,
+ "loss": 0.3703,
+ "step": 7275
+ },
+ {
+ "epoch": 3.4401891252955084,
+ "grad_norm": 3.128708839416504,
+ "learning_rate": 1.954824538997412e-06,
+ "loss": 0.3917,
+ "step": 7276
+ },
+ {
+ "epoch": 3.440661938534279,
+ "grad_norm": 3.117403268814087,
+ "learning_rate": 1.954215742588257e-06,
+ "loss": 0.3581,
+ "step": 7277
+ },
+ {
+ "epoch": 3.4411347517730495,
+ "grad_norm": 2.710076093673706,
+ "learning_rate": 1.9536069801626957e-06,
+ "loss": 0.3255,
+ "step": 7278
+ },
+ {
+ "epoch": 3.4416075650118203,
+ "grad_norm": 2.7732627391815186,
+ "learning_rate": 1.952998251758632e-06,
+ "loss": 0.375,
+ "step": 7279
+ },
+ {
+ "epoch": 3.442080378250591,
+ "grad_norm": 2.896050453186035,
+ "learning_rate": 1.9523895574139673e-06,
+ "loss": 0.4087,
+ "step": 7280
+ },
+ {
+ "epoch": 3.4425531914893615,
+ "grad_norm": 2.9051663875579834,
+ "learning_rate": 1.9517808971666048e-06,
+ "loss": 0.3423,
+ "step": 7281
+ },
+ {
+ "epoch": 3.4430260047281322,
+ "grad_norm": 3.0232038497924805,
+ "learning_rate": 1.9511722710544417e-06,
+ "loss": 0.364,
+ "step": 7282
+ },
+ {
+ "epoch": 3.443498817966903,
+ "grad_norm": 2.753870725631714,
+ "learning_rate": 1.9505636791153744e-06,
+ "loss": 0.3484,
+ "step": 7283
+ },
+ {
+ "epoch": 3.443971631205674,
+ "grad_norm": 2.944079637527466,
+ "learning_rate": 1.9499551213872983e-06,
+ "loss": 0.3354,
+ "step": 7284
+ },
+ {
+ "epoch": 3.4444444444444446,
+ "grad_norm": 3.1531970500946045,
+ "learning_rate": 1.949346597908104e-06,
+ "loss": 0.3394,
+ "step": 7285
+ },
+ {
+ "epoch": 3.444917257683215,
+ "grad_norm": 3.0357189178466797,
+ "learning_rate": 1.948738108715683e-06,
+ "loss": 0.4302,
+ "step": 7286
+ },
+ {
+ "epoch": 3.445390070921986,
+ "grad_norm": 3.3698086738586426,
+ "learning_rate": 1.948129653847923e-06,
+ "loss": 0.419,
+ "step": 7287
+ },
+ {
+ "epoch": 3.4458628841607566,
+ "grad_norm": 3.343132495880127,
+ "learning_rate": 1.947521233342709e-06,
+ "loss": 0.3895,
+ "step": 7288
+ },
+ {
+ "epoch": 3.446335697399527,
+ "grad_norm": 3.1905252933502197,
+ "learning_rate": 1.9469128472379257e-06,
+ "loss": 0.429,
+ "step": 7289
+ },
+ {
+ "epoch": 3.4468085106382977,
+ "grad_norm": 2.8517212867736816,
+ "learning_rate": 1.946304495571454e-06,
+ "loss": 0.3513,
+ "step": 7290
+ },
+ {
+ "epoch": 3.4472813238770685,
+ "grad_norm": 2.7713496685028076,
+ "learning_rate": 1.9456961783811735e-06,
+ "loss": 0.4331,
+ "step": 7291
+ },
+ {
+ "epoch": 3.4477541371158393,
+ "grad_norm": 2.8258652687072754,
+ "learning_rate": 1.945087895704962e-06,
+ "loss": 0.3539,
+ "step": 7292
+ },
+ {
+ "epoch": 3.44822695035461,
+ "grad_norm": 2.757322072982788,
+ "learning_rate": 1.9444796475806925e-06,
+ "loss": 0.3865,
+ "step": 7293
+ },
+ {
+ "epoch": 3.4486997635933805,
+ "grad_norm": 2.8410696983337402,
+ "learning_rate": 1.943871434046241e-06,
+ "loss": 0.3612,
+ "step": 7294
+ },
+ {
+ "epoch": 3.4491725768321513,
+ "grad_norm": 3.2297637462615967,
+ "learning_rate": 1.9432632551394753e-06,
+ "loss": 0.3956,
+ "step": 7295
+ },
+ {
+ "epoch": 3.449645390070922,
+ "grad_norm": 2.991351842880249,
+ "learning_rate": 1.9426551108982666e-06,
+ "loss": 0.3864,
+ "step": 7296
+ },
+ {
+ "epoch": 3.4501182033096924,
+ "grad_norm": 2.7942168712615967,
+ "learning_rate": 1.94204700136048e-06,
+ "loss": 0.4314,
+ "step": 7297
+ },
+ {
+ "epoch": 3.4505910165484632,
+ "grad_norm": 2.8188698291778564,
+ "learning_rate": 1.9414389265639805e-06,
+ "loss": 0.3585,
+ "step": 7298
+ },
+ {
+ "epoch": 3.451063829787234,
+ "grad_norm": 3.2826895713806152,
+ "learning_rate": 1.9408308865466295e-06,
+ "loss": 0.4614,
+ "step": 7299
+ },
+ {
+ "epoch": 3.451536643026005,
+ "grad_norm": 3.273867130279541,
+ "learning_rate": 1.9402228813462865e-06,
+ "loss": 0.3533,
+ "step": 7300
+ },
+ {
+ "epoch": 3.4520094562647756,
+ "grad_norm": 3.5334157943725586,
+ "learning_rate": 1.939614911000811e-06,
+ "loss": 0.4088,
+ "step": 7301
+ },
+ {
+ "epoch": 3.452482269503546,
+ "grad_norm": 2.983908176422119,
+ "learning_rate": 1.9390069755480583e-06,
+ "loss": 0.3725,
+ "step": 7302
+ },
+ {
+ "epoch": 3.4529550827423168,
+ "grad_norm": 2.893660306930542,
+ "learning_rate": 1.93839907502588e-06,
+ "loss": 0.3746,
+ "step": 7303
+ },
+ {
+ "epoch": 3.4534278959810876,
+ "grad_norm": 3.1762871742248535,
+ "learning_rate": 1.9377912094721295e-06,
+ "loss": 0.446,
+ "step": 7304
+ },
+ {
+ "epoch": 3.453900709219858,
+ "grad_norm": 3.3231537342071533,
+ "learning_rate": 1.9371833789246554e-06,
+ "loss": 0.4837,
+ "step": 7305
+ },
+ {
+ "epoch": 3.4543735224586287,
+ "grad_norm": 3.548333168029785,
+ "learning_rate": 1.936575583421304e-06,
+ "loss": 0.3911,
+ "step": 7306
+ },
+ {
+ "epoch": 3.4548463356973995,
+ "grad_norm": 3.0627071857452393,
+ "learning_rate": 1.9359678229999213e-06,
+ "loss": 0.3751,
+ "step": 7307
+ },
+ {
+ "epoch": 3.4553191489361703,
+ "grad_norm": 2.797663927078247,
+ "learning_rate": 1.9353600976983475e-06,
+ "loss": 0.41,
+ "step": 7308
+ },
+ {
+ "epoch": 3.455791962174941,
+ "grad_norm": 2.803269624710083,
+ "learning_rate": 1.9347524075544258e-06,
+ "loss": 0.3775,
+ "step": 7309
+ },
+ {
+ "epoch": 3.4562647754137115,
+ "grad_norm": 2.828010320663452,
+ "learning_rate": 1.934144752605993e-06,
+ "loss": 0.375,
+ "step": 7310
+ },
+ {
+ "epoch": 3.4567375886524823,
+ "grad_norm": 3.456477165222168,
+ "learning_rate": 1.933537132890884e-06,
+ "loss": 0.4764,
+ "step": 7311
+ },
+ {
+ "epoch": 3.457210401891253,
+ "grad_norm": 2.723670244216919,
+ "learning_rate": 1.9329295484469354e-06,
+ "loss": 0.3581,
+ "step": 7312
+ },
+ {
+ "epoch": 3.4576832151300234,
+ "grad_norm": 3.9723474979400635,
+ "learning_rate": 1.9323219993119766e-06,
+ "loss": 0.3951,
+ "step": 7313
+ },
+ {
+ "epoch": 3.458156028368794,
+ "grad_norm": 2.951300859451294,
+ "learning_rate": 1.931714485523838e-06,
+ "loss": 0.3865,
+ "step": 7314
+ },
+ {
+ "epoch": 3.458628841607565,
+ "grad_norm": 2.9265835285186768,
+ "learning_rate": 1.931107007120347e-06,
+ "loss": 0.3731,
+ "step": 7315
+ },
+ {
+ "epoch": 3.459101654846336,
+ "grad_norm": 3.271883249282837,
+ "learning_rate": 1.930499564139327e-06,
+ "loss": 0.3971,
+ "step": 7316
+ },
+ {
+ "epoch": 3.4595744680851066,
+ "grad_norm": 2.8716280460357666,
+ "learning_rate": 1.929892156618603e-06,
+ "loss": 0.3332,
+ "step": 7317
+ },
+ {
+ "epoch": 3.460047281323877,
+ "grad_norm": 2.9820191860198975,
+ "learning_rate": 1.929284784595993e-06,
+ "loss": 0.3907,
+ "step": 7318
+ },
+ {
+ "epoch": 3.4605200945626478,
+ "grad_norm": 3.313225269317627,
+ "learning_rate": 1.9286774481093183e-06,
+ "loss": 0.3678,
+ "step": 7319
+ },
+ {
+ "epoch": 3.4609929078014185,
+ "grad_norm": 3.365387439727783,
+ "learning_rate": 1.928070147196394e-06,
+ "loss": 0.4894,
+ "step": 7320
+ },
+ {
+ "epoch": 3.461465721040189,
+ "grad_norm": 3.1723599433898926,
+ "learning_rate": 1.927462881895033e-06,
+ "loss": 0.4607,
+ "step": 7321
+ },
+ {
+ "epoch": 3.4619385342789597,
+ "grad_norm": 2.7644999027252197,
+ "learning_rate": 1.9268556522430483e-06,
+ "loss": 0.3627,
+ "step": 7322
+ },
+ {
+ "epoch": 3.4624113475177305,
+ "grad_norm": 2.65572190284729,
+ "learning_rate": 1.9262484582782483e-06,
+ "loss": 0.3893,
+ "step": 7323
+ },
+ {
+ "epoch": 3.4628841607565013,
+ "grad_norm": 2.992037773132324,
+ "learning_rate": 1.9256413000384415e-06,
+ "loss": 0.4175,
+ "step": 7324
+ },
+ {
+ "epoch": 3.463356973995272,
+ "grad_norm": 3.020496368408203,
+ "learning_rate": 1.925034177561433e-06,
+ "loss": 0.42,
+ "step": 7325
+ },
+ {
+ "epoch": 3.4638297872340424,
+ "grad_norm": 2.780334234237671,
+ "learning_rate": 1.9244270908850236e-06,
+ "loss": 0.4195,
+ "step": 7326
+ },
+ {
+ "epoch": 3.4643026004728132,
+ "grad_norm": 2.863028049468994,
+ "learning_rate": 1.9238200400470166e-06,
+ "loss": 0.3706,
+ "step": 7327
+ },
+ {
+ "epoch": 3.464775413711584,
+ "grad_norm": 3.2766900062561035,
+ "learning_rate": 1.923213025085209e-06,
+ "loss": 0.4506,
+ "step": 7328
+ },
+ {
+ "epoch": 3.4652482269503544,
+ "grad_norm": 2.7300634384155273,
+ "learning_rate": 1.9226060460373975e-06,
+ "loss": 0.3463,
+ "step": 7329
+ },
+ {
+ "epoch": 3.465721040189125,
+ "grad_norm": 3.136104106903076,
+ "learning_rate": 1.921999102941376e-06,
+ "loss": 0.3839,
+ "step": 7330
+ },
+ {
+ "epoch": 3.466193853427896,
+ "grad_norm": 2.944932699203491,
+ "learning_rate": 1.921392195834934e-06,
+ "loss": 0.432,
+ "step": 7331
+ },
+ {
+ "epoch": 3.466666666666667,
+ "grad_norm": 3.428375005722046,
+ "learning_rate": 1.9207853247558647e-06,
+ "loss": 0.3407,
+ "step": 7332
+ },
+ {
+ "epoch": 3.4671394799054376,
+ "grad_norm": 3.3732450008392334,
+ "learning_rate": 1.9201784897419535e-06,
+ "loss": 0.361,
+ "step": 7333
+ },
+ {
+ "epoch": 3.467612293144208,
+ "grad_norm": 2.8291900157928467,
+ "learning_rate": 1.9195716908309836e-06,
+ "loss": 0.3805,
+ "step": 7334
+ },
+ {
+ "epoch": 3.4680851063829787,
+ "grad_norm": 3.3229610919952393,
+ "learning_rate": 1.9189649280607407e-06,
+ "loss": 0.3756,
+ "step": 7335
+ },
+ {
+ "epoch": 3.4685579196217495,
+ "grad_norm": 2.949416160583496,
+ "learning_rate": 1.918358201469004e-06,
+ "loss": 0.4316,
+ "step": 7336
+ },
+ {
+ "epoch": 3.46903073286052,
+ "grad_norm": 3.525501251220703,
+ "learning_rate": 1.9177515110935515e-06,
+ "loss": 0.4018,
+ "step": 7337
+ },
+ {
+ "epoch": 3.4695035460992907,
+ "grad_norm": 3.1439104080200195,
+ "learning_rate": 1.917144856972159e-06,
+ "loss": 0.4176,
+ "step": 7338
+ },
+ {
+ "epoch": 3.4699763593380615,
+ "grad_norm": 3.0022377967834473,
+ "learning_rate": 1.9165382391426006e-06,
+ "loss": 0.3962,
+ "step": 7339
+ },
+ {
+ "epoch": 3.4704491725768323,
+ "grad_norm": 3.2174794673919678,
+ "learning_rate": 1.9159316576426482e-06,
+ "loss": 0.441,
+ "step": 7340
+ },
+ {
+ "epoch": 3.470921985815603,
+ "grad_norm": 2.965123414993286,
+ "learning_rate": 1.9153251125100694e-06,
+ "loss": 0.4105,
+ "step": 7341
+ },
+ {
+ "epoch": 3.4713947990543734,
+ "grad_norm": 2.722904920578003,
+ "learning_rate": 1.9147186037826333e-06,
+ "loss": 0.4102,
+ "step": 7342
+ },
+ {
+ "epoch": 3.4718676122931442,
+ "grad_norm": 3.4894051551818848,
+ "learning_rate": 1.9141121314981033e-06,
+ "loss": 0.4225,
+ "step": 7343
+ },
+ {
+ "epoch": 3.472340425531915,
+ "grad_norm": 2.828497886657715,
+ "learning_rate": 1.913505695694241e-06,
+ "loss": 0.374,
+ "step": 7344
+ },
+ {
+ "epoch": 3.4728132387706854,
+ "grad_norm": 3.3046014308929443,
+ "learning_rate": 1.9128992964088077e-06,
+ "loss": 0.3568,
+ "step": 7345
+ },
+ {
+ "epoch": 3.473286052009456,
+ "grad_norm": 2.927281618118286,
+ "learning_rate": 1.9122929336795605e-06,
+ "loss": 0.4308,
+ "step": 7346
+ },
+ {
+ "epoch": 3.473758865248227,
+ "grad_norm": 2.9569990634918213,
+ "learning_rate": 1.911686607544256e-06,
+ "loss": 0.3226,
+ "step": 7347
+ },
+ {
+ "epoch": 3.4742316784869978,
+ "grad_norm": 3.1061038970947266,
+ "learning_rate": 1.9110803180406468e-06,
+ "loss": 0.4426,
+ "step": 7348
+ },
+ {
+ "epoch": 3.4747044917257686,
+ "grad_norm": 2.9609580039978027,
+ "learning_rate": 1.9104740652064825e-06,
+ "loss": 0.3835,
+ "step": 7349
+ },
+ {
+ "epoch": 3.475177304964539,
+ "grad_norm": 3.1547608375549316,
+ "learning_rate": 1.9098678490795147e-06,
+ "loss": 0.3814,
+ "step": 7350
+ },
+ {
+ "epoch": 3.4756501182033097,
+ "grad_norm": 2.869022846221924,
+ "learning_rate": 1.909261669697487e-06,
+ "loss": 0.4048,
+ "step": 7351
+ },
+ {
+ "epoch": 3.4761229314420805,
+ "grad_norm": 3.0565078258514404,
+ "learning_rate": 1.908655527098146e-06,
+ "loss": 0.3736,
+ "step": 7352
+ },
+ {
+ "epoch": 3.476595744680851,
+ "grad_norm": 2.893603563308716,
+ "learning_rate": 1.9080494213192317e-06,
+ "loss": 0.3906,
+ "step": 7353
+ },
+ {
+ "epoch": 3.4770685579196217,
+ "grad_norm": 2.818938732147217,
+ "learning_rate": 1.9074433523984844e-06,
+ "loss": 0.3958,
+ "step": 7354
+ },
+ {
+ "epoch": 3.4775413711583925,
+ "grad_norm": 2.675461769104004,
+ "learning_rate": 1.9068373203736419e-06,
+ "loss": 0.3371,
+ "step": 7355
+ },
+ {
+ "epoch": 3.4780141843971633,
+ "grad_norm": 2.5831551551818848,
+ "learning_rate": 1.9062313252824384e-06,
+ "loss": 0.3365,
+ "step": 7356
+ },
+ {
+ "epoch": 3.478486997635934,
+ "grad_norm": 3.299736738204956,
+ "learning_rate": 1.9056253671626054e-06,
+ "loss": 0.3923,
+ "step": 7357
+ },
+ {
+ "epoch": 3.4789598108747044,
+ "grad_norm": 2.508787155151367,
+ "learning_rate": 1.905019446051876e-06,
+ "loss": 0.3367,
+ "step": 7358
+ },
+ {
+ "epoch": 3.479432624113475,
+ "grad_norm": 2.980327606201172,
+ "learning_rate": 1.9044135619879753e-06,
+ "loss": 0.3842,
+ "step": 7359
+ },
+ {
+ "epoch": 3.479905437352246,
+ "grad_norm": 3.2114269733428955,
+ "learning_rate": 1.9038077150086317e-06,
+ "loss": 0.4625,
+ "step": 7360
+ },
+ {
+ "epoch": 3.4803782505910164,
+ "grad_norm": 3.2119715213775635,
+ "learning_rate": 1.9032019051515677e-06,
+ "loss": 0.4197,
+ "step": 7361
+ },
+ {
+ "epoch": 3.480851063829787,
+ "grad_norm": 3.2967300415039062,
+ "learning_rate": 1.9025961324545034e-06,
+ "loss": 0.4462,
+ "step": 7362
+ },
+ {
+ "epoch": 3.481323877068558,
+ "grad_norm": 3.132643461227417,
+ "learning_rate": 1.9019903969551589e-06,
+ "loss": 0.4355,
+ "step": 7363
+ },
+ {
+ "epoch": 3.4817966903073287,
+ "grad_norm": 2.9940602779388428,
+ "learning_rate": 1.9013846986912493e-06,
+ "loss": 0.3584,
+ "step": 7364
+ },
+ {
+ "epoch": 3.482269503546099,
+ "grad_norm": 2.901935577392578,
+ "learning_rate": 1.9007790377004907e-06,
+ "loss": 0.3987,
+ "step": 7365
+ },
+ {
+ "epoch": 3.48274231678487,
+ "grad_norm": 3.143404006958008,
+ "learning_rate": 1.9001734140205939e-06,
+ "loss": 0.3642,
+ "step": 7366
+ },
+ {
+ "epoch": 3.4832151300236407,
+ "grad_norm": 3.6701388359069824,
+ "learning_rate": 1.899567827689267e-06,
+ "loss": 0.4111,
+ "step": 7367
+ },
+ {
+ "epoch": 3.4836879432624115,
+ "grad_norm": 2.6821651458740234,
+ "learning_rate": 1.8989622787442202e-06,
+ "loss": 0.3827,
+ "step": 7368
+ },
+ {
+ "epoch": 3.484160756501182,
+ "grad_norm": 3.3916640281677246,
+ "learning_rate": 1.8983567672231562e-06,
+ "loss": 0.4214,
+ "step": 7369
+ },
+ {
+ "epoch": 3.4846335697399526,
+ "grad_norm": 3.3192927837371826,
+ "learning_rate": 1.8977512931637788e-06,
+ "loss": 0.4375,
+ "step": 7370
+ },
+ {
+ "epoch": 3.4851063829787234,
+ "grad_norm": 3.177884578704834,
+ "learning_rate": 1.8971458566037877e-06,
+ "loss": 0.3944,
+ "step": 7371
+ },
+ {
+ "epoch": 3.4855791962174942,
+ "grad_norm": 3.1554532051086426,
+ "learning_rate": 1.896540457580879e-06,
+ "loss": 0.3712,
+ "step": 7372
+ },
+ {
+ "epoch": 3.4860520094562646,
+ "grad_norm": 2.7773821353912354,
+ "learning_rate": 1.8959350961327516e-06,
+ "loss": 0.3534,
+ "step": 7373
+ },
+ {
+ "epoch": 3.4865248226950354,
+ "grad_norm": 2.4792168140411377,
+ "learning_rate": 1.8953297722970956e-06,
+ "loss": 0.3571,
+ "step": 7374
+ },
+ {
+ "epoch": 3.486997635933806,
+ "grad_norm": 2.676361083984375,
+ "learning_rate": 1.8947244861116044e-06,
+ "loss": 0.3365,
+ "step": 7375
+ },
+ {
+ "epoch": 3.487470449172577,
+ "grad_norm": 2.9340765476226807,
+ "learning_rate": 1.8941192376139655e-06,
+ "loss": 0.4656,
+ "step": 7376
+ },
+ {
+ "epoch": 3.4879432624113473,
+ "grad_norm": 3.7924742698669434,
+ "learning_rate": 1.8935140268418646e-06,
+ "loss": 0.3639,
+ "step": 7377
+ },
+ {
+ "epoch": 3.488416075650118,
+ "grad_norm": 2.798912286758423,
+ "learning_rate": 1.892908853832986e-06,
+ "loss": 0.3741,
+ "step": 7378
+ },
+ {
+ "epoch": 3.488888888888889,
+ "grad_norm": 3.1731197834014893,
+ "learning_rate": 1.8923037186250112e-06,
+ "loss": 0.4041,
+ "step": 7379
+ },
+ {
+ "epoch": 3.4893617021276597,
+ "grad_norm": 2.893725633621216,
+ "learning_rate": 1.8916986212556182e-06,
+ "loss": 0.3103,
+ "step": 7380
+ },
+ {
+ "epoch": 3.48983451536643,
+ "grad_norm": 3.2489001750946045,
+ "learning_rate": 1.891093561762486e-06,
+ "loss": 0.328,
+ "step": 7381
+ },
+ {
+ "epoch": 3.490307328605201,
+ "grad_norm": 2.8076415061950684,
+ "learning_rate": 1.8904885401832862e-06,
+ "loss": 0.426,
+ "step": 7382
+ },
+ {
+ "epoch": 3.4907801418439717,
+ "grad_norm": 3.076544761657715,
+ "learning_rate": 1.8898835565556938e-06,
+ "loss": 0.3664,
+ "step": 7383
+ },
+ {
+ "epoch": 3.4912529550827425,
+ "grad_norm": 2.7615935802459717,
+ "learning_rate": 1.8892786109173769e-06,
+ "loss": 0.3718,
+ "step": 7384
+ },
+ {
+ "epoch": 3.491725768321513,
+ "grad_norm": 2.9050116539001465,
+ "learning_rate": 1.8886737033060023e-06,
+ "loss": 0.3456,
+ "step": 7385
+ },
+ {
+ "epoch": 3.4921985815602836,
+ "grad_norm": 2.4928293228149414,
+ "learning_rate": 1.8880688337592366e-06,
+ "loss": 0.3487,
+ "step": 7386
+ },
+ {
+ "epoch": 3.4926713947990544,
+ "grad_norm": 2.773418426513672,
+ "learning_rate": 1.88746400231474e-06,
+ "loss": 0.3771,
+ "step": 7387
+ },
+ {
+ "epoch": 3.493144208037825,
+ "grad_norm": 2.7137296199798584,
+ "learning_rate": 1.886859209010175e-06,
+ "loss": 0.376,
+ "step": 7388
+ },
+ {
+ "epoch": 3.4936170212765956,
+ "grad_norm": 3.327976942062378,
+ "learning_rate": 1.886254453883199e-06,
+ "loss": 0.3481,
+ "step": 7389
+ },
+ {
+ "epoch": 3.4940898345153664,
+ "grad_norm": 3.8637235164642334,
+ "learning_rate": 1.8856497369714655e-06,
+ "loss": 0.3726,
+ "step": 7390
+ },
+ {
+ "epoch": 3.494562647754137,
+ "grad_norm": 3.1517951488494873,
+ "learning_rate": 1.88504505831263e-06,
+ "loss": 0.4459,
+ "step": 7391
+ },
+ {
+ "epoch": 3.495035460992908,
+ "grad_norm": 3.160130262374878,
+ "learning_rate": 1.884440417944342e-06,
+ "loss": 0.3918,
+ "step": 7392
+ },
+ {
+ "epoch": 3.4955082742316783,
+ "grad_norm": 2.6518726348876953,
+ "learning_rate": 1.8838358159042503e-06,
+ "loss": 0.3493,
+ "step": 7393
+ },
+ {
+ "epoch": 3.495981087470449,
+ "grad_norm": 2.7487380504608154,
+ "learning_rate": 1.8832312522300009e-06,
+ "loss": 0.3846,
+ "step": 7394
+ },
+ {
+ "epoch": 3.49645390070922,
+ "grad_norm": 3.062293291091919,
+ "learning_rate": 1.8826267269592355e-06,
+ "loss": 0.3792,
+ "step": 7395
+ },
+ {
+ "epoch": 3.4969267139479907,
+ "grad_norm": 3.3636794090270996,
+ "learning_rate": 1.8820222401295979e-06,
+ "loss": 0.4504,
+ "step": 7396
+ },
+ {
+ "epoch": 3.497399527186761,
+ "grad_norm": 3.230196237564087,
+ "learning_rate": 1.8814177917787246e-06,
+ "loss": 0.3953,
+ "step": 7397
+ },
+ {
+ "epoch": 3.497872340425532,
+ "grad_norm": 2.891002893447876,
+ "learning_rate": 1.8808133819442541e-06,
+ "loss": 0.3923,
+ "step": 7398
+ },
+ {
+ "epoch": 3.4983451536643027,
+ "grad_norm": 2.7478551864624023,
+ "learning_rate": 1.8802090106638196e-06,
+ "loss": 0.4115,
+ "step": 7399
+ },
+ {
+ "epoch": 3.4988179669030735,
+ "grad_norm": 3.0452797412872314,
+ "learning_rate": 1.8796046779750515e-06,
+ "loss": 0.4154,
+ "step": 7400
+ },
+ {
+ "epoch": 3.499290780141844,
+ "grad_norm": 3.0759124755859375,
+ "learning_rate": 1.87900038391558e-06,
+ "loss": 0.4277,
+ "step": 7401
+ },
+ {
+ "epoch": 3.4997635933806146,
+ "grad_norm": 2.7563929557800293,
+ "learning_rate": 1.8783961285230314e-06,
+ "loss": 0.3896,
+ "step": 7402
+ },
+ {
+ "epoch": 3.5002364066193854,
+ "grad_norm": 2.661916494369507,
+ "learning_rate": 1.87779191183503e-06,
+ "loss": 0.3625,
+ "step": 7403
+ },
+ {
+ "epoch": 3.500709219858156,
+ "grad_norm": 2.881241798400879,
+ "learning_rate": 1.877187733889199e-06,
+ "loss": 0.3724,
+ "step": 7404
+ },
+ {
+ "epoch": 3.5011820330969265,
+ "grad_norm": 3.2405693531036377,
+ "learning_rate": 1.8765835947231554e-06,
+ "loss": 0.3974,
+ "step": 7405
+ },
+ {
+ "epoch": 3.5016548463356973,
+ "grad_norm": 2.924288034439087,
+ "learning_rate": 1.8759794943745184e-06,
+ "loss": 0.3467,
+ "step": 7406
+ },
+ {
+ "epoch": 3.502127659574468,
+ "grad_norm": 3.031663656234741,
+ "learning_rate": 1.8753754328809027e-06,
+ "loss": 0.3995,
+ "step": 7407
+ },
+ {
+ "epoch": 3.5026004728132385,
+ "grad_norm": 3.028277635574341,
+ "learning_rate": 1.874771410279919e-06,
+ "loss": 0.3741,
+ "step": 7408
+ },
+ {
+ "epoch": 3.5030732860520093,
+ "grad_norm": 3.0211644172668457,
+ "learning_rate": 1.8741674266091782e-06,
+ "loss": 0.4018,
+ "step": 7409
+ },
+ {
+ "epoch": 3.50354609929078,
+ "grad_norm": 2.732234239578247,
+ "learning_rate": 1.8735634819062875e-06,
+ "loss": 0.313,
+ "step": 7410
+ },
+ {
+ "epoch": 3.504018912529551,
+ "grad_norm": 3.139596939086914,
+ "learning_rate": 1.8729595762088525e-06,
+ "loss": 0.4112,
+ "step": 7411
+ },
+ {
+ "epoch": 3.5044917257683217,
+ "grad_norm": 2.894230365753174,
+ "learning_rate": 1.8723557095544754e-06,
+ "loss": 0.3891,
+ "step": 7412
+ },
+ {
+ "epoch": 3.504964539007092,
+ "grad_norm": 2.850205659866333,
+ "learning_rate": 1.8717518819807547e-06,
+ "loss": 0.424,
+ "step": 7413
+ },
+ {
+ "epoch": 3.505437352245863,
+ "grad_norm": 3.047736644744873,
+ "learning_rate": 1.8711480935252907e-06,
+ "loss": 0.3757,
+ "step": 7414
+ },
+ {
+ "epoch": 3.5059101654846336,
+ "grad_norm": 3.0174455642700195,
+ "learning_rate": 1.8705443442256772e-06,
+ "loss": 0.3625,
+ "step": 7415
+ },
+ {
+ "epoch": 3.506382978723404,
+ "grad_norm": 2.840681552886963,
+ "learning_rate": 1.869940634119507e-06,
+ "loss": 0.3595,
+ "step": 7416
+ },
+ {
+ "epoch": 3.506855791962175,
+ "grad_norm": 3.067473888397217,
+ "learning_rate": 1.8693369632443713e-06,
+ "loss": 0.432,
+ "step": 7417
+ },
+ {
+ "epoch": 3.5073286052009456,
+ "grad_norm": 2.94655179977417,
+ "learning_rate": 1.8687333316378572e-06,
+ "loss": 0.4222,
+ "step": 7418
+ },
+ {
+ "epoch": 3.5078014184397164,
+ "grad_norm": 2.968548536300659,
+ "learning_rate": 1.868129739337551e-06,
+ "loss": 0.4098,
+ "step": 7419
+ },
+ {
+ "epoch": 3.508274231678487,
+ "grad_norm": 2.70094895362854,
+ "learning_rate": 1.867526186381034e-06,
+ "loss": 0.386,
+ "step": 7420
+ },
+ {
+ "epoch": 3.5087470449172575,
+ "grad_norm": 3.25897216796875,
+ "learning_rate": 1.8669226728058895e-06,
+ "loss": 0.4411,
+ "step": 7421
+ },
+ {
+ "epoch": 3.5092198581560283,
+ "grad_norm": 4.281215667724609,
+ "learning_rate": 1.866319198649694e-06,
+ "loss": 0.4011,
+ "step": 7422
+ },
+ {
+ "epoch": 3.509692671394799,
+ "grad_norm": 2.8394858837127686,
+ "learning_rate": 1.8657157639500223e-06,
+ "loss": 0.4162,
+ "step": 7423
+ },
+ {
+ "epoch": 3.5101654846335695,
+ "grad_norm": 2.732691764831543,
+ "learning_rate": 1.86511236874445e-06,
+ "loss": 0.3603,
+ "step": 7424
+ },
+ {
+ "epoch": 3.5106382978723403,
+ "grad_norm": 3.0152828693389893,
+ "learning_rate": 1.8645090130705463e-06,
+ "loss": 0.3811,
+ "step": 7425
+ },
+ {
+ "epoch": 3.511111111111111,
+ "grad_norm": 3.1762008666992188,
+ "learning_rate": 1.8639056969658793e-06,
+ "loss": 0.3985,
+ "step": 7426
+ },
+ {
+ "epoch": 3.511583924349882,
+ "grad_norm": 3.151123523712158,
+ "learning_rate": 1.863302420468016e-06,
+ "loss": 0.3582,
+ "step": 7427
+ },
+ {
+ "epoch": 3.5120567375886527,
+ "grad_norm": 2.738206386566162,
+ "learning_rate": 1.862699183614518e-06,
+ "loss": 0.3768,
+ "step": 7428
+ },
+ {
+ "epoch": 3.512529550827423,
+ "grad_norm": 3.235212564468384,
+ "learning_rate": 1.8620959864429487e-06,
+ "loss": 0.3964,
+ "step": 7429
+ },
+ {
+ "epoch": 3.513002364066194,
+ "grad_norm": 3.1113579273223877,
+ "learning_rate": 1.8614928289908648e-06,
+ "loss": 0.3979,
+ "step": 7430
+ },
+ {
+ "epoch": 3.5134751773049646,
+ "grad_norm": 2.6802520751953125,
+ "learning_rate": 1.860889711295822e-06,
+ "loss": 0.327,
+ "step": 7431
+ },
+ {
+ "epoch": 3.513947990543735,
+ "grad_norm": 2.9212403297424316,
+ "learning_rate": 1.860286633395375e-06,
+ "loss": 0.4104,
+ "step": 7432
+ },
+ {
+ "epoch": 3.5144208037825058,
+ "grad_norm": 2.868861198425293,
+ "learning_rate": 1.8596835953270742e-06,
+ "loss": 0.383,
+ "step": 7433
+ },
+ {
+ "epoch": 3.5148936170212766,
+ "grad_norm": 2.831655740737915,
+ "learning_rate": 1.8590805971284686e-06,
+ "loss": 0.3615,
+ "step": 7434
+ },
+ {
+ "epoch": 3.5153664302600474,
+ "grad_norm": 3.1540114879608154,
+ "learning_rate": 1.8584776388371039e-06,
+ "loss": 0.3914,
+ "step": 7435
+ },
+ {
+ "epoch": 3.515839243498818,
+ "grad_norm": 3.22031307220459,
+ "learning_rate": 1.8578747204905223e-06,
+ "loss": 0.4358,
+ "step": 7436
+ },
+ {
+ "epoch": 3.5163120567375885,
+ "grad_norm": 3.2922887802124023,
+ "learning_rate": 1.8572718421262677e-06,
+ "loss": 0.3894,
+ "step": 7437
+ },
+ {
+ "epoch": 3.5167848699763593,
+ "grad_norm": 2.936475992202759,
+ "learning_rate": 1.856669003781876e-06,
+ "loss": 0.3748,
+ "step": 7438
+ },
+ {
+ "epoch": 3.51725768321513,
+ "grad_norm": 3.4542860984802246,
+ "learning_rate": 1.8560662054948856e-06,
+ "loss": 0.3362,
+ "step": 7439
+ },
+ {
+ "epoch": 3.5177304964539005,
+ "grad_norm": 3.1532278060913086,
+ "learning_rate": 1.8554634473028288e-06,
+ "loss": 0.411,
+ "step": 7440
+ },
+ {
+ "epoch": 3.5182033096926713,
+ "grad_norm": 3.1678943634033203,
+ "learning_rate": 1.854860729243237e-06,
+ "loss": 0.4357,
+ "step": 7441
+ },
+ {
+ "epoch": 3.518676122931442,
+ "grad_norm": 2.608930826187134,
+ "learning_rate": 1.8542580513536385e-06,
+ "loss": 0.3851,
+ "step": 7442
+ },
+ {
+ "epoch": 3.519148936170213,
+ "grad_norm": 3.127915143966675,
+ "learning_rate": 1.853655413671559e-06,
+ "loss": 0.4227,
+ "step": 7443
+ },
+ {
+ "epoch": 3.5196217494089836,
+ "grad_norm": 3.0593245029449463,
+ "learning_rate": 1.8530528162345238e-06,
+ "loss": 0.4315,
+ "step": 7444
+ },
+ {
+ "epoch": 3.520094562647754,
+ "grad_norm": 2.7818729877471924,
+ "learning_rate": 1.852450259080053e-06,
+ "loss": 0.4018,
+ "step": 7445
+ },
+ {
+ "epoch": 3.520567375886525,
+ "grad_norm": 3.2635445594787598,
+ "learning_rate": 1.8518477422456639e-06,
+ "loss": 0.415,
+ "step": 7446
+ },
+ {
+ "epoch": 3.5210401891252956,
+ "grad_norm": 2.5713813304901123,
+ "learning_rate": 1.851245265768875e-06,
+ "loss": 0.3309,
+ "step": 7447
+ },
+ {
+ "epoch": 3.521513002364066,
+ "grad_norm": 2.6778969764709473,
+ "learning_rate": 1.8506428296871982e-06,
+ "loss": 0.3106,
+ "step": 7448
+ },
+ {
+ "epoch": 3.5219858156028367,
+ "grad_norm": 2.901095390319824,
+ "learning_rate": 1.8500404340381455e-06,
+ "loss": 0.3729,
+ "step": 7449
+ },
+ {
+ "epoch": 3.5224586288416075,
+ "grad_norm": 3.1000046730041504,
+ "learning_rate": 1.849438078859225e-06,
+ "loss": 0.438,
+ "step": 7450
+ },
+ {
+ "epoch": 3.5229314420803783,
+ "grad_norm": 2.901890993118286,
+ "learning_rate": 1.8488357641879417e-06,
+ "loss": 0.3934,
+ "step": 7451
+ },
+ {
+ "epoch": 3.523404255319149,
+ "grad_norm": 3.2212157249450684,
+ "learning_rate": 1.8482334900618009e-06,
+ "loss": 0.4359,
+ "step": 7452
+ },
+ {
+ "epoch": 3.5238770685579195,
+ "grad_norm": 3.3780901432037354,
+ "learning_rate": 1.847631256518303e-06,
+ "loss": 0.4022,
+ "step": 7453
+ },
+ {
+ "epoch": 3.5243498817966903,
+ "grad_norm": 2.9996445178985596,
+ "learning_rate": 1.847029063594945e-06,
+ "loss": 0.3989,
+ "step": 7454
+ },
+ {
+ "epoch": 3.524822695035461,
+ "grad_norm": 2.8581080436706543,
+ "learning_rate": 1.8464269113292255e-06,
+ "loss": 0.3401,
+ "step": 7455
+ },
+ {
+ "epoch": 3.5252955082742314,
+ "grad_norm": 2.9551661014556885,
+ "learning_rate": 1.8458247997586354e-06,
+ "loss": 0.4556,
+ "step": 7456
+ },
+ {
+ "epoch": 3.5257683215130022,
+ "grad_norm": 2.9672555923461914,
+ "learning_rate": 1.8452227289206672e-06,
+ "loss": 0.3575,
+ "step": 7457
+ },
+ {
+ "epoch": 3.526241134751773,
+ "grad_norm": 3.226273536682129,
+ "learning_rate": 1.8446206988528087e-06,
+ "loss": 0.3769,
+ "step": 7458
+ },
+ {
+ "epoch": 3.526713947990544,
+ "grad_norm": 2.994356155395508,
+ "learning_rate": 1.8440187095925443e-06,
+ "loss": 0.3653,
+ "step": 7459
+ },
+ {
+ "epoch": 3.5271867612293146,
+ "grad_norm": 2.489049196243286,
+ "learning_rate": 1.8434167611773595e-06,
+ "loss": 0.3454,
+ "step": 7460
+ },
+ {
+ "epoch": 3.527659574468085,
+ "grad_norm": 2.7897472381591797,
+ "learning_rate": 1.8428148536447333e-06,
+ "loss": 0.3526,
+ "step": 7461
+ },
+ {
+ "epoch": 3.5281323877068558,
+ "grad_norm": 2.947746992111206,
+ "learning_rate": 1.842212987032145e-06,
+ "loss": 0.3542,
+ "step": 7462
+ },
+ {
+ "epoch": 3.5286052009456266,
+ "grad_norm": 2.9303736686706543,
+ "learning_rate": 1.84161116137707e-06,
+ "loss": 0.3618,
+ "step": 7463
+ },
+ {
+ "epoch": 3.529078014184397,
+ "grad_norm": 2.81052827835083,
+ "learning_rate": 1.8410093767169807e-06,
+ "loss": 0.3833,
+ "step": 7464
+ },
+ {
+ "epoch": 3.5295508274231677,
+ "grad_norm": 3.4084126949310303,
+ "learning_rate": 1.840407633089348e-06,
+ "loss": 0.3868,
+ "step": 7465
+ },
+ {
+ "epoch": 3.5300236406619385,
+ "grad_norm": 2.8372802734375,
+ "learning_rate": 1.839805930531639e-06,
+ "loss": 0.3407,
+ "step": 7466
+ },
+ {
+ "epoch": 3.5304964539007093,
+ "grad_norm": 2.9218525886535645,
+ "learning_rate": 1.8392042690813205e-06,
+ "loss": 0.3772,
+ "step": 7467
+ },
+ {
+ "epoch": 3.53096926713948,
+ "grad_norm": 3.425274610519409,
+ "learning_rate": 1.8386026487758552e-06,
+ "loss": 0.3996,
+ "step": 7468
+ },
+ {
+ "epoch": 3.5314420803782505,
+ "grad_norm": 3.027423858642578,
+ "learning_rate": 1.8380010696527015e-06,
+ "loss": 0.3752,
+ "step": 7469
+ },
+ {
+ "epoch": 3.5319148936170213,
+ "grad_norm": 2.974896192550659,
+ "learning_rate": 1.8373995317493193e-06,
+ "loss": 0.3657,
+ "step": 7470
+ },
+ {
+ "epoch": 3.532387706855792,
+ "grad_norm": 2.837458610534668,
+ "learning_rate": 1.8367980351031628e-06,
+ "loss": 0.3949,
+ "step": 7471
+ },
+ {
+ "epoch": 3.5328605200945624,
+ "grad_norm": 2.8257288932800293,
+ "learning_rate": 1.8361965797516844e-06,
+ "loss": 0.3253,
+ "step": 7472
+ },
+ {
+ "epoch": 3.533333333333333,
+ "grad_norm": 2.8278095722198486,
+ "learning_rate": 1.8355951657323351e-06,
+ "loss": 0.3588,
+ "step": 7473
+ },
+ {
+ "epoch": 3.533806146572104,
+ "grad_norm": 2.641160249710083,
+ "learning_rate": 1.8349937930825601e-06,
+ "loss": 0.3423,
+ "step": 7474
+ },
+ {
+ "epoch": 3.534278959810875,
+ "grad_norm": 2.6909263134002686,
+ "learning_rate": 1.8343924618398065e-06,
+ "loss": 0.3973,
+ "step": 7475
+ },
+ {
+ "epoch": 3.5347517730496456,
+ "grad_norm": 3.0727429389953613,
+ "learning_rate": 1.8337911720415157e-06,
+ "loss": 0.4207,
+ "step": 7476
+ },
+ {
+ "epoch": 3.535224586288416,
+ "grad_norm": 3.218925714492798,
+ "learning_rate": 1.8331899237251265e-06,
+ "loss": 0.3955,
+ "step": 7477
+ },
+ {
+ "epoch": 3.5356973995271868,
+ "grad_norm": 3.163914918899536,
+ "learning_rate": 1.832588716928078e-06,
+ "loss": 0.4655,
+ "step": 7478
+ },
+ {
+ "epoch": 3.5361702127659576,
+ "grad_norm": 2.8622686862945557,
+ "learning_rate": 1.831987551687803e-06,
+ "loss": 0.4084,
+ "step": 7479
+ },
+ {
+ "epoch": 3.536643026004728,
+ "grad_norm": 2.8534188270568848,
+ "learning_rate": 1.831386428041734e-06,
+ "loss": 0.4144,
+ "step": 7480
+ },
+ {
+ "epoch": 3.5371158392434987,
+ "grad_norm": 2.8138554096221924,
+ "learning_rate": 1.8307853460273008e-06,
+ "loss": 0.3835,
+ "step": 7481
+ },
+ {
+ "epoch": 3.5375886524822695,
+ "grad_norm": 3.061960458755493,
+ "learning_rate": 1.830184305681929e-06,
+ "loss": 0.4128,
+ "step": 7482
+ },
+ {
+ "epoch": 3.5380614657210403,
+ "grad_norm": 2.8524835109710693,
+ "learning_rate": 1.8295833070430444e-06,
+ "loss": 0.3372,
+ "step": 7483
+ },
+ {
+ "epoch": 3.538534278959811,
+ "grad_norm": 3.2567028999328613,
+ "learning_rate": 1.8289823501480663e-06,
+ "loss": 0.4533,
+ "step": 7484
+ },
+ {
+ "epoch": 3.5390070921985815,
+ "grad_norm": 2.945634603500366,
+ "learning_rate": 1.8283814350344158e-06,
+ "loss": 0.3565,
+ "step": 7485
+ },
+ {
+ "epoch": 3.5394799054373522,
+ "grad_norm": 2.903287649154663,
+ "learning_rate": 1.8277805617395089e-06,
+ "loss": 0.349,
+ "step": 7486
+ },
+ {
+ "epoch": 3.539952718676123,
+ "grad_norm": 3.249272584915161,
+ "learning_rate": 1.827179730300757e-06,
+ "loss": 0.4076,
+ "step": 7487
+ },
+ {
+ "epoch": 3.5404255319148934,
+ "grad_norm": 2.9591739177703857,
+ "learning_rate": 1.8265789407555748e-06,
+ "loss": 0.3439,
+ "step": 7488
+ },
+ {
+ "epoch": 3.540898345153664,
+ "grad_norm": 3.8527538776397705,
+ "learning_rate": 1.8259781931413683e-06,
+ "loss": 0.4684,
+ "step": 7489
+ },
+ {
+ "epoch": 3.541371158392435,
+ "grad_norm": 2.7392261028289795,
+ "learning_rate": 1.8253774874955449e-06,
+ "loss": 0.3494,
+ "step": 7490
+ },
+ {
+ "epoch": 3.541843971631206,
+ "grad_norm": 2.880993127822876,
+ "learning_rate": 1.8247768238555069e-06,
+ "loss": 0.3546,
+ "step": 7491
+ },
+ {
+ "epoch": 3.5423167848699766,
+ "grad_norm": 2.9944894313812256,
+ "learning_rate": 1.8241762022586545e-06,
+ "loss": 0.3594,
+ "step": 7492
+ },
+ {
+ "epoch": 3.542789598108747,
+ "grad_norm": 3.0084292888641357,
+ "learning_rate": 1.8235756227423878e-06,
+ "loss": 0.408,
+ "step": 7493
+ },
+ {
+ "epoch": 3.5432624113475177,
+ "grad_norm": 2.75227689743042,
+ "learning_rate": 1.8229750853440998e-06,
+ "loss": 0.3515,
+ "step": 7494
+ },
+ {
+ "epoch": 3.5437352245862885,
+ "grad_norm": 3.041893243789673,
+ "learning_rate": 1.8223745901011856e-06,
+ "loss": 0.401,
+ "step": 7495
+ },
+ {
+ "epoch": 3.544208037825059,
+ "grad_norm": 2.8728370666503906,
+ "learning_rate": 1.8217741370510345e-06,
+ "loss": 0.3832,
+ "step": 7496
+ },
+ {
+ "epoch": 3.5446808510638297,
+ "grad_norm": 3.095460891723633,
+ "learning_rate": 1.8211737262310331e-06,
+ "loss": 0.3086,
+ "step": 7497
+ },
+ {
+ "epoch": 3.5451536643026005,
+ "grad_norm": 3.1869826316833496,
+ "learning_rate": 1.8205733576785678e-06,
+ "loss": 0.3666,
+ "step": 7498
+ },
+ {
+ "epoch": 3.5456264775413713,
+ "grad_norm": 3.307560443878174,
+ "learning_rate": 1.8199730314310204e-06,
+ "loss": 0.4489,
+ "step": 7499
+ },
+ {
+ "epoch": 3.546099290780142,
+ "grad_norm": 2.9531142711639404,
+ "learning_rate": 1.8193727475257697e-06,
+ "loss": 0.4017,
+ "step": 7500
+ },
+ {
+ "epoch": 3.5465721040189124,
+ "grad_norm": 3.2969162464141846,
+ "learning_rate": 1.8187725060001942e-06,
+ "loss": 0.4179,
+ "step": 7501
+ },
+ {
+ "epoch": 3.5470449172576832,
+ "grad_norm": 2.9434688091278076,
+ "learning_rate": 1.818172306891667e-06,
+ "loss": 0.3562,
+ "step": 7502
+ },
+ {
+ "epoch": 3.547517730496454,
+ "grad_norm": 3.070732355117798,
+ "learning_rate": 1.8175721502375616e-06,
+ "loss": 0.393,
+ "step": 7503
+ },
+ {
+ "epoch": 3.5479905437352244,
+ "grad_norm": 2.970898389816284,
+ "learning_rate": 1.8169720360752457e-06,
+ "loss": 0.4448,
+ "step": 7504
+ },
+ {
+ "epoch": 3.548463356973995,
+ "grad_norm": 2.7050931453704834,
+ "learning_rate": 1.8163719644420858e-06,
+ "loss": 0.3562,
+ "step": 7505
+ },
+ {
+ "epoch": 3.548936170212766,
+ "grad_norm": 3.0073063373565674,
+ "learning_rate": 1.8157719353754467e-06,
+ "loss": 0.4429,
+ "step": 7506
+ },
+ {
+ "epoch": 3.5494089834515368,
+ "grad_norm": 3.0240445137023926,
+ "learning_rate": 1.8151719489126874e-06,
+ "loss": 0.4073,
+ "step": 7507
+ },
+ {
+ "epoch": 3.5498817966903076,
+ "grad_norm": 3.558763265609741,
+ "learning_rate": 1.8145720050911695e-06,
+ "loss": 0.4025,
+ "step": 7508
+ },
+ {
+ "epoch": 3.550354609929078,
+ "grad_norm": 3.637258768081665,
+ "learning_rate": 1.8139721039482473e-06,
+ "loss": 0.5074,
+ "step": 7509
+ },
+ {
+ "epoch": 3.5508274231678487,
+ "grad_norm": 2.804719924926758,
+ "learning_rate": 1.8133722455212726e-06,
+ "loss": 0.3727,
+ "step": 7510
+ },
+ {
+ "epoch": 3.5513002364066195,
+ "grad_norm": 2.9034759998321533,
+ "learning_rate": 1.8127724298475984e-06,
+ "loss": 0.3642,
+ "step": 7511
+ },
+ {
+ "epoch": 3.55177304964539,
+ "grad_norm": 3.1304872035980225,
+ "learning_rate": 1.8121726569645714e-06,
+ "loss": 0.434,
+ "step": 7512
+ },
+ {
+ "epoch": 3.5522458628841607,
+ "grad_norm": 3.019956111907959,
+ "learning_rate": 1.8115729269095378e-06,
+ "loss": 0.4235,
+ "step": 7513
+ },
+ {
+ "epoch": 3.5527186761229315,
+ "grad_norm": 2.7984633445739746,
+ "learning_rate": 1.810973239719839e-06,
+ "loss": 0.3344,
+ "step": 7514
+ },
+ {
+ "epoch": 3.5531914893617023,
+ "grad_norm": 2.839709997177124,
+ "learning_rate": 1.8103735954328145e-06,
+ "loss": 0.3708,
+ "step": 7515
+ },
+ {
+ "epoch": 3.553664302600473,
+ "grad_norm": 2.766819477081299,
+ "learning_rate": 1.809773994085803e-06,
+ "loss": 0.3402,
+ "step": 7516
+ },
+ {
+ "epoch": 3.5541371158392434,
+ "grad_norm": 2.707942247390747,
+ "learning_rate": 1.8091744357161372e-06,
+ "loss": 0.4327,
+ "step": 7517
+ },
+ {
+ "epoch": 3.554609929078014,
+ "grad_norm": 3.512702465057373,
+ "learning_rate": 1.8085749203611516e-06,
+ "loss": 0.3965,
+ "step": 7518
+ },
+ {
+ "epoch": 3.555082742316785,
+ "grad_norm": 2.717024803161621,
+ "learning_rate": 1.8079754480581738e-06,
+ "loss": 0.3237,
+ "step": 7519
+ },
+ {
+ "epoch": 3.5555555555555554,
+ "grad_norm": 2.659001350402832,
+ "learning_rate": 1.8073760188445296e-06,
+ "loss": 0.3546,
+ "step": 7520
+ },
+ {
+ "epoch": 3.556028368794326,
+ "grad_norm": 2.615028142929077,
+ "learning_rate": 1.8067766327575445e-06,
+ "loss": 0.3232,
+ "step": 7521
+ },
+ {
+ "epoch": 3.556501182033097,
+ "grad_norm": 2.659428119659424,
+ "learning_rate": 1.8061772898345386e-06,
+ "loss": 0.3769,
+ "step": 7522
+ },
+ {
+ "epoch": 3.5569739952718678,
+ "grad_norm": 3.142369270324707,
+ "learning_rate": 1.8055779901128296e-06,
+ "loss": 0.4292,
+ "step": 7523
+ },
+ {
+ "epoch": 3.5574468085106385,
+ "grad_norm": 3.0832736492156982,
+ "learning_rate": 1.8049787336297352e-06,
+ "loss": 0.3871,
+ "step": 7524
+ },
+ {
+ "epoch": 3.557919621749409,
+ "grad_norm": 2.778411865234375,
+ "learning_rate": 1.8043795204225664e-06,
+ "loss": 0.3938,
+ "step": 7525
+ },
+ {
+ "epoch": 3.5583924349881797,
+ "grad_norm": 3.1651480197906494,
+ "learning_rate": 1.8037803505286355e-06,
+ "loss": 0.3315,
+ "step": 7526
+ },
+ {
+ "epoch": 3.5588652482269505,
+ "grad_norm": 3.266508102416992,
+ "learning_rate": 1.8031812239852498e-06,
+ "loss": 0.4156,
+ "step": 7527
+ },
+ {
+ "epoch": 3.559338061465721,
+ "grad_norm": 3.1345436573028564,
+ "learning_rate": 1.8025821408297127e-06,
+ "loss": 0.3813,
+ "step": 7528
+ },
+ {
+ "epoch": 3.5598108747044916,
+ "grad_norm": 3.1535425186157227,
+ "learning_rate": 1.8019831010993289e-06,
+ "loss": 0.3897,
+ "step": 7529
+ },
+ {
+ "epoch": 3.5602836879432624,
+ "grad_norm": 3.0934345722198486,
+ "learning_rate": 1.8013841048313952e-06,
+ "loss": 0.4074,
+ "step": 7530
+ },
+ {
+ "epoch": 3.5607565011820332,
+ "grad_norm": 3.224876642227173,
+ "learning_rate": 1.8007851520632108e-06,
+ "loss": 0.3969,
+ "step": 7531
+ },
+ {
+ "epoch": 3.561229314420804,
+ "grad_norm": 3.082303285598755,
+ "learning_rate": 1.8001862428320693e-06,
+ "loss": 0.3559,
+ "step": 7532
+ },
+ {
+ "epoch": 3.5617021276595744,
+ "grad_norm": 3.5289969444274902,
+ "learning_rate": 1.7995873771752608e-06,
+ "loss": 0.3961,
+ "step": 7533
+ },
+ {
+ "epoch": 3.562174940898345,
+ "grad_norm": 3.1893370151519775,
+ "learning_rate": 1.7989885551300762e-06,
+ "loss": 0.3721,
+ "step": 7534
+ },
+ {
+ "epoch": 3.562647754137116,
+ "grad_norm": 2.6911089420318604,
+ "learning_rate": 1.7983897767337999e-06,
+ "loss": 0.3801,
+ "step": 7535
+ },
+ {
+ "epoch": 3.5631205673758863,
+ "grad_norm": 3.0837483406066895,
+ "learning_rate": 1.797791042023716e-06,
+ "loss": 0.3886,
+ "step": 7536
+ },
+ {
+ "epoch": 3.563593380614657,
+ "grad_norm": 2.973459005355835,
+ "learning_rate": 1.7971923510371054e-06,
+ "loss": 0.438,
+ "step": 7537
+ },
+ {
+ "epoch": 3.564066193853428,
+ "grad_norm": 3.1537392139434814,
+ "learning_rate": 1.7965937038112435e-06,
+ "loss": 0.4022,
+ "step": 7538
+ },
+ {
+ "epoch": 3.5645390070921987,
+ "grad_norm": 3.2339680194854736,
+ "learning_rate": 1.795995100383409e-06,
+ "loss": 0.3883,
+ "step": 7539
+ },
+ {
+ "epoch": 3.5650118203309695,
+ "grad_norm": 2.5029079914093018,
+ "learning_rate": 1.7953965407908714e-06,
+ "loss": 0.3522,
+ "step": 7540
+ },
+ {
+ "epoch": 3.56548463356974,
+ "grad_norm": 3.1560211181640625,
+ "learning_rate": 1.7947980250709027e-06,
+ "loss": 0.4024,
+ "step": 7541
+ },
+ {
+ "epoch": 3.5659574468085107,
+ "grad_norm": 2.950477361679077,
+ "learning_rate": 1.7941995532607687e-06,
+ "loss": 0.3598,
+ "step": 7542
+ },
+ {
+ "epoch": 3.5664302600472815,
+ "grad_norm": 3.1263279914855957,
+ "learning_rate": 1.793601125397733e-06,
+ "loss": 0.3535,
+ "step": 7543
+ },
+ {
+ "epoch": 3.566903073286052,
+ "grad_norm": 2.986631393432617,
+ "learning_rate": 1.7930027415190587e-06,
+ "loss": 0.4251,
+ "step": 7544
+ },
+ {
+ "epoch": 3.5673758865248226,
+ "grad_norm": 2.6882247924804688,
+ "learning_rate": 1.7924044016620022e-06,
+ "loss": 0.3584,
+ "step": 7545
+ },
+ {
+ "epoch": 3.5678486997635934,
+ "grad_norm": 2.9358696937561035,
+ "learning_rate": 1.791806105863822e-06,
+ "loss": 0.3671,
+ "step": 7546
+ },
+ {
+ "epoch": 3.568321513002364,
+ "grad_norm": 2.774198055267334,
+ "learning_rate": 1.7912078541617704e-06,
+ "loss": 0.3505,
+ "step": 7547
+ },
+ {
+ "epoch": 3.568794326241135,
+ "grad_norm": 2.7384231090545654,
+ "learning_rate": 1.7906096465930964e-06,
+ "loss": 0.3992,
+ "step": 7548
+ },
+ {
+ "epoch": 3.5692671394799054,
+ "grad_norm": 2.8625354766845703,
+ "learning_rate": 1.7900114831950506e-06,
+ "loss": 0.3858,
+ "step": 7549
+ },
+ {
+ "epoch": 3.569739952718676,
+ "grad_norm": 2.737884044647217,
+ "learning_rate": 1.7894133640048761e-06,
+ "loss": 0.3973,
+ "step": 7550
+ },
+ {
+ "epoch": 3.570212765957447,
+ "grad_norm": 2.9817614555358887,
+ "learning_rate": 1.7888152890598154e-06,
+ "loss": 0.3613,
+ "step": 7551
+ },
+ {
+ "epoch": 3.5706855791962173,
+ "grad_norm": 2.760956287384033,
+ "learning_rate": 1.7882172583971081e-06,
+ "loss": 0.3645,
+ "step": 7552
+ },
+ {
+ "epoch": 3.571158392434988,
+ "grad_norm": 2.6867735385894775,
+ "learning_rate": 1.7876192720539908e-06,
+ "loss": 0.3771,
+ "step": 7553
+ },
+ {
+ "epoch": 3.571631205673759,
+ "grad_norm": 3.3362443447113037,
+ "learning_rate": 1.7870213300676986e-06,
+ "loss": 0.3989,
+ "step": 7554
+ },
+ {
+ "epoch": 3.5721040189125297,
+ "grad_norm": 2.8359227180480957,
+ "learning_rate": 1.7864234324754617e-06,
+ "loss": 0.3645,
+ "step": 7555
+ },
+ {
+ "epoch": 3.5725768321513005,
+ "grad_norm": 3.3070647716522217,
+ "learning_rate": 1.7858255793145076e-06,
+ "loss": 0.4128,
+ "step": 7556
+ },
+ {
+ "epoch": 3.573049645390071,
+ "grad_norm": 2.544879913330078,
+ "learning_rate": 1.7852277706220644e-06,
+ "loss": 0.3779,
+ "step": 7557
+ },
+ {
+ "epoch": 3.5735224586288417,
+ "grad_norm": 2.890796661376953,
+ "learning_rate": 1.7846300064353525e-06,
+ "loss": 0.373,
+ "step": 7558
+ },
+ {
+ "epoch": 3.5739952718676125,
+ "grad_norm": 2.9703400135040283,
+ "learning_rate": 1.7840322867915944e-06,
+ "loss": 0.3619,
+ "step": 7559
+ },
+ {
+ "epoch": 3.574468085106383,
+ "grad_norm": 3.0122430324554443,
+ "learning_rate": 1.7834346117280066e-06,
+ "loss": 0.4079,
+ "step": 7560
+ },
+ {
+ "epoch": 3.5749408983451536,
+ "grad_norm": 2.904963493347168,
+ "learning_rate": 1.7828369812818025e-06,
+ "loss": 0.38,
+ "step": 7561
+ },
+ {
+ "epoch": 3.5754137115839244,
+ "grad_norm": 3.0917439460754395,
+ "learning_rate": 1.7822393954901957e-06,
+ "loss": 0.383,
+ "step": 7562
+ },
+ {
+ "epoch": 3.575886524822695,
+ "grad_norm": 2.633920907974243,
+ "learning_rate": 1.7816418543903935e-06,
+ "loss": 0.3823,
+ "step": 7563
+ },
+ {
+ "epoch": 3.576359338061466,
+ "grad_norm": 2.9266390800476074,
+ "learning_rate": 1.781044358019604e-06,
+ "loss": 0.4642,
+ "step": 7564
+ },
+ {
+ "epoch": 3.5768321513002364,
+ "grad_norm": 2.878138780593872,
+ "learning_rate": 1.7804469064150299e-06,
+ "loss": 0.4056,
+ "step": 7565
+ },
+ {
+ "epoch": 3.577304964539007,
+ "grad_norm": 2.949370861053467,
+ "learning_rate": 1.7798494996138708e-06,
+ "loss": 0.3863,
+ "step": 7566
+ },
+ {
+ "epoch": 3.5777777777777775,
+ "grad_norm": 3.1444685459136963,
+ "learning_rate": 1.7792521376533264e-06,
+ "loss": 0.3611,
+ "step": 7567
+ },
+ {
+ "epoch": 3.5782505910165483,
+ "grad_norm": 3.0719716548919678,
+ "learning_rate": 1.7786548205705906e-06,
+ "loss": 0.3866,
+ "step": 7568
+ },
+ {
+ "epoch": 3.578723404255319,
+ "grad_norm": 3.155343770980835,
+ "learning_rate": 1.7780575484028566e-06,
+ "loss": 0.3896,
+ "step": 7569
+ },
+ {
+ "epoch": 3.57919621749409,
+ "grad_norm": 3.0447211265563965,
+ "learning_rate": 1.7774603211873138e-06,
+ "loss": 0.3713,
+ "step": 7570
+ },
+ {
+ "epoch": 3.5796690307328607,
+ "grad_norm": 2.8683619499206543,
+ "learning_rate": 1.7768631389611471e-06,
+ "loss": 0.3808,
+ "step": 7571
+ },
+ {
+ "epoch": 3.580141843971631,
+ "grad_norm": 3.1548070907592773,
+ "learning_rate": 1.776266001761543e-06,
+ "loss": 0.3714,
+ "step": 7572
+ },
+ {
+ "epoch": 3.580614657210402,
+ "grad_norm": 2.8699257373809814,
+ "learning_rate": 1.7756689096256816e-06,
+ "loss": 0.3694,
+ "step": 7573
+ },
+ {
+ "epoch": 3.5810874704491726,
+ "grad_norm": 2.834714412689209,
+ "learning_rate": 1.7750718625907398e-06,
+ "loss": 0.3935,
+ "step": 7574
+ },
+ {
+ "epoch": 3.581560283687943,
+ "grad_norm": 3.3828539848327637,
+ "learning_rate": 1.7744748606938957e-06,
+ "loss": 0.4783,
+ "step": 7575
+ },
+ {
+ "epoch": 3.582033096926714,
+ "grad_norm": 3.3892476558685303,
+ "learning_rate": 1.7738779039723202e-06,
+ "loss": 0.41,
+ "step": 7576
+ },
+ {
+ "epoch": 3.5825059101654846,
+ "grad_norm": 3.014289379119873,
+ "learning_rate": 1.7732809924631842e-06,
+ "loss": 0.3516,
+ "step": 7577
+ },
+ {
+ "epoch": 3.5829787234042554,
+ "grad_norm": 3.477212429046631,
+ "learning_rate": 1.772684126203654e-06,
+ "loss": 0.4144,
+ "step": 7578
+ },
+ {
+ "epoch": 3.583451536643026,
+ "grad_norm": 2.9156792163848877,
+ "learning_rate": 1.772087305230893e-06,
+ "loss": 0.3772,
+ "step": 7579
+ },
+ {
+ "epoch": 3.5839243498817965,
+ "grad_norm": 2.639169931411743,
+ "learning_rate": 1.7714905295820651e-06,
+ "loss": 0.3487,
+ "step": 7580
+ },
+ {
+ "epoch": 3.5843971631205673,
+ "grad_norm": 3.196894407272339,
+ "learning_rate": 1.7708937992943263e-06,
+ "loss": 0.4852,
+ "step": 7581
+ },
+ {
+ "epoch": 3.584869976359338,
+ "grad_norm": 2.9140779972076416,
+ "learning_rate": 1.7702971144048347e-06,
+ "loss": 0.3703,
+ "step": 7582
+ },
+ {
+ "epoch": 3.5853427895981085,
+ "grad_norm": 3.3844895362854004,
+ "learning_rate": 1.7697004749507418e-06,
+ "loss": 0.4227,
+ "step": 7583
+ },
+ {
+ "epoch": 3.5858156028368793,
+ "grad_norm": 3.080061912536621,
+ "learning_rate": 1.769103880969198e-06,
+ "loss": 0.4237,
+ "step": 7584
+ },
+ {
+ "epoch": 3.58628841607565,
+ "grad_norm": 3.037505865097046,
+ "learning_rate": 1.7685073324973506e-06,
+ "loss": 0.3902,
+ "step": 7585
+ },
+ {
+ "epoch": 3.586761229314421,
+ "grad_norm": 3.6563873291015625,
+ "learning_rate": 1.7679108295723436e-06,
+ "loss": 0.3956,
+ "step": 7586
+ },
+ {
+ "epoch": 3.5872340425531917,
+ "grad_norm": 3.158935546875,
+ "learning_rate": 1.76731437223132e-06,
+ "loss": 0.3898,
+ "step": 7587
+ },
+ {
+ "epoch": 3.587706855791962,
+ "grad_norm": 3.059199571609497,
+ "learning_rate": 1.7667179605114176e-06,
+ "loss": 0.4183,
+ "step": 7588
+ },
+ {
+ "epoch": 3.588179669030733,
+ "grad_norm": 2.8123233318328857,
+ "learning_rate": 1.7661215944497716e-06,
+ "loss": 0.3731,
+ "step": 7589
+ },
+ {
+ "epoch": 3.5886524822695036,
+ "grad_norm": 3.094287633895874,
+ "learning_rate": 1.7655252740835169e-06,
+ "loss": 0.4562,
+ "step": 7590
+ },
+ {
+ "epoch": 3.589125295508274,
+ "grad_norm": 2.886833667755127,
+ "learning_rate": 1.7649289994497822e-06,
+ "loss": 0.4178,
+ "step": 7591
+ },
+ {
+ "epoch": 3.5895981087470448,
+ "grad_norm": 3.3040647506713867,
+ "learning_rate": 1.764332770585696e-06,
+ "loss": 0.4311,
+ "step": 7592
+ },
+ {
+ "epoch": 3.5900709219858156,
+ "grad_norm": 2.7948951721191406,
+ "learning_rate": 1.7637365875283827e-06,
+ "loss": 0.3704,
+ "step": 7593
+ },
+ {
+ "epoch": 3.5905437352245864,
+ "grad_norm": 3.092221975326538,
+ "learning_rate": 1.7631404503149623e-06,
+ "loss": 0.4166,
+ "step": 7594
+ },
+ {
+ "epoch": 3.591016548463357,
+ "grad_norm": 3.6018600463867188,
+ "learning_rate": 1.7625443589825564e-06,
+ "loss": 0.4251,
+ "step": 7595
+ },
+ {
+ "epoch": 3.5914893617021275,
+ "grad_norm": 2.708017110824585,
+ "learning_rate": 1.7619483135682791e-06,
+ "loss": 0.3775,
+ "step": 7596
+ },
+ {
+ "epoch": 3.5919621749408983,
+ "grad_norm": 2.8069381713867188,
+ "learning_rate": 1.7613523141092438e-06,
+ "loss": 0.3929,
+ "step": 7597
+ },
+ {
+ "epoch": 3.592434988179669,
+ "grad_norm": 3.097787380218506,
+ "learning_rate": 1.7607563606425616e-06,
+ "loss": 0.3992,
+ "step": 7598
+ },
+ {
+ "epoch": 3.5929078014184395,
+ "grad_norm": 2.9691715240478516,
+ "learning_rate": 1.7601604532053385e-06,
+ "loss": 0.4001,
+ "step": 7599
+ },
+ {
+ "epoch": 3.5933806146572103,
+ "grad_norm": 2.5511624813079834,
+ "learning_rate": 1.7595645918346807e-06,
+ "loss": 0.3136,
+ "step": 7600
+ },
+ {
+ "epoch": 3.593853427895981,
+ "grad_norm": 2.4688427448272705,
+ "learning_rate": 1.7589687765676891e-06,
+ "loss": 0.3922,
+ "step": 7601
+ },
+ {
+ "epoch": 3.594326241134752,
+ "grad_norm": 3.004023790359497,
+ "learning_rate": 1.7583730074414613e-06,
+ "loss": 0.4203,
+ "step": 7602
+ },
+ {
+ "epoch": 3.5947990543735227,
+ "grad_norm": 2.902641773223877,
+ "learning_rate": 1.7577772844930957e-06,
+ "loss": 0.3855,
+ "step": 7603
+ },
+ {
+ "epoch": 3.595271867612293,
+ "grad_norm": 3.851375102996826,
+ "learning_rate": 1.7571816077596826e-06,
+ "loss": 0.3769,
+ "step": 7604
+ },
+ {
+ "epoch": 3.595744680851064,
+ "grad_norm": 3.03249192237854,
+ "learning_rate": 1.756585977278315e-06,
+ "loss": 0.3448,
+ "step": 7605
+ },
+ {
+ "epoch": 3.5962174940898346,
+ "grad_norm": 2.992363214492798,
+ "learning_rate": 1.7559903930860789e-06,
+ "loss": 0.3893,
+ "step": 7606
+ },
+ {
+ "epoch": 3.596690307328605,
+ "grad_norm": 2.9322855472564697,
+ "learning_rate": 1.7553948552200577e-06,
+ "loss": 0.4337,
+ "step": 7607
+ },
+ {
+ "epoch": 3.5971631205673757,
+ "grad_norm": 3.2564096450805664,
+ "learning_rate": 1.7547993637173347e-06,
+ "loss": 0.3943,
+ "step": 7608
+ },
+ {
+ "epoch": 3.5976359338061465,
+ "grad_norm": 2.9988484382629395,
+ "learning_rate": 1.7542039186149867e-06,
+ "loss": 0.3421,
+ "step": 7609
+ },
+ {
+ "epoch": 3.5981087470449173,
+ "grad_norm": 2.8188817501068115,
+ "learning_rate": 1.7536085199500914e-06,
+ "loss": 0.3657,
+ "step": 7610
+ },
+ {
+ "epoch": 3.598581560283688,
+ "grad_norm": 3.0583255290985107,
+ "learning_rate": 1.7530131677597206e-06,
+ "loss": 0.4036,
+ "step": 7611
+ },
+ {
+ "epoch": 3.5990543735224585,
+ "grad_norm": 2.8700921535491943,
+ "learning_rate": 1.7524178620809435e-06,
+ "loss": 0.3928,
+ "step": 7612
+ },
+ {
+ "epoch": 3.5995271867612293,
+ "grad_norm": 3.4497945308685303,
+ "learning_rate": 1.751822602950829e-06,
+ "loss": 0.3517,
+ "step": 7613
+ },
+ {
+ "epoch": 3.6,
+ "grad_norm": 3.334191083908081,
+ "learning_rate": 1.75122739040644e-06,
+ "loss": 0.3414,
+ "step": 7614
+ },
+ {
+ "epoch": 3.6004728132387704,
+ "grad_norm": 3.1435158252716064,
+ "learning_rate": 1.7506322244848387e-06,
+ "loss": 0.4075,
+ "step": 7615
+ },
+ {
+ "epoch": 3.6009456264775412,
+ "grad_norm": 3.178990125656128,
+ "learning_rate": 1.7500371052230824e-06,
+ "loss": 0.4688,
+ "step": 7616
+ },
+ {
+ "epoch": 3.601418439716312,
+ "grad_norm": 2.9292044639587402,
+ "learning_rate": 1.7494420326582267e-06,
+ "loss": 0.3882,
+ "step": 7617
+ },
+ {
+ "epoch": 3.601891252955083,
+ "grad_norm": 2.6899197101593018,
+ "learning_rate": 1.7488470068273256e-06,
+ "loss": 0.3916,
+ "step": 7618
+ },
+ {
+ "epoch": 3.6023640661938536,
+ "grad_norm": 2.8319191932678223,
+ "learning_rate": 1.7482520277674273e-06,
+ "loss": 0.3924,
+ "step": 7619
+ },
+ {
+ "epoch": 3.602836879432624,
+ "grad_norm": 2.74589204788208,
+ "learning_rate": 1.747657095515578e-06,
+ "loss": 0.2911,
+ "step": 7620
+ },
+ {
+ "epoch": 3.603309692671395,
+ "grad_norm": 2.857028007507324,
+ "learning_rate": 1.7470622101088233e-06,
+ "loss": 0.3618,
+ "step": 7621
+ },
+ {
+ "epoch": 3.6037825059101656,
+ "grad_norm": 3.3715617656707764,
+ "learning_rate": 1.746467371584203e-06,
+ "loss": 0.4186,
+ "step": 7622
+ },
+ {
+ "epoch": 3.604255319148936,
+ "grad_norm": 2.839526414871216,
+ "learning_rate": 1.745872579978755e-06,
+ "loss": 0.4088,
+ "step": 7623
+ },
+ {
+ "epoch": 3.6047281323877067,
+ "grad_norm": 3.7689156532287598,
+ "learning_rate": 1.7452778353295155e-06,
+ "loss": 0.4748,
+ "step": 7624
+ },
+ {
+ "epoch": 3.6052009456264775,
+ "grad_norm": 2.9345123767852783,
+ "learning_rate": 1.7446831376735152e-06,
+ "loss": 0.4117,
+ "step": 7625
+ },
+ {
+ "epoch": 3.6056737588652483,
+ "grad_norm": 2.7898924350738525,
+ "learning_rate": 1.7440884870477845e-06,
+ "loss": 0.3515,
+ "step": 7626
+ },
+ {
+ "epoch": 3.606146572104019,
+ "grad_norm": 3.4268569946289062,
+ "learning_rate": 1.7434938834893481e-06,
+ "loss": 0.4051,
+ "step": 7627
+ },
+ {
+ "epoch": 3.6066193853427895,
+ "grad_norm": 3.019066095352173,
+ "learning_rate": 1.7428993270352311e-06,
+ "loss": 0.4128,
+ "step": 7628
+ },
+ {
+ "epoch": 3.6070921985815603,
+ "grad_norm": 3.1277568340301514,
+ "learning_rate": 1.742304817722454e-06,
+ "loss": 0.37,
+ "step": 7629
+ },
+ {
+ "epoch": 3.607565011820331,
+ "grad_norm": 2.924818277359009,
+ "learning_rate": 1.7417103555880318e-06,
+ "loss": 0.3792,
+ "step": 7630
+ },
+ {
+ "epoch": 3.6080378250591014,
+ "grad_norm": 2.664699077606201,
+ "learning_rate": 1.7411159406689821e-06,
+ "loss": 0.3584,
+ "step": 7631
+ },
+ {
+ "epoch": 3.608510638297872,
+ "grad_norm": 3.223729133605957,
+ "learning_rate": 1.7405215730023144e-06,
+ "loss": 0.3956,
+ "step": 7632
+ },
+ {
+ "epoch": 3.608983451536643,
+ "grad_norm": 2.934225559234619,
+ "learning_rate": 1.7399272526250388e-06,
+ "loss": 0.4179,
+ "step": 7633
+ },
+ {
+ "epoch": 3.609456264775414,
+ "grad_norm": 2.833798885345459,
+ "learning_rate": 1.7393329795741603e-06,
+ "loss": 0.3283,
+ "step": 7634
+ },
+ {
+ "epoch": 3.6099290780141846,
+ "grad_norm": 3.008798837661743,
+ "learning_rate": 1.738738753886681e-06,
+ "loss": 0.3704,
+ "step": 7635
+ },
+ {
+ "epoch": 3.610401891252955,
+ "grad_norm": 2.8714520931243896,
+ "learning_rate": 1.7381445755996023e-06,
+ "loss": 0.3646,
+ "step": 7636
+ },
+ {
+ "epoch": 3.6108747044917258,
+ "grad_norm": 3.083554267883301,
+ "learning_rate": 1.7375504447499193e-06,
+ "loss": 0.3785,
+ "step": 7637
+ },
+ {
+ "epoch": 3.6113475177304966,
+ "grad_norm": 3.270347833633423,
+ "learning_rate": 1.7369563613746277e-06,
+ "loss": 0.4426,
+ "step": 7638
+ },
+ {
+ "epoch": 3.611820330969267,
+ "grad_norm": 2.7754862308502197,
+ "learning_rate": 1.7363623255107175e-06,
+ "loss": 0.3448,
+ "step": 7639
+ },
+ {
+ "epoch": 3.6122931442080377,
+ "grad_norm": 2.98140025138855,
+ "learning_rate": 1.7357683371951767e-06,
+ "loss": 0.4027,
+ "step": 7640
+ },
+ {
+ "epoch": 3.6127659574468085,
+ "grad_norm": 3.1640074253082275,
+ "learning_rate": 1.7351743964649908e-06,
+ "loss": 0.3913,
+ "step": 7641
+ },
+ {
+ "epoch": 3.6132387706855793,
+ "grad_norm": 2.758202075958252,
+ "learning_rate": 1.7345805033571417e-06,
+ "loss": 0.4148,
+ "step": 7642
+ },
+ {
+ "epoch": 3.61371158392435,
+ "grad_norm": 3.1030571460723877,
+ "learning_rate": 1.7339866579086074e-06,
+ "loss": 0.4002,
+ "step": 7643
+ },
+ {
+ "epoch": 3.6141843971631205,
+ "grad_norm": 3.2414135932922363,
+ "learning_rate": 1.733392860156366e-06,
+ "loss": 0.4732,
+ "step": 7644
+ },
+ {
+ "epoch": 3.6146572104018913,
+ "grad_norm": 2.8720390796661377,
+ "learning_rate": 1.7327991101373886e-06,
+ "loss": 0.4112,
+ "step": 7645
+ },
+ {
+ "epoch": 3.615130023640662,
+ "grad_norm": 3.0104875564575195,
+ "learning_rate": 1.7322054078886474e-06,
+ "loss": 0.3934,
+ "step": 7646
+ },
+ {
+ "epoch": 3.6156028368794324,
+ "grad_norm": 2.8615126609802246,
+ "learning_rate": 1.7316117534471091e-06,
+ "loss": 0.3437,
+ "step": 7647
+ },
+ {
+ "epoch": 3.616075650118203,
+ "grad_norm": 2.8283586502075195,
+ "learning_rate": 1.7310181468497369e-06,
+ "loss": 0.374,
+ "step": 7648
+ },
+ {
+ "epoch": 3.616548463356974,
+ "grad_norm": 3.2289321422576904,
+ "learning_rate": 1.7304245881334935e-06,
+ "loss": 0.3899,
+ "step": 7649
+ },
+ {
+ "epoch": 3.617021276595745,
+ "grad_norm": 3.126882791519165,
+ "learning_rate": 1.7298310773353356e-06,
+ "loss": 0.388,
+ "step": 7650
+ },
+ {
+ "epoch": 3.6174940898345156,
+ "grad_norm": 3.013657569885254,
+ "learning_rate": 1.7292376144922201e-06,
+ "loss": 0.379,
+ "step": 7651
+ },
+ {
+ "epoch": 3.617966903073286,
+ "grad_norm": 3.070192337036133,
+ "learning_rate": 1.7286441996410989e-06,
+ "loss": 0.3801,
+ "step": 7652
+ },
+ {
+ "epoch": 3.6184397163120567,
+ "grad_norm": 2.805380344390869,
+ "learning_rate": 1.7280508328189199e-06,
+ "loss": 0.3577,
+ "step": 7653
+ },
+ {
+ "epoch": 3.6189125295508275,
+ "grad_norm": 3.2853379249572754,
+ "learning_rate": 1.7274575140626318e-06,
+ "loss": 0.4168,
+ "step": 7654
+ },
+ {
+ "epoch": 3.619385342789598,
+ "grad_norm": 3.16316819190979,
+ "learning_rate": 1.7268642434091761e-06,
+ "loss": 0.425,
+ "step": 7655
+ },
+ {
+ "epoch": 3.6198581560283687,
+ "grad_norm": 3.2971179485321045,
+ "learning_rate": 1.7262710208954947e-06,
+ "loss": 0.3884,
+ "step": 7656
+ },
+ {
+ "epoch": 3.6203309692671395,
+ "grad_norm": 3.1823747158050537,
+ "learning_rate": 1.725677846558524e-06,
+ "loss": 0.3419,
+ "step": 7657
+ },
+ {
+ "epoch": 3.6208037825059103,
+ "grad_norm": 3.114654779434204,
+ "learning_rate": 1.7250847204351973e-06,
+ "loss": 0.3951,
+ "step": 7658
+ },
+ {
+ "epoch": 3.621276595744681,
+ "grad_norm": 3.0272440910339355,
+ "learning_rate": 1.7244916425624482e-06,
+ "loss": 0.4102,
+ "step": 7659
+ },
+ {
+ "epoch": 3.6217494089834514,
+ "grad_norm": 2.973611354827881,
+ "learning_rate": 1.7238986129772035e-06,
+ "loss": 0.3827,
+ "step": 7660
+ },
+ {
+ "epoch": 3.6222222222222222,
+ "grad_norm": 3.063713312149048,
+ "learning_rate": 1.7233056317163894e-06,
+ "loss": 0.3909,
+ "step": 7661
+ },
+ {
+ "epoch": 3.622695035460993,
+ "grad_norm": 3.203725576400757,
+ "learning_rate": 1.7227126988169283e-06,
+ "loss": 0.3933,
+ "step": 7662
+ },
+ {
+ "epoch": 3.6231678486997634,
+ "grad_norm": 2.945887327194214,
+ "learning_rate": 1.7221198143157386e-06,
+ "loss": 0.3722,
+ "step": 7663
+ },
+ {
+ "epoch": 3.623640661938534,
+ "grad_norm": 3.042691469192505,
+ "learning_rate": 1.7215269782497373e-06,
+ "loss": 0.4108,
+ "step": 7664
+ },
+ {
+ "epoch": 3.624113475177305,
+ "grad_norm": 2.8496763706207275,
+ "learning_rate": 1.720934190655837e-06,
+ "loss": 0.3867,
+ "step": 7665
+ },
+ {
+ "epoch": 3.6245862884160758,
+ "grad_norm": 2.7017154693603516,
+ "learning_rate": 1.7203414515709493e-06,
+ "loss": 0.3246,
+ "step": 7666
+ },
+ {
+ "epoch": 3.6250591016548466,
+ "grad_norm": 2.66630482673645,
+ "learning_rate": 1.7197487610319808e-06,
+ "loss": 0.365,
+ "step": 7667
+ },
+ {
+ "epoch": 3.625531914893617,
+ "grad_norm": 2.8724591732025146,
+ "learning_rate": 1.7191561190758348e-06,
+ "loss": 0.3361,
+ "step": 7668
+ },
+ {
+ "epoch": 3.6260047281323877,
+ "grad_norm": 3.1413803100585938,
+ "learning_rate": 1.7185635257394143e-06,
+ "loss": 0.3949,
+ "step": 7669
+ },
+ {
+ "epoch": 3.6264775413711585,
+ "grad_norm": 2.9866268634796143,
+ "learning_rate": 1.7179709810596163e-06,
+ "loss": 0.3728,
+ "step": 7670
+ },
+ {
+ "epoch": 3.626950354609929,
+ "grad_norm": 3.003497838973999,
+ "learning_rate": 1.717378485073336e-06,
+ "loss": 0.384,
+ "step": 7671
+ },
+ {
+ "epoch": 3.6274231678486997,
+ "grad_norm": 3.0043468475341797,
+ "learning_rate": 1.716786037817466e-06,
+ "loss": 0.3432,
+ "step": 7672
+ },
+ {
+ "epoch": 3.6278959810874705,
+ "grad_norm": 3.216550827026367,
+ "learning_rate": 1.7161936393288945e-06,
+ "loss": 0.3963,
+ "step": 7673
+ },
+ {
+ "epoch": 3.6283687943262413,
+ "grad_norm": 3.1091387271881104,
+ "learning_rate": 1.715601289644509e-06,
+ "loss": 0.4347,
+ "step": 7674
+ },
+ {
+ "epoch": 3.628841607565012,
+ "grad_norm": 3.2288286685943604,
+ "learning_rate": 1.7150089888011916e-06,
+ "loss": 0.4291,
+ "step": 7675
+ },
+ {
+ "epoch": 3.6293144208037824,
+ "grad_norm": 2.943941831588745,
+ "learning_rate": 1.7144167368358216e-06,
+ "loss": 0.3643,
+ "step": 7676
+ },
+ {
+ "epoch": 3.629787234042553,
+ "grad_norm": 2.819683313369751,
+ "learning_rate": 1.7138245337852774e-06,
+ "loss": 0.4051,
+ "step": 7677
+ },
+ {
+ "epoch": 3.630260047281324,
+ "grad_norm": 2.9988269805908203,
+ "learning_rate": 1.713232379686432e-06,
+ "loss": 0.4102,
+ "step": 7678
+ },
+ {
+ "epoch": 3.6307328605200944,
+ "grad_norm": 3.0041310787200928,
+ "learning_rate": 1.7126402745761566e-06,
+ "loss": 0.3854,
+ "step": 7679
+ },
+ {
+ "epoch": 3.631205673758865,
+ "grad_norm": 2.8700194358825684,
+ "learning_rate": 1.7120482184913192e-06,
+ "loss": 0.3441,
+ "step": 7680
+ },
+ {
+ "epoch": 3.631678486997636,
+ "grad_norm": 3.5275180339813232,
+ "learning_rate": 1.7114562114687833e-06,
+ "loss": 0.3808,
+ "step": 7681
+ },
+ {
+ "epoch": 3.6321513002364068,
+ "grad_norm": 3.182326078414917,
+ "learning_rate": 1.710864253545412e-06,
+ "loss": 0.4178,
+ "step": 7682
+ },
+ {
+ "epoch": 3.6326241134751776,
+ "grad_norm": 3.0514512062072754,
+ "learning_rate": 1.7102723447580627e-06,
+ "loss": 0.3527,
+ "step": 7683
+ },
+ {
+ "epoch": 3.633096926713948,
+ "grad_norm": 2.8293066024780273,
+ "learning_rate": 1.7096804851435922e-06,
+ "loss": 0.3723,
+ "step": 7684
+ },
+ {
+ "epoch": 3.6335697399527187,
+ "grad_norm": 2.9601097106933594,
+ "learning_rate": 1.709088674738853e-06,
+ "loss": 0.3704,
+ "step": 7685
+ },
+ {
+ "epoch": 3.6340425531914895,
+ "grad_norm": 2.8070995807647705,
+ "learning_rate": 1.7084969135806933e-06,
+ "loss": 0.346,
+ "step": 7686
+ },
+ {
+ "epoch": 3.63451536643026,
+ "grad_norm": 3.0162715911865234,
+ "learning_rate": 1.70790520170596e-06,
+ "loss": 0.39,
+ "step": 7687
+ },
+ {
+ "epoch": 3.6349881796690307,
+ "grad_norm": 3.018763780593872,
+ "learning_rate": 1.7073135391514967e-06,
+ "loss": 0.4621,
+ "step": 7688
+ },
+ {
+ "epoch": 3.6354609929078014,
+ "grad_norm": 2.963604688644409,
+ "learning_rate": 1.706721925954144e-06,
+ "loss": 0.339,
+ "step": 7689
+ },
+ {
+ "epoch": 3.6359338061465722,
+ "grad_norm": 2.8532896041870117,
+ "learning_rate": 1.7061303621507383e-06,
+ "loss": 0.3915,
+ "step": 7690
+ },
+ {
+ "epoch": 3.636406619385343,
+ "grad_norm": 3.248006820678711,
+ "learning_rate": 1.7055388477781133e-06,
+ "loss": 0.3712,
+ "step": 7691
+ },
+ {
+ "epoch": 3.6368794326241134,
+ "grad_norm": 3.2195777893066406,
+ "learning_rate": 1.7049473828731011e-06,
+ "loss": 0.4358,
+ "step": 7692
+ },
+ {
+ "epoch": 3.637352245862884,
+ "grad_norm": 2.7190768718719482,
+ "learning_rate": 1.7043559674725296e-06,
+ "loss": 0.341,
+ "step": 7693
+ },
+ {
+ "epoch": 3.637825059101655,
+ "grad_norm": 2.6047232151031494,
+ "learning_rate": 1.7037646016132223e-06,
+ "loss": 0.3513,
+ "step": 7694
+ },
+ {
+ "epoch": 3.6382978723404253,
+ "grad_norm": 3.0824201107025146,
+ "learning_rate": 1.7031732853320026e-06,
+ "loss": 0.4097,
+ "step": 7695
+ },
+ {
+ "epoch": 3.638770685579196,
+ "grad_norm": 2.845461130142212,
+ "learning_rate": 1.7025820186656883e-06,
+ "loss": 0.3395,
+ "step": 7696
+ },
+ {
+ "epoch": 3.639243498817967,
+ "grad_norm": 2.937863826751709,
+ "learning_rate": 1.7019908016510953e-06,
+ "loss": 0.395,
+ "step": 7697
+ },
+ {
+ "epoch": 3.6397163120567377,
+ "grad_norm": 3.349780559539795,
+ "learning_rate": 1.701399634325036e-06,
+ "loss": 0.3889,
+ "step": 7698
+ },
+ {
+ "epoch": 3.6401891252955085,
+ "grad_norm": 2.8527066707611084,
+ "learning_rate": 1.7008085167243187e-06,
+ "loss": 0.3753,
+ "step": 7699
+ },
+ {
+ "epoch": 3.640661938534279,
+ "grad_norm": 2.8112385272979736,
+ "learning_rate": 1.7002174488857517e-06,
+ "loss": 0.3912,
+ "step": 7700
+ },
+ {
+ "epoch": 3.6411347517730497,
+ "grad_norm": 2.731933832168579,
+ "learning_rate": 1.6996264308461363e-06,
+ "loss": 0.4142,
+ "step": 7701
+ },
+ {
+ "epoch": 3.6416075650118205,
+ "grad_norm": 3.70465350151062,
+ "learning_rate": 1.6990354626422744e-06,
+ "loss": 0.4089,
+ "step": 7702
+ },
+ {
+ "epoch": 3.642080378250591,
+ "grad_norm": 2.8656258583068848,
+ "learning_rate": 1.698444544310962e-06,
+ "loss": 0.3771,
+ "step": 7703
+ },
+ {
+ "epoch": 3.6425531914893616,
+ "grad_norm": 2.878830671310425,
+ "learning_rate": 1.697853675888993e-06,
+ "loss": 0.3754,
+ "step": 7704
+ },
+ {
+ "epoch": 3.6430260047281324,
+ "grad_norm": 3.440528154373169,
+ "learning_rate": 1.6972628574131586e-06,
+ "loss": 0.4543,
+ "step": 7705
+ },
+ {
+ "epoch": 3.6434988179669032,
+ "grad_norm": 2.70736026763916,
+ "learning_rate": 1.6966720889202451e-06,
+ "loss": 0.4049,
+ "step": 7706
+ },
+ {
+ "epoch": 3.643971631205674,
+ "grad_norm": 2.787992238998413,
+ "learning_rate": 1.6960813704470391e-06,
+ "loss": 0.3854,
+ "step": 7707
+ },
+ {
+ "epoch": 3.6444444444444444,
+ "grad_norm": 2.631490707397461,
+ "learning_rate": 1.6954907020303213e-06,
+ "loss": 0.3775,
+ "step": 7708
+ },
+ {
+ "epoch": 3.644917257683215,
+ "grad_norm": 3.052255392074585,
+ "learning_rate": 1.6949000837068685e-06,
+ "loss": 0.3873,
+ "step": 7709
+ },
+ {
+ "epoch": 3.645390070921986,
+ "grad_norm": 2.7443203926086426,
+ "learning_rate": 1.6943095155134586e-06,
+ "loss": 0.3362,
+ "step": 7710
+ },
+ {
+ "epoch": 3.6458628841607563,
+ "grad_norm": 2.931688070297241,
+ "learning_rate": 1.6937189974868618e-06,
+ "loss": 0.3839,
+ "step": 7711
+ },
+ {
+ "epoch": 3.646335697399527,
+ "grad_norm": 2.950242757797241,
+ "learning_rate": 1.6931285296638479e-06,
+ "loss": 0.3552,
+ "step": 7712
+ },
+ {
+ "epoch": 3.646808510638298,
+ "grad_norm": 2.940735340118408,
+ "learning_rate": 1.6925381120811823e-06,
+ "loss": 0.3881,
+ "step": 7713
+ },
+ {
+ "epoch": 3.6472813238770687,
+ "grad_norm": 2.771355390548706,
+ "learning_rate": 1.6919477447756273e-06,
+ "loss": 0.3578,
+ "step": 7714
+ },
+ {
+ "epoch": 3.6477541371158395,
+ "grad_norm": 2.919004201889038,
+ "learning_rate": 1.6913574277839435e-06,
+ "loss": 0.3971,
+ "step": 7715
+ },
+ {
+ "epoch": 3.64822695035461,
+ "grad_norm": 3.293705463409424,
+ "learning_rate": 1.6907671611428872e-06,
+ "loss": 0.422,
+ "step": 7716
+ },
+ {
+ "epoch": 3.6486997635933807,
+ "grad_norm": 2.744239091873169,
+ "learning_rate": 1.6901769448892103e-06,
+ "loss": 0.398,
+ "step": 7717
+ },
+ {
+ "epoch": 3.6491725768321515,
+ "grad_norm": 3.1726129055023193,
+ "learning_rate": 1.689586779059665e-06,
+ "loss": 0.39,
+ "step": 7718
+ },
+ {
+ "epoch": 3.649645390070922,
+ "grad_norm": 3.146743059158325,
+ "learning_rate": 1.688996663690997e-06,
+ "loss": 0.4059,
+ "step": 7719
+ },
+ {
+ "epoch": 3.6501182033096926,
+ "grad_norm": 2.941025495529175,
+ "learning_rate": 1.688406598819951e-06,
+ "loss": 0.3479,
+ "step": 7720
+ },
+ {
+ "epoch": 3.6505910165484634,
+ "grad_norm": 3.3480939865112305,
+ "learning_rate": 1.6878165844832679e-06,
+ "loss": 0.4141,
+ "step": 7721
+ },
+ {
+ "epoch": 3.651063829787234,
+ "grad_norm": 2.9145030975341797,
+ "learning_rate": 1.6872266207176833e-06,
+ "loss": 0.3497,
+ "step": 7722
+ },
+ {
+ "epoch": 3.651536643026005,
+ "grad_norm": 3.119502067565918,
+ "learning_rate": 1.686636707559934e-06,
+ "loss": 0.424,
+ "step": 7723
+ },
+ {
+ "epoch": 3.6520094562647754,
+ "grad_norm": 3.0867667198181152,
+ "learning_rate": 1.6860468450467497e-06,
+ "loss": 0.3998,
+ "step": 7724
+ },
+ {
+ "epoch": 3.652482269503546,
+ "grad_norm": 2.9128987789154053,
+ "learning_rate": 1.6854570332148602e-06,
+ "loss": 0.4043,
+ "step": 7725
+ },
+ {
+ "epoch": 3.652955082742317,
+ "grad_norm": 2.9973206520080566,
+ "learning_rate": 1.6848672721009896e-06,
+ "loss": 0.3395,
+ "step": 7726
+ },
+ {
+ "epoch": 3.6534278959810873,
+ "grad_norm": 2.824916124343872,
+ "learning_rate": 1.6842775617418591e-06,
+ "loss": 0.4102,
+ "step": 7727
+ },
+ {
+ "epoch": 3.653900709219858,
+ "grad_norm": 2.7984440326690674,
+ "learning_rate": 1.6836879021741887e-06,
+ "loss": 0.3823,
+ "step": 7728
+ },
+ {
+ "epoch": 3.654373522458629,
+ "grad_norm": 2.8412179946899414,
+ "learning_rate": 1.6830982934346917e-06,
+ "loss": 0.3755,
+ "step": 7729
+ },
+ {
+ "epoch": 3.6548463356973997,
+ "grad_norm": 3.1677138805389404,
+ "learning_rate": 1.6825087355600836e-06,
+ "loss": 0.4224,
+ "step": 7730
+ },
+ {
+ "epoch": 3.65531914893617,
+ "grad_norm": 3.097085475921631,
+ "learning_rate": 1.6819192285870718e-06,
+ "loss": 0.4103,
+ "step": 7731
+ },
+ {
+ "epoch": 3.655791962174941,
+ "grad_norm": 2.9802496433258057,
+ "learning_rate": 1.6813297725523613e-06,
+ "loss": 0.4297,
+ "step": 7732
+ },
+ {
+ "epoch": 3.6562647754137116,
+ "grad_norm": 3.0135059356689453,
+ "learning_rate": 1.680740367492657e-06,
+ "loss": 0.4526,
+ "step": 7733
+ },
+ {
+ "epoch": 3.656737588652482,
+ "grad_norm": 2.7776739597320557,
+ "learning_rate": 1.6801510134446575e-06,
+ "loss": 0.3924,
+ "step": 7734
+ },
+ {
+ "epoch": 3.657210401891253,
+ "grad_norm": 2.7500126361846924,
+ "learning_rate": 1.6795617104450595e-06,
+ "loss": 0.3785,
+ "step": 7735
+ },
+ {
+ "epoch": 3.6576832151300236,
+ "grad_norm": 3.494142770767212,
+ "learning_rate": 1.6789724585305566e-06,
+ "loss": 0.3483,
+ "step": 7736
+ },
+ {
+ "epoch": 3.6581560283687944,
+ "grad_norm": 3.055081605911255,
+ "learning_rate": 1.6783832577378377e-06,
+ "loss": 0.4481,
+ "step": 7737
+ },
+ {
+ "epoch": 3.658628841607565,
+ "grad_norm": 2.781412124633789,
+ "learning_rate": 1.6777941081035914e-06,
+ "loss": 0.3969,
+ "step": 7738
+ },
+ {
+ "epoch": 3.6591016548463355,
+ "grad_norm": 3.1672184467315674,
+ "learning_rate": 1.677205009664501e-06,
+ "loss": 0.3959,
+ "step": 7739
+ },
+ {
+ "epoch": 3.6595744680851063,
+ "grad_norm": 3.0597715377807617,
+ "learning_rate": 1.6766159624572458e-06,
+ "loss": 0.418,
+ "step": 7740
+ },
+ {
+ "epoch": 3.660047281323877,
+ "grad_norm": 3.2906267642974854,
+ "learning_rate": 1.676026966518505e-06,
+ "loss": 0.4335,
+ "step": 7741
+ },
+ {
+ "epoch": 3.6605200945626475,
+ "grad_norm": 3.2519290447235107,
+ "learning_rate": 1.6754380218849515e-06,
+ "loss": 0.3786,
+ "step": 7742
+ },
+ {
+ "epoch": 3.6609929078014183,
+ "grad_norm": 3.24716854095459,
+ "learning_rate": 1.6748491285932572e-06,
+ "loss": 0.3599,
+ "step": 7743
+ },
+ {
+ "epoch": 3.661465721040189,
+ "grad_norm": 3.2940993309020996,
+ "learning_rate": 1.6742602866800897e-06,
+ "loss": 0.3934,
+ "step": 7744
+ },
+ {
+ "epoch": 3.66193853427896,
+ "grad_norm": 2.917409896850586,
+ "learning_rate": 1.6736714961821124e-06,
+ "loss": 0.4197,
+ "step": 7745
+ },
+ {
+ "epoch": 3.6624113475177307,
+ "grad_norm": 3.005068063735962,
+ "learning_rate": 1.6730827571359887e-06,
+ "loss": 0.4239,
+ "step": 7746
+ },
+ {
+ "epoch": 3.662884160756501,
+ "grad_norm": 2.751880168914795,
+ "learning_rate": 1.6724940695783745e-06,
+ "loss": 0.4257,
+ "step": 7747
+ },
+ {
+ "epoch": 3.663356973995272,
+ "grad_norm": 3.090670585632324,
+ "learning_rate": 1.6719054335459273e-06,
+ "loss": 0.3686,
+ "step": 7748
+ },
+ {
+ "epoch": 3.6638297872340426,
+ "grad_norm": 3.250251293182373,
+ "learning_rate": 1.6713168490752974e-06,
+ "loss": 0.4249,
+ "step": 7749
+ },
+ {
+ "epoch": 3.664302600472813,
+ "grad_norm": 2.8662827014923096,
+ "learning_rate": 1.6707283162031335e-06,
+ "loss": 0.3692,
+ "step": 7750
+ },
+ {
+ "epoch": 3.6647754137115838,
+ "grad_norm": 2.8709118366241455,
+ "learning_rate": 1.6701398349660813e-06,
+ "loss": 0.3929,
+ "step": 7751
+ },
+ {
+ "epoch": 3.6652482269503546,
+ "grad_norm": 2.992035388946533,
+ "learning_rate": 1.6695514054007822e-06,
+ "loss": 0.4131,
+ "step": 7752
+ },
+ {
+ "epoch": 3.6657210401891254,
+ "grad_norm": 3.0427589416503906,
+ "learning_rate": 1.668963027543876e-06,
+ "loss": 0.387,
+ "step": 7753
+ },
+ {
+ "epoch": 3.666193853427896,
+ "grad_norm": 3.0147807598114014,
+ "learning_rate": 1.6683747014319987e-06,
+ "loss": 0.3648,
+ "step": 7754
+ },
+ {
+ "epoch": 3.6666666666666665,
+ "grad_norm": 2.5483829975128174,
+ "learning_rate": 1.6677864271017811e-06,
+ "loss": 0.3643,
+ "step": 7755
+ },
+ {
+ "epoch": 3.6671394799054373,
+ "grad_norm": 2.7661986351013184,
+ "learning_rate": 1.6671982045898544e-06,
+ "loss": 0.3731,
+ "step": 7756
+ },
+ {
+ "epoch": 3.667612293144208,
+ "grad_norm": 2.778036117553711,
+ "learning_rate": 1.666610033932843e-06,
+ "loss": 0.3744,
+ "step": 7757
+ },
+ {
+ "epoch": 3.6680851063829785,
+ "grad_norm": 2.9028329849243164,
+ "learning_rate": 1.6660219151673712e-06,
+ "loss": 0.4286,
+ "step": 7758
+ },
+ {
+ "epoch": 3.6685579196217493,
+ "grad_norm": 2.826687812805176,
+ "learning_rate": 1.6654338483300575e-06,
+ "loss": 0.318,
+ "step": 7759
+ },
+ {
+ "epoch": 3.66903073286052,
+ "grad_norm": 2.7063660621643066,
+ "learning_rate": 1.6648458334575186e-06,
+ "loss": 0.3351,
+ "step": 7760
+ },
+ {
+ "epoch": 3.669503546099291,
+ "grad_norm": 2.708361864089966,
+ "learning_rate": 1.664257870586368e-06,
+ "loss": 0.376,
+ "step": 7761
+ },
+ {
+ "epoch": 3.6699763593380617,
+ "grad_norm": 3.1139161586761475,
+ "learning_rate": 1.6636699597532141e-06,
+ "loss": 0.3572,
+ "step": 7762
+ },
+ {
+ "epoch": 3.670449172576832,
+ "grad_norm": 3.0858285427093506,
+ "learning_rate": 1.6630821009946658e-06,
+ "loss": 0.4204,
+ "step": 7763
+ },
+ {
+ "epoch": 3.670921985815603,
+ "grad_norm": 3.5593984127044678,
+ "learning_rate": 1.6624942943473252e-06,
+ "loss": 0.463,
+ "step": 7764
+ },
+ {
+ "epoch": 3.6713947990543736,
+ "grad_norm": 2.863851308822632,
+ "learning_rate": 1.6619065398477921e-06,
+ "loss": 0.4272,
+ "step": 7765
+ },
+ {
+ "epoch": 3.671867612293144,
+ "grad_norm": 2.833399772644043,
+ "learning_rate": 1.6613188375326638e-06,
+ "loss": 0.3509,
+ "step": 7766
+ },
+ {
+ "epoch": 3.6723404255319148,
+ "grad_norm": 2.988948345184326,
+ "learning_rate": 1.6607311874385346e-06,
+ "loss": 0.3572,
+ "step": 7767
+ },
+ {
+ "epoch": 3.6728132387706856,
+ "grad_norm": 2.7349398136138916,
+ "learning_rate": 1.6601435896019936e-06,
+ "loss": 0.3875,
+ "step": 7768
+ },
+ {
+ "epoch": 3.6732860520094563,
+ "grad_norm": 2.8544445037841797,
+ "learning_rate": 1.659556044059629e-06,
+ "loss": 0.4057,
+ "step": 7769
+ },
+ {
+ "epoch": 3.673758865248227,
+ "grad_norm": 3.0341904163360596,
+ "learning_rate": 1.6589685508480235e-06,
+ "loss": 0.3935,
+ "step": 7770
+ },
+ {
+ "epoch": 3.6742316784869975,
+ "grad_norm": 2.7495710849761963,
+ "learning_rate": 1.6583811100037595e-06,
+ "loss": 0.352,
+ "step": 7771
+ },
+ {
+ "epoch": 3.6747044917257683,
+ "grad_norm": 3.258525848388672,
+ "learning_rate": 1.6577937215634133e-06,
+ "loss": 0.4414,
+ "step": 7772
+ },
+ {
+ "epoch": 3.675177304964539,
+ "grad_norm": 3.0686328411102295,
+ "learning_rate": 1.657206385563558e-06,
+ "loss": 0.353,
+ "step": 7773
+ },
+ {
+ "epoch": 3.6756501182033094,
+ "grad_norm": 3.1168248653411865,
+ "learning_rate": 1.6566191020407668e-06,
+ "loss": 0.4064,
+ "step": 7774
+ },
+ {
+ "epoch": 3.6761229314420802,
+ "grad_norm": 2.7837352752685547,
+ "learning_rate": 1.6560318710316053e-06,
+ "loss": 0.3956,
+ "step": 7775
+ },
+ {
+ "epoch": 3.676595744680851,
+ "grad_norm": 3.1514039039611816,
+ "learning_rate": 1.6554446925726391e-06,
+ "loss": 0.4044,
+ "step": 7776
+ },
+ {
+ "epoch": 3.677068557919622,
+ "grad_norm": 3.010352611541748,
+ "learning_rate": 1.6548575667004285e-06,
+ "loss": 0.4162,
+ "step": 7777
+ },
+ {
+ "epoch": 3.6775413711583926,
+ "grad_norm": 3.1727633476257324,
+ "learning_rate": 1.6542704934515308e-06,
+ "loss": 0.411,
+ "step": 7778
+ },
+ {
+ "epoch": 3.678014184397163,
+ "grad_norm": 3.6771271228790283,
+ "learning_rate": 1.6536834728625018e-06,
+ "loss": 0.4562,
+ "step": 7779
+ },
+ {
+ "epoch": 3.678486997635934,
+ "grad_norm": 2.9793131351470947,
+ "learning_rate": 1.6530965049698908e-06,
+ "loss": 0.4039,
+ "step": 7780
+ },
+ {
+ "epoch": 3.6789598108747046,
+ "grad_norm": 3.193751096725464,
+ "learning_rate": 1.6525095898102478e-06,
+ "loss": 0.4064,
+ "step": 7781
+ },
+ {
+ "epoch": 3.679432624113475,
+ "grad_norm": 2.6643173694610596,
+ "learning_rate": 1.6519227274201169e-06,
+ "loss": 0.3731,
+ "step": 7782
+ },
+ {
+ "epoch": 3.6799054373522457,
+ "grad_norm": 3.4855685234069824,
+ "learning_rate": 1.6513359178360384e-06,
+ "loss": 0.3815,
+ "step": 7783
+ },
+ {
+ "epoch": 3.6803782505910165,
+ "grad_norm": 3.320537567138672,
+ "learning_rate": 1.6507491610945514e-06,
+ "loss": 0.4065,
+ "step": 7784
+ },
+ {
+ "epoch": 3.6808510638297873,
+ "grad_norm": 3.2793102264404297,
+ "learning_rate": 1.6501624572321895e-06,
+ "loss": 0.429,
+ "step": 7785
+ },
+ {
+ "epoch": 3.681323877068558,
+ "grad_norm": 2.8609631061553955,
+ "learning_rate": 1.6495758062854854e-06,
+ "loss": 0.3881,
+ "step": 7786
+ },
+ {
+ "epoch": 3.6817966903073285,
+ "grad_norm": 2.956533193588257,
+ "learning_rate": 1.6489892082909675e-06,
+ "loss": 0.3662,
+ "step": 7787
+ },
+ {
+ "epoch": 3.6822695035460993,
+ "grad_norm": 2.949092149734497,
+ "learning_rate": 1.6484026632851591e-06,
+ "loss": 0.4021,
+ "step": 7788
+ },
+ {
+ "epoch": 3.68274231678487,
+ "grad_norm": 2.802572250366211,
+ "learning_rate": 1.6478161713045831e-06,
+ "loss": 0.3862,
+ "step": 7789
+ },
+ {
+ "epoch": 3.6832151300236404,
+ "grad_norm": 3.0064797401428223,
+ "learning_rate": 1.6472297323857578e-06,
+ "loss": 0.3495,
+ "step": 7790
+ },
+ {
+ "epoch": 3.6836879432624112,
+ "grad_norm": 2.74037766456604,
+ "learning_rate": 1.646643346565197e-06,
+ "loss": 0.362,
+ "step": 7791
+ },
+ {
+ "epoch": 3.684160756501182,
+ "grad_norm": 3.423326253890991,
+ "learning_rate": 1.646057013879414e-06,
+ "loss": 0.4383,
+ "step": 7792
+ },
+ {
+ "epoch": 3.684633569739953,
+ "grad_norm": 2.7426443099975586,
+ "learning_rate": 1.645470734364915e-06,
+ "loss": 0.3741,
+ "step": 7793
+ },
+ {
+ "epoch": 3.6851063829787236,
+ "grad_norm": 2.879296064376831,
+ "learning_rate": 1.6448845080582077e-06,
+ "loss": 0.4003,
+ "step": 7794
+ },
+ {
+ "epoch": 3.685579196217494,
+ "grad_norm": 2.6821188926696777,
+ "learning_rate": 1.6442983349957924e-06,
+ "loss": 0.3545,
+ "step": 7795
+ },
+ {
+ "epoch": 3.6860520094562648,
+ "grad_norm": 2.714059829711914,
+ "learning_rate": 1.6437122152141665e-06,
+ "loss": 0.3702,
+ "step": 7796
+ },
+ {
+ "epoch": 3.6865248226950356,
+ "grad_norm": 2.593811511993408,
+ "learning_rate": 1.6431261487498274e-06,
+ "loss": 0.3512,
+ "step": 7797
+ },
+ {
+ "epoch": 3.686997635933806,
+ "grad_norm": 2.687533378601074,
+ "learning_rate": 1.6425401356392652e-06,
+ "loss": 0.36,
+ "step": 7798
+ },
+ {
+ "epoch": 3.6874704491725767,
+ "grad_norm": 3.1675431728363037,
+ "learning_rate": 1.6419541759189694e-06,
+ "loss": 0.3349,
+ "step": 7799
+ },
+ {
+ "epoch": 3.6879432624113475,
+ "grad_norm": 2.777310371398926,
+ "learning_rate": 1.6413682696254246e-06,
+ "loss": 0.317,
+ "step": 7800
+ },
+ {
+ "epoch": 3.6884160756501183,
+ "grad_norm": 3.0121655464172363,
+ "learning_rate": 1.640782416795112e-06,
+ "loss": 0.3612,
+ "step": 7801
+ },
+ {
+ "epoch": 3.688888888888889,
+ "grad_norm": 3.0532145500183105,
+ "learning_rate": 1.6401966174645113e-06,
+ "loss": 0.4065,
+ "step": 7802
+ },
+ {
+ "epoch": 3.6893617021276595,
+ "grad_norm": 2.8221664428710938,
+ "learning_rate": 1.6396108716700961e-06,
+ "loss": 0.3669,
+ "step": 7803
+ },
+ {
+ "epoch": 3.6898345153664303,
+ "grad_norm": 2.966357707977295,
+ "learning_rate": 1.6390251794483405e-06,
+ "loss": 0.391,
+ "step": 7804
+ },
+ {
+ "epoch": 3.690307328605201,
+ "grad_norm": 3.460252046585083,
+ "learning_rate": 1.6384395408357118e-06,
+ "loss": 0.429,
+ "step": 7805
+ },
+ {
+ "epoch": 3.6907801418439714,
+ "grad_norm": 2.8907718658447266,
+ "learning_rate": 1.637853955868674e-06,
+ "loss": 0.3761,
+ "step": 7806
+ },
+ {
+ "epoch": 3.691252955082742,
+ "grad_norm": 3.114612102508545,
+ "learning_rate": 1.6372684245836912e-06,
+ "loss": 0.4376,
+ "step": 7807
+ },
+ {
+ "epoch": 3.691725768321513,
+ "grad_norm": 2.9361326694488525,
+ "learning_rate": 1.6366829470172191e-06,
+ "loss": 0.3672,
+ "step": 7808
+ },
+ {
+ "epoch": 3.692198581560284,
+ "grad_norm": 3.2719476222991943,
+ "learning_rate": 1.6360975232057156e-06,
+ "loss": 0.4266,
+ "step": 7809
+ },
+ {
+ "epoch": 3.6926713947990546,
+ "grad_norm": 2.873952865600586,
+ "learning_rate": 1.635512153185631e-06,
+ "loss": 0.4056,
+ "step": 7810
+ },
+ {
+ "epoch": 3.693144208037825,
+ "grad_norm": 3.0273401737213135,
+ "learning_rate": 1.634926836993413e-06,
+ "loss": 0.3947,
+ "step": 7811
+ },
+ {
+ "epoch": 3.6936170212765957,
+ "grad_norm": 2.868738889694214,
+ "learning_rate": 1.634341574665509e-06,
+ "loss": 0.3935,
+ "step": 7812
+ },
+ {
+ "epoch": 3.6940898345153665,
+ "grad_norm": 3.3080437183380127,
+ "learning_rate": 1.6337563662383591e-06,
+ "loss": 0.3606,
+ "step": 7813
+ },
+ {
+ "epoch": 3.694562647754137,
+ "grad_norm": 2.8339016437530518,
+ "learning_rate": 1.6331712117484014e-06,
+ "loss": 0.4019,
+ "step": 7814
+ },
+ {
+ "epoch": 3.6950354609929077,
+ "grad_norm": 2.666815996170044,
+ "learning_rate": 1.6325861112320717e-06,
+ "loss": 0.3502,
+ "step": 7815
+ },
+ {
+ "epoch": 3.6955082742316785,
+ "grad_norm": 2.7624311447143555,
+ "learning_rate": 1.6320010647258008e-06,
+ "loss": 0.3481,
+ "step": 7816
+ },
+ {
+ "epoch": 3.6959810874704493,
+ "grad_norm": 2.7796332836151123,
+ "learning_rate": 1.6314160722660183e-06,
+ "loss": 0.3735,
+ "step": 7817
+ },
+ {
+ "epoch": 3.69645390070922,
+ "grad_norm": 2.954318046569824,
+ "learning_rate": 1.6308311338891484e-06,
+ "loss": 0.3933,
+ "step": 7818
+ },
+ {
+ "epoch": 3.6969267139479904,
+ "grad_norm": 2.821072816848755,
+ "learning_rate": 1.6302462496316115e-06,
+ "loss": 0.3437,
+ "step": 7819
+ },
+ {
+ "epoch": 3.6973995271867612,
+ "grad_norm": 3.436192750930786,
+ "learning_rate": 1.629661419529828e-06,
+ "loss": 0.4469,
+ "step": 7820
+ },
+ {
+ "epoch": 3.697872340425532,
+ "grad_norm": 3.1361067295074463,
+ "learning_rate": 1.629076643620211e-06,
+ "loss": 0.3887,
+ "step": 7821
+ },
+ {
+ "epoch": 3.6983451536643024,
+ "grad_norm": 3.355024576187134,
+ "learning_rate": 1.6284919219391732e-06,
+ "loss": 0.424,
+ "step": 7822
+ },
+ {
+ "epoch": 3.698817966903073,
+ "grad_norm": 2.7671639919281006,
+ "learning_rate": 1.6279072545231212e-06,
+ "loss": 0.3765,
+ "step": 7823
+ },
+ {
+ "epoch": 3.699290780141844,
+ "grad_norm": 2.9509360790252686,
+ "learning_rate": 1.6273226414084606e-06,
+ "loss": 0.4057,
+ "step": 7824
+ },
+ {
+ "epoch": 3.699763593380615,
+ "grad_norm": 2.9852921962738037,
+ "learning_rate": 1.6267380826315932e-06,
+ "loss": 0.4238,
+ "step": 7825
+ },
+ {
+ "epoch": 3.7002364066193856,
+ "grad_norm": 2.826594114303589,
+ "learning_rate": 1.626153578228915e-06,
+ "loss": 0.3958,
+ "step": 7826
+ },
+ {
+ "epoch": 3.700709219858156,
+ "grad_norm": 2.9103410243988037,
+ "learning_rate": 1.6255691282368228e-06,
+ "loss": 0.394,
+ "step": 7827
+ },
+ {
+ "epoch": 3.7011820330969267,
+ "grad_norm": 3.362992525100708,
+ "learning_rate": 1.6249847326917068e-06,
+ "loss": 0.4233,
+ "step": 7828
+ },
+ {
+ "epoch": 3.7016548463356975,
+ "grad_norm": 2.711280107498169,
+ "learning_rate": 1.624400391629954e-06,
+ "loss": 0.2977,
+ "step": 7829
+ },
+ {
+ "epoch": 3.702127659574468,
+ "grad_norm": 2.8354649543762207,
+ "learning_rate": 1.6238161050879497e-06,
+ "loss": 0.3549,
+ "step": 7830
+ },
+ {
+ "epoch": 3.7026004728132387,
+ "grad_norm": 3.096376895904541,
+ "learning_rate": 1.6232318731020743e-06,
+ "loss": 0.3486,
+ "step": 7831
+ },
+ {
+ "epoch": 3.7030732860520095,
+ "grad_norm": 2.918267250061035,
+ "learning_rate": 1.6226476957087064e-06,
+ "loss": 0.3659,
+ "step": 7832
+ },
+ {
+ "epoch": 3.7035460992907803,
+ "grad_norm": 2.705399513244629,
+ "learning_rate": 1.6220635729442195e-06,
+ "loss": 0.4301,
+ "step": 7833
+ },
+ {
+ "epoch": 3.704018912529551,
+ "grad_norm": 2.9192235469818115,
+ "learning_rate": 1.621479504844983e-06,
+ "loss": 0.3384,
+ "step": 7834
+ },
+ {
+ "epoch": 3.7044917257683214,
+ "grad_norm": 2.78623104095459,
+ "learning_rate": 1.6208954914473669e-06,
+ "loss": 0.3528,
+ "step": 7835
+ },
+ {
+ "epoch": 3.704964539007092,
+ "grad_norm": 3.0218069553375244,
+ "learning_rate": 1.6203115327877333e-06,
+ "loss": 0.3698,
+ "step": 7836
+ },
+ {
+ "epoch": 3.705437352245863,
+ "grad_norm": 3.019101619720459,
+ "learning_rate": 1.6197276289024422e-06,
+ "loss": 0.4398,
+ "step": 7837
+ },
+ {
+ "epoch": 3.7059101654846334,
+ "grad_norm": 2.9220848083496094,
+ "learning_rate": 1.6191437798278531e-06,
+ "loss": 0.3803,
+ "step": 7838
+ },
+ {
+ "epoch": 3.706382978723404,
+ "grad_norm": 3.2731969356536865,
+ "learning_rate": 1.6185599856003181e-06,
+ "loss": 0.4529,
+ "step": 7839
+ },
+ {
+ "epoch": 3.706855791962175,
+ "grad_norm": 2.85239577293396,
+ "learning_rate": 1.617976246256188e-06,
+ "loss": 0.3801,
+ "step": 7840
+ },
+ {
+ "epoch": 3.7073286052009458,
+ "grad_norm": 2.8250765800476074,
+ "learning_rate": 1.6173925618318092e-06,
+ "loss": 0.3267,
+ "step": 7841
+ },
+ {
+ "epoch": 3.7078014184397166,
+ "grad_norm": 2.9152321815490723,
+ "learning_rate": 1.616808932363525e-06,
+ "loss": 0.428,
+ "step": 7842
+ },
+ {
+ "epoch": 3.708274231678487,
+ "grad_norm": 2.912656545639038,
+ "learning_rate": 1.6162253578876766e-06,
+ "loss": 0.3802,
+ "step": 7843
+ },
+ {
+ "epoch": 3.7087470449172577,
+ "grad_norm": 3.0700762271881104,
+ "learning_rate": 1.6156418384405992e-06,
+ "loss": 0.377,
+ "step": 7844
+ },
+ {
+ "epoch": 3.7092198581560285,
+ "grad_norm": 2.873141050338745,
+ "learning_rate": 1.6150583740586274e-06,
+ "loss": 0.399,
+ "step": 7845
+ },
+ {
+ "epoch": 3.709692671394799,
+ "grad_norm": 2.899555206298828,
+ "learning_rate": 1.6144749647780906e-06,
+ "loss": 0.402,
+ "step": 7846
+ },
+ {
+ "epoch": 3.7101654846335697,
+ "grad_norm": 3.257697343826294,
+ "learning_rate": 1.6138916106353139e-06,
+ "loss": 0.4193,
+ "step": 7847
+ },
+ {
+ "epoch": 3.7106382978723405,
+ "grad_norm": 2.6879804134368896,
+ "learning_rate": 1.613308311666622e-06,
+ "loss": 0.3474,
+ "step": 7848
+ },
+ {
+ "epoch": 3.7111111111111112,
+ "grad_norm": 2.712491273880005,
+ "learning_rate": 1.6127250679083323e-06,
+ "loss": 0.3315,
+ "step": 7849
+ },
+ {
+ "epoch": 3.711583924349882,
+ "grad_norm": 2.9762673377990723,
+ "learning_rate": 1.6121418793967631e-06,
+ "loss": 0.3953,
+ "step": 7850
+ },
+ {
+ "epoch": 3.7120567375886524,
+ "grad_norm": 2.743668556213379,
+ "learning_rate": 1.6115587461682258e-06,
+ "loss": 0.381,
+ "step": 7851
+ },
+ {
+ "epoch": 3.712529550827423,
+ "grad_norm": 3.0545318126678467,
+ "learning_rate": 1.6109756682590288e-06,
+ "loss": 0.412,
+ "step": 7852
+ },
+ {
+ "epoch": 3.713002364066194,
+ "grad_norm": 3.0125906467437744,
+ "learning_rate": 1.61039264570548e-06,
+ "loss": 0.3931,
+ "step": 7853
+ },
+ {
+ "epoch": 3.7134751773049643,
+ "grad_norm": 2.809302806854248,
+ "learning_rate": 1.6098096785438794e-06,
+ "loss": 0.3943,
+ "step": 7854
+ },
+ {
+ "epoch": 3.713947990543735,
+ "grad_norm": 3.092452049255371,
+ "learning_rate": 1.6092267668105276e-06,
+ "loss": 0.3932,
+ "step": 7855
+ },
+ {
+ "epoch": 3.714420803782506,
+ "grad_norm": 2.9878969192504883,
+ "learning_rate": 1.608643910541719e-06,
+ "loss": 0.4289,
+ "step": 7856
+ },
+ {
+ "epoch": 3.7148936170212767,
+ "grad_norm": 2.693387508392334,
+ "learning_rate": 1.6080611097737444e-06,
+ "loss": 0.373,
+ "step": 7857
+ },
+ {
+ "epoch": 3.7153664302600475,
+ "grad_norm": 3.4097673892974854,
+ "learning_rate": 1.6074783645428945e-06,
+ "loss": 0.4487,
+ "step": 7858
+ },
+ {
+ "epoch": 3.715839243498818,
+ "grad_norm": 3.1466784477233887,
+ "learning_rate": 1.6068956748854525e-06,
+ "loss": 0.3648,
+ "step": 7859
+ },
+ {
+ "epoch": 3.7163120567375887,
+ "grad_norm": 3.062107563018799,
+ "learning_rate": 1.6063130408377015e-06,
+ "loss": 0.3899,
+ "step": 7860
+ },
+ {
+ "epoch": 3.7167848699763595,
+ "grad_norm": 3.2298364639282227,
+ "learning_rate": 1.6057304624359188e-06,
+ "loss": 0.4243,
+ "step": 7861
+ },
+ {
+ "epoch": 3.71725768321513,
+ "grad_norm": 3.0285773277282715,
+ "learning_rate": 1.6051479397163784e-06,
+ "loss": 0.3469,
+ "step": 7862
+ },
+ {
+ "epoch": 3.7177304964539006,
+ "grad_norm": 2.8438515663146973,
+ "learning_rate": 1.6045654727153525e-06,
+ "loss": 0.3363,
+ "step": 7863
+ },
+ {
+ "epoch": 3.7182033096926714,
+ "grad_norm": 3.1558034420013428,
+ "learning_rate": 1.6039830614691081e-06,
+ "loss": 0.4326,
+ "step": 7864
+ },
+ {
+ "epoch": 3.7186761229314422,
+ "grad_norm": 2.438640594482422,
+ "learning_rate": 1.603400706013909e-06,
+ "loss": 0.3647,
+ "step": 7865
+ },
+ {
+ "epoch": 3.719148936170213,
+ "grad_norm": 3.0443127155303955,
+ "learning_rate": 1.6028184063860168e-06,
+ "loss": 0.4295,
+ "step": 7866
+ },
+ {
+ "epoch": 3.7196217494089834,
+ "grad_norm": 2.8009512424468994,
+ "learning_rate": 1.602236162621688e-06,
+ "loss": 0.4055,
+ "step": 7867
+ },
+ {
+ "epoch": 3.720094562647754,
+ "grad_norm": 3.227698802947998,
+ "learning_rate": 1.6016539747571775e-06,
+ "loss": 0.4681,
+ "step": 7868
+ },
+ {
+ "epoch": 3.720567375886525,
+ "grad_norm": 2.8242595195770264,
+ "learning_rate": 1.601071842828735e-06,
+ "loss": 0.3707,
+ "step": 7869
+ },
+ {
+ "epoch": 3.7210401891252953,
+ "grad_norm": 3.602937698364258,
+ "learning_rate": 1.6004897668726067e-06,
+ "loss": 0.5201,
+ "step": 7870
+ },
+ {
+ "epoch": 3.721513002364066,
+ "grad_norm": 3.578422784805298,
+ "learning_rate": 1.599907746925037e-06,
+ "loss": 0.4514,
+ "step": 7871
+ },
+ {
+ "epoch": 3.721985815602837,
+ "grad_norm": 2.7365758419036865,
+ "learning_rate": 1.5993257830222635e-06,
+ "loss": 0.356,
+ "step": 7872
+ },
+ {
+ "epoch": 3.7224586288416077,
+ "grad_norm": 3.125636577606201,
+ "learning_rate": 1.5987438752005258e-06,
+ "loss": 0.4277,
+ "step": 7873
+ },
+ {
+ "epoch": 3.7229314420803785,
+ "grad_norm": 2.7157294750213623,
+ "learning_rate": 1.5981620234960549e-06,
+ "loss": 0.363,
+ "step": 7874
+ },
+ {
+ "epoch": 3.723404255319149,
+ "grad_norm": 2.90950083732605,
+ "learning_rate": 1.5975802279450793e-06,
+ "loss": 0.4027,
+ "step": 7875
+ },
+ {
+ "epoch": 3.7238770685579197,
+ "grad_norm": 2.659787178039551,
+ "learning_rate": 1.596998488583827e-06,
+ "loss": 0.3632,
+ "step": 7876
+ },
+ {
+ "epoch": 3.7243498817966905,
+ "grad_norm": 3.221623182296753,
+ "learning_rate": 1.5964168054485185e-06,
+ "loss": 0.4295,
+ "step": 7877
+ },
+ {
+ "epoch": 3.724822695035461,
+ "grad_norm": 2.6838672161102295,
+ "learning_rate": 1.595835178575374e-06,
+ "loss": 0.3413,
+ "step": 7878
+ },
+ {
+ "epoch": 3.7252955082742316,
+ "grad_norm": 2.804706335067749,
+ "learning_rate": 1.5952536080006084e-06,
+ "loss": 0.3801,
+ "step": 7879
+ },
+ {
+ "epoch": 3.7257683215130024,
+ "grad_norm": 2.7647509574890137,
+ "learning_rate": 1.5946720937604326e-06,
+ "loss": 0.3941,
+ "step": 7880
+ },
+ {
+ "epoch": 3.726241134751773,
+ "grad_norm": 2.8363754749298096,
+ "learning_rate": 1.5940906358910566e-06,
+ "loss": 0.3772,
+ "step": 7881
+ },
+ {
+ "epoch": 3.726713947990544,
+ "grad_norm": 3.4147698879241943,
+ "learning_rate": 1.5935092344286835e-06,
+ "loss": 0.399,
+ "step": 7882
+ },
+ {
+ "epoch": 3.7271867612293144,
+ "grad_norm": 2.984090805053711,
+ "learning_rate": 1.5929278894095162e-06,
+ "loss": 0.3373,
+ "step": 7883
+ },
+ {
+ "epoch": 3.727659574468085,
+ "grad_norm": 3.250173330307007,
+ "learning_rate": 1.5923466008697521e-06,
+ "loss": 0.3932,
+ "step": 7884
+ },
+ {
+ "epoch": 3.728132387706856,
+ "grad_norm": 3.2699649333953857,
+ "learning_rate": 1.5917653688455848e-06,
+ "loss": 0.4529,
+ "step": 7885
+ },
+ {
+ "epoch": 3.7286052009456263,
+ "grad_norm": 3.175934076309204,
+ "learning_rate": 1.591184193373206e-06,
+ "loss": 0.3726,
+ "step": 7886
+ },
+ {
+ "epoch": 3.729078014184397,
+ "grad_norm": 2.8128812313079834,
+ "learning_rate": 1.5906030744888024e-06,
+ "loss": 0.3648,
+ "step": 7887
+ },
+ {
+ "epoch": 3.729550827423168,
+ "grad_norm": 3.025012493133545,
+ "learning_rate": 1.5900220122285564e-06,
+ "loss": 0.3945,
+ "step": 7888
+ },
+ {
+ "epoch": 3.7300236406619387,
+ "grad_norm": 3.237680435180664,
+ "learning_rate": 1.5894410066286512e-06,
+ "loss": 0.3815,
+ "step": 7889
+ },
+ {
+ "epoch": 3.7304964539007095,
+ "grad_norm": 3.458033323287964,
+ "learning_rate": 1.5888600577252605e-06,
+ "loss": 0.4104,
+ "step": 7890
+ },
+ {
+ "epoch": 3.73096926713948,
+ "grad_norm": 2.718867540359497,
+ "learning_rate": 1.58827916555456e-06,
+ "loss": 0.3243,
+ "step": 7891
+ },
+ {
+ "epoch": 3.7314420803782506,
+ "grad_norm": 3.047157049179077,
+ "learning_rate": 1.5876983301527176e-06,
+ "loss": 0.3689,
+ "step": 7892
+ },
+ {
+ "epoch": 3.731914893617021,
+ "grad_norm": 3.2904715538024902,
+ "learning_rate": 1.5871175515558995e-06,
+ "loss": 0.4045,
+ "step": 7893
+ },
+ {
+ "epoch": 3.732387706855792,
+ "grad_norm": 2.956467866897583,
+ "learning_rate": 1.5865368298002692e-06,
+ "loss": 0.3806,
+ "step": 7894
+ },
+ {
+ "epoch": 3.7328605200945626,
+ "grad_norm": 3.3309173583984375,
+ "learning_rate": 1.5859561649219843e-06,
+ "loss": 0.4011,
+ "step": 7895
+ },
+ {
+ "epoch": 3.7333333333333334,
+ "grad_norm": 2.7853524684906006,
+ "learning_rate": 1.5853755569572018e-06,
+ "loss": 0.3239,
+ "step": 7896
+ },
+ {
+ "epoch": 3.733806146572104,
+ "grad_norm": 2.9832780361175537,
+ "learning_rate": 1.584795005942073e-06,
+ "loss": 0.4582,
+ "step": 7897
+ },
+ {
+ "epoch": 3.7342789598108745,
+ "grad_norm": 3.2866461277008057,
+ "learning_rate": 1.584214511912745e-06,
+ "loss": 0.3876,
+ "step": 7898
+ },
+ {
+ "epoch": 3.7347517730496453,
+ "grad_norm": 3.018526792526245,
+ "learning_rate": 1.5836340749053646e-06,
+ "loss": 0.3221,
+ "step": 7899
+ },
+ {
+ "epoch": 3.735224586288416,
+ "grad_norm": 2.9109885692596436,
+ "learning_rate": 1.583053694956072e-06,
+ "loss": 0.4225,
+ "step": 7900
+ },
+ {
+ "epoch": 3.7356973995271865,
+ "grad_norm": 3.104146718978882,
+ "learning_rate": 1.5824733721010051e-06,
+ "loss": 0.3843,
+ "step": 7901
+ },
+ {
+ "epoch": 3.7361702127659573,
+ "grad_norm": 3.0982813835144043,
+ "learning_rate": 1.5818931063762989e-06,
+ "loss": 0.4223,
+ "step": 7902
+ },
+ {
+ "epoch": 3.736643026004728,
+ "grad_norm": 2.7797579765319824,
+ "learning_rate": 1.5813128978180819e-06,
+ "loss": 0.3536,
+ "step": 7903
+ },
+ {
+ "epoch": 3.737115839243499,
+ "grad_norm": 2.870884656906128,
+ "learning_rate": 1.5807327464624835e-06,
+ "loss": 0.3053,
+ "step": 7904
+ },
+ {
+ "epoch": 3.7375886524822697,
+ "grad_norm": 2.896674633026123,
+ "learning_rate": 1.5801526523456251e-06,
+ "loss": 0.3806,
+ "step": 7905
+ },
+ {
+ "epoch": 3.73806146572104,
+ "grad_norm": 3.009662389755249,
+ "learning_rate": 1.5795726155036284e-06,
+ "loss": 0.3568,
+ "step": 7906
+ },
+ {
+ "epoch": 3.738534278959811,
+ "grad_norm": 2.6860599517822266,
+ "learning_rate": 1.578992635972609e-06,
+ "loss": 0.4392,
+ "step": 7907
+ },
+ {
+ "epoch": 3.7390070921985816,
+ "grad_norm": 2.9046099185943604,
+ "learning_rate": 1.578412713788679e-06,
+ "loss": 0.3756,
+ "step": 7908
+ },
+ {
+ "epoch": 3.739479905437352,
+ "grad_norm": 2.8035101890563965,
+ "learning_rate": 1.5778328489879488e-06,
+ "loss": 0.3576,
+ "step": 7909
+ },
+ {
+ "epoch": 3.739952718676123,
+ "grad_norm": 2.767514228820801,
+ "learning_rate": 1.5772530416065238e-06,
+ "loss": 0.4037,
+ "step": 7910
+ },
+ {
+ "epoch": 3.7404255319148936,
+ "grad_norm": 3.0867795944213867,
+ "learning_rate": 1.576673291680505e-06,
+ "loss": 0.4394,
+ "step": 7911
+ },
+ {
+ "epoch": 3.7408983451536644,
+ "grad_norm": 3.295976161956787,
+ "learning_rate": 1.5760935992459926e-06,
+ "loss": 0.3938,
+ "step": 7912
+ },
+ {
+ "epoch": 3.741371158392435,
+ "grad_norm": 2.725949287414551,
+ "learning_rate": 1.5755139643390794e-06,
+ "loss": 0.3633,
+ "step": 7913
+ },
+ {
+ "epoch": 3.7418439716312055,
+ "grad_norm": 3.0864083766937256,
+ "learning_rate": 1.5749343869958585e-06,
+ "loss": 0.3034,
+ "step": 7914
+ },
+ {
+ "epoch": 3.7423167848699763,
+ "grad_norm": 3.707273244857788,
+ "learning_rate": 1.5743548672524175e-06,
+ "loss": 0.4206,
+ "step": 7915
+ },
+ {
+ "epoch": 3.742789598108747,
+ "grad_norm": 2.9829516410827637,
+ "learning_rate": 1.573775405144839e-06,
+ "loss": 0.333,
+ "step": 7916
+ },
+ {
+ "epoch": 3.7432624113475175,
+ "grad_norm": 3.3303117752075195,
+ "learning_rate": 1.5731960007092056e-06,
+ "loss": 0.4558,
+ "step": 7917
+ },
+ {
+ "epoch": 3.7437352245862883,
+ "grad_norm": 2.63291335105896,
+ "learning_rate": 1.5726166539815925e-06,
+ "loss": 0.39,
+ "step": 7918
+ },
+ {
+ "epoch": 3.744208037825059,
+ "grad_norm": 3.0533673763275146,
+ "learning_rate": 1.572037364998075e-06,
+ "loss": 0.3586,
+ "step": 7919
+ },
+ {
+ "epoch": 3.74468085106383,
+ "grad_norm": 2.9185104370117188,
+ "learning_rate": 1.5714581337947216e-06,
+ "loss": 0.3809,
+ "step": 7920
+ },
+ {
+ "epoch": 3.7451536643026007,
+ "grad_norm": 3.1863298416137695,
+ "learning_rate": 1.5708789604075975e-06,
+ "loss": 0.4132,
+ "step": 7921
+ },
+ {
+ "epoch": 3.745626477541371,
+ "grad_norm": 3.2700514793395996,
+ "learning_rate": 1.5702998448727674e-06,
+ "loss": 0.4601,
+ "step": 7922
+ },
+ {
+ "epoch": 3.746099290780142,
+ "grad_norm": 3.4729206562042236,
+ "learning_rate": 1.5697207872262886e-06,
+ "loss": 0.4585,
+ "step": 7923
+ },
+ {
+ "epoch": 3.7465721040189126,
+ "grad_norm": 3.1432926654815674,
+ "learning_rate": 1.5691417875042182e-06,
+ "loss": 0.3128,
+ "step": 7924
+ },
+ {
+ "epoch": 3.747044917257683,
+ "grad_norm": 3.096121072769165,
+ "learning_rate": 1.5685628457426066e-06,
+ "loss": 0.3903,
+ "step": 7925
+ },
+ {
+ "epoch": 3.7475177304964538,
+ "grad_norm": 2.6897027492523193,
+ "learning_rate": 1.5679839619775023e-06,
+ "loss": 0.3707,
+ "step": 7926
+ },
+ {
+ "epoch": 3.7479905437352246,
+ "grad_norm": 2.8020687103271484,
+ "learning_rate": 1.5674051362449503e-06,
+ "loss": 0.3986,
+ "step": 7927
+ },
+ {
+ "epoch": 3.7484633569739954,
+ "grad_norm": 3.1278326511383057,
+ "learning_rate": 1.56682636858099e-06,
+ "loss": 0.3899,
+ "step": 7928
+ },
+ {
+ "epoch": 3.748936170212766,
+ "grad_norm": 2.907982587814331,
+ "learning_rate": 1.5662476590216613e-06,
+ "loss": 0.3422,
+ "step": 7929
+ },
+ {
+ "epoch": 3.7494089834515365,
+ "grad_norm": 3.1246347427368164,
+ "learning_rate": 1.5656690076029962e-06,
+ "loss": 0.452,
+ "step": 7930
+ },
+ {
+ "epoch": 3.7498817966903073,
+ "grad_norm": 2.9161367416381836,
+ "learning_rate": 1.565090414361024e-06,
+ "loss": 0.3284,
+ "step": 7931
+ },
+ {
+ "epoch": 3.750354609929078,
+ "grad_norm": 2.943183422088623,
+ "learning_rate": 1.564511879331773e-06,
+ "loss": 0.3478,
+ "step": 7932
+ },
+ {
+ "epoch": 3.7508274231678485,
+ "grad_norm": 3.2308566570281982,
+ "learning_rate": 1.563933402551266e-06,
+ "loss": 0.4143,
+ "step": 7933
+ },
+ {
+ "epoch": 3.7513002364066192,
+ "grad_norm": 2.6846251487731934,
+ "learning_rate": 1.5633549840555206e-06,
+ "loss": 0.3681,
+ "step": 7934
+ },
+ {
+ "epoch": 3.75177304964539,
+ "grad_norm": 3.0995283126831055,
+ "learning_rate": 1.562776623880554e-06,
+ "loss": 0.4642,
+ "step": 7935
+ },
+ {
+ "epoch": 3.752245862884161,
+ "grad_norm": 2.7406163215637207,
+ "learning_rate": 1.562198322062376e-06,
+ "loss": 0.3823,
+ "step": 7936
+ },
+ {
+ "epoch": 3.7527186761229316,
+ "grad_norm": 2.85732364654541,
+ "learning_rate": 1.5616200786369978e-06,
+ "loss": 0.3053,
+ "step": 7937
+ },
+ {
+ "epoch": 3.753191489361702,
+ "grad_norm": 2.812526226043701,
+ "learning_rate": 1.5610418936404223e-06,
+ "loss": 0.3944,
+ "step": 7938
+ },
+ {
+ "epoch": 3.753664302600473,
+ "grad_norm": 2.8886849880218506,
+ "learning_rate": 1.5604637671086499e-06,
+ "loss": 0.3936,
+ "step": 7939
+ },
+ {
+ "epoch": 3.7541371158392436,
+ "grad_norm": 2.831774950027466,
+ "learning_rate": 1.5598856990776801e-06,
+ "loss": 0.3146,
+ "step": 7940
+ },
+ {
+ "epoch": 3.754609929078014,
+ "grad_norm": 2.8853790760040283,
+ "learning_rate": 1.5593076895835052e-06,
+ "loss": 0.3286,
+ "step": 7941
+ },
+ {
+ "epoch": 3.7550827423167847,
+ "grad_norm": 3.2724483013153076,
+ "learning_rate": 1.5587297386621158e-06,
+ "loss": 0.3396,
+ "step": 7942
+ },
+ {
+ "epoch": 3.7555555555555555,
+ "grad_norm": 3.5077168941497803,
+ "learning_rate": 1.5581518463494983e-06,
+ "loss": 0.4528,
+ "step": 7943
+ },
+ {
+ "epoch": 3.7560283687943263,
+ "grad_norm": 3.031503915786743,
+ "learning_rate": 1.5575740126816346e-06,
+ "loss": 0.3803,
+ "step": 7944
+ },
+ {
+ "epoch": 3.756501182033097,
+ "grad_norm": 3.0939114093780518,
+ "learning_rate": 1.556996237694506e-06,
+ "loss": 0.3931,
+ "step": 7945
+ },
+ {
+ "epoch": 3.7569739952718675,
+ "grad_norm": 2.9404146671295166,
+ "learning_rate": 1.556418521424085e-06,
+ "loss": 0.3608,
+ "step": 7946
+ },
+ {
+ "epoch": 3.7574468085106383,
+ "grad_norm": 3.4363012313842773,
+ "learning_rate": 1.5558408639063465e-06,
+ "loss": 0.4335,
+ "step": 7947
+ },
+ {
+ "epoch": 3.757919621749409,
+ "grad_norm": 3.2819864749908447,
+ "learning_rate": 1.5552632651772575e-06,
+ "loss": 0.4147,
+ "step": 7948
+ },
+ {
+ "epoch": 3.7583924349881794,
+ "grad_norm": 2.917788505554199,
+ "learning_rate": 1.554685725272782e-06,
+ "loss": 0.3516,
+ "step": 7949
+ },
+ {
+ "epoch": 3.7588652482269502,
+ "grad_norm": 2.8425943851470947,
+ "learning_rate": 1.5541082442288818e-06,
+ "loss": 0.3596,
+ "step": 7950
+ },
+ {
+ "epoch": 3.759338061465721,
+ "grad_norm": 3.087005376815796,
+ "learning_rate": 1.5535308220815126e-06,
+ "loss": 0.3968,
+ "step": 7951
+ },
+ {
+ "epoch": 3.759810874704492,
+ "grad_norm": 2.743110179901123,
+ "learning_rate": 1.5529534588666298e-06,
+ "loss": 0.3802,
+ "step": 7952
+ },
+ {
+ "epoch": 3.7602836879432626,
+ "grad_norm": 2.914424180984497,
+ "learning_rate": 1.5523761546201825e-06,
+ "loss": 0.4055,
+ "step": 7953
+ },
+ {
+ "epoch": 3.760756501182033,
+ "grad_norm": 2.9691991806030273,
+ "learning_rate": 1.551798909378116e-06,
+ "loss": 0.3384,
+ "step": 7954
+ },
+ {
+ "epoch": 3.7612293144208038,
+ "grad_norm": 2.433657646179199,
+ "learning_rate": 1.5512217231763747e-06,
+ "loss": 0.3019,
+ "step": 7955
+ },
+ {
+ "epoch": 3.7617021276595746,
+ "grad_norm": 2.7904880046844482,
+ "learning_rate": 1.5506445960508957e-06,
+ "loss": 0.389,
+ "step": 7956
+ },
+ {
+ "epoch": 3.762174940898345,
+ "grad_norm": 2.9241607189178467,
+ "learning_rate": 1.5500675280376154e-06,
+ "loss": 0.4291,
+ "step": 7957
+ },
+ {
+ "epoch": 3.7626477541371157,
+ "grad_norm": 3.216491222381592,
+ "learning_rate": 1.549490519172465e-06,
+ "loss": 0.4065,
+ "step": 7958
+ },
+ {
+ "epoch": 3.7631205673758865,
+ "grad_norm": 2.8859689235687256,
+ "learning_rate": 1.548913569491371e-06,
+ "loss": 0.353,
+ "step": 7959
+ },
+ {
+ "epoch": 3.7635933806146573,
+ "grad_norm": 2.958773136138916,
+ "learning_rate": 1.5483366790302594e-06,
+ "loss": 0.3829,
+ "step": 7960
+ },
+ {
+ "epoch": 3.764066193853428,
+ "grad_norm": 2.868649482727051,
+ "learning_rate": 1.5477598478250505e-06,
+ "loss": 0.3591,
+ "step": 7961
+ },
+ {
+ "epoch": 3.7645390070921985,
+ "grad_norm": 2.6912996768951416,
+ "learning_rate": 1.5471830759116591e-06,
+ "loss": 0.3695,
+ "step": 7962
+ },
+ {
+ "epoch": 3.7650118203309693,
+ "grad_norm": 3.3318257331848145,
+ "learning_rate": 1.5466063633260004e-06,
+ "loss": 0.4126,
+ "step": 7963
+ },
+ {
+ "epoch": 3.76548463356974,
+ "grad_norm": 2.865525007247925,
+ "learning_rate": 1.5460297101039825e-06,
+ "loss": 0.4235,
+ "step": 7964
+ },
+ {
+ "epoch": 3.7659574468085104,
+ "grad_norm": 2.8639180660247803,
+ "learning_rate": 1.5454531162815123e-06,
+ "loss": 0.4392,
+ "step": 7965
+ },
+ {
+ "epoch": 3.766430260047281,
+ "grad_norm": 2.5752499103546143,
+ "learning_rate": 1.5448765818944902e-06,
+ "loss": 0.4113,
+ "step": 7966
+ },
+ {
+ "epoch": 3.766903073286052,
+ "grad_norm": 2.7622742652893066,
+ "learning_rate": 1.5443001069788155e-06,
+ "loss": 0.3785,
+ "step": 7967
+ },
+ {
+ "epoch": 3.767375886524823,
+ "grad_norm": 2.965579032897949,
+ "learning_rate": 1.5437236915703829e-06,
+ "loss": 0.335,
+ "step": 7968
+ },
+ {
+ "epoch": 3.7678486997635936,
+ "grad_norm": 3.0587408542633057,
+ "learning_rate": 1.5431473357050816e-06,
+ "loss": 0.4047,
+ "step": 7969
+ },
+ {
+ "epoch": 3.768321513002364,
+ "grad_norm": 3.2929413318634033,
+ "learning_rate": 1.5425710394188014e-06,
+ "loss": 0.4061,
+ "step": 7970
+ },
+ {
+ "epoch": 3.7687943262411348,
+ "grad_norm": 2.663043975830078,
+ "learning_rate": 1.541994802747424e-06,
+ "loss": 0.3478,
+ "step": 7971
+ },
+ {
+ "epoch": 3.7692671394799055,
+ "grad_norm": 3.0657591819763184,
+ "learning_rate": 1.5414186257268293e-06,
+ "loss": 0.3735,
+ "step": 7972
+ },
+ {
+ "epoch": 3.769739952718676,
+ "grad_norm": 2.963189125061035,
+ "learning_rate": 1.5408425083928939e-06,
+ "loss": 0.4743,
+ "step": 7973
+ },
+ {
+ "epoch": 3.7702127659574467,
+ "grad_norm": 3.1509387493133545,
+ "learning_rate": 1.540266450781489e-06,
+ "loss": 0.4164,
+ "step": 7974
+ },
+ {
+ "epoch": 3.7706855791962175,
+ "grad_norm": 3.4436306953430176,
+ "learning_rate": 1.539690452928485e-06,
+ "loss": 0.4583,
+ "step": 7975
+ },
+ {
+ "epoch": 3.7711583924349883,
+ "grad_norm": 3.1746156215667725,
+ "learning_rate": 1.5391145148697454e-06,
+ "loss": 0.4042,
+ "step": 7976
+ },
+ {
+ "epoch": 3.771631205673759,
+ "grad_norm": 3.531028985977173,
+ "learning_rate": 1.5385386366411304e-06,
+ "loss": 0.4304,
+ "step": 7977
+ },
+ {
+ "epoch": 3.7721040189125294,
+ "grad_norm": 2.867871046066284,
+ "learning_rate": 1.5379628182785e-06,
+ "loss": 0.4023,
+ "step": 7978
+ },
+ {
+ "epoch": 3.7725768321513002,
+ "grad_norm": 3.0504629611968994,
+ "learning_rate": 1.5373870598177051e-06,
+ "loss": 0.3785,
+ "step": 7979
+ },
+ {
+ "epoch": 3.773049645390071,
+ "grad_norm": 2.8188650608062744,
+ "learning_rate": 1.5368113612945983e-06,
+ "loss": 0.3808,
+ "step": 7980
+ },
+ {
+ "epoch": 3.7735224586288414,
+ "grad_norm": 3.0809133052825928,
+ "learning_rate": 1.5362357227450248e-06,
+ "loss": 0.3912,
+ "step": 7981
+ },
+ {
+ "epoch": 3.773995271867612,
+ "grad_norm": 3.223273277282715,
+ "learning_rate": 1.5356601442048257e-06,
+ "loss": 0.3802,
+ "step": 7982
+ },
+ {
+ "epoch": 3.774468085106383,
+ "grad_norm": 2.7513339519500732,
+ "learning_rate": 1.535084625709842e-06,
+ "loss": 0.3822,
+ "step": 7983
+ },
+ {
+ "epoch": 3.774940898345154,
+ "grad_norm": 3.085592031478882,
+ "learning_rate": 1.5345091672959074e-06,
+ "loss": 0.4348,
+ "step": 7984
+ },
+ {
+ "epoch": 3.7754137115839246,
+ "grad_norm": 3.315108299255371,
+ "learning_rate": 1.5339337689988525e-06,
+ "loss": 0.4196,
+ "step": 7985
+ },
+ {
+ "epoch": 3.775886524822695,
+ "grad_norm": 3.713372230529785,
+ "learning_rate": 1.533358430854507e-06,
+ "loss": 0.4292,
+ "step": 7986
+ },
+ {
+ "epoch": 3.7763593380614657,
+ "grad_norm": 2.7899155616760254,
+ "learning_rate": 1.532783152898692e-06,
+ "loss": 0.3874,
+ "step": 7987
+ },
+ {
+ "epoch": 3.7768321513002365,
+ "grad_norm": 2.918851852416992,
+ "learning_rate": 1.5322079351672297e-06,
+ "loss": 0.4073,
+ "step": 7988
+ },
+ {
+ "epoch": 3.777304964539007,
+ "grad_norm": 3.13395619392395,
+ "learning_rate": 1.5316327776959361e-06,
+ "loss": 0.3441,
+ "step": 7989
+ },
+ {
+ "epoch": 3.7777777777777777,
+ "grad_norm": 3.2320916652679443,
+ "learning_rate": 1.531057680520623e-06,
+ "loss": 0.372,
+ "step": 7990
+ },
+ {
+ "epoch": 3.7782505910165485,
+ "grad_norm": 3.1130621433258057,
+ "learning_rate": 1.5304826436770991e-06,
+ "loss": 0.3514,
+ "step": 7991
+ },
+ {
+ "epoch": 3.7787234042553193,
+ "grad_norm": 3.223207712173462,
+ "learning_rate": 1.5299076672011696e-06,
+ "loss": 0.44,
+ "step": 7992
+ },
+ {
+ "epoch": 3.77919621749409,
+ "grad_norm": 3.0757877826690674,
+ "learning_rate": 1.5293327511286366e-06,
+ "loss": 0.4051,
+ "step": 7993
+ },
+ {
+ "epoch": 3.7796690307328604,
+ "grad_norm": 2.936678409576416,
+ "learning_rate": 1.528757895495297e-06,
+ "loss": 0.3965,
+ "step": 7994
+ },
+ {
+ "epoch": 3.780141843971631,
+ "grad_norm": 2.993445873260498,
+ "learning_rate": 1.5281831003369435e-06,
+ "loss": 0.4222,
+ "step": 7995
+ },
+ {
+ "epoch": 3.780614657210402,
+ "grad_norm": 2.9140853881835938,
+ "learning_rate": 1.5276083656893679e-06,
+ "loss": 0.3662,
+ "step": 7996
+ },
+ {
+ "epoch": 3.7810874704491724,
+ "grad_norm": 3.2649893760681152,
+ "learning_rate": 1.5270336915883549e-06,
+ "loss": 0.4272,
+ "step": 7997
+ },
+ {
+ "epoch": 3.781560283687943,
+ "grad_norm": 3.0631372928619385,
+ "learning_rate": 1.5264590780696887e-06,
+ "loss": 0.4111,
+ "step": 7998
+ },
+ {
+ "epoch": 3.782033096926714,
+ "grad_norm": 2.791299343109131,
+ "learning_rate": 1.5258845251691463e-06,
+ "loss": 0.416,
+ "step": 7999
+ },
+ {
+ "epoch": 3.7825059101654848,
+ "grad_norm": 3.262294054031372,
+ "learning_rate": 1.5253100329225023e-06,
+ "loss": 0.4236,
+ "step": 8000
+ },
+ {
+ "epoch": 3.7829787234042556,
+ "grad_norm": 2.574486017227173,
+ "learning_rate": 1.5247356013655295e-06,
+ "loss": 0.4089,
+ "step": 8001
+ },
+ {
+ "epoch": 3.783451536643026,
+ "grad_norm": 3.1566531658172607,
+ "learning_rate": 1.5241612305339936e-06,
+ "loss": 0.3955,
+ "step": 8002
+ },
+ {
+ "epoch": 3.7839243498817967,
+ "grad_norm": 2.5845813751220703,
+ "learning_rate": 1.5235869204636602e-06,
+ "loss": 0.3672,
+ "step": 8003
+ },
+ {
+ "epoch": 3.7843971631205675,
+ "grad_norm": 2.877570629119873,
+ "learning_rate": 1.5230126711902876e-06,
+ "loss": 0.3919,
+ "step": 8004
+ },
+ {
+ "epoch": 3.784869976359338,
+ "grad_norm": 3.183061122894287,
+ "learning_rate": 1.5224384827496314e-06,
+ "loss": 0.3291,
+ "step": 8005
+ },
+ {
+ "epoch": 3.7853427895981087,
+ "grad_norm": 3.0778391361236572,
+ "learning_rate": 1.5218643551774451e-06,
+ "loss": 0.3571,
+ "step": 8006
+ },
+ {
+ "epoch": 3.7858156028368795,
+ "grad_norm": 3.2364399433135986,
+ "learning_rate": 1.5212902885094762e-06,
+ "loss": 0.4045,
+ "step": 8007
+ },
+ {
+ "epoch": 3.7862884160756503,
+ "grad_norm": 3.0571746826171875,
+ "learning_rate": 1.5207162827814687e-06,
+ "loss": 0.4181,
+ "step": 8008
+ },
+ {
+ "epoch": 3.786761229314421,
+ "grad_norm": 2.7215163707733154,
+ "learning_rate": 1.5201423380291652e-06,
+ "loss": 0.3328,
+ "step": 8009
+ },
+ {
+ "epoch": 3.7872340425531914,
+ "grad_norm": 3.0521233081817627,
+ "learning_rate": 1.5195684542883007e-06,
+ "loss": 0.4072,
+ "step": 8010
+ },
+ {
+ "epoch": 3.787706855791962,
+ "grad_norm": 2.541666269302368,
+ "learning_rate": 1.5189946315946104e-06,
+ "loss": 0.3293,
+ "step": 8011
+ },
+ {
+ "epoch": 3.788179669030733,
+ "grad_norm": 3.0041720867156982,
+ "learning_rate": 1.5184208699838232e-06,
+ "loss": 0.3998,
+ "step": 8012
+ },
+ {
+ "epoch": 3.7886524822695034,
+ "grad_norm": 3.0763001441955566,
+ "learning_rate": 1.5178471694916635e-06,
+ "loss": 0.38,
+ "step": 8013
+ },
+ {
+ "epoch": 3.789125295508274,
+ "grad_norm": 3.0788497924804688,
+ "learning_rate": 1.5172735301538544e-06,
+ "loss": 0.3986,
+ "step": 8014
+ },
+ {
+ "epoch": 3.789598108747045,
+ "grad_norm": 2.830225944519043,
+ "learning_rate": 1.5166999520061127e-06,
+ "loss": 0.3977,
+ "step": 8015
+ },
+ {
+ "epoch": 3.7900709219858157,
+ "grad_norm": 3.196078062057495,
+ "learning_rate": 1.5161264350841543e-06,
+ "loss": 0.4058,
+ "step": 8016
+ },
+ {
+ "epoch": 3.7905437352245865,
+ "grad_norm": 9.898200988769531,
+ "learning_rate": 1.5155529794236884e-06,
+ "loss": 0.3451,
+ "step": 8017
+ },
+ {
+ "epoch": 3.791016548463357,
+ "grad_norm": 3.0028066635131836,
+ "learning_rate": 1.514979585060421e-06,
+ "loss": 0.4029,
+ "step": 8018
+ },
+ {
+ "epoch": 3.7914893617021277,
+ "grad_norm": 2.984926223754883,
+ "learning_rate": 1.5144062520300562e-06,
+ "loss": 0.3995,
+ "step": 8019
+ },
+ {
+ "epoch": 3.7919621749408985,
+ "grad_norm": 2.938596487045288,
+ "learning_rate": 1.5138329803682925e-06,
+ "loss": 0.386,
+ "step": 8020
+ },
+ {
+ "epoch": 3.792434988179669,
+ "grad_norm": 2.992565393447876,
+ "learning_rate": 1.513259770110825e-06,
+ "loss": 0.3919,
+ "step": 8021
+ },
+ {
+ "epoch": 3.7929078014184396,
+ "grad_norm": 3.0182361602783203,
+ "learning_rate": 1.5126866212933453e-06,
+ "loss": 0.3506,
+ "step": 8022
+ },
+ {
+ "epoch": 3.7933806146572104,
+ "grad_norm": 3.2039108276367188,
+ "learning_rate": 1.5121135339515392e-06,
+ "loss": 0.3807,
+ "step": 8023
+ },
+ {
+ "epoch": 3.7938534278959812,
+ "grad_norm": 2.9290878772735596,
+ "learning_rate": 1.5115405081210927e-06,
+ "loss": 0.3596,
+ "step": 8024
+ },
+ {
+ "epoch": 3.794326241134752,
+ "grad_norm": 3.106152057647705,
+ "learning_rate": 1.510967543837683e-06,
+ "loss": 0.3703,
+ "step": 8025
+ },
+ {
+ "epoch": 3.7947990543735224,
+ "grad_norm": 2.9752190113067627,
+ "learning_rate": 1.510394641136989e-06,
+ "loss": 0.4049,
+ "step": 8026
+ },
+ {
+ "epoch": 3.795271867612293,
+ "grad_norm": 2.996206283569336,
+ "learning_rate": 1.5098218000546815e-06,
+ "loss": 0.4286,
+ "step": 8027
+ },
+ {
+ "epoch": 3.795744680851064,
+ "grad_norm": 2.9403493404388428,
+ "learning_rate": 1.5092490206264281e-06,
+ "loss": 0.3628,
+ "step": 8028
+ },
+ {
+ "epoch": 3.7962174940898343,
+ "grad_norm": 2.8101110458374023,
+ "learning_rate": 1.5086763028878943e-06,
+ "loss": 0.4016,
+ "step": 8029
+ },
+ {
+ "epoch": 3.796690307328605,
+ "grad_norm": 3.162264108657837,
+ "learning_rate": 1.5081036468747401e-06,
+ "loss": 0.4133,
+ "step": 8030
+ },
+ {
+ "epoch": 3.797163120567376,
+ "grad_norm": 2.6871988773345947,
+ "learning_rate": 1.5075310526226223e-06,
+ "loss": 0.3748,
+ "step": 8031
+ },
+ {
+ "epoch": 3.7976359338061467,
+ "grad_norm": 2.997924327850342,
+ "learning_rate": 1.5069585201671944e-06,
+ "loss": 0.4083,
+ "step": 8032
+ },
+ {
+ "epoch": 3.7981087470449175,
+ "grad_norm": 2.8266279697418213,
+ "learning_rate": 1.506386049544104e-06,
+ "loss": 0.4488,
+ "step": 8033
+ },
+ {
+ "epoch": 3.798581560283688,
+ "grad_norm": 2.7106378078460693,
+ "learning_rate": 1.5058136407889985e-06,
+ "loss": 0.363,
+ "step": 8034
+ },
+ {
+ "epoch": 3.7990543735224587,
+ "grad_norm": 2.8983304500579834,
+ "learning_rate": 1.5052412939375183e-06,
+ "loss": 0.4156,
+ "step": 8035
+ },
+ {
+ "epoch": 3.7995271867612295,
+ "grad_norm": 3.0333914756774902,
+ "learning_rate": 1.5046690090253001e-06,
+ "loss": 0.3694,
+ "step": 8036
+ },
+ {
+ "epoch": 3.8,
+ "grad_norm": 2.872662305831909,
+ "learning_rate": 1.5040967860879785e-06,
+ "loss": 0.3492,
+ "step": 8037
+ },
+ {
+ "epoch": 3.8004728132387706,
+ "grad_norm": 2.7279646396636963,
+ "learning_rate": 1.5035246251611835e-06,
+ "loss": 0.327,
+ "step": 8038
+ },
+ {
+ "epoch": 3.8009456264775414,
+ "grad_norm": 2.969326972961426,
+ "learning_rate": 1.5029525262805405e-06,
+ "loss": 0.3977,
+ "step": 8039
+ },
+ {
+ "epoch": 3.801418439716312,
+ "grad_norm": 3.073899745941162,
+ "learning_rate": 1.5023804894816723e-06,
+ "loss": 0.388,
+ "step": 8040
+ },
+ {
+ "epoch": 3.801891252955083,
+ "grad_norm": 3.026284694671631,
+ "learning_rate": 1.5018085148001953e-06,
+ "loss": 0.3761,
+ "step": 8041
+ },
+ {
+ "epoch": 3.8023640661938534,
+ "grad_norm": 3.0478618144989014,
+ "learning_rate": 1.5012366022717262e-06,
+ "loss": 0.4415,
+ "step": 8042
+ },
+ {
+ "epoch": 3.802836879432624,
+ "grad_norm": 2.801584005355835,
+ "learning_rate": 1.500664751931874e-06,
+ "loss": 0.4079,
+ "step": 8043
+ },
+ {
+ "epoch": 3.803309692671395,
+ "grad_norm": 3.4839112758636475,
+ "learning_rate": 1.5000929638162459e-06,
+ "loss": 0.4391,
+ "step": 8044
+ },
+ {
+ "epoch": 3.8037825059101653,
+ "grad_norm": 2.6945605278015137,
+ "learning_rate": 1.4995212379604446e-06,
+ "loss": 0.3564,
+ "step": 8045
+ },
+ {
+ "epoch": 3.804255319148936,
+ "grad_norm": 3.0870234966278076,
+ "learning_rate": 1.4989495744000687e-06,
+ "loss": 0.3801,
+ "step": 8046
+ },
+ {
+ "epoch": 3.804728132387707,
+ "grad_norm": 2.975332021713257,
+ "learning_rate": 1.4983779731707135e-06,
+ "loss": 0.3408,
+ "step": 8047
+ },
+ {
+ "epoch": 3.8052009456264777,
+ "grad_norm": 2.9920027256011963,
+ "learning_rate": 1.497806434307969e-06,
+ "loss": 0.3875,
+ "step": 8048
+ },
+ {
+ "epoch": 3.8056737588652485,
+ "grad_norm": 3.1974916458129883,
+ "learning_rate": 1.4972349578474244e-06,
+ "loss": 0.4492,
+ "step": 8049
+ },
+ {
+ "epoch": 3.806146572104019,
+ "grad_norm": 2.839503526687622,
+ "learning_rate": 1.4966635438246622e-06,
+ "loss": 0.3785,
+ "step": 8050
+ },
+ {
+ "epoch": 3.8066193853427897,
+ "grad_norm": 3.274502992630005,
+ "learning_rate": 1.4960921922752603e-06,
+ "loss": 0.4404,
+ "step": 8051
+ },
+ {
+ "epoch": 3.8070921985815604,
+ "grad_norm": 3.0852737426757812,
+ "learning_rate": 1.4955209032347967e-06,
+ "loss": 0.4047,
+ "step": 8052
+ },
+ {
+ "epoch": 3.807565011820331,
+ "grad_norm": 2.9251608848571777,
+ "learning_rate": 1.4949496767388417e-06,
+ "loss": 0.3654,
+ "step": 8053
+ },
+ {
+ "epoch": 3.8080378250591016,
+ "grad_norm": 2.518220901489258,
+ "learning_rate": 1.4943785128229635e-06,
+ "loss": 0.3157,
+ "step": 8054
+ },
+ {
+ "epoch": 3.8085106382978724,
+ "grad_norm": 3.3993279933929443,
+ "learning_rate": 1.4938074115227257e-06,
+ "loss": 0.4204,
+ "step": 8055
+ },
+ {
+ "epoch": 3.808983451536643,
+ "grad_norm": 3.2847096920013428,
+ "learning_rate": 1.4932363728736876e-06,
+ "loss": 0.339,
+ "step": 8056
+ },
+ {
+ "epoch": 3.8094562647754135,
+ "grad_norm": 2.7779417037963867,
+ "learning_rate": 1.492665396911407e-06,
+ "loss": 0.3538,
+ "step": 8057
+ },
+ {
+ "epoch": 3.8099290780141843,
+ "grad_norm": 2.958131790161133,
+ "learning_rate": 1.4920944836714353e-06,
+ "loss": 0.363,
+ "step": 8058
+ },
+ {
+ "epoch": 3.810401891252955,
+ "grad_norm": 3.1873440742492676,
+ "learning_rate": 1.491523633189319e-06,
+ "loss": 0.3785,
+ "step": 8059
+ },
+ {
+ "epoch": 3.8108747044917255,
+ "grad_norm": 3.132652759552002,
+ "learning_rate": 1.4909528455006055e-06,
+ "loss": 0.375,
+ "step": 8060
+ },
+ {
+ "epoch": 3.8113475177304963,
+ "grad_norm": 2.8598761558532715,
+ "learning_rate": 1.490382120640833e-06,
+ "loss": 0.4152,
+ "step": 8061
+ },
+ {
+ "epoch": 3.811820330969267,
+ "grad_norm": 3.115870952606201,
+ "learning_rate": 1.4898114586455399e-06,
+ "loss": 0.4609,
+ "step": 8062
+ },
+ {
+ "epoch": 3.812293144208038,
+ "grad_norm": 3.347944974899292,
+ "learning_rate": 1.4892408595502571e-06,
+ "loss": 0.3836,
+ "step": 8063
+ },
+ {
+ "epoch": 3.8127659574468087,
+ "grad_norm": 3.1747031211853027,
+ "learning_rate": 1.4886703233905132e-06,
+ "loss": 0.374,
+ "step": 8064
+ },
+ {
+ "epoch": 3.813238770685579,
+ "grad_norm": 2.945139169692993,
+ "learning_rate": 1.4880998502018345e-06,
+ "loss": 0.3652,
+ "step": 8065
+ },
+ {
+ "epoch": 3.81371158392435,
+ "grad_norm": 2.8911492824554443,
+ "learning_rate": 1.4875294400197403e-06,
+ "loss": 0.3683,
+ "step": 8066
+ },
+ {
+ "epoch": 3.8141843971631206,
+ "grad_norm": 3.080268383026123,
+ "learning_rate": 1.4869590928797491e-06,
+ "loss": 0.3919,
+ "step": 8067
+ },
+ {
+ "epoch": 3.814657210401891,
+ "grad_norm": 3.0834288597106934,
+ "learning_rate": 1.4863888088173734e-06,
+ "loss": 0.3988,
+ "step": 8068
+ },
+ {
+ "epoch": 3.815130023640662,
+ "grad_norm": 2.765702724456787,
+ "learning_rate": 1.4858185878681213e-06,
+ "loss": 0.3659,
+ "step": 8069
+ },
+ {
+ "epoch": 3.8156028368794326,
+ "grad_norm": 3.074059247970581,
+ "learning_rate": 1.4852484300674993e-06,
+ "loss": 0.3888,
+ "step": 8070
+ },
+ {
+ "epoch": 3.8160756501182034,
+ "grad_norm": 3.0009944438934326,
+ "learning_rate": 1.484678335451007e-06,
+ "loss": 0.417,
+ "step": 8071
+ },
+ {
+ "epoch": 3.816548463356974,
+ "grad_norm": 2.6661112308502197,
+ "learning_rate": 1.4841083040541438e-06,
+ "loss": 0.3544,
+ "step": 8072
+ },
+ {
+ "epoch": 3.8170212765957445,
+ "grad_norm": 2.7849514484405518,
+ "learning_rate": 1.4835383359124018e-06,
+ "loss": 0.3691,
+ "step": 8073
+ },
+ {
+ "epoch": 3.8174940898345153,
+ "grad_norm": 3.008070707321167,
+ "learning_rate": 1.4829684310612697e-06,
+ "loss": 0.4228,
+ "step": 8074
+ },
+ {
+ "epoch": 3.817966903073286,
+ "grad_norm": 2.649296998977661,
+ "learning_rate": 1.4823985895362348e-06,
+ "loss": 0.3642,
+ "step": 8075
+ },
+ {
+ "epoch": 3.8184397163120565,
+ "grad_norm": 2.6017661094665527,
+ "learning_rate": 1.4818288113727768e-06,
+ "loss": 0.3537,
+ "step": 8076
+ },
+ {
+ "epoch": 3.8189125295508273,
+ "grad_norm": 2.9071972370147705,
+ "learning_rate": 1.481259096606375e-06,
+ "loss": 0.3096,
+ "step": 8077
+ },
+ {
+ "epoch": 3.819385342789598,
+ "grad_norm": 3.0866518020629883,
+ "learning_rate": 1.4806894452725024e-06,
+ "loss": 0.4148,
+ "step": 8078
+ },
+ {
+ "epoch": 3.819858156028369,
+ "grad_norm": 3.2099499702453613,
+ "learning_rate": 1.4801198574066272e-06,
+ "loss": 0.4058,
+ "step": 8079
+ },
+ {
+ "epoch": 3.8203309692671397,
+ "grad_norm": 3.0204920768737793,
+ "learning_rate": 1.4795503330442176e-06,
+ "loss": 0.3427,
+ "step": 8080
+ },
+ {
+ "epoch": 3.82080378250591,
+ "grad_norm": 2.88667368888855,
+ "learning_rate": 1.478980872220734e-06,
+ "loss": 0.4075,
+ "step": 8081
+ },
+ {
+ "epoch": 3.821276595744681,
+ "grad_norm": 2.926673173904419,
+ "learning_rate": 1.4784114749716338e-06,
+ "loss": 0.3449,
+ "step": 8082
+ },
+ {
+ "epoch": 3.8217494089834516,
+ "grad_norm": 2.818936347961426,
+ "learning_rate": 1.4778421413323723e-06,
+ "loss": 0.3628,
+ "step": 8083
+ },
+ {
+ "epoch": 3.822222222222222,
+ "grad_norm": 2.960322380065918,
+ "learning_rate": 1.4772728713383983e-06,
+ "loss": 0.3669,
+ "step": 8084
+ },
+ {
+ "epoch": 3.8226950354609928,
+ "grad_norm": 2.940131902694702,
+ "learning_rate": 1.4767036650251584e-06,
+ "loss": 0.4357,
+ "step": 8085
+ },
+ {
+ "epoch": 3.8231678486997636,
+ "grad_norm": 2.9251785278320312,
+ "learning_rate": 1.4761345224280943e-06,
+ "loss": 0.4046,
+ "step": 8086
+ },
+ {
+ "epoch": 3.8236406619385344,
+ "grad_norm": 3.115590810775757,
+ "learning_rate": 1.475565443582643e-06,
+ "loss": 0.3712,
+ "step": 8087
+ },
+ {
+ "epoch": 3.824113475177305,
+ "grad_norm": 2.5968618392944336,
+ "learning_rate": 1.4749964285242408e-06,
+ "loss": 0.3432,
+ "step": 8088
+ },
+ {
+ "epoch": 3.8245862884160755,
+ "grad_norm": 3.195409059524536,
+ "learning_rate": 1.4744274772883148e-06,
+ "loss": 0.3717,
+ "step": 8089
+ },
+ {
+ "epoch": 3.8250591016548463,
+ "grad_norm": 2.8658018112182617,
+ "learning_rate": 1.4738585899102942e-06,
+ "loss": 0.3807,
+ "step": 8090
+ },
+ {
+ "epoch": 3.825531914893617,
+ "grad_norm": 2.9005510807037354,
+ "learning_rate": 1.4732897664255998e-06,
+ "loss": 0.3988,
+ "step": 8091
+ },
+ {
+ "epoch": 3.8260047281323875,
+ "grad_norm": 3.9155731201171875,
+ "learning_rate": 1.472721006869649e-06,
+ "loss": 0.3981,
+ "step": 8092
+ },
+ {
+ "epoch": 3.8264775413711583,
+ "grad_norm": 2.89312744140625,
+ "learning_rate": 1.4721523112778575e-06,
+ "loss": 0.3286,
+ "step": 8093
+ },
+ {
+ "epoch": 3.826950354609929,
+ "grad_norm": 3.006071090698242,
+ "learning_rate": 1.4715836796856332e-06,
+ "loss": 0.3901,
+ "step": 8094
+ },
+ {
+ "epoch": 3.8274231678487,
+ "grad_norm": 3.083411693572998,
+ "learning_rate": 1.4710151121283845e-06,
+ "loss": 0.3741,
+ "step": 8095
+ },
+ {
+ "epoch": 3.8278959810874706,
+ "grad_norm": 2.864989995956421,
+ "learning_rate": 1.4704466086415131e-06,
+ "loss": 0.3887,
+ "step": 8096
+ },
+ {
+ "epoch": 3.828368794326241,
+ "grad_norm": 2.4846417903900146,
+ "learning_rate": 1.4698781692604158e-06,
+ "loss": 0.33,
+ "step": 8097
+ },
+ {
+ "epoch": 3.828841607565012,
+ "grad_norm": 3.2497007846832275,
+ "learning_rate": 1.4693097940204893e-06,
+ "loss": 0.4011,
+ "step": 8098
+ },
+ {
+ "epoch": 3.8293144208037826,
+ "grad_norm": 3.0079777240753174,
+ "learning_rate": 1.4687414829571218e-06,
+ "loss": 0.4263,
+ "step": 8099
+ },
+ {
+ "epoch": 3.829787234042553,
+ "grad_norm": 2.8538410663604736,
+ "learning_rate": 1.4681732361057005e-06,
+ "loss": 0.3651,
+ "step": 8100
+ },
+ {
+ "epoch": 3.8302600472813237,
+ "grad_norm": 3.238163948059082,
+ "learning_rate": 1.4676050535016076e-06,
+ "loss": 0.392,
+ "step": 8101
+ },
+ {
+ "epoch": 3.8307328605200945,
+ "grad_norm": 2.9991304874420166,
+ "learning_rate": 1.46703693518022e-06,
+ "loss": 0.3643,
+ "step": 8102
+ },
+ {
+ "epoch": 3.8312056737588653,
+ "grad_norm": 2.9816839694976807,
+ "learning_rate": 1.466468881176914e-06,
+ "loss": 0.3803,
+ "step": 8103
+ },
+ {
+ "epoch": 3.831678486997636,
+ "grad_norm": 3.2009265422821045,
+ "learning_rate": 1.465900891527059e-06,
+ "loss": 0.3828,
+ "step": 8104
+ },
+ {
+ "epoch": 3.8321513002364065,
+ "grad_norm": 2.9479124546051025,
+ "learning_rate": 1.4653329662660201e-06,
+ "loss": 0.3683,
+ "step": 8105
+ },
+ {
+ "epoch": 3.8326241134751773,
+ "grad_norm": 2.938507080078125,
+ "learning_rate": 1.4647651054291614e-06,
+ "loss": 0.3703,
+ "step": 8106
+ },
+ {
+ "epoch": 3.833096926713948,
+ "grad_norm": 2.7777645587921143,
+ "learning_rate": 1.4641973090518397e-06,
+ "loss": 0.3982,
+ "step": 8107
+ },
+ {
+ "epoch": 3.8335697399527184,
+ "grad_norm": 3.2470149993896484,
+ "learning_rate": 1.4636295771694099e-06,
+ "loss": 0.3748,
+ "step": 8108
+ },
+ {
+ "epoch": 3.8340425531914892,
+ "grad_norm": 2.869310140609741,
+ "learning_rate": 1.4630619098172223e-06,
+ "loss": 0.3577,
+ "step": 8109
+ },
+ {
+ "epoch": 3.83451536643026,
+ "grad_norm": 3.1245369911193848,
+ "learning_rate": 1.4624943070306225e-06,
+ "loss": 0.4518,
+ "step": 8110
+ },
+ {
+ "epoch": 3.834988179669031,
+ "grad_norm": 3.0390701293945312,
+ "learning_rate": 1.4619267688449529e-06,
+ "loss": 0.5051,
+ "step": 8111
+ },
+ {
+ "epoch": 3.8354609929078016,
+ "grad_norm": 2.929943799972534,
+ "learning_rate": 1.4613592952955507e-06,
+ "loss": 0.4207,
+ "step": 8112
+ },
+ {
+ "epoch": 3.835933806146572,
+ "grad_norm": 3.17008376121521,
+ "learning_rate": 1.4607918864177523e-06,
+ "loss": 0.3836,
+ "step": 8113
+ },
+ {
+ "epoch": 3.8364066193853428,
+ "grad_norm": 3.0689237117767334,
+ "learning_rate": 1.460224542246886e-06,
+ "loss": 0.3413,
+ "step": 8114
+ },
+ {
+ "epoch": 3.8368794326241136,
+ "grad_norm": 2.9966423511505127,
+ "learning_rate": 1.4596572628182774e-06,
+ "loss": 0.4367,
+ "step": 8115
+ },
+ {
+ "epoch": 3.837352245862884,
+ "grad_norm": 3.0572052001953125,
+ "learning_rate": 1.45909004816725e-06,
+ "loss": 0.4089,
+ "step": 8116
+ },
+ {
+ "epoch": 3.8378250591016547,
+ "grad_norm": 2.911263942718506,
+ "learning_rate": 1.4585228983291203e-06,
+ "loss": 0.3848,
+ "step": 8117
+ },
+ {
+ "epoch": 3.8382978723404255,
+ "grad_norm": 2.9233853816986084,
+ "learning_rate": 1.4579558133392038e-06,
+ "loss": 0.4012,
+ "step": 8118
+ },
+ {
+ "epoch": 3.8387706855791963,
+ "grad_norm": 2.7813868522644043,
+ "learning_rate": 1.4573887932328097e-06,
+ "loss": 0.3898,
+ "step": 8119
+ },
+ {
+ "epoch": 3.839243498817967,
+ "grad_norm": 2.8727006912231445,
+ "learning_rate": 1.4568218380452436e-06,
+ "loss": 0.3965,
+ "step": 8120
+ },
+ {
+ "epoch": 3.8397163120567375,
+ "grad_norm": 3.0381174087524414,
+ "learning_rate": 1.4562549478118077e-06,
+ "loss": 0.4304,
+ "step": 8121
+ },
+ {
+ "epoch": 3.8401891252955083,
+ "grad_norm": 2.7406346797943115,
+ "learning_rate": 1.4556881225677982e-06,
+ "loss": 0.3636,
+ "step": 8122
+ },
+ {
+ "epoch": 3.840661938534279,
+ "grad_norm": 3.3900108337402344,
+ "learning_rate": 1.4551213623485111e-06,
+ "loss": 0.3863,
+ "step": 8123
+ },
+ {
+ "epoch": 3.8411347517730494,
+ "grad_norm": 2.885150909423828,
+ "learning_rate": 1.4545546671892354e-06,
+ "loss": 0.3679,
+ "step": 8124
+ },
+ {
+ "epoch": 3.84160756501182,
+ "grad_norm": 3.3361690044403076,
+ "learning_rate": 1.4539880371252555e-06,
+ "loss": 0.4333,
+ "step": 8125
+ },
+ {
+ "epoch": 3.842080378250591,
+ "grad_norm": 3.1547763347625732,
+ "learning_rate": 1.4534214721918545e-06,
+ "loss": 0.4477,
+ "step": 8126
+ },
+ {
+ "epoch": 3.842553191489362,
+ "grad_norm": 3.0337510108947754,
+ "learning_rate": 1.4528549724243095e-06,
+ "loss": 0.3647,
+ "step": 8127
+ },
+ {
+ "epoch": 3.8430260047281326,
+ "grad_norm": 2.8390069007873535,
+ "learning_rate": 1.452288537857893e-06,
+ "loss": 0.3698,
+ "step": 8128
+ },
+ {
+ "epoch": 3.843498817966903,
+ "grad_norm": 2.857513427734375,
+ "learning_rate": 1.451722168527876e-06,
+ "loss": 0.3842,
+ "step": 8129
+ },
+ {
+ "epoch": 3.8439716312056738,
+ "grad_norm": 3.015320062637329,
+ "learning_rate": 1.451155864469522e-06,
+ "loss": 0.4058,
+ "step": 8130
+ },
+ {
+ "epoch": 3.8444444444444446,
+ "grad_norm": 2.923957347869873,
+ "learning_rate": 1.450589625718094e-06,
+ "loss": 0.3976,
+ "step": 8131
+ },
+ {
+ "epoch": 3.844917257683215,
+ "grad_norm": 3.332338571548462,
+ "learning_rate": 1.4500234523088492e-06,
+ "loss": 0.4118,
+ "step": 8132
+ },
+ {
+ "epoch": 3.8453900709219857,
+ "grad_norm": 3.0403711795806885,
+ "learning_rate": 1.4494573442770381e-06,
+ "loss": 0.3715,
+ "step": 8133
+ },
+ {
+ "epoch": 3.8458628841607565,
+ "grad_norm": 3.2310287952423096,
+ "learning_rate": 1.4488913016579135e-06,
+ "loss": 0.4587,
+ "step": 8134
+ },
+ {
+ "epoch": 3.8463356973995273,
+ "grad_norm": 3.091282844543457,
+ "learning_rate": 1.448325324486718e-06,
+ "loss": 0.4234,
+ "step": 8135
+ },
+ {
+ "epoch": 3.846808510638298,
+ "grad_norm": 3.11161208152771,
+ "learning_rate": 1.4477594127986933e-06,
+ "loss": 0.4176,
+ "step": 8136
+ },
+ {
+ "epoch": 3.8472813238770684,
+ "grad_norm": 3.21042537689209,
+ "learning_rate": 1.4471935666290751e-06,
+ "loss": 0.4326,
+ "step": 8137
+ },
+ {
+ "epoch": 3.8477541371158392,
+ "grad_norm": 3.411543846130371,
+ "learning_rate": 1.4466277860130981e-06,
+ "loss": 0.4525,
+ "step": 8138
+ },
+ {
+ "epoch": 3.84822695035461,
+ "grad_norm": 3.0475308895111084,
+ "learning_rate": 1.4460620709859898e-06,
+ "loss": 0.3906,
+ "step": 8139
+ },
+ {
+ "epoch": 3.8486997635933804,
+ "grad_norm": 2.989367723464966,
+ "learning_rate": 1.4454964215829742e-06,
+ "loss": 0.3732,
+ "step": 8140
+ },
+ {
+ "epoch": 3.849172576832151,
+ "grad_norm": 2.8130393028259277,
+ "learning_rate": 1.4449308378392734e-06,
+ "loss": 0.3733,
+ "step": 8141
+ },
+ {
+ "epoch": 3.849645390070922,
+ "grad_norm": 12.2243013381958,
+ "learning_rate": 1.444365319790103e-06,
+ "loss": 0.3506,
+ "step": 8142
+ },
+ {
+ "epoch": 3.850118203309693,
+ "grad_norm": 3.075556516647339,
+ "learning_rate": 1.4437998674706743e-06,
+ "loss": 0.376,
+ "step": 8143
+ },
+ {
+ "epoch": 3.8505910165484636,
+ "grad_norm": 2.765650510787964,
+ "learning_rate": 1.4432344809161974e-06,
+ "loss": 0.3865,
+ "step": 8144
+ },
+ {
+ "epoch": 3.851063829787234,
+ "grad_norm": 3.171588897705078,
+ "learning_rate": 1.4426691601618747e-06,
+ "loss": 0.4391,
+ "step": 8145
+ },
+ {
+ "epoch": 3.8515366430260047,
+ "grad_norm": 2.8378992080688477,
+ "learning_rate": 1.4421039052429083e-06,
+ "loss": 0.3984,
+ "step": 8146
+ },
+ {
+ "epoch": 3.8520094562647755,
+ "grad_norm": 2.6588387489318848,
+ "learning_rate": 1.4415387161944929e-06,
+ "loss": 0.3961,
+ "step": 8147
+ },
+ {
+ "epoch": 3.852482269503546,
+ "grad_norm": 2.919325351715088,
+ "learning_rate": 1.4409735930518197e-06,
+ "loss": 0.4058,
+ "step": 8148
+ },
+ {
+ "epoch": 3.8529550827423167,
+ "grad_norm": 3.2239115238189697,
+ "learning_rate": 1.4404085358500778e-06,
+ "loss": 0.4018,
+ "step": 8149
+ },
+ {
+ "epoch": 3.8534278959810875,
+ "grad_norm": 3.2509875297546387,
+ "learning_rate": 1.4398435446244502e-06,
+ "loss": 0.4078,
+ "step": 8150
+ },
+ {
+ "epoch": 3.8539007092198583,
+ "grad_norm": 3.124782085418701,
+ "learning_rate": 1.4392786194101155e-06,
+ "loss": 0.4459,
+ "step": 8151
+ },
+ {
+ "epoch": 3.854373522458629,
+ "grad_norm": 2.924095392227173,
+ "learning_rate": 1.4387137602422512e-06,
+ "loss": 0.3686,
+ "step": 8152
+ },
+ {
+ "epoch": 3.8548463356973994,
+ "grad_norm": 2.9307191371917725,
+ "learning_rate": 1.4381489671560272e-06,
+ "loss": 0.4345,
+ "step": 8153
+ },
+ {
+ "epoch": 3.8553191489361702,
+ "grad_norm": 2.868488073348999,
+ "learning_rate": 1.4375842401866113e-06,
+ "loss": 0.366,
+ "step": 8154
+ },
+ {
+ "epoch": 3.855791962174941,
+ "grad_norm": 2.9893085956573486,
+ "learning_rate": 1.4370195793691661e-06,
+ "loss": 0.3401,
+ "step": 8155
+ },
+ {
+ "epoch": 3.8562647754137114,
+ "grad_norm": 3.0113472938537598,
+ "learning_rate": 1.4364549847388492e-06,
+ "loss": 0.4051,
+ "step": 8156
+ },
+ {
+ "epoch": 3.856737588652482,
+ "grad_norm": 3.4693121910095215,
+ "learning_rate": 1.4358904563308184e-06,
+ "loss": 0.4505,
+ "step": 8157
+ },
+ {
+ "epoch": 3.857210401891253,
+ "grad_norm": 2.9048118591308594,
+ "learning_rate": 1.4353259941802216e-06,
+ "loss": 0.3973,
+ "step": 8158
+ },
+ {
+ "epoch": 3.8576832151300238,
+ "grad_norm": 3.264910936355591,
+ "learning_rate": 1.434761598322208e-06,
+ "loss": 0.4317,
+ "step": 8159
+ },
+ {
+ "epoch": 3.8581560283687946,
+ "grad_norm": 2.973742723464966,
+ "learning_rate": 1.4341972687919186e-06,
+ "loss": 0.3896,
+ "step": 8160
+ },
+ {
+ "epoch": 3.858628841607565,
+ "grad_norm": 2.7802605628967285,
+ "learning_rate": 1.4336330056244906e-06,
+ "loss": 0.4063,
+ "step": 8161
+ },
+ {
+ "epoch": 3.8591016548463357,
+ "grad_norm": 3.1401731967926025,
+ "learning_rate": 1.433068808855061e-06,
+ "loss": 0.4068,
+ "step": 8162
+ },
+ {
+ "epoch": 3.8595744680851065,
+ "grad_norm": 3.132723331451416,
+ "learning_rate": 1.432504678518757e-06,
+ "loss": 0.4724,
+ "step": 8163
+ },
+ {
+ "epoch": 3.860047281323877,
+ "grad_norm": 2.94944167137146,
+ "learning_rate": 1.4319406146507068e-06,
+ "loss": 0.3666,
+ "step": 8164
+ },
+ {
+ "epoch": 3.8605200945626477,
+ "grad_norm": 2.972322463989258,
+ "learning_rate": 1.4313766172860311e-06,
+ "loss": 0.4226,
+ "step": 8165
+ },
+ {
+ "epoch": 3.8609929078014185,
+ "grad_norm": 2.9808123111724854,
+ "learning_rate": 1.430812686459847e-06,
+ "loss": 0.4079,
+ "step": 8166
+ },
+ {
+ "epoch": 3.8614657210401893,
+ "grad_norm": 2.9656291007995605,
+ "learning_rate": 1.4302488222072698e-06,
+ "loss": 0.3423,
+ "step": 8167
+ },
+ {
+ "epoch": 3.86193853427896,
+ "grad_norm": 2.886765241622925,
+ "learning_rate": 1.4296850245634073e-06,
+ "loss": 0.3577,
+ "step": 8168
+ },
+ {
+ "epoch": 3.8624113475177304,
+ "grad_norm": 3.0613043308258057,
+ "learning_rate": 1.4291212935633653e-06,
+ "loss": 0.4121,
+ "step": 8169
+ },
+ {
+ "epoch": 3.862884160756501,
+ "grad_norm": 2.842050313949585,
+ "learning_rate": 1.4285576292422445e-06,
+ "loss": 0.373,
+ "step": 8170
+ },
+ {
+ "epoch": 3.863356973995272,
+ "grad_norm": 3.0604517459869385,
+ "learning_rate": 1.4279940316351413e-06,
+ "loss": 0.3938,
+ "step": 8171
+ },
+ {
+ "epoch": 3.8638297872340424,
+ "grad_norm": 3.9742302894592285,
+ "learning_rate": 1.42743050077715e-06,
+ "loss": 0.4463,
+ "step": 8172
+ },
+ {
+ "epoch": 3.864302600472813,
+ "grad_norm": 2.8330607414245605,
+ "learning_rate": 1.4268670367033572e-06,
+ "loss": 0.4423,
+ "step": 8173
+ },
+ {
+ "epoch": 3.864775413711584,
+ "grad_norm": 2.953256607055664,
+ "learning_rate": 1.4263036394488497e-06,
+ "loss": 0.3553,
+ "step": 8174
+ },
+ {
+ "epoch": 3.8652482269503547,
+ "grad_norm": 2.865849018096924,
+ "learning_rate": 1.4257403090487065e-06,
+ "loss": 0.3348,
+ "step": 8175
+ },
+ {
+ "epoch": 3.8657210401891255,
+ "grad_norm": 2.712502956390381,
+ "learning_rate": 1.4251770455380027e-06,
+ "loss": 0.3896,
+ "step": 8176
+ },
+ {
+ "epoch": 3.866193853427896,
+ "grad_norm": 2.798898220062256,
+ "learning_rate": 1.4246138489518123e-06,
+ "loss": 0.4275,
+ "step": 8177
+ },
+ {
+ "epoch": 3.8666666666666667,
+ "grad_norm": 2.830899953842163,
+ "learning_rate": 1.4240507193252023e-06,
+ "loss": 0.3952,
+ "step": 8178
+ },
+ {
+ "epoch": 3.8671394799054375,
+ "grad_norm": 2.5789451599121094,
+ "learning_rate": 1.4234876566932348e-06,
+ "loss": 0.3483,
+ "step": 8179
+ },
+ {
+ "epoch": 3.867612293144208,
+ "grad_norm": 2.8513095378875732,
+ "learning_rate": 1.422924661090972e-06,
+ "loss": 0.3403,
+ "step": 8180
+ },
+ {
+ "epoch": 3.8680851063829786,
+ "grad_norm": 3.5031449794769287,
+ "learning_rate": 1.4223617325534664e-06,
+ "loss": 0.3964,
+ "step": 8181
+ },
+ {
+ "epoch": 3.8685579196217494,
+ "grad_norm": 2.7495479583740234,
+ "learning_rate": 1.4217988711157715e-06,
+ "loss": 0.3376,
+ "step": 8182
+ },
+ {
+ "epoch": 3.8690307328605202,
+ "grad_norm": 2.8609421253204346,
+ "learning_rate": 1.421236076812933e-06,
+ "loss": 0.3967,
+ "step": 8183
+ },
+ {
+ "epoch": 3.869503546099291,
+ "grad_norm": 3.0624637603759766,
+ "learning_rate": 1.420673349679994e-06,
+ "loss": 0.3764,
+ "step": 8184
+ },
+ {
+ "epoch": 3.8699763593380614,
+ "grad_norm": 3.3084404468536377,
+ "learning_rate": 1.4201106897519926e-06,
+ "loss": 0.4567,
+ "step": 8185
+ },
+ {
+ "epoch": 3.870449172576832,
+ "grad_norm": 3.164116382598877,
+ "learning_rate": 1.4195480970639624e-06,
+ "loss": 0.4217,
+ "step": 8186
+ },
+ {
+ "epoch": 3.870921985815603,
+ "grad_norm": 2.971390724182129,
+ "learning_rate": 1.4189855716509355e-06,
+ "loss": 0.3981,
+ "step": 8187
+ },
+ {
+ "epoch": 3.8713947990543733,
+ "grad_norm": 3.0537233352661133,
+ "learning_rate": 1.418423113547937e-06,
+ "loss": 0.4093,
+ "step": 8188
+ },
+ {
+ "epoch": 3.871867612293144,
+ "grad_norm": 3.698120594024658,
+ "learning_rate": 1.4178607227899877e-06,
+ "loss": 0.3158,
+ "step": 8189
+ },
+ {
+ "epoch": 3.872340425531915,
+ "grad_norm": 3.0320451259613037,
+ "learning_rate": 1.417298399412107e-06,
+ "loss": 0.3903,
+ "step": 8190
+ },
+ {
+ "epoch": 3.8728132387706857,
+ "grad_norm": 2.913296699523926,
+ "learning_rate": 1.4167361434493068e-06,
+ "loss": 0.3396,
+ "step": 8191
+ },
+ {
+ "epoch": 3.8732860520094565,
+ "grad_norm": 3.011906147003174,
+ "learning_rate": 1.4161739549365976e-06,
+ "loss": 0.3915,
+ "step": 8192
+ },
+ {
+ "epoch": 3.873758865248227,
+ "grad_norm": 3.2707724571228027,
+ "learning_rate": 1.4156118339089842e-06,
+ "loss": 0.4466,
+ "step": 8193
+ },
+ {
+ "epoch": 3.8742316784869977,
+ "grad_norm": 3.036747694015503,
+ "learning_rate": 1.4150497804014656e-06,
+ "loss": 0.4095,
+ "step": 8194
+ },
+ {
+ "epoch": 3.8747044917257685,
+ "grad_norm": 2.8851394653320312,
+ "learning_rate": 1.4144877944490411e-06,
+ "loss": 0.4235,
+ "step": 8195
+ },
+ {
+ "epoch": 3.875177304964539,
+ "grad_norm": 3.099785566329956,
+ "learning_rate": 1.4139258760867008e-06,
+ "loss": 0.4102,
+ "step": 8196
+ },
+ {
+ "epoch": 3.8756501182033096,
+ "grad_norm": 3.0752081871032715,
+ "learning_rate": 1.4133640253494347e-06,
+ "loss": 0.4165,
+ "step": 8197
+ },
+ {
+ "epoch": 3.8761229314420804,
+ "grad_norm": 2.842257261276245,
+ "learning_rate": 1.412802242272226e-06,
+ "loss": 0.3573,
+ "step": 8198
+ },
+ {
+ "epoch": 3.876595744680851,
+ "grad_norm": 2.93868350982666,
+ "learning_rate": 1.4122405268900547e-06,
+ "loss": 0.36,
+ "step": 8199
+ },
+ {
+ "epoch": 3.877068557919622,
+ "grad_norm": 2.674356460571289,
+ "learning_rate": 1.411678879237896e-06,
+ "loss": 0.3763,
+ "step": 8200
+ },
+ {
+ "epoch": 3.8775413711583924,
+ "grad_norm": 2.710617780685425,
+ "learning_rate": 1.411117299350721e-06,
+ "loss": 0.358,
+ "step": 8201
+ },
+ {
+ "epoch": 3.878014184397163,
+ "grad_norm": 3.0299410820007324,
+ "learning_rate": 1.4105557872634968e-06,
+ "loss": 0.3723,
+ "step": 8202
+ },
+ {
+ "epoch": 3.878486997635934,
+ "grad_norm": 3.1951241493225098,
+ "learning_rate": 1.4099943430111874e-06,
+ "loss": 0.4163,
+ "step": 8203
+ },
+ {
+ "epoch": 3.8789598108747043,
+ "grad_norm": 2.752410411834717,
+ "learning_rate": 1.4094329666287495e-06,
+ "loss": 0.3753,
+ "step": 8204
+ },
+ {
+ "epoch": 3.879432624113475,
+ "grad_norm": 3.1242496967315674,
+ "learning_rate": 1.40887165815114e-06,
+ "loss": 0.3694,
+ "step": 8205
+ },
+ {
+ "epoch": 3.879905437352246,
+ "grad_norm": 5.16750431060791,
+ "learning_rate": 1.4083104176133079e-06,
+ "loss": 0.3869,
+ "step": 8206
+ },
+ {
+ "epoch": 3.8803782505910167,
+ "grad_norm": 3.2995245456695557,
+ "learning_rate": 1.4077492450501978e-06,
+ "loss": 0.4194,
+ "step": 8207
+ },
+ {
+ "epoch": 3.8808510638297875,
+ "grad_norm": 3.506807804107666,
+ "learning_rate": 1.4071881404967541e-06,
+ "loss": 0.3873,
+ "step": 8208
+ },
+ {
+ "epoch": 3.881323877068558,
+ "grad_norm": 3.1201252937316895,
+ "learning_rate": 1.4066271039879123e-06,
+ "loss": 0.3625,
+ "step": 8209
+ },
+ {
+ "epoch": 3.8817966903073287,
+ "grad_norm": 2.870683193206787,
+ "learning_rate": 1.4060661355586073e-06,
+ "loss": 0.4039,
+ "step": 8210
+ },
+ {
+ "epoch": 3.8822695035460995,
+ "grad_norm": 3.177701234817505,
+ "learning_rate": 1.405505235243767e-06,
+ "loss": 0.3715,
+ "step": 8211
+ },
+ {
+ "epoch": 3.88274231678487,
+ "grad_norm": 3.0319771766662598,
+ "learning_rate": 1.4049444030783157e-06,
+ "loss": 0.3588,
+ "step": 8212
+ },
+ {
+ "epoch": 3.8832151300236406,
+ "grad_norm": 2.4598889350891113,
+ "learning_rate": 1.404383639097176e-06,
+ "loss": 0.2788,
+ "step": 8213
+ },
+ {
+ "epoch": 3.8836879432624114,
+ "grad_norm": 2.916987419128418,
+ "learning_rate": 1.4038229433352623e-06,
+ "loss": 0.4167,
+ "step": 8214
+ },
+ {
+ "epoch": 3.884160756501182,
+ "grad_norm": 3.005075216293335,
+ "learning_rate": 1.4032623158274872e-06,
+ "loss": 0.4251,
+ "step": 8215
+ },
+ {
+ "epoch": 3.8846335697399526,
+ "grad_norm": 3.1718621253967285,
+ "learning_rate": 1.4027017566087591e-06,
+ "loss": 0.383,
+ "step": 8216
+ },
+ {
+ "epoch": 3.8851063829787233,
+ "grad_norm": 2.954662322998047,
+ "learning_rate": 1.402141265713981e-06,
+ "loss": 0.373,
+ "step": 8217
+ },
+ {
+ "epoch": 3.885579196217494,
+ "grad_norm": 3.408008337020874,
+ "learning_rate": 1.4015808431780526e-06,
+ "loss": 0.4216,
+ "step": 8218
+ },
+ {
+ "epoch": 3.8860520094562645,
+ "grad_norm": 3.1599369049072266,
+ "learning_rate": 1.4010204890358675e-06,
+ "loss": 0.4544,
+ "step": 8219
+ },
+ {
+ "epoch": 3.8865248226950353,
+ "grad_norm": 2.8919107913970947,
+ "learning_rate": 1.4004602033223186e-06,
+ "loss": 0.3785,
+ "step": 8220
+ },
+ {
+ "epoch": 3.886997635933806,
+ "grad_norm": 3.522581100463867,
+ "learning_rate": 1.3998999860722918e-06,
+ "loss": 0.4276,
+ "step": 8221
+ },
+ {
+ "epoch": 3.887470449172577,
+ "grad_norm": 2.9278945922851562,
+ "learning_rate": 1.399339837320668e-06,
+ "loss": 0.409,
+ "step": 8222
+ },
+ {
+ "epoch": 3.8879432624113477,
+ "grad_norm": 3.032557725906372,
+ "learning_rate": 1.398779757102327e-06,
+ "loss": 0.3973,
+ "step": 8223
+ },
+ {
+ "epoch": 3.888416075650118,
+ "grad_norm": 2.843118667602539,
+ "learning_rate": 1.3982197454521423e-06,
+ "loss": 0.3418,
+ "step": 8224
+ },
+ {
+ "epoch": 3.888888888888889,
+ "grad_norm": 2.8620638847351074,
+ "learning_rate": 1.3976598024049815e-06,
+ "loss": 0.3751,
+ "step": 8225
+ },
+ {
+ "epoch": 3.8893617021276596,
+ "grad_norm": 2.532327175140381,
+ "learning_rate": 1.3970999279957124e-06,
+ "loss": 0.3541,
+ "step": 8226
+ },
+ {
+ "epoch": 3.88983451536643,
+ "grad_norm": 3.1074535846710205,
+ "learning_rate": 1.3965401222591935e-06,
+ "loss": 0.4706,
+ "step": 8227
+ },
+ {
+ "epoch": 3.890307328605201,
+ "grad_norm": 3.1558735370635986,
+ "learning_rate": 1.3959803852302839e-06,
+ "loss": 0.448,
+ "step": 8228
+ },
+ {
+ "epoch": 3.8907801418439716,
+ "grad_norm": 3.0862064361572266,
+ "learning_rate": 1.3954207169438344e-06,
+ "loss": 0.3308,
+ "step": 8229
+ },
+ {
+ "epoch": 3.8912529550827424,
+ "grad_norm": 2.9246280193328857,
+ "learning_rate": 1.3948611174346927e-06,
+ "loss": 0.3771,
+ "step": 8230
+ },
+ {
+ "epoch": 3.891725768321513,
+ "grad_norm": 2.7959492206573486,
+ "learning_rate": 1.394301586737704e-06,
+ "loss": 0.4248,
+ "step": 8231
+ },
+ {
+ "epoch": 3.8921985815602835,
+ "grad_norm": 2.787670373916626,
+ "learning_rate": 1.3937421248877075e-06,
+ "loss": 0.3416,
+ "step": 8232
+ },
+ {
+ "epoch": 3.8926713947990543,
+ "grad_norm": 3.0775792598724365,
+ "learning_rate": 1.393182731919538e-06,
+ "loss": 0.4345,
+ "step": 8233
+ },
+ {
+ "epoch": 3.893144208037825,
+ "grad_norm": 2.6338887214660645,
+ "learning_rate": 1.3926234078680268e-06,
+ "loss": 0.3995,
+ "step": 8234
+ },
+ {
+ "epoch": 3.8936170212765955,
+ "grad_norm": 2.9975900650024414,
+ "learning_rate": 1.392064152767999e-06,
+ "loss": 0.3997,
+ "step": 8235
+ },
+ {
+ "epoch": 3.8940898345153663,
+ "grad_norm": 2.8615779876708984,
+ "learning_rate": 1.3915049666542791e-06,
+ "loss": 0.3687,
+ "step": 8236
+ },
+ {
+ "epoch": 3.894562647754137,
+ "grad_norm": 3.0132436752319336,
+ "learning_rate": 1.3909458495616835e-06,
+ "loss": 0.4085,
+ "step": 8237
+ },
+ {
+ "epoch": 3.895035460992908,
+ "grad_norm": 3.141291379928589,
+ "learning_rate": 1.3903868015250278e-06,
+ "loss": 0.3903,
+ "step": 8238
+ },
+ {
+ "epoch": 3.8955082742316787,
+ "grad_norm": 2.6998603343963623,
+ "learning_rate": 1.3898278225791204e-06,
+ "loss": 0.3576,
+ "step": 8239
+ },
+ {
+ "epoch": 3.895981087470449,
+ "grad_norm": 3.212578535079956,
+ "learning_rate": 1.3892689127587656e-06,
+ "loss": 0.4321,
+ "step": 8240
+ },
+ {
+ "epoch": 3.89645390070922,
+ "grad_norm": 3.15732741355896,
+ "learning_rate": 1.3887100720987662e-06,
+ "loss": 0.4247,
+ "step": 8241
+ },
+ {
+ "epoch": 3.8969267139479906,
+ "grad_norm": 2.6001040935516357,
+ "learning_rate": 1.3881513006339168e-06,
+ "loss": 0.3376,
+ "step": 8242
+ },
+ {
+ "epoch": 3.897399527186761,
+ "grad_norm": 2.766188859939575,
+ "learning_rate": 1.3875925983990113e-06,
+ "loss": 0.3771,
+ "step": 8243
+ },
+ {
+ "epoch": 3.8978723404255318,
+ "grad_norm": 2.7471580505371094,
+ "learning_rate": 1.3870339654288372e-06,
+ "loss": 0.3311,
+ "step": 8244
+ },
+ {
+ "epoch": 3.8983451536643026,
+ "grad_norm": 3.577664375305176,
+ "learning_rate": 1.3864754017581769e-06,
+ "loss": 0.3725,
+ "step": 8245
+ },
+ {
+ "epoch": 3.8988179669030734,
+ "grad_norm": 2.8747243881225586,
+ "learning_rate": 1.3859169074218116e-06,
+ "loss": 0.3706,
+ "step": 8246
+ },
+ {
+ "epoch": 3.899290780141844,
+ "grad_norm": 2.5249671936035156,
+ "learning_rate": 1.3853584824545152e-06,
+ "loss": 0.3621,
+ "step": 8247
+ },
+ {
+ "epoch": 3.8997635933806145,
+ "grad_norm": 2.7290890216827393,
+ "learning_rate": 1.3848001268910589e-06,
+ "loss": 0.3209,
+ "step": 8248
+ },
+ {
+ "epoch": 3.9002364066193853,
+ "grad_norm": 3.0917534828186035,
+ "learning_rate": 1.3842418407662084e-06,
+ "loss": 0.3904,
+ "step": 8249
+ },
+ {
+ "epoch": 3.900709219858156,
+ "grad_norm": 3.099494695663452,
+ "learning_rate": 1.383683624114725e-06,
+ "loss": 0.3714,
+ "step": 8250
+ },
+ {
+ "epoch": 3.9011820330969265,
+ "grad_norm": 3.077505588531494,
+ "learning_rate": 1.3831254769713687e-06,
+ "loss": 0.4166,
+ "step": 8251
+ },
+ {
+ "epoch": 3.9016548463356973,
+ "grad_norm": 2.9983766078948975,
+ "learning_rate": 1.3825673993708915e-06,
+ "loss": 0.3909,
+ "step": 8252
+ },
+ {
+ "epoch": 3.902127659574468,
+ "grad_norm": 2.7958667278289795,
+ "learning_rate": 1.3820093913480415e-06,
+ "loss": 0.3966,
+ "step": 8253
+ },
+ {
+ "epoch": 3.902600472813239,
+ "grad_norm": 3.0938336849212646,
+ "learning_rate": 1.3814514529375656e-06,
+ "loss": 0.4118,
+ "step": 8254
+ },
+ {
+ "epoch": 3.9030732860520096,
+ "grad_norm": 3.2711637020111084,
+ "learning_rate": 1.3808935841742016e-06,
+ "loss": 0.4021,
+ "step": 8255
+ },
+ {
+ "epoch": 3.90354609929078,
+ "grad_norm": 3.23563814163208,
+ "learning_rate": 1.3803357850926885e-06,
+ "loss": 0.3679,
+ "step": 8256
+ },
+ {
+ "epoch": 3.904018912529551,
+ "grad_norm": 2.77942156791687,
+ "learning_rate": 1.3797780557277563e-06,
+ "loss": 0.3938,
+ "step": 8257
+ },
+ {
+ "epoch": 3.9044917257683216,
+ "grad_norm": 3.1273257732391357,
+ "learning_rate": 1.3792203961141313e-06,
+ "loss": 0.3579,
+ "step": 8258
+ },
+ {
+ "epoch": 3.904964539007092,
+ "grad_norm": 3.69164776802063,
+ "learning_rate": 1.378662806286539e-06,
+ "loss": 0.3712,
+ "step": 8259
+ },
+ {
+ "epoch": 3.9054373522458627,
+ "grad_norm": 2.8818306922912598,
+ "learning_rate": 1.3781052862796957e-06,
+ "loss": 0.3972,
+ "step": 8260
+ },
+ {
+ "epoch": 3.9059101654846335,
+ "grad_norm": 2.776651382446289,
+ "learning_rate": 1.377547836128318e-06,
+ "loss": 0.3605,
+ "step": 8261
+ },
+ {
+ "epoch": 3.9063829787234043,
+ "grad_norm": 3.1498706340789795,
+ "learning_rate": 1.376990455867115e-06,
+ "loss": 0.3995,
+ "step": 8262
+ },
+ {
+ "epoch": 3.906855791962175,
+ "grad_norm": 2.777390956878662,
+ "learning_rate": 1.3764331455307916e-06,
+ "loss": 0.3463,
+ "step": 8263
+ },
+ {
+ "epoch": 3.9073286052009455,
+ "grad_norm": 2.9953835010528564,
+ "learning_rate": 1.3758759051540496e-06,
+ "loss": 0.3881,
+ "step": 8264
+ },
+ {
+ "epoch": 3.9078014184397163,
+ "grad_norm": 3.737194538116455,
+ "learning_rate": 1.375318734771585e-06,
+ "loss": 0.4456,
+ "step": 8265
+ },
+ {
+ "epoch": 3.908274231678487,
+ "grad_norm": 3.1575849056243896,
+ "learning_rate": 1.374761634418092e-06,
+ "loss": 0.3613,
+ "step": 8266
+ },
+ {
+ "epoch": 3.9087470449172574,
+ "grad_norm": 3.140662908554077,
+ "learning_rate": 1.374204604128258e-06,
+ "loss": 0.4462,
+ "step": 8267
+ },
+ {
+ "epoch": 3.9092198581560282,
+ "grad_norm": 3.2106714248657227,
+ "learning_rate": 1.3736476439367663e-06,
+ "loss": 0.3801,
+ "step": 8268
+ },
+ {
+ "epoch": 3.909692671394799,
+ "grad_norm": 2.888345956802368,
+ "learning_rate": 1.3730907538782976e-06,
+ "loss": 0.4209,
+ "step": 8269
+ },
+ {
+ "epoch": 3.91016548463357,
+ "grad_norm": 2.8903355598449707,
+ "learning_rate": 1.3725339339875252e-06,
+ "loss": 0.3612,
+ "step": 8270
+ },
+ {
+ "epoch": 3.9106382978723406,
+ "grad_norm": 3.2661736011505127,
+ "learning_rate": 1.371977184299122e-06,
+ "loss": 0.4151,
+ "step": 8271
+ },
+ {
+ "epoch": 3.911111111111111,
+ "grad_norm": 3.1532459259033203,
+ "learning_rate": 1.3714205048477535e-06,
+ "loss": 0.3706,
+ "step": 8272
+ },
+ {
+ "epoch": 3.911583924349882,
+ "grad_norm": 2.907306432723999,
+ "learning_rate": 1.3708638956680804e-06,
+ "loss": 0.4113,
+ "step": 8273
+ },
+ {
+ "epoch": 3.9120567375886526,
+ "grad_norm": 2.7301599979400635,
+ "learning_rate": 1.3703073567947622e-06,
+ "loss": 0.355,
+ "step": 8274
+ },
+ {
+ "epoch": 3.912529550827423,
+ "grad_norm": 2.595625877380371,
+ "learning_rate": 1.3697508882624516e-06,
+ "loss": 0.3733,
+ "step": 8275
+ },
+ {
+ "epoch": 3.9130023640661937,
+ "grad_norm": 2.784294366836548,
+ "learning_rate": 1.369194490105796e-06,
+ "loss": 0.3366,
+ "step": 8276
+ },
+ {
+ "epoch": 3.9134751773049645,
+ "grad_norm": 3.0179800987243652,
+ "learning_rate": 1.3686381623594419e-06,
+ "loss": 0.3922,
+ "step": 8277
+ },
+ {
+ "epoch": 3.9139479905437353,
+ "grad_norm": 2.6641111373901367,
+ "learning_rate": 1.3680819050580291e-06,
+ "loss": 0.3324,
+ "step": 8278
+ },
+ {
+ "epoch": 3.914420803782506,
+ "grad_norm": 2.917741060256958,
+ "learning_rate": 1.3675257182361923e-06,
+ "loss": 0.3784,
+ "step": 8279
+ },
+ {
+ "epoch": 3.9148936170212765,
+ "grad_norm": 2.959599018096924,
+ "learning_rate": 1.3669696019285626e-06,
+ "loss": 0.3846,
+ "step": 8280
+ },
+ {
+ "epoch": 3.9153664302600473,
+ "grad_norm": 3.078824043273926,
+ "learning_rate": 1.3664135561697683e-06,
+ "loss": 0.4357,
+ "step": 8281
+ },
+ {
+ "epoch": 3.915839243498818,
+ "grad_norm": 3.0174930095672607,
+ "learning_rate": 1.3658575809944313e-06,
+ "loss": 0.3643,
+ "step": 8282
+ },
+ {
+ "epoch": 3.9163120567375884,
+ "grad_norm": 2.6805408000946045,
+ "learning_rate": 1.365301676437169e-06,
+ "loss": 0.3193,
+ "step": 8283
+ },
+ {
+ "epoch": 3.916784869976359,
+ "grad_norm": 2.6996054649353027,
+ "learning_rate": 1.3647458425325966e-06,
+ "loss": 0.3378,
+ "step": 8284
+ },
+ {
+ "epoch": 3.91725768321513,
+ "grad_norm": 2.7950546741485596,
+ "learning_rate": 1.3641900793153223e-06,
+ "loss": 0.3864,
+ "step": 8285
+ },
+ {
+ "epoch": 3.917730496453901,
+ "grad_norm": 2.9658634662628174,
+ "learning_rate": 1.363634386819951e-06,
+ "loss": 0.3452,
+ "step": 8286
+ },
+ {
+ "epoch": 3.9182033096926716,
+ "grad_norm": 3.0684404373168945,
+ "learning_rate": 1.363078765081084e-06,
+ "loss": 0.3278,
+ "step": 8287
+ },
+ {
+ "epoch": 3.918676122931442,
+ "grad_norm": 3.0293614864349365,
+ "learning_rate": 1.3625232141333164e-06,
+ "loss": 0.3827,
+ "step": 8288
+ },
+ {
+ "epoch": 3.9191489361702128,
+ "grad_norm": 2.9969890117645264,
+ "learning_rate": 1.3619677340112413e-06,
+ "loss": 0.3412,
+ "step": 8289
+ },
+ {
+ "epoch": 3.9196217494089836,
+ "grad_norm": 2.991654396057129,
+ "learning_rate": 1.3614123247494457e-06,
+ "loss": 0.3683,
+ "step": 8290
+ },
+ {
+ "epoch": 3.920094562647754,
+ "grad_norm": 3.032158374786377,
+ "learning_rate": 1.360856986382511e-06,
+ "loss": 0.421,
+ "step": 8291
+ },
+ {
+ "epoch": 3.9205673758865247,
+ "grad_norm": 3.1413731575012207,
+ "learning_rate": 1.3603017189450173e-06,
+ "loss": 0.3818,
+ "step": 8292
+ },
+ {
+ "epoch": 3.9210401891252955,
+ "grad_norm": 3.295527219772339,
+ "learning_rate": 1.3597465224715387e-06,
+ "loss": 0.4828,
+ "step": 8293
+ },
+ {
+ "epoch": 3.9215130023640663,
+ "grad_norm": 3.116053581237793,
+ "learning_rate": 1.359191396996643e-06,
+ "loss": 0.4108,
+ "step": 8294
+ },
+ {
+ "epoch": 3.921985815602837,
+ "grad_norm": 2.957446336746216,
+ "learning_rate": 1.3586363425548975e-06,
+ "loss": 0.3482,
+ "step": 8295
+ },
+ {
+ "epoch": 3.9224586288416075,
+ "grad_norm": 2.745471715927124,
+ "learning_rate": 1.3580813591808627e-06,
+ "loss": 0.4184,
+ "step": 8296
+ },
+ {
+ "epoch": 3.9229314420803783,
+ "grad_norm": 3.0920722484588623,
+ "learning_rate": 1.3575264469090943e-06,
+ "loss": 0.3826,
+ "step": 8297
+ },
+ {
+ "epoch": 3.923404255319149,
+ "grad_norm": 2.8719749450683594,
+ "learning_rate": 1.3569716057741444e-06,
+ "loss": 0.3953,
+ "step": 8298
+ },
+ {
+ "epoch": 3.9238770685579194,
+ "grad_norm": 3.1278762817382812,
+ "learning_rate": 1.3564168358105597e-06,
+ "loss": 0.3658,
+ "step": 8299
+ },
+ {
+ "epoch": 3.92434988179669,
+ "grad_norm": 2.7752785682678223,
+ "learning_rate": 1.3558621370528851e-06,
+ "loss": 0.3447,
+ "step": 8300
+ },
+ {
+ "epoch": 3.924822695035461,
+ "grad_norm": 2.948575735092163,
+ "learning_rate": 1.3553075095356575e-06,
+ "loss": 0.3803,
+ "step": 8301
+ },
+ {
+ "epoch": 3.925295508274232,
+ "grad_norm": 2.8164193630218506,
+ "learning_rate": 1.354752953293413e-06,
+ "loss": 0.3724,
+ "step": 8302
+ },
+ {
+ "epoch": 3.9257683215130026,
+ "grad_norm": 3.2431271076202393,
+ "learning_rate": 1.3541984683606798e-06,
+ "loss": 0.382,
+ "step": 8303
+ },
+ {
+ "epoch": 3.926241134751773,
+ "grad_norm": 2.8485286235809326,
+ "learning_rate": 1.353644054771983e-06,
+ "loss": 0.3632,
+ "step": 8304
+ },
+ {
+ "epoch": 3.9267139479905437,
+ "grad_norm": 3.334914445877075,
+ "learning_rate": 1.3530897125618456e-06,
+ "loss": 0.5286,
+ "step": 8305
+ },
+ {
+ "epoch": 3.9271867612293145,
+ "grad_norm": 3.3895132541656494,
+ "learning_rate": 1.3525354417647815e-06,
+ "loss": 0.3838,
+ "step": 8306
+ },
+ {
+ "epoch": 3.927659574468085,
+ "grad_norm": 3.141935110092163,
+ "learning_rate": 1.351981242415305e-06,
+ "loss": 0.3928,
+ "step": 8307
+ },
+ {
+ "epoch": 3.9281323877068557,
+ "grad_norm": 3.3013596534729004,
+ "learning_rate": 1.3514271145479225e-06,
+ "loss": 0.4046,
+ "step": 8308
+ },
+ {
+ "epoch": 3.9286052009456265,
+ "grad_norm": 2.8704745769500732,
+ "learning_rate": 1.3508730581971363e-06,
+ "loss": 0.3542,
+ "step": 8309
+ },
+ {
+ "epoch": 3.9290780141843973,
+ "grad_norm": 3.179405689239502,
+ "learning_rate": 1.3503190733974472e-06,
+ "loss": 0.3911,
+ "step": 8310
+ },
+ {
+ "epoch": 3.929550827423168,
+ "grad_norm": 3.1091885566711426,
+ "learning_rate": 1.3497651601833481e-06,
+ "loss": 0.3552,
+ "step": 8311
+ },
+ {
+ "epoch": 3.9300236406619384,
+ "grad_norm": 2.687678813934326,
+ "learning_rate": 1.3492113185893288e-06,
+ "loss": 0.3462,
+ "step": 8312
+ },
+ {
+ "epoch": 3.9304964539007092,
+ "grad_norm": 3.4954965114593506,
+ "learning_rate": 1.3486575486498749e-06,
+ "loss": 0.4358,
+ "step": 8313
+ },
+ {
+ "epoch": 3.93096926713948,
+ "grad_norm": 2.8652899265289307,
+ "learning_rate": 1.3481038503994652e-06,
+ "loss": 0.3434,
+ "step": 8314
+ },
+ {
+ "epoch": 3.9314420803782504,
+ "grad_norm": 3.927623748779297,
+ "learning_rate": 1.3475502238725797e-06,
+ "loss": 0.4662,
+ "step": 8315
+ },
+ {
+ "epoch": 3.931914893617021,
+ "grad_norm": 3.1166276931762695,
+ "learning_rate": 1.346996669103687e-06,
+ "loss": 0.3953,
+ "step": 8316
+ },
+ {
+ "epoch": 3.932387706855792,
+ "grad_norm": 3.140003204345703,
+ "learning_rate": 1.346443186127257e-06,
+ "loss": 0.3616,
+ "step": 8317
+ },
+ {
+ "epoch": 3.9328605200945628,
+ "grad_norm": 3.335466146469116,
+ "learning_rate": 1.3458897749777516e-06,
+ "loss": 0.3854,
+ "step": 8318
+ },
+ {
+ "epoch": 3.9333333333333336,
+ "grad_norm": 2.8305466175079346,
+ "learning_rate": 1.3453364356896282e-06,
+ "loss": 0.374,
+ "step": 8319
+ },
+ {
+ "epoch": 3.933806146572104,
+ "grad_norm": 2.9511806964874268,
+ "learning_rate": 1.344783168297343e-06,
+ "loss": 0.4235,
+ "step": 8320
+ },
+ {
+ "epoch": 3.9342789598108747,
+ "grad_norm": 3.1868233680725098,
+ "learning_rate": 1.3442299728353448e-06,
+ "loss": 0.4384,
+ "step": 8321
+ },
+ {
+ "epoch": 3.9347517730496455,
+ "grad_norm": 3.1358237266540527,
+ "learning_rate": 1.3436768493380766e-06,
+ "loss": 0.4011,
+ "step": 8322
+ },
+ {
+ "epoch": 3.935224586288416,
+ "grad_norm": 3.126192808151245,
+ "learning_rate": 1.343123797839982e-06,
+ "loss": 0.4061,
+ "step": 8323
+ },
+ {
+ "epoch": 3.9356973995271867,
+ "grad_norm": 2.9724647998809814,
+ "learning_rate": 1.3425708183754949e-06,
+ "loss": 0.3859,
+ "step": 8324
+ },
+ {
+ "epoch": 3.9361702127659575,
+ "grad_norm": 3.1526355743408203,
+ "learning_rate": 1.3420179109790485e-06,
+ "loss": 0.3543,
+ "step": 8325
+ },
+ {
+ "epoch": 3.9366430260047283,
+ "grad_norm": 3.1289172172546387,
+ "learning_rate": 1.3414650756850695e-06,
+ "loss": 0.3836,
+ "step": 8326
+ },
+ {
+ "epoch": 3.937115839243499,
+ "grad_norm": 2.851264715194702,
+ "learning_rate": 1.34091231252798e-06,
+ "loss": 0.3294,
+ "step": 8327
+ },
+ {
+ "epoch": 3.9375886524822694,
+ "grad_norm": 2.921872138977051,
+ "learning_rate": 1.3403596215421981e-06,
+ "loss": 0.3698,
+ "step": 8328
+ },
+ {
+ "epoch": 3.93806146572104,
+ "grad_norm": 2.947258234024048,
+ "learning_rate": 1.339807002762137e-06,
+ "loss": 0.3616,
+ "step": 8329
+ },
+ {
+ "epoch": 3.938534278959811,
+ "grad_norm": 3.011021375656128,
+ "learning_rate": 1.3392544562222077e-06,
+ "loss": 0.3387,
+ "step": 8330
+ },
+ {
+ "epoch": 3.9390070921985814,
+ "grad_norm": 3.5230746269226074,
+ "learning_rate": 1.3387019819568134e-06,
+ "loss": 0.4054,
+ "step": 8331
+ },
+ {
+ "epoch": 3.939479905437352,
+ "grad_norm": 3.120321035385132,
+ "learning_rate": 1.3381495800003536e-06,
+ "loss": 0.4389,
+ "step": 8332
+ },
+ {
+ "epoch": 3.939952718676123,
+ "grad_norm": 3.0090999603271484,
+ "learning_rate": 1.3375972503872259e-06,
+ "loss": 0.4158,
+ "step": 8333
+ },
+ {
+ "epoch": 3.9404255319148938,
+ "grad_norm": 3.4807989597320557,
+ "learning_rate": 1.3370449931518198e-06,
+ "loss": 0.4144,
+ "step": 8334
+ },
+ {
+ "epoch": 3.9408983451536646,
+ "grad_norm": 2.8535733222961426,
+ "learning_rate": 1.336492808328523e-06,
+ "loss": 0.4281,
+ "step": 8335
+ },
+ {
+ "epoch": 3.941371158392435,
+ "grad_norm": 2.9032745361328125,
+ "learning_rate": 1.3359406959517174e-06,
+ "loss": 0.3389,
+ "step": 8336
+ },
+ {
+ "epoch": 3.9418439716312057,
+ "grad_norm": 2.725823163986206,
+ "learning_rate": 1.3353886560557793e-06,
+ "loss": 0.369,
+ "step": 8337
+ },
+ {
+ "epoch": 3.9423167848699765,
+ "grad_norm": 3.1965179443359375,
+ "learning_rate": 1.3348366886750844e-06,
+ "loss": 0.4031,
+ "step": 8338
+ },
+ {
+ "epoch": 3.942789598108747,
+ "grad_norm": 2.6991076469421387,
+ "learning_rate": 1.3342847938439985e-06,
+ "loss": 0.3434,
+ "step": 8339
+ },
+ {
+ "epoch": 3.9432624113475176,
+ "grad_norm": 4.491400718688965,
+ "learning_rate": 1.3337329715968877e-06,
+ "loss": 0.4175,
+ "step": 8340
+ },
+ {
+ "epoch": 3.9437352245862884,
+ "grad_norm": 4.005452632904053,
+ "learning_rate": 1.3331812219681112e-06,
+ "loss": 0.4191,
+ "step": 8341
+ },
+ {
+ "epoch": 3.9442080378250592,
+ "grad_norm": 3.1575794219970703,
+ "learning_rate": 1.3326295449920238e-06,
+ "loss": 0.4135,
+ "step": 8342
+ },
+ {
+ "epoch": 3.94468085106383,
+ "grad_norm": 3.2383973598480225,
+ "learning_rate": 1.3320779407029755e-06,
+ "loss": 0.38,
+ "step": 8343
+ },
+ {
+ "epoch": 3.9451536643026004,
+ "grad_norm": 2.873703718185425,
+ "learning_rate": 1.3315264091353119e-06,
+ "loss": 0.4128,
+ "step": 8344
+ },
+ {
+ "epoch": 3.945626477541371,
+ "grad_norm": 2.947274923324585,
+ "learning_rate": 1.330974950323376e-06,
+ "loss": 0.3342,
+ "step": 8345
+ },
+ {
+ "epoch": 3.946099290780142,
+ "grad_norm": 3.2874088287353516,
+ "learning_rate": 1.330423564301504e-06,
+ "loss": 0.3849,
+ "step": 8346
+ },
+ {
+ "epoch": 3.9465721040189123,
+ "grad_norm": 2.885772466659546,
+ "learning_rate": 1.3298722511040275e-06,
+ "loss": 0.3562,
+ "step": 8347
+ },
+ {
+ "epoch": 3.947044917257683,
+ "grad_norm": 3.0031309127807617,
+ "learning_rate": 1.3293210107652753e-06,
+ "loss": 0.3593,
+ "step": 8348
+ },
+ {
+ "epoch": 3.947517730496454,
+ "grad_norm": 2.815854549407959,
+ "learning_rate": 1.3287698433195712e-06,
+ "loss": 0.3633,
+ "step": 8349
+ },
+ {
+ "epoch": 3.9479905437352247,
+ "grad_norm": 3.228415012359619,
+ "learning_rate": 1.328218748801232e-06,
+ "loss": 0.3869,
+ "step": 8350
+ },
+ {
+ "epoch": 3.9484633569739955,
+ "grad_norm": 2.8497684001922607,
+ "learning_rate": 1.3276677272445743e-06,
+ "loss": 0.3833,
+ "step": 8351
+ },
+ {
+ "epoch": 3.948936170212766,
+ "grad_norm": 3.2330706119537354,
+ "learning_rate": 1.3271167786839057e-06,
+ "loss": 0.4414,
+ "step": 8352
+ },
+ {
+ "epoch": 3.9494089834515367,
+ "grad_norm": 3.2179152965545654,
+ "learning_rate": 1.3265659031535332e-06,
+ "loss": 0.3821,
+ "step": 8353
+ },
+ {
+ "epoch": 3.9498817966903075,
+ "grad_norm": 3.0861377716064453,
+ "learning_rate": 1.3260151006877567e-06,
+ "loss": 0.4014,
+ "step": 8354
+ },
+ {
+ "epoch": 3.950354609929078,
+ "grad_norm": 3.118872880935669,
+ "learning_rate": 1.325464371320871e-06,
+ "loss": 0.3836,
+ "step": 8355
+ },
+ {
+ "epoch": 3.9508274231678486,
+ "grad_norm": 2.9787702560424805,
+ "learning_rate": 1.32491371508717e-06,
+ "loss": 0.3794,
+ "step": 8356
+ },
+ {
+ "epoch": 3.9513002364066194,
+ "grad_norm": 3.132089853286743,
+ "learning_rate": 1.3243631320209387e-06,
+ "loss": 0.3698,
+ "step": 8357
+ },
+ {
+ "epoch": 3.9517730496453902,
+ "grad_norm": 2.5304882526397705,
+ "learning_rate": 1.32381262215646e-06,
+ "loss": 0.3687,
+ "step": 8358
+ },
+ {
+ "epoch": 3.952245862884161,
+ "grad_norm": 2.9121861457824707,
+ "learning_rate": 1.3232621855280126e-06,
+ "loss": 0.3704,
+ "step": 8359
+ },
+ {
+ "epoch": 3.9527186761229314,
+ "grad_norm": 3.0885608196258545,
+ "learning_rate": 1.3227118221698688e-06,
+ "loss": 0.4303,
+ "step": 8360
+ },
+ {
+ "epoch": 3.953191489361702,
+ "grad_norm": 2.7274837493896484,
+ "learning_rate": 1.3221615321162979e-06,
+ "loss": 0.3556,
+ "step": 8361
+ },
+ {
+ "epoch": 3.953664302600473,
+ "grad_norm": 3.1329922676086426,
+ "learning_rate": 1.3216113154015625e-06,
+ "loss": 0.4042,
+ "step": 8362
+ },
+ {
+ "epoch": 3.9541371158392433,
+ "grad_norm": 2.937380313873291,
+ "learning_rate": 1.3210611720599243e-06,
+ "loss": 0.3358,
+ "step": 8363
+ },
+ {
+ "epoch": 3.954609929078014,
+ "grad_norm": 2.939194440841675,
+ "learning_rate": 1.3205111021256378e-06,
+ "loss": 0.3885,
+ "step": 8364
+ },
+ {
+ "epoch": 3.955082742316785,
+ "grad_norm": 2.9151997566223145,
+ "learning_rate": 1.3199611056329516e-06,
+ "loss": 0.4094,
+ "step": 8365
+ },
+ {
+ "epoch": 3.9555555555555557,
+ "grad_norm": 3.029733419418335,
+ "learning_rate": 1.3194111826161143e-06,
+ "loss": 0.3999,
+ "step": 8366
+ },
+ {
+ "epoch": 3.9560283687943265,
+ "grad_norm": 2.7899951934814453,
+ "learning_rate": 1.3188613331093653e-06,
+ "loss": 0.321,
+ "step": 8367
+ },
+ {
+ "epoch": 3.956501182033097,
+ "grad_norm": 3.1109507083892822,
+ "learning_rate": 1.3183115571469425e-06,
+ "loss": 0.4266,
+ "step": 8368
+ },
+ {
+ "epoch": 3.9569739952718677,
+ "grad_norm": 3.085594415664673,
+ "learning_rate": 1.3177618547630774e-06,
+ "loss": 0.4412,
+ "step": 8369
+ },
+ {
+ "epoch": 3.9574468085106385,
+ "grad_norm": 3.0980300903320312,
+ "learning_rate": 1.3172122259919968e-06,
+ "loss": 0.3385,
+ "step": 8370
+ },
+ {
+ "epoch": 3.957919621749409,
+ "grad_norm": 3.103438138961792,
+ "learning_rate": 1.3166626708679256e-06,
+ "loss": 0.3887,
+ "step": 8371
+ },
+ {
+ "epoch": 3.9583924349881796,
+ "grad_norm": 2.8235526084899902,
+ "learning_rate": 1.3161131894250812e-06,
+ "loss": 0.3759,
+ "step": 8372
+ },
+ {
+ "epoch": 3.9588652482269504,
+ "grad_norm": 2.8316404819488525,
+ "learning_rate": 1.3155637816976762e-06,
+ "loss": 0.3666,
+ "step": 8373
+ },
+ {
+ "epoch": 3.959338061465721,
+ "grad_norm": 2.7873756885528564,
+ "learning_rate": 1.3150144477199218e-06,
+ "loss": 0.3284,
+ "step": 8374
+ },
+ {
+ "epoch": 3.959810874704492,
+ "grad_norm": 3.355039119720459,
+ "learning_rate": 1.3144651875260218e-06,
+ "loss": 0.4197,
+ "step": 8375
+ },
+ {
+ "epoch": 3.9602836879432624,
+ "grad_norm": 3.477721929550171,
+ "learning_rate": 1.3139160011501761e-06,
+ "loss": 0.3298,
+ "step": 8376
+ },
+ {
+ "epoch": 3.960756501182033,
+ "grad_norm": 3.557152032852173,
+ "learning_rate": 1.3133668886265805e-06,
+ "loss": 0.3788,
+ "step": 8377
+ },
+ {
+ "epoch": 3.961229314420804,
+ "grad_norm": 3.06707763671875,
+ "learning_rate": 1.312817849989424e-06,
+ "loss": 0.3613,
+ "step": 8378
+ },
+ {
+ "epoch": 3.9617021276595743,
+ "grad_norm": 2.7702202796936035,
+ "learning_rate": 1.3122688852728956e-06,
+ "loss": 0.402,
+ "step": 8379
+ },
+ {
+ "epoch": 3.962174940898345,
+ "grad_norm": 2.8121016025543213,
+ "learning_rate": 1.3117199945111746e-06,
+ "loss": 0.3576,
+ "step": 8380
+ },
+ {
+ "epoch": 3.962647754137116,
+ "grad_norm": 2.809282064437866,
+ "learning_rate": 1.3111711777384403e-06,
+ "loss": 0.3741,
+ "step": 8381
+ },
+ {
+ "epoch": 3.9631205673758867,
+ "grad_norm": 3.1175687313079834,
+ "learning_rate": 1.3106224349888638e-06,
+ "loss": 0.3388,
+ "step": 8382
+ },
+ {
+ "epoch": 3.963593380614657,
+ "grad_norm": 2.930525064468384,
+ "learning_rate": 1.310073766296612e-06,
+ "loss": 0.3593,
+ "step": 8383
+ },
+ {
+ "epoch": 3.964066193853428,
+ "grad_norm": 3.0673177242279053,
+ "learning_rate": 1.3095251716958501e-06,
+ "loss": 0.402,
+ "step": 8384
+ },
+ {
+ "epoch": 3.9645390070921986,
+ "grad_norm": 2.9725706577301025,
+ "learning_rate": 1.3089766512207347e-06,
+ "loss": 0.3707,
+ "step": 8385
+ },
+ {
+ "epoch": 3.965011820330969,
+ "grad_norm": 2.9790916442871094,
+ "learning_rate": 1.3084282049054218e-06,
+ "loss": 0.3292,
+ "step": 8386
+ },
+ {
+ "epoch": 3.96548463356974,
+ "grad_norm": 3.257035493850708,
+ "learning_rate": 1.3078798327840598e-06,
+ "loss": 0.3753,
+ "step": 8387
+ },
+ {
+ "epoch": 3.9659574468085106,
+ "grad_norm": 3.0534379482269287,
+ "learning_rate": 1.307331534890792e-06,
+ "loss": 0.4134,
+ "step": 8388
+ },
+ {
+ "epoch": 3.9664302600472814,
+ "grad_norm": 2.919243812561035,
+ "learning_rate": 1.306783311259761e-06,
+ "loss": 0.4283,
+ "step": 8389
+ },
+ {
+ "epoch": 3.966903073286052,
+ "grad_norm": 2.7643322944641113,
+ "learning_rate": 1.306235161925101e-06,
+ "loss": 0.3454,
+ "step": 8390
+ },
+ {
+ "epoch": 3.9673758865248225,
+ "grad_norm": 3.0208916664123535,
+ "learning_rate": 1.3056870869209431e-06,
+ "loss": 0.385,
+ "step": 8391
+ },
+ {
+ "epoch": 3.9678486997635933,
+ "grad_norm": 2.8657243251800537,
+ "learning_rate": 1.3051390862814135e-06,
+ "loss": 0.3614,
+ "step": 8392
+ },
+ {
+ "epoch": 3.968321513002364,
+ "grad_norm": 3.2093591690063477,
+ "learning_rate": 1.3045911600406325e-06,
+ "loss": 0.3774,
+ "step": 8393
+ },
+ {
+ "epoch": 3.9687943262411345,
+ "grad_norm": 3.091618537902832,
+ "learning_rate": 1.3040433082327192e-06,
+ "loss": 0.4157,
+ "step": 8394
+ },
+ {
+ "epoch": 3.9692671394799053,
+ "grad_norm": 2.99763560295105,
+ "learning_rate": 1.3034955308917849e-06,
+ "loss": 0.4017,
+ "step": 8395
+ },
+ {
+ "epoch": 3.969739952718676,
+ "grad_norm": 3.063109874725342,
+ "learning_rate": 1.3029478280519364e-06,
+ "loss": 0.4568,
+ "step": 8396
+ },
+ {
+ "epoch": 3.970212765957447,
+ "grad_norm": 3.2660679817199707,
+ "learning_rate": 1.3024001997472791e-06,
+ "loss": 0.3999,
+ "step": 8397
+ },
+ {
+ "epoch": 3.9706855791962177,
+ "grad_norm": 2.860121250152588,
+ "learning_rate": 1.3018526460119088e-06,
+ "loss": 0.433,
+ "step": 8398
+ },
+ {
+ "epoch": 3.971158392434988,
+ "grad_norm": 3.1037673950195312,
+ "learning_rate": 1.3013051668799216e-06,
+ "loss": 0.4526,
+ "step": 8399
+ },
+ {
+ "epoch": 3.971631205673759,
+ "grad_norm": 2.9408578872680664,
+ "learning_rate": 1.3007577623854053e-06,
+ "loss": 0.3722,
+ "step": 8400
+ },
+ {
+ "epoch": 3.9721040189125296,
+ "grad_norm": 3.0684635639190674,
+ "learning_rate": 1.3002104325624436e-06,
+ "loss": 0.3789,
+ "step": 8401
+ },
+ {
+ "epoch": 3.9725768321513,
+ "grad_norm": 2.6469366550445557,
+ "learning_rate": 1.2996631774451187e-06,
+ "loss": 0.3409,
+ "step": 8402
+ },
+ {
+ "epoch": 3.9730496453900708,
+ "grad_norm": 3.3741610050201416,
+ "learning_rate": 1.2991159970675033e-06,
+ "loss": 0.3544,
+ "step": 8403
+ },
+ {
+ "epoch": 3.9735224586288416,
+ "grad_norm": 3.3716588020324707,
+ "learning_rate": 1.2985688914636701e-06,
+ "loss": 0.3747,
+ "step": 8404
+ },
+ {
+ "epoch": 3.9739952718676124,
+ "grad_norm": 3.000469923019409,
+ "learning_rate": 1.2980218606676837e-06,
+ "loss": 0.4506,
+ "step": 8405
+ },
+ {
+ "epoch": 3.974468085106383,
+ "grad_norm": 3.0139408111572266,
+ "learning_rate": 1.2974749047136057e-06,
+ "loss": 0.4156,
+ "step": 8406
+ },
+ {
+ "epoch": 3.9749408983451535,
+ "grad_norm": 2.9494218826293945,
+ "learning_rate": 1.2969280236354925e-06,
+ "loss": 0.3378,
+ "step": 8407
+ },
+ {
+ "epoch": 3.9754137115839243,
+ "grad_norm": 2.6061158180236816,
+ "learning_rate": 1.2963812174673948e-06,
+ "loss": 0.3887,
+ "step": 8408
+ },
+ {
+ "epoch": 3.975886524822695,
+ "grad_norm": 2.873987913131714,
+ "learning_rate": 1.295834486243362e-06,
+ "loss": 0.3202,
+ "step": 8409
+ },
+ {
+ "epoch": 3.9763593380614655,
+ "grad_norm": 3.0106539726257324,
+ "learning_rate": 1.2952878299974358e-06,
+ "loss": 0.4142,
+ "step": 8410
+ },
+ {
+ "epoch": 3.9768321513002363,
+ "grad_norm": 3.0011982917785645,
+ "learning_rate": 1.2947412487636527e-06,
+ "loss": 0.4121,
+ "step": 8411
+ },
+ {
+ "epoch": 3.977304964539007,
+ "grad_norm": 3.1321003437042236,
+ "learning_rate": 1.294194742576048e-06,
+ "loss": 0.4033,
+ "step": 8412
+ },
+ {
+ "epoch": 3.977777777777778,
+ "grad_norm": 2.812255382537842,
+ "learning_rate": 1.2936483114686487e-06,
+ "loss": 0.3414,
+ "step": 8413
+ },
+ {
+ "epoch": 3.9782505910165487,
+ "grad_norm": 2.9594221115112305,
+ "learning_rate": 1.2931019554754804e-06,
+ "loss": 0.3666,
+ "step": 8414
+ },
+ {
+ "epoch": 3.978723404255319,
+ "grad_norm": 3.119440793991089,
+ "learning_rate": 1.2925556746305612e-06,
+ "loss": 0.3902,
+ "step": 8415
+ },
+ {
+ "epoch": 3.97919621749409,
+ "grad_norm": 3.042102098464966,
+ "learning_rate": 1.2920094689679047e-06,
+ "loss": 0.344,
+ "step": 8416
+ },
+ {
+ "epoch": 3.9796690307328606,
+ "grad_norm": 2.8443872928619385,
+ "learning_rate": 1.2914633385215225e-06,
+ "loss": 0.372,
+ "step": 8417
+ },
+ {
+ "epoch": 3.980141843971631,
+ "grad_norm": 3.483201265335083,
+ "learning_rate": 1.2909172833254187e-06,
+ "loss": 0.4028,
+ "step": 8418
+ },
+ {
+ "epoch": 3.9806146572104018,
+ "grad_norm": 2.966996431350708,
+ "learning_rate": 1.2903713034135934e-06,
+ "loss": 0.3527,
+ "step": 8419
+ },
+ {
+ "epoch": 3.9810874704491725,
+ "grad_norm": 2.7813172340393066,
+ "learning_rate": 1.2898253988200437e-06,
+ "loss": 0.3873,
+ "step": 8420
+ },
+ {
+ "epoch": 3.9815602836879433,
+ "grad_norm": 3.24611234664917,
+ "learning_rate": 1.2892795695787602e-06,
+ "loss": 0.4783,
+ "step": 8421
+ },
+ {
+ "epoch": 3.982033096926714,
+ "grad_norm": 3.345573663711548,
+ "learning_rate": 1.2887338157237289e-06,
+ "loss": 0.4179,
+ "step": 8422
+ },
+ {
+ "epoch": 3.9825059101654845,
+ "grad_norm": 3.1726880073547363,
+ "learning_rate": 1.288188137288931e-06,
+ "loss": 0.3725,
+ "step": 8423
+ },
+ {
+ "epoch": 3.9829787234042553,
+ "grad_norm": 3.398966073989868,
+ "learning_rate": 1.2876425343083449e-06,
+ "loss": 0.4117,
+ "step": 8424
+ },
+ {
+ "epoch": 3.983451536643026,
+ "grad_norm": 2.615680456161499,
+ "learning_rate": 1.2870970068159423e-06,
+ "loss": 0.324,
+ "step": 8425
+ },
+ {
+ "epoch": 3.9839243498817964,
+ "grad_norm": 3.0505547523498535,
+ "learning_rate": 1.2865515548456893e-06,
+ "loss": 0.3698,
+ "step": 8426
+ },
+ {
+ "epoch": 3.9843971631205672,
+ "grad_norm": 3.077404260635376,
+ "learning_rate": 1.2860061784315514e-06,
+ "loss": 0.3592,
+ "step": 8427
+ },
+ {
+ "epoch": 3.984869976359338,
+ "grad_norm": 2.654080390930176,
+ "learning_rate": 1.2854608776074855e-06,
+ "loss": 0.3451,
+ "step": 8428
+ },
+ {
+ "epoch": 3.985342789598109,
+ "grad_norm": 3.1023523807525635,
+ "learning_rate": 1.284915652407444e-06,
+ "loss": 0.3809,
+ "step": 8429
+ },
+ {
+ "epoch": 3.9858156028368796,
+ "grad_norm": 3.0526652336120605,
+ "learning_rate": 1.2843705028653783e-06,
+ "loss": 0.3633,
+ "step": 8430
+ },
+ {
+ "epoch": 3.98628841607565,
+ "grad_norm": 2.7829604148864746,
+ "learning_rate": 1.2838254290152296e-06,
+ "loss": 0.3213,
+ "step": 8431
+ },
+ {
+ "epoch": 3.986761229314421,
+ "grad_norm": 3.2218687534332275,
+ "learning_rate": 1.28328043089094e-06,
+ "loss": 0.465,
+ "step": 8432
+ },
+ {
+ "epoch": 3.9872340425531916,
+ "grad_norm": 2.952998161315918,
+ "learning_rate": 1.2827355085264425e-06,
+ "loss": 0.4405,
+ "step": 8433
+ },
+ {
+ "epoch": 3.987706855791962,
+ "grad_norm": 2.81211519241333,
+ "learning_rate": 1.2821906619556667e-06,
+ "loss": 0.3444,
+ "step": 8434
+ },
+ {
+ "epoch": 3.9881796690307327,
+ "grad_norm": 3.1707375049591064,
+ "learning_rate": 1.281645891212539e-06,
+ "loss": 0.4019,
+ "step": 8435
+ },
+ {
+ "epoch": 3.9886524822695035,
+ "grad_norm": 2.791504383087158,
+ "learning_rate": 1.2811011963309788e-06,
+ "loss": 0.3606,
+ "step": 8436
+ },
+ {
+ "epoch": 3.9891252955082743,
+ "grad_norm": 2.954782247543335,
+ "learning_rate": 1.280556577344903e-06,
+ "loss": 0.3141,
+ "step": 8437
+ },
+ {
+ "epoch": 3.989598108747045,
+ "grad_norm": 2.718273878097534,
+ "learning_rate": 1.2800120342882223e-06,
+ "loss": 0.3715,
+ "step": 8438
+ },
+ {
+ "epoch": 3.9900709219858155,
+ "grad_norm": 3.2916250228881836,
+ "learning_rate": 1.2794675671948425e-06,
+ "loss": 0.4048,
+ "step": 8439
+ },
+ {
+ "epoch": 3.9905437352245863,
+ "grad_norm": 3.060060977935791,
+ "learning_rate": 1.2789231760986655e-06,
+ "loss": 0.4032,
+ "step": 8440
+ },
+ {
+ "epoch": 3.991016548463357,
+ "grad_norm": 2.8467273712158203,
+ "learning_rate": 1.2783788610335882e-06,
+ "loss": 0.4041,
+ "step": 8441
+ },
+ {
+ "epoch": 3.9914893617021274,
+ "grad_norm": 3.161790132522583,
+ "learning_rate": 1.2778346220335013e-06,
+ "loss": 0.4049,
+ "step": 8442
+ },
+ {
+ "epoch": 3.9919621749408982,
+ "grad_norm": 2.6512296199798584,
+ "learning_rate": 1.277290459132295e-06,
+ "loss": 0.3598,
+ "step": 8443
+ },
+ {
+ "epoch": 3.992434988179669,
+ "grad_norm": 2.792736291885376,
+ "learning_rate": 1.276746372363849e-06,
+ "loss": 0.3874,
+ "step": 8444
+ },
+ {
+ "epoch": 3.99290780141844,
+ "grad_norm": 2.887047052383423,
+ "learning_rate": 1.2762023617620433e-06,
+ "loss": 0.4255,
+ "step": 8445
+ },
+ {
+ "epoch": 3.9933806146572106,
+ "grad_norm": 3.0420780181884766,
+ "learning_rate": 1.275658427360751e-06,
+ "loss": 0.4489,
+ "step": 8446
+ },
+ {
+ "epoch": 3.993853427895981,
+ "grad_norm": 3.107618570327759,
+ "learning_rate": 1.2751145691938383e-06,
+ "loss": 0.4354,
+ "step": 8447
+ },
+ {
+ "epoch": 3.9943262411347518,
+ "grad_norm": 2.656224250793457,
+ "learning_rate": 1.2745707872951718e-06,
+ "loss": 0.4188,
+ "step": 8448
+ },
+ {
+ "epoch": 3.9947990543735226,
+ "grad_norm": 2.9895219802856445,
+ "learning_rate": 1.2740270816986079e-06,
+ "loss": 0.391,
+ "step": 8449
+ },
+ {
+ "epoch": 3.995271867612293,
+ "grad_norm": 2.919255018234253,
+ "learning_rate": 1.2734834524380025e-06,
+ "loss": 0.4058,
+ "step": 8450
+ },
+ {
+ "epoch": 3.9957446808510637,
+ "grad_norm": 3.4418535232543945,
+ "learning_rate": 1.2729398995472048e-06,
+ "loss": 0.3977,
+ "step": 8451
+ },
+ {
+ "epoch": 3.9962174940898345,
+ "grad_norm": 2.980224132537842,
+ "learning_rate": 1.272396423060058e-06,
+ "loss": 0.4417,
+ "step": 8452
+ },
+ {
+ "epoch": 3.9966903073286053,
+ "grad_norm": 3.6488101482391357,
+ "learning_rate": 1.2718530230104043e-06,
+ "loss": 0.4472,
+ "step": 8453
+ },
+ {
+ "epoch": 3.997163120567376,
+ "grad_norm": 2.725437641143799,
+ "learning_rate": 1.2713096994320774e-06,
+ "loss": 0.3125,
+ "step": 8454
+ },
+ {
+ "epoch": 3.9976359338061465,
+ "grad_norm": 3.453794002532959,
+ "learning_rate": 1.2707664523589076e-06,
+ "loss": 0.3792,
+ "step": 8455
+ },
+ {
+ "epoch": 3.9981087470449173,
+ "grad_norm": 2.8443076610565186,
+ "learning_rate": 1.270223281824721e-06,
+ "loss": 0.3627,
+ "step": 8456
+ },
+ {
+ "epoch": 3.998581560283688,
+ "grad_norm": 3.1851959228515625,
+ "learning_rate": 1.2696801878633372e-06,
+ "loss": 0.3745,
+ "step": 8457
+ },
+ {
+ "epoch": 3.9990543735224584,
+ "grad_norm": 2.897239923477173,
+ "learning_rate": 1.2691371705085743e-06,
+ "loss": 0.3817,
+ "step": 8458
+ },
+ {
+ "epoch": 3.999527186761229,
+ "grad_norm": 2.92111873626709,
+ "learning_rate": 1.2685942297942416e-06,
+ "loss": 0.3824,
+ "step": 8459
+ },
+ {
+ "epoch": 4.0,
+ "grad_norm": 2.9540340900421143,
+ "learning_rate": 1.268051365754148e-06,
+ "loss": 0.3656,
+ "step": 8460
+ },
+ {
+ "epoch": 4.000472813238771,
+ "grad_norm": 2.6815075874328613,
+ "learning_rate": 1.2675085784220936e-06,
+ "loss": 0.3296,
+ "step": 8461
+ },
+ {
+ "epoch": 4.000945626477542,
+ "grad_norm": 3.0823302268981934,
+ "learning_rate": 1.2669658678318747e-06,
+ "loss": 0.3918,
+ "step": 8462
+ },
+ {
+ "epoch": 4.001418439716312,
+ "grad_norm": 2.5451176166534424,
+ "learning_rate": 1.2664232340172855e-06,
+ "loss": 0.3528,
+ "step": 8463
+ },
+ {
+ "epoch": 4.001891252955082,
+ "grad_norm": 2.539541721343994,
+ "learning_rate": 1.2658806770121119e-06,
+ "loss": 0.3034,
+ "step": 8464
+ },
+ {
+ "epoch": 4.002364066193853,
+ "grad_norm": 2.5537798404693604,
+ "learning_rate": 1.2653381968501374e-06,
+ "loss": 0.2981,
+ "step": 8465
+ },
+ {
+ "epoch": 4.002836879432624,
+ "grad_norm": 2.6316089630126953,
+ "learning_rate": 1.26479579356514e-06,
+ "loss": 0.3328,
+ "step": 8466
+ },
+ {
+ "epoch": 4.003309692671395,
+ "grad_norm": 3.080700635910034,
+ "learning_rate": 1.2642534671908914e-06,
+ "loss": 0.3471,
+ "step": 8467
+ },
+ {
+ "epoch": 4.0037825059101655,
+ "grad_norm": 3.0111753940582275,
+ "learning_rate": 1.2637112177611614e-06,
+ "loss": 0.3147,
+ "step": 8468
+ },
+ {
+ "epoch": 4.004255319148936,
+ "grad_norm": 2.759606122970581,
+ "learning_rate": 1.2631690453097128e-06,
+ "loss": 0.2634,
+ "step": 8469
+ },
+ {
+ "epoch": 4.004728132387707,
+ "grad_norm": 2.862098217010498,
+ "learning_rate": 1.2626269498703048e-06,
+ "loss": 0.3333,
+ "step": 8470
+ },
+ {
+ "epoch": 4.005200945626478,
+ "grad_norm": 3.122239589691162,
+ "learning_rate": 1.262084931476691e-06,
+ "loss": 0.3311,
+ "step": 8471
+ },
+ {
+ "epoch": 4.005673758865248,
+ "grad_norm": 2.6428070068359375,
+ "learning_rate": 1.261542990162619e-06,
+ "loss": 0.3534,
+ "step": 8472
+ },
+ {
+ "epoch": 4.006146572104019,
+ "grad_norm": 3.2870724201202393,
+ "learning_rate": 1.261001125961836e-06,
+ "loss": 0.3373,
+ "step": 8473
+ },
+ {
+ "epoch": 4.006619385342789,
+ "grad_norm": 2.7675375938415527,
+ "learning_rate": 1.26045933890808e-06,
+ "loss": 0.3117,
+ "step": 8474
+ },
+ {
+ "epoch": 4.00709219858156,
+ "grad_norm": 2.811736583709717,
+ "learning_rate": 1.2599176290350844e-06,
+ "loss": 0.3087,
+ "step": 8475
+ },
+ {
+ "epoch": 4.007565011820331,
+ "grad_norm": 2.9146902561187744,
+ "learning_rate": 1.2593759963765817e-06,
+ "loss": 0.336,
+ "step": 8476
+ },
+ {
+ "epoch": 4.008037825059102,
+ "grad_norm": 3.074338674545288,
+ "learning_rate": 1.2588344409662945e-06,
+ "loss": 0.384,
+ "step": 8477
+ },
+ {
+ "epoch": 4.008510638297873,
+ "grad_norm": 3.5597734451293945,
+ "learning_rate": 1.2582929628379455e-06,
+ "loss": 0.4061,
+ "step": 8478
+ },
+ {
+ "epoch": 4.008983451536643,
+ "grad_norm": 3.0091497898101807,
+ "learning_rate": 1.2577515620252489e-06,
+ "loss": 0.3783,
+ "step": 8479
+ },
+ {
+ "epoch": 4.009456264775413,
+ "grad_norm": 2.9654228687286377,
+ "learning_rate": 1.2572102385619145e-06,
+ "loss": 0.3541,
+ "step": 8480
+ },
+ {
+ "epoch": 4.009929078014184,
+ "grad_norm": 3.140733242034912,
+ "learning_rate": 1.2566689924816502e-06,
+ "loss": 0.3706,
+ "step": 8481
+ },
+ {
+ "epoch": 4.010401891252955,
+ "grad_norm": 3.2180161476135254,
+ "learning_rate": 1.2561278238181548e-06,
+ "loss": 0.3573,
+ "step": 8482
+ },
+ {
+ "epoch": 4.010874704491726,
+ "grad_norm": 2.91209077835083,
+ "learning_rate": 1.2555867326051265e-06,
+ "loss": 0.3619,
+ "step": 8483
+ },
+ {
+ "epoch": 4.0113475177304965,
+ "grad_norm": 3.016916036605835,
+ "learning_rate": 1.255045718876256e-06,
+ "loss": 0.3866,
+ "step": 8484
+ },
+ {
+ "epoch": 4.011820330969267,
+ "grad_norm": 3.1008472442626953,
+ "learning_rate": 1.2545047826652294e-06,
+ "loss": 0.3352,
+ "step": 8485
+ },
+ {
+ "epoch": 4.012293144208038,
+ "grad_norm": 3.136124610900879,
+ "learning_rate": 1.2539639240057287e-06,
+ "loss": 0.3478,
+ "step": 8486
+ },
+ {
+ "epoch": 4.012765957446809,
+ "grad_norm": 3.0021555423736572,
+ "learning_rate": 1.2534231429314299e-06,
+ "loss": 0.3522,
+ "step": 8487
+ },
+ {
+ "epoch": 4.013238770685579,
+ "grad_norm": 3.2261948585510254,
+ "learning_rate": 1.2528824394760065e-06,
+ "loss": 0.3632,
+ "step": 8488
+ },
+ {
+ "epoch": 4.01371158392435,
+ "grad_norm": 3.0598134994506836,
+ "learning_rate": 1.2523418136731252e-06,
+ "loss": 0.3422,
+ "step": 8489
+ },
+ {
+ "epoch": 4.01418439716312,
+ "grad_norm": 2.9821391105651855,
+ "learning_rate": 1.2518012655564476e-06,
+ "loss": 0.3324,
+ "step": 8490
+ },
+ {
+ "epoch": 4.014657210401891,
+ "grad_norm": 2.583130359649658,
+ "learning_rate": 1.251260795159633e-06,
+ "loss": 0.3509,
+ "step": 8491
+ },
+ {
+ "epoch": 4.015130023640662,
+ "grad_norm": 3.3090853691101074,
+ "learning_rate": 1.2507204025163333e-06,
+ "loss": 0.3494,
+ "step": 8492
+ },
+ {
+ "epoch": 4.015602836879433,
+ "grad_norm": 2.6412856578826904,
+ "learning_rate": 1.250180087660195e-06,
+ "loss": 0.3419,
+ "step": 8493
+ },
+ {
+ "epoch": 4.0160756501182036,
+ "grad_norm": 2.729210615158081,
+ "learning_rate": 1.2496398506248634e-06,
+ "loss": 0.3591,
+ "step": 8494
+ },
+ {
+ "epoch": 4.016548463356974,
+ "grad_norm": 2.892150402069092,
+ "learning_rate": 1.2490996914439745e-06,
+ "loss": 0.3866,
+ "step": 8495
+ },
+ {
+ "epoch": 4.017021276595744,
+ "grad_norm": 3.1967804431915283,
+ "learning_rate": 1.2485596101511638e-06,
+ "loss": 0.358,
+ "step": 8496
+ },
+ {
+ "epoch": 4.017494089834515,
+ "grad_norm": 3.0190439224243164,
+ "learning_rate": 1.2480196067800588e-06,
+ "loss": 0.3723,
+ "step": 8497
+ },
+ {
+ "epoch": 4.017966903073286,
+ "grad_norm": 2.856370210647583,
+ "learning_rate": 1.2474796813642822e-06,
+ "loss": 0.3519,
+ "step": 8498
+ },
+ {
+ "epoch": 4.018439716312057,
+ "grad_norm": 2.979842185974121,
+ "learning_rate": 1.2469398339374546e-06,
+ "loss": 0.3483,
+ "step": 8499
+ },
+ {
+ "epoch": 4.0189125295508275,
+ "grad_norm": 3.0953211784362793,
+ "learning_rate": 1.246400064533189e-06,
+ "loss": 0.355,
+ "step": 8500
+ },
+ {
+ "epoch": 4.019385342789598,
+ "grad_norm": 3.342609167098999,
+ "learning_rate": 1.2458603731850938e-06,
+ "loss": 0.4258,
+ "step": 8501
+ },
+ {
+ "epoch": 4.019858156028369,
+ "grad_norm": 3.2789435386657715,
+ "learning_rate": 1.2453207599267747e-06,
+ "loss": 0.3653,
+ "step": 8502
+ },
+ {
+ "epoch": 4.02033096926714,
+ "grad_norm": 2.8867030143737793,
+ "learning_rate": 1.2447812247918303e-06,
+ "loss": 0.3128,
+ "step": 8503
+ },
+ {
+ "epoch": 4.02080378250591,
+ "grad_norm": 2.9467437267303467,
+ "learning_rate": 1.2442417678138552e-06,
+ "loss": 0.3149,
+ "step": 8504
+ },
+ {
+ "epoch": 4.0212765957446805,
+ "grad_norm": 2.6293485164642334,
+ "learning_rate": 1.2437023890264377e-06,
+ "loss": 0.2751,
+ "step": 8505
+ },
+ {
+ "epoch": 4.021749408983451,
+ "grad_norm": 2.9672160148620605,
+ "learning_rate": 1.2431630884631648e-06,
+ "loss": 0.3858,
+ "step": 8506
+ },
+ {
+ "epoch": 4.022222222222222,
+ "grad_norm": 3.0518734455108643,
+ "learning_rate": 1.2426238661576154e-06,
+ "loss": 0.3404,
+ "step": 8507
+ },
+ {
+ "epoch": 4.022695035460993,
+ "grad_norm": 2.829012632369995,
+ "learning_rate": 1.2420847221433633e-06,
+ "loss": 0.3211,
+ "step": 8508
+ },
+ {
+ "epoch": 4.023167848699764,
+ "grad_norm": 2.855806589126587,
+ "learning_rate": 1.2415456564539808e-06,
+ "loss": 0.3462,
+ "step": 8509
+ },
+ {
+ "epoch": 4.0236406619385345,
+ "grad_norm": 3.491786003112793,
+ "learning_rate": 1.2410066691230311e-06,
+ "loss": 0.3793,
+ "step": 8510
+ },
+ {
+ "epoch": 4.024113475177305,
+ "grad_norm": 2.9612972736358643,
+ "learning_rate": 1.2404677601840765e-06,
+ "loss": 0.3899,
+ "step": 8511
+ },
+ {
+ "epoch": 4.024586288416075,
+ "grad_norm": 2.949498176574707,
+ "learning_rate": 1.2399289296706718e-06,
+ "loss": 0.3655,
+ "step": 8512
+ },
+ {
+ "epoch": 4.025059101654846,
+ "grad_norm": 2.736524820327759,
+ "learning_rate": 1.2393901776163664e-06,
+ "loss": 0.318,
+ "step": 8513
+ },
+ {
+ "epoch": 4.025531914893617,
+ "grad_norm": 3.005297899246216,
+ "learning_rate": 1.2388515040547077e-06,
+ "loss": 0.3484,
+ "step": 8514
+ },
+ {
+ "epoch": 4.026004728132388,
+ "grad_norm": 2.9835290908813477,
+ "learning_rate": 1.2383129090192361e-06,
+ "loss": 0.3205,
+ "step": 8515
+ },
+ {
+ "epoch": 4.026477541371158,
+ "grad_norm": 3.1437056064605713,
+ "learning_rate": 1.2377743925434865e-06,
+ "loss": 0.3524,
+ "step": 8516
+ },
+ {
+ "epoch": 4.026950354609929,
+ "grad_norm": 3.0250096321105957,
+ "learning_rate": 1.2372359546609917e-06,
+ "loss": 0.3398,
+ "step": 8517
+ },
+ {
+ "epoch": 4.0274231678487,
+ "grad_norm": 3.109083890914917,
+ "learning_rate": 1.2366975954052767e-06,
+ "loss": 0.3317,
+ "step": 8518
+ },
+ {
+ "epoch": 4.027895981087471,
+ "grad_norm": 2.7713027000427246,
+ "learning_rate": 1.2361593148098634e-06,
+ "loss": 0.335,
+ "step": 8519
+ },
+ {
+ "epoch": 4.028368794326241,
+ "grad_norm": 2.9302117824554443,
+ "learning_rate": 1.2356211129082673e-06,
+ "loss": 0.3054,
+ "step": 8520
+ },
+ {
+ "epoch": 4.0288416075650115,
+ "grad_norm": 3.1805200576782227,
+ "learning_rate": 1.2350829897339996e-06,
+ "loss": 0.3219,
+ "step": 8521
+ },
+ {
+ "epoch": 4.029314420803782,
+ "grad_norm": 3.2687618732452393,
+ "learning_rate": 1.2345449453205688e-06,
+ "loss": 0.3966,
+ "step": 8522
+ },
+ {
+ "epoch": 4.029787234042553,
+ "grad_norm": 3.2010693550109863,
+ "learning_rate": 1.2340069797014741e-06,
+ "loss": 0.3547,
+ "step": 8523
+ },
+ {
+ "epoch": 4.030260047281324,
+ "grad_norm": 2.7061285972595215,
+ "learning_rate": 1.233469092910215e-06,
+ "loss": 0.2829,
+ "step": 8524
+ },
+ {
+ "epoch": 4.030732860520095,
+ "grad_norm": 3.1565401554107666,
+ "learning_rate": 1.2329312849802817e-06,
+ "loss": 0.3376,
+ "step": 8525
+ },
+ {
+ "epoch": 4.0312056737588655,
+ "grad_norm": 2.8864760398864746,
+ "learning_rate": 1.2323935559451603e-06,
+ "loss": 0.3946,
+ "step": 8526
+ },
+ {
+ "epoch": 4.031678486997636,
+ "grad_norm": 3.4621710777282715,
+ "learning_rate": 1.2318559058383348e-06,
+ "loss": 0.3859,
+ "step": 8527
+ },
+ {
+ "epoch": 4.032151300236406,
+ "grad_norm": 3.074201822280884,
+ "learning_rate": 1.2313183346932806e-06,
+ "loss": 0.3583,
+ "step": 8528
+ },
+ {
+ "epoch": 4.032624113475177,
+ "grad_norm": 3.1746935844421387,
+ "learning_rate": 1.2307808425434715e-06,
+ "loss": 0.3766,
+ "step": 8529
+ },
+ {
+ "epoch": 4.033096926713948,
+ "grad_norm": 3.327202081680298,
+ "learning_rate": 1.2302434294223738e-06,
+ "loss": 0.3556,
+ "step": 8530
+ },
+ {
+ "epoch": 4.033569739952719,
+ "grad_norm": 3.375643730163574,
+ "learning_rate": 1.2297060953634496e-06,
+ "loss": 0.3574,
+ "step": 8531
+ },
+ {
+ "epoch": 4.034042553191489,
+ "grad_norm": 2.8553316593170166,
+ "learning_rate": 1.2291688404001573e-06,
+ "loss": 0.2807,
+ "step": 8532
+ },
+ {
+ "epoch": 4.03451536643026,
+ "grad_norm": 3.439772367477417,
+ "learning_rate": 1.2286316645659492e-06,
+ "loss": 0.3519,
+ "step": 8533
+ },
+ {
+ "epoch": 4.034988179669031,
+ "grad_norm": 2.794694662094116,
+ "learning_rate": 1.2280945678942724e-06,
+ "loss": 0.3117,
+ "step": 8534
+ },
+ {
+ "epoch": 4.035460992907802,
+ "grad_norm": 2.9869043827056885,
+ "learning_rate": 1.2275575504185697e-06,
+ "loss": 0.3663,
+ "step": 8535
+ },
+ {
+ "epoch": 4.035933806146572,
+ "grad_norm": 2.711435317993164,
+ "learning_rate": 1.2270206121722777e-06,
+ "loss": 0.3547,
+ "step": 8536
+ },
+ {
+ "epoch": 4.0364066193853425,
+ "grad_norm": 2.843391180038452,
+ "learning_rate": 1.2264837531888317e-06,
+ "loss": 0.3124,
+ "step": 8537
+ },
+ {
+ "epoch": 4.036879432624113,
+ "grad_norm": 3.2082388401031494,
+ "learning_rate": 1.225946973501658e-06,
+ "loss": 0.3573,
+ "step": 8538
+ },
+ {
+ "epoch": 4.037352245862884,
+ "grad_norm": 2.799604654312134,
+ "learning_rate": 1.2254102731441786e-06,
+ "loss": 0.3234,
+ "step": 8539
+ },
+ {
+ "epoch": 4.037825059101655,
+ "grad_norm": 2.682777166366577,
+ "learning_rate": 1.2248736521498137e-06,
+ "loss": 0.3087,
+ "step": 8540
+ },
+ {
+ "epoch": 4.038297872340426,
+ "grad_norm": 2.8138248920440674,
+ "learning_rate": 1.2243371105519741e-06,
+ "loss": 0.3668,
+ "step": 8541
+ },
+ {
+ "epoch": 4.0387706855791965,
+ "grad_norm": 3.3388478755950928,
+ "learning_rate": 1.2238006483840702e-06,
+ "loss": 0.3294,
+ "step": 8542
+ },
+ {
+ "epoch": 4.039243498817967,
+ "grad_norm": 3.06247615814209,
+ "learning_rate": 1.2232642656795039e-06,
+ "loss": 0.3348,
+ "step": 8543
+ },
+ {
+ "epoch": 4.039716312056737,
+ "grad_norm": 2.742628335952759,
+ "learning_rate": 1.2227279624716724e-06,
+ "loss": 0.3427,
+ "step": 8544
+ },
+ {
+ "epoch": 4.040189125295508,
+ "grad_norm": 3.0785365104675293,
+ "learning_rate": 1.222191738793971e-06,
+ "loss": 0.3762,
+ "step": 8545
+ },
+ {
+ "epoch": 4.040661938534279,
+ "grad_norm": 3.0352790355682373,
+ "learning_rate": 1.2216555946797862e-06,
+ "loss": 0.3311,
+ "step": 8546
+ },
+ {
+ "epoch": 4.04113475177305,
+ "grad_norm": 3.1949729919433594,
+ "learning_rate": 1.2211195301625028e-06,
+ "loss": 0.3429,
+ "step": 8547
+ },
+ {
+ "epoch": 4.04160756501182,
+ "grad_norm": 3.214021921157837,
+ "learning_rate": 1.2205835452754989e-06,
+ "loss": 0.3528,
+ "step": 8548
+ },
+ {
+ "epoch": 4.042080378250591,
+ "grad_norm": 3.206296443939209,
+ "learning_rate": 1.2200476400521474e-06,
+ "loss": 0.3499,
+ "step": 8549
+ },
+ {
+ "epoch": 4.042553191489362,
+ "grad_norm": 3.0067825317382812,
+ "learning_rate": 1.2195118145258167e-06,
+ "loss": 0.3597,
+ "step": 8550
+ },
+ {
+ "epoch": 4.043026004728133,
+ "grad_norm": 2.7811057567596436,
+ "learning_rate": 1.21897606872987e-06,
+ "loss": 0.3268,
+ "step": 8551
+ },
+ {
+ "epoch": 4.043498817966903,
+ "grad_norm": 3.1679844856262207,
+ "learning_rate": 1.218440402697667e-06,
+ "loss": 0.4025,
+ "step": 8552
+ },
+ {
+ "epoch": 4.0439716312056735,
+ "grad_norm": 3.2010326385498047,
+ "learning_rate": 1.217904816462561e-06,
+ "loss": 0.3426,
+ "step": 8553
+ },
+ {
+ "epoch": 4.044444444444444,
+ "grad_norm": 3.381863832473755,
+ "learning_rate": 1.217369310057899e-06,
+ "loss": 0.3693,
+ "step": 8554
+ },
+ {
+ "epoch": 4.044917257683215,
+ "grad_norm": 3.471402168273926,
+ "learning_rate": 1.2168338835170267e-06,
+ "loss": 0.3977,
+ "step": 8555
+ },
+ {
+ "epoch": 4.045390070921986,
+ "grad_norm": 3.0549192428588867,
+ "learning_rate": 1.2162985368732813e-06,
+ "loss": 0.3262,
+ "step": 8556
+ },
+ {
+ "epoch": 4.045862884160757,
+ "grad_norm": 3.02451229095459,
+ "learning_rate": 1.215763270159998e-06,
+ "loss": 0.3408,
+ "step": 8557
+ },
+ {
+ "epoch": 4.0463356973995275,
+ "grad_norm": 3.1335513591766357,
+ "learning_rate": 1.215228083410505e-06,
+ "loss": 0.3275,
+ "step": 8558
+ },
+ {
+ "epoch": 4.046808510638298,
+ "grad_norm": 3.379655599594116,
+ "learning_rate": 1.2146929766581242e-06,
+ "loss": 0.3511,
+ "step": 8559
+ },
+ {
+ "epoch": 4.047281323877068,
+ "grad_norm": 3.210146903991699,
+ "learning_rate": 1.2141579499361772e-06,
+ "loss": 0.3607,
+ "step": 8560
+ },
+ {
+ "epoch": 4.047754137115839,
+ "grad_norm": 3.3693792819976807,
+ "learning_rate": 1.2136230032779753e-06,
+ "loss": 0.3642,
+ "step": 8561
+ },
+ {
+ "epoch": 4.04822695035461,
+ "grad_norm": 3.0397274494171143,
+ "learning_rate": 1.2130881367168292e-06,
+ "loss": 0.3376,
+ "step": 8562
+ },
+ {
+ "epoch": 4.048699763593381,
+ "grad_norm": 3.119372606277466,
+ "learning_rate": 1.212553350286042e-06,
+ "loss": 0.3581,
+ "step": 8563
+ },
+ {
+ "epoch": 4.049172576832151,
+ "grad_norm": 2.9431848526000977,
+ "learning_rate": 1.2120186440189124e-06,
+ "loss": 0.3453,
+ "step": 8564
+ },
+ {
+ "epoch": 4.049645390070922,
+ "grad_norm": 3.256748914718628,
+ "learning_rate": 1.2114840179487333e-06,
+ "loss": 0.3766,
+ "step": 8565
+ },
+ {
+ "epoch": 4.050118203309693,
+ "grad_norm": 2.792759656906128,
+ "learning_rate": 1.2109494721087953e-06,
+ "loss": 0.3396,
+ "step": 8566
+ },
+ {
+ "epoch": 4.050591016548464,
+ "grad_norm": 2.9790122509002686,
+ "learning_rate": 1.2104150065323813e-06,
+ "loss": 0.3631,
+ "step": 8567
+ },
+ {
+ "epoch": 4.051063829787234,
+ "grad_norm": 2.7998805046081543,
+ "learning_rate": 1.2098806212527705e-06,
+ "loss": 0.3442,
+ "step": 8568
+ },
+ {
+ "epoch": 4.0515366430260045,
+ "grad_norm": 3.1292848587036133,
+ "learning_rate": 1.2093463163032351e-06,
+ "loss": 0.3798,
+ "step": 8569
+ },
+ {
+ "epoch": 4.052009456264775,
+ "grad_norm": 3.156205892562866,
+ "learning_rate": 1.2088120917170465e-06,
+ "loss": 0.309,
+ "step": 8570
+ },
+ {
+ "epoch": 4.052482269503546,
+ "grad_norm": 2.8891193866729736,
+ "learning_rate": 1.208277947527467e-06,
+ "loss": 0.2989,
+ "step": 8571
+ },
+ {
+ "epoch": 4.052955082742317,
+ "grad_norm": 3.087719678878784,
+ "learning_rate": 1.2077438837677548e-06,
+ "loss": 0.3348,
+ "step": 8572
+ },
+ {
+ "epoch": 4.053427895981088,
+ "grad_norm": 3.345583915710449,
+ "learning_rate": 1.2072099004711657e-06,
+ "loss": 0.3395,
+ "step": 8573
+ },
+ {
+ "epoch": 4.0539007092198585,
+ "grad_norm": 2.9834377765655518,
+ "learning_rate": 1.2066759976709463e-06,
+ "loss": 0.3252,
+ "step": 8574
+ },
+ {
+ "epoch": 4.054373522458629,
+ "grad_norm": 3.0764353275299072,
+ "learning_rate": 1.2061421754003425e-06,
+ "loss": 0.3467,
+ "step": 8575
+ },
+ {
+ "epoch": 4.054846335697399,
+ "grad_norm": 3.332232713699341,
+ "learning_rate": 1.2056084336925919e-06,
+ "loss": 0.3448,
+ "step": 8576
+ },
+ {
+ "epoch": 4.05531914893617,
+ "grad_norm": 3.1885993480682373,
+ "learning_rate": 1.2050747725809275e-06,
+ "loss": 0.325,
+ "step": 8577
+ },
+ {
+ "epoch": 4.055791962174941,
+ "grad_norm": 3.2727091312408447,
+ "learning_rate": 1.2045411920985798e-06,
+ "loss": 0.3755,
+ "step": 8578
+ },
+ {
+ "epoch": 4.0562647754137116,
+ "grad_norm": 3.0687687397003174,
+ "learning_rate": 1.2040076922787708e-06,
+ "loss": 0.2791,
+ "step": 8579
+ },
+ {
+ "epoch": 4.056737588652482,
+ "grad_norm": 3.2538771629333496,
+ "learning_rate": 1.2034742731547211e-06,
+ "loss": 0.3409,
+ "step": 8580
+ },
+ {
+ "epoch": 4.057210401891253,
+ "grad_norm": 3.237423896789551,
+ "learning_rate": 1.2029409347596429e-06,
+ "loss": 0.3803,
+ "step": 8581
+ },
+ {
+ "epoch": 4.057683215130024,
+ "grad_norm": 3.3347854614257812,
+ "learning_rate": 1.2024076771267457e-06,
+ "loss": 0.3123,
+ "step": 8582
+ },
+ {
+ "epoch": 4.058156028368795,
+ "grad_norm": 3.1294021606445312,
+ "learning_rate": 1.2018745002892327e-06,
+ "loss": 0.33,
+ "step": 8583
+ },
+ {
+ "epoch": 4.058628841607565,
+ "grad_norm": 2.9440014362335205,
+ "learning_rate": 1.2013414042803013e-06,
+ "loss": 0.3698,
+ "step": 8584
+ },
+ {
+ "epoch": 4.0591016548463354,
+ "grad_norm": 3.602764129638672,
+ "learning_rate": 1.200808389133147e-06,
+ "loss": 0.3733,
+ "step": 8585
+ },
+ {
+ "epoch": 4.059574468085106,
+ "grad_norm": 3.2689952850341797,
+ "learning_rate": 1.2002754548809578e-06,
+ "loss": 0.3188,
+ "step": 8586
+ },
+ {
+ "epoch": 4.060047281323877,
+ "grad_norm": 3.15454363822937,
+ "learning_rate": 1.199742601556916e-06,
+ "loss": 0.3493,
+ "step": 8587
+ },
+ {
+ "epoch": 4.060520094562648,
+ "grad_norm": 2.843860387802124,
+ "learning_rate": 1.1992098291942016e-06,
+ "loss": 0.3277,
+ "step": 8588
+ },
+ {
+ "epoch": 4.060992907801419,
+ "grad_norm": 3.0749056339263916,
+ "learning_rate": 1.1986771378259876e-06,
+ "loss": 0.3465,
+ "step": 8589
+ },
+ {
+ "epoch": 4.061465721040189,
+ "grad_norm": 3.3339948654174805,
+ "learning_rate": 1.1981445274854412e-06,
+ "loss": 0.3507,
+ "step": 8590
+ },
+ {
+ "epoch": 4.06193853427896,
+ "grad_norm": 2.7992780208587646,
+ "learning_rate": 1.1976119982057275e-06,
+ "loss": 0.302,
+ "step": 8591
+ },
+ {
+ "epoch": 4.06241134751773,
+ "grad_norm": 3.0862269401550293,
+ "learning_rate": 1.1970795500200028e-06,
+ "loss": 0.3365,
+ "step": 8592
+ },
+ {
+ "epoch": 4.062884160756501,
+ "grad_norm": 3.263456106185913,
+ "learning_rate": 1.1965471829614222e-06,
+ "loss": 0.3764,
+ "step": 8593
+ },
+ {
+ "epoch": 4.063356973995272,
+ "grad_norm": 3.0682623386383057,
+ "learning_rate": 1.1960148970631332e-06,
+ "loss": 0.3488,
+ "step": 8594
+ },
+ {
+ "epoch": 4.0638297872340425,
+ "grad_norm": 2.8910646438598633,
+ "learning_rate": 1.195482692358278e-06,
+ "loss": 0.3224,
+ "step": 8595
+ },
+ {
+ "epoch": 4.064302600472813,
+ "grad_norm": 3.170072555541992,
+ "learning_rate": 1.1949505688799961e-06,
+ "loss": 0.3058,
+ "step": 8596
+ },
+ {
+ "epoch": 4.064775413711584,
+ "grad_norm": 3.018674373626709,
+ "learning_rate": 1.19441852666142e-06,
+ "loss": 0.3824,
+ "step": 8597
+ },
+ {
+ "epoch": 4.065248226950355,
+ "grad_norm": 3.0038044452667236,
+ "learning_rate": 1.1938865657356773e-06,
+ "loss": 0.3657,
+ "step": 8598
+ },
+ {
+ "epoch": 4.065721040189126,
+ "grad_norm": 3.248204469680786,
+ "learning_rate": 1.193354686135891e-06,
+ "loss": 0.3305,
+ "step": 8599
+ },
+ {
+ "epoch": 4.066193853427896,
+ "grad_norm": 3.144714832305908,
+ "learning_rate": 1.192822887895178e-06,
+ "loss": 0.3395,
+ "step": 8600
+ },
+ {
+ "epoch": 4.066666666666666,
+ "grad_norm": 2.9457240104675293,
+ "learning_rate": 1.1922911710466531e-06,
+ "loss": 0.3288,
+ "step": 8601
+ },
+ {
+ "epoch": 4.067139479905437,
+ "grad_norm": 3.1602869033813477,
+ "learning_rate": 1.1917595356234218e-06,
+ "loss": 0.3713,
+ "step": 8602
+ },
+ {
+ "epoch": 4.067612293144208,
+ "grad_norm": 3.0820837020874023,
+ "learning_rate": 1.1912279816585888e-06,
+ "loss": 0.2987,
+ "step": 8603
+ },
+ {
+ "epoch": 4.068085106382979,
+ "grad_norm": 3.0366809368133545,
+ "learning_rate": 1.1906965091852502e-06,
+ "loss": 0.4151,
+ "step": 8604
+ },
+ {
+ "epoch": 4.06855791962175,
+ "grad_norm": 3.229402780532837,
+ "learning_rate": 1.190165118236498e-06,
+ "loss": 0.321,
+ "step": 8605
+ },
+ {
+ "epoch": 4.06903073286052,
+ "grad_norm": 2.832232713699341,
+ "learning_rate": 1.1896338088454217e-06,
+ "loss": 0.3551,
+ "step": 8606
+ },
+ {
+ "epoch": 4.069503546099291,
+ "grad_norm": 3.5618600845336914,
+ "learning_rate": 1.1891025810451012e-06,
+ "loss": 0.3704,
+ "step": 8607
+ },
+ {
+ "epoch": 4.069976359338061,
+ "grad_norm": 3.287827491760254,
+ "learning_rate": 1.1885714348686158e-06,
+ "loss": 0.3469,
+ "step": 8608
+ },
+ {
+ "epoch": 4.070449172576832,
+ "grad_norm": 3.468825101852417,
+ "learning_rate": 1.188040370349037e-06,
+ "loss": 0.3687,
+ "step": 8609
+ },
+ {
+ "epoch": 4.070921985815603,
+ "grad_norm": 3.2931180000305176,
+ "learning_rate": 1.1875093875194302e-06,
+ "loss": 0.3832,
+ "step": 8610
+ },
+ {
+ "epoch": 4.0713947990543735,
+ "grad_norm": 2.9613003730773926,
+ "learning_rate": 1.18697848641286e-06,
+ "loss": 0.3314,
+ "step": 8611
+ },
+ {
+ "epoch": 4.071867612293144,
+ "grad_norm": 3.1507649421691895,
+ "learning_rate": 1.1864476670623816e-06,
+ "loss": 0.3153,
+ "step": 8612
+ },
+ {
+ "epoch": 4.072340425531915,
+ "grad_norm": 2.844064950942993,
+ "learning_rate": 1.1859169295010478e-06,
+ "loss": 0.3566,
+ "step": 8613
+ },
+ {
+ "epoch": 4.072813238770686,
+ "grad_norm": 3.227264881134033,
+ "learning_rate": 1.1853862737619042e-06,
+ "loss": 0.3717,
+ "step": 8614
+ },
+ {
+ "epoch": 4.073286052009456,
+ "grad_norm": 2.9416239261627197,
+ "learning_rate": 1.1848556998779922e-06,
+ "loss": 0.3438,
+ "step": 8615
+ },
+ {
+ "epoch": 4.073758865248227,
+ "grad_norm": 4.1662492752075195,
+ "learning_rate": 1.18432520788235e-06,
+ "loss": 0.362,
+ "step": 8616
+ },
+ {
+ "epoch": 4.074231678486997,
+ "grad_norm": 3.47951602935791,
+ "learning_rate": 1.183794797808008e-06,
+ "loss": 0.3672,
+ "step": 8617
+ },
+ {
+ "epoch": 4.074704491725768,
+ "grad_norm": 2.998969793319702,
+ "learning_rate": 1.1832644696879919e-06,
+ "loss": 0.3281,
+ "step": 8618
+ },
+ {
+ "epoch": 4.075177304964539,
+ "grad_norm": 2.956167221069336,
+ "learning_rate": 1.182734223555324e-06,
+ "loss": 0.3059,
+ "step": 8619
+ },
+ {
+ "epoch": 4.07565011820331,
+ "grad_norm": 3.447821855545044,
+ "learning_rate": 1.1822040594430195e-06,
+ "loss": 0.333,
+ "step": 8620
+ },
+ {
+ "epoch": 4.076122931442081,
+ "grad_norm": 3.072972059249878,
+ "learning_rate": 1.1816739773840905e-06,
+ "loss": 0.3737,
+ "step": 8621
+ },
+ {
+ "epoch": 4.076595744680851,
+ "grad_norm": 3.142913341522217,
+ "learning_rate": 1.1811439774115424e-06,
+ "loss": 0.3697,
+ "step": 8622
+ },
+ {
+ "epoch": 4.077068557919622,
+ "grad_norm": 3.4997763633728027,
+ "learning_rate": 1.1806140595583745e-06,
+ "loss": 0.4177,
+ "step": 8623
+ },
+ {
+ "epoch": 4.077541371158392,
+ "grad_norm": 3.032951831817627,
+ "learning_rate": 1.1800842238575853e-06,
+ "loss": 0.351,
+ "step": 8624
+ },
+ {
+ "epoch": 4.078014184397163,
+ "grad_norm": 2.8878438472747803,
+ "learning_rate": 1.1795544703421625e-06,
+ "loss": 0.3409,
+ "step": 8625
+ },
+ {
+ "epoch": 4.078486997635934,
+ "grad_norm": 2.931614637374878,
+ "learning_rate": 1.1790247990450936e-06,
+ "loss": 0.3416,
+ "step": 8626
+ },
+ {
+ "epoch": 4.0789598108747045,
+ "grad_norm": 3.1719822883605957,
+ "learning_rate": 1.1784952099993586e-06,
+ "loss": 0.3574,
+ "step": 8627
+ },
+ {
+ "epoch": 4.079432624113475,
+ "grad_norm": 2.960068464279175,
+ "learning_rate": 1.1779657032379322e-06,
+ "loss": 0.3557,
+ "step": 8628
+ },
+ {
+ "epoch": 4.079905437352246,
+ "grad_norm": 3.1410937309265137,
+ "learning_rate": 1.1774362787937843e-06,
+ "loss": 0.3839,
+ "step": 8629
+ },
+ {
+ "epoch": 4.080378250591017,
+ "grad_norm": 3.596153736114502,
+ "learning_rate": 1.1769069366998793e-06,
+ "loss": 0.3135,
+ "step": 8630
+ },
+ {
+ "epoch": 4.080851063829787,
+ "grad_norm": 3.385826587677002,
+ "learning_rate": 1.1763776769891786e-06,
+ "loss": 0.3624,
+ "step": 8631
+ },
+ {
+ "epoch": 4.081323877068558,
+ "grad_norm": 3.2531018257141113,
+ "learning_rate": 1.175848499694636e-06,
+ "loss": 0.3593,
+ "step": 8632
+ },
+ {
+ "epoch": 4.081796690307328,
+ "grad_norm": 3.3864004611968994,
+ "learning_rate": 1.1753194048492004e-06,
+ "loss": 0.3929,
+ "step": 8633
+ },
+ {
+ "epoch": 4.082269503546099,
+ "grad_norm": 2.8734285831451416,
+ "learning_rate": 1.1747903924858175e-06,
+ "loss": 0.3145,
+ "step": 8634
+ },
+ {
+ "epoch": 4.08274231678487,
+ "grad_norm": 3.3261659145355225,
+ "learning_rate": 1.174261462637426e-06,
+ "loss": 0.3351,
+ "step": 8635
+ },
+ {
+ "epoch": 4.083215130023641,
+ "grad_norm": 3.413990020751953,
+ "learning_rate": 1.1737326153369594e-06,
+ "loss": 0.3984,
+ "step": 8636
+ },
+ {
+ "epoch": 4.083687943262412,
+ "grad_norm": 3.311741590499878,
+ "learning_rate": 1.1732038506173481e-06,
+ "loss": 0.3716,
+ "step": 8637
+ },
+ {
+ "epoch": 4.084160756501182,
+ "grad_norm": 3.691573143005371,
+ "learning_rate": 1.1726751685115142e-06,
+ "loss": 0.3542,
+ "step": 8638
+ },
+ {
+ "epoch": 4.084633569739952,
+ "grad_norm": 3.1951167583465576,
+ "learning_rate": 1.1721465690523784e-06,
+ "loss": 0.3683,
+ "step": 8639
+ },
+ {
+ "epoch": 4.085106382978723,
+ "grad_norm": 3.1731514930725098,
+ "learning_rate": 1.1716180522728534e-06,
+ "loss": 0.3552,
+ "step": 8640
+ },
+ {
+ "epoch": 4.085579196217494,
+ "grad_norm": 3.1588845252990723,
+ "learning_rate": 1.1710896182058465e-06,
+ "loss": 0.3908,
+ "step": 8641
+ },
+ {
+ "epoch": 4.086052009456265,
+ "grad_norm": 3.6902294158935547,
+ "learning_rate": 1.1705612668842628e-06,
+ "loss": 0.4099,
+ "step": 8642
+ },
+ {
+ "epoch": 4.0865248226950355,
+ "grad_norm": 4.56397819519043,
+ "learning_rate": 1.1700329983409988e-06,
+ "loss": 0.3456,
+ "step": 8643
+ },
+ {
+ "epoch": 4.086997635933806,
+ "grad_norm": 2.924715995788574,
+ "learning_rate": 1.1695048126089492e-06,
+ "loss": 0.3885,
+ "step": 8644
+ },
+ {
+ "epoch": 4.087470449172577,
+ "grad_norm": 3.537550687789917,
+ "learning_rate": 1.1689767097210009e-06,
+ "loss": 0.3551,
+ "step": 8645
+ },
+ {
+ "epoch": 4.087943262411348,
+ "grad_norm": 3.0198440551757812,
+ "learning_rate": 1.1684486897100364e-06,
+ "loss": 0.3448,
+ "step": 8646
+ },
+ {
+ "epoch": 4.088416075650118,
+ "grad_norm": 3.448965072631836,
+ "learning_rate": 1.1679207526089334e-06,
+ "loss": 0.3252,
+ "step": 8647
+ },
+ {
+ "epoch": 4.088888888888889,
+ "grad_norm": 3.057326078414917,
+ "learning_rate": 1.167392898450563e-06,
+ "loss": 0.3231,
+ "step": 8648
+ },
+ {
+ "epoch": 4.089361702127659,
+ "grad_norm": 3.0788655281066895,
+ "learning_rate": 1.1668651272677948e-06,
+ "loss": 0.3273,
+ "step": 8649
+ },
+ {
+ "epoch": 4.08983451536643,
+ "grad_norm": 2.9126291275024414,
+ "learning_rate": 1.1663374390934893e-06,
+ "loss": 0.3162,
+ "step": 8650
+ },
+ {
+ "epoch": 4.090307328605201,
+ "grad_norm": 3.278874635696411,
+ "learning_rate": 1.1658098339605027e-06,
+ "loss": 0.3123,
+ "step": 8651
+ },
+ {
+ "epoch": 4.090780141843972,
+ "grad_norm": 2.8490889072418213,
+ "learning_rate": 1.1652823119016882e-06,
+ "loss": 0.3408,
+ "step": 8652
+ },
+ {
+ "epoch": 4.091252955082743,
+ "grad_norm": 3.0473995208740234,
+ "learning_rate": 1.164754872949891e-06,
+ "loss": 0.3349,
+ "step": 8653
+ },
+ {
+ "epoch": 4.091725768321513,
+ "grad_norm": 2.9052987098693848,
+ "learning_rate": 1.1642275171379535e-06,
+ "loss": 0.3113,
+ "step": 8654
+ },
+ {
+ "epoch": 4.092198581560283,
+ "grad_norm": 3.1060919761657715,
+ "learning_rate": 1.1637002444987116e-06,
+ "loss": 0.3139,
+ "step": 8655
+ },
+ {
+ "epoch": 4.092671394799054,
+ "grad_norm": 3.172394275665283,
+ "learning_rate": 1.163173055064995e-06,
+ "loss": 0.3555,
+ "step": 8656
+ },
+ {
+ "epoch": 4.093144208037825,
+ "grad_norm": 3.36523699760437,
+ "learning_rate": 1.1626459488696313e-06,
+ "loss": 0.3835,
+ "step": 8657
+ },
+ {
+ "epoch": 4.093617021276596,
+ "grad_norm": 2.9513938426971436,
+ "learning_rate": 1.1621189259454393e-06,
+ "loss": 0.3432,
+ "step": 8658
+ },
+ {
+ "epoch": 4.0940898345153665,
+ "grad_norm": 2.8415515422821045,
+ "learning_rate": 1.1615919863252365e-06,
+ "loss": 0.3494,
+ "step": 8659
+ },
+ {
+ "epoch": 4.094562647754137,
+ "grad_norm": 3.3759984970092773,
+ "learning_rate": 1.1610651300418315e-06,
+ "loss": 0.3519,
+ "step": 8660
+ },
+ {
+ "epoch": 4.095035460992908,
+ "grad_norm": 3.1927380561828613,
+ "learning_rate": 1.1605383571280304e-06,
+ "loss": 0.3675,
+ "step": 8661
+ },
+ {
+ "epoch": 4.095508274231679,
+ "grad_norm": 2.800658941268921,
+ "learning_rate": 1.1600116676166321e-06,
+ "loss": 0.3291,
+ "step": 8662
+ },
+ {
+ "epoch": 4.095981087470449,
+ "grad_norm": 2.9948630332946777,
+ "learning_rate": 1.1594850615404316e-06,
+ "loss": 0.332,
+ "step": 8663
+ },
+ {
+ "epoch": 4.0964539007092196,
+ "grad_norm": 3.032003879547119,
+ "learning_rate": 1.1589585389322176e-06,
+ "loss": 0.3583,
+ "step": 8664
+ },
+ {
+ "epoch": 4.09692671394799,
+ "grad_norm": 2.9765310287475586,
+ "learning_rate": 1.1584320998247757e-06,
+ "loss": 0.3296,
+ "step": 8665
+ },
+ {
+ "epoch": 4.097399527186761,
+ "grad_norm": 3.049954414367676,
+ "learning_rate": 1.1579057442508838e-06,
+ "loss": 0.4007,
+ "step": 8666
+ },
+ {
+ "epoch": 4.097872340425532,
+ "grad_norm": 3.3874928951263428,
+ "learning_rate": 1.1573794722433168e-06,
+ "loss": 0.3856,
+ "step": 8667
+ },
+ {
+ "epoch": 4.098345153664303,
+ "grad_norm": 2.56701397895813,
+ "learning_rate": 1.1568532838348432e-06,
+ "loss": 0.3291,
+ "step": 8668
+ },
+ {
+ "epoch": 4.0988179669030735,
+ "grad_norm": 2.956408739089966,
+ "learning_rate": 1.1563271790582247e-06,
+ "loss": 0.3538,
+ "step": 8669
+ },
+ {
+ "epoch": 4.099290780141844,
+ "grad_norm": 3.827467679977417,
+ "learning_rate": 1.1558011579462225e-06,
+ "loss": 0.3764,
+ "step": 8670
+ },
+ {
+ "epoch": 4.099763593380614,
+ "grad_norm": 3.3271424770355225,
+ "learning_rate": 1.1552752205315867e-06,
+ "loss": 0.339,
+ "step": 8671
+ },
+ {
+ "epoch": 4.100236406619385,
+ "grad_norm": 3.0050785541534424,
+ "learning_rate": 1.1547493668470675e-06,
+ "loss": 0.3764,
+ "step": 8672
+ },
+ {
+ "epoch": 4.100709219858156,
+ "grad_norm": 2.6030385494232178,
+ "learning_rate": 1.1542235969254065e-06,
+ "loss": 0.3507,
+ "step": 8673
+ },
+ {
+ "epoch": 4.101182033096927,
+ "grad_norm": 3.081695556640625,
+ "learning_rate": 1.1536979107993402e-06,
+ "loss": 0.3386,
+ "step": 8674
+ },
+ {
+ "epoch": 4.101654846335697,
+ "grad_norm": 2.7685163021087646,
+ "learning_rate": 1.1531723085016025e-06,
+ "loss": 0.3456,
+ "step": 8675
+ },
+ {
+ "epoch": 4.102127659574468,
+ "grad_norm": 3.037252902984619,
+ "learning_rate": 1.1526467900649195e-06,
+ "loss": 0.3489,
+ "step": 8676
+ },
+ {
+ "epoch": 4.102600472813239,
+ "grad_norm": 2.9675045013427734,
+ "learning_rate": 1.1521213555220129e-06,
+ "loss": 0.3638,
+ "step": 8677
+ },
+ {
+ "epoch": 4.10307328605201,
+ "grad_norm": 3.3377575874328613,
+ "learning_rate": 1.1515960049055994e-06,
+ "loss": 0.3557,
+ "step": 8678
+ },
+ {
+ "epoch": 4.10354609929078,
+ "grad_norm": 2.7452030181884766,
+ "learning_rate": 1.1510707382483888e-06,
+ "loss": 0.285,
+ "step": 8679
+ },
+ {
+ "epoch": 4.1040189125295505,
+ "grad_norm": 2.9602560997009277,
+ "learning_rate": 1.1505455555830897e-06,
+ "loss": 0.321,
+ "step": 8680
+ },
+ {
+ "epoch": 4.104491725768321,
+ "grad_norm": 3.122945547103882,
+ "learning_rate": 1.1500204569424007e-06,
+ "loss": 0.375,
+ "step": 8681
+ },
+ {
+ "epoch": 4.104964539007092,
+ "grad_norm": 3.334885835647583,
+ "learning_rate": 1.149495442359019e-06,
+ "loss": 0.3399,
+ "step": 8682
+ },
+ {
+ "epoch": 4.105437352245863,
+ "grad_norm": 3.3663594722747803,
+ "learning_rate": 1.1489705118656346e-06,
+ "loss": 0.3482,
+ "step": 8683
+ },
+ {
+ "epoch": 4.105910165484634,
+ "grad_norm": 2.9761641025543213,
+ "learning_rate": 1.1484456654949313e-06,
+ "loss": 0.3317,
+ "step": 8684
+ },
+ {
+ "epoch": 4.1063829787234045,
+ "grad_norm": 3.486905574798584,
+ "learning_rate": 1.147920903279591e-06,
+ "loss": 0.3737,
+ "step": 8685
+ },
+ {
+ "epoch": 4.106855791962175,
+ "grad_norm": 3.4820523262023926,
+ "learning_rate": 1.1473962252522875e-06,
+ "loss": 0.4135,
+ "step": 8686
+ },
+ {
+ "epoch": 4.107328605200945,
+ "grad_norm": 3.314117431640625,
+ "learning_rate": 1.146871631445689e-06,
+ "loss": 0.369,
+ "step": 8687
+ },
+ {
+ "epoch": 4.107801418439716,
+ "grad_norm": 2.9497411251068115,
+ "learning_rate": 1.1463471218924615e-06,
+ "loss": 0.3233,
+ "step": 8688
+ },
+ {
+ "epoch": 4.108274231678487,
+ "grad_norm": 3.1337075233459473,
+ "learning_rate": 1.1458226966252624e-06,
+ "loss": 0.401,
+ "step": 8689
+ },
+ {
+ "epoch": 4.108747044917258,
+ "grad_norm": 3.0163166522979736,
+ "learning_rate": 1.1452983556767473e-06,
+ "loss": 0.3812,
+ "step": 8690
+ },
+ {
+ "epoch": 4.109219858156028,
+ "grad_norm": 2.976491928100586,
+ "learning_rate": 1.1447740990795629e-06,
+ "loss": 0.3508,
+ "step": 8691
+ },
+ {
+ "epoch": 4.109692671394799,
+ "grad_norm": 3.2449910640716553,
+ "learning_rate": 1.144249926866353e-06,
+ "loss": 0.3056,
+ "step": 8692
+ },
+ {
+ "epoch": 4.11016548463357,
+ "grad_norm": 2.562558650970459,
+ "learning_rate": 1.1437258390697553e-06,
+ "loss": 0.2878,
+ "step": 8693
+ },
+ {
+ "epoch": 4.110638297872341,
+ "grad_norm": 3.1823108196258545,
+ "learning_rate": 1.1432018357224017e-06,
+ "loss": 0.2849,
+ "step": 8694
+ },
+ {
+ "epoch": 4.111111111111111,
+ "grad_norm": 2.9045653343200684,
+ "learning_rate": 1.1426779168569217e-06,
+ "loss": 0.3264,
+ "step": 8695
+ },
+ {
+ "epoch": 4.1115839243498815,
+ "grad_norm": 2.7991254329681396,
+ "learning_rate": 1.1421540825059355e-06,
+ "loss": 0.3427,
+ "step": 8696
+ },
+ {
+ "epoch": 4.112056737588652,
+ "grad_norm": 2.9184927940368652,
+ "learning_rate": 1.14163033270206e-06,
+ "loss": 0.3073,
+ "step": 8697
+ },
+ {
+ "epoch": 4.112529550827423,
+ "grad_norm": 3.189335584640503,
+ "learning_rate": 1.1411066674779084e-06,
+ "loss": 0.3836,
+ "step": 8698
+ },
+ {
+ "epoch": 4.113002364066194,
+ "grad_norm": 2.899711847305298,
+ "learning_rate": 1.140583086866085e-06,
+ "loss": 0.3378,
+ "step": 8699
+ },
+ {
+ "epoch": 4.113475177304965,
+ "grad_norm": 3.167665481567383,
+ "learning_rate": 1.1400595908991927e-06,
+ "loss": 0.3273,
+ "step": 8700
+ },
+ {
+ "epoch": 4.1139479905437355,
+ "grad_norm": 3.2930212020874023,
+ "learning_rate": 1.1395361796098268e-06,
+ "loss": 0.3221,
+ "step": 8701
+ },
+ {
+ "epoch": 4.114420803782506,
+ "grad_norm": 3.0603861808776855,
+ "learning_rate": 1.1390128530305764e-06,
+ "loss": 0.3371,
+ "step": 8702
+ },
+ {
+ "epoch": 4.114893617021276,
+ "grad_norm": 3.6339457035064697,
+ "learning_rate": 1.1384896111940289e-06,
+ "loss": 0.3986,
+ "step": 8703
+ },
+ {
+ "epoch": 4.115366430260047,
+ "grad_norm": 2.975799322128296,
+ "learning_rate": 1.1379664541327623e-06,
+ "loss": 0.3021,
+ "step": 8704
+ },
+ {
+ "epoch": 4.115839243498818,
+ "grad_norm": 2.9100987911224365,
+ "learning_rate": 1.1374433818793534e-06,
+ "loss": 0.3473,
+ "step": 8705
+ },
+ {
+ "epoch": 4.116312056737589,
+ "grad_norm": 2.9515233039855957,
+ "learning_rate": 1.1369203944663704e-06,
+ "loss": 0.3004,
+ "step": 8706
+ },
+ {
+ "epoch": 4.116784869976359,
+ "grad_norm": 3.283583879470825,
+ "learning_rate": 1.1363974919263774e-06,
+ "loss": 0.401,
+ "step": 8707
+ },
+ {
+ "epoch": 4.11725768321513,
+ "grad_norm": 3.307530641555786,
+ "learning_rate": 1.1358746742919325e-06,
+ "loss": 0.322,
+ "step": 8708
+ },
+ {
+ "epoch": 4.117730496453901,
+ "grad_norm": 3.6834614276885986,
+ "learning_rate": 1.135351941595591e-06,
+ "loss": 0.3703,
+ "step": 8709
+ },
+ {
+ "epoch": 4.118203309692672,
+ "grad_norm": 3.0829904079437256,
+ "learning_rate": 1.1348292938699e-06,
+ "loss": 0.3283,
+ "step": 8710
+ },
+ {
+ "epoch": 4.118676122931442,
+ "grad_norm": 2.914794921875,
+ "learning_rate": 1.1343067311474033e-06,
+ "loss": 0.337,
+ "step": 8711
+ },
+ {
+ "epoch": 4.1191489361702125,
+ "grad_norm": 3.550536870956421,
+ "learning_rate": 1.1337842534606368e-06,
+ "loss": 0.3752,
+ "step": 8712
+ },
+ {
+ "epoch": 4.119621749408983,
+ "grad_norm": 3.337012767791748,
+ "learning_rate": 1.1332618608421353e-06,
+ "loss": 0.3604,
+ "step": 8713
+ },
+ {
+ "epoch": 4.120094562647754,
+ "grad_norm": 2.7749485969543457,
+ "learning_rate": 1.1327395533244248e-06,
+ "loss": 0.3712,
+ "step": 8714
+ },
+ {
+ "epoch": 4.120567375886525,
+ "grad_norm": 3.571261405944824,
+ "learning_rate": 1.1322173309400258e-06,
+ "loss": 0.4148,
+ "step": 8715
+ },
+ {
+ "epoch": 4.121040189125296,
+ "grad_norm": 3.264871597290039,
+ "learning_rate": 1.1316951937214573e-06,
+ "loss": 0.3229,
+ "step": 8716
+ },
+ {
+ "epoch": 4.1215130023640665,
+ "grad_norm": 2.974625825881958,
+ "learning_rate": 1.131173141701228e-06,
+ "loss": 0.3372,
+ "step": 8717
+ },
+ {
+ "epoch": 4.121985815602837,
+ "grad_norm": 3.18060302734375,
+ "learning_rate": 1.1306511749118466e-06,
+ "loss": 0.4041,
+ "step": 8718
+ },
+ {
+ "epoch": 4.122458628841607,
+ "grad_norm": 2.7793190479278564,
+ "learning_rate": 1.1301292933858115e-06,
+ "loss": 0.3329,
+ "step": 8719
+ },
+ {
+ "epoch": 4.122931442080378,
+ "grad_norm": 3.0883100032806396,
+ "learning_rate": 1.1296074971556179e-06,
+ "loss": 0.3999,
+ "step": 8720
+ },
+ {
+ "epoch": 4.123404255319149,
+ "grad_norm": 2.984799385070801,
+ "learning_rate": 1.1290857862537573e-06,
+ "loss": 0.3432,
+ "step": 8721
+ },
+ {
+ "epoch": 4.12387706855792,
+ "grad_norm": 3.0691094398498535,
+ "learning_rate": 1.1285641607127127e-06,
+ "loss": 0.3043,
+ "step": 8722
+ },
+ {
+ "epoch": 4.12434988179669,
+ "grad_norm": 3.2218985557556152,
+ "learning_rate": 1.128042620564965e-06,
+ "loss": 0.363,
+ "step": 8723
+ },
+ {
+ "epoch": 4.124822695035461,
+ "grad_norm": 2.951098918914795,
+ "learning_rate": 1.1275211658429877e-06,
+ "loss": 0.3459,
+ "step": 8724
+ },
+ {
+ "epoch": 4.125295508274232,
+ "grad_norm": 3.038513660430908,
+ "learning_rate": 1.1269997965792493e-06,
+ "loss": 0.3073,
+ "step": 8725
+ },
+ {
+ "epoch": 4.125768321513003,
+ "grad_norm": 2.7548015117645264,
+ "learning_rate": 1.1264785128062129e-06,
+ "loss": 0.3587,
+ "step": 8726
+ },
+ {
+ "epoch": 4.126241134751773,
+ "grad_norm": 3.618379592895508,
+ "learning_rate": 1.125957314556336e-06,
+ "loss": 0.4009,
+ "step": 8727
+ },
+ {
+ "epoch": 4.1267139479905435,
+ "grad_norm": 3.264702320098877,
+ "learning_rate": 1.1254362018620728e-06,
+ "loss": 0.3684,
+ "step": 8728
+ },
+ {
+ "epoch": 4.127186761229314,
+ "grad_norm": 3.209995746612549,
+ "learning_rate": 1.1249151747558704e-06,
+ "loss": 0.3796,
+ "step": 8729
+ },
+ {
+ "epoch": 4.127659574468085,
+ "grad_norm": 3.164973735809326,
+ "learning_rate": 1.1243942332701693e-06,
+ "loss": 0.3147,
+ "step": 8730
+ },
+ {
+ "epoch": 4.128132387706856,
+ "grad_norm": 3.309659957885742,
+ "learning_rate": 1.1238733774374087e-06,
+ "loss": 0.308,
+ "step": 8731
+ },
+ {
+ "epoch": 4.128605200945627,
+ "grad_norm": 3.138901710510254,
+ "learning_rate": 1.1233526072900184e-06,
+ "loss": 0.3721,
+ "step": 8732
+ },
+ {
+ "epoch": 4.1290780141843975,
+ "grad_norm": 3.5710649490356445,
+ "learning_rate": 1.122831922860424e-06,
+ "loss": 0.3872,
+ "step": 8733
+ },
+ {
+ "epoch": 4.129550827423168,
+ "grad_norm": 3.192469835281372,
+ "learning_rate": 1.1223113241810482e-06,
+ "loss": 0.349,
+ "step": 8734
+ },
+ {
+ "epoch": 4.130023640661938,
+ "grad_norm": 2.9302608966827393,
+ "learning_rate": 1.121790811284304e-06,
+ "loss": 0.3207,
+ "step": 8735
+ },
+ {
+ "epoch": 4.130496453900709,
+ "grad_norm": 3.022963047027588,
+ "learning_rate": 1.121270384202604e-06,
+ "loss": 0.3487,
+ "step": 8736
+ },
+ {
+ "epoch": 4.13096926713948,
+ "grad_norm": 3.0473732948303223,
+ "learning_rate": 1.1207500429683513e-06,
+ "loss": 0.3083,
+ "step": 8737
+ },
+ {
+ "epoch": 4.131442080378251,
+ "grad_norm": 2.9411537647247314,
+ "learning_rate": 1.1202297876139448e-06,
+ "loss": 0.3077,
+ "step": 8738
+ },
+ {
+ "epoch": 4.131914893617021,
+ "grad_norm": 2.9274520874023438,
+ "learning_rate": 1.1197096181717804e-06,
+ "loss": 0.3071,
+ "step": 8739
+ },
+ {
+ "epoch": 4.132387706855792,
+ "grad_norm": 2.79213285446167,
+ "learning_rate": 1.1191895346742454e-06,
+ "loss": 0.3346,
+ "step": 8740
+ },
+ {
+ "epoch": 4.132860520094563,
+ "grad_norm": 3.2763726711273193,
+ "learning_rate": 1.1186695371537235e-06,
+ "loss": 0.3753,
+ "step": 8741
+ },
+ {
+ "epoch": 4.133333333333334,
+ "grad_norm": 3.245525598526001,
+ "learning_rate": 1.1181496256425927e-06,
+ "loss": 0.3586,
+ "step": 8742
+ },
+ {
+ "epoch": 4.133806146572104,
+ "grad_norm": 3.557176351547241,
+ "learning_rate": 1.1176298001732244e-06,
+ "loss": 0.3547,
+ "step": 8743
+ },
+ {
+ "epoch": 4.1342789598108745,
+ "grad_norm": 3.674633741378784,
+ "learning_rate": 1.117110060777988e-06,
+ "loss": 0.3994,
+ "step": 8744
+ },
+ {
+ "epoch": 4.134751773049645,
+ "grad_norm": 3.168025016784668,
+ "learning_rate": 1.1165904074892433e-06,
+ "loss": 0.3568,
+ "step": 8745
+ },
+ {
+ "epoch": 4.135224586288416,
+ "grad_norm": 2.9492177963256836,
+ "learning_rate": 1.1160708403393488e-06,
+ "loss": 0.3257,
+ "step": 8746
+ },
+ {
+ "epoch": 4.135697399527187,
+ "grad_norm": 3.139941930770874,
+ "learning_rate": 1.1155513593606548e-06,
+ "loss": 0.3464,
+ "step": 8747
+ },
+ {
+ "epoch": 4.136170212765958,
+ "grad_norm": 3.1875250339508057,
+ "learning_rate": 1.115031964585506e-06,
+ "loss": 0.3154,
+ "step": 8748
+ },
+ {
+ "epoch": 4.136643026004728,
+ "grad_norm": 3.0219457149505615,
+ "learning_rate": 1.1145126560462447e-06,
+ "loss": 0.3433,
+ "step": 8749
+ },
+ {
+ "epoch": 4.137115839243499,
+ "grad_norm": 2.992807149887085,
+ "learning_rate": 1.1139934337752046e-06,
+ "loss": 0.3127,
+ "step": 8750
+ },
+ {
+ "epoch": 4.137588652482269,
+ "grad_norm": 3.354733943939209,
+ "learning_rate": 1.1134742978047163e-06,
+ "loss": 0.3166,
+ "step": 8751
+ },
+ {
+ "epoch": 4.13806146572104,
+ "grad_norm": 3.1885886192321777,
+ "learning_rate": 1.1129552481671042e-06,
+ "loss": 0.3872,
+ "step": 8752
+ },
+ {
+ "epoch": 4.138534278959811,
+ "grad_norm": 2.8869078159332275,
+ "learning_rate": 1.1124362848946858e-06,
+ "loss": 0.3218,
+ "step": 8753
+ },
+ {
+ "epoch": 4.1390070921985815,
+ "grad_norm": 3.818469285964966,
+ "learning_rate": 1.1119174080197762e-06,
+ "loss": 0.3442,
+ "step": 8754
+ },
+ {
+ "epoch": 4.139479905437352,
+ "grad_norm": 3.2445592880249023,
+ "learning_rate": 1.1113986175746833e-06,
+ "loss": 0.3858,
+ "step": 8755
+ },
+ {
+ "epoch": 4.139952718676123,
+ "grad_norm": 2.654083490371704,
+ "learning_rate": 1.1108799135917098e-06,
+ "loss": 0.3023,
+ "step": 8756
+ },
+ {
+ "epoch": 4.140425531914894,
+ "grad_norm": 3.129635810852051,
+ "learning_rate": 1.1103612961031527e-06,
+ "loss": 0.3179,
+ "step": 8757
+ },
+ {
+ "epoch": 4.140898345153665,
+ "grad_norm": 2.8118138313293457,
+ "learning_rate": 1.1098427651413035e-06,
+ "loss": 0.3374,
+ "step": 8758
+ },
+ {
+ "epoch": 4.141371158392435,
+ "grad_norm": 3.104051113128662,
+ "learning_rate": 1.1093243207384506e-06,
+ "loss": 0.3202,
+ "step": 8759
+ },
+ {
+ "epoch": 4.141843971631205,
+ "grad_norm": 3.12392520904541,
+ "learning_rate": 1.1088059629268744e-06,
+ "loss": 0.3567,
+ "step": 8760
+ },
+ {
+ "epoch": 4.142316784869976,
+ "grad_norm": 3.467481851577759,
+ "learning_rate": 1.1082876917388497e-06,
+ "loss": 0.4148,
+ "step": 8761
+ },
+ {
+ "epoch": 4.142789598108747,
+ "grad_norm": 3.1120564937591553,
+ "learning_rate": 1.1077695072066488e-06,
+ "loss": 0.3838,
+ "step": 8762
+ },
+ {
+ "epoch": 4.143262411347518,
+ "grad_norm": 3.028073310852051,
+ "learning_rate": 1.107251409362535e-06,
+ "loss": 0.3703,
+ "step": 8763
+ },
+ {
+ "epoch": 4.143735224586289,
+ "grad_norm": 3.091510057449341,
+ "learning_rate": 1.1067333982387699e-06,
+ "loss": 0.3695,
+ "step": 8764
+ },
+ {
+ "epoch": 4.144208037825059,
+ "grad_norm": 3.9426586627960205,
+ "learning_rate": 1.1062154738676067e-06,
+ "loss": 0.3934,
+ "step": 8765
+ },
+ {
+ "epoch": 4.14468085106383,
+ "grad_norm": 2.923741102218628,
+ "learning_rate": 1.1056976362812939e-06,
+ "loss": 0.3679,
+ "step": 8766
+ },
+ {
+ "epoch": 4.1451536643026,
+ "grad_norm": 3.1010327339172363,
+ "learning_rate": 1.1051798855120757e-06,
+ "loss": 0.3314,
+ "step": 8767
+ },
+ {
+ "epoch": 4.145626477541371,
+ "grad_norm": 2.9165778160095215,
+ "learning_rate": 1.1046622215921896e-06,
+ "loss": 0.3473,
+ "step": 8768
+ },
+ {
+ "epoch": 4.146099290780142,
+ "grad_norm": 2.8494462966918945,
+ "learning_rate": 1.1041446445538692e-06,
+ "loss": 0.3226,
+ "step": 8769
+ },
+ {
+ "epoch": 4.1465721040189125,
+ "grad_norm": 2.53379225730896,
+ "learning_rate": 1.1036271544293412e-06,
+ "loss": 0.3123,
+ "step": 8770
+ },
+ {
+ "epoch": 4.147044917257683,
+ "grad_norm": 3.0433695316314697,
+ "learning_rate": 1.1031097512508274e-06,
+ "loss": 0.37,
+ "step": 8771
+ },
+ {
+ "epoch": 4.147517730496454,
+ "grad_norm": 3.418458938598633,
+ "learning_rate": 1.1025924350505431e-06,
+ "loss": 0.3266,
+ "step": 8772
+ },
+ {
+ "epoch": 4.147990543735225,
+ "grad_norm": 2.843733787536621,
+ "learning_rate": 1.1020752058607017e-06,
+ "loss": 0.3548,
+ "step": 8773
+ },
+ {
+ "epoch": 4.148463356973995,
+ "grad_norm": 3.122965097427368,
+ "learning_rate": 1.1015580637135073e-06,
+ "loss": 0.3214,
+ "step": 8774
+ },
+ {
+ "epoch": 4.148936170212766,
+ "grad_norm": 3.0042455196380615,
+ "learning_rate": 1.1010410086411601e-06,
+ "loss": 0.3395,
+ "step": 8775
+ },
+ {
+ "epoch": 4.149408983451536,
+ "grad_norm": 2.841426372528076,
+ "learning_rate": 1.1005240406758546e-06,
+ "loss": 0.3381,
+ "step": 8776
+ },
+ {
+ "epoch": 4.149881796690307,
+ "grad_norm": 2.8241262435913086,
+ "learning_rate": 1.100007159849781e-06,
+ "loss": 0.3504,
+ "step": 8777
+ },
+ {
+ "epoch": 4.150354609929078,
+ "grad_norm": 2.5685677528381348,
+ "learning_rate": 1.0994903661951223e-06,
+ "loss": 0.309,
+ "step": 8778
+ },
+ {
+ "epoch": 4.150827423167849,
+ "grad_norm": 3.197665214538574,
+ "learning_rate": 1.0989736597440581e-06,
+ "loss": 0.3722,
+ "step": 8779
+ },
+ {
+ "epoch": 4.15130023640662,
+ "grad_norm": 3.1483469009399414,
+ "learning_rate": 1.098457040528761e-06,
+ "loss": 0.3301,
+ "step": 8780
+ },
+ {
+ "epoch": 4.15177304964539,
+ "grad_norm": 2.8838415145874023,
+ "learning_rate": 1.0979405085813972e-06,
+ "loss": 0.3212,
+ "step": 8781
+ },
+ {
+ "epoch": 4.152245862884161,
+ "grad_norm": 3.1998705863952637,
+ "learning_rate": 1.0974240639341312e-06,
+ "loss": 0.3557,
+ "step": 8782
+ },
+ {
+ "epoch": 4.152718676122931,
+ "grad_norm": 2.9004411697387695,
+ "learning_rate": 1.0969077066191187e-06,
+ "loss": 0.351,
+ "step": 8783
+ },
+ {
+ "epoch": 4.153191489361702,
+ "grad_norm": 3.036574125289917,
+ "learning_rate": 1.0963914366685096e-06,
+ "loss": 0.3762,
+ "step": 8784
+ },
+ {
+ "epoch": 4.153664302600473,
+ "grad_norm": 3.6683623790740967,
+ "learning_rate": 1.0958752541144523e-06,
+ "loss": 0.3938,
+ "step": 8785
+ },
+ {
+ "epoch": 4.1541371158392435,
+ "grad_norm": 2.922271490097046,
+ "learning_rate": 1.0953591589890852e-06,
+ "loss": 0.3375,
+ "step": 8786
+ },
+ {
+ "epoch": 4.154609929078014,
+ "grad_norm": 3.1750547885894775,
+ "learning_rate": 1.094843151324545e-06,
+ "loss": 0.3455,
+ "step": 8787
+ },
+ {
+ "epoch": 4.155082742316785,
+ "grad_norm": 2.7836148738861084,
+ "learning_rate": 1.0943272311529602e-06,
+ "loss": 0.3359,
+ "step": 8788
+ },
+ {
+ "epoch": 4.155555555555556,
+ "grad_norm": 3.4582557678222656,
+ "learning_rate": 1.0938113985064553e-06,
+ "loss": 0.3358,
+ "step": 8789
+ },
+ {
+ "epoch": 4.156028368794326,
+ "grad_norm": 3.0436923503875732,
+ "learning_rate": 1.0932956534171483e-06,
+ "loss": 0.3531,
+ "step": 8790
+ },
+ {
+ "epoch": 4.156501182033097,
+ "grad_norm": 3.1420092582702637,
+ "learning_rate": 1.092779995917152e-06,
+ "loss": 0.3917,
+ "step": 8791
+ },
+ {
+ "epoch": 4.156973995271867,
+ "grad_norm": 2.9556260108947754,
+ "learning_rate": 1.0922644260385756e-06,
+ "loss": 0.3259,
+ "step": 8792
+ },
+ {
+ "epoch": 4.157446808510638,
+ "grad_norm": 2.8876030445098877,
+ "learning_rate": 1.091748943813521e-06,
+ "loss": 0.3447,
+ "step": 8793
+ },
+ {
+ "epoch": 4.157919621749409,
+ "grad_norm": 3.039207696914673,
+ "learning_rate": 1.0912335492740836e-06,
+ "loss": 0.3216,
+ "step": 8794
+ },
+ {
+ "epoch": 4.15839243498818,
+ "grad_norm": 2.852355480194092,
+ "learning_rate": 1.0907182424523568e-06,
+ "loss": 0.2906,
+ "step": 8795
+ },
+ {
+ "epoch": 4.158865248226951,
+ "grad_norm": 2.931675434112549,
+ "learning_rate": 1.0902030233804245e-06,
+ "loss": 0.3124,
+ "step": 8796
+ },
+ {
+ "epoch": 4.159338061465721,
+ "grad_norm": 3.010590076446533,
+ "learning_rate": 1.0896878920903691e-06,
+ "loss": 0.283,
+ "step": 8797
+ },
+ {
+ "epoch": 4.159810874704492,
+ "grad_norm": 3.093153953552246,
+ "learning_rate": 1.0891728486142648e-06,
+ "loss": 0.3269,
+ "step": 8798
+ },
+ {
+ "epoch": 4.160283687943262,
+ "grad_norm": 2.93019437789917,
+ "learning_rate": 1.0886578929841798e-06,
+ "loss": 0.3261,
+ "step": 8799
+ },
+ {
+ "epoch": 4.160756501182033,
+ "grad_norm": 3.176790475845337,
+ "learning_rate": 1.0881430252321803e-06,
+ "loss": 0.2805,
+ "step": 8800
+ },
+ {
+ "epoch": 4.161229314420804,
+ "grad_norm": 3.215359687805176,
+ "learning_rate": 1.0876282453903228e-06,
+ "loss": 0.4022,
+ "step": 8801
+ },
+ {
+ "epoch": 4.1617021276595745,
+ "grad_norm": 3.3343284130096436,
+ "learning_rate": 1.0871135534906623e-06,
+ "loss": 0.3469,
+ "step": 8802
+ },
+ {
+ "epoch": 4.162174940898345,
+ "grad_norm": 3.030043363571167,
+ "learning_rate": 1.0865989495652456e-06,
+ "loss": 0.3548,
+ "step": 8803
+ },
+ {
+ "epoch": 4.162647754137116,
+ "grad_norm": 2.9456260204315186,
+ "learning_rate": 1.0860844336461146e-06,
+ "loss": 0.3356,
+ "step": 8804
+ },
+ {
+ "epoch": 4.163120567375887,
+ "grad_norm": 2.9399044513702393,
+ "learning_rate": 1.0855700057653063e-06,
+ "loss": 0.362,
+ "step": 8805
+ },
+ {
+ "epoch": 4.163593380614657,
+ "grad_norm": 3.3188061714172363,
+ "learning_rate": 1.0850556659548513e-06,
+ "loss": 0.3866,
+ "step": 8806
+ },
+ {
+ "epoch": 4.164066193853428,
+ "grad_norm": 3.1601030826568604,
+ "learning_rate": 1.084541414246775e-06,
+ "loss": 0.3662,
+ "step": 8807
+ },
+ {
+ "epoch": 4.164539007092198,
+ "grad_norm": 3.0458695888519287,
+ "learning_rate": 1.0840272506730993e-06,
+ "loss": 0.3318,
+ "step": 8808
+ },
+ {
+ "epoch": 4.165011820330969,
+ "grad_norm": 3.056387186050415,
+ "learning_rate": 1.0835131752658365e-06,
+ "loss": 0.3538,
+ "step": 8809
+ },
+ {
+ "epoch": 4.16548463356974,
+ "grad_norm": 2.9833531379699707,
+ "learning_rate": 1.0829991880569984e-06,
+ "loss": 0.3088,
+ "step": 8810
+ },
+ {
+ "epoch": 4.165957446808511,
+ "grad_norm": 3.325438976287842,
+ "learning_rate": 1.0824852890785876e-06,
+ "loss": 0.3524,
+ "step": 8811
+ },
+ {
+ "epoch": 4.166430260047282,
+ "grad_norm": 2.781290054321289,
+ "learning_rate": 1.0819714783626009e-06,
+ "loss": 0.3925,
+ "step": 8812
+ },
+ {
+ "epoch": 4.166903073286052,
+ "grad_norm": 16.3265323638916,
+ "learning_rate": 1.0814577559410336e-06,
+ "loss": 0.4248,
+ "step": 8813
+ },
+ {
+ "epoch": 4.167375886524822,
+ "grad_norm": 2.906619071960449,
+ "learning_rate": 1.0809441218458708e-06,
+ "loss": 0.3904,
+ "step": 8814
+ },
+ {
+ "epoch": 4.167848699763593,
+ "grad_norm": 2.7133800983428955,
+ "learning_rate": 1.0804305761090957e-06,
+ "loss": 0.2855,
+ "step": 8815
+ },
+ {
+ "epoch": 4.168321513002364,
+ "grad_norm": 3.252946376800537,
+ "learning_rate": 1.0799171187626844e-06,
+ "loss": 0.3285,
+ "step": 8816
+ },
+ {
+ "epoch": 4.168794326241135,
+ "grad_norm": 3.0832788944244385,
+ "learning_rate": 1.0794037498386062e-06,
+ "loss": 0.3175,
+ "step": 8817
+ },
+ {
+ "epoch": 4.1692671394799055,
+ "grad_norm": 3.046424150466919,
+ "learning_rate": 1.0788904693688284e-06,
+ "loss": 0.3545,
+ "step": 8818
+ },
+ {
+ "epoch": 4.169739952718676,
+ "grad_norm": 3.643488645553589,
+ "learning_rate": 1.0783772773853095e-06,
+ "loss": 0.3889,
+ "step": 8819
+ },
+ {
+ "epoch": 4.170212765957447,
+ "grad_norm": 3.433997392654419,
+ "learning_rate": 1.077864173920004e-06,
+ "loss": 0.311,
+ "step": 8820
+ },
+ {
+ "epoch": 4.170685579196218,
+ "grad_norm": 3.287684679031372,
+ "learning_rate": 1.0773511590048605e-06,
+ "loss": 0.3708,
+ "step": 8821
+ },
+ {
+ "epoch": 4.171158392434988,
+ "grad_norm": 3.5546534061431885,
+ "learning_rate": 1.0768382326718212e-06,
+ "loss": 0.3845,
+ "step": 8822
+ },
+ {
+ "epoch": 4.171631205673759,
+ "grad_norm": 3.2245540618896484,
+ "learning_rate": 1.076325394952826e-06,
+ "loss": 0.4412,
+ "step": 8823
+ },
+ {
+ "epoch": 4.172104018912529,
+ "grad_norm": 3.199784994125366,
+ "learning_rate": 1.0758126458798046e-06,
+ "loss": 0.3635,
+ "step": 8824
+ },
+ {
+ "epoch": 4.1725768321513,
+ "grad_norm": 2.961003303527832,
+ "learning_rate": 1.075299985484686e-06,
+ "loss": 0.3167,
+ "step": 8825
+ },
+ {
+ "epoch": 4.173049645390071,
+ "grad_norm": 2.8316452503204346,
+ "learning_rate": 1.07478741379939e-06,
+ "loss": 0.3342,
+ "step": 8826
+ },
+ {
+ "epoch": 4.173522458628842,
+ "grad_norm": 3.0721595287323,
+ "learning_rate": 1.0742749308558316e-06,
+ "loss": 0.3642,
+ "step": 8827
+ },
+ {
+ "epoch": 4.1739952718676125,
+ "grad_norm": 3.001324415206909,
+ "learning_rate": 1.0737625366859225e-06,
+ "loss": 0.3479,
+ "step": 8828
+ },
+ {
+ "epoch": 4.174468085106383,
+ "grad_norm": 3.199108839035034,
+ "learning_rate": 1.0732502313215665e-06,
+ "loss": 0.3434,
+ "step": 8829
+ },
+ {
+ "epoch": 4.174940898345153,
+ "grad_norm": 3.602139472961426,
+ "learning_rate": 1.072738014794661e-06,
+ "loss": 0.401,
+ "step": 8830
+ },
+ {
+ "epoch": 4.175413711583924,
+ "grad_norm": 3.2303357124328613,
+ "learning_rate": 1.0722258871371025e-06,
+ "loss": 0.3603,
+ "step": 8831
+ },
+ {
+ "epoch": 4.175886524822695,
+ "grad_norm": 3.138611316680908,
+ "learning_rate": 1.0717138483807766e-06,
+ "loss": 0.3481,
+ "step": 8832
+ },
+ {
+ "epoch": 4.176359338061466,
+ "grad_norm": 3.059134006500244,
+ "learning_rate": 1.071201898557567e-06,
+ "loss": 0.357,
+ "step": 8833
+ },
+ {
+ "epoch": 4.176832151300236,
+ "grad_norm": 3.237121820449829,
+ "learning_rate": 1.0706900376993501e-06,
+ "loss": 0.3424,
+ "step": 8834
+ },
+ {
+ "epoch": 4.177304964539007,
+ "grad_norm": 3.1065425872802734,
+ "learning_rate": 1.0701782658379974e-06,
+ "loss": 0.3506,
+ "step": 8835
+ },
+ {
+ "epoch": 4.177777777777778,
+ "grad_norm": 2.9971365928649902,
+ "learning_rate": 1.0696665830053743e-06,
+ "loss": 0.3205,
+ "step": 8836
+ },
+ {
+ "epoch": 4.178250591016549,
+ "grad_norm": 3.2898313999176025,
+ "learning_rate": 1.0691549892333406e-06,
+ "loss": 0.3297,
+ "step": 8837
+ },
+ {
+ "epoch": 4.178723404255319,
+ "grad_norm": 3.166144609451294,
+ "learning_rate": 1.0686434845537525e-06,
+ "loss": 0.3097,
+ "step": 8838
+ },
+ {
+ "epoch": 4.1791962174940895,
+ "grad_norm": 2.9629571437835693,
+ "learning_rate": 1.0681320689984581e-06,
+ "loss": 0.3709,
+ "step": 8839
+ },
+ {
+ "epoch": 4.17966903073286,
+ "grad_norm": 3.2954351902008057,
+ "learning_rate": 1.0676207425993004e-06,
+ "loss": 0.3448,
+ "step": 8840
+ },
+ {
+ "epoch": 4.180141843971631,
+ "grad_norm": 2.8537824153900146,
+ "learning_rate": 1.0671095053881194e-06,
+ "loss": 0.3069,
+ "step": 8841
+ },
+ {
+ "epoch": 4.180614657210402,
+ "grad_norm": 3.382916212081909,
+ "learning_rate": 1.0665983573967453e-06,
+ "loss": 0.3909,
+ "step": 8842
+ },
+ {
+ "epoch": 4.181087470449173,
+ "grad_norm": 3.4717860221862793,
+ "learning_rate": 1.0660872986570072e-06,
+ "loss": 0.3641,
+ "step": 8843
+ },
+ {
+ "epoch": 4.1815602836879435,
+ "grad_norm": 3.088916778564453,
+ "learning_rate": 1.0655763292007256e-06,
+ "loss": 0.3184,
+ "step": 8844
+ },
+ {
+ "epoch": 4.182033096926714,
+ "grad_norm": 2.8693177700042725,
+ "learning_rate": 1.065065449059715e-06,
+ "loss": 0.3486,
+ "step": 8845
+ },
+ {
+ "epoch": 4.182505910165484,
+ "grad_norm": 3.162811517715454,
+ "learning_rate": 1.0645546582657881e-06,
+ "loss": 0.3559,
+ "step": 8846
+ },
+ {
+ "epoch": 4.182978723404255,
+ "grad_norm": 3.8519816398620605,
+ "learning_rate": 1.0640439568507475e-06,
+ "loss": 0.4159,
+ "step": 8847
+ },
+ {
+ "epoch": 4.183451536643026,
+ "grad_norm": 2.9316959381103516,
+ "learning_rate": 1.063533344846394e-06,
+ "loss": 0.34,
+ "step": 8848
+ },
+ {
+ "epoch": 4.183924349881797,
+ "grad_norm": 3.018986463546753,
+ "learning_rate": 1.0630228222845205e-06,
+ "loss": 0.3378,
+ "step": 8849
+ },
+ {
+ "epoch": 4.184397163120567,
+ "grad_norm": 2.949428081512451,
+ "learning_rate": 1.062512389196914e-06,
+ "loss": 0.3634,
+ "step": 8850
+ },
+ {
+ "epoch": 4.184869976359338,
+ "grad_norm": 3.3298749923706055,
+ "learning_rate": 1.0620020456153585e-06,
+ "loss": 0.3067,
+ "step": 8851
+ },
+ {
+ "epoch": 4.185342789598109,
+ "grad_norm": 3.0566864013671875,
+ "learning_rate": 1.0614917915716302e-06,
+ "loss": 0.3534,
+ "step": 8852
+ },
+ {
+ "epoch": 4.18581560283688,
+ "grad_norm": 3.156620979309082,
+ "learning_rate": 1.0609816270975007e-06,
+ "loss": 0.3684,
+ "step": 8853
+ },
+ {
+ "epoch": 4.18628841607565,
+ "grad_norm": 3.0776474475860596,
+ "learning_rate": 1.0604715522247352e-06,
+ "loss": 0.3616,
+ "step": 8854
+ },
+ {
+ "epoch": 4.1867612293144205,
+ "grad_norm": 3.1254587173461914,
+ "learning_rate": 1.059961566985093e-06,
+ "loss": 0.3455,
+ "step": 8855
+ },
+ {
+ "epoch": 4.187234042553191,
+ "grad_norm": 2.8769783973693848,
+ "learning_rate": 1.0594516714103306e-06,
+ "loss": 0.2754,
+ "step": 8856
+ },
+ {
+ "epoch": 4.187706855791962,
+ "grad_norm": 3.461308240890503,
+ "learning_rate": 1.0589418655321962e-06,
+ "loss": 0.3744,
+ "step": 8857
+ },
+ {
+ "epoch": 4.188179669030733,
+ "grad_norm": 3.3546712398529053,
+ "learning_rate": 1.0584321493824317e-06,
+ "loss": 0.4116,
+ "step": 8858
+ },
+ {
+ "epoch": 4.188652482269504,
+ "grad_norm": 3.233792543411255,
+ "learning_rate": 1.0579225229927775e-06,
+ "loss": 0.3591,
+ "step": 8859
+ },
+ {
+ "epoch": 4.1891252955082745,
+ "grad_norm": 3.295444965362549,
+ "learning_rate": 1.0574129863949633e-06,
+ "loss": 0.3179,
+ "step": 8860
+ },
+ {
+ "epoch": 4.189598108747045,
+ "grad_norm": 3.403062105178833,
+ "learning_rate": 1.0569035396207178e-06,
+ "loss": 0.3948,
+ "step": 8861
+ },
+ {
+ "epoch": 4.190070921985815,
+ "grad_norm": 2.901970148086548,
+ "learning_rate": 1.0563941827017613e-06,
+ "loss": 0.3537,
+ "step": 8862
+ },
+ {
+ "epoch": 4.190543735224586,
+ "grad_norm": 3.1239142417907715,
+ "learning_rate": 1.0558849156698078e-06,
+ "loss": 0.3764,
+ "step": 8863
+ },
+ {
+ "epoch": 4.191016548463357,
+ "grad_norm": 2.8480169773101807,
+ "learning_rate": 1.0553757385565694e-06,
+ "loss": 0.3085,
+ "step": 8864
+ },
+ {
+ "epoch": 4.191489361702128,
+ "grad_norm": 3.0914061069488525,
+ "learning_rate": 1.0548666513937487e-06,
+ "loss": 0.3003,
+ "step": 8865
+ },
+ {
+ "epoch": 4.191962174940898,
+ "grad_norm": 2.9875683784484863,
+ "learning_rate": 1.0543576542130452e-06,
+ "loss": 0.3178,
+ "step": 8866
+ },
+ {
+ "epoch": 4.192434988179669,
+ "grad_norm": 2.952052354812622,
+ "learning_rate": 1.053848747046152e-06,
+ "loss": 0.3221,
+ "step": 8867
+ },
+ {
+ "epoch": 4.19290780141844,
+ "grad_norm": 3.2211997509002686,
+ "learning_rate": 1.0533399299247559e-06,
+ "loss": 0.3698,
+ "step": 8868
+ },
+ {
+ "epoch": 4.193380614657211,
+ "grad_norm": 3.2954046726226807,
+ "learning_rate": 1.0528312028805392e-06,
+ "loss": 0.3697,
+ "step": 8869
+ },
+ {
+ "epoch": 4.193853427895981,
+ "grad_norm": 2.978306293487549,
+ "learning_rate": 1.0523225659451768e-06,
+ "loss": 0.3358,
+ "step": 8870
+ },
+ {
+ "epoch": 4.1943262411347515,
+ "grad_norm": 3.3803653717041016,
+ "learning_rate": 1.0518140191503415e-06,
+ "loss": 0.3851,
+ "step": 8871
+ },
+ {
+ "epoch": 4.194799054373522,
+ "grad_norm": 3.282294273376465,
+ "learning_rate": 1.051305562527697e-06,
+ "loss": 0.4518,
+ "step": 8872
+ },
+ {
+ "epoch": 4.195271867612293,
+ "grad_norm": 2.950310468673706,
+ "learning_rate": 1.0507971961089017e-06,
+ "loss": 0.3045,
+ "step": 8873
+ },
+ {
+ "epoch": 4.195744680851064,
+ "grad_norm": 3.4069037437438965,
+ "learning_rate": 1.0502889199256114e-06,
+ "loss": 0.3832,
+ "step": 8874
+ },
+ {
+ "epoch": 4.196217494089835,
+ "grad_norm": 3.1440858840942383,
+ "learning_rate": 1.0497807340094722e-06,
+ "loss": 0.2958,
+ "step": 8875
+ },
+ {
+ "epoch": 4.1966903073286055,
+ "grad_norm": 3.050755262374878,
+ "learning_rate": 1.049272638392129e-06,
+ "loss": 0.3494,
+ "step": 8876
+ },
+ {
+ "epoch": 4.197163120567376,
+ "grad_norm": 2.908078670501709,
+ "learning_rate": 1.0487646331052171e-06,
+ "loss": 0.349,
+ "step": 8877
+ },
+ {
+ "epoch": 4.197635933806146,
+ "grad_norm": 3.2089946269989014,
+ "learning_rate": 1.048256718180367e-06,
+ "loss": 0.3507,
+ "step": 8878
+ },
+ {
+ "epoch": 4.198108747044917,
+ "grad_norm": 2.984745740890503,
+ "learning_rate": 1.0477488936492067e-06,
+ "loss": 0.3252,
+ "step": 8879
+ },
+ {
+ "epoch": 4.198581560283688,
+ "grad_norm": 2.9207515716552734,
+ "learning_rate": 1.0472411595433545e-06,
+ "loss": 0.3192,
+ "step": 8880
+ },
+ {
+ "epoch": 4.199054373522459,
+ "grad_norm": 3.0090811252593994,
+ "learning_rate": 1.0467335158944242e-06,
+ "loss": 0.3827,
+ "step": 8881
+ },
+ {
+ "epoch": 4.199527186761229,
+ "grad_norm": 3.2763171195983887,
+ "learning_rate": 1.0462259627340265e-06,
+ "loss": 0.3481,
+ "step": 8882
+ },
+ {
+ "epoch": 4.2,
+ "grad_norm": 3.068268299102783,
+ "learning_rate": 1.0457185000937636e-06,
+ "loss": 0.3926,
+ "step": 8883
+ },
+ {
+ "epoch": 4.200472813238771,
+ "grad_norm": 2.6999998092651367,
+ "learning_rate": 1.0452111280052326e-06,
+ "loss": 0.2884,
+ "step": 8884
+ },
+ {
+ "epoch": 4.200945626477542,
+ "grad_norm": 3.1187727451324463,
+ "learning_rate": 1.044703846500026e-06,
+ "loss": 0.3797,
+ "step": 8885
+ },
+ {
+ "epoch": 4.201418439716312,
+ "grad_norm": 2.7876172065734863,
+ "learning_rate": 1.0441966556097283e-06,
+ "loss": 0.3284,
+ "step": 8886
+ },
+ {
+ "epoch": 4.2018912529550825,
+ "grad_norm": 2.973261833190918,
+ "learning_rate": 1.0436895553659224e-06,
+ "loss": 0.2845,
+ "step": 8887
+ },
+ {
+ "epoch": 4.202364066193853,
+ "grad_norm": 3.496096611022949,
+ "learning_rate": 1.0431825458001811e-06,
+ "loss": 0.3341,
+ "step": 8888
+ },
+ {
+ "epoch": 4.202836879432624,
+ "grad_norm": 3.370410680770874,
+ "learning_rate": 1.0426756269440761e-06,
+ "loss": 0.3459,
+ "step": 8889
+ },
+ {
+ "epoch": 4.203309692671395,
+ "grad_norm": 2.864126682281494,
+ "learning_rate": 1.0421687988291693e-06,
+ "loss": 0.3195,
+ "step": 8890
+ },
+ {
+ "epoch": 4.203782505910166,
+ "grad_norm": 3.3575501441955566,
+ "learning_rate": 1.0416620614870181e-06,
+ "loss": 0.3424,
+ "step": 8891
+ },
+ {
+ "epoch": 4.2042553191489365,
+ "grad_norm": 3.4441967010498047,
+ "learning_rate": 1.0411554149491766e-06,
+ "loss": 0.3677,
+ "step": 8892
+ },
+ {
+ "epoch": 4.204728132387707,
+ "grad_norm": 3.014472007751465,
+ "learning_rate": 1.0406488592471898e-06,
+ "loss": 0.3004,
+ "step": 8893
+ },
+ {
+ "epoch": 4.205200945626477,
+ "grad_norm": 3.1186721324920654,
+ "learning_rate": 1.0401423944126002e-06,
+ "loss": 0.4182,
+ "step": 8894
+ },
+ {
+ "epoch": 4.205673758865248,
+ "grad_norm": 3.166337013244629,
+ "learning_rate": 1.0396360204769426e-06,
+ "loss": 0.3303,
+ "step": 8895
+ },
+ {
+ "epoch": 4.206146572104019,
+ "grad_norm": 3.081855058670044,
+ "learning_rate": 1.0391297374717454e-06,
+ "loss": 0.3096,
+ "step": 8896
+ },
+ {
+ "epoch": 4.20661938534279,
+ "grad_norm": 3.0924830436706543,
+ "learning_rate": 1.0386235454285348e-06,
+ "loss": 0.3238,
+ "step": 8897
+ },
+ {
+ "epoch": 4.20709219858156,
+ "grad_norm": 3.043519973754883,
+ "learning_rate": 1.0381174443788277e-06,
+ "loss": 0.3322,
+ "step": 8898
+ },
+ {
+ "epoch": 4.207565011820331,
+ "grad_norm": 3.160785675048828,
+ "learning_rate": 1.0376114343541377e-06,
+ "loss": 0.3244,
+ "step": 8899
+ },
+ {
+ "epoch": 4.208037825059102,
+ "grad_norm": 2.9988417625427246,
+ "learning_rate": 1.037105515385971e-06,
+ "loss": 0.3386,
+ "step": 8900
+ },
+ {
+ "epoch": 4.208510638297873,
+ "grad_norm": 2.981959342956543,
+ "learning_rate": 1.0365996875058284e-06,
+ "loss": 0.3412,
+ "step": 8901
+ },
+ {
+ "epoch": 4.208983451536643,
+ "grad_norm": 3.144815683364868,
+ "learning_rate": 1.0360939507452075e-06,
+ "loss": 0.3716,
+ "step": 8902
+ },
+ {
+ "epoch": 4.2094562647754135,
+ "grad_norm": 2.9644055366516113,
+ "learning_rate": 1.0355883051355972e-06,
+ "loss": 0.3488,
+ "step": 8903
+ },
+ {
+ "epoch": 4.209929078014184,
+ "grad_norm": 3.3212029933929443,
+ "learning_rate": 1.035082750708481e-06,
+ "loss": 0.3048,
+ "step": 8904
+ },
+ {
+ "epoch": 4.210401891252955,
+ "grad_norm": 2.82843279838562,
+ "learning_rate": 1.034577287495339e-06,
+ "loss": 0.3141,
+ "step": 8905
+ },
+ {
+ "epoch": 4.210874704491726,
+ "grad_norm": 3.040215253829956,
+ "learning_rate": 1.034071915527643e-06,
+ "loss": 0.3517,
+ "step": 8906
+ },
+ {
+ "epoch": 4.211347517730497,
+ "grad_norm": 2.850985288619995,
+ "learning_rate": 1.033566634836862e-06,
+ "loss": 0.3556,
+ "step": 8907
+ },
+ {
+ "epoch": 4.2118203309692674,
+ "grad_norm": 3.522962808609009,
+ "learning_rate": 1.0330614454544564e-06,
+ "loss": 0.3432,
+ "step": 8908
+ },
+ {
+ "epoch": 4.212293144208038,
+ "grad_norm": 3.0228631496429443,
+ "learning_rate": 1.032556347411881e-06,
+ "loss": 0.3165,
+ "step": 8909
+ },
+ {
+ "epoch": 4.212765957446808,
+ "grad_norm": 3.275134563446045,
+ "learning_rate": 1.0320513407405886e-06,
+ "loss": 0.413,
+ "step": 8910
+ },
+ {
+ "epoch": 4.213238770685579,
+ "grad_norm": 2.850020408630371,
+ "learning_rate": 1.0315464254720213e-06,
+ "loss": 0.3051,
+ "step": 8911
+ },
+ {
+ "epoch": 4.21371158392435,
+ "grad_norm": 3.153916597366333,
+ "learning_rate": 1.0310416016376203e-06,
+ "loss": 0.2973,
+ "step": 8912
+ },
+ {
+ "epoch": 4.2141843971631205,
+ "grad_norm": 3.423772096633911,
+ "learning_rate": 1.0305368692688175e-06,
+ "loss": 0.302,
+ "step": 8913
+ },
+ {
+ "epoch": 4.214657210401891,
+ "grad_norm": 3.420687198638916,
+ "learning_rate": 1.0300322283970404e-06,
+ "loss": 0.3732,
+ "step": 8914
+ },
+ {
+ "epoch": 4.215130023640662,
+ "grad_norm": 3.2490479946136475,
+ "learning_rate": 1.02952767905371e-06,
+ "loss": 0.3793,
+ "step": 8915
+ },
+ {
+ "epoch": 4.215602836879433,
+ "grad_norm": 3.3043079376220703,
+ "learning_rate": 1.0290232212702438e-06,
+ "loss": 0.3472,
+ "step": 8916
+ },
+ {
+ "epoch": 4.216075650118204,
+ "grad_norm": 3.152435779571533,
+ "learning_rate": 1.0285188550780516e-06,
+ "loss": 0.3617,
+ "step": 8917
+ },
+ {
+ "epoch": 4.216548463356974,
+ "grad_norm": 3.311063766479492,
+ "learning_rate": 1.0280145805085384e-06,
+ "loss": 0.3681,
+ "step": 8918
+ },
+ {
+ "epoch": 4.217021276595744,
+ "grad_norm": 3.1113057136535645,
+ "learning_rate": 1.0275103975931016e-06,
+ "loss": 0.3526,
+ "step": 8919
+ },
+ {
+ "epoch": 4.217494089834515,
+ "grad_norm": 2.7904412746429443,
+ "learning_rate": 1.0270063063631369e-06,
+ "loss": 0.3125,
+ "step": 8920
+ },
+ {
+ "epoch": 4.217966903073286,
+ "grad_norm": 3.3566761016845703,
+ "learning_rate": 1.0265023068500293e-06,
+ "loss": 0.3305,
+ "step": 8921
+ },
+ {
+ "epoch": 4.218439716312057,
+ "grad_norm": 2.97943115234375,
+ "learning_rate": 1.0259983990851633e-06,
+ "loss": 0.3277,
+ "step": 8922
+ },
+ {
+ "epoch": 4.218912529550828,
+ "grad_norm": 3.1507925987243652,
+ "learning_rate": 1.0254945830999134e-06,
+ "loss": 0.385,
+ "step": 8923
+ },
+ {
+ "epoch": 4.219385342789598,
+ "grad_norm": 2.632859706878662,
+ "learning_rate": 1.0249908589256493e-06,
+ "loss": 0.2889,
+ "step": 8924
+ },
+ {
+ "epoch": 4.219858156028369,
+ "grad_norm": 2.9816136360168457,
+ "learning_rate": 1.0244872265937378e-06,
+ "loss": 0.2838,
+ "step": 8925
+ },
+ {
+ "epoch": 4.220330969267139,
+ "grad_norm": 2.751431465148926,
+ "learning_rate": 1.0239836861355369e-06,
+ "loss": 0.3069,
+ "step": 8926
+ },
+ {
+ "epoch": 4.22080378250591,
+ "grad_norm": 3.3390228748321533,
+ "learning_rate": 1.0234802375823985e-06,
+ "loss": 0.3074,
+ "step": 8927
+ },
+ {
+ "epoch": 4.221276595744681,
+ "grad_norm": 3.345242500305176,
+ "learning_rate": 1.0229768809656726e-06,
+ "loss": 0.3603,
+ "step": 8928
+ },
+ {
+ "epoch": 4.2217494089834515,
+ "grad_norm": 3.0684640407562256,
+ "learning_rate": 1.0224736163166984e-06,
+ "loss": 0.3343,
+ "step": 8929
+ },
+ {
+ "epoch": 4.222222222222222,
+ "grad_norm": 3.2813572883605957,
+ "learning_rate": 1.0219704436668146e-06,
+ "loss": 0.3173,
+ "step": 8930
+ },
+ {
+ "epoch": 4.222695035460993,
+ "grad_norm": 3.135668992996216,
+ "learning_rate": 1.0214673630473504e-06,
+ "loss": 0.3608,
+ "step": 8931
+ },
+ {
+ "epoch": 4.223167848699764,
+ "grad_norm": 2.8798727989196777,
+ "learning_rate": 1.0209643744896303e-06,
+ "loss": 0.3317,
+ "step": 8932
+ },
+ {
+ "epoch": 4.223640661938534,
+ "grad_norm": 3.2659590244293213,
+ "learning_rate": 1.0204614780249731e-06,
+ "loss": 0.3017,
+ "step": 8933
+ },
+ {
+ "epoch": 4.224113475177305,
+ "grad_norm": 2.988126516342163,
+ "learning_rate": 1.0199586736846911e-06,
+ "loss": 0.3555,
+ "step": 8934
+ },
+ {
+ "epoch": 4.224586288416075,
+ "grad_norm": 3.3775575160980225,
+ "learning_rate": 1.0194559615000937e-06,
+ "loss": 0.3966,
+ "step": 8935
+ },
+ {
+ "epoch": 4.225059101654846,
+ "grad_norm": 3.1004798412323,
+ "learning_rate": 1.0189533415024817e-06,
+ "loss": 0.3192,
+ "step": 8936
+ },
+ {
+ "epoch": 4.225531914893617,
+ "grad_norm": 3.1722211837768555,
+ "learning_rate": 1.0184508137231498e-06,
+ "loss": 0.3075,
+ "step": 8937
+ },
+ {
+ "epoch": 4.226004728132388,
+ "grad_norm": 3.0679538249969482,
+ "learning_rate": 1.0179483781933903e-06,
+ "loss": 0.3475,
+ "step": 8938
+ },
+ {
+ "epoch": 4.226477541371159,
+ "grad_norm": 3.079246759414673,
+ "learning_rate": 1.0174460349444857e-06,
+ "loss": 0.4054,
+ "step": 8939
+ },
+ {
+ "epoch": 4.226950354609929,
+ "grad_norm": 3.308229684829712,
+ "learning_rate": 1.0169437840077169e-06,
+ "loss": 0.3455,
+ "step": 8940
+ },
+ {
+ "epoch": 4.2274231678487,
+ "grad_norm": 3.363147258758545,
+ "learning_rate": 1.0164416254143552e-06,
+ "loss": 0.3538,
+ "step": 8941
+ },
+ {
+ "epoch": 4.22789598108747,
+ "grad_norm": 2.7227768898010254,
+ "learning_rate": 1.0159395591956677e-06,
+ "loss": 0.3206,
+ "step": 8942
+ },
+ {
+ "epoch": 4.228368794326241,
+ "grad_norm": 3.0010764598846436,
+ "learning_rate": 1.0154375853829175e-06,
+ "loss": 0.3593,
+ "step": 8943
+ },
+ {
+ "epoch": 4.228841607565012,
+ "grad_norm": 3.0478785037994385,
+ "learning_rate": 1.0149357040073581e-06,
+ "loss": 0.3808,
+ "step": 8944
+ },
+ {
+ "epoch": 4.2293144208037825,
+ "grad_norm": 2.804421901702881,
+ "learning_rate": 1.0144339151002416e-06,
+ "loss": 0.3633,
+ "step": 8945
+ },
+ {
+ "epoch": 4.229787234042553,
+ "grad_norm": 3.563140630722046,
+ "learning_rate": 1.013932218692811e-06,
+ "loss": 0.2843,
+ "step": 8946
+ },
+ {
+ "epoch": 4.230260047281324,
+ "grad_norm": 3.1959750652313232,
+ "learning_rate": 1.0134306148163051e-06,
+ "loss": 0.377,
+ "step": 8947
+ },
+ {
+ "epoch": 4.230732860520095,
+ "grad_norm": 3.0841214656829834,
+ "learning_rate": 1.0129291035019565e-06,
+ "loss": 0.3234,
+ "step": 8948
+ },
+ {
+ "epoch": 4.231205673758865,
+ "grad_norm": 3.3893179893493652,
+ "learning_rate": 1.0124276847809911e-06,
+ "loss": 0.3823,
+ "step": 8949
+ },
+ {
+ "epoch": 4.231678486997636,
+ "grad_norm": 3.2250518798828125,
+ "learning_rate": 1.0119263586846316e-06,
+ "loss": 0.35,
+ "step": 8950
+ },
+ {
+ "epoch": 4.232151300236406,
+ "grad_norm": 3.287285566329956,
+ "learning_rate": 1.0114251252440928e-06,
+ "loss": 0.3306,
+ "step": 8951
+ },
+ {
+ "epoch": 4.232624113475177,
+ "grad_norm": 3.5018274784088135,
+ "learning_rate": 1.0109239844905836e-06,
+ "loss": 0.378,
+ "step": 8952
+ },
+ {
+ "epoch": 4.233096926713948,
+ "grad_norm": 3.224838972091675,
+ "learning_rate": 1.0104229364553093e-06,
+ "loss": 0.3379,
+ "step": 8953
+ },
+ {
+ "epoch": 4.233569739952719,
+ "grad_norm": 3.2302494049072266,
+ "learning_rate": 1.0099219811694668e-06,
+ "loss": 0.358,
+ "step": 8954
+ },
+ {
+ "epoch": 4.23404255319149,
+ "grad_norm": 3.098205804824829,
+ "learning_rate": 1.0094211186642483e-06,
+ "loss": 0.3669,
+ "step": 8955
+ },
+ {
+ "epoch": 4.23451536643026,
+ "grad_norm": 3.0045907497406006,
+ "learning_rate": 1.0089203489708415e-06,
+ "loss": 0.3293,
+ "step": 8956
+ },
+ {
+ "epoch": 4.234988179669031,
+ "grad_norm": 3.245818853378296,
+ "learning_rate": 1.0084196721204254e-06,
+ "loss": 0.3365,
+ "step": 8957
+ },
+ {
+ "epoch": 4.235460992907801,
+ "grad_norm": 2.8547208309173584,
+ "learning_rate": 1.007919088144177e-06,
+ "loss": 0.3072,
+ "step": 8958
+ },
+ {
+ "epoch": 4.235933806146572,
+ "grad_norm": 3.2914109230041504,
+ "learning_rate": 1.0074185970732642e-06,
+ "loss": 0.3734,
+ "step": 8959
+ },
+ {
+ "epoch": 4.236406619385343,
+ "grad_norm": 2.527096748352051,
+ "learning_rate": 1.0069181989388496e-06,
+ "loss": 0.3091,
+ "step": 8960
+ },
+ {
+ "epoch": 4.2368794326241135,
+ "grad_norm": 2.921369791030884,
+ "learning_rate": 1.006417893772093e-06,
+ "loss": 0.3162,
+ "step": 8961
+ },
+ {
+ "epoch": 4.237352245862884,
+ "grad_norm": 2.8698911666870117,
+ "learning_rate": 1.005917681604145e-06,
+ "loss": 0.3818,
+ "step": 8962
+ },
+ {
+ "epoch": 4.237825059101655,
+ "grad_norm": 2.958021402359009,
+ "learning_rate": 1.0054175624661514e-06,
+ "loss": 0.2934,
+ "step": 8963
+ },
+ {
+ "epoch": 4.238297872340426,
+ "grad_norm": 3.0945863723754883,
+ "learning_rate": 1.0049175363892527e-06,
+ "loss": 0.384,
+ "step": 8964
+ },
+ {
+ "epoch": 4.238770685579196,
+ "grad_norm": 2.890333890914917,
+ "learning_rate": 1.0044176034045822e-06,
+ "loss": 0.3312,
+ "step": 8965
+ },
+ {
+ "epoch": 4.239243498817967,
+ "grad_norm": 3.460975408554077,
+ "learning_rate": 1.0039177635432706e-06,
+ "loss": 0.4015,
+ "step": 8966
+ },
+ {
+ "epoch": 4.239716312056737,
+ "grad_norm": 3.5411946773529053,
+ "learning_rate": 1.003418016836439e-06,
+ "loss": 0.3307,
+ "step": 8967
+ },
+ {
+ "epoch": 4.240189125295508,
+ "grad_norm": 3.3310446739196777,
+ "learning_rate": 1.0029183633152061e-06,
+ "loss": 0.3154,
+ "step": 8968
+ },
+ {
+ "epoch": 4.240661938534279,
+ "grad_norm": 3.121110677719116,
+ "learning_rate": 1.0024188030106822e-06,
+ "loss": 0.3827,
+ "step": 8969
+ },
+ {
+ "epoch": 4.24113475177305,
+ "grad_norm": 3.421278715133667,
+ "learning_rate": 1.0019193359539717e-06,
+ "loss": 0.3677,
+ "step": 8970
+ },
+ {
+ "epoch": 4.241607565011821,
+ "grad_norm": 3.1790332794189453,
+ "learning_rate": 1.0014199621761761e-06,
+ "loss": 0.3219,
+ "step": 8971
+ },
+ {
+ "epoch": 4.242080378250591,
+ "grad_norm": 3.238412380218506,
+ "learning_rate": 1.0009206817083878e-06,
+ "loss": 0.3296,
+ "step": 8972
+ },
+ {
+ "epoch": 4.242553191489361,
+ "grad_norm": 3.0206923484802246,
+ "learning_rate": 1.0004214945816959e-06,
+ "loss": 0.3769,
+ "step": 8973
+ },
+ {
+ "epoch": 4.243026004728132,
+ "grad_norm": 3.2117667198181152,
+ "learning_rate": 9.999224008271822e-07,
+ "loss": 0.3876,
+ "step": 8974
+ },
+ {
+ "epoch": 4.243498817966903,
+ "grad_norm": 2.849250316619873,
+ "learning_rate": 9.99423400475922e-07,
+ "loss": 0.3267,
+ "step": 8975
+ },
+ {
+ "epoch": 4.243971631205674,
+ "grad_norm": 3.084845542907715,
+ "learning_rate": 9.989244935589878e-07,
+ "loss": 0.3074,
+ "step": 8976
+ },
+ {
+ "epoch": 4.2444444444444445,
+ "grad_norm": 3.0177342891693115,
+ "learning_rate": 9.984256801074434e-07,
+ "loss": 0.3524,
+ "step": 8977
+ },
+ {
+ "epoch": 4.244917257683215,
+ "grad_norm": 3.196692943572998,
+ "learning_rate": 9.979269601523477e-07,
+ "loss": 0.3943,
+ "step": 8978
+ },
+ {
+ "epoch": 4.245390070921986,
+ "grad_norm": 2.849760055541992,
+ "learning_rate": 9.97428333724753e-07,
+ "loss": 0.3145,
+ "step": 8979
+ },
+ {
+ "epoch": 4.245862884160757,
+ "grad_norm": 3.003265857696533,
+ "learning_rate": 9.969298008557083e-07,
+ "loss": 0.3393,
+ "step": 8980
+ },
+ {
+ "epoch": 4.246335697399527,
+ "grad_norm": 2.925597667694092,
+ "learning_rate": 9.96431361576254e-07,
+ "loss": 0.331,
+ "step": 8981
+ },
+ {
+ "epoch": 4.246808510638298,
+ "grad_norm": 2.87599515914917,
+ "learning_rate": 9.959330159174257e-07,
+ "loss": 0.3102,
+ "step": 8982
+ },
+ {
+ "epoch": 4.247281323877068,
+ "grad_norm": 2.841588020324707,
+ "learning_rate": 9.954347639102528e-07,
+ "loss": 0.3261,
+ "step": 8983
+ },
+ {
+ "epoch": 4.247754137115839,
+ "grad_norm": 3.14918851852417,
+ "learning_rate": 9.949366055857605e-07,
+ "loss": 0.3366,
+ "step": 8984
+ },
+ {
+ "epoch": 4.24822695035461,
+ "grad_norm": 3.113927125930786,
+ "learning_rate": 9.944385409749654e-07,
+ "loss": 0.3532,
+ "step": 8985
+ },
+ {
+ "epoch": 4.248699763593381,
+ "grad_norm": 3.0749151706695557,
+ "learning_rate": 9.939405701088818e-07,
+ "loss": 0.3659,
+ "step": 8986
+ },
+ {
+ "epoch": 4.2491725768321515,
+ "grad_norm": 2.831846237182617,
+ "learning_rate": 9.934426930185145e-07,
+ "loss": 0.2965,
+ "step": 8987
+ },
+ {
+ "epoch": 4.249645390070922,
+ "grad_norm": 3.0280253887176514,
+ "learning_rate": 9.929449097348642e-07,
+ "loss": 0.3144,
+ "step": 8988
+ },
+ {
+ "epoch": 4.250118203309692,
+ "grad_norm": 3.250284433364868,
+ "learning_rate": 9.924472202889267e-07,
+ "loss": 0.3414,
+ "step": 8989
+ },
+ {
+ "epoch": 4.250591016548463,
+ "grad_norm": 3.582306146621704,
+ "learning_rate": 9.9194962471169e-07,
+ "loss": 0.3593,
+ "step": 8990
+ },
+ {
+ "epoch": 4.251063829787234,
+ "grad_norm": 2.8985490798950195,
+ "learning_rate": 9.914521230341382e-07,
+ "loss": 0.2948,
+ "step": 8991
+ },
+ {
+ "epoch": 4.251536643026005,
+ "grad_norm": 3.399209499359131,
+ "learning_rate": 9.909547152872476e-07,
+ "loss": 0.3942,
+ "step": 8992
+ },
+ {
+ "epoch": 4.2520094562647754,
+ "grad_norm": 3.344658613204956,
+ "learning_rate": 9.904574015019895e-07,
+ "loss": 0.3649,
+ "step": 8993
+ },
+ {
+ "epoch": 4.252482269503546,
+ "grad_norm": 3.057995319366455,
+ "learning_rate": 9.899601817093305e-07,
+ "loss": 0.317,
+ "step": 8994
+ },
+ {
+ "epoch": 4.252955082742317,
+ "grad_norm": 3.4610090255737305,
+ "learning_rate": 9.894630559402296e-07,
+ "loss": 0.3235,
+ "step": 8995
+ },
+ {
+ "epoch": 4.253427895981088,
+ "grad_norm": 3.6014657020568848,
+ "learning_rate": 9.889660242256407e-07,
+ "loss": 0.3057,
+ "step": 8996
+ },
+ {
+ "epoch": 4.253900709219858,
+ "grad_norm": 2.850391149520874,
+ "learning_rate": 9.884690865965118e-07,
+ "loss": 0.3584,
+ "step": 8997
+ },
+ {
+ "epoch": 4.2543735224586285,
+ "grad_norm": 3.100820541381836,
+ "learning_rate": 9.879722430837844e-07,
+ "loss": 0.3802,
+ "step": 8998
+ },
+ {
+ "epoch": 4.254846335697399,
+ "grad_norm": 3.1044704914093018,
+ "learning_rate": 9.874754937183962e-07,
+ "loss": 0.3293,
+ "step": 8999
+ },
+ {
+ "epoch": 4.25531914893617,
+ "grad_norm": 2.750356912612915,
+ "learning_rate": 9.869788385312764e-07,
+ "loss": 0.3218,
+ "step": 9000
+ },
+ {
+ "epoch": 4.255791962174941,
+ "grad_norm": 3.1008687019348145,
+ "learning_rate": 9.864822775533494e-07,
+ "loss": 0.3316,
+ "step": 9001
+ },
+ {
+ "epoch": 4.256264775413712,
+ "grad_norm": 3.2051985263824463,
+ "learning_rate": 9.859858108155351e-07,
+ "loss": 0.3661,
+ "step": 9002
+ },
+ {
+ "epoch": 4.2567375886524825,
+ "grad_norm": 3.1303839683532715,
+ "learning_rate": 9.854894383487448e-07,
+ "loss": 0.3683,
+ "step": 9003
+ },
+ {
+ "epoch": 4.257210401891253,
+ "grad_norm": 3.0718302726745605,
+ "learning_rate": 9.84993160183887e-07,
+ "loss": 0.3284,
+ "step": 9004
+ },
+ {
+ "epoch": 4.257683215130023,
+ "grad_norm": 2.9759013652801514,
+ "learning_rate": 9.844969763518625e-07,
+ "loss": 0.3465,
+ "step": 9005
+ },
+ {
+ "epoch": 4.258156028368794,
+ "grad_norm": 3.1965582370758057,
+ "learning_rate": 9.840008868835647e-07,
+ "loss": 0.3593,
+ "step": 9006
+ },
+ {
+ "epoch": 4.258628841607565,
+ "grad_norm": 3.2931249141693115,
+ "learning_rate": 9.835048918098853e-07,
+ "loss": 0.3631,
+ "step": 9007
+ },
+ {
+ "epoch": 4.259101654846336,
+ "grad_norm": 3.070627450942993,
+ "learning_rate": 9.830089911617054e-07,
+ "loss": 0.3541,
+ "step": 9008
+ },
+ {
+ "epoch": 4.259574468085106,
+ "grad_norm": 3.209110736846924,
+ "learning_rate": 9.825131849699051e-07,
+ "loss": 0.344,
+ "step": 9009
+ },
+ {
+ "epoch": 4.260047281323877,
+ "grad_norm": 3.2239089012145996,
+ "learning_rate": 9.820174732653545e-07,
+ "loss": 0.3469,
+ "step": 9010
+ },
+ {
+ "epoch": 4.260520094562648,
+ "grad_norm": 3.0812292098999023,
+ "learning_rate": 9.815218560789199e-07,
+ "loss": 0.2898,
+ "step": 9011
+ },
+ {
+ "epoch": 4.260992907801419,
+ "grad_norm": 3.1709752082824707,
+ "learning_rate": 9.81026333441461e-07,
+ "loss": 0.381,
+ "step": 9012
+ },
+ {
+ "epoch": 4.261465721040189,
+ "grad_norm": 3.1551907062530518,
+ "learning_rate": 9.805309053838308e-07,
+ "loss": 0.2959,
+ "step": 9013
+ },
+ {
+ "epoch": 4.2619385342789595,
+ "grad_norm": 3.3751494884490967,
+ "learning_rate": 9.800355719368793e-07,
+ "loss": 0.3806,
+ "step": 9014
+ },
+ {
+ "epoch": 4.26241134751773,
+ "grad_norm": 3.2392799854278564,
+ "learning_rate": 9.795403331314479e-07,
+ "loss": 0.3006,
+ "step": 9015
+ },
+ {
+ "epoch": 4.262884160756501,
+ "grad_norm": 3.1428463459014893,
+ "learning_rate": 9.790451889983724e-07,
+ "loss": 0.3212,
+ "step": 9016
+ },
+ {
+ "epoch": 4.263356973995272,
+ "grad_norm": 3.353379726409912,
+ "learning_rate": 9.785501395684844e-07,
+ "loss": 0.3555,
+ "step": 9017
+ },
+ {
+ "epoch": 4.263829787234043,
+ "grad_norm": 3.3555281162261963,
+ "learning_rate": 9.780551848726068e-07,
+ "loss": 0.3729,
+ "step": 9018
+ },
+ {
+ "epoch": 4.2643026004728135,
+ "grad_norm": 3.0275049209594727,
+ "learning_rate": 9.775603249415606e-07,
+ "loss": 0.3579,
+ "step": 9019
+ },
+ {
+ "epoch": 4.264775413711584,
+ "grad_norm": 3.2631473541259766,
+ "learning_rate": 9.770655598061569e-07,
+ "loss": 0.3755,
+ "step": 9020
+ },
+ {
+ "epoch": 4.265248226950354,
+ "grad_norm": 2.9419705867767334,
+ "learning_rate": 9.76570889497202e-07,
+ "loss": 0.28,
+ "step": 9021
+ },
+ {
+ "epoch": 4.265721040189125,
+ "grad_norm": 2.931673288345337,
+ "learning_rate": 9.76076314045499e-07,
+ "loss": 0.3497,
+ "step": 9022
+ },
+ {
+ "epoch": 4.266193853427896,
+ "grad_norm": 3.218503952026367,
+ "learning_rate": 9.755818334818416e-07,
+ "loss": 0.3775,
+ "step": 9023
+ },
+ {
+ "epoch": 4.266666666666667,
+ "grad_norm": 3.1422977447509766,
+ "learning_rate": 9.750874478370181e-07,
+ "loss": 0.3957,
+ "step": 9024
+ },
+ {
+ "epoch": 4.267139479905437,
+ "grad_norm": 3.066502571105957,
+ "learning_rate": 9.745931571418134e-07,
+ "loss": 0.3361,
+ "step": 9025
+ },
+ {
+ "epoch": 4.267612293144208,
+ "grad_norm": 3.186897039413452,
+ "learning_rate": 9.740989614270044e-07,
+ "loss": 0.3794,
+ "step": 9026
+ },
+ {
+ "epoch": 4.268085106382979,
+ "grad_norm": 3.2698588371276855,
+ "learning_rate": 9.736048607233623e-07,
+ "loss": 0.3595,
+ "step": 9027
+ },
+ {
+ "epoch": 4.26855791962175,
+ "grad_norm": 2.9609718322753906,
+ "learning_rate": 9.731108550616523e-07,
+ "loss": 0.3387,
+ "step": 9028
+ },
+ {
+ "epoch": 4.26903073286052,
+ "grad_norm": 3.10768985748291,
+ "learning_rate": 9.72616944472633e-07,
+ "loss": 0.387,
+ "step": 9029
+ },
+ {
+ "epoch": 4.2695035460992905,
+ "grad_norm": 2.8060896396636963,
+ "learning_rate": 9.721231289870602e-07,
+ "loss": 0.3132,
+ "step": 9030
+ },
+ {
+ "epoch": 4.269976359338061,
+ "grad_norm": 3.0502681732177734,
+ "learning_rate": 9.716294086356801e-07,
+ "loss": 0.3246,
+ "step": 9031
+ },
+ {
+ "epoch": 4.270449172576832,
+ "grad_norm": 2.8298611640930176,
+ "learning_rate": 9.711357834492356e-07,
+ "loss": 0.2958,
+ "step": 9032
+ },
+ {
+ "epoch": 4.270921985815603,
+ "grad_norm": 2.693819761276245,
+ "learning_rate": 9.70642253458462e-07,
+ "loss": 0.325,
+ "step": 9033
+ },
+ {
+ "epoch": 4.271394799054374,
+ "grad_norm": 2.8179452419281006,
+ "learning_rate": 9.701488186940885e-07,
+ "loss": 0.3252,
+ "step": 9034
+ },
+ {
+ "epoch": 4.2718676122931445,
+ "grad_norm": 2.9885077476501465,
+ "learning_rate": 9.696554791868406e-07,
+ "loss": 0.3234,
+ "step": 9035
+ },
+ {
+ "epoch": 4.272340425531915,
+ "grad_norm": 4.8119378089904785,
+ "learning_rate": 9.691622349674349e-07,
+ "loss": 0.3814,
+ "step": 9036
+ },
+ {
+ "epoch": 4.272813238770685,
+ "grad_norm": 3.971498966217041,
+ "learning_rate": 9.68669086066585e-07,
+ "loss": 0.3684,
+ "step": 9037
+ },
+ {
+ "epoch": 4.273286052009456,
+ "grad_norm": 3.0153439044952393,
+ "learning_rate": 9.681760325149967e-07,
+ "loss": 0.3449,
+ "step": 9038
+ },
+ {
+ "epoch": 4.273758865248227,
+ "grad_norm": 3.4421799182891846,
+ "learning_rate": 9.676830743433688e-07,
+ "loss": 0.3578,
+ "step": 9039
+ },
+ {
+ "epoch": 4.274231678486998,
+ "grad_norm": 3.2896533012390137,
+ "learning_rate": 9.67190211582398e-07,
+ "loss": 0.3421,
+ "step": 9040
+ },
+ {
+ "epoch": 4.274704491725768,
+ "grad_norm": 3.388833522796631,
+ "learning_rate": 9.666974442627717e-07,
+ "loss": 0.4089,
+ "step": 9041
+ },
+ {
+ "epoch": 4.275177304964539,
+ "grad_norm": 3.1000685691833496,
+ "learning_rate": 9.662047724151718e-07,
+ "loss": 0.4046,
+ "step": 9042
+ },
+ {
+ "epoch": 4.27565011820331,
+ "grad_norm": 3.5651235580444336,
+ "learning_rate": 9.657121960702753e-07,
+ "loss": 0.4275,
+ "step": 9043
+ },
+ {
+ "epoch": 4.276122931442081,
+ "grad_norm": 2.944434881210327,
+ "learning_rate": 9.65219715258752e-07,
+ "loss": 0.3395,
+ "step": 9044
+ },
+ {
+ "epoch": 4.276595744680851,
+ "grad_norm": 2.7315311431884766,
+ "learning_rate": 9.64727330011268e-07,
+ "loss": 0.3305,
+ "step": 9045
+ },
+ {
+ "epoch": 4.2770685579196215,
+ "grad_norm": 3.423567533493042,
+ "learning_rate": 9.642350403584805e-07,
+ "loss": 0.3605,
+ "step": 9046
+ },
+ {
+ "epoch": 4.277541371158392,
+ "grad_norm": 3.239745616912842,
+ "learning_rate": 9.637428463310435e-07,
+ "loss": 0.3519,
+ "step": 9047
+ },
+ {
+ "epoch": 4.278014184397163,
+ "grad_norm": 3.388700008392334,
+ "learning_rate": 9.632507479596035e-07,
+ "loss": 0.359,
+ "step": 9048
+ },
+ {
+ "epoch": 4.278486997635934,
+ "grad_norm": 3.3524253368377686,
+ "learning_rate": 9.627587452747996e-07,
+ "loss": 0.3381,
+ "step": 9049
+ },
+ {
+ "epoch": 4.278959810874705,
+ "grad_norm": 3.1089365482330322,
+ "learning_rate": 9.622668383072695e-07,
+ "loss": 0.3143,
+ "step": 9050
+ },
+ {
+ "epoch": 4.2794326241134755,
+ "grad_norm": 3.3477213382720947,
+ "learning_rate": 9.617750270876402e-07,
+ "loss": 0.3788,
+ "step": 9051
+ },
+ {
+ "epoch": 4.279905437352246,
+ "grad_norm": 2.934818983078003,
+ "learning_rate": 9.612833116465342e-07,
+ "loss": 0.3589,
+ "step": 9052
+ },
+ {
+ "epoch": 4.280378250591016,
+ "grad_norm": 3.125014305114746,
+ "learning_rate": 9.607916920145704e-07,
+ "loss": 0.3181,
+ "step": 9053
+ },
+ {
+ "epoch": 4.280851063829787,
+ "grad_norm": 3.5860400199890137,
+ "learning_rate": 9.60300168222358e-07,
+ "loss": 0.3597,
+ "step": 9054
+ },
+ {
+ "epoch": 4.281323877068558,
+ "grad_norm": 3.1414008140563965,
+ "learning_rate": 9.598087403005032e-07,
+ "loss": 0.3186,
+ "step": 9055
+ },
+ {
+ "epoch": 4.281796690307329,
+ "grad_norm": 2.841228723526001,
+ "learning_rate": 9.593174082796046e-07,
+ "loss": 0.3547,
+ "step": 9056
+ },
+ {
+ "epoch": 4.282269503546099,
+ "grad_norm": 3.1145405769348145,
+ "learning_rate": 9.588261721902547e-07,
+ "loss": 0.3317,
+ "step": 9057
+ },
+ {
+ "epoch": 4.28274231678487,
+ "grad_norm": 2.9518024921417236,
+ "learning_rate": 9.58335032063042e-07,
+ "loss": 0.3723,
+ "step": 9058
+ },
+ {
+ "epoch": 4.283215130023641,
+ "grad_norm": 2.887479782104492,
+ "learning_rate": 9.578439879285467e-07,
+ "loss": 0.3288,
+ "step": 9059
+ },
+ {
+ "epoch": 4.283687943262412,
+ "grad_norm": 3.253427267074585,
+ "learning_rate": 9.573530398173444e-07,
+ "loss": 0.326,
+ "step": 9060
+ },
+ {
+ "epoch": 4.284160756501182,
+ "grad_norm": 3.0442020893096924,
+ "learning_rate": 9.568621877600038e-07,
+ "loss": 0.3807,
+ "step": 9061
+ },
+ {
+ "epoch": 4.2846335697399525,
+ "grad_norm": 2.928743600845337,
+ "learning_rate": 9.563714317870877e-07,
+ "loss": 0.2977,
+ "step": 9062
+ },
+ {
+ "epoch": 4.285106382978723,
+ "grad_norm": 3.2095022201538086,
+ "learning_rate": 9.558807719291543e-07,
+ "loss": 0.3571,
+ "step": 9063
+ },
+ {
+ "epoch": 4.285579196217494,
+ "grad_norm": 3.3752429485321045,
+ "learning_rate": 9.55390208216754e-07,
+ "loss": 0.3928,
+ "step": 9064
+ },
+ {
+ "epoch": 4.286052009456265,
+ "grad_norm": 3.125702381134033,
+ "learning_rate": 9.548997406804333e-07,
+ "loss": 0.3334,
+ "step": 9065
+ },
+ {
+ "epoch": 4.286524822695036,
+ "grad_norm": 3.058772563934326,
+ "learning_rate": 9.544093693507308e-07,
+ "loss": 0.3184,
+ "step": 9066
+ },
+ {
+ "epoch": 4.2869976359338064,
+ "grad_norm": 3.1085948944091797,
+ "learning_rate": 9.539190942581785e-07,
+ "loss": 0.3626,
+ "step": 9067
+ },
+ {
+ "epoch": 4.287470449172577,
+ "grad_norm": 2.8497378826141357,
+ "learning_rate": 9.53428915433306e-07,
+ "loss": 0.3451,
+ "step": 9068
+ },
+ {
+ "epoch": 4.287943262411347,
+ "grad_norm": 3.411508798599243,
+ "learning_rate": 9.529388329066325e-07,
+ "loss": 0.3608,
+ "step": 9069
+ },
+ {
+ "epoch": 4.288416075650118,
+ "grad_norm": 3.1312575340270996,
+ "learning_rate": 9.524488467086751e-07,
+ "loss": 0.3419,
+ "step": 9070
+ },
+ {
+ "epoch": 4.288888888888889,
+ "grad_norm": 3.1531126499176025,
+ "learning_rate": 9.519589568699419e-07,
+ "loss": 0.3261,
+ "step": 9071
+ },
+ {
+ "epoch": 4.2893617021276595,
+ "grad_norm": 2.8852546215057373,
+ "learning_rate": 9.514691634209361e-07,
+ "loss": 0.3258,
+ "step": 9072
+ },
+ {
+ "epoch": 4.28983451536643,
+ "grad_norm": 3.0486297607421875,
+ "learning_rate": 9.50979466392156e-07,
+ "loss": 0.3207,
+ "step": 9073
+ },
+ {
+ "epoch": 4.290307328605201,
+ "grad_norm": 3.017788887023926,
+ "learning_rate": 9.504898658140924e-07,
+ "loss": 0.3288,
+ "step": 9074
+ },
+ {
+ "epoch": 4.290780141843972,
+ "grad_norm": 3.24040150642395,
+ "learning_rate": 9.500003617172302e-07,
+ "loss": 0.2847,
+ "step": 9075
+ },
+ {
+ "epoch": 4.291252955082742,
+ "grad_norm": 3.6793692111968994,
+ "learning_rate": 9.49510954132049e-07,
+ "loss": 0.425,
+ "step": 9076
+ },
+ {
+ "epoch": 4.291725768321513,
+ "grad_norm": 2.7292215824127197,
+ "learning_rate": 9.490216430890215e-07,
+ "loss": 0.3208,
+ "step": 9077
+ },
+ {
+ "epoch": 4.292198581560283,
+ "grad_norm": 2.650388479232788,
+ "learning_rate": 9.485324286186159e-07,
+ "loss": 0.2842,
+ "step": 9078
+ },
+ {
+ "epoch": 4.292671394799054,
+ "grad_norm": 3.1459171772003174,
+ "learning_rate": 9.480433107512932e-07,
+ "loss": 0.3287,
+ "step": 9079
+ },
+ {
+ "epoch": 4.293144208037825,
+ "grad_norm": 3.1777186393737793,
+ "learning_rate": 9.475542895175074e-07,
+ "loss": 0.3385,
+ "step": 9080
+ },
+ {
+ "epoch": 4.293617021276596,
+ "grad_norm": 3.5608465671539307,
+ "learning_rate": 9.470653649477096e-07,
+ "loss": 0.3574,
+ "step": 9081
+ },
+ {
+ "epoch": 4.294089834515367,
+ "grad_norm": 2.58306884765625,
+ "learning_rate": 9.465765370723415e-07,
+ "loss": 0.3156,
+ "step": 9082
+ },
+ {
+ "epoch": 4.294562647754137,
+ "grad_norm": 3.3265857696533203,
+ "learning_rate": 9.460878059218415e-07,
+ "loss": 0.3678,
+ "step": 9083
+ },
+ {
+ "epoch": 4.295035460992908,
+ "grad_norm": 3.259326696395874,
+ "learning_rate": 9.455991715266403e-07,
+ "loss": 0.3675,
+ "step": 9084
+ },
+ {
+ "epoch": 4.295508274231678,
+ "grad_norm": 3.430608034133911,
+ "learning_rate": 9.451106339171618e-07,
+ "loss": 0.3147,
+ "step": 9085
+ },
+ {
+ "epoch": 4.295981087470449,
+ "grad_norm": 3.2896342277526855,
+ "learning_rate": 9.44622193123827e-07,
+ "loss": 0.3482,
+ "step": 9086
+ },
+ {
+ "epoch": 4.29645390070922,
+ "grad_norm": 2.9680557250976562,
+ "learning_rate": 9.441338491770474e-07,
+ "loss": 0.3504,
+ "step": 9087
+ },
+ {
+ "epoch": 4.2969267139479905,
+ "grad_norm": 2.9656941890716553,
+ "learning_rate": 9.436456021072313e-07,
+ "loss": 0.3782,
+ "step": 9088
+ },
+ {
+ "epoch": 4.297399527186761,
+ "grad_norm": 3.463456630706787,
+ "learning_rate": 9.431574519447794e-07,
+ "loss": 0.3517,
+ "step": 9089
+ },
+ {
+ "epoch": 4.297872340425532,
+ "grad_norm": 3.3658525943756104,
+ "learning_rate": 9.426693987200864e-07,
+ "loss": 0.3535,
+ "step": 9090
+ },
+ {
+ "epoch": 4.298345153664303,
+ "grad_norm": 3.087533712387085,
+ "learning_rate": 9.421814424635414e-07,
+ "loss": 0.3007,
+ "step": 9091
+ },
+ {
+ "epoch": 4.298817966903073,
+ "grad_norm": 3.4596481323242188,
+ "learning_rate": 9.41693583205526e-07,
+ "loss": 0.3797,
+ "step": 9092
+ },
+ {
+ "epoch": 4.299290780141844,
+ "grad_norm": 3.647507667541504,
+ "learning_rate": 9.412058209764191e-07,
+ "loss": 0.3803,
+ "step": 9093
+ },
+ {
+ "epoch": 4.299763593380614,
+ "grad_norm": 2.9130196571350098,
+ "learning_rate": 9.407181558065909e-07,
+ "loss": 0.32,
+ "step": 9094
+ },
+ {
+ "epoch": 4.300236406619385,
+ "grad_norm": 3.2562668323516846,
+ "learning_rate": 9.402305877264048e-07,
+ "loss": 0.4103,
+ "step": 9095
+ },
+ {
+ "epoch": 4.300709219858156,
+ "grad_norm": 3.1416616439819336,
+ "learning_rate": 9.397431167662216e-07,
+ "loss": 0.3498,
+ "step": 9096
+ },
+ {
+ "epoch": 4.301182033096927,
+ "grad_norm": 2.9540042877197266,
+ "learning_rate": 9.392557429563929e-07,
+ "loss": 0.3073,
+ "step": 9097
+ },
+ {
+ "epoch": 4.301654846335698,
+ "grad_norm": 3.0450825691223145,
+ "learning_rate": 9.387684663272645e-07,
+ "loss": 0.3295,
+ "step": 9098
+ },
+ {
+ "epoch": 4.302127659574468,
+ "grad_norm": 3.1060359477996826,
+ "learning_rate": 9.38281286909179e-07,
+ "loss": 0.3582,
+ "step": 9099
+ },
+ {
+ "epoch": 4.302600472813239,
+ "grad_norm": 2.901136636734009,
+ "learning_rate": 9.377942047324687e-07,
+ "loss": 0.3162,
+ "step": 9100
+ },
+ {
+ "epoch": 4.303073286052009,
+ "grad_norm": 3.5618929862976074,
+ "learning_rate": 9.373072198274641e-07,
+ "loss": 0.2917,
+ "step": 9101
+ },
+ {
+ "epoch": 4.30354609929078,
+ "grad_norm": 3.0853395462036133,
+ "learning_rate": 9.368203322244871e-07,
+ "loss": 0.3124,
+ "step": 9102
+ },
+ {
+ "epoch": 4.304018912529551,
+ "grad_norm": 3.00398588180542,
+ "learning_rate": 9.363335419538524e-07,
+ "loss": 0.3167,
+ "step": 9103
+ },
+ {
+ "epoch": 4.3044917257683215,
+ "grad_norm": 3.4705588817596436,
+ "learning_rate": 9.358468490458725e-07,
+ "loss": 0.3188,
+ "step": 9104
+ },
+ {
+ "epoch": 4.304964539007092,
+ "grad_norm": 2.948302745819092,
+ "learning_rate": 9.353602535308509e-07,
+ "loss": 0.2739,
+ "step": 9105
+ },
+ {
+ "epoch": 4.305437352245863,
+ "grad_norm": 3.4512269496917725,
+ "learning_rate": 9.348737554390852e-07,
+ "loss": 0.3256,
+ "step": 9106
+ },
+ {
+ "epoch": 4.305910165484634,
+ "grad_norm": 2.9979147911071777,
+ "learning_rate": 9.343873548008684e-07,
+ "loss": 0.3184,
+ "step": 9107
+ },
+ {
+ "epoch": 4.306382978723404,
+ "grad_norm": 3.1008479595184326,
+ "learning_rate": 9.339010516464847e-07,
+ "loss": 0.3251,
+ "step": 9108
+ },
+ {
+ "epoch": 4.306855791962175,
+ "grad_norm": 2.86930775642395,
+ "learning_rate": 9.334148460062165e-07,
+ "loss": 0.3322,
+ "step": 9109
+ },
+ {
+ "epoch": 4.307328605200945,
+ "grad_norm": 3.2068963050842285,
+ "learning_rate": 9.329287379103355e-07,
+ "loss": 0.3845,
+ "step": 9110
+ },
+ {
+ "epoch": 4.307801418439716,
+ "grad_norm": 3.567309856414795,
+ "learning_rate": 9.324427273891115e-07,
+ "loss": 0.4037,
+ "step": 9111
+ },
+ {
+ "epoch": 4.308274231678487,
+ "grad_norm": 3.2064783573150635,
+ "learning_rate": 9.319568144728056e-07,
+ "loss": 0.3481,
+ "step": 9112
+ },
+ {
+ "epoch": 4.308747044917258,
+ "grad_norm": 3.2492294311523438,
+ "learning_rate": 9.314709991916721e-07,
+ "loss": 0.3657,
+ "step": 9113
+ },
+ {
+ "epoch": 4.309219858156029,
+ "grad_norm": 2.990755081176758,
+ "learning_rate": 9.309852815759626e-07,
+ "loss": 0.3582,
+ "step": 9114
+ },
+ {
+ "epoch": 4.309692671394799,
+ "grad_norm": 3.3375513553619385,
+ "learning_rate": 9.304996616559187e-07,
+ "loss": 0.3657,
+ "step": 9115
+ },
+ {
+ "epoch": 4.31016548463357,
+ "grad_norm": 2.945552349090576,
+ "learning_rate": 9.300141394617798e-07,
+ "loss": 0.3075,
+ "step": 9116
+ },
+ {
+ "epoch": 4.31063829787234,
+ "grad_norm": 3.5318517684936523,
+ "learning_rate": 9.295287150237764e-07,
+ "loss": 0.39,
+ "step": 9117
+ },
+ {
+ "epoch": 4.311111111111111,
+ "grad_norm": 3.452049732208252,
+ "learning_rate": 9.290433883721326e-07,
+ "loss": 0.3821,
+ "step": 9118
+ },
+ {
+ "epoch": 4.311583924349882,
+ "grad_norm": 3.0762388706207275,
+ "learning_rate": 9.285581595370693e-07,
+ "loss": 0.316,
+ "step": 9119
+ },
+ {
+ "epoch": 4.3120567375886525,
+ "grad_norm": 4.13551664352417,
+ "learning_rate": 9.28073028548799e-07,
+ "loss": 0.3661,
+ "step": 9120
+ },
+ {
+ "epoch": 4.312529550827423,
+ "grad_norm": 3.1915719509124756,
+ "learning_rate": 9.275879954375286e-07,
+ "loss": 0.371,
+ "step": 9121
+ },
+ {
+ "epoch": 4.313002364066194,
+ "grad_norm": 3.118861198425293,
+ "learning_rate": 9.271030602334577e-07,
+ "loss": 0.3943,
+ "step": 9122
+ },
+ {
+ "epoch": 4.313475177304965,
+ "grad_norm": 3.042757987976074,
+ "learning_rate": 9.266182229667836e-07,
+ "loss": 0.3779,
+ "step": 9123
+ },
+ {
+ "epoch": 4.313947990543735,
+ "grad_norm": 2.949110746383667,
+ "learning_rate": 9.261334836676933e-07,
+ "loss": 0.3721,
+ "step": 9124
+ },
+ {
+ "epoch": 4.314420803782506,
+ "grad_norm": 2.982090950012207,
+ "learning_rate": 9.256488423663701e-07,
+ "loss": 0.2865,
+ "step": 9125
+ },
+ {
+ "epoch": 4.314893617021276,
+ "grad_norm": 3.6527535915374756,
+ "learning_rate": 9.25164299092989e-07,
+ "loss": 0.3689,
+ "step": 9126
+ },
+ {
+ "epoch": 4.315366430260047,
+ "grad_norm": 3.3310744762420654,
+ "learning_rate": 9.246798538777227e-07,
+ "loss": 0.3198,
+ "step": 9127
+ },
+ {
+ "epoch": 4.315839243498818,
+ "grad_norm": 2.8298583030700684,
+ "learning_rate": 9.241955067507332e-07,
+ "loss": 0.3711,
+ "step": 9128
+ },
+ {
+ "epoch": 4.316312056737589,
+ "grad_norm": 3.636894702911377,
+ "learning_rate": 9.237112577421809e-07,
+ "loss": 0.374,
+ "step": 9129
+ },
+ {
+ "epoch": 4.31678486997636,
+ "grad_norm": 2.896251678466797,
+ "learning_rate": 9.232271068822166e-07,
+ "loss": 0.3372,
+ "step": 9130
+ },
+ {
+ "epoch": 4.31725768321513,
+ "grad_norm": 3.2836971282958984,
+ "learning_rate": 9.227430542009854e-07,
+ "loss": 0.3584,
+ "step": 9131
+ },
+ {
+ "epoch": 4.317730496453901,
+ "grad_norm": 2.9452571868896484,
+ "learning_rate": 9.222590997286293e-07,
+ "loss": 0.3658,
+ "step": 9132
+ },
+ {
+ "epoch": 4.318203309692671,
+ "grad_norm": 2.88613224029541,
+ "learning_rate": 9.217752434952801e-07,
+ "loss": 0.3221,
+ "step": 9133
+ },
+ {
+ "epoch": 4.318676122931442,
+ "grad_norm": 2.7794570922851562,
+ "learning_rate": 9.212914855310667e-07,
+ "loss": 0.3142,
+ "step": 9134
+ },
+ {
+ "epoch": 4.319148936170213,
+ "grad_norm": 3.0195112228393555,
+ "learning_rate": 9.208078258661102e-07,
+ "loss": 0.3039,
+ "step": 9135
+ },
+ {
+ "epoch": 4.3196217494089835,
+ "grad_norm": 3.5178396701812744,
+ "learning_rate": 9.203242645305253e-07,
+ "loss": 0.3912,
+ "step": 9136
+ },
+ {
+ "epoch": 4.320094562647754,
+ "grad_norm": 3.145413875579834,
+ "learning_rate": 9.198408015544222e-07,
+ "loss": 0.3045,
+ "step": 9137
+ },
+ {
+ "epoch": 4.320567375886525,
+ "grad_norm": 3.151193380355835,
+ "learning_rate": 9.193574369679037e-07,
+ "loss": 0.341,
+ "step": 9138
+ },
+ {
+ "epoch": 4.321040189125296,
+ "grad_norm": 3.248255968093872,
+ "learning_rate": 9.188741708010668e-07,
+ "loss": 0.4344,
+ "step": 9139
+ },
+ {
+ "epoch": 4.321513002364066,
+ "grad_norm": 2.953218460083008,
+ "learning_rate": 9.183910030840021e-07,
+ "loss": 0.343,
+ "step": 9140
+ },
+ {
+ "epoch": 4.321985815602837,
+ "grad_norm": 3.1873161792755127,
+ "learning_rate": 9.179079338467936e-07,
+ "loss": 0.3082,
+ "step": 9141
+ },
+ {
+ "epoch": 4.322458628841607,
+ "grad_norm": 3.2587013244628906,
+ "learning_rate": 9.174249631195218e-07,
+ "loss": 0.3855,
+ "step": 9142
+ },
+ {
+ "epoch": 4.322931442080378,
+ "grad_norm": 2.956145763397217,
+ "learning_rate": 9.169420909322573e-07,
+ "loss": 0.3156,
+ "step": 9143
+ },
+ {
+ "epoch": 4.323404255319149,
+ "grad_norm": 3.1664650440216064,
+ "learning_rate": 9.164593173150683e-07,
+ "loss": 0.352,
+ "step": 9144
+ },
+ {
+ "epoch": 4.32387706855792,
+ "grad_norm": 3.2792744636535645,
+ "learning_rate": 9.159766422980138e-07,
+ "loss": 0.3963,
+ "step": 9145
+ },
+ {
+ "epoch": 4.3243498817966906,
+ "grad_norm": 3.1249687671661377,
+ "learning_rate": 9.154940659111472e-07,
+ "loss": 0.3405,
+ "step": 9146
+ },
+ {
+ "epoch": 4.324822695035461,
+ "grad_norm": 3.128340244293213,
+ "learning_rate": 9.150115881845181e-07,
+ "loss": 0.2733,
+ "step": 9147
+ },
+ {
+ "epoch": 4.325295508274232,
+ "grad_norm": 3.1790847778320312,
+ "learning_rate": 9.145292091481675e-07,
+ "loss": 0.3676,
+ "step": 9148
+ },
+ {
+ "epoch": 4.325768321513002,
+ "grad_norm": 2.874678134918213,
+ "learning_rate": 9.1404692883213e-07,
+ "loss": 0.2937,
+ "step": 9149
+ },
+ {
+ "epoch": 4.326241134751773,
+ "grad_norm": 3.102196216583252,
+ "learning_rate": 9.135647472664369e-07,
+ "loss": 0.2772,
+ "step": 9150
+ },
+ {
+ "epoch": 4.326713947990544,
+ "grad_norm": 3.0027546882629395,
+ "learning_rate": 9.130826644811099e-07,
+ "loss": 0.3171,
+ "step": 9151
+ },
+ {
+ "epoch": 4.3271867612293144,
+ "grad_norm": 2.750152587890625,
+ "learning_rate": 9.126006805061679e-07,
+ "loss": 0.3689,
+ "step": 9152
+ },
+ {
+ "epoch": 4.327659574468085,
+ "grad_norm": 3.251054525375366,
+ "learning_rate": 9.12118795371621e-07,
+ "loss": 0.3463,
+ "step": 9153
+ },
+ {
+ "epoch": 4.328132387706856,
+ "grad_norm": 2.8849353790283203,
+ "learning_rate": 9.116370091074738e-07,
+ "loss": 0.3,
+ "step": 9154
+ },
+ {
+ "epoch": 4.328605200945627,
+ "grad_norm": 3.4823720455169678,
+ "learning_rate": 9.111553217437255e-07,
+ "loss": 0.336,
+ "step": 9155
+ },
+ {
+ "epoch": 4.329078014184397,
+ "grad_norm": 2.8170886039733887,
+ "learning_rate": 9.106737333103677e-07,
+ "loss": 0.3237,
+ "step": 9156
+ },
+ {
+ "epoch": 4.3295508274231675,
+ "grad_norm": 3.095379114151001,
+ "learning_rate": 9.101922438373881e-07,
+ "loss": 0.3438,
+ "step": 9157
+ },
+ {
+ "epoch": 4.330023640661938,
+ "grad_norm": 3.1764986515045166,
+ "learning_rate": 9.097108533547667e-07,
+ "loss": 0.3174,
+ "step": 9158
+ },
+ {
+ "epoch": 4.330496453900709,
+ "grad_norm": 3.3972036838531494,
+ "learning_rate": 9.092295618924763e-07,
+ "loss": 0.3118,
+ "step": 9159
+ },
+ {
+ "epoch": 4.33096926713948,
+ "grad_norm": 3.112926959991455,
+ "learning_rate": 9.087483694804863e-07,
+ "loss": 0.3521,
+ "step": 9160
+ },
+ {
+ "epoch": 4.331442080378251,
+ "grad_norm": 3.395550012588501,
+ "learning_rate": 9.082672761487573e-07,
+ "loss": 0.3423,
+ "step": 9161
+ },
+ {
+ "epoch": 4.3319148936170215,
+ "grad_norm": 3.486910343170166,
+ "learning_rate": 9.077862819272465e-07,
+ "loss": 0.3655,
+ "step": 9162
+ },
+ {
+ "epoch": 4.332387706855792,
+ "grad_norm": 3.0986499786376953,
+ "learning_rate": 9.07305386845902e-07,
+ "loss": 0.2865,
+ "step": 9163
+ },
+ {
+ "epoch": 4.332860520094562,
+ "grad_norm": 2.962139844894409,
+ "learning_rate": 9.068245909346665e-07,
+ "loss": 0.315,
+ "step": 9164
+ },
+ {
+ "epoch": 4.333333333333333,
+ "grad_norm": 3.0887413024902344,
+ "learning_rate": 9.063438942234787e-07,
+ "loss": 0.3207,
+ "step": 9165
+ },
+ {
+ "epoch": 4.333806146572104,
+ "grad_norm": 2.909770965576172,
+ "learning_rate": 9.058632967422678e-07,
+ "loss": 0.3221,
+ "step": 9166
+ },
+ {
+ "epoch": 4.334278959810875,
+ "grad_norm": 3.1872079372406006,
+ "learning_rate": 9.053827985209604e-07,
+ "loss": 0.3856,
+ "step": 9167
+ },
+ {
+ "epoch": 4.334751773049645,
+ "grad_norm": 3.196985960006714,
+ "learning_rate": 9.049023995894738e-07,
+ "loss": 0.2994,
+ "step": 9168
+ },
+ {
+ "epoch": 4.335224586288416,
+ "grad_norm": 3.3150243759155273,
+ "learning_rate": 9.044220999777204e-07,
+ "loss": 0.3619,
+ "step": 9169
+ },
+ {
+ "epoch": 4.335697399527187,
+ "grad_norm": 3.1662707328796387,
+ "learning_rate": 9.039418997156066e-07,
+ "loss": 0.3347,
+ "step": 9170
+ },
+ {
+ "epoch": 4.336170212765958,
+ "grad_norm": 3.6789329051971436,
+ "learning_rate": 9.034617988330318e-07,
+ "loss": 0.4106,
+ "step": 9171
+ },
+ {
+ "epoch": 4.336643026004728,
+ "grad_norm": 3.5192553997039795,
+ "learning_rate": 9.029817973598898e-07,
+ "loss": 0.3561,
+ "step": 9172
+ },
+ {
+ "epoch": 4.3371158392434985,
+ "grad_norm": 3.4309239387512207,
+ "learning_rate": 9.025018953260692e-07,
+ "loss": 0.3739,
+ "step": 9173
+ },
+ {
+ "epoch": 4.337588652482269,
+ "grad_norm": 3.1840806007385254,
+ "learning_rate": 9.020220927614498e-07,
+ "loss": 0.3479,
+ "step": 9174
+ },
+ {
+ "epoch": 4.33806146572104,
+ "grad_norm": 3.6679139137268066,
+ "learning_rate": 9.015423896959088e-07,
+ "loss": 0.3739,
+ "step": 9175
+ },
+ {
+ "epoch": 4.338534278959811,
+ "grad_norm": 3.125296115875244,
+ "learning_rate": 9.010627861593143e-07,
+ "loss": 0.3046,
+ "step": 9176
+ },
+ {
+ "epoch": 4.339007092198582,
+ "grad_norm": 3.0710368156433105,
+ "learning_rate": 9.005832821815278e-07,
+ "loss": 0.322,
+ "step": 9177
+ },
+ {
+ "epoch": 4.3394799054373525,
+ "grad_norm": 3.068833351135254,
+ "learning_rate": 9.001038777924082e-07,
+ "loss": 0.3229,
+ "step": 9178
+ },
+ {
+ "epoch": 4.339952718676123,
+ "grad_norm": 3.2746002674102783,
+ "learning_rate": 8.996245730218037e-07,
+ "loss": 0.369,
+ "step": 9179
+ },
+ {
+ "epoch": 4.340425531914893,
+ "grad_norm": 3.1138477325439453,
+ "learning_rate": 8.991453678995607e-07,
+ "loss": 0.3456,
+ "step": 9180
+ },
+ {
+ "epoch": 4.340898345153664,
+ "grad_norm": 3.2195467948913574,
+ "learning_rate": 8.986662624555159e-07,
+ "loss": 0.377,
+ "step": 9181
+ },
+ {
+ "epoch": 4.341371158392435,
+ "grad_norm": 3.1197304725646973,
+ "learning_rate": 8.981872567195008e-07,
+ "loss": 0.3007,
+ "step": 9182
+ },
+ {
+ "epoch": 4.341843971631206,
+ "grad_norm": 3.295881748199463,
+ "learning_rate": 8.977083507213418e-07,
+ "loss": 0.4007,
+ "step": 9183
+ },
+ {
+ "epoch": 4.342316784869976,
+ "grad_norm": 3.8013954162597656,
+ "learning_rate": 8.972295444908582e-07,
+ "loss": 0.4322,
+ "step": 9184
+ },
+ {
+ "epoch": 4.342789598108747,
+ "grad_norm": 3.133434295654297,
+ "learning_rate": 8.967508380578633e-07,
+ "loss": 0.3379,
+ "step": 9185
+ },
+ {
+ "epoch": 4.343262411347518,
+ "grad_norm": 3.0942039489746094,
+ "learning_rate": 8.962722314521625e-07,
+ "loss": 0.3753,
+ "step": 9186
+ },
+ {
+ "epoch": 4.343735224586289,
+ "grad_norm": 2.8691020011901855,
+ "learning_rate": 8.957937247035583e-07,
+ "loss": 0.3003,
+ "step": 9187
+ },
+ {
+ "epoch": 4.344208037825059,
+ "grad_norm": 2.8353092670440674,
+ "learning_rate": 8.95315317841845e-07,
+ "loss": 0.3068,
+ "step": 9188
+ },
+ {
+ "epoch": 4.3446808510638295,
+ "grad_norm": 3.071207046508789,
+ "learning_rate": 8.948370108968097e-07,
+ "loss": 0.3147,
+ "step": 9189
+ },
+ {
+ "epoch": 4.3451536643026,
+ "grad_norm": 3.3605904579162598,
+ "learning_rate": 8.943588038982359e-07,
+ "loss": 0.3216,
+ "step": 9190
+ },
+ {
+ "epoch": 4.345626477541371,
+ "grad_norm": 3.0702717304229736,
+ "learning_rate": 8.93880696875899e-07,
+ "loss": 0.3507,
+ "step": 9191
+ },
+ {
+ "epoch": 4.346099290780142,
+ "grad_norm": 3.261456251144409,
+ "learning_rate": 8.934026898595675e-07,
+ "loss": 0.3677,
+ "step": 9192
+ },
+ {
+ "epoch": 4.346572104018913,
+ "grad_norm": 3.0827512741088867,
+ "learning_rate": 8.929247828790066e-07,
+ "loss": 0.3786,
+ "step": 9193
+ },
+ {
+ "epoch": 4.3470449172576835,
+ "grad_norm": 3.488949775695801,
+ "learning_rate": 8.924469759639728e-07,
+ "loss": 0.3685,
+ "step": 9194
+ },
+ {
+ "epoch": 4.347517730496454,
+ "grad_norm": 2.8565423488616943,
+ "learning_rate": 8.919692691442162e-07,
+ "loss": 0.3449,
+ "step": 9195
+ },
+ {
+ "epoch": 4.347990543735224,
+ "grad_norm": 4.654722213745117,
+ "learning_rate": 8.914916624494829e-07,
+ "loss": 0.344,
+ "step": 9196
+ },
+ {
+ "epoch": 4.348463356973995,
+ "grad_norm": 3.256714344024658,
+ "learning_rate": 8.910141559095098e-07,
+ "loss": 0.3487,
+ "step": 9197
+ },
+ {
+ "epoch": 4.348936170212766,
+ "grad_norm": 3.0921413898468018,
+ "learning_rate": 8.90536749554031e-07,
+ "loss": 0.3171,
+ "step": 9198
+ },
+ {
+ "epoch": 4.349408983451537,
+ "grad_norm": 3.1129112243652344,
+ "learning_rate": 8.900594434127712e-07,
+ "loss": 0.3501,
+ "step": 9199
+ },
+ {
+ "epoch": 4.349881796690307,
+ "grad_norm": 3.077688217163086,
+ "learning_rate": 8.8958223751545e-07,
+ "loss": 0.2781,
+ "step": 9200
+ },
+ {
+ "epoch": 4.350354609929078,
+ "grad_norm": 3.2839295864105225,
+ "learning_rate": 8.891051318917821e-07,
+ "loss": 0.3699,
+ "step": 9201
+ },
+ {
+ "epoch": 4.350827423167849,
+ "grad_norm": 3.0370850563049316,
+ "learning_rate": 8.886281265714741e-07,
+ "loss": 0.3344,
+ "step": 9202
+ },
+ {
+ "epoch": 4.35130023640662,
+ "grad_norm": 3.439702033996582,
+ "learning_rate": 8.88151221584227e-07,
+ "loss": 0.3865,
+ "step": 9203
+ },
+ {
+ "epoch": 4.35177304964539,
+ "grad_norm": 3.133317470550537,
+ "learning_rate": 8.876744169597357e-07,
+ "loss": 0.3352,
+ "step": 9204
+ },
+ {
+ "epoch": 4.3522458628841605,
+ "grad_norm": 3.2529115676879883,
+ "learning_rate": 8.871977127276876e-07,
+ "loss": 0.3708,
+ "step": 9205
+ },
+ {
+ "epoch": 4.352718676122931,
+ "grad_norm": 3.2149887084960938,
+ "learning_rate": 8.867211089177669e-07,
+ "loss": 0.3298,
+ "step": 9206
+ },
+ {
+ "epoch": 4.353191489361702,
+ "grad_norm": 2.778116464614868,
+ "learning_rate": 8.86244605559648e-07,
+ "loss": 0.3319,
+ "step": 9207
+ },
+ {
+ "epoch": 4.353664302600473,
+ "grad_norm": 3.206336736679077,
+ "learning_rate": 8.85768202683002e-07,
+ "loss": 0.3793,
+ "step": 9208
+ },
+ {
+ "epoch": 4.354137115839244,
+ "grad_norm": 3.4236080646514893,
+ "learning_rate": 8.852919003174921e-07,
+ "loss": 0.3341,
+ "step": 9209
+ },
+ {
+ "epoch": 4.3546099290780145,
+ "grad_norm": 3.049886703491211,
+ "learning_rate": 8.848156984927742e-07,
+ "loss": 0.3153,
+ "step": 9210
+ },
+ {
+ "epoch": 4.355082742316785,
+ "grad_norm": 4.048248291015625,
+ "learning_rate": 8.843395972385013e-07,
+ "loss": 0.3857,
+ "step": 9211
+ },
+ {
+ "epoch": 4.355555555555555,
+ "grad_norm": 3.3379292488098145,
+ "learning_rate": 8.838635965843165e-07,
+ "loss": 0.3167,
+ "step": 9212
+ },
+ {
+ "epoch": 4.356028368794326,
+ "grad_norm": 2.963364839553833,
+ "learning_rate": 8.833876965598598e-07,
+ "loss": 0.3427,
+ "step": 9213
+ },
+ {
+ "epoch": 4.356501182033097,
+ "grad_norm": 3.1309237480163574,
+ "learning_rate": 8.829118971947625e-07,
+ "loss": 0.3694,
+ "step": 9214
+ },
+ {
+ "epoch": 4.356973995271868,
+ "grad_norm": 3.4728028774261475,
+ "learning_rate": 8.824361985186497e-07,
+ "loss": 0.3769,
+ "step": 9215
+ },
+ {
+ "epoch": 4.357446808510638,
+ "grad_norm": 2.7183408737182617,
+ "learning_rate": 8.819606005611431e-07,
+ "loss": 0.3053,
+ "step": 9216
+ },
+ {
+ "epoch": 4.357919621749409,
+ "grad_norm": 3.34867262840271,
+ "learning_rate": 8.814851033518549e-07,
+ "loss": 0.3316,
+ "step": 9217
+ },
+ {
+ "epoch": 4.35839243498818,
+ "grad_norm": 3.288097858428955,
+ "learning_rate": 8.810097069203924e-07,
+ "loss": 0.362,
+ "step": 9218
+ },
+ {
+ "epoch": 4.358865248226951,
+ "grad_norm": 3.2768566608428955,
+ "learning_rate": 8.805344112963563e-07,
+ "loss": 0.3762,
+ "step": 9219
+ },
+ {
+ "epoch": 4.359338061465721,
+ "grad_norm": 2.730982542037964,
+ "learning_rate": 8.800592165093405e-07,
+ "loss": 0.3184,
+ "step": 9220
+ },
+ {
+ "epoch": 4.3598108747044915,
+ "grad_norm": 3.2347333431243896,
+ "learning_rate": 8.795841225889348e-07,
+ "loss": 0.3414,
+ "step": 9221
+ },
+ {
+ "epoch": 4.360283687943262,
+ "grad_norm": 2.8792049884796143,
+ "learning_rate": 8.791091295647208e-07,
+ "loss": 0.3312,
+ "step": 9222
+ },
+ {
+ "epoch": 4.360756501182033,
+ "grad_norm": 3.2037971019744873,
+ "learning_rate": 8.786342374662726e-07,
+ "loss": 0.3772,
+ "step": 9223
+ },
+ {
+ "epoch": 4.361229314420804,
+ "grad_norm": 3.765244245529175,
+ "learning_rate": 8.781594463231621e-07,
+ "loss": 0.3724,
+ "step": 9224
+ },
+ {
+ "epoch": 4.361702127659575,
+ "grad_norm": 3.085339069366455,
+ "learning_rate": 8.776847561649504e-07,
+ "loss": 0.3468,
+ "step": 9225
+ },
+ {
+ "epoch": 4.3621749408983455,
+ "grad_norm": 2.8031229972839355,
+ "learning_rate": 8.772101670211963e-07,
+ "loss": 0.3219,
+ "step": 9226
+ },
+ {
+ "epoch": 4.362647754137116,
+ "grad_norm": 2.667694091796875,
+ "learning_rate": 8.76735678921449e-07,
+ "loss": 0.3381,
+ "step": 9227
+ },
+ {
+ "epoch": 4.363120567375886,
+ "grad_norm": 2.898273229598999,
+ "learning_rate": 8.762612918952526e-07,
+ "loss": 0.3526,
+ "step": 9228
+ },
+ {
+ "epoch": 4.363593380614657,
+ "grad_norm": 3.1458849906921387,
+ "learning_rate": 8.757870059721465e-07,
+ "loss": 0.3516,
+ "step": 9229
+ },
+ {
+ "epoch": 4.364066193853428,
+ "grad_norm": 3.1719279289245605,
+ "learning_rate": 8.753128211816609e-07,
+ "loss": 0.328,
+ "step": 9230
+ },
+ {
+ "epoch": 4.3645390070921986,
+ "grad_norm": 3.0799217224121094,
+ "learning_rate": 8.748387375533224e-07,
+ "loss": 0.2802,
+ "step": 9231
+ },
+ {
+ "epoch": 4.365011820330969,
+ "grad_norm": 3.1218812465667725,
+ "learning_rate": 8.743647551166498e-07,
+ "loss": 0.3264,
+ "step": 9232
+ },
+ {
+ "epoch": 4.36548463356974,
+ "grad_norm": 3.231175184249878,
+ "learning_rate": 8.738908739011556e-07,
+ "loss": 0.3192,
+ "step": 9233
+ },
+ {
+ "epoch": 4.365957446808511,
+ "grad_norm": 3.088284730911255,
+ "learning_rate": 8.734170939363465e-07,
+ "loss": 0.3569,
+ "step": 9234
+ },
+ {
+ "epoch": 4.366430260047281,
+ "grad_norm": 3.2510828971862793,
+ "learning_rate": 8.729434152517217e-07,
+ "loss": 0.3977,
+ "step": 9235
+ },
+ {
+ "epoch": 4.366903073286052,
+ "grad_norm": 3.435762405395508,
+ "learning_rate": 8.724698378767768e-07,
+ "loss": 0.3201,
+ "step": 9236
+ },
+ {
+ "epoch": 4.3673758865248224,
+ "grad_norm": 3.6876676082611084,
+ "learning_rate": 8.719963618409985e-07,
+ "loss": 0.381,
+ "step": 9237
+ },
+ {
+ "epoch": 4.367848699763593,
+ "grad_norm": 2.7620339393615723,
+ "learning_rate": 8.715229871738676e-07,
+ "loss": 0.2939,
+ "step": 9238
+ },
+ {
+ "epoch": 4.368321513002364,
+ "grad_norm": 3.412893056869507,
+ "learning_rate": 8.710497139048604e-07,
+ "loss": 0.3592,
+ "step": 9239
+ },
+ {
+ "epoch": 4.368794326241135,
+ "grad_norm": 3.2498574256896973,
+ "learning_rate": 8.705765420634446e-07,
+ "loss": 0.4054,
+ "step": 9240
+ },
+ {
+ "epoch": 4.369267139479906,
+ "grad_norm": 3.138425827026367,
+ "learning_rate": 8.701034716790821e-07,
+ "loss": 0.3609,
+ "step": 9241
+ },
+ {
+ "epoch": 4.369739952718676,
+ "grad_norm": 2.7645158767700195,
+ "learning_rate": 8.696305027812301e-07,
+ "loss": 0.3085,
+ "step": 9242
+ },
+ {
+ "epoch": 4.370212765957447,
+ "grad_norm": 3.5948917865753174,
+ "learning_rate": 8.691576353993372e-07,
+ "loss": 0.3846,
+ "step": 9243
+ },
+ {
+ "epoch": 4.370685579196217,
+ "grad_norm": 3.2185158729553223,
+ "learning_rate": 8.68684869562848e-07,
+ "loss": 0.3516,
+ "step": 9244
+ },
+ {
+ "epoch": 4.371158392434988,
+ "grad_norm": 3.057281494140625,
+ "learning_rate": 8.68212205301199e-07,
+ "loss": 0.3197,
+ "step": 9245
+ },
+ {
+ "epoch": 4.371631205673759,
+ "grad_norm": 2.9788076877593994,
+ "learning_rate": 8.677396426438198e-07,
+ "loss": 0.3283,
+ "step": 9246
+ },
+ {
+ "epoch": 4.3721040189125295,
+ "grad_norm": 2.9246625900268555,
+ "learning_rate": 8.672671816201366e-07,
+ "loss": 0.3482,
+ "step": 9247
+ },
+ {
+ "epoch": 4.3725768321513,
+ "grad_norm": 2.9994964599609375,
+ "learning_rate": 8.667948222595671e-07,
+ "loss": 0.3802,
+ "step": 9248
+ },
+ {
+ "epoch": 4.373049645390071,
+ "grad_norm": 2.692626476287842,
+ "learning_rate": 8.663225645915222e-07,
+ "loss": 0.3045,
+ "step": 9249
+ },
+ {
+ "epoch": 4.373522458628842,
+ "grad_norm": 2.794236660003662,
+ "learning_rate": 8.658504086454078e-07,
+ "loss": 0.3056,
+ "step": 9250
+ },
+ {
+ "epoch": 4.373995271867612,
+ "grad_norm": 3.020534038543701,
+ "learning_rate": 8.653783544506222e-07,
+ "loss": 0.3341,
+ "step": 9251
+ },
+ {
+ "epoch": 4.374468085106383,
+ "grad_norm": 3.2142958641052246,
+ "learning_rate": 8.649064020365596e-07,
+ "loss": 0.3435,
+ "step": 9252
+ },
+ {
+ "epoch": 4.374940898345153,
+ "grad_norm": 3.3818624019622803,
+ "learning_rate": 8.644345514326049e-07,
+ "loss": 0.3744,
+ "step": 9253
+ },
+ {
+ "epoch": 4.375413711583924,
+ "grad_norm": 3.1566405296325684,
+ "learning_rate": 8.639628026681399e-07,
+ "loss": 0.3568,
+ "step": 9254
+ },
+ {
+ "epoch": 4.375886524822695,
+ "grad_norm": 3.3773083686828613,
+ "learning_rate": 8.63491155772537e-07,
+ "loss": 0.3523,
+ "step": 9255
+ },
+ {
+ "epoch": 4.376359338061466,
+ "grad_norm": 3.0850939750671387,
+ "learning_rate": 8.630196107751634e-07,
+ "loss": 0.3356,
+ "step": 9256
+ },
+ {
+ "epoch": 4.376832151300237,
+ "grad_norm": 3.361496686935425,
+ "learning_rate": 8.625481677053815e-07,
+ "loss": 0.3619,
+ "step": 9257
+ },
+ {
+ "epoch": 4.377304964539007,
+ "grad_norm": 3.026015043258667,
+ "learning_rate": 8.620768265925444e-07,
+ "loss": 0.3476,
+ "step": 9258
+ },
+ {
+ "epoch": 4.377777777777778,
+ "grad_norm": 3.142747640609741,
+ "learning_rate": 8.61605587466002e-07,
+ "loss": 0.3391,
+ "step": 9259
+ },
+ {
+ "epoch": 4.378250591016548,
+ "grad_norm": 3.0910356044769287,
+ "learning_rate": 8.611344503550956e-07,
+ "loss": 0.3201,
+ "step": 9260
+ },
+ {
+ "epoch": 4.378723404255319,
+ "grad_norm": 3.4462292194366455,
+ "learning_rate": 8.606634152891599e-07,
+ "loss": 0.4075,
+ "step": 9261
+ },
+ {
+ "epoch": 4.37919621749409,
+ "grad_norm": 2.984248638153076,
+ "learning_rate": 8.601924822975258e-07,
+ "loss": 0.3415,
+ "step": 9262
+ },
+ {
+ "epoch": 4.3796690307328605,
+ "grad_norm": 2.944971799850464,
+ "learning_rate": 8.597216514095155e-07,
+ "loss": 0.3163,
+ "step": 9263
+ },
+ {
+ "epoch": 4.380141843971631,
+ "grad_norm": 3.1562247276306152,
+ "learning_rate": 8.592509226544457e-07,
+ "loss": 0.3093,
+ "step": 9264
+ },
+ {
+ "epoch": 4.380614657210402,
+ "grad_norm": 2.911339282989502,
+ "learning_rate": 8.587802960616254e-07,
+ "loss": 0.3287,
+ "step": 9265
+ },
+ {
+ "epoch": 4.381087470449173,
+ "grad_norm": 3.5560295581817627,
+ "learning_rate": 8.583097716603605e-07,
+ "loss": 0.3763,
+ "step": 9266
+ },
+ {
+ "epoch": 4.381560283687943,
+ "grad_norm": 3.35855770111084,
+ "learning_rate": 8.578393494799478e-07,
+ "loss": 0.3703,
+ "step": 9267
+ },
+ {
+ "epoch": 4.382033096926714,
+ "grad_norm": 3.0229954719543457,
+ "learning_rate": 8.573690295496778e-07,
+ "loss": 0.3421,
+ "step": 9268
+ },
+ {
+ "epoch": 4.382505910165484,
+ "grad_norm": 3.0842833518981934,
+ "learning_rate": 8.568988118988348e-07,
+ "loss": 0.3473,
+ "step": 9269
+ },
+ {
+ "epoch": 4.382978723404255,
+ "grad_norm": 3.2471694946289062,
+ "learning_rate": 8.564286965566989e-07,
+ "loss": 0.3025,
+ "step": 9270
+ },
+ {
+ "epoch": 4.383451536643026,
+ "grad_norm": 3.4435837268829346,
+ "learning_rate": 8.559586835525404e-07,
+ "loss": 0.394,
+ "step": 9271
+ },
+ {
+ "epoch": 4.383924349881797,
+ "grad_norm": 3.4572243690490723,
+ "learning_rate": 8.554887729156267e-07,
+ "loss": 0.3745,
+ "step": 9272
+ },
+ {
+ "epoch": 4.384397163120568,
+ "grad_norm": 3.3646514415740967,
+ "learning_rate": 8.550189646752161e-07,
+ "loss": 0.308,
+ "step": 9273
+ },
+ {
+ "epoch": 4.384869976359338,
+ "grad_norm": 2.794933319091797,
+ "learning_rate": 8.545492588605606e-07,
+ "loss": 0.3039,
+ "step": 9274
+ },
+ {
+ "epoch": 4.385342789598109,
+ "grad_norm": 2.969306707382202,
+ "learning_rate": 8.540796555009084e-07,
+ "loss": 0.3815,
+ "step": 9275
+ },
+ {
+ "epoch": 4.385815602836879,
+ "grad_norm": 2.9203877449035645,
+ "learning_rate": 8.536101546254982e-07,
+ "loss": 0.3143,
+ "step": 9276
+ },
+ {
+ "epoch": 4.38628841607565,
+ "grad_norm": 3.451172113418579,
+ "learning_rate": 8.531407562635655e-07,
+ "loss": 0.3673,
+ "step": 9277
+ },
+ {
+ "epoch": 4.386761229314421,
+ "grad_norm": 3.1196818351745605,
+ "learning_rate": 8.526714604443365e-07,
+ "loss": 0.3449,
+ "step": 9278
+ },
+ {
+ "epoch": 4.3872340425531915,
+ "grad_norm": 3.0087406635284424,
+ "learning_rate": 8.522022671970312e-07,
+ "loss": 0.2898,
+ "step": 9279
+ },
+ {
+ "epoch": 4.387706855791962,
+ "grad_norm": 2.885667085647583,
+ "learning_rate": 8.517331765508666e-07,
+ "loss": 0.3119,
+ "step": 9280
+ },
+ {
+ "epoch": 4.388179669030733,
+ "grad_norm": 3.115769624710083,
+ "learning_rate": 8.512641885350494e-07,
+ "loss": 0.3662,
+ "step": 9281
+ },
+ {
+ "epoch": 4.388652482269504,
+ "grad_norm": 2.935692071914673,
+ "learning_rate": 8.507953031787818e-07,
+ "loss": 0.2957,
+ "step": 9282
+ },
+ {
+ "epoch": 4.389125295508274,
+ "grad_norm": 2.96824312210083,
+ "learning_rate": 8.503265205112593e-07,
+ "loss": 0.301,
+ "step": 9283
+ },
+ {
+ "epoch": 4.389598108747045,
+ "grad_norm": 2.8329155445098877,
+ "learning_rate": 8.498578405616697e-07,
+ "loss": 0.3289,
+ "step": 9284
+ },
+ {
+ "epoch": 4.390070921985815,
+ "grad_norm": 3.3063509464263916,
+ "learning_rate": 8.493892633591976e-07,
+ "loss": 0.371,
+ "step": 9285
+ },
+ {
+ "epoch": 4.390543735224586,
+ "grad_norm": 3.036324977874756,
+ "learning_rate": 8.489207889330175e-07,
+ "loss": 0.3111,
+ "step": 9286
+ },
+ {
+ "epoch": 4.391016548463357,
+ "grad_norm": 3.221714496612549,
+ "learning_rate": 8.48452417312301e-07,
+ "loss": 0.3034,
+ "step": 9287
+ },
+ {
+ "epoch": 4.391489361702128,
+ "grad_norm": 2.956813097000122,
+ "learning_rate": 8.479841485262108e-07,
+ "loss": 0.2826,
+ "step": 9288
+ },
+ {
+ "epoch": 4.391962174940899,
+ "grad_norm": 3.3818461894989014,
+ "learning_rate": 8.475159826039028e-07,
+ "loss": 0.3701,
+ "step": 9289
+ },
+ {
+ "epoch": 4.392434988179669,
+ "grad_norm": 3.1623525619506836,
+ "learning_rate": 8.470479195745293e-07,
+ "loss": 0.3405,
+ "step": 9290
+ },
+ {
+ "epoch": 4.39290780141844,
+ "grad_norm": 3.8068127632141113,
+ "learning_rate": 8.465799594672342e-07,
+ "loss": 0.3498,
+ "step": 9291
+ },
+ {
+ "epoch": 4.39338061465721,
+ "grad_norm": 3.042862892150879,
+ "learning_rate": 8.461121023111541e-07,
+ "loss": 0.3025,
+ "step": 9292
+ },
+ {
+ "epoch": 4.393853427895981,
+ "grad_norm": 3.0237231254577637,
+ "learning_rate": 8.456443481354221e-07,
+ "loss": 0.3351,
+ "step": 9293
+ },
+ {
+ "epoch": 4.394326241134752,
+ "grad_norm": 3.233386754989624,
+ "learning_rate": 8.451766969691614e-07,
+ "loss": 0.3314,
+ "step": 9294
+ },
+ {
+ "epoch": 4.3947990543735225,
+ "grad_norm": 2.922518014907837,
+ "learning_rate": 8.447091488414924e-07,
+ "loss": 0.2876,
+ "step": 9295
+ },
+ {
+ "epoch": 4.395271867612293,
+ "grad_norm": 3.2621119022369385,
+ "learning_rate": 8.442417037815268e-07,
+ "loss": 0.3731,
+ "step": 9296
+ },
+ {
+ "epoch": 4.395744680851064,
+ "grad_norm": 3.4238440990448,
+ "learning_rate": 8.437743618183697e-07,
+ "loss": 0.3563,
+ "step": 9297
+ },
+ {
+ "epoch": 4.396217494089835,
+ "grad_norm": 3.607088804244995,
+ "learning_rate": 8.43307122981121e-07,
+ "loss": 0.315,
+ "step": 9298
+ },
+ {
+ "epoch": 4.396690307328605,
+ "grad_norm": 3.0737040042877197,
+ "learning_rate": 8.428399872988724e-07,
+ "loss": 0.2699,
+ "step": 9299
+ },
+ {
+ "epoch": 4.397163120567376,
+ "grad_norm": 3.2364611625671387,
+ "learning_rate": 8.423729548007123e-07,
+ "loss": 0.3054,
+ "step": 9300
+ },
+ {
+ "epoch": 4.397635933806146,
+ "grad_norm": 3.505194664001465,
+ "learning_rate": 8.419060255157199e-07,
+ "loss": 0.3377,
+ "step": 9301
+ },
+ {
+ "epoch": 4.398108747044917,
+ "grad_norm": 3.337815523147583,
+ "learning_rate": 8.414391994729676e-07,
+ "loss": 0.3709,
+ "step": 9302
+ },
+ {
+ "epoch": 4.398581560283688,
+ "grad_norm": 3.310739040374756,
+ "learning_rate": 8.409724767015248e-07,
+ "loss": 0.3125,
+ "step": 9303
+ },
+ {
+ "epoch": 4.399054373522459,
+ "grad_norm": 2.9035723209381104,
+ "learning_rate": 8.405058572304506e-07,
+ "loss": 0.3294,
+ "step": 9304
+ },
+ {
+ "epoch": 4.39952718676123,
+ "grad_norm": 3.162543535232544,
+ "learning_rate": 8.400393410888008e-07,
+ "loss": 0.3563,
+ "step": 9305
+ },
+ {
+ "epoch": 4.4,
+ "grad_norm": 3.320204973220825,
+ "learning_rate": 8.395729283056222e-07,
+ "loss": 0.3681,
+ "step": 9306
+ },
+ {
+ "epoch": 4.400472813238771,
+ "grad_norm": 3.2953343391418457,
+ "learning_rate": 8.391066189099562e-07,
+ "loss": 0.3574,
+ "step": 9307
+ },
+ {
+ "epoch": 4.400945626477541,
+ "grad_norm": 3.2041780948638916,
+ "learning_rate": 8.386404129308387e-07,
+ "loss": 0.3787,
+ "step": 9308
+ },
+ {
+ "epoch": 4.401418439716312,
+ "grad_norm": 3.493856906890869,
+ "learning_rate": 8.381743103972973e-07,
+ "loss": 0.3678,
+ "step": 9309
+ },
+ {
+ "epoch": 4.401891252955083,
+ "grad_norm": 3.3114027976989746,
+ "learning_rate": 8.377083113383553e-07,
+ "loss": 0.3853,
+ "step": 9310
+ },
+ {
+ "epoch": 4.4023640661938535,
+ "grad_norm": 3.148033857345581,
+ "learning_rate": 8.372424157830281e-07,
+ "loss": 0.41,
+ "step": 9311
+ },
+ {
+ "epoch": 4.402836879432624,
+ "grad_norm": 3.1810758113861084,
+ "learning_rate": 8.367766237603245e-07,
+ "loss": 0.3536,
+ "step": 9312
+ },
+ {
+ "epoch": 4.403309692671395,
+ "grad_norm": 3.110158920288086,
+ "learning_rate": 8.363109352992474e-07,
+ "loss": 0.3453,
+ "step": 9313
+ },
+ {
+ "epoch": 4.403782505910166,
+ "grad_norm": 3.140287399291992,
+ "learning_rate": 8.358453504287934e-07,
+ "loss": 0.3617,
+ "step": 9314
+ },
+ {
+ "epoch": 4.404255319148936,
+ "grad_norm": 3.0819156169891357,
+ "learning_rate": 8.353798691779516e-07,
+ "loss": 0.3033,
+ "step": 9315
+ },
+ {
+ "epoch": 4.4047281323877066,
+ "grad_norm": 3.167506217956543,
+ "learning_rate": 8.349144915757071e-07,
+ "loss": 0.325,
+ "step": 9316
+ },
+ {
+ "epoch": 4.405200945626477,
+ "grad_norm": 3.0423221588134766,
+ "learning_rate": 8.34449217651035e-07,
+ "loss": 0.3213,
+ "step": 9317
+ },
+ {
+ "epoch": 4.405673758865248,
+ "grad_norm": 3.4442083835601807,
+ "learning_rate": 8.339840474329078e-07,
+ "loss": 0.3493,
+ "step": 9318
+ },
+ {
+ "epoch": 4.406146572104019,
+ "grad_norm": 3.2931764125823975,
+ "learning_rate": 8.335189809502886e-07,
+ "loss": 0.3947,
+ "step": 9319
+ },
+ {
+ "epoch": 4.40661938534279,
+ "grad_norm": 3.217146396636963,
+ "learning_rate": 8.330540182321345e-07,
+ "loss": 0.3622,
+ "step": 9320
+ },
+ {
+ "epoch": 4.4070921985815605,
+ "grad_norm": 3.0024805068969727,
+ "learning_rate": 8.325891593073981e-07,
+ "loss": 0.3333,
+ "step": 9321
+ },
+ {
+ "epoch": 4.407565011820331,
+ "grad_norm": 2.9128856658935547,
+ "learning_rate": 8.321244042050225e-07,
+ "loss": 0.3069,
+ "step": 9322
+ },
+ {
+ "epoch": 4.408037825059101,
+ "grad_norm": 3.1456804275512695,
+ "learning_rate": 8.316597529539477e-07,
+ "loss": 0.3749,
+ "step": 9323
+ },
+ {
+ "epoch": 4.408510638297872,
+ "grad_norm": 3.093379497528076,
+ "learning_rate": 8.31195205583105e-07,
+ "loss": 0.3899,
+ "step": 9324
+ },
+ {
+ "epoch": 4.408983451536643,
+ "grad_norm": 2.95357608795166,
+ "learning_rate": 8.307307621214181e-07,
+ "loss": 0.3525,
+ "step": 9325
+ },
+ {
+ "epoch": 4.409456264775414,
+ "grad_norm": 3.0832929611206055,
+ "learning_rate": 8.30266422597808e-07,
+ "loss": 0.3976,
+ "step": 9326
+ },
+ {
+ "epoch": 4.409929078014184,
+ "grad_norm": 3.203678846359253,
+ "learning_rate": 8.298021870411862e-07,
+ "loss": 0.3954,
+ "step": 9327
+ },
+ {
+ "epoch": 4.410401891252955,
+ "grad_norm": 3.0880157947540283,
+ "learning_rate": 8.293380554804586e-07,
+ "loss": 0.3153,
+ "step": 9328
+ },
+ {
+ "epoch": 4.410874704491726,
+ "grad_norm": 2.9109299182891846,
+ "learning_rate": 8.28874027944524e-07,
+ "loss": 0.3526,
+ "step": 9329
+ },
+ {
+ "epoch": 4.411347517730497,
+ "grad_norm": 3.4241647720336914,
+ "learning_rate": 8.284101044622767e-07,
+ "loss": 0.3288,
+ "step": 9330
+ },
+ {
+ "epoch": 4.411820330969267,
+ "grad_norm": 3.110163450241089,
+ "learning_rate": 8.279462850626024e-07,
+ "loss": 0.314,
+ "step": 9331
+ },
+ {
+ "epoch": 4.4122931442080375,
+ "grad_norm": 3.024353504180908,
+ "learning_rate": 8.274825697743805e-07,
+ "loss": 0.3107,
+ "step": 9332
+ },
+ {
+ "epoch": 4.412765957446808,
+ "grad_norm": 2.8271758556365967,
+ "learning_rate": 8.270189586264859e-07,
+ "loss": 0.3339,
+ "step": 9333
+ },
+ {
+ "epoch": 4.413238770685579,
+ "grad_norm": 3.179032325744629,
+ "learning_rate": 8.265554516477853e-07,
+ "loss": 0.3365,
+ "step": 9334
+ },
+ {
+ "epoch": 4.41371158392435,
+ "grad_norm": 2.9746336936950684,
+ "learning_rate": 8.260920488671376e-07,
+ "loss": 0.3127,
+ "step": 9335
+ },
+ {
+ "epoch": 4.414184397163121,
+ "grad_norm": 3.247529983520508,
+ "learning_rate": 8.256287503133992e-07,
+ "loss": 0.3251,
+ "step": 9336
+ },
+ {
+ "epoch": 4.4146572104018915,
+ "grad_norm": 3.3317253589630127,
+ "learning_rate": 8.251655560154168e-07,
+ "loss": 0.3246,
+ "step": 9337
+ },
+ {
+ "epoch": 4.415130023640662,
+ "grad_norm": 2.9890010356903076,
+ "learning_rate": 8.247024660020303e-07,
+ "loss": 0.3317,
+ "step": 9338
+ },
+ {
+ "epoch": 4.415602836879432,
+ "grad_norm": 3.3956406116485596,
+ "learning_rate": 8.242394803020759e-07,
+ "loss": 0.4055,
+ "step": 9339
+ },
+ {
+ "epoch": 4.416075650118203,
+ "grad_norm": 2.9918906688690186,
+ "learning_rate": 8.237765989443805e-07,
+ "loss": 0.3415,
+ "step": 9340
+ },
+ {
+ "epoch": 4.416548463356974,
+ "grad_norm": 3.4310927391052246,
+ "learning_rate": 8.233138219577671e-07,
+ "loss": 0.3654,
+ "step": 9341
+ },
+ {
+ "epoch": 4.417021276595745,
+ "grad_norm": 3.207947254180908,
+ "learning_rate": 8.2285114937105e-07,
+ "loss": 0.3773,
+ "step": 9342
+ },
+ {
+ "epoch": 4.417494089834515,
+ "grad_norm": 3.202953338623047,
+ "learning_rate": 8.223885812130367e-07,
+ "loss": 0.3476,
+ "step": 9343
+ },
+ {
+ "epoch": 4.417966903073286,
+ "grad_norm": 3.160951614379883,
+ "learning_rate": 8.219261175125315e-07,
+ "loss": 0.3583,
+ "step": 9344
+ },
+ {
+ "epoch": 4.418439716312057,
+ "grad_norm": 2.892636775970459,
+ "learning_rate": 8.214637582983284e-07,
+ "loss": 0.3241,
+ "step": 9345
+ },
+ {
+ "epoch": 4.418912529550828,
+ "grad_norm": 2.830085277557373,
+ "learning_rate": 8.210015035992172e-07,
+ "loss": 0.2921,
+ "step": 9346
+ },
+ {
+ "epoch": 4.419385342789598,
+ "grad_norm": 3.2846477031707764,
+ "learning_rate": 8.205393534439801e-07,
+ "loss": 0.4281,
+ "step": 9347
+ },
+ {
+ "epoch": 4.4198581560283685,
+ "grad_norm": 2.6153810024261475,
+ "learning_rate": 8.200773078613924e-07,
+ "loss": 0.2848,
+ "step": 9348
+ },
+ {
+ "epoch": 4.420330969267139,
+ "grad_norm": 3.0541396141052246,
+ "learning_rate": 8.196153668802253e-07,
+ "loss": 0.3619,
+ "step": 9349
+ },
+ {
+ "epoch": 4.42080378250591,
+ "grad_norm": 3.516235589981079,
+ "learning_rate": 8.191535305292406e-07,
+ "loss": 0.3996,
+ "step": 9350
+ },
+ {
+ "epoch": 4.421276595744681,
+ "grad_norm": 3.2205963134765625,
+ "learning_rate": 8.186917988371956e-07,
+ "loss": 0.3219,
+ "step": 9351
+ },
+ {
+ "epoch": 4.421749408983452,
+ "grad_norm": 3.2431082725524902,
+ "learning_rate": 8.1823017183284e-07,
+ "loss": 0.3033,
+ "step": 9352
+ },
+ {
+ "epoch": 4.4222222222222225,
+ "grad_norm": 3.337085485458374,
+ "learning_rate": 8.177686495449166e-07,
+ "loss": 0.3467,
+ "step": 9353
+ },
+ {
+ "epoch": 4.422695035460993,
+ "grad_norm": 3.2539291381835938,
+ "learning_rate": 8.173072320021641e-07,
+ "loss": 0.3565,
+ "step": 9354
+ },
+ {
+ "epoch": 4.423167848699763,
+ "grad_norm": 3.069993734359741,
+ "learning_rate": 8.168459192333105e-07,
+ "loss": 0.3627,
+ "step": 9355
+ },
+ {
+ "epoch": 4.423640661938534,
+ "grad_norm": 3.068195104598999,
+ "learning_rate": 8.163847112670826e-07,
+ "loss": 0.2945,
+ "step": 9356
+ },
+ {
+ "epoch": 4.424113475177305,
+ "grad_norm": 3.273607015609741,
+ "learning_rate": 8.159236081321959e-07,
+ "loss": 0.3451,
+ "step": 9357
+ },
+ {
+ "epoch": 4.424586288416076,
+ "grad_norm": 3.355647325515747,
+ "learning_rate": 8.154626098573607e-07,
+ "loss": 0.3469,
+ "step": 9358
+ },
+ {
+ "epoch": 4.425059101654846,
+ "grad_norm": 2.792948007583618,
+ "learning_rate": 8.150017164712831e-07,
+ "loss": 0.3393,
+ "step": 9359
+ },
+ {
+ "epoch": 4.425531914893617,
+ "grad_norm": 3.031167507171631,
+ "learning_rate": 8.145409280026607e-07,
+ "loss": 0.2924,
+ "step": 9360
+ },
+ {
+ "epoch": 4.426004728132388,
+ "grad_norm": 3.0713601112365723,
+ "learning_rate": 8.140802444801835e-07,
+ "loss": 0.3221,
+ "step": 9361
+ },
+ {
+ "epoch": 4.426477541371159,
+ "grad_norm": 3.225785493850708,
+ "learning_rate": 8.136196659325374e-07,
+ "loss": 0.3626,
+ "step": 9362
+ },
+ {
+ "epoch": 4.426950354609929,
+ "grad_norm": 2.9779045581817627,
+ "learning_rate": 8.131591923883991e-07,
+ "loss": 0.3329,
+ "step": 9363
+ },
+ {
+ "epoch": 4.4274231678486995,
+ "grad_norm": 3.534536600112915,
+ "learning_rate": 8.126988238764422e-07,
+ "loss": 0.4029,
+ "step": 9364
+ },
+ {
+ "epoch": 4.42789598108747,
+ "grad_norm": 3.4237616062164307,
+ "learning_rate": 8.122385604253311e-07,
+ "loss": 0.3763,
+ "step": 9365
+ },
+ {
+ "epoch": 4.428368794326241,
+ "grad_norm": 2.8711681365966797,
+ "learning_rate": 8.117784020637231e-07,
+ "loss": 0.3141,
+ "step": 9366
+ },
+ {
+ "epoch": 4.428841607565012,
+ "grad_norm": 3.0277621746063232,
+ "learning_rate": 8.113183488202725e-07,
+ "loss": 0.3848,
+ "step": 9367
+ },
+ {
+ "epoch": 4.429314420803783,
+ "grad_norm": 3.1275761127471924,
+ "learning_rate": 8.108584007236226e-07,
+ "loss": 0.3343,
+ "step": 9368
+ },
+ {
+ "epoch": 4.4297872340425535,
+ "grad_norm": 3.2320117950439453,
+ "learning_rate": 8.103985578024143e-07,
+ "loss": 0.3211,
+ "step": 9369
+ },
+ {
+ "epoch": 4.430260047281324,
+ "grad_norm": 3.258829355239868,
+ "learning_rate": 8.099388200852792e-07,
+ "loss": 0.3473,
+ "step": 9370
+ },
+ {
+ "epoch": 4.430732860520094,
+ "grad_norm": 2.9609436988830566,
+ "learning_rate": 8.094791876008423e-07,
+ "loss": 0.318,
+ "step": 9371
+ },
+ {
+ "epoch": 4.431205673758865,
+ "grad_norm": 3.128053665161133,
+ "learning_rate": 8.090196603777245e-07,
+ "loss": 0.3372,
+ "step": 9372
+ },
+ {
+ "epoch": 4.431678486997636,
+ "grad_norm": 3.013979196548462,
+ "learning_rate": 8.085602384445368e-07,
+ "loss": 0.3098,
+ "step": 9373
+ },
+ {
+ "epoch": 4.432151300236407,
+ "grad_norm": 3.603433132171631,
+ "learning_rate": 8.081009218298871e-07,
+ "loss": 0.4016,
+ "step": 9374
+ },
+ {
+ "epoch": 4.432624113475177,
+ "grad_norm": 2.687730312347412,
+ "learning_rate": 8.076417105623743e-07,
+ "loss": 0.3173,
+ "step": 9375
+ },
+ {
+ "epoch": 4.433096926713948,
+ "grad_norm": 3.3575692176818848,
+ "learning_rate": 8.071826046705913e-07,
+ "loss": 0.3173,
+ "step": 9376
+ },
+ {
+ "epoch": 4.433569739952719,
+ "grad_norm": 3.3599679470062256,
+ "learning_rate": 8.06723604183125e-07,
+ "loss": 0.3466,
+ "step": 9377
+ },
+ {
+ "epoch": 4.43404255319149,
+ "grad_norm": 3.101696014404297,
+ "learning_rate": 8.062647091285542e-07,
+ "loss": 0.3322,
+ "step": 9378
+ },
+ {
+ "epoch": 4.43451536643026,
+ "grad_norm": 3.0010359287261963,
+ "learning_rate": 8.05805919535454e-07,
+ "loss": 0.3863,
+ "step": 9379
+ },
+ {
+ "epoch": 4.4349881796690305,
+ "grad_norm": 3.1428821086883545,
+ "learning_rate": 8.053472354323902e-07,
+ "loss": 0.3402,
+ "step": 9380
+ },
+ {
+ "epoch": 4.435460992907801,
+ "grad_norm": 3.416954517364502,
+ "learning_rate": 8.048886568479222e-07,
+ "loss": 0.3637,
+ "step": 9381
+ },
+ {
+ "epoch": 4.435933806146572,
+ "grad_norm": 3.015092611312866,
+ "learning_rate": 8.044301838106059e-07,
+ "loss": 0.3371,
+ "step": 9382
+ },
+ {
+ "epoch": 4.436406619385343,
+ "grad_norm": 2.6680097579956055,
+ "learning_rate": 8.039718163489862e-07,
+ "loss": 0.3148,
+ "step": 9383
+ },
+ {
+ "epoch": 4.436879432624114,
+ "grad_norm": 3.098219633102417,
+ "learning_rate": 8.035135544916056e-07,
+ "loss": 0.3348,
+ "step": 9384
+ },
+ {
+ "epoch": 4.4373522458628845,
+ "grad_norm": 3.5400390625,
+ "learning_rate": 8.030553982669969e-07,
+ "loss": 0.4211,
+ "step": 9385
+ },
+ {
+ "epoch": 4.437825059101655,
+ "grad_norm": 3.6074001789093018,
+ "learning_rate": 8.025973477036872e-07,
+ "loss": 0.3179,
+ "step": 9386
+ },
+ {
+ "epoch": 4.438297872340425,
+ "grad_norm": 3.111982583999634,
+ "learning_rate": 8.021394028301982e-07,
+ "loss": 0.3414,
+ "step": 9387
+ },
+ {
+ "epoch": 4.438770685579196,
+ "grad_norm": 3.5494184494018555,
+ "learning_rate": 8.016815636750439e-07,
+ "loss": 0.3847,
+ "step": 9388
+ },
+ {
+ "epoch": 4.439243498817967,
+ "grad_norm": 3.3602912425994873,
+ "learning_rate": 8.012238302667308e-07,
+ "loss": 0.3293,
+ "step": 9389
+ },
+ {
+ "epoch": 4.439716312056738,
+ "grad_norm": 3.1263039112091064,
+ "learning_rate": 8.007662026337617e-07,
+ "loss": 0.3675,
+ "step": 9390
+ },
+ {
+ "epoch": 4.440189125295508,
+ "grad_norm": 3.239863157272339,
+ "learning_rate": 8.003086808046304e-07,
+ "loss": 0.3445,
+ "step": 9391
+ },
+ {
+ "epoch": 4.440661938534279,
+ "grad_norm": 2.8646275997161865,
+ "learning_rate": 7.998512648078244e-07,
+ "loss": 0.2715,
+ "step": 9392
+ },
+ {
+ "epoch": 4.44113475177305,
+ "grad_norm": 2.9777262210845947,
+ "learning_rate": 7.993939546718255e-07,
+ "loss": 0.3143,
+ "step": 9393
+ },
+ {
+ "epoch": 4.441607565011821,
+ "grad_norm": 3.5436604022979736,
+ "learning_rate": 7.98936750425107e-07,
+ "loss": 0.406,
+ "step": 9394
+ },
+ {
+ "epoch": 4.442080378250591,
+ "grad_norm": 3.1395277976989746,
+ "learning_rate": 7.984796520961391e-07,
+ "loss": 0.3893,
+ "step": 9395
+ },
+ {
+ "epoch": 4.4425531914893615,
+ "grad_norm": 3.6379475593566895,
+ "learning_rate": 7.980226597133814e-07,
+ "loss": 0.3878,
+ "step": 9396
+ },
+ {
+ "epoch": 4.443026004728132,
+ "grad_norm": 3.3866498470306396,
+ "learning_rate": 7.975657733052908e-07,
+ "loss": 0.3399,
+ "step": 9397
+ },
+ {
+ "epoch": 4.443498817966903,
+ "grad_norm": 2.9472098350524902,
+ "learning_rate": 7.971089929003142e-07,
+ "loss": 0.3436,
+ "step": 9398
+ },
+ {
+ "epoch": 4.443971631205674,
+ "grad_norm": 3.314652442932129,
+ "learning_rate": 7.966523185268929e-07,
+ "loss": 0.305,
+ "step": 9399
+ },
+ {
+ "epoch": 4.444444444444445,
+ "grad_norm": 3.8230092525482178,
+ "learning_rate": 7.961957502134638e-07,
+ "loss": 0.3632,
+ "step": 9400
+ },
+ {
+ "epoch": 4.444917257683215,
+ "grad_norm": 3.088292360305786,
+ "learning_rate": 7.957392879884534e-07,
+ "loss": 0.3373,
+ "step": 9401
+ },
+ {
+ "epoch": 4.445390070921986,
+ "grad_norm": 3.1412665843963623,
+ "learning_rate": 7.952829318802854e-07,
+ "loss": 0.3703,
+ "step": 9402
+ },
+ {
+ "epoch": 4.445862884160756,
+ "grad_norm": 3.464963674545288,
+ "learning_rate": 7.948266819173745e-07,
+ "loss": 0.3485,
+ "step": 9403
+ },
+ {
+ "epoch": 4.446335697399527,
+ "grad_norm": 3.2092626094818115,
+ "learning_rate": 7.943705381281281e-07,
+ "loss": 0.341,
+ "step": 9404
+ },
+ {
+ "epoch": 4.446808510638298,
+ "grad_norm": 2.5458641052246094,
+ "learning_rate": 7.939145005409502e-07,
+ "loss": 0.3059,
+ "step": 9405
+ },
+ {
+ "epoch": 4.4472813238770685,
+ "grad_norm": 4.1484150886535645,
+ "learning_rate": 7.934585691842353e-07,
+ "loss": 0.3405,
+ "step": 9406
+ },
+ {
+ "epoch": 4.447754137115839,
+ "grad_norm": 2.9794130325317383,
+ "learning_rate": 7.930027440863716e-07,
+ "loss": 0.359,
+ "step": 9407
+ },
+ {
+ "epoch": 4.44822695035461,
+ "grad_norm": 2.951674222946167,
+ "learning_rate": 7.92547025275743e-07,
+ "loss": 0.3087,
+ "step": 9408
+ },
+ {
+ "epoch": 4.448699763593381,
+ "grad_norm": 3.2857377529144287,
+ "learning_rate": 7.920914127807241e-07,
+ "loss": 0.3327,
+ "step": 9409
+ },
+ {
+ "epoch": 4.449172576832151,
+ "grad_norm": 3.4455840587615967,
+ "learning_rate": 7.916359066296839e-07,
+ "loss": 0.3731,
+ "step": 9410
+ },
+ {
+ "epoch": 4.449645390070922,
+ "grad_norm": 3.392779588699341,
+ "learning_rate": 7.911805068509848e-07,
+ "loss": 0.3559,
+ "step": 9411
+ },
+ {
+ "epoch": 4.450118203309692,
+ "grad_norm": 3.3362300395965576,
+ "learning_rate": 7.90725213472982e-07,
+ "loss": 0.333,
+ "step": 9412
+ },
+ {
+ "epoch": 4.450591016548463,
+ "grad_norm": 3.0671608448028564,
+ "learning_rate": 7.902700265240259e-07,
+ "loss": 0.3342,
+ "step": 9413
+ },
+ {
+ "epoch": 4.451063829787234,
+ "grad_norm": 2.8350744247436523,
+ "learning_rate": 7.898149460324575e-07,
+ "loss": 0.3157,
+ "step": 9414
+ },
+ {
+ "epoch": 4.451536643026005,
+ "grad_norm": 2.932446241378784,
+ "learning_rate": 7.893599720266143e-07,
+ "loss": 0.379,
+ "step": 9415
+ },
+ {
+ "epoch": 4.452009456264776,
+ "grad_norm": 3.354112386703491,
+ "learning_rate": 7.889051045348245e-07,
+ "loss": 0.3001,
+ "step": 9416
+ },
+ {
+ "epoch": 4.452482269503546,
+ "grad_norm": 3.068276882171631,
+ "learning_rate": 7.884503435854104e-07,
+ "loss": 0.3466,
+ "step": 9417
+ },
+ {
+ "epoch": 4.452955082742317,
+ "grad_norm": 2.833534002304077,
+ "learning_rate": 7.879956892066892e-07,
+ "loss": 0.3278,
+ "step": 9418
+ },
+ {
+ "epoch": 4.453427895981087,
+ "grad_norm": 2.9622433185577393,
+ "learning_rate": 7.875411414269687e-07,
+ "loss": 0.3725,
+ "step": 9419
+ },
+ {
+ "epoch": 4.453900709219858,
+ "grad_norm": 3.2055954933166504,
+ "learning_rate": 7.870867002745533e-07,
+ "loss": 0.3215,
+ "step": 9420
+ },
+ {
+ "epoch": 4.454373522458629,
+ "grad_norm": 2.877063274383545,
+ "learning_rate": 7.86632365777738e-07,
+ "loss": 0.2845,
+ "step": 9421
+ },
+ {
+ "epoch": 4.4548463356973995,
+ "grad_norm": 3.2809367179870605,
+ "learning_rate": 7.861781379648117e-07,
+ "loss": 0.39,
+ "step": 9422
+ },
+ {
+ "epoch": 4.45531914893617,
+ "grad_norm": 3.404816150665283,
+ "learning_rate": 7.857240168640587e-07,
+ "loss": 0.3003,
+ "step": 9423
+ },
+ {
+ "epoch": 4.455791962174941,
+ "grad_norm": 3.367253303527832,
+ "learning_rate": 7.85270002503754e-07,
+ "loss": 0.3414,
+ "step": 9424
+ },
+ {
+ "epoch": 4.456264775413712,
+ "grad_norm": 3.1247670650482178,
+ "learning_rate": 7.848160949121678e-07,
+ "loss": 0.2922,
+ "step": 9425
+ },
+ {
+ "epoch": 4.456737588652482,
+ "grad_norm": 3.474435806274414,
+ "learning_rate": 7.843622941175624e-07,
+ "loss": 0.3601,
+ "step": 9426
+ },
+ {
+ "epoch": 4.457210401891253,
+ "grad_norm": 3.0552384853363037,
+ "learning_rate": 7.839086001481933e-07,
+ "loss": 0.3905,
+ "step": 9427
+ },
+ {
+ "epoch": 4.457683215130023,
+ "grad_norm": 3.3532586097717285,
+ "learning_rate": 7.834550130323115e-07,
+ "loss": 0.3783,
+ "step": 9428
+ },
+ {
+ "epoch": 4.458156028368794,
+ "grad_norm": 3.7321903705596924,
+ "learning_rate": 7.830015327981585e-07,
+ "loss": 0.3765,
+ "step": 9429
+ },
+ {
+ "epoch": 4.458628841607565,
+ "grad_norm": 3.070158004760742,
+ "learning_rate": 7.82548159473972e-07,
+ "loss": 0.3279,
+ "step": 9430
+ },
+ {
+ "epoch": 4.459101654846336,
+ "grad_norm": 3.498399257659912,
+ "learning_rate": 7.820948930879807e-07,
+ "loss": 0.3864,
+ "step": 9431
+ },
+ {
+ "epoch": 4.459574468085107,
+ "grad_norm": 3.0352776050567627,
+ "learning_rate": 7.816417336684071e-07,
+ "loss": 0.2963,
+ "step": 9432
+ },
+ {
+ "epoch": 4.460047281323877,
+ "grad_norm": 3.190154790878296,
+ "learning_rate": 7.811886812434686e-07,
+ "loss": 0.354,
+ "step": 9433
+ },
+ {
+ "epoch": 4.460520094562648,
+ "grad_norm": 3.1933085918426514,
+ "learning_rate": 7.807357358413742e-07,
+ "loss": 0.3613,
+ "step": 9434
+ },
+ {
+ "epoch": 4.460992907801418,
+ "grad_norm": 4.0385637283325195,
+ "learning_rate": 7.80282897490326e-07,
+ "loss": 0.3257,
+ "step": 9435
+ },
+ {
+ "epoch": 4.461465721040189,
+ "grad_norm": 3.365485191345215,
+ "learning_rate": 7.798301662185218e-07,
+ "loss": 0.3093,
+ "step": 9436
+ },
+ {
+ "epoch": 4.46193853427896,
+ "grad_norm": 3.5345213413238525,
+ "learning_rate": 7.793775420541497e-07,
+ "loss": 0.3262,
+ "step": 9437
+ },
+ {
+ "epoch": 4.4624113475177305,
+ "grad_norm": 3.2894418239593506,
+ "learning_rate": 7.789250250253941e-07,
+ "loss": 0.3417,
+ "step": 9438
+ },
+ {
+ "epoch": 4.462884160756501,
+ "grad_norm": 2.972001791000366,
+ "learning_rate": 7.784726151604305e-07,
+ "loss": 0.3396,
+ "step": 9439
+ },
+ {
+ "epoch": 4.463356973995272,
+ "grad_norm": 3.161794424057007,
+ "learning_rate": 7.780203124874283e-07,
+ "loss": 0.3583,
+ "step": 9440
+ },
+ {
+ "epoch": 4.463829787234043,
+ "grad_norm": 3.0976521968841553,
+ "learning_rate": 7.775681170345508e-07,
+ "loss": 0.3743,
+ "step": 9441
+ },
+ {
+ "epoch": 4.464302600472813,
+ "grad_norm": 3.1454756259918213,
+ "learning_rate": 7.771160288299534e-07,
+ "loss": 0.3483,
+ "step": 9442
+ },
+ {
+ "epoch": 4.464775413711584,
+ "grad_norm": 3.467618942260742,
+ "learning_rate": 7.766640479017868e-07,
+ "loss": 0.3253,
+ "step": 9443
+ },
+ {
+ "epoch": 4.465248226950354,
+ "grad_norm": 3.3349552154541016,
+ "learning_rate": 7.762121742781933e-07,
+ "loss": 0.3579,
+ "step": 9444
+ },
+ {
+ "epoch": 4.465721040189125,
+ "grad_norm": 3.442701578140259,
+ "learning_rate": 7.757604079873085e-07,
+ "loss": 0.3854,
+ "step": 9445
+ },
+ {
+ "epoch": 4.466193853427896,
+ "grad_norm": 3.3095569610595703,
+ "learning_rate": 7.753087490572633e-07,
+ "loss": 0.3385,
+ "step": 9446
+ },
+ {
+ "epoch": 4.466666666666667,
+ "grad_norm": 3.0978634357452393,
+ "learning_rate": 7.748571975161786e-07,
+ "loss": 0.3511,
+ "step": 9447
+ },
+ {
+ "epoch": 4.467139479905438,
+ "grad_norm": 2.9801225662231445,
+ "learning_rate": 7.744057533921731e-07,
+ "loss": 0.3239,
+ "step": 9448
+ },
+ {
+ "epoch": 4.467612293144208,
+ "grad_norm": 3.116586923599243,
+ "learning_rate": 7.739544167133545e-07,
+ "loss": 0.3786,
+ "step": 9449
+ },
+ {
+ "epoch": 4.468085106382979,
+ "grad_norm": 3.2235381603240967,
+ "learning_rate": 7.73503187507825e-07,
+ "loss": 0.3285,
+ "step": 9450
+ },
+ {
+ "epoch": 4.468557919621749,
+ "grad_norm": 3.175649404525757,
+ "learning_rate": 7.730520658036825e-07,
+ "loss": 0.315,
+ "step": 9451
+ },
+ {
+ "epoch": 4.46903073286052,
+ "grad_norm": 3.013848066329956,
+ "learning_rate": 7.726010516290144e-07,
+ "loss": 0.3533,
+ "step": 9452
+ },
+ {
+ "epoch": 4.469503546099291,
+ "grad_norm": 2.87581467628479,
+ "learning_rate": 7.721501450119057e-07,
+ "loss": 0.2948,
+ "step": 9453
+ },
+ {
+ "epoch": 4.4699763593380615,
+ "grad_norm": 3.504119873046875,
+ "learning_rate": 7.716993459804306e-07,
+ "loss": 0.354,
+ "step": 9454
+ },
+ {
+ "epoch": 4.470449172576832,
+ "grad_norm": 3.2914042472839355,
+ "learning_rate": 7.712486545626591e-07,
+ "loss": 0.3724,
+ "step": 9455
+ },
+ {
+ "epoch": 4.470921985815603,
+ "grad_norm": 3.007551908493042,
+ "learning_rate": 7.707980707866533e-07,
+ "loss": 0.3923,
+ "step": 9456
+ },
+ {
+ "epoch": 4.471394799054374,
+ "grad_norm": 3.2758076190948486,
+ "learning_rate": 7.703475946804687e-07,
+ "loss": 0.4092,
+ "step": 9457
+ },
+ {
+ "epoch": 4.471867612293144,
+ "grad_norm": 3.265875816345215,
+ "learning_rate": 7.698972262721557e-07,
+ "loss": 0.4249,
+ "step": 9458
+ },
+ {
+ "epoch": 4.472340425531915,
+ "grad_norm": 3.0962677001953125,
+ "learning_rate": 7.694469655897565e-07,
+ "loss": 0.3273,
+ "step": 9459
+ },
+ {
+ "epoch": 4.472813238770685,
+ "grad_norm": 3.2247416973114014,
+ "learning_rate": 7.689968126613053e-07,
+ "loss": 0.366,
+ "step": 9460
+ },
+ {
+ "epoch": 4.473286052009456,
+ "grad_norm": 3.326211929321289,
+ "learning_rate": 7.685467675148334e-07,
+ "loss": 0.4044,
+ "step": 9461
+ },
+ {
+ "epoch": 4.473758865248227,
+ "grad_norm": 2.9795444011688232,
+ "learning_rate": 7.68096830178362e-07,
+ "loss": 0.3763,
+ "step": 9462
+ },
+ {
+ "epoch": 4.474231678486998,
+ "grad_norm": 3.0721724033355713,
+ "learning_rate": 7.676470006799061e-07,
+ "loss": 0.3586,
+ "step": 9463
+ },
+ {
+ "epoch": 4.474704491725769,
+ "grad_norm": 3.1191349029541016,
+ "learning_rate": 7.67197279047476e-07,
+ "loss": 0.3111,
+ "step": 9464
+ },
+ {
+ "epoch": 4.475177304964539,
+ "grad_norm": 3.2980053424835205,
+ "learning_rate": 7.667476653090727e-07,
+ "loss": 0.3413,
+ "step": 9465
+ },
+ {
+ "epoch": 4.47565011820331,
+ "grad_norm": 3.159794807434082,
+ "learning_rate": 7.662981594926927e-07,
+ "loss": 0.3559,
+ "step": 9466
+ },
+ {
+ "epoch": 4.47612293144208,
+ "grad_norm": 2.9250876903533936,
+ "learning_rate": 7.658487616263244e-07,
+ "loss": 0.3582,
+ "step": 9467
+ },
+ {
+ "epoch": 4.476595744680851,
+ "grad_norm": 2.915234088897705,
+ "learning_rate": 7.65399471737949e-07,
+ "loss": 0.3466,
+ "step": 9468
+ },
+ {
+ "epoch": 4.477068557919622,
+ "grad_norm": 3.2557425498962402,
+ "learning_rate": 7.649502898555431e-07,
+ "loss": 0.3772,
+ "step": 9469
+ },
+ {
+ "epoch": 4.4775413711583925,
+ "grad_norm": 3.1906673908233643,
+ "learning_rate": 7.645012160070748e-07,
+ "loss": 0.3379,
+ "step": 9470
+ },
+ {
+ "epoch": 4.478014184397163,
+ "grad_norm": 3.1513144969940186,
+ "learning_rate": 7.640522502205056e-07,
+ "loss": 0.3529,
+ "step": 9471
+ },
+ {
+ "epoch": 4.478486997635934,
+ "grad_norm": 3.1969199180603027,
+ "learning_rate": 7.636033925237904e-07,
+ "loss": 0.363,
+ "step": 9472
+ },
+ {
+ "epoch": 4.478959810874705,
+ "grad_norm": 3.4546799659729004,
+ "learning_rate": 7.631546429448785e-07,
+ "loss": 0.2944,
+ "step": 9473
+ },
+ {
+ "epoch": 4.479432624113475,
+ "grad_norm": 3.102057456970215,
+ "learning_rate": 7.627060015117116e-07,
+ "loss": 0.3309,
+ "step": 9474
+ },
+ {
+ "epoch": 4.479905437352246,
+ "grad_norm": 3.125751495361328,
+ "learning_rate": 7.622574682522232e-07,
+ "loss": 0.3576,
+ "step": 9475
+ },
+ {
+ "epoch": 4.480378250591016,
+ "grad_norm": 3.071798324584961,
+ "learning_rate": 7.618090431943432e-07,
+ "loss": 0.3298,
+ "step": 9476
+ },
+ {
+ "epoch": 4.480851063829787,
+ "grad_norm": 3.0060672760009766,
+ "learning_rate": 7.613607263659922e-07,
+ "loss": 0.3528,
+ "step": 9477
+ },
+ {
+ "epoch": 4.481323877068558,
+ "grad_norm": 3.254667043685913,
+ "learning_rate": 7.609125177950846e-07,
+ "loss": 0.3701,
+ "step": 9478
+ },
+ {
+ "epoch": 4.481796690307329,
+ "grad_norm": 3.282247304916382,
+ "learning_rate": 7.604644175095293e-07,
+ "loss": 0.4025,
+ "step": 9479
+ },
+ {
+ "epoch": 4.4822695035460995,
+ "grad_norm": 3.231097936630249,
+ "learning_rate": 7.600164255372266e-07,
+ "loss": 0.3395,
+ "step": 9480
+ },
+ {
+ "epoch": 4.48274231678487,
+ "grad_norm": 3.08368182182312,
+ "learning_rate": 7.595685419060722e-07,
+ "loss": 0.3356,
+ "step": 9481
+ },
+ {
+ "epoch": 4.48321513002364,
+ "grad_norm": 3.0406503677368164,
+ "learning_rate": 7.591207666439532e-07,
+ "loss": 0.2851,
+ "step": 9482
+ },
+ {
+ "epoch": 4.483687943262411,
+ "grad_norm": 3.021157741546631,
+ "learning_rate": 7.586730997787495e-07,
+ "loss": 0.3691,
+ "step": 9483
+ },
+ {
+ "epoch": 4.484160756501182,
+ "grad_norm": 2.8793535232543945,
+ "learning_rate": 7.582255413383375e-07,
+ "loss": 0.332,
+ "step": 9484
+ },
+ {
+ "epoch": 4.484633569739953,
+ "grad_norm": 3.319021224975586,
+ "learning_rate": 7.577780913505833e-07,
+ "loss": 0.3489,
+ "step": 9485
+ },
+ {
+ "epoch": 4.485106382978723,
+ "grad_norm": 3.0267672538757324,
+ "learning_rate": 7.573307498433472e-07,
+ "loss": 0.2989,
+ "step": 9486
+ },
+ {
+ "epoch": 4.485579196217494,
+ "grad_norm": 2.8953561782836914,
+ "learning_rate": 7.568835168444849e-07,
+ "loss": 0.306,
+ "step": 9487
+ },
+ {
+ "epoch": 4.486052009456265,
+ "grad_norm": 3.0559732913970947,
+ "learning_rate": 7.564363923818424e-07,
+ "loss": 0.3122,
+ "step": 9488
+ },
+ {
+ "epoch": 4.486524822695036,
+ "grad_norm": 3.369352340698242,
+ "learning_rate": 7.559893764832607e-07,
+ "loss": 0.3999,
+ "step": 9489
+ },
+ {
+ "epoch": 4.486997635933806,
+ "grad_norm": 3.3339598178863525,
+ "learning_rate": 7.555424691765731e-07,
+ "loss": 0.3566,
+ "step": 9490
+ },
+ {
+ "epoch": 4.4874704491725765,
+ "grad_norm": 3.6563758850097656,
+ "learning_rate": 7.550956704896062e-07,
+ "loss": 0.3686,
+ "step": 9491
+ },
+ {
+ "epoch": 4.487943262411347,
+ "grad_norm": 3.492706537246704,
+ "learning_rate": 7.546489804501811e-07,
+ "loss": 0.3382,
+ "step": 9492
+ },
+ {
+ "epoch": 4.488416075650118,
+ "grad_norm": 3.1645941734313965,
+ "learning_rate": 7.542023990861106e-07,
+ "loss": 0.3583,
+ "step": 9493
+ },
+ {
+ "epoch": 4.488888888888889,
+ "grad_norm": 2.9827258586883545,
+ "learning_rate": 7.537559264252021e-07,
+ "loss": 0.3243,
+ "step": 9494
+ },
+ {
+ "epoch": 4.48936170212766,
+ "grad_norm": 3.2876698970794678,
+ "learning_rate": 7.533095624952547e-07,
+ "loss": 0.373,
+ "step": 9495
+ },
+ {
+ "epoch": 4.4898345153664305,
+ "grad_norm": 3.110868453979492,
+ "learning_rate": 7.528633073240616e-07,
+ "loss": 0.351,
+ "step": 9496
+ },
+ {
+ "epoch": 4.490307328605201,
+ "grad_norm": 3.3962604999542236,
+ "learning_rate": 7.524171609394099e-07,
+ "loss": 0.3686,
+ "step": 9497
+ },
+ {
+ "epoch": 4.490780141843971,
+ "grad_norm": 3.272610902786255,
+ "learning_rate": 7.519711233690777e-07,
+ "loss": 0.3667,
+ "step": 9498
+ },
+ {
+ "epoch": 4.491252955082742,
+ "grad_norm": 3.0907654762268066,
+ "learning_rate": 7.515251946408398e-07,
+ "loss": 0.3247,
+ "step": 9499
+ },
+ {
+ "epoch": 4.491725768321513,
+ "grad_norm": 4.225870609283447,
+ "learning_rate": 7.510793747824613e-07,
+ "loss": 0.3636,
+ "step": 9500
+ },
+ {
+ "epoch": 4.492198581560284,
+ "grad_norm": 3.1911606788635254,
+ "learning_rate": 7.506336638217004e-07,
+ "loss": 0.3661,
+ "step": 9501
+ },
+ {
+ "epoch": 4.492671394799054,
+ "grad_norm": 2.907573699951172,
+ "learning_rate": 7.501880617863114e-07,
+ "loss": 0.3285,
+ "step": 9502
+ },
+ {
+ "epoch": 4.493144208037825,
+ "grad_norm": 3.388460397720337,
+ "learning_rate": 7.497425687040388e-07,
+ "loss": 0.3354,
+ "step": 9503
+ },
+ {
+ "epoch": 4.493617021276596,
+ "grad_norm": 2.7236225605010986,
+ "learning_rate": 7.49297184602622e-07,
+ "loss": 0.3389,
+ "step": 9504
+ },
+ {
+ "epoch": 4.494089834515367,
+ "grad_norm": 3.1962947845458984,
+ "learning_rate": 7.488519095097929e-07,
+ "loss": 0.3377,
+ "step": 9505
+ },
+ {
+ "epoch": 4.494562647754137,
+ "grad_norm": 2.936845541000366,
+ "learning_rate": 7.484067434532763e-07,
+ "loss": 0.2893,
+ "step": 9506
+ },
+ {
+ "epoch": 4.4950354609929075,
+ "grad_norm": 2.8567588329315186,
+ "learning_rate": 7.47961686460792e-07,
+ "loss": 0.3026,
+ "step": 9507
+ },
+ {
+ "epoch": 4.495508274231678,
+ "grad_norm": 3.3522651195526123,
+ "learning_rate": 7.475167385600507e-07,
+ "loss": 0.3517,
+ "step": 9508
+ },
+ {
+ "epoch": 4.495981087470449,
+ "grad_norm": 3.338757276535034,
+ "learning_rate": 7.470718997787572e-07,
+ "loss": 0.3224,
+ "step": 9509
+ },
+ {
+ "epoch": 4.49645390070922,
+ "grad_norm": 3.1484947204589844,
+ "learning_rate": 7.466271701446107e-07,
+ "loss": 0.3872,
+ "step": 9510
+ },
+ {
+ "epoch": 4.496926713947991,
+ "grad_norm": 3.3275411128997803,
+ "learning_rate": 7.461825496853012e-07,
+ "loss": 0.3287,
+ "step": 9511
+ },
+ {
+ "epoch": 4.4973995271867615,
+ "grad_norm": 3.101416826248169,
+ "learning_rate": 7.457380384285151e-07,
+ "loss": 0.3223,
+ "step": 9512
+ },
+ {
+ "epoch": 4.497872340425532,
+ "grad_norm": 2.761810779571533,
+ "learning_rate": 7.45293636401929e-07,
+ "loss": 0.3264,
+ "step": 9513
+ },
+ {
+ "epoch": 4.498345153664302,
+ "grad_norm": 3.215078592300415,
+ "learning_rate": 7.448493436332132e-07,
+ "loss": 0.3548,
+ "step": 9514
+ },
+ {
+ "epoch": 4.498817966903073,
+ "grad_norm": 3.00111722946167,
+ "learning_rate": 7.444051601500335e-07,
+ "loss": 0.3271,
+ "step": 9515
+ },
+ {
+ "epoch": 4.499290780141844,
+ "grad_norm": 3.2428977489471436,
+ "learning_rate": 7.439610859800456e-07,
+ "loss": 0.3024,
+ "step": 9516
+ },
+ {
+ "epoch": 4.499763593380615,
+ "grad_norm": 2.7977585792541504,
+ "learning_rate": 7.435171211509018e-07,
+ "loss": 0.334,
+ "step": 9517
+ },
+ {
+ "epoch": 4.500236406619385,
+ "grad_norm": 3.273468494415283,
+ "learning_rate": 7.430732656902447e-07,
+ "loss": 0.337,
+ "step": 9518
+ },
+ {
+ "epoch": 4.500709219858156,
+ "grad_norm": 4.663364410400391,
+ "learning_rate": 7.426295196257116e-07,
+ "loss": 0.3492,
+ "step": 9519
+ },
+ {
+ "epoch": 4.501182033096927,
+ "grad_norm": 3.1147210597991943,
+ "learning_rate": 7.421858829849327e-07,
+ "loss": 0.3273,
+ "step": 9520
+ },
+ {
+ "epoch": 4.501654846335697,
+ "grad_norm": 3.1411445140838623,
+ "learning_rate": 7.4174235579553e-07,
+ "loss": 0.3413,
+ "step": 9521
+ },
+ {
+ "epoch": 4.502127659574468,
+ "grad_norm": 3.0361053943634033,
+ "learning_rate": 7.412989380851218e-07,
+ "loss": 0.2908,
+ "step": 9522
+ },
+ {
+ "epoch": 4.5026004728132385,
+ "grad_norm": 3.4923086166381836,
+ "learning_rate": 7.408556298813172e-07,
+ "loss": 0.3659,
+ "step": 9523
+ },
+ {
+ "epoch": 4.503073286052009,
+ "grad_norm": 3.6827056407928467,
+ "learning_rate": 7.40412431211718e-07,
+ "loss": 0.3485,
+ "step": 9524
+ },
+ {
+ "epoch": 4.50354609929078,
+ "grad_norm": 3.257322311401367,
+ "learning_rate": 7.399693421039219e-07,
+ "loss": 0.3592,
+ "step": 9525
+ },
+ {
+ "epoch": 4.504018912529551,
+ "grad_norm": 3.515291929244995,
+ "learning_rate": 7.395263625855167e-07,
+ "loss": 0.3662,
+ "step": 9526
+ },
+ {
+ "epoch": 4.504491725768322,
+ "grad_norm": 2.899764060974121,
+ "learning_rate": 7.390834926840865e-07,
+ "loss": 0.3564,
+ "step": 9527
+ },
+ {
+ "epoch": 4.5049645390070925,
+ "grad_norm": 3.2578322887420654,
+ "learning_rate": 7.386407324272055e-07,
+ "loss": 0.3074,
+ "step": 9528
+ },
+ {
+ "epoch": 4.505437352245863,
+ "grad_norm": 3.3826515674591064,
+ "learning_rate": 7.381980818424419e-07,
+ "loss": 0.3669,
+ "step": 9529
+ },
+ {
+ "epoch": 4.505910165484633,
+ "grad_norm": 3.470733404159546,
+ "learning_rate": 7.377555409573594e-07,
+ "loss": 0.3905,
+ "step": 9530
+ },
+ {
+ "epoch": 4.506382978723404,
+ "grad_norm": 3.228917121887207,
+ "learning_rate": 7.373131097995123e-07,
+ "loss": 0.3336,
+ "step": 9531
+ },
+ {
+ "epoch": 4.506855791962175,
+ "grad_norm": 3.2193191051483154,
+ "learning_rate": 7.368707883964476e-07,
+ "loss": 0.3285,
+ "step": 9532
+ },
+ {
+ "epoch": 4.507328605200946,
+ "grad_norm": 3.19169020652771,
+ "learning_rate": 7.36428576775709e-07,
+ "loss": 0.4022,
+ "step": 9533
+ },
+ {
+ "epoch": 4.507801418439716,
+ "grad_norm": 3.1887755393981934,
+ "learning_rate": 7.359864749648296e-07,
+ "loss": 0.3749,
+ "step": 9534
+ },
+ {
+ "epoch": 4.508274231678487,
+ "grad_norm": 3.574314832687378,
+ "learning_rate": 7.355444829913375e-07,
+ "loss": 0.3549,
+ "step": 9535
+ },
+ {
+ "epoch": 4.508747044917258,
+ "grad_norm": 3.3482754230499268,
+ "learning_rate": 7.351026008827527e-07,
+ "loss": 0.311,
+ "step": 9536
+ },
+ {
+ "epoch": 4.509219858156028,
+ "grad_norm": 3.5075576305389404,
+ "learning_rate": 7.34660828666591e-07,
+ "loss": 0.3575,
+ "step": 9537
+ },
+ {
+ "epoch": 4.509692671394799,
+ "grad_norm": 3.233328104019165,
+ "learning_rate": 7.342191663703588e-07,
+ "loss": 0.3087,
+ "step": 9538
+ },
+ {
+ "epoch": 4.5101654846335695,
+ "grad_norm": 3.3704137802124023,
+ "learning_rate": 7.337776140215555e-07,
+ "loss": 0.356,
+ "step": 9539
+ },
+ {
+ "epoch": 4.51063829787234,
+ "grad_norm": 4.084654331207275,
+ "learning_rate": 7.333361716476761e-07,
+ "loss": 0.3382,
+ "step": 9540
+ },
+ {
+ "epoch": 4.511111111111111,
+ "grad_norm": 2.985344886779785,
+ "learning_rate": 7.32894839276207e-07,
+ "loss": 0.3476,
+ "step": 9541
+ },
+ {
+ "epoch": 4.511583924349882,
+ "grad_norm": 3.405877113342285,
+ "learning_rate": 7.324536169346269e-07,
+ "loss": 0.3402,
+ "step": 9542
+ },
+ {
+ "epoch": 4.512056737588653,
+ "grad_norm": 3.0168516635894775,
+ "learning_rate": 7.320125046504103e-07,
+ "loss": 0.2936,
+ "step": 9543
+ },
+ {
+ "epoch": 4.5125295508274235,
+ "grad_norm": 2.9846513271331787,
+ "learning_rate": 7.315715024510219e-07,
+ "loss": 0.3165,
+ "step": 9544
+ },
+ {
+ "epoch": 4.513002364066194,
+ "grad_norm": 3.1661694049835205,
+ "learning_rate": 7.311306103639224e-07,
+ "loss": 0.3372,
+ "step": 9545
+ },
+ {
+ "epoch": 4.513475177304964,
+ "grad_norm": 3.3390371799468994,
+ "learning_rate": 7.306898284165637e-07,
+ "loss": 0.3427,
+ "step": 9546
+ },
+ {
+ "epoch": 4.513947990543735,
+ "grad_norm": 3.1360137462615967,
+ "learning_rate": 7.302491566363904e-07,
+ "loss": 0.3786,
+ "step": 9547
+ },
+ {
+ "epoch": 4.514420803782506,
+ "grad_norm": 3.3442773818969727,
+ "learning_rate": 7.298085950508427e-07,
+ "loss": 0.3516,
+ "step": 9548
+ },
+ {
+ "epoch": 4.514893617021277,
+ "grad_norm": 3.1403257846832275,
+ "learning_rate": 7.293681436873518e-07,
+ "loss": 0.32,
+ "step": 9549
+ },
+ {
+ "epoch": 4.515366430260047,
+ "grad_norm": 5.084080696105957,
+ "learning_rate": 7.289278025733417e-07,
+ "loss": 0.3036,
+ "step": 9550
+ },
+ {
+ "epoch": 4.515839243498818,
+ "grad_norm": 3.055558919906616,
+ "learning_rate": 7.284875717362322e-07,
+ "loss": 0.3625,
+ "step": 9551
+ },
+ {
+ "epoch": 4.516312056737589,
+ "grad_norm": 3.3781931400299072,
+ "learning_rate": 7.280474512034338e-07,
+ "loss": 0.3759,
+ "step": 9552
+ },
+ {
+ "epoch": 4.516784869976359,
+ "grad_norm": 3.5266852378845215,
+ "learning_rate": 7.27607441002351e-07,
+ "loss": 0.4057,
+ "step": 9553
+ },
+ {
+ "epoch": 4.51725768321513,
+ "grad_norm": 3.290174961090088,
+ "learning_rate": 7.271675411603802e-07,
+ "loss": 0.3471,
+ "step": 9554
+ },
+ {
+ "epoch": 4.5177304964539005,
+ "grad_norm": 3.399919033050537,
+ "learning_rate": 7.267277517049137e-07,
+ "loss": 0.394,
+ "step": 9555
+ },
+ {
+ "epoch": 4.518203309692671,
+ "grad_norm": 3.4410784244537354,
+ "learning_rate": 7.262880726633348e-07,
+ "loss": 0.4351,
+ "step": 9556
+ },
+ {
+ "epoch": 4.518676122931442,
+ "grad_norm": 2.7317543029785156,
+ "learning_rate": 7.258485040630192e-07,
+ "loss": 0.3039,
+ "step": 9557
+ },
+ {
+ "epoch": 4.519148936170213,
+ "grad_norm": 3.769446849822998,
+ "learning_rate": 7.254090459313384e-07,
+ "loss": 0.4061,
+ "step": 9558
+ },
+ {
+ "epoch": 4.519621749408984,
+ "grad_norm": 3.5625245571136475,
+ "learning_rate": 7.249696982956553e-07,
+ "loss": 0.4148,
+ "step": 9559
+ },
+ {
+ "epoch": 4.520094562647754,
+ "grad_norm": 3.1534764766693115,
+ "learning_rate": 7.245304611833248e-07,
+ "loss": 0.3369,
+ "step": 9560
+ },
+ {
+ "epoch": 4.520567375886525,
+ "grad_norm": 3.179197311401367,
+ "learning_rate": 7.240913346216982e-07,
+ "loss": 0.2798,
+ "step": 9561
+ },
+ {
+ "epoch": 4.521040189125295,
+ "grad_norm": 3.2608659267425537,
+ "learning_rate": 7.236523186381162e-07,
+ "loss": 0.3513,
+ "step": 9562
+ },
+ {
+ "epoch": 4.521513002364066,
+ "grad_norm": 3.055513381958008,
+ "learning_rate": 7.232134132599158e-07,
+ "loss": 0.3098,
+ "step": 9563
+ },
+ {
+ "epoch": 4.521985815602837,
+ "grad_norm": 3.159937620162964,
+ "learning_rate": 7.227746185144258e-07,
+ "loss": 0.3234,
+ "step": 9564
+ },
+ {
+ "epoch": 4.5224586288416075,
+ "grad_norm": 3.176802635192871,
+ "learning_rate": 7.22335934428966e-07,
+ "loss": 0.3547,
+ "step": 9565
+ },
+ {
+ "epoch": 4.522931442080378,
+ "grad_norm": 3.476203680038452,
+ "learning_rate": 7.218973610308538e-07,
+ "loss": 0.3659,
+ "step": 9566
+ },
+ {
+ "epoch": 4.523404255319149,
+ "grad_norm": 3.277595043182373,
+ "learning_rate": 7.214588983473964e-07,
+ "loss": 0.3448,
+ "step": 9567
+ },
+ {
+ "epoch": 4.52387706855792,
+ "grad_norm": 3.0068325996398926,
+ "learning_rate": 7.210205464058944e-07,
+ "loss": 0.3341,
+ "step": 9568
+ },
+ {
+ "epoch": 4.52434988179669,
+ "grad_norm": 3.3836655616760254,
+ "learning_rate": 7.205823052336425e-07,
+ "loss": 0.3253,
+ "step": 9569
+ },
+ {
+ "epoch": 4.524822695035461,
+ "grad_norm": 3.4170496463775635,
+ "learning_rate": 7.201441748579271e-07,
+ "loss": 0.3754,
+ "step": 9570
+ },
+ {
+ "epoch": 4.525295508274231,
+ "grad_norm": 3.5432863235473633,
+ "learning_rate": 7.197061553060303e-07,
+ "loss": 0.3265,
+ "step": 9571
+ },
+ {
+ "epoch": 4.525768321513002,
+ "grad_norm": 2.900520086288452,
+ "learning_rate": 7.192682466052243e-07,
+ "loss": 0.2961,
+ "step": 9572
+ },
+ {
+ "epoch": 4.526241134751773,
+ "grad_norm": 3.028733491897583,
+ "learning_rate": 7.188304487827768e-07,
+ "loss": 0.347,
+ "step": 9573
+ },
+ {
+ "epoch": 4.526713947990544,
+ "grad_norm": 2.8739330768585205,
+ "learning_rate": 7.183927618659473e-07,
+ "loss": 0.3265,
+ "step": 9574
+ },
+ {
+ "epoch": 4.527186761229315,
+ "grad_norm": 3.4727251529693604,
+ "learning_rate": 7.179551858819873e-07,
+ "loss": 0.3882,
+ "step": 9575
+ },
+ {
+ "epoch": 4.527659574468085,
+ "grad_norm": 2.950634002685547,
+ "learning_rate": 7.175177208581449e-07,
+ "loss": 0.2699,
+ "step": 9576
+ },
+ {
+ "epoch": 4.528132387706856,
+ "grad_norm": 3.035752773284912,
+ "learning_rate": 7.170803668216572e-07,
+ "loss": 0.3939,
+ "step": 9577
+ },
+ {
+ "epoch": 4.528605200945626,
+ "grad_norm": 2.9155373573303223,
+ "learning_rate": 7.166431237997579e-07,
+ "loss": 0.3112,
+ "step": 9578
+ },
+ {
+ "epoch": 4.529078014184397,
+ "grad_norm": 3.2655560970306396,
+ "learning_rate": 7.162059918196715e-07,
+ "loss": 0.3605,
+ "step": 9579
+ },
+ {
+ "epoch": 4.529550827423168,
+ "grad_norm": 3.0889620780944824,
+ "learning_rate": 7.157689709086157e-07,
+ "loss": 0.3333,
+ "step": 9580
+ },
+ {
+ "epoch": 4.5300236406619385,
+ "grad_norm": 3.193974256515503,
+ "learning_rate": 7.153320610938031e-07,
+ "loss": 0.3206,
+ "step": 9581
+ },
+ {
+ "epoch": 4.530496453900709,
+ "grad_norm": 3.0833280086517334,
+ "learning_rate": 7.148952624024374e-07,
+ "loss": 0.3483,
+ "step": 9582
+ },
+ {
+ "epoch": 4.53096926713948,
+ "grad_norm": 2.8866562843322754,
+ "learning_rate": 7.144585748617163e-07,
+ "loss": 0.3147,
+ "step": 9583
+ },
+ {
+ "epoch": 4.531442080378251,
+ "grad_norm": 3.2411928176879883,
+ "learning_rate": 7.140219984988305e-07,
+ "loss": 0.336,
+ "step": 9584
+ },
+ {
+ "epoch": 4.531914893617021,
+ "grad_norm": 3.0993618965148926,
+ "learning_rate": 7.13585533340963e-07,
+ "loss": 0.3599,
+ "step": 9585
+ },
+ {
+ "epoch": 4.532387706855792,
+ "grad_norm": 3.361176013946533,
+ "learning_rate": 7.131491794152917e-07,
+ "loss": 0.3448,
+ "step": 9586
+ },
+ {
+ "epoch": 4.532860520094562,
+ "grad_norm": 3.0916879177093506,
+ "learning_rate": 7.12712936748986e-07,
+ "loss": 0.3479,
+ "step": 9587
+ },
+ {
+ "epoch": 4.533333333333333,
+ "grad_norm": 3.254135847091675,
+ "learning_rate": 7.122768053692078e-07,
+ "loss": 0.3536,
+ "step": 9588
+ },
+ {
+ "epoch": 4.533806146572104,
+ "grad_norm": 3.120321035385132,
+ "learning_rate": 7.118407853031148e-07,
+ "loss": 0.3604,
+ "step": 9589
+ },
+ {
+ "epoch": 4.534278959810875,
+ "grad_norm": 3.0456507205963135,
+ "learning_rate": 7.114048765778544e-07,
+ "loss": 0.3473,
+ "step": 9590
+ },
+ {
+ "epoch": 4.534751773049646,
+ "grad_norm": 3.7177469730377197,
+ "learning_rate": 7.109690792205704e-07,
+ "loss": 0.374,
+ "step": 9591
+ },
+ {
+ "epoch": 4.535224586288416,
+ "grad_norm": 3.2694458961486816,
+ "learning_rate": 7.105333932583972e-07,
+ "loss": 0.4206,
+ "step": 9592
+ },
+ {
+ "epoch": 4.535697399527187,
+ "grad_norm": 3.506195068359375,
+ "learning_rate": 7.100978187184624e-07,
+ "loss": 0.3483,
+ "step": 9593
+ },
+ {
+ "epoch": 4.536170212765957,
+ "grad_norm": 3.569413661956787,
+ "learning_rate": 7.096623556278887e-07,
+ "loss": 0.3389,
+ "step": 9594
+ },
+ {
+ "epoch": 4.536643026004728,
+ "grad_norm": 3.2686502933502197,
+ "learning_rate": 7.092270040137886e-07,
+ "loss": 0.3571,
+ "step": 9595
+ },
+ {
+ "epoch": 4.537115839243499,
+ "grad_norm": 3.2042582035064697,
+ "learning_rate": 7.087917639032718e-07,
+ "loss": 0.3742,
+ "step": 9596
+ },
+ {
+ "epoch": 4.5375886524822695,
+ "grad_norm": 3.014989137649536,
+ "learning_rate": 7.083566353234375e-07,
+ "loss": 0.3294,
+ "step": 9597
+ },
+ {
+ "epoch": 4.53806146572104,
+ "grad_norm": 3.4535064697265625,
+ "learning_rate": 7.079216183013793e-07,
+ "loss": 0.3434,
+ "step": 9598
+ },
+ {
+ "epoch": 4.538534278959811,
+ "grad_norm": 3.123633623123169,
+ "learning_rate": 7.074867128641841e-07,
+ "loss": 0.357,
+ "step": 9599
+ },
+ {
+ "epoch": 4.539007092198582,
+ "grad_norm": 3.0646567344665527,
+ "learning_rate": 7.070519190389305e-07,
+ "loss": 0.3488,
+ "step": 9600
+ },
+ {
+ "epoch": 4.539479905437352,
+ "grad_norm": 2.951892852783203,
+ "learning_rate": 7.066172368526927e-07,
+ "loss": 0.329,
+ "step": 9601
+ },
+ {
+ "epoch": 4.539952718676123,
+ "grad_norm": 2.8071751594543457,
+ "learning_rate": 7.061826663325361e-07,
+ "loss": 0.2788,
+ "step": 9602
+ },
+ {
+ "epoch": 4.540425531914893,
+ "grad_norm": 3.9670250415802,
+ "learning_rate": 7.057482075055183e-07,
+ "loss": 0.3776,
+ "step": 9603
+ },
+ {
+ "epoch": 4.540898345153664,
+ "grad_norm": 2.683743476867676,
+ "learning_rate": 7.053138603986928e-07,
+ "loss": 0.3044,
+ "step": 9604
+ },
+ {
+ "epoch": 4.541371158392435,
+ "grad_norm": 2.9766221046447754,
+ "learning_rate": 7.048796250391038e-07,
+ "loss": 0.3542,
+ "step": 9605
+ },
+ {
+ "epoch": 4.541843971631206,
+ "grad_norm": 3.1156277656555176,
+ "learning_rate": 7.044455014537882e-07,
+ "loss": 0.3401,
+ "step": 9606
+ },
+ {
+ "epoch": 4.542316784869977,
+ "grad_norm": 2.8444416522979736,
+ "learning_rate": 7.040114896697789e-07,
+ "loss": 0.3437,
+ "step": 9607
+ },
+ {
+ "epoch": 4.542789598108747,
+ "grad_norm": 2.9964232444763184,
+ "learning_rate": 7.035775897140984e-07,
+ "loss": 0.331,
+ "step": 9608
+ },
+ {
+ "epoch": 4.543262411347518,
+ "grad_norm": 3.511500597000122,
+ "learning_rate": 7.031438016137648e-07,
+ "loss": 0.3685,
+ "step": 9609
+ },
+ {
+ "epoch": 4.543735224586288,
+ "grad_norm": 3.541271686553955,
+ "learning_rate": 7.027101253957877e-07,
+ "loss": 0.3945,
+ "step": 9610
+ },
+ {
+ "epoch": 4.544208037825059,
+ "grad_norm": 3.1483919620513916,
+ "learning_rate": 7.022765610871696e-07,
+ "loss": 0.3681,
+ "step": 9611
+ },
+ {
+ "epoch": 4.54468085106383,
+ "grad_norm": 2.908977508544922,
+ "learning_rate": 7.01843108714908e-07,
+ "loss": 0.3468,
+ "step": 9612
+ },
+ {
+ "epoch": 4.5451536643026005,
+ "grad_norm": 3.3107962608337402,
+ "learning_rate": 7.014097683059912e-07,
+ "loss": 0.419,
+ "step": 9613
+ },
+ {
+ "epoch": 4.545626477541371,
+ "grad_norm": 3.5597898960113525,
+ "learning_rate": 7.009765398874008e-07,
+ "loss": 0.3238,
+ "step": 9614
+ },
+ {
+ "epoch": 4.546099290780142,
+ "grad_norm": 3.091235399246216,
+ "learning_rate": 7.005434234861136e-07,
+ "loss": 0.3632,
+ "step": 9615
+ },
+ {
+ "epoch": 4.546572104018913,
+ "grad_norm": 3.279076099395752,
+ "learning_rate": 7.001104191290972e-07,
+ "loss": 0.4006,
+ "step": 9616
+ },
+ {
+ "epoch": 4.547044917257683,
+ "grad_norm": 3.3877902030944824,
+ "learning_rate": 6.996775268433126e-07,
+ "loss": 0.4183,
+ "step": 9617
+ },
+ {
+ "epoch": 4.547517730496454,
+ "grad_norm": 2.979999542236328,
+ "learning_rate": 6.992447466557134e-07,
+ "loss": 0.2921,
+ "step": 9618
+ },
+ {
+ "epoch": 4.547990543735224,
+ "grad_norm": 3.196361780166626,
+ "learning_rate": 6.988120785932484e-07,
+ "loss": 0.3352,
+ "step": 9619
+ },
+ {
+ "epoch": 4.548463356973995,
+ "grad_norm": 3.3237528800964355,
+ "learning_rate": 6.983795226828577e-07,
+ "loss": 0.3487,
+ "step": 9620
+ },
+ {
+ "epoch": 4.548936170212766,
+ "grad_norm": 3.0740649700164795,
+ "learning_rate": 6.979470789514731e-07,
+ "loss": 0.3497,
+ "step": 9621
+ },
+ {
+ "epoch": 4.549408983451537,
+ "grad_norm": 3.3443479537963867,
+ "learning_rate": 6.97514747426023e-07,
+ "loss": 0.3752,
+ "step": 9622
+ },
+ {
+ "epoch": 4.549881796690308,
+ "grad_norm": 3.450427293777466,
+ "learning_rate": 6.970825281334254e-07,
+ "loss": 0.3981,
+ "step": 9623
+ },
+ {
+ "epoch": 4.550354609929078,
+ "grad_norm": 3.4733047485351562,
+ "learning_rate": 6.966504211005937e-07,
+ "loss": 0.3953,
+ "step": 9624
+ },
+ {
+ "epoch": 4.550827423167849,
+ "grad_norm": 3.1651546955108643,
+ "learning_rate": 6.962184263544328e-07,
+ "loss": 0.3012,
+ "step": 9625
+ },
+ {
+ "epoch": 4.551300236406619,
+ "grad_norm": 2.9222865104675293,
+ "learning_rate": 6.957865439218405e-07,
+ "loss": 0.2774,
+ "step": 9626
+ },
+ {
+ "epoch": 4.55177304964539,
+ "grad_norm": 2.972437620162964,
+ "learning_rate": 6.953547738297095e-07,
+ "loss": 0.3478,
+ "step": 9627
+ },
+ {
+ "epoch": 4.552245862884161,
+ "grad_norm": 3.2741193771362305,
+ "learning_rate": 6.949231161049239e-07,
+ "loss": 0.355,
+ "step": 9628
+ },
+ {
+ "epoch": 4.5527186761229315,
+ "grad_norm": 2.8715150356292725,
+ "learning_rate": 6.9449157077436e-07,
+ "loss": 0.3055,
+ "step": 9629
+ },
+ {
+ "epoch": 4.553191489361702,
+ "grad_norm": 4.50998592376709,
+ "learning_rate": 6.940601378648895e-07,
+ "loss": 0.3732,
+ "step": 9630
+ },
+ {
+ "epoch": 4.553664302600473,
+ "grad_norm": 2.9277849197387695,
+ "learning_rate": 6.936288174033757e-07,
+ "loss": 0.3367,
+ "step": 9631
+ },
+ {
+ "epoch": 4.554137115839244,
+ "grad_norm": 3.169978380203247,
+ "learning_rate": 6.931976094166746e-07,
+ "loss": 0.3529,
+ "step": 9632
+ },
+ {
+ "epoch": 4.554609929078014,
+ "grad_norm": 2.9629712104797363,
+ "learning_rate": 6.927665139316359e-07,
+ "loss": 0.3416,
+ "step": 9633
+ },
+ {
+ "epoch": 4.555082742316785,
+ "grad_norm": 3.1368603706359863,
+ "learning_rate": 6.923355309751012e-07,
+ "loss": 0.3267,
+ "step": 9634
+ },
+ {
+ "epoch": 4.555555555555555,
+ "grad_norm": 2.9895052909851074,
+ "learning_rate": 6.919046605739071e-07,
+ "loss": 0.3411,
+ "step": 9635
+ },
+ {
+ "epoch": 4.556028368794326,
+ "grad_norm": 3.1592509746551514,
+ "learning_rate": 6.914739027548809e-07,
+ "loss": 0.3488,
+ "step": 9636
+ },
+ {
+ "epoch": 4.556501182033097,
+ "grad_norm": 3.0848731994628906,
+ "learning_rate": 6.910432575448456e-07,
+ "loss": 0.3732,
+ "step": 9637
+ },
+ {
+ "epoch": 4.556973995271868,
+ "grad_norm": 3.1475934982299805,
+ "learning_rate": 6.906127249706143e-07,
+ "loss": 0.3525,
+ "step": 9638
+ },
+ {
+ "epoch": 4.5574468085106385,
+ "grad_norm": 2.9435455799102783,
+ "learning_rate": 6.90182305058994e-07,
+ "loss": 0.3155,
+ "step": 9639
+ },
+ {
+ "epoch": 4.557919621749409,
+ "grad_norm": 3.4412894248962402,
+ "learning_rate": 6.897519978367867e-07,
+ "loss": 0.3511,
+ "step": 9640
+ },
+ {
+ "epoch": 4.55839243498818,
+ "grad_norm": 3.3600406646728516,
+ "learning_rate": 6.893218033307838e-07,
+ "loss": 0.4311,
+ "step": 9641
+ },
+ {
+ "epoch": 4.55886524822695,
+ "grad_norm": 3.35927414894104,
+ "learning_rate": 6.888917215677734e-07,
+ "loss": 0.387,
+ "step": 9642
+ },
+ {
+ "epoch": 4.559338061465721,
+ "grad_norm": 3.2481210231781006,
+ "learning_rate": 6.884617525745343e-07,
+ "loss": 0.3456,
+ "step": 9643
+ },
+ {
+ "epoch": 4.559810874704492,
+ "grad_norm": 3.661160469055176,
+ "learning_rate": 6.880318963778374e-07,
+ "loss": 0.4276,
+ "step": 9644
+ },
+ {
+ "epoch": 4.560283687943262,
+ "grad_norm": 3.038726806640625,
+ "learning_rate": 6.876021530044502e-07,
+ "loss": 0.3288,
+ "step": 9645
+ },
+ {
+ "epoch": 4.560756501182033,
+ "grad_norm": 3.0502963066101074,
+ "learning_rate": 6.871725224811296e-07,
+ "loss": 0.3334,
+ "step": 9646
+ },
+ {
+ "epoch": 4.561229314420804,
+ "grad_norm": 3.1810805797576904,
+ "learning_rate": 6.867430048346268e-07,
+ "loss": 0.3335,
+ "step": 9647
+ },
+ {
+ "epoch": 4.561702127659575,
+ "grad_norm": 3.028670072555542,
+ "learning_rate": 6.863136000916864e-07,
+ "loss": 0.3235,
+ "step": 9648
+ },
+ {
+ "epoch": 4.562174940898345,
+ "grad_norm": 2.805989980697632,
+ "learning_rate": 6.858843082790447e-07,
+ "loss": 0.3201,
+ "step": 9649
+ },
+ {
+ "epoch": 4.5626477541371155,
+ "grad_norm": 3.0792744159698486,
+ "learning_rate": 6.854551294234333e-07,
+ "loss": 0.3757,
+ "step": 9650
+ },
+ {
+ "epoch": 4.563120567375886,
+ "grad_norm": 3.115539312362671,
+ "learning_rate": 6.850260635515735e-07,
+ "loss": 0.3311,
+ "step": 9651
+ },
+ {
+ "epoch": 4.563593380614657,
+ "grad_norm": 3.003520965576172,
+ "learning_rate": 6.845971106901831e-07,
+ "loss": 0.35,
+ "step": 9652
+ },
+ {
+ "epoch": 4.564066193853428,
+ "grad_norm": 2.954759359359741,
+ "learning_rate": 6.841682708659702e-07,
+ "loss": 0.279,
+ "step": 9653
+ },
+ {
+ "epoch": 4.564539007092199,
+ "grad_norm": 3.1510894298553467,
+ "learning_rate": 6.83739544105636e-07,
+ "loss": 0.333,
+ "step": 9654
+ },
+ {
+ "epoch": 4.5650118203309695,
+ "grad_norm": 3.3958635330200195,
+ "learning_rate": 6.833109304358776e-07,
+ "loss": 0.3668,
+ "step": 9655
+ },
+ {
+ "epoch": 4.56548463356974,
+ "grad_norm": 3.493522882461548,
+ "learning_rate": 6.828824298833811e-07,
+ "loss": 0.3522,
+ "step": 9656
+ },
+ {
+ "epoch": 4.565957446808511,
+ "grad_norm": 3.217268705368042,
+ "learning_rate": 6.824540424748275e-07,
+ "loss": 0.4066,
+ "step": 9657
+ },
+ {
+ "epoch": 4.566430260047281,
+ "grad_norm": 3.148505210876465,
+ "learning_rate": 6.820257682368914e-07,
+ "loss": 0.3252,
+ "step": 9658
+ },
+ {
+ "epoch": 4.566903073286052,
+ "grad_norm": 3.070316791534424,
+ "learning_rate": 6.815976071962385e-07,
+ "loss": 0.3362,
+ "step": 9659
+ },
+ {
+ "epoch": 4.567375886524823,
+ "grad_norm": 3.0421791076660156,
+ "learning_rate": 6.811695593795301e-07,
+ "loss": 0.3894,
+ "step": 9660
+ },
+ {
+ "epoch": 4.567848699763593,
+ "grad_norm": 2.9165565967559814,
+ "learning_rate": 6.807416248134177e-07,
+ "loss": 0.3147,
+ "step": 9661
+ },
+ {
+ "epoch": 4.568321513002364,
+ "grad_norm": 3.361647129058838,
+ "learning_rate": 6.803138035245471e-07,
+ "loss": 0.3346,
+ "step": 9662
+ },
+ {
+ "epoch": 4.568794326241135,
+ "grad_norm": 3.0013155937194824,
+ "learning_rate": 6.79886095539557e-07,
+ "loss": 0.3476,
+ "step": 9663
+ },
+ {
+ "epoch": 4.569267139479906,
+ "grad_norm": 2.9030165672302246,
+ "learning_rate": 6.794585008850779e-07,
+ "loss": 0.3118,
+ "step": 9664
+ },
+ {
+ "epoch": 4.569739952718676,
+ "grad_norm": 3.229907989501953,
+ "learning_rate": 6.790310195877361e-07,
+ "loss": 0.3257,
+ "step": 9665
+ },
+ {
+ "epoch": 4.5702127659574465,
+ "grad_norm": 3.4075570106506348,
+ "learning_rate": 6.786036516741479e-07,
+ "loss": 0.3545,
+ "step": 9666
+ },
+ {
+ "epoch": 4.570685579196217,
+ "grad_norm": 2.9831581115722656,
+ "learning_rate": 6.781763971709229e-07,
+ "loss": 0.3173,
+ "step": 9667
+ },
+ {
+ "epoch": 4.571158392434988,
+ "grad_norm": 3.8512840270996094,
+ "learning_rate": 6.777492561046659e-07,
+ "loss": 0.288,
+ "step": 9668
+ },
+ {
+ "epoch": 4.571631205673759,
+ "grad_norm": 3.3054401874542236,
+ "learning_rate": 6.773222285019718e-07,
+ "loss": 0.369,
+ "step": 9669
+ },
+ {
+ "epoch": 4.57210401891253,
+ "grad_norm": 2.9155004024505615,
+ "learning_rate": 6.768953143894308e-07,
+ "loss": 0.3334,
+ "step": 9670
+ },
+ {
+ "epoch": 4.5725768321513005,
+ "grad_norm": 3.60557222366333,
+ "learning_rate": 6.764685137936247e-07,
+ "loss": 0.4094,
+ "step": 9671
+ },
+ {
+ "epoch": 4.573049645390071,
+ "grad_norm": 3.271256446838379,
+ "learning_rate": 6.760418267411275e-07,
+ "loss": 0.3646,
+ "step": 9672
+ },
+ {
+ "epoch": 4.573522458628842,
+ "grad_norm": 2.970238447189331,
+ "learning_rate": 6.756152532585086e-07,
+ "loss": 0.34,
+ "step": 9673
+ },
+ {
+ "epoch": 4.573995271867612,
+ "grad_norm": 3.412712574005127,
+ "learning_rate": 6.751887933723277e-07,
+ "loss": 0.3674,
+ "step": 9674
+ },
+ {
+ "epoch": 4.574468085106383,
+ "grad_norm": 2.9984517097473145,
+ "learning_rate": 6.747624471091396e-07,
+ "loss": 0.3579,
+ "step": 9675
+ },
+ {
+ "epoch": 4.574940898345154,
+ "grad_norm": 2.863788366317749,
+ "learning_rate": 6.743362144954907e-07,
+ "loss": 0.3234,
+ "step": 9676
+ },
+ {
+ "epoch": 4.575413711583924,
+ "grad_norm": 3.313793897628784,
+ "learning_rate": 6.739100955579203e-07,
+ "loss": 0.334,
+ "step": 9677
+ },
+ {
+ "epoch": 4.575886524822695,
+ "grad_norm": 3.5350630283355713,
+ "learning_rate": 6.734840903229611e-07,
+ "loss": 0.3682,
+ "step": 9678
+ },
+ {
+ "epoch": 4.576359338061466,
+ "grad_norm": 3.531888723373413,
+ "learning_rate": 6.730581988171378e-07,
+ "loss": 0.3434,
+ "step": 9679
+ },
+ {
+ "epoch": 4.576832151300237,
+ "grad_norm": 3.358574867248535,
+ "learning_rate": 6.726324210669702e-07,
+ "loss": 0.3751,
+ "step": 9680
+ },
+ {
+ "epoch": 4.577304964539007,
+ "grad_norm": 2.9723873138427734,
+ "learning_rate": 6.722067570989691e-07,
+ "loss": 0.3077,
+ "step": 9681
+ },
+ {
+ "epoch": 4.5777777777777775,
+ "grad_norm": 3.2287187576293945,
+ "learning_rate": 6.717812069396379e-07,
+ "loss": 0.3493,
+ "step": 9682
+ },
+ {
+ "epoch": 4.578250591016548,
+ "grad_norm": 2.9089417457580566,
+ "learning_rate": 6.71355770615475e-07,
+ "loss": 0.324,
+ "step": 9683
+ },
+ {
+ "epoch": 4.578723404255319,
+ "grad_norm": 3.2894415855407715,
+ "learning_rate": 6.709304481529703e-07,
+ "loss": 0.3066,
+ "step": 9684
+ },
+ {
+ "epoch": 4.57919621749409,
+ "grad_norm": 3.1914620399475098,
+ "learning_rate": 6.705052395786052e-07,
+ "loss": 0.3453,
+ "step": 9685
+ },
+ {
+ "epoch": 4.579669030732861,
+ "grad_norm": 3.1095924377441406,
+ "learning_rate": 6.700801449188577e-07,
+ "loss": 0.3678,
+ "step": 9686
+ },
+ {
+ "epoch": 4.5801418439716315,
+ "grad_norm": 3.416944980621338,
+ "learning_rate": 6.696551642001948e-07,
+ "loss": 0.3754,
+ "step": 9687
+ },
+ {
+ "epoch": 4.580614657210402,
+ "grad_norm": 3.7102952003479004,
+ "learning_rate": 6.692302974490797e-07,
+ "loss": 0.3723,
+ "step": 9688
+ },
+ {
+ "epoch": 4.581087470449172,
+ "grad_norm": 3.296607494354248,
+ "learning_rate": 6.688055446919664e-07,
+ "loss": 0.3607,
+ "step": 9689
+ },
+ {
+ "epoch": 4.581560283687943,
+ "grad_norm": 3.4449238777160645,
+ "learning_rate": 6.683809059553014e-07,
+ "loss": 0.3102,
+ "step": 9690
+ },
+ {
+ "epoch": 4.582033096926714,
+ "grad_norm": 3.202671766281128,
+ "learning_rate": 6.679563812655268e-07,
+ "loss": 0.3535,
+ "step": 9691
+ },
+ {
+ "epoch": 4.582505910165485,
+ "grad_norm": 3.25919771194458,
+ "learning_rate": 6.675319706490744e-07,
+ "loss": 0.3778,
+ "step": 9692
+ },
+ {
+ "epoch": 4.582978723404255,
+ "grad_norm": 3.316021680831909,
+ "learning_rate": 6.671076741323718e-07,
+ "loss": 0.2943,
+ "step": 9693
+ },
+ {
+ "epoch": 4.583451536643026,
+ "grad_norm": 3.2375826835632324,
+ "learning_rate": 6.666834917418371e-07,
+ "loss": 0.3249,
+ "step": 9694
+ },
+ {
+ "epoch": 4.583924349881797,
+ "grad_norm": 2.875436782836914,
+ "learning_rate": 6.662594235038827e-07,
+ "loss": 0.2991,
+ "step": 9695
+ },
+ {
+ "epoch": 4.584397163120567,
+ "grad_norm": 3.75874924659729,
+ "learning_rate": 6.658354694449134e-07,
+ "loss": 0.3718,
+ "step": 9696
+ },
+ {
+ "epoch": 4.584869976359338,
+ "grad_norm": 3.060943126678467,
+ "learning_rate": 6.65411629591326e-07,
+ "loss": 0.3183,
+ "step": 9697
+ },
+ {
+ "epoch": 4.5853427895981085,
+ "grad_norm": 3.024336576461792,
+ "learning_rate": 6.649879039695126e-07,
+ "loss": 0.3118,
+ "step": 9698
+ },
+ {
+ "epoch": 4.585815602836879,
+ "grad_norm": 3.3640875816345215,
+ "learning_rate": 6.645642926058562e-07,
+ "loss": 0.3408,
+ "step": 9699
+ },
+ {
+ "epoch": 4.58628841607565,
+ "grad_norm": 2.8885910511016846,
+ "learning_rate": 6.641407955267326e-07,
+ "loss": 0.3304,
+ "step": 9700
+ },
+ {
+ "epoch": 4.586761229314421,
+ "grad_norm": 3.8225393295288086,
+ "learning_rate": 6.637174127585122e-07,
+ "loss": 0.3469,
+ "step": 9701
+ },
+ {
+ "epoch": 4.587234042553192,
+ "grad_norm": 3.0624778270721436,
+ "learning_rate": 6.632941443275567e-07,
+ "loss": 0.3177,
+ "step": 9702
+ },
+ {
+ "epoch": 4.5877068557919625,
+ "grad_norm": 3.1422903537750244,
+ "learning_rate": 6.628709902602204e-07,
+ "loss": 0.3205,
+ "step": 9703
+ },
+ {
+ "epoch": 4.588179669030733,
+ "grad_norm": 3.1315362453460693,
+ "learning_rate": 6.62447950582853e-07,
+ "loss": 0.3443,
+ "step": 9704
+ },
+ {
+ "epoch": 4.588652482269503,
+ "grad_norm": 3.096041202545166,
+ "learning_rate": 6.62025025321793e-07,
+ "loss": 0.3567,
+ "step": 9705
+ },
+ {
+ "epoch": 4.589125295508274,
+ "grad_norm": 3.225820302963257,
+ "learning_rate": 6.616022145033766e-07,
+ "loss": 0.3873,
+ "step": 9706
+ },
+ {
+ "epoch": 4.589598108747045,
+ "grad_norm": 3.3879058361053467,
+ "learning_rate": 6.611795181539288e-07,
+ "loss": 0.4379,
+ "step": 9707
+ },
+ {
+ "epoch": 4.590070921985816,
+ "grad_norm": 3.508265733718872,
+ "learning_rate": 6.60756936299769e-07,
+ "loss": 0.3182,
+ "step": 9708
+ },
+ {
+ "epoch": 4.590543735224586,
+ "grad_norm": 3.278857946395874,
+ "learning_rate": 6.603344689672106e-07,
+ "loss": 0.3508,
+ "step": 9709
+ },
+ {
+ "epoch": 4.591016548463357,
+ "grad_norm": 2.9961371421813965,
+ "learning_rate": 6.599121161825581e-07,
+ "loss": 0.3178,
+ "step": 9710
+ },
+ {
+ "epoch": 4.591489361702128,
+ "grad_norm": 3.413717269897461,
+ "learning_rate": 6.594898779721092e-07,
+ "loss": 0.363,
+ "step": 9711
+ },
+ {
+ "epoch": 4.591962174940898,
+ "grad_norm": 3.2014074325561523,
+ "learning_rate": 6.590677543621557e-07,
+ "loss": 0.392,
+ "step": 9712
+ },
+ {
+ "epoch": 4.592434988179669,
+ "grad_norm": 3.0421640872955322,
+ "learning_rate": 6.586457453789802e-07,
+ "loss": 0.3119,
+ "step": 9713
+ },
+ {
+ "epoch": 4.5929078014184395,
+ "grad_norm": 3.0515928268432617,
+ "learning_rate": 6.582238510488604e-07,
+ "loss": 0.3021,
+ "step": 9714
+ },
+ {
+ "epoch": 4.59338061465721,
+ "grad_norm": 3.0824668407440186,
+ "learning_rate": 6.578020713980648e-07,
+ "loss": 0.3551,
+ "step": 9715
+ },
+ {
+ "epoch": 4.593853427895981,
+ "grad_norm": 3.0002171993255615,
+ "learning_rate": 6.573804064528574e-07,
+ "loss": 0.3691,
+ "step": 9716
+ },
+ {
+ "epoch": 4.594326241134752,
+ "grad_norm": 3.0174765586853027,
+ "learning_rate": 6.569588562394924e-07,
+ "loss": 0.3289,
+ "step": 9717
+ },
+ {
+ "epoch": 4.594799054373523,
+ "grad_norm": 4.098819732666016,
+ "learning_rate": 6.565374207842171e-07,
+ "loss": 0.3637,
+ "step": 9718
+ },
+ {
+ "epoch": 4.5952718676122934,
+ "grad_norm": 3.396275281906128,
+ "learning_rate": 6.561161001132737e-07,
+ "loss": 0.348,
+ "step": 9719
+ },
+ {
+ "epoch": 4.595744680851064,
+ "grad_norm": 3.6430864334106445,
+ "learning_rate": 6.556948942528952e-07,
+ "loss": 0.3543,
+ "step": 9720
+ },
+ {
+ "epoch": 4.596217494089834,
+ "grad_norm": 3.170236587524414,
+ "learning_rate": 6.552738032293093e-07,
+ "loss": 0.3565,
+ "step": 9721
+ },
+ {
+ "epoch": 4.596690307328605,
+ "grad_norm": 3.402683734893799,
+ "learning_rate": 6.548528270687349e-07,
+ "loss": 0.4001,
+ "step": 9722
+ },
+ {
+ "epoch": 4.597163120567376,
+ "grad_norm": 2.861463785171509,
+ "learning_rate": 6.544319657973833e-07,
+ "loss": 0.3436,
+ "step": 9723
+ },
+ {
+ "epoch": 4.5976359338061465,
+ "grad_norm": 3.209259510040283,
+ "learning_rate": 6.540112194414613e-07,
+ "loss": 0.3317,
+ "step": 9724
+ },
+ {
+ "epoch": 4.598108747044917,
+ "grad_norm": 3.099533796310425,
+ "learning_rate": 6.535905880271662e-07,
+ "loss": 0.3416,
+ "step": 9725
+ },
+ {
+ "epoch": 4.598581560283688,
+ "grad_norm": 3.3558053970336914,
+ "learning_rate": 6.531700715806891e-07,
+ "loss": 0.3567,
+ "step": 9726
+ },
+ {
+ "epoch": 4.599054373522459,
+ "grad_norm": 3.1330227851867676,
+ "learning_rate": 6.527496701282135e-07,
+ "loss": 0.3025,
+ "step": 9727
+ },
+ {
+ "epoch": 4.599527186761229,
+ "grad_norm": 3.140184164047241,
+ "learning_rate": 6.523293836959152e-07,
+ "loss": 0.3195,
+ "step": 9728
+ },
+ {
+ "epoch": 4.6,
+ "grad_norm": 3.246844530105591,
+ "learning_rate": 6.519092123099652e-07,
+ "loss": 0.3367,
+ "step": 9729
+ },
+ {
+ "epoch": 4.60047281323877,
+ "grad_norm": 3.1590709686279297,
+ "learning_rate": 6.51489155996525e-07,
+ "loss": 0.367,
+ "step": 9730
+ },
+ {
+ "epoch": 4.600945626477541,
+ "grad_norm": 3.122746467590332,
+ "learning_rate": 6.510692147817488e-07,
+ "loss": 0.3401,
+ "step": 9731
+ },
+ {
+ "epoch": 4.601418439716312,
+ "grad_norm": 3.0418715476989746,
+ "learning_rate": 6.506493886917859e-07,
+ "loss": 0.2958,
+ "step": 9732
+ },
+ {
+ "epoch": 4.601891252955083,
+ "grad_norm": 3.06303334236145,
+ "learning_rate": 6.502296777527756e-07,
+ "loss": 0.3459,
+ "step": 9733
+ },
+ {
+ "epoch": 4.602364066193854,
+ "grad_norm": 3.0807206630706787,
+ "learning_rate": 6.498100819908532e-07,
+ "loss": 0.3473,
+ "step": 9734
+ },
+ {
+ "epoch": 4.602836879432624,
+ "grad_norm": 3.790008306503296,
+ "learning_rate": 6.493906014321441e-07,
+ "loss": 0.3541,
+ "step": 9735
+ },
+ {
+ "epoch": 4.603309692671395,
+ "grad_norm": 3.3040049076080322,
+ "learning_rate": 6.489712361027667e-07,
+ "loss": 0.3317,
+ "step": 9736
+ },
+ {
+ "epoch": 4.603782505910165,
+ "grad_norm": 3.3145735263824463,
+ "learning_rate": 6.485519860288347e-07,
+ "loss": 0.3351,
+ "step": 9737
+ },
+ {
+ "epoch": 4.604255319148936,
+ "grad_norm": 3.1374423503875732,
+ "learning_rate": 6.481328512364515e-07,
+ "loss": 0.3544,
+ "step": 9738
+ },
+ {
+ "epoch": 4.604728132387707,
+ "grad_norm": 3.3598453998565674,
+ "learning_rate": 6.477138317517162e-07,
+ "loss": 0.4219,
+ "step": 9739
+ },
+ {
+ "epoch": 4.6052009456264775,
+ "grad_norm": 3.227466583251953,
+ "learning_rate": 6.472949276007187e-07,
+ "loss": 0.3179,
+ "step": 9740
+ },
+ {
+ "epoch": 4.605673758865248,
+ "grad_norm": 2.9815897941589355,
+ "learning_rate": 6.46876138809542e-07,
+ "loss": 0.3753,
+ "step": 9741
+ },
+ {
+ "epoch": 4.606146572104019,
+ "grad_norm": 3.072967290878296,
+ "learning_rate": 6.464574654042624e-07,
+ "loss": 0.3288,
+ "step": 9742
+ },
+ {
+ "epoch": 4.60661938534279,
+ "grad_norm": 3.695613145828247,
+ "learning_rate": 6.460389074109482e-07,
+ "loss": 0.3305,
+ "step": 9743
+ },
+ {
+ "epoch": 4.60709219858156,
+ "grad_norm": 3.205684185028076,
+ "learning_rate": 6.456204648556628e-07,
+ "loss": 0.3305,
+ "step": 9744
+ },
+ {
+ "epoch": 4.607565011820331,
+ "grad_norm": 3.216615915298462,
+ "learning_rate": 6.452021377644596e-07,
+ "loss": 0.3416,
+ "step": 9745
+ },
+ {
+ "epoch": 4.608037825059101,
+ "grad_norm": 3.2224013805389404,
+ "learning_rate": 6.447839261633856e-07,
+ "loss": 0.3773,
+ "step": 9746
+ },
+ {
+ "epoch": 4.608510638297872,
+ "grad_norm": 3.2811145782470703,
+ "learning_rate": 6.443658300784824e-07,
+ "loss": 0.3292,
+ "step": 9747
+ },
+ {
+ "epoch": 4.608983451536643,
+ "grad_norm": 3.8610804080963135,
+ "learning_rate": 6.439478495357815e-07,
+ "loss": 0.3975,
+ "step": 9748
+ },
+ {
+ "epoch": 4.609456264775414,
+ "grad_norm": 3.2154266834259033,
+ "learning_rate": 6.435299845613102e-07,
+ "loss": 0.3367,
+ "step": 9749
+ },
+ {
+ "epoch": 4.609929078014185,
+ "grad_norm": 3.18072509765625,
+ "learning_rate": 6.431122351810862e-07,
+ "loss": 0.3972,
+ "step": 9750
+ },
+ {
+ "epoch": 4.610401891252955,
+ "grad_norm": 3.513521194458008,
+ "learning_rate": 6.426946014211205e-07,
+ "loss": 0.374,
+ "step": 9751
+ },
+ {
+ "epoch": 4.610874704491726,
+ "grad_norm": 3.2900753021240234,
+ "learning_rate": 6.422770833074188e-07,
+ "loss": 0.3823,
+ "step": 9752
+ },
+ {
+ "epoch": 4.611347517730496,
+ "grad_norm": 2.791400194168091,
+ "learning_rate": 6.418596808659772e-07,
+ "loss": 0.3187,
+ "step": 9753
+ },
+ {
+ "epoch": 4.611820330969267,
+ "grad_norm": 3.042336940765381,
+ "learning_rate": 6.414423941227846e-07,
+ "loss": 0.3832,
+ "step": 9754
+ },
+ {
+ "epoch": 4.612293144208038,
+ "grad_norm": 3.130197286605835,
+ "learning_rate": 6.410252231038255e-07,
+ "loss": 0.3152,
+ "step": 9755
+ },
+ {
+ "epoch": 4.6127659574468085,
+ "grad_norm": 3.28125262260437,
+ "learning_rate": 6.406081678350745e-07,
+ "loss": 0.3082,
+ "step": 9756
+ },
+ {
+ "epoch": 4.613238770685579,
+ "grad_norm": 3.5695526599884033,
+ "learning_rate": 6.401912283424988e-07,
+ "loss": 0.4303,
+ "step": 9757
+ },
+ {
+ "epoch": 4.61371158392435,
+ "grad_norm": 2.9045464992523193,
+ "learning_rate": 6.397744046520612e-07,
+ "loss": 0.3392,
+ "step": 9758
+ },
+ {
+ "epoch": 4.614184397163121,
+ "grad_norm": 3.4325780868530273,
+ "learning_rate": 6.393576967897145e-07,
+ "loss": 0.3446,
+ "step": 9759
+ },
+ {
+ "epoch": 4.614657210401891,
+ "grad_norm": 3.1146414279937744,
+ "learning_rate": 6.389411047814053e-07,
+ "loss": 0.3444,
+ "step": 9760
+ },
+ {
+ "epoch": 4.615130023640662,
+ "grad_norm": 3.9922995567321777,
+ "learning_rate": 6.385246286530722e-07,
+ "loss": 0.3431,
+ "step": 9761
+ },
+ {
+ "epoch": 4.615602836879432,
+ "grad_norm": 2.868818759918213,
+ "learning_rate": 6.381082684306491e-07,
+ "loss": 0.2819,
+ "step": 9762
+ },
+ {
+ "epoch": 4.616075650118203,
+ "grad_norm": 3.1957287788391113,
+ "learning_rate": 6.376920241400597e-07,
+ "loss": 0.315,
+ "step": 9763
+ },
+ {
+ "epoch": 4.616548463356974,
+ "grad_norm": 3.327913999557495,
+ "learning_rate": 6.372758958072215e-07,
+ "loss": 0.3224,
+ "step": 9764
+ },
+ {
+ "epoch": 4.617021276595745,
+ "grad_norm": 3.2451798915863037,
+ "learning_rate": 6.368598834580461e-07,
+ "loss": 0.3219,
+ "step": 9765
+ },
+ {
+ "epoch": 4.617494089834516,
+ "grad_norm": 3.328977346420288,
+ "learning_rate": 6.364439871184355e-07,
+ "loss": 0.3123,
+ "step": 9766
+ },
+ {
+ "epoch": 4.617966903073286,
+ "grad_norm": 2.929624557495117,
+ "learning_rate": 6.36028206814287e-07,
+ "loss": 0.3137,
+ "step": 9767
+ },
+ {
+ "epoch": 4.618439716312057,
+ "grad_norm": 3.2356855869293213,
+ "learning_rate": 6.356125425714888e-07,
+ "loss": 0.3672,
+ "step": 9768
+ },
+ {
+ "epoch": 4.618912529550827,
+ "grad_norm": 3.099452018737793,
+ "learning_rate": 6.351969944159217e-07,
+ "loss": 0.3875,
+ "step": 9769
+ },
+ {
+ "epoch": 4.619385342789598,
+ "grad_norm": 4.037657260894775,
+ "learning_rate": 6.347815623734616e-07,
+ "loss": 0.3984,
+ "step": 9770
+ },
+ {
+ "epoch": 4.619858156028369,
+ "grad_norm": 3.350639581680298,
+ "learning_rate": 6.343662464699743e-07,
+ "loss": 0.3325,
+ "step": 9771
+ },
+ {
+ "epoch": 4.6203309692671395,
+ "grad_norm": 3.3933796882629395,
+ "learning_rate": 6.339510467313206e-07,
+ "loss": 0.3922,
+ "step": 9772
+ },
+ {
+ "epoch": 4.62080378250591,
+ "grad_norm": 2.8599045276641846,
+ "learning_rate": 6.335359631833532e-07,
+ "loss": 0.2677,
+ "step": 9773
+ },
+ {
+ "epoch": 4.621276595744681,
+ "grad_norm": 3.0792534351348877,
+ "learning_rate": 6.331209958519172e-07,
+ "loss": 0.3784,
+ "step": 9774
+ },
+ {
+ "epoch": 4.621749408983452,
+ "grad_norm": 3.1678860187530518,
+ "learning_rate": 6.327061447628507e-07,
+ "loss": 0.3698,
+ "step": 9775
+ },
+ {
+ "epoch": 4.622222222222222,
+ "grad_norm": 3.500584602355957,
+ "learning_rate": 6.322914099419846e-07,
+ "loss": 0.281,
+ "step": 9776
+ },
+ {
+ "epoch": 4.622695035460993,
+ "grad_norm": 3.089900016784668,
+ "learning_rate": 6.318767914151422e-07,
+ "loss": 0.3202,
+ "step": 9777
+ },
+ {
+ "epoch": 4.623167848699763,
+ "grad_norm": 3.353118896484375,
+ "learning_rate": 6.31462289208141e-07,
+ "loss": 0.3584,
+ "step": 9778
+ },
+ {
+ "epoch": 4.623640661938534,
+ "grad_norm": 3.1742143630981445,
+ "learning_rate": 6.310479033467893e-07,
+ "loss": 0.3309,
+ "step": 9779
+ },
+ {
+ "epoch": 4.624113475177305,
+ "grad_norm": 3.5430498123168945,
+ "learning_rate": 6.306336338568903e-07,
+ "loss": 0.3972,
+ "step": 9780
+ },
+ {
+ "epoch": 4.624586288416076,
+ "grad_norm": 3.141406774520874,
+ "learning_rate": 6.302194807642379e-07,
+ "loss": 0.3875,
+ "step": 9781
+ },
+ {
+ "epoch": 4.625059101654847,
+ "grad_norm": 3.1661601066589355,
+ "learning_rate": 6.298054440946188e-07,
+ "loss": 0.3969,
+ "step": 9782
+ },
+ {
+ "epoch": 4.625531914893617,
+ "grad_norm": 2.9834651947021484,
+ "learning_rate": 6.293915238738149e-07,
+ "loss": 0.3357,
+ "step": 9783
+ },
+ {
+ "epoch": 4.626004728132388,
+ "grad_norm": 3.497030258178711,
+ "learning_rate": 6.289777201275979e-07,
+ "loss": 0.3683,
+ "step": 9784
+ },
+ {
+ "epoch": 4.626477541371158,
+ "grad_norm": 3.519390106201172,
+ "learning_rate": 6.285640328817347e-07,
+ "loss": 0.3647,
+ "step": 9785
+ },
+ {
+ "epoch": 4.626950354609929,
+ "grad_norm": 3.0032200813293457,
+ "learning_rate": 6.281504621619833e-07,
+ "loss": 0.2854,
+ "step": 9786
+ },
+ {
+ "epoch": 4.6274231678487,
+ "grad_norm": 2.9891152381896973,
+ "learning_rate": 6.277370079940939e-07,
+ "loss": 0.3771,
+ "step": 9787
+ },
+ {
+ "epoch": 4.6278959810874705,
+ "grad_norm": 3.379671812057495,
+ "learning_rate": 6.273236704038122e-07,
+ "loss": 0.3916,
+ "step": 9788
+ },
+ {
+ "epoch": 4.628368794326241,
+ "grad_norm": 2.9964048862457275,
+ "learning_rate": 6.26910449416874e-07,
+ "loss": 0.3618,
+ "step": 9789
+ },
+ {
+ "epoch": 4.628841607565012,
+ "grad_norm": 3.0143628120422363,
+ "learning_rate": 6.264973450590089e-07,
+ "loss": 0.336,
+ "step": 9790
+ },
+ {
+ "epoch": 4.629314420803783,
+ "grad_norm": 2.956737756729126,
+ "learning_rate": 6.260843573559392e-07,
+ "loss": 0.3657,
+ "step": 9791
+ },
+ {
+ "epoch": 4.629787234042553,
+ "grad_norm": 3.057551145553589,
+ "learning_rate": 6.256714863333787e-07,
+ "loss": 0.3475,
+ "step": 9792
+ },
+ {
+ "epoch": 4.630260047281324,
+ "grad_norm": 3.9289608001708984,
+ "learning_rate": 6.25258732017037e-07,
+ "loss": 0.3679,
+ "step": 9793
+ },
+ {
+ "epoch": 4.630732860520094,
+ "grad_norm": 3.8519062995910645,
+ "learning_rate": 6.248460944326129e-07,
+ "loss": 0.4182,
+ "step": 9794
+ },
+ {
+ "epoch": 4.631205673758865,
+ "grad_norm": 3.6360673904418945,
+ "learning_rate": 6.244335736058007e-07,
+ "loss": 0.3836,
+ "step": 9795
+ },
+ {
+ "epoch": 4.631678486997636,
+ "grad_norm": 3.1905548572540283,
+ "learning_rate": 6.240211695622861e-07,
+ "loss": 0.357,
+ "step": 9796
+ },
+ {
+ "epoch": 4.632151300236407,
+ "grad_norm": 3.3542017936706543,
+ "learning_rate": 6.236088823277465e-07,
+ "loss": 0.3191,
+ "step": 9797
+ },
+ {
+ "epoch": 4.6326241134751776,
+ "grad_norm": 3.453275442123413,
+ "learning_rate": 6.231967119278546e-07,
+ "loss": 0.3346,
+ "step": 9798
+ },
+ {
+ "epoch": 4.633096926713948,
+ "grad_norm": 3.559972047805786,
+ "learning_rate": 6.227846583882741e-07,
+ "loss": 0.365,
+ "step": 9799
+ },
+ {
+ "epoch": 4.633569739952719,
+ "grad_norm": 2.795891046524048,
+ "learning_rate": 6.223727217346606e-07,
+ "loss": 0.3346,
+ "step": 9800
+ },
+ {
+ "epoch": 4.634042553191489,
+ "grad_norm": 3.176762342453003,
+ "learning_rate": 6.219609019926653e-07,
+ "loss": 0.3692,
+ "step": 9801
+ },
+ {
+ "epoch": 4.63451536643026,
+ "grad_norm": 3.490229845046997,
+ "learning_rate": 6.215491991879294e-07,
+ "loss": 0.3334,
+ "step": 9802
+ },
+ {
+ "epoch": 4.634988179669031,
+ "grad_norm": 3.27502179145813,
+ "learning_rate": 6.211376133460884e-07,
+ "loss": 0.3484,
+ "step": 9803
+ },
+ {
+ "epoch": 4.6354609929078014,
+ "grad_norm": 2.9768311977386475,
+ "learning_rate": 6.207261444927698e-07,
+ "loss": 0.3342,
+ "step": 9804
+ },
+ {
+ "epoch": 4.635933806146572,
+ "grad_norm": 3.1726930141448975,
+ "learning_rate": 6.203147926535938e-07,
+ "loss": 0.3187,
+ "step": 9805
+ },
+ {
+ "epoch": 4.636406619385343,
+ "grad_norm": 3.1797916889190674,
+ "learning_rate": 6.199035578541737e-07,
+ "loss": 0.3418,
+ "step": 9806
+ },
+ {
+ "epoch": 4.636879432624114,
+ "grad_norm": 3.1262030601501465,
+ "learning_rate": 6.194924401201141e-07,
+ "loss": 0.3099,
+ "step": 9807
+ },
+ {
+ "epoch": 4.637352245862884,
+ "grad_norm": 3.556866407394409,
+ "learning_rate": 6.190814394770153e-07,
+ "loss": 0.2879,
+ "step": 9808
+ },
+ {
+ "epoch": 4.6378250591016545,
+ "grad_norm": 3.508984327316284,
+ "learning_rate": 6.186705559504678e-07,
+ "loss": 0.3414,
+ "step": 9809
+ },
+ {
+ "epoch": 4.638297872340425,
+ "grad_norm": 3.266221761703491,
+ "learning_rate": 6.182597895660544e-07,
+ "loss": 0.3281,
+ "step": 9810
+ },
+ {
+ "epoch": 4.638770685579196,
+ "grad_norm": 3.3781862258911133,
+ "learning_rate": 6.178491403493537e-07,
+ "loss": 0.3583,
+ "step": 9811
+ },
+ {
+ "epoch": 4.639243498817967,
+ "grad_norm": 3.4480984210968018,
+ "learning_rate": 6.174386083259329e-07,
+ "loss": 0.3704,
+ "step": 9812
+ },
+ {
+ "epoch": 4.639716312056738,
+ "grad_norm": 3.2882535457611084,
+ "learning_rate": 6.170281935213563e-07,
+ "loss": 0.3515,
+ "step": 9813
+ },
+ {
+ "epoch": 4.6401891252955085,
+ "grad_norm": 2.852627992630005,
+ "learning_rate": 6.166178959611774e-07,
+ "loss": 0.3266,
+ "step": 9814
+ },
+ {
+ "epoch": 4.640661938534279,
+ "grad_norm": 3.5469841957092285,
+ "learning_rate": 6.162077156709431e-07,
+ "loss": 0.3374,
+ "step": 9815
+ },
+ {
+ "epoch": 4.64113475177305,
+ "grad_norm": 3.343583345413208,
+ "learning_rate": 6.157976526761947e-07,
+ "loss": 0.3084,
+ "step": 9816
+ },
+ {
+ "epoch": 4.64160756501182,
+ "grad_norm": 3.028337001800537,
+ "learning_rate": 6.153877070024639e-07,
+ "loss": 0.3083,
+ "step": 9817
+ },
+ {
+ "epoch": 4.642080378250591,
+ "grad_norm": 3.1543455123901367,
+ "learning_rate": 6.149778786752775e-07,
+ "loss": 0.3273,
+ "step": 9818
+ },
+ {
+ "epoch": 4.642553191489362,
+ "grad_norm": 3.2126576900482178,
+ "learning_rate": 6.145681677201529e-07,
+ "loss": 0.3107,
+ "step": 9819
+ },
+ {
+ "epoch": 4.643026004728132,
+ "grad_norm": 3.4443142414093018,
+ "learning_rate": 6.141585741626014e-07,
+ "loss": 0.3193,
+ "step": 9820
+ },
+ {
+ "epoch": 4.643498817966903,
+ "grad_norm": 3.1558680534362793,
+ "learning_rate": 6.137490980281255e-07,
+ "loss": 0.3855,
+ "step": 9821
+ },
+ {
+ "epoch": 4.643971631205674,
+ "grad_norm": 3.370654821395874,
+ "learning_rate": 6.133397393422228e-07,
+ "loss": 0.309,
+ "step": 9822
+ },
+ {
+ "epoch": 4.644444444444445,
+ "grad_norm": 3.0980682373046875,
+ "learning_rate": 6.129304981303822e-07,
+ "loss": 0.2784,
+ "step": 9823
+ },
+ {
+ "epoch": 4.644917257683215,
+ "grad_norm": 3.102229356765747,
+ "learning_rate": 6.125213744180844e-07,
+ "loss": 0.3064,
+ "step": 9824
+ },
+ {
+ "epoch": 4.6453900709219855,
+ "grad_norm": 2.9737658500671387,
+ "learning_rate": 6.121123682308039e-07,
+ "loss": 0.2926,
+ "step": 9825
+ },
+ {
+ "epoch": 4.645862884160756,
+ "grad_norm": 3.3927671909332275,
+ "learning_rate": 6.117034795940089e-07,
+ "loss": 0.404,
+ "step": 9826
+ },
+ {
+ "epoch": 4.646335697399527,
+ "grad_norm": 2.885082721710205,
+ "learning_rate": 6.112947085331581e-07,
+ "loss": 0.3375,
+ "step": 9827
+ },
+ {
+ "epoch": 4.646808510638298,
+ "grad_norm": 2.9711341857910156,
+ "learning_rate": 6.108860550737034e-07,
+ "loss": 0.3051,
+ "step": 9828
+ },
+ {
+ "epoch": 4.647281323877069,
+ "grad_norm": 3.1437952518463135,
+ "learning_rate": 6.104775192410911e-07,
+ "loss": 0.3408,
+ "step": 9829
+ },
+ {
+ "epoch": 4.6477541371158395,
+ "grad_norm": 3.055950164794922,
+ "learning_rate": 6.100691010607579e-07,
+ "loss": 0.316,
+ "step": 9830
+ },
+ {
+ "epoch": 4.64822695035461,
+ "grad_norm": 3.515423536300659,
+ "learning_rate": 6.096608005581353e-07,
+ "loss": 0.3994,
+ "step": 9831
+ },
+ {
+ "epoch": 4.648699763593381,
+ "grad_norm": 3.1165153980255127,
+ "learning_rate": 6.092526177586455e-07,
+ "loss": 0.3908,
+ "step": 9832
+ },
+ {
+ "epoch": 4.649172576832151,
+ "grad_norm": 3.504673719406128,
+ "learning_rate": 6.088445526877043e-07,
+ "loss": 0.3328,
+ "step": 9833
+ },
+ {
+ "epoch": 4.649645390070922,
+ "grad_norm": 3.4175243377685547,
+ "learning_rate": 6.084366053707208e-07,
+ "loss": 0.3234,
+ "step": 9834
+ },
+ {
+ "epoch": 4.650118203309693,
+ "grad_norm": 3.14725661277771,
+ "learning_rate": 6.080287758330946e-07,
+ "loss": 0.3118,
+ "step": 9835
+ },
+ {
+ "epoch": 4.650591016548463,
+ "grad_norm": 3.7654550075531006,
+ "learning_rate": 6.076210641002217e-07,
+ "loss": 0.4177,
+ "step": 9836
+ },
+ {
+ "epoch": 4.651063829787234,
+ "grad_norm": 3.188804864883423,
+ "learning_rate": 6.072134701974871e-07,
+ "loss": 0.3468,
+ "step": 9837
+ },
+ {
+ "epoch": 4.651536643026005,
+ "grad_norm": 3.2176342010498047,
+ "learning_rate": 6.068059941502702e-07,
+ "loss": 0.3486,
+ "step": 9838
+ },
+ {
+ "epoch": 4.652009456264776,
+ "grad_norm": 4.188257217407227,
+ "learning_rate": 6.063986359839424e-07,
+ "loss": 0.3973,
+ "step": 9839
+ },
+ {
+ "epoch": 4.652482269503546,
+ "grad_norm": 3.206559896469116,
+ "learning_rate": 6.059913957238678e-07,
+ "loss": 0.3088,
+ "step": 9840
+ },
+ {
+ "epoch": 4.6529550827423165,
+ "grad_norm": 3.033918857574463,
+ "learning_rate": 6.055842733954048e-07,
+ "loss": 0.3331,
+ "step": 9841
+ },
+ {
+ "epoch": 4.653427895981087,
+ "grad_norm": 3.2453384399414062,
+ "learning_rate": 6.051772690239022e-07,
+ "loss": 0.323,
+ "step": 9842
+ },
+ {
+ "epoch": 4.653900709219858,
+ "grad_norm": 3.001999855041504,
+ "learning_rate": 6.047703826347017e-07,
+ "loss": 0.3763,
+ "step": 9843
+ },
+ {
+ "epoch": 4.654373522458629,
+ "grad_norm": 3.845486640930176,
+ "learning_rate": 6.043636142531401e-07,
+ "loss": 0.3595,
+ "step": 9844
+ },
+ {
+ "epoch": 4.6548463356974,
+ "grad_norm": 3.347628593444824,
+ "learning_rate": 6.039569639045434e-07,
+ "loss": 0.3388,
+ "step": 9845
+ },
+ {
+ "epoch": 4.6553191489361705,
+ "grad_norm": 3.1889400482177734,
+ "learning_rate": 6.035504316142333e-07,
+ "loss": 0.335,
+ "step": 9846
+ },
+ {
+ "epoch": 4.655791962174941,
+ "grad_norm": 3.3385977745056152,
+ "learning_rate": 6.031440174075221e-07,
+ "loss": 0.3985,
+ "step": 9847
+ },
+ {
+ "epoch": 4.656264775413711,
+ "grad_norm": 2.849853277206421,
+ "learning_rate": 6.027377213097146e-07,
+ "loss": 0.3604,
+ "step": 9848
+ },
+ {
+ "epoch": 4.656737588652482,
+ "grad_norm": 3.243053436279297,
+ "learning_rate": 6.02331543346111e-07,
+ "loss": 0.3257,
+ "step": 9849
+ },
+ {
+ "epoch": 4.657210401891253,
+ "grad_norm": 3.344167709350586,
+ "learning_rate": 6.01925483542001e-07,
+ "loss": 0.3511,
+ "step": 9850
+ },
+ {
+ "epoch": 4.657683215130024,
+ "grad_norm": 2.9741430282592773,
+ "learning_rate": 6.015195419226677e-07,
+ "loss": 0.3303,
+ "step": 9851
+ },
+ {
+ "epoch": 4.658156028368794,
+ "grad_norm": 3.0257937908172607,
+ "learning_rate": 6.011137185133883e-07,
+ "loss": 0.3716,
+ "step": 9852
+ },
+ {
+ "epoch": 4.658628841607565,
+ "grad_norm": 3.5770089626312256,
+ "learning_rate": 6.007080133394316e-07,
+ "loss": 0.3258,
+ "step": 9853
+ },
+ {
+ "epoch": 4.659101654846336,
+ "grad_norm": 3.363703489303589,
+ "learning_rate": 6.003024264260587e-07,
+ "loss": 0.3924,
+ "step": 9854
+ },
+ {
+ "epoch": 4.659574468085106,
+ "grad_norm": 3.3533787727355957,
+ "learning_rate": 5.998969577985239e-07,
+ "loss": 0.3242,
+ "step": 9855
+ },
+ {
+ "epoch": 4.660047281323877,
+ "grad_norm": 2.7335259914398193,
+ "learning_rate": 5.994916074820731e-07,
+ "loss": 0.3269,
+ "step": 9856
+ },
+ {
+ "epoch": 4.6605200945626475,
+ "grad_norm": 3.7654764652252197,
+ "learning_rate": 5.990863755019471e-07,
+ "loss": 0.4,
+ "step": 9857
+ },
+ {
+ "epoch": 4.660992907801418,
+ "grad_norm": 3.372542381286621,
+ "learning_rate": 5.986812618833765e-07,
+ "loss": 0.3423,
+ "step": 9858
+ },
+ {
+ "epoch": 4.661465721040189,
+ "grad_norm": 2.797814130783081,
+ "learning_rate": 5.982762666515873e-07,
+ "loss": 0.3228,
+ "step": 9859
+ },
+ {
+ "epoch": 4.66193853427896,
+ "grad_norm": 3.0121023654937744,
+ "learning_rate": 5.978713898317964e-07,
+ "loss": 0.3063,
+ "step": 9860
+ },
+ {
+ "epoch": 4.662411347517731,
+ "grad_norm": 3.052292823791504,
+ "learning_rate": 5.974666314492126e-07,
+ "loss": 0.3029,
+ "step": 9861
+ },
+ {
+ "epoch": 4.6628841607565015,
+ "grad_norm": 2.990906238555908,
+ "learning_rate": 5.970619915290399e-07,
+ "loss": 0.3788,
+ "step": 9862
+ },
+ {
+ "epoch": 4.663356973995272,
+ "grad_norm": 3.214334726333618,
+ "learning_rate": 5.966574700964722e-07,
+ "loss": 0.3138,
+ "step": 9863
+ },
+ {
+ "epoch": 4.663829787234042,
+ "grad_norm": 3.5982940196990967,
+ "learning_rate": 5.962530671766989e-07,
+ "loss": 0.3685,
+ "step": 9864
+ },
+ {
+ "epoch": 4.664302600472813,
+ "grad_norm": 3.2522151470184326,
+ "learning_rate": 5.958487827948991e-07,
+ "loss": 0.3086,
+ "step": 9865
+ },
+ {
+ "epoch": 4.664775413711584,
+ "grad_norm": 3.070181131362915,
+ "learning_rate": 5.954446169762457e-07,
+ "loss": 0.3534,
+ "step": 9866
+ },
+ {
+ "epoch": 4.665248226950355,
+ "grad_norm": 3.3051350116729736,
+ "learning_rate": 5.950405697459055e-07,
+ "loss": 0.3871,
+ "step": 9867
+ },
+ {
+ "epoch": 4.665721040189125,
+ "grad_norm": 2.8587753772735596,
+ "learning_rate": 5.946366411290358e-07,
+ "loss": 0.3157,
+ "step": 9868
+ },
+ {
+ "epoch": 4.666193853427896,
+ "grad_norm": 3.154926061630249,
+ "learning_rate": 5.942328311507878e-07,
+ "loss": 0.2967,
+ "step": 9869
+ },
+ {
+ "epoch": 4.666666666666667,
+ "grad_norm": 3.0322320461273193,
+ "learning_rate": 5.938291398363049e-07,
+ "loss": 0.323,
+ "step": 9870
+ },
+ {
+ "epoch": 4.667139479905437,
+ "grad_norm": 3.0678954124450684,
+ "learning_rate": 5.934255672107222e-07,
+ "loss": 0.3337,
+ "step": 9871
+ },
+ {
+ "epoch": 4.667612293144208,
+ "grad_norm": 3.4822635650634766,
+ "learning_rate": 5.930221132991704e-07,
+ "loss": 0.4052,
+ "step": 9872
+ },
+ {
+ "epoch": 4.6680851063829785,
+ "grad_norm": 3.26842999458313,
+ "learning_rate": 5.926187781267695e-07,
+ "loss": 0.3501,
+ "step": 9873
+ },
+ {
+ "epoch": 4.668557919621749,
+ "grad_norm": 2.911407709121704,
+ "learning_rate": 5.922155617186332e-07,
+ "loss": 0.3544,
+ "step": 9874
+ },
+ {
+ "epoch": 4.66903073286052,
+ "grad_norm": 3.1876001358032227,
+ "learning_rate": 5.91812464099869e-07,
+ "loss": 0.3389,
+ "step": 9875
+ },
+ {
+ "epoch": 4.669503546099291,
+ "grad_norm": 3.4954607486724854,
+ "learning_rate": 5.914094852955749e-07,
+ "loss": 0.3461,
+ "step": 9876
+ },
+ {
+ "epoch": 4.669976359338062,
+ "grad_norm": 3.8845367431640625,
+ "learning_rate": 5.910066253308439e-07,
+ "loss": 0.3868,
+ "step": 9877
+ },
+ {
+ "epoch": 4.6704491725768325,
+ "grad_norm": 3.18038272857666,
+ "learning_rate": 5.906038842307598e-07,
+ "loss": 0.3311,
+ "step": 9878
+ },
+ {
+ "epoch": 4.670921985815603,
+ "grad_norm": 3.5944042205810547,
+ "learning_rate": 5.902012620203984e-07,
+ "loss": 0.3246,
+ "step": 9879
+ },
+ {
+ "epoch": 4.671394799054373,
+ "grad_norm": 2.980142116546631,
+ "learning_rate": 5.897987587248311e-07,
+ "loss": 0.3361,
+ "step": 9880
+ },
+ {
+ "epoch": 4.671867612293144,
+ "grad_norm": 4.3120269775390625,
+ "learning_rate": 5.893963743691183e-07,
+ "loss": 0.3213,
+ "step": 9881
+ },
+ {
+ "epoch": 4.672340425531915,
+ "grad_norm": 3.42366361618042,
+ "learning_rate": 5.889941089783163e-07,
+ "loss": 0.3515,
+ "step": 9882
+ },
+ {
+ "epoch": 4.6728132387706856,
+ "grad_norm": 2.910720109939575,
+ "learning_rate": 5.885919625774716e-07,
+ "loss": 0.3417,
+ "step": 9883
+ },
+ {
+ "epoch": 4.673286052009456,
+ "grad_norm": 3.122042179107666,
+ "learning_rate": 5.881899351916242e-07,
+ "loss": 0.3714,
+ "step": 9884
+ },
+ {
+ "epoch": 4.673758865248227,
+ "grad_norm": 3.0564188957214355,
+ "learning_rate": 5.877880268458064e-07,
+ "loss": 0.3146,
+ "step": 9885
+ },
+ {
+ "epoch": 4.674231678486998,
+ "grad_norm": 3.303421974182129,
+ "learning_rate": 5.873862375650427e-07,
+ "loss": 0.3476,
+ "step": 9886
+ },
+ {
+ "epoch": 4.674704491725768,
+ "grad_norm": 3.3057096004486084,
+ "learning_rate": 5.869845673743521e-07,
+ "loss": 0.3237,
+ "step": 9887
+ },
+ {
+ "epoch": 4.675177304964539,
+ "grad_norm": 3.1843838691711426,
+ "learning_rate": 5.865830162987443e-07,
+ "loss": 0.3789,
+ "step": 9888
+ },
+ {
+ "epoch": 4.6756501182033094,
+ "grad_norm": 2.865844964981079,
+ "learning_rate": 5.861815843632213e-07,
+ "loss": 0.3173,
+ "step": 9889
+ },
+ {
+ "epoch": 4.67612293144208,
+ "grad_norm": 2.986262083053589,
+ "learning_rate": 5.857802715927796e-07,
+ "loss": 0.2697,
+ "step": 9890
+ },
+ {
+ "epoch": 4.676595744680851,
+ "grad_norm": 3.2936089038848877,
+ "learning_rate": 5.853790780124063e-07,
+ "loss": 0.3839,
+ "step": 9891
+ },
+ {
+ "epoch": 4.677068557919622,
+ "grad_norm": 2.7130303382873535,
+ "learning_rate": 5.849780036470831e-07,
+ "loss": 0.3004,
+ "step": 9892
+ },
+ {
+ "epoch": 4.677541371158393,
+ "grad_norm": 3.0076770782470703,
+ "learning_rate": 5.845770485217827e-07,
+ "loss": 0.3467,
+ "step": 9893
+ },
+ {
+ "epoch": 4.678014184397163,
+ "grad_norm": 3.5340375900268555,
+ "learning_rate": 5.841762126614697e-07,
+ "loss": 0.3759,
+ "step": 9894
+ },
+ {
+ "epoch": 4.678486997635934,
+ "grad_norm": 3.0034375190734863,
+ "learning_rate": 5.837754960911041e-07,
+ "loss": 0.3099,
+ "step": 9895
+ },
+ {
+ "epoch": 4.678959810874704,
+ "grad_norm": 3.576899766921997,
+ "learning_rate": 5.833748988356358e-07,
+ "loss": 0.3612,
+ "step": 9896
+ },
+ {
+ "epoch": 4.679432624113475,
+ "grad_norm": 3.0961546897888184,
+ "learning_rate": 5.829744209200077e-07,
+ "loss": 0.3098,
+ "step": 9897
+ },
+ {
+ "epoch": 4.679905437352246,
+ "grad_norm": 3.1387925148010254,
+ "learning_rate": 5.825740623691576e-07,
+ "loss": 0.3538,
+ "step": 9898
+ },
+ {
+ "epoch": 4.6803782505910165,
+ "grad_norm": 3.4131572246551514,
+ "learning_rate": 5.821738232080127e-07,
+ "loss": 0.3984,
+ "step": 9899
+ },
+ {
+ "epoch": 4.680851063829787,
+ "grad_norm": 3.1346065998077393,
+ "learning_rate": 5.817737034614934e-07,
+ "loss": 0.3585,
+ "step": 9900
+ },
+ {
+ "epoch": 4.681323877068558,
+ "grad_norm": 3.148144483566284,
+ "learning_rate": 5.813737031545155e-07,
+ "loss": 0.3774,
+ "step": 9901
+ },
+ {
+ "epoch": 4.681796690307329,
+ "grad_norm": 3.2461299896240234,
+ "learning_rate": 5.809738223119843e-07,
+ "loss": 0.3181,
+ "step": 9902
+ },
+ {
+ "epoch": 4.682269503546099,
+ "grad_norm": 3.1998214721679688,
+ "learning_rate": 5.805740609587981e-07,
+ "loss": 0.3452,
+ "step": 9903
+ },
+ {
+ "epoch": 4.68274231678487,
+ "grad_norm": 2.897399425506592,
+ "learning_rate": 5.801744191198483e-07,
+ "loss": 0.3247,
+ "step": 9904
+ },
+ {
+ "epoch": 4.68321513002364,
+ "grad_norm": 2.921877384185791,
+ "learning_rate": 5.797748968200198e-07,
+ "loss": 0.2842,
+ "step": 9905
+ },
+ {
+ "epoch": 4.683687943262411,
+ "grad_norm": 3.17667818069458,
+ "learning_rate": 5.793754940841887e-07,
+ "loss": 0.3218,
+ "step": 9906
+ },
+ {
+ "epoch": 4.684160756501182,
+ "grad_norm": 3.499068260192871,
+ "learning_rate": 5.78976210937223e-07,
+ "loss": 0.3352,
+ "step": 9907
+ },
+ {
+ "epoch": 4.684633569739953,
+ "grad_norm": 3.2782368659973145,
+ "learning_rate": 5.785770474039859e-07,
+ "loss": 0.3671,
+ "step": 9908
+ },
+ {
+ "epoch": 4.685106382978724,
+ "grad_norm": 3.089757204055786,
+ "learning_rate": 5.781780035093304e-07,
+ "loss": 0.3613,
+ "step": 9909
+ },
+ {
+ "epoch": 4.685579196217494,
+ "grad_norm": 3.082561492919922,
+ "learning_rate": 5.77779079278104e-07,
+ "loss": 0.3351,
+ "step": 9910
+ },
+ {
+ "epoch": 4.686052009456265,
+ "grad_norm": 3.6009864807128906,
+ "learning_rate": 5.773802747351462e-07,
+ "loss": 0.3545,
+ "step": 9911
+ },
+ {
+ "epoch": 4.686524822695035,
+ "grad_norm": 3.488717555999756,
+ "learning_rate": 5.769815899052872e-07,
+ "loss": 0.3926,
+ "step": 9912
+ },
+ {
+ "epoch": 4.686997635933806,
+ "grad_norm": 3.5619056224823,
+ "learning_rate": 5.765830248133531e-07,
+ "loss": 0.4171,
+ "step": 9913
+ },
+ {
+ "epoch": 4.687470449172577,
+ "grad_norm": 3.30653977394104,
+ "learning_rate": 5.761845794841594e-07,
+ "loss": 0.3713,
+ "step": 9914
+ },
+ {
+ "epoch": 4.6879432624113475,
+ "grad_norm": 2.8256847858428955,
+ "learning_rate": 5.757862539425171e-07,
+ "loss": 0.2633,
+ "step": 9915
+ },
+ {
+ "epoch": 4.688416075650118,
+ "grad_norm": 3.0387041568756104,
+ "learning_rate": 5.753880482132274e-07,
+ "loss": 0.3169,
+ "step": 9916
+ },
+ {
+ "epoch": 4.688888888888889,
+ "grad_norm": 3.2312963008880615,
+ "learning_rate": 5.749899623210845e-07,
+ "loss": 0.3238,
+ "step": 9917
+ },
+ {
+ "epoch": 4.68936170212766,
+ "grad_norm": 3.077155351638794,
+ "learning_rate": 5.74591996290876e-07,
+ "loss": 0.3101,
+ "step": 9918
+ },
+ {
+ "epoch": 4.68983451536643,
+ "grad_norm": 3.461580991744995,
+ "learning_rate": 5.741941501473811e-07,
+ "loss": 0.3756,
+ "step": 9919
+ },
+ {
+ "epoch": 4.690307328605201,
+ "grad_norm": 3.8845605850219727,
+ "learning_rate": 5.737964239153712e-07,
+ "loss": 0.3747,
+ "step": 9920
+ },
+ {
+ "epoch": 4.690780141843971,
+ "grad_norm": 3.1688292026519775,
+ "learning_rate": 5.733988176196129e-07,
+ "loss": 0.3663,
+ "step": 9921
+ },
+ {
+ "epoch": 4.691252955082742,
+ "grad_norm": 3.2730917930603027,
+ "learning_rate": 5.730013312848614e-07,
+ "loss": 0.3697,
+ "step": 9922
+ },
+ {
+ "epoch": 4.691725768321513,
+ "grad_norm": 3.093761682510376,
+ "learning_rate": 5.726039649358681e-07,
+ "loss": 0.3215,
+ "step": 9923
+ },
+ {
+ "epoch": 4.692198581560284,
+ "grad_norm": 3.1679420471191406,
+ "learning_rate": 5.722067185973746e-07,
+ "loss": 0.3019,
+ "step": 9924
+ },
+ {
+ "epoch": 4.692671394799055,
+ "grad_norm": 3.4821531772613525,
+ "learning_rate": 5.718095922941147e-07,
+ "loss": 0.3659,
+ "step": 9925
+ },
+ {
+ "epoch": 4.693144208037825,
+ "grad_norm": 2.985276699066162,
+ "learning_rate": 5.714125860508177e-07,
+ "loss": 0.3293,
+ "step": 9926
+ },
+ {
+ "epoch": 4.693617021276596,
+ "grad_norm": 3.171663999557495,
+ "learning_rate": 5.710156998922015e-07,
+ "loss": 0.3647,
+ "step": 9927
+ },
+ {
+ "epoch": 4.694089834515366,
+ "grad_norm": 3.3699564933776855,
+ "learning_rate": 5.706189338429798e-07,
+ "loss": 0.4021,
+ "step": 9928
+ },
+ {
+ "epoch": 4.694562647754137,
+ "grad_norm": 3.0827202796936035,
+ "learning_rate": 5.702222879278571e-07,
+ "loss": 0.329,
+ "step": 9929
+ },
+ {
+ "epoch": 4.695035460992908,
+ "grad_norm": 3.5798332691192627,
+ "learning_rate": 5.698257621715303e-07,
+ "loss": 0.3777,
+ "step": 9930
+ },
+ {
+ "epoch": 4.6955082742316785,
+ "grad_norm": 2.741230010986328,
+ "learning_rate": 5.6942935659869e-07,
+ "loss": 0.31,
+ "step": 9931
+ },
+ {
+ "epoch": 4.695981087470449,
+ "grad_norm": 2.9929327964782715,
+ "learning_rate": 5.690330712340187e-07,
+ "loss": 0.3132,
+ "step": 9932
+ },
+ {
+ "epoch": 4.69645390070922,
+ "grad_norm": 3.062685489654541,
+ "learning_rate": 5.68636906102191e-07,
+ "loss": 0.3204,
+ "step": 9933
+ },
+ {
+ "epoch": 4.696926713947991,
+ "grad_norm": 3.166281223297119,
+ "learning_rate": 5.682408612278742e-07,
+ "loss": 0.3444,
+ "step": 9934
+ },
+ {
+ "epoch": 4.697399527186761,
+ "grad_norm": 3.0413401126861572,
+ "learning_rate": 5.678449366357278e-07,
+ "loss": 0.3506,
+ "step": 9935
+ },
+ {
+ "epoch": 4.697872340425532,
+ "grad_norm": 3.7843124866485596,
+ "learning_rate": 5.674491323504059e-07,
+ "loss": 0.3349,
+ "step": 9936
+ },
+ {
+ "epoch": 4.698345153664302,
+ "grad_norm": 2.9070212841033936,
+ "learning_rate": 5.670534483965514e-07,
+ "loss": 0.2954,
+ "step": 9937
+ },
+ {
+ "epoch": 4.698817966903073,
+ "grad_norm": 2.924229383468628,
+ "learning_rate": 5.666578847988041e-07,
+ "loss": 0.3392,
+ "step": 9938
+ },
+ {
+ "epoch": 4.699290780141844,
+ "grad_norm": 3.1302332878112793,
+ "learning_rate": 5.662624415817924e-07,
+ "loss": 0.3198,
+ "step": 9939
+ },
+ {
+ "epoch": 4.699763593380615,
+ "grad_norm": 3.163005828857422,
+ "learning_rate": 5.65867118770139e-07,
+ "loss": 0.3817,
+ "step": 9940
+ },
+ {
+ "epoch": 4.700236406619386,
+ "grad_norm": 3.4002792835235596,
+ "learning_rate": 5.654719163884598e-07,
+ "loss": 0.3961,
+ "step": 9941
+ },
+ {
+ "epoch": 4.700709219858156,
+ "grad_norm": 3.9756014347076416,
+ "learning_rate": 5.650768344613616e-07,
+ "loss": 0.4011,
+ "step": 9942
+ },
+ {
+ "epoch": 4.701182033096927,
+ "grad_norm": 3.118243455886841,
+ "learning_rate": 5.64681873013444e-07,
+ "loss": 0.3675,
+ "step": 9943
+ },
+ {
+ "epoch": 4.701654846335697,
+ "grad_norm": 3.0520825386047363,
+ "learning_rate": 5.642870320693005e-07,
+ "loss": 0.2782,
+ "step": 9944
+ },
+ {
+ "epoch": 4.702127659574468,
+ "grad_norm": 3.380565643310547,
+ "learning_rate": 5.638923116535152e-07,
+ "loss": 0.3632,
+ "step": 9945
+ },
+ {
+ "epoch": 4.702600472813239,
+ "grad_norm": 3.2340569496154785,
+ "learning_rate": 5.634977117906668e-07,
+ "loss": 0.3754,
+ "step": 9946
+ },
+ {
+ "epoch": 4.7030732860520095,
+ "grad_norm": 3.0068717002868652,
+ "learning_rate": 5.631032325053243e-07,
+ "loss": 0.3879,
+ "step": 9947
+ },
+ {
+ "epoch": 4.70354609929078,
+ "grad_norm": 3.4717891216278076,
+ "learning_rate": 5.627088738220507e-07,
+ "loss": 0.4053,
+ "step": 9948
+ },
+ {
+ "epoch": 4.704018912529551,
+ "grad_norm": 4.362999439239502,
+ "learning_rate": 5.623146357654008e-07,
+ "loss": 0.3115,
+ "step": 9949
+ },
+ {
+ "epoch": 4.704491725768322,
+ "grad_norm": 3.2190041542053223,
+ "learning_rate": 5.619205183599211e-07,
+ "loss": 0.3267,
+ "step": 9950
+ },
+ {
+ "epoch": 4.704964539007092,
+ "grad_norm": 3.413800001144409,
+ "learning_rate": 5.615265216301532e-07,
+ "loss": 0.4012,
+ "step": 9951
+ },
+ {
+ "epoch": 4.705437352245863,
+ "grad_norm": 3.5244312286376953,
+ "learning_rate": 5.611326456006291e-07,
+ "loss": 0.3484,
+ "step": 9952
+ },
+ {
+ "epoch": 4.705910165484633,
+ "grad_norm": 3.055433511734009,
+ "learning_rate": 5.607388902958727e-07,
+ "loss": 0.3637,
+ "step": 9953
+ },
+ {
+ "epoch": 4.706382978723404,
+ "grad_norm": 3.4459595680236816,
+ "learning_rate": 5.603452557404029e-07,
+ "loss": 0.3632,
+ "step": 9954
+ },
+ {
+ "epoch": 4.706855791962175,
+ "grad_norm": 2.972321033477783,
+ "learning_rate": 5.59951741958728e-07,
+ "loss": 0.3411,
+ "step": 9955
+ },
+ {
+ "epoch": 4.707328605200946,
+ "grad_norm": 3.2460532188415527,
+ "learning_rate": 5.595583489753523e-07,
+ "loss": 0.3779,
+ "step": 9956
+ },
+ {
+ "epoch": 4.707801418439717,
+ "grad_norm": 3.514521837234497,
+ "learning_rate": 5.591650768147694e-07,
+ "loss": 0.3313,
+ "step": 9957
+ },
+ {
+ "epoch": 4.708274231678487,
+ "grad_norm": 2.8473336696624756,
+ "learning_rate": 5.587719255014662e-07,
+ "loss": 0.3078,
+ "step": 9958
+ },
+ {
+ "epoch": 4.708747044917258,
+ "grad_norm": 3.309263229370117,
+ "learning_rate": 5.583788950599239e-07,
+ "loss": 0.3905,
+ "step": 9959
+ },
+ {
+ "epoch": 4.709219858156028,
+ "grad_norm": 3.435980796813965,
+ "learning_rate": 5.579859855146133e-07,
+ "loss": 0.3507,
+ "step": 9960
+ },
+ {
+ "epoch": 4.709692671394799,
+ "grad_norm": 3.0237598419189453,
+ "learning_rate": 5.575931968900006e-07,
+ "loss": 0.3349,
+ "step": 9961
+ },
+ {
+ "epoch": 4.71016548463357,
+ "grad_norm": 3.6978237628936768,
+ "learning_rate": 5.572005292105426e-07,
+ "loss": 0.3672,
+ "step": 9962
+ },
+ {
+ "epoch": 4.7106382978723405,
+ "grad_norm": 3.3029704093933105,
+ "learning_rate": 5.568079825006883e-07,
+ "loss": 0.3438,
+ "step": 9963
+ },
+ {
+ "epoch": 4.711111111111111,
+ "grad_norm": 2.9121241569519043,
+ "learning_rate": 5.5641555678488e-07,
+ "loss": 0.3299,
+ "step": 9964
+ },
+ {
+ "epoch": 4.711583924349882,
+ "grad_norm": 3.2730703353881836,
+ "learning_rate": 5.56023252087553e-07,
+ "loss": 0.3572,
+ "step": 9965
+ },
+ {
+ "epoch": 4.712056737588653,
+ "grad_norm": 3.316593885421753,
+ "learning_rate": 5.556310684331343e-07,
+ "loss": 0.3139,
+ "step": 9966
+ },
+ {
+ "epoch": 4.712529550827423,
+ "grad_norm": 3.1281843185424805,
+ "learning_rate": 5.552390058460427e-07,
+ "loss": 0.3362,
+ "step": 9967
+ },
+ {
+ "epoch": 4.7130023640661936,
+ "grad_norm": 3.3069980144500732,
+ "learning_rate": 5.548470643506904e-07,
+ "loss": 0.3839,
+ "step": 9968
+ },
+ {
+ "epoch": 4.713475177304964,
+ "grad_norm": 4.4018354415893555,
+ "learning_rate": 5.544552439714826e-07,
+ "loss": 0.2954,
+ "step": 9969
+ },
+ {
+ "epoch": 4.713947990543735,
+ "grad_norm": 2.797149658203125,
+ "learning_rate": 5.540635447328161e-07,
+ "loss": 0.3253,
+ "step": 9970
+ },
+ {
+ "epoch": 4.714420803782506,
+ "grad_norm": 3.0065677165985107,
+ "learning_rate": 5.536719666590792e-07,
+ "loss": 0.3376,
+ "step": 9971
+ },
+ {
+ "epoch": 4.714893617021277,
+ "grad_norm": 3.1383140087127686,
+ "learning_rate": 5.532805097746552e-07,
+ "loss": 0.3444,
+ "step": 9972
+ },
+ {
+ "epoch": 4.7153664302600475,
+ "grad_norm": 2.983229398727417,
+ "learning_rate": 5.528891741039169e-07,
+ "loss": 0.3173,
+ "step": 9973
+ },
+ {
+ "epoch": 4.715839243498818,
+ "grad_norm": 3.119361162185669,
+ "learning_rate": 5.524979596712326e-07,
+ "loss": 0.3829,
+ "step": 9974
+ },
+ {
+ "epoch": 4.716312056737589,
+ "grad_norm": 3.4099128246307373,
+ "learning_rate": 5.52106866500961e-07,
+ "loss": 0.3363,
+ "step": 9975
+ },
+ {
+ "epoch": 4.716784869976359,
+ "grad_norm": 2.818964719772339,
+ "learning_rate": 5.517158946174528e-07,
+ "loss": 0.321,
+ "step": 9976
+ },
+ {
+ "epoch": 4.71725768321513,
+ "grad_norm": 3.4968421459198,
+ "learning_rate": 5.513250440450538e-07,
+ "loss": 0.3973,
+ "step": 9977
+ },
+ {
+ "epoch": 4.717730496453901,
+ "grad_norm": 3.3777382373809814,
+ "learning_rate": 5.509343148080987e-07,
+ "loss": 0.3607,
+ "step": 9978
+ },
+ {
+ "epoch": 4.718203309692671,
+ "grad_norm": 2.95882511138916,
+ "learning_rate": 5.50543706930918e-07,
+ "loss": 0.3483,
+ "step": 9979
+ },
+ {
+ "epoch": 4.718676122931442,
+ "grad_norm": 2.8768858909606934,
+ "learning_rate": 5.501532204378327e-07,
+ "loss": 0.3488,
+ "step": 9980
+ },
+ {
+ "epoch": 4.719148936170213,
+ "grad_norm": 2.9310572147369385,
+ "learning_rate": 5.497628553531565e-07,
+ "loss": 0.3174,
+ "step": 9981
+ },
+ {
+ "epoch": 4.719621749408984,
+ "grad_norm": 3.1057486534118652,
+ "learning_rate": 5.493726117011957e-07,
+ "loss": 0.346,
+ "step": 9982
+ },
+ {
+ "epoch": 4.720094562647754,
+ "grad_norm": 3.681593418121338,
+ "learning_rate": 5.489824895062487e-07,
+ "loss": 0.3371,
+ "step": 9983
+ },
+ {
+ "epoch": 4.7205673758865245,
+ "grad_norm": 3.0641729831695557,
+ "learning_rate": 5.485924887926075e-07,
+ "loss": 0.3614,
+ "step": 9984
+ },
+ {
+ "epoch": 4.721040189125295,
+ "grad_norm": 3.2925705909729004,
+ "learning_rate": 5.482026095845555e-07,
+ "loss": 0.3023,
+ "step": 9985
+ },
+ {
+ "epoch": 4.721513002364066,
+ "grad_norm": 2.963693141937256,
+ "learning_rate": 5.47812851906368e-07,
+ "loss": 0.3706,
+ "step": 9986
+ },
+ {
+ "epoch": 4.721985815602837,
+ "grad_norm": 3.187870740890503,
+ "learning_rate": 5.474232157823147e-07,
+ "loss": 0.3332,
+ "step": 9987
+ },
+ {
+ "epoch": 4.722458628841608,
+ "grad_norm": 3.9346799850463867,
+ "learning_rate": 5.470337012366556e-07,
+ "loss": 0.3738,
+ "step": 9988
+ },
+ {
+ "epoch": 4.7229314420803785,
+ "grad_norm": 3.385035753250122,
+ "learning_rate": 5.466443082936446e-07,
+ "loss": 0.3194,
+ "step": 9989
+ },
+ {
+ "epoch": 4.723404255319149,
+ "grad_norm": 3.0829477310180664,
+ "learning_rate": 5.462550369775277e-07,
+ "loss": 0.2877,
+ "step": 9990
+ },
+ {
+ "epoch": 4.72387706855792,
+ "grad_norm": 2.8730506896972656,
+ "learning_rate": 5.458658873125419e-07,
+ "loss": 0.3352,
+ "step": 9991
+ },
+ {
+ "epoch": 4.72434988179669,
+ "grad_norm": 3.198498249053955,
+ "learning_rate": 5.454768593229193e-07,
+ "loss": 0.3697,
+ "step": 9992
+ },
+ {
+ "epoch": 4.724822695035461,
+ "grad_norm": 3.37144136428833,
+ "learning_rate": 5.450879530328824e-07,
+ "loss": 0.4245,
+ "step": 9993
+ },
+ {
+ "epoch": 4.725295508274232,
+ "grad_norm": 3.6235079765319824,
+ "learning_rate": 5.446991684666461e-07,
+ "loss": 0.3707,
+ "step": 9994
+ },
+ {
+ "epoch": 4.725768321513002,
+ "grad_norm": 3.5587494373321533,
+ "learning_rate": 5.443105056484194e-07,
+ "loss": 0.3297,
+ "step": 9995
+ },
+ {
+ "epoch": 4.726241134751773,
+ "grad_norm": 3.5308549404144287,
+ "learning_rate": 5.439219646024018e-07,
+ "loss": 0.3521,
+ "step": 9996
+ },
+ {
+ "epoch": 4.726713947990544,
+ "grad_norm": 3.16542649269104,
+ "learning_rate": 5.435335453527868e-07,
+ "loss": 0.3499,
+ "step": 9997
+ },
+ {
+ "epoch": 4.727186761229315,
+ "grad_norm": 3.2565104961395264,
+ "learning_rate": 5.431452479237586e-07,
+ "loss": 0.338,
+ "step": 9998
+ },
+ {
+ "epoch": 4.727659574468085,
+ "grad_norm": 3.371232032775879,
+ "learning_rate": 5.427570723394951e-07,
+ "loss": 0.3641,
+ "step": 9999
+ },
+ {
+ "epoch": 4.7281323877068555,
+ "grad_norm": 2.9784507751464844,
+ "learning_rate": 5.423690186241668e-07,
+ "loss": 0.3667,
+ "step": 10000
+ },
+ {
+ "epoch": 4.728605200945626,
+ "grad_norm": 3.0877480506896973,
+ "learning_rate": 5.419810868019351e-07,
+ "loss": 0.3098,
+ "step": 10001
+ },
+ {
+ "epoch": 4.729078014184397,
+ "grad_norm": 4.132823467254639,
+ "learning_rate": 5.415932768969562e-07,
+ "loss": 0.3712,
+ "step": 10002
+ },
+ {
+ "epoch": 4.729550827423168,
+ "grad_norm": 2.8105905055999756,
+ "learning_rate": 5.412055889333767e-07,
+ "loss": 0.2829,
+ "step": 10003
+ },
+ {
+ "epoch": 4.730023640661939,
+ "grad_norm": 3.543795585632324,
+ "learning_rate": 5.408180229353352e-07,
+ "loss": 0.3101,
+ "step": 10004
+ },
+ {
+ "epoch": 4.7304964539007095,
+ "grad_norm": 3.307525157928467,
+ "learning_rate": 5.404305789269657e-07,
+ "loss": 0.3585,
+ "step": 10005
+ },
+ {
+ "epoch": 4.73096926713948,
+ "grad_norm": 3.0976414680480957,
+ "learning_rate": 5.400432569323905e-07,
+ "loss": 0.3202,
+ "step": 10006
+ },
+ {
+ "epoch": 4.73144208037825,
+ "grad_norm": 3.0249791145324707,
+ "learning_rate": 5.396560569757284e-07,
+ "loss": 0.3468,
+ "step": 10007
+ },
+ {
+ "epoch": 4.731914893617021,
+ "grad_norm": 3.0199971199035645,
+ "learning_rate": 5.392689790810879e-07,
+ "loss": 0.3483,
+ "step": 10008
+ },
+ {
+ "epoch": 4.732387706855792,
+ "grad_norm": 3.177297592163086,
+ "learning_rate": 5.388820232725697e-07,
+ "loss": 0.3333,
+ "step": 10009
+ },
+ {
+ "epoch": 4.732860520094563,
+ "grad_norm": 3.247121572494507,
+ "learning_rate": 5.384951895742693e-07,
+ "loss": 0.2881,
+ "step": 10010
+ },
+ {
+ "epoch": 4.733333333333333,
+ "grad_norm": 3.513106346130371,
+ "learning_rate": 5.381084780102727e-07,
+ "loss": 0.3786,
+ "step": 10011
+ },
+ {
+ "epoch": 4.733806146572104,
+ "grad_norm": 2.8936305046081543,
+ "learning_rate": 5.377218886046584e-07,
+ "loss": 0.3174,
+ "step": 10012
+ },
+ {
+ "epoch": 4.734278959810875,
+ "grad_norm": 3.1088016033172607,
+ "learning_rate": 5.373354213814977e-07,
+ "loss": 0.3108,
+ "step": 10013
+ },
+ {
+ "epoch": 4.734751773049645,
+ "grad_norm": 2.693617343902588,
+ "learning_rate": 5.369490763648539e-07,
+ "loss": 0.3441,
+ "step": 10014
+ },
+ {
+ "epoch": 4.735224586288416,
+ "grad_norm": 3.4399259090423584,
+ "learning_rate": 5.365628535787837e-07,
+ "loss": 0.3937,
+ "step": 10015
+ },
+ {
+ "epoch": 4.7356973995271865,
+ "grad_norm": 3.28714919090271,
+ "learning_rate": 5.361767530473355e-07,
+ "loss": 0.2993,
+ "step": 10016
+ },
+ {
+ "epoch": 4.736170212765957,
+ "grad_norm": 3.1407346725463867,
+ "learning_rate": 5.35790774794549e-07,
+ "loss": 0.3605,
+ "step": 10017
+ },
+ {
+ "epoch": 4.736643026004728,
+ "grad_norm": 3.464386224746704,
+ "learning_rate": 5.354049188444588e-07,
+ "loss": 0.382,
+ "step": 10018
+ },
+ {
+ "epoch": 4.737115839243499,
+ "grad_norm": 3.303809881210327,
+ "learning_rate": 5.350191852210889e-07,
+ "loss": 0.3438,
+ "step": 10019
+ },
+ {
+ "epoch": 4.73758865248227,
+ "grad_norm": 3.3727755546569824,
+ "learning_rate": 5.346335739484593e-07,
+ "loss": 0.3524,
+ "step": 10020
+ },
+ {
+ "epoch": 4.7380614657210405,
+ "grad_norm": 3.125762939453125,
+ "learning_rate": 5.342480850505788e-07,
+ "loss": 0.3762,
+ "step": 10021
+ },
+ {
+ "epoch": 4.738534278959811,
+ "grad_norm": 3.32598876953125,
+ "learning_rate": 5.3386271855145e-07,
+ "loss": 0.345,
+ "step": 10022
+ },
+ {
+ "epoch": 4.739007092198581,
+ "grad_norm": 2.889338970184326,
+ "learning_rate": 5.334774744750692e-07,
+ "loss": 0.3245,
+ "step": 10023
+ },
+ {
+ "epoch": 4.739479905437352,
+ "grad_norm": 3.3369252681732178,
+ "learning_rate": 5.330923528454223e-07,
+ "loss": 0.366,
+ "step": 10024
+ },
+ {
+ "epoch": 4.739952718676123,
+ "grad_norm": 3.008836269378662,
+ "learning_rate": 5.327073536864908e-07,
+ "loss": 0.358,
+ "step": 10025
+ },
+ {
+ "epoch": 4.740425531914894,
+ "grad_norm": 3.1076738834381104,
+ "learning_rate": 5.323224770222457e-07,
+ "loss": 0.3398,
+ "step": 10026
+ },
+ {
+ "epoch": 4.740898345153664,
+ "grad_norm": 3.269164800643921,
+ "learning_rate": 5.319377228766523e-07,
+ "loss": 0.3364,
+ "step": 10027
+ },
+ {
+ "epoch": 4.741371158392435,
+ "grad_norm": 3.3928871154785156,
+ "learning_rate": 5.315530912736671e-07,
+ "loss": 0.3376,
+ "step": 10028
+ },
+ {
+ "epoch": 4.741843971631206,
+ "grad_norm": 2.7413101196289062,
+ "learning_rate": 5.31168582237239e-07,
+ "loss": 0.3551,
+ "step": 10029
+ },
+ {
+ "epoch": 4.742316784869976,
+ "grad_norm": 2.837280035018921,
+ "learning_rate": 5.307841957913104e-07,
+ "loss": 0.316,
+ "step": 10030
+ },
+ {
+ "epoch": 4.742789598108747,
+ "grad_norm": 3.140482187271118,
+ "learning_rate": 5.303999319598158e-07,
+ "loss": 0.3951,
+ "step": 10031
+ },
+ {
+ "epoch": 4.7432624113475175,
+ "grad_norm": 2.978053331375122,
+ "learning_rate": 5.3001579076668e-07,
+ "loss": 0.3328,
+ "step": 10032
+ },
+ {
+ "epoch": 4.743735224586288,
+ "grad_norm": 3.3469338417053223,
+ "learning_rate": 5.296317722358235e-07,
+ "loss": 0.328,
+ "step": 10033
+ },
+ {
+ "epoch": 4.744208037825059,
+ "grad_norm": 3.1574513912200928,
+ "learning_rate": 5.29247876391156e-07,
+ "loss": 0.3375,
+ "step": 10034
+ },
+ {
+ "epoch": 4.74468085106383,
+ "grad_norm": 2.9314582347869873,
+ "learning_rate": 5.288641032565825e-07,
+ "loss": 0.3025,
+ "step": 10035
+ },
+ {
+ "epoch": 4.745153664302601,
+ "grad_norm": 3.298856258392334,
+ "learning_rate": 5.284804528559981e-07,
+ "loss": 0.3071,
+ "step": 10036
+ },
+ {
+ "epoch": 4.7456264775413715,
+ "grad_norm": 3.489758014678955,
+ "learning_rate": 5.280969252132903e-07,
+ "loss": 0.3392,
+ "step": 10037
+ },
+ {
+ "epoch": 4.746099290780142,
+ "grad_norm": 3.1727964878082275,
+ "learning_rate": 5.277135203523412e-07,
+ "loss": 0.3472,
+ "step": 10038
+ },
+ {
+ "epoch": 4.746572104018912,
+ "grad_norm": 3.267204761505127,
+ "learning_rate": 5.27330238297023e-07,
+ "loss": 0.3555,
+ "step": 10039
+ },
+ {
+ "epoch": 4.747044917257683,
+ "grad_norm": 3.376077175140381,
+ "learning_rate": 5.269470790712003e-07,
+ "loss": 0.4018,
+ "step": 10040
+ },
+ {
+ "epoch": 4.747517730496454,
+ "grad_norm": 3.2389678955078125,
+ "learning_rate": 5.265640426987321e-07,
+ "loss": 0.3742,
+ "step": 10041
+ },
+ {
+ "epoch": 4.7479905437352246,
+ "grad_norm": 3.0280439853668213,
+ "learning_rate": 5.261811292034668e-07,
+ "loss": 0.3254,
+ "step": 10042
+ },
+ {
+ "epoch": 4.748463356973995,
+ "grad_norm": 3.1756322383880615,
+ "learning_rate": 5.257983386092486e-07,
+ "loss": 0.3434,
+ "step": 10043
+ },
+ {
+ "epoch": 4.748936170212766,
+ "grad_norm": 3.220245599746704,
+ "learning_rate": 5.254156709399111e-07,
+ "loss": 0.3795,
+ "step": 10044
+ },
+ {
+ "epoch": 4.749408983451537,
+ "grad_norm": 3.4887516498565674,
+ "learning_rate": 5.250331262192815e-07,
+ "loss": 0.353,
+ "step": 10045
+ },
+ {
+ "epoch": 4.749881796690307,
+ "grad_norm": 3.1106226444244385,
+ "learning_rate": 5.246507044711791e-07,
+ "loss": 0.3329,
+ "step": 10046
+ },
+ {
+ "epoch": 4.750354609929078,
+ "grad_norm": 3.0493836402893066,
+ "learning_rate": 5.24268405719415e-07,
+ "loss": 0.3372,
+ "step": 10047
+ },
+ {
+ "epoch": 4.7508274231678485,
+ "grad_norm": 3.0885660648345947,
+ "learning_rate": 5.238862299877948e-07,
+ "loss": 0.3583,
+ "step": 10048
+ },
+ {
+ "epoch": 4.751300236406619,
+ "grad_norm": 3.194566011428833,
+ "learning_rate": 5.23504177300114e-07,
+ "loss": 0.3886,
+ "step": 10049
+ },
+ {
+ "epoch": 4.75177304964539,
+ "grad_norm": 2.9062368869781494,
+ "learning_rate": 5.231222476801606e-07,
+ "loss": 0.3267,
+ "step": 10050
+ },
+ {
+ "epoch": 4.752245862884161,
+ "grad_norm": 2.9814155101776123,
+ "learning_rate": 5.227404411517173e-07,
+ "loss": 0.3817,
+ "step": 10051
+ },
+ {
+ "epoch": 4.752718676122932,
+ "grad_norm": 3.526301383972168,
+ "learning_rate": 5.22358757738556e-07,
+ "loss": 0.3405,
+ "step": 10052
+ },
+ {
+ "epoch": 4.753191489361702,
+ "grad_norm": 3.2342031002044678,
+ "learning_rate": 5.219771974644439e-07,
+ "loss": 0.3429,
+ "step": 10053
+ },
+ {
+ "epoch": 4.753664302600473,
+ "grad_norm": 3.0213656425476074,
+ "learning_rate": 5.215957603531383e-07,
+ "loss": 0.3482,
+ "step": 10054
+ },
+ {
+ "epoch": 4.754137115839243,
+ "grad_norm": 3.566260576248169,
+ "learning_rate": 5.212144464283889e-07,
+ "loss": 0.3633,
+ "step": 10055
+ },
+ {
+ "epoch": 4.754609929078014,
+ "grad_norm": 3.3363420963287354,
+ "learning_rate": 5.208332557139398e-07,
+ "loss": 0.3528,
+ "step": 10056
+ },
+ {
+ "epoch": 4.755082742316785,
+ "grad_norm": 3.3407959938049316,
+ "learning_rate": 5.204521882335251e-07,
+ "loss": 0.3219,
+ "step": 10057
+ },
+ {
+ "epoch": 4.7555555555555555,
+ "grad_norm": 2.9756882190704346,
+ "learning_rate": 5.200712440108729e-07,
+ "loss": 0.3141,
+ "step": 10058
+ },
+ {
+ "epoch": 4.756028368794326,
+ "grad_norm": 3.7191832065582275,
+ "learning_rate": 5.19690423069703e-07,
+ "loss": 0.3657,
+ "step": 10059
+ },
+ {
+ "epoch": 4.756501182033097,
+ "grad_norm": 3.175494432449341,
+ "learning_rate": 5.193097254337268e-07,
+ "loss": 0.2922,
+ "step": 10060
+ },
+ {
+ "epoch": 4.756973995271868,
+ "grad_norm": 2.9288907051086426,
+ "learning_rate": 5.189291511266489e-07,
+ "loss": 0.3097,
+ "step": 10061
+ },
+ {
+ "epoch": 4.757446808510638,
+ "grad_norm": 3.1014389991760254,
+ "learning_rate": 5.185487001721656e-07,
+ "loss": 0.3443,
+ "step": 10062
+ },
+ {
+ "epoch": 4.757919621749409,
+ "grad_norm": 3.3224666118621826,
+ "learning_rate": 5.181683725939668e-07,
+ "loss": 0.3408,
+ "step": 10063
+ },
+ {
+ "epoch": 4.758392434988179,
+ "grad_norm": 3.248089075088501,
+ "learning_rate": 5.177881684157335e-07,
+ "loss": 0.366,
+ "step": 10064
+ },
+ {
+ "epoch": 4.75886524822695,
+ "grad_norm": 3.3183906078338623,
+ "learning_rate": 5.174080876611385e-07,
+ "loss": 0.3774,
+ "step": 10065
+ },
+ {
+ "epoch": 4.759338061465721,
+ "grad_norm": 3.1653311252593994,
+ "learning_rate": 5.17028130353849e-07,
+ "loss": 0.3208,
+ "step": 10066
+ },
+ {
+ "epoch": 4.759810874704492,
+ "grad_norm": 2.9300882816314697,
+ "learning_rate": 5.166482965175229e-07,
+ "loss": 0.3494,
+ "step": 10067
+ },
+ {
+ "epoch": 4.760283687943263,
+ "grad_norm": 3.504225254058838,
+ "learning_rate": 5.162685861758099e-07,
+ "loss": 0.3777,
+ "step": 10068
+ },
+ {
+ "epoch": 4.760756501182033,
+ "grad_norm": 3.3933908939361572,
+ "learning_rate": 5.158889993523544e-07,
+ "loss": 0.3575,
+ "step": 10069
+ },
+ {
+ "epoch": 4.761229314420804,
+ "grad_norm": 4.30021333694458,
+ "learning_rate": 5.155095360707901e-07,
+ "loss": 0.3435,
+ "step": 10070
+ },
+ {
+ "epoch": 4.761702127659574,
+ "grad_norm": 3.226658582687378,
+ "learning_rate": 5.151301963547462e-07,
+ "loss": 0.3473,
+ "step": 10071
+ },
+ {
+ "epoch": 4.762174940898345,
+ "grad_norm": 3.222884178161621,
+ "learning_rate": 5.14750980227841e-07,
+ "loss": 0.314,
+ "step": 10072
+ },
+ {
+ "epoch": 4.762647754137116,
+ "grad_norm": 3.077139377593994,
+ "learning_rate": 5.143718877136872e-07,
+ "loss": 0.2929,
+ "step": 10073
+ },
+ {
+ "epoch": 4.7631205673758865,
+ "grad_norm": 2.9789531230926514,
+ "learning_rate": 5.139929188358894e-07,
+ "loss": 0.3594,
+ "step": 10074
+ },
+ {
+ "epoch": 4.763593380614657,
+ "grad_norm": 3.558417797088623,
+ "learning_rate": 5.136140736180445e-07,
+ "loss": 0.356,
+ "step": 10075
+ },
+ {
+ "epoch": 4.764066193853428,
+ "grad_norm": 2.8887953758239746,
+ "learning_rate": 5.13235352083741e-07,
+ "loss": 0.2957,
+ "step": 10076
+ },
+ {
+ "epoch": 4.764539007092199,
+ "grad_norm": 3.187857151031494,
+ "learning_rate": 5.128567542565605e-07,
+ "loss": 0.3879,
+ "step": 10077
+ },
+ {
+ "epoch": 4.765011820330969,
+ "grad_norm": 3.761465072631836,
+ "learning_rate": 5.124782801600758e-07,
+ "loss": 0.3163,
+ "step": 10078
+ },
+ {
+ "epoch": 4.76548463356974,
+ "grad_norm": 3.4338560104370117,
+ "learning_rate": 5.120999298178541e-07,
+ "loss": 0.3924,
+ "step": 10079
+ },
+ {
+ "epoch": 4.76595744680851,
+ "grad_norm": 2.8551666736602783,
+ "learning_rate": 5.117217032534528e-07,
+ "loss": 0.329,
+ "step": 10080
+ },
+ {
+ "epoch": 4.766430260047281,
+ "grad_norm": 3.4713878631591797,
+ "learning_rate": 5.113436004904232e-07,
+ "loss": 0.3802,
+ "step": 10081
+ },
+ {
+ "epoch": 4.766903073286052,
+ "grad_norm": 3.1913888454437256,
+ "learning_rate": 5.109656215523076e-07,
+ "loss": 0.3273,
+ "step": 10082
+ },
+ {
+ "epoch": 4.767375886524823,
+ "grad_norm": 2.8070812225341797,
+ "learning_rate": 5.105877664626402e-07,
+ "loss": 0.3398,
+ "step": 10083
+ },
+ {
+ "epoch": 4.767848699763594,
+ "grad_norm": 3.316321849822998,
+ "learning_rate": 5.102100352449502e-07,
+ "loss": 0.3649,
+ "step": 10084
+ },
+ {
+ "epoch": 4.768321513002364,
+ "grad_norm": 3.3555870056152344,
+ "learning_rate": 5.098324279227557e-07,
+ "loss": 0.333,
+ "step": 10085
+ },
+ {
+ "epoch": 4.768794326241135,
+ "grad_norm": 3.0964810848236084,
+ "learning_rate": 5.094549445195699e-07,
+ "loss": 0.3384,
+ "step": 10086
+ },
+ {
+ "epoch": 4.769267139479905,
+ "grad_norm": 3.0406007766723633,
+ "learning_rate": 5.090775850588963e-07,
+ "loss": 0.3582,
+ "step": 10087
+ },
+ {
+ "epoch": 4.769739952718676,
+ "grad_norm": 2.934340238571167,
+ "learning_rate": 5.087003495642309e-07,
+ "loss": 0.3306,
+ "step": 10088
+ },
+ {
+ "epoch": 4.770212765957447,
+ "grad_norm": 3.441734552383423,
+ "learning_rate": 5.083232380590641e-07,
+ "loss": 0.386,
+ "step": 10089
+ },
+ {
+ "epoch": 4.7706855791962175,
+ "grad_norm": 3.176483631134033,
+ "learning_rate": 5.079462505668758e-07,
+ "loss": 0.3516,
+ "step": 10090
+ },
+ {
+ "epoch": 4.771158392434988,
+ "grad_norm": 3.1490824222564697,
+ "learning_rate": 5.075693871111395e-07,
+ "loss": 0.3233,
+ "step": 10091
+ },
+ {
+ "epoch": 4.771631205673759,
+ "grad_norm": 3.300335645675659,
+ "learning_rate": 5.07192647715321e-07,
+ "loss": 0.2975,
+ "step": 10092
+ },
+ {
+ "epoch": 4.77210401891253,
+ "grad_norm": 3.199085235595703,
+ "learning_rate": 5.068160324028776e-07,
+ "loss": 0.3468,
+ "step": 10093
+ },
+ {
+ "epoch": 4.7725768321513,
+ "grad_norm": 3.4611270427703857,
+ "learning_rate": 5.064395411972605e-07,
+ "loss": 0.3319,
+ "step": 10094
+ },
+ {
+ "epoch": 4.773049645390071,
+ "grad_norm": 3.0549957752227783,
+ "learning_rate": 5.060631741219119e-07,
+ "loss": 0.3542,
+ "step": 10095
+ },
+ {
+ "epoch": 4.773522458628841,
+ "grad_norm": 3.085744619369507,
+ "learning_rate": 5.056869312002655e-07,
+ "loss": 0.3611,
+ "step": 10096
+ },
+ {
+ "epoch": 4.773995271867612,
+ "grad_norm": 3.4383676052093506,
+ "learning_rate": 5.053108124557496e-07,
+ "loss": 0.3606,
+ "step": 10097
+ },
+ {
+ "epoch": 4.774468085106383,
+ "grad_norm": 2.8119592666625977,
+ "learning_rate": 5.049348179117825e-07,
+ "loss": 0.3192,
+ "step": 10098
+ },
+ {
+ "epoch": 4.774940898345154,
+ "grad_norm": 2.8554961681365967,
+ "learning_rate": 5.045589475917767e-07,
+ "loss": 0.321,
+ "step": 10099
+ },
+ {
+ "epoch": 4.775413711583925,
+ "grad_norm": 3.612732410430908,
+ "learning_rate": 5.041832015191356e-07,
+ "loss": 0.3385,
+ "step": 10100
+ },
+ {
+ "epoch": 4.775886524822695,
+ "grad_norm": 3.432650327682495,
+ "learning_rate": 5.038075797172543e-07,
+ "loss": 0.3494,
+ "step": 10101
+ },
+ {
+ "epoch": 4.776359338061466,
+ "grad_norm": 3.241612672805786,
+ "learning_rate": 5.034320822095228e-07,
+ "loss": 0.3377,
+ "step": 10102
+ },
+ {
+ "epoch": 4.776832151300236,
+ "grad_norm": 3.5062692165374756,
+ "learning_rate": 5.030567090193203e-07,
+ "loss": 0.4038,
+ "step": 10103
+ },
+ {
+ "epoch": 4.777304964539007,
+ "grad_norm": 2.9015917778015137,
+ "learning_rate": 5.026814601700205e-07,
+ "loss": 0.2987,
+ "step": 10104
+ },
+ {
+ "epoch": 4.777777777777778,
+ "grad_norm": 3.0691189765930176,
+ "learning_rate": 5.023063356849886e-07,
+ "loss": 0.3725,
+ "step": 10105
+ },
+ {
+ "epoch": 4.7782505910165485,
+ "grad_norm": 3.1556789875030518,
+ "learning_rate": 5.019313355875813e-07,
+ "loss": 0.3554,
+ "step": 10106
+ },
+ {
+ "epoch": 4.778723404255319,
+ "grad_norm": 2.84529447555542,
+ "learning_rate": 5.01556459901148e-07,
+ "loss": 0.3369,
+ "step": 10107
+ },
+ {
+ "epoch": 4.77919621749409,
+ "grad_norm": 3.322565793991089,
+ "learning_rate": 5.011817086490315e-07,
+ "loss": 0.3641,
+ "step": 10108
+ },
+ {
+ "epoch": 4.779669030732861,
+ "grad_norm": 3.011988639831543,
+ "learning_rate": 5.008070818545654e-07,
+ "loss": 0.3153,
+ "step": 10109
+ },
+ {
+ "epoch": 4.780141843971631,
+ "grad_norm": 2.937770128250122,
+ "learning_rate": 5.004325795410764e-07,
+ "loss": 0.3235,
+ "step": 10110
+ },
+ {
+ "epoch": 4.780614657210402,
+ "grad_norm": 3.0186142921447754,
+ "learning_rate": 5.00058201731882e-07,
+ "loss": 0.3443,
+ "step": 10111
+ },
+ {
+ "epoch": 4.781087470449172,
+ "grad_norm": 3.1810684204101562,
+ "learning_rate": 4.996839484502946e-07,
+ "loss": 0.3511,
+ "step": 10112
+ },
+ {
+ "epoch": 4.781560283687943,
+ "grad_norm": 3.5470240116119385,
+ "learning_rate": 4.993098197196167e-07,
+ "loss": 0.4096,
+ "step": 10113
+ },
+ {
+ "epoch": 4.782033096926714,
+ "grad_norm": 3.1422345638275146,
+ "learning_rate": 4.989358155631427e-07,
+ "loss": 0.3566,
+ "step": 10114
+ },
+ {
+ "epoch": 4.782505910165485,
+ "grad_norm": 3.3392271995544434,
+ "learning_rate": 4.985619360041619e-07,
+ "loss": 0.3278,
+ "step": 10115
+ },
+ {
+ "epoch": 4.782978723404256,
+ "grad_norm": 3.020026206970215,
+ "learning_rate": 4.981881810659525e-07,
+ "loss": 0.3349,
+ "step": 10116
+ },
+ {
+ "epoch": 4.783451536643026,
+ "grad_norm": 3.061652660369873,
+ "learning_rate": 4.97814550771788e-07,
+ "loss": 0.3275,
+ "step": 10117
+ },
+ {
+ "epoch": 4.783924349881797,
+ "grad_norm": 3.5875346660614014,
+ "learning_rate": 4.974410451449321e-07,
+ "loss": 0.3694,
+ "step": 10118
+ },
+ {
+ "epoch": 4.784397163120567,
+ "grad_norm": 3.848348379135132,
+ "learning_rate": 4.970676642086408e-07,
+ "loss": 0.3539,
+ "step": 10119
+ },
+ {
+ "epoch": 4.784869976359338,
+ "grad_norm": 3.237959146499634,
+ "learning_rate": 4.966944079861641e-07,
+ "loss": 0.3468,
+ "step": 10120
+ },
+ {
+ "epoch": 4.785342789598109,
+ "grad_norm": 3.4829745292663574,
+ "learning_rate": 4.96321276500742e-07,
+ "loss": 0.348,
+ "step": 10121
+ },
+ {
+ "epoch": 4.7858156028368795,
+ "grad_norm": 3.29961895942688,
+ "learning_rate": 4.959482697756085e-07,
+ "loss": 0.3499,
+ "step": 10122
+ },
+ {
+ "epoch": 4.78628841607565,
+ "grad_norm": 3.291260242462158,
+ "learning_rate": 4.955753878339886e-07,
+ "loss": 0.3525,
+ "step": 10123
+ },
+ {
+ "epoch": 4.786761229314421,
+ "grad_norm": 3.543893575668335,
+ "learning_rate": 4.952026306991004e-07,
+ "loss": 0.4274,
+ "step": 10124
+ },
+ {
+ "epoch": 4.787234042553192,
+ "grad_norm": 3.551354169845581,
+ "learning_rate": 4.948299983941534e-07,
+ "loss": 0.3116,
+ "step": 10125
+ },
+ {
+ "epoch": 4.787706855791962,
+ "grad_norm": 3.1988296508789062,
+ "learning_rate": 4.944574909423497e-07,
+ "loss": 0.3273,
+ "step": 10126
+ },
+ {
+ "epoch": 4.7881796690307326,
+ "grad_norm": 2.8899428844451904,
+ "learning_rate": 4.940851083668843e-07,
+ "loss": 0.3518,
+ "step": 10127
+ },
+ {
+ "epoch": 4.788652482269503,
+ "grad_norm": 3.279688835144043,
+ "learning_rate": 4.937128506909439e-07,
+ "loss": 0.3735,
+ "step": 10128
+ },
+ {
+ "epoch": 4.789125295508274,
+ "grad_norm": 3.0784502029418945,
+ "learning_rate": 4.933407179377059e-07,
+ "loss": 0.327,
+ "step": 10129
+ },
+ {
+ "epoch": 4.789598108747045,
+ "grad_norm": 3.390169858932495,
+ "learning_rate": 4.929687101303435e-07,
+ "loss": 0.3895,
+ "step": 10130
+ },
+ {
+ "epoch": 4.790070921985816,
+ "grad_norm": 3.72928524017334,
+ "learning_rate": 4.925968272920181e-07,
+ "loss": 0.3598,
+ "step": 10131
+ },
+ {
+ "epoch": 4.7905437352245865,
+ "grad_norm": 3.3786826133728027,
+ "learning_rate": 4.922250694458866e-07,
+ "loss": 0.363,
+ "step": 10132
+ },
+ {
+ "epoch": 4.791016548463357,
+ "grad_norm": 3.086150884628296,
+ "learning_rate": 4.918534366150965e-07,
+ "loss": 0.2877,
+ "step": 10133
+ },
+ {
+ "epoch": 4.791489361702128,
+ "grad_norm": 3.3568673133850098,
+ "learning_rate": 4.914819288227865e-07,
+ "loss": 0.3153,
+ "step": 10134
+ },
+ {
+ "epoch": 4.791962174940898,
+ "grad_norm": 3.294382095336914,
+ "learning_rate": 4.911105460920904e-07,
+ "loss": 0.3327,
+ "step": 10135
+ },
+ {
+ "epoch": 4.792434988179669,
+ "grad_norm": 3.0562479496002197,
+ "learning_rate": 4.907392884461321e-07,
+ "loss": 0.3368,
+ "step": 10136
+ },
+ {
+ "epoch": 4.79290780141844,
+ "grad_norm": 2.928912878036499,
+ "learning_rate": 4.90368155908027e-07,
+ "loss": 0.295,
+ "step": 10137
+ },
+ {
+ "epoch": 4.79338061465721,
+ "grad_norm": 3.0252797603607178,
+ "learning_rate": 4.899971485008858e-07,
+ "loss": 0.2985,
+ "step": 10138
+ },
+ {
+ "epoch": 4.793853427895981,
+ "grad_norm": 2.830035924911499,
+ "learning_rate": 4.896262662478085e-07,
+ "loss": 0.3518,
+ "step": 10139
+ },
+ {
+ "epoch": 4.794326241134752,
+ "grad_norm": 3.042524576187134,
+ "learning_rate": 4.892555091718884e-07,
+ "loss": 0.2871,
+ "step": 10140
+ },
+ {
+ "epoch": 4.794799054373523,
+ "grad_norm": 2.920741558074951,
+ "learning_rate": 4.888848772962107e-07,
+ "loss": 0.3234,
+ "step": 10141
+ },
+ {
+ "epoch": 4.795271867612293,
+ "grad_norm": 3.3935956954956055,
+ "learning_rate": 4.885143706438527e-07,
+ "loss": 0.3612,
+ "step": 10142
+ },
+ {
+ "epoch": 4.7957446808510635,
+ "grad_norm": 3.1501455307006836,
+ "learning_rate": 4.881439892378853e-07,
+ "loss": 0.3239,
+ "step": 10143
+ },
+ {
+ "epoch": 4.796217494089834,
+ "grad_norm": 3.233794927597046,
+ "learning_rate": 4.877737331013696e-07,
+ "loss": 0.3185,
+ "step": 10144
+ },
+ {
+ "epoch": 4.796690307328605,
+ "grad_norm": 3.1155240535736084,
+ "learning_rate": 4.874036022573605e-07,
+ "loss": 0.3128,
+ "step": 10145
+ },
+ {
+ "epoch": 4.797163120567376,
+ "grad_norm": 3.313546895980835,
+ "learning_rate": 4.870335967289042e-07,
+ "loss": 0.3136,
+ "step": 10146
+ },
+ {
+ "epoch": 4.797635933806147,
+ "grad_norm": 5.024696350097656,
+ "learning_rate": 4.866637165390387e-07,
+ "loss": 0.4032,
+ "step": 10147
+ },
+ {
+ "epoch": 4.7981087470449175,
+ "grad_norm": 3.109086275100708,
+ "learning_rate": 4.862939617107959e-07,
+ "loss": 0.3822,
+ "step": 10148
+ },
+ {
+ "epoch": 4.798581560283688,
+ "grad_norm": 3.144777536392212,
+ "learning_rate": 4.859243322671978e-07,
+ "loss": 0.3362,
+ "step": 10149
+ },
+ {
+ "epoch": 4.799054373522459,
+ "grad_norm": 3.402974843978882,
+ "learning_rate": 4.855548282312605e-07,
+ "loss": 0.3803,
+ "step": 10150
+ },
+ {
+ "epoch": 4.799527186761229,
+ "grad_norm": 2.6077685356140137,
+ "learning_rate": 4.851854496259911e-07,
+ "loss": 0.3043,
+ "step": 10151
+ },
+ {
+ "epoch": 4.8,
+ "grad_norm": 3.38386607170105,
+ "learning_rate": 4.848161964743883e-07,
+ "loss": 0.3129,
+ "step": 10152
+ },
+ {
+ "epoch": 4.800472813238771,
+ "grad_norm": 3.193723440170288,
+ "learning_rate": 4.844470687994454e-07,
+ "loss": 0.3544,
+ "step": 10153
+ },
+ {
+ "epoch": 4.800945626477541,
+ "grad_norm": 2.9620895385742188,
+ "learning_rate": 4.840780666241457e-07,
+ "loss": 0.3376,
+ "step": 10154
+ },
+ {
+ "epoch": 4.801418439716312,
+ "grad_norm": 4.192742824554443,
+ "learning_rate": 4.83709189971465e-07,
+ "loss": 0.3325,
+ "step": 10155
+ },
+ {
+ "epoch": 4.801891252955083,
+ "grad_norm": 2.996617555618286,
+ "learning_rate": 4.83340438864372e-07,
+ "loss": 0.3237,
+ "step": 10156
+ },
+ {
+ "epoch": 4.802364066193854,
+ "grad_norm": 3.255037307739258,
+ "learning_rate": 4.829718133258263e-07,
+ "loss": 0.3575,
+ "step": 10157
+ },
+ {
+ "epoch": 4.802836879432624,
+ "grad_norm": 3.1065316200256348,
+ "learning_rate": 4.826033133787822e-07,
+ "loss": 0.347,
+ "step": 10158
+ },
+ {
+ "epoch": 4.8033096926713945,
+ "grad_norm": 3.321096420288086,
+ "learning_rate": 4.822349390461831e-07,
+ "loss": 0.3628,
+ "step": 10159
+ },
+ {
+ "epoch": 4.803782505910165,
+ "grad_norm": 3.549182653427124,
+ "learning_rate": 4.818666903509672e-07,
+ "loss": 0.3539,
+ "step": 10160
+ },
+ {
+ "epoch": 4.804255319148936,
+ "grad_norm": 2.9063286781311035,
+ "learning_rate": 4.814985673160633e-07,
+ "loss": 0.2956,
+ "step": 10161
+ },
+ {
+ "epoch": 4.804728132387707,
+ "grad_norm": 3.1669399738311768,
+ "learning_rate": 4.81130569964392e-07,
+ "loss": 0.3263,
+ "step": 10162
+ },
+ {
+ "epoch": 4.805200945626478,
+ "grad_norm": 3.667128562927246,
+ "learning_rate": 4.807626983188684e-07,
+ "loss": 0.3514,
+ "step": 10163
+ },
+ {
+ "epoch": 4.8056737588652485,
+ "grad_norm": 3.1469576358795166,
+ "learning_rate": 4.803949524023976e-07,
+ "loss": 0.3273,
+ "step": 10164
+ },
+ {
+ "epoch": 4.806146572104019,
+ "grad_norm": 3.6988110542297363,
+ "learning_rate": 4.800273322378768e-07,
+ "loss": 0.3293,
+ "step": 10165
+ },
+ {
+ "epoch": 4.80661938534279,
+ "grad_norm": 3.6419219970703125,
+ "learning_rate": 4.79659837848197e-07,
+ "loss": 0.3696,
+ "step": 10166
+ },
+ {
+ "epoch": 4.80709219858156,
+ "grad_norm": 3.4860944747924805,
+ "learning_rate": 4.792924692562398e-07,
+ "loss": 0.3372,
+ "step": 10167
+ },
+ {
+ "epoch": 4.807565011820331,
+ "grad_norm": 2.879600763320923,
+ "learning_rate": 4.789252264848806e-07,
+ "loss": 0.3192,
+ "step": 10168
+ },
+ {
+ "epoch": 4.808037825059102,
+ "grad_norm": 3.4475104808807373,
+ "learning_rate": 4.785581095569855e-07,
+ "loss": 0.3285,
+ "step": 10169
+ },
+ {
+ "epoch": 4.808510638297872,
+ "grad_norm": 3.528397560119629,
+ "learning_rate": 4.78191118495413e-07,
+ "loss": 0.3612,
+ "step": 10170
+ },
+ {
+ "epoch": 4.808983451536643,
+ "grad_norm": 3.056796073913574,
+ "learning_rate": 4.778242533230138e-07,
+ "loss": 0.3077,
+ "step": 10171
+ },
+ {
+ "epoch": 4.809456264775414,
+ "grad_norm": 3.302171230316162,
+ "learning_rate": 4.774575140626317e-07,
+ "loss": 0.2963,
+ "step": 10172
+ },
+ {
+ "epoch": 4.809929078014184,
+ "grad_norm": 3.1446237564086914,
+ "learning_rate": 4.770909007371016e-07,
+ "loss": 0.3438,
+ "step": 10173
+ },
+ {
+ "epoch": 4.810401891252955,
+ "grad_norm": 2.917919635772705,
+ "learning_rate": 4.767244133692511e-07,
+ "loss": 0.3353,
+ "step": 10174
+ },
+ {
+ "epoch": 4.8108747044917255,
+ "grad_norm": 3.0808987617492676,
+ "learning_rate": 4.763580519818989e-07,
+ "loss": 0.3574,
+ "step": 10175
+ },
+ {
+ "epoch": 4.811347517730496,
+ "grad_norm": 3.2861616611480713,
+ "learning_rate": 4.75991816597858e-07,
+ "loss": 0.3891,
+ "step": 10176
+ },
+ {
+ "epoch": 4.811820330969267,
+ "grad_norm": 2.835925340652466,
+ "learning_rate": 4.7562570723993116e-07,
+ "loss": 0.3785,
+ "step": 10177
+ },
+ {
+ "epoch": 4.812293144208038,
+ "grad_norm": 3.5441393852233887,
+ "learning_rate": 4.7525972393091534e-07,
+ "loss": 0.3914,
+ "step": 10178
+ },
+ {
+ "epoch": 4.812765957446809,
+ "grad_norm": 3.94022798538208,
+ "learning_rate": 4.748938666935984e-07,
+ "loss": 0.3564,
+ "step": 10179
+ },
+ {
+ "epoch": 4.8132387706855795,
+ "grad_norm": 3.1686532497406006,
+ "learning_rate": 4.7452813555076e-07,
+ "loss": 0.3348,
+ "step": 10180
+ },
+ {
+ "epoch": 4.81371158392435,
+ "grad_norm": 3.534032106399536,
+ "learning_rate": 4.7416253052517374e-07,
+ "loss": 0.3165,
+ "step": 10181
+ },
+ {
+ "epoch": 4.81418439716312,
+ "grad_norm": 3.1169021129608154,
+ "learning_rate": 4.7379705163960317e-07,
+ "loss": 0.3515,
+ "step": 10182
+ },
+ {
+ "epoch": 4.814657210401891,
+ "grad_norm": 3.564509391784668,
+ "learning_rate": 4.7343169891680585e-07,
+ "loss": 0.4195,
+ "step": 10183
+ },
+ {
+ "epoch": 4.815130023640662,
+ "grad_norm": 3.2813005447387695,
+ "learning_rate": 4.7306647237953085e-07,
+ "loss": 0.3574,
+ "step": 10184
+ },
+ {
+ "epoch": 4.815602836879433,
+ "grad_norm": 3.053349018096924,
+ "learning_rate": 4.727013720505177e-07,
+ "loss": 0.3792,
+ "step": 10185
+ },
+ {
+ "epoch": 4.816075650118203,
+ "grad_norm": 3.069258689880371,
+ "learning_rate": 4.723363979525017e-07,
+ "loss": 0.3377,
+ "step": 10186
+ },
+ {
+ "epoch": 4.816548463356974,
+ "grad_norm": 2.999802350997925,
+ "learning_rate": 4.71971550108207e-07,
+ "loss": 0.3232,
+ "step": 10187
+ },
+ {
+ "epoch": 4.817021276595745,
+ "grad_norm": 2.941810131072998,
+ "learning_rate": 4.7160682854035107e-07,
+ "loss": 0.342,
+ "step": 10188
+ },
+ {
+ "epoch": 4.817494089834515,
+ "grad_norm": 3.407975196838379,
+ "learning_rate": 4.71242233271644e-07,
+ "loss": 0.3697,
+ "step": 10189
+ },
+ {
+ "epoch": 4.817966903073286,
+ "grad_norm": 3.148359537124634,
+ "learning_rate": 4.708777643247864e-07,
+ "loss": 0.3297,
+ "step": 10190
+ },
+ {
+ "epoch": 4.8184397163120565,
+ "grad_norm": 2.9067797660827637,
+ "learning_rate": 4.7051342172247354e-07,
+ "loss": 0.2646,
+ "step": 10191
+ },
+ {
+ "epoch": 4.818912529550827,
+ "grad_norm": 3.4185385704040527,
+ "learning_rate": 4.70149205487391e-07,
+ "loss": 0.3296,
+ "step": 10192
+ },
+ {
+ "epoch": 4.819385342789598,
+ "grad_norm": 3.363966464996338,
+ "learning_rate": 4.697851156422162e-07,
+ "loss": 0.3744,
+ "step": 10193
+ },
+ {
+ "epoch": 4.819858156028369,
+ "grad_norm": 2.944939613342285,
+ "learning_rate": 4.6942115220962067e-07,
+ "loss": 0.3311,
+ "step": 10194
+ },
+ {
+ "epoch": 4.82033096926714,
+ "grad_norm": 3.2023603916168213,
+ "learning_rate": 4.6905731521226544e-07,
+ "loss": 0.3114,
+ "step": 10195
+ },
+ {
+ "epoch": 4.8208037825059105,
+ "grad_norm": 2.9747812747955322,
+ "learning_rate": 4.686936046728063e-07,
+ "loss": 0.2891,
+ "step": 10196
+ },
+ {
+ "epoch": 4.821276595744681,
+ "grad_norm": 3.6693246364593506,
+ "learning_rate": 4.6833002061388965e-07,
+ "loss": 0.3758,
+ "step": 10197
+ },
+ {
+ "epoch": 4.821749408983451,
+ "grad_norm": 3.4812891483306885,
+ "learning_rate": 4.679665630581534e-07,
+ "loss": 0.3274,
+ "step": 10198
+ },
+ {
+ "epoch": 4.822222222222222,
+ "grad_norm": 2.888956308364868,
+ "learning_rate": 4.676032320282295e-07,
+ "loss": 0.3304,
+ "step": 10199
+ },
+ {
+ "epoch": 4.822695035460993,
+ "grad_norm": 3.2659964561462402,
+ "learning_rate": 4.6724002754674006e-07,
+ "loss": 0.3267,
+ "step": 10200
+ },
+ {
+ "epoch": 4.823167848699764,
+ "grad_norm": 3.2733213901519775,
+ "learning_rate": 4.6687694963630127e-07,
+ "loss": 0.3067,
+ "step": 10201
+ },
+ {
+ "epoch": 4.823640661938534,
+ "grad_norm": 3.0957846641540527,
+ "learning_rate": 4.6651399831951995e-07,
+ "loss": 0.3586,
+ "step": 10202
+ },
+ {
+ "epoch": 4.824113475177305,
+ "grad_norm": 2.9597535133361816,
+ "learning_rate": 4.6615117361899526e-07,
+ "loss": 0.3409,
+ "step": 10203
+ },
+ {
+ "epoch": 4.824586288416076,
+ "grad_norm": 3.0622851848602295,
+ "learning_rate": 4.657884755573189e-07,
+ "loss": 0.3112,
+ "step": 10204
+ },
+ {
+ "epoch": 4.825059101654846,
+ "grad_norm": 3.088568925857544,
+ "learning_rate": 4.6542590415707355e-07,
+ "loss": 0.3161,
+ "step": 10205
+ },
+ {
+ "epoch": 4.825531914893617,
+ "grad_norm": 3.2927064895629883,
+ "learning_rate": 4.650634594408368e-07,
+ "loss": 0.3368,
+ "step": 10206
+ },
+ {
+ "epoch": 4.8260047281323875,
+ "grad_norm": 2.9728758335113525,
+ "learning_rate": 4.647011414311753e-07,
+ "loss": 0.3615,
+ "step": 10207
+ },
+ {
+ "epoch": 4.826477541371158,
+ "grad_norm": 3.301173686981201,
+ "learning_rate": 4.643389501506487e-07,
+ "loss": 0.3597,
+ "step": 10208
+ },
+ {
+ "epoch": 4.826950354609929,
+ "grad_norm": 3.421177864074707,
+ "learning_rate": 4.639768856218102e-07,
+ "loss": 0.3087,
+ "step": 10209
+ },
+ {
+ "epoch": 4.8274231678487,
+ "grad_norm": 3.1131463050842285,
+ "learning_rate": 4.636149478672031e-07,
+ "loss": 0.3776,
+ "step": 10210
+ },
+ {
+ "epoch": 4.827895981087471,
+ "grad_norm": 3.5807228088378906,
+ "learning_rate": 4.6325313690936347e-07,
+ "loss": 0.3556,
+ "step": 10211
+ },
+ {
+ "epoch": 4.828368794326241,
+ "grad_norm": 3.2873311042785645,
+ "learning_rate": 4.6289145277082085e-07,
+ "loss": 0.377,
+ "step": 10212
+ },
+ {
+ "epoch": 4.828841607565012,
+ "grad_norm": 3.502228021621704,
+ "learning_rate": 4.6252989547409423e-07,
+ "loss": 0.3699,
+ "step": 10213
+ },
+ {
+ "epoch": 4.829314420803782,
+ "grad_norm": 3.8895792961120605,
+ "learning_rate": 4.621684650416977e-07,
+ "loss": 0.3594,
+ "step": 10214
+ },
+ {
+ "epoch": 4.829787234042553,
+ "grad_norm": 3.11706805229187,
+ "learning_rate": 4.6180716149613505e-07,
+ "loss": 0.3404,
+ "step": 10215
+ },
+ {
+ "epoch": 4.830260047281324,
+ "grad_norm": 3.174584150314331,
+ "learning_rate": 4.614459848599029e-07,
+ "loss": 0.3684,
+ "step": 10216
+ },
+ {
+ "epoch": 4.8307328605200945,
+ "grad_norm": 3.0028135776519775,
+ "learning_rate": 4.610849351554908e-07,
+ "loss": 0.3505,
+ "step": 10217
+ },
+ {
+ "epoch": 4.831205673758865,
+ "grad_norm": 3.053354024887085,
+ "learning_rate": 4.6072401240537965e-07,
+ "loss": 0.392,
+ "step": 10218
+ },
+ {
+ "epoch": 4.831678486997636,
+ "grad_norm": 3.2726800441741943,
+ "learning_rate": 4.603632166320424e-07,
+ "loss": 0.3506,
+ "step": 10219
+ },
+ {
+ "epoch": 4.832151300236407,
+ "grad_norm": 3.5746219158172607,
+ "learning_rate": 4.600025478579437e-07,
+ "loss": 0.3585,
+ "step": 10220
+ },
+ {
+ "epoch": 4.832624113475177,
+ "grad_norm": 3.0742499828338623,
+ "learning_rate": 4.596420061055409e-07,
+ "loss": 0.332,
+ "step": 10221
+ },
+ {
+ "epoch": 4.833096926713948,
+ "grad_norm": 3.161022186279297,
+ "learning_rate": 4.5928159139728426e-07,
+ "loss": 0.3303,
+ "step": 10222
+ },
+ {
+ "epoch": 4.833569739952718,
+ "grad_norm": 3.2312185764312744,
+ "learning_rate": 4.5892130375561395e-07,
+ "loss": 0.3532,
+ "step": 10223
+ },
+ {
+ "epoch": 4.834042553191489,
+ "grad_norm": 3.501893997192383,
+ "learning_rate": 4.585611432029649e-07,
+ "loss": 0.3862,
+ "step": 10224
+ },
+ {
+ "epoch": 4.83451536643026,
+ "grad_norm": 3.2005560398101807,
+ "learning_rate": 4.5820110976176194e-07,
+ "loss": 0.3626,
+ "step": 10225
+ },
+ {
+ "epoch": 4.834988179669031,
+ "grad_norm": 3.4039556980133057,
+ "learning_rate": 4.578412034544225e-07,
+ "loss": 0.3789,
+ "step": 10226
+ },
+ {
+ "epoch": 4.835460992907802,
+ "grad_norm": 3.4461448192596436,
+ "learning_rate": 4.574814243033571e-07,
+ "loss": 0.3714,
+ "step": 10227
+ },
+ {
+ "epoch": 4.835933806146572,
+ "grad_norm": 3.435886859893799,
+ "learning_rate": 4.571217723309665e-07,
+ "loss": 0.3015,
+ "step": 10228
+ },
+ {
+ "epoch": 4.836406619385343,
+ "grad_norm": 3.612645149230957,
+ "learning_rate": 4.567622475596462e-07,
+ "loss": 0.3738,
+ "step": 10229
+ },
+ {
+ "epoch": 4.836879432624113,
+ "grad_norm": 3.1911067962646484,
+ "learning_rate": 4.564028500117815e-07,
+ "loss": 0.2994,
+ "step": 10230
+ },
+ {
+ "epoch": 4.837352245862884,
+ "grad_norm": 2.9745163917541504,
+ "learning_rate": 4.5604357970974956e-07,
+ "loss": 0.3353,
+ "step": 10231
+ },
+ {
+ "epoch": 4.837825059101655,
+ "grad_norm": 3.4999606609344482,
+ "learning_rate": 4.556844366759222e-07,
+ "loss": 0.3796,
+ "step": 10232
+ },
+ {
+ "epoch": 4.8382978723404255,
+ "grad_norm": 2.8130152225494385,
+ "learning_rate": 4.553254209326607e-07,
+ "loss": 0.2964,
+ "step": 10233
+ },
+ {
+ "epoch": 4.838770685579196,
+ "grad_norm": 3.5461673736572266,
+ "learning_rate": 4.5496653250232005e-07,
+ "loss": 0.3626,
+ "step": 10234
+ },
+ {
+ "epoch": 4.839243498817967,
+ "grad_norm": 3.3498404026031494,
+ "learning_rate": 4.546077714072458e-07,
+ "loss": 0.2982,
+ "step": 10235
+ },
+ {
+ "epoch": 4.839716312056738,
+ "grad_norm": 2.8942501544952393,
+ "learning_rate": 4.5424913766977635e-07,
+ "loss": 0.2447,
+ "step": 10236
+ },
+ {
+ "epoch": 4.840189125295508,
+ "grad_norm": 3.3506743907928467,
+ "learning_rate": 4.5389063131224346e-07,
+ "loss": 0.2908,
+ "step": 10237
+ },
+ {
+ "epoch": 4.840661938534279,
+ "grad_norm": 3.058872699737549,
+ "learning_rate": 4.535322523569691e-07,
+ "loss": 0.3275,
+ "step": 10238
+ },
+ {
+ "epoch": 4.841134751773049,
+ "grad_norm": 3.0573856830596924,
+ "learning_rate": 4.5317400082626696e-07,
+ "loss": 0.3096,
+ "step": 10239
+ },
+ {
+ "epoch": 4.84160756501182,
+ "grad_norm": 3.3260257244110107,
+ "learning_rate": 4.5281587674244563e-07,
+ "loss": 0.3334,
+ "step": 10240
+ },
+ {
+ "epoch": 4.842080378250591,
+ "grad_norm": 3.265740156173706,
+ "learning_rate": 4.5245788012780234e-07,
+ "loss": 0.3698,
+ "step": 10241
+ },
+ {
+ "epoch": 4.842553191489362,
+ "grad_norm": 3.4116036891937256,
+ "learning_rate": 4.521000110046292e-07,
+ "loss": 0.4159,
+ "step": 10242
+ },
+ {
+ "epoch": 4.843026004728133,
+ "grad_norm": 3.3263189792633057,
+ "learning_rate": 4.5174226939520865e-07,
+ "loss": 0.3579,
+ "step": 10243
+ },
+ {
+ "epoch": 4.843498817966903,
+ "grad_norm": 3.4223177433013916,
+ "learning_rate": 4.5138465532181514e-07,
+ "loss": 0.3539,
+ "step": 10244
+ },
+ {
+ "epoch": 4.843971631205674,
+ "grad_norm": 3.481016159057617,
+ "learning_rate": 4.5102716880671665e-07,
+ "loss": 0.3527,
+ "step": 10245
+ },
+ {
+ "epoch": 4.844444444444444,
+ "grad_norm": 2.830122232437134,
+ "learning_rate": 4.5066980987217124e-07,
+ "loss": 0.3339,
+ "step": 10246
+ },
+ {
+ "epoch": 4.844917257683215,
+ "grad_norm": 2.895792007446289,
+ "learning_rate": 4.5031257854043163e-07,
+ "loss": 0.3056,
+ "step": 10247
+ },
+ {
+ "epoch": 4.845390070921986,
+ "grad_norm": 2.9748036861419678,
+ "learning_rate": 4.499554748337398e-07,
+ "loss": 0.2794,
+ "step": 10248
+ },
+ {
+ "epoch": 4.8458628841607565,
+ "grad_norm": 3.223539113998413,
+ "learning_rate": 4.49598498774331e-07,
+ "loss": 0.3756,
+ "step": 10249
+ },
+ {
+ "epoch": 4.846335697399527,
+ "grad_norm": 3.491365432739258,
+ "learning_rate": 4.492416503844335e-07,
+ "loss": 0.378,
+ "step": 10250
+ },
+ {
+ "epoch": 4.846808510638298,
+ "grad_norm": 2.7236695289611816,
+ "learning_rate": 4.48884929686266e-07,
+ "loss": 0.297,
+ "step": 10251
+ },
+ {
+ "epoch": 4.847281323877069,
+ "grad_norm": 3.3814051151275635,
+ "learning_rate": 4.4852833670204045e-07,
+ "loss": 0.2923,
+ "step": 10252
+ },
+ {
+ "epoch": 4.847754137115839,
+ "grad_norm": 3.168334722518921,
+ "learning_rate": 4.4817187145395956e-07,
+ "loss": 0.3455,
+ "step": 10253
+ },
+ {
+ "epoch": 4.84822695035461,
+ "grad_norm": 3.0346829891204834,
+ "learning_rate": 4.4781553396421873e-07,
+ "loss": 0.3416,
+ "step": 10254
+ },
+ {
+ "epoch": 4.84869976359338,
+ "grad_norm": 3.1232426166534424,
+ "learning_rate": 4.4745932425500657e-07,
+ "loss": 0.3494,
+ "step": 10255
+ },
+ {
+ "epoch": 4.849172576832151,
+ "grad_norm": 3.0737383365631104,
+ "learning_rate": 4.471032423485017e-07,
+ "loss": 0.3246,
+ "step": 10256
+ },
+ {
+ "epoch": 4.849645390070922,
+ "grad_norm": 3.421461582183838,
+ "learning_rate": 4.467472882668769e-07,
+ "loss": 0.3807,
+ "step": 10257
+ },
+ {
+ "epoch": 4.850118203309693,
+ "grad_norm": 3.3846490383148193,
+ "learning_rate": 4.463914620322951e-07,
+ "loss": 0.3695,
+ "step": 10258
+ },
+ {
+ "epoch": 4.850591016548464,
+ "grad_norm": 4.0876007080078125,
+ "learning_rate": 4.460357636669116e-07,
+ "loss": 0.3913,
+ "step": 10259
+ },
+ {
+ "epoch": 4.851063829787234,
+ "grad_norm": 3.2078847885131836,
+ "learning_rate": 4.456801931928753e-07,
+ "loss": 0.3424,
+ "step": 10260
+ },
+ {
+ "epoch": 4.851536643026005,
+ "grad_norm": 3.4163241386413574,
+ "learning_rate": 4.453247506323255e-07,
+ "loss": 0.3907,
+ "step": 10261
+ },
+ {
+ "epoch": 4.852009456264775,
+ "grad_norm": 2.989793539047241,
+ "learning_rate": 4.449694360073931e-07,
+ "loss": 0.3313,
+ "step": 10262
+ },
+ {
+ "epoch": 4.852482269503546,
+ "grad_norm": 3.291537284851074,
+ "learning_rate": 4.446142493402039e-07,
+ "loss": 0.3594,
+ "step": 10263
+ },
+ {
+ "epoch": 4.852955082742317,
+ "grad_norm": 3.6327221393585205,
+ "learning_rate": 4.4425919065287204e-07,
+ "loss": 0.3844,
+ "step": 10264
+ },
+ {
+ "epoch": 4.8534278959810875,
+ "grad_norm": 3.486333131790161,
+ "learning_rate": 4.439042599675067e-07,
+ "loss": 0.3666,
+ "step": 10265
+ },
+ {
+ "epoch": 4.853900709219858,
+ "grad_norm": 3.7585315704345703,
+ "learning_rate": 4.435494573062074e-07,
+ "loss": 0.3287,
+ "step": 10266
+ },
+ {
+ "epoch": 4.854373522458629,
+ "grad_norm": 3.3496108055114746,
+ "learning_rate": 4.4319478269106625e-07,
+ "loss": 0.4021,
+ "step": 10267
+ },
+ {
+ "epoch": 4.8548463356974,
+ "grad_norm": 3.4681267738342285,
+ "learning_rate": 4.428402361441672e-07,
+ "loss": 0.3119,
+ "step": 10268
+ },
+ {
+ "epoch": 4.85531914893617,
+ "grad_norm": 2.9935829639434814,
+ "learning_rate": 4.4248581768758567e-07,
+ "loss": 0.305,
+ "step": 10269
+ },
+ {
+ "epoch": 4.855791962174941,
+ "grad_norm": 3.5839056968688965,
+ "learning_rate": 4.42131527343391e-07,
+ "loss": 0.4095,
+ "step": 10270
+ },
+ {
+ "epoch": 4.856264775413711,
+ "grad_norm": 3.088690757751465,
+ "learning_rate": 4.4177736513364237e-07,
+ "loss": 0.3391,
+ "step": 10271
+ },
+ {
+ "epoch": 4.856737588652482,
+ "grad_norm": 3.2721431255340576,
+ "learning_rate": 4.414233310803917e-07,
+ "loss": 0.3741,
+ "step": 10272
+ },
+ {
+ "epoch": 4.857210401891253,
+ "grad_norm": 3.108041524887085,
+ "learning_rate": 4.4106942520568437e-07,
+ "loss": 0.4041,
+ "step": 10273
+ },
+ {
+ "epoch": 4.857683215130024,
+ "grad_norm": 3.0035696029663086,
+ "learning_rate": 4.407156475315549e-07,
+ "loss": 0.3408,
+ "step": 10274
+ },
+ {
+ "epoch": 4.858156028368795,
+ "grad_norm": 3.0572783946990967,
+ "learning_rate": 4.4036199808003334e-07,
+ "loss": 0.3207,
+ "step": 10275
+ },
+ {
+ "epoch": 4.858628841607565,
+ "grad_norm": 3.1695926189422607,
+ "learning_rate": 4.4000847687313857e-07,
+ "loss": 0.3605,
+ "step": 10276
+ },
+ {
+ "epoch": 4.859101654846336,
+ "grad_norm": 3.690382957458496,
+ "learning_rate": 4.396550839328828e-07,
+ "loss": 0.4076,
+ "step": 10277
+ },
+ {
+ "epoch": 4.859574468085106,
+ "grad_norm": 3.271988868713379,
+ "learning_rate": 4.393018192812712e-07,
+ "loss": 0.4169,
+ "step": 10278
+ },
+ {
+ "epoch": 4.860047281323877,
+ "grad_norm": 2.8622982501983643,
+ "learning_rate": 4.389486829402986e-07,
+ "loss": 0.3114,
+ "step": 10279
+ },
+ {
+ "epoch": 4.860520094562648,
+ "grad_norm": 3.3875632286071777,
+ "learning_rate": 4.385956749319548e-07,
+ "loss": 0.3664,
+ "step": 10280
+ },
+ {
+ "epoch": 4.8609929078014185,
+ "grad_norm": 2.98962664604187,
+ "learning_rate": 4.382427952782195e-07,
+ "loss": 0.314,
+ "step": 10281
+ },
+ {
+ "epoch": 4.861465721040189,
+ "grad_norm": 2.899529457092285,
+ "learning_rate": 4.3789004400106473e-07,
+ "loss": 0.3588,
+ "step": 10282
+ },
+ {
+ "epoch": 4.86193853427896,
+ "grad_norm": 3.11767578125,
+ "learning_rate": 4.3753742112245476e-07,
+ "loss": 0.3311,
+ "step": 10283
+ },
+ {
+ "epoch": 4.862411347517731,
+ "grad_norm": 2.9610254764556885,
+ "learning_rate": 4.3718492666434576e-07,
+ "loss": 0.3234,
+ "step": 10284
+ },
+ {
+ "epoch": 4.862884160756501,
+ "grad_norm": 2.9350297451019287,
+ "learning_rate": 4.368325606486859e-07,
+ "loss": 0.3086,
+ "step": 10285
+ },
+ {
+ "epoch": 4.863356973995272,
+ "grad_norm": 3.0126571655273438,
+ "learning_rate": 4.3648032309741626e-07,
+ "loss": 0.3033,
+ "step": 10286
+ },
+ {
+ "epoch": 4.863829787234042,
+ "grad_norm": 3.0580496788024902,
+ "learning_rate": 4.3612821403246795e-07,
+ "loss": 0.3631,
+ "step": 10287
+ },
+ {
+ "epoch": 4.864302600472813,
+ "grad_norm": 2.9186129570007324,
+ "learning_rate": 4.3577623347576676e-07,
+ "loss": 0.3449,
+ "step": 10288
+ },
+ {
+ "epoch": 4.864775413711584,
+ "grad_norm": 3.146562099456787,
+ "learning_rate": 4.354243814492282e-07,
+ "loss": 0.369,
+ "step": 10289
+ },
+ {
+ "epoch": 4.865248226950355,
+ "grad_norm": 2.646812915802002,
+ "learning_rate": 4.350726579747597e-07,
+ "loss": 0.331,
+ "step": 10290
+ },
+ {
+ "epoch": 4.8657210401891255,
+ "grad_norm": 3.2851274013519287,
+ "learning_rate": 4.3472106307426293e-07,
+ "loss": 0.3445,
+ "step": 10291
+ },
+ {
+ "epoch": 4.866193853427896,
+ "grad_norm": 3.144446849822998,
+ "learning_rate": 4.34369596769629e-07,
+ "loss": 0.3687,
+ "step": 10292
+ },
+ {
+ "epoch": 4.866666666666667,
+ "grad_norm": 3.01517915725708,
+ "learning_rate": 4.3401825908274353e-07,
+ "loss": 0.3282,
+ "step": 10293
+ },
+ {
+ "epoch": 4.867139479905437,
+ "grad_norm": 3.171759605407715,
+ "learning_rate": 4.33667050035482e-07,
+ "loss": 0.351,
+ "step": 10294
+ },
+ {
+ "epoch": 4.867612293144208,
+ "grad_norm": 3.5374269485473633,
+ "learning_rate": 4.333159696497119e-07,
+ "loss": 0.3586,
+ "step": 10295
+ },
+ {
+ "epoch": 4.868085106382979,
+ "grad_norm": 3.4506356716156006,
+ "learning_rate": 4.3296501794729494e-07,
+ "loss": 0.4076,
+ "step": 10296
+ },
+ {
+ "epoch": 4.868557919621749,
+ "grad_norm": 3.348048448562622,
+ "learning_rate": 4.326141949500826e-07,
+ "loss": 0.3256,
+ "step": 10297
+ },
+ {
+ "epoch": 4.86903073286052,
+ "grad_norm": 3.235438108444214,
+ "learning_rate": 4.322635006799192e-07,
+ "loss": 0.3215,
+ "step": 10298
+ },
+ {
+ "epoch": 4.869503546099291,
+ "grad_norm": 3.2025554180145264,
+ "learning_rate": 4.319129351586407e-07,
+ "loss": 0.335,
+ "step": 10299
+ },
+ {
+ "epoch": 4.869976359338062,
+ "grad_norm": 3.0318121910095215,
+ "learning_rate": 4.315624984080749e-07,
+ "loss": 0.3304,
+ "step": 10300
+ },
+ {
+ "epoch": 4.870449172576832,
+ "grad_norm": 2.9115359783172607,
+ "learning_rate": 4.312121904500433e-07,
+ "loss": 0.3459,
+ "step": 10301
+ },
+ {
+ "epoch": 4.8709219858156025,
+ "grad_norm": 3.41164493560791,
+ "learning_rate": 4.3086201130635633e-07,
+ "loss": 0.3846,
+ "step": 10302
+ },
+ {
+ "epoch": 4.871394799054373,
+ "grad_norm": 3.5832016468048096,
+ "learning_rate": 4.305119609988198e-07,
+ "loss": 0.3422,
+ "step": 10303
+ },
+ {
+ "epoch": 4.871867612293144,
+ "grad_norm": 3.5244979858398438,
+ "learning_rate": 4.30162039549229e-07,
+ "loss": 0.3862,
+ "step": 10304
+ },
+ {
+ "epoch": 4.872340425531915,
+ "grad_norm": 3.0881710052490234,
+ "learning_rate": 4.298122469793714e-07,
+ "loss": 0.358,
+ "step": 10305
+ },
+ {
+ "epoch": 4.872813238770686,
+ "grad_norm": 3.3237557411193848,
+ "learning_rate": 4.294625833110283e-07,
+ "loss": 0.3742,
+ "step": 10306
+ },
+ {
+ "epoch": 4.8732860520094565,
+ "grad_norm": 3.1959686279296875,
+ "learning_rate": 4.291130485659711e-07,
+ "loss": 0.3426,
+ "step": 10307
+ },
+ {
+ "epoch": 4.873758865248227,
+ "grad_norm": 3.1890714168548584,
+ "learning_rate": 4.2876364276596333e-07,
+ "loss": 0.3131,
+ "step": 10308
+ },
+ {
+ "epoch": 4.874231678486998,
+ "grad_norm": 2.9387660026550293,
+ "learning_rate": 4.284143659327619e-07,
+ "loss": 0.3227,
+ "step": 10309
+ },
+ {
+ "epoch": 4.874704491725768,
+ "grad_norm": 3.6868603229522705,
+ "learning_rate": 4.2806521808811367e-07,
+ "loss": 0.3159,
+ "step": 10310
+ },
+ {
+ "epoch": 4.875177304964539,
+ "grad_norm": 3.1396310329437256,
+ "learning_rate": 4.277161992537596e-07,
+ "loss": 0.3757,
+ "step": 10311
+ },
+ {
+ "epoch": 4.87565011820331,
+ "grad_norm": 3.4745748043060303,
+ "learning_rate": 4.273673094514313e-07,
+ "loss": 0.347,
+ "step": 10312
+ },
+ {
+ "epoch": 4.87612293144208,
+ "grad_norm": 3.1869146823883057,
+ "learning_rate": 4.270185487028525e-07,
+ "loss": 0.3364,
+ "step": 10313
+ },
+ {
+ "epoch": 4.876595744680851,
+ "grad_norm": 2.8646297454833984,
+ "learning_rate": 4.2666991702973807e-07,
+ "loss": 0.2987,
+ "step": 10314
+ },
+ {
+ "epoch": 4.877068557919622,
+ "grad_norm": 3.3483452796936035,
+ "learning_rate": 4.263214144537975e-07,
+ "loss": 0.307,
+ "step": 10315
+ },
+ {
+ "epoch": 4.877541371158393,
+ "grad_norm": 2.8557562828063965,
+ "learning_rate": 4.259730409967294e-07,
+ "loss": 0.3406,
+ "step": 10316
+ },
+ {
+ "epoch": 4.878014184397163,
+ "grad_norm": 3.351121664047241,
+ "learning_rate": 4.256247966802257e-07,
+ "loss": 0.3571,
+ "step": 10317
+ },
+ {
+ "epoch": 4.8784869976359335,
+ "grad_norm": 3.1691417694091797,
+ "learning_rate": 4.252766815259696e-07,
+ "loss": 0.3686,
+ "step": 10318
+ },
+ {
+ "epoch": 4.878959810874704,
+ "grad_norm": 2.957632303237915,
+ "learning_rate": 4.249286955556378e-07,
+ "loss": 0.3055,
+ "step": 10319
+ },
+ {
+ "epoch": 4.879432624113475,
+ "grad_norm": 3.234708070755005,
+ "learning_rate": 4.2458083879089645e-07,
+ "loss": 0.3733,
+ "step": 10320
+ },
+ {
+ "epoch": 4.879905437352246,
+ "grad_norm": 3.469207525253296,
+ "learning_rate": 4.242331112534065e-07,
+ "loss": 0.3758,
+ "step": 10321
+ },
+ {
+ "epoch": 4.880378250591017,
+ "grad_norm": 3.2442891597747803,
+ "learning_rate": 4.2388551296481896e-07,
+ "loss": 0.3515,
+ "step": 10322
+ },
+ {
+ "epoch": 4.8808510638297875,
+ "grad_norm": 3.3709537982940674,
+ "learning_rate": 4.235380439467762e-07,
+ "loss": 0.421,
+ "step": 10323
+ },
+ {
+ "epoch": 4.881323877068558,
+ "grad_norm": 2.730891227722168,
+ "learning_rate": 4.231907042209149e-07,
+ "loss": 0.3105,
+ "step": 10324
+ },
+ {
+ "epoch": 4.881796690307329,
+ "grad_norm": 3.6933813095092773,
+ "learning_rate": 4.228434938088616e-07,
+ "loss": 0.338,
+ "step": 10325
+ },
+ {
+ "epoch": 4.882269503546099,
+ "grad_norm": 3.2480294704437256,
+ "learning_rate": 4.224964127322362e-07,
+ "loss": 0.3695,
+ "step": 10326
+ },
+ {
+ "epoch": 4.88274231678487,
+ "grad_norm": 3.229762554168701,
+ "learning_rate": 4.2214946101264976e-07,
+ "loss": 0.3768,
+ "step": 10327
+ },
+ {
+ "epoch": 4.883215130023641,
+ "grad_norm": 3.3844475746154785,
+ "learning_rate": 4.218026386717047e-07,
+ "loss": 0.3441,
+ "step": 10328
+ },
+ {
+ "epoch": 4.883687943262411,
+ "grad_norm": 3.159759283065796,
+ "learning_rate": 4.2145594573099745e-07,
+ "loss": 0.3459,
+ "step": 10329
+ },
+ {
+ "epoch": 4.884160756501182,
+ "grad_norm": 3.5672366619110107,
+ "learning_rate": 4.21109382212114e-07,
+ "loss": 0.3908,
+ "step": 10330
+ },
+ {
+ "epoch": 4.884633569739953,
+ "grad_norm": 3.2481353282928467,
+ "learning_rate": 4.2076294813663405e-07,
+ "loss": 0.3778,
+ "step": 10331
+ },
+ {
+ "epoch": 4.885106382978723,
+ "grad_norm": 3.3311941623687744,
+ "learning_rate": 4.2041664352612785e-07,
+ "loss": 0.3171,
+ "step": 10332
+ },
+ {
+ "epoch": 4.885579196217494,
+ "grad_norm": 3.4712841510772705,
+ "learning_rate": 4.2007046840215783e-07,
+ "loss": 0.3858,
+ "step": 10333
+ },
+ {
+ "epoch": 4.8860520094562645,
+ "grad_norm": 3.1591062545776367,
+ "learning_rate": 4.197244227862804e-07,
+ "loss": 0.327,
+ "step": 10334
+ },
+ {
+ "epoch": 4.886524822695035,
+ "grad_norm": 3.400400400161743,
+ "learning_rate": 4.1937850670004136e-07,
+ "loss": 0.3231,
+ "step": 10335
+ },
+ {
+ "epoch": 4.886997635933806,
+ "grad_norm": 2.9156908988952637,
+ "learning_rate": 4.190327201649788e-07,
+ "loss": 0.2834,
+ "step": 10336
+ },
+ {
+ "epoch": 4.887470449172577,
+ "grad_norm": 3.0125153064727783,
+ "learning_rate": 4.1868706320262467e-07,
+ "loss": 0.3143,
+ "step": 10337
+ },
+ {
+ "epoch": 4.887943262411348,
+ "grad_norm": 2.656107187271118,
+ "learning_rate": 4.183415358345003e-07,
+ "loss": 0.3348,
+ "step": 10338
+ },
+ {
+ "epoch": 4.8884160756501185,
+ "grad_norm": 3.0910565853118896,
+ "learning_rate": 4.17996138082121e-07,
+ "loss": 0.3212,
+ "step": 10339
+ },
+ {
+ "epoch": 4.888888888888889,
+ "grad_norm": 3.1303164958953857,
+ "learning_rate": 4.1765086996699315e-07,
+ "loss": 0.3573,
+ "step": 10340
+ },
+ {
+ "epoch": 4.889361702127659,
+ "grad_norm": 3.504901885986328,
+ "learning_rate": 4.173057315106141e-07,
+ "loss": 0.3912,
+ "step": 10341
+ },
+ {
+ "epoch": 4.88983451536643,
+ "grad_norm": 2.994338035583496,
+ "learning_rate": 4.1696072273447547e-07,
+ "loss": 0.3896,
+ "step": 10342
+ },
+ {
+ "epoch": 4.890307328605201,
+ "grad_norm": 3.0409624576568604,
+ "learning_rate": 4.1661584366005814e-07,
+ "loss": 0.3109,
+ "step": 10343
+ },
+ {
+ "epoch": 4.890780141843972,
+ "grad_norm": 3.479952096939087,
+ "learning_rate": 4.1627109430883743e-07,
+ "loss": 0.3265,
+ "step": 10344
+ },
+ {
+ "epoch": 4.891252955082742,
+ "grad_norm": 3.0288894176483154,
+ "learning_rate": 4.159264747022787e-07,
+ "loss": 0.3345,
+ "step": 10345
+ },
+ {
+ "epoch": 4.891725768321513,
+ "grad_norm": 3.7433063983917236,
+ "learning_rate": 4.1558198486184005e-07,
+ "loss": 0.3888,
+ "step": 10346
+ },
+ {
+ "epoch": 4.892198581560284,
+ "grad_norm": 3.431964635848999,
+ "learning_rate": 4.152376248089715e-07,
+ "loss": 0.3062,
+ "step": 10347
+ },
+ {
+ "epoch": 4.892671394799054,
+ "grad_norm": 3.3993113040924072,
+ "learning_rate": 4.1489339456511376e-07,
+ "loss": 0.3955,
+ "step": 10348
+ },
+ {
+ "epoch": 4.893144208037825,
+ "grad_norm": 3.09287428855896,
+ "learning_rate": 4.145492941517024e-07,
+ "loss": 0.2857,
+ "step": 10349
+ },
+ {
+ "epoch": 4.8936170212765955,
+ "grad_norm": 3.355915069580078,
+ "learning_rate": 4.1420532359016166e-07,
+ "loss": 0.3403,
+ "step": 10350
+ },
+ {
+ "epoch": 4.894089834515366,
+ "grad_norm": 4.00920295715332,
+ "learning_rate": 4.1386148290190915e-07,
+ "loss": 0.3455,
+ "step": 10351
+ },
+ {
+ "epoch": 4.894562647754137,
+ "grad_norm": 3.408311605453491,
+ "learning_rate": 4.1351777210835524e-07,
+ "loss": 0.3606,
+ "step": 10352
+ },
+ {
+ "epoch": 4.895035460992908,
+ "grad_norm": 3.031616449356079,
+ "learning_rate": 4.1317419123090007e-07,
+ "loss": 0.3696,
+ "step": 10353
+ },
+ {
+ "epoch": 4.895508274231679,
+ "grad_norm": 3.555751085281372,
+ "learning_rate": 4.1283074029093814e-07,
+ "loss": 0.3197,
+ "step": 10354
+ },
+ {
+ "epoch": 4.8959810874704495,
+ "grad_norm": 3.3839752674102783,
+ "learning_rate": 4.124874193098541e-07,
+ "loss": 0.3744,
+ "step": 10355
+ },
+ {
+ "epoch": 4.89645390070922,
+ "grad_norm": 3.514296531677246,
+ "learning_rate": 4.1214422830902406e-07,
+ "loss": 0.29,
+ "step": 10356
+ },
+ {
+ "epoch": 4.89692671394799,
+ "grad_norm": 3.056325674057007,
+ "learning_rate": 4.1180116730981905e-07,
+ "loss": 0.371,
+ "step": 10357
+ },
+ {
+ "epoch": 4.897399527186761,
+ "grad_norm": 3.7567055225372314,
+ "learning_rate": 4.1145823633359865e-07,
+ "loss": 0.4105,
+ "step": 10358
+ },
+ {
+ "epoch": 4.897872340425532,
+ "grad_norm": 3.0050766468048096,
+ "learning_rate": 4.111154354017152e-07,
+ "loss": 0.3262,
+ "step": 10359
+ },
+ {
+ "epoch": 4.898345153664303,
+ "grad_norm": 3.2767333984375,
+ "learning_rate": 4.1077276453551476e-07,
+ "loss": 0.3253,
+ "step": 10360
+ },
+ {
+ "epoch": 4.898817966903073,
+ "grad_norm": 4.133147239685059,
+ "learning_rate": 4.1043022375633347e-07,
+ "loss": 0.4549,
+ "step": 10361
+ },
+ {
+ "epoch": 4.899290780141844,
+ "grad_norm": 3.372962236404419,
+ "learning_rate": 4.1008781308549934e-07,
+ "loss": 0.315,
+ "step": 10362
+ },
+ {
+ "epoch": 4.899763593380615,
+ "grad_norm": 3.4167628288269043,
+ "learning_rate": 4.0974553254433335e-07,
+ "loss": 0.3832,
+ "step": 10363
+ },
+ {
+ "epoch": 4.900236406619385,
+ "grad_norm": 3.103311061859131,
+ "learning_rate": 4.094033821541468e-07,
+ "loss": 0.3347,
+ "step": 10364
+ },
+ {
+ "epoch": 4.900709219858156,
+ "grad_norm": 2.95872163772583,
+ "learning_rate": 4.0906136193624547e-07,
+ "loss": 0.2861,
+ "step": 10365
+ },
+ {
+ "epoch": 4.9011820330969265,
+ "grad_norm": 3.1035397052764893,
+ "learning_rate": 4.087194719119239e-07,
+ "loss": 0.3089,
+ "step": 10366
+ },
+ {
+ "epoch": 4.901654846335697,
+ "grad_norm": 3.0228095054626465,
+ "learning_rate": 4.083777121024715e-07,
+ "loss": 0.38,
+ "step": 10367
+ },
+ {
+ "epoch": 4.902127659574468,
+ "grad_norm": 3.3528707027435303,
+ "learning_rate": 4.080360825291674e-07,
+ "loss": 0.3614,
+ "step": 10368
+ },
+ {
+ "epoch": 4.902600472813239,
+ "grad_norm": 3.5866968631744385,
+ "learning_rate": 4.076945832132828e-07,
+ "loss": 0.3751,
+ "step": 10369
+ },
+ {
+ "epoch": 4.90307328605201,
+ "grad_norm": 3.388880729675293,
+ "learning_rate": 4.0735321417608276e-07,
+ "loss": 0.3358,
+ "step": 10370
+ },
+ {
+ "epoch": 4.9035460992907804,
+ "grad_norm": 3.5489447116851807,
+ "learning_rate": 4.070119754388213e-07,
+ "loss": 0.3437,
+ "step": 10371
+ },
+ {
+ "epoch": 4.904018912529551,
+ "grad_norm": 2.8234825134277344,
+ "learning_rate": 4.0667086702274733e-07,
+ "loss": 0.3164,
+ "step": 10372
+ },
+ {
+ "epoch": 4.904491725768321,
+ "grad_norm": 3.337445020675659,
+ "learning_rate": 4.0632988894909965e-07,
+ "loss": 0.3213,
+ "step": 10373
+ },
+ {
+ "epoch": 4.904964539007092,
+ "grad_norm": 3.639477491378784,
+ "learning_rate": 4.0598904123910847e-07,
+ "loss": 0.3642,
+ "step": 10374
+ },
+ {
+ "epoch": 4.905437352245863,
+ "grad_norm": 3.101829767227173,
+ "learning_rate": 4.0564832391399857e-07,
+ "loss": 0.3415,
+ "step": 10375
+ },
+ {
+ "epoch": 4.9059101654846335,
+ "grad_norm": 2.8291256427764893,
+ "learning_rate": 4.05307736994984e-07,
+ "loss": 0.3014,
+ "step": 10376
+ },
+ {
+ "epoch": 4.906382978723404,
+ "grad_norm": 2.8689401149749756,
+ "learning_rate": 4.049672805032717e-07,
+ "loss": 0.3151,
+ "step": 10377
+ },
+ {
+ "epoch": 4.906855791962175,
+ "grad_norm": 3.468038320541382,
+ "learning_rate": 4.046269544600598e-07,
+ "loss": 0.3956,
+ "step": 10378
+ },
+ {
+ "epoch": 4.907328605200946,
+ "grad_norm": 3.5246312618255615,
+ "learning_rate": 4.042867588865401e-07,
+ "loss": 0.3003,
+ "step": 10379
+ },
+ {
+ "epoch": 4.907801418439716,
+ "grad_norm": 3.273010730743408,
+ "learning_rate": 4.039466938038944e-07,
+ "loss": 0.3036,
+ "step": 10380
+ },
+ {
+ "epoch": 4.908274231678487,
+ "grad_norm": 3.064718008041382,
+ "learning_rate": 4.0360675923329733e-07,
+ "loss": 0.3353,
+ "step": 10381
+ },
+ {
+ "epoch": 4.908747044917257,
+ "grad_norm": 3.413242816925049,
+ "learning_rate": 4.032669551959142e-07,
+ "loss": 0.354,
+ "step": 10382
+ },
+ {
+ "epoch": 4.909219858156028,
+ "grad_norm": 3.136293411254883,
+ "learning_rate": 4.029272817129046e-07,
+ "loss": 0.331,
+ "step": 10383
+ },
+ {
+ "epoch": 4.909692671394799,
+ "grad_norm": 3.0966274738311768,
+ "learning_rate": 4.025877388054172e-07,
+ "loss": 0.2878,
+ "step": 10384
+ },
+ {
+ "epoch": 4.91016548463357,
+ "grad_norm": 3.334113836288452,
+ "learning_rate": 4.022483264945948e-07,
+ "loss": 0.3276,
+ "step": 10385
+ },
+ {
+ "epoch": 4.910638297872341,
+ "grad_norm": 3.2662229537963867,
+ "learning_rate": 4.019090448015711e-07,
+ "loss": 0.3265,
+ "step": 10386
+ },
+ {
+ "epoch": 4.911111111111111,
+ "grad_norm": 3.134220838546753,
+ "learning_rate": 4.0156989374747047e-07,
+ "loss": 0.3684,
+ "step": 10387
+ },
+ {
+ "epoch": 4.911583924349882,
+ "grad_norm": 3.803694725036621,
+ "learning_rate": 4.012308733534118e-07,
+ "loss": 0.3394,
+ "step": 10388
+ },
+ {
+ "epoch": 4.912056737588652,
+ "grad_norm": 2.788388252258301,
+ "learning_rate": 4.008919836405034e-07,
+ "loss": 0.2835,
+ "step": 10389
+ },
+ {
+ "epoch": 4.912529550827423,
+ "grad_norm": 3.3408966064453125,
+ "learning_rate": 4.005532246298474e-07,
+ "loss": 0.3694,
+ "step": 10390
+ },
+ {
+ "epoch": 4.913002364066194,
+ "grad_norm": 2.913114547729492,
+ "learning_rate": 4.0021459634253605e-07,
+ "loss": 0.3456,
+ "step": 10391
+ },
+ {
+ "epoch": 4.9134751773049645,
+ "grad_norm": 3.778111457824707,
+ "learning_rate": 3.9987609879965414e-07,
+ "loss": 0.3887,
+ "step": 10392
+ },
+ {
+ "epoch": 4.913947990543735,
+ "grad_norm": 2.871978282928467,
+ "learning_rate": 3.995377320222796e-07,
+ "loss": 0.28,
+ "step": 10393
+ },
+ {
+ "epoch": 4.914420803782506,
+ "grad_norm": 3.5189783573150635,
+ "learning_rate": 3.9919949603147987e-07,
+ "loss": 0.3802,
+ "step": 10394
+ },
+ {
+ "epoch": 4.914893617021277,
+ "grad_norm": 3.381014585494995,
+ "learning_rate": 3.9886139084831607e-07,
+ "loss": 0.3661,
+ "step": 10395
+ },
+ {
+ "epoch": 4.915366430260047,
+ "grad_norm": 2.908207654953003,
+ "learning_rate": 3.9852341649384006e-07,
+ "loss": 0.3228,
+ "step": 10396
+ },
+ {
+ "epoch": 4.915839243498818,
+ "grad_norm": 3.4134814739227295,
+ "learning_rate": 3.981855729890957e-07,
+ "loss": 0.3149,
+ "step": 10397
+ },
+ {
+ "epoch": 4.916312056737588,
+ "grad_norm": 4.496891975402832,
+ "learning_rate": 3.9784786035512004e-07,
+ "loss": 0.3516,
+ "step": 10398
+ },
+ {
+ "epoch": 4.916784869976359,
+ "grad_norm": 3.2910919189453125,
+ "learning_rate": 3.975102786129398e-07,
+ "loss": 0.3329,
+ "step": 10399
+ },
+ {
+ "epoch": 4.91725768321513,
+ "grad_norm": 3.6607260704040527,
+ "learning_rate": 3.97172827783576e-07,
+ "loss": 0.3878,
+ "step": 10400
+ },
+ {
+ "epoch": 4.917730496453901,
+ "grad_norm": 3.1500742435455322,
+ "learning_rate": 3.9683550788803983e-07,
+ "loss": 0.3323,
+ "step": 10401
+ },
+ {
+ "epoch": 4.918203309692672,
+ "grad_norm": 3.263714075088501,
+ "learning_rate": 3.964983189473337e-07,
+ "loss": 0.352,
+ "step": 10402
+ },
+ {
+ "epoch": 4.918676122931442,
+ "grad_norm": 3.433868408203125,
+ "learning_rate": 3.961612609824542e-07,
+ "loss": 0.3308,
+ "step": 10403
+ },
+ {
+ "epoch": 4.919148936170213,
+ "grad_norm": 3.3086423873901367,
+ "learning_rate": 3.95824334014388e-07,
+ "loss": 0.3641,
+ "step": 10404
+ },
+ {
+ "epoch": 4.919621749408983,
+ "grad_norm": 3.2854621410369873,
+ "learning_rate": 3.954875380641135e-07,
+ "loss": 0.3405,
+ "step": 10405
+ },
+ {
+ "epoch": 4.920094562647754,
+ "grad_norm": 3.1408650875091553,
+ "learning_rate": 3.9515087315260244e-07,
+ "loss": 0.3103,
+ "step": 10406
+ },
+ {
+ "epoch": 4.920567375886525,
+ "grad_norm": 2.9340312480926514,
+ "learning_rate": 3.948143393008164e-07,
+ "loss": 0.3405,
+ "step": 10407
+ },
+ {
+ "epoch": 4.9210401891252955,
+ "grad_norm": 3.525876522064209,
+ "learning_rate": 3.944779365297113e-07,
+ "loss": 0.3464,
+ "step": 10408
+ },
+ {
+ "epoch": 4.921513002364066,
+ "grad_norm": 3.26991605758667,
+ "learning_rate": 3.9414166486023253e-07,
+ "loss": 0.3529,
+ "step": 10409
+ },
+ {
+ "epoch": 4.921985815602837,
+ "grad_norm": 2.7669694423675537,
+ "learning_rate": 3.938055243133182e-07,
+ "loss": 0.3242,
+ "step": 10410
+ },
+ {
+ "epoch": 4.922458628841608,
+ "grad_norm": 2.8268136978149414,
+ "learning_rate": 3.934695149098988e-07,
+ "loss": 0.3086,
+ "step": 10411
+ },
+ {
+ "epoch": 4.922931442080378,
+ "grad_norm": 3.119053602218628,
+ "learning_rate": 3.931336366708952e-07,
+ "loss": 0.3065,
+ "step": 10412
+ },
+ {
+ "epoch": 4.923404255319149,
+ "grad_norm": 3.1537275314331055,
+ "learning_rate": 3.9279788961722215e-07,
+ "loss": 0.3325,
+ "step": 10413
+ },
+ {
+ "epoch": 4.923877068557919,
+ "grad_norm": 3.1365256309509277,
+ "learning_rate": 3.9246227376978476e-07,
+ "loss": 0.4139,
+ "step": 10414
+ },
+ {
+ "epoch": 4.92434988179669,
+ "grad_norm": 3.3495218753814697,
+ "learning_rate": 3.921267891494798e-07,
+ "loss": 0.3463,
+ "step": 10415
+ },
+ {
+ "epoch": 4.924822695035461,
+ "grad_norm": 3.2402634620666504,
+ "learning_rate": 3.9179143577719736e-07,
+ "loss": 0.3499,
+ "step": 10416
+ },
+ {
+ "epoch": 4.925295508274232,
+ "grad_norm": 2.986429452896118,
+ "learning_rate": 3.914562136738176e-07,
+ "loss": 0.3326,
+ "step": 10417
+ },
+ {
+ "epoch": 4.925768321513003,
+ "grad_norm": 3.1276674270629883,
+ "learning_rate": 3.9112112286021407e-07,
+ "loss": 0.3087,
+ "step": 10418
+ },
+ {
+ "epoch": 4.926241134751773,
+ "grad_norm": 3.1767871379852295,
+ "learning_rate": 3.9078616335725126e-07,
+ "loss": 0.3804,
+ "step": 10419
+ },
+ {
+ "epoch": 4.926713947990544,
+ "grad_norm": 3.1657216548919678,
+ "learning_rate": 3.904513351857847e-07,
+ "loss": 0.333,
+ "step": 10420
+ },
+ {
+ "epoch": 4.927186761229314,
+ "grad_norm": 3.005009174346924,
+ "learning_rate": 3.901166383666641e-07,
+ "loss": 0.3583,
+ "step": 10421
+ },
+ {
+ "epoch": 4.927659574468085,
+ "grad_norm": 2.900146722793579,
+ "learning_rate": 3.897820729207283e-07,
+ "loss": 0.3061,
+ "step": 10422
+ },
+ {
+ "epoch": 4.928132387706856,
+ "grad_norm": 3.2418317794799805,
+ "learning_rate": 3.8944763886881037e-07,
+ "loss": 0.3822,
+ "step": 10423
+ },
+ {
+ "epoch": 4.9286052009456265,
+ "grad_norm": 3.1222848892211914,
+ "learning_rate": 3.8911333623173344e-07,
+ "loss": 0.3167,
+ "step": 10424
+ },
+ {
+ "epoch": 4.929078014184397,
+ "grad_norm": 2.727388858795166,
+ "learning_rate": 3.8877916503031325e-07,
+ "loss": 0.2977,
+ "step": 10425
+ },
+ {
+ "epoch": 4.929550827423168,
+ "grad_norm": 3.190159797668457,
+ "learning_rate": 3.884451252853569e-07,
+ "loss": 0.3399,
+ "step": 10426
+ },
+ {
+ "epoch": 4.930023640661939,
+ "grad_norm": 3.253791570663452,
+ "learning_rate": 3.8811121701766373e-07,
+ "loss": 0.3806,
+ "step": 10427
+ },
+ {
+ "epoch": 4.930496453900709,
+ "grad_norm": 3.4284887313842773,
+ "learning_rate": 3.8777744024802414e-07,
+ "loss": 0.3437,
+ "step": 10428
+ },
+ {
+ "epoch": 4.93096926713948,
+ "grad_norm": 3.3665032386779785,
+ "learning_rate": 3.874437949972221e-07,
+ "loss": 0.375,
+ "step": 10429
+ },
+ {
+ "epoch": 4.93144208037825,
+ "grad_norm": 3.1024677753448486,
+ "learning_rate": 3.8711028128603084e-07,
+ "loss": 0.3493,
+ "step": 10430
+ },
+ {
+ "epoch": 4.931914893617021,
+ "grad_norm": 3.599743604660034,
+ "learning_rate": 3.867768991352186e-07,
+ "loss": 0.3852,
+ "step": 10431
+ },
+ {
+ "epoch": 4.932387706855792,
+ "grad_norm": 3.3676376342773438,
+ "learning_rate": 3.8644364856554236e-07,
+ "loss": 0.3489,
+ "step": 10432
+ },
+ {
+ "epoch": 4.932860520094563,
+ "grad_norm": 3.2186801433563232,
+ "learning_rate": 3.861105295977521e-07,
+ "loss": 0.3413,
+ "step": 10433
+ },
+ {
+ "epoch": 4.933333333333334,
+ "grad_norm": 3.3672704696655273,
+ "learning_rate": 3.8577754225259055e-07,
+ "loss": 0.3392,
+ "step": 10434
+ },
+ {
+ "epoch": 4.933806146572104,
+ "grad_norm": 3.4285950660705566,
+ "learning_rate": 3.854446865507902e-07,
+ "loss": 0.3188,
+ "step": 10435
+ },
+ {
+ "epoch": 4.934278959810875,
+ "grad_norm": 3.187617063522339,
+ "learning_rate": 3.8511196251307783e-07,
+ "loss": 0.3258,
+ "step": 10436
+ },
+ {
+ "epoch": 4.934751773049645,
+ "grad_norm": 2.9744882583618164,
+ "learning_rate": 3.847793701601699e-07,
+ "loss": 0.3879,
+ "step": 10437
+ },
+ {
+ "epoch": 4.935224586288416,
+ "grad_norm": 2.9745848178863525,
+ "learning_rate": 3.844469095127751e-07,
+ "loss": 0.3178,
+ "step": 10438
+ },
+ {
+ "epoch": 4.935697399527187,
+ "grad_norm": 3.7419471740722656,
+ "learning_rate": 3.841145805915955e-07,
+ "loss": 0.3167,
+ "step": 10439
+ },
+ {
+ "epoch": 4.9361702127659575,
+ "grad_norm": 4.295339107513428,
+ "learning_rate": 3.837823834173232e-07,
+ "loss": 0.3209,
+ "step": 10440
+ },
+ {
+ "epoch": 4.936643026004728,
+ "grad_norm": 3.4612984657287598,
+ "learning_rate": 3.8345031801064217e-07,
+ "loss": 0.3351,
+ "step": 10441
+ },
+ {
+ "epoch": 4.937115839243499,
+ "grad_norm": 3.0626909732818604,
+ "learning_rate": 3.8311838439222953e-07,
+ "loss": 0.3395,
+ "step": 10442
+ },
+ {
+ "epoch": 4.93758865248227,
+ "grad_norm": 3.606682538986206,
+ "learning_rate": 3.827865825827518e-07,
+ "loss": 0.4112,
+ "step": 10443
+ },
+ {
+ "epoch": 4.93806146572104,
+ "grad_norm": 3.3908627033233643,
+ "learning_rate": 3.8245491260287064e-07,
+ "loss": 0.3367,
+ "step": 10444
+ },
+ {
+ "epoch": 4.938534278959811,
+ "grad_norm": 2.8598084449768066,
+ "learning_rate": 3.821233744732364e-07,
+ "loss": 0.3408,
+ "step": 10445
+ },
+ {
+ "epoch": 4.939007092198581,
+ "grad_norm": 3.207010269165039,
+ "learning_rate": 3.8179196821449354e-07,
+ "loss": 0.3301,
+ "step": 10446
+ },
+ {
+ "epoch": 4.939479905437352,
+ "grad_norm": 3.018414258956909,
+ "learning_rate": 3.8146069384727674e-07,
+ "loss": 0.3622,
+ "step": 10447
+ },
+ {
+ "epoch": 4.939952718676123,
+ "grad_norm": 3.399415969848633,
+ "learning_rate": 3.811295513922125e-07,
+ "loss": 0.3525,
+ "step": 10448
+ },
+ {
+ "epoch": 4.940425531914894,
+ "grad_norm": 3.175705671310425,
+ "learning_rate": 3.807985408699208e-07,
+ "loss": 0.322,
+ "step": 10449
+ },
+ {
+ "epoch": 4.9408983451536646,
+ "grad_norm": 2.906064033508301,
+ "learning_rate": 3.804676623010109e-07,
+ "loss": 0.3246,
+ "step": 10450
+ },
+ {
+ "epoch": 4.941371158392435,
+ "grad_norm": 3.1224400997161865,
+ "learning_rate": 3.8013691570608634e-07,
+ "loss": 0.3607,
+ "step": 10451
+ },
+ {
+ "epoch": 4.941843971631206,
+ "grad_norm": 3.4386677742004395,
+ "learning_rate": 3.7980630110574067e-07,
+ "loss": 0.3315,
+ "step": 10452
+ },
+ {
+ "epoch": 4.942316784869976,
+ "grad_norm": 3.432509183883667,
+ "learning_rate": 3.794758185205594e-07,
+ "loss": 0.3713,
+ "step": 10453
+ },
+ {
+ "epoch": 4.942789598108747,
+ "grad_norm": 3.314802646636963,
+ "learning_rate": 3.7914546797112097e-07,
+ "loss": 0.3587,
+ "step": 10454
+ },
+ {
+ "epoch": 4.943262411347518,
+ "grad_norm": 2.9151065349578857,
+ "learning_rate": 3.788152494779948e-07,
+ "loss": 0.367,
+ "step": 10455
+ },
+ {
+ "epoch": 4.9437352245862884,
+ "grad_norm": 3.3444712162017822,
+ "learning_rate": 3.784851630617414e-07,
+ "loss": 0.4009,
+ "step": 10456
+ },
+ {
+ "epoch": 4.944208037825059,
+ "grad_norm": 3.2677152156829834,
+ "learning_rate": 3.7815520874291494e-07,
+ "loss": 0.3553,
+ "step": 10457
+ },
+ {
+ "epoch": 4.94468085106383,
+ "grad_norm": 3.2326159477233887,
+ "learning_rate": 3.7782538654205946e-07,
+ "loss": 0.4008,
+ "step": 10458
+ },
+ {
+ "epoch": 4.945153664302601,
+ "grad_norm": 3.3304033279418945,
+ "learning_rate": 3.774956964797119e-07,
+ "loss": 0.3591,
+ "step": 10459
+ },
+ {
+ "epoch": 4.945626477541371,
+ "grad_norm": 3.038605213165283,
+ "learning_rate": 3.7716613857640026e-07,
+ "loss": 0.2907,
+ "step": 10460
+ },
+ {
+ "epoch": 4.9460992907801415,
+ "grad_norm": 3.016227960586548,
+ "learning_rate": 3.768367128526443e-07,
+ "loss": 0.2898,
+ "step": 10461
+ },
+ {
+ "epoch": 4.946572104018912,
+ "grad_norm": 3.354973316192627,
+ "learning_rate": 3.76507419328957e-07,
+ "loss": 0.3498,
+ "step": 10462
+ },
+ {
+ "epoch": 4.947044917257683,
+ "grad_norm": 3.5561892986297607,
+ "learning_rate": 3.761782580258408e-07,
+ "loss": 0.4384,
+ "step": 10463
+ },
+ {
+ "epoch": 4.947517730496454,
+ "grad_norm": 3.2498281002044678,
+ "learning_rate": 3.7584922896379244e-07,
+ "loss": 0.3289,
+ "step": 10464
+ },
+ {
+ "epoch": 4.947990543735225,
+ "grad_norm": 3.250598907470703,
+ "learning_rate": 3.755203321632986e-07,
+ "loss": 0.4104,
+ "step": 10465
+ },
+ {
+ "epoch": 4.9484633569739955,
+ "grad_norm": 2.8788363933563232,
+ "learning_rate": 3.7519156764483727e-07,
+ "loss": 0.2896,
+ "step": 10466
+ },
+ {
+ "epoch": 4.948936170212766,
+ "grad_norm": 3.068180561065674,
+ "learning_rate": 3.7486293542888075e-07,
+ "loss": 0.3346,
+ "step": 10467
+ },
+ {
+ "epoch": 4.949408983451537,
+ "grad_norm": 3.4533181190490723,
+ "learning_rate": 3.7453443553589043e-07,
+ "loss": 0.3917,
+ "step": 10468
+ },
+ {
+ "epoch": 4.949881796690307,
+ "grad_norm": 2.8812358379364014,
+ "learning_rate": 3.7420606798632104e-07,
+ "loss": 0.3276,
+ "step": 10469
+ },
+ {
+ "epoch": 4.950354609929078,
+ "grad_norm": 3.0952184200286865,
+ "learning_rate": 3.7387783280061875e-07,
+ "loss": 0.3261,
+ "step": 10470
+ },
+ {
+ "epoch": 4.950827423167849,
+ "grad_norm": 3.2409560680389404,
+ "learning_rate": 3.735497299992205e-07,
+ "loss": 0.3504,
+ "step": 10471
+ },
+ {
+ "epoch": 4.951300236406619,
+ "grad_norm": 3.3790557384490967,
+ "learning_rate": 3.73221759602557e-07,
+ "loss": 0.3316,
+ "step": 10472
+ },
+ {
+ "epoch": 4.95177304964539,
+ "grad_norm": 3.2161364555358887,
+ "learning_rate": 3.728939216310487e-07,
+ "loss": 0.3364,
+ "step": 10473
+ },
+ {
+ "epoch": 4.952245862884161,
+ "grad_norm": 3.3514342308044434,
+ "learning_rate": 3.7256621610510884e-07,
+ "loss": 0.3912,
+ "step": 10474
+ },
+ {
+ "epoch": 4.952718676122932,
+ "grad_norm": 2.7333486080169678,
+ "learning_rate": 3.722386430451422e-07,
+ "loss": 0.3145,
+ "step": 10475
+ },
+ {
+ "epoch": 4.953191489361702,
+ "grad_norm": 3.104905128479004,
+ "learning_rate": 3.719112024715449e-07,
+ "loss": 0.3599,
+ "step": 10476
+ },
+ {
+ "epoch": 4.9536643026004725,
+ "grad_norm": 3.16666579246521,
+ "learning_rate": 3.715838944047059e-07,
+ "loss": 0.3462,
+ "step": 10477
+ },
+ {
+ "epoch": 4.954137115839243,
+ "grad_norm": 3.078171491622925,
+ "learning_rate": 3.7125671886500514e-07,
+ "loss": 0.3119,
+ "step": 10478
+ },
+ {
+ "epoch": 4.954609929078014,
+ "grad_norm": 3.261456251144409,
+ "learning_rate": 3.709296758728137e-07,
+ "loss": 0.3959,
+ "step": 10479
+ },
+ {
+ "epoch": 4.955082742316785,
+ "grad_norm": 3.0302278995513916,
+ "learning_rate": 3.706027654484962e-07,
+ "loss": 0.3526,
+ "step": 10480
+ },
+ {
+ "epoch": 4.955555555555556,
+ "grad_norm": 3.175342559814453,
+ "learning_rate": 3.702759876124068e-07,
+ "loss": 0.3237,
+ "step": 10481
+ },
+ {
+ "epoch": 4.9560283687943265,
+ "grad_norm": 3.4779844284057617,
+ "learning_rate": 3.699493423848938e-07,
+ "loss": 0.3075,
+ "step": 10482
+ },
+ {
+ "epoch": 4.956501182033097,
+ "grad_norm": 2.809904098510742,
+ "learning_rate": 3.69622829786295e-07,
+ "loss": 0.3238,
+ "step": 10483
+ },
+ {
+ "epoch": 4.956973995271868,
+ "grad_norm": 3.092604875564575,
+ "learning_rate": 3.692964498369406e-07,
+ "loss": 0.3344,
+ "step": 10484
+ },
+ {
+ "epoch": 4.957446808510638,
+ "grad_norm": 3.477560520172119,
+ "learning_rate": 3.689702025571543e-07,
+ "loss": 0.3525,
+ "step": 10485
+ },
+ {
+ "epoch": 4.957919621749409,
+ "grad_norm": 4.119097709655762,
+ "learning_rate": 3.6864408796724815e-07,
+ "loss": 0.3953,
+ "step": 10486
+ },
+ {
+ "epoch": 4.95839243498818,
+ "grad_norm": 3.1418824195861816,
+ "learning_rate": 3.6831810608752986e-07,
+ "loss": 0.3689,
+ "step": 10487
+ },
+ {
+ "epoch": 4.95886524822695,
+ "grad_norm": 3.1947824954986572,
+ "learning_rate": 3.6799225693829596e-07,
+ "loss": 0.3427,
+ "step": 10488
+ },
+ {
+ "epoch": 4.959338061465721,
+ "grad_norm": 3.196894884109497,
+ "learning_rate": 3.6766654053983554e-07,
+ "loss": 0.3138,
+ "step": 10489
+ },
+ {
+ "epoch": 4.959810874704492,
+ "grad_norm": 2.9747161865234375,
+ "learning_rate": 3.6734095691242975e-07,
+ "loss": 0.3336,
+ "step": 10490
+ },
+ {
+ "epoch": 4.960283687943263,
+ "grad_norm": 3.2788970470428467,
+ "learning_rate": 3.670155060763503e-07,
+ "loss": 0.3418,
+ "step": 10491
+ },
+ {
+ "epoch": 4.960756501182033,
+ "grad_norm": 3.1619482040405273,
+ "learning_rate": 3.6669018805186335e-07,
+ "loss": 0.3173,
+ "step": 10492
+ },
+ {
+ "epoch": 4.9612293144208035,
+ "grad_norm": 2.9894869327545166,
+ "learning_rate": 3.6636500285922386e-07,
+ "loss": 0.3057,
+ "step": 10493
+ },
+ {
+ "epoch": 4.961702127659574,
+ "grad_norm": 3.1162378787994385,
+ "learning_rate": 3.660399505186793e-07,
+ "loss": 0.3404,
+ "step": 10494
+ },
+ {
+ "epoch": 4.962174940898345,
+ "grad_norm": 2.811485528945923,
+ "learning_rate": 3.657150310504706e-07,
+ "loss": 0.3199,
+ "step": 10495
+ },
+ {
+ "epoch": 4.962647754137116,
+ "grad_norm": 2.8914854526519775,
+ "learning_rate": 3.653902444748278e-07,
+ "loss": 0.3666,
+ "step": 10496
+ },
+ {
+ "epoch": 4.963120567375887,
+ "grad_norm": 4.075942516326904,
+ "learning_rate": 3.6506559081197517e-07,
+ "loss": 0.3737,
+ "step": 10497
+ },
+ {
+ "epoch": 4.9635933806146575,
+ "grad_norm": 4.395053863525391,
+ "learning_rate": 3.647410700821266e-07,
+ "loss": 0.338,
+ "step": 10498
+ },
+ {
+ "epoch": 4.964066193853428,
+ "grad_norm": 2.89145565032959,
+ "learning_rate": 3.644166823054884e-07,
+ "loss": 0.2893,
+ "step": 10499
+ },
+ {
+ "epoch": 4.964539007092198,
+ "grad_norm": 2.8189663887023926,
+ "learning_rate": 3.640924275022595e-07,
+ "loss": 0.289,
+ "step": 10500
+ },
+ {
+ "epoch": 4.965011820330969,
+ "grad_norm": 3.0912365913391113,
+ "learning_rate": 3.6376830569262946e-07,
+ "loss": 0.3474,
+ "step": 10501
+ },
+ {
+ "epoch": 4.96548463356974,
+ "grad_norm": 3.3087918758392334,
+ "learning_rate": 3.634443168967797e-07,
+ "loss": 0.3104,
+ "step": 10502
+ },
+ {
+ "epoch": 4.965957446808511,
+ "grad_norm": 2.855022430419922,
+ "learning_rate": 3.6312046113488403e-07,
+ "loss": 0.3328,
+ "step": 10503
+ },
+ {
+ "epoch": 4.966430260047281,
+ "grad_norm": 3.5445404052734375,
+ "learning_rate": 3.627967384271072e-07,
+ "loss": 0.322,
+ "step": 10504
+ },
+ {
+ "epoch": 4.966903073286052,
+ "grad_norm": 3.526319742202759,
+ "learning_rate": 3.624731487936065e-07,
+ "loss": 0.3264,
+ "step": 10505
+ },
+ {
+ "epoch": 4.967375886524823,
+ "grad_norm": 3.521204948425293,
+ "learning_rate": 3.621496922545298e-07,
+ "loss": 0.369,
+ "step": 10506
+ },
+ {
+ "epoch": 4.967848699763593,
+ "grad_norm": 2.8956806659698486,
+ "learning_rate": 3.618263688300172e-07,
+ "loss": 0.3396,
+ "step": 10507
+ },
+ {
+ "epoch": 4.968321513002364,
+ "grad_norm": 3.155200958251953,
+ "learning_rate": 3.615031785402015e-07,
+ "loss": 0.354,
+ "step": 10508
+ },
+ {
+ "epoch": 4.9687943262411345,
+ "grad_norm": 3.2896533012390137,
+ "learning_rate": 3.611801214052052e-07,
+ "loss": 0.3034,
+ "step": 10509
+ },
+ {
+ "epoch": 4.969267139479905,
+ "grad_norm": 3.0860259532928467,
+ "learning_rate": 3.608571974451447e-07,
+ "loss": 0.3354,
+ "step": 10510
+ },
+ {
+ "epoch": 4.969739952718676,
+ "grad_norm": 3.3194656372070312,
+ "learning_rate": 3.6053440668012697e-07,
+ "loss": 0.3714,
+ "step": 10511
+ },
+ {
+ "epoch": 4.970212765957447,
+ "grad_norm": 2.9831063747406006,
+ "learning_rate": 3.602117491302498e-07,
+ "loss": 0.311,
+ "step": 10512
+ },
+ {
+ "epoch": 4.970685579196218,
+ "grad_norm": 3.175940752029419,
+ "learning_rate": 3.59889224815605e-07,
+ "loss": 0.3658,
+ "step": 10513
+ },
+ {
+ "epoch": 4.9711583924349885,
+ "grad_norm": 3.051496982574463,
+ "learning_rate": 3.5956683375627324e-07,
+ "loss": 0.3458,
+ "step": 10514
+ },
+ {
+ "epoch": 4.971631205673759,
+ "grad_norm": 3.0264453887939453,
+ "learning_rate": 3.592445759723298e-07,
+ "loss": 0.2843,
+ "step": 10515
+ },
+ {
+ "epoch": 4.972104018912529,
+ "grad_norm": 3.404376745223999,
+ "learning_rate": 3.589224514838399e-07,
+ "loss": 0.366,
+ "step": 10516
+ },
+ {
+ "epoch": 4.9725768321513,
+ "grad_norm": 3.640212297439575,
+ "learning_rate": 3.586004603108598e-07,
+ "loss": 0.3248,
+ "step": 10517
+ },
+ {
+ "epoch": 4.973049645390071,
+ "grad_norm": 3.0829873085021973,
+ "learning_rate": 3.5827860247344e-07,
+ "loss": 0.3613,
+ "step": 10518
+ },
+ {
+ "epoch": 4.973522458628842,
+ "grad_norm": 3.6157045364379883,
+ "learning_rate": 3.5795687799162064e-07,
+ "loss": 0.3599,
+ "step": 10519
+ },
+ {
+ "epoch": 4.973995271867612,
+ "grad_norm": 3.150632619857788,
+ "learning_rate": 3.576352868854335e-07,
+ "loss": 0.3242,
+ "step": 10520
+ },
+ {
+ "epoch": 4.974468085106383,
+ "grad_norm": 3.04829740524292,
+ "learning_rate": 3.5731382917490286e-07,
+ "loss": 0.3819,
+ "step": 10521
+ },
+ {
+ "epoch": 4.974940898345154,
+ "grad_norm": 3.216092348098755,
+ "learning_rate": 3.5699250488004516e-07,
+ "loss": 0.3538,
+ "step": 10522
+ },
+ {
+ "epoch": 4.975413711583924,
+ "grad_norm": 3.36538028717041,
+ "learning_rate": 3.5667131402086717e-07,
+ "loss": 0.3381,
+ "step": 10523
+ },
+ {
+ "epoch": 4.975886524822695,
+ "grad_norm": 3.3398420810699463,
+ "learning_rate": 3.563502566173685e-07,
+ "loss": 0.3085,
+ "step": 10524
+ },
+ {
+ "epoch": 4.9763593380614655,
+ "grad_norm": 3.10583233833313,
+ "learning_rate": 3.5602933268953893e-07,
+ "loss": 0.3023,
+ "step": 10525
+ },
+ {
+ "epoch": 4.976832151300236,
+ "grad_norm": 3.422929525375366,
+ "learning_rate": 3.557085422573625e-07,
+ "loss": 0.3319,
+ "step": 10526
+ },
+ {
+ "epoch": 4.977304964539007,
+ "grad_norm": 3.7357773780822754,
+ "learning_rate": 3.5538788534081214e-07,
+ "loss": 0.3762,
+ "step": 10527
+ },
+ {
+ "epoch": 4.977777777777778,
+ "grad_norm": 3.0172133445739746,
+ "learning_rate": 3.550673619598549e-07,
+ "loss": 0.3292,
+ "step": 10528
+ },
+ {
+ "epoch": 4.978250591016549,
+ "grad_norm": 3.2497189044952393,
+ "learning_rate": 3.5474697213444763e-07,
+ "loss": 0.3292,
+ "step": 10529
+ },
+ {
+ "epoch": 4.9787234042553195,
+ "grad_norm": 2.8510115146636963,
+ "learning_rate": 3.544267158845394e-07,
+ "loss": 0.3717,
+ "step": 10530
+ },
+ {
+ "epoch": 4.97919621749409,
+ "grad_norm": 3.2559750080108643,
+ "learning_rate": 3.541065932300719e-07,
+ "loss": 0.3656,
+ "step": 10531
+ },
+ {
+ "epoch": 4.97966903073286,
+ "grad_norm": 3.3215935230255127,
+ "learning_rate": 3.537866041909768e-07,
+ "loss": 0.364,
+ "step": 10532
+ },
+ {
+ "epoch": 4.980141843971631,
+ "grad_norm": 3.4923696517944336,
+ "learning_rate": 3.5346674878717954e-07,
+ "loss": 0.3464,
+ "step": 10533
+ },
+ {
+ "epoch": 4.980614657210402,
+ "grad_norm": 3.5320425033569336,
+ "learning_rate": 3.531470270385959e-07,
+ "loss": 0.3506,
+ "step": 10534
+ },
+ {
+ "epoch": 4.9810874704491725,
+ "grad_norm": 3.290199041366577,
+ "learning_rate": 3.528274389651323e-07,
+ "loss": 0.4092,
+ "step": 10535
+ },
+ {
+ "epoch": 4.981560283687943,
+ "grad_norm": 3.108628034591675,
+ "learning_rate": 3.5250798458668966e-07,
+ "loss": 0.3522,
+ "step": 10536
+ },
+ {
+ "epoch": 4.982033096926714,
+ "grad_norm": 3.3015148639678955,
+ "learning_rate": 3.521886639231584e-07,
+ "loss": 0.3609,
+ "step": 10537
+ },
+ {
+ "epoch": 4.982505910165485,
+ "grad_norm": 3.506431818008423,
+ "learning_rate": 3.518694769944211e-07,
+ "loss": 0.3458,
+ "step": 10538
+ },
+ {
+ "epoch": 4.982978723404255,
+ "grad_norm": 3.560453414916992,
+ "learning_rate": 3.5155042382035236e-07,
+ "loss": 0.3803,
+ "step": 10539
+ },
+ {
+ "epoch": 4.983451536643026,
+ "grad_norm": 3.1382486820220947,
+ "learning_rate": 3.5123150442081757e-07,
+ "loss": 0.3209,
+ "step": 10540
+ },
+ {
+ "epoch": 4.9839243498817964,
+ "grad_norm": 4.326927661895752,
+ "learning_rate": 3.5091271881567523e-07,
+ "loss": 0.3649,
+ "step": 10541
+ },
+ {
+ "epoch": 4.984397163120567,
+ "grad_norm": 3.0951757431030273,
+ "learning_rate": 3.50594067024774e-07,
+ "loss": 0.3808,
+ "step": 10542
+ },
+ {
+ "epoch": 4.984869976359338,
+ "grad_norm": 3.264277458190918,
+ "learning_rate": 3.5027554906795574e-07,
+ "loss": 0.3408,
+ "step": 10543
+ },
+ {
+ "epoch": 4.985342789598109,
+ "grad_norm": 3.3679237365722656,
+ "learning_rate": 3.4995716496505293e-07,
+ "loss": 0.3746,
+ "step": 10544
+ },
+ {
+ "epoch": 4.98581560283688,
+ "grad_norm": 3.489201545715332,
+ "learning_rate": 3.496389147358892e-07,
+ "loss": 0.3725,
+ "step": 10545
+ },
+ {
+ "epoch": 4.98628841607565,
+ "grad_norm": 2.8233766555786133,
+ "learning_rate": 3.4932079840028193e-07,
+ "loss": 0.3178,
+ "step": 10546
+ },
+ {
+ "epoch": 4.986761229314421,
+ "grad_norm": 3.1723084449768066,
+ "learning_rate": 3.490028159780373e-07,
+ "loss": 0.348,
+ "step": 10547
+ },
+ {
+ "epoch": 4.987234042553191,
+ "grad_norm": 3.2631607055664062,
+ "learning_rate": 3.4868496748895616e-07,
+ "loss": 0.3608,
+ "step": 10548
+ },
+ {
+ "epoch": 4.987706855791962,
+ "grad_norm": 3.4170608520507812,
+ "learning_rate": 3.483672529528287e-07,
+ "loss": 0.3819,
+ "step": 10549
+ },
+ {
+ "epoch": 4.988179669030733,
+ "grad_norm": 3.002686023712158,
+ "learning_rate": 3.480496723894375e-07,
+ "loss": 0.2695,
+ "step": 10550
+ },
+ {
+ "epoch": 4.9886524822695035,
+ "grad_norm": 3.051232099533081,
+ "learning_rate": 3.4773222581855753e-07,
+ "loss": 0.3638,
+ "step": 10551
+ },
+ {
+ "epoch": 4.989125295508274,
+ "grad_norm": 2.959977149963379,
+ "learning_rate": 3.474149132599544e-07,
+ "loss": 0.3338,
+ "step": 10552
+ },
+ {
+ "epoch": 4.989598108747045,
+ "grad_norm": 2.925457000732422,
+ "learning_rate": 3.470977347333859e-07,
+ "loss": 0.3212,
+ "step": 10553
+ },
+ {
+ "epoch": 4.990070921985816,
+ "grad_norm": 3.0996408462524414,
+ "learning_rate": 3.4678069025860154e-07,
+ "loss": 0.3447,
+ "step": 10554
+ },
+ {
+ "epoch": 4.990543735224586,
+ "grad_norm": 2.8487865924835205,
+ "learning_rate": 3.4646377985534106e-07,
+ "loss": 0.3434,
+ "step": 10555
+ },
+ {
+ "epoch": 4.991016548463357,
+ "grad_norm": 2.8337016105651855,
+ "learning_rate": 3.461470035433387e-07,
+ "loss": 0.342,
+ "step": 10556
+ },
+ {
+ "epoch": 4.991489361702127,
+ "grad_norm": 2.9243876934051514,
+ "learning_rate": 3.4583036134231805e-07,
+ "loss": 0.3256,
+ "step": 10557
+ },
+ {
+ "epoch": 4.991962174940898,
+ "grad_norm": 3.2548747062683105,
+ "learning_rate": 3.455138532719948e-07,
+ "loss": 0.3313,
+ "step": 10558
+ },
+ {
+ "epoch": 4.992434988179669,
+ "grad_norm": 3.03932523727417,
+ "learning_rate": 3.451974793520771e-07,
+ "loss": 0.3854,
+ "step": 10559
+ },
+ {
+ "epoch": 4.99290780141844,
+ "grad_norm": 3.4757370948791504,
+ "learning_rate": 3.44881239602263e-07,
+ "loss": 0.3909,
+ "step": 10560
+ },
+ {
+ "epoch": 4.993380614657211,
+ "grad_norm": 2.9729294776916504,
+ "learning_rate": 3.4456513404224513e-07,
+ "loss": 0.3645,
+ "step": 10561
+ },
+ {
+ "epoch": 4.993853427895981,
+ "grad_norm": 3.2144060134887695,
+ "learning_rate": 3.4424916269170495e-07,
+ "loss": 0.3236,
+ "step": 10562
+ },
+ {
+ "epoch": 4.994326241134752,
+ "grad_norm": 3.742386817932129,
+ "learning_rate": 3.4393332557031615e-07,
+ "loss": 0.332,
+ "step": 10563
+ },
+ {
+ "epoch": 4.994799054373522,
+ "grad_norm": 3.2569401264190674,
+ "learning_rate": 3.4361762269774557e-07,
+ "loss": 0.3774,
+ "step": 10564
+ },
+ {
+ "epoch": 4.995271867612293,
+ "grad_norm": 2.91739821434021,
+ "learning_rate": 3.433020540936499e-07,
+ "loss": 0.3061,
+ "step": 10565
+ },
+ {
+ "epoch": 4.995744680851064,
+ "grad_norm": 3.534137487411499,
+ "learning_rate": 3.429866197776788e-07,
+ "loss": 0.357,
+ "step": 10566
+ },
+ {
+ "epoch": 4.9962174940898345,
+ "grad_norm": 3.215837001800537,
+ "learning_rate": 3.4267131976947284e-07,
+ "loss": 0.3395,
+ "step": 10567
+ },
+ {
+ "epoch": 4.996690307328605,
+ "grad_norm": 3.294857978820801,
+ "learning_rate": 3.4235615408866384e-07,
+ "loss": 0.3273,
+ "step": 10568
+ },
+ {
+ "epoch": 4.997163120567376,
+ "grad_norm": 3.519171953201294,
+ "learning_rate": 3.4204112275487646e-07,
+ "loss": 0.3712,
+ "step": 10569
+ },
+ {
+ "epoch": 4.997635933806147,
+ "grad_norm": 3.037527084350586,
+ "learning_rate": 3.4172622578772544e-07,
+ "loss": 0.2949,
+ "step": 10570
+ },
+ {
+ "epoch": 4.998108747044917,
+ "grad_norm": 3.309682846069336,
+ "learning_rate": 3.4141146320681913e-07,
+ "loss": 0.3068,
+ "step": 10571
+ },
+ {
+ "epoch": 4.998581560283688,
+ "grad_norm": 3.2197179794311523,
+ "learning_rate": 3.410968350317559e-07,
+ "loss": 0.3725,
+ "step": 10572
+ },
+ {
+ "epoch": 4.999054373522458,
+ "grad_norm": 3.0465641021728516,
+ "learning_rate": 3.4078234128212537e-07,
+ "loss": 0.3505,
+ "step": 10573
+ },
+ {
+ "epoch": 4.999527186761229,
+ "grad_norm": 3.066941022872925,
+ "learning_rate": 3.404679819775114e-07,
+ "loss": 0.3435,
+ "step": 10574
+ },
+ {
+ "epoch": 5.0,
+ "grad_norm": 3.3947532176971436,
+ "learning_rate": 3.401537571374869e-07,
+ "loss": 0.3344,
+ "step": 10575
+ },
+ {
+ "epoch": 5.000472813238771,
+ "grad_norm": 3.2180113792419434,
+ "learning_rate": 3.398396667816167e-07,
+ "loss": 0.2986,
+ "step": 10576
+ },
+ {
+ "epoch": 5.000945626477542,
+ "grad_norm": 3.039257049560547,
+ "learning_rate": 3.395257109294592e-07,
+ "loss": 0.3435,
+ "step": 10577
+ },
+ {
+ "epoch": 5.001418439716312,
+ "grad_norm": 2.675401210784912,
+ "learning_rate": 3.392118896005614e-07,
+ "loss": 0.2722,
+ "step": 10578
+ },
+ {
+ "epoch": 5.001891252955082,
+ "grad_norm": 3.395113468170166,
+ "learning_rate": 3.388982028144652e-07,
+ "loss": 0.3091,
+ "step": 10579
+ },
+ {
+ "epoch": 5.002364066193853,
+ "grad_norm": 2.859116315841675,
+ "learning_rate": 3.385846505907017e-07,
+ "loss": 0.2849,
+ "step": 10580
+ },
+ {
+ "epoch": 5.002836879432624,
+ "grad_norm": 2.703989028930664,
+ "learning_rate": 3.3827123294879416e-07,
+ "loss": 0.3236,
+ "step": 10581
+ },
+ {
+ "epoch": 5.003309692671395,
+ "grad_norm": 3.112179756164551,
+ "learning_rate": 3.379579499082583e-07,
+ "loss": 0.36,
+ "step": 10582
+ },
+ {
+ "epoch": 5.0037825059101655,
+ "grad_norm": 3.0279061794281006,
+ "learning_rate": 3.376448014886008e-07,
+ "loss": 0.3079,
+ "step": 10583
+ },
+ {
+ "epoch": 5.004255319148936,
+ "grad_norm": 3.638592481613159,
+ "learning_rate": 3.373317877093199e-07,
+ "loss": 0.3535,
+ "step": 10584
+ },
+ {
+ "epoch": 5.004728132387707,
+ "grad_norm": 3.7547621726989746,
+ "learning_rate": 3.3701890858990476e-07,
+ "loss": 0.3489,
+ "step": 10585
+ },
+ {
+ "epoch": 5.005200945626478,
+ "grad_norm": 3.151641607284546,
+ "learning_rate": 3.367061641498387e-07,
+ "loss": 0.2844,
+ "step": 10586
+ },
+ {
+ "epoch": 5.005673758865248,
+ "grad_norm": 2.8997435569763184,
+ "learning_rate": 3.363935544085939e-07,
+ "loss": 0.297,
+ "step": 10587
+ },
+ {
+ "epoch": 5.006146572104019,
+ "grad_norm": 3.1785173416137695,
+ "learning_rate": 3.360810793856345e-07,
+ "loss": 0.3055,
+ "step": 10588
+ },
+ {
+ "epoch": 5.006619385342789,
+ "grad_norm": 2.8876235485076904,
+ "learning_rate": 3.357687391004186e-07,
+ "loss": 0.3005,
+ "step": 10589
+ },
+ {
+ "epoch": 5.00709219858156,
+ "grad_norm": 2.9282939434051514,
+ "learning_rate": 3.354565335723936e-07,
+ "loss": 0.2367,
+ "step": 10590
+ },
+ {
+ "epoch": 5.007565011820331,
+ "grad_norm": 3.283669948577881,
+ "learning_rate": 3.3514446282099823e-07,
+ "loss": 0.3386,
+ "step": 10591
+ },
+ {
+ "epoch": 5.008037825059102,
+ "grad_norm": 3.2054243087768555,
+ "learning_rate": 3.348325268656652e-07,
+ "loss": 0.3192,
+ "step": 10592
+ },
+ {
+ "epoch": 5.008510638297873,
+ "grad_norm": 2.9525279998779297,
+ "learning_rate": 3.345207257258162e-07,
+ "loss": 0.3742,
+ "step": 10593
+ },
+ {
+ "epoch": 5.008983451536643,
+ "grad_norm": 2.8707218170166016,
+ "learning_rate": 3.3420905942086664e-07,
+ "loss": 0.2783,
+ "step": 10594
+ },
+ {
+ "epoch": 5.009456264775413,
+ "grad_norm": 2.9152603149414062,
+ "learning_rate": 3.3389752797022253e-07,
+ "loss": 0.3267,
+ "step": 10595
+ },
+ {
+ "epoch": 5.009929078014184,
+ "grad_norm": 3.2035791873931885,
+ "learning_rate": 3.3358613139328063e-07,
+ "loss": 0.3519,
+ "step": 10596
+ },
+ {
+ "epoch": 5.010401891252955,
+ "grad_norm": 3.4502341747283936,
+ "learning_rate": 3.3327486970943123e-07,
+ "loss": 0.3766,
+ "step": 10597
+ },
+ {
+ "epoch": 5.010874704491726,
+ "grad_norm": 2.740959405899048,
+ "learning_rate": 3.32963742938055e-07,
+ "loss": 0.2985,
+ "step": 10598
+ },
+ {
+ "epoch": 5.0113475177304965,
+ "grad_norm": 2.6652681827545166,
+ "learning_rate": 3.326527510985239e-07,
+ "loss": 0.3016,
+ "step": 10599
+ },
+ {
+ "epoch": 5.011820330969267,
+ "grad_norm": 2.980694532394409,
+ "learning_rate": 3.3234189421020304e-07,
+ "loss": 0.3407,
+ "step": 10600
+ },
+ {
+ "epoch": 5.012293144208038,
+ "grad_norm": 2.976670742034912,
+ "learning_rate": 3.320311722924474e-07,
+ "loss": 0.3156,
+ "step": 10601
+ },
+ {
+ "epoch": 5.012765957446809,
+ "grad_norm": 3.8425865173339844,
+ "learning_rate": 3.317205853646044e-07,
+ "loss": 0.3362,
+ "step": 10602
+ },
+ {
+ "epoch": 5.013238770685579,
+ "grad_norm": 3.247138023376465,
+ "learning_rate": 3.314101334460129e-07,
+ "loss": 0.2753,
+ "step": 10603
+ },
+ {
+ "epoch": 5.01371158392435,
+ "grad_norm": 3.321345567703247,
+ "learning_rate": 3.310998165560031e-07,
+ "loss": 0.3464,
+ "step": 10604
+ },
+ {
+ "epoch": 5.01418439716312,
+ "grad_norm": 3.3347668647766113,
+ "learning_rate": 3.307896347138978e-07,
+ "loss": 0.3349,
+ "step": 10605
+ },
+ {
+ "epoch": 5.014657210401891,
+ "grad_norm": 2.7465898990631104,
+ "learning_rate": 3.304795879390096e-07,
+ "loss": 0.3201,
+ "step": 10606
+ },
+ {
+ "epoch": 5.015130023640662,
+ "grad_norm": 2.986417293548584,
+ "learning_rate": 3.301696762506448e-07,
+ "loss": 0.345,
+ "step": 10607
+ },
+ {
+ "epoch": 5.015602836879433,
+ "grad_norm": 2.928632974624634,
+ "learning_rate": 3.2985989966810017e-07,
+ "loss": 0.3285,
+ "step": 10608
+ },
+ {
+ "epoch": 5.0160756501182036,
+ "grad_norm": 3.1549665927886963,
+ "learning_rate": 3.29550258210663e-07,
+ "loss": 0.3129,
+ "step": 10609
+ },
+ {
+ "epoch": 5.016548463356974,
+ "grad_norm": 3.2318291664123535,
+ "learning_rate": 3.292407518976148e-07,
+ "loss": 0.3152,
+ "step": 10610
+ },
+ {
+ "epoch": 5.017021276595744,
+ "grad_norm": 2.8423120975494385,
+ "learning_rate": 3.2893138074822613e-07,
+ "loss": 0.3186,
+ "step": 10611
+ },
+ {
+ "epoch": 5.017494089834515,
+ "grad_norm": 3.5018017292022705,
+ "learning_rate": 3.2862214478176084e-07,
+ "loss": 0.3586,
+ "step": 10612
+ },
+ {
+ "epoch": 5.017966903073286,
+ "grad_norm": 3.0157501697540283,
+ "learning_rate": 3.283130440174734e-07,
+ "loss": 0.2825,
+ "step": 10613
+ },
+ {
+ "epoch": 5.018439716312057,
+ "grad_norm": 2.756171226501465,
+ "learning_rate": 3.280040784746097e-07,
+ "loss": 0.2934,
+ "step": 10614
+ },
+ {
+ "epoch": 5.0189125295508275,
+ "grad_norm": 3.8830623626708984,
+ "learning_rate": 3.2769524817240855e-07,
+ "loss": 0.3515,
+ "step": 10615
+ },
+ {
+ "epoch": 5.019385342789598,
+ "grad_norm": 3.1528213024139404,
+ "learning_rate": 3.2738655313009903e-07,
+ "loss": 0.37,
+ "step": 10616
+ },
+ {
+ "epoch": 5.019858156028369,
+ "grad_norm": 2.943558692932129,
+ "learning_rate": 3.270779933669022e-07,
+ "loss": 0.2928,
+ "step": 10617
+ },
+ {
+ "epoch": 5.02033096926714,
+ "grad_norm": 3.229234457015991,
+ "learning_rate": 3.267695689020309e-07,
+ "loss": 0.3264,
+ "step": 10618
+ },
+ {
+ "epoch": 5.02080378250591,
+ "grad_norm": 3.3662054538726807,
+ "learning_rate": 3.264612797546884e-07,
+ "loss": 0.3564,
+ "step": 10619
+ },
+ {
+ "epoch": 5.0212765957446805,
+ "grad_norm": 2.895679235458374,
+ "learning_rate": 3.2615312594407227e-07,
+ "loss": 0.3013,
+ "step": 10620
+ },
+ {
+ "epoch": 5.021749408983451,
+ "grad_norm": 3.1217849254608154,
+ "learning_rate": 3.258451074893687e-07,
+ "loss": 0.3122,
+ "step": 10621
+ },
+ {
+ "epoch": 5.022222222222222,
+ "grad_norm": 3.1026153564453125,
+ "learning_rate": 3.255372244097563e-07,
+ "loss": 0.3466,
+ "step": 10622
+ },
+ {
+ "epoch": 5.022695035460993,
+ "grad_norm": 2.99145245552063,
+ "learning_rate": 3.2522947672440684e-07,
+ "loss": 0.2763,
+ "step": 10623
+ },
+ {
+ "epoch": 5.023167848699764,
+ "grad_norm": 3.6011345386505127,
+ "learning_rate": 3.2492186445248125e-07,
+ "loss": 0.3652,
+ "step": 10624
+ },
+ {
+ "epoch": 5.0236406619385345,
+ "grad_norm": 3.3117830753326416,
+ "learning_rate": 3.246143876131344e-07,
+ "loss": 0.3419,
+ "step": 10625
+ },
+ {
+ "epoch": 5.024113475177305,
+ "grad_norm": 3.467041492462158,
+ "learning_rate": 3.243070462255107e-07,
+ "loss": 0.3495,
+ "step": 10626
+ },
+ {
+ "epoch": 5.024586288416075,
+ "grad_norm": 3.3493213653564453,
+ "learning_rate": 3.2399984030874657e-07,
+ "loss": 0.3214,
+ "step": 10627
+ },
+ {
+ "epoch": 5.025059101654846,
+ "grad_norm": 2.9000561237335205,
+ "learning_rate": 3.236927698819714e-07,
+ "loss": 0.3358,
+ "step": 10628
+ },
+ {
+ "epoch": 5.025531914893617,
+ "grad_norm": 3.1759841442108154,
+ "learning_rate": 3.233858349643043e-07,
+ "loss": 0.3122,
+ "step": 10629
+ },
+ {
+ "epoch": 5.026004728132388,
+ "grad_norm": 3.0218966007232666,
+ "learning_rate": 3.2307903557485754e-07,
+ "loss": 0.3051,
+ "step": 10630
+ },
+ {
+ "epoch": 5.026477541371158,
+ "grad_norm": 3.0489871501922607,
+ "learning_rate": 3.227723717327336e-07,
+ "loss": 0.3551,
+ "step": 10631
+ },
+ {
+ "epoch": 5.026950354609929,
+ "grad_norm": 3.190955638885498,
+ "learning_rate": 3.224658434570271e-07,
+ "loss": 0.3282,
+ "step": 10632
+ },
+ {
+ "epoch": 5.0274231678487,
+ "grad_norm": 3.4504268169403076,
+ "learning_rate": 3.2215945076682463e-07,
+ "loss": 0.3401,
+ "step": 10633
+ },
+ {
+ "epoch": 5.027895981087471,
+ "grad_norm": 2.847717761993408,
+ "learning_rate": 3.2185319368120304e-07,
+ "loss": 0.3045,
+ "step": 10634
+ },
+ {
+ "epoch": 5.028368794326241,
+ "grad_norm": 3.450645923614502,
+ "learning_rate": 3.2154707221923265e-07,
+ "loss": 0.3026,
+ "step": 10635
+ },
+ {
+ "epoch": 5.0288416075650115,
+ "grad_norm": 3.739544630050659,
+ "learning_rate": 3.212410863999738e-07,
+ "loss": 0.3375,
+ "step": 10636
+ },
+ {
+ "epoch": 5.029314420803782,
+ "grad_norm": 3.1687986850738525,
+ "learning_rate": 3.2093523624247837e-07,
+ "loss": 0.2995,
+ "step": 10637
+ },
+ {
+ "epoch": 5.029787234042553,
+ "grad_norm": 3.5549468994140625,
+ "learning_rate": 3.2062952176579147e-07,
+ "loss": 0.3618,
+ "step": 10638
+ },
+ {
+ "epoch": 5.030260047281324,
+ "grad_norm": 3.1378657817840576,
+ "learning_rate": 3.2032394298894766e-07,
+ "loss": 0.3621,
+ "step": 10639
+ },
+ {
+ "epoch": 5.030732860520095,
+ "grad_norm": 2.9576902389526367,
+ "learning_rate": 3.200184999309747e-07,
+ "loss": 0.2966,
+ "step": 10640
+ },
+ {
+ "epoch": 5.0312056737588655,
+ "grad_norm": 2.8072168827056885,
+ "learning_rate": 3.197131926108907e-07,
+ "loss": 0.3185,
+ "step": 10641
+ },
+ {
+ "epoch": 5.031678486997636,
+ "grad_norm": 2.894251585006714,
+ "learning_rate": 3.1940802104770547e-07,
+ "loss": 0.3022,
+ "step": 10642
+ },
+ {
+ "epoch": 5.032151300236406,
+ "grad_norm": 3.3244433403015137,
+ "learning_rate": 3.191029852604216e-07,
+ "loss": 0.2689,
+ "step": 10643
+ },
+ {
+ "epoch": 5.032624113475177,
+ "grad_norm": 2.8501358032226562,
+ "learning_rate": 3.187980852680314e-07,
+ "loss": 0.2812,
+ "step": 10644
+ },
+ {
+ "epoch": 5.033096926713948,
+ "grad_norm": 2.975888252258301,
+ "learning_rate": 3.184933210895208e-07,
+ "loss": 0.2849,
+ "step": 10645
+ },
+ {
+ "epoch": 5.033569739952719,
+ "grad_norm": 3.391071081161499,
+ "learning_rate": 3.1818869274386543e-07,
+ "loss": 0.3458,
+ "step": 10646
+ },
+ {
+ "epoch": 5.034042553191489,
+ "grad_norm": 2.735366106033325,
+ "learning_rate": 3.178842002500332e-07,
+ "loss": 0.32,
+ "step": 10647
+ },
+ {
+ "epoch": 5.03451536643026,
+ "grad_norm": 3.1227850914001465,
+ "learning_rate": 3.175798436269836e-07,
+ "loss": 0.268,
+ "step": 10648
+ },
+ {
+ "epoch": 5.034988179669031,
+ "grad_norm": 3.4545602798461914,
+ "learning_rate": 3.172756228936674e-07,
+ "loss": 0.3474,
+ "step": 10649
+ },
+ {
+ "epoch": 5.035460992907802,
+ "grad_norm": 3.0225183963775635,
+ "learning_rate": 3.1697153806902665e-07,
+ "loss": 0.3511,
+ "step": 10650
+ },
+ {
+ "epoch": 5.035933806146572,
+ "grad_norm": 3.1183130741119385,
+ "learning_rate": 3.166675891719967e-07,
+ "loss": 0.3074,
+ "step": 10651
+ },
+ {
+ "epoch": 5.0364066193853425,
+ "grad_norm": 4.368288040161133,
+ "learning_rate": 3.163637762215016e-07,
+ "loss": 0.3359,
+ "step": 10652
+ },
+ {
+ "epoch": 5.036879432624113,
+ "grad_norm": 3.097292184829712,
+ "learning_rate": 3.1606009923645986e-07,
+ "loss": 0.2928,
+ "step": 10653
+ },
+ {
+ "epoch": 5.037352245862884,
+ "grad_norm": 3.81937837600708,
+ "learning_rate": 3.157565582357794e-07,
+ "loss": 0.3097,
+ "step": 10654
+ },
+ {
+ "epoch": 5.037825059101655,
+ "grad_norm": 3.05619215965271,
+ "learning_rate": 3.1545315323835977e-07,
+ "loss": 0.3061,
+ "step": 10655
+ },
+ {
+ "epoch": 5.038297872340426,
+ "grad_norm": 3.4275028705596924,
+ "learning_rate": 3.15149884263094e-07,
+ "loss": 0.3396,
+ "step": 10656
+ },
+ {
+ "epoch": 5.0387706855791965,
+ "grad_norm": 2.899402379989624,
+ "learning_rate": 3.148467513288639e-07,
+ "loss": 0.3049,
+ "step": 10657
+ },
+ {
+ "epoch": 5.039243498817967,
+ "grad_norm": 2.9791312217712402,
+ "learning_rate": 3.145437544545457e-07,
+ "loss": 0.286,
+ "step": 10658
+ },
+ {
+ "epoch": 5.039716312056737,
+ "grad_norm": 2.78840708732605,
+ "learning_rate": 3.1424089365900524e-07,
+ "loss": 0.3045,
+ "step": 10659
+ },
+ {
+ "epoch": 5.040189125295508,
+ "grad_norm": 3.066490650177002,
+ "learning_rate": 3.139381689610993e-07,
+ "loss": 0.3093,
+ "step": 10660
+ },
+ {
+ "epoch": 5.040661938534279,
+ "grad_norm": 3.0271544456481934,
+ "learning_rate": 3.136355803796784e-07,
+ "loss": 0.3018,
+ "step": 10661
+ },
+ {
+ "epoch": 5.04113475177305,
+ "grad_norm": 3.203864336013794,
+ "learning_rate": 3.13333127933583e-07,
+ "loss": 0.3668,
+ "step": 10662
+ },
+ {
+ "epoch": 5.04160756501182,
+ "grad_norm": 3.2266061305999756,
+ "learning_rate": 3.13030811641645e-07,
+ "loss": 0.3551,
+ "step": 10663
+ },
+ {
+ "epoch": 5.042080378250591,
+ "grad_norm": 3.262101650238037,
+ "learning_rate": 3.1272863152268935e-07,
+ "loss": 0.346,
+ "step": 10664
+ },
+ {
+ "epoch": 5.042553191489362,
+ "grad_norm": 3.701680898666382,
+ "learning_rate": 3.12426587595531e-07,
+ "loss": 0.2847,
+ "step": 10665
+ },
+ {
+ "epoch": 5.043026004728133,
+ "grad_norm": 3.3046317100524902,
+ "learning_rate": 3.121246798789768e-07,
+ "loss": 0.3787,
+ "step": 10666
+ },
+ {
+ "epoch": 5.043498817966903,
+ "grad_norm": 3.1488304138183594,
+ "learning_rate": 3.118229083918245e-07,
+ "loss": 0.2567,
+ "step": 10667
+ },
+ {
+ "epoch": 5.0439716312056735,
+ "grad_norm": 3.2503244876861572,
+ "learning_rate": 3.115212731528655e-07,
+ "loss": 0.3322,
+ "step": 10668
+ },
+ {
+ "epoch": 5.044444444444444,
+ "grad_norm": 3.071131706237793,
+ "learning_rate": 3.112197741808809e-07,
+ "loss": 0.2992,
+ "step": 10669
+ },
+ {
+ "epoch": 5.044917257683215,
+ "grad_norm": 3.5055527687072754,
+ "learning_rate": 3.1091841149464256e-07,
+ "loss": 0.3817,
+ "step": 10670
+ },
+ {
+ "epoch": 5.045390070921986,
+ "grad_norm": 3.1892800331115723,
+ "learning_rate": 3.106171851129164e-07,
+ "loss": 0.3413,
+ "step": 10671
+ },
+ {
+ "epoch": 5.045862884160757,
+ "grad_norm": 3.267735481262207,
+ "learning_rate": 3.1031609505445814e-07,
+ "loss": 0.3352,
+ "step": 10672
+ },
+ {
+ "epoch": 5.0463356973995275,
+ "grad_norm": 3.1450655460357666,
+ "learning_rate": 3.100151413380145e-07,
+ "loss": 0.3002,
+ "step": 10673
+ },
+ {
+ "epoch": 5.046808510638298,
+ "grad_norm": 3.761446475982666,
+ "learning_rate": 3.097143239823261e-07,
+ "loss": 0.366,
+ "step": 10674
+ },
+ {
+ "epoch": 5.047281323877068,
+ "grad_norm": 3.272583246231079,
+ "learning_rate": 3.0941364300612177e-07,
+ "loss": 0.3651,
+ "step": 10675
+ },
+ {
+ "epoch": 5.047754137115839,
+ "grad_norm": 3.0774614810943604,
+ "learning_rate": 3.0911309842812487e-07,
+ "loss": 0.3441,
+ "step": 10676
+ },
+ {
+ "epoch": 5.04822695035461,
+ "grad_norm": 3.3999059200286865,
+ "learning_rate": 3.088126902670488e-07,
+ "loss": 0.3242,
+ "step": 10677
+ },
+ {
+ "epoch": 5.048699763593381,
+ "grad_norm": 3.13442325592041,
+ "learning_rate": 3.0851241854159804e-07,
+ "loss": 0.3085,
+ "step": 10678
+ },
+ {
+ "epoch": 5.049172576832151,
+ "grad_norm": 3.2339842319488525,
+ "learning_rate": 3.082122832704698e-07,
+ "loss": 0.3315,
+ "step": 10679
+ },
+ {
+ "epoch": 5.049645390070922,
+ "grad_norm": 2.742600202560425,
+ "learning_rate": 3.079122844723523e-07,
+ "loss": 0.2974,
+ "step": 10680
+ },
+ {
+ "epoch": 5.050118203309693,
+ "grad_norm": 2.778710126876831,
+ "learning_rate": 3.0761242216592463e-07,
+ "loss": 0.317,
+ "step": 10681
+ },
+ {
+ "epoch": 5.050591016548464,
+ "grad_norm": 3.219449520111084,
+ "learning_rate": 3.0731269636985805e-07,
+ "loss": 0.3692,
+ "step": 10682
+ },
+ {
+ "epoch": 5.051063829787234,
+ "grad_norm": 3.0930869579315186,
+ "learning_rate": 3.0701310710281486e-07,
+ "loss": 0.3278,
+ "step": 10683
+ },
+ {
+ "epoch": 5.0515366430260045,
+ "grad_norm": 3.427046298980713,
+ "learning_rate": 3.0671365438345006e-07,
+ "loss": 0.3162,
+ "step": 10684
+ },
+ {
+ "epoch": 5.052009456264775,
+ "grad_norm": 3.582806348800659,
+ "learning_rate": 3.06414338230408e-07,
+ "loss": 0.3586,
+ "step": 10685
+ },
+ {
+ "epoch": 5.052482269503546,
+ "grad_norm": 3.381739377975464,
+ "learning_rate": 3.0611515866232704e-07,
+ "loss": 0.3436,
+ "step": 10686
+ },
+ {
+ "epoch": 5.052955082742317,
+ "grad_norm": 3.377415418624878,
+ "learning_rate": 3.058161156978356e-07,
+ "loss": 0.3767,
+ "step": 10687
+ },
+ {
+ "epoch": 5.053427895981088,
+ "grad_norm": 3.4613475799560547,
+ "learning_rate": 3.055172093555525e-07,
+ "loss": 0.3364,
+ "step": 10688
+ },
+ {
+ "epoch": 5.0539007092198585,
+ "grad_norm": 2.8797378540039062,
+ "learning_rate": 3.052184396540911e-07,
+ "loss": 0.2771,
+ "step": 10689
+ },
+ {
+ "epoch": 5.054373522458629,
+ "grad_norm": 3.4278573989868164,
+ "learning_rate": 3.0491980661205287e-07,
+ "loss": 0.355,
+ "step": 10690
+ },
+ {
+ "epoch": 5.054846335697399,
+ "grad_norm": 2.910229444503784,
+ "learning_rate": 3.046213102480339e-07,
+ "loss": 0.2778,
+ "step": 10691
+ },
+ {
+ "epoch": 5.05531914893617,
+ "grad_norm": 2.9667794704437256,
+ "learning_rate": 3.0432295058061945e-07,
+ "loss": 0.2809,
+ "step": 10692
+ },
+ {
+ "epoch": 5.055791962174941,
+ "grad_norm": 3.119274854660034,
+ "learning_rate": 3.040247276283864e-07,
+ "loss": 0.3429,
+ "step": 10693
+ },
+ {
+ "epoch": 5.0562647754137116,
+ "grad_norm": 3.1743738651275635,
+ "learning_rate": 3.037266414099052e-07,
+ "loss": 0.2842,
+ "step": 10694
+ },
+ {
+ "epoch": 5.056737588652482,
+ "grad_norm": 2.9281697273254395,
+ "learning_rate": 3.0342869194373544e-07,
+ "loss": 0.3083,
+ "step": 10695
+ },
+ {
+ "epoch": 5.057210401891253,
+ "grad_norm": 2.8758695125579834,
+ "learning_rate": 3.0313087924842974e-07,
+ "loss": 0.3253,
+ "step": 10696
+ },
+ {
+ "epoch": 5.057683215130024,
+ "grad_norm": 3.0644049644470215,
+ "learning_rate": 3.0283320334253074e-07,
+ "loss": 0.3108,
+ "step": 10697
+ },
+ {
+ "epoch": 5.058156028368795,
+ "grad_norm": 3.1170527935028076,
+ "learning_rate": 3.025356642445737e-07,
+ "loss": 0.3578,
+ "step": 10698
+ },
+ {
+ "epoch": 5.058628841607565,
+ "grad_norm": 3.1002635955810547,
+ "learning_rate": 3.022382619730857e-07,
+ "loss": 0.2711,
+ "step": 10699
+ },
+ {
+ "epoch": 5.0591016548463354,
+ "grad_norm": 3.3275279998779297,
+ "learning_rate": 3.0194099654658414e-07,
+ "loss": 0.3266,
+ "step": 10700
+ },
+ {
+ "epoch": 5.059574468085106,
+ "grad_norm": 3.003758192062378,
+ "learning_rate": 3.0164386798357814e-07,
+ "loss": 0.325,
+ "step": 10701
+ },
+ {
+ "epoch": 5.060047281323877,
+ "grad_norm": 3.391663074493408,
+ "learning_rate": 3.013468763025695e-07,
+ "loss": 0.3394,
+ "step": 10702
+ },
+ {
+ "epoch": 5.060520094562648,
+ "grad_norm": 3.3918750286102295,
+ "learning_rate": 3.0105002152204936e-07,
+ "loss": 0.3408,
+ "step": 10703
+ },
+ {
+ "epoch": 5.060992907801419,
+ "grad_norm": 3.5883846282958984,
+ "learning_rate": 3.0075330366050317e-07,
+ "loss": 0.3174,
+ "step": 10704
+ },
+ {
+ "epoch": 5.061465721040189,
+ "grad_norm": 2.7748823165893555,
+ "learning_rate": 3.00456722736405e-07,
+ "loss": 0.2628,
+ "step": 10705
+ },
+ {
+ "epoch": 5.06193853427896,
+ "grad_norm": 3.1777453422546387,
+ "learning_rate": 3.0016027876822147e-07,
+ "loss": 0.314,
+ "step": 10706
+ },
+ {
+ "epoch": 5.06241134751773,
+ "grad_norm": 2.857104778289795,
+ "learning_rate": 2.99863971774412e-07,
+ "loss": 0.2587,
+ "step": 10707
+ },
+ {
+ "epoch": 5.062884160756501,
+ "grad_norm": 3.162203550338745,
+ "learning_rate": 2.995678017734252e-07,
+ "loss": 0.3167,
+ "step": 10708
+ },
+ {
+ "epoch": 5.063356973995272,
+ "grad_norm": 2.9197676181793213,
+ "learning_rate": 2.992717687837032e-07,
+ "loss": 0.2971,
+ "step": 10709
+ },
+ {
+ "epoch": 5.0638297872340425,
+ "grad_norm": 3.4100844860076904,
+ "learning_rate": 2.9897587282367856e-07,
+ "loss": 0.2964,
+ "step": 10710
+ },
+ {
+ "epoch": 5.064302600472813,
+ "grad_norm": 3.562899351119995,
+ "learning_rate": 2.986801139117748e-07,
+ "loss": 0.3344,
+ "step": 10711
+ },
+ {
+ "epoch": 5.064775413711584,
+ "grad_norm": 2.8942716121673584,
+ "learning_rate": 2.9838449206640806e-07,
+ "loss": 0.3082,
+ "step": 10712
+ },
+ {
+ "epoch": 5.065248226950355,
+ "grad_norm": 3.159891366958618,
+ "learning_rate": 2.9808900730598445e-07,
+ "loss": 0.3353,
+ "step": 10713
+ },
+ {
+ "epoch": 5.065721040189126,
+ "grad_norm": 2.9876580238342285,
+ "learning_rate": 2.9779365964890395e-07,
+ "loss": 0.3224,
+ "step": 10714
+ },
+ {
+ "epoch": 5.066193853427896,
+ "grad_norm": 3.038656234741211,
+ "learning_rate": 2.974984491135557e-07,
+ "loss": 0.347,
+ "step": 10715
+ },
+ {
+ "epoch": 5.066666666666666,
+ "grad_norm": 3.407076358795166,
+ "learning_rate": 2.9720337571832126e-07,
+ "loss": 0.3057,
+ "step": 10716
+ },
+ {
+ "epoch": 5.067139479905437,
+ "grad_norm": 3.6783857345581055,
+ "learning_rate": 2.9690843948157384e-07,
+ "loss": 0.3764,
+ "step": 10717
+ },
+ {
+ "epoch": 5.067612293144208,
+ "grad_norm": 3.081132173538208,
+ "learning_rate": 2.9661364042167765e-07,
+ "loss": 0.326,
+ "step": 10718
+ },
+ {
+ "epoch": 5.068085106382979,
+ "grad_norm": 3.4150638580322266,
+ "learning_rate": 2.9631897855698784e-07,
+ "loss": 0.3286,
+ "step": 10719
+ },
+ {
+ "epoch": 5.06855791962175,
+ "grad_norm": 3.6125104427337646,
+ "learning_rate": 2.960244539058532e-07,
+ "loss": 0.3523,
+ "step": 10720
+ },
+ {
+ "epoch": 5.06903073286052,
+ "grad_norm": 3.38793683052063,
+ "learning_rate": 2.9573006648661084e-07,
+ "loss": 0.3552,
+ "step": 10721
+ },
+ {
+ "epoch": 5.069503546099291,
+ "grad_norm": 3.3725569248199463,
+ "learning_rate": 2.9543581631759243e-07,
+ "loss": 0.3445,
+ "step": 10722
+ },
+ {
+ "epoch": 5.069976359338061,
+ "grad_norm": 3.1193649768829346,
+ "learning_rate": 2.9514170341711917e-07,
+ "loss": 0.312,
+ "step": 10723
+ },
+ {
+ "epoch": 5.070449172576832,
+ "grad_norm": 3.3182318210601807,
+ "learning_rate": 2.948477278035033e-07,
+ "loss": 0.357,
+ "step": 10724
+ },
+ {
+ "epoch": 5.070921985815603,
+ "grad_norm": 2.945859909057617,
+ "learning_rate": 2.9455388949505087e-07,
+ "loss": 0.2992,
+ "step": 10725
+ },
+ {
+ "epoch": 5.0713947990543735,
+ "grad_norm": 3.269683599472046,
+ "learning_rate": 2.942601885100571e-07,
+ "loss": 0.3415,
+ "step": 10726
+ },
+ {
+ "epoch": 5.071867612293144,
+ "grad_norm": 3.268453598022461,
+ "learning_rate": 2.939666248668094e-07,
+ "loss": 0.3655,
+ "step": 10727
+ },
+ {
+ "epoch": 5.072340425531915,
+ "grad_norm": 3.0961155891418457,
+ "learning_rate": 2.936731985835864e-07,
+ "loss": 0.3281,
+ "step": 10728
+ },
+ {
+ "epoch": 5.072813238770686,
+ "grad_norm": 2.9241912364959717,
+ "learning_rate": 2.9337990967865935e-07,
+ "loss": 0.3229,
+ "step": 10729
+ },
+ {
+ "epoch": 5.073286052009456,
+ "grad_norm": 3.4160847663879395,
+ "learning_rate": 2.9308675817028955e-07,
+ "loss": 0.3711,
+ "step": 10730
+ },
+ {
+ "epoch": 5.073758865248227,
+ "grad_norm": 3.7135009765625,
+ "learning_rate": 2.927937440767298e-07,
+ "loss": 0.3194,
+ "step": 10731
+ },
+ {
+ "epoch": 5.074231678486997,
+ "grad_norm": 3.281792163848877,
+ "learning_rate": 2.925008674162258e-07,
+ "loss": 0.3679,
+ "step": 10732
+ },
+ {
+ "epoch": 5.074704491725768,
+ "grad_norm": 3.290614366531372,
+ "learning_rate": 2.922081282070133e-07,
+ "loss": 0.3811,
+ "step": 10733
+ },
+ {
+ "epoch": 5.075177304964539,
+ "grad_norm": 3.0560739040374756,
+ "learning_rate": 2.9191552646731904e-07,
+ "loss": 0.3258,
+ "step": 10734
+ },
+ {
+ "epoch": 5.07565011820331,
+ "grad_norm": 3.7462680339813232,
+ "learning_rate": 2.916230622153635e-07,
+ "loss": 0.3541,
+ "step": 10735
+ },
+ {
+ "epoch": 5.076122931442081,
+ "grad_norm": 3.315908193588257,
+ "learning_rate": 2.9133073546935564e-07,
+ "loss": 0.3232,
+ "step": 10736
+ },
+ {
+ "epoch": 5.076595744680851,
+ "grad_norm": 5.690020561218262,
+ "learning_rate": 2.9103854624749907e-07,
+ "loss": 0.2898,
+ "step": 10737
+ },
+ {
+ "epoch": 5.077068557919622,
+ "grad_norm": 2.9824652671813965,
+ "learning_rate": 2.907464945679861e-07,
+ "loss": 0.3538,
+ "step": 10738
+ },
+ {
+ "epoch": 5.077541371158392,
+ "grad_norm": 3.5233614444732666,
+ "learning_rate": 2.9045458044900084e-07,
+ "loss": 0.3549,
+ "step": 10739
+ },
+ {
+ "epoch": 5.078014184397163,
+ "grad_norm": 3.1448633670806885,
+ "learning_rate": 2.901628039087212e-07,
+ "loss": 0.3548,
+ "step": 10740
+ },
+ {
+ "epoch": 5.078486997635934,
+ "grad_norm": 3.2169477939605713,
+ "learning_rate": 2.8987116496531356e-07,
+ "loss": 0.371,
+ "step": 10741
+ },
+ {
+ "epoch": 5.0789598108747045,
+ "grad_norm": 2.862016201019287,
+ "learning_rate": 2.895796636369372e-07,
+ "loss": 0.3063,
+ "step": 10742
+ },
+ {
+ "epoch": 5.079432624113475,
+ "grad_norm": 2.875854253768921,
+ "learning_rate": 2.892882999417429e-07,
+ "loss": 0.3384,
+ "step": 10743
+ },
+ {
+ "epoch": 5.079905437352246,
+ "grad_norm": 2.9410934448242188,
+ "learning_rate": 2.8899707389787285e-07,
+ "loss": 0.2969,
+ "step": 10744
+ },
+ {
+ "epoch": 5.080378250591017,
+ "grad_norm": 3.3200604915618896,
+ "learning_rate": 2.8870598552345973e-07,
+ "loss": 0.3632,
+ "step": 10745
+ },
+ {
+ "epoch": 5.080851063829787,
+ "grad_norm": 2.933659315109253,
+ "learning_rate": 2.884150348366288e-07,
+ "loss": 0.2858,
+ "step": 10746
+ },
+ {
+ "epoch": 5.081323877068558,
+ "grad_norm": 3.1934702396392822,
+ "learning_rate": 2.8812422185549584e-07,
+ "loss": 0.322,
+ "step": 10747
+ },
+ {
+ "epoch": 5.081796690307328,
+ "grad_norm": 3.283770799636841,
+ "learning_rate": 2.878335465981691e-07,
+ "loss": 0.3101,
+ "step": 10748
+ },
+ {
+ "epoch": 5.082269503546099,
+ "grad_norm": 3.2978734970092773,
+ "learning_rate": 2.8754300908274665e-07,
+ "loss": 0.3595,
+ "step": 10749
+ },
+ {
+ "epoch": 5.08274231678487,
+ "grad_norm": 3.335786819458008,
+ "learning_rate": 2.872526093273206e-07,
+ "loss": 0.3647,
+ "step": 10750
+ },
+ {
+ "epoch": 5.083215130023641,
+ "grad_norm": 3.0334603786468506,
+ "learning_rate": 2.8696234734997163e-07,
+ "loss": 0.2988,
+ "step": 10751
+ },
+ {
+ "epoch": 5.083687943262412,
+ "grad_norm": 4.127038478851318,
+ "learning_rate": 2.86672223168773e-07,
+ "loss": 0.3852,
+ "step": 10752
+ },
+ {
+ "epoch": 5.084160756501182,
+ "grad_norm": 2.993657350540161,
+ "learning_rate": 2.8638223680179044e-07,
+ "loss": 0.3036,
+ "step": 10753
+ },
+ {
+ "epoch": 5.084633569739952,
+ "grad_norm": 3.19315505027771,
+ "learning_rate": 2.860923882670791e-07,
+ "loss": 0.3314,
+ "step": 10754
+ },
+ {
+ "epoch": 5.085106382978723,
+ "grad_norm": 3.3229005336761475,
+ "learning_rate": 2.8580267758268747e-07,
+ "loss": 0.3511,
+ "step": 10755
+ },
+ {
+ "epoch": 5.085579196217494,
+ "grad_norm": 2.942401885986328,
+ "learning_rate": 2.855131047666543e-07,
+ "loss": 0.2576,
+ "step": 10756
+ },
+ {
+ "epoch": 5.086052009456265,
+ "grad_norm": 3.43926739692688,
+ "learning_rate": 2.8522366983700924e-07,
+ "loss": 0.315,
+ "step": 10757
+ },
+ {
+ "epoch": 5.0865248226950355,
+ "grad_norm": 3.041480541229248,
+ "learning_rate": 2.849343728117754e-07,
+ "loss": 0.323,
+ "step": 10758
+ },
+ {
+ "epoch": 5.086997635933806,
+ "grad_norm": 3.1940221786499023,
+ "learning_rate": 2.846452137089653e-07,
+ "loss": 0.3232,
+ "step": 10759
+ },
+ {
+ "epoch": 5.087470449172577,
+ "grad_norm": 3.105175018310547,
+ "learning_rate": 2.8435619254658347e-07,
+ "loss": 0.3276,
+ "step": 10760
+ },
+ {
+ "epoch": 5.087943262411348,
+ "grad_norm": 3.0791914463043213,
+ "learning_rate": 2.840673093426266e-07,
+ "loss": 0.3319,
+ "step": 10761
+ },
+ {
+ "epoch": 5.088416075650118,
+ "grad_norm": 3.131887435913086,
+ "learning_rate": 2.837785641150811e-07,
+ "loss": 0.3716,
+ "step": 10762
+ },
+ {
+ "epoch": 5.088888888888889,
+ "grad_norm": 3.0012013912200928,
+ "learning_rate": 2.8348995688192716e-07,
+ "loss": 0.3748,
+ "step": 10763
+ },
+ {
+ "epoch": 5.089361702127659,
+ "grad_norm": 3.021074056625366,
+ "learning_rate": 2.832014876611339e-07,
+ "loss": 0.3179,
+ "step": 10764
+ },
+ {
+ "epoch": 5.08983451536643,
+ "grad_norm": 3.4271838665008545,
+ "learning_rate": 2.829131564706641e-07,
+ "loss": 0.3445,
+ "step": 10765
+ },
+ {
+ "epoch": 5.090307328605201,
+ "grad_norm": 3.4268107414245605,
+ "learning_rate": 2.8262496332847057e-07,
+ "loss": 0.3633,
+ "step": 10766
+ },
+ {
+ "epoch": 5.090780141843972,
+ "grad_norm": 2.9987759590148926,
+ "learning_rate": 2.823369082524971e-07,
+ "loss": 0.2973,
+ "step": 10767
+ },
+ {
+ "epoch": 5.091252955082743,
+ "grad_norm": 3.0510613918304443,
+ "learning_rate": 2.8204899126068054e-07,
+ "loss": 0.3014,
+ "step": 10768
+ },
+ {
+ "epoch": 5.091725768321513,
+ "grad_norm": 3.7389485836029053,
+ "learning_rate": 2.817612123709482e-07,
+ "loss": 0.3693,
+ "step": 10769
+ },
+ {
+ "epoch": 5.092198581560283,
+ "grad_norm": 3.183263063430786,
+ "learning_rate": 2.814735716012176e-07,
+ "loss": 0.2895,
+ "step": 10770
+ },
+ {
+ "epoch": 5.092671394799054,
+ "grad_norm": 3.173675537109375,
+ "learning_rate": 2.8118606896940043e-07,
+ "loss": 0.3107,
+ "step": 10771
+ },
+ {
+ "epoch": 5.093144208037825,
+ "grad_norm": 3.2929723262786865,
+ "learning_rate": 2.808987044933972e-07,
+ "loss": 0.3406,
+ "step": 10772
+ },
+ {
+ "epoch": 5.093617021276596,
+ "grad_norm": 3.4711458683013916,
+ "learning_rate": 2.806114781911015e-07,
+ "loss": 0.372,
+ "step": 10773
+ },
+ {
+ "epoch": 5.0940898345153665,
+ "grad_norm": 3.105397939682007,
+ "learning_rate": 2.803243900803973e-07,
+ "loss": 0.328,
+ "step": 10774
+ },
+ {
+ "epoch": 5.094562647754137,
+ "grad_norm": 2.959874391555786,
+ "learning_rate": 2.800374401791606e-07,
+ "loss": 0.2888,
+ "step": 10775
+ },
+ {
+ "epoch": 5.095035460992908,
+ "grad_norm": 2.6287355422973633,
+ "learning_rate": 2.7975062850525785e-07,
+ "loss": 0.2786,
+ "step": 10776
+ },
+ {
+ "epoch": 5.095508274231679,
+ "grad_norm": 3.32594633102417,
+ "learning_rate": 2.794639550765477e-07,
+ "loss": 0.333,
+ "step": 10777
+ },
+ {
+ "epoch": 5.095981087470449,
+ "grad_norm": 3.1033830642700195,
+ "learning_rate": 2.7917741991088096e-07,
+ "loss": 0.3285,
+ "step": 10778
+ },
+ {
+ "epoch": 5.0964539007092196,
+ "grad_norm": 2.793583869934082,
+ "learning_rate": 2.78891023026098e-07,
+ "loss": 0.3097,
+ "step": 10779
+ },
+ {
+ "epoch": 5.09692671394799,
+ "grad_norm": 3.6989400386810303,
+ "learning_rate": 2.786047644400314e-07,
+ "loss": 0.3181,
+ "step": 10780
+ },
+ {
+ "epoch": 5.097399527186761,
+ "grad_norm": 3.064781427383423,
+ "learning_rate": 2.78318644170506e-07,
+ "loss": 0.332,
+ "step": 10781
+ },
+ {
+ "epoch": 5.097872340425532,
+ "grad_norm": 3.2844104766845703,
+ "learning_rate": 2.7803266223533633e-07,
+ "loss": 0.2825,
+ "step": 10782
+ },
+ {
+ "epoch": 5.098345153664303,
+ "grad_norm": 2.9916985034942627,
+ "learning_rate": 2.777468186523305e-07,
+ "loss": 0.3234,
+ "step": 10783
+ },
+ {
+ "epoch": 5.0988179669030735,
+ "grad_norm": 3.1486685276031494,
+ "learning_rate": 2.774611134392857e-07,
+ "loss": 0.3577,
+ "step": 10784
+ },
+ {
+ "epoch": 5.099290780141844,
+ "grad_norm": 3.140198230743408,
+ "learning_rate": 2.7717554661399135e-07,
+ "loss": 0.3689,
+ "step": 10785
+ },
+ {
+ "epoch": 5.099763593380614,
+ "grad_norm": 3.095646619796753,
+ "learning_rate": 2.768901181942296e-07,
+ "loss": 0.2729,
+ "step": 10786
+ },
+ {
+ "epoch": 5.100236406619385,
+ "grad_norm": 3.2754757404327393,
+ "learning_rate": 2.766048281977715e-07,
+ "loss": 0.3382,
+ "step": 10787
+ },
+ {
+ "epoch": 5.100709219858156,
+ "grad_norm": 3.1195523738861084,
+ "learning_rate": 2.76319676642382e-07,
+ "loss": 0.3374,
+ "step": 10788
+ },
+ {
+ "epoch": 5.101182033096927,
+ "grad_norm": 3.3726742267608643,
+ "learning_rate": 2.760346635458158e-07,
+ "loss": 0.3821,
+ "step": 10789
+ },
+ {
+ "epoch": 5.101654846335697,
+ "grad_norm": 3.679518461227417,
+ "learning_rate": 2.757497889258193e-07,
+ "loss": 0.3251,
+ "step": 10790
+ },
+ {
+ "epoch": 5.102127659574468,
+ "grad_norm": 3.3941988945007324,
+ "learning_rate": 2.7546505280013015e-07,
+ "loss": 0.339,
+ "step": 10791
+ },
+ {
+ "epoch": 5.102600472813239,
+ "grad_norm": 3.409128427505493,
+ "learning_rate": 2.7518045518647755e-07,
+ "loss": 0.3833,
+ "step": 10792
+ },
+ {
+ "epoch": 5.10307328605201,
+ "grad_norm": 3.2601206302642822,
+ "learning_rate": 2.7489599610258293e-07,
+ "loss": 0.3334,
+ "step": 10793
+ },
+ {
+ "epoch": 5.10354609929078,
+ "grad_norm": 3.1256062984466553,
+ "learning_rate": 2.746116755661579e-07,
+ "loss": 0.3033,
+ "step": 10794
+ },
+ {
+ "epoch": 5.1040189125295505,
+ "grad_norm": 3.135474443435669,
+ "learning_rate": 2.74327493594905e-07,
+ "loss": 0.3405,
+ "step": 10795
+ },
+ {
+ "epoch": 5.104491725768321,
+ "grad_norm": 3.2788431644439697,
+ "learning_rate": 2.740434502065206e-07,
+ "loss": 0.324,
+ "step": 10796
+ },
+ {
+ "epoch": 5.104964539007092,
+ "grad_norm": 2.980475902557373,
+ "learning_rate": 2.7375954541869005e-07,
+ "loss": 0.3592,
+ "step": 10797
+ },
+ {
+ "epoch": 5.105437352245863,
+ "grad_norm": 3.1811654567718506,
+ "learning_rate": 2.7347577924909023e-07,
+ "loss": 0.3358,
+ "step": 10798
+ },
+ {
+ "epoch": 5.105910165484634,
+ "grad_norm": 3.5174286365509033,
+ "learning_rate": 2.7319215171539126e-07,
+ "loss": 0.3746,
+ "step": 10799
+ },
+ {
+ "epoch": 5.1063829787234045,
+ "grad_norm": 3.340458869934082,
+ "learning_rate": 2.7290866283525203e-07,
+ "loss": 0.328,
+ "step": 10800
+ },
+ {
+ "epoch": 5.106855791962175,
+ "grad_norm": 2.9427425861358643,
+ "learning_rate": 2.7262531262632535e-07,
+ "loss": 0.3039,
+ "step": 10801
+ },
+ {
+ "epoch": 5.107328605200945,
+ "grad_norm": 3.4074652194976807,
+ "learning_rate": 2.723421011062541e-07,
+ "loss": 0.2857,
+ "step": 10802
+ },
+ {
+ "epoch": 5.107801418439716,
+ "grad_norm": 3.5216293334960938,
+ "learning_rate": 2.7205902829267136e-07,
+ "loss": 0.3802,
+ "step": 10803
+ },
+ {
+ "epoch": 5.108274231678487,
+ "grad_norm": 3.243743658065796,
+ "learning_rate": 2.717760942032041e-07,
+ "loss": 0.3502,
+ "step": 10804
+ },
+ {
+ "epoch": 5.108747044917258,
+ "grad_norm": 2.869746685028076,
+ "learning_rate": 2.7149329885546945e-07,
+ "loss": 0.2878,
+ "step": 10805
+ },
+ {
+ "epoch": 5.109219858156028,
+ "grad_norm": 3.643076181411743,
+ "learning_rate": 2.712106422670743e-07,
+ "loss": 0.334,
+ "step": 10806
+ },
+ {
+ "epoch": 5.109692671394799,
+ "grad_norm": 3.114863157272339,
+ "learning_rate": 2.7092812445562053e-07,
+ "loss": 0.3094,
+ "step": 10807
+ },
+ {
+ "epoch": 5.11016548463357,
+ "grad_norm": 3.081373691558838,
+ "learning_rate": 2.706457454386979e-07,
+ "loss": 0.3218,
+ "step": 10808
+ },
+ {
+ "epoch": 5.110638297872341,
+ "grad_norm": 3.2016193866729736,
+ "learning_rate": 2.703635052338896e-07,
+ "loss": 0.307,
+ "step": 10809
+ },
+ {
+ "epoch": 5.111111111111111,
+ "grad_norm": 3.39798641204834,
+ "learning_rate": 2.700814038587685e-07,
+ "loss": 0.3047,
+ "step": 10810
+ },
+ {
+ "epoch": 5.1115839243498815,
+ "grad_norm": 3.0165305137634277,
+ "learning_rate": 2.6979944133090076e-07,
+ "loss": 0.3074,
+ "step": 10811
+ },
+ {
+ "epoch": 5.112056737588652,
+ "grad_norm": 3.063884735107422,
+ "learning_rate": 2.6951761766784295e-07,
+ "loss": 0.3493,
+ "step": 10812
+ },
+ {
+ "epoch": 5.112529550827423,
+ "grad_norm": 3.4540178775787354,
+ "learning_rate": 2.692359328871422e-07,
+ "loss": 0.3983,
+ "step": 10813
+ },
+ {
+ "epoch": 5.113002364066194,
+ "grad_norm": 3.297858476638794,
+ "learning_rate": 2.689543870063388e-07,
+ "loss": 0.3209,
+ "step": 10814
+ },
+ {
+ "epoch": 5.113475177304965,
+ "grad_norm": 3.200965642929077,
+ "learning_rate": 2.686729800429627e-07,
+ "loss": 0.3495,
+ "step": 10815
+ },
+ {
+ "epoch": 5.1139479905437355,
+ "grad_norm": 3.497377395629883,
+ "learning_rate": 2.683917120145357e-07,
+ "loss": 0.3548,
+ "step": 10816
+ },
+ {
+ "epoch": 5.114420803782506,
+ "grad_norm": 3.0813331604003906,
+ "learning_rate": 2.681105829385719e-07,
+ "loss": 0.3473,
+ "step": 10817
+ },
+ {
+ "epoch": 5.114893617021276,
+ "grad_norm": 3.73785138130188,
+ "learning_rate": 2.6782959283257525e-07,
+ "loss": 0.3621,
+ "step": 10818
+ },
+ {
+ "epoch": 5.115366430260047,
+ "grad_norm": 2.8933136463165283,
+ "learning_rate": 2.675487417140424e-07,
+ "loss": 0.3183,
+ "step": 10819
+ },
+ {
+ "epoch": 5.115839243498818,
+ "grad_norm": 3.239682197570801,
+ "learning_rate": 2.6726802960046037e-07,
+ "loss": 0.289,
+ "step": 10820
+ },
+ {
+ "epoch": 5.116312056737589,
+ "grad_norm": 3.0757391452789307,
+ "learning_rate": 2.6698745650930754e-07,
+ "loss": 0.303,
+ "step": 10821
+ },
+ {
+ "epoch": 5.116784869976359,
+ "grad_norm": 3.6242589950561523,
+ "learning_rate": 2.667070224580548e-07,
+ "loss": 0.3479,
+ "step": 10822
+ },
+ {
+ "epoch": 5.11725768321513,
+ "grad_norm": 3.0780205726623535,
+ "learning_rate": 2.6642672746416296e-07,
+ "loss": 0.3395,
+ "step": 10823
+ },
+ {
+ "epoch": 5.117730496453901,
+ "grad_norm": 3.202171564102173,
+ "learning_rate": 2.6614657154508506e-07,
+ "loss": 0.3551,
+ "step": 10824
+ },
+ {
+ "epoch": 5.118203309692672,
+ "grad_norm": 3.312523365020752,
+ "learning_rate": 2.658665547182651e-07,
+ "loss": 0.3553,
+ "step": 10825
+ },
+ {
+ "epoch": 5.118676122931442,
+ "grad_norm": 3.0124878883361816,
+ "learning_rate": 2.6558667700113757e-07,
+ "loss": 0.3094,
+ "step": 10826
+ },
+ {
+ "epoch": 5.1191489361702125,
+ "grad_norm": 3.496133804321289,
+ "learning_rate": 2.653069384111306e-07,
+ "loss": 0.3639,
+ "step": 10827
+ },
+ {
+ "epoch": 5.119621749408983,
+ "grad_norm": 3.0873615741729736,
+ "learning_rate": 2.6502733896566157e-07,
+ "loss": 0.3423,
+ "step": 10828
+ },
+ {
+ "epoch": 5.120094562647754,
+ "grad_norm": 3.3728535175323486,
+ "learning_rate": 2.647478786821403e-07,
+ "loss": 0.3022,
+ "step": 10829
+ },
+ {
+ "epoch": 5.120567375886525,
+ "grad_norm": 2.9433000087738037,
+ "learning_rate": 2.6446855757796736e-07,
+ "loss": 0.325,
+ "step": 10830
+ },
+ {
+ "epoch": 5.121040189125296,
+ "grad_norm": 3.447678804397583,
+ "learning_rate": 2.6418937567053436e-07,
+ "loss": 0.3677,
+ "step": 10831
+ },
+ {
+ "epoch": 5.1215130023640665,
+ "grad_norm": 3.1059327125549316,
+ "learning_rate": 2.6391033297722554e-07,
+ "loss": 0.3491,
+ "step": 10832
+ },
+ {
+ "epoch": 5.121985815602837,
+ "grad_norm": 3.0744543075561523,
+ "learning_rate": 2.636314295154152e-07,
+ "loss": 0.3571,
+ "step": 10833
+ },
+ {
+ "epoch": 5.122458628841607,
+ "grad_norm": 3.4055752754211426,
+ "learning_rate": 2.633526653024698e-07,
+ "loss": 0.3139,
+ "step": 10834
+ },
+ {
+ "epoch": 5.122931442080378,
+ "grad_norm": 3.4030723571777344,
+ "learning_rate": 2.630740403557466e-07,
+ "loss": 0.3373,
+ "step": 10835
+ },
+ {
+ "epoch": 5.123404255319149,
+ "grad_norm": 3.2269482612609863,
+ "learning_rate": 2.6279555469259366e-07,
+ "loss": 0.3636,
+ "step": 10836
+ },
+ {
+ "epoch": 5.12387706855792,
+ "grad_norm": 3.069843053817749,
+ "learning_rate": 2.625172083303523e-07,
+ "loss": 0.2865,
+ "step": 10837
+ },
+ {
+ "epoch": 5.12434988179669,
+ "grad_norm": 3.2536098957061768,
+ "learning_rate": 2.622390012863532e-07,
+ "loss": 0.3573,
+ "step": 10838
+ },
+ {
+ "epoch": 5.124822695035461,
+ "grad_norm": 3.369499683380127,
+ "learning_rate": 2.619609335779194e-07,
+ "loss": 0.3227,
+ "step": 10839
+ },
+ {
+ "epoch": 5.125295508274232,
+ "grad_norm": 3.2754480838775635,
+ "learning_rate": 2.6168300522236456e-07,
+ "loss": 0.3547,
+ "step": 10840
+ },
+ {
+ "epoch": 5.125768321513003,
+ "grad_norm": 3.356863498687744,
+ "learning_rate": 2.61405216236994e-07,
+ "loss": 0.2894,
+ "step": 10841
+ },
+ {
+ "epoch": 5.126241134751773,
+ "grad_norm": 2.770984172821045,
+ "learning_rate": 2.61127566639105e-07,
+ "loss": 0.3068,
+ "step": 10842
+ },
+ {
+ "epoch": 5.1267139479905435,
+ "grad_norm": 3.1126701831817627,
+ "learning_rate": 2.608500564459851e-07,
+ "loss": 0.3644,
+ "step": 10843
+ },
+ {
+ "epoch": 5.127186761229314,
+ "grad_norm": 3.6737382411956787,
+ "learning_rate": 2.605726856749136e-07,
+ "loss": 0.3572,
+ "step": 10844
+ },
+ {
+ "epoch": 5.127659574468085,
+ "grad_norm": 3.1673147678375244,
+ "learning_rate": 2.6029545434316184e-07,
+ "loss": 0.3328,
+ "step": 10845
+ },
+ {
+ "epoch": 5.128132387706856,
+ "grad_norm": 3.1279728412628174,
+ "learning_rate": 2.6001836246799085e-07,
+ "loss": 0.3002,
+ "step": 10846
+ },
+ {
+ "epoch": 5.128605200945627,
+ "grad_norm": 3.544572591781616,
+ "learning_rate": 2.597414100666548e-07,
+ "loss": 0.332,
+ "step": 10847
+ },
+ {
+ "epoch": 5.1290780141843975,
+ "grad_norm": 3.3752872943878174,
+ "learning_rate": 2.5946459715639777e-07,
+ "loss": 0.3625,
+ "step": 10848
+ },
+ {
+ "epoch": 5.129550827423168,
+ "grad_norm": 3.279550075531006,
+ "learning_rate": 2.591879237544556e-07,
+ "loss": 0.3792,
+ "step": 10849
+ },
+ {
+ "epoch": 5.130023640661938,
+ "grad_norm": 3.277784824371338,
+ "learning_rate": 2.58911389878056e-07,
+ "loss": 0.2996,
+ "step": 10850
+ },
+ {
+ "epoch": 5.130496453900709,
+ "grad_norm": 3.387439012527466,
+ "learning_rate": 2.5863499554441703e-07,
+ "loss": 0.2971,
+ "step": 10851
+ },
+ {
+ "epoch": 5.13096926713948,
+ "grad_norm": 3.296769618988037,
+ "learning_rate": 2.5835874077074913e-07,
+ "loss": 0.2694,
+ "step": 10852
+ },
+ {
+ "epoch": 5.131442080378251,
+ "grad_norm": 3.693293333053589,
+ "learning_rate": 2.580826255742533e-07,
+ "loss": 0.4239,
+ "step": 10853
+ },
+ {
+ "epoch": 5.131914893617021,
+ "grad_norm": 3.611297845840454,
+ "learning_rate": 2.578066499721216e-07,
+ "loss": 0.3555,
+ "step": 10854
+ },
+ {
+ "epoch": 5.132387706855792,
+ "grad_norm": 3.4243459701538086,
+ "learning_rate": 2.575308139815383e-07,
+ "loss": 0.3395,
+ "step": 10855
+ },
+ {
+ "epoch": 5.132860520094563,
+ "grad_norm": 2.9317305088043213,
+ "learning_rate": 2.5725511761967775e-07,
+ "loss": 0.2877,
+ "step": 10856
+ },
+ {
+ "epoch": 5.133333333333334,
+ "grad_norm": 6.177371501922607,
+ "learning_rate": 2.5697956090370735e-07,
+ "loss": 0.3329,
+ "step": 10857
+ },
+ {
+ "epoch": 5.133806146572104,
+ "grad_norm": 3.071885108947754,
+ "learning_rate": 2.567041438507842e-07,
+ "loss": 0.3357,
+ "step": 10858
+ },
+ {
+ "epoch": 5.1342789598108745,
+ "grad_norm": 3.5805132389068604,
+ "learning_rate": 2.56428866478057e-07,
+ "loss": 0.317,
+ "step": 10859
+ },
+ {
+ "epoch": 5.134751773049645,
+ "grad_norm": 3.055856704711914,
+ "learning_rate": 2.561537288026672e-07,
+ "loss": 0.3098,
+ "step": 10860
+ },
+ {
+ "epoch": 5.135224586288416,
+ "grad_norm": 2.9992854595184326,
+ "learning_rate": 2.558787308417451e-07,
+ "loss": 0.3025,
+ "step": 10861
+ },
+ {
+ "epoch": 5.135697399527187,
+ "grad_norm": 2.9692001342773438,
+ "learning_rate": 2.5560387261241493e-07,
+ "loss": 0.2837,
+ "step": 10862
+ },
+ {
+ "epoch": 5.136170212765958,
+ "grad_norm": 3.0246338844299316,
+ "learning_rate": 2.553291541317901e-07,
+ "loss": 0.2664,
+ "step": 10863
+ },
+ {
+ "epoch": 5.136643026004728,
+ "grad_norm": 3.2685534954071045,
+ "learning_rate": 2.5505457541697587e-07,
+ "loss": 0.3348,
+ "step": 10864
+ },
+ {
+ "epoch": 5.137115839243499,
+ "grad_norm": 3.8202273845672607,
+ "learning_rate": 2.5478013648506964e-07,
+ "loss": 0.369,
+ "step": 10865
+ },
+ {
+ "epoch": 5.137588652482269,
+ "grad_norm": 3.5275821685791016,
+ "learning_rate": 2.545058373531595e-07,
+ "loss": 0.3216,
+ "step": 10866
+ },
+ {
+ "epoch": 5.13806146572104,
+ "grad_norm": 3.353627920150757,
+ "learning_rate": 2.542316780383242e-07,
+ "loss": 0.3581,
+ "step": 10867
+ },
+ {
+ "epoch": 5.138534278959811,
+ "grad_norm": 3.2958984375,
+ "learning_rate": 2.539576585576356e-07,
+ "loss": 0.3056,
+ "step": 10868
+ },
+ {
+ "epoch": 5.1390070921985815,
+ "grad_norm": 3.8927693367004395,
+ "learning_rate": 2.536837789281546e-07,
+ "loss": 0.3333,
+ "step": 10869
+ },
+ {
+ "epoch": 5.139479905437352,
+ "grad_norm": 2.967616319656372,
+ "learning_rate": 2.5341003916693503e-07,
+ "loss": 0.3337,
+ "step": 10870
+ },
+ {
+ "epoch": 5.139952718676123,
+ "grad_norm": 3.290929079055786,
+ "learning_rate": 2.53136439291021e-07,
+ "loss": 0.2962,
+ "step": 10871
+ },
+ {
+ "epoch": 5.140425531914894,
+ "grad_norm": 3.2634384632110596,
+ "learning_rate": 2.5286297931744916e-07,
+ "loss": 0.2564,
+ "step": 10872
+ },
+ {
+ "epoch": 5.140898345153665,
+ "grad_norm": 2.955380439758301,
+ "learning_rate": 2.52589659263246e-07,
+ "loss": 0.3439,
+ "step": 10873
+ },
+ {
+ "epoch": 5.141371158392435,
+ "grad_norm": 3.039886951446533,
+ "learning_rate": 2.523164791454297e-07,
+ "loss": 0.3105,
+ "step": 10874
+ },
+ {
+ "epoch": 5.141843971631205,
+ "grad_norm": 3.562945604324341,
+ "learning_rate": 2.520434389810111e-07,
+ "loss": 0.323,
+ "step": 10875
+ },
+ {
+ "epoch": 5.142316784869976,
+ "grad_norm": 3.2454209327697754,
+ "learning_rate": 2.5177053878699036e-07,
+ "loss": 0.3365,
+ "step": 10876
+ },
+ {
+ "epoch": 5.142789598108747,
+ "grad_norm": 3.3156983852386475,
+ "learning_rate": 2.5149777858035964e-07,
+ "loss": 0.4163,
+ "step": 10877
+ },
+ {
+ "epoch": 5.143262411347518,
+ "grad_norm": 3.329939126968384,
+ "learning_rate": 2.5122515837810335e-07,
+ "loss": 0.3355,
+ "step": 10878
+ },
+ {
+ "epoch": 5.143735224586289,
+ "grad_norm": 3.1439883708953857,
+ "learning_rate": 2.509526781971955e-07,
+ "loss": 0.3297,
+ "step": 10879
+ },
+ {
+ "epoch": 5.144208037825059,
+ "grad_norm": 3.3979413509368896,
+ "learning_rate": 2.5068033805460274e-07,
+ "loss": 0.3214,
+ "step": 10880
+ },
+ {
+ "epoch": 5.14468085106383,
+ "grad_norm": 3.2689614295959473,
+ "learning_rate": 2.5040813796728266e-07,
+ "loss": 0.3377,
+ "step": 10881
+ },
+ {
+ "epoch": 5.1451536643026,
+ "grad_norm": 3.1197023391723633,
+ "learning_rate": 2.5013607795218303e-07,
+ "loss": 0.3081,
+ "step": 10882
+ },
+ {
+ "epoch": 5.145626477541371,
+ "grad_norm": 2.990771770477295,
+ "learning_rate": 2.498641580262451e-07,
+ "loss": 0.3112,
+ "step": 10883
+ },
+ {
+ "epoch": 5.146099290780142,
+ "grad_norm": 3.3050241470336914,
+ "learning_rate": 2.4959237820639884e-07,
+ "loss": 0.2987,
+ "step": 10884
+ },
+ {
+ "epoch": 5.1465721040189125,
+ "grad_norm": 3.4153940677642822,
+ "learning_rate": 2.493207385095681e-07,
+ "loss": 0.3271,
+ "step": 10885
+ },
+ {
+ "epoch": 5.147044917257683,
+ "grad_norm": 3.5337910652160645,
+ "learning_rate": 2.490492389526658e-07,
+ "loss": 0.3203,
+ "step": 10886
+ },
+ {
+ "epoch": 5.147517730496454,
+ "grad_norm": 3.0076584815979004,
+ "learning_rate": 2.487778795525975e-07,
+ "loss": 0.303,
+ "step": 10887
+ },
+ {
+ "epoch": 5.147990543735225,
+ "grad_norm": 3.292414665222168,
+ "learning_rate": 2.4850666032625925e-07,
+ "loss": 0.3296,
+ "step": 10888
+ },
+ {
+ "epoch": 5.148463356973995,
+ "grad_norm": 2.9681763648986816,
+ "learning_rate": 2.482355812905388e-07,
+ "loss": 0.3118,
+ "step": 10889
+ },
+ {
+ "epoch": 5.148936170212766,
+ "grad_norm": 2.911698818206787,
+ "learning_rate": 2.4796464246231446e-07,
+ "loss": 0.3224,
+ "step": 10890
+ },
+ {
+ "epoch": 5.149408983451536,
+ "grad_norm": 3.751932144165039,
+ "learning_rate": 2.4769384385845723e-07,
+ "loss": 0.3493,
+ "step": 10891
+ },
+ {
+ "epoch": 5.149881796690307,
+ "grad_norm": 3.595027208328247,
+ "learning_rate": 2.474231854958281e-07,
+ "loss": 0.364,
+ "step": 10892
+ },
+ {
+ "epoch": 5.150354609929078,
+ "grad_norm": 3.140770196914673,
+ "learning_rate": 2.4715266739128054e-07,
+ "loss": 0.3421,
+ "step": 10893
+ },
+ {
+ "epoch": 5.150827423167849,
+ "grad_norm": 3.0348970890045166,
+ "learning_rate": 2.4688228956165764e-07,
+ "loss": 0.2917,
+ "step": 10894
+ },
+ {
+ "epoch": 5.15130023640662,
+ "grad_norm": 3.466560125350952,
+ "learning_rate": 2.466120520237947e-07,
+ "loss": 0.3694,
+ "step": 10895
+ },
+ {
+ "epoch": 5.15177304964539,
+ "grad_norm": 3.148052930831909,
+ "learning_rate": 2.4634195479451867e-07,
+ "loss": 0.3096,
+ "step": 10896
+ },
+ {
+ "epoch": 5.152245862884161,
+ "grad_norm": 3.451205253601074,
+ "learning_rate": 2.46071997890647e-07,
+ "loss": 0.3961,
+ "step": 10897
+ },
+ {
+ "epoch": 5.152718676122931,
+ "grad_norm": 3.2556569576263428,
+ "learning_rate": 2.4580218132898904e-07,
+ "loss": 0.2722,
+ "step": 10898
+ },
+ {
+ "epoch": 5.153191489361702,
+ "grad_norm": 2.98459529876709,
+ "learning_rate": 2.455325051263449e-07,
+ "loss": 0.3148,
+ "step": 10899
+ },
+ {
+ "epoch": 5.153664302600473,
+ "grad_norm": 3.6685876846313477,
+ "learning_rate": 2.4526296929950565e-07,
+ "loss": 0.3534,
+ "step": 10900
+ },
+ {
+ "epoch": 5.1541371158392435,
+ "grad_norm": 3.508319139480591,
+ "learning_rate": 2.4499357386525534e-07,
+ "loss": 0.3375,
+ "step": 10901
+ },
+ {
+ "epoch": 5.154609929078014,
+ "grad_norm": 2.869081497192383,
+ "learning_rate": 2.4472431884036693e-07,
+ "loss": 0.3068,
+ "step": 10902
+ },
+ {
+ "epoch": 5.155082742316785,
+ "grad_norm": 3.3256514072418213,
+ "learning_rate": 2.444552042416062e-07,
+ "loss": 0.3254,
+ "step": 10903
+ },
+ {
+ "epoch": 5.155555555555556,
+ "grad_norm": 3.066772699356079,
+ "learning_rate": 2.441862300857298e-07,
+ "loss": 0.2726,
+ "step": 10904
+ },
+ {
+ "epoch": 5.156028368794326,
+ "grad_norm": 3.2044811248779297,
+ "learning_rate": 2.4391739638948476e-07,
+ "loss": 0.3227,
+ "step": 10905
+ },
+ {
+ "epoch": 5.156501182033097,
+ "grad_norm": 3.2946972846984863,
+ "learning_rate": 2.4364870316961135e-07,
+ "loss": 0.3755,
+ "step": 10906
+ },
+ {
+ "epoch": 5.156973995271867,
+ "grad_norm": 3.8998544216156006,
+ "learning_rate": 2.43380150442839e-07,
+ "loss": 0.2534,
+ "step": 10907
+ },
+ {
+ "epoch": 5.157446808510638,
+ "grad_norm": 3.953619956970215,
+ "learning_rate": 2.431117382258902e-07,
+ "loss": 0.2846,
+ "step": 10908
+ },
+ {
+ "epoch": 5.157919621749409,
+ "grad_norm": 3.1574389934539795,
+ "learning_rate": 2.42843466535477e-07,
+ "loss": 0.2833,
+ "step": 10909
+ },
+ {
+ "epoch": 5.15839243498818,
+ "grad_norm": 3.2663321495056152,
+ "learning_rate": 2.425753353883037e-07,
+ "loss": 0.3256,
+ "step": 10910
+ },
+ {
+ "epoch": 5.158865248226951,
+ "grad_norm": 3.3251729011535645,
+ "learning_rate": 2.42307344801066e-07,
+ "loss": 0.3335,
+ "step": 10911
+ },
+ {
+ "epoch": 5.159338061465721,
+ "grad_norm": 2.856858253479004,
+ "learning_rate": 2.420394947904503e-07,
+ "loss": 0.3167,
+ "step": 10912
+ },
+ {
+ "epoch": 5.159810874704492,
+ "grad_norm": 3.1201767921447754,
+ "learning_rate": 2.4177178537313383e-07,
+ "loss": 0.3299,
+ "step": 10913
+ },
+ {
+ "epoch": 5.160283687943262,
+ "grad_norm": 3.229917049407959,
+ "learning_rate": 2.4150421656578686e-07,
+ "loss": 0.3436,
+ "step": 10914
+ },
+ {
+ "epoch": 5.160756501182033,
+ "grad_norm": 3.437685012817383,
+ "learning_rate": 2.412367883850686e-07,
+ "loss": 0.3346,
+ "step": 10915
+ },
+ {
+ "epoch": 5.161229314420804,
+ "grad_norm": 3.7792129516601562,
+ "learning_rate": 2.4096950084763175e-07,
+ "loss": 0.378,
+ "step": 10916
+ },
+ {
+ "epoch": 5.1617021276595745,
+ "grad_norm": 3.606621503829956,
+ "learning_rate": 2.4070235397011837e-07,
+ "loss": 0.3108,
+ "step": 10917
+ },
+ {
+ "epoch": 5.162174940898345,
+ "grad_norm": 3.6572203636169434,
+ "learning_rate": 2.404353477691626e-07,
+ "loss": 0.3648,
+ "step": 10918
+ },
+ {
+ "epoch": 5.162647754137116,
+ "grad_norm": 2.890166759490967,
+ "learning_rate": 2.4016848226139004e-07,
+ "loss": 0.3349,
+ "step": 10919
+ },
+ {
+ "epoch": 5.163120567375887,
+ "grad_norm": 3.198918342590332,
+ "learning_rate": 2.399017574634166e-07,
+ "loss": 0.3144,
+ "step": 10920
+ },
+ {
+ "epoch": 5.163593380614657,
+ "grad_norm": 3.5493834018707275,
+ "learning_rate": 2.3963517339185063e-07,
+ "loss": 0.3892,
+ "step": 10921
+ },
+ {
+ "epoch": 5.164066193853428,
+ "grad_norm": 3.3824760913848877,
+ "learning_rate": 2.393687300632913e-07,
+ "loss": 0.3246,
+ "step": 10922
+ },
+ {
+ "epoch": 5.164539007092198,
+ "grad_norm": 2.872697591781616,
+ "learning_rate": 2.3910242749432793e-07,
+ "loss": 0.2936,
+ "step": 10923
+ },
+ {
+ "epoch": 5.165011820330969,
+ "grad_norm": 3.121612548828125,
+ "learning_rate": 2.3883626570154333e-07,
+ "loss": 0.3245,
+ "step": 10924
+ },
+ {
+ "epoch": 5.16548463356974,
+ "grad_norm": 3.810166835784912,
+ "learning_rate": 2.3857024470150896e-07,
+ "loss": 0.362,
+ "step": 10925
+ },
+ {
+ "epoch": 5.165957446808511,
+ "grad_norm": 2.994041681289673,
+ "learning_rate": 2.3830436451078992e-07,
+ "loss": 0.2917,
+ "step": 10926
+ },
+ {
+ "epoch": 5.166430260047282,
+ "grad_norm": 3.181262254714966,
+ "learning_rate": 2.3803862514594073e-07,
+ "loss": 0.3083,
+ "step": 10927
+ },
+ {
+ "epoch": 5.166903073286052,
+ "grad_norm": 3.463592052459717,
+ "learning_rate": 2.377730266235076e-07,
+ "loss": 0.3156,
+ "step": 10928
+ },
+ {
+ "epoch": 5.167375886524822,
+ "grad_norm": 3.687401056289673,
+ "learning_rate": 2.3750756896002898e-07,
+ "loss": 0.3474,
+ "step": 10929
+ },
+ {
+ "epoch": 5.167848699763593,
+ "grad_norm": 3.6340384483337402,
+ "learning_rate": 2.3724225217203302e-07,
+ "loss": 0.3602,
+ "step": 10930
+ },
+ {
+ "epoch": 5.168321513002364,
+ "grad_norm": 2.7620654106140137,
+ "learning_rate": 2.369770762760404e-07,
+ "loss": 0.3001,
+ "step": 10931
+ },
+ {
+ "epoch": 5.168794326241135,
+ "grad_norm": 3.4769256114959717,
+ "learning_rate": 2.3671204128856207e-07,
+ "loss": 0.3518,
+ "step": 10932
+ },
+ {
+ "epoch": 5.1692671394799055,
+ "grad_norm": 3.533514976501465,
+ "learning_rate": 2.3644714722610097e-07,
+ "loss": 0.3631,
+ "step": 10933
+ },
+ {
+ "epoch": 5.169739952718676,
+ "grad_norm": 2.992804765701294,
+ "learning_rate": 2.3618239410515053e-07,
+ "loss": 0.3175,
+ "step": 10934
+ },
+ {
+ "epoch": 5.170212765957447,
+ "grad_norm": 2.7561144828796387,
+ "learning_rate": 2.3591778194219539e-07,
+ "loss": 0.3038,
+ "step": 10935
+ },
+ {
+ "epoch": 5.170685579196218,
+ "grad_norm": 3.2408576011657715,
+ "learning_rate": 2.356533107537126e-07,
+ "loss": 0.3015,
+ "step": 10936
+ },
+ {
+ "epoch": 5.171158392434988,
+ "grad_norm": 3.2628087997436523,
+ "learning_rate": 2.3538898055616932e-07,
+ "loss": 0.3084,
+ "step": 10937
+ },
+ {
+ "epoch": 5.171631205673759,
+ "grad_norm": 3.4984447956085205,
+ "learning_rate": 2.35124791366024e-07,
+ "loss": 0.354,
+ "step": 10938
+ },
+ {
+ "epoch": 5.172104018912529,
+ "grad_norm": 3.113236904144287,
+ "learning_rate": 2.3486074319972684e-07,
+ "loss": 0.3384,
+ "step": 10939
+ },
+ {
+ "epoch": 5.1725768321513,
+ "grad_norm": 3.231961250305176,
+ "learning_rate": 2.3459683607371914e-07,
+ "loss": 0.3153,
+ "step": 10940
+ },
+ {
+ "epoch": 5.173049645390071,
+ "grad_norm": 3.6173667907714844,
+ "learning_rate": 2.3433307000443244e-07,
+ "loss": 0.3237,
+ "step": 10941
+ },
+ {
+ "epoch": 5.173522458628842,
+ "grad_norm": 3.2845072746276855,
+ "learning_rate": 2.3406944500829116e-07,
+ "loss": 0.3672,
+ "step": 10942
+ },
+ {
+ "epoch": 5.1739952718676125,
+ "grad_norm": 3.26304030418396,
+ "learning_rate": 2.3380596110170932e-07,
+ "loss": 0.3485,
+ "step": 10943
+ },
+ {
+ "epoch": 5.174468085106383,
+ "grad_norm": 3.4171030521392822,
+ "learning_rate": 2.3354261830109388e-07,
+ "loss": 0.3617,
+ "step": 10944
+ },
+ {
+ "epoch": 5.174940898345153,
+ "grad_norm": 3.248331308364868,
+ "learning_rate": 2.3327941662284136e-07,
+ "loss": 0.3254,
+ "step": 10945
+ },
+ {
+ "epoch": 5.175413711583924,
+ "grad_norm": 2.9847238063812256,
+ "learning_rate": 2.3301635608333983e-07,
+ "loss": 0.2844,
+ "step": 10946
+ },
+ {
+ "epoch": 5.175886524822695,
+ "grad_norm": 3.0677788257598877,
+ "learning_rate": 2.3275343669897004e-07,
+ "loss": 0.3071,
+ "step": 10947
+ },
+ {
+ "epoch": 5.176359338061466,
+ "grad_norm": 3.5211243629455566,
+ "learning_rate": 2.3249065848610198e-07,
+ "loss": 0.2984,
+ "step": 10948
+ },
+ {
+ "epoch": 5.176832151300236,
+ "grad_norm": 3.17779803276062,
+ "learning_rate": 2.3222802146109753e-07,
+ "loss": 0.2971,
+ "step": 10949
+ },
+ {
+ "epoch": 5.177304964539007,
+ "grad_norm": 2.7590653896331787,
+ "learning_rate": 2.3196552564031082e-07,
+ "loss": 0.2786,
+ "step": 10950
+ },
+ {
+ "epoch": 5.177777777777778,
+ "grad_norm": 2.994401693344116,
+ "learning_rate": 2.317031710400855e-07,
+ "loss": 0.2987,
+ "step": 10951
+ },
+ {
+ "epoch": 5.178250591016549,
+ "grad_norm": 3.2990148067474365,
+ "learning_rate": 2.314409576767579e-07,
+ "loss": 0.3324,
+ "step": 10952
+ },
+ {
+ "epoch": 5.178723404255319,
+ "grad_norm": 2.8851301670074463,
+ "learning_rate": 2.3117888556665386e-07,
+ "loss": 0.3067,
+ "step": 10953
+ },
+ {
+ "epoch": 5.1791962174940895,
+ "grad_norm": 3.1797075271606445,
+ "learning_rate": 2.3091695472609255e-07,
+ "loss": 0.2849,
+ "step": 10954
+ },
+ {
+ "epoch": 5.17966903073286,
+ "grad_norm": 3.2816412448883057,
+ "learning_rate": 2.3065516517138286e-07,
+ "loss": 0.3587,
+ "step": 10955
+ },
+ {
+ "epoch": 5.180141843971631,
+ "grad_norm": 3.3883779048919678,
+ "learning_rate": 2.3039351691882511e-07,
+ "loss": 0.386,
+ "step": 10956
+ },
+ {
+ "epoch": 5.180614657210402,
+ "grad_norm": 3.026245355606079,
+ "learning_rate": 2.3013200998471124e-07,
+ "loss": 0.2924,
+ "step": 10957
+ },
+ {
+ "epoch": 5.181087470449173,
+ "grad_norm": 3.2695093154907227,
+ "learning_rate": 2.298706443853238e-07,
+ "loss": 0.3446,
+ "step": 10958
+ },
+ {
+ "epoch": 5.1815602836879435,
+ "grad_norm": 3.4874610900878906,
+ "learning_rate": 2.2960942013693727e-07,
+ "loss": 0.3506,
+ "step": 10959
+ },
+ {
+ "epoch": 5.182033096926714,
+ "grad_norm": 3.31487774848938,
+ "learning_rate": 2.2934833725581695e-07,
+ "loss": 0.2892,
+ "step": 10960
+ },
+ {
+ "epoch": 5.182505910165484,
+ "grad_norm": 3.3388876914978027,
+ "learning_rate": 2.2908739575821876e-07,
+ "loss": 0.3508,
+ "step": 10961
+ },
+ {
+ "epoch": 5.182978723404255,
+ "grad_norm": 3.380781888961792,
+ "learning_rate": 2.2882659566039133e-07,
+ "loss": 0.317,
+ "step": 10962
+ },
+ {
+ "epoch": 5.183451536643026,
+ "grad_norm": 3.557917356491089,
+ "learning_rate": 2.285659369785728e-07,
+ "loss": 0.4085,
+ "step": 10963
+ },
+ {
+ "epoch": 5.183924349881797,
+ "grad_norm": 2.9980416297912598,
+ "learning_rate": 2.28305419728993e-07,
+ "loss": 0.3078,
+ "step": 10964
+ },
+ {
+ "epoch": 5.184397163120567,
+ "grad_norm": 3.1981256008148193,
+ "learning_rate": 2.280450439278742e-07,
+ "loss": 0.3257,
+ "step": 10965
+ },
+ {
+ "epoch": 5.184869976359338,
+ "grad_norm": 3.2910351753234863,
+ "learning_rate": 2.2778480959142841e-07,
+ "loss": 0.3591,
+ "step": 10966
+ },
+ {
+ "epoch": 5.185342789598109,
+ "grad_norm": 3.043903112411499,
+ "learning_rate": 2.2752471673585878e-07,
+ "loss": 0.316,
+ "step": 10967
+ },
+ {
+ "epoch": 5.18581560283688,
+ "grad_norm": 2.654751777648926,
+ "learning_rate": 2.2726476537736076e-07,
+ "loss": 0.2766,
+ "step": 10968
+ },
+ {
+ "epoch": 5.18628841607565,
+ "grad_norm": 2.8760786056518555,
+ "learning_rate": 2.2700495553211966e-07,
+ "loss": 0.2834,
+ "step": 10969
+ },
+ {
+ "epoch": 5.1867612293144205,
+ "grad_norm": 3.352128505706787,
+ "learning_rate": 2.2674528721631368e-07,
+ "loss": 0.3811,
+ "step": 10970
+ },
+ {
+ "epoch": 5.187234042553191,
+ "grad_norm": 3.2096493244171143,
+ "learning_rate": 2.2648576044611043e-07,
+ "loss": 0.3684,
+ "step": 10971
+ },
+ {
+ "epoch": 5.187706855791962,
+ "grad_norm": 3.0004148483276367,
+ "learning_rate": 2.2622637523767005e-07,
+ "loss": 0.285,
+ "step": 10972
+ },
+ {
+ "epoch": 5.188179669030733,
+ "grad_norm": 3.2039647102355957,
+ "learning_rate": 2.2596713160714324e-07,
+ "loss": 0.3276,
+ "step": 10973
+ },
+ {
+ "epoch": 5.188652482269504,
+ "grad_norm": 3.132408857345581,
+ "learning_rate": 2.2570802957067124e-07,
+ "loss": 0.3127,
+ "step": 10974
+ },
+ {
+ "epoch": 5.1891252955082745,
+ "grad_norm": 3.1553800106048584,
+ "learning_rate": 2.2544906914438808e-07,
+ "loss": 0.3257,
+ "step": 10975
+ },
+ {
+ "epoch": 5.189598108747045,
+ "grad_norm": 2.853139877319336,
+ "learning_rate": 2.2519025034441755e-07,
+ "loss": 0.2917,
+ "step": 10976
+ },
+ {
+ "epoch": 5.190070921985815,
+ "grad_norm": 3.3710334300994873,
+ "learning_rate": 2.249315731868759e-07,
+ "loss": 0.2887,
+ "step": 10977
+ },
+ {
+ "epoch": 5.190543735224586,
+ "grad_norm": 2.9652931690216064,
+ "learning_rate": 2.246730376878692e-07,
+ "loss": 0.3032,
+ "step": 10978
+ },
+ {
+ "epoch": 5.191016548463357,
+ "grad_norm": 3.727853536605835,
+ "learning_rate": 2.2441464386349505e-07,
+ "loss": 0.3195,
+ "step": 10979
+ },
+ {
+ "epoch": 5.191489361702128,
+ "grad_norm": 3.270364999771118,
+ "learning_rate": 2.241563917298431e-07,
+ "loss": 0.3178,
+ "step": 10980
+ },
+ {
+ "epoch": 5.191962174940898,
+ "grad_norm": 3.409079074859619,
+ "learning_rate": 2.2389828130299358e-07,
+ "loss": 0.3831,
+ "step": 10981
+ },
+ {
+ "epoch": 5.192434988179669,
+ "grad_norm": 3.069596529006958,
+ "learning_rate": 2.2364031259901776e-07,
+ "loss": 0.2832,
+ "step": 10982
+ },
+ {
+ "epoch": 5.19290780141844,
+ "grad_norm": 3.050508499145508,
+ "learning_rate": 2.233824856339778e-07,
+ "loss": 0.3265,
+ "step": 10983
+ },
+ {
+ "epoch": 5.193380614657211,
+ "grad_norm": 3.175166130065918,
+ "learning_rate": 2.2312480042392755e-07,
+ "loss": 0.3027,
+ "step": 10984
+ },
+ {
+ "epoch": 5.193853427895981,
+ "grad_norm": 3.2861170768737793,
+ "learning_rate": 2.2286725698491274e-07,
+ "loss": 0.3604,
+ "step": 10985
+ },
+ {
+ "epoch": 5.1943262411347515,
+ "grad_norm": 3.084089756011963,
+ "learning_rate": 2.2260985533296893e-07,
+ "loss": 0.2893,
+ "step": 10986
+ },
+ {
+ "epoch": 5.194799054373522,
+ "grad_norm": 2.9948923587799072,
+ "learning_rate": 2.2235259548412268e-07,
+ "loss": 0.2897,
+ "step": 10987
+ },
+ {
+ "epoch": 5.195271867612293,
+ "grad_norm": 2.8823280334472656,
+ "learning_rate": 2.2209547745439375e-07,
+ "loss": 0.3006,
+ "step": 10988
+ },
+ {
+ "epoch": 5.195744680851064,
+ "grad_norm": 3.556781053543091,
+ "learning_rate": 2.2183850125979066e-07,
+ "loss": 0.3501,
+ "step": 10989
+ },
+ {
+ "epoch": 5.196217494089835,
+ "grad_norm": 3.4599523544311523,
+ "learning_rate": 2.2158166691631483e-07,
+ "loss": 0.3735,
+ "step": 10990
+ },
+ {
+ "epoch": 5.1966903073286055,
+ "grad_norm": 2.926719903945923,
+ "learning_rate": 2.2132497443995843e-07,
+ "loss": 0.3082,
+ "step": 10991
+ },
+ {
+ "epoch": 5.197163120567376,
+ "grad_norm": 3.20700740814209,
+ "learning_rate": 2.210684238467037e-07,
+ "loss": 0.2761,
+ "step": 10992
+ },
+ {
+ "epoch": 5.197635933806146,
+ "grad_norm": 2.722289800643921,
+ "learning_rate": 2.208120151525256e-07,
+ "loss": 0.2892,
+ "step": 10993
+ },
+ {
+ "epoch": 5.198108747044917,
+ "grad_norm": 2.9280753135681152,
+ "learning_rate": 2.2055574837338916e-07,
+ "loss": 0.3129,
+ "step": 10994
+ },
+ {
+ "epoch": 5.198581560283688,
+ "grad_norm": 3.241697311401367,
+ "learning_rate": 2.2029962352525137e-07,
+ "loss": 0.3287,
+ "step": 10995
+ },
+ {
+ "epoch": 5.199054373522459,
+ "grad_norm": 3.356199264526367,
+ "learning_rate": 2.2004364062405997e-07,
+ "loss": 0.2941,
+ "step": 10996
+ },
+ {
+ "epoch": 5.199527186761229,
+ "grad_norm": 3.0350935459136963,
+ "learning_rate": 2.1978779968575392e-07,
+ "loss": 0.2941,
+ "step": 10997
+ },
+ {
+ "epoch": 5.2,
+ "grad_norm": 2.901796340942383,
+ "learning_rate": 2.1953210072626297e-07,
+ "loss": 0.2897,
+ "step": 10998
+ },
+ {
+ "epoch": 5.200472813238771,
+ "grad_norm": 3.085937976837158,
+ "learning_rate": 2.1927654376150826e-07,
+ "loss": 0.2953,
+ "step": 10999
+ },
+ {
+ "epoch": 5.200945626477542,
+ "grad_norm": 3.3783273696899414,
+ "learning_rate": 2.190211288074029e-07,
+ "loss": 0.3429,
+ "step": 11000
+ },
+ {
+ "epoch": 5.201418439716312,
+ "grad_norm": 3.5308191776275635,
+ "learning_rate": 2.1876585587984999e-07,
+ "loss": 0.3742,
+ "step": 11001
+ },
+ {
+ "epoch": 5.2018912529550825,
+ "grad_norm": 3.7020623683929443,
+ "learning_rate": 2.185107249947438e-07,
+ "loss": 0.3477,
+ "step": 11002
+ },
+ {
+ "epoch": 5.202364066193853,
+ "grad_norm": 3.2107656002044678,
+ "learning_rate": 2.182557361679713e-07,
+ "loss": 0.3137,
+ "step": 11003
+ },
+ {
+ "epoch": 5.202836879432624,
+ "grad_norm": 3.5506410598754883,
+ "learning_rate": 2.180008894154087e-07,
+ "loss": 0.3594,
+ "step": 11004
+ },
+ {
+ "epoch": 5.203309692671395,
+ "grad_norm": 3.173083543777466,
+ "learning_rate": 2.177461847529247e-07,
+ "loss": 0.3096,
+ "step": 11005
+ },
+ {
+ "epoch": 5.203782505910166,
+ "grad_norm": 3.69927716255188,
+ "learning_rate": 2.1749162219637827e-07,
+ "loss": 0.3383,
+ "step": 11006
+ },
+ {
+ "epoch": 5.2042553191489365,
+ "grad_norm": 3.133638620376587,
+ "learning_rate": 2.1723720176161978e-07,
+ "loss": 0.3013,
+ "step": 11007
+ },
+ {
+ "epoch": 5.204728132387707,
+ "grad_norm": 3.6174583435058594,
+ "learning_rate": 2.1698292346449158e-07,
+ "loss": 0.3753,
+ "step": 11008
+ },
+ {
+ "epoch": 5.205200945626477,
+ "grad_norm": 3.191314458847046,
+ "learning_rate": 2.1672878732082596e-07,
+ "loss": 0.3121,
+ "step": 11009
+ },
+ {
+ "epoch": 5.205673758865248,
+ "grad_norm": 3.505059242248535,
+ "learning_rate": 2.1647479334644646e-07,
+ "loss": 0.3372,
+ "step": 11010
+ },
+ {
+ "epoch": 5.206146572104019,
+ "grad_norm": 3.329693555831909,
+ "learning_rate": 2.1622094155716894e-07,
+ "loss": 0.3221,
+ "step": 11011
+ },
+ {
+ "epoch": 5.20661938534279,
+ "grad_norm": 3.3008434772491455,
+ "learning_rate": 2.1596723196879942e-07,
+ "loss": 0.2895,
+ "step": 11012
+ },
+ {
+ "epoch": 5.20709219858156,
+ "grad_norm": 3.349616527557373,
+ "learning_rate": 2.1571366459713472e-07,
+ "loss": 0.3413,
+ "step": 11013
+ },
+ {
+ "epoch": 5.207565011820331,
+ "grad_norm": 2.940938949584961,
+ "learning_rate": 2.154602394579644e-07,
+ "loss": 0.2919,
+ "step": 11014
+ },
+ {
+ "epoch": 5.208037825059102,
+ "grad_norm": 3.792189598083496,
+ "learning_rate": 2.1520695656706725e-07,
+ "loss": 0.318,
+ "step": 11015
+ },
+ {
+ "epoch": 5.208510638297873,
+ "grad_norm": 3.4563612937927246,
+ "learning_rate": 2.149538159402148e-07,
+ "loss": 0.3264,
+ "step": 11016
+ },
+ {
+ "epoch": 5.208983451536643,
+ "grad_norm": 2.9630939960479736,
+ "learning_rate": 2.1470081759316779e-07,
+ "loss": 0.2884,
+ "step": 11017
+ },
+ {
+ "epoch": 5.2094562647754135,
+ "grad_norm": 3.3553643226623535,
+ "learning_rate": 2.1444796154168086e-07,
+ "loss": 0.2755,
+ "step": 11018
+ },
+ {
+ "epoch": 5.209929078014184,
+ "grad_norm": 3.1282877922058105,
+ "learning_rate": 2.1419524780149748e-07,
+ "loss": 0.3648,
+ "step": 11019
+ },
+ {
+ "epoch": 5.210401891252955,
+ "grad_norm": 3.1164352893829346,
+ "learning_rate": 2.139426763883523e-07,
+ "loss": 0.3363,
+ "step": 11020
+ },
+ {
+ "epoch": 5.210874704491726,
+ "grad_norm": 3.6382009983062744,
+ "learning_rate": 2.1369024731797334e-07,
+ "loss": 0.3696,
+ "step": 11021
+ },
+ {
+ "epoch": 5.211347517730497,
+ "grad_norm": 3.323014497756958,
+ "learning_rate": 2.1343796060607714e-07,
+ "loss": 0.3995,
+ "step": 11022
+ },
+ {
+ "epoch": 5.2118203309692674,
+ "grad_norm": 3.161573648452759,
+ "learning_rate": 2.1318581626837308e-07,
+ "loss": 0.304,
+ "step": 11023
+ },
+ {
+ "epoch": 5.212293144208038,
+ "grad_norm": 3.022866725921631,
+ "learning_rate": 2.1293381432056116e-07,
+ "loss": 0.3012,
+ "step": 11024
+ },
+ {
+ "epoch": 5.212765957446808,
+ "grad_norm": 3.100696563720703,
+ "learning_rate": 2.1268195477833152e-07,
+ "loss": 0.2727,
+ "step": 11025
+ },
+ {
+ "epoch": 5.213238770685579,
+ "grad_norm": 3.2253260612487793,
+ "learning_rate": 2.1243023765736725e-07,
+ "loss": 0.2991,
+ "step": 11026
+ },
+ {
+ "epoch": 5.21371158392435,
+ "grad_norm": 2.9985997676849365,
+ "learning_rate": 2.1217866297334105e-07,
+ "loss": 0.3228,
+ "step": 11027
+ },
+ {
+ "epoch": 5.2141843971631205,
+ "grad_norm": 3.197525978088379,
+ "learning_rate": 2.1192723074191817e-07,
+ "loss": 0.3843,
+ "step": 11028
+ },
+ {
+ "epoch": 5.214657210401891,
+ "grad_norm": 3.6619510650634766,
+ "learning_rate": 2.116759409787539e-07,
+ "loss": 0.3678,
+ "step": 11029
+ },
+ {
+ "epoch": 5.215130023640662,
+ "grad_norm": 3.445089340209961,
+ "learning_rate": 2.1142479369949454e-07,
+ "loss": 0.3212,
+ "step": 11030
+ },
+ {
+ "epoch": 5.215602836879433,
+ "grad_norm": 3.347994565963745,
+ "learning_rate": 2.1117378891977847e-07,
+ "loss": 0.3505,
+ "step": 11031
+ },
+ {
+ "epoch": 5.216075650118204,
+ "grad_norm": 3.0974318981170654,
+ "learning_rate": 2.10922926655234e-07,
+ "loss": 0.312,
+ "step": 11032
+ },
+ {
+ "epoch": 5.216548463356974,
+ "grad_norm": 3.206122398376465,
+ "learning_rate": 2.1067220692148143e-07,
+ "loss": 0.3245,
+ "step": 11033
+ },
+ {
+ "epoch": 5.217021276595744,
+ "grad_norm": 3.105717897415161,
+ "learning_rate": 2.1042162973413244e-07,
+ "loss": 0.3045,
+ "step": 11034
+ },
+ {
+ "epoch": 5.217494089834515,
+ "grad_norm": 3.065800905227661,
+ "learning_rate": 2.101711951087887e-07,
+ "loss": 0.3078,
+ "step": 11035
+ },
+ {
+ "epoch": 5.217966903073286,
+ "grad_norm": 3.374640464782715,
+ "learning_rate": 2.099209030610444e-07,
+ "loss": 0.3105,
+ "step": 11036
+ },
+ {
+ "epoch": 5.218439716312057,
+ "grad_norm": 3.3342158794403076,
+ "learning_rate": 2.0967075360648375e-07,
+ "loss": 0.3075,
+ "step": 11037
+ },
+ {
+ "epoch": 5.218912529550828,
+ "grad_norm": 2.6468522548675537,
+ "learning_rate": 2.0942074676068208e-07,
+ "loss": 0.3,
+ "step": 11038
+ },
+ {
+ "epoch": 5.219385342789598,
+ "grad_norm": 2.8999321460723877,
+ "learning_rate": 2.091708825392072e-07,
+ "loss": 0.325,
+ "step": 11039
+ },
+ {
+ "epoch": 5.219858156028369,
+ "grad_norm": 3.4187169075012207,
+ "learning_rate": 2.0892116095761584e-07,
+ "loss": 0.3048,
+ "step": 11040
+ },
+ {
+ "epoch": 5.220330969267139,
+ "grad_norm": 4.025125503540039,
+ "learning_rate": 2.0867158203145832e-07,
+ "loss": 0.3412,
+ "step": 11041
+ },
+ {
+ "epoch": 5.22080378250591,
+ "grad_norm": 3.3094186782836914,
+ "learning_rate": 2.0842214577627418e-07,
+ "loss": 0.3174,
+ "step": 11042
+ },
+ {
+ "epoch": 5.221276595744681,
+ "grad_norm": 3.254117488861084,
+ "learning_rate": 2.0817285220759432e-07,
+ "loss": 0.3346,
+ "step": 11043
+ },
+ {
+ "epoch": 5.2217494089834515,
+ "grad_norm": 3.184781074523926,
+ "learning_rate": 2.0792370134094186e-07,
+ "loss": 0.3554,
+ "step": 11044
+ },
+ {
+ "epoch": 5.222222222222222,
+ "grad_norm": 3.116020917892456,
+ "learning_rate": 2.0767469319183026e-07,
+ "loss": 0.338,
+ "step": 11045
+ },
+ {
+ "epoch": 5.222695035460993,
+ "grad_norm": 3.0979223251342773,
+ "learning_rate": 2.0742582777576376e-07,
+ "loss": 0.3223,
+ "step": 11046
+ },
+ {
+ "epoch": 5.223167848699764,
+ "grad_norm": 3.083552837371826,
+ "learning_rate": 2.0717710510823857e-07,
+ "loss": 0.2609,
+ "step": 11047
+ },
+ {
+ "epoch": 5.223640661938534,
+ "grad_norm": 3.228064775466919,
+ "learning_rate": 2.0692852520474066e-07,
+ "loss": 0.3362,
+ "step": 11048
+ },
+ {
+ "epoch": 5.224113475177305,
+ "grad_norm": 3.6322951316833496,
+ "learning_rate": 2.066800880807493e-07,
+ "loss": 0.3319,
+ "step": 11049
+ },
+ {
+ "epoch": 5.224586288416075,
+ "grad_norm": 3.0053210258483887,
+ "learning_rate": 2.064317937517324e-07,
+ "loss": 0.2936,
+ "step": 11050
+ },
+ {
+ "epoch": 5.225059101654846,
+ "grad_norm": 3.0131611824035645,
+ "learning_rate": 2.0618364223315118e-07,
+ "loss": 0.3449,
+ "step": 11051
+ },
+ {
+ "epoch": 5.225531914893617,
+ "grad_norm": 2.7796759605407715,
+ "learning_rate": 2.0593563354045638e-07,
+ "loss": 0.3093,
+ "step": 11052
+ },
+ {
+ "epoch": 5.226004728132388,
+ "grad_norm": 3.0809054374694824,
+ "learning_rate": 2.0568776768909032e-07,
+ "loss": 0.3143,
+ "step": 11053
+ },
+ {
+ "epoch": 5.226477541371159,
+ "grad_norm": 3.3395402431488037,
+ "learning_rate": 2.054400446944868e-07,
+ "loss": 0.3055,
+ "step": 11054
+ },
+ {
+ "epoch": 5.226950354609929,
+ "grad_norm": 3.47823166847229,
+ "learning_rate": 2.0519246457207016e-07,
+ "loss": 0.3361,
+ "step": 11055
+ },
+ {
+ "epoch": 5.2274231678487,
+ "grad_norm": 3.4772236347198486,
+ "learning_rate": 2.0494502733725663e-07,
+ "loss": 0.2907,
+ "step": 11056
+ },
+ {
+ "epoch": 5.22789598108747,
+ "grad_norm": 3.7166171073913574,
+ "learning_rate": 2.046977330054528e-07,
+ "loss": 0.3157,
+ "step": 11057
+ },
+ {
+ "epoch": 5.228368794326241,
+ "grad_norm": 3.654662609100342,
+ "learning_rate": 2.044505815920564e-07,
+ "loss": 0.3212,
+ "step": 11058
+ },
+ {
+ "epoch": 5.228841607565012,
+ "grad_norm": 3.1599886417388916,
+ "learning_rate": 2.042035731124567e-07,
+ "loss": 0.3675,
+ "step": 11059
+ },
+ {
+ "epoch": 5.2293144208037825,
+ "grad_norm": 3.1967053413391113,
+ "learning_rate": 2.039567075820337e-07,
+ "loss": 0.3512,
+ "step": 11060
+ },
+ {
+ "epoch": 5.229787234042553,
+ "grad_norm": 3.4255285263061523,
+ "learning_rate": 2.0370998501615862e-07,
+ "loss": 0.3127,
+ "step": 11061
+ },
+ {
+ "epoch": 5.230260047281324,
+ "grad_norm": 3.137645959854126,
+ "learning_rate": 2.03463405430194e-07,
+ "loss": 0.3265,
+ "step": 11062
+ },
+ {
+ "epoch": 5.230732860520095,
+ "grad_norm": 4.049947261810303,
+ "learning_rate": 2.0321696883949277e-07,
+ "loss": 0.413,
+ "step": 11063
+ },
+ {
+ "epoch": 5.231205673758865,
+ "grad_norm": 3.052638530731201,
+ "learning_rate": 2.0297067525940017e-07,
+ "loss": 0.3051,
+ "step": 11064
+ },
+ {
+ "epoch": 5.231678486997636,
+ "grad_norm": 3.339456796646118,
+ "learning_rate": 2.0272452470525144e-07,
+ "loss": 0.2821,
+ "step": 11065
+ },
+ {
+ "epoch": 5.232151300236406,
+ "grad_norm": 3.2044780254364014,
+ "learning_rate": 2.0247851719237266e-07,
+ "loss": 0.2997,
+ "step": 11066
+ },
+ {
+ "epoch": 5.232624113475177,
+ "grad_norm": 2.9565982818603516,
+ "learning_rate": 2.0223265273608295e-07,
+ "loss": 0.3197,
+ "step": 11067
+ },
+ {
+ "epoch": 5.233096926713948,
+ "grad_norm": 3.1882998943328857,
+ "learning_rate": 2.0198693135169007e-07,
+ "loss": 0.3151,
+ "step": 11068
+ },
+ {
+ "epoch": 5.233569739952719,
+ "grad_norm": 3.3345093727111816,
+ "learning_rate": 2.017413530544951e-07,
+ "loss": 0.3441,
+ "step": 11069
+ },
+ {
+ "epoch": 5.23404255319149,
+ "grad_norm": 2.9199023246765137,
+ "learning_rate": 2.0149591785978835e-07,
+ "loss": 0.2556,
+ "step": 11070
+ },
+ {
+ "epoch": 5.23451536643026,
+ "grad_norm": 3.177485466003418,
+ "learning_rate": 2.0125062578285143e-07,
+ "loss": 0.3017,
+ "step": 11071
+ },
+ {
+ "epoch": 5.234988179669031,
+ "grad_norm": 3.4713990688323975,
+ "learning_rate": 2.010054768389591e-07,
+ "loss": 0.3658,
+ "step": 11072
+ },
+ {
+ "epoch": 5.235460992907801,
+ "grad_norm": 3.084087371826172,
+ "learning_rate": 2.0076047104337443e-07,
+ "loss": 0.315,
+ "step": 11073
+ },
+ {
+ "epoch": 5.235933806146572,
+ "grad_norm": 3.1807777881622314,
+ "learning_rate": 2.0051560841135382e-07,
+ "loss": 0.3464,
+ "step": 11074
+ },
+ {
+ "epoch": 5.236406619385343,
+ "grad_norm": 2.933351993560791,
+ "learning_rate": 2.002708889581434e-07,
+ "loss": 0.2666,
+ "step": 11075
+ },
+ {
+ "epoch": 5.2368794326241135,
+ "grad_norm": 3.0530927181243896,
+ "learning_rate": 2.000263126989807e-07,
+ "loss": 0.3033,
+ "step": 11076
+ },
+ {
+ "epoch": 5.237352245862884,
+ "grad_norm": 2.93255615234375,
+ "learning_rate": 1.997818796490944e-07,
+ "loss": 0.2745,
+ "step": 11077
+ },
+ {
+ "epoch": 5.237825059101655,
+ "grad_norm": 3.159719944000244,
+ "learning_rate": 1.995375898237037e-07,
+ "loss": 0.2827,
+ "step": 11078
+ },
+ {
+ "epoch": 5.238297872340426,
+ "grad_norm": 2.9349069595336914,
+ "learning_rate": 1.9929344323802057e-07,
+ "loss": 0.2904,
+ "step": 11079
+ },
+ {
+ "epoch": 5.238770685579196,
+ "grad_norm": 3.3207859992980957,
+ "learning_rate": 1.990494399072465e-07,
+ "loss": 0.314,
+ "step": 11080
+ },
+ {
+ "epoch": 5.239243498817967,
+ "grad_norm": 3.2512917518615723,
+ "learning_rate": 1.9880557984657406e-07,
+ "loss": 0.3448,
+ "step": 11081
+ },
+ {
+ "epoch": 5.239716312056737,
+ "grad_norm": 3.1114752292633057,
+ "learning_rate": 1.985618630711883e-07,
+ "loss": 0.298,
+ "step": 11082
+ },
+ {
+ "epoch": 5.240189125295508,
+ "grad_norm": 3.120422840118408,
+ "learning_rate": 1.9831828959626348e-07,
+ "loss": 0.3168,
+ "step": 11083
+ },
+ {
+ "epoch": 5.240661938534279,
+ "grad_norm": 3.0151801109313965,
+ "learning_rate": 1.9807485943696608e-07,
+ "loss": 0.3034,
+ "step": 11084
+ },
+ {
+ "epoch": 5.24113475177305,
+ "grad_norm": 3.2933578491210938,
+ "learning_rate": 1.978315726084537e-07,
+ "loss": 0.3404,
+ "step": 11085
+ },
+ {
+ "epoch": 5.241607565011821,
+ "grad_norm": 2.931190252304077,
+ "learning_rate": 1.9758842912587423e-07,
+ "loss": 0.3329,
+ "step": 11086
+ },
+ {
+ "epoch": 5.242080378250591,
+ "grad_norm": 3.5645482540130615,
+ "learning_rate": 1.9734542900436804e-07,
+ "loss": 0.3416,
+ "step": 11087
+ },
+ {
+ "epoch": 5.242553191489361,
+ "grad_norm": 2.992359161376953,
+ "learning_rate": 1.9710257225906497e-07,
+ "loss": 0.3187,
+ "step": 11088
+ },
+ {
+ "epoch": 5.243026004728132,
+ "grad_norm": 2.9263901710510254,
+ "learning_rate": 1.9685985890508657e-07,
+ "loss": 0.2672,
+ "step": 11089
+ },
+ {
+ "epoch": 5.243498817966903,
+ "grad_norm": 3.1584653854370117,
+ "learning_rate": 1.966172889575463e-07,
+ "loss": 0.2854,
+ "step": 11090
+ },
+ {
+ "epoch": 5.243971631205674,
+ "grad_norm": 3.632632255554199,
+ "learning_rate": 1.9637486243154676e-07,
+ "loss": 0.328,
+ "step": 11091
+ },
+ {
+ "epoch": 5.2444444444444445,
+ "grad_norm": 3.4155561923980713,
+ "learning_rate": 1.9613257934218393e-07,
+ "loss": 0.321,
+ "step": 11092
+ },
+ {
+ "epoch": 5.244917257683215,
+ "grad_norm": 2.820805549621582,
+ "learning_rate": 1.958904397045433e-07,
+ "loss": 0.3102,
+ "step": 11093
+ },
+ {
+ "epoch": 5.245390070921986,
+ "grad_norm": 3.3208353519439697,
+ "learning_rate": 1.9564844353370167e-07,
+ "loss": 0.363,
+ "step": 11094
+ },
+ {
+ "epoch": 5.245862884160757,
+ "grad_norm": 3.082474708557129,
+ "learning_rate": 1.9540659084472724e-07,
+ "loss": 0.3116,
+ "step": 11095
+ },
+ {
+ "epoch": 5.246335697399527,
+ "grad_norm": 3.274022102355957,
+ "learning_rate": 1.9516488165267854e-07,
+ "loss": 0.31,
+ "step": 11096
+ },
+ {
+ "epoch": 5.246808510638298,
+ "grad_norm": 3.6933083534240723,
+ "learning_rate": 1.949233159726069e-07,
+ "loss": 0.3488,
+ "step": 11097
+ },
+ {
+ "epoch": 5.247281323877068,
+ "grad_norm": 3.25183367729187,
+ "learning_rate": 1.9468189381955272e-07,
+ "loss": 0.346,
+ "step": 11098
+ },
+ {
+ "epoch": 5.247754137115839,
+ "grad_norm": 3.847667694091797,
+ "learning_rate": 1.944406152085482e-07,
+ "loss": 0.3151,
+ "step": 11099
+ },
+ {
+ "epoch": 5.24822695035461,
+ "grad_norm": 7.3193559646606445,
+ "learning_rate": 1.9419948015461737e-07,
+ "loss": 0.3774,
+ "step": 11100
+ },
+ {
+ "epoch": 5.248699763593381,
+ "grad_norm": 2.9225029945373535,
+ "learning_rate": 1.9395848867277383e-07,
+ "loss": 0.2671,
+ "step": 11101
+ },
+ {
+ "epoch": 5.2491725768321515,
+ "grad_norm": 3.179107189178467,
+ "learning_rate": 1.9371764077802417e-07,
+ "loss": 0.3126,
+ "step": 11102
+ },
+ {
+ "epoch": 5.249645390070922,
+ "grad_norm": 3.3329577445983887,
+ "learning_rate": 1.9347693648536392e-07,
+ "loss": 0.3322,
+ "step": 11103
+ },
+ {
+ "epoch": 5.250118203309692,
+ "grad_norm": 3.384031057357788,
+ "learning_rate": 1.9323637580978076e-07,
+ "loss": 0.3786,
+ "step": 11104
+ },
+ {
+ "epoch": 5.250591016548463,
+ "grad_norm": 3.2742369174957275,
+ "learning_rate": 1.9299595876625416e-07,
+ "loss": 0.2924,
+ "step": 11105
+ },
+ {
+ "epoch": 5.251063829787234,
+ "grad_norm": 3.4118731021881104,
+ "learning_rate": 1.927556853697532e-07,
+ "loss": 0.452,
+ "step": 11106
+ },
+ {
+ "epoch": 5.251536643026005,
+ "grad_norm": 3.110896110534668,
+ "learning_rate": 1.9251555563523816e-07,
+ "loss": 0.3204,
+ "step": 11107
+ },
+ {
+ "epoch": 5.2520094562647754,
+ "grad_norm": 3.403810977935791,
+ "learning_rate": 1.922755695776621e-07,
+ "loss": 0.357,
+ "step": 11108
+ },
+ {
+ "epoch": 5.252482269503546,
+ "grad_norm": 3.437980890274048,
+ "learning_rate": 1.9203572721196716e-07,
+ "loss": 0.3091,
+ "step": 11109
+ },
+ {
+ "epoch": 5.252955082742317,
+ "grad_norm": 3.0351603031158447,
+ "learning_rate": 1.917960285530876e-07,
+ "loss": 0.3458,
+ "step": 11110
+ },
+ {
+ "epoch": 5.253427895981088,
+ "grad_norm": 3.085078477859497,
+ "learning_rate": 1.9155647361594782e-07,
+ "loss": 0.3112,
+ "step": 11111
+ },
+ {
+ "epoch": 5.253900709219858,
+ "grad_norm": 3.1089236736297607,
+ "learning_rate": 1.9131706241546395e-07,
+ "loss": 0.3101,
+ "step": 11112
+ },
+ {
+ "epoch": 5.2543735224586285,
+ "grad_norm": 3.224823236465454,
+ "learning_rate": 1.9107779496654377e-07,
+ "loss": 0.3156,
+ "step": 11113
+ },
+ {
+ "epoch": 5.254846335697399,
+ "grad_norm": 3.4039080142974854,
+ "learning_rate": 1.9083867128408456e-07,
+ "loss": 0.2953,
+ "step": 11114
+ },
+ {
+ "epoch": 5.25531914893617,
+ "grad_norm": 2.9961628913879395,
+ "learning_rate": 1.9059969138297636e-07,
+ "loss": 0.2958,
+ "step": 11115
+ },
+ {
+ "epoch": 5.255791962174941,
+ "grad_norm": 3.0796079635620117,
+ "learning_rate": 1.9036085527809889e-07,
+ "loss": 0.3311,
+ "step": 11116
+ },
+ {
+ "epoch": 5.256264775413712,
+ "grad_norm": 3.2201931476593018,
+ "learning_rate": 1.9012216298432306e-07,
+ "loss": 0.3793,
+ "step": 11117
+ },
+ {
+ "epoch": 5.2567375886524825,
+ "grad_norm": 2.967545509338379,
+ "learning_rate": 1.89883614516512e-07,
+ "loss": 0.3513,
+ "step": 11118
+ },
+ {
+ "epoch": 5.257210401891253,
+ "grad_norm": 3.1672980785369873,
+ "learning_rate": 1.8964520988951852e-07,
+ "loss": 0.3064,
+ "step": 11119
+ },
+ {
+ "epoch": 5.257683215130023,
+ "grad_norm": 3.426481246948242,
+ "learning_rate": 1.894069491181874e-07,
+ "loss": 0.3429,
+ "step": 11120
+ },
+ {
+ "epoch": 5.258156028368794,
+ "grad_norm": 2.934448480606079,
+ "learning_rate": 1.8916883221735404e-07,
+ "loss": 0.3262,
+ "step": 11121
+ },
+ {
+ "epoch": 5.258628841607565,
+ "grad_norm": 2.945082426071167,
+ "learning_rate": 1.8893085920184433e-07,
+ "loss": 0.3318,
+ "step": 11122
+ },
+ {
+ "epoch": 5.259101654846336,
+ "grad_norm": 3.222621440887451,
+ "learning_rate": 1.886930300864767e-07,
+ "loss": 0.3538,
+ "step": 11123
+ },
+ {
+ "epoch": 5.259574468085106,
+ "grad_norm": 2.8572630882263184,
+ "learning_rate": 1.8845534488605933e-07,
+ "loss": 0.2634,
+ "step": 11124
+ },
+ {
+ "epoch": 5.260047281323877,
+ "grad_norm": 3.195322036743164,
+ "learning_rate": 1.88217803615392e-07,
+ "loss": 0.3027,
+ "step": 11125
+ },
+ {
+ "epoch": 5.260520094562648,
+ "grad_norm": 3.427511692047119,
+ "learning_rate": 1.8798040628926483e-07,
+ "loss": 0.3469,
+ "step": 11126
+ },
+ {
+ "epoch": 5.260992907801419,
+ "grad_norm": 3.2642531394958496,
+ "learning_rate": 1.8774315292245965e-07,
+ "loss": 0.3377,
+ "step": 11127
+ },
+ {
+ "epoch": 5.261465721040189,
+ "grad_norm": 3.043724298477173,
+ "learning_rate": 1.8750604352974987e-07,
+ "loss": 0.303,
+ "step": 11128
+ },
+ {
+ "epoch": 5.2619385342789595,
+ "grad_norm": 3.303520441055298,
+ "learning_rate": 1.8726907812589874e-07,
+ "loss": 0.3193,
+ "step": 11129
+ },
+ {
+ "epoch": 5.26241134751773,
+ "grad_norm": 4.076786994934082,
+ "learning_rate": 1.870322567256605e-07,
+ "loss": 0.3555,
+ "step": 11130
+ },
+ {
+ "epoch": 5.262884160756501,
+ "grad_norm": 3.092057466506958,
+ "learning_rate": 1.8679557934378202e-07,
+ "loss": 0.3223,
+ "step": 11131
+ },
+ {
+ "epoch": 5.263356973995272,
+ "grad_norm": 3.3769257068634033,
+ "learning_rate": 1.8655904599499952e-07,
+ "loss": 0.3498,
+ "step": 11132
+ },
+ {
+ "epoch": 5.263829787234043,
+ "grad_norm": 3.1874232292175293,
+ "learning_rate": 1.8632265669404126e-07,
+ "loss": 0.3292,
+ "step": 11133
+ },
+ {
+ "epoch": 5.2643026004728135,
+ "grad_norm": 3.0761327743530273,
+ "learning_rate": 1.8608641145562627e-07,
+ "loss": 0.293,
+ "step": 11134
+ },
+ {
+ "epoch": 5.264775413711584,
+ "grad_norm": 3.3254477977752686,
+ "learning_rate": 1.8585031029446364e-07,
+ "loss": 0.3414,
+ "step": 11135
+ },
+ {
+ "epoch": 5.265248226950354,
+ "grad_norm": 3.382084608078003,
+ "learning_rate": 1.8561435322525552e-07,
+ "loss": 0.3533,
+ "step": 11136
+ },
+ {
+ "epoch": 5.265721040189125,
+ "grad_norm": 3.598961114883423,
+ "learning_rate": 1.8537854026269259e-07,
+ "loss": 0.3581,
+ "step": 11137
+ },
+ {
+ "epoch": 5.266193853427896,
+ "grad_norm": 3.4922592639923096,
+ "learning_rate": 1.8514287142145926e-07,
+ "loss": 0.2492,
+ "step": 11138
+ },
+ {
+ "epoch": 5.266666666666667,
+ "grad_norm": 2.9022061824798584,
+ "learning_rate": 1.8490734671622905e-07,
+ "loss": 0.2886,
+ "step": 11139
+ },
+ {
+ "epoch": 5.267139479905437,
+ "grad_norm": 2.8640410900115967,
+ "learning_rate": 1.8467196616166694e-07,
+ "loss": 0.276,
+ "step": 11140
+ },
+ {
+ "epoch": 5.267612293144208,
+ "grad_norm": 3.756704092025757,
+ "learning_rate": 1.8443672977242922e-07,
+ "loss": 0.3003,
+ "step": 11141
+ },
+ {
+ "epoch": 5.268085106382979,
+ "grad_norm": 3.145064353942871,
+ "learning_rate": 1.8420163756316222e-07,
+ "loss": 0.2707,
+ "step": 11142
+ },
+ {
+ "epoch": 5.26855791962175,
+ "grad_norm": 3.542193651199341,
+ "learning_rate": 1.8396668954850538e-07,
+ "loss": 0.369,
+ "step": 11143
+ },
+ {
+ "epoch": 5.26903073286052,
+ "grad_norm": 3.2682089805603027,
+ "learning_rate": 1.8373188574308726e-07,
+ "loss": 0.3045,
+ "step": 11144
+ },
+ {
+ "epoch": 5.2695035460992905,
+ "grad_norm": 3.3174448013305664,
+ "learning_rate": 1.8349722616152753e-07,
+ "loss": 0.313,
+ "step": 11145
+ },
+ {
+ "epoch": 5.269976359338061,
+ "grad_norm": 5.076638221740723,
+ "learning_rate": 1.8326271081843873e-07,
+ "loss": 0.3667,
+ "step": 11146
+ },
+ {
+ "epoch": 5.270449172576832,
+ "grad_norm": 3.259685516357422,
+ "learning_rate": 1.830283397284216e-07,
+ "loss": 0.3415,
+ "step": 11147
+ },
+ {
+ "epoch": 5.270921985815603,
+ "grad_norm": 3.1790366172790527,
+ "learning_rate": 1.8279411290607064e-07,
+ "loss": 0.3193,
+ "step": 11148
+ },
+ {
+ "epoch": 5.271394799054374,
+ "grad_norm": 3.091585636138916,
+ "learning_rate": 1.8256003036596942e-07,
+ "loss": 0.3042,
+ "step": 11149
+ },
+ {
+ "epoch": 5.2718676122931445,
+ "grad_norm": 3.1135261058807373,
+ "learning_rate": 1.8232609212269326e-07,
+ "loss": 0.357,
+ "step": 11150
+ },
+ {
+ "epoch": 5.272340425531915,
+ "grad_norm": 3.1233057975769043,
+ "learning_rate": 1.8209229819080908e-07,
+ "loss": 0.3221,
+ "step": 11151
+ },
+ {
+ "epoch": 5.272813238770685,
+ "grad_norm": 3.1838364601135254,
+ "learning_rate": 1.8185864858487335e-07,
+ "loss": 0.3301,
+ "step": 11152
+ },
+ {
+ "epoch": 5.273286052009456,
+ "grad_norm": 3.0430169105529785,
+ "learning_rate": 1.816251433194352e-07,
+ "loss": 0.3051,
+ "step": 11153
+ },
+ {
+ "epoch": 5.273758865248227,
+ "grad_norm": 3.4865505695343018,
+ "learning_rate": 1.813917824090336e-07,
+ "loss": 0.3492,
+ "step": 11154
+ },
+ {
+ "epoch": 5.274231678486998,
+ "grad_norm": 3.3414416313171387,
+ "learning_rate": 1.8115856586819884e-07,
+ "loss": 0.3322,
+ "step": 11155
+ },
+ {
+ "epoch": 5.274704491725768,
+ "grad_norm": 3.0825676918029785,
+ "learning_rate": 1.8092549371145212e-07,
+ "loss": 0.3356,
+ "step": 11156
+ },
+ {
+ "epoch": 5.275177304964539,
+ "grad_norm": 3.4176392555236816,
+ "learning_rate": 1.8069256595330654e-07,
+ "loss": 0.3219,
+ "step": 11157
+ },
+ {
+ "epoch": 5.27565011820331,
+ "grad_norm": 3.1088545322418213,
+ "learning_rate": 1.8045978260826491e-07,
+ "loss": 0.3039,
+ "step": 11158
+ },
+ {
+ "epoch": 5.276122931442081,
+ "grad_norm": 3.428644895553589,
+ "learning_rate": 1.802271436908215e-07,
+ "loss": 0.3175,
+ "step": 11159
+ },
+ {
+ "epoch": 5.276595744680851,
+ "grad_norm": 2.932908296585083,
+ "learning_rate": 1.7999464921546196e-07,
+ "loss": 0.3166,
+ "step": 11160
+ },
+ {
+ "epoch": 5.2770685579196215,
+ "grad_norm": 3.0005784034729004,
+ "learning_rate": 1.7976229919666272e-07,
+ "loss": 0.2563,
+ "step": 11161
+ },
+ {
+ "epoch": 5.277541371158392,
+ "grad_norm": 2.9349775314331055,
+ "learning_rate": 1.7953009364889141e-07,
+ "loss": 0.3195,
+ "step": 11162
+ },
+ {
+ "epoch": 5.278014184397163,
+ "grad_norm": 3.6350808143615723,
+ "learning_rate": 1.792980325866059e-07,
+ "loss": 0.3435,
+ "step": 11163
+ },
+ {
+ "epoch": 5.278486997635934,
+ "grad_norm": 3.0954113006591797,
+ "learning_rate": 1.79066116024256e-07,
+ "loss": 0.3151,
+ "step": 11164
+ },
+ {
+ "epoch": 5.278959810874705,
+ "grad_norm": 3.31296443939209,
+ "learning_rate": 1.7883434397628186e-07,
+ "loss": 0.3618,
+ "step": 11165
+ },
+ {
+ "epoch": 5.2794326241134755,
+ "grad_norm": 2.9701640605926514,
+ "learning_rate": 1.7860271645711552e-07,
+ "loss": 0.3066,
+ "step": 11166
+ },
+ {
+ "epoch": 5.279905437352246,
+ "grad_norm": 2.9575884342193604,
+ "learning_rate": 1.7837123348117906e-07,
+ "loss": 0.3099,
+ "step": 11167
+ },
+ {
+ "epoch": 5.280378250591016,
+ "grad_norm": 3.3828840255737305,
+ "learning_rate": 1.7813989506288542e-07,
+ "loss": 0.3156,
+ "step": 11168
+ },
+ {
+ "epoch": 5.280851063829787,
+ "grad_norm": 3.2371773719787598,
+ "learning_rate": 1.7790870121663972e-07,
+ "loss": 0.3153,
+ "step": 11169
+ },
+ {
+ "epoch": 5.281323877068558,
+ "grad_norm": 3.176521062850952,
+ "learning_rate": 1.7767765195683684e-07,
+ "loss": 0.3007,
+ "step": 11170
+ },
+ {
+ "epoch": 5.281796690307329,
+ "grad_norm": 3.5833663940429688,
+ "learning_rate": 1.7744674729786387e-07,
+ "loss": 0.3289,
+ "step": 11171
+ },
+ {
+ "epoch": 5.282269503546099,
+ "grad_norm": 3.3106162548065186,
+ "learning_rate": 1.7721598725409794e-07,
+ "loss": 0.3204,
+ "step": 11172
+ },
+ {
+ "epoch": 5.28274231678487,
+ "grad_norm": 3.2758400440216064,
+ "learning_rate": 1.7698537183990727e-07,
+ "loss": 0.3158,
+ "step": 11173
+ },
+ {
+ "epoch": 5.283215130023641,
+ "grad_norm": 3.1461472511291504,
+ "learning_rate": 1.7675490106965147e-07,
+ "loss": 0.2968,
+ "step": 11174
+ },
+ {
+ "epoch": 5.283687943262412,
+ "grad_norm": 3.1824328899383545,
+ "learning_rate": 1.765245749576805e-07,
+ "loss": 0.327,
+ "step": 11175
+ },
+ {
+ "epoch": 5.284160756501182,
+ "grad_norm": 3.493117332458496,
+ "learning_rate": 1.7629439351833644e-07,
+ "loss": 0.3457,
+ "step": 11176
+ },
+ {
+ "epoch": 5.2846335697399525,
+ "grad_norm": 2.929245948791504,
+ "learning_rate": 1.7606435676595146e-07,
+ "loss": 0.3446,
+ "step": 11177
+ },
+ {
+ "epoch": 5.285106382978723,
+ "grad_norm": 3.2307543754577637,
+ "learning_rate": 1.7583446471484827e-07,
+ "loss": 0.3238,
+ "step": 11178
+ },
+ {
+ "epoch": 5.285579196217494,
+ "grad_norm": 3.2233543395996094,
+ "learning_rate": 1.7560471737934238e-07,
+ "loss": 0.3217,
+ "step": 11179
+ },
+ {
+ "epoch": 5.286052009456265,
+ "grad_norm": 3.2125415802001953,
+ "learning_rate": 1.7537511477373843e-07,
+ "loss": 0.3346,
+ "step": 11180
+ },
+ {
+ "epoch": 5.286524822695036,
+ "grad_norm": 3.675832986831665,
+ "learning_rate": 1.7514565691233249e-07,
+ "loss": 0.3461,
+ "step": 11181
+ },
+ {
+ "epoch": 5.2869976359338064,
+ "grad_norm": 3.942467451095581,
+ "learning_rate": 1.749163438094126e-07,
+ "loss": 0.3889,
+ "step": 11182
+ },
+ {
+ "epoch": 5.287470449172577,
+ "grad_norm": 3.043288230895996,
+ "learning_rate": 1.7468717547925673e-07,
+ "loss": 0.3068,
+ "step": 11183
+ },
+ {
+ "epoch": 5.287943262411347,
+ "grad_norm": 2.9957966804504395,
+ "learning_rate": 1.7445815193613436e-07,
+ "loss": 0.2928,
+ "step": 11184
+ },
+ {
+ "epoch": 5.288416075650118,
+ "grad_norm": 5.408661365509033,
+ "learning_rate": 1.742292731943057e-07,
+ "loss": 0.3067,
+ "step": 11185
+ },
+ {
+ "epoch": 5.288888888888889,
+ "grad_norm": 2.905014753341675,
+ "learning_rate": 1.7400053926802192e-07,
+ "loss": 0.2998,
+ "step": 11186
+ },
+ {
+ "epoch": 5.2893617021276595,
+ "grad_norm": 2.9951369762420654,
+ "learning_rate": 1.7377195017152542e-07,
+ "loss": 0.2876,
+ "step": 11187
+ },
+ {
+ "epoch": 5.28983451536643,
+ "grad_norm": 3.637373208999634,
+ "learning_rate": 1.735435059190496e-07,
+ "loss": 0.3146,
+ "step": 11188
+ },
+ {
+ "epoch": 5.290307328605201,
+ "grad_norm": 3.262631893157959,
+ "learning_rate": 1.733152065248181e-07,
+ "loss": 0.3384,
+ "step": 11189
+ },
+ {
+ "epoch": 5.290780141843972,
+ "grad_norm": 3.2098207473754883,
+ "learning_rate": 1.7308705200304699e-07,
+ "loss": 0.3152,
+ "step": 11190
+ },
+ {
+ "epoch": 5.291252955082742,
+ "grad_norm": 3.331793785095215,
+ "learning_rate": 1.7285904236794105e-07,
+ "loss": 0.3168,
+ "step": 11191
+ },
+ {
+ "epoch": 5.291725768321513,
+ "grad_norm": 3.5311834812164307,
+ "learning_rate": 1.726311776336992e-07,
+ "loss": 0.3262,
+ "step": 11192
+ },
+ {
+ "epoch": 5.292198581560283,
+ "grad_norm": 3.2608871459960938,
+ "learning_rate": 1.7240345781450784e-07,
+ "loss": 0.3193,
+ "step": 11193
+ },
+ {
+ "epoch": 5.292671394799054,
+ "grad_norm": 3.322798013687134,
+ "learning_rate": 1.7217588292454762e-07,
+ "loss": 0.3669,
+ "step": 11194
+ },
+ {
+ "epoch": 5.293144208037825,
+ "grad_norm": 4.009716033935547,
+ "learning_rate": 1.7194845297798802e-07,
+ "loss": 0.3765,
+ "step": 11195
+ },
+ {
+ "epoch": 5.293617021276596,
+ "grad_norm": 3.0177745819091797,
+ "learning_rate": 1.7172116798898964e-07,
+ "loss": 0.3112,
+ "step": 11196
+ },
+ {
+ "epoch": 5.294089834515367,
+ "grad_norm": 2.870234251022339,
+ "learning_rate": 1.7149402797170532e-07,
+ "loss": 0.2938,
+ "step": 11197
+ },
+ {
+ "epoch": 5.294562647754137,
+ "grad_norm": 3.297724723815918,
+ "learning_rate": 1.7126703294027714e-07,
+ "loss": 0.3328,
+ "step": 11198
+ },
+ {
+ "epoch": 5.295035460992908,
+ "grad_norm": 3.5743629932403564,
+ "learning_rate": 1.7104018290884012e-07,
+ "loss": 0.3555,
+ "step": 11199
+ },
+ {
+ "epoch": 5.295508274231678,
+ "grad_norm": 3.3210482597351074,
+ "learning_rate": 1.7081347789151882e-07,
+ "loss": 0.2825,
+ "step": 11200
+ },
+ {
+ "epoch": 5.295981087470449,
+ "grad_norm": 3.647606611251831,
+ "learning_rate": 1.705869179024286e-07,
+ "loss": 0.3557,
+ "step": 11201
+ },
+ {
+ "epoch": 5.29645390070922,
+ "grad_norm": 3.379843235015869,
+ "learning_rate": 1.7036050295567736e-07,
+ "loss": 0.3648,
+ "step": 11202
+ },
+ {
+ "epoch": 5.2969267139479905,
+ "grad_norm": 3.3235721588134766,
+ "learning_rate": 1.7013423306536215e-07,
+ "loss": 0.3344,
+ "step": 11203
+ },
+ {
+ "epoch": 5.297399527186761,
+ "grad_norm": 3.673867702484131,
+ "learning_rate": 1.6990810824557225e-07,
+ "loss": 0.3667,
+ "step": 11204
+ },
+ {
+ "epoch": 5.297872340425532,
+ "grad_norm": 3.368685245513916,
+ "learning_rate": 1.6968212851038723e-07,
+ "loss": 0.3315,
+ "step": 11205
+ },
+ {
+ "epoch": 5.298345153664303,
+ "grad_norm": 3.5644888877868652,
+ "learning_rate": 1.6945629387387779e-07,
+ "loss": 0.3407,
+ "step": 11206
+ },
+ {
+ "epoch": 5.298817966903073,
+ "grad_norm": 3.110358715057373,
+ "learning_rate": 1.6923060435010602e-07,
+ "loss": 0.2986,
+ "step": 11207
+ },
+ {
+ "epoch": 5.299290780141844,
+ "grad_norm": 3.757765293121338,
+ "learning_rate": 1.6900505995312427e-07,
+ "loss": 0.3656,
+ "step": 11208
+ },
+ {
+ "epoch": 5.299763593380614,
+ "grad_norm": 3.415292501449585,
+ "learning_rate": 1.6877966069697605e-07,
+ "loss": 0.3133,
+ "step": 11209
+ },
+ {
+ "epoch": 5.300236406619385,
+ "grad_norm": 3.2273685932159424,
+ "learning_rate": 1.6855440659569678e-07,
+ "loss": 0.2993,
+ "step": 11210
+ },
+ {
+ "epoch": 5.300709219858156,
+ "grad_norm": 3.3077101707458496,
+ "learning_rate": 1.6832929766331107e-07,
+ "loss": 0.3175,
+ "step": 11211
+ },
+ {
+ "epoch": 5.301182033096927,
+ "grad_norm": 3.5129048824310303,
+ "learning_rate": 1.6810433391383634e-07,
+ "loss": 0.35,
+ "step": 11212
+ },
+ {
+ "epoch": 5.301654846335698,
+ "grad_norm": 3.1101224422454834,
+ "learning_rate": 1.6787951536127944e-07,
+ "loss": 0.3582,
+ "step": 11213
+ },
+ {
+ "epoch": 5.302127659574468,
+ "grad_norm": 3.20463490486145,
+ "learning_rate": 1.676548420196389e-07,
+ "loss": 0.3495,
+ "step": 11214
+ },
+ {
+ "epoch": 5.302600472813239,
+ "grad_norm": 3.0679001808166504,
+ "learning_rate": 1.6743031390290486e-07,
+ "loss": 0.2934,
+ "step": 11215
+ },
+ {
+ "epoch": 5.303073286052009,
+ "grad_norm": 3.2553749084472656,
+ "learning_rate": 1.672059310250565e-07,
+ "loss": 0.2841,
+ "step": 11216
+ },
+ {
+ "epoch": 5.30354609929078,
+ "grad_norm": 3.304185152053833,
+ "learning_rate": 1.669816934000662e-07,
+ "loss": 0.2784,
+ "step": 11217
+ },
+ {
+ "epoch": 5.304018912529551,
+ "grad_norm": 3.133167028427124,
+ "learning_rate": 1.6675760104189615e-07,
+ "loss": 0.3408,
+ "step": 11218
+ },
+ {
+ "epoch": 5.3044917257683215,
+ "grad_norm": 3.26667857170105,
+ "learning_rate": 1.6653365396449907e-07,
+ "loss": 0.3421,
+ "step": 11219
+ },
+ {
+ "epoch": 5.304964539007092,
+ "grad_norm": 3.201523542404175,
+ "learning_rate": 1.6630985218181904e-07,
+ "loss": 0.3737,
+ "step": 11220
+ },
+ {
+ "epoch": 5.305437352245863,
+ "grad_norm": 3.4249019622802734,
+ "learning_rate": 1.660861957077922e-07,
+ "loss": 0.3158,
+ "step": 11221
+ },
+ {
+ "epoch": 5.305910165484634,
+ "grad_norm": 3.1601033210754395,
+ "learning_rate": 1.658626845563438e-07,
+ "loss": 0.2986,
+ "step": 11222
+ },
+ {
+ "epoch": 5.306382978723404,
+ "grad_norm": 3.339919090270996,
+ "learning_rate": 1.6563931874139127e-07,
+ "loss": 0.2935,
+ "step": 11223
+ },
+ {
+ "epoch": 5.306855791962175,
+ "grad_norm": 3.149475574493408,
+ "learning_rate": 1.6541609827684212e-07,
+ "loss": 0.3152,
+ "step": 11224
+ },
+ {
+ "epoch": 5.307328605200945,
+ "grad_norm": 3.6351609230041504,
+ "learning_rate": 1.6519302317659607e-07,
+ "loss": 0.3369,
+ "step": 11225
+ },
+ {
+ "epoch": 5.307801418439716,
+ "grad_norm": 2.76548433303833,
+ "learning_rate": 1.6497009345454257e-07,
+ "loss": 0.2826,
+ "step": 11226
+ },
+ {
+ "epoch": 5.308274231678487,
+ "grad_norm": 3.4355554580688477,
+ "learning_rate": 1.6474730912456217e-07,
+ "loss": 0.3589,
+ "step": 11227
+ },
+ {
+ "epoch": 5.308747044917258,
+ "grad_norm": 3.1523945331573486,
+ "learning_rate": 1.6452467020052766e-07,
+ "loss": 0.3197,
+ "step": 11228
+ },
+ {
+ "epoch": 5.309219858156029,
+ "grad_norm": 2.867628574371338,
+ "learning_rate": 1.6430217669630043e-07,
+ "loss": 0.2954,
+ "step": 11229
+ },
+ {
+ "epoch": 5.309692671394799,
+ "grad_norm": 3.791811227798462,
+ "learning_rate": 1.6407982862573557e-07,
+ "loss": 0.3447,
+ "step": 11230
+ },
+ {
+ "epoch": 5.31016548463357,
+ "grad_norm": 3.406834602355957,
+ "learning_rate": 1.6385762600267697e-07,
+ "loss": 0.3429,
+ "step": 11231
+ },
+ {
+ "epoch": 5.31063829787234,
+ "grad_norm": 3.3508188724517822,
+ "learning_rate": 1.6363556884096022e-07,
+ "loss": 0.3246,
+ "step": 11232
+ },
+ {
+ "epoch": 5.311111111111111,
+ "grad_norm": 3.1348748207092285,
+ "learning_rate": 1.6341365715441205e-07,
+ "loss": 0.3299,
+ "step": 11233
+ },
+ {
+ "epoch": 5.311583924349882,
+ "grad_norm": 3.1326138973236084,
+ "learning_rate": 1.6319189095684944e-07,
+ "loss": 0.3769,
+ "step": 11234
+ },
+ {
+ "epoch": 5.3120567375886525,
+ "grad_norm": 3.0878756046295166,
+ "learning_rate": 1.6297027026208166e-07,
+ "loss": 0.3454,
+ "step": 11235
+ },
+ {
+ "epoch": 5.312529550827423,
+ "grad_norm": 3.303295612335205,
+ "learning_rate": 1.627487950839074e-07,
+ "loss": 0.3706,
+ "step": 11236
+ },
+ {
+ "epoch": 5.313002364066194,
+ "grad_norm": 3.2477850914001465,
+ "learning_rate": 1.6252746543611726e-07,
+ "loss": 0.2839,
+ "step": 11237
+ },
+ {
+ "epoch": 5.313475177304965,
+ "grad_norm": 3.166362762451172,
+ "learning_rate": 1.6230628133249244e-07,
+ "loss": 0.3596,
+ "step": 11238
+ },
+ {
+ "epoch": 5.313947990543735,
+ "grad_norm": 3.155400276184082,
+ "learning_rate": 1.6208524278680442e-07,
+ "loss": 0.2984,
+ "step": 11239
+ },
+ {
+ "epoch": 5.314420803782506,
+ "grad_norm": 3.271667718887329,
+ "learning_rate": 1.6186434981281752e-07,
+ "loss": 0.3046,
+ "step": 11240
+ },
+ {
+ "epoch": 5.314893617021276,
+ "grad_norm": 3.2715513706207275,
+ "learning_rate": 1.6164360242428513e-07,
+ "loss": 0.3416,
+ "step": 11241
+ },
+ {
+ "epoch": 5.315366430260047,
+ "grad_norm": 3.2481918334960938,
+ "learning_rate": 1.6142300063495154e-07,
+ "loss": 0.2906,
+ "step": 11242
+ },
+ {
+ "epoch": 5.315839243498818,
+ "grad_norm": 3.1383323669433594,
+ "learning_rate": 1.612025444585541e-07,
+ "loss": 0.2907,
+ "step": 11243
+ },
+ {
+ "epoch": 5.316312056737589,
+ "grad_norm": 3.3941144943237305,
+ "learning_rate": 1.6098223390881823e-07,
+ "loss": 0.2903,
+ "step": 11244
+ },
+ {
+ "epoch": 5.31678486997636,
+ "grad_norm": 3.858114004135132,
+ "learning_rate": 1.6076206899946267e-07,
+ "loss": 0.3112,
+ "step": 11245
+ },
+ {
+ "epoch": 5.31725768321513,
+ "grad_norm": 2.7895848751068115,
+ "learning_rate": 1.605420497441962e-07,
+ "loss": 0.3045,
+ "step": 11246
+ },
+ {
+ "epoch": 5.317730496453901,
+ "grad_norm": 2.904611349105835,
+ "learning_rate": 1.6032217615671753e-07,
+ "loss": 0.2947,
+ "step": 11247
+ },
+ {
+ "epoch": 5.318203309692671,
+ "grad_norm": 3.3088219165802,
+ "learning_rate": 1.6010244825071796e-07,
+ "loss": 0.3159,
+ "step": 11248
+ },
+ {
+ "epoch": 5.318676122931442,
+ "grad_norm": 3.478278160095215,
+ "learning_rate": 1.5988286603987852e-07,
+ "loss": 0.3566,
+ "step": 11249
+ },
+ {
+ "epoch": 5.319148936170213,
+ "grad_norm": 3.9452574253082275,
+ "learning_rate": 1.596634295378724e-07,
+ "loss": 0.3653,
+ "step": 11250
+ },
+ {
+ "epoch": 5.3196217494089835,
+ "grad_norm": 3.097100257873535,
+ "learning_rate": 1.5944413875836255e-07,
+ "loss": 0.3187,
+ "step": 11251
+ },
+ {
+ "epoch": 5.320094562647754,
+ "grad_norm": 3.021803379058838,
+ "learning_rate": 1.5922499371500282e-07,
+ "loss": 0.2922,
+ "step": 11252
+ },
+ {
+ "epoch": 5.320567375886525,
+ "grad_norm": 3.598921775817871,
+ "learning_rate": 1.5900599442143893e-07,
+ "loss": 0.3554,
+ "step": 11253
+ },
+ {
+ "epoch": 5.321040189125296,
+ "grad_norm": 3.154602527618408,
+ "learning_rate": 1.5878714089130692e-07,
+ "loss": 0.3184,
+ "step": 11254
+ },
+ {
+ "epoch": 5.321513002364066,
+ "grad_norm": 3.1292645931243896,
+ "learning_rate": 1.585684331382334e-07,
+ "loss": 0.311,
+ "step": 11255
+ },
+ {
+ "epoch": 5.321985815602837,
+ "grad_norm": 3.6723337173461914,
+ "learning_rate": 1.583498711758369e-07,
+ "loss": 0.3814,
+ "step": 11256
+ },
+ {
+ "epoch": 5.322458628841607,
+ "grad_norm": 2.859097480773926,
+ "learning_rate": 1.581314550177257e-07,
+ "loss": 0.247,
+ "step": 11257
+ },
+ {
+ "epoch": 5.322931442080378,
+ "grad_norm": 3.692857265472412,
+ "learning_rate": 1.5791318467750033e-07,
+ "loss": 0.338,
+ "step": 11258
+ },
+ {
+ "epoch": 5.323404255319149,
+ "grad_norm": 3.3693666458129883,
+ "learning_rate": 1.576950601687513e-07,
+ "loss": 0.305,
+ "step": 11259
+ },
+ {
+ "epoch": 5.32387706855792,
+ "grad_norm": 3.567326784133911,
+ "learning_rate": 1.574770815050597e-07,
+ "loss": 0.321,
+ "step": 11260
+ },
+ {
+ "epoch": 5.3243498817966906,
+ "grad_norm": 3.062784194946289,
+ "learning_rate": 1.5725924869999908e-07,
+ "loss": 0.2773,
+ "step": 11261
+ },
+ {
+ "epoch": 5.324822695035461,
+ "grad_norm": 3.2731950283050537,
+ "learning_rate": 1.5704156176713197e-07,
+ "loss": 0.2954,
+ "step": 11262
+ },
+ {
+ "epoch": 5.325295508274232,
+ "grad_norm": 3.111539840698242,
+ "learning_rate": 1.5682402072001363e-07,
+ "loss": 0.3003,
+ "step": 11263
+ },
+ {
+ "epoch": 5.325768321513002,
+ "grad_norm": 3.278005361557007,
+ "learning_rate": 1.5660662557218903e-07,
+ "loss": 0.349,
+ "step": 11264
+ },
+ {
+ "epoch": 5.326241134751773,
+ "grad_norm": 3.2765769958496094,
+ "learning_rate": 1.5638937633719402e-07,
+ "loss": 0.3284,
+ "step": 11265
+ },
+ {
+ "epoch": 5.326713947990544,
+ "grad_norm": 3.123499631881714,
+ "learning_rate": 1.561722730285567e-07,
+ "loss": 0.2833,
+ "step": 11266
+ },
+ {
+ "epoch": 5.3271867612293144,
+ "grad_norm": 3.308847188949585,
+ "learning_rate": 1.5595531565979428e-07,
+ "loss": 0.3134,
+ "step": 11267
+ },
+ {
+ "epoch": 5.327659574468085,
+ "grad_norm": 3.1868162155151367,
+ "learning_rate": 1.5573850424441622e-07,
+ "loss": 0.3164,
+ "step": 11268
+ },
+ {
+ "epoch": 5.328132387706856,
+ "grad_norm": 3.9105653762817383,
+ "learning_rate": 1.5552183879592203e-07,
+ "loss": 0.2944,
+ "step": 11269
+ },
+ {
+ "epoch": 5.328605200945627,
+ "grad_norm": 3.2461166381835938,
+ "learning_rate": 1.553053193278023e-07,
+ "loss": 0.2594,
+ "step": 11270
+ },
+ {
+ "epoch": 5.329078014184397,
+ "grad_norm": 3.391753673553467,
+ "learning_rate": 1.5508894585353983e-07,
+ "loss": 0.3062,
+ "step": 11271
+ },
+ {
+ "epoch": 5.3295508274231675,
+ "grad_norm": 3.1136906147003174,
+ "learning_rate": 1.5487271838660584e-07,
+ "loss": 0.3046,
+ "step": 11272
+ },
+ {
+ "epoch": 5.330023640661938,
+ "grad_norm": 3.557689905166626,
+ "learning_rate": 1.5465663694046535e-07,
+ "loss": 0.3514,
+ "step": 11273
+ },
+ {
+ "epoch": 5.330496453900709,
+ "grad_norm": 3.394179105758667,
+ "learning_rate": 1.5444070152857177e-07,
+ "loss": 0.303,
+ "step": 11274
+ },
+ {
+ "epoch": 5.33096926713948,
+ "grad_norm": 3.120568037033081,
+ "learning_rate": 1.5422491216437047e-07,
+ "loss": 0.3611,
+ "step": 11275
+ },
+ {
+ "epoch": 5.331442080378251,
+ "grad_norm": 3.091876983642578,
+ "learning_rate": 1.5400926886129847e-07,
+ "loss": 0.2758,
+ "step": 11276
+ },
+ {
+ "epoch": 5.3319148936170215,
+ "grad_norm": 3.1780812740325928,
+ "learning_rate": 1.5379377163278226e-07,
+ "loss": 0.299,
+ "step": 11277
+ },
+ {
+ "epoch": 5.332387706855792,
+ "grad_norm": 3.105471134185791,
+ "learning_rate": 1.5357842049223969e-07,
+ "loss": 0.3318,
+ "step": 11278
+ },
+ {
+ "epoch": 5.332860520094562,
+ "grad_norm": 3.2104663848876953,
+ "learning_rate": 1.5336321545308063e-07,
+ "loss": 0.312,
+ "step": 11279
+ },
+ {
+ "epoch": 5.333333333333333,
+ "grad_norm": 3.0989763736724854,
+ "learning_rate": 1.5314815652870407e-07,
+ "loss": 0.2851,
+ "step": 11280
+ },
+ {
+ "epoch": 5.333806146572104,
+ "grad_norm": 3.057668685913086,
+ "learning_rate": 1.529332437325015e-07,
+ "loss": 0.3174,
+ "step": 11281
+ },
+ {
+ "epoch": 5.334278959810875,
+ "grad_norm": 3.587592840194702,
+ "learning_rate": 1.527184770778542e-07,
+ "loss": 0.4058,
+ "step": 11282
+ },
+ {
+ "epoch": 5.334751773049645,
+ "grad_norm": 3.0807034969329834,
+ "learning_rate": 1.5250385657813478e-07,
+ "loss": 0.3145,
+ "step": 11283
+ },
+ {
+ "epoch": 5.335224586288416,
+ "grad_norm": 3.3721396923065186,
+ "learning_rate": 1.5228938224670698e-07,
+ "loss": 0.338,
+ "step": 11284
+ },
+ {
+ "epoch": 5.335697399527187,
+ "grad_norm": 3.361618995666504,
+ "learning_rate": 1.5207505409692431e-07,
+ "loss": 0.3018,
+ "step": 11285
+ },
+ {
+ "epoch": 5.336170212765958,
+ "grad_norm": 3.107353448867798,
+ "learning_rate": 1.518608721421333e-07,
+ "loss": 0.3612,
+ "step": 11286
+ },
+ {
+ "epoch": 5.336643026004728,
+ "grad_norm": 2.945077896118164,
+ "learning_rate": 1.5164683639566941e-07,
+ "loss": 0.2925,
+ "step": 11287
+ },
+ {
+ "epoch": 5.3371158392434985,
+ "grad_norm": 3.1644442081451416,
+ "learning_rate": 1.5143294687085918e-07,
+ "loss": 0.3336,
+ "step": 11288
+ },
+ {
+ "epoch": 5.337588652482269,
+ "grad_norm": 3.0085582733154297,
+ "learning_rate": 1.5121920358102172e-07,
+ "loss": 0.2928,
+ "step": 11289
+ },
+ {
+ "epoch": 5.33806146572104,
+ "grad_norm": 3.3779399394989014,
+ "learning_rate": 1.5100560653946522e-07,
+ "loss": 0.3453,
+ "step": 11290
+ },
+ {
+ "epoch": 5.338534278959811,
+ "grad_norm": 3.1938469409942627,
+ "learning_rate": 1.5079215575948963e-07,
+ "loss": 0.2911,
+ "step": 11291
+ },
+ {
+ "epoch": 5.339007092198582,
+ "grad_norm": 3.22524094581604,
+ "learning_rate": 1.505788512543857e-07,
+ "loss": 0.321,
+ "step": 11292
+ },
+ {
+ "epoch": 5.3394799054373525,
+ "grad_norm": 3.5342066287994385,
+ "learning_rate": 1.5036569303743447e-07,
+ "loss": 0.3417,
+ "step": 11293
+ },
+ {
+ "epoch": 5.339952718676123,
+ "grad_norm": 3.2065529823303223,
+ "learning_rate": 1.5015268112190894e-07,
+ "loss": 0.3537,
+ "step": 11294
+ },
+ {
+ "epoch": 5.340425531914893,
+ "grad_norm": 3.2699520587921143,
+ "learning_rate": 1.4993981552107185e-07,
+ "loss": 0.2918,
+ "step": 11295
+ },
+ {
+ "epoch": 5.340898345153664,
+ "grad_norm": 2.932130813598633,
+ "learning_rate": 1.4972709624817812e-07,
+ "loss": 0.2711,
+ "step": 11296
+ },
+ {
+ "epoch": 5.341371158392435,
+ "grad_norm": 3.151578187942505,
+ "learning_rate": 1.4951452331647244e-07,
+ "loss": 0.3254,
+ "step": 11297
+ },
+ {
+ "epoch": 5.341843971631206,
+ "grad_norm": 3.479327440261841,
+ "learning_rate": 1.4930209673919062e-07,
+ "loss": 0.3184,
+ "step": 11298
+ },
+ {
+ "epoch": 5.342316784869976,
+ "grad_norm": 3.1652722358703613,
+ "learning_rate": 1.4908981652955985e-07,
+ "loss": 0.3118,
+ "step": 11299
+ },
+ {
+ "epoch": 5.342789598108747,
+ "grad_norm": 3.445828437805176,
+ "learning_rate": 1.488776827007979e-07,
+ "loss": 0.3279,
+ "step": 11300
+ },
+ {
+ "epoch": 5.343262411347518,
+ "grad_norm": 2.9347822666168213,
+ "learning_rate": 1.4866569526611334e-07,
+ "loss": 0.2942,
+ "step": 11301
+ },
+ {
+ "epoch": 5.343735224586289,
+ "grad_norm": 3.1422226428985596,
+ "learning_rate": 1.4845385423870568e-07,
+ "loss": 0.3397,
+ "step": 11302
+ },
+ {
+ "epoch": 5.344208037825059,
+ "grad_norm": 3.2264926433563232,
+ "learning_rate": 1.4824215963176485e-07,
+ "loss": 0.3245,
+ "step": 11303
+ },
+ {
+ "epoch": 5.3446808510638295,
+ "grad_norm": 3.4117064476013184,
+ "learning_rate": 1.4803061145847286e-07,
+ "loss": 0.3367,
+ "step": 11304
+ },
+ {
+ "epoch": 5.3451536643026,
+ "grad_norm": 3.502324342727661,
+ "learning_rate": 1.4781920973200164e-07,
+ "loss": 0.3258,
+ "step": 11305
+ },
+ {
+ "epoch": 5.345626477541371,
+ "grad_norm": 3.0881006717681885,
+ "learning_rate": 1.47607954465514e-07,
+ "loss": 0.2901,
+ "step": 11306
+ },
+ {
+ "epoch": 5.346099290780142,
+ "grad_norm": 3.305974006652832,
+ "learning_rate": 1.4739684567216445e-07,
+ "loss": 0.3963,
+ "step": 11307
+ },
+ {
+ "epoch": 5.346572104018913,
+ "grad_norm": 3.3296890258789062,
+ "learning_rate": 1.4718588336509714e-07,
+ "loss": 0.2985,
+ "step": 11308
+ },
+ {
+ "epoch": 5.3470449172576835,
+ "grad_norm": 3.5175423622131348,
+ "learning_rate": 1.4697506755744827e-07,
+ "loss": 0.3539,
+ "step": 11309
+ },
+ {
+ "epoch": 5.347517730496454,
+ "grad_norm": 3.3303592205047607,
+ "learning_rate": 1.4676439826234428e-07,
+ "loss": 0.3071,
+ "step": 11310
+ },
+ {
+ "epoch": 5.347990543735224,
+ "grad_norm": 2.9972405433654785,
+ "learning_rate": 1.4655387549290245e-07,
+ "loss": 0.3161,
+ "step": 11311
+ },
+ {
+ "epoch": 5.348463356973995,
+ "grad_norm": 3.5059444904327393,
+ "learning_rate": 1.4634349926223146e-07,
+ "loss": 0.2829,
+ "step": 11312
+ },
+ {
+ "epoch": 5.348936170212766,
+ "grad_norm": 3.2246110439300537,
+ "learning_rate": 1.4613326958342973e-07,
+ "loss": 0.278,
+ "step": 11313
+ },
+ {
+ "epoch": 5.349408983451537,
+ "grad_norm": 3.0903613567352295,
+ "learning_rate": 1.459231864695884e-07,
+ "loss": 0.2877,
+ "step": 11314
+ },
+ {
+ "epoch": 5.349881796690307,
+ "grad_norm": 3.164947748184204,
+ "learning_rate": 1.457132499337882e-07,
+ "loss": 0.3533,
+ "step": 11315
+ },
+ {
+ "epoch": 5.350354609929078,
+ "grad_norm": 3.2235376834869385,
+ "learning_rate": 1.455034599891003e-07,
+ "loss": 0.3384,
+ "step": 11316
+ },
+ {
+ "epoch": 5.350827423167849,
+ "grad_norm": 3.2294938564300537,
+ "learning_rate": 1.4529381664858816e-07,
+ "loss": 0.2604,
+ "step": 11317
+ },
+ {
+ "epoch": 5.35130023640662,
+ "grad_norm": 3.0628116130828857,
+ "learning_rate": 1.4508431992530437e-07,
+ "loss": 0.3166,
+ "step": 11318
+ },
+ {
+ "epoch": 5.35177304964539,
+ "grad_norm": 2.876624584197998,
+ "learning_rate": 1.448749698322943e-07,
+ "loss": 0.3192,
+ "step": 11319
+ },
+ {
+ "epoch": 5.3522458628841605,
+ "grad_norm": 3.4103848934173584,
+ "learning_rate": 1.4466576638259317e-07,
+ "loss": 0.3254,
+ "step": 11320
+ },
+ {
+ "epoch": 5.352718676122931,
+ "grad_norm": 3.0813376903533936,
+ "learning_rate": 1.444567095892263e-07,
+ "loss": 0.3265,
+ "step": 11321
+ },
+ {
+ "epoch": 5.353191489361702,
+ "grad_norm": 4.374908447265625,
+ "learning_rate": 1.4424779946521196e-07,
+ "loss": 0.2742,
+ "step": 11322
+ },
+ {
+ "epoch": 5.353664302600473,
+ "grad_norm": 3.1611711978912354,
+ "learning_rate": 1.4403903602355745e-07,
+ "loss": 0.3475,
+ "step": 11323
+ },
+ {
+ "epoch": 5.354137115839244,
+ "grad_norm": 3.284346103668213,
+ "learning_rate": 1.4383041927726104e-07,
+ "loss": 0.3245,
+ "step": 11324
+ },
+ {
+ "epoch": 5.3546099290780145,
+ "grad_norm": 3.416294813156128,
+ "learning_rate": 1.4362194923931345e-07,
+ "loss": 0.3363,
+ "step": 11325
+ },
+ {
+ "epoch": 5.355082742316785,
+ "grad_norm": 3.465210437774658,
+ "learning_rate": 1.4341362592269454e-07,
+ "loss": 0.3639,
+ "step": 11326
+ },
+ {
+ "epoch": 5.355555555555555,
+ "grad_norm": 3.2791237831115723,
+ "learning_rate": 1.4320544934037588e-07,
+ "loss": 0.368,
+ "step": 11327
+ },
+ {
+ "epoch": 5.356028368794326,
+ "grad_norm": 3.206949234008789,
+ "learning_rate": 1.4299741950531964e-07,
+ "loss": 0.3332,
+ "step": 11328
+ },
+ {
+ "epoch": 5.356501182033097,
+ "grad_norm": 3.33746600151062,
+ "learning_rate": 1.4278953643047877e-07,
+ "loss": 0.3373,
+ "step": 11329
+ },
+ {
+ "epoch": 5.356973995271868,
+ "grad_norm": 3.0807836055755615,
+ "learning_rate": 1.4258180012879762e-07,
+ "loss": 0.3308,
+ "step": 11330
+ },
+ {
+ "epoch": 5.357446808510638,
+ "grad_norm": 3.2733700275421143,
+ "learning_rate": 1.4237421061321087e-07,
+ "loss": 0.3612,
+ "step": 11331
+ },
+ {
+ "epoch": 5.357919621749409,
+ "grad_norm": 3.3739452362060547,
+ "learning_rate": 1.4216676789664402e-07,
+ "loss": 0.3428,
+ "step": 11332
+ },
+ {
+ "epoch": 5.35839243498818,
+ "grad_norm": 3.522418975830078,
+ "learning_rate": 1.4195947199201393e-07,
+ "loss": 0.3346,
+ "step": 11333
+ },
+ {
+ "epoch": 5.358865248226951,
+ "grad_norm": 3.081836223602295,
+ "learning_rate": 1.4175232291222723e-07,
+ "loss": 0.3529,
+ "step": 11334
+ },
+ {
+ "epoch": 5.359338061465721,
+ "grad_norm": 3.188351631164551,
+ "learning_rate": 1.4154532067018333e-07,
+ "loss": 0.2999,
+ "step": 11335
+ },
+ {
+ "epoch": 5.3598108747044915,
+ "grad_norm": 3.2901625633239746,
+ "learning_rate": 1.4133846527877054e-07,
+ "loss": 0.3404,
+ "step": 11336
+ },
+ {
+ "epoch": 5.360283687943262,
+ "grad_norm": 3.5738883018493652,
+ "learning_rate": 1.411317567508694e-07,
+ "loss": 0.2881,
+ "step": 11337
+ },
+ {
+ "epoch": 5.360756501182033,
+ "grad_norm": 3.4825687408447266,
+ "learning_rate": 1.409251950993501e-07,
+ "loss": 0.3163,
+ "step": 11338
+ },
+ {
+ "epoch": 5.361229314420804,
+ "grad_norm": 4.5908918380737305,
+ "learning_rate": 1.4071878033707464e-07,
+ "loss": 0.3113,
+ "step": 11339
+ },
+ {
+ "epoch": 5.361702127659575,
+ "grad_norm": 3.299997091293335,
+ "learning_rate": 1.405125124768958e-07,
+ "loss": 0.3187,
+ "step": 11340
+ },
+ {
+ "epoch": 5.3621749408983455,
+ "grad_norm": 3.676666021347046,
+ "learning_rate": 1.403063915316566e-07,
+ "loss": 0.3299,
+ "step": 11341
+ },
+ {
+ "epoch": 5.362647754137116,
+ "grad_norm": 3.4060325622558594,
+ "learning_rate": 1.401004175141918e-07,
+ "loss": 0.2792,
+ "step": 11342
+ },
+ {
+ "epoch": 5.363120567375886,
+ "grad_norm": 2.99606990814209,
+ "learning_rate": 1.3989459043732583e-07,
+ "loss": 0.2889,
+ "step": 11343
+ },
+ {
+ "epoch": 5.363593380614657,
+ "grad_norm": 3.141608715057373,
+ "learning_rate": 1.3968891031387488e-07,
+ "loss": 0.2819,
+ "step": 11344
+ },
+ {
+ "epoch": 5.364066193853428,
+ "grad_norm": 4.1492085456848145,
+ "learning_rate": 1.3948337715664617e-07,
+ "loss": 0.3781,
+ "step": 11345
+ },
+ {
+ "epoch": 5.3645390070921986,
+ "grad_norm": 2.9972193241119385,
+ "learning_rate": 1.3927799097843698e-07,
+ "loss": 0.2949,
+ "step": 11346
+ },
+ {
+ "epoch": 5.365011820330969,
+ "grad_norm": 4.015519618988037,
+ "learning_rate": 1.3907275179203593e-07,
+ "loss": 0.3573,
+ "step": 11347
+ },
+ {
+ "epoch": 5.36548463356974,
+ "grad_norm": 3.056424617767334,
+ "learning_rate": 1.3886765961022202e-07,
+ "loss": 0.3216,
+ "step": 11348
+ },
+ {
+ "epoch": 5.365957446808511,
+ "grad_norm": 3.364178419113159,
+ "learning_rate": 1.3866271444576556e-07,
+ "loss": 0.3519,
+ "step": 11349
+ },
+ {
+ "epoch": 5.366430260047281,
+ "grad_norm": 3.101644515991211,
+ "learning_rate": 1.3845791631142802e-07,
+ "loss": 0.3582,
+ "step": 11350
+ },
+ {
+ "epoch": 5.366903073286052,
+ "grad_norm": 3.060175895690918,
+ "learning_rate": 1.3825326521996114e-07,
+ "loss": 0.3359,
+ "step": 11351
+ },
+ {
+ "epoch": 5.3673758865248224,
+ "grad_norm": 3.402256488800049,
+ "learning_rate": 1.3804876118410693e-07,
+ "loss": 0.3383,
+ "step": 11352
+ },
+ {
+ "epoch": 5.367848699763593,
+ "grad_norm": 3.2685110569000244,
+ "learning_rate": 1.3784440421659968e-07,
+ "loss": 0.2726,
+ "step": 11353
+ },
+ {
+ "epoch": 5.368321513002364,
+ "grad_norm": 3.367396593093872,
+ "learning_rate": 1.3764019433016362e-07,
+ "loss": 0.35,
+ "step": 11354
+ },
+ {
+ "epoch": 5.368794326241135,
+ "grad_norm": 3.225039005279541,
+ "learning_rate": 1.3743613153751418e-07,
+ "loss": 0.2712,
+ "step": 11355
+ },
+ {
+ "epoch": 5.369267139479906,
+ "grad_norm": 3.262666940689087,
+ "learning_rate": 1.3723221585135753e-07,
+ "loss": 0.2581,
+ "step": 11356
+ },
+ {
+ "epoch": 5.369739952718676,
+ "grad_norm": 3.117643117904663,
+ "learning_rate": 1.3702844728438968e-07,
+ "loss": 0.3097,
+ "step": 11357
+ },
+ {
+ "epoch": 5.370212765957447,
+ "grad_norm": 3.1812870502471924,
+ "learning_rate": 1.3682482584929962e-07,
+ "loss": 0.3229,
+ "step": 11358
+ },
+ {
+ "epoch": 5.370685579196217,
+ "grad_norm": 3.211583137512207,
+ "learning_rate": 1.3662135155876526e-07,
+ "loss": 0.372,
+ "step": 11359
+ },
+ {
+ "epoch": 5.371158392434988,
+ "grad_norm": 3.5000617504119873,
+ "learning_rate": 1.3641802442545648e-07,
+ "loss": 0.3151,
+ "step": 11360
+ },
+ {
+ "epoch": 5.371631205673759,
+ "grad_norm": 3.493748426437378,
+ "learning_rate": 1.3621484446203315e-07,
+ "loss": 0.2983,
+ "step": 11361
+ },
+ {
+ "epoch": 5.3721040189125295,
+ "grad_norm": 3.1911051273345947,
+ "learning_rate": 1.360118116811468e-07,
+ "loss": 0.2753,
+ "step": 11362
+ },
+ {
+ "epoch": 5.3725768321513,
+ "grad_norm": 3.3586859703063965,
+ "learning_rate": 1.3580892609543871e-07,
+ "loss": 0.2857,
+ "step": 11363
+ },
+ {
+ "epoch": 5.373049645390071,
+ "grad_norm": 3.4908719062805176,
+ "learning_rate": 1.356061877175424e-07,
+ "loss": 0.333,
+ "step": 11364
+ },
+ {
+ "epoch": 5.373522458628842,
+ "grad_norm": 3.3023927211761475,
+ "learning_rate": 1.354035965600814e-07,
+ "loss": 0.3248,
+ "step": 11365
+ },
+ {
+ "epoch": 5.373995271867612,
+ "grad_norm": 3.048675060272217,
+ "learning_rate": 1.3520115263566974e-07,
+ "loss": 0.3392,
+ "step": 11366
+ },
+ {
+ "epoch": 5.374468085106383,
+ "grad_norm": 3.3775367736816406,
+ "learning_rate": 1.3499885595691293e-07,
+ "loss": 0.332,
+ "step": 11367
+ },
+ {
+ "epoch": 5.374940898345153,
+ "grad_norm": 3.3747823238372803,
+ "learning_rate": 1.3479670653640726e-07,
+ "loss": 0.4079,
+ "step": 11368
+ },
+ {
+ "epoch": 5.375413711583924,
+ "grad_norm": 3.4345645904541016,
+ "learning_rate": 1.3459470438673932e-07,
+ "loss": 0.3924,
+ "step": 11369
+ },
+ {
+ "epoch": 5.375886524822695,
+ "grad_norm": 3.0778472423553467,
+ "learning_rate": 1.343928495204874e-07,
+ "loss": 0.2865,
+ "step": 11370
+ },
+ {
+ "epoch": 5.376359338061466,
+ "grad_norm": 3.4024085998535156,
+ "learning_rate": 1.3419114195022008e-07,
+ "loss": 0.2823,
+ "step": 11371
+ },
+ {
+ "epoch": 5.376832151300237,
+ "grad_norm": 3.141794443130493,
+ "learning_rate": 1.3398958168849619e-07,
+ "loss": 0.2678,
+ "step": 11372
+ },
+ {
+ "epoch": 5.377304964539007,
+ "grad_norm": 3.4689128398895264,
+ "learning_rate": 1.337881687478665e-07,
+ "loss": 0.3944,
+ "step": 11373
+ },
+ {
+ "epoch": 5.377777777777778,
+ "grad_norm": 2.932950973510742,
+ "learning_rate": 1.3358690314087236e-07,
+ "loss": 0.3301,
+ "step": 11374
+ },
+ {
+ "epoch": 5.378250591016548,
+ "grad_norm": 3.4097161293029785,
+ "learning_rate": 1.333857848800446e-07,
+ "loss": 0.3421,
+ "step": 11375
+ },
+ {
+ "epoch": 5.378723404255319,
+ "grad_norm": 2.732140064239502,
+ "learning_rate": 1.331848139779074e-07,
+ "loss": 0.3288,
+ "step": 11376
+ },
+ {
+ "epoch": 5.37919621749409,
+ "grad_norm": 3.342212677001953,
+ "learning_rate": 1.3298399044697318e-07,
+ "loss": 0.3651,
+ "step": 11377
+ },
+ {
+ "epoch": 5.3796690307328605,
+ "grad_norm": 3.049415111541748,
+ "learning_rate": 1.3278331429974728e-07,
+ "loss": 0.3287,
+ "step": 11378
+ },
+ {
+ "epoch": 5.380141843971631,
+ "grad_norm": 2.870662212371826,
+ "learning_rate": 1.3258278554872439e-07,
+ "loss": 0.3344,
+ "step": 11379
+ },
+ {
+ "epoch": 5.380614657210402,
+ "grad_norm": 3.2381114959716797,
+ "learning_rate": 1.3238240420639065e-07,
+ "loss": 0.3337,
+ "step": 11380
+ },
+ {
+ "epoch": 5.381087470449173,
+ "grad_norm": 3.0187833309173584,
+ "learning_rate": 1.3218217028522272e-07,
+ "loss": 0.2675,
+ "step": 11381
+ },
+ {
+ "epoch": 5.381560283687943,
+ "grad_norm": 3.0431411266326904,
+ "learning_rate": 1.3198208379768844e-07,
+ "loss": 0.2848,
+ "step": 11382
+ },
+ {
+ "epoch": 5.382033096926714,
+ "grad_norm": 3.216615676879883,
+ "learning_rate": 1.3178214475624645e-07,
+ "loss": 0.3268,
+ "step": 11383
+ },
+ {
+ "epoch": 5.382505910165484,
+ "grad_norm": 3.2666940689086914,
+ "learning_rate": 1.3158235317334622e-07,
+ "loss": 0.3412,
+ "step": 11384
+ },
+ {
+ "epoch": 5.382978723404255,
+ "grad_norm": 3.4213311672210693,
+ "learning_rate": 1.3138270906142726e-07,
+ "loss": 0.3205,
+ "step": 11385
+ },
+ {
+ "epoch": 5.383451536643026,
+ "grad_norm": 3.3023040294647217,
+ "learning_rate": 1.31183212432921e-07,
+ "loss": 0.2655,
+ "step": 11386
+ },
+ {
+ "epoch": 5.383924349881797,
+ "grad_norm": 3.4553420543670654,
+ "learning_rate": 1.3098386330024915e-07,
+ "loss": 0.3463,
+ "step": 11387
+ },
+ {
+ "epoch": 5.384397163120568,
+ "grad_norm": 3.350341558456421,
+ "learning_rate": 1.307846616758246e-07,
+ "loss": 0.3076,
+ "step": 11388
+ },
+ {
+ "epoch": 5.384869976359338,
+ "grad_norm": 3.283419132232666,
+ "learning_rate": 1.305856075720502e-07,
+ "loss": 0.3357,
+ "step": 11389
+ },
+ {
+ "epoch": 5.385342789598109,
+ "grad_norm": 3.3644227981567383,
+ "learning_rate": 1.3038670100132046e-07,
+ "loss": 0.3462,
+ "step": 11390
+ },
+ {
+ "epoch": 5.385815602836879,
+ "grad_norm": 3.1411266326904297,
+ "learning_rate": 1.3018794197602047e-07,
+ "loss": 0.3337,
+ "step": 11391
+ },
+ {
+ "epoch": 5.38628841607565,
+ "grad_norm": 3.033926486968994,
+ "learning_rate": 1.299893305085259e-07,
+ "loss": 0.2947,
+ "step": 11392
+ },
+ {
+ "epoch": 5.386761229314421,
+ "grad_norm": 3.4088027477264404,
+ "learning_rate": 1.2979086661120356e-07,
+ "loss": 0.332,
+ "step": 11393
+ },
+ {
+ "epoch": 5.3872340425531915,
+ "grad_norm": 3.315154790878296,
+ "learning_rate": 1.2959255029641126e-07,
+ "loss": 0.3148,
+ "step": 11394
+ },
+ {
+ "epoch": 5.387706855791962,
+ "grad_norm": 3.702969789505005,
+ "learning_rate": 1.293943815764967e-07,
+ "loss": 0.3644,
+ "step": 11395
+ },
+ {
+ "epoch": 5.388179669030733,
+ "grad_norm": 2.868751049041748,
+ "learning_rate": 1.291963604637994e-07,
+ "loss": 0.3484,
+ "step": 11396
+ },
+ {
+ "epoch": 5.388652482269504,
+ "grad_norm": 3.31673264503479,
+ "learning_rate": 1.2899848697064898e-07,
+ "loss": 0.3058,
+ "step": 11397
+ },
+ {
+ "epoch": 5.389125295508274,
+ "grad_norm": 3.311613082885742,
+ "learning_rate": 1.2880076110936585e-07,
+ "loss": 0.3186,
+ "step": 11398
+ },
+ {
+ "epoch": 5.389598108747045,
+ "grad_norm": 3.3614962100982666,
+ "learning_rate": 1.286031828922621e-07,
+ "loss": 0.3101,
+ "step": 11399
+ },
+ {
+ "epoch": 5.390070921985815,
+ "grad_norm": 3.302133321762085,
+ "learning_rate": 1.2840575233163983e-07,
+ "loss": 0.3511,
+ "step": 11400
+ },
+ {
+ "epoch": 5.390543735224586,
+ "grad_norm": 3.287588357925415,
+ "learning_rate": 1.2820846943979254e-07,
+ "loss": 0.3495,
+ "step": 11401
+ },
+ {
+ "epoch": 5.391016548463357,
+ "grad_norm": 3.3546504974365234,
+ "learning_rate": 1.2801133422900374e-07,
+ "loss": 0.3508,
+ "step": 11402
+ },
+ {
+ "epoch": 5.391489361702128,
+ "grad_norm": 3.441850423812866,
+ "learning_rate": 1.278143467115481e-07,
+ "loss": 0.3296,
+ "step": 11403
+ },
+ {
+ "epoch": 5.391962174940899,
+ "grad_norm": 3.529294490814209,
+ "learning_rate": 1.2761750689969154e-07,
+ "loss": 0.3393,
+ "step": 11404
+ },
+ {
+ "epoch": 5.392434988179669,
+ "grad_norm": 3.0237176418304443,
+ "learning_rate": 1.274208148056902e-07,
+ "loss": 0.344,
+ "step": 11405
+ },
+ {
+ "epoch": 5.39290780141844,
+ "grad_norm": 3.643930435180664,
+ "learning_rate": 1.2722427044179142e-07,
+ "loss": 0.3439,
+ "step": 11406
+ },
+ {
+ "epoch": 5.39338061465721,
+ "grad_norm": 3.300459623336792,
+ "learning_rate": 1.2702787382023296e-07,
+ "loss": 0.3344,
+ "step": 11407
+ },
+ {
+ "epoch": 5.393853427895981,
+ "grad_norm": 3.3594553470611572,
+ "learning_rate": 1.2683162495324335e-07,
+ "loss": 0.3267,
+ "step": 11408
+ },
+ {
+ "epoch": 5.394326241134752,
+ "grad_norm": 3.8660781383514404,
+ "learning_rate": 1.2663552385304284e-07,
+ "loss": 0.3595,
+ "step": 11409
+ },
+ {
+ "epoch": 5.3947990543735225,
+ "grad_norm": 3.7904021739959717,
+ "learning_rate": 1.2643957053184107e-07,
+ "loss": 0.3893,
+ "step": 11410
+ },
+ {
+ "epoch": 5.395271867612293,
+ "grad_norm": 2.8576457500457764,
+ "learning_rate": 1.2624376500183973e-07,
+ "loss": 0.2404,
+ "step": 11411
+ },
+ {
+ "epoch": 5.395744680851064,
+ "grad_norm": 4.3170013427734375,
+ "learning_rate": 1.2604810727523066e-07,
+ "loss": 0.3223,
+ "step": 11412
+ },
+ {
+ "epoch": 5.396217494089835,
+ "grad_norm": 3.2365522384643555,
+ "learning_rate": 1.258525973641958e-07,
+ "loss": 0.2327,
+ "step": 11413
+ },
+ {
+ "epoch": 5.396690307328605,
+ "grad_norm": 3.4322235584259033,
+ "learning_rate": 1.2565723528091017e-07,
+ "loss": 0.3513,
+ "step": 11414
+ },
+ {
+ "epoch": 5.397163120567376,
+ "grad_norm": 3.695038080215454,
+ "learning_rate": 1.2546202103753652e-07,
+ "loss": 0.3993,
+ "step": 11415
+ },
+ {
+ "epoch": 5.397635933806146,
+ "grad_norm": 3.412743091583252,
+ "learning_rate": 1.252669546462315e-07,
+ "loss": 0.3392,
+ "step": 11416
+ },
+ {
+ "epoch": 5.398108747044917,
+ "grad_norm": 3.074951171875,
+ "learning_rate": 1.2507203611914016e-07,
+ "loss": 0.3231,
+ "step": 11417
+ },
+ {
+ "epoch": 5.398581560283688,
+ "grad_norm": 3.4410054683685303,
+ "learning_rate": 1.2487726546839884e-07,
+ "loss": 0.3044,
+ "step": 11418
+ },
+ {
+ "epoch": 5.399054373522459,
+ "grad_norm": 3.304527521133423,
+ "learning_rate": 1.2468264270613622e-07,
+ "loss": 0.3058,
+ "step": 11419
+ },
+ {
+ "epoch": 5.39952718676123,
+ "grad_norm": 3.286989212036133,
+ "learning_rate": 1.2448816784446982e-07,
+ "loss": 0.3508,
+ "step": 11420
+ },
+ {
+ "epoch": 5.4,
+ "grad_norm": 3.223222494125366,
+ "learning_rate": 1.2429384089550884e-07,
+ "loss": 0.3327,
+ "step": 11421
+ },
+ {
+ "epoch": 5.400472813238771,
+ "grad_norm": 3.333634853363037,
+ "learning_rate": 1.240996618713533e-07,
+ "loss": 0.3509,
+ "step": 11422
+ },
+ {
+ "epoch": 5.400945626477541,
+ "grad_norm": 2.9547529220581055,
+ "learning_rate": 1.2390563078409356e-07,
+ "loss": 0.27,
+ "step": 11423
+ },
+ {
+ "epoch": 5.401418439716312,
+ "grad_norm": 3.4548802375793457,
+ "learning_rate": 1.2371174764581161e-07,
+ "loss": 0.3565,
+ "step": 11424
+ },
+ {
+ "epoch": 5.401891252955083,
+ "grad_norm": 3.143460273742676,
+ "learning_rate": 1.2351801246857947e-07,
+ "loss": 0.2743,
+ "step": 11425
+ },
+ {
+ "epoch": 5.4023640661938535,
+ "grad_norm": 3.1186251640319824,
+ "learning_rate": 1.2332442526445997e-07,
+ "loss": 0.3026,
+ "step": 11426
+ },
+ {
+ "epoch": 5.402836879432624,
+ "grad_norm": 3.14628529548645,
+ "learning_rate": 1.231309860455071e-07,
+ "loss": 0.3026,
+ "step": 11427
+ },
+ {
+ "epoch": 5.403309692671395,
+ "grad_norm": 3.3630118370056152,
+ "learning_rate": 1.2293769482376565e-07,
+ "loss": 0.3164,
+ "step": 11428
+ },
+ {
+ "epoch": 5.403782505910166,
+ "grad_norm": 2.9600207805633545,
+ "learning_rate": 1.2274455161127074e-07,
+ "loss": 0.2775,
+ "step": 11429
+ },
+ {
+ "epoch": 5.404255319148936,
+ "grad_norm": 3.3323616981506348,
+ "learning_rate": 1.2255155642004885e-07,
+ "loss": 0.2931,
+ "step": 11430
+ },
+ {
+ "epoch": 5.4047281323877066,
+ "grad_norm": 3.339144468307495,
+ "learning_rate": 1.223587092621162e-07,
+ "loss": 0.3357,
+ "step": 11431
+ },
+ {
+ "epoch": 5.405200945626477,
+ "grad_norm": 3.590852737426758,
+ "learning_rate": 1.2216601014948148e-07,
+ "loss": 0.296,
+ "step": 11432
+ },
+ {
+ "epoch": 5.405673758865248,
+ "grad_norm": 3.0115602016448975,
+ "learning_rate": 1.2197345909414237e-07,
+ "loss": 0.2851,
+ "step": 11433
+ },
+ {
+ "epoch": 5.406146572104019,
+ "grad_norm": 3.1093897819519043,
+ "learning_rate": 1.2178105610808928e-07,
+ "loss": 0.3334,
+ "step": 11434
+ },
+ {
+ "epoch": 5.40661938534279,
+ "grad_norm": 3.2710671424865723,
+ "learning_rate": 1.2158880120330147e-07,
+ "loss": 0.3456,
+ "step": 11435
+ },
+ {
+ "epoch": 5.4070921985815605,
+ "grad_norm": 3.035571336746216,
+ "learning_rate": 1.2139669439174968e-07,
+ "loss": 0.2914,
+ "step": 11436
+ },
+ {
+ "epoch": 5.407565011820331,
+ "grad_norm": 2.946021795272827,
+ "learning_rate": 1.2120473568539598e-07,
+ "loss": 0.274,
+ "step": 11437
+ },
+ {
+ "epoch": 5.408037825059101,
+ "grad_norm": 3.0848777294158936,
+ "learning_rate": 1.210129250961925e-07,
+ "loss": 0.2875,
+ "step": 11438
+ },
+ {
+ "epoch": 5.408510638297872,
+ "grad_norm": 3.231348752975464,
+ "learning_rate": 1.20821262636083e-07,
+ "loss": 0.3107,
+ "step": 11439
+ },
+ {
+ "epoch": 5.408983451536643,
+ "grad_norm": 3.4804441928863525,
+ "learning_rate": 1.2062974831700131e-07,
+ "loss": 0.34,
+ "step": 11440
+ },
+ {
+ "epoch": 5.409456264775414,
+ "grad_norm": 3.396928310394287,
+ "learning_rate": 1.2043838215087144e-07,
+ "loss": 0.3577,
+ "step": 11441
+ },
+ {
+ "epoch": 5.409929078014184,
+ "grad_norm": 3.4840028285980225,
+ "learning_rate": 1.2024716414960975e-07,
+ "loss": 0.296,
+ "step": 11442
+ },
+ {
+ "epoch": 5.410401891252955,
+ "grad_norm": 3.8356549739837646,
+ "learning_rate": 1.200560943251222e-07,
+ "loss": 0.3444,
+ "step": 11443
+ },
+ {
+ "epoch": 5.410874704491726,
+ "grad_norm": 3.8281893730163574,
+ "learning_rate": 1.1986517268930603e-07,
+ "loss": 0.3962,
+ "step": 11444
+ },
+ {
+ "epoch": 5.411347517730497,
+ "grad_norm": 3.0665910243988037,
+ "learning_rate": 1.1967439925404888e-07,
+ "loss": 0.3336,
+ "step": 11445
+ },
+ {
+ "epoch": 5.411820330969267,
+ "grad_norm": 3.4650213718414307,
+ "learning_rate": 1.1948377403122906e-07,
+ "loss": 0.3278,
+ "step": 11446
+ },
+ {
+ "epoch": 5.4122931442080375,
+ "grad_norm": 3.654869794845581,
+ "learning_rate": 1.1929329703271707e-07,
+ "loss": 0.3093,
+ "step": 11447
+ },
+ {
+ "epoch": 5.412765957446808,
+ "grad_norm": 3.377669095993042,
+ "learning_rate": 1.1910296827037204e-07,
+ "loss": 0.3239,
+ "step": 11448
+ },
+ {
+ "epoch": 5.413238770685579,
+ "grad_norm": 3.475485324859619,
+ "learning_rate": 1.1891278775604503e-07,
+ "loss": 0.3307,
+ "step": 11449
+ },
+ {
+ "epoch": 5.41371158392435,
+ "grad_norm": 2.9799742698669434,
+ "learning_rate": 1.1872275550157824e-07,
+ "loss": 0.3012,
+ "step": 11450
+ },
+ {
+ "epoch": 5.414184397163121,
+ "grad_norm": 2.9590864181518555,
+ "learning_rate": 1.185328715188036e-07,
+ "loss": 0.2913,
+ "step": 11451
+ },
+ {
+ "epoch": 5.4146572104018915,
+ "grad_norm": 3.7972922325134277,
+ "learning_rate": 1.1834313581954498e-07,
+ "loss": 0.3938,
+ "step": 11452
+ },
+ {
+ "epoch": 5.415130023640662,
+ "grad_norm": 3.234405279159546,
+ "learning_rate": 1.1815354841561627e-07,
+ "loss": 0.3548,
+ "step": 11453
+ },
+ {
+ "epoch": 5.415602836879432,
+ "grad_norm": 3.439035654067993,
+ "learning_rate": 1.1796410931882136e-07,
+ "loss": 0.316,
+ "step": 11454
+ },
+ {
+ "epoch": 5.416075650118203,
+ "grad_norm": 3.5416979789733887,
+ "learning_rate": 1.177748185409569e-07,
+ "loss": 0.2477,
+ "step": 11455
+ },
+ {
+ "epoch": 5.416548463356974,
+ "grad_norm": 3.2233688831329346,
+ "learning_rate": 1.175856760938085e-07,
+ "loss": 0.2967,
+ "step": 11456
+ },
+ {
+ "epoch": 5.417021276595745,
+ "grad_norm": 3.328793525695801,
+ "learning_rate": 1.1739668198915366e-07,
+ "loss": 0.3869,
+ "step": 11457
+ },
+ {
+ "epoch": 5.417494089834515,
+ "grad_norm": 3.4466869831085205,
+ "learning_rate": 1.1720783623875991e-07,
+ "loss": 0.3281,
+ "step": 11458
+ },
+ {
+ "epoch": 5.417966903073286,
+ "grad_norm": 2.856494665145874,
+ "learning_rate": 1.1701913885438621e-07,
+ "loss": 0.2944,
+ "step": 11459
+ },
+ {
+ "epoch": 5.418439716312057,
+ "grad_norm": 3.5706748962402344,
+ "learning_rate": 1.1683058984778172e-07,
+ "loss": 0.302,
+ "step": 11460
+ },
+ {
+ "epoch": 5.418912529550828,
+ "grad_norm": 3.267441511154175,
+ "learning_rate": 1.1664218923068599e-07,
+ "loss": 0.2935,
+ "step": 11461
+ },
+ {
+ "epoch": 5.419385342789598,
+ "grad_norm": 2.9759864807128906,
+ "learning_rate": 1.1645393701483099e-07,
+ "loss": 0.3258,
+ "step": 11462
+ },
+ {
+ "epoch": 5.4198581560283685,
+ "grad_norm": 3.288041353225708,
+ "learning_rate": 1.1626583321193763e-07,
+ "loss": 0.2809,
+ "step": 11463
+ },
+ {
+ "epoch": 5.420330969267139,
+ "grad_norm": 3.2921197414398193,
+ "learning_rate": 1.1607787783371794e-07,
+ "loss": 0.3365,
+ "step": 11464
+ },
+ {
+ "epoch": 5.42080378250591,
+ "grad_norm": 3.1783058643341064,
+ "learning_rate": 1.1589007089187615e-07,
+ "loss": 0.3345,
+ "step": 11465
+ },
+ {
+ "epoch": 5.421276595744681,
+ "grad_norm": 3.314274787902832,
+ "learning_rate": 1.1570241239810542e-07,
+ "loss": 0.35,
+ "step": 11466
+ },
+ {
+ "epoch": 5.421749408983452,
+ "grad_norm": 3.0419068336486816,
+ "learning_rate": 1.1551490236409085e-07,
+ "loss": 0.3057,
+ "step": 11467
+ },
+ {
+ "epoch": 5.4222222222222225,
+ "grad_norm": 3.2282440662384033,
+ "learning_rate": 1.1532754080150782e-07,
+ "loss": 0.3441,
+ "step": 11468
+ },
+ {
+ "epoch": 5.422695035460993,
+ "grad_norm": 2.974402666091919,
+ "learning_rate": 1.1514032772202172e-07,
+ "loss": 0.2715,
+ "step": 11469
+ },
+ {
+ "epoch": 5.423167848699763,
+ "grad_norm": 2.95684814453125,
+ "learning_rate": 1.1495326313729071e-07,
+ "loss": 0.2794,
+ "step": 11470
+ },
+ {
+ "epoch": 5.423640661938534,
+ "grad_norm": 3.2254786491394043,
+ "learning_rate": 1.1476634705896162e-07,
+ "loss": 0.3483,
+ "step": 11471
+ },
+ {
+ "epoch": 5.424113475177305,
+ "grad_norm": 3.2899491786956787,
+ "learning_rate": 1.1457957949867316e-07,
+ "loss": 0.3176,
+ "step": 11472
+ },
+ {
+ "epoch": 5.424586288416076,
+ "grad_norm": 3.2927584648132324,
+ "learning_rate": 1.1439296046805464e-07,
+ "loss": 0.321,
+ "step": 11473
+ },
+ {
+ "epoch": 5.425059101654846,
+ "grad_norm": 2.9653894901275635,
+ "learning_rate": 1.1420648997872596e-07,
+ "loss": 0.3307,
+ "step": 11474
+ },
+ {
+ "epoch": 5.425531914893617,
+ "grad_norm": 3.266310214996338,
+ "learning_rate": 1.1402016804229782e-07,
+ "loss": 0.3411,
+ "step": 11475
+ },
+ {
+ "epoch": 5.426004728132388,
+ "grad_norm": 3.2980828285217285,
+ "learning_rate": 1.1383399467037148e-07,
+ "loss": 0.3512,
+ "step": 11476
+ },
+ {
+ "epoch": 5.426477541371159,
+ "grad_norm": 3.2396881580352783,
+ "learning_rate": 1.1364796987453908e-07,
+ "loss": 0.3564,
+ "step": 11477
+ },
+ {
+ "epoch": 5.426950354609929,
+ "grad_norm": 3.1427674293518066,
+ "learning_rate": 1.1346209366638383e-07,
+ "loss": 0.3109,
+ "step": 11478
+ },
+ {
+ "epoch": 5.4274231678486995,
+ "grad_norm": 3.4141311645507812,
+ "learning_rate": 1.1327636605747926e-07,
+ "loss": 0.3694,
+ "step": 11479
+ },
+ {
+ "epoch": 5.42789598108747,
+ "grad_norm": 3.2203590869903564,
+ "learning_rate": 1.1309078705939031e-07,
+ "loss": 0.3153,
+ "step": 11480
+ },
+ {
+ "epoch": 5.428368794326241,
+ "grad_norm": 3.506516456604004,
+ "learning_rate": 1.1290535668367159e-07,
+ "loss": 0.3423,
+ "step": 11481
+ },
+ {
+ "epoch": 5.428841607565012,
+ "grad_norm": 4.035604953765869,
+ "learning_rate": 1.1272007494186887e-07,
+ "loss": 0.3452,
+ "step": 11482
+ },
+ {
+ "epoch": 5.429314420803783,
+ "grad_norm": 3.4485437870025635,
+ "learning_rate": 1.1253494184551961e-07,
+ "loss": 0.3539,
+ "step": 11483
+ },
+ {
+ "epoch": 5.4297872340425535,
+ "grad_norm": 3.707043409347534,
+ "learning_rate": 1.123499574061504e-07,
+ "loss": 0.3519,
+ "step": 11484
+ },
+ {
+ "epoch": 5.430260047281324,
+ "grad_norm": 4.046982765197754,
+ "learning_rate": 1.121651216352801e-07,
+ "loss": 0.333,
+ "step": 11485
+ },
+ {
+ "epoch": 5.430732860520094,
+ "grad_norm": 2.920891046524048,
+ "learning_rate": 1.1198043454441754e-07,
+ "loss": 0.3112,
+ "step": 11486
+ },
+ {
+ "epoch": 5.431205673758865,
+ "grad_norm": 3.5038676261901855,
+ "learning_rate": 1.1179589614506159e-07,
+ "loss": 0.3393,
+ "step": 11487
+ },
+ {
+ "epoch": 5.431678486997636,
+ "grad_norm": 3.349278211593628,
+ "learning_rate": 1.1161150644870389e-07,
+ "loss": 0.3814,
+ "step": 11488
+ },
+ {
+ "epoch": 5.432151300236407,
+ "grad_norm": 3.5188348293304443,
+ "learning_rate": 1.1142726546682469e-07,
+ "loss": 0.3791,
+ "step": 11489
+ },
+ {
+ "epoch": 5.432624113475177,
+ "grad_norm": 3.6900806427001953,
+ "learning_rate": 1.1124317321089595e-07,
+ "loss": 0.286,
+ "step": 11490
+ },
+ {
+ "epoch": 5.433096926713948,
+ "grad_norm": 2.804166793823242,
+ "learning_rate": 1.110592296923807e-07,
+ "loss": 0.2746,
+ "step": 11491
+ },
+ {
+ "epoch": 5.433569739952719,
+ "grad_norm": 3.1634747982025146,
+ "learning_rate": 1.1087543492273145e-07,
+ "loss": 0.3192,
+ "step": 11492
+ },
+ {
+ "epoch": 5.43404255319149,
+ "grad_norm": 3.8363192081451416,
+ "learning_rate": 1.106917889133935e-07,
+ "loss": 0.3442,
+ "step": 11493
+ },
+ {
+ "epoch": 5.43451536643026,
+ "grad_norm": 3.2438740730285645,
+ "learning_rate": 1.1050829167580073e-07,
+ "loss": 0.3487,
+ "step": 11494
+ },
+ {
+ "epoch": 5.4349881796690305,
+ "grad_norm": 3.2521681785583496,
+ "learning_rate": 1.1032494322137877e-07,
+ "loss": 0.3285,
+ "step": 11495
+ },
+ {
+ "epoch": 5.435460992907801,
+ "grad_norm": 2.9817872047424316,
+ "learning_rate": 1.1014174356154484e-07,
+ "loss": 0.2932,
+ "step": 11496
+ },
+ {
+ "epoch": 5.435933806146572,
+ "grad_norm": 3.4062857627868652,
+ "learning_rate": 1.0995869270770454e-07,
+ "loss": 0.3418,
+ "step": 11497
+ },
+ {
+ "epoch": 5.436406619385343,
+ "grad_norm": 3.4945690631866455,
+ "learning_rate": 1.097757906712571e-07,
+ "loss": 0.3411,
+ "step": 11498
+ },
+ {
+ "epoch": 5.436879432624114,
+ "grad_norm": 3.2375268936157227,
+ "learning_rate": 1.0959303746359007e-07,
+ "loss": 0.3243,
+ "step": 11499
+ },
+ {
+ "epoch": 5.4373522458628845,
+ "grad_norm": 3.2625772953033447,
+ "learning_rate": 1.0941043309608295e-07,
+ "loss": 0.3195,
+ "step": 11500
+ },
+ {
+ "epoch": 5.437825059101655,
+ "grad_norm": 3.473390579223633,
+ "learning_rate": 1.0922797758010584e-07,
+ "loss": 0.3755,
+ "step": 11501
+ },
+ {
+ "epoch": 5.438297872340425,
+ "grad_norm": 3.1895859241485596,
+ "learning_rate": 1.0904567092701907e-07,
+ "loss": 0.2959,
+ "step": 11502
+ },
+ {
+ "epoch": 5.438770685579196,
+ "grad_norm": 3.3767261505126953,
+ "learning_rate": 1.0886351314817467e-07,
+ "loss": 0.3274,
+ "step": 11503
+ },
+ {
+ "epoch": 5.439243498817967,
+ "grad_norm": 3.0629384517669678,
+ "learning_rate": 1.0868150425491469e-07,
+ "loss": 0.3347,
+ "step": 11504
+ },
+ {
+ "epoch": 5.439716312056738,
+ "grad_norm": 3.68630051612854,
+ "learning_rate": 1.0849964425857174e-07,
+ "loss": 0.3857,
+ "step": 11505
+ },
+ {
+ "epoch": 5.440189125295508,
+ "grad_norm": 3.170816659927368,
+ "learning_rate": 1.0831793317046895e-07,
+ "loss": 0.3009,
+ "step": 11506
+ },
+ {
+ "epoch": 5.440661938534279,
+ "grad_norm": 3.1225314140319824,
+ "learning_rate": 1.0813637100192176e-07,
+ "loss": 0.3531,
+ "step": 11507
+ },
+ {
+ "epoch": 5.44113475177305,
+ "grad_norm": 3.2253336906433105,
+ "learning_rate": 1.079549577642347e-07,
+ "loss": 0.3218,
+ "step": 11508
+ },
+ {
+ "epoch": 5.441607565011821,
+ "grad_norm": 2.8939125537872314,
+ "learning_rate": 1.0777369346870376e-07,
+ "loss": 0.2759,
+ "step": 11509
+ },
+ {
+ "epoch": 5.442080378250591,
+ "grad_norm": 3.3873422145843506,
+ "learning_rate": 1.0759257812661489e-07,
+ "loss": 0.3115,
+ "step": 11510
+ },
+ {
+ "epoch": 5.4425531914893615,
+ "grad_norm": 3.2324378490448,
+ "learning_rate": 1.0741161174924603e-07,
+ "loss": 0.3287,
+ "step": 11511
+ },
+ {
+ "epoch": 5.443026004728132,
+ "grad_norm": 3.369802713394165,
+ "learning_rate": 1.0723079434786482e-07,
+ "loss": 0.3218,
+ "step": 11512
+ },
+ {
+ "epoch": 5.443498817966903,
+ "grad_norm": 3.3094370365142822,
+ "learning_rate": 1.0705012593373032e-07,
+ "loss": 0.3079,
+ "step": 11513
+ },
+ {
+ "epoch": 5.443971631205674,
+ "grad_norm": 3.8659892082214355,
+ "learning_rate": 1.0686960651809158e-07,
+ "loss": 0.3199,
+ "step": 11514
+ },
+ {
+ "epoch": 5.444444444444445,
+ "grad_norm": 3.1500043869018555,
+ "learning_rate": 1.0668923611218851e-07,
+ "loss": 0.3224,
+ "step": 11515
+ },
+ {
+ "epoch": 5.444917257683215,
+ "grad_norm": 3.008122444152832,
+ "learning_rate": 1.0650901472725295e-07,
+ "loss": 0.2918,
+ "step": 11516
+ },
+ {
+ "epoch": 5.445390070921986,
+ "grad_norm": 3.1699118614196777,
+ "learning_rate": 1.0632894237450564e-07,
+ "loss": 0.3058,
+ "step": 11517
+ },
+ {
+ "epoch": 5.445862884160756,
+ "grad_norm": 2.907656192779541,
+ "learning_rate": 1.0614901906515901e-07,
+ "loss": 0.2993,
+ "step": 11518
+ },
+ {
+ "epoch": 5.446335697399527,
+ "grad_norm": 3.3137612342834473,
+ "learning_rate": 1.059692448104166e-07,
+ "loss": 0.2962,
+ "step": 11519
+ },
+ {
+ "epoch": 5.446808510638298,
+ "grad_norm": 3.1978371143341064,
+ "learning_rate": 1.0578961962147139e-07,
+ "loss": 0.3076,
+ "step": 11520
+ },
+ {
+ "epoch": 5.4472813238770685,
+ "grad_norm": 2.8629150390625,
+ "learning_rate": 1.056101435095086e-07,
+ "loss": 0.2902,
+ "step": 11521
+ },
+ {
+ "epoch": 5.447754137115839,
+ "grad_norm": 3.287623643875122,
+ "learning_rate": 1.0543081648570291e-07,
+ "loss": 0.2816,
+ "step": 11522
+ },
+ {
+ "epoch": 5.44822695035461,
+ "grad_norm": 3.0980396270751953,
+ "learning_rate": 1.0525163856122068e-07,
+ "loss": 0.3373,
+ "step": 11523
+ },
+ {
+ "epoch": 5.448699763593381,
+ "grad_norm": 3.444204092025757,
+ "learning_rate": 1.0507260974721795e-07,
+ "loss": 0.2918,
+ "step": 11524
+ },
+ {
+ "epoch": 5.449172576832151,
+ "grad_norm": 3.946662187576294,
+ "learning_rate": 1.0489373005484248e-07,
+ "loss": 0.3052,
+ "step": 11525
+ },
+ {
+ "epoch": 5.449645390070922,
+ "grad_norm": 3.6034319400787354,
+ "learning_rate": 1.0471499949523234e-07,
+ "loss": 0.319,
+ "step": 11526
+ },
+ {
+ "epoch": 5.450118203309692,
+ "grad_norm": 3.5294625759124756,
+ "learning_rate": 1.0453641807951609e-07,
+ "loss": 0.3523,
+ "step": 11527
+ },
+ {
+ "epoch": 5.450591016548463,
+ "grad_norm": 3.2164146900177,
+ "learning_rate": 1.043579858188129e-07,
+ "loss": 0.3397,
+ "step": 11528
+ },
+ {
+ "epoch": 5.451063829787234,
+ "grad_norm": 3.060518741607666,
+ "learning_rate": 1.0417970272423388e-07,
+ "loss": 0.3483,
+ "step": 11529
+ },
+ {
+ "epoch": 5.451536643026005,
+ "grad_norm": 3.1823830604553223,
+ "learning_rate": 1.0400156880687906e-07,
+ "loss": 0.3493,
+ "step": 11530
+ },
+ {
+ "epoch": 5.452009456264776,
+ "grad_norm": 3.1154868602752686,
+ "learning_rate": 1.0382358407784065e-07,
+ "loss": 0.2982,
+ "step": 11531
+ },
+ {
+ "epoch": 5.452482269503546,
+ "grad_norm": 3.1093451976776123,
+ "learning_rate": 1.0364574854820064e-07,
+ "loss": 0.2735,
+ "step": 11532
+ },
+ {
+ "epoch": 5.452955082742317,
+ "grad_norm": 3.1191015243530273,
+ "learning_rate": 1.0346806222903211e-07,
+ "loss": 0.3361,
+ "step": 11533
+ },
+ {
+ "epoch": 5.453427895981087,
+ "grad_norm": 3.328355312347412,
+ "learning_rate": 1.0329052513139898e-07,
+ "loss": 0.296,
+ "step": 11534
+ },
+ {
+ "epoch": 5.453900709219858,
+ "grad_norm": 2.975257158279419,
+ "learning_rate": 1.0311313726635546e-07,
+ "loss": 0.2943,
+ "step": 11535
+ },
+ {
+ "epoch": 5.454373522458629,
+ "grad_norm": 2.889234781265259,
+ "learning_rate": 1.029358986449469e-07,
+ "loss": 0.3056,
+ "step": 11536
+ },
+ {
+ "epoch": 5.4548463356973995,
+ "grad_norm": 3.158823251724243,
+ "learning_rate": 1.0275880927820948e-07,
+ "loss": 0.3092,
+ "step": 11537
+ },
+ {
+ "epoch": 5.45531914893617,
+ "grad_norm": 3.256237506866455,
+ "learning_rate": 1.0258186917716906e-07,
+ "loss": 0.3353,
+ "step": 11538
+ },
+ {
+ "epoch": 5.455791962174941,
+ "grad_norm": 3.170668125152588,
+ "learning_rate": 1.0240507835284353e-07,
+ "loss": 0.2833,
+ "step": 11539
+ },
+ {
+ "epoch": 5.456264775413712,
+ "grad_norm": 3.2530853748321533,
+ "learning_rate": 1.0222843681624045e-07,
+ "loss": 0.3327,
+ "step": 11540
+ },
+ {
+ "epoch": 5.456737588652482,
+ "grad_norm": 3.224292516708374,
+ "learning_rate": 1.0205194457835854e-07,
+ "loss": 0.3421,
+ "step": 11541
+ },
+ {
+ "epoch": 5.457210401891253,
+ "grad_norm": 3.392624855041504,
+ "learning_rate": 1.0187560165018761e-07,
+ "loss": 0.4025,
+ "step": 11542
+ },
+ {
+ "epoch": 5.457683215130023,
+ "grad_norm": 3.517279863357544,
+ "learning_rate": 1.0169940804270723e-07,
+ "loss": 0.3634,
+ "step": 11543
+ },
+ {
+ "epoch": 5.458156028368794,
+ "grad_norm": 3.68953275680542,
+ "learning_rate": 1.0152336376688887e-07,
+ "loss": 0.3512,
+ "step": 11544
+ },
+ {
+ "epoch": 5.458628841607565,
+ "grad_norm": 3.429471492767334,
+ "learning_rate": 1.013474688336935e-07,
+ "loss": 0.3601,
+ "step": 11545
+ },
+ {
+ "epoch": 5.459101654846336,
+ "grad_norm": 3.0927700996398926,
+ "learning_rate": 1.0117172325407321e-07,
+ "loss": 0.3181,
+ "step": 11546
+ },
+ {
+ "epoch": 5.459574468085107,
+ "grad_norm": 3.3231961727142334,
+ "learning_rate": 1.0099612703897144e-07,
+ "loss": 0.3376,
+ "step": 11547
+ },
+ {
+ "epoch": 5.460047281323877,
+ "grad_norm": 3.1489665508270264,
+ "learning_rate": 1.008206801993214e-07,
+ "loss": 0.2726,
+ "step": 11548
+ },
+ {
+ "epoch": 5.460520094562648,
+ "grad_norm": 2.998042345046997,
+ "learning_rate": 1.006453827460474e-07,
+ "loss": 0.3258,
+ "step": 11549
+ },
+ {
+ "epoch": 5.460992907801418,
+ "grad_norm": 3.317922592163086,
+ "learning_rate": 1.0047023469006489e-07,
+ "loss": 0.3156,
+ "step": 11550
+ },
+ {
+ "epoch": 5.461465721040189,
+ "grad_norm": 3.0401973724365234,
+ "learning_rate": 1.0029523604227904e-07,
+ "loss": 0.3063,
+ "step": 11551
+ },
+ {
+ "epoch": 5.46193853427896,
+ "grad_norm": 2.92132568359375,
+ "learning_rate": 1.0012038681358638e-07,
+ "loss": 0.3335,
+ "step": 11552
+ },
+ {
+ "epoch": 5.4624113475177305,
+ "grad_norm": 3.06042742729187,
+ "learning_rate": 9.994568701487434e-08,
+ "loss": 0.3317,
+ "step": 11553
+ },
+ {
+ "epoch": 5.462884160756501,
+ "grad_norm": 2.959724187850952,
+ "learning_rate": 9.977113665702059e-08,
+ "loss": 0.3015,
+ "step": 11554
+ },
+ {
+ "epoch": 5.463356973995272,
+ "grad_norm": 3.12992787361145,
+ "learning_rate": 9.959673575089313e-08,
+ "loss": 0.366,
+ "step": 11555
+ },
+ {
+ "epoch": 5.463829787234043,
+ "grad_norm": 3.3487796783447266,
+ "learning_rate": 9.94224843073513e-08,
+ "loss": 0.3029,
+ "step": 11556
+ },
+ {
+ "epoch": 5.464302600472813,
+ "grad_norm": 3.027540445327759,
+ "learning_rate": 9.92483823372456e-08,
+ "loss": 0.3269,
+ "step": 11557
+ },
+ {
+ "epoch": 5.464775413711584,
+ "grad_norm": 3.2602925300598145,
+ "learning_rate": 9.907442985141569e-08,
+ "loss": 0.3376,
+ "step": 11558
+ },
+ {
+ "epoch": 5.465248226950354,
+ "grad_norm": 3.346432685852051,
+ "learning_rate": 9.890062686069374e-08,
+ "loss": 0.3409,
+ "step": 11559
+ },
+ {
+ "epoch": 5.465721040189125,
+ "grad_norm": 3.288007974624634,
+ "learning_rate": 9.872697337590109e-08,
+ "loss": 0.3229,
+ "step": 11560
+ },
+ {
+ "epoch": 5.466193853427896,
+ "grad_norm": 3.5697832107543945,
+ "learning_rate": 9.85534694078502e-08,
+ "loss": 0.4162,
+ "step": 11561
+ },
+ {
+ "epoch": 5.466666666666667,
+ "grad_norm": 3.2786145210266113,
+ "learning_rate": 9.838011496734523e-08,
+ "loss": 0.3339,
+ "step": 11562
+ },
+ {
+ "epoch": 5.467139479905438,
+ "grad_norm": 3.6333465576171875,
+ "learning_rate": 9.820691006517947e-08,
+ "loss": 0.2769,
+ "step": 11563
+ },
+ {
+ "epoch": 5.467612293144208,
+ "grad_norm": 3.409446954727173,
+ "learning_rate": 9.803385471213788e-08,
+ "loss": 0.314,
+ "step": 11564
+ },
+ {
+ "epoch": 5.468085106382979,
+ "grad_norm": 3.376220941543579,
+ "learning_rate": 9.786094891899606e-08,
+ "loss": 0.3211,
+ "step": 11565
+ },
+ {
+ "epoch": 5.468557919621749,
+ "grad_norm": 3.073551893234253,
+ "learning_rate": 9.768819269651952e-08,
+ "loss": 0.2867,
+ "step": 11566
+ },
+ {
+ "epoch": 5.46903073286052,
+ "grad_norm": 2.9466781616210938,
+ "learning_rate": 9.75155860554658e-08,
+ "loss": 0.3323,
+ "step": 11567
+ },
+ {
+ "epoch": 5.469503546099291,
+ "grad_norm": 2.8090531826019287,
+ "learning_rate": 9.734312900658182e-08,
+ "loss": 0.2598,
+ "step": 11568
+ },
+ {
+ "epoch": 5.4699763593380615,
+ "grad_norm": 2.9262332916259766,
+ "learning_rate": 9.717082156060598e-08,
+ "loss": 0.3059,
+ "step": 11569
+ },
+ {
+ "epoch": 5.470449172576832,
+ "grad_norm": 3.2151377201080322,
+ "learning_rate": 9.699866372826661e-08,
+ "loss": 0.2855,
+ "step": 11570
+ },
+ {
+ "epoch": 5.470921985815603,
+ "grad_norm": 3.633512020111084,
+ "learning_rate": 9.682665552028404e-08,
+ "loss": 0.3495,
+ "step": 11571
+ },
+ {
+ "epoch": 5.471394799054374,
+ "grad_norm": 3.3621714115142822,
+ "learning_rate": 9.665479694736773e-08,
+ "loss": 0.2937,
+ "step": 11572
+ },
+ {
+ "epoch": 5.471867612293144,
+ "grad_norm": 2.9446847438812256,
+ "learning_rate": 9.648308802021916e-08,
+ "loss": 0.297,
+ "step": 11573
+ },
+ {
+ "epoch": 5.472340425531915,
+ "grad_norm": 3.4728331565856934,
+ "learning_rate": 9.63115287495292e-08,
+ "loss": 0.3365,
+ "step": 11574
+ },
+ {
+ "epoch": 5.472813238770685,
+ "grad_norm": 3.0443341732025146,
+ "learning_rate": 9.614011914598071e-08,
+ "loss": 0.3059,
+ "step": 11575
+ },
+ {
+ "epoch": 5.473286052009456,
+ "grad_norm": 3.659133195877075,
+ "learning_rate": 9.596885922024623e-08,
+ "loss": 0.3561,
+ "step": 11576
+ },
+ {
+ "epoch": 5.473758865248227,
+ "grad_norm": 3.166276216506958,
+ "learning_rate": 9.579774898298976e-08,
+ "loss": 0.2852,
+ "step": 11577
+ },
+ {
+ "epoch": 5.474231678486998,
+ "grad_norm": 3.019780158996582,
+ "learning_rate": 9.562678844486528e-08,
+ "loss": 0.3586,
+ "step": 11578
+ },
+ {
+ "epoch": 5.474704491725769,
+ "grad_norm": 3.729064464569092,
+ "learning_rate": 9.545597761651759e-08,
+ "loss": 0.3311,
+ "step": 11579
+ },
+ {
+ "epoch": 5.475177304964539,
+ "grad_norm": 3.478250026702881,
+ "learning_rate": 9.528531650858291e-08,
+ "loss": 0.3097,
+ "step": 11580
+ },
+ {
+ "epoch": 5.47565011820331,
+ "grad_norm": 2.939131021499634,
+ "learning_rate": 9.511480513168691e-08,
+ "loss": 0.2958,
+ "step": 11581
+ },
+ {
+ "epoch": 5.47612293144208,
+ "grad_norm": 3.405210256576538,
+ "learning_rate": 9.494444349644721e-08,
+ "loss": 0.2855,
+ "step": 11582
+ },
+ {
+ "epoch": 5.476595744680851,
+ "grad_norm": 3.166900873184204,
+ "learning_rate": 9.477423161347088e-08,
+ "loss": 0.278,
+ "step": 11583
+ },
+ {
+ "epoch": 5.477068557919622,
+ "grad_norm": 3.497921943664551,
+ "learning_rate": 9.460416949335665e-08,
+ "loss": 0.3507,
+ "step": 11584
+ },
+ {
+ "epoch": 5.4775413711583925,
+ "grad_norm": 2.780923843383789,
+ "learning_rate": 9.443425714669358e-08,
+ "loss": 0.2639,
+ "step": 11585
+ },
+ {
+ "epoch": 5.478014184397163,
+ "grad_norm": 3.697463274002075,
+ "learning_rate": 9.426449458406123e-08,
+ "loss": 0.3468,
+ "step": 11586
+ },
+ {
+ "epoch": 5.478486997635934,
+ "grad_norm": 3.344977617263794,
+ "learning_rate": 9.409488181602977e-08,
+ "loss": 0.3299,
+ "step": 11587
+ },
+ {
+ "epoch": 5.478959810874705,
+ "grad_norm": 3.280545949935913,
+ "learning_rate": 9.392541885316075e-08,
+ "loss": 0.3453,
+ "step": 11588
+ },
+ {
+ "epoch": 5.479432624113475,
+ "grad_norm": 4.446796894073486,
+ "learning_rate": 9.375610570600518e-08,
+ "loss": 0.3241,
+ "step": 11589
+ },
+ {
+ "epoch": 5.479905437352246,
+ "grad_norm": 3.322175979614258,
+ "learning_rate": 9.358694238510602e-08,
+ "loss": 0.3399,
+ "step": 11590
+ },
+ {
+ "epoch": 5.480378250591016,
+ "grad_norm": 3.0891571044921875,
+ "learning_rate": 9.341792890099621e-08,
+ "loss": 0.3048,
+ "step": 11591
+ },
+ {
+ "epoch": 5.480851063829787,
+ "grad_norm": 2.99324631690979,
+ "learning_rate": 9.32490652641993e-08,
+ "loss": 0.2799,
+ "step": 11592
+ },
+ {
+ "epoch": 5.481323877068558,
+ "grad_norm": 3.1435763835906982,
+ "learning_rate": 9.308035148522993e-08,
+ "loss": 0.3421,
+ "step": 11593
+ },
+ {
+ "epoch": 5.481796690307329,
+ "grad_norm": 3.177278518676758,
+ "learning_rate": 9.291178757459274e-08,
+ "loss": 0.3049,
+ "step": 11594
+ },
+ {
+ "epoch": 5.4822695035460995,
+ "grad_norm": 3.1422131061553955,
+ "learning_rate": 9.274337354278406e-08,
+ "loss": 0.3157,
+ "step": 11595
+ },
+ {
+ "epoch": 5.48274231678487,
+ "grad_norm": 3.261337995529175,
+ "learning_rate": 9.257510940029024e-08,
+ "loss": 0.3353,
+ "step": 11596
+ },
+ {
+ "epoch": 5.48321513002364,
+ "grad_norm": 3.239210367202759,
+ "learning_rate": 9.24069951575876e-08,
+ "loss": 0.3122,
+ "step": 11597
+ },
+ {
+ "epoch": 5.483687943262411,
+ "grad_norm": 2.982391595840454,
+ "learning_rate": 9.223903082514474e-08,
+ "loss": 0.318,
+ "step": 11598
+ },
+ {
+ "epoch": 5.484160756501182,
+ "grad_norm": 3.305668592453003,
+ "learning_rate": 9.207121641341937e-08,
+ "loss": 0.3537,
+ "step": 11599
+ },
+ {
+ "epoch": 5.484633569739953,
+ "grad_norm": 3.434743881225586,
+ "learning_rate": 9.190355193286121e-08,
+ "loss": 0.3408,
+ "step": 11600
+ },
+ {
+ "epoch": 5.485106382978723,
+ "grad_norm": 3.5249133110046387,
+ "learning_rate": 9.173603739390996e-08,
+ "loss": 0.2927,
+ "step": 11601
+ },
+ {
+ "epoch": 5.485579196217494,
+ "grad_norm": 3.4672904014587402,
+ "learning_rate": 9.156867280699533e-08,
+ "loss": 0.3742,
+ "step": 11602
+ },
+ {
+ "epoch": 5.486052009456265,
+ "grad_norm": 3.588812828063965,
+ "learning_rate": 9.140145818253898e-08,
+ "loss": 0.3527,
+ "step": 11603
+ },
+ {
+ "epoch": 5.486524822695036,
+ "grad_norm": 2.998471260070801,
+ "learning_rate": 9.123439353095231e-08,
+ "loss": 0.3108,
+ "step": 11604
+ },
+ {
+ "epoch": 5.486997635933806,
+ "grad_norm": 3.70161509513855,
+ "learning_rate": 9.106747886263784e-08,
+ "loss": 0.3732,
+ "step": 11605
+ },
+ {
+ "epoch": 5.4874704491725765,
+ "grad_norm": 3.575035333633423,
+ "learning_rate": 9.090071418798862e-08,
+ "loss": 0.3097,
+ "step": 11606
+ },
+ {
+ "epoch": 5.487943262411347,
+ "grad_norm": 2.9582679271698,
+ "learning_rate": 9.073409951738832e-08,
+ "loss": 0.31,
+ "step": 11607
+ },
+ {
+ "epoch": 5.488416075650118,
+ "grad_norm": 2.889258861541748,
+ "learning_rate": 9.05676348612114e-08,
+ "loss": 0.321,
+ "step": 11608
+ },
+ {
+ "epoch": 5.488888888888889,
+ "grad_norm": 3.2909512519836426,
+ "learning_rate": 9.040132022982262e-08,
+ "loss": 0.2967,
+ "step": 11609
+ },
+ {
+ "epoch": 5.48936170212766,
+ "grad_norm": 3.4053189754486084,
+ "learning_rate": 9.023515563357815e-08,
+ "loss": 0.3731,
+ "step": 11610
+ },
+ {
+ "epoch": 5.4898345153664305,
+ "grad_norm": 3.4993951320648193,
+ "learning_rate": 9.006914108282388e-08,
+ "loss": 0.3544,
+ "step": 11611
+ },
+ {
+ "epoch": 5.490307328605201,
+ "grad_norm": 3.1488454341888428,
+ "learning_rate": 8.990327658789683e-08,
+ "loss": 0.3326,
+ "step": 11612
+ },
+ {
+ "epoch": 5.490780141843971,
+ "grad_norm": 3.155266523361206,
+ "learning_rate": 8.97375621591251e-08,
+ "loss": 0.284,
+ "step": 11613
+ },
+ {
+ "epoch": 5.491252955082742,
+ "grad_norm": 3.5206151008605957,
+ "learning_rate": 8.957199780682657e-08,
+ "loss": 0.3104,
+ "step": 11614
+ },
+ {
+ "epoch": 5.491725768321513,
+ "grad_norm": 3.3629000186920166,
+ "learning_rate": 8.94065835413102e-08,
+ "loss": 0.3524,
+ "step": 11615
+ },
+ {
+ "epoch": 5.492198581560284,
+ "grad_norm": 2.872361183166504,
+ "learning_rate": 8.924131937287583e-08,
+ "loss": 0.2656,
+ "step": 11616
+ },
+ {
+ "epoch": 5.492671394799054,
+ "grad_norm": 3.2643532752990723,
+ "learning_rate": 8.907620531181382e-08,
+ "loss": 0.3039,
+ "step": 11617
+ },
+ {
+ "epoch": 5.493144208037825,
+ "grad_norm": 3.0820932388305664,
+ "learning_rate": 8.891124136840484e-08,
+ "loss": 0.2849,
+ "step": 11618
+ },
+ {
+ "epoch": 5.493617021276596,
+ "grad_norm": 3.398268222808838,
+ "learning_rate": 8.87464275529204e-08,
+ "loss": 0.275,
+ "step": 11619
+ },
+ {
+ "epoch": 5.494089834515367,
+ "grad_norm": 3.5962584018707275,
+ "learning_rate": 8.858176387562284e-08,
+ "loss": 0.3416,
+ "step": 11620
+ },
+ {
+ "epoch": 5.494562647754137,
+ "grad_norm": 3.0873095989227295,
+ "learning_rate": 8.841725034676535e-08,
+ "loss": 0.3669,
+ "step": 11621
+ },
+ {
+ "epoch": 5.4950354609929075,
+ "grad_norm": 3.2669544219970703,
+ "learning_rate": 8.825288697659084e-08,
+ "loss": 0.3739,
+ "step": 11622
+ },
+ {
+ "epoch": 5.495508274231678,
+ "grad_norm": 3.4504950046539307,
+ "learning_rate": 8.80886737753342e-08,
+ "loss": 0.295,
+ "step": 11623
+ },
+ {
+ "epoch": 5.495981087470449,
+ "grad_norm": 3.5143895149230957,
+ "learning_rate": 8.792461075322e-08,
+ "loss": 0.3655,
+ "step": 11624
+ },
+ {
+ "epoch": 5.49645390070922,
+ "grad_norm": 3.2914931774139404,
+ "learning_rate": 8.776069792046316e-08,
+ "loss": 0.2812,
+ "step": 11625
+ },
+ {
+ "epoch": 5.496926713947991,
+ "grad_norm": 3.2811596393585205,
+ "learning_rate": 8.759693528727075e-08,
+ "loss": 0.3118,
+ "step": 11626
+ },
+ {
+ "epoch": 5.4973995271867615,
+ "grad_norm": 2.9466712474823,
+ "learning_rate": 8.743332286383882e-08,
+ "loss": 0.3293,
+ "step": 11627
+ },
+ {
+ "epoch": 5.497872340425532,
+ "grad_norm": 3.4942102432250977,
+ "learning_rate": 8.726986066035531e-08,
+ "loss": 0.3452,
+ "step": 11628
+ },
+ {
+ "epoch": 5.498345153664302,
+ "grad_norm": 3.010925054550171,
+ "learning_rate": 8.71065486869982e-08,
+ "loss": 0.2651,
+ "step": 11629
+ },
+ {
+ "epoch": 5.498817966903073,
+ "grad_norm": 3.4343719482421875,
+ "learning_rate": 8.694338695393573e-08,
+ "loss": 0.2914,
+ "step": 11630
+ },
+ {
+ "epoch": 5.499290780141844,
+ "grad_norm": 3.32430362701416,
+ "learning_rate": 8.678037547132784e-08,
+ "loss": 0.3505,
+ "step": 11631
+ },
+ {
+ "epoch": 5.499763593380615,
+ "grad_norm": 3.294867992401123,
+ "learning_rate": 8.661751424932419e-08,
+ "loss": 0.3154,
+ "step": 11632
+ },
+ {
+ "epoch": 5.500236406619385,
+ "grad_norm": 3.3598666191101074,
+ "learning_rate": 8.645480329806583e-08,
+ "loss": 0.3787,
+ "step": 11633
+ },
+ {
+ "epoch": 5.500709219858156,
+ "grad_norm": 3.0285251140594482,
+ "learning_rate": 8.629224262768382e-08,
+ "loss": 0.3218,
+ "step": 11634
+ },
+ {
+ "epoch": 5.501182033096927,
+ "grad_norm": 3.117607355117798,
+ "learning_rate": 8.612983224829952e-08,
+ "loss": 0.3229,
+ "step": 11635
+ },
+ {
+ "epoch": 5.501654846335697,
+ "grad_norm": 3.201794385910034,
+ "learning_rate": 8.596757217002649e-08,
+ "loss": 0.3228,
+ "step": 11636
+ },
+ {
+ "epoch": 5.502127659574468,
+ "grad_norm": 3.0462050437927246,
+ "learning_rate": 8.58054624029675e-08,
+ "loss": 0.334,
+ "step": 11637
+ },
+ {
+ "epoch": 5.5026004728132385,
+ "grad_norm": 3.228955030441284,
+ "learning_rate": 8.564350295721613e-08,
+ "loss": 0.365,
+ "step": 11638
+ },
+ {
+ "epoch": 5.503073286052009,
+ "grad_norm": 3.2337913513183594,
+ "learning_rate": 8.548169384285765e-08,
+ "loss": 0.3145,
+ "step": 11639
+ },
+ {
+ "epoch": 5.50354609929078,
+ "grad_norm": 3.1510117053985596,
+ "learning_rate": 8.532003506996623e-08,
+ "loss": 0.3456,
+ "step": 11640
+ },
+ {
+ "epoch": 5.504018912529551,
+ "grad_norm": 3.1749494075775146,
+ "learning_rate": 8.51585266486088e-08,
+ "loss": 0.3133,
+ "step": 11641
+ },
+ {
+ "epoch": 5.504491725768322,
+ "grad_norm": 3.043325185775757,
+ "learning_rate": 8.499716858884094e-08,
+ "loss": 0.2652,
+ "step": 11642
+ },
+ {
+ "epoch": 5.5049645390070925,
+ "grad_norm": 3.722120761871338,
+ "learning_rate": 8.483596090070962e-08,
+ "loss": 0.3797,
+ "step": 11643
+ },
+ {
+ "epoch": 5.505437352245863,
+ "grad_norm": 3.287811756134033,
+ "learning_rate": 8.46749035942529e-08,
+ "loss": 0.3523,
+ "step": 11644
+ },
+ {
+ "epoch": 5.505910165484633,
+ "grad_norm": 3.303345203399658,
+ "learning_rate": 8.451399667949917e-08,
+ "loss": 0.2991,
+ "step": 11645
+ },
+ {
+ "epoch": 5.506382978723404,
+ "grad_norm": 3.18681001663208,
+ "learning_rate": 8.435324016646734e-08,
+ "loss": 0.2794,
+ "step": 11646
+ },
+ {
+ "epoch": 5.506855791962175,
+ "grad_norm": 3.742612361907959,
+ "learning_rate": 8.419263406516692e-08,
+ "loss": 0.319,
+ "step": 11647
+ },
+ {
+ "epoch": 5.507328605200946,
+ "grad_norm": 3.2310562133789062,
+ "learning_rate": 8.403217838559796e-08,
+ "loss": 0.2719,
+ "step": 11648
+ },
+ {
+ "epoch": 5.507801418439716,
+ "grad_norm": 3.409172773361206,
+ "learning_rate": 8.387187313775191e-08,
+ "loss": 0.3047,
+ "step": 11649
+ },
+ {
+ "epoch": 5.508274231678487,
+ "grad_norm": 2.940403699874878,
+ "learning_rate": 8.371171833160996e-08,
+ "loss": 0.2628,
+ "step": 11650
+ },
+ {
+ "epoch": 5.508747044917258,
+ "grad_norm": 3.0617125034332275,
+ "learning_rate": 8.355171397714413e-08,
+ "loss": 0.3272,
+ "step": 11651
+ },
+ {
+ "epoch": 5.509219858156028,
+ "grad_norm": 3.500164270401001,
+ "learning_rate": 8.339186008431726e-08,
+ "loss": 0.3373,
+ "step": 11652
+ },
+ {
+ "epoch": 5.509692671394799,
+ "grad_norm": 2.9918899536132812,
+ "learning_rate": 8.323215666308227e-08,
+ "loss": 0.2424,
+ "step": 11653
+ },
+ {
+ "epoch": 5.5101654846335695,
+ "grad_norm": 3.025858163833618,
+ "learning_rate": 8.307260372338421e-08,
+ "loss": 0.3076,
+ "step": 11654
+ },
+ {
+ "epoch": 5.51063829787234,
+ "grad_norm": 3.5850777626037598,
+ "learning_rate": 8.291320127515684e-08,
+ "loss": 0.4006,
+ "step": 11655
+ },
+ {
+ "epoch": 5.511111111111111,
+ "grad_norm": 3.1886472702026367,
+ "learning_rate": 8.275394932832609e-08,
+ "loss": 0.3171,
+ "step": 11656
+ },
+ {
+ "epoch": 5.511583924349882,
+ "grad_norm": 3.2190792560577393,
+ "learning_rate": 8.259484789280763e-08,
+ "loss": 0.346,
+ "step": 11657
+ },
+ {
+ "epoch": 5.512056737588653,
+ "grad_norm": 3.3583483695983887,
+ "learning_rate": 8.243589697850774e-08,
+ "loss": 0.3635,
+ "step": 11658
+ },
+ {
+ "epoch": 5.5125295508274235,
+ "grad_norm": 3.2143733501434326,
+ "learning_rate": 8.227709659532429e-08,
+ "loss": 0.2806,
+ "step": 11659
+ },
+ {
+ "epoch": 5.513002364066194,
+ "grad_norm": 3.288947582244873,
+ "learning_rate": 8.211844675314412e-08,
+ "loss": 0.2995,
+ "step": 11660
+ },
+ {
+ "epoch": 5.513475177304964,
+ "grad_norm": 3.106860637664795,
+ "learning_rate": 8.195994746184655e-08,
+ "loss": 0.3263,
+ "step": 11661
+ },
+ {
+ "epoch": 5.513947990543735,
+ "grad_norm": 3.416684627532959,
+ "learning_rate": 8.180159873130006e-08,
+ "loss": 0.327,
+ "step": 11662
+ },
+ {
+ "epoch": 5.514420803782506,
+ "grad_norm": 3.3379695415496826,
+ "learning_rate": 8.164340057136455e-08,
+ "loss": 0.3585,
+ "step": 11663
+ },
+ {
+ "epoch": 5.514893617021277,
+ "grad_norm": 3.3482465744018555,
+ "learning_rate": 8.148535299189048e-08,
+ "loss": 0.3341,
+ "step": 11664
+ },
+ {
+ "epoch": 5.515366430260047,
+ "grad_norm": 3.6307144165039062,
+ "learning_rate": 8.132745600271858e-08,
+ "loss": 0.3141,
+ "step": 11665
+ },
+ {
+ "epoch": 5.515839243498818,
+ "grad_norm": 3.4326119422912598,
+ "learning_rate": 8.116970961368042e-08,
+ "loss": 0.3056,
+ "step": 11666
+ },
+ {
+ "epoch": 5.516312056737589,
+ "grad_norm": 3.681443452835083,
+ "learning_rate": 8.101211383459817e-08,
+ "loss": 0.2637,
+ "step": 11667
+ },
+ {
+ "epoch": 5.516784869976359,
+ "grad_norm": 3.742844343185425,
+ "learning_rate": 8.085466867528452e-08,
+ "loss": 0.3315,
+ "step": 11668
+ },
+ {
+ "epoch": 5.51725768321513,
+ "grad_norm": 3.017211675643921,
+ "learning_rate": 8.069737414554302e-08,
+ "loss": 0.3134,
+ "step": 11669
+ },
+ {
+ "epoch": 5.5177304964539005,
+ "grad_norm": 3.2987756729125977,
+ "learning_rate": 8.054023025516805e-08,
+ "loss": 0.3359,
+ "step": 11670
+ },
+ {
+ "epoch": 5.518203309692671,
+ "grad_norm": 3.357034683227539,
+ "learning_rate": 8.038323701394346e-08,
+ "loss": 0.2902,
+ "step": 11671
+ },
+ {
+ "epoch": 5.518676122931442,
+ "grad_norm": 3.073740005493164,
+ "learning_rate": 8.022639443164531e-08,
+ "loss": 0.3432,
+ "step": 11672
+ },
+ {
+ "epoch": 5.519148936170213,
+ "grad_norm": 3.571768045425415,
+ "learning_rate": 8.006970251803886e-08,
+ "loss": 0.3142,
+ "step": 11673
+ },
+ {
+ "epoch": 5.519621749408984,
+ "grad_norm": 3.1152892112731934,
+ "learning_rate": 7.991316128288129e-08,
+ "loss": 0.2887,
+ "step": 11674
+ },
+ {
+ "epoch": 5.520094562647754,
+ "grad_norm": 3.6066105365753174,
+ "learning_rate": 7.975677073591925e-08,
+ "loss": 0.3432,
+ "step": 11675
+ },
+ {
+ "epoch": 5.520567375886525,
+ "grad_norm": 3.2668449878692627,
+ "learning_rate": 7.960053088689052e-08,
+ "loss": 0.3382,
+ "step": 11676
+ },
+ {
+ "epoch": 5.521040189125295,
+ "grad_norm": 2.9738216400146484,
+ "learning_rate": 7.944444174552395e-08,
+ "loss": 0.3304,
+ "step": 11677
+ },
+ {
+ "epoch": 5.521513002364066,
+ "grad_norm": 3.436067819595337,
+ "learning_rate": 7.928850332153793e-08,
+ "loss": 0.2707,
+ "step": 11678
+ },
+ {
+ "epoch": 5.521985815602837,
+ "grad_norm": 3.2292418479919434,
+ "learning_rate": 7.913271562464242e-08,
+ "loss": 0.3526,
+ "step": 11679
+ },
+ {
+ "epoch": 5.5224586288416075,
+ "grad_norm": 3.575700044631958,
+ "learning_rate": 7.897707866453746e-08,
+ "loss": 0.3719,
+ "step": 11680
+ },
+ {
+ "epoch": 5.522931442080378,
+ "grad_norm": 3.5845069885253906,
+ "learning_rate": 7.88215924509142e-08,
+ "loss": 0.3398,
+ "step": 11681
+ },
+ {
+ "epoch": 5.523404255319149,
+ "grad_norm": 3.5110747814178467,
+ "learning_rate": 7.866625699345376e-08,
+ "loss": 0.3762,
+ "step": 11682
+ },
+ {
+ "epoch": 5.52387706855792,
+ "grad_norm": 3.4815001487731934,
+ "learning_rate": 7.851107230182786e-08,
+ "loss": 0.3138,
+ "step": 11683
+ },
+ {
+ "epoch": 5.52434988179669,
+ "grad_norm": 3.2306344509124756,
+ "learning_rate": 7.835603838569989e-08,
+ "loss": 0.3055,
+ "step": 11684
+ },
+ {
+ "epoch": 5.524822695035461,
+ "grad_norm": 3.54038143157959,
+ "learning_rate": 7.820115525472294e-08,
+ "loss": 0.3291,
+ "step": 11685
+ },
+ {
+ "epoch": 5.525295508274231,
+ "grad_norm": 3.423480987548828,
+ "learning_rate": 7.804642291854043e-08,
+ "loss": 0.3283,
+ "step": 11686
+ },
+ {
+ "epoch": 5.525768321513002,
+ "grad_norm": 3.666048765182495,
+ "learning_rate": 7.78918413867874e-08,
+ "loss": 0.3376,
+ "step": 11687
+ },
+ {
+ "epoch": 5.526241134751773,
+ "grad_norm": 2.820924997329712,
+ "learning_rate": 7.773741066908896e-08,
+ "loss": 0.3115,
+ "step": 11688
+ },
+ {
+ "epoch": 5.526713947990544,
+ "grad_norm": 3.3126847743988037,
+ "learning_rate": 7.758313077506018e-08,
+ "loss": 0.3428,
+ "step": 11689
+ },
+ {
+ "epoch": 5.527186761229315,
+ "grad_norm": 3.6306893825531006,
+ "learning_rate": 7.742900171430839e-08,
+ "loss": 0.3362,
+ "step": 11690
+ },
+ {
+ "epoch": 5.527659574468085,
+ "grad_norm": 3.184955358505249,
+ "learning_rate": 7.727502349642952e-08,
+ "loss": 0.3129,
+ "step": 11691
+ },
+ {
+ "epoch": 5.528132387706856,
+ "grad_norm": 3.0288753509521484,
+ "learning_rate": 7.712119613101199e-08,
+ "loss": 0.2864,
+ "step": 11692
+ },
+ {
+ "epoch": 5.528605200945626,
+ "grad_norm": 3.0057592391967773,
+ "learning_rate": 7.696751962763343e-08,
+ "loss": 0.3508,
+ "step": 11693
+ },
+ {
+ "epoch": 5.529078014184397,
+ "grad_norm": 3.4207417964935303,
+ "learning_rate": 7.68139939958626e-08,
+ "loss": 0.3392,
+ "step": 11694
+ },
+ {
+ "epoch": 5.529550827423168,
+ "grad_norm": 3.4315693378448486,
+ "learning_rate": 7.66606192452593e-08,
+ "loss": 0.3296,
+ "step": 11695
+ },
+ {
+ "epoch": 5.5300236406619385,
+ "grad_norm": 3.496645927429199,
+ "learning_rate": 7.650739538537317e-08,
+ "loss": 0.3628,
+ "step": 11696
+ },
+ {
+ "epoch": 5.530496453900709,
+ "grad_norm": 3.160109281539917,
+ "learning_rate": 7.635432242574486e-08,
+ "loss": 0.3529,
+ "step": 11697
+ },
+ {
+ "epoch": 5.53096926713948,
+ "grad_norm": 3.434227228164673,
+ "learning_rate": 7.62014003759054e-08,
+ "loss": 0.3261,
+ "step": 11698
+ },
+ {
+ "epoch": 5.531442080378251,
+ "grad_norm": 3.334365129470825,
+ "learning_rate": 7.604862924537659e-08,
+ "loss": 0.3512,
+ "step": 11699
+ },
+ {
+ "epoch": 5.531914893617021,
+ "grad_norm": 3.228555917739868,
+ "learning_rate": 7.589600904367111e-08,
+ "loss": 0.3433,
+ "step": 11700
+ },
+ {
+ "epoch": 5.532387706855792,
+ "grad_norm": 3.4976108074188232,
+ "learning_rate": 7.574353978029164e-08,
+ "loss": 0.3406,
+ "step": 11701
+ },
+ {
+ "epoch": 5.532860520094562,
+ "grad_norm": 3.207275152206421,
+ "learning_rate": 7.559122146473197e-08,
+ "loss": 0.2936,
+ "step": 11702
+ },
+ {
+ "epoch": 5.533333333333333,
+ "grad_norm": 3.077650547027588,
+ "learning_rate": 7.543905410647645e-08,
+ "loss": 0.3193,
+ "step": 11703
+ },
+ {
+ "epoch": 5.533806146572104,
+ "grad_norm": 3.0646450519561768,
+ "learning_rate": 7.528703771499918e-08,
+ "loss": 0.2966,
+ "step": 11704
+ },
+ {
+ "epoch": 5.534278959810875,
+ "grad_norm": 3.450676202774048,
+ "learning_rate": 7.513517229976646e-08,
+ "loss": 0.335,
+ "step": 11705
+ },
+ {
+ "epoch": 5.534751773049646,
+ "grad_norm": 3.3997511863708496,
+ "learning_rate": 7.498345787023353e-08,
+ "loss": 0.3309,
+ "step": 11706
+ },
+ {
+ "epoch": 5.535224586288416,
+ "grad_norm": 3.0745410919189453,
+ "learning_rate": 7.483189443584754e-08,
+ "loss": 0.3025,
+ "step": 11707
+ },
+ {
+ "epoch": 5.535697399527187,
+ "grad_norm": 3.0367109775543213,
+ "learning_rate": 7.468048200604539e-08,
+ "loss": 0.3248,
+ "step": 11708
+ },
+ {
+ "epoch": 5.536170212765957,
+ "grad_norm": 3.1814827919006348,
+ "learning_rate": 7.45292205902548e-08,
+ "loss": 0.32,
+ "step": 11709
+ },
+ {
+ "epoch": 5.536643026004728,
+ "grad_norm": 3.1039483547210693,
+ "learning_rate": 7.437811019789437e-08,
+ "loss": 0.2985,
+ "step": 11710
+ },
+ {
+ "epoch": 5.537115839243499,
+ "grad_norm": 3.4284253120422363,
+ "learning_rate": 7.422715083837296e-08,
+ "loss": 0.3141,
+ "step": 11711
+ },
+ {
+ "epoch": 5.5375886524822695,
+ "grad_norm": 2.922107458114624,
+ "learning_rate": 7.407634252109025e-08,
+ "loss": 0.3227,
+ "step": 11712
+ },
+ {
+ "epoch": 5.53806146572104,
+ "grad_norm": 2.925189733505249,
+ "learning_rate": 7.392568525543625e-08,
+ "loss": 0.2847,
+ "step": 11713
+ },
+ {
+ "epoch": 5.538534278959811,
+ "grad_norm": 3.3388655185699463,
+ "learning_rate": 7.377517905079179e-08,
+ "loss": 0.3311,
+ "step": 11714
+ },
+ {
+ "epoch": 5.539007092198582,
+ "grad_norm": 3.5923469066619873,
+ "learning_rate": 7.362482391652853e-08,
+ "loss": 0.2915,
+ "step": 11715
+ },
+ {
+ "epoch": 5.539479905437352,
+ "grad_norm": 3.1141269207000732,
+ "learning_rate": 7.347461986200788e-08,
+ "loss": 0.2816,
+ "step": 11716
+ },
+ {
+ "epoch": 5.539952718676123,
+ "grad_norm": 3.2038733959198,
+ "learning_rate": 7.332456689658263e-08,
+ "loss": 0.3387,
+ "step": 11717
+ },
+ {
+ "epoch": 5.540425531914893,
+ "grad_norm": 3.1562764644622803,
+ "learning_rate": 7.317466502959585e-08,
+ "loss": 0.3439,
+ "step": 11718
+ },
+ {
+ "epoch": 5.540898345153664,
+ "grad_norm": 3.1141879558563232,
+ "learning_rate": 7.302491427038149e-08,
+ "loss": 0.3039,
+ "step": 11719
+ },
+ {
+ "epoch": 5.541371158392435,
+ "grad_norm": 3.2877326011657715,
+ "learning_rate": 7.287531462826375e-08,
+ "loss": 0.2694,
+ "step": 11720
+ },
+ {
+ "epoch": 5.541843971631206,
+ "grad_norm": 3.3915371894836426,
+ "learning_rate": 7.272586611255766e-08,
+ "loss": 0.3022,
+ "step": 11721
+ },
+ {
+ "epoch": 5.542316784869977,
+ "grad_norm": 2.9387362003326416,
+ "learning_rate": 7.257656873256858e-08,
+ "loss": 0.3702,
+ "step": 11722
+ },
+ {
+ "epoch": 5.542789598108747,
+ "grad_norm": 3.7734436988830566,
+ "learning_rate": 7.242742249759265e-08,
+ "loss": 0.3237,
+ "step": 11723
+ },
+ {
+ "epoch": 5.543262411347518,
+ "grad_norm": 3.3691086769104004,
+ "learning_rate": 7.22784274169161e-08,
+ "loss": 0.3418,
+ "step": 11724
+ },
+ {
+ "epoch": 5.543735224586288,
+ "grad_norm": 3.0379209518432617,
+ "learning_rate": 7.21295834998173e-08,
+ "loss": 0.3512,
+ "step": 11725
+ },
+ {
+ "epoch": 5.544208037825059,
+ "grad_norm": 3.5555078983306885,
+ "learning_rate": 7.198089075556302e-08,
+ "loss": 0.3474,
+ "step": 11726
+ },
+ {
+ "epoch": 5.54468085106383,
+ "grad_norm": 3.1998801231384277,
+ "learning_rate": 7.183234919341226e-08,
+ "loss": 0.3437,
+ "step": 11727
+ },
+ {
+ "epoch": 5.5451536643026005,
+ "grad_norm": 3.2798357009887695,
+ "learning_rate": 7.168395882261397e-08,
+ "loss": 0.3047,
+ "step": 11728
+ },
+ {
+ "epoch": 5.545626477541371,
+ "grad_norm": 3.1311612129211426,
+ "learning_rate": 7.153571965240774e-08,
+ "loss": 0.3099,
+ "step": 11729
+ },
+ {
+ "epoch": 5.546099290780142,
+ "grad_norm": 3.3740992546081543,
+ "learning_rate": 7.138763169202367e-08,
+ "loss": 0.3337,
+ "step": 11730
+ },
+ {
+ "epoch": 5.546572104018913,
+ "grad_norm": 3.4915285110473633,
+ "learning_rate": 7.123969495068272e-08,
+ "loss": 0.3462,
+ "step": 11731
+ },
+ {
+ "epoch": 5.547044917257683,
+ "grad_norm": 3.236283540725708,
+ "learning_rate": 7.109190943759615e-08,
+ "loss": 0.2688,
+ "step": 11732
+ },
+ {
+ "epoch": 5.547517730496454,
+ "grad_norm": 2.8927080631256104,
+ "learning_rate": 7.094427516196601e-08,
+ "loss": 0.3078,
+ "step": 11733
+ },
+ {
+ "epoch": 5.547990543735224,
+ "grad_norm": 2.8691484928131104,
+ "learning_rate": 7.079679213298468e-08,
+ "loss": 0.265,
+ "step": 11734
+ },
+ {
+ "epoch": 5.548463356973995,
+ "grad_norm": 2.9787049293518066,
+ "learning_rate": 7.064946035983511e-08,
+ "loss": 0.2901,
+ "step": 11735
+ },
+ {
+ "epoch": 5.548936170212766,
+ "grad_norm": 2.8145298957824707,
+ "learning_rate": 7.050227985169161e-08,
+ "loss": 0.2754,
+ "step": 11736
+ },
+ {
+ "epoch": 5.549408983451537,
+ "grad_norm": 3.2856605052948,
+ "learning_rate": 7.035525061771769e-08,
+ "loss": 0.355,
+ "step": 11737
+ },
+ {
+ "epoch": 5.549881796690308,
+ "grad_norm": 2.770224094390869,
+ "learning_rate": 7.020837266706909e-08,
+ "loss": 0.2802,
+ "step": 11738
+ },
+ {
+ "epoch": 5.550354609929078,
+ "grad_norm": 2.7485382556915283,
+ "learning_rate": 7.00616460088907e-08,
+ "loss": 0.2839,
+ "step": 11739
+ },
+ {
+ "epoch": 5.550827423167849,
+ "grad_norm": 3.6368305683135986,
+ "learning_rate": 6.991507065231857e-08,
+ "loss": 0.3458,
+ "step": 11740
+ },
+ {
+ "epoch": 5.551300236406619,
+ "grad_norm": 3.0573692321777344,
+ "learning_rate": 6.976864660647925e-08,
+ "loss": 0.3257,
+ "step": 11741
+ },
+ {
+ "epoch": 5.55177304964539,
+ "grad_norm": 3.5397047996520996,
+ "learning_rate": 6.96223738804902e-08,
+ "loss": 0.3507,
+ "step": 11742
+ },
+ {
+ "epoch": 5.552245862884161,
+ "grad_norm": 3.4442644119262695,
+ "learning_rate": 6.947625248345912e-08,
+ "loss": 0.3329,
+ "step": 11743
+ },
+ {
+ "epoch": 5.5527186761229315,
+ "grad_norm": 3.613973617553711,
+ "learning_rate": 6.93302824244843e-08,
+ "loss": 0.3207,
+ "step": 11744
+ },
+ {
+ "epoch": 5.553191489361702,
+ "grad_norm": 3.704705238342285,
+ "learning_rate": 6.918446371265458e-08,
+ "loss": 0.2735,
+ "step": 11745
+ },
+ {
+ "epoch": 5.553664302600473,
+ "grad_norm": 3.4365522861480713,
+ "learning_rate": 6.903879635704963e-08,
+ "loss": 0.3157,
+ "step": 11746
+ },
+ {
+ "epoch": 5.554137115839244,
+ "grad_norm": 3.4732930660247803,
+ "learning_rate": 6.889328036673914e-08,
+ "loss": 0.3197,
+ "step": 11747
+ },
+ {
+ "epoch": 5.554609929078014,
+ "grad_norm": 3.4623477458953857,
+ "learning_rate": 6.874791575078421e-08,
+ "loss": 0.3476,
+ "step": 11748
+ },
+ {
+ "epoch": 5.555082742316785,
+ "grad_norm": 3.1602671146392822,
+ "learning_rate": 6.860270251823564e-08,
+ "loss": 0.3074,
+ "step": 11749
+ },
+ {
+ "epoch": 5.555555555555555,
+ "grad_norm": 4.155186653137207,
+ "learning_rate": 6.845764067813538e-08,
+ "loss": 0.3359,
+ "step": 11750
+ },
+ {
+ "epoch": 5.556028368794326,
+ "grad_norm": 3.526486396789551,
+ "learning_rate": 6.831273023951618e-08,
+ "loss": 0.3285,
+ "step": 11751
+ },
+ {
+ "epoch": 5.556501182033097,
+ "grad_norm": 3.4824352264404297,
+ "learning_rate": 6.816797121140029e-08,
+ "loss": 0.315,
+ "step": 11752
+ },
+ {
+ "epoch": 5.556973995271868,
+ "grad_norm": 3.783975601196289,
+ "learning_rate": 6.802336360280187e-08,
+ "loss": 0.3279,
+ "step": 11753
+ },
+ {
+ "epoch": 5.5574468085106385,
+ "grad_norm": 3.435145378112793,
+ "learning_rate": 6.787890742272457e-08,
+ "loss": 0.3309,
+ "step": 11754
+ },
+ {
+ "epoch": 5.557919621749409,
+ "grad_norm": 3.1065540313720703,
+ "learning_rate": 6.773460268016285e-08,
+ "loss": 0.3235,
+ "step": 11755
+ },
+ {
+ "epoch": 5.55839243498818,
+ "grad_norm": 3.195692539215088,
+ "learning_rate": 6.759044938410287e-08,
+ "loss": 0.2965,
+ "step": 11756
+ },
+ {
+ "epoch": 5.55886524822695,
+ "grad_norm": 3.553255796432495,
+ "learning_rate": 6.744644754351909e-08,
+ "loss": 0.3367,
+ "step": 11757
+ },
+ {
+ "epoch": 5.559338061465721,
+ "grad_norm": 3.1541762351989746,
+ "learning_rate": 6.730259716737908e-08,
+ "loss": 0.3525,
+ "step": 11758
+ },
+ {
+ "epoch": 5.559810874704492,
+ "grad_norm": 3.8875532150268555,
+ "learning_rate": 6.715889826463956e-08,
+ "loss": 0.3558,
+ "step": 11759
+ },
+ {
+ "epoch": 5.560283687943262,
+ "grad_norm": 3.260861396789551,
+ "learning_rate": 6.701535084424726e-08,
+ "loss": 0.29,
+ "step": 11760
+ },
+ {
+ "epoch": 5.560756501182033,
+ "grad_norm": 3.1773221492767334,
+ "learning_rate": 6.687195491514114e-08,
+ "loss": 0.2756,
+ "step": 11761
+ },
+ {
+ "epoch": 5.561229314420804,
+ "grad_norm": 3.6206512451171875,
+ "learning_rate": 6.672871048624934e-08,
+ "loss": 0.3412,
+ "step": 11762
+ },
+ {
+ "epoch": 5.561702127659575,
+ "grad_norm": 3.5456814765930176,
+ "learning_rate": 6.658561756649084e-08,
+ "loss": 0.3552,
+ "step": 11763
+ },
+ {
+ "epoch": 5.562174940898345,
+ "grad_norm": 3.60017991065979,
+ "learning_rate": 6.6442676164776e-08,
+ "loss": 0.358,
+ "step": 11764
+ },
+ {
+ "epoch": 5.5626477541371155,
+ "grad_norm": 3.1833202838897705,
+ "learning_rate": 6.629988629000467e-08,
+ "loss": 0.2945,
+ "step": 11765
+ },
+ {
+ "epoch": 5.563120567375886,
+ "grad_norm": 4.464661598205566,
+ "learning_rate": 6.615724795106803e-08,
+ "loss": 0.3482,
+ "step": 11766
+ },
+ {
+ "epoch": 5.563593380614657,
+ "grad_norm": 3.1042027473449707,
+ "learning_rate": 6.601476115684762e-08,
+ "loss": 0.358,
+ "step": 11767
+ },
+ {
+ "epoch": 5.564066193853428,
+ "grad_norm": 3.243957281112671,
+ "learning_rate": 6.587242591621524e-08,
+ "loss": 0.2892,
+ "step": 11768
+ },
+ {
+ "epoch": 5.564539007092199,
+ "grad_norm": 5.185178279876709,
+ "learning_rate": 6.573024223803349e-08,
+ "loss": 0.2877,
+ "step": 11769
+ },
+ {
+ "epoch": 5.5650118203309695,
+ "grad_norm": 3.488581657409668,
+ "learning_rate": 6.558821013115557e-08,
+ "loss": 0.3234,
+ "step": 11770
+ },
+ {
+ "epoch": 5.56548463356974,
+ "grad_norm": 2.9220681190490723,
+ "learning_rate": 6.544632960442554e-08,
+ "loss": 0.2763,
+ "step": 11771
+ },
+ {
+ "epoch": 5.565957446808511,
+ "grad_norm": 3.5609130859375,
+ "learning_rate": 6.530460066667715e-08,
+ "loss": 0.3491,
+ "step": 11772
+ },
+ {
+ "epoch": 5.566430260047281,
+ "grad_norm": 3.2538235187530518,
+ "learning_rate": 6.516302332673558e-08,
+ "loss": 0.3427,
+ "step": 11773
+ },
+ {
+ "epoch": 5.566903073286052,
+ "grad_norm": 3.371915817260742,
+ "learning_rate": 6.502159759341598e-08,
+ "loss": 0.33,
+ "step": 11774
+ },
+ {
+ "epoch": 5.567375886524823,
+ "grad_norm": 3.090161085128784,
+ "learning_rate": 6.488032347552464e-08,
+ "loss": 0.306,
+ "step": 11775
+ },
+ {
+ "epoch": 5.567848699763593,
+ "grad_norm": 3.563584566116333,
+ "learning_rate": 6.473920098185787e-08,
+ "loss": 0.3073,
+ "step": 11776
+ },
+ {
+ "epoch": 5.568321513002364,
+ "grad_norm": 3.434272050857544,
+ "learning_rate": 6.459823012120226e-08,
+ "loss": 0.3244,
+ "step": 11777
+ },
+ {
+ "epoch": 5.568794326241135,
+ "grad_norm": 3.05387544631958,
+ "learning_rate": 6.445741090233659e-08,
+ "loss": 0.3239,
+ "step": 11778
+ },
+ {
+ "epoch": 5.569267139479906,
+ "grad_norm": 3.3350319862365723,
+ "learning_rate": 6.431674333402804e-08,
+ "loss": 0.2885,
+ "step": 11779
+ },
+ {
+ "epoch": 5.569739952718676,
+ "grad_norm": 3.7317748069763184,
+ "learning_rate": 6.417622742503571e-08,
+ "loss": 0.4105,
+ "step": 11780
+ },
+ {
+ "epoch": 5.5702127659574465,
+ "grad_norm": 3.36405873298645,
+ "learning_rate": 6.403586318410899e-08,
+ "loss": 0.2809,
+ "step": 11781
+ },
+ {
+ "epoch": 5.570685579196217,
+ "grad_norm": 3.437410831451416,
+ "learning_rate": 6.389565061998781e-08,
+ "loss": 0.3327,
+ "step": 11782
+ },
+ {
+ "epoch": 5.571158392434988,
+ "grad_norm": 3.6284446716308594,
+ "learning_rate": 6.375558974140244e-08,
+ "loss": 0.3734,
+ "step": 11783
+ },
+ {
+ "epoch": 5.571631205673759,
+ "grad_norm": 3.400245428085327,
+ "learning_rate": 6.361568055707367e-08,
+ "loss": 0.3404,
+ "step": 11784
+ },
+ {
+ "epoch": 5.57210401891253,
+ "grad_norm": 3.2299258708953857,
+ "learning_rate": 6.347592307571371e-08,
+ "loss": 0.2886,
+ "step": 11785
+ },
+ {
+ "epoch": 5.5725768321513005,
+ "grad_norm": 3.069549798965454,
+ "learning_rate": 6.333631730602364e-08,
+ "loss": 0.3234,
+ "step": 11786
+ },
+ {
+ "epoch": 5.573049645390071,
+ "grad_norm": 3.2106802463531494,
+ "learning_rate": 6.319686325669705e-08,
+ "loss": 0.3116,
+ "step": 11787
+ },
+ {
+ "epoch": 5.573522458628842,
+ "grad_norm": 3.651440143585205,
+ "learning_rate": 6.305756093641646e-08,
+ "loss": 0.3248,
+ "step": 11788
+ },
+ {
+ "epoch": 5.573995271867612,
+ "grad_norm": 3.5746328830718994,
+ "learning_rate": 6.291841035385605e-08,
+ "loss": 0.3589,
+ "step": 11789
+ },
+ {
+ "epoch": 5.574468085106383,
+ "grad_norm": 3.3536272048950195,
+ "learning_rate": 6.277941151767997e-08,
+ "loss": 0.3187,
+ "step": 11790
+ },
+ {
+ "epoch": 5.574940898345154,
+ "grad_norm": 3.221473455429077,
+ "learning_rate": 6.264056443654271e-08,
+ "loss": 0.3281,
+ "step": 11791
+ },
+ {
+ "epoch": 5.575413711583924,
+ "grad_norm": 3.3866493701934814,
+ "learning_rate": 6.25018691190904e-08,
+ "loss": 0.3696,
+ "step": 11792
+ },
+ {
+ "epoch": 5.575886524822695,
+ "grad_norm": 4.225553512573242,
+ "learning_rate": 6.236332557395835e-08,
+ "loss": 0.3851,
+ "step": 11793
+ },
+ {
+ "epoch": 5.576359338061466,
+ "grad_norm": 2.9920523166656494,
+ "learning_rate": 6.222493380977357e-08,
+ "loss": 0.3036,
+ "step": 11794
+ },
+ {
+ "epoch": 5.576832151300237,
+ "grad_norm": 3.26487135887146,
+ "learning_rate": 6.208669383515276e-08,
+ "loss": 0.3429,
+ "step": 11795
+ },
+ {
+ "epoch": 5.577304964539007,
+ "grad_norm": 3.1462907791137695,
+ "learning_rate": 6.19486056587032e-08,
+ "loss": 0.3068,
+ "step": 11796
+ },
+ {
+ "epoch": 5.5777777777777775,
+ "grad_norm": 2.806856155395508,
+ "learning_rate": 6.181066928902385e-08,
+ "loss": 0.2884,
+ "step": 11797
+ },
+ {
+ "epoch": 5.578250591016548,
+ "grad_norm": 3.231782913208008,
+ "learning_rate": 6.167288473470284e-08,
+ "loss": 0.3355,
+ "step": 11798
+ },
+ {
+ "epoch": 5.578723404255319,
+ "grad_norm": 3.313009023666382,
+ "learning_rate": 6.15352520043197e-08,
+ "loss": 0.3334,
+ "step": 11799
+ },
+ {
+ "epoch": 5.57919621749409,
+ "grad_norm": 3.366187572479248,
+ "learning_rate": 6.139777110644397e-08,
+ "loss": 0.336,
+ "step": 11800
+ },
+ {
+ "epoch": 5.579669030732861,
+ "grad_norm": 3.2782907485961914,
+ "learning_rate": 6.126044204963572e-08,
+ "loss": 0.3261,
+ "step": 11801
+ },
+ {
+ "epoch": 5.5801418439716315,
+ "grad_norm": 2.9769949913024902,
+ "learning_rate": 6.112326484244674e-08,
+ "loss": 0.3018,
+ "step": 11802
+ },
+ {
+ "epoch": 5.580614657210402,
+ "grad_norm": 3.3499436378479004,
+ "learning_rate": 6.098623949341743e-08,
+ "loss": 0.3115,
+ "step": 11803
+ },
+ {
+ "epoch": 5.581087470449172,
+ "grad_norm": 3.1725752353668213,
+ "learning_rate": 6.084936601108066e-08,
+ "loss": 0.3284,
+ "step": 11804
+ },
+ {
+ "epoch": 5.581560283687943,
+ "grad_norm": 3.5985076427459717,
+ "learning_rate": 6.071264440395852e-08,
+ "loss": 0.3121,
+ "step": 11805
+ },
+ {
+ "epoch": 5.582033096926714,
+ "grad_norm": 3.3965110778808594,
+ "learning_rate": 6.057607468056365e-08,
+ "loss": 0.2935,
+ "step": 11806
+ },
+ {
+ "epoch": 5.582505910165485,
+ "grad_norm": 3.3994903564453125,
+ "learning_rate": 6.043965684940034e-08,
+ "loss": 0.3089,
+ "step": 11807
+ },
+ {
+ "epoch": 5.582978723404255,
+ "grad_norm": 3.0140533447265625,
+ "learning_rate": 6.030339091896265e-08,
+ "loss": 0.2828,
+ "step": 11808
+ },
+ {
+ "epoch": 5.583451536643026,
+ "grad_norm": 3.218672752380371,
+ "learning_rate": 6.016727689773488e-08,
+ "loss": 0.3021,
+ "step": 11809
+ },
+ {
+ "epoch": 5.583924349881797,
+ "grad_norm": 3.5624725818634033,
+ "learning_rate": 6.00313147941925e-08,
+ "loss": 0.3645,
+ "step": 11810
+ },
+ {
+ "epoch": 5.584397163120567,
+ "grad_norm": 3.484443187713623,
+ "learning_rate": 5.989550461680093e-08,
+ "loss": 0.3595,
+ "step": 11811
+ },
+ {
+ "epoch": 5.584869976359338,
+ "grad_norm": 3.805352210998535,
+ "learning_rate": 5.975984637401677e-08,
+ "loss": 0.3161,
+ "step": 11812
+ },
+ {
+ "epoch": 5.5853427895981085,
+ "grad_norm": 3.4212005138397217,
+ "learning_rate": 5.962434007428714e-08,
+ "loss": 0.3195,
+ "step": 11813
+ },
+ {
+ "epoch": 5.585815602836879,
+ "grad_norm": 3.2629356384277344,
+ "learning_rate": 5.9488985726048885e-08,
+ "loss": 0.3264,
+ "step": 11814
+ },
+ {
+ "epoch": 5.58628841607565,
+ "grad_norm": 2.9457015991210938,
+ "learning_rate": 5.9353783337730284e-08,
+ "loss": 0.3001,
+ "step": 11815
+ },
+ {
+ "epoch": 5.586761229314421,
+ "grad_norm": 3.5118699073791504,
+ "learning_rate": 5.921873291774932e-08,
+ "loss": 0.3388,
+ "step": 11816
+ },
+ {
+ "epoch": 5.587234042553192,
+ "grad_norm": 3.5850439071655273,
+ "learning_rate": 5.908383447451593e-08,
+ "loss": 0.2999,
+ "step": 11817
+ },
+ {
+ "epoch": 5.5877068557919625,
+ "grad_norm": 3.5888917446136475,
+ "learning_rate": 5.8949088016428954e-08,
+ "loss": 0.3797,
+ "step": 11818
+ },
+ {
+ "epoch": 5.588179669030733,
+ "grad_norm": 3.3090274333953857,
+ "learning_rate": 5.881449355187807e-08,
+ "loss": 0.2644,
+ "step": 11819
+ },
+ {
+ "epoch": 5.588652482269503,
+ "grad_norm": 3.3722126483917236,
+ "learning_rate": 5.8680051089244906e-08,
+ "loss": 0.3319,
+ "step": 11820
+ },
+ {
+ "epoch": 5.589125295508274,
+ "grad_norm": 3.2284746170043945,
+ "learning_rate": 5.8545760636899985e-08,
+ "loss": 0.3166,
+ "step": 11821
+ },
+ {
+ "epoch": 5.589598108747045,
+ "grad_norm": 3.022043228149414,
+ "learning_rate": 5.841162220320496e-08,
+ "loss": 0.3077,
+ "step": 11822
+ },
+ {
+ "epoch": 5.590070921985816,
+ "grad_norm": 3.0781049728393555,
+ "learning_rate": 5.8277635796512574e-08,
+ "loss": 0.3499,
+ "step": 11823
+ },
+ {
+ "epoch": 5.590543735224586,
+ "grad_norm": 2.954606294631958,
+ "learning_rate": 5.8143801425164794e-08,
+ "loss": 0.2903,
+ "step": 11824
+ },
+ {
+ "epoch": 5.591016548463357,
+ "grad_norm": 3.109799861907959,
+ "learning_rate": 5.801011909749549e-08,
+ "loss": 0.3418,
+ "step": 11825
+ },
+ {
+ "epoch": 5.591489361702128,
+ "grad_norm": 3.096015214920044,
+ "learning_rate": 5.7876588821828003e-08,
+ "loss": 0.296,
+ "step": 11826
+ },
+ {
+ "epoch": 5.591962174940898,
+ "grad_norm": 3.375032663345337,
+ "learning_rate": 5.774321060647708e-08,
+ "loss": 0.2903,
+ "step": 11827
+ },
+ {
+ "epoch": 5.592434988179669,
+ "grad_norm": 3.2566537857055664,
+ "learning_rate": 5.7609984459747745e-08,
+ "loss": 0.3486,
+ "step": 11828
+ },
+ {
+ "epoch": 5.5929078014184395,
+ "grad_norm": 3.127394914627075,
+ "learning_rate": 5.747691038993475e-08,
+ "loss": 0.3028,
+ "step": 11829
+ },
+ {
+ "epoch": 5.59338061465721,
+ "grad_norm": 3.6006057262420654,
+ "learning_rate": 5.734398840532451e-08,
+ "loss": 0.3155,
+ "step": 11830
+ },
+ {
+ "epoch": 5.593853427895981,
+ "grad_norm": 3.31854510307312,
+ "learning_rate": 5.721121851419348e-08,
+ "loss": 0.3526,
+ "step": 11831
+ },
+ {
+ "epoch": 5.594326241134752,
+ "grad_norm": 3.2922966480255127,
+ "learning_rate": 5.7078600724808365e-08,
+ "loss": 0.2947,
+ "step": 11832
+ },
+ {
+ "epoch": 5.594799054373523,
+ "grad_norm": 3.170687198638916,
+ "learning_rate": 5.694613504542701e-08,
+ "loss": 0.3703,
+ "step": 11833
+ },
+ {
+ "epoch": 5.5952718676122934,
+ "grad_norm": 3.654935836791992,
+ "learning_rate": 5.6813821484296985e-08,
+ "loss": 0.2927,
+ "step": 11834
+ },
+ {
+ "epoch": 5.595744680851064,
+ "grad_norm": 3.3022713661193848,
+ "learning_rate": 5.6681660049657805e-08,
+ "loss": 0.2826,
+ "step": 11835
+ },
+ {
+ "epoch": 5.596217494089834,
+ "grad_norm": 3.0262742042541504,
+ "learning_rate": 5.6549650749737893e-08,
+ "loss": 0.2905,
+ "step": 11836
+ },
+ {
+ "epoch": 5.596690307328605,
+ "grad_norm": 3.2589111328125,
+ "learning_rate": 5.6417793592756786e-08,
+ "loss": 0.3173,
+ "step": 11837
+ },
+ {
+ "epoch": 5.597163120567376,
+ "grad_norm": 3.257753849029541,
+ "learning_rate": 5.628608858692514e-08,
+ "loss": 0.3277,
+ "step": 11838
+ },
+ {
+ "epoch": 5.5976359338061465,
+ "grad_norm": 2.7493152618408203,
+ "learning_rate": 5.615453574044333e-08,
+ "loss": 0.2915,
+ "step": 11839
+ },
+ {
+ "epoch": 5.598108747044917,
+ "grad_norm": 3.183833360671997,
+ "learning_rate": 5.602313506150286e-08,
+ "loss": 0.2871,
+ "step": 11840
+ },
+ {
+ "epoch": 5.598581560283688,
+ "grad_norm": 3.640209197998047,
+ "learning_rate": 5.589188655828498e-08,
+ "loss": 0.3322,
+ "step": 11841
+ },
+ {
+ "epoch": 5.599054373522459,
+ "grad_norm": 3.1197102069854736,
+ "learning_rate": 5.576079023896203e-08,
+ "loss": 0.2604,
+ "step": 11842
+ },
+ {
+ "epoch": 5.599527186761229,
+ "grad_norm": 3.4404499530792236,
+ "learning_rate": 5.5629846111697473e-08,
+ "loss": 0.3492,
+ "step": 11843
+ },
+ {
+ "epoch": 5.6,
+ "grad_norm": 3.158811330795288,
+ "learning_rate": 5.5499054184643683e-08,
+ "loss": 0.2998,
+ "step": 11844
+ },
+ {
+ "epoch": 5.60047281323877,
+ "grad_norm": 3.6570982933044434,
+ "learning_rate": 5.5368414465945263e-08,
+ "loss": 0.3322,
+ "step": 11845
+ },
+ {
+ "epoch": 5.600945626477541,
+ "grad_norm": 3.2857882976531982,
+ "learning_rate": 5.523792696373626e-08,
+ "loss": 0.3372,
+ "step": 11846
+ },
+ {
+ "epoch": 5.601418439716312,
+ "grad_norm": 2.7943503856658936,
+ "learning_rate": 5.5107591686141545e-08,
+ "loss": 0.2956,
+ "step": 11847
+ },
+ {
+ "epoch": 5.601891252955083,
+ "grad_norm": 3.353516101837158,
+ "learning_rate": 5.4977408641276595e-08,
+ "loss": 0.3247,
+ "step": 11848
+ },
+ {
+ "epoch": 5.602364066193854,
+ "grad_norm": 3.132786273956299,
+ "learning_rate": 5.484737783724714e-08,
+ "loss": 0.3129,
+ "step": 11849
+ },
+ {
+ "epoch": 5.602836879432624,
+ "grad_norm": 3.406188488006592,
+ "learning_rate": 5.471749928215003e-08,
+ "loss": 0.3419,
+ "step": 11850
+ },
+ {
+ "epoch": 5.603309692671395,
+ "grad_norm": 3.224926233291626,
+ "learning_rate": 5.4587772984071866e-08,
+ "loss": 0.3356,
+ "step": 11851
+ },
+ {
+ "epoch": 5.603782505910165,
+ "grad_norm": 3.068500518798828,
+ "learning_rate": 5.4458198951090337e-08,
+ "loss": 0.3135,
+ "step": 11852
+ },
+ {
+ "epoch": 5.604255319148936,
+ "grad_norm": 3.1098759174346924,
+ "learning_rate": 5.432877719127344e-08,
+ "loss": 0.35,
+ "step": 11853
+ },
+ {
+ "epoch": 5.604728132387707,
+ "grad_norm": 3.4230198860168457,
+ "learning_rate": 5.419950771267973e-08,
+ "loss": 0.3183,
+ "step": 11854
+ },
+ {
+ "epoch": 5.6052009456264775,
+ "grad_norm": 3.6213667392730713,
+ "learning_rate": 5.4070390523357775e-08,
+ "loss": 0.3438,
+ "step": 11855
+ },
+ {
+ "epoch": 5.605673758865248,
+ "grad_norm": 3.7087268829345703,
+ "learning_rate": 5.39414256313478e-08,
+ "loss": 0.3632,
+ "step": 11856
+ },
+ {
+ "epoch": 5.606146572104019,
+ "grad_norm": 3.3394203186035156,
+ "learning_rate": 5.38126130446795e-08,
+ "loss": 0.2977,
+ "step": 11857
+ },
+ {
+ "epoch": 5.60661938534279,
+ "grad_norm": 3.342027425765991,
+ "learning_rate": 5.368395277137367e-08,
+ "loss": 0.2957,
+ "step": 11858
+ },
+ {
+ "epoch": 5.60709219858156,
+ "grad_norm": 3.2655093669891357,
+ "learning_rate": 5.355544481944141e-08,
+ "loss": 0.3546,
+ "step": 11859
+ },
+ {
+ "epoch": 5.607565011820331,
+ "grad_norm": 3.1710312366485596,
+ "learning_rate": 5.3427089196884104e-08,
+ "loss": 0.3428,
+ "step": 11860
+ },
+ {
+ "epoch": 5.608037825059101,
+ "grad_norm": 3.242621898651123,
+ "learning_rate": 5.3298885911694244e-08,
+ "loss": 0.269,
+ "step": 11861
+ },
+ {
+ "epoch": 5.608510638297872,
+ "grad_norm": 3.228151798248291,
+ "learning_rate": 5.317083497185377e-08,
+ "loss": 0.3415,
+ "step": 11862
+ },
+ {
+ "epoch": 5.608983451536643,
+ "grad_norm": 3.1227009296417236,
+ "learning_rate": 5.304293638533686e-08,
+ "loss": 0.2991,
+ "step": 11863
+ },
+ {
+ "epoch": 5.609456264775414,
+ "grad_norm": 2.8883254528045654,
+ "learning_rate": 5.29151901601066e-08,
+ "loss": 0.3063,
+ "step": 11864
+ },
+ {
+ "epoch": 5.609929078014185,
+ "grad_norm": 2.9985411167144775,
+ "learning_rate": 5.2787596304117174e-08,
+ "loss": 0.2947,
+ "step": 11865
+ },
+ {
+ "epoch": 5.610401891252955,
+ "grad_norm": 3.177762985229492,
+ "learning_rate": 5.266015482531389e-08,
+ "loss": 0.3063,
+ "step": 11866
+ },
+ {
+ "epoch": 5.610874704491726,
+ "grad_norm": 2.8067426681518555,
+ "learning_rate": 5.2532865731630966e-08,
+ "loss": 0.2996,
+ "step": 11867
+ },
+ {
+ "epoch": 5.611347517730496,
+ "grad_norm": 3.2768452167510986,
+ "learning_rate": 5.240572903099484e-08,
+ "loss": 0.289,
+ "step": 11868
+ },
+ {
+ "epoch": 5.611820330969267,
+ "grad_norm": 3.130014419555664,
+ "learning_rate": 5.227874473132166e-08,
+ "loss": 0.2907,
+ "step": 11869
+ },
+ {
+ "epoch": 5.612293144208038,
+ "grad_norm": 3.2271652221679688,
+ "learning_rate": 5.21519128405179e-08,
+ "loss": 0.319,
+ "step": 11870
+ },
+ {
+ "epoch": 5.6127659574468085,
+ "grad_norm": 3.3185455799102783,
+ "learning_rate": 5.202523336648141e-08,
+ "loss": 0.3638,
+ "step": 11871
+ },
+ {
+ "epoch": 5.613238770685579,
+ "grad_norm": 3.3324077129364014,
+ "learning_rate": 5.189870631709948e-08,
+ "loss": 0.3147,
+ "step": 11872
+ },
+ {
+ "epoch": 5.61371158392435,
+ "grad_norm": 3.229886770248413,
+ "learning_rate": 5.1772331700250533e-08,
+ "loss": 0.3706,
+ "step": 11873
+ },
+ {
+ "epoch": 5.614184397163121,
+ "grad_norm": 3.1973488330841064,
+ "learning_rate": 5.164610952380328e-08,
+ "loss": 0.3078,
+ "step": 11874
+ },
+ {
+ "epoch": 5.614657210401891,
+ "grad_norm": 3.336578130722046,
+ "learning_rate": 5.152003979561671e-08,
+ "loss": 0.3573,
+ "step": 11875
+ },
+ {
+ "epoch": 5.615130023640662,
+ "grad_norm": 4.805183410644531,
+ "learning_rate": 5.139412252354148e-08,
+ "loss": 0.3777,
+ "step": 11876
+ },
+ {
+ "epoch": 5.615602836879432,
+ "grad_norm": 3.332260847091675,
+ "learning_rate": 5.126835771541716e-08,
+ "loss": 0.2983,
+ "step": 11877
+ },
+ {
+ "epoch": 5.616075650118203,
+ "grad_norm": 3.1888558864593506,
+ "learning_rate": 5.114274537907499e-08,
+ "loss": 0.3025,
+ "step": 11878
+ },
+ {
+ "epoch": 5.616548463356974,
+ "grad_norm": 3.5608468055725098,
+ "learning_rate": 5.10172855223362e-08,
+ "loss": 0.3507,
+ "step": 11879
+ },
+ {
+ "epoch": 5.617021276595745,
+ "grad_norm": 3.5171632766723633,
+ "learning_rate": 5.0891978153012336e-08,
+ "loss": 0.3073,
+ "step": 11880
+ },
+ {
+ "epoch": 5.617494089834516,
+ "grad_norm": 3.1395633220672607,
+ "learning_rate": 5.076682327890603e-08,
+ "loss": 0.2811,
+ "step": 11881
+ },
+ {
+ "epoch": 5.617966903073286,
+ "grad_norm": 3.0650174617767334,
+ "learning_rate": 5.064182090781022e-08,
+ "loss": 0.307,
+ "step": 11882
+ },
+ {
+ "epoch": 5.618439716312057,
+ "grad_norm": 2.9738526344299316,
+ "learning_rate": 5.0516971047508135e-08,
+ "loss": 0.328,
+ "step": 11883
+ },
+ {
+ "epoch": 5.618912529550827,
+ "grad_norm": 3.3862271308898926,
+ "learning_rate": 5.0392273705773544e-08,
+ "loss": 0.3554,
+ "step": 11884
+ },
+ {
+ "epoch": 5.619385342789598,
+ "grad_norm": 3.0164403915405273,
+ "learning_rate": 5.026772889037052e-08,
+ "loss": 0.2704,
+ "step": 11885
+ },
+ {
+ "epoch": 5.619858156028369,
+ "grad_norm": 3.204324245452881,
+ "learning_rate": 5.0143336609054795e-08,
+ "loss": 0.334,
+ "step": 11886
+ },
+ {
+ "epoch": 5.6203309692671395,
+ "grad_norm": 3.560706853866577,
+ "learning_rate": 5.0019096869571015e-08,
+ "loss": 0.3248,
+ "step": 11887
+ },
+ {
+ "epoch": 5.62080378250591,
+ "grad_norm": 3.8754491806030273,
+ "learning_rate": 4.989500967965522e-08,
+ "loss": 0.3389,
+ "step": 11888
+ },
+ {
+ "epoch": 5.621276595744681,
+ "grad_norm": 3.0526723861694336,
+ "learning_rate": 4.977107504703399e-08,
+ "loss": 0.2743,
+ "step": 11889
+ },
+ {
+ "epoch": 5.621749408983452,
+ "grad_norm": 3.337905168533325,
+ "learning_rate": 4.964729297942339e-08,
+ "loss": 0.2813,
+ "step": 11890
+ },
+ {
+ "epoch": 5.622222222222222,
+ "grad_norm": 3.149933099746704,
+ "learning_rate": 4.952366348453197e-08,
+ "loss": 0.3346,
+ "step": 11891
+ },
+ {
+ "epoch": 5.622695035460993,
+ "grad_norm": 3.5325448513031006,
+ "learning_rate": 4.9400186570056904e-08,
+ "loss": 0.4096,
+ "step": 11892
+ },
+ {
+ "epoch": 5.623167848699763,
+ "grad_norm": 2.5968239307403564,
+ "learning_rate": 4.927686224368622e-08,
+ "loss": 0.2666,
+ "step": 11893
+ },
+ {
+ "epoch": 5.623640661938534,
+ "grad_norm": 3.005018711090088,
+ "learning_rate": 4.9153690513099874e-08,
+ "loss": 0.267,
+ "step": 11894
+ },
+ {
+ "epoch": 5.624113475177305,
+ "grad_norm": 3.210495710372925,
+ "learning_rate": 4.90306713859659e-08,
+ "loss": 0.3286,
+ "step": 11895
+ },
+ {
+ "epoch": 5.624586288416076,
+ "grad_norm": 3.488629102706909,
+ "learning_rate": 4.8907804869945394e-08,
+ "loss": 0.3275,
+ "step": 11896
+ },
+ {
+ "epoch": 5.625059101654847,
+ "grad_norm": 3.438709020614624,
+ "learning_rate": 4.8785090972688073e-08,
+ "loss": 0.3774,
+ "step": 11897
+ },
+ {
+ "epoch": 5.625531914893617,
+ "grad_norm": 3.4456019401550293,
+ "learning_rate": 4.8662529701834496e-08,
+ "loss": 0.3146,
+ "step": 11898
+ },
+ {
+ "epoch": 5.626004728132388,
+ "grad_norm": 3.248830556869507,
+ "learning_rate": 4.854012106501688e-08,
+ "loss": 0.2914,
+ "step": 11899
+ },
+ {
+ "epoch": 5.626477541371158,
+ "grad_norm": 3.252931833267212,
+ "learning_rate": 4.841786506985635e-08,
+ "loss": 0.355,
+ "step": 11900
+ },
+ {
+ "epoch": 5.626950354609929,
+ "grad_norm": 3.1938722133636475,
+ "learning_rate": 4.829576172396544e-08,
+ "loss": 0.3503,
+ "step": 11901
+ },
+ {
+ "epoch": 5.6274231678487,
+ "grad_norm": 3.425889492034912,
+ "learning_rate": 4.8173811034947224e-08,
+ "loss": 0.334,
+ "step": 11902
+ },
+ {
+ "epoch": 5.6278959810874705,
+ "grad_norm": 2.86318302154541,
+ "learning_rate": 4.805201301039508e-08,
+ "loss": 0.2957,
+ "step": 11903
+ },
+ {
+ "epoch": 5.628368794326241,
+ "grad_norm": 3.708336591720581,
+ "learning_rate": 4.7930367657892384e-08,
+ "loss": 0.3276,
+ "step": 11904
+ },
+ {
+ "epoch": 5.628841607565012,
+ "grad_norm": 3.3034868240356445,
+ "learning_rate": 4.780887498501363e-08,
+ "loss": 0.3087,
+ "step": 11905
+ },
+ {
+ "epoch": 5.629314420803783,
+ "grad_norm": 3.2360143661499023,
+ "learning_rate": 4.76875349993236e-08,
+ "loss": 0.3501,
+ "step": 11906
+ },
+ {
+ "epoch": 5.629787234042553,
+ "grad_norm": 3.3256373405456543,
+ "learning_rate": 4.756634770837793e-08,
+ "loss": 0.3206,
+ "step": 11907
+ },
+ {
+ "epoch": 5.630260047281324,
+ "grad_norm": 2.9592063426971436,
+ "learning_rate": 4.744531311972195e-08,
+ "loss": 0.3043,
+ "step": 11908
+ },
+ {
+ "epoch": 5.630732860520094,
+ "grad_norm": 3.3246355056762695,
+ "learning_rate": 4.732443124089214e-08,
+ "loss": 0.3046,
+ "step": 11909
+ },
+ {
+ "epoch": 5.631205673758865,
+ "grad_norm": 3.342833995819092,
+ "learning_rate": 4.7203702079415825e-08,
+ "loss": 0.348,
+ "step": 11910
+ },
+ {
+ "epoch": 5.631678486997636,
+ "grad_norm": 3.3094377517700195,
+ "learning_rate": 4.70831256428092e-08,
+ "loss": 0.3195,
+ "step": 11911
+ },
+ {
+ "epoch": 5.632151300236407,
+ "grad_norm": 3.05210018157959,
+ "learning_rate": 4.696270193858099e-08,
+ "loss": 0.3044,
+ "step": 11912
+ },
+ {
+ "epoch": 5.6326241134751776,
+ "grad_norm": 2.889557123184204,
+ "learning_rate": 4.68424309742288e-08,
+ "loss": 0.2952,
+ "step": 11913
+ },
+ {
+ "epoch": 5.633096926713948,
+ "grad_norm": 3.7573699951171875,
+ "learning_rate": 4.672231275724193e-08,
+ "loss": 0.3667,
+ "step": 11914
+ },
+ {
+ "epoch": 5.633569739952719,
+ "grad_norm": 3.1630661487579346,
+ "learning_rate": 4.660234729509938e-08,
+ "loss": 0.3001,
+ "step": 11915
+ },
+ {
+ "epoch": 5.634042553191489,
+ "grad_norm": 3.4200289249420166,
+ "learning_rate": 4.6482534595270466e-08,
+ "loss": 0.332,
+ "step": 11916
+ },
+ {
+ "epoch": 5.63451536643026,
+ "grad_norm": 3.23447847366333,
+ "learning_rate": 4.636287466521616e-08,
+ "loss": 0.3547,
+ "step": 11917
+ },
+ {
+ "epoch": 5.634988179669031,
+ "grad_norm": 3.3752806186676025,
+ "learning_rate": 4.624336751238689e-08,
+ "loss": 0.3021,
+ "step": 11918
+ },
+ {
+ "epoch": 5.6354609929078014,
+ "grad_norm": 3.221673011779785,
+ "learning_rate": 4.612401314422338e-08,
+ "loss": 0.3359,
+ "step": 11919
+ },
+ {
+ "epoch": 5.635933806146572,
+ "grad_norm": 3.3513898849487305,
+ "learning_rate": 4.600481156815773e-08,
+ "loss": 0.2969,
+ "step": 11920
+ },
+ {
+ "epoch": 5.636406619385343,
+ "grad_norm": 3.0128650665283203,
+ "learning_rate": 4.588576279161205e-08,
+ "loss": 0.2816,
+ "step": 11921
+ },
+ {
+ "epoch": 5.636879432624114,
+ "grad_norm": 5.3624725341796875,
+ "learning_rate": 4.5766866821999046e-08,
+ "loss": 0.3359,
+ "step": 11922
+ },
+ {
+ "epoch": 5.637352245862884,
+ "grad_norm": 3.1999800205230713,
+ "learning_rate": 4.5648123666721665e-08,
+ "loss": 0.3015,
+ "step": 11923
+ },
+ {
+ "epoch": 5.6378250591016545,
+ "grad_norm": 3.211670398712158,
+ "learning_rate": 4.5529533333173446e-08,
+ "loss": 0.2871,
+ "step": 11924
+ },
+ {
+ "epoch": 5.638297872340425,
+ "grad_norm": 3.2410199642181396,
+ "learning_rate": 4.5411095828739037e-08,
+ "loss": 0.2818,
+ "step": 11925
+ },
+ {
+ "epoch": 5.638770685579196,
+ "grad_norm": 3.5467281341552734,
+ "learning_rate": 4.529281116079226e-08,
+ "loss": 0.3765,
+ "step": 11926
+ },
+ {
+ "epoch": 5.639243498817967,
+ "grad_norm": 3.2093350887298584,
+ "learning_rate": 4.517467933669889e-08,
+ "loss": 0.3477,
+ "step": 11927
+ },
+ {
+ "epoch": 5.639716312056738,
+ "grad_norm": 3.8581385612487793,
+ "learning_rate": 4.5056700363813874e-08,
+ "loss": 0.3879,
+ "step": 11928
+ },
+ {
+ "epoch": 5.6401891252955085,
+ "grad_norm": 3.2723116874694824,
+ "learning_rate": 4.4938874249483565e-08,
+ "loss": 0.3368,
+ "step": 11929
+ },
+ {
+ "epoch": 5.640661938534279,
+ "grad_norm": 3.173933982849121,
+ "learning_rate": 4.482120100104459e-08,
+ "loss": 0.2887,
+ "step": 11930
+ },
+ {
+ "epoch": 5.64113475177305,
+ "grad_norm": 3.4990296363830566,
+ "learning_rate": 4.470368062582359e-08,
+ "loss": 0.3602,
+ "step": 11931
+ },
+ {
+ "epoch": 5.64160756501182,
+ "grad_norm": 3.5351905822753906,
+ "learning_rate": 4.458631313113831e-08,
+ "loss": 0.3043,
+ "step": 11932
+ },
+ {
+ "epoch": 5.642080378250591,
+ "grad_norm": 3.6162710189819336,
+ "learning_rate": 4.4469098524296815e-08,
+ "loss": 0.311,
+ "step": 11933
+ },
+ {
+ "epoch": 5.642553191489362,
+ "grad_norm": 3.2108359336853027,
+ "learning_rate": 4.4352036812597144e-08,
+ "loss": 0.2995,
+ "step": 11934
+ },
+ {
+ "epoch": 5.643026004728132,
+ "grad_norm": 3.0152573585510254,
+ "learning_rate": 4.423512800332874e-08,
+ "loss": 0.2889,
+ "step": 11935
+ },
+ {
+ "epoch": 5.643498817966903,
+ "grad_norm": 3.6868278980255127,
+ "learning_rate": 4.4118372103770514e-08,
+ "loss": 0.2943,
+ "step": 11936
+ },
+ {
+ "epoch": 5.643971631205674,
+ "grad_norm": 3.021240711212158,
+ "learning_rate": 4.400176912119275e-08,
+ "loss": 0.3377,
+ "step": 11937
+ },
+ {
+ "epoch": 5.644444444444445,
+ "grad_norm": 3.117161989212036,
+ "learning_rate": 4.388531906285548e-08,
+ "loss": 0.3161,
+ "step": 11938
+ },
+ {
+ "epoch": 5.644917257683215,
+ "grad_norm": 3.1031835079193115,
+ "learning_rate": 4.3769021936009295e-08,
+ "loss": 0.2707,
+ "step": 11939
+ },
+ {
+ "epoch": 5.6453900709219855,
+ "grad_norm": 3.2828316688537598,
+ "learning_rate": 4.365287774789617e-08,
+ "loss": 0.3368,
+ "step": 11940
+ },
+ {
+ "epoch": 5.645862884160756,
+ "grad_norm": 2.9725148677825928,
+ "learning_rate": 4.3536886505747555e-08,
+ "loss": 0.3074,
+ "step": 11941
+ },
+ {
+ "epoch": 5.646335697399527,
+ "grad_norm": 3.046449661254883,
+ "learning_rate": 4.342104821678572e-08,
+ "loss": 0.3057,
+ "step": 11942
+ },
+ {
+ "epoch": 5.646808510638298,
+ "grad_norm": 3.3085083961486816,
+ "learning_rate": 4.330536288822379e-08,
+ "loss": 0.3137,
+ "step": 11943
+ },
+ {
+ "epoch": 5.647281323877069,
+ "grad_norm": 3.4323384761810303,
+ "learning_rate": 4.318983052726406e-08,
+ "loss": 0.3116,
+ "step": 11944
+ },
+ {
+ "epoch": 5.6477541371158395,
+ "grad_norm": 3.387890577316284,
+ "learning_rate": 4.307445114110104e-08,
+ "loss": 0.2882,
+ "step": 11945
+ },
+ {
+ "epoch": 5.64822695035461,
+ "grad_norm": 3.2343697547912598,
+ "learning_rate": 4.295922473691872e-08,
+ "loss": 0.3324,
+ "step": 11946
+ },
+ {
+ "epoch": 5.648699763593381,
+ "grad_norm": 3.0639030933380127,
+ "learning_rate": 4.2844151321891626e-08,
+ "loss": 0.3739,
+ "step": 11947
+ },
+ {
+ "epoch": 5.649172576832151,
+ "grad_norm": 3.201939105987549,
+ "learning_rate": 4.272923090318487e-08,
+ "loss": 0.3241,
+ "step": 11948
+ },
+ {
+ "epoch": 5.649645390070922,
+ "grad_norm": 3.60882830619812,
+ "learning_rate": 4.2614463487954114e-08,
+ "loss": 0.274,
+ "step": 11949
+ },
+ {
+ "epoch": 5.650118203309693,
+ "grad_norm": 3.4274656772613525,
+ "learning_rate": 4.249984908334559e-08,
+ "loss": 0.3164,
+ "step": 11950
+ },
+ {
+ "epoch": 5.650591016548463,
+ "grad_norm": 3.332573175430298,
+ "learning_rate": 4.2385387696495525e-08,
+ "loss": 0.2875,
+ "step": 11951
+ },
+ {
+ "epoch": 5.651063829787234,
+ "grad_norm": 3.1913769245147705,
+ "learning_rate": 4.227107933453101e-08,
+ "loss": 0.3334,
+ "step": 11952
+ },
+ {
+ "epoch": 5.651536643026005,
+ "grad_norm": 3.683591842651367,
+ "learning_rate": 4.2156924004569664e-08,
+ "loss": 0.3544,
+ "step": 11953
+ },
+ {
+ "epoch": 5.652009456264776,
+ "grad_norm": 3.415381669998169,
+ "learning_rate": 4.204292171371915e-08,
+ "loss": 0.3764,
+ "step": 11954
+ },
+ {
+ "epoch": 5.652482269503546,
+ "grad_norm": 3.341519832611084,
+ "learning_rate": 4.192907246907824e-08,
+ "loss": 0.3001,
+ "step": 11955
+ },
+ {
+ "epoch": 5.6529550827423165,
+ "grad_norm": 3.161620855331421,
+ "learning_rate": 4.1815376277735686e-08,
+ "loss": 0.3118,
+ "step": 11956
+ },
+ {
+ "epoch": 5.653427895981087,
+ "grad_norm": 2.9771671295166016,
+ "learning_rate": 4.170183314677084e-08,
+ "loss": 0.2941,
+ "step": 11957
+ },
+ {
+ "epoch": 5.653900709219858,
+ "grad_norm": 3.0971219539642334,
+ "learning_rate": 4.1588443083253605e-08,
+ "loss": 0.2684,
+ "step": 11958
+ },
+ {
+ "epoch": 5.654373522458629,
+ "grad_norm": 3.714979410171509,
+ "learning_rate": 4.147520609424388e-08,
+ "loss": 0.3968,
+ "step": 11959
+ },
+ {
+ "epoch": 5.6548463356974,
+ "grad_norm": 2.89011549949646,
+ "learning_rate": 4.1362122186792976e-08,
+ "loss": 0.3221,
+ "step": 11960
+ },
+ {
+ "epoch": 5.6553191489361705,
+ "grad_norm": 3.0199105739593506,
+ "learning_rate": 4.124919136794192e-08,
+ "loss": 0.2991,
+ "step": 11961
+ },
+ {
+ "epoch": 5.655791962174941,
+ "grad_norm": 3.516275644302368,
+ "learning_rate": 4.113641364472232e-08,
+ "loss": 0.3635,
+ "step": 11962
+ },
+ {
+ "epoch": 5.656264775413711,
+ "grad_norm": 3.062115430831909,
+ "learning_rate": 4.1023789024156605e-08,
+ "loss": 0.3347,
+ "step": 11963
+ },
+ {
+ "epoch": 5.656737588652482,
+ "grad_norm": 3.2312190532684326,
+ "learning_rate": 4.091131751325694e-08,
+ "loss": 0.2771,
+ "step": 11964
+ },
+ {
+ "epoch": 5.657210401891253,
+ "grad_norm": 3.2892868518829346,
+ "learning_rate": 4.0798999119027184e-08,
+ "loss": 0.3055,
+ "step": 11965
+ },
+ {
+ "epoch": 5.657683215130024,
+ "grad_norm": 2.9591119289398193,
+ "learning_rate": 4.068683384846034e-08,
+ "loss": 0.2923,
+ "step": 11966
+ },
+ {
+ "epoch": 5.658156028368794,
+ "grad_norm": 3.423375129699707,
+ "learning_rate": 4.057482170854055e-08,
+ "loss": 0.3126,
+ "step": 11967
+ },
+ {
+ "epoch": 5.658628841607565,
+ "grad_norm": 2.994536876678467,
+ "learning_rate": 4.04629627062425e-08,
+ "loss": 0.3032,
+ "step": 11968
+ },
+ {
+ "epoch": 5.659101654846336,
+ "grad_norm": 3.4704277515411377,
+ "learning_rate": 4.035125684853064e-08,
+ "loss": 0.3722,
+ "step": 11969
+ },
+ {
+ "epoch": 5.659574468085106,
+ "grad_norm": 3.5841195583343506,
+ "learning_rate": 4.023970414236134e-08,
+ "loss": 0.3954,
+ "step": 11970
+ },
+ {
+ "epoch": 5.660047281323877,
+ "grad_norm": 3.0709569454193115,
+ "learning_rate": 4.012830459467959e-08,
+ "loss": 0.3378,
+ "step": 11971
+ },
+ {
+ "epoch": 5.6605200945626475,
+ "grad_norm": 3.0199382305145264,
+ "learning_rate": 4.00170582124218e-08,
+ "loss": 0.3131,
+ "step": 11972
+ },
+ {
+ "epoch": 5.660992907801418,
+ "grad_norm": 2.95902419090271,
+ "learning_rate": 3.990596500251576e-08,
+ "loss": 0.269,
+ "step": 11973
+ },
+ {
+ "epoch": 5.661465721040189,
+ "grad_norm": 3.6806280612945557,
+ "learning_rate": 3.979502497187759e-08,
+ "loss": 0.3914,
+ "step": 11974
+ },
+ {
+ "epoch": 5.66193853427896,
+ "grad_norm": 3.4212417602539062,
+ "learning_rate": 3.9684238127415675e-08,
+ "loss": 0.3453,
+ "step": 11975
+ },
+ {
+ "epoch": 5.662411347517731,
+ "grad_norm": 2.9063239097595215,
+ "learning_rate": 3.9573604476028096e-08,
+ "loss": 0.3388,
+ "step": 11976
+ },
+ {
+ "epoch": 5.6628841607565015,
+ "grad_norm": 3.5526041984558105,
+ "learning_rate": 3.946312402460351e-08,
+ "loss": 0.3707,
+ "step": 11977
+ },
+ {
+ "epoch": 5.663356973995272,
+ "grad_norm": 3.1709578037261963,
+ "learning_rate": 3.935279678002113e-08,
+ "loss": 0.2742,
+ "step": 11978
+ },
+ {
+ "epoch": 5.663829787234042,
+ "grad_norm": 3.9405689239501953,
+ "learning_rate": 3.924262274915047e-08,
+ "loss": 0.3712,
+ "step": 11979
+ },
+ {
+ "epoch": 5.664302600472813,
+ "grad_norm": 3.2250664234161377,
+ "learning_rate": 3.9132601938851324e-08,
+ "loss": 0.3081,
+ "step": 11980
+ },
+ {
+ "epoch": 5.664775413711584,
+ "grad_norm": 3.6658518314361572,
+ "learning_rate": 3.9022734355974866e-08,
+ "loss": 0.3464,
+ "step": 11981
+ },
+ {
+ "epoch": 5.665248226950355,
+ "grad_norm": 3.3371896743774414,
+ "learning_rate": 3.891302000736175e-08,
+ "loss": 0.3253,
+ "step": 11982
+ },
+ {
+ "epoch": 5.665721040189125,
+ "grad_norm": 3.1414871215820312,
+ "learning_rate": 3.880345889984316e-08,
+ "loss": 0.2796,
+ "step": 11983
+ },
+ {
+ "epoch": 5.666193853427896,
+ "grad_norm": 3.2845115661621094,
+ "learning_rate": 3.8694051040240876e-08,
+ "loss": 0.328,
+ "step": 11984
+ },
+ {
+ "epoch": 5.666666666666667,
+ "grad_norm": 3.565127372741699,
+ "learning_rate": 3.858479643536778e-08,
+ "loss": 0.3527,
+ "step": 11985
+ },
+ {
+ "epoch": 5.667139479905437,
+ "grad_norm": 3.017859935760498,
+ "learning_rate": 3.8475695092026476e-08,
+ "loss": 0.3379,
+ "step": 11986
+ },
+ {
+ "epoch": 5.667612293144208,
+ "grad_norm": 3.2508249282836914,
+ "learning_rate": 3.836674701701015e-08,
+ "loss": 0.3073,
+ "step": 11987
+ },
+ {
+ "epoch": 5.6680851063829785,
+ "grad_norm": 3.1742143630981445,
+ "learning_rate": 3.825795221710255e-08,
+ "loss": 0.3184,
+ "step": 11988
+ },
+ {
+ "epoch": 5.668557919621749,
+ "grad_norm": 3.096966028213501,
+ "learning_rate": 3.8149310699077956e-08,
+ "loss": 0.2686,
+ "step": 11989
+ },
+ {
+ "epoch": 5.66903073286052,
+ "grad_norm": 3.26834774017334,
+ "learning_rate": 3.80408224697007e-08,
+ "loss": 0.3532,
+ "step": 11990
+ },
+ {
+ "epoch": 5.669503546099291,
+ "grad_norm": 2.8902487754821777,
+ "learning_rate": 3.7932487535725924e-08,
+ "loss": 0.3226,
+ "step": 11991
+ },
+ {
+ "epoch": 5.669976359338062,
+ "grad_norm": 3.4881465435028076,
+ "learning_rate": 3.7824305903899626e-08,
+ "loss": 0.3187,
+ "step": 11992
+ },
+ {
+ "epoch": 5.6704491725768325,
+ "grad_norm": 3.0470924377441406,
+ "learning_rate": 3.771627758095725e-08,
+ "loss": 0.249,
+ "step": 11993
+ },
+ {
+ "epoch": 5.670921985815603,
+ "grad_norm": 3.4965827465057373,
+ "learning_rate": 3.760840257362564e-08,
+ "loss": 0.3688,
+ "step": 11994
+ },
+ {
+ "epoch": 5.671394799054373,
+ "grad_norm": 3.615798234939575,
+ "learning_rate": 3.7500680888621355e-08,
+ "loss": 0.3235,
+ "step": 11995
+ },
+ {
+ "epoch": 5.671867612293144,
+ "grad_norm": 3.8858070373535156,
+ "learning_rate": 3.739311253265237e-08,
+ "loss": 0.3749,
+ "step": 11996
+ },
+ {
+ "epoch": 5.672340425531915,
+ "grad_norm": 3.0234808921813965,
+ "learning_rate": 3.7285697512415844e-08,
+ "loss": 0.2837,
+ "step": 11997
+ },
+ {
+ "epoch": 5.6728132387706856,
+ "grad_norm": 4.143755912780762,
+ "learning_rate": 3.717843583460001e-08,
+ "loss": 0.3399,
+ "step": 11998
+ },
+ {
+ "epoch": 5.673286052009456,
+ "grad_norm": 3.064124822616577,
+ "learning_rate": 3.707132750588399e-08,
+ "loss": 0.3285,
+ "step": 11999
+ },
+ {
+ "epoch": 5.673758865248227,
+ "grad_norm": 3.348433256149292,
+ "learning_rate": 3.696437253293689e-08,
+ "loss": 0.3401,
+ "step": 12000
+ },
+ {
+ "epoch": 5.674231678486998,
+ "grad_norm": 3.011606216430664,
+ "learning_rate": 3.68575709224181e-08,
+ "loss": 0.2829,
+ "step": 12001
+ },
+ {
+ "epoch": 5.674704491725768,
+ "grad_norm": 2.935004234313965,
+ "learning_rate": 3.675092268097785e-08,
+ "loss": 0.3275,
+ "step": 12002
+ },
+ {
+ "epoch": 5.675177304964539,
+ "grad_norm": 3.3137762546539307,
+ "learning_rate": 3.664442781525668e-08,
+ "loss": 0.3274,
+ "step": 12003
+ },
+ {
+ "epoch": 5.6756501182033094,
+ "grad_norm": 3.837202310562134,
+ "learning_rate": 3.653808633188538e-08,
+ "loss": 0.324,
+ "step": 12004
+ },
+ {
+ "epoch": 5.67612293144208,
+ "grad_norm": 2.972749710083008,
+ "learning_rate": 3.643189823748561e-08,
+ "loss": 0.2805,
+ "step": 12005
+ },
+ {
+ "epoch": 5.676595744680851,
+ "grad_norm": 4.356303691864014,
+ "learning_rate": 3.632586353866902e-08,
+ "loss": 0.3509,
+ "step": 12006
+ },
+ {
+ "epoch": 5.677068557919622,
+ "grad_norm": 3.2707748413085938,
+ "learning_rate": 3.62199822420381e-08,
+ "loss": 0.3416,
+ "step": 12007
+ },
+ {
+ "epoch": 5.677541371158393,
+ "grad_norm": 3.460958957672119,
+ "learning_rate": 3.611425435418536e-08,
+ "loss": 0.323,
+ "step": 12008
+ },
+ {
+ "epoch": 5.678014184397163,
+ "grad_norm": 3.1501171588897705,
+ "learning_rate": 3.600867988169443e-08,
+ "loss": 0.3542,
+ "step": 12009
+ },
+ {
+ "epoch": 5.678486997635934,
+ "grad_norm": 3.2512545585632324,
+ "learning_rate": 3.590325883113838e-08,
+ "loss": 0.3031,
+ "step": 12010
+ },
+ {
+ "epoch": 5.678959810874704,
+ "grad_norm": 3.1285507678985596,
+ "learning_rate": 3.579799120908195e-08,
+ "loss": 0.3128,
+ "step": 12011
+ },
+ {
+ "epoch": 5.679432624113475,
+ "grad_norm": 3.4275474548339844,
+ "learning_rate": 3.569287702207963e-08,
+ "loss": 0.3152,
+ "step": 12012
+ },
+ {
+ "epoch": 5.679905437352246,
+ "grad_norm": 3.314131498336792,
+ "learning_rate": 3.558791627667563e-08,
+ "loss": 0.3164,
+ "step": 12013
+ },
+ {
+ "epoch": 5.6803782505910165,
+ "grad_norm": 3.040032148361206,
+ "learning_rate": 3.548310897940638e-08,
+ "loss": 0.2703,
+ "step": 12014
+ },
+ {
+ "epoch": 5.680851063829787,
+ "grad_norm": 3.3257627487182617,
+ "learning_rate": 3.537845513679722e-08,
+ "loss": 0.2802,
+ "step": 12015
+ },
+ {
+ "epoch": 5.681323877068558,
+ "grad_norm": 3.1988141536712646,
+ "learning_rate": 3.527395475536488e-08,
+ "loss": 0.3512,
+ "step": 12016
+ },
+ {
+ "epoch": 5.681796690307329,
+ "grad_norm": 3.1683738231658936,
+ "learning_rate": 3.516960784161555e-08,
+ "loss": 0.2829,
+ "step": 12017
+ },
+ {
+ "epoch": 5.682269503546099,
+ "grad_norm": 3.181068181991577,
+ "learning_rate": 3.5065414402046805e-08,
+ "loss": 0.2705,
+ "step": 12018
+ },
+ {
+ "epoch": 5.68274231678487,
+ "grad_norm": 3.3845975399017334,
+ "learning_rate": 3.4961374443146235e-08,
+ "loss": 0.3357,
+ "step": 12019
+ },
+ {
+ "epoch": 5.68321513002364,
+ "grad_norm": 3.1401607990264893,
+ "learning_rate": 3.485748797139199e-08,
+ "loss": 0.342,
+ "step": 12020
+ },
+ {
+ "epoch": 5.683687943262411,
+ "grad_norm": 2.891299247741699,
+ "learning_rate": 3.4753754993253064e-08,
+ "loss": 0.3359,
+ "step": 12021
+ },
+ {
+ "epoch": 5.684160756501182,
+ "grad_norm": 3.0465214252471924,
+ "learning_rate": 3.465017551518762e-08,
+ "loss": 0.3198,
+ "step": 12022
+ },
+ {
+ "epoch": 5.684633569739953,
+ "grad_norm": 3.44197678565979,
+ "learning_rate": 3.4546749543645506e-08,
+ "loss": 0.3345,
+ "step": 12023
+ },
+ {
+ "epoch": 5.685106382978724,
+ "grad_norm": 3.3884904384613037,
+ "learning_rate": 3.4443477085066847e-08,
+ "loss": 0.316,
+ "step": 12024
+ },
+ {
+ "epoch": 5.685579196217494,
+ "grad_norm": 3.266526937484741,
+ "learning_rate": 3.434035814588177e-08,
+ "loss": 0.2804,
+ "step": 12025
+ },
+ {
+ "epoch": 5.686052009456265,
+ "grad_norm": 3.4479098320007324,
+ "learning_rate": 3.42373927325107e-08,
+ "loss": 0.3282,
+ "step": 12026
+ },
+ {
+ "epoch": 5.686524822695035,
+ "grad_norm": 3.1241097450256348,
+ "learning_rate": 3.4134580851365176e-08,
+ "loss": 0.3045,
+ "step": 12027
+ },
+ {
+ "epoch": 5.686997635933806,
+ "grad_norm": 3.032581090927124,
+ "learning_rate": 3.4031922508847014e-08,
+ "loss": 0.3118,
+ "step": 12028
+ },
+ {
+ "epoch": 5.687470449172577,
+ "grad_norm": 3.0940909385681152,
+ "learning_rate": 3.392941771134806e-08,
+ "loss": 0.3088,
+ "step": 12029
+ },
+ {
+ "epoch": 5.6879432624113475,
+ "grad_norm": 3.195840835571289,
+ "learning_rate": 3.382706646525069e-08,
+ "loss": 0.3332,
+ "step": 12030
+ },
+ {
+ "epoch": 5.688416075650118,
+ "grad_norm": 3.198594093322754,
+ "learning_rate": 3.3724868776928434e-08,
+ "loss": 0.3845,
+ "step": 12031
+ },
+ {
+ "epoch": 5.688888888888889,
+ "grad_norm": 3.1207070350646973,
+ "learning_rate": 3.362282465274397e-08,
+ "loss": 0.3079,
+ "step": 12032
+ },
+ {
+ "epoch": 5.68936170212766,
+ "grad_norm": 3.221534252166748,
+ "learning_rate": 3.352093409905138e-08,
+ "loss": 0.2855,
+ "step": 12033
+ },
+ {
+ "epoch": 5.68983451536643,
+ "grad_norm": 3.2485337257385254,
+ "learning_rate": 3.3419197122195325e-08,
+ "loss": 0.3146,
+ "step": 12034
+ },
+ {
+ "epoch": 5.690307328605201,
+ "grad_norm": 3.200204372406006,
+ "learning_rate": 3.331761372851017e-08,
+ "loss": 0.3504,
+ "step": 12035
+ },
+ {
+ "epoch": 5.690780141843971,
+ "grad_norm": 3.3995420932769775,
+ "learning_rate": 3.321618392432085e-08,
+ "loss": 0.3378,
+ "step": 12036
+ },
+ {
+ "epoch": 5.691252955082742,
+ "grad_norm": 2.949434757232666,
+ "learning_rate": 3.311490771594372e-08,
+ "loss": 0.3069,
+ "step": 12037
+ },
+ {
+ "epoch": 5.691725768321513,
+ "grad_norm": 3.4283101558685303,
+ "learning_rate": 3.3013785109684005e-08,
+ "loss": 0.3318,
+ "step": 12038
+ },
+ {
+ "epoch": 5.692198581560284,
+ "grad_norm": 3.3977811336517334,
+ "learning_rate": 3.291281611183861e-08,
+ "loss": 0.3437,
+ "step": 12039
+ },
+ {
+ "epoch": 5.692671394799055,
+ "grad_norm": 3.0468637943267822,
+ "learning_rate": 3.281200072869417e-08,
+ "loss": 0.2615,
+ "step": 12040
+ },
+ {
+ "epoch": 5.693144208037825,
+ "grad_norm": 3.370964527130127,
+ "learning_rate": 3.271133896652817e-08,
+ "loss": 0.2964,
+ "step": 12041
+ },
+ {
+ "epoch": 5.693617021276596,
+ "grad_norm": 3.3542773723602295,
+ "learning_rate": 3.261083083160838e-08,
+ "loss": 0.3612,
+ "step": 12042
+ },
+ {
+ "epoch": 5.694089834515366,
+ "grad_norm": 3.369004964828491,
+ "learning_rate": 3.251047633019283e-08,
+ "loss": 0.3721,
+ "step": 12043
+ },
+ {
+ "epoch": 5.694562647754137,
+ "grad_norm": 3.358621835708618,
+ "learning_rate": 3.2410275468530705e-08,
+ "loss": 0.3454,
+ "step": 12044
+ },
+ {
+ "epoch": 5.695035460992908,
+ "grad_norm": 2.8612661361694336,
+ "learning_rate": 3.231022825286034e-08,
+ "loss": 0.3404,
+ "step": 12045
+ },
+ {
+ "epoch": 5.6955082742316785,
+ "grad_norm": 3.1585562229156494,
+ "learning_rate": 3.221033468941176e-08,
+ "loss": 0.3204,
+ "step": 12046
+ },
+ {
+ "epoch": 5.695981087470449,
+ "grad_norm": 3.128363609313965,
+ "learning_rate": 3.2110594784404706e-08,
+ "loss": 0.3188,
+ "step": 12047
+ },
+ {
+ "epoch": 5.69645390070922,
+ "grad_norm": 3.4005603790283203,
+ "learning_rate": 3.2011008544049206e-08,
+ "loss": 0.3185,
+ "step": 12048
+ },
+ {
+ "epoch": 5.696926713947991,
+ "grad_norm": 3.1740269660949707,
+ "learning_rate": 3.191157597454669e-08,
+ "loss": 0.2825,
+ "step": 12049
+ },
+ {
+ "epoch": 5.697399527186761,
+ "grad_norm": 3.5193979740142822,
+ "learning_rate": 3.181229708208833e-08,
+ "loss": 0.3448,
+ "step": 12050
+ },
+ {
+ "epoch": 5.697872340425532,
+ "grad_norm": 3.2826907634735107,
+ "learning_rate": 3.1713171872855e-08,
+ "loss": 0.2779,
+ "step": 12051
+ },
+ {
+ "epoch": 5.698345153664302,
+ "grad_norm": 2.7100746631622314,
+ "learning_rate": 3.161420035301982e-08,
+ "loss": 0.2536,
+ "step": 12052
+ },
+ {
+ "epoch": 5.698817966903073,
+ "grad_norm": 3.387242317199707,
+ "learning_rate": 3.151538252874453e-08,
+ "loss": 0.3291,
+ "step": 12053
+ },
+ {
+ "epoch": 5.699290780141844,
+ "grad_norm": 3.3984251022338867,
+ "learning_rate": 3.1416718406182545e-08,
+ "loss": 0.2843,
+ "step": 12054
+ },
+ {
+ "epoch": 5.699763593380615,
+ "grad_norm": 3.0349318981170654,
+ "learning_rate": 3.1318207991477e-08,
+ "loss": 0.3216,
+ "step": 12055
+ },
+ {
+ "epoch": 5.700236406619386,
+ "grad_norm": 3.406585693359375,
+ "learning_rate": 3.12198512907616e-08,
+ "loss": 0.3496,
+ "step": 12056
+ },
+ {
+ "epoch": 5.700709219858156,
+ "grad_norm": 3.709542989730835,
+ "learning_rate": 3.112164831016118e-08,
+ "loss": 0.3285,
+ "step": 12057
+ },
+ {
+ "epoch": 5.701182033096927,
+ "grad_norm": 3.145622491836548,
+ "learning_rate": 3.102359905579e-08,
+ "loss": 0.3323,
+ "step": 12058
+ },
+ {
+ "epoch": 5.701654846335697,
+ "grad_norm": 3.2133283615112305,
+ "learning_rate": 3.0925703533752905e-08,
+ "loss": 0.2981,
+ "step": 12059
+ },
+ {
+ "epoch": 5.702127659574468,
+ "grad_norm": 3.4080936908721924,
+ "learning_rate": 3.082796175014585e-08,
+ "loss": 0.3176,
+ "step": 12060
+ },
+ {
+ "epoch": 5.702600472813239,
+ "grad_norm": 3.290942668914795,
+ "learning_rate": 3.073037371105453e-08,
+ "loss": 0.361,
+ "step": 12061
+ },
+ {
+ "epoch": 5.7030732860520095,
+ "grad_norm": 2.5797150135040283,
+ "learning_rate": 3.0632939422555466e-08,
+ "loss": 0.2767,
+ "step": 12062
+ },
+ {
+ "epoch": 5.70354609929078,
+ "grad_norm": 3.083340883255005,
+ "learning_rate": 3.053565889071547e-08,
+ "loss": 0.347,
+ "step": 12063
+ },
+ {
+ "epoch": 5.704018912529551,
+ "grad_norm": 3.1182093620300293,
+ "learning_rate": 3.043853212159165e-08,
+ "loss": 0.2469,
+ "step": 12064
+ },
+ {
+ "epoch": 5.704491725768322,
+ "grad_norm": 3.1092607975006104,
+ "learning_rate": 3.034155912123193e-08,
+ "loss": 0.3342,
+ "step": 12065
+ },
+ {
+ "epoch": 5.704964539007092,
+ "grad_norm": 3.1561546325683594,
+ "learning_rate": 3.0244739895673725e-08,
+ "loss": 0.3184,
+ "step": 12066
+ },
+ {
+ "epoch": 5.705437352245863,
+ "grad_norm": 3.974445343017578,
+ "learning_rate": 3.014807445094636e-08,
+ "loss": 0.3318,
+ "step": 12067
+ },
+ {
+ "epoch": 5.705910165484633,
+ "grad_norm": 2.7729475498199463,
+ "learning_rate": 3.0051562793068646e-08,
+ "loss": 0.2742,
+ "step": 12068
+ },
+ {
+ "epoch": 5.706382978723404,
+ "grad_norm": 3.2135114669799805,
+ "learning_rate": 2.99552049280491e-08,
+ "loss": 0.3302,
+ "step": 12069
+ },
+ {
+ "epoch": 5.706855791962175,
+ "grad_norm": 3.0425431728363037,
+ "learning_rate": 2.9859000861888757e-08,
+ "loss": 0.2774,
+ "step": 12070
+ },
+ {
+ "epoch": 5.707328605200946,
+ "grad_norm": 3.1506264209747314,
+ "learning_rate": 2.976295060057671e-08,
+ "loss": 0.3346,
+ "step": 12071
+ },
+ {
+ "epoch": 5.707801418439717,
+ "grad_norm": 3.517425537109375,
+ "learning_rate": 2.96670541500943e-08,
+ "loss": 0.3034,
+ "step": 12072
+ },
+ {
+ "epoch": 5.708274231678487,
+ "grad_norm": 3.0956437587738037,
+ "learning_rate": 2.9571311516412295e-08,
+ "loss": 0.3602,
+ "step": 12073
+ },
+ {
+ "epoch": 5.708747044917258,
+ "grad_norm": 3.1720614433288574,
+ "learning_rate": 2.9475722705492037e-08,
+ "loss": 0.3173,
+ "step": 12074
+ },
+ {
+ "epoch": 5.709219858156028,
+ "grad_norm": 2.999795436859131,
+ "learning_rate": 2.9380287723285994e-08,
+ "loss": 0.3113,
+ "step": 12075
+ },
+ {
+ "epoch": 5.709692671394799,
+ "grad_norm": 3.511803150177002,
+ "learning_rate": 2.9285006575735798e-08,
+ "loss": 0.2905,
+ "step": 12076
+ },
+ {
+ "epoch": 5.71016548463357,
+ "grad_norm": 3.7468740940093994,
+ "learning_rate": 2.918987926877448e-08,
+ "loss": 0.3837,
+ "step": 12077
+ },
+ {
+ "epoch": 5.7106382978723405,
+ "grad_norm": 3.5052263736724854,
+ "learning_rate": 2.9094905808325358e-08,
+ "loss": 0.3343,
+ "step": 12078
+ },
+ {
+ "epoch": 5.711111111111111,
+ "grad_norm": 3.2323622703552246,
+ "learning_rate": 2.9000086200302036e-08,
+ "loss": 0.3023,
+ "step": 12079
+ },
+ {
+ "epoch": 5.711583924349882,
+ "grad_norm": 2.9456934928894043,
+ "learning_rate": 2.8905420450608122e-08,
+ "loss": 0.3137,
+ "step": 12080
+ },
+ {
+ "epoch": 5.712056737588653,
+ "grad_norm": 3.1025915145874023,
+ "learning_rate": 2.881090856513835e-08,
+ "loss": 0.3041,
+ "step": 12081
+ },
+ {
+ "epoch": 5.712529550827423,
+ "grad_norm": 2.798013687133789,
+ "learning_rate": 2.8716550549777177e-08,
+ "loss": 0.2928,
+ "step": 12082
+ },
+ {
+ "epoch": 5.7130023640661936,
+ "grad_norm": 3.002685070037842,
+ "learning_rate": 2.8622346410400183e-08,
+ "loss": 0.2916,
+ "step": 12083
+ },
+ {
+ "epoch": 5.713475177304964,
+ "grad_norm": 3.4977736473083496,
+ "learning_rate": 2.8528296152873225e-08,
+ "loss": 0.3135,
+ "step": 12084
+ },
+ {
+ "epoch": 5.713947990543735,
+ "grad_norm": 3.023202419281006,
+ "learning_rate": 2.843439978305218e-08,
+ "loss": 0.3818,
+ "step": 12085
+ },
+ {
+ "epoch": 5.714420803782506,
+ "grad_norm": 3.6928040981292725,
+ "learning_rate": 2.834065730678348e-08,
+ "loss": 0.3388,
+ "step": 12086
+ },
+ {
+ "epoch": 5.714893617021277,
+ "grad_norm": 3.3528921604156494,
+ "learning_rate": 2.8247068729904126e-08,
+ "loss": 0.3263,
+ "step": 12087
+ },
+ {
+ "epoch": 5.7153664302600475,
+ "grad_norm": 3.0671277046203613,
+ "learning_rate": 2.8153634058241397e-08,
+ "loss": 0.3563,
+ "step": 12088
+ },
+ {
+ "epoch": 5.715839243498818,
+ "grad_norm": 3.4238786697387695,
+ "learning_rate": 2.806035329761314e-08,
+ "loss": 0.3039,
+ "step": 12089
+ },
+ {
+ "epoch": 5.716312056737589,
+ "grad_norm": 3.6761975288391113,
+ "learning_rate": 2.7967226453827767e-08,
+ "loss": 0.2712,
+ "step": 12090
+ },
+ {
+ "epoch": 5.716784869976359,
+ "grad_norm": 3.2580909729003906,
+ "learning_rate": 2.787425353268369e-08,
+ "loss": 0.2774,
+ "step": 12091
+ },
+ {
+ "epoch": 5.71725768321513,
+ "grad_norm": 3.3871915340423584,
+ "learning_rate": 2.7781434539969897e-08,
+ "loss": 0.3026,
+ "step": 12092
+ },
+ {
+ "epoch": 5.717730496453901,
+ "grad_norm": 3.752911329269409,
+ "learning_rate": 2.768876948146565e-08,
+ "loss": 0.3212,
+ "step": 12093
+ },
+ {
+ "epoch": 5.718203309692671,
+ "grad_norm": 3.4740591049194336,
+ "learning_rate": 2.7596258362941052e-08,
+ "loss": 0.377,
+ "step": 12094
+ },
+ {
+ "epoch": 5.718676122931442,
+ "grad_norm": 3.0362772941589355,
+ "learning_rate": 2.7503901190156502e-08,
+ "loss": 0.3247,
+ "step": 12095
+ },
+ {
+ "epoch": 5.719148936170213,
+ "grad_norm": 3.3042995929718018,
+ "learning_rate": 2.7411697968862117e-08,
+ "loss": 0.3184,
+ "step": 12096
+ },
+ {
+ "epoch": 5.719621749408984,
+ "grad_norm": 2.989022731781006,
+ "learning_rate": 2.731964870479942e-08,
+ "loss": 0.3252,
+ "step": 12097
+ },
+ {
+ "epoch": 5.720094562647754,
+ "grad_norm": 3.2194690704345703,
+ "learning_rate": 2.722775340369993e-08,
+ "loss": 0.3424,
+ "step": 12098
+ },
+ {
+ "epoch": 5.7205673758865245,
+ "grad_norm": 3.509066581726074,
+ "learning_rate": 2.713601207128547e-08,
+ "loss": 0.3128,
+ "step": 12099
+ },
+ {
+ "epoch": 5.721040189125295,
+ "grad_norm": 3.277519941329956,
+ "learning_rate": 2.7044424713268124e-08,
+ "loss": 0.3411,
+ "step": 12100
+ },
+ {
+ "epoch": 5.721513002364066,
+ "grad_norm": 3.0034539699554443,
+ "learning_rate": 2.695299133535112e-08,
+ "loss": 0.3048,
+ "step": 12101
+ },
+ {
+ "epoch": 5.721985815602837,
+ "grad_norm": 3.2064356803894043,
+ "learning_rate": 2.686171194322712e-08,
+ "loss": 0.3154,
+ "step": 12102
+ },
+ {
+ "epoch": 5.722458628841608,
+ "grad_norm": 3.3201537132263184,
+ "learning_rate": 2.6770586542579914e-08,
+ "loss": 0.3004,
+ "step": 12103
+ },
+ {
+ "epoch": 5.7229314420803785,
+ "grad_norm": 2.984123706817627,
+ "learning_rate": 2.6679615139083847e-08,
+ "loss": 0.2802,
+ "step": 12104
+ },
+ {
+ "epoch": 5.723404255319149,
+ "grad_norm": 3.4404940605163574,
+ "learning_rate": 2.658879773840245e-08,
+ "loss": 0.3032,
+ "step": 12105
+ },
+ {
+ "epoch": 5.72387706855792,
+ "grad_norm": 3.38144850730896,
+ "learning_rate": 2.6498134346191474e-08,
+ "loss": 0.3569,
+ "step": 12106
+ },
+ {
+ "epoch": 5.72434988179669,
+ "grad_norm": 3.053532838821411,
+ "learning_rate": 2.6407624968095568e-08,
+ "loss": 0.3277,
+ "step": 12107
+ },
+ {
+ "epoch": 5.724822695035461,
+ "grad_norm": 3.94144606590271,
+ "learning_rate": 2.6317269609750507e-08,
+ "loss": 0.355,
+ "step": 12108
+ },
+ {
+ "epoch": 5.725295508274232,
+ "grad_norm": 2.9377691745758057,
+ "learning_rate": 2.622706827678234e-08,
+ "loss": 0.3036,
+ "step": 12109
+ },
+ {
+ "epoch": 5.725768321513002,
+ "grad_norm": 3.1034493446350098,
+ "learning_rate": 2.6137020974807415e-08,
+ "loss": 0.2807,
+ "step": 12110
+ },
+ {
+ "epoch": 5.726241134751773,
+ "grad_norm": 3.27827787399292,
+ "learning_rate": 2.604712770943263e-08,
+ "loss": 0.3221,
+ "step": 12111
+ },
+ {
+ "epoch": 5.726713947990544,
+ "grad_norm": 3.3290865421295166,
+ "learning_rate": 2.5957388486255175e-08,
+ "loss": 0.3486,
+ "step": 12112
+ },
+ {
+ "epoch": 5.727186761229315,
+ "grad_norm": 3.1709330081939697,
+ "learning_rate": 2.5867803310862806e-08,
+ "loss": 0.3082,
+ "step": 12113
+ },
+ {
+ "epoch": 5.727659574468085,
+ "grad_norm": 3.3823163509368896,
+ "learning_rate": 2.5778372188833555e-08,
+ "loss": 0.3593,
+ "step": 12114
+ },
+ {
+ "epoch": 5.7281323877068555,
+ "grad_norm": 3.1143476963043213,
+ "learning_rate": 2.568909512573603e-08,
+ "loss": 0.3166,
+ "step": 12115
+ },
+ {
+ "epoch": 5.728605200945626,
+ "grad_norm": 3.666964054107666,
+ "learning_rate": 2.5599972127128834e-08,
+ "loss": 0.3397,
+ "step": 12116
+ },
+ {
+ "epoch": 5.729078014184397,
+ "grad_norm": 3.3389058113098145,
+ "learning_rate": 2.5511003198561424e-08,
+ "loss": 0.3055,
+ "step": 12117
+ },
+ {
+ "epoch": 5.729550827423168,
+ "grad_norm": 2.8332576751708984,
+ "learning_rate": 2.5422188345573807e-08,
+ "loss": 0.3106,
+ "step": 12118
+ },
+ {
+ "epoch": 5.730023640661939,
+ "grad_norm": 3.3573875427246094,
+ "learning_rate": 2.5333527573695728e-08,
+ "loss": 0.3222,
+ "step": 12119
+ },
+ {
+ "epoch": 5.7304964539007095,
+ "grad_norm": 3.2714033126831055,
+ "learning_rate": 2.5245020888447768e-08,
+ "loss": 0.3462,
+ "step": 12120
+ },
+ {
+ "epoch": 5.73096926713948,
+ "grad_norm": 2.817124605178833,
+ "learning_rate": 2.515666829534108e-08,
+ "loss": 0.2802,
+ "step": 12121
+ },
+ {
+ "epoch": 5.73144208037825,
+ "grad_norm": 3.2658917903900146,
+ "learning_rate": 2.506846979987654e-08,
+ "loss": 0.3319,
+ "step": 12122
+ },
+ {
+ "epoch": 5.731914893617021,
+ "grad_norm": 3.310659408569336,
+ "learning_rate": 2.498042540754614e-08,
+ "loss": 0.3271,
+ "step": 12123
+ },
+ {
+ "epoch": 5.732387706855792,
+ "grad_norm": 3.0620813369750977,
+ "learning_rate": 2.489253512383216e-08,
+ "loss": 0.338,
+ "step": 12124
+ },
+ {
+ "epoch": 5.732860520094563,
+ "grad_norm": 2.6192173957824707,
+ "learning_rate": 2.48047989542069e-08,
+ "loss": 0.2672,
+ "step": 12125
+ },
+ {
+ "epoch": 5.733333333333333,
+ "grad_norm": 3.0177063941955566,
+ "learning_rate": 2.4717216904133478e-08,
+ "loss": 0.3229,
+ "step": 12126
+ },
+ {
+ "epoch": 5.733806146572104,
+ "grad_norm": 3.2077372074127197,
+ "learning_rate": 2.462978897906504e-08,
+ "loss": 0.3353,
+ "step": 12127
+ },
+ {
+ "epoch": 5.734278959810875,
+ "grad_norm": 3.0624101161956787,
+ "learning_rate": 2.4542515184445836e-08,
+ "loss": 0.2807,
+ "step": 12128
+ },
+ {
+ "epoch": 5.734751773049645,
+ "grad_norm": 3.681156635284424,
+ "learning_rate": 2.44553955257093e-08,
+ "loss": 0.3494,
+ "step": 12129
+ },
+ {
+ "epoch": 5.735224586288416,
+ "grad_norm": 3.5133650302886963,
+ "learning_rate": 2.4368430008280265e-08,
+ "loss": 0.3255,
+ "step": 12130
+ },
+ {
+ "epoch": 5.7356973995271865,
+ "grad_norm": 3.5622475147247314,
+ "learning_rate": 2.4281618637574113e-08,
+ "loss": 0.3481,
+ "step": 12131
+ },
+ {
+ "epoch": 5.736170212765957,
+ "grad_norm": 3.804569959640503,
+ "learning_rate": 2.4194961418995687e-08,
+ "loss": 0.3104,
+ "step": 12132
+ },
+ {
+ "epoch": 5.736643026004728,
+ "grad_norm": 3.2981555461883545,
+ "learning_rate": 2.4108458357940957e-08,
+ "loss": 0.3233,
+ "step": 12133
+ },
+ {
+ "epoch": 5.737115839243499,
+ "grad_norm": 3.428004026412964,
+ "learning_rate": 2.4022109459796162e-08,
+ "loss": 0.3342,
+ "step": 12134
+ },
+ {
+ "epoch": 5.73758865248227,
+ "grad_norm": 2.9092800617218018,
+ "learning_rate": 2.393591472993756e-08,
+ "loss": 0.3019,
+ "step": 12135
+ },
+ {
+ "epoch": 5.7380614657210405,
+ "grad_norm": 3.9165987968444824,
+ "learning_rate": 2.3849874173732246e-08,
+ "loss": 0.3334,
+ "step": 12136
+ },
+ {
+ "epoch": 5.738534278959811,
+ "grad_norm": 3.541109561920166,
+ "learning_rate": 2.3763987796538156e-08,
+ "loss": 0.2841,
+ "step": 12137
+ },
+ {
+ "epoch": 5.739007092198581,
+ "grad_norm": 3.3009915351867676,
+ "learning_rate": 2.3678255603702127e-08,
+ "loss": 0.3493,
+ "step": 12138
+ },
+ {
+ "epoch": 5.739479905437352,
+ "grad_norm": 3.619457721710205,
+ "learning_rate": 2.359267760056294e-08,
+ "loss": 0.3453,
+ "step": 12139
+ },
+ {
+ "epoch": 5.739952718676123,
+ "grad_norm": 2.9422662258148193,
+ "learning_rate": 2.3507253792448835e-08,
+ "loss": 0.2815,
+ "step": 12140
+ },
+ {
+ "epoch": 5.740425531914894,
+ "grad_norm": 3.0050137042999268,
+ "learning_rate": 2.342198418467917e-08,
+ "loss": 0.3371,
+ "step": 12141
+ },
+ {
+ "epoch": 5.740898345153664,
+ "grad_norm": 3.315720796585083,
+ "learning_rate": 2.3336868782563026e-08,
+ "loss": 0.2809,
+ "step": 12142
+ },
+ {
+ "epoch": 5.741371158392435,
+ "grad_norm": 3.035090684890747,
+ "learning_rate": 2.3251907591400335e-08,
+ "loss": 0.2758,
+ "step": 12143
+ },
+ {
+ "epoch": 5.741843971631206,
+ "grad_norm": 3.3625571727752686,
+ "learning_rate": 2.316710061648103e-08,
+ "loss": 0.3082,
+ "step": 12144
+ },
+ {
+ "epoch": 5.742316784869976,
+ "grad_norm": 3.2533843517303467,
+ "learning_rate": 2.3082447863085887e-08,
+ "loss": 0.36,
+ "step": 12145
+ },
+ {
+ "epoch": 5.742789598108747,
+ "grad_norm": 3.4281437397003174,
+ "learning_rate": 2.2997949336485693e-08,
+ "loss": 0.3048,
+ "step": 12146
+ },
+ {
+ "epoch": 5.7432624113475175,
+ "grad_norm": 3.467841148376465,
+ "learning_rate": 2.291360504194179e-08,
+ "loss": 0.3086,
+ "step": 12147
+ },
+ {
+ "epoch": 5.743735224586288,
+ "grad_norm": 3.471487283706665,
+ "learning_rate": 2.282941498470581e-08,
+ "loss": 0.3201,
+ "step": 12148
+ },
+ {
+ "epoch": 5.744208037825059,
+ "grad_norm": 3.9886951446533203,
+ "learning_rate": 2.274537917002051e-08,
+ "loss": 0.3493,
+ "step": 12149
+ },
+ {
+ "epoch": 5.74468085106383,
+ "grad_norm": 3.6634814739227295,
+ "learning_rate": 2.266149760311781e-08,
+ "loss": 0.2909,
+ "step": 12150
+ },
+ {
+ "epoch": 5.745153664302601,
+ "grad_norm": 3.017439126968384,
+ "learning_rate": 2.2577770289220758e-08,
+ "loss": 0.3173,
+ "step": 12151
+ },
+ {
+ "epoch": 5.7456264775413715,
+ "grad_norm": 3.0666286945343018,
+ "learning_rate": 2.2494197233542957e-08,
+ "loss": 0.3455,
+ "step": 12152
+ },
+ {
+ "epoch": 5.746099290780142,
+ "grad_norm": 3.200896978378296,
+ "learning_rate": 2.241077844128775e-08,
+ "loss": 0.373,
+ "step": 12153
+ },
+ {
+ "epoch": 5.746572104018912,
+ "grad_norm": 3.3661558628082275,
+ "learning_rate": 2.232751391764959e-08,
+ "loss": 0.3274,
+ "step": 12154
+ },
+ {
+ "epoch": 5.747044917257683,
+ "grad_norm": 3.8031365871429443,
+ "learning_rate": 2.2244403667812942e-08,
+ "loss": 0.383,
+ "step": 12155
+ },
+ {
+ "epoch": 5.747517730496454,
+ "grad_norm": 3.1112916469573975,
+ "learning_rate": 2.2161447696952555e-08,
+ "loss": 0.3553,
+ "step": 12156
+ },
+ {
+ "epoch": 5.7479905437352246,
+ "grad_norm": 3.2407584190368652,
+ "learning_rate": 2.2078646010234016e-08,
+ "loss": 0.3317,
+ "step": 12157
+ },
+ {
+ "epoch": 5.748463356973995,
+ "grad_norm": 3.176625967025757,
+ "learning_rate": 2.1995998612812654e-08,
+ "loss": 0.3131,
+ "step": 12158
+ },
+ {
+ "epoch": 5.748936170212766,
+ "grad_norm": 3.056150197982788,
+ "learning_rate": 2.19135055098349e-08,
+ "loss": 0.3119,
+ "step": 12159
+ },
+ {
+ "epoch": 5.749408983451537,
+ "grad_norm": 3.1598548889160156,
+ "learning_rate": 2.1831166706436925e-08,
+ "loss": 0.2552,
+ "step": 12160
+ },
+ {
+ "epoch": 5.749881796690307,
+ "grad_norm": 3.460984230041504,
+ "learning_rate": 2.174898220774574e-08,
+ "loss": 0.3239,
+ "step": 12161
+ },
+ {
+ "epoch": 5.750354609929078,
+ "grad_norm": 3.1156563758850098,
+ "learning_rate": 2.166695201887864e-08,
+ "loss": 0.3393,
+ "step": 12162
+ },
+ {
+ "epoch": 5.7508274231678485,
+ "grad_norm": 3.162257671356201,
+ "learning_rate": 2.1585076144943208e-08,
+ "loss": 0.3006,
+ "step": 12163
+ },
+ {
+ "epoch": 5.751300236406619,
+ "grad_norm": 3.323511838912964,
+ "learning_rate": 2.1503354591037585e-08,
+ "loss": 0.2765,
+ "step": 12164
+ },
+ {
+ "epoch": 5.75177304964539,
+ "grad_norm": 3.358081817626953,
+ "learning_rate": 2.1421787362250478e-08,
+ "loss": 0.3604,
+ "step": 12165
+ },
+ {
+ "epoch": 5.752245862884161,
+ "grad_norm": 3.3027939796447754,
+ "learning_rate": 2.1340374463660053e-08,
+ "loss": 0.345,
+ "step": 12166
+ },
+ {
+ "epoch": 5.752718676122932,
+ "grad_norm": 3.702924966812134,
+ "learning_rate": 2.1259115900336137e-08,
+ "loss": 0.3644,
+ "step": 12167
+ },
+ {
+ "epoch": 5.753191489361702,
+ "grad_norm": 3.689192056655884,
+ "learning_rate": 2.117801167733774e-08,
+ "loss": 0.3382,
+ "step": 12168
+ },
+ {
+ "epoch": 5.753664302600473,
+ "grad_norm": 3.0726840496063232,
+ "learning_rate": 2.1097061799715824e-08,
+ "loss": 0.3438,
+ "step": 12169
+ },
+ {
+ "epoch": 5.754137115839243,
+ "grad_norm": 3.3648977279663086,
+ "learning_rate": 2.1016266272509967e-08,
+ "loss": 0.3409,
+ "step": 12170
+ },
+ {
+ "epoch": 5.754609929078014,
+ "grad_norm": 4.000081539154053,
+ "learning_rate": 2.0935625100751144e-08,
+ "loss": 0.3031,
+ "step": 12171
+ },
+ {
+ "epoch": 5.755082742316785,
+ "grad_norm": 3.1993863582611084,
+ "learning_rate": 2.085513828946062e-08,
+ "loss": 0.3335,
+ "step": 12172
+ },
+ {
+ "epoch": 5.7555555555555555,
+ "grad_norm": 4.029566287994385,
+ "learning_rate": 2.0774805843649937e-08,
+ "loss": 0.3427,
+ "step": 12173
+ },
+ {
+ "epoch": 5.756028368794326,
+ "grad_norm": 3.3171730041503906,
+ "learning_rate": 2.069462776832093e-08,
+ "loss": 0.2913,
+ "step": 12174
+ },
+ {
+ "epoch": 5.756501182033097,
+ "grad_norm": 2.9666988849639893,
+ "learning_rate": 2.0614604068465993e-08,
+ "loss": 0.3242,
+ "step": 12175
+ },
+ {
+ "epoch": 5.756973995271868,
+ "grad_norm": 3.070720672607422,
+ "learning_rate": 2.0534734749068087e-08,
+ "loss": 0.2847,
+ "step": 12176
+ },
+ {
+ "epoch": 5.757446808510638,
+ "grad_norm": 3.260769844055176,
+ "learning_rate": 2.0455019815099897e-08,
+ "loss": 0.3313,
+ "step": 12177
+ },
+ {
+ "epoch": 5.757919621749409,
+ "grad_norm": 3.9504573345184326,
+ "learning_rate": 2.0375459271525232e-08,
+ "loss": 0.3834,
+ "step": 12178
+ },
+ {
+ "epoch": 5.758392434988179,
+ "grad_norm": 3.1657321453094482,
+ "learning_rate": 2.029605312329791e-08,
+ "loss": 0.3396,
+ "step": 12179
+ },
+ {
+ "epoch": 5.75886524822695,
+ "grad_norm": 3.1386959552764893,
+ "learning_rate": 2.0216801375362304e-08,
+ "loss": 0.2789,
+ "step": 12180
+ },
+ {
+ "epoch": 5.759338061465721,
+ "grad_norm": 3.144196033477783,
+ "learning_rate": 2.0137704032652528e-08,
+ "loss": 0.3203,
+ "step": 12181
+ },
+ {
+ "epoch": 5.759810874704492,
+ "grad_norm": 3.669904947280884,
+ "learning_rate": 2.005876110009436e-08,
+ "loss": 0.3259,
+ "step": 12182
+ },
+ {
+ "epoch": 5.760283687943263,
+ "grad_norm": 3.2438392639160156,
+ "learning_rate": 1.9979972582603034e-08,
+ "loss": 0.3269,
+ "step": 12183
+ },
+ {
+ "epoch": 5.760756501182033,
+ "grad_norm": 3.5162949562072754,
+ "learning_rate": 1.9901338485084075e-08,
+ "loss": 0.3881,
+ "step": 12184
+ },
+ {
+ "epoch": 5.761229314420804,
+ "grad_norm": 3.1763012409210205,
+ "learning_rate": 1.9822858812433844e-08,
+ "loss": 0.3194,
+ "step": 12185
+ },
+ {
+ "epoch": 5.761702127659574,
+ "grad_norm": 2.9922640323638916,
+ "learning_rate": 1.974453356953898e-08,
+ "loss": 0.336,
+ "step": 12186
+ },
+ {
+ "epoch": 5.762174940898345,
+ "grad_norm": 3.721003293991089,
+ "learning_rate": 1.9666362761276425e-08,
+ "loss": 0.3112,
+ "step": 12187
+ },
+ {
+ "epoch": 5.762647754137116,
+ "grad_norm": 3.0573668479919434,
+ "learning_rate": 1.958834639251367e-08,
+ "loss": 0.2955,
+ "step": 12188
+ },
+ {
+ "epoch": 5.7631205673758865,
+ "grad_norm": 3.172262191772461,
+ "learning_rate": 1.9510484468108215e-08,
+ "loss": 0.3028,
+ "step": 12189
+ },
+ {
+ "epoch": 5.763593380614657,
+ "grad_norm": 3.4394562244415283,
+ "learning_rate": 1.943277699290841e-08,
+ "loss": 0.3573,
+ "step": 12190
+ },
+ {
+ "epoch": 5.764066193853428,
+ "grad_norm": 3.005544900894165,
+ "learning_rate": 1.9355223971752324e-08,
+ "loss": 0.2809,
+ "step": 12191
+ },
+ {
+ "epoch": 5.764539007092199,
+ "grad_norm": 3.416400194168091,
+ "learning_rate": 1.927782540946943e-08,
+ "loss": 0.2926,
+ "step": 12192
+ },
+ {
+ "epoch": 5.765011820330969,
+ "grad_norm": 2.908393144607544,
+ "learning_rate": 1.9200581310878373e-08,
+ "loss": 0.3074,
+ "step": 12193
+ },
+ {
+ "epoch": 5.76548463356974,
+ "grad_norm": 2.976839542388916,
+ "learning_rate": 1.9123491680789473e-08,
+ "loss": 0.285,
+ "step": 12194
+ },
+ {
+ "epoch": 5.76595744680851,
+ "grad_norm": 3.4792447090148926,
+ "learning_rate": 1.9046556524002225e-08,
+ "loss": 0.367,
+ "step": 12195
+ },
+ {
+ "epoch": 5.766430260047281,
+ "grad_norm": 3.3577182292938232,
+ "learning_rate": 1.8969775845307238e-08,
+ "loss": 0.3373,
+ "step": 12196
+ },
+ {
+ "epoch": 5.766903073286052,
+ "grad_norm": 3.4297826290130615,
+ "learning_rate": 1.889314964948541e-08,
+ "loss": 0.3967,
+ "step": 12197
+ },
+ {
+ "epoch": 5.767375886524823,
+ "grad_norm": 3.2004456520080566,
+ "learning_rate": 1.8816677941307925e-08,
+ "loss": 0.32,
+ "step": 12198
+ },
+ {
+ "epoch": 5.767848699763594,
+ "grad_norm": 3.613758087158203,
+ "learning_rate": 1.8740360725535977e-08,
+ "loss": 0.3547,
+ "step": 12199
+ },
+ {
+ "epoch": 5.768321513002364,
+ "grad_norm": 3.1576809883117676,
+ "learning_rate": 1.8664198006921875e-08,
+ "loss": 0.3024,
+ "step": 12200
+ },
+ {
+ "epoch": 5.768794326241135,
+ "grad_norm": 3.2107057571411133,
+ "learning_rate": 1.8588189790207934e-08,
+ "loss": 0.3262,
+ "step": 12201
+ },
+ {
+ "epoch": 5.769267139479905,
+ "grad_norm": 3.1241042613983154,
+ "learning_rate": 1.851233608012648e-08,
+ "loss": 0.3157,
+ "step": 12202
+ },
+ {
+ "epoch": 5.769739952718676,
+ "grad_norm": 3.1932709217071533,
+ "learning_rate": 1.8436636881401238e-08,
+ "loss": 0.3348,
+ "step": 12203
+ },
+ {
+ "epoch": 5.770212765957447,
+ "grad_norm": 3.298868417739868,
+ "learning_rate": 1.8361092198745102e-08,
+ "loss": 0.3168,
+ "step": 12204
+ },
+ {
+ "epoch": 5.7706855791962175,
+ "grad_norm": 3.7978901863098145,
+ "learning_rate": 1.8285702036862086e-08,
+ "loss": 0.3432,
+ "step": 12205
+ },
+ {
+ "epoch": 5.771158392434988,
+ "grad_norm": 3.0930938720703125,
+ "learning_rate": 1.821046640044649e-08,
+ "loss": 0.3154,
+ "step": 12206
+ },
+ {
+ "epoch": 5.771631205673759,
+ "grad_norm": 3.205141067504883,
+ "learning_rate": 1.8135385294182904e-08,
+ "loss": 0.3347,
+ "step": 12207
+ },
+ {
+ "epoch": 5.77210401891253,
+ "grad_norm": 3.1829981803894043,
+ "learning_rate": 1.80604587227462e-08,
+ "loss": 0.2739,
+ "step": 12208
+ },
+ {
+ "epoch": 5.7725768321513,
+ "grad_norm": 3.0086073875427246,
+ "learning_rate": 1.798568669080153e-08,
+ "loss": 0.2861,
+ "step": 12209
+ },
+ {
+ "epoch": 5.773049645390071,
+ "grad_norm": 3.0506770610809326,
+ "learning_rate": 1.7911069203004895e-08,
+ "loss": 0.2738,
+ "step": 12210
+ },
+ {
+ "epoch": 5.773522458628841,
+ "grad_norm": 3.29156231880188,
+ "learning_rate": 1.7836606264002577e-08,
+ "loss": 0.34,
+ "step": 12211
+ },
+ {
+ "epoch": 5.773995271867612,
+ "grad_norm": 3.8883163928985596,
+ "learning_rate": 1.776229787843059e-08,
+ "loss": 0.2994,
+ "step": 12212
+ },
+ {
+ "epoch": 5.774468085106383,
+ "grad_norm": 3.5105197429656982,
+ "learning_rate": 1.7688144050916066e-08,
+ "loss": 0.3567,
+ "step": 12213
+ },
+ {
+ "epoch": 5.774940898345154,
+ "grad_norm": 3.268084764480591,
+ "learning_rate": 1.761414478607615e-08,
+ "loss": 0.2715,
+ "step": 12214
+ },
+ {
+ "epoch": 5.775413711583925,
+ "grad_norm": 3.4444260597229004,
+ "learning_rate": 1.7540300088518814e-08,
+ "loss": 0.3666,
+ "step": 12215
+ },
+ {
+ "epoch": 5.775886524822695,
+ "grad_norm": 3.3592841625213623,
+ "learning_rate": 1.7466609962841497e-08,
+ "loss": 0.3023,
+ "step": 12216
+ },
+ {
+ "epoch": 5.776359338061466,
+ "grad_norm": 2.869286060333252,
+ "learning_rate": 1.739307441363275e-08,
+ "loss": 0.2475,
+ "step": 12217
+ },
+ {
+ "epoch": 5.776832151300236,
+ "grad_norm": 3.429243326187134,
+ "learning_rate": 1.731969344547141e-08,
+ "loss": 0.3671,
+ "step": 12218
+ },
+ {
+ "epoch": 5.777304964539007,
+ "grad_norm": 3.57275128364563,
+ "learning_rate": 1.7246467062926598e-08,
+ "loss": 0.3131,
+ "step": 12219
+ },
+ {
+ "epoch": 5.777777777777778,
+ "grad_norm": 3.172421455383301,
+ "learning_rate": 1.7173395270557445e-08,
+ "loss": 0.339,
+ "step": 12220
+ },
+ {
+ "epoch": 5.7782505910165485,
+ "grad_norm": 3.8230342864990234,
+ "learning_rate": 1.7100478072914483e-08,
+ "loss": 0.3199,
+ "step": 12221
+ },
+ {
+ "epoch": 5.778723404255319,
+ "grad_norm": 2.9650866985321045,
+ "learning_rate": 1.702771547453741e-08,
+ "loss": 0.3053,
+ "step": 12222
+ },
+ {
+ "epoch": 5.77919621749409,
+ "grad_norm": 3.665842056274414,
+ "learning_rate": 1.6955107479957045e-08,
+ "loss": 0.3845,
+ "step": 12223
+ },
+ {
+ "epoch": 5.779669030732861,
+ "grad_norm": 3.350053548812866,
+ "learning_rate": 1.6882654093694495e-08,
+ "loss": 0.3402,
+ "step": 12224
+ },
+ {
+ "epoch": 5.780141843971631,
+ "grad_norm": 3.5756731033325195,
+ "learning_rate": 1.6810355320260597e-08,
+ "loss": 0.3388,
+ "step": 12225
+ },
+ {
+ "epoch": 5.780614657210402,
+ "grad_norm": 3.5228781700134277,
+ "learning_rate": 1.673821116415758e-08,
+ "loss": 0.3381,
+ "step": 12226
+ },
+ {
+ "epoch": 5.781087470449172,
+ "grad_norm": 3.166459798812866,
+ "learning_rate": 1.666622162987713e-08,
+ "loss": 0.3039,
+ "step": 12227
+ },
+ {
+ "epoch": 5.781560283687943,
+ "grad_norm": 3.1998205184936523,
+ "learning_rate": 1.6594386721902335e-08,
+ "loss": 0.2946,
+ "step": 12228
+ },
+ {
+ "epoch": 5.782033096926714,
+ "grad_norm": 3.009943962097168,
+ "learning_rate": 1.652270644470544e-08,
+ "loss": 0.2878,
+ "step": 12229
+ },
+ {
+ "epoch": 5.782505910165485,
+ "grad_norm": 3.5119593143463135,
+ "learning_rate": 1.645118080274982e-08,
+ "loss": 0.3112,
+ "step": 12230
+ },
+ {
+ "epoch": 5.782978723404256,
+ "grad_norm": 3.491492986679077,
+ "learning_rate": 1.637980980048942e-08,
+ "loss": 0.3286,
+ "step": 12231
+ },
+ {
+ "epoch": 5.783451536643026,
+ "grad_norm": 3.200819730758667,
+ "learning_rate": 1.6308593442367625e-08,
+ "loss": 0.3425,
+ "step": 12232
+ },
+ {
+ "epoch": 5.783924349881797,
+ "grad_norm": 3.3669025897979736,
+ "learning_rate": 1.6237531732819222e-08,
+ "loss": 0.346,
+ "step": 12233
+ },
+ {
+ "epoch": 5.784397163120567,
+ "grad_norm": 3.1921918392181396,
+ "learning_rate": 1.6166624676268727e-08,
+ "loss": 0.3124,
+ "step": 12234
+ },
+ {
+ "epoch": 5.784869976359338,
+ "grad_norm": 2.852695941925049,
+ "learning_rate": 1.609587227713122e-08,
+ "loss": 0.3012,
+ "step": 12235
+ },
+ {
+ "epoch": 5.785342789598109,
+ "grad_norm": 3.1297335624694824,
+ "learning_rate": 1.602527453981234e-08,
+ "loss": 0.3346,
+ "step": 12236
+ },
+ {
+ "epoch": 5.7858156028368795,
+ "grad_norm": 3.3038461208343506,
+ "learning_rate": 1.5954831468707467e-08,
+ "loss": 0.2996,
+ "step": 12237
+ },
+ {
+ "epoch": 5.78628841607565,
+ "grad_norm": 3.753922462463379,
+ "learning_rate": 1.5884543068203086e-08,
+ "loss": 0.3139,
+ "step": 12238
+ },
+ {
+ "epoch": 5.786761229314421,
+ "grad_norm": 3.3022499084472656,
+ "learning_rate": 1.5814409342675695e-08,
+ "loss": 0.3602,
+ "step": 12239
+ },
+ {
+ "epoch": 5.787234042553192,
+ "grad_norm": 3.618237257003784,
+ "learning_rate": 1.5744430296492084e-08,
+ "loss": 0.3555,
+ "step": 12240
+ },
+ {
+ "epoch": 5.787706855791962,
+ "grad_norm": 3.0856997966766357,
+ "learning_rate": 1.567460593400988e-08,
+ "loss": 0.2944,
+ "step": 12241
+ },
+ {
+ "epoch": 5.7881796690307326,
+ "grad_norm": 3.40259051322937,
+ "learning_rate": 1.5604936259576432e-08,
+ "loss": 0.3086,
+ "step": 12242
+ },
+ {
+ "epoch": 5.788652482269503,
+ "grad_norm": 3.634979248046875,
+ "learning_rate": 1.553542127752994e-08,
+ "loss": 0.3264,
+ "step": 12243
+ },
+ {
+ "epoch": 5.789125295508274,
+ "grad_norm": 3.5231590270996094,
+ "learning_rate": 1.5466060992198605e-08,
+ "loss": 0.3841,
+ "step": 12244
+ },
+ {
+ "epoch": 5.789598108747045,
+ "grad_norm": 3.14682936668396,
+ "learning_rate": 1.5396855407901202e-08,
+ "loss": 0.3367,
+ "step": 12245
+ },
+ {
+ "epoch": 5.790070921985816,
+ "grad_norm": 3.124629020690918,
+ "learning_rate": 1.5327804528947333e-08,
+ "loss": 0.3139,
+ "step": 12246
+ },
+ {
+ "epoch": 5.7905437352245865,
+ "grad_norm": 3.116682291030884,
+ "learning_rate": 1.525890835963578e-08,
+ "loss": 0.304,
+ "step": 12247
+ },
+ {
+ "epoch": 5.791016548463357,
+ "grad_norm": 3.085913896560669,
+ "learning_rate": 1.5190166904256732e-08,
+ "loss": 0.2882,
+ "step": 12248
+ },
+ {
+ "epoch": 5.791489361702128,
+ "grad_norm": 3.3703277111053467,
+ "learning_rate": 1.5121580167090642e-08,
+ "loss": 0.3308,
+ "step": 12249
+ },
+ {
+ "epoch": 5.791962174940898,
+ "grad_norm": 3.1782917976379395,
+ "learning_rate": 1.5053148152407714e-08,
+ "loss": 0.3246,
+ "step": 12250
+ },
+ {
+ "epoch": 5.792434988179669,
+ "grad_norm": 3.1751880645751953,
+ "learning_rate": 1.498487086446926e-08,
+ "loss": 0.3074,
+ "step": 12251
+ },
+ {
+ "epoch": 5.79290780141844,
+ "grad_norm": 3.3677165508270264,
+ "learning_rate": 1.491674830752632e-08,
+ "loss": 0.3022,
+ "step": 12252
+ },
+ {
+ "epoch": 5.79338061465721,
+ "grad_norm": 2.866220474243164,
+ "learning_rate": 1.484878048582078e-08,
+ "loss": 0.2842,
+ "step": 12253
+ },
+ {
+ "epoch": 5.793853427895981,
+ "grad_norm": 3.148810386657715,
+ "learning_rate": 1.4780967403584534e-08,
+ "loss": 0.3069,
+ "step": 12254
+ },
+ {
+ "epoch": 5.794326241134752,
+ "grad_norm": 3.145711898803711,
+ "learning_rate": 1.4713309065040039e-08,
+ "loss": 0.3266,
+ "step": 12255
+ },
+ {
+ "epoch": 5.794799054373523,
+ "grad_norm": 3.4532599449157715,
+ "learning_rate": 1.4645805474400032e-08,
+ "loss": 0.3314,
+ "step": 12256
+ },
+ {
+ "epoch": 5.795271867612293,
+ "grad_norm": 2.9855432510375977,
+ "learning_rate": 1.45784566358681e-08,
+ "loss": 0.3385,
+ "step": 12257
+ },
+ {
+ "epoch": 5.7957446808510635,
+ "grad_norm": 3.3268442153930664,
+ "learning_rate": 1.4511262553637274e-08,
+ "loss": 0.3553,
+ "step": 12258
+ },
+ {
+ "epoch": 5.796217494089834,
+ "grad_norm": 2.8287642002105713,
+ "learning_rate": 1.444422323189143e-08,
+ "loss": 0.2748,
+ "step": 12259
+ },
+ {
+ "epoch": 5.796690307328605,
+ "grad_norm": 3.328403949737549,
+ "learning_rate": 1.4377338674805007e-08,
+ "loss": 0.3055,
+ "step": 12260
+ },
+ {
+ "epoch": 5.797163120567376,
+ "grad_norm": 3.004183530807495,
+ "learning_rate": 1.4310608886542732e-08,
+ "loss": 0.3025,
+ "step": 12261
+ },
+ {
+ "epoch": 5.797635933806147,
+ "grad_norm": 3.0779333114624023,
+ "learning_rate": 1.424403387125961e-08,
+ "loss": 0.276,
+ "step": 12262
+ },
+ {
+ "epoch": 5.7981087470449175,
+ "grad_norm": 3.425168037414551,
+ "learning_rate": 1.4177613633100384e-08,
+ "loss": 0.3771,
+ "step": 12263
+ },
+ {
+ "epoch": 5.798581560283688,
+ "grad_norm": 3.3431761264801025,
+ "learning_rate": 1.411134817620119e-08,
+ "loss": 0.3604,
+ "step": 12264
+ },
+ {
+ "epoch": 5.799054373522459,
+ "grad_norm": 3.708122491836548,
+ "learning_rate": 1.404523750468817e-08,
+ "loss": 0.3647,
+ "step": 12265
+ },
+ {
+ "epoch": 5.799527186761229,
+ "grad_norm": 3.414480447769165,
+ "learning_rate": 1.3979281622677753e-08,
+ "loss": 0.3148,
+ "step": 12266
+ },
+ {
+ "epoch": 5.8,
+ "grad_norm": 3.335441827774048,
+ "learning_rate": 1.391348053427638e-08,
+ "loss": 0.2949,
+ "step": 12267
+ },
+ {
+ "epoch": 5.800472813238771,
+ "grad_norm": 3.2817869186401367,
+ "learning_rate": 1.3847834243581603e-08,
+ "loss": 0.3031,
+ "step": 12268
+ },
+ {
+ "epoch": 5.800945626477541,
+ "grad_norm": 3.3932013511657715,
+ "learning_rate": 1.378234275468071e-08,
+ "loss": 0.3035,
+ "step": 12269
+ },
+ {
+ "epoch": 5.801418439716312,
+ "grad_norm": 3.355982780456543,
+ "learning_rate": 1.3717006071651551e-08,
+ "loss": 0.3207,
+ "step": 12270
+ },
+ {
+ "epoch": 5.801891252955083,
+ "grad_norm": 3.441032886505127,
+ "learning_rate": 1.3651824198562258e-08,
+ "loss": 0.2979,
+ "step": 12271
+ },
+ {
+ "epoch": 5.802364066193854,
+ "grad_norm": 3.2429022789001465,
+ "learning_rate": 1.3586797139471807e-08,
+ "loss": 0.315,
+ "step": 12272
+ },
+ {
+ "epoch": 5.802836879432624,
+ "grad_norm": 2.954455852508545,
+ "learning_rate": 1.352192489842863e-08,
+ "loss": 0.3364,
+ "step": 12273
+ },
+ {
+ "epoch": 5.8033096926713945,
+ "grad_norm": 3.57834792137146,
+ "learning_rate": 1.3457207479472545e-08,
+ "loss": 0.3548,
+ "step": 12274
+ },
+ {
+ "epoch": 5.803782505910165,
+ "grad_norm": 3.009730577468872,
+ "learning_rate": 1.3392644886633111e-08,
+ "loss": 0.2929,
+ "step": 12275
+ },
+ {
+ "epoch": 5.804255319148936,
+ "grad_norm": 3.3504159450531006,
+ "learning_rate": 1.3328237123929888e-08,
+ "loss": 0.3276,
+ "step": 12276
+ },
+ {
+ "epoch": 5.804728132387707,
+ "grad_norm": 3.3094518184661865,
+ "learning_rate": 1.3263984195373836e-08,
+ "loss": 0.3314,
+ "step": 12277
+ },
+ {
+ "epoch": 5.805200945626478,
+ "grad_norm": 3.4735429286956787,
+ "learning_rate": 1.3199886104965088e-08,
+ "loss": 0.3502,
+ "step": 12278
+ },
+ {
+ "epoch": 5.8056737588652485,
+ "grad_norm": 3.1584722995758057,
+ "learning_rate": 1.3135942856695728e-08,
+ "loss": 0.3202,
+ "step": 12279
+ },
+ {
+ "epoch": 5.806146572104019,
+ "grad_norm": 3.2173585891723633,
+ "learning_rate": 1.3072154454546181e-08,
+ "loss": 0.3285,
+ "step": 12280
+ },
+ {
+ "epoch": 5.80661938534279,
+ "grad_norm": 3.024911403656006,
+ "learning_rate": 1.300852090248883e-08,
+ "loss": 0.3164,
+ "step": 12281
+ },
+ {
+ "epoch": 5.80709219858156,
+ "grad_norm": 3.0423598289489746,
+ "learning_rate": 1.2945042204486058e-08,
+ "loss": 0.3111,
+ "step": 12282
+ },
+ {
+ "epoch": 5.807565011820331,
+ "grad_norm": 3.4012279510498047,
+ "learning_rate": 1.2881718364489704e-08,
+ "loss": 0.3103,
+ "step": 12283
+ },
+ {
+ "epoch": 5.808037825059102,
+ "grad_norm": 3.214341878890991,
+ "learning_rate": 1.281854938644328e-08,
+ "loss": 0.3463,
+ "step": 12284
+ },
+ {
+ "epoch": 5.808510638297872,
+ "grad_norm": 3.1404919624328613,
+ "learning_rate": 1.2755535274279751e-08,
+ "loss": 0.3568,
+ "step": 12285
+ },
+ {
+ "epoch": 5.808983451536643,
+ "grad_norm": 2.9638466835021973,
+ "learning_rate": 1.2692676031922924e-08,
+ "loss": 0.2977,
+ "step": 12286
+ },
+ {
+ "epoch": 5.809456264775414,
+ "grad_norm": 3.213879108428955,
+ "learning_rate": 1.2629971663286611e-08,
+ "loss": 0.317,
+ "step": 12287
+ },
+ {
+ "epoch": 5.809929078014184,
+ "grad_norm": 3.2124996185302734,
+ "learning_rate": 1.2567422172274912e-08,
+ "loss": 0.3242,
+ "step": 12288
+ },
+ {
+ "epoch": 5.810401891252955,
+ "grad_norm": 3.4042906761169434,
+ "learning_rate": 1.2505027562783046e-08,
+ "loss": 0.3923,
+ "step": 12289
+ },
+ {
+ "epoch": 5.8108747044917255,
+ "grad_norm": 3.2831614017486572,
+ "learning_rate": 1.244278783869568e-08,
+ "loss": 0.3182,
+ "step": 12290
+ },
+ {
+ "epoch": 5.811347517730496,
+ "grad_norm": 3.221754789352417,
+ "learning_rate": 1.2380703003888328e-08,
+ "loss": 0.3148,
+ "step": 12291
+ },
+ {
+ "epoch": 5.811820330969267,
+ "grad_norm": 3.3793656826019287,
+ "learning_rate": 1.2318773062226786e-08,
+ "loss": 0.302,
+ "step": 12292
+ },
+ {
+ "epoch": 5.812293144208038,
+ "grad_norm": 3.3612375259399414,
+ "learning_rate": 1.2256998017566857e-08,
+ "loss": 0.3948,
+ "step": 12293
+ },
+ {
+ "epoch": 5.812765957446809,
+ "grad_norm": 3.12095308303833,
+ "learning_rate": 1.2195377873755466e-08,
+ "loss": 0.3008,
+ "step": 12294
+ },
+ {
+ "epoch": 5.8132387706855795,
+ "grad_norm": 3.474125385284424,
+ "learning_rate": 1.2133912634628987e-08,
+ "loss": 0.3286,
+ "step": 12295
+ },
+ {
+ "epoch": 5.81371158392435,
+ "grad_norm": 3.325859308242798,
+ "learning_rate": 1.2072602304014914e-08,
+ "loss": 0.3191,
+ "step": 12296
+ },
+ {
+ "epoch": 5.81418439716312,
+ "grad_norm": 3.4525535106658936,
+ "learning_rate": 1.2011446885730748e-08,
+ "loss": 0.3264,
+ "step": 12297
+ },
+ {
+ "epoch": 5.814657210401891,
+ "grad_norm": 3.212451934814453,
+ "learning_rate": 1.195044638358428e-08,
+ "loss": 0.3474,
+ "step": 12298
+ },
+ {
+ "epoch": 5.815130023640662,
+ "grad_norm": 3.5907411575317383,
+ "learning_rate": 1.1889600801373579e-08,
+ "loss": 0.3518,
+ "step": 12299
+ },
+ {
+ "epoch": 5.815602836879433,
+ "grad_norm": 3.051485538482666,
+ "learning_rate": 1.1828910142887562e-08,
+ "loss": 0.3446,
+ "step": 12300
+ },
+ {
+ "epoch": 5.816075650118203,
+ "grad_norm": 2.894104480743408,
+ "learning_rate": 1.1768374411905147e-08,
+ "loss": 0.2675,
+ "step": 12301
+ },
+ {
+ "epoch": 5.816548463356974,
+ "grad_norm": 3.1474084854125977,
+ "learning_rate": 1.1707993612195267e-08,
+ "loss": 0.3322,
+ "step": 12302
+ },
+ {
+ "epoch": 5.817021276595745,
+ "grad_norm": 3.2870020866394043,
+ "learning_rate": 1.1647767747517691e-08,
+ "loss": 0.3422,
+ "step": 12303
+ },
+ {
+ "epoch": 5.817494089834515,
+ "grad_norm": 3.2131540775299072,
+ "learning_rate": 1.1587696821622751e-08,
+ "loss": 0.3539,
+ "step": 12304
+ },
+ {
+ "epoch": 5.817966903073286,
+ "grad_norm": 3.316063165664673,
+ "learning_rate": 1.1527780838250513e-08,
+ "loss": 0.3317,
+ "step": 12305
+ },
+ {
+ "epoch": 5.8184397163120565,
+ "grad_norm": 3.1365461349487305,
+ "learning_rate": 1.1468019801131603e-08,
+ "loss": 0.2794,
+ "step": 12306
+ },
+ {
+ "epoch": 5.818912529550827,
+ "grad_norm": 3.479088544845581,
+ "learning_rate": 1.140841371398721e-08,
+ "loss": 0.3171,
+ "step": 12307
+ },
+ {
+ "epoch": 5.819385342789598,
+ "grad_norm": 3.3644347190856934,
+ "learning_rate": 1.1348962580529087e-08,
+ "loss": 0.3226,
+ "step": 12308
+ },
+ {
+ "epoch": 5.819858156028369,
+ "grad_norm": 3.183668375015259,
+ "learning_rate": 1.1289666404458166e-08,
+ "loss": 0.3176,
+ "step": 12309
+ },
+ {
+ "epoch": 5.82033096926714,
+ "grad_norm": 3.087696075439453,
+ "learning_rate": 1.1230525189467323e-08,
+ "loss": 0.3031,
+ "step": 12310
+ },
+ {
+ "epoch": 5.8208037825059105,
+ "grad_norm": 3.1411244869232178,
+ "learning_rate": 1.1171538939238614e-08,
+ "loss": 0.3757,
+ "step": 12311
+ },
+ {
+ "epoch": 5.821276595744681,
+ "grad_norm": 3.7644147872924805,
+ "learning_rate": 1.111270765744521e-08,
+ "loss": 0.3617,
+ "step": 12312
+ },
+ {
+ "epoch": 5.821749408983451,
+ "grad_norm": 3.530128240585327,
+ "learning_rate": 1.1054031347750016e-08,
+ "loss": 0.2842,
+ "step": 12313
+ },
+ {
+ "epoch": 5.822222222222222,
+ "grad_norm": 3.054417848587036,
+ "learning_rate": 1.0995510013806499e-08,
+ "loss": 0.2845,
+ "step": 12314
+ },
+ {
+ "epoch": 5.822695035460993,
+ "grad_norm": 3.606204032897949,
+ "learning_rate": 1.0937143659258686e-08,
+ "loss": 0.3339,
+ "step": 12315
+ },
+ {
+ "epoch": 5.823167848699764,
+ "grad_norm": 3.1252198219299316,
+ "learning_rate": 1.0878932287740618e-08,
+ "loss": 0.3387,
+ "step": 12316
+ },
+ {
+ "epoch": 5.823640661938534,
+ "grad_norm": 3.3949391841888428,
+ "learning_rate": 1.082087590287717e-08,
+ "loss": 0.292,
+ "step": 12317
+ },
+ {
+ "epoch": 5.824113475177305,
+ "grad_norm": 3.2029218673706055,
+ "learning_rate": 1.0762974508282954e-08,
+ "loss": 0.3053,
+ "step": 12318
+ },
+ {
+ "epoch": 5.824586288416076,
+ "grad_norm": 3.2031943798065186,
+ "learning_rate": 1.0705228107563138e-08,
+ "loss": 0.3362,
+ "step": 12319
+ },
+ {
+ "epoch": 5.825059101654846,
+ "grad_norm": 3.107954502105713,
+ "learning_rate": 1.0647636704313736e-08,
+ "loss": 0.3196,
+ "step": 12320
+ },
+ {
+ "epoch": 5.825531914893617,
+ "grad_norm": 3.4354865550994873,
+ "learning_rate": 1.0590200302120767e-08,
+ "loss": 0.3165,
+ "step": 12321
+ },
+ {
+ "epoch": 5.8260047281323875,
+ "grad_norm": 3.4960227012634277,
+ "learning_rate": 1.0532918904559709e-08,
+ "loss": 0.3052,
+ "step": 12322
+ },
+ {
+ "epoch": 5.826477541371158,
+ "grad_norm": 3.085123062133789,
+ "learning_rate": 1.0475792515198258e-08,
+ "loss": 0.3575,
+ "step": 12323
+ },
+ {
+ "epoch": 5.826950354609929,
+ "grad_norm": 3.4668917655944824,
+ "learning_rate": 1.0418821137592738e-08,
+ "loss": 0.3975,
+ "step": 12324
+ },
+ {
+ "epoch": 5.8274231678487,
+ "grad_norm": 4.228496074676514,
+ "learning_rate": 1.0362004775290868e-08,
+ "loss": 0.3641,
+ "step": 12325
+ },
+ {
+ "epoch": 5.827895981087471,
+ "grad_norm": 3.3614673614501953,
+ "learning_rate": 1.0305343431830372e-08,
+ "loss": 0.3568,
+ "step": 12326
+ },
+ {
+ "epoch": 5.828368794326241,
+ "grad_norm": 3.3814778327941895,
+ "learning_rate": 1.0248837110738708e-08,
+ "loss": 0.3703,
+ "step": 12327
+ },
+ {
+ "epoch": 5.828841607565012,
+ "grad_norm": 3.4079811573028564,
+ "learning_rate": 1.0192485815535003e-08,
+ "loss": 0.3243,
+ "step": 12328
+ },
+ {
+ "epoch": 5.829314420803782,
+ "grad_norm": 2.895949363708496,
+ "learning_rate": 1.013628954972784e-08,
+ "loss": 0.2907,
+ "step": 12329
+ },
+ {
+ "epoch": 5.829787234042553,
+ "grad_norm": 3.183469295501709,
+ "learning_rate": 1.0080248316816089e-08,
+ "loss": 0.2966,
+ "step": 12330
+ },
+ {
+ "epoch": 5.830260047281324,
+ "grad_norm": 3.2694790363311768,
+ "learning_rate": 1.0024362120289178e-08,
+ "loss": 0.337,
+ "step": 12331
+ },
+ {
+ "epoch": 5.8307328605200945,
+ "grad_norm": 3.0339701175689697,
+ "learning_rate": 9.968630963627101e-09,
+ "loss": 0.3384,
+ "step": 12332
+ },
+ {
+ "epoch": 5.831205673758865,
+ "grad_norm": 3.1445910930633545,
+ "learning_rate": 9.913054850300142e-09,
+ "loss": 0.3344,
+ "step": 12333
+ },
+ {
+ "epoch": 5.831678486997636,
+ "grad_norm": 2.9692158699035645,
+ "learning_rate": 9.857633783768306e-09,
+ "loss": 0.3272,
+ "step": 12334
+ },
+ {
+ "epoch": 5.832151300236407,
+ "grad_norm": 2.62898588180542,
+ "learning_rate": 9.802367767482723e-09,
+ "loss": 0.2624,
+ "step": 12335
+ },
+ {
+ "epoch": 5.832624113475177,
+ "grad_norm": 3.2836642265319824,
+ "learning_rate": 9.74725680488453e-09,
+ "loss": 0.2886,
+ "step": 12336
+ },
+ {
+ "epoch": 5.833096926713948,
+ "grad_norm": 3.3103747367858887,
+ "learning_rate": 9.69230089940515e-09,
+ "loss": 0.3477,
+ "step": 12337
+ },
+ {
+ "epoch": 5.833569739952718,
+ "grad_norm": 3.3658392429351807,
+ "learning_rate": 9.637500054466564e-09,
+ "loss": 0.3145,
+ "step": 12338
+ },
+ {
+ "epoch": 5.834042553191489,
+ "grad_norm": 2.9636175632476807,
+ "learning_rate": 9.582854273481047e-09,
+ "loss": 0.3039,
+ "step": 12339
+ },
+ {
+ "epoch": 5.83451536643026,
+ "grad_norm": 3.365917444229126,
+ "learning_rate": 9.528363559850873e-09,
+ "loss": 0.3193,
+ "step": 12340
+ },
+ {
+ "epoch": 5.834988179669031,
+ "grad_norm": 3.360271453857422,
+ "learning_rate": 9.474027916969164e-09,
+ "loss": 0.3712,
+ "step": 12341
+ },
+ {
+ "epoch": 5.835460992907802,
+ "grad_norm": 3.8133535385131836,
+ "learning_rate": 9.419847348219047e-09,
+ "loss": 0.3263,
+ "step": 12342
+ },
+ {
+ "epoch": 5.835933806146572,
+ "grad_norm": 3.439425468444824,
+ "learning_rate": 9.365821856974489e-09,
+ "loss": 0.3077,
+ "step": 12343
+ },
+ {
+ "epoch": 5.836406619385343,
+ "grad_norm": 3.4898314476013184,
+ "learning_rate": 9.31195144659891e-09,
+ "loss": 0.3472,
+ "step": 12344
+ },
+ {
+ "epoch": 5.836879432624113,
+ "grad_norm": 2.860374689102173,
+ "learning_rate": 9.258236120446573e-09,
+ "loss": 0.3145,
+ "step": 12345
+ },
+ {
+ "epoch": 5.837352245862884,
+ "grad_norm": 3.079913854598999,
+ "learning_rate": 9.204675881862579e-09,
+ "loss": 0.3144,
+ "step": 12346
+ },
+ {
+ "epoch": 5.837825059101655,
+ "grad_norm": 3.398139715194702,
+ "learning_rate": 9.151270734181482e-09,
+ "loss": 0.2983,
+ "step": 12347
+ },
+ {
+ "epoch": 5.8382978723404255,
+ "grad_norm": 3.1991748809814453,
+ "learning_rate": 9.09802068072868e-09,
+ "loss": 0.3142,
+ "step": 12348
+ },
+ {
+ "epoch": 5.838770685579196,
+ "grad_norm": 4.70399808883667,
+ "learning_rate": 9.044925724819852e-09,
+ "loss": 0.3238,
+ "step": 12349
+ },
+ {
+ "epoch": 5.839243498817967,
+ "grad_norm": 3.3638081550598145,
+ "learning_rate": 8.991985869760966e-09,
+ "loss": 0.3358,
+ "step": 12350
+ },
+ {
+ "epoch": 5.839716312056738,
+ "grad_norm": 2.6990113258361816,
+ "learning_rate": 8.93920111884855e-09,
+ "loss": 0.292,
+ "step": 12351
+ },
+ {
+ "epoch": 5.840189125295508,
+ "grad_norm": 3.119417428970337,
+ "learning_rate": 8.886571475368865e-09,
+ "loss": 0.3018,
+ "step": 12352
+ },
+ {
+ "epoch": 5.840661938534279,
+ "grad_norm": 3.0563628673553467,
+ "learning_rate": 8.834096942599568e-09,
+ "loss": 0.303,
+ "step": 12353
+ },
+ {
+ "epoch": 5.841134751773049,
+ "grad_norm": 3.306051015853882,
+ "learning_rate": 8.781777523807212e-09,
+ "loss": 0.3207,
+ "step": 12354
+ },
+ {
+ "epoch": 5.84160756501182,
+ "grad_norm": 3.476424217224121,
+ "learning_rate": 8.729613222250022e-09,
+ "loss": 0.2788,
+ "step": 12355
+ },
+ {
+ "epoch": 5.842080378250591,
+ "grad_norm": 2.975752353668213,
+ "learning_rate": 8.677604041175957e-09,
+ "loss": 0.3066,
+ "step": 12356
+ },
+ {
+ "epoch": 5.842553191489362,
+ "grad_norm": 2.9331040382385254,
+ "learning_rate": 8.625749983823261e-09,
+ "loss": 0.28,
+ "step": 12357
+ },
+ {
+ "epoch": 5.843026004728133,
+ "grad_norm": 3.269192695617676,
+ "learning_rate": 8.574051053421017e-09,
+ "loss": 0.3116,
+ "step": 12358
+ },
+ {
+ "epoch": 5.843498817966903,
+ "grad_norm": 3.3965625762939453,
+ "learning_rate": 8.522507253188039e-09,
+ "loss": 0.337,
+ "step": 12359
+ },
+ {
+ "epoch": 5.843971631205674,
+ "grad_norm": 3.294217348098755,
+ "learning_rate": 8.471118586333426e-09,
+ "loss": 0.3315,
+ "step": 12360
+ },
+ {
+ "epoch": 5.844444444444444,
+ "grad_norm": 3.662712812423706,
+ "learning_rate": 8.419885056057398e-09,
+ "loss": 0.3138,
+ "step": 12361
+ },
+ {
+ "epoch": 5.844917257683215,
+ "grad_norm": 2.9847161769866943,
+ "learning_rate": 8.3688066655499e-09,
+ "loss": 0.306,
+ "step": 12362
+ },
+ {
+ "epoch": 5.845390070921986,
+ "grad_norm": 3.5540273189544678,
+ "learning_rate": 8.317883417991168e-09,
+ "loss": 0.3661,
+ "step": 12363
+ },
+ {
+ "epoch": 5.8458628841607565,
+ "grad_norm": 3.339153528213501,
+ "learning_rate": 8.267115316552555e-09,
+ "loss": 0.4123,
+ "step": 12364
+ },
+ {
+ "epoch": 5.846335697399527,
+ "grad_norm": 2.979498863220215,
+ "learning_rate": 8.216502364394307e-09,
+ "loss": 0.2785,
+ "step": 12365
+ },
+ {
+ "epoch": 5.846808510638298,
+ "grad_norm": 3.3412187099456787,
+ "learning_rate": 8.166044564668629e-09,
+ "loss": 0.2999,
+ "step": 12366
+ },
+ {
+ "epoch": 5.847281323877069,
+ "grad_norm": 3.178144693374634,
+ "learning_rate": 8.115741920516895e-09,
+ "loss": 0.2632,
+ "step": 12367
+ },
+ {
+ "epoch": 5.847754137115839,
+ "grad_norm": 3.2854394912719727,
+ "learning_rate": 8.065594435071044e-09,
+ "loss": 0.4104,
+ "step": 12368
+ },
+ {
+ "epoch": 5.84822695035461,
+ "grad_norm": 3.0959322452545166,
+ "learning_rate": 8.015602111454135e-09,
+ "loss": 0.279,
+ "step": 12369
+ },
+ {
+ "epoch": 5.84869976359338,
+ "grad_norm": 3.244652509689331,
+ "learning_rate": 7.965764952778677e-09,
+ "loss": 0.3887,
+ "step": 12370
+ },
+ {
+ "epoch": 5.849172576832151,
+ "grad_norm": 2.9431116580963135,
+ "learning_rate": 7.916082962147742e-09,
+ "loss": 0.2984,
+ "step": 12371
+ },
+ {
+ "epoch": 5.849645390070922,
+ "grad_norm": 3.488229513168335,
+ "learning_rate": 7.866556142654691e-09,
+ "loss": 0.3402,
+ "step": 12372
+ },
+ {
+ "epoch": 5.850118203309693,
+ "grad_norm": 3.5097646713256836,
+ "learning_rate": 7.817184497383723e-09,
+ "loss": 0.3445,
+ "step": 12373
+ },
+ {
+ "epoch": 5.850591016548464,
+ "grad_norm": 3.3530054092407227,
+ "learning_rate": 7.767968029408767e-09,
+ "loss": 0.3033,
+ "step": 12374
+ },
+ {
+ "epoch": 5.851063829787234,
+ "grad_norm": 3.4520263671875,
+ "learning_rate": 7.718906741794041e-09,
+ "loss": 0.3666,
+ "step": 12375
+ },
+ {
+ "epoch": 5.851536643026005,
+ "grad_norm": 3.006080389022827,
+ "learning_rate": 7.670000637594876e-09,
+ "loss": 0.2951,
+ "step": 12376
+ },
+ {
+ "epoch": 5.852009456264775,
+ "grad_norm": 3.80802059173584,
+ "learning_rate": 7.621249719856338e-09,
+ "loss": 0.3493,
+ "step": 12377
+ },
+ {
+ "epoch": 5.852482269503546,
+ "grad_norm": 2.9159436225891113,
+ "learning_rate": 7.572653991613777e-09,
+ "loss": 0.3175,
+ "step": 12378
+ },
+ {
+ "epoch": 5.852955082742317,
+ "grad_norm": 3.0945873260498047,
+ "learning_rate": 7.524213455893104e-09,
+ "loss": 0.3494,
+ "step": 12379
+ },
+ {
+ "epoch": 5.8534278959810875,
+ "grad_norm": 3.1333816051483154,
+ "learning_rate": 7.475928115710518e-09,
+ "loss": 0.3044,
+ "step": 12380
+ },
+ {
+ "epoch": 5.853900709219858,
+ "grad_norm": 3.30086350440979,
+ "learning_rate": 7.427797974072226e-09,
+ "loss": 0.3642,
+ "step": 12381
+ },
+ {
+ "epoch": 5.854373522458629,
+ "grad_norm": 3.21052622795105,
+ "learning_rate": 7.37982303397583e-09,
+ "loss": 0.364,
+ "step": 12382
+ },
+ {
+ "epoch": 5.8548463356974,
+ "grad_norm": 2.969539165496826,
+ "learning_rate": 7.3320032984075526e-09,
+ "loss": 0.2997,
+ "step": 12383
+ },
+ {
+ "epoch": 5.85531914893617,
+ "grad_norm": 3.7157652378082275,
+ "learning_rate": 7.284338770345567e-09,
+ "loss": 0.3246,
+ "step": 12384
+ },
+ {
+ "epoch": 5.855791962174941,
+ "grad_norm": 3.2974917888641357,
+ "learning_rate": 7.236829452757776e-09,
+ "loss": 0.3296,
+ "step": 12385
+ },
+ {
+ "epoch": 5.856264775413711,
+ "grad_norm": 3.3709514141082764,
+ "learning_rate": 7.189475348601815e-09,
+ "loss": 0.3505,
+ "step": 12386
+ },
+ {
+ "epoch": 5.856737588652482,
+ "grad_norm": 3.266540050506592,
+ "learning_rate": 7.142276460826991e-09,
+ "loss": 0.327,
+ "step": 12387
+ },
+ {
+ "epoch": 5.857210401891253,
+ "grad_norm": 3.181105613708496,
+ "learning_rate": 7.095232792371509e-09,
+ "loss": 0.3706,
+ "step": 12388
+ },
+ {
+ "epoch": 5.857683215130024,
+ "grad_norm": 3.1525869369506836,
+ "learning_rate": 7.048344346164693e-09,
+ "loss": 0.3277,
+ "step": 12389
+ },
+ {
+ "epoch": 5.858156028368795,
+ "grad_norm": 2.794084310531616,
+ "learning_rate": 7.001611125126429e-09,
+ "loss": 0.2824,
+ "step": 12390
+ },
+ {
+ "epoch": 5.858628841607565,
+ "grad_norm": 3.000612258911133,
+ "learning_rate": 6.955033132166333e-09,
+ "loss": 0.2764,
+ "step": 12391
+ },
+ {
+ "epoch": 5.859101654846336,
+ "grad_norm": 3.364813804626465,
+ "learning_rate": 6.908610370184587e-09,
+ "loss": 0.3611,
+ "step": 12392
+ },
+ {
+ "epoch": 5.859574468085106,
+ "grad_norm": 3.3845462799072266,
+ "learning_rate": 6.862342842071934e-09,
+ "loss": 0.2859,
+ "step": 12393
+ },
+ {
+ "epoch": 5.860047281323877,
+ "grad_norm": 3.277588367462158,
+ "learning_rate": 6.816230550709124e-09,
+ "loss": 0.2971,
+ "step": 12394
+ },
+ {
+ "epoch": 5.860520094562648,
+ "grad_norm": 3.191347599029541,
+ "learning_rate": 6.770273498967195e-09,
+ "loss": 0.345,
+ "step": 12395
+ },
+ {
+ "epoch": 5.8609929078014185,
+ "grad_norm": 3.713665246963501,
+ "learning_rate": 6.724471689708023e-09,
+ "loss": 0.2481,
+ "step": 12396
+ },
+ {
+ "epoch": 5.861465721040189,
+ "grad_norm": 4.008725166320801,
+ "learning_rate": 6.678825125783217e-09,
+ "loss": 0.3994,
+ "step": 12397
+ },
+ {
+ "epoch": 5.86193853427896,
+ "grad_norm": 3.227405548095703,
+ "learning_rate": 6.633333810034948e-09,
+ "loss": 0.3209,
+ "step": 12398
+ },
+ {
+ "epoch": 5.862411347517731,
+ "grad_norm": 3.2580344676971436,
+ "learning_rate": 6.587997745295949e-09,
+ "loss": 0.2656,
+ "step": 12399
+ },
+ {
+ "epoch": 5.862884160756501,
+ "grad_norm": 3.393422842025757,
+ "learning_rate": 6.54281693438924e-09,
+ "loss": 0.3324,
+ "step": 12400
+ },
+ {
+ "epoch": 5.863356973995272,
+ "grad_norm": 3.3529460430145264,
+ "learning_rate": 6.497791380127572e-09,
+ "loss": 0.3522,
+ "step": 12401
+ },
+ {
+ "epoch": 5.863829787234042,
+ "grad_norm": 3.7341339588165283,
+ "learning_rate": 6.4529210853150895e-09,
+ "loss": 0.3344,
+ "step": 12402
+ },
+ {
+ "epoch": 5.864302600472813,
+ "grad_norm": 3.214383363723755,
+ "learning_rate": 6.408206052745114e-09,
+ "loss": 0.3159,
+ "step": 12403
+ },
+ {
+ "epoch": 5.864775413711584,
+ "grad_norm": 3.2047109603881836,
+ "learning_rate": 6.363646285202085e-09,
+ "loss": 0.332,
+ "step": 12404
+ },
+ {
+ "epoch": 5.865248226950355,
+ "grad_norm": 3.943801164627075,
+ "learning_rate": 6.319241785460728e-09,
+ "loss": 0.2709,
+ "step": 12405
+ },
+ {
+ "epoch": 5.8657210401891255,
+ "grad_norm": 3.622441053390503,
+ "learning_rate": 6.274992556285497e-09,
+ "loss": 0.3297,
+ "step": 12406
+ },
+ {
+ "epoch": 5.866193853427896,
+ "grad_norm": 3.2042336463928223,
+ "learning_rate": 6.230898600431967e-09,
+ "loss": 0.3415,
+ "step": 12407
+ },
+ {
+ "epoch": 5.866666666666667,
+ "grad_norm": 3.1624488830566406,
+ "learning_rate": 6.186959920645163e-09,
+ "loss": 0.2946,
+ "step": 12408
+ },
+ {
+ "epoch": 5.867139479905437,
+ "grad_norm": 3.4413647651672363,
+ "learning_rate": 6.143176519661786e-09,
+ "loss": 0.3362,
+ "step": 12409
+ },
+ {
+ "epoch": 5.867612293144208,
+ "grad_norm": 2.8282558917999268,
+ "learning_rate": 6.099548400207434e-09,
+ "loss": 0.3343,
+ "step": 12410
+ },
+ {
+ "epoch": 5.868085106382979,
+ "grad_norm": 3.3495492935180664,
+ "learning_rate": 6.05607556499882e-09,
+ "loss": 0.3359,
+ "step": 12411
+ },
+ {
+ "epoch": 5.868557919621749,
+ "grad_norm": 3.4601523876190186,
+ "learning_rate": 6.01275801674267e-09,
+ "loss": 0.3307,
+ "step": 12412
+ },
+ {
+ "epoch": 5.86903073286052,
+ "grad_norm": 3.019951581954956,
+ "learning_rate": 5.969595758136271e-09,
+ "loss": 0.2836,
+ "step": 12413
+ },
+ {
+ "epoch": 5.869503546099291,
+ "grad_norm": 3.4495139122009277,
+ "learning_rate": 5.926588791867194e-09,
+ "loss": 0.3655,
+ "step": 12414
+ },
+ {
+ "epoch": 5.869976359338062,
+ "grad_norm": 3.377807140350342,
+ "learning_rate": 5.8837371206132975e-09,
+ "loss": 0.3818,
+ "step": 12415
+ },
+ {
+ "epoch": 5.870449172576832,
+ "grad_norm": 3.2344791889190674,
+ "learning_rate": 5.841040747042448e-09,
+ "loss": 0.3216,
+ "step": 12416
+ },
+ {
+ "epoch": 5.8709219858156025,
+ "grad_norm": 3.171966791152954,
+ "learning_rate": 5.798499673813629e-09,
+ "loss": 0.337,
+ "step": 12417
+ },
+ {
+ "epoch": 5.871394799054373,
+ "grad_norm": 3.327817440032959,
+ "learning_rate": 5.7561139035755555e-09,
+ "loss": 0.3246,
+ "step": 12418
+ },
+ {
+ "epoch": 5.871867612293144,
+ "grad_norm": 2.9902989864349365,
+ "learning_rate": 5.713883438967227e-09,
+ "loss": 0.2921,
+ "step": 12419
+ },
+ {
+ "epoch": 5.872340425531915,
+ "grad_norm": 3.0194108486175537,
+ "learning_rate": 5.671808282618485e-09,
+ "loss": 0.3122,
+ "step": 12420
+ },
+ {
+ "epoch": 5.872813238770686,
+ "grad_norm": 3.165776252746582,
+ "learning_rate": 5.629888437148623e-09,
+ "loss": 0.3049,
+ "step": 12421
+ },
+ {
+ "epoch": 5.8732860520094565,
+ "grad_norm": 3.222511053085327,
+ "learning_rate": 5.58812390516833e-09,
+ "loss": 0.3259,
+ "step": 12422
+ },
+ {
+ "epoch": 5.873758865248227,
+ "grad_norm": 3.37233567237854,
+ "learning_rate": 5.546514689277749e-09,
+ "loss": 0.3604,
+ "step": 12423
+ },
+ {
+ "epoch": 5.874231678486998,
+ "grad_norm": 3.3566384315490723,
+ "learning_rate": 5.50506079206814e-09,
+ "loss": 0.3313,
+ "step": 12424
+ },
+ {
+ "epoch": 5.874704491725768,
+ "grad_norm": 3.334174394607544,
+ "learning_rate": 5.463762216119939e-09,
+ "loss": 0.3089,
+ "step": 12425
+ },
+ {
+ "epoch": 5.875177304964539,
+ "grad_norm": 3.4556798934936523,
+ "learning_rate": 5.422618964005255e-09,
+ "loss": 0.3692,
+ "step": 12426
+ },
+ {
+ "epoch": 5.87565011820331,
+ "grad_norm": 3.3695571422576904,
+ "learning_rate": 5.3816310382859286e-09,
+ "loss": 0.3467,
+ "step": 12427
+ },
+ {
+ "epoch": 5.87612293144208,
+ "grad_norm": 3.069835662841797,
+ "learning_rate": 5.340798441513528e-09,
+ "loss": 0.2695,
+ "step": 12428
+ },
+ {
+ "epoch": 5.876595744680851,
+ "grad_norm": 3.415329694747925,
+ "learning_rate": 5.300121176231021e-09,
+ "loss": 0.3901,
+ "step": 12429
+ },
+ {
+ "epoch": 5.877068557919622,
+ "grad_norm": 3.3829095363616943,
+ "learning_rate": 5.2595992449711034e-09,
+ "loss": 0.308,
+ "step": 12430
+ },
+ {
+ "epoch": 5.877541371158393,
+ "grad_norm": 3.1284217834472656,
+ "learning_rate": 5.219232650256756e-09,
+ "loss": 0.2772,
+ "step": 12431
+ },
+ {
+ "epoch": 5.878014184397163,
+ "grad_norm": 3.5533947944641113,
+ "learning_rate": 5.179021394601525e-09,
+ "loss": 0.3212,
+ "step": 12432
+ },
+ {
+ "epoch": 5.8784869976359335,
+ "grad_norm": 3.0174765586853027,
+ "learning_rate": 5.1389654805089616e-09,
+ "loss": 0.3086,
+ "step": 12433
+ },
+ {
+ "epoch": 5.878959810874704,
+ "grad_norm": 2.8360655307769775,
+ "learning_rate": 5.099064910473461e-09,
+ "loss": 0.2882,
+ "step": 12434
+ },
+ {
+ "epoch": 5.879432624113475,
+ "grad_norm": 3.238159418106079,
+ "learning_rate": 5.0593196869797025e-09,
+ "loss": 0.3694,
+ "step": 12435
+ },
+ {
+ "epoch": 5.879905437352246,
+ "grad_norm": 3.080636739730835,
+ "learning_rate": 5.019729812501817e-09,
+ "loss": 0.3548,
+ "step": 12436
+ },
+ {
+ "epoch": 5.880378250591017,
+ "grad_norm": 3.242600202560425,
+ "learning_rate": 4.9802952895050546e-09,
+ "loss": 0.3184,
+ "step": 12437
+ },
+ {
+ "epoch": 5.8808510638297875,
+ "grad_norm": 3.2375903129577637,
+ "learning_rate": 4.94101612044523e-09,
+ "loss": 0.3342,
+ "step": 12438
+ },
+ {
+ "epoch": 5.881323877068558,
+ "grad_norm": 3.0890920162200928,
+ "learning_rate": 4.901892307767886e-09,
+ "loss": 0.3014,
+ "step": 12439
+ },
+ {
+ "epoch": 5.881796690307329,
+ "grad_norm": 3.177412271499634,
+ "learning_rate": 4.862923853908852e-09,
+ "loss": 0.3068,
+ "step": 12440
+ },
+ {
+ "epoch": 5.882269503546099,
+ "grad_norm": 3.2599844932556152,
+ "learning_rate": 4.824110761294798e-09,
+ "loss": 0.3313,
+ "step": 12441
+ },
+ {
+ "epoch": 5.88274231678487,
+ "grad_norm": 3.3482279777526855,
+ "learning_rate": 4.785453032342402e-09,
+ "loss": 0.2724,
+ "step": 12442
+ },
+ {
+ "epoch": 5.883215130023641,
+ "grad_norm": 3.0876619815826416,
+ "learning_rate": 4.74695066945835e-09,
+ "loss": 0.2917,
+ "step": 12443
+ },
+ {
+ "epoch": 5.883687943262411,
+ "grad_norm": 3.6275599002838135,
+ "learning_rate": 4.708603675040724e-09,
+ "loss": 0.3604,
+ "step": 12444
+ },
+ {
+ "epoch": 5.884160756501182,
+ "grad_norm": 3.1014959812164307,
+ "learning_rate": 4.670412051476503e-09,
+ "loss": 0.3094,
+ "step": 12445
+ },
+ {
+ "epoch": 5.884633569739953,
+ "grad_norm": 3.465449810028076,
+ "learning_rate": 4.6323758011443394e-09,
+ "loss": 0.3155,
+ "step": 12446
+ },
+ {
+ "epoch": 5.885106382978723,
+ "grad_norm": 3.210188150405884,
+ "learning_rate": 4.594494926412063e-09,
+ "loss": 0.3525,
+ "step": 12447
+ },
+ {
+ "epoch": 5.885579196217494,
+ "grad_norm": 3.4170849323272705,
+ "learning_rate": 4.556769429638619e-09,
+ "loss": 0.328,
+ "step": 12448
+ },
+ {
+ "epoch": 5.8860520094562645,
+ "grad_norm": 3.739753246307373,
+ "learning_rate": 4.519199313172962e-09,
+ "loss": 0.3263,
+ "step": 12449
+ },
+ {
+ "epoch": 5.886524822695035,
+ "grad_norm": 2.84963321685791,
+ "learning_rate": 4.481784579354331e-09,
+ "loss": 0.2709,
+ "step": 12450
+ },
+ {
+ "epoch": 5.886997635933806,
+ "grad_norm": 3.2073376178741455,
+ "learning_rate": 4.444525230512531e-09,
+ "loss": 0.3467,
+ "step": 12451
+ },
+ {
+ "epoch": 5.887470449172577,
+ "grad_norm": 3.142899751663208,
+ "learning_rate": 4.407421268967371e-09,
+ "loss": 0.2971,
+ "step": 12452
+ },
+ {
+ "epoch": 5.887943262411348,
+ "grad_norm": 3.3176493644714355,
+ "learning_rate": 4.370472697029504e-09,
+ "loss": 0.3345,
+ "step": 12453
+ },
+ {
+ "epoch": 5.8884160756501185,
+ "grad_norm": 3.3850505352020264,
+ "learning_rate": 4.3336795169990344e-09,
+ "loss": 0.3284,
+ "step": 12454
+ },
+ {
+ "epoch": 5.888888888888889,
+ "grad_norm": 3.3981900215148926,
+ "learning_rate": 4.297041731167184e-09,
+ "loss": 0.2875,
+ "step": 12455
+ },
+ {
+ "epoch": 5.889361702127659,
+ "grad_norm": 3.187206983566284,
+ "learning_rate": 4.2605593418154625e-09,
+ "loss": 0.337,
+ "step": 12456
+ },
+ {
+ "epoch": 5.88983451536643,
+ "grad_norm": 3.8661656379699707,
+ "learning_rate": 4.22423235121483e-09,
+ "loss": 0.3013,
+ "step": 12457
+ },
+ {
+ "epoch": 5.890307328605201,
+ "grad_norm": 3.324201822280884,
+ "learning_rate": 4.1880607616279214e-09,
+ "loss": 0.3417,
+ "step": 12458
+ },
+ {
+ "epoch": 5.890780141843972,
+ "grad_norm": 2.9349443912506104,
+ "learning_rate": 4.152044575306546e-09,
+ "loss": 0.2964,
+ "step": 12459
+ },
+ {
+ "epoch": 5.891252955082742,
+ "grad_norm": 3.042742967605591,
+ "learning_rate": 4.116183794493633e-09,
+ "loss": 0.3165,
+ "step": 12460
+ },
+ {
+ "epoch": 5.891725768321513,
+ "grad_norm": 3.0013630390167236,
+ "learning_rate": 4.080478421421563e-09,
+ "loss": 0.3267,
+ "step": 12461
+ },
+ {
+ "epoch": 5.892198581560284,
+ "grad_norm": 2.738849401473999,
+ "learning_rate": 4.044928458313835e-09,
+ "loss": 0.268,
+ "step": 12462
+ },
+ {
+ "epoch": 5.892671394799054,
+ "grad_norm": 3.13663649559021,
+ "learning_rate": 4.0095339073839554e-09,
+ "loss": 0.2789,
+ "step": 12463
+ },
+ {
+ "epoch": 5.893144208037825,
+ "grad_norm": 3.3638975620269775,
+ "learning_rate": 3.974294770835996e-09,
+ "loss": 0.3157,
+ "step": 12464
+ },
+ {
+ "epoch": 5.8936170212765955,
+ "grad_norm": 2.984937906265259,
+ "learning_rate": 3.939211050863756e-09,
+ "loss": 0.2691,
+ "step": 12465
+ },
+ {
+ "epoch": 5.894089834515366,
+ "grad_norm": 3.2892327308654785,
+ "learning_rate": 3.9042827496518775e-09,
+ "loss": 0.3298,
+ "step": 12466
+ },
+ {
+ "epoch": 5.894562647754137,
+ "grad_norm": 3.8666255474090576,
+ "learning_rate": 3.869509869375565e-09,
+ "loss": 0.3105,
+ "step": 12467
+ },
+ {
+ "epoch": 5.895035460992908,
+ "grad_norm": 3.3934621810913086,
+ "learning_rate": 3.834892412199476e-09,
+ "loss": 0.3226,
+ "step": 12468
+ },
+ {
+ "epoch": 5.895508274231679,
+ "grad_norm": 3.4978415966033936,
+ "learning_rate": 3.8004303802793855e-09,
+ "loss": 0.2955,
+ "step": 12469
+ },
+ {
+ "epoch": 5.8959810874704495,
+ "grad_norm": 3.097461462020874,
+ "learning_rate": 3.766123775760799e-09,
+ "loss": 0.3478,
+ "step": 12470
+ },
+ {
+ "epoch": 5.89645390070922,
+ "grad_norm": 2.8790197372436523,
+ "learning_rate": 3.731972600780065e-09,
+ "loss": 0.262,
+ "step": 12471
+ },
+ {
+ "epoch": 5.89692671394799,
+ "grad_norm": 3.2017362117767334,
+ "learning_rate": 3.6979768574638144e-09,
+ "loss": 0.2947,
+ "step": 12472
+ },
+ {
+ "epoch": 5.897399527186761,
+ "grad_norm": 3.7744874954223633,
+ "learning_rate": 3.6641365479284096e-09,
+ "loss": 0.4205,
+ "step": 12473
+ },
+ {
+ "epoch": 5.897872340425532,
+ "grad_norm": 4.276632308959961,
+ "learning_rate": 3.630451674281055e-09,
+ "loss": 0.3479,
+ "step": 12474
+ },
+ {
+ "epoch": 5.898345153664303,
+ "grad_norm": 3.855257511138916,
+ "learning_rate": 3.596922238619516e-09,
+ "loss": 0.3854,
+ "step": 12475
+ },
+ {
+ "epoch": 5.898817966903073,
+ "grad_norm": 3.5090348720550537,
+ "learning_rate": 3.563548243031012e-09,
+ "loss": 0.3254,
+ "step": 12476
+ },
+ {
+ "epoch": 5.899290780141844,
+ "grad_norm": 3.134324550628662,
+ "learning_rate": 3.530329689593881e-09,
+ "loss": 0.3049,
+ "step": 12477
+ },
+ {
+ "epoch": 5.899763593380615,
+ "grad_norm": 3.6900384426116943,
+ "learning_rate": 3.497266580376191e-09,
+ "loss": 0.3574,
+ "step": 12478
+ },
+ {
+ "epoch": 5.900236406619385,
+ "grad_norm": 3.2664496898651123,
+ "learning_rate": 3.4643589174371272e-09,
+ "loss": 0.3132,
+ "step": 12479
+ },
+ {
+ "epoch": 5.900709219858156,
+ "grad_norm": 3.4255833625793457,
+ "learning_rate": 3.4316067028253298e-09,
+ "loss": 0.389,
+ "step": 12480
+ },
+ {
+ "epoch": 5.9011820330969265,
+ "grad_norm": 3.3752331733703613,
+ "learning_rate": 3.399009938580555e-09,
+ "loss": 0.3262,
+ "step": 12481
+ },
+ {
+ "epoch": 5.901654846335697,
+ "grad_norm": 3.054422378540039,
+ "learning_rate": 3.3665686267317364e-09,
+ "loss": 0.3246,
+ "step": 12482
+ },
+ {
+ "epoch": 5.902127659574468,
+ "grad_norm": 3.2461447715759277,
+ "learning_rate": 3.3342827692994793e-09,
+ "loss": 0.3379,
+ "step": 12483
+ },
+ {
+ "epoch": 5.902600472813239,
+ "grad_norm": 3.4133243560791016,
+ "learning_rate": 3.30215236829412e-09,
+ "loss": 0.3372,
+ "step": 12484
+ },
+ {
+ "epoch": 5.90307328605201,
+ "grad_norm": 3.128157377243042,
+ "learning_rate": 3.270177425715726e-09,
+ "loss": 0.2789,
+ "step": 12485
+ },
+ {
+ "epoch": 5.9035460992907804,
+ "grad_norm": 3.051811933517456,
+ "learning_rate": 3.238357943555481e-09,
+ "loss": 0.304,
+ "step": 12486
+ },
+ {
+ "epoch": 5.904018912529551,
+ "grad_norm": 3.2219812870025635,
+ "learning_rate": 3.2066939237951343e-09,
+ "loss": 0.3286,
+ "step": 12487
+ },
+ {
+ "epoch": 5.904491725768321,
+ "grad_norm": 3.207859754562378,
+ "learning_rate": 3.1751853684053314e-09,
+ "loss": 0.3245,
+ "step": 12488
+ },
+ {
+ "epoch": 5.904964539007092,
+ "grad_norm": 3.4851181507110596,
+ "learning_rate": 3.143832279348946e-09,
+ "loss": 0.3181,
+ "step": 12489
+ },
+ {
+ "epoch": 5.905437352245863,
+ "grad_norm": 4.016768455505371,
+ "learning_rate": 3.1126346585774734e-09,
+ "loss": 0.3812,
+ "step": 12490
+ },
+ {
+ "epoch": 5.9059101654846335,
+ "grad_norm": 2.883434772491455,
+ "learning_rate": 3.081592508033804e-09,
+ "loss": 0.2807,
+ "step": 12491
+ },
+ {
+ "epoch": 5.906382978723404,
+ "grad_norm": 3.171082019805908,
+ "learning_rate": 3.0507058296505575e-09,
+ "loss": 0.2997,
+ "step": 12492
+ },
+ {
+ "epoch": 5.906855791962175,
+ "grad_norm": 3.3019914627075195,
+ "learning_rate": 3.019974625351196e-09,
+ "loss": 0.3439,
+ "step": 12493
+ },
+ {
+ "epoch": 5.907328605200946,
+ "grad_norm": 3.309513568878174,
+ "learning_rate": 2.9893988970491896e-09,
+ "loss": 0.3314,
+ "step": 12494
+ },
+ {
+ "epoch": 5.907801418439716,
+ "grad_norm": 3.239665985107422,
+ "learning_rate": 2.958978646648292e-09,
+ "loss": 0.3184,
+ "step": 12495
+ },
+ {
+ "epoch": 5.908274231678487,
+ "grad_norm": 2.879519462585449,
+ "learning_rate": 2.928713876042266e-09,
+ "loss": 0.3068,
+ "step": 12496
+ },
+ {
+ "epoch": 5.908747044917257,
+ "grad_norm": 3.4216079711914062,
+ "learning_rate": 2.8986045871162717e-09,
+ "loss": 0.3702,
+ "step": 12497
+ },
+ {
+ "epoch": 5.909219858156028,
+ "grad_norm": 3.076555013656616,
+ "learning_rate": 2.8686507817443644e-09,
+ "loss": 0.2686,
+ "step": 12498
+ },
+ {
+ "epoch": 5.909692671394799,
+ "grad_norm": 3.1961801052093506,
+ "learning_rate": 2.8388524617922743e-09,
+ "loss": 0.3345,
+ "step": 12499
+ },
+ {
+ "epoch": 5.91016548463357,
+ "grad_norm": 3.0233850479125977,
+ "learning_rate": 2.8092096291149063e-09,
+ "loss": 0.2938,
+ "step": 12500
+ },
+ {
+ "epoch": 5.910638297872341,
+ "grad_norm": 3.181387186050415,
+ "learning_rate": 2.7797222855582838e-09,
+ "loss": 0.3442,
+ "step": 12501
+ },
+ {
+ "epoch": 5.911111111111111,
+ "grad_norm": 2.9414830207824707,
+ "learning_rate": 2.7503904329584385e-09,
+ "loss": 0.3352,
+ "step": 12502
+ },
+ {
+ "epoch": 5.911583924349882,
+ "grad_norm": 2.9812917709350586,
+ "learning_rate": 2.7212140731414095e-09,
+ "loss": 0.3527,
+ "step": 12503
+ },
+ {
+ "epoch": 5.912056737588652,
+ "grad_norm": 3.1165359020233154,
+ "learning_rate": 2.692193207924354e-09,
+ "loss": 0.3205,
+ "step": 12504
+ },
+ {
+ "epoch": 5.912529550827423,
+ "grad_norm": 3.0312206745147705,
+ "learning_rate": 2.6633278391141603e-09,
+ "loss": 0.3348,
+ "step": 12505
+ },
+ {
+ "epoch": 5.913002364066194,
+ "grad_norm": 3.4665310382843018,
+ "learning_rate": 2.634617968508002e-09,
+ "loss": 0.3353,
+ "step": 12506
+ },
+ {
+ "epoch": 5.9134751773049645,
+ "grad_norm": 3.696239948272705,
+ "learning_rate": 2.606063597893338e-09,
+ "loss": 0.3089,
+ "step": 12507
+ },
+ {
+ "epoch": 5.913947990543735,
+ "grad_norm": 4.07038688659668,
+ "learning_rate": 2.5776647290487453e-09,
+ "loss": 0.3589,
+ "step": 12508
+ },
+ {
+ "epoch": 5.914420803782506,
+ "grad_norm": 3.50469970703125,
+ "learning_rate": 2.5494213637416997e-09,
+ "loss": 0.3651,
+ "step": 12509
+ },
+ {
+ "epoch": 5.914893617021277,
+ "grad_norm": 3.255824565887451,
+ "learning_rate": 2.521333503731349e-09,
+ "loss": 0.332,
+ "step": 12510
+ },
+ {
+ "epoch": 5.915366430260047,
+ "grad_norm": 3.1768665313720703,
+ "learning_rate": 2.4934011507665723e-09,
+ "loss": 0.3498,
+ "step": 12511
+ },
+ {
+ "epoch": 5.915839243498818,
+ "grad_norm": 3.471471071243286,
+ "learning_rate": 2.465624306586256e-09,
+ "loss": 0.3494,
+ "step": 12512
+ },
+ {
+ "epoch": 5.916312056737588,
+ "grad_norm": 2.9033734798431396,
+ "learning_rate": 2.4380029729204057e-09,
+ "loss": 0.2507,
+ "step": 12513
+ },
+ {
+ "epoch": 5.916784869976359,
+ "grad_norm": 3.356685161590576,
+ "learning_rate": 2.410537151488479e-09,
+ "loss": 0.2937,
+ "step": 12514
+ },
+ {
+ "epoch": 5.91725768321513,
+ "grad_norm": 2.9482526779174805,
+ "learning_rate": 2.3832268440007743e-09,
+ "loss": 0.3046,
+ "step": 12515
+ },
+ {
+ "epoch": 5.917730496453901,
+ "grad_norm": 3.0559420585632324,
+ "learning_rate": 2.3560720521578763e-09,
+ "loss": 0.3166,
+ "step": 12516
+ },
+ {
+ "epoch": 5.918203309692672,
+ "grad_norm": 3.1434214115142822,
+ "learning_rate": 2.329072777650376e-09,
+ "loss": 0.3308,
+ "step": 12517
+ },
+ {
+ "epoch": 5.918676122931442,
+ "grad_norm": 3.422903299331665,
+ "learning_rate": 2.3022290221597078e-09,
+ "loss": 0.3738,
+ "step": 12518
+ },
+ {
+ "epoch": 5.919148936170213,
+ "grad_norm": 2.9760549068450928,
+ "learning_rate": 2.2755407873570332e-09,
+ "loss": 0.3015,
+ "step": 12519
+ },
+ {
+ "epoch": 5.919621749408983,
+ "grad_norm": 3.3623647689819336,
+ "learning_rate": 2.2490080749043573e-09,
+ "loss": 0.2757,
+ "step": 12520
+ },
+ {
+ "epoch": 5.920094562647754,
+ "grad_norm": 2.9579522609710693,
+ "learning_rate": 2.2226308864536917e-09,
+ "loss": 0.2836,
+ "step": 12521
+ },
+ {
+ "epoch": 5.920567375886525,
+ "grad_norm": 4.006960868835449,
+ "learning_rate": 2.1964092236473335e-09,
+ "loss": 0.3794,
+ "step": 12522
+ },
+ {
+ "epoch": 5.9210401891252955,
+ "grad_norm": 3.3528733253479004,
+ "learning_rate": 2.170343088118143e-09,
+ "loss": 0.2683,
+ "step": 12523
+ },
+ {
+ "epoch": 5.921513002364066,
+ "grad_norm": 3.1904187202453613,
+ "learning_rate": 2.1444324814887118e-09,
+ "loss": 0.305,
+ "step": 12524
+ },
+ {
+ "epoch": 5.921985815602837,
+ "grad_norm": 3.4845130443573,
+ "learning_rate": 2.1186774053730265e-09,
+ "loss": 0.3027,
+ "step": 12525
+ },
+ {
+ "epoch": 5.922458628841608,
+ "grad_norm": 2.857910394668579,
+ "learning_rate": 2.0930778613742485e-09,
+ "loss": 0.2847,
+ "step": 12526
+ },
+ {
+ "epoch": 5.922931442080378,
+ "grad_norm": 2.882591485977173,
+ "learning_rate": 2.067633851086659e-09,
+ "loss": 0.3152,
+ "step": 12527
+ },
+ {
+ "epoch": 5.923404255319149,
+ "grad_norm": 3.1664819717407227,
+ "learning_rate": 2.042345376094268e-09,
+ "loss": 0.3165,
+ "step": 12528
+ },
+ {
+ "epoch": 5.923877068557919,
+ "grad_norm": 3.645685911178589,
+ "learning_rate": 2.0172124379716498e-09,
+ "loss": 0.3531,
+ "step": 12529
+ },
+ {
+ "epoch": 5.92434988179669,
+ "grad_norm": 3.2750864028930664,
+ "learning_rate": 1.992235038284218e-09,
+ "loss": 0.3481,
+ "step": 12530
+ },
+ {
+ "epoch": 5.924822695035461,
+ "grad_norm": 3.969918727874756,
+ "learning_rate": 1.967413178586841e-09,
+ "loss": 0.3035,
+ "step": 12531
+ },
+ {
+ "epoch": 5.925295508274232,
+ "grad_norm": 3.5722131729125977,
+ "learning_rate": 1.942746860424949e-09,
+ "loss": 0.3565,
+ "step": 12532
+ },
+ {
+ "epoch": 5.925768321513003,
+ "grad_norm": 3.3106608390808105,
+ "learning_rate": 1.918236085334535e-09,
+ "loss": 0.3029,
+ "step": 12533
+ },
+ {
+ "epoch": 5.926241134751773,
+ "grad_norm": 2.830862522125244,
+ "learning_rate": 1.8938808548418785e-09,
+ "loss": 0.3048,
+ "step": 12534
+ },
+ {
+ "epoch": 5.926713947990544,
+ "grad_norm": 3.785337209701538,
+ "learning_rate": 1.8696811704635443e-09,
+ "loss": 0.3596,
+ "step": 12535
+ },
+ {
+ "epoch": 5.927186761229314,
+ "grad_norm": 2.95487904548645,
+ "learning_rate": 1.845637033705827e-09,
+ "loss": 0.3138,
+ "step": 12536
+ },
+ {
+ "epoch": 5.927659574468085,
+ "grad_norm": 3.2796683311462402,
+ "learning_rate": 1.821748446066418e-09,
+ "loss": 0.2851,
+ "step": 12537
+ },
+ {
+ "epoch": 5.928132387706856,
+ "grad_norm": 3.2564337253570557,
+ "learning_rate": 1.7980154090327384e-09,
+ "loss": 0.3185,
+ "step": 12538
+ },
+ {
+ "epoch": 5.9286052009456265,
+ "grad_norm": 3.140939950942993,
+ "learning_rate": 1.7744379240822173e-09,
+ "loss": 0.3562,
+ "step": 12539
+ },
+ {
+ "epoch": 5.929078014184397,
+ "grad_norm": 3.1424221992492676,
+ "learning_rate": 1.7510159926828473e-09,
+ "loss": 0.3174,
+ "step": 12540
+ },
+ {
+ "epoch": 5.929550827423168,
+ "grad_norm": 3.8264408111572266,
+ "learning_rate": 1.7277496162934615e-09,
+ "loss": 0.3059,
+ "step": 12541
+ },
+ {
+ "epoch": 5.930023640661939,
+ "grad_norm": 3.426374673843384,
+ "learning_rate": 1.7046387963626232e-09,
+ "loss": 0.335,
+ "step": 12542
+ },
+ {
+ "epoch": 5.930496453900709,
+ "grad_norm": 3.537398099899292,
+ "learning_rate": 1.6816835343289039e-09,
+ "loss": 0.3797,
+ "step": 12543
+ },
+ {
+ "epoch": 5.93096926713948,
+ "grad_norm": 3.5586562156677246,
+ "learning_rate": 1.6588838316219935e-09,
+ "loss": 0.4037,
+ "step": 12544
+ },
+ {
+ "epoch": 5.93144208037825,
+ "grad_norm": 3.411766290664673,
+ "learning_rate": 1.6362396896618672e-09,
+ "loss": 0.3927,
+ "step": 12545
+ },
+ {
+ "epoch": 5.931914893617021,
+ "grad_norm": 3.1392691135406494,
+ "learning_rate": 1.6137511098576752e-09,
+ "loss": 0.2881,
+ "step": 12546
+ },
+ {
+ "epoch": 5.932387706855792,
+ "grad_norm": 3.751774787902832,
+ "learning_rate": 1.5914180936102418e-09,
+ "loss": 0.3058,
+ "step": 12547
+ },
+ {
+ "epoch": 5.932860520094563,
+ "grad_norm": 3.5063114166259766,
+ "learning_rate": 1.5692406423101215e-09,
+ "loss": 0.2882,
+ "step": 12548
+ },
+ {
+ "epoch": 5.933333333333334,
+ "grad_norm": 3.547232151031494,
+ "learning_rate": 1.5472187573378762e-09,
+ "loss": 0.2838,
+ "step": 12549
+ },
+ {
+ "epoch": 5.933806146572104,
+ "grad_norm": 3.0414838790893555,
+ "learning_rate": 1.5253524400651864e-09,
+ "loss": 0.2985,
+ "step": 12550
+ },
+ {
+ "epoch": 5.934278959810875,
+ "grad_norm": 3.165259599685669,
+ "learning_rate": 1.503641691853186e-09,
+ "loss": 0.304,
+ "step": 12551
+ },
+ {
+ "epoch": 5.934751773049645,
+ "grad_norm": 2.8662290573120117,
+ "learning_rate": 1.4820865140538488e-09,
+ "loss": 0.2988,
+ "step": 12552
+ },
+ {
+ "epoch": 5.935224586288416,
+ "grad_norm": 3.483565330505371,
+ "learning_rate": 1.4606869080091567e-09,
+ "loss": 0.3328,
+ "step": 12553
+ },
+ {
+ "epoch": 5.935697399527187,
+ "grad_norm": 3.4734973907470703,
+ "learning_rate": 1.439442875051933e-09,
+ "loss": 0.3263,
+ "step": 12554
+ },
+ {
+ "epoch": 5.9361702127659575,
+ "grad_norm": 3.7281854152679443,
+ "learning_rate": 1.4183544165047303e-09,
+ "loss": 0.3849,
+ "step": 12555
+ },
+ {
+ "epoch": 5.936643026004728,
+ "grad_norm": 3.1552608013153076,
+ "learning_rate": 1.3974215336806652e-09,
+ "loss": 0.3537,
+ "step": 12556
+ },
+ {
+ "epoch": 5.937115839243499,
+ "grad_norm": 3.2095046043395996,
+ "learning_rate": 1.3766442278831393e-09,
+ "loss": 0.3154,
+ "step": 12557
+ },
+ {
+ "epoch": 5.93758865248227,
+ "grad_norm": 3.242431402206421,
+ "learning_rate": 1.3560225004055627e-09,
+ "loss": 0.2952,
+ "step": 12558
+ },
+ {
+ "epoch": 5.93806146572104,
+ "grad_norm": 3.4177846908569336,
+ "learning_rate": 1.3355563525324632e-09,
+ "loss": 0.373,
+ "step": 12559
+ },
+ {
+ "epoch": 5.938534278959811,
+ "grad_norm": 3.2495975494384766,
+ "learning_rate": 1.3152457855380995e-09,
+ "loss": 0.3472,
+ "step": 12560
+ },
+ {
+ "epoch": 5.939007092198581,
+ "grad_norm": 3.5553295612335205,
+ "learning_rate": 1.295090800686738e-09,
+ "loss": 0.302,
+ "step": 12561
+ },
+ {
+ "epoch": 5.939479905437352,
+ "grad_norm": 3.769275188446045,
+ "learning_rate": 1.2750913992337632e-09,
+ "loss": 0.3566,
+ "step": 12562
+ },
+ {
+ "epoch": 5.939952718676123,
+ "grad_norm": 3.7617647647857666,
+ "learning_rate": 1.2552475824240128e-09,
+ "loss": 0.3565,
+ "step": 12563
+ },
+ {
+ "epoch": 5.940425531914894,
+ "grad_norm": 3.5219485759735107,
+ "learning_rate": 1.2355593514934428e-09,
+ "loss": 0.3704,
+ "step": 12564
+ },
+ {
+ "epoch": 5.9408983451536646,
+ "grad_norm": 3.0995635986328125,
+ "learning_rate": 1.2160267076680166e-09,
+ "loss": 0.3038,
+ "step": 12565
+ },
+ {
+ "epoch": 5.941371158392435,
+ "grad_norm": 3.5231125354766846,
+ "learning_rate": 1.196649652163706e-09,
+ "loss": 0.3078,
+ "step": 12566
+ },
+ {
+ "epoch": 5.941843971631206,
+ "grad_norm": 3.571253776550293,
+ "learning_rate": 1.1774281861867687e-09,
+ "loss": 0.3179,
+ "step": 12567
+ },
+ {
+ "epoch": 5.942316784869976,
+ "grad_norm": 3.3609511852264404,
+ "learning_rate": 1.1583623109348575e-09,
+ "loss": 0.3057,
+ "step": 12568
+ },
+ {
+ "epoch": 5.942789598108747,
+ "grad_norm": 3.4012279510498047,
+ "learning_rate": 1.1394520275942455e-09,
+ "loss": 0.328,
+ "step": 12569
+ },
+ {
+ "epoch": 5.943262411347518,
+ "grad_norm": 3.3525643348693848,
+ "learning_rate": 1.1206973373428798e-09,
+ "loss": 0.325,
+ "step": 12570
+ },
+ {
+ "epoch": 5.9437352245862884,
+ "grad_norm": 3.5196216106414795,
+ "learning_rate": 1.1020982413487146e-09,
+ "loss": 0.3484,
+ "step": 12571
+ },
+ {
+ "epoch": 5.944208037825059,
+ "grad_norm": 3.3853707313537598,
+ "learning_rate": 1.0836547407691577e-09,
+ "loss": 0.3082,
+ "step": 12572
+ },
+ {
+ "epoch": 5.94468085106383,
+ "grad_norm": 3.011427640914917,
+ "learning_rate": 1.0653668367532898e-09,
+ "loss": 0.3214,
+ "step": 12573
+ },
+ {
+ "epoch": 5.945153664302601,
+ "grad_norm": 3.4044291973114014,
+ "learning_rate": 1.0472345304393671e-09,
+ "loss": 0.2829,
+ "step": 12574
+ },
+ {
+ "epoch": 5.945626477541371,
+ "grad_norm": 3.5128118991851807,
+ "learning_rate": 1.0292578229564864e-09,
+ "loss": 0.3547,
+ "step": 12575
+ },
+ {
+ "epoch": 5.9460992907801415,
+ "grad_norm": 3.8453915119171143,
+ "learning_rate": 1.0114367154243076e-09,
+ "loss": 0.3614,
+ "step": 12576
+ },
+ {
+ "epoch": 5.946572104018912,
+ "grad_norm": 3.520470380783081,
+ "learning_rate": 9.93771208952221e-10,
+ "loss": 0.3365,
+ "step": 12577
+ },
+ {
+ "epoch": 5.947044917257683,
+ "grad_norm": 2.913461208343506,
+ "learning_rate": 9.762613046399029e-10,
+ "loss": 0.3205,
+ "step": 12578
+ },
+ {
+ "epoch": 5.947517730496454,
+ "grad_norm": 3.097443103790283,
+ "learning_rate": 9.589070035781468e-10,
+ "loss": 0.3293,
+ "step": 12579
+ },
+ {
+ "epoch": 5.947990543735225,
+ "grad_norm": 3.0502073764801025,
+ "learning_rate": 9.417083068472e-10,
+ "loss": 0.2959,
+ "step": 12580
+ },
+ {
+ "epoch": 5.9484633569739955,
+ "grad_norm": 3.49462628364563,
+ "learning_rate": 9.246652155181501e-10,
+ "loss": 0.3158,
+ "step": 12581
+ },
+ {
+ "epoch": 5.948936170212766,
+ "grad_norm": 3.5302231311798096,
+ "learning_rate": 9.077777306518154e-10,
+ "loss": 0.3397,
+ "step": 12582
+ },
+ {
+ "epoch": 5.949408983451537,
+ "grad_norm": 3.1912498474121094,
+ "learning_rate": 8.910458533004096e-10,
+ "loss": 0.3169,
+ "step": 12583
+ },
+ {
+ "epoch": 5.949881796690307,
+ "grad_norm": 3.133894205093384,
+ "learning_rate": 8.744695845050444e-10,
+ "loss": 0.3104,
+ "step": 12584
+ },
+ {
+ "epoch": 5.950354609929078,
+ "grad_norm": 3.1867423057556152,
+ "learning_rate": 8.580489252979495e-10,
+ "loss": 0.2583,
+ "step": 12585
+ },
+ {
+ "epoch": 5.950827423167849,
+ "grad_norm": 3.6315085887908936,
+ "learning_rate": 8.417838767019182e-10,
+ "loss": 0.3268,
+ "step": 12586
+ },
+ {
+ "epoch": 5.951300236406619,
+ "grad_norm": 3.5569820404052734,
+ "learning_rate": 8.256744397294736e-10,
+ "loss": 0.3381,
+ "step": 12587
+ },
+ {
+ "epoch": 5.95177304964539,
+ "grad_norm": 3.262516736984253,
+ "learning_rate": 8.097206153839798e-10,
+ "loss": 0.3579,
+ "step": 12588
+ },
+ {
+ "epoch": 5.952245862884161,
+ "grad_norm": 3.626657485961914,
+ "learning_rate": 7.939224046582539e-10,
+ "loss": 0.3547,
+ "step": 12589
+ },
+ {
+ "epoch": 5.952718676122932,
+ "grad_norm": 3.1696643829345703,
+ "learning_rate": 7.782798085365084e-10,
+ "loss": 0.3459,
+ "step": 12590
+ },
+ {
+ "epoch": 5.953191489361702,
+ "grad_norm": 3.387317419052124,
+ "learning_rate": 7.627928279924091e-10,
+ "loss": 0.3752,
+ "step": 12591
+ },
+ {
+ "epoch": 5.9536643026004725,
+ "grad_norm": 3.160543441772461,
+ "learning_rate": 7.474614639904621e-10,
+ "loss": 0.3343,
+ "step": 12592
+ },
+ {
+ "epoch": 5.954137115839243,
+ "grad_norm": 3.4762585163116455,
+ "learning_rate": 7.32285717484904e-10,
+ "loss": 0.3407,
+ "step": 12593
+ },
+ {
+ "epoch": 5.954609929078014,
+ "grad_norm": 3.5852251052856445,
+ "learning_rate": 7.172655894213676e-10,
+ "loss": 0.2727,
+ "step": 12594
+ },
+ {
+ "epoch": 5.955082742316785,
+ "grad_norm": 3.3408043384552,
+ "learning_rate": 7.024010807343829e-10,
+ "loss": 0.3203,
+ "step": 12595
+ },
+ {
+ "epoch": 5.955555555555556,
+ "grad_norm": 4.2062087059021,
+ "learning_rate": 6.876921923498758e-10,
+ "loss": 0.3682,
+ "step": 12596
+ },
+ {
+ "epoch": 5.9560283687943265,
+ "grad_norm": 4.376652717590332,
+ "learning_rate": 6.73138925183503e-10,
+ "loss": 0.3384,
+ "step": 12597
+ },
+ {
+ "epoch": 5.956501182033097,
+ "grad_norm": 3.4473304748535156,
+ "learning_rate": 6.587412801417614e-10,
+ "loss": 0.2883,
+ "step": 12598
+ },
+ {
+ "epoch": 5.956973995271868,
+ "grad_norm": 3.453704357147217,
+ "learning_rate": 6.444992581208786e-10,
+ "loss": 0.3558,
+ "step": 12599
+ },
+ {
+ "epoch": 5.957446808510638,
+ "grad_norm": 3.6638705730438232,
+ "learning_rate": 6.304128600076453e-10,
+ "loss": 0.3481,
+ "step": 12600
+ },
+ {
+ "epoch": 5.957919621749409,
+ "grad_norm": 3.319033145904541,
+ "learning_rate": 6.164820866791377e-10,
+ "loss": 0.3474,
+ "step": 12601
+ },
+ {
+ "epoch": 5.95839243498818,
+ "grad_norm": 3.0885703563690186,
+ "learning_rate": 6.02706939002995e-10,
+ "loss": 0.3023,
+ "step": 12602
+ },
+ {
+ "epoch": 5.95886524822695,
+ "grad_norm": 3.0238006114959717,
+ "learning_rate": 5.890874178365868e-10,
+ "loss": 0.3123,
+ "step": 12603
+ },
+ {
+ "epoch": 5.959338061465721,
+ "grad_norm": 3.1967854499816895,
+ "learning_rate": 5.756235240281238e-10,
+ "loss": 0.2907,
+ "step": 12604
+ },
+ {
+ "epoch": 5.959810874704492,
+ "grad_norm": 3.2789015769958496,
+ "learning_rate": 5.623152584161018e-10,
+ "loss": 0.3304,
+ "step": 12605
+ },
+ {
+ "epoch": 5.960283687943263,
+ "grad_norm": 2.9390599727630615,
+ "learning_rate": 5.491626218290247e-10,
+ "loss": 0.2878,
+ "step": 12606
+ },
+ {
+ "epoch": 5.960756501182033,
+ "grad_norm": 3.0502469539642334,
+ "learning_rate": 5.36165615085682e-10,
+ "loss": 0.2774,
+ "step": 12607
+ },
+ {
+ "epoch": 5.9612293144208035,
+ "grad_norm": 3.309628963470459,
+ "learning_rate": 5.233242389954262e-10,
+ "loss": 0.3192,
+ "step": 12608
+ },
+ {
+ "epoch": 5.961702127659574,
+ "grad_norm": 3.4259536266326904,
+ "learning_rate": 5.106384943578957e-10,
+ "loss": 0.303,
+ "step": 12609
+ },
+ {
+ "epoch": 5.962174940898345,
+ "grad_norm": 3.8046417236328125,
+ "learning_rate": 4.981083819630139e-10,
+ "loss": 0.3016,
+ "step": 12610
+ },
+ {
+ "epoch": 5.962647754137116,
+ "grad_norm": 3.7150895595550537,
+ "learning_rate": 4.857339025909902e-10,
+ "loss": 0.3508,
+ "step": 12611
+ },
+ {
+ "epoch": 5.963120567375887,
+ "grad_norm": 3.202986717224121,
+ "learning_rate": 4.735150570123192e-10,
+ "loss": 0.2981,
+ "step": 12612
+ },
+ {
+ "epoch": 5.9635933806146575,
+ "grad_norm": 3.3235676288604736,
+ "learning_rate": 4.614518459877815e-10,
+ "loss": 0.3602,
+ "step": 12613
+ },
+ {
+ "epoch": 5.964066193853428,
+ "grad_norm": 3.443566083908081,
+ "learning_rate": 4.4954427026844273e-10,
+ "loss": 0.3364,
+ "step": 12614
+ },
+ {
+ "epoch": 5.964539007092198,
+ "grad_norm": 3.0922253131866455,
+ "learning_rate": 4.377923305956544e-10,
+ "loss": 0.3483,
+ "step": 12615
+ },
+ {
+ "epoch": 5.965011820330969,
+ "grad_norm": 3.164511203765869,
+ "learning_rate": 4.261960277013311e-10,
+ "loss": 0.3303,
+ "step": 12616
+ },
+ {
+ "epoch": 5.96548463356974,
+ "grad_norm": 3.2433879375457764,
+ "learning_rate": 4.1475536230767275e-10,
+ "loss": 0.3382,
+ "step": 12617
+ },
+ {
+ "epoch": 5.965957446808511,
+ "grad_norm": 3.1344754695892334,
+ "learning_rate": 4.0347033512661007e-10,
+ "loss": 0.3195,
+ "step": 12618
+ },
+ {
+ "epoch": 5.966430260047281,
+ "grad_norm": 3.2667789459228516,
+ "learning_rate": 3.923409468611916e-10,
+ "loss": 0.3353,
+ "step": 12619
+ },
+ {
+ "epoch": 5.966903073286052,
+ "grad_norm": 3.0841329097747803,
+ "learning_rate": 3.8136719820419666e-10,
+ "loss": 0.3321,
+ "step": 12620
+ },
+ {
+ "epoch": 5.967375886524823,
+ "grad_norm": 3.5874977111816406,
+ "learning_rate": 3.7054908983896743e-10,
+ "loss": 0.339,
+ "step": 12621
+ },
+ {
+ "epoch": 5.967848699763593,
+ "grad_norm": 3.3214657306671143,
+ "learning_rate": 3.5988662243913174e-10,
+ "loss": 0.2722,
+ "step": 12622
+ },
+ {
+ "epoch": 5.968321513002364,
+ "grad_norm": 3.529088258743286,
+ "learning_rate": 3.49379796668603e-10,
+ "loss": 0.3119,
+ "step": 12623
+ },
+ {
+ "epoch": 5.9687943262411345,
+ "grad_norm": 3.0304949283599854,
+ "learning_rate": 3.390286131815801e-10,
+ "loss": 0.2982,
+ "step": 12624
+ },
+ {
+ "epoch": 5.969267139479905,
+ "grad_norm": 3.099729061126709,
+ "learning_rate": 3.288330726225475e-10,
+ "loss": 0.3886,
+ "step": 12625
+ },
+ {
+ "epoch": 5.969739952718676,
+ "grad_norm": 4.111376762390137,
+ "learning_rate": 3.187931756262752e-10,
+ "loss": 0.3365,
+ "step": 12626
+ },
+ {
+ "epoch": 5.970212765957447,
+ "grad_norm": 3.641390562057495,
+ "learning_rate": 3.089089228178188e-10,
+ "loss": 0.3272,
+ "step": 12627
+ },
+ {
+ "epoch": 5.970685579196218,
+ "grad_norm": 3.644512891769409,
+ "learning_rate": 2.991803148130745e-10,
+ "loss": 0.3679,
+ "step": 12628
+ },
+ {
+ "epoch": 5.9711583924349885,
+ "grad_norm": 3.3023669719696045,
+ "learning_rate": 2.8960735221739146e-10,
+ "loss": 0.3153,
+ "step": 12629
+ },
+ {
+ "epoch": 5.971631205673759,
+ "grad_norm": 3.5878617763519287,
+ "learning_rate": 2.8019003562695937e-10,
+ "loss": 0.3384,
+ "step": 12630
+ },
+ {
+ "epoch": 5.972104018912529,
+ "grad_norm": 3.2044191360473633,
+ "learning_rate": 2.709283656282535e-10,
+ "loss": 0.3238,
+ "step": 12631
+ },
+ {
+ "epoch": 5.9725768321513,
+ "grad_norm": 3.1008009910583496,
+ "learning_rate": 2.6182234279775733e-10,
+ "loss": 0.3038,
+ "step": 12632
+ },
+ {
+ "epoch": 5.973049645390071,
+ "grad_norm": 3.578662872314453,
+ "learning_rate": 2.528719677025171e-10,
+ "loss": 0.3529,
+ "step": 12633
+ },
+ {
+ "epoch": 5.973522458628842,
+ "grad_norm": 3.211794137954712,
+ "learning_rate": 2.4407724089986483e-10,
+ "loss": 0.3326,
+ "step": 12634
+ },
+ {
+ "epoch": 5.973995271867612,
+ "grad_norm": 3.3977439403533936,
+ "learning_rate": 2.3543816293741807e-10,
+ "loss": 0.3552,
+ "step": 12635
+ },
+ {
+ "epoch": 5.974468085106383,
+ "grad_norm": 3.276383638381958,
+ "learning_rate": 2.2695473435335735e-10,
+ "loss": 0.323,
+ "step": 12636
+ },
+ {
+ "epoch": 5.974940898345154,
+ "grad_norm": 3.3910670280456543,
+ "learning_rate": 2.1862695567531623e-10,
+ "loss": 0.3771,
+ "step": 12637
+ },
+ {
+ "epoch": 5.975413711583924,
+ "grad_norm": 3.0644946098327637,
+ "learning_rate": 2.1045482742232392e-10,
+ "loss": 0.328,
+ "step": 12638
+ },
+ {
+ "epoch": 5.975886524822695,
+ "grad_norm": 3.269826650619507,
+ "learning_rate": 2.0243835010314016e-10,
+ "loss": 0.3167,
+ "step": 12639
+ },
+ {
+ "epoch": 5.9763593380614655,
+ "grad_norm": 3.083881378173828,
+ "learning_rate": 1.945775242168102e-10,
+ "loss": 0.299,
+ "step": 12640
+ },
+ {
+ "epoch": 5.976832151300236,
+ "grad_norm": 3.8910908699035645,
+ "learning_rate": 1.8687235025266482e-10,
+ "loss": 0.3703,
+ "step": 12641
+ },
+ {
+ "epoch": 5.977304964539007,
+ "grad_norm": 3.2514829635620117,
+ "learning_rate": 1.793228286905979e-10,
+ "loss": 0.3372,
+ "step": 12642
+ },
+ {
+ "epoch": 5.977777777777778,
+ "grad_norm": 3.677476167678833,
+ "learning_rate": 1.7192896000078896e-10,
+ "loss": 0.3025,
+ "step": 12643
+ },
+ {
+ "epoch": 5.978250591016549,
+ "grad_norm": 3.4517319202423096,
+ "learning_rate": 1.6469074464370295e-10,
+ "loss": 0.3534,
+ "step": 12644
+ },
+ {
+ "epoch": 5.9787234042553195,
+ "grad_norm": 3.403575897216797,
+ "learning_rate": 1.5760818306981286e-10,
+ "loss": 0.3381,
+ "step": 12645
+ },
+ {
+ "epoch": 5.97919621749409,
+ "grad_norm": 3.3433964252471924,
+ "learning_rate": 1.5068127572015477e-10,
+ "loss": 0.3,
+ "step": 12646
+ },
+ {
+ "epoch": 5.97966903073286,
+ "grad_norm": 3.4642858505249023,
+ "learning_rate": 1.439100230260504e-10,
+ "loss": 0.3366,
+ "step": 12647
+ },
+ {
+ "epoch": 5.980141843971631,
+ "grad_norm": 3.233149290084839,
+ "learning_rate": 1.3729442540910687e-10,
+ "loss": 0.3518,
+ "step": 12648
+ },
+ {
+ "epoch": 5.980614657210402,
+ "grad_norm": 4.615501880645752,
+ "learning_rate": 1.3083448328121694e-10,
+ "loss": 0.3188,
+ "step": 12649
+ },
+ {
+ "epoch": 5.9810874704491725,
+ "grad_norm": 3.344639778137207,
+ "learning_rate": 1.2453019704483648e-10,
+ "loss": 0.3275,
+ "step": 12650
+ },
+ {
+ "epoch": 5.981560283687943,
+ "grad_norm": 3.085968017578125,
+ "learning_rate": 1.1838156709215176e-10,
+ "loss": 0.3019,
+ "step": 12651
+ },
+ {
+ "epoch": 5.982033096926714,
+ "grad_norm": 3.3743207454681396,
+ "learning_rate": 1.1238859380618971e-10,
+ "loss": 0.3372,
+ "step": 12652
+ },
+ {
+ "epoch": 5.982505910165485,
+ "grad_norm": 3.885718822479248,
+ "learning_rate": 1.0655127756026285e-10,
+ "loss": 0.3697,
+ "step": 12653
+ },
+ {
+ "epoch": 5.982978723404255,
+ "grad_norm": 3.1877360343933105,
+ "learning_rate": 1.0086961871769163e-10,
+ "loss": 0.314,
+ "step": 12654
+ },
+ {
+ "epoch": 5.983451536643026,
+ "grad_norm": 3.0923852920532227,
+ "learning_rate": 9.534361763208211e-11,
+ "loss": 0.3071,
+ "step": 12655
+ },
+ {
+ "epoch": 5.9839243498817964,
+ "grad_norm": 3.0377893447875977,
+ "learning_rate": 8.997327464788097e-11,
+ "loss": 0.2921,
+ "step": 12656
+ },
+ {
+ "epoch": 5.984397163120567,
+ "grad_norm": 3.351410150527954,
+ "learning_rate": 8.475859009898779e-11,
+ "loss": 0.3607,
+ "step": 12657
+ },
+ {
+ "epoch": 5.984869976359338,
+ "grad_norm": 3.2637524604797363,
+ "learning_rate": 7.969956431069792e-11,
+ "loss": 0.3243,
+ "step": 12658
+ },
+ {
+ "epoch": 5.985342789598109,
+ "grad_norm": 3.276374101638794,
+ "learning_rate": 7.479619759748203e-11,
+ "loss": 0.3364,
+ "step": 12659
+ },
+ {
+ "epoch": 5.98581560283688,
+ "grad_norm": 3.1528098583221436,
+ "learning_rate": 7.004849026492899e-11,
+ "loss": 0.3108,
+ "step": 12660
+ },
+ {
+ "epoch": 5.98628841607565,
+ "grad_norm": 3.1490159034729004,
+ "learning_rate": 6.545644260863571e-11,
+ "loss": 0.2671,
+ "step": 12661
+ },
+ {
+ "epoch": 5.986761229314421,
+ "grad_norm": 3.179043769836426,
+ "learning_rate": 6.10200549144846e-11,
+ "loss": 0.3132,
+ "step": 12662
+ },
+ {
+ "epoch": 5.987234042553191,
+ "grad_norm": 3.195889711380005,
+ "learning_rate": 5.673932745864363e-11,
+ "loss": 0.3239,
+ "step": 12663
+ },
+ {
+ "epoch": 5.987706855791962,
+ "grad_norm": 4.262112140655518,
+ "learning_rate": 5.261426050756635e-11,
+ "loss": 0.3875,
+ "step": 12664
+ },
+ {
+ "epoch": 5.988179669030733,
+ "grad_norm": 3.3035454750061035,
+ "learning_rate": 4.864485431854693e-11,
+ "loss": 0.3001,
+ "step": 12665
+ },
+ {
+ "epoch": 5.9886524822695035,
+ "grad_norm": 3.0973124504089355,
+ "learning_rate": 4.483110913833244e-11,
+ "loss": 0.2993,
+ "step": 12666
+ },
+ {
+ "epoch": 5.989125295508274,
+ "grad_norm": 3.536430835723877,
+ "learning_rate": 4.117302520451061e-11,
+ "loss": 0.3625,
+ "step": 12667
+ },
+ {
+ "epoch": 5.989598108747045,
+ "grad_norm": 3.1173746585845947,
+ "learning_rate": 3.767060274495471e-11,
+ "loss": 0.3317,
+ "step": 12668
+ },
+ {
+ "epoch": 5.990070921985816,
+ "grad_norm": 2.9591150283813477,
+ "learning_rate": 3.432384197754601e-11,
+ "loss": 0.2764,
+ "step": 12669
+ },
+ {
+ "epoch": 5.990543735224586,
+ "grad_norm": 3.2574095726013184,
+ "learning_rate": 3.113274311072889e-11,
+ "loss": 0.2925,
+ "step": 12670
+ },
+ {
+ "epoch": 5.991016548463357,
+ "grad_norm": 3.135549306869507,
+ "learning_rate": 2.8097306343233265e-11,
+ "loss": 0.3145,
+ "step": 12671
+ },
+ {
+ "epoch": 5.991489361702127,
+ "grad_norm": 3.5706069469451904,
+ "learning_rate": 2.5217531864074607e-11,
+ "loss": 0.3425,
+ "step": 12672
+ },
+ {
+ "epoch": 5.991962174940898,
+ "grad_norm": 3.5643908977508545,
+ "learning_rate": 2.2493419852831487e-11,
+ "loss": 0.3576,
+ "step": 12673
+ },
+ {
+ "epoch": 5.992434988179669,
+ "grad_norm": 3.684077739715576,
+ "learning_rate": 1.9924970478535365e-11,
+ "loss": 0.3748,
+ "step": 12674
+ },
+ {
+ "epoch": 5.99290780141844,
+ "grad_norm": 3.212735891342163,
+ "learning_rate": 1.7512183901613468e-11,
+ "loss": 0.2961,
+ "step": 12675
+ },
+ {
+ "epoch": 5.993380614657211,
+ "grad_norm": 2.776723623275757,
+ "learning_rate": 1.5255060271945897e-11,
+ "loss": 0.3052,
+ "step": 12676
+ },
+ {
+ "epoch": 5.993853427895981,
+ "grad_norm": 3.1812119483947754,
+ "learning_rate": 1.315359973025343e-11,
+ "loss": 0.2784,
+ "step": 12677
+ },
+ {
+ "epoch": 5.994326241134752,
+ "grad_norm": 3.5751991271972656,
+ "learning_rate": 1.1207802407542379e-11,
+ "loss": 0.309,
+ "step": 12678
+ },
+ {
+ "epoch": 5.994799054373522,
+ "grad_norm": 3.6249916553497314,
+ "learning_rate": 9.417668424827054e-12,
+ "loss": 0.3345,
+ "step": 12679
+ },
+ {
+ "epoch": 5.995271867612293,
+ "grad_norm": 3.166501522064209,
+ "learning_rate": 7.783197893407313e-12,
+ "loss": 0.3697,
+ "step": 12680
+ },
+ {
+ "epoch": 5.995744680851064,
+ "grad_norm": 2.945590019226074,
+ "learning_rate": 6.304390915146119e-12,
+ "loss": 0.2907,
+ "step": 12681
+ },
+ {
+ "epoch": 5.9962174940898345,
+ "grad_norm": 3.273108959197998,
+ "learning_rate": 4.981247582191984e-12,
+ "loss": 0.3229,
+ "step": 12682
+ },
+ {
+ "epoch": 5.996690307328605,
+ "grad_norm": 3.1620328426361084,
+ "learning_rate": 3.813767976978966e-12,
+ "loss": 0.3279,
+ "step": 12683
+ },
+ {
+ "epoch": 5.997163120567376,
+ "grad_norm": 3.197380304336548,
+ "learning_rate": 2.8019521719491182e-12,
+ "loss": 0.3382,
+ "step": 12684
+ },
+ {
+ "epoch": 5.997635933806147,
+ "grad_norm": 3.36995005607605,
+ "learning_rate": 1.945800230662709e-12,
+ "loss": 0.2993,
+ "step": 12685
+ },
+ {
+ "epoch": 5.998108747044917,
+ "grad_norm": 3.0389227867126465,
+ "learning_rate": 1.2453122055777755e-12,
+ "loss": 0.3357,
+ "step": 12686
+ },
+ {
+ "epoch": 5.998581560283688,
+ "grad_norm": 3.352140426635742,
+ "learning_rate": 7.004881411032394e-13,
+ "loss": 0.3167,
+ "step": 12687
+ },
+ {
+ "epoch": 5.999054373522458,
+ "grad_norm": 3.273684024810791,
+ "learning_rate": 3.1132807082334683e-13,
+ "loss": 0.3124,
+ "step": 12688
+ },
+ {
+ "epoch": 5.999527186761229,
+ "grad_norm": 3.941155433654785,
+ "learning_rate": 7.783201888544867e-14,
+ "loss": 0.3472,
+ "step": 12689
+ },
+ {
+ "epoch": 6.0,
+ "grad_norm": 2.634655237197876,
+ "learning_rate": 0.0,
+ "loss": 0.3022,
+ "step": 12690
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 12690,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 6,
+ "save_steps": 2115,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 3.2054353166598144e+19,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-12690/training_args.bin b/checkpoint-12690/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3885e6a27b204c834ff6fbc6f3a5a4c7f69fe2cc
--- /dev/null
+++ b/checkpoint-12690/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:51188d498a3fe1bf9ddf234271786a31e25e89b3aa68721a97f8264e1013c9b6
+size 8056
diff --git a/checkpoint-12690/zero_to_fp32.py b/checkpoint-12690/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8
--- /dev/null
+++ b/checkpoint-12690/zero_to_fp32.py
@@ -0,0 +1,604 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# While this script doesn't use deepspeed to recover data, the checkpoints are pickled with
+# DeepSpeed data structures, so deepspeed has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+ buffers: dict()
+ param_shapes: dict()
+ shared_params: list
+ ds_version: int
+ frozen_param_shapes: dict()
+ frozen_param_fragments: dict()
+
+
+debug = 0
+
+# load to cpu
+device = torch.device('cpu')
+
+
+def atoi(text):
+ return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+ '''
+ alist.sort(key=natural_keys) sorts in human order
+ http://nedbatchelder.com/blog/200712/human_sorting.html
+ (See Toothy's implementation in the comments)
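+
+    For example (illustrative): sorted(["step10", "step2"], key=natural_keys)
+    returns ["step2", "step10"], whereas plain string sorting would put "step10" first.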
+ '''
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+ if not os.path.isdir(checkpoint_dir):
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+ # there should be only one file
+ if zero_stage <= 2:
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+ elif zero_stage == 3:
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+ if not os.path.exists(file):
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+ return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+ # XXX: need to test that this simple glob rule works for multi-node setup too
+ ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+ if len(ckpt_files) == 0:
+ raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+ return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+ zero_model_states = []
+ for file in files:
+ state_dict = torch.load(file, map_location=device)
+
+ if BUFFER_NAMES not in state_dict:
+ raise ValueError(f"{file} is not a model state checkpoint")
+ buffer_names = state_dict[BUFFER_NAMES]
+ if debug:
+ print("Found buffers:", buffer_names)
+
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
+ buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+ param_shapes = state_dict[PARAM_SHAPES]
+
+ # collect parameters that are included in param_shapes
+ param_names = []
+ for s in param_shapes:
+ for name in s.keys():
+ param_names.append(name)
+
+ # update with frozen parameters
+ frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+ if frozen_param_shapes is not None:
+ if debug:
+ print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+ param_names += list(frozen_param_shapes.keys())
+
+ # handle shared params
+ shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+ ds_version = state_dict.get(DS_VERSION, None)
+
+ frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+ z_model_state = zero_model_state(buffers=buffers,
+ param_shapes=param_shapes,
+ shared_params=shared_params,
+ ds_version=ds_version,
+ frozen_param_shapes=frozen_param_shapes,
+ frozen_param_fragments=frozen_param_fragments)
+ zero_model_states.append(z_model_state)
+
+ return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+
+ total_files = len(files)
+ state_dicts = []
+ for f in files:
+ state_dict = torch.load(f, map_location=device)
+        # immediately discard the two potentially huge optimizer states, as we only care about the fp32 master weights
+ # and also handle the case where it was already removed by another helper script
+ state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+ state_dicts.append(state_dict)
+
+ if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
+ # use the max of the partition_count to get the dp world_size.
+
+ if type(world_size) is list:
+ world_size = max(world_size)
+
+ if world_size != total_files:
+ raise ValueError(
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+ )
+
+ # the groups are named differently in each stage
+ if zero_stage <= 2:
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+ elif zero_stage == 3:
+ fp32_groups_key = FP32_FLAT_GROUPS
+ else:
+ raise ValueError(f"unknown zero stage {zero_stage}")
+
+ if zero_stage <= 2:
+ fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+ elif zero_stage == 3:
+ # if there is more than one param group, there will be multiple flattened tensors - one
+ # flattened tensor per group - for simplicity merge them into a single tensor
+ #
+ # XXX: could make the script more memory efficient for when there are multiple groups - it
+ # will require matching the sub-lists of param_shapes for each param group flattened tensor
+
+ fp32_flat_groups = [
+ torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+ ]
+
+ return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+ """
+ Returns fp32 state_dict reconstructed from ds checkpoint
+
+ Args:
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+ """
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+ optim_files = get_optim_files(ds_checkpoint_dir)
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+ print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+ model_files = get_model_state_files(ds_checkpoint_dir)
+
+ zero_model_states = parse_model_states(model_files)
+ print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+ if zero_stage <= 2:
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+ elif zero_stage == 3:
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+ return
+
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+ frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+ if debug:
+ num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+ print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+ wanted_params = len(frozen_param_shapes)
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+ avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+ print(f'Frozen params: Have {avail_numel} numels to process.')
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+ total_params = 0
+ total_numel = 0
+ for name, shape in frozen_param_shapes.items():
+ total_params += 1
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+
+ state_dict[name] = frozen_param_fragments[name]
+
+ if debug:
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+ attr = getattr(obj, fn, None)
+ return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+ param_shapes = zero_model_states[0].param_shapes
+
+ # Reconstruction protocol:
+ #
+ # XXX: document this
+
+ if debug:
+ for i in range(world_size):
+ for j in range(len(fp32_flat_groups[0])):
+ print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+ # XXX: memory usage doubles here (zero2)
+ num_param_groups = len(fp32_flat_groups[0])
+ merged_single_partition_of_fp32_groups = []
+ for i in range(num_param_groups):
+ merged_partitions = [sd[i] for sd in fp32_flat_groups]
+ full_single_fp32_vector = torch.cat(merged_partitions, 0)
+ merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+ avail_numel = sum(
+ [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+ if debug:
+ wanted_params = sum([len(shapes) for shapes in param_shapes])
+ wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+ # not asserting if there is a mismatch due to possible padding
+ print(f"Have {avail_numel} numels to process.")
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+ # params
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+ # out-of-core computing solution
+ total_numel = 0
+ total_params = 0
+ for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+ offset = 0
+ avail_numel = full_single_fp32_vector.numel()
+ for name, shape in shapes.items():
+
+ unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+ total_numel += unpartitioned_numel
+ total_params += 1
+
+ if debug:
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+ state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+ offset += unpartitioned_numel
+
+ # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+ # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+ # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+ # live optimizer object, so we are checking that the numbers are within the right range
+ align_to = 2 * world_size
+
+ def zero2_align(x):
+ return align_to * math.ceil(x / align_to)
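+        # e.g. (illustrative): with world_size=4, align_to == 8, so zero2_align(10) == 16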
+
+ if debug:
+ print(f"original offset={offset}, avail_numel={avail_numel}")
+
+ offset = zero2_align(offset)
+ avail_numel = zero2_align(avail_numel)
+
+ if debug:
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
+
+ # Sanity check
+ if offset != avail_numel:
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+ print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters):
+ state_dict = OrderedDict()
+
+ # buffers
+ buffers = zero_model_states[0].buffers
+ state_dict.update(buffers)
+ if debug:
+ print(f"added {len(buffers)} buffers")
+
+ if not exclude_frozen_parameters:
+ _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+ _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+ # recover shared parameters
+ for pair in zero_model_states[0].shared_params:
+ if pair[1] in state_dict:
+ state_dict[pair[0]] = state_dict[pair[1]]
+
+ return state_dict
+
+
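+# Illustrative example for the helper below: unpartitioned_numel=10 with world_size=4
+# gives remainder=2, hence padding_numel=2 and partitioned_numel=math.ceil(10/4)=3.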
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+ remainder = unpartitioned_numel % world_size
+ padding_numel = (world_size - remainder) if remainder else 0
+ partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+ return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+ return
+
+ if debug:
+ for i in range(world_size):
+ num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+ print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+ wanted_params = len(frozen_param_shapes)
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+ avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+ print(f'Frozen params: Have {avail_numel} numels to process.')
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+ total_params = 0
+ total_numel = 0
+ for name, shape in zero_model_states[0].frozen_param_shapes.items():
+ total_params += 1
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+
+ param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+ state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+ if debug:
+ print(
+ f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+ )
+
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+ param_shapes = zero_model_states[0].param_shapes
+ avail_numel = fp32_flat_groups[0].numel() * world_size
+ # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+ # param, re-consolidating each param, while dealing with padding if any
+
+ # merge list of dicts, preserving order
+ param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+ if debug:
+ for i in range(world_size):
+ print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+ wanted_params = len(param_shapes)
+ wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+ # not asserting if there is a mismatch due to possible padding
+ avail_numel = fp32_flat_groups[0].numel() * world_size
+ print(f"Trainable params: Have {avail_numel} numels to process.")
+ print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+ # params
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+ # out-of-core computing solution
+ offset = 0
+ total_numel = 0
+ total_params = 0
+ for name, shape in param_shapes.items():
+
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+ total_params += 1
+
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+ if debug:
+ print(
+ f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+ )
+
+ # XXX: memory usage doubles here
+ state_dict[name] = torch.cat(
+ tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+ 0).narrow(0, 0, unpartitioned_numel).view(shape)
+ offset += partitioned_numel
+
+ offset *= world_size
+
+ # Sanity check
+ if offset != avail_numel:
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+ print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters):
+ state_dict = OrderedDict()
+
+ # buffers
+ buffers = zero_model_states[0].buffers
+ state_dict.update(buffers)
+ if debug:
+ print(f"added {len(buffers)} buffers")
+
+ if not exclude_frozen_parameters:
+ _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+ _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+ # recover shared parameters
+ for pair in zero_model_states[0].shared_params:
+ if pair[1] in state_dict:
+ state_dict[pair[0]] = state_dict[pair[1]]
+
+ return state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
+ """
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+ ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+ via a model hub.
+
+ Args:
+ - ``checkpoint_dir``: path to the desired checkpoint folder
+    - ``tag``: checkpoint tag used as a unique identifier for the checkpoint. If not provided, will attempt to load the tag from the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+ - ``exclude_frozen_parameters``: exclude frozen parameters
+
+ Returns:
+ - pytorch ``state_dict``
+
+    Note: this approach may not work if your application doesn't have sufficient free CPU memory. In
+    that case you may need to use the offline approach via the ``zero_to_fp32.py`` script that is
+    saved with the checkpoint.
+
+ A typical usage might be ::
+
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+ # do the training and checkpoint saving
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+ model = model.cpu() # move to cpu
+ model.load_state_dict(state_dict)
+ # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application, i.e. you will need to re-initialize the deepspeed engine, since
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+ If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+ """
+ if tag is None:
+ latest_path = os.path.join(checkpoint_dir, 'latest')
+ if os.path.isfile(latest_path):
+ with open(latest_path, 'r') as fd:
+ tag = fd.read().strip()
+ else:
+ raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+ ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+ if not os.path.isdir(ds_checkpoint_dir):
+ raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+ return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
+ """
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+ loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+ Args:
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+ - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+    - ``tag``: checkpoint tag used as a unique identifier for the checkpoint. If not provided, will attempt to load the tag from the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+ - ``exclude_frozen_parameters``: exclude frozen parameters
+ """
+
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
+ print(f"Saving fp32 state dict to {output_file}")
+ torch.save(state_dict, output_file)
+
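+# A minimal usage sketch for convert_zero_checkpoint_to_fp32_state_dict defined above
+# (paths are illustrative, mirroring the CLI example at the bottom of this script):
+#
+#   from zero_to_fp32 import convert_zero_checkpoint_to_fp32_state_dict
+#   convert_zero_checkpoint_to_fp32_state_dict("path/checkpoint-12", "path/checkpoint-12/pytorch_model.bin")
+#
+# With tag=None this reads the folder named in path/checkpoint-12/latest (e.g. global_step14)
+# and writes a single consolidated fp32 state_dict to the given output file.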
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+ """
+    1. Put the provided model on the cpu
+ 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+ 3. Load it into the provided model
+
+ Args:
+ - ``model``: the model object to update
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+    - ``tag``: checkpoint tag used as a unique identifier for the checkpoint. If not provided, will attempt to load the tag from the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+ Returns:
+    - ``model``: the modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough, use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+ conveniently placed for you in the checkpoint folder.
+
+ A typical usage might be ::
+
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+ model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+ # submit to model hub or save the model to share with others
+
+    Note that once this has been run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application, i.e. you will need to re-initialize the deepspeed engine, since
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+ """
+ logger.info(f"Extracting fp32 weights")
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+ logger.info(f"Overwriting model with fp32 weights")
+ model = model.cpu()
+ model.load_state_dict(state_dict, strict=False)
+
+ return model
+
+
+if __name__ == "__main__":
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("checkpoint_dir",
+ type=str,
+ help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+ parser.add_argument(
+ "output_file",
+ type=str,
+ help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+ parser.add_argument("-t",
+ "--tag",
+ type=str,
+ default=None,
+ help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+ parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+ parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+ args = parser.parse_args()
+
+ debug = args.debug
+
+ convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+ args.output_file,
+ tag=args.tag,
+ exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/checkpoint-2115/README.md b/checkpoint-2115/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..049d467664ca6172b7ffbe6ba60b3eac7479cac4
--- /dev/null
+++ b/checkpoint-2115/README.md
@@ -0,0 +1,202 @@
+---
+base_model: meta-llama/Llama-3.1-8B
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.14.0
\ No newline at end of file
diff --git a/checkpoint-2115/adapter_config.json b/checkpoint-2115/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..38890ca21f7e1854f7350049a9113a9eec8a443a
--- /dev/null
+++ b/checkpoint-2115/adapter_config.json
@@ -0,0 +1,40 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "meta-llama/Llama-3.1-8B",
+ "bias": "none",
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 512,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": [
+ "embed_tokens",
+ "lm_head"
+ ],
+ "peft_type": "LORA",
+ "r": 256,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "v_proj",
+ "q_proj",
+ "gate_proj",
+ "k_proj",
+ "down_proj",
+ "up_proj",
+ "o_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-2115/adapter_model.safetensors b/checkpoint-2115/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..4da763432abb5296cfd6c4ab09bc27dd7dba8c11
--- /dev/null
+++ b/checkpoint-2115/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:026f3c7889342f0fe43b51086158facacc03251e62f9af75a2561c3bd540fe59
+size 3443586272
diff --git a/checkpoint-2115/global_step2115/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-2115/global_step2115/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..8d809445a1e9ab5ad0f90d907f08223e2bca9493
--- /dev/null
+++ b/checkpoint-2115/global_step2115/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec14e35a08f2f89de6b56dfedb73f3de522db79770e7c655e61fb822a23a2942
+size 20661195036
diff --git a/checkpoint-2115/global_step2115/mp_rank_00_model_states.pt b/checkpoint-2115/global_step2115/mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..8dd07acf4b4fbfa964e3f99ed0a94aa23b91cb56
--- /dev/null
+++ b/checkpoint-2115/global_step2115/mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aebb51cfec455d55d4ac9fb1829aeb3b1cb2978ad761fc8fb52f73a313fc76a6
+size 3555326777
diff --git a/checkpoint-2115/latest b/checkpoint-2115/latest
new file mode 100644
index 0000000000000000000000000000000000000000..eab2857ea42f429150d4483575e33a16db9abe77
--- /dev/null
+++ b/checkpoint-2115/latest
@@ -0,0 +1 @@
+global_step2115
\ No newline at end of file
diff --git a/checkpoint-2115/rng_state.pth b/checkpoint-2115/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..7435620051efd12b77e0ed7e5c44cebaacb7bbea
--- /dev/null
+++ b/checkpoint-2115/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3626e2b3ca653eb16e81523f796a5be969ddc5e19daefb968419425ad5a11285
+size 14244
diff --git a/checkpoint-2115/scheduler.pt b/checkpoint-2115/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..22828224539edb6b797a5020a5994a7ead0dd2e4
--- /dev/null
+++ b/checkpoint-2115/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f6f065ea9d96632a81e5e933322d38152bef3ca141882db2bf749d2021df436b
+size 1064
diff --git a/checkpoint-2115/special_tokens_map.json b/checkpoint-2115/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..e5b39b6305d89284b04934011c68dbb26bf588ca
--- /dev/null
+++ b/checkpoint-2115/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-2115/tokenizer.json b/checkpoint-2115/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2
--- /dev/null
+++ b/checkpoint-2115/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
diff --git a/checkpoint-2115/tokenizer_config.json b/checkpoint-2115/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..81dd14db6632ad5b35b9d447732e37ac074873a5
--- /dev/null
+++ b/checkpoint-2115/tokenizer_config.json
@@ -0,0 +1,2063 @@
+{
+ "added_tokens_decoder": {
+ "128000": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128001": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128002": {
+ "content": "<|reserved_special_token_0|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128003": {
+ "content": "<|reserved_special_token_1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128004": {
+ "content": "<|finetune_right_pad_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128005": {
+ "content": "<|reserved_special_token_2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128006": {
+ "content": "<|start_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128007": {
+ "content": "<|end_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128008": {
+ "content": "<|eom_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128009": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128010": {
+ "content": "<|python_tag|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128011": {
+ "content": "<|reserved_special_token_3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128012": {
+ "content": "<|reserved_special_token_4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128013": {
+ "content": "<|reserved_special_token_5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128014": {
+ "content": "<|reserved_special_token_6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128015": {
+ "content": "<|reserved_special_token_7|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128016": {
+ "content": "<|reserved_special_token_8|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128017": {
+ "content": "<|reserved_special_token_9|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128018": {
+ "content": "<|reserved_special_token_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128019": {
+ "content": "<|reserved_special_token_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128020": {
+ "content": "<|reserved_special_token_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128021": {
+ "content": "<|reserved_special_token_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128022": {
+ "content": "<|reserved_special_token_14|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128023": {
+ "content": "<|reserved_special_token_15|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128024": {
+ "content": "<|reserved_special_token_16|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128025": {
+ "content": "<|reserved_special_token_17|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128026": {
+ "content": "<|reserved_special_token_18|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128027": {
+ "content": "<|reserved_special_token_19|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128028": {
+ "content": "<|reserved_special_token_20|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128029": {
+ "content": "<|reserved_special_token_21|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128030": {
+ "content": "<|reserved_special_token_22|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128031": {
+ "content": "<|reserved_special_token_23|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128032": {
+ "content": "<|reserved_special_token_24|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128033": {
+ "content": "<|reserved_special_token_25|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128034": {
+ "content": "<|reserved_special_token_26|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128035": {
+ "content": "<|reserved_special_token_27|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128036": {
+ "content": "<|reserved_special_token_28|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128037": {
+ "content": "<|reserved_special_token_29|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128038": {
+ "content": "<|reserved_special_token_30|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128039": {
+ "content": "<|reserved_special_token_31|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128040": {
+ "content": "<|reserved_special_token_32|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128041": {
+ "content": "<|reserved_special_token_33|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128042": {
+ "content": "<|reserved_special_token_34|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128043": {
+ "content": "<|reserved_special_token_35|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128044": {
+ "content": "<|reserved_special_token_36|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128045": {
+ "content": "<|reserved_special_token_37|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128046": {
+ "content": "<|reserved_special_token_38|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128047": {
+ "content": "<|reserved_special_token_39|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128048": {
+ "content": "<|reserved_special_token_40|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128049": {
+ "content": "<|reserved_special_token_41|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128050": {
+ "content": "<|reserved_special_token_42|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128051": {
+ "content": "<|reserved_special_token_43|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128052": {
+ "content": "<|reserved_special_token_44|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128053": {
+ "content": "<|reserved_special_token_45|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128054": {
+ "content": "<|reserved_special_token_46|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128055": {
+ "content": "<|reserved_special_token_47|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128056": {
+ "content": "<|reserved_special_token_48|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128057": {
+ "content": "<|reserved_special_token_49|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128058": {
+ "content": "<|reserved_special_token_50|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128059": {
+ "content": "<|reserved_special_token_51|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128060": {
+ "content": "<|reserved_special_token_52|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128061": {
+ "content": "<|reserved_special_token_53|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128062": {
+ "content": "<|reserved_special_token_54|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128063": {
+ "content": "<|reserved_special_token_55|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128064": {
+ "content": "<|reserved_special_token_56|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128065": {
+ "content": "<|reserved_special_token_57|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128066": {
+ "content": "<|reserved_special_token_58|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128067": {
+ "content": "<|reserved_special_token_59|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128068": {
+ "content": "<|reserved_special_token_60|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128069": {
+ "content": "<|reserved_special_token_61|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128070": {
+ "content": "<|reserved_special_token_62|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128071": {
+ "content": "<|reserved_special_token_63|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128072": {
+ "content": "<|reserved_special_token_64|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128073": {
+ "content": "<|reserved_special_token_65|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128074": {
+ "content": "<|reserved_special_token_66|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128075": {
+ "content": "<|reserved_special_token_67|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128076": {
+ "content": "<|reserved_special_token_68|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128077": {
+ "content": "<|reserved_special_token_69|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128078": {
+ "content": "<|reserved_special_token_70|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128079": {
+ "content": "<|reserved_special_token_71|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128080": {
+ "content": "<|reserved_special_token_72|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128081": {
+ "content": "<|reserved_special_token_73|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128082": {
+ "content": "<|reserved_special_token_74|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128083": {
+ "content": "<|reserved_special_token_75|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128084": {
+ "content": "<|reserved_special_token_76|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128085": {
+ "content": "<|reserved_special_token_77|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128086": {
+ "content": "<|reserved_special_token_78|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128087": {
+ "content": "<|reserved_special_token_79|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128088": {
+ "content": "<|reserved_special_token_80|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128089": {
+ "content": "<|reserved_special_token_81|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128090": {
+ "content": "<|reserved_special_token_82|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128091": {
+ "content": "<|reserved_special_token_83|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128092": {
+ "content": "<|reserved_special_token_84|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128093": {
+ "content": "<|reserved_special_token_85|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128094": {
+ "content": "<|reserved_special_token_86|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128095": {
+ "content": "<|reserved_special_token_87|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128096": {
+ "content": "<|reserved_special_token_88|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128097": {
+ "content": "<|reserved_special_token_89|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128098": {
+ "content": "<|reserved_special_token_90|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128099": {
+ "content": "<|reserved_special_token_91|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128100": {
+ "content": "<|reserved_special_token_92|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128101": {
+ "content": "<|reserved_special_token_93|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128102": {
+ "content": "<|reserved_special_token_94|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128103": {
+ "content": "<|reserved_special_token_95|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128104": {
+ "content": "<|reserved_special_token_96|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128105": {
+ "content": "<|reserved_special_token_97|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128106": {
+ "content": "<|reserved_special_token_98|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128107": {
+ "content": "<|reserved_special_token_99|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128108": {
+ "content": "<|reserved_special_token_100|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128109": {
+ "content": "<|reserved_special_token_101|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128110": {
+ "content": "<|reserved_special_token_102|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128111": {
+ "content": "<|reserved_special_token_103|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128112": {
+ "content": "<|reserved_special_token_104|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128113": {
+ "content": "<|reserved_special_token_105|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128114": {
+ "content": "<|reserved_special_token_106|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128115": {
+ "content": "<|reserved_special_token_107|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128116": {
+ "content": "<|reserved_special_token_108|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128117": {
+ "content": "<|reserved_special_token_109|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128118": {
+ "content": "<|reserved_special_token_110|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128119": {
+ "content": "<|reserved_special_token_111|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128120": {
+ "content": "<|reserved_special_token_112|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128121": {
+ "content": "<|reserved_special_token_113|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128122": {
+ "content": "<|reserved_special_token_114|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128123": {
+ "content": "<|reserved_special_token_115|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128124": {
+ "content": "<|reserved_special_token_116|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128125": {
+ "content": "<|reserved_special_token_117|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128126": {
+ "content": "<|reserved_special_token_118|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128127": {
+ "content": "<|reserved_special_token_119|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128128": {
+ "content": "<|reserved_special_token_120|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128129": {
+ "content": "<|reserved_special_token_121|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128130": {
+ "content": "<|reserved_special_token_122|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128131": {
+ "content": "<|reserved_special_token_123|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128132": {
+ "content": "<|reserved_special_token_124|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128133": {
+ "content": "<|reserved_special_token_125|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128134": {
+ "content": "<|reserved_special_token_126|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128135": {
+ "content": "<|reserved_special_token_127|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128136": {
+ "content": "<|reserved_special_token_128|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128137": {
+ "content": "<|reserved_special_token_129|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128138": {
+ "content": "<|reserved_special_token_130|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128139": {
+ "content": "<|reserved_special_token_131|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128140": {
+ "content": "<|reserved_special_token_132|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128141": {
+ "content": "<|reserved_special_token_133|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128142": {
+ "content": "<|reserved_special_token_134|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128143": {
+ "content": "<|reserved_special_token_135|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128144": {
+ "content": "<|reserved_special_token_136|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128145": {
+ "content": "<|reserved_special_token_137|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128146": {
+ "content": "<|reserved_special_token_138|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128147": {
+ "content": "<|reserved_special_token_139|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128148": {
+ "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128149": {
+ "content": "<|reserved_special_token_141|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128150": {
+ "content": "<|reserved_special_token_142|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128151": {
+ "content": "<|reserved_special_token_143|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128152": {
+ "content": "<|reserved_special_token_144|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128153": {
+ "content": "<|reserved_special_token_145|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128154": {
+ "content": "<|reserved_special_token_146|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128155": {
+ "content": "<|reserved_special_token_147|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128156": {
+ "content": "<|reserved_special_token_148|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128157": {
+ "content": "<|reserved_special_token_149|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128158": {
+ "content": "<|reserved_special_token_150|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128159": {
+ "content": "<|reserved_special_token_151|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128160": {
+ "content": "<|reserved_special_token_152|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128161": {
+ "content": "<|reserved_special_token_153|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128162": {
+ "content": "<|reserved_special_token_154|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128163": {
+ "content": "<|reserved_special_token_155|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128164": {
+ "content": "<|reserved_special_token_156|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128165": {
+ "content": "<|reserved_special_token_157|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128166": {
+ "content": "<|reserved_special_token_158|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128167": {
+ "content": "<|reserved_special_token_159|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128168": {
+ "content": "<|reserved_special_token_160|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128169": {
+ "content": "<|reserved_special_token_161|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128170": {
+ "content": "<|reserved_special_token_162|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128171": {
+ "content": "<|reserved_special_token_163|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128172": {
+ "content": "<|reserved_special_token_164|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128173": {
+ "content": "<|reserved_special_token_165|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128174": {
+ "content": "<|reserved_special_token_166|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128175": {
+ "content": "<|reserved_special_token_167|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128176": {
+ "content": "<|reserved_special_token_168|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128177": {
+ "content": "<|reserved_special_token_169|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128178": {
+ "content": "<|reserved_special_token_170|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128179": {
+ "content": "<|reserved_special_token_171|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128180": {
+ "content": "<|reserved_special_token_172|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128181": {
+ "content": "<|reserved_special_token_173|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128182": {
+ "content": "<|reserved_special_token_174|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128183": {
+ "content": "<|reserved_special_token_175|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128184": {
+ "content": "<|reserved_special_token_176|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128185": {
+ "content": "<|reserved_special_token_177|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128186": {
+ "content": "<|reserved_special_token_178|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128187": {
+ "content": "<|reserved_special_token_179|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128188": {
+ "content": "<|reserved_special_token_180|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128189": {
+ "content": "<|reserved_special_token_181|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128190": {
+ "content": "<|reserved_special_token_182|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128191": {
+ "content": "<|reserved_special_token_183|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128192": {
+ "content": "<|reserved_special_token_184|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128193": {
+ "content": "<|reserved_special_token_185|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128194": {
+ "content": "<|reserved_special_token_186|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128195": {
+ "content": "<|reserved_special_token_187|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128196": {
+ "content": "<|reserved_special_token_188|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128197": {
+ "content": "<|reserved_special_token_189|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128198": {
+ "content": "<|reserved_special_token_190|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128199": {
+ "content": "<|reserved_special_token_191|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128200": {
+ "content": "<|reserved_special_token_192|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128201": {
+ "content": "<|reserved_special_token_193|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128202": {
+ "content": "<|reserved_special_token_194|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128203": {
+ "content": "<|reserved_special_token_195|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128204": {
+ "content": "<|reserved_special_token_196|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128205": {
+ "content": "<|reserved_special_token_197|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128206": {
+ "content": "<|reserved_special_token_198|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128207": {
+ "content": "<|reserved_special_token_199|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128208": {
+ "content": "<|reserved_special_token_200|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128209": {
+ "content": "<|reserved_special_token_201|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128210": {
+ "content": "<|reserved_special_token_202|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128211": {
+ "content": "<|reserved_special_token_203|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128212": {
+ "content": "<|reserved_special_token_204|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128213": {
+ "content": "<|reserved_special_token_205|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128214": {
+ "content": "<|reserved_special_token_206|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128215": {
+ "content": "<|reserved_special_token_207|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128216": {
+ "content": "<|reserved_special_token_208|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128217": {
+ "content": "<|reserved_special_token_209|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128218": {
+ "content": "<|reserved_special_token_210|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128219": {
+ "content": "<|reserved_special_token_211|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128220": {
+ "content": "<|reserved_special_token_212|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128221": {
+ "content": "<|reserved_special_token_213|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128222": {
+ "content": "<|reserved_special_token_214|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128223": {
+ "content": "<|reserved_special_token_215|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128224": {
+ "content": "<|reserved_special_token_216|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128225": {
+ "content": "<|reserved_special_token_217|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128226": {
+ "content": "<|reserved_special_token_218|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128227": {
+ "content": "<|reserved_special_token_219|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128228": {
+ "content": "<|reserved_special_token_220|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128229": {
+ "content": "<|reserved_special_token_221|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128230": {
+ "content": "<|reserved_special_token_222|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128231": {
+ "content": "<|reserved_special_token_223|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128232": {
+ "content": "<|reserved_special_token_224|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128233": {
+ "content": "<|reserved_special_token_225|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128234": {
+ "content": "<|reserved_special_token_226|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128235": {
+ "content": "<|reserved_special_token_227|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128236": {
+ "content": "<|reserved_special_token_228|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128237": {
+ "content": "<|reserved_special_token_229|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128238": {
+ "content": "<|reserved_special_token_230|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128239": {
+ "content": "<|reserved_special_token_231|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128240": {
+ "content": "<|reserved_special_token_232|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128241": {
+ "content": "<|reserved_special_token_233|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128242": {
+ "content": "<|reserved_special_token_234|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128243": {
+ "content": "<|reserved_special_token_235|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128244": {
+ "content": "<|reserved_special_token_236|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128245": {
+ "content": "<|reserved_special_token_237|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128246": {
+ "content": "<|reserved_special_token_238|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128247": {
+ "content": "<|reserved_special_token_239|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128248": {
+ "content": "<|reserved_special_token_240|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128249": {
+ "content": "<|reserved_special_token_241|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128250": {
+ "content": "<|reserved_special_token_242|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128251": {
+ "content": "<|reserved_special_token_243|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128252": {
+ "content": "<|reserved_special_token_244|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128253": {
+ "content": "<|reserved_special_token_245|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128254": {
+ "content": "<|reserved_special_token_246|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128255": {
+ "content": "<|reserved_special_token_247|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|begin_of_text|>",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|end_of_text|>",
+ "extra_special_tokens": {},
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 131072,
+ "pad_token": "<|end_of_text|>",
+ "tokenizer_class": "PreTrainedTokenizer"
+}
diff --git a/checkpoint-2115/trainer_state.json b/checkpoint-2115/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..d54d1ea9b36cf3c6ea365748311c184de9beea1f
--- /dev/null
+++ b/checkpoint-2115/trainer_state.json
@@ -0,0 +1,14838 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.0,
+ "eval_steps": 500,
+ "global_step": 2115,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.00047281323877068556,
+ "grad_norm": 5.163570880889893,
+ "learning_rate": 5.0000000000000004e-08,
+ "loss": 1.4628,
+ "step": 1
+ },
+ {
+ "epoch": 0.0009456264775413711,
+ "grad_norm": 6.298020839691162,
+ "learning_rate": 1.0000000000000001e-07,
+ "loss": 1.5003,
+ "step": 2
+ },
+ {
+ "epoch": 0.0014184397163120568,
+ "grad_norm": 5.853623390197754,
+ "learning_rate": 1.5000000000000002e-07,
+ "loss": 1.4495,
+ "step": 3
+ },
+ {
+ "epoch": 0.0018912529550827422,
+ "grad_norm": 5.456025123596191,
+ "learning_rate": 2.0000000000000002e-07,
+ "loss": 1.3798,
+ "step": 4
+ },
+ {
+ "epoch": 0.002364066193853428,
+ "grad_norm": 5.757407188415527,
+ "learning_rate": 2.5000000000000004e-07,
+ "loss": 1.4515,
+ "step": 5
+ },
+ {
+ "epoch": 0.0028368794326241137,
+ "grad_norm": 5.872277736663818,
+ "learning_rate": 3.0000000000000004e-07,
+ "loss": 1.4424,
+ "step": 6
+ },
+ {
+ "epoch": 0.003309692671394799,
+ "grad_norm": 6.7816009521484375,
+ "learning_rate": 3.5000000000000004e-07,
+ "loss": 1.4004,
+ "step": 7
+ },
+ {
+ "epoch": 0.0037825059101654845,
+ "grad_norm": 6.229667663574219,
+ "learning_rate": 4.0000000000000003e-07,
+ "loss": 1.4494,
+ "step": 8
+ },
+ {
+ "epoch": 0.00425531914893617,
+ "grad_norm": 5.336202621459961,
+ "learning_rate": 4.5000000000000003e-07,
+ "loss": 1.3916,
+ "step": 9
+ },
+ {
+ "epoch": 0.004728132387706856,
+ "grad_norm": 5.589445114135742,
+ "learning_rate": 5.000000000000001e-07,
+ "loss": 1.2318,
+ "step": 10
+ },
+ {
+ "epoch": 0.005200945626477541,
+ "grad_norm": 5.720539569854736,
+ "learning_rate": 5.5e-07,
+ "loss": 1.4367,
+ "step": 11
+ },
+ {
+ "epoch": 0.005673758865248227,
+ "grad_norm": 5.913913726806641,
+ "learning_rate": 6.000000000000001e-07,
+ "loss": 1.342,
+ "step": 12
+ },
+ {
+ "epoch": 0.006146572104018913,
+ "grad_norm": 5.899744987487793,
+ "learning_rate": 6.5e-07,
+ "loss": 1.4307,
+ "step": 13
+ },
+ {
+ "epoch": 0.006619385342789598,
+ "grad_norm": 5.571037292480469,
+ "learning_rate": 7.000000000000001e-07,
+ "loss": 1.3372,
+ "step": 14
+ },
+ {
+ "epoch": 0.0070921985815602835,
+ "grad_norm": 5.480010509490967,
+ "learning_rate": 7.5e-07,
+ "loss": 1.3923,
+ "step": 15
+ },
+ {
+ "epoch": 0.007565011820330969,
+ "grad_norm": 5.254702091217041,
+ "learning_rate": 8.000000000000001e-07,
+ "loss": 1.2928,
+ "step": 16
+ },
+ {
+ "epoch": 0.008037825059101654,
+ "grad_norm": 6.090312480926514,
+ "learning_rate": 8.500000000000001e-07,
+ "loss": 1.4984,
+ "step": 17
+ },
+ {
+ "epoch": 0.00851063829787234,
+ "grad_norm": 5.689319610595703,
+ "learning_rate": 9.000000000000001e-07,
+ "loss": 1.4108,
+ "step": 18
+ },
+ {
+ "epoch": 0.008983451536643027,
+ "grad_norm": 5.386685848236084,
+ "learning_rate": 9.500000000000001e-07,
+ "loss": 1.425,
+ "step": 19
+ },
+ {
+ "epoch": 0.009456264775413711,
+ "grad_norm": 6.451584815979004,
+ "learning_rate": 1.0000000000000002e-06,
+ "loss": 1.5507,
+ "step": 20
+ },
+ {
+ "epoch": 0.009929078014184398,
+ "grad_norm": 5.37647008895874,
+ "learning_rate": 1.0500000000000001e-06,
+ "loss": 1.4109,
+ "step": 21
+ },
+ {
+ "epoch": 0.010401891252955082,
+ "grad_norm": 4.716553211212158,
+ "learning_rate": 1.1e-06,
+ "loss": 1.2028,
+ "step": 22
+ },
+ {
+ "epoch": 0.010874704491725768,
+ "grad_norm": 4.950989723205566,
+ "learning_rate": 1.1500000000000002e-06,
+ "loss": 1.3043,
+ "step": 23
+ },
+ {
+ "epoch": 0.011347517730496455,
+ "grad_norm": 4.688975811004639,
+ "learning_rate": 1.2000000000000002e-06,
+ "loss": 1.2708,
+ "step": 24
+ },
+ {
+ "epoch": 0.01182033096926714,
+ "grad_norm": 4.905868053436279,
+ "learning_rate": 1.25e-06,
+ "loss": 1.3268,
+ "step": 25
+ },
+ {
+ "epoch": 0.012293144208037825,
+ "grad_norm": 4.503395080566406,
+ "learning_rate": 1.3e-06,
+ "loss": 1.1799,
+ "step": 26
+ },
+ {
+ "epoch": 0.01276595744680851,
+ "grad_norm": 4.77382230758667,
+ "learning_rate": 1.3500000000000002e-06,
+ "loss": 1.3882,
+ "step": 27
+ },
+ {
+ "epoch": 0.013238770685579196,
+ "grad_norm": 4.734329700469971,
+ "learning_rate": 1.4000000000000001e-06,
+ "loss": 1.3476,
+ "step": 28
+ },
+ {
+ "epoch": 0.013711583924349883,
+ "grad_norm": 4.775066375732422,
+ "learning_rate": 1.45e-06,
+ "loss": 1.2429,
+ "step": 29
+ },
+ {
+ "epoch": 0.014184397163120567,
+ "grad_norm": 4.978334426879883,
+ "learning_rate": 1.5e-06,
+ "loss": 1.2119,
+ "step": 30
+ },
+ {
+ "epoch": 0.014657210401891253,
+ "grad_norm": 4.506785869598389,
+ "learning_rate": 1.5500000000000002e-06,
+ "loss": 1.3157,
+ "step": 31
+ },
+ {
+ "epoch": 0.015130023640661938,
+ "grad_norm": 4.007757186889648,
+ "learning_rate": 1.6000000000000001e-06,
+ "loss": 1.1451,
+ "step": 32
+ },
+ {
+ "epoch": 0.015602836879432624,
+ "grad_norm": 3.6621618270874023,
+ "learning_rate": 1.6500000000000003e-06,
+ "loss": 1.093,
+ "step": 33
+ },
+ {
+ "epoch": 0.01607565011820331,
+ "grad_norm": 3.8733766078948975,
+ "learning_rate": 1.7000000000000002e-06,
+ "loss": 1.2289,
+ "step": 34
+ },
+ {
+ "epoch": 0.016548463356973995,
+ "grad_norm": 4.3391900062561035,
+ "learning_rate": 1.75e-06,
+ "loss": 1.1453,
+ "step": 35
+ },
+ {
+ "epoch": 0.01702127659574468,
+ "grad_norm": 3.287623643875122,
+ "learning_rate": 1.8000000000000001e-06,
+ "loss": 1.0257,
+ "step": 36
+ },
+ {
+ "epoch": 0.017494089834515367,
+ "grad_norm": 3.591721773147583,
+ "learning_rate": 1.85e-06,
+ "loss": 0.9976,
+ "step": 37
+ },
+ {
+ "epoch": 0.017966903073286054,
+ "grad_norm": 4.028271675109863,
+ "learning_rate": 1.9000000000000002e-06,
+ "loss": 1.0773,
+ "step": 38
+ },
+ {
+ "epoch": 0.018439716312056736,
+ "grad_norm": 3.3543951511383057,
+ "learning_rate": 1.9500000000000004e-06,
+ "loss": 1.1677,
+ "step": 39
+ },
+ {
+ "epoch": 0.018912529550827423,
+ "grad_norm": 3.807624340057373,
+ "learning_rate": 2.0000000000000003e-06,
+ "loss": 1.1232,
+ "step": 40
+ },
+ {
+ "epoch": 0.01938534278959811,
+ "grad_norm": 4.242797374725342,
+ "learning_rate": 2.05e-06,
+ "loss": 1.1819,
+ "step": 41
+ },
+ {
+ "epoch": 0.019858156028368795,
+ "grad_norm": 3.4574992656707764,
+ "learning_rate": 2.1000000000000002e-06,
+ "loss": 0.9878,
+ "step": 42
+ },
+ {
+ "epoch": 0.02033096926713948,
+ "grad_norm": 3.906695604324341,
+ "learning_rate": 2.15e-06,
+ "loss": 1.0592,
+ "step": 43
+ },
+ {
+ "epoch": 0.020803782505910164,
+ "grad_norm": 3.7543163299560547,
+ "learning_rate": 2.2e-06,
+ "loss": 1.0309,
+ "step": 44
+ },
+ {
+ "epoch": 0.02127659574468085,
+ "grad_norm": 3.3777148723602295,
+ "learning_rate": 2.25e-06,
+ "loss": 1.0664,
+ "step": 45
+ },
+ {
+ "epoch": 0.021749408983451537,
+ "grad_norm": 3.6003634929656982,
+ "learning_rate": 2.3000000000000004e-06,
+ "loss": 1.0482,
+ "step": 46
+ },
+ {
+ "epoch": 0.022222222222222223,
+ "grad_norm": 3.3961377143859863,
+ "learning_rate": 2.35e-06,
+ "loss": 1.0252,
+ "step": 47
+ },
+ {
+ "epoch": 0.02269503546099291,
+ "grad_norm": 3.1601035594940186,
+ "learning_rate": 2.4000000000000003e-06,
+ "loss": 1.0435,
+ "step": 48
+ },
+ {
+ "epoch": 0.023167848699763592,
+ "grad_norm": 3.4192967414855957,
+ "learning_rate": 2.4500000000000003e-06,
+ "loss": 1.0935,
+ "step": 49
+ },
+ {
+ "epoch": 0.02364066193853428,
+ "grad_norm": 3.1225922107696533,
+ "learning_rate": 2.5e-06,
+ "loss": 0.8988,
+ "step": 50
+ },
+ {
+ "epoch": 0.024113475177304965,
+ "grad_norm": 3.1423380374908447,
+ "learning_rate": 2.55e-06,
+ "loss": 1.0159,
+ "step": 51
+ },
+ {
+ "epoch": 0.02458628841607565,
+ "grad_norm": 3.4782402515411377,
+ "learning_rate": 2.6e-06,
+ "loss": 1.0231,
+ "step": 52
+ },
+ {
+ "epoch": 0.025059101654846337,
+ "grad_norm": 3.8362693786621094,
+ "learning_rate": 2.6500000000000005e-06,
+ "loss": 1.0725,
+ "step": 53
+ },
+ {
+ "epoch": 0.02553191489361702,
+ "grad_norm": 3.033294916152954,
+ "learning_rate": 2.7000000000000004e-06,
+ "loss": 0.9377,
+ "step": 54
+ },
+ {
+ "epoch": 0.026004728132387706,
+ "grad_norm": 3.849741220474243,
+ "learning_rate": 2.7500000000000004e-06,
+ "loss": 1.0046,
+ "step": 55
+ },
+ {
+ "epoch": 0.026477541371158392,
+ "grad_norm": 3.141876220703125,
+ "learning_rate": 2.8000000000000003e-06,
+ "loss": 0.9226,
+ "step": 56
+ },
+ {
+ "epoch": 0.02695035460992908,
+ "grad_norm": 2.773594856262207,
+ "learning_rate": 2.85e-06,
+ "loss": 0.8662,
+ "step": 57
+ },
+ {
+ "epoch": 0.027423167848699765,
+ "grad_norm": 3.1460225582122803,
+ "learning_rate": 2.9e-06,
+ "loss": 0.9304,
+ "step": 58
+ },
+ {
+ "epoch": 0.027895981087470448,
+ "grad_norm": 3.293583631515503,
+ "learning_rate": 2.95e-06,
+ "loss": 1.0374,
+ "step": 59
+ },
+ {
+ "epoch": 0.028368794326241134,
+ "grad_norm": 3.8190863132476807,
+ "learning_rate": 3e-06,
+ "loss": 0.971,
+ "step": 60
+ },
+ {
+ "epoch": 0.02884160756501182,
+ "grad_norm": 3.4566776752471924,
+ "learning_rate": 3.05e-06,
+ "loss": 0.9631,
+ "step": 61
+ },
+ {
+ "epoch": 0.029314420803782507,
+ "grad_norm": 3.355741500854492,
+ "learning_rate": 3.1000000000000004e-06,
+ "loss": 1.0097,
+ "step": 62
+ },
+ {
+ "epoch": 0.029787234042553193,
+ "grad_norm": 3.29746675491333,
+ "learning_rate": 3.1500000000000003e-06,
+ "loss": 0.9459,
+ "step": 63
+ },
+ {
+ "epoch": 0.030260047281323876,
+ "grad_norm": 3.3122968673706055,
+ "learning_rate": 3.2000000000000003e-06,
+ "loss": 0.8594,
+ "step": 64
+ },
+ {
+ "epoch": 0.030732860520094562,
+ "grad_norm": 3.477701187133789,
+ "learning_rate": 3.2500000000000002e-06,
+ "loss": 0.9197,
+ "step": 65
+ },
+ {
+ "epoch": 0.031205673758865248,
+ "grad_norm": 3.3363406658172607,
+ "learning_rate": 3.3000000000000006e-06,
+ "loss": 0.9478,
+ "step": 66
+ },
+ {
+ "epoch": 0.03167848699763593,
+ "grad_norm": 4.143295764923096,
+ "learning_rate": 3.3500000000000005e-06,
+ "loss": 1.0534,
+ "step": 67
+ },
+ {
+ "epoch": 0.03215130023640662,
+ "grad_norm": 3.2363274097442627,
+ "learning_rate": 3.4000000000000005e-06,
+ "loss": 0.9454,
+ "step": 68
+ },
+ {
+ "epoch": 0.032624113475177303,
+ "grad_norm": 3.198746681213379,
+ "learning_rate": 3.45e-06,
+ "loss": 0.9388,
+ "step": 69
+ },
+ {
+ "epoch": 0.03309692671394799,
+ "grad_norm": 3.5751023292541504,
+ "learning_rate": 3.5e-06,
+ "loss": 0.9444,
+ "step": 70
+ },
+ {
+ "epoch": 0.033569739952718676,
+ "grad_norm": 3.1745729446411133,
+ "learning_rate": 3.5500000000000003e-06,
+ "loss": 0.8683,
+ "step": 71
+ },
+ {
+ "epoch": 0.03404255319148936,
+ "grad_norm": 3.3210883140563965,
+ "learning_rate": 3.6000000000000003e-06,
+ "loss": 0.8811,
+ "step": 72
+ },
+ {
+ "epoch": 0.03451536643026005,
+ "grad_norm": 3.2502429485321045,
+ "learning_rate": 3.65e-06,
+ "loss": 1.0012,
+ "step": 73
+ },
+ {
+ "epoch": 0.034988179669030735,
+ "grad_norm": 3.44598126411438,
+ "learning_rate": 3.7e-06,
+ "loss": 0.9217,
+ "step": 74
+ },
+ {
+ "epoch": 0.03546099290780142,
+ "grad_norm": 3.439117431640625,
+ "learning_rate": 3.7500000000000005e-06,
+ "loss": 0.8976,
+ "step": 75
+ },
+ {
+ "epoch": 0.03593380614657211,
+ "grad_norm": 3.523627758026123,
+ "learning_rate": 3.8000000000000005e-06,
+ "loss": 0.8996,
+ "step": 76
+ },
+ {
+ "epoch": 0.03640661938534279,
+ "grad_norm": 3.3716015815734863,
+ "learning_rate": 3.85e-06,
+ "loss": 0.9061,
+ "step": 77
+ },
+ {
+ "epoch": 0.03687943262411347,
+ "grad_norm": 3.33518385887146,
+ "learning_rate": 3.900000000000001e-06,
+ "loss": 0.9371,
+ "step": 78
+ },
+ {
+ "epoch": 0.03735224586288416,
+ "grad_norm": 3.833829879760742,
+ "learning_rate": 3.95e-06,
+ "loss": 0.9669,
+ "step": 79
+ },
+ {
+ "epoch": 0.037825059101654845,
+ "grad_norm": 3.260446786880493,
+ "learning_rate": 4.000000000000001e-06,
+ "loss": 0.9449,
+ "step": 80
+ },
+ {
+ "epoch": 0.03829787234042553,
+ "grad_norm": 3.532451629638672,
+ "learning_rate": 4.05e-06,
+ "loss": 0.897,
+ "step": 81
+ },
+ {
+ "epoch": 0.03877068557919622,
+ "grad_norm": 3.1156492233276367,
+ "learning_rate": 4.1e-06,
+ "loss": 0.8463,
+ "step": 82
+ },
+ {
+ "epoch": 0.039243498817966904,
+ "grad_norm": 2.8801751136779785,
+ "learning_rate": 4.15e-06,
+ "loss": 0.8616,
+ "step": 83
+ },
+ {
+ "epoch": 0.03971631205673759,
+ "grad_norm": 3.072476863861084,
+ "learning_rate": 4.2000000000000004e-06,
+ "loss": 0.8387,
+ "step": 84
+ },
+ {
+ "epoch": 0.04018912529550828,
+ "grad_norm": 2.9601376056671143,
+ "learning_rate": 4.25e-06,
+ "loss": 0.8538,
+ "step": 85
+ },
+ {
+ "epoch": 0.04066193853427896,
+ "grad_norm": 3.521664619445801,
+ "learning_rate": 4.3e-06,
+ "loss": 0.8894,
+ "step": 86
+ },
+ {
+ "epoch": 0.04113475177304964,
+ "grad_norm": 3.2670981884002686,
+ "learning_rate": 4.350000000000001e-06,
+ "loss": 0.8387,
+ "step": 87
+ },
+ {
+ "epoch": 0.04160756501182033,
+ "grad_norm": 3.422089099884033,
+ "learning_rate": 4.4e-06,
+ "loss": 0.7728,
+ "step": 88
+ },
+ {
+ "epoch": 0.042080378250591015,
+ "grad_norm": 3.414034128189087,
+ "learning_rate": 4.450000000000001e-06,
+ "loss": 0.7968,
+ "step": 89
+ },
+ {
+ "epoch": 0.0425531914893617,
+ "grad_norm": 4.234285354614258,
+ "learning_rate": 4.5e-06,
+ "loss": 0.8502,
+ "step": 90
+ },
+ {
+ "epoch": 0.04302600472813239,
+ "grad_norm": 3.1446919441223145,
+ "learning_rate": 4.5500000000000005e-06,
+ "loss": 0.8236,
+ "step": 91
+ },
+ {
+ "epoch": 0.043498817966903074,
+ "grad_norm": 3.683443307876587,
+ "learning_rate": 4.600000000000001e-06,
+ "loss": 0.9792,
+ "step": 92
+ },
+ {
+ "epoch": 0.04397163120567376,
+ "grad_norm": 3.664219617843628,
+ "learning_rate": 4.65e-06,
+ "loss": 0.8743,
+ "step": 93
+ },
+ {
+ "epoch": 0.044444444444444446,
+ "grad_norm": 3.369479179382324,
+ "learning_rate": 4.7e-06,
+ "loss": 0.8741,
+ "step": 94
+ },
+ {
+ "epoch": 0.04491725768321513,
+ "grad_norm": 3.694949150085449,
+ "learning_rate": 4.75e-06,
+ "loss": 0.7574,
+ "step": 95
+ },
+ {
+ "epoch": 0.04539007092198582,
+ "grad_norm": 3.5144498348236084,
+ "learning_rate": 4.800000000000001e-06,
+ "loss": 0.9934,
+ "step": 96
+ },
+ {
+ "epoch": 0.0458628841607565,
+ "grad_norm": 3.164451837539673,
+ "learning_rate": 4.85e-06,
+ "loss": 0.7463,
+ "step": 97
+ },
+ {
+ "epoch": 0.046335697399527184,
+ "grad_norm": 3.222785472869873,
+ "learning_rate": 4.9000000000000005e-06,
+ "loss": 0.7698,
+ "step": 98
+ },
+ {
+ "epoch": 0.04680851063829787,
+ "grad_norm": 2.9129555225372314,
+ "learning_rate": 4.95e-06,
+ "loss": 0.7856,
+ "step": 99
+ },
+ {
+ "epoch": 0.04728132387706856,
+ "grad_norm": 3.5061235427856445,
+ "learning_rate": 5e-06,
+ "loss": 0.8588,
+ "step": 100
+ },
+ {
+ "epoch": 0.04775413711583924,
+ "grad_norm": 3.2805044651031494,
+ "learning_rate": 4.999999922167982e-06,
+ "loss": 0.7643,
+ "step": 101
+ },
+ {
+ "epoch": 0.04822695035460993,
+ "grad_norm": 3.5461678504943848,
+ "learning_rate": 4.999999688671929e-06,
+ "loss": 0.8253,
+ "step": 102
+ },
+ {
+ "epoch": 0.048699763593380616,
+ "grad_norm": 3.2238264083862305,
+ "learning_rate": 4.99999929951186e-06,
+ "loss": 0.7622,
+ "step": 103
+ },
+ {
+ "epoch": 0.0491725768321513,
+ "grad_norm": 3.818955898284912,
+ "learning_rate": 4.999998754687795e-06,
+ "loss": 0.8471,
+ "step": 104
+ },
+ {
+ "epoch": 0.04964539007092199,
+ "grad_norm": 3.1252424716949463,
+ "learning_rate": 4.99999805419977e-06,
+ "loss": 0.8409,
+ "step": 105
+ },
+ {
+ "epoch": 0.050118203309692674,
+ "grad_norm": 3.604283571243286,
+ "learning_rate": 4.999997198047828e-06,
+ "loss": 0.9027,
+ "step": 106
+ },
+ {
+ "epoch": 0.050591016548463354,
+ "grad_norm": 3.6752424240112305,
+ "learning_rate": 4.999996186232023e-06,
+ "loss": 0.9336,
+ "step": 107
+ },
+ {
+ "epoch": 0.05106382978723404,
+ "grad_norm": 3.517557144165039,
+ "learning_rate": 4.9999950187524184e-06,
+ "loss": 0.8351,
+ "step": 108
+ },
+ {
+ "epoch": 0.051536643026004726,
+ "grad_norm": 3.427285671234131,
+ "learning_rate": 4.999993695609085e-06,
+ "loss": 0.8457,
+ "step": 109
+ },
+ {
+ "epoch": 0.05200945626477541,
+ "grad_norm": 3.2792510986328125,
+ "learning_rate": 4.999992216802107e-06,
+ "loss": 0.8391,
+ "step": 110
+ },
+ {
+ "epoch": 0.0524822695035461,
+ "grad_norm": 3.581094741821289,
+ "learning_rate": 4.999990582331576e-06,
+ "loss": 0.7533,
+ "step": 111
+ },
+ {
+ "epoch": 0.052955082742316785,
+ "grad_norm": 3.1667377948760986,
+ "learning_rate": 4.999988792197593e-06,
+ "loss": 0.9562,
+ "step": 112
+ },
+ {
+ "epoch": 0.05342789598108747,
+ "grad_norm": 3.3609890937805176,
+ "learning_rate": 4.99998684640027e-06,
+ "loss": 0.8181,
+ "step": 113
+ },
+ {
+ "epoch": 0.05390070921985816,
+ "grad_norm": 3.260627269744873,
+ "learning_rate": 4.999984744939729e-06,
+ "loss": 0.8012,
+ "step": 114
+ },
+ {
+ "epoch": 0.054373522458628844,
+ "grad_norm": 3.4535653591156006,
+ "learning_rate": 4.9999824878160985e-06,
+ "loss": 0.919,
+ "step": 115
+ },
+ {
+ "epoch": 0.05484633569739953,
+ "grad_norm": 3.4880740642547607,
+ "learning_rate": 4.999980075029522e-06,
+ "loss": 0.8114,
+ "step": 116
+ },
+ {
+ "epoch": 0.05531914893617021,
+ "grad_norm": 3.2546932697296143,
+ "learning_rate": 4.999977506580147e-06,
+ "loss": 0.8274,
+ "step": 117
+ },
+ {
+ "epoch": 0.055791962174940896,
+ "grad_norm": 3.2762744426727295,
+ "learning_rate": 4.999974782468136e-06,
+ "loss": 0.9018,
+ "step": 118
+ },
+ {
+ "epoch": 0.05626477541371158,
+ "grad_norm": 3.42825984954834,
+ "learning_rate": 4.999971902693657e-06,
+ "loss": 0.8262,
+ "step": 119
+ },
+ {
+ "epoch": 0.05673758865248227,
+ "grad_norm": 3.082496404647827,
+ "learning_rate": 4.99996886725689e-06,
+ "loss": 0.8181,
+ "step": 120
+ },
+ {
+ "epoch": 0.057210401891252954,
+ "grad_norm": 3.322869300842285,
+ "learning_rate": 4.9999656761580225e-06,
+ "loss": 0.8382,
+ "step": 121
+ },
+ {
+ "epoch": 0.05768321513002364,
+ "grad_norm": 3.6365339756011963,
+ "learning_rate": 4.9999623293972555e-06,
+ "loss": 0.7489,
+ "step": 122
+ },
+ {
+ "epoch": 0.05815602836879433,
+ "grad_norm": 3.376352548599243,
+ "learning_rate": 4.999958826974796e-06,
+ "loss": 0.9012,
+ "step": 123
+ },
+ {
+ "epoch": 0.05862884160756501,
+ "grad_norm": 3.49088716506958,
+ "learning_rate": 4.999955168890862e-06,
+ "loss": 0.8999,
+ "step": 124
+ },
+ {
+ "epoch": 0.0591016548463357,
+ "grad_norm": 3.3265068531036377,
+ "learning_rate": 4.999951355145682e-06,
+ "loss": 0.8161,
+ "step": 125
+ },
+ {
+ "epoch": 0.059574468085106386,
+ "grad_norm": 3.697282314300537,
+ "learning_rate": 4.999947385739493e-06,
+ "loss": 0.9623,
+ "step": 126
+ },
+ {
+ "epoch": 0.06004728132387707,
+ "grad_norm": 2.7901928424835205,
+ "learning_rate": 4.999943260672542e-06,
+ "loss": 0.7371,
+ "step": 127
+ },
+ {
+ "epoch": 0.06052009456264775,
+ "grad_norm": 3.110319137573242,
+ "learning_rate": 4.999938979945086e-06,
+ "loss": 0.715,
+ "step": 128
+ },
+ {
+ "epoch": 0.06099290780141844,
+ "grad_norm": 3.2211520671844482,
+ "learning_rate": 4.999934543557392e-06,
+ "loss": 0.8888,
+ "step": 129
+ },
+ {
+ "epoch": 0.061465721040189124,
+ "grad_norm": 3.2466187477111816,
+ "learning_rate": 4.999929951509735e-06,
+ "loss": 0.9389,
+ "step": 130
+ },
+ {
+ "epoch": 0.06193853427895981,
+ "grad_norm": 3.3574399948120117,
+ "learning_rate": 4.999925203802403e-06,
+ "loss": 0.8263,
+ "step": 131
+ },
+ {
+ "epoch": 0.062411347517730496,
+ "grad_norm": 3.275601625442505,
+ "learning_rate": 4.99992030043569e-06,
+ "loss": 0.8338,
+ "step": 132
+ },
+ {
+ "epoch": 0.06288416075650118,
+ "grad_norm": 3.6011312007904053,
+ "learning_rate": 4.999915241409902e-06,
+ "loss": 0.8351,
+ "step": 133
+ },
+ {
+ "epoch": 0.06335697399527186,
+ "grad_norm": 2.969011068344116,
+ "learning_rate": 4.999910026725352e-06,
+ "loss": 0.79,
+ "step": 134
+ },
+ {
+ "epoch": 0.06382978723404255,
+ "grad_norm": 3.690784454345703,
+ "learning_rate": 4.999904656382369e-06,
+ "loss": 0.8209,
+ "step": 135
+ },
+ {
+ "epoch": 0.06430260047281323,
+ "grad_norm": 3.3363115787506104,
+ "learning_rate": 4.999899130381283e-06,
+ "loss": 0.858,
+ "step": 136
+ },
+ {
+ "epoch": 0.06477541371158392,
+ "grad_norm": 3.206881523132324,
+ "learning_rate": 4.9998934487224405e-06,
+ "loss": 0.834,
+ "step": 137
+ },
+ {
+ "epoch": 0.06524822695035461,
+ "grad_norm": 2.773146152496338,
+ "learning_rate": 4.999887611406195e-06,
+ "loss": 0.7576,
+ "step": 138
+ },
+ {
+ "epoch": 0.0657210401891253,
+ "grad_norm": 3.307725667953491,
+ "learning_rate": 4.999881618432908e-06,
+ "loss": 0.7487,
+ "step": 139
+ },
+ {
+ "epoch": 0.06619385342789598,
+ "grad_norm": 4.273657321929932,
+ "learning_rate": 4.999875469802956e-06,
+ "loss": 0.8176,
+ "step": 140
+ },
+ {
+ "epoch": 0.06666666666666667,
+ "grad_norm": 3.0898005962371826,
+ "learning_rate": 4.999869165516719e-06,
+ "loss": 0.7578,
+ "step": 141
+ },
+ {
+ "epoch": 0.06713947990543735,
+ "grad_norm": 3.25150990486145,
+ "learning_rate": 4.9998627055745915e-06,
+ "loss": 0.7873,
+ "step": 142
+ },
+ {
+ "epoch": 0.06761229314420804,
+ "grad_norm": 2.9705755710601807,
+ "learning_rate": 4.999856089976974e-06,
+ "loss": 0.6473,
+ "step": 143
+ },
+ {
+ "epoch": 0.06808510638297872,
+ "grad_norm": 3.5658507347106934,
+ "learning_rate": 4.9998493187242804e-06,
+ "loss": 0.855,
+ "step": 144
+ },
+ {
+ "epoch": 0.06855791962174941,
+ "grad_norm": 3.3994076251983643,
+ "learning_rate": 4.99984239181693e-06,
+ "loss": 0.7926,
+ "step": 145
+ },
+ {
+ "epoch": 0.0690307328605201,
+ "grad_norm": 2.8266260623931885,
+ "learning_rate": 4.999835309255357e-06,
+ "loss": 0.7564,
+ "step": 146
+ },
+ {
+ "epoch": 0.06950354609929078,
+ "grad_norm": 3.1143875122070312,
+ "learning_rate": 4.999828071039999e-06,
+ "loss": 0.8398,
+ "step": 147
+ },
+ {
+ "epoch": 0.06997635933806147,
+ "grad_norm": 2.9364278316497803,
+ "learning_rate": 4.99982067717131e-06,
+ "loss": 0.7381,
+ "step": 148
+ },
+ {
+ "epoch": 0.07044917257683216,
+ "grad_norm": 3.4155616760253906,
+ "learning_rate": 4.999813127649748e-06,
+ "loss": 0.7933,
+ "step": 149
+ },
+ {
+ "epoch": 0.07092198581560284,
+ "grad_norm": 4.371236324310303,
+ "learning_rate": 4.999805422475784e-06,
+ "loss": 0.8292,
+ "step": 150
+ },
+ {
+ "epoch": 0.07139479905437353,
+ "grad_norm": 3.3967185020446777,
+ "learning_rate": 4.999797561649897e-06,
+ "loss": 0.8712,
+ "step": 151
+ },
+ {
+ "epoch": 0.07186761229314421,
+ "grad_norm": 3.343303680419922,
+ "learning_rate": 4.999789545172578e-06,
+ "loss": 0.8177,
+ "step": 152
+ },
+ {
+ "epoch": 0.07234042553191489,
+ "grad_norm": 3.040235757827759,
+ "learning_rate": 4.999781373044325e-06,
+ "loss": 0.7379,
+ "step": 153
+ },
+ {
+ "epoch": 0.07281323877068557,
+ "grad_norm": 3.4069204330444336,
+ "learning_rate": 4.999773045265647e-06,
+ "loss": 0.7939,
+ "step": 154
+ },
+ {
+ "epoch": 0.07328605200945626,
+ "grad_norm": 3.1939475536346436,
+ "learning_rate": 4.999764561837063e-06,
+ "loss": 0.8037,
+ "step": 155
+ },
+ {
+ "epoch": 0.07375886524822695,
+ "grad_norm": 4.452004909515381,
+ "learning_rate": 4.999755922759101e-06,
+ "loss": 0.8421,
+ "step": 156
+ },
+ {
+ "epoch": 0.07423167848699763,
+ "grad_norm": 3.2031240463256836,
+ "learning_rate": 4.999747128032298e-06,
+ "loss": 0.794,
+ "step": 157
+ },
+ {
+ "epoch": 0.07470449172576832,
+ "grad_norm": 3.175920009613037,
+ "learning_rate": 4.999738177657203e-06,
+ "loss": 0.759,
+ "step": 158
+ },
+ {
+ "epoch": 0.075177304964539,
+ "grad_norm": 3.7679688930511475,
+ "learning_rate": 4.9997290716343725e-06,
+ "loss": 0.8174,
+ "step": 159
+ },
+ {
+ "epoch": 0.07565011820330969,
+ "grad_norm": 3.7020037174224854,
+ "learning_rate": 4.999719809964373e-06,
+ "loss": 0.7116,
+ "step": 160
+ },
+ {
+ "epoch": 0.07612293144208038,
+ "grad_norm": 4.357471942901611,
+ "learning_rate": 4.999710392647783e-06,
+ "loss": 0.7649,
+ "step": 161
+ },
+ {
+ "epoch": 0.07659574468085106,
+ "grad_norm": 3.3439087867736816,
+ "learning_rate": 4.999700819685187e-06,
+ "loss": 0.7907,
+ "step": 162
+ },
+ {
+ "epoch": 0.07706855791962175,
+ "grad_norm": 3.210815191268921,
+ "learning_rate": 4.999691091077182e-06,
+ "loss": 0.8446,
+ "step": 163
+ },
+ {
+ "epoch": 0.07754137115839244,
+ "grad_norm": 3.1029553413391113,
+ "learning_rate": 4.9996812068243735e-06,
+ "loss": 0.7232,
+ "step": 164
+ },
+ {
+ "epoch": 0.07801418439716312,
+ "grad_norm": 2.9389400482177734,
+ "learning_rate": 4.999671166927378e-06,
+ "loss": 0.7413,
+ "step": 165
+ },
+ {
+ "epoch": 0.07848699763593381,
+ "grad_norm": 3.7062697410583496,
+ "learning_rate": 4.9996609713868185e-06,
+ "loss": 0.8773,
+ "step": 166
+ },
+ {
+ "epoch": 0.0789598108747045,
+ "grad_norm": 3.2768924236297607,
+ "learning_rate": 4.999650620203332e-06,
+ "loss": 0.8046,
+ "step": 167
+ },
+ {
+ "epoch": 0.07943262411347518,
+ "grad_norm": 3.380373001098633,
+ "learning_rate": 4.999640113377561e-06,
+ "loss": 0.7529,
+ "step": 168
+ },
+ {
+ "epoch": 0.07990543735224587,
+ "grad_norm": 3.520022392272949,
+ "learning_rate": 4.999629450910162e-06,
+ "loss": 0.7352,
+ "step": 169
+ },
+ {
+ "epoch": 0.08037825059101655,
+ "grad_norm": 3.43269419670105,
+ "learning_rate": 4.999618632801796e-06,
+ "loss": 0.9371,
+ "step": 170
+ },
+ {
+ "epoch": 0.08085106382978724,
+ "grad_norm": 3.555877923965454,
+ "learning_rate": 4.99960765905314e-06,
+ "loss": 0.8276,
+ "step": 171
+ },
+ {
+ "epoch": 0.08132387706855793,
+ "grad_norm": 3.597050189971924,
+ "learning_rate": 4.999596529664874e-06,
+ "loss": 0.8164,
+ "step": 172
+ },
+ {
+ "epoch": 0.0817966903073286,
+ "grad_norm": 3.2002956867218018,
+ "learning_rate": 4.999585244637693e-06,
+ "loss": 0.7824,
+ "step": 173
+ },
+ {
+ "epoch": 0.08226950354609928,
+ "grad_norm": 3.527275562286377,
+ "learning_rate": 4.999573803972299e-06,
+ "loss": 0.8033,
+ "step": 174
+ },
+ {
+ "epoch": 0.08274231678486997,
+ "grad_norm": 3.5184452533721924,
+ "learning_rate": 4.999562207669405e-06,
+ "loss": 0.724,
+ "step": 175
+ },
+ {
+ "epoch": 0.08321513002364066,
+ "grad_norm": 3.6635067462921143,
+ "learning_rate": 4.999550455729732e-06,
+ "loss": 0.819,
+ "step": 176
+ },
+ {
+ "epoch": 0.08368794326241134,
+ "grad_norm": 3.192399740219116,
+ "learning_rate": 4.999538548154012e-06,
+ "loss": 0.7999,
+ "step": 177
+ },
+ {
+ "epoch": 0.08416075650118203,
+ "grad_norm": 3.0946953296661377,
+ "learning_rate": 4.999526484942988e-06,
+ "loss": 0.7367,
+ "step": 178
+ },
+ {
+ "epoch": 0.08463356973995272,
+ "grad_norm": 2.847198009490967,
+ "learning_rate": 4.99951426609741e-06,
+ "loss": 0.7536,
+ "step": 179
+ },
+ {
+ "epoch": 0.0851063829787234,
+ "grad_norm": 2.7674827575683594,
+ "learning_rate": 4.999501891618037e-06,
+ "loss": 0.701,
+ "step": 180
+ },
+ {
+ "epoch": 0.08557919621749409,
+ "grad_norm": 3.357933521270752,
+ "learning_rate": 4.999489361505643e-06,
+ "loss": 0.8331,
+ "step": 181
+ },
+ {
+ "epoch": 0.08605200945626477,
+ "grad_norm": 3.1464426517486572,
+ "learning_rate": 4.999476675761004e-06,
+ "loss": 0.7931,
+ "step": 182
+ },
+ {
+ "epoch": 0.08652482269503546,
+ "grad_norm": 3.310697078704834,
+ "learning_rate": 4.999463834384915e-06,
+ "loss": 0.753,
+ "step": 183
+ },
+ {
+ "epoch": 0.08699763593380615,
+ "grad_norm": 2.9794881343841553,
+ "learning_rate": 4.999450837378171e-06,
+ "loss": 0.7091,
+ "step": 184
+ },
+ {
+ "epoch": 0.08747044917257683,
+ "grad_norm": 3.0776889324188232,
+ "learning_rate": 4.999437684741584e-06,
+ "loss": 0.7226,
+ "step": 185
+ },
+ {
+ "epoch": 0.08794326241134752,
+ "grad_norm": 3.6657519340515137,
+ "learning_rate": 4.999424376475972e-06,
+ "loss": 0.845,
+ "step": 186
+ },
+ {
+ "epoch": 0.0884160756501182,
+ "grad_norm": 3.872718572616577,
+ "learning_rate": 4.999410912582164e-06,
+ "loss": 0.812,
+ "step": 187
+ },
+ {
+ "epoch": 0.08888888888888889,
+ "grad_norm": 2.9184508323669434,
+ "learning_rate": 4.9993972930609976e-06,
+ "loss": 0.6823,
+ "step": 188
+ },
+ {
+ "epoch": 0.08936170212765958,
+ "grad_norm": 3.5567142963409424,
+ "learning_rate": 4.999383517913321e-06,
+ "loss": 0.7614,
+ "step": 189
+ },
+ {
+ "epoch": 0.08983451536643026,
+ "grad_norm": 3.3688533306121826,
+ "learning_rate": 4.999369587139992e-06,
+ "loss": 0.858,
+ "step": 190
+ },
+ {
+ "epoch": 0.09030732860520095,
+ "grad_norm": 2.893223524093628,
+ "learning_rate": 4.99935550074188e-06,
+ "loss": 0.6761,
+ "step": 191
+ },
+ {
+ "epoch": 0.09078014184397164,
+ "grad_norm": 3.400225877761841,
+ "learning_rate": 4.999341258719859e-06,
+ "loss": 0.7531,
+ "step": 192
+ },
+ {
+ "epoch": 0.09125295508274232,
+ "grad_norm": 3.6167714595794678,
+ "learning_rate": 4.999326861074817e-06,
+ "loss": 0.8164,
+ "step": 193
+ },
+ {
+ "epoch": 0.091725768321513,
+ "grad_norm": 4.325016498565674,
+ "learning_rate": 4.9993123078076506e-06,
+ "loss": 0.7069,
+ "step": 194
+ },
+ {
+ "epoch": 0.09219858156028368,
+ "grad_norm": 3.195317029953003,
+ "learning_rate": 4.999297598919266e-06,
+ "loss": 0.726,
+ "step": 195
+ },
+ {
+ "epoch": 0.09267139479905437,
+ "grad_norm": 3.146530866622925,
+ "learning_rate": 4.999282734410579e-06,
+ "loss": 0.7888,
+ "step": 196
+ },
+ {
+ "epoch": 0.09314420803782505,
+ "grad_norm": 3.5166752338409424,
+ "learning_rate": 4.999267714282515e-06,
+ "loss": 0.8473,
+ "step": 197
+ },
+ {
+ "epoch": 0.09361702127659574,
+ "grad_norm": 3.3140196800231934,
+ "learning_rate": 4.99925253853601e-06,
+ "loss": 0.7233,
+ "step": 198
+ },
+ {
+ "epoch": 0.09408983451536643,
+ "grad_norm": 3.0318164825439453,
+ "learning_rate": 4.999237207172008e-06,
+ "loss": 0.7543,
+ "step": 199
+ },
+ {
+ "epoch": 0.09456264775413711,
+ "grad_norm": 3.662214756011963,
+ "learning_rate": 4.999221720191464e-06,
+ "loss": 0.7783,
+ "step": 200
+ },
+ {
+ "epoch": 0.0950354609929078,
+ "grad_norm": 3.452078104019165,
+ "learning_rate": 4.9992060775953425e-06,
+ "loss": 0.7868,
+ "step": 201
+ },
+ {
+ "epoch": 0.09550827423167849,
+ "grad_norm": 3.4051287174224854,
+ "learning_rate": 4.999190279384617e-06,
+ "loss": 0.7849,
+ "step": 202
+ },
+ {
+ "epoch": 0.09598108747044917,
+ "grad_norm": 3.1377196311950684,
+ "learning_rate": 4.999174325560271e-06,
+ "loss": 0.8364,
+ "step": 203
+ },
+ {
+ "epoch": 0.09645390070921986,
+ "grad_norm": 3.129473924636841,
+ "learning_rate": 4.999158216123299e-06,
+ "loss": 0.7458,
+ "step": 204
+ },
+ {
+ "epoch": 0.09692671394799054,
+ "grad_norm": 3.169548749923706,
+ "learning_rate": 4.999141951074703e-06,
+ "loss": 0.7256,
+ "step": 205
+ },
+ {
+ "epoch": 0.09739952718676123,
+ "grad_norm": 3.186009168624878,
+ "learning_rate": 4.999125530415495e-06,
+ "loss": 0.783,
+ "step": 206
+ },
+ {
+ "epoch": 0.09787234042553192,
+ "grad_norm": 3.0995123386383057,
+ "learning_rate": 4.9991089541467e-06,
+ "loss": 0.7519,
+ "step": 207
+ },
+ {
+ "epoch": 0.0983451536643026,
+ "grad_norm": 3.1854088306427,
+ "learning_rate": 4.999092222269348e-06,
+ "loss": 0.7444,
+ "step": 208
+ },
+ {
+ "epoch": 0.09881796690307329,
+ "grad_norm": 3.1512246131896973,
+ "learning_rate": 4.999075334784482e-06,
+ "loss": 0.7882,
+ "step": 209
+ },
+ {
+ "epoch": 0.09929078014184398,
+ "grad_norm": 3.6199698448181152,
+ "learning_rate": 4.999058291693153e-06,
+ "loss": 0.8048,
+ "step": 210
+ },
+ {
+ "epoch": 0.09976359338061466,
+ "grad_norm": 2.956907272338867,
+ "learning_rate": 4.999041092996422e-06,
+ "loss": 0.7663,
+ "step": 211
+ },
+ {
+ "epoch": 0.10023640661938535,
+ "grad_norm": 3.3493971824645996,
+ "learning_rate": 4.99902373869536e-06,
+ "loss": 0.7639,
+ "step": 212
+ },
+ {
+ "epoch": 0.10070921985815603,
+ "grad_norm": 3.144812822341919,
+ "learning_rate": 4.9990062287910475e-06,
+ "loss": 0.7953,
+ "step": 213
+ },
+ {
+ "epoch": 0.10118203309692671,
+ "grad_norm": 3.5986971855163574,
+ "learning_rate": 4.998988563284576e-06,
+ "loss": 0.8297,
+ "step": 214
+ },
+ {
+ "epoch": 0.1016548463356974,
+ "grad_norm": 3.447584867477417,
+ "learning_rate": 4.998970742177044e-06,
+ "loss": 0.808,
+ "step": 215
+ },
+ {
+ "epoch": 0.10212765957446808,
+ "grad_norm": 3.791353940963745,
+ "learning_rate": 4.998952765469562e-06,
+ "loss": 0.8005,
+ "step": 216
+ },
+ {
+ "epoch": 0.10260047281323877,
+ "grad_norm": 3.4490807056427,
+ "learning_rate": 4.998934633163247e-06,
+ "loss": 0.8135,
+ "step": 217
+ },
+ {
+ "epoch": 0.10307328605200945,
+ "grad_norm": 3.1053314208984375,
+ "learning_rate": 4.998916345259232e-06,
+ "loss": 0.7888,
+ "step": 218
+ },
+ {
+ "epoch": 0.10354609929078014,
+ "grad_norm": 3.407862663269043,
+ "learning_rate": 4.9988979017586514e-06,
+ "loss": 0.7099,
+ "step": 219
+ },
+ {
+ "epoch": 0.10401891252955082,
+ "grad_norm": 3.116656541824341,
+ "learning_rate": 4.998879302662658e-06,
+ "loss": 0.8344,
+ "step": 220
+ },
+ {
+ "epoch": 0.10449172576832151,
+ "grad_norm": 3.339264154434204,
+ "learning_rate": 4.998860547972406e-06,
+ "loss": 0.8496,
+ "step": 221
+ },
+ {
+ "epoch": 0.1049645390070922,
+ "grad_norm": 3.251892566680908,
+ "learning_rate": 4.998841637689066e-06,
+ "loss": 0.7455,
+ "step": 222
+ },
+ {
+ "epoch": 0.10543735224586288,
+ "grad_norm": 4.098135471343994,
+ "learning_rate": 4.998822571813814e-06,
+ "loss": 0.7772,
+ "step": 223
+ },
+ {
+ "epoch": 0.10591016548463357,
+ "grad_norm": 3.9871134757995605,
+ "learning_rate": 4.998803350347837e-06,
+ "loss": 0.8261,
+ "step": 224
+ },
+ {
+ "epoch": 0.10638297872340426,
+ "grad_norm": 3.2822303771972656,
+ "learning_rate": 4.998783973292333e-06,
+ "loss": 0.8623,
+ "step": 225
+ },
+ {
+ "epoch": 0.10685579196217494,
+ "grad_norm": 3.0356857776641846,
+ "learning_rate": 4.998764440648507e-06,
+ "loss": 0.7426,
+ "step": 226
+ },
+ {
+ "epoch": 0.10732860520094563,
+ "grad_norm": 2.8932785987854004,
+ "learning_rate": 4.998744752417576e-06,
+ "loss": 0.6741,
+ "step": 227
+ },
+ {
+ "epoch": 0.10780141843971631,
+ "grad_norm": 3.085820436477661,
+ "learning_rate": 4.998724908600767e-06,
+ "loss": 0.6549,
+ "step": 228
+ },
+ {
+ "epoch": 0.108274231678487,
+ "grad_norm": 3.135829210281372,
+ "learning_rate": 4.998704909199314e-06,
+ "loss": 0.6702,
+ "step": 229
+ },
+ {
+ "epoch": 0.10874704491725769,
+ "grad_norm": 5.016134262084961,
+ "learning_rate": 4.9986847542144625e-06,
+ "loss": 0.7852,
+ "step": 230
+ },
+ {
+ "epoch": 0.10921985815602837,
+ "grad_norm": 3.9056200981140137,
+ "learning_rate": 4.998664443647468e-06,
+ "loss": 0.9654,
+ "step": 231
+ },
+ {
+ "epoch": 0.10969267139479906,
+ "grad_norm": 3.0880749225616455,
+ "learning_rate": 4.998643977499595e-06,
+ "loss": 0.7579,
+ "step": 232
+ },
+ {
+ "epoch": 0.11016548463356975,
+ "grad_norm": 3.6893601417541504,
+ "learning_rate": 4.998623355772118e-06,
+ "loss": 0.713,
+ "step": 233
+ },
+ {
+ "epoch": 0.11063829787234042,
+ "grad_norm": 4.181536674499512,
+ "learning_rate": 4.998602578466319e-06,
+ "loss": 0.7331,
+ "step": 234
+ },
+ {
+ "epoch": 0.1111111111111111,
+ "grad_norm": 3.036386728286743,
+ "learning_rate": 4.998581645583496e-06,
+ "loss": 0.7115,
+ "step": 235
+ },
+ {
+ "epoch": 0.11158392434988179,
+ "grad_norm": 3.6333255767822266,
+ "learning_rate": 4.998560557124948e-06,
+ "loss": 0.7544,
+ "step": 236
+ },
+ {
+ "epoch": 0.11205673758865248,
+ "grad_norm": 2.926417827606201,
+ "learning_rate": 4.9985393130919915e-06,
+ "loss": 0.715,
+ "step": 237
+ },
+ {
+ "epoch": 0.11252955082742316,
+ "grad_norm": 2.969158172607422,
+ "learning_rate": 4.998517913485946e-06,
+ "loss": 0.7304,
+ "step": 238
+ },
+ {
+ "epoch": 0.11300236406619385,
+ "grad_norm": 3.5254971981048584,
+ "learning_rate": 4.9984963583081466e-06,
+ "loss": 0.7725,
+ "step": 239
+ },
+ {
+ "epoch": 0.11347517730496454,
+ "grad_norm": 3.7840335369110107,
+ "learning_rate": 4.998474647559936e-06,
+ "loss": 0.8685,
+ "step": 240
+ },
+ {
+ "epoch": 0.11394799054373522,
+ "grad_norm": 3.0333125591278076,
+ "learning_rate": 4.9984527812426625e-06,
+ "loss": 0.7793,
+ "step": 241
+ },
+ {
+ "epoch": 0.11442080378250591,
+ "grad_norm": 3.290159225463867,
+ "learning_rate": 4.99843075935769e-06,
+ "loss": 0.7158,
+ "step": 242
+ },
+ {
+ "epoch": 0.1148936170212766,
+ "grad_norm": 3.3935494422912598,
+ "learning_rate": 4.99840858190639e-06,
+ "loss": 0.7643,
+ "step": 243
+ },
+ {
+ "epoch": 0.11536643026004728,
+ "grad_norm": 3.333965539932251,
+ "learning_rate": 4.998386248890142e-06,
+ "loss": 0.7255,
+ "step": 244
+ },
+ {
+ "epoch": 0.11583924349881797,
+ "grad_norm": 2.8129613399505615,
+ "learning_rate": 4.998363760310339e-06,
+ "loss": 0.768,
+ "step": 245
+ },
+ {
+ "epoch": 0.11631205673758865,
+ "grad_norm": 2.8678107261657715,
+ "learning_rate": 4.998341116168378e-06,
+ "loss": 0.7403,
+ "step": 246
+ },
+ {
+ "epoch": 0.11678486997635934,
+ "grad_norm": 2.8898239135742188,
+ "learning_rate": 4.998318316465672e-06,
+ "loss": 0.6844,
+ "step": 247
+ },
+ {
+ "epoch": 0.11725768321513003,
+ "grad_norm": 3.139777898788452,
+ "learning_rate": 4.998295361203637e-06,
+ "loss": 0.7936,
+ "step": 248
+ },
+ {
+ "epoch": 0.11773049645390071,
+ "grad_norm": 3.393721103668213,
+ "learning_rate": 4.998272250383707e-06,
+ "loss": 0.8173,
+ "step": 249
+ },
+ {
+ "epoch": 0.1182033096926714,
+ "grad_norm": 3.240973949432373,
+ "learning_rate": 4.998248984007318e-06,
+ "loss": 0.8252,
+ "step": 250
+ },
+ {
+ "epoch": 0.11867612293144209,
+ "grad_norm": 3.384855031967163,
+ "learning_rate": 4.998225562075918e-06,
+ "loss": 0.7244,
+ "step": 251
+ },
+ {
+ "epoch": 0.11914893617021277,
+ "grad_norm": 3.1881816387176514,
+ "learning_rate": 4.9982019845909675e-06,
+ "loss": 0.6818,
+ "step": 252
+ },
+ {
+ "epoch": 0.11962174940898346,
+ "grad_norm": 2.888364553451538,
+ "learning_rate": 4.998178251553934e-06,
+ "loss": 0.6753,
+ "step": 253
+ },
+ {
+ "epoch": 0.12009456264775414,
+ "grad_norm": 3.630093812942505,
+ "learning_rate": 4.9981543629662944e-06,
+ "loss": 0.7995,
+ "step": 254
+ },
+ {
+ "epoch": 0.12056737588652482,
+ "grad_norm": 2.9820947647094727,
+ "learning_rate": 4.998130318829537e-06,
+ "loss": 0.7478,
+ "step": 255
+ },
+ {
+ "epoch": 0.1210401891252955,
+ "grad_norm": 2.7094738483428955,
+ "learning_rate": 4.998106119145159e-06,
+ "loss": 0.7237,
+ "step": 256
+ },
+ {
+ "epoch": 0.12151300236406619,
+ "grad_norm": 3.1808104515075684,
+ "learning_rate": 4.9980817639146665e-06,
+ "loss": 0.7915,
+ "step": 257
+ },
+ {
+ "epoch": 0.12198581560283688,
+ "grad_norm": 3.1661291122436523,
+ "learning_rate": 4.998057253139575e-06,
+ "loss": 0.8053,
+ "step": 258
+ },
+ {
+ "epoch": 0.12245862884160756,
+ "grad_norm": 3.528749942779541,
+ "learning_rate": 4.998032586821413e-06,
+ "loss": 0.7946,
+ "step": 259
+ },
+ {
+ "epoch": 0.12293144208037825,
+ "grad_norm": 3.125964879989624,
+ "learning_rate": 4.998007764961716e-06,
+ "loss": 0.7569,
+ "step": 260
+ },
+ {
+ "epoch": 0.12340425531914893,
+ "grad_norm": 3.0778942108154297,
+ "learning_rate": 4.997982787562029e-06,
+ "loss": 0.7184,
+ "step": 261
+ },
+ {
+ "epoch": 0.12387706855791962,
+ "grad_norm": 3.3531930446624756,
+ "learning_rate": 4.997957654623906e-06,
+ "loss": 0.7586,
+ "step": 262
+ },
+ {
+ "epoch": 0.1243498817966903,
+ "grad_norm": 3.229278564453125,
+ "learning_rate": 4.997932366148913e-06,
+ "loss": 0.6092,
+ "step": 263
+ },
+ {
+ "epoch": 0.12482269503546099,
+ "grad_norm": 3.7286155223846436,
+ "learning_rate": 4.997906922138626e-06,
+ "loss": 0.7965,
+ "step": 264
+ },
+ {
+ "epoch": 0.12529550827423167,
+ "grad_norm": 3.300311803817749,
+ "learning_rate": 4.997881322594628e-06,
+ "loss": 0.7665,
+ "step": 265
+ },
+ {
+ "epoch": 0.12576832151300235,
+ "grad_norm": 3.411482572555542,
+ "learning_rate": 4.9978555675185115e-06,
+ "loss": 0.7253,
+ "step": 266
+ },
+ {
+ "epoch": 0.12624113475177304,
+ "grad_norm": 3.0884511470794678,
+ "learning_rate": 4.9978296569118825e-06,
+ "loss": 0.659,
+ "step": 267
+ },
+ {
+ "epoch": 0.12671394799054372,
+ "grad_norm": 3.0652925968170166,
+ "learning_rate": 4.9978035907763535e-06,
+ "loss": 0.6739,
+ "step": 268
+ },
+ {
+ "epoch": 0.1271867612293144,
+ "grad_norm": 3.280555009841919,
+ "learning_rate": 4.997777369113547e-06,
+ "loss": 0.8003,
+ "step": 269
+ },
+ {
+ "epoch": 0.1276595744680851,
+ "grad_norm": 2.980860948562622,
+ "learning_rate": 4.997750991925096e-06,
+ "loss": 0.7097,
+ "step": 270
+ },
+ {
+ "epoch": 0.12813238770685578,
+ "grad_norm": 3.301760673522949,
+ "learning_rate": 4.997724459212644e-06,
+ "loss": 0.7894,
+ "step": 271
+ },
+ {
+ "epoch": 0.12860520094562647,
+ "grad_norm": 2.9584903717041016,
+ "learning_rate": 4.997697770977841e-06,
+ "loss": 0.733,
+ "step": 272
+ },
+ {
+ "epoch": 0.12907801418439716,
+ "grad_norm": 3.5632214546203613,
+ "learning_rate": 4.99767092722235e-06,
+ "loss": 0.7228,
+ "step": 273
+ },
+ {
+ "epoch": 0.12955082742316784,
+ "grad_norm": 3.5900983810424805,
+ "learning_rate": 4.997643927947843e-06,
+ "loss": 0.7634,
+ "step": 274
+ },
+ {
+ "epoch": 0.13002364066193853,
+ "grad_norm": 3.332650661468506,
+ "learning_rate": 4.997616773156e-06,
+ "loss": 0.797,
+ "step": 275
+ },
+ {
+ "epoch": 0.13049645390070921,
+ "grad_norm": 3.1094167232513428,
+ "learning_rate": 4.997589462848512e-06,
+ "loss": 0.7849,
+ "step": 276
+ },
+ {
+ "epoch": 0.1309692671394799,
+ "grad_norm": 3.5359463691711426,
+ "learning_rate": 4.99756199702708e-06,
+ "loss": 0.6871,
+ "step": 277
+ },
+ {
+ "epoch": 0.1314420803782506,
+ "grad_norm": 3.190441846847534,
+ "learning_rate": 4.997534375693414e-06,
+ "loss": 0.6883,
+ "step": 278
+ },
+ {
+ "epoch": 0.13191489361702127,
+ "grad_norm": 3.063518762588501,
+ "learning_rate": 4.997506598849234e-06,
+ "loss": 0.7586,
+ "step": 279
+ },
+ {
+ "epoch": 0.13238770685579196,
+ "grad_norm": 3.4112050533294678,
+ "learning_rate": 4.997478666496269e-06,
+ "loss": 0.796,
+ "step": 280
+ },
+ {
+ "epoch": 0.13286052009456265,
+ "grad_norm": 3.231886386871338,
+ "learning_rate": 4.997450578636259e-06,
+ "loss": 0.7714,
+ "step": 281
+ },
+ {
+ "epoch": 0.13333333333333333,
+ "grad_norm": 3.279425621032715,
+ "learning_rate": 4.9974223352709515e-06,
+ "loss": 0.7793,
+ "step": 282
+ },
+ {
+ "epoch": 0.13380614657210402,
+ "grad_norm": 3.2154316902160645,
+ "learning_rate": 4.9973939364021075e-06,
+ "loss": 0.791,
+ "step": 283
+ },
+ {
+ "epoch": 0.1342789598108747,
+ "grad_norm": 3.2090768814086914,
+ "learning_rate": 4.9973653820314925e-06,
+ "loss": 0.6433,
+ "step": 284
+ },
+ {
+ "epoch": 0.1347517730496454,
+ "grad_norm": 3.1712026596069336,
+ "learning_rate": 4.997336672160886e-06,
+ "loss": 0.8128,
+ "step": 285
+ },
+ {
+ "epoch": 0.13522458628841608,
+ "grad_norm": 2.929229497909546,
+ "learning_rate": 4.997307806792076e-06,
+ "loss": 0.7594,
+ "step": 286
+ },
+ {
+ "epoch": 0.13569739952718676,
+ "grad_norm": 3.0363314151763916,
+ "learning_rate": 4.997278785926859e-06,
+ "loss": 0.7336,
+ "step": 287
+ },
+ {
+ "epoch": 0.13617021276595745,
+ "grad_norm": 3.1352357864379883,
+ "learning_rate": 4.997249609567042e-06,
+ "loss": 0.7225,
+ "step": 288
+ },
+ {
+ "epoch": 0.13664302600472814,
+ "grad_norm": 3.3171157836914062,
+ "learning_rate": 4.997220277714442e-06,
+ "loss": 0.7777,
+ "step": 289
+ },
+ {
+ "epoch": 0.13711583924349882,
+ "grad_norm": 3.050717353820801,
+ "learning_rate": 4.997190790370885e-06,
+ "loss": 0.6836,
+ "step": 290
+ },
+ {
+ "epoch": 0.1375886524822695,
+ "grad_norm": 3.0297694206237793,
+ "learning_rate": 4.997161147538208e-06,
+ "loss": 0.6883,
+ "step": 291
+ },
+ {
+ "epoch": 0.1380614657210402,
+ "grad_norm": 3.0566554069519043,
+ "learning_rate": 4.997131349218256e-06,
+ "loss": 0.6674,
+ "step": 292
+ },
+ {
+ "epoch": 0.13853427895981088,
+ "grad_norm": 3.799111843109131,
+ "learning_rate": 4.997101395412885e-06,
+ "loss": 0.8256,
+ "step": 293
+ },
+ {
+ "epoch": 0.13900709219858157,
+ "grad_norm": 3.1394248008728027,
+ "learning_rate": 4.9970712861239576e-06,
+ "loss": 0.7306,
+ "step": 294
+ },
+ {
+ "epoch": 0.13947990543735225,
+ "grad_norm": 3.0605666637420654,
+ "learning_rate": 4.997041021353352e-06,
+ "loss": 0.7212,
+ "step": 295
+ },
+ {
+ "epoch": 0.13995271867612294,
+ "grad_norm": 3.8813397884368896,
+ "learning_rate": 4.997010601102951e-06,
+ "loss": 0.769,
+ "step": 296
+ },
+ {
+ "epoch": 0.14042553191489363,
+ "grad_norm": 3.0514819622039795,
+ "learning_rate": 4.996980025374649e-06,
+ "loss": 0.7422,
+ "step": 297
+ },
+ {
+ "epoch": 0.1408983451536643,
+ "grad_norm": 2.9544146060943604,
+ "learning_rate": 4.99694929417035e-06,
+ "loss": 0.6912,
+ "step": 298
+ },
+ {
+ "epoch": 0.141371158392435,
+ "grad_norm": 3.2635602951049805,
+ "learning_rate": 4.996918407491966e-06,
+ "loss": 0.7395,
+ "step": 299
+ },
+ {
+ "epoch": 0.14184397163120568,
+ "grad_norm": 3.373882532119751,
+ "learning_rate": 4.996887365341423e-06,
+ "loss": 0.7799,
+ "step": 300
+ },
+ {
+ "epoch": 0.14231678486997637,
+ "grad_norm": 3.001128673553467,
+ "learning_rate": 4.996856167720652e-06,
+ "loss": 0.7168,
+ "step": 301
+ },
+ {
+ "epoch": 0.14278959810874706,
+ "grad_norm": 3.1026835441589355,
+ "learning_rate": 4.996824814631595e-06,
+ "loss": 0.7492,
+ "step": 302
+ },
+ {
+ "epoch": 0.14326241134751774,
+ "grad_norm": 3.41947603225708,
+ "learning_rate": 4.996793306076205e-06,
+ "loss": 0.6659,
+ "step": 303
+ },
+ {
+ "epoch": 0.14373522458628843,
+ "grad_norm": 3.2272400856018066,
+ "learning_rate": 4.996761642056444e-06,
+ "loss": 0.7184,
+ "step": 304
+ },
+ {
+ "epoch": 0.14420803782505912,
+ "grad_norm": 2.9488935470581055,
+ "learning_rate": 4.996729822574284e-06,
+ "loss": 0.7451,
+ "step": 305
+ },
+ {
+ "epoch": 0.14468085106382977,
+ "grad_norm": 3.268231153488159,
+ "learning_rate": 4.9966978476317065e-06,
+ "loss": 0.7798,
+ "step": 306
+ },
+ {
+ "epoch": 0.14515366430260046,
+ "grad_norm": 3.9086556434631348,
+ "learning_rate": 4.996665717230701e-06,
+ "loss": 0.7871,
+ "step": 307
+ },
+ {
+ "epoch": 0.14562647754137115,
+ "grad_norm": 3.3483879566192627,
+ "learning_rate": 4.996633431373269e-06,
+ "loss": 0.7415,
+ "step": 308
+ },
+ {
+ "epoch": 0.14609929078014183,
+ "grad_norm": 2.839400053024292,
+ "learning_rate": 4.99660099006142e-06,
+ "loss": 0.7192,
+ "step": 309
+ },
+ {
+ "epoch": 0.14657210401891252,
+ "grad_norm": 3.177302598953247,
+ "learning_rate": 4.996568393297175e-06,
+ "loss": 0.755,
+ "step": 310
+ },
+ {
+ "epoch": 0.1470449172576832,
+ "grad_norm": 3.5477044582366943,
+ "learning_rate": 4.996535641082563e-06,
+ "loss": 0.7531,
+ "step": 311
+ },
+ {
+ "epoch": 0.1475177304964539,
+ "grad_norm": 3.418576717376709,
+ "learning_rate": 4.996502733419624e-06,
+ "loss": 0.8009,
+ "step": 312
+ },
+ {
+ "epoch": 0.14799054373522458,
+ "grad_norm": 3.711341619491577,
+ "learning_rate": 4.996469670310407e-06,
+ "loss": 0.7362,
+ "step": 313
+ },
+ {
+ "epoch": 0.14846335697399526,
+ "grad_norm": 3.2419373989105225,
+ "learning_rate": 4.99643645175697e-06,
+ "loss": 0.7761,
+ "step": 314
+ },
+ {
+ "epoch": 0.14893617021276595,
+ "grad_norm": 3.121858835220337,
+ "learning_rate": 4.996403077761381e-06,
+ "loss": 0.6495,
+ "step": 315
+ },
+ {
+ "epoch": 0.14940898345153664,
+ "grad_norm": 3.123054265975952,
+ "learning_rate": 4.996369548325719e-06,
+ "loss": 0.7444,
+ "step": 316
+ },
+ {
+ "epoch": 0.14988179669030732,
+ "grad_norm": 2.780880928039551,
+ "learning_rate": 4.996335863452072e-06,
+ "loss": 0.672,
+ "step": 317
+ },
+ {
+ "epoch": 0.150354609929078,
+ "grad_norm": 3.3738629817962646,
+ "learning_rate": 4.996302023142536e-06,
+ "loss": 0.7972,
+ "step": 318
+ },
+ {
+ "epoch": 0.1508274231678487,
+ "grad_norm": 3.4874777793884277,
+ "learning_rate": 4.99626802739922e-06,
+ "loss": 0.8252,
+ "step": 319
+ },
+ {
+ "epoch": 0.15130023640661938,
+ "grad_norm": 3.7074787616729736,
+ "learning_rate": 4.9962338762242395e-06,
+ "loss": 0.8216,
+ "step": 320
+ },
+ {
+ "epoch": 0.15177304964539007,
+ "grad_norm": 3.281912326812744,
+ "learning_rate": 4.996199569619721e-06,
+ "loss": 0.8175,
+ "step": 321
+ },
+ {
+ "epoch": 0.15224586288416075,
+ "grad_norm": 2.9485340118408203,
+ "learning_rate": 4.996165107587801e-06,
+ "loss": 0.707,
+ "step": 322
+ },
+ {
+ "epoch": 0.15271867612293144,
+ "grad_norm": 3.3757646083831787,
+ "learning_rate": 4.996130490130625e-06,
+ "loss": 0.7955,
+ "step": 323
+ },
+ {
+ "epoch": 0.15319148936170213,
+ "grad_norm": 2.962181568145752,
+ "learning_rate": 4.996095717250349e-06,
+ "loss": 0.7067,
+ "step": 324
+ },
+ {
+ "epoch": 0.1536643026004728,
+ "grad_norm": 3.114272356033325,
+ "learning_rate": 4.996060788949136e-06,
+ "loss": 0.7486,
+ "step": 325
+ },
+ {
+ "epoch": 0.1541371158392435,
+ "grad_norm": 3.0621590614318848,
+ "learning_rate": 4.996025705229165e-06,
+ "loss": 0.6547,
+ "step": 326
+ },
+ {
+ "epoch": 0.15460992907801419,
+ "grad_norm": 2.8745882511138916,
+ "learning_rate": 4.995990466092616e-06,
+ "loss": 0.6435,
+ "step": 327
+ },
+ {
+ "epoch": 0.15508274231678487,
+ "grad_norm": 2.90841007232666,
+ "learning_rate": 4.995955071541686e-06,
+ "loss": 0.7331,
+ "step": 328
+ },
+ {
+ "epoch": 0.15555555555555556,
+ "grad_norm": 2.694580316543579,
+ "learning_rate": 4.9959195215785784e-06,
+ "loss": 0.6731,
+ "step": 329
+ },
+ {
+ "epoch": 0.15602836879432624,
+ "grad_norm": 3.158083438873291,
+ "learning_rate": 4.995883816205507e-06,
+ "loss": 0.7257,
+ "step": 330
+ },
+ {
+ "epoch": 0.15650118203309693,
+ "grad_norm": 3.3234715461730957,
+ "learning_rate": 4.995847955424694e-06,
+ "loss": 0.7389,
+ "step": 331
+ },
+ {
+ "epoch": 0.15697399527186762,
+ "grad_norm": 2.9406495094299316,
+ "learning_rate": 4.995811939238373e-06,
+ "loss": 0.643,
+ "step": 332
+ },
+ {
+ "epoch": 0.1574468085106383,
+ "grad_norm": 3.3191726207733154,
+ "learning_rate": 4.995775767648785e-06,
+ "loss": 0.7879,
+ "step": 333
+ },
+ {
+ "epoch": 0.157919621749409,
+ "grad_norm": 3.711925745010376,
+ "learning_rate": 4.995739440658185e-06,
+ "loss": 0.7586,
+ "step": 334
+ },
+ {
+ "epoch": 0.15839243498817968,
+ "grad_norm": 9.573421478271484,
+ "learning_rate": 4.995702958268833e-06,
+ "loss": 0.7842,
+ "step": 335
+ },
+ {
+ "epoch": 0.15886524822695036,
+ "grad_norm": 3.4154508113861084,
+ "learning_rate": 4.995666320483001e-06,
+ "loss": 0.6735,
+ "step": 336
+ },
+ {
+ "epoch": 0.15933806146572105,
+ "grad_norm": 3.4169859886169434,
+ "learning_rate": 4.995629527302971e-06,
+ "loss": 0.741,
+ "step": 337
+ },
+ {
+ "epoch": 0.15981087470449173,
+ "grad_norm": 3.287503242492676,
+ "learning_rate": 4.9955925787310335e-06,
+ "loss": 0.7139,
+ "step": 338
+ },
+ {
+ "epoch": 0.16028368794326242,
+ "grad_norm": 3.288409471511841,
+ "learning_rate": 4.995555474769488e-06,
+ "loss": 0.7636,
+ "step": 339
+ },
+ {
+ "epoch": 0.1607565011820331,
+ "grad_norm": 2.8021693229675293,
+ "learning_rate": 4.995518215420646e-06,
+ "loss": 0.5883,
+ "step": 340
+ },
+ {
+ "epoch": 0.1612293144208038,
+ "grad_norm": 2.7038564682006836,
+ "learning_rate": 4.995480800686827e-06,
+ "loss": 0.657,
+ "step": 341
+ },
+ {
+ "epoch": 0.16170212765957448,
+ "grad_norm": 3.2370235919952393,
+ "learning_rate": 4.9954432305703615e-06,
+ "loss": 0.6999,
+ "step": 342
+ },
+ {
+ "epoch": 0.16217494089834517,
+ "grad_norm": 2.8666412830352783,
+ "learning_rate": 4.995405505073588e-06,
+ "loss": 0.7199,
+ "step": 343
+ },
+ {
+ "epoch": 0.16264775413711585,
+ "grad_norm": 3.6467232704162598,
+ "learning_rate": 4.995367624198856e-06,
+ "loss": 0.7317,
+ "step": 344
+ },
+ {
+ "epoch": 0.16312056737588654,
+ "grad_norm": 2.7576327323913574,
+ "learning_rate": 4.9953295879485246e-06,
+ "loss": 0.647,
+ "step": 345
+ },
+ {
+ "epoch": 0.1635933806146572,
+ "grad_norm": 2.922232151031494,
+ "learning_rate": 4.995291396324959e-06,
+ "loss": 0.6686,
+ "step": 346
+ },
+ {
+ "epoch": 0.16406619385342788,
+ "grad_norm": 2.8693501949310303,
+ "learning_rate": 4.995253049330542e-06,
+ "loss": 0.6756,
+ "step": 347
+ },
+ {
+ "epoch": 0.16453900709219857,
+ "grad_norm": 3.671865701675415,
+ "learning_rate": 4.995214546967658e-06,
+ "loss": 0.7347,
+ "step": 348
+ },
+ {
+ "epoch": 0.16501182033096926,
+ "grad_norm": 3.024219274520874,
+ "learning_rate": 4.995175889238706e-06,
+ "loss": 0.7547,
+ "step": 349
+ },
+ {
+ "epoch": 0.16548463356973994,
+ "grad_norm": 2.8470778465270996,
+ "learning_rate": 4.995137076146091e-06,
+ "loss": 0.6764,
+ "step": 350
+ },
+ {
+ "epoch": 0.16595744680851063,
+ "grad_norm": 2.905057907104492,
+ "learning_rate": 4.9950981076922324e-06,
+ "loss": 0.6814,
+ "step": 351
+ },
+ {
+ "epoch": 0.16643026004728131,
+ "grad_norm": 3.504377841949463,
+ "learning_rate": 4.995058983879555e-06,
+ "loss": 0.7145,
+ "step": 352
+ },
+ {
+ "epoch": 0.166903073286052,
+ "grad_norm": 3.0029661655426025,
+ "learning_rate": 4.995019704710495e-06,
+ "loss": 0.7114,
+ "step": 353
+ },
+ {
+ "epoch": 0.1673758865248227,
+ "grad_norm": 2.8666274547576904,
+ "learning_rate": 4.994980270187499e-06,
+ "loss": 0.7416,
+ "step": 354
+ },
+ {
+ "epoch": 0.16784869976359337,
+ "grad_norm": 3.1644718647003174,
+ "learning_rate": 4.994940680313021e-06,
+ "loss": 0.661,
+ "step": 355
+ },
+ {
+ "epoch": 0.16832151300236406,
+ "grad_norm": 3.050391674041748,
+ "learning_rate": 4.994900935089527e-06,
+ "loss": 0.7243,
+ "step": 356
+ },
+ {
+ "epoch": 0.16879432624113475,
+ "grad_norm": 2.985466480255127,
+ "learning_rate": 4.994861034519491e-06,
+ "loss": 0.6917,
+ "step": 357
+ },
+ {
+ "epoch": 0.16926713947990543,
+ "grad_norm": 2.909342050552368,
+ "learning_rate": 4.9948209786053995e-06,
+ "loss": 0.6636,
+ "step": 358
+ },
+ {
+ "epoch": 0.16973995271867612,
+ "grad_norm": 3.2214784622192383,
+ "learning_rate": 4.9947807673497435e-06,
+ "loss": 0.7903,
+ "step": 359
+ },
+ {
+ "epoch": 0.1702127659574468,
+ "grad_norm": 2.5654983520507812,
+ "learning_rate": 4.994740400755029e-06,
+ "loss": 0.6129,
+ "step": 360
+ },
+ {
+ "epoch": 0.1706855791962175,
+ "grad_norm": 3.775646448135376,
+ "learning_rate": 4.99469987882377e-06,
+ "loss": 0.7145,
+ "step": 361
+ },
+ {
+ "epoch": 0.17115839243498818,
+ "grad_norm": 2.8965413570404053,
+ "learning_rate": 4.994659201558487e-06,
+ "loss": 0.7177,
+ "step": 362
+ },
+ {
+ "epoch": 0.17163120567375886,
+ "grad_norm": 3.485597848892212,
+ "learning_rate": 4.9946183689617146e-06,
+ "loss": 0.8107,
+ "step": 363
+ },
+ {
+ "epoch": 0.17210401891252955,
+ "grad_norm": 3.277839183807373,
+ "learning_rate": 4.994577381035995e-06,
+ "loss": 0.691,
+ "step": 364
+ },
+ {
+ "epoch": 0.17257683215130024,
+ "grad_norm": 2.8807685375213623,
+ "learning_rate": 4.99453623778388e-06,
+ "loss": 0.7627,
+ "step": 365
+ },
+ {
+ "epoch": 0.17304964539007092,
+ "grad_norm": 3.0659940242767334,
+ "learning_rate": 4.994494939207932e-06,
+ "loss": 0.6858,
+ "step": 366
+ },
+ {
+ "epoch": 0.1735224586288416,
+ "grad_norm": 3.0881855487823486,
+ "learning_rate": 4.994453485310723e-06,
+ "loss": 0.8212,
+ "step": 367
+ },
+ {
+ "epoch": 0.1739952718676123,
+ "grad_norm": 2.7199201583862305,
+ "learning_rate": 4.994411876094832e-06,
+ "loss": 0.6516,
+ "step": 368
+ },
+ {
+ "epoch": 0.17446808510638298,
+ "grad_norm": 2.955889940261841,
+ "learning_rate": 4.994370111562851e-06,
+ "loss": 0.6579,
+ "step": 369
+ },
+ {
+ "epoch": 0.17494089834515367,
+ "grad_norm": 3.1321663856506348,
+ "learning_rate": 4.994328191717382e-06,
+ "loss": 0.6891,
+ "step": 370
+ },
+ {
+ "epoch": 0.17541371158392435,
+ "grad_norm": 3.0560388565063477,
+ "learning_rate": 4.994286116561034e-06,
+ "loss": 0.7243,
+ "step": 371
+ },
+ {
+ "epoch": 0.17588652482269504,
+ "grad_norm": 3.1560704708099365,
+ "learning_rate": 4.994243886096425e-06,
+ "loss": 0.7262,
+ "step": 372
+ },
+ {
+ "epoch": 0.17635933806146573,
+ "grad_norm": 2.913541316986084,
+ "learning_rate": 4.994201500326187e-06,
+ "loss": 0.7318,
+ "step": 373
+ },
+ {
+ "epoch": 0.1768321513002364,
+ "grad_norm": 3.098376512527466,
+ "learning_rate": 4.994158959252958e-06,
+ "loss": 0.6419,
+ "step": 374
+ },
+ {
+ "epoch": 0.1773049645390071,
+ "grad_norm": 2.977508544921875,
+ "learning_rate": 4.994116262879387e-06,
+ "loss": 0.6709,
+ "step": 375
+ },
+ {
+ "epoch": 0.17777777777777778,
+ "grad_norm": 3.168186902999878,
+ "learning_rate": 4.994073411208133e-06,
+ "loss": 0.6608,
+ "step": 376
+ },
+ {
+ "epoch": 0.17825059101654847,
+ "grad_norm": 3.436844825744629,
+ "learning_rate": 4.994030404241864e-06,
+ "loss": 0.7227,
+ "step": 377
+ },
+ {
+ "epoch": 0.17872340425531916,
+ "grad_norm": 2.8998289108276367,
+ "learning_rate": 4.993987241983258e-06,
+ "loss": 0.6512,
+ "step": 378
+ },
+ {
+ "epoch": 0.17919621749408984,
+ "grad_norm": 3.407191514968872,
+ "learning_rate": 4.993943924435002e-06,
+ "loss": 0.616,
+ "step": 379
+ },
+ {
+ "epoch": 0.17966903073286053,
+ "grad_norm": 3.744858741760254,
+ "learning_rate": 4.993900451599793e-06,
+ "loss": 0.8599,
+ "step": 380
+ },
+ {
+ "epoch": 0.18014184397163122,
+ "grad_norm": 3.486283779144287,
+ "learning_rate": 4.993856823480338e-06,
+ "loss": 0.6634,
+ "step": 381
+ },
+ {
+ "epoch": 0.1806146572104019,
+ "grad_norm": 2.895719051361084,
+ "learning_rate": 4.993813040079355e-06,
+ "loss": 0.6972,
+ "step": 382
+ },
+ {
+ "epoch": 0.1810874704491726,
+ "grad_norm": 2.814133882522583,
+ "learning_rate": 4.993769101399569e-06,
+ "loss": 0.6271,
+ "step": 383
+ },
+ {
+ "epoch": 0.18156028368794327,
+ "grad_norm": 2.8609800338745117,
+ "learning_rate": 4.993725007443715e-06,
+ "loss": 0.6481,
+ "step": 384
+ },
+ {
+ "epoch": 0.18203309692671396,
+ "grad_norm": 3.2829644680023193,
+ "learning_rate": 4.99368075821454e-06,
+ "loss": 0.7999,
+ "step": 385
+ },
+ {
+ "epoch": 0.18250591016548465,
+ "grad_norm": 3.1417458057403564,
+ "learning_rate": 4.993636353714798e-06,
+ "loss": 0.6972,
+ "step": 386
+ },
+ {
+ "epoch": 0.1829787234042553,
+ "grad_norm": 3.0679385662078857,
+ "learning_rate": 4.993591793947256e-06,
+ "loss": 0.667,
+ "step": 387
+ },
+ {
+ "epoch": 0.183451536643026,
+ "grad_norm": 3.1387410163879395,
+ "learning_rate": 4.993547078914686e-06,
+ "loss": 0.7618,
+ "step": 388
+ },
+ {
+ "epoch": 0.18392434988179668,
+ "grad_norm": 2.9181406497955322,
+ "learning_rate": 4.993502208619872e-06,
+ "loss": 0.7391,
+ "step": 389
+ },
+ {
+ "epoch": 0.18439716312056736,
+ "grad_norm": 2.8952157497406006,
+ "learning_rate": 4.993457183065611e-06,
+ "loss": 0.6988,
+ "step": 390
+ },
+ {
+ "epoch": 0.18486997635933805,
+ "grad_norm": 3.2274813652038574,
+ "learning_rate": 4.993412002254704e-06,
+ "loss": 0.688,
+ "step": 391
+ },
+ {
+ "epoch": 0.18534278959810874,
+ "grad_norm": 3.4693779945373535,
+ "learning_rate": 4.993366666189965e-06,
+ "loss": 0.6634,
+ "step": 392
+ },
+ {
+ "epoch": 0.18581560283687942,
+ "grad_norm": 3.5358526706695557,
+ "learning_rate": 4.993321174874217e-06,
+ "loss": 0.7343,
+ "step": 393
+ },
+ {
+ "epoch": 0.1862884160756501,
+ "grad_norm": 3.013338088989258,
+ "learning_rate": 4.993275528310292e-06,
+ "loss": 0.7579,
+ "step": 394
+ },
+ {
+ "epoch": 0.1867612293144208,
+ "grad_norm": 2.694772720336914,
+ "learning_rate": 4.993229726501033e-06,
+ "loss": 0.718,
+ "step": 395
+ },
+ {
+ "epoch": 0.18723404255319148,
+ "grad_norm": 3.070612907409668,
+ "learning_rate": 4.9931837694492915e-06,
+ "loss": 0.6438,
+ "step": 396
+ },
+ {
+ "epoch": 0.18770685579196217,
+ "grad_norm": 2.9193027019500732,
+ "learning_rate": 4.993137657157928e-06,
+ "loss": 0.6788,
+ "step": 397
+ },
+ {
+ "epoch": 0.18817966903073285,
+ "grad_norm": 3.047682046890259,
+ "learning_rate": 4.993091389629816e-06,
+ "loss": 0.6826,
+ "step": 398
+ },
+ {
+ "epoch": 0.18865248226950354,
+ "grad_norm": 2.9629905223846436,
+ "learning_rate": 4.993044966867834e-06,
+ "loss": 0.7196,
+ "step": 399
+ },
+ {
+ "epoch": 0.18912529550827423,
+ "grad_norm": 3.0692050457000732,
+ "learning_rate": 4.992998388874874e-06,
+ "loss": 0.7015,
+ "step": 400
+ },
+ {
+ "epoch": 0.1895981087470449,
+ "grad_norm": 3.5427212715148926,
+ "learning_rate": 4.992951655653836e-06,
+ "loss": 0.8292,
+ "step": 401
+ },
+ {
+ "epoch": 0.1900709219858156,
+ "grad_norm": 2.643526554107666,
+ "learning_rate": 4.992904767207629e-06,
+ "loss": 0.624,
+ "step": 402
+ },
+ {
+ "epoch": 0.19054373522458629,
+ "grad_norm": 3.1185996532440186,
+ "learning_rate": 4.992857723539173e-06,
+ "loss": 0.7354,
+ "step": 403
+ },
+ {
+ "epoch": 0.19101654846335697,
+ "grad_norm": 3.006856679916382,
+ "learning_rate": 4.992810524651398e-06,
+ "loss": 0.7752,
+ "step": 404
+ },
+ {
+ "epoch": 0.19148936170212766,
+ "grad_norm": 2.9913275241851807,
+ "learning_rate": 4.9927631705472425e-06,
+ "loss": 0.7306,
+ "step": 405
+ },
+ {
+ "epoch": 0.19196217494089834,
+ "grad_norm": 2.6794071197509766,
+ "learning_rate": 4.992715661229655e-06,
+ "loss": 0.6136,
+ "step": 406
+ },
+ {
+ "epoch": 0.19243498817966903,
+ "grad_norm": 3.5933966636657715,
+ "learning_rate": 4.992667996701593e-06,
+ "loss": 0.7024,
+ "step": 407
+ },
+ {
+ "epoch": 0.19290780141843972,
+ "grad_norm": 2.862187623977661,
+ "learning_rate": 4.992620176966025e-06,
+ "loss": 0.692,
+ "step": 408
+ },
+ {
+ "epoch": 0.1933806146572104,
+ "grad_norm": 3.076845407485962,
+ "learning_rate": 4.9925722020259286e-06,
+ "loss": 0.7475,
+ "step": 409
+ },
+ {
+ "epoch": 0.1938534278959811,
+ "grad_norm": 3.372919797897339,
+ "learning_rate": 4.9925240718842895e-06,
+ "loss": 0.6886,
+ "step": 410
+ },
+ {
+ "epoch": 0.19432624113475178,
+ "grad_norm": 2.922977924346924,
+ "learning_rate": 4.992475786544108e-06,
+ "loss": 0.7049,
+ "step": 411
+ },
+ {
+ "epoch": 0.19479905437352246,
+ "grad_norm": 2.908034324645996,
+ "learning_rate": 4.992427346008387e-06,
+ "loss": 0.6498,
+ "step": 412
+ },
+ {
+ "epoch": 0.19527186761229315,
+ "grad_norm": 3.096723794937134,
+ "learning_rate": 4.992378750280144e-06,
+ "loss": 0.7151,
+ "step": 413
+ },
+ {
+ "epoch": 0.19574468085106383,
+ "grad_norm": 2.895237684249878,
+ "learning_rate": 4.992329999362405e-06,
+ "loss": 0.7277,
+ "step": 414
+ },
+ {
+ "epoch": 0.19621749408983452,
+ "grad_norm": 2.718230724334717,
+ "learning_rate": 4.9922810932582065e-06,
+ "loss": 0.6375,
+ "step": 415
+ },
+ {
+ "epoch": 0.1966903073286052,
+ "grad_norm": 3.187743663787842,
+ "learning_rate": 4.992232031970592e-06,
+ "loss": 0.6528,
+ "step": 416
+ },
+ {
+ "epoch": 0.1971631205673759,
+ "grad_norm": 2.996406316757202,
+ "learning_rate": 4.992182815502616e-06,
+ "loss": 0.6552,
+ "step": 417
+ },
+ {
+ "epoch": 0.19763593380614658,
+ "grad_norm": 3.301084041595459,
+ "learning_rate": 4.992133443857345e-06,
+ "loss": 0.7061,
+ "step": 418
+ },
+ {
+ "epoch": 0.19810874704491727,
+ "grad_norm": 3.7874677181243896,
+ "learning_rate": 4.992083917037853e-06,
+ "loss": 0.7859,
+ "step": 419
+ },
+ {
+ "epoch": 0.19858156028368795,
+ "grad_norm": 3.124253511428833,
+ "learning_rate": 4.992034235047222e-06,
+ "loss": 0.7615,
+ "step": 420
+ },
+ {
+ "epoch": 0.19905437352245864,
+ "grad_norm": 3.0488970279693604,
+ "learning_rate": 4.991984397888546e-06,
+ "loss": 0.6916,
+ "step": 421
+ },
+ {
+ "epoch": 0.19952718676122932,
+ "grad_norm": 3.1241321563720703,
+ "learning_rate": 4.991934405564929e-06,
+ "loss": 0.7055,
+ "step": 422
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 3.396632432937622,
+ "learning_rate": 4.991884258079484e-06,
+ "loss": 0.7675,
+ "step": 423
+ },
+ {
+ "epoch": 0.2004728132387707,
+ "grad_norm": 3.7776873111724854,
+ "learning_rate": 4.9918339554353316e-06,
+ "loss": 0.7371,
+ "step": 424
+ },
+ {
+ "epoch": 0.20094562647754138,
+ "grad_norm": 3.3356032371520996,
+ "learning_rate": 4.991783497635606e-06,
+ "loss": 0.6778,
+ "step": 425
+ },
+ {
+ "epoch": 0.20141843971631207,
+ "grad_norm": 2.988856792449951,
+ "learning_rate": 4.9917328846834474e-06,
+ "loss": 0.6795,
+ "step": 426
+ },
+ {
+ "epoch": 0.20189125295508276,
+ "grad_norm": 3.264183282852173,
+ "learning_rate": 4.99168211658201e-06,
+ "loss": 0.7707,
+ "step": 427
+ },
+ {
+ "epoch": 0.20236406619385341,
+ "grad_norm": 3.878068208694458,
+ "learning_rate": 4.991631193334451e-06,
+ "loss": 0.857,
+ "step": 428
+ },
+ {
+ "epoch": 0.2028368794326241,
+ "grad_norm": 3.6377553939819336,
+ "learning_rate": 4.991580114943943e-06,
+ "loss": 0.8033,
+ "step": 429
+ },
+ {
+ "epoch": 0.2033096926713948,
+ "grad_norm": 2.95393967628479,
+ "learning_rate": 4.991528881413667e-06,
+ "loss": 0.6809,
+ "step": 430
+ },
+ {
+ "epoch": 0.20378250591016547,
+ "grad_norm": 3.058704376220703,
+ "learning_rate": 4.9914774927468125e-06,
+ "loss": 0.6664,
+ "step": 431
+ },
+ {
+ "epoch": 0.20425531914893616,
+ "grad_norm": 2.7783217430114746,
+ "learning_rate": 4.9914259489465795e-06,
+ "loss": 0.6478,
+ "step": 432
+ },
+ {
+ "epoch": 0.20472813238770685,
+ "grad_norm": 2.4825217723846436,
+ "learning_rate": 4.991374250016177e-06,
+ "loss": 0.6598,
+ "step": 433
+ },
+ {
+ "epoch": 0.20520094562647753,
+ "grad_norm": 2.8753600120544434,
+ "learning_rate": 4.991322395958824e-06,
+ "loss": 0.6947,
+ "step": 434
+ },
+ {
+ "epoch": 0.20567375886524822,
+ "grad_norm": 3.2339367866516113,
+ "learning_rate": 4.99127038677775e-06,
+ "loss": 0.8201,
+ "step": 435
+ },
+ {
+ "epoch": 0.2061465721040189,
+ "grad_norm": 2.9065537452697754,
+ "learning_rate": 4.991218222476193e-06,
+ "loss": 0.6679,
+ "step": 436
+ },
+ {
+ "epoch": 0.2066193853427896,
+ "grad_norm": 3.283228874206543,
+ "learning_rate": 4.991165903057401e-06,
+ "loss": 0.8039,
+ "step": 437
+ },
+ {
+ "epoch": 0.20709219858156028,
+ "grad_norm": 3.429872751235962,
+ "learning_rate": 4.991113428524631e-06,
+ "loss": 0.7392,
+ "step": 438
+ },
+ {
+ "epoch": 0.20756501182033096,
+ "grad_norm": 3.118943452835083,
+ "learning_rate": 4.991060798881152e-06,
+ "loss": 0.6794,
+ "step": 439
+ },
+ {
+ "epoch": 0.20803782505910165,
+ "grad_norm": 3.395970106124878,
+ "learning_rate": 4.99100801413024e-06,
+ "loss": 0.6862,
+ "step": 440
+ },
+ {
+ "epoch": 0.20851063829787234,
+ "grad_norm": 2.869191884994507,
+ "learning_rate": 4.99095507427518e-06,
+ "loss": 0.6076,
+ "step": 441
+ },
+ {
+ "epoch": 0.20898345153664302,
+ "grad_norm": 3.1934661865234375,
+ "learning_rate": 4.990901979319272e-06,
+ "loss": 0.6927,
+ "step": 442
+ },
+ {
+ "epoch": 0.2094562647754137,
+ "grad_norm": 2.9068603515625,
+ "learning_rate": 4.990848729265819e-06,
+ "loss": 0.6864,
+ "step": 443
+ },
+ {
+ "epoch": 0.2099290780141844,
+ "grad_norm": 3.0535948276519775,
+ "learning_rate": 4.9907953241181375e-06,
+ "loss": 0.6396,
+ "step": 444
+ },
+ {
+ "epoch": 0.21040189125295508,
+ "grad_norm": 2.871511459350586,
+ "learning_rate": 4.990741763879554e-06,
+ "loss": 0.6743,
+ "step": 445
+ },
+ {
+ "epoch": 0.21087470449172577,
+ "grad_norm": 2.9184393882751465,
+ "learning_rate": 4.9906880485534015e-06,
+ "loss": 0.6786,
+ "step": 446
+ },
+ {
+ "epoch": 0.21134751773049645,
+ "grad_norm": 3.0628271102905273,
+ "learning_rate": 4.990634178143026e-06,
+ "loss": 0.6326,
+ "step": 447
+ },
+ {
+ "epoch": 0.21182033096926714,
+ "grad_norm": 3.7878305912017822,
+ "learning_rate": 4.990580152651782e-06,
+ "loss": 0.7944,
+ "step": 448
+ },
+ {
+ "epoch": 0.21229314420803783,
+ "grad_norm": 2.8577189445495605,
+ "learning_rate": 4.990525972083031e-06,
+ "loss": 0.71,
+ "step": 449
+ },
+ {
+ "epoch": 0.2127659574468085,
+ "grad_norm": 3.307769775390625,
+ "learning_rate": 4.99047163644015e-06,
+ "loss": 0.6893,
+ "step": 450
+ },
+ {
+ "epoch": 0.2132387706855792,
+ "grad_norm": 2.7391717433929443,
+ "learning_rate": 4.990417145726519e-06,
+ "loss": 0.712,
+ "step": 451
+ },
+ {
+ "epoch": 0.21371158392434988,
+ "grad_norm": 2.938044786453247,
+ "learning_rate": 4.990362499945534e-06,
+ "loss": 0.7516,
+ "step": 452
+ },
+ {
+ "epoch": 0.21418439716312057,
+ "grad_norm": 2.7831056118011475,
+ "learning_rate": 4.990307699100595e-06,
+ "loss": 0.6168,
+ "step": 453
+ },
+ {
+ "epoch": 0.21465721040189126,
+ "grad_norm": 2.907977342605591,
+ "learning_rate": 4.990252743195116e-06,
+ "loss": 0.6706,
+ "step": 454
+ },
+ {
+ "epoch": 0.21513002364066194,
+ "grad_norm": 3.7882161140441895,
+ "learning_rate": 4.990197632232517e-06,
+ "loss": 0.6847,
+ "step": 455
+ },
+ {
+ "epoch": 0.21560283687943263,
+ "grad_norm": 2.899716854095459,
+ "learning_rate": 4.990142366216232e-06,
+ "loss": 0.6699,
+ "step": 456
+ },
+ {
+ "epoch": 0.21607565011820332,
+ "grad_norm": 2.907003879547119,
+ "learning_rate": 4.990086945149701e-06,
+ "loss": 0.6864,
+ "step": 457
+ },
+ {
+ "epoch": 0.216548463356974,
+ "grad_norm": 3.2407333850860596,
+ "learning_rate": 4.9900313690363736e-06,
+ "loss": 0.692,
+ "step": 458
+ },
+ {
+ "epoch": 0.2170212765957447,
+ "grad_norm": 2.9055583477020264,
+ "learning_rate": 4.989975637879712e-06,
+ "loss": 0.7113,
+ "step": 459
+ },
+ {
+ "epoch": 0.21749408983451538,
+ "grad_norm": 2.9836206436157227,
+ "learning_rate": 4.989919751683184e-06,
+ "loss": 0.6673,
+ "step": 460
+ },
+ {
+ "epoch": 0.21796690307328606,
+ "grad_norm": 3.371035575866699,
+ "learning_rate": 4.989863710450273e-06,
+ "loss": 0.7181,
+ "step": 461
+ },
+ {
+ "epoch": 0.21843971631205675,
+ "grad_norm": 2.9636635780334473,
+ "learning_rate": 4.989807514184465e-06,
+ "loss": 0.6082,
+ "step": 462
+ },
+ {
+ "epoch": 0.21891252955082743,
+ "grad_norm": 2.9634664058685303,
+ "learning_rate": 4.9897511628892615e-06,
+ "loss": 0.7086,
+ "step": 463
+ },
+ {
+ "epoch": 0.21938534278959812,
+ "grad_norm": 3.154763698577881,
+ "learning_rate": 4.98969465656817e-06,
+ "loss": 0.7027,
+ "step": 464
+ },
+ {
+ "epoch": 0.2198581560283688,
+ "grad_norm": 2.9959890842437744,
+ "learning_rate": 4.98963799522471e-06,
+ "loss": 0.6498,
+ "step": 465
+ },
+ {
+ "epoch": 0.2203309692671395,
+ "grad_norm": 3.5470590591430664,
+ "learning_rate": 4.989581178862408e-06,
+ "loss": 0.7199,
+ "step": 466
+ },
+ {
+ "epoch": 0.22080378250591018,
+ "grad_norm": 7.1873369216918945,
+ "learning_rate": 4.989524207484802e-06,
+ "loss": 0.6676,
+ "step": 467
+ },
+ {
+ "epoch": 0.22127659574468084,
+ "grad_norm": 3.1099541187286377,
+ "learning_rate": 4.98946708109544e-06,
+ "loss": 0.6785,
+ "step": 468
+ },
+ {
+ "epoch": 0.22174940898345152,
+ "grad_norm": 2.830991506576538,
+ "learning_rate": 4.9894097996978795e-06,
+ "loss": 0.6456,
+ "step": 469
+ },
+ {
+ "epoch": 0.2222222222222222,
+ "grad_norm": 3.0212316513061523,
+ "learning_rate": 4.989352363295687e-06,
+ "loss": 0.6048,
+ "step": 470
+ },
+ {
+ "epoch": 0.2226950354609929,
+ "grad_norm": 3.18776798248291,
+ "learning_rate": 4.989294771892437e-06,
+ "loss": 0.7078,
+ "step": 471
+ },
+ {
+ "epoch": 0.22316784869976358,
+ "grad_norm": 2.9972598552703857,
+ "learning_rate": 4.989237025491717e-06,
+ "loss": 0.7082,
+ "step": 472
+ },
+ {
+ "epoch": 0.22364066193853427,
+ "grad_norm": 3.4935688972473145,
+ "learning_rate": 4.989179124097123e-06,
+ "loss": 0.8199,
+ "step": 473
+ },
+ {
+ "epoch": 0.22411347517730495,
+ "grad_norm": 2.6485543251037598,
+ "learning_rate": 4.9891210677122595e-06,
+ "loss": 0.6371,
+ "step": 474
+ },
+ {
+ "epoch": 0.22458628841607564,
+ "grad_norm": 2.969233512878418,
+ "learning_rate": 4.989062856340742e-06,
+ "loss": 0.6879,
+ "step": 475
+ },
+ {
+ "epoch": 0.22505910165484633,
+ "grad_norm": 2.881875514984131,
+ "learning_rate": 4.989004489986194e-06,
+ "loss": 0.7415,
+ "step": 476
+ },
+ {
+ "epoch": 0.225531914893617,
+ "grad_norm": 2.624540090560913,
+ "learning_rate": 4.98894596865225e-06,
+ "loss": 0.6522,
+ "step": 477
+ },
+ {
+ "epoch": 0.2260047281323877,
+ "grad_norm": 3.61075496673584,
+ "learning_rate": 4.988887292342555e-06,
+ "loss": 0.7109,
+ "step": 478
+ },
+ {
+ "epoch": 0.2264775413711584,
+ "grad_norm": 2.9368972778320312,
+ "learning_rate": 4.988828461060762e-06,
+ "loss": 0.6843,
+ "step": 479
+ },
+ {
+ "epoch": 0.22695035460992907,
+ "grad_norm": 3.0670197010040283,
+ "learning_rate": 4.988769474810533e-06,
+ "loss": 0.6807,
+ "step": 480
+ },
+ {
+ "epoch": 0.22742316784869976,
+ "grad_norm": 2.9662792682647705,
+ "learning_rate": 4.988710333595542e-06,
+ "loss": 0.6796,
+ "step": 481
+ },
+ {
+ "epoch": 0.22789598108747045,
+ "grad_norm": 2.971235752105713,
+ "learning_rate": 4.988651037419472e-06,
+ "loss": 0.696,
+ "step": 482
+ },
+ {
+ "epoch": 0.22836879432624113,
+ "grad_norm": 2.931884527206421,
+ "learning_rate": 4.988591586286013e-06,
+ "loss": 0.7323,
+ "step": 483
+ },
+ {
+ "epoch": 0.22884160756501182,
+ "grad_norm": 2.8114213943481445,
+ "learning_rate": 4.988531980198868e-06,
+ "loss": 0.6584,
+ "step": 484
+ },
+ {
+ "epoch": 0.2293144208037825,
+ "grad_norm": 3.2785916328430176,
+ "learning_rate": 4.98847221916175e-06,
+ "loss": 0.7514,
+ "step": 485
+ },
+ {
+ "epoch": 0.2297872340425532,
+ "grad_norm": 3.0520215034484863,
+ "learning_rate": 4.988412303178377e-06,
+ "loss": 0.7564,
+ "step": 486
+ },
+ {
+ "epoch": 0.23026004728132388,
+ "grad_norm": 3.181002616882324,
+ "learning_rate": 4.988352232252483e-06,
+ "loss": 0.6768,
+ "step": 487
+ },
+ {
+ "epoch": 0.23073286052009456,
+ "grad_norm": 3.4953625202178955,
+ "learning_rate": 4.988292006387805e-06,
+ "loss": 0.7143,
+ "step": 488
+ },
+ {
+ "epoch": 0.23120567375886525,
+ "grad_norm": 3.326571226119995,
+ "learning_rate": 4.988231625588096e-06,
+ "loss": 0.7318,
+ "step": 489
+ },
+ {
+ "epoch": 0.23167848699763594,
+ "grad_norm": 3.09614634513855,
+ "learning_rate": 4.988171089857113e-06,
+ "loss": 0.6574,
+ "step": 490
+ },
+ {
+ "epoch": 0.23215130023640662,
+ "grad_norm": 2.7439446449279785,
+ "learning_rate": 4.9881103991986265e-06,
+ "loss": 0.6637,
+ "step": 491
+ },
+ {
+ "epoch": 0.2326241134751773,
+ "grad_norm": 3.0681190490722656,
+ "learning_rate": 4.988049553616416e-06,
+ "loss": 0.6326,
+ "step": 492
+ },
+ {
+ "epoch": 0.233096926713948,
+ "grad_norm": 3.0757341384887695,
+ "learning_rate": 4.98798855311427e-06,
+ "loss": 0.695,
+ "step": 493
+ },
+ {
+ "epoch": 0.23356973995271868,
+ "grad_norm": 2.8637635707855225,
+ "learning_rate": 4.987927397695985e-06,
+ "loss": 0.6598,
+ "step": 494
+ },
+ {
+ "epoch": 0.23404255319148937,
+ "grad_norm": 3.3641068935394287,
+ "learning_rate": 4.9878660873653715e-06,
+ "loss": 0.7435,
+ "step": 495
+ },
+ {
+ "epoch": 0.23451536643026005,
+ "grad_norm": 3.5025596618652344,
+ "learning_rate": 4.987804622126245e-06,
+ "loss": 0.735,
+ "step": 496
+ },
+ {
+ "epoch": 0.23498817966903074,
+ "grad_norm": 2.9298837184906006,
+ "learning_rate": 4.987743001982434e-06,
+ "loss": 0.7063,
+ "step": 497
+ },
+ {
+ "epoch": 0.23546099290780143,
+ "grad_norm": 2.70358943939209,
+ "learning_rate": 4.987681226937774e-06,
+ "loss": 0.6799,
+ "step": 498
+ },
+ {
+ "epoch": 0.2359338061465721,
+ "grad_norm": 3.027871608734131,
+ "learning_rate": 4.9876192969961125e-06,
+ "loss": 0.6881,
+ "step": 499
+ },
+ {
+ "epoch": 0.2364066193853428,
+ "grad_norm": 3.362306594848633,
+ "learning_rate": 4.987557212161304e-06,
+ "loss": 0.7906,
+ "step": 500
+ },
+ {
+ "epoch": 0.23687943262411348,
+ "grad_norm": 3.3136050701141357,
+ "learning_rate": 4.987494972437217e-06,
+ "loss": 0.6878,
+ "step": 501
+ },
+ {
+ "epoch": 0.23735224586288417,
+ "grad_norm": 3.017089605331421,
+ "learning_rate": 4.9874325778277255e-06,
+ "loss": 0.7279,
+ "step": 502
+ },
+ {
+ "epoch": 0.23782505910165486,
+ "grad_norm": 2.8300516605377197,
+ "learning_rate": 4.987370028336714e-06,
+ "loss": 0.6864,
+ "step": 503
+ },
+ {
+ "epoch": 0.23829787234042554,
+ "grad_norm": 3.201860189437866,
+ "learning_rate": 4.987307323968077e-06,
+ "loss": 0.7531,
+ "step": 504
+ },
+ {
+ "epoch": 0.23877068557919623,
+ "grad_norm": 2.685396194458008,
+ "learning_rate": 4.987244464725721e-06,
+ "loss": 0.5849,
+ "step": 505
+ },
+ {
+ "epoch": 0.23924349881796692,
+ "grad_norm": 2.8715312480926514,
+ "learning_rate": 4.987181450613557e-06,
+ "loss": 0.675,
+ "step": 506
+ },
+ {
+ "epoch": 0.2397163120567376,
+ "grad_norm": 2.813908815383911,
+ "learning_rate": 4.987118281635511e-06,
+ "loss": 0.6841,
+ "step": 507
+ },
+ {
+ "epoch": 0.2401891252955083,
+ "grad_norm": 3.2738473415374756,
+ "learning_rate": 4.987054957795514e-06,
+ "loss": 0.7158,
+ "step": 508
+ },
+ {
+ "epoch": 0.24066193853427895,
+ "grad_norm": 2.896134376525879,
+ "learning_rate": 4.986991479097511e-06,
+ "loss": 0.7542,
+ "step": 509
+ },
+ {
+ "epoch": 0.24113475177304963,
+ "grad_norm": 3.0390403270721436,
+ "learning_rate": 4.986927845545454e-06,
+ "loss": 0.6733,
+ "step": 510
+ },
+ {
+ "epoch": 0.24160756501182032,
+ "grad_norm": 3.0300254821777344,
+ "learning_rate": 4.9868640571433044e-06,
+ "loss": 0.722,
+ "step": 511
+ },
+ {
+ "epoch": 0.242080378250591,
+ "grad_norm": 3.3037352561950684,
+ "learning_rate": 4.986800113895035e-06,
+ "loss": 0.6811,
+ "step": 512
+ },
+ {
+ "epoch": 0.2425531914893617,
+ "grad_norm": 3.0358474254608154,
+ "learning_rate": 4.986736015804627e-06,
+ "loss": 0.7348,
+ "step": 513
+ },
+ {
+ "epoch": 0.24302600472813238,
+ "grad_norm": 3.108792304992676,
+ "learning_rate": 4.986671762876071e-06,
+ "loss": 0.6096,
+ "step": 514
+ },
+ {
+ "epoch": 0.24349881796690306,
+ "grad_norm": 3.1316237449645996,
+ "learning_rate": 4.986607355113367e-06,
+ "loss": 0.6357,
+ "step": 515
+ },
+ {
+ "epoch": 0.24397163120567375,
+ "grad_norm": 3.3095219135284424,
+ "learning_rate": 4.986542792520528e-06,
+ "loss": 0.7515,
+ "step": 516
+ },
+ {
+ "epoch": 0.24444444444444444,
+ "grad_norm": 3.4775984287261963,
+ "learning_rate": 4.986478075101572e-06,
+ "loss": 0.7104,
+ "step": 517
+ },
+ {
+ "epoch": 0.24491725768321512,
+ "grad_norm": 3.341708183288574,
+ "learning_rate": 4.986413202860528e-06,
+ "loss": 0.7339,
+ "step": 518
+ },
+ {
+ "epoch": 0.2453900709219858,
+ "grad_norm": 2.9646966457366943,
+ "learning_rate": 4.986348175801438e-06,
+ "loss": 0.6032,
+ "step": 519
+ },
+ {
+ "epoch": 0.2458628841607565,
+ "grad_norm": 3.1853902339935303,
+ "learning_rate": 4.986282993928349e-06,
+ "loss": 0.6925,
+ "step": 520
+ },
+ {
+ "epoch": 0.24633569739952718,
+ "grad_norm": 3.286909818649292,
+ "learning_rate": 4.98621765724532e-06,
+ "loss": 0.7447,
+ "step": 521
+ },
+ {
+ "epoch": 0.24680851063829787,
+ "grad_norm": 3.2255051136016846,
+ "learning_rate": 4.986152165756419e-06,
+ "loss": 0.7747,
+ "step": 522
+ },
+ {
+ "epoch": 0.24728132387706855,
+ "grad_norm": 3.002352237701416,
+ "learning_rate": 4.986086519465724e-06,
+ "loss": 0.6472,
+ "step": 523
+ },
+ {
+ "epoch": 0.24775413711583924,
+ "grad_norm": 3.4738974571228027,
+ "learning_rate": 4.986020718377322e-06,
+ "loss": 0.7381,
+ "step": 524
+ },
+ {
+ "epoch": 0.24822695035460993,
+ "grad_norm": 3.4470200538635254,
+ "learning_rate": 4.985954762495312e-06,
+ "loss": 0.6878,
+ "step": 525
+ },
+ {
+ "epoch": 0.2486997635933806,
+ "grad_norm": 2.9219350814819336,
+ "learning_rate": 4.985888651823799e-06,
+ "loss": 0.6317,
+ "step": 526
+ },
+ {
+ "epoch": 0.2491725768321513,
+ "grad_norm": 3.061767101287842,
+ "learning_rate": 4.985822386366899e-06,
+ "loss": 0.6842,
+ "step": 527
+ },
+ {
+ "epoch": 0.24964539007092199,
+ "grad_norm": 3.0291247367858887,
+ "learning_rate": 4.985755966128742e-06,
+ "loss": 0.6852,
+ "step": 528
+ },
+ {
+ "epoch": 0.25011820330969264,
+ "grad_norm": 2.964280843734741,
+ "learning_rate": 4.985689391113457e-06,
+ "loss": 0.7738,
+ "step": 529
+ },
+ {
+ "epoch": 0.25059101654846333,
+ "grad_norm": 3.058302164077759,
+ "learning_rate": 4.9856226613251955e-06,
+ "loss": 0.6677,
+ "step": 530
+ },
+ {
+ "epoch": 0.251063829787234,
+ "grad_norm": 3.345141649246216,
+ "learning_rate": 4.985555776768109e-06,
+ "loss": 0.7837,
+ "step": 531
+ },
+ {
+ "epoch": 0.2515366430260047,
+ "grad_norm": 3.565031051635742,
+ "learning_rate": 4.9854887374463636e-06,
+ "loss": 0.7231,
+ "step": 532
+ },
+ {
+ "epoch": 0.2520094562647754,
+ "grad_norm": 2.7953789234161377,
+ "learning_rate": 4.985421543364132e-06,
+ "loss": 0.6102,
+ "step": 533
+ },
+ {
+ "epoch": 0.2524822695035461,
+ "grad_norm": 2.887606620788574,
+ "learning_rate": 4.9853541945256e-06,
+ "loss": 0.6289,
+ "step": 534
+ },
+ {
+ "epoch": 0.25295508274231676,
+ "grad_norm": 3.1480495929718018,
+ "learning_rate": 4.985286690934961e-06,
+ "loss": 0.6348,
+ "step": 535
+ },
+ {
+ "epoch": 0.25342789598108745,
+ "grad_norm": 2.8912761211395264,
+ "learning_rate": 4.985219032596416e-06,
+ "loss": 0.595,
+ "step": 536
+ },
+ {
+ "epoch": 0.25390070921985813,
+ "grad_norm": 2.947936534881592,
+ "learning_rate": 4.98515121951418e-06,
+ "loss": 0.6196,
+ "step": 537
+ },
+ {
+ "epoch": 0.2543735224586288,
+ "grad_norm": 3.1085827350616455,
+ "learning_rate": 4.985083251692474e-06,
+ "loss": 0.6387,
+ "step": 538
+ },
+ {
+ "epoch": 0.2548463356973995,
+ "grad_norm": 3.1688334941864014,
+ "learning_rate": 4.985015129135531e-06,
+ "loss": 0.7055,
+ "step": 539
+ },
+ {
+ "epoch": 0.2553191489361702,
+ "grad_norm": 3.075042963027954,
+ "learning_rate": 4.984946851847593e-06,
+ "loss": 0.7515,
+ "step": 540
+ },
+ {
+ "epoch": 0.2557919621749409,
+ "grad_norm": 3.1933093070983887,
+ "learning_rate": 4.98487841983291e-06,
+ "loss": 0.7054,
+ "step": 541
+ },
+ {
+ "epoch": 0.25626477541371157,
+ "grad_norm": 3.043473958969116,
+ "learning_rate": 4.984809833095744e-06,
+ "loss": 0.6281,
+ "step": 542
+ },
+ {
+ "epoch": 0.25673758865248225,
+ "grad_norm": 3.0532584190368652,
+ "learning_rate": 4.9847410916403645e-06,
+ "loss": 0.6155,
+ "step": 543
+ },
+ {
+ "epoch": 0.25721040189125294,
+ "grad_norm": 3.608480215072632,
+ "learning_rate": 4.984672195471053e-06,
+ "loss": 0.7363,
+ "step": 544
+ },
+ {
+ "epoch": 0.2576832151300236,
+ "grad_norm": 2.7491862773895264,
+ "learning_rate": 4.9846031445921e-06,
+ "loss": 0.6594,
+ "step": 545
+ },
+ {
+ "epoch": 0.2581560283687943,
+ "grad_norm": 2.8602418899536133,
+ "learning_rate": 4.984533939007802e-06,
+ "loss": 0.6742,
+ "step": 546
+ },
+ {
+ "epoch": 0.258628841607565,
+ "grad_norm": 3.1782007217407227,
+ "learning_rate": 4.98446457872247e-06,
+ "loss": 0.731,
+ "step": 547
+ },
+ {
+ "epoch": 0.2591016548463357,
+ "grad_norm": 2.796147584915161,
+ "learning_rate": 4.984395063740423e-06,
+ "loss": 0.6617,
+ "step": 548
+ },
+ {
+ "epoch": 0.25957446808510637,
+ "grad_norm": 2.8392202854156494,
+ "learning_rate": 4.984325394065991e-06,
+ "loss": 0.6753,
+ "step": 549
+ },
+ {
+ "epoch": 0.26004728132387706,
+ "grad_norm": 3.134672164916992,
+ "learning_rate": 4.984255569703508e-06,
+ "loss": 0.7222,
+ "step": 550
+ },
+ {
+ "epoch": 0.26052009456264774,
+ "grad_norm": 2.734330177307129,
+ "learning_rate": 4.984185590657325e-06,
+ "loss": 0.6098,
+ "step": 551
+ },
+ {
+ "epoch": 0.26099290780141843,
+ "grad_norm": 3.739010810852051,
+ "learning_rate": 4.984115456931798e-06,
+ "loss": 0.7457,
+ "step": 552
+ },
+ {
+ "epoch": 0.2614657210401891,
+ "grad_norm": 2.8412528038024902,
+ "learning_rate": 4.9840451685312925e-06,
+ "loss": 0.6972,
+ "step": 553
+ },
+ {
+ "epoch": 0.2619385342789598,
+ "grad_norm": 3.017395496368408,
+ "learning_rate": 4.983974725460188e-06,
+ "loss": 0.6887,
+ "step": 554
+ },
+ {
+ "epoch": 0.2624113475177305,
+ "grad_norm": 3.2746949195861816,
+ "learning_rate": 4.98390412772287e-06,
+ "loss": 0.7047,
+ "step": 555
+ },
+ {
+ "epoch": 0.2628841607565012,
+ "grad_norm": 3.1561965942382812,
+ "learning_rate": 4.983833375323732e-06,
+ "loss": 0.7726,
+ "step": 556
+ },
+ {
+ "epoch": 0.26335697399527186,
+ "grad_norm": 3.2367217540740967,
+ "learning_rate": 4.9837624682671816e-06,
+ "loss": 0.6348,
+ "step": 557
+ },
+ {
+ "epoch": 0.26382978723404255,
+ "grad_norm": 2.8195858001708984,
+ "learning_rate": 4.983691406557633e-06,
+ "loss": 0.6387,
+ "step": 558
+ },
+ {
+ "epoch": 0.26430260047281323,
+ "grad_norm": 3.349820852279663,
+ "learning_rate": 4.983620190199511e-06,
+ "loss": 0.6776,
+ "step": 559
+ },
+ {
+ "epoch": 0.2647754137115839,
+ "grad_norm": 2.8025588989257812,
+ "learning_rate": 4.98354881919725e-06,
+ "loss": 0.6512,
+ "step": 560
+ },
+ {
+ "epoch": 0.2652482269503546,
+ "grad_norm": 2.9125499725341797,
+ "learning_rate": 4.983477293555295e-06,
+ "loss": 0.7024,
+ "step": 561
+ },
+ {
+ "epoch": 0.2657210401891253,
+ "grad_norm": 3.3479275703430176,
+ "learning_rate": 4.983405613278098e-06,
+ "loss": 0.688,
+ "step": 562
+ },
+ {
+ "epoch": 0.266193853427896,
+ "grad_norm": 3.123971462249756,
+ "learning_rate": 4.983333778370123e-06,
+ "loss": 0.6743,
+ "step": 563
+ },
+ {
+ "epoch": 0.26666666666666666,
+ "grad_norm": 2.891625165939331,
+ "learning_rate": 4.983261788835843e-06,
+ "loss": 0.5971,
+ "step": 564
+ },
+ {
+ "epoch": 0.26713947990543735,
+ "grad_norm": 3.5066864490509033,
+ "learning_rate": 4.98318964467974e-06,
+ "loss": 0.6958,
+ "step": 565
+ },
+ {
+ "epoch": 0.26761229314420804,
+ "grad_norm": 2.570547342300415,
+ "learning_rate": 4.983117345906306e-06,
+ "loss": 0.609,
+ "step": 566
+ },
+ {
+ "epoch": 0.2680851063829787,
+ "grad_norm": 3.005106210708618,
+ "learning_rate": 4.983044892520044e-06,
+ "loss": 0.6791,
+ "step": 567
+ },
+ {
+ "epoch": 0.2685579196217494,
+ "grad_norm": 3.429675340652466,
+ "learning_rate": 4.982972284525463e-06,
+ "loss": 0.6625,
+ "step": 568
+ },
+ {
+ "epoch": 0.2690307328605201,
+ "grad_norm": 3.825657367706299,
+ "learning_rate": 4.982899521927086e-06,
+ "loss": 0.6368,
+ "step": 569
+ },
+ {
+ "epoch": 0.2695035460992908,
+ "grad_norm": 2.8699095249176025,
+ "learning_rate": 4.982826604729443e-06,
+ "loss": 0.6425,
+ "step": 570
+ },
+ {
+ "epoch": 0.26997635933806147,
+ "grad_norm": 3.1688714027404785,
+ "learning_rate": 4.982753532937074e-06,
+ "loss": 0.6904,
+ "step": 571
+ },
+ {
+ "epoch": 0.27044917257683215,
+ "grad_norm": 3.3889992237091064,
+ "learning_rate": 4.98268030655453e-06,
+ "loss": 0.7575,
+ "step": 572
+ },
+ {
+ "epoch": 0.27092198581560284,
+ "grad_norm": 3.108315944671631,
+ "learning_rate": 4.982606925586367e-06,
+ "loss": 0.6648,
+ "step": 573
+ },
+ {
+ "epoch": 0.2713947990543735,
+ "grad_norm": 3.209831953048706,
+ "learning_rate": 4.982533390037159e-06,
+ "loss": 0.657,
+ "step": 574
+ },
+ {
+ "epoch": 0.2718676122931442,
+ "grad_norm": 3.1740927696228027,
+ "learning_rate": 4.982459699911482e-06,
+ "loss": 0.7262,
+ "step": 575
+ },
+ {
+ "epoch": 0.2723404255319149,
+ "grad_norm": 3.0190417766571045,
+ "learning_rate": 4.982385855213924e-06,
+ "loss": 0.6368,
+ "step": 576
+ },
+ {
+ "epoch": 0.2728132387706856,
+ "grad_norm": 3.05049467086792,
+ "learning_rate": 4.982311855949084e-06,
+ "loss": 0.72,
+ "step": 577
+ },
+ {
+ "epoch": 0.27328605200945627,
+ "grad_norm": 2.984816551208496,
+ "learning_rate": 4.98223770212157e-06,
+ "loss": 0.6856,
+ "step": 578
+ },
+ {
+ "epoch": 0.27375886524822696,
+ "grad_norm": 2.744969606399536,
+ "learning_rate": 4.982163393735998e-06,
+ "loss": 0.6023,
+ "step": 579
+ },
+ {
+ "epoch": 0.27423167848699764,
+ "grad_norm": 3.170564889907837,
+ "learning_rate": 4.982088930796996e-06,
+ "loss": 0.6678,
+ "step": 580
+ },
+ {
+ "epoch": 0.27470449172576833,
+ "grad_norm": 2.8686118125915527,
+ "learning_rate": 4.982014313309199e-06,
+ "loss": 0.6157,
+ "step": 581
+ },
+ {
+ "epoch": 0.275177304964539,
+ "grad_norm": 2.8768694400787354,
+ "learning_rate": 4.981939541277254e-06,
+ "loss": 0.6566,
+ "step": 582
+ },
+ {
+ "epoch": 0.2756501182033097,
+ "grad_norm": 2.621481418609619,
+ "learning_rate": 4.981864614705818e-06,
+ "loss": 0.7372,
+ "step": 583
+ },
+ {
+ "epoch": 0.2761229314420804,
+ "grad_norm": 3.527374267578125,
+ "learning_rate": 4.981789533599554e-06,
+ "loss": 0.6485,
+ "step": 584
+ },
+ {
+ "epoch": 0.2765957446808511,
+ "grad_norm": 3.3141074180603027,
+ "learning_rate": 4.981714297963138e-06,
+ "loss": 0.6816,
+ "step": 585
+ },
+ {
+ "epoch": 0.27706855791962176,
+ "grad_norm": 2.9247069358825684,
+ "learning_rate": 4.981638907801255e-06,
+ "loss": 0.7217,
+ "step": 586
+ },
+ {
+ "epoch": 0.27754137115839245,
+ "grad_norm": 2.875236749649048,
+ "learning_rate": 4.981563363118599e-06,
+ "loss": 0.6662,
+ "step": 587
+ },
+ {
+ "epoch": 0.27801418439716313,
+ "grad_norm": 2.9540364742279053,
+ "learning_rate": 4.981487663919874e-06,
+ "loss": 0.7225,
+ "step": 588
+ },
+ {
+ "epoch": 0.2784869976359338,
+ "grad_norm": 2.90889310836792,
+ "learning_rate": 4.981411810209793e-06,
+ "loss": 0.6054,
+ "step": 589
+ },
+ {
+ "epoch": 0.2789598108747045,
+ "grad_norm": 2.8541409969329834,
+ "learning_rate": 4.981335801993078e-06,
+ "loss": 0.6539,
+ "step": 590
+ },
+ {
+ "epoch": 0.2794326241134752,
+ "grad_norm": 3.1600730419158936,
+ "learning_rate": 4.981259639274465e-06,
+ "loss": 0.6415,
+ "step": 591
+ },
+ {
+ "epoch": 0.2799054373522459,
+ "grad_norm": 3.569376230239868,
+ "learning_rate": 4.981183322058693e-06,
+ "loss": 0.6944,
+ "step": 592
+ },
+ {
+ "epoch": 0.28037825059101656,
+ "grad_norm": 3.067667007446289,
+ "learning_rate": 4.981106850350515e-06,
+ "loss": 0.7378,
+ "step": 593
+ },
+ {
+ "epoch": 0.28085106382978725,
+ "grad_norm": 3.082073450088501,
+ "learning_rate": 4.981030224154693e-06,
+ "loss": 0.693,
+ "step": 594
+ },
+ {
+ "epoch": 0.28132387706855794,
+ "grad_norm": 2.902932643890381,
+ "learning_rate": 4.980953443475998e-06,
+ "loss": 0.6549,
+ "step": 595
+ },
+ {
+ "epoch": 0.2817966903073286,
+ "grad_norm": 2.6821181774139404,
+ "learning_rate": 4.980876508319211e-06,
+ "loss": 0.6231,
+ "step": 596
+ },
+ {
+ "epoch": 0.2822695035460993,
+ "grad_norm": 3.1747355461120605,
+ "learning_rate": 4.9807994186891215e-06,
+ "loss": 0.6826,
+ "step": 597
+ },
+ {
+ "epoch": 0.28274231678487,
+ "grad_norm": 2.6975860595703125,
+ "learning_rate": 4.980722174590531e-06,
+ "loss": 0.6669,
+ "step": 598
+ },
+ {
+ "epoch": 0.2832151300236407,
+ "grad_norm": 2.924285650253296,
+ "learning_rate": 4.9806447760282486e-06,
+ "loss": 0.689,
+ "step": 599
+ },
+ {
+ "epoch": 0.28368794326241137,
+ "grad_norm": 2.941417694091797,
+ "learning_rate": 4.980567223007093e-06,
+ "loss": 0.6672,
+ "step": 600
+ },
+ {
+ "epoch": 0.28416075650118205,
+ "grad_norm": 2.8582186698913574,
+ "learning_rate": 4.980489515531892e-06,
+ "loss": 0.6229,
+ "step": 601
+ },
+ {
+ "epoch": 0.28463356973995274,
+ "grad_norm": 2.6462013721466064,
+ "learning_rate": 4.9804116536074865e-06,
+ "loss": 0.606,
+ "step": 602
+ },
+ {
+ "epoch": 0.2851063829787234,
+ "grad_norm": 2.9029998779296875,
+ "learning_rate": 4.980333637238723e-06,
+ "loss": 0.5915,
+ "step": 603
+ },
+ {
+ "epoch": 0.2855791962174941,
+ "grad_norm": 3.9359042644500732,
+ "learning_rate": 4.980255466430462e-06,
+ "loss": 0.7035,
+ "step": 604
+ },
+ {
+ "epoch": 0.2860520094562648,
+ "grad_norm": 3.200524091720581,
+ "learning_rate": 4.980177141187566e-06,
+ "loss": 0.7156,
+ "step": 605
+ },
+ {
+ "epoch": 0.2865248226950355,
+ "grad_norm": 3.1708686351776123,
+ "learning_rate": 4.980098661514916e-06,
+ "loss": 0.746,
+ "step": 606
+ },
+ {
+ "epoch": 0.28699763593380617,
+ "grad_norm": 2.8926830291748047,
+ "learning_rate": 4.980020027417397e-06,
+ "loss": 0.6282,
+ "step": 607
+ },
+ {
+ "epoch": 0.28747044917257686,
+ "grad_norm": 3.0526294708251953,
+ "learning_rate": 4.979941238899906e-06,
+ "loss": 0.6594,
+ "step": 608
+ },
+ {
+ "epoch": 0.28794326241134754,
+ "grad_norm": 2.9869306087493896,
+ "learning_rate": 4.9798622959673486e-06,
+ "loss": 0.7771,
+ "step": 609
+ },
+ {
+ "epoch": 0.28841607565011823,
+ "grad_norm": 2.7894513607025146,
+ "learning_rate": 4.979783198624638e-06,
+ "loss": 0.6819,
+ "step": 610
+ },
+ {
+ "epoch": 0.28888888888888886,
+ "grad_norm": 2.958575963973999,
+ "learning_rate": 4.9797039468767025e-06,
+ "loss": 0.6474,
+ "step": 611
+ },
+ {
+ "epoch": 0.28936170212765955,
+ "grad_norm": 3.423748016357422,
+ "learning_rate": 4.979624540728475e-06,
+ "loss": 0.7389,
+ "step": 612
+ },
+ {
+ "epoch": 0.28983451536643023,
+ "grad_norm": 2.9641635417938232,
+ "learning_rate": 4.9795449801849e-06,
+ "loss": 0.6005,
+ "step": 613
+ },
+ {
+ "epoch": 0.2903073286052009,
+ "grad_norm": 3.02274227142334,
+ "learning_rate": 4.979465265250933e-06,
+ "loss": 0.6358,
+ "step": 614
+ },
+ {
+ "epoch": 0.2907801418439716,
+ "grad_norm": 3.0562758445739746,
+ "learning_rate": 4.979385395931534e-06,
+ "loss": 0.6313,
+ "step": 615
+ },
+ {
+ "epoch": 0.2912529550827423,
+ "grad_norm": 3.301816701889038,
+ "learning_rate": 4.97930537223168e-06,
+ "loss": 0.7264,
+ "step": 616
+ },
+ {
+ "epoch": 0.291725768321513,
+ "grad_norm": 2.975360870361328,
+ "learning_rate": 4.979225194156351e-06,
+ "loss": 0.613,
+ "step": 617
+ },
+ {
+ "epoch": 0.29219858156028367,
+ "grad_norm": 2.9245030879974365,
+ "learning_rate": 4.97914486171054e-06,
+ "loss": 0.6646,
+ "step": 618
+ },
+ {
+ "epoch": 0.29267139479905435,
+ "grad_norm": 3.1336188316345215,
+ "learning_rate": 4.979064374899249e-06,
+ "loss": 0.6421,
+ "step": 619
+ },
+ {
+ "epoch": 0.29314420803782504,
+ "grad_norm": 3.6298763751983643,
+ "learning_rate": 4.978983733727491e-06,
+ "loss": 0.6433,
+ "step": 620
+ },
+ {
+ "epoch": 0.2936170212765957,
+ "grad_norm": 2.919597625732422,
+ "learning_rate": 4.9789029382002845e-06,
+ "loss": 0.6288,
+ "step": 621
+ },
+ {
+ "epoch": 0.2940898345153664,
+ "grad_norm": 3.2206127643585205,
+ "learning_rate": 4.978821988322662e-06,
+ "loss": 0.7102,
+ "step": 622
+ },
+ {
+ "epoch": 0.2945626477541371,
+ "grad_norm": 3.1767101287841797,
+ "learning_rate": 4.978740884099664e-06,
+ "loss": 0.6722,
+ "step": 623
+ },
+ {
+ "epoch": 0.2950354609929078,
+ "grad_norm": 3.3425452709198,
+ "learning_rate": 4.97865962553634e-06,
+ "loss": 0.6492,
+ "step": 624
+ },
+ {
+ "epoch": 0.29550827423167847,
+ "grad_norm": 3.0408358573913574,
+ "learning_rate": 4.97857821263775e-06,
+ "loss": 0.6522,
+ "step": 625
+ },
+ {
+ "epoch": 0.29598108747044916,
+ "grad_norm": 2.8144783973693848,
+ "learning_rate": 4.978496645408963e-06,
+ "loss": 0.7237,
+ "step": 626
+ },
+ {
+ "epoch": 0.29645390070921984,
+ "grad_norm": 3.7010560035705566,
+ "learning_rate": 4.978414923855057e-06,
+ "loss": 0.7509,
+ "step": 627
+ },
+ {
+ "epoch": 0.29692671394799053,
+ "grad_norm": 2.9438371658325195,
+ "learning_rate": 4.978333047981122e-06,
+ "loss": 0.6244,
+ "step": 628
+ },
+ {
+ "epoch": 0.2973995271867612,
+ "grad_norm": 3.285982370376587,
+ "learning_rate": 4.978251017792255e-06,
+ "loss": 0.7553,
+ "step": 629
+ },
+ {
+ "epoch": 0.2978723404255319,
+ "grad_norm": 3.7021138668060303,
+ "learning_rate": 4.978168833293564e-06,
+ "loss": 0.7859,
+ "step": 630
+ },
+ {
+ "epoch": 0.2983451536643026,
+ "grad_norm": 3.481858730316162,
+ "learning_rate": 4.9780864944901654e-06,
+ "loss": 0.7146,
+ "step": 631
+ },
+ {
+ "epoch": 0.2988179669030733,
+ "grad_norm": 3.693824529647827,
+ "learning_rate": 4.978004001387188e-06,
+ "loss": 0.6608,
+ "step": 632
+ },
+ {
+ "epoch": 0.29929078014184396,
+ "grad_norm": 3.0069146156311035,
+ "learning_rate": 4.9779213539897665e-06,
+ "loss": 0.6506,
+ "step": 633
+ },
+ {
+ "epoch": 0.29976359338061465,
+ "grad_norm": 3.037644147872925,
+ "learning_rate": 4.977838552303048e-06,
+ "loss": 0.6487,
+ "step": 634
+ },
+ {
+ "epoch": 0.30023640661938533,
+ "grad_norm": 3.018554449081421,
+ "learning_rate": 4.977755596332188e-06,
+ "loss": 0.6128,
+ "step": 635
+ },
+ {
+ "epoch": 0.300709219858156,
+ "grad_norm": 3.000312089920044,
+ "learning_rate": 4.977672486082351e-06,
+ "loss": 0.6431,
+ "step": 636
+ },
+ {
+ "epoch": 0.3011820330969267,
+ "grad_norm": 2.836803913116455,
+ "learning_rate": 4.977589221558713e-06,
+ "loss": 0.5914,
+ "step": 637
+ },
+ {
+ "epoch": 0.3016548463356974,
+ "grad_norm": 3.080469846725464,
+ "learning_rate": 4.977505802766457e-06,
+ "loss": 0.7265,
+ "step": 638
+ },
+ {
+ "epoch": 0.3021276595744681,
+ "grad_norm": 3.2245471477508545,
+ "learning_rate": 4.97742222971078e-06,
+ "loss": 0.6895,
+ "step": 639
+ },
+ {
+ "epoch": 0.30260047281323876,
+ "grad_norm": 3.559006452560425,
+ "learning_rate": 4.977338502396882e-06,
+ "loss": 0.7439,
+ "step": 640
+ },
+ {
+ "epoch": 0.30307328605200945,
+ "grad_norm": 2.9116289615631104,
+ "learning_rate": 4.9772546208299795e-06,
+ "loss": 0.6907,
+ "step": 641
+ },
+ {
+ "epoch": 0.30354609929078014,
+ "grad_norm": 3.3645524978637695,
+ "learning_rate": 4.977170585015295e-06,
+ "loss": 0.6983,
+ "step": 642
+ },
+ {
+ "epoch": 0.3040189125295508,
+ "grad_norm": 3.080148458480835,
+ "learning_rate": 4.977086394958058e-06,
+ "loss": 0.7016,
+ "step": 643
+ },
+ {
+ "epoch": 0.3044917257683215,
+ "grad_norm": 2.9276750087738037,
+ "learning_rate": 4.977002050663515e-06,
+ "loss": 0.6509,
+ "step": 644
+ },
+ {
+ "epoch": 0.3049645390070922,
+ "grad_norm": 3.183609962463379,
+ "learning_rate": 4.976917552136914e-06,
+ "loss": 0.6814,
+ "step": 645
+ },
+ {
+ "epoch": 0.3054373522458629,
+ "grad_norm": 3.0980000495910645,
+ "learning_rate": 4.976832899383519e-06,
+ "loss": 0.6319,
+ "step": 646
+ },
+ {
+ "epoch": 0.30591016548463357,
+ "grad_norm": 3.211376190185547,
+ "learning_rate": 4.9767480924086e-06,
+ "loss": 0.6365,
+ "step": 647
+ },
+ {
+ "epoch": 0.30638297872340425,
+ "grad_norm": 3.214430093765259,
+ "learning_rate": 4.976663131217437e-06,
+ "loss": 0.6006,
+ "step": 648
+ },
+ {
+ "epoch": 0.30685579196217494,
+ "grad_norm": 3.0914318561553955,
+ "learning_rate": 4.976578015815321e-06,
+ "loss": 0.7162,
+ "step": 649
+ },
+ {
+ "epoch": 0.3073286052009456,
+ "grad_norm": 2.7644500732421875,
+ "learning_rate": 4.976492746207551e-06,
+ "loss": 0.6045,
+ "step": 650
+ },
+ {
+ "epoch": 0.3078014184397163,
+ "grad_norm": 3.1913280487060547,
+ "learning_rate": 4.9764073223994374e-06,
+ "loss": 0.6796,
+ "step": 651
+ },
+ {
+ "epoch": 0.308274231678487,
+ "grad_norm": 2.8919692039489746,
+ "learning_rate": 4.976321744396299e-06,
+ "loss": 0.6683,
+ "step": 652
+ },
+ {
+ "epoch": 0.3087470449172577,
+ "grad_norm": 2.862234115600586,
+ "learning_rate": 4.976236012203463e-06,
+ "loss": 0.6631,
+ "step": 653
+ },
+ {
+ "epoch": 0.30921985815602837,
+ "grad_norm": 2.9708092212677,
+ "learning_rate": 4.976150125826268e-06,
+ "loss": 0.6326,
+ "step": 654
+ },
+ {
+ "epoch": 0.30969267139479906,
+ "grad_norm": 2.892465353012085,
+ "learning_rate": 4.976064085270063e-06,
+ "loss": 0.6574,
+ "step": 655
+ },
+ {
+ "epoch": 0.31016548463356974,
+ "grad_norm": 3.9215126037597656,
+ "learning_rate": 4.975977890540205e-06,
+ "loss": 0.7351,
+ "step": 656
+ },
+ {
+ "epoch": 0.31063829787234043,
+ "grad_norm": 2.9544081687927246,
+ "learning_rate": 4.975891541642059e-06,
+ "loss": 0.7264,
+ "step": 657
+ },
+ {
+ "epoch": 0.3111111111111111,
+ "grad_norm": 2.995035409927368,
+ "learning_rate": 4.975805038581005e-06,
+ "loss": 0.7405,
+ "step": 658
+ },
+ {
+ "epoch": 0.3115839243498818,
+ "grad_norm": 2.9653120040893555,
+ "learning_rate": 4.975718381362427e-06,
+ "loss": 0.679,
+ "step": 659
+ },
+ {
+ "epoch": 0.3120567375886525,
+ "grad_norm": 2.93976092338562,
+ "learning_rate": 4.9756315699917205e-06,
+ "loss": 0.627,
+ "step": 660
+ },
+ {
+ "epoch": 0.3125295508274232,
+ "grad_norm": 3.106522560119629,
+ "learning_rate": 4.9755446044742915e-06,
+ "loss": 0.6329,
+ "step": 661
+ },
+ {
+ "epoch": 0.31300236406619386,
+ "grad_norm": 3.0238280296325684,
+ "learning_rate": 4.975457484815554e-06,
+ "loss": 0.6643,
+ "step": 662
+ },
+ {
+ "epoch": 0.31347517730496455,
+ "grad_norm": 2.943528175354004,
+ "learning_rate": 4.9753702110209356e-06,
+ "loss": 0.668,
+ "step": 663
+ },
+ {
+ "epoch": 0.31394799054373523,
+ "grad_norm": 2.6840121746063232,
+ "learning_rate": 4.9752827830958676e-06,
+ "loss": 0.5482,
+ "step": 664
+ },
+ {
+ "epoch": 0.3144208037825059,
+ "grad_norm": 2.823875904083252,
+ "learning_rate": 4.975195201045794e-06,
+ "loss": 0.7017,
+ "step": 665
+ },
+ {
+ "epoch": 0.3148936170212766,
+ "grad_norm": 3.148181200027466,
+ "learning_rate": 4.975107464876168e-06,
+ "loss": 0.747,
+ "step": 666
+ },
+ {
+ "epoch": 0.3153664302600473,
+ "grad_norm": 2.630584478378296,
+ "learning_rate": 4.9750195745924545e-06,
+ "loss": 0.5987,
+ "step": 667
+ },
+ {
+ "epoch": 0.315839243498818,
+ "grad_norm": 3.075866460800171,
+ "learning_rate": 4.974931530200124e-06,
+ "loss": 0.664,
+ "step": 668
+ },
+ {
+ "epoch": 0.31631205673758866,
+ "grad_norm": 2.947197914123535,
+ "learning_rate": 4.974843331704659e-06,
+ "loss": 0.631,
+ "step": 669
+ },
+ {
+ "epoch": 0.31678486997635935,
+ "grad_norm": 3.519646644592285,
+ "learning_rate": 4.974754979111552e-06,
+ "loss": 0.7154,
+ "step": 670
+ },
+ {
+ "epoch": 0.31725768321513004,
+ "grad_norm": 2.8687186241149902,
+ "learning_rate": 4.974666472426305e-06,
+ "loss": 0.6366,
+ "step": 671
+ },
+ {
+ "epoch": 0.3177304964539007,
+ "grad_norm": 2.6966612339019775,
+ "learning_rate": 4.974577811654426e-06,
+ "loss": 0.7112,
+ "step": 672
+ },
+ {
+ "epoch": 0.3182033096926714,
+ "grad_norm": 3.1390228271484375,
+ "learning_rate": 4.974488996801439e-06,
+ "loss": 0.6882,
+ "step": 673
+ },
+ {
+ "epoch": 0.3186761229314421,
+ "grad_norm": 3.4667599201202393,
+ "learning_rate": 4.974400027872871e-06,
+ "loss": 0.7153,
+ "step": 674
+ },
+ {
+ "epoch": 0.3191489361702128,
+ "grad_norm": 2.9632184505462646,
+ "learning_rate": 4.974310904874265e-06,
+ "loss": 0.7081,
+ "step": 675
+ },
+ {
+ "epoch": 0.31962174940898347,
+ "grad_norm": 3.46150279045105,
+ "learning_rate": 4.9742216278111666e-06,
+ "loss": 0.6242,
+ "step": 676
+ },
+ {
+ "epoch": 0.32009456264775416,
+ "grad_norm": 3.380403757095337,
+ "learning_rate": 4.974132196689137e-06,
+ "loss": 0.6863,
+ "step": 677
+ },
+ {
+ "epoch": 0.32056737588652484,
+ "grad_norm": 3.4279606342315674,
+ "learning_rate": 4.974042611513746e-06,
+ "loss": 0.6388,
+ "step": 678
+ },
+ {
+ "epoch": 0.3210401891252955,
+ "grad_norm": 2.634523391723633,
+ "learning_rate": 4.973952872290568e-06,
+ "loss": 0.6038,
+ "step": 679
+ },
+ {
+ "epoch": 0.3215130023640662,
+ "grad_norm": 3.19693922996521,
+ "learning_rate": 4.973862979025194e-06,
+ "loss": 0.6383,
+ "step": 680
+ },
+ {
+ "epoch": 0.3219858156028369,
+ "grad_norm": 3.437692165374756,
+ "learning_rate": 4.973772931723218e-06,
+ "loss": 0.7288,
+ "step": 681
+ },
+ {
+ "epoch": 0.3224586288416076,
+ "grad_norm": 2.506301164627075,
+ "learning_rate": 4.97368273039025e-06,
+ "loss": 0.5707,
+ "step": 682
+ },
+ {
+ "epoch": 0.3229314420803783,
+ "grad_norm": 3.0942845344543457,
+ "learning_rate": 4.9735923750319044e-06,
+ "loss": 0.6348,
+ "step": 683
+ },
+ {
+ "epoch": 0.32340425531914896,
+ "grad_norm": 3.0889835357666016,
+ "learning_rate": 4.973501865653809e-06,
+ "loss": 0.6697,
+ "step": 684
+ },
+ {
+ "epoch": 0.32387706855791965,
+ "grad_norm": 3.0391931533813477,
+ "learning_rate": 4.973411202261598e-06,
+ "loss": 0.7091,
+ "step": 685
+ },
+ {
+ "epoch": 0.32434988179669033,
+ "grad_norm": 3.0333497524261475,
+ "learning_rate": 4.973320384860917e-06,
+ "loss": 0.6403,
+ "step": 686
+ },
+ {
+ "epoch": 0.324822695035461,
+ "grad_norm": 2.9714622497558594,
+ "learning_rate": 4.973229413457421e-06,
+ "loss": 0.6977,
+ "step": 687
+ },
+ {
+ "epoch": 0.3252955082742317,
+ "grad_norm": 3.057558298110962,
+ "learning_rate": 4.973138288056774e-06,
+ "loss": 0.7236,
+ "step": 688
+ },
+ {
+ "epoch": 0.3257683215130024,
+ "grad_norm": 2.921093463897705,
+ "learning_rate": 4.97304700866465e-06,
+ "loss": 0.576,
+ "step": 689
+ },
+ {
+ "epoch": 0.3262411347517731,
+ "grad_norm": 3.0287256240844727,
+ "learning_rate": 4.972955575286732e-06,
+ "loss": 0.7077,
+ "step": 690
+ },
+ {
+ "epoch": 0.32671394799054376,
+ "grad_norm": 2.8621346950531006,
+ "learning_rate": 4.972863987928716e-06,
+ "loss": 0.6952,
+ "step": 691
+ },
+ {
+ "epoch": 0.3271867612293144,
+ "grad_norm": 2.631359100341797,
+ "learning_rate": 4.9727722465963006e-06,
+ "loss": 0.6931,
+ "step": 692
+ },
+ {
+ "epoch": 0.3276595744680851,
+ "grad_norm": 2.8484320640563965,
+ "learning_rate": 4.972680351295201e-06,
+ "loss": 0.6292,
+ "step": 693
+ },
+ {
+ "epoch": 0.32813238770685577,
+ "grad_norm": 2.593001365661621,
+ "learning_rate": 4.972588302031138e-06,
+ "loss": 0.5942,
+ "step": 694
+ },
+ {
+ "epoch": 0.32860520094562645,
+ "grad_norm": 2.6321065425872803,
+ "learning_rate": 4.972496098809844e-06,
+ "loss": 0.65,
+ "step": 695
+ },
+ {
+ "epoch": 0.32907801418439714,
+ "grad_norm": 3.2516732215881348,
+ "learning_rate": 4.972403741637059e-06,
+ "loss": 0.7385,
+ "step": 696
+ },
+ {
+ "epoch": 0.3295508274231678,
+ "grad_norm": 3.180854320526123,
+ "learning_rate": 4.972311230518535e-06,
+ "loss": 0.6569,
+ "step": 697
+ },
+ {
+ "epoch": 0.3300236406619385,
+ "grad_norm": 4.161016941070557,
+ "learning_rate": 4.972218565460031e-06,
+ "loss": 0.6416,
+ "step": 698
+ },
+ {
+ "epoch": 0.3304964539007092,
+ "grad_norm": 3.153897762298584,
+ "learning_rate": 4.972125746467317e-06,
+ "loss": 0.7196,
+ "step": 699
+ },
+ {
+ "epoch": 0.3309692671394799,
+ "grad_norm": 2.9595556259155273,
+ "learning_rate": 4.972032773546173e-06,
+ "loss": 0.7093,
+ "step": 700
+ },
+ {
+ "epoch": 0.33144208037825057,
+ "grad_norm": 3.1086833477020264,
+ "learning_rate": 4.9719396467023875e-06,
+ "loss": 0.6963,
+ "step": 701
+ },
+ {
+ "epoch": 0.33191489361702126,
+ "grad_norm": 2.958921432495117,
+ "learning_rate": 4.971846365941759e-06,
+ "loss": 0.6518,
+ "step": 702
+ },
+ {
+ "epoch": 0.33238770685579194,
+ "grad_norm": 2.8745479583740234,
+ "learning_rate": 4.971752931270096e-06,
+ "loss": 0.696,
+ "step": 703
+ },
+ {
+ "epoch": 0.33286052009456263,
+ "grad_norm": 3.224358558654785,
+ "learning_rate": 4.971659342693217e-06,
+ "loss": 0.6769,
+ "step": 704
+ },
+ {
+ "epoch": 0.3333333333333333,
+ "grad_norm": 2.696319580078125,
+ "learning_rate": 4.9715656002169486e-06,
+ "loss": 0.6833,
+ "step": 705
+ },
+ {
+ "epoch": 0.333806146572104,
+ "grad_norm": 2.9283502101898193,
+ "learning_rate": 4.971471703847127e-06,
+ "loss": 0.6784,
+ "step": 706
+ },
+ {
+ "epoch": 0.3342789598108747,
+ "grad_norm": 2.654914140701294,
+ "learning_rate": 4.9713776535896e-06,
+ "loss": 0.6337,
+ "step": 707
+ },
+ {
+ "epoch": 0.3347517730496454,
+ "grad_norm": 3.041555643081665,
+ "learning_rate": 4.971283449450224e-06,
+ "loss": 0.6227,
+ "step": 708
+ },
+ {
+ "epoch": 0.33522458628841606,
+ "grad_norm": 2.893008232116699,
+ "learning_rate": 4.971189091434863e-06,
+ "loss": 0.655,
+ "step": 709
+ },
+ {
+ "epoch": 0.33569739952718675,
+ "grad_norm": 2.8806653022766113,
+ "learning_rate": 4.971094579549393e-06,
+ "loss": 0.7077,
+ "step": 710
+ },
+ {
+ "epoch": 0.33617021276595743,
+ "grad_norm": 3.4830048084259033,
+ "learning_rate": 4.9709999137996986e-06,
+ "loss": 0.7461,
+ "step": 711
+ },
+ {
+ "epoch": 0.3366430260047281,
+ "grad_norm": 3.155444860458374,
+ "learning_rate": 4.970905094191674e-06,
+ "loss": 0.652,
+ "step": 712
+ },
+ {
+ "epoch": 0.3371158392434988,
+ "grad_norm": 2.7608706951141357,
+ "learning_rate": 4.970810120731225e-06,
+ "loss": 0.684,
+ "step": 713
+ },
+ {
+ "epoch": 0.3375886524822695,
+ "grad_norm": 2.8209474086761475,
+ "learning_rate": 4.970714993424265e-06,
+ "loss": 0.6009,
+ "step": 714
+ },
+ {
+ "epoch": 0.3380614657210402,
+ "grad_norm": 3.6532654762268066,
+ "learning_rate": 4.9706197122767145e-06,
+ "loss": 0.702,
+ "step": 715
+ },
+ {
+ "epoch": 0.33853427895981086,
+ "grad_norm": 2.6276566982269287,
+ "learning_rate": 4.970524277294508e-06,
+ "loss": 0.6338,
+ "step": 716
+ },
+ {
+ "epoch": 0.33900709219858155,
+ "grad_norm": 3.509871482849121,
+ "learning_rate": 4.970428688483589e-06,
+ "loss": 0.6853,
+ "step": 717
+ },
+ {
+ "epoch": 0.33947990543735224,
+ "grad_norm": 5.332682132720947,
+ "learning_rate": 4.970332945849906e-06,
+ "loss": 0.6684,
+ "step": 718
+ },
+ {
+ "epoch": 0.3399527186761229,
+ "grad_norm": 2.718801975250244,
+ "learning_rate": 4.970237049399424e-06,
+ "loss": 0.6676,
+ "step": 719
+ },
+ {
+ "epoch": 0.3404255319148936,
+ "grad_norm": 3.891003131866455,
+ "learning_rate": 4.970140999138112e-06,
+ "loss": 0.7043,
+ "step": 720
+ },
+ {
+ "epoch": 0.3408983451536643,
+ "grad_norm": 2.8863155841827393,
+ "learning_rate": 4.970044795071951e-06,
+ "loss": 0.6563,
+ "step": 721
+ },
+ {
+ "epoch": 0.341371158392435,
+ "grad_norm": 3.2527518272399902,
+ "learning_rate": 4.969948437206932e-06,
+ "loss": 0.7244,
+ "step": 722
+ },
+ {
+ "epoch": 0.34184397163120567,
+ "grad_norm": 2.9726758003234863,
+ "learning_rate": 4.969851925549054e-06,
+ "loss": 0.6548,
+ "step": 723
+ },
+ {
+ "epoch": 0.34231678486997635,
+ "grad_norm": 3.118309497833252,
+ "learning_rate": 4.969755260104327e-06,
+ "loss": 0.7293,
+ "step": 724
+ },
+ {
+ "epoch": 0.34278959810874704,
+ "grad_norm": 3.373068332672119,
+ "learning_rate": 4.969658440878769e-06,
+ "loss": 0.6444,
+ "step": 725
+ },
+ {
+ "epoch": 0.3432624113475177,
+ "grad_norm": 2.7157437801361084,
+ "learning_rate": 4.969561467878409e-06,
+ "loss": 0.642,
+ "step": 726
+ },
+ {
+ "epoch": 0.3437352245862884,
+ "grad_norm": 2.58929705619812,
+ "learning_rate": 4.969464341109285e-06,
+ "loss": 0.6165,
+ "step": 727
+ },
+ {
+ "epoch": 0.3442080378250591,
+ "grad_norm": 2.8811306953430176,
+ "learning_rate": 4.969367060577445e-06,
+ "loss": 0.7127,
+ "step": 728
+ },
+ {
+ "epoch": 0.3446808510638298,
+ "grad_norm": 3.494358539581299,
+ "learning_rate": 4.969269626288946e-06,
+ "loss": 0.7103,
+ "step": 729
+ },
+ {
+ "epoch": 0.34515366430260047,
+ "grad_norm": 2.9753928184509277,
+ "learning_rate": 4.969172038249855e-06,
+ "loss": 0.6911,
+ "step": 730
+ },
+ {
+ "epoch": 0.34562647754137116,
+ "grad_norm": 3.2885913848876953,
+ "learning_rate": 4.969074296466247e-06,
+ "loss": 0.6968,
+ "step": 731
+ },
+ {
+ "epoch": 0.34609929078014184,
+ "grad_norm": 2.7564568519592285,
+ "learning_rate": 4.968976400944211e-06,
+ "loss": 0.6843,
+ "step": 732
+ },
+ {
+ "epoch": 0.34657210401891253,
+ "grad_norm": 2.9255006313323975,
+ "learning_rate": 4.96887835168984e-06,
+ "loss": 0.6024,
+ "step": 733
+ },
+ {
+ "epoch": 0.3470449172576832,
+ "grad_norm": 3.1808290481567383,
+ "learning_rate": 4.968780148709239e-06,
+ "loss": 0.7377,
+ "step": 734
+ },
+ {
+ "epoch": 0.3475177304964539,
+ "grad_norm": 2.956666946411133,
+ "learning_rate": 4.968681792008523e-06,
+ "loss": 0.65,
+ "step": 735
+ },
+ {
+ "epoch": 0.3479905437352246,
+ "grad_norm": 2.9631855487823486,
+ "learning_rate": 4.9685832815938175e-06,
+ "loss": 0.677,
+ "step": 736
+ },
+ {
+ "epoch": 0.3484633569739953,
+ "grad_norm": 2.501917600631714,
+ "learning_rate": 4.968484617471256e-06,
+ "loss": 0.6282,
+ "step": 737
+ },
+ {
+ "epoch": 0.34893617021276596,
+ "grad_norm": 2.750779628753662,
+ "learning_rate": 4.968385799646981e-06,
+ "loss": 0.6507,
+ "step": 738
+ },
+ {
+ "epoch": 0.34940898345153665,
+ "grad_norm": 2.872300624847412,
+ "learning_rate": 4.968286828127146e-06,
+ "loss": 0.5949,
+ "step": 739
+ },
+ {
+ "epoch": 0.34988179669030733,
+ "grad_norm": 2.6316142082214355,
+ "learning_rate": 4.9681877029179124e-06,
+ "loss": 0.6328,
+ "step": 740
+ },
+ {
+ "epoch": 0.350354609929078,
+ "grad_norm": 3.244364023208618,
+ "learning_rate": 4.968088424025454e-06,
+ "loss": 0.7393,
+ "step": 741
+ },
+ {
+ "epoch": 0.3508274231678487,
+ "grad_norm": 2.620465040206909,
+ "learning_rate": 4.967988991455951e-06,
+ "loss": 0.6797,
+ "step": 742
+ },
+ {
+ "epoch": 0.3513002364066194,
+ "grad_norm": 2.854513645172119,
+ "learning_rate": 4.967889405215596e-06,
+ "loss": 0.6368,
+ "step": 743
+ },
+ {
+ "epoch": 0.3517730496453901,
+ "grad_norm": 2.579854726791382,
+ "learning_rate": 4.9677896653105886e-06,
+ "loss": 0.6489,
+ "step": 744
+ },
+ {
+ "epoch": 0.35224586288416077,
+ "grad_norm": 3.0697381496429443,
+ "learning_rate": 4.96768977174714e-06,
+ "loss": 0.6313,
+ "step": 745
+ },
+ {
+ "epoch": 0.35271867612293145,
+ "grad_norm": 3.369338035583496,
+ "learning_rate": 4.96758972453147e-06,
+ "loss": 0.7416,
+ "step": 746
+ },
+ {
+ "epoch": 0.35319148936170214,
+ "grad_norm": 2.836221933364868,
+ "learning_rate": 4.967489523669807e-06,
+ "loss": 0.6422,
+ "step": 747
+ },
+ {
+ "epoch": 0.3536643026004728,
+ "grad_norm": 2.929579496383667,
+ "learning_rate": 4.967389169168392e-06,
+ "loss": 0.6482,
+ "step": 748
+ },
+ {
+ "epoch": 0.3541371158392435,
+ "grad_norm": 2.9243831634521484,
+ "learning_rate": 4.967288661033472e-06,
+ "loss": 0.5813,
+ "step": 749
+ },
+ {
+ "epoch": 0.3546099290780142,
+ "grad_norm": 3.7555336952209473,
+ "learning_rate": 4.967187999271306e-06,
+ "loss": 0.6501,
+ "step": 750
+ },
+ {
+ "epoch": 0.3550827423167849,
+ "grad_norm": 3.4279143810272217,
+ "learning_rate": 4.9670871838881615e-06,
+ "loss": 0.6326,
+ "step": 751
+ },
+ {
+ "epoch": 0.35555555555555557,
+ "grad_norm": 2.875066041946411,
+ "learning_rate": 4.9669862148903166e-06,
+ "loss": 0.664,
+ "step": 752
+ },
+ {
+ "epoch": 0.35602836879432626,
+ "grad_norm": 3.130394697189331,
+ "learning_rate": 4.966885092284057e-06,
+ "loss": 0.706,
+ "step": 753
+ },
+ {
+ "epoch": 0.35650118203309694,
+ "grad_norm": 2.9606287479400635,
+ "learning_rate": 4.96678381607568e-06,
+ "loss": 0.693,
+ "step": 754
+ },
+ {
+ "epoch": 0.35697399527186763,
+ "grad_norm": 3.0584909915924072,
+ "learning_rate": 4.966682386271491e-06,
+ "loss": 0.6034,
+ "step": 755
+ },
+ {
+ "epoch": 0.3574468085106383,
+ "grad_norm": 2.8215200901031494,
+ "learning_rate": 4.966580802877805e-06,
+ "loss": 0.6217,
+ "step": 756
+ },
+ {
+ "epoch": 0.357919621749409,
+ "grad_norm": 2.7348055839538574,
+ "learning_rate": 4.966479065900949e-06,
+ "loss": 0.6194,
+ "step": 757
+ },
+ {
+ "epoch": 0.3583924349881797,
+ "grad_norm": 3.2347466945648193,
+ "learning_rate": 4.966377175347257e-06,
+ "loss": 0.6377,
+ "step": 758
+ },
+ {
+ "epoch": 0.3588652482269504,
+ "grad_norm": 3.311845302581787,
+ "learning_rate": 4.966275131223072e-06,
+ "loss": 0.6234,
+ "step": 759
+ },
+ {
+ "epoch": 0.35933806146572106,
+ "grad_norm": 3.0384368896484375,
+ "learning_rate": 4.96617293353475e-06,
+ "loss": 0.609,
+ "step": 760
+ },
+ {
+ "epoch": 0.35981087470449175,
+ "grad_norm": 3.516854763031006,
+ "learning_rate": 4.966070582288653e-06,
+ "loss": 0.6627,
+ "step": 761
+ },
+ {
+ "epoch": 0.36028368794326243,
+ "grad_norm": 3.2425215244293213,
+ "learning_rate": 4.9659680774911534e-06,
+ "loss": 0.7355,
+ "step": 762
+ },
+ {
+ "epoch": 0.3607565011820331,
+ "grad_norm": 3.2665750980377197,
+ "learning_rate": 4.965865419148636e-06,
+ "loss": 0.6787,
+ "step": 763
+ },
+ {
+ "epoch": 0.3612293144208038,
+ "grad_norm": 2.729428291320801,
+ "learning_rate": 4.96576260726749e-06,
+ "loss": 0.6272,
+ "step": 764
+ },
+ {
+ "epoch": 0.3617021276595745,
+ "grad_norm": 3.299969434738159,
+ "learning_rate": 4.965659641854119e-06,
+ "loss": 0.6552,
+ "step": 765
+ },
+ {
+ "epoch": 0.3621749408983452,
+ "grad_norm": 2.7090916633605957,
+ "learning_rate": 4.965556522914934e-06,
+ "loss": 0.6661,
+ "step": 766
+ },
+ {
+ "epoch": 0.36264775413711586,
+ "grad_norm": 2.488846778869629,
+ "learning_rate": 4.965453250456355e-06,
+ "loss": 0.5821,
+ "step": 767
+ },
+ {
+ "epoch": 0.36312056737588655,
+ "grad_norm": 2.5267233848571777,
+ "learning_rate": 4.965349824484813e-06,
+ "loss": 0.5593,
+ "step": 768
+ },
+ {
+ "epoch": 0.36359338061465724,
+ "grad_norm": 3.0646679401397705,
+ "learning_rate": 4.965246245006748e-06,
+ "loss": 0.6341,
+ "step": 769
+ },
+ {
+ "epoch": 0.3640661938534279,
+ "grad_norm": 2.9877712726593018,
+ "learning_rate": 4.965142512028609e-06,
+ "loss": 0.7202,
+ "step": 770
+ },
+ {
+ "epoch": 0.3645390070921986,
+ "grad_norm": 3.7494113445281982,
+ "learning_rate": 4.965038625556854e-06,
+ "loss": 0.7643,
+ "step": 771
+ },
+ {
+ "epoch": 0.3650118203309693,
+ "grad_norm": 2.8382890224456787,
+ "learning_rate": 4.964934585597954e-06,
+ "loss": 0.6522,
+ "step": 772
+ },
+ {
+ "epoch": 0.3654846335697399,
+ "grad_norm": 3.091655731201172,
+ "learning_rate": 4.9648303921583854e-06,
+ "loss": 0.7117,
+ "step": 773
+ },
+ {
+ "epoch": 0.3659574468085106,
+ "grad_norm": 3.0608325004577637,
+ "learning_rate": 4.964726045244635e-06,
+ "loss": 0.6538,
+ "step": 774
+ },
+ {
+ "epoch": 0.3664302600472813,
+ "grad_norm": 2.8492867946624756,
+ "learning_rate": 4.964621544863203e-06,
+ "loss": 0.6079,
+ "step": 775
+ },
+ {
+ "epoch": 0.366903073286052,
+ "grad_norm": 3.0669894218444824,
+ "learning_rate": 4.964516891020594e-06,
+ "loss": 0.6223,
+ "step": 776
+ },
+ {
+ "epoch": 0.36737588652482267,
+ "grad_norm": 3.089984893798828,
+ "learning_rate": 4.964412083723325e-06,
+ "loss": 0.671,
+ "step": 777
+ },
+ {
+ "epoch": 0.36784869976359336,
+ "grad_norm": 2.905242443084717,
+ "learning_rate": 4.964307122977921e-06,
+ "loss": 0.62,
+ "step": 778
+ },
+ {
+ "epoch": 0.36832151300236404,
+ "grad_norm": 3.954436779022217,
+ "learning_rate": 4.964202008790918e-06,
+ "loss": 0.6535,
+ "step": 779
+ },
+ {
+ "epoch": 0.36879432624113473,
+ "grad_norm": 2.6026058197021484,
+ "learning_rate": 4.9640967411688615e-06,
+ "loss": 0.5865,
+ "step": 780
+ },
+ {
+ "epoch": 0.3692671394799054,
+ "grad_norm": 2.9876346588134766,
+ "learning_rate": 4.963991320118306e-06,
+ "loss": 0.6698,
+ "step": 781
+ },
+ {
+ "epoch": 0.3697399527186761,
+ "grad_norm": 2.9411263465881348,
+ "learning_rate": 4.963885745645815e-06,
+ "loss": 0.6173,
+ "step": 782
+ },
+ {
+ "epoch": 0.3702127659574468,
+ "grad_norm": 2.5679805278778076,
+ "learning_rate": 4.963780017757962e-06,
+ "loss": 0.6285,
+ "step": 783
+ },
+ {
+ "epoch": 0.3706855791962175,
+ "grad_norm": 3.3100640773773193,
+ "learning_rate": 4.963674136461332e-06,
+ "loss": 0.5968,
+ "step": 784
+ },
+ {
+ "epoch": 0.37115839243498816,
+ "grad_norm": 3.1293699741363525,
+ "learning_rate": 4.963568101762515e-06,
+ "loss": 0.697,
+ "step": 785
+ },
+ {
+ "epoch": 0.37163120567375885,
+ "grad_norm": 3.043853759765625,
+ "learning_rate": 4.963461913668115e-06,
+ "loss": 0.5881,
+ "step": 786
+ },
+ {
+ "epoch": 0.37210401891252953,
+ "grad_norm": 3.07351016998291,
+ "learning_rate": 4.963355572184744e-06,
+ "loss": 0.6307,
+ "step": 787
+ },
+ {
+ "epoch": 0.3725768321513002,
+ "grad_norm": 2.7381317615509033,
+ "learning_rate": 4.9632490773190225e-06,
+ "loss": 0.716,
+ "step": 788
+ },
+ {
+ "epoch": 0.3730496453900709,
+ "grad_norm": 2.892221450805664,
+ "learning_rate": 4.963142429077582e-06,
+ "loss": 0.6867,
+ "step": 789
+ },
+ {
+ "epoch": 0.3735224586288416,
+ "grad_norm": 3.133122205734253,
+ "learning_rate": 4.963035627467064e-06,
+ "loss": 0.659,
+ "step": 790
+ },
+ {
+ "epoch": 0.3739952718676123,
+ "grad_norm": 3.032599925994873,
+ "learning_rate": 4.962928672494116e-06,
+ "loss": 0.6848,
+ "step": 791
+ },
+ {
+ "epoch": 0.37446808510638296,
+ "grad_norm": 3.0076355934143066,
+ "learning_rate": 4.9628215641654e-06,
+ "loss": 0.6549,
+ "step": 792
+ },
+ {
+ "epoch": 0.37494089834515365,
+ "grad_norm": 2.8904454708099365,
+ "learning_rate": 4.962714302487585e-06,
+ "loss": 0.6484,
+ "step": 793
+ },
+ {
+ "epoch": 0.37541371158392434,
+ "grad_norm": 2.881364107131958,
+ "learning_rate": 4.9626068874673486e-06,
+ "loss": 0.721,
+ "step": 794
+ },
+ {
+ "epoch": 0.375886524822695,
+ "grad_norm": 3.11668062210083,
+ "learning_rate": 4.962499319111379e-06,
+ "loss": 0.7824,
+ "step": 795
+ },
+ {
+ "epoch": 0.3763593380614657,
+ "grad_norm": 2.9201436042785645,
+ "learning_rate": 4.962391597426374e-06,
+ "loss": 0.6911,
+ "step": 796
+ },
+ {
+ "epoch": 0.3768321513002364,
+ "grad_norm": 2.926598072052002,
+ "learning_rate": 4.962283722419043e-06,
+ "loss": 0.6715,
+ "step": 797
+ },
+ {
+ "epoch": 0.3773049645390071,
+ "grad_norm": 2.7267675399780273,
+ "learning_rate": 4.962175694096101e-06,
+ "loss": 0.6111,
+ "step": 798
+ },
+ {
+ "epoch": 0.37777777777777777,
+ "grad_norm": 3.194031000137329,
+ "learning_rate": 4.962067512464275e-06,
+ "loss": 0.6558,
+ "step": 799
+ },
+ {
+ "epoch": 0.37825059101654845,
+ "grad_norm": 2.6249136924743652,
+ "learning_rate": 4.9619591775303e-06,
+ "loss": 0.6166,
+ "step": 800
+ },
+ {
+ "epoch": 0.37872340425531914,
+ "grad_norm": 2.6356167793273926,
+ "learning_rate": 4.961850689300923e-06,
+ "loss": 0.6112,
+ "step": 801
+ },
+ {
+ "epoch": 0.3791962174940898,
+ "grad_norm": 3.030724287033081,
+ "learning_rate": 4.961742047782898e-06,
+ "loss": 0.6511,
+ "step": 802
+ },
+ {
+ "epoch": 0.3796690307328605,
+ "grad_norm": 3.4987757205963135,
+ "learning_rate": 4.96163325298299e-06,
+ "loss": 0.5888,
+ "step": 803
+ },
+ {
+ "epoch": 0.3801418439716312,
+ "grad_norm": 3.0371780395507812,
+ "learning_rate": 4.961524304907974e-06,
+ "loss": 0.6385,
+ "step": 804
+ },
+ {
+ "epoch": 0.3806146572104019,
+ "grad_norm": 3.302570104598999,
+ "learning_rate": 4.961415203564632e-06,
+ "loss": 0.6515,
+ "step": 805
+ },
+ {
+ "epoch": 0.38108747044917257,
+ "grad_norm": 2.7597038745880127,
+ "learning_rate": 4.961305948959759e-06,
+ "loss": 0.6126,
+ "step": 806
+ },
+ {
+ "epoch": 0.38156028368794326,
+ "grad_norm": 2.789811849594116,
+ "learning_rate": 4.9611965411001575e-06,
+ "loss": 0.6601,
+ "step": 807
+ },
+ {
+ "epoch": 0.38203309692671394,
+ "grad_norm": 3.0403921604156494,
+ "learning_rate": 4.961086979992639e-06,
+ "loss": 0.6947,
+ "step": 808
+ },
+ {
+ "epoch": 0.38250591016548463,
+ "grad_norm": 3.2139980792999268,
+ "learning_rate": 4.960977265644026e-06,
+ "loss": 0.6876,
+ "step": 809
+ },
+ {
+ "epoch": 0.3829787234042553,
+ "grad_norm": 2.918515205383301,
+ "learning_rate": 4.960867398061149e-06,
+ "loss": 0.5997,
+ "step": 810
+ },
+ {
+ "epoch": 0.383451536643026,
+ "grad_norm": 3.197636604309082,
+ "learning_rate": 4.9607573772508495e-06,
+ "loss": 0.5754,
+ "step": 811
+ },
+ {
+ "epoch": 0.3839243498817967,
+ "grad_norm": 2.8848466873168945,
+ "learning_rate": 4.960647203219979e-06,
+ "loss": 0.6424,
+ "step": 812
+ },
+ {
+ "epoch": 0.3843971631205674,
+ "grad_norm": 3.4810187816619873,
+ "learning_rate": 4.960536875975397e-06,
+ "loss": 0.6851,
+ "step": 813
+ },
+ {
+ "epoch": 0.38486997635933806,
+ "grad_norm": 3.713934898376465,
+ "learning_rate": 4.960426395523972e-06,
+ "loss": 0.6122,
+ "step": 814
+ },
+ {
+ "epoch": 0.38534278959810875,
+ "grad_norm": 2.862600803375244,
+ "learning_rate": 4.960315761872585e-06,
+ "loss": 0.6493,
+ "step": 815
+ },
+ {
+ "epoch": 0.38581560283687943,
+ "grad_norm": 3.133882522583008,
+ "learning_rate": 4.960204975028123e-06,
+ "loss": 0.7535,
+ "step": 816
+ },
+ {
+ "epoch": 0.3862884160756501,
+ "grad_norm": 3.1526732444763184,
+ "learning_rate": 4.960094034997485e-06,
+ "loss": 0.6512,
+ "step": 817
+ },
+ {
+ "epoch": 0.3867612293144208,
+ "grad_norm": 2.7213544845581055,
+ "learning_rate": 4.959982941787579e-06,
+ "loss": 0.6121,
+ "step": 818
+ },
+ {
+ "epoch": 0.3872340425531915,
+ "grad_norm": 3.4935851097106934,
+ "learning_rate": 4.9598716954053214e-06,
+ "loss": 0.7852,
+ "step": 819
+ },
+ {
+ "epoch": 0.3877068557919622,
+ "grad_norm": 2.691016435623169,
+ "learning_rate": 4.9597602958576395e-06,
+ "loss": 0.6861,
+ "step": 820
+ },
+ {
+ "epoch": 0.38817966903073287,
+ "grad_norm": 2.8621015548706055,
+ "learning_rate": 4.959648743151469e-06,
+ "loss": 0.6262,
+ "step": 821
+ },
+ {
+ "epoch": 0.38865248226950355,
+ "grad_norm": 3.3887462615966797,
+ "learning_rate": 4.959537037293758e-06,
+ "loss": 0.7103,
+ "step": 822
+ },
+ {
+ "epoch": 0.38912529550827424,
+ "grad_norm": 2.7565438747406006,
+ "learning_rate": 4.95942517829146e-06,
+ "loss": 0.6471,
+ "step": 823
+ },
+ {
+ "epoch": 0.3895981087470449,
+ "grad_norm": 2.7920358180999756,
+ "learning_rate": 4.959313166151541e-06,
+ "loss": 0.6239,
+ "step": 824
+ },
+ {
+ "epoch": 0.3900709219858156,
+ "grad_norm": 3.18904185295105,
+ "learning_rate": 4.959201000880973e-06,
+ "loss": 0.7461,
+ "step": 825
+ },
+ {
+ "epoch": 0.3905437352245863,
+ "grad_norm": 2.727872371673584,
+ "learning_rate": 4.959088682486743e-06,
+ "loss": 0.6333,
+ "step": 826
+ },
+ {
+ "epoch": 0.391016548463357,
+ "grad_norm": 2.906378746032715,
+ "learning_rate": 4.958976210975844e-06,
+ "loss": 0.7547,
+ "step": 827
+ },
+ {
+ "epoch": 0.39148936170212767,
+ "grad_norm": 2.96482515335083,
+ "learning_rate": 4.958863586355278e-06,
+ "loss": 0.6312,
+ "step": 828
+ },
+ {
+ "epoch": 0.39196217494089836,
+ "grad_norm": 3.2890889644622803,
+ "learning_rate": 4.958750808632059e-06,
+ "loss": 0.6943,
+ "step": 829
+ },
+ {
+ "epoch": 0.39243498817966904,
+ "grad_norm": 2.7004311084747314,
+ "learning_rate": 4.958637877813207e-06,
+ "loss": 0.5918,
+ "step": 830
+ },
+ {
+ "epoch": 0.39290780141843973,
+ "grad_norm": 2.7487950325012207,
+ "learning_rate": 4.9585247939057566e-06,
+ "loss": 0.6201,
+ "step": 831
+ },
+ {
+ "epoch": 0.3933806146572104,
+ "grad_norm": 2.7873897552490234,
+ "learning_rate": 4.958411556916747e-06,
+ "loss": 0.6268,
+ "step": 832
+ },
+ {
+ "epoch": 0.3938534278959811,
+ "grad_norm": 2.8501343727111816,
+ "learning_rate": 4.958298166853229e-06,
+ "loss": 0.7119,
+ "step": 833
+ },
+ {
+ "epoch": 0.3943262411347518,
+ "grad_norm": 3.0391547679901123,
+ "learning_rate": 4.958184623722265e-06,
+ "loss": 0.6375,
+ "step": 834
+ },
+ {
+ "epoch": 0.3947990543735225,
+ "grad_norm": 2.850520133972168,
+ "learning_rate": 4.958070927530922e-06,
+ "loss": 0.5962,
+ "step": 835
+ },
+ {
+ "epoch": 0.39527186761229316,
+ "grad_norm": 3.351914644241333,
+ "learning_rate": 4.957957078286281e-06,
+ "loss": 0.7247,
+ "step": 836
+ },
+ {
+ "epoch": 0.39574468085106385,
+ "grad_norm": 2.9559543132781982,
+ "learning_rate": 4.957843075995431e-06,
+ "loss": 0.6571,
+ "step": 837
+ },
+ {
+ "epoch": 0.39621749408983453,
+ "grad_norm": 3.225785255432129,
+ "learning_rate": 4.95772892066547e-06,
+ "loss": 0.7074,
+ "step": 838
+ },
+ {
+ "epoch": 0.3966903073286052,
+ "grad_norm": 2.7842373847961426,
+ "learning_rate": 4.957614612303505e-06,
+ "loss": 0.6469,
+ "step": 839
+ },
+ {
+ "epoch": 0.3971631205673759,
+ "grad_norm": 4.249724864959717,
+ "learning_rate": 4.957500150916655e-06,
+ "loss": 0.741,
+ "step": 840
+ },
+ {
+ "epoch": 0.3976359338061466,
+ "grad_norm": 3.138221263885498,
+ "learning_rate": 4.957385536512046e-06,
+ "loss": 0.6676,
+ "step": 841
+ },
+ {
+ "epoch": 0.3981087470449173,
+ "grad_norm": 3.456423759460449,
+ "learning_rate": 4.957270769096816e-06,
+ "loss": 0.6877,
+ "step": 842
+ },
+ {
+ "epoch": 0.39858156028368796,
+ "grad_norm": 2.8676278591156006,
+ "learning_rate": 4.957155848678109e-06,
+ "loss": 0.5986,
+ "step": 843
+ },
+ {
+ "epoch": 0.39905437352245865,
+ "grad_norm": 2.705324411392212,
+ "learning_rate": 4.957040775263082e-06,
+ "loss": 0.6356,
+ "step": 844
+ },
+ {
+ "epoch": 0.39952718676122934,
+ "grad_norm": 3.0767486095428467,
+ "learning_rate": 4.9569255488589e-06,
+ "loss": 0.6844,
+ "step": 845
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 2.7787704467773438,
+ "learning_rate": 4.956810169472736e-06,
+ "loss": 0.6641,
+ "step": 846
+ },
+ {
+ "epoch": 0.4004728132387707,
+ "grad_norm": 2.584277868270874,
+ "learning_rate": 4.956694637111777e-06,
+ "loss": 0.6256,
+ "step": 847
+ },
+ {
+ "epoch": 0.4009456264775414,
+ "grad_norm": 2.751641273498535,
+ "learning_rate": 4.956578951783215e-06,
+ "loss": 0.5954,
+ "step": 848
+ },
+ {
+ "epoch": 0.4014184397163121,
+ "grad_norm": 3.0181658267974854,
+ "learning_rate": 4.956463113494253e-06,
+ "loss": 0.6569,
+ "step": 849
+ },
+ {
+ "epoch": 0.40189125295508277,
+ "grad_norm": 3.0933220386505127,
+ "learning_rate": 4.956347122252104e-06,
+ "loss": 0.6248,
+ "step": 850
+ },
+ {
+ "epoch": 0.40236406619385345,
+ "grad_norm": 3.3767428398132324,
+ "learning_rate": 4.956230978063991e-06,
+ "loss": 0.719,
+ "step": 851
+ },
+ {
+ "epoch": 0.40283687943262414,
+ "grad_norm": 3.7666573524475098,
+ "learning_rate": 4.956114680937145e-06,
+ "loss": 0.6467,
+ "step": 852
+ },
+ {
+ "epoch": 0.4033096926713948,
+ "grad_norm": 2.9836843013763428,
+ "learning_rate": 4.955998230878808e-06,
+ "loss": 0.6993,
+ "step": 853
+ },
+ {
+ "epoch": 0.4037825059101655,
+ "grad_norm": 2.981497049331665,
+ "learning_rate": 4.955881627896229e-06,
+ "loss": 0.6578,
+ "step": 854
+ },
+ {
+ "epoch": 0.40425531914893614,
+ "grad_norm": 3.1369056701660156,
+ "learning_rate": 4.955764871996672e-06,
+ "loss": 0.6763,
+ "step": 855
+ },
+ {
+ "epoch": 0.40472813238770683,
+ "grad_norm": 2.7675817012786865,
+ "learning_rate": 4.9556479631874036e-06,
+ "loss": 0.6488,
+ "step": 856
+ },
+ {
+ "epoch": 0.4052009456264775,
+ "grad_norm": 3.035334825515747,
+ "learning_rate": 4.9555309014757034e-06,
+ "loss": 0.7076,
+ "step": 857
+ },
+ {
+ "epoch": 0.4056737588652482,
+ "grad_norm": 3.493704319000244,
+ "learning_rate": 4.955413686868862e-06,
+ "loss": 0.6773,
+ "step": 858
+ },
+ {
+ "epoch": 0.4061465721040189,
+ "grad_norm": 3.245487928390503,
+ "learning_rate": 4.9552963193741765e-06,
+ "loss": 0.6915,
+ "step": 859
+ },
+ {
+ "epoch": 0.4066193853427896,
+ "grad_norm": 3.189969539642334,
+ "learning_rate": 4.955178798998956e-06,
+ "loss": 0.7318,
+ "step": 860
+ },
+ {
+ "epoch": 0.40709219858156026,
+ "grad_norm": 2.7987146377563477,
+ "learning_rate": 4.955061125750517e-06,
+ "loss": 0.6162,
+ "step": 861
+ },
+ {
+ "epoch": 0.40756501182033095,
+ "grad_norm": 3.020118474960327,
+ "learning_rate": 4.954943299636187e-06,
+ "loss": 0.6678,
+ "step": 862
+ },
+ {
+ "epoch": 0.40803782505910163,
+ "grad_norm": 2.715463876724243,
+ "learning_rate": 4.954825320663302e-06,
+ "loss": 0.668,
+ "step": 863
+ },
+ {
+ "epoch": 0.4085106382978723,
+ "grad_norm": 2.595050096511841,
+ "learning_rate": 4.9547071888392085e-06,
+ "loss": 0.6557,
+ "step": 864
+ },
+ {
+ "epoch": 0.408983451536643,
+ "grad_norm": 3.131596088409424,
+ "learning_rate": 4.954588904171261e-06,
+ "loss": 0.6548,
+ "step": 865
+ },
+ {
+ "epoch": 0.4094562647754137,
+ "grad_norm": 2.5742313861846924,
+ "learning_rate": 4.954470466666827e-06,
+ "loss": 0.6592,
+ "step": 866
+ },
+ {
+ "epoch": 0.4099290780141844,
+ "grad_norm": 2.8612802028656006,
+ "learning_rate": 4.9543518763332785e-06,
+ "loss": 0.5391,
+ "step": 867
+ },
+ {
+ "epoch": 0.41040189125295506,
+ "grad_norm": 2.8973186016082764,
+ "learning_rate": 4.954233133178001e-06,
+ "loss": 0.6649,
+ "step": 868
+ },
+ {
+ "epoch": 0.41087470449172575,
+ "grad_norm": 2.802525043487549,
+ "learning_rate": 4.954114237208388e-06,
+ "loss": 0.6212,
+ "step": 869
+ },
+ {
+ "epoch": 0.41134751773049644,
+ "grad_norm": 2.5919506549835205,
+ "learning_rate": 4.953995188431843e-06,
+ "loss": 0.6596,
+ "step": 870
+ },
+ {
+ "epoch": 0.4118203309692671,
+ "grad_norm": 3.139169454574585,
+ "learning_rate": 4.953875986855777e-06,
+ "loss": 0.6799,
+ "step": 871
+ },
+ {
+ "epoch": 0.4122931442080378,
+ "grad_norm": 3.99727725982666,
+ "learning_rate": 4.953756632487614e-06,
+ "loss": 0.6519,
+ "step": 872
+ },
+ {
+ "epoch": 0.4127659574468085,
+ "grad_norm": 3.238706350326538,
+ "learning_rate": 4.953637125334784e-06,
+ "loss": 0.7361,
+ "step": 873
+ },
+ {
+ "epoch": 0.4132387706855792,
+ "grad_norm": 2.780019998550415,
+ "learning_rate": 4.9535174654047295e-06,
+ "loss": 0.6406,
+ "step": 874
+ },
+ {
+ "epoch": 0.41371158392434987,
+ "grad_norm": 2.7629551887512207,
+ "learning_rate": 4.953397652704901e-06,
+ "loss": 0.6131,
+ "step": 875
+ },
+ {
+ "epoch": 0.41418439716312055,
+ "grad_norm": 2.8008246421813965,
+ "learning_rate": 4.9532776872427585e-06,
+ "loss": 0.6464,
+ "step": 876
+ },
+ {
+ "epoch": 0.41465721040189124,
+ "grad_norm": 3.0970115661621094,
+ "learning_rate": 4.953157569025772e-06,
+ "loss": 0.7066,
+ "step": 877
+ },
+ {
+ "epoch": 0.4151300236406619,
+ "grad_norm": 2.8375589847564697,
+ "learning_rate": 4.9530372980614195e-06,
+ "loss": 0.6551,
+ "step": 878
+ },
+ {
+ "epoch": 0.4156028368794326,
+ "grad_norm": 2.718843936920166,
+ "learning_rate": 4.952916874357191e-06,
+ "loss": 0.5947,
+ "step": 879
+ },
+ {
+ "epoch": 0.4160756501182033,
+ "grad_norm": 2.7104697227478027,
+ "learning_rate": 4.952796297920585e-06,
+ "loss": 0.6708,
+ "step": 880
+ },
+ {
+ "epoch": 0.416548463356974,
+ "grad_norm": 2.8223445415496826,
+ "learning_rate": 4.952675568759108e-06,
+ "loss": 0.6214,
+ "step": 881
+ },
+ {
+ "epoch": 0.41702127659574467,
+ "grad_norm": 2.6598153114318848,
+ "learning_rate": 4.952554686880279e-06,
+ "loss": 0.6116,
+ "step": 882
+ },
+ {
+ "epoch": 0.41749408983451536,
+ "grad_norm": 2.8639824390411377,
+ "learning_rate": 4.952433652291623e-06,
+ "loss": 0.5971,
+ "step": 883
+ },
+ {
+ "epoch": 0.41796690307328604,
+ "grad_norm": 2.9578304290771484,
+ "learning_rate": 4.952312465000677e-06,
+ "loss": 0.6785,
+ "step": 884
+ },
+ {
+ "epoch": 0.41843971631205673,
+ "grad_norm": 2.872144937515259,
+ "learning_rate": 4.952191125014987e-06,
+ "loss": 0.6772,
+ "step": 885
+ },
+ {
+ "epoch": 0.4189125295508274,
+ "grad_norm": 2.7513675689697266,
+ "learning_rate": 4.952069632342108e-06,
+ "loss": 0.702,
+ "step": 886
+ },
+ {
+ "epoch": 0.4193853427895981,
+ "grad_norm": 2.9275078773498535,
+ "learning_rate": 4.951947986989606e-06,
+ "loss": 0.589,
+ "step": 887
+ },
+ {
+ "epoch": 0.4198581560283688,
+ "grad_norm": 2.740549325942993,
+ "learning_rate": 4.951826188965053e-06,
+ "loss": 0.5942,
+ "step": 888
+ },
+ {
+ "epoch": 0.4203309692671395,
+ "grad_norm": 2.92452073097229,
+ "learning_rate": 4.951704238276035e-06,
+ "loss": 0.6819,
+ "step": 889
+ },
+ {
+ "epoch": 0.42080378250591016,
+ "grad_norm": 2.842491865158081,
+ "learning_rate": 4.951582134930144e-06,
+ "loss": 0.6304,
+ "step": 890
+ },
+ {
+ "epoch": 0.42127659574468085,
+ "grad_norm": 2.613478422164917,
+ "learning_rate": 4.951459878934983e-06,
+ "loss": 0.6912,
+ "step": 891
+ },
+ {
+ "epoch": 0.42174940898345153,
+ "grad_norm": 3.2408607006073,
+ "learning_rate": 4.951337470298165e-06,
+ "loss": 0.6755,
+ "step": 892
+ },
+ {
+ "epoch": 0.4222222222222222,
+ "grad_norm": 3.1022439002990723,
+ "learning_rate": 4.9512149090273125e-06,
+ "loss": 0.6138,
+ "step": 893
+ },
+ {
+ "epoch": 0.4226950354609929,
+ "grad_norm": 2.6418895721435547,
+ "learning_rate": 4.951092195130055e-06,
+ "loss": 0.639,
+ "step": 894
+ },
+ {
+ "epoch": 0.4231678486997636,
+ "grad_norm": 3.010744333267212,
+ "learning_rate": 4.950969328614035e-06,
+ "loss": 0.7102,
+ "step": 895
+ },
+ {
+ "epoch": 0.4236406619385343,
+ "grad_norm": 2.673292636871338,
+ "learning_rate": 4.950846309486901e-06,
+ "loss": 0.5676,
+ "step": 896
+ },
+ {
+ "epoch": 0.42411347517730497,
+ "grad_norm": 3.6974737644195557,
+ "learning_rate": 4.950723137756314e-06,
+ "loss": 0.5722,
+ "step": 897
+ },
+ {
+ "epoch": 0.42458628841607565,
+ "grad_norm": 3.69028902053833,
+ "learning_rate": 4.9505998134299435e-06,
+ "loss": 0.6337,
+ "step": 898
+ },
+ {
+ "epoch": 0.42505910165484634,
+ "grad_norm": 3.2136125564575195,
+ "learning_rate": 4.950476336515469e-06,
+ "loss": 0.6469,
+ "step": 899
+ },
+ {
+ "epoch": 0.425531914893617,
+ "grad_norm": 2.7396016120910645,
+ "learning_rate": 4.950352707020577e-06,
+ "loss": 0.6656,
+ "step": 900
+ },
+ {
+ "epoch": 0.4260047281323877,
+ "grad_norm": 2.825416088104248,
+ "learning_rate": 4.950228924952967e-06,
+ "loss": 0.6298,
+ "step": 901
+ },
+ {
+ "epoch": 0.4264775413711584,
+ "grad_norm": 3.401658535003662,
+ "learning_rate": 4.950104990320345e-06,
+ "loss": 0.778,
+ "step": 902
+ },
+ {
+ "epoch": 0.4269503546099291,
+ "grad_norm": 2.7002272605895996,
+ "learning_rate": 4.9499809031304294e-06,
+ "loss": 0.6536,
+ "step": 903
+ },
+ {
+ "epoch": 0.42742316784869977,
+ "grad_norm": 2.62386417388916,
+ "learning_rate": 4.949856663390945e-06,
+ "loss": 0.6629,
+ "step": 904
+ },
+ {
+ "epoch": 0.42789598108747046,
+ "grad_norm": 2.584247589111328,
+ "learning_rate": 4.94973227110963e-06,
+ "loss": 0.5813,
+ "step": 905
+ },
+ {
+ "epoch": 0.42836879432624114,
+ "grad_norm": 3.4365768432617188,
+ "learning_rate": 4.9496077262942265e-06,
+ "loss": 0.7648,
+ "step": 906
+ },
+ {
+ "epoch": 0.42884160756501183,
+ "grad_norm": 2.8993639945983887,
+ "learning_rate": 4.949483028952492e-06,
+ "loss": 0.6696,
+ "step": 907
+ },
+ {
+ "epoch": 0.4293144208037825,
+ "grad_norm": 2.922809362411499,
+ "learning_rate": 4.94935817909219e-06,
+ "loss": 0.6892,
+ "step": 908
+ },
+ {
+ "epoch": 0.4297872340425532,
+ "grad_norm": 2.85478138923645,
+ "learning_rate": 4.9492331767210944e-06,
+ "loss": 0.536,
+ "step": 909
+ },
+ {
+ "epoch": 0.4302600472813239,
+ "grad_norm": 2.8639259338378906,
+ "learning_rate": 4.949108021846988e-06,
+ "loss": 0.634,
+ "step": 910
+ },
+ {
+ "epoch": 0.4307328605200946,
+ "grad_norm": 3.0533697605133057,
+ "learning_rate": 4.948982714477664e-06,
+ "loss": 0.6318,
+ "step": 911
+ },
+ {
+ "epoch": 0.43120567375886526,
+ "grad_norm": 2.331674814224243,
+ "learning_rate": 4.9488572546209255e-06,
+ "loss": 0.6562,
+ "step": 912
+ },
+ {
+ "epoch": 0.43167848699763595,
+ "grad_norm": 3.0154623985290527,
+ "learning_rate": 4.9487316422845835e-06,
+ "loss": 0.6675,
+ "step": 913
+ },
+ {
+ "epoch": 0.43215130023640663,
+ "grad_norm": 2.7354514598846436,
+ "learning_rate": 4.948605877476459e-06,
+ "loss": 0.6012,
+ "step": 914
+ },
+ {
+ "epoch": 0.4326241134751773,
+ "grad_norm": 2.863736629486084,
+ "learning_rate": 4.948479960204383e-06,
+ "loss": 0.6062,
+ "step": 915
+ },
+ {
+ "epoch": 0.433096926713948,
+ "grad_norm": 3.01998233795166,
+ "learning_rate": 4.948353890476197e-06,
+ "loss": 0.6749,
+ "step": 916
+ },
+ {
+ "epoch": 0.4335697399527187,
+ "grad_norm": 2.7550456523895264,
+ "learning_rate": 4.94822766829975e-06,
+ "loss": 0.6507,
+ "step": 917
+ },
+ {
+ "epoch": 0.4340425531914894,
+ "grad_norm": 3.370572805404663,
+ "learning_rate": 4.948101293682901e-06,
+ "loss": 0.714,
+ "step": 918
+ },
+ {
+ "epoch": 0.43451536643026006,
+ "grad_norm": 2.9736790657043457,
+ "learning_rate": 4.947974766633519e-06,
+ "loss": 0.729,
+ "step": 919
+ },
+ {
+ "epoch": 0.43498817966903075,
+ "grad_norm": 3.1036548614501953,
+ "learning_rate": 4.947848087159483e-06,
+ "loss": 0.7547,
+ "step": 920
+ },
+ {
+ "epoch": 0.43546099290780144,
+ "grad_norm": 2.895094871520996,
+ "learning_rate": 4.947721255268679e-06,
+ "loss": 0.6089,
+ "step": 921
+ },
+ {
+ "epoch": 0.4359338061465721,
+ "grad_norm": 2.798476219177246,
+ "learning_rate": 4.947594270969005e-06,
+ "loss": 0.5432,
+ "step": 922
+ },
+ {
+ "epoch": 0.4364066193853428,
+ "grad_norm": 2.7675702571868896,
+ "learning_rate": 4.94746713426837e-06,
+ "loss": 0.5693,
+ "step": 923
+ },
+ {
+ "epoch": 0.4368794326241135,
+ "grad_norm": 2.6851553916931152,
+ "learning_rate": 4.947339845174687e-06,
+ "loss": 0.6503,
+ "step": 924
+ },
+ {
+ "epoch": 0.4373522458628842,
+ "grad_norm": 2.909635543823242,
+ "learning_rate": 4.947212403695883e-06,
+ "loss": 0.6494,
+ "step": 925
+ },
+ {
+ "epoch": 0.43782505910165487,
+ "grad_norm": 2.604526996612549,
+ "learning_rate": 4.947084809839894e-06,
+ "loss": 0.6349,
+ "step": 926
+ },
+ {
+ "epoch": 0.43829787234042555,
+ "grad_norm": 3.118149518966675,
+ "learning_rate": 4.946957063614664e-06,
+ "loss": 0.6219,
+ "step": 927
+ },
+ {
+ "epoch": 0.43877068557919624,
+ "grad_norm": 2.7452616691589355,
+ "learning_rate": 4.9468291650281465e-06,
+ "loss": 0.6096,
+ "step": 928
+ },
+ {
+ "epoch": 0.4392434988179669,
+ "grad_norm": 3.30098819732666,
+ "learning_rate": 4.946701114088307e-06,
+ "loss": 0.6277,
+ "step": 929
+ },
+ {
+ "epoch": 0.4397163120567376,
+ "grad_norm": 2.789482593536377,
+ "learning_rate": 4.946572910803116e-06,
+ "loss": 0.7,
+ "step": 930
+ },
+ {
+ "epoch": 0.4401891252955083,
+ "grad_norm": 2.7283935546875,
+ "learning_rate": 4.946444555180559e-06,
+ "loss": 0.5375,
+ "step": 931
+ },
+ {
+ "epoch": 0.440661938534279,
+ "grad_norm": 3.101304054260254,
+ "learning_rate": 4.946316047228627e-06,
+ "loss": 0.6131,
+ "step": 932
+ },
+ {
+ "epoch": 0.44113475177304967,
+ "grad_norm": 3.573908805847168,
+ "learning_rate": 4.946187386955321e-06,
+ "loss": 0.7073,
+ "step": 933
+ },
+ {
+ "epoch": 0.44160756501182036,
+ "grad_norm": 3.214979648590088,
+ "learning_rate": 4.946058574368653e-06,
+ "loss": 0.6508,
+ "step": 934
+ },
+ {
+ "epoch": 0.44208037825059104,
+ "grad_norm": 3.145082712173462,
+ "learning_rate": 4.945929609476643e-06,
+ "loss": 0.64,
+ "step": 935
+ },
+ {
+ "epoch": 0.4425531914893617,
+ "grad_norm": 2.991780996322632,
+ "learning_rate": 4.945800492287321e-06,
+ "loss": 0.6315,
+ "step": 936
+ },
+ {
+ "epoch": 0.44302600472813236,
+ "grad_norm": 3.2441139221191406,
+ "learning_rate": 4.945671222808727e-06,
+ "loss": 0.7144,
+ "step": 937
+ },
+ {
+ "epoch": 0.44349881796690305,
+ "grad_norm": 2.9397029876708984,
+ "learning_rate": 4.94554180104891e-06,
+ "loss": 0.6818,
+ "step": 938
+ },
+ {
+ "epoch": 0.44397163120567373,
+ "grad_norm": 3.2471461296081543,
+ "learning_rate": 4.945412227015929e-06,
+ "loss": 0.6921,
+ "step": 939
+ },
+ {
+ "epoch": 0.4444444444444444,
+ "grad_norm": 3.0882487297058105,
+ "learning_rate": 4.945282500717851e-06,
+ "loss": 0.718,
+ "step": 940
+ },
+ {
+ "epoch": 0.4449172576832151,
+ "grad_norm": 2.6035783290863037,
+ "learning_rate": 4.945152622162753e-06,
+ "loss": 0.621,
+ "step": 941
+ },
+ {
+ "epoch": 0.4453900709219858,
+ "grad_norm": 2.83659029006958,
+ "learning_rate": 4.945022591358724e-06,
+ "loss": 0.6403,
+ "step": 942
+ },
+ {
+ "epoch": 0.4458628841607565,
+ "grad_norm": 2.824463129043579,
+ "learning_rate": 4.944892408313859e-06,
+ "loss": 0.6594,
+ "step": 943
+ },
+ {
+ "epoch": 0.44633569739952716,
+ "grad_norm": 2.753735065460205,
+ "learning_rate": 4.9447620730362645e-06,
+ "loss": 0.6116,
+ "step": 944
+ },
+ {
+ "epoch": 0.44680851063829785,
+ "grad_norm": 3.0659725666046143,
+ "learning_rate": 4.944631585534056e-06,
+ "loss": 0.5983,
+ "step": 945
+ },
+ {
+ "epoch": 0.44728132387706854,
+ "grad_norm": 2.969113349914551,
+ "learning_rate": 4.944500945815357e-06,
+ "loss": 0.6859,
+ "step": 946
+ },
+ {
+ "epoch": 0.4477541371158392,
+ "grad_norm": 2.810303211212158,
+ "learning_rate": 4.944370153888303e-06,
+ "loss": 0.7025,
+ "step": 947
+ },
+ {
+ "epoch": 0.4482269503546099,
+ "grad_norm": 3.027721643447876,
+ "learning_rate": 4.944239209761038e-06,
+ "loss": 0.7268,
+ "step": 948
+ },
+ {
+ "epoch": 0.4486997635933806,
+ "grad_norm": 2.661503314971924,
+ "learning_rate": 4.944108113441716e-06,
+ "loss": 0.6702,
+ "step": 949
+ },
+ {
+ "epoch": 0.4491725768321513,
+ "grad_norm": 2.738591432571411,
+ "learning_rate": 4.943976864938498e-06,
+ "loss": 0.6728,
+ "step": 950
+ },
+ {
+ "epoch": 0.44964539007092197,
+ "grad_norm": 3.447505474090576,
+ "learning_rate": 4.943845464259557e-06,
+ "loss": 0.6586,
+ "step": 951
+ },
+ {
+ "epoch": 0.45011820330969265,
+ "grad_norm": 3.0968854427337646,
+ "learning_rate": 4.943713911413075e-06,
+ "loss": 0.7666,
+ "step": 952
+ },
+ {
+ "epoch": 0.45059101654846334,
+ "grad_norm": 2.4113779067993164,
+ "learning_rate": 4.943582206407244e-06,
+ "loss": 0.6173,
+ "step": 953
+ },
+ {
+ "epoch": 0.451063829787234,
+ "grad_norm": 2.6357979774475098,
+ "learning_rate": 4.943450349250263e-06,
+ "loss": 0.5589,
+ "step": 954
+ },
+ {
+ "epoch": 0.4515366430260047,
+ "grad_norm": 2.9182233810424805,
+ "learning_rate": 4.9433183399503425e-06,
+ "loss": 0.6252,
+ "step": 955
+ },
+ {
+ "epoch": 0.4520094562647754,
+ "grad_norm": 2.832740306854248,
+ "learning_rate": 4.943186178515703e-06,
+ "loss": 0.6882,
+ "step": 956
+ },
+ {
+ "epoch": 0.4524822695035461,
+ "grad_norm": 2.9508981704711914,
+ "learning_rate": 4.943053864954574e-06,
+ "loss": 0.5722,
+ "step": 957
+ },
+ {
+ "epoch": 0.4529550827423168,
+ "grad_norm": 3.044729471206665,
+ "learning_rate": 4.9429213992751925e-06,
+ "loss": 0.6772,
+ "step": 958
+ },
+ {
+ "epoch": 0.45342789598108746,
+ "grad_norm": 2.606003522872925,
+ "learning_rate": 4.9427887814858075e-06,
+ "loss": 0.6445,
+ "step": 959
+ },
+ {
+ "epoch": 0.45390070921985815,
+ "grad_norm": 2.4634225368499756,
+ "learning_rate": 4.942656011594676e-06,
+ "loss": 0.6151,
+ "step": 960
+ },
+ {
+ "epoch": 0.45437352245862883,
+ "grad_norm": 2.8872334957122803,
+ "learning_rate": 4.942523089610066e-06,
+ "loss": 0.6255,
+ "step": 961
+ },
+ {
+ "epoch": 0.4548463356973995,
+ "grad_norm": 2.870605707168579,
+ "learning_rate": 4.942390015540253e-06,
+ "loss": 0.7481,
+ "step": 962
+ },
+ {
+ "epoch": 0.4553191489361702,
+ "grad_norm": 2.952680826187134,
+ "learning_rate": 4.942256789393524e-06,
+ "loss": 0.5556,
+ "step": 963
+ },
+ {
+ "epoch": 0.4557919621749409,
+ "grad_norm": 2.623680353164673,
+ "learning_rate": 4.9421234111781725e-06,
+ "loss": 0.6115,
+ "step": 964
+ },
+ {
+ "epoch": 0.4562647754137116,
+ "grad_norm": 2.6933600902557373,
+ "learning_rate": 4.941989880902505e-06,
+ "loss": 0.6102,
+ "step": 965
+ },
+ {
+ "epoch": 0.45673758865248226,
+ "grad_norm": 2.6047189235687256,
+ "learning_rate": 4.941856198574836e-06,
+ "loss": 0.612,
+ "step": 966
+ },
+ {
+ "epoch": 0.45721040189125295,
+ "grad_norm": 2.779186725616455,
+ "learning_rate": 4.9417223642034885e-06,
+ "loss": 0.5424,
+ "step": 967
+ },
+ {
+ "epoch": 0.45768321513002364,
+ "grad_norm": 2.6177165508270264,
+ "learning_rate": 4.941588377796795e-06,
+ "loss": 0.4661,
+ "step": 968
+ },
+ {
+ "epoch": 0.4581560283687943,
+ "grad_norm": 2.959676742553711,
+ "learning_rate": 4.941454239363101e-06,
+ "loss": 0.6966,
+ "step": 969
+ },
+ {
+ "epoch": 0.458628841607565,
+ "grad_norm": 2.9788379669189453,
+ "learning_rate": 4.941319948910756e-06,
+ "loss": 0.6181,
+ "step": 970
+ },
+ {
+ "epoch": 0.4591016548463357,
+ "grad_norm": 4.642750263214111,
+ "learning_rate": 4.941185506448122e-06,
+ "loss": 0.5602,
+ "step": 971
+ },
+ {
+ "epoch": 0.4595744680851064,
+ "grad_norm": 2.793002128601074,
+ "learning_rate": 4.941050911983572e-06,
+ "loss": 0.602,
+ "step": 972
+ },
+ {
+ "epoch": 0.46004728132387707,
+ "grad_norm": 2.6833035945892334,
+ "learning_rate": 4.9409161655254845e-06,
+ "loss": 0.5549,
+ "step": 973
+ },
+ {
+ "epoch": 0.46052009456264775,
+ "grad_norm": 3.905032157897949,
+ "learning_rate": 4.94078126708225e-06,
+ "loss": 0.6335,
+ "step": 974
+ },
+ {
+ "epoch": 0.46099290780141844,
+ "grad_norm": 2.922609329223633,
+ "learning_rate": 4.94064621666227e-06,
+ "loss": 0.5839,
+ "step": 975
+ },
+ {
+ "epoch": 0.4614657210401891,
+ "grad_norm": 2.8277416229248047,
+ "learning_rate": 4.940511014273952e-06,
+ "loss": 0.629,
+ "step": 976
+ },
+ {
+ "epoch": 0.4619385342789598,
+ "grad_norm": 3.07511043548584,
+ "learning_rate": 4.940375659925714e-06,
+ "loss": 0.7058,
+ "step": 977
+ },
+ {
+ "epoch": 0.4624113475177305,
+ "grad_norm": 3.65043044090271,
+ "learning_rate": 4.940240153625984e-06,
+ "loss": 0.7174,
+ "step": 978
+ },
+ {
+ "epoch": 0.4628841607565012,
+ "grad_norm": 2.755167245864868,
+ "learning_rate": 4.9401044953832e-06,
+ "loss": 0.6548,
+ "step": 979
+ },
+ {
+ "epoch": 0.46335697399527187,
+ "grad_norm": 2.9881057739257812,
+ "learning_rate": 4.939968685205808e-06,
+ "loss": 0.6245,
+ "step": 980
+ },
+ {
+ "epoch": 0.46382978723404256,
+ "grad_norm": 2.9484212398529053,
+ "learning_rate": 4.939832723102266e-06,
+ "loss": 0.655,
+ "step": 981
+ },
+ {
+ "epoch": 0.46430260047281324,
+ "grad_norm": 2.898918628692627,
+ "learning_rate": 4.939696609081038e-06,
+ "loss": 0.6178,
+ "step": 982
+ },
+ {
+ "epoch": 0.46477541371158393,
+ "grad_norm": 2.7052435874938965,
+ "learning_rate": 4.9395603431506e-06,
+ "loss": 0.6393,
+ "step": 983
+ },
+ {
+ "epoch": 0.4652482269503546,
+ "grad_norm": 2.5610013008117676,
+ "learning_rate": 4.939423925319436e-06,
+ "loss": 0.4847,
+ "step": 984
+ },
+ {
+ "epoch": 0.4657210401891253,
+ "grad_norm": 3.229083299636841,
+ "learning_rate": 4.939287355596042e-06,
+ "loss": 0.6473,
+ "step": 985
+ },
+ {
+ "epoch": 0.466193853427896,
+ "grad_norm": 2.907097816467285,
+ "learning_rate": 4.9391506339889195e-06,
+ "loss": 0.652,
+ "step": 986
+ },
+ {
+ "epoch": 0.4666666666666667,
+ "grad_norm": 2.6929478645324707,
+ "learning_rate": 4.939013760506582e-06,
+ "loss": 0.6175,
+ "step": 987
+ },
+ {
+ "epoch": 0.46713947990543736,
+ "grad_norm": 3.414813280105591,
+ "learning_rate": 4.938876735157554e-06,
+ "loss": 0.7597,
+ "step": 988
+ },
+ {
+ "epoch": 0.46761229314420805,
+ "grad_norm": 3.297360420227051,
+ "learning_rate": 4.938739557950365e-06,
+ "loss": 0.6824,
+ "step": 989
+ },
+ {
+ "epoch": 0.46808510638297873,
+ "grad_norm": 3.083155393600464,
+ "learning_rate": 4.938602228893557e-06,
+ "loss": 0.6505,
+ "step": 990
+ },
+ {
+ "epoch": 0.4685579196217494,
+ "grad_norm": 2.9781153202056885,
+ "learning_rate": 4.938464747995681e-06,
+ "loss": 0.666,
+ "step": 991
+ },
+ {
+ "epoch": 0.4690307328605201,
+ "grad_norm": 3.1494534015655518,
+ "learning_rate": 4.9383271152652975e-06,
+ "loss": 0.6422,
+ "step": 992
+ },
+ {
+ "epoch": 0.4695035460992908,
+ "grad_norm": 2.547868490219116,
+ "learning_rate": 4.938189330710976e-06,
+ "loss": 0.5766,
+ "step": 993
+ },
+ {
+ "epoch": 0.4699763593380615,
+ "grad_norm": 2.684736967086792,
+ "learning_rate": 4.938051394341297e-06,
+ "loss": 0.6407,
+ "step": 994
+ },
+ {
+ "epoch": 0.47044917257683216,
+ "grad_norm": 2.9619693756103516,
+ "learning_rate": 4.937913306164847e-06,
+ "loss": 0.6936,
+ "step": 995
+ },
+ {
+ "epoch": 0.47092198581560285,
+ "grad_norm": 2.9698498249053955,
+ "learning_rate": 4.937775066190227e-06,
+ "loss": 0.6464,
+ "step": 996
+ },
+ {
+ "epoch": 0.47139479905437354,
+ "grad_norm": 3.121049642562866,
+ "learning_rate": 4.937636674426042e-06,
+ "loss": 0.6383,
+ "step": 997
+ },
+ {
+ "epoch": 0.4718676122931442,
+ "grad_norm": 3.113672971725464,
+ "learning_rate": 4.93749813088091e-06,
+ "loss": 0.6892,
+ "step": 998
+ },
+ {
+ "epoch": 0.4723404255319149,
+ "grad_norm": 3.126113176345825,
+ "learning_rate": 4.937359435563458e-06,
+ "loss": 0.6728,
+ "step": 999
+ },
+ {
+ "epoch": 0.4728132387706856,
+ "grad_norm": 3.353966236114502,
+ "learning_rate": 4.937220588482321e-06,
+ "loss": 0.6041,
+ "step": 1000
+ },
+ {
+ "epoch": 0.4732860520094563,
+ "grad_norm": 2.8860628604888916,
+ "learning_rate": 4.937081589646144e-06,
+ "loss": 0.6798,
+ "step": 1001
+ },
+ {
+ "epoch": 0.47375886524822697,
+ "grad_norm": 3.0510590076446533,
+ "learning_rate": 4.936942439063584e-06,
+ "loss": 0.5841,
+ "step": 1002
+ },
+ {
+ "epoch": 0.47423167848699765,
+ "grad_norm": 2.6998369693756104,
+ "learning_rate": 4.936803136743303e-06,
+ "loss": 0.6403,
+ "step": 1003
+ },
+ {
+ "epoch": 0.47470449172576834,
+ "grad_norm": 2.875347137451172,
+ "learning_rate": 4.9366636826939765e-06,
+ "loss": 0.5811,
+ "step": 1004
+ },
+ {
+ "epoch": 0.475177304964539,
+ "grad_norm": 2.9122262001037598,
+ "learning_rate": 4.936524076924287e-06,
+ "loss": 0.6852,
+ "step": 1005
+ },
+ {
+ "epoch": 0.4756501182033097,
+ "grad_norm": 2.5167057514190674,
+ "learning_rate": 4.9363843194429265e-06,
+ "loss": 0.5367,
+ "step": 1006
+ },
+ {
+ "epoch": 0.4761229314420804,
+ "grad_norm": 2.5745551586151123,
+ "learning_rate": 4.9362444102585985e-06,
+ "loss": 0.6241,
+ "step": 1007
+ },
+ {
+ "epoch": 0.4765957446808511,
+ "grad_norm": 2.5024216175079346,
+ "learning_rate": 4.9361043493800125e-06,
+ "loss": 0.6133,
+ "step": 1008
+ },
+ {
+ "epoch": 0.47706855791962177,
+ "grad_norm": 2.7281384468078613,
+ "learning_rate": 4.935964136815892e-06,
+ "loss": 0.6834,
+ "step": 1009
+ },
+ {
+ "epoch": 0.47754137115839246,
+ "grad_norm": 3.0118913650512695,
+ "learning_rate": 4.935823772574965e-06,
+ "loss": 0.6922,
+ "step": 1010
+ },
+ {
+ "epoch": 0.47801418439716314,
+ "grad_norm": 3.016216993331909,
+ "learning_rate": 4.935683256665973e-06,
+ "loss": 0.6653,
+ "step": 1011
+ },
+ {
+ "epoch": 0.47848699763593383,
+ "grad_norm": 2.9526784420013428,
+ "learning_rate": 4.9355425890976636e-06,
+ "loss": 0.6423,
+ "step": 1012
+ },
+ {
+ "epoch": 0.4789598108747045,
+ "grad_norm": 6.222797393798828,
+ "learning_rate": 4.9354017698787985e-06,
+ "loss": 0.5884,
+ "step": 1013
+ },
+ {
+ "epoch": 0.4794326241134752,
+ "grad_norm": 2.6553597450256348,
+ "learning_rate": 4.935260799018143e-06,
+ "loss": 0.6624,
+ "step": 1014
+ },
+ {
+ "epoch": 0.4799054373522459,
+ "grad_norm": 3.0942065715789795,
+ "learning_rate": 4.935119676524475e-06,
+ "loss": 0.6623,
+ "step": 1015
+ },
+ {
+ "epoch": 0.4803782505910166,
+ "grad_norm": 2.626359224319458,
+ "learning_rate": 4.934978402406585e-06,
+ "loss": 0.6195,
+ "step": 1016
+ },
+ {
+ "epoch": 0.4808510638297872,
+ "grad_norm": 2.7954699993133545,
+ "learning_rate": 4.934836976673265e-06,
+ "loss": 0.5545,
+ "step": 1017
+ },
+ {
+ "epoch": 0.4813238770685579,
+ "grad_norm": 2.913557291030884,
+ "learning_rate": 4.934695399333324e-06,
+ "loss": 0.6288,
+ "step": 1018
+ },
+ {
+ "epoch": 0.4817966903073286,
+ "grad_norm": 3.1043739318847656,
+ "learning_rate": 4.9345536703955746e-06,
+ "loss": 0.6771,
+ "step": 1019
+ },
+ {
+ "epoch": 0.48226950354609927,
+ "grad_norm": 2.789357900619507,
+ "learning_rate": 4.934411789868845e-06,
+ "loss": 0.6227,
+ "step": 1020
+ },
+ {
+ "epoch": 0.48274231678486995,
+ "grad_norm": 2.480609655380249,
+ "learning_rate": 4.934269757761967e-06,
+ "loss": 0.5779,
+ "step": 1021
+ },
+ {
+ "epoch": 0.48321513002364064,
+ "grad_norm": 2.7946252822875977,
+ "learning_rate": 4.934127574083785e-06,
+ "loss": 0.6166,
+ "step": 1022
+ },
+ {
+ "epoch": 0.4836879432624113,
+ "grad_norm": 3.0670509338378906,
+ "learning_rate": 4.933985238843153e-06,
+ "loss": 0.7766,
+ "step": 1023
+ },
+ {
+ "epoch": 0.484160756501182,
+ "grad_norm": 2.8567559719085693,
+ "learning_rate": 4.933842752048932e-06,
+ "loss": 0.5088,
+ "step": 1024
+ },
+ {
+ "epoch": 0.4846335697399527,
+ "grad_norm": 2.5674657821655273,
+ "learning_rate": 4.933700113709996e-06,
+ "loss": 0.6036,
+ "step": 1025
+ },
+ {
+ "epoch": 0.4851063829787234,
+ "grad_norm": 2.782339096069336,
+ "learning_rate": 4.933557323835224e-06,
+ "loss": 0.5335,
+ "step": 1026
+ },
+ {
+ "epoch": 0.48557919621749407,
+ "grad_norm": 2.6334071159362793,
+ "learning_rate": 4.93341438243351e-06,
+ "loss": 0.6327,
+ "step": 1027
+ },
+ {
+ "epoch": 0.48605200945626476,
+ "grad_norm": 3.0853965282440186,
+ "learning_rate": 4.933271289513751e-06,
+ "loss": 0.7102,
+ "step": 1028
+ },
+ {
+ "epoch": 0.48652482269503544,
+ "grad_norm": 2.619997501373291,
+ "learning_rate": 4.933128045084859e-06,
+ "loss": 0.6138,
+ "step": 1029
+ },
+ {
+ "epoch": 0.48699763593380613,
+ "grad_norm": 2.8316116333007812,
+ "learning_rate": 4.932984649155753e-06,
+ "loss": 0.6346,
+ "step": 1030
+ },
+ {
+ "epoch": 0.4874704491725768,
+ "grad_norm": 3.153486490249634,
+ "learning_rate": 4.932841101735361e-06,
+ "loss": 0.7626,
+ "step": 1031
+ },
+ {
+ "epoch": 0.4879432624113475,
+ "grad_norm": 3.1831274032592773,
+ "learning_rate": 4.9326974028326214e-06,
+ "loss": 0.6607,
+ "step": 1032
+ },
+ {
+ "epoch": 0.4884160756501182,
+ "grad_norm": 2.791078567504883,
+ "learning_rate": 4.932553552456481e-06,
+ "loss": 0.6141,
+ "step": 1033
+ },
+ {
+ "epoch": 0.4888888888888889,
+ "grad_norm": 2.627263307571411,
+ "learning_rate": 4.932409550615898e-06,
+ "loss": 0.6777,
+ "step": 1034
+ },
+ {
+ "epoch": 0.48936170212765956,
+ "grad_norm": 2.8550007343292236,
+ "learning_rate": 4.932265397319838e-06,
+ "loss": 0.6379,
+ "step": 1035
+ },
+ {
+ "epoch": 0.48983451536643025,
+ "grad_norm": 4.505824089050293,
+ "learning_rate": 4.932121092577276e-06,
+ "loss": 0.5892,
+ "step": 1036
+ },
+ {
+ "epoch": 0.49030732860520093,
+ "grad_norm": 3.100191116333008,
+ "learning_rate": 4.931976636397199e-06,
+ "loss": 0.6443,
+ "step": 1037
+ },
+ {
+ "epoch": 0.4907801418439716,
+ "grad_norm": 2.921494245529175,
+ "learning_rate": 4.9318320287886e-06,
+ "loss": 0.6821,
+ "step": 1038
+ },
+ {
+ "epoch": 0.4912529550827423,
+ "grad_norm": 4.577807903289795,
+ "learning_rate": 4.931687269760485e-06,
+ "loss": 0.5946,
+ "step": 1039
+ },
+ {
+ "epoch": 0.491725768321513,
+ "grad_norm": 2.7347636222839355,
+ "learning_rate": 4.931542359321865e-06,
+ "loss": 0.5689,
+ "step": 1040
+ },
+ {
+ "epoch": 0.4921985815602837,
+ "grad_norm": 2.5289158821105957,
+ "learning_rate": 4.931397297481765e-06,
+ "loss": 0.5632,
+ "step": 1041
+ },
+ {
+ "epoch": 0.49267139479905436,
+ "grad_norm": 3.3518471717834473,
+ "learning_rate": 4.9312520842492165e-06,
+ "loss": 0.6349,
+ "step": 1042
+ },
+ {
+ "epoch": 0.49314420803782505,
+ "grad_norm": 3.0469748973846436,
+ "learning_rate": 4.931106719633261e-06,
+ "loss": 0.5734,
+ "step": 1043
+ },
+ {
+ "epoch": 0.49361702127659574,
+ "grad_norm": 3.104682445526123,
+ "learning_rate": 4.930961203642951e-06,
+ "loss": 0.6101,
+ "step": 1044
+ },
+ {
+ "epoch": 0.4940898345153664,
+ "grad_norm": 2.776705503463745,
+ "learning_rate": 4.930815536287346e-06,
+ "loss": 0.6397,
+ "step": 1045
+ },
+ {
+ "epoch": 0.4945626477541371,
+ "grad_norm": 2.760380983352661,
+ "learning_rate": 4.930669717575516e-06,
+ "loss": 0.668,
+ "step": 1046
+ },
+ {
+ "epoch": 0.4950354609929078,
+ "grad_norm": 2.70084547996521,
+ "learning_rate": 4.930523747516541e-06,
+ "loss": 0.5729,
+ "step": 1047
+ },
+ {
+ "epoch": 0.4955082742316785,
+ "grad_norm": 2.7319583892822266,
+ "learning_rate": 4.930377626119511e-06,
+ "loss": 0.6258,
+ "step": 1048
+ },
+ {
+ "epoch": 0.49598108747044917,
+ "grad_norm": 3.2515223026275635,
+ "learning_rate": 4.930231353393521e-06,
+ "loss": 0.7412,
+ "step": 1049
+ },
+ {
+ "epoch": 0.49645390070921985,
+ "grad_norm": 3.0646486282348633,
+ "learning_rate": 4.930084929347682e-06,
+ "loss": 0.5809,
+ "step": 1050
+ },
+ {
+ "epoch": 0.49692671394799054,
+ "grad_norm": 3.1621921062469482,
+ "learning_rate": 4.9299383539911096e-06,
+ "loss": 0.6282,
+ "step": 1051
+ },
+ {
+ "epoch": 0.4973995271867612,
+ "grad_norm": 2.864713191986084,
+ "learning_rate": 4.929791627332931e-06,
+ "loss": 0.6263,
+ "step": 1052
+ },
+ {
+ "epoch": 0.4978723404255319,
+ "grad_norm": 3.181016683578491,
+ "learning_rate": 4.929644749382283e-06,
+ "loss": 0.5697,
+ "step": 1053
+ },
+ {
+ "epoch": 0.4983451536643026,
+ "grad_norm": 2.9064836502075195,
+ "learning_rate": 4.929497720148309e-06,
+ "loss": 0.6161,
+ "step": 1054
+ },
+ {
+ "epoch": 0.4988179669030733,
+ "grad_norm": 3.058112859725952,
+ "learning_rate": 4.9293505396401655e-06,
+ "loss": 0.6477,
+ "step": 1055
+ },
+ {
+ "epoch": 0.49929078014184397,
+ "grad_norm": 2.5227596759796143,
+ "learning_rate": 4.929203207867016e-06,
+ "loss": 0.5819,
+ "step": 1056
+ },
+ {
+ "epoch": 0.49976359338061466,
+ "grad_norm": 3.386862277984619,
+ "learning_rate": 4.929055724838035e-06,
+ "loss": 0.7342,
+ "step": 1057
+ },
+ {
+ "epoch": 0.5002364066193853,
+ "grad_norm": 3.368346929550171,
+ "learning_rate": 4.928908090562404e-06,
+ "loss": 0.6622,
+ "step": 1058
+ },
+ {
+ "epoch": 0.500709219858156,
+ "grad_norm": 2.9108314514160156,
+ "learning_rate": 4.928760305049317e-06,
+ "loss": 0.6598,
+ "step": 1059
+ },
+ {
+ "epoch": 0.5011820330969267,
+ "grad_norm": 2.822305917739868,
+ "learning_rate": 4.928612368307977e-06,
+ "loss": 0.5841,
+ "step": 1060
+ },
+ {
+ "epoch": 0.5016548463356973,
+ "grad_norm": 2.689131259918213,
+ "learning_rate": 4.928464280347592e-06,
+ "loss": 0.6631,
+ "step": 1061
+ },
+ {
+ "epoch": 0.502127659574468,
+ "grad_norm": 3.337214946746826,
+ "learning_rate": 4.9283160411773864e-06,
+ "loss": 0.6105,
+ "step": 1062
+ },
+ {
+ "epoch": 0.5026004728132387,
+ "grad_norm": 3.035911798477173,
+ "learning_rate": 4.928167650806588e-06,
+ "loss": 0.6981,
+ "step": 1063
+ },
+ {
+ "epoch": 0.5030732860520094,
+ "grad_norm": 2.8820855617523193,
+ "learning_rate": 4.9280191092444375e-06,
+ "loss": 0.6408,
+ "step": 1064
+ },
+ {
+ "epoch": 0.5035460992907801,
+ "grad_norm": 3.080432415008545,
+ "learning_rate": 4.927870416500183e-06,
+ "loss": 0.6398,
+ "step": 1065
+ },
+ {
+ "epoch": 0.5040189125295508,
+ "grad_norm": 2.761612892150879,
+ "learning_rate": 4.927721572583084e-06,
+ "loss": 0.6126,
+ "step": 1066
+ },
+ {
+ "epoch": 0.5044917257683215,
+ "grad_norm": 2.8561882972717285,
+ "learning_rate": 4.927572577502408e-06,
+ "loss": 0.584,
+ "step": 1067
+ },
+ {
+ "epoch": 0.5049645390070922,
+ "grad_norm": 3.3386311531066895,
+ "learning_rate": 4.927423431267432e-06,
+ "loss": 0.6666,
+ "step": 1068
+ },
+ {
+ "epoch": 0.5054373522458628,
+ "grad_norm": 2.632906675338745,
+ "learning_rate": 4.927274133887443e-06,
+ "loss": 0.632,
+ "step": 1069
+ },
+ {
+ "epoch": 0.5059101654846335,
+ "grad_norm": 2.8737308979034424,
+ "learning_rate": 4.927124685371737e-06,
+ "loss": 0.6051,
+ "step": 1070
+ },
+ {
+ "epoch": 0.5063829787234042,
+ "grad_norm": 3.042222738265991,
+ "learning_rate": 4.926975085729619e-06,
+ "loss": 0.6954,
+ "step": 1071
+ },
+ {
+ "epoch": 0.5068557919621749,
+ "grad_norm": 3.3341481685638428,
+ "learning_rate": 4.926825334970404e-06,
+ "loss": 0.7148,
+ "step": 1072
+ },
+ {
+ "epoch": 0.5073286052009456,
+ "grad_norm": 2.7415387630462646,
+ "learning_rate": 4.926675433103418e-06,
+ "loss": 0.5456,
+ "step": 1073
+ },
+ {
+ "epoch": 0.5078014184397163,
+ "grad_norm": 2.7545325756073,
+ "learning_rate": 4.926525380137993e-06,
+ "loss": 0.6213,
+ "step": 1074
+ },
+ {
+ "epoch": 0.508274231678487,
+ "grad_norm": 2.9153690338134766,
+ "learning_rate": 4.926375176083472e-06,
+ "loss": 0.6466,
+ "step": 1075
+ },
+ {
+ "epoch": 0.5087470449172576,
+ "grad_norm": 4.210638523101807,
+ "learning_rate": 4.926224820949209e-06,
+ "loss": 0.6192,
+ "step": 1076
+ },
+ {
+ "epoch": 0.5092198581560283,
+ "grad_norm": 2.4357898235321045,
+ "learning_rate": 4.926074314744565e-06,
+ "loss": 0.594,
+ "step": 1077
+ },
+ {
+ "epoch": 0.509692671394799,
+ "grad_norm": 2.8004701137542725,
+ "learning_rate": 4.92592365747891e-06,
+ "loss": 0.6276,
+ "step": 1078
+ },
+ {
+ "epoch": 0.5101654846335697,
+ "grad_norm": 2.920675039291382,
+ "learning_rate": 4.925772849161628e-06,
+ "loss": 0.6043,
+ "step": 1079
+ },
+ {
+ "epoch": 0.5106382978723404,
+ "grad_norm": 2.791555404663086,
+ "learning_rate": 4.9256218898021055e-06,
+ "loss": 0.6837,
+ "step": 1080
+ },
+ {
+ "epoch": 0.5111111111111111,
+ "grad_norm": 3.1702463626861572,
+ "learning_rate": 4.925470779409746e-06,
+ "loss": 0.668,
+ "step": 1081
+ },
+ {
+ "epoch": 0.5115839243498818,
+ "grad_norm": 2.7149479389190674,
+ "learning_rate": 4.925319517993955e-06,
+ "loss": 0.5842,
+ "step": 1082
+ },
+ {
+ "epoch": 0.5120567375886524,
+ "grad_norm": 2.916311025619507,
+ "learning_rate": 4.925168105564153e-06,
+ "loss": 0.6893,
+ "step": 1083
+ },
+ {
+ "epoch": 0.5125295508274231,
+ "grad_norm": 2.917654514312744,
+ "learning_rate": 4.925016542129767e-06,
+ "loss": 0.6513,
+ "step": 1084
+ },
+ {
+ "epoch": 0.5130023640661938,
+ "grad_norm": 2.5568928718566895,
+ "learning_rate": 4.924864827700234e-06,
+ "loss": 0.6177,
+ "step": 1085
+ },
+ {
+ "epoch": 0.5134751773049645,
+ "grad_norm": 2.816720485687256,
+ "learning_rate": 4.924712962285001e-06,
+ "loss": 0.5833,
+ "step": 1086
+ },
+ {
+ "epoch": 0.5139479905437352,
+ "grad_norm": 2.6989188194274902,
+ "learning_rate": 4.9245609458935235e-06,
+ "loss": 0.6332,
+ "step": 1087
+ },
+ {
+ "epoch": 0.5144208037825059,
+ "grad_norm": 2.959599494934082,
+ "learning_rate": 4.924408778535268e-06,
+ "loss": 0.626,
+ "step": 1088
+ },
+ {
+ "epoch": 0.5148936170212766,
+ "grad_norm": 2.872814416885376,
+ "learning_rate": 4.924256460219708e-06,
+ "loss": 0.6407,
+ "step": 1089
+ },
+ {
+ "epoch": 0.5153664302600472,
+ "grad_norm": 2.6989097595214844,
+ "learning_rate": 4.924103990956329e-06,
+ "loss": 0.6391,
+ "step": 1090
+ },
+ {
+ "epoch": 0.5158392434988179,
+ "grad_norm": 2.986492156982422,
+ "learning_rate": 4.9239513707546235e-06,
+ "loss": 0.6911,
+ "step": 1091
+ },
+ {
+ "epoch": 0.5163120567375886,
+ "grad_norm": 3.069920301437378,
+ "learning_rate": 4.9237985996240954e-06,
+ "loss": 0.671,
+ "step": 1092
+ },
+ {
+ "epoch": 0.5167848699763593,
+ "grad_norm": 2.8214917182922363,
+ "learning_rate": 4.9236456775742555e-06,
+ "loss": 0.5885,
+ "step": 1093
+ },
+ {
+ "epoch": 0.51725768321513,
+ "grad_norm": 2.9416961669921875,
+ "learning_rate": 4.923492604614627e-06,
+ "loss": 0.6293,
+ "step": 1094
+ },
+ {
+ "epoch": 0.5177304964539007,
+ "grad_norm": 2.761780023574829,
+ "learning_rate": 4.923339380754741e-06,
+ "loss": 0.649,
+ "step": 1095
+ },
+ {
+ "epoch": 0.5182033096926714,
+ "grad_norm": 2.7648792266845703,
+ "learning_rate": 4.923186006004138e-06,
+ "loss": 0.5906,
+ "step": 1096
+ },
+ {
+ "epoch": 0.518676122931442,
+ "grad_norm": 3.5535428524017334,
+ "learning_rate": 4.923032480372367e-06,
+ "loss": 0.7138,
+ "step": 1097
+ },
+ {
+ "epoch": 0.5191489361702127,
+ "grad_norm": 2.6252479553222656,
+ "learning_rate": 4.922878803868988e-06,
+ "loss": 0.5499,
+ "step": 1098
+ },
+ {
+ "epoch": 0.5196217494089834,
+ "grad_norm": 2.901002883911133,
+ "learning_rate": 4.9227249765035715e-06,
+ "loss": 0.6991,
+ "step": 1099
+ },
+ {
+ "epoch": 0.5200945626477541,
+ "grad_norm": 2.621877431869507,
+ "learning_rate": 4.9225709982856925e-06,
+ "loss": 0.6269,
+ "step": 1100
+ },
+ {
+ "epoch": 0.5205673758865248,
+ "grad_norm": 2.872483015060425,
+ "learning_rate": 4.92241686922494e-06,
+ "loss": 0.6657,
+ "step": 1101
+ },
+ {
+ "epoch": 0.5210401891252955,
+ "grad_norm": 2.730447769165039,
+ "learning_rate": 4.922262589330912e-06,
+ "loss": 0.6061,
+ "step": 1102
+ },
+ {
+ "epoch": 0.5215130023640662,
+ "grad_norm": 2.646247386932373,
+ "learning_rate": 4.922108158613213e-06,
+ "loss": 0.5923,
+ "step": 1103
+ },
+ {
+ "epoch": 0.5219858156028369,
+ "grad_norm": 2.6488895416259766,
+ "learning_rate": 4.92195357708146e-06,
+ "loss": 0.6293,
+ "step": 1104
+ },
+ {
+ "epoch": 0.5224586288416075,
+ "grad_norm": 2.756338357925415,
+ "learning_rate": 4.921798844745278e-06,
+ "loss": 0.6374,
+ "step": 1105
+ },
+ {
+ "epoch": 0.5229314420803782,
+ "grad_norm": 3.1441280841827393,
+ "learning_rate": 4.921643961614301e-06,
+ "loss": 0.6652,
+ "step": 1106
+ },
+ {
+ "epoch": 0.5234042553191489,
+ "grad_norm": 3.050002098083496,
+ "learning_rate": 4.921488927698172e-06,
+ "loss": 0.6809,
+ "step": 1107
+ },
+ {
+ "epoch": 0.5238770685579196,
+ "grad_norm": 2.71750807762146,
+ "learning_rate": 4.921333743006547e-06,
+ "loss": 0.6266,
+ "step": 1108
+ },
+ {
+ "epoch": 0.5243498817966903,
+ "grad_norm": 2.8439245223999023,
+ "learning_rate": 4.921178407549086e-06,
+ "loss": 0.5663,
+ "step": 1109
+ },
+ {
+ "epoch": 0.524822695035461,
+ "grad_norm": 3.0722241401672363,
+ "learning_rate": 4.921022921335464e-06,
+ "loss": 0.6791,
+ "step": 1110
+ },
+ {
+ "epoch": 0.5252955082742317,
+ "grad_norm": 3.4381656646728516,
+ "learning_rate": 4.920867284375358e-06,
+ "loss": 0.6687,
+ "step": 1111
+ },
+ {
+ "epoch": 0.5257683215130023,
+ "grad_norm": 2.819812774658203,
+ "learning_rate": 4.920711496678463e-06,
+ "loss": 0.6299,
+ "step": 1112
+ },
+ {
+ "epoch": 0.526241134751773,
+ "grad_norm": 3.6587414741516113,
+ "learning_rate": 4.9205555582544765e-06,
+ "loss": 0.7392,
+ "step": 1113
+ },
+ {
+ "epoch": 0.5267139479905437,
+ "grad_norm": 2.774296522140503,
+ "learning_rate": 4.920399469113109e-06,
+ "loss": 0.6652,
+ "step": 1114
+ },
+ {
+ "epoch": 0.5271867612293144,
+ "grad_norm": 2.7480580806732178,
+ "learning_rate": 4.920243229264081e-06,
+ "loss": 0.596,
+ "step": 1115
+ },
+ {
+ "epoch": 0.5276595744680851,
+ "grad_norm": 3.213057518005371,
+ "learning_rate": 4.920086838717119e-06,
+ "loss": 0.6986,
+ "step": 1116
+ },
+ {
+ "epoch": 0.5281323877068558,
+ "grad_norm": 2.940546989440918,
+ "learning_rate": 4.919930297481962e-06,
+ "loss": 0.6481,
+ "step": 1117
+ },
+ {
+ "epoch": 0.5286052009456265,
+ "grad_norm": 2.5970494747161865,
+ "learning_rate": 4.9197736055683555e-06,
+ "loss": 0.5658,
+ "step": 1118
+ },
+ {
+ "epoch": 0.5290780141843971,
+ "grad_norm": 4.49385404586792,
+ "learning_rate": 4.919616762986057e-06,
+ "loss": 0.605,
+ "step": 1119
+ },
+ {
+ "epoch": 0.5295508274231678,
+ "grad_norm": 2.971857786178589,
+ "learning_rate": 4.919459769744833e-06,
+ "loss": 0.6539,
+ "step": 1120
+ },
+ {
+ "epoch": 0.5300236406619385,
+ "grad_norm": 2.6192965507507324,
+ "learning_rate": 4.919302625854457e-06,
+ "loss": 0.6226,
+ "step": 1121
+ },
+ {
+ "epoch": 0.5304964539007092,
+ "grad_norm": 2.665088176727295,
+ "learning_rate": 4.919145331324716e-06,
+ "loss": 0.6647,
+ "step": 1122
+ },
+ {
+ "epoch": 0.5309692671394799,
+ "grad_norm": 2.612126111984253,
+ "learning_rate": 4.918987886165403e-06,
+ "loss": 0.6965,
+ "step": 1123
+ },
+ {
+ "epoch": 0.5314420803782506,
+ "grad_norm": 3.80017352104187,
+ "learning_rate": 4.9188302903863205e-06,
+ "loss": 0.7396,
+ "step": 1124
+ },
+ {
+ "epoch": 0.5319148936170213,
+ "grad_norm": 2.781752824783325,
+ "learning_rate": 4.918672543997282e-06,
+ "loss": 0.5985,
+ "step": 1125
+ },
+ {
+ "epoch": 0.532387706855792,
+ "grad_norm": 2.6067914962768555,
+ "learning_rate": 4.91851464700811e-06,
+ "loss": 0.6159,
+ "step": 1126
+ },
+ {
+ "epoch": 0.5328605200945626,
+ "grad_norm": 2.670807123184204,
+ "learning_rate": 4.918356599428636e-06,
+ "loss": 0.5958,
+ "step": 1127
+ },
+ {
+ "epoch": 0.5333333333333333,
+ "grad_norm": 2.608611822128296,
+ "learning_rate": 4.9181984012687e-06,
+ "loss": 0.5768,
+ "step": 1128
+ },
+ {
+ "epoch": 0.533806146572104,
+ "grad_norm": 2.586764097213745,
+ "learning_rate": 4.918040052538154e-06,
+ "loss": 0.661,
+ "step": 1129
+ },
+ {
+ "epoch": 0.5342789598108747,
+ "grad_norm": 3.1317451000213623,
+ "learning_rate": 4.917881553246856e-06,
+ "loss": 0.6626,
+ "step": 1130
+ },
+ {
+ "epoch": 0.5347517730496454,
+ "grad_norm": 2.7135281562805176,
+ "learning_rate": 4.917722903404676e-06,
+ "loss": 0.6572,
+ "step": 1131
+ },
+ {
+ "epoch": 0.5352245862884161,
+ "grad_norm": 3.4546358585357666,
+ "learning_rate": 4.917564103021493e-06,
+ "loss": 0.5597,
+ "step": 1132
+ },
+ {
+ "epoch": 0.5356973995271868,
+ "grad_norm": 3.0943493843078613,
+ "learning_rate": 4.917405152107193e-06,
+ "loss": 0.7258,
+ "step": 1133
+ },
+ {
+ "epoch": 0.5361702127659574,
+ "grad_norm": 2.6069352626800537,
+ "learning_rate": 4.917246050671674e-06,
+ "loss": 0.6209,
+ "step": 1134
+ },
+ {
+ "epoch": 0.5366430260047281,
+ "grad_norm": 2.584883689880371,
+ "learning_rate": 4.917086798724844e-06,
+ "loss": 0.658,
+ "step": 1135
+ },
+ {
+ "epoch": 0.5371158392434988,
+ "grad_norm": 3.001976490020752,
+ "learning_rate": 4.9169273962766166e-06,
+ "loss": 0.6306,
+ "step": 1136
+ },
+ {
+ "epoch": 0.5375886524822695,
+ "grad_norm": 2.5013928413391113,
+ "learning_rate": 4.916767843336918e-06,
+ "loss": 0.572,
+ "step": 1137
+ },
+ {
+ "epoch": 0.5380614657210402,
+ "grad_norm": 2.9114553928375244,
+ "learning_rate": 4.916608139915684e-06,
+ "loss": 0.5841,
+ "step": 1138
+ },
+ {
+ "epoch": 0.5385342789598109,
+ "grad_norm": 2.8878467082977295,
+ "learning_rate": 4.9164482860228564e-06,
+ "loss": 0.6654,
+ "step": 1139
+ },
+ {
+ "epoch": 0.5390070921985816,
+ "grad_norm": 2.9827866554260254,
+ "learning_rate": 4.91628828166839e-06,
+ "loss": 0.6674,
+ "step": 1140
+ },
+ {
+ "epoch": 0.5394799054373522,
+ "grad_norm": 3.8696281909942627,
+ "learning_rate": 4.916128126862248e-06,
+ "loss": 0.6241,
+ "step": 1141
+ },
+ {
+ "epoch": 0.5399527186761229,
+ "grad_norm": 2.9556291103363037,
+ "learning_rate": 4.915967821614402e-06,
+ "loss": 0.6478,
+ "step": 1142
+ },
+ {
+ "epoch": 0.5404255319148936,
+ "grad_norm": 2.392942428588867,
+ "learning_rate": 4.915807365934834e-06,
+ "loss": 0.6097,
+ "step": 1143
+ },
+ {
+ "epoch": 0.5408983451536643,
+ "grad_norm": 3.032235860824585,
+ "learning_rate": 4.915646759833534e-06,
+ "loss": 0.7193,
+ "step": 1144
+ },
+ {
+ "epoch": 0.541371158392435,
+ "grad_norm": 2.840416193008423,
+ "learning_rate": 4.915486003320501e-06,
+ "loss": 0.5506,
+ "step": 1145
+ },
+ {
+ "epoch": 0.5418439716312057,
+ "grad_norm": 2.5438895225524902,
+ "learning_rate": 4.915325096405747e-06,
+ "loss": 0.6487,
+ "step": 1146
+ },
+ {
+ "epoch": 0.5423167848699764,
+ "grad_norm": 2.544334650039673,
+ "learning_rate": 4.9151640390992905e-06,
+ "loss": 0.6168,
+ "step": 1147
+ },
+ {
+ "epoch": 0.542789598108747,
+ "grad_norm": 2.8535678386688232,
+ "learning_rate": 4.91500283141116e-06,
+ "loss": 0.678,
+ "step": 1148
+ },
+ {
+ "epoch": 0.5432624113475177,
+ "grad_norm": 2.8086955547332764,
+ "learning_rate": 4.9148414733513915e-06,
+ "loss": 0.6473,
+ "step": 1149
+ },
+ {
+ "epoch": 0.5437352245862884,
+ "grad_norm": 2.4709885120391846,
+ "learning_rate": 4.914679964930034e-06,
+ "loss": 0.6797,
+ "step": 1150
+ },
+ {
+ "epoch": 0.5442080378250591,
+ "grad_norm": 2.8546934127807617,
+ "learning_rate": 4.9145183061571435e-06,
+ "loss": 0.6247,
+ "step": 1151
+ },
+ {
+ "epoch": 0.5446808510638298,
+ "grad_norm": 2.991184711456299,
+ "learning_rate": 4.9143564970427844e-06,
+ "loss": 0.5977,
+ "step": 1152
+ },
+ {
+ "epoch": 0.5451536643026005,
+ "grad_norm": 3.011216402053833,
+ "learning_rate": 4.914194537597033e-06,
+ "loss": 0.7005,
+ "step": 1153
+ },
+ {
+ "epoch": 0.5456264775413712,
+ "grad_norm": 2.807521343231201,
+ "learning_rate": 4.9140324278299744e-06,
+ "loss": 0.5412,
+ "step": 1154
+ },
+ {
+ "epoch": 0.5460992907801419,
+ "grad_norm": 3.0401229858398438,
+ "learning_rate": 4.913870167751701e-06,
+ "loss": 0.6394,
+ "step": 1155
+ },
+ {
+ "epoch": 0.5465721040189125,
+ "grad_norm": 2.853914976119995,
+ "learning_rate": 4.913707757372317e-06,
+ "loss": 0.6745,
+ "step": 1156
+ },
+ {
+ "epoch": 0.5470449172576832,
+ "grad_norm": 4.505620956420898,
+ "learning_rate": 4.913545196701935e-06,
+ "loss": 0.6668,
+ "step": 1157
+ },
+ {
+ "epoch": 0.5475177304964539,
+ "grad_norm": 3.0505781173706055,
+ "learning_rate": 4.913382485750676e-06,
+ "loss": 0.6926,
+ "step": 1158
+ },
+ {
+ "epoch": 0.5479905437352246,
+ "grad_norm": 2.798435688018799,
+ "learning_rate": 4.913219624528672e-06,
+ "loss": 0.605,
+ "step": 1159
+ },
+ {
+ "epoch": 0.5484633569739953,
+ "grad_norm": 2.7814908027648926,
+ "learning_rate": 4.913056613046065e-06,
+ "loss": 0.6678,
+ "step": 1160
+ },
+ {
+ "epoch": 0.548936170212766,
+ "grad_norm": 3.2089321613311768,
+ "learning_rate": 4.9128934513130025e-06,
+ "loss": 0.5995,
+ "step": 1161
+ },
+ {
+ "epoch": 0.5494089834515367,
+ "grad_norm": 2.7699952125549316,
+ "learning_rate": 4.9127301393396455e-06,
+ "loss": 0.7062,
+ "step": 1162
+ },
+ {
+ "epoch": 0.5498817966903073,
+ "grad_norm": 2.859368324279785,
+ "learning_rate": 4.912566677136162e-06,
+ "loss": 0.6063,
+ "step": 1163
+ },
+ {
+ "epoch": 0.550354609929078,
+ "grad_norm": 2.727334499359131,
+ "learning_rate": 4.91240306471273e-06,
+ "loss": 0.6848,
+ "step": 1164
+ },
+ {
+ "epoch": 0.5508274231678487,
+ "grad_norm": 2.6017510890960693,
+ "learning_rate": 4.912239302079537e-06,
+ "loss": 0.5808,
+ "step": 1165
+ },
+ {
+ "epoch": 0.5513002364066194,
+ "grad_norm": 3.539583206176758,
+ "learning_rate": 4.912075389246781e-06,
+ "loss": 0.7053,
+ "step": 1166
+ },
+ {
+ "epoch": 0.5517730496453901,
+ "grad_norm": 2.918280601501465,
+ "learning_rate": 4.911911326224666e-06,
+ "loss": 0.5904,
+ "step": 1167
+ },
+ {
+ "epoch": 0.5522458628841608,
+ "grad_norm": 3.0067362785339355,
+ "learning_rate": 4.9117471130234095e-06,
+ "loss": 0.6392,
+ "step": 1168
+ },
+ {
+ "epoch": 0.5527186761229315,
+ "grad_norm": 2.4374797344207764,
+ "learning_rate": 4.911582749653236e-06,
+ "loss": 0.5793,
+ "step": 1169
+ },
+ {
+ "epoch": 0.5531914893617021,
+ "grad_norm": 3.121182918548584,
+ "learning_rate": 4.911418236124378e-06,
+ "loss": 0.6636,
+ "step": 1170
+ },
+ {
+ "epoch": 0.5536643026004728,
+ "grad_norm": 3.1289851665496826,
+ "learning_rate": 4.91125357244708e-06,
+ "loss": 0.656,
+ "step": 1171
+ },
+ {
+ "epoch": 0.5541371158392435,
+ "grad_norm": 2.7034592628479004,
+ "learning_rate": 4.911088758631596e-06,
+ "loss": 0.6001,
+ "step": 1172
+ },
+ {
+ "epoch": 0.5546099290780142,
+ "grad_norm": 2.710146188735962,
+ "learning_rate": 4.910923794688187e-06,
+ "loss": 0.6007,
+ "step": 1173
+ },
+ {
+ "epoch": 0.5550827423167849,
+ "grad_norm": 2.5424487590789795,
+ "learning_rate": 4.910758680627124e-06,
+ "loss": 0.5193,
+ "step": 1174
+ },
+ {
+ "epoch": 0.5555555555555556,
+ "grad_norm": 2.615893602371216,
+ "learning_rate": 4.91059341645869e-06,
+ "loss": 0.5525,
+ "step": 1175
+ },
+ {
+ "epoch": 0.5560283687943263,
+ "grad_norm": 3.3179728984832764,
+ "learning_rate": 4.910428002193174e-06,
+ "loss": 0.7285,
+ "step": 1176
+ },
+ {
+ "epoch": 0.556501182033097,
+ "grad_norm": 2.7234175205230713,
+ "learning_rate": 4.910262437840875e-06,
+ "loss": 0.574,
+ "step": 1177
+ },
+ {
+ "epoch": 0.5569739952718676,
+ "grad_norm": 3.0416605472564697,
+ "learning_rate": 4.9100967234121034e-06,
+ "loss": 0.5623,
+ "step": 1178
+ },
+ {
+ "epoch": 0.5574468085106383,
+ "grad_norm": 3.067786455154419,
+ "learning_rate": 4.909930858917177e-06,
+ "loss": 0.6491,
+ "step": 1179
+ },
+ {
+ "epoch": 0.557919621749409,
+ "grad_norm": 3.0037379264831543,
+ "learning_rate": 4.909764844366422e-06,
+ "loss": 0.5696,
+ "step": 1180
+ },
+ {
+ "epoch": 0.5583924349881797,
+ "grad_norm": 2.966179609298706,
+ "learning_rate": 4.909598679770178e-06,
+ "loss": 0.6042,
+ "step": 1181
+ },
+ {
+ "epoch": 0.5588652482269504,
+ "grad_norm": 2.6000657081604004,
+ "learning_rate": 4.909432365138789e-06,
+ "loss": 0.5883,
+ "step": 1182
+ },
+ {
+ "epoch": 0.5593380614657211,
+ "grad_norm": 2.6794495582580566,
+ "learning_rate": 4.909265900482612e-06,
+ "loss": 0.6809,
+ "step": 1183
+ },
+ {
+ "epoch": 0.5598108747044918,
+ "grad_norm": 2.6765122413635254,
+ "learning_rate": 4.9090992858120115e-06,
+ "loss": 0.6601,
+ "step": 1184
+ },
+ {
+ "epoch": 0.5602836879432624,
+ "grad_norm": 2.6051928997039795,
+ "learning_rate": 4.908932521137363e-06,
+ "loss": 0.5946,
+ "step": 1185
+ },
+ {
+ "epoch": 0.5607565011820331,
+ "grad_norm": 3.0405542850494385,
+ "learning_rate": 4.908765606469048e-06,
+ "loss": 0.6998,
+ "step": 1186
+ },
+ {
+ "epoch": 0.5612293144208038,
+ "grad_norm": 2.7975668907165527,
+ "learning_rate": 4.908598541817462e-06,
+ "loss": 0.6218,
+ "step": 1187
+ },
+ {
+ "epoch": 0.5617021276595745,
+ "grad_norm": 2.5367627143859863,
+ "learning_rate": 4.908431327193005e-06,
+ "loss": 0.6354,
+ "step": 1188
+ },
+ {
+ "epoch": 0.5621749408983452,
+ "grad_norm": 3.7939631938934326,
+ "learning_rate": 4.908263962606091e-06,
+ "loss": 0.6376,
+ "step": 1189
+ },
+ {
+ "epoch": 0.5626477541371159,
+ "grad_norm": 2.864079475402832,
+ "learning_rate": 4.908096448067139e-06,
+ "loss": 0.5485,
+ "step": 1190
+ },
+ {
+ "epoch": 0.5631205673758866,
+ "grad_norm": 2.7855563163757324,
+ "learning_rate": 4.9079287835865804e-06,
+ "loss": 0.6645,
+ "step": 1191
+ },
+ {
+ "epoch": 0.5635933806146572,
+ "grad_norm": 2.6156625747680664,
+ "learning_rate": 4.9077609691748556e-06,
+ "loss": 0.5751,
+ "step": 1192
+ },
+ {
+ "epoch": 0.5640661938534279,
+ "grad_norm": 3.0475659370422363,
+ "learning_rate": 4.907593004842412e-06,
+ "loss": 0.6739,
+ "step": 1193
+ },
+ {
+ "epoch": 0.5645390070921986,
+ "grad_norm": 2.9176738262176514,
+ "learning_rate": 4.9074248905997104e-06,
+ "loss": 0.6493,
+ "step": 1194
+ },
+ {
+ "epoch": 0.5650118203309693,
+ "grad_norm": 2.6168384552001953,
+ "learning_rate": 4.907256626457216e-06,
+ "loss": 0.6154,
+ "step": 1195
+ },
+ {
+ "epoch": 0.56548463356974,
+ "grad_norm": 2.893980026245117,
+ "learning_rate": 4.907088212425408e-06,
+ "loss": 0.5808,
+ "step": 1196
+ },
+ {
+ "epoch": 0.5659574468085107,
+ "grad_norm": 3.3832836151123047,
+ "learning_rate": 4.90691964851477e-06,
+ "loss": 0.7888,
+ "step": 1197
+ },
+ {
+ "epoch": 0.5664302600472814,
+ "grad_norm": 3.088932752609253,
+ "learning_rate": 4.906750934735801e-06,
+ "loss": 0.6516,
+ "step": 1198
+ },
+ {
+ "epoch": 0.566903073286052,
+ "grad_norm": 2.494471549987793,
+ "learning_rate": 4.906582071099004e-06,
+ "loss": 0.6286,
+ "step": 1199
+ },
+ {
+ "epoch": 0.5673758865248227,
+ "grad_norm": 2.716550588607788,
+ "learning_rate": 4.906413057614895e-06,
+ "loss": 0.5939,
+ "step": 1200
+ },
+ {
+ "epoch": 0.5678486997635934,
+ "grad_norm": 2.5821073055267334,
+ "learning_rate": 4.906243894293995e-06,
+ "loss": 0.6668,
+ "step": 1201
+ },
+ {
+ "epoch": 0.5683215130023641,
+ "grad_norm": 3.651787042617798,
+ "learning_rate": 4.90607458114684e-06,
+ "loss": 0.6124,
+ "step": 1202
+ },
+ {
+ "epoch": 0.5687943262411348,
+ "grad_norm": 2.7567858695983887,
+ "learning_rate": 4.9059051181839705e-06,
+ "loss": 0.6656,
+ "step": 1203
+ },
+ {
+ "epoch": 0.5692671394799055,
+ "grad_norm": 2.8067586421966553,
+ "learning_rate": 4.90573550541594e-06,
+ "loss": 0.6306,
+ "step": 1204
+ },
+ {
+ "epoch": 0.5697399527186762,
+ "grad_norm": 2.6136393547058105,
+ "learning_rate": 4.905565742853307e-06,
+ "loss": 0.5992,
+ "step": 1205
+ },
+ {
+ "epoch": 0.5702127659574469,
+ "grad_norm": 2.899049758911133,
+ "learning_rate": 4.905395830506644e-06,
+ "loss": 0.621,
+ "step": 1206
+ },
+ {
+ "epoch": 0.5706855791962175,
+ "grad_norm": 3.036583185195923,
+ "learning_rate": 4.9052257683865294e-06,
+ "loss": 0.652,
+ "step": 1207
+ },
+ {
+ "epoch": 0.5711583924349882,
+ "grad_norm": 2.7947216033935547,
+ "learning_rate": 4.905055556503553e-06,
+ "loss": 0.6636,
+ "step": 1208
+ },
+ {
+ "epoch": 0.5716312056737589,
+ "grad_norm": 3.1646955013275146,
+ "learning_rate": 4.9048851948683135e-06,
+ "loss": 0.6376,
+ "step": 1209
+ },
+ {
+ "epoch": 0.5721040189125296,
+ "grad_norm": 2.8175766468048096,
+ "learning_rate": 4.904714683491417e-06,
+ "loss": 0.5929,
+ "step": 1210
+ },
+ {
+ "epoch": 0.5725768321513003,
+ "grad_norm": 2.923923969268799,
+ "learning_rate": 4.904544022383483e-06,
+ "loss": 0.6633,
+ "step": 1211
+ },
+ {
+ "epoch": 0.573049645390071,
+ "grad_norm": 2.7471134662628174,
+ "learning_rate": 4.9043732115551356e-06,
+ "loss": 0.6551,
+ "step": 1212
+ },
+ {
+ "epoch": 0.5735224586288417,
+ "grad_norm": 2.8660807609558105,
+ "learning_rate": 4.90420225101701e-06,
+ "loss": 0.6423,
+ "step": 1213
+ },
+ {
+ "epoch": 0.5739952718676123,
+ "grad_norm": 2.769247531890869,
+ "learning_rate": 4.904031140779754e-06,
+ "loss": 0.5982,
+ "step": 1214
+ },
+ {
+ "epoch": 0.574468085106383,
+ "grad_norm": 2.9043145179748535,
+ "learning_rate": 4.90385988085402e-06,
+ "loss": 0.5843,
+ "step": 1215
+ },
+ {
+ "epoch": 0.5749408983451537,
+ "grad_norm": 2.6639609336853027,
+ "learning_rate": 4.903688471250471e-06,
+ "loss": 0.5858,
+ "step": 1216
+ },
+ {
+ "epoch": 0.5754137115839244,
+ "grad_norm": 2.6967573165893555,
+ "learning_rate": 4.903516911979781e-06,
+ "loss": 0.5755,
+ "step": 1217
+ },
+ {
+ "epoch": 0.5758865248226951,
+ "grad_norm": 2.8865857124328613,
+ "learning_rate": 4.903345203052633e-06,
+ "loss": 0.6051,
+ "step": 1218
+ },
+ {
+ "epoch": 0.5763593380614658,
+ "grad_norm": 2.381979465484619,
+ "learning_rate": 4.903173344479717e-06,
+ "loss": 0.5727,
+ "step": 1219
+ },
+ {
+ "epoch": 0.5768321513002365,
+ "grad_norm": 2.7717981338500977,
+ "learning_rate": 4.903001336271734e-06,
+ "loss": 0.6406,
+ "step": 1220
+ },
+ {
+ "epoch": 0.577304964539007,
+ "grad_norm": 2.6431570053100586,
+ "learning_rate": 4.902829178439395e-06,
+ "loss": 0.6226,
+ "step": 1221
+ },
+ {
+ "epoch": 0.5777777777777777,
+ "grad_norm": 2.8090415000915527,
+ "learning_rate": 4.902656870993419e-06,
+ "loss": 0.5761,
+ "step": 1222
+ },
+ {
+ "epoch": 0.5782505910165484,
+ "grad_norm": 2.4769368171691895,
+ "learning_rate": 4.902484413944535e-06,
+ "loss": 0.5602,
+ "step": 1223
+ },
+ {
+ "epoch": 0.5787234042553191,
+ "grad_norm": 2.693316698074341,
+ "learning_rate": 4.902311807303481e-06,
+ "loss": 0.5222,
+ "step": 1224
+ },
+ {
+ "epoch": 0.5791962174940898,
+ "grad_norm": 2.7623913288116455,
+ "learning_rate": 4.902139051081004e-06,
+ "loss": 0.6978,
+ "step": 1225
+ },
+ {
+ "epoch": 0.5796690307328605,
+ "grad_norm": 2.6133766174316406,
+ "learning_rate": 4.901966145287863e-06,
+ "loss": 0.5802,
+ "step": 1226
+ },
+ {
+ "epoch": 0.5801418439716312,
+ "grad_norm": 2.7345972061157227,
+ "learning_rate": 4.901793089934821e-06,
+ "loss": 0.6294,
+ "step": 1227
+ },
+ {
+ "epoch": 0.5806146572104018,
+ "grad_norm": 2.7545835971832275,
+ "learning_rate": 4.9016198850326555e-06,
+ "loss": 0.6085,
+ "step": 1228
+ },
+ {
+ "epoch": 0.5810874704491725,
+ "grad_norm": 2.6947758197784424,
+ "learning_rate": 4.90144653059215e-06,
+ "loss": 0.6025,
+ "step": 1229
+ },
+ {
+ "epoch": 0.5815602836879432,
+ "grad_norm": 2.692967414855957,
+ "learning_rate": 4.901273026624099e-06,
+ "loss": 0.5715,
+ "step": 1230
+ },
+ {
+ "epoch": 0.5820330969267139,
+ "grad_norm": 2.78347110748291,
+ "learning_rate": 4.901099373139307e-06,
+ "loss": 0.6063,
+ "step": 1231
+ },
+ {
+ "epoch": 0.5825059101654846,
+ "grad_norm": 2.346496343612671,
+ "learning_rate": 4.900925570148585e-06,
+ "loss": 0.5869,
+ "step": 1232
+ },
+ {
+ "epoch": 0.5829787234042553,
+ "grad_norm": 2.606639862060547,
+ "learning_rate": 4.900751617662755e-06,
+ "loss": 0.6197,
+ "step": 1233
+ },
+ {
+ "epoch": 0.583451536643026,
+ "grad_norm": 2.5825929641723633,
+ "learning_rate": 4.900577515692649e-06,
+ "loss": 0.6721,
+ "step": 1234
+ },
+ {
+ "epoch": 0.5839243498817966,
+ "grad_norm": 2.731349468231201,
+ "learning_rate": 4.900403264249107e-06,
+ "loss": 0.6273,
+ "step": 1235
+ },
+ {
+ "epoch": 0.5843971631205673,
+ "grad_norm": 3.2133874893188477,
+ "learning_rate": 4.90022886334298e-06,
+ "loss": 0.6231,
+ "step": 1236
+ },
+ {
+ "epoch": 0.584869976359338,
+ "grad_norm": 2.9213852882385254,
+ "learning_rate": 4.900054312985127e-06,
+ "loss": 0.6677,
+ "step": 1237
+ },
+ {
+ "epoch": 0.5853427895981087,
+ "grad_norm": 2.815425157546997,
+ "learning_rate": 4.899879613186414e-06,
+ "loss": 0.6405,
+ "step": 1238
+ },
+ {
+ "epoch": 0.5858156028368794,
+ "grad_norm": 2.730782985687256,
+ "learning_rate": 4.899704763957721e-06,
+ "loss": 0.6233,
+ "step": 1239
+ },
+ {
+ "epoch": 0.5862884160756501,
+ "grad_norm": 2.6432766914367676,
+ "learning_rate": 4.899529765309936e-06,
+ "loss": 0.6267,
+ "step": 1240
+ },
+ {
+ "epoch": 0.5867612293144208,
+ "grad_norm": 2.616215229034424,
+ "learning_rate": 4.899354617253953e-06,
+ "loss": 0.6268,
+ "step": 1241
+ },
+ {
+ "epoch": 0.5872340425531914,
+ "grad_norm": 2.7630255222320557,
+ "learning_rate": 4.899179319800679e-06,
+ "loss": 0.6348,
+ "step": 1242
+ },
+ {
+ "epoch": 0.5877068557919621,
+ "grad_norm": 2.785095453262329,
+ "learning_rate": 4.899003872961029e-06,
+ "loss": 0.5839,
+ "step": 1243
+ },
+ {
+ "epoch": 0.5881796690307328,
+ "grad_norm": 2.9050328731536865,
+ "learning_rate": 4.898828276745927e-06,
+ "loss": 0.651,
+ "step": 1244
+ },
+ {
+ "epoch": 0.5886524822695035,
+ "grad_norm": 2.958092212677002,
+ "learning_rate": 4.8986525311663065e-06,
+ "loss": 0.6395,
+ "step": 1245
+ },
+ {
+ "epoch": 0.5891252955082742,
+ "grad_norm": 2.952310800552368,
+ "learning_rate": 4.898476636233111e-06,
+ "loss": 0.6731,
+ "step": 1246
+ },
+ {
+ "epoch": 0.5895981087470449,
+ "grad_norm": 2.9876346588134766,
+ "learning_rate": 4.898300591957293e-06,
+ "loss": 0.7015,
+ "step": 1247
+ },
+ {
+ "epoch": 0.5900709219858156,
+ "grad_norm": 2.8941752910614014,
+ "learning_rate": 4.898124398349813e-06,
+ "loss": 0.6452,
+ "step": 1248
+ },
+ {
+ "epoch": 0.5905437352245863,
+ "grad_norm": 2.9809536933898926,
+ "learning_rate": 4.897948055421642e-06,
+ "loss": 0.5736,
+ "step": 1249
+ },
+ {
+ "epoch": 0.5910165484633569,
+ "grad_norm": 2.927046775817871,
+ "learning_rate": 4.897771563183761e-06,
+ "loss": 0.5918,
+ "step": 1250
+ },
+ {
+ "epoch": 0.5914893617021276,
+ "grad_norm": 2.865020275115967,
+ "learning_rate": 4.897594921647158e-06,
+ "loss": 0.6924,
+ "step": 1251
+ },
+ {
+ "epoch": 0.5919621749408983,
+ "grad_norm": 2.7406699657440186,
+ "learning_rate": 4.897418130822832e-06,
+ "loss": 0.509,
+ "step": 1252
+ },
+ {
+ "epoch": 0.592434988179669,
+ "grad_norm": 2.781606912612915,
+ "learning_rate": 4.897241190721791e-06,
+ "loss": 0.5555,
+ "step": 1253
+ },
+ {
+ "epoch": 0.5929078014184397,
+ "grad_norm": 2.79209303855896,
+ "learning_rate": 4.8970641013550535e-06,
+ "loss": 0.6722,
+ "step": 1254
+ },
+ {
+ "epoch": 0.5933806146572104,
+ "grad_norm": 3.0672268867492676,
+ "learning_rate": 4.896886862733645e-06,
+ "loss": 0.6366,
+ "step": 1255
+ },
+ {
+ "epoch": 0.5938534278959811,
+ "grad_norm": 2.7456953525543213,
+ "learning_rate": 4.896709474868602e-06,
+ "loss": 0.6246,
+ "step": 1256
+ },
+ {
+ "epoch": 0.5943262411347517,
+ "grad_norm": 3.6731202602386475,
+ "learning_rate": 4.896531937770968e-06,
+ "loss": 0.668,
+ "step": 1257
+ },
+ {
+ "epoch": 0.5947990543735224,
+ "grad_norm": 2.6056087017059326,
+ "learning_rate": 4.8963542514518e-06,
+ "loss": 0.5815,
+ "step": 1258
+ },
+ {
+ "epoch": 0.5952718676122931,
+ "grad_norm": 2.719698905944824,
+ "learning_rate": 4.89617641592216e-06,
+ "loss": 0.6058,
+ "step": 1259
+ },
+ {
+ "epoch": 0.5957446808510638,
+ "grad_norm": 2.625838279724121,
+ "learning_rate": 4.895998431193121e-06,
+ "loss": 0.6143,
+ "step": 1260
+ },
+ {
+ "epoch": 0.5962174940898345,
+ "grad_norm": 2.7166085243225098,
+ "learning_rate": 4.895820297275767e-06,
+ "loss": 0.5187,
+ "step": 1261
+ },
+ {
+ "epoch": 0.5966903073286052,
+ "grad_norm": 2.7544102668762207,
+ "learning_rate": 4.8956420141811875e-06,
+ "loss": 0.5928,
+ "step": 1262
+ },
+ {
+ "epoch": 0.5971631205673759,
+ "grad_norm": 2.6678333282470703,
+ "learning_rate": 4.895463581920484e-06,
+ "loss": 0.611,
+ "step": 1263
+ },
+ {
+ "epoch": 0.5976359338061465,
+ "grad_norm": 2.853384494781494,
+ "learning_rate": 4.895285000504768e-06,
+ "loss": 0.642,
+ "step": 1264
+ },
+ {
+ "epoch": 0.5981087470449172,
+ "grad_norm": 2.637852430343628,
+ "learning_rate": 4.895106269945158e-06,
+ "loss": 0.6308,
+ "step": 1265
+ },
+ {
+ "epoch": 0.5985815602836879,
+ "grad_norm": 2.9880387783050537,
+ "learning_rate": 4.8949273902527826e-06,
+ "loss": 0.5781,
+ "step": 1266
+ },
+ {
+ "epoch": 0.5990543735224586,
+ "grad_norm": 3.5984015464782715,
+ "learning_rate": 4.89474836143878e-06,
+ "loss": 0.5865,
+ "step": 1267
+ },
+ {
+ "epoch": 0.5995271867612293,
+ "grad_norm": 2.719855546951294,
+ "learning_rate": 4.8945691835142975e-06,
+ "loss": 0.6393,
+ "step": 1268
+ },
+ {
+ "epoch": 0.6,
+ "grad_norm": 2.7885141372680664,
+ "learning_rate": 4.894389856490492e-06,
+ "loss": 0.66,
+ "step": 1269
+ },
+ {
+ "epoch": 0.6004728132387707,
+ "grad_norm": 2.698819875717163,
+ "learning_rate": 4.894210380378529e-06,
+ "loss": 0.6144,
+ "step": 1270
+ },
+ {
+ "epoch": 0.6009456264775414,
+ "grad_norm": 2.278045654296875,
+ "learning_rate": 4.894030755189584e-06,
+ "loss": 0.5609,
+ "step": 1271
+ },
+ {
+ "epoch": 0.601418439716312,
+ "grad_norm": 2.8729357719421387,
+ "learning_rate": 4.893850980934841e-06,
+ "loss": 0.6715,
+ "step": 1272
+ },
+ {
+ "epoch": 0.6018912529550827,
+ "grad_norm": 2.8541221618652344,
+ "learning_rate": 4.893671057625495e-06,
+ "loss": 0.6787,
+ "step": 1273
+ },
+ {
+ "epoch": 0.6023640661938534,
+ "grad_norm": 2.4561476707458496,
+ "learning_rate": 4.893490985272748e-06,
+ "loss": 0.6331,
+ "step": 1274
+ },
+ {
+ "epoch": 0.6028368794326241,
+ "grad_norm": 2.565739154815674,
+ "learning_rate": 4.893310763887812e-06,
+ "loss": 0.587,
+ "step": 1275
+ },
+ {
+ "epoch": 0.6033096926713948,
+ "grad_norm": 2.384951591491699,
+ "learning_rate": 4.8931303934819095e-06,
+ "loss": 0.5358,
+ "step": 1276
+ },
+ {
+ "epoch": 0.6037825059101655,
+ "grad_norm": 2.380808115005493,
+ "learning_rate": 4.89294987406627e-06,
+ "loss": 0.5402,
+ "step": 1277
+ },
+ {
+ "epoch": 0.6042553191489362,
+ "grad_norm": 2.764815092086792,
+ "learning_rate": 4.892769205652136e-06,
+ "loss": 0.6103,
+ "step": 1278
+ },
+ {
+ "epoch": 0.6047281323877068,
+ "grad_norm": 2.463350296020508,
+ "learning_rate": 4.892588388250754e-06,
+ "loss": 0.5937,
+ "step": 1279
+ },
+ {
+ "epoch": 0.6052009456264775,
+ "grad_norm": 3.099689245223999,
+ "learning_rate": 4.8924074218733855e-06,
+ "loss": 0.6354,
+ "step": 1280
+ },
+ {
+ "epoch": 0.6056737588652482,
+ "grad_norm": 2.804450035095215,
+ "learning_rate": 4.892226306531297e-06,
+ "loss": 0.6595,
+ "step": 1281
+ },
+ {
+ "epoch": 0.6061465721040189,
+ "grad_norm": 3.1559767723083496,
+ "learning_rate": 4.892045042235765e-06,
+ "loss": 0.6664,
+ "step": 1282
+ },
+ {
+ "epoch": 0.6066193853427896,
+ "grad_norm": 2.844341993331909,
+ "learning_rate": 4.891863628998079e-06,
+ "loss": 0.7454,
+ "step": 1283
+ },
+ {
+ "epoch": 0.6070921985815603,
+ "grad_norm": 2.686602830886841,
+ "learning_rate": 4.891682066829532e-06,
+ "loss": 0.6755,
+ "step": 1284
+ },
+ {
+ "epoch": 0.607565011820331,
+ "grad_norm": 2.736457347869873,
+ "learning_rate": 4.8915003557414285e-06,
+ "loss": 0.6305,
+ "step": 1285
+ },
+ {
+ "epoch": 0.6080378250591016,
+ "grad_norm": 2.661362409591675,
+ "learning_rate": 4.891318495745086e-06,
+ "loss": 0.5958,
+ "step": 1286
+ },
+ {
+ "epoch": 0.6085106382978723,
+ "grad_norm": 2.707348108291626,
+ "learning_rate": 4.8911364868518255e-06,
+ "loss": 0.5824,
+ "step": 1287
+ },
+ {
+ "epoch": 0.608983451536643,
+ "grad_norm": 2.9798858165740967,
+ "learning_rate": 4.890954329072981e-06,
+ "loss": 0.5981,
+ "step": 1288
+ },
+ {
+ "epoch": 0.6094562647754137,
+ "grad_norm": 2.6285455226898193,
+ "learning_rate": 4.890772022419895e-06,
+ "loss": 0.6194,
+ "step": 1289
+ },
+ {
+ "epoch": 0.6099290780141844,
+ "grad_norm": 2.9254322052001953,
+ "learning_rate": 4.890589566903917e-06,
+ "loss": 0.6002,
+ "step": 1290
+ },
+ {
+ "epoch": 0.6104018912529551,
+ "grad_norm": 2.6458325386047363,
+ "learning_rate": 4.89040696253641e-06,
+ "loss": 0.5457,
+ "step": 1291
+ },
+ {
+ "epoch": 0.6108747044917258,
+ "grad_norm": 2.508242607116699,
+ "learning_rate": 4.890224209328743e-06,
+ "loss": 0.6168,
+ "step": 1292
+ },
+ {
+ "epoch": 0.6113475177304964,
+ "grad_norm": 3.034785509109497,
+ "learning_rate": 4.890041307292296e-06,
+ "loss": 0.664,
+ "step": 1293
+ },
+ {
+ "epoch": 0.6118203309692671,
+ "grad_norm": 3.52469539642334,
+ "learning_rate": 4.889858256438455e-06,
+ "loss": 0.7301,
+ "step": 1294
+ },
+ {
+ "epoch": 0.6122931442080378,
+ "grad_norm": 2.9145348072052,
+ "learning_rate": 4.889675056778622e-06,
+ "loss": 0.6494,
+ "step": 1295
+ },
+ {
+ "epoch": 0.6127659574468085,
+ "grad_norm": 2.831829071044922,
+ "learning_rate": 4.8894917083242e-06,
+ "loss": 0.6064,
+ "step": 1296
+ },
+ {
+ "epoch": 0.6132387706855792,
+ "grad_norm": 2.6883130073547363,
+ "learning_rate": 4.889308211086608e-06,
+ "loss": 0.5642,
+ "step": 1297
+ },
+ {
+ "epoch": 0.6137115839243499,
+ "grad_norm": 3.0605485439300537,
+ "learning_rate": 4.889124565077269e-06,
+ "loss": 0.6695,
+ "step": 1298
+ },
+ {
+ "epoch": 0.6141843971631206,
+ "grad_norm": 3.44062876701355,
+ "learning_rate": 4.88894077030762e-06,
+ "loss": 0.6415,
+ "step": 1299
+ },
+ {
+ "epoch": 0.6146572104018913,
+ "grad_norm": 2.5970818996429443,
+ "learning_rate": 4.888756826789105e-06,
+ "loss": 0.6518,
+ "step": 1300
+ },
+ {
+ "epoch": 0.6151300236406619,
+ "grad_norm": 4.2233567237854,
+ "learning_rate": 4.8885727345331755e-06,
+ "loss": 0.6555,
+ "step": 1301
+ },
+ {
+ "epoch": 0.6156028368794326,
+ "grad_norm": 2.645385503768921,
+ "learning_rate": 4.888388493551297e-06,
+ "loss": 0.6762,
+ "step": 1302
+ },
+ {
+ "epoch": 0.6160756501182033,
+ "grad_norm": 2.907954454421997,
+ "learning_rate": 4.8882041038549385e-06,
+ "loss": 0.6526,
+ "step": 1303
+ },
+ {
+ "epoch": 0.616548463356974,
+ "grad_norm": 2.482771873474121,
+ "learning_rate": 4.888019565455583e-06,
+ "loss": 0.628,
+ "step": 1304
+ },
+ {
+ "epoch": 0.6170212765957447,
+ "grad_norm": 2.7165915966033936,
+ "learning_rate": 4.88783487836472e-06,
+ "loss": 0.5743,
+ "step": 1305
+ },
+ {
+ "epoch": 0.6174940898345154,
+ "grad_norm": 3.095627546310425,
+ "learning_rate": 4.88765004259385e-06,
+ "loss": 0.627,
+ "step": 1306
+ },
+ {
+ "epoch": 0.6179669030732861,
+ "grad_norm": 2.5018465518951416,
+ "learning_rate": 4.8874650581544805e-06,
+ "loss": 0.5215,
+ "step": 1307
+ },
+ {
+ "epoch": 0.6184397163120567,
+ "grad_norm": 3.094337224960327,
+ "learning_rate": 4.8872799250581316e-06,
+ "loss": 0.6979,
+ "step": 1308
+ },
+ {
+ "epoch": 0.6189125295508274,
+ "grad_norm": 3.1002209186553955,
+ "learning_rate": 4.887094643316329e-06,
+ "loss": 0.6565,
+ "step": 1309
+ },
+ {
+ "epoch": 0.6193853427895981,
+ "grad_norm": 2.551431894302368,
+ "learning_rate": 4.88690921294061e-06,
+ "loss": 0.5748,
+ "step": 1310
+ },
+ {
+ "epoch": 0.6198581560283688,
+ "grad_norm": 2.8282904624938965,
+ "learning_rate": 4.886723633942521e-06,
+ "loss": 0.676,
+ "step": 1311
+ },
+ {
+ "epoch": 0.6203309692671395,
+ "grad_norm": 2.8887810707092285,
+ "learning_rate": 4.886537906333617e-06,
+ "loss": 0.5971,
+ "step": 1312
+ },
+ {
+ "epoch": 0.6208037825059102,
+ "grad_norm": 2.9989118576049805,
+ "learning_rate": 4.886352030125462e-06,
+ "loss": 0.6341,
+ "step": 1313
+ },
+ {
+ "epoch": 0.6212765957446809,
+ "grad_norm": 2.8042776584625244,
+ "learning_rate": 4.886166005329629e-06,
+ "loss": 0.6578,
+ "step": 1314
+ },
+ {
+ "epoch": 0.6217494089834515,
+ "grad_norm": 2.4980967044830322,
+ "learning_rate": 4.8859798319577026e-06,
+ "loss": 0.6711,
+ "step": 1315
+ },
+ {
+ "epoch": 0.6222222222222222,
+ "grad_norm": 2.762369155883789,
+ "learning_rate": 4.885793510021274e-06,
+ "loss": 0.5747,
+ "step": 1316
+ },
+ {
+ "epoch": 0.6226950354609929,
+ "grad_norm": 3.136327028274536,
+ "learning_rate": 4.885607039531945e-06,
+ "loss": 0.7544,
+ "step": 1317
+ },
+ {
+ "epoch": 0.6231678486997636,
+ "grad_norm": 2.8736963272094727,
+ "learning_rate": 4.885420420501327e-06,
+ "loss": 0.6603,
+ "step": 1318
+ },
+ {
+ "epoch": 0.6236406619385343,
+ "grad_norm": 2.766237497329712,
+ "learning_rate": 4.885233652941039e-06,
+ "loss": 0.581,
+ "step": 1319
+ },
+ {
+ "epoch": 0.624113475177305,
+ "grad_norm": 2.4740939140319824,
+ "learning_rate": 4.88504673686271e-06,
+ "loss": 0.6335,
+ "step": 1320
+ },
+ {
+ "epoch": 0.6245862884160757,
+ "grad_norm": 3.324795961380005,
+ "learning_rate": 4.884859672277978e-06,
+ "loss": 0.6019,
+ "step": 1321
+ },
+ {
+ "epoch": 0.6250591016548463,
+ "grad_norm": 3.521327257156372,
+ "learning_rate": 4.884672459198493e-06,
+ "loss": 0.6104,
+ "step": 1322
+ },
+ {
+ "epoch": 0.625531914893617,
+ "grad_norm": 2.7728071212768555,
+ "learning_rate": 4.884485097635909e-06,
+ "loss": 0.6714,
+ "step": 1323
+ },
+ {
+ "epoch": 0.6260047281323877,
+ "grad_norm": 3.0738155841827393,
+ "learning_rate": 4.884297587601895e-06,
+ "loss": 0.604,
+ "step": 1324
+ },
+ {
+ "epoch": 0.6264775413711584,
+ "grad_norm": 2.719240427017212,
+ "learning_rate": 4.884109929108124e-06,
+ "loss": 0.6795,
+ "step": 1325
+ },
+ {
+ "epoch": 0.6269503546099291,
+ "grad_norm": 2.4108200073242188,
+ "learning_rate": 4.883922122166282e-06,
+ "loss": 0.5846,
+ "step": 1326
+ },
+ {
+ "epoch": 0.6274231678486998,
+ "grad_norm": 2.393899917602539,
+ "learning_rate": 4.883734166788063e-06,
+ "loss": 0.6188,
+ "step": 1327
+ },
+ {
+ "epoch": 0.6278959810874705,
+ "grad_norm": 4.555255889892578,
+ "learning_rate": 4.883546062985169e-06,
+ "loss": 0.5962,
+ "step": 1328
+ },
+ {
+ "epoch": 0.6283687943262412,
+ "grad_norm": 2.571075439453125,
+ "learning_rate": 4.883357810769315e-06,
+ "loss": 0.6165,
+ "step": 1329
+ },
+ {
+ "epoch": 0.6288416075650118,
+ "grad_norm": 2.553115129470825,
+ "learning_rate": 4.8831694101522185e-06,
+ "loss": 0.6787,
+ "step": 1330
+ },
+ {
+ "epoch": 0.6293144208037825,
+ "grad_norm": 3.2564642429351807,
+ "learning_rate": 4.882980861145614e-06,
+ "loss": 0.659,
+ "step": 1331
+ },
+ {
+ "epoch": 0.6297872340425532,
+ "grad_norm": 2.535216808319092,
+ "learning_rate": 4.882792163761241e-06,
+ "loss": 0.6176,
+ "step": 1332
+ },
+ {
+ "epoch": 0.6302600472813239,
+ "grad_norm": 3.097921848297119,
+ "learning_rate": 4.882603318010847e-06,
+ "loss": 0.6822,
+ "step": 1333
+ },
+ {
+ "epoch": 0.6307328605200946,
+ "grad_norm": 2.8135175704956055,
+ "learning_rate": 4.882414323906192e-06,
+ "loss": 0.6782,
+ "step": 1334
+ },
+ {
+ "epoch": 0.6312056737588653,
+ "grad_norm": 2.724634885787964,
+ "learning_rate": 4.882225181459044e-06,
+ "loss": 0.6545,
+ "step": 1335
+ },
+ {
+ "epoch": 0.631678486997636,
+ "grad_norm": 2.9585227966308594,
+ "learning_rate": 4.882035890681179e-06,
+ "loss": 0.6218,
+ "step": 1336
+ },
+ {
+ "epoch": 0.6321513002364066,
+ "grad_norm": 2.6952011585235596,
+ "learning_rate": 4.881846451584385e-06,
+ "loss": 0.6,
+ "step": 1337
+ },
+ {
+ "epoch": 0.6326241134751773,
+ "grad_norm": 3.1400704383850098,
+ "learning_rate": 4.881656864180455e-06,
+ "loss": 0.6687,
+ "step": 1338
+ },
+ {
+ "epoch": 0.633096926713948,
+ "grad_norm": 2.8382487297058105,
+ "learning_rate": 4.881467128481197e-06,
+ "loss": 0.574,
+ "step": 1339
+ },
+ {
+ "epoch": 0.6335697399527187,
+ "grad_norm": 2.8520095348358154,
+ "learning_rate": 4.881277244498422e-06,
+ "loss": 0.6582,
+ "step": 1340
+ },
+ {
+ "epoch": 0.6340425531914894,
+ "grad_norm": 2.703498363494873,
+ "learning_rate": 4.881087212243956e-06,
+ "loss": 0.7224,
+ "step": 1341
+ },
+ {
+ "epoch": 0.6345153664302601,
+ "grad_norm": 3.697205066680908,
+ "learning_rate": 4.880897031729629e-06,
+ "loss": 0.6582,
+ "step": 1342
+ },
+ {
+ "epoch": 0.6349881796690308,
+ "grad_norm": 2.7625808715820312,
+ "learning_rate": 4.880706702967284e-06,
+ "loss": 0.574,
+ "step": 1343
+ },
+ {
+ "epoch": 0.6354609929078014,
+ "grad_norm": 2.949984073638916,
+ "learning_rate": 4.880516225968771e-06,
+ "loss": 0.66,
+ "step": 1344
+ },
+ {
+ "epoch": 0.6359338061465721,
+ "grad_norm": 2.548269748687744,
+ "learning_rate": 4.8803256007459525e-06,
+ "loss": 0.642,
+ "step": 1345
+ },
+ {
+ "epoch": 0.6364066193853428,
+ "grad_norm": 2.5102174282073975,
+ "learning_rate": 4.8801348273106945e-06,
+ "loss": 0.6238,
+ "step": 1346
+ },
+ {
+ "epoch": 0.6368794326241135,
+ "grad_norm": 2.9847946166992188,
+ "learning_rate": 4.8799439056748786e-06,
+ "loss": 0.5416,
+ "step": 1347
+ },
+ {
+ "epoch": 0.6373522458628842,
+ "grad_norm": 2.8711049556732178,
+ "learning_rate": 4.879752835850391e-06,
+ "loss": 0.6427,
+ "step": 1348
+ },
+ {
+ "epoch": 0.6378250591016549,
+ "grad_norm": 2.7901716232299805,
+ "learning_rate": 4.879561617849129e-06,
+ "loss": 0.6026,
+ "step": 1349
+ },
+ {
+ "epoch": 0.6382978723404256,
+ "grad_norm": 2.659778356552124,
+ "learning_rate": 4.879370251682999e-06,
+ "loss": 0.6623,
+ "step": 1350
+ },
+ {
+ "epoch": 0.6387706855791963,
+ "grad_norm": 3.224386692047119,
+ "learning_rate": 4.879178737363917e-06,
+ "loss": 0.6485,
+ "step": 1351
+ },
+ {
+ "epoch": 0.6392434988179669,
+ "grad_norm": 2.6385605335235596,
+ "learning_rate": 4.8789870749038076e-06,
+ "loss": 0.5866,
+ "step": 1352
+ },
+ {
+ "epoch": 0.6397163120567376,
+ "grad_norm": 2.807713270187378,
+ "learning_rate": 4.8787952643146045e-06,
+ "loss": 0.6537,
+ "step": 1353
+ },
+ {
+ "epoch": 0.6401891252955083,
+ "grad_norm": 2.5689280033111572,
+ "learning_rate": 4.878603305608251e-06,
+ "loss": 0.6216,
+ "step": 1354
+ },
+ {
+ "epoch": 0.640661938534279,
+ "grad_norm": 2.7347843647003174,
+ "learning_rate": 4.8784111987967e-06,
+ "loss": 0.6318,
+ "step": 1355
+ },
+ {
+ "epoch": 0.6411347517730497,
+ "grad_norm": 2.5210378170013428,
+ "learning_rate": 4.878218943891911e-06,
+ "loss": 0.5472,
+ "step": 1356
+ },
+ {
+ "epoch": 0.6416075650118204,
+ "grad_norm": 2.866785764694214,
+ "learning_rate": 4.878026540905858e-06,
+ "loss": 0.7108,
+ "step": 1357
+ },
+ {
+ "epoch": 0.642080378250591,
+ "grad_norm": 2.923314332962036,
+ "learning_rate": 4.877833989850519e-06,
+ "loss": 0.5557,
+ "step": 1358
+ },
+ {
+ "epoch": 0.6425531914893617,
+ "grad_norm": 2.925463914871216,
+ "learning_rate": 4.8776412907378845e-06,
+ "loss": 0.6382,
+ "step": 1359
+ },
+ {
+ "epoch": 0.6430260047281324,
+ "grad_norm": 2.909644365310669,
+ "learning_rate": 4.877448443579952e-06,
+ "loss": 0.5603,
+ "step": 1360
+ },
+ {
+ "epoch": 0.6434988179669031,
+ "grad_norm": 3.501148223876953,
+ "learning_rate": 4.8772554483887306e-06,
+ "loss": 0.6722,
+ "step": 1361
+ },
+ {
+ "epoch": 0.6439716312056738,
+ "grad_norm": 2.823765516281128,
+ "learning_rate": 4.877062305176235e-06,
+ "loss": 0.6408,
+ "step": 1362
+ },
+ {
+ "epoch": 0.6444444444444445,
+ "grad_norm": 2.9807584285736084,
+ "learning_rate": 4.8768690139544935e-06,
+ "loss": 0.5984,
+ "step": 1363
+ },
+ {
+ "epoch": 0.6449172576832152,
+ "grad_norm": 2.8411378860473633,
+ "learning_rate": 4.8766755747355405e-06,
+ "loss": 0.6231,
+ "step": 1364
+ },
+ {
+ "epoch": 0.6453900709219859,
+ "grad_norm": 3.158952236175537,
+ "learning_rate": 4.8764819875314215e-06,
+ "loss": 0.6441,
+ "step": 1365
+ },
+ {
+ "epoch": 0.6458628841607565,
+ "grad_norm": 2.9614369869232178,
+ "learning_rate": 4.876288252354189e-06,
+ "loss": 0.6308,
+ "step": 1366
+ },
+ {
+ "epoch": 0.6463356973995272,
+ "grad_norm": 3.073805570602417,
+ "learning_rate": 4.876094369215907e-06,
+ "loss": 0.6046,
+ "step": 1367
+ },
+ {
+ "epoch": 0.6468085106382979,
+ "grad_norm": 2.719189405441284,
+ "learning_rate": 4.875900338128648e-06,
+ "loss": 0.6082,
+ "step": 1368
+ },
+ {
+ "epoch": 0.6472813238770686,
+ "grad_norm": 2.676726818084717,
+ "learning_rate": 4.8757061591044914e-06,
+ "loss": 0.6344,
+ "step": 1369
+ },
+ {
+ "epoch": 0.6477541371158393,
+ "grad_norm": 2.955256938934326,
+ "learning_rate": 4.87551183215553e-06,
+ "loss": 0.6506,
+ "step": 1370
+ },
+ {
+ "epoch": 0.64822695035461,
+ "grad_norm": 2.5672218799591064,
+ "learning_rate": 4.875317357293864e-06,
+ "loss": 0.5284,
+ "step": 1371
+ },
+ {
+ "epoch": 0.6486997635933807,
+ "grad_norm": 2.5860238075256348,
+ "learning_rate": 4.875122734531602e-06,
+ "loss": 0.667,
+ "step": 1372
+ },
+ {
+ "epoch": 0.6491725768321513,
+ "grad_norm": 3.1037003993988037,
+ "learning_rate": 4.8749279638808605e-06,
+ "loss": 0.6902,
+ "step": 1373
+ },
+ {
+ "epoch": 0.649645390070922,
+ "grad_norm": 2.7715282440185547,
+ "learning_rate": 4.874733045353769e-06,
+ "loss": 0.6291,
+ "step": 1374
+ },
+ {
+ "epoch": 0.6501182033096927,
+ "grad_norm": 2.527071475982666,
+ "learning_rate": 4.874537978962463e-06,
+ "loss": 0.5565,
+ "step": 1375
+ },
+ {
+ "epoch": 0.6505910165484634,
+ "grad_norm": 2.722092628479004,
+ "learning_rate": 4.874342764719091e-06,
+ "loss": 0.5724,
+ "step": 1376
+ },
+ {
+ "epoch": 0.6510638297872341,
+ "grad_norm": 2.6342411041259766,
+ "learning_rate": 4.874147402635805e-06,
+ "loss": 0.6308,
+ "step": 1377
+ },
+ {
+ "epoch": 0.6515366430260048,
+ "grad_norm": 2.3850719928741455,
+ "learning_rate": 4.8739518927247695e-06,
+ "loss": 0.5692,
+ "step": 1378
+ },
+ {
+ "epoch": 0.6520094562647755,
+ "grad_norm": 2.9787259101867676,
+ "learning_rate": 4.873756234998161e-06,
+ "loss": 0.6953,
+ "step": 1379
+ },
+ {
+ "epoch": 0.6524822695035462,
+ "grad_norm": 2.634141683578491,
+ "learning_rate": 4.873560429468159e-06,
+ "loss": 0.6077,
+ "step": 1380
+ },
+ {
+ "epoch": 0.6529550827423168,
+ "grad_norm": 2.803046941757202,
+ "learning_rate": 4.873364476146958e-06,
+ "loss": 0.6657,
+ "step": 1381
+ },
+ {
+ "epoch": 0.6534278959810875,
+ "grad_norm": 2.762827157974243,
+ "learning_rate": 4.8731683750467574e-06,
+ "loss": 0.6061,
+ "step": 1382
+ },
+ {
+ "epoch": 0.6539007092198581,
+ "grad_norm": 2.6654391288757324,
+ "learning_rate": 4.872972126179768e-06,
+ "loss": 0.6387,
+ "step": 1383
+ },
+ {
+ "epoch": 0.6543735224586288,
+ "grad_norm": 2.4363625049591064,
+ "learning_rate": 4.872775729558209e-06,
+ "loss": 0.5623,
+ "step": 1384
+ },
+ {
+ "epoch": 0.6548463356973995,
+ "grad_norm": 2.528959035873413,
+ "learning_rate": 4.87257918519431e-06,
+ "loss": 0.5609,
+ "step": 1385
+ },
+ {
+ "epoch": 0.6553191489361702,
+ "grad_norm": 2.718383312225342,
+ "learning_rate": 4.872382493100309e-06,
+ "loss": 0.5575,
+ "step": 1386
+ },
+ {
+ "epoch": 0.6557919621749408,
+ "grad_norm": 2.660841226577759,
+ "learning_rate": 4.872185653288453e-06,
+ "loss": 0.6106,
+ "step": 1387
+ },
+ {
+ "epoch": 0.6562647754137115,
+ "grad_norm": 2.508753538131714,
+ "learning_rate": 4.871988665770997e-06,
+ "loss": 0.5705,
+ "step": 1388
+ },
+ {
+ "epoch": 0.6567375886524822,
+ "grad_norm": 2.5134334564208984,
+ "learning_rate": 4.871791530560208e-06,
+ "loss": 0.5592,
+ "step": 1389
+ },
+ {
+ "epoch": 0.6572104018912529,
+ "grad_norm": 2.7475597858428955,
+ "learning_rate": 4.871594247668361e-06,
+ "loss": 0.6277,
+ "step": 1390
+ },
+ {
+ "epoch": 0.6576832151300236,
+ "grad_norm": 2.793616533279419,
+ "learning_rate": 4.871396817107739e-06,
+ "loss": 0.595,
+ "step": 1391
+ },
+ {
+ "epoch": 0.6581560283687943,
+ "grad_norm": 2.8285086154937744,
+ "learning_rate": 4.871199238890635e-06,
+ "loss": 0.6094,
+ "step": 1392
+ },
+ {
+ "epoch": 0.658628841607565,
+ "grad_norm": 2.74124813079834,
+ "learning_rate": 4.871001513029352e-06,
+ "loss": 0.6296,
+ "step": 1393
+ },
+ {
+ "epoch": 0.6591016548463356,
+ "grad_norm": 2.761237621307373,
+ "learning_rate": 4.870803639536202e-06,
+ "loss": 0.5702,
+ "step": 1394
+ },
+ {
+ "epoch": 0.6595744680851063,
+ "grad_norm": 2.761038064956665,
+ "learning_rate": 4.870605618423504e-06,
+ "loss": 0.6195,
+ "step": 1395
+ },
+ {
+ "epoch": 0.660047281323877,
+ "grad_norm": 2.8812482357025146,
+ "learning_rate": 4.870407449703589e-06,
+ "loss": 0.616,
+ "step": 1396
+ },
+ {
+ "epoch": 0.6605200945626477,
+ "grad_norm": 2.9966578483581543,
+ "learning_rate": 4.870209133388797e-06,
+ "loss": 0.6547,
+ "step": 1397
+ },
+ {
+ "epoch": 0.6609929078014184,
+ "grad_norm": 2.7969017028808594,
+ "learning_rate": 4.870010669491474e-06,
+ "loss": 0.5762,
+ "step": 1398
+ },
+ {
+ "epoch": 0.6614657210401891,
+ "grad_norm": 2.557783842086792,
+ "learning_rate": 4.86981205802398e-06,
+ "loss": 0.6184,
+ "step": 1399
+ },
+ {
+ "epoch": 0.6619385342789598,
+ "grad_norm": 2.5393927097320557,
+ "learning_rate": 4.86961329899868e-06,
+ "loss": 0.5953,
+ "step": 1400
+ },
+ {
+ "epoch": 0.6624113475177305,
+ "grad_norm": 2.7745981216430664,
+ "learning_rate": 4.86941439242795e-06,
+ "loss": 0.5967,
+ "step": 1401
+ },
+ {
+ "epoch": 0.6628841607565011,
+ "grad_norm": 2.650381326675415,
+ "learning_rate": 4.869215338324176e-06,
+ "loss": 0.5667,
+ "step": 1402
+ },
+ {
+ "epoch": 0.6633569739952718,
+ "grad_norm": 2.583169937133789,
+ "learning_rate": 4.869016136699751e-06,
+ "loss": 0.549,
+ "step": 1403
+ },
+ {
+ "epoch": 0.6638297872340425,
+ "grad_norm": 2.984978437423706,
+ "learning_rate": 4.868816787567079e-06,
+ "loss": 0.5931,
+ "step": 1404
+ },
+ {
+ "epoch": 0.6643026004728132,
+ "grad_norm": 3.1947181224823,
+ "learning_rate": 4.868617290938573e-06,
+ "loss": 0.5473,
+ "step": 1405
+ },
+ {
+ "epoch": 0.6647754137115839,
+ "grad_norm": 2.562927007675171,
+ "learning_rate": 4.868417646826654e-06,
+ "loss": 0.6878,
+ "step": 1406
+ },
+ {
+ "epoch": 0.6652482269503546,
+ "grad_norm": 2.8741261959075928,
+ "learning_rate": 4.868217855243754e-06,
+ "loss": 0.6312,
+ "step": 1407
+ },
+ {
+ "epoch": 0.6657210401891253,
+ "grad_norm": 2.9834797382354736,
+ "learning_rate": 4.868017916202312e-06,
+ "loss": 0.5624,
+ "step": 1408
+ },
+ {
+ "epoch": 0.6661938534278959,
+ "grad_norm": 2.6935982704162598,
+ "learning_rate": 4.8678178297147785e-06,
+ "loss": 0.5857,
+ "step": 1409
+ },
+ {
+ "epoch": 0.6666666666666666,
+ "grad_norm": 2.8200576305389404,
+ "learning_rate": 4.86761759579361e-06,
+ "loss": 0.6153,
+ "step": 1410
+ },
+ {
+ "epoch": 0.6671394799054373,
+ "grad_norm": 2.831425189971924,
+ "learning_rate": 4.867417214451276e-06,
+ "loss": 0.6495,
+ "step": 1411
+ },
+ {
+ "epoch": 0.667612293144208,
+ "grad_norm": 2.733565092086792,
+ "learning_rate": 4.867216685700253e-06,
+ "loss": 0.6036,
+ "step": 1412
+ },
+ {
+ "epoch": 0.6680851063829787,
+ "grad_norm": 3.0609400272369385,
+ "learning_rate": 4.867016009553027e-06,
+ "loss": 0.6773,
+ "step": 1413
+ },
+ {
+ "epoch": 0.6685579196217494,
+ "grad_norm": 2.665452241897583,
+ "learning_rate": 4.866815186022093e-06,
+ "loss": 0.6256,
+ "step": 1414
+ },
+ {
+ "epoch": 0.6690307328605201,
+ "grad_norm": 2.9480721950531006,
+ "learning_rate": 4.866614215119956e-06,
+ "loss": 0.535,
+ "step": 1415
+ },
+ {
+ "epoch": 0.6695035460992907,
+ "grad_norm": 2.5514180660247803,
+ "learning_rate": 4.866413096859128e-06,
+ "loss": 0.6588,
+ "step": 1416
+ },
+ {
+ "epoch": 0.6699763593380614,
+ "grad_norm": 3.3442373275756836,
+ "learning_rate": 4.866211831252134e-06,
+ "loss": 0.5754,
+ "step": 1417
+ },
+ {
+ "epoch": 0.6704491725768321,
+ "grad_norm": 2.521467685699463,
+ "learning_rate": 4.866010418311504e-06,
+ "loss": 0.5546,
+ "step": 1418
+ },
+ {
+ "epoch": 0.6709219858156028,
+ "grad_norm": 2.930706262588501,
+ "learning_rate": 4.865808858049781e-06,
+ "loss": 0.589,
+ "step": 1419
+ },
+ {
+ "epoch": 0.6713947990543735,
+ "grad_norm": 2.6298375129699707,
+ "learning_rate": 4.865607150479513e-06,
+ "loss": 0.5915,
+ "step": 1420
+ },
+ {
+ "epoch": 0.6718676122931442,
+ "grad_norm": 2.9554293155670166,
+ "learning_rate": 4.8654052956132615e-06,
+ "loss": 0.6654,
+ "step": 1421
+ },
+ {
+ "epoch": 0.6723404255319149,
+ "grad_norm": 3.2706902027130127,
+ "learning_rate": 4.865203293463593e-06,
+ "loss": 0.7115,
+ "step": 1422
+ },
+ {
+ "epoch": 0.6728132387706856,
+ "grad_norm": 3.041539430618286,
+ "learning_rate": 4.865001144043088e-06,
+ "loss": 0.5818,
+ "step": 1423
+ },
+ {
+ "epoch": 0.6732860520094562,
+ "grad_norm": 3.1314544677734375,
+ "learning_rate": 4.864798847364331e-06,
+ "loss": 0.5822,
+ "step": 1424
+ },
+ {
+ "epoch": 0.6737588652482269,
+ "grad_norm": 2.5301461219787598,
+ "learning_rate": 4.86459640343992e-06,
+ "loss": 0.5525,
+ "step": 1425
+ },
+ {
+ "epoch": 0.6742316784869976,
+ "grad_norm": 2.809295892715454,
+ "learning_rate": 4.864393812282458e-06,
+ "loss": 0.6768,
+ "step": 1426
+ },
+ {
+ "epoch": 0.6747044917257683,
+ "grad_norm": 2.794664144515991,
+ "learning_rate": 4.864191073904562e-06,
+ "loss": 0.5793,
+ "step": 1427
+ },
+ {
+ "epoch": 0.675177304964539,
+ "grad_norm": 2.7771105766296387,
+ "learning_rate": 4.863988188318854e-06,
+ "loss": 0.6453,
+ "step": 1428
+ },
+ {
+ "epoch": 0.6756501182033097,
+ "grad_norm": 2.6431946754455566,
+ "learning_rate": 4.863785155537967e-06,
+ "loss": 0.5877,
+ "step": 1429
+ },
+ {
+ "epoch": 0.6761229314420804,
+ "grad_norm": 2.951353073120117,
+ "learning_rate": 4.863581975574544e-06,
+ "loss": 0.6793,
+ "step": 1430
+ },
+ {
+ "epoch": 0.676595744680851,
+ "grad_norm": 3.1336071491241455,
+ "learning_rate": 4.863378648441235e-06,
+ "loss": 0.6695,
+ "step": 1431
+ },
+ {
+ "epoch": 0.6770685579196217,
+ "grad_norm": 2.735982656478882,
+ "learning_rate": 4.8631751741507e-06,
+ "loss": 0.5239,
+ "step": 1432
+ },
+ {
+ "epoch": 0.6775413711583924,
+ "grad_norm": 2.7085206508636475,
+ "learning_rate": 4.862971552715611e-06,
+ "loss": 0.6837,
+ "step": 1433
+ },
+ {
+ "epoch": 0.6780141843971631,
+ "grad_norm": 3.136528730392456,
+ "learning_rate": 4.8627677841486436e-06,
+ "loss": 0.683,
+ "step": 1434
+ },
+ {
+ "epoch": 0.6784869976359338,
+ "grad_norm": 2.7879369258880615,
+ "learning_rate": 4.862563868462486e-06,
+ "loss": 0.608,
+ "step": 1435
+ },
+ {
+ "epoch": 0.6789598108747045,
+ "grad_norm": 2.7937729358673096,
+ "learning_rate": 4.862359805669837e-06,
+ "loss": 0.6131,
+ "step": 1436
+ },
+ {
+ "epoch": 0.6794326241134752,
+ "grad_norm": 2.5988364219665527,
+ "learning_rate": 4.862155595783401e-06,
+ "loss": 0.6303,
+ "step": 1437
+ },
+ {
+ "epoch": 0.6799054373522458,
+ "grad_norm": 3.251070499420166,
+ "learning_rate": 4.861951238815894e-06,
+ "loss": 0.7246,
+ "step": 1438
+ },
+ {
+ "epoch": 0.6803782505910165,
+ "grad_norm": 2.646759271621704,
+ "learning_rate": 4.861746734780039e-06,
+ "loss": 0.6313,
+ "step": 1439
+ },
+ {
+ "epoch": 0.6808510638297872,
+ "grad_norm": 2.773866891860962,
+ "learning_rate": 4.861542083688573e-06,
+ "loss": 0.6463,
+ "step": 1440
+ },
+ {
+ "epoch": 0.6813238770685579,
+ "grad_norm": 2.759965658187866,
+ "learning_rate": 4.861337285554235e-06,
+ "loss": 0.5428,
+ "step": 1441
+ },
+ {
+ "epoch": 0.6817966903073286,
+ "grad_norm": 3.3250818252563477,
+ "learning_rate": 4.861132340389779e-06,
+ "loss": 0.6522,
+ "step": 1442
+ },
+ {
+ "epoch": 0.6822695035460993,
+ "grad_norm": 2.661797523498535,
+ "learning_rate": 4.860927248207965e-06,
+ "loss": 0.5871,
+ "step": 1443
+ },
+ {
+ "epoch": 0.68274231678487,
+ "grad_norm": 2.706289052963257,
+ "learning_rate": 4.860722009021563e-06,
+ "loss": 0.6651,
+ "step": 1444
+ },
+ {
+ "epoch": 0.6832151300236406,
+ "grad_norm": 2.8459298610687256,
+ "learning_rate": 4.860516622843354e-06,
+ "loss": 0.5827,
+ "step": 1445
+ },
+ {
+ "epoch": 0.6836879432624113,
+ "grad_norm": 3.1041831970214844,
+ "learning_rate": 4.860311089686125e-06,
+ "loss": 0.6727,
+ "step": 1446
+ },
+ {
+ "epoch": 0.684160756501182,
+ "grad_norm": 2.9382801055908203,
+ "learning_rate": 4.8601054095626746e-06,
+ "loss": 0.6002,
+ "step": 1447
+ },
+ {
+ "epoch": 0.6846335697399527,
+ "grad_norm": 2.782475471496582,
+ "learning_rate": 4.859899582485808e-06,
+ "loss": 0.6951,
+ "step": 1448
+ },
+ {
+ "epoch": 0.6851063829787234,
+ "grad_norm": 3.313894510269165,
+ "learning_rate": 4.859693608468343e-06,
+ "loss": 0.6363,
+ "step": 1449
+ },
+ {
+ "epoch": 0.6855791962174941,
+ "grad_norm": 3.1639695167541504,
+ "learning_rate": 4.8594874875231045e-06,
+ "loss": 0.7002,
+ "step": 1450
+ },
+ {
+ "epoch": 0.6860520094562648,
+ "grad_norm": 2.6762218475341797,
+ "learning_rate": 4.859281219662926e-06,
+ "loss": 0.6246,
+ "step": 1451
+ },
+ {
+ "epoch": 0.6865248226950355,
+ "grad_norm": 2.8368663787841797,
+ "learning_rate": 4.85907480490065e-06,
+ "loss": 0.5906,
+ "step": 1452
+ },
+ {
+ "epoch": 0.6869976359338061,
+ "grad_norm": 2.887373208999634,
+ "learning_rate": 4.858868243249131e-06,
+ "loss": 0.5931,
+ "step": 1453
+ },
+ {
+ "epoch": 0.6874704491725768,
+ "grad_norm": 2.8115322589874268,
+ "learning_rate": 4.858661534721229e-06,
+ "loss": 0.6337,
+ "step": 1454
+ },
+ {
+ "epoch": 0.6879432624113475,
+ "grad_norm": 2.8470499515533447,
+ "learning_rate": 4.8584546793298174e-06,
+ "loss": 0.632,
+ "step": 1455
+ },
+ {
+ "epoch": 0.6884160756501182,
+ "grad_norm": 2.8229613304138184,
+ "learning_rate": 4.8582476770877725e-06,
+ "loss": 0.6494,
+ "step": 1456
+ },
+ {
+ "epoch": 0.6888888888888889,
+ "grad_norm": 2.4235479831695557,
+ "learning_rate": 4.858040528007987e-06,
+ "loss": 0.5709,
+ "step": 1457
+ },
+ {
+ "epoch": 0.6893617021276596,
+ "grad_norm": 2.9348199367523193,
+ "learning_rate": 4.857833232103356e-06,
+ "loss": 0.5404,
+ "step": 1458
+ },
+ {
+ "epoch": 0.6898345153664303,
+ "grad_norm": 2.8274219036102295,
+ "learning_rate": 4.857625789386789e-06,
+ "loss": 0.701,
+ "step": 1459
+ },
+ {
+ "epoch": 0.6903073286052009,
+ "grad_norm": 3.136929988861084,
+ "learning_rate": 4.857418199871203e-06,
+ "loss": 0.6971,
+ "step": 1460
+ },
+ {
+ "epoch": 0.6907801418439716,
+ "grad_norm": 2.8987185955047607,
+ "learning_rate": 4.8572104635695214e-06,
+ "loss": 0.6613,
+ "step": 1461
+ },
+ {
+ "epoch": 0.6912529550827423,
+ "grad_norm": 2.5073442459106445,
+ "learning_rate": 4.857002580494681e-06,
+ "loss": 0.6032,
+ "step": 1462
+ },
+ {
+ "epoch": 0.691725768321513,
+ "grad_norm": 2.7019522190093994,
+ "learning_rate": 4.856794550659625e-06,
+ "loss": 0.567,
+ "step": 1463
+ },
+ {
+ "epoch": 0.6921985815602837,
+ "grad_norm": 2.4795594215393066,
+ "learning_rate": 4.8565863740773054e-06,
+ "loss": 0.5777,
+ "step": 1464
+ },
+ {
+ "epoch": 0.6926713947990544,
+ "grad_norm": 3.032506227493286,
+ "learning_rate": 4.856378050760687e-06,
+ "loss": 0.607,
+ "step": 1465
+ },
+ {
+ "epoch": 0.6931442080378251,
+ "grad_norm": 3.052091121673584,
+ "learning_rate": 4.85616958072274e-06,
+ "loss": 0.591,
+ "step": 1466
+ },
+ {
+ "epoch": 0.6936170212765957,
+ "grad_norm": 2.704831838607788,
+ "learning_rate": 4.855960963976443e-06,
+ "loss": 0.6528,
+ "step": 1467
+ },
+ {
+ "epoch": 0.6940898345153664,
+ "grad_norm": 2.680995225906372,
+ "learning_rate": 4.855752200534788e-06,
+ "loss": 0.6294,
+ "step": 1468
+ },
+ {
+ "epoch": 0.6945626477541371,
+ "grad_norm": 2.3948659896850586,
+ "learning_rate": 4.855543290410774e-06,
+ "loss": 0.6091,
+ "step": 1469
+ },
+ {
+ "epoch": 0.6950354609929078,
+ "grad_norm": 2.6407411098480225,
+ "learning_rate": 4.855334233617407e-06,
+ "loss": 0.5572,
+ "step": 1470
+ },
+ {
+ "epoch": 0.6955082742316785,
+ "grad_norm": 2.5526835918426514,
+ "learning_rate": 4.8551250301677064e-06,
+ "loss": 0.5432,
+ "step": 1471
+ },
+ {
+ "epoch": 0.6959810874704492,
+ "grad_norm": 3.1237430572509766,
+ "learning_rate": 4.8549156800746965e-06,
+ "loss": 0.5944,
+ "step": 1472
+ },
+ {
+ "epoch": 0.6964539007092199,
+ "grad_norm": 2.8112540245056152,
+ "learning_rate": 4.854706183351412e-06,
+ "loss": 0.604,
+ "step": 1473
+ },
+ {
+ "epoch": 0.6969267139479906,
+ "grad_norm": 2.664644479751587,
+ "learning_rate": 4.8544965400109e-06,
+ "loss": 0.5647,
+ "step": 1474
+ },
+ {
+ "epoch": 0.6973995271867612,
+ "grad_norm": 3.26310133934021,
+ "learning_rate": 4.854286750066212e-06,
+ "loss": 0.6999,
+ "step": 1475
+ },
+ {
+ "epoch": 0.6978723404255319,
+ "grad_norm": 2.9717442989349365,
+ "learning_rate": 4.8540768135304115e-06,
+ "loss": 0.6655,
+ "step": 1476
+ },
+ {
+ "epoch": 0.6983451536643026,
+ "grad_norm": 2.5302982330322266,
+ "learning_rate": 4.85386673041657e-06,
+ "loss": 0.6384,
+ "step": 1477
+ },
+ {
+ "epoch": 0.6988179669030733,
+ "grad_norm": 2.864877700805664,
+ "learning_rate": 4.853656500737769e-06,
+ "loss": 0.6834,
+ "step": 1478
+ },
+ {
+ "epoch": 0.699290780141844,
+ "grad_norm": 2.5522031784057617,
+ "learning_rate": 4.853446124507098e-06,
+ "loss": 0.5929,
+ "step": 1479
+ },
+ {
+ "epoch": 0.6997635933806147,
+ "grad_norm": 3.096477746963501,
+ "learning_rate": 4.853235601737656e-06,
+ "loss": 0.5737,
+ "step": 1480
+ },
+ {
+ "epoch": 0.7002364066193854,
+ "grad_norm": 2.884779214859009,
+ "learning_rate": 4.853024932442552e-06,
+ "loss": 0.6362,
+ "step": 1481
+ },
+ {
+ "epoch": 0.700709219858156,
+ "grad_norm": 3.368558406829834,
+ "learning_rate": 4.852814116634903e-06,
+ "loss": 0.6721,
+ "step": 1482
+ },
+ {
+ "epoch": 0.7011820330969267,
+ "grad_norm": 2.742414951324463,
+ "learning_rate": 4.852603154327837e-06,
+ "loss": 0.6212,
+ "step": 1483
+ },
+ {
+ "epoch": 0.7016548463356974,
+ "grad_norm": 2.53454852104187,
+ "learning_rate": 4.8523920455344864e-06,
+ "loss": 0.6675,
+ "step": 1484
+ },
+ {
+ "epoch": 0.7021276595744681,
+ "grad_norm": 2.9354238510131836,
+ "learning_rate": 4.852180790267999e-06,
+ "loss": 0.6692,
+ "step": 1485
+ },
+ {
+ "epoch": 0.7026004728132388,
+ "grad_norm": 2.585070848464966,
+ "learning_rate": 4.8519693885415274e-06,
+ "loss": 0.6215,
+ "step": 1486
+ },
+ {
+ "epoch": 0.7030732860520095,
+ "grad_norm": 2.9047999382019043,
+ "learning_rate": 4.851757840368235e-06,
+ "loss": 0.6231,
+ "step": 1487
+ },
+ {
+ "epoch": 0.7035460992907802,
+ "grad_norm": 3.0930933952331543,
+ "learning_rate": 4.851546145761295e-06,
+ "loss": 0.7267,
+ "step": 1488
+ },
+ {
+ "epoch": 0.7040189125295508,
+ "grad_norm": 3.0224719047546387,
+ "learning_rate": 4.8513343047338875e-06,
+ "loss": 0.6293,
+ "step": 1489
+ },
+ {
+ "epoch": 0.7044917257683215,
+ "grad_norm": 2.5758471488952637,
+ "learning_rate": 4.851122317299203e-06,
+ "loss": 0.5855,
+ "step": 1490
+ },
+ {
+ "epoch": 0.7049645390070922,
+ "grad_norm": 2.579272508621216,
+ "learning_rate": 4.850910183470441e-06,
+ "loss": 0.582,
+ "step": 1491
+ },
+ {
+ "epoch": 0.7054373522458629,
+ "grad_norm": 2.8148300647735596,
+ "learning_rate": 4.85069790326081e-06,
+ "loss": 0.6396,
+ "step": 1492
+ },
+ {
+ "epoch": 0.7059101654846336,
+ "grad_norm": 2.6380527019500732,
+ "learning_rate": 4.850485476683528e-06,
+ "loss": 0.6114,
+ "step": 1493
+ },
+ {
+ "epoch": 0.7063829787234043,
+ "grad_norm": 2.7736263275146484,
+ "learning_rate": 4.850272903751823e-06,
+ "loss": 0.6683,
+ "step": 1494
+ },
+ {
+ "epoch": 0.706855791962175,
+ "grad_norm": 3.1958179473876953,
+ "learning_rate": 4.8500601844789285e-06,
+ "loss": 0.6265,
+ "step": 1495
+ },
+ {
+ "epoch": 0.7073286052009456,
+ "grad_norm": 3.783212423324585,
+ "learning_rate": 4.8498473188780916e-06,
+ "loss": 0.6078,
+ "step": 1496
+ },
+ {
+ "epoch": 0.7078014184397163,
+ "grad_norm": 2.6656646728515625,
+ "learning_rate": 4.849634306962566e-06,
+ "loss": 0.5756,
+ "step": 1497
+ },
+ {
+ "epoch": 0.708274231678487,
+ "grad_norm": 2.757141590118408,
+ "learning_rate": 4.849421148745615e-06,
+ "loss": 0.5596,
+ "step": 1498
+ },
+ {
+ "epoch": 0.7087470449172577,
+ "grad_norm": 3.0391886234283447,
+ "learning_rate": 4.849207844240511e-06,
+ "loss": 0.5293,
+ "step": 1499
+ },
+ {
+ "epoch": 0.7092198581560284,
+ "grad_norm": 2.981912851333618,
+ "learning_rate": 4.848994393460535e-06,
+ "loss": 0.598,
+ "step": 1500
+ },
+ {
+ "epoch": 0.7096926713947991,
+ "grad_norm": 2.5470798015594482,
+ "learning_rate": 4.848780796418978e-06,
+ "loss": 0.6266,
+ "step": 1501
+ },
+ {
+ "epoch": 0.7101654846335698,
+ "grad_norm": 2.8394415378570557,
+ "learning_rate": 4.8485670531291415e-06,
+ "loss": 0.6844,
+ "step": 1502
+ },
+ {
+ "epoch": 0.7106382978723405,
+ "grad_norm": 3.2023508548736572,
+ "learning_rate": 4.848353163604331e-06,
+ "loss": 0.6134,
+ "step": 1503
+ },
+ {
+ "epoch": 0.7111111111111111,
+ "grad_norm": 2.98245906829834,
+ "learning_rate": 4.848139127857867e-06,
+ "loss": 0.7084,
+ "step": 1504
+ },
+ {
+ "epoch": 0.7115839243498818,
+ "grad_norm": 2.5917441844940186,
+ "learning_rate": 4.847924945903076e-06,
+ "loss": 0.5676,
+ "step": 1505
+ },
+ {
+ "epoch": 0.7120567375886525,
+ "grad_norm": 2.8736681938171387,
+ "learning_rate": 4.847710617753294e-06,
+ "loss": 0.6304,
+ "step": 1506
+ },
+ {
+ "epoch": 0.7125295508274232,
+ "grad_norm": 2.7832682132720947,
+ "learning_rate": 4.847496143421866e-06,
+ "loss": 0.5705,
+ "step": 1507
+ },
+ {
+ "epoch": 0.7130023640661939,
+ "grad_norm": 2.480560779571533,
+ "learning_rate": 4.847281522922147e-06,
+ "loss": 0.5595,
+ "step": 1508
+ },
+ {
+ "epoch": 0.7134751773049646,
+ "grad_norm": 2.357675313949585,
+ "learning_rate": 4.847066756267499e-06,
+ "loss": 0.5065,
+ "step": 1509
+ },
+ {
+ "epoch": 0.7139479905437353,
+ "grad_norm": 2.632669448852539,
+ "learning_rate": 4.846851843471296e-06,
+ "loss": 0.6949,
+ "step": 1510
+ },
+ {
+ "epoch": 0.7144208037825059,
+ "grad_norm": 2.7691073417663574,
+ "learning_rate": 4.84663678454692e-06,
+ "loss": 0.6638,
+ "step": 1511
+ },
+ {
+ "epoch": 0.7148936170212766,
+ "grad_norm": 2.5647685527801514,
+ "learning_rate": 4.846421579507761e-06,
+ "loss": 0.6098,
+ "step": 1512
+ },
+ {
+ "epoch": 0.7153664302600473,
+ "grad_norm": 2.476701021194458,
+ "learning_rate": 4.846206228367218e-06,
+ "loss": 0.592,
+ "step": 1513
+ },
+ {
+ "epoch": 0.715839243498818,
+ "grad_norm": 2.805727958679199,
+ "learning_rate": 4.845990731138702e-06,
+ "loss": 0.5466,
+ "step": 1514
+ },
+ {
+ "epoch": 0.7163120567375887,
+ "grad_norm": 2.551392078399658,
+ "learning_rate": 4.84577508783563e-06,
+ "loss": 0.6039,
+ "step": 1515
+ },
+ {
+ "epoch": 0.7167848699763594,
+ "grad_norm": 2.6861350536346436,
+ "learning_rate": 4.845559298471429e-06,
+ "loss": 0.6427,
+ "step": 1516
+ },
+ {
+ "epoch": 0.7172576832151301,
+ "grad_norm": 3.1908371448516846,
+ "learning_rate": 4.845343363059535e-06,
+ "loss": 0.5447,
+ "step": 1517
+ },
+ {
+ "epoch": 0.7177304964539007,
+ "grad_norm": 2.9021761417388916,
+ "learning_rate": 4.845127281613394e-06,
+ "loss": 0.5836,
+ "step": 1518
+ },
+ {
+ "epoch": 0.7182033096926714,
+ "grad_norm": 2.476670742034912,
+ "learning_rate": 4.844911054146461e-06,
+ "loss": 0.5863,
+ "step": 1519
+ },
+ {
+ "epoch": 0.7186761229314421,
+ "grad_norm": 2.662935495376587,
+ "learning_rate": 4.844694680672198e-06,
+ "loss": 0.5678,
+ "step": 1520
+ },
+ {
+ "epoch": 0.7191489361702128,
+ "grad_norm": 2.677896738052368,
+ "learning_rate": 4.844478161204079e-06,
+ "loss": 0.6195,
+ "step": 1521
+ },
+ {
+ "epoch": 0.7196217494089835,
+ "grad_norm": 2.781921863555908,
+ "learning_rate": 4.844261495755585e-06,
+ "loss": 0.643,
+ "step": 1522
+ },
+ {
+ "epoch": 0.7200945626477542,
+ "grad_norm": 3.0157392024993896,
+ "learning_rate": 4.844044684340206e-06,
+ "loss": 0.7559,
+ "step": 1523
+ },
+ {
+ "epoch": 0.7205673758865249,
+ "grad_norm": 2.8109354972839355,
+ "learning_rate": 4.843827726971444e-06,
+ "loss": 0.6264,
+ "step": 1524
+ },
+ {
+ "epoch": 0.7210401891252955,
+ "grad_norm": 3.0953569412231445,
+ "learning_rate": 4.8436106236628064e-06,
+ "loss": 0.6429,
+ "step": 1525
+ },
+ {
+ "epoch": 0.7215130023640662,
+ "grad_norm": 2.6850643157958984,
+ "learning_rate": 4.843393374427812e-06,
+ "loss": 0.6598,
+ "step": 1526
+ },
+ {
+ "epoch": 0.7219858156028369,
+ "grad_norm": 3.043480634689331,
+ "learning_rate": 4.8431759792799874e-06,
+ "loss": 0.6331,
+ "step": 1527
+ },
+ {
+ "epoch": 0.7224586288416076,
+ "grad_norm": 2.723870038986206,
+ "learning_rate": 4.842958438232868e-06,
+ "loss": 0.6259,
+ "step": 1528
+ },
+ {
+ "epoch": 0.7229314420803783,
+ "grad_norm": 2.822492837905884,
+ "learning_rate": 4.842740751300002e-06,
+ "loss": 0.6554,
+ "step": 1529
+ },
+ {
+ "epoch": 0.723404255319149,
+ "grad_norm": 2.7866315841674805,
+ "learning_rate": 4.842522918494941e-06,
+ "loss": 0.6991,
+ "step": 1530
+ },
+ {
+ "epoch": 0.7238770685579197,
+ "grad_norm": 2.8881826400756836,
+ "learning_rate": 4.84230493983125e-06,
+ "loss": 0.5876,
+ "step": 1531
+ },
+ {
+ "epoch": 0.7243498817966904,
+ "grad_norm": 2.7456939220428467,
+ "learning_rate": 4.8420868153225e-06,
+ "loss": 0.6188,
+ "step": 1532
+ },
+ {
+ "epoch": 0.724822695035461,
+ "grad_norm": 3.0257532596588135,
+ "learning_rate": 4.841868544982274e-06,
+ "loss": 0.63,
+ "step": 1533
+ },
+ {
+ "epoch": 0.7252955082742317,
+ "grad_norm": 3.1581954956054688,
+ "learning_rate": 4.841650128824164e-06,
+ "loss": 0.7214,
+ "step": 1534
+ },
+ {
+ "epoch": 0.7257683215130024,
+ "grad_norm": 2.9174306392669678,
+ "learning_rate": 4.841431566861767e-06,
+ "loss": 0.704,
+ "step": 1535
+ },
+ {
+ "epoch": 0.7262411347517731,
+ "grad_norm": 2.5019054412841797,
+ "learning_rate": 4.8412128591086935e-06,
+ "loss": 0.6298,
+ "step": 1536
+ },
+ {
+ "epoch": 0.7267139479905438,
+ "grad_norm": 2.724285125732422,
+ "learning_rate": 4.840994005578562e-06,
+ "loss": 0.6289,
+ "step": 1537
+ },
+ {
+ "epoch": 0.7271867612293145,
+ "grad_norm": 2.5882341861724854,
+ "learning_rate": 4.840775006284998e-06,
+ "loss": 0.6355,
+ "step": 1538
+ },
+ {
+ "epoch": 0.7276595744680852,
+ "grad_norm": 3.1281991004943848,
+ "learning_rate": 4.840555861241638e-06,
+ "loss": 0.5551,
+ "step": 1539
+ },
+ {
+ "epoch": 0.7281323877068558,
+ "grad_norm": 2.6064817905426025,
+ "learning_rate": 4.840336570462127e-06,
+ "loss": 0.5543,
+ "step": 1540
+ },
+ {
+ "epoch": 0.7286052009456265,
+ "grad_norm": 2.67112398147583,
+ "learning_rate": 4.840117133960122e-06,
+ "loss": 0.6044,
+ "step": 1541
+ },
+ {
+ "epoch": 0.7290780141843972,
+ "grad_norm": 2.838022232055664,
+ "learning_rate": 4.839897551749282e-06,
+ "loss": 0.6814,
+ "step": 1542
+ },
+ {
+ "epoch": 0.7295508274231679,
+ "grad_norm": 2.8897151947021484,
+ "learning_rate": 4.839677823843283e-06,
+ "loss": 0.593,
+ "step": 1543
+ },
+ {
+ "epoch": 0.7300236406619386,
+ "grad_norm": 2.9238014221191406,
+ "learning_rate": 4.839457950255805e-06,
+ "loss": 0.5544,
+ "step": 1544
+ },
+ {
+ "epoch": 0.7304964539007093,
+ "grad_norm": 3.016876459121704,
+ "learning_rate": 4.839237931000538e-06,
+ "loss": 0.6099,
+ "step": 1545
+ },
+ {
+ "epoch": 0.7309692671394799,
+ "grad_norm": 2.9415392875671387,
+ "learning_rate": 4.839017766091182e-06,
+ "loss": 0.6413,
+ "step": 1546
+ },
+ {
+ "epoch": 0.7314420803782505,
+ "grad_norm": 2.658067226409912,
+ "learning_rate": 4.838797455541446e-06,
+ "loss": 0.6534,
+ "step": 1547
+ },
+ {
+ "epoch": 0.7319148936170212,
+ "grad_norm": 2.460358142852783,
+ "learning_rate": 4.838576999365049e-06,
+ "loss": 0.5307,
+ "step": 1548
+ },
+ {
+ "epoch": 0.7323877068557919,
+ "grad_norm": 2.5818674564361572,
+ "learning_rate": 4.838356397575716e-06,
+ "loss": 0.6265,
+ "step": 1549
+ },
+ {
+ "epoch": 0.7328605200945626,
+ "grad_norm": 3.009197473526001,
+ "learning_rate": 4.838135650187183e-06,
+ "loss": 0.6957,
+ "step": 1550
+ },
+ {
+ "epoch": 0.7333333333333333,
+ "grad_norm": 2.738543748855591,
+ "learning_rate": 4.837914757213196e-06,
+ "loss": 0.646,
+ "step": 1551
+ },
+ {
+ "epoch": 0.733806146572104,
+ "grad_norm": 2.8208494186401367,
+ "learning_rate": 4.837693718667508e-06,
+ "loss": 0.5936,
+ "step": 1552
+ },
+ {
+ "epoch": 0.7342789598108747,
+ "grad_norm": 3.1574649810791016,
+ "learning_rate": 4.837472534563883e-06,
+ "loss": 0.6455,
+ "step": 1553
+ },
+ {
+ "epoch": 0.7347517730496453,
+ "grad_norm": 2.6737420558929443,
+ "learning_rate": 4.837251204916093e-06,
+ "loss": 0.5921,
+ "step": 1554
+ },
+ {
+ "epoch": 0.735224586288416,
+ "grad_norm": 2.424983024597168,
+ "learning_rate": 4.837029729737918e-06,
+ "loss": 0.6346,
+ "step": 1555
+ },
+ {
+ "epoch": 0.7356973995271867,
+ "grad_norm": 2.5163493156433105,
+ "learning_rate": 4.836808109043151e-06,
+ "loss": 0.6061,
+ "step": 1556
+ },
+ {
+ "epoch": 0.7361702127659574,
+ "grad_norm": 2.8377044200897217,
+ "learning_rate": 4.836586342845588e-06,
+ "loss": 0.611,
+ "step": 1557
+ },
+ {
+ "epoch": 0.7366430260047281,
+ "grad_norm": 2.5929181575775146,
+ "learning_rate": 4.83636443115904e-06,
+ "loss": 0.5496,
+ "step": 1558
+ },
+ {
+ "epoch": 0.7371158392434988,
+ "grad_norm": 2.5017223358154297,
+ "learning_rate": 4.836142373997323e-06,
+ "loss": 0.6235,
+ "step": 1559
+ },
+ {
+ "epoch": 0.7375886524822695,
+ "grad_norm": 2.822500228881836,
+ "learning_rate": 4.835920171374265e-06,
+ "loss": 0.6147,
+ "step": 1560
+ },
+ {
+ "epoch": 0.7380614657210401,
+ "grad_norm": 2.7234230041503906,
+ "learning_rate": 4.8356978233037e-06,
+ "loss": 0.6228,
+ "step": 1561
+ },
+ {
+ "epoch": 0.7385342789598108,
+ "grad_norm": 2.9565515518188477,
+ "learning_rate": 4.835475329799472e-06,
+ "loss": 0.5728,
+ "step": 1562
+ },
+ {
+ "epoch": 0.7390070921985815,
+ "grad_norm": 2.4356038570404053,
+ "learning_rate": 4.835252690875438e-06,
+ "loss": 0.6723,
+ "step": 1563
+ },
+ {
+ "epoch": 0.7394799054373522,
+ "grad_norm": 2.765913248062134,
+ "learning_rate": 4.835029906545458e-06,
+ "loss": 0.5805,
+ "step": 1564
+ },
+ {
+ "epoch": 0.7399527186761229,
+ "grad_norm": 2.4481914043426514,
+ "learning_rate": 4.834806976823405e-06,
+ "loss": 0.599,
+ "step": 1565
+ },
+ {
+ "epoch": 0.7404255319148936,
+ "grad_norm": 2.620779514312744,
+ "learning_rate": 4.834583901723158e-06,
+ "loss": 0.63,
+ "step": 1566
+ },
+ {
+ "epoch": 0.7408983451536643,
+ "grad_norm": 2.654426097869873,
+ "learning_rate": 4.83436068125861e-06,
+ "loss": 0.6544,
+ "step": 1567
+ },
+ {
+ "epoch": 0.741371158392435,
+ "grad_norm": 2.589623212814331,
+ "learning_rate": 4.834137315443656e-06,
+ "loss": 0.5596,
+ "step": 1568
+ },
+ {
+ "epoch": 0.7418439716312056,
+ "grad_norm": 2.572883129119873,
+ "learning_rate": 4.833913804292209e-06,
+ "loss": 0.5974,
+ "step": 1569
+ },
+ {
+ "epoch": 0.7423167848699763,
+ "grad_norm": 2.8744914531707764,
+ "learning_rate": 4.833690147818181e-06,
+ "loss": 0.5364,
+ "step": 1570
+ },
+ {
+ "epoch": 0.742789598108747,
+ "grad_norm": 2.9800851345062256,
+ "learning_rate": 4.833466346035502e-06,
+ "loss": 0.6287,
+ "step": 1571
+ },
+ {
+ "epoch": 0.7432624113475177,
+ "grad_norm": 2.627784490585327,
+ "learning_rate": 4.833242398958105e-06,
+ "loss": 0.621,
+ "step": 1572
+ },
+ {
+ "epoch": 0.7437352245862884,
+ "grad_norm": 2.5187721252441406,
+ "learning_rate": 4.833018306599933e-06,
+ "loss": 0.5901,
+ "step": 1573
+ },
+ {
+ "epoch": 0.7442080378250591,
+ "grad_norm": 2.4843688011169434,
+ "learning_rate": 4.832794068974944e-06,
+ "loss": 0.6336,
+ "step": 1574
+ },
+ {
+ "epoch": 0.7446808510638298,
+ "grad_norm": 2.774911880493164,
+ "learning_rate": 4.832569686097096e-06,
+ "loss": 0.6091,
+ "step": 1575
+ },
+ {
+ "epoch": 0.7451536643026004,
+ "grad_norm": 3.2562527656555176,
+ "learning_rate": 4.8323451579803615e-06,
+ "loss": 0.7686,
+ "step": 1576
+ },
+ {
+ "epoch": 0.7456264775413711,
+ "grad_norm": 2.799570083618164,
+ "learning_rate": 4.832120484638721e-06,
+ "loss": 0.6233,
+ "step": 1577
+ },
+ {
+ "epoch": 0.7460992907801418,
+ "grad_norm": 2.661893367767334,
+ "learning_rate": 4.831895666086164e-06,
+ "loss": 0.5841,
+ "step": 1578
+ },
+ {
+ "epoch": 0.7465721040189125,
+ "grad_norm": 3.0382652282714844,
+ "learning_rate": 4.831670702336689e-06,
+ "loss": 0.5769,
+ "step": 1579
+ },
+ {
+ "epoch": 0.7470449172576832,
+ "grad_norm": 2.676398515701294,
+ "learning_rate": 4.831445593404304e-06,
+ "loss": 0.619,
+ "step": 1580
+ },
+ {
+ "epoch": 0.7475177304964539,
+ "grad_norm": 2.717916965484619,
+ "learning_rate": 4.831220339303024e-06,
+ "loss": 0.5787,
+ "step": 1581
+ },
+ {
+ "epoch": 0.7479905437352246,
+ "grad_norm": 2.3918066024780273,
+ "learning_rate": 4.830994940046876e-06,
+ "loss": 0.5108,
+ "step": 1582
+ },
+ {
+ "epoch": 0.7484633569739952,
+ "grad_norm": 2.709144115447998,
+ "learning_rate": 4.830769395649895e-06,
+ "loss": 0.6875,
+ "step": 1583
+ },
+ {
+ "epoch": 0.7489361702127659,
+ "grad_norm": 2.8711116313934326,
+ "learning_rate": 4.830543706126123e-06,
+ "loss": 0.6745,
+ "step": 1584
+ },
+ {
+ "epoch": 0.7494089834515366,
+ "grad_norm": 2.612339496612549,
+ "learning_rate": 4.830317871489614e-06,
+ "loss": 0.5738,
+ "step": 1585
+ },
+ {
+ "epoch": 0.7498817966903073,
+ "grad_norm": 2.4355857372283936,
+ "learning_rate": 4.830091891754429e-06,
+ "loss": 0.5907,
+ "step": 1586
+ },
+ {
+ "epoch": 0.750354609929078,
+ "grad_norm": 2.676051378250122,
+ "learning_rate": 4.829865766934638e-06,
+ "loss": 0.6628,
+ "step": 1587
+ },
+ {
+ "epoch": 0.7508274231678487,
+ "grad_norm": 2.66489839553833,
+ "learning_rate": 4.829639497044323e-06,
+ "loss": 0.5984,
+ "step": 1588
+ },
+ {
+ "epoch": 0.7513002364066194,
+ "grad_norm": 2.5358035564422607,
+ "learning_rate": 4.829413082097572e-06,
+ "loss": 0.5867,
+ "step": 1589
+ },
+ {
+ "epoch": 0.75177304964539,
+ "grad_norm": 2.6530144214630127,
+ "learning_rate": 4.8291865221084815e-06,
+ "loss": 0.5917,
+ "step": 1590
+ },
+ {
+ "epoch": 0.7522458628841607,
+ "grad_norm": 2.5160958766937256,
+ "learning_rate": 4.82895981709116e-06,
+ "loss": 0.6347,
+ "step": 1591
+ },
+ {
+ "epoch": 0.7527186761229314,
+ "grad_norm": 2.61592698097229,
+ "learning_rate": 4.8287329670597225e-06,
+ "loss": 0.5472,
+ "step": 1592
+ },
+ {
+ "epoch": 0.7531914893617021,
+ "grad_norm": 2.7528622150421143,
+ "learning_rate": 4.828505972028296e-06,
+ "loss": 0.5842,
+ "step": 1593
+ },
+ {
+ "epoch": 0.7536643026004728,
+ "grad_norm": 2.8154072761535645,
+ "learning_rate": 4.828278832011011e-06,
+ "loss": 0.5757,
+ "step": 1594
+ },
+ {
+ "epoch": 0.7541371158392435,
+ "grad_norm": 3.118515729904175,
+ "learning_rate": 4.828051547022013e-06,
+ "loss": 0.6472,
+ "step": 1595
+ },
+ {
+ "epoch": 0.7546099290780142,
+ "grad_norm": 2.452033758163452,
+ "learning_rate": 4.827824117075453e-06,
+ "loss": 0.5571,
+ "step": 1596
+ },
+ {
+ "epoch": 0.7550827423167848,
+ "grad_norm": 2.984388828277588,
+ "learning_rate": 4.827596542185492e-06,
+ "loss": 0.6656,
+ "step": 1597
+ },
+ {
+ "epoch": 0.7555555555555555,
+ "grad_norm": 2.61356782913208,
+ "learning_rate": 4.8273688223663014e-06,
+ "loss": 0.6444,
+ "step": 1598
+ },
+ {
+ "epoch": 0.7560283687943262,
+ "grad_norm": 2.8967196941375732,
+ "learning_rate": 4.8271409576320595e-06,
+ "loss": 0.6457,
+ "step": 1599
+ },
+ {
+ "epoch": 0.7565011820330969,
+ "grad_norm": 2.852367639541626,
+ "learning_rate": 4.826912947996954e-06,
+ "loss": 0.5629,
+ "step": 1600
+ },
+ {
+ "epoch": 0.7569739952718676,
+ "grad_norm": 2.905280590057373,
+ "learning_rate": 4.826684793475182e-06,
+ "loss": 0.6245,
+ "step": 1601
+ },
+ {
+ "epoch": 0.7574468085106383,
+ "grad_norm": 2.6156530380249023,
+ "learning_rate": 4.826456494080951e-06,
+ "loss": 0.5869,
+ "step": 1602
+ },
+ {
+ "epoch": 0.757919621749409,
+ "grad_norm": 2.6490228176116943,
+ "learning_rate": 4.826228049828475e-06,
+ "loss": 0.5461,
+ "step": 1603
+ },
+ {
+ "epoch": 0.7583924349881797,
+ "grad_norm": 2.9626693725585938,
+ "learning_rate": 4.825999460731978e-06,
+ "loss": 0.6842,
+ "step": 1604
+ },
+ {
+ "epoch": 0.7588652482269503,
+ "grad_norm": 2.6866023540496826,
+ "learning_rate": 4.825770726805695e-06,
+ "loss": 0.5726,
+ "step": 1605
+ },
+ {
+ "epoch": 0.759338061465721,
+ "grad_norm": 2.5525858402252197,
+ "learning_rate": 4.825541848063866e-06,
+ "loss": 0.6061,
+ "step": 1606
+ },
+ {
+ "epoch": 0.7598108747044917,
+ "grad_norm": 2.703977584838867,
+ "learning_rate": 4.825312824520743e-06,
+ "loss": 0.6726,
+ "step": 1607
+ },
+ {
+ "epoch": 0.7602836879432624,
+ "grad_norm": 2.856534957885742,
+ "learning_rate": 4.825083656190588e-06,
+ "loss": 0.625,
+ "step": 1608
+ },
+ {
+ "epoch": 0.7607565011820331,
+ "grad_norm": 2.8564887046813965,
+ "learning_rate": 4.824854343087668e-06,
+ "loss": 0.7251,
+ "step": 1609
+ },
+ {
+ "epoch": 0.7612293144208038,
+ "grad_norm": 2.327650308609009,
+ "learning_rate": 4.824624885226262e-06,
+ "loss": 0.526,
+ "step": 1610
+ },
+ {
+ "epoch": 0.7617021276595745,
+ "grad_norm": 3.0025737285614014,
+ "learning_rate": 4.824395282620659e-06,
+ "loss": 0.6043,
+ "step": 1611
+ },
+ {
+ "epoch": 0.7621749408983451,
+ "grad_norm": 2.5441737174987793,
+ "learning_rate": 4.824165535285152e-06,
+ "loss": 0.6276,
+ "step": 1612
+ },
+ {
+ "epoch": 0.7626477541371158,
+ "grad_norm": 2.4177372455596924,
+ "learning_rate": 4.823935643234049e-06,
+ "loss": 0.6419,
+ "step": 1613
+ },
+ {
+ "epoch": 0.7631205673758865,
+ "grad_norm": 2.9210550785064697,
+ "learning_rate": 4.823705606481664e-06,
+ "loss": 0.5663,
+ "step": 1614
+ },
+ {
+ "epoch": 0.7635933806146572,
+ "grad_norm": 2.6353724002838135,
+ "learning_rate": 4.82347542504232e-06,
+ "loss": 0.5669,
+ "step": 1615
+ },
+ {
+ "epoch": 0.7640661938534279,
+ "grad_norm": 2.419081926345825,
+ "learning_rate": 4.823245098930349e-06,
+ "loss": 0.5777,
+ "step": 1616
+ },
+ {
+ "epoch": 0.7645390070921986,
+ "grad_norm": 2.5077571868896484,
+ "learning_rate": 4.823014628160093e-06,
+ "loss": 0.5924,
+ "step": 1617
+ },
+ {
+ "epoch": 0.7650118203309693,
+ "grad_norm": 2.816056251525879,
+ "learning_rate": 4.822784012745902e-06,
+ "loss": 0.7273,
+ "step": 1618
+ },
+ {
+ "epoch": 0.76548463356974,
+ "grad_norm": 2.7163147926330566,
+ "learning_rate": 4.8225532527021366e-06,
+ "loss": 0.5545,
+ "step": 1619
+ },
+ {
+ "epoch": 0.7659574468085106,
+ "grad_norm": 2.4784302711486816,
+ "learning_rate": 4.822322348043164e-06,
+ "loss": 0.556,
+ "step": 1620
+ },
+ {
+ "epoch": 0.7664302600472813,
+ "grad_norm": 2.712467670440674,
+ "learning_rate": 4.822091298783361e-06,
+ "loss": 0.6501,
+ "step": 1621
+ },
+ {
+ "epoch": 0.766903073286052,
+ "grad_norm": 2.7217724323272705,
+ "learning_rate": 4.821860104937115e-06,
+ "loss": 0.5989,
+ "step": 1622
+ },
+ {
+ "epoch": 0.7673758865248227,
+ "grad_norm": 2.5622854232788086,
+ "learning_rate": 4.821628766518821e-06,
+ "loss": 0.5263,
+ "step": 1623
+ },
+ {
+ "epoch": 0.7678486997635934,
+ "grad_norm": 3.230923891067505,
+ "learning_rate": 4.821397283542884e-06,
+ "loss": 0.6707,
+ "step": 1624
+ },
+ {
+ "epoch": 0.7683215130023641,
+ "grad_norm": 2.37929105758667,
+ "learning_rate": 4.821165656023718e-06,
+ "loss": 0.6124,
+ "step": 1625
+ },
+ {
+ "epoch": 0.7687943262411348,
+ "grad_norm": 2.9811325073242188,
+ "learning_rate": 4.820933883975745e-06,
+ "loss": 0.6435,
+ "step": 1626
+ },
+ {
+ "epoch": 0.7692671394799054,
+ "grad_norm": 2.887380838394165,
+ "learning_rate": 4.820701967413395e-06,
+ "loss": 0.621,
+ "step": 1627
+ },
+ {
+ "epoch": 0.7697399527186761,
+ "grad_norm": 2.6762876510620117,
+ "learning_rate": 4.820469906351109e-06,
+ "loss": 0.5713,
+ "step": 1628
+ },
+ {
+ "epoch": 0.7702127659574468,
+ "grad_norm": 2.7347512245178223,
+ "learning_rate": 4.820237700803337e-06,
+ "loss": 0.6136,
+ "step": 1629
+ },
+ {
+ "epoch": 0.7706855791962175,
+ "grad_norm": 2.7244746685028076,
+ "learning_rate": 4.820005350784539e-06,
+ "loss": 0.5816,
+ "step": 1630
+ },
+ {
+ "epoch": 0.7711583924349882,
+ "grad_norm": 2.9293999671936035,
+ "learning_rate": 4.8197728563091795e-06,
+ "loss": 0.6649,
+ "step": 1631
+ },
+ {
+ "epoch": 0.7716312056737589,
+ "grad_norm": 2.4402127265930176,
+ "learning_rate": 4.819540217391736e-06,
+ "loss": 0.6481,
+ "step": 1632
+ },
+ {
+ "epoch": 0.7721040189125296,
+ "grad_norm": 3.083941698074341,
+ "learning_rate": 4.819307434046694e-06,
+ "loss": 0.6951,
+ "step": 1633
+ },
+ {
+ "epoch": 0.7725768321513002,
+ "grad_norm": 2.544952392578125,
+ "learning_rate": 4.819074506288548e-06,
+ "loss": 0.539,
+ "step": 1634
+ },
+ {
+ "epoch": 0.7730496453900709,
+ "grad_norm": 2.7791268825531006,
+ "learning_rate": 4.818841434131801e-06,
+ "loss": 0.5827,
+ "step": 1635
+ },
+ {
+ "epoch": 0.7735224586288416,
+ "grad_norm": 2.7349796295166016,
+ "learning_rate": 4.818608217590967e-06,
+ "loss": 0.5584,
+ "step": 1636
+ },
+ {
+ "epoch": 0.7739952718676123,
+ "grad_norm": 2.637652635574341,
+ "learning_rate": 4.818374856680565e-06,
+ "loss": 0.6386,
+ "step": 1637
+ },
+ {
+ "epoch": 0.774468085106383,
+ "grad_norm": 2.9821584224700928,
+ "learning_rate": 4.818141351415127e-06,
+ "loss": 0.6734,
+ "step": 1638
+ },
+ {
+ "epoch": 0.7749408983451537,
+ "grad_norm": 2.992938995361328,
+ "learning_rate": 4.817907701809192e-06,
+ "loss": 0.5899,
+ "step": 1639
+ },
+ {
+ "epoch": 0.7754137115839244,
+ "grad_norm": 4.35719633102417,
+ "learning_rate": 4.8176739078773076e-06,
+ "loss": 0.6281,
+ "step": 1640
+ },
+ {
+ "epoch": 0.775886524822695,
+ "grad_norm": 2.838146209716797,
+ "learning_rate": 4.8174399696340315e-06,
+ "loss": 0.5766,
+ "step": 1641
+ },
+ {
+ "epoch": 0.7763593380614657,
+ "grad_norm": 3.3116989135742188,
+ "learning_rate": 4.81720588709393e-06,
+ "loss": 0.6409,
+ "step": 1642
+ },
+ {
+ "epoch": 0.7768321513002364,
+ "grad_norm": 2.9843590259552,
+ "learning_rate": 4.816971660271579e-06,
+ "loss": 0.6108,
+ "step": 1643
+ },
+ {
+ "epoch": 0.7773049645390071,
+ "grad_norm": 2.843770742416382,
+ "learning_rate": 4.816737289181562e-06,
+ "loss": 0.6053,
+ "step": 1644
+ },
+ {
+ "epoch": 0.7777777777777778,
+ "grad_norm": 2.7608556747436523,
+ "learning_rate": 4.816502773838473e-06,
+ "loss": 0.5854,
+ "step": 1645
+ },
+ {
+ "epoch": 0.7782505910165485,
+ "grad_norm": 3.343682289123535,
+ "learning_rate": 4.816268114256914e-06,
+ "loss": 0.6329,
+ "step": 1646
+ },
+ {
+ "epoch": 0.7787234042553192,
+ "grad_norm": 2.769768476486206,
+ "learning_rate": 4.816033310451496e-06,
+ "loss": 0.6242,
+ "step": 1647
+ },
+ {
+ "epoch": 0.7791962174940898,
+ "grad_norm": 2.989851713180542,
+ "learning_rate": 4.815798362436838e-06,
+ "loss": 0.6493,
+ "step": 1648
+ },
+ {
+ "epoch": 0.7796690307328605,
+ "grad_norm": 3.170736312866211,
+ "learning_rate": 4.8155632702275716e-06,
+ "loss": 0.6341,
+ "step": 1649
+ },
+ {
+ "epoch": 0.7801418439716312,
+ "grad_norm": 2.7372522354125977,
+ "learning_rate": 4.815328033838334e-06,
+ "loss": 0.5445,
+ "step": 1650
+ },
+ {
+ "epoch": 0.7806146572104019,
+ "grad_norm": 2.6947238445281982,
+ "learning_rate": 4.8150926532837715e-06,
+ "loss": 0.6437,
+ "step": 1651
+ },
+ {
+ "epoch": 0.7810874704491726,
+ "grad_norm": 2.472323179244995,
+ "learning_rate": 4.81485712857854e-06,
+ "loss": 0.5751,
+ "step": 1652
+ },
+ {
+ "epoch": 0.7815602836879433,
+ "grad_norm": 2.791114091873169,
+ "learning_rate": 4.814621459737308e-06,
+ "loss": 0.5996,
+ "step": 1653
+ },
+ {
+ "epoch": 0.782033096926714,
+ "grad_norm": 3.1957521438598633,
+ "learning_rate": 4.814385646774745e-06,
+ "loss": 0.5803,
+ "step": 1654
+ },
+ {
+ "epoch": 0.7825059101654847,
+ "grad_norm": 2.4120798110961914,
+ "learning_rate": 4.8141496897055364e-06,
+ "loss": 0.5814,
+ "step": 1655
+ },
+ {
+ "epoch": 0.7829787234042553,
+ "grad_norm": 2.9262423515319824,
+ "learning_rate": 4.813913588544374e-06,
+ "loss": 0.6292,
+ "step": 1656
+ },
+ {
+ "epoch": 0.783451536643026,
+ "grad_norm": 2.8251047134399414,
+ "learning_rate": 4.813677343305959e-06,
+ "loss": 0.6787,
+ "step": 1657
+ },
+ {
+ "epoch": 0.7839243498817967,
+ "grad_norm": 2.931659698486328,
+ "learning_rate": 4.8134409540050005e-06,
+ "loss": 0.6163,
+ "step": 1658
+ },
+ {
+ "epoch": 0.7843971631205674,
+ "grad_norm": 2.7160706520080566,
+ "learning_rate": 4.813204420656219e-06,
+ "loss": 0.6831,
+ "step": 1659
+ },
+ {
+ "epoch": 0.7848699763593381,
+ "grad_norm": 3.2134454250335693,
+ "learning_rate": 4.81296774327434e-06,
+ "loss": 0.6002,
+ "step": 1660
+ },
+ {
+ "epoch": 0.7853427895981088,
+ "grad_norm": 2.4002513885498047,
+ "learning_rate": 4.812730921874103e-06,
+ "loss": 0.5488,
+ "step": 1661
+ },
+ {
+ "epoch": 0.7858156028368795,
+ "grad_norm": 2.5559282302856445,
+ "learning_rate": 4.812493956470251e-06,
+ "loss": 0.5802,
+ "step": 1662
+ },
+ {
+ "epoch": 0.7862884160756501,
+ "grad_norm": 2.57478404045105,
+ "learning_rate": 4.812256847077541e-06,
+ "loss": 0.646,
+ "step": 1663
+ },
+ {
+ "epoch": 0.7867612293144208,
+ "grad_norm": 2.811851978302002,
+ "learning_rate": 4.812019593710736e-06,
+ "loss": 0.6245,
+ "step": 1664
+ },
+ {
+ "epoch": 0.7872340425531915,
+ "grad_norm": 2.5228829383850098,
+ "learning_rate": 4.811782196384609e-06,
+ "loss": 0.5949,
+ "step": 1665
+ },
+ {
+ "epoch": 0.7877068557919622,
+ "grad_norm": 2.744096040725708,
+ "learning_rate": 4.8115446551139415e-06,
+ "loss": 0.6006,
+ "step": 1666
+ },
+ {
+ "epoch": 0.7881796690307329,
+ "grad_norm": 3.129242420196533,
+ "learning_rate": 4.811306969913524e-06,
+ "loss": 0.7251,
+ "step": 1667
+ },
+ {
+ "epoch": 0.7886524822695036,
+ "grad_norm": 2.7855660915374756,
+ "learning_rate": 4.811069140798156e-06,
+ "loss": 0.6534,
+ "step": 1668
+ },
+ {
+ "epoch": 0.7891252955082743,
+ "grad_norm": 2.836603879928589,
+ "learning_rate": 4.810831167782647e-06,
+ "loss": 0.6661,
+ "step": 1669
+ },
+ {
+ "epoch": 0.789598108747045,
+ "grad_norm": 2.5339887142181396,
+ "learning_rate": 4.810593050881813e-06,
+ "loss": 0.5354,
+ "step": 1670
+ },
+ {
+ "epoch": 0.7900709219858156,
+ "grad_norm": 2.9553709030151367,
+ "learning_rate": 4.810354790110482e-06,
+ "loss": 0.6001,
+ "step": 1671
+ },
+ {
+ "epoch": 0.7905437352245863,
+ "grad_norm": 2.6581788063049316,
+ "learning_rate": 4.8101163854834885e-06,
+ "loss": 0.6802,
+ "step": 1672
+ },
+ {
+ "epoch": 0.791016548463357,
+ "grad_norm": 3.2002551555633545,
+ "learning_rate": 4.809877837015677e-06,
+ "loss": 0.6641,
+ "step": 1673
+ },
+ {
+ "epoch": 0.7914893617021277,
+ "grad_norm": 2.918792963027954,
+ "learning_rate": 4.809639144721902e-06,
+ "loss": 0.6758,
+ "step": 1674
+ },
+ {
+ "epoch": 0.7919621749408984,
+ "grad_norm": 2.7993946075439453,
+ "learning_rate": 4.8094003086170245e-06,
+ "loss": 0.5889,
+ "step": 1675
+ },
+ {
+ "epoch": 0.7924349881796691,
+ "grad_norm": 2.3698952198028564,
+ "learning_rate": 4.809161328715916e-06,
+ "loss": 0.6244,
+ "step": 1676
+ },
+ {
+ "epoch": 0.7929078014184398,
+ "grad_norm": 2.8891594409942627,
+ "learning_rate": 4.808922205033458e-06,
+ "loss": 0.5835,
+ "step": 1677
+ },
+ {
+ "epoch": 0.7933806146572104,
+ "grad_norm": 2.838345766067505,
+ "learning_rate": 4.808682937584537e-06,
+ "loss": 0.6907,
+ "step": 1678
+ },
+ {
+ "epoch": 0.7938534278959811,
+ "grad_norm": 2.8443174362182617,
+ "learning_rate": 4.808443526384053e-06,
+ "loss": 0.6692,
+ "step": 1679
+ },
+ {
+ "epoch": 0.7943262411347518,
+ "grad_norm": 2.7355034351348877,
+ "learning_rate": 4.808203971446913e-06,
+ "loss": 0.5799,
+ "step": 1680
+ },
+ {
+ "epoch": 0.7947990543735225,
+ "grad_norm": 2.7108020782470703,
+ "learning_rate": 4.807964272788033e-06,
+ "loss": 0.652,
+ "step": 1681
+ },
+ {
+ "epoch": 0.7952718676122932,
+ "grad_norm": 2.397650957107544,
+ "learning_rate": 4.807724430422338e-06,
+ "loss": 0.5418,
+ "step": 1682
+ },
+ {
+ "epoch": 0.7957446808510639,
+ "grad_norm": 2.4981582164764404,
+ "learning_rate": 4.807484444364762e-06,
+ "loss": 0.5731,
+ "step": 1683
+ },
+ {
+ "epoch": 0.7962174940898346,
+ "grad_norm": 2.7943713665008545,
+ "learning_rate": 4.8072443146302475e-06,
+ "loss": 0.5913,
+ "step": 1684
+ },
+ {
+ "epoch": 0.7966903073286052,
+ "grad_norm": 2.5691423416137695,
+ "learning_rate": 4.807004041233746e-06,
+ "loss": 0.6475,
+ "step": 1685
+ },
+ {
+ "epoch": 0.7971631205673759,
+ "grad_norm": 3.2367498874664307,
+ "learning_rate": 4.8067636241902195e-06,
+ "loss": 0.675,
+ "step": 1686
+ },
+ {
+ "epoch": 0.7976359338061466,
+ "grad_norm": 3.000595808029175,
+ "learning_rate": 4.806523063514637e-06,
+ "loss": 0.5481,
+ "step": 1687
+ },
+ {
+ "epoch": 0.7981087470449173,
+ "grad_norm": 2.702014207839966,
+ "learning_rate": 4.806282359221976e-06,
+ "loss": 0.5993,
+ "step": 1688
+ },
+ {
+ "epoch": 0.798581560283688,
+ "grad_norm": 2.383671998977661,
+ "learning_rate": 4.806041511327226e-06,
+ "loss": 0.562,
+ "step": 1689
+ },
+ {
+ "epoch": 0.7990543735224587,
+ "grad_norm": 2.6965041160583496,
+ "learning_rate": 4.8058005198453834e-06,
+ "loss": 0.5955,
+ "step": 1690
+ },
+ {
+ "epoch": 0.7995271867612294,
+ "grad_norm": 2.5906765460968018,
+ "learning_rate": 4.805559384791453e-06,
+ "loss": 0.5151,
+ "step": 1691
+ },
+ {
+ "epoch": 0.8,
+ "grad_norm": 2.5454652309417725,
+ "learning_rate": 4.8053181061804475e-06,
+ "loss": 0.5843,
+ "step": 1692
+ },
+ {
+ "epoch": 0.8004728132387707,
+ "grad_norm": 2.661343812942505,
+ "learning_rate": 4.8050766840273935e-06,
+ "loss": 0.5995,
+ "step": 1693
+ },
+ {
+ "epoch": 0.8009456264775414,
+ "grad_norm": 2.5635924339294434,
+ "learning_rate": 4.8048351183473215e-06,
+ "loss": 0.5676,
+ "step": 1694
+ },
+ {
+ "epoch": 0.8014184397163121,
+ "grad_norm": 2.5936667919158936,
+ "learning_rate": 4.804593409155274e-06,
+ "loss": 0.6291,
+ "step": 1695
+ },
+ {
+ "epoch": 0.8018912529550828,
+ "grad_norm": 2.6902432441711426,
+ "learning_rate": 4.804351556466299e-06,
+ "loss": 0.6114,
+ "step": 1696
+ },
+ {
+ "epoch": 0.8023640661938535,
+ "grad_norm": 2.7764673233032227,
+ "learning_rate": 4.804109560295457e-06,
+ "loss": 0.5768,
+ "step": 1697
+ },
+ {
+ "epoch": 0.8028368794326242,
+ "grad_norm": 2.9587221145629883,
+ "learning_rate": 4.803867420657816e-06,
+ "loss": 0.6048,
+ "step": 1698
+ },
+ {
+ "epoch": 0.8033096926713948,
+ "grad_norm": 2.9238998889923096,
+ "learning_rate": 4.803625137568453e-06,
+ "loss": 0.6329,
+ "step": 1699
+ },
+ {
+ "epoch": 0.8037825059101655,
+ "grad_norm": 2.70473313331604,
+ "learning_rate": 4.803382711042455e-06,
+ "loss": 0.5427,
+ "step": 1700
+ },
+ {
+ "epoch": 0.8042553191489362,
+ "grad_norm": 3.1604979038238525,
+ "learning_rate": 4.803140141094914e-06,
+ "loss": 0.626,
+ "step": 1701
+ },
+ {
+ "epoch": 0.8047281323877069,
+ "grad_norm": 2.9567699432373047,
+ "learning_rate": 4.802897427740936e-06,
+ "loss": 0.5319,
+ "step": 1702
+ },
+ {
+ "epoch": 0.8052009456264776,
+ "grad_norm": 2.90983247756958,
+ "learning_rate": 4.802654570995632e-06,
+ "loss": 0.586,
+ "step": 1703
+ },
+ {
+ "epoch": 0.8056737588652483,
+ "grad_norm": 2.783480167388916,
+ "learning_rate": 4.8024115708741255e-06,
+ "loss": 0.5773,
+ "step": 1704
+ },
+ {
+ "epoch": 0.806146572104019,
+ "grad_norm": 3.3307793140411377,
+ "learning_rate": 4.802168427391547e-06,
+ "loss": 0.6257,
+ "step": 1705
+ },
+ {
+ "epoch": 0.8066193853427897,
+ "grad_norm": 3.0475001335144043,
+ "learning_rate": 4.801925140563034e-06,
+ "loss": 0.6612,
+ "step": 1706
+ },
+ {
+ "epoch": 0.8070921985815603,
+ "grad_norm": 2.8278894424438477,
+ "learning_rate": 4.8016817104037375e-06,
+ "loss": 0.6449,
+ "step": 1707
+ },
+ {
+ "epoch": 0.807565011820331,
+ "grad_norm": 2.760244369506836,
+ "learning_rate": 4.801438136928812e-06,
+ "loss": 0.7007,
+ "step": 1708
+ },
+ {
+ "epoch": 0.8080378250591016,
+ "grad_norm": 2.827634572982788,
+ "learning_rate": 4.801194420153427e-06,
+ "loss": 0.6418,
+ "step": 1709
+ },
+ {
+ "epoch": 0.8085106382978723,
+ "grad_norm": 2.8655009269714355,
+ "learning_rate": 4.800950560092754e-06,
+ "loss": 0.6231,
+ "step": 1710
+ },
+ {
+ "epoch": 0.808983451536643,
+ "grad_norm": 2.738112688064575,
+ "learning_rate": 4.800706556761981e-06,
+ "loss": 0.6463,
+ "step": 1711
+ },
+ {
+ "epoch": 0.8094562647754137,
+ "grad_norm": 2.4781179428100586,
+ "learning_rate": 4.800462410176296e-06,
+ "loss": 0.5365,
+ "step": 1712
+ },
+ {
+ "epoch": 0.8099290780141843,
+ "grad_norm": 2.6049838066101074,
+ "learning_rate": 4.800218120350906e-06,
+ "loss": 0.6035,
+ "step": 1713
+ },
+ {
+ "epoch": 0.810401891252955,
+ "grad_norm": 2.9089980125427246,
+ "learning_rate": 4.79997368730102e-06,
+ "loss": 0.5828,
+ "step": 1714
+ },
+ {
+ "epoch": 0.8108747044917257,
+ "grad_norm": 2.831871747970581,
+ "learning_rate": 4.799729111041857e-06,
+ "loss": 0.5953,
+ "step": 1715
+ },
+ {
+ "epoch": 0.8113475177304964,
+ "grad_norm": 2.5611300468444824,
+ "learning_rate": 4.799484391588647e-06,
+ "loss": 0.6302,
+ "step": 1716
+ },
+ {
+ "epoch": 0.8118203309692671,
+ "grad_norm": 2.744070053100586,
+ "learning_rate": 4.799239528956625e-06,
+ "loss": 0.5561,
+ "step": 1717
+ },
+ {
+ "epoch": 0.8122931442080378,
+ "grad_norm": 2.7344231605529785,
+ "learning_rate": 4.798994523161041e-06,
+ "loss": 0.6317,
+ "step": 1718
+ },
+ {
+ "epoch": 0.8127659574468085,
+ "grad_norm": 2.3420889377593994,
+ "learning_rate": 4.798749374217149e-06,
+ "loss": 0.5415,
+ "step": 1719
+ },
+ {
+ "epoch": 0.8132387706855791,
+ "grad_norm": 2.57384991645813,
+ "learning_rate": 4.798504082140212e-06,
+ "loss": 0.6383,
+ "step": 1720
+ },
+ {
+ "epoch": 0.8137115839243498,
+ "grad_norm": 2.8819844722747803,
+ "learning_rate": 4.798258646945505e-06,
+ "loss": 0.6355,
+ "step": 1721
+ },
+ {
+ "epoch": 0.8141843971631205,
+ "grad_norm": 2.908123254776001,
+ "learning_rate": 4.79801306864831e-06,
+ "loss": 0.701,
+ "step": 1722
+ },
+ {
+ "epoch": 0.8146572104018912,
+ "grad_norm": 2.6500701904296875,
+ "learning_rate": 4.797767347263917e-06,
+ "loss": 0.6152,
+ "step": 1723
+ },
+ {
+ "epoch": 0.8151300236406619,
+ "grad_norm": 2.5513017177581787,
+ "learning_rate": 4.797521482807628e-06,
+ "loss": 0.6241,
+ "step": 1724
+ },
+ {
+ "epoch": 0.8156028368794326,
+ "grad_norm": 2.6239185333251953,
+ "learning_rate": 4.7972754752947495e-06,
+ "loss": 0.6072,
+ "step": 1725
+ },
+ {
+ "epoch": 0.8160756501182033,
+ "grad_norm": 2.673436403274536,
+ "learning_rate": 4.797029324740601e-06,
+ "loss": 0.5802,
+ "step": 1726
+ },
+ {
+ "epoch": 0.816548463356974,
+ "grad_norm": 2.533831834793091,
+ "learning_rate": 4.796783031160508e-06,
+ "loss": 0.5566,
+ "step": 1727
+ },
+ {
+ "epoch": 0.8170212765957446,
+ "grad_norm": 2.9806582927703857,
+ "learning_rate": 4.796536594569807e-06,
+ "loss": 0.6945,
+ "step": 1728
+ },
+ {
+ "epoch": 0.8174940898345153,
+ "grad_norm": 2.7093560695648193,
+ "learning_rate": 4.796290014983842e-06,
+ "loss": 0.7143,
+ "step": 1729
+ },
+ {
+ "epoch": 0.817966903073286,
+ "grad_norm": 2.814507246017456,
+ "learning_rate": 4.796043292417967e-06,
+ "loss": 0.6122,
+ "step": 1730
+ },
+ {
+ "epoch": 0.8184397163120567,
+ "grad_norm": 2.537156820297241,
+ "learning_rate": 4.795796426887543e-06,
+ "loss": 0.6229,
+ "step": 1731
+ },
+ {
+ "epoch": 0.8189125295508274,
+ "grad_norm": 2.4878013134002686,
+ "learning_rate": 4.795549418407944e-06,
+ "loss": 0.5442,
+ "step": 1732
+ },
+ {
+ "epoch": 0.8193853427895981,
+ "grad_norm": 2.839383363723755,
+ "learning_rate": 4.795302266994548e-06,
+ "loss": 0.6717,
+ "step": 1733
+ },
+ {
+ "epoch": 0.8198581560283688,
+ "grad_norm": 3.1981801986694336,
+ "learning_rate": 4.795054972662744e-06,
+ "loss": 0.6596,
+ "step": 1734
+ },
+ {
+ "epoch": 0.8203309692671394,
+ "grad_norm": 2.781730890274048,
+ "learning_rate": 4.79480753542793e-06,
+ "loss": 0.5845,
+ "step": 1735
+ },
+ {
+ "epoch": 0.8208037825059101,
+ "grad_norm": 2.689948558807373,
+ "learning_rate": 4.794559955305513e-06,
+ "loss": 0.5928,
+ "step": 1736
+ },
+ {
+ "epoch": 0.8212765957446808,
+ "grad_norm": 2.7267637252807617,
+ "learning_rate": 4.7943122323109105e-06,
+ "loss": 0.5224,
+ "step": 1737
+ },
+ {
+ "epoch": 0.8217494089834515,
+ "grad_norm": 2.4346601963043213,
+ "learning_rate": 4.794064366459544e-06,
+ "loss": 0.6431,
+ "step": 1738
+ },
+ {
+ "epoch": 0.8222222222222222,
+ "grad_norm": 2.7440176010131836,
+ "learning_rate": 4.793816357766849e-06,
+ "loss": 0.6083,
+ "step": 1739
+ },
+ {
+ "epoch": 0.8226950354609929,
+ "grad_norm": 2.6558027267456055,
+ "learning_rate": 4.793568206248268e-06,
+ "loss": 0.698,
+ "step": 1740
+ },
+ {
+ "epoch": 0.8231678486997636,
+ "grad_norm": 2.591658353805542,
+ "learning_rate": 4.793319911919251e-06,
+ "loss": 0.6601,
+ "step": 1741
+ },
+ {
+ "epoch": 0.8236406619385342,
+ "grad_norm": 2.5431172847747803,
+ "learning_rate": 4.79307147479526e-06,
+ "loss": 0.5917,
+ "step": 1742
+ },
+ {
+ "epoch": 0.8241134751773049,
+ "grad_norm": 2.7335588932037354,
+ "learning_rate": 4.792822894891762e-06,
+ "loss": 0.5925,
+ "step": 1743
+ },
+ {
+ "epoch": 0.8245862884160756,
+ "grad_norm": 2.2500839233398438,
+ "learning_rate": 4.792574172224237e-06,
+ "loss": 0.4984,
+ "step": 1744
+ },
+ {
+ "epoch": 0.8250591016548463,
+ "grad_norm": 2.691343069076538,
+ "learning_rate": 4.79232530680817e-06,
+ "loss": 0.6262,
+ "step": 1745
+ },
+ {
+ "epoch": 0.825531914893617,
+ "grad_norm": 2.612204074859619,
+ "learning_rate": 4.792076298659058e-06,
+ "loss": 0.5822,
+ "step": 1746
+ },
+ {
+ "epoch": 0.8260047281323877,
+ "grad_norm": 3.0163519382476807,
+ "learning_rate": 4.791827147792406e-06,
+ "loss": 0.6263,
+ "step": 1747
+ },
+ {
+ "epoch": 0.8264775413711584,
+ "grad_norm": 2.742183208465576,
+ "learning_rate": 4.791577854223727e-06,
+ "loss": 0.6628,
+ "step": 1748
+ },
+ {
+ "epoch": 0.826950354609929,
+ "grad_norm": 2.872213840484619,
+ "learning_rate": 4.791328417968542e-06,
+ "loss": 0.6332,
+ "step": 1749
+ },
+ {
+ "epoch": 0.8274231678486997,
+ "grad_norm": 2.725006580352783,
+ "learning_rate": 4.7910788390423844e-06,
+ "loss": 0.6266,
+ "step": 1750
+ },
+ {
+ "epoch": 0.8278959810874704,
+ "grad_norm": 3.0366697311401367,
+ "learning_rate": 4.790829117460793e-06,
+ "loss": 0.6403,
+ "step": 1751
+ },
+ {
+ "epoch": 0.8283687943262411,
+ "grad_norm": 2.594881772994995,
+ "learning_rate": 4.790579253239318e-06,
+ "loss": 0.521,
+ "step": 1752
+ },
+ {
+ "epoch": 0.8288416075650118,
+ "grad_norm": 2.4496347904205322,
+ "learning_rate": 4.790329246393517e-06,
+ "loss": 0.54,
+ "step": 1753
+ },
+ {
+ "epoch": 0.8293144208037825,
+ "grad_norm": 3.102278470993042,
+ "learning_rate": 4.790079096938956e-06,
+ "loss": 0.6142,
+ "step": 1754
+ },
+ {
+ "epoch": 0.8297872340425532,
+ "grad_norm": 2.4645912647247314,
+ "learning_rate": 4.789828804891212e-06,
+ "loss": 0.5212,
+ "step": 1755
+ },
+ {
+ "epoch": 0.8302600472813239,
+ "grad_norm": 2.7482516765594482,
+ "learning_rate": 4.789578370265868e-06,
+ "loss": 0.6712,
+ "step": 1756
+ },
+ {
+ "epoch": 0.8307328605200945,
+ "grad_norm": 2.61360502243042,
+ "learning_rate": 4.7893277930785195e-06,
+ "loss": 0.6367,
+ "step": 1757
+ },
+ {
+ "epoch": 0.8312056737588652,
+ "grad_norm": 2.79028058052063,
+ "learning_rate": 4.789077073344767e-06,
+ "loss": 0.5099,
+ "step": 1758
+ },
+ {
+ "epoch": 0.8316784869976359,
+ "grad_norm": 2.647662401199341,
+ "learning_rate": 4.788826211080222e-06,
+ "loss": 0.6698,
+ "step": 1759
+ },
+ {
+ "epoch": 0.8321513002364066,
+ "grad_norm": 3.0214831829071045,
+ "learning_rate": 4.7885752063005055e-06,
+ "loss": 0.6121,
+ "step": 1760
+ },
+ {
+ "epoch": 0.8326241134751773,
+ "grad_norm": 2.8244032859802246,
+ "learning_rate": 4.788324059021247e-06,
+ "loss": 0.6921,
+ "step": 1761
+ },
+ {
+ "epoch": 0.833096926713948,
+ "grad_norm": 3.1501076221466064,
+ "learning_rate": 4.788072769258082e-06,
+ "loss": 0.6872,
+ "step": 1762
+ },
+ {
+ "epoch": 0.8335697399527187,
+ "grad_norm": 2.6989903450012207,
+ "learning_rate": 4.7878213370266594e-06,
+ "loss": 0.5884,
+ "step": 1763
+ },
+ {
+ "epoch": 0.8340425531914893,
+ "grad_norm": 2.6982665061950684,
+ "learning_rate": 4.787569762342633e-06,
+ "loss": 0.6112,
+ "step": 1764
+ },
+ {
+ "epoch": 0.83451536643026,
+ "grad_norm": 2.6918323040008545,
+ "learning_rate": 4.7873180452216685e-06,
+ "loss": 0.5315,
+ "step": 1765
+ },
+ {
+ "epoch": 0.8349881796690307,
+ "grad_norm": 2.5494401454925537,
+ "learning_rate": 4.78706618567944e-06,
+ "loss": 0.5909,
+ "step": 1766
+ },
+ {
+ "epoch": 0.8354609929078014,
+ "grad_norm": 2.7532095909118652,
+ "learning_rate": 4.786814183731627e-06,
+ "loss": 0.5566,
+ "step": 1767
+ },
+ {
+ "epoch": 0.8359338061465721,
+ "grad_norm": 2.550865888595581,
+ "learning_rate": 4.786562039393923e-06,
+ "loss": 0.555,
+ "step": 1768
+ },
+ {
+ "epoch": 0.8364066193853428,
+ "grad_norm": 2.4477791786193848,
+ "learning_rate": 4.786309752682028e-06,
+ "loss": 0.5844,
+ "step": 1769
+ },
+ {
+ "epoch": 0.8368794326241135,
+ "grad_norm": 2.6982262134552,
+ "learning_rate": 4.7860573236116485e-06,
+ "loss": 0.6136,
+ "step": 1770
+ },
+ {
+ "epoch": 0.8373522458628841,
+ "grad_norm": 2.456263542175293,
+ "learning_rate": 4.785804752198503e-06,
+ "loss": 0.5055,
+ "step": 1771
+ },
+ {
+ "epoch": 0.8378250591016548,
+ "grad_norm": 2.428544521331787,
+ "learning_rate": 4.78555203845832e-06,
+ "loss": 0.5859,
+ "step": 1772
+ },
+ {
+ "epoch": 0.8382978723404255,
+ "grad_norm": 2.1782307624816895,
+ "learning_rate": 4.785299182406833e-06,
+ "loss": 0.5325,
+ "step": 1773
+ },
+ {
+ "epoch": 0.8387706855791962,
+ "grad_norm": 3.137956142425537,
+ "learning_rate": 4.785046184059786e-06,
+ "loss": 0.6097,
+ "step": 1774
+ },
+ {
+ "epoch": 0.8392434988179669,
+ "grad_norm": 2.6269001960754395,
+ "learning_rate": 4.7847930434329336e-06,
+ "loss": 0.5972,
+ "step": 1775
+ },
+ {
+ "epoch": 0.8397163120567376,
+ "grad_norm": 2.732659339904785,
+ "learning_rate": 4.784539760542037e-06,
+ "loss": 0.6054,
+ "step": 1776
+ },
+ {
+ "epoch": 0.8401891252955083,
+ "grad_norm": 2.5346736907958984,
+ "learning_rate": 4.784286335402866e-06,
+ "loss": 0.5521,
+ "step": 1777
+ },
+ {
+ "epoch": 0.840661938534279,
+ "grad_norm": 3.1420228481292725,
+ "learning_rate": 4.784032768031202e-06,
+ "loss": 0.6165,
+ "step": 1778
+ },
+ {
+ "epoch": 0.8411347517730496,
+ "grad_norm": 3.073793411254883,
+ "learning_rate": 4.783779058442831e-06,
+ "loss": 0.6414,
+ "step": 1779
+ },
+ {
+ "epoch": 0.8416075650118203,
+ "grad_norm": 2.6621336936950684,
+ "learning_rate": 4.783525206653554e-06,
+ "loss": 0.5836,
+ "step": 1780
+ },
+ {
+ "epoch": 0.842080378250591,
+ "grad_norm": 2.7029049396514893,
+ "learning_rate": 4.7832712126791745e-06,
+ "loss": 0.5897,
+ "step": 1781
+ },
+ {
+ "epoch": 0.8425531914893617,
+ "grad_norm": 2.4733822345733643,
+ "learning_rate": 4.783017076535509e-06,
+ "loss": 0.5913,
+ "step": 1782
+ },
+ {
+ "epoch": 0.8430260047281324,
+ "grad_norm": 2.8119473457336426,
+ "learning_rate": 4.782762798238381e-06,
+ "loss": 0.6105,
+ "step": 1783
+ },
+ {
+ "epoch": 0.8434988179669031,
+ "grad_norm": 2.5290818214416504,
+ "learning_rate": 4.782508377803622e-06,
+ "loss": 0.6119,
+ "step": 1784
+ },
+ {
+ "epoch": 0.8439716312056738,
+ "grad_norm": 3.193472385406494,
+ "learning_rate": 4.782253815247076e-06,
+ "loss": 0.6665,
+ "step": 1785
+ },
+ {
+ "epoch": 0.8444444444444444,
+ "grad_norm": 3.206759452819824,
+ "learning_rate": 4.781999110584592e-06,
+ "loss": 0.6012,
+ "step": 1786
+ },
+ {
+ "epoch": 0.8449172576832151,
+ "grad_norm": 2.6227457523345947,
+ "learning_rate": 4.781744263832029e-06,
+ "loss": 0.5845,
+ "step": 1787
+ },
+ {
+ "epoch": 0.8453900709219858,
+ "grad_norm": 2.838365316390991,
+ "learning_rate": 4.781489275005257e-06,
+ "loss": 0.5695,
+ "step": 1788
+ },
+ {
+ "epoch": 0.8458628841607565,
+ "grad_norm": 2.8348326683044434,
+ "learning_rate": 4.78123414412015e-06,
+ "loss": 0.6136,
+ "step": 1789
+ },
+ {
+ "epoch": 0.8463356973995272,
+ "grad_norm": 2.5698344707489014,
+ "learning_rate": 4.780978871192597e-06,
+ "loss": 0.6576,
+ "step": 1790
+ },
+ {
+ "epoch": 0.8468085106382979,
+ "grad_norm": 2.5198330879211426,
+ "learning_rate": 4.780723456238492e-06,
+ "loss": 0.5521,
+ "step": 1791
+ },
+ {
+ "epoch": 0.8472813238770686,
+ "grad_norm": 3.001325845718384,
+ "learning_rate": 4.780467899273737e-06,
+ "loss": 0.6075,
+ "step": 1792
+ },
+ {
+ "epoch": 0.8477541371158392,
+ "grad_norm": 2.7732746601104736,
+ "learning_rate": 4.780212200314247e-06,
+ "loss": 0.6245,
+ "step": 1793
+ },
+ {
+ "epoch": 0.8482269503546099,
+ "grad_norm": 2.6950337886810303,
+ "learning_rate": 4.77995635937594e-06,
+ "loss": 0.5723,
+ "step": 1794
+ },
+ {
+ "epoch": 0.8486997635933806,
+ "grad_norm": 2.82051420211792,
+ "learning_rate": 4.779700376474749e-06,
+ "loss": 0.6184,
+ "step": 1795
+ },
+ {
+ "epoch": 0.8491725768321513,
+ "grad_norm": 2.757791757583618,
+ "learning_rate": 4.779444251626611e-06,
+ "loss": 0.608,
+ "step": 1796
+ },
+ {
+ "epoch": 0.849645390070922,
+ "grad_norm": 2.394108533859253,
+ "learning_rate": 4.779187984847475e-06,
+ "loss": 0.6174,
+ "step": 1797
+ },
+ {
+ "epoch": 0.8501182033096927,
+ "grad_norm": 2.427562713623047,
+ "learning_rate": 4.778931576153296e-06,
+ "loss": 0.5618,
+ "step": 1798
+ },
+ {
+ "epoch": 0.8505910165484634,
+ "grad_norm": 2.891268491744995,
+ "learning_rate": 4.778675025560042e-06,
+ "loss": 0.6865,
+ "step": 1799
+ },
+ {
+ "epoch": 0.851063829787234,
+ "grad_norm": 2.665534257888794,
+ "learning_rate": 4.778418333083685e-06,
+ "loss": 0.5852,
+ "step": 1800
+ },
+ {
+ "epoch": 0.8515366430260047,
+ "grad_norm": 2.5492889881134033,
+ "learning_rate": 4.7781614987402095e-06,
+ "loss": 0.5161,
+ "step": 1801
+ },
+ {
+ "epoch": 0.8520094562647754,
+ "grad_norm": 2.400177001953125,
+ "learning_rate": 4.777904522545607e-06,
+ "loss": 0.5128,
+ "step": 1802
+ },
+ {
+ "epoch": 0.8524822695035461,
+ "grad_norm": 2.3949809074401855,
+ "learning_rate": 4.777647404515878e-06,
+ "loss": 0.571,
+ "step": 1803
+ },
+ {
+ "epoch": 0.8529550827423168,
+ "grad_norm": 2.3624472618103027,
+ "learning_rate": 4.7773901446670325e-06,
+ "loss": 0.5486,
+ "step": 1804
+ },
+ {
+ "epoch": 0.8534278959810875,
+ "grad_norm": 2.711366891860962,
+ "learning_rate": 4.7771327430150885e-06,
+ "loss": 0.5667,
+ "step": 1805
+ },
+ {
+ "epoch": 0.8539007092198582,
+ "grad_norm": 2.7681493759155273,
+ "learning_rate": 4.776875199576073e-06,
+ "loss": 0.5686,
+ "step": 1806
+ },
+ {
+ "epoch": 0.8543735224586289,
+ "grad_norm": 3.0369436740875244,
+ "learning_rate": 4.776617514366023e-06,
+ "loss": 0.6635,
+ "step": 1807
+ },
+ {
+ "epoch": 0.8548463356973995,
+ "grad_norm": 2.919649600982666,
+ "learning_rate": 4.776359687400983e-06,
+ "loss": 0.5749,
+ "step": 1808
+ },
+ {
+ "epoch": 0.8553191489361702,
+ "grad_norm": 2.7986185550689697,
+ "learning_rate": 4.776101718697007e-06,
+ "loss": 0.559,
+ "step": 1809
+ },
+ {
+ "epoch": 0.8557919621749409,
+ "grad_norm": 2.5951223373413086,
+ "learning_rate": 4.775843608270158e-06,
+ "loss": 0.5654,
+ "step": 1810
+ },
+ {
+ "epoch": 0.8562647754137116,
+ "grad_norm": 2.674138069152832,
+ "learning_rate": 4.775585356136505e-06,
+ "loss": 0.5286,
+ "step": 1811
+ },
+ {
+ "epoch": 0.8567375886524823,
+ "grad_norm": 3.045437812805176,
+ "learning_rate": 4.775326962312131e-06,
+ "loss": 0.6185,
+ "step": 1812
+ },
+ {
+ "epoch": 0.857210401891253,
+ "grad_norm": 2.6145293712615967,
+ "learning_rate": 4.775068426813124e-06,
+ "loss": 0.6075,
+ "step": 1813
+ },
+ {
+ "epoch": 0.8576832151300237,
+ "grad_norm": 2.6320106983184814,
+ "learning_rate": 4.7748097496555824e-06,
+ "loss": 0.561,
+ "step": 1814
+ },
+ {
+ "epoch": 0.8581560283687943,
+ "grad_norm": 2.5038623809814453,
+ "learning_rate": 4.774550930855612e-06,
+ "loss": 0.593,
+ "step": 1815
+ },
+ {
+ "epoch": 0.858628841607565,
+ "grad_norm": 2.8168089389801025,
+ "learning_rate": 4.774291970429329e-06,
+ "loss": 0.5196,
+ "step": 1816
+ },
+ {
+ "epoch": 0.8591016548463357,
+ "grad_norm": 2.778130292892456,
+ "learning_rate": 4.774032868392858e-06,
+ "loss": 0.5984,
+ "step": 1817
+ },
+ {
+ "epoch": 0.8595744680851064,
+ "grad_norm": 2.536458730697632,
+ "learning_rate": 4.7737736247623305e-06,
+ "loss": 0.568,
+ "step": 1818
+ },
+ {
+ "epoch": 0.8600472813238771,
+ "grad_norm": 2.6669719219207764,
+ "learning_rate": 4.77351423955389e-06,
+ "loss": 0.6233,
+ "step": 1819
+ },
+ {
+ "epoch": 0.8605200945626478,
+ "grad_norm": 2.578242540359497,
+ "learning_rate": 4.773254712783687e-06,
+ "loss": 0.579,
+ "step": 1820
+ },
+ {
+ "epoch": 0.8609929078014185,
+ "grad_norm": 2.816664457321167,
+ "learning_rate": 4.772995044467881e-06,
+ "loss": 0.6635,
+ "step": 1821
+ },
+ {
+ "epoch": 0.8614657210401891,
+ "grad_norm": 3.1111979484558105,
+ "learning_rate": 4.77273523462264e-06,
+ "loss": 0.6372,
+ "step": 1822
+ },
+ {
+ "epoch": 0.8619385342789598,
+ "grad_norm": 2.764552354812622,
+ "learning_rate": 4.772475283264142e-06,
+ "loss": 0.6216,
+ "step": 1823
+ },
+ {
+ "epoch": 0.8624113475177305,
+ "grad_norm": 2.9126830101013184,
+ "learning_rate": 4.772215190408572e-06,
+ "loss": 0.6396,
+ "step": 1824
+ },
+ {
+ "epoch": 0.8628841607565012,
+ "grad_norm": 2.7502307891845703,
+ "learning_rate": 4.7719549560721264e-06,
+ "loss": 0.6186,
+ "step": 1825
+ },
+ {
+ "epoch": 0.8633569739952719,
+ "grad_norm": 2.6279006004333496,
+ "learning_rate": 4.771694580271007e-06,
+ "loss": 0.5557,
+ "step": 1826
+ },
+ {
+ "epoch": 0.8638297872340426,
+ "grad_norm": 2.996563196182251,
+ "learning_rate": 4.7714340630214276e-06,
+ "loss": 0.6259,
+ "step": 1827
+ },
+ {
+ "epoch": 0.8643026004728133,
+ "grad_norm": 3.231323480606079,
+ "learning_rate": 4.771173404339609e-06,
+ "loss": 0.5473,
+ "step": 1828
+ },
+ {
+ "epoch": 0.864775413711584,
+ "grad_norm": 3.143519878387451,
+ "learning_rate": 4.770912604241781e-06,
+ "loss": 0.593,
+ "step": 1829
+ },
+ {
+ "epoch": 0.8652482269503546,
+ "grad_norm": 2.515484094619751,
+ "learning_rate": 4.770651662744184e-06,
+ "loss": 0.538,
+ "step": 1830
+ },
+ {
+ "epoch": 0.8657210401891253,
+ "grad_norm": 2.629058837890625,
+ "learning_rate": 4.770390579863064e-06,
+ "loss": 0.5745,
+ "step": 1831
+ },
+ {
+ "epoch": 0.866193853427896,
+ "grad_norm": 2.5826802253723145,
+ "learning_rate": 4.770129355614677e-06,
+ "loss": 0.6397,
+ "step": 1832
+ },
+ {
+ "epoch": 0.8666666666666667,
+ "grad_norm": 2.954623222351074,
+ "learning_rate": 4.769867990015289e-06,
+ "loss": 0.6106,
+ "step": 1833
+ },
+ {
+ "epoch": 0.8671394799054374,
+ "grad_norm": 2.742192268371582,
+ "learning_rate": 4.769606483081175e-06,
+ "loss": 0.6902,
+ "step": 1834
+ },
+ {
+ "epoch": 0.8676122931442081,
+ "grad_norm": 2.2619097232818604,
+ "learning_rate": 4.769344834828618e-06,
+ "loss": 0.5414,
+ "step": 1835
+ },
+ {
+ "epoch": 0.8680851063829788,
+ "grad_norm": 2.7384188175201416,
+ "learning_rate": 4.769083045273908e-06,
+ "loss": 0.5787,
+ "step": 1836
+ },
+ {
+ "epoch": 0.8685579196217494,
+ "grad_norm": 2.6734485626220703,
+ "learning_rate": 4.768821114433346e-06,
+ "loss": 0.5923,
+ "step": 1837
+ },
+ {
+ "epoch": 0.8690307328605201,
+ "grad_norm": 2.286140203475952,
+ "learning_rate": 4.768559042323243e-06,
+ "loss": 0.5822,
+ "step": 1838
+ },
+ {
+ "epoch": 0.8695035460992908,
+ "grad_norm": 3.0243725776672363,
+ "learning_rate": 4.768296828959915e-06,
+ "loss": 0.6623,
+ "step": 1839
+ },
+ {
+ "epoch": 0.8699763593380615,
+ "grad_norm": 2.4026312828063965,
+ "learning_rate": 4.768034474359689e-06,
+ "loss": 0.5554,
+ "step": 1840
+ },
+ {
+ "epoch": 0.8704491725768322,
+ "grad_norm": 2.7469029426574707,
+ "learning_rate": 4.767771978538903e-06,
+ "loss": 0.6316,
+ "step": 1841
+ },
+ {
+ "epoch": 0.8709219858156029,
+ "grad_norm": 2.729659080505371,
+ "learning_rate": 4.767509341513899e-06,
+ "loss": 0.5807,
+ "step": 1842
+ },
+ {
+ "epoch": 0.8713947990543736,
+ "grad_norm": 2.5336945056915283,
+ "learning_rate": 4.76724656330103e-06,
+ "loss": 0.6109,
+ "step": 1843
+ },
+ {
+ "epoch": 0.8718676122931442,
+ "grad_norm": 2.519880533218384,
+ "learning_rate": 4.76698364391666e-06,
+ "loss": 0.5313,
+ "step": 1844
+ },
+ {
+ "epoch": 0.8723404255319149,
+ "grad_norm": 2.698862075805664,
+ "learning_rate": 4.766720583377159e-06,
+ "loss": 0.5953,
+ "step": 1845
+ },
+ {
+ "epoch": 0.8728132387706856,
+ "grad_norm": 3.0195560455322266,
+ "learning_rate": 4.766457381698907e-06,
+ "loss": 0.5965,
+ "step": 1846
+ },
+ {
+ "epoch": 0.8732860520094563,
+ "grad_norm": 2.5972697734832764,
+ "learning_rate": 4.766194038898291e-06,
+ "loss": 0.6014,
+ "step": 1847
+ },
+ {
+ "epoch": 0.873758865248227,
+ "grad_norm": 2.7132294178009033,
+ "learning_rate": 4.76593055499171e-06,
+ "loss": 0.5638,
+ "step": 1848
+ },
+ {
+ "epoch": 0.8742316784869977,
+ "grad_norm": 2.7134575843811035,
+ "learning_rate": 4.765666929995568e-06,
+ "loss": 0.52,
+ "step": 1849
+ },
+ {
+ "epoch": 0.8747044917257684,
+ "grad_norm": 2.3804993629455566,
+ "learning_rate": 4.765403163926282e-06,
+ "loss": 0.5435,
+ "step": 1850
+ },
+ {
+ "epoch": 0.875177304964539,
+ "grad_norm": 2.8782761096954346,
+ "learning_rate": 4.765139256800274e-06,
+ "loss": 0.5843,
+ "step": 1851
+ },
+ {
+ "epoch": 0.8756501182033097,
+ "grad_norm": 2.836209774017334,
+ "learning_rate": 4.764875208633977e-06,
+ "loss": 0.6667,
+ "step": 1852
+ },
+ {
+ "epoch": 0.8761229314420804,
+ "grad_norm": 2.608851194381714,
+ "learning_rate": 4.764611019443831e-06,
+ "loss": 0.5436,
+ "step": 1853
+ },
+ {
+ "epoch": 0.8765957446808511,
+ "grad_norm": 2.788738965988159,
+ "learning_rate": 4.764346689246288e-06,
+ "loss": 0.7331,
+ "step": 1854
+ },
+ {
+ "epoch": 0.8770685579196218,
+ "grad_norm": 2.524277687072754,
+ "learning_rate": 4.764082218057805e-06,
+ "loss": 0.5067,
+ "step": 1855
+ },
+ {
+ "epoch": 0.8775413711583925,
+ "grad_norm": 3.7559316158294678,
+ "learning_rate": 4.763817605894851e-06,
+ "loss": 0.6809,
+ "step": 1856
+ },
+ {
+ "epoch": 0.8780141843971632,
+ "grad_norm": 2.9070613384246826,
+ "learning_rate": 4.763552852773899e-06,
+ "loss": 0.5913,
+ "step": 1857
+ },
+ {
+ "epoch": 0.8784869976359339,
+ "grad_norm": 2.7050609588623047,
+ "learning_rate": 4.7632879587114386e-06,
+ "loss": 0.6074,
+ "step": 1858
+ },
+ {
+ "epoch": 0.8789598108747045,
+ "grad_norm": 2.891134262084961,
+ "learning_rate": 4.76302292372396e-06,
+ "loss": 0.5939,
+ "step": 1859
+ },
+ {
+ "epoch": 0.8794326241134752,
+ "grad_norm": 2.8581702709198,
+ "learning_rate": 4.762757747827968e-06,
+ "loss": 0.5972,
+ "step": 1860
+ },
+ {
+ "epoch": 0.8799054373522459,
+ "grad_norm": 2.8266196250915527,
+ "learning_rate": 4.762492431039971e-06,
+ "loss": 0.5993,
+ "step": 1861
+ },
+ {
+ "epoch": 0.8803782505910166,
+ "grad_norm": 2.4853954315185547,
+ "learning_rate": 4.762226973376493e-06,
+ "loss": 0.6388,
+ "step": 1862
+ },
+ {
+ "epoch": 0.8808510638297873,
+ "grad_norm": 3.2212886810302734,
+ "learning_rate": 4.761961374854059e-06,
+ "loss": 0.6698,
+ "step": 1863
+ },
+ {
+ "epoch": 0.881323877068558,
+ "grad_norm": 3.1254501342773438,
+ "learning_rate": 4.761695635489211e-06,
+ "loss": 0.5263,
+ "step": 1864
+ },
+ {
+ "epoch": 0.8817966903073287,
+ "grad_norm": 2.6891462802886963,
+ "learning_rate": 4.761429755298491e-06,
+ "loss": 0.5359,
+ "step": 1865
+ },
+ {
+ "epoch": 0.8822695035460993,
+ "grad_norm": 2.8557538986206055,
+ "learning_rate": 4.761163734298457e-06,
+ "loss": 0.5933,
+ "step": 1866
+ },
+ {
+ "epoch": 0.88274231678487,
+ "grad_norm": 2.53548264503479,
+ "learning_rate": 4.7608975725056724e-06,
+ "loss": 0.6397,
+ "step": 1867
+ },
+ {
+ "epoch": 0.8832151300236407,
+ "grad_norm": 3.0237956047058105,
+ "learning_rate": 4.76063126993671e-06,
+ "loss": 0.6845,
+ "step": 1868
+ },
+ {
+ "epoch": 0.8836879432624114,
+ "grad_norm": 3.222886800765991,
+ "learning_rate": 4.76036482660815e-06,
+ "loss": 0.6055,
+ "step": 1869
+ },
+ {
+ "epoch": 0.8841607565011821,
+ "grad_norm": 3.1867551803588867,
+ "learning_rate": 4.760098242536584e-06,
+ "loss": 0.6592,
+ "step": 1870
+ },
+ {
+ "epoch": 0.8846335697399527,
+ "grad_norm": 2.782209873199463,
+ "learning_rate": 4.7598315177386115e-06,
+ "loss": 0.5833,
+ "step": 1871
+ },
+ {
+ "epoch": 0.8851063829787233,
+ "grad_norm": 2.899871587753296,
+ "learning_rate": 4.759564652230838e-06,
+ "loss": 0.6129,
+ "step": 1872
+ },
+ {
+ "epoch": 0.885579196217494,
+ "grad_norm": 2.5690579414367676,
+ "learning_rate": 4.759297646029882e-06,
+ "loss": 0.5827,
+ "step": 1873
+ },
+ {
+ "epoch": 0.8860520094562647,
+ "grad_norm": 2.666130304336548,
+ "learning_rate": 4.759030499152368e-06,
+ "loss": 0.5272,
+ "step": 1874
+ },
+ {
+ "epoch": 0.8865248226950354,
+ "grad_norm": 2.7030911445617676,
+ "learning_rate": 4.758763211614932e-06,
+ "loss": 0.6415,
+ "step": 1875
+ },
+ {
+ "epoch": 0.8869976359338061,
+ "grad_norm": 2.717512845993042,
+ "learning_rate": 4.7584957834342135e-06,
+ "loss": 0.5827,
+ "step": 1876
+ },
+ {
+ "epoch": 0.8874704491725768,
+ "grad_norm": 2.665823459625244,
+ "learning_rate": 4.758228214626867e-06,
+ "loss": 0.6209,
+ "step": 1877
+ },
+ {
+ "epoch": 0.8879432624113475,
+ "grad_norm": 2.636653184890747,
+ "learning_rate": 4.75796050520955e-06,
+ "loss": 0.6413,
+ "step": 1878
+ },
+ {
+ "epoch": 0.8884160756501182,
+ "grad_norm": 2.585115671157837,
+ "learning_rate": 4.7576926551989345e-06,
+ "loss": 0.5518,
+ "step": 1879
+ },
+ {
+ "epoch": 0.8888888888888888,
+ "grad_norm": 2.808526039123535,
+ "learning_rate": 4.757424664611697e-06,
+ "loss": 0.5717,
+ "step": 1880
+ },
+ {
+ "epoch": 0.8893617021276595,
+ "grad_norm": 3.5957939624786377,
+ "learning_rate": 4.757156533464524e-06,
+ "loss": 0.6323,
+ "step": 1881
+ },
+ {
+ "epoch": 0.8898345153664302,
+ "grad_norm": 2.5003883838653564,
+ "learning_rate": 4.756888261774111e-06,
+ "loss": 0.5937,
+ "step": 1882
+ },
+ {
+ "epoch": 0.8903073286052009,
+ "grad_norm": 2.749061346054077,
+ "learning_rate": 4.756619849557161e-06,
+ "loss": 0.6642,
+ "step": 1883
+ },
+ {
+ "epoch": 0.8907801418439716,
+ "grad_norm": 2.6757891178131104,
+ "learning_rate": 4.756351296830389e-06,
+ "loss": 0.5887,
+ "step": 1884
+ },
+ {
+ "epoch": 0.8912529550827423,
+ "grad_norm": 2.811925172805786,
+ "learning_rate": 4.756082603610516e-06,
+ "loss": 0.6571,
+ "step": 1885
+ },
+ {
+ "epoch": 0.891725768321513,
+ "grad_norm": 2.5054616928100586,
+ "learning_rate": 4.755813769914271e-06,
+ "loss": 0.6312,
+ "step": 1886
+ },
+ {
+ "epoch": 0.8921985815602836,
+ "grad_norm": 2.7518467903137207,
+ "learning_rate": 4.755544795758395e-06,
+ "loss": 0.6685,
+ "step": 1887
+ },
+ {
+ "epoch": 0.8926713947990543,
+ "grad_norm": 2.7527287006378174,
+ "learning_rate": 4.755275681159634e-06,
+ "loss": 0.5886,
+ "step": 1888
+ },
+ {
+ "epoch": 0.893144208037825,
+ "grad_norm": 2.6162452697753906,
+ "learning_rate": 4.755006426134745e-06,
+ "loss": 0.546,
+ "step": 1889
+ },
+ {
+ "epoch": 0.8936170212765957,
+ "grad_norm": 2.4016737937927246,
+ "learning_rate": 4.754737030700495e-06,
+ "loss": 0.5726,
+ "step": 1890
+ },
+ {
+ "epoch": 0.8940898345153664,
+ "grad_norm": 2.528327703475952,
+ "learning_rate": 4.754467494873656e-06,
+ "loss": 0.5682,
+ "step": 1891
+ },
+ {
+ "epoch": 0.8945626477541371,
+ "grad_norm": 2.3139286041259766,
+ "learning_rate": 4.7541978186710115e-06,
+ "loss": 0.6108,
+ "step": 1892
+ },
+ {
+ "epoch": 0.8950354609929078,
+ "grad_norm": 2.7269136905670166,
+ "learning_rate": 4.753928002109354e-06,
+ "loss": 0.5875,
+ "step": 1893
+ },
+ {
+ "epoch": 0.8955082742316784,
+ "grad_norm": 4.425495147705078,
+ "learning_rate": 4.753658045205482e-06,
+ "loss": 0.5572,
+ "step": 1894
+ },
+ {
+ "epoch": 0.8959810874704491,
+ "grad_norm": 2.535409927368164,
+ "learning_rate": 4.753387947976206e-06,
+ "loss": 0.5868,
+ "step": 1895
+ },
+ {
+ "epoch": 0.8964539007092198,
+ "grad_norm": 2.722458600997925,
+ "learning_rate": 4.753117710438343e-06,
+ "loss": 0.5935,
+ "step": 1896
+ },
+ {
+ "epoch": 0.8969267139479905,
+ "grad_norm": 2.743861436843872,
+ "learning_rate": 4.75284733260872e-06,
+ "loss": 0.572,
+ "step": 1897
+ },
+ {
+ "epoch": 0.8973995271867612,
+ "grad_norm": 2.60640549659729,
+ "learning_rate": 4.752576814504173e-06,
+ "loss": 0.567,
+ "step": 1898
+ },
+ {
+ "epoch": 0.8978723404255319,
+ "grad_norm": 2.7486042976379395,
+ "learning_rate": 4.7523061561415435e-06,
+ "loss": 0.5768,
+ "step": 1899
+ },
+ {
+ "epoch": 0.8983451536643026,
+ "grad_norm": 3.8410251140594482,
+ "learning_rate": 4.752035357537686e-06,
+ "loss": 0.6034,
+ "step": 1900
+ },
+ {
+ "epoch": 0.8988179669030733,
+ "grad_norm": 3.0935890674591064,
+ "learning_rate": 4.751764418709462e-06,
+ "loss": 0.5644,
+ "step": 1901
+ },
+ {
+ "epoch": 0.8992907801418439,
+ "grad_norm": 2.7989892959594727,
+ "learning_rate": 4.751493339673742e-06,
+ "loss": 0.656,
+ "step": 1902
+ },
+ {
+ "epoch": 0.8997635933806146,
+ "grad_norm": 3.6940557956695557,
+ "learning_rate": 4.751222120447403e-06,
+ "loss": 0.6632,
+ "step": 1903
+ },
+ {
+ "epoch": 0.9002364066193853,
+ "grad_norm": 2.3428797721862793,
+ "learning_rate": 4.750950761047335e-06,
+ "loss": 0.4485,
+ "step": 1904
+ },
+ {
+ "epoch": 0.900709219858156,
+ "grad_norm": 2.622544050216675,
+ "learning_rate": 4.750679261490432e-06,
+ "loss": 0.5857,
+ "step": 1905
+ },
+ {
+ "epoch": 0.9011820330969267,
+ "grad_norm": 2.4911322593688965,
+ "learning_rate": 4.750407621793601e-06,
+ "loss": 0.5618,
+ "step": 1906
+ },
+ {
+ "epoch": 0.9016548463356974,
+ "grad_norm": 2.6434662342071533,
+ "learning_rate": 4.750135841973755e-06,
+ "loss": 0.6057,
+ "step": 1907
+ },
+ {
+ "epoch": 0.902127659574468,
+ "grad_norm": 3.115443706512451,
+ "learning_rate": 4.749863922047817e-06,
+ "loss": 0.6064,
+ "step": 1908
+ },
+ {
+ "epoch": 0.9026004728132387,
+ "grad_norm": 2.5671091079711914,
+ "learning_rate": 4.749591862032718e-06,
+ "loss": 0.5625,
+ "step": 1909
+ },
+ {
+ "epoch": 0.9030732860520094,
+ "grad_norm": 3.2008655071258545,
+ "learning_rate": 4.749319661945398e-06,
+ "loss": 0.5547,
+ "step": 1910
+ },
+ {
+ "epoch": 0.9035460992907801,
+ "grad_norm": 2.905987024307251,
+ "learning_rate": 4.749047321802805e-06,
+ "loss": 0.6033,
+ "step": 1911
+ },
+ {
+ "epoch": 0.9040189125295508,
+ "grad_norm": 3.1456053256988525,
+ "learning_rate": 4.748774841621897e-06,
+ "loss": 0.5651,
+ "step": 1912
+ },
+ {
+ "epoch": 0.9044917257683215,
+ "grad_norm": 2.8116416931152344,
+ "learning_rate": 4.748502221419641e-06,
+ "loss": 0.5853,
+ "step": 1913
+ },
+ {
+ "epoch": 0.9049645390070922,
+ "grad_norm": 3.123835325241089,
+ "learning_rate": 4.748229461213011e-06,
+ "loss": 0.5427,
+ "step": 1914
+ },
+ {
+ "epoch": 0.9054373522458629,
+ "grad_norm": 2.4750146865844727,
+ "learning_rate": 4.747956561018989e-06,
+ "loss": 0.6517,
+ "step": 1915
+ },
+ {
+ "epoch": 0.9059101654846335,
+ "grad_norm": 2.6174299716949463,
+ "learning_rate": 4.7476835208545705e-06,
+ "loss": 0.6119,
+ "step": 1916
+ },
+ {
+ "epoch": 0.9063829787234042,
+ "grad_norm": 2.7390382289886475,
+ "learning_rate": 4.747410340736755e-06,
+ "loss": 0.5664,
+ "step": 1917
+ },
+ {
+ "epoch": 0.9068557919621749,
+ "grad_norm": 2.7940444946289062,
+ "learning_rate": 4.747137020682552e-06,
+ "loss": 0.5628,
+ "step": 1918
+ },
+ {
+ "epoch": 0.9073286052009456,
+ "grad_norm": 2.477365016937256,
+ "learning_rate": 4.7468635607089795e-06,
+ "loss": 0.5261,
+ "step": 1919
+ },
+ {
+ "epoch": 0.9078014184397163,
+ "grad_norm": 2.7016685009002686,
+ "learning_rate": 4.746589960833066e-06,
+ "loss": 0.5576,
+ "step": 1920
+ },
+ {
+ "epoch": 0.908274231678487,
+ "grad_norm": 2.8806519508361816,
+ "learning_rate": 4.746316221071846e-06,
+ "loss": 0.5925,
+ "step": 1921
+ },
+ {
+ "epoch": 0.9087470449172577,
+ "grad_norm": 3.0315234661102295,
+ "learning_rate": 4.746042341442365e-06,
+ "loss": 0.6142,
+ "step": 1922
+ },
+ {
+ "epoch": 0.9092198581560283,
+ "grad_norm": 4.2446160316467285,
+ "learning_rate": 4.745768321961676e-06,
+ "loss": 0.5352,
+ "step": 1923
+ },
+ {
+ "epoch": 0.909692671394799,
+ "grad_norm": 2.6517012119293213,
+ "learning_rate": 4.745494162646841e-06,
+ "loss": 0.6118,
+ "step": 1924
+ },
+ {
+ "epoch": 0.9101654846335697,
+ "grad_norm": 2.774900197982788,
+ "learning_rate": 4.7452198635149304e-06,
+ "loss": 0.572,
+ "step": 1925
+ },
+ {
+ "epoch": 0.9106382978723404,
+ "grad_norm": 3.0133683681488037,
+ "learning_rate": 4.744945424583024e-06,
+ "loss": 0.5897,
+ "step": 1926
+ },
+ {
+ "epoch": 0.9111111111111111,
+ "grad_norm": 2.7344839572906494,
+ "learning_rate": 4.744670845868211e-06,
+ "loss": 0.6207,
+ "step": 1927
+ },
+ {
+ "epoch": 0.9115839243498818,
+ "grad_norm": 2.636578321456909,
+ "learning_rate": 4.744396127387586e-06,
+ "loss": 0.6687,
+ "step": 1928
+ },
+ {
+ "epoch": 0.9120567375886525,
+ "grad_norm": 2.8663458824157715,
+ "learning_rate": 4.744121269158255e-06,
+ "loss": 0.5002,
+ "step": 1929
+ },
+ {
+ "epoch": 0.9125295508274232,
+ "grad_norm": 2.661079168319702,
+ "learning_rate": 4.743846271197333e-06,
+ "loss": 0.5848,
+ "step": 1930
+ },
+ {
+ "epoch": 0.9130023640661938,
+ "grad_norm": 2.881256341934204,
+ "learning_rate": 4.743571133521943e-06,
+ "loss": 0.5911,
+ "step": 1931
+ },
+ {
+ "epoch": 0.9134751773049645,
+ "grad_norm": 2.5540573596954346,
+ "learning_rate": 4.743295856149217e-06,
+ "loss": 0.5647,
+ "step": 1932
+ },
+ {
+ "epoch": 0.9139479905437352,
+ "grad_norm": 2.7060387134552,
+ "learning_rate": 4.743020439096293e-06,
+ "loss": 0.6267,
+ "step": 1933
+ },
+ {
+ "epoch": 0.9144208037825059,
+ "grad_norm": 2.694481372833252,
+ "learning_rate": 4.742744882380323e-06,
+ "loss": 0.6283,
+ "step": 1934
+ },
+ {
+ "epoch": 0.9148936170212766,
+ "grad_norm": 2.711555242538452,
+ "learning_rate": 4.7424691860184625e-06,
+ "loss": 0.5784,
+ "step": 1935
+ },
+ {
+ "epoch": 0.9153664302600473,
+ "grad_norm": 2.9077224731445312,
+ "learning_rate": 4.742193350027879e-06,
+ "loss": 0.5948,
+ "step": 1936
+ },
+ {
+ "epoch": 0.915839243498818,
+ "grad_norm": 2.9824187755584717,
+ "learning_rate": 4.7419173744257476e-06,
+ "loss": 0.6115,
+ "step": 1937
+ },
+ {
+ "epoch": 0.9163120567375886,
+ "grad_norm": 2.5127830505371094,
+ "learning_rate": 4.7416412592292515e-06,
+ "loss": 0.5803,
+ "step": 1938
+ },
+ {
+ "epoch": 0.9167848699763593,
+ "grad_norm": 3.1307175159454346,
+ "learning_rate": 4.741365004455583e-06,
+ "loss": 0.5657,
+ "step": 1939
+ },
+ {
+ "epoch": 0.91725768321513,
+ "grad_norm": 2.8205273151397705,
+ "learning_rate": 4.741088610121944e-06,
+ "loss": 0.6145,
+ "step": 1940
+ },
+ {
+ "epoch": 0.9177304964539007,
+ "grad_norm": 2.6119720935821533,
+ "learning_rate": 4.7408120762455444e-06,
+ "loss": 0.6058,
+ "step": 1941
+ },
+ {
+ "epoch": 0.9182033096926714,
+ "grad_norm": 2.421276092529297,
+ "learning_rate": 4.7405354028436025e-06,
+ "loss": 0.5973,
+ "step": 1942
+ },
+ {
+ "epoch": 0.9186761229314421,
+ "grad_norm": 2.9846808910369873,
+ "learning_rate": 4.740258589933346e-06,
+ "loss": 0.6892,
+ "step": 1943
+ },
+ {
+ "epoch": 0.9191489361702128,
+ "grad_norm": 2.6899871826171875,
+ "learning_rate": 4.739981637532009e-06,
+ "loss": 0.5705,
+ "step": 1944
+ },
+ {
+ "epoch": 0.9196217494089834,
+ "grad_norm": 2.8636131286621094,
+ "learning_rate": 4.739704545656839e-06,
+ "loss": 0.5775,
+ "step": 1945
+ },
+ {
+ "epoch": 0.9200945626477541,
+ "grad_norm": 2.7659449577331543,
+ "learning_rate": 4.739427314325087e-06,
+ "loss": 0.5823,
+ "step": 1946
+ },
+ {
+ "epoch": 0.9205673758865248,
+ "grad_norm": 4.71295166015625,
+ "learning_rate": 4.739149943554016e-06,
+ "loss": 0.5601,
+ "step": 1947
+ },
+ {
+ "epoch": 0.9210401891252955,
+ "grad_norm": 2.642636775970459,
+ "learning_rate": 4.738872433360896e-06,
+ "loss": 0.5278,
+ "step": 1948
+ },
+ {
+ "epoch": 0.9215130023640662,
+ "grad_norm": 2.4658217430114746,
+ "learning_rate": 4.7385947837630065e-06,
+ "loss": 0.6392,
+ "step": 1949
+ },
+ {
+ "epoch": 0.9219858156028369,
+ "grad_norm": 2.851602792739868,
+ "learning_rate": 4.738316994777636e-06,
+ "loss": 0.6164,
+ "step": 1950
+ },
+ {
+ "epoch": 0.9224586288416076,
+ "grad_norm": 2.394226551055908,
+ "learning_rate": 4.738039066422081e-06,
+ "loss": 0.5556,
+ "step": 1951
+ },
+ {
+ "epoch": 0.9229314420803783,
+ "grad_norm": 2.7985100746154785,
+ "learning_rate": 4.737760998713647e-06,
+ "loss": 0.5799,
+ "step": 1952
+ },
+ {
+ "epoch": 0.9234042553191489,
+ "grad_norm": 2.5974674224853516,
+ "learning_rate": 4.737482791669648e-06,
+ "loss": 0.6984,
+ "step": 1953
+ },
+ {
+ "epoch": 0.9238770685579196,
+ "grad_norm": 2.707636594772339,
+ "learning_rate": 4.737204445307406e-06,
+ "loss": 0.5548,
+ "step": 1954
+ },
+ {
+ "epoch": 0.9243498817966903,
+ "grad_norm": 2.7882707118988037,
+ "learning_rate": 4.736925959644254e-06,
+ "loss": 0.6026,
+ "step": 1955
+ },
+ {
+ "epoch": 0.924822695035461,
+ "grad_norm": 2.474482774734497,
+ "learning_rate": 4.7366473346975304e-06,
+ "loss": 0.5832,
+ "step": 1956
+ },
+ {
+ "epoch": 0.9252955082742317,
+ "grad_norm": 2.6196324825286865,
+ "learning_rate": 4.736368570484585e-06,
+ "loss": 0.5861,
+ "step": 1957
+ },
+ {
+ "epoch": 0.9257683215130024,
+ "grad_norm": 2.826864004135132,
+ "learning_rate": 4.736089667022775e-06,
+ "loss": 0.6173,
+ "step": 1958
+ },
+ {
+ "epoch": 0.926241134751773,
+ "grad_norm": 2.414473056793213,
+ "learning_rate": 4.735810624329466e-06,
+ "loss": 0.5753,
+ "step": 1959
+ },
+ {
+ "epoch": 0.9267139479905437,
+ "grad_norm": 2.8037970066070557,
+ "learning_rate": 4.7355314424220335e-06,
+ "loss": 0.6207,
+ "step": 1960
+ },
+ {
+ "epoch": 0.9271867612293144,
+ "grad_norm": 2.645458698272705,
+ "learning_rate": 4.735252121317861e-06,
+ "loss": 0.5959,
+ "step": 1961
+ },
+ {
+ "epoch": 0.9276595744680851,
+ "grad_norm": 2.7983884811401367,
+ "learning_rate": 4.734972661034339e-06,
+ "loss": 0.5696,
+ "step": 1962
+ },
+ {
+ "epoch": 0.9281323877068558,
+ "grad_norm": 3.0568997859954834,
+ "learning_rate": 4.73469306158887e-06,
+ "loss": 0.6194,
+ "step": 1963
+ },
+ {
+ "epoch": 0.9286052009456265,
+ "grad_norm": 2.7205135822296143,
+ "learning_rate": 4.734413322998863e-06,
+ "loss": 0.5292,
+ "step": 1964
+ },
+ {
+ "epoch": 0.9290780141843972,
+ "grad_norm": 3.3168489933013916,
+ "learning_rate": 4.734133445281735e-06,
+ "loss": 0.5654,
+ "step": 1965
+ },
+ {
+ "epoch": 0.9295508274231679,
+ "grad_norm": 3.0095653533935547,
+ "learning_rate": 4.733853428454916e-06,
+ "loss": 0.6508,
+ "step": 1966
+ },
+ {
+ "epoch": 0.9300236406619385,
+ "grad_norm": 2.7726712226867676,
+ "learning_rate": 4.733573272535838e-06,
+ "loss": 0.644,
+ "step": 1967
+ },
+ {
+ "epoch": 0.9304964539007092,
+ "grad_norm": 2.474397659301758,
+ "learning_rate": 4.7332929775419456e-06,
+ "loss": 0.5479,
+ "step": 1968
+ },
+ {
+ "epoch": 0.9309692671394799,
+ "grad_norm": 2.4518635272979736,
+ "learning_rate": 4.733012543490693e-06,
+ "loss": 0.6,
+ "step": 1969
+ },
+ {
+ "epoch": 0.9314420803782506,
+ "grad_norm": 2.9292192459106445,
+ "learning_rate": 4.73273197039954e-06,
+ "loss": 0.6647,
+ "step": 1970
+ },
+ {
+ "epoch": 0.9319148936170213,
+ "grad_norm": 2.425004720687866,
+ "learning_rate": 4.732451258285958e-06,
+ "loss": 0.6338,
+ "step": 1971
+ },
+ {
+ "epoch": 0.932387706855792,
+ "grad_norm": 2.904479503631592,
+ "learning_rate": 4.7321704071674255e-06,
+ "loss": 0.5923,
+ "step": 1972
+ },
+ {
+ "epoch": 0.9328605200945627,
+ "grad_norm": 2.477085590362549,
+ "learning_rate": 4.731889417061428e-06,
+ "loss": 0.5984,
+ "step": 1973
+ },
+ {
+ "epoch": 0.9333333333333333,
+ "grad_norm": 2.585240364074707,
+ "learning_rate": 4.731608287985465e-06,
+ "loss": 0.558,
+ "step": 1974
+ },
+ {
+ "epoch": 0.933806146572104,
+ "grad_norm": 2.658714532852173,
+ "learning_rate": 4.731327019957039e-06,
+ "loss": 0.5567,
+ "step": 1975
+ },
+ {
+ "epoch": 0.9342789598108747,
+ "grad_norm": 2.7593026161193848,
+ "learning_rate": 4.731045612993662e-06,
+ "loss": 0.5772,
+ "step": 1976
+ },
+ {
+ "epoch": 0.9347517730496454,
+ "grad_norm": 2.4386026859283447,
+ "learning_rate": 4.7307640671128585e-06,
+ "loss": 0.6199,
+ "step": 1977
+ },
+ {
+ "epoch": 0.9352245862884161,
+ "grad_norm": 2.681910514831543,
+ "learning_rate": 4.730482382332158e-06,
+ "loss": 0.5971,
+ "step": 1978
+ },
+ {
+ "epoch": 0.9356973995271868,
+ "grad_norm": 3.7593860626220703,
+ "learning_rate": 4.7302005586691e-06,
+ "loss": 0.6346,
+ "step": 1979
+ },
+ {
+ "epoch": 0.9361702127659575,
+ "grad_norm": 2.5789096355438232,
+ "learning_rate": 4.729918596141232e-06,
+ "loss": 0.5684,
+ "step": 1980
+ },
+ {
+ "epoch": 0.9366430260047282,
+ "grad_norm": 3.0607335567474365,
+ "learning_rate": 4.729636494766111e-06,
+ "loss": 0.6223,
+ "step": 1981
+ },
+ {
+ "epoch": 0.9371158392434988,
+ "grad_norm": 2.906643867492676,
+ "learning_rate": 4.729354254561303e-06,
+ "loss": 0.6513,
+ "step": 1982
+ },
+ {
+ "epoch": 0.9375886524822695,
+ "grad_norm": 3.192430019378662,
+ "learning_rate": 4.7290718755443795e-06,
+ "loss": 0.5095,
+ "step": 1983
+ },
+ {
+ "epoch": 0.9380614657210402,
+ "grad_norm": 2.661536931991577,
+ "learning_rate": 4.7287893577329255e-06,
+ "loss": 0.5525,
+ "step": 1984
+ },
+ {
+ "epoch": 0.9385342789598109,
+ "grad_norm": 2.8436734676361084,
+ "learning_rate": 4.728506701144531e-06,
+ "loss": 0.6323,
+ "step": 1985
+ },
+ {
+ "epoch": 0.9390070921985816,
+ "grad_norm": 2.75544810295105,
+ "learning_rate": 4.728223905796796e-06,
+ "loss": 0.6018,
+ "step": 1986
+ },
+ {
+ "epoch": 0.9394799054373523,
+ "grad_norm": 3.0652759075164795,
+ "learning_rate": 4.727940971707329e-06,
+ "loss": 0.62,
+ "step": 1987
+ },
+ {
+ "epoch": 0.939952718676123,
+ "grad_norm": 2.802567720413208,
+ "learning_rate": 4.727657898893747e-06,
+ "loss": 0.5809,
+ "step": 1988
+ },
+ {
+ "epoch": 0.9404255319148936,
+ "grad_norm": 2.6208512783050537,
+ "learning_rate": 4.7273746873736745e-06,
+ "loss": 0.5762,
+ "step": 1989
+ },
+ {
+ "epoch": 0.9408983451536643,
+ "grad_norm": 2.5901873111724854,
+ "learning_rate": 4.727091337164748e-06,
+ "loss": 0.6111,
+ "step": 1990
+ },
+ {
+ "epoch": 0.941371158392435,
+ "grad_norm": 3.002347707748413,
+ "learning_rate": 4.726807848284609e-06,
+ "loss": 0.6419,
+ "step": 1991
+ },
+ {
+ "epoch": 0.9418439716312057,
+ "grad_norm": 2.522151470184326,
+ "learning_rate": 4.72652422075091e-06,
+ "loss": 0.642,
+ "step": 1992
+ },
+ {
+ "epoch": 0.9423167848699764,
+ "grad_norm": 2.5571532249450684,
+ "learning_rate": 4.726240454581311e-06,
+ "loss": 0.5729,
+ "step": 1993
+ },
+ {
+ "epoch": 0.9427895981087471,
+ "grad_norm": 2.7704918384552,
+ "learning_rate": 4.72595654979348e-06,
+ "loss": 0.6816,
+ "step": 1994
+ },
+ {
+ "epoch": 0.9432624113475178,
+ "grad_norm": 2.517040491104126,
+ "learning_rate": 4.7256725064050955e-06,
+ "loss": 0.5782,
+ "step": 1995
+ },
+ {
+ "epoch": 0.9437352245862884,
+ "grad_norm": 2.613955020904541,
+ "learning_rate": 4.725388324433843e-06,
+ "loss": 0.6291,
+ "step": 1996
+ },
+ {
+ "epoch": 0.9442080378250591,
+ "grad_norm": 2.848891258239746,
+ "learning_rate": 4.725104003897418e-06,
+ "loss": 0.6544,
+ "step": 1997
+ },
+ {
+ "epoch": 0.9446808510638298,
+ "grad_norm": 3.0162429809570312,
+ "learning_rate": 4.724819544813523e-06,
+ "loss": 0.6301,
+ "step": 1998
+ },
+ {
+ "epoch": 0.9451536643026005,
+ "grad_norm": 2.613614559173584,
+ "learning_rate": 4.72453494719987e-06,
+ "loss": 0.5829,
+ "step": 1999
+ },
+ {
+ "epoch": 0.9456264775413712,
+ "grad_norm": 2.4838767051696777,
+ "learning_rate": 4.724250211074182e-06,
+ "loss": 0.6042,
+ "step": 2000
+ },
+ {
+ "epoch": 0.9460992907801419,
+ "grad_norm": 2.526470899581909,
+ "learning_rate": 4.723965336454185e-06,
+ "loss": 0.6167,
+ "step": 2001
+ },
+ {
+ "epoch": 0.9465721040189126,
+ "grad_norm": 2.504506826400757,
+ "learning_rate": 4.723680323357618e-06,
+ "loss": 0.6061,
+ "step": 2002
+ },
+ {
+ "epoch": 0.9470449172576832,
+ "grad_norm": 3.0547544956207275,
+ "learning_rate": 4.723395171802228e-06,
+ "loss": 0.6619,
+ "step": 2003
+ },
+ {
+ "epoch": 0.9475177304964539,
+ "grad_norm": 2.8692407608032227,
+ "learning_rate": 4.723109881805771e-06,
+ "loss": 0.5985,
+ "step": 2004
+ },
+ {
+ "epoch": 0.9479905437352246,
+ "grad_norm": 2.7929654121398926,
+ "learning_rate": 4.7228244533860094e-06,
+ "loss": 0.5869,
+ "step": 2005
+ },
+ {
+ "epoch": 0.9484633569739953,
+ "grad_norm": 2.764869451522827,
+ "learning_rate": 4.7225388865607146e-06,
+ "loss": 0.6288,
+ "step": 2006
+ },
+ {
+ "epoch": 0.948936170212766,
+ "grad_norm": 2.7656404972076416,
+ "learning_rate": 4.722253181347671e-06,
+ "loss": 0.5831,
+ "step": 2007
+ },
+ {
+ "epoch": 0.9494089834515367,
+ "grad_norm": 2.6698336601257324,
+ "learning_rate": 4.7219673377646635e-06,
+ "loss": 0.6087,
+ "step": 2008
+ },
+ {
+ "epoch": 0.9498817966903074,
+ "grad_norm": 2.524935722351074,
+ "learning_rate": 4.7216813558294946e-06,
+ "loss": 0.5675,
+ "step": 2009
+ },
+ {
+ "epoch": 0.950354609929078,
+ "grad_norm": 2.5998785495758057,
+ "learning_rate": 4.721395235559969e-06,
+ "loss": 0.5667,
+ "step": 2010
+ },
+ {
+ "epoch": 0.9508274231678487,
+ "grad_norm": 2.758021354675293,
+ "learning_rate": 4.721108976973902e-06,
+ "loss": 0.4931,
+ "step": 2011
+ },
+ {
+ "epoch": 0.9513002364066194,
+ "grad_norm": 2.767695903778076,
+ "learning_rate": 4.72082258008912e-06,
+ "loss": 0.5778,
+ "step": 2012
+ },
+ {
+ "epoch": 0.9517730496453901,
+ "grad_norm": 2.982314348220825,
+ "learning_rate": 4.720536044923453e-06,
+ "loss": 0.6096,
+ "step": 2013
+ },
+ {
+ "epoch": 0.9522458628841608,
+ "grad_norm": 2.7608799934387207,
+ "learning_rate": 4.720249371494743e-06,
+ "loss": 0.6242,
+ "step": 2014
+ },
+ {
+ "epoch": 0.9527186761229315,
+ "grad_norm": 2.60054349899292,
+ "learning_rate": 4.71996255982084e-06,
+ "loss": 0.6249,
+ "step": 2015
+ },
+ {
+ "epoch": 0.9531914893617022,
+ "grad_norm": 2.654355764389038,
+ "learning_rate": 4.719675609919603e-06,
+ "loss": 0.6327,
+ "step": 2016
+ },
+ {
+ "epoch": 0.9536643026004729,
+ "grad_norm": 2.589404582977295,
+ "learning_rate": 4.719388521808899e-06,
+ "loss": 0.6357,
+ "step": 2017
+ },
+ {
+ "epoch": 0.9541371158392435,
+ "grad_norm": 2.8016581535339355,
+ "learning_rate": 4.719101295506603e-06,
+ "loss": 0.5901,
+ "step": 2018
+ },
+ {
+ "epoch": 0.9546099290780142,
+ "grad_norm": 3.1408045291900635,
+ "learning_rate": 4.7188139310306e-06,
+ "loss": 0.598,
+ "step": 2019
+ },
+ {
+ "epoch": 0.9550827423167849,
+ "grad_norm": 2.7432665824890137,
+ "learning_rate": 4.718526428398783e-06,
+ "loss": 0.5508,
+ "step": 2020
+ },
+ {
+ "epoch": 0.9555555555555556,
+ "grad_norm": 2.947800874710083,
+ "learning_rate": 4.718238787629053e-06,
+ "loss": 0.6439,
+ "step": 2021
+ },
+ {
+ "epoch": 0.9560283687943263,
+ "grad_norm": 2.50828218460083,
+ "learning_rate": 4.71795100873932e-06,
+ "loss": 0.5441,
+ "step": 2022
+ },
+ {
+ "epoch": 0.956501182033097,
+ "grad_norm": 2.8558974266052246,
+ "learning_rate": 4.717663091747503e-06,
+ "loss": 0.5416,
+ "step": 2023
+ },
+ {
+ "epoch": 0.9569739952718677,
+ "grad_norm": 2.4803316593170166,
+ "learning_rate": 4.71737503667153e-06,
+ "loss": 0.5317,
+ "step": 2024
+ },
+ {
+ "epoch": 0.9574468085106383,
+ "grad_norm": 4.36754035949707,
+ "learning_rate": 4.717086843529336e-06,
+ "loss": 0.5808,
+ "step": 2025
+ },
+ {
+ "epoch": 0.957919621749409,
+ "grad_norm": 2.730185031890869,
+ "learning_rate": 4.7167985123388665e-06,
+ "loss": 0.5257,
+ "step": 2026
+ },
+ {
+ "epoch": 0.9583924349881797,
+ "grad_norm": 2.8136069774627686,
+ "learning_rate": 4.716510043118074e-06,
+ "loss": 0.5836,
+ "step": 2027
+ },
+ {
+ "epoch": 0.9588652482269504,
+ "grad_norm": 2.793975353240967,
+ "learning_rate": 4.71622143588492e-06,
+ "loss": 0.5706,
+ "step": 2028
+ },
+ {
+ "epoch": 0.9593380614657211,
+ "grad_norm": 2.3883821964263916,
+ "learning_rate": 4.7159326906573745e-06,
+ "loss": 0.5291,
+ "step": 2029
+ },
+ {
+ "epoch": 0.9598108747044918,
+ "grad_norm": 2.6135976314544678,
+ "learning_rate": 4.715643807453417e-06,
+ "loss": 0.6199,
+ "step": 2030
+ },
+ {
+ "epoch": 0.9602836879432625,
+ "grad_norm": 2.6245670318603516,
+ "learning_rate": 4.715354786291035e-06,
+ "loss": 0.5585,
+ "step": 2031
+ },
+ {
+ "epoch": 0.9607565011820332,
+ "grad_norm": 2.7870967388153076,
+ "learning_rate": 4.715065627188225e-06,
+ "loss": 0.6196,
+ "step": 2032
+ },
+ {
+ "epoch": 0.9612293144208038,
+ "grad_norm": 2.6983911991119385,
+ "learning_rate": 4.714776330162991e-06,
+ "loss": 0.6424,
+ "step": 2033
+ },
+ {
+ "epoch": 0.9617021276595744,
+ "grad_norm": 2.3221919536590576,
+ "learning_rate": 4.7144868952333465e-06,
+ "loss": 0.568,
+ "step": 2034
+ },
+ {
+ "epoch": 0.9621749408983451,
+ "grad_norm": 2.9408178329467773,
+ "learning_rate": 4.714197322417314e-06,
+ "loss": 0.6175,
+ "step": 2035
+ },
+ {
+ "epoch": 0.9626477541371158,
+ "grad_norm": 2.404057264328003,
+ "learning_rate": 4.713907611732921e-06,
+ "loss": 0.4943,
+ "step": 2036
+ },
+ {
+ "epoch": 0.9631205673758865,
+ "grad_norm": 3.547607660293579,
+ "learning_rate": 4.71361776319821e-06,
+ "loss": 0.5488,
+ "step": 2037
+ },
+ {
+ "epoch": 0.9635933806146572,
+ "grad_norm": 2.679614543914795,
+ "learning_rate": 4.713327776831227e-06,
+ "loss": 0.6234,
+ "step": 2038
+ },
+ {
+ "epoch": 0.9640661938534278,
+ "grad_norm": 2.526914119720459,
+ "learning_rate": 4.7130376526500286e-06,
+ "loss": 0.5891,
+ "step": 2039
+ },
+ {
+ "epoch": 0.9645390070921985,
+ "grad_norm": 2.6953470706939697,
+ "learning_rate": 4.71274739067268e-06,
+ "loss": 0.69,
+ "step": 2040
+ },
+ {
+ "epoch": 0.9650118203309692,
+ "grad_norm": 2.546660900115967,
+ "learning_rate": 4.712456990917254e-06,
+ "loss": 0.6185,
+ "step": 2041
+ },
+ {
+ "epoch": 0.9654846335697399,
+ "grad_norm": 3.3920490741729736,
+ "learning_rate": 4.712166453401832e-06,
+ "loss": 0.587,
+ "step": 2042
+ },
+ {
+ "epoch": 0.9659574468085106,
+ "grad_norm": 2.5961573123931885,
+ "learning_rate": 4.711875778144504e-06,
+ "loss": 0.6105,
+ "step": 2043
+ },
+ {
+ "epoch": 0.9664302600472813,
+ "grad_norm": 2.5111498832702637,
+ "learning_rate": 4.711584965163372e-06,
+ "loss": 0.5533,
+ "step": 2044
+ },
+ {
+ "epoch": 0.966903073286052,
+ "grad_norm": 2.4878132343292236,
+ "learning_rate": 4.7112940144765405e-06,
+ "loss": 0.5604,
+ "step": 2045
+ },
+ {
+ "epoch": 0.9673758865248226,
+ "grad_norm": 2.5714077949523926,
+ "learning_rate": 4.711002926102128e-06,
+ "loss": 0.5794,
+ "step": 2046
+ },
+ {
+ "epoch": 0.9678486997635933,
+ "grad_norm": 2.7069091796875,
+ "learning_rate": 4.710711700058257e-06,
+ "loss": 0.594,
+ "step": 2047
+ },
+ {
+ "epoch": 0.968321513002364,
+ "grad_norm": 2.8104631900787354,
+ "learning_rate": 4.710420336363063e-06,
+ "loss": 0.6247,
+ "step": 2048
+ },
+ {
+ "epoch": 0.9687943262411347,
+ "grad_norm": 2.8464386463165283,
+ "learning_rate": 4.7101288350346865e-06,
+ "loss": 0.6162,
+ "step": 2049
+ },
+ {
+ "epoch": 0.9692671394799054,
+ "grad_norm": 2.7187976837158203,
+ "learning_rate": 4.709837196091279e-06,
+ "loss": 0.6109,
+ "step": 2050
+ },
+ {
+ "epoch": 0.9697399527186761,
+ "grad_norm": 2.556734085083008,
+ "learning_rate": 4.709545419550999e-06,
+ "loss": 0.6297,
+ "step": 2051
+ },
+ {
+ "epoch": 0.9702127659574468,
+ "grad_norm": 2.937195062637329,
+ "learning_rate": 4.709253505432014e-06,
+ "loss": 0.6862,
+ "step": 2052
+ },
+ {
+ "epoch": 0.9706855791962175,
+ "grad_norm": 2.792175531387329,
+ "learning_rate": 4.7089614537525015e-06,
+ "loss": 0.6105,
+ "step": 2053
+ },
+ {
+ "epoch": 0.9711583924349881,
+ "grad_norm": 2.625636100769043,
+ "learning_rate": 4.708669264530644e-06,
+ "loss": 0.5849,
+ "step": 2054
+ },
+ {
+ "epoch": 0.9716312056737588,
+ "grad_norm": 2.6752610206604004,
+ "learning_rate": 4.708376937784637e-06,
+ "loss": 0.5949,
+ "step": 2055
+ },
+ {
+ "epoch": 0.9721040189125295,
+ "grad_norm": 2.6072793006896973,
+ "learning_rate": 4.708084473532681e-06,
+ "loss": 0.5776,
+ "step": 2056
+ },
+ {
+ "epoch": 0.9725768321513002,
+ "grad_norm": 2.728632926940918,
+ "learning_rate": 4.707791871792988e-06,
+ "loss": 0.6352,
+ "step": 2057
+ },
+ {
+ "epoch": 0.9730496453900709,
+ "grad_norm": 2.5841758251190186,
+ "learning_rate": 4.707499132583775e-06,
+ "loss": 0.5488,
+ "step": 2058
+ },
+ {
+ "epoch": 0.9735224586288416,
+ "grad_norm": 2.8464293479919434,
+ "learning_rate": 4.707206255923271e-06,
+ "loss": 0.7051,
+ "step": 2059
+ },
+ {
+ "epoch": 0.9739952718676123,
+ "grad_norm": 2.547297239303589,
+ "learning_rate": 4.706913241829712e-06,
+ "loss": 0.5937,
+ "step": 2060
+ },
+ {
+ "epoch": 0.9744680851063829,
+ "grad_norm": 2.6572306156158447,
+ "learning_rate": 4.706620090321341e-06,
+ "loss": 0.6041,
+ "step": 2061
+ },
+ {
+ "epoch": 0.9749408983451536,
+ "grad_norm": 2.3262805938720703,
+ "learning_rate": 4.706326801416414e-06,
+ "loss": 0.5144,
+ "step": 2062
+ },
+ {
+ "epoch": 0.9754137115839243,
+ "grad_norm": 2.9693965911865234,
+ "learning_rate": 4.706033375133191e-06,
+ "loss": 0.551,
+ "step": 2063
+ },
+ {
+ "epoch": 0.975886524822695,
+ "grad_norm": 2.5993731021881104,
+ "learning_rate": 4.7057398114899435e-06,
+ "loss": 0.6143,
+ "step": 2064
+ },
+ {
+ "epoch": 0.9763593380614657,
+ "grad_norm": 2.453336477279663,
+ "learning_rate": 4.70544611050495e-06,
+ "loss": 0.6093,
+ "step": 2065
+ },
+ {
+ "epoch": 0.9768321513002364,
+ "grad_norm": 2.898629665374756,
+ "learning_rate": 4.705152272196497e-06,
+ "loss": 0.6007,
+ "step": 2066
+ },
+ {
+ "epoch": 0.9773049645390071,
+ "grad_norm": 2.7990612983703613,
+ "learning_rate": 4.7048582965828815e-06,
+ "loss": 0.6687,
+ "step": 2067
+ },
+ {
+ "epoch": 0.9777777777777777,
+ "grad_norm": 2.635284423828125,
+ "learning_rate": 4.704564183682408e-06,
+ "loss": 0.5564,
+ "step": 2068
+ },
+ {
+ "epoch": 0.9782505910165484,
+ "grad_norm": 3.014547109603882,
+ "learning_rate": 4.704269933513389e-06,
+ "loss": 0.6084,
+ "step": 2069
+ },
+ {
+ "epoch": 0.9787234042553191,
+ "grad_norm": 2.659357786178589,
+ "learning_rate": 4.703975546094147e-06,
+ "loss": 0.6031,
+ "step": 2070
+ },
+ {
+ "epoch": 0.9791962174940898,
+ "grad_norm": 2.326932668685913,
+ "learning_rate": 4.703681021443013e-06,
+ "loss": 0.5859,
+ "step": 2071
+ },
+ {
+ "epoch": 0.9796690307328605,
+ "grad_norm": 2.958803653717041,
+ "learning_rate": 4.7033863595783235e-06,
+ "loss": 0.5586,
+ "step": 2072
+ },
+ {
+ "epoch": 0.9801418439716312,
+ "grad_norm": 2.921386957168579,
+ "learning_rate": 4.703091560518427e-06,
+ "loss": 0.6126,
+ "step": 2073
+ },
+ {
+ "epoch": 0.9806146572104019,
+ "grad_norm": 2.6500775814056396,
+ "learning_rate": 4.702796624281679e-06,
+ "loss": 0.5678,
+ "step": 2074
+ },
+ {
+ "epoch": 0.9810874704491725,
+ "grad_norm": 2.7740228176116943,
+ "learning_rate": 4.702501550886445e-06,
+ "loss": 0.6067,
+ "step": 2075
+ },
+ {
+ "epoch": 0.9815602836879432,
+ "grad_norm": 2.3296213150024414,
+ "learning_rate": 4.702206340351096e-06,
+ "loss": 0.5247,
+ "step": 2076
+ },
+ {
+ "epoch": 0.9820330969267139,
+ "grad_norm": 2.748300790786743,
+ "learning_rate": 4.701910992694016e-06,
+ "loss": 0.5197,
+ "step": 2077
+ },
+ {
+ "epoch": 0.9825059101654846,
+ "grad_norm": 2.250985622406006,
+ "learning_rate": 4.7016155079335926e-06,
+ "loss": 0.5214,
+ "step": 2078
+ },
+ {
+ "epoch": 0.9829787234042553,
+ "grad_norm": 2.389845848083496,
+ "learning_rate": 4.701319886088226e-06,
+ "loss": 0.519,
+ "step": 2079
+ },
+ {
+ "epoch": 0.983451536643026,
+ "grad_norm": 2.818220853805542,
+ "learning_rate": 4.701024127176322e-06,
+ "loss": 0.607,
+ "step": 2080
+ },
+ {
+ "epoch": 0.9839243498817967,
+ "grad_norm": 3.4058034420013428,
+ "learning_rate": 4.700728231216297e-06,
+ "loss": 0.5711,
+ "step": 2081
+ },
+ {
+ "epoch": 0.9843971631205674,
+ "grad_norm": 2.5297787189483643,
+ "learning_rate": 4.700432198226575e-06,
+ "loss": 0.5979,
+ "step": 2082
+ },
+ {
+ "epoch": 0.984869976359338,
+ "grad_norm": 3.0548105239868164,
+ "learning_rate": 4.7001360282255885e-06,
+ "loss": 0.6041,
+ "step": 2083
+ },
+ {
+ "epoch": 0.9853427895981087,
+ "grad_norm": 2.8983733654022217,
+ "learning_rate": 4.699839721231779e-06,
+ "loss": 0.5926,
+ "step": 2084
+ },
+ {
+ "epoch": 0.9858156028368794,
+ "grad_norm": 3.2717764377593994,
+ "learning_rate": 4.699543277263596e-06,
+ "loss": 0.6477,
+ "step": 2085
+ },
+ {
+ "epoch": 0.9862884160756501,
+ "grad_norm": 3.03729248046875,
+ "learning_rate": 4.699246696339497e-06,
+ "loss": 0.6786,
+ "step": 2086
+ },
+ {
+ "epoch": 0.9867612293144208,
+ "grad_norm": 2.852301597595215,
+ "learning_rate": 4.698949978477951e-06,
+ "loss": 0.6565,
+ "step": 2087
+ },
+ {
+ "epoch": 0.9872340425531915,
+ "grad_norm": 2.843485116958618,
+ "learning_rate": 4.698653123697431e-06,
+ "loss": 0.6627,
+ "step": 2088
+ },
+ {
+ "epoch": 0.9877068557919622,
+ "grad_norm": 2.6315064430236816,
+ "learning_rate": 4.698356132016423e-06,
+ "loss": 0.6577,
+ "step": 2089
+ },
+ {
+ "epoch": 0.9881796690307328,
+ "grad_norm": 2.7482151985168457,
+ "learning_rate": 4.698059003453417e-06,
+ "loss": 0.5514,
+ "step": 2090
+ },
+ {
+ "epoch": 0.9886524822695035,
+ "grad_norm": 2.826673746109009,
+ "learning_rate": 4.6977617380269145e-06,
+ "loss": 0.565,
+ "step": 2091
+ },
+ {
+ "epoch": 0.9891252955082742,
+ "grad_norm": 3.0273752212524414,
+ "learning_rate": 4.697464335755427e-06,
+ "loss": 0.6331,
+ "step": 2092
+ },
+ {
+ "epoch": 0.9895981087470449,
+ "grad_norm": 2.7551653385162354,
+ "learning_rate": 4.6971667966574695e-06,
+ "loss": 0.6486,
+ "step": 2093
+ },
+ {
+ "epoch": 0.9900709219858156,
+ "grad_norm": 2.656299114227295,
+ "learning_rate": 4.696869120751571e-06,
+ "loss": 0.6562,
+ "step": 2094
+ },
+ {
+ "epoch": 0.9905437352245863,
+ "grad_norm": 2.785322904586792,
+ "learning_rate": 4.696571308056265e-06,
+ "loss": 0.5892,
+ "step": 2095
+ },
+ {
+ "epoch": 0.991016548463357,
+ "grad_norm": 2.9334635734558105,
+ "learning_rate": 4.696273358590095e-06,
+ "loss": 0.6346,
+ "step": 2096
+ },
+ {
+ "epoch": 0.9914893617021276,
+ "grad_norm": 2.7944300174713135,
+ "learning_rate": 4.695975272371613e-06,
+ "loss": 0.5859,
+ "step": 2097
+ },
+ {
+ "epoch": 0.9919621749408983,
+ "grad_norm": 2.5416972637176514,
+ "learning_rate": 4.695677049419381e-06,
+ "loss": 0.5658,
+ "step": 2098
+ },
+ {
+ "epoch": 0.992434988179669,
+ "grad_norm": 2.4056856632232666,
+ "learning_rate": 4.695378689751966e-06,
+ "loss": 0.5121,
+ "step": 2099
+ },
+ {
+ "epoch": 0.9929078014184397,
+ "grad_norm": 2.614548683166504,
+ "learning_rate": 4.695080193387948e-06,
+ "loss": 0.5961,
+ "step": 2100
+ },
+ {
+ "epoch": 0.9933806146572104,
+ "grad_norm": 2.8966517448425293,
+ "learning_rate": 4.69478156034591e-06,
+ "loss": 0.5985,
+ "step": 2101
+ },
+ {
+ "epoch": 0.9938534278959811,
+ "grad_norm": 2.9514098167419434,
+ "learning_rate": 4.694482790644448e-06,
+ "loss": 0.5677,
+ "step": 2102
+ },
+ {
+ "epoch": 0.9943262411347518,
+ "grad_norm": 2.4326791763305664,
+ "learning_rate": 4.694183884302165e-06,
+ "loss": 0.5698,
+ "step": 2103
+ },
+ {
+ "epoch": 0.9947990543735225,
+ "grad_norm": 2.9242892265319824,
+ "learning_rate": 4.6938848413376735e-06,
+ "loss": 0.6245,
+ "step": 2104
+ },
+ {
+ "epoch": 0.9952718676122931,
+ "grad_norm": 2.9134104251861572,
+ "learning_rate": 4.693585661769593e-06,
+ "loss": 0.6164,
+ "step": 2105
+ },
+ {
+ "epoch": 0.9957446808510638,
+ "grad_norm": 2.472564458847046,
+ "learning_rate": 4.693286345616551e-06,
+ "loss": 0.5616,
+ "step": 2106
+ },
+ {
+ "epoch": 0.9962174940898345,
+ "grad_norm": 3.2456448078155518,
+ "learning_rate": 4.692986892897186e-06,
+ "loss": 0.6977,
+ "step": 2107
+ },
+ {
+ "epoch": 0.9966903073286052,
+ "grad_norm": 3.4032769203186035,
+ "learning_rate": 4.692687303630143e-06,
+ "loss": 0.643,
+ "step": 2108
+ },
+ {
+ "epoch": 0.9971631205673759,
+ "grad_norm": 2.722200870513916,
+ "learning_rate": 4.692387577834076e-06,
+ "loss": 0.5873,
+ "step": 2109
+ },
+ {
+ "epoch": 0.9976359338061466,
+ "grad_norm": 2.687532663345337,
+ "learning_rate": 4.692087715527648e-06,
+ "loss": 0.5423,
+ "step": 2110
+ },
+ {
+ "epoch": 0.9981087470449173,
+ "grad_norm": 2.578613042831421,
+ "learning_rate": 4.6917877167295305e-06,
+ "loss": 0.5689,
+ "step": 2111
+ },
+ {
+ "epoch": 0.9985815602836879,
+ "grad_norm": 3.1806094646453857,
+ "learning_rate": 4.691487581458402e-06,
+ "loss": 0.6133,
+ "step": 2112
+ },
+ {
+ "epoch": 0.9990543735224586,
+ "grad_norm": 2.4449520111083984,
+ "learning_rate": 4.691187309732952e-06,
+ "loss": 0.5841,
+ "step": 2113
+ },
+ {
+ "epoch": 0.9995271867612293,
+ "grad_norm": 2.908749580383301,
+ "learning_rate": 4.690886901571875e-06,
+ "loss": 0.534,
+ "step": 2114
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 4.019968032836914,
+ "learning_rate": 4.6905863569938785e-06,
+ "loss": 0.596,
+ "step": 2115
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 12690,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 6,
+ "save_steps": 2115,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.341936104473887e+18,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-2115/training_args.bin b/checkpoint-2115/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3885e6a27b204c834ff6fbc6f3a5a4c7f69fe2cc
--- /dev/null
+++ b/checkpoint-2115/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:51188d498a3fe1bf9ddf234271786a31e25e89b3aa68721a97f8264e1013c9b6
+size 8056
diff --git a/checkpoint-2115/zero_to_fp32.py b/checkpoint-2115/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8
--- /dev/null
+++ b/checkpoint-2115/zero_to_fp32.py
@@ -0,0 +1,604 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
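+#
+# For this repository's layout, an equivalent invocation from the repo root would likely be
+# (paths are assumptions; point it at whichever checkpoint directory you want to convert):
+#   python checkpoint-2115/zero_to_fp32.py checkpoint-2115 checkpoint-2115/pytorch_model.bin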
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# While this script doesn't use DeepSpeed to recover data, the checkpoints are pickled with
+# DeepSpeed data structures, so DeepSpeed has to be available in the current Python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+ buffers: dict
+ param_shapes: dict
+ shared_params: list
+ ds_version: int
+ frozen_param_shapes: dict
+ frozen_param_fragments: dict
+
+
+debug = 0
+
+# load to cpu
+device = torch.device('cpu')
+
+
+def atoi(text):
+ return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+ '''
+ alist.sort(key=natural_keys) sorts in human order
+ http://nedbatchelder.com/blog/200712/human_sorting.html
+ (See Toothy's implementation in the comments)
+ '''
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+ if not os.path.isdir(checkpoint_dir):
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+ # there should be only one file
+ if zero_stage <= 2:
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+ elif zero_stage == 3:
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+ if not os.path.exists(file):
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+ return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+ # XXX: need to test that this simple glob rule works for multi-node setup too
+ ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+ if len(ckpt_files) == 0:
+ raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+ return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+ zero_model_states = []
+ for file in files:
+ state_dict = torch.load(file, map_location=device)
+
+ if BUFFER_NAMES not in state_dict:
+ raise ValueError(f"{file} is not a model state checkpoint")
+ buffer_names = state_dict[BUFFER_NAMES]
+ if debug:
+ print("Found buffers:", buffer_names)
+
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
+ buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+ param_shapes = state_dict[PARAM_SHAPES]
+
+ # collect parameters that are included in param_shapes
+ param_names = []
+ for s in param_shapes:
+ for name in s.keys():
+ param_names.append(name)
+
+ # update with frozen parameters
+ frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+ if frozen_param_shapes is not None:
+ if debug:
+ print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+ param_names += list(frozen_param_shapes.keys())
+
+ # handle shared params
+ shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+ ds_version = state_dict.get(DS_VERSION, None)
+
+ frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+ z_model_state = zero_model_state(buffers=buffers,
+ param_shapes=param_shapes,
+ shared_params=shared_params,
+ ds_version=ds_version,
+ frozen_param_shapes=frozen_param_shapes,
+ frozen_param_fragments=frozen_param_fragments)
+ zero_model_states.append(z_model_state)
+
+ return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+
+ total_files = len(files)
+ state_dicts = []
+ for f in files:
+ state_dict = torch.load(f, map_location=device)
+ # immediately discard the two potentially huge optimizer states, as we only care about the fp32
+ # master weights, and also handle the case where they were already removed by another helper script
+ state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+ state_dicts.append(state_dict)
+
+ if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+ # For ZeRO-2, each param group can have a different partition_count, since data parallelism for
+ # expert parameters can differ from data parallelism for non-expert parameters. So we just use
+ # the max of the partition_counts to get the dp world_size.
+
+ if type(world_size) is list:
+ world_size = max(world_size)
+
+ if world_size != total_files:
+ raise ValueError(
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+ )
+
+ # the groups are named differently in each stage
+ if zero_stage <= 2:
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+ elif zero_stage == 3:
+ fp32_groups_key = FP32_FLAT_GROUPS
+ else:
+ raise ValueError(f"unknown zero stage {zero_stage}")
+
+ if zero_stage <= 2:
+ fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+ elif zero_stage == 3:
+ # if there is more than one param group, there will be multiple flattened tensors - one
+ # flattened tensor per group - for simplicity merge them into a single tensor
+ #
+ # XXX: could make the script more memory efficient for when there are multiple groups - it
+ # will require matching the sub-lists of param_shapes for each param group flattened tensor
+
+ fp32_flat_groups = [
+ torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+ ]
+
+ return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+ """
+ Returns fp32 state_dict reconstructed from ds checkpoint
+
+ Args:
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+ """
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+ optim_files = get_optim_files(ds_checkpoint_dir)
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+ print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+ model_files = get_model_state_files(ds_checkpoint_dir)
+
+ zero_model_states = parse_model_states(model_files)
+ print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+ if zero_stage <= 2:
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+ elif zero_stage == 3:
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+ return
+
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+ frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+ if debug:
+ num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+ print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+ wanted_params = len(frozen_param_shapes)
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+ avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+ print(f'Frozen params: Have {avail_numel} numels to process.')
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+ total_params = 0
+ total_numel = 0
+ for name, shape in frozen_param_shapes.items():
+ total_params += 1
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+
+ state_dict[name] = frozen_param_fragments[name]
+
+ if debug:
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+ attr = getattr(obj, fn, None)
+ return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+ param_shapes = zero_model_states[0].param_shapes
+
+ # Reconstruction protocol:
+ #
+ # XXX: document this
+
+ if debug:
+ for i in range(world_size):
+ for j in range(len(fp32_flat_groups[0])):
+ print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+ # XXX: memory usage doubles here (zero2)
+ num_param_groups = len(fp32_flat_groups[0])
+ merged_single_partition_of_fp32_groups = []
+ for i in range(num_param_groups):
+ merged_partitions = [sd[i] for sd in fp32_flat_groups]
+ full_single_fp32_vector = torch.cat(merged_partitions, 0)
+ merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+ avail_numel = sum(
+ [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+ if debug:
+ wanted_params = sum([len(shapes) for shapes in param_shapes])
+ wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+ # not asserting if there is a mismatch due to possible padding
+ print(f"Have {avail_numel} numels to process.")
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+ # params
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+ # out-of-core computing solution
+ total_numel = 0
+ total_params = 0
+ for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+ offset = 0
+ avail_numel = full_single_fp32_vector.numel()
+ for name, shape in shapes.items():
+
+ unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+ total_numel += unpartitioned_numel
+ total_params += 1
+
+ if debug:
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+ state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+ offset += unpartitioned_numel
+
+ # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+ # avail_numel can differ by anywhere between 0 and 2*world_size. Due to two unrelated complex
+ # paddings performed in the code, it's almost impossible to predict the exact numbers without the
+ # live optimizer object, so we check that the numbers are within the right range.
+ align_to = 2 * world_size
+
+ def zero2_align(x):
+ return align_to * math.ceil(x / align_to)
+
+ if debug:
+ print(f"original offset={offset}, avail_numel={avail_numel}")
+
+ offset = zero2_align(offset)
+ avail_numel = zero2_align(avail_numel)
+
+ if debug:
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
+
+ # Sanity check
+ if offset != avail_numel:
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+ print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters):
+ state_dict = OrderedDict()
+
+ # buffers
+ buffers = zero_model_states[0].buffers
+ state_dict.update(buffers)
+ if debug:
+ print(f"added {len(buffers)} buffers")
+
+ if not exclude_frozen_parameters:
+ _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+ _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+ # recover shared parameters
+ for pair in zero_model_states[0].shared_params:
+ if pair[1] in state_dict:
+ state_dict[pair[0]] = state_dict[pair[1]]
+
+ return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+ remainder = unpartitioned_numel % world_size
+ padding_numel = (world_size - remainder) if remainder else 0
+ partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+ return partitioned_numel, padding_numel
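+
+# A worked example (not part of the original script) of the arithmetic above: with
+# unpartitioned_numel=10 and world_size=4, remainder = 10 % 4 = 2, so padding_numel = 4 - 2 = 2
+# and partitioned_numel = ceil(10 / 4) = 3, i.e. each of the 4 ranks holds 3 elements,
+# 12 in total, 2 of which are padding.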
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+ return
+
+ if debug:
+ for i in range(world_size):
+ num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+ print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+ wanted_params = len(frozen_param_shapes)
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+ avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+ print(f'Frozen params: Have {avail_numel} numels to process.')
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+ total_params = 0
+ total_numel = 0
+ for name, shape in zero_model_states[0].frozen_param_shapes.items():
+ total_params += 1
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+
+ param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+ state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+ if debug:
+ print(
+ f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+ )
+
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+ param_shapes = zero_model_states[0].param_shapes
+ avail_numel = fp32_flat_groups[0].numel() * world_size
+ # Reconstruction protocol: for ZeRO-3 we need to zip the partitions together at the boundary of
+ # each param, re-consolidating each param while dealing with any padding
+
+ # merge list of dicts, preserving order
+ param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+ if debug:
+ for i in range(world_size):
+ print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+ wanted_params = len(param_shapes)
+ wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+ # not asserting if there is a mismatch due to possible padding
+ avail_numel = fp32_flat_groups[0].numel() * world_size
+ print(f"Trainable params: Have {avail_numel} numels to process.")
+ print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+ # params
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+ # out-of-core computing solution
+ offset = 0
+ total_numel = 0
+ total_params = 0
+ for name, shape in param_shapes.items():
+
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+ total_params += 1
+
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+ if debug:
+ print(
+ f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+ )
+
+ # XXX: memory usage doubles here
+ state_dict[name] = torch.cat(
+ tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+ 0).narrow(0, 0, unpartitioned_numel).view(shape)
+ offset += partitioned_numel
+
+ offset *= world_size
+
+ # Sanity check
+ if offset != avail_numel:
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+ print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters):
+ state_dict = OrderedDict()
+
+ # buffers
+ buffers = zero_model_states[0].buffers
+ state_dict.update(buffers)
+ if debug:
+ print(f"added {len(buffers)} buffers")
+
+ if not exclude_frozen_parameters:
+ _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+ _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+ # recover shared parameters
+ for pair in zero_model_states[0].shared_params:
+ if pair[1] in state_dict:
+ state_dict[pair[0]] = state_dict[pair[1]]
+
+ return state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
+ """
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+ ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+ via a model hub.
+
+ Args:
+ - ``checkpoint_dir``: path to the desired checkpoint folder
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+ - ``exclude_frozen_parameters``: exclude frozen parameters
+
+ Returns:
+ - pytorch ``state_dict``
+
+ Note: this approach may not work if your application doesn't have sufficient free CPU memory, in
+ which case you may need to use the offline approach via the ``zero_to_fp32.py`` script saved with
+ the checkpoint.
+
+ A typical usage might be ::
+
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+ # do the training and checkpoint saving
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+ model = model.cpu() # move to cpu
+ model.load_state_dict(state_dict)
+ # submit to model hub or save the model to share with others
+
+ In this example the ``model`` will no longer be usable in the deepspeed context of the same
+ application, i.e. you will need to re-initialize the deepspeed engine, since
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+ If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+ """
+ if tag is None:
+ latest_path = os.path.join(checkpoint_dir, 'latest')
+ if os.path.isfile(latest_path):
+ with open(latest_path, 'r') as fd:
+ tag = fd.read().strip()
+ else:
+ raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+ ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+ if not os.path.isdir(ds_checkpoint_dir):
+ raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+ return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
+ """
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+ loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+ Args:
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+ - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+ - ``exclude_frozen_parameters``: exclude frozen parameters
+ """
+
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
+ print(f"Saving fp32 state dict to {output_file}")
+ torch.save(state_dict, output_file)
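+
+# An illustrative call for this repository's layout (not part of the original script; adjust the
+# checkpoint directory and output path to the checkpoint you actually want to convert):
+#   convert_zero_checkpoint_to_fp32_state_dict("checkpoint-2115", "checkpoint-2115/pytorch_model.bin")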
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+ """
+ 1. Put the provided model to cpu
+ 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+ 3. Load it into the provided model
+
+ Args:
+ - ``model``: the model object to update
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+ Returns:
+ - ``model``: the modified model
+
+ Make sure you have plenty of CPU memory available before you call this function. If you don't
+ have enough, use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+ conveniently placed for you in the checkpoint folder.
+
+ A typical usage might be ::
+
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+ model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+ # submit to model hub or save the model to share with others
+
+ Note that once this has been run, the ``model`` will no longer be usable in the deepspeed context
+ of the same application, i.e. you will need to re-initialize the deepspeed engine, since
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+ """
+ logger.info("Extracting fp32 weights")
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+ logger.info("Overwriting model with fp32 weights")
+ model = model.cpu()
+ model.load_state_dict(state_dict, strict=False)
+
+ return model
+
+
+if __name__ == "__main__":
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("checkpoint_dir",
+ type=str,
+ help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+ parser.add_argument(
+ "output_file",
+ type=str,
+ help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+ parser.add_argument("-t",
+ "--tag",
+ type=str,
+ default=None,
+ help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+ parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+ parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+ args = parser.parse_args()
+
+ debug = args.debug
+
+ convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+ args.output_file,
+ tag=args.tag,
+ exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/checkpoint-4230/README.md b/checkpoint-4230/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..049d467664ca6172b7ffbe6ba60b3eac7479cac4
--- /dev/null
+++ b/checkpoint-4230/README.md
@@ -0,0 +1,202 @@
+---
+base_model: meta-llama/Llama-3.1-8B
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
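+
+As a minimal sketch (the adapter location below is a placeholder; substitute the local directory or
+hub repo id that actually holds this checkpoint), the LoRA adapter described by
+`adapter_config.json` can be attached to the `meta-llama/Llama-3.1-8B` base model with
+`transformers` and `peft`:
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from peft import PeftModel
+
+adapter_id = "path/to/checkpoint-4230"  # placeholder: local dir or hub repo id holding this adapter
+
+# load the base model the adapter was trained on
+base = AutoModelForCausalLM.from_pretrained(
+    "meta-llama/Llama-3.1-8B",
+    torch_dtype="auto",
+    device_map="auto",
+)
+
+# attach the LoRA weights (and the saved embed_tokens / lm_head modules)
+model = PeftModel.from_pretrained(base, adapter_id)
+tokenizer = AutoTokenizer.from_pretrained(adapter_id)
+
+# illustrative generation call
+inputs = tokenizer("Hello", return_tensors="pt").to(model.device)
+output = model.generate(**inputs, max_new_tokens=64)
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```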
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.14.0
\ No newline at end of file
diff --git a/checkpoint-4230/adapter_config.json b/checkpoint-4230/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..38890ca21f7e1854f7350049a9113a9eec8a443a
--- /dev/null
+++ b/checkpoint-4230/adapter_config.json
@@ -0,0 +1,40 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "meta-llama/Llama-3.1-8B",
+ "bias": "none",
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 512,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": [
+ "embed_tokens",
+ "lm_head"
+ ],
+ "peft_type": "LORA",
+ "r": 256,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "v_proj",
+ "q_proj",
+ "gate_proj",
+ "k_proj",
+ "down_proj",
+ "up_proj",
+ "o_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-4230/adapter_model.safetensors b/checkpoint-4230/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..420d07ca284a2570953e1bd2322de57b2b701082
--- /dev/null
+++ b/checkpoint-4230/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e32d7e7fb1567458c31490ab395d2f18a8f2a2690e95b59b9e604be0d81e8bc
+size 3443586272
diff --git a/checkpoint-4230/global_step4230/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-4230/global_step4230/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..f48fbb730ff80865dd6aded2e67d5ce04cf2d15a
--- /dev/null
+++ b/checkpoint-4230/global_step4230/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0a5a654f580570a7d079875df3d7a10ca68d7313b922e4c6d48890d70c339aa2
+size 20661195036
diff --git a/checkpoint-4230/global_step4230/mp_rank_00_model_states.pt b/checkpoint-4230/global_step4230/mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..6df6bf0bc1f5de93b6204a0cbdf465d83d2245e4
--- /dev/null
+++ b/checkpoint-4230/global_step4230/mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f90d26a9e90140c4f5a4396c1ef868c0e735530fe319c146a64e9e907214f2d
+size 3555326777
diff --git a/checkpoint-4230/latest b/checkpoint-4230/latest
new file mode 100644
index 0000000000000000000000000000000000000000..f2e9527786b2225f95d976a4280b0a1cffe4254a
--- /dev/null
+++ b/checkpoint-4230/latest
@@ -0,0 +1 @@
+global_step4230
\ No newline at end of file
diff --git a/checkpoint-4230/rng_state.pth b/checkpoint-4230/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..bb90bc95800d8247bb004ebf48ee2f7d06c22c57
--- /dev/null
+++ b/checkpoint-4230/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a422aae9cec4d59c271c92acda4a907bc97c3691c619213443cf5eb6a7b483f5
+size 14244
diff --git a/checkpoint-4230/scheduler.pt b/checkpoint-4230/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..92005bd02a40f30b4b2615a6f0e6bb7f48ac4ce5
--- /dev/null
+++ b/checkpoint-4230/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:37808eff2730899ea378e6807a984be98b958d7ff33bf2192d4b87ce8d6f1fb2
+size 1064
diff --git a/checkpoint-4230/special_tokens_map.json b/checkpoint-4230/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..e5b39b6305d89284b04934011c68dbb26bf588ca
--- /dev/null
+++ b/checkpoint-4230/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-4230/tokenizer.json b/checkpoint-4230/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2
--- /dev/null
+++ b/checkpoint-4230/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
diff --git a/checkpoint-4230/tokenizer_config.json b/checkpoint-4230/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..81dd14db6632ad5b35b9d447732e37ac074873a5
--- /dev/null
+++ b/checkpoint-4230/tokenizer_config.json
@@ -0,0 +1,2063 @@
+{
+ "added_tokens_decoder": {
+ "128000": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128001": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128002": {
+ "content": "<|reserved_special_token_0|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128003": {
+ "content": "<|reserved_special_token_1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128004": {
+ "content": "<|finetune_right_pad_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128005": {
+ "content": "<|reserved_special_token_2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128006": {
+ "content": "<|start_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128007": {
+ "content": "<|end_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128008": {
+ "content": "<|eom_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128009": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128010": {
+ "content": "<|python_tag|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128011": {
+ "content": "<|reserved_special_token_3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128012": {
+ "content": "<|reserved_special_token_4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128013": {
+ "content": "<|reserved_special_token_5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128014": {
+ "content": "<|reserved_special_token_6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128015": {
+ "content": "<|reserved_special_token_7|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128016": {
+ "content": "<|reserved_special_token_8|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128017": {
+ "content": "<|reserved_special_token_9|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128018": {
+ "content": "<|reserved_special_token_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128019": {
+ "content": "<|reserved_special_token_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128020": {
+ "content": "<|reserved_special_token_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128021": {
+ "content": "<|reserved_special_token_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128022": {
+ "content": "<|reserved_special_token_14|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128023": {
+ "content": "<|reserved_special_token_15|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128024": {
+ "content": "<|reserved_special_token_16|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128025": {
+ "content": "<|reserved_special_token_17|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128026": {
+ "content": "<|reserved_special_token_18|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128027": {
+ "content": "<|reserved_special_token_19|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128028": {
+ "content": "<|reserved_special_token_20|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128029": {
+ "content": "<|reserved_special_token_21|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128030": {
+ "content": "<|reserved_special_token_22|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128031": {
+ "content": "<|reserved_special_token_23|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128032": {
+ "content": "<|reserved_special_token_24|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128033": {
+ "content": "<|reserved_special_token_25|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128034": {
+ "content": "<|reserved_special_token_26|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128035": {
+ "content": "<|reserved_special_token_27|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128036": {
+ "content": "<|reserved_special_token_28|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128037": {
+ "content": "<|reserved_special_token_29|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128038": {
+ "content": "<|reserved_special_token_30|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128039": {
+ "content": "<|reserved_special_token_31|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128040": {
+ "content": "<|reserved_special_token_32|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128041": {
+ "content": "<|reserved_special_token_33|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128042": {
+ "content": "<|reserved_special_token_34|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128043": {
+ "content": "<|reserved_special_token_35|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128044": {
+ "content": "<|reserved_special_token_36|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128045": {
+ "content": "<|reserved_special_token_37|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128046": {
+ "content": "<|reserved_special_token_38|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128047": {
+ "content": "<|reserved_special_token_39|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128048": {
+ "content": "<|reserved_special_token_40|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128049": {
+ "content": "<|reserved_special_token_41|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128050": {
+ "content": "<|reserved_special_token_42|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128051": {
+ "content": "<|reserved_special_token_43|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128052": {
+ "content": "<|reserved_special_token_44|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128053": {
+ "content": "<|reserved_special_token_45|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128054": {
+ "content": "<|reserved_special_token_46|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128055": {
+ "content": "<|reserved_special_token_47|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128056": {
+ "content": "<|reserved_special_token_48|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128057": {
+ "content": "<|reserved_special_token_49|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128058": {
+ "content": "<|reserved_special_token_50|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128059": {
+ "content": "<|reserved_special_token_51|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128060": {
+ "content": "<|reserved_special_token_52|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128061": {
+ "content": "<|reserved_special_token_53|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128062": {
+ "content": "<|reserved_special_token_54|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128063": {
+ "content": "<|reserved_special_token_55|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128064": {
+ "content": "<|reserved_special_token_56|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128065": {
+ "content": "<|reserved_special_token_57|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128066": {
+ "content": "<|reserved_special_token_58|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128067": {
+ "content": "<|reserved_special_token_59|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128068": {
+ "content": "<|reserved_special_token_60|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128069": {
+ "content": "<|reserved_special_token_61|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128070": {
+ "content": "<|reserved_special_token_62|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128071": {
+ "content": "<|reserved_special_token_63|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128072": {
+ "content": "<|reserved_special_token_64|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128073": {
+ "content": "<|reserved_special_token_65|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128074": {
+ "content": "<|reserved_special_token_66|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128075": {
+ "content": "<|reserved_special_token_67|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128076": {
+ "content": "<|reserved_special_token_68|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128077": {
+ "content": "<|reserved_special_token_69|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128078": {
+ "content": "<|reserved_special_token_70|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128079": {
+ "content": "<|reserved_special_token_71|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128080": {
+ "content": "<|reserved_special_token_72|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128081": {
+ "content": "<|reserved_special_token_73|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128082": {
+ "content": "<|reserved_special_token_74|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128083": {
+ "content": "<|reserved_special_token_75|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128084": {
+ "content": "<|reserved_special_token_76|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128085": {
+ "content": "<|reserved_special_token_77|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128086": {
+ "content": "<|reserved_special_token_78|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128087": {
+ "content": "<|reserved_special_token_79|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128088": {
+ "content": "<|reserved_special_token_80|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128089": {
+ "content": "<|reserved_special_token_81|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128090": {
+ "content": "<|reserved_special_token_82|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128091": {
+ "content": "<|reserved_special_token_83|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128092": {
+ "content": "<|reserved_special_token_84|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128093": {
+ "content": "<|reserved_special_token_85|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128094": {
+ "content": "<|reserved_special_token_86|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128095": {
+ "content": "<|reserved_special_token_87|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128096": {
+ "content": "<|reserved_special_token_88|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128097": {
+ "content": "<|reserved_special_token_89|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128098": {
+ "content": "<|reserved_special_token_90|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128099": {
+ "content": "<|reserved_special_token_91|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128100": {
+ "content": "<|reserved_special_token_92|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128101": {
+ "content": "<|reserved_special_token_93|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128102": {
+ "content": "<|reserved_special_token_94|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128103": {
+ "content": "<|reserved_special_token_95|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128104": {
+ "content": "<|reserved_special_token_96|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128105": {
+ "content": "<|reserved_special_token_97|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128106": {
+ "content": "<|reserved_special_token_98|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128107": {
+ "content": "<|reserved_special_token_99|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128108": {
+ "content": "<|reserved_special_token_100|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128109": {
+ "content": "<|reserved_special_token_101|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128110": {
+ "content": "<|reserved_special_token_102|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128111": {
+ "content": "<|reserved_special_token_103|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128112": {
+ "content": "<|reserved_special_token_104|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128113": {
+ "content": "<|reserved_special_token_105|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128114": {
+ "content": "<|reserved_special_token_106|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128115": {
+ "content": "<|reserved_special_token_107|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128116": {
+ "content": "<|reserved_special_token_108|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128117": {
+ "content": "<|reserved_special_token_109|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128118": {
+ "content": "<|reserved_special_token_110|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128119": {
+ "content": "<|reserved_special_token_111|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128120": {
+ "content": "<|reserved_special_token_112|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128121": {
+ "content": "<|reserved_special_token_113|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128122": {
+ "content": "<|reserved_special_token_114|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128123": {
+ "content": "<|reserved_special_token_115|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128124": {
+ "content": "<|reserved_special_token_116|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128125": {
+ "content": "<|reserved_special_token_117|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128126": {
+ "content": "<|reserved_special_token_118|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128127": {
+ "content": "<|reserved_special_token_119|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128128": {
+ "content": "<|reserved_special_token_120|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128129": {
+ "content": "<|reserved_special_token_121|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128130": {
+ "content": "<|reserved_special_token_122|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128131": {
+ "content": "<|reserved_special_token_123|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128132": {
+ "content": "<|reserved_special_token_124|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128133": {
+ "content": "<|reserved_special_token_125|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128134": {
+ "content": "<|reserved_special_token_126|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128135": {
+ "content": "<|reserved_special_token_127|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128136": {
+ "content": "<|reserved_special_token_128|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128137": {
+ "content": "<|reserved_special_token_129|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128138": {
+ "content": "<|reserved_special_token_130|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128139": {
+ "content": "<|reserved_special_token_131|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128140": {
+ "content": "<|reserved_special_token_132|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128141": {
+ "content": "<|reserved_special_token_133|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128142": {
+ "content": "<|reserved_special_token_134|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128143": {
+ "content": "<|reserved_special_token_135|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128144": {
+ "content": "<|reserved_special_token_136|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128145": {
+ "content": "<|reserved_special_token_137|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128146": {
+ "content": "<|reserved_special_token_138|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128147": {
+ "content": "<|reserved_special_token_139|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128148": {
+ "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128149": {
+ "content": "<|reserved_special_token_141|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128150": {
+ "content": "<|reserved_special_token_142|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128151": {
+ "content": "<|reserved_special_token_143|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128152": {
+ "content": "<|reserved_special_token_144|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128153": {
+ "content": "<|reserved_special_token_145|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128154": {
+ "content": "<|reserved_special_token_146|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128155": {
+ "content": "<|reserved_special_token_147|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128156": {
+ "content": "<|reserved_special_token_148|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128157": {
+ "content": "<|reserved_special_token_149|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128158": {
+ "content": "<|reserved_special_token_150|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128159": {
+ "content": "<|reserved_special_token_151|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128160": {
+ "content": "<|reserved_special_token_152|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128161": {
+ "content": "<|reserved_special_token_153|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128162": {
+ "content": "<|reserved_special_token_154|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128163": {
+ "content": "<|reserved_special_token_155|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128164": {
+ "content": "<|reserved_special_token_156|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128165": {
+ "content": "<|reserved_special_token_157|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128166": {
+ "content": "<|reserved_special_token_158|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128167": {
+ "content": "<|reserved_special_token_159|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128168": {
+ "content": "<|reserved_special_token_160|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128169": {
+ "content": "<|reserved_special_token_161|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128170": {
+ "content": "<|reserved_special_token_162|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128171": {
+ "content": "<|reserved_special_token_163|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128172": {
+ "content": "<|reserved_special_token_164|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128173": {
+ "content": "<|reserved_special_token_165|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128174": {
+ "content": "<|reserved_special_token_166|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128175": {
+ "content": "<|reserved_special_token_167|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128176": {
+ "content": "<|reserved_special_token_168|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128177": {
+ "content": "<|reserved_special_token_169|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128178": {
+ "content": "<|reserved_special_token_170|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128179": {
+ "content": "<|reserved_special_token_171|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128180": {
+ "content": "<|reserved_special_token_172|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128181": {
+ "content": "<|reserved_special_token_173|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128182": {
+ "content": "<|reserved_special_token_174|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128183": {
+ "content": "<|reserved_special_token_175|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128184": {
+ "content": "<|reserved_special_token_176|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128185": {
+ "content": "<|reserved_special_token_177|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128186": {
+ "content": "<|reserved_special_token_178|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128187": {
+ "content": "<|reserved_special_token_179|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128188": {
+ "content": "<|reserved_special_token_180|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128189": {
+ "content": "<|reserved_special_token_181|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128190": {
+ "content": "<|reserved_special_token_182|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128191": {
+ "content": "<|reserved_special_token_183|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128192": {
+ "content": "<|reserved_special_token_184|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128193": {
+ "content": "<|reserved_special_token_185|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128194": {
+ "content": "<|reserved_special_token_186|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128195": {
+ "content": "<|reserved_special_token_187|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128196": {
+ "content": "<|reserved_special_token_188|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128197": {
+ "content": "<|reserved_special_token_189|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128198": {
+ "content": "<|reserved_special_token_190|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128199": {
+ "content": "<|reserved_special_token_191|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128200": {
+ "content": "<|reserved_special_token_192|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128201": {
+ "content": "<|reserved_special_token_193|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128202": {
+ "content": "<|reserved_special_token_194|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128203": {
+ "content": "<|reserved_special_token_195|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128204": {
+ "content": "<|reserved_special_token_196|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128205": {
+ "content": "<|reserved_special_token_197|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128206": {
+ "content": "<|reserved_special_token_198|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128207": {
+ "content": "<|reserved_special_token_199|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128208": {
+ "content": "<|reserved_special_token_200|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128209": {
+ "content": "<|reserved_special_token_201|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128210": {
+ "content": "<|reserved_special_token_202|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128211": {
+ "content": "<|reserved_special_token_203|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128212": {
+ "content": "<|reserved_special_token_204|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128213": {
+ "content": "<|reserved_special_token_205|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128214": {
+ "content": "<|reserved_special_token_206|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128215": {
+ "content": "<|reserved_special_token_207|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128216": {
+ "content": "<|reserved_special_token_208|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128217": {
+ "content": "<|reserved_special_token_209|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128218": {
+ "content": "<|reserved_special_token_210|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128219": {
+ "content": "<|reserved_special_token_211|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128220": {
+ "content": "<|reserved_special_token_212|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128221": {
+ "content": "<|reserved_special_token_213|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128222": {
+ "content": "<|reserved_special_token_214|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128223": {
+ "content": "<|reserved_special_token_215|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128224": {
+ "content": "<|reserved_special_token_216|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128225": {
+ "content": "<|reserved_special_token_217|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128226": {
+ "content": "<|reserved_special_token_218|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128227": {
+ "content": "<|reserved_special_token_219|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128228": {
+ "content": "<|reserved_special_token_220|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128229": {
+ "content": "<|reserved_special_token_221|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128230": {
+ "content": "<|reserved_special_token_222|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128231": {
+ "content": "<|reserved_special_token_223|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128232": {
+ "content": "<|reserved_special_token_224|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128233": {
+ "content": "<|reserved_special_token_225|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128234": {
+ "content": "<|reserved_special_token_226|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128235": {
+ "content": "<|reserved_special_token_227|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128236": {
+ "content": "<|reserved_special_token_228|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128237": {
+ "content": "<|reserved_special_token_229|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128238": {
+ "content": "<|reserved_special_token_230|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128239": {
+ "content": "<|reserved_special_token_231|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128240": {
+ "content": "<|reserved_special_token_232|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128241": {
+ "content": "<|reserved_special_token_233|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128242": {
+ "content": "<|reserved_special_token_234|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128243": {
+ "content": "<|reserved_special_token_235|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128244": {
+ "content": "<|reserved_special_token_236|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128245": {
+ "content": "<|reserved_special_token_237|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128246": {
+ "content": "<|reserved_special_token_238|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128247": {
+ "content": "<|reserved_special_token_239|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128248": {
+ "content": "<|reserved_special_token_240|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128249": {
+ "content": "<|reserved_special_token_241|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128250": {
+ "content": "<|reserved_special_token_242|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128251": {
+ "content": "<|reserved_special_token_243|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128252": {
+ "content": "<|reserved_special_token_244|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128253": {
+ "content": "<|reserved_special_token_245|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128254": {
+ "content": "<|reserved_special_token_246|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128255": {
+ "content": "<|reserved_special_token_247|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|begin_of_text|>",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|end_of_text|>",
+ "extra_special_tokens": {},
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 131072,
+ "pad_token": "<|end_of_text|>",
+ "tokenizer_class": "PreTrainedTokenizer"
+}
diff --git a/checkpoint-4230/trainer_state.json b/checkpoint-4230/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..378b191d53a48520b41e4a2d095e1445ea939ca6
--- /dev/null
+++ b/checkpoint-4230/trainer_state.json
@@ -0,0 +1,29643 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 2.0,
+ "eval_steps": 500,
+ "global_step": 4230,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.00047281323877068556,
+ "grad_norm": 5.163570880889893,
+ "learning_rate": 5.0000000000000004e-08,
+ "loss": 1.4628,
+ "step": 1
+ },
+ {
+ "epoch": 0.0009456264775413711,
+ "grad_norm": 6.298020839691162,
+ "learning_rate": 1.0000000000000001e-07,
+ "loss": 1.5003,
+ "step": 2
+ },
+ {
+ "epoch": 0.0014184397163120568,
+ "grad_norm": 5.853623390197754,
+ "learning_rate": 1.5000000000000002e-07,
+ "loss": 1.4495,
+ "step": 3
+ },
+ {
+ "epoch": 0.0018912529550827422,
+ "grad_norm": 5.456025123596191,
+ "learning_rate": 2.0000000000000002e-07,
+ "loss": 1.3798,
+ "step": 4
+ },
+ {
+ "epoch": 0.002364066193853428,
+ "grad_norm": 5.757407188415527,
+ "learning_rate": 2.5000000000000004e-07,
+ "loss": 1.4515,
+ "step": 5
+ },
+ {
+ "epoch": 0.0028368794326241137,
+ "grad_norm": 5.872277736663818,
+ "learning_rate": 3.0000000000000004e-07,
+ "loss": 1.4424,
+ "step": 6
+ },
+ {
+ "epoch": 0.003309692671394799,
+ "grad_norm": 6.7816009521484375,
+ "learning_rate": 3.5000000000000004e-07,
+ "loss": 1.4004,
+ "step": 7
+ },
+ {
+ "epoch": 0.0037825059101654845,
+ "grad_norm": 6.229667663574219,
+ "learning_rate": 4.0000000000000003e-07,
+ "loss": 1.4494,
+ "step": 8
+ },
+ {
+ "epoch": 0.00425531914893617,
+ "grad_norm": 5.336202621459961,
+ "learning_rate": 4.5000000000000003e-07,
+ "loss": 1.3916,
+ "step": 9
+ },
+ {
+ "epoch": 0.004728132387706856,
+ "grad_norm": 5.589445114135742,
+ "learning_rate": 5.000000000000001e-07,
+ "loss": 1.2318,
+ "step": 10
+ },
+ {
+ "epoch": 0.005200945626477541,
+ "grad_norm": 5.720539569854736,
+ "learning_rate": 5.5e-07,
+ "loss": 1.4367,
+ "step": 11
+ },
+ {
+ "epoch": 0.005673758865248227,
+ "grad_norm": 5.913913726806641,
+ "learning_rate": 6.000000000000001e-07,
+ "loss": 1.342,
+ "step": 12
+ },
+ {
+ "epoch": 0.006146572104018913,
+ "grad_norm": 5.899744987487793,
+ "learning_rate": 6.5e-07,
+ "loss": 1.4307,
+ "step": 13
+ },
+ {
+ "epoch": 0.006619385342789598,
+ "grad_norm": 5.571037292480469,
+ "learning_rate": 7.000000000000001e-07,
+ "loss": 1.3372,
+ "step": 14
+ },
+ {
+ "epoch": 0.0070921985815602835,
+ "grad_norm": 5.480010509490967,
+ "learning_rate": 7.5e-07,
+ "loss": 1.3923,
+ "step": 15
+ },
+ {
+ "epoch": 0.007565011820330969,
+ "grad_norm": 5.254702091217041,
+ "learning_rate": 8.000000000000001e-07,
+ "loss": 1.2928,
+ "step": 16
+ },
+ {
+ "epoch": 0.008037825059101654,
+ "grad_norm": 6.090312480926514,
+ "learning_rate": 8.500000000000001e-07,
+ "loss": 1.4984,
+ "step": 17
+ },
+ {
+ "epoch": 0.00851063829787234,
+ "grad_norm": 5.689319610595703,
+ "learning_rate": 9.000000000000001e-07,
+ "loss": 1.4108,
+ "step": 18
+ },
+ {
+ "epoch": 0.008983451536643027,
+ "grad_norm": 5.386685848236084,
+ "learning_rate": 9.500000000000001e-07,
+ "loss": 1.425,
+ "step": 19
+ },
+ {
+ "epoch": 0.009456264775413711,
+ "grad_norm": 6.451584815979004,
+ "learning_rate": 1.0000000000000002e-06,
+ "loss": 1.5507,
+ "step": 20
+ },
+ {
+ "epoch": 0.009929078014184398,
+ "grad_norm": 5.37647008895874,
+ "learning_rate": 1.0500000000000001e-06,
+ "loss": 1.4109,
+ "step": 21
+ },
+ {
+ "epoch": 0.010401891252955082,
+ "grad_norm": 4.716553211212158,
+ "learning_rate": 1.1e-06,
+ "loss": 1.2028,
+ "step": 22
+ },
+ {
+ "epoch": 0.010874704491725768,
+ "grad_norm": 4.950989723205566,
+ "learning_rate": 1.1500000000000002e-06,
+ "loss": 1.3043,
+ "step": 23
+ },
+ {
+ "epoch": 0.011347517730496455,
+ "grad_norm": 4.688975811004639,
+ "learning_rate": 1.2000000000000002e-06,
+ "loss": 1.2708,
+ "step": 24
+ },
+ {
+ "epoch": 0.01182033096926714,
+ "grad_norm": 4.905868053436279,
+ "learning_rate": 1.25e-06,
+ "loss": 1.3268,
+ "step": 25
+ },
+ {
+ "epoch": 0.012293144208037825,
+ "grad_norm": 4.503395080566406,
+ "learning_rate": 1.3e-06,
+ "loss": 1.1799,
+ "step": 26
+ },
+ {
+ "epoch": 0.01276595744680851,
+ "grad_norm": 4.77382230758667,
+ "learning_rate": 1.3500000000000002e-06,
+ "loss": 1.3882,
+ "step": 27
+ },
+ {
+ "epoch": 0.013238770685579196,
+ "grad_norm": 4.734329700469971,
+ "learning_rate": 1.4000000000000001e-06,
+ "loss": 1.3476,
+ "step": 28
+ },
+ {
+ "epoch": 0.013711583924349883,
+ "grad_norm": 4.775066375732422,
+ "learning_rate": 1.45e-06,
+ "loss": 1.2429,
+ "step": 29
+ },
+ {
+ "epoch": 0.014184397163120567,
+ "grad_norm": 4.978334426879883,
+ "learning_rate": 1.5e-06,
+ "loss": 1.2119,
+ "step": 30
+ },
+ {
+ "epoch": 0.014657210401891253,
+ "grad_norm": 4.506785869598389,
+ "learning_rate": 1.5500000000000002e-06,
+ "loss": 1.3157,
+ "step": 31
+ },
+ {
+ "epoch": 0.015130023640661938,
+ "grad_norm": 4.007757186889648,
+ "learning_rate": 1.6000000000000001e-06,
+ "loss": 1.1451,
+ "step": 32
+ },
+ {
+ "epoch": 0.015602836879432624,
+ "grad_norm": 3.6621618270874023,
+ "learning_rate": 1.6500000000000003e-06,
+ "loss": 1.093,
+ "step": 33
+ },
+ {
+ "epoch": 0.01607565011820331,
+ "grad_norm": 3.8733766078948975,
+ "learning_rate": 1.7000000000000002e-06,
+ "loss": 1.2289,
+ "step": 34
+ },
+ {
+ "epoch": 0.016548463356973995,
+ "grad_norm": 4.3391900062561035,
+ "learning_rate": 1.75e-06,
+ "loss": 1.1453,
+ "step": 35
+ },
+ {
+ "epoch": 0.01702127659574468,
+ "grad_norm": 3.287623643875122,
+ "learning_rate": 1.8000000000000001e-06,
+ "loss": 1.0257,
+ "step": 36
+ },
+ {
+ "epoch": 0.017494089834515367,
+ "grad_norm": 3.591721773147583,
+ "learning_rate": 1.85e-06,
+ "loss": 0.9976,
+ "step": 37
+ },
+ {
+ "epoch": 0.017966903073286054,
+ "grad_norm": 4.028271675109863,
+ "learning_rate": 1.9000000000000002e-06,
+ "loss": 1.0773,
+ "step": 38
+ },
+ {
+ "epoch": 0.018439716312056736,
+ "grad_norm": 3.3543951511383057,
+ "learning_rate": 1.9500000000000004e-06,
+ "loss": 1.1677,
+ "step": 39
+ },
+ {
+ "epoch": 0.018912529550827423,
+ "grad_norm": 3.807624340057373,
+ "learning_rate": 2.0000000000000003e-06,
+ "loss": 1.1232,
+ "step": 40
+ },
+ {
+ "epoch": 0.01938534278959811,
+ "grad_norm": 4.242797374725342,
+ "learning_rate": 2.05e-06,
+ "loss": 1.1819,
+ "step": 41
+ },
+ {
+ "epoch": 0.019858156028368795,
+ "grad_norm": 3.4574992656707764,
+ "learning_rate": 2.1000000000000002e-06,
+ "loss": 0.9878,
+ "step": 42
+ },
+ {
+ "epoch": 0.02033096926713948,
+ "grad_norm": 3.906695604324341,
+ "learning_rate": 2.15e-06,
+ "loss": 1.0592,
+ "step": 43
+ },
+ {
+ "epoch": 0.020803782505910164,
+ "grad_norm": 3.7543163299560547,
+ "learning_rate": 2.2e-06,
+ "loss": 1.0309,
+ "step": 44
+ },
+ {
+ "epoch": 0.02127659574468085,
+ "grad_norm": 3.3777148723602295,
+ "learning_rate": 2.25e-06,
+ "loss": 1.0664,
+ "step": 45
+ },
+ {
+ "epoch": 0.021749408983451537,
+ "grad_norm": 3.6003634929656982,
+ "learning_rate": 2.3000000000000004e-06,
+ "loss": 1.0482,
+ "step": 46
+ },
+ {
+ "epoch": 0.022222222222222223,
+ "grad_norm": 3.3961377143859863,
+ "learning_rate": 2.35e-06,
+ "loss": 1.0252,
+ "step": 47
+ },
+ {
+ "epoch": 0.02269503546099291,
+ "grad_norm": 3.1601035594940186,
+ "learning_rate": 2.4000000000000003e-06,
+ "loss": 1.0435,
+ "step": 48
+ },
+ {
+ "epoch": 0.023167848699763592,
+ "grad_norm": 3.4192967414855957,
+ "learning_rate": 2.4500000000000003e-06,
+ "loss": 1.0935,
+ "step": 49
+ },
+ {
+ "epoch": 0.02364066193853428,
+ "grad_norm": 3.1225922107696533,
+ "learning_rate": 2.5e-06,
+ "loss": 0.8988,
+ "step": 50
+ },
+ {
+ "epoch": 0.024113475177304965,
+ "grad_norm": 3.1423380374908447,
+ "learning_rate": 2.55e-06,
+ "loss": 1.0159,
+ "step": 51
+ },
+ {
+ "epoch": 0.02458628841607565,
+ "grad_norm": 3.4782402515411377,
+ "learning_rate": 2.6e-06,
+ "loss": 1.0231,
+ "step": 52
+ },
+ {
+ "epoch": 0.025059101654846337,
+ "grad_norm": 3.8362693786621094,
+ "learning_rate": 2.6500000000000005e-06,
+ "loss": 1.0725,
+ "step": 53
+ },
+ {
+ "epoch": 0.02553191489361702,
+ "grad_norm": 3.033294916152954,
+ "learning_rate": 2.7000000000000004e-06,
+ "loss": 0.9377,
+ "step": 54
+ },
+ {
+ "epoch": 0.026004728132387706,
+ "grad_norm": 3.849741220474243,
+ "learning_rate": 2.7500000000000004e-06,
+ "loss": 1.0046,
+ "step": 55
+ },
+ {
+ "epoch": 0.026477541371158392,
+ "grad_norm": 3.141876220703125,
+ "learning_rate": 2.8000000000000003e-06,
+ "loss": 0.9226,
+ "step": 56
+ },
+ {
+ "epoch": 0.02695035460992908,
+ "grad_norm": 2.773594856262207,
+ "learning_rate": 2.85e-06,
+ "loss": 0.8662,
+ "step": 57
+ },
+ {
+ "epoch": 0.027423167848699765,
+ "grad_norm": 3.1460225582122803,
+ "learning_rate": 2.9e-06,
+ "loss": 0.9304,
+ "step": 58
+ },
+ {
+ "epoch": 0.027895981087470448,
+ "grad_norm": 3.293583631515503,
+ "learning_rate": 2.95e-06,
+ "loss": 1.0374,
+ "step": 59
+ },
+ {
+ "epoch": 0.028368794326241134,
+ "grad_norm": 3.8190863132476807,
+ "learning_rate": 3e-06,
+ "loss": 0.971,
+ "step": 60
+ },
+ {
+ "epoch": 0.02884160756501182,
+ "grad_norm": 3.4566776752471924,
+ "learning_rate": 3.05e-06,
+ "loss": 0.9631,
+ "step": 61
+ },
+ {
+ "epoch": 0.029314420803782507,
+ "grad_norm": 3.355741500854492,
+ "learning_rate": 3.1000000000000004e-06,
+ "loss": 1.0097,
+ "step": 62
+ },
+ {
+ "epoch": 0.029787234042553193,
+ "grad_norm": 3.29746675491333,
+ "learning_rate": 3.1500000000000003e-06,
+ "loss": 0.9459,
+ "step": 63
+ },
+ {
+ "epoch": 0.030260047281323876,
+ "grad_norm": 3.3122968673706055,
+ "learning_rate": 3.2000000000000003e-06,
+ "loss": 0.8594,
+ "step": 64
+ },
+ {
+ "epoch": 0.030732860520094562,
+ "grad_norm": 3.477701187133789,
+ "learning_rate": 3.2500000000000002e-06,
+ "loss": 0.9197,
+ "step": 65
+ },
+ {
+ "epoch": 0.031205673758865248,
+ "grad_norm": 3.3363406658172607,
+ "learning_rate": 3.3000000000000006e-06,
+ "loss": 0.9478,
+ "step": 66
+ },
+ {
+ "epoch": 0.03167848699763593,
+ "grad_norm": 4.143295764923096,
+ "learning_rate": 3.3500000000000005e-06,
+ "loss": 1.0534,
+ "step": 67
+ },
+ {
+ "epoch": 0.03215130023640662,
+ "grad_norm": 3.2363274097442627,
+ "learning_rate": 3.4000000000000005e-06,
+ "loss": 0.9454,
+ "step": 68
+ },
+ {
+ "epoch": 0.032624113475177303,
+ "grad_norm": 3.198746681213379,
+ "learning_rate": 3.45e-06,
+ "loss": 0.9388,
+ "step": 69
+ },
+ {
+ "epoch": 0.03309692671394799,
+ "grad_norm": 3.5751023292541504,
+ "learning_rate": 3.5e-06,
+ "loss": 0.9444,
+ "step": 70
+ },
+ {
+ "epoch": 0.033569739952718676,
+ "grad_norm": 3.1745729446411133,
+ "learning_rate": 3.5500000000000003e-06,
+ "loss": 0.8683,
+ "step": 71
+ },
+ {
+ "epoch": 0.03404255319148936,
+ "grad_norm": 3.3210883140563965,
+ "learning_rate": 3.6000000000000003e-06,
+ "loss": 0.8811,
+ "step": 72
+ },
+ {
+ "epoch": 0.03451536643026005,
+ "grad_norm": 3.2502429485321045,
+ "learning_rate": 3.65e-06,
+ "loss": 1.0012,
+ "step": 73
+ },
+ {
+ "epoch": 0.034988179669030735,
+ "grad_norm": 3.44598126411438,
+ "learning_rate": 3.7e-06,
+ "loss": 0.9217,
+ "step": 74
+ },
+ {
+ "epoch": 0.03546099290780142,
+ "grad_norm": 3.439117431640625,
+ "learning_rate": 3.7500000000000005e-06,
+ "loss": 0.8976,
+ "step": 75
+ },
+ {
+ "epoch": 0.03593380614657211,
+ "grad_norm": 3.523627758026123,
+ "learning_rate": 3.8000000000000005e-06,
+ "loss": 0.8996,
+ "step": 76
+ },
+ {
+ "epoch": 0.03640661938534279,
+ "grad_norm": 3.3716015815734863,
+ "learning_rate": 3.85e-06,
+ "loss": 0.9061,
+ "step": 77
+ },
+ {
+ "epoch": 0.03687943262411347,
+ "grad_norm": 3.33518385887146,
+ "learning_rate": 3.900000000000001e-06,
+ "loss": 0.9371,
+ "step": 78
+ },
+ {
+ "epoch": 0.03735224586288416,
+ "grad_norm": 3.833829879760742,
+ "learning_rate": 3.95e-06,
+ "loss": 0.9669,
+ "step": 79
+ },
+ {
+ "epoch": 0.037825059101654845,
+ "grad_norm": 3.260446786880493,
+ "learning_rate": 4.000000000000001e-06,
+ "loss": 0.9449,
+ "step": 80
+ },
+ {
+ "epoch": 0.03829787234042553,
+ "grad_norm": 3.532451629638672,
+ "learning_rate": 4.05e-06,
+ "loss": 0.897,
+ "step": 81
+ },
+ {
+ "epoch": 0.03877068557919622,
+ "grad_norm": 3.1156492233276367,
+ "learning_rate": 4.1e-06,
+ "loss": 0.8463,
+ "step": 82
+ },
+ {
+ "epoch": 0.039243498817966904,
+ "grad_norm": 2.8801751136779785,
+ "learning_rate": 4.15e-06,
+ "loss": 0.8616,
+ "step": 83
+ },
+ {
+ "epoch": 0.03971631205673759,
+ "grad_norm": 3.072476863861084,
+ "learning_rate": 4.2000000000000004e-06,
+ "loss": 0.8387,
+ "step": 84
+ },
+ {
+ "epoch": 0.04018912529550828,
+ "grad_norm": 2.9601376056671143,
+ "learning_rate": 4.25e-06,
+ "loss": 0.8538,
+ "step": 85
+ },
+ {
+ "epoch": 0.04066193853427896,
+ "grad_norm": 3.521664619445801,
+ "learning_rate": 4.3e-06,
+ "loss": 0.8894,
+ "step": 86
+ },
+ {
+ "epoch": 0.04113475177304964,
+ "grad_norm": 3.2670981884002686,
+ "learning_rate": 4.350000000000001e-06,
+ "loss": 0.8387,
+ "step": 87
+ },
+ {
+ "epoch": 0.04160756501182033,
+ "grad_norm": 3.422089099884033,
+ "learning_rate": 4.4e-06,
+ "loss": 0.7728,
+ "step": 88
+ },
+ {
+ "epoch": 0.042080378250591015,
+ "grad_norm": 3.414034128189087,
+ "learning_rate": 4.450000000000001e-06,
+ "loss": 0.7968,
+ "step": 89
+ },
+ {
+ "epoch": 0.0425531914893617,
+ "grad_norm": 4.234285354614258,
+ "learning_rate": 4.5e-06,
+ "loss": 0.8502,
+ "step": 90
+ },
+ {
+ "epoch": 0.04302600472813239,
+ "grad_norm": 3.1446919441223145,
+ "learning_rate": 4.5500000000000005e-06,
+ "loss": 0.8236,
+ "step": 91
+ },
+ {
+ "epoch": 0.043498817966903074,
+ "grad_norm": 3.683443307876587,
+ "learning_rate": 4.600000000000001e-06,
+ "loss": 0.9792,
+ "step": 92
+ },
+ {
+ "epoch": 0.04397163120567376,
+ "grad_norm": 3.664219617843628,
+ "learning_rate": 4.65e-06,
+ "loss": 0.8743,
+ "step": 93
+ },
+ {
+ "epoch": 0.044444444444444446,
+ "grad_norm": 3.369479179382324,
+ "learning_rate": 4.7e-06,
+ "loss": 0.8741,
+ "step": 94
+ },
+ {
+ "epoch": 0.04491725768321513,
+ "grad_norm": 3.694949150085449,
+ "learning_rate": 4.75e-06,
+ "loss": 0.7574,
+ "step": 95
+ },
+ {
+ "epoch": 0.04539007092198582,
+ "grad_norm": 3.5144498348236084,
+ "learning_rate": 4.800000000000001e-06,
+ "loss": 0.9934,
+ "step": 96
+ },
+ {
+ "epoch": 0.0458628841607565,
+ "grad_norm": 3.164451837539673,
+ "learning_rate": 4.85e-06,
+ "loss": 0.7463,
+ "step": 97
+ },
+ {
+ "epoch": 0.046335697399527184,
+ "grad_norm": 3.222785472869873,
+ "learning_rate": 4.9000000000000005e-06,
+ "loss": 0.7698,
+ "step": 98
+ },
+ {
+ "epoch": 0.04680851063829787,
+ "grad_norm": 2.9129555225372314,
+ "learning_rate": 4.95e-06,
+ "loss": 0.7856,
+ "step": 99
+ },
+ {
+ "epoch": 0.04728132387706856,
+ "grad_norm": 3.5061235427856445,
+ "learning_rate": 5e-06,
+ "loss": 0.8588,
+ "step": 100
+ },
+ {
+ "epoch": 0.04775413711583924,
+ "grad_norm": 3.2805044651031494,
+ "learning_rate": 4.999999922167982e-06,
+ "loss": 0.7643,
+ "step": 101
+ },
+ {
+ "epoch": 0.04822695035460993,
+ "grad_norm": 3.5461678504943848,
+ "learning_rate": 4.999999688671929e-06,
+ "loss": 0.8253,
+ "step": 102
+ },
+ {
+ "epoch": 0.048699763593380616,
+ "grad_norm": 3.2238264083862305,
+ "learning_rate": 4.99999929951186e-06,
+ "loss": 0.7622,
+ "step": 103
+ },
+ {
+ "epoch": 0.0491725768321513,
+ "grad_norm": 3.818955898284912,
+ "learning_rate": 4.999998754687795e-06,
+ "loss": 0.8471,
+ "step": 104
+ },
+ {
+ "epoch": 0.04964539007092199,
+ "grad_norm": 3.1252424716949463,
+ "learning_rate": 4.99999805419977e-06,
+ "loss": 0.8409,
+ "step": 105
+ },
+ {
+ "epoch": 0.050118203309692674,
+ "grad_norm": 3.604283571243286,
+ "learning_rate": 4.999997198047828e-06,
+ "loss": 0.9027,
+ "step": 106
+ },
+ {
+ "epoch": 0.050591016548463354,
+ "grad_norm": 3.6752424240112305,
+ "learning_rate": 4.999996186232023e-06,
+ "loss": 0.9336,
+ "step": 107
+ },
+ {
+ "epoch": 0.05106382978723404,
+ "grad_norm": 3.517557144165039,
+ "learning_rate": 4.9999950187524184e-06,
+ "loss": 0.8351,
+ "step": 108
+ },
+ {
+ "epoch": 0.051536643026004726,
+ "grad_norm": 3.427285671234131,
+ "learning_rate": 4.999993695609085e-06,
+ "loss": 0.8457,
+ "step": 109
+ },
+ {
+ "epoch": 0.05200945626477541,
+ "grad_norm": 3.2792510986328125,
+ "learning_rate": 4.999992216802107e-06,
+ "loss": 0.8391,
+ "step": 110
+ },
+ {
+ "epoch": 0.0524822695035461,
+ "grad_norm": 3.581094741821289,
+ "learning_rate": 4.999990582331576e-06,
+ "loss": 0.7533,
+ "step": 111
+ },
+ {
+ "epoch": 0.052955082742316785,
+ "grad_norm": 3.1667377948760986,
+ "learning_rate": 4.999988792197593e-06,
+ "loss": 0.9562,
+ "step": 112
+ },
+ {
+ "epoch": 0.05342789598108747,
+ "grad_norm": 3.3609890937805176,
+ "learning_rate": 4.99998684640027e-06,
+ "loss": 0.8181,
+ "step": 113
+ },
+ {
+ "epoch": 0.05390070921985816,
+ "grad_norm": 3.260627269744873,
+ "learning_rate": 4.999984744939729e-06,
+ "loss": 0.8012,
+ "step": 114
+ },
+ {
+ "epoch": 0.054373522458628844,
+ "grad_norm": 3.4535653591156006,
+ "learning_rate": 4.9999824878160985e-06,
+ "loss": 0.919,
+ "step": 115
+ },
+ {
+ "epoch": 0.05484633569739953,
+ "grad_norm": 3.4880740642547607,
+ "learning_rate": 4.999980075029522e-06,
+ "loss": 0.8114,
+ "step": 116
+ },
+ {
+ "epoch": 0.05531914893617021,
+ "grad_norm": 3.2546932697296143,
+ "learning_rate": 4.999977506580147e-06,
+ "loss": 0.8274,
+ "step": 117
+ },
+ {
+ "epoch": 0.055791962174940896,
+ "grad_norm": 3.2762744426727295,
+ "learning_rate": 4.999974782468136e-06,
+ "loss": 0.9018,
+ "step": 118
+ },
+ {
+ "epoch": 0.05626477541371158,
+ "grad_norm": 3.42825984954834,
+ "learning_rate": 4.999971902693657e-06,
+ "loss": 0.8262,
+ "step": 119
+ },
+ {
+ "epoch": 0.05673758865248227,
+ "grad_norm": 3.082496404647827,
+ "learning_rate": 4.99996886725689e-06,
+ "loss": 0.8181,
+ "step": 120
+ },
+ {
+ "epoch": 0.057210401891252954,
+ "grad_norm": 3.322869300842285,
+ "learning_rate": 4.9999656761580225e-06,
+ "loss": 0.8382,
+ "step": 121
+ },
+ {
+ "epoch": 0.05768321513002364,
+ "grad_norm": 3.6365339756011963,
+ "learning_rate": 4.9999623293972555e-06,
+ "loss": 0.7489,
+ "step": 122
+ },
+ {
+ "epoch": 0.05815602836879433,
+ "grad_norm": 3.376352548599243,
+ "learning_rate": 4.999958826974796e-06,
+ "loss": 0.9012,
+ "step": 123
+ },
+ {
+ "epoch": 0.05862884160756501,
+ "grad_norm": 3.49088716506958,
+ "learning_rate": 4.999955168890862e-06,
+ "loss": 0.8999,
+ "step": 124
+ },
+ {
+ "epoch": 0.0591016548463357,
+ "grad_norm": 3.3265068531036377,
+ "learning_rate": 4.999951355145682e-06,
+ "loss": 0.8161,
+ "step": 125
+ },
+ {
+ "epoch": 0.059574468085106386,
+ "grad_norm": 3.697282314300537,
+ "learning_rate": 4.999947385739493e-06,
+ "loss": 0.9623,
+ "step": 126
+ },
+ {
+ "epoch": 0.06004728132387707,
+ "grad_norm": 2.7901928424835205,
+ "learning_rate": 4.999943260672542e-06,
+ "loss": 0.7371,
+ "step": 127
+ },
+ {
+ "epoch": 0.06052009456264775,
+ "grad_norm": 3.110319137573242,
+ "learning_rate": 4.999938979945086e-06,
+ "loss": 0.715,
+ "step": 128
+ },
+ {
+ "epoch": 0.06099290780141844,
+ "grad_norm": 3.2211520671844482,
+ "learning_rate": 4.999934543557392e-06,
+ "loss": 0.8888,
+ "step": 129
+ },
+ {
+ "epoch": 0.061465721040189124,
+ "grad_norm": 3.2466187477111816,
+ "learning_rate": 4.999929951509735e-06,
+ "loss": 0.9389,
+ "step": 130
+ },
+ {
+ "epoch": 0.06193853427895981,
+ "grad_norm": 3.3574399948120117,
+ "learning_rate": 4.999925203802403e-06,
+ "loss": 0.8263,
+ "step": 131
+ },
+ {
+ "epoch": 0.062411347517730496,
+ "grad_norm": 3.275601625442505,
+ "learning_rate": 4.99992030043569e-06,
+ "loss": 0.8338,
+ "step": 132
+ },
+ {
+ "epoch": 0.06288416075650118,
+ "grad_norm": 3.6011312007904053,
+ "learning_rate": 4.999915241409902e-06,
+ "loss": 0.8351,
+ "step": 133
+ },
+ {
+ "epoch": 0.06335697399527186,
+ "grad_norm": 2.969011068344116,
+ "learning_rate": 4.999910026725352e-06,
+ "loss": 0.79,
+ "step": 134
+ },
+ {
+ "epoch": 0.06382978723404255,
+ "grad_norm": 3.690784454345703,
+ "learning_rate": 4.999904656382369e-06,
+ "loss": 0.8209,
+ "step": 135
+ },
+ {
+ "epoch": 0.06430260047281323,
+ "grad_norm": 3.3363115787506104,
+ "learning_rate": 4.999899130381283e-06,
+ "loss": 0.858,
+ "step": 136
+ },
+ {
+ "epoch": 0.06477541371158392,
+ "grad_norm": 3.206881523132324,
+ "learning_rate": 4.9998934487224405e-06,
+ "loss": 0.834,
+ "step": 137
+ },
+ {
+ "epoch": 0.06524822695035461,
+ "grad_norm": 2.773146152496338,
+ "learning_rate": 4.999887611406195e-06,
+ "loss": 0.7576,
+ "step": 138
+ },
+ {
+ "epoch": 0.0657210401891253,
+ "grad_norm": 3.307725667953491,
+ "learning_rate": 4.999881618432908e-06,
+ "loss": 0.7487,
+ "step": 139
+ },
+ {
+ "epoch": 0.06619385342789598,
+ "grad_norm": 4.273657321929932,
+ "learning_rate": 4.999875469802956e-06,
+ "loss": 0.8176,
+ "step": 140
+ },
+ {
+ "epoch": 0.06666666666666667,
+ "grad_norm": 3.0898005962371826,
+ "learning_rate": 4.999869165516719e-06,
+ "loss": 0.7578,
+ "step": 141
+ },
+ {
+ "epoch": 0.06713947990543735,
+ "grad_norm": 3.25150990486145,
+ "learning_rate": 4.9998627055745915e-06,
+ "loss": 0.7873,
+ "step": 142
+ },
+ {
+ "epoch": 0.06761229314420804,
+ "grad_norm": 2.9705755710601807,
+ "learning_rate": 4.999856089976974e-06,
+ "loss": 0.6473,
+ "step": 143
+ },
+ {
+ "epoch": 0.06808510638297872,
+ "grad_norm": 3.5658507347106934,
+ "learning_rate": 4.9998493187242804e-06,
+ "loss": 0.855,
+ "step": 144
+ },
+ {
+ "epoch": 0.06855791962174941,
+ "grad_norm": 3.3994076251983643,
+ "learning_rate": 4.99984239181693e-06,
+ "loss": 0.7926,
+ "step": 145
+ },
+ {
+ "epoch": 0.0690307328605201,
+ "grad_norm": 2.8266260623931885,
+ "learning_rate": 4.999835309255357e-06,
+ "loss": 0.7564,
+ "step": 146
+ },
+ {
+ "epoch": 0.06950354609929078,
+ "grad_norm": 3.1143875122070312,
+ "learning_rate": 4.999828071039999e-06,
+ "loss": 0.8398,
+ "step": 147
+ },
+ {
+ "epoch": 0.06997635933806147,
+ "grad_norm": 2.9364278316497803,
+ "learning_rate": 4.99982067717131e-06,
+ "loss": 0.7381,
+ "step": 148
+ },
+ {
+ "epoch": 0.07044917257683216,
+ "grad_norm": 3.4155616760253906,
+ "learning_rate": 4.999813127649748e-06,
+ "loss": 0.7933,
+ "step": 149
+ },
+ {
+ "epoch": 0.07092198581560284,
+ "grad_norm": 4.371236324310303,
+ "learning_rate": 4.999805422475784e-06,
+ "loss": 0.8292,
+ "step": 150
+ },
+ {
+ "epoch": 0.07139479905437353,
+ "grad_norm": 3.3967185020446777,
+ "learning_rate": 4.999797561649897e-06,
+ "loss": 0.8712,
+ "step": 151
+ },
+ {
+ "epoch": 0.07186761229314421,
+ "grad_norm": 3.343303680419922,
+ "learning_rate": 4.999789545172578e-06,
+ "loss": 0.8177,
+ "step": 152
+ },
+ {
+ "epoch": 0.07234042553191489,
+ "grad_norm": 3.040235757827759,
+ "learning_rate": 4.999781373044325e-06,
+ "loss": 0.7379,
+ "step": 153
+ },
+ {
+ "epoch": 0.07281323877068557,
+ "grad_norm": 3.4069204330444336,
+ "learning_rate": 4.999773045265647e-06,
+ "loss": 0.7939,
+ "step": 154
+ },
+ {
+ "epoch": 0.07328605200945626,
+ "grad_norm": 3.1939475536346436,
+ "learning_rate": 4.999764561837063e-06,
+ "loss": 0.8037,
+ "step": 155
+ },
+ {
+ "epoch": 0.07375886524822695,
+ "grad_norm": 4.452004909515381,
+ "learning_rate": 4.999755922759101e-06,
+ "loss": 0.8421,
+ "step": 156
+ },
+ {
+ "epoch": 0.07423167848699763,
+ "grad_norm": 3.2031240463256836,
+ "learning_rate": 4.999747128032298e-06,
+ "loss": 0.794,
+ "step": 157
+ },
+ {
+ "epoch": 0.07470449172576832,
+ "grad_norm": 3.175920009613037,
+ "learning_rate": 4.999738177657203e-06,
+ "loss": 0.759,
+ "step": 158
+ },
+ {
+ "epoch": 0.075177304964539,
+ "grad_norm": 3.7679688930511475,
+ "learning_rate": 4.9997290716343725e-06,
+ "loss": 0.8174,
+ "step": 159
+ },
+ {
+ "epoch": 0.07565011820330969,
+ "grad_norm": 3.7020037174224854,
+ "learning_rate": 4.999719809964373e-06,
+ "loss": 0.7116,
+ "step": 160
+ },
+ {
+ "epoch": 0.07612293144208038,
+ "grad_norm": 4.357471942901611,
+ "learning_rate": 4.999710392647783e-06,
+ "loss": 0.7649,
+ "step": 161
+ },
+ {
+ "epoch": 0.07659574468085106,
+ "grad_norm": 3.3439087867736816,
+ "learning_rate": 4.999700819685187e-06,
+ "loss": 0.7907,
+ "step": 162
+ },
+ {
+ "epoch": 0.07706855791962175,
+ "grad_norm": 3.210815191268921,
+ "learning_rate": 4.999691091077182e-06,
+ "loss": 0.8446,
+ "step": 163
+ },
+ {
+ "epoch": 0.07754137115839244,
+ "grad_norm": 3.1029553413391113,
+ "learning_rate": 4.9996812068243735e-06,
+ "loss": 0.7232,
+ "step": 164
+ },
+ {
+ "epoch": 0.07801418439716312,
+ "grad_norm": 2.9389400482177734,
+ "learning_rate": 4.999671166927378e-06,
+ "loss": 0.7413,
+ "step": 165
+ },
+ {
+ "epoch": 0.07848699763593381,
+ "grad_norm": 3.7062697410583496,
+ "learning_rate": 4.9996609713868185e-06,
+ "loss": 0.8773,
+ "step": 166
+ },
+ {
+ "epoch": 0.0789598108747045,
+ "grad_norm": 3.2768924236297607,
+ "learning_rate": 4.999650620203332e-06,
+ "loss": 0.8046,
+ "step": 167
+ },
+ {
+ "epoch": 0.07943262411347518,
+ "grad_norm": 3.380373001098633,
+ "learning_rate": 4.999640113377561e-06,
+ "loss": 0.7529,
+ "step": 168
+ },
+ {
+ "epoch": 0.07990543735224587,
+ "grad_norm": 3.520022392272949,
+ "learning_rate": 4.999629450910162e-06,
+ "loss": 0.7352,
+ "step": 169
+ },
+ {
+ "epoch": 0.08037825059101655,
+ "grad_norm": 3.43269419670105,
+ "learning_rate": 4.999618632801796e-06,
+ "loss": 0.9371,
+ "step": 170
+ },
+ {
+ "epoch": 0.08085106382978724,
+ "grad_norm": 3.555877923965454,
+ "learning_rate": 4.99960765905314e-06,
+ "loss": 0.8276,
+ "step": 171
+ },
+ {
+ "epoch": 0.08132387706855793,
+ "grad_norm": 3.597050189971924,
+ "learning_rate": 4.999596529664874e-06,
+ "loss": 0.8164,
+ "step": 172
+ },
+ {
+ "epoch": 0.0817966903073286,
+ "grad_norm": 3.2002956867218018,
+ "learning_rate": 4.999585244637693e-06,
+ "loss": 0.7824,
+ "step": 173
+ },
+ {
+ "epoch": 0.08226950354609928,
+ "grad_norm": 3.527275562286377,
+ "learning_rate": 4.999573803972299e-06,
+ "loss": 0.8033,
+ "step": 174
+ },
+ {
+ "epoch": 0.08274231678486997,
+ "grad_norm": 3.5184452533721924,
+ "learning_rate": 4.999562207669405e-06,
+ "loss": 0.724,
+ "step": 175
+ },
+ {
+ "epoch": 0.08321513002364066,
+ "grad_norm": 3.6635067462921143,
+ "learning_rate": 4.999550455729732e-06,
+ "loss": 0.819,
+ "step": 176
+ },
+ {
+ "epoch": 0.08368794326241134,
+ "grad_norm": 3.192399740219116,
+ "learning_rate": 4.999538548154012e-06,
+ "loss": 0.7999,
+ "step": 177
+ },
+ {
+ "epoch": 0.08416075650118203,
+ "grad_norm": 3.0946953296661377,
+ "learning_rate": 4.999526484942988e-06,
+ "loss": 0.7367,
+ "step": 178
+ },
+ {
+ "epoch": 0.08463356973995272,
+ "grad_norm": 2.847198009490967,
+ "learning_rate": 4.99951426609741e-06,
+ "loss": 0.7536,
+ "step": 179
+ },
+ {
+ "epoch": 0.0851063829787234,
+ "grad_norm": 2.7674827575683594,
+ "learning_rate": 4.999501891618037e-06,
+ "loss": 0.701,
+ "step": 180
+ },
+ {
+ "epoch": 0.08557919621749409,
+ "grad_norm": 3.357933521270752,
+ "learning_rate": 4.999489361505643e-06,
+ "loss": 0.8331,
+ "step": 181
+ },
+ {
+ "epoch": 0.08605200945626477,
+ "grad_norm": 3.1464426517486572,
+ "learning_rate": 4.999476675761004e-06,
+ "loss": 0.7931,
+ "step": 182
+ },
+ {
+ "epoch": 0.08652482269503546,
+ "grad_norm": 3.310697078704834,
+ "learning_rate": 4.999463834384915e-06,
+ "loss": 0.753,
+ "step": 183
+ },
+ {
+ "epoch": 0.08699763593380615,
+ "grad_norm": 2.9794881343841553,
+ "learning_rate": 4.999450837378171e-06,
+ "loss": 0.7091,
+ "step": 184
+ },
+ {
+ "epoch": 0.08747044917257683,
+ "grad_norm": 3.0776889324188232,
+ "learning_rate": 4.999437684741584e-06,
+ "loss": 0.7226,
+ "step": 185
+ },
+ {
+ "epoch": 0.08794326241134752,
+ "grad_norm": 3.6657519340515137,
+ "learning_rate": 4.999424376475972e-06,
+ "loss": 0.845,
+ "step": 186
+ },
+ {
+ "epoch": 0.0884160756501182,
+ "grad_norm": 3.872718572616577,
+ "learning_rate": 4.999410912582164e-06,
+ "loss": 0.812,
+ "step": 187
+ },
+ {
+ "epoch": 0.08888888888888889,
+ "grad_norm": 2.9184508323669434,
+ "learning_rate": 4.9993972930609976e-06,
+ "loss": 0.6823,
+ "step": 188
+ },
+ {
+ "epoch": 0.08936170212765958,
+ "grad_norm": 3.5567142963409424,
+ "learning_rate": 4.999383517913321e-06,
+ "loss": 0.7614,
+ "step": 189
+ },
+ {
+ "epoch": 0.08983451536643026,
+ "grad_norm": 3.3688533306121826,
+ "learning_rate": 4.999369587139992e-06,
+ "loss": 0.858,
+ "step": 190
+ },
+ {
+ "epoch": 0.09030732860520095,
+ "grad_norm": 2.893223524093628,
+ "learning_rate": 4.99935550074188e-06,
+ "loss": 0.6761,
+ "step": 191
+ },
+ {
+ "epoch": 0.09078014184397164,
+ "grad_norm": 3.400225877761841,
+ "learning_rate": 4.999341258719859e-06,
+ "loss": 0.7531,
+ "step": 192
+ },
+ {
+ "epoch": 0.09125295508274232,
+ "grad_norm": 3.6167714595794678,
+ "learning_rate": 4.999326861074817e-06,
+ "loss": 0.8164,
+ "step": 193
+ },
+ {
+ "epoch": 0.091725768321513,
+ "grad_norm": 4.325016498565674,
+ "learning_rate": 4.9993123078076506e-06,
+ "loss": 0.7069,
+ "step": 194
+ },
+ {
+ "epoch": 0.09219858156028368,
+ "grad_norm": 3.195317029953003,
+ "learning_rate": 4.999297598919266e-06,
+ "loss": 0.726,
+ "step": 195
+ },
+ {
+ "epoch": 0.09267139479905437,
+ "grad_norm": 3.146530866622925,
+ "learning_rate": 4.999282734410579e-06,
+ "loss": 0.7888,
+ "step": 196
+ },
+ {
+ "epoch": 0.09314420803782505,
+ "grad_norm": 3.5166752338409424,
+ "learning_rate": 4.999267714282515e-06,
+ "loss": 0.8473,
+ "step": 197
+ },
+ {
+ "epoch": 0.09361702127659574,
+ "grad_norm": 3.3140196800231934,
+ "learning_rate": 4.99925253853601e-06,
+ "loss": 0.7233,
+ "step": 198
+ },
+ {
+ "epoch": 0.09408983451536643,
+ "grad_norm": 3.0318164825439453,
+ "learning_rate": 4.999237207172008e-06,
+ "loss": 0.7543,
+ "step": 199
+ },
+ {
+ "epoch": 0.09456264775413711,
+ "grad_norm": 3.662214756011963,
+ "learning_rate": 4.999221720191464e-06,
+ "loss": 0.7783,
+ "step": 200
+ },
+ {
+ "epoch": 0.0950354609929078,
+ "grad_norm": 3.452078104019165,
+ "learning_rate": 4.9992060775953425e-06,
+ "loss": 0.7868,
+ "step": 201
+ },
+ {
+ "epoch": 0.09550827423167849,
+ "grad_norm": 3.4051287174224854,
+ "learning_rate": 4.999190279384617e-06,
+ "loss": 0.7849,
+ "step": 202
+ },
+ {
+ "epoch": 0.09598108747044917,
+ "grad_norm": 3.1377196311950684,
+ "learning_rate": 4.999174325560271e-06,
+ "loss": 0.8364,
+ "step": 203
+ },
+ {
+ "epoch": 0.09645390070921986,
+ "grad_norm": 3.129473924636841,
+ "learning_rate": 4.999158216123299e-06,
+ "loss": 0.7458,
+ "step": 204
+ },
+ {
+ "epoch": 0.09692671394799054,
+ "grad_norm": 3.169548749923706,
+ "learning_rate": 4.999141951074703e-06,
+ "loss": 0.7256,
+ "step": 205
+ },
+ {
+ "epoch": 0.09739952718676123,
+ "grad_norm": 3.186009168624878,
+ "learning_rate": 4.999125530415495e-06,
+ "loss": 0.783,
+ "step": 206
+ },
+ {
+ "epoch": 0.09787234042553192,
+ "grad_norm": 3.0995123386383057,
+ "learning_rate": 4.9991089541467e-06,
+ "loss": 0.7519,
+ "step": 207
+ },
+ {
+ "epoch": 0.0983451536643026,
+ "grad_norm": 3.1854088306427,
+ "learning_rate": 4.999092222269348e-06,
+ "loss": 0.7444,
+ "step": 208
+ },
+ {
+ "epoch": 0.09881796690307329,
+ "grad_norm": 3.1512246131896973,
+ "learning_rate": 4.999075334784482e-06,
+ "loss": 0.7882,
+ "step": 209
+ },
+ {
+ "epoch": 0.09929078014184398,
+ "grad_norm": 3.6199698448181152,
+ "learning_rate": 4.999058291693153e-06,
+ "loss": 0.8048,
+ "step": 210
+ },
+ {
+ "epoch": 0.09976359338061466,
+ "grad_norm": 2.956907272338867,
+ "learning_rate": 4.999041092996422e-06,
+ "loss": 0.7663,
+ "step": 211
+ },
+ {
+ "epoch": 0.10023640661938535,
+ "grad_norm": 3.3493971824645996,
+ "learning_rate": 4.99902373869536e-06,
+ "loss": 0.7639,
+ "step": 212
+ },
+ {
+ "epoch": 0.10070921985815603,
+ "grad_norm": 3.144812822341919,
+ "learning_rate": 4.9990062287910475e-06,
+ "loss": 0.7953,
+ "step": 213
+ },
+ {
+ "epoch": 0.10118203309692671,
+ "grad_norm": 3.5986971855163574,
+ "learning_rate": 4.998988563284576e-06,
+ "loss": 0.8297,
+ "step": 214
+ },
+ {
+ "epoch": 0.1016548463356974,
+ "grad_norm": 3.447584867477417,
+ "learning_rate": 4.998970742177044e-06,
+ "loss": 0.808,
+ "step": 215
+ },
+ {
+ "epoch": 0.10212765957446808,
+ "grad_norm": 3.791353940963745,
+ "learning_rate": 4.998952765469562e-06,
+ "loss": 0.8005,
+ "step": 216
+ },
+ {
+ "epoch": 0.10260047281323877,
+ "grad_norm": 3.4490807056427,
+ "learning_rate": 4.998934633163247e-06,
+ "loss": 0.8135,
+ "step": 217
+ },
+ {
+ "epoch": 0.10307328605200945,
+ "grad_norm": 3.1053314208984375,
+ "learning_rate": 4.998916345259232e-06,
+ "loss": 0.7888,
+ "step": 218
+ },
+ {
+ "epoch": 0.10354609929078014,
+ "grad_norm": 3.407862663269043,
+ "learning_rate": 4.9988979017586514e-06,
+ "loss": 0.7099,
+ "step": 219
+ },
+ {
+ "epoch": 0.10401891252955082,
+ "grad_norm": 3.116656541824341,
+ "learning_rate": 4.998879302662658e-06,
+ "loss": 0.8344,
+ "step": 220
+ },
+ {
+ "epoch": 0.10449172576832151,
+ "grad_norm": 3.339264154434204,
+ "learning_rate": 4.998860547972406e-06,
+ "loss": 0.8496,
+ "step": 221
+ },
+ {
+ "epoch": 0.1049645390070922,
+ "grad_norm": 3.251892566680908,
+ "learning_rate": 4.998841637689066e-06,
+ "loss": 0.7455,
+ "step": 222
+ },
+ {
+ "epoch": 0.10543735224586288,
+ "grad_norm": 4.098135471343994,
+ "learning_rate": 4.998822571813814e-06,
+ "loss": 0.7772,
+ "step": 223
+ },
+ {
+ "epoch": 0.10591016548463357,
+ "grad_norm": 3.9871134757995605,
+ "learning_rate": 4.998803350347837e-06,
+ "loss": 0.8261,
+ "step": 224
+ },
+ {
+ "epoch": 0.10638297872340426,
+ "grad_norm": 3.2822303771972656,
+ "learning_rate": 4.998783973292333e-06,
+ "loss": 0.8623,
+ "step": 225
+ },
+ {
+ "epoch": 0.10685579196217494,
+ "grad_norm": 3.0356857776641846,
+ "learning_rate": 4.998764440648507e-06,
+ "loss": 0.7426,
+ "step": 226
+ },
+ {
+ "epoch": 0.10732860520094563,
+ "grad_norm": 2.8932785987854004,
+ "learning_rate": 4.998744752417576e-06,
+ "loss": 0.6741,
+ "step": 227
+ },
+ {
+ "epoch": 0.10780141843971631,
+ "grad_norm": 3.085820436477661,
+ "learning_rate": 4.998724908600767e-06,
+ "loss": 0.6549,
+ "step": 228
+ },
+ {
+ "epoch": 0.108274231678487,
+ "grad_norm": 3.135829210281372,
+ "learning_rate": 4.998704909199314e-06,
+ "loss": 0.6702,
+ "step": 229
+ },
+ {
+ "epoch": 0.10874704491725769,
+ "grad_norm": 5.016134262084961,
+ "learning_rate": 4.9986847542144625e-06,
+ "loss": 0.7852,
+ "step": 230
+ },
+ {
+ "epoch": 0.10921985815602837,
+ "grad_norm": 3.9056200981140137,
+ "learning_rate": 4.998664443647468e-06,
+ "loss": 0.9654,
+ "step": 231
+ },
+ {
+ "epoch": 0.10969267139479906,
+ "grad_norm": 3.0880749225616455,
+ "learning_rate": 4.998643977499595e-06,
+ "loss": 0.7579,
+ "step": 232
+ },
+ {
+ "epoch": 0.11016548463356975,
+ "grad_norm": 3.6893601417541504,
+ "learning_rate": 4.998623355772118e-06,
+ "loss": 0.713,
+ "step": 233
+ },
+ {
+ "epoch": 0.11063829787234042,
+ "grad_norm": 4.181536674499512,
+ "learning_rate": 4.998602578466319e-06,
+ "loss": 0.7331,
+ "step": 234
+ },
+ {
+ "epoch": 0.1111111111111111,
+ "grad_norm": 3.036386728286743,
+ "learning_rate": 4.998581645583496e-06,
+ "loss": 0.7115,
+ "step": 235
+ },
+ {
+ "epoch": 0.11158392434988179,
+ "grad_norm": 3.6333255767822266,
+ "learning_rate": 4.998560557124948e-06,
+ "loss": 0.7544,
+ "step": 236
+ },
+ {
+ "epoch": 0.11205673758865248,
+ "grad_norm": 2.926417827606201,
+ "learning_rate": 4.9985393130919915e-06,
+ "loss": 0.715,
+ "step": 237
+ },
+ {
+ "epoch": 0.11252955082742316,
+ "grad_norm": 2.969158172607422,
+ "learning_rate": 4.998517913485946e-06,
+ "loss": 0.7304,
+ "step": 238
+ },
+ {
+ "epoch": 0.11300236406619385,
+ "grad_norm": 3.5254971981048584,
+ "learning_rate": 4.9984963583081466e-06,
+ "loss": 0.7725,
+ "step": 239
+ },
+ {
+ "epoch": 0.11347517730496454,
+ "grad_norm": 3.7840335369110107,
+ "learning_rate": 4.998474647559936e-06,
+ "loss": 0.8685,
+ "step": 240
+ },
+ {
+ "epoch": 0.11394799054373522,
+ "grad_norm": 3.0333125591278076,
+ "learning_rate": 4.9984527812426625e-06,
+ "loss": 0.7793,
+ "step": 241
+ },
+ {
+ "epoch": 0.11442080378250591,
+ "grad_norm": 3.290159225463867,
+ "learning_rate": 4.99843075935769e-06,
+ "loss": 0.7158,
+ "step": 242
+ },
+ {
+ "epoch": 0.1148936170212766,
+ "grad_norm": 3.3935494422912598,
+ "learning_rate": 4.99840858190639e-06,
+ "loss": 0.7643,
+ "step": 243
+ },
+ {
+ "epoch": 0.11536643026004728,
+ "grad_norm": 3.333965539932251,
+ "learning_rate": 4.998386248890142e-06,
+ "loss": 0.7255,
+ "step": 244
+ },
+ {
+ "epoch": 0.11583924349881797,
+ "grad_norm": 2.8129613399505615,
+ "learning_rate": 4.998363760310339e-06,
+ "loss": 0.768,
+ "step": 245
+ },
+ {
+ "epoch": 0.11631205673758865,
+ "grad_norm": 2.8678107261657715,
+ "learning_rate": 4.998341116168378e-06,
+ "loss": 0.7403,
+ "step": 246
+ },
+ {
+ "epoch": 0.11678486997635934,
+ "grad_norm": 2.8898239135742188,
+ "learning_rate": 4.998318316465672e-06,
+ "loss": 0.6844,
+ "step": 247
+ },
+ {
+ "epoch": 0.11725768321513003,
+ "grad_norm": 3.139777898788452,
+ "learning_rate": 4.998295361203637e-06,
+ "loss": 0.7936,
+ "step": 248
+ },
+ {
+ "epoch": 0.11773049645390071,
+ "grad_norm": 3.393721103668213,
+ "learning_rate": 4.998272250383707e-06,
+ "loss": 0.8173,
+ "step": 249
+ },
+ {
+ "epoch": 0.1182033096926714,
+ "grad_norm": 3.240973949432373,
+ "learning_rate": 4.998248984007318e-06,
+ "loss": 0.8252,
+ "step": 250
+ },
+ {
+ "epoch": 0.11867612293144209,
+ "grad_norm": 3.384855031967163,
+ "learning_rate": 4.998225562075918e-06,
+ "loss": 0.7244,
+ "step": 251
+ },
+ {
+ "epoch": 0.11914893617021277,
+ "grad_norm": 3.1881816387176514,
+ "learning_rate": 4.9982019845909675e-06,
+ "loss": 0.6818,
+ "step": 252
+ },
+ {
+ "epoch": 0.11962174940898346,
+ "grad_norm": 2.888364553451538,
+ "learning_rate": 4.998178251553934e-06,
+ "loss": 0.6753,
+ "step": 253
+ },
+ {
+ "epoch": 0.12009456264775414,
+ "grad_norm": 3.630093812942505,
+ "learning_rate": 4.9981543629662944e-06,
+ "loss": 0.7995,
+ "step": 254
+ },
+ {
+ "epoch": 0.12056737588652482,
+ "grad_norm": 2.9820947647094727,
+ "learning_rate": 4.998130318829537e-06,
+ "loss": 0.7478,
+ "step": 255
+ },
+ {
+ "epoch": 0.1210401891252955,
+ "grad_norm": 2.7094738483428955,
+ "learning_rate": 4.998106119145159e-06,
+ "loss": 0.7237,
+ "step": 256
+ },
+ {
+ "epoch": 0.12151300236406619,
+ "grad_norm": 3.1808104515075684,
+ "learning_rate": 4.9980817639146665e-06,
+ "loss": 0.7915,
+ "step": 257
+ },
+ {
+ "epoch": 0.12198581560283688,
+ "grad_norm": 3.1661291122436523,
+ "learning_rate": 4.998057253139575e-06,
+ "loss": 0.8053,
+ "step": 258
+ },
+ {
+ "epoch": 0.12245862884160756,
+ "grad_norm": 3.528749942779541,
+ "learning_rate": 4.998032586821413e-06,
+ "loss": 0.7946,
+ "step": 259
+ },
+ {
+ "epoch": 0.12293144208037825,
+ "grad_norm": 3.125964879989624,
+ "learning_rate": 4.998007764961716e-06,
+ "loss": 0.7569,
+ "step": 260
+ },
+ {
+ "epoch": 0.12340425531914893,
+ "grad_norm": 3.0778942108154297,
+ "learning_rate": 4.997982787562029e-06,
+ "loss": 0.7184,
+ "step": 261
+ },
+ {
+ "epoch": 0.12387706855791962,
+ "grad_norm": 3.3531930446624756,
+ "learning_rate": 4.997957654623906e-06,
+ "loss": 0.7586,
+ "step": 262
+ },
+ {
+ "epoch": 0.1243498817966903,
+ "grad_norm": 3.229278564453125,
+ "learning_rate": 4.997932366148913e-06,
+ "loss": 0.6092,
+ "step": 263
+ },
+ {
+ "epoch": 0.12482269503546099,
+ "grad_norm": 3.7286155223846436,
+ "learning_rate": 4.997906922138626e-06,
+ "loss": 0.7965,
+ "step": 264
+ },
+ {
+ "epoch": 0.12529550827423167,
+ "grad_norm": 3.300311803817749,
+ "learning_rate": 4.997881322594628e-06,
+ "loss": 0.7665,
+ "step": 265
+ },
+ {
+ "epoch": 0.12576832151300235,
+ "grad_norm": 3.411482572555542,
+ "learning_rate": 4.9978555675185115e-06,
+ "loss": 0.7253,
+ "step": 266
+ },
+ {
+ "epoch": 0.12624113475177304,
+ "grad_norm": 3.0884511470794678,
+ "learning_rate": 4.9978296569118825e-06,
+ "loss": 0.659,
+ "step": 267
+ },
+ {
+ "epoch": 0.12671394799054372,
+ "grad_norm": 3.0652925968170166,
+ "learning_rate": 4.9978035907763535e-06,
+ "loss": 0.6739,
+ "step": 268
+ },
+ {
+ "epoch": 0.1271867612293144,
+ "grad_norm": 3.280555009841919,
+ "learning_rate": 4.997777369113547e-06,
+ "loss": 0.8003,
+ "step": 269
+ },
+ {
+ "epoch": 0.1276595744680851,
+ "grad_norm": 2.980860948562622,
+ "learning_rate": 4.997750991925096e-06,
+ "loss": 0.7097,
+ "step": 270
+ },
+ {
+ "epoch": 0.12813238770685578,
+ "grad_norm": 3.301760673522949,
+ "learning_rate": 4.997724459212644e-06,
+ "loss": 0.7894,
+ "step": 271
+ },
+ {
+ "epoch": 0.12860520094562647,
+ "grad_norm": 2.9584903717041016,
+ "learning_rate": 4.997697770977841e-06,
+ "loss": 0.733,
+ "step": 272
+ },
+ {
+ "epoch": 0.12907801418439716,
+ "grad_norm": 3.5632214546203613,
+ "learning_rate": 4.99767092722235e-06,
+ "loss": 0.7228,
+ "step": 273
+ },
+ {
+ "epoch": 0.12955082742316784,
+ "grad_norm": 3.5900983810424805,
+ "learning_rate": 4.997643927947843e-06,
+ "loss": 0.7634,
+ "step": 274
+ },
+ {
+ "epoch": 0.13002364066193853,
+ "grad_norm": 3.332650661468506,
+ "learning_rate": 4.997616773156e-06,
+ "loss": 0.797,
+ "step": 275
+ },
+ {
+ "epoch": 0.13049645390070921,
+ "grad_norm": 3.1094167232513428,
+ "learning_rate": 4.997589462848512e-06,
+ "loss": 0.7849,
+ "step": 276
+ },
+ {
+ "epoch": 0.1309692671394799,
+ "grad_norm": 3.5359463691711426,
+ "learning_rate": 4.99756199702708e-06,
+ "loss": 0.6871,
+ "step": 277
+ },
+ {
+ "epoch": 0.1314420803782506,
+ "grad_norm": 3.190441846847534,
+ "learning_rate": 4.997534375693414e-06,
+ "loss": 0.6883,
+ "step": 278
+ },
+ {
+ "epoch": 0.13191489361702127,
+ "grad_norm": 3.063518762588501,
+ "learning_rate": 4.997506598849234e-06,
+ "loss": 0.7586,
+ "step": 279
+ },
+ {
+ "epoch": 0.13238770685579196,
+ "grad_norm": 3.4112050533294678,
+ "learning_rate": 4.997478666496269e-06,
+ "loss": 0.796,
+ "step": 280
+ },
+ {
+ "epoch": 0.13286052009456265,
+ "grad_norm": 3.231886386871338,
+ "learning_rate": 4.997450578636259e-06,
+ "loss": 0.7714,
+ "step": 281
+ },
+ {
+ "epoch": 0.13333333333333333,
+ "grad_norm": 3.279425621032715,
+ "learning_rate": 4.9974223352709515e-06,
+ "loss": 0.7793,
+ "step": 282
+ },
+ {
+ "epoch": 0.13380614657210402,
+ "grad_norm": 3.2154316902160645,
+ "learning_rate": 4.9973939364021075e-06,
+ "loss": 0.791,
+ "step": 283
+ },
+ {
+ "epoch": 0.1342789598108747,
+ "grad_norm": 3.2090768814086914,
+ "learning_rate": 4.9973653820314925e-06,
+ "loss": 0.6433,
+ "step": 284
+ },
+ {
+ "epoch": 0.1347517730496454,
+ "grad_norm": 3.1712026596069336,
+ "learning_rate": 4.997336672160886e-06,
+ "loss": 0.8128,
+ "step": 285
+ },
+ {
+ "epoch": 0.13522458628841608,
+ "grad_norm": 2.929229497909546,
+ "learning_rate": 4.997307806792076e-06,
+ "loss": 0.7594,
+ "step": 286
+ },
+ {
+ "epoch": 0.13569739952718676,
+ "grad_norm": 3.0363314151763916,
+ "learning_rate": 4.997278785926859e-06,
+ "loss": 0.7336,
+ "step": 287
+ },
+ {
+ "epoch": 0.13617021276595745,
+ "grad_norm": 3.1352357864379883,
+ "learning_rate": 4.997249609567042e-06,
+ "loss": 0.7225,
+ "step": 288
+ },
+ {
+ "epoch": 0.13664302600472814,
+ "grad_norm": 3.3171157836914062,
+ "learning_rate": 4.997220277714442e-06,
+ "loss": 0.7777,
+ "step": 289
+ },
+ {
+ "epoch": 0.13711583924349882,
+ "grad_norm": 3.050717353820801,
+ "learning_rate": 4.997190790370885e-06,
+ "loss": 0.6836,
+ "step": 290
+ },
+ {
+ "epoch": 0.1375886524822695,
+ "grad_norm": 3.0297694206237793,
+ "learning_rate": 4.997161147538208e-06,
+ "loss": 0.6883,
+ "step": 291
+ },
+ {
+ "epoch": 0.1380614657210402,
+ "grad_norm": 3.0566554069519043,
+ "learning_rate": 4.997131349218256e-06,
+ "loss": 0.6674,
+ "step": 292
+ },
+ {
+ "epoch": 0.13853427895981088,
+ "grad_norm": 3.799111843109131,
+ "learning_rate": 4.997101395412885e-06,
+ "loss": 0.8256,
+ "step": 293
+ },
+ {
+ "epoch": 0.13900709219858157,
+ "grad_norm": 3.1394248008728027,
+ "learning_rate": 4.9970712861239576e-06,
+ "loss": 0.7306,
+ "step": 294
+ },
+ {
+ "epoch": 0.13947990543735225,
+ "grad_norm": 3.0605666637420654,
+ "learning_rate": 4.997041021353352e-06,
+ "loss": 0.7212,
+ "step": 295
+ },
+ {
+ "epoch": 0.13995271867612294,
+ "grad_norm": 3.8813397884368896,
+ "learning_rate": 4.997010601102951e-06,
+ "loss": 0.769,
+ "step": 296
+ },
+ {
+ "epoch": 0.14042553191489363,
+ "grad_norm": 3.0514819622039795,
+ "learning_rate": 4.996980025374649e-06,
+ "loss": 0.7422,
+ "step": 297
+ },
+ {
+ "epoch": 0.1408983451536643,
+ "grad_norm": 2.9544146060943604,
+ "learning_rate": 4.99694929417035e-06,
+ "loss": 0.6912,
+ "step": 298
+ },
+ {
+ "epoch": 0.141371158392435,
+ "grad_norm": 3.2635602951049805,
+ "learning_rate": 4.996918407491966e-06,
+ "loss": 0.7395,
+ "step": 299
+ },
+ {
+ "epoch": 0.14184397163120568,
+ "grad_norm": 3.373882532119751,
+ "learning_rate": 4.996887365341423e-06,
+ "loss": 0.7799,
+ "step": 300
+ },
+ {
+ "epoch": 0.14231678486997637,
+ "grad_norm": 3.001128673553467,
+ "learning_rate": 4.996856167720652e-06,
+ "loss": 0.7168,
+ "step": 301
+ },
+ {
+ "epoch": 0.14278959810874706,
+ "grad_norm": 3.1026835441589355,
+ "learning_rate": 4.996824814631595e-06,
+ "loss": 0.7492,
+ "step": 302
+ },
+ {
+ "epoch": 0.14326241134751774,
+ "grad_norm": 3.41947603225708,
+ "learning_rate": 4.996793306076205e-06,
+ "loss": 0.6659,
+ "step": 303
+ },
+ {
+ "epoch": 0.14373522458628843,
+ "grad_norm": 3.2272400856018066,
+ "learning_rate": 4.996761642056444e-06,
+ "loss": 0.7184,
+ "step": 304
+ },
+ {
+ "epoch": 0.14420803782505912,
+ "grad_norm": 2.9488935470581055,
+ "learning_rate": 4.996729822574284e-06,
+ "loss": 0.7451,
+ "step": 305
+ },
+ {
+ "epoch": 0.14468085106382977,
+ "grad_norm": 3.268231153488159,
+ "learning_rate": 4.9966978476317065e-06,
+ "loss": 0.7798,
+ "step": 306
+ },
+ {
+ "epoch": 0.14515366430260046,
+ "grad_norm": 3.9086556434631348,
+ "learning_rate": 4.996665717230701e-06,
+ "loss": 0.7871,
+ "step": 307
+ },
+ {
+ "epoch": 0.14562647754137115,
+ "grad_norm": 3.3483879566192627,
+ "learning_rate": 4.996633431373269e-06,
+ "loss": 0.7415,
+ "step": 308
+ },
+ {
+ "epoch": 0.14609929078014183,
+ "grad_norm": 2.839400053024292,
+ "learning_rate": 4.99660099006142e-06,
+ "loss": 0.7192,
+ "step": 309
+ },
+ {
+ "epoch": 0.14657210401891252,
+ "grad_norm": 3.177302598953247,
+ "learning_rate": 4.996568393297175e-06,
+ "loss": 0.755,
+ "step": 310
+ },
+ {
+ "epoch": 0.1470449172576832,
+ "grad_norm": 3.5477044582366943,
+ "learning_rate": 4.996535641082563e-06,
+ "loss": 0.7531,
+ "step": 311
+ },
+ {
+ "epoch": 0.1475177304964539,
+ "grad_norm": 3.418576717376709,
+ "learning_rate": 4.996502733419624e-06,
+ "loss": 0.8009,
+ "step": 312
+ },
+ {
+ "epoch": 0.14799054373522458,
+ "grad_norm": 3.711341619491577,
+ "learning_rate": 4.996469670310407e-06,
+ "loss": 0.7362,
+ "step": 313
+ },
+ {
+ "epoch": 0.14846335697399526,
+ "grad_norm": 3.2419373989105225,
+ "learning_rate": 4.99643645175697e-06,
+ "loss": 0.7761,
+ "step": 314
+ },
+ {
+ "epoch": 0.14893617021276595,
+ "grad_norm": 3.121858835220337,
+ "learning_rate": 4.996403077761381e-06,
+ "loss": 0.6495,
+ "step": 315
+ },
+ {
+ "epoch": 0.14940898345153664,
+ "grad_norm": 3.123054265975952,
+ "learning_rate": 4.996369548325719e-06,
+ "loss": 0.7444,
+ "step": 316
+ },
+ {
+ "epoch": 0.14988179669030732,
+ "grad_norm": 2.780880928039551,
+ "learning_rate": 4.996335863452072e-06,
+ "loss": 0.672,
+ "step": 317
+ },
+ {
+ "epoch": 0.150354609929078,
+ "grad_norm": 3.3738629817962646,
+ "learning_rate": 4.996302023142536e-06,
+ "loss": 0.7972,
+ "step": 318
+ },
+ {
+ "epoch": 0.1508274231678487,
+ "grad_norm": 3.4874777793884277,
+ "learning_rate": 4.99626802739922e-06,
+ "loss": 0.8252,
+ "step": 319
+ },
+ {
+ "epoch": 0.15130023640661938,
+ "grad_norm": 3.7074787616729736,
+ "learning_rate": 4.9962338762242395e-06,
+ "loss": 0.8216,
+ "step": 320
+ },
+ {
+ "epoch": 0.15177304964539007,
+ "grad_norm": 3.281912326812744,
+ "learning_rate": 4.996199569619721e-06,
+ "loss": 0.8175,
+ "step": 321
+ },
+ {
+ "epoch": 0.15224586288416075,
+ "grad_norm": 2.9485340118408203,
+ "learning_rate": 4.996165107587801e-06,
+ "loss": 0.707,
+ "step": 322
+ },
+ {
+ "epoch": 0.15271867612293144,
+ "grad_norm": 3.3757646083831787,
+ "learning_rate": 4.996130490130625e-06,
+ "loss": 0.7955,
+ "step": 323
+ },
+ {
+ "epoch": 0.15319148936170213,
+ "grad_norm": 2.962181568145752,
+ "learning_rate": 4.996095717250349e-06,
+ "loss": 0.7067,
+ "step": 324
+ },
+ {
+ "epoch": 0.1536643026004728,
+ "grad_norm": 3.114272356033325,
+ "learning_rate": 4.996060788949136e-06,
+ "loss": 0.7486,
+ "step": 325
+ },
+ {
+ "epoch": 0.1541371158392435,
+ "grad_norm": 3.0621590614318848,
+ "learning_rate": 4.996025705229165e-06,
+ "loss": 0.6547,
+ "step": 326
+ },
+ {
+ "epoch": 0.15460992907801419,
+ "grad_norm": 2.8745882511138916,
+ "learning_rate": 4.995990466092616e-06,
+ "loss": 0.6435,
+ "step": 327
+ },
+ {
+ "epoch": 0.15508274231678487,
+ "grad_norm": 2.90841007232666,
+ "learning_rate": 4.995955071541686e-06,
+ "loss": 0.7331,
+ "step": 328
+ },
+ {
+ "epoch": 0.15555555555555556,
+ "grad_norm": 2.694580316543579,
+ "learning_rate": 4.9959195215785784e-06,
+ "loss": 0.6731,
+ "step": 329
+ },
+ {
+ "epoch": 0.15602836879432624,
+ "grad_norm": 3.158083438873291,
+ "learning_rate": 4.995883816205507e-06,
+ "loss": 0.7257,
+ "step": 330
+ },
+ {
+ "epoch": 0.15650118203309693,
+ "grad_norm": 3.3234715461730957,
+ "learning_rate": 4.995847955424694e-06,
+ "loss": 0.7389,
+ "step": 331
+ },
+ {
+ "epoch": 0.15697399527186762,
+ "grad_norm": 2.9406495094299316,
+ "learning_rate": 4.995811939238373e-06,
+ "loss": 0.643,
+ "step": 332
+ },
+ {
+ "epoch": 0.1574468085106383,
+ "grad_norm": 3.3191726207733154,
+ "learning_rate": 4.995775767648785e-06,
+ "loss": 0.7879,
+ "step": 333
+ },
+ {
+ "epoch": 0.157919621749409,
+ "grad_norm": 3.711925745010376,
+ "learning_rate": 4.995739440658185e-06,
+ "loss": 0.7586,
+ "step": 334
+ },
+ {
+ "epoch": 0.15839243498817968,
+ "grad_norm": 9.573421478271484,
+ "learning_rate": 4.995702958268833e-06,
+ "loss": 0.7842,
+ "step": 335
+ },
+ {
+ "epoch": 0.15886524822695036,
+ "grad_norm": 3.4154508113861084,
+ "learning_rate": 4.995666320483001e-06,
+ "loss": 0.6735,
+ "step": 336
+ },
+ {
+ "epoch": 0.15933806146572105,
+ "grad_norm": 3.4169859886169434,
+ "learning_rate": 4.995629527302971e-06,
+ "loss": 0.741,
+ "step": 337
+ },
+ {
+ "epoch": 0.15981087470449173,
+ "grad_norm": 3.287503242492676,
+ "learning_rate": 4.9955925787310335e-06,
+ "loss": 0.7139,
+ "step": 338
+ },
+ {
+ "epoch": 0.16028368794326242,
+ "grad_norm": 3.288409471511841,
+ "learning_rate": 4.995555474769488e-06,
+ "loss": 0.7636,
+ "step": 339
+ },
+ {
+ "epoch": 0.1607565011820331,
+ "grad_norm": 2.8021693229675293,
+ "learning_rate": 4.995518215420646e-06,
+ "loss": 0.5883,
+ "step": 340
+ },
+ {
+ "epoch": 0.1612293144208038,
+ "grad_norm": 2.7038564682006836,
+ "learning_rate": 4.995480800686827e-06,
+ "loss": 0.657,
+ "step": 341
+ },
+ {
+ "epoch": 0.16170212765957448,
+ "grad_norm": 3.2370235919952393,
+ "learning_rate": 4.9954432305703615e-06,
+ "loss": 0.6999,
+ "step": 342
+ },
+ {
+ "epoch": 0.16217494089834517,
+ "grad_norm": 2.8666412830352783,
+ "learning_rate": 4.995405505073588e-06,
+ "loss": 0.7199,
+ "step": 343
+ },
+ {
+ "epoch": 0.16264775413711585,
+ "grad_norm": 3.6467232704162598,
+ "learning_rate": 4.995367624198856e-06,
+ "loss": 0.7317,
+ "step": 344
+ },
+ {
+ "epoch": 0.16312056737588654,
+ "grad_norm": 2.7576327323913574,
+ "learning_rate": 4.9953295879485246e-06,
+ "loss": 0.647,
+ "step": 345
+ },
+ {
+ "epoch": 0.1635933806146572,
+ "grad_norm": 2.922232151031494,
+ "learning_rate": 4.995291396324959e-06,
+ "loss": 0.6686,
+ "step": 346
+ },
+ {
+ "epoch": 0.16406619385342788,
+ "grad_norm": 2.8693501949310303,
+ "learning_rate": 4.995253049330542e-06,
+ "loss": 0.6756,
+ "step": 347
+ },
+ {
+ "epoch": 0.16453900709219857,
+ "grad_norm": 3.671865701675415,
+ "learning_rate": 4.995214546967658e-06,
+ "loss": 0.7347,
+ "step": 348
+ },
+ {
+ "epoch": 0.16501182033096926,
+ "grad_norm": 3.024219274520874,
+ "learning_rate": 4.995175889238706e-06,
+ "loss": 0.7547,
+ "step": 349
+ },
+ {
+ "epoch": 0.16548463356973994,
+ "grad_norm": 2.8470778465270996,
+ "learning_rate": 4.995137076146091e-06,
+ "loss": 0.6764,
+ "step": 350
+ },
+ {
+ "epoch": 0.16595744680851063,
+ "grad_norm": 2.905057907104492,
+ "learning_rate": 4.9950981076922324e-06,
+ "loss": 0.6814,
+ "step": 351
+ },
+ {
+ "epoch": 0.16643026004728131,
+ "grad_norm": 3.504377841949463,
+ "learning_rate": 4.995058983879555e-06,
+ "loss": 0.7145,
+ "step": 352
+ },
+ {
+ "epoch": 0.166903073286052,
+ "grad_norm": 3.0029661655426025,
+ "learning_rate": 4.995019704710495e-06,
+ "loss": 0.7114,
+ "step": 353
+ },
+ {
+ "epoch": 0.1673758865248227,
+ "grad_norm": 2.8666274547576904,
+ "learning_rate": 4.994980270187499e-06,
+ "loss": 0.7416,
+ "step": 354
+ },
+ {
+ "epoch": 0.16784869976359337,
+ "grad_norm": 3.1644718647003174,
+ "learning_rate": 4.994940680313021e-06,
+ "loss": 0.661,
+ "step": 355
+ },
+ {
+ "epoch": 0.16832151300236406,
+ "grad_norm": 3.050391674041748,
+ "learning_rate": 4.994900935089527e-06,
+ "loss": 0.7243,
+ "step": 356
+ },
+ {
+ "epoch": 0.16879432624113475,
+ "grad_norm": 2.985466480255127,
+ "learning_rate": 4.994861034519491e-06,
+ "loss": 0.6917,
+ "step": 357
+ },
+ {
+ "epoch": 0.16926713947990543,
+ "grad_norm": 2.909342050552368,
+ "learning_rate": 4.9948209786053995e-06,
+ "loss": 0.6636,
+ "step": 358
+ },
+ {
+ "epoch": 0.16973995271867612,
+ "grad_norm": 3.2214784622192383,
+ "learning_rate": 4.9947807673497435e-06,
+ "loss": 0.7903,
+ "step": 359
+ },
+ {
+ "epoch": 0.1702127659574468,
+ "grad_norm": 2.5654983520507812,
+ "learning_rate": 4.994740400755029e-06,
+ "loss": 0.6129,
+ "step": 360
+ },
+ {
+ "epoch": 0.1706855791962175,
+ "grad_norm": 3.775646448135376,
+ "learning_rate": 4.99469987882377e-06,
+ "loss": 0.7145,
+ "step": 361
+ },
+ {
+ "epoch": 0.17115839243498818,
+ "grad_norm": 2.8965413570404053,
+ "learning_rate": 4.994659201558487e-06,
+ "loss": 0.7177,
+ "step": 362
+ },
+ {
+ "epoch": 0.17163120567375886,
+ "grad_norm": 3.485597848892212,
+ "learning_rate": 4.9946183689617146e-06,
+ "loss": 0.8107,
+ "step": 363
+ },
+ {
+ "epoch": 0.17210401891252955,
+ "grad_norm": 3.277839183807373,
+ "learning_rate": 4.994577381035995e-06,
+ "loss": 0.691,
+ "step": 364
+ },
+ {
+ "epoch": 0.17257683215130024,
+ "grad_norm": 2.8807685375213623,
+ "learning_rate": 4.99453623778388e-06,
+ "loss": 0.7627,
+ "step": 365
+ },
+ {
+ "epoch": 0.17304964539007092,
+ "grad_norm": 3.0659940242767334,
+ "learning_rate": 4.994494939207932e-06,
+ "loss": 0.6858,
+ "step": 366
+ },
+ {
+ "epoch": 0.1735224586288416,
+ "grad_norm": 3.0881855487823486,
+ "learning_rate": 4.994453485310723e-06,
+ "loss": 0.8212,
+ "step": 367
+ },
+ {
+ "epoch": 0.1739952718676123,
+ "grad_norm": 2.7199201583862305,
+ "learning_rate": 4.994411876094832e-06,
+ "loss": 0.6516,
+ "step": 368
+ },
+ {
+ "epoch": 0.17446808510638298,
+ "grad_norm": 2.955889940261841,
+ "learning_rate": 4.994370111562851e-06,
+ "loss": 0.6579,
+ "step": 369
+ },
+ {
+ "epoch": 0.17494089834515367,
+ "grad_norm": 3.1321663856506348,
+ "learning_rate": 4.994328191717382e-06,
+ "loss": 0.6891,
+ "step": 370
+ },
+ {
+ "epoch": 0.17541371158392435,
+ "grad_norm": 3.0560388565063477,
+ "learning_rate": 4.994286116561034e-06,
+ "loss": 0.7243,
+ "step": 371
+ },
+ {
+ "epoch": 0.17588652482269504,
+ "grad_norm": 3.1560704708099365,
+ "learning_rate": 4.994243886096425e-06,
+ "loss": 0.7262,
+ "step": 372
+ },
+ {
+ "epoch": 0.17635933806146573,
+ "grad_norm": 2.913541316986084,
+ "learning_rate": 4.994201500326187e-06,
+ "loss": 0.7318,
+ "step": 373
+ },
+ {
+ "epoch": 0.1768321513002364,
+ "grad_norm": 3.098376512527466,
+ "learning_rate": 4.994158959252958e-06,
+ "loss": 0.6419,
+ "step": 374
+ },
+ {
+ "epoch": 0.1773049645390071,
+ "grad_norm": 2.977508544921875,
+ "learning_rate": 4.994116262879387e-06,
+ "loss": 0.6709,
+ "step": 375
+ },
+ {
+ "epoch": 0.17777777777777778,
+ "grad_norm": 3.168186902999878,
+ "learning_rate": 4.994073411208133e-06,
+ "loss": 0.6608,
+ "step": 376
+ },
+ {
+ "epoch": 0.17825059101654847,
+ "grad_norm": 3.436844825744629,
+ "learning_rate": 4.994030404241864e-06,
+ "loss": 0.7227,
+ "step": 377
+ },
+ {
+ "epoch": 0.17872340425531916,
+ "grad_norm": 2.8998289108276367,
+ "learning_rate": 4.993987241983258e-06,
+ "loss": 0.6512,
+ "step": 378
+ },
+ {
+ "epoch": 0.17919621749408984,
+ "grad_norm": 3.407191514968872,
+ "learning_rate": 4.993943924435002e-06,
+ "loss": 0.616,
+ "step": 379
+ },
+ {
+ "epoch": 0.17966903073286053,
+ "grad_norm": 3.744858741760254,
+ "learning_rate": 4.993900451599793e-06,
+ "loss": 0.8599,
+ "step": 380
+ },
+ {
+ "epoch": 0.18014184397163122,
+ "grad_norm": 3.486283779144287,
+ "learning_rate": 4.993856823480338e-06,
+ "loss": 0.6634,
+ "step": 381
+ },
+ {
+ "epoch": 0.1806146572104019,
+ "grad_norm": 2.895719051361084,
+ "learning_rate": 4.993813040079355e-06,
+ "loss": 0.6972,
+ "step": 382
+ },
+ {
+ "epoch": 0.1810874704491726,
+ "grad_norm": 2.814133882522583,
+ "learning_rate": 4.993769101399569e-06,
+ "loss": 0.6271,
+ "step": 383
+ },
+ {
+ "epoch": 0.18156028368794327,
+ "grad_norm": 2.8609800338745117,
+ "learning_rate": 4.993725007443715e-06,
+ "loss": 0.6481,
+ "step": 384
+ },
+ {
+ "epoch": 0.18203309692671396,
+ "grad_norm": 3.2829644680023193,
+ "learning_rate": 4.99368075821454e-06,
+ "loss": 0.7999,
+ "step": 385
+ },
+ {
+ "epoch": 0.18250591016548465,
+ "grad_norm": 3.1417458057403564,
+ "learning_rate": 4.993636353714798e-06,
+ "loss": 0.6972,
+ "step": 386
+ },
+ {
+ "epoch": 0.1829787234042553,
+ "grad_norm": 3.0679385662078857,
+ "learning_rate": 4.993591793947256e-06,
+ "loss": 0.667,
+ "step": 387
+ },
+ {
+ "epoch": 0.183451536643026,
+ "grad_norm": 3.1387410163879395,
+ "learning_rate": 4.993547078914686e-06,
+ "loss": 0.7618,
+ "step": 388
+ },
+ {
+ "epoch": 0.18392434988179668,
+ "grad_norm": 2.9181406497955322,
+ "learning_rate": 4.993502208619872e-06,
+ "loss": 0.7391,
+ "step": 389
+ },
+ {
+ "epoch": 0.18439716312056736,
+ "grad_norm": 2.8952157497406006,
+ "learning_rate": 4.993457183065611e-06,
+ "loss": 0.6988,
+ "step": 390
+ },
+ {
+ "epoch": 0.18486997635933805,
+ "grad_norm": 3.2274813652038574,
+ "learning_rate": 4.993412002254704e-06,
+ "loss": 0.688,
+ "step": 391
+ },
+ {
+ "epoch": 0.18534278959810874,
+ "grad_norm": 3.4693779945373535,
+ "learning_rate": 4.993366666189965e-06,
+ "loss": 0.6634,
+ "step": 392
+ },
+ {
+ "epoch": 0.18581560283687942,
+ "grad_norm": 3.5358526706695557,
+ "learning_rate": 4.993321174874217e-06,
+ "loss": 0.7343,
+ "step": 393
+ },
+ {
+ "epoch": 0.1862884160756501,
+ "grad_norm": 3.013338088989258,
+ "learning_rate": 4.993275528310292e-06,
+ "loss": 0.7579,
+ "step": 394
+ },
+ {
+ "epoch": 0.1867612293144208,
+ "grad_norm": 2.694772720336914,
+ "learning_rate": 4.993229726501033e-06,
+ "loss": 0.718,
+ "step": 395
+ },
+ {
+ "epoch": 0.18723404255319148,
+ "grad_norm": 3.070612907409668,
+ "learning_rate": 4.9931837694492915e-06,
+ "loss": 0.6438,
+ "step": 396
+ },
+ {
+ "epoch": 0.18770685579196217,
+ "grad_norm": 2.9193027019500732,
+ "learning_rate": 4.993137657157928e-06,
+ "loss": 0.6788,
+ "step": 397
+ },
+ {
+ "epoch": 0.18817966903073285,
+ "grad_norm": 3.047682046890259,
+ "learning_rate": 4.993091389629816e-06,
+ "loss": 0.6826,
+ "step": 398
+ },
+ {
+ "epoch": 0.18865248226950354,
+ "grad_norm": 2.9629905223846436,
+ "learning_rate": 4.993044966867834e-06,
+ "loss": 0.7196,
+ "step": 399
+ },
+ {
+ "epoch": 0.18912529550827423,
+ "grad_norm": 3.0692050457000732,
+ "learning_rate": 4.992998388874874e-06,
+ "loss": 0.7015,
+ "step": 400
+ },
+ {
+ "epoch": 0.1895981087470449,
+ "grad_norm": 3.5427212715148926,
+ "learning_rate": 4.992951655653836e-06,
+ "loss": 0.8292,
+ "step": 401
+ },
+ {
+ "epoch": 0.1900709219858156,
+ "grad_norm": 2.643526554107666,
+ "learning_rate": 4.992904767207629e-06,
+ "loss": 0.624,
+ "step": 402
+ },
+ {
+ "epoch": 0.19054373522458629,
+ "grad_norm": 3.1185996532440186,
+ "learning_rate": 4.992857723539173e-06,
+ "loss": 0.7354,
+ "step": 403
+ },
+ {
+ "epoch": 0.19101654846335697,
+ "grad_norm": 3.006856679916382,
+ "learning_rate": 4.992810524651398e-06,
+ "loss": 0.7752,
+ "step": 404
+ },
+ {
+ "epoch": 0.19148936170212766,
+ "grad_norm": 2.9913275241851807,
+ "learning_rate": 4.9927631705472425e-06,
+ "loss": 0.7306,
+ "step": 405
+ },
+ {
+ "epoch": 0.19196217494089834,
+ "grad_norm": 2.6794071197509766,
+ "learning_rate": 4.992715661229655e-06,
+ "loss": 0.6136,
+ "step": 406
+ },
+ {
+ "epoch": 0.19243498817966903,
+ "grad_norm": 3.5933966636657715,
+ "learning_rate": 4.992667996701593e-06,
+ "loss": 0.7024,
+ "step": 407
+ },
+ {
+ "epoch": 0.19290780141843972,
+ "grad_norm": 2.862187623977661,
+ "learning_rate": 4.992620176966025e-06,
+ "loss": 0.692,
+ "step": 408
+ },
+ {
+ "epoch": 0.1933806146572104,
+ "grad_norm": 3.076845407485962,
+ "learning_rate": 4.9925722020259286e-06,
+ "loss": 0.7475,
+ "step": 409
+ },
+ {
+ "epoch": 0.1938534278959811,
+ "grad_norm": 3.372919797897339,
+ "learning_rate": 4.9925240718842895e-06,
+ "loss": 0.6886,
+ "step": 410
+ },
+ {
+ "epoch": 0.19432624113475178,
+ "grad_norm": 2.922977924346924,
+ "learning_rate": 4.992475786544108e-06,
+ "loss": 0.7049,
+ "step": 411
+ },
+ {
+ "epoch": 0.19479905437352246,
+ "grad_norm": 2.908034324645996,
+ "learning_rate": 4.992427346008387e-06,
+ "loss": 0.6498,
+ "step": 412
+ },
+ {
+ "epoch": 0.19527186761229315,
+ "grad_norm": 3.096723794937134,
+ "learning_rate": 4.992378750280144e-06,
+ "loss": 0.7151,
+ "step": 413
+ },
+ {
+ "epoch": 0.19574468085106383,
+ "grad_norm": 2.895237684249878,
+ "learning_rate": 4.992329999362405e-06,
+ "loss": 0.7277,
+ "step": 414
+ },
+ {
+ "epoch": 0.19621749408983452,
+ "grad_norm": 2.718230724334717,
+ "learning_rate": 4.9922810932582065e-06,
+ "loss": 0.6375,
+ "step": 415
+ },
+ {
+ "epoch": 0.1966903073286052,
+ "grad_norm": 3.187743663787842,
+ "learning_rate": 4.992232031970592e-06,
+ "loss": 0.6528,
+ "step": 416
+ },
+ {
+ "epoch": 0.1971631205673759,
+ "grad_norm": 2.996406316757202,
+ "learning_rate": 4.992182815502616e-06,
+ "loss": 0.6552,
+ "step": 417
+ },
+ {
+ "epoch": 0.19763593380614658,
+ "grad_norm": 3.301084041595459,
+ "learning_rate": 4.992133443857345e-06,
+ "loss": 0.7061,
+ "step": 418
+ },
+ {
+ "epoch": 0.19810874704491727,
+ "grad_norm": 3.7874677181243896,
+ "learning_rate": 4.992083917037853e-06,
+ "loss": 0.7859,
+ "step": 419
+ },
+ {
+ "epoch": 0.19858156028368795,
+ "grad_norm": 3.124253511428833,
+ "learning_rate": 4.992034235047222e-06,
+ "loss": 0.7615,
+ "step": 420
+ },
+ {
+ "epoch": 0.19905437352245864,
+ "grad_norm": 3.0488970279693604,
+ "learning_rate": 4.991984397888546e-06,
+ "loss": 0.6916,
+ "step": 421
+ },
+ {
+ "epoch": 0.19952718676122932,
+ "grad_norm": 3.1241321563720703,
+ "learning_rate": 4.991934405564929e-06,
+ "loss": 0.7055,
+ "step": 422
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 3.396632432937622,
+ "learning_rate": 4.991884258079484e-06,
+ "loss": 0.7675,
+ "step": 423
+ },
+ {
+ "epoch": 0.2004728132387707,
+ "grad_norm": 3.7776873111724854,
+ "learning_rate": 4.9918339554353316e-06,
+ "loss": 0.7371,
+ "step": 424
+ },
+ {
+ "epoch": 0.20094562647754138,
+ "grad_norm": 3.3356032371520996,
+ "learning_rate": 4.991783497635606e-06,
+ "loss": 0.6778,
+ "step": 425
+ },
+ {
+ "epoch": 0.20141843971631207,
+ "grad_norm": 2.988856792449951,
+ "learning_rate": 4.9917328846834474e-06,
+ "loss": 0.6795,
+ "step": 426
+ },
+ {
+ "epoch": 0.20189125295508276,
+ "grad_norm": 3.264183282852173,
+ "learning_rate": 4.99168211658201e-06,
+ "loss": 0.7707,
+ "step": 427
+ },
+ {
+ "epoch": 0.20236406619385341,
+ "grad_norm": 3.878068208694458,
+ "learning_rate": 4.991631193334451e-06,
+ "loss": 0.857,
+ "step": 428
+ },
+ {
+ "epoch": 0.2028368794326241,
+ "grad_norm": 3.6377553939819336,
+ "learning_rate": 4.991580114943943e-06,
+ "loss": 0.8033,
+ "step": 429
+ },
+ {
+ "epoch": 0.2033096926713948,
+ "grad_norm": 2.95393967628479,
+ "learning_rate": 4.991528881413667e-06,
+ "loss": 0.6809,
+ "step": 430
+ },
+ {
+ "epoch": 0.20378250591016547,
+ "grad_norm": 3.058704376220703,
+ "learning_rate": 4.9914774927468125e-06,
+ "loss": 0.6664,
+ "step": 431
+ },
+ {
+ "epoch": 0.20425531914893616,
+ "grad_norm": 2.7783217430114746,
+ "learning_rate": 4.9914259489465795e-06,
+ "loss": 0.6478,
+ "step": 432
+ },
+ {
+ "epoch": 0.20472813238770685,
+ "grad_norm": 2.4825217723846436,
+ "learning_rate": 4.991374250016177e-06,
+ "loss": 0.6598,
+ "step": 433
+ },
+ {
+ "epoch": 0.20520094562647753,
+ "grad_norm": 2.8753600120544434,
+ "learning_rate": 4.991322395958824e-06,
+ "loss": 0.6947,
+ "step": 434
+ },
+ {
+ "epoch": 0.20567375886524822,
+ "grad_norm": 3.2339367866516113,
+ "learning_rate": 4.99127038677775e-06,
+ "loss": 0.8201,
+ "step": 435
+ },
+ {
+ "epoch": 0.2061465721040189,
+ "grad_norm": 2.9065537452697754,
+ "learning_rate": 4.991218222476193e-06,
+ "loss": 0.6679,
+ "step": 436
+ },
+ {
+ "epoch": 0.2066193853427896,
+ "grad_norm": 3.283228874206543,
+ "learning_rate": 4.991165903057401e-06,
+ "loss": 0.8039,
+ "step": 437
+ },
+ {
+ "epoch": 0.20709219858156028,
+ "grad_norm": 3.429872751235962,
+ "learning_rate": 4.991113428524631e-06,
+ "loss": 0.7392,
+ "step": 438
+ },
+ {
+ "epoch": 0.20756501182033096,
+ "grad_norm": 3.118943452835083,
+ "learning_rate": 4.991060798881152e-06,
+ "loss": 0.6794,
+ "step": 439
+ },
+ {
+ "epoch": 0.20803782505910165,
+ "grad_norm": 3.395970106124878,
+ "learning_rate": 4.99100801413024e-06,
+ "loss": 0.6862,
+ "step": 440
+ },
+ {
+ "epoch": 0.20851063829787234,
+ "grad_norm": 2.869191884994507,
+ "learning_rate": 4.99095507427518e-06,
+ "loss": 0.6076,
+ "step": 441
+ },
+ {
+ "epoch": 0.20898345153664302,
+ "grad_norm": 3.1934661865234375,
+ "learning_rate": 4.990901979319272e-06,
+ "loss": 0.6927,
+ "step": 442
+ },
+ {
+ "epoch": 0.2094562647754137,
+ "grad_norm": 2.9068603515625,
+ "learning_rate": 4.990848729265819e-06,
+ "loss": 0.6864,
+ "step": 443
+ },
+ {
+ "epoch": 0.2099290780141844,
+ "grad_norm": 3.0535948276519775,
+ "learning_rate": 4.9907953241181375e-06,
+ "loss": 0.6396,
+ "step": 444
+ },
+ {
+ "epoch": 0.21040189125295508,
+ "grad_norm": 2.871511459350586,
+ "learning_rate": 4.990741763879554e-06,
+ "loss": 0.6743,
+ "step": 445
+ },
+ {
+ "epoch": 0.21087470449172577,
+ "grad_norm": 2.9184393882751465,
+ "learning_rate": 4.9906880485534015e-06,
+ "loss": 0.6786,
+ "step": 446
+ },
+ {
+ "epoch": 0.21134751773049645,
+ "grad_norm": 3.0628271102905273,
+ "learning_rate": 4.990634178143026e-06,
+ "loss": 0.6326,
+ "step": 447
+ },
+ {
+ "epoch": 0.21182033096926714,
+ "grad_norm": 3.7878305912017822,
+ "learning_rate": 4.990580152651782e-06,
+ "loss": 0.7944,
+ "step": 448
+ },
+ {
+ "epoch": 0.21229314420803783,
+ "grad_norm": 2.8577189445495605,
+ "learning_rate": 4.990525972083031e-06,
+ "loss": 0.71,
+ "step": 449
+ },
+ {
+ "epoch": 0.2127659574468085,
+ "grad_norm": 3.307769775390625,
+ "learning_rate": 4.99047163644015e-06,
+ "loss": 0.6893,
+ "step": 450
+ },
+ {
+ "epoch": 0.2132387706855792,
+ "grad_norm": 2.7391717433929443,
+ "learning_rate": 4.990417145726519e-06,
+ "loss": 0.712,
+ "step": 451
+ },
+ {
+ "epoch": 0.21371158392434988,
+ "grad_norm": 2.938044786453247,
+ "learning_rate": 4.990362499945534e-06,
+ "loss": 0.7516,
+ "step": 452
+ },
+ {
+ "epoch": 0.21418439716312057,
+ "grad_norm": 2.7831056118011475,
+ "learning_rate": 4.990307699100595e-06,
+ "loss": 0.6168,
+ "step": 453
+ },
+ {
+ "epoch": 0.21465721040189126,
+ "grad_norm": 2.907977342605591,
+ "learning_rate": 4.990252743195116e-06,
+ "loss": 0.6706,
+ "step": 454
+ },
+ {
+ "epoch": 0.21513002364066194,
+ "grad_norm": 3.7882161140441895,
+ "learning_rate": 4.990197632232517e-06,
+ "loss": 0.6847,
+ "step": 455
+ },
+ {
+ "epoch": 0.21560283687943263,
+ "grad_norm": 2.899716854095459,
+ "learning_rate": 4.990142366216232e-06,
+ "loss": 0.6699,
+ "step": 456
+ },
+ {
+ "epoch": 0.21607565011820332,
+ "grad_norm": 2.907003879547119,
+ "learning_rate": 4.990086945149701e-06,
+ "loss": 0.6864,
+ "step": 457
+ },
+ {
+ "epoch": 0.216548463356974,
+ "grad_norm": 3.2407333850860596,
+ "learning_rate": 4.9900313690363736e-06,
+ "loss": 0.692,
+ "step": 458
+ },
+ {
+ "epoch": 0.2170212765957447,
+ "grad_norm": 2.9055583477020264,
+ "learning_rate": 4.989975637879712e-06,
+ "loss": 0.7113,
+ "step": 459
+ },
+ {
+ "epoch": 0.21749408983451538,
+ "grad_norm": 2.9836206436157227,
+ "learning_rate": 4.989919751683184e-06,
+ "loss": 0.6673,
+ "step": 460
+ },
+ {
+ "epoch": 0.21796690307328606,
+ "grad_norm": 3.371035575866699,
+ "learning_rate": 4.989863710450273e-06,
+ "loss": 0.7181,
+ "step": 461
+ },
+ {
+ "epoch": 0.21843971631205675,
+ "grad_norm": 2.9636635780334473,
+ "learning_rate": 4.989807514184465e-06,
+ "loss": 0.6082,
+ "step": 462
+ },
+ {
+ "epoch": 0.21891252955082743,
+ "grad_norm": 2.9634664058685303,
+ "learning_rate": 4.9897511628892615e-06,
+ "loss": 0.7086,
+ "step": 463
+ },
+ {
+ "epoch": 0.21938534278959812,
+ "grad_norm": 3.154763698577881,
+ "learning_rate": 4.98969465656817e-06,
+ "loss": 0.7027,
+ "step": 464
+ },
+ {
+ "epoch": 0.2198581560283688,
+ "grad_norm": 2.9959890842437744,
+ "learning_rate": 4.98963799522471e-06,
+ "loss": 0.6498,
+ "step": 465
+ },
+ {
+ "epoch": 0.2203309692671395,
+ "grad_norm": 3.5470590591430664,
+ "learning_rate": 4.989581178862408e-06,
+ "loss": 0.7199,
+ "step": 466
+ },
+ {
+ "epoch": 0.22080378250591018,
+ "grad_norm": 7.1873369216918945,
+ "learning_rate": 4.989524207484802e-06,
+ "loss": 0.6676,
+ "step": 467
+ },
+ {
+ "epoch": 0.22127659574468084,
+ "grad_norm": 3.1099541187286377,
+ "learning_rate": 4.98946708109544e-06,
+ "loss": 0.6785,
+ "step": 468
+ },
+ {
+ "epoch": 0.22174940898345152,
+ "grad_norm": 2.830991506576538,
+ "learning_rate": 4.9894097996978795e-06,
+ "loss": 0.6456,
+ "step": 469
+ },
+ {
+ "epoch": 0.2222222222222222,
+ "grad_norm": 3.0212316513061523,
+ "learning_rate": 4.989352363295687e-06,
+ "loss": 0.6048,
+ "step": 470
+ },
+ {
+ "epoch": 0.2226950354609929,
+ "grad_norm": 3.18776798248291,
+ "learning_rate": 4.989294771892437e-06,
+ "loss": 0.7078,
+ "step": 471
+ },
+ {
+ "epoch": 0.22316784869976358,
+ "grad_norm": 2.9972598552703857,
+ "learning_rate": 4.989237025491717e-06,
+ "loss": 0.7082,
+ "step": 472
+ },
+ {
+ "epoch": 0.22364066193853427,
+ "grad_norm": 3.4935688972473145,
+ "learning_rate": 4.989179124097123e-06,
+ "loss": 0.8199,
+ "step": 473
+ },
+ {
+ "epoch": 0.22411347517730495,
+ "grad_norm": 2.6485543251037598,
+ "learning_rate": 4.9891210677122595e-06,
+ "loss": 0.6371,
+ "step": 474
+ },
+ {
+ "epoch": 0.22458628841607564,
+ "grad_norm": 2.969233512878418,
+ "learning_rate": 4.989062856340742e-06,
+ "loss": 0.6879,
+ "step": 475
+ },
+ {
+ "epoch": 0.22505910165484633,
+ "grad_norm": 2.881875514984131,
+ "learning_rate": 4.989004489986194e-06,
+ "loss": 0.7415,
+ "step": 476
+ },
+ {
+ "epoch": 0.225531914893617,
+ "grad_norm": 2.624540090560913,
+ "learning_rate": 4.98894596865225e-06,
+ "loss": 0.6522,
+ "step": 477
+ },
+ {
+ "epoch": 0.2260047281323877,
+ "grad_norm": 3.61075496673584,
+ "learning_rate": 4.988887292342555e-06,
+ "loss": 0.7109,
+ "step": 478
+ },
+ {
+ "epoch": 0.2264775413711584,
+ "grad_norm": 2.9368972778320312,
+ "learning_rate": 4.988828461060762e-06,
+ "loss": 0.6843,
+ "step": 479
+ },
+ {
+ "epoch": 0.22695035460992907,
+ "grad_norm": 3.0670197010040283,
+ "learning_rate": 4.988769474810533e-06,
+ "loss": 0.6807,
+ "step": 480
+ },
+ {
+ "epoch": 0.22742316784869976,
+ "grad_norm": 2.9662792682647705,
+ "learning_rate": 4.988710333595542e-06,
+ "loss": 0.6796,
+ "step": 481
+ },
+ {
+ "epoch": 0.22789598108747045,
+ "grad_norm": 2.971235752105713,
+ "learning_rate": 4.988651037419472e-06,
+ "loss": 0.696,
+ "step": 482
+ },
+ {
+ "epoch": 0.22836879432624113,
+ "grad_norm": 2.931884527206421,
+ "learning_rate": 4.988591586286013e-06,
+ "loss": 0.7323,
+ "step": 483
+ },
+ {
+ "epoch": 0.22884160756501182,
+ "grad_norm": 2.8114213943481445,
+ "learning_rate": 4.988531980198868e-06,
+ "loss": 0.6584,
+ "step": 484
+ },
+ {
+ "epoch": 0.2293144208037825,
+ "grad_norm": 3.2785916328430176,
+ "learning_rate": 4.98847221916175e-06,
+ "loss": 0.7514,
+ "step": 485
+ },
+ {
+ "epoch": 0.2297872340425532,
+ "grad_norm": 3.0520215034484863,
+ "learning_rate": 4.988412303178377e-06,
+ "loss": 0.7564,
+ "step": 486
+ },
+ {
+ "epoch": 0.23026004728132388,
+ "grad_norm": 3.181002616882324,
+ "learning_rate": 4.988352232252483e-06,
+ "loss": 0.6768,
+ "step": 487
+ },
+ {
+ "epoch": 0.23073286052009456,
+ "grad_norm": 3.4953625202178955,
+ "learning_rate": 4.988292006387805e-06,
+ "loss": 0.7143,
+ "step": 488
+ },
+ {
+ "epoch": 0.23120567375886525,
+ "grad_norm": 3.326571226119995,
+ "learning_rate": 4.988231625588096e-06,
+ "loss": 0.7318,
+ "step": 489
+ },
+ {
+ "epoch": 0.23167848699763594,
+ "grad_norm": 3.09614634513855,
+ "learning_rate": 4.988171089857113e-06,
+ "loss": 0.6574,
+ "step": 490
+ },
+ {
+ "epoch": 0.23215130023640662,
+ "grad_norm": 2.7439446449279785,
+ "learning_rate": 4.9881103991986265e-06,
+ "loss": 0.6637,
+ "step": 491
+ },
+ {
+ "epoch": 0.2326241134751773,
+ "grad_norm": 3.0681190490722656,
+ "learning_rate": 4.988049553616416e-06,
+ "loss": 0.6326,
+ "step": 492
+ },
+ {
+ "epoch": 0.233096926713948,
+ "grad_norm": 3.0757341384887695,
+ "learning_rate": 4.98798855311427e-06,
+ "loss": 0.695,
+ "step": 493
+ },
+ {
+ "epoch": 0.23356973995271868,
+ "grad_norm": 2.8637635707855225,
+ "learning_rate": 4.987927397695985e-06,
+ "loss": 0.6598,
+ "step": 494
+ },
+ {
+ "epoch": 0.23404255319148937,
+ "grad_norm": 3.3641068935394287,
+ "learning_rate": 4.9878660873653715e-06,
+ "loss": 0.7435,
+ "step": 495
+ },
+ {
+ "epoch": 0.23451536643026005,
+ "grad_norm": 3.5025596618652344,
+ "learning_rate": 4.987804622126245e-06,
+ "loss": 0.735,
+ "step": 496
+ },
+ {
+ "epoch": 0.23498817966903074,
+ "grad_norm": 2.9298837184906006,
+ "learning_rate": 4.987743001982434e-06,
+ "loss": 0.7063,
+ "step": 497
+ },
+ {
+ "epoch": 0.23546099290780143,
+ "grad_norm": 2.70358943939209,
+ "learning_rate": 4.987681226937774e-06,
+ "loss": 0.6799,
+ "step": 498
+ },
+ {
+ "epoch": 0.2359338061465721,
+ "grad_norm": 3.027871608734131,
+ "learning_rate": 4.9876192969961125e-06,
+ "loss": 0.6881,
+ "step": 499
+ },
+ {
+ "epoch": 0.2364066193853428,
+ "grad_norm": 3.362306594848633,
+ "learning_rate": 4.987557212161304e-06,
+ "loss": 0.7906,
+ "step": 500
+ },
+ {
+ "epoch": 0.23687943262411348,
+ "grad_norm": 3.3136050701141357,
+ "learning_rate": 4.987494972437217e-06,
+ "loss": 0.6878,
+ "step": 501
+ },
+ {
+ "epoch": 0.23735224586288417,
+ "grad_norm": 3.017089605331421,
+ "learning_rate": 4.9874325778277255e-06,
+ "loss": 0.7279,
+ "step": 502
+ },
+ {
+ "epoch": 0.23782505910165486,
+ "grad_norm": 2.8300516605377197,
+ "learning_rate": 4.987370028336714e-06,
+ "loss": 0.6864,
+ "step": 503
+ },
+ {
+ "epoch": 0.23829787234042554,
+ "grad_norm": 3.201860189437866,
+ "learning_rate": 4.987307323968077e-06,
+ "loss": 0.7531,
+ "step": 504
+ },
+ {
+ "epoch": 0.23877068557919623,
+ "grad_norm": 2.685396194458008,
+ "learning_rate": 4.987244464725721e-06,
+ "loss": 0.5849,
+ "step": 505
+ },
+ {
+ "epoch": 0.23924349881796692,
+ "grad_norm": 2.8715312480926514,
+ "learning_rate": 4.987181450613557e-06,
+ "loss": 0.675,
+ "step": 506
+ },
+ {
+ "epoch": 0.2397163120567376,
+ "grad_norm": 2.813908815383911,
+ "learning_rate": 4.987118281635511e-06,
+ "loss": 0.6841,
+ "step": 507
+ },
+ {
+ "epoch": 0.2401891252955083,
+ "grad_norm": 3.2738473415374756,
+ "learning_rate": 4.987054957795514e-06,
+ "loss": 0.7158,
+ "step": 508
+ },
+ {
+ "epoch": 0.24066193853427895,
+ "grad_norm": 2.896134376525879,
+ "learning_rate": 4.986991479097511e-06,
+ "loss": 0.7542,
+ "step": 509
+ },
+ {
+ "epoch": 0.24113475177304963,
+ "grad_norm": 3.0390403270721436,
+ "learning_rate": 4.986927845545454e-06,
+ "loss": 0.6733,
+ "step": 510
+ },
+ {
+ "epoch": 0.24160756501182032,
+ "grad_norm": 3.0300254821777344,
+ "learning_rate": 4.9868640571433044e-06,
+ "loss": 0.722,
+ "step": 511
+ },
+ {
+ "epoch": 0.242080378250591,
+ "grad_norm": 3.3037352561950684,
+ "learning_rate": 4.986800113895035e-06,
+ "loss": 0.6811,
+ "step": 512
+ },
+ {
+ "epoch": 0.2425531914893617,
+ "grad_norm": 3.0358474254608154,
+ "learning_rate": 4.986736015804627e-06,
+ "loss": 0.7348,
+ "step": 513
+ },
+ {
+ "epoch": 0.24302600472813238,
+ "grad_norm": 3.108792304992676,
+ "learning_rate": 4.986671762876071e-06,
+ "loss": 0.6096,
+ "step": 514
+ },
+ {
+ "epoch": 0.24349881796690306,
+ "grad_norm": 3.1316237449645996,
+ "learning_rate": 4.986607355113367e-06,
+ "loss": 0.6357,
+ "step": 515
+ },
+ {
+ "epoch": 0.24397163120567375,
+ "grad_norm": 3.3095219135284424,
+ "learning_rate": 4.986542792520528e-06,
+ "loss": 0.7515,
+ "step": 516
+ },
+ {
+ "epoch": 0.24444444444444444,
+ "grad_norm": 3.4775984287261963,
+ "learning_rate": 4.986478075101572e-06,
+ "loss": 0.7104,
+ "step": 517
+ },
+ {
+ "epoch": 0.24491725768321512,
+ "grad_norm": 3.341708183288574,
+ "learning_rate": 4.986413202860528e-06,
+ "loss": 0.7339,
+ "step": 518
+ },
+ {
+ "epoch": 0.2453900709219858,
+ "grad_norm": 2.9646966457366943,
+ "learning_rate": 4.986348175801438e-06,
+ "loss": 0.6032,
+ "step": 519
+ },
+ {
+ "epoch": 0.2458628841607565,
+ "grad_norm": 3.1853902339935303,
+ "learning_rate": 4.986282993928349e-06,
+ "loss": 0.6925,
+ "step": 520
+ },
+ {
+ "epoch": 0.24633569739952718,
+ "grad_norm": 3.286909818649292,
+ "learning_rate": 4.98621765724532e-06,
+ "loss": 0.7447,
+ "step": 521
+ },
+ {
+ "epoch": 0.24680851063829787,
+ "grad_norm": 3.2255051136016846,
+ "learning_rate": 4.986152165756419e-06,
+ "loss": 0.7747,
+ "step": 522
+ },
+ {
+ "epoch": 0.24728132387706855,
+ "grad_norm": 3.002352237701416,
+ "learning_rate": 4.986086519465724e-06,
+ "loss": 0.6472,
+ "step": 523
+ },
+ {
+ "epoch": 0.24775413711583924,
+ "grad_norm": 3.4738974571228027,
+ "learning_rate": 4.986020718377322e-06,
+ "loss": 0.7381,
+ "step": 524
+ },
+ {
+ "epoch": 0.24822695035460993,
+ "grad_norm": 3.4470200538635254,
+ "learning_rate": 4.985954762495312e-06,
+ "loss": 0.6878,
+ "step": 525
+ },
+ {
+ "epoch": 0.2486997635933806,
+ "grad_norm": 2.9219350814819336,
+ "learning_rate": 4.985888651823799e-06,
+ "loss": 0.6317,
+ "step": 526
+ },
+ {
+ "epoch": 0.2491725768321513,
+ "grad_norm": 3.061767101287842,
+ "learning_rate": 4.985822386366899e-06,
+ "loss": 0.6842,
+ "step": 527
+ },
+ {
+ "epoch": 0.24964539007092199,
+ "grad_norm": 3.0291247367858887,
+ "learning_rate": 4.985755966128742e-06,
+ "loss": 0.6852,
+ "step": 528
+ },
+ {
+ "epoch": 0.25011820330969264,
+ "grad_norm": 2.964280843734741,
+ "learning_rate": 4.985689391113457e-06,
+ "loss": 0.7738,
+ "step": 529
+ },
+ {
+ "epoch": 0.25059101654846333,
+ "grad_norm": 3.058302164077759,
+ "learning_rate": 4.9856226613251955e-06,
+ "loss": 0.6677,
+ "step": 530
+ },
+ {
+ "epoch": 0.251063829787234,
+ "grad_norm": 3.345141649246216,
+ "learning_rate": 4.985555776768109e-06,
+ "loss": 0.7837,
+ "step": 531
+ },
+ {
+ "epoch": 0.2515366430260047,
+ "grad_norm": 3.565031051635742,
+ "learning_rate": 4.9854887374463636e-06,
+ "loss": 0.7231,
+ "step": 532
+ },
+ {
+ "epoch": 0.2520094562647754,
+ "grad_norm": 2.7953789234161377,
+ "learning_rate": 4.985421543364132e-06,
+ "loss": 0.6102,
+ "step": 533
+ },
+ {
+ "epoch": 0.2524822695035461,
+ "grad_norm": 2.887606620788574,
+ "learning_rate": 4.9853541945256e-06,
+ "loss": 0.6289,
+ "step": 534
+ },
+ {
+ "epoch": 0.25295508274231676,
+ "grad_norm": 3.1480495929718018,
+ "learning_rate": 4.985286690934961e-06,
+ "loss": 0.6348,
+ "step": 535
+ },
+ {
+ "epoch": 0.25342789598108745,
+ "grad_norm": 2.8912761211395264,
+ "learning_rate": 4.985219032596416e-06,
+ "loss": 0.595,
+ "step": 536
+ },
+ {
+ "epoch": 0.25390070921985813,
+ "grad_norm": 2.947936534881592,
+ "learning_rate": 4.98515121951418e-06,
+ "loss": 0.6196,
+ "step": 537
+ },
+ {
+ "epoch": 0.2543735224586288,
+ "grad_norm": 3.1085827350616455,
+ "learning_rate": 4.985083251692474e-06,
+ "loss": 0.6387,
+ "step": 538
+ },
+ {
+ "epoch": 0.2548463356973995,
+ "grad_norm": 3.1688334941864014,
+ "learning_rate": 4.985015129135531e-06,
+ "loss": 0.7055,
+ "step": 539
+ },
+ {
+ "epoch": 0.2553191489361702,
+ "grad_norm": 3.075042963027954,
+ "learning_rate": 4.984946851847593e-06,
+ "loss": 0.7515,
+ "step": 540
+ },
+ {
+ "epoch": 0.2557919621749409,
+ "grad_norm": 3.1933093070983887,
+ "learning_rate": 4.98487841983291e-06,
+ "loss": 0.7054,
+ "step": 541
+ },
+ {
+ "epoch": 0.25626477541371157,
+ "grad_norm": 3.043473958969116,
+ "learning_rate": 4.984809833095744e-06,
+ "loss": 0.6281,
+ "step": 542
+ },
+ {
+ "epoch": 0.25673758865248225,
+ "grad_norm": 3.0532584190368652,
+ "learning_rate": 4.9847410916403645e-06,
+ "loss": 0.6155,
+ "step": 543
+ },
+ {
+ "epoch": 0.25721040189125294,
+ "grad_norm": 3.608480215072632,
+ "learning_rate": 4.984672195471053e-06,
+ "loss": 0.7363,
+ "step": 544
+ },
+ {
+ "epoch": 0.2576832151300236,
+ "grad_norm": 2.7491862773895264,
+ "learning_rate": 4.9846031445921e-06,
+ "loss": 0.6594,
+ "step": 545
+ },
+ {
+ "epoch": 0.2581560283687943,
+ "grad_norm": 2.8602418899536133,
+ "learning_rate": 4.984533939007802e-06,
+ "loss": 0.6742,
+ "step": 546
+ },
+ {
+ "epoch": 0.258628841607565,
+ "grad_norm": 3.1782007217407227,
+ "learning_rate": 4.98446457872247e-06,
+ "loss": 0.731,
+ "step": 547
+ },
+ {
+ "epoch": 0.2591016548463357,
+ "grad_norm": 2.796147584915161,
+ "learning_rate": 4.984395063740423e-06,
+ "loss": 0.6617,
+ "step": 548
+ },
+ {
+ "epoch": 0.25957446808510637,
+ "grad_norm": 2.8392202854156494,
+ "learning_rate": 4.984325394065991e-06,
+ "loss": 0.6753,
+ "step": 549
+ },
+ {
+ "epoch": 0.26004728132387706,
+ "grad_norm": 3.134672164916992,
+ "learning_rate": 4.984255569703508e-06,
+ "loss": 0.7222,
+ "step": 550
+ },
+ {
+ "epoch": 0.26052009456264774,
+ "grad_norm": 2.734330177307129,
+ "learning_rate": 4.984185590657325e-06,
+ "loss": 0.6098,
+ "step": 551
+ },
+ {
+ "epoch": 0.26099290780141843,
+ "grad_norm": 3.739010810852051,
+ "learning_rate": 4.984115456931798e-06,
+ "loss": 0.7457,
+ "step": 552
+ },
+ {
+ "epoch": 0.2614657210401891,
+ "grad_norm": 2.8412528038024902,
+ "learning_rate": 4.9840451685312925e-06,
+ "loss": 0.6972,
+ "step": 553
+ },
+ {
+ "epoch": 0.2619385342789598,
+ "grad_norm": 3.017395496368408,
+ "learning_rate": 4.983974725460188e-06,
+ "loss": 0.6887,
+ "step": 554
+ },
+ {
+ "epoch": 0.2624113475177305,
+ "grad_norm": 3.2746949195861816,
+ "learning_rate": 4.98390412772287e-06,
+ "loss": 0.7047,
+ "step": 555
+ },
+ {
+ "epoch": 0.2628841607565012,
+ "grad_norm": 3.1561965942382812,
+ "learning_rate": 4.983833375323732e-06,
+ "loss": 0.7726,
+ "step": 556
+ },
+ {
+ "epoch": 0.26335697399527186,
+ "grad_norm": 3.2367217540740967,
+ "learning_rate": 4.9837624682671816e-06,
+ "loss": 0.6348,
+ "step": 557
+ },
+ {
+ "epoch": 0.26382978723404255,
+ "grad_norm": 2.8195858001708984,
+ "learning_rate": 4.983691406557633e-06,
+ "loss": 0.6387,
+ "step": 558
+ },
+ {
+ "epoch": 0.26430260047281323,
+ "grad_norm": 3.349820852279663,
+ "learning_rate": 4.983620190199511e-06,
+ "loss": 0.6776,
+ "step": 559
+ },
+ {
+ "epoch": 0.2647754137115839,
+ "grad_norm": 2.8025588989257812,
+ "learning_rate": 4.98354881919725e-06,
+ "loss": 0.6512,
+ "step": 560
+ },
+ {
+ "epoch": 0.2652482269503546,
+ "grad_norm": 2.9125499725341797,
+ "learning_rate": 4.983477293555295e-06,
+ "loss": 0.7024,
+ "step": 561
+ },
+ {
+ "epoch": 0.2657210401891253,
+ "grad_norm": 3.3479275703430176,
+ "learning_rate": 4.983405613278098e-06,
+ "loss": 0.688,
+ "step": 562
+ },
+ {
+ "epoch": 0.266193853427896,
+ "grad_norm": 3.123971462249756,
+ "learning_rate": 4.983333778370123e-06,
+ "loss": 0.6743,
+ "step": 563
+ },
+ {
+ "epoch": 0.26666666666666666,
+ "grad_norm": 2.891625165939331,
+ "learning_rate": 4.983261788835843e-06,
+ "loss": 0.5971,
+ "step": 564
+ },
+ {
+ "epoch": 0.26713947990543735,
+ "grad_norm": 3.5066864490509033,
+ "learning_rate": 4.98318964467974e-06,
+ "loss": 0.6958,
+ "step": 565
+ },
+ {
+ "epoch": 0.26761229314420804,
+ "grad_norm": 2.570547342300415,
+ "learning_rate": 4.983117345906306e-06,
+ "loss": 0.609,
+ "step": 566
+ },
+ {
+ "epoch": 0.2680851063829787,
+ "grad_norm": 3.005106210708618,
+ "learning_rate": 4.983044892520044e-06,
+ "loss": 0.6791,
+ "step": 567
+ },
+ {
+ "epoch": 0.2685579196217494,
+ "grad_norm": 3.429675340652466,
+ "learning_rate": 4.982972284525463e-06,
+ "loss": 0.6625,
+ "step": 568
+ },
+ {
+ "epoch": 0.2690307328605201,
+ "grad_norm": 3.825657367706299,
+ "learning_rate": 4.982899521927086e-06,
+ "loss": 0.6368,
+ "step": 569
+ },
+ {
+ "epoch": 0.2695035460992908,
+ "grad_norm": 2.8699095249176025,
+ "learning_rate": 4.982826604729443e-06,
+ "loss": 0.6425,
+ "step": 570
+ },
+ {
+ "epoch": 0.26997635933806147,
+ "grad_norm": 3.1688714027404785,
+ "learning_rate": 4.982753532937074e-06,
+ "loss": 0.6904,
+ "step": 571
+ },
+ {
+ "epoch": 0.27044917257683215,
+ "grad_norm": 3.3889992237091064,
+ "learning_rate": 4.98268030655453e-06,
+ "loss": 0.7575,
+ "step": 572
+ },
+ {
+ "epoch": 0.27092198581560284,
+ "grad_norm": 3.108315944671631,
+ "learning_rate": 4.982606925586367e-06,
+ "loss": 0.6648,
+ "step": 573
+ },
+ {
+ "epoch": 0.2713947990543735,
+ "grad_norm": 3.209831953048706,
+ "learning_rate": 4.982533390037159e-06,
+ "loss": 0.657,
+ "step": 574
+ },
+ {
+ "epoch": 0.2718676122931442,
+ "grad_norm": 3.1740927696228027,
+ "learning_rate": 4.982459699911482e-06,
+ "loss": 0.7262,
+ "step": 575
+ },
+ {
+ "epoch": 0.2723404255319149,
+ "grad_norm": 3.0190417766571045,
+ "learning_rate": 4.982385855213924e-06,
+ "loss": 0.6368,
+ "step": 576
+ },
+ {
+ "epoch": 0.2728132387706856,
+ "grad_norm": 3.05049467086792,
+ "learning_rate": 4.982311855949084e-06,
+ "loss": 0.72,
+ "step": 577
+ },
+ {
+ "epoch": 0.27328605200945627,
+ "grad_norm": 2.984816551208496,
+ "learning_rate": 4.98223770212157e-06,
+ "loss": 0.6856,
+ "step": 578
+ },
+ {
+ "epoch": 0.27375886524822696,
+ "grad_norm": 2.744969606399536,
+ "learning_rate": 4.982163393735998e-06,
+ "loss": 0.6023,
+ "step": 579
+ },
+ {
+ "epoch": 0.27423167848699764,
+ "grad_norm": 3.170564889907837,
+ "learning_rate": 4.982088930796996e-06,
+ "loss": 0.6678,
+ "step": 580
+ },
+ {
+ "epoch": 0.27470449172576833,
+ "grad_norm": 2.8686118125915527,
+ "learning_rate": 4.982014313309199e-06,
+ "loss": 0.6157,
+ "step": 581
+ },
+ {
+ "epoch": 0.275177304964539,
+ "grad_norm": 2.8768694400787354,
+ "learning_rate": 4.981939541277254e-06,
+ "loss": 0.6566,
+ "step": 582
+ },
+ {
+ "epoch": 0.2756501182033097,
+ "grad_norm": 2.621481418609619,
+ "learning_rate": 4.981864614705818e-06,
+ "loss": 0.7372,
+ "step": 583
+ },
+ {
+ "epoch": 0.2761229314420804,
+ "grad_norm": 3.527374267578125,
+ "learning_rate": 4.981789533599554e-06,
+ "loss": 0.6485,
+ "step": 584
+ },
+ {
+ "epoch": 0.2765957446808511,
+ "grad_norm": 3.3141074180603027,
+ "learning_rate": 4.981714297963138e-06,
+ "loss": 0.6816,
+ "step": 585
+ },
+ {
+ "epoch": 0.27706855791962176,
+ "grad_norm": 2.9247069358825684,
+ "learning_rate": 4.981638907801255e-06,
+ "loss": 0.7217,
+ "step": 586
+ },
+ {
+ "epoch": 0.27754137115839245,
+ "grad_norm": 2.875236749649048,
+ "learning_rate": 4.981563363118599e-06,
+ "loss": 0.6662,
+ "step": 587
+ },
+ {
+ "epoch": 0.27801418439716313,
+ "grad_norm": 2.9540364742279053,
+ "learning_rate": 4.981487663919874e-06,
+ "loss": 0.7225,
+ "step": 588
+ },
+ {
+ "epoch": 0.2784869976359338,
+ "grad_norm": 2.90889310836792,
+ "learning_rate": 4.981411810209793e-06,
+ "loss": 0.6054,
+ "step": 589
+ },
+ {
+ "epoch": 0.2789598108747045,
+ "grad_norm": 2.8541409969329834,
+ "learning_rate": 4.981335801993078e-06,
+ "loss": 0.6539,
+ "step": 590
+ },
+ {
+ "epoch": 0.2794326241134752,
+ "grad_norm": 3.1600730419158936,
+ "learning_rate": 4.981259639274465e-06,
+ "loss": 0.6415,
+ "step": 591
+ },
+ {
+ "epoch": 0.2799054373522459,
+ "grad_norm": 3.569376230239868,
+ "learning_rate": 4.981183322058693e-06,
+ "loss": 0.6944,
+ "step": 592
+ },
+ {
+ "epoch": 0.28037825059101656,
+ "grad_norm": 3.067667007446289,
+ "learning_rate": 4.981106850350515e-06,
+ "loss": 0.7378,
+ "step": 593
+ },
+ {
+ "epoch": 0.28085106382978725,
+ "grad_norm": 3.082073450088501,
+ "learning_rate": 4.981030224154693e-06,
+ "loss": 0.693,
+ "step": 594
+ },
+ {
+ "epoch": 0.28132387706855794,
+ "grad_norm": 2.902932643890381,
+ "learning_rate": 4.980953443475998e-06,
+ "loss": 0.6549,
+ "step": 595
+ },
+ {
+ "epoch": 0.2817966903073286,
+ "grad_norm": 2.6821181774139404,
+ "learning_rate": 4.980876508319211e-06,
+ "loss": 0.6231,
+ "step": 596
+ },
+ {
+ "epoch": 0.2822695035460993,
+ "grad_norm": 3.1747355461120605,
+ "learning_rate": 4.9807994186891215e-06,
+ "loss": 0.6826,
+ "step": 597
+ },
+ {
+ "epoch": 0.28274231678487,
+ "grad_norm": 2.6975860595703125,
+ "learning_rate": 4.980722174590531e-06,
+ "loss": 0.6669,
+ "step": 598
+ },
+ {
+ "epoch": 0.2832151300236407,
+ "grad_norm": 2.924285650253296,
+ "learning_rate": 4.9806447760282486e-06,
+ "loss": 0.689,
+ "step": 599
+ },
+ {
+ "epoch": 0.28368794326241137,
+ "grad_norm": 2.941417694091797,
+ "learning_rate": 4.980567223007093e-06,
+ "loss": 0.6672,
+ "step": 600
+ },
+ {
+ "epoch": 0.28416075650118205,
+ "grad_norm": 2.8582186698913574,
+ "learning_rate": 4.980489515531892e-06,
+ "loss": 0.6229,
+ "step": 601
+ },
+ {
+ "epoch": 0.28463356973995274,
+ "grad_norm": 2.6462013721466064,
+ "learning_rate": 4.9804116536074865e-06,
+ "loss": 0.606,
+ "step": 602
+ },
+ {
+ "epoch": 0.2851063829787234,
+ "grad_norm": 2.9029998779296875,
+ "learning_rate": 4.980333637238723e-06,
+ "loss": 0.5915,
+ "step": 603
+ },
+ {
+ "epoch": 0.2855791962174941,
+ "grad_norm": 3.9359042644500732,
+ "learning_rate": 4.980255466430462e-06,
+ "loss": 0.7035,
+ "step": 604
+ },
+ {
+ "epoch": 0.2860520094562648,
+ "grad_norm": 3.200524091720581,
+ "learning_rate": 4.980177141187566e-06,
+ "loss": 0.7156,
+ "step": 605
+ },
+ {
+ "epoch": 0.2865248226950355,
+ "grad_norm": 3.1708686351776123,
+ "learning_rate": 4.980098661514916e-06,
+ "loss": 0.746,
+ "step": 606
+ },
+ {
+ "epoch": 0.28699763593380617,
+ "grad_norm": 2.8926830291748047,
+ "learning_rate": 4.980020027417397e-06,
+ "loss": 0.6282,
+ "step": 607
+ },
+ {
+ "epoch": 0.28747044917257686,
+ "grad_norm": 3.0526294708251953,
+ "learning_rate": 4.979941238899906e-06,
+ "loss": 0.6594,
+ "step": 608
+ },
+ {
+ "epoch": 0.28794326241134754,
+ "grad_norm": 2.9869306087493896,
+ "learning_rate": 4.9798622959673486e-06,
+ "loss": 0.7771,
+ "step": 609
+ },
+ {
+ "epoch": 0.28841607565011823,
+ "grad_norm": 2.7894513607025146,
+ "learning_rate": 4.979783198624638e-06,
+ "loss": 0.6819,
+ "step": 610
+ },
+ {
+ "epoch": 0.28888888888888886,
+ "grad_norm": 2.958575963973999,
+ "learning_rate": 4.9797039468767025e-06,
+ "loss": 0.6474,
+ "step": 611
+ },
+ {
+ "epoch": 0.28936170212765955,
+ "grad_norm": 3.423748016357422,
+ "learning_rate": 4.979624540728475e-06,
+ "loss": 0.7389,
+ "step": 612
+ },
+ {
+ "epoch": 0.28983451536643023,
+ "grad_norm": 2.9641635417938232,
+ "learning_rate": 4.9795449801849e-06,
+ "loss": 0.6005,
+ "step": 613
+ },
+ {
+ "epoch": 0.2903073286052009,
+ "grad_norm": 3.02274227142334,
+ "learning_rate": 4.979465265250933e-06,
+ "loss": 0.6358,
+ "step": 614
+ },
+ {
+ "epoch": 0.2907801418439716,
+ "grad_norm": 3.0562758445739746,
+ "learning_rate": 4.979385395931534e-06,
+ "loss": 0.6313,
+ "step": 615
+ },
+ {
+ "epoch": 0.2912529550827423,
+ "grad_norm": 3.301816701889038,
+ "learning_rate": 4.97930537223168e-06,
+ "loss": 0.7264,
+ "step": 616
+ },
+ {
+ "epoch": 0.291725768321513,
+ "grad_norm": 2.975360870361328,
+ "learning_rate": 4.979225194156351e-06,
+ "loss": 0.613,
+ "step": 617
+ },
+ {
+ "epoch": 0.29219858156028367,
+ "grad_norm": 2.9245030879974365,
+ "learning_rate": 4.97914486171054e-06,
+ "loss": 0.6646,
+ "step": 618
+ },
+ {
+ "epoch": 0.29267139479905435,
+ "grad_norm": 3.1336188316345215,
+ "learning_rate": 4.979064374899249e-06,
+ "loss": 0.6421,
+ "step": 619
+ },
+ {
+ "epoch": 0.29314420803782504,
+ "grad_norm": 3.6298763751983643,
+ "learning_rate": 4.978983733727491e-06,
+ "loss": 0.6433,
+ "step": 620
+ },
+ {
+ "epoch": 0.2936170212765957,
+ "grad_norm": 2.919597625732422,
+ "learning_rate": 4.9789029382002845e-06,
+ "loss": 0.6288,
+ "step": 621
+ },
+ {
+ "epoch": 0.2940898345153664,
+ "grad_norm": 3.2206127643585205,
+ "learning_rate": 4.978821988322662e-06,
+ "loss": 0.7102,
+ "step": 622
+ },
+ {
+ "epoch": 0.2945626477541371,
+ "grad_norm": 3.1767101287841797,
+ "learning_rate": 4.978740884099664e-06,
+ "loss": 0.6722,
+ "step": 623
+ },
+ {
+ "epoch": 0.2950354609929078,
+ "grad_norm": 3.3425452709198,
+ "learning_rate": 4.97865962553634e-06,
+ "loss": 0.6492,
+ "step": 624
+ },
+ {
+ "epoch": 0.29550827423167847,
+ "grad_norm": 3.0408358573913574,
+ "learning_rate": 4.97857821263775e-06,
+ "loss": 0.6522,
+ "step": 625
+ },
+ {
+ "epoch": 0.29598108747044916,
+ "grad_norm": 2.8144783973693848,
+ "learning_rate": 4.978496645408963e-06,
+ "loss": 0.7237,
+ "step": 626
+ },
+ {
+ "epoch": 0.29645390070921984,
+ "grad_norm": 3.7010560035705566,
+ "learning_rate": 4.978414923855057e-06,
+ "loss": 0.7509,
+ "step": 627
+ },
+ {
+ "epoch": 0.29692671394799053,
+ "grad_norm": 2.9438371658325195,
+ "learning_rate": 4.978333047981122e-06,
+ "loss": 0.6244,
+ "step": 628
+ },
+ {
+ "epoch": 0.2973995271867612,
+ "grad_norm": 3.285982370376587,
+ "learning_rate": 4.978251017792255e-06,
+ "loss": 0.7553,
+ "step": 629
+ },
+ {
+ "epoch": 0.2978723404255319,
+ "grad_norm": 3.7021138668060303,
+ "learning_rate": 4.978168833293564e-06,
+ "loss": 0.7859,
+ "step": 630
+ },
+ {
+ "epoch": 0.2983451536643026,
+ "grad_norm": 3.481858730316162,
+ "learning_rate": 4.9780864944901654e-06,
+ "loss": 0.7146,
+ "step": 631
+ },
+ {
+ "epoch": 0.2988179669030733,
+ "grad_norm": 3.693824529647827,
+ "learning_rate": 4.978004001387188e-06,
+ "loss": 0.6608,
+ "step": 632
+ },
+ {
+ "epoch": 0.29929078014184396,
+ "grad_norm": 3.0069146156311035,
+ "learning_rate": 4.9779213539897665e-06,
+ "loss": 0.6506,
+ "step": 633
+ },
+ {
+ "epoch": 0.29976359338061465,
+ "grad_norm": 3.037644147872925,
+ "learning_rate": 4.977838552303048e-06,
+ "loss": 0.6487,
+ "step": 634
+ },
+ {
+ "epoch": 0.30023640661938533,
+ "grad_norm": 3.018554449081421,
+ "learning_rate": 4.977755596332188e-06,
+ "loss": 0.6128,
+ "step": 635
+ },
+ {
+ "epoch": 0.300709219858156,
+ "grad_norm": 3.000312089920044,
+ "learning_rate": 4.977672486082351e-06,
+ "loss": 0.6431,
+ "step": 636
+ },
+ {
+ "epoch": 0.3011820330969267,
+ "grad_norm": 2.836803913116455,
+ "learning_rate": 4.977589221558713e-06,
+ "loss": 0.5914,
+ "step": 637
+ },
+ {
+ "epoch": 0.3016548463356974,
+ "grad_norm": 3.080469846725464,
+ "learning_rate": 4.977505802766457e-06,
+ "loss": 0.7265,
+ "step": 638
+ },
+ {
+ "epoch": 0.3021276595744681,
+ "grad_norm": 3.2245471477508545,
+ "learning_rate": 4.97742222971078e-06,
+ "loss": 0.6895,
+ "step": 639
+ },
+ {
+ "epoch": 0.30260047281323876,
+ "grad_norm": 3.559006452560425,
+ "learning_rate": 4.977338502396882e-06,
+ "loss": 0.7439,
+ "step": 640
+ },
+ {
+ "epoch": 0.30307328605200945,
+ "grad_norm": 2.9116289615631104,
+ "learning_rate": 4.9772546208299795e-06,
+ "loss": 0.6907,
+ "step": 641
+ },
+ {
+ "epoch": 0.30354609929078014,
+ "grad_norm": 3.3645524978637695,
+ "learning_rate": 4.977170585015295e-06,
+ "loss": 0.6983,
+ "step": 642
+ },
+ {
+ "epoch": 0.3040189125295508,
+ "grad_norm": 3.080148458480835,
+ "learning_rate": 4.977086394958058e-06,
+ "loss": 0.7016,
+ "step": 643
+ },
+ {
+ "epoch": 0.3044917257683215,
+ "grad_norm": 2.9276750087738037,
+ "learning_rate": 4.977002050663515e-06,
+ "loss": 0.6509,
+ "step": 644
+ },
+ {
+ "epoch": 0.3049645390070922,
+ "grad_norm": 3.183609962463379,
+ "learning_rate": 4.976917552136914e-06,
+ "loss": 0.6814,
+ "step": 645
+ },
+ {
+ "epoch": 0.3054373522458629,
+ "grad_norm": 3.0980000495910645,
+ "learning_rate": 4.976832899383519e-06,
+ "loss": 0.6319,
+ "step": 646
+ },
+ {
+ "epoch": 0.30591016548463357,
+ "grad_norm": 3.211376190185547,
+ "learning_rate": 4.9767480924086e-06,
+ "loss": 0.6365,
+ "step": 647
+ },
+ {
+ "epoch": 0.30638297872340425,
+ "grad_norm": 3.214430093765259,
+ "learning_rate": 4.976663131217437e-06,
+ "loss": 0.6006,
+ "step": 648
+ },
+ {
+ "epoch": 0.30685579196217494,
+ "grad_norm": 3.0914318561553955,
+ "learning_rate": 4.976578015815321e-06,
+ "loss": 0.7162,
+ "step": 649
+ },
+ {
+ "epoch": 0.3073286052009456,
+ "grad_norm": 2.7644500732421875,
+ "learning_rate": 4.976492746207551e-06,
+ "loss": 0.6045,
+ "step": 650
+ },
+ {
+ "epoch": 0.3078014184397163,
+ "grad_norm": 3.1913280487060547,
+ "learning_rate": 4.9764073223994374e-06,
+ "loss": 0.6796,
+ "step": 651
+ },
+ {
+ "epoch": 0.308274231678487,
+ "grad_norm": 2.8919692039489746,
+ "learning_rate": 4.976321744396299e-06,
+ "loss": 0.6683,
+ "step": 652
+ },
+ {
+ "epoch": 0.3087470449172577,
+ "grad_norm": 2.862234115600586,
+ "learning_rate": 4.976236012203463e-06,
+ "loss": 0.6631,
+ "step": 653
+ },
+ {
+ "epoch": 0.30921985815602837,
+ "grad_norm": 2.9708092212677,
+ "learning_rate": 4.976150125826268e-06,
+ "loss": 0.6326,
+ "step": 654
+ },
+ {
+ "epoch": 0.30969267139479906,
+ "grad_norm": 2.892465353012085,
+ "learning_rate": 4.976064085270063e-06,
+ "loss": 0.6574,
+ "step": 655
+ },
+ {
+ "epoch": 0.31016548463356974,
+ "grad_norm": 3.9215126037597656,
+ "learning_rate": 4.975977890540205e-06,
+ "loss": 0.7351,
+ "step": 656
+ },
+ {
+ "epoch": 0.31063829787234043,
+ "grad_norm": 2.9544081687927246,
+ "learning_rate": 4.975891541642059e-06,
+ "loss": 0.7264,
+ "step": 657
+ },
+ {
+ "epoch": 0.3111111111111111,
+ "grad_norm": 2.995035409927368,
+ "learning_rate": 4.975805038581005e-06,
+ "loss": 0.7405,
+ "step": 658
+ },
+ {
+ "epoch": 0.3115839243498818,
+ "grad_norm": 2.9653120040893555,
+ "learning_rate": 4.975718381362427e-06,
+ "loss": 0.679,
+ "step": 659
+ },
+ {
+ "epoch": 0.3120567375886525,
+ "grad_norm": 2.93976092338562,
+ "learning_rate": 4.9756315699917205e-06,
+ "loss": 0.627,
+ "step": 660
+ },
+ {
+ "epoch": 0.3125295508274232,
+ "grad_norm": 3.106522560119629,
+ "learning_rate": 4.9755446044742915e-06,
+ "loss": 0.6329,
+ "step": 661
+ },
+ {
+ "epoch": 0.31300236406619386,
+ "grad_norm": 3.0238280296325684,
+ "learning_rate": 4.975457484815554e-06,
+ "loss": 0.6643,
+ "step": 662
+ },
+ {
+ "epoch": 0.31347517730496455,
+ "grad_norm": 2.943528175354004,
+ "learning_rate": 4.9753702110209356e-06,
+ "loss": 0.668,
+ "step": 663
+ },
+ {
+ "epoch": 0.31394799054373523,
+ "grad_norm": 2.6840121746063232,
+ "learning_rate": 4.9752827830958676e-06,
+ "loss": 0.5482,
+ "step": 664
+ },
+ {
+ "epoch": 0.3144208037825059,
+ "grad_norm": 2.823875904083252,
+ "learning_rate": 4.975195201045794e-06,
+ "loss": 0.7017,
+ "step": 665
+ },
+ {
+ "epoch": 0.3148936170212766,
+ "grad_norm": 3.148181200027466,
+ "learning_rate": 4.975107464876168e-06,
+ "loss": 0.747,
+ "step": 666
+ },
+ {
+ "epoch": 0.3153664302600473,
+ "grad_norm": 2.630584478378296,
+ "learning_rate": 4.9750195745924545e-06,
+ "loss": 0.5987,
+ "step": 667
+ },
+ {
+ "epoch": 0.315839243498818,
+ "grad_norm": 3.075866460800171,
+ "learning_rate": 4.974931530200124e-06,
+ "loss": 0.664,
+ "step": 668
+ },
+ {
+ "epoch": 0.31631205673758866,
+ "grad_norm": 2.947197914123535,
+ "learning_rate": 4.974843331704659e-06,
+ "loss": 0.631,
+ "step": 669
+ },
+ {
+ "epoch": 0.31678486997635935,
+ "grad_norm": 3.519646644592285,
+ "learning_rate": 4.974754979111552e-06,
+ "loss": 0.7154,
+ "step": 670
+ },
+ {
+ "epoch": 0.31725768321513004,
+ "grad_norm": 2.8687186241149902,
+ "learning_rate": 4.974666472426305e-06,
+ "loss": 0.6366,
+ "step": 671
+ },
+ {
+ "epoch": 0.3177304964539007,
+ "grad_norm": 2.6966612339019775,
+ "learning_rate": 4.974577811654426e-06,
+ "loss": 0.7112,
+ "step": 672
+ },
+ {
+ "epoch": 0.3182033096926714,
+ "grad_norm": 3.1390228271484375,
+ "learning_rate": 4.974488996801439e-06,
+ "loss": 0.6882,
+ "step": 673
+ },
+ {
+ "epoch": 0.3186761229314421,
+ "grad_norm": 3.4667599201202393,
+ "learning_rate": 4.974400027872871e-06,
+ "loss": 0.7153,
+ "step": 674
+ },
+ {
+ "epoch": 0.3191489361702128,
+ "grad_norm": 2.9632184505462646,
+ "learning_rate": 4.974310904874265e-06,
+ "loss": 0.7081,
+ "step": 675
+ },
+ {
+ "epoch": 0.31962174940898347,
+ "grad_norm": 3.46150279045105,
+ "learning_rate": 4.9742216278111666e-06,
+ "loss": 0.6242,
+ "step": 676
+ },
+ {
+ "epoch": 0.32009456264775416,
+ "grad_norm": 3.380403757095337,
+ "learning_rate": 4.974132196689137e-06,
+ "loss": 0.6863,
+ "step": 677
+ },
+ {
+ "epoch": 0.32056737588652484,
+ "grad_norm": 3.4279606342315674,
+ "learning_rate": 4.974042611513746e-06,
+ "loss": 0.6388,
+ "step": 678
+ },
+ {
+ "epoch": 0.3210401891252955,
+ "grad_norm": 2.634523391723633,
+ "learning_rate": 4.973952872290568e-06,
+ "loss": 0.6038,
+ "step": 679
+ },
+ {
+ "epoch": 0.3215130023640662,
+ "grad_norm": 3.19693922996521,
+ "learning_rate": 4.973862979025194e-06,
+ "loss": 0.6383,
+ "step": 680
+ },
+ {
+ "epoch": 0.3219858156028369,
+ "grad_norm": 3.437692165374756,
+ "learning_rate": 4.973772931723218e-06,
+ "loss": 0.7288,
+ "step": 681
+ },
+ {
+ "epoch": 0.3224586288416076,
+ "grad_norm": 2.506301164627075,
+ "learning_rate": 4.97368273039025e-06,
+ "loss": 0.5707,
+ "step": 682
+ },
+ {
+ "epoch": 0.3229314420803783,
+ "grad_norm": 3.0942845344543457,
+ "learning_rate": 4.9735923750319044e-06,
+ "loss": 0.6348,
+ "step": 683
+ },
+ {
+ "epoch": 0.32340425531914896,
+ "grad_norm": 3.0889835357666016,
+ "learning_rate": 4.973501865653809e-06,
+ "loss": 0.6697,
+ "step": 684
+ },
+ {
+ "epoch": 0.32387706855791965,
+ "grad_norm": 3.0391931533813477,
+ "learning_rate": 4.973411202261598e-06,
+ "loss": 0.7091,
+ "step": 685
+ },
+ {
+ "epoch": 0.32434988179669033,
+ "grad_norm": 3.0333497524261475,
+ "learning_rate": 4.973320384860917e-06,
+ "loss": 0.6403,
+ "step": 686
+ },
+ {
+ "epoch": 0.324822695035461,
+ "grad_norm": 2.9714622497558594,
+ "learning_rate": 4.973229413457421e-06,
+ "loss": 0.6977,
+ "step": 687
+ },
+ {
+ "epoch": 0.3252955082742317,
+ "grad_norm": 3.057558298110962,
+ "learning_rate": 4.973138288056774e-06,
+ "loss": 0.7236,
+ "step": 688
+ },
+ {
+ "epoch": 0.3257683215130024,
+ "grad_norm": 2.921093463897705,
+ "learning_rate": 4.97304700866465e-06,
+ "loss": 0.576,
+ "step": 689
+ },
+ {
+ "epoch": 0.3262411347517731,
+ "grad_norm": 3.0287256240844727,
+ "learning_rate": 4.972955575286732e-06,
+ "loss": 0.7077,
+ "step": 690
+ },
+ {
+ "epoch": 0.32671394799054376,
+ "grad_norm": 2.8621346950531006,
+ "learning_rate": 4.972863987928716e-06,
+ "loss": 0.6952,
+ "step": 691
+ },
+ {
+ "epoch": 0.3271867612293144,
+ "grad_norm": 2.631359100341797,
+ "learning_rate": 4.9727722465963006e-06,
+ "loss": 0.6931,
+ "step": 692
+ },
+ {
+ "epoch": 0.3276595744680851,
+ "grad_norm": 2.8484320640563965,
+ "learning_rate": 4.972680351295201e-06,
+ "loss": 0.6292,
+ "step": 693
+ },
+ {
+ "epoch": 0.32813238770685577,
+ "grad_norm": 2.593001365661621,
+ "learning_rate": 4.972588302031138e-06,
+ "loss": 0.5942,
+ "step": 694
+ },
+ {
+ "epoch": 0.32860520094562645,
+ "grad_norm": 2.6321065425872803,
+ "learning_rate": 4.972496098809844e-06,
+ "loss": 0.65,
+ "step": 695
+ },
+ {
+ "epoch": 0.32907801418439714,
+ "grad_norm": 3.2516732215881348,
+ "learning_rate": 4.972403741637059e-06,
+ "loss": 0.7385,
+ "step": 696
+ },
+ {
+ "epoch": 0.3295508274231678,
+ "grad_norm": 3.180854320526123,
+ "learning_rate": 4.972311230518535e-06,
+ "loss": 0.6569,
+ "step": 697
+ },
+ {
+ "epoch": 0.3300236406619385,
+ "grad_norm": 4.161016941070557,
+ "learning_rate": 4.972218565460031e-06,
+ "loss": 0.6416,
+ "step": 698
+ },
+ {
+ "epoch": 0.3304964539007092,
+ "grad_norm": 3.153897762298584,
+ "learning_rate": 4.972125746467317e-06,
+ "loss": 0.7196,
+ "step": 699
+ },
+ {
+ "epoch": 0.3309692671394799,
+ "grad_norm": 2.9595556259155273,
+ "learning_rate": 4.972032773546173e-06,
+ "loss": 0.7093,
+ "step": 700
+ },
+ {
+ "epoch": 0.33144208037825057,
+ "grad_norm": 3.1086833477020264,
+ "learning_rate": 4.9719396467023875e-06,
+ "loss": 0.6963,
+ "step": 701
+ },
+ {
+ "epoch": 0.33191489361702126,
+ "grad_norm": 2.958921432495117,
+ "learning_rate": 4.971846365941759e-06,
+ "loss": 0.6518,
+ "step": 702
+ },
+ {
+ "epoch": 0.33238770685579194,
+ "grad_norm": 2.8745479583740234,
+ "learning_rate": 4.971752931270096e-06,
+ "loss": 0.696,
+ "step": 703
+ },
+ {
+ "epoch": 0.33286052009456263,
+ "grad_norm": 3.224358558654785,
+ "learning_rate": 4.971659342693217e-06,
+ "loss": 0.6769,
+ "step": 704
+ },
+ {
+ "epoch": 0.3333333333333333,
+ "grad_norm": 2.696319580078125,
+ "learning_rate": 4.9715656002169486e-06,
+ "loss": 0.6833,
+ "step": 705
+ },
+ {
+ "epoch": 0.333806146572104,
+ "grad_norm": 2.9283502101898193,
+ "learning_rate": 4.971471703847127e-06,
+ "loss": 0.6784,
+ "step": 706
+ },
+ {
+ "epoch": 0.3342789598108747,
+ "grad_norm": 2.654914140701294,
+ "learning_rate": 4.9713776535896e-06,
+ "loss": 0.6337,
+ "step": 707
+ },
+ {
+ "epoch": 0.3347517730496454,
+ "grad_norm": 3.041555643081665,
+ "learning_rate": 4.971283449450224e-06,
+ "loss": 0.6227,
+ "step": 708
+ },
+ {
+ "epoch": 0.33522458628841606,
+ "grad_norm": 2.893008232116699,
+ "learning_rate": 4.971189091434863e-06,
+ "loss": 0.655,
+ "step": 709
+ },
+ {
+ "epoch": 0.33569739952718675,
+ "grad_norm": 2.8806653022766113,
+ "learning_rate": 4.971094579549393e-06,
+ "loss": 0.7077,
+ "step": 710
+ },
+ {
+ "epoch": 0.33617021276595743,
+ "grad_norm": 3.4830048084259033,
+ "learning_rate": 4.9709999137996986e-06,
+ "loss": 0.7461,
+ "step": 711
+ },
+ {
+ "epoch": 0.3366430260047281,
+ "grad_norm": 3.155444860458374,
+ "learning_rate": 4.970905094191674e-06,
+ "loss": 0.652,
+ "step": 712
+ },
+ {
+ "epoch": 0.3371158392434988,
+ "grad_norm": 2.7608706951141357,
+ "learning_rate": 4.970810120731225e-06,
+ "loss": 0.684,
+ "step": 713
+ },
+ {
+ "epoch": 0.3375886524822695,
+ "grad_norm": 2.8209474086761475,
+ "learning_rate": 4.970714993424265e-06,
+ "loss": 0.6009,
+ "step": 714
+ },
+ {
+ "epoch": 0.3380614657210402,
+ "grad_norm": 3.6532654762268066,
+ "learning_rate": 4.9706197122767145e-06,
+ "loss": 0.702,
+ "step": 715
+ },
+ {
+ "epoch": 0.33853427895981086,
+ "grad_norm": 2.6276566982269287,
+ "learning_rate": 4.970524277294508e-06,
+ "loss": 0.6338,
+ "step": 716
+ },
+ {
+ "epoch": 0.33900709219858155,
+ "grad_norm": 3.509871482849121,
+ "learning_rate": 4.970428688483589e-06,
+ "loss": 0.6853,
+ "step": 717
+ },
+ {
+ "epoch": 0.33947990543735224,
+ "grad_norm": 5.332682132720947,
+ "learning_rate": 4.970332945849906e-06,
+ "loss": 0.6684,
+ "step": 718
+ },
+ {
+ "epoch": 0.3399527186761229,
+ "grad_norm": 2.718801975250244,
+ "learning_rate": 4.970237049399424e-06,
+ "loss": 0.6676,
+ "step": 719
+ },
+ {
+ "epoch": 0.3404255319148936,
+ "grad_norm": 3.891003131866455,
+ "learning_rate": 4.970140999138112e-06,
+ "loss": 0.7043,
+ "step": 720
+ },
+ {
+ "epoch": 0.3408983451536643,
+ "grad_norm": 2.8863155841827393,
+ "learning_rate": 4.970044795071951e-06,
+ "loss": 0.6563,
+ "step": 721
+ },
+ {
+ "epoch": 0.341371158392435,
+ "grad_norm": 3.2527518272399902,
+ "learning_rate": 4.969948437206932e-06,
+ "loss": 0.7244,
+ "step": 722
+ },
+ {
+ "epoch": 0.34184397163120567,
+ "grad_norm": 2.9726758003234863,
+ "learning_rate": 4.969851925549054e-06,
+ "loss": 0.6548,
+ "step": 723
+ },
+ {
+ "epoch": 0.34231678486997635,
+ "grad_norm": 3.118309497833252,
+ "learning_rate": 4.969755260104327e-06,
+ "loss": 0.7293,
+ "step": 724
+ },
+ {
+ "epoch": 0.34278959810874704,
+ "grad_norm": 3.373068332672119,
+ "learning_rate": 4.969658440878769e-06,
+ "loss": 0.6444,
+ "step": 725
+ },
+ {
+ "epoch": 0.3432624113475177,
+ "grad_norm": 2.7157437801361084,
+ "learning_rate": 4.969561467878409e-06,
+ "loss": 0.642,
+ "step": 726
+ },
+ {
+ "epoch": 0.3437352245862884,
+ "grad_norm": 2.58929705619812,
+ "learning_rate": 4.969464341109285e-06,
+ "loss": 0.6165,
+ "step": 727
+ },
+ {
+ "epoch": 0.3442080378250591,
+ "grad_norm": 2.8811306953430176,
+ "learning_rate": 4.969367060577445e-06,
+ "loss": 0.7127,
+ "step": 728
+ },
+ {
+ "epoch": 0.3446808510638298,
+ "grad_norm": 3.494358539581299,
+ "learning_rate": 4.969269626288946e-06,
+ "loss": 0.7103,
+ "step": 729
+ },
+ {
+ "epoch": 0.34515366430260047,
+ "grad_norm": 2.9753928184509277,
+ "learning_rate": 4.969172038249855e-06,
+ "loss": 0.6911,
+ "step": 730
+ },
+ {
+ "epoch": 0.34562647754137116,
+ "grad_norm": 3.2885913848876953,
+ "learning_rate": 4.969074296466247e-06,
+ "loss": 0.6968,
+ "step": 731
+ },
+ {
+ "epoch": 0.34609929078014184,
+ "grad_norm": 2.7564568519592285,
+ "learning_rate": 4.968976400944211e-06,
+ "loss": 0.6843,
+ "step": 732
+ },
+ {
+ "epoch": 0.34657210401891253,
+ "grad_norm": 2.9255006313323975,
+ "learning_rate": 4.96887835168984e-06,
+ "loss": 0.6024,
+ "step": 733
+ },
+ {
+ "epoch": 0.3470449172576832,
+ "grad_norm": 3.1808290481567383,
+ "learning_rate": 4.968780148709239e-06,
+ "loss": 0.7377,
+ "step": 734
+ },
+ {
+ "epoch": 0.3475177304964539,
+ "grad_norm": 2.956666946411133,
+ "learning_rate": 4.968681792008523e-06,
+ "loss": 0.65,
+ "step": 735
+ },
+ {
+ "epoch": 0.3479905437352246,
+ "grad_norm": 2.9631855487823486,
+ "learning_rate": 4.9685832815938175e-06,
+ "loss": 0.677,
+ "step": 736
+ },
+ {
+ "epoch": 0.3484633569739953,
+ "grad_norm": 2.501917600631714,
+ "learning_rate": 4.968484617471256e-06,
+ "loss": 0.6282,
+ "step": 737
+ },
+ {
+ "epoch": 0.34893617021276596,
+ "grad_norm": 2.750779628753662,
+ "learning_rate": 4.968385799646981e-06,
+ "loss": 0.6507,
+ "step": 738
+ },
+ {
+ "epoch": 0.34940898345153665,
+ "grad_norm": 2.872300624847412,
+ "learning_rate": 4.968286828127146e-06,
+ "loss": 0.5949,
+ "step": 739
+ },
+ {
+ "epoch": 0.34988179669030733,
+ "grad_norm": 2.6316142082214355,
+ "learning_rate": 4.9681877029179124e-06,
+ "loss": 0.6328,
+ "step": 740
+ },
+ {
+ "epoch": 0.350354609929078,
+ "grad_norm": 3.244364023208618,
+ "learning_rate": 4.968088424025454e-06,
+ "loss": 0.7393,
+ "step": 741
+ },
+ {
+ "epoch": 0.3508274231678487,
+ "grad_norm": 2.620465040206909,
+ "learning_rate": 4.967988991455951e-06,
+ "loss": 0.6797,
+ "step": 742
+ },
+ {
+ "epoch": 0.3513002364066194,
+ "grad_norm": 2.854513645172119,
+ "learning_rate": 4.967889405215596e-06,
+ "loss": 0.6368,
+ "step": 743
+ },
+ {
+ "epoch": 0.3517730496453901,
+ "grad_norm": 2.579854726791382,
+ "learning_rate": 4.9677896653105886e-06,
+ "loss": 0.6489,
+ "step": 744
+ },
+ {
+ "epoch": 0.35224586288416077,
+ "grad_norm": 3.0697381496429443,
+ "learning_rate": 4.96768977174714e-06,
+ "loss": 0.6313,
+ "step": 745
+ },
+ {
+ "epoch": 0.35271867612293145,
+ "grad_norm": 3.369338035583496,
+ "learning_rate": 4.96758972453147e-06,
+ "loss": 0.7416,
+ "step": 746
+ },
+ {
+ "epoch": 0.35319148936170214,
+ "grad_norm": 2.836221933364868,
+ "learning_rate": 4.967489523669807e-06,
+ "loss": 0.6422,
+ "step": 747
+ },
+ {
+ "epoch": 0.3536643026004728,
+ "grad_norm": 2.929579496383667,
+ "learning_rate": 4.967389169168392e-06,
+ "loss": 0.6482,
+ "step": 748
+ },
+ {
+ "epoch": 0.3541371158392435,
+ "grad_norm": 2.9243831634521484,
+ "learning_rate": 4.967288661033472e-06,
+ "loss": 0.5813,
+ "step": 749
+ },
+ {
+ "epoch": 0.3546099290780142,
+ "grad_norm": 3.7555336952209473,
+ "learning_rate": 4.967187999271306e-06,
+ "loss": 0.6501,
+ "step": 750
+ },
+ {
+ "epoch": 0.3550827423167849,
+ "grad_norm": 3.4279143810272217,
+ "learning_rate": 4.9670871838881615e-06,
+ "loss": 0.6326,
+ "step": 751
+ },
+ {
+ "epoch": 0.35555555555555557,
+ "grad_norm": 2.875066041946411,
+ "learning_rate": 4.9669862148903166e-06,
+ "loss": 0.664,
+ "step": 752
+ },
+ {
+ "epoch": 0.35602836879432626,
+ "grad_norm": 3.130394697189331,
+ "learning_rate": 4.966885092284057e-06,
+ "loss": 0.706,
+ "step": 753
+ },
+ {
+ "epoch": 0.35650118203309694,
+ "grad_norm": 2.9606287479400635,
+ "learning_rate": 4.96678381607568e-06,
+ "loss": 0.693,
+ "step": 754
+ },
+ {
+ "epoch": 0.35697399527186763,
+ "grad_norm": 3.0584909915924072,
+ "learning_rate": 4.966682386271491e-06,
+ "loss": 0.6034,
+ "step": 755
+ },
+ {
+ "epoch": 0.3574468085106383,
+ "grad_norm": 2.8215200901031494,
+ "learning_rate": 4.966580802877805e-06,
+ "loss": 0.6217,
+ "step": 756
+ },
+ {
+ "epoch": 0.357919621749409,
+ "grad_norm": 2.7348055839538574,
+ "learning_rate": 4.966479065900949e-06,
+ "loss": 0.6194,
+ "step": 757
+ },
+ {
+ "epoch": 0.3583924349881797,
+ "grad_norm": 3.2347466945648193,
+ "learning_rate": 4.966377175347257e-06,
+ "loss": 0.6377,
+ "step": 758
+ },
+ {
+ "epoch": 0.3588652482269504,
+ "grad_norm": 3.311845302581787,
+ "learning_rate": 4.966275131223072e-06,
+ "loss": 0.6234,
+ "step": 759
+ },
+ {
+ "epoch": 0.35933806146572106,
+ "grad_norm": 3.0384368896484375,
+ "learning_rate": 4.96617293353475e-06,
+ "loss": 0.609,
+ "step": 760
+ },
+ {
+ "epoch": 0.35981087470449175,
+ "grad_norm": 3.516854763031006,
+ "learning_rate": 4.966070582288653e-06,
+ "loss": 0.6627,
+ "step": 761
+ },
+ {
+ "epoch": 0.36028368794326243,
+ "grad_norm": 3.2425215244293213,
+ "learning_rate": 4.9659680774911534e-06,
+ "loss": 0.7355,
+ "step": 762
+ },
+ {
+ "epoch": 0.3607565011820331,
+ "grad_norm": 3.2665750980377197,
+ "learning_rate": 4.965865419148636e-06,
+ "loss": 0.6787,
+ "step": 763
+ },
+ {
+ "epoch": 0.3612293144208038,
+ "grad_norm": 2.729428291320801,
+ "learning_rate": 4.96576260726749e-06,
+ "loss": 0.6272,
+ "step": 764
+ },
+ {
+ "epoch": 0.3617021276595745,
+ "grad_norm": 3.299969434738159,
+ "learning_rate": 4.965659641854119e-06,
+ "loss": 0.6552,
+ "step": 765
+ },
+ {
+ "epoch": 0.3621749408983452,
+ "grad_norm": 2.7090916633605957,
+ "learning_rate": 4.965556522914934e-06,
+ "loss": 0.6661,
+ "step": 766
+ },
+ {
+ "epoch": 0.36264775413711586,
+ "grad_norm": 2.488846778869629,
+ "learning_rate": 4.965453250456355e-06,
+ "loss": 0.5821,
+ "step": 767
+ },
+ {
+ "epoch": 0.36312056737588655,
+ "grad_norm": 2.5267233848571777,
+ "learning_rate": 4.965349824484813e-06,
+ "loss": 0.5593,
+ "step": 768
+ },
+ {
+ "epoch": 0.36359338061465724,
+ "grad_norm": 3.0646679401397705,
+ "learning_rate": 4.965246245006748e-06,
+ "loss": 0.6341,
+ "step": 769
+ },
+ {
+ "epoch": 0.3640661938534279,
+ "grad_norm": 2.9877712726593018,
+ "learning_rate": 4.965142512028609e-06,
+ "loss": 0.7202,
+ "step": 770
+ },
+ {
+ "epoch": 0.3645390070921986,
+ "grad_norm": 3.7494113445281982,
+ "learning_rate": 4.965038625556854e-06,
+ "loss": 0.7643,
+ "step": 771
+ },
+ {
+ "epoch": 0.3650118203309693,
+ "grad_norm": 2.8382890224456787,
+ "learning_rate": 4.964934585597954e-06,
+ "loss": 0.6522,
+ "step": 772
+ },
+ {
+ "epoch": 0.3654846335697399,
+ "grad_norm": 3.091655731201172,
+ "learning_rate": 4.9648303921583854e-06,
+ "loss": 0.7117,
+ "step": 773
+ },
+ {
+ "epoch": 0.3659574468085106,
+ "grad_norm": 3.0608325004577637,
+ "learning_rate": 4.964726045244635e-06,
+ "loss": 0.6538,
+ "step": 774
+ },
+ {
+ "epoch": 0.3664302600472813,
+ "grad_norm": 2.8492867946624756,
+ "learning_rate": 4.964621544863203e-06,
+ "loss": 0.6079,
+ "step": 775
+ },
+ {
+ "epoch": 0.366903073286052,
+ "grad_norm": 3.0669894218444824,
+ "learning_rate": 4.964516891020594e-06,
+ "loss": 0.6223,
+ "step": 776
+ },
+ {
+ "epoch": 0.36737588652482267,
+ "grad_norm": 3.089984893798828,
+ "learning_rate": 4.964412083723325e-06,
+ "loss": 0.671,
+ "step": 777
+ },
+ {
+ "epoch": 0.36784869976359336,
+ "grad_norm": 2.905242443084717,
+ "learning_rate": 4.964307122977921e-06,
+ "loss": 0.62,
+ "step": 778
+ },
+ {
+ "epoch": 0.36832151300236404,
+ "grad_norm": 3.954436779022217,
+ "learning_rate": 4.964202008790918e-06,
+ "loss": 0.6535,
+ "step": 779
+ },
+ {
+ "epoch": 0.36879432624113473,
+ "grad_norm": 2.6026058197021484,
+ "learning_rate": 4.9640967411688615e-06,
+ "loss": 0.5865,
+ "step": 780
+ },
+ {
+ "epoch": 0.3692671394799054,
+ "grad_norm": 2.9876346588134766,
+ "learning_rate": 4.963991320118306e-06,
+ "loss": 0.6698,
+ "step": 781
+ },
+ {
+ "epoch": 0.3697399527186761,
+ "grad_norm": 2.9411263465881348,
+ "learning_rate": 4.963885745645815e-06,
+ "loss": 0.6173,
+ "step": 782
+ },
+ {
+ "epoch": 0.3702127659574468,
+ "grad_norm": 2.5679805278778076,
+ "learning_rate": 4.963780017757962e-06,
+ "loss": 0.6285,
+ "step": 783
+ },
+ {
+ "epoch": 0.3706855791962175,
+ "grad_norm": 3.3100640773773193,
+ "learning_rate": 4.963674136461332e-06,
+ "loss": 0.5968,
+ "step": 784
+ },
+ {
+ "epoch": 0.37115839243498816,
+ "grad_norm": 3.1293699741363525,
+ "learning_rate": 4.963568101762515e-06,
+ "loss": 0.697,
+ "step": 785
+ },
+ {
+ "epoch": 0.37163120567375885,
+ "grad_norm": 3.043853759765625,
+ "learning_rate": 4.963461913668115e-06,
+ "loss": 0.5881,
+ "step": 786
+ },
+ {
+ "epoch": 0.37210401891252953,
+ "grad_norm": 3.07351016998291,
+ "learning_rate": 4.963355572184744e-06,
+ "loss": 0.6307,
+ "step": 787
+ },
+ {
+ "epoch": 0.3725768321513002,
+ "grad_norm": 2.7381317615509033,
+ "learning_rate": 4.9632490773190225e-06,
+ "loss": 0.716,
+ "step": 788
+ },
+ {
+ "epoch": 0.3730496453900709,
+ "grad_norm": 2.892221450805664,
+ "learning_rate": 4.963142429077582e-06,
+ "loss": 0.6867,
+ "step": 789
+ },
+ {
+ "epoch": 0.3735224586288416,
+ "grad_norm": 3.133122205734253,
+ "learning_rate": 4.963035627467064e-06,
+ "loss": 0.659,
+ "step": 790
+ },
+ {
+ "epoch": 0.3739952718676123,
+ "grad_norm": 3.032599925994873,
+ "learning_rate": 4.962928672494116e-06,
+ "loss": 0.6848,
+ "step": 791
+ },
+ {
+ "epoch": 0.37446808510638296,
+ "grad_norm": 3.0076355934143066,
+ "learning_rate": 4.9628215641654e-06,
+ "loss": 0.6549,
+ "step": 792
+ },
+ {
+ "epoch": 0.37494089834515365,
+ "grad_norm": 2.8904454708099365,
+ "learning_rate": 4.962714302487585e-06,
+ "loss": 0.6484,
+ "step": 793
+ },
+ {
+ "epoch": 0.37541371158392434,
+ "grad_norm": 2.881364107131958,
+ "learning_rate": 4.9626068874673486e-06,
+ "loss": 0.721,
+ "step": 794
+ },
+ {
+ "epoch": 0.375886524822695,
+ "grad_norm": 3.11668062210083,
+ "learning_rate": 4.962499319111379e-06,
+ "loss": 0.7824,
+ "step": 795
+ },
+ {
+ "epoch": 0.3763593380614657,
+ "grad_norm": 2.9201436042785645,
+ "learning_rate": 4.962391597426374e-06,
+ "loss": 0.6911,
+ "step": 796
+ },
+ {
+ "epoch": 0.3768321513002364,
+ "grad_norm": 2.926598072052002,
+ "learning_rate": 4.962283722419043e-06,
+ "loss": 0.6715,
+ "step": 797
+ },
+ {
+ "epoch": 0.3773049645390071,
+ "grad_norm": 2.7267675399780273,
+ "learning_rate": 4.962175694096101e-06,
+ "loss": 0.6111,
+ "step": 798
+ },
+ {
+ "epoch": 0.37777777777777777,
+ "grad_norm": 3.194031000137329,
+ "learning_rate": 4.962067512464275e-06,
+ "loss": 0.6558,
+ "step": 799
+ },
+ {
+ "epoch": 0.37825059101654845,
+ "grad_norm": 2.6249136924743652,
+ "learning_rate": 4.9619591775303e-06,
+ "loss": 0.6166,
+ "step": 800
+ },
+ {
+ "epoch": 0.37872340425531914,
+ "grad_norm": 2.6356167793273926,
+ "learning_rate": 4.961850689300923e-06,
+ "loss": 0.6112,
+ "step": 801
+ },
+ {
+ "epoch": 0.3791962174940898,
+ "grad_norm": 3.030724287033081,
+ "learning_rate": 4.961742047782898e-06,
+ "loss": 0.6511,
+ "step": 802
+ },
+ {
+ "epoch": 0.3796690307328605,
+ "grad_norm": 3.4987757205963135,
+ "learning_rate": 4.96163325298299e-06,
+ "loss": 0.5888,
+ "step": 803
+ },
+ {
+ "epoch": 0.3801418439716312,
+ "grad_norm": 3.0371780395507812,
+ "learning_rate": 4.961524304907974e-06,
+ "loss": 0.6385,
+ "step": 804
+ },
+ {
+ "epoch": 0.3806146572104019,
+ "grad_norm": 3.302570104598999,
+ "learning_rate": 4.961415203564632e-06,
+ "loss": 0.6515,
+ "step": 805
+ },
+ {
+ "epoch": 0.38108747044917257,
+ "grad_norm": 2.7597038745880127,
+ "learning_rate": 4.961305948959759e-06,
+ "loss": 0.6126,
+ "step": 806
+ },
+ {
+ "epoch": 0.38156028368794326,
+ "grad_norm": 2.789811849594116,
+ "learning_rate": 4.9611965411001575e-06,
+ "loss": 0.6601,
+ "step": 807
+ },
+ {
+ "epoch": 0.38203309692671394,
+ "grad_norm": 3.0403921604156494,
+ "learning_rate": 4.961086979992639e-06,
+ "loss": 0.6947,
+ "step": 808
+ },
+ {
+ "epoch": 0.38250591016548463,
+ "grad_norm": 3.2139980792999268,
+ "learning_rate": 4.960977265644026e-06,
+ "loss": 0.6876,
+ "step": 809
+ },
+ {
+ "epoch": 0.3829787234042553,
+ "grad_norm": 2.918515205383301,
+ "learning_rate": 4.960867398061149e-06,
+ "loss": 0.5997,
+ "step": 810
+ },
+ {
+ "epoch": 0.383451536643026,
+ "grad_norm": 3.197636604309082,
+ "learning_rate": 4.9607573772508495e-06,
+ "loss": 0.5754,
+ "step": 811
+ },
+ {
+ "epoch": 0.3839243498817967,
+ "grad_norm": 2.8848466873168945,
+ "learning_rate": 4.960647203219979e-06,
+ "loss": 0.6424,
+ "step": 812
+ },
+ {
+ "epoch": 0.3843971631205674,
+ "grad_norm": 3.4810187816619873,
+ "learning_rate": 4.960536875975397e-06,
+ "loss": 0.6851,
+ "step": 813
+ },
+ {
+ "epoch": 0.38486997635933806,
+ "grad_norm": 3.713934898376465,
+ "learning_rate": 4.960426395523972e-06,
+ "loss": 0.6122,
+ "step": 814
+ },
+ {
+ "epoch": 0.38534278959810875,
+ "grad_norm": 2.862600803375244,
+ "learning_rate": 4.960315761872585e-06,
+ "loss": 0.6493,
+ "step": 815
+ },
+ {
+ "epoch": 0.38581560283687943,
+ "grad_norm": 3.133882522583008,
+ "learning_rate": 4.960204975028123e-06,
+ "loss": 0.7535,
+ "step": 816
+ },
+ {
+ "epoch": 0.3862884160756501,
+ "grad_norm": 3.1526732444763184,
+ "learning_rate": 4.960094034997485e-06,
+ "loss": 0.6512,
+ "step": 817
+ },
+ {
+ "epoch": 0.3867612293144208,
+ "grad_norm": 2.7213544845581055,
+ "learning_rate": 4.959982941787579e-06,
+ "loss": 0.6121,
+ "step": 818
+ },
+ {
+ "epoch": 0.3872340425531915,
+ "grad_norm": 3.4935851097106934,
+ "learning_rate": 4.9598716954053214e-06,
+ "loss": 0.7852,
+ "step": 819
+ },
+ {
+ "epoch": 0.3877068557919622,
+ "grad_norm": 2.691016435623169,
+ "learning_rate": 4.9597602958576395e-06,
+ "loss": 0.6861,
+ "step": 820
+ },
+ {
+ "epoch": 0.38817966903073287,
+ "grad_norm": 2.8621015548706055,
+ "learning_rate": 4.959648743151469e-06,
+ "loss": 0.6262,
+ "step": 821
+ },
+ {
+ "epoch": 0.38865248226950355,
+ "grad_norm": 3.3887462615966797,
+ "learning_rate": 4.959537037293758e-06,
+ "loss": 0.7103,
+ "step": 822
+ },
+ {
+ "epoch": 0.38912529550827424,
+ "grad_norm": 2.7565438747406006,
+ "learning_rate": 4.95942517829146e-06,
+ "loss": 0.6471,
+ "step": 823
+ },
+ {
+ "epoch": 0.3895981087470449,
+ "grad_norm": 2.7920358180999756,
+ "learning_rate": 4.959313166151541e-06,
+ "loss": 0.6239,
+ "step": 824
+ },
+ {
+ "epoch": 0.3900709219858156,
+ "grad_norm": 3.18904185295105,
+ "learning_rate": 4.959201000880973e-06,
+ "loss": 0.7461,
+ "step": 825
+ },
+ {
+ "epoch": 0.3905437352245863,
+ "grad_norm": 2.727872371673584,
+ "learning_rate": 4.959088682486743e-06,
+ "loss": 0.6333,
+ "step": 826
+ },
+ {
+ "epoch": 0.391016548463357,
+ "grad_norm": 2.906378746032715,
+ "learning_rate": 4.958976210975844e-06,
+ "loss": 0.7547,
+ "step": 827
+ },
+ {
+ "epoch": 0.39148936170212767,
+ "grad_norm": 2.96482515335083,
+ "learning_rate": 4.958863586355278e-06,
+ "loss": 0.6312,
+ "step": 828
+ },
+ {
+ "epoch": 0.39196217494089836,
+ "grad_norm": 3.2890889644622803,
+ "learning_rate": 4.958750808632059e-06,
+ "loss": 0.6943,
+ "step": 829
+ },
+ {
+ "epoch": 0.39243498817966904,
+ "grad_norm": 2.7004311084747314,
+ "learning_rate": 4.958637877813207e-06,
+ "loss": 0.5918,
+ "step": 830
+ },
+ {
+ "epoch": 0.39290780141843973,
+ "grad_norm": 2.7487950325012207,
+ "learning_rate": 4.9585247939057566e-06,
+ "loss": 0.6201,
+ "step": 831
+ },
+ {
+ "epoch": 0.3933806146572104,
+ "grad_norm": 2.7873897552490234,
+ "learning_rate": 4.958411556916747e-06,
+ "loss": 0.6268,
+ "step": 832
+ },
+ {
+ "epoch": 0.3938534278959811,
+ "grad_norm": 2.8501343727111816,
+ "learning_rate": 4.958298166853229e-06,
+ "loss": 0.7119,
+ "step": 833
+ },
+ {
+ "epoch": 0.3943262411347518,
+ "grad_norm": 3.0391547679901123,
+ "learning_rate": 4.958184623722265e-06,
+ "loss": 0.6375,
+ "step": 834
+ },
+ {
+ "epoch": 0.3947990543735225,
+ "grad_norm": 2.850520133972168,
+ "learning_rate": 4.958070927530922e-06,
+ "loss": 0.5962,
+ "step": 835
+ },
+ {
+ "epoch": 0.39527186761229316,
+ "grad_norm": 3.351914644241333,
+ "learning_rate": 4.957957078286281e-06,
+ "loss": 0.7247,
+ "step": 836
+ },
+ {
+ "epoch": 0.39574468085106385,
+ "grad_norm": 2.9559543132781982,
+ "learning_rate": 4.957843075995431e-06,
+ "loss": 0.6571,
+ "step": 837
+ },
+ {
+ "epoch": 0.39621749408983453,
+ "grad_norm": 3.225785255432129,
+ "learning_rate": 4.95772892066547e-06,
+ "loss": 0.7074,
+ "step": 838
+ },
+ {
+ "epoch": 0.3966903073286052,
+ "grad_norm": 2.7842373847961426,
+ "learning_rate": 4.957614612303505e-06,
+ "loss": 0.6469,
+ "step": 839
+ },
+ {
+ "epoch": 0.3971631205673759,
+ "grad_norm": 4.249724864959717,
+ "learning_rate": 4.957500150916655e-06,
+ "loss": 0.741,
+ "step": 840
+ },
+ {
+ "epoch": 0.3976359338061466,
+ "grad_norm": 3.138221263885498,
+ "learning_rate": 4.957385536512046e-06,
+ "loss": 0.6676,
+ "step": 841
+ },
+ {
+ "epoch": 0.3981087470449173,
+ "grad_norm": 3.456423759460449,
+ "learning_rate": 4.957270769096816e-06,
+ "loss": 0.6877,
+ "step": 842
+ },
+ {
+ "epoch": 0.39858156028368796,
+ "grad_norm": 2.8676278591156006,
+ "learning_rate": 4.957155848678109e-06,
+ "loss": 0.5986,
+ "step": 843
+ },
+ {
+ "epoch": 0.39905437352245865,
+ "grad_norm": 2.705324411392212,
+ "learning_rate": 4.957040775263082e-06,
+ "loss": 0.6356,
+ "step": 844
+ },
+ {
+ "epoch": 0.39952718676122934,
+ "grad_norm": 3.0767486095428467,
+ "learning_rate": 4.9569255488589e-06,
+ "loss": 0.6844,
+ "step": 845
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 2.7787704467773438,
+ "learning_rate": 4.956810169472736e-06,
+ "loss": 0.6641,
+ "step": 846
+ },
+ {
+ "epoch": 0.4004728132387707,
+ "grad_norm": 2.584277868270874,
+ "learning_rate": 4.956694637111777e-06,
+ "loss": 0.6256,
+ "step": 847
+ },
+ {
+ "epoch": 0.4009456264775414,
+ "grad_norm": 2.751641273498535,
+ "learning_rate": 4.956578951783215e-06,
+ "loss": 0.5954,
+ "step": 848
+ },
+ {
+ "epoch": 0.4014184397163121,
+ "grad_norm": 3.0181658267974854,
+ "learning_rate": 4.956463113494253e-06,
+ "loss": 0.6569,
+ "step": 849
+ },
+ {
+ "epoch": 0.40189125295508277,
+ "grad_norm": 3.0933220386505127,
+ "learning_rate": 4.956347122252104e-06,
+ "loss": 0.6248,
+ "step": 850
+ },
+ {
+ "epoch": 0.40236406619385345,
+ "grad_norm": 3.3767428398132324,
+ "learning_rate": 4.956230978063991e-06,
+ "loss": 0.719,
+ "step": 851
+ },
+ {
+ "epoch": 0.40283687943262414,
+ "grad_norm": 3.7666573524475098,
+ "learning_rate": 4.956114680937145e-06,
+ "loss": 0.6467,
+ "step": 852
+ },
+ {
+ "epoch": 0.4033096926713948,
+ "grad_norm": 2.9836843013763428,
+ "learning_rate": 4.955998230878808e-06,
+ "loss": 0.6993,
+ "step": 853
+ },
+ {
+ "epoch": 0.4037825059101655,
+ "grad_norm": 2.981497049331665,
+ "learning_rate": 4.955881627896229e-06,
+ "loss": 0.6578,
+ "step": 854
+ },
+ {
+ "epoch": 0.40425531914893614,
+ "grad_norm": 3.1369056701660156,
+ "learning_rate": 4.955764871996672e-06,
+ "loss": 0.6763,
+ "step": 855
+ },
+ {
+ "epoch": 0.40472813238770683,
+ "grad_norm": 2.7675817012786865,
+ "learning_rate": 4.9556479631874036e-06,
+ "loss": 0.6488,
+ "step": 856
+ },
+ {
+ "epoch": 0.4052009456264775,
+ "grad_norm": 3.035334825515747,
+ "learning_rate": 4.9555309014757034e-06,
+ "loss": 0.7076,
+ "step": 857
+ },
+ {
+ "epoch": 0.4056737588652482,
+ "grad_norm": 3.493704319000244,
+ "learning_rate": 4.955413686868862e-06,
+ "loss": 0.6773,
+ "step": 858
+ },
+ {
+ "epoch": 0.4061465721040189,
+ "grad_norm": 3.245487928390503,
+ "learning_rate": 4.9552963193741765e-06,
+ "loss": 0.6915,
+ "step": 859
+ },
+ {
+ "epoch": 0.4066193853427896,
+ "grad_norm": 3.189969539642334,
+ "learning_rate": 4.955178798998956e-06,
+ "loss": 0.7318,
+ "step": 860
+ },
+ {
+ "epoch": 0.40709219858156026,
+ "grad_norm": 2.7987146377563477,
+ "learning_rate": 4.955061125750517e-06,
+ "loss": 0.6162,
+ "step": 861
+ },
+ {
+ "epoch": 0.40756501182033095,
+ "grad_norm": 3.020118474960327,
+ "learning_rate": 4.954943299636187e-06,
+ "loss": 0.6678,
+ "step": 862
+ },
+ {
+ "epoch": 0.40803782505910163,
+ "grad_norm": 2.715463876724243,
+ "learning_rate": 4.954825320663302e-06,
+ "loss": 0.668,
+ "step": 863
+ },
+ {
+ "epoch": 0.4085106382978723,
+ "grad_norm": 2.595050096511841,
+ "learning_rate": 4.9547071888392085e-06,
+ "loss": 0.6557,
+ "step": 864
+ },
+ {
+ "epoch": 0.408983451536643,
+ "grad_norm": 3.131596088409424,
+ "learning_rate": 4.954588904171261e-06,
+ "loss": 0.6548,
+ "step": 865
+ },
+ {
+ "epoch": 0.4094562647754137,
+ "grad_norm": 2.5742313861846924,
+ "learning_rate": 4.954470466666827e-06,
+ "loss": 0.6592,
+ "step": 866
+ },
+ {
+ "epoch": 0.4099290780141844,
+ "grad_norm": 2.8612802028656006,
+ "learning_rate": 4.9543518763332785e-06,
+ "loss": 0.5391,
+ "step": 867
+ },
+ {
+ "epoch": 0.41040189125295506,
+ "grad_norm": 2.8973186016082764,
+ "learning_rate": 4.954233133178001e-06,
+ "loss": 0.6649,
+ "step": 868
+ },
+ {
+ "epoch": 0.41087470449172575,
+ "grad_norm": 2.802525043487549,
+ "learning_rate": 4.954114237208388e-06,
+ "loss": 0.6212,
+ "step": 869
+ },
+ {
+ "epoch": 0.41134751773049644,
+ "grad_norm": 2.5919506549835205,
+ "learning_rate": 4.953995188431843e-06,
+ "loss": 0.6596,
+ "step": 870
+ },
+ {
+ "epoch": 0.4118203309692671,
+ "grad_norm": 3.139169454574585,
+ "learning_rate": 4.953875986855777e-06,
+ "loss": 0.6799,
+ "step": 871
+ },
+ {
+ "epoch": 0.4122931442080378,
+ "grad_norm": 3.99727725982666,
+ "learning_rate": 4.953756632487614e-06,
+ "loss": 0.6519,
+ "step": 872
+ },
+ {
+ "epoch": 0.4127659574468085,
+ "grad_norm": 3.238706350326538,
+ "learning_rate": 4.953637125334784e-06,
+ "loss": 0.7361,
+ "step": 873
+ },
+ {
+ "epoch": 0.4132387706855792,
+ "grad_norm": 2.780019998550415,
+ "learning_rate": 4.9535174654047295e-06,
+ "loss": 0.6406,
+ "step": 874
+ },
+ {
+ "epoch": 0.41371158392434987,
+ "grad_norm": 2.7629551887512207,
+ "learning_rate": 4.953397652704901e-06,
+ "loss": 0.6131,
+ "step": 875
+ },
+ {
+ "epoch": 0.41418439716312055,
+ "grad_norm": 2.8008246421813965,
+ "learning_rate": 4.9532776872427585e-06,
+ "loss": 0.6464,
+ "step": 876
+ },
+ {
+ "epoch": 0.41465721040189124,
+ "grad_norm": 3.0970115661621094,
+ "learning_rate": 4.953157569025772e-06,
+ "loss": 0.7066,
+ "step": 877
+ },
+ {
+ "epoch": 0.4151300236406619,
+ "grad_norm": 2.8375589847564697,
+ "learning_rate": 4.9530372980614195e-06,
+ "loss": 0.6551,
+ "step": 878
+ },
+ {
+ "epoch": 0.4156028368794326,
+ "grad_norm": 2.718843936920166,
+ "learning_rate": 4.952916874357191e-06,
+ "loss": 0.5947,
+ "step": 879
+ },
+ {
+ "epoch": 0.4160756501182033,
+ "grad_norm": 2.7104697227478027,
+ "learning_rate": 4.952796297920585e-06,
+ "loss": 0.6708,
+ "step": 880
+ },
+ {
+ "epoch": 0.416548463356974,
+ "grad_norm": 2.8223445415496826,
+ "learning_rate": 4.952675568759108e-06,
+ "loss": 0.6214,
+ "step": 881
+ },
+ {
+ "epoch": 0.41702127659574467,
+ "grad_norm": 2.6598153114318848,
+ "learning_rate": 4.952554686880279e-06,
+ "loss": 0.6116,
+ "step": 882
+ },
+ {
+ "epoch": 0.41749408983451536,
+ "grad_norm": 2.8639824390411377,
+ "learning_rate": 4.952433652291623e-06,
+ "loss": 0.5971,
+ "step": 883
+ },
+ {
+ "epoch": 0.41796690307328604,
+ "grad_norm": 2.9578304290771484,
+ "learning_rate": 4.952312465000677e-06,
+ "loss": 0.6785,
+ "step": 884
+ },
+ {
+ "epoch": 0.41843971631205673,
+ "grad_norm": 2.872144937515259,
+ "learning_rate": 4.952191125014987e-06,
+ "loss": 0.6772,
+ "step": 885
+ },
+ {
+ "epoch": 0.4189125295508274,
+ "grad_norm": 2.7513675689697266,
+ "learning_rate": 4.952069632342108e-06,
+ "loss": 0.702,
+ "step": 886
+ },
+ {
+ "epoch": 0.4193853427895981,
+ "grad_norm": 2.9275078773498535,
+ "learning_rate": 4.951947986989606e-06,
+ "loss": 0.589,
+ "step": 887
+ },
+ {
+ "epoch": 0.4198581560283688,
+ "grad_norm": 2.740549325942993,
+ "learning_rate": 4.951826188965053e-06,
+ "loss": 0.5942,
+ "step": 888
+ },
+ {
+ "epoch": 0.4203309692671395,
+ "grad_norm": 2.92452073097229,
+ "learning_rate": 4.951704238276035e-06,
+ "loss": 0.6819,
+ "step": 889
+ },
+ {
+ "epoch": 0.42080378250591016,
+ "grad_norm": 2.842491865158081,
+ "learning_rate": 4.951582134930144e-06,
+ "loss": 0.6304,
+ "step": 890
+ },
+ {
+ "epoch": 0.42127659574468085,
+ "grad_norm": 2.613478422164917,
+ "learning_rate": 4.951459878934983e-06,
+ "loss": 0.6912,
+ "step": 891
+ },
+ {
+ "epoch": 0.42174940898345153,
+ "grad_norm": 3.2408607006073,
+ "learning_rate": 4.951337470298165e-06,
+ "loss": 0.6755,
+ "step": 892
+ },
+ {
+ "epoch": 0.4222222222222222,
+ "grad_norm": 3.1022439002990723,
+ "learning_rate": 4.9512149090273125e-06,
+ "loss": 0.6138,
+ "step": 893
+ },
+ {
+ "epoch": 0.4226950354609929,
+ "grad_norm": 2.6418895721435547,
+ "learning_rate": 4.951092195130055e-06,
+ "loss": 0.639,
+ "step": 894
+ },
+ {
+ "epoch": 0.4231678486997636,
+ "grad_norm": 3.010744333267212,
+ "learning_rate": 4.950969328614035e-06,
+ "loss": 0.7102,
+ "step": 895
+ },
+ {
+ "epoch": 0.4236406619385343,
+ "grad_norm": 2.673292636871338,
+ "learning_rate": 4.950846309486901e-06,
+ "loss": 0.5676,
+ "step": 896
+ },
+ {
+ "epoch": 0.42411347517730497,
+ "grad_norm": 3.6974737644195557,
+ "learning_rate": 4.950723137756314e-06,
+ "loss": 0.5722,
+ "step": 897
+ },
+ {
+ "epoch": 0.42458628841607565,
+ "grad_norm": 3.69028902053833,
+ "learning_rate": 4.9505998134299435e-06,
+ "loss": 0.6337,
+ "step": 898
+ },
+ {
+ "epoch": 0.42505910165484634,
+ "grad_norm": 3.2136125564575195,
+ "learning_rate": 4.950476336515469e-06,
+ "loss": 0.6469,
+ "step": 899
+ },
+ {
+ "epoch": 0.425531914893617,
+ "grad_norm": 2.7396016120910645,
+ "learning_rate": 4.950352707020577e-06,
+ "loss": 0.6656,
+ "step": 900
+ },
+ {
+ "epoch": 0.4260047281323877,
+ "grad_norm": 2.825416088104248,
+ "learning_rate": 4.950228924952967e-06,
+ "loss": 0.6298,
+ "step": 901
+ },
+ {
+ "epoch": 0.4264775413711584,
+ "grad_norm": 3.401658535003662,
+ "learning_rate": 4.950104990320345e-06,
+ "loss": 0.778,
+ "step": 902
+ },
+ {
+ "epoch": 0.4269503546099291,
+ "grad_norm": 2.7002272605895996,
+ "learning_rate": 4.9499809031304294e-06,
+ "loss": 0.6536,
+ "step": 903
+ },
+ {
+ "epoch": 0.42742316784869977,
+ "grad_norm": 2.62386417388916,
+ "learning_rate": 4.949856663390945e-06,
+ "loss": 0.6629,
+ "step": 904
+ },
+ {
+ "epoch": 0.42789598108747046,
+ "grad_norm": 2.584247589111328,
+ "learning_rate": 4.94973227110963e-06,
+ "loss": 0.5813,
+ "step": 905
+ },
+ {
+ "epoch": 0.42836879432624114,
+ "grad_norm": 3.4365768432617188,
+ "learning_rate": 4.9496077262942265e-06,
+ "loss": 0.7648,
+ "step": 906
+ },
+ {
+ "epoch": 0.42884160756501183,
+ "grad_norm": 2.8993639945983887,
+ "learning_rate": 4.949483028952492e-06,
+ "loss": 0.6696,
+ "step": 907
+ },
+ {
+ "epoch": 0.4293144208037825,
+ "grad_norm": 2.922809362411499,
+ "learning_rate": 4.94935817909219e-06,
+ "loss": 0.6892,
+ "step": 908
+ },
+ {
+ "epoch": 0.4297872340425532,
+ "grad_norm": 2.85478138923645,
+ "learning_rate": 4.9492331767210944e-06,
+ "loss": 0.536,
+ "step": 909
+ },
+ {
+ "epoch": 0.4302600472813239,
+ "grad_norm": 2.8639259338378906,
+ "learning_rate": 4.949108021846988e-06,
+ "loss": 0.634,
+ "step": 910
+ },
+ {
+ "epoch": 0.4307328605200946,
+ "grad_norm": 3.0533697605133057,
+ "learning_rate": 4.948982714477664e-06,
+ "loss": 0.6318,
+ "step": 911
+ },
+ {
+ "epoch": 0.43120567375886526,
+ "grad_norm": 2.331674814224243,
+ "learning_rate": 4.9488572546209255e-06,
+ "loss": 0.6562,
+ "step": 912
+ },
+ {
+ "epoch": 0.43167848699763595,
+ "grad_norm": 3.0154623985290527,
+ "learning_rate": 4.9487316422845835e-06,
+ "loss": 0.6675,
+ "step": 913
+ },
+ {
+ "epoch": 0.43215130023640663,
+ "grad_norm": 2.7354514598846436,
+ "learning_rate": 4.948605877476459e-06,
+ "loss": 0.6012,
+ "step": 914
+ },
+ {
+ "epoch": 0.4326241134751773,
+ "grad_norm": 2.863736629486084,
+ "learning_rate": 4.948479960204383e-06,
+ "loss": 0.6062,
+ "step": 915
+ },
+ {
+ "epoch": 0.433096926713948,
+ "grad_norm": 3.01998233795166,
+ "learning_rate": 4.948353890476197e-06,
+ "loss": 0.6749,
+ "step": 916
+ },
+ {
+ "epoch": 0.4335697399527187,
+ "grad_norm": 2.7550456523895264,
+ "learning_rate": 4.94822766829975e-06,
+ "loss": 0.6507,
+ "step": 917
+ },
+ {
+ "epoch": 0.4340425531914894,
+ "grad_norm": 3.370572805404663,
+ "learning_rate": 4.948101293682901e-06,
+ "loss": 0.714,
+ "step": 918
+ },
+ {
+ "epoch": 0.43451536643026006,
+ "grad_norm": 2.9736790657043457,
+ "learning_rate": 4.947974766633519e-06,
+ "loss": 0.729,
+ "step": 919
+ },
+ {
+ "epoch": 0.43498817966903075,
+ "grad_norm": 3.1036548614501953,
+ "learning_rate": 4.947848087159483e-06,
+ "loss": 0.7547,
+ "step": 920
+ },
+ {
+ "epoch": 0.43546099290780144,
+ "grad_norm": 2.895094871520996,
+ "learning_rate": 4.947721255268679e-06,
+ "loss": 0.6089,
+ "step": 921
+ },
+ {
+ "epoch": 0.4359338061465721,
+ "grad_norm": 2.798476219177246,
+ "learning_rate": 4.947594270969005e-06,
+ "loss": 0.5432,
+ "step": 922
+ },
+ {
+ "epoch": 0.4364066193853428,
+ "grad_norm": 2.7675702571868896,
+ "learning_rate": 4.94746713426837e-06,
+ "loss": 0.5693,
+ "step": 923
+ },
+ {
+ "epoch": 0.4368794326241135,
+ "grad_norm": 2.6851553916931152,
+ "learning_rate": 4.947339845174687e-06,
+ "loss": 0.6503,
+ "step": 924
+ },
+ {
+ "epoch": 0.4373522458628842,
+ "grad_norm": 2.909635543823242,
+ "learning_rate": 4.947212403695883e-06,
+ "loss": 0.6494,
+ "step": 925
+ },
+ {
+ "epoch": 0.43782505910165487,
+ "grad_norm": 2.604526996612549,
+ "learning_rate": 4.947084809839894e-06,
+ "loss": 0.6349,
+ "step": 926
+ },
+ {
+ "epoch": 0.43829787234042555,
+ "grad_norm": 3.118149518966675,
+ "learning_rate": 4.946957063614664e-06,
+ "loss": 0.6219,
+ "step": 927
+ },
+ {
+ "epoch": 0.43877068557919624,
+ "grad_norm": 2.7452616691589355,
+ "learning_rate": 4.9468291650281465e-06,
+ "loss": 0.6096,
+ "step": 928
+ },
+ {
+ "epoch": 0.4392434988179669,
+ "grad_norm": 3.30098819732666,
+ "learning_rate": 4.946701114088307e-06,
+ "loss": 0.6277,
+ "step": 929
+ },
+ {
+ "epoch": 0.4397163120567376,
+ "grad_norm": 2.789482593536377,
+ "learning_rate": 4.946572910803116e-06,
+ "loss": 0.7,
+ "step": 930
+ },
+ {
+ "epoch": 0.4401891252955083,
+ "grad_norm": 2.7283935546875,
+ "learning_rate": 4.946444555180559e-06,
+ "loss": 0.5375,
+ "step": 931
+ },
+ {
+ "epoch": 0.440661938534279,
+ "grad_norm": 3.101304054260254,
+ "learning_rate": 4.946316047228627e-06,
+ "loss": 0.6131,
+ "step": 932
+ },
+ {
+ "epoch": 0.44113475177304967,
+ "grad_norm": 3.573908805847168,
+ "learning_rate": 4.946187386955321e-06,
+ "loss": 0.7073,
+ "step": 933
+ },
+ {
+ "epoch": 0.44160756501182036,
+ "grad_norm": 3.214979648590088,
+ "learning_rate": 4.946058574368653e-06,
+ "loss": 0.6508,
+ "step": 934
+ },
+ {
+ "epoch": 0.44208037825059104,
+ "grad_norm": 3.145082712173462,
+ "learning_rate": 4.945929609476643e-06,
+ "loss": 0.64,
+ "step": 935
+ },
+ {
+ "epoch": 0.4425531914893617,
+ "grad_norm": 2.991780996322632,
+ "learning_rate": 4.945800492287321e-06,
+ "loss": 0.6315,
+ "step": 936
+ },
+ {
+ "epoch": 0.44302600472813236,
+ "grad_norm": 3.2441139221191406,
+ "learning_rate": 4.945671222808727e-06,
+ "loss": 0.7144,
+ "step": 937
+ },
+ {
+ "epoch": 0.44349881796690305,
+ "grad_norm": 2.9397029876708984,
+ "learning_rate": 4.94554180104891e-06,
+ "loss": 0.6818,
+ "step": 938
+ },
+ {
+ "epoch": 0.44397163120567373,
+ "grad_norm": 3.2471461296081543,
+ "learning_rate": 4.945412227015929e-06,
+ "loss": 0.6921,
+ "step": 939
+ },
+ {
+ "epoch": 0.4444444444444444,
+ "grad_norm": 3.0882487297058105,
+ "learning_rate": 4.945282500717851e-06,
+ "loss": 0.718,
+ "step": 940
+ },
+ {
+ "epoch": 0.4449172576832151,
+ "grad_norm": 2.6035783290863037,
+ "learning_rate": 4.945152622162753e-06,
+ "loss": 0.621,
+ "step": 941
+ },
+ {
+ "epoch": 0.4453900709219858,
+ "grad_norm": 2.83659029006958,
+ "learning_rate": 4.945022591358724e-06,
+ "loss": 0.6403,
+ "step": 942
+ },
+ {
+ "epoch": 0.4458628841607565,
+ "grad_norm": 2.824463129043579,
+ "learning_rate": 4.944892408313859e-06,
+ "loss": 0.6594,
+ "step": 943
+ },
+ {
+ "epoch": 0.44633569739952716,
+ "grad_norm": 2.753735065460205,
+ "learning_rate": 4.9447620730362645e-06,
+ "loss": 0.6116,
+ "step": 944
+ },
+ {
+ "epoch": 0.44680851063829785,
+ "grad_norm": 3.0659725666046143,
+ "learning_rate": 4.944631585534056e-06,
+ "loss": 0.5983,
+ "step": 945
+ },
+ {
+ "epoch": 0.44728132387706854,
+ "grad_norm": 2.969113349914551,
+ "learning_rate": 4.944500945815357e-06,
+ "loss": 0.6859,
+ "step": 946
+ },
+ {
+ "epoch": 0.4477541371158392,
+ "grad_norm": 2.810303211212158,
+ "learning_rate": 4.944370153888303e-06,
+ "loss": 0.7025,
+ "step": 947
+ },
+ {
+ "epoch": 0.4482269503546099,
+ "grad_norm": 3.027721643447876,
+ "learning_rate": 4.944239209761038e-06,
+ "loss": 0.7268,
+ "step": 948
+ },
+ {
+ "epoch": 0.4486997635933806,
+ "grad_norm": 2.661503314971924,
+ "learning_rate": 4.944108113441716e-06,
+ "loss": 0.6702,
+ "step": 949
+ },
+ {
+ "epoch": 0.4491725768321513,
+ "grad_norm": 2.738591432571411,
+ "learning_rate": 4.943976864938498e-06,
+ "loss": 0.6728,
+ "step": 950
+ },
+ {
+ "epoch": 0.44964539007092197,
+ "grad_norm": 3.447505474090576,
+ "learning_rate": 4.943845464259557e-06,
+ "loss": 0.6586,
+ "step": 951
+ },
+ {
+ "epoch": 0.45011820330969265,
+ "grad_norm": 3.0968854427337646,
+ "learning_rate": 4.943713911413075e-06,
+ "loss": 0.7666,
+ "step": 952
+ },
+ {
+ "epoch": 0.45059101654846334,
+ "grad_norm": 2.4113779067993164,
+ "learning_rate": 4.943582206407244e-06,
+ "loss": 0.6173,
+ "step": 953
+ },
+ {
+ "epoch": 0.451063829787234,
+ "grad_norm": 2.6357979774475098,
+ "learning_rate": 4.943450349250263e-06,
+ "loss": 0.5589,
+ "step": 954
+ },
+ {
+ "epoch": 0.4515366430260047,
+ "grad_norm": 2.9182233810424805,
+ "learning_rate": 4.9433183399503425e-06,
+ "loss": 0.6252,
+ "step": 955
+ },
+ {
+ "epoch": 0.4520094562647754,
+ "grad_norm": 2.832740306854248,
+ "learning_rate": 4.943186178515703e-06,
+ "loss": 0.6882,
+ "step": 956
+ },
+ {
+ "epoch": 0.4524822695035461,
+ "grad_norm": 2.9508981704711914,
+ "learning_rate": 4.943053864954574e-06,
+ "loss": 0.5722,
+ "step": 957
+ },
+ {
+ "epoch": 0.4529550827423168,
+ "grad_norm": 3.044729471206665,
+ "learning_rate": 4.9429213992751925e-06,
+ "loss": 0.6772,
+ "step": 958
+ },
+ {
+ "epoch": 0.45342789598108746,
+ "grad_norm": 2.606003522872925,
+ "learning_rate": 4.9427887814858075e-06,
+ "loss": 0.6445,
+ "step": 959
+ },
+ {
+ "epoch": 0.45390070921985815,
+ "grad_norm": 2.4634225368499756,
+ "learning_rate": 4.942656011594676e-06,
+ "loss": 0.6151,
+ "step": 960
+ },
+ {
+ "epoch": 0.45437352245862883,
+ "grad_norm": 2.8872334957122803,
+ "learning_rate": 4.942523089610066e-06,
+ "loss": 0.6255,
+ "step": 961
+ },
+ {
+ "epoch": 0.4548463356973995,
+ "grad_norm": 2.870605707168579,
+ "learning_rate": 4.942390015540253e-06,
+ "loss": 0.7481,
+ "step": 962
+ },
+ {
+ "epoch": 0.4553191489361702,
+ "grad_norm": 2.952680826187134,
+ "learning_rate": 4.942256789393524e-06,
+ "loss": 0.5556,
+ "step": 963
+ },
+ {
+ "epoch": 0.4557919621749409,
+ "grad_norm": 2.623680353164673,
+ "learning_rate": 4.9421234111781725e-06,
+ "loss": 0.6115,
+ "step": 964
+ },
+ {
+ "epoch": 0.4562647754137116,
+ "grad_norm": 2.6933600902557373,
+ "learning_rate": 4.941989880902505e-06,
+ "loss": 0.6102,
+ "step": 965
+ },
+ {
+ "epoch": 0.45673758865248226,
+ "grad_norm": 2.6047189235687256,
+ "learning_rate": 4.941856198574836e-06,
+ "loss": 0.612,
+ "step": 966
+ },
+ {
+ "epoch": 0.45721040189125295,
+ "grad_norm": 2.779186725616455,
+ "learning_rate": 4.9417223642034885e-06,
+ "loss": 0.5424,
+ "step": 967
+ },
+ {
+ "epoch": 0.45768321513002364,
+ "grad_norm": 2.6177165508270264,
+ "learning_rate": 4.941588377796795e-06,
+ "loss": 0.4661,
+ "step": 968
+ },
+ {
+ "epoch": 0.4581560283687943,
+ "grad_norm": 2.959676742553711,
+ "learning_rate": 4.941454239363101e-06,
+ "loss": 0.6966,
+ "step": 969
+ },
+ {
+ "epoch": 0.458628841607565,
+ "grad_norm": 2.9788379669189453,
+ "learning_rate": 4.941319948910756e-06,
+ "loss": 0.6181,
+ "step": 970
+ },
+ {
+ "epoch": 0.4591016548463357,
+ "grad_norm": 4.642750263214111,
+ "learning_rate": 4.941185506448122e-06,
+ "loss": 0.5602,
+ "step": 971
+ },
+ {
+ "epoch": 0.4595744680851064,
+ "grad_norm": 2.793002128601074,
+ "learning_rate": 4.941050911983572e-06,
+ "loss": 0.602,
+ "step": 972
+ },
+ {
+ "epoch": 0.46004728132387707,
+ "grad_norm": 2.6833035945892334,
+ "learning_rate": 4.9409161655254845e-06,
+ "loss": 0.5549,
+ "step": 973
+ },
+ {
+ "epoch": 0.46052009456264775,
+ "grad_norm": 3.905032157897949,
+ "learning_rate": 4.94078126708225e-06,
+ "loss": 0.6335,
+ "step": 974
+ },
+ {
+ "epoch": 0.46099290780141844,
+ "grad_norm": 2.922609329223633,
+ "learning_rate": 4.94064621666227e-06,
+ "loss": 0.5839,
+ "step": 975
+ },
+ {
+ "epoch": 0.4614657210401891,
+ "grad_norm": 2.8277416229248047,
+ "learning_rate": 4.940511014273952e-06,
+ "loss": 0.629,
+ "step": 976
+ },
+ {
+ "epoch": 0.4619385342789598,
+ "grad_norm": 3.07511043548584,
+ "learning_rate": 4.940375659925714e-06,
+ "loss": 0.7058,
+ "step": 977
+ },
+ {
+ "epoch": 0.4624113475177305,
+ "grad_norm": 3.65043044090271,
+ "learning_rate": 4.940240153625984e-06,
+ "loss": 0.7174,
+ "step": 978
+ },
+ {
+ "epoch": 0.4628841607565012,
+ "grad_norm": 2.755167245864868,
+ "learning_rate": 4.9401044953832e-06,
+ "loss": 0.6548,
+ "step": 979
+ },
+ {
+ "epoch": 0.46335697399527187,
+ "grad_norm": 2.9881057739257812,
+ "learning_rate": 4.939968685205808e-06,
+ "loss": 0.6245,
+ "step": 980
+ },
+ {
+ "epoch": 0.46382978723404256,
+ "grad_norm": 2.9484212398529053,
+ "learning_rate": 4.939832723102266e-06,
+ "loss": 0.655,
+ "step": 981
+ },
+ {
+ "epoch": 0.46430260047281324,
+ "grad_norm": 2.898918628692627,
+ "learning_rate": 4.939696609081038e-06,
+ "loss": 0.6178,
+ "step": 982
+ },
+ {
+ "epoch": 0.46477541371158393,
+ "grad_norm": 2.7052435874938965,
+ "learning_rate": 4.9395603431506e-06,
+ "loss": 0.6393,
+ "step": 983
+ },
+ {
+ "epoch": 0.4652482269503546,
+ "grad_norm": 2.5610013008117676,
+ "learning_rate": 4.939423925319436e-06,
+ "loss": 0.4847,
+ "step": 984
+ },
+ {
+ "epoch": 0.4657210401891253,
+ "grad_norm": 3.229083299636841,
+ "learning_rate": 4.939287355596042e-06,
+ "loss": 0.6473,
+ "step": 985
+ },
+ {
+ "epoch": 0.466193853427896,
+ "grad_norm": 2.907097816467285,
+ "learning_rate": 4.9391506339889195e-06,
+ "loss": 0.652,
+ "step": 986
+ },
+ {
+ "epoch": 0.4666666666666667,
+ "grad_norm": 2.6929478645324707,
+ "learning_rate": 4.939013760506582e-06,
+ "loss": 0.6175,
+ "step": 987
+ },
+ {
+ "epoch": 0.46713947990543736,
+ "grad_norm": 3.414813280105591,
+ "learning_rate": 4.938876735157554e-06,
+ "loss": 0.7597,
+ "step": 988
+ },
+ {
+ "epoch": 0.46761229314420805,
+ "grad_norm": 3.297360420227051,
+ "learning_rate": 4.938739557950365e-06,
+ "loss": 0.6824,
+ "step": 989
+ },
+ {
+ "epoch": 0.46808510638297873,
+ "grad_norm": 3.083155393600464,
+ "learning_rate": 4.938602228893557e-06,
+ "loss": 0.6505,
+ "step": 990
+ },
+ {
+ "epoch": 0.4685579196217494,
+ "grad_norm": 2.9781153202056885,
+ "learning_rate": 4.938464747995681e-06,
+ "loss": 0.666,
+ "step": 991
+ },
+ {
+ "epoch": 0.4690307328605201,
+ "grad_norm": 3.1494534015655518,
+ "learning_rate": 4.9383271152652975e-06,
+ "loss": 0.6422,
+ "step": 992
+ },
+ {
+ "epoch": 0.4695035460992908,
+ "grad_norm": 2.547868490219116,
+ "learning_rate": 4.938189330710976e-06,
+ "loss": 0.5766,
+ "step": 993
+ },
+ {
+ "epoch": 0.4699763593380615,
+ "grad_norm": 2.684736967086792,
+ "learning_rate": 4.938051394341297e-06,
+ "loss": 0.6407,
+ "step": 994
+ },
+ {
+ "epoch": 0.47044917257683216,
+ "grad_norm": 2.9619693756103516,
+ "learning_rate": 4.937913306164847e-06,
+ "loss": 0.6936,
+ "step": 995
+ },
+ {
+ "epoch": 0.47092198581560285,
+ "grad_norm": 2.9698498249053955,
+ "learning_rate": 4.937775066190227e-06,
+ "loss": 0.6464,
+ "step": 996
+ },
+ {
+ "epoch": 0.47139479905437354,
+ "grad_norm": 3.121049642562866,
+ "learning_rate": 4.937636674426042e-06,
+ "loss": 0.6383,
+ "step": 997
+ },
+ {
+ "epoch": 0.4718676122931442,
+ "grad_norm": 3.113672971725464,
+ "learning_rate": 4.93749813088091e-06,
+ "loss": 0.6892,
+ "step": 998
+ },
+ {
+ "epoch": 0.4723404255319149,
+ "grad_norm": 3.126113176345825,
+ "learning_rate": 4.937359435563458e-06,
+ "loss": 0.6728,
+ "step": 999
+ },
+ {
+ "epoch": 0.4728132387706856,
+ "grad_norm": 3.353966236114502,
+ "learning_rate": 4.937220588482321e-06,
+ "loss": 0.6041,
+ "step": 1000
+ },
+ {
+ "epoch": 0.4732860520094563,
+ "grad_norm": 2.8860628604888916,
+ "learning_rate": 4.937081589646144e-06,
+ "loss": 0.6798,
+ "step": 1001
+ },
+ {
+ "epoch": 0.47375886524822697,
+ "grad_norm": 3.0510590076446533,
+ "learning_rate": 4.936942439063584e-06,
+ "loss": 0.5841,
+ "step": 1002
+ },
+ {
+ "epoch": 0.47423167848699765,
+ "grad_norm": 2.6998369693756104,
+ "learning_rate": 4.936803136743303e-06,
+ "loss": 0.6403,
+ "step": 1003
+ },
+ {
+ "epoch": 0.47470449172576834,
+ "grad_norm": 2.875347137451172,
+ "learning_rate": 4.9366636826939765e-06,
+ "loss": 0.5811,
+ "step": 1004
+ },
+ {
+ "epoch": 0.475177304964539,
+ "grad_norm": 2.9122262001037598,
+ "learning_rate": 4.936524076924287e-06,
+ "loss": 0.6852,
+ "step": 1005
+ },
+ {
+ "epoch": 0.4756501182033097,
+ "grad_norm": 2.5167057514190674,
+ "learning_rate": 4.9363843194429265e-06,
+ "loss": 0.5367,
+ "step": 1006
+ },
+ {
+ "epoch": 0.4761229314420804,
+ "grad_norm": 2.5745551586151123,
+ "learning_rate": 4.9362444102585985e-06,
+ "loss": 0.6241,
+ "step": 1007
+ },
+ {
+ "epoch": 0.4765957446808511,
+ "grad_norm": 2.5024216175079346,
+ "learning_rate": 4.9361043493800125e-06,
+ "loss": 0.6133,
+ "step": 1008
+ },
+ {
+ "epoch": 0.47706855791962177,
+ "grad_norm": 2.7281384468078613,
+ "learning_rate": 4.935964136815892e-06,
+ "loss": 0.6834,
+ "step": 1009
+ },
+ {
+ "epoch": 0.47754137115839246,
+ "grad_norm": 3.0118913650512695,
+ "learning_rate": 4.935823772574965e-06,
+ "loss": 0.6922,
+ "step": 1010
+ },
+ {
+ "epoch": 0.47801418439716314,
+ "grad_norm": 3.016216993331909,
+ "learning_rate": 4.935683256665973e-06,
+ "loss": 0.6653,
+ "step": 1011
+ },
+ {
+ "epoch": 0.47848699763593383,
+ "grad_norm": 2.9526784420013428,
+ "learning_rate": 4.9355425890976636e-06,
+ "loss": 0.6423,
+ "step": 1012
+ },
+ {
+ "epoch": 0.4789598108747045,
+ "grad_norm": 6.222797393798828,
+ "learning_rate": 4.9354017698787985e-06,
+ "loss": 0.5884,
+ "step": 1013
+ },
+ {
+ "epoch": 0.4794326241134752,
+ "grad_norm": 2.6553597450256348,
+ "learning_rate": 4.935260799018143e-06,
+ "loss": 0.6624,
+ "step": 1014
+ },
+ {
+ "epoch": 0.4799054373522459,
+ "grad_norm": 3.0942065715789795,
+ "learning_rate": 4.935119676524475e-06,
+ "loss": 0.6623,
+ "step": 1015
+ },
+ {
+ "epoch": 0.4803782505910166,
+ "grad_norm": 2.626359224319458,
+ "learning_rate": 4.934978402406585e-06,
+ "loss": 0.6195,
+ "step": 1016
+ },
+ {
+ "epoch": 0.4808510638297872,
+ "grad_norm": 2.7954699993133545,
+ "learning_rate": 4.934836976673265e-06,
+ "loss": 0.5545,
+ "step": 1017
+ },
+ {
+ "epoch": 0.4813238770685579,
+ "grad_norm": 2.913557291030884,
+ "learning_rate": 4.934695399333324e-06,
+ "loss": 0.6288,
+ "step": 1018
+ },
+ {
+ "epoch": 0.4817966903073286,
+ "grad_norm": 3.1043739318847656,
+ "learning_rate": 4.9345536703955746e-06,
+ "loss": 0.6771,
+ "step": 1019
+ },
+ {
+ "epoch": 0.48226950354609927,
+ "grad_norm": 2.789357900619507,
+ "learning_rate": 4.934411789868845e-06,
+ "loss": 0.6227,
+ "step": 1020
+ },
+ {
+ "epoch": 0.48274231678486995,
+ "grad_norm": 2.480609655380249,
+ "learning_rate": 4.934269757761967e-06,
+ "loss": 0.5779,
+ "step": 1021
+ },
+ {
+ "epoch": 0.48321513002364064,
+ "grad_norm": 2.7946252822875977,
+ "learning_rate": 4.934127574083785e-06,
+ "loss": 0.6166,
+ "step": 1022
+ },
+ {
+ "epoch": 0.4836879432624113,
+ "grad_norm": 3.0670509338378906,
+ "learning_rate": 4.933985238843153e-06,
+ "loss": 0.7766,
+ "step": 1023
+ },
+ {
+ "epoch": 0.484160756501182,
+ "grad_norm": 2.8567559719085693,
+ "learning_rate": 4.933842752048932e-06,
+ "loss": 0.5088,
+ "step": 1024
+ },
+ {
+ "epoch": 0.4846335697399527,
+ "grad_norm": 2.5674657821655273,
+ "learning_rate": 4.933700113709996e-06,
+ "loss": 0.6036,
+ "step": 1025
+ },
+ {
+ "epoch": 0.4851063829787234,
+ "grad_norm": 2.782339096069336,
+ "learning_rate": 4.933557323835224e-06,
+ "loss": 0.5335,
+ "step": 1026
+ },
+ {
+ "epoch": 0.48557919621749407,
+ "grad_norm": 2.6334071159362793,
+ "learning_rate": 4.93341438243351e-06,
+ "loss": 0.6327,
+ "step": 1027
+ },
+ {
+ "epoch": 0.48605200945626476,
+ "grad_norm": 3.0853965282440186,
+ "learning_rate": 4.933271289513751e-06,
+ "loss": 0.7102,
+ "step": 1028
+ },
+ {
+ "epoch": 0.48652482269503544,
+ "grad_norm": 2.619997501373291,
+ "learning_rate": 4.933128045084859e-06,
+ "loss": 0.6138,
+ "step": 1029
+ },
+ {
+ "epoch": 0.48699763593380613,
+ "grad_norm": 2.8316116333007812,
+ "learning_rate": 4.932984649155753e-06,
+ "loss": 0.6346,
+ "step": 1030
+ },
+ {
+ "epoch": 0.4874704491725768,
+ "grad_norm": 3.153486490249634,
+ "learning_rate": 4.932841101735361e-06,
+ "loss": 0.7626,
+ "step": 1031
+ },
+ {
+ "epoch": 0.4879432624113475,
+ "grad_norm": 3.1831274032592773,
+ "learning_rate": 4.9326974028326214e-06,
+ "loss": 0.6607,
+ "step": 1032
+ },
+ {
+ "epoch": 0.4884160756501182,
+ "grad_norm": 2.791078567504883,
+ "learning_rate": 4.932553552456481e-06,
+ "loss": 0.6141,
+ "step": 1033
+ },
+ {
+ "epoch": 0.4888888888888889,
+ "grad_norm": 2.627263307571411,
+ "learning_rate": 4.932409550615898e-06,
+ "loss": 0.6777,
+ "step": 1034
+ },
+ {
+ "epoch": 0.48936170212765956,
+ "grad_norm": 2.8550007343292236,
+ "learning_rate": 4.932265397319838e-06,
+ "loss": 0.6379,
+ "step": 1035
+ },
+ {
+ "epoch": 0.48983451536643025,
+ "grad_norm": 4.505824089050293,
+ "learning_rate": 4.932121092577276e-06,
+ "loss": 0.5892,
+ "step": 1036
+ },
+ {
+ "epoch": 0.49030732860520093,
+ "grad_norm": 3.100191116333008,
+ "learning_rate": 4.931976636397199e-06,
+ "loss": 0.6443,
+ "step": 1037
+ },
+ {
+ "epoch": 0.4907801418439716,
+ "grad_norm": 2.921494245529175,
+ "learning_rate": 4.9318320287886e-06,
+ "loss": 0.6821,
+ "step": 1038
+ },
+ {
+ "epoch": 0.4912529550827423,
+ "grad_norm": 4.577807903289795,
+ "learning_rate": 4.931687269760485e-06,
+ "loss": 0.5946,
+ "step": 1039
+ },
+ {
+ "epoch": 0.491725768321513,
+ "grad_norm": 2.7347636222839355,
+ "learning_rate": 4.931542359321865e-06,
+ "loss": 0.5689,
+ "step": 1040
+ },
+ {
+ "epoch": 0.4921985815602837,
+ "grad_norm": 2.5289158821105957,
+ "learning_rate": 4.931397297481765e-06,
+ "loss": 0.5632,
+ "step": 1041
+ },
+ {
+ "epoch": 0.49267139479905436,
+ "grad_norm": 3.3518471717834473,
+ "learning_rate": 4.9312520842492165e-06,
+ "loss": 0.6349,
+ "step": 1042
+ },
+ {
+ "epoch": 0.49314420803782505,
+ "grad_norm": 3.0469748973846436,
+ "learning_rate": 4.931106719633261e-06,
+ "loss": 0.5734,
+ "step": 1043
+ },
+ {
+ "epoch": 0.49361702127659574,
+ "grad_norm": 3.104682445526123,
+ "learning_rate": 4.930961203642951e-06,
+ "loss": 0.6101,
+ "step": 1044
+ },
+ {
+ "epoch": 0.4940898345153664,
+ "grad_norm": 2.776705503463745,
+ "learning_rate": 4.930815536287346e-06,
+ "loss": 0.6397,
+ "step": 1045
+ },
+ {
+ "epoch": 0.4945626477541371,
+ "grad_norm": 2.760380983352661,
+ "learning_rate": 4.930669717575516e-06,
+ "loss": 0.668,
+ "step": 1046
+ },
+ {
+ "epoch": 0.4950354609929078,
+ "grad_norm": 2.70084547996521,
+ "learning_rate": 4.930523747516541e-06,
+ "loss": 0.5729,
+ "step": 1047
+ },
+ {
+ "epoch": 0.4955082742316785,
+ "grad_norm": 2.7319583892822266,
+ "learning_rate": 4.930377626119511e-06,
+ "loss": 0.6258,
+ "step": 1048
+ },
+ {
+ "epoch": 0.49598108747044917,
+ "grad_norm": 3.2515223026275635,
+ "learning_rate": 4.930231353393521e-06,
+ "loss": 0.7412,
+ "step": 1049
+ },
+ {
+ "epoch": 0.49645390070921985,
+ "grad_norm": 3.0646486282348633,
+ "learning_rate": 4.930084929347682e-06,
+ "loss": 0.5809,
+ "step": 1050
+ },
+ {
+ "epoch": 0.49692671394799054,
+ "grad_norm": 3.1621921062469482,
+ "learning_rate": 4.9299383539911096e-06,
+ "loss": 0.6282,
+ "step": 1051
+ },
+ {
+ "epoch": 0.4973995271867612,
+ "grad_norm": 2.864713191986084,
+ "learning_rate": 4.929791627332931e-06,
+ "loss": 0.6263,
+ "step": 1052
+ },
+ {
+ "epoch": 0.4978723404255319,
+ "grad_norm": 3.181016683578491,
+ "learning_rate": 4.929644749382283e-06,
+ "loss": 0.5697,
+ "step": 1053
+ },
+ {
+ "epoch": 0.4983451536643026,
+ "grad_norm": 2.9064836502075195,
+ "learning_rate": 4.929497720148309e-06,
+ "loss": 0.6161,
+ "step": 1054
+ },
+ {
+ "epoch": 0.4988179669030733,
+ "grad_norm": 3.058112859725952,
+ "learning_rate": 4.9293505396401655e-06,
+ "loss": 0.6477,
+ "step": 1055
+ },
+ {
+ "epoch": 0.49929078014184397,
+ "grad_norm": 2.5227596759796143,
+ "learning_rate": 4.929203207867016e-06,
+ "loss": 0.5819,
+ "step": 1056
+ },
+ {
+ "epoch": 0.49976359338061466,
+ "grad_norm": 3.386862277984619,
+ "learning_rate": 4.929055724838035e-06,
+ "loss": 0.7342,
+ "step": 1057
+ },
+ {
+ "epoch": 0.5002364066193853,
+ "grad_norm": 3.368346929550171,
+ "learning_rate": 4.928908090562404e-06,
+ "loss": 0.6622,
+ "step": 1058
+ },
+ {
+ "epoch": 0.500709219858156,
+ "grad_norm": 2.9108314514160156,
+ "learning_rate": 4.928760305049317e-06,
+ "loss": 0.6598,
+ "step": 1059
+ },
+ {
+ "epoch": 0.5011820330969267,
+ "grad_norm": 2.822305917739868,
+ "learning_rate": 4.928612368307977e-06,
+ "loss": 0.5841,
+ "step": 1060
+ },
+ {
+ "epoch": 0.5016548463356973,
+ "grad_norm": 2.689131259918213,
+ "learning_rate": 4.928464280347592e-06,
+ "loss": 0.6631,
+ "step": 1061
+ },
+ {
+ "epoch": 0.502127659574468,
+ "grad_norm": 3.337214946746826,
+ "learning_rate": 4.9283160411773864e-06,
+ "loss": 0.6105,
+ "step": 1062
+ },
+ {
+ "epoch": 0.5026004728132387,
+ "grad_norm": 3.035911798477173,
+ "learning_rate": 4.928167650806588e-06,
+ "loss": 0.6981,
+ "step": 1063
+ },
+ {
+ "epoch": 0.5030732860520094,
+ "grad_norm": 2.8820855617523193,
+ "learning_rate": 4.9280191092444375e-06,
+ "loss": 0.6408,
+ "step": 1064
+ },
+ {
+ "epoch": 0.5035460992907801,
+ "grad_norm": 3.080432415008545,
+ "learning_rate": 4.927870416500183e-06,
+ "loss": 0.6398,
+ "step": 1065
+ },
+ {
+ "epoch": 0.5040189125295508,
+ "grad_norm": 2.761612892150879,
+ "learning_rate": 4.927721572583084e-06,
+ "loss": 0.6126,
+ "step": 1066
+ },
+ {
+ "epoch": 0.5044917257683215,
+ "grad_norm": 2.8561882972717285,
+ "learning_rate": 4.927572577502408e-06,
+ "loss": 0.584,
+ "step": 1067
+ },
+ {
+ "epoch": 0.5049645390070922,
+ "grad_norm": 3.3386311531066895,
+ "learning_rate": 4.927423431267432e-06,
+ "loss": 0.6666,
+ "step": 1068
+ },
+ {
+ "epoch": 0.5054373522458628,
+ "grad_norm": 2.632906675338745,
+ "learning_rate": 4.927274133887443e-06,
+ "loss": 0.632,
+ "step": 1069
+ },
+ {
+ "epoch": 0.5059101654846335,
+ "grad_norm": 2.8737308979034424,
+ "learning_rate": 4.927124685371737e-06,
+ "loss": 0.6051,
+ "step": 1070
+ },
+ {
+ "epoch": 0.5063829787234042,
+ "grad_norm": 3.042222738265991,
+ "learning_rate": 4.926975085729619e-06,
+ "loss": 0.6954,
+ "step": 1071
+ },
+ {
+ "epoch": 0.5068557919621749,
+ "grad_norm": 3.3341481685638428,
+ "learning_rate": 4.926825334970404e-06,
+ "loss": 0.7148,
+ "step": 1072
+ },
+ {
+ "epoch": 0.5073286052009456,
+ "grad_norm": 2.7415387630462646,
+ "learning_rate": 4.926675433103418e-06,
+ "loss": 0.5456,
+ "step": 1073
+ },
+ {
+ "epoch": 0.5078014184397163,
+ "grad_norm": 2.7545325756073,
+ "learning_rate": 4.926525380137993e-06,
+ "loss": 0.6213,
+ "step": 1074
+ },
+ {
+ "epoch": 0.508274231678487,
+ "grad_norm": 2.9153690338134766,
+ "learning_rate": 4.926375176083472e-06,
+ "loss": 0.6466,
+ "step": 1075
+ },
+ {
+ "epoch": 0.5087470449172576,
+ "grad_norm": 4.210638523101807,
+ "learning_rate": 4.926224820949209e-06,
+ "loss": 0.6192,
+ "step": 1076
+ },
+ {
+ "epoch": 0.5092198581560283,
+ "grad_norm": 2.4357898235321045,
+ "learning_rate": 4.926074314744565e-06,
+ "loss": 0.594,
+ "step": 1077
+ },
+ {
+ "epoch": 0.509692671394799,
+ "grad_norm": 2.8004701137542725,
+ "learning_rate": 4.92592365747891e-06,
+ "loss": 0.6276,
+ "step": 1078
+ },
+ {
+ "epoch": 0.5101654846335697,
+ "grad_norm": 2.920675039291382,
+ "learning_rate": 4.925772849161628e-06,
+ "loss": 0.6043,
+ "step": 1079
+ },
+ {
+ "epoch": 0.5106382978723404,
+ "grad_norm": 2.791555404663086,
+ "learning_rate": 4.9256218898021055e-06,
+ "loss": 0.6837,
+ "step": 1080
+ },
+ {
+ "epoch": 0.5111111111111111,
+ "grad_norm": 3.1702463626861572,
+ "learning_rate": 4.925470779409746e-06,
+ "loss": 0.668,
+ "step": 1081
+ },
+ {
+ "epoch": 0.5115839243498818,
+ "grad_norm": 2.7149479389190674,
+ "learning_rate": 4.925319517993955e-06,
+ "loss": 0.5842,
+ "step": 1082
+ },
+ {
+ "epoch": 0.5120567375886524,
+ "grad_norm": 2.916311025619507,
+ "learning_rate": 4.925168105564153e-06,
+ "loss": 0.6893,
+ "step": 1083
+ },
+ {
+ "epoch": 0.5125295508274231,
+ "grad_norm": 2.917654514312744,
+ "learning_rate": 4.925016542129767e-06,
+ "loss": 0.6513,
+ "step": 1084
+ },
+ {
+ "epoch": 0.5130023640661938,
+ "grad_norm": 2.5568928718566895,
+ "learning_rate": 4.924864827700234e-06,
+ "loss": 0.6177,
+ "step": 1085
+ },
+ {
+ "epoch": 0.5134751773049645,
+ "grad_norm": 2.816720485687256,
+ "learning_rate": 4.924712962285001e-06,
+ "loss": 0.5833,
+ "step": 1086
+ },
+ {
+ "epoch": 0.5139479905437352,
+ "grad_norm": 2.6989188194274902,
+ "learning_rate": 4.9245609458935235e-06,
+ "loss": 0.6332,
+ "step": 1087
+ },
+ {
+ "epoch": 0.5144208037825059,
+ "grad_norm": 2.959599494934082,
+ "learning_rate": 4.924408778535268e-06,
+ "loss": 0.626,
+ "step": 1088
+ },
+ {
+ "epoch": 0.5148936170212766,
+ "grad_norm": 2.872814416885376,
+ "learning_rate": 4.924256460219708e-06,
+ "loss": 0.6407,
+ "step": 1089
+ },
+ {
+ "epoch": 0.5153664302600472,
+ "grad_norm": 2.6989097595214844,
+ "learning_rate": 4.924103990956329e-06,
+ "loss": 0.6391,
+ "step": 1090
+ },
+ {
+ "epoch": 0.5158392434988179,
+ "grad_norm": 2.986492156982422,
+ "learning_rate": 4.9239513707546235e-06,
+ "loss": 0.6911,
+ "step": 1091
+ },
+ {
+ "epoch": 0.5163120567375886,
+ "grad_norm": 3.069920301437378,
+ "learning_rate": 4.9237985996240954e-06,
+ "loss": 0.671,
+ "step": 1092
+ },
+ {
+ "epoch": 0.5167848699763593,
+ "grad_norm": 2.8214917182922363,
+ "learning_rate": 4.9236456775742555e-06,
+ "loss": 0.5885,
+ "step": 1093
+ },
+ {
+ "epoch": 0.51725768321513,
+ "grad_norm": 2.9416961669921875,
+ "learning_rate": 4.923492604614627e-06,
+ "loss": 0.6293,
+ "step": 1094
+ },
+ {
+ "epoch": 0.5177304964539007,
+ "grad_norm": 2.761780023574829,
+ "learning_rate": 4.923339380754741e-06,
+ "loss": 0.649,
+ "step": 1095
+ },
+ {
+ "epoch": 0.5182033096926714,
+ "grad_norm": 2.7648792266845703,
+ "learning_rate": 4.923186006004138e-06,
+ "loss": 0.5906,
+ "step": 1096
+ },
+ {
+ "epoch": 0.518676122931442,
+ "grad_norm": 3.5535428524017334,
+ "learning_rate": 4.923032480372367e-06,
+ "loss": 0.7138,
+ "step": 1097
+ },
+ {
+ "epoch": 0.5191489361702127,
+ "grad_norm": 2.6252479553222656,
+ "learning_rate": 4.922878803868988e-06,
+ "loss": 0.5499,
+ "step": 1098
+ },
+ {
+ "epoch": 0.5196217494089834,
+ "grad_norm": 2.901002883911133,
+ "learning_rate": 4.9227249765035715e-06,
+ "loss": 0.6991,
+ "step": 1099
+ },
+ {
+ "epoch": 0.5200945626477541,
+ "grad_norm": 2.621877431869507,
+ "learning_rate": 4.9225709982856925e-06,
+ "loss": 0.6269,
+ "step": 1100
+ },
+ {
+ "epoch": 0.5205673758865248,
+ "grad_norm": 2.872483015060425,
+ "learning_rate": 4.92241686922494e-06,
+ "loss": 0.6657,
+ "step": 1101
+ },
+ {
+ "epoch": 0.5210401891252955,
+ "grad_norm": 2.730447769165039,
+ "learning_rate": 4.922262589330912e-06,
+ "loss": 0.6061,
+ "step": 1102
+ },
+ {
+ "epoch": 0.5215130023640662,
+ "grad_norm": 2.646247386932373,
+ "learning_rate": 4.922108158613213e-06,
+ "loss": 0.5923,
+ "step": 1103
+ },
+ {
+ "epoch": 0.5219858156028369,
+ "grad_norm": 2.6488895416259766,
+ "learning_rate": 4.92195357708146e-06,
+ "loss": 0.6293,
+ "step": 1104
+ },
+ {
+ "epoch": 0.5224586288416075,
+ "grad_norm": 2.756338357925415,
+ "learning_rate": 4.921798844745278e-06,
+ "loss": 0.6374,
+ "step": 1105
+ },
+ {
+ "epoch": 0.5229314420803782,
+ "grad_norm": 3.1441280841827393,
+ "learning_rate": 4.921643961614301e-06,
+ "loss": 0.6652,
+ "step": 1106
+ },
+ {
+ "epoch": 0.5234042553191489,
+ "grad_norm": 3.050002098083496,
+ "learning_rate": 4.921488927698172e-06,
+ "loss": 0.6809,
+ "step": 1107
+ },
+ {
+ "epoch": 0.5238770685579196,
+ "grad_norm": 2.71750807762146,
+ "learning_rate": 4.921333743006547e-06,
+ "loss": 0.6266,
+ "step": 1108
+ },
+ {
+ "epoch": 0.5243498817966903,
+ "grad_norm": 2.8439245223999023,
+ "learning_rate": 4.921178407549086e-06,
+ "loss": 0.5663,
+ "step": 1109
+ },
+ {
+ "epoch": 0.524822695035461,
+ "grad_norm": 3.0722241401672363,
+ "learning_rate": 4.921022921335464e-06,
+ "loss": 0.6791,
+ "step": 1110
+ },
+ {
+ "epoch": 0.5252955082742317,
+ "grad_norm": 3.4381656646728516,
+ "learning_rate": 4.920867284375358e-06,
+ "loss": 0.6687,
+ "step": 1111
+ },
+ {
+ "epoch": 0.5257683215130023,
+ "grad_norm": 2.819812774658203,
+ "learning_rate": 4.920711496678463e-06,
+ "loss": 0.6299,
+ "step": 1112
+ },
+ {
+ "epoch": 0.526241134751773,
+ "grad_norm": 3.6587414741516113,
+ "learning_rate": 4.9205555582544765e-06,
+ "loss": 0.7392,
+ "step": 1113
+ },
+ {
+ "epoch": 0.5267139479905437,
+ "grad_norm": 2.774296522140503,
+ "learning_rate": 4.920399469113109e-06,
+ "loss": 0.6652,
+ "step": 1114
+ },
+ {
+ "epoch": 0.5271867612293144,
+ "grad_norm": 2.7480580806732178,
+ "learning_rate": 4.920243229264081e-06,
+ "loss": 0.596,
+ "step": 1115
+ },
+ {
+ "epoch": 0.5276595744680851,
+ "grad_norm": 3.213057518005371,
+ "learning_rate": 4.920086838717119e-06,
+ "loss": 0.6986,
+ "step": 1116
+ },
+ {
+ "epoch": 0.5281323877068558,
+ "grad_norm": 2.940546989440918,
+ "learning_rate": 4.919930297481962e-06,
+ "loss": 0.6481,
+ "step": 1117
+ },
+ {
+ "epoch": 0.5286052009456265,
+ "grad_norm": 2.5970494747161865,
+ "learning_rate": 4.9197736055683555e-06,
+ "loss": 0.5658,
+ "step": 1118
+ },
+ {
+ "epoch": 0.5290780141843971,
+ "grad_norm": 4.49385404586792,
+ "learning_rate": 4.919616762986057e-06,
+ "loss": 0.605,
+ "step": 1119
+ },
+ {
+ "epoch": 0.5295508274231678,
+ "grad_norm": 2.971857786178589,
+ "learning_rate": 4.919459769744833e-06,
+ "loss": 0.6539,
+ "step": 1120
+ },
+ {
+ "epoch": 0.5300236406619385,
+ "grad_norm": 2.6192965507507324,
+ "learning_rate": 4.919302625854457e-06,
+ "loss": 0.6226,
+ "step": 1121
+ },
+ {
+ "epoch": 0.5304964539007092,
+ "grad_norm": 2.665088176727295,
+ "learning_rate": 4.919145331324716e-06,
+ "loss": 0.6647,
+ "step": 1122
+ },
+ {
+ "epoch": 0.5309692671394799,
+ "grad_norm": 2.612126111984253,
+ "learning_rate": 4.918987886165403e-06,
+ "loss": 0.6965,
+ "step": 1123
+ },
+ {
+ "epoch": 0.5314420803782506,
+ "grad_norm": 3.80017352104187,
+ "learning_rate": 4.9188302903863205e-06,
+ "loss": 0.7396,
+ "step": 1124
+ },
+ {
+ "epoch": 0.5319148936170213,
+ "grad_norm": 2.781752824783325,
+ "learning_rate": 4.918672543997282e-06,
+ "loss": 0.5985,
+ "step": 1125
+ },
+ {
+ "epoch": 0.532387706855792,
+ "grad_norm": 2.6067914962768555,
+ "learning_rate": 4.91851464700811e-06,
+ "loss": 0.6159,
+ "step": 1126
+ },
+ {
+ "epoch": 0.5328605200945626,
+ "grad_norm": 2.670807123184204,
+ "learning_rate": 4.918356599428636e-06,
+ "loss": 0.5958,
+ "step": 1127
+ },
+ {
+ "epoch": 0.5333333333333333,
+ "grad_norm": 2.608611822128296,
+ "learning_rate": 4.9181984012687e-06,
+ "loss": 0.5768,
+ "step": 1128
+ },
+ {
+ "epoch": 0.533806146572104,
+ "grad_norm": 2.586764097213745,
+ "learning_rate": 4.918040052538154e-06,
+ "loss": 0.661,
+ "step": 1129
+ },
+ {
+ "epoch": 0.5342789598108747,
+ "grad_norm": 3.1317451000213623,
+ "learning_rate": 4.917881553246856e-06,
+ "loss": 0.6626,
+ "step": 1130
+ },
+ {
+ "epoch": 0.5347517730496454,
+ "grad_norm": 2.7135281562805176,
+ "learning_rate": 4.917722903404676e-06,
+ "loss": 0.6572,
+ "step": 1131
+ },
+ {
+ "epoch": 0.5352245862884161,
+ "grad_norm": 3.4546358585357666,
+ "learning_rate": 4.917564103021493e-06,
+ "loss": 0.5597,
+ "step": 1132
+ },
+ {
+ "epoch": 0.5356973995271868,
+ "grad_norm": 3.0943493843078613,
+ "learning_rate": 4.917405152107193e-06,
+ "loss": 0.7258,
+ "step": 1133
+ },
+ {
+ "epoch": 0.5361702127659574,
+ "grad_norm": 2.6069352626800537,
+ "learning_rate": 4.917246050671674e-06,
+ "loss": 0.6209,
+ "step": 1134
+ },
+ {
+ "epoch": 0.5366430260047281,
+ "grad_norm": 2.584883689880371,
+ "learning_rate": 4.917086798724844e-06,
+ "loss": 0.658,
+ "step": 1135
+ },
+ {
+ "epoch": 0.5371158392434988,
+ "grad_norm": 3.001976490020752,
+ "learning_rate": 4.9169273962766166e-06,
+ "loss": 0.6306,
+ "step": 1136
+ },
+ {
+ "epoch": 0.5375886524822695,
+ "grad_norm": 2.5013928413391113,
+ "learning_rate": 4.916767843336918e-06,
+ "loss": 0.572,
+ "step": 1137
+ },
+ {
+ "epoch": 0.5380614657210402,
+ "grad_norm": 2.9114553928375244,
+ "learning_rate": 4.916608139915684e-06,
+ "loss": 0.5841,
+ "step": 1138
+ },
+ {
+ "epoch": 0.5385342789598109,
+ "grad_norm": 2.8878467082977295,
+ "learning_rate": 4.9164482860228564e-06,
+ "loss": 0.6654,
+ "step": 1139
+ },
+ {
+ "epoch": 0.5390070921985816,
+ "grad_norm": 2.9827866554260254,
+ "learning_rate": 4.91628828166839e-06,
+ "loss": 0.6674,
+ "step": 1140
+ },
+ {
+ "epoch": 0.5394799054373522,
+ "grad_norm": 3.8696281909942627,
+ "learning_rate": 4.916128126862248e-06,
+ "loss": 0.6241,
+ "step": 1141
+ },
+ {
+ "epoch": 0.5399527186761229,
+ "grad_norm": 2.9556291103363037,
+ "learning_rate": 4.915967821614402e-06,
+ "loss": 0.6478,
+ "step": 1142
+ },
+ {
+ "epoch": 0.5404255319148936,
+ "grad_norm": 2.392942428588867,
+ "learning_rate": 4.915807365934834e-06,
+ "loss": 0.6097,
+ "step": 1143
+ },
+ {
+ "epoch": 0.5408983451536643,
+ "grad_norm": 3.032235860824585,
+ "learning_rate": 4.915646759833534e-06,
+ "loss": 0.7193,
+ "step": 1144
+ },
+ {
+ "epoch": 0.541371158392435,
+ "grad_norm": 2.840416193008423,
+ "learning_rate": 4.915486003320501e-06,
+ "loss": 0.5506,
+ "step": 1145
+ },
+ {
+ "epoch": 0.5418439716312057,
+ "grad_norm": 2.5438895225524902,
+ "learning_rate": 4.915325096405747e-06,
+ "loss": 0.6487,
+ "step": 1146
+ },
+ {
+ "epoch": 0.5423167848699764,
+ "grad_norm": 2.544334650039673,
+ "learning_rate": 4.9151640390992905e-06,
+ "loss": 0.6168,
+ "step": 1147
+ },
+ {
+ "epoch": 0.542789598108747,
+ "grad_norm": 2.8535678386688232,
+ "learning_rate": 4.91500283141116e-06,
+ "loss": 0.678,
+ "step": 1148
+ },
+ {
+ "epoch": 0.5432624113475177,
+ "grad_norm": 2.8086955547332764,
+ "learning_rate": 4.9148414733513915e-06,
+ "loss": 0.6473,
+ "step": 1149
+ },
+ {
+ "epoch": 0.5437352245862884,
+ "grad_norm": 2.4709885120391846,
+ "learning_rate": 4.914679964930034e-06,
+ "loss": 0.6797,
+ "step": 1150
+ },
+ {
+ "epoch": 0.5442080378250591,
+ "grad_norm": 2.8546934127807617,
+ "learning_rate": 4.9145183061571435e-06,
+ "loss": 0.6247,
+ "step": 1151
+ },
+ {
+ "epoch": 0.5446808510638298,
+ "grad_norm": 2.991184711456299,
+ "learning_rate": 4.9143564970427844e-06,
+ "loss": 0.5977,
+ "step": 1152
+ },
+ {
+ "epoch": 0.5451536643026005,
+ "grad_norm": 3.011216402053833,
+ "learning_rate": 4.914194537597033e-06,
+ "loss": 0.7005,
+ "step": 1153
+ },
+ {
+ "epoch": 0.5456264775413712,
+ "grad_norm": 2.807521343231201,
+ "learning_rate": 4.9140324278299744e-06,
+ "loss": 0.5412,
+ "step": 1154
+ },
+ {
+ "epoch": 0.5460992907801419,
+ "grad_norm": 3.0401229858398438,
+ "learning_rate": 4.913870167751701e-06,
+ "loss": 0.6394,
+ "step": 1155
+ },
+ {
+ "epoch": 0.5465721040189125,
+ "grad_norm": 2.853914976119995,
+ "learning_rate": 4.913707757372317e-06,
+ "loss": 0.6745,
+ "step": 1156
+ },
+ {
+ "epoch": 0.5470449172576832,
+ "grad_norm": 4.505620956420898,
+ "learning_rate": 4.913545196701935e-06,
+ "loss": 0.6668,
+ "step": 1157
+ },
+ {
+ "epoch": 0.5475177304964539,
+ "grad_norm": 3.0505781173706055,
+ "learning_rate": 4.913382485750676e-06,
+ "loss": 0.6926,
+ "step": 1158
+ },
+ {
+ "epoch": 0.5479905437352246,
+ "grad_norm": 2.798435688018799,
+ "learning_rate": 4.913219624528672e-06,
+ "loss": 0.605,
+ "step": 1159
+ },
+ {
+ "epoch": 0.5484633569739953,
+ "grad_norm": 2.7814908027648926,
+ "learning_rate": 4.913056613046065e-06,
+ "loss": 0.6678,
+ "step": 1160
+ },
+ {
+ "epoch": 0.548936170212766,
+ "grad_norm": 3.2089321613311768,
+ "learning_rate": 4.9128934513130025e-06,
+ "loss": 0.5995,
+ "step": 1161
+ },
+ {
+ "epoch": 0.5494089834515367,
+ "grad_norm": 2.7699952125549316,
+ "learning_rate": 4.9127301393396455e-06,
+ "loss": 0.7062,
+ "step": 1162
+ },
+ {
+ "epoch": 0.5498817966903073,
+ "grad_norm": 2.859368324279785,
+ "learning_rate": 4.912566677136162e-06,
+ "loss": 0.6063,
+ "step": 1163
+ },
+ {
+ "epoch": 0.550354609929078,
+ "grad_norm": 2.727334499359131,
+ "learning_rate": 4.91240306471273e-06,
+ "loss": 0.6848,
+ "step": 1164
+ },
+ {
+ "epoch": 0.5508274231678487,
+ "grad_norm": 2.6017510890960693,
+ "learning_rate": 4.912239302079537e-06,
+ "loss": 0.5808,
+ "step": 1165
+ },
+ {
+ "epoch": 0.5513002364066194,
+ "grad_norm": 3.539583206176758,
+ "learning_rate": 4.912075389246781e-06,
+ "loss": 0.7053,
+ "step": 1166
+ },
+ {
+ "epoch": 0.5517730496453901,
+ "grad_norm": 2.918280601501465,
+ "learning_rate": 4.911911326224666e-06,
+ "loss": 0.5904,
+ "step": 1167
+ },
+ {
+ "epoch": 0.5522458628841608,
+ "grad_norm": 3.0067362785339355,
+ "learning_rate": 4.9117471130234095e-06,
+ "loss": 0.6392,
+ "step": 1168
+ },
+ {
+ "epoch": 0.5527186761229315,
+ "grad_norm": 2.4374797344207764,
+ "learning_rate": 4.911582749653236e-06,
+ "loss": 0.5793,
+ "step": 1169
+ },
+ {
+ "epoch": 0.5531914893617021,
+ "grad_norm": 3.121182918548584,
+ "learning_rate": 4.911418236124378e-06,
+ "loss": 0.6636,
+ "step": 1170
+ },
+ {
+ "epoch": 0.5536643026004728,
+ "grad_norm": 3.1289851665496826,
+ "learning_rate": 4.91125357244708e-06,
+ "loss": 0.656,
+ "step": 1171
+ },
+ {
+ "epoch": 0.5541371158392435,
+ "grad_norm": 2.7034592628479004,
+ "learning_rate": 4.911088758631596e-06,
+ "loss": 0.6001,
+ "step": 1172
+ },
+ {
+ "epoch": 0.5546099290780142,
+ "grad_norm": 2.710146188735962,
+ "learning_rate": 4.910923794688187e-06,
+ "loss": 0.6007,
+ "step": 1173
+ },
+ {
+ "epoch": 0.5550827423167849,
+ "grad_norm": 2.5424487590789795,
+ "learning_rate": 4.910758680627124e-06,
+ "loss": 0.5193,
+ "step": 1174
+ },
+ {
+ "epoch": 0.5555555555555556,
+ "grad_norm": 2.615893602371216,
+ "learning_rate": 4.91059341645869e-06,
+ "loss": 0.5525,
+ "step": 1175
+ },
+ {
+ "epoch": 0.5560283687943263,
+ "grad_norm": 3.3179728984832764,
+ "learning_rate": 4.910428002193174e-06,
+ "loss": 0.7285,
+ "step": 1176
+ },
+ {
+ "epoch": 0.556501182033097,
+ "grad_norm": 2.7234175205230713,
+ "learning_rate": 4.910262437840875e-06,
+ "loss": 0.574,
+ "step": 1177
+ },
+ {
+ "epoch": 0.5569739952718676,
+ "grad_norm": 3.0416605472564697,
+ "learning_rate": 4.9100967234121034e-06,
+ "loss": 0.5623,
+ "step": 1178
+ },
+ {
+ "epoch": 0.5574468085106383,
+ "grad_norm": 3.067786455154419,
+ "learning_rate": 4.909930858917177e-06,
+ "loss": 0.6491,
+ "step": 1179
+ },
+ {
+ "epoch": 0.557919621749409,
+ "grad_norm": 3.0037379264831543,
+ "learning_rate": 4.909764844366422e-06,
+ "loss": 0.5696,
+ "step": 1180
+ },
+ {
+ "epoch": 0.5583924349881797,
+ "grad_norm": 2.966179609298706,
+ "learning_rate": 4.909598679770178e-06,
+ "loss": 0.6042,
+ "step": 1181
+ },
+ {
+ "epoch": 0.5588652482269504,
+ "grad_norm": 2.6000657081604004,
+ "learning_rate": 4.909432365138789e-06,
+ "loss": 0.5883,
+ "step": 1182
+ },
+ {
+ "epoch": 0.5593380614657211,
+ "grad_norm": 2.6794495582580566,
+ "learning_rate": 4.909265900482612e-06,
+ "loss": 0.6809,
+ "step": 1183
+ },
+ {
+ "epoch": 0.5598108747044918,
+ "grad_norm": 2.6765122413635254,
+ "learning_rate": 4.9090992858120115e-06,
+ "loss": 0.6601,
+ "step": 1184
+ },
+ {
+ "epoch": 0.5602836879432624,
+ "grad_norm": 2.6051928997039795,
+ "learning_rate": 4.908932521137363e-06,
+ "loss": 0.5946,
+ "step": 1185
+ },
+ {
+ "epoch": 0.5607565011820331,
+ "grad_norm": 3.0405542850494385,
+ "learning_rate": 4.908765606469048e-06,
+ "loss": 0.6998,
+ "step": 1186
+ },
+ {
+ "epoch": 0.5612293144208038,
+ "grad_norm": 2.7975668907165527,
+ "learning_rate": 4.908598541817462e-06,
+ "loss": 0.6218,
+ "step": 1187
+ },
+ {
+ "epoch": 0.5617021276595745,
+ "grad_norm": 2.5367627143859863,
+ "learning_rate": 4.908431327193005e-06,
+ "loss": 0.6354,
+ "step": 1188
+ },
+ {
+ "epoch": 0.5621749408983452,
+ "grad_norm": 3.7939631938934326,
+ "learning_rate": 4.908263962606091e-06,
+ "loss": 0.6376,
+ "step": 1189
+ },
+ {
+ "epoch": 0.5626477541371159,
+ "grad_norm": 2.864079475402832,
+ "learning_rate": 4.908096448067139e-06,
+ "loss": 0.5485,
+ "step": 1190
+ },
+ {
+ "epoch": 0.5631205673758866,
+ "grad_norm": 2.7855563163757324,
+ "learning_rate": 4.9079287835865804e-06,
+ "loss": 0.6645,
+ "step": 1191
+ },
+ {
+ "epoch": 0.5635933806146572,
+ "grad_norm": 2.6156625747680664,
+ "learning_rate": 4.9077609691748556e-06,
+ "loss": 0.5751,
+ "step": 1192
+ },
+ {
+ "epoch": 0.5640661938534279,
+ "grad_norm": 3.0475659370422363,
+ "learning_rate": 4.907593004842412e-06,
+ "loss": 0.6739,
+ "step": 1193
+ },
+ {
+ "epoch": 0.5645390070921986,
+ "grad_norm": 2.9176738262176514,
+ "learning_rate": 4.9074248905997104e-06,
+ "loss": 0.6493,
+ "step": 1194
+ },
+ {
+ "epoch": 0.5650118203309693,
+ "grad_norm": 2.6168384552001953,
+ "learning_rate": 4.907256626457216e-06,
+ "loss": 0.6154,
+ "step": 1195
+ },
+ {
+ "epoch": 0.56548463356974,
+ "grad_norm": 2.893980026245117,
+ "learning_rate": 4.907088212425408e-06,
+ "loss": 0.5808,
+ "step": 1196
+ },
+ {
+ "epoch": 0.5659574468085107,
+ "grad_norm": 3.3832836151123047,
+ "learning_rate": 4.90691964851477e-06,
+ "loss": 0.7888,
+ "step": 1197
+ },
+ {
+ "epoch": 0.5664302600472814,
+ "grad_norm": 3.088932752609253,
+ "learning_rate": 4.906750934735801e-06,
+ "loss": 0.6516,
+ "step": 1198
+ },
+ {
+ "epoch": 0.566903073286052,
+ "grad_norm": 2.494471549987793,
+ "learning_rate": 4.906582071099004e-06,
+ "loss": 0.6286,
+ "step": 1199
+ },
+ {
+ "epoch": 0.5673758865248227,
+ "grad_norm": 2.716550588607788,
+ "learning_rate": 4.906413057614895e-06,
+ "loss": 0.5939,
+ "step": 1200
+ },
+ {
+ "epoch": 0.5678486997635934,
+ "grad_norm": 2.5821073055267334,
+ "learning_rate": 4.906243894293995e-06,
+ "loss": 0.6668,
+ "step": 1201
+ },
+ {
+ "epoch": 0.5683215130023641,
+ "grad_norm": 3.651787042617798,
+ "learning_rate": 4.90607458114684e-06,
+ "loss": 0.6124,
+ "step": 1202
+ },
+ {
+ "epoch": 0.5687943262411348,
+ "grad_norm": 2.7567858695983887,
+ "learning_rate": 4.9059051181839705e-06,
+ "loss": 0.6656,
+ "step": 1203
+ },
+ {
+ "epoch": 0.5692671394799055,
+ "grad_norm": 2.8067586421966553,
+ "learning_rate": 4.90573550541594e-06,
+ "loss": 0.6306,
+ "step": 1204
+ },
+ {
+ "epoch": 0.5697399527186762,
+ "grad_norm": 2.6136393547058105,
+ "learning_rate": 4.905565742853307e-06,
+ "loss": 0.5992,
+ "step": 1205
+ },
+ {
+ "epoch": 0.5702127659574469,
+ "grad_norm": 2.899049758911133,
+ "learning_rate": 4.905395830506644e-06,
+ "loss": 0.621,
+ "step": 1206
+ },
+ {
+ "epoch": 0.5706855791962175,
+ "grad_norm": 3.036583185195923,
+ "learning_rate": 4.9052257683865294e-06,
+ "loss": 0.652,
+ "step": 1207
+ },
+ {
+ "epoch": 0.5711583924349882,
+ "grad_norm": 2.7947216033935547,
+ "learning_rate": 4.905055556503553e-06,
+ "loss": 0.6636,
+ "step": 1208
+ },
+ {
+ "epoch": 0.5716312056737589,
+ "grad_norm": 3.1646955013275146,
+ "learning_rate": 4.9048851948683135e-06,
+ "loss": 0.6376,
+ "step": 1209
+ },
+ {
+ "epoch": 0.5721040189125296,
+ "grad_norm": 2.8175766468048096,
+ "learning_rate": 4.904714683491417e-06,
+ "loss": 0.5929,
+ "step": 1210
+ },
+ {
+ "epoch": 0.5725768321513003,
+ "grad_norm": 2.923923969268799,
+ "learning_rate": 4.904544022383483e-06,
+ "loss": 0.6633,
+ "step": 1211
+ },
+ {
+ "epoch": 0.573049645390071,
+ "grad_norm": 2.7471134662628174,
+ "learning_rate": 4.9043732115551356e-06,
+ "loss": 0.6551,
+ "step": 1212
+ },
+ {
+ "epoch": 0.5735224586288417,
+ "grad_norm": 2.8660807609558105,
+ "learning_rate": 4.90420225101701e-06,
+ "loss": 0.6423,
+ "step": 1213
+ },
+ {
+ "epoch": 0.5739952718676123,
+ "grad_norm": 2.769247531890869,
+ "learning_rate": 4.904031140779754e-06,
+ "loss": 0.5982,
+ "step": 1214
+ },
+ {
+ "epoch": 0.574468085106383,
+ "grad_norm": 2.9043145179748535,
+ "learning_rate": 4.90385988085402e-06,
+ "loss": 0.5843,
+ "step": 1215
+ },
+ {
+ "epoch": 0.5749408983451537,
+ "grad_norm": 2.6639609336853027,
+ "learning_rate": 4.903688471250471e-06,
+ "loss": 0.5858,
+ "step": 1216
+ },
+ {
+ "epoch": 0.5754137115839244,
+ "grad_norm": 2.6967573165893555,
+ "learning_rate": 4.903516911979781e-06,
+ "loss": 0.5755,
+ "step": 1217
+ },
+ {
+ "epoch": 0.5758865248226951,
+ "grad_norm": 2.8865857124328613,
+ "learning_rate": 4.903345203052633e-06,
+ "loss": 0.6051,
+ "step": 1218
+ },
+ {
+ "epoch": 0.5763593380614658,
+ "grad_norm": 2.381979465484619,
+ "learning_rate": 4.903173344479717e-06,
+ "loss": 0.5727,
+ "step": 1219
+ },
+ {
+ "epoch": 0.5768321513002365,
+ "grad_norm": 2.7717981338500977,
+ "learning_rate": 4.903001336271734e-06,
+ "loss": 0.6406,
+ "step": 1220
+ },
+ {
+ "epoch": 0.577304964539007,
+ "grad_norm": 2.6431570053100586,
+ "learning_rate": 4.902829178439395e-06,
+ "loss": 0.6226,
+ "step": 1221
+ },
+ {
+ "epoch": 0.5777777777777777,
+ "grad_norm": 2.8090415000915527,
+ "learning_rate": 4.902656870993419e-06,
+ "loss": 0.5761,
+ "step": 1222
+ },
+ {
+ "epoch": 0.5782505910165484,
+ "grad_norm": 2.4769368171691895,
+ "learning_rate": 4.902484413944535e-06,
+ "loss": 0.5602,
+ "step": 1223
+ },
+ {
+ "epoch": 0.5787234042553191,
+ "grad_norm": 2.693316698074341,
+ "learning_rate": 4.902311807303481e-06,
+ "loss": 0.5222,
+ "step": 1224
+ },
+ {
+ "epoch": 0.5791962174940898,
+ "grad_norm": 2.7623913288116455,
+ "learning_rate": 4.902139051081004e-06,
+ "loss": 0.6978,
+ "step": 1225
+ },
+ {
+ "epoch": 0.5796690307328605,
+ "grad_norm": 2.6133766174316406,
+ "learning_rate": 4.901966145287863e-06,
+ "loss": 0.5802,
+ "step": 1226
+ },
+ {
+ "epoch": 0.5801418439716312,
+ "grad_norm": 2.7345972061157227,
+ "learning_rate": 4.901793089934821e-06,
+ "loss": 0.6294,
+ "step": 1227
+ },
+ {
+ "epoch": 0.5806146572104018,
+ "grad_norm": 2.7545835971832275,
+ "learning_rate": 4.9016198850326555e-06,
+ "loss": 0.6085,
+ "step": 1228
+ },
+ {
+ "epoch": 0.5810874704491725,
+ "grad_norm": 2.6947758197784424,
+ "learning_rate": 4.90144653059215e-06,
+ "loss": 0.6025,
+ "step": 1229
+ },
+ {
+ "epoch": 0.5815602836879432,
+ "grad_norm": 2.692967414855957,
+ "learning_rate": 4.901273026624099e-06,
+ "loss": 0.5715,
+ "step": 1230
+ },
+ {
+ "epoch": 0.5820330969267139,
+ "grad_norm": 2.78347110748291,
+ "learning_rate": 4.901099373139307e-06,
+ "loss": 0.6063,
+ "step": 1231
+ },
+ {
+ "epoch": 0.5825059101654846,
+ "grad_norm": 2.346496343612671,
+ "learning_rate": 4.900925570148585e-06,
+ "loss": 0.5869,
+ "step": 1232
+ },
+ {
+ "epoch": 0.5829787234042553,
+ "grad_norm": 2.606639862060547,
+ "learning_rate": 4.900751617662755e-06,
+ "loss": 0.6197,
+ "step": 1233
+ },
+ {
+ "epoch": 0.583451536643026,
+ "grad_norm": 2.5825929641723633,
+ "learning_rate": 4.900577515692649e-06,
+ "loss": 0.6721,
+ "step": 1234
+ },
+ {
+ "epoch": 0.5839243498817966,
+ "grad_norm": 2.731349468231201,
+ "learning_rate": 4.900403264249107e-06,
+ "loss": 0.6273,
+ "step": 1235
+ },
+ {
+ "epoch": 0.5843971631205673,
+ "grad_norm": 3.2133874893188477,
+ "learning_rate": 4.90022886334298e-06,
+ "loss": 0.6231,
+ "step": 1236
+ },
+ {
+ "epoch": 0.584869976359338,
+ "grad_norm": 2.9213852882385254,
+ "learning_rate": 4.900054312985127e-06,
+ "loss": 0.6677,
+ "step": 1237
+ },
+ {
+ "epoch": 0.5853427895981087,
+ "grad_norm": 2.815425157546997,
+ "learning_rate": 4.899879613186414e-06,
+ "loss": 0.6405,
+ "step": 1238
+ },
+ {
+ "epoch": 0.5858156028368794,
+ "grad_norm": 2.730782985687256,
+ "learning_rate": 4.899704763957721e-06,
+ "loss": 0.6233,
+ "step": 1239
+ },
+ {
+ "epoch": 0.5862884160756501,
+ "grad_norm": 2.6432766914367676,
+ "learning_rate": 4.899529765309936e-06,
+ "loss": 0.6267,
+ "step": 1240
+ },
+ {
+ "epoch": 0.5867612293144208,
+ "grad_norm": 2.616215229034424,
+ "learning_rate": 4.899354617253953e-06,
+ "loss": 0.6268,
+ "step": 1241
+ },
+ {
+ "epoch": 0.5872340425531914,
+ "grad_norm": 2.7630255222320557,
+ "learning_rate": 4.899179319800679e-06,
+ "loss": 0.6348,
+ "step": 1242
+ },
+ {
+ "epoch": 0.5877068557919621,
+ "grad_norm": 2.785095453262329,
+ "learning_rate": 4.899003872961029e-06,
+ "loss": 0.5839,
+ "step": 1243
+ },
+ {
+ "epoch": 0.5881796690307328,
+ "grad_norm": 2.9050328731536865,
+ "learning_rate": 4.898828276745927e-06,
+ "loss": 0.651,
+ "step": 1244
+ },
+ {
+ "epoch": 0.5886524822695035,
+ "grad_norm": 2.958092212677002,
+ "learning_rate": 4.8986525311663065e-06,
+ "loss": 0.6395,
+ "step": 1245
+ },
+ {
+ "epoch": 0.5891252955082742,
+ "grad_norm": 2.952310800552368,
+ "learning_rate": 4.898476636233111e-06,
+ "loss": 0.6731,
+ "step": 1246
+ },
+ {
+ "epoch": 0.5895981087470449,
+ "grad_norm": 2.9876346588134766,
+ "learning_rate": 4.898300591957293e-06,
+ "loss": 0.7015,
+ "step": 1247
+ },
+ {
+ "epoch": 0.5900709219858156,
+ "grad_norm": 2.8941752910614014,
+ "learning_rate": 4.898124398349813e-06,
+ "loss": 0.6452,
+ "step": 1248
+ },
+ {
+ "epoch": 0.5905437352245863,
+ "grad_norm": 2.9809536933898926,
+ "learning_rate": 4.897948055421642e-06,
+ "loss": 0.5736,
+ "step": 1249
+ },
+ {
+ "epoch": 0.5910165484633569,
+ "grad_norm": 2.927046775817871,
+ "learning_rate": 4.897771563183761e-06,
+ "loss": 0.5918,
+ "step": 1250
+ },
+ {
+ "epoch": 0.5914893617021276,
+ "grad_norm": 2.865020275115967,
+ "learning_rate": 4.897594921647158e-06,
+ "loss": 0.6924,
+ "step": 1251
+ },
+ {
+ "epoch": 0.5919621749408983,
+ "grad_norm": 2.7406699657440186,
+ "learning_rate": 4.897418130822832e-06,
+ "loss": 0.509,
+ "step": 1252
+ },
+ {
+ "epoch": 0.592434988179669,
+ "grad_norm": 2.781606912612915,
+ "learning_rate": 4.897241190721791e-06,
+ "loss": 0.5555,
+ "step": 1253
+ },
+ {
+ "epoch": 0.5929078014184397,
+ "grad_norm": 2.79209303855896,
+ "learning_rate": 4.8970641013550535e-06,
+ "loss": 0.6722,
+ "step": 1254
+ },
+ {
+ "epoch": 0.5933806146572104,
+ "grad_norm": 3.0672268867492676,
+ "learning_rate": 4.896886862733645e-06,
+ "loss": 0.6366,
+ "step": 1255
+ },
+ {
+ "epoch": 0.5938534278959811,
+ "grad_norm": 2.7456953525543213,
+ "learning_rate": 4.896709474868602e-06,
+ "loss": 0.6246,
+ "step": 1256
+ },
+ {
+ "epoch": 0.5943262411347517,
+ "grad_norm": 3.6731202602386475,
+ "learning_rate": 4.896531937770968e-06,
+ "loss": 0.668,
+ "step": 1257
+ },
+ {
+ "epoch": 0.5947990543735224,
+ "grad_norm": 2.6056087017059326,
+ "learning_rate": 4.8963542514518e-06,
+ "loss": 0.5815,
+ "step": 1258
+ },
+ {
+ "epoch": 0.5952718676122931,
+ "grad_norm": 2.719698905944824,
+ "learning_rate": 4.89617641592216e-06,
+ "loss": 0.6058,
+ "step": 1259
+ },
+ {
+ "epoch": 0.5957446808510638,
+ "grad_norm": 2.625838279724121,
+ "learning_rate": 4.895998431193121e-06,
+ "loss": 0.6143,
+ "step": 1260
+ },
+ {
+ "epoch": 0.5962174940898345,
+ "grad_norm": 2.7166085243225098,
+ "learning_rate": 4.895820297275767e-06,
+ "loss": 0.5187,
+ "step": 1261
+ },
+ {
+ "epoch": 0.5966903073286052,
+ "grad_norm": 2.7544102668762207,
+ "learning_rate": 4.8956420141811875e-06,
+ "loss": 0.5928,
+ "step": 1262
+ },
+ {
+ "epoch": 0.5971631205673759,
+ "grad_norm": 2.6678333282470703,
+ "learning_rate": 4.895463581920484e-06,
+ "loss": 0.611,
+ "step": 1263
+ },
+ {
+ "epoch": 0.5976359338061465,
+ "grad_norm": 2.853384494781494,
+ "learning_rate": 4.895285000504768e-06,
+ "loss": 0.642,
+ "step": 1264
+ },
+ {
+ "epoch": 0.5981087470449172,
+ "grad_norm": 2.637852430343628,
+ "learning_rate": 4.895106269945158e-06,
+ "loss": 0.6308,
+ "step": 1265
+ },
+ {
+ "epoch": 0.5985815602836879,
+ "grad_norm": 2.9880387783050537,
+ "learning_rate": 4.8949273902527826e-06,
+ "loss": 0.5781,
+ "step": 1266
+ },
+ {
+ "epoch": 0.5990543735224586,
+ "grad_norm": 3.5984015464782715,
+ "learning_rate": 4.89474836143878e-06,
+ "loss": 0.5865,
+ "step": 1267
+ },
+ {
+ "epoch": 0.5995271867612293,
+ "grad_norm": 2.719855546951294,
+ "learning_rate": 4.8945691835142975e-06,
+ "loss": 0.6393,
+ "step": 1268
+ },
+ {
+ "epoch": 0.6,
+ "grad_norm": 2.7885141372680664,
+ "learning_rate": 4.894389856490492e-06,
+ "loss": 0.66,
+ "step": 1269
+ },
+ {
+ "epoch": 0.6004728132387707,
+ "grad_norm": 2.698819875717163,
+ "learning_rate": 4.894210380378529e-06,
+ "loss": 0.6144,
+ "step": 1270
+ },
+ {
+ "epoch": 0.6009456264775414,
+ "grad_norm": 2.278045654296875,
+ "learning_rate": 4.894030755189584e-06,
+ "loss": 0.5609,
+ "step": 1271
+ },
+ {
+ "epoch": 0.601418439716312,
+ "grad_norm": 2.8729357719421387,
+ "learning_rate": 4.893850980934841e-06,
+ "loss": 0.6715,
+ "step": 1272
+ },
+ {
+ "epoch": 0.6018912529550827,
+ "grad_norm": 2.8541221618652344,
+ "learning_rate": 4.893671057625495e-06,
+ "loss": 0.6787,
+ "step": 1273
+ },
+ {
+ "epoch": 0.6023640661938534,
+ "grad_norm": 2.4561476707458496,
+ "learning_rate": 4.893490985272748e-06,
+ "loss": 0.6331,
+ "step": 1274
+ },
+ {
+ "epoch": 0.6028368794326241,
+ "grad_norm": 2.565739154815674,
+ "learning_rate": 4.893310763887812e-06,
+ "loss": 0.587,
+ "step": 1275
+ },
+ {
+ "epoch": 0.6033096926713948,
+ "grad_norm": 2.384951591491699,
+ "learning_rate": 4.8931303934819095e-06,
+ "loss": 0.5358,
+ "step": 1276
+ },
+ {
+ "epoch": 0.6037825059101655,
+ "grad_norm": 2.380808115005493,
+ "learning_rate": 4.89294987406627e-06,
+ "loss": 0.5402,
+ "step": 1277
+ },
+ {
+ "epoch": 0.6042553191489362,
+ "grad_norm": 2.764815092086792,
+ "learning_rate": 4.892769205652136e-06,
+ "loss": 0.6103,
+ "step": 1278
+ },
+ {
+ "epoch": 0.6047281323877068,
+ "grad_norm": 2.463350296020508,
+ "learning_rate": 4.892588388250754e-06,
+ "loss": 0.5937,
+ "step": 1279
+ },
+ {
+ "epoch": 0.6052009456264775,
+ "grad_norm": 3.099689245223999,
+ "learning_rate": 4.8924074218733855e-06,
+ "loss": 0.6354,
+ "step": 1280
+ },
+ {
+ "epoch": 0.6056737588652482,
+ "grad_norm": 2.804450035095215,
+ "learning_rate": 4.892226306531297e-06,
+ "loss": 0.6595,
+ "step": 1281
+ },
+ {
+ "epoch": 0.6061465721040189,
+ "grad_norm": 3.1559767723083496,
+ "learning_rate": 4.892045042235765e-06,
+ "loss": 0.6664,
+ "step": 1282
+ },
+ {
+ "epoch": 0.6066193853427896,
+ "grad_norm": 2.844341993331909,
+ "learning_rate": 4.891863628998079e-06,
+ "loss": 0.7454,
+ "step": 1283
+ },
+ {
+ "epoch": 0.6070921985815603,
+ "grad_norm": 2.686602830886841,
+ "learning_rate": 4.891682066829532e-06,
+ "loss": 0.6755,
+ "step": 1284
+ },
+ {
+ "epoch": 0.607565011820331,
+ "grad_norm": 2.736457347869873,
+ "learning_rate": 4.8915003557414285e-06,
+ "loss": 0.6305,
+ "step": 1285
+ },
+ {
+ "epoch": 0.6080378250591016,
+ "grad_norm": 2.661362409591675,
+ "learning_rate": 4.891318495745086e-06,
+ "loss": 0.5958,
+ "step": 1286
+ },
+ {
+ "epoch": 0.6085106382978723,
+ "grad_norm": 2.707348108291626,
+ "learning_rate": 4.8911364868518255e-06,
+ "loss": 0.5824,
+ "step": 1287
+ },
+ {
+ "epoch": 0.608983451536643,
+ "grad_norm": 2.9798858165740967,
+ "learning_rate": 4.890954329072981e-06,
+ "loss": 0.5981,
+ "step": 1288
+ },
+ {
+ "epoch": 0.6094562647754137,
+ "grad_norm": 2.6285455226898193,
+ "learning_rate": 4.890772022419895e-06,
+ "loss": 0.6194,
+ "step": 1289
+ },
+ {
+ "epoch": 0.6099290780141844,
+ "grad_norm": 2.9254322052001953,
+ "learning_rate": 4.890589566903917e-06,
+ "loss": 0.6002,
+ "step": 1290
+ },
+ {
+ "epoch": 0.6104018912529551,
+ "grad_norm": 2.6458325386047363,
+ "learning_rate": 4.89040696253641e-06,
+ "loss": 0.5457,
+ "step": 1291
+ },
+ {
+ "epoch": 0.6108747044917258,
+ "grad_norm": 2.508242607116699,
+ "learning_rate": 4.890224209328743e-06,
+ "loss": 0.6168,
+ "step": 1292
+ },
+ {
+ "epoch": 0.6113475177304964,
+ "grad_norm": 3.034785509109497,
+ "learning_rate": 4.890041307292296e-06,
+ "loss": 0.664,
+ "step": 1293
+ },
+ {
+ "epoch": 0.6118203309692671,
+ "grad_norm": 3.52469539642334,
+ "learning_rate": 4.889858256438455e-06,
+ "loss": 0.7301,
+ "step": 1294
+ },
+ {
+ "epoch": 0.6122931442080378,
+ "grad_norm": 2.9145348072052,
+ "learning_rate": 4.889675056778622e-06,
+ "loss": 0.6494,
+ "step": 1295
+ },
+ {
+ "epoch": 0.6127659574468085,
+ "grad_norm": 2.831829071044922,
+ "learning_rate": 4.8894917083242e-06,
+ "loss": 0.6064,
+ "step": 1296
+ },
+ {
+ "epoch": 0.6132387706855792,
+ "grad_norm": 2.6883130073547363,
+ "learning_rate": 4.889308211086608e-06,
+ "loss": 0.5642,
+ "step": 1297
+ },
+ {
+ "epoch": 0.6137115839243499,
+ "grad_norm": 3.0605485439300537,
+ "learning_rate": 4.889124565077269e-06,
+ "loss": 0.6695,
+ "step": 1298
+ },
+ {
+ "epoch": 0.6141843971631206,
+ "grad_norm": 3.44062876701355,
+ "learning_rate": 4.88894077030762e-06,
+ "loss": 0.6415,
+ "step": 1299
+ },
+ {
+ "epoch": 0.6146572104018913,
+ "grad_norm": 2.5970818996429443,
+ "learning_rate": 4.888756826789105e-06,
+ "loss": 0.6518,
+ "step": 1300
+ },
+ {
+ "epoch": 0.6151300236406619,
+ "grad_norm": 4.2233567237854,
+ "learning_rate": 4.8885727345331755e-06,
+ "loss": 0.6555,
+ "step": 1301
+ },
+ {
+ "epoch": 0.6156028368794326,
+ "grad_norm": 2.645385503768921,
+ "learning_rate": 4.888388493551297e-06,
+ "loss": 0.6762,
+ "step": 1302
+ },
+ {
+ "epoch": 0.6160756501182033,
+ "grad_norm": 2.907954454421997,
+ "learning_rate": 4.8882041038549385e-06,
+ "loss": 0.6526,
+ "step": 1303
+ },
+ {
+ "epoch": 0.616548463356974,
+ "grad_norm": 2.482771873474121,
+ "learning_rate": 4.888019565455583e-06,
+ "loss": 0.628,
+ "step": 1304
+ },
+ {
+ "epoch": 0.6170212765957447,
+ "grad_norm": 2.7165915966033936,
+ "learning_rate": 4.88783487836472e-06,
+ "loss": 0.5743,
+ "step": 1305
+ },
+ {
+ "epoch": 0.6174940898345154,
+ "grad_norm": 3.095627546310425,
+ "learning_rate": 4.88765004259385e-06,
+ "loss": 0.627,
+ "step": 1306
+ },
+ {
+ "epoch": 0.6179669030732861,
+ "grad_norm": 2.5018465518951416,
+ "learning_rate": 4.8874650581544805e-06,
+ "loss": 0.5215,
+ "step": 1307
+ },
+ {
+ "epoch": 0.6184397163120567,
+ "grad_norm": 3.094337224960327,
+ "learning_rate": 4.8872799250581316e-06,
+ "loss": 0.6979,
+ "step": 1308
+ },
+ {
+ "epoch": 0.6189125295508274,
+ "grad_norm": 3.1002209186553955,
+ "learning_rate": 4.887094643316329e-06,
+ "loss": 0.6565,
+ "step": 1309
+ },
+ {
+ "epoch": 0.6193853427895981,
+ "grad_norm": 2.551431894302368,
+ "learning_rate": 4.88690921294061e-06,
+ "loss": 0.5748,
+ "step": 1310
+ },
+ {
+ "epoch": 0.6198581560283688,
+ "grad_norm": 2.8282904624938965,
+ "learning_rate": 4.886723633942521e-06,
+ "loss": 0.676,
+ "step": 1311
+ },
+ {
+ "epoch": 0.6203309692671395,
+ "grad_norm": 2.8887810707092285,
+ "learning_rate": 4.886537906333617e-06,
+ "loss": 0.5971,
+ "step": 1312
+ },
+ {
+ "epoch": 0.6208037825059102,
+ "grad_norm": 2.9989118576049805,
+ "learning_rate": 4.886352030125462e-06,
+ "loss": 0.6341,
+ "step": 1313
+ },
+ {
+ "epoch": 0.6212765957446809,
+ "grad_norm": 2.8042776584625244,
+ "learning_rate": 4.886166005329629e-06,
+ "loss": 0.6578,
+ "step": 1314
+ },
+ {
+ "epoch": 0.6217494089834515,
+ "grad_norm": 2.4980967044830322,
+ "learning_rate": 4.8859798319577026e-06,
+ "loss": 0.6711,
+ "step": 1315
+ },
+ {
+ "epoch": 0.6222222222222222,
+ "grad_norm": 2.762369155883789,
+ "learning_rate": 4.885793510021274e-06,
+ "loss": 0.5747,
+ "step": 1316
+ },
+ {
+ "epoch": 0.6226950354609929,
+ "grad_norm": 3.136327028274536,
+ "learning_rate": 4.885607039531945e-06,
+ "loss": 0.7544,
+ "step": 1317
+ },
+ {
+ "epoch": 0.6231678486997636,
+ "grad_norm": 2.8736963272094727,
+ "learning_rate": 4.885420420501327e-06,
+ "loss": 0.6603,
+ "step": 1318
+ },
+ {
+ "epoch": 0.6236406619385343,
+ "grad_norm": 2.766237497329712,
+ "learning_rate": 4.885233652941039e-06,
+ "loss": 0.581,
+ "step": 1319
+ },
+ {
+ "epoch": 0.624113475177305,
+ "grad_norm": 2.4740939140319824,
+ "learning_rate": 4.88504673686271e-06,
+ "loss": 0.6335,
+ "step": 1320
+ },
+ {
+ "epoch": 0.6245862884160757,
+ "grad_norm": 3.324795961380005,
+ "learning_rate": 4.884859672277978e-06,
+ "loss": 0.6019,
+ "step": 1321
+ },
+ {
+ "epoch": 0.6250591016548463,
+ "grad_norm": 3.521327257156372,
+ "learning_rate": 4.884672459198493e-06,
+ "loss": 0.6104,
+ "step": 1322
+ },
+ {
+ "epoch": 0.625531914893617,
+ "grad_norm": 2.7728071212768555,
+ "learning_rate": 4.884485097635909e-06,
+ "loss": 0.6714,
+ "step": 1323
+ },
+ {
+ "epoch": 0.6260047281323877,
+ "grad_norm": 3.0738155841827393,
+ "learning_rate": 4.884297587601895e-06,
+ "loss": 0.604,
+ "step": 1324
+ },
+ {
+ "epoch": 0.6264775413711584,
+ "grad_norm": 2.719240427017212,
+ "learning_rate": 4.884109929108124e-06,
+ "loss": 0.6795,
+ "step": 1325
+ },
+ {
+ "epoch": 0.6269503546099291,
+ "grad_norm": 2.4108200073242188,
+ "learning_rate": 4.883922122166282e-06,
+ "loss": 0.5846,
+ "step": 1326
+ },
+ {
+ "epoch": 0.6274231678486998,
+ "grad_norm": 2.393899917602539,
+ "learning_rate": 4.883734166788063e-06,
+ "loss": 0.6188,
+ "step": 1327
+ },
+ {
+ "epoch": 0.6278959810874705,
+ "grad_norm": 4.555255889892578,
+ "learning_rate": 4.883546062985169e-06,
+ "loss": 0.5962,
+ "step": 1328
+ },
+ {
+ "epoch": 0.6283687943262412,
+ "grad_norm": 2.571075439453125,
+ "learning_rate": 4.883357810769315e-06,
+ "loss": 0.6165,
+ "step": 1329
+ },
+ {
+ "epoch": 0.6288416075650118,
+ "grad_norm": 2.553115129470825,
+ "learning_rate": 4.8831694101522185e-06,
+ "loss": 0.6787,
+ "step": 1330
+ },
+ {
+ "epoch": 0.6293144208037825,
+ "grad_norm": 3.2564642429351807,
+ "learning_rate": 4.882980861145614e-06,
+ "loss": 0.659,
+ "step": 1331
+ },
+ {
+ "epoch": 0.6297872340425532,
+ "grad_norm": 2.535216808319092,
+ "learning_rate": 4.882792163761241e-06,
+ "loss": 0.6176,
+ "step": 1332
+ },
+ {
+ "epoch": 0.6302600472813239,
+ "grad_norm": 3.097921848297119,
+ "learning_rate": 4.882603318010847e-06,
+ "loss": 0.6822,
+ "step": 1333
+ },
+ {
+ "epoch": 0.6307328605200946,
+ "grad_norm": 2.8135175704956055,
+ "learning_rate": 4.882414323906192e-06,
+ "loss": 0.6782,
+ "step": 1334
+ },
+ {
+ "epoch": 0.6312056737588653,
+ "grad_norm": 2.724634885787964,
+ "learning_rate": 4.882225181459044e-06,
+ "loss": 0.6545,
+ "step": 1335
+ },
+ {
+ "epoch": 0.631678486997636,
+ "grad_norm": 2.9585227966308594,
+ "learning_rate": 4.882035890681179e-06,
+ "loss": 0.6218,
+ "step": 1336
+ },
+ {
+ "epoch": 0.6321513002364066,
+ "grad_norm": 2.6952011585235596,
+ "learning_rate": 4.881846451584385e-06,
+ "loss": 0.6,
+ "step": 1337
+ },
+ {
+ "epoch": 0.6326241134751773,
+ "grad_norm": 3.1400704383850098,
+ "learning_rate": 4.881656864180455e-06,
+ "loss": 0.6687,
+ "step": 1338
+ },
+ {
+ "epoch": 0.633096926713948,
+ "grad_norm": 2.8382487297058105,
+ "learning_rate": 4.881467128481197e-06,
+ "loss": 0.574,
+ "step": 1339
+ },
+ {
+ "epoch": 0.6335697399527187,
+ "grad_norm": 2.8520095348358154,
+ "learning_rate": 4.881277244498422e-06,
+ "loss": 0.6582,
+ "step": 1340
+ },
+ {
+ "epoch": 0.6340425531914894,
+ "grad_norm": 2.703498363494873,
+ "learning_rate": 4.881087212243956e-06,
+ "loss": 0.7224,
+ "step": 1341
+ },
+ {
+ "epoch": 0.6345153664302601,
+ "grad_norm": 3.697205066680908,
+ "learning_rate": 4.880897031729629e-06,
+ "loss": 0.6582,
+ "step": 1342
+ },
+ {
+ "epoch": 0.6349881796690308,
+ "grad_norm": 2.7625808715820312,
+ "learning_rate": 4.880706702967284e-06,
+ "loss": 0.574,
+ "step": 1343
+ },
+ {
+ "epoch": 0.6354609929078014,
+ "grad_norm": 2.949984073638916,
+ "learning_rate": 4.880516225968771e-06,
+ "loss": 0.66,
+ "step": 1344
+ },
+ {
+ "epoch": 0.6359338061465721,
+ "grad_norm": 2.548269748687744,
+ "learning_rate": 4.8803256007459525e-06,
+ "loss": 0.642,
+ "step": 1345
+ },
+ {
+ "epoch": 0.6364066193853428,
+ "grad_norm": 2.5102174282073975,
+ "learning_rate": 4.8801348273106945e-06,
+ "loss": 0.6238,
+ "step": 1346
+ },
+ {
+ "epoch": 0.6368794326241135,
+ "grad_norm": 2.9847946166992188,
+ "learning_rate": 4.8799439056748786e-06,
+ "loss": 0.5416,
+ "step": 1347
+ },
+ {
+ "epoch": 0.6373522458628842,
+ "grad_norm": 2.8711049556732178,
+ "learning_rate": 4.879752835850391e-06,
+ "loss": 0.6427,
+ "step": 1348
+ },
+ {
+ "epoch": 0.6378250591016549,
+ "grad_norm": 2.7901716232299805,
+ "learning_rate": 4.879561617849129e-06,
+ "loss": 0.6026,
+ "step": 1349
+ },
+ {
+ "epoch": 0.6382978723404256,
+ "grad_norm": 2.659778356552124,
+ "learning_rate": 4.879370251682999e-06,
+ "loss": 0.6623,
+ "step": 1350
+ },
+ {
+ "epoch": 0.6387706855791963,
+ "grad_norm": 3.224386692047119,
+ "learning_rate": 4.879178737363917e-06,
+ "loss": 0.6485,
+ "step": 1351
+ },
+ {
+ "epoch": 0.6392434988179669,
+ "grad_norm": 2.6385605335235596,
+ "learning_rate": 4.8789870749038076e-06,
+ "loss": 0.5866,
+ "step": 1352
+ },
+ {
+ "epoch": 0.6397163120567376,
+ "grad_norm": 2.807713270187378,
+ "learning_rate": 4.8787952643146045e-06,
+ "loss": 0.6537,
+ "step": 1353
+ },
+ {
+ "epoch": 0.6401891252955083,
+ "grad_norm": 2.5689280033111572,
+ "learning_rate": 4.878603305608251e-06,
+ "loss": 0.6216,
+ "step": 1354
+ },
+ {
+ "epoch": 0.640661938534279,
+ "grad_norm": 2.7347843647003174,
+ "learning_rate": 4.8784111987967e-06,
+ "loss": 0.6318,
+ "step": 1355
+ },
+ {
+ "epoch": 0.6411347517730497,
+ "grad_norm": 2.5210378170013428,
+ "learning_rate": 4.878218943891911e-06,
+ "loss": 0.5472,
+ "step": 1356
+ },
+ {
+ "epoch": 0.6416075650118204,
+ "grad_norm": 2.866785764694214,
+ "learning_rate": 4.878026540905858e-06,
+ "loss": 0.7108,
+ "step": 1357
+ },
+ {
+ "epoch": 0.642080378250591,
+ "grad_norm": 2.923314332962036,
+ "learning_rate": 4.877833989850519e-06,
+ "loss": 0.5557,
+ "step": 1358
+ },
+ {
+ "epoch": 0.6425531914893617,
+ "grad_norm": 2.925463914871216,
+ "learning_rate": 4.8776412907378845e-06,
+ "loss": 0.6382,
+ "step": 1359
+ },
+ {
+ "epoch": 0.6430260047281324,
+ "grad_norm": 2.909644365310669,
+ "learning_rate": 4.877448443579952e-06,
+ "loss": 0.5603,
+ "step": 1360
+ },
+ {
+ "epoch": 0.6434988179669031,
+ "grad_norm": 3.501148223876953,
+ "learning_rate": 4.8772554483887306e-06,
+ "loss": 0.6722,
+ "step": 1361
+ },
+ {
+ "epoch": 0.6439716312056738,
+ "grad_norm": 2.823765516281128,
+ "learning_rate": 4.877062305176235e-06,
+ "loss": 0.6408,
+ "step": 1362
+ },
+ {
+ "epoch": 0.6444444444444445,
+ "grad_norm": 2.9807584285736084,
+ "learning_rate": 4.8768690139544935e-06,
+ "loss": 0.5984,
+ "step": 1363
+ },
+ {
+ "epoch": 0.6449172576832152,
+ "grad_norm": 2.8411378860473633,
+ "learning_rate": 4.8766755747355405e-06,
+ "loss": 0.6231,
+ "step": 1364
+ },
+ {
+ "epoch": 0.6453900709219859,
+ "grad_norm": 3.158952236175537,
+ "learning_rate": 4.8764819875314215e-06,
+ "loss": 0.6441,
+ "step": 1365
+ },
+ {
+ "epoch": 0.6458628841607565,
+ "grad_norm": 2.9614369869232178,
+ "learning_rate": 4.876288252354189e-06,
+ "loss": 0.6308,
+ "step": 1366
+ },
+ {
+ "epoch": 0.6463356973995272,
+ "grad_norm": 3.073805570602417,
+ "learning_rate": 4.876094369215907e-06,
+ "loss": 0.6046,
+ "step": 1367
+ },
+ {
+ "epoch": 0.6468085106382979,
+ "grad_norm": 2.719189405441284,
+ "learning_rate": 4.875900338128648e-06,
+ "loss": 0.6082,
+ "step": 1368
+ },
+ {
+ "epoch": 0.6472813238770686,
+ "grad_norm": 2.676726818084717,
+ "learning_rate": 4.8757061591044914e-06,
+ "loss": 0.6344,
+ "step": 1369
+ },
+ {
+ "epoch": 0.6477541371158393,
+ "grad_norm": 2.955256938934326,
+ "learning_rate": 4.87551183215553e-06,
+ "loss": 0.6506,
+ "step": 1370
+ },
+ {
+ "epoch": 0.64822695035461,
+ "grad_norm": 2.5672218799591064,
+ "learning_rate": 4.875317357293864e-06,
+ "loss": 0.5284,
+ "step": 1371
+ },
+ {
+ "epoch": 0.6486997635933807,
+ "grad_norm": 2.5860238075256348,
+ "learning_rate": 4.875122734531602e-06,
+ "loss": 0.667,
+ "step": 1372
+ },
+ {
+ "epoch": 0.6491725768321513,
+ "grad_norm": 3.1037003993988037,
+ "learning_rate": 4.8749279638808605e-06,
+ "loss": 0.6902,
+ "step": 1373
+ },
+ {
+ "epoch": 0.649645390070922,
+ "grad_norm": 2.7715282440185547,
+ "learning_rate": 4.874733045353769e-06,
+ "loss": 0.6291,
+ "step": 1374
+ },
+ {
+ "epoch": 0.6501182033096927,
+ "grad_norm": 2.527071475982666,
+ "learning_rate": 4.874537978962463e-06,
+ "loss": 0.5565,
+ "step": 1375
+ },
+ {
+ "epoch": 0.6505910165484634,
+ "grad_norm": 2.722092628479004,
+ "learning_rate": 4.874342764719091e-06,
+ "loss": 0.5724,
+ "step": 1376
+ },
+ {
+ "epoch": 0.6510638297872341,
+ "grad_norm": 2.6342411041259766,
+ "learning_rate": 4.874147402635805e-06,
+ "loss": 0.6308,
+ "step": 1377
+ },
+ {
+ "epoch": 0.6515366430260048,
+ "grad_norm": 2.3850719928741455,
+ "learning_rate": 4.8739518927247695e-06,
+ "loss": 0.5692,
+ "step": 1378
+ },
+ {
+ "epoch": 0.6520094562647755,
+ "grad_norm": 2.9787259101867676,
+ "learning_rate": 4.873756234998161e-06,
+ "loss": 0.6953,
+ "step": 1379
+ },
+ {
+ "epoch": 0.6524822695035462,
+ "grad_norm": 2.634141683578491,
+ "learning_rate": 4.873560429468159e-06,
+ "loss": 0.6077,
+ "step": 1380
+ },
+ {
+ "epoch": 0.6529550827423168,
+ "grad_norm": 2.803046941757202,
+ "learning_rate": 4.873364476146958e-06,
+ "loss": 0.6657,
+ "step": 1381
+ },
+ {
+ "epoch": 0.6534278959810875,
+ "grad_norm": 2.762827157974243,
+ "learning_rate": 4.8731683750467574e-06,
+ "loss": 0.6061,
+ "step": 1382
+ },
+ {
+ "epoch": 0.6539007092198581,
+ "grad_norm": 2.6654391288757324,
+ "learning_rate": 4.872972126179768e-06,
+ "loss": 0.6387,
+ "step": 1383
+ },
+ {
+ "epoch": 0.6543735224586288,
+ "grad_norm": 2.4363625049591064,
+ "learning_rate": 4.872775729558209e-06,
+ "loss": 0.5623,
+ "step": 1384
+ },
+ {
+ "epoch": 0.6548463356973995,
+ "grad_norm": 2.528959035873413,
+ "learning_rate": 4.87257918519431e-06,
+ "loss": 0.5609,
+ "step": 1385
+ },
+ {
+ "epoch": 0.6553191489361702,
+ "grad_norm": 2.718383312225342,
+ "learning_rate": 4.872382493100309e-06,
+ "loss": 0.5575,
+ "step": 1386
+ },
+ {
+ "epoch": 0.6557919621749408,
+ "grad_norm": 2.660841226577759,
+ "learning_rate": 4.872185653288453e-06,
+ "loss": 0.6106,
+ "step": 1387
+ },
+ {
+ "epoch": 0.6562647754137115,
+ "grad_norm": 2.508753538131714,
+ "learning_rate": 4.871988665770997e-06,
+ "loss": 0.5705,
+ "step": 1388
+ },
+ {
+ "epoch": 0.6567375886524822,
+ "grad_norm": 2.5134334564208984,
+ "learning_rate": 4.871791530560208e-06,
+ "loss": 0.5592,
+ "step": 1389
+ },
+ {
+ "epoch": 0.6572104018912529,
+ "grad_norm": 2.7475597858428955,
+ "learning_rate": 4.871594247668361e-06,
+ "loss": 0.6277,
+ "step": 1390
+ },
+ {
+ "epoch": 0.6576832151300236,
+ "grad_norm": 2.793616533279419,
+ "learning_rate": 4.871396817107739e-06,
+ "loss": 0.595,
+ "step": 1391
+ },
+ {
+ "epoch": 0.6581560283687943,
+ "grad_norm": 2.8285086154937744,
+ "learning_rate": 4.871199238890635e-06,
+ "loss": 0.6094,
+ "step": 1392
+ },
+ {
+ "epoch": 0.658628841607565,
+ "grad_norm": 2.74124813079834,
+ "learning_rate": 4.871001513029352e-06,
+ "loss": 0.6296,
+ "step": 1393
+ },
+ {
+ "epoch": 0.6591016548463356,
+ "grad_norm": 2.761237621307373,
+ "learning_rate": 4.870803639536202e-06,
+ "loss": 0.5702,
+ "step": 1394
+ },
+ {
+ "epoch": 0.6595744680851063,
+ "grad_norm": 2.761038064956665,
+ "learning_rate": 4.870605618423504e-06,
+ "loss": 0.6195,
+ "step": 1395
+ },
+ {
+ "epoch": 0.660047281323877,
+ "grad_norm": 2.8812482357025146,
+ "learning_rate": 4.870407449703589e-06,
+ "loss": 0.616,
+ "step": 1396
+ },
+ {
+ "epoch": 0.6605200945626477,
+ "grad_norm": 2.9966578483581543,
+ "learning_rate": 4.870209133388797e-06,
+ "loss": 0.6547,
+ "step": 1397
+ },
+ {
+ "epoch": 0.6609929078014184,
+ "grad_norm": 2.7969017028808594,
+ "learning_rate": 4.870010669491474e-06,
+ "loss": 0.5762,
+ "step": 1398
+ },
+ {
+ "epoch": 0.6614657210401891,
+ "grad_norm": 2.557783842086792,
+ "learning_rate": 4.86981205802398e-06,
+ "loss": 0.6184,
+ "step": 1399
+ },
+ {
+ "epoch": 0.6619385342789598,
+ "grad_norm": 2.5393927097320557,
+ "learning_rate": 4.86961329899868e-06,
+ "loss": 0.5953,
+ "step": 1400
+ },
+ {
+ "epoch": 0.6624113475177305,
+ "grad_norm": 2.7745981216430664,
+ "learning_rate": 4.86941439242795e-06,
+ "loss": 0.5967,
+ "step": 1401
+ },
+ {
+ "epoch": 0.6628841607565011,
+ "grad_norm": 2.650381326675415,
+ "learning_rate": 4.869215338324176e-06,
+ "loss": 0.5667,
+ "step": 1402
+ },
+ {
+ "epoch": 0.6633569739952718,
+ "grad_norm": 2.583169937133789,
+ "learning_rate": 4.869016136699751e-06,
+ "loss": 0.549,
+ "step": 1403
+ },
+ {
+ "epoch": 0.6638297872340425,
+ "grad_norm": 2.984978437423706,
+ "learning_rate": 4.868816787567079e-06,
+ "loss": 0.5931,
+ "step": 1404
+ },
+ {
+ "epoch": 0.6643026004728132,
+ "grad_norm": 3.1947181224823,
+ "learning_rate": 4.868617290938573e-06,
+ "loss": 0.5473,
+ "step": 1405
+ },
+ {
+ "epoch": 0.6647754137115839,
+ "grad_norm": 2.562927007675171,
+ "learning_rate": 4.868417646826654e-06,
+ "loss": 0.6878,
+ "step": 1406
+ },
+ {
+ "epoch": 0.6652482269503546,
+ "grad_norm": 2.8741261959075928,
+ "learning_rate": 4.868217855243754e-06,
+ "loss": 0.6312,
+ "step": 1407
+ },
+ {
+ "epoch": 0.6657210401891253,
+ "grad_norm": 2.9834797382354736,
+ "learning_rate": 4.868017916202312e-06,
+ "loss": 0.5624,
+ "step": 1408
+ },
+ {
+ "epoch": 0.6661938534278959,
+ "grad_norm": 2.6935982704162598,
+ "learning_rate": 4.8678178297147785e-06,
+ "loss": 0.5857,
+ "step": 1409
+ },
+ {
+ "epoch": 0.6666666666666666,
+ "grad_norm": 2.8200576305389404,
+ "learning_rate": 4.86761759579361e-06,
+ "loss": 0.6153,
+ "step": 1410
+ },
+ {
+ "epoch": 0.6671394799054373,
+ "grad_norm": 2.831425189971924,
+ "learning_rate": 4.867417214451276e-06,
+ "loss": 0.6495,
+ "step": 1411
+ },
+ {
+ "epoch": 0.667612293144208,
+ "grad_norm": 2.733565092086792,
+ "learning_rate": 4.867216685700253e-06,
+ "loss": 0.6036,
+ "step": 1412
+ },
+ {
+ "epoch": 0.6680851063829787,
+ "grad_norm": 3.0609400272369385,
+ "learning_rate": 4.867016009553027e-06,
+ "loss": 0.6773,
+ "step": 1413
+ },
+ {
+ "epoch": 0.6685579196217494,
+ "grad_norm": 2.665452241897583,
+ "learning_rate": 4.866815186022093e-06,
+ "loss": 0.6256,
+ "step": 1414
+ },
+ {
+ "epoch": 0.6690307328605201,
+ "grad_norm": 2.9480721950531006,
+ "learning_rate": 4.866614215119956e-06,
+ "loss": 0.535,
+ "step": 1415
+ },
+ {
+ "epoch": 0.6695035460992907,
+ "grad_norm": 2.5514180660247803,
+ "learning_rate": 4.866413096859128e-06,
+ "loss": 0.6588,
+ "step": 1416
+ },
+ {
+ "epoch": 0.6699763593380614,
+ "grad_norm": 3.3442373275756836,
+ "learning_rate": 4.866211831252134e-06,
+ "loss": 0.5754,
+ "step": 1417
+ },
+ {
+ "epoch": 0.6704491725768321,
+ "grad_norm": 2.521467685699463,
+ "learning_rate": 4.866010418311504e-06,
+ "loss": 0.5546,
+ "step": 1418
+ },
+ {
+ "epoch": 0.6709219858156028,
+ "grad_norm": 2.930706262588501,
+ "learning_rate": 4.865808858049781e-06,
+ "loss": 0.589,
+ "step": 1419
+ },
+ {
+ "epoch": 0.6713947990543735,
+ "grad_norm": 2.6298375129699707,
+ "learning_rate": 4.865607150479513e-06,
+ "loss": 0.5915,
+ "step": 1420
+ },
+ {
+ "epoch": 0.6718676122931442,
+ "grad_norm": 2.9554293155670166,
+ "learning_rate": 4.8654052956132615e-06,
+ "loss": 0.6654,
+ "step": 1421
+ },
+ {
+ "epoch": 0.6723404255319149,
+ "grad_norm": 3.2706902027130127,
+ "learning_rate": 4.865203293463593e-06,
+ "loss": 0.7115,
+ "step": 1422
+ },
+ {
+ "epoch": 0.6728132387706856,
+ "grad_norm": 3.041539430618286,
+ "learning_rate": 4.865001144043088e-06,
+ "loss": 0.5818,
+ "step": 1423
+ },
+ {
+ "epoch": 0.6732860520094562,
+ "grad_norm": 3.1314544677734375,
+ "learning_rate": 4.864798847364331e-06,
+ "loss": 0.5822,
+ "step": 1424
+ },
+ {
+ "epoch": 0.6737588652482269,
+ "grad_norm": 2.5301461219787598,
+ "learning_rate": 4.86459640343992e-06,
+ "loss": 0.5525,
+ "step": 1425
+ },
+ {
+ "epoch": 0.6742316784869976,
+ "grad_norm": 2.809295892715454,
+ "learning_rate": 4.864393812282458e-06,
+ "loss": 0.6768,
+ "step": 1426
+ },
+ {
+ "epoch": 0.6747044917257683,
+ "grad_norm": 2.794664144515991,
+ "learning_rate": 4.864191073904562e-06,
+ "loss": 0.5793,
+ "step": 1427
+ },
+ {
+ "epoch": 0.675177304964539,
+ "grad_norm": 2.7771105766296387,
+ "learning_rate": 4.863988188318854e-06,
+ "loss": 0.6453,
+ "step": 1428
+ },
+ {
+ "epoch": 0.6756501182033097,
+ "grad_norm": 2.6431946754455566,
+ "learning_rate": 4.863785155537967e-06,
+ "loss": 0.5877,
+ "step": 1429
+ },
+ {
+ "epoch": 0.6761229314420804,
+ "grad_norm": 2.951353073120117,
+ "learning_rate": 4.863581975574544e-06,
+ "loss": 0.6793,
+ "step": 1430
+ },
+ {
+ "epoch": 0.676595744680851,
+ "grad_norm": 3.1336071491241455,
+ "learning_rate": 4.863378648441235e-06,
+ "loss": 0.6695,
+ "step": 1431
+ },
+ {
+ "epoch": 0.6770685579196217,
+ "grad_norm": 2.735982656478882,
+ "learning_rate": 4.8631751741507e-06,
+ "loss": 0.5239,
+ "step": 1432
+ },
+ {
+ "epoch": 0.6775413711583924,
+ "grad_norm": 2.7085206508636475,
+ "learning_rate": 4.862971552715611e-06,
+ "loss": 0.6837,
+ "step": 1433
+ },
+ {
+ "epoch": 0.6780141843971631,
+ "grad_norm": 3.136528730392456,
+ "learning_rate": 4.8627677841486436e-06,
+ "loss": 0.683,
+ "step": 1434
+ },
+ {
+ "epoch": 0.6784869976359338,
+ "grad_norm": 2.7879369258880615,
+ "learning_rate": 4.862563868462486e-06,
+ "loss": 0.608,
+ "step": 1435
+ },
+ {
+ "epoch": 0.6789598108747045,
+ "grad_norm": 2.7937729358673096,
+ "learning_rate": 4.862359805669837e-06,
+ "loss": 0.6131,
+ "step": 1436
+ },
+ {
+ "epoch": 0.6794326241134752,
+ "grad_norm": 2.5988364219665527,
+ "learning_rate": 4.862155595783401e-06,
+ "loss": 0.6303,
+ "step": 1437
+ },
+ {
+ "epoch": 0.6799054373522458,
+ "grad_norm": 3.251070499420166,
+ "learning_rate": 4.861951238815894e-06,
+ "loss": 0.7246,
+ "step": 1438
+ },
+ {
+ "epoch": 0.6803782505910165,
+ "grad_norm": 2.646759271621704,
+ "learning_rate": 4.861746734780039e-06,
+ "loss": 0.6313,
+ "step": 1439
+ },
+ {
+ "epoch": 0.6808510638297872,
+ "grad_norm": 2.773866891860962,
+ "learning_rate": 4.861542083688573e-06,
+ "loss": 0.6463,
+ "step": 1440
+ },
+ {
+ "epoch": 0.6813238770685579,
+ "grad_norm": 2.759965658187866,
+ "learning_rate": 4.861337285554235e-06,
+ "loss": 0.5428,
+ "step": 1441
+ },
+ {
+ "epoch": 0.6817966903073286,
+ "grad_norm": 3.3250818252563477,
+ "learning_rate": 4.861132340389779e-06,
+ "loss": 0.6522,
+ "step": 1442
+ },
+ {
+ "epoch": 0.6822695035460993,
+ "grad_norm": 2.661797523498535,
+ "learning_rate": 4.860927248207965e-06,
+ "loss": 0.5871,
+ "step": 1443
+ },
+ {
+ "epoch": 0.68274231678487,
+ "grad_norm": 2.706289052963257,
+ "learning_rate": 4.860722009021563e-06,
+ "loss": 0.6651,
+ "step": 1444
+ },
+ {
+ "epoch": 0.6832151300236406,
+ "grad_norm": 2.8459298610687256,
+ "learning_rate": 4.860516622843354e-06,
+ "loss": 0.5827,
+ "step": 1445
+ },
+ {
+ "epoch": 0.6836879432624113,
+ "grad_norm": 3.1041831970214844,
+ "learning_rate": 4.860311089686125e-06,
+ "loss": 0.6727,
+ "step": 1446
+ },
+ {
+ "epoch": 0.684160756501182,
+ "grad_norm": 2.9382801055908203,
+ "learning_rate": 4.8601054095626746e-06,
+ "loss": 0.6002,
+ "step": 1447
+ },
+ {
+ "epoch": 0.6846335697399527,
+ "grad_norm": 2.782475471496582,
+ "learning_rate": 4.859899582485808e-06,
+ "loss": 0.6951,
+ "step": 1448
+ },
+ {
+ "epoch": 0.6851063829787234,
+ "grad_norm": 3.313894510269165,
+ "learning_rate": 4.859693608468343e-06,
+ "loss": 0.6363,
+ "step": 1449
+ },
+ {
+ "epoch": 0.6855791962174941,
+ "grad_norm": 3.1639695167541504,
+ "learning_rate": 4.8594874875231045e-06,
+ "loss": 0.7002,
+ "step": 1450
+ },
+ {
+ "epoch": 0.6860520094562648,
+ "grad_norm": 2.6762218475341797,
+ "learning_rate": 4.859281219662926e-06,
+ "loss": 0.6246,
+ "step": 1451
+ },
+ {
+ "epoch": 0.6865248226950355,
+ "grad_norm": 2.8368663787841797,
+ "learning_rate": 4.85907480490065e-06,
+ "loss": 0.5906,
+ "step": 1452
+ },
+ {
+ "epoch": 0.6869976359338061,
+ "grad_norm": 2.887373208999634,
+ "learning_rate": 4.858868243249131e-06,
+ "loss": 0.5931,
+ "step": 1453
+ },
+ {
+ "epoch": 0.6874704491725768,
+ "grad_norm": 2.8115322589874268,
+ "learning_rate": 4.858661534721229e-06,
+ "loss": 0.6337,
+ "step": 1454
+ },
+ {
+ "epoch": 0.6879432624113475,
+ "grad_norm": 2.8470499515533447,
+ "learning_rate": 4.8584546793298174e-06,
+ "loss": 0.632,
+ "step": 1455
+ },
+ {
+ "epoch": 0.6884160756501182,
+ "grad_norm": 2.8229613304138184,
+ "learning_rate": 4.8582476770877725e-06,
+ "loss": 0.6494,
+ "step": 1456
+ },
+ {
+ "epoch": 0.6888888888888889,
+ "grad_norm": 2.4235479831695557,
+ "learning_rate": 4.858040528007987e-06,
+ "loss": 0.5709,
+ "step": 1457
+ },
+ {
+ "epoch": 0.6893617021276596,
+ "grad_norm": 2.9348199367523193,
+ "learning_rate": 4.857833232103356e-06,
+ "loss": 0.5404,
+ "step": 1458
+ },
+ {
+ "epoch": 0.6898345153664303,
+ "grad_norm": 2.8274219036102295,
+ "learning_rate": 4.857625789386789e-06,
+ "loss": 0.701,
+ "step": 1459
+ },
+ {
+ "epoch": 0.6903073286052009,
+ "grad_norm": 3.136929988861084,
+ "learning_rate": 4.857418199871203e-06,
+ "loss": 0.6971,
+ "step": 1460
+ },
+ {
+ "epoch": 0.6907801418439716,
+ "grad_norm": 2.8987185955047607,
+ "learning_rate": 4.8572104635695214e-06,
+ "loss": 0.6613,
+ "step": 1461
+ },
+ {
+ "epoch": 0.6912529550827423,
+ "grad_norm": 2.5073442459106445,
+ "learning_rate": 4.857002580494681e-06,
+ "loss": 0.6032,
+ "step": 1462
+ },
+ {
+ "epoch": 0.691725768321513,
+ "grad_norm": 2.7019522190093994,
+ "learning_rate": 4.856794550659625e-06,
+ "loss": 0.567,
+ "step": 1463
+ },
+ {
+ "epoch": 0.6921985815602837,
+ "grad_norm": 2.4795594215393066,
+ "learning_rate": 4.8565863740773054e-06,
+ "loss": 0.5777,
+ "step": 1464
+ },
+ {
+ "epoch": 0.6926713947990544,
+ "grad_norm": 3.032506227493286,
+ "learning_rate": 4.856378050760687e-06,
+ "loss": 0.607,
+ "step": 1465
+ },
+ {
+ "epoch": 0.6931442080378251,
+ "grad_norm": 3.052091121673584,
+ "learning_rate": 4.85616958072274e-06,
+ "loss": 0.591,
+ "step": 1466
+ },
+ {
+ "epoch": 0.6936170212765957,
+ "grad_norm": 2.704831838607788,
+ "learning_rate": 4.855960963976443e-06,
+ "loss": 0.6528,
+ "step": 1467
+ },
+ {
+ "epoch": 0.6940898345153664,
+ "grad_norm": 2.680995225906372,
+ "learning_rate": 4.855752200534788e-06,
+ "loss": 0.6294,
+ "step": 1468
+ },
+ {
+ "epoch": 0.6945626477541371,
+ "grad_norm": 2.3948659896850586,
+ "learning_rate": 4.855543290410774e-06,
+ "loss": 0.6091,
+ "step": 1469
+ },
+ {
+ "epoch": 0.6950354609929078,
+ "grad_norm": 2.6407411098480225,
+ "learning_rate": 4.855334233617407e-06,
+ "loss": 0.5572,
+ "step": 1470
+ },
+ {
+ "epoch": 0.6955082742316785,
+ "grad_norm": 2.5526835918426514,
+ "learning_rate": 4.8551250301677064e-06,
+ "loss": 0.5432,
+ "step": 1471
+ },
+ {
+ "epoch": 0.6959810874704492,
+ "grad_norm": 3.1237430572509766,
+ "learning_rate": 4.8549156800746965e-06,
+ "loss": 0.5944,
+ "step": 1472
+ },
+ {
+ "epoch": 0.6964539007092199,
+ "grad_norm": 2.8112540245056152,
+ "learning_rate": 4.854706183351412e-06,
+ "loss": 0.604,
+ "step": 1473
+ },
+ {
+ "epoch": 0.6969267139479906,
+ "grad_norm": 2.664644479751587,
+ "learning_rate": 4.8544965400109e-06,
+ "loss": 0.5647,
+ "step": 1474
+ },
+ {
+ "epoch": 0.6973995271867612,
+ "grad_norm": 3.26310133934021,
+ "learning_rate": 4.854286750066212e-06,
+ "loss": 0.6999,
+ "step": 1475
+ },
+ {
+ "epoch": 0.6978723404255319,
+ "grad_norm": 2.9717442989349365,
+ "learning_rate": 4.8540768135304115e-06,
+ "loss": 0.6655,
+ "step": 1476
+ },
+ {
+ "epoch": 0.6983451536643026,
+ "grad_norm": 2.5302982330322266,
+ "learning_rate": 4.85386673041657e-06,
+ "loss": 0.6384,
+ "step": 1477
+ },
+ {
+ "epoch": 0.6988179669030733,
+ "grad_norm": 2.864877700805664,
+ "learning_rate": 4.853656500737769e-06,
+ "loss": 0.6834,
+ "step": 1478
+ },
+ {
+ "epoch": 0.699290780141844,
+ "grad_norm": 2.5522031784057617,
+ "learning_rate": 4.853446124507098e-06,
+ "loss": 0.5929,
+ "step": 1479
+ },
+ {
+ "epoch": 0.6997635933806147,
+ "grad_norm": 3.096477746963501,
+ "learning_rate": 4.853235601737656e-06,
+ "loss": 0.5737,
+ "step": 1480
+ },
+ {
+ "epoch": 0.7002364066193854,
+ "grad_norm": 2.884779214859009,
+ "learning_rate": 4.853024932442552e-06,
+ "loss": 0.6362,
+ "step": 1481
+ },
+ {
+ "epoch": 0.700709219858156,
+ "grad_norm": 3.368558406829834,
+ "learning_rate": 4.852814116634903e-06,
+ "loss": 0.6721,
+ "step": 1482
+ },
+ {
+ "epoch": 0.7011820330969267,
+ "grad_norm": 2.742414951324463,
+ "learning_rate": 4.852603154327837e-06,
+ "loss": 0.6212,
+ "step": 1483
+ },
+ {
+ "epoch": 0.7016548463356974,
+ "grad_norm": 2.53454852104187,
+ "learning_rate": 4.8523920455344864e-06,
+ "loss": 0.6675,
+ "step": 1484
+ },
+ {
+ "epoch": 0.7021276595744681,
+ "grad_norm": 2.9354238510131836,
+ "learning_rate": 4.852180790267999e-06,
+ "loss": 0.6692,
+ "step": 1485
+ },
+ {
+ "epoch": 0.7026004728132388,
+ "grad_norm": 2.585070848464966,
+ "learning_rate": 4.8519693885415274e-06,
+ "loss": 0.6215,
+ "step": 1486
+ },
+ {
+ "epoch": 0.7030732860520095,
+ "grad_norm": 2.9047999382019043,
+ "learning_rate": 4.851757840368235e-06,
+ "loss": 0.6231,
+ "step": 1487
+ },
+ {
+ "epoch": 0.7035460992907802,
+ "grad_norm": 3.0930933952331543,
+ "learning_rate": 4.851546145761295e-06,
+ "loss": 0.7267,
+ "step": 1488
+ },
+ {
+ "epoch": 0.7040189125295508,
+ "grad_norm": 3.0224719047546387,
+ "learning_rate": 4.8513343047338875e-06,
+ "loss": 0.6293,
+ "step": 1489
+ },
+ {
+ "epoch": 0.7044917257683215,
+ "grad_norm": 2.5758471488952637,
+ "learning_rate": 4.851122317299203e-06,
+ "loss": 0.5855,
+ "step": 1490
+ },
+ {
+ "epoch": 0.7049645390070922,
+ "grad_norm": 2.579272508621216,
+ "learning_rate": 4.850910183470441e-06,
+ "loss": 0.582,
+ "step": 1491
+ },
+ {
+ "epoch": 0.7054373522458629,
+ "grad_norm": 2.8148300647735596,
+ "learning_rate": 4.85069790326081e-06,
+ "loss": 0.6396,
+ "step": 1492
+ },
+ {
+ "epoch": 0.7059101654846336,
+ "grad_norm": 2.6380527019500732,
+ "learning_rate": 4.850485476683528e-06,
+ "loss": 0.6114,
+ "step": 1493
+ },
+ {
+ "epoch": 0.7063829787234043,
+ "grad_norm": 2.7736263275146484,
+ "learning_rate": 4.850272903751823e-06,
+ "loss": 0.6683,
+ "step": 1494
+ },
+ {
+ "epoch": 0.706855791962175,
+ "grad_norm": 3.1958179473876953,
+ "learning_rate": 4.8500601844789285e-06,
+ "loss": 0.6265,
+ "step": 1495
+ },
+ {
+ "epoch": 0.7073286052009456,
+ "grad_norm": 3.783212423324585,
+ "learning_rate": 4.8498473188780916e-06,
+ "loss": 0.6078,
+ "step": 1496
+ },
+ {
+ "epoch": 0.7078014184397163,
+ "grad_norm": 2.6656646728515625,
+ "learning_rate": 4.849634306962566e-06,
+ "loss": 0.5756,
+ "step": 1497
+ },
+ {
+ "epoch": 0.708274231678487,
+ "grad_norm": 2.757141590118408,
+ "learning_rate": 4.849421148745615e-06,
+ "loss": 0.5596,
+ "step": 1498
+ },
+ {
+ "epoch": 0.7087470449172577,
+ "grad_norm": 3.0391886234283447,
+ "learning_rate": 4.849207844240511e-06,
+ "loss": 0.5293,
+ "step": 1499
+ },
+ {
+ "epoch": 0.7092198581560284,
+ "grad_norm": 2.981912851333618,
+ "learning_rate": 4.848994393460535e-06,
+ "loss": 0.598,
+ "step": 1500
+ },
+ {
+ "epoch": 0.7096926713947991,
+ "grad_norm": 2.5470798015594482,
+ "learning_rate": 4.848780796418978e-06,
+ "loss": 0.6266,
+ "step": 1501
+ },
+ {
+ "epoch": 0.7101654846335698,
+ "grad_norm": 2.8394415378570557,
+ "learning_rate": 4.8485670531291415e-06,
+ "loss": 0.6844,
+ "step": 1502
+ },
+ {
+ "epoch": 0.7106382978723405,
+ "grad_norm": 3.2023508548736572,
+ "learning_rate": 4.848353163604331e-06,
+ "loss": 0.6134,
+ "step": 1503
+ },
+ {
+ "epoch": 0.7111111111111111,
+ "grad_norm": 2.98245906829834,
+ "learning_rate": 4.848139127857867e-06,
+ "loss": 0.7084,
+ "step": 1504
+ },
+ {
+ "epoch": 0.7115839243498818,
+ "grad_norm": 2.5917441844940186,
+ "learning_rate": 4.847924945903076e-06,
+ "loss": 0.5676,
+ "step": 1505
+ },
+ {
+ "epoch": 0.7120567375886525,
+ "grad_norm": 2.8736681938171387,
+ "learning_rate": 4.847710617753294e-06,
+ "loss": 0.6304,
+ "step": 1506
+ },
+ {
+ "epoch": 0.7125295508274232,
+ "grad_norm": 2.7832682132720947,
+ "learning_rate": 4.847496143421866e-06,
+ "loss": 0.5705,
+ "step": 1507
+ },
+ {
+ "epoch": 0.7130023640661939,
+ "grad_norm": 2.480560779571533,
+ "learning_rate": 4.847281522922147e-06,
+ "loss": 0.5595,
+ "step": 1508
+ },
+ {
+ "epoch": 0.7134751773049646,
+ "grad_norm": 2.357675313949585,
+ "learning_rate": 4.847066756267499e-06,
+ "loss": 0.5065,
+ "step": 1509
+ },
+ {
+ "epoch": 0.7139479905437353,
+ "grad_norm": 2.632669448852539,
+ "learning_rate": 4.846851843471296e-06,
+ "loss": 0.6949,
+ "step": 1510
+ },
+ {
+ "epoch": 0.7144208037825059,
+ "grad_norm": 2.7691073417663574,
+ "learning_rate": 4.84663678454692e-06,
+ "loss": 0.6638,
+ "step": 1511
+ },
+ {
+ "epoch": 0.7148936170212766,
+ "grad_norm": 2.5647685527801514,
+ "learning_rate": 4.846421579507761e-06,
+ "loss": 0.6098,
+ "step": 1512
+ },
+ {
+ "epoch": 0.7153664302600473,
+ "grad_norm": 2.476701021194458,
+ "learning_rate": 4.846206228367218e-06,
+ "loss": 0.592,
+ "step": 1513
+ },
+ {
+ "epoch": 0.715839243498818,
+ "grad_norm": 2.805727958679199,
+ "learning_rate": 4.845990731138702e-06,
+ "loss": 0.5466,
+ "step": 1514
+ },
+ {
+ "epoch": 0.7163120567375887,
+ "grad_norm": 2.551392078399658,
+ "learning_rate": 4.84577508783563e-06,
+ "loss": 0.6039,
+ "step": 1515
+ },
+ {
+ "epoch": 0.7167848699763594,
+ "grad_norm": 2.6861350536346436,
+ "learning_rate": 4.845559298471429e-06,
+ "loss": 0.6427,
+ "step": 1516
+ },
+ {
+ "epoch": 0.7172576832151301,
+ "grad_norm": 3.1908371448516846,
+ "learning_rate": 4.845343363059535e-06,
+ "loss": 0.5447,
+ "step": 1517
+ },
+ {
+ "epoch": 0.7177304964539007,
+ "grad_norm": 2.9021761417388916,
+ "learning_rate": 4.845127281613394e-06,
+ "loss": 0.5836,
+ "step": 1518
+ },
+ {
+ "epoch": 0.7182033096926714,
+ "grad_norm": 2.476670742034912,
+ "learning_rate": 4.844911054146461e-06,
+ "loss": 0.5863,
+ "step": 1519
+ },
+ {
+ "epoch": 0.7186761229314421,
+ "grad_norm": 2.662935495376587,
+ "learning_rate": 4.844694680672198e-06,
+ "loss": 0.5678,
+ "step": 1520
+ },
+ {
+ "epoch": 0.7191489361702128,
+ "grad_norm": 2.677896738052368,
+ "learning_rate": 4.844478161204079e-06,
+ "loss": 0.6195,
+ "step": 1521
+ },
+ {
+ "epoch": 0.7196217494089835,
+ "grad_norm": 2.781921863555908,
+ "learning_rate": 4.844261495755585e-06,
+ "loss": 0.643,
+ "step": 1522
+ },
+ {
+ "epoch": 0.7200945626477542,
+ "grad_norm": 3.0157392024993896,
+ "learning_rate": 4.844044684340206e-06,
+ "loss": 0.7559,
+ "step": 1523
+ },
+ {
+ "epoch": 0.7205673758865249,
+ "grad_norm": 2.8109354972839355,
+ "learning_rate": 4.843827726971444e-06,
+ "loss": 0.6264,
+ "step": 1524
+ },
+ {
+ "epoch": 0.7210401891252955,
+ "grad_norm": 3.0953569412231445,
+ "learning_rate": 4.8436106236628064e-06,
+ "loss": 0.6429,
+ "step": 1525
+ },
+ {
+ "epoch": 0.7215130023640662,
+ "grad_norm": 2.6850643157958984,
+ "learning_rate": 4.843393374427812e-06,
+ "loss": 0.6598,
+ "step": 1526
+ },
+ {
+ "epoch": 0.7219858156028369,
+ "grad_norm": 3.043480634689331,
+ "learning_rate": 4.8431759792799874e-06,
+ "loss": 0.6331,
+ "step": 1527
+ },
+ {
+ "epoch": 0.7224586288416076,
+ "grad_norm": 2.723870038986206,
+ "learning_rate": 4.842958438232868e-06,
+ "loss": 0.6259,
+ "step": 1528
+ },
+ {
+ "epoch": 0.7229314420803783,
+ "grad_norm": 2.822492837905884,
+ "learning_rate": 4.842740751300002e-06,
+ "loss": 0.6554,
+ "step": 1529
+ },
+ {
+ "epoch": 0.723404255319149,
+ "grad_norm": 2.7866315841674805,
+ "learning_rate": 4.842522918494941e-06,
+ "loss": 0.6991,
+ "step": 1530
+ },
+ {
+ "epoch": 0.7238770685579197,
+ "grad_norm": 2.8881826400756836,
+ "learning_rate": 4.84230493983125e-06,
+ "loss": 0.5876,
+ "step": 1531
+ },
+ {
+ "epoch": 0.7243498817966904,
+ "grad_norm": 2.7456939220428467,
+ "learning_rate": 4.8420868153225e-06,
+ "loss": 0.6188,
+ "step": 1532
+ },
+ {
+ "epoch": 0.724822695035461,
+ "grad_norm": 3.0257532596588135,
+ "learning_rate": 4.841868544982274e-06,
+ "loss": 0.63,
+ "step": 1533
+ },
+ {
+ "epoch": 0.7252955082742317,
+ "grad_norm": 3.1581954956054688,
+ "learning_rate": 4.841650128824164e-06,
+ "loss": 0.7214,
+ "step": 1534
+ },
+ {
+ "epoch": 0.7257683215130024,
+ "grad_norm": 2.9174306392669678,
+ "learning_rate": 4.841431566861767e-06,
+ "loss": 0.704,
+ "step": 1535
+ },
+ {
+ "epoch": 0.7262411347517731,
+ "grad_norm": 2.5019054412841797,
+ "learning_rate": 4.8412128591086935e-06,
+ "loss": 0.6298,
+ "step": 1536
+ },
+ {
+ "epoch": 0.7267139479905438,
+ "grad_norm": 2.724285125732422,
+ "learning_rate": 4.840994005578562e-06,
+ "loss": 0.6289,
+ "step": 1537
+ },
+ {
+ "epoch": 0.7271867612293145,
+ "grad_norm": 2.5882341861724854,
+ "learning_rate": 4.840775006284998e-06,
+ "loss": 0.6355,
+ "step": 1538
+ },
+ {
+ "epoch": 0.7276595744680852,
+ "grad_norm": 3.1281991004943848,
+ "learning_rate": 4.840555861241638e-06,
+ "loss": 0.5551,
+ "step": 1539
+ },
+ {
+ "epoch": 0.7281323877068558,
+ "grad_norm": 2.6064817905426025,
+ "learning_rate": 4.840336570462127e-06,
+ "loss": 0.5543,
+ "step": 1540
+ },
+ {
+ "epoch": 0.7286052009456265,
+ "grad_norm": 2.67112398147583,
+ "learning_rate": 4.840117133960122e-06,
+ "loss": 0.6044,
+ "step": 1541
+ },
+ {
+ "epoch": 0.7290780141843972,
+ "grad_norm": 2.838022232055664,
+ "learning_rate": 4.839897551749282e-06,
+ "loss": 0.6814,
+ "step": 1542
+ },
+ {
+ "epoch": 0.7295508274231679,
+ "grad_norm": 2.8897151947021484,
+ "learning_rate": 4.839677823843283e-06,
+ "loss": 0.593,
+ "step": 1543
+ },
+ {
+ "epoch": 0.7300236406619386,
+ "grad_norm": 2.9238014221191406,
+ "learning_rate": 4.839457950255805e-06,
+ "loss": 0.5544,
+ "step": 1544
+ },
+ {
+ "epoch": 0.7304964539007093,
+ "grad_norm": 3.016876459121704,
+ "learning_rate": 4.839237931000538e-06,
+ "loss": 0.6099,
+ "step": 1545
+ },
+ {
+ "epoch": 0.7309692671394799,
+ "grad_norm": 2.9415392875671387,
+ "learning_rate": 4.839017766091182e-06,
+ "loss": 0.6413,
+ "step": 1546
+ },
+ {
+ "epoch": 0.7314420803782505,
+ "grad_norm": 2.658067226409912,
+ "learning_rate": 4.838797455541446e-06,
+ "loss": 0.6534,
+ "step": 1547
+ },
+ {
+ "epoch": 0.7319148936170212,
+ "grad_norm": 2.460358142852783,
+ "learning_rate": 4.838576999365049e-06,
+ "loss": 0.5307,
+ "step": 1548
+ },
+ {
+ "epoch": 0.7323877068557919,
+ "grad_norm": 2.5818674564361572,
+ "learning_rate": 4.838356397575716e-06,
+ "loss": 0.6265,
+ "step": 1549
+ },
+ {
+ "epoch": 0.7328605200945626,
+ "grad_norm": 3.009197473526001,
+ "learning_rate": 4.838135650187183e-06,
+ "loss": 0.6957,
+ "step": 1550
+ },
+ {
+ "epoch": 0.7333333333333333,
+ "grad_norm": 2.738543748855591,
+ "learning_rate": 4.837914757213196e-06,
+ "loss": 0.646,
+ "step": 1551
+ },
+ {
+ "epoch": 0.733806146572104,
+ "grad_norm": 2.8208494186401367,
+ "learning_rate": 4.837693718667508e-06,
+ "loss": 0.5936,
+ "step": 1552
+ },
+ {
+ "epoch": 0.7342789598108747,
+ "grad_norm": 3.1574649810791016,
+ "learning_rate": 4.837472534563883e-06,
+ "loss": 0.6455,
+ "step": 1553
+ },
+ {
+ "epoch": 0.7347517730496453,
+ "grad_norm": 2.6737420558929443,
+ "learning_rate": 4.837251204916093e-06,
+ "loss": 0.5921,
+ "step": 1554
+ },
+ {
+ "epoch": 0.735224586288416,
+ "grad_norm": 2.424983024597168,
+ "learning_rate": 4.837029729737918e-06,
+ "loss": 0.6346,
+ "step": 1555
+ },
+ {
+ "epoch": 0.7356973995271867,
+ "grad_norm": 2.5163493156433105,
+ "learning_rate": 4.836808109043151e-06,
+ "loss": 0.6061,
+ "step": 1556
+ },
+ {
+ "epoch": 0.7361702127659574,
+ "grad_norm": 2.8377044200897217,
+ "learning_rate": 4.836586342845588e-06,
+ "loss": 0.611,
+ "step": 1557
+ },
+ {
+ "epoch": 0.7366430260047281,
+ "grad_norm": 2.5929181575775146,
+ "learning_rate": 4.83636443115904e-06,
+ "loss": 0.5496,
+ "step": 1558
+ },
+ {
+ "epoch": 0.7371158392434988,
+ "grad_norm": 2.5017223358154297,
+ "learning_rate": 4.836142373997323e-06,
+ "loss": 0.6235,
+ "step": 1559
+ },
+ {
+ "epoch": 0.7375886524822695,
+ "grad_norm": 2.822500228881836,
+ "learning_rate": 4.835920171374265e-06,
+ "loss": 0.6147,
+ "step": 1560
+ },
+ {
+ "epoch": 0.7380614657210401,
+ "grad_norm": 2.7234230041503906,
+ "learning_rate": 4.8356978233037e-06,
+ "loss": 0.6228,
+ "step": 1561
+ },
+ {
+ "epoch": 0.7385342789598108,
+ "grad_norm": 2.9565515518188477,
+ "learning_rate": 4.835475329799472e-06,
+ "loss": 0.5728,
+ "step": 1562
+ },
+ {
+ "epoch": 0.7390070921985815,
+ "grad_norm": 2.4356038570404053,
+ "learning_rate": 4.835252690875438e-06,
+ "loss": 0.6723,
+ "step": 1563
+ },
+ {
+ "epoch": 0.7394799054373522,
+ "grad_norm": 2.765913248062134,
+ "learning_rate": 4.835029906545458e-06,
+ "loss": 0.5805,
+ "step": 1564
+ },
+ {
+ "epoch": 0.7399527186761229,
+ "grad_norm": 2.4481914043426514,
+ "learning_rate": 4.834806976823405e-06,
+ "loss": 0.599,
+ "step": 1565
+ },
+ {
+ "epoch": 0.7404255319148936,
+ "grad_norm": 2.620779514312744,
+ "learning_rate": 4.834583901723158e-06,
+ "loss": 0.63,
+ "step": 1566
+ },
+ {
+ "epoch": 0.7408983451536643,
+ "grad_norm": 2.654426097869873,
+ "learning_rate": 4.83436068125861e-06,
+ "loss": 0.6544,
+ "step": 1567
+ },
+ {
+ "epoch": 0.741371158392435,
+ "grad_norm": 2.589623212814331,
+ "learning_rate": 4.834137315443656e-06,
+ "loss": 0.5596,
+ "step": 1568
+ },
+ {
+ "epoch": 0.7418439716312056,
+ "grad_norm": 2.572883129119873,
+ "learning_rate": 4.833913804292209e-06,
+ "loss": 0.5974,
+ "step": 1569
+ },
+ {
+ "epoch": 0.7423167848699763,
+ "grad_norm": 2.8744914531707764,
+ "learning_rate": 4.833690147818181e-06,
+ "loss": 0.5364,
+ "step": 1570
+ },
+ {
+ "epoch": 0.742789598108747,
+ "grad_norm": 2.9800851345062256,
+ "learning_rate": 4.833466346035502e-06,
+ "loss": 0.6287,
+ "step": 1571
+ },
+ {
+ "epoch": 0.7432624113475177,
+ "grad_norm": 2.627784490585327,
+ "learning_rate": 4.833242398958105e-06,
+ "loss": 0.621,
+ "step": 1572
+ },
+ {
+ "epoch": 0.7437352245862884,
+ "grad_norm": 2.5187721252441406,
+ "learning_rate": 4.833018306599933e-06,
+ "loss": 0.5901,
+ "step": 1573
+ },
+ {
+ "epoch": 0.7442080378250591,
+ "grad_norm": 2.4843688011169434,
+ "learning_rate": 4.832794068974944e-06,
+ "loss": 0.6336,
+ "step": 1574
+ },
+ {
+ "epoch": 0.7446808510638298,
+ "grad_norm": 2.774911880493164,
+ "learning_rate": 4.832569686097096e-06,
+ "loss": 0.6091,
+ "step": 1575
+ },
+ {
+ "epoch": 0.7451536643026004,
+ "grad_norm": 3.2562527656555176,
+ "learning_rate": 4.8323451579803615e-06,
+ "loss": 0.7686,
+ "step": 1576
+ },
+ {
+ "epoch": 0.7456264775413711,
+ "grad_norm": 2.799570083618164,
+ "learning_rate": 4.832120484638721e-06,
+ "loss": 0.6233,
+ "step": 1577
+ },
+ {
+ "epoch": 0.7460992907801418,
+ "grad_norm": 2.661893367767334,
+ "learning_rate": 4.831895666086164e-06,
+ "loss": 0.5841,
+ "step": 1578
+ },
+ {
+ "epoch": 0.7465721040189125,
+ "grad_norm": 3.0382652282714844,
+ "learning_rate": 4.831670702336689e-06,
+ "loss": 0.5769,
+ "step": 1579
+ },
+ {
+ "epoch": 0.7470449172576832,
+ "grad_norm": 2.676398515701294,
+ "learning_rate": 4.831445593404304e-06,
+ "loss": 0.619,
+ "step": 1580
+ },
+ {
+ "epoch": 0.7475177304964539,
+ "grad_norm": 2.717916965484619,
+ "learning_rate": 4.831220339303024e-06,
+ "loss": 0.5787,
+ "step": 1581
+ },
+ {
+ "epoch": 0.7479905437352246,
+ "grad_norm": 2.3918066024780273,
+ "learning_rate": 4.830994940046876e-06,
+ "loss": 0.5108,
+ "step": 1582
+ },
+ {
+ "epoch": 0.7484633569739952,
+ "grad_norm": 2.709144115447998,
+ "learning_rate": 4.830769395649895e-06,
+ "loss": 0.6875,
+ "step": 1583
+ },
+ {
+ "epoch": 0.7489361702127659,
+ "grad_norm": 2.8711116313934326,
+ "learning_rate": 4.830543706126123e-06,
+ "loss": 0.6745,
+ "step": 1584
+ },
+ {
+ "epoch": 0.7494089834515366,
+ "grad_norm": 2.612339496612549,
+ "learning_rate": 4.830317871489614e-06,
+ "loss": 0.5738,
+ "step": 1585
+ },
+ {
+ "epoch": 0.7498817966903073,
+ "grad_norm": 2.4355857372283936,
+ "learning_rate": 4.830091891754429e-06,
+ "loss": 0.5907,
+ "step": 1586
+ },
+ {
+ "epoch": 0.750354609929078,
+ "grad_norm": 2.676051378250122,
+ "learning_rate": 4.829865766934638e-06,
+ "loss": 0.6628,
+ "step": 1587
+ },
+ {
+ "epoch": 0.7508274231678487,
+ "grad_norm": 2.66489839553833,
+ "learning_rate": 4.829639497044323e-06,
+ "loss": 0.5984,
+ "step": 1588
+ },
+ {
+ "epoch": 0.7513002364066194,
+ "grad_norm": 2.5358035564422607,
+ "learning_rate": 4.829413082097572e-06,
+ "loss": 0.5867,
+ "step": 1589
+ },
+ {
+ "epoch": 0.75177304964539,
+ "grad_norm": 2.6530144214630127,
+ "learning_rate": 4.8291865221084815e-06,
+ "loss": 0.5917,
+ "step": 1590
+ },
+ {
+ "epoch": 0.7522458628841607,
+ "grad_norm": 2.5160958766937256,
+ "learning_rate": 4.82895981709116e-06,
+ "loss": 0.6347,
+ "step": 1591
+ },
+ {
+ "epoch": 0.7527186761229314,
+ "grad_norm": 2.61592698097229,
+ "learning_rate": 4.8287329670597225e-06,
+ "loss": 0.5472,
+ "step": 1592
+ },
+ {
+ "epoch": 0.7531914893617021,
+ "grad_norm": 2.7528622150421143,
+ "learning_rate": 4.828505972028296e-06,
+ "loss": 0.5842,
+ "step": 1593
+ },
+ {
+ "epoch": 0.7536643026004728,
+ "grad_norm": 2.8154072761535645,
+ "learning_rate": 4.828278832011011e-06,
+ "loss": 0.5757,
+ "step": 1594
+ },
+ {
+ "epoch": 0.7541371158392435,
+ "grad_norm": 3.118515729904175,
+ "learning_rate": 4.828051547022013e-06,
+ "loss": 0.6472,
+ "step": 1595
+ },
+ {
+ "epoch": 0.7546099290780142,
+ "grad_norm": 2.452033758163452,
+ "learning_rate": 4.827824117075453e-06,
+ "loss": 0.5571,
+ "step": 1596
+ },
+ {
+ "epoch": 0.7550827423167848,
+ "grad_norm": 2.984388828277588,
+ "learning_rate": 4.827596542185492e-06,
+ "loss": 0.6656,
+ "step": 1597
+ },
+ {
+ "epoch": 0.7555555555555555,
+ "grad_norm": 2.61356782913208,
+ "learning_rate": 4.8273688223663014e-06,
+ "loss": 0.6444,
+ "step": 1598
+ },
+ {
+ "epoch": 0.7560283687943262,
+ "grad_norm": 2.8967196941375732,
+ "learning_rate": 4.8271409576320595e-06,
+ "loss": 0.6457,
+ "step": 1599
+ },
+ {
+ "epoch": 0.7565011820330969,
+ "grad_norm": 2.852367639541626,
+ "learning_rate": 4.826912947996954e-06,
+ "loss": 0.5629,
+ "step": 1600
+ },
+ {
+ "epoch": 0.7569739952718676,
+ "grad_norm": 2.905280590057373,
+ "learning_rate": 4.826684793475182e-06,
+ "loss": 0.6245,
+ "step": 1601
+ },
+ {
+ "epoch": 0.7574468085106383,
+ "grad_norm": 2.6156530380249023,
+ "learning_rate": 4.826456494080951e-06,
+ "loss": 0.5869,
+ "step": 1602
+ },
+ {
+ "epoch": 0.757919621749409,
+ "grad_norm": 2.6490228176116943,
+ "learning_rate": 4.826228049828475e-06,
+ "loss": 0.5461,
+ "step": 1603
+ },
+ {
+ "epoch": 0.7583924349881797,
+ "grad_norm": 2.9626693725585938,
+ "learning_rate": 4.825999460731978e-06,
+ "loss": 0.6842,
+ "step": 1604
+ },
+ {
+ "epoch": 0.7588652482269503,
+ "grad_norm": 2.6866023540496826,
+ "learning_rate": 4.825770726805695e-06,
+ "loss": 0.5726,
+ "step": 1605
+ },
+ {
+ "epoch": 0.759338061465721,
+ "grad_norm": 2.5525858402252197,
+ "learning_rate": 4.825541848063866e-06,
+ "loss": 0.6061,
+ "step": 1606
+ },
+ {
+ "epoch": 0.7598108747044917,
+ "grad_norm": 2.703977584838867,
+ "learning_rate": 4.825312824520743e-06,
+ "loss": 0.6726,
+ "step": 1607
+ },
+ {
+ "epoch": 0.7602836879432624,
+ "grad_norm": 2.856534957885742,
+ "learning_rate": 4.825083656190588e-06,
+ "loss": 0.625,
+ "step": 1608
+ },
+ {
+ "epoch": 0.7607565011820331,
+ "grad_norm": 2.8564887046813965,
+ "learning_rate": 4.824854343087668e-06,
+ "loss": 0.7251,
+ "step": 1609
+ },
+ {
+ "epoch": 0.7612293144208038,
+ "grad_norm": 2.327650308609009,
+ "learning_rate": 4.824624885226262e-06,
+ "loss": 0.526,
+ "step": 1610
+ },
+ {
+ "epoch": 0.7617021276595745,
+ "grad_norm": 3.0025737285614014,
+ "learning_rate": 4.824395282620659e-06,
+ "loss": 0.6043,
+ "step": 1611
+ },
+ {
+ "epoch": 0.7621749408983451,
+ "grad_norm": 2.5441737174987793,
+ "learning_rate": 4.824165535285152e-06,
+ "loss": 0.6276,
+ "step": 1612
+ },
+ {
+ "epoch": 0.7626477541371158,
+ "grad_norm": 2.4177372455596924,
+ "learning_rate": 4.823935643234049e-06,
+ "loss": 0.6419,
+ "step": 1613
+ },
+ {
+ "epoch": 0.7631205673758865,
+ "grad_norm": 2.9210550785064697,
+ "learning_rate": 4.823705606481664e-06,
+ "loss": 0.5663,
+ "step": 1614
+ },
+ {
+ "epoch": 0.7635933806146572,
+ "grad_norm": 2.6353724002838135,
+ "learning_rate": 4.82347542504232e-06,
+ "loss": 0.5669,
+ "step": 1615
+ },
+ {
+ "epoch": 0.7640661938534279,
+ "grad_norm": 2.419081926345825,
+ "learning_rate": 4.823245098930349e-06,
+ "loss": 0.5777,
+ "step": 1616
+ },
+ {
+ "epoch": 0.7645390070921986,
+ "grad_norm": 2.5077571868896484,
+ "learning_rate": 4.823014628160093e-06,
+ "loss": 0.5924,
+ "step": 1617
+ },
+ {
+ "epoch": 0.7650118203309693,
+ "grad_norm": 2.816056251525879,
+ "learning_rate": 4.822784012745902e-06,
+ "loss": 0.7273,
+ "step": 1618
+ },
+ {
+ "epoch": 0.76548463356974,
+ "grad_norm": 2.7163147926330566,
+ "learning_rate": 4.8225532527021366e-06,
+ "loss": 0.5545,
+ "step": 1619
+ },
+ {
+ "epoch": 0.7659574468085106,
+ "grad_norm": 2.4784302711486816,
+ "learning_rate": 4.822322348043164e-06,
+ "loss": 0.556,
+ "step": 1620
+ },
+ {
+ "epoch": 0.7664302600472813,
+ "grad_norm": 2.712467670440674,
+ "learning_rate": 4.822091298783361e-06,
+ "loss": 0.6501,
+ "step": 1621
+ },
+ {
+ "epoch": 0.766903073286052,
+ "grad_norm": 2.7217724323272705,
+ "learning_rate": 4.821860104937115e-06,
+ "loss": 0.5989,
+ "step": 1622
+ },
+ {
+ "epoch": 0.7673758865248227,
+ "grad_norm": 2.5622854232788086,
+ "learning_rate": 4.821628766518821e-06,
+ "loss": 0.5263,
+ "step": 1623
+ },
+ {
+ "epoch": 0.7678486997635934,
+ "grad_norm": 3.230923891067505,
+ "learning_rate": 4.821397283542884e-06,
+ "loss": 0.6707,
+ "step": 1624
+ },
+ {
+ "epoch": 0.7683215130023641,
+ "grad_norm": 2.37929105758667,
+ "learning_rate": 4.821165656023718e-06,
+ "loss": 0.6124,
+ "step": 1625
+ },
+ {
+ "epoch": 0.7687943262411348,
+ "grad_norm": 2.9811325073242188,
+ "learning_rate": 4.820933883975745e-06,
+ "loss": 0.6435,
+ "step": 1626
+ },
+ {
+ "epoch": 0.7692671394799054,
+ "grad_norm": 2.887380838394165,
+ "learning_rate": 4.820701967413395e-06,
+ "loss": 0.621,
+ "step": 1627
+ },
+ {
+ "epoch": 0.7697399527186761,
+ "grad_norm": 2.6762876510620117,
+ "learning_rate": 4.820469906351109e-06,
+ "loss": 0.5713,
+ "step": 1628
+ },
+ {
+ "epoch": 0.7702127659574468,
+ "grad_norm": 2.7347512245178223,
+ "learning_rate": 4.820237700803337e-06,
+ "loss": 0.6136,
+ "step": 1629
+ },
+ {
+ "epoch": 0.7706855791962175,
+ "grad_norm": 2.7244746685028076,
+ "learning_rate": 4.820005350784539e-06,
+ "loss": 0.5816,
+ "step": 1630
+ },
+ {
+ "epoch": 0.7711583924349882,
+ "grad_norm": 2.9293999671936035,
+ "learning_rate": 4.8197728563091795e-06,
+ "loss": 0.6649,
+ "step": 1631
+ },
+ {
+ "epoch": 0.7716312056737589,
+ "grad_norm": 2.4402127265930176,
+ "learning_rate": 4.819540217391736e-06,
+ "loss": 0.6481,
+ "step": 1632
+ },
+ {
+ "epoch": 0.7721040189125296,
+ "grad_norm": 3.083941698074341,
+ "learning_rate": 4.819307434046694e-06,
+ "loss": 0.6951,
+ "step": 1633
+ },
+ {
+ "epoch": 0.7725768321513002,
+ "grad_norm": 2.544952392578125,
+ "learning_rate": 4.819074506288548e-06,
+ "loss": 0.539,
+ "step": 1634
+ },
+ {
+ "epoch": 0.7730496453900709,
+ "grad_norm": 2.7791268825531006,
+ "learning_rate": 4.818841434131801e-06,
+ "loss": 0.5827,
+ "step": 1635
+ },
+ {
+ "epoch": 0.7735224586288416,
+ "grad_norm": 2.7349796295166016,
+ "learning_rate": 4.818608217590967e-06,
+ "loss": 0.5584,
+ "step": 1636
+ },
+ {
+ "epoch": 0.7739952718676123,
+ "grad_norm": 2.637652635574341,
+ "learning_rate": 4.818374856680565e-06,
+ "loss": 0.6386,
+ "step": 1637
+ },
+ {
+ "epoch": 0.774468085106383,
+ "grad_norm": 2.9821584224700928,
+ "learning_rate": 4.818141351415127e-06,
+ "loss": 0.6734,
+ "step": 1638
+ },
+ {
+ "epoch": 0.7749408983451537,
+ "grad_norm": 2.992938995361328,
+ "learning_rate": 4.817907701809192e-06,
+ "loss": 0.5899,
+ "step": 1639
+ },
+ {
+ "epoch": 0.7754137115839244,
+ "grad_norm": 4.35719633102417,
+ "learning_rate": 4.8176739078773076e-06,
+ "loss": 0.6281,
+ "step": 1640
+ },
+ {
+ "epoch": 0.775886524822695,
+ "grad_norm": 2.838146209716797,
+ "learning_rate": 4.8174399696340315e-06,
+ "loss": 0.5766,
+ "step": 1641
+ },
+ {
+ "epoch": 0.7763593380614657,
+ "grad_norm": 3.3116989135742188,
+ "learning_rate": 4.81720588709393e-06,
+ "loss": 0.6409,
+ "step": 1642
+ },
+ {
+ "epoch": 0.7768321513002364,
+ "grad_norm": 2.9843590259552,
+ "learning_rate": 4.816971660271579e-06,
+ "loss": 0.6108,
+ "step": 1643
+ },
+ {
+ "epoch": 0.7773049645390071,
+ "grad_norm": 2.843770742416382,
+ "learning_rate": 4.816737289181562e-06,
+ "loss": 0.6053,
+ "step": 1644
+ },
+ {
+ "epoch": 0.7777777777777778,
+ "grad_norm": 2.7608556747436523,
+ "learning_rate": 4.816502773838473e-06,
+ "loss": 0.5854,
+ "step": 1645
+ },
+ {
+ "epoch": 0.7782505910165485,
+ "grad_norm": 3.343682289123535,
+ "learning_rate": 4.816268114256914e-06,
+ "loss": 0.6329,
+ "step": 1646
+ },
+ {
+ "epoch": 0.7787234042553192,
+ "grad_norm": 2.769768476486206,
+ "learning_rate": 4.816033310451496e-06,
+ "loss": 0.6242,
+ "step": 1647
+ },
+ {
+ "epoch": 0.7791962174940898,
+ "grad_norm": 2.989851713180542,
+ "learning_rate": 4.815798362436838e-06,
+ "loss": 0.6493,
+ "step": 1648
+ },
+ {
+ "epoch": 0.7796690307328605,
+ "grad_norm": 3.170736312866211,
+ "learning_rate": 4.8155632702275716e-06,
+ "loss": 0.6341,
+ "step": 1649
+ },
+ {
+ "epoch": 0.7801418439716312,
+ "grad_norm": 2.7372522354125977,
+ "learning_rate": 4.815328033838334e-06,
+ "loss": 0.5445,
+ "step": 1650
+ },
+ {
+ "epoch": 0.7806146572104019,
+ "grad_norm": 2.6947238445281982,
+ "learning_rate": 4.8150926532837715e-06,
+ "loss": 0.6437,
+ "step": 1651
+ },
+ {
+ "epoch": 0.7810874704491726,
+ "grad_norm": 2.472323179244995,
+ "learning_rate": 4.81485712857854e-06,
+ "loss": 0.5751,
+ "step": 1652
+ },
+ {
+ "epoch": 0.7815602836879433,
+ "grad_norm": 2.791114091873169,
+ "learning_rate": 4.814621459737308e-06,
+ "loss": 0.5996,
+ "step": 1653
+ },
+ {
+ "epoch": 0.782033096926714,
+ "grad_norm": 3.1957521438598633,
+ "learning_rate": 4.814385646774745e-06,
+ "loss": 0.5803,
+ "step": 1654
+ },
+ {
+ "epoch": 0.7825059101654847,
+ "grad_norm": 2.4120798110961914,
+ "learning_rate": 4.8141496897055364e-06,
+ "loss": 0.5814,
+ "step": 1655
+ },
+ {
+ "epoch": 0.7829787234042553,
+ "grad_norm": 2.9262423515319824,
+ "learning_rate": 4.813913588544374e-06,
+ "loss": 0.6292,
+ "step": 1656
+ },
+ {
+ "epoch": 0.783451536643026,
+ "grad_norm": 2.8251047134399414,
+ "learning_rate": 4.813677343305959e-06,
+ "loss": 0.6787,
+ "step": 1657
+ },
+ {
+ "epoch": 0.7839243498817967,
+ "grad_norm": 2.931659698486328,
+ "learning_rate": 4.8134409540050005e-06,
+ "loss": 0.6163,
+ "step": 1658
+ },
+ {
+ "epoch": 0.7843971631205674,
+ "grad_norm": 2.7160706520080566,
+ "learning_rate": 4.813204420656219e-06,
+ "loss": 0.6831,
+ "step": 1659
+ },
+ {
+ "epoch": 0.7848699763593381,
+ "grad_norm": 3.2134454250335693,
+ "learning_rate": 4.81296774327434e-06,
+ "loss": 0.6002,
+ "step": 1660
+ },
+ {
+ "epoch": 0.7853427895981088,
+ "grad_norm": 2.4002513885498047,
+ "learning_rate": 4.812730921874103e-06,
+ "loss": 0.5488,
+ "step": 1661
+ },
+ {
+ "epoch": 0.7858156028368795,
+ "grad_norm": 2.5559282302856445,
+ "learning_rate": 4.812493956470251e-06,
+ "loss": 0.5802,
+ "step": 1662
+ },
+ {
+ "epoch": 0.7862884160756501,
+ "grad_norm": 2.57478404045105,
+ "learning_rate": 4.812256847077541e-06,
+ "loss": 0.646,
+ "step": 1663
+ },
+ {
+ "epoch": 0.7867612293144208,
+ "grad_norm": 2.811851978302002,
+ "learning_rate": 4.812019593710736e-06,
+ "loss": 0.6245,
+ "step": 1664
+ },
+ {
+ "epoch": 0.7872340425531915,
+ "grad_norm": 2.5228829383850098,
+ "learning_rate": 4.811782196384609e-06,
+ "loss": 0.5949,
+ "step": 1665
+ },
+ {
+ "epoch": 0.7877068557919622,
+ "grad_norm": 2.744096040725708,
+ "learning_rate": 4.8115446551139415e-06,
+ "loss": 0.6006,
+ "step": 1666
+ },
+ {
+ "epoch": 0.7881796690307329,
+ "grad_norm": 3.129242420196533,
+ "learning_rate": 4.811306969913524e-06,
+ "loss": 0.7251,
+ "step": 1667
+ },
+ {
+ "epoch": 0.7886524822695036,
+ "grad_norm": 2.7855660915374756,
+ "learning_rate": 4.811069140798156e-06,
+ "loss": 0.6534,
+ "step": 1668
+ },
+ {
+ "epoch": 0.7891252955082743,
+ "grad_norm": 2.836603879928589,
+ "learning_rate": 4.810831167782647e-06,
+ "loss": 0.6661,
+ "step": 1669
+ },
+ {
+ "epoch": 0.789598108747045,
+ "grad_norm": 2.5339887142181396,
+ "learning_rate": 4.810593050881813e-06,
+ "loss": 0.5354,
+ "step": 1670
+ },
+ {
+ "epoch": 0.7900709219858156,
+ "grad_norm": 2.9553709030151367,
+ "learning_rate": 4.810354790110482e-06,
+ "loss": 0.6001,
+ "step": 1671
+ },
+ {
+ "epoch": 0.7905437352245863,
+ "grad_norm": 2.6581788063049316,
+ "learning_rate": 4.8101163854834885e-06,
+ "loss": 0.6802,
+ "step": 1672
+ },
+ {
+ "epoch": 0.791016548463357,
+ "grad_norm": 3.2002551555633545,
+ "learning_rate": 4.809877837015677e-06,
+ "loss": 0.6641,
+ "step": 1673
+ },
+ {
+ "epoch": 0.7914893617021277,
+ "grad_norm": 2.918792963027954,
+ "learning_rate": 4.809639144721902e-06,
+ "loss": 0.6758,
+ "step": 1674
+ },
+ {
+ "epoch": 0.7919621749408984,
+ "grad_norm": 2.7993946075439453,
+ "learning_rate": 4.8094003086170245e-06,
+ "loss": 0.5889,
+ "step": 1675
+ },
+ {
+ "epoch": 0.7924349881796691,
+ "grad_norm": 2.3698952198028564,
+ "learning_rate": 4.809161328715916e-06,
+ "loss": 0.6244,
+ "step": 1676
+ },
+ {
+ "epoch": 0.7929078014184398,
+ "grad_norm": 2.8891594409942627,
+ "learning_rate": 4.808922205033458e-06,
+ "loss": 0.5835,
+ "step": 1677
+ },
+ {
+ "epoch": 0.7933806146572104,
+ "grad_norm": 2.838345766067505,
+ "learning_rate": 4.808682937584537e-06,
+ "loss": 0.6907,
+ "step": 1678
+ },
+ {
+ "epoch": 0.7938534278959811,
+ "grad_norm": 2.8443174362182617,
+ "learning_rate": 4.808443526384053e-06,
+ "loss": 0.6692,
+ "step": 1679
+ },
+ {
+ "epoch": 0.7943262411347518,
+ "grad_norm": 2.7355034351348877,
+ "learning_rate": 4.808203971446913e-06,
+ "loss": 0.5799,
+ "step": 1680
+ },
+ {
+ "epoch": 0.7947990543735225,
+ "grad_norm": 2.7108020782470703,
+ "learning_rate": 4.807964272788033e-06,
+ "loss": 0.652,
+ "step": 1681
+ },
+ {
+ "epoch": 0.7952718676122932,
+ "grad_norm": 2.397650957107544,
+ "learning_rate": 4.807724430422338e-06,
+ "loss": 0.5418,
+ "step": 1682
+ },
+ {
+ "epoch": 0.7957446808510639,
+ "grad_norm": 2.4981582164764404,
+ "learning_rate": 4.807484444364762e-06,
+ "loss": 0.5731,
+ "step": 1683
+ },
+ {
+ "epoch": 0.7962174940898346,
+ "grad_norm": 2.7943713665008545,
+ "learning_rate": 4.8072443146302475e-06,
+ "loss": 0.5913,
+ "step": 1684
+ },
+ {
+ "epoch": 0.7966903073286052,
+ "grad_norm": 2.5691423416137695,
+ "learning_rate": 4.807004041233746e-06,
+ "loss": 0.6475,
+ "step": 1685
+ },
+ {
+ "epoch": 0.7971631205673759,
+ "grad_norm": 3.2367498874664307,
+ "learning_rate": 4.8067636241902195e-06,
+ "loss": 0.675,
+ "step": 1686
+ },
+ {
+ "epoch": 0.7976359338061466,
+ "grad_norm": 3.000595808029175,
+ "learning_rate": 4.806523063514637e-06,
+ "loss": 0.5481,
+ "step": 1687
+ },
+ {
+ "epoch": 0.7981087470449173,
+ "grad_norm": 2.702014207839966,
+ "learning_rate": 4.806282359221976e-06,
+ "loss": 0.5993,
+ "step": 1688
+ },
+ {
+ "epoch": 0.798581560283688,
+ "grad_norm": 2.383671998977661,
+ "learning_rate": 4.806041511327226e-06,
+ "loss": 0.562,
+ "step": 1689
+ },
+ {
+ "epoch": 0.7990543735224587,
+ "grad_norm": 2.6965041160583496,
+ "learning_rate": 4.8058005198453834e-06,
+ "loss": 0.5955,
+ "step": 1690
+ },
+ {
+ "epoch": 0.7995271867612294,
+ "grad_norm": 2.5906765460968018,
+ "learning_rate": 4.805559384791453e-06,
+ "loss": 0.5151,
+ "step": 1691
+ },
+ {
+ "epoch": 0.8,
+ "grad_norm": 2.5454652309417725,
+ "learning_rate": 4.8053181061804475e-06,
+ "loss": 0.5843,
+ "step": 1692
+ },
+ {
+ "epoch": 0.8004728132387707,
+ "grad_norm": 2.661343812942505,
+ "learning_rate": 4.8050766840273935e-06,
+ "loss": 0.5995,
+ "step": 1693
+ },
+ {
+ "epoch": 0.8009456264775414,
+ "grad_norm": 2.5635924339294434,
+ "learning_rate": 4.8048351183473215e-06,
+ "loss": 0.5676,
+ "step": 1694
+ },
+ {
+ "epoch": 0.8014184397163121,
+ "grad_norm": 2.5936667919158936,
+ "learning_rate": 4.804593409155274e-06,
+ "loss": 0.6291,
+ "step": 1695
+ },
+ {
+ "epoch": 0.8018912529550828,
+ "grad_norm": 2.6902432441711426,
+ "learning_rate": 4.804351556466299e-06,
+ "loss": 0.6114,
+ "step": 1696
+ },
+ {
+ "epoch": 0.8023640661938535,
+ "grad_norm": 2.7764673233032227,
+ "learning_rate": 4.804109560295457e-06,
+ "loss": 0.5768,
+ "step": 1697
+ },
+ {
+ "epoch": 0.8028368794326242,
+ "grad_norm": 2.9587221145629883,
+ "learning_rate": 4.803867420657816e-06,
+ "loss": 0.6048,
+ "step": 1698
+ },
+ {
+ "epoch": 0.8033096926713948,
+ "grad_norm": 2.9238998889923096,
+ "learning_rate": 4.803625137568453e-06,
+ "loss": 0.6329,
+ "step": 1699
+ },
+ {
+ "epoch": 0.8037825059101655,
+ "grad_norm": 2.70473313331604,
+ "learning_rate": 4.803382711042455e-06,
+ "loss": 0.5427,
+ "step": 1700
+ },
+ {
+ "epoch": 0.8042553191489362,
+ "grad_norm": 3.1604979038238525,
+ "learning_rate": 4.803140141094914e-06,
+ "loss": 0.626,
+ "step": 1701
+ },
+ {
+ "epoch": 0.8047281323877069,
+ "grad_norm": 2.9567699432373047,
+ "learning_rate": 4.802897427740936e-06,
+ "loss": 0.5319,
+ "step": 1702
+ },
+ {
+ "epoch": 0.8052009456264776,
+ "grad_norm": 2.90983247756958,
+ "learning_rate": 4.802654570995632e-06,
+ "loss": 0.586,
+ "step": 1703
+ },
+ {
+ "epoch": 0.8056737588652483,
+ "grad_norm": 2.783480167388916,
+ "learning_rate": 4.8024115708741255e-06,
+ "loss": 0.5773,
+ "step": 1704
+ },
+ {
+ "epoch": 0.806146572104019,
+ "grad_norm": 3.3307793140411377,
+ "learning_rate": 4.802168427391547e-06,
+ "loss": 0.6257,
+ "step": 1705
+ },
+ {
+ "epoch": 0.8066193853427897,
+ "grad_norm": 3.0475001335144043,
+ "learning_rate": 4.801925140563034e-06,
+ "loss": 0.6612,
+ "step": 1706
+ },
+ {
+ "epoch": 0.8070921985815603,
+ "grad_norm": 2.8278894424438477,
+ "learning_rate": 4.8016817104037375e-06,
+ "loss": 0.6449,
+ "step": 1707
+ },
+ {
+ "epoch": 0.807565011820331,
+ "grad_norm": 2.760244369506836,
+ "learning_rate": 4.801438136928812e-06,
+ "loss": 0.7007,
+ "step": 1708
+ },
+ {
+ "epoch": 0.8080378250591016,
+ "grad_norm": 2.827634572982788,
+ "learning_rate": 4.801194420153427e-06,
+ "loss": 0.6418,
+ "step": 1709
+ },
+ {
+ "epoch": 0.8085106382978723,
+ "grad_norm": 2.8655009269714355,
+ "learning_rate": 4.800950560092754e-06,
+ "loss": 0.6231,
+ "step": 1710
+ },
+ {
+ "epoch": 0.808983451536643,
+ "grad_norm": 2.738112688064575,
+ "learning_rate": 4.800706556761981e-06,
+ "loss": 0.6463,
+ "step": 1711
+ },
+ {
+ "epoch": 0.8094562647754137,
+ "grad_norm": 2.4781179428100586,
+ "learning_rate": 4.800462410176296e-06,
+ "loss": 0.5365,
+ "step": 1712
+ },
+ {
+ "epoch": 0.8099290780141843,
+ "grad_norm": 2.6049838066101074,
+ "learning_rate": 4.800218120350906e-06,
+ "loss": 0.6035,
+ "step": 1713
+ },
+ {
+ "epoch": 0.810401891252955,
+ "grad_norm": 2.9089980125427246,
+ "learning_rate": 4.79997368730102e-06,
+ "loss": 0.5828,
+ "step": 1714
+ },
+ {
+ "epoch": 0.8108747044917257,
+ "grad_norm": 2.831871747970581,
+ "learning_rate": 4.799729111041857e-06,
+ "loss": 0.5953,
+ "step": 1715
+ },
+ {
+ "epoch": 0.8113475177304964,
+ "grad_norm": 2.5611300468444824,
+ "learning_rate": 4.799484391588647e-06,
+ "loss": 0.6302,
+ "step": 1716
+ },
+ {
+ "epoch": 0.8118203309692671,
+ "grad_norm": 2.744070053100586,
+ "learning_rate": 4.799239528956625e-06,
+ "loss": 0.5561,
+ "step": 1717
+ },
+ {
+ "epoch": 0.8122931442080378,
+ "grad_norm": 2.7344231605529785,
+ "learning_rate": 4.798994523161041e-06,
+ "loss": 0.6317,
+ "step": 1718
+ },
+ {
+ "epoch": 0.8127659574468085,
+ "grad_norm": 2.3420889377593994,
+ "learning_rate": 4.798749374217149e-06,
+ "loss": 0.5415,
+ "step": 1719
+ },
+ {
+ "epoch": 0.8132387706855791,
+ "grad_norm": 2.57384991645813,
+ "learning_rate": 4.798504082140212e-06,
+ "loss": 0.6383,
+ "step": 1720
+ },
+ {
+ "epoch": 0.8137115839243498,
+ "grad_norm": 2.8819844722747803,
+ "learning_rate": 4.798258646945505e-06,
+ "loss": 0.6355,
+ "step": 1721
+ },
+ {
+ "epoch": 0.8141843971631205,
+ "grad_norm": 2.908123254776001,
+ "learning_rate": 4.79801306864831e-06,
+ "loss": 0.701,
+ "step": 1722
+ },
+ {
+ "epoch": 0.8146572104018912,
+ "grad_norm": 2.6500701904296875,
+ "learning_rate": 4.797767347263917e-06,
+ "loss": 0.6152,
+ "step": 1723
+ },
+ {
+ "epoch": 0.8151300236406619,
+ "grad_norm": 2.5513017177581787,
+ "learning_rate": 4.797521482807628e-06,
+ "loss": 0.6241,
+ "step": 1724
+ },
+ {
+ "epoch": 0.8156028368794326,
+ "grad_norm": 2.6239185333251953,
+ "learning_rate": 4.7972754752947495e-06,
+ "loss": 0.6072,
+ "step": 1725
+ },
+ {
+ "epoch": 0.8160756501182033,
+ "grad_norm": 2.673436403274536,
+ "learning_rate": 4.797029324740601e-06,
+ "loss": 0.5802,
+ "step": 1726
+ },
+ {
+ "epoch": 0.816548463356974,
+ "grad_norm": 2.533831834793091,
+ "learning_rate": 4.796783031160508e-06,
+ "loss": 0.5566,
+ "step": 1727
+ },
+ {
+ "epoch": 0.8170212765957446,
+ "grad_norm": 2.9806582927703857,
+ "learning_rate": 4.796536594569807e-06,
+ "loss": 0.6945,
+ "step": 1728
+ },
+ {
+ "epoch": 0.8174940898345153,
+ "grad_norm": 2.7093560695648193,
+ "learning_rate": 4.796290014983842e-06,
+ "loss": 0.7143,
+ "step": 1729
+ },
+ {
+ "epoch": 0.817966903073286,
+ "grad_norm": 2.814507246017456,
+ "learning_rate": 4.796043292417967e-06,
+ "loss": 0.6122,
+ "step": 1730
+ },
+ {
+ "epoch": 0.8184397163120567,
+ "grad_norm": 2.537156820297241,
+ "learning_rate": 4.795796426887543e-06,
+ "loss": 0.6229,
+ "step": 1731
+ },
+ {
+ "epoch": 0.8189125295508274,
+ "grad_norm": 2.4878013134002686,
+ "learning_rate": 4.795549418407944e-06,
+ "loss": 0.5442,
+ "step": 1732
+ },
+ {
+ "epoch": 0.8193853427895981,
+ "grad_norm": 2.839383363723755,
+ "learning_rate": 4.795302266994548e-06,
+ "loss": 0.6717,
+ "step": 1733
+ },
+ {
+ "epoch": 0.8198581560283688,
+ "grad_norm": 3.1981801986694336,
+ "learning_rate": 4.795054972662744e-06,
+ "loss": 0.6596,
+ "step": 1734
+ },
+ {
+ "epoch": 0.8203309692671394,
+ "grad_norm": 2.781730890274048,
+ "learning_rate": 4.79480753542793e-06,
+ "loss": 0.5845,
+ "step": 1735
+ },
+ {
+ "epoch": 0.8208037825059101,
+ "grad_norm": 2.689948558807373,
+ "learning_rate": 4.794559955305513e-06,
+ "loss": 0.5928,
+ "step": 1736
+ },
+ {
+ "epoch": 0.8212765957446808,
+ "grad_norm": 2.7267637252807617,
+ "learning_rate": 4.7943122323109105e-06,
+ "loss": 0.5224,
+ "step": 1737
+ },
+ {
+ "epoch": 0.8217494089834515,
+ "grad_norm": 2.4346601963043213,
+ "learning_rate": 4.794064366459544e-06,
+ "loss": 0.6431,
+ "step": 1738
+ },
+ {
+ "epoch": 0.8222222222222222,
+ "grad_norm": 2.7440176010131836,
+ "learning_rate": 4.793816357766849e-06,
+ "loss": 0.6083,
+ "step": 1739
+ },
+ {
+ "epoch": 0.8226950354609929,
+ "grad_norm": 2.6558027267456055,
+ "learning_rate": 4.793568206248268e-06,
+ "loss": 0.698,
+ "step": 1740
+ },
+ {
+ "epoch": 0.8231678486997636,
+ "grad_norm": 2.591658353805542,
+ "learning_rate": 4.793319911919251e-06,
+ "loss": 0.6601,
+ "step": 1741
+ },
+ {
+ "epoch": 0.8236406619385342,
+ "grad_norm": 2.5431172847747803,
+ "learning_rate": 4.79307147479526e-06,
+ "loss": 0.5917,
+ "step": 1742
+ },
+ {
+ "epoch": 0.8241134751773049,
+ "grad_norm": 2.7335588932037354,
+ "learning_rate": 4.792822894891762e-06,
+ "loss": 0.5925,
+ "step": 1743
+ },
+ {
+ "epoch": 0.8245862884160756,
+ "grad_norm": 2.2500839233398438,
+ "learning_rate": 4.792574172224237e-06,
+ "loss": 0.4984,
+ "step": 1744
+ },
+ {
+ "epoch": 0.8250591016548463,
+ "grad_norm": 2.691343069076538,
+ "learning_rate": 4.79232530680817e-06,
+ "loss": 0.6262,
+ "step": 1745
+ },
+ {
+ "epoch": 0.825531914893617,
+ "grad_norm": 2.612204074859619,
+ "learning_rate": 4.792076298659058e-06,
+ "loss": 0.5822,
+ "step": 1746
+ },
+ {
+ "epoch": 0.8260047281323877,
+ "grad_norm": 3.0163519382476807,
+ "learning_rate": 4.791827147792406e-06,
+ "loss": 0.6263,
+ "step": 1747
+ },
+ {
+ "epoch": 0.8264775413711584,
+ "grad_norm": 2.742183208465576,
+ "learning_rate": 4.791577854223727e-06,
+ "loss": 0.6628,
+ "step": 1748
+ },
+ {
+ "epoch": 0.826950354609929,
+ "grad_norm": 2.872213840484619,
+ "learning_rate": 4.791328417968542e-06,
+ "loss": 0.6332,
+ "step": 1749
+ },
+ {
+ "epoch": 0.8274231678486997,
+ "grad_norm": 2.725006580352783,
+ "learning_rate": 4.7910788390423844e-06,
+ "loss": 0.6266,
+ "step": 1750
+ },
+ {
+ "epoch": 0.8278959810874704,
+ "grad_norm": 3.0366697311401367,
+ "learning_rate": 4.790829117460793e-06,
+ "loss": 0.6403,
+ "step": 1751
+ },
+ {
+ "epoch": 0.8283687943262411,
+ "grad_norm": 2.594881772994995,
+ "learning_rate": 4.790579253239318e-06,
+ "loss": 0.521,
+ "step": 1752
+ },
+ {
+ "epoch": 0.8288416075650118,
+ "grad_norm": 2.4496347904205322,
+ "learning_rate": 4.790329246393517e-06,
+ "loss": 0.54,
+ "step": 1753
+ },
+ {
+ "epoch": 0.8293144208037825,
+ "grad_norm": 3.102278470993042,
+ "learning_rate": 4.790079096938956e-06,
+ "loss": 0.6142,
+ "step": 1754
+ },
+ {
+ "epoch": 0.8297872340425532,
+ "grad_norm": 2.4645912647247314,
+ "learning_rate": 4.789828804891212e-06,
+ "loss": 0.5212,
+ "step": 1755
+ },
+ {
+ "epoch": 0.8302600472813239,
+ "grad_norm": 2.7482516765594482,
+ "learning_rate": 4.789578370265868e-06,
+ "loss": 0.6712,
+ "step": 1756
+ },
+ {
+ "epoch": 0.8307328605200945,
+ "grad_norm": 2.61360502243042,
+ "learning_rate": 4.7893277930785195e-06,
+ "loss": 0.6367,
+ "step": 1757
+ },
+ {
+ "epoch": 0.8312056737588652,
+ "grad_norm": 2.79028058052063,
+ "learning_rate": 4.789077073344767e-06,
+ "loss": 0.5099,
+ "step": 1758
+ },
+ {
+ "epoch": 0.8316784869976359,
+ "grad_norm": 2.647662401199341,
+ "learning_rate": 4.788826211080222e-06,
+ "loss": 0.6698,
+ "step": 1759
+ },
+ {
+ "epoch": 0.8321513002364066,
+ "grad_norm": 3.0214831829071045,
+ "learning_rate": 4.7885752063005055e-06,
+ "loss": 0.6121,
+ "step": 1760
+ },
+ {
+ "epoch": 0.8326241134751773,
+ "grad_norm": 2.8244032859802246,
+ "learning_rate": 4.788324059021247e-06,
+ "loss": 0.6921,
+ "step": 1761
+ },
+ {
+ "epoch": 0.833096926713948,
+ "grad_norm": 3.1501076221466064,
+ "learning_rate": 4.788072769258082e-06,
+ "loss": 0.6872,
+ "step": 1762
+ },
+ {
+ "epoch": 0.8335697399527187,
+ "grad_norm": 2.6989903450012207,
+ "learning_rate": 4.7878213370266594e-06,
+ "loss": 0.5884,
+ "step": 1763
+ },
+ {
+ "epoch": 0.8340425531914893,
+ "grad_norm": 2.6982665061950684,
+ "learning_rate": 4.787569762342633e-06,
+ "loss": 0.6112,
+ "step": 1764
+ },
+ {
+ "epoch": 0.83451536643026,
+ "grad_norm": 2.6918323040008545,
+ "learning_rate": 4.7873180452216685e-06,
+ "loss": 0.5315,
+ "step": 1765
+ },
+ {
+ "epoch": 0.8349881796690307,
+ "grad_norm": 2.5494401454925537,
+ "learning_rate": 4.78706618567944e-06,
+ "loss": 0.5909,
+ "step": 1766
+ },
+ {
+ "epoch": 0.8354609929078014,
+ "grad_norm": 2.7532095909118652,
+ "learning_rate": 4.786814183731627e-06,
+ "loss": 0.5566,
+ "step": 1767
+ },
+ {
+ "epoch": 0.8359338061465721,
+ "grad_norm": 2.550865888595581,
+ "learning_rate": 4.786562039393923e-06,
+ "loss": 0.555,
+ "step": 1768
+ },
+ {
+ "epoch": 0.8364066193853428,
+ "grad_norm": 2.4477791786193848,
+ "learning_rate": 4.786309752682028e-06,
+ "loss": 0.5844,
+ "step": 1769
+ },
+ {
+ "epoch": 0.8368794326241135,
+ "grad_norm": 2.6982262134552,
+ "learning_rate": 4.7860573236116485e-06,
+ "loss": 0.6136,
+ "step": 1770
+ },
+ {
+ "epoch": 0.8373522458628841,
+ "grad_norm": 2.456263542175293,
+ "learning_rate": 4.785804752198503e-06,
+ "loss": 0.5055,
+ "step": 1771
+ },
+ {
+ "epoch": 0.8378250591016548,
+ "grad_norm": 2.428544521331787,
+ "learning_rate": 4.78555203845832e-06,
+ "loss": 0.5859,
+ "step": 1772
+ },
+ {
+ "epoch": 0.8382978723404255,
+ "grad_norm": 2.1782307624816895,
+ "learning_rate": 4.785299182406833e-06,
+ "loss": 0.5325,
+ "step": 1773
+ },
+ {
+ "epoch": 0.8387706855791962,
+ "grad_norm": 3.137956142425537,
+ "learning_rate": 4.785046184059786e-06,
+ "loss": 0.6097,
+ "step": 1774
+ },
+ {
+ "epoch": 0.8392434988179669,
+ "grad_norm": 2.6269001960754395,
+ "learning_rate": 4.7847930434329336e-06,
+ "loss": 0.5972,
+ "step": 1775
+ },
+ {
+ "epoch": 0.8397163120567376,
+ "grad_norm": 2.732659339904785,
+ "learning_rate": 4.784539760542037e-06,
+ "loss": 0.6054,
+ "step": 1776
+ },
+ {
+ "epoch": 0.8401891252955083,
+ "grad_norm": 2.5346736907958984,
+ "learning_rate": 4.784286335402866e-06,
+ "loss": 0.5521,
+ "step": 1777
+ },
+ {
+ "epoch": 0.840661938534279,
+ "grad_norm": 3.1420228481292725,
+ "learning_rate": 4.784032768031202e-06,
+ "loss": 0.6165,
+ "step": 1778
+ },
+ {
+ "epoch": 0.8411347517730496,
+ "grad_norm": 3.073793411254883,
+ "learning_rate": 4.783779058442831e-06,
+ "loss": 0.6414,
+ "step": 1779
+ },
+ {
+ "epoch": 0.8416075650118203,
+ "grad_norm": 2.6621336936950684,
+ "learning_rate": 4.783525206653554e-06,
+ "loss": 0.5836,
+ "step": 1780
+ },
+ {
+ "epoch": 0.842080378250591,
+ "grad_norm": 2.7029049396514893,
+ "learning_rate": 4.7832712126791745e-06,
+ "loss": 0.5897,
+ "step": 1781
+ },
+ {
+ "epoch": 0.8425531914893617,
+ "grad_norm": 2.4733822345733643,
+ "learning_rate": 4.783017076535509e-06,
+ "loss": 0.5913,
+ "step": 1782
+ },
+ {
+ "epoch": 0.8430260047281324,
+ "grad_norm": 2.8119473457336426,
+ "learning_rate": 4.782762798238381e-06,
+ "loss": 0.6105,
+ "step": 1783
+ },
+ {
+ "epoch": 0.8434988179669031,
+ "grad_norm": 2.5290818214416504,
+ "learning_rate": 4.782508377803622e-06,
+ "loss": 0.6119,
+ "step": 1784
+ },
+ {
+ "epoch": 0.8439716312056738,
+ "grad_norm": 3.193472385406494,
+ "learning_rate": 4.782253815247076e-06,
+ "loss": 0.6665,
+ "step": 1785
+ },
+ {
+ "epoch": 0.8444444444444444,
+ "grad_norm": 3.206759452819824,
+ "learning_rate": 4.781999110584592e-06,
+ "loss": 0.6012,
+ "step": 1786
+ },
+ {
+ "epoch": 0.8449172576832151,
+ "grad_norm": 2.6227457523345947,
+ "learning_rate": 4.781744263832029e-06,
+ "loss": 0.5845,
+ "step": 1787
+ },
+ {
+ "epoch": 0.8453900709219858,
+ "grad_norm": 2.838365316390991,
+ "learning_rate": 4.781489275005257e-06,
+ "loss": 0.5695,
+ "step": 1788
+ },
+ {
+ "epoch": 0.8458628841607565,
+ "grad_norm": 2.8348326683044434,
+ "learning_rate": 4.78123414412015e-06,
+ "loss": 0.6136,
+ "step": 1789
+ },
+ {
+ "epoch": 0.8463356973995272,
+ "grad_norm": 2.5698344707489014,
+ "learning_rate": 4.780978871192597e-06,
+ "loss": 0.6576,
+ "step": 1790
+ },
+ {
+ "epoch": 0.8468085106382979,
+ "grad_norm": 2.5198330879211426,
+ "learning_rate": 4.780723456238492e-06,
+ "loss": 0.5521,
+ "step": 1791
+ },
+ {
+ "epoch": 0.8472813238770686,
+ "grad_norm": 3.001325845718384,
+ "learning_rate": 4.780467899273737e-06,
+ "loss": 0.6075,
+ "step": 1792
+ },
+ {
+ "epoch": 0.8477541371158392,
+ "grad_norm": 2.7732746601104736,
+ "learning_rate": 4.780212200314247e-06,
+ "loss": 0.6245,
+ "step": 1793
+ },
+ {
+ "epoch": 0.8482269503546099,
+ "grad_norm": 2.6950337886810303,
+ "learning_rate": 4.77995635937594e-06,
+ "loss": 0.5723,
+ "step": 1794
+ },
+ {
+ "epoch": 0.8486997635933806,
+ "grad_norm": 2.82051420211792,
+ "learning_rate": 4.779700376474749e-06,
+ "loss": 0.6184,
+ "step": 1795
+ },
+ {
+ "epoch": 0.8491725768321513,
+ "grad_norm": 2.757791757583618,
+ "learning_rate": 4.779444251626611e-06,
+ "loss": 0.608,
+ "step": 1796
+ },
+ {
+ "epoch": 0.849645390070922,
+ "grad_norm": 2.394108533859253,
+ "learning_rate": 4.779187984847475e-06,
+ "loss": 0.6174,
+ "step": 1797
+ },
+ {
+ "epoch": 0.8501182033096927,
+ "grad_norm": 2.427562713623047,
+ "learning_rate": 4.778931576153296e-06,
+ "loss": 0.5618,
+ "step": 1798
+ },
+ {
+ "epoch": 0.8505910165484634,
+ "grad_norm": 2.891268491744995,
+ "learning_rate": 4.778675025560042e-06,
+ "loss": 0.6865,
+ "step": 1799
+ },
+ {
+ "epoch": 0.851063829787234,
+ "grad_norm": 2.665534257888794,
+ "learning_rate": 4.778418333083685e-06,
+ "loss": 0.5852,
+ "step": 1800
+ },
+ {
+ "epoch": 0.8515366430260047,
+ "grad_norm": 2.5492889881134033,
+ "learning_rate": 4.7781614987402095e-06,
+ "loss": 0.5161,
+ "step": 1801
+ },
+ {
+ "epoch": 0.8520094562647754,
+ "grad_norm": 2.400177001953125,
+ "learning_rate": 4.777904522545607e-06,
+ "loss": 0.5128,
+ "step": 1802
+ },
+ {
+ "epoch": 0.8524822695035461,
+ "grad_norm": 2.3949809074401855,
+ "learning_rate": 4.777647404515878e-06,
+ "loss": 0.571,
+ "step": 1803
+ },
+ {
+ "epoch": 0.8529550827423168,
+ "grad_norm": 2.3624472618103027,
+ "learning_rate": 4.7773901446670325e-06,
+ "loss": 0.5486,
+ "step": 1804
+ },
+ {
+ "epoch": 0.8534278959810875,
+ "grad_norm": 2.711366891860962,
+ "learning_rate": 4.7771327430150885e-06,
+ "loss": 0.5667,
+ "step": 1805
+ },
+ {
+ "epoch": 0.8539007092198582,
+ "grad_norm": 2.7681493759155273,
+ "learning_rate": 4.776875199576073e-06,
+ "loss": 0.5686,
+ "step": 1806
+ },
+ {
+ "epoch": 0.8543735224586289,
+ "grad_norm": 3.0369436740875244,
+ "learning_rate": 4.776617514366023e-06,
+ "loss": 0.6635,
+ "step": 1807
+ },
+ {
+ "epoch": 0.8548463356973995,
+ "grad_norm": 2.919649600982666,
+ "learning_rate": 4.776359687400983e-06,
+ "loss": 0.5749,
+ "step": 1808
+ },
+ {
+ "epoch": 0.8553191489361702,
+ "grad_norm": 2.7986185550689697,
+ "learning_rate": 4.776101718697007e-06,
+ "loss": 0.559,
+ "step": 1809
+ },
+ {
+ "epoch": 0.8557919621749409,
+ "grad_norm": 2.5951223373413086,
+ "learning_rate": 4.775843608270158e-06,
+ "loss": 0.5654,
+ "step": 1810
+ },
+ {
+ "epoch": 0.8562647754137116,
+ "grad_norm": 2.674138069152832,
+ "learning_rate": 4.775585356136505e-06,
+ "loss": 0.5286,
+ "step": 1811
+ },
+ {
+ "epoch": 0.8567375886524823,
+ "grad_norm": 3.045437812805176,
+ "learning_rate": 4.775326962312131e-06,
+ "loss": 0.6185,
+ "step": 1812
+ },
+ {
+ "epoch": 0.857210401891253,
+ "grad_norm": 2.6145293712615967,
+ "learning_rate": 4.775068426813124e-06,
+ "loss": 0.6075,
+ "step": 1813
+ },
+ {
+ "epoch": 0.8576832151300237,
+ "grad_norm": 2.6320106983184814,
+ "learning_rate": 4.7748097496555824e-06,
+ "loss": 0.561,
+ "step": 1814
+ },
+ {
+ "epoch": 0.8581560283687943,
+ "grad_norm": 2.5038623809814453,
+ "learning_rate": 4.774550930855612e-06,
+ "loss": 0.593,
+ "step": 1815
+ },
+ {
+ "epoch": 0.858628841607565,
+ "grad_norm": 2.8168089389801025,
+ "learning_rate": 4.774291970429329e-06,
+ "loss": 0.5196,
+ "step": 1816
+ },
+ {
+ "epoch": 0.8591016548463357,
+ "grad_norm": 2.778130292892456,
+ "learning_rate": 4.774032868392858e-06,
+ "loss": 0.5984,
+ "step": 1817
+ },
+ {
+ "epoch": 0.8595744680851064,
+ "grad_norm": 2.536458730697632,
+ "learning_rate": 4.7737736247623305e-06,
+ "loss": 0.568,
+ "step": 1818
+ },
+ {
+ "epoch": 0.8600472813238771,
+ "grad_norm": 2.6669719219207764,
+ "learning_rate": 4.77351423955389e-06,
+ "loss": 0.6233,
+ "step": 1819
+ },
+ {
+ "epoch": 0.8605200945626478,
+ "grad_norm": 2.578242540359497,
+ "learning_rate": 4.773254712783687e-06,
+ "loss": 0.579,
+ "step": 1820
+ },
+ {
+ "epoch": 0.8609929078014185,
+ "grad_norm": 2.816664457321167,
+ "learning_rate": 4.772995044467881e-06,
+ "loss": 0.6635,
+ "step": 1821
+ },
+ {
+ "epoch": 0.8614657210401891,
+ "grad_norm": 3.1111979484558105,
+ "learning_rate": 4.77273523462264e-06,
+ "loss": 0.6372,
+ "step": 1822
+ },
+ {
+ "epoch": 0.8619385342789598,
+ "grad_norm": 2.764552354812622,
+ "learning_rate": 4.772475283264142e-06,
+ "loss": 0.6216,
+ "step": 1823
+ },
+ {
+ "epoch": 0.8624113475177305,
+ "grad_norm": 2.9126830101013184,
+ "learning_rate": 4.772215190408572e-06,
+ "loss": 0.6396,
+ "step": 1824
+ },
+ {
+ "epoch": 0.8628841607565012,
+ "grad_norm": 2.7502307891845703,
+ "learning_rate": 4.7719549560721264e-06,
+ "loss": 0.6186,
+ "step": 1825
+ },
+ {
+ "epoch": 0.8633569739952719,
+ "grad_norm": 2.6279006004333496,
+ "learning_rate": 4.771694580271007e-06,
+ "loss": 0.5557,
+ "step": 1826
+ },
+ {
+ "epoch": 0.8638297872340426,
+ "grad_norm": 2.996563196182251,
+ "learning_rate": 4.7714340630214276e-06,
+ "loss": 0.6259,
+ "step": 1827
+ },
+ {
+ "epoch": 0.8643026004728133,
+ "grad_norm": 3.231323480606079,
+ "learning_rate": 4.771173404339609e-06,
+ "loss": 0.5473,
+ "step": 1828
+ },
+ {
+ "epoch": 0.864775413711584,
+ "grad_norm": 3.143519878387451,
+ "learning_rate": 4.770912604241781e-06,
+ "loss": 0.593,
+ "step": 1829
+ },
+ {
+ "epoch": 0.8652482269503546,
+ "grad_norm": 2.515484094619751,
+ "learning_rate": 4.770651662744184e-06,
+ "loss": 0.538,
+ "step": 1830
+ },
+ {
+ "epoch": 0.8657210401891253,
+ "grad_norm": 2.629058837890625,
+ "learning_rate": 4.770390579863064e-06,
+ "loss": 0.5745,
+ "step": 1831
+ },
+ {
+ "epoch": 0.866193853427896,
+ "grad_norm": 2.5826802253723145,
+ "learning_rate": 4.770129355614677e-06,
+ "loss": 0.6397,
+ "step": 1832
+ },
+ {
+ "epoch": 0.8666666666666667,
+ "grad_norm": 2.954623222351074,
+ "learning_rate": 4.769867990015289e-06,
+ "loss": 0.6106,
+ "step": 1833
+ },
+ {
+ "epoch": 0.8671394799054374,
+ "grad_norm": 2.742192268371582,
+ "learning_rate": 4.769606483081175e-06,
+ "loss": 0.6902,
+ "step": 1834
+ },
+ {
+ "epoch": 0.8676122931442081,
+ "grad_norm": 2.2619097232818604,
+ "learning_rate": 4.769344834828618e-06,
+ "loss": 0.5414,
+ "step": 1835
+ },
+ {
+ "epoch": 0.8680851063829788,
+ "grad_norm": 2.7384188175201416,
+ "learning_rate": 4.769083045273908e-06,
+ "loss": 0.5787,
+ "step": 1836
+ },
+ {
+ "epoch": 0.8685579196217494,
+ "grad_norm": 2.6734485626220703,
+ "learning_rate": 4.768821114433346e-06,
+ "loss": 0.5923,
+ "step": 1837
+ },
+ {
+ "epoch": 0.8690307328605201,
+ "grad_norm": 2.286140203475952,
+ "learning_rate": 4.768559042323243e-06,
+ "loss": 0.5822,
+ "step": 1838
+ },
+ {
+ "epoch": 0.8695035460992908,
+ "grad_norm": 3.0243725776672363,
+ "learning_rate": 4.768296828959915e-06,
+ "loss": 0.6623,
+ "step": 1839
+ },
+ {
+ "epoch": 0.8699763593380615,
+ "grad_norm": 2.4026312828063965,
+ "learning_rate": 4.768034474359689e-06,
+ "loss": 0.5554,
+ "step": 1840
+ },
+ {
+ "epoch": 0.8704491725768322,
+ "grad_norm": 2.7469029426574707,
+ "learning_rate": 4.767771978538903e-06,
+ "loss": 0.6316,
+ "step": 1841
+ },
+ {
+ "epoch": 0.8709219858156029,
+ "grad_norm": 2.729659080505371,
+ "learning_rate": 4.767509341513899e-06,
+ "loss": 0.5807,
+ "step": 1842
+ },
+ {
+ "epoch": 0.8713947990543736,
+ "grad_norm": 2.5336945056915283,
+ "learning_rate": 4.76724656330103e-06,
+ "loss": 0.6109,
+ "step": 1843
+ },
+ {
+ "epoch": 0.8718676122931442,
+ "grad_norm": 2.519880533218384,
+ "learning_rate": 4.76698364391666e-06,
+ "loss": 0.5313,
+ "step": 1844
+ },
+ {
+ "epoch": 0.8723404255319149,
+ "grad_norm": 2.698862075805664,
+ "learning_rate": 4.766720583377159e-06,
+ "loss": 0.5953,
+ "step": 1845
+ },
+ {
+ "epoch": 0.8728132387706856,
+ "grad_norm": 3.0195560455322266,
+ "learning_rate": 4.766457381698907e-06,
+ "loss": 0.5965,
+ "step": 1846
+ },
+ {
+ "epoch": 0.8732860520094563,
+ "grad_norm": 2.5972697734832764,
+ "learning_rate": 4.766194038898291e-06,
+ "loss": 0.6014,
+ "step": 1847
+ },
+ {
+ "epoch": 0.873758865248227,
+ "grad_norm": 2.7132294178009033,
+ "learning_rate": 4.76593055499171e-06,
+ "loss": 0.5638,
+ "step": 1848
+ },
+ {
+ "epoch": 0.8742316784869977,
+ "grad_norm": 2.7134575843811035,
+ "learning_rate": 4.765666929995568e-06,
+ "loss": 0.52,
+ "step": 1849
+ },
+ {
+ "epoch": 0.8747044917257684,
+ "grad_norm": 2.3804993629455566,
+ "learning_rate": 4.765403163926282e-06,
+ "loss": 0.5435,
+ "step": 1850
+ },
+ {
+ "epoch": 0.875177304964539,
+ "grad_norm": 2.8782761096954346,
+ "learning_rate": 4.765139256800274e-06,
+ "loss": 0.5843,
+ "step": 1851
+ },
+ {
+ "epoch": 0.8756501182033097,
+ "grad_norm": 2.836209774017334,
+ "learning_rate": 4.764875208633977e-06,
+ "loss": 0.6667,
+ "step": 1852
+ },
+ {
+ "epoch": 0.8761229314420804,
+ "grad_norm": 2.608851194381714,
+ "learning_rate": 4.764611019443831e-06,
+ "loss": 0.5436,
+ "step": 1853
+ },
+ {
+ "epoch": 0.8765957446808511,
+ "grad_norm": 2.788738965988159,
+ "learning_rate": 4.764346689246288e-06,
+ "loss": 0.7331,
+ "step": 1854
+ },
+ {
+ "epoch": 0.8770685579196218,
+ "grad_norm": 2.524277687072754,
+ "learning_rate": 4.764082218057805e-06,
+ "loss": 0.5067,
+ "step": 1855
+ },
+ {
+ "epoch": 0.8775413711583925,
+ "grad_norm": 3.7559316158294678,
+ "learning_rate": 4.763817605894851e-06,
+ "loss": 0.6809,
+ "step": 1856
+ },
+ {
+ "epoch": 0.8780141843971632,
+ "grad_norm": 2.9070613384246826,
+ "learning_rate": 4.763552852773899e-06,
+ "loss": 0.5913,
+ "step": 1857
+ },
+ {
+ "epoch": 0.8784869976359339,
+ "grad_norm": 2.7050609588623047,
+ "learning_rate": 4.7632879587114386e-06,
+ "loss": 0.6074,
+ "step": 1858
+ },
+ {
+ "epoch": 0.8789598108747045,
+ "grad_norm": 2.891134262084961,
+ "learning_rate": 4.76302292372396e-06,
+ "loss": 0.5939,
+ "step": 1859
+ },
+ {
+ "epoch": 0.8794326241134752,
+ "grad_norm": 2.8581702709198,
+ "learning_rate": 4.762757747827968e-06,
+ "loss": 0.5972,
+ "step": 1860
+ },
+ {
+ "epoch": 0.8799054373522459,
+ "grad_norm": 2.8266196250915527,
+ "learning_rate": 4.762492431039971e-06,
+ "loss": 0.5993,
+ "step": 1861
+ },
+ {
+ "epoch": 0.8803782505910166,
+ "grad_norm": 2.4853954315185547,
+ "learning_rate": 4.762226973376493e-06,
+ "loss": 0.6388,
+ "step": 1862
+ },
+ {
+ "epoch": 0.8808510638297873,
+ "grad_norm": 3.2212886810302734,
+ "learning_rate": 4.761961374854059e-06,
+ "loss": 0.6698,
+ "step": 1863
+ },
+ {
+ "epoch": 0.881323877068558,
+ "grad_norm": 3.1254501342773438,
+ "learning_rate": 4.761695635489211e-06,
+ "loss": 0.5263,
+ "step": 1864
+ },
+ {
+ "epoch": 0.8817966903073287,
+ "grad_norm": 2.6891462802886963,
+ "learning_rate": 4.761429755298491e-06,
+ "loss": 0.5359,
+ "step": 1865
+ },
+ {
+ "epoch": 0.8822695035460993,
+ "grad_norm": 2.8557538986206055,
+ "learning_rate": 4.761163734298457e-06,
+ "loss": 0.5933,
+ "step": 1866
+ },
+ {
+ "epoch": 0.88274231678487,
+ "grad_norm": 2.53548264503479,
+ "learning_rate": 4.7608975725056724e-06,
+ "loss": 0.6397,
+ "step": 1867
+ },
+ {
+ "epoch": 0.8832151300236407,
+ "grad_norm": 3.0237956047058105,
+ "learning_rate": 4.76063126993671e-06,
+ "loss": 0.6845,
+ "step": 1868
+ },
+ {
+ "epoch": 0.8836879432624114,
+ "grad_norm": 3.222886800765991,
+ "learning_rate": 4.76036482660815e-06,
+ "loss": 0.6055,
+ "step": 1869
+ },
+ {
+ "epoch": 0.8841607565011821,
+ "grad_norm": 3.1867551803588867,
+ "learning_rate": 4.760098242536584e-06,
+ "loss": 0.6592,
+ "step": 1870
+ },
+ {
+ "epoch": 0.8846335697399527,
+ "grad_norm": 2.782209873199463,
+ "learning_rate": 4.7598315177386115e-06,
+ "loss": 0.5833,
+ "step": 1871
+ },
+ {
+ "epoch": 0.8851063829787233,
+ "grad_norm": 2.899871587753296,
+ "learning_rate": 4.759564652230838e-06,
+ "loss": 0.6129,
+ "step": 1872
+ },
+ {
+ "epoch": 0.885579196217494,
+ "grad_norm": 2.5690579414367676,
+ "learning_rate": 4.759297646029882e-06,
+ "loss": 0.5827,
+ "step": 1873
+ },
+ {
+ "epoch": 0.8860520094562647,
+ "grad_norm": 2.666130304336548,
+ "learning_rate": 4.759030499152368e-06,
+ "loss": 0.5272,
+ "step": 1874
+ },
+ {
+ "epoch": 0.8865248226950354,
+ "grad_norm": 2.7030911445617676,
+ "learning_rate": 4.758763211614932e-06,
+ "loss": 0.6415,
+ "step": 1875
+ },
+ {
+ "epoch": 0.8869976359338061,
+ "grad_norm": 2.717512845993042,
+ "learning_rate": 4.7584957834342135e-06,
+ "loss": 0.5827,
+ "step": 1876
+ },
+ {
+ "epoch": 0.8874704491725768,
+ "grad_norm": 2.665823459625244,
+ "learning_rate": 4.758228214626867e-06,
+ "loss": 0.6209,
+ "step": 1877
+ },
+ {
+ "epoch": 0.8879432624113475,
+ "grad_norm": 2.636653184890747,
+ "learning_rate": 4.75796050520955e-06,
+ "loss": 0.6413,
+ "step": 1878
+ },
+ {
+ "epoch": 0.8884160756501182,
+ "grad_norm": 2.585115671157837,
+ "learning_rate": 4.7576926551989345e-06,
+ "loss": 0.5518,
+ "step": 1879
+ },
+ {
+ "epoch": 0.8888888888888888,
+ "grad_norm": 2.808526039123535,
+ "learning_rate": 4.757424664611697e-06,
+ "loss": 0.5717,
+ "step": 1880
+ },
+ {
+ "epoch": 0.8893617021276595,
+ "grad_norm": 3.5957939624786377,
+ "learning_rate": 4.757156533464524e-06,
+ "loss": 0.6323,
+ "step": 1881
+ },
+ {
+ "epoch": 0.8898345153664302,
+ "grad_norm": 2.5003883838653564,
+ "learning_rate": 4.756888261774111e-06,
+ "loss": 0.5937,
+ "step": 1882
+ },
+ {
+ "epoch": 0.8903073286052009,
+ "grad_norm": 2.749061346054077,
+ "learning_rate": 4.756619849557161e-06,
+ "loss": 0.6642,
+ "step": 1883
+ },
+ {
+ "epoch": 0.8907801418439716,
+ "grad_norm": 2.6757891178131104,
+ "learning_rate": 4.756351296830389e-06,
+ "loss": 0.5887,
+ "step": 1884
+ },
+ {
+ "epoch": 0.8912529550827423,
+ "grad_norm": 2.811925172805786,
+ "learning_rate": 4.756082603610516e-06,
+ "loss": 0.6571,
+ "step": 1885
+ },
+ {
+ "epoch": 0.891725768321513,
+ "grad_norm": 2.5054616928100586,
+ "learning_rate": 4.755813769914271e-06,
+ "loss": 0.6312,
+ "step": 1886
+ },
+ {
+ "epoch": 0.8921985815602836,
+ "grad_norm": 2.7518467903137207,
+ "learning_rate": 4.755544795758395e-06,
+ "loss": 0.6685,
+ "step": 1887
+ },
+ {
+ "epoch": 0.8926713947990543,
+ "grad_norm": 2.7527287006378174,
+ "learning_rate": 4.755275681159634e-06,
+ "loss": 0.5886,
+ "step": 1888
+ },
+ {
+ "epoch": 0.893144208037825,
+ "grad_norm": 2.6162452697753906,
+ "learning_rate": 4.755006426134745e-06,
+ "loss": 0.546,
+ "step": 1889
+ },
+ {
+ "epoch": 0.8936170212765957,
+ "grad_norm": 2.4016737937927246,
+ "learning_rate": 4.754737030700495e-06,
+ "loss": 0.5726,
+ "step": 1890
+ },
+ {
+ "epoch": 0.8940898345153664,
+ "grad_norm": 2.528327703475952,
+ "learning_rate": 4.754467494873656e-06,
+ "loss": 0.5682,
+ "step": 1891
+ },
+ {
+ "epoch": 0.8945626477541371,
+ "grad_norm": 2.3139286041259766,
+ "learning_rate": 4.7541978186710115e-06,
+ "loss": 0.6108,
+ "step": 1892
+ },
+ {
+ "epoch": 0.8950354609929078,
+ "grad_norm": 2.7269136905670166,
+ "learning_rate": 4.753928002109354e-06,
+ "loss": 0.5875,
+ "step": 1893
+ },
+ {
+ "epoch": 0.8955082742316784,
+ "grad_norm": 4.425495147705078,
+ "learning_rate": 4.753658045205482e-06,
+ "loss": 0.5572,
+ "step": 1894
+ },
+ {
+ "epoch": 0.8959810874704491,
+ "grad_norm": 2.535409927368164,
+ "learning_rate": 4.753387947976206e-06,
+ "loss": 0.5868,
+ "step": 1895
+ },
+ {
+ "epoch": 0.8964539007092198,
+ "grad_norm": 2.722458600997925,
+ "learning_rate": 4.753117710438343e-06,
+ "loss": 0.5935,
+ "step": 1896
+ },
+ {
+ "epoch": 0.8969267139479905,
+ "grad_norm": 2.743861436843872,
+ "learning_rate": 4.75284733260872e-06,
+ "loss": 0.572,
+ "step": 1897
+ },
+ {
+ "epoch": 0.8973995271867612,
+ "grad_norm": 2.60640549659729,
+ "learning_rate": 4.752576814504173e-06,
+ "loss": 0.567,
+ "step": 1898
+ },
+ {
+ "epoch": 0.8978723404255319,
+ "grad_norm": 2.7486042976379395,
+ "learning_rate": 4.7523061561415435e-06,
+ "loss": 0.5768,
+ "step": 1899
+ },
+ {
+ "epoch": 0.8983451536643026,
+ "grad_norm": 3.8410251140594482,
+ "learning_rate": 4.752035357537686e-06,
+ "loss": 0.6034,
+ "step": 1900
+ },
+ {
+ "epoch": 0.8988179669030733,
+ "grad_norm": 3.0935890674591064,
+ "learning_rate": 4.751764418709462e-06,
+ "loss": 0.5644,
+ "step": 1901
+ },
+ {
+ "epoch": 0.8992907801418439,
+ "grad_norm": 2.7989892959594727,
+ "learning_rate": 4.751493339673742e-06,
+ "loss": 0.656,
+ "step": 1902
+ },
+ {
+ "epoch": 0.8997635933806146,
+ "grad_norm": 3.6940557956695557,
+ "learning_rate": 4.751222120447403e-06,
+ "loss": 0.6632,
+ "step": 1903
+ },
+ {
+ "epoch": 0.9002364066193853,
+ "grad_norm": 2.3428797721862793,
+ "learning_rate": 4.750950761047335e-06,
+ "loss": 0.4485,
+ "step": 1904
+ },
+ {
+ "epoch": 0.900709219858156,
+ "grad_norm": 2.622544050216675,
+ "learning_rate": 4.750679261490432e-06,
+ "loss": 0.5857,
+ "step": 1905
+ },
+ {
+ "epoch": 0.9011820330969267,
+ "grad_norm": 2.4911322593688965,
+ "learning_rate": 4.750407621793601e-06,
+ "loss": 0.5618,
+ "step": 1906
+ },
+ {
+ "epoch": 0.9016548463356974,
+ "grad_norm": 2.6434662342071533,
+ "learning_rate": 4.750135841973755e-06,
+ "loss": 0.6057,
+ "step": 1907
+ },
+ {
+ "epoch": 0.902127659574468,
+ "grad_norm": 3.115443706512451,
+ "learning_rate": 4.749863922047817e-06,
+ "loss": 0.6064,
+ "step": 1908
+ },
+ {
+ "epoch": 0.9026004728132387,
+ "grad_norm": 2.5671091079711914,
+ "learning_rate": 4.749591862032718e-06,
+ "loss": 0.5625,
+ "step": 1909
+ },
+ {
+ "epoch": 0.9030732860520094,
+ "grad_norm": 3.2008655071258545,
+ "learning_rate": 4.749319661945398e-06,
+ "loss": 0.5547,
+ "step": 1910
+ },
+ {
+ "epoch": 0.9035460992907801,
+ "grad_norm": 2.905987024307251,
+ "learning_rate": 4.749047321802805e-06,
+ "loss": 0.6033,
+ "step": 1911
+ },
+ {
+ "epoch": 0.9040189125295508,
+ "grad_norm": 3.1456053256988525,
+ "learning_rate": 4.748774841621897e-06,
+ "loss": 0.5651,
+ "step": 1912
+ },
+ {
+ "epoch": 0.9044917257683215,
+ "grad_norm": 2.8116416931152344,
+ "learning_rate": 4.748502221419641e-06,
+ "loss": 0.5853,
+ "step": 1913
+ },
+ {
+ "epoch": 0.9049645390070922,
+ "grad_norm": 3.123835325241089,
+ "learning_rate": 4.748229461213011e-06,
+ "loss": 0.5427,
+ "step": 1914
+ },
+ {
+ "epoch": 0.9054373522458629,
+ "grad_norm": 2.4750146865844727,
+ "learning_rate": 4.747956561018989e-06,
+ "loss": 0.6517,
+ "step": 1915
+ },
+ {
+ "epoch": 0.9059101654846335,
+ "grad_norm": 2.6174299716949463,
+ "learning_rate": 4.7476835208545705e-06,
+ "loss": 0.6119,
+ "step": 1916
+ },
+ {
+ "epoch": 0.9063829787234042,
+ "grad_norm": 2.7390382289886475,
+ "learning_rate": 4.747410340736755e-06,
+ "loss": 0.5664,
+ "step": 1917
+ },
+ {
+ "epoch": 0.9068557919621749,
+ "grad_norm": 2.7940444946289062,
+ "learning_rate": 4.747137020682552e-06,
+ "loss": 0.5628,
+ "step": 1918
+ },
+ {
+ "epoch": 0.9073286052009456,
+ "grad_norm": 2.477365016937256,
+ "learning_rate": 4.7468635607089795e-06,
+ "loss": 0.5261,
+ "step": 1919
+ },
+ {
+ "epoch": 0.9078014184397163,
+ "grad_norm": 2.7016685009002686,
+ "learning_rate": 4.746589960833066e-06,
+ "loss": 0.5576,
+ "step": 1920
+ },
+ {
+ "epoch": 0.908274231678487,
+ "grad_norm": 2.8806519508361816,
+ "learning_rate": 4.746316221071846e-06,
+ "loss": 0.5925,
+ "step": 1921
+ },
+ {
+ "epoch": 0.9087470449172577,
+ "grad_norm": 3.0315234661102295,
+ "learning_rate": 4.746042341442365e-06,
+ "loss": 0.6142,
+ "step": 1922
+ },
+ {
+ "epoch": 0.9092198581560283,
+ "grad_norm": 4.2446160316467285,
+ "learning_rate": 4.745768321961676e-06,
+ "loss": 0.5352,
+ "step": 1923
+ },
+ {
+ "epoch": 0.909692671394799,
+ "grad_norm": 2.6517012119293213,
+ "learning_rate": 4.745494162646841e-06,
+ "loss": 0.6118,
+ "step": 1924
+ },
+ {
+ "epoch": 0.9101654846335697,
+ "grad_norm": 2.774900197982788,
+ "learning_rate": 4.7452198635149304e-06,
+ "loss": 0.572,
+ "step": 1925
+ },
+ {
+ "epoch": 0.9106382978723404,
+ "grad_norm": 3.0133683681488037,
+ "learning_rate": 4.744945424583024e-06,
+ "loss": 0.5897,
+ "step": 1926
+ },
+ {
+ "epoch": 0.9111111111111111,
+ "grad_norm": 2.7344839572906494,
+ "learning_rate": 4.744670845868211e-06,
+ "loss": 0.6207,
+ "step": 1927
+ },
+ {
+ "epoch": 0.9115839243498818,
+ "grad_norm": 2.636578321456909,
+ "learning_rate": 4.744396127387586e-06,
+ "loss": 0.6687,
+ "step": 1928
+ },
+ {
+ "epoch": 0.9120567375886525,
+ "grad_norm": 2.8663458824157715,
+ "learning_rate": 4.744121269158255e-06,
+ "loss": 0.5002,
+ "step": 1929
+ },
+ {
+ "epoch": 0.9125295508274232,
+ "grad_norm": 2.661079168319702,
+ "learning_rate": 4.743846271197333e-06,
+ "loss": 0.5848,
+ "step": 1930
+ },
+ {
+ "epoch": 0.9130023640661938,
+ "grad_norm": 2.881256341934204,
+ "learning_rate": 4.743571133521943e-06,
+ "loss": 0.5911,
+ "step": 1931
+ },
+ {
+ "epoch": 0.9134751773049645,
+ "grad_norm": 2.5540573596954346,
+ "learning_rate": 4.743295856149217e-06,
+ "loss": 0.5647,
+ "step": 1932
+ },
+ {
+ "epoch": 0.9139479905437352,
+ "grad_norm": 2.7060387134552,
+ "learning_rate": 4.743020439096293e-06,
+ "loss": 0.6267,
+ "step": 1933
+ },
+ {
+ "epoch": 0.9144208037825059,
+ "grad_norm": 2.694481372833252,
+ "learning_rate": 4.742744882380323e-06,
+ "loss": 0.6283,
+ "step": 1934
+ },
+ {
+ "epoch": 0.9148936170212766,
+ "grad_norm": 2.711555242538452,
+ "learning_rate": 4.7424691860184625e-06,
+ "loss": 0.5784,
+ "step": 1935
+ },
+ {
+ "epoch": 0.9153664302600473,
+ "grad_norm": 2.9077224731445312,
+ "learning_rate": 4.742193350027879e-06,
+ "loss": 0.5948,
+ "step": 1936
+ },
+ {
+ "epoch": 0.915839243498818,
+ "grad_norm": 2.9824187755584717,
+ "learning_rate": 4.7419173744257476e-06,
+ "loss": 0.6115,
+ "step": 1937
+ },
+ {
+ "epoch": 0.9163120567375886,
+ "grad_norm": 2.5127830505371094,
+ "learning_rate": 4.7416412592292515e-06,
+ "loss": 0.5803,
+ "step": 1938
+ },
+ {
+ "epoch": 0.9167848699763593,
+ "grad_norm": 3.1307175159454346,
+ "learning_rate": 4.741365004455583e-06,
+ "loss": 0.5657,
+ "step": 1939
+ },
+ {
+ "epoch": 0.91725768321513,
+ "grad_norm": 2.8205273151397705,
+ "learning_rate": 4.741088610121944e-06,
+ "loss": 0.6145,
+ "step": 1940
+ },
+ {
+ "epoch": 0.9177304964539007,
+ "grad_norm": 2.6119720935821533,
+ "learning_rate": 4.7408120762455444e-06,
+ "loss": 0.6058,
+ "step": 1941
+ },
+ {
+ "epoch": 0.9182033096926714,
+ "grad_norm": 2.421276092529297,
+ "learning_rate": 4.7405354028436025e-06,
+ "loss": 0.5973,
+ "step": 1942
+ },
+ {
+ "epoch": 0.9186761229314421,
+ "grad_norm": 2.9846808910369873,
+ "learning_rate": 4.740258589933346e-06,
+ "loss": 0.6892,
+ "step": 1943
+ },
+ {
+ "epoch": 0.9191489361702128,
+ "grad_norm": 2.6899871826171875,
+ "learning_rate": 4.739981637532009e-06,
+ "loss": 0.5705,
+ "step": 1944
+ },
+ {
+ "epoch": 0.9196217494089834,
+ "grad_norm": 2.8636131286621094,
+ "learning_rate": 4.739704545656839e-06,
+ "loss": 0.5775,
+ "step": 1945
+ },
+ {
+ "epoch": 0.9200945626477541,
+ "grad_norm": 2.7659449577331543,
+ "learning_rate": 4.739427314325087e-06,
+ "loss": 0.5823,
+ "step": 1946
+ },
+ {
+ "epoch": 0.9205673758865248,
+ "grad_norm": 4.71295166015625,
+ "learning_rate": 4.739149943554016e-06,
+ "loss": 0.5601,
+ "step": 1947
+ },
+ {
+ "epoch": 0.9210401891252955,
+ "grad_norm": 2.642636775970459,
+ "learning_rate": 4.738872433360896e-06,
+ "loss": 0.5278,
+ "step": 1948
+ },
+ {
+ "epoch": 0.9215130023640662,
+ "grad_norm": 2.4658217430114746,
+ "learning_rate": 4.7385947837630065e-06,
+ "loss": 0.6392,
+ "step": 1949
+ },
+ {
+ "epoch": 0.9219858156028369,
+ "grad_norm": 2.851602792739868,
+ "learning_rate": 4.738316994777636e-06,
+ "loss": 0.6164,
+ "step": 1950
+ },
+ {
+ "epoch": 0.9224586288416076,
+ "grad_norm": 2.394226551055908,
+ "learning_rate": 4.738039066422081e-06,
+ "loss": 0.5556,
+ "step": 1951
+ },
+ {
+ "epoch": 0.9229314420803783,
+ "grad_norm": 2.7985100746154785,
+ "learning_rate": 4.737760998713647e-06,
+ "loss": 0.5799,
+ "step": 1952
+ },
+ {
+ "epoch": 0.9234042553191489,
+ "grad_norm": 2.5974674224853516,
+ "learning_rate": 4.737482791669648e-06,
+ "loss": 0.6984,
+ "step": 1953
+ },
+ {
+ "epoch": 0.9238770685579196,
+ "grad_norm": 2.707636594772339,
+ "learning_rate": 4.737204445307406e-06,
+ "loss": 0.5548,
+ "step": 1954
+ },
+ {
+ "epoch": 0.9243498817966903,
+ "grad_norm": 2.7882707118988037,
+ "learning_rate": 4.736925959644254e-06,
+ "loss": 0.6026,
+ "step": 1955
+ },
+ {
+ "epoch": 0.924822695035461,
+ "grad_norm": 2.474482774734497,
+ "learning_rate": 4.7366473346975304e-06,
+ "loss": 0.5832,
+ "step": 1956
+ },
+ {
+ "epoch": 0.9252955082742317,
+ "grad_norm": 2.6196324825286865,
+ "learning_rate": 4.736368570484585e-06,
+ "loss": 0.5861,
+ "step": 1957
+ },
+ {
+ "epoch": 0.9257683215130024,
+ "grad_norm": 2.826864004135132,
+ "learning_rate": 4.736089667022775e-06,
+ "loss": 0.6173,
+ "step": 1958
+ },
+ {
+ "epoch": 0.926241134751773,
+ "grad_norm": 2.414473056793213,
+ "learning_rate": 4.735810624329466e-06,
+ "loss": 0.5753,
+ "step": 1959
+ },
+ {
+ "epoch": 0.9267139479905437,
+ "grad_norm": 2.8037970066070557,
+ "learning_rate": 4.7355314424220335e-06,
+ "loss": 0.6207,
+ "step": 1960
+ },
+ {
+ "epoch": 0.9271867612293144,
+ "grad_norm": 2.645458698272705,
+ "learning_rate": 4.735252121317861e-06,
+ "loss": 0.5959,
+ "step": 1961
+ },
+ {
+ "epoch": 0.9276595744680851,
+ "grad_norm": 2.7983884811401367,
+ "learning_rate": 4.734972661034339e-06,
+ "loss": 0.5696,
+ "step": 1962
+ },
+ {
+ "epoch": 0.9281323877068558,
+ "grad_norm": 3.0568997859954834,
+ "learning_rate": 4.73469306158887e-06,
+ "loss": 0.6194,
+ "step": 1963
+ },
+ {
+ "epoch": 0.9286052009456265,
+ "grad_norm": 2.7205135822296143,
+ "learning_rate": 4.734413322998863e-06,
+ "loss": 0.5292,
+ "step": 1964
+ },
+ {
+ "epoch": 0.9290780141843972,
+ "grad_norm": 3.3168489933013916,
+ "learning_rate": 4.734133445281735e-06,
+ "loss": 0.5654,
+ "step": 1965
+ },
+ {
+ "epoch": 0.9295508274231679,
+ "grad_norm": 3.0095653533935547,
+ "learning_rate": 4.733853428454916e-06,
+ "loss": 0.6508,
+ "step": 1966
+ },
+ {
+ "epoch": 0.9300236406619385,
+ "grad_norm": 2.7726712226867676,
+ "learning_rate": 4.733573272535838e-06,
+ "loss": 0.644,
+ "step": 1967
+ },
+ {
+ "epoch": 0.9304964539007092,
+ "grad_norm": 2.474397659301758,
+ "learning_rate": 4.7332929775419456e-06,
+ "loss": 0.5479,
+ "step": 1968
+ },
+ {
+ "epoch": 0.9309692671394799,
+ "grad_norm": 2.4518635272979736,
+ "learning_rate": 4.733012543490693e-06,
+ "loss": 0.6,
+ "step": 1969
+ },
+ {
+ "epoch": 0.9314420803782506,
+ "grad_norm": 2.9292192459106445,
+ "learning_rate": 4.73273197039954e-06,
+ "loss": 0.6647,
+ "step": 1970
+ },
+ {
+ "epoch": 0.9319148936170213,
+ "grad_norm": 2.425004720687866,
+ "learning_rate": 4.732451258285958e-06,
+ "loss": 0.6338,
+ "step": 1971
+ },
+ {
+ "epoch": 0.932387706855792,
+ "grad_norm": 2.904479503631592,
+ "learning_rate": 4.7321704071674255e-06,
+ "loss": 0.5923,
+ "step": 1972
+ },
+ {
+ "epoch": 0.9328605200945627,
+ "grad_norm": 2.477085590362549,
+ "learning_rate": 4.731889417061428e-06,
+ "loss": 0.5984,
+ "step": 1973
+ },
+ {
+ "epoch": 0.9333333333333333,
+ "grad_norm": 2.585240364074707,
+ "learning_rate": 4.731608287985465e-06,
+ "loss": 0.558,
+ "step": 1974
+ },
+ {
+ "epoch": 0.933806146572104,
+ "grad_norm": 2.658714532852173,
+ "learning_rate": 4.731327019957039e-06,
+ "loss": 0.5567,
+ "step": 1975
+ },
+ {
+ "epoch": 0.9342789598108747,
+ "grad_norm": 2.7593026161193848,
+ "learning_rate": 4.731045612993662e-06,
+ "loss": 0.5772,
+ "step": 1976
+ },
+ {
+ "epoch": 0.9347517730496454,
+ "grad_norm": 2.4386026859283447,
+ "learning_rate": 4.7307640671128585e-06,
+ "loss": 0.6199,
+ "step": 1977
+ },
+ {
+ "epoch": 0.9352245862884161,
+ "grad_norm": 2.681910514831543,
+ "learning_rate": 4.730482382332158e-06,
+ "loss": 0.5971,
+ "step": 1978
+ },
+ {
+ "epoch": 0.9356973995271868,
+ "grad_norm": 3.7593860626220703,
+ "learning_rate": 4.7302005586691e-06,
+ "loss": 0.6346,
+ "step": 1979
+ },
+ {
+ "epoch": 0.9361702127659575,
+ "grad_norm": 2.5789096355438232,
+ "learning_rate": 4.729918596141232e-06,
+ "loss": 0.5684,
+ "step": 1980
+ },
+ {
+ "epoch": 0.9366430260047282,
+ "grad_norm": 3.0607335567474365,
+ "learning_rate": 4.729636494766111e-06,
+ "loss": 0.6223,
+ "step": 1981
+ },
+ {
+ "epoch": 0.9371158392434988,
+ "grad_norm": 2.906643867492676,
+ "learning_rate": 4.729354254561303e-06,
+ "loss": 0.6513,
+ "step": 1982
+ },
+ {
+ "epoch": 0.9375886524822695,
+ "grad_norm": 3.192430019378662,
+ "learning_rate": 4.7290718755443795e-06,
+ "loss": 0.5095,
+ "step": 1983
+ },
+ {
+ "epoch": 0.9380614657210402,
+ "grad_norm": 2.661536931991577,
+ "learning_rate": 4.7287893577329255e-06,
+ "loss": 0.5525,
+ "step": 1984
+ },
+ {
+ "epoch": 0.9385342789598109,
+ "grad_norm": 2.8436734676361084,
+ "learning_rate": 4.728506701144531e-06,
+ "loss": 0.6323,
+ "step": 1985
+ },
+ {
+ "epoch": 0.9390070921985816,
+ "grad_norm": 2.75544810295105,
+ "learning_rate": 4.728223905796796e-06,
+ "loss": 0.6018,
+ "step": 1986
+ },
+ {
+ "epoch": 0.9394799054373523,
+ "grad_norm": 3.0652759075164795,
+ "learning_rate": 4.727940971707329e-06,
+ "loss": 0.62,
+ "step": 1987
+ },
+ {
+ "epoch": 0.939952718676123,
+ "grad_norm": 2.802567720413208,
+ "learning_rate": 4.727657898893747e-06,
+ "loss": 0.5809,
+ "step": 1988
+ },
+ {
+ "epoch": 0.9404255319148936,
+ "grad_norm": 2.6208512783050537,
+ "learning_rate": 4.7273746873736745e-06,
+ "loss": 0.5762,
+ "step": 1989
+ },
+ {
+ "epoch": 0.9408983451536643,
+ "grad_norm": 2.5901873111724854,
+ "learning_rate": 4.727091337164748e-06,
+ "loss": 0.6111,
+ "step": 1990
+ },
+ {
+ "epoch": 0.941371158392435,
+ "grad_norm": 3.002347707748413,
+ "learning_rate": 4.726807848284609e-06,
+ "loss": 0.6419,
+ "step": 1991
+ },
+ {
+ "epoch": 0.9418439716312057,
+ "grad_norm": 2.522151470184326,
+ "learning_rate": 4.72652422075091e-06,
+ "loss": 0.642,
+ "step": 1992
+ },
+ {
+ "epoch": 0.9423167848699764,
+ "grad_norm": 2.5571532249450684,
+ "learning_rate": 4.726240454581311e-06,
+ "loss": 0.5729,
+ "step": 1993
+ },
+ {
+ "epoch": 0.9427895981087471,
+ "grad_norm": 2.7704918384552,
+ "learning_rate": 4.72595654979348e-06,
+ "loss": 0.6816,
+ "step": 1994
+ },
+ {
+ "epoch": 0.9432624113475178,
+ "grad_norm": 2.517040491104126,
+ "learning_rate": 4.7256725064050955e-06,
+ "loss": 0.5782,
+ "step": 1995
+ },
+ {
+ "epoch": 0.9437352245862884,
+ "grad_norm": 2.613955020904541,
+ "learning_rate": 4.725388324433843e-06,
+ "loss": 0.6291,
+ "step": 1996
+ },
+ {
+ "epoch": 0.9442080378250591,
+ "grad_norm": 2.848891258239746,
+ "learning_rate": 4.725104003897418e-06,
+ "loss": 0.6544,
+ "step": 1997
+ },
+ {
+ "epoch": 0.9446808510638298,
+ "grad_norm": 3.0162429809570312,
+ "learning_rate": 4.724819544813523e-06,
+ "loss": 0.6301,
+ "step": 1998
+ },
+ {
+ "epoch": 0.9451536643026005,
+ "grad_norm": 2.613614559173584,
+ "learning_rate": 4.72453494719987e-06,
+ "loss": 0.5829,
+ "step": 1999
+ },
+ {
+ "epoch": 0.9456264775413712,
+ "grad_norm": 2.4838767051696777,
+ "learning_rate": 4.724250211074182e-06,
+ "loss": 0.6042,
+ "step": 2000
+ },
+ {
+ "epoch": 0.9460992907801419,
+ "grad_norm": 2.526470899581909,
+ "learning_rate": 4.723965336454185e-06,
+ "loss": 0.6167,
+ "step": 2001
+ },
+ {
+ "epoch": 0.9465721040189126,
+ "grad_norm": 2.504506826400757,
+ "learning_rate": 4.723680323357618e-06,
+ "loss": 0.6061,
+ "step": 2002
+ },
+ {
+ "epoch": 0.9470449172576832,
+ "grad_norm": 3.0547544956207275,
+ "learning_rate": 4.723395171802228e-06,
+ "loss": 0.6619,
+ "step": 2003
+ },
+ {
+ "epoch": 0.9475177304964539,
+ "grad_norm": 2.8692407608032227,
+ "learning_rate": 4.723109881805771e-06,
+ "loss": 0.5985,
+ "step": 2004
+ },
+ {
+ "epoch": 0.9479905437352246,
+ "grad_norm": 2.7929654121398926,
+ "learning_rate": 4.7228244533860094e-06,
+ "loss": 0.5869,
+ "step": 2005
+ },
+ {
+ "epoch": 0.9484633569739953,
+ "grad_norm": 2.764869451522827,
+ "learning_rate": 4.7225388865607146e-06,
+ "loss": 0.6288,
+ "step": 2006
+ },
+ {
+ "epoch": 0.948936170212766,
+ "grad_norm": 2.7656404972076416,
+ "learning_rate": 4.722253181347671e-06,
+ "loss": 0.5831,
+ "step": 2007
+ },
+ {
+ "epoch": 0.9494089834515367,
+ "grad_norm": 2.6698336601257324,
+ "learning_rate": 4.7219673377646635e-06,
+ "loss": 0.6087,
+ "step": 2008
+ },
+ {
+ "epoch": 0.9498817966903074,
+ "grad_norm": 2.524935722351074,
+ "learning_rate": 4.7216813558294946e-06,
+ "loss": 0.5675,
+ "step": 2009
+ },
+ {
+ "epoch": 0.950354609929078,
+ "grad_norm": 2.5998785495758057,
+ "learning_rate": 4.721395235559969e-06,
+ "loss": 0.5667,
+ "step": 2010
+ },
+ {
+ "epoch": 0.9508274231678487,
+ "grad_norm": 2.758021354675293,
+ "learning_rate": 4.721108976973902e-06,
+ "loss": 0.4931,
+ "step": 2011
+ },
+ {
+ "epoch": 0.9513002364066194,
+ "grad_norm": 2.767695903778076,
+ "learning_rate": 4.72082258008912e-06,
+ "loss": 0.5778,
+ "step": 2012
+ },
+ {
+ "epoch": 0.9517730496453901,
+ "grad_norm": 2.982314348220825,
+ "learning_rate": 4.720536044923453e-06,
+ "loss": 0.6096,
+ "step": 2013
+ },
+ {
+ "epoch": 0.9522458628841608,
+ "grad_norm": 2.7608799934387207,
+ "learning_rate": 4.720249371494743e-06,
+ "loss": 0.6242,
+ "step": 2014
+ },
+ {
+ "epoch": 0.9527186761229315,
+ "grad_norm": 2.60054349899292,
+ "learning_rate": 4.71996255982084e-06,
+ "loss": 0.6249,
+ "step": 2015
+ },
+ {
+ "epoch": 0.9531914893617022,
+ "grad_norm": 2.654355764389038,
+ "learning_rate": 4.719675609919603e-06,
+ "loss": 0.6327,
+ "step": 2016
+ },
+ {
+ "epoch": 0.9536643026004729,
+ "grad_norm": 2.589404582977295,
+ "learning_rate": 4.719388521808899e-06,
+ "loss": 0.6357,
+ "step": 2017
+ },
+ {
+ "epoch": 0.9541371158392435,
+ "grad_norm": 2.8016581535339355,
+ "learning_rate": 4.719101295506603e-06,
+ "loss": 0.5901,
+ "step": 2018
+ },
+ {
+ "epoch": 0.9546099290780142,
+ "grad_norm": 3.1408045291900635,
+ "learning_rate": 4.7188139310306e-06,
+ "loss": 0.598,
+ "step": 2019
+ },
+ {
+ "epoch": 0.9550827423167849,
+ "grad_norm": 2.7432665824890137,
+ "learning_rate": 4.718526428398783e-06,
+ "loss": 0.5508,
+ "step": 2020
+ },
+ {
+ "epoch": 0.9555555555555556,
+ "grad_norm": 2.947800874710083,
+ "learning_rate": 4.718238787629053e-06,
+ "loss": 0.6439,
+ "step": 2021
+ },
+ {
+ "epoch": 0.9560283687943263,
+ "grad_norm": 2.50828218460083,
+ "learning_rate": 4.71795100873932e-06,
+ "loss": 0.5441,
+ "step": 2022
+ },
+ {
+ "epoch": 0.956501182033097,
+ "grad_norm": 2.8558974266052246,
+ "learning_rate": 4.717663091747503e-06,
+ "loss": 0.5416,
+ "step": 2023
+ },
+ {
+ "epoch": 0.9569739952718677,
+ "grad_norm": 2.4803316593170166,
+ "learning_rate": 4.71737503667153e-06,
+ "loss": 0.5317,
+ "step": 2024
+ },
+ {
+ "epoch": 0.9574468085106383,
+ "grad_norm": 4.36754035949707,
+ "learning_rate": 4.717086843529336e-06,
+ "loss": 0.5808,
+ "step": 2025
+ },
+ {
+ "epoch": 0.957919621749409,
+ "grad_norm": 2.730185031890869,
+ "learning_rate": 4.7167985123388665e-06,
+ "loss": 0.5257,
+ "step": 2026
+ },
+ {
+ "epoch": 0.9583924349881797,
+ "grad_norm": 2.8136069774627686,
+ "learning_rate": 4.716510043118074e-06,
+ "loss": 0.5836,
+ "step": 2027
+ },
+ {
+ "epoch": 0.9588652482269504,
+ "grad_norm": 2.793975353240967,
+ "learning_rate": 4.71622143588492e-06,
+ "loss": 0.5706,
+ "step": 2028
+ },
+ {
+ "epoch": 0.9593380614657211,
+ "grad_norm": 2.3883821964263916,
+ "learning_rate": 4.7159326906573745e-06,
+ "loss": 0.5291,
+ "step": 2029
+ },
+ {
+ "epoch": 0.9598108747044918,
+ "grad_norm": 2.6135976314544678,
+ "learning_rate": 4.715643807453417e-06,
+ "loss": 0.6199,
+ "step": 2030
+ },
+ {
+ "epoch": 0.9602836879432625,
+ "grad_norm": 2.6245670318603516,
+ "learning_rate": 4.715354786291035e-06,
+ "loss": 0.5585,
+ "step": 2031
+ },
+ {
+ "epoch": 0.9607565011820332,
+ "grad_norm": 2.7870967388153076,
+ "learning_rate": 4.715065627188225e-06,
+ "loss": 0.6196,
+ "step": 2032
+ },
+ {
+ "epoch": 0.9612293144208038,
+ "grad_norm": 2.6983911991119385,
+ "learning_rate": 4.714776330162991e-06,
+ "loss": 0.6424,
+ "step": 2033
+ },
+ {
+ "epoch": 0.9617021276595744,
+ "grad_norm": 2.3221919536590576,
+ "learning_rate": 4.7144868952333465e-06,
+ "loss": 0.568,
+ "step": 2034
+ },
+ {
+ "epoch": 0.9621749408983451,
+ "grad_norm": 2.9408178329467773,
+ "learning_rate": 4.714197322417314e-06,
+ "loss": 0.6175,
+ "step": 2035
+ },
+ {
+ "epoch": 0.9626477541371158,
+ "grad_norm": 2.404057264328003,
+ "learning_rate": 4.713907611732921e-06,
+ "loss": 0.4943,
+ "step": 2036
+ },
+ {
+ "epoch": 0.9631205673758865,
+ "grad_norm": 3.547607660293579,
+ "learning_rate": 4.71361776319821e-06,
+ "loss": 0.5488,
+ "step": 2037
+ },
+ {
+ "epoch": 0.9635933806146572,
+ "grad_norm": 2.679614543914795,
+ "learning_rate": 4.713327776831227e-06,
+ "loss": 0.6234,
+ "step": 2038
+ },
+ {
+ "epoch": 0.9640661938534278,
+ "grad_norm": 2.526914119720459,
+ "learning_rate": 4.7130376526500286e-06,
+ "loss": 0.5891,
+ "step": 2039
+ },
+ {
+ "epoch": 0.9645390070921985,
+ "grad_norm": 2.6953470706939697,
+ "learning_rate": 4.71274739067268e-06,
+ "loss": 0.69,
+ "step": 2040
+ },
+ {
+ "epoch": 0.9650118203309692,
+ "grad_norm": 2.546660900115967,
+ "learning_rate": 4.712456990917254e-06,
+ "loss": 0.6185,
+ "step": 2041
+ },
+ {
+ "epoch": 0.9654846335697399,
+ "grad_norm": 3.3920490741729736,
+ "learning_rate": 4.712166453401832e-06,
+ "loss": 0.587,
+ "step": 2042
+ },
+ {
+ "epoch": 0.9659574468085106,
+ "grad_norm": 2.5961573123931885,
+ "learning_rate": 4.711875778144504e-06,
+ "loss": 0.6105,
+ "step": 2043
+ },
+ {
+ "epoch": 0.9664302600472813,
+ "grad_norm": 2.5111498832702637,
+ "learning_rate": 4.711584965163372e-06,
+ "loss": 0.5533,
+ "step": 2044
+ },
+ {
+ "epoch": 0.966903073286052,
+ "grad_norm": 2.4878132343292236,
+ "learning_rate": 4.7112940144765405e-06,
+ "loss": 0.5604,
+ "step": 2045
+ },
+ {
+ "epoch": 0.9673758865248226,
+ "grad_norm": 2.5714077949523926,
+ "learning_rate": 4.711002926102128e-06,
+ "loss": 0.5794,
+ "step": 2046
+ },
+ {
+ "epoch": 0.9678486997635933,
+ "grad_norm": 2.7069091796875,
+ "learning_rate": 4.710711700058257e-06,
+ "loss": 0.594,
+ "step": 2047
+ },
+ {
+ "epoch": 0.968321513002364,
+ "grad_norm": 2.8104631900787354,
+ "learning_rate": 4.710420336363063e-06,
+ "loss": 0.6247,
+ "step": 2048
+ },
+ {
+ "epoch": 0.9687943262411347,
+ "grad_norm": 2.8464386463165283,
+ "learning_rate": 4.7101288350346865e-06,
+ "loss": 0.6162,
+ "step": 2049
+ },
+ {
+ "epoch": 0.9692671394799054,
+ "grad_norm": 2.7187976837158203,
+ "learning_rate": 4.709837196091279e-06,
+ "loss": 0.6109,
+ "step": 2050
+ },
+ {
+ "epoch": 0.9697399527186761,
+ "grad_norm": 2.556734085083008,
+ "learning_rate": 4.709545419550999e-06,
+ "loss": 0.6297,
+ "step": 2051
+ },
+ {
+ "epoch": 0.9702127659574468,
+ "grad_norm": 2.937195062637329,
+ "learning_rate": 4.709253505432014e-06,
+ "loss": 0.6862,
+ "step": 2052
+ },
+ {
+ "epoch": 0.9706855791962175,
+ "grad_norm": 2.792175531387329,
+ "learning_rate": 4.7089614537525015e-06,
+ "loss": 0.6105,
+ "step": 2053
+ },
+ {
+ "epoch": 0.9711583924349881,
+ "grad_norm": 2.625636100769043,
+ "learning_rate": 4.708669264530644e-06,
+ "loss": 0.5849,
+ "step": 2054
+ },
+ {
+ "epoch": 0.9716312056737588,
+ "grad_norm": 2.6752610206604004,
+ "learning_rate": 4.708376937784637e-06,
+ "loss": 0.5949,
+ "step": 2055
+ },
+ {
+ "epoch": 0.9721040189125295,
+ "grad_norm": 2.6072793006896973,
+ "learning_rate": 4.708084473532681e-06,
+ "loss": 0.5776,
+ "step": 2056
+ },
+ {
+ "epoch": 0.9725768321513002,
+ "grad_norm": 2.728632926940918,
+ "learning_rate": 4.707791871792988e-06,
+ "loss": 0.6352,
+ "step": 2057
+ },
+ {
+ "epoch": 0.9730496453900709,
+ "grad_norm": 2.5841758251190186,
+ "learning_rate": 4.707499132583775e-06,
+ "loss": 0.5488,
+ "step": 2058
+ },
+ {
+ "epoch": 0.9735224586288416,
+ "grad_norm": 2.8464293479919434,
+ "learning_rate": 4.707206255923271e-06,
+ "loss": 0.7051,
+ "step": 2059
+ },
+ {
+ "epoch": 0.9739952718676123,
+ "grad_norm": 2.547297239303589,
+ "learning_rate": 4.706913241829712e-06,
+ "loss": 0.5937,
+ "step": 2060
+ },
+ {
+ "epoch": 0.9744680851063829,
+ "grad_norm": 2.6572306156158447,
+ "learning_rate": 4.706620090321341e-06,
+ "loss": 0.6041,
+ "step": 2061
+ },
+ {
+ "epoch": 0.9749408983451536,
+ "grad_norm": 2.3262805938720703,
+ "learning_rate": 4.706326801416414e-06,
+ "loss": 0.5144,
+ "step": 2062
+ },
+ {
+ "epoch": 0.9754137115839243,
+ "grad_norm": 2.9693965911865234,
+ "learning_rate": 4.706033375133191e-06,
+ "loss": 0.551,
+ "step": 2063
+ },
+ {
+ "epoch": 0.975886524822695,
+ "grad_norm": 2.5993731021881104,
+ "learning_rate": 4.7057398114899435e-06,
+ "loss": 0.6143,
+ "step": 2064
+ },
+ {
+ "epoch": 0.9763593380614657,
+ "grad_norm": 2.453336477279663,
+ "learning_rate": 4.70544611050495e-06,
+ "loss": 0.6093,
+ "step": 2065
+ },
+ {
+ "epoch": 0.9768321513002364,
+ "grad_norm": 2.898629665374756,
+ "learning_rate": 4.705152272196497e-06,
+ "loss": 0.6007,
+ "step": 2066
+ },
+ {
+ "epoch": 0.9773049645390071,
+ "grad_norm": 2.7990612983703613,
+ "learning_rate": 4.7048582965828815e-06,
+ "loss": 0.6687,
+ "step": 2067
+ },
+ {
+ "epoch": 0.9777777777777777,
+ "grad_norm": 2.635284423828125,
+ "learning_rate": 4.704564183682408e-06,
+ "loss": 0.5564,
+ "step": 2068
+ },
+ {
+ "epoch": 0.9782505910165484,
+ "grad_norm": 3.014547109603882,
+ "learning_rate": 4.704269933513389e-06,
+ "loss": 0.6084,
+ "step": 2069
+ },
+ {
+ "epoch": 0.9787234042553191,
+ "grad_norm": 2.659357786178589,
+ "learning_rate": 4.703975546094147e-06,
+ "loss": 0.6031,
+ "step": 2070
+ },
+ {
+ "epoch": 0.9791962174940898,
+ "grad_norm": 2.326932668685913,
+ "learning_rate": 4.703681021443013e-06,
+ "loss": 0.5859,
+ "step": 2071
+ },
+ {
+ "epoch": 0.9796690307328605,
+ "grad_norm": 2.958803653717041,
+ "learning_rate": 4.7033863595783235e-06,
+ "loss": 0.5586,
+ "step": 2072
+ },
+ {
+ "epoch": 0.9801418439716312,
+ "grad_norm": 2.921386957168579,
+ "learning_rate": 4.703091560518427e-06,
+ "loss": 0.6126,
+ "step": 2073
+ },
+ {
+ "epoch": 0.9806146572104019,
+ "grad_norm": 2.6500775814056396,
+ "learning_rate": 4.702796624281679e-06,
+ "loss": 0.5678,
+ "step": 2074
+ },
+ {
+ "epoch": 0.9810874704491725,
+ "grad_norm": 2.7740228176116943,
+ "learning_rate": 4.702501550886445e-06,
+ "loss": 0.6067,
+ "step": 2075
+ },
+ {
+ "epoch": 0.9815602836879432,
+ "grad_norm": 2.3296213150024414,
+ "learning_rate": 4.702206340351096e-06,
+ "loss": 0.5247,
+ "step": 2076
+ },
+ {
+ "epoch": 0.9820330969267139,
+ "grad_norm": 2.748300790786743,
+ "learning_rate": 4.701910992694016e-06,
+ "loss": 0.5197,
+ "step": 2077
+ },
+ {
+ "epoch": 0.9825059101654846,
+ "grad_norm": 2.250985622406006,
+ "learning_rate": 4.7016155079335926e-06,
+ "loss": 0.5214,
+ "step": 2078
+ },
+ {
+ "epoch": 0.9829787234042553,
+ "grad_norm": 2.389845848083496,
+ "learning_rate": 4.701319886088226e-06,
+ "loss": 0.519,
+ "step": 2079
+ },
+ {
+ "epoch": 0.983451536643026,
+ "grad_norm": 2.818220853805542,
+ "learning_rate": 4.701024127176322e-06,
+ "loss": 0.607,
+ "step": 2080
+ },
+ {
+ "epoch": 0.9839243498817967,
+ "grad_norm": 3.4058034420013428,
+ "learning_rate": 4.700728231216297e-06,
+ "loss": 0.5711,
+ "step": 2081
+ },
+ {
+ "epoch": 0.9843971631205674,
+ "grad_norm": 2.5297787189483643,
+ "learning_rate": 4.700432198226575e-06,
+ "loss": 0.5979,
+ "step": 2082
+ },
+ {
+ "epoch": 0.984869976359338,
+ "grad_norm": 3.0548105239868164,
+ "learning_rate": 4.7001360282255885e-06,
+ "loss": 0.6041,
+ "step": 2083
+ },
+ {
+ "epoch": 0.9853427895981087,
+ "grad_norm": 2.8983733654022217,
+ "learning_rate": 4.699839721231779e-06,
+ "loss": 0.5926,
+ "step": 2084
+ },
+ {
+ "epoch": 0.9858156028368794,
+ "grad_norm": 3.2717764377593994,
+ "learning_rate": 4.699543277263596e-06,
+ "loss": 0.6477,
+ "step": 2085
+ },
+ {
+ "epoch": 0.9862884160756501,
+ "grad_norm": 3.03729248046875,
+ "learning_rate": 4.699246696339497e-06,
+ "loss": 0.6786,
+ "step": 2086
+ },
+ {
+ "epoch": 0.9867612293144208,
+ "grad_norm": 2.852301597595215,
+ "learning_rate": 4.698949978477951e-06,
+ "loss": 0.6565,
+ "step": 2087
+ },
+ {
+ "epoch": 0.9872340425531915,
+ "grad_norm": 2.843485116958618,
+ "learning_rate": 4.698653123697431e-06,
+ "loss": 0.6627,
+ "step": 2088
+ },
+ {
+ "epoch": 0.9877068557919622,
+ "grad_norm": 2.6315064430236816,
+ "learning_rate": 4.698356132016423e-06,
+ "loss": 0.6577,
+ "step": 2089
+ },
+ {
+ "epoch": 0.9881796690307328,
+ "grad_norm": 2.7482151985168457,
+ "learning_rate": 4.698059003453417e-06,
+ "loss": 0.5514,
+ "step": 2090
+ },
+ {
+ "epoch": 0.9886524822695035,
+ "grad_norm": 2.826673746109009,
+ "learning_rate": 4.6977617380269145e-06,
+ "loss": 0.565,
+ "step": 2091
+ },
+ {
+ "epoch": 0.9891252955082742,
+ "grad_norm": 3.0273752212524414,
+ "learning_rate": 4.697464335755427e-06,
+ "loss": 0.6331,
+ "step": 2092
+ },
+ {
+ "epoch": 0.9895981087470449,
+ "grad_norm": 2.7551653385162354,
+ "learning_rate": 4.6971667966574695e-06,
+ "loss": 0.6486,
+ "step": 2093
+ },
+ {
+ "epoch": 0.9900709219858156,
+ "grad_norm": 2.656299114227295,
+ "learning_rate": 4.696869120751571e-06,
+ "loss": 0.6562,
+ "step": 2094
+ },
+ {
+ "epoch": 0.9905437352245863,
+ "grad_norm": 2.785322904586792,
+ "learning_rate": 4.696571308056265e-06,
+ "loss": 0.5892,
+ "step": 2095
+ },
+ {
+ "epoch": 0.991016548463357,
+ "grad_norm": 2.9334635734558105,
+ "learning_rate": 4.696273358590095e-06,
+ "loss": 0.6346,
+ "step": 2096
+ },
+ {
+ "epoch": 0.9914893617021276,
+ "grad_norm": 2.7944300174713135,
+ "learning_rate": 4.695975272371613e-06,
+ "loss": 0.5859,
+ "step": 2097
+ },
+ {
+ "epoch": 0.9919621749408983,
+ "grad_norm": 2.5416972637176514,
+ "learning_rate": 4.695677049419381e-06,
+ "loss": 0.5658,
+ "step": 2098
+ },
+ {
+ "epoch": 0.992434988179669,
+ "grad_norm": 2.4056856632232666,
+ "learning_rate": 4.695378689751966e-06,
+ "loss": 0.5121,
+ "step": 2099
+ },
+ {
+ "epoch": 0.9929078014184397,
+ "grad_norm": 2.614548683166504,
+ "learning_rate": 4.695080193387948e-06,
+ "loss": 0.5961,
+ "step": 2100
+ },
+ {
+ "epoch": 0.9933806146572104,
+ "grad_norm": 2.8966517448425293,
+ "learning_rate": 4.69478156034591e-06,
+ "loss": 0.5985,
+ "step": 2101
+ },
+ {
+ "epoch": 0.9938534278959811,
+ "grad_norm": 2.9514098167419434,
+ "learning_rate": 4.694482790644448e-06,
+ "loss": 0.5677,
+ "step": 2102
+ },
+ {
+ "epoch": 0.9943262411347518,
+ "grad_norm": 2.4326791763305664,
+ "learning_rate": 4.694183884302165e-06,
+ "loss": 0.5698,
+ "step": 2103
+ },
+ {
+ "epoch": 0.9947990543735225,
+ "grad_norm": 2.9242892265319824,
+ "learning_rate": 4.6938848413376735e-06,
+ "loss": 0.6245,
+ "step": 2104
+ },
+ {
+ "epoch": 0.9952718676122931,
+ "grad_norm": 2.9134104251861572,
+ "learning_rate": 4.693585661769593e-06,
+ "loss": 0.6164,
+ "step": 2105
+ },
+ {
+ "epoch": 0.9957446808510638,
+ "grad_norm": 2.472564458847046,
+ "learning_rate": 4.693286345616551e-06,
+ "loss": 0.5616,
+ "step": 2106
+ },
+ {
+ "epoch": 0.9962174940898345,
+ "grad_norm": 3.2456448078155518,
+ "learning_rate": 4.692986892897186e-06,
+ "loss": 0.6977,
+ "step": 2107
+ },
+ {
+ "epoch": 0.9966903073286052,
+ "grad_norm": 3.4032769203186035,
+ "learning_rate": 4.692687303630143e-06,
+ "loss": 0.643,
+ "step": 2108
+ },
+ {
+ "epoch": 0.9971631205673759,
+ "grad_norm": 2.722200870513916,
+ "learning_rate": 4.692387577834076e-06,
+ "loss": 0.5873,
+ "step": 2109
+ },
+ {
+ "epoch": 0.9976359338061466,
+ "grad_norm": 2.687532663345337,
+ "learning_rate": 4.692087715527648e-06,
+ "loss": 0.5423,
+ "step": 2110
+ },
+ {
+ "epoch": 0.9981087470449173,
+ "grad_norm": 2.578613042831421,
+ "learning_rate": 4.6917877167295305e-06,
+ "loss": 0.5689,
+ "step": 2111
+ },
+ {
+ "epoch": 0.9985815602836879,
+ "grad_norm": 3.1806094646453857,
+ "learning_rate": 4.691487581458402e-06,
+ "loss": 0.6133,
+ "step": 2112
+ },
+ {
+ "epoch": 0.9990543735224586,
+ "grad_norm": 2.4449520111083984,
+ "learning_rate": 4.691187309732952e-06,
+ "loss": 0.5841,
+ "step": 2113
+ },
+ {
+ "epoch": 0.9995271867612293,
+ "grad_norm": 2.908749580383301,
+ "learning_rate": 4.690886901571875e-06,
+ "loss": 0.534,
+ "step": 2114
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 4.019968032836914,
+ "learning_rate": 4.6905863569938785e-06,
+ "loss": 0.596,
+ "step": 2115
+ },
+ {
+ "epoch": 1.0004728132387706,
+ "grad_norm": 2.4319307804107666,
+ "learning_rate": 4.690285676017675e-06,
+ "loss": 0.4973,
+ "step": 2116
+ },
+ {
+ "epoch": 1.0009456264775414,
+ "grad_norm": 2.6366477012634277,
+ "learning_rate": 4.689984858661986e-06,
+ "loss": 0.5682,
+ "step": 2117
+ },
+ {
+ "epoch": 1.001418439716312,
+ "grad_norm": 2.815114974975586,
+ "learning_rate": 4.689683904945542e-06,
+ "loss": 0.5616,
+ "step": 2118
+ },
+ {
+ "epoch": 1.0018912529550827,
+ "grad_norm": 2.6680490970611572,
+ "learning_rate": 4.689382814887084e-06,
+ "loss": 0.5161,
+ "step": 2119
+ },
+ {
+ "epoch": 1.0023640661938533,
+ "grad_norm": 2.7406351566314697,
+ "learning_rate": 4.689081588505358e-06,
+ "loss": 0.4937,
+ "step": 2120
+ },
+ {
+ "epoch": 1.0028368794326241,
+ "grad_norm": 2.2832298278808594,
+ "learning_rate": 4.68878022581912e-06,
+ "loss": 0.4986,
+ "step": 2121
+ },
+ {
+ "epoch": 1.0033096926713947,
+ "grad_norm": 2.5525307655334473,
+ "learning_rate": 4.688478726847136e-06,
+ "loss": 0.4909,
+ "step": 2122
+ },
+ {
+ "epoch": 1.0037825059101655,
+ "grad_norm": 2.9843199253082275,
+ "learning_rate": 4.688177091608176e-06,
+ "loss": 0.6046,
+ "step": 2123
+ },
+ {
+ "epoch": 1.004255319148936,
+ "grad_norm": 2.5231106281280518,
+ "learning_rate": 4.687875320121024e-06,
+ "loss": 0.5423,
+ "step": 2124
+ },
+ {
+ "epoch": 1.0047281323877069,
+ "grad_norm": 2.567599058151245,
+ "learning_rate": 4.68757341240447e-06,
+ "loss": 0.5092,
+ "step": 2125
+ },
+ {
+ "epoch": 1.0052009456264774,
+ "grad_norm": 2.768111228942871,
+ "learning_rate": 4.687271368477311e-06,
+ "loss": 0.5175,
+ "step": 2126
+ },
+ {
+ "epoch": 1.0056737588652482,
+ "grad_norm": 2.7223286628723145,
+ "learning_rate": 4.686969188358355e-06,
+ "loss": 0.5412,
+ "step": 2127
+ },
+ {
+ "epoch": 1.0061465721040188,
+ "grad_norm": 2.488299608230591,
+ "learning_rate": 4.686666872066418e-06,
+ "loss": 0.5288,
+ "step": 2128
+ },
+ {
+ "epoch": 1.0066193853427896,
+ "grad_norm": 2.882981777191162,
+ "learning_rate": 4.6863644196203215e-06,
+ "loss": 0.6117,
+ "step": 2129
+ },
+ {
+ "epoch": 1.0070921985815602,
+ "grad_norm": 3.0019447803497314,
+ "learning_rate": 4.686061831038901e-06,
+ "loss": 0.5308,
+ "step": 2130
+ },
+ {
+ "epoch": 1.007565011820331,
+ "grad_norm": 3.0056138038635254,
+ "learning_rate": 4.685759106340996e-06,
+ "loss": 0.5833,
+ "step": 2131
+ },
+ {
+ "epoch": 1.0080378250591016,
+ "grad_norm": 2.5709075927734375,
+ "learning_rate": 4.685456245545454e-06,
+ "loss": 0.5071,
+ "step": 2132
+ },
+ {
+ "epoch": 1.0085106382978724,
+ "grad_norm": 2.4641504287719727,
+ "learning_rate": 4.685153248671136e-06,
+ "loss": 0.4813,
+ "step": 2133
+ },
+ {
+ "epoch": 1.008983451536643,
+ "grad_norm": 2.374413013458252,
+ "learning_rate": 4.684850115736906e-06,
+ "loss": 0.5179,
+ "step": 2134
+ },
+ {
+ "epoch": 1.0094562647754137,
+ "grad_norm": 2.6504571437835693,
+ "learning_rate": 4.684546846761641e-06,
+ "loss": 0.437,
+ "step": 2135
+ },
+ {
+ "epoch": 1.0099290780141843,
+ "grad_norm": 2.5977871417999268,
+ "learning_rate": 4.684243441764221e-06,
+ "loss": 0.497,
+ "step": 2136
+ },
+ {
+ "epoch": 1.010401891252955,
+ "grad_norm": 2.4950785636901855,
+ "learning_rate": 4.683939900763541e-06,
+ "loss": 0.5624,
+ "step": 2137
+ },
+ {
+ "epoch": 1.0108747044917257,
+ "grad_norm": 3.065718412399292,
+ "learning_rate": 4.6836362237785e-06,
+ "loss": 0.512,
+ "step": 2138
+ },
+ {
+ "epoch": 1.0113475177304965,
+ "grad_norm": 2.7419207096099854,
+ "learning_rate": 4.6833324108280045e-06,
+ "loss": 0.5585,
+ "step": 2139
+ },
+ {
+ "epoch": 1.011820330969267,
+ "grad_norm": 2.623610496520996,
+ "learning_rate": 4.6830284619309744e-06,
+ "loss": 0.5163,
+ "step": 2140
+ },
+ {
+ "epoch": 1.0122931442080378,
+ "grad_norm": 2.774322986602783,
+ "learning_rate": 4.682724377106334e-06,
+ "loss": 0.527,
+ "step": 2141
+ },
+ {
+ "epoch": 1.0127659574468084,
+ "grad_norm": 2.959935188293457,
+ "learning_rate": 4.682420156373017e-06,
+ "loss": 0.6166,
+ "step": 2142
+ },
+ {
+ "epoch": 1.0132387706855792,
+ "grad_norm": 2.584026336669922,
+ "learning_rate": 4.682115799749968e-06,
+ "loss": 0.5086,
+ "step": 2143
+ },
+ {
+ "epoch": 1.0137115839243498,
+ "grad_norm": 2.6039700508117676,
+ "learning_rate": 4.6818113072561346e-06,
+ "loss": 0.49,
+ "step": 2144
+ },
+ {
+ "epoch": 1.0141843971631206,
+ "grad_norm": 2.466381072998047,
+ "learning_rate": 4.681506678910479e-06,
+ "loss": 0.4959,
+ "step": 2145
+ },
+ {
+ "epoch": 1.0146572104018912,
+ "grad_norm": 2.432636260986328,
+ "learning_rate": 4.681201914731969e-06,
+ "loss": 0.5057,
+ "step": 2146
+ },
+ {
+ "epoch": 1.015130023640662,
+ "grad_norm": 2.6134090423583984,
+ "learning_rate": 4.680897014739579e-06,
+ "loss": 0.4874,
+ "step": 2147
+ },
+ {
+ "epoch": 1.0156028368794325,
+ "grad_norm": 2.774481773376465,
+ "learning_rate": 4.680591978952295e-06,
+ "loss": 0.4967,
+ "step": 2148
+ },
+ {
+ "epoch": 1.0160756501182033,
+ "grad_norm": 2.66050124168396,
+ "learning_rate": 4.68028680738911e-06,
+ "loss": 0.4932,
+ "step": 2149
+ },
+ {
+ "epoch": 1.016548463356974,
+ "grad_norm": 3.020594835281372,
+ "learning_rate": 4.679981500069026e-06,
+ "loss": 0.5788,
+ "step": 2150
+ },
+ {
+ "epoch": 1.0170212765957447,
+ "grad_norm": 2.697758436203003,
+ "learning_rate": 4.679676057011053e-06,
+ "loss": 0.5441,
+ "step": 2151
+ },
+ {
+ "epoch": 1.0174940898345153,
+ "grad_norm": 6.986445903778076,
+ "learning_rate": 4.679370478234209e-06,
+ "loss": 0.6483,
+ "step": 2152
+ },
+ {
+ "epoch": 1.017966903073286,
+ "grad_norm": 2.6637115478515625,
+ "learning_rate": 4.679064763757522e-06,
+ "loss": 0.5859,
+ "step": 2153
+ },
+ {
+ "epoch": 1.0184397163120567,
+ "grad_norm": 2.7501862049102783,
+ "learning_rate": 4.678758913600027e-06,
+ "loss": 0.5745,
+ "step": 2154
+ },
+ {
+ "epoch": 1.0189125295508275,
+ "grad_norm": 2.7959372997283936,
+ "learning_rate": 4.678452927780768e-06,
+ "loss": 0.5076,
+ "step": 2155
+ },
+ {
+ "epoch": 1.019385342789598,
+ "grad_norm": 2.4377388954162598,
+ "learning_rate": 4.678146806318798e-06,
+ "loss": 0.5061,
+ "step": 2156
+ },
+ {
+ "epoch": 1.0198581560283688,
+ "grad_norm": 2.5478947162628174,
+ "learning_rate": 4.677840549233176e-06,
+ "loss": 0.4941,
+ "step": 2157
+ },
+ {
+ "epoch": 1.0203309692671394,
+ "grad_norm": 3.0956528186798096,
+ "learning_rate": 4.677534156542973e-06,
+ "loss": 0.5879,
+ "step": 2158
+ },
+ {
+ "epoch": 1.0208037825059102,
+ "grad_norm": 2.5247607231140137,
+ "learning_rate": 4.6772276282672666e-06,
+ "loss": 0.5532,
+ "step": 2159
+ },
+ {
+ "epoch": 1.0212765957446808,
+ "grad_norm": 3.1972787380218506,
+ "learning_rate": 4.676920964425143e-06,
+ "loss": 0.6081,
+ "step": 2160
+ },
+ {
+ "epoch": 1.0217494089834516,
+ "grad_norm": 2.6173388957977295,
+ "learning_rate": 4.6766141650356955e-06,
+ "loss": 0.5001,
+ "step": 2161
+ },
+ {
+ "epoch": 1.0222222222222221,
+ "grad_norm": 2.9914398193359375,
+ "learning_rate": 4.676307230118029e-06,
+ "loss": 0.5566,
+ "step": 2162
+ },
+ {
+ "epoch": 1.022695035460993,
+ "grad_norm": 2.8011834621429443,
+ "learning_rate": 4.676000159691254e-06,
+ "loss": 0.4909,
+ "step": 2163
+ },
+ {
+ "epoch": 1.0231678486997635,
+ "grad_norm": 2.6049559116363525,
+ "learning_rate": 4.67569295377449e-06,
+ "loss": 0.5018,
+ "step": 2164
+ },
+ {
+ "epoch": 1.0236406619385343,
+ "grad_norm": 2.8175013065338135,
+ "learning_rate": 4.675385612386866e-06,
+ "loss": 0.5309,
+ "step": 2165
+ },
+ {
+ "epoch": 1.0241134751773049,
+ "grad_norm": 2.854696750640869,
+ "learning_rate": 4.675078135547519e-06,
+ "loss": 0.5627,
+ "step": 2166
+ },
+ {
+ "epoch": 1.0245862884160757,
+ "grad_norm": 3.1856436729431152,
+ "learning_rate": 4.674770523275594e-06,
+ "loss": 0.5475,
+ "step": 2167
+ },
+ {
+ "epoch": 1.0250591016548463,
+ "grad_norm": 2.8289129734039307,
+ "learning_rate": 4.674462775590244e-06,
+ "loss": 0.5878,
+ "step": 2168
+ },
+ {
+ "epoch": 1.025531914893617,
+ "grad_norm": 2.8824517726898193,
+ "learning_rate": 4.6741548925106325e-06,
+ "loss": 0.4392,
+ "step": 2169
+ },
+ {
+ "epoch": 1.0260047281323876,
+ "grad_norm": 2.7044589519500732,
+ "learning_rate": 4.673846874055928e-06,
+ "loss": 0.5264,
+ "step": 2170
+ },
+ {
+ "epoch": 1.0264775413711584,
+ "grad_norm": 2.575035810470581,
+ "learning_rate": 4.673538720245312e-06,
+ "loss": 0.4615,
+ "step": 2171
+ },
+ {
+ "epoch": 1.026950354609929,
+ "grad_norm": 2.48168683052063,
+ "learning_rate": 4.67323043109797e-06,
+ "loss": 0.4404,
+ "step": 2172
+ },
+ {
+ "epoch": 1.0274231678486998,
+ "grad_norm": 2.926593065261841,
+ "learning_rate": 4.672922006633098e-06,
+ "loss": 0.54,
+ "step": 2173
+ },
+ {
+ "epoch": 1.0278959810874704,
+ "grad_norm": 2.4610698223114014,
+ "learning_rate": 4.672613446869901e-06,
+ "loss": 0.5555,
+ "step": 2174
+ },
+ {
+ "epoch": 1.0283687943262412,
+ "grad_norm": 3.026901960372925,
+ "learning_rate": 4.672304751827592e-06,
+ "loss": 0.62,
+ "step": 2175
+ },
+ {
+ "epoch": 1.0288416075650118,
+ "grad_norm": 2.3946213722229004,
+ "learning_rate": 4.671995921525391e-06,
+ "loss": 0.5228,
+ "step": 2176
+ },
+ {
+ "epoch": 1.0293144208037825,
+ "grad_norm": 2.985020399093628,
+ "learning_rate": 4.671686955982528e-06,
+ "loss": 0.6256,
+ "step": 2177
+ },
+ {
+ "epoch": 1.0297872340425531,
+ "grad_norm": 3.0910139083862305,
+ "learning_rate": 4.671377855218239e-06,
+ "loss": 0.5893,
+ "step": 2178
+ },
+ {
+ "epoch": 1.030260047281324,
+ "grad_norm": 2.507805109024048,
+ "learning_rate": 4.6710686192517744e-06,
+ "loss": 0.5329,
+ "step": 2179
+ },
+ {
+ "epoch": 1.0307328605200945,
+ "grad_norm": 2.4514641761779785,
+ "learning_rate": 4.670759248102386e-06,
+ "loss": 0.4585,
+ "step": 2180
+ },
+ {
+ "epoch": 1.0312056737588653,
+ "grad_norm": 2.742838144302368,
+ "learning_rate": 4.670449741789337e-06,
+ "loss": 0.6255,
+ "step": 2181
+ },
+ {
+ "epoch": 1.0316784869976359,
+ "grad_norm": 2.374349594116211,
+ "learning_rate": 4.670140100331901e-06,
+ "loss": 0.5049,
+ "step": 2182
+ },
+ {
+ "epoch": 1.0321513002364067,
+ "grad_norm": 2.78894305229187,
+ "learning_rate": 4.669830323749356e-06,
+ "loss": 0.6061,
+ "step": 2183
+ },
+ {
+ "epoch": 1.0326241134751772,
+ "grad_norm": 2.7195091247558594,
+ "learning_rate": 4.6695204120609905e-06,
+ "loss": 0.592,
+ "step": 2184
+ },
+ {
+ "epoch": 1.033096926713948,
+ "grad_norm": 2.824411630630493,
+ "learning_rate": 4.6692103652861035e-06,
+ "loss": 0.5666,
+ "step": 2185
+ },
+ {
+ "epoch": 1.0335697399527186,
+ "grad_norm": 2.4981014728546143,
+ "learning_rate": 4.6689001834439975e-06,
+ "loss": 0.5045,
+ "step": 2186
+ },
+ {
+ "epoch": 1.0340425531914894,
+ "grad_norm": 2.7375214099884033,
+ "learning_rate": 4.668589866553988e-06,
+ "loss": 0.5305,
+ "step": 2187
+ },
+ {
+ "epoch": 1.03451536643026,
+ "grad_norm": 2.625345468521118,
+ "learning_rate": 4.668279414635396e-06,
+ "loss": 0.4819,
+ "step": 2188
+ },
+ {
+ "epoch": 1.0349881796690308,
+ "grad_norm": 2.60479736328125,
+ "learning_rate": 4.667968827707553e-06,
+ "loss": 0.55,
+ "step": 2189
+ },
+ {
+ "epoch": 1.0354609929078014,
+ "grad_norm": 2.642014741897583,
+ "learning_rate": 4.667658105789797e-06,
+ "loss": 0.5264,
+ "step": 2190
+ },
+ {
+ "epoch": 1.0359338061465722,
+ "grad_norm": 2.5439083576202393,
+ "learning_rate": 4.667347248901476e-06,
+ "loss": 0.4657,
+ "step": 2191
+ },
+ {
+ "epoch": 1.0364066193853427,
+ "grad_norm": 2.5537586212158203,
+ "learning_rate": 4.667036257061945e-06,
+ "loss": 0.527,
+ "step": 2192
+ },
+ {
+ "epoch": 1.0368794326241135,
+ "grad_norm": 2.595466375350952,
+ "learning_rate": 4.666725130290569e-06,
+ "loss": 0.5336,
+ "step": 2193
+ },
+ {
+ "epoch": 1.037352245862884,
+ "grad_norm": 3.5106313228607178,
+ "learning_rate": 4.666413868606719e-06,
+ "loss": 0.5176,
+ "step": 2194
+ },
+ {
+ "epoch": 1.037825059101655,
+ "grad_norm": 2.931553363800049,
+ "learning_rate": 4.666102472029778e-06,
+ "loss": 0.549,
+ "step": 2195
+ },
+ {
+ "epoch": 1.0382978723404255,
+ "grad_norm": 2.4325125217437744,
+ "learning_rate": 4.665790940579133e-06,
+ "loss": 0.5095,
+ "step": 2196
+ },
+ {
+ "epoch": 1.0387706855791963,
+ "grad_norm": 2.708477258682251,
+ "learning_rate": 4.665479274274184e-06,
+ "loss": 0.5264,
+ "step": 2197
+ },
+ {
+ "epoch": 1.0392434988179668,
+ "grad_norm": 2.905977487564087,
+ "learning_rate": 4.665167473134335e-06,
+ "loss": 0.5575,
+ "step": 2198
+ },
+ {
+ "epoch": 1.0397163120567376,
+ "grad_norm": 2.428938865661621,
+ "learning_rate": 4.664855537179003e-06,
+ "loss": 0.5099,
+ "step": 2199
+ },
+ {
+ "epoch": 1.0401891252955082,
+ "grad_norm": 2.8432137966156006,
+ "learning_rate": 4.6645434664276075e-06,
+ "loss": 0.5331,
+ "step": 2200
+ },
+ {
+ "epoch": 1.040661938534279,
+ "grad_norm": 2.5185136795043945,
+ "learning_rate": 4.6642312608995825e-06,
+ "loss": 0.5217,
+ "step": 2201
+ },
+ {
+ "epoch": 1.0411347517730496,
+ "grad_norm": 2.556607723236084,
+ "learning_rate": 4.663918920614366e-06,
+ "loss": 0.4431,
+ "step": 2202
+ },
+ {
+ "epoch": 1.0416075650118204,
+ "grad_norm": 3.1271166801452637,
+ "learning_rate": 4.663606445591407e-06,
+ "loss": 0.5398,
+ "step": 2203
+ },
+ {
+ "epoch": 1.042080378250591,
+ "grad_norm": 2.573680877685547,
+ "learning_rate": 4.663293835850162e-06,
+ "loss": 0.4713,
+ "step": 2204
+ },
+ {
+ "epoch": 1.0425531914893618,
+ "grad_norm": 2.5230324268341064,
+ "learning_rate": 4.662981091410096e-06,
+ "loss": 0.5571,
+ "step": 2205
+ },
+ {
+ "epoch": 1.0430260047281323,
+ "grad_norm": 2.552182912826538,
+ "learning_rate": 4.662668212290681e-06,
+ "loss": 0.5173,
+ "step": 2206
+ },
+ {
+ "epoch": 1.0434988179669031,
+ "grad_norm": 2.832345724105835,
+ "learning_rate": 4.6623551985113995e-06,
+ "loss": 0.525,
+ "step": 2207
+ },
+ {
+ "epoch": 1.0439716312056737,
+ "grad_norm": 2.9729080200195312,
+ "learning_rate": 4.6620420500917416e-06,
+ "loss": 0.6308,
+ "step": 2208
+ },
+ {
+ "epoch": 1.0444444444444445,
+ "grad_norm": 2.618187665939331,
+ "learning_rate": 4.661728767051206e-06,
+ "loss": 0.4942,
+ "step": 2209
+ },
+ {
+ "epoch": 1.044917257683215,
+ "grad_norm": 2.515566349029541,
+ "learning_rate": 4.661415349409299e-06,
+ "loss": 0.5229,
+ "step": 2210
+ },
+ {
+ "epoch": 1.0453900709219859,
+ "grad_norm": 2.8651459217071533,
+ "learning_rate": 4.6611017971855356e-06,
+ "loss": 0.5029,
+ "step": 2211
+ },
+ {
+ "epoch": 1.0458628841607565,
+ "grad_norm": 2.502405881881714,
+ "learning_rate": 4.660788110399439e-06,
+ "loss": 0.4732,
+ "step": 2212
+ },
+ {
+ "epoch": 1.0463356973995273,
+ "grad_norm": 2.540668249130249,
+ "learning_rate": 4.660474289070541e-06,
+ "loss": 0.547,
+ "step": 2213
+ },
+ {
+ "epoch": 1.0468085106382978,
+ "grad_norm": 2.803469181060791,
+ "learning_rate": 4.660160333218384e-06,
+ "loss": 0.5441,
+ "step": 2214
+ },
+ {
+ "epoch": 1.0472813238770686,
+ "grad_norm": 3.233325481414795,
+ "learning_rate": 4.659846242862514e-06,
+ "loss": 0.4457,
+ "step": 2215
+ },
+ {
+ "epoch": 1.0477541371158392,
+ "grad_norm": 2.549548387527466,
+ "learning_rate": 4.659532018022489e-06,
+ "loss": 0.5684,
+ "step": 2216
+ },
+ {
+ "epoch": 1.04822695035461,
+ "grad_norm": 2.6112852096557617,
+ "learning_rate": 4.659217658717875e-06,
+ "loss": 0.5323,
+ "step": 2217
+ },
+ {
+ "epoch": 1.0486997635933806,
+ "grad_norm": 2.347418785095215,
+ "learning_rate": 4.658903164968245e-06,
+ "loss": 0.5349,
+ "step": 2218
+ },
+ {
+ "epoch": 1.0491725768321514,
+ "grad_norm": 2.695502281188965,
+ "learning_rate": 4.658588536793182e-06,
+ "loss": 0.4883,
+ "step": 2219
+ },
+ {
+ "epoch": 1.049645390070922,
+ "grad_norm": 2.7575674057006836,
+ "learning_rate": 4.658273774212275e-06,
+ "loss": 0.5517,
+ "step": 2220
+ },
+ {
+ "epoch": 1.0501182033096927,
+ "grad_norm": 2.787855386734009,
+ "learning_rate": 4.6579588772451245e-06,
+ "loss": 0.5744,
+ "step": 2221
+ },
+ {
+ "epoch": 1.0505910165484633,
+ "grad_norm": 3.0699398517608643,
+ "learning_rate": 4.657643845911337e-06,
+ "loss": 0.5258,
+ "step": 2222
+ },
+ {
+ "epoch": 1.0510638297872341,
+ "grad_norm": 2.652040719985962,
+ "learning_rate": 4.657328680230527e-06,
+ "loss": 0.5141,
+ "step": 2223
+ },
+ {
+ "epoch": 1.0515366430260047,
+ "grad_norm": 2.6896369457244873,
+ "learning_rate": 4.657013380222322e-06,
+ "loss": 0.5139,
+ "step": 2224
+ },
+ {
+ "epoch": 1.0520094562647755,
+ "grad_norm": 2.551839590072632,
+ "learning_rate": 4.65669794590635e-06,
+ "loss": 0.5099,
+ "step": 2225
+ },
+ {
+ "epoch": 1.052482269503546,
+ "grad_norm": 2.8543262481689453,
+ "learning_rate": 4.656382377302255e-06,
+ "loss": 0.6085,
+ "step": 2226
+ },
+ {
+ "epoch": 1.0529550827423169,
+ "grad_norm": 2.871469259262085,
+ "learning_rate": 4.656066674429685e-06,
+ "loss": 0.6108,
+ "step": 2227
+ },
+ {
+ "epoch": 1.0534278959810874,
+ "grad_norm": 2.4840824604034424,
+ "learning_rate": 4.655750837308296e-06,
+ "loss": 0.4994,
+ "step": 2228
+ },
+ {
+ "epoch": 1.0539007092198582,
+ "grad_norm": 2.5203280448913574,
+ "learning_rate": 4.6554348659577555e-06,
+ "loss": 0.4928,
+ "step": 2229
+ },
+ {
+ "epoch": 1.0543735224586288,
+ "grad_norm": 2.9327683448791504,
+ "learning_rate": 4.655118760397737e-06,
+ "loss": 0.6324,
+ "step": 2230
+ },
+ {
+ "epoch": 1.0548463356973996,
+ "grad_norm": 2.6766855716705322,
+ "learning_rate": 4.654802520647924e-06,
+ "loss": 0.5178,
+ "step": 2231
+ },
+ {
+ "epoch": 1.0553191489361702,
+ "grad_norm": 2.8438873291015625,
+ "learning_rate": 4.654486146728006e-06,
+ "loss": 0.509,
+ "step": 2232
+ },
+ {
+ "epoch": 1.055791962174941,
+ "grad_norm": 2.538661241531372,
+ "learning_rate": 4.6541696386576826e-06,
+ "loss": 0.5463,
+ "step": 2233
+ },
+ {
+ "epoch": 1.0562647754137116,
+ "grad_norm": 2.829030990600586,
+ "learning_rate": 4.653852996456662e-06,
+ "loss": 0.5404,
+ "step": 2234
+ },
+ {
+ "epoch": 1.0567375886524824,
+ "grad_norm": 2.5657269954681396,
+ "learning_rate": 4.653536220144659e-06,
+ "loss": 0.5479,
+ "step": 2235
+ },
+ {
+ "epoch": 1.057210401891253,
+ "grad_norm": 2.6641297340393066,
+ "learning_rate": 4.653219309741399e-06,
+ "loss": 0.5503,
+ "step": 2236
+ },
+ {
+ "epoch": 1.0576832151300237,
+ "grad_norm": 2.966350555419922,
+ "learning_rate": 4.652902265266615e-06,
+ "loss": 0.6404,
+ "step": 2237
+ },
+ {
+ "epoch": 1.0581560283687943,
+ "grad_norm": 2.462430000305176,
+ "learning_rate": 4.6525850867400455e-06,
+ "loss": 0.4885,
+ "step": 2238
+ },
+ {
+ "epoch": 1.058628841607565,
+ "grad_norm": 2.1791880130767822,
+ "learning_rate": 4.652267774181443e-06,
+ "loss": 0.4405,
+ "step": 2239
+ },
+ {
+ "epoch": 1.0591016548463357,
+ "grad_norm": 2.5473732948303223,
+ "learning_rate": 4.651950327610563e-06,
+ "loss": 0.5295,
+ "step": 2240
+ },
+ {
+ "epoch": 1.0595744680851065,
+ "grad_norm": 2.70904803276062,
+ "learning_rate": 4.651632747047172e-06,
+ "loss": 0.5169,
+ "step": 2241
+ },
+ {
+ "epoch": 1.060047281323877,
+ "grad_norm": 3.8442928791046143,
+ "learning_rate": 4.651315032511045e-06,
+ "loss": 0.5473,
+ "step": 2242
+ },
+ {
+ "epoch": 1.0605200945626478,
+ "grad_norm": 2.8613383769989014,
+ "learning_rate": 4.650997184021963e-06,
+ "loss": 0.5445,
+ "step": 2243
+ },
+ {
+ "epoch": 1.0609929078014184,
+ "grad_norm": 2.5995829105377197,
+ "learning_rate": 4.6506792015997184e-06,
+ "loss": 0.5525,
+ "step": 2244
+ },
+ {
+ "epoch": 1.0614657210401892,
+ "grad_norm": 2.5465996265411377,
+ "learning_rate": 4.650361085264111e-06,
+ "loss": 0.5093,
+ "step": 2245
+ },
+ {
+ "epoch": 1.0619385342789598,
+ "grad_norm": 2.46553111076355,
+ "learning_rate": 4.650042835034948e-06,
+ "loss": 0.5375,
+ "step": 2246
+ },
+ {
+ "epoch": 1.0624113475177306,
+ "grad_norm": 2.6907830238342285,
+ "learning_rate": 4.649724450932045e-06,
+ "loss": 0.572,
+ "step": 2247
+ },
+ {
+ "epoch": 1.0628841607565012,
+ "grad_norm": 3.0671346187591553,
+ "learning_rate": 4.649405932975226e-06,
+ "loss": 0.4974,
+ "step": 2248
+ },
+ {
+ "epoch": 1.063356973995272,
+ "grad_norm": 2.5392491817474365,
+ "learning_rate": 4.649087281184325e-06,
+ "loss": 0.524,
+ "step": 2249
+ },
+ {
+ "epoch": 1.0638297872340425,
+ "grad_norm": 2.7498562335968018,
+ "learning_rate": 4.648768495579183e-06,
+ "loss": 0.5801,
+ "step": 2250
+ },
+ {
+ "epoch": 1.0643026004728133,
+ "grad_norm": 2.8536248207092285,
+ "learning_rate": 4.648449576179649e-06,
+ "loss": 0.5384,
+ "step": 2251
+ },
+ {
+ "epoch": 1.064775413711584,
+ "grad_norm": 2.7062792778015137,
+ "learning_rate": 4.64813052300558e-06,
+ "loss": 0.5262,
+ "step": 2252
+ },
+ {
+ "epoch": 1.0652482269503547,
+ "grad_norm": 2.798650026321411,
+ "learning_rate": 4.647811336076841e-06,
+ "loss": 0.5719,
+ "step": 2253
+ },
+ {
+ "epoch": 1.0657210401891253,
+ "grad_norm": 2.9793951511383057,
+ "learning_rate": 4.647492015413311e-06,
+ "loss": 0.5377,
+ "step": 2254
+ },
+ {
+ "epoch": 1.066193853427896,
+ "grad_norm": 2.572129011154175,
+ "learning_rate": 4.647172561034868e-06,
+ "loss": 0.4791,
+ "step": 2255
+ },
+ {
+ "epoch": 1.0666666666666667,
+ "grad_norm": 3.7490930557250977,
+ "learning_rate": 4.646852972961405e-06,
+ "loss": 0.5423,
+ "step": 2256
+ },
+ {
+ "epoch": 1.0671394799054374,
+ "grad_norm": 2.626255750656128,
+ "learning_rate": 4.646533251212821e-06,
+ "loss": 0.5558,
+ "step": 2257
+ },
+ {
+ "epoch": 1.067612293144208,
+ "grad_norm": 2.8408126831054688,
+ "learning_rate": 4.646213395809023e-06,
+ "loss": 0.55,
+ "step": 2258
+ },
+ {
+ "epoch": 1.0680851063829788,
+ "grad_norm": 3.255606174468994,
+ "learning_rate": 4.645893406769929e-06,
+ "loss": 0.547,
+ "step": 2259
+ },
+ {
+ "epoch": 1.0685579196217494,
+ "grad_norm": 2.4352102279663086,
+ "learning_rate": 4.645573284115461e-06,
+ "loss": 0.4898,
+ "step": 2260
+ },
+ {
+ "epoch": 1.0690307328605202,
+ "grad_norm": 2.408634662628174,
+ "learning_rate": 4.6452530278655535e-06,
+ "loss": 0.5264,
+ "step": 2261
+ },
+ {
+ "epoch": 1.0695035460992908,
+ "grad_norm": 2.4220449924468994,
+ "learning_rate": 4.644932638040146e-06,
+ "loss": 0.5166,
+ "step": 2262
+ },
+ {
+ "epoch": 1.0699763593380616,
+ "grad_norm": 2.9188082218170166,
+ "learning_rate": 4.644612114659188e-06,
+ "loss": 0.5611,
+ "step": 2263
+ },
+ {
+ "epoch": 1.0704491725768321,
+ "grad_norm": 2.906557083129883,
+ "learning_rate": 4.644291457742638e-06,
+ "loss": 0.5515,
+ "step": 2264
+ },
+ {
+ "epoch": 1.070921985815603,
+ "grad_norm": 2.9039015769958496,
+ "learning_rate": 4.643970667310462e-06,
+ "loss": 0.5732,
+ "step": 2265
+ },
+ {
+ "epoch": 1.0713947990543735,
+ "grad_norm": 2.9985480308532715,
+ "learning_rate": 4.643649743382632e-06,
+ "loss": 0.563,
+ "step": 2266
+ },
+ {
+ "epoch": 1.0718676122931443,
+ "grad_norm": 2.5780906677246094,
+ "learning_rate": 4.6433286859791335e-06,
+ "loss": 0.502,
+ "step": 2267
+ },
+ {
+ "epoch": 1.0723404255319149,
+ "grad_norm": 2.590209722518921,
+ "learning_rate": 4.643007495119955e-06,
+ "loss": 0.4995,
+ "step": 2268
+ },
+ {
+ "epoch": 1.0728132387706855,
+ "grad_norm": 2.378894805908203,
+ "learning_rate": 4.642686170825097e-06,
+ "loss": 0.4886,
+ "step": 2269
+ },
+ {
+ "epoch": 1.0732860520094563,
+ "grad_norm": 2.6826229095458984,
+ "learning_rate": 4.642364713114567e-06,
+ "loss": 0.465,
+ "step": 2270
+ },
+ {
+ "epoch": 1.073758865248227,
+ "grad_norm": 2.627819538116455,
+ "learning_rate": 4.64204312200838e-06,
+ "loss": 0.4954,
+ "step": 2271
+ },
+ {
+ "epoch": 1.0742316784869976,
+ "grad_norm": 2.993021249771118,
+ "learning_rate": 4.641721397526561e-06,
+ "loss": 0.5073,
+ "step": 2272
+ },
+ {
+ "epoch": 1.0747044917257682,
+ "grad_norm": 2.719052791595459,
+ "learning_rate": 4.64139953968914e-06,
+ "loss": 0.538,
+ "step": 2273
+ },
+ {
+ "epoch": 1.075177304964539,
+ "grad_norm": 2.729252576828003,
+ "learning_rate": 4.6410775485161605e-06,
+ "loss": 0.552,
+ "step": 2274
+ },
+ {
+ "epoch": 1.0756501182033098,
+ "grad_norm": 2.924142599105835,
+ "learning_rate": 4.640755424027671e-06,
+ "loss": 0.522,
+ "step": 2275
+ },
+ {
+ "epoch": 1.0761229314420804,
+ "grad_norm": 3.329162120819092,
+ "learning_rate": 4.640433166243728e-06,
+ "loss": 0.5965,
+ "step": 2276
+ },
+ {
+ "epoch": 1.076595744680851,
+ "grad_norm": 2.9810245037078857,
+ "learning_rate": 4.640110775184396e-06,
+ "loss": 0.5653,
+ "step": 2277
+ },
+ {
+ "epoch": 1.0770685579196217,
+ "grad_norm": 2.61772084236145,
+ "learning_rate": 4.639788250869751e-06,
+ "loss": 0.5382,
+ "step": 2278
+ },
+ {
+ "epoch": 1.0775413711583925,
+ "grad_norm": 2.741225004196167,
+ "learning_rate": 4.639465593319874e-06,
+ "loss": 0.4866,
+ "step": 2279
+ },
+ {
+ "epoch": 1.0780141843971631,
+ "grad_norm": 2.7945218086242676,
+ "learning_rate": 4.639142802554856e-06,
+ "loss": 0.4711,
+ "step": 2280
+ },
+ {
+ "epoch": 1.0784869976359337,
+ "grad_norm": 2.4282329082489014,
+ "learning_rate": 4.638819878594795e-06,
+ "loss": 0.4911,
+ "step": 2281
+ },
+ {
+ "epoch": 1.0789598108747045,
+ "grad_norm": 2.551741361618042,
+ "learning_rate": 4.638496821459799e-06,
+ "loss": 0.453,
+ "step": 2282
+ },
+ {
+ "epoch": 1.0794326241134753,
+ "grad_norm": 2.5622754096984863,
+ "learning_rate": 4.638173631169983e-06,
+ "loss": 0.5983,
+ "step": 2283
+ },
+ {
+ "epoch": 1.0799054373522459,
+ "grad_norm": 2.7748284339904785,
+ "learning_rate": 4.6378503077454715e-06,
+ "loss": 0.5143,
+ "step": 2284
+ },
+ {
+ "epoch": 1.0803782505910164,
+ "grad_norm": 2.7693238258361816,
+ "learning_rate": 4.637526851206394e-06,
+ "loss": 0.5929,
+ "step": 2285
+ },
+ {
+ "epoch": 1.0808510638297872,
+ "grad_norm": 2.705548048019409,
+ "learning_rate": 4.637203261572893e-06,
+ "loss": 0.5577,
+ "step": 2286
+ },
+ {
+ "epoch": 1.081323877068558,
+ "grad_norm": 2.739307165145874,
+ "learning_rate": 4.636879538865117e-06,
+ "loss": 0.5676,
+ "step": 2287
+ },
+ {
+ "epoch": 1.0817966903073286,
+ "grad_norm": 2.514059543609619,
+ "learning_rate": 4.636555683103221e-06,
+ "loss": 0.5001,
+ "step": 2288
+ },
+ {
+ "epoch": 1.0822695035460992,
+ "grad_norm": 2.7166874408721924,
+ "learning_rate": 4.636231694307372e-06,
+ "loss": 0.5411,
+ "step": 2289
+ },
+ {
+ "epoch": 1.08274231678487,
+ "grad_norm": 2.7661683559417725,
+ "learning_rate": 4.635907572497741e-06,
+ "loss": 0.6353,
+ "step": 2290
+ },
+ {
+ "epoch": 1.0832151300236406,
+ "grad_norm": 2.598381996154785,
+ "learning_rate": 4.635583317694512e-06,
+ "loss": 0.5213,
+ "step": 2291
+ },
+ {
+ "epoch": 1.0836879432624114,
+ "grad_norm": 2.821491003036499,
+ "learning_rate": 4.6352589299178744e-06,
+ "loss": 0.6172,
+ "step": 2292
+ },
+ {
+ "epoch": 1.084160756501182,
+ "grad_norm": 2.5422823429107666,
+ "learning_rate": 4.634934409188025e-06,
+ "loss": 0.5245,
+ "step": 2293
+ },
+ {
+ "epoch": 1.0846335697399527,
+ "grad_norm": 2.8264620304107666,
+ "learning_rate": 4.634609755525173e-06,
+ "loss": 0.5004,
+ "step": 2294
+ },
+ {
+ "epoch": 1.0851063829787233,
+ "grad_norm": 2.3286643028259277,
+ "learning_rate": 4.63428496894953e-06,
+ "loss": 0.4561,
+ "step": 2295
+ },
+ {
+ "epoch": 1.085579196217494,
+ "grad_norm": 2.462005376815796,
+ "learning_rate": 4.633960049481321e-06,
+ "loss": 0.4948,
+ "step": 2296
+ },
+ {
+ "epoch": 1.0860520094562647,
+ "grad_norm": 2.760258913040161,
+ "learning_rate": 4.633634997140777e-06,
+ "loss": 0.5407,
+ "step": 2297
+ },
+ {
+ "epoch": 1.0865248226950355,
+ "grad_norm": 3.0234217643737793,
+ "learning_rate": 4.633309811948138e-06,
+ "loss": 0.4914,
+ "step": 2298
+ },
+ {
+ "epoch": 1.086997635933806,
+ "grad_norm": 2.8380849361419678,
+ "learning_rate": 4.63298449392365e-06,
+ "loss": 0.5562,
+ "step": 2299
+ },
+ {
+ "epoch": 1.0874704491725768,
+ "grad_norm": 2.6201648712158203,
+ "learning_rate": 4.632659043087572e-06,
+ "loss": 0.5882,
+ "step": 2300
+ },
+ {
+ "epoch": 1.0879432624113474,
+ "grad_norm": 2.586339235305786,
+ "learning_rate": 4.632333459460165e-06,
+ "loss": 0.4991,
+ "step": 2301
+ },
+ {
+ "epoch": 1.0884160756501182,
+ "grad_norm": 2.500115394592285,
+ "learning_rate": 4.632007743061705e-06,
+ "loss": 0.552,
+ "step": 2302
+ },
+ {
+ "epoch": 1.0888888888888888,
+ "grad_norm": 2.816390037536621,
+ "learning_rate": 4.63168189391247e-06,
+ "loss": 0.5301,
+ "step": 2303
+ },
+ {
+ "epoch": 1.0893617021276596,
+ "grad_norm": 2.975400924682617,
+ "learning_rate": 4.631355912032753e-06,
+ "loss": 0.6056,
+ "step": 2304
+ },
+ {
+ "epoch": 1.0898345153664302,
+ "grad_norm": 2.747985363006592,
+ "learning_rate": 4.631029797442846e-06,
+ "loss": 0.5335,
+ "step": 2305
+ },
+ {
+ "epoch": 1.090307328605201,
+ "grad_norm": 2.609281539916992,
+ "learning_rate": 4.630703550163059e-06,
+ "loss": 0.5189,
+ "step": 2306
+ },
+ {
+ "epoch": 1.0907801418439715,
+ "grad_norm": 2.624131202697754,
+ "learning_rate": 4.630377170213705e-06,
+ "loss": 0.5646,
+ "step": 2307
+ },
+ {
+ "epoch": 1.0912529550827423,
+ "grad_norm": 2.6186959743499756,
+ "learning_rate": 4.630050657615107e-06,
+ "loss": 0.5187,
+ "step": 2308
+ },
+ {
+ "epoch": 1.091725768321513,
+ "grad_norm": 2.9961764812469482,
+ "learning_rate": 4.629724012387594e-06,
+ "loss": 0.6207,
+ "step": 2309
+ },
+ {
+ "epoch": 1.0921985815602837,
+ "grad_norm": 2.665799140930176,
+ "learning_rate": 4.629397234551505e-06,
+ "loss": 0.5046,
+ "step": 2310
+ },
+ {
+ "epoch": 1.0926713947990543,
+ "grad_norm": 2.6154725551605225,
+ "learning_rate": 4.629070324127187e-06,
+ "loss": 0.5553,
+ "step": 2311
+ },
+ {
+ "epoch": 1.093144208037825,
+ "grad_norm": 2.702967643737793,
+ "learning_rate": 4.628743281134996e-06,
+ "loss": 0.5159,
+ "step": 2312
+ },
+ {
+ "epoch": 1.0936170212765957,
+ "grad_norm": 2.578080177307129,
+ "learning_rate": 4.628416105595295e-06,
+ "loss": 0.4934,
+ "step": 2313
+ },
+ {
+ "epoch": 1.0940898345153665,
+ "grad_norm": 2.8763060569763184,
+ "learning_rate": 4.628088797528456e-06,
+ "loss": 0.5404,
+ "step": 2314
+ },
+ {
+ "epoch": 1.094562647754137,
+ "grad_norm": 2.5301198959350586,
+ "learning_rate": 4.6277613569548585e-06,
+ "loss": 0.524,
+ "step": 2315
+ },
+ {
+ "epoch": 1.0950354609929078,
+ "grad_norm": 2.559903144836426,
+ "learning_rate": 4.627433783894892e-06,
+ "loss": 0.5177,
+ "step": 2316
+ },
+ {
+ "epoch": 1.0955082742316784,
+ "grad_norm": 2.430863380432129,
+ "learning_rate": 4.627106078368952e-06,
+ "loss": 0.5368,
+ "step": 2317
+ },
+ {
+ "epoch": 1.0959810874704492,
+ "grad_norm": 2.687567949295044,
+ "learning_rate": 4.626778240397444e-06,
+ "loss": 0.5385,
+ "step": 2318
+ },
+ {
+ "epoch": 1.0964539007092198,
+ "grad_norm": 3.053466558456421,
+ "learning_rate": 4.62645027000078e-06,
+ "loss": 0.5814,
+ "step": 2319
+ },
+ {
+ "epoch": 1.0969267139479906,
+ "grad_norm": 2.4612979888916016,
+ "learning_rate": 4.6261221671993815e-06,
+ "loss": 0.5069,
+ "step": 2320
+ },
+ {
+ "epoch": 1.0973995271867611,
+ "grad_norm": 2.6153628826141357,
+ "learning_rate": 4.625793932013679e-06,
+ "loss": 0.5422,
+ "step": 2321
+ },
+ {
+ "epoch": 1.097872340425532,
+ "grad_norm": 2.8918874263763428,
+ "learning_rate": 4.62546556446411e-06,
+ "loss": 0.5326,
+ "step": 2322
+ },
+ {
+ "epoch": 1.0983451536643025,
+ "grad_norm": 3.62565279006958,
+ "learning_rate": 4.625137064571119e-06,
+ "loss": 0.5164,
+ "step": 2323
+ },
+ {
+ "epoch": 1.0988179669030733,
+ "grad_norm": 2.4285085201263428,
+ "learning_rate": 4.624808432355164e-06,
+ "loss": 0.5084,
+ "step": 2324
+ },
+ {
+ "epoch": 1.099290780141844,
+ "grad_norm": 2.593979835510254,
+ "learning_rate": 4.624479667836702e-06,
+ "loss": 0.4986,
+ "step": 2325
+ },
+ {
+ "epoch": 1.0997635933806147,
+ "grad_norm": 2.490752935409546,
+ "learning_rate": 4.624150771036208e-06,
+ "loss": 0.5296,
+ "step": 2326
+ },
+ {
+ "epoch": 1.1002364066193853,
+ "grad_norm": 2.67694091796875,
+ "learning_rate": 4.6238217419741595e-06,
+ "loss": 0.5229,
+ "step": 2327
+ },
+ {
+ "epoch": 1.100709219858156,
+ "grad_norm": 2.594147205352783,
+ "learning_rate": 4.623492580671044e-06,
+ "loss": 0.4916,
+ "step": 2328
+ },
+ {
+ "epoch": 1.1011820330969266,
+ "grad_norm": 2.943472385406494,
+ "learning_rate": 4.623163287147356e-06,
+ "loss": 0.5591,
+ "step": 2329
+ },
+ {
+ "epoch": 1.1016548463356974,
+ "grad_norm": 2.569410562515259,
+ "learning_rate": 4.622833861423601e-06,
+ "loss": 0.4648,
+ "step": 2330
+ },
+ {
+ "epoch": 1.102127659574468,
+ "grad_norm": 2.5490405559539795,
+ "learning_rate": 4.6225043035202886e-06,
+ "loss": 0.5493,
+ "step": 2331
+ },
+ {
+ "epoch": 1.1026004728132388,
+ "grad_norm": 2.5964598655700684,
+ "learning_rate": 4.622174613457941e-06,
+ "loss": 0.5358,
+ "step": 2332
+ },
+ {
+ "epoch": 1.1030732860520094,
+ "grad_norm": 2.6456820964813232,
+ "learning_rate": 4.621844791257085e-06,
+ "loss": 0.5864,
+ "step": 2333
+ },
+ {
+ "epoch": 1.1035460992907802,
+ "grad_norm": 2.861180067062378,
+ "learning_rate": 4.621514836938259e-06,
+ "loss": 0.6064,
+ "step": 2334
+ },
+ {
+ "epoch": 1.1040189125295508,
+ "grad_norm": 2.8199548721313477,
+ "learning_rate": 4.621184750522005e-06,
+ "loss": 0.5244,
+ "step": 2335
+ },
+ {
+ "epoch": 1.1044917257683216,
+ "grad_norm": 2.7398853302001953,
+ "learning_rate": 4.6208545320288795e-06,
+ "loss": 0.5496,
+ "step": 2336
+ },
+ {
+ "epoch": 1.1049645390070921,
+ "grad_norm": 2.7941031455993652,
+ "learning_rate": 4.620524181479441e-06,
+ "loss": 0.5496,
+ "step": 2337
+ },
+ {
+ "epoch": 1.105437352245863,
+ "grad_norm": 2.973785161972046,
+ "learning_rate": 4.620193698894259e-06,
+ "loss": 0.5492,
+ "step": 2338
+ },
+ {
+ "epoch": 1.1059101654846335,
+ "grad_norm": 2.650355815887451,
+ "learning_rate": 4.6198630842939144e-06,
+ "loss": 0.5392,
+ "step": 2339
+ },
+ {
+ "epoch": 1.1063829787234043,
+ "grad_norm": 2.9092214107513428,
+ "learning_rate": 4.61953233769899e-06,
+ "loss": 0.5305,
+ "step": 2340
+ },
+ {
+ "epoch": 1.1068557919621749,
+ "grad_norm": 2.6329731941223145,
+ "learning_rate": 4.61920145913008e-06,
+ "loss": 0.5031,
+ "step": 2341
+ },
+ {
+ "epoch": 1.1073286052009457,
+ "grad_norm": 2.7214207649230957,
+ "learning_rate": 4.618870448607788e-06,
+ "loss": 0.5536,
+ "step": 2342
+ },
+ {
+ "epoch": 1.1078014184397162,
+ "grad_norm": 2.873119592666626,
+ "learning_rate": 4.618539306152724e-06,
+ "loss": 0.4531,
+ "step": 2343
+ },
+ {
+ "epoch": 1.108274231678487,
+ "grad_norm": 2.701042413711548,
+ "learning_rate": 4.618208031785507e-06,
+ "loss": 0.5217,
+ "step": 2344
+ },
+ {
+ "epoch": 1.1087470449172576,
+ "grad_norm": 2.7189881801605225,
+ "learning_rate": 4.6178766255267635e-06,
+ "loss": 0.6205,
+ "step": 2345
+ },
+ {
+ "epoch": 1.1092198581560284,
+ "grad_norm": 2.546382188796997,
+ "learning_rate": 4.61754508739713e-06,
+ "loss": 0.5475,
+ "step": 2346
+ },
+ {
+ "epoch": 1.109692671394799,
+ "grad_norm": 2.8429276943206787,
+ "learning_rate": 4.617213417417249e-06,
+ "loss": 0.4809,
+ "step": 2347
+ },
+ {
+ "epoch": 1.1101654846335698,
+ "grad_norm": 2.9515812397003174,
+ "learning_rate": 4.616881615607772e-06,
+ "loss": 0.5067,
+ "step": 2348
+ },
+ {
+ "epoch": 1.1106382978723404,
+ "grad_norm": 2.5910723209381104,
+ "learning_rate": 4.616549681989358e-06,
+ "loss": 0.5368,
+ "step": 2349
+ },
+ {
+ "epoch": 1.1111111111111112,
+ "grad_norm": 2.80855655670166,
+ "learning_rate": 4.616217616582678e-06,
+ "loss": 0.5827,
+ "step": 2350
+ },
+ {
+ "epoch": 1.1115839243498817,
+ "grad_norm": 2.604383945465088,
+ "learning_rate": 4.6158854194084044e-06,
+ "loss": 0.5716,
+ "step": 2351
+ },
+ {
+ "epoch": 1.1120567375886525,
+ "grad_norm": 3.0585904121398926,
+ "learning_rate": 4.6155530904872246e-06,
+ "loss": 0.4998,
+ "step": 2352
+ },
+ {
+ "epoch": 1.112529550827423,
+ "grad_norm": 2.660961627960205,
+ "learning_rate": 4.61522062983983e-06,
+ "loss": 0.4533,
+ "step": 2353
+ },
+ {
+ "epoch": 1.113002364066194,
+ "grad_norm": 2.8042070865631104,
+ "learning_rate": 4.614888037486923e-06,
+ "loss": 0.5592,
+ "step": 2354
+ },
+ {
+ "epoch": 1.1134751773049645,
+ "grad_norm": 2.681664228439331,
+ "learning_rate": 4.61455531344921e-06,
+ "loss": 0.5439,
+ "step": 2355
+ },
+ {
+ "epoch": 1.1139479905437353,
+ "grad_norm": 2.905054807662964,
+ "learning_rate": 4.61422245774741e-06,
+ "loss": 0.5497,
+ "step": 2356
+ },
+ {
+ "epoch": 1.1144208037825059,
+ "grad_norm": 2.7979753017425537,
+ "learning_rate": 4.6138894704022484e-06,
+ "loss": 0.5374,
+ "step": 2357
+ },
+ {
+ "epoch": 1.1148936170212767,
+ "grad_norm": 2.965611696243286,
+ "learning_rate": 4.613556351434458e-06,
+ "loss": 0.5145,
+ "step": 2358
+ },
+ {
+ "epoch": 1.1153664302600472,
+ "grad_norm": 2.583134889602661,
+ "learning_rate": 4.613223100864782e-06,
+ "loss": 0.535,
+ "step": 2359
+ },
+ {
+ "epoch": 1.115839243498818,
+ "grad_norm": 2.5979621410369873,
+ "learning_rate": 4.61288971871397e-06,
+ "loss": 0.5514,
+ "step": 2360
+ },
+ {
+ "epoch": 1.1163120567375886,
+ "grad_norm": 3.0117669105529785,
+ "learning_rate": 4.612556205002779e-06,
+ "loss": 0.5266,
+ "step": 2361
+ },
+ {
+ "epoch": 1.1167848699763594,
+ "grad_norm": 2.425133466720581,
+ "learning_rate": 4.612222559751976e-06,
+ "loss": 0.4838,
+ "step": 2362
+ },
+ {
+ "epoch": 1.11725768321513,
+ "grad_norm": 2.5102691650390625,
+ "learning_rate": 4.611888782982337e-06,
+ "loss": 0.3947,
+ "step": 2363
+ },
+ {
+ "epoch": 1.1177304964539008,
+ "grad_norm": 3.0327367782592773,
+ "learning_rate": 4.611554874714645e-06,
+ "loss": 0.5753,
+ "step": 2364
+ },
+ {
+ "epoch": 1.1182033096926713,
+ "grad_norm": 2.4561009407043457,
+ "learning_rate": 4.6112208349696875e-06,
+ "loss": 0.5054,
+ "step": 2365
+ },
+ {
+ "epoch": 1.1186761229314421,
+ "grad_norm": 3.3898050785064697,
+ "learning_rate": 4.610886663768267e-06,
+ "loss": 0.5946,
+ "step": 2366
+ },
+ {
+ "epoch": 1.1191489361702127,
+ "grad_norm": 2.8112242221832275,
+ "learning_rate": 4.61055236113119e-06,
+ "loss": 0.5475,
+ "step": 2367
+ },
+ {
+ "epoch": 1.1196217494089835,
+ "grad_norm": 3.152946710586548,
+ "learning_rate": 4.610217927079272e-06,
+ "loss": 0.5165,
+ "step": 2368
+ },
+ {
+ "epoch": 1.120094562647754,
+ "grad_norm": 2.7847867012023926,
+ "learning_rate": 4.609883361633336e-06,
+ "loss": 0.5533,
+ "step": 2369
+ },
+ {
+ "epoch": 1.1205673758865249,
+ "grad_norm": 2.6376686096191406,
+ "learning_rate": 4.6095486648142155e-06,
+ "loss": 0.4942,
+ "step": 2370
+ },
+ {
+ "epoch": 1.1210401891252955,
+ "grad_norm": 3.123072862625122,
+ "learning_rate": 4.609213836642749e-06,
+ "loss": 0.616,
+ "step": 2371
+ },
+ {
+ "epoch": 1.1215130023640663,
+ "grad_norm": 2.802694320678711,
+ "learning_rate": 4.608878877139786e-06,
+ "loss": 0.5323,
+ "step": 2372
+ },
+ {
+ "epoch": 1.1219858156028368,
+ "grad_norm": 2.3567938804626465,
+ "learning_rate": 4.6085437863261825e-06,
+ "loss": 0.4822,
+ "step": 2373
+ },
+ {
+ "epoch": 1.1224586288416076,
+ "grad_norm": 2.553112030029297,
+ "learning_rate": 4.608208564222804e-06,
+ "loss": 0.5447,
+ "step": 2374
+ },
+ {
+ "epoch": 1.1229314420803782,
+ "grad_norm": 3.0020132064819336,
+ "learning_rate": 4.607873210850521e-06,
+ "loss": 0.6486,
+ "step": 2375
+ },
+ {
+ "epoch": 1.123404255319149,
+ "grad_norm": 2.832442045211792,
+ "learning_rate": 4.607537726230216e-06,
+ "loss": 0.5257,
+ "step": 2376
+ },
+ {
+ "epoch": 1.1238770685579196,
+ "grad_norm": 2.471527099609375,
+ "learning_rate": 4.607202110382778e-06,
+ "loss": 0.4816,
+ "step": 2377
+ },
+ {
+ "epoch": 1.1243498817966904,
+ "grad_norm": 2.4232118129730225,
+ "learning_rate": 4.606866363329105e-06,
+ "loss": 0.5533,
+ "step": 2378
+ },
+ {
+ "epoch": 1.124822695035461,
+ "grad_norm": 2.477506637573242,
+ "learning_rate": 4.6065304850901025e-06,
+ "loss": 0.5223,
+ "step": 2379
+ },
+ {
+ "epoch": 1.1252955082742317,
+ "grad_norm": 3.54127836227417,
+ "learning_rate": 4.6061944756866824e-06,
+ "loss": 0.6514,
+ "step": 2380
+ },
+ {
+ "epoch": 1.1257683215130023,
+ "grad_norm": 2.5148677825927734,
+ "learning_rate": 4.605858335139768e-06,
+ "loss": 0.4864,
+ "step": 2381
+ },
+ {
+ "epoch": 1.1262411347517731,
+ "grad_norm": 2.8363659381866455,
+ "learning_rate": 4.605522063470289e-06,
+ "loss": 0.5034,
+ "step": 2382
+ },
+ {
+ "epoch": 1.1267139479905437,
+ "grad_norm": 2.4996654987335205,
+ "learning_rate": 4.605185660699184e-06,
+ "loss": 0.4126,
+ "step": 2383
+ },
+ {
+ "epoch": 1.1271867612293145,
+ "grad_norm": 2.352543830871582,
+ "learning_rate": 4.604849126847398e-06,
+ "loss": 0.5224,
+ "step": 2384
+ },
+ {
+ "epoch": 1.127659574468085,
+ "grad_norm": 2.60101056098938,
+ "learning_rate": 4.6045124619358875e-06,
+ "loss": 0.4867,
+ "step": 2385
+ },
+ {
+ "epoch": 1.1281323877068559,
+ "grad_norm": 2.9471068382263184,
+ "learning_rate": 4.604175665985613e-06,
+ "loss": 0.6474,
+ "step": 2386
+ },
+ {
+ "epoch": 1.1286052009456264,
+ "grad_norm": 2.5933351516723633,
+ "learning_rate": 4.603838739017546e-06,
+ "loss": 0.5081,
+ "step": 2387
+ },
+ {
+ "epoch": 1.1290780141843972,
+ "grad_norm": 2.3740346431732178,
+ "learning_rate": 4.6035016810526665e-06,
+ "loss": 0.4438,
+ "step": 2388
+ },
+ {
+ "epoch": 1.1295508274231678,
+ "grad_norm": 2.675020217895508,
+ "learning_rate": 4.6031644921119614e-06,
+ "loss": 0.4968,
+ "step": 2389
+ },
+ {
+ "epoch": 1.1300236406619386,
+ "grad_norm": 2.599472999572754,
+ "learning_rate": 4.602827172216424e-06,
+ "loss": 0.5131,
+ "step": 2390
+ },
+ {
+ "epoch": 1.1304964539007092,
+ "grad_norm": 2.8176097869873047,
+ "learning_rate": 4.602489721387061e-06,
+ "loss": 0.5549,
+ "step": 2391
+ },
+ {
+ "epoch": 1.13096926713948,
+ "grad_norm": 2.466914176940918,
+ "learning_rate": 4.602152139644881e-06,
+ "loss": 0.5052,
+ "step": 2392
+ },
+ {
+ "epoch": 1.1314420803782506,
+ "grad_norm": 2.8938796520233154,
+ "learning_rate": 4.601814427010905e-06,
+ "loss": 0.6181,
+ "step": 2393
+ },
+ {
+ "epoch": 1.1319148936170214,
+ "grad_norm": 2.7390825748443604,
+ "learning_rate": 4.601476583506161e-06,
+ "loss": 0.5178,
+ "step": 2394
+ },
+ {
+ "epoch": 1.132387706855792,
+ "grad_norm": 3.180112838745117,
+ "learning_rate": 4.601138609151685e-06,
+ "loss": 0.6071,
+ "step": 2395
+ },
+ {
+ "epoch": 1.1328605200945627,
+ "grad_norm": 2.9282350540161133,
+ "learning_rate": 4.600800503968521e-06,
+ "loss": 0.5557,
+ "step": 2396
+ },
+ {
+ "epoch": 1.1333333333333333,
+ "grad_norm": 2.6689717769622803,
+ "learning_rate": 4.6004622679777215e-06,
+ "loss": 0.4679,
+ "step": 2397
+ },
+ {
+ "epoch": 1.133806146572104,
+ "grad_norm": 2.651582956314087,
+ "learning_rate": 4.600123901200347e-06,
+ "loss": 0.4907,
+ "step": 2398
+ },
+ {
+ "epoch": 1.1342789598108747,
+ "grad_norm": 2.5702924728393555,
+ "learning_rate": 4.599785403657464e-06,
+ "loss": 0.4919,
+ "step": 2399
+ },
+ {
+ "epoch": 1.1347517730496455,
+ "grad_norm": 2.636812448501587,
+ "learning_rate": 4.599446775370153e-06,
+ "loss": 0.5091,
+ "step": 2400
+ },
+ {
+ "epoch": 1.135224586288416,
+ "grad_norm": 2.5965442657470703,
+ "learning_rate": 4.599108016359497e-06,
+ "loss": 0.5035,
+ "step": 2401
+ },
+ {
+ "epoch": 1.1356973995271868,
+ "grad_norm": 2.689732313156128,
+ "learning_rate": 4.5987691266465885e-06,
+ "loss": 0.5307,
+ "step": 2402
+ },
+ {
+ "epoch": 1.1361702127659574,
+ "grad_norm": 2.7256956100463867,
+ "learning_rate": 4.59843010625253e-06,
+ "loss": 0.5066,
+ "step": 2403
+ },
+ {
+ "epoch": 1.1366430260047282,
+ "grad_norm": 2.726020574569702,
+ "learning_rate": 4.59809095519843e-06,
+ "loss": 0.4805,
+ "step": 2404
+ },
+ {
+ "epoch": 1.1371158392434988,
+ "grad_norm": 2.703339099884033,
+ "learning_rate": 4.597751673505406e-06,
+ "loss": 0.4992,
+ "step": 2405
+ },
+ {
+ "epoch": 1.1375886524822696,
+ "grad_norm": 2.54455304145813,
+ "learning_rate": 4.5974122611945835e-06,
+ "loss": 0.5251,
+ "step": 2406
+ },
+ {
+ "epoch": 1.1380614657210402,
+ "grad_norm": 2.623507022857666,
+ "learning_rate": 4.597072718287096e-06,
+ "loss": 0.4831,
+ "step": 2407
+ },
+ {
+ "epoch": 1.138534278959811,
+ "grad_norm": 2.653590202331543,
+ "learning_rate": 4.596733044804086e-06,
+ "loss": 0.5646,
+ "step": 2408
+ },
+ {
+ "epoch": 1.1390070921985815,
+ "grad_norm": 2.8230600357055664,
+ "learning_rate": 4.5963932407667035e-06,
+ "loss": 0.514,
+ "step": 2409
+ },
+ {
+ "epoch": 1.1394799054373523,
+ "grad_norm": 2.6077451705932617,
+ "learning_rate": 4.5960533061961065e-06,
+ "loss": 0.4713,
+ "step": 2410
+ },
+ {
+ "epoch": 1.139952718676123,
+ "grad_norm": 2.3945798873901367,
+ "learning_rate": 4.595713241113461e-06,
+ "loss": 0.466,
+ "step": 2411
+ },
+ {
+ "epoch": 1.1404255319148937,
+ "grad_norm": 2.8100006580352783,
+ "learning_rate": 4.595373045539941e-06,
+ "loss": 0.5365,
+ "step": 2412
+ },
+ {
+ "epoch": 1.1408983451536643,
+ "grad_norm": 2.6825881004333496,
+ "learning_rate": 4.59503271949673e-06,
+ "loss": 0.4457,
+ "step": 2413
+ },
+ {
+ "epoch": 1.141371158392435,
+ "grad_norm": 2.969435691833496,
+ "learning_rate": 4.594692263005016e-06,
+ "loss": 0.5459,
+ "step": 2414
+ },
+ {
+ "epoch": 1.1418439716312057,
+ "grad_norm": 2.4103164672851562,
+ "learning_rate": 4.594351676086002e-06,
+ "loss": 0.4573,
+ "step": 2415
+ },
+ {
+ "epoch": 1.1423167848699765,
+ "grad_norm": 2.9450128078460693,
+ "learning_rate": 4.594010958760892e-06,
+ "loss": 0.5529,
+ "step": 2416
+ },
+ {
+ "epoch": 1.142789598108747,
+ "grad_norm": 2.6416335105895996,
+ "learning_rate": 4.593670111050901e-06,
+ "loss": 0.5153,
+ "step": 2417
+ },
+ {
+ "epoch": 1.1432624113475178,
+ "grad_norm": 2.473177194595337,
+ "learning_rate": 4.593329132977253e-06,
+ "loss": 0.4962,
+ "step": 2418
+ },
+ {
+ "epoch": 1.1437352245862884,
+ "grad_norm": 2.4494502544403076,
+ "learning_rate": 4.592988024561179e-06,
+ "loss": 0.5182,
+ "step": 2419
+ },
+ {
+ "epoch": 1.1442080378250592,
+ "grad_norm": 2.773930311203003,
+ "learning_rate": 4.592646785823918e-06,
+ "loss": 0.4442,
+ "step": 2420
+ },
+ {
+ "epoch": 1.1446808510638298,
+ "grad_norm": 2.4733314514160156,
+ "learning_rate": 4.592305416786718e-06,
+ "loss": 0.5106,
+ "step": 2421
+ },
+ {
+ "epoch": 1.1451536643026006,
+ "grad_norm": 2.6870038509368896,
+ "learning_rate": 4.591963917470834e-06,
+ "loss": 0.5316,
+ "step": 2422
+ },
+ {
+ "epoch": 1.1456264775413711,
+ "grad_norm": 2.8989531993865967,
+ "learning_rate": 4.591622287897529e-06,
+ "loss": 0.5906,
+ "step": 2423
+ },
+ {
+ "epoch": 1.1460992907801417,
+ "grad_norm": 2.6349124908447266,
+ "learning_rate": 4.591280528088077e-06,
+ "loss": 0.6225,
+ "step": 2424
+ },
+ {
+ "epoch": 1.1465721040189125,
+ "grad_norm": 3.19022274017334,
+ "learning_rate": 4.5909386380637555e-06,
+ "loss": 0.555,
+ "step": 2425
+ },
+ {
+ "epoch": 1.1470449172576833,
+ "grad_norm": 3.1473541259765625,
+ "learning_rate": 4.5905966178458535e-06,
+ "loss": 0.537,
+ "step": 2426
+ },
+ {
+ "epoch": 1.147517730496454,
+ "grad_norm": 2.6996145248413086,
+ "learning_rate": 4.590254467455667e-06,
+ "loss": 0.565,
+ "step": 2427
+ },
+ {
+ "epoch": 1.1479905437352245,
+ "grad_norm": 2.830188274383545,
+ "learning_rate": 4.5899121869145015e-06,
+ "loss": 0.6773,
+ "step": 2428
+ },
+ {
+ "epoch": 1.1484633569739953,
+ "grad_norm": 2.4937260150909424,
+ "learning_rate": 4.589569776243667e-06,
+ "loss": 0.5484,
+ "step": 2429
+ },
+ {
+ "epoch": 1.148936170212766,
+ "grad_norm": 2.54011869430542,
+ "learning_rate": 4.589227235464486e-06,
+ "loss": 0.5307,
+ "step": 2430
+ },
+ {
+ "epoch": 1.1494089834515366,
+ "grad_norm": 2.8764214515686035,
+ "learning_rate": 4.5888845645982845e-06,
+ "loss": 0.5296,
+ "step": 2431
+ },
+ {
+ "epoch": 1.1498817966903072,
+ "grad_norm": 2.637033462524414,
+ "learning_rate": 4.588541763666402e-06,
+ "loss": 0.5975,
+ "step": 2432
+ },
+ {
+ "epoch": 1.150354609929078,
+ "grad_norm": 2.8534255027770996,
+ "learning_rate": 4.5881988326901815e-06,
+ "loss": 0.5431,
+ "step": 2433
+ },
+ {
+ "epoch": 1.1508274231678488,
+ "grad_norm": 2.8546559810638428,
+ "learning_rate": 4.587855771690976e-06,
+ "loss": 0.469,
+ "step": 2434
+ },
+ {
+ "epoch": 1.1513002364066194,
+ "grad_norm": 2.9084973335266113,
+ "learning_rate": 4.587512580690146e-06,
+ "loss": 0.5566,
+ "step": 2435
+ },
+ {
+ "epoch": 1.15177304964539,
+ "grad_norm": 3.0993130207061768,
+ "learning_rate": 4.587169259709063e-06,
+ "loss": 0.5612,
+ "step": 2436
+ },
+ {
+ "epoch": 1.1522458628841608,
+ "grad_norm": 10.847400665283203,
+ "learning_rate": 4.5868258087691e-06,
+ "loss": 0.4678,
+ "step": 2437
+ },
+ {
+ "epoch": 1.1527186761229316,
+ "grad_norm": 2.6648571491241455,
+ "learning_rate": 4.586482227891645e-06,
+ "loss": 0.5951,
+ "step": 2438
+ },
+ {
+ "epoch": 1.1531914893617021,
+ "grad_norm": 2.529043197631836,
+ "learning_rate": 4.586138517098091e-06,
+ "loss": 0.5048,
+ "step": 2439
+ },
+ {
+ "epoch": 1.1536643026004727,
+ "grad_norm": 2.833904504776001,
+ "learning_rate": 4.585794676409839e-06,
+ "loss": 0.536,
+ "step": 2440
+ },
+ {
+ "epoch": 1.1541371158392435,
+ "grad_norm": 3.507657766342163,
+ "learning_rate": 4.585450705848298e-06,
+ "loss": 0.5954,
+ "step": 2441
+ },
+ {
+ "epoch": 1.1546099290780143,
+ "grad_norm": 2.6108388900756836,
+ "learning_rate": 4.585106605434887e-06,
+ "loss": 0.5684,
+ "step": 2442
+ },
+ {
+ "epoch": 1.1550827423167849,
+ "grad_norm": 2.490708589553833,
+ "learning_rate": 4.58476237519103e-06,
+ "loss": 0.4678,
+ "step": 2443
+ },
+ {
+ "epoch": 1.1555555555555554,
+ "grad_norm": 2.8192343711853027,
+ "learning_rate": 4.584418015138161e-06,
+ "loss": 0.5291,
+ "step": 2444
+ },
+ {
+ "epoch": 1.1560283687943262,
+ "grad_norm": 3.0878679752349854,
+ "learning_rate": 4.584073525297722e-06,
+ "loss": 0.5691,
+ "step": 2445
+ },
+ {
+ "epoch": 1.156501182033097,
+ "grad_norm": 3.1444318294525146,
+ "learning_rate": 4.583728905691163e-06,
+ "loss": 0.5643,
+ "step": 2446
+ },
+ {
+ "epoch": 1.1569739952718676,
+ "grad_norm": 3.02382230758667,
+ "learning_rate": 4.583384156339942e-06,
+ "loss": 0.6008,
+ "step": 2447
+ },
+ {
+ "epoch": 1.1574468085106382,
+ "grad_norm": 2.5942490100860596,
+ "learning_rate": 4.583039277265525e-06,
+ "loss": 0.5105,
+ "step": 2448
+ },
+ {
+ "epoch": 1.157919621749409,
+ "grad_norm": 2.938608407974243,
+ "learning_rate": 4.582694268489386e-06,
+ "loss": 0.5123,
+ "step": 2449
+ },
+ {
+ "epoch": 1.1583924349881798,
+ "grad_norm": 2.4622268676757812,
+ "learning_rate": 4.5823491300330075e-06,
+ "loss": 0.4538,
+ "step": 2450
+ },
+ {
+ "epoch": 1.1588652482269504,
+ "grad_norm": 2.4380505084991455,
+ "learning_rate": 4.5820038619178795e-06,
+ "loss": 0.4682,
+ "step": 2451
+ },
+ {
+ "epoch": 1.159338061465721,
+ "grad_norm": 2.479896068572998,
+ "learning_rate": 4.581658464165501e-06,
+ "loss": 0.4877,
+ "step": 2452
+ },
+ {
+ "epoch": 1.1598108747044917,
+ "grad_norm": 2.3373546600341797,
+ "learning_rate": 4.5813129367973765e-06,
+ "loss": 0.445,
+ "step": 2453
+ },
+ {
+ "epoch": 1.1602836879432625,
+ "grad_norm": 2.8586013317108154,
+ "learning_rate": 4.5809672798350214e-06,
+ "loss": 0.5232,
+ "step": 2454
+ },
+ {
+ "epoch": 1.160756501182033,
+ "grad_norm": 3.2302439212799072,
+ "learning_rate": 4.5806214932999595e-06,
+ "loss": 0.5336,
+ "step": 2455
+ },
+ {
+ "epoch": 1.1612293144208037,
+ "grad_norm": 3.1005783081054688,
+ "learning_rate": 4.580275577213721e-06,
+ "loss": 0.5123,
+ "step": 2456
+ },
+ {
+ "epoch": 1.1617021276595745,
+ "grad_norm": 2.7131073474884033,
+ "learning_rate": 4.579929531597842e-06,
+ "loss": 0.5648,
+ "step": 2457
+ },
+ {
+ "epoch": 1.1621749408983453,
+ "grad_norm": 2.5067050457000732,
+ "learning_rate": 4.579583356473874e-06,
+ "loss": 0.5324,
+ "step": 2458
+ },
+ {
+ "epoch": 1.1626477541371159,
+ "grad_norm": 2.7870543003082275,
+ "learning_rate": 4.579237051863366e-06,
+ "loss": 0.5094,
+ "step": 2459
+ },
+ {
+ "epoch": 1.1631205673758864,
+ "grad_norm": 2.739196300506592,
+ "learning_rate": 4.578890617787887e-06,
+ "loss": 0.5103,
+ "step": 2460
+ },
+ {
+ "epoch": 1.1635933806146572,
+ "grad_norm": 2.7108185291290283,
+ "learning_rate": 4.578544054269003e-06,
+ "loss": 0.533,
+ "step": 2461
+ },
+ {
+ "epoch": 1.1640661938534278,
+ "grad_norm": 3.028005361557007,
+ "learning_rate": 4.578197361328295e-06,
+ "loss": 0.636,
+ "step": 2462
+ },
+ {
+ "epoch": 1.1645390070921986,
+ "grad_norm": 2.4855129718780518,
+ "learning_rate": 4.5778505389873505e-06,
+ "loss": 0.501,
+ "step": 2463
+ },
+ {
+ "epoch": 1.1650118203309692,
+ "grad_norm": 2.6314198970794678,
+ "learning_rate": 4.577503587267764e-06,
+ "loss": 0.5812,
+ "step": 2464
+ },
+ {
+ "epoch": 1.16548463356974,
+ "grad_norm": 2.4209671020507812,
+ "learning_rate": 4.5771565061911385e-06,
+ "loss": 0.5168,
+ "step": 2465
+ },
+ {
+ "epoch": 1.1659574468085105,
+ "grad_norm": 2.526388645172119,
+ "learning_rate": 4.576809295779085e-06,
+ "loss": 0.5047,
+ "step": 2466
+ },
+ {
+ "epoch": 1.1664302600472813,
+ "grad_norm": 2.8278191089630127,
+ "learning_rate": 4.576461956053224e-06,
+ "loss": 0.4759,
+ "step": 2467
+ },
+ {
+ "epoch": 1.166903073286052,
+ "grad_norm": 2.7862167358398438,
+ "learning_rate": 4.576114487035182e-06,
+ "loss": 0.5492,
+ "step": 2468
+ },
+ {
+ "epoch": 1.1673758865248227,
+ "grad_norm": 2.6303019523620605,
+ "learning_rate": 4.575766888746594e-06,
+ "loss": 0.5538,
+ "step": 2469
+ },
+ {
+ "epoch": 1.1678486997635933,
+ "grad_norm": 2.613104820251465,
+ "learning_rate": 4.5754191612091034e-06,
+ "loss": 0.5114,
+ "step": 2470
+ },
+ {
+ "epoch": 1.168321513002364,
+ "grad_norm": 2.653958320617676,
+ "learning_rate": 4.5750713044443625e-06,
+ "loss": 0.5858,
+ "step": 2471
+ },
+ {
+ "epoch": 1.1687943262411347,
+ "grad_norm": 3.1143975257873535,
+ "learning_rate": 4.574723318474031e-06,
+ "loss": 0.5193,
+ "step": 2472
+ },
+ {
+ "epoch": 1.1692671394799055,
+ "grad_norm": 3.05454421043396,
+ "learning_rate": 4.574375203319775e-06,
+ "loss": 0.464,
+ "step": 2473
+ },
+ {
+ "epoch": 1.169739952718676,
+ "grad_norm": 2.66626238822937,
+ "learning_rate": 4.574026959003272e-06,
+ "loss": 0.4988,
+ "step": 2474
+ },
+ {
+ "epoch": 1.1702127659574468,
+ "grad_norm": 2.8871963024139404,
+ "learning_rate": 4.573678585546203e-06,
+ "loss": 0.5557,
+ "step": 2475
+ },
+ {
+ "epoch": 1.1706855791962174,
+ "grad_norm": 2.592949628829956,
+ "learning_rate": 4.573330082970262e-06,
+ "loss": 0.5178,
+ "step": 2476
+ },
+ {
+ "epoch": 1.1711583924349882,
+ "grad_norm": 2.9111456871032715,
+ "learning_rate": 4.572981451297148e-06,
+ "loss": 0.5712,
+ "step": 2477
+ },
+ {
+ "epoch": 1.1716312056737588,
+ "grad_norm": 2.8152248859405518,
+ "learning_rate": 4.57263269054857e-06,
+ "loss": 0.5548,
+ "step": 2478
+ },
+ {
+ "epoch": 1.1721040189125296,
+ "grad_norm": 3.0292418003082275,
+ "learning_rate": 4.572283800746241e-06,
+ "loss": 0.5937,
+ "step": 2479
+ },
+ {
+ "epoch": 1.1725768321513002,
+ "grad_norm": 3.454618215560913,
+ "learning_rate": 4.571934781911886e-06,
+ "loss": 0.5537,
+ "step": 2480
+ },
+ {
+ "epoch": 1.173049645390071,
+ "grad_norm": 2.7817866802215576,
+ "learning_rate": 4.571585634067239e-06,
+ "loss": 0.5649,
+ "step": 2481
+ },
+ {
+ "epoch": 1.1735224586288415,
+ "grad_norm": 2.7989349365234375,
+ "learning_rate": 4.571236357234037e-06,
+ "loss": 0.5448,
+ "step": 2482
+ },
+ {
+ "epoch": 1.1739952718676123,
+ "grad_norm": 2.8863933086395264,
+ "learning_rate": 4.57088695143403e-06,
+ "loss": 0.63,
+ "step": 2483
+ },
+ {
+ "epoch": 1.174468085106383,
+ "grad_norm": 2.5738039016723633,
+ "learning_rate": 4.570537416688972e-06,
+ "loss": 0.4702,
+ "step": 2484
+ },
+ {
+ "epoch": 1.1749408983451537,
+ "grad_norm": 3.003643274307251,
+ "learning_rate": 4.570187753020629e-06,
+ "loss": 0.5918,
+ "step": 2485
+ },
+ {
+ "epoch": 1.1754137115839243,
+ "grad_norm": 2.8619167804718018,
+ "learning_rate": 4.569837960450772e-06,
+ "loss": 0.5268,
+ "step": 2486
+ },
+ {
+ "epoch": 1.175886524822695,
+ "grad_norm": 2.876077175140381,
+ "learning_rate": 4.569488039001181e-06,
+ "loss": 0.4915,
+ "step": 2487
+ },
+ {
+ "epoch": 1.1763593380614656,
+ "grad_norm": 3.407115936279297,
+ "learning_rate": 4.569137988693644e-06,
+ "loss": 0.5761,
+ "step": 2488
+ },
+ {
+ "epoch": 1.1768321513002364,
+ "grad_norm": 2.7292826175689697,
+ "learning_rate": 4.568787809549958e-06,
+ "loss": 0.541,
+ "step": 2489
+ },
+ {
+ "epoch": 1.177304964539007,
+ "grad_norm": 2.8805999755859375,
+ "learning_rate": 4.568437501591926e-06,
+ "loss": 0.6223,
+ "step": 2490
+ },
+ {
+ "epoch": 1.1777777777777778,
+ "grad_norm": 2.9264373779296875,
+ "learning_rate": 4.56808706484136e-06,
+ "loss": 0.6081,
+ "step": 2491
+ },
+ {
+ "epoch": 1.1782505910165484,
+ "grad_norm": 2.5167033672332764,
+ "learning_rate": 4.567736499320082e-06,
+ "loss": 0.5393,
+ "step": 2492
+ },
+ {
+ "epoch": 1.1787234042553192,
+ "grad_norm": 3.4647862911224365,
+ "learning_rate": 4.567385805049918e-06,
+ "loss": 0.4826,
+ "step": 2493
+ },
+ {
+ "epoch": 1.1791962174940898,
+ "grad_norm": 2.9824202060699463,
+ "learning_rate": 4.5670349820527055e-06,
+ "loss": 0.541,
+ "step": 2494
+ },
+ {
+ "epoch": 1.1796690307328606,
+ "grad_norm": 2.997105836868286,
+ "learning_rate": 4.5666840303502885e-06,
+ "loss": 0.5771,
+ "step": 2495
+ },
+ {
+ "epoch": 1.1801418439716311,
+ "grad_norm": 2.8728017807006836,
+ "learning_rate": 4.56633294996452e-06,
+ "loss": 0.4877,
+ "step": 2496
+ },
+ {
+ "epoch": 1.180614657210402,
+ "grad_norm": 2.626498222351074,
+ "learning_rate": 4.5659817409172565e-06,
+ "loss": 0.5296,
+ "step": 2497
+ },
+ {
+ "epoch": 1.1810874704491725,
+ "grad_norm": 2.87037992477417,
+ "learning_rate": 4.565630403230371e-06,
+ "loss": 0.539,
+ "step": 2498
+ },
+ {
+ "epoch": 1.1815602836879433,
+ "grad_norm": 2.5719685554504395,
+ "learning_rate": 4.5652789369257375e-06,
+ "loss": 0.5653,
+ "step": 2499
+ },
+ {
+ "epoch": 1.1820330969267139,
+ "grad_norm": 2.4842135906219482,
+ "learning_rate": 4.56492734202524e-06,
+ "loss": 0.515,
+ "step": 2500
+ },
+ {
+ "epoch": 1.1825059101654847,
+ "grad_norm": 2.640951156616211,
+ "learning_rate": 4.564575618550773e-06,
+ "loss": 0.5601,
+ "step": 2501
+ },
+ {
+ "epoch": 1.1829787234042553,
+ "grad_norm": 2.624394655227661,
+ "learning_rate": 4.564223766524234e-06,
+ "loss": 0.5551,
+ "step": 2502
+ },
+ {
+ "epoch": 1.183451536643026,
+ "grad_norm": 3.014537811279297,
+ "learning_rate": 4.563871785967533e-06,
+ "loss": 0.5212,
+ "step": 2503
+ },
+ {
+ "epoch": 1.1839243498817966,
+ "grad_norm": 2.8756890296936035,
+ "learning_rate": 4.563519676902585e-06,
+ "loss": 0.5132,
+ "step": 2504
+ },
+ {
+ "epoch": 1.1843971631205674,
+ "grad_norm": 2.636781692504883,
+ "learning_rate": 4.5631674393513145e-06,
+ "loss": 0.5323,
+ "step": 2505
+ },
+ {
+ "epoch": 1.184869976359338,
+ "grad_norm": 2.7233786582946777,
+ "learning_rate": 4.562815073335655e-06,
+ "loss": 0.5608,
+ "step": 2506
+ },
+ {
+ "epoch": 1.1853427895981088,
+ "grad_norm": 2.7158713340759277,
+ "learning_rate": 4.562462578877546e-06,
+ "loss": 0.5373,
+ "step": 2507
+ },
+ {
+ "epoch": 1.1858156028368794,
+ "grad_norm": 2.9754762649536133,
+ "learning_rate": 4.562109955998936e-06,
+ "loss": 0.5712,
+ "step": 2508
+ },
+ {
+ "epoch": 1.1862884160756502,
+ "grad_norm": 2.8815054893493652,
+ "learning_rate": 4.561757204721781e-06,
+ "loss": 0.6126,
+ "step": 2509
+ },
+ {
+ "epoch": 1.1867612293144207,
+ "grad_norm": 2.866319417953491,
+ "learning_rate": 4.561404325068045e-06,
+ "loss": 0.506,
+ "step": 2510
+ },
+ {
+ "epoch": 1.1872340425531915,
+ "grad_norm": 2.6187376976013184,
+ "learning_rate": 4.561051317059701e-06,
+ "loss": 0.4674,
+ "step": 2511
+ },
+ {
+ "epoch": 1.1877068557919621,
+ "grad_norm": 2.642552137374878,
+ "learning_rate": 4.560698180718729e-06,
+ "loss": 0.4793,
+ "step": 2512
+ },
+ {
+ "epoch": 1.188179669030733,
+ "grad_norm": 2.7815041542053223,
+ "learning_rate": 4.560344916067117e-06,
+ "loss": 0.5034,
+ "step": 2513
+ },
+ {
+ "epoch": 1.1886524822695035,
+ "grad_norm": 2.70853590965271,
+ "learning_rate": 4.559991523126862e-06,
+ "loss": 0.4811,
+ "step": 2514
+ },
+ {
+ "epoch": 1.1891252955082743,
+ "grad_norm": 2.7049436569213867,
+ "learning_rate": 4.559638001919967e-06,
+ "loss": 0.547,
+ "step": 2515
+ },
+ {
+ "epoch": 1.1895981087470449,
+ "grad_norm": 2.766773223876953,
+ "learning_rate": 4.559284352468445e-06,
+ "loss": 0.5362,
+ "step": 2516
+ },
+ {
+ "epoch": 1.1900709219858157,
+ "grad_norm": 3.0064334869384766,
+ "learning_rate": 4.558930574794316e-06,
+ "loss": 0.5915,
+ "step": 2517
+ },
+ {
+ "epoch": 1.1905437352245862,
+ "grad_norm": 2.4899885654449463,
+ "learning_rate": 4.558576668919609e-06,
+ "loss": 0.4379,
+ "step": 2518
+ },
+ {
+ "epoch": 1.191016548463357,
+ "grad_norm": 2.925963878631592,
+ "learning_rate": 4.558222634866358e-06,
+ "loss": 0.5389,
+ "step": 2519
+ },
+ {
+ "epoch": 1.1914893617021276,
+ "grad_norm": 6.087667465209961,
+ "learning_rate": 4.55786847265661e-06,
+ "loss": 0.4777,
+ "step": 2520
+ },
+ {
+ "epoch": 1.1919621749408984,
+ "grad_norm": 2.4560582637786865,
+ "learning_rate": 4.5575141823124145e-06,
+ "loss": 0.5576,
+ "step": 2521
+ },
+ {
+ "epoch": 1.192434988179669,
+ "grad_norm": 3.184252977371216,
+ "learning_rate": 4.557159763855834e-06,
+ "loss": 0.5151,
+ "step": 2522
+ },
+ {
+ "epoch": 1.1929078014184398,
+ "grad_norm": 2.359722137451172,
+ "learning_rate": 4.556805217308935e-06,
+ "loss": 0.478,
+ "step": 2523
+ },
+ {
+ "epoch": 1.1933806146572103,
+ "grad_norm": 3.0821568965911865,
+ "learning_rate": 4.5564505426937935e-06,
+ "loss": 0.5784,
+ "step": 2524
+ },
+ {
+ "epoch": 1.1938534278959811,
+ "grad_norm": 2.9905128479003906,
+ "learning_rate": 4.5560957400324936e-06,
+ "loss": 0.6087,
+ "step": 2525
+ },
+ {
+ "epoch": 1.1943262411347517,
+ "grad_norm": 2.462102174758911,
+ "learning_rate": 4.555740809347128e-06,
+ "loss": 0.4739,
+ "step": 2526
+ },
+ {
+ "epoch": 1.1947990543735225,
+ "grad_norm": 2.7931067943573,
+ "learning_rate": 4.555385750659796e-06,
+ "loss": 0.4961,
+ "step": 2527
+ },
+ {
+ "epoch": 1.195271867612293,
+ "grad_norm": 2.660320997238159,
+ "learning_rate": 4.555030563992607e-06,
+ "loss": 0.487,
+ "step": 2528
+ },
+ {
+ "epoch": 1.195744680851064,
+ "grad_norm": 2.8135557174682617,
+ "learning_rate": 4.554675249367675e-06,
+ "loss": 0.5269,
+ "step": 2529
+ },
+ {
+ "epoch": 1.1962174940898345,
+ "grad_norm": 2.661933422088623,
+ "learning_rate": 4.554319806807126e-06,
+ "loss": 0.4723,
+ "step": 2530
+ },
+ {
+ "epoch": 1.1966903073286053,
+ "grad_norm": 2.568176507949829,
+ "learning_rate": 4.553964236333089e-06,
+ "loss": 0.5258,
+ "step": 2531
+ },
+ {
+ "epoch": 1.1971631205673758,
+ "grad_norm": 2.6890947818756104,
+ "learning_rate": 4.553608537967705e-06,
+ "loss": 0.4965,
+ "step": 2532
+ },
+ {
+ "epoch": 1.1976359338061466,
+ "grad_norm": 3.133470058441162,
+ "learning_rate": 4.553252711733124e-06,
+ "loss": 0.5423,
+ "step": 2533
+ },
+ {
+ "epoch": 1.1981087470449172,
+ "grad_norm": 2.7086687088012695,
+ "learning_rate": 4.552896757651498e-06,
+ "loss": 0.5326,
+ "step": 2534
+ },
+ {
+ "epoch": 1.198581560283688,
+ "grad_norm": 2.8411715030670166,
+ "learning_rate": 4.552540675744994e-06,
+ "loss": 0.5793,
+ "step": 2535
+ },
+ {
+ "epoch": 1.1990543735224586,
+ "grad_norm": 3.041077136993408,
+ "learning_rate": 4.552184466035782e-06,
+ "loss": 0.5068,
+ "step": 2536
+ },
+ {
+ "epoch": 1.1995271867612294,
+ "grad_norm": 2.5921192169189453,
+ "learning_rate": 4.551828128546041e-06,
+ "loss": 0.5189,
+ "step": 2537
+ },
+ {
+ "epoch": 1.2,
+ "grad_norm": 2.923305034637451,
+ "learning_rate": 4.5514716632979605e-06,
+ "loss": 0.516,
+ "step": 2538
+ },
+ {
+ "epoch": 1.2004728132387708,
+ "grad_norm": 2.7083024978637695,
+ "learning_rate": 4.551115070313734e-06,
+ "loss": 0.4825,
+ "step": 2539
+ },
+ {
+ "epoch": 1.2009456264775413,
+ "grad_norm": 2.746842384338379,
+ "learning_rate": 4.550758349615567e-06,
+ "loss": 0.5691,
+ "step": 2540
+ },
+ {
+ "epoch": 1.2014184397163121,
+ "grad_norm": 2.6596429347991943,
+ "learning_rate": 4.550401501225669e-06,
+ "loss": 0.5983,
+ "step": 2541
+ },
+ {
+ "epoch": 1.2018912529550827,
+ "grad_norm": 2.9057931900024414,
+ "learning_rate": 4.550044525166261e-06,
+ "loss": 0.5069,
+ "step": 2542
+ },
+ {
+ "epoch": 1.2023640661938535,
+ "grad_norm": 2.6139039993286133,
+ "learning_rate": 4.5496874214595686e-06,
+ "loss": 0.5102,
+ "step": 2543
+ },
+ {
+ "epoch": 1.202836879432624,
+ "grad_norm": 2.630286455154419,
+ "learning_rate": 4.5493301901278285e-06,
+ "loss": 0.4902,
+ "step": 2544
+ },
+ {
+ "epoch": 1.2033096926713949,
+ "grad_norm": 2.639174222946167,
+ "learning_rate": 4.548972831193284e-06,
+ "loss": 0.4566,
+ "step": 2545
+ },
+ {
+ "epoch": 1.2037825059101654,
+ "grad_norm": 2.9569664001464844,
+ "learning_rate": 4.548615344678186e-06,
+ "loss": 0.5636,
+ "step": 2546
+ },
+ {
+ "epoch": 1.2042553191489362,
+ "grad_norm": 2.981734037399292,
+ "learning_rate": 4.5482577306047924e-06,
+ "loss": 0.4884,
+ "step": 2547
+ },
+ {
+ "epoch": 1.2047281323877068,
+ "grad_norm": 2.6760342121124268,
+ "learning_rate": 4.547899988995371e-06,
+ "loss": 0.5426,
+ "step": 2548
+ },
+ {
+ "epoch": 1.2052009456264776,
+ "grad_norm": 2.825805902481079,
+ "learning_rate": 4.547542119872198e-06,
+ "loss": 0.4989,
+ "step": 2549
+ },
+ {
+ "epoch": 1.2056737588652482,
+ "grad_norm": 2.856426954269409,
+ "learning_rate": 4.547184123257555e-06,
+ "loss": 0.5734,
+ "step": 2550
+ },
+ {
+ "epoch": 1.206146572104019,
+ "grad_norm": 2.555682420730591,
+ "learning_rate": 4.5468259991737334e-06,
+ "loss": 0.5299,
+ "step": 2551
+ },
+ {
+ "epoch": 1.2066193853427896,
+ "grad_norm": 2.6324024200439453,
+ "learning_rate": 4.546467747643032e-06,
+ "loss": 0.5906,
+ "step": 2552
+ },
+ {
+ "epoch": 1.2070921985815604,
+ "grad_norm": 3.4145350456237793,
+ "learning_rate": 4.546109368687757e-06,
+ "loss": 0.5153,
+ "step": 2553
+ },
+ {
+ "epoch": 1.207565011820331,
+ "grad_norm": 2.658691644668579,
+ "learning_rate": 4.545750862330225e-06,
+ "loss": 0.5759,
+ "step": 2554
+ },
+ {
+ "epoch": 1.2080378250591017,
+ "grad_norm": 3.162605047225952,
+ "learning_rate": 4.545392228592755e-06,
+ "loss": 0.5379,
+ "step": 2555
+ },
+ {
+ "epoch": 1.2085106382978723,
+ "grad_norm": 2.8631198406219482,
+ "learning_rate": 4.545033467497681e-06,
+ "loss": 0.5959,
+ "step": 2556
+ },
+ {
+ "epoch": 1.208983451536643,
+ "grad_norm": 2.457109212875366,
+ "learning_rate": 4.54467457906734e-06,
+ "loss": 0.4864,
+ "step": 2557
+ },
+ {
+ "epoch": 1.2094562647754137,
+ "grad_norm": 2.5307061672210693,
+ "learning_rate": 4.544315563324078e-06,
+ "loss": 0.5308,
+ "step": 2558
+ },
+ {
+ "epoch": 1.2099290780141845,
+ "grad_norm": 2.8482773303985596,
+ "learning_rate": 4.543956420290251e-06,
+ "loss": 0.5126,
+ "step": 2559
+ },
+ {
+ "epoch": 1.210401891252955,
+ "grad_norm": 2.4990832805633545,
+ "learning_rate": 4.5435971499882195e-06,
+ "loss": 0.4534,
+ "step": 2560
+ },
+ {
+ "epoch": 1.2108747044917259,
+ "grad_norm": 2.6292665004730225,
+ "learning_rate": 4.543237752440354e-06,
+ "loss": 0.4434,
+ "step": 2561
+ },
+ {
+ "epoch": 1.2113475177304964,
+ "grad_norm": 2.865983247756958,
+ "learning_rate": 4.542878227669033e-06,
+ "loss": 0.5667,
+ "step": 2562
+ },
+ {
+ "epoch": 1.2118203309692672,
+ "grad_norm": 2.745614528656006,
+ "learning_rate": 4.542518575696644e-06,
+ "loss": 0.4724,
+ "step": 2563
+ },
+ {
+ "epoch": 1.2122931442080378,
+ "grad_norm": 2.8562581539154053,
+ "learning_rate": 4.5421587965455785e-06,
+ "loss": 0.5405,
+ "step": 2564
+ },
+ {
+ "epoch": 1.2127659574468086,
+ "grad_norm": 2.6670095920562744,
+ "learning_rate": 4.5417988902382385e-06,
+ "loss": 0.5432,
+ "step": 2565
+ },
+ {
+ "epoch": 1.2132387706855792,
+ "grad_norm": 2.9320743083953857,
+ "learning_rate": 4.541438856797036e-06,
+ "loss": 0.5862,
+ "step": 2566
+ },
+ {
+ "epoch": 1.21371158392435,
+ "grad_norm": 2.577505588531494,
+ "learning_rate": 4.541078696244386e-06,
+ "loss": 0.4742,
+ "step": 2567
+ },
+ {
+ "epoch": 1.2141843971631205,
+ "grad_norm": 3.4476120471954346,
+ "learning_rate": 4.540718408602717e-06,
+ "loss": 0.5903,
+ "step": 2568
+ },
+ {
+ "epoch": 1.2146572104018913,
+ "grad_norm": 2.816210985183716,
+ "learning_rate": 4.540357993894459e-06,
+ "loss": 0.5033,
+ "step": 2569
+ },
+ {
+ "epoch": 1.215130023640662,
+ "grad_norm": 3.0806639194488525,
+ "learning_rate": 4.539997452142058e-06,
+ "loss": 0.6064,
+ "step": 2570
+ },
+ {
+ "epoch": 1.2156028368794327,
+ "grad_norm": 2.563060760498047,
+ "learning_rate": 4.5396367833679586e-06,
+ "loss": 0.5597,
+ "step": 2571
+ },
+ {
+ "epoch": 1.2160756501182033,
+ "grad_norm": 3.1014397144317627,
+ "learning_rate": 4.5392759875946215e-06,
+ "loss": 0.54,
+ "step": 2572
+ },
+ {
+ "epoch": 1.216548463356974,
+ "grad_norm": 3.124190330505371,
+ "learning_rate": 4.53891506484451e-06,
+ "loss": 0.5122,
+ "step": 2573
+ },
+ {
+ "epoch": 1.2170212765957447,
+ "grad_norm": 2.6688716411590576,
+ "learning_rate": 4.538554015140097e-06,
+ "loss": 0.5615,
+ "step": 2574
+ },
+ {
+ "epoch": 1.2174940898345155,
+ "grad_norm": 2.775543689727783,
+ "learning_rate": 4.538192838503866e-06,
+ "loss": 0.496,
+ "step": 2575
+ },
+ {
+ "epoch": 1.217966903073286,
+ "grad_norm": 2.7877283096313477,
+ "learning_rate": 4.537831534958303e-06,
+ "loss": 0.4995,
+ "step": 2576
+ },
+ {
+ "epoch": 1.2184397163120568,
+ "grad_norm": 2.824810028076172,
+ "learning_rate": 4.537470104525906e-06,
+ "loss": 0.5481,
+ "step": 2577
+ },
+ {
+ "epoch": 1.2189125295508274,
+ "grad_norm": 2.801269292831421,
+ "learning_rate": 4.53710854722918e-06,
+ "loss": 0.5628,
+ "step": 2578
+ },
+ {
+ "epoch": 1.2193853427895982,
+ "grad_norm": 2.7780683040618896,
+ "learning_rate": 4.536746863090637e-06,
+ "loss": 0.4845,
+ "step": 2579
+ },
+ {
+ "epoch": 1.2198581560283688,
+ "grad_norm": 2.536010265350342,
+ "learning_rate": 4.536385052132798e-06,
+ "loss": 0.4771,
+ "step": 2580
+ },
+ {
+ "epoch": 1.2203309692671396,
+ "grad_norm": 2.768775701522827,
+ "learning_rate": 4.536023114378191e-06,
+ "loss": 0.5366,
+ "step": 2581
+ },
+ {
+ "epoch": 1.2208037825059102,
+ "grad_norm": 2.658125877380371,
+ "learning_rate": 4.535661049849352e-06,
+ "loss": 0.524,
+ "step": 2582
+ },
+ {
+ "epoch": 1.2212765957446807,
+ "grad_norm": 2.558696746826172,
+ "learning_rate": 4.535298858568825e-06,
+ "loss": 0.5482,
+ "step": 2583
+ },
+ {
+ "epoch": 1.2217494089834515,
+ "grad_norm": 2.5284535884857178,
+ "learning_rate": 4.534936540559164e-06,
+ "loss": 0.4454,
+ "step": 2584
+ },
+ {
+ "epoch": 1.2222222222222223,
+ "grad_norm": 7.617330074310303,
+ "learning_rate": 4.534574095842927e-06,
+ "loss": 0.5615,
+ "step": 2585
+ },
+ {
+ "epoch": 1.222695035460993,
+ "grad_norm": 2.9120311737060547,
+ "learning_rate": 4.534211524442682e-06,
+ "loss": 0.5624,
+ "step": 2586
+ },
+ {
+ "epoch": 1.2231678486997635,
+ "grad_norm": 2.5004289150238037,
+ "learning_rate": 4.533848826381005e-06,
+ "loss": 0.4743,
+ "step": 2587
+ },
+ {
+ "epoch": 1.2236406619385343,
+ "grad_norm": 2.8395533561706543,
+ "learning_rate": 4.53348600168048e-06,
+ "loss": 0.4457,
+ "step": 2588
+ },
+ {
+ "epoch": 1.224113475177305,
+ "grad_norm": 2.832211494445801,
+ "learning_rate": 4.533123050363699e-06,
+ "loss": 0.5559,
+ "step": 2589
+ },
+ {
+ "epoch": 1.2245862884160756,
+ "grad_norm": 2.6318583488464355,
+ "learning_rate": 4.53275997245326e-06,
+ "loss": 0.5281,
+ "step": 2590
+ },
+ {
+ "epoch": 1.2250591016548462,
+ "grad_norm": 3.0509233474731445,
+ "learning_rate": 4.532396767971771e-06,
+ "loss": 0.6003,
+ "step": 2591
+ },
+ {
+ "epoch": 1.225531914893617,
+ "grad_norm": 2.6863620281219482,
+ "learning_rate": 4.532033436941847e-06,
+ "loss": 0.5219,
+ "step": 2592
+ },
+ {
+ "epoch": 1.2260047281323878,
+ "grad_norm": 2.401463747024536,
+ "learning_rate": 4.5316699793861104e-06,
+ "loss": 0.5994,
+ "step": 2593
+ },
+ {
+ "epoch": 1.2264775413711584,
+ "grad_norm": 2.613517999649048,
+ "learning_rate": 4.531306395327194e-06,
+ "loss": 0.5785,
+ "step": 2594
+ },
+ {
+ "epoch": 1.226950354609929,
+ "grad_norm": 2.5016374588012695,
+ "learning_rate": 4.530942684787735e-06,
+ "loss": 0.5695,
+ "step": 2595
+ },
+ {
+ "epoch": 1.2274231678486998,
+ "grad_norm": 2.576464891433716,
+ "learning_rate": 4.53057884779038e-06,
+ "loss": 0.4427,
+ "step": 2596
+ },
+ {
+ "epoch": 1.2278959810874706,
+ "grad_norm": 2.5688700675964355,
+ "learning_rate": 4.530214884357785e-06,
+ "loss": 0.4966,
+ "step": 2597
+ },
+ {
+ "epoch": 1.2283687943262411,
+ "grad_norm": 3.179013729095459,
+ "learning_rate": 4.52985079451261e-06,
+ "loss": 0.5239,
+ "step": 2598
+ },
+ {
+ "epoch": 1.2288416075650117,
+ "grad_norm": 2.6015284061431885,
+ "learning_rate": 4.529486578277527e-06,
+ "loss": 0.5135,
+ "step": 2599
+ },
+ {
+ "epoch": 1.2293144208037825,
+ "grad_norm": 2.3029589653015137,
+ "learning_rate": 4.529122235675214e-06,
+ "loss": 0.4044,
+ "step": 2600
+ },
+ {
+ "epoch": 1.2297872340425533,
+ "grad_norm": 2.994093656539917,
+ "learning_rate": 4.528757766728357e-06,
+ "loss": 0.5419,
+ "step": 2601
+ },
+ {
+ "epoch": 1.2302600472813239,
+ "grad_norm": 2.6297390460968018,
+ "learning_rate": 4.52839317145965e-06,
+ "loss": 0.488,
+ "step": 2602
+ },
+ {
+ "epoch": 1.2307328605200945,
+ "grad_norm": 2.4814043045043945,
+ "learning_rate": 4.528028449891793e-06,
+ "loss": 0.4917,
+ "step": 2603
+ },
+ {
+ "epoch": 1.2312056737588652,
+ "grad_norm": 3.6052863597869873,
+ "learning_rate": 4.527663602047499e-06,
+ "loss": 0.5301,
+ "step": 2604
+ },
+ {
+ "epoch": 1.231678486997636,
+ "grad_norm": 2.6984751224517822,
+ "learning_rate": 4.5272986279494825e-06,
+ "loss": 0.5253,
+ "step": 2605
+ },
+ {
+ "epoch": 1.2321513002364066,
+ "grad_norm": 2.514000415802002,
+ "learning_rate": 4.526933527620469e-06,
+ "loss": 0.5661,
+ "step": 2606
+ },
+ {
+ "epoch": 1.2326241134751772,
+ "grad_norm": 2.890921115875244,
+ "learning_rate": 4.526568301083195e-06,
+ "loss": 0.5585,
+ "step": 2607
+ },
+ {
+ "epoch": 1.233096926713948,
+ "grad_norm": 2.6390011310577393,
+ "learning_rate": 4.526202948360397e-06,
+ "loss": 0.5168,
+ "step": 2608
+ },
+ {
+ "epoch": 1.2335697399527188,
+ "grad_norm": 2.7370636463165283,
+ "learning_rate": 4.5258374694748266e-06,
+ "loss": 0.5453,
+ "step": 2609
+ },
+ {
+ "epoch": 1.2340425531914894,
+ "grad_norm": 2.8203976154327393,
+ "learning_rate": 4.52547186444924e-06,
+ "loss": 0.5763,
+ "step": 2610
+ },
+ {
+ "epoch": 1.23451536643026,
+ "grad_norm": 2.7567849159240723,
+ "learning_rate": 4.5251061333064025e-06,
+ "loss": 0.5194,
+ "step": 2611
+ },
+ {
+ "epoch": 1.2349881796690307,
+ "grad_norm": 2.767519474029541,
+ "learning_rate": 4.524740276069085e-06,
+ "loss": 0.5355,
+ "step": 2612
+ },
+ {
+ "epoch": 1.2354609929078015,
+ "grad_norm": 3.072035312652588,
+ "learning_rate": 4.5243742927600695e-06,
+ "loss": 0.5391,
+ "step": 2613
+ },
+ {
+ "epoch": 1.2359338061465721,
+ "grad_norm": 2.5957462787628174,
+ "learning_rate": 4.524008183402143e-06,
+ "loss": 0.5645,
+ "step": 2614
+ },
+ {
+ "epoch": 1.2364066193853427,
+ "grad_norm": 2.774897575378418,
+ "learning_rate": 4.523641948018101e-06,
+ "loss": 0.5576,
+ "step": 2615
+ },
+ {
+ "epoch": 1.2368794326241135,
+ "grad_norm": 2.635887622833252,
+ "learning_rate": 4.5232755866307496e-06,
+ "loss": 0.5254,
+ "step": 2616
+ },
+ {
+ "epoch": 1.2373522458628843,
+ "grad_norm": 2.4860997200012207,
+ "learning_rate": 4.522909099262899e-06,
+ "loss": 0.4692,
+ "step": 2617
+ },
+ {
+ "epoch": 1.2378250591016549,
+ "grad_norm": 2.595513105392456,
+ "learning_rate": 4.522542485937369e-06,
+ "loss": 0.5166,
+ "step": 2618
+ },
+ {
+ "epoch": 1.2382978723404254,
+ "grad_norm": 2.961474895477295,
+ "learning_rate": 4.522175746676986e-06,
+ "loss": 0.5455,
+ "step": 2619
+ },
+ {
+ "epoch": 1.2387706855791962,
+ "grad_norm": 2.813889741897583,
+ "learning_rate": 4.521808881504588e-06,
+ "loss": 0.5249,
+ "step": 2620
+ },
+ {
+ "epoch": 1.239243498817967,
+ "grad_norm": 2.8434813022613525,
+ "learning_rate": 4.521441890443015e-06,
+ "loss": 0.472,
+ "step": 2621
+ },
+ {
+ "epoch": 1.2397163120567376,
+ "grad_norm": 2.4264845848083496,
+ "learning_rate": 4.521074773515119e-06,
+ "loss": 0.4783,
+ "step": 2622
+ },
+ {
+ "epoch": 1.2401891252955082,
+ "grad_norm": 2.615169048309326,
+ "learning_rate": 4.520707530743761e-06,
+ "loss": 0.5324,
+ "step": 2623
+ },
+ {
+ "epoch": 1.240661938534279,
+ "grad_norm": 2.6772537231445312,
+ "learning_rate": 4.520340162151803e-06,
+ "loss": 0.5224,
+ "step": 2624
+ },
+ {
+ "epoch": 1.2411347517730495,
+ "grad_norm": 2.683393955230713,
+ "learning_rate": 4.519972667762124e-06,
+ "loss": 0.4863,
+ "step": 2625
+ },
+ {
+ "epoch": 1.2416075650118203,
+ "grad_norm": 3.0335750579833984,
+ "learning_rate": 4.519605047597603e-06,
+ "loss": 0.544,
+ "step": 2626
+ },
+ {
+ "epoch": 1.242080378250591,
+ "grad_norm": 2.8694353103637695,
+ "learning_rate": 4.519237301681132e-06,
+ "loss": 0.5576,
+ "step": 2627
+ },
+ {
+ "epoch": 1.2425531914893617,
+ "grad_norm": 3.217808246612549,
+ "learning_rate": 4.518869430035609e-06,
+ "loss": 0.5459,
+ "step": 2628
+ },
+ {
+ "epoch": 1.2430260047281323,
+ "grad_norm": 2.7700083255767822,
+ "learning_rate": 4.518501432683937e-06,
+ "loss": 0.5579,
+ "step": 2629
+ },
+ {
+ "epoch": 1.243498817966903,
+ "grad_norm": 2.4759175777435303,
+ "learning_rate": 4.5181333096490335e-06,
+ "loss": 0.5049,
+ "step": 2630
+ },
+ {
+ "epoch": 1.2439716312056737,
+ "grad_norm": 2.8652584552764893,
+ "learning_rate": 4.517765060953818e-06,
+ "loss": 0.5366,
+ "step": 2631
+ },
+ {
+ "epoch": 1.2444444444444445,
+ "grad_norm": 2.776334524154663,
+ "learning_rate": 4.517396686621218e-06,
+ "loss": 0.5677,
+ "step": 2632
+ },
+ {
+ "epoch": 1.244917257683215,
+ "grad_norm": 2.676708221435547,
+ "learning_rate": 4.517028186674174e-06,
+ "loss": 0.5055,
+ "step": 2633
+ },
+ {
+ "epoch": 1.2453900709219858,
+ "grad_norm": 2.6851537227630615,
+ "learning_rate": 4.516659561135629e-06,
+ "loss": 0.5537,
+ "step": 2634
+ },
+ {
+ "epoch": 1.2458628841607564,
+ "grad_norm": 2.619971513748169,
+ "learning_rate": 4.516290810028536e-06,
+ "loss": 0.5765,
+ "step": 2635
+ },
+ {
+ "epoch": 1.2463356973995272,
+ "grad_norm": 2.7302334308624268,
+ "learning_rate": 4.515921933375855e-06,
+ "loss": 0.5611,
+ "step": 2636
+ },
+ {
+ "epoch": 1.2468085106382978,
+ "grad_norm": 2.5005829334259033,
+ "learning_rate": 4.5155529312005554e-06,
+ "loss": 0.442,
+ "step": 2637
+ },
+ {
+ "epoch": 1.2472813238770686,
+ "grad_norm": 2.713587522506714,
+ "learning_rate": 4.515183803525612e-06,
+ "loss": 0.5023,
+ "step": 2638
+ },
+ {
+ "epoch": 1.2477541371158392,
+ "grad_norm": 2.5146236419677734,
+ "learning_rate": 4.514814550374009e-06,
+ "loss": 0.5195,
+ "step": 2639
+ },
+ {
+ "epoch": 1.24822695035461,
+ "grad_norm": 2.761060953140259,
+ "learning_rate": 4.51444517176874e-06,
+ "loss": 0.5138,
+ "step": 2640
+ },
+ {
+ "epoch": 1.2486997635933805,
+ "grad_norm": 3.082329273223877,
+ "learning_rate": 4.5140756677328026e-06,
+ "loss": 0.6105,
+ "step": 2641
+ },
+ {
+ "epoch": 1.2491725768321513,
+ "grad_norm": 2.6933493614196777,
+ "learning_rate": 4.513706038289205e-06,
+ "loss": 0.5185,
+ "step": 2642
+ },
+ {
+ "epoch": 1.249645390070922,
+ "grad_norm": 2.515856981277466,
+ "learning_rate": 4.513336283460962e-06,
+ "loss": 0.5375,
+ "step": 2643
+ },
+ {
+ "epoch": 1.2501182033096927,
+ "grad_norm": 2.8553731441497803,
+ "learning_rate": 4.512966403271096e-06,
+ "loss": 0.5582,
+ "step": 2644
+ },
+ {
+ "epoch": 1.2505910165484633,
+ "grad_norm": 2.640880823135376,
+ "learning_rate": 4.5125963977426405e-06,
+ "loss": 0.5125,
+ "step": 2645
+ },
+ {
+ "epoch": 1.251063829787234,
+ "grad_norm": 2.9845943450927734,
+ "learning_rate": 4.512226266898631e-06,
+ "loss": 0.4749,
+ "step": 2646
+ },
+ {
+ "epoch": 1.2515366430260046,
+ "grad_norm": 2.5131032466888428,
+ "learning_rate": 4.511856010762116e-06,
+ "loss": 0.4764,
+ "step": 2647
+ },
+ {
+ "epoch": 1.2520094562647754,
+ "grad_norm": 2.370638370513916,
+ "learning_rate": 4.511485629356148e-06,
+ "loss": 0.5153,
+ "step": 2648
+ },
+ {
+ "epoch": 1.252482269503546,
+ "grad_norm": 2.912461996078491,
+ "learning_rate": 4.511115122703791e-06,
+ "loss": 0.6117,
+ "step": 2649
+ },
+ {
+ "epoch": 1.2529550827423168,
+ "grad_norm": 2.7308082580566406,
+ "learning_rate": 4.510744490828113e-06,
+ "loss": 0.5076,
+ "step": 2650
+ },
+ {
+ "epoch": 1.2534278959810874,
+ "grad_norm": 2.8524296283721924,
+ "learning_rate": 4.510373733752193e-06,
+ "loss": 0.542,
+ "step": 2651
+ },
+ {
+ "epoch": 1.2539007092198582,
+ "grad_norm": 2.799377202987671,
+ "learning_rate": 4.5100028514991145e-06,
+ "loss": 0.486,
+ "step": 2652
+ },
+ {
+ "epoch": 1.2543735224586288,
+ "grad_norm": 2.7248027324676514,
+ "learning_rate": 4.509631844091973e-06,
+ "loss": 0.4972,
+ "step": 2653
+ },
+ {
+ "epoch": 1.2548463356973996,
+ "grad_norm": 2.8041458129882812,
+ "learning_rate": 4.5092607115538686e-06,
+ "loss": 0.588,
+ "step": 2654
+ },
+ {
+ "epoch": 1.2553191489361701,
+ "grad_norm": 2.679417133331299,
+ "learning_rate": 4.50888945390791e-06,
+ "loss": 0.4639,
+ "step": 2655
+ },
+ {
+ "epoch": 1.255791962174941,
+ "grad_norm": 3.1049270629882812,
+ "learning_rate": 4.508518071177214e-06,
+ "loss": 0.5857,
+ "step": 2656
+ },
+ {
+ "epoch": 1.2562647754137115,
+ "grad_norm": 2.8590362071990967,
+ "learning_rate": 4.508146563384904e-06,
+ "loss": 0.5451,
+ "step": 2657
+ },
+ {
+ "epoch": 1.2567375886524823,
+ "grad_norm": 2.9774081707000732,
+ "learning_rate": 4.507774930554114e-06,
+ "loss": 0.5493,
+ "step": 2658
+ },
+ {
+ "epoch": 1.2572104018912529,
+ "grad_norm": 2.617643356323242,
+ "learning_rate": 4.507403172707983e-06,
+ "loss": 0.5472,
+ "step": 2659
+ },
+ {
+ "epoch": 1.2576832151300237,
+ "grad_norm": 2.9195587635040283,
+ "learning_rate": 4.507031289869658e-06,
+ "loss": 0.5403,
+ "step": 2660
+ },
+ {
+ "epoch": 1.2581560283687943,
+ "grad_norm": 2.706089496612549,
+ "learning_rate": 4.506659282062295e-06,
+ "loss": 0.4899,
+ "step": 2661
+ },
+ {
+ "epoch": 1.258628841607565,
+ "grad_norm": 2.8229358196258545,
+ "learning_rate": 4.506287149309057e-06,
+ "loss": 0.5336,
+ "step": 2662
+ },
+ {
+ "epoch": 1.2591016548463356,
+ "grad_norm": 2.5295674800872803,
+ "learning_rate": 4.505914891633117e-06,
+ "loss": 0.4806,
+ "step": 2663
+ },
+ {
+ "epoch": 1.2595744680851064,
+ "grad_norm": 3.098208427429199,
+ "learning_rate": 4.505542509057651e-06,
+ "loss": 0.6039,
+ "step": 2664
+ },
+ {
+ "epoch": 1.260047281323877,
+ "grad_norm": 2.5118041038513184,
+ "learning_rate": 4.5051700016058475e-06,
+ "loss": 0.5279,
+ "step": 2665
+ },
+ {
+ "epoch": 1.2605200945626478,
+ "grad_norm": 2.6901369094848633,
+ "learning_rate": 4.5047973693009005e-06,
+ "loss": 0.5515,
+ "step": 2666
+ },
+ {
+ "epoch": 1.2609929078014184,
+ "grad_norm": 2.5622377395629883,
+ "learning_rate": 4.504424612166012e-06,
+ "loss": 0.5405,
+ "step": 2667
+ },
+ {
+ "epoch": 1.2614657210401892,
+ "grad_norm": 2.685751438140869,
+ "learning_rate": 4.5040517302243915e-06,
+ "loss": 0.5797,
+ "step": 2668
+ },
+ {
+ "epoch": 1.2619385342789597,
+ "grad_norm": 2.8525350093841553,
+ "learning_rate": 4.503678723499259e-06,
+ "loss": 0.5561,
+ "step": 2669
+ },
+ {
+ "epoch": 1.2624113475177305,
+ "grad_norm": 2.803386926651001,
+ "learning_rate": 4.503305592013836e-06,
+ "loss": 0.5376,
+ "step": 2670
+ },
+ {
+ "epoch": 1.2628841607565011,
+ "grad_norm": 2.78633189201355,
+ "learning_rate": 4.502932335791359e-06,
+ "loss": 0.4739,
+ "step": 2671
+ },
+ {
+ "epoch": 1.263356973995272,
+ "grad_norm": 2.8337297439575195,
+ "learning_rate": 4.502558954855069e-06,
+ "loss": 0.5406,
+ "step": 2672
+ },
+ {
+ "epoch": 1.2638297872340425,
+ "grad_norm": 2.610275983810425,
+ "learning_rate": 4.502185449228213e-06,
+ "loss": 0.5343,
+ "step": 2673
+ },
+ {
+ "epoch": 1.2643026004728133,
+ "grad_norm": 2.7842252254486084,
+ "learning_rate": 4.501811818934048e-06,
+ "loss": 0.532,
+ "step": 2674
+ },
+ {
+ "epoch": 1.2647754137115839,
+ "grad_norm": 2.4472389221191406,
+ "learning_rate": 4.501438063995839e-06,
+ "loss": 0.4976,
+ "step": 2675
+ },
+ {
+ "epoch": 1.2652482269503547,
+ "grad_norm": 3.076580762863159,
+ "learning_rate": 4.501064184436858e-06,
+ "loss": 0.507,
+ "step": 2676
+ },
+ {
+ "epoch": 1.2657210401891252,
+ "grad_norm": 2.5952908992767334,
+ "learning_rate": 4.500690180280384e-06,
+ "loss": 0.5498,
+ "step": 2677
+ },
+ {
+ "epoch": 1.266193853427896,
+ "grad_norm": 2.476943016052246,
+ "learning_rate": 4.500316051549706e-06,
+ "loss": 0.557,
+ "step": 2678
+ },
+ {
+ "epoch": 1.2666666666666666,
+ "grad_norm": 2.730579376220703,
+ "learning_rate": 4.499941798268118e-06,
+ "loss": 0.4975,
+ "step": 2679
+ },
+ {
+ "epoch": 1.2671394799054374,
+ "grad_norm": 2.7916698455810547,
+ "learning_rate": 4.499567420458924e-06,
+ "loss": 0.5673,
+ "step": 2680
+ },
+ {
+ "epoch": 1.267612293144208,
+ "grad_norm": 2.4249091148376465,
+ "learning_rate": 4.4991929181454355e-06,
+ "loss": 0.4836,
+ "step": 2681
+ },
+ {
+ "epoch": 1.2680851063829788,
+ "grad_norm": 2.661911725997925,
+ "learning_rate": 4.498818291350969e-06,
+ "loss": 0.5332,
+ "step": 2682
+ },
+ {
+ "epoch": 1.2685579196217494,
+ "grad_norm": 2.693657875061035,
+ "learning_rate": 4.498443540098852e-06,
+ "loss": 0.5257,
+ "step": 2683
+ },
+ {
+ "epoch": 1.2690307328605201,
+ "grad_norm": 2.609386682510376,
+ "learning_rate": 4.4980686644124195e-06,
+ "loss": 0.4918,
+ "step": 2684
+ },
+ {
+ "epoch": 1.2695035460992907,
+ "grad_norm": 3.2104930877685547,
+ "learning_rate": 4.4976936643150124e-06,
+ "loss": 0.6097,
+ "step": 2685
+ },
+ {
+ "epoch": 1.2699763593380615,
+ "grad_norm": 2.707860231399536,
+ "learning_rate": 4.49731853982998e-06,
+ "loss": 0.5109,
+ "step": 2686
+ },
+ {
+ "epoch": 1.270449172576832,
+ "grad_norm": 3.5046379566192627,
+ "learning_rate": 4.49694329098068e-06,
+ "loss": 0.5883,
+ "step": 2687
+ },
+ {
+ "epoch": 1.270921985815603,
+ "grad_norm": 2.5362324714660645,
+ "learning_rate": 4.496567917790477e-06,
+ "loss": 0.5301,
+ "step": 2688
+ },
+ {
+ "epoch": 1.2713947990543735,
+ "grad_norm": 2.7095518112182617,
+ "learning_rate": 4.496192420282746e-06,
+ "loss": 0.4772,
+ "step": 2689
+ },
+ {
+ "epoch": 1.2718676122931443,
+ "grad_norm": 2.416433095932007,
+ "learning_rate": 4.495816798480865e-06,
+ "loss": 0.5012,
+ "step": 2690
+ },
+ {
+ "epoch": 1.2723404255319148,
+ "grad_norm": 2.5362391471862793,
+ "learning_rate": 4.495441052408224e-06,
+ "loss": 0.5197,
+ "step": 2691
+ },
+ {
+ "epoch": 1.2728132387706856,
+ "grad_norm": 2.9093947410583496,
+ "learning_rate": 4.495065182088218e-06,
+ "loss": 0.4893,
+ "step": 2692
+ },
+ {
+ "epoch": 1.2732860520094562,
+ "grad_norm": 2.520470142364502,
+ "learning_rate": 4.494689187544251e-06,
+ "loss": 0.5072,
+ "step": 2693
+ },
+ {
+ "epoch": 1.273758865248227,
+ "grad_norm": 2.4385125637054443,
+ "learning_rate": 4.494313068799735e-06,
+ "loss": 0.4923,
+ "step": 2694
+ },
+ {
+ "epoch": 1.2742316784869976,
+ "grad_norm": 2.636852502822876,
+ "learning_rate": 4.493936825878089e-06,
+ "loss": 0.5409,
+ "step": 2695
+ },
+ {
+ "epoch": 1.2747044917257684,
+ "grad_norm": 2.7027053833007812,
+ "learning_rate": 4.493560458802741e-06,
+ "loss": 0.5906,
+ "step": 2696
+ },
+ {
+ "epoch": 1.275177304964539,
+ "grad_norm": 2.58752179145813,
+ "learning_rate": 4.493183967597123e-06,
+ "loss": 0.5292,
+ "step": 2697
+ },
+ {
+ "epoch": 1.2756501182033098,
+ "grad_norm": 2.7658379077911377,
+ "learning_rate": 4.49280735228468e-06,
+ "loss": 0.5613,
+ "step": 2698
+ },
+ {
+ "epoch": 1.2761229314420803,
+ "grad_norm": 3.272688388824463,
+ "learning_rate": 4.492430612888861e-06,
+ "loss": 0.5654,
+ "step": 2699
+ },
+ {
+ "epoch": 1.2765957446808511,
+ "grad_norm": 2.806819438934326,
+ "learning_rate": 4.492053749433125e-06,
+ "loss": 0.5388,
+ "step": 2700
+ },
+ {
+ "epoch": 1.2770685579196217,
+ "grad_norm": 2.879727602005005,
+ "learning_rate": 4.491676761940936e-06,
+ "loss": 0.5033,
+ "step": 2701
+ },
+ {
+ "epoch": 1.2775413711583925,
+ "grad_norm": 2.733347177505493,
+ "learning_rate": 4.4912996504357695e-06,
+ "loss": 0.5113,
+ "step": 2702
+ },
+ {
+ "epoch": 1.278014184397163,
+ "grad_norm": 2.7431252002716064,
+ "learning_rate": 4.490922414941104e-06,
+ "loss": 0.5417,
+ "step": 2703
+ },
+ {
+ "epoch": 1.2784869976359339,
+ "grad_norm": 2.9287240505218506,
+ "learning_rate": 4.490545055480431e-06,
+ "loss": 0.5875,
+ "step": 2704
+ },
+ {
+ "epoch": 1.2789598108747045,
+ "grad_norm": 2.576775550842285,
+ "learning_rate": 4.490167572077244e-06,
+ "loss": 0.5176,
+ "step": 2705
+ },
+ {
+ "epoch": 1.2794326241134752,
+ "grad_norm": 2.4335594177246094,
+ "learning_rate": 4.4897899647550505e-06,
+ "loss": 0.4749,
+ "step": 2706
+ },
+ {
+ "epoch": 1.2799054373522458,
+ "grad_norm": 2.6798062324523926,
+ "learning_rate": 4.489412233537361e-06,
+ "loss": 0.5439,
+ "step": 2707
+ },
+ {
+ "epoch": 1.2803782505910166,
+ "grad_norm": 2.8440675735473633,
+ "learning_rate": 4.489034378447693e-06,
+ "loss": 0.552,
+ "step": 2708
+ },
+ {
+ "epoch": 1.2808510638297872,
+ "grad_norm": 2.9059503078460693,
+ "learning_rate": 4.488656399509577e-06,
+ "loss": 0.5667,
+ "step": 2709
+ },
+ {
+ "epoch": 1.281323877068558,
+ "grad_norm": 2.7415006160736084,
+ "learning_rate": 4.488278296746548e-06,
+ "loss": 0.5676,
+ "step": 2710
+ },
+ {
+ "epoch": 1.2817966903073286,
+ "grad_norm": 2.4584875106811523,
+ "learning_rate": 4.487900070182147e-06,
+ "loss": 0.4787,
+ "step": 2711
+ },
+ {
+ "epoch": 1.2822695035460994,
+ "grad_norm": 2.990940809249878,
+ "learning_rate": 4.487521719839924e-06,
+ "loss": 0.5239,
+ "step": 2712
+ },
+ {
+ "epoch": 1.28274231678487,
+ "grad_norm": 3.075201988220215,
+ "learning_rate": 4.487143245743441e-06,
+ "loss": 0.5103,
+ "step": 2713
+ },
+ {
+ "epoch": 1.2832151300236407,
+ "grad_norm": 2.543341875076294,
+ "learning_rate": 4.486764647916259e-06,
+ "loss": 0.5475,
+ "step": 2714
+ },
+ {
+ "epoch": 1.2836879432624113,
+ "grad_norm": 2.9927213191986084,
+ "learning_rate": 4.486385926381957e-06,
+ "loss": 0.4923,
+ "step": 2715
+ },
+ {
+ "epoch": 1.284160756501182,
+ "grad_norm": 2.4220657348632812,
+ "learning_rate": 4.486007081164111e-06,
+ "loss": 0.543,
+ "step": 2716
+ },
+ {
+ "epoch": 1.2846335697399527,
+ "grad_norm": 2.468214988708496,
+ "learning_rate": 4.4856281122863134e-06,
+ "loss": 0.5248,
+ "step": 2717
+ },
+ {
+ "epoch": 1.2851063829787235,
+ "grad_norm": 2.633711099624634,
+ "learning_rate": 4.48524901977216e-06,
+ "loss": 0.4764,
+ "step": 2718
+ },
+ {
+ "epoch": 1.285579196217494,
+ "grad_norm": 2.8399546146392822,
+ "learning_rate": 4.484869803645254e-06,
+ "loss": 0.5503,
+ "step": 2719
+ },
+ {
+ "epoch": 1.2860520094562649,
+ "grad_norm": 2.769063949584961,
+ "learning_rate": 4.484490463929209e-06,
+ "loss": 0.5468,
+ "step": 2720
+ },
+ {
+ "epoch": 1.2865248226950354,
+ "grad_norm": 2.617863893508911,
+ "learning_rate": 4.4841110006476465e-06,
+ "loss": 0.5906,
+ "step": 2721
+ },
+ {
+ "epoch": 1.2869976359338062,
+ "grad_norm": 2.7639541625976562,
+ "learning_rate": 4.4837314138241905e-06,
+ "loss": 0.552,
+ "step": 2722
+ },
+ {
+ "epoch": 1.2874704491725768,
+ "grad_norm": 2.7711129188537598,
+ "learning_rate": 4.483351703482478e-06,
+ "loss": 0.5229,
+ "step": 2723
+ },
+ {
+ "epoch": 1.2879432624113476,
+ "grad_norm": 2.611205577850342,
+ "learning_rate": 4.482971869646152e-06,
+ "loss": 0.5055,
+ "step": 2724
+ },
+ {
+ "epoch": 1.2884160756501182,
+ "grad_norm": 2.8602211475372314,
+ "learning_rate": 4.482591912338862e-06,
+ "loss": 0.5561,
+ "step": 2725
+ },
+ {
+ "epoch": 1.2888888888888888,
+ "grad_norm": 2.5882298946380615,
+ "learning_rate": 4.4822118315842675e-06,
+ "loss": 0.5555,
+ "step": 2726
+ },
+ {
+ "epoch": 1.2893617021276595,
+ "grad_norm": 2.7533531188964844,
+ "learning_rate": 4.481831627406033e-06,
+ "loss": 0.5346,
+ "step": 2727
+ },
+ {
+ "epoch": 1.2898345153664303,
+ "grad_norm": 2.4296958446502686,
+ "learning_rate": 4.481451299827835e-06,
+ "loss": 0.4915,
+ "step": 2728
+ },
+ {
+ "epoch": 1.290307328605201,
+ "grad_norm": 2.4403445720672607,
+ "learning_rate": 4.481070848873352e-06,
+ "loss": 0.5648,
+ "step": 2729
+ },
+ {
+ "epoch": 1.2907801418439715,
+ "grad_norm": 2.473224401473999,
+ "learning_rate": 4.480690274566274e-06,
+ "loss": 0.4849,
+ "step": 2730
+ },
+ {
+ "epoch": 1.2912529550827423,
+ "grad_norm": 2.637899875640869,
+ "learning_rate": 4.480309576930297e-06,
+ "loss": 0.4968,
+ "step": 2731
+ },
+ {
+ "epoch": 1.291725768321513,
+ "grad_norm": 2.7156927585601807,
+ "learning_rate": 4.479928755989127e-06,
+ "loss": 0.4759,
+ "step": 2732
+ },
+ {
+ "epoch": 1.2921985815602837,
+ "grad_norm": 2.632786989212036,
+ "learning_rate": 4.479547811766475e-06,
+ "loss": 0.5468,
+ "step": 2733
+ },
+ {
+ "epoch": 1.2926713947990542,
+ "grad_norm": 2.529218912124634,
+ "learning_rate": 4.479166744286061e-06,
+ "loss": 0.4852,
+ "step": 2734
+ },
+ {
+ "epoch": 1.293144208037825,
+ "grad_norm": 2.561978340148926,
+ "learning_rate": 4.4787855535716115e-06,
+ "loss": 0.546,
+ "step": 2735
+ },
+ {
+ "epoch": 1.2936170212765958,
+ "grad_norm": 2.3684909343719482,
+ "learning_rate": 4.478404239646862e-06,
+ "loss": 0.5369,
+ "step": 2736
+ },
+ {
+ "epoch": 1.2940898345153664,
+ "grad_norm": 2.8940367698669434,
+ "learning_rate": 4.4780228025355566e-06,
+ "loss": 0.568,
+ "step": 2737
+ },
+ {
+ "epoch": 1.294562647754137,
+ "grad_norm": 2.6950316429138184,
+ "learning_rate": 4.477641242261445e-06,
+ "loss": 0.4576,
+ "step": 2738
+ },
+ {
+ "epoch": 1.2950354609929078,
+ "grad_norm": 2.4211716651916504,
+ "learning_rate": 4.4772595588482835e-06,
+ "loss": 0.4341,
+ "step": 2739
+ },
+ {
+ "epoch": 1.2955082742316786,
+ "grad_norm": 3.141097068786621,
+ "learning_rate": 4.47687775231984e-06,
+ "loss": 0.5944,
+ "step": 2740
+ },
+ {
+ "epoch": 1.2959810874704492,
+ "grad_norm": 3.077522039413452,
+ "learning_rate": 4.476495822699887e-06,
+ "loss": 0.5786,
+ "step": 2741
+ },
+ {
+ "epoch": 1.2964539007092197,
+ "grad_norm": 2.708139419555664,
+ "learning_rate": 4.476113770012206e-06,
+ "loss": 0.5014,
+ "step": 2742
+ },
+ {
+ "epoch": 1.2969267139479905,
+ "grad_norm": 2.7572035789489746,
+ "learning_rate": 4.475731594280586e-06,
+ "loss": 0.594,
+ "step": 2743
+ },
+ {
+ "epoch": 1.2973995271867613,
+ "grad_norm": 2.673126459121704,
+ "learning_rate": 4.475349295528822e-06,
+ "loss": 0.5317,
+ "step": 2744
+ },
+ {
+ "epoch": 1.297872340425532,
+ "grad_norm": 2.6757819652557373,
+ "learning_rate": 4.4749668737807195e-06,
+ "loss": 0.5614,
+ "step": 2745
+ },
+ {
+ "epoch": 1.2983451536643025,
+ "grad_norm": 2.7077620029449463,
+ "learning_rate": 4.47458432906009e-06,
+ "loss": 0.4916,
+ "step": 2746
+ },
+ {
+ "epoch": 1.2988179669030733,
+ "grad_norm": 2.446570873260498,
+ "learning_rate": 4.474201661390752e-06,
+ "loss": 0.5005,
+ "step": 2747
+ },
+ {
+ "epoch": 1.299290780141844,
+ "grad_norm": 2.642695665359497,
+ "learning_rate": 4.473818870796533e-06,
+ "loss": 0.5048,
+ "step": 2748
+ },
+ {
+ "epoch": 1.2997635933806146,
+ "grad_norm": 2.519824743270874,
+ "learning_rate": 4.4734359573012686e-06,
+ "loss": 0.5131,
+ "step": 2749
+ },
+ {
+ "epoch": 1.3002364066193852,
+ "grad_norm": 2.5901925563812256,
+ "learning_rate": 4.4730529209287995e-06,
+ "loss": 0.4582,
+ "step": 2750
+ },
+ {
+ "epoch": 1.300709219858156,
+ "grad_norm": 2.6789121627807617,
+ "learning_rate": 4.472669761702978e-06,
+ "loss": 0.5685,
+ "step": 2751
+ },
+ {
+ "epoch": 1.3011820330969268,
+ "grad_norm": 2.408003807067871,
+ "learning_rate": 4.472286479647659e-06,
+ "loss": 0.4329,
+ "step": 2752
+ },
+ {
+ "epoch": 1.3016548463356974,
+ "grad_norm": 2.681403398513794,
+ "learning_rate": 4.47190307478671e-06,
+ "loss": 0.4853,
+ "step": 2753
+ },
+ {
+ "epoch": 1.302127659574468,
+ "grad_norm": 2.9923183917999268,
+ "learning_rate": 4.4715195471440025e-06,
+ "loss": 0.5184,
+ "step": 2754
+ },
+ {
+ "epoch": 1.3026004728132388,
+ "grad_norm": 2.5100321769714355,
+ "learning_rate": 4.471135896743418e-06,
+ "loss": 0.5148,
+ "step": 2755
+ },
+ {
+ "epoch": 1.3030732860520096,
+ "grad_norm": 2.267881393432617,
+ "learning_rate": 4.4707521236088444e-06,
+ "loss": 0.5028,
+ "step": 2756
+ },
+ {
+ "epoch": 1.3035460992907801,
+ "grad_norm": 2.7779829502105713,
+ "learning_rate": 4.4703682277641775e-06,
+ "loss": 0.5724,
+ "step": 2757
+ },
+ {
+ "epoch": 1.3040189125295507,
+ "grad_norm": 2.4262194633483887,
+ "learning_rate": 4.4699842092333205e-06,
+ "loss": 0.5341,
+ "step": 2758
+ },
+ {
+ "epoch": 1.3044917257683215,
+ "grad_norm": 2.8682050704956055,
+ "learning_rate": 4.469600068040185e-06,
+ "loss": 0.6114,
+ "step": 2759
+ },
+ {
+ "epoch": 1.3049645390070923,
+ "grad_norm": 2.647853374481201,
+ "learning_rate": 4.46921580420869e-06,
+ "loss": 0.5107,
+ "step": 2760
+ },
+ {
+ "epoch": 1.3054373522458629,
+ "grad_norm": 2.561998128890991,
+ "learning_rate": 4.468831417762762e-06,
+ "loss": 0.6019,
+ "step": 2761
+ },
+ {
+ "epoch": 1.3059101654846335,
+ "grad_norm": 2.763425350189209,
+ "learning_rate": 4.468446908726334e-06,
+ "loss": 0.572,
+ "step": 2762
+ },
+ {
+ "epoch": 1.3063829787234043,
+ "grad_norm": 2.7052934169769287,
+ "learning_rate": 4.468062277123348e-06,
+ "loss": 0.4876,
+ "step": 2763
+ },
+ {
+ "epoch": 1.306855791962175,
+ "grad_norm": 2.997845411300659,
+ "learning_rate": 4.467677522977755e-06,
+ "loss": 0.5683,
+ "step": 2764
+ },
+ {
+ "epoch": 1.3073286052009456,
+ "grad_norm": 2.503129005432129,
+ "learning_rate": 4.46729264631351e-06,
+ "loss": 0.4951,
+ "step": 2765
+ },
+ {
+ "epoch": 1.3078014184397162,
+ "grad_norm": 2.617492437362671,
+ "learning_rate": 4.466907647154578e-06,
+ "loss": 0.5054,
+ "step": 2766
+ },
+ {
+ "epoch": 1.308274231678487,
+ "grad_norm": 2.934967279434204,
+ "learning_rate": 4.4665225255249315e-06,
+ "loss": 0.5299,
+ "step": 2767
+ },
+ {
+ "epoch": 1.3087470449172578,
+ "grad_norm": 2.787252187728882,
+ "learning_rate": 4.46613728144855e-06,
+ "loss": 0.4652,
+ "step": 2768
+ },
+ {
+ "epoch": 1.3092198581560284,
+ "grad_norm": 2.567439556121826,
+ "learning_rate": 4.465751914949422e-06,
+ "loss": 0.538,
+ "step": 2769
+ },
+ {
+ "epoch": 1.309692671394799,
+ "grad_norm": 2.6386024951934814,
+ "learning_rate": 4.4653664260515416e-06,
+ "loss": 0.464,
+ "step": 2770
+ },
+ {
+ "epoch": 1.3101654846335697,
+ "grad_norm": 2.966848134994507,
+ "learning_rate": 4.464980814778912e-06,
+ "loss": 0.4889,
+ "step": 2771
+ },
+ {
+ "epoch": 1.3106382978723405,
+ "grad_norm": 2.571256637573242,
+ "learning_rate": 4.464595081155542e-06,
+ "loss": 0.4979,
+ "step": 2772
+ },
+ {
+ "epoch": 1.3111111111111111,
+ "grad_norm": 2.774203062057495,
+ "learning_rate": 4.4642092252054515e-06,
+ "loss": 0.5366,
+ "step": 2773
+ },
+ {
+ "epoch": 1.3115839243498817,
+ "grad_norm": 2.682969331741333,
+ "learning_rate": 4.463823246952666e-06,
+ "loss": 0.5118,
+ "step": 2774
+ },
+ {
+ "epoch": 1.3120567375886525,
+ "grad_norm": 2.4873905181884766,
+ "learning_rate": 4.463437146421217e-06,
+ "loss": 0.5548,
+ "step": 2775
+ },
+ {
+ "epoch": 1.3125295508274233,
+ "grad_norm": 2.6769661903381348,
+ "learning_rate": 4.463050923635147e-06,
+ "loss": 0.5023,
+ "step": 2776
+ },
+ {
+ "epoch": 1.3130023640661939,
+ "grad_norm": 2.7190892696380615,
+ "learning_rate": 4.462664578618503e-06,
+ "loss": 0.5546,
+ "step": 2777
+ },
+ {
+ "epoch": 1.3134751773049644,
+ "grad_norm": 2.8193624019622803,
+ "learning_rate": 4.462278111395343e-06,
+ "loss": 0.5265,
+ "step": 2778
+ },
+ {
+ "epoch": 1.3139479905437352,
+ "grad_norm": 2.7324538230895996,
+ "learning_rate": 4.461891521989728e-06,
+ "loss": 0.5449,
+ "step": 2779
+ },
+ {
+ "epoch": 1.314420803782506,
+ "grad_norm": 2.87320876121521,
+ "learning_rate": 4.4615048104257305e-06,
+ "loss": 0.5367,
+ "step": 2780
+ },
+ {
+ "epoch": 1.3148936170212766,
+ "grad_norm": 2.6777031421661377,
+ "learning_rate": 4.4611179767274306e-06,
+ "loss": 0.5026,
+ "step": 2781
+ },
+ {
+ "epoch": 1.3153664302600472,
+ "grad_norm": 3.714524269104004,
+ "learning_rate": 4.460731020918913e-06,
+ "loss": 0.569,
+ "step": 2782
+ },
+ {
+ "epoch": 1.315839243498818,
+ "grad_norm": 2.7493600845336914,
+ "learning_rate": 4.460343943024273e-06,
+ "loss": 0.5826,
+ "step": 2783
+ },
+ {
+ "epoch": 1.3163120567375888,
+ "grad_norm": 2.6544079780578613,
+ "learning_rate": 4.459956743067609e-06,
+ "loss": 0.5399,
+ "step": 2784
+ },
+ {
+ "epoch": 1.3167848699763594,
+ "grad_norm": 2.4338037967681885,
+ "learning_rate": 4.459569421073036e-06,
+ "loss": 0.5186,
+ "step": 2785
+ },
+ {
+ "epoch": 1.31725768321513,
+ "grad_norm": 2.9312374591827393,
+ "learning_rate": 4.459181977064665e-06,
+ "loss": 0.5571,
+ "step": 2786
+ },
+ {
+ "epoch": 1.3177304964539007,
+ "grad_norm": 2.5988922119140625,
+ "learning_rate": 4.458794411066624e-06,
+ "loss": 0.5926,
+ "step": 2787
+ },
+ {
+ "epoch": 1.3182033096926715,
+ "grad_norm": 2.5193772315979004,
+ "learning_rate": 4.458406723103044e-06,
+ "loss": 0.5243,
+ "step": 2788
+ },
+ {
+ "epoch": 1.318676122931442,
+ "grad_norm": 2.8653743267059326,
+ "learning_rate": 4.458018913198066e-06,
+ "loss": 0.5421,
+ "step": 2789
+ },
+ {
+ "epoch": 1.3191489361702127,
+ "grad_norm": 2.486245632171631,
+ "learning_rate": 4.457630981375834e-06,
+ "loss": 0.4862,
+ "step": 2790
+ },
+ {
+ "epoch": 1.3196217494089835,
+ "grad_norm": 3.155435800552368,
+ "learning_rate": 4.457242927660506e-06,
+ "loss": 0.5386,
+ "step": 2791
+ },
+ {
+ "epoch": 1.3200945626477543,
+ "grad_norm": 3.102023124694824,
+ "learning_rate": 4.456854752076242e-06,
+ "loss": 0.5527,
+ "step": 2792
+ },
+ {
+ "epoch": 1.3205673758865248,
+ "grad_norm": 2.7995986938476562,
+ "learning_rate": 4.456466454647215e-06,
+ "loss": 0.4364,
+ "step": 2793
+ },
+ {
+ "epoch": 1.3210401891252954,
+ "grad_norm": 2.8328311443328857,
+ "learning_rate": 4.456078035397599e-06,
+ "loss": 0.5516,
+ "step": 2794
+ },
+ {
+ "epoch": 1.3215130023640662,
+ "grad_norm": 2.606161594390869,
+ "learning_rate": 4.455689494351581e-06,
+ "loss": 0.5042,
+ "step": 2795
+ },
+ {
+ "epoch": 1.321985815602837,
+ "grad_norm": 2.6344757080078125,
+ "learning_rate": 4.455300831533354e-06,
+ "loss": 0.4807,
+ "step": 2796
+ },
+ {
+ "epoch": 1.3224586288416076,
+ "grad_norm": 2.8539786338806152,
+ "learning_rate": 4.454912046967118e-06,
+ "loss": 0.4694,
+ "step": 2797
+ },
+ {
+ "epoch": 1.3229314420803782,
+ "grad_norm": 2.849066734313965,
+ "learning_rate": 4.454523140677081e-06,
+ "loss": 0.5037,
+ "step": 2798
+ },
+ {
+ "epoch": 1.323404255319149,
+ "grad_norm": 2.6803371906280518,
+ "learning_rate": 4.454134112687458e-06,
+ "loss": 0.4959,
+ "step": 2799
+ },
+ {
+ "epoch": 1.3238770685579198,
+ "grad_norm": 3.0546066761016846,
+ "learning_rate": 4.453744963022473e-06,
+ "loss": 0.5935,
+ "step": 2800
+ },
+ {
+ "epoch": 1.3243498817966903,
+ "grad_norm": 2.625602960586548,
+ "learning_rate": 4.453355691706356e-06,
+ "loss": 0.5349,
+ "step": 2801
+ },
+ {
+ "epoch": 1.324822695035461,
+ "grad_norm": 2.7568554878234863,
+ "learning_rate": 4.452966298763345e-06,
+ "loss": 0.5012,
+ "step": 2802
+ },
+ {
+ "epoch": 1.3252955082742317,
+ "grad_norm": 2.940427303314209,
+ "learning_rate": 4.452576784217686e-06,
+ "loss": 0.5246,
+ "step": 2803
+ },
+ {
+ "epoch": 1.3257683215130025,
+ "grad_norm": 2.5485289096832275,
+ "learning_rate": 4.452187148093633e-06,
+ "loss": 0.5282,
+ "step": 2804
+ },
+ {
+ "epoch": 1.326241134751773,
+ "grad_norm": 2.8152987957000732,
+ "learning_rate": 4.4517973904154455e-06,
+ "loss": 0.5468,
+ "step": 2805
+ },
+ {
+ "epoch": 1.3267139479905437,
+ "grad_norm": 2.9399688243865967,
+ "learning_rate": 4.451407511207393e-06,
+ "loss": 0.5586,
+ "step": 2806
+ },
+ {
+ "epoch": 1.3271867612293144,
+ "grad_norm": 2.3870036602020264,
+ "learning_rate": 4.451017510493751e-06,
+ "loss": 0.4807,
+ "step": 2807
+ },
+ {
+ "epoch": 1.327659574468085,
+ "grad_norm": 3.4667887687683105,
+ "learning_rate": 4.450627388298805e-06,
+ "loss": 0.5571,
+ "step": 2808
+ },
+ {
+ "epoch": 1.3281323877068558,
+ "grad_norm": 2.685986042022705,
+ "learning_rate": 4.450237144646844e-06,
+ "loss": 0.5525,
+ "step": 2809
+ },
+ {
+ "epoch": 1.3286052009456264,
+ "grad_norm": 2.8529131412506104,
+ "learning_rate": 4.449846779562168e-06,
+ "loss": 0.491,
+ "step": 2810
+ },
+ {
+ "epoch": 1.3290780141843972,
+ "grad_norm": 2.7360332012176514,
+ "learning_rate": 4.449456293069082e-06,
+ "loss": 0.5574,
+ "step": 2811
+ },
+ {
+ "epoch": 1.3295508274231678,
+ "grad_norm": 2.4656026363372803,
+ "learning_rate": 4.4490656851919015e-06,
+ "loss": 0.4678,
+ "step": 2812
+ },
+ {
+ "epoch": 1.3300236406619386,
+ "grad_norm": 2.602651357650757,
+ "learning_rate": 4.448674955954947e-06,
+ "loss": 0.5118,
+ "step": 2813
+ },
+ {
+ "epoch": 1.3304964539007091,
+ "grad_norm": 3.0129756927490234,
+ "learning_rate": 4.448284105382548e-06,
+ "loss": 0.6136,
+ "step": 2814
+ },
+ {
+ "epoch": 1.33096926713948,
+ "grad_norm": 2.8499927520751953,
+ "learning_rate": 4.447893133499039e-06,
+ "loss": 0.5286,
+ "step": 2815
+ },
+ {
+ "epoch": 1.3314420803782505,
+ "grad_norm": 2.8320744037628174,
+ "learning_rate": 4.447502040328767e-06,
+ "loss": 0.5186,
+ "step": 2816
+ },
+ {
+ "epoch": 1.3319148936170213,
+ "grad_norm": 2.499950885772705,
+ "learning_rate": 4.447110825896084e-06,
+ "loss": 0.5338,
+ "step": 2817
+ },
+ {
+ "epoch": 1.3323877068557919,
+ "grad_norm": 2.530895233154297,
+ "learning_rate": 4.446719490225346e-06,
+ "loss": 0.5151,
+ "step": 2818
+ },
+ {
+ "epoch": 1.3328605200945627,
+ "grad_norm": 2.5276098251342773,
+ "learning_rate": 4.446328033340921e-06,
+ "loss": 0.5424,
+ "step": 2819
+ },
+ {
+ "epoch": 1.3333333333333333,
+ "grad_norm": 2.90218186378479,
+ "learning_rate": 4.4459364552671845e-06,
+ "loss": 0.5747,
+ "step": 2820
+ },
+ {
+ "epoch": 1.333806146572104,
+ "grad_norm": 2.500943183898926,
+ "learning_rate": 4.445544756028518e-06,
+ "loss": 0.5459,
+ "step": 2821
+ },
+ {
+ "epoch": 1.3342789598108746,
+ "grad_norm": 2.960374355316162,
+ "learning_rate": 4.44515293564931e-06,
+ "loss": 0.6092,
+ "step": 2822
+ },
+ {
+ "epoch": 1.3347517730496454,
+ "grad_norm": 2.813671827316284,
+ "learning_rate": 4.444760994153958e-06,
+ "loss": 0.5536,
+ "step": 2823
+ },
+ {
+ "epoch": 1.335224586288416,
+ "grad_norm": 2.7147483825683594,
+ "learning_rate": 4.444368931566867e-06,
+ "loss": 0.5291,
+ "step": 2824
+ },
+ {
+ "epoch": 1.3356973995271868,
+ "grad_norm": 2.710101842880249,
+ "learning_rate": 4.443976747912447e-06,
+ "loss": 0.5138,
+ "step": 2825
+ },
+ {
+ "epoch": 1.3361702127659574,
+ "grad_norm": 2.711419105529785,
+ "learning_rate": 4.443584443215121e-06,
+ "loss": 0.5223,
+ "step": 2826
+ },
+ {
+ "epoch": 1.3366430260047282,
+ "grad_norm": 2.887472152709961,
+ "learning_rate": 4.443192017499313e-06,
+ "loss": 0.5464,
+ "step": 2827
+ },
+ {
+ "epoch": 1.3371158392434987,
+ "grad_norm": 2.8867223262786865,
+ "learning_rate": 4.4427994707894585e-06,
+ "loss": 0.5748,
+ "step": 2828
+ },
+ {
+ "epoch": 1.3375886524822695,
+ "grad_norm": 2.407247543334961,
+ "learning_rate": 4.44240680311e-06,
+ "loss": 0.4727,
+ "step": 2829
+ },
+ {
+ "epoch": 1.3380614657210401,
+ "grad_norm": 2.578420877456665,
+ "learning_rate": 4.4420140144853865e-06,
+ "loss": 0.5129,
+ "step": 2830
+ },
+ {
+ "epoch": 1.338534278959811,
+ "grad_norm": 2.884373426437378,
+ "learning_rate": 4.441621104940077e-06,
+ "loss": 0.5366,
+ "step": 2831
+ },
+ {
+ "epoch": 1.3390070921985815,
+ "grad_norm": 2.8652374744415283,
+ "learning_rate": 4.441228074498534e-06,
+ "loss": 0.5045,
+ "step": 2832
+ },
+ {
+ "epoch": 1.3394799054373523,
+ "grad_norm": 2.5380210876464844,
+ "learning_rate": 4.440834923185231e-06,
+ "loss": 0.509,
+ "step": 2833
+ },
+ {
+ "epoch": 1.3399527186761229,
+ "grad_norm": 2.415734052658081,
+ "learning_rate": 4.440441651024648e-06,
+ "loss": 0.5066,
+ "step": 2834
+ },
+ {
+ "epoch": 1.3404255319148937,
+ "grad_norm": 2.503051996231079,
+ "learning_rate": 4.440048258041272e-06,
+ "loss": 0.5118,
+ "step": 2835
+ },
+ {
+ "epoch": 1.3408983451536642,
+ "grad_norm": 3.351001024246216,
+ "learning_rate": 4.439654744259598e-06,
+ "loss": 0.5758,
+ "step": 2836
+ },
+ {
+ "epoch": 1.341371158392435,
+ "grad_norm": 2.7368781566619873,
+ "learning_rate": 4.439261109704129e-06,
+ "loss": 0.5674,
+ "step": 2837
+ },
+ {
+ "epoch": 1.3418439716312056,
+ "grad_norm": 3.008199453353882,
+ "learning_rate": 4.438867354399372e-06,
+ "loss": 0.5891,
+ "step": 2838
+ },
+ {
+ "epoch": 1.3423167848699764,
+ "grad_norm": 2.538907766342163,
+ "learning_rate": 4.438473478369847e-06,
+ "loss": 0.5102,
+ "step": 2839
+ },
+ {
+ "epoch": 1.342789598108747,
+ "grad_norm": 2.7169063091278076,
+ "learning_rate": 4.438079481640079e-06,
+ "loss": 0.6131,
+ "step": 2840
+ },
+ {
+ "epoch": 1.3432624113475178,
+ "grad_norm": 2.7411608695983887,
+ "learning_rate": 4.437685364234601e-06,
+ "loss": 0.5337,
+ "step": 2841
+ },
+ {
+ "epoch": 1.3437352245862884,
+ "grad_norm": 3.2374939918518066,
+ "learning_rate": 4.43729112617795e-06,
+ "loss": 0.5401,
+ "step": 2842
+ },
+ {
+ "epoch": 1.3442080378250592,
+ "grad_norm": 2.4712226390838623,
+ "learning_rate": 4.436896767494676e-06,
+ "loss": 0.5365,
+ "step": 2843
+ },
+ {
+ "epoch": 1.3446808510638297,
+ "grad_norm": 2.661619186401367,
+ "learning_rate": 4.436502288209334e-06,
+ "loss": 0.4919,
+ "step": 2844
+ },
+ {
+ "epoch": 1.3451536643026005,
+ "grad_norm": 2.5943779945373535,
+ "learning_rate": 4.4361076883464845e-06,
+ "loss": 0.5253,
+ "step": 2845
+ },
+ {
+ "epoch": 1.345626477541371,
+ "grad_norm": 2.672297477722168,
+ "learning_rate": 4.4357129679307e-06,
+ "loss": 0.541,
+ "step": 2846
+ },
+ {
+ "epoch": 1.346099290780142,
+ "grad_norm": 2.6830925941467285,
+ "learning_rate": 4.435318126986557e-06,
+ "loss": 0.5641,
+ "step": 2847
+ },
+ {
+ "epoch": 1.3465721040189125,
+ "grad_norm": 2.7394626140594482,
+ "learning_rate": 4.434923165538639e-06,
+ "loss": 0.5591,
+ "step": 2848
+ },
+ {
+ "epoch": 1.3470449172576833,
+ "grad_norm": 2.9656317234039307,
+ "learning_rate": 4.434528083611541e-06,
+ "loss": 0.515,
+ "step": 2849
+ },
+ {
+ "epoch": 1.3475177304964538,
+ "grad_norm": 3.30155086517334,
+ "learning_rate": 4.434132881229861e-06,
+ "loss": 0.5871,
+ "step": 2850
+ },
+ {
+ "epoch": 1.3479905437352246,
+ "grad_norm": 2.6222476959228516,
+ "learning_rate": 4.433737558418209e-06,
+ "loss": 0.5143,
+ "step": 2851
+ },
+ {
+ "epoch": 1.3484633569739952,
+ "grad_norm": 2.903158187866211,
+ "learning_rate": 4.4333421152011965e-06,
+ "loss": 0.4484,
+ "step": 2852
+ },
+ {
+ "epoch": 1.348936170212766,
+ "grad_norm": 2.863116979598999,
+ "learning_rate": 4.432946551603449e-06,
+ "loss": 0.5213,
+ "step": 2853
+ },
+ {
+ "epoch": 1.3494089834515366,
+ "grad_norm": 2.8253962993621826,
+ "learning_rate": 4.432550867649596e-06,
+ "loss": 0.5713,
+ "step": 2854
+ },
+ {
+ "epoch": 1.3498817966903074,
+ "grad_norm": 2.652493953704834,
+ "learning_rate": 4.432155063364273e-06,
+ "loss": 0.5559,
+ "step": 2855
+ },
+ {
+ "epoch": 1.350354609929078,
+ "grad_norm": 2.4289376735687256,
+ "learning_rate": 4.431759138772127e-06,
+ "loss": 0.5122,
+ "step": 2856
+ },
+ {
+ "epoch": 1.3508274231678488,
+ "grad_norm": 2.6329853534698486,
+ "learning_rate": 4.43136309389781e-06,
+ "loss": 0.5332,
+ "step": 2857
+ },
+ {
+ "epoch": 1.3513002364066193,
+ "grad_norm": 2.431103229522705,
+ "learning_rate": 4.430966928765982e-06,
+ "loss": 0.4863,
+ "step": 2858
+ },
+ {
+ "epoch": 1.3517730496453901,
+ "grad_norm": 2.7529025077819824,
+ "learning_rate": 4.4305706434013106e-06,
+ "loss": 0.5263,
+ "step": 2859
+ },
+ {
+ "epoch": 1.3522458628841607,
+ "grad_norm": 2.884605646133423,
+ "learning_rate": 4.43017423782847e-06,
+ "loss": 0.564,
+ "step": 2860
+ },
+ {
+ "epoch": 1.3527186761229315,
+ "grad_norm": 3.027771234512329,
+ "learning_rate": 4.4297777120721435e-06,
+ "loss": 0.5846,
+ "step": 2861
+ },
+ {
+ "epoch": 1.353191489361702,
+ "grad_norm": 3.0140626430511475,
+ "learning_rate": 4.4293810661570205e-06,
+ "loss": 0.6621,
+ "step": 2862
+ },
+ {
+ "epoch": 1.3536643026004729,
+ "grad_norm": 2.721799612045288,
+ "learning_rate": 4.428984300107799e-06,
+ "loss": 0.5566,
+ "step": 2863
+ },
+ {
+ "epoch": 1.3541371158392435,
+ "grad_norm": 3.0016496181488037,
+ "learning_rate": 4.428587413949183e-06,
+ "loss": 0.5525,
+ "step": 2864
+ },
+ {
+ "epoch": 1.3546099290780143,
+ "grad_norm": 2.77138614654541,
+ "learning_rate": 4.428190407705886e-06,
+ "loss": 0.6016,
+ "step": 2865
+ },
+ {
+ "epoch": 1.3550827423167848,
+ "grad_norm": 2.9783477783203125,
+ "learning_rate": 4.427793281402627e-06,
+ "loss": 0.5556,
+ "step": 2866
+ },
+ {
+ "epoch": 1.3555555555555556,
+ "grad_norm": 2.2490382194519043,
+ "learning_rate": 4.427396035064132e-06,
+ "loss": 0.5138,
+ "step": 2867
+ },
+ {
+ "epoch": 1.3560283687943262,
+ "grad_norm": 2.442225217819214,
+ "learning_rate": 4.426998668715139e-06,
+ "loss": 0.4843,
+ "step": 2868
+ },
+ {
+ "epoch": 1.356501182033097,
+ "grad_norm": 2.74040150642395,
+ "learning_rate": 4.426601182380388e-06,
+ "loss": 0.54,
+ "step": 2869
+ },
+ {
+ "epoch": 1.3569739952718676,
+ "grad_norm": 2.4434332847595215,
+ "learning_rate": 4.426203576084629e-06,
+ "loss": 0.5199,
+ "step": 2870
+ },
+ {
+ "epoch": 1.3574468085106384,
+ "grad_norm": 2.6380388736724854,
+ "learning_rate": 4.42580584985262e-06,
+ "loss": 0.5049,
+ "step": 2871
+ },
+ {
+ "epoch": 1.357919621749409,
+ "grad_norm": 2.7324254512786865,
+ "learning_rate": 4.425408003709125e-06,
+ "loss": 0.5036,
+ "step": 2872
+ },
+ {
+ "epoch": 1.3583924349881797,
+ "grad_norm": 2.661012649536133,
+ "learning_rate": 4.425010037678916e-06,
+ "loss": 0.4965,
+ "step": 2873
+ },
+ {
+ "epoch": 1.3588652482269503,
+ "grad_norm": 2.5380208492279053,
+ "learning_rate": 4.424611951786773e-06,
+ "loss": 0.4293,
+ "step": 2874
+ },
+ {
+ "epoch": 1.3593380614657211,
+ "grad_norm": 2.6060714721679688,
+ "learning_rate": 4.424213746057483e-06,
+ "loss": 0.5335,
+ "step": 2875
+ },
+ {
+ "epoch": 1.3598108747044917,
+ "grad_norm": 2.98282527923584,
+ "learning_rate": 4.423815420515841e-06,
+ "loss": 0.5626,
+ "step": 2876
+ },
+ {
+ "epoch": 1.3602836879432625,
+ "grad_norm": 2.779371500015259,
+ "learning_rate": 4.423416975186647e-06,
+ "loss": 0.5353,
+ "step": 2877
+ },
+ {
+ "epoch": 1.360756501182033,
+ "grad_norm": 2.8033530712127686,
+ "learning_rate": 4.423018410094713e-06,
+ "loss": 0.538,
+ "step": 2878
+ },
+ {
+ "epoch": 1.3612293144208039,
+ "grad_norm": 3.225177764892578,
+ "learning_rate": 4.422619725264855e-06,
+ "loss": 0.5441,
+ "step": 2879
+ },
+ {
+ "epoch": 1.3617021276595744,
+ "grad_norm": 2.959135055541992,
+ "learning_rate": 4.422220920721896e-06,
+ "loss": 0.5293,
+ "step": 2880
+ },
+ {
+ "epoch": 1.3621749408983452,
+ "grad_norm": 2.5558884143829346,
+ "learning_rate": 4.4218219964906704e-06,
+ "loss": 0.442,
+ "step": 2881
+ },
+ {
+ "epoch": 1.3626477541371158,
+ "grad_norm": 2.694899797439575,
+ "learning_rate": 4.421422952596015e-06,
+ "loss": 0.5318,
+ "step": 2882
+ },
+ {
+ "epoch": 1.3631205673758866,
+ "grad_norm": 2.7909531593322754,
+ "learning_rate": 4.421023789062777e-06,
+ "loss": 0.6648,
+ "step": 2883
+ },
+ {
+ "epoch": 1.3635933806146572,
+ "grad_norm": 2.421995162963867,
+ "learning_rate": 4.420624505915813e-06,
+ "loss": 0.4644,
+ "step": 2884
+ },
+ {
+ "epoch": 1.364066193853428,
+ "grad_norm": 2.5876688957214355,
+ "learning_rate": 4.420225103179981e-06,
+ "loss": 0.5743,
+ "step": 2885
+ },
+ {
+ "epoch": 1.3645390070921986,
+ "grad_norm": 2.89341139793396,
+ "learning_rate": 4.419825580880152e-06,
+ "loss": 0.5454,
+ "step": 2886
+ },
+ {
+ "epoch": 1.3650118203309693,
+ "grad_norm": 2.534708261489868,
+ "learning_rate": 4.419425939041203e-06,
+ "loss": 0.5572,
+ "step": 2887
+ },
+ {
+ "epoch": 1.36548463356974,
+ "grad_norm": 2.6052141189575195,
+ "learning_rate": 4.419026177688017e-06,
+ "loss": 0.4763,
+ "step": 2888
+ },
+ {
+ "epoch": 1.3659574468085105,
+ "grad_norm": 2.723720073699951,
+ "learning_rate": 4.4186262968454854e-06,
+ "loss": 0.5659,
+ "step": 2889
+ },
+ {
+ "epoch": 1.3664302600472813,
+ "grad_norm": 2.8909599781036377,
+ "learning_rate": 4.418226296538507e-06,
+ "loss": 0.4996,
+ "step": 2890
+ },
+ {
+ "epoch": 1.366903073286052,
+ "grad_norm": 2.551375389099121,
+ "learning_rate": 4.417826176791988e-06,
+ "loss": 0.5259,
+ "step": 2891
+ },
+ {
+ "epoch": 1.3673758865248227,
+ "grad_norm": 3.360267162322998,
+ "learning_rate": 4.417425937630843e-06,
+ "loss": 0.5381,
+ "step": 2892
+ },
+ {
+ "epoch": 1.3678486997635932,
+ "grad_norm": 2.7611942291259766,
+ "learning_rate": 4.417025579079992e-06,
+ "loss": 0.6022,
+ "step": 2893
+ },
+ {
+ "epoch": 1.368321513002364,
+ "grad_norm": 2.5931224822998047,
+ "learning_rate": 4.416625101164365e-06,
+ "loss": 0.5102,
+ "step": 2894
+ },
+ {
+ "epoch": 1.3687943262411348,
+ "grad_norm": 2.5888102054595947,
+ "learning_rate": 4.416224503908897e-06,
+ "loss": 0.4955,
+ "step": 2895
+ },
+ {
+ "epoch": 1.3692671394799054,
+ "grad_norm": 2.6262896060943604,
+ "learning_rate": 4.41582378733853e-06,
+ "loss": 0.5101,
+ "step": 2896
+ },
+ {
+ "epoch": 1.369739952718676,
+ "grad_norm": 3.339170217514038,
+ "learning_rate": 4.415422951478218e-06,
+ "loss": 0.4939,
+ "step": 2897
+ },
+ {
+ "epoch": 1.3702127659574468,
+ "grad_norm": 2.940866708755493,
+ "learning_rate": 4.415021996352917e-06,
+ "loss": 0.5157,
+ "step": 2898
+ },
+ {
+ "epoch": 1.3706855791962176,
+ "grad_norm": 2.7423818111419678,
+ "learning_rate": 4.414620921987594e-06,
+ "loss": 0.5308,
+ "step": 2899
+ },
+ {
+ "epoch": 1.3711583924349882,
+ "grad_norm": 2.7177040576934814,
+ "learning_rate": 4.414219728407221e-06,
+ "loss": 0.5429,
+ "step": 2900
+ },
+ {
+ "epoch": 1.3716312056737587,
+ "grad_norm": 2.560774087905884,
+ "learning_rate": 4.4138184156367794e-06,
+ "loss": 0.5266,
+ "step": 2901
+ },
+ {
+ "epoch": 1.3721040189125295,
+ "grad_norm": 2.5649116039276123,
+ "learning_rate": 4.413416983701256e-06,
+ "loss": 0.4718,
+ "step": 2902
+ },
+ {
+ "epoch": 1.3725768321513003,
+ "grad_norm": 2.8547167778015137,
+ "learning_rate": 4.413015432625648e-06,
+ "loss": 0.5129,
+ "step": 2903
+ },
+ {
+ "epoch": 1.373049645390071,
+ "grad_norm": 2.5413618087768555,
+ "learning_rate": 4.412613762434958e-06,
+ "loss": 0.5738,
+ "step": 2904
+ },
+ {
+ "epoch": 1.3735224586288415,
+ "grad_norm": 3.3252241611480713,
+ "learning_rate": 4.412211973154195e-06,
+ "loss": 0.5639,
+ "step": 2905
+ },
+ {
+ "epoch": 1.3739952718676123,
+ "grad_norm": 2.869102954864502,
+ "learning_rate": 4.411810064808376e-06,
+ "loss": 0.5384,
+ "step": 2906
+ },
+ {
+ "epoch": 1.374468085106383,
+ "grad_norm": 2.703199863433838,
+ "learning_rate": 4.411408037422529e-06,
+ "loss": 0.5742,
+ "step": 2907
+ },
+ {
+ "epoch": 1.3749408983451537,
+ "grad_norm": 2.685450792312622,
+ "learning_rate": 4.411005891021684e-06,
+ "loss": 0.5121,
+ "step": 2908
+ },
+ {
+ "epoch": 1.3754137115839242,
+ "grad_norm": 2.9572203159332275,
+ "learning_rate": 4.410603625630882e-06,
+ "loss": 0.5444,
+ "step": 2909
+ },
+ {
+ "epoch": 1.375886524822695,
+ "grad_norm": 2.707002878189087,
+ "learning_rate": 4.410201241275169e-06,
+ "loss": 0.5125,
+ "step": 2910
+ },
+ {
+ "epoch": 1.3763593380614658,
+ "grad_norm": 3.0158939361572266,
+ "learning_rate": 4.409798737979602e-06,
+ "loss": 0.5299,
+ "step": 2911
+ },
+ {
+ "epoch": 1.3768321513002364,
+ "grad_norm": 2.7932698726654053,
+ "learning_rate": 4.4093961157692415e-06,
+ "loss": 0.5437,
+ "step": 2912
+ },
+ {
+ "epoch": 1.377304964539007,
+ "grad_norm": 2.459510326385498,
+ "learning_rate": 4.408993374669156e-06,
+ "loss": 0.5548,
+ "step": 2913
+ },
+ {
+ "epoch": 1.3777777777777778,
+ "grad_norm": 2.7500696182250977,
+ "learning_rate": 4.408590514704425e-06,
+ "loss": 0.5186,
+ "step": 2914
+ },
+ {
+ "epoch": 1.3782505910165486,
+ "grad_norm": 2.7824268341064453,
+ "learning_rate": 4.4081875359001315e-06,
+ "loss": 0.4762,
+ "step": 2915
+ },
+ {
+ "epoch": 1.3787234042553191,
+ "grad_norm": 2.4202158451080322,
+ "learning_rate": 4.4077844382813675e-06,
+ "loss": 0.5005,
+ "step": 2916
+ },
+ {
+ "epoch": 1.3791962174940897,
+ "grad_norm": 2.5566670894622803,
+ "learning_rate": 4.4073812218732316e-06,
+ "loss": 0.5377,
+ "step": 2917
+ },
+ {
+ "epoch": 1.3796690307328605,
+ "grad_norm": 3.400874376296997,
+ "learning_rate": 4.406977886700831e-06,
+ "loss": 0.6637,
+ "step": 2918
+ },
+ {
+ "epoch": 1.3801418439716313,
+ "grad_norm": 2.8187878131866455,
+ "learning_rate": 4.406574432789278e-06,
+ "loss": 0.5033,
+ "step": 2919
+ },
+ {
+ "epoch": 1.3806146572104019,
+ "grad_norm": 2.5578041076660156,
+ "learning_rate": 4.406170860163697e-06,
+ "loss": 0.5293,
+ "step": 2920
+ },
+ {
+ "epoch": 1.3810874704491725,
+ "grad_norm": 2.6709718704223633,
+ "learning_rate": 4.405767168849213e-06,
+ "loss": 0.5144,
+ "step": 2921
+ },
+ {
+ "epoch": 1.3815602836879433,
+ "grad_norm": 3.049365997314453,
+ "learning_rate": 4.405363358870965e-06,
+ "loss": 0.4894,
+ "step": 2922
+ },
+ {
+ "epoch": 1.382033096926714,
+ "grad_norm": 2.5569891929626465,
+ "learning_rate": 4.404959430254095e-06,
+ "loss": 0.4929,
+ "step": 2923
+ },
+ {
+ "epoch": 1.3825059101654846,
+ "grad_norm": 2.8288230895996094,
+ "learning_rate": 4.404555383023754e-06,
+ "loss": 0.5438,
+ "step": 2924
+ },
+ {
+ "epoch": 1.3829787234042552,
+ "grad_norm": 2.8363358974456787,
+ "learning_rate": 4.404151217205102e-06,
+ "loss": 0.545,
+ "step": 2925
+ },
+ {
+ "epoch": 1.383451536643026,
+ "grad_norm": 2.720972776412964,
+ "learning_rate": 4.403746932823302e-06,
+ "loss": 0.5732,
+ "step": 2926
+ },
+ {
+ "epoch": 1.3839243498817968,
+ "grad_norm": 2.728043794631958,
+ "learning_rate": 4.403342529903528e-06,
+ "loss": 0.4944,
+ "step": 2927
+ },
+ {
+ "epoch": 1.3843971631205674,
+ "grad_norm": 2.4366135597229004,
+ "learning_rate": 4.402938008470961e-06,
+ "loss": 0.4441,
+ "step": 2928
+ },
+ {
+ "epoch": 1.384869976359338,
+ "grad_norm": 2.858454704284668,
+ "learning_rate": 4.402533368550788e-06,
+ "loss": 0.5359,
+ "step": 2929
+ },
+ {
+ "epoch": 1.3853427895981087,
+ "grad_norm": 2.805795907974243,
+ "learning_rate": 4.402128610168205e-06,
+ "loss": 0.4954,
+ "step": 2930
+ },
+ {
+ "epoch": 1.3858156028368795,
+ "grad_norm": 3.3514177799224854,
+ "learning_rate": 4.401723733348413e-06,
+ "loss": 0.579,
+ "step": 2931
+ },
+ {
+ "epoch": 1.3862884160756501,
+ "grad_norm": 2.6255125999450684,
+ "learning_rate": 4.401318738116624e-06,
+ "loss": 0.5002,
+ "step": 2932
+ },
+ {
+ "epoch": 1.3867612293144207,
+ "grad_norm": 2.3480796813964844,
+ "learning_rate": 4.400913624498054e-06,
+ "loss": 0.4688,
+ "step": 2933
+ },
+ {
+ "epoch": 1.3872340425531915,
+ "grad_norm": 2.710165023803711,
+ "learning_rate": 4.400508392517927e-06,
+ "loss": 0.5099,
+ "step": 2934
+ },
+ {
+ "epoch": 1.3877068557919623,
+ "grad_norm": 2.5820295810699463,
+ "learning_rate": 4.400103042201477e-06,
+ "loss": 0.512,
+ "step": 2935
+ },
+ {
+ "epoch": 1.3881796690307329,
+ "grad_norm": 2.750596523284912,
+ "learning_rate": 4.399697573573942e-06,
+ "loss": 0.463,
+ "step": 2936
+ },
+ {
+ "epoch": 1.3886524822695034,
+ "grad_norm": 3.497537612915039,
+ "learning_rate": 4.399291986660569e-06,
+ "loss": 0.5676,
+ "step": 2937
+ },
+ {
+ "epoch": 1.3891252955082742,
+ "grad_norm": 2.4046003818511963,
+ "learning_rate": 4.398886281486612e-06,
+ "loss": 0.5408,
+ "step": 2938
+ },
+ {
+ "epoch": 1.389598108747045,
+ "grad_norm": 2.941606283187866,
+ "learning_rate": 4.398480458077332e-06,
+ "loss": 0.5734,
+ "step": 2939
+ },
+ {
+ "epoch": 1.3900709219858156,
+ "grad_norm": 3.030214309692383,
+ "learning_rate": 4.398074516458e-06,
+ "loss": 0.5353,
+ "step": 2940
+ },
+ {
+ "epoch": 1.3905437352245862,
+ "grad_norm": 2.9991626739501953,
+ "learning_rate": 4.397668456653889e-06,
+ "loss": 0.5989,
+ "step": 2941
+ },
+ {
+ "epoch": 1.391016548463357,
+ "grad_norm": 4.163141250610352,
+ "learning_rate": 4.397262278690285e-06,
+ "loss": 0.5436,
+ "step": 2942
+ },
+ {
+ "epoch": 1.3914893617021278,
+ "grad_norm": 2.6576037406921387,
+ "learning_rate": 4.396855982592478e-06,
+ "loss": 0.5206,
+ "step": 2943
+ },
+ {
+ "epoch": 1.3919621749408984,
+ "grad_norm": 2.7729203701019287,
+ "learning_rate": 4.396449568385768e-06,
+ "loss": 0.5403,
+ "step": 2944
+ },
+ {
+ "epoch": 1.392434988179669,
+ "grad_norm": 2.4560446739196777,
+ "learning_rate": 4.396043036095457e-06,
+ "loss": 0.4924,
+ "step": 2945
+ },
+ {
+ "epoch": 1.3929078014184397,
+ "grad_norm": 2.6370556354522705,
+ "learning_rate": 4.39563638574686e-06,
+ "loss": 0.5543,
+ "step": 2946
+ },
+ {
+ "epoch": 1.3933806146572105,
+ "grad_norm": 2.593914270401001,
+ "learning_rate": 4.395229617365298e-06,
+ "loss": 0.5133,
+ "step": 2947
+ },
+ {
+ "epoch": 1.393853427895981,
+ "grad_norm": 2.3583998680114746,
+ "learning_rate": 4.394822730976099e-06,
+ "loss": 0.4436,
+ "step": 2948
+ },
+ {
+ "epoch": 1.3943262411347517,
+ "grad_norm": 3.2768537998199463,
+ "learning_rate": 4.394415726604596e-06,
+ "loss": 0.5489,
+ "step": 2949
+ },
+ {
+ "epoch": 1.3947990543735225,
+ "grad_norm": 2.88662052154541,
+ "learning_rate": 4.394008604276133e-06,
+ "loss": 0.5194,
+ "step": 2950
+ },
+ {
+ "epoch": 1.3952718676122933,
+ "grad_norm": 2.46610426902771,
+ "learning_rate": 4.393601364016059e-06,
+ "loss": 0.5255,
+ "step": 2951
+ },
+ {
+ "epoch": 1.3957446808510638,
+ "grad_norm": 3.122509241104126,
+ "learning_rate": 4.393194005849731e-06,
+ "loss": 0.6046,
+ "step": 2952
+ },
+ {
+ "epoch": 1.3962174940898344,
+ "grad_norm": 2.724926471710205,
+ "learning_rate": 4.392786529802513e-06,
+ "loss": 0.4958,
+ "step": 2953
+ },
+ {
+ "epoch": 1.3966903073286052,
+ "grad_norm": 2.491485595703125,
+ "learning_rate": 4.3923789358997785e-06,
+ "loss": 0.5209,
+ "step": 2954
+ },
+ {
+ "epoch": 1.397163120567376,
+ "grad_norm": 2.61110520362854,
+ "learning_rate": 4.3919712241669056e-06,
+ "loss": 0.5202,
+ "step": 2955
+ },
+ {
+ "epoch": 1.3976359338061466,
+ "grad_norm": 2.3814501762390137,
+ "learning_rate": 4.39156339462928e-06,
+ "loss": 0.4966,
+ "step": 2956
+ },
+ {
+ "epoch": 1.3981087470449172,
+ "grad_norm": 2.762498617172241,
+ "learning_rate": 4.391155447312296e-06,
+ "loss": 0.6025,
+ "step": 2957
+ },
+ {
+ "epoch": 1.398581560283688,
+ "grad_norm": 2.964975595474243,
+ "learning_rate": 4.390747382241355e-06,
+ "loss": 0.4845,
+ "step": 2958
+ },
+ {
+ "epoch": 1.3990543735224588,
+ "grad_norm": 3.0117249488830566,
+ "learning_rate": 4.3903391994418655e-06,
+ "loss": 0.5326,
+ "step": 2959
+ },
+ {
+ "epoch": 1.3995271867612293,
+ "grad_norm": 2.578626871109009,
+ "learning_rate": 4.389930898939243e-06,
+ "loss": 0.5271,
+ "step": 2960
+ },
+ {
+ "epoch": 1.4,
+ "grad_norm": 2.747441053390503,
+ "learning_rate": 4.38952248075891e-06,
+ "loss": 0.5553,
+ "step": 2961
+ },
+ {
+ "epoch": 1.4004728132387707,
+ "grad_norm": 2.8273086547851562,
+ "learning_rate": 4.389113944926297e-06,
+ "loss": 0.5475,
+ "step": 2962
+ },
+ {
+ "epoch": 1.4009456264775415,
+ "grad_norm": 2.55238676071167,
+ "learning_rate": 4.388705291466843e-06,
+ "loss": 0.4864,
+ "step": 2963
+ },
+ {
+ "epoch": 1.401418439716312,
+ "grad_norm": 2.597214460372925,
+ "learning_rate": 4.388296520405992e-06,
+ "loss": 0.4845,
+ "step": 2964
+ },
+ {
+ "epoch": 1.4018912529550827,
+ "grad_norm": 2.608962297439575,
+ "learning_rate": 4.387887631769196e-06,
+ "loss": 0.5544,
+ "step": 2965
+ },
+ {
+ "epoch": 1.4023640661938535,
+ "grad_norm": 2.2754876613616943,
+ "learning_rate": 4.3874786255819165e-06,
+ "loss": 0.5045,
+ "step": 2966
+ },
+ {
+ "epoch": 1.4028368794326243,
+ "grad_norm": 2.9900264739990234,
+ "learning_rate": 4.387069501869618e-06,
+ "loss": 0.562,
+ "step": 2967
+ },
+ {
+ "epoch": 1.4033096926713948,
+ "grad_norm": 2.8069417476654053,
+ "learning_rate": 4.386660260657778e-06,
+ "loss": 0.5284,
+ "step": 2968
+ },
+ {
+ "epoch": 1.4037825059101654,
+ "grad_norm": 2.68894624710083,
+ "learning_rate": 4.386250901971875e-06,
+ "loss": 0.5879,
+ "step": 2969
+ },
+ {
+ "epoch": 1.4042553191489362,
+ "grad_norm": 2.614485025405884,
+ "learning_rate": 4.385841425837399e-06,
+ "loss": 0.4771,
+ "step": 2970
+ },
+ {
+ "epoch": 1.4047281323877068,
+ "grad_norm": 2.487950325012207,
+ "learning_rate": 4.385431832279848e-06,
+ "loss": 0.5552,
+ "step": 2971
+ },
+ {
+ "epoch": 1.4052009456264776,
+ "grad_norm": 2.5098392963409424,
+ "learning_rate": 4.385022121324723e-06,
+ "loss": 0.5267,
+ "step": 2972
+ },
+ {
+ "epoch": 1.4056737588652481,
+ "grad_norm": 2.825838565826416,
+ "learning_rate": 4.384612292997537e-06,
+ "loss": 0.5336,
+ "step": 2973
+ },
+ {
+ "epoch": 1.406146572104019,
+ "grad_norm": 2.898188829421997,
+ "learning_rate": 4.384202347323806e-06,
+ "loss": 0.5685,
+ "step": 2974
+ },
+ {
+ "epoch": 1.4066193853427895,
+ "grad_norm": 2.8722569942474365,
+ "learning_rate": 4.383792284329057e-06,
+ "loss": 0.5977,
+ "step": 2975
+ },
+ {
+ "epoch": 1.4070921985815603,
+ "grad_norm": 2.832951307296753,
+ "learning_rate": 4.3833821040388235e-06,
+ "loss": 0.5766,
+ "step": 2976
+ },
+ {
+ "epoch": 1.407565011820331,
+ "grad_norm": 2.7353670597076416,
+ "learning_rate": 4.3829718064786446e-06,
+ "loss": 0.5461,
+ "step": 2977
+ },
+ {
+ "epoch": 1.4080378250591017,
+ "grad_norm": 2.6050429344177246,
+ "learning_rate": 4.3825613916740675e-06,
+ "loss": 0.5501,
+ "step": 2978
+ },
+ {
+ "epoch": 1.4085106382978723,
+ "grad_norm": 2.79719877243042,
+ "learning_rate": 4.382150859650647e-06,
+ "loss": 0.502,
+ "step": 2979
+ },
+ {
+ "epoch": 1.408983451536643,
+ "grad_norm": 2.5538079738616943,
+ "learning_rate": 4.381740210433946e-06,
+ "loss": 0.4762,
+ "step": 2980
+ },
+ {
+ "epoch": 1.4094562647754136,
+ "grad_norm": 2.7256062030792236,
+ "learning_rate": 4.381329444049533e-06,
+ "loss": 0.4692,
+ "step": 2981
+ },
+ {
+ "epoch": 1.4099290780141844,
+ "grad_norm": 2.7778146266937256,
+ "learning_rate": 4.3809185605229855e-06,
+ "loss": 0.5366,
+ "step": 2982
+ },
+ {
+ "epoch": 1.410401891252955,
+ "grad_norm": 2.6289451122283936,
+ "learning_rate": 4.380507559879887e-06,
+ "loss": 0.5412,
+ "step": 2983
+ },
+ {
+ "epoch": 1.4108747044917258,
+ "grad_norm": 2.697204828262329,
+ "learning_rate": 4.380096442145827e-06,
+ "loss": 0.5065,
+ "step": 2984
+ },
+ {
+ "epoch": 1.4113475177304964,
+ "grad_norm": 2.4709219932556152,
+ "learning_rate": 4.379685207346407e-06,
+ "loss": 0.568,
+ "step": 2985
+ },
+ {
+ "epoch": 1.4118203309692672,
+ "grad_norm": 2.9740655422210693,
+ "learning_rate": 4.379273855507231e-06,
+ "loss": 0.5512,
+ "step": 2986
+ },
+ {
+ "epoch": 1.4122931442080378,
+ "grad_norm": 3.0090627670288086,
+ "learning_rate": 4.378862386653911e-06,
+ "loss": 0.5459,
+ "step": 2987
+ },
+ {
+ "epoch": 1.4127659574468086,
+ "grad_norm": 2.8835368156433105,
+ "learning_rate": 4.378450800812071e-06,
+ "loss": 0.5357,
+ "step": 2988
+ },
+ {
+ "epoch": 1.4132387706855791,
+ "grad_norm": 2.558824062347412,
+ "learning_rate": 4.378039098007335e-06,
+ "loss": 0.536,
+ "step": 2989
+ },
+ {
+ "epoch": 1.41371158392435,
+ "grad_norm": 2.5572092533111572,
+ "learning_rate": 4.377627278265339e-06,
+ "loss": 0.5183,
+ "step": 2990
+ },
+ {
+ "epoch": 1.4141843971631205,
+ "grad_norm": 2.7356579303741455,
+ "learning_rate": 4.377215341611727e-06,
+ "loss": 0.5087,
+ "step": 2991
+ },
+ {
+ "epoch": 1.4146572104018913,
+ "grad_norm": 2.7541024684906006,
+ "learning_rate": 4.376803288072146e-06,
+ "loss": 0.4509,
+ "step": 2992
+ },
+ {
+ "epoch": 1.4151300236406619,
+ "grad_norm": 2.7548446655273438,
+ "learning_rate": 4.376391117672254e-06,
+ "loss": 0.5532,
+ "step": 2993
+ },
+ {
+ "epoch": 1.4156028368794327,
+ "grad_norm": 2.9107465744018555,
+ "learning_rate": 4.375978830437715e-06,
+ "loss": 0.5719,
+ "step": 2994
+ },
+ {
+ "epoch": 1.4160756501182032,
+ "grad_norm": 2.7077393531799316,
+ "learning_rate": 4.3755664263942e-06,
+ "loss": 0.5084,
+ "step": 2995
+ },
+ {
+ "epoch": 1.416548463356974,
+ "grad_norm": 2.764209270477295,
+ "learning_rate": 4.375153905567388e-06,
+ "loss": 0.5976,
+ "step": 2996
+ },
+ {
+ "epoch": 1.4170212765957446,
+ "grad_norm": 2.7792932987213135,
+ "learning_rate": 4.374741267982964e-06,
+ "loss": 0.5358,
+ "step": 2997
+ },
+ {
+ "epoch": 1.4174940898345154,
+ "grad_norm": 2.459212064743042,
+ "learning_rate": 4.374328513666622e-06,
+ "loss": 0.5181,
+ "step": 2998
+ },
+ {
+ "epoch": 1.417966903073286,
+ "grad_norm": 2.548546552658081,
+ "learning_rate": 4.373915642644062e-06,
+ "loss": 0.528,
+ "step": 2999
+ },
+ {
+ "epoch": 1.4184397163120568,
+ "grad_norm": 2.998138189315796,
+ "learning_rate": 4.373502654940992e-06,
+ "loss": 0.5233,
+ "step": 3000
+ },
+ {
+ "epoch": 1.4189125295508274,
+ "grad_norm": 2.604341983795166,
+ "learning_rate": 4.373089550583126e-06,
+ "loss": 0.5274,
+ "step": 3001
+ },
+ {
+ "epoch": 1.4193853427895982,
+ "grad_norm": 2.6792588233947754,
+ "learning_rate": 4.372676329596188e-06,
+ "loss": 0.5061,
+ "step": 3002
+ },
+ {
+ "epoch": 1.4198581560283687,
+ "grad_norm": 2.5182368755340576,
+ "learning_rate": 4.372262992005906e-06,
+ "loss": 0.541,
+ "step": 3003
+ },
+ {
+ "epoch": 1.4203309692671395,
+ "grad_norm": 2.690718173980713,
+ "learning_rate": 4.371849537838018e-06,
+ "loss": 0.5308,
+ "step": 3004
+ },
+ {
+ "epoch": 1.42080378250591,
+ "grad_norm": 2.6797590255737305,
+ "learning_rate": 4.371435967118266e-06,
+ "loss": 0.5728,
+ "step": 3005
+ },
+ {
+ "epoch": 1.421276595744681,
+ "grad_norm": 2.847900152206421,
+ "learning_rate": 4.371022279872403e-06,
+ "loss": 0.5053,
+ "step": 3006
+ },
+ {
+ "epoch": 1.4217494089834515,
+ "grad_norm": 2.497810125350952,
+ "learning_rate": 4.370608476126186e-06,
+ "loss": 0.5057,
+ "step": 3007
+ },
+ {
+ "epoch": 1.4222222222222223,
+ "grad_norm": 2.5259225368499756,
+ "learning_rate": 4.370194555905382e-06,
+ "loss": 0.5508,
+ "step": 3008
+ },
+ {
+ "epoch": 1.4226950354609929,
+ "grad_norm": 2.774118423461914,
+ "learning_rate": 4.369780519235763e-06,
+ "loss": 0.5419,
+ "step": 3009
+ },
+ {
+ "epoch": 1.4231678486997636,
+ "grad_norm": 2.2764663696289062,
+ "learning_rate": 4.369366366143111e-06,
+ "loss": 0.5032,
+ "step": 3010
+ },
+ {
+ "epoch": 1.4236406619385342,
+ "grad_norm": 2.736347198486328,
+ "learning_rate": 4.368952096653211e-06,
+ "loss": 0.5184,
+ "step": 3011
+ },
+ {
+ "epoch": 1.424113475177305,
+ "grad_norm": 2.476762056350708,
+ "learning_rate": 4.36853771079186e-06,
+ "loss": 0.5331,
+ "step": 3012
+ },
+ {
+ "epoch": 1.4245862884160756,
+ "grad_norm": 2.8006162643432617,
+ "learning_rate": 4.3681232085848585e-06,
+ "loss": 0.5331,
+ "step": 3013
+ },
+ {
+ "epoch": 1.4250591016548464,
+ "grad_norm": 2.509143590927124,
+ "learning_rate": 4.367708590058016e-06,
+ "loss": 0.5127,
+ "step": 3014
+ },
+ {
+ "epoch": 1.425531914893617,
+ "grad_norm": 3.030137538909912,
+ "learning_rate": 4.3672938552371505e-06,
+ "loss": 0.5555,
+ "step": 3015
+ },
+ {
+ "epoch": 1.4260047281323878,
+ "grad_norm": 3.0536904335021973,
+ "learning_rate": 4.3668790041480835e-06,
+ "loss": 0.5241,
+ "step": 3016
+ },
+ {
+ "epoch": 1.4264775413711583,
+ "grad_norm": 2.6400439739227295,
+ "learning_rate": 4.366464036816647e-06,
+ "loss": 0.4946,
+ "step": 3017
+ },
+ {
+ "epoch": 1.4269503546099291,
+ "grad_norm": 2.7302589416503906,
+ "learning_rate": 4.366048953268679e-06,
+ "loss": 0.5105,
+ "step": 3018
+ },
+ {
+ "epoch": 1.4274231678486997,
+ "grad_norm": 2.504549264907837,
+ "learning_rate": 4.365633753530026e-06,
+ "loss": 0.4844,
+ "step": 3019
+ },
+ {
+ "epoch": 1.4278959810874705,
+ "grad_norm": 2.3872320652008057,
+ "learning_rate": 4.365218437626539e-06,
+ "loss": 0.4402,
+ "step": 3020
+ },
+ {
+ "epoch": 1.428368794326241,
+ "grad_norm": 2.531649351119995,
+ "learning_rate": 4.364803005584078e-06,
+ "loss": 0.4913,
+ "step": 3021
+ },
+ {
+ "epoch": 1.4288416075650119,
+ "grad_norm": 2.4683783054351807,
+ "learning_rate": 4.364387457428512e-06,
+ "loss": 0.515,
+ "step": 3022
+ },
+ {
+ "epoch": 1.4293144208037825,
+ "grad_norm": 2.632336378097534,
+ "learning_rate": 4.363971793185713e-06,
+ "loss": 0.5398,
+ "step": 3023
+ },
+ {
+ "epoch": 1.4297872340425533,
+ "grad_norm": 2.7456719875335693,
+ "learning_rate": 4.363556012881565e-06,
+ "loss": 0.5254,
+ "step": 3024
+ },
+ {
+ "epoch": 1.4302600472813238,
+ "grad_norm": 2.607177972793579,
+ "learning_rate": 4.363140116541955e-06,
+ "loss": 0.5266,
+ "step": 3025
+ },
+ {
+ "epoch": 1.4307328605200946,
+ "grad_norm": 2.640127420425415,
+ "learning_rate": 4.3627241041927796e-06,
+ "loss": 0.5157,
+ "step": 3026
+ },
+ {
+ "epoch": 1.4312056737588652,
+ "grad_norm": 2.4210736751556396,
+ "learning_rate": 4.362307975859941e-06,
+ "loss": 0.4599,
+ "step": 3027
+ },
+ {
+ "epoch": 1.431678486997636,
+ "grad_norm": 2.6007790565490723,
+ "learning_rate": 4.361891731569352e-06,
+ "loss": 0.5298,
+ "step": 3028
+ },
+ {
+ "epoch": 1.4321513002364066,
+ "grad_norm": 2.5352046489715576,
+ "learning_rate": 4.361475371346928e-06,
+ "loss": 0.5128,
+ "step": 3029
+ },
+ {
+ "epoch": 1.4326241134751774,
+ "grad_norm": 2.4204049110412598,
+ "learning_rate": 4.361058895218596e-06,
+ "loss": 0.4669,
+ "step": 3030
+ },
+ {
+ "epoch": 1.433096926713948,
+ "grad_norm": 2.525240182876587,
+ "learning_rate": 4.360642303210286e-06,
+ "loss": 0.4925,
+ "step": 3031
+ },
+ {
+ "epoch": 1.4335697399527187,
+ "grad_norm": 2.839646339416504,
+ "learning_rate": 4.360225595347939e-06,
+ "loss": 0.5868,
+ "step": 3032
+ },
+ {
+ "epoch": 1.4340425531914893,
+ "grad_norm": 2.5043296813964844,
+ "learning_rate": 4.359808771657501e-06,
+ "loss": 0.4951,
+ "step": 3033
+ },
+ {
+ "epoch": 1.4345153664302601,
+ "grad_norm": 2.9082300662994385,
+ "learning_rate": 4.359391832164927e-06,
+ "loss": 0.5259,
+ "step": 3034
+ },
+ {
+ "epoch": 1.4349881796690307,
+ "grad_norm": 2.6651999950408936,
+ "learning_rate": 4.3589747768961745e-06,
+ "loss": 0.537,
+ "step": 3035
+ },
+ {
+ "epoch": 1.4354609929078015,
+ "grad_norm": 2.577077865600586,
+ "learning_rate": 4.358557605877216e-06,
+ "loss": 0.5186,
+ "step": 3036
+ },
+ {
+ "epoch": 1.435933806146572,
+ "grad_norm": 2.7445287704467773,
+ "learning_rate": 4.3581403191340236e-06,
+ "loss": 0.5573,
+ "step": 3037
+ },
+ {
+ "epoch": 1.4364066193853429,
+ "grad_norm": 2.502086639404297,
+ "learning_rate": 4.357722916692582e-06,
+ "loss": 0.5039,
+ "step": 3038
+ },
+ {
+ "epoch": 1.4368794326241134,
+ "grad_norm": 2.4476163387298584,
+ "learning_rate": 4.357305398578879e-06,
+ "loss": 0.5638,
+ "step": 3039
+ },
+ {
+ "epoch": 1.4373522458628842,
+ "grad_norm": 2.7705588340759277,
+ "learning_rate": 4.356887764818915e-06,
+ "loss": 0.5485,
+ "step": 3040
+ },
+ {
+ "epoch": 1.4378250591016548,
+ "grad_norm": 2.498225450515747,
+ "learning_rate": 4.356470015438691e-06,
+ "loss": 0.5486,
+ "step": 3041
+ },
+ {
+ "epoch": 1.4382978723404256,
+ "grad_norm": 2.394320011138916,
+ "learning_rate": 4.356052150464219e-06,
+ "loss": 0.512,
+ "step": 3042
+ },
+ {
+ "epoch": 1.4387706855791962,
+ "grad_norm": 2.8725767135620117,
+ "learning_rate": 4.3556341699215185e-06,
+ "loss": 0.5202,
+ "step": 3043
+ },
+ {
+ "epoch": 1.439243498817967,
+ "grad_norm": 3.1707918643951416,
+ "learning_rate": 4.355216073836615e-06,
+ "loss": 0.5229,
+ "step": 3044
+ },
+ {
+ "epoch": 1.4397163120567376,
+ "grad_norm": 2.532578468322754,
+ "learning_rate": 4.3547978622355415e-06,
+ "loss": 0.4569,
+ "step": 3045
+ },
+ {
+ "epoch": 1.4401891252955084,
+ "grad_norm": 3.0111029148101807,
+ "learning_rate": 4.354379535144338e-06,
+ "loss": 0.5801,
+ "step": 3046
+ },
+ {
+ "epoch": 1.440661938534279,
+ "grad_norm": 2.9554224014282227,
+ "learning_rate": 4.353961092589052e-06,
+ "loss": 0.5968,
+ "step": 3047
+ },
+ {
+ "epoch": 1.4411347517730497,
+ "grad_norm": 2.7562637329101562,
+ "learning_rate": 4.353542534595738e-06,
+ "loss": 0.5005,
+ "step": 3048
+ },
+ {
+ "epoch": 1.4416075650118203,
+ "grad_norm": 3.083254337310791,
+ "learning_rate": 4.3531238611904595e-06,
+ "loss": 0.5389,
+ "step": 3049
+ },
+ {
+ "epoch": 1.442080378250591,
+ "grad_norm": 2.7778005599975586,
+ "learning_rate": 4.352705072399282e-06,
+ "loss": 0.5342,
+ "step": 3050
+ },
+ {
+ "epoch": 1.4425531914893617,
+ "grad_norm": 2.6673996448516846,
+ "learning_rate": 4.3522861682482845e-06,
+ "loss": 0.5213,
+ "step": 3051
+ },
+ {
+ "epoch": 1.4430260047281322,
+ "grad_norm": 2.637605905532837,
+ "learning_rate": 4.351867148763548e-06,
+ "loss": 0.4893,
+ "step": 3052
+ },
+ {
+ "epoch": 1.443498817966903,
+ "grad_norm": 2.834469795227051,
+ "learning_rate": 4.351448013971166e-06,
+ "loss": 0.5391,
+ "step": 3053
+ },
+ {
+ "epoch": 1.4439716312056738,
+ "grad_norm": 2.824153184890747,
+ "learning_rate": 4.351028763897234e-06,
+ "loss": 0.6403,
+ "step": 3054
+ },
+ {
+ "epoch": 1.4444444444444444,
+ "grad_norm": 2.558966875076294,
+ "learning_rate": 4.350609398567857e-06,
+ "loss": 0.4912,
+ "step": 3055
+ },
+ {
+ "epoch": 1.444917257683215,
+ "grad_norm": 2.281726360321045,
+ "learning_rate": 4.3501899180091475e-06,
+ "loss": 0.4655,
+ "step": 3056
+ },
+ {
+ "epoch": 1.4453900709219858,
+ "grad_norm": 2.499472141265869,
+ "learning_rate": 4.349770322247225e-06,
+ "loss": 0.4878,
+ "step": 3057
+ },
+ {
+ "epoch": 1.4458628841607566,
+ "grad_norm": 2.578615188598633,
+ "learning_rate": 4.349350611308215e-06,
+ "loss": 0.4855,
+ "step": 3058
+ },
+ {
+ "epoch": 1.4463356973995272,
+ "grad_norm": 2.7111165523529053,
+ "learning_rate": 4.348930785218252e-06,
+ "loss": 0.5415,
+ "step": 3059
+ },
+ {
+ "epoch": 1.4468085106382977,
+ "grad_norm": 2.8081610202789307,
+ "learning_rate": 4.348510844003476e-06,
+ "loss": 0.4881,
+ "step": 3060
+ },
+ {
+ "epoch": 1.4472813238770685,
+ "grad_norm": 2.9439868927001953,
+ "learning_rate": 4.348090787690036e-06,
+ "loss": 0.5485,
+ "step": 3061
+ },
+ {
+ "epoch": 1.4477541371158393,
+ "grad_norm": 2.592532157897949,
+ "learning_rate": 4.347670616304085e-06,
+ "loss": 0.4912,
+ "step": 3062
+ },
+ {
+ "epoch": 1.44822695035461,
+ "grad_norm": 2.960592746734619,
+ "learning_rate": 4.347250329871787e-06,
+ "loss": 0.5473,
+ "step": 3063
+ },
+ {
+ "epoch": 1.4486997635933805,
+ "grad_norm": 2.5786688327789307,
+ "learning_rate": 4.3468299284193116e-06,
+ "loss": 0.5348,
+ "step": 3064
+ },
+ {
+ "epoch": 1.4491725768321513,
+ "grad_norm": 2.6084046363830566,
+ "learning_rate": 4.346409411972834e-06,
+ "loss": 0.527,
+ "step": 3065
+ },
+ {
+ "epoch": 1.449645390070922,
+ "grad_norm": 2.489748239517212,
+ "learning_rate": 4.3459887805585385e-06,
+ "loss": 0.4943,
+ "step": 3066
+ },
+ {
+ "epoch": 1.4501182033096927,
+ "grad_norm": 2.452131986618042,
+ "learning_rate": 4.345568034202617e-06,
+ "loss": 0.4886,
+ "step": 3067
+ },
+ {
+ "epoch": 1.4505910165484632,
+ "grad_norm": 2.4034671783447266,
+ "learning_rate": 4.345147172931266e-06,
+ "loss": 0.4689,
+ "step": 3068
+ },
+ {
+ "epoch": 1.451063829787234,
+ "grad_norm": 2.6045448780059814,
+ "learning_rate": 4.344726196770691e-06,
+ "loss": 0.5842,
+ "step": 3069
+ },
+ {
+ "epoch": 1.4515366430260048,
+ "grad_norm": 2.697593927383423,
+ "learning_rate": 4.3443051057471045e-06,
+ "loss": 0.5358,
+ "step": 3070
+ },
+ {
+ "epoch": 1.4520094562647754,
+ "grad_norm": 2.6080820560455322,
+ "learning_rate": 4.343883899886727e-06,
+ "loss": 0.5361,
+ "step": 3071
+ },
+ {
+ "epoch": 1.452482269503546,
+ "grad_norm": 2.4605307579040527,
+ "learning_rate": 4.343462579215783e-06,
+ "loss": 0.4941,
+ "step": 3072
+ },
+ {
+ "epoch": 1.4529550827423168,
+ "grad_norm": 2.8025355339050293,
+ "learning_rate": 4.343041143760509e-06,
+ "loss": 0.5116,
+ "step": 3073
+ },
+ {
+ "epoch": 1.4534278959810876,
+ "grad_norm": 2.432515859603882,
+ "learning_rate": 4.3426195935471434e-06,
+ "loss": 0.4991,
+ "step": 3074
+ },
+ {
+ "epoch": 1.4539007092198581,
+ "grad_norm": 2.5838661193847656,
+ "learning_rate": 4.342197928601935e-06,
+ "loss": 0.4994,
+ "step": 3075
+ },
+ {
+ "epoch": 1.4543735224586287,
+ "grad_norm": 2.421692371368408,
+ "learning_rate": 4.341776148951141e-06,
+ "loss": 0.4945,
+ "step": 3076
+ },
+ {
+ "epoch": 1.4548463356973995,
+ "grad_norm": 2.5354676246643066,
+ "learning_rate": 4.341354254621021e-06,
+ "loss": 0.4859,
+ "step": 3077
+ },
+ {
+ "epoch": 1.4553191489361703,
+ "grad_norm": 2.7316789627075195,
+ "learning_rate": 4.340932245637846e-06,
+ "loss": 0.5136,
+ "step": 3078
+ },
+ {
+ "epoch": 1.455791962174941,
+ "grad_norm": 3.5903496742248535,
+ "learning_rate": 4.340510122027891e-06,
+ "loss": 0.6451,
+ "step": 3079
+ },
+ {
+ "epoch": 1.4562647754137115,
+ "grad_norm": 2.95190167427063,
+ "learning_rate": 4.340087883817442e-06,
+ "loss": 0.6354,
+ "step": 3080
+ },
+ {
+ "epoch": 1.4567375886524823,
+ "grad_norm": 2.8659214973449707,
+ "learning_rate": 4.339665531032789e-06,
+ "loss": 0.5514,
+ "step": 3081
+ },
+ {
+ "epoch": 1.457210401891253,
+ "grad_norm": 2.5681674480438232,
+ "learning_rate": 4.339243063700231e-06,
+ "loss": 0.5135,
+ "step": 3082
+ },
+ {
+ "epoch": 1.4576832151300236,
+ "grad_norm": 2.7353906631469727,
+ "learning_rate": 4.338820481846072e-06,
+ "loss": 0.4608,
+ "step": 3083
+ },
+ {
+ "epoch": 1.4581560283687942,
+ "grad_norm": 2.6116466522216797,
+ "learning_rate": 4.3383977854966245e-06,
+ "loss": 0.4924,
+ "step": 3084
+ },
+ {
+ "epoch": 1.458628841607565,
+ "grad_norm": 2.6676487922668457,
+ "learning_rate": 4.337974974678207e-06,
+ "loss": 0.5747,
+ "step": 3085
+ },
+ {
+ "epoch": 1.4591016548463358,
+ "grad_norm": 2.909031629562378,
+ "learning_rate": 4.337552049417147e-06,
+ "loss": 0.4618,
+ "step": 3086
+ },
+ {
+ "epoch": 1.4595744680851064,
+ "grad_norm": 2.7614190578460693,
+ "learning_rate": 4.33712900973978e-06,
+ "loss": 0.5154,
+ "step": 3087
+ },
+ {
+ "epoch": 1.460047281323877,
+ "grad_norm": 2.452188014984131,
+ "learning_rate": 4.336705855672444e-06,
+ "loss": 0.542,
+ "step": 3088
+ },
+ {
+ "epoch": 1.4605200945626478,
+ "grad_norm": 3.0004117488861084,
+ "learning_rate": 4.336282587241488e-06,
+ "loss": 0.5857,
+ "step": 3089
+ },
+ {
+ "epoch": 1.4609929078014185,
+ "grad_norm": 2.870783567428589,
+ "learning_rate": 4.335859204473268e-06,
+ "loss": 0.5506,
+ "step": 3090
+ },
+ {
+ "epoch": 1.4614657210401891,
+ "grad_norm": 3.1078689098358154,
+ "learning_rate": 4.335435707394145e-06,
+ "loss": 0.5138,
+ "step": 3091
+ },
+ {
+ "epoch": 1.4619385342789597,
+ "grad_norm": 2.8516197204589844,
+ "learning_rate": 4.335012096030488e-06,
+ "loss": 0.5842,
+ "step": 3092
+ },
+ {
+ "epoch": 1.4624113475177305,
+ "grad_norm": 2.615922212600708,
+ "learning_rate": 4.334588370408675e-06,
+ "loss": 0.4896,
+ "step": 3093
+ },
+ {
+ "epoch": 1.4628841607565013,
+ "grad_norm": 3.1911802291870117,
+ "learning_rate": 4.334164530555088e-06,
+ "loss": 0.4974,
+ "step": 3094
+ },
+ {
+ "epoch": 1.4633569739952719,
+ "grad_norm": 3.075051784515381,
+ "learning_rate": 4.3337405764961186e-06,
+ "loss": 0.567,
+ "step": 3095
+ },
+ {
+ "epoch": 1.4638297872340424,
+ "grad_norm": 2.550625801086426,
+ "learning_rate": 4.333316508258163e-06,
+ "loss": 0.4887,
+ "step": 3096
+ },
+ {
+ "epoch": 1.4643026004728132,
+ "grad_norm": 2.3986475467681885,
+ "learning_rate": 4.332892325867629e-06,
+ "loss": 0.5047,
+ "step": 3097
+ },
+ {
+ "epoch": 1.464775413711584,
+ "grad_norm": 2.5045125484466553,
+ "learning_rate": 4.332468029350926e-06,
+ "loss": 0.4721,
+ "step": 3098
+ },
+ {
+ "epoch": 1.4652482269503546,
+ "grad_norm": 2.347365617752075,
+ "learning_rate": 4.332043618734474e-06,
+ "loss": 0.4913,
+ "step": 3099
+ },
+ {
+ "epoch": 1.4657210401891252,
+ "grad_norm": 2.459928512573242,
+ "learning_rate": 4.331619094044699e-06,
+ "loss": 0.523,
+ "step": 3100
+ },
+ {
+ "epoch": 1.466193853427896,
+ "grad_norm": 2.5771310329437256,
+ "learning_rate": 4.331194455308035e-06,
+ "loss": 0.593,
+ "step": 3101
+ },
+ {
+ "epoch": 1.4666666666666668,
+ "grad_norm": 3.1351823806762695,
+ "learning_rate": 4.330769702550921e-06,
+ "loss": 0.5852,
+ "step": 3102
+ },
+ {
+ "epoch": 1.4671394799054374,
+ "grad_norm": 2.589817523956299,
+ "learning_rate": 4.330344835799806e-06,
+ "loss": 0.508,
+ "step": 3103
+ },
+ {
+ "epoch": 1.467612293144208,
+ "grad_norm": 3.1140341758728027,
+ "learning_rate": 4.329919855081144e-06,
+ "loss": 0.469,
+ "step": 3104
+ },
+ {
+ "epoch": 1.4680851063829787,
+ "grad_norm": 2.8186635971069336,
+ "learning_rate": 4.329494760421396e-06,
+ "loss": 0.5088,
+ "step": 3105
+ },
+ {
+ "epoch": 1.4685579196217495,
+ "grad_norm": 2.676077365875244,
+ "learning_rate": 4.329069551847031e-06,
+ "loss": 0.52,
+ "step": 3106
+ },
+ {
+ "epoch": 1.46903073286052,
+ "grad_norm": 2.5543313026428223,
+ "learning_rate": 4.328644229384526e-06,
+ "loss": 0.5066,
+ "step": 3107
+ },
+ {
+ "epoch": 1.4695035460992907,
+ "grad_norm": 2.8176217079162598,
+ "learning_rate": 4.328218793060362e-06,
+ "loss": 0.6404,
+ "step": 3108
+ },
+ {
+ "epoch": 1.4699763593380615,
+ "grad_norm": 2.485217332839966,
+ "learning_rate": 4.3277932429010314e-06,
+ "loss": 0.4578,
+ "step": 3109
+ },
+ {
+ "epoch": 1.4704491725768323,
+ "grad_norm": 2.6741621494293213,
+ "learning_rate": 4.327367578933031e-06,
+ "loss": 0.5068,
+ "step": 3110
+ },
+ {
+ "epoch": 1.4709219858156029,
+ "grad_norm": 2.377242088317871,
+ "learning_rate": 4.326941801182863e-06,
+ "loss": 0.5249,
+ "step": 3111
+ },
+ {
+ "epoch": 1.4713947990543734,
+ "grad_norm": 2.790046215057373,
+ "learning_rate": 4.32651590967704e-06,
+ "loss": 0.5532,
+ "step": 3112
+ },
+ {
+ "epoch": 1.4718676122931442,
+ "grad_norm": 2.78019642829895,
+ "learning_rate": 4.326089904442081e-06,
+ "loss": 0.5362,
+ "step": 3113
+ },
+ {
+ "epoch": 1.472340425531915,
+ "grad_norm": 2.5661380290985107,
+ "learning_rate": 4.32566378550451e-06,
+ "loss": 0.5041,
+ "step": 3114
+ },
+ {
+ "epoch": 1.4728132387706856,
+ "grad_norm": 2.522153615951538,
+ "learning_rate": 4.3252375528908605e-06,
+ "loss": 0.5074,
+ "step": 3115
+ },
+ {
+ "epoch": 1.4732860520094562,
+ "grad_norm": 2.874688148498535,
+ "learning_rate": 4.3248112066276725e-06,
+ "loss": 0.59,
+ "step": 3116
+ },
+ {
+ "epoch": 1.473758865248227,
+ "grad_norm": 3.067866802215576,
+ "learning_rate": 4.324384746741492e-06,
+ "loss": 0.5924,
+ "step": 3117
+ },
+ {
+ "epoch": 1.4742316784869978,
+ "grad_norm": 3.359463930130005,
+ "learning_rate": 4.323958173258873e-06,
+ "loss": 0.6346,
+ "step": 3118
+ },
+ {
+ "epoch": 1.4747044917257683,
+ "grad_norm": 2.193024158477783,
+ "learning_rate": 4.323531486206376e-06,
+ "loss": 0.4594,
+ "step": 3119
+ },
+ {
+ "epoch": 1.475177304964539,
+ "grad_norm": 2.886889934539795,
+ "learning_rate": 4.323104685610569e-06,
+ "loss": 0.523,
+ "step": 3120
+ },
+ {
+ "epoch": 1.4756501182033097,
+ "grad_norm": 2.7558681964874268,
+ "learning_rate": 4.322677771498028e-06,
+ "loss": 0.5387,
+ "step": 3121
+ },
+ {
+ "epoch": 1.4761229314420805,
+ "grad_norm": 2.639277935028076,
+ "learning_rate": 4.322250743895335e-06,
+ "loss": 0.5599,
+ "step": 3122
+ },
+ {
+ "epoch": 1.476595744680851,
+ "grad_norm": 2.786198616027832,
+ "learning_rate": 4.321823602829078e-06,
+ "loss": 0.5405,
+ "step": 3123
+ },
+ {
+ "epoch": 1.4770685579196217,
+ "grad_norm": 2.582315683364868,
+ "learning_rate": 4.321396348325853e-06,
+ "loss": 0.4452,
+ "step": 3124
+ },
+ {
+ "epoch": 1.4775413711583925,
+ "grad_norm": 2.8574297428131104,
+ "learning_rate": 4.320968980412265e-06,
+ "loss": 0.4846,
+ "step": 3125
+ },
+ {
+ "epoch": 1.4780141843971633,
+ "grad_norm": 2.705281972885132,
+ "learning_rate": 4.320541499114922e-06,
+ "loss": 0.5548,
+ "step": 3126
+ },
+ {
+ "epoch": 1.4784869976359338,
+ "grad_norm": 2.3152754306793213,
+ "learning_rate": 4.320113904460444e-06,
+ "loss": 0.5216,
+ "step": 3127
+ },
+ {
+ "epoch": 1.4789598108747044,
+ "grad_norm": 3.230764150619507,
+ "learning_rate": 4.319686196475453e-06,
+ "loss": 0.6192,
+ "step": 3128
+ },
+ {
+ "epoch": 1.4794326241134752,
+ "grad_norm": 2.463380813598633,
+ "learning_rate": 4.319258375186583e-06,
+ "loss": 0.4872,
+ "step": 3129
+ },
+ {
+ "epoch": 1.479905437352246,
+ "grad_norm": 2.8477656841278076,
+ "learning_rate": 4.31883044062047e-06,
+ "loss": 0.5371,
+ "step": 3130
+ },
+ {
+ "epoch": 1.4803782505910166,
+ "grad_norm": 2.393911123275757,
+ "learning_rate": 4.318402392803762e-06,
+ "loss": 0.5334,
+ "step": 3131
+ },
+ {
+ "epoch": 1.4808510638297872,
+ "grad_norm": 2.6113736629486084,
+ "learning_rate": 4.317974231763109e-06,
+ "loss": 0.5572,
+ "step": 3132
+ },
+ {
+ "epoch": 1.481323877068558,
+ "grad_norm": 2.3941731452941895,
+ "learning_rate": 4.317545957525173e-06,
+ "loss": 0.4849,
+ "step": 3133
+ },
+ {
+ "epoch": 1.4817966903073285,
+ "grad_norm": 2.9536755084991455,
+ "learning_rate": 4.317117570116619e-06,
+ "loss": 0.6058,
+ "step": 3134
+ },
+ {
+ "epoch": 1.4822695035460993,
+ "grad_norm": 2.595754623413086,
+ "learning_rate": 4.316689069564123e-06,
+ "loss": 0.5193,
+ "step": 3135
+ },
+ {
+ "epoch": 1.48274231678487,
+ "grad_norm": 2.569833993911743,
+ "learning_rate": 4.316260455894364e-06,
+ "loss": 0.543,
+ "step": 3136
+ },
+ {
+ "epoch": 1.4832151300236407,
+ "grad_norm": 2.5137455463409424,
+ "learning_rate": 4.315831729134031e-06,
+ "loss": 0.5415,
+ "step": 3137
+ },
+ {
+ "epoch": 1.4836879432624113,
+ "grad_norm": 2.5582292079925537,
+ "learning_rate": 4.3154028893098176e-06,
+ "loss": 0.5338,
+ "step": 3138
+ },
+ {
+ "epoch": 1.484160756501182,
+ "grad_norm": 2.666426181793213,
+ "learning_rate": 4.3149739364484265e-06,
+ "loss": 0.5435,
+ "step": 3139
+ },
+ {
+ "epoch": 1.4846335697399526,
+ "grad_norm": 2.790851354598999,
+ "learning_rate": 4.314544870576568e-06,
+ "loss": 0.5746,
+ "step": 3140
+ },
+ {
+ "epoch": 1.4851063829787234,
+ "grad_norm": 2.620326042175293,
+ "learning_rate": 4.314115691720956e-06,
+ "loss": 0.5076,
+ "step": 3141
+ },
+ {
+ "epoch": 1.485579196217494,
+ "grad_norm": 3.075674533843994,
+ "learning_rate": 4.313686399908314e-06,
+ "loss": 0.5486,
+ "step": 3142
+ },
+ {
+ "epoch": 1.4860520094562648,
+ "grad_norm": 3.1347315311431885,
+ "learning_rate": 4.3132569951653745e-06,
+ "loss": 0.531,
+ "step": 3143
+ },
+ {
+ "epoch": 1.4865248226950354,
+ "grad_norm": 2.5783653259277344,
+ "learning_rate": 4.312827477518871e-06,
+ "loss": 0.5818,
+ "step": 3144
+ },
+ {
+ "epoch": 1.4869976359338062,
+ "grad_norm": 3.0247137546539307,
+ "learning_rate": 4.3123978469955505e-06,
+ "loss": 0.5347,
+ "step": 3145
+ },
+ {
+ "epoch": 1.4874704491725768,
+ "grad_norm": 2.4789345264434814,
+ "learning_rate": 4.311968103622163e-06,
+ "loss": 0.5,
+ "step": 3146
+ },
+ {
+ "epoch": 1.4879432624113476,
+ "grad_norm": 2.663341522216797,
+ "learning_rate": 4.311538247425466e-06,
+ "loss": 0.4825,
+ "step": 3147
+ },
+ {
+ "epoch": 1.4884160756501181,
+ "grad_norm": 2.633711099624634,
+ "learning_rate": 4.311108278432226e-06,
+ "loss": 0.5244,
+ "step": 3148
+ },
+ {
+ "epoch": 1.488888888888889,
+ "grad_norm": 2.51312518119812,
+ "learning_rate": 4.310678196669216e-06,
+ "loss": 0.513,
+ "step": 3149
+ },
+ {
+ "epoch": 1.4893617021276595,
+ "grad_norm": 2.5263755321502686,
+ "learning_rate": 4.310248002163214e-06,
+ "loss": 0.5236,
+ "step": 3150
+ },
+ {
+ "epoch": 1.4898345153664303,
+ "grad_norm": 2.559216260910034,
+ "learning_rate": 4.309817694941007e-06,
+ "loss": 0.5107,
+ "step": 3151
+ },
+ {
+ "epoch": 1.4903073286052009,
+ "grad_norm": 2.5023303031921387,
+ "learning_rate": 4.309387275029386e-06,
+ "loss": 0.4685,
+ "step": 3152
+ },
+ {
+ "epoch": 1.4907801418439717,
+ "grad_norm": 3.0314254760742188,
+ "learning_rate": 4.308956742455155e-06,
+ "loss": 0.5462,
+ "step": 3153
+ },
+ {
+ "epoch": 1.4912529550827422,
+ "grad_norm": 2.675295114517212,
+ "learning_rate": 4.308526097245119e-06,
+ "loss": 0.5398,
+ "step": 3154
+ },
+ {
+ "epoch": 1.491725768321513,
+ "grad_norm": 2.6613399982452393,
+ "learning_rate": 4.308095339426094e-06,
+ "loss": 0.5376,
+ "step": 3155
+ },
+ {
+ "epoch": 1.4921985815602836,
+ "grad_norm": 2.58937668800354,
+ "learning_rate": 4.307664469024899e-06,
+ "loss": 0.5385,
+ "step": 3156
+ },
+ {
+ "epoch": 1.4926713947990544,
+ "grad_norm": 2.583631992340088,
+ "learning_rate": 4.3072334860683655e-06,
+ "loss": 0.4927,
+ "step": 3157
+ },
+ {
+ "epoch": 1.493144208037825,
+ "grad_norm": 2.5889222621917725,
+ "learning_rate": 4.306802390583327e-06,
+ "loss": 0.47,
+ "step": 3158
+ },
+ {
+ "epoch": 1.4936170212765958,
+ "grad_norm": 2.9362716674804688,
+ "learning_rate": 4.3063711825966244e-06,
+ "loss": 0.4902,
+ "step": 3159
+ },
+ {
+ "epoch": 1.4940898345153664,
+ "grad_norm": 2.5385425090789795,
+ "learning_rate": 4.305939862135111e-06,
+ "loss": 0.5396,
+ "step": 3160
+ },
+ {
+ "epoch": 1.4945626477541372,
+ "grad_norm": 2.776326894760132,
+ "learning_rate": 4.305508429225641e-06,
+ "loss": 0.5169,
+ "step": 3161
+ },
+ {
+ "epoch": 1.4950354609929077,
+ "grad_norm": 2.575063467025757,
+ "learning_rate": 4.305076883895076e-06,
+ "loss": 0.4938,
+ "step": 3162
+ },
+ {
+ "epoch": 1.4955082742316785,
+ "grad_norm": 2.7552313804626465,
+ "learning_rate": 4.304645226170291e-06,
+ "loss": 0.6211,
+ "step": 3163
+ },
+ {
+ "epoch": 1.4959810874704491,
+ "grad_norm": 2.57149338722229,
+ "learning_rate": 4.30421345607816e-06,
+ "loss": 0.5241,
+ "step": 3164
+ },
+ {
+ "epoch": 1.49645390070922,
+ "grad_norm": 2.8142426013946533,
+ "learning_rate": 4.303781573645568e-06,
+ "loss": 0.5699,
+ "step": 3165
+ },
+ {
+ "epoch": 1.4969267139479905,
+ "grad_norm": 2.6344845294952393,
+ "learning_rate": 4.303349578899407e-06,
+ "loss": 0.5049,
+ "step": 3166
+ },
+ {
+ "epoch": 1.4973995271867613,
+ "grad_norm": 2.554410934448242,
+ "learning_rate": 4.302917471866575e-06,
+ "loss": 0.4404,
+ "step": 3167
+ },
+ {
+ "epoch": 1.4978723404255319,
+ "grad_norm": 2.896240711212158,
+ "learning_rate": 4.302485252573978e-06,
+ "loss": 0.602,
+ "step": 3168
+ },
+ {
+ "epoch": 1.4983451536643027,
+ "grad_norm": 2.4044477939605713,
+ "learning_rate": 4.302052921048527e-06,
+ "loss": 0.4857,
+ "step": 3169
+ },
+ {
+ "epoch": 1.4988179669030732,
+ "grad_norm": 2.7447879314422607,
+ "learning_rate": 4.301620477317144e-06,
+ "loss": 0.5438,
+ "step": 3170
+ },
+ {
+ "epoch": 1.499290780141844,
+ "grad_norm": 2.851820945739746,
+ "learning_rate": 4.301187921406752e-06,
+ "loss": 0.5245,
+ "step": 3171
+ },
+ {
+ "epoch": 1.4997635933806146,
+ "grad_norm": 3.247114419937134,
+ "learning_rate": 4.300755253344287e-06,
+ "loss": 0.504,
+ "step": 3172
+ },
+ {
+ "epoch": 1.5002364066193854,
+ "grad_norm": 3.117490291595459,
+ "learning_rate": 4.300322473156688e-06,
+ "loss": 0.4627,
+ "step": 3173
+ },
+ {
+ "epoch": 1.500709219858156,
+ "grad_norm": 2.558319330215454,
+ "learning_rate": 4.299889580870904e-06,
+ "loss": 0.5721,
+ "step": 3174
+ },
+ {
+ "epoch": 1.5011820330969265,
+ "grad_norm": 2.8983113765716553,
+ "learning_rate": 4.2994565765138865e-06,
+ "loss": 0.5257,
+ "step": 3175
+ },
+ {
+ "epoch": 1.5016548463356973,
+ "grad_norm": 2.744056463241577,
+ "learning_rate": 4.299023460112599e-06,
+ "loss": 0.4892,
+ "step": 3176
+ },
+ {
+ "epoch": 1.5021276595744681,
+ "grad_norm": 2.5506751537323,
+ "learning_rate": 4.29859023169401e-06,
+ "loss": 0.4933,
+ "step": 3177
+ },
+ {
+ "epoch": 1.5026004728132387,
+ "grad_norm": 2.842615842819214,
+ "learning_rate": 4.298156891285092e-06,
+ "loss": 0.6124,
+ "step": 3178
+ },
+ {
+ "epoch": 1.5030732860520093,
+ "grad_norm": 2.5355329513549805,
+ "learning_rate": 4.2977234389128305e-06,
+ "loss": 0.641,
+ "step": 3179
+ },
+ {
+ "epoch": 1.50354609929078,
+ "grad_norm": 2.674781084060669,
+ "learning_rate": 4.297289874604213e-06,
+ "loss": 0.475,
+ "step": 3180
+ },
+ {
+ "epoch": 1.5040189125295509,
+ "grad_norm": 2.6845548152923584,
+ "learning_rate": 4.296856198386235e-06,
+ "loss": 0.5328,
+ "step": 3181
+ },
+ {
+ "epoch": 1.5044917257683215,
+ "grad_norm": 2.9686241149902344,
+ "learning_rate": 4.296422410285902e-06,
+ "loss": 0.6216,
+ "step": 3182
+ },
+ {
+ "epoch": 1.504964539007092,
+ "grad_norm": 2.5095980167388916,
+ "learning_rate": 4.295988510330222e-06,
+ "loss": 0.4993,
+ "step": 3183
+ },
+ {
+ "epoch": 1.5054373522458628,
+ "grad_norm": 2.4906392097473145,
+ "learning_rate": 4.2955544985462125e-06,
+ "loss": 0.4795,
+ "step": 3184
+ },
+ {
+ "epoch": 1.5059101654846336,
+ "grad_norm": 2.5593366622924805,
+ "learning_rate": 4.295120374960897e-06,
+ "loss": 0.5527,
+ "step": 3185
+ },
+ {
+ "epoch": 1.5063829787234042,
+ "grad_norm": 2.691495180130005,
+ "learning_rate": 4.294686139601308e-06,
+ "loss": 0.5646,
+ "step": 3186
+ },
+ {
+ "epoch": 1.5068557919621748,
+ "grad_norm": 2.74320387840271,
+ "learning_rate": 4.294251792494483e-06,
+ "loss": 0.6149,
+ "step": 3187
+ },
+ {
+ "epoch": 1.5073286052009456,
+ "grad_norm": 2.8827052116394043,
+ "learning_rate": 4.293817333667465e-06,
+ "loss": 0.5414,
+ "step": 3188
+ },
+ {
+ "epoch": 1.5078014184397164,
+ "grad_norm": 2.5652425289154053,
+ "learning_rate": 4.293382763147308e-06,
+ "loss": 0.5006,
+ "step": 3189
+ },
+ {
+ "epoch": 1.508274231678487,
+ "grad_norm": 2.729295253753662,
+ "learning_rate": 4.29294808096107e-06,
+ "loss": 0.522,
+ "step": 3190
+ },
+ {
+ "epoch": 1.5087470449172575,
+ "grad_norm": 2.348118305206299,
+ "learning_rate": 4.292513287135817e-06,
+ "loss": 0.4125,
+ "step": 3191
+ },
+ {
+ "epoch": 1.5092198581560283,
+ "grad_norm": 2.809551954269409,
+ "learning_rate": 4.292078381698621e-06,
+ "loss": 0.5577,
+ "step": 3192
+ },
+ {
+ "epoch": 1.5096926713947991,
+ "grad_norm": 2.6925361156463623,
+ "learning_rate": 4.291643364676563e-06,
+ "loss": 0.62,
+ "step": 3193
+ },
+ {
+ "epoch": 1.5101654846335697,
+ "grad_norm": 2.4200620651245117,
+ "learning_rate": 4.291208236096729e-06,
+ "loss": 0.5464,
+ "step": 3194
+ },
+ {
+ "epoch": 1.5106382978723403,
+ "grad_norm": 2.5659191608428955,
+ "learning_rate": 4.290772995986211e-06,
+ "loss": 0.5402,
+ "step": 3195
+ },
+ {
+ "epoch": 1.511111111111111,
+ "grad_norm": 2.3877315521240234,
+ "learning_rate": 4.290337644372113e-06,
+ "loss": 0.463,
+ "step": 3196
+ },
+ {
+ "epoch": 1.5115839243498819,
+ "grad_norm": 2.7063233852386475,
+ "learning_rate": 4.289902181281538e-06,
+ "loss": 0.5253,
+ "step": 3197
+ },
+ {
+ "epoch": 1.5120567375886524,
+ "grad_norm": 2.56788969039917,
+ "learning_rate": 4.289466606741603e-06,
+ "loss": 0.5012,
+ "step": 3198
+ },
+ {
+ "epoch": 1.512529550827423,
+ "grad_norm": 2.637164831161499,
+ "learning_rate": 4.28903092077943e-06,
+ "loss": 0.5236,
+ "step": 3199
+ },
+ {
+ "epoch": 1.5130023640661938,
+ "grad_norm": 2.767526865005493,
+ "learning_rate": 4.288595123422146e-06,
+ "loss": 0.5832,
+ "step": 3200
+ },
+ {
+ "epoch": 1.5134751773049646,
+ "grad_norm": 2.33365535736084,
+ "learning_rate": 4.2881592146968866e-06,
+ "loss": 0.4548,
+ "step": 3201
+ },
+ {
+ "epoch": 1.5139479905437352,
+ "grad_norm": 2.544189453125,
+ "learning_rate": 4.287723194630793e-06,
+ "loss": 0.5115,
+ "step": 3202
+ },
+ {
+ "epoch": 1.5144208037825058,
+ "grad_norm": 2.588793992996216,
+ "learning_rate": 4.2872870632510155e-06,
+ "loss": 0.4766,
+ "step": 3203
+ },
+ {
+ "epoch": 1.5148936170212766,
+ "grad_norm": 2.5382184982299805,
+ "learning_rate": 4.286850820584709e-06,
+ "loss": 0.5401,
+ "step": 3204
+ },
+ {
+ "epoch": 1.5153664302600474,
+ "grad_norm": 2.597930669784546,
+ "learning_rate": 4.286414466659038e-06,
+ "loss": 0.5346,
+ "step": 3205
+ },
+ {
+ "epoch": 1.515839243498818,
+ "grad_norm": 2.8522393703460693,
+ "learning_rate": 4.28597800150117e-06,
+ "loss": 0.486,
+ "step": 3206
+ },
+ {
+ "epoch": 1.5163120567375885,
+ "grad_norm": 2.4801454544067383,
+ "learning_rate": 4.285541425138285e-06,
+ "loss": 0.5162,
+ "step": 3207
+ },
+ {
+ "epoch": 1.5167848699763593,
+ "grad_norm": 2.353665351867676,
+ "learning_rate": 4.285104737597563e-06,
+ "loss": 0.5066,
+ "step": 3208
+ },
+ {
+ "epoch": 1.51725768321513,
+ "grad_norm": 2.767976760864258,
+ "learning_rate": 4.2846679389061975e-06,
+ "loss": 0.5331,
+ "step": 3209
+ },
+ {
+ "epoch": 1.5177304964539007,
+ "grad_norm": 2.9307682514190674,
+ "learning_rate": 4.284231029091385e-06,
+ "loss": 0.5291,
+ "step": 3210
+ },
+ {
+ "epoch": 1.5182033096926713,
+ "grad_norm": 2.39719820022583,
+ "learning_rate": 4.283794008180329e-06,
+ "loss": 0.4759,
+ "step": 3211
+ },
+ {
+ "epoch": 1.518676122931442,
+ "grad_norm": 2.452244758605957,
+ "learning_rate": 4.283356876200242e-06,
+ "loss": 0.4283,
+ "step": 3212
+ },
+ {
+ "epoch": 1.5191489361702128,
+ "grad_norm": 2.4911608695983887,
+ "learning_rate": 4.282919633178343e-06,
+ "loss": 0.4812,
+ "step": 3213
+ },
+ {
+ "epoch": 1.5196217494089834,
+ "grad_norm": 2.5813944339752197,
+ "learning_rate": 4.282482279141856e-06,
+ "loss": 0.4911,
+ "step": 3214
+ },
+ {
+ "epoch": 1.520094562647754,
+ "grad_norm": 2.503542184829712,
+ "learning_rate": 4.282044814118013e-06,
+ "loss": 0.4969,
+ "step": 3215
+ },
+ {
+ "epoch": 1.5205673758865248,
+ "grad_norm": 2.5090713500976562,
+ "learning_rate": 4.281607238134053e-06,
+ "loss": 0.5293,
+ "step": 3216
+ },
+ {
+ "epoch": 1.5210401891252956,
+ "grad_norm": 2.425994396209717,
+ "learning_rate": 4.281169551217223e-06,
+ "loss": 0.5365,
+ "step": 3217
+ },
+ {
+ "epoch": 1.5215130023640662,
+ "grad_norm": 2.637655258178711,
+ "learning_rate": 4.2807317533947765e-06,
+ "loss": 0.5589,
+ "step": 3218
+ },
+ {
+ "epoch": 1.5219858156028367,
+ "grad_norm": 2.9335296154022217,
+ "learning_rate": 4.28029384469397e-06,
+ "loss": 0.6071,
+ "step": 3219
+ },
+ {
+ "epoch": 1.5224586288416075,
+ "grad_norm": 2.898683547973633,
+ "learning_rate": 4.279855825142073e-06,
+ "loss": 0.5392,
+ "step": 3220
+ },
+ {
+ "epoch": 1.5229314420803783,
+ "grad_norm": 2.613914966583252,
+ "learning_rate": 4.279417694766359e-06,
+ "loss": 0.4968,
+ "step": 3221
+ },
+ {
+ "epoch": 1.523404255319149,
+ "grad_norm": 2.500682830810547,
+ "learning_rate": 4.278979453594106e-06,
+ "loss": 0.471,
+ "step": 3222
+ },
+ {
+ "epoch": 1.5238770685579195,
+ "grad_norm": 2.5269598960876465,
+ "learning_rate": 4.278541101652605e-06,
+ "loss": 0.471,
+ "step": 3223
+ },
+ {
+ "epoch": 1.5243498817966903,
+ "grad_norm": 2.8153114318847656,
+ "learning_rate": 4.2781026389691465e-06,
+ "loss": 0.5742,
+ "step": 3224
+ },
+ {
+ "epoch": 1.524822695035461,
+ "grad_norm": 2.5648019313812256,
+ "learning_rate": 4.277664065571034e-06,
+ "loss": 0.5315,
+ "step": 3225
+ },
+ {
+ "epoch": 1.5252955082742317,
+ "grad_norm": 2.778355836868286,
+ "learning_rate": 4.277225381485575e-06,
+ "loss": 0.5543,
+ "step": 3226
+ },
+ {
+ "epoch": 1.5257683215130022,
+ "grad_norm": 2.6736745834350586,
+ "learning_rate": 4.2767865867400846e-06,
+ "loss": 0.4947,
+ "step": 3227
+ },
+ {
+ "epoch": 1.526241134751773,
+ "grad_norm": 2.9560294151306152,
+ "learning_rate": 4.276347681361884e-06,
+ "loss": 0.5835,
+ "step": 3228
+ },
+ {
+ "epoch": 1.5267139479905438,
+ "grad_norm": 2.5580296516418457,
+ "learning_rate": 4.275908665378302e-06,
+ "loss": 0.4751,
+ "step": 3229
+ },
+ {
+ "epoch": 1.5271867612293144,
+ "grad_norm": 3.0705175399780273,
+ "learning_rate": 4.2754695388166755e-06,
+ "loss": 0.5327,
+ "step": 3230
+ },
+ {
+ "epoch": 1.527659574468085,
+ "grad_norm": 2.664652109146118,
+ "learning_rate": 4.275030301704346e-06,
+ "loss": 0.4934,
+ "step": 3231
+ },
+ {
+ "epoch": 1.5281323877068558,
+ "grad_norm": 2.308499813079834,
+ "learning_rate": 4.274590954068663e-06,
+ "loss": 0.4412,
+ "step": 3232
+ },
+ {
+ "epoch": 1.5286052009456266,
+ "grad_norm": 2.871189594268799,
+ "learning_rate": 4.2741514959369815e-06,
+ "loss": 0.5001,
+ "step": 3233
+ },
+ {
+ "epoch": 1.5290780141843971,
+ "grad_norm": 2.5274453163146973,
+ "learning_rate": 4.273711927336666e-06,
+ "loss": 0.4938,
+ "step": 3234
+ },
+ {
+ "epoch": 1.5295508274231677,
+ "grad_norm": 2.8848133087158203,
+ "learning_rate": 4.273272248295087e-06,
+ "loss": 0.5397,
+ "step": 3235
+ },
+ {
+ "epoch": 1.5300236406619385,
+ "grad_norm": 2.3927090167999268,
+ "learning_rate": 4.27283245883962e-06,
+ "loss": 0.5497,
+ "step": 3236
+ },
+ {
+ "epoch": 1.5304964539007093,
+ "grad_norm": 2.5413873195648193,
+ "learning_rate": 4.27239255899765e-06,
+ "loss": 0.5108,
+ "step": 3237
+ },
+ {
+ "epoch": 1.53096926713948,
+ "grad_norm": 2.7692389488220215,
+ "learning_rate": 4.271952548796567e-06,
+ "loss": 0.5768,
+ "step": 3238
+ },
+ {
+ "epoch": 1.5314420803782505,
+ "grad_norm": 2.4621126651763916,
+ "learning_rate": 4.271512428263768e-06,
+ "loss": 0.4698,
+ "step": 3239
+ },
+ {
+ "epoch": 1.5319148936170213,
+ "grad_norm": 2.6423375606536865,
+ "learning_rate": 4.271072197426659e-06,
+ "loss": 0.4929,
+ "step": 3240
+ },
+ {
+ "epoch": 1.532387706855792,
+ "grad_norm": 2.7097692489624023,
+ "learning_rate": 4.270631856312649e-06,
+ "loss": 0.4836,
+ "step": 3241
+ },
+ {
+ "epoch": 1.5328605200945626,
+ "grad_norm": 2.545706272125244,
+ "learning_rate": 4.270191404949158e-06,
+ "loss": 0.4636,
+ "step": 3242
+ },
+ {
+ "epoch": 1.5333333333333332,
+ "grad_norm": 3.138781785964966,
+ "learning_rate": 4.26975084336361e-06,
+ "loss": 0.5988,
+ "step": 3243
+ },
+ {
+ "epoch": 1.533806146572104,
+ "grad_norm": 2.492715835571289,
+ "learning_rate": 4.269310171583438e-06,
+ "loss": 0.5095,
+ "step": 3244
+ },
+ {
+ "epoch": 1.5342789598108748,
+ "grad_norm": 2.5705838203430176,
+ "learning_rate": 4.268869389636077e-06,
+ "loss": 0.4818,
+ "step": 3245
+ },
+ {
+ "epoch": 1.5347517730496454,
+ "grad_norm": 2.7633554935455322,
+ "learning_rate": 4.268428497548979e-06,
+ "loss": 0.547,
+ "step": 3246
+ },
+ {
+ "epoch": 1.535224586288416,
+ "grad_norm": 2.654528856277466,
+ "learning_rate": 4.2679874953495905e-06,
+ "loss": 0.5261,
+ "step": 3247
+ },
+ {
+ "epoch": 1.5356973995271868,
+ "grad_norm": 2.5039751529693604,
+ "learning_rate": 4.2675463830653744e-06,
+ "loss": 0.4941,
+ "step": 3248
+ },
+ {
+ "epoch": 1.5361702127659576,
+ "grad_norm": 2.897268295288086,
+ "learning_rate": 4.267105160723794e-06,
+ "loss": 0.5404,
+ "step": 3249
+ },
+ {
+ "epoch": 1.5366430260047281,
+ "grad_norm": 2.500732421875,
+ "learning_rate": 4.266663828352324e-06,
+ "loss": 0.5375,
+ "step": 3250
+ },
+ {
+ "epoch": 1.5371158392434987,
+ "grad_norm": 2.6310064792633057,
+ "learning_rate": 4.266222385978444e-06,
+ "loss": 0.5217,
+ "step": 3251
+ },
+ {
+ "epoch": 1.5375886524822695,
+ "grad_norm": 2.7440476417541504,
+ "learning_rate": 4.265780833629642e-06,
+ "loss": 0.5419,
+ "step": 3252
+ },
+ {
+ "epoch": 1.5380614657210403,
+ "grad_norm": 2.7037577629089355,
+ "learning_rate": 4.2653391713334095e-06,
+ "loss": 0.5634,
+ "step": 3253
+ },
+ {
+ "epoch": 1.5385342789598109,
+ "grad_norm": 2.548525810241699,
+ "learning_rate": 4.264897399117248e-06,
+ "loss": 0.535,
+ "step": 3254
+ },
+ {
+ "epoch": 1.5390070921985815,
+ "grad_norm": 2.6127355098724365,
+ "learning_rate": 4.264455517008663e-06,
+ "loss": 0.4619,
+ "step": 3255
+ },
+ {
+ "epoch": 1.5394799054373522,
+ "grad_norm": 2.5597004890441895,
+ "learning_rate": 4.264013525035171e-06,
+ "loss": 0.4477,
+ "step": 3256
+ },
+ {
+ "epoch": 1.539952718676123,
+ "grad_norm": 2.642432689666748,
+ "learning_rate": 4.263571423224292e-06,
+ "loss": 0.4749,
+ "step": 3257
+ },
+ {
+ "epoch": 1.5404255319148936,
+ "grad_norm": 2.5121877193450928,
+ "learning_rate": 4.2631292116035526e-06,
+ "loss": 0.4693,
+ "step": 3258
+ },
+ {
+ "epoch": 1.5408983451536642,
+ "grad_norm": 2.390292167663574,
+ "learning_rate": 4.262686890200489e-06,
+ "loss": 0.4872,
+ "step": 3259
+ },
+ {
+ "epoch": 1.541371158392435,
+ "grad_norm": 2.5898337364196777,
+ "learning_rate": 4.2622444590426405e-06,
+ "loss": 0.5193,
+ "step": 3260
+ },
+ {
+ "epoch": 1.5418439716312058,
+ "grad_norm": 2.508821487426758,
+ "learning_rate": 4.261801918157558e-06,
+ "loss": 0.511,
+ "step": 3261
+ },
+ {
+ "epoch": 1.5423167848699764,
+ "grad_norm": 2.6992101669311523,
+ "learning_rate": 4.261359267572795e-06,
+ "loss": 0.5069,
+ "step": 3262
+ },
+ {
+ "epoch": 1.542789598108747,
+ "grad_norm": 2.6011030673980713,
+ "learning_rate": 4.2609165073159145e-06,
+ "loss": 0.5887,
+ "step": 3263
+ },
+ {
+ "epoch": 1.5432624113475177,
+ "grad_norm": 2.887053966522217,
+ "learning_rate": 4.260473637414483e-06,
+ "loss": 0.5556,
+ "step": 3264
+ },
+ {
+ "epoch": 1.5437352245862885,
+ "grad_norm": 2.6433887481689453,
+ "learning_rate": 4.260030657896079e-06,
+ "loss": 0.4728,
+ "step": 3265
+ },
+ {
+ "epoch": 1.544208037825059,
+ "grad_norm": 2.6134607791900635,
+ "learning_rate": 4.259587568788282e-06,
+ "loss": 0.483,
+ "step": 3266
+ },
+ {
+ "epoch": 1.5446808510638297,
+ "grad_norm": 2.5308640003204346,
+ "learning_rate": 4.259144370118684e-06,
+ "loss": 0.5115,
+ "step": 3267
+ },
+ {
+ "epoch": 1.5451536643026005,
+ "grad_norm": 2.8256733417510986,
+ "learning_rate": 4.258701061914879e-06,
+ "loss": 0.5414,
+ "step": 3268
+ },
+ {
+ "epoch": 1.5456264775413713,
+ "grad_norm": 2.8648319244384766,
+ "learning_rate": 4.258257644204471e-06,
+ "loss": 0.5695,
+ "step": 3269
+ },
+ {
+ "epoch": 1.5460992907801419,
+ "grad_norm": 2.8568081855773926,
+ "learning_rate": 4.257814117015069e-06,
+ "loss": 0.5264,
+ "step": 3270
+ },
+ {
+ "epoch": 1.5465721040189124,
+ "grad_norm": 2.6065011024475098,
+ "learning_rate": 4.257370480374289e-06,
+ "loss": 0.5646,
+ "step": 3271
+ },
+ {
+ "epoch": 1.5470449172576832,
+ "grad_norm": 2.7840216159820557,
+ "learning_rate": 4.256926734309756e-06,
+ "loss": 0.5191,
+ "step": 3272
+ },
+ {
+ "epoch": 1.547517730496454,
+ "grad_norm": 2.85906982421875,
+ "learning_rate": 4.256482878849099e-06,
+ "loss": 0.5911,
+ "step": 3273
+ },
+ {
+ "epoch": 1.5479905437352246,
+ "grad_norm": 2.916029930114746,
+ "learning_rate": 4.256038914019954e-06,
+ "loss": 0.5589,
+ "step": 3274
+ },
+ {
+ "epoch": 1.5484633569739952,
+ "grad_norm": 2.6748716831207275,
+ "learning_rate": 4.255594839849967e-06,
+ "loss": 0.5323,
+ "step": 3275
+ },
+ {
+ "epoch": 1.548936170212766,
+ "grad_norm": 2.717212200164795,
+ "learning_rate": 4.255150656366787e-06,
+ "loss": 0.453,
+ "step": 3276
+ },
+ {
+ "epoch": 1.5494089834515368,
+ "grad_norm": 2.4974849224090576,
+ "learning_rate": 4.254706363598072e-06,
+ "loss": 0.4516,
+ "step": 3277
+ },
+ {
+ "epoch": 1.5498817966903073,
+ "grad_norm": 2.648151397705078,
+ "learning_rate": 4.254261961571485e-06,
+ "loss": 0.5452,
+ "step": 3278
+ },
+ {
+ "epoch": 1.550354609929078,
+ "grad_norm": 2.932905435562134,
+ "learning_rate": 4.253817450314699e-06,
+ "loss": 0.4813,
+ "step": 3279
+ },
+ {
+ "epoch": 1.5508274231678487,
+ "grad_norm": 2.862912178039551,
+ "learning_rate": 4.25337282985539e-06,
+ "loss": 0.5689,
+ "step": 3280
+ },
+ {
+ "epoch": 1.5513002364066195,
+ "grad_norm": 2.532156467437744,
+ "learning_rate": 4.2529281002212436e-06,
+ "loss": 0.485,
+ "step": 3281
+ },
+ {
+ "epoch": 1.55177304964539,
+ "grad_norm": 2.583299160003662,
+ "learning_rate": 4.25248326143995e-06,
+ "loss": 0.4661,
+ "step": 3282
+ },
+ {
+ "epoch": 1.5522458628841607,
+ "grad_norm": 2.5790653228759766,
+ "learning_rate": 4.252038313539209e-06,
+ "loss": 0.5455,
+ "step": 3283
+ },
+ {
+ "epoch": 1.5527186761229315,
+ "grad_norm": 2.872864007949829,
+ "learning_rate": 4.251593256546724e-06,
+ "loss": 0.5317,
+ "step": 3284
+ },
+ {
+ "epoch": 1.5531914893617023,
+ "grad_norm": 3.0382463932037354,
+ "learning_rate": 4.251148090490208e-06,
+ "loss": 0.5131,
+ "step": 3285
+ },
+ {
+ "epoch": 1.5536643026004728,
+ "grad_norm": 2.574399709701538,
+ "learning_rate": 4.250702815397379e-06,
+ "loss": 0.5399,
+ "step": 3286
+ },
+ {
+ "epoch": 1.5541371158392434,
+ "grad_norm": 2.9784770011901855,
+ "learning_rate": 4.250257431295962e-06,
+ "loss": 0.5209,
+ "step": 3287
+ },
+ {
+ "epoch": 1.5546099290780142,
+ "grad_norm": 2.6482062339782715,
+ "learning_rate": 4.249811938213689e-06,
+ "loss": 0.5416,
+ "step": 3288
+ },
+ {
+ "epoch": 1.555082742316785,
+ "grad_norm": 2.82142972946167,
+ "learning_rate": 4.2493663361783e-06,
+ "loss": 0.594,
+ "step": 3289
+ },
+ {
+ "epoch": 1.5555555555555556,
+ "grad_norm": 2.815595865249634,
+ "learning_rate": 4.24892062521754e-06,
+ "loss": 0.5381,
+ "step": 3290
+ },
+ {
+ "epoch": 1.5560283687943262,
+ "grad_norm": 2.689764976501465,
+ "learning_rate": 4.248474805359161e-06,
+ "loss": 0.5141,
+ "step": 3291
+ },
+ {
+ "epoch": 1.556501182033097,
+ "grad_norm": 2.7718515396118164,
+ "learning_rate": 4.248028876630922e-06,
+ "loss": 0.5324,
+ "step": 3292
+ },
+ {
+ "epoch": 1.5569739952718678,
+ "grad_norm": 3.0196774005889893,
+ "learning_rate": 4.247582839060591e-06,
+ "loss": 0.4971,
+ "step": 3293
+ },
+ {
+ "epoch": 1.5574468085106383,
+ "grad_norm": 2.608475923538208,
+ "learning_rate": 4.247136692675939e-06,
+ "loss": 0.5795,
+ "step": 3294
+ },
+ {
+ "epoch": 1.557919621749409,
+ "grad_norm": 2.4912326335906982,
+ "learning_rate": 4.246690437504746e-06,
+ "loss": 0.5348,
+ "step": 3295
+ },
+ {
+ "epoch": 1.5583924349881797,
+ "grad_norm": 2.519303560256958,
+ "learning_rate": 4.246244073574799e-06,
+ "loss": 0.4953,
+ "step": 3296
+ },
+ {
+ "epoch": 1.5588652482269505,
+ "grad_norm": 2.5667171478271484,
+ "learning_rate": 4.24579760091389e-06,
+ "loss": 0.5353,
+ "step": 3297
+ },
+ {
+ "epoch": 1.559338061465721,
+ "grad_norm": 2.8835761547088623,
+ "learning_rate": 4.24535101954982e-06,
+ "loss": 0.578,
+ "step": 3298
+ },
+ {
+ "epoch": 1.5598108747044916,
+ "grad_norm": 3.0506930351257324,
+ "learning_rate": 4.244904329510395e-06,
+ "loss": 0.6418,
+ "step": 3299
+ },
+ {
+ "epoch": 1.5602836879432624,
+ "grad_norm": 2.579446315765381,
+ "learning_rate": 4.244457530823428e-06,
+ "loss": 0.5027,
+ "step": 3300
+ },
+ {
+ "epoch": 1.5607565011820332,
+ "grad_norm": 2.72012996673584,
+ "learning_rate": 4.24401062351674e-06,
+ "loss": 0.5438,
+ "step": 3301
+ },
+ {
+ "epoch": 1.5612293144208038,
+ "grad_norm": 2.527007818222046,
+ "learning_rate": 4.243563607618158e-06,
+ "loss": 0.5303,
+ "step": 3302
+ },
+ {
+ "epoch": 1.5617021276595744,
+ "grad_norm": 2.4415159225463867,
+ "learning_rate": 4.243116483155516e-06,
+ "loss": 0.4893,
+ "step": 3303
+ },
+ {
+ "epoch": 1.5621749408983452,
+ "grad_norm": 2.462256669998169,
+ "learning_rate": 4.242669250156653e-06,
+ "loss": 0.5671,
+ "step": 3304
+ },
+ {
+ "epoch": 1.562647754137116,
+ "grad_norm": 2.479865074157715,
+ "learning_rate": 4.242221908649418e-06,
+ "loss": 0.5038,
+ "step": 3305
+ },
+ {
+ "epoch": 1.5631205673758866,
+ "grad_norm": 2.74670672416687,
+ "learning_rate": 4.241774458661662e-06,
+ "loss": 0.5689,
+ "step": 3306
+ },
+ {
+ "epoch": 1.5635933806146571,
+ "grad_norm": 2.55938982963562,
+ "learning_rate": 4.24132690022125e-06,
+ "loss": 0.492,
+ "step": 3307
+ },
+ {
+ "epoch": 1.564066193853428,
+ "grad_norm": 2.634956121444702,
+ "learning_rate": 4.240879233356048e-06,
+ "loss": 0.503,
+ "step": 3308
+ },
+ {
+ "epoch": 1.5645390070921987,
+ "grad_norm": 2.381775140762329,
+ "learning_rate": 4.240431458093928e-06,
+ "loss": 0.4939,
+ "step": 3309
+ },
+ {
+ "epoch": 1.5650118203309693,
+ "grad_norm": 2.8176610469818115,
+ "learning_rate": 4.239983574462774e-06,
+ "loss": 0.5609,
+ "step": 3310
+ },
+ {
+ "epoch": 1.5654846335697399,
+ "grad_norm": 3.0268442630767822,
+ "learning_rate": 4.239535582490471e-06,
+ "loss": 0.5427,
+ "step": 3311
+ },
+ {
+ "epoch": 1.5659574468085107,
+ "grad_norm": 2.5881481170654297,
+ "learning_rate": 4.239087482204916e-06,
+ "loss": 0.5538,
+ "step": 3312
+ },
+ {
+ "epoch": 1.5664302600472815,
+ "grad_norm": 2.5317704677581787,
+ "learning_rate": 4.238639273634008e-06,
+ "loss": 0.4915,
+ "step": 3313
+ },
+ {
+ "epoch": 1.566903073286052,
+ "grad_norm": 2.9608731269836426,
+ "learning_rate": 4.238190956805658e-06,
+ "loss": 0.564,
+ "step": 3314
+ },
+ {
+ "epoch": 1.5673758865248226,
+ "grad_norm": 3.022686243057251,
+ "learning_rate": 4.237742531747777e-06,
+ "loss": 0.5503,
+ "step": 3315
+ },
+ {
+ "epoch": 1.5678486997635934,
+ "grad_norm": 2.763622283935547,
+ "learning_rate": 4.23729399848829e-06,
+ "loss": 0.5241,
+ "step": 3316
+ },
+ {
+ "epoch": 1.5683215130023642,
+ "grad_norm": 2.6112794876098633,
+ "learning_rate": 4.236845357055122e-06,
+ "loss": 0.4919,
+ "step": 3317
+ },
+ {
+ "epoch": 1.5687943262411348,
+ "grad_norm": 2.649829149246216,
+ "learning_rate": 4.23639660747621e-06,
+ "loss": 0.5472,
+ "step": 3318
+ },
+ {
+ "epoch": 1.5692671394799054,
+ "grad_norm": 2.8888115882873535,
+ "learning_rate": 4.2359477497794955e-06,
+ "loss": 0.5077,
+ "step": 3319
+ },
+ {
+ "epoch": 1.5697399527186762,
+ "grad_norm": 2.5666911602020264,
+ "learning_rate": 4.235498783992927e-06,
+ "loss": 0.5365,
+ "step": 3320
+ },
+ {
+ "epoch": 1.570212765957447,
+ "grad_norm": 2.448758363723755,
+ "learning_rate": 4.2350497101444575e-06,
+ "loss": 0.5043,
+ "step": 3321
+ },
+ {
+ "epoch": 1.5706855791962175,
+ "grad_norm": 2.595207691192627,
+ "learning_rate": 4.234600528262052e-06,
+ "loss": 0.5303,
+ "step": 3322
+ },
+ {
+ "epoch": 1.5711583924349881,
+ "grad_norm": 2.7814228534698486,
+ "learning_rate": 4.234151238373676e-06,
+ "loss": 0.4521,
+ "step": 3323
+ },
+ {
+ "epoch": 1.571631205673759,
+ "grad_norm": 2.781538724899292,
+ "learning_rate": 4.233701840507308e-06,
+ "loss": 0.5193,
+ "step": 3324
+ },
+ {
+ "epoch": 1.5721040189125297,
+ "grad_norm": 2.771907329559326,
+ "learning_rate": 4.233252334690928e-06,
+ "loss": 0.497,
+ "step": 3325
+ },
+ {
+ "epoch": 1.5725768321513003,
+ "grad_norm": 2.5557498931884766,
+ "learning_rate": 4.232802720952525e-06,
+ "loss": 0.4913,
+ "step": 3326
+ },
+ {
+ "epoch": 1.5730496453900709,
+ "grad_norm": 2.478267192840576,
+ "learning_rate": 4.232352999320094e-06,
+ "loss": 0.4967,
+ "step": 3327
+ },
+ {
+ "epoch": 1.5735224586288417,
+ "grad_norm": 3.1548502445220947,
+ "learning_rate": 4.231903169821639e-06,
+ "loss": 0.5009,
+ "step": 3328
+ },
+ {
+ "epoch": 1.5739952718676125,
+ "grad_norm": 2.634824275970459,
+ "learning_rate": 4.231453232485168e-06,
+ "loss": 0.5223,
+ "step": 3329
+ },
+ {
+ "epoch": 1.574468085106383,
+ "grad_norm": 2.579102039337158,
+ "learning_rate": 4.231003187338695e-06,
+ "loss": 0.5513,
+ "step": 3330
+ },
+ {
+ "epoch": 1.5749408983451536,
+ "grad_norm": 2.8477070331573486,
+ "learning_rate": 4.230553034410245e-06,
+ "loss": 0.561,
+ "step": 3331
+ },
+ {
+ "epoch": 1.5754137115839244,
+ "grad_norm": 2.6714725494384766,
+ "learning_rate": 4.2301027737278446e-06,
+ "loss": 0.4687,
+ "step": 3332
+ },
+ {
+ "epoch": 1.5758865248226952,
+ "grad_norm": 2.6562764644622803,
+ "learning_rate": 4.229652405319532e-06,
+ "loss": 0.5925,
+ "step": 3333
+ },
+ {
+ "epoch": 1.5763593380614658,
+ "grad_norm": 2.750946283340454,
+ "learning_rate": 4.229201929213348e-06,
+ "loss": 0.4748,
+ "step": 3334
+ },
+ {
+ "epoch": 1.5768321513002364,
+ "grad_norm": 2.760470151901245,
+ "learning_rate": 4.228751345437342e-06,
+ "loss": 0.5989,
+ "step": 3335
+ },
+ {
+ "epoch": 1.5773049645390071,
+ "grad_norm": 3.1451845169067383,
+ "learning_rate": 4.2283006540195706e-06,
+ "loss": 0.562,
+ "step": 3336
+ },
+ {
+ "epoch": 1.5777777777777777,
+ "grad_norm": 2.563011407852173,
+ "learning_rate": 4.227849854988095e-06,
+ "loss": 0.5473,
+ "step": 3337
+ },
+ {
+ "epoch": 1.5782505910165483,
+ "grad_norm": 2.310469388961792,
+ "learning_rate": 4.2273989483709856e-06,
+ "loss": 0.5033,
+ "step": 3338
+ },
+ {
+ "epoch": 1.578723404255319,
+ "grad_norm": 2.677978754043579,
+ "learning_rate": 4.226947934196318e-06,
+ "loss": 0.5291,
+ "step": 3339
+ },
+ {
+ "epoch": 1.57919621749409,
+ "grad_norm": 3.0423545837402344,
+ "learning_rate": 4.226496812492176e-06,
+ "loss": 0.5201,
+ "step": 3340
+ },
+ {
+ "epoch": 1.5796690307328605,
+ "grad_norm": 2.357513904571533,
+ "learning_rate": 4.226045583286647e-06,
+ "loss": 0.4421,
+ "step": 3341
+ },
+ {
+ "epoch": 1.580141843971631,
+ "grad_norm": 2.719860315322876,
+ "learning_rate": 4.225594246607828e-06,
+ "loss": 0.4855,
+ "step": 3342
+ },
+ {
+ "epoch": 1.5806146572104018,
+ "grad_norm": 3.2645058631896973,
+ "learning_rate": 4.2251428024838215e-06,
+ "loss": 0.6654,
+ "step": 3343
+ },
+ {
+ "epoch": 1.5810874704491726,
+ "grad_norm": 2.2997004985809326,
+ "learning_rate": 4.224691250942737e-06,
+ "loss": 0.4565,
+ "step": 3344
+ },
+ {
+ "epoch": 1.5815602836879432,
+ "grad_norm": 2.8103034496307373,
+ "learning_rate": 4.2242395920126926e-06,
+ "loss": 0.5543,
+ "step": 3345
+ },
+ {
+ "epoch": 1.5820330969267138,
+ "grad_norm": 2.720254898071289,
+ "learning_rate": 4.223787825721808e-06,
+ "loss": 0.5028,
+ "step": 3346
+ },
+ {
+ "epoch": 1.5825059101654846,
+ "grad_norm": 2.735544204711914,
+ "learning_rate": 4.223335952098214e-06,
+ "loss": 0.5169,
+ "step": 3347
+ },
+ {
+ "epoch": 1.5829787234042554,
+ "grad_norm": 2.784254550933838,
+ "learning_rate": 4.222883971170047e-06,
+ "loss": 0.4989,
+ "step": 3348
+ },
+ {
+ "epoch": 1.583451536643026,
+ "grad_norm": 2.7192094326019287,
+ "learning_rate": 4.22243188296545e-06,
+ "loss": 0.502,
+ "step": 3349
+ },
+ {
+ "epoch": 1.5839243498817965,
+ "grad_norm": 2.716501474380493,
+ "learning_rate": 4.221979687512573e-06,
+ "loss": 0.5687,
+ "step": 3350
+ },
+ {
+ "epoch": 1.5843971631205673,
+ "grad_norm": 2.8420114517211914,
+ "learning_rate": 4.22152738483957e-06,
+ "loss": 0.5903,
+ "step": 3351
+ },
+ {
+ "epoch": 1.5848699763593381,
+ "grad_norm": 2.734872579574585,
+ "learning_rate": 4.2210749749746065e-06,
+ "loss": 0.5397,
+ "step": 3352
+ },
+ {
+ "epoch": 1.5853427895981087,
+ "grad_norm": 2.4343836307525635,
+ "learning_rate": 4.220622457945851e-06,
+ "loss": 0.436,
+ "step": 3353
+ },
+ {
+ "epoch": 1.5858156028368793,
+ "grad_norm": 2.728177547454834,
+ "learning_rate": 4.2201698337814785e-06,
+ "loss": 0.5703,
+ "step": 3354
+ },
+ {
+ "epoch": 1.58628841607565,
+ "grad_norm": 2.502098560333252,
+ "learning_rate": 4.219717102509674e-06,
+ "loss": 0.5275,
+ "step": 3355
+ },
+ {
+ "epoch": 1.5867612293144209,
+ "grad_norm": 2.6595494747161865,
+ "learning_rate": 4.219264264158627e-06,
+ "loss": 0.4659,
+ "step": 3356
+ },
+ {
+ "epoch": 1.5872340425531914,
+ "grad_norm": 2.5307185649871826,
+ "learning_rate": 4.218811318756532e-06,
+ "loss": 0.5048,
+ "step": 3357
+ },
+ {
+ "epoch": 1.587706855791962,
+ "grad_norm": 2.9300129413604736,
+ "learning_rate": 4.218358266331593e-06,
+ "loss": 0.5137,
+ "step": 3358
+ },
+ {
+ "epoch": 1.5881796690307328,
+ "grad_norm": 2.686586618423462,
+ "learning_rate": 4.21790510691202e-06,
+ "loss": 0.4529,
+ "step": 3359
+ },
+ {
+ "epoch": 1.5886524822695036,
+ "grad_norm": 2.9981517791748047,
+ "learning_rate": 4.217451840526029e-06,
+ "loss": 0.6054,
+ "step": 3360
+ },
+ {
+ "epoch": 1.5891252955082742,
+ "grad_norm": 2.6943674087524414,
+ "learning_rate": 4.216998467201841e-06,
+ "loss": 0.5153,
+ "step": 3361
+ },
+ {
+ "epoch": 1.5895981087470448,
+ "grad_norm": 2.707084894180298,
+ "learning_rate": 4.216544986967689e-06,
+ "loss": 0.5235,
+ "step": 3362
+ },
+ {
+ "epoch": 1.5900709219858156,
+ "grad_norm": 2.6553728580474854,
+ "learning_rate": 4.216091399851808e-06,
+ "loss": 0.5275,
+ "step": 3363
+ },
+ {
+ "epoch": 1.5905437352245864,
+ "grad_norm": 2.9136953353881836,
+ "learning_rate": 4.215637705882439e-06,
+ "loss": 0.5834,
+ "step": 3364
+ },
+ {
+ "epoch": 1.591016548463357,
+ "grad_norm": 2.7647159099578857,
+ "learning_rate": 4.2151839050878325e-06,
+ "loss": 0.5641,
+ "step": 3365
+ },
+ {
+ "epoch": 1.5914893617021275,
+ "grad_norm": 2.4556827545166016,
+ "learning_rate": 4.214729997496246e-06,
+ "loss": 0.5636,
+ "step": 3366
+ },
+ {
+ "epoch": 1.5919621749408983,
+ "grad_norm": 2.6111652851104736,
+ "learning_rate": 4.2142759831359414e-06,
+ "loss": 0.5097,
+ "step": 3367
+ },
+ {
+ "epoch": 1.592434988179669,
+ "grad_norm": 2.4886903762817383,
+ "learning_rate": 4.213821862035189e-06,
+ "loss": 0.531,
+ "step": 3368
+ },
+ {
+ "epoch": 1.5929078014184397,
+ "grad_norm": 2.5245840549468994,
+ "learning_rate": 4.213367634222263e-06,
+ "loss": 0.5085,
+ "step": 3369
+ },
+ {
+ "epoch": 1.5933806146572103,
+ "grad_norm": 2.970214605331421,
+ "learning_rate": 4.212913299725447e-06,
+ "loss": 0.5851,
+ "step": 3370
+ },
+ {
+ "epoch": 1.593853427895981,
+ "grad_norm": 2.5433361530303955,
+ "learning_rate": 4.212458858573032e-06,
+ "loss": 0.48,
+ "step": 3371
+ },
+ {
+ "epoch": 1.5943262411347519,
+ "grad_norm": 2.3550102710723877,
+ "learning_rate": 4.212004310793312e-06,
+ "loss": 0.4405,
+ "step": 3372
+ },
+ {
+ "epoch": 1.5947990543735224,
+ "grad_norm": 2.4824719429016113,
+ "learning_rate": 4.2115496564145896e-06,
+ "loss": 0.4634,
+ "step": 3373
+ },
+ {
+ "epoch": 1.595271867612293,
+ "grad_norm": 2.4751930236816406,
+ "learning_rate": 4.211094895465176e-06,
+ "loss": 0.5662,
+ "step": 3374
+ },
+ {
+ "epoch": 1.5957446808510638,
+ "grad_norm": 2.4193356037139893,
+ "learning_rate": 4.210640027973386e-06,
+ "loss": 0.4441,
+ "step": 3375
+ },
+ {
+ "epoch": 1.5962174940898346,
+ "grad_norm": 2.4477498531341553,
+ "learning_rate": 4.210185053967543e-06,
+ "loss": 0.5205,
+ "step": 3376
+ },
+ {
+ "epoch": 1.5966903073286052,
+ "grad_norm": 2.7954161167144775,
+ "learning_rate": 4.209729973475976e-06,
+ "loss": 0.4951,
+ "step": 3377
+ },
+ {
+ "epoch": 1.5971631205673757,
+ "grad_norm": 3.1907570362091064,
+ "learning_rate": 4.209274786527019e-06,
+ "loss": 0.6024,
+ "step": 3378
+ },
+ {
+ "epoch": 1.5976359338061465,
+ "grad_norm": 2.485245704650879,
+ "learning_rate": 4.2088194931490165e-06,
+ "loss": 0.5652,
+ "step": 3379
+ },
+ {
+ "epoch": 1.5981087470449173,
+ "grad_norm": 2.589310884475708,
+ "learning_rate": 4.208364093370317e-06,
+ "loss": 0.5085,
+ "step": 3380
+ },
+ {
+ "epoch": 1.598581560283688,
+ "grad_norm": 2.8941214084625244,
+ "learning_rate": 4.207908587219276e-06,
+ "loss": 0.53,
+ "step": 3381
+ },
+ {
+ "epoch": 1.5990543735224585,
+ "grad_norm": 2.480509042739868,
+ "learning_rate": 4.207452974724258e-06,
+ "loss": 0.4543,
+ "step": 3382
+ },
+ {
+ "epoch": 1.5995271867612293,
+ "grad_norm": 2.7884905338287354,
+ "learning_rate": 4.206997255913629e-06,
+ "loss": 0.5483,
+ "step": 3383
+ },
+ {
+ "epoch": 1.6,
+ "grad_norm": 2.7976696491241455,
+ "learning_rate": 4.206541430815766e-06,
+ "loss": 0.4734,
+ "step": 3384
+ },
+ {
+ "epoch": 1.6004728132387707,
+ "grad_norm": 2.5463132858276367,
+ "learning_rate": 4.206085499459051e-06,
+ "loss": 0.4931,
+ "step": 3385
+ },
+ {
+ "epoch": 1.6009456264775412,
+ "grad_norm": 2.8384251594543457,
+ "learning_rate": 4.205629461871871e-06,
+ "loss": 0.5066,
+ "step": 3386
+ },
+ {
+ "epoch": 1.601418439716312,
+ "grad_norm": 2.8578574657440186,
+ "learning_rate": 4.205173318082626e-06,
+ "loss": 0.458,
+ "step": 3387
+ },
+ {
+ "epoch": 1.6018912529550828,
+ "grad_norm": 2.7779932022094727,
+ "learning_rate": 4.204717068119715e-06,
+ "loss": 0.5293,
+ "step": 3388
+ },
+ {
+ "epoch": 1.6023640661938534,
+ "grad_norm": 2.9123778343200684,
+ "learning_rate": 4.204260712011546e-06,
+ "loss": 0.4866,
+ "step": 3389
+ },
+ {
+ "epoch": 1.602836879432624,
+ "grad_norm": 2.757922887802124,
+ "learning_rate": 4.203804249786537e-06,
+ "loss": 0.4925,
+ "step": 3390
+ },
+ {
+ "epoch": 1.6033096926713948,
+ "grad_norm": 3.287733316421509,
+ "learning_rate": 4.203347681473107e-06,
+ "loss": 0.6694,
+ "step": 3391
+ },
+ {
+ "epoch": 1.6037825059101656,
+ "grad_norm": 3.2117912769317627,
+ "learning_rate": 4.202891007099687e-06,
+ "loss": 0.5269,
+ "step": 3392
+ },
+ {
+ "epoch": 1.6042553191489362,
+ "grad_norm": 2.8489456176757812,
+ "learning_rate": 4.20243422669471e-06,
+ "loss": 0.5073,
+ "step": 3393
+ },
+ {
+ "epoch": 1.6047281323877067,
+ "grad_norm": 2.7660224437713623,
+ "learning_rate": 4.201977340286619e-06,
+ "loss": 0.5014,
+ "step": 3394
+ },
+ {
+ "epoch": 1.6052009456264775,
+ "grad_norm": 2.68182110786438,
+ "learning_rate": 4.201520347903862e-06,
+ "loss": 0.4542,
+ "step": 3395
+ },
+ {
+ "epoch": 1.6056737588652483,
+ "grad_norm": 2.7546045780181885,
+ "learning_rate": 4.2010632495748934e-06,
+ "loss": 0.516,
+ "step": 3396
+ },
+ {
+ "epoch": 1.606146572104019,
+ "grad_norm": 2.744668483734131,
+ "learning_rate": 4.200606045328176e-06,
+ "loss": 0.5243,
+ "step": 3397
+ },
+ {
+ "epoch": 1.6066193853427895,
+ "grad_norm": 2.935343027114868,
+ "learning_rate": 4.200148735192177e-06,
+ "loss": 0.5624,
+ "step": 3398
+ },
+ {
+ "epoch": 1.6070921985815603,
+ "grad_norm": 2.7392852306365967,
+ "learning_rate": 4.19969131919537e-06,
+ "loss": 0.5796,
+ "step": 3399
+ },
+ {
+ "epoch": 1.607565011820331,
+ "grad_norm": 2.864750385284424,
+ "learning_rate": 4.199233797366239e-06,
+ "loss": 0.549,
+ "step": 3400
+ },
+ {
+ "epoch": 1.6080378250591016,
+ "grad_norm": 2.684157371520996,
+ "learning_rate": 4.198776169733269e-06,
+ "loss": 0.5532,
+ "step": 3401
+ },
+ {
+ "epoch": 1.6085106382978722,
+ "grad_norm": 2.4717135429382324,
+ "learning_rate": 4.198318436324957e-06,
+ "loss": 0.5174,
+ "step": 3402
+ },
+ {
+ "epoch": 1.608983451536643,
+ "grad_norm": 2.640242338180542,
+ "learning_rate": 4.197860597169802e-06,
+ "loss": 0.5117,
+ "step": 3403
+ },
+ {
+ "epoch": 1.6094562647754138,
+ "grad_norm": 2.4957473278045654,
+ "learning_rate": 4.197402652296313e-06,
+ "loss": 0.474,
+ "step": 3404
+ },
+ {
+ "epoch": 1.6099290780141844,
+ "grad_norm": 2.416138172149658,
+ "learning_rate": 4.196944601733004e-06,
+ "loss": 0.4858,
+ "step": 3405
+ },
+ {
+ "epoch": 1.610401891252955,
+ "grad_norm": 2.4498109817504883,
+ "learning_rate": 4.196486445508395e-06,
+ "loss": 0.5048,
+ "step": 3406
+ },
+ {
+ "epoch": 1.6108747044917258,
+ "grad_norm": 2.415895938873291,
+ "learning_rate": 4.196028183651014e-06,
+ "loss": 0.4745,
+ "step": 3407
+ },
+ {
+ "epoch": 1.6113475177304966,
+ "grad_norm": 2.843665838241577,
+ "learning_rate": 4.195569816189395e-06,
+ "loss": 0.5219,
+ "step": 3408
+ },
+ {
+ "epoch": 1.6118203309692671,
+ "grad_norm": 2.608579158782959,
+ "learning_rate": 4.195111343152079e-06,
+ "loss": 0.4941,
+ "step": 3409
+ },
+ {
+ "epoch": 1.6122931442080377,
+ "grad_norm": 2.643789529800415,
+ "learning_rate": 4.194652764567611e-06,
+ "loss": 0.515,
+ "step": 3410
+ },
+ {
+ "epoch": 1.6127659574468085,
+ "grad_norm": 2.8099429607391357,
+ "learning_rate": 4.194194080464547e-06,
+ "loss": 0.4935,
+ "step": 3411
+ },
+ {
+ "epoch": 1.6132387706855793,
+ "grad_norm": 2.595628261566162,
+ "learning_rate": 4.193735290871446e-06,
+ "loss": 0.5571,
+ "step": 3412
+ },
+ {
+ "epoch": 1.6137115839243499,
+ "grad_norm": 2.7903778553009033,
+ "learning_rate": 4.193276395816876e-06,
+ "loss": 0.5228,
+ "step": 3413
+ },
+ {
+ "epoch": 1.6141843971631205,
+ "grad_norm": 2.83910870552063,
+ "learning_rate": 4.192817395329409e-06,
+ "loss": 0.6124,
+ "step": 3414
+ },
+ {
+ "epoch": 1.6146572104018913,
+ "grad_norm": 2.6155734062194824,
+ "learning_rate": 4.192358289437626e-06,
+ "loss": 0.552,
+ "step": 3415
+ },
+ {
+ "epoch": 1.615130023640662,
+ "grad_norm": 2.795832872390747,
+ "learning_rate": 4.191899078170113e-06,
+ "loss": 0.5561,
+ "step": 3416
+ },
+ {
+ "epoch": 1.6156028368794326,
+ "grad_norm": 2.3402161598205566,
+ "learning_rate": 4.191439761555464e-06,
+ "loss": 0.4889,
+ "step": 3417
+ },
+ {
+ "epoch": 1.6160756501182032,
+ "grad_norm": 3.1183433532714844,
+ "learning_rate": 4.190980339622276e-06,
+ "loss": 0.5337,
+ "step": 3418
+ },
+ {
+ "epoch": 1.616548463356974,
+ "grad_norm": 2.6262872219085693,
+ "learning_rate": 4.190520812399158e-06,
+ "loss": 0.525,
+ "step": 3419
+ },
+ {
+ "epoch": 1.6170212765957448,
+ "grad_norm": 2.578340530395508,
+ "learning_rate": 4.190061179914722e-06,
+ "loss": 0.4975,
+ "step": 3420
+ },
+ {
+ "epoch": 1.6174940898345154,
+ "grad_norm": 3.19482159614563,
+ "learning_rate": 4.189601442197586e-06,
+ "loss": 0.5832,
+ "step": 3421
+ },
+ {
+ "epoch": 1.617966903073286,
+ "grad_norm": 2.6398792266845703,
+ "learning_rate": 4.189141599276378e-06,
+ "loss": 0.4676,
+ "step": 3422
+ },
+ {
+ "epoch": 1.6184397163120567,
+ "grad_norm": 2.624865770339966,
+ "learning_rate": 4.1886816511797275e-06,
+ "loss": 0.4507,
+ "step": 3423
+ },
+ {
+ "epoch": 1.6189125295508275,
+ "grad_norm": 2.4136857986450195,
+ "learning_rate": 4.1882215979362775e-06,
+ "loss": 0.4616,
+ "step": 3424
+ },
+ {
+ "epoch": 1.6193853427895981,
+ "grad_norm": 2.6906614303588867,
+ "learning_rate": 4.18776143957467e-06,
+ "loss": 0.5142,
+ "step": 3425
+ },
+ {
+ "epoch": 1.6198581560283687,
+ "grad_norm": 2.5149154663085938,
+ "learning_rate": 4.187301176123558e-06,
+ "loss": 0.5252,
+ "step": 3426
+ },
+ {
+ "epoch": 1.6203309692671395,
+ "grad_norm": 2.677405834197998,
+ "learning_rate": 4.186840807611602e-06,
+ "loss": 0.4635,
+ "step": 3427
+ },
+ {
+ "epoch": 1.6208037825059103,
+ "grad_norm": 2.7164649963378906,
+ "learning_rate": 4.186380334067464e-06,
+ "loss": 0.5634,
+ "step": 3428
+ },
+ {
+ "epoch": 1.6212765957446809,
+ "grad_norm": 2.8299832344055176,
+ "learning_rate": 4.185919755519817e-06,
+ "loss": 0.5166,
+ "step": 3429
+ },
+ {
+ "epoch": 1.6217494089834514,
+ "grad_norm": 2.465848207473755,
+ "learning_rate": 4.18545907199734e-06,
+ "loss": 0.4696,
+ "step": 3430
+ },
+ {
+ "epoch": 1.6222222222222222,
+ "grad_norm": 2.407616376876831,
+ "learning_rate": 4.1849982835287175e-06,
+ "loss": 0.5111,
+ "step": 3431
+ },
+ {
+ "epoch": 1.622695035460993,
+ "grad_norm": 2.452146291732788,
+ "learning_rate": 4.184537390142639e-06,
+ "loss": 0.4574,
+ "step": 3432
+ },
+ {
+ "epoch": 1.6231678486997636,
+ "grad_norm": 2.653071165084839,
+ "learning_rate": 4.1840763918678055e-06,
+ "loss": 0.5611,
+ "step": 3433
+ },
+ {
+ "epoch": 1.6236406619385342,
+ "grad_norm": 2.5920350551605225,
+ "learning_rate": 4.183615288732919e-06,
+ "loss": 0.5437,
+ "step": 3434
+ },
+ {
+ "epoch": 1.624113475177305,
+ "grad_norm": 2.782900810241699,
+ "learning_rate": 4.18315408076669e-06,
+ "loss": 0.5824,
+ "step": 3435
+ },
+ {
+ "epoch": 1.6245862884160758,
+ "grad_norm": 2.8769774436950684,
+ "learning_rate": 4.1826927679978365e-06,
+ "loss": 0.5271,
+ "step": 3436
+ },
+ {
+ "epoch": 1.6250591016548463,
+ "grad_norm": 2.488598585128784,
+ "learning_rate": 4.182231350455084e-06,
+ "loss": 0.4684,
+ "step": 3437
+ },
+ {
+ "epoch": 1.625531914893617,
+ "grad_norm": 2.6472036838531494,
+ "learning_rate": 4.181769828167161e-06,
+ "loss": 0.5372,
+ "step": 3438
+ },
+ {
+ "epoch": 1.6260047281323877,
+ "grad_norm": 2.6498794555664062,
+ "learning_rate": 4.1813082011628045e-06,
+ "loss": 0.4805,
+ "step": 3439
+ },
+ {
+ "epoch": 1.6264775413711585,
+ "grad_norm": 2.5386533737182617,
+ "learning_rate": 4.1808464694707595e-06,
+ "loss": 0.5015,
+ "step": 3440
+ },
+ {
+ "epoch": 1.626950354609929,
+ "grad_norm": 2.8812551498413086,
+ "learning_rate": 4.180384633119775e-06,
+ "loss": 0.5225,
+ "step": 3441
+ },
+ {
+ "epoch": 1.6274231678486997,
+ "grad_norm": 2.870124578475952,
+ "learning_rate": 4.179922692138609e-06,
+ "loss": 0.537,
+ "step": 3442
+ },
+ {
+ "epoch": 1.6278959810874705,
+ "grad_norm": 2.5759785175323486,
+ "learning_rate": 4.179460646556021e-06,
+ "loss": 0.5142,
+ "step": 3443
+ },
+ {
+ "epoch": 1.6283687943262413,
+ "grad_norm": 2.629347324371338,
+ "learning_rate": 4.1789984964007836e-06,
+ "loss": 0.5007,
+ "step": 3444
+ },
+ {
+ "epoch": 1.6288416075650118,
+ "grad_norm": 2.751128673553467,
+ "learning_rate": 4.178536241701672e-06,
+ "loss": 0.5677,
+ "step": 3445
+ },
+ {
+ "epoch": 1.6293144208037824,
+ "grad_norm": 2.7582364082336426,
+ "learning_rate": 4.178073882487469e-06,
+ "loss": 0.499,
+ "step": 3446
+ },
+ {
+ "epoch": 1.6297872340425532,
+ "grad_norm": 3.136711359024048,
+ "learning_rate": 4.177611418786963e-06,
+ "loss": 0.5294,
+ "step": 3447
+ },
+ {
+ "epoch": 1.630260047281324,
+ "grad_norm": 2.7363100051879883,
+ "learning_rate": 4.17714885062895e-06,
+ "loss": 0.5264,
+ "step": 3448
+ },
+ {
+ "epoch": 1.6307328605200946,
+ "grad_norm": 2.7305946350097656,
+ "learning_rate": 4.176686178042233e-06,
+ "loss": 0.5235,
+ "step": 3449
+ },
+ {
+ "epoch": 1.6312056737588652,
+ "grad_norm": 2.6500556468963623,
+ "learning_rate": 4.176223401055619e-06,
+ "loss": 0.5463,
+ "step": 3450
+ },
+ {
+ "epoch": 1.631678486997636,
+ "grad_norm": 2.756321907043457,
+ "learning_rate": 4.175760519697924e-06,
+ "loss": 0.545,
+ "step": 3451
+ },
+ {
+ "epoch": 1.6321513002364068,
+ "grad_norm": 2.6234960556030273,
+ "learning_rate": 4.17529753399797e-06,
+ "loss": 0.4927,
+ "step": 3452
+ },
+ {
+ "epoch": 1.6326241134751773,
+ "grad_norm": 2.6358842849731445,
+ "learning_rate": 4.174834443984584e-06,
+ "loss": 0.5445,
+ "step": 3453
+ },
+ {
+ "epoch": 1.633096926713948,
+ "grad_norm": 2.541147470474243,
+ "learning_rate": 4.174371249686601e-06,
+ "loss": 0.4691,
+ "step": 3454
+ },
+ {
+ "epoch": 1.6335697399527187,
+ "grad_norm": 2.566981077194214,
+ "learning_rate": 4.173907951132863e-06,
+ "loss": 0.4932,
+ "step": 3455
+ },
+ {
+ "epoch": 1.6340425531914895,
+ "grad_norm": 2.670940399169922,
+ "learning_rate": 4.173444548352216e-06,
+ "loss": 0.4979,
+ "step": 3456
+ },
+ {
+ "epoch": 1.63451536643026,
+ "grad_norm": 2.5440268516540527,
+ "learning_rate": 4.172981041373515e-06,
+ "loss": 0.4716,
+ "step": 3457
+ },
+ {
+ "epoch": 1.6349881796690307,
+ "grad_norm": 2.3801631927490234,
+ "learning_rate": 4.17251743022562e-06,
+ "loss": 0.5126,
+ "step": 3458
+ },
+ {
+ "epoch": 1.6354609929078014,
+ "grad_norm": 2.5051121711730957,
+ "learning_rate": 4.1720537149373985e-06,
+ "loss": 0.4964,
+ "step": 3459
+ },
+ {
+ "epoch": 1.6359338061465722,
+ "grad_norm": 3.5521697998046875,
+ "learning_rate": 4.171589895537724e-06,
+ "loss": 0.5447,
+ "step": 3460
+ },
+ {
+ "epoch": 1.6364066193853428,
+ "grad_norm": 2.6041572093963623,
+ "learning_rate": 4.171125972055477e-06,
+ "loss": 0.4637,
+ "step": 3461
+ },
+ {
+ "epoch": 1.6368794326241134,
+ "grad_norm": 2.2297258377075195,
+ "learning_rate": 4.170661944519543e-06,
+ "loss": 0.4702,
+ "step": 3462
+ },
+ {
+ "epoch": 1.6373522458628842,
+ "grad_norm": 2.6764535903930664,
+ "learning_rate": 4.170197812958815e-06,
+ "loss": 0.5111,
+ "step": 3463
+ },
+ {
+ "epoch": 1.637825059101655,
+ "grad_norm": 2.86892032623291,
+ "learning_rate": 4.169733577402193e-06,
+ "loss": 0.5437,
+ "step": 3464
+ },
+ {
+ "epoch": 1.6382978723404256,
+ "grad_norm": 2.9007070064544678,
+ "learning_rate": 4.1692692378785825e-06,
+ "loss": 0.5425,
+ "step": 3465
+ },
+ {
+ "epoch": 1.6387706855791961,
+ "grad_norm": 2.5902905464172363,
+ "learning_rate": 4.168804794416896e-06,
+ "loss": 0.5252,
+ "step": 3466
+ },
+ {
+ "epoch": 1.639243498817967,
+ "grad_norm": 2.821183681488037,
+ "learning_rate": 4.168340247046053e-06,
+ "loss": 0.5265,
+ "step": 3467
+ },
+ {
+ "epoch": 1.6397163120567377,
+ "grad_norm": 2.7928314208984375,
+ "learning_rate": 4.167875595794978e-06,
+ "loss": 0.5151,
+ "step": 3468
+ },
+ {
+ "epoch": 1.6401891252955083,
+ "grad_norm": 2.3130412101745605,
+ "learning_rate": 4.167410840692603e-06,
+ "loss": 0.4941,
+ "step": 3469
+ },
+ {
+ "epoch": 1.6406619385342789,
+ "grad_norm": 2.6078619956970215,
+ "learning_rate": 4.1669459817678655e-06,
+ "loss": 0.493,
+ "step": 3470
+ },
+ {
+ "epoch": 1.6411347517730497,
+ "grad_norm": 2.5335731506347656,
+ "learning_rate": 4.166481019049712e-06,
+ "loss": 0.4969,
+ "step": 3471
+ },
+ {
+ "epoch": 1.6416075650118205,
+ "grad_norm": 2.8181469440460205,
+ "learning_rate": 4.166015952567093e-06,
+ "loss": 0.5062,
+ "step": 3472
+ },
+ {
+ "epoch": 1.642080378250591,
+ "grad_norm": 2.7256782054901123,
+ "learning_rate": 4.165550782348966e-06,
+ "loss": 0.5397,
+ "step": 3473
+ },
+ {
+ "epoch": 1.6425531914893616,
+ "grad_norm": 2.284345865249634,
+ "learning_rate": 4.1650855084242946e-06,
+ "loss": 0.4448,
+ "step": 3474
+ },
+ {
+ "epoch": 1.6430260047281324,
+ "grad_norm": 3.0383145809173584,
+ "learning_rate": 4.164620130822049e-06,
+ "loss": 0.5873,
+ "step": 3475
+ },
+ {
+ "epoch": 1.6434988179669032,
+ "grad_norm": 2.754448652267456,
+ "learning_rate": 4.1641546495712085e-06,
+ "loss": 0.4852,
+ "step": 3476
+ },
+ {
+ "epoch": 1.6439716312056738,
+ "grad_norm": 2.6820101737976074,
+ "learning_rate": 4.1636890647007535e-06,
+ "loss": 0.5325,
+ "step": 3477
+ },
+ {
+ "epoch": 1.6444444444444444,
+ "grad_norm": 2.6396398544311523,
+ "learning_rate": 4.163223376239676e-06,
+ "loss": 0.466,
+ "step": 3478
+ },
+ {
+ "epoch": 1.6449172576832152,
+ "grad_norm": 2.395049810409546,
+ "learning_rate": 4.162757584216972e-06,
+ "loss": 0.4531,
+ "step": 3479
+ },
+ {
+ "epoch": 1.645390070921986,
+ "grad_norm": 2.596670627593994,
+ "learning_rate": 4.162291688661645e-06,
+ "loss": 0.5207,
+ "step": 3480
+ },
+ {
+ "epoch": 1.6458628841607565,
+ "grad_norm": 2.4391872882843018,
+ "learning_rate": 4.161825689602703e-06,
+ "loss": 0.5133,
+ "step": 3481
+ },
+ {
+ "epoch": 1.6463356973995271,
+ "grad_norm": 2.6169841289520264,
+ "learning_rate": 4.161359587069162e-06,
+ "loss": 0.5096,
+ "step": 3482
+ },
+ {
+ "epoch": 1.646808510638298,
+ "grad_norm": 2.634089946746826,
+ "learning_rate": 4.1608933810900445e-06,
+ "loss": 0.4921,
+ "step": 3483
+ },
+ {
+ "epoch": 1.6472813238770687,
+ "grad_norm": 2.815877914428711,
+ "learning_rate": 4.160427071694379e-06,
+ "loss": 0.5045,
+ "step": 3484
+ },
+ {
+ "epoch": 1.6477541371158393,
+ "grad_norm": 2.417525053024292,
+ "learning_rate": 4.159960658911199e-06,
+ "loss": 0.4997,
+ "step": 3485
+ },
+ {
+ "epoch": 1.6482269503546099,
+ "grad_norm": 2.5713605880737305,
+ "learning_rate": 4.15949414276955e-06,
+ "loss": 0.5246,
+ "step": 3486
+ },
+ {
+ "epoch": 1.6486997635933807,
+ "grad_norm": 3.49833607673645,
+ "learning_rate": 4.159027523298475e-06,
+ "loss": 0.4901,
+ "step": 3487
+ },
+ {
+ "epoch": 1.6491725768321515,
+ "grad_norm": 2.985464334487915,
+ "learning_rate": 4.158560800527033e-06,
+ "loss": 0.5726,
+ "step": 3488
+ },
+ {
+ "epoch": 1.649645390070922,
+ "grad_norm": 2.72745680809021,
+ "learning_rate": 4.158093974484282e-06,
+ "loss": 0.5119,
+ "step": 3489
+ },
+ {
+ "epoch": 1.6501182033096926,
+ "grad_norm": 2.4885571002960205,
+ "learning_rate": 4.157627045199289e-06,
+ "loss": 0.4838,
+ "step": 3490
+ },
+ {
+ "epoch": 1.6505910165484634,
+ "grad_norm": 2.7622628211975098,
+ "learning_rate": 4.157160012701128e-06,
+ "loss": 0.5269,
+ "step": 3491
+ },
+ {
+ "epoch": 1.6510638297872342,
+ "grad_norm": 2.615122079849243,
+ "learning_rate": 4.156692877018879e-06,
+ "loss": 0.5501,
+ "step": 3492
+ },
+ {
+ "epoch": 1.6515366430260048,
+ "grad_norm": 2.827753782272339,
+ "learning_rate": 4.156225638181631e-06,
+ "loss": 0.5452,
+ "step": 3493
+ },
+ {
+ "epoch": 1.6520094562647754,
+ "grad_norm": 2.724820137023926,
+ "learning_rate": 4.155758296218474e-06,
+ "loss": 0.5155,
+ "step": 3494
+ },
+ {
+ "epoch": 1.6524822695035462,
+ "grad_norm": 2.5806174278259277,
+ "learning_rate": 4.155290851158508e-06,
+ "loss": 0.5292,
+ "step": 3495
+ },
+ {
+ "epoch": 1.652955082742317,
+ "grad_norm": 2.5655179023742676,
+ "learning_rate": 4.154823303030838e-06,
+ "loss": 0.4959,
+ "step": 3496
+ },
+ {
+ "epoch": 1.6534278959810875,
+ "grad_norm": 2.656548261642456,
+ "learning_rate": 4.154355651864579e-06,
+ "loss": 0.5703,
+ "step": 3497
+ },
+ {
+ "epoch": 1.653900709219858,
+ "grad_norm": 2.9085004329681396,
+ "learning_rate": 4.153887897688847e-06,
+ "loss": 0.5061,
+ "step": 3498
+ },
+ {
+ "epoch": 1.654373522458629,
+ "grad_norm": 2.608010768890381,
+ "learning_rate": 4.1534200405327665e-06,
+ "loss": 0.5165,
+ "step": 3499
+ },
+ {
+ "epoch": 1.6548463356973995,
+ "grad_norm": 2.600463628768921,
+ "learning_rate": 4.152952080425471e-06,
+ "loss": 0.4946,
+ "step": 3500
+ },
+ {
+ "epoch": 1.65531914893617,
+ "grad_norm": 2.5561563968658447,
+ "learning_rate": 4.152484017396098e-06,
+ "loss": 0.4804,
+ "step": 3501
+ },
+ {
+ "epoch": 1.6557919621749408,
+ "grad_norm": 2.788594961166382,
+ "learning_rate": 4.152015851473791e-06,
+ "loss": 0.5635,
+ "step": 3502
+ },
+ {
+ "epoch": 1.6562647754137116,
+ "grad_norm": 2.693302631378174,
+ "learning_rate": 4.151547582687699e-06,
+ "loss": 0.5139,
+ "step": 3503
+ },
+ {
+ "epoch": 1.6567375886524822,
+ "grad_norm": 2.7887485027313232,
+ "learning_rate": 4.1510792110669825e-06,
+ "loss": 0.4952,
+ "step": 3504
+ },
+ {
+ "epoch": 1.6572104018912528,
+ "grad_norm": 2.8982298374176025,
+ "learning_rate": 4.150610736640803e-06,
+ "loss": 0.4136,
+ "step": 3505
+ },
+ {
+ "epoch": 1.6576832151300236,
+ "grad_norm": 2.7569408416748047,
+ "learning_rate": 4.150142159438331e-06,
+ "loss": 0.5272,
+ "step": 3506
+ },
+ {
+ "epoch": 1.6581560283687944,
+ "grad_norm": 2.531648874282837,
+ "learning_rate": 4.149673479488742e-06,
+ "loss": 0.5016,
+ "step": 3507
+ },
+ {
+ "epoch": 1.658628841607565,
+ "grad_norm": 2.7706353664398193,
+ "learning_rate": 4.149204696821219e-06,
+ "loss": 0.5512,
+ "step": 3508
+ },
+ {
+ "epoch": 1.6591016548463355,
+ "grad_norm": 2.7307450771331787,
+ "learning_rate": 4.148735811464951e-06,
+ "loss": 0.4968,
+ "step": 3509
+ },
+ {
+ "epoch": 1.6595744680851063,
+ "grad_norm": 3.0097429752349854,
+ "learning_rate": 4.1482668234491335e-06,
+ "loss": 0.4797,
+ "step": 3510
+ },
+ {
+ "epoch": 1.6600472813238771,
+ "grad_norm": 2.6045308113098145,
+ "learning_rate": 4.147797732802969e-06,
+ "loss": 0.5496,
+ "step": 3511
+ },
+ {
+ "epoch": 1.6605200945626477,
+ "grad_norm": 2.702061176300049,
+ "learning_rate": 4.147328539555664e-06,
+ "loss": 0.5302,
+ "step": 3512
+ },
+ {
+ "epoch": 1.6609929078014183,
+ "grad_norm": 3.3724892139434814,
+ "learning_rate": 4.1468592437364356e-06,
+ "loss": 0.5124,
+ "step": 3513
+ },
+ {
+ "epoch": 1.661465721040189,
+ "grad_norm": 2.5117242336273193,
+ "learning_rate": 4.146389845374502e-06,
+ "loss": 0.4953,
+ "step": 3514
+ },
+ {
+ "epoch": 1.6619385342789599,
+ "grad_norm": 2.86547589302063,
+ "learning_rate": 4.145920344499092e-06,
+ "loss": 0.5337,
+ "step": 3515
+ },
+ {
+ "epoch": 1.6624113475177305,
+ "grad_norm": 2.745149850845337,
+ "learning_rate": 4.14545074113944e-06,
+ "loss": 0.5187,
+ "step": 3516
+ },
+ {
+ "epoch": 1.662884160756501,
+ "grad_norm": 2.5560994148254395,
+ "learning_rate": 4.1449810353247855e-06,
+ "loss": 0.5183,
+ "step": 3517
+ },
+ {
+ "epoch": 1.6633569739952718,
+ "grad_norm": 2.2318122386932373,
+ "learning_rate": 4.144511227084374e-06,
+ "loss": 0.4452,
+ "step": 3518
+ },
+ {
+ "epoch": 1.6638297872340426,
+ "grad_norm": 2.6980903148651123,
+ "learning_rate": 4.14404131644746e-06,
+ "loss": 0.4974,
+ "step": 3519
+ },
+ {
+ "epoch": 1.6643026004728132,
+ "grad_norm": 2.6875357627868652,
+ "learning_rate": 4.1435713034433025e-06,
+ "loss": 0.4582,
+ "step": 3520
+ },
+ {
+ "epoch": 1.6647754137115838,
+ "grad_norm": 2.9430019855499268,
+ "learning_rate": 4.143101188101166e-06,
+ "loss": 0.5004,
+ "step": 3521
+ },
+ {
+ "epoch": 1.6652482269503546,
+ "grad_norm": 2.4447221755981445,
+ "learning_rate": 4.142630970450323e-06,
+ "loss": 0.5436,
+ "step": 3522
+ },
+ {
+ "epoch": 1.6657210401891254,
+ "grad_norm": 2.571023941040039,
+ "learning_rate": 4.142160650520053e-06,
+ "loss": 0.5307,
+ "step": 3523
+ },
+ {
+ "epoch": 1.666193853427896,
+ "grad_norm": 2.9725306034088135,
+ "learning_rate": 4.14169022833964e-06,
+ "loss": 0.5918,
+ "step": 3524
+ },
+ {
+ "epoch": 1.6666666666666665,
+ "grad_norm": 2.5958926677703857,
+ "learning_rate": 4.141219703938375e-06,
+ "loss": 0.5036,
+ "step": 3525
+ },
+ {
+ "epoch": 1.6671394799054373,
+ "grad_norm": 2.935788631439209,
+ "learning_rate": 4.140749077345556e-06,
+ "loss": 0.5773,
+ "step": 3526
+ },
+ {
+ "epoch": 1.6676122931442081,
+ "grad_norm": 2.5460526943206787,
+ "learning_rate": 4.140278348590485e-06,
+ "loss": 0.4762,
+ "step": 3527
+ },
+ {
+ "epoch": 1.6680851063829787,
+ "grad_norm": 2.5729143619537354,
+ "learning_rate": 4.139807517702475e-06,
+ "loss": 0.5515,
+ "step": 3528
+ },
+ {
+ "epoch": 1.6685579196217493,
+ "grad_norm": 2.4377381801605225,
+ "learning_rate": 4.13933658471084e-06,
+ "loss": 0.5383,
+ "step": 3529
+ },
+ {
+ "epoch": 1.66903073286052,
+ "grad_norm": 2.6284425258636475,
+ "learning_rate": 4.138865549644905e-06,
+ "loss": 0.5396,
+ "step": 3530
+ },
+ {
+ "epoch": 1.6695035460992909,
+ "grad_norm": 2.857250928878784,
+ "learning_rate": 4.138394412533998e-06,
+ "loss": 0.5861,
+ "step": 3531
+ },
+ {
+ "epoch": 1.6699763593380614,
+ "grad_norm": 2.9226012229919434,
+ "learning_rate": 4.137923173407456e-06,
+ "loss": 0.5262,
+ "step": 3532
+ },
+ {
+ "epoch": 1.670449172576832,
+ "grad_norm": 4.839131832122803,
+ "learning_rate": 4.137451832294619e-06,
+ "loss": 0.651,
+ "step": 3533
+ },
+ {
+ "epoch": 1.6709219858156028,
+ "grad_norm": 2.4727771282196045,
+ "learning_rate": 4.1369803892248375e-06,
+ "loss": 0.5149,
+ "step": 3534
+ },
+ {
+ "epoch": 1.6713947990543736,
+ "grad_norm": 2.5391688346862793,
+ "learning_rate": 4.1365088442274635e-06,
+ "loss": 0.4907,
+ "step": 3535
+ },
+ {
+ "epoch": 1.6718676122931442,
+ "grad_norm": 2.5168209075927734,
+ "learning_rate": 4.136037197331862e-06,
+ "loss": 0.5091,
+ "step": 3536
+ },
+ {
+ "epoch": 1.6723404255319148,
+ "grad_norm": 2.6278600692749023,
+ "learning_rate": 4.135565448567396e-06,
+ "loss": 0.4357,
+ "step": 3537
+ },
+ {
+ "epoch": 1.6728132387706856,
+ "grad_norm": 2.835184097290039,
+ "learning_rate": 4.135093597963441e-06,
+ "loss": 0.4786,
+ "step": 3538
+ },
+ {
+ "epoch": 1.6732860520094563,
+ "grad_norm": 2.385328531265259,
+ "learning_rate": 4.134621645549379e-06,
+ "loss": 0.4849,
+ "step": 3539
+ },
+ {
+ "epoch": 1.673758865248227,
+ "grad_norm": 2.6504149436950684,
+ "learning_rate": 4.134149591354593e-06,
+ "loss": 0.6037,
+ "step": 3540
+ },
+ {
+ "epoch": 1.6742316784869975,
+ "grad_norm": 2.945634126663208,
+ "learning_rate": 4.1336774354084786e-06,
+ "loss": 0.532,
+ "step": 3541
+ },
+ {
+ "epoch": 1.6747044917257683,
+ "grad_norm": 2.8373215198516846,
+ "learning_rate": 4.133205177740434e-06,
+ "loss": 0.5138,
+ "step": 3542
+ },
+ {
+ "epoch": 1.675177304964539,
+ "grad_norm": 2.6616621017456055,
+ "learning_rate": 4.1327328183798634e-06,
+ "loss": 0.5543,
+ "step": 3543
+ },
+ {
+ "epoch": 1.6756501182033097,
+ "grad_norm": 3.0843071937561035,
+ "learning_rate": 4.13226035735618e-06,
+ "loss": 0.6585,
+ "step": 3544
+ },
+ {
+ "epoch": 1.6761229314420802,
+ "grad_norm": 2.2214272022247314,
+ "learning_rate": 4.131787794698802e-06,
+ "loss": 0.5413,
+ "step": 3545
+ },
+ {
+ "epoch": 1.676595744680851,
+ "grad_norm": 2.4515018463134766,
+ "learning_rate": 4.131315130437152e-06,
+ "loss": 0.4966,
+ "step": 3546
+ },
+ {
+ "epoch": 1.6770685579196218,
+ "grad_norm": 2.647414207458496,
+ "learning_rate": 4.130842364600663e-06,
+ "loss": 0.5401,
+ "step": 3547
+ },
+ {
+ "epoch": 1.6775413711583924,
+ "grad_norm": 2.648941993713379,
+ "learning_rate": 4.13036949721877e-06,
+ "loss": 0.4796,
+ "step": 3548
+ },
+ {
+ "epoch": 1.678014184397163,
+ "grad_norm": 2.7835679054260254,
+ "learning_rate": 4.129896528320919e-06,
+ "loss": 0.5653,
+ "step": 3549
+ },
+ {
+ "epoch": 1.6784869976359338,
+ "grad_norm": 2.995964288711548,
+ "learning_rate": 4.129423457936556e-06,
+ "loss": 0.4999,
+ "step": 3550
+ },
+ {
+ "epoch": 1.6789598108747046,
+ "grad_norm": 2.5980007648468018,
+ "learning_rate": 4.1289502860951405e-06,
+ "loss": 0.5177,
+ "step": 3551
+ },
+ {
+ "epoch": 1.6794326241134752,
+ "grad_norm": 2.442254066467285,
+ "learning_rate": 4.128477012826133e-06,
+ "loss": 0.5062,
+ "step": 3552
+ },
+ {
+ "epoch": 1.6799054373522457,
+ "grad_norm": 2.3007538318634033,
+ "learning_rate": 4.1280036381590025e-06,
+ "loss": 0.5029,
+ "step": 3553
+ },
+ {
+ "epoch": 1.6803782505910165,
+ "grad_norm": 2.4169347286224365,
+ "learning_rate": 4.1275301621232245e-06,
+ "loss": 0.515,
+ "step": 3554
+ },
+ {
+ "epoch": 1.6808510638297873,
+ "grad_norm": 2.6456379890441895,
+ "learning_rate": 4.127056584748279e-06,
+ "loss": 0.5343,
+ "step": 3555
+ },
+ {
+ "epoch": 1.681323877068558,
+ "grad_norm": 2.6406595706939697,
+ "learning_rate": 4.1265829060636546e-06,
+ "loss": 0.5047,
+ "step": 3556
+ },
+ {
+ "epoch": 1.6817966903073285,
+ "grad_norm": 2.9344475269317627,
+ "learning_rate": 4.126109126098846e-06,
+ "loss": 0.5501,
+ "step": 3557
+ },
+ {
+ "epoch": 1.6822695035460993,
+ "grad_norm": 2.3292455673217773,
+ "learning_rate": 4.125635244883351e-06,
+ "loss": 0.463,
+ "step": 3558
+ },
+ {
+ "epoch": 1.68274231678487,
+ "grad_norm": 2.4150657653808594,
+ "learning_rate": 4.125161262446677e-06,
+ "loss": 0.4802,
+ "step": 3559
+ },
+ {
+ "epoch": 1.6832151300236406,
+ "grad_norm": 2.604292392730713,
+ "learning_rate": 4.124687178818339e-06,
+ "loss": 0.5683,
+ "step": 3560
+ },
+ {
+ "epoch": 1.6836879432624112,
+ "grad_norm": 2.5676791667938232,
+ "learning_rate": 4.1242129940278544e-06,
+ "loss": 0.5519,
+ "step": 3561
+ },
+ {
+ "epoch": 1.684160756501182,
+ "grad_norm": 3.078514814376831,
+ "learning_rate": 4.123738708104748e-06,
+ "loss": 0.5194,
+ "step": 3562
+ },
+ {
+ "epoch": 1.6846335697399528,
+ "grad_norm": 2.893577814102173,
+ "learning_rate": 4.123264321078552e-06,
+ "loss": 0.5107,
+ "step": 3563
+ },
+ {
+ "epoch": 1.6851063829787234,
+ "grad_norm": 2.772413730621338,
+ "learning_rate": 4.122789832978804e-06,
+ "loss": 0.6147,
+ "step": 3564
+ },
+ {
+ "epoch": 1.685579196217494,
+ "grad_norm": 2.5804643630981445,
+ "learning_rate": 4.12231524383505e-06,
+ "loss": 0.5057,
+ "step": 3565
+ },
+ {
+ "epoch": 1.6860520094562648,
+ "grad_norm": 2.599571466445923,
+ "learning_rate": 4.121840553676839e-06,
+ "loss": 0.5591,
+ "step": 3566
+ },
+ {
+ "epoch": 1.6865248226950356,
+ "grad_norm": 2.9124577045440674,
+ "learning_rate": 4.1213657625337275e-06,
+ "loss": 0.565,
+ "step": 3567
+ },
+ {
+ "epoch": 1.6869976359338061,
+ "grad_norm": 2.6582155227661133,
+ "learning_rate": 4.120890870435281e-06,
+ "loss": 0.4607,
+ "step": 3568
+ },
+ {
+ "epoch": 1.6874704491725767,
+ "grad_norm": 2.929227590560913,
+ "learning_rate": 4.120415877411066e-06,
+ "loss": 0.5705,
+ "step": 3569
+ },
+ {
+ "epoch": 1.6879432624113475,
+ "grad_norm": 2.4443247318267822,
+ "learning_rate": 4.11994078349066e-06,
+ "loss": 0.4592,
+ "step": 3570
+ },
+ {
+ "epoch": 1.6884160756501183,
+ "grad_norm": 2.4799163341522217,
+ "learning_rate": 4.119465588703645e-06,
+ "loss": 0.5361,
+ "step": 3571
+ },
+ {
+ "epoch": 1.6888888888888889,
+ "grad_norm": 2.9408936500549316,
+ "learning_rate": 4.1189902930796085e-06,
+ "loss": 0.5347,
+ "step": 3572
+ },
+ {
+ "epoch": 1.6893617021276595,
+ "grad_norm": 3.3348076343536377,
+ "learning_rate": 4.118514896648146e-06,
+ "loss": 0.5612,
+ "step": 3573
+ },
+ {
+ "epoch": 1.6898345153664303,
+ "grad_norm": 2.764889717102051,
+ "learning_rate": 4.118039399438857e-06,
+ "loss": 0.4745,
+ "step": 3574
+ },
+ {
+ "epoch": 1.690307328605201,
+ "grad_norm": 2.7023751735687256,
+ "learning_rate": 4.11756380148135e-06,
+ "loss": 0.5106,
+ "step": 3575
+ },
+ {
+ "epoch": 1.6907801418439716,
+ "grad_norm": 2.8816208839416504,
+ "learning_rate": 4.117088102805238e-06,
+ "loss": 0.6016,
+ "step": 3576
+ },
+ {
+ "epoch": 1.6912529550827422,
+ "grad_norm": 2.215733289718628,
+ "learning_rate": 4.11661230344014e-06,
+ "loss": 0.4404,
+ "step": 3577
+ },
+ {
+ "epoch": 1.691725768321513,
+ "grad_norm": 2.8190999031066895,
+ "learning_rate": 4.116136403415683e-06,
+ "loss": 0.5038,
+ "step": 3578
+ },
+ {
+ "epoch": 1.6921985815602838,
+ "grad_norm": 2.616424083709717,
+ "learning_rate": 4.115660402761499e-06,
+ "loss": 0.5493,
+ "step": 3579
+ },
+ {
+ "epoch": 1.6926713947990544,
+ "grad_norm": 2.7738113403320312,
+ "learning_rate": 4.115184301507226e-06,
+ "loss": 0.5416,
+ "step": 3580
+ },
+ {
+ "epoch": 1.693144208037825,
+ "grad_norm": 2.4793593883514404,
+ "learning_rate": 4.114708099682509e-06,
+ "loss": 0.4526,
+ "step": 3581
+ },
+ {
+ "epoch": 1.6936170212765957,
+ "grad_norm": 2.390652894973755,
+ "learning_rate": 4.114231797316999e-06,
+ "loss": 0.4908,
+ "step": 3582
+ },
+ {
+ "epoch": 1.6940898345153665,
+ "grad_norm": 2.513197660446167,
+ "learning_rate": 4.113755394440352e-06,
+ "loss": 0.4738,
+ "step": 3583
+ },
+ {
+ "epoch": 1.6945626477541371,
+ "grad_norm": 2.504497766494751,
+ "learning_rate": 4.113278891082234e-06,
+ "loss": 0.4661,
+ "step": 3584
+ },
+ {
+ "epoch": 1.6950354609929077,
+ "grad_norm": 2.4966917037963867,
+ "learning_rate": 4.112802287272314e-06,
+ "loss": 0.4979,
+ "step": 3585
+ },
+ {
+ "epoch": 1.6955082742316785,
+ "grad_norm": 2.3129689693450928,
+ "learning_rate": 4.112325583040265e-06,
+ "loss": 0.4933,
+ "step": 3586
+ },
+ {
+ "epoch": 1.6959810874704493,
+ "grad_norm": 2.822136878967285,
+ "learning_rate": 4.111848778415774e-06,
+ "loss": 0.5087,
+ "step": 3587
+ },
+ {
+ "epoch": 1.6964539007092199,
+ "grad_norm": 2.5181210041046143,
+ "learning_rate": 4.111371873428527e-06,
+ "loss": 0.4836,
+ "step": 3588
+ },
+ {
+ "epoch": 1.6969267139479904,
+ "grad_norm": 2.7564687728881836,
+ "learning_rate": 4.110894868108218e-06,
+ "loss": 0.5224,
+ "step": 3589
+ },
+ {
+ "epoch": 1.6973995271867612,
+ "grad_norm": 2.424421787261963,
+ "learning_rate": 4.11041776248455e-06,
+ "loss": 0.4552,
+ "step": 3590
+ },
+ {
+ "epoch": 1.697872340425532,
+ "grad_norm": 2.7013823986053467,
+ "learning_rate": 4.10994055658723e-06,
+ "loss": 0.5535,
+ "step": 3591
+ },
+ {
+ "epoch": 1.6983451536643026,
+ "grad_norm": 2.5660946369171143,
+ "learning_rate": 4.10946325044597e-06,
+ "loss": 0.5351,
+ "step": 3592
+ },
+ {
+ "epoch": 1.6988179669030732,
+ "grad_norm": 2.5598108768463135,
+ "learning_rate": 4.10898584409049e-06,
+ "loss": 0.5246,
+ "step": 3593
+ },
+ {
+ "epoch": 1.699290780141844,
+ "grad_norm": 2.6318907737731934,
+ "learning_rate": 4.108508337550518e-06,
+ "loss": 0.5002,
+ "step": 3594
+ },
+ {
+ "epoch": 1.6997635933806148,
+ "grad_norm": 2.527099132537842,
+ "learning_rate": 4.108030730855784e-06,
+ "loss": 0.5366,
+ "step": 3595
+ },
+ {
+ "epoch": 1.7002364066193854,
+ "grad_norm": 2.8629603385925293,
+ "learning_rate": 4.107553024036029e-06,
+ "loss": 0.5742,
+ "step": 3596
+ },
+ {
+ "epoch": 1.700709219858156,
+ "grad_norm": 2.8084018230438232,
+ "learning_rate": 4.107075217120994e-06,
+ "loss": 0.5618,
+ "step": 3597
+ },
+ {
+ "epoch": 1.7011820330969267,
+ "grad_norm": 3.6470065116882324,
+ "learning_rate": 4.1065973101404325e-06,
+ "loss": 0.508,
+ "step": 3598
+ },
+ {
+ "epoch": 1.7016548463356975,
+ "grad_norm": 3.0332422256469727,
+ "learning_rate": 4.106119303124102e-06,
+ "loss": 0.51,
+ "step": 3599
+ },
+ {
+ "epoch": 1.702127659574468,
+ "grad_norm": 2.4887590408325195,
+ "learning_rate": 4.105641196101765e-06,
+ "loss": 0.5109,
+ "step": 3600
+ },
+ {
+ "epoch": 1.7026004728132387,
+ "grad_norm": 2.6102066040039062,
+ "learning_rate": 4.105162989103191e-06,
+ "loss": 0.5278,
+ "step": 3601
+ },
+ {
+ "epoch": 1.7030732860520095,
+ "grad_norm": 2.771578073501587,
+ "learning_rate": 4.104684682158156e-06,
+ "loss": 0.498,
+ "step": 3602
+ },
+ {
+ "epoch": 1.7035460992907803,
+ "grad_norm": 2.5452702045440674,
+ "learning_rate": 4.1042062752964425e-06,
+ "loss": 0.4939,
+ "step": 3603
+ },
+ {
+ "epoch": 1.7040189125295508,
+ "grad_norm": 2.4287021160125732,
+ "learning_rate": 4.103727768547838e-06,
+ "loss": 0.4819,
+ "step": 3604
+ },
+ {
+ "epoch": 1.7044917257683214,
+ "grad_norm": 2.412280321121216,
+ "learning_rate": 4.103249161942138e-06,
+ "loss": 0.5196,
+ "step": 3605
+ },
+ {
+ "epoch": 1.7049645390070922,
+ "grad_norm": 2.8850717544555664,
+ "learning_rate": 4.102770455509142e-06,
+ "loss": 0.5724,
+ "step": 3606
+ },
+ {
+ "epoch": 1.705437352245863,
+ "grad_norm": 2.7979609966278076,
+ "learning_rate": 4.102291649278659e-06,
+ "loss": 0.5295,
+ "step": 3607
+ },
+ {
+ "epoch": 1.7059101654846336,
+ "grad_norm": 2.762238025665283,
+ "learning_rate": 4.1018127432805e-06,
+ "loss": 0.5166,
+ "step": 3608
+ },
+ {
+ "epoch": 1.7063829787234042,
+ "grad_norm": 2.921586513519287,
+ "learning_rate": 4.101333737544485e-06,
+ "loss": 0.5607,
+ "step": 3609
+ },
+ {
+ "epoch": 1.706855791962175,
+ "grad_norm": 3.001929998397827,
+ "learning_rate": 4.100854632100439e-06,
+ "loss": 0.6255,
+ "step": 3610
+ },
+ {
+ "epoch": 1.7073286052009458,
+ "grad_norm": 2.752713918685913,
+ "learning_rate": 4.100375426978196e-06,
+ "loss": 0.5732,
+ "step": 3611
+ },
+ {
+ "epoch": 1.7078014184397163,
+ "grad_norm": 2.6496472358703613,
+ "learning_rate": 4.099896122207593e-06,
+ "loss": 0.5138,
+ "step": 3612
+ },
+ {
+ "epoch": 1.708274231678487,
+ "grad_norm": 3.0079452991485596,
+ "learning_rate": 4.099416717818473e-06,
+ "loss": 0.5746,
+ "step": 3613
+ },
+ {
+ "epoch": 1.7087470449172577,
+ "grad_norm": 2.5762360095977783,
+ "learning_rate": 4.098937213840687e-06,
+ "loss": 0.5308,
+ "step": 3614
+ },
+ {
+ "epoch": 1.7092198581560285,
+ "grad_norm": 2.6026158332824707,
+ "learning_rate": 4.098457610304092e-06,
+ "loss": 0.4857,
+ "step": 3615
+ },
+ {
+ "epoch": 1.709692671394799,
+ "grad_norm": 2.587583541870117,
+ "learning_rate": 4.097977907238551e-06,
+ "loss": 0.4591,
+ "step": 3616
+ },
+ {
+ "epoch": 1.7101654846335697,
+ "grad_norm": 2.6996991634368896,
+ "learning_rate": 4.097498104673932e-06,
+ "loss": 0.5298,
+ "step": 3617
+ },
+ {
+ "epoch": 1.7106382978723405,
+ "grad_norm": 2.600029945373535,
+ "learning_rate": 4.097018202640111e-06,
+ "loss": 0.4726,
+ "step": 3618
+ },
+ {
+ "epoch": 1.7111111111111112,
+ "grad_norm": 2.8261220455169678,
+ "learning_rate": 4.096538201166969e-06,
+ "loss": 0.5242,
+ "step": 3619
+ },
+ {
+ "epoch": 1.7115839243498818,
+ "grad_norm": 3.053027629852295,
+ "learning_rate": 4.096058100284394e-06,
+ "loss": 0.5568,
+ "step": 3620
+ },
+ {
+ "epoch": 1.7120567375886524,
+ "grad_norm": 2.9638442993164062,
+ "learning_rate": 4.0955779000222805e-06,
+ "loss": 0.5325,
+ "step": 3621
+ },
+ {
+ "epoch": 1.7125295508274232,
+ "grad_norm": 2.731095790863037,
+ "learning_rate": 4.095097600410527e-06,
+ "loss": 0.4733,
+ "step": 3622
+ },
+ {
+ "epoch": 1.713002364066194,
+ "grad_norm": 2.632490873336792,
+ "learning_rate": 4.09461720147904e-06,
+ "loss": 0.5253,
+ "step": 3623
+ },
+ {
+ "epoch": 1.7134751773049646,
+ "grad_norm": 2.847689390182495,
+ "learning_rate": 4.094136703257732e-06,
+ "loss": 0.57,
+ "step": 3624
+ },
+ {
+ "epoch": 1.7139479905437351,
+ "grad_norm": 3.1078696250915527,
+ "learning_rate": 4.0936561057765215e-06,
+ "loss": 0.5368,
+ "step": 3625
+ },
+ {
+ "epoch": 1.714420803782506,
+ "grad_norm": 2.696349620819092,
+ "learning_rate": 4.0931754090653334e-06,
+ "loss": 0.491,
+ "step": 3626
+ },
+ {
+ "epoch": 1.7148936170212767,
+ "grad_norm": 2.712958812713623,
+ "learning_rate": 4.092694613154099e-06,
+ "loss": 0.5768,
+ "step": 3627
+ },
+ {
+ "epoch": 1.7153664302600473,
+ "grad_norm": 2.5421478748321533,
+ "learning_rate": 4.092213718072754e-06,
+ "loss": 0.4839,
+ "step": 3628
+ },
+ {
+ "epoch": 1.715839243498818,
+ "grad_norm": 2.5176162719726562,
+ "learning_rate": 4.091732723851243e-06,
+ "loss": 0.5049,
+ "step": 3629
+ },
+ {
+ "epoch": 1.7163120567375887,
+ "grad_norm": 2.642185926437378,
+ "learning_rate": 4.091251630519514e-06,
+ "loss": 0.589,
+ "step": 3630
+ },
+ {
+ "epoch": 1.7167848699763595,
+ "grad_norm": 2.587348461151123,
+ "learning_rate": 4.0907704381075245e-06,
+ "loss": 0.5281,
+ "step": 3631
+ },
+ {
+ "epoch": 1.71725768321513,
+ "grad_norm": 2.4628195762634277,
+ "learning_rate": 4.090289146645234e-06,
+ "loss": 0.5592,
+ "step": 3632
+ },
+ {
+ "epoch": 1.7177304964539006,
+ "grad_norm": 2.2751028537750244,
+ "learning_rate": 4.0898077561626125e-06,
+ "loss": 0.502,
+ "step": 3633
+ },
+ {
+ "epoch": 1.7182033096926714,
+ "grad_norm": 2.7712769508361816,
+ "learning_rate": 4.089326266689632e-06,
+ "loss": 0.5143,
+ "step": 3634
+ },
+ {
+ "epoch": 1.7186761229314422,
+ "grad_norm": 2.5297727584838867,
+ "learning_rate": 4.088844678256275e-06,
+ "loss": 0.5035,
+ "step": 3635
+ },
+ {
+ "epoch": 1.7191489361702128,
+ "grad_norm": 2.739130735397339,
+ "learning_rate": 4.088362990892527e-06,
+ "loss": 0.5959,
+ "step": 3636
+ },
+ {
+ "epoch": 1.7196217494089834,
+ "grad_norm": 2.3708314895629883,
+ "learning_rate": 4.08788120462838e-06,
+ "loss": 0.4796,
+ "step": 3637
+ },
+ {
+ "epoch": 1.7200945626477542,
+ "grad_norm": 2.7664241790771484,
+ "learning_rate": 4.087399319493832e-06,
+ "loss": 0.6052,
+ "step": 3638
+ },
+ {
+ "epoch": 1.720567375886525,
+ "grad_norm": 2.5900204181671143,
+ "learning_rate": 4.0869173355188895e-06,
+ "loss": 0.4955,
+ "step": 3639
+ },
+ {
+ "epoch": 1.7210401891252955,
+ "grad_norm": 2.6771862506866455,
+ "learning_rate": 4.0864352527335635e-06,
+ "loss": 0.4889,
+ "step": 3640
+ },
+ {
+ "epoch": 1.7215130023640661,
+ "grad_norm": 2.888479471206665,
+ "learning_rate": 4.085953071167871e-06,
+ "loss": 0.5719,
+ "step": 3641
+ },
+ {
+ "epoch": 1.721985815602837,
+ "grad_norm": 2.5967187881469727,
+ "learning_rate": 4.085470790851833e-06,
+ "loss": 0.4959,
+ "step": 3642
+ },
+ {
+ "epoch": 1.7224586288416077,
+ "grad_norm": 2.5317695140838623,
+ "learning_rate": 4.084988411815483e-06,
+ "loss": 0.4596,
+ "step": 3643
+ },
+ {
+ "epoch": 1.7229314420803783,
+ "grad_norm": 2.6531455516815186,
+ "learning_rate": 4.084505934088853e-06,
+ "loss": 0.5346,
+ "step": 3644
+ },
+ {
+ "epoch": 1.7234042553191489,
+ "grad_norm": 2.6525208950042725,
+ "learning_rate": 4.084023357701987e-06,
+ "loss": 0.5178,
+ "step": 3645
+ },
+ {
+ "epoch": 1.7238770685579197,
+ "grad_norm": 2.461954116821289,
+ "learning_rate": 4.083540682684932e-06,
+ "loss": 0.4802,
+ "step": 3646
+ },
+ {
+ "epoch": 1.7243498817966905,
+ "grad_norm": 2.794696807861328,
+ "learning_rate": 4.083057909067743e-06,
+ "loss": 0.5148,
+ "step": 3647
+ },
+ {
+ "epoch": 1.724822695035461,
+ "grad_norm": 2.867572546005249,
+ "learning_rate": 4.082575036880479e-06,
+ "loss": 0.5352,
+ "step": 3648
+ },
+ {
+ "epoch": 1.7252955082742316,
+ "grad_norm": 2.642820358276367,
+ "learning_rate": 4.082092066153207e-06,
+ "loss": 0.4652,
+ "step": 3649
+ },
+ {
+ "epoch": 1.7257683215130024,
+ "grad_norm": 2.782142400741577,
+ "learning_rate": 4.081608996915999e-06,
+ "loss": 0.5591,
+ "step": 3650
+ },
+ {
+ "epoch": 1.7262411347517732,
+ "grad_norm": 2.327331304550171,
+ "learning_rate": 4.081125829198934e-06,
+ "loss": 0.4339,
+ "step": 3651
+ },
+ {
+ "epoch": 1.7267139479905438,
+ "grad_norm": 2.7959988117218018,
+ "learning_rate": 4.0806425630320965e-06,
+ "loss": 0.5783,
+ "step": 3652
+ },
+ {
+ "epoch": 1.7271867612293144,
+ "grad_norm": 2.595053195953369,
+ "learning_rate": 4.080159198445578e-06,
+ "loss": 0.4602,
+ "step": 3653
+ },
+ {
+ "epoch": 1.7276595744680852,
+ "grad_norm": 3.0968129634857178,
+ "learning_rate": 4.079675735469475e-06,
+ "loss": 0.5775,
+ "step": 3654
+ },
+ {
+ "epoch": 1.728132387706856,
+ "grad_norm": 2.628044605255127,
+ "learning_rate": 4.07919217413389e-06,
+ "loss": 0.486,
+ "step": 3655
+ },
+ {
+ "epoch": 1.7286052009456265,
+ "grad_norm": 2.782799005508423,
+ "learning_rate": 4.078708514468933e-06,
+ "loss": 0.5282,
+ "step": 3656
+ },
+ {
+ "epoch": 1.729078014184397,
+ "grad_norm": 2.655365467071533,
+ "learning_rate": 4.0782247565047205e-06,
+ "loss": 0.4873,
+ "step": 3657
+ },
+ {
+ "epoch": 1.729550827423168,
+ "grad_norm": 2.9461584091186523,
+ "learning_rate": 4.077740900271371e-06,
+ "loss": 0.548,
+ "step": 3658
+ },
+ {
+ "epoch": 1.7300236406619387,
+ "grad_norm": 2.5094761848449707,
+ "learning_rate": 4.077256945799015e-06,
+ "loss": 0.5437,
+ "step": 3659
+ },
+ {
+ "epoch": 1.7304964539007093,
+ "grad_norm": 2.555793285369873,
+ "learning_rate": 4.0767728931177845e-06,
+ "loss": 0.5268,
+ "step": 3660
+ },
+ {
+ "epoch": 1.7309692671394799,
+ "grad_norm": 2.4433486461639404,
+ "learning_rate": 4.07628874225782e-06,
+ "loss": 0.5211,
+ "step": 3661
+ },
+ {
+ "epoch": 1.7314420803782506,
+ "grad_norm": 2.365206003189087,
+ "learning_rate": 4.075804493249267e-06,
+ "loss": 0.5084,
+ "step": 3662
+ },
+ {
+ "epoch": 1.7319148936170212,
+ "grad_norm": 2.514305830001831,
+ "learning_rate": 4.075320146122278e-06,
+ "loss": 0.4693,
+ "step": 3663
+ },
+ {
+ "epoch": 1.7323877068557918,
+ "grad_norm": 2.9270083904266357,
+ "learning_rate": 4.074835700907012e-06,
+ "loss": 0.5724,
+ "step": 3664
+ },
+ {
+ "epoch": 1.7328605200945626,
+ "grad_norm": 2.938692569732666,
+ "learning_rate": 4.0743511576336315e-06,
+ "loss": 0.5361,
+ "step": 3665
+ },
+ {
+ "epoch": 1.7333333333333334,
+ "grad_norm": 3.1978867053985596,
+ "learning_rate": 4.073866516332307e-06,
+ "loss": 0.6277,
+ "step": 3666
+ },
+ {
+ "epoch": 1.733806146572104,
+ "grad_norm": 2.3477370738983154,
+ "learning_rate": 4.073381777033217e-06,
+ "loss": 0.5139,
+ "step": 3667
+ },
+ {
+ "epoch": 1.7342789598108745,
+ "grad_norm": 2.5954184532165527,
+ "learning_rate": 4.072896939766543e-06,
+ "loss": 0.537,
+ "step": 3668
+ },
+ {
+ "epoch": 1.7347517730496453,
+ "grad_norm": 2.8999998569488525,
+ "learning_rate": 4.072412004562472e-06,
+ "loss": 0.5486,
+ "step": 3669
+ },
+ {
+ "epoch": 1.7352245862884161,
+ "grad_norm": 2.7320556640625,
+ "learning_rate": 4.071926971451201e-06,
+ "loss": 0.6025,
+ "step": 3670
+ },
+ {
+ "epoch": 1.7356973995271867,
+ "grad_norm": 2.499234676361084,
+ "learning_rate": 4.0714418404629304e-06,
+ "loss": 0.456,
+ "step": 3671
+ },
+ {
+ "epoch": 1.7361702127659573,
+ "grad_norm": 2.485924243927002,
+ "learning_rate": 4.070956611627867e-06,
+ "loss": 0.5097,
+ "step": 3672
+ },
+ {
+ "epoch": 1.736643026004728,
+ "grad_norm": 2.513723373413086,
+ "learning_rate": 4.070471284976225e-06,
+ "loss": 0.4744,
+ "step": 3673
+ },
+ {
+ "epoch": 1.7371158392434989,
+ "grad_norm": 2.281977653503418,
+ "learning_rate": 4.06998586053822e-06,
+ "loss": 0.5124,
+ "step": 3674
+ },
+ {
+ "epoch": 1.7375886524822695,
+ "grad_norm": 2.3683905601501465,
+ "learning_rate": 4.069500338344081e-06,
+ "loss": 0.4816,
+ "step": 3675
+ },
+ {
+ "epoch": 1.73806146572104,
+ "grad_norm": 2.5635924339294434,
+ "learning_rate": 4.069014718424038e-06,
+ "loss": 0.5665,
+ "step": 3676
+ },
+ {
+ "epoch": 1.7385342789598108,
+ "grad_norm": 2.7308456897735596,
+ "learning_rate": 4.068529000808328e-06,
+ "loss": 0.534,
+ "step": 3677
+ },
+ {
+ "epoch": 1.7390070921985816,
+ "grad_norm": 2.788452625274658,
+ "learning_rate": 4.068043185527196e-06,
+ "loss": 0.5609,
+ "step": 3678
+ },
+ {
+ "epoch": 1.7394799054373522,
+ "grad_norm": 2.832368850708008,
+ "learning_rate": 4.067557272610889e-06,
+ "loss": 0.553,
+ "step": 3679
+ },
+ {
+ "epoch": 1.7399527186761228,
+ "grad_norm": 2.9987435340881348,
+ "learning_rate": 4.067071262089665e-06,
+ "loss": 0.5,
+ "step": 3680
+ },
+ {
+ "epoch": 1.7404255319148936,
+ "grad_norm": 3.04913067817688,
+ "learning_rate": 4.066585153993785e-06,
+ "loss": 0.5158,
+ "step": 3681
+ },
+ {
+ "epoch": 1.7408983451536644,
+ "grad_norm": 2.5177130699157715,
+ "learning_rate": 4.066098948353516e-06,
+ "loss": 0.4508,
+ "step": 3682
+ },
+ {
+ "epoch": 1.741371158392435,
+ "grad_norm": 2.8991222381591797,
+ "learning_rate": 4.065612645199133e-06,
+ "loss": 0.5268,
+ "step": 3683
+ },
+ {
+ "epoch": 1.7418439716312055,
+ "grad_norm": 2.4928159713745117,
+ "learning_rate": 4.0651262445609156e-06,
+ "loss": 0.5024,
+ "step": 3684
+ },
+ {
+ "epoch": 1.7423167848699763,
+ "grad_norm": 2.9737319946289062,
+ "learning_rate": 4.06463974646915e-06,
+ "loss": 0.5429,
+ "step": 3685
+ },
+ {
+ "epoch": 1.7427895981087471,
+ "grad_norm": 2.6485493183135986,
+ "learning_rate": 4.064153150954128e-06,
+ "loss": 0.5619,
+ "step": 3686
+ },
+ {
+ "epoch": 1.7432624113475177,
+ "grad_norm": 2.564861297607422,
+ "learning_rate": 4.063666458046148e-06,
+ "loss": 0.4878,
+ "step": 3687
+ },
+ {
+ "epoch": 1.7437352245862883,
+ "grad_norm": 2.6048383712768555,
+ "learning_rate": 4.063179667775514e-06,
+ "loss": 0.4836,
+ "step": 3688
+ },
+ {
+ "epoch": 1.744208037825059,
+ "grad_norm": 2.751638650894165,
+ "learning_rate": 4.062692780172536e-06,
+ "loss": 0.5558,
+ "step": 3689
+ },
+ {
+ "epoch": 1.7446808510638299,
+ "grad_norm": 3.3866634368896484,
+ "learning_rate": 4.062205795267531e-06,
+ "loss": 0.4825,
+ "step": 3690
+ },
+ {
+ "epoch": 1.7451536643026004,
+ "grad_norm": 3.0112249851226807,
+ "learning_rate": 4.061718713090822e-06,
+ "loss": 0.5732,
+ "step": 3691
+ },
+ {
+ "epoch": 1.745626477541371,
+ "grad_norm": 2.5889365673065186,
+ "learning_rate": 4.061231533672736e-06,
+ "loss": 0.483,
+ "step": 3692
+ },
+ {
+ "epoch": 1.7460992907801418,
+ "grad_norm": 2.624598979949951,
+ "learning_rate": 4.0607442570436085e-06,
+ "loss": 0.5706,
+ "step": 3693
+ },
+ {
+ "epoch": 1.7465721040189126,
+ "grad_norm": 2.9219250679016113,
+ "learning_rate": 4.060256883233779e-06,
+ "loss": 0.5153,
+ "step": 3694
+ },
+ {
+ "epoch": 1.7470449172576832,
+ "grad_norm": 3.2219252586364746,
+ "learning_rate": 4.059769412273595e-06,
+ "loss": 0.5184,
+ "step": 3695
+ },
+ {
+ "epoch": 1.7475177304964538,
+ "grad_norm": 2.890697956085205,
+ "learning_rate": 4.05928184419341e-06,
+ "loss": 0.5312,
+ "step": 3696
+ },
+ {
+ "epoch": 1.7479905437352246,
+ "grad_norm": 2.673809289932251,
+ "learning_rate": 4.0587941790235816e-06,
+ "loss": 0.4893,
+ "step": 3697
+ },
+ {
+ "epoch": 1.7484633569739954,
+ "grad_norm": 2.5339348316192627,
+ "learning_rate": 4.058306416794474e-06,
+ "loss": 0.5115,
+ "step": 3698
+ },
+ {
+ "epoch": 1.748936170212766,
+ "grad_norm": 2.6525840759277344,
+ "learning_rate": 4.05781855753646e-06,
+ "loss": 0.5256,
+ "step": 3699
+ },
+ {
+ "epoch": 1.7494089834515365,
+ "grad_norm": 2.7868754863739014,
+ "learning_rate": 4.057330601279914e-06,
+ "loss": 0.5227,
+ "step": 3700
+ },
+ {
+ "epoch": 1.7498817966903073,
+ "grad_norm": 3.1629884243011475,
+ "learning_rate": 4.056842548055221e-06,
+ "loss": 0.5617,
+ "step": 3701
+ },
+ {
+ "epoch": 1.750354609929078,
+ "grad_norm": 2.9350688457489014,
+ "learning_rate": 4.056354397892769e-06,
+ "loss": 0.4753,
+ "step": 3702
+ },
+ {
+ "epoch": 1.7508274231678487,
+ "grad_norm": 2.9688615798950195,
+ "learning_rate": 4.0558661508229525e-06,
+ "loss": 0.596,
+ "step": 3703
+ },
+ {
+ "epoch": 1.7513002364066192,
+ "grad_norm": 2.802205801010132,
+ "learning_rate": 4.055377806876174e-06,
+ "loss": 0.5793,
+ "step": 3704
+ },
+ {
+ "epoch": 1.75177304964539,
+ "grad_norm": 2.4933416843414307,
+ "learning_rate": 4.054889366082839e-06,
+ "loss": 0.4824,
+ "step": 3705
+ },
+ {
+ "epoch": 1.7522458628841608,
+ "grad_norm": 3.7904608249664307,
+ "learning_rate": 4.054400828473361e-06,
+ "loss": 0.5124,
+ "step": 3706
+ },
+ {
+ "epoch": 1.7527186761229314,
+ "grad_norm": 2.694838762283325,
+ "learning_rate": 4.053912194078159e-06,
+ "loss": 0.5604,
+ "step": 3707
+ },
+ {
+ "epoch": 1.753191489361702,
+ "grad_norm": 2.3721256256103516,
+ "learning_rate": 4.053423462927659e-06,
+ "loss": 0.4978,
+ "step": 3708
+ },
+ {
+ "epoch": 1.7536643026004728,
+ "grad_norm": 2.718512773513794,
+ "learning_rate": 4.052934635052292e-06,
+ "loss": 0.5029,
+ "step": 3709
+ },
+ {
+ "epoch": 1.7541371158392436,
+ "grad_norm": 3.061558246612549,
+ "learning_rate": 4.052445710482493e-06,
+ "loss": 0.4886,
+ "step": 3710
+ },
+ {
+ "epoch": 1.7546099290780142,
+ "grad_norm": 3.0490729808807373,
+ "learning_rate": 4.051956689248709e-06,
+ "loss": 0.5363,
+ "step": 3711
+ },
+ {
+ "epoch": 1.7550827423167847,
+ "grad_norm": 2.611661672592163,
+ "learning_rate": 4.051467571381385e-06,
+ "loss": 0.5397,
+ "step": 3712
+ },
+ {
+ "epoch": 1.7555555555555555,
+ "grad_norm": 2.7829177379608154,
+ "learning_rate": 4.050978356910979e-06,
+ "loss": 0.4973,
+ "step": 3713
+ },
+ {
+ "epoch": 1.7560283687943263,
+ "grad_norm": 2.6228256225585938,
+ "learning_rate": 4.0504890458679525e-06,
+ "loss": 0.4551,
+ "step": 3714
+ },
+ {
+ "epoch": 1.756501182033097,
+ "grad_norm": 2.6801326274871826,
+ "learning_rate": 4.049999638282771e-06,
+ "loss": 0.5581,
+ "step": 3715
+ },
+ {
+ "epoch": 1.7569739952718675,
+ "grad_norm": 2.4476819038391113,
+ "learning_rate": 4.049510134185908e-06,
+ "loss": 0.5226,
+ "step": 3716
+ },
+ {
+ "epoch": 1.7574468085106383,
+ "grad_norm": 2.5661075115203857,
+ "learning_rate": 4.049020533607844e-06,
+ "loss": 0.5163,
+ "step": 3717
+ },
+ {
+ "epoch": 1.757919621749409,
+ "grad_norm": 2.3923349380493164,
+ "learning_rate": 4.048530836579065e-06,
+ "loss": 0.5076,
+ "step": 3718
+ },
+ {
+ "epoch": 1.7583924349881797,
+ "grad_norm": 2.8204405307769775,
+ "learning_rate": 4.0480410431300585e-06,
+ "loss": 0.5883,
+ "step": 3719
+ },
+ {
+ "epoch": 1.7588652482269502,
+ "grad_norm": 2.323107957839966,
+ "learning_rate": 4.047551153291325e-06,
+ "loss": 0.5116,
+ "step": 3720
+ },
+ {
+ "epoch": 1.759338061465721,
+ "grad_norm": 2.8306009769439697,
+ "learning_rate": 4.047061167093368e-06,
+ "loss": 0.5094,
+ "step": 3721
+ },
+ {
+ "epoch": 1.7598108747044918,
+ "grad_norm": 2.568765640258789,
+ "learning_rate": 4.046571084566695e-06,
+ "loss": 0.4725,
+ "step": 3722
+ },
+ {
+ "epoch": 1.7602836879432624,
+ "grad_norm": 2.7212061882019043,
+ "learning_rate": 4.046080905741822e-06,
+ "loss": 0.4741,
+ "step": 3723
+ },
+ {
+ "epoch": 1.760756501182033,
+ "grad_norm": 2.802917003631592,
+ "learning_rate": 4.04559063064927e-06,
+ "loss": 0.5691,
+ "step": 3724
+ },
+ {
+ "epoch": 1.7612293144208038,
+ "grad_norm": 3.1044139862060547,
+ "learning_rate": 4.0451002593195675e-06,
+ "loss": 0.5472,
+ "step": 3725
+ },
+ {
+ "epoch": 1.7617021276595746,
+ "grad_norm": 2.5855562686920166,
+ "learning_rate": 4.044609791783246e-06,
+ "loss": 0.4852,
+ "step": 3726
+ },
+ {
+ "epoch": 1.7621749408983451,
+ "grad_norm": 2.6235129833221436,
+ "learning_rate": 4.0441192280708465e-06,
+ "loss": 0.5269,
+ "step": 3727
+ },
+ {
+ "epoch": 1.7626477541371157,
+ "grad_norm": 3.535630464553833,
+ "learning_rate": 4.043628568212914e-06,
+ "loss": 0.5266,
+ "step": 3728
+ },
+ {
+ "epoch": 1.7631205673758865,
+ "grad_norm": 2.7783355712890625,
+ "learning_rate": 4.043137812239998e-06,
+ "loss": 0.5609,
+ "step": 3729
+ },
+ {
+ "epoch": 1.7635933806146573,
+ "grad_norm": 2.9344944953918457,
+ "learning_rate": 4.042646960182657e-06,
+ "loss": 0.5056,
+ "step": 3730
+ },
+ {
+ "epoch": 1.7640661938534279,
+ "grad_norm": 2.6205739974975586,
+ "learning_rate": 4.042156012071453e-06,
+ "loss": 0.4914,
+ "step": 3731
+ },
+ {
+ "epoch": 1.7645390070921985,
+ "grad_norm": 2.8004493713378906,
+ "learning_rate": 4.041664967936958e-06,
+ "loss": 0.4901,
+ "step": 3732
+ },
+ {
+ "epoch": 1.7650118203309693,
+ "grad_norm": 2.944589138031006,
+ "learning_rate": 4.041173827809745e-06,
+ "loss": 0.5572,
+ "step": 3733
+ },
+ {
+ "epoch": 1.76548463356974,
+ "grad_norm": 2.5021605491638184,
+ "learning_rate": 4.040682591720397e-06,
+ "loss": 0.4637,
+ "step": 3734
+ },
+ {
+ "epoch": 1.7659574468085106,
+ "grad_norm": 2.448030948638916,
+ "learning_rate": 4.040191259699497e-06,
+ "loss": 0.4785,
+ "step": 3735
+ },
+ {
+ "epoch": 1.7664302600472812,
+ "grad_norm": 2.7171032428741455,
+ "learning_rate": 4.039699831777643e-06,
+ "loss": 0.4919,
+ "step": 3736
+ },
+ {
+ "epoch": 1.766903073286052,
+ "grad_norm": 2.453118324279785,
+ "learning_rate": 4.03920830798543e-06,
+ "loss": 0.4326,
+ "step": 3737
+ },
+ {
+ "epoch": 1.7673758865248228,
+ "grad_norm": 3.112877368927002,
+ "learning_rate": 4.038716688353466e-06,
+ "loss": 0.5375,
+ "step": 3738
+ },
+ {
+ "epoch": 1.7678486997635934,
+ "grad_norm": 2.742239236831665,
+ "learning_rate": 4.038224972912361e-06,
+ "loss": 0.5267,
+ "step": 3739
+ },
+ {
+ "epoch": 1.768321513002364,
+ "grad_norm": 2.544785737991333,
+ "learning_rate": 4.037733161692731e-06,
+ "loss": 0.5032,
+ "step": 3740
+ },
+ {
+ "epoch": 1.7687943262411348,
+ "grad_norm": 2.4639062881469727,
+ "learning_rate": 4.037241254725201e-06,
+ "loss": 0.5532,
+ "step": 3741
+ },
+ {
+ "epoch": 1.7692671394799055,
+ "grad_norm": 2.866290330886841,
+ "learning_rate": 4.036749252040398e-06,
+ "loss": 0.5503,
+ "step": 3742
+ },
+ {
+ "epoch": 1.7697399527186761,
+ "grad_norm": 2.3466262817382812,
+ "learning_rate": 4.0362571536689575e-06,
+ "loss": 0.5286,
+ "step": 3743
+ },
+ {
+ "epoch": 1.7702127659574467,
+ "grad_norm": 2.246464967727661,
+ "learning_rate": 4.03576495964152e-06,
+ "loss": 0.4656,
+ "step": 3744
+ },
+ {
+ "epoch": 1.7706855791962175,
+ "grad_norm": 2.667558431625366,
+ "learning_rate": 4.035272669988733e-06,
+ "loss": 0.5205,
+ "step": 3745
+ },
+ {
+ "epoch": 1.7711583924349883,
+ "grad_norm": 2.974666118621826,
+ "learning_rate": 4.034780284741249e-06,
+ "loss": 0.6007,
+ "step": 3746
+ },
+ {
+ "epoch": 1.7716312056737589,
+ "grad_norm": 2.7164433002471924,
+ "learning_rate": 4.034287803929726e-06,
+ "loss": 0.4913,
+ "step": 3747
+ },
+ {
+ "epoch": 1.7721040189125294,
+ "grad_norm": 2.5923962593078613,
+ "learning_rate": 4.033795227584829e-06,
+ "loss": 0.5275,
+ "step": 3748
+ },
+ {
+ "epoch": 1.7725768321513002,
+ "grad_norm": 2.606027126312256,
+ "learning_rate": 4.033302555737229e-06,
+ "loss": 0.4869,
+ "step": 3749
+ },
+ {
+ "epoch": 1.773049645390071,
+ "grad_norm": 3.0110089778900146,
+ "learning_rate": 4.032809788417602e-06,
+ "loss": 0.4956,
+ "step": 3750
+ },
+ {
+ "epoch": 1.7735224586288416,
+ "grad_norm": 3.004598617553711,
+ "learning_rate": 4.032316925656632e-06,
+ "loss": 0.5159,
+ "step": 3751
+ },
+ {
+ "epoch": 1.7739952718676122,
+ "grad_norm": 2.731539249420166,
+ "learning_rate": 4.031823967485005e-06,
+ "loss": 0.5237,
+ "step": 3752
+ },
+ {
+ "epoch": 1.774468085106383,
+ "grad_norm": 2.7466373443603516,
+ "learning_rate": 4.0313309139334155e-06,
+ "loss": 0.4948,
+ "step": 3753
+ },
+ {
+ "epoch": 1.7749408983451538,
+ "grad_norm": 2.8596460819244385,
+ "learning_rate": 4.030837765032565e-06,
+ "loss": 0.5016,
+ "step": 3754
+ },
+ {
+ "epoch": 1.7754137115839244,
+ "grad_norm": 3.2886788845062256,
+ "learning_rate": 4.03034452081316e-06,
+ "loss": 0.5377,
+ "step": 3755
+ },
+ {
+ "epoch": 1.775886524822695,
+ "grad_norm": 2.5629258155822754,
+ "learning_rate": 4.029851181305912e-06,
+ "loss": 0.519,
+ "step": 3756
+ },
+ {
+ "epoch": 1.7763593380614657,
+ "grad_norm": 2.5988714694976807,
+ "learning_rate": 4.029357746541539e-06,
+ "loss": 0.5521,
+ "step": 3757
+ },
+ {
+ "epoch": 1.7768321513002365,
+ "grad_norm": 2.987884759902954,
+ "learning_rate": 4.028864216550765e-06,
+ "loss": 0.6225,
+ "step": 3758
+ },
+ {
+ "epoch": 1.777304964539007,
+ "grad_norm": 2.6875851154327393,
+ "learning_rate": 4.02837059136432e-06,
+ "loss": 0.5321,
+ "step": 3759
+ },
+ {
+ "epoch": 1.7777777777777777,
+ "grad_norm": 2.6414570808410645,
+ "learning_rate": 4.02787687101294e-06,
+ "loss": 0.4831,
+ "step": 3760
+ },
+ {
+ "epoch": 1.7782505910165485,
+ "grad_norm": 2.581475019454956,
+ "learning_rate": 4.027383055527368e-06,
+ "loss": 0.5204,
+ "step": 3761
+ },
+ {
+ "epoch": 1.7787234042553193,
+ "grad_norm": 2.811298131942749,
+ "learning_rate": 4.026889144938349e-06,
+ "loss": 0.5486,
+ "step": 3762
+ },
+ {
+ "epoch": 1.7791962174940898,
+ "grad_norm": 3.1589081287384033,
+ "learning_rate": 4.026395139276639e-06,
+ "loss": 0.4979,
+ "step": 3763
+ },
+ {
+ "epoch": 1.7796690307328604,
+ "grad_norm": 2.3773093223571777,
+ "learning_rate": 4.025901038572996e-06,
+ "loss": 0.503,
+ "step": 3764
+ },
+ {
+ "epoch": 1.7801418439716312,
+ "grad_norm": 2.962541341781616,
+ "learning_rate": 4.025406842858187e-06,
+ "loss": 0.4613,
+ "step": 3765
+ },
+ {
+ "epoch": 1.780614657210402,
+ "grad_norm": 2.603092908859253,
+ "learning_rate": 4.024912552162982e-06,
+ "loss": 0.5142,
+ "step": 3766
+ },
+ {
+ "epoch": 1.7810874704491726,
+ "grad_norm": 2.648927927017212,
+ "learning_rate": 4.024418166518159e-06,
+ "loss": 0.4491,
+ "step": 3767
+ },
+ {
+ "epoch": 1.7815602836879432,
+ "grad_norm": 3.3239917755126953,
+ "learning_rate": 4.023923685954502e-06,
+ "loss": 0.6272,
+ "step": 3768
+ },
+ {
+ "epoch": 1.782033096926714,
+ "grad_norm": 2.672821283340454,
+ "learning_rate": 4.023429110502798e-06,
+ "loss": 0.5171,
+ "step": 3769
+ },
+ {
+ "epoch": 1.7825059101654848,
+ "grad_norm": 2.364332437515259,
+ "learning_rate": 4.022934440193844e-06,
+ "loss": 0.4513,
+ "step": 3770
+ },
+ {
+ "epoch": 1.7829787234042553,
+ "grad_norm": 3.03108549118042,
+ "learning_rate": 4.022439675058441e-06,
+ "loss": 0.4324,
+ "step": 3771
+ },
+ {
+ "epoch": 1.783451536643026,
+ "grad_norm": 2.647557020187378,
+ "learning_rate": 4.021944815127393e-06,
+ "loss": 0.5162,
+ "step": 3772
+ },
+ {
+ "epoch": 1.7839243498817967,
+ "grad_norm": 2.4111907482147217,
+ "learning_rate": 4.021449860431517e-06,
+ "loss": 0.4712,
+ "step": 3773
+ },
+ {
+ "epoch": 1.7843971631205675,
+ "grad_norm": 2.796175718307495,
+ "learning_rate": 4.020954811001629e-06,
+ "loss": 0.5131,
+ "step": 3774
+ },
+ {
+ "epoch": 1.784869976359338,
+ "grad_norm": 2.4594924449920654,
+ "learning_rate": 4.020459666868553e-06,
+ "loss": 0.4739,
+ "step": 3775
+ },
+ {
+ "epoch": 1.7853427895981087,
+ "grad_norm": 2.5735671520233154,
+ "learning_rate": 4.0199644280631215e-06,
+ "loss": 0.4716,
+ "step": 3776
+ },
+ {
+ "epoch": 1.7858156028368795,
+ "grad_norm": 2.419990062713623,
+ "learning_rate": 4.01946909461617e-06,
+ "loss": 0.4866,
+ "step": 3777
+ },
+ {
+ "epoch": 1.7862884160756503,
+ "grad_norm": 2.5597951412200928,
+ "learning_rate": 4.01897366655854e-06,
+ "loss": 0.5569,
+ "step": 3778
+ },
+ {
+ "epoch": 1.7867612293144208,
+ "grad_norm": 2.462383985519409,
+ "learning_rate": 4.018478143921081e-06,
+ "loss": 0.4588,
+ "step": 3779
+ },
+ {
+ "epoch": 1.7872340425531914,
+ "grad_norm": 2.536701202392578,
+ "learning_rate": 4.017982526734646e-06,
+ "loss": 0.5278,
+ "step": 3780
+ },
+ {
+ "epoch": 1.7877068557919622,
+ "grad_norm": 2.691077470779419,
+ "learning_rate": 4.017486815030095e-06,
+ "loss": 0.4815,
+ "step": 3781
+ },
+ {
+ "epoch": 1.788179669030733,
+ "grad_norm": 2.4277288913726807,
+ "learning_rate": 4.016991008838294e-06,
+ "loss": 0.4877,
+ "step": 3782
+ },
+ {
+ "epoch": 1.7886524822695036,
+ "grad_norm": 2.6740009784698486,
+ "learning_rate": 4.016495108190115e-06,
+ "loss": 0.572,
+ "step": 3783
+ },
+ {
+ "epoch": 1.7891252955082741,
+ "grad_norm": 3.179232120513916,
+ "learning_rate": 4.0159991131164355e-06,
+ "loss": 0.4821,
+ "step": 3784
+ },
+ {
+ "epoch": 1.789598108747045,
+ "grad_norm": 3.2747793197631836,
+ "learning_rate": 4.015503023648138e-06,
+ "loss": 0.5517,
+ "step": 3785
+ },
+ {
+ "epoch": 1.7900709219858157,
+ "grad_norm": 2.671367645263672,
+ "learning_rate": 4.015006839816113e-06,
+ "loss": 0.5158,
+ "step": 3786
+ },
+ {
+ "epoch": 1.7905437352245863,
+ "grad_norm": 2.6600193977355957,
+ "learning_rate": 4.014510561651256e-06,
+ "loss": 0.535,
+ "step": 3787
+ },
+ {
+ "epoch": 1.791016548463357,
+ "grad_norm": 2.481509208679199,
+ "learning_rate": 4.014014189184466e-06,
+ "loss": 0.5596,
+ "step": 3788
+ },
+ {
+ "epoch": 1.7914893617021277,
+ "grad_norm": 2.759816884994507,
+ "learning_rate": 4.013517722446652e-06,
+ "loss": 0.5201,
+ "step": 3789
+ },
+ {
+ "epoch": 1.7919621749408985,
+ "grad_norm": 2.6913561820983887,
+ "learning_rate": 4.013021161468724e-06,
+ "loss": 0.5758,
+ "step": 3790
+ },
+ {
+ "epoch": 1.792434988179669,
+ "grad_norm": 2.775087594985962,
+ "learning_rate": 4.0125245062816044e-06,
+ "loss": 0.499,
+ "step": 3791
+ },
+ {
+ "epoch": 1.7929078014184396,
+ "grad_norm": 2.6134777069091797,
+ "learning_rate": 4.012027756916216e-06,
+ "loss": 0.5659,
+ "step": 3792
+ },
+ {
+ "epoch": 1.7933806146572104,
+ "grad_norm": 2.7109756469726562,
+ "learning_rate": 4.0115309134034895e-06,
+ "loss": 0.5337,
+ "step": 3793
+ },
+ {
+ "epoch": 1.7938534278959812,
+ "grad_norm": 2.5389950275421143,
+ "learning_rate": 4.0110339757743595e-06,
+ "loss": 0.4501,
+ "step": 3794
+ },
+ {
+ "epoch": 1.7943262411347518,
+ "grad_norm": 2.634648561477661,
+ "learning_rate": 4.010536944059771e-06,
+ "loss": 0.4411,
+ "step": 3795
+ },
+ {
+ "epoch": 1.7947990543735224,
+ "grad_norm": 2.527070999145508,
+ "learning_rate": 4.0100398182906695e-06,
+ "loss": 0.5145,
+ "step": 3796
+ },
+ {
+ "epoch": 1.7952718676122932,
+ "grad_norm": 2.62988543510437,
+ "learning_rate": 4.0095425984980105e-06,
+ "loss": 0.4981,
+ "step": 3797
+ },
+ {
+ "epoch": 1.795744680851064,
+ "grad_norm": 2.6032519340515137,
+ "learning_rate": 4.009045284712752e-06,
+ "loss": 0.453,
+ "step": 3798
+ },
+ {
+ "epoch": 1.7962174940898346,
+ "grad_norm": 2.735173463821411,
+ "learning_rate": 4.008547876965863e-06,
+ "loss": 0.5925,
+ "step": 3799
+ },
+ {
+ "epoch": 1.7966903073286051,
+ "grad_norm": 2.6296730041503906,
+ "learning_rate": 4.00805037528831e-06,
+ "loss": 0.5651,
+ "step": 3800
+ },
+ {
+ "epoch": 1.797163120567376,
+ "grad_norm": 2.641214370727539,
+ "learning_rate": 4.0075527797110735e-06,
+ "loss": 0.4973,
+ "step": 3801
+ },
+ {
+ "epoch": 1.7976359338061467,
+ "grad_norm": 2.6104819774627686,
+ "learning_rate": 4.007055090265136e-06,
+ "loss": 0.4432,
+ "step": 3802
+ },
+ {
+ "epoch": 1.7981087470449173,
+ "grad_norm": 2.8200619220733643,
+ "learning_rate": 4.0065573069814865e-06,
+ "loss": 0.4899,
+ "step": 3803
+ },
+ {
+ "epoch": 1.7985815602836879,
+ "grad_norm": 2.982354164123535,
+ "learning_rate": 4.006059429891119e-06,
+ "loss": 0.5488,
+ "step": 3804
+ },
+ {
+ "epoch": 1.7990543735224587,
+ "grad_norm": 2.7561678886413574,
+ "learning_rate": 4.005561459025034e-06,
+ "loss": 0.5637,
+ "step": 3805
+ },
+ {
+ "epoch": 1.7995271867612295,
+ "grad_norm": 2.702212333679199,
+ "learning_rate": 4.005063394414241e-06,
+ "loss": 0.4804,
+ "step": 3806
+ },
+ {
+ "epoch": 1.8,
+ "grad_norm": 2.8655319213867188,
+ "learning_rate": 4.004565236089748e-06,
+ "loss": 0.5759,
+ "step": 3807
+ },
+ {
+ "epoch": 1.8004728132387706,
+ "grad_norm": 2.703676223754883,
+ "learning_rate": 4.0040669840825756e-06,
+ "loss": 0.4728,
+ "step": 3808
+ },
+ {
+ "epoch": 1.8009456264775414,
+ "grad_norm": 2.802645683288574,
+ "learning_rate": 4.003568638423747e-06,
+ "loss": 0.5421,
+ "step": 3809
+ },
+ {
+ "epoch": 1.8014184397163122,
+ "grad_norm": 2.4723124504089355,
+ "learning_rate": 4.003070199144292e-06,
+ "loss": 0.4944,
+ "step": 3810
+ },
+ {
+ "epoch": 1.8018912529550828,
+ "grad_norm": 2.4889068603515625,
+ "learning_rate": 4.0025716662752475e-06,
+ "loss": 0.4774,
+ "step": 3811
+ },
+ {
+ "epoch": 1.8023640661938534,
+ "grad_norm": 2.5408077239990234,
+ "learning_rate": 4.002073039847653e-06,
+ "loss": 0.5233,
+ "step": 3812
+ },
+ {
+ "epoch": 1.8028368794326242,
+ "grad_norm": 2.734602689743042,
+ "learning_rate": 4.001574319892557e-06,
+ "loss": 0.5403,
+ "step": 3813
+ },
+ {
+ "epoch": 1.803309692671395,
+ "grad_norm": 3.3786163330078125,
+ "learning_rate": 4.001075506441012e-06,
+ "loss": 0.6969,
+ "step": 3814
+ },
+ {
+ "epoch": 1.8037825059101655,
+ "grad_norm": 2.7375378608703613,
+ "learning_rate": 4.000576599524078e-06,
+ "loss": 0.4907,
+ "step": 3815
+ },
+ {
+ "epoch": 1.804255319148936,
+ "grad_norm": 3.041804075241089,
+ "learning_rate": 4.000077599172818e-06,
+ "loss": 0.6021,
+ "step": 3816
+ },
+ {
+ "epoch": 1.804728132387707,
+ "grad_norm": 2.697599411010742,
+ "learning_rate": 3.999578505418305e-06,
+ "loss": 0.4743,
+ "step": 3817
+ },
+ {
+ "epoch": 1.8052009456264777,
+ "grad_norm": 2.276921272277832,
+ "learning_rate": 3.999079318291612e-06,
+ "loss": 0.4885,
+ "step": 3818
+ },
+ {
+ "epoch": 1.8056737588652483,
+ "grad_norm": 2.4896953105926514,
+ "learning_rate": 3.998580037823825e-06,
+ "loss": 0.503,
+ "step": 3819
+ },
+ {
+ "epoch": 1.8061465721040189,
+ "grad_norm": 2.6232175827026367,
+ "learning_rate": 3.998080664046029e-06,
+ "loss": 0.5058,
+ "step": 3820
+ },
+ {
+ "epoch": 1.8066193853427897,
+ "grad_norm": 2.695861339569092,
+ "learning_rate": 3.997581196989319e-06,
+ "loss": 0.4949,
+ "step": 3821
+ },
+ {
+ "epoch": 1.8070921985815604,
+ "grad_norm": 2.912886142730713,
+ "learning_rate": 3.997081636684795e-06,
+ "loss": 0.4971,
+ "step": 3822
+ },
+ {
+ "epoch": 1.807565011820331,
+ "grad_norm": 2.876500368118286,
+ "learning_rate": 3.996581983163561e-06,
+ "loss": 0.5584,
+ "step": 3823
+ },
+ {
+ "epoch": 1.8080378250591016,
+ "grad_norm": 2.857069730758667,
+ "learning_rate": 3.99608223645673e-06,
+ "loss": 0.5457,
+ "step": 3824
+ },
+ {
+ "epoch": 1.8085106382978724,
+ "grad_norm": 2.486743211746216,
+ "learning_rate": 3.995582396595419e-06,
+ "loss": 0.5291,
+ "step": 3825
+ },
+ {
+ "epoch": 1.808983451536643,
+ "grad_norm": 2.509441375732422,
+ "learning_rate": 3.9950824636107486e-06,
+ "loss": 0.4747,
+ "step": 3826
+ },
+ {
+ "epoch": 1.8094562647754135,
+ "grad_norm": 2.931394100189209,
+ "learning_rate": 3.99458243753385e-06,
+ "loss": 0.5116,
+ "step": 3827
+ },
+ {
+ "epoch": 1.8099290780141843,
+ "grad_norm": 2.4868650436401367,
+ "learning_rate": 3.994082318395856e-06,
+ "loss": 0.4671,
+ "step": 3828
+ },
+ {
+ "epoch": 1.8104018912529551,
+ "grad_norm": 2.5554752349853516,
+ "learning_rate": 3.993582106227907e-06,
+ "loss": 0.4969,
+ "step": 3829
+ },
+ {
+ "epoch": 1.8108747044917257,
+ "grad_norm": 2.8367133140563965,
+ "learning_rate": 3.99308180106115e-06,
+ "loss": 0.5507,
+ "step": 3830
+ },
+ {
+ "epoch": 1.8113475177304963,
+ "grad_norm": 2.68245792388916,
+ "learning_rate": 3.992581402926737e-06,
+ "loss": 0.5115,
+ "step": 3831
+ },
+ {
+ "epoch": 1.811820330969267,
+ "grad_norm": 2.406674385070801,
+ "learning_rate": 3.992080911855824e-06,
+ "loss": 0.545,
+ "step": 3832
+ },
+ {
+ "epoch": 1.8122931442080379,
+ "grad_norm": 2.5003464221954346,
+ "learning_rate": 3.991580327879575e-06,
+ "loss": 0.4331,
+ "step": 3833
+ },
+ {
+ "epoch": 1.8127659574468085,
+ "grad_norm": 2.49320912361145,
+ "learning_rate": 3.99107965102916e-06,
+ "loss": 0.5118,
+ "step": 3834
+ },
+ {
+ "epoch": 1.813238770685579,
+ "grad_norm": 2.6183295249938965,
+ "learning_rate": 3.990578881335752e-06,
+ "loss": 0.5286,
+ "step": 3835
+ },
+ {
+ "epoch": 1.8137115839243498,
+ "grad_norm": 3.1999518871307373,
+ "learning_rate": 3.990078018830534e-06,
+ "loss": 0.5048,
+ "step": 3836
+ },
+ {
+ "epoch": 1.8141843971631206,
+ "grad_norm": 2.4351117610931396,
+ "learning_rate": 3.9895770635446915e-06,
+ "loss": 0.514,
+ "step": 3837
+ },
+ {
+ "epoch": 1.8146572104018912,
+ "grad_norm": 2.6859259605407715,
+ "learning_rate": 3.989076015509416e-06,
+ "loss": 0.5575,
+ "step": 3838
+ },
+ {
+ "epoch": 1.8151300236406618,
+ "grad_norm": 2.790421962738037,
+ "learning_rate": 3.988574874755909e-06,
+ "loss": 0.5467,
+ "step": 3839
+ },
+ {
+ "epoch": 1.8156028368794326,
+ "grad_norm": 2.5202765464782715,
+ "learning_rate": 3.988073641315369e-06,
+ "loss": 0.5229,
+ "step": 3840
+ },
+ {
+ "epoch": 1.8160756501182034,
+ "grad_norm": 2.623652219772339,
+ "learning_rate": 3.987572315219009e-06,
+ "loss": 0.509,
+ "step": 3841
+ },
+ {
+ "epoch": 1.816548463356974,
+ "grad_norm": 2.6038360595703125,
+ "learning_rate": 3.987070896498044e-06,
+ "loss": 0.5304,
+ "step": 3842
+ },
+ {
+ "epoch": 1.8170212765957445,
+ "grad_norm": 2.9378011226654053,
+ "learning_rate": 3.9865693851836955e-06,
+ "loss": 0.5845,
+ "step": 3843
+ },
+ {
+ "epoch": 1.8174940898345153,
+ "grad_norm": 2.4061124324798584,
+ "learning_rate": 3.98606778130719e-06,
+ "loss": 0.4333,
+ "step": 3844
+ },
+ {
+ "epoch": 1.8179669030732861,
+ "grad_norm": 2.483489751815796,
+ "learning_rate": 3.985566084899759e-06,
+ "loss": 0.4827,
+ "step": 3845
+ },
+ {
+ "epoch": 1.8184397163120567,
+ "grad_norm": 2.7774932384490967,
+ "learning_rate": 3.985064295992642e-06,
+ "loss": 0.5016,
+ "step": 3846
+ },
+ {
+ "epoch": 1.8189125295508273,
+ "grad_norm": 2.5936765670776367,
+ "learning_rate": 3.984562414617083e-06,
+ "loss": 0.4448,
+ "step": 3847
+ },
+ {
+ "epoch": 1.819385342789598,
+ "grad_norm": 2.8608627319335938,
+ "learning_rate": 3.9840604408043325e-06,
+ "loss": 0.5735,
+ "step": 3848
+ },
+ {
+ "epoch": 1.8198581560283689,
+ "grad_norm": 2.6212472915649414,
+ "learning_rate": 3.983558374585646e-06,
+ "loss": 0.5091,
+ "step": 3849
+ },
+ {
+ "epoch": 1.8203309692671394,
+ "grad_norm": 2.832460641860962,
+ "learning_rate": 3.983056215992284e-06,
+ "loss": 0.5169,
+ "step": 3850
+ },
+ {
+ "epoch": 1.82080378250591,
+ "grad_norm": 2.5293610095977783,
+ "learning_rate": 3.982553965055514e-06,
+ "loss": 0.4708,
+ "step": 3851
+ },
+ {
+ "epoch": 1.8212765957446808,
+ "grad_norm": 2.9362871646881104,
+ "learning_rate": 3.982051621806611e-06,
+ "loss": 0.575,
+ "step": 3852
+ },
+ {
+ "epoch": 1.8217494089834516,
+ "grad_norm": 2.69073486328125,
+ "learning_rate": 3.98154918627685e-06,
+ "loss": 0.5278,
+ "step": 3853
+ },
+ {
+ "epoch": 1.8222222222222222,
+ "grad_norm": 2.6711034774780273,
+ "learning_rate": 3.98104665849752e-06,
+ "loss": 0.4918,
+ "step": 3854
+ },
+ {
+ "epoch": 1.8226950354609928,
+ "grad_norm": 2.571110963821411,
+ "learning_rate": 3.980544038499907e-06,
+ "loss": 0.5234,
+ "step": 3855
+ },
+ {
+ "epoch": 1.8231678486997636,
+ "grad_norm": 3.2603371143341064,
+ "learning_rate": 3.980041326315309e-06,
+ "loss": 0.5996,
+ "step": 3856
+ },
+ {
+ "epoch": 1.8236406619385344,
+ "grad_norm": 2.8472323417663574,
+ "learning_rate": 3.979538521975028e-06,
+ "loss": 0.4769,
+ "step": 3857
+ },
+ {
+ "epoch": 1.824113475177305,
+ "grad_norm": 2.6714751720428467,
+ "learning_rate": 3.979035625510371e-06,
+ "loss": 0.4826,
+ "step": 3858
+ },
+ {
+ "epoch": 1.8245862884160755,
+ "grad_norm": 2.6816468238830566,
+ "learning_rate": 3.97853263695265e-06,
+ "loss": 0.5127,
+ "step": 3859
+ },
+ {
+ "epoch": 1.8250591016548463,
+ "grad_norm": 2.6464123725891113,
+ "learning_rate": 3.978029556333185e-06,
+ "loss": 0.4925,
+ "step": 3860
+ },
+ {
+ "epoch": 1.825531914893617,
+ "grad_norm": 2.5317227840423584,
+ "learning_rate": 3.977526383683301e-06,
+ "loss": 0.4765,
+ "step": 3861
+ },
+ {
+ "epoch": 1.8260047281323877,
+ "grad_norm": 2.5052425861358643,
+ "learning_rate": 3.977023119034328e-06,
+ "loss": 0.4804,
+ "step": 3862
+ },
+ {
+ "epoch": 1.8264775413711583,
+ "grad_norm": 2.7022836208343506,
+ "learning_rate": 3.976519762417602e-06,
+ "loss": 0.4824,
+ "step": 3863
+ },
+ {
+ "epoch": 1.826950354609929,
+ "grad_norm": 2.7445900440216064,
+ "learning_rate": 3.976016313864464e-06,
+ "loss": 0.5698,
+ "step": 3864
+ },
+ {
+ "epoch": 1.8274231678486998,
+ "grad_norm": 2.442518711090088,
+ "learning_rate": 3.975512773406262e-06,
+ "loss": 0.5133,
+ "step": 3865
+ },
+ {
+ "epoch": 1.8278959810874704,
+ "grad_norm": 2.4100050926208496,
+ "learning_rate": 3.975009141074351e-06,
+ "loss": 0.5044,
+ "step": 3866
+ },
+ {
+ "epoch": 1.828368794326241,
+ "grad_norm": 2.9507648944854736,
+ "learning_rate": 3.974505416900088e-06,
+ "loss": 0.5367,
+ "step": 3867
+ },
+ {
+ "epoch": 1.8288416075650118,
+ "grad_norm": 2.5662600994110107,
+ "learning_rate": 3.974001600914837e-06,
+ "loss": 0.5878,
+ "step": 3868
+ },
+ {
+ "epoch": 1.8293144208037826,
+ "grad_norm": 2.4306657314300537,
+ "learning_rate": 3.973497693149971e-06,
+ "loss": 0.4647,
+ "step": 3869
+ },
+ {
+ "epoch": 1.8297872340425532,
+ "grad_norm": 2.974686622619629,
+ "learning_rate": 3.972993693636864e-06,
+ "loss": 0.4911,
+ "step": 3870
+ },
+ {
+ "epoch": 1.8302600472813237,
+ "grad_norm": 2.5711987018585205,
+ "learning_rate": 3.972489602406899e-06,
+ "loss": 0.5089,
+ "step": 3871
+ },
+ {
+ "epoch": 1.8307328605200945,
+ "grad_norm": 3.259617328643799,
+ "learning_rate": 3.971985419491463e-06,
+ "loss": 0.5966,
+ "step": 3872
+ },
+ {
+ "epoch": 1.8312056737588653,
+ "grad_norm": 2.7437000274658203,
+ "learning_rate": 3.971481144921949e-06,
+ "loss": 0.5097,
+ "step": 3873
+ },
+ {
+ "epoch": 1.831678486997636,
+ "grad_norm": 2.9597461223602295,
+ "learning_rate": 3.970976778729757e-06,
+ "loss": 0.5672,
+ "step": 3874
+ },
+ {
+ "epoch": 1.8321513002364065,
+ "grad_norm": 2.5775723457336426,
+ "learning_rate": 3.970472320946291e-06,
+ "loss": 0.4749,
+ "step": 3875
+ },
+ {
+ "epoch": 1.8326241134751773,
+ "grad_norm": 2.7381200790405273,
+ "learning_rate": 3.969967771602961e-06,
+ "loss": 0.5255,
+ "step": 3876
+ },
+ {
+ "epoch": 1.833096926713948,
+ "grad_norm": 2.651698350906372,
+ "learning_rate": 3.969463130731183e-06,
+ "loss": 0.5098,
+ "step": 3877
+ },
+ {
+ "epoch": 1.8335697399527187,
+ "grad_norm": 2.7277021408081055,
+ "learning_rate": 3.968958398362381e-06,
+ "loss": 0.5251,
+ "step": 3878
+ },
+ {
+ "epoch": 1.8340425531914892,
+ "grad_norm": 2.5184953212738037,
+ "learning_rate": 3.968453574527978e-06,
+ "loss": 0.5086,
+ "step": 3879
+ },
+ {
+ "epoch": 1.83451536643026,
+ "grad_norm": 2.8227882385253906,
+ "learning_rate": 3.967948659259412e-06,
+ "loss": 0.5742,
+ "step": 3880
+ },
+ {
+ "epoch": 1.8349881796690308,
+ "grad_norm": 2.547922134399414,
+ "learning_rate": 3.967443652588119e-06,
+ "loss": 0.5411,
+ "step": 3881
+ },
+ {
+ "epoch": 1.8354609929078014,
+ "grad_norm": 2.6572835445404053,
+ "learning_rate": 3.966938554545545e-06,
+ "loss": 0.4854,
+ "step": 3882
+ },
+ {
+ "epoch": 1.835933806146572,
+ "grad_norm": 2.9416658878326416,
+ "learning_rate": 3.966433365163139e-06,
+ "loss": 0.5236,
+ "step": 3883
+ },
+ {
+ "epoch": 1.8364066193853428,
+ "grad_norm": 2.344325304031372,
+ "learning_rate": 3.965928084472357e-06,
+ "loss": 0.4916,
+ "step": 3884
+ },
+ {
+ "epoch": 1.8368794326241136,
+ "grad_norm": 2.890418291091919,
+ "learning_rate": 3.965422712504662e-06,
+ "loss": 0.5287,
+ "step": 3885
+ },
+ {
+ "epoch": 1.8373522458628841,
+ "grad_norm": 2.6063363552093506,
+ "learning_rate": 3.96491724929152e-06,
+ "loss": 0.4842,
+ "step": 3886
+ },
+ {
+ "epoch": 1.8378250591016547,
+ "grad_norm": 2.5582427978515625,
+ "learning_rate": 3.964411694864404e-06,
+ "loss": 0.4768,
+ "step": 3887
+ },
+ {
+ "epoch": 1.8382978723404255,
+ "grad_norm": 2.84356951713562,
+ "learning_rate": 3.963906049254793e-06,
+ "loss": 0.5284,
+ "step": 3888
+ },
+ {
+ "epoch": 1.8387706855791963,
+ "grad_norm": 2.7048516273498535,
+ "learning_rate": 3.963400312494172e-06,
+ "loss": 0.5271,
+ "step": 3889
+ },
+ {
+ "epoch": 1.839243498817967,
+ "grad_norm": 2.5401699542999268,
+ "learning_rate": 3.962894484614031e-06,
+ "loss": 0.4734,
+ "step": 3890
+ },
+ {
+ "epoch": 1.8397163120567375,
+ "grad_norm": 2.208256244659424,
+ "learning_rate": 3.962388565645864e-06,
+ "loss": 0.4113,
+ "step": 3891
+ },
+ {
+ "epoch": 1.8401891252955083,
+ "grad_norm": 2.775139331817627,
+ "learning_rate": 3.961882555621173e-06,
+ "loss": 0.5172,
+ "step": 3892
+ },
+ {
+ "epoch": 1.840661938534279,
+ "grad_norm": 2.7540855407714844,
+ "learning_rate": 3.961376454571466e-06,
+ "loss": 0.5252,
+ "step": 3893
+ },
+ {
+ "epoch": 1.8411347517730496,
+ "grad_norm": 2.6731574535369873,
+ "learning_rate": 3.960870262528255e-06,
+ "loss": 0.4495,
+ "step": 3894
+ },
+ {
+ "epoch": 1.8416075650118202,
+ "grad_norm": 2.791492223739624,
+ "learning_rate": 3.960363979523058e-06,
+ "loss": 0.5457,
+ "step": 3895
+ },
+ {
+ "epoch": 1.842080378250591,
+ "grad_norm": 2.9280290603637695,
+ "learning_rate": 3.959857605587401e-06,
+ "loss": 0.5373,
+ "step": 3896
+ },
+ {
+ "epoch": 1.8425531914893618,
+ "grad_norm": 2.5652217864990234,
+ "learning_rate": 3.95935114075281e-06,
+ "loss": 0.5191,
+ "step": 3897
+ },
+ {
+ "epoch": 1.8430260047281324,
+ "grad_norm": 2.7297749519348145,
+ "learning_rate": 3.958844585050824e-06,
+ "loss": 0.5366,
+ "step": 3898
+ },
+ {
+ "epoch": 1.843498817966903,
+ "grad_norm": 2.5302982330322266,
+ "learning_rate": 3.958337938512983e-06,
+ "loss": 0.569,
+ "step": 3899
+ },
+ {
+ "epoch": 1.8439716312056738,
+ "grad_norm": 2.644777297973633,
+ "learning_rate": 3.957831201170832e-06,
+ "loss": 0.521,
+ "step": 3900
+ },
+ {
+ "epoch": 1.8444444444444446,
+ "grad_norm": 2.8375515937805176,
+ "learning_rate": 3.957324373055925e-06,
+ "loss": 0.573,
+ "step": 3901
+ },
+ {
+ "epoch": 1.8449172576832151,
+ "grad_norm": 2.512296676635742,
+ "learning_rate": 3.956817454199819e-06,
+ "loss": 0.5081,
+ "step": 3902
+ },
+ {
+ "epoch": 1.8453900709219857,
+ "grad_norm": 2.3662109375,
+ "learning_rate": 3.956310444634079e-06,
+ "loss": 0.4989,
+ "step": 3903
+ },
+ {
+ "epoch": 1.8458628841607565,
+ "grad_norm": 2.6849682331085205,
+ "learning_rate": 3.955803344390272e-06,
+ "loss": 0.5459,
+ "step": 3904
+ },
+ {
+ "epoch": 1.8463356973995273,
+ "grad_norm": 2.8364317417144775,
+ "learning_rate": 3.9552961534999756e-06,
+ "loss": 0.5704,
+ "step": 3905
+ },
+ {
+ "epoch": 1.8468085106382979,
+ "grad_norm": 2.6006948947906494,
+ "learning_rate": 3.954788871994768e-06,
+ "loss": 0.5696,
+ "step": 3906
+ },
+ {
+ "epoch": 1.8472813238770684,
+ "grad_norm": 2.558300018310547,
+ "learning_rate": 3.9542814999062375e-06,
+ "loss": 0.5047,
+ "step": 3907
+ },
+ {
+ "epoch": 1.8477541371158392,
+ "grad_norm": 2.6343321800231934,
+ "learning_rate": 3.953774037265974e-06,
+ "loss": 0.525,
+ "step": 3908
+ },
+ {
+ "epoch": 1.84822695035461,
+ "grad_norm": 2.5050008296966553,
+ "learning_rate": 3.953266484105576e-06,
+ "loss": 0.4867,
+ "step": 3909
+ },
+ {
+ "epoch": 1.8486997635933806,
+ "grad_norm": 2.3775103092193604,
+ "learning_rate": 3.952758840456647e-06,
+ "loss": 0.4349,
+ "step": 3910
+ },
+ {
+ "epoch": 1.8491725768321512,
+ "grad_norm": 2.508376359939575,
+ "learning_rate": 3.952251106350794e-06,
+ "loss": 0.539,
+ "step": 3911
+ },
+ {
+ "epoch": 1.849645390070922,
+ "grad_norm": 2.7403106689453125,
+ "learning_rate": 3.951743281819633e-06,
+ "loss": 0.4478,
+ "step": 3912
+ },
+ {
+ "epoch": 1.8501182033096928,
+ "grad_norm": 2.5332062244415283,
+ "learning_rate": 3.951235366894784e-06,
+ "loss": 0.4658,
+ "step": 3913
+ },
+ {
+ "epoch": 1.8505910165484634,
+ "grad_norm": 3.0137248039245605,
+ "learning_rate": 3.950727361607872e-06,
+ "loss": 0.5047,
+ "step": 3914
+ },
+ {
+ "epoch": 1.851063829787234,
+ "grad_norm": 2.5820653438568115,
+ "learning_rate": 3.950219265990528e-06,
+ "loss": 0.542,
+ "step": 3915
+ },
+ {
+ "epoch": 1.8515366430260047,
+ "grad_norm": 2.555133819580078,
+ "learning_rate": 3.949711080074389e-06,
+ "loss": 0.5253,
+ "step": 3916
+ },
+ {
+ "epoch": 1.8520094562647755,
+ "grad_norm": 2.876882791519165,
+ "learning_rate": 3.949202803891099e-06,
+ "loss": 0.5242,
+ "step": 3917
+ },
+ {
+ "epoch": 1.852482269503546,
+ "grad_norm": 2.5929203033447266,
+ "learning_rate": 3.948694437472305e-06,
+ "loss": 0.5358,
+ "step": 3918
+ },
+ {
+ "epoch": 1.8529550827423167,
+ "grad_norm": 2.468513250350952,
+ "learning_rate": 3.948185980849659e-06,
+ "loss": 0.5119,
+ "step": 3919
+ },
+ {
+ "epoch": 1.8534278959810875,
+ "grad_norm": 2.9259560108184814,
+ "learning_rate": 3.947677434054824e-06,
+ "loss": 0.4756,
+ "step": 3920
+ },
+ {
+ "epoch": 1.8539007092198583,
+ "grad_norm": 2.5247011184692383,
+ "learning_rate": 3.947168797119462e-06,
+ "loss": 0.4627,
+ "step": 3921
+ },
+ {
+ "epoch": 1.8543735224586289,
+ "grad_norm": 2.7396671772003174,
+ "learning_rate": 3.946660070075245e-06,
+ "loss": 0.5013,
+ "step": 3922
+ },
+ {
+ "epoch": 1.8548463356973994,
+ "grad_norm": 2.7059738636016846,
+ "learning_rate": 3.946151252953849e-06,
+ "loss": 0.5875,
+ "step": 3923
+ },
+ {
+ "epoch": 1.8553191489361702,
+ "grad_norm": 2.5638437271118164,
+ "learning_rate": 3.945642345786955e-06,
+ "loss": 0.5063,
+ "step": 3924
+ },
+ {
+ "epoch": 1.855791962174941,
+ "grad_norm": 2.6647839546203613,
+ "learning_rate": 3.945133348606251e-06,
+ "loss": 0.5421,
+ "step": 3925
+ },
+ {
+ "epoch": 1.8562647754137116,
+ "grad_norm": 3.7235286235809326,
+ "learning_rate": 3.944624261443431e-06,
+ "loss": 0.5958,
+ "step": 3926
+ },
+ {
+ "epoch": 1.8567375886524822,
+ "grad_norm": 2.769984245300293,
+ "learning_rate": 3.944115084330192e-06,
+ "loss": 0.5678,
+ "step": 3927
+ },
+ {
+ "epoch": 1.857210401891253,
+ "grad_norm": 2.567249059677124,
+ "learning_rate": 3.9436058172982395e-06,
+ "loss": 0.4767,
+ "step": 3928
+ },
+ {
+ "epoch": 1.8576832151300238,
+ "grad_norm": 2.6196048259735107,
+ "learning_rate": 3.943096460379283e-06,
+ "loss": 0.5345,
+ "step": 3929
+ },
+ {
+ "epoch": 1.8581560283687943,
+ "grad_norm": 2.5999555587768555,
+ "learning_rate": 3.942587013605037e-06,
+ "loss": 0.5482,
+ "step": 3930
+ },
+ {
+ "epoch": 1.858628841607565,
+ "grad_norm": 2.630387783050537,
+ "learning_rate": 3.942077477007224e-06,
+ "loss": 0.6023,
+ "step": 3931
+ },
+ {
+ "epoch": 1.8591016548463357,
+ "grad_norm": 2.543503761291504,
+ "learning_rate": 3.941567850617569e-06,
+ "loss": 0.5157,
+ "step": 3932
+ },
+ {
+ "epoch": 1.8595744680851065,
+ "grad_norm": 2.5109236240386963,
+ "learning_rate": 3.941058134467805e-06,
+ "loss": 0.4774,
+ "step": 3933
+ },
+ {
+ "epoch": 1.860047281323877,
+ "grad_norm": 2.5110230445861816,
+ "learning_rate": 3.94054832858967e-06,
+ "loss": 0.5064,
+ "step": 3934
+ },
+ {
+ "epoch": 1.8605200945626477,
+ "grad_norm": 2.4780776500701904,
+ "learning_rate": 3.940038433014908e-06,
+ "loss": 0.5216,
+ "step": 3935
+ },
+ {
+ "epoch": 1.8609929078014185,
+ "grad_norm": 2.4398856163024902,
+ "learning_rate": 3.939528447775266e-06,
+ "loss": 0.4958,
+ "step": 3936
+ },
+ {
+ "epoch": 1.8614657210401893,
+ "grad_norm": 2.449498176574707,
+ "learning_rate": 3.9390183729025e-06,
+ "loss": 0.5165,
+ "step": 3937
+ },
+ {
+ "epoch": 1.8619385342789598,
+ "grad_norm": 2.982544422149658,
+ "learning_rate": 3.938508208428371e-06,
+ "loss": 0.4803,
+ "step": 3938
+ },
+ {
+ "epoch": 1.8624113475177304,
+ "grad_norm": 2.6574015617370605,
+ "learning_rate": 3.937997954384641e-06,
+ "loss": 0.4797,
+ "step": 3939
+ },
+ {
+ "epoch": 1.8628841607565012,
+ "grad_norm": 2.7773542404174805,
+ "learning_rate": 3.937487610803086e-06,
+ "loss": 0.4843,
+ "step": 3940
+ },
+ {
+ "epoch": 1.863356973995272,
+ "grad_norm": 2.588937759399414,
+ "learning_rate": 3.9369771777154805e-06,
+ "loss": 0.5426,
+ "step": 3941
+ },
+ {
+ "epoch": 1.8638297872340426,
+ "grad_norm": 2.855442523956299,
+ "learning_rate": 3.936466655153607e-06,
+ "loss": 0.5443,
+ "step": 3942
+ },
+ {
+ "epoch": 1.8643026004728132,
+ "grad_norm": 2.554676055908203,
+ "learning_rate": 3.935956043149253e-06,
+ "loss": 0.5334,
+ "step": 3943
+ },
+ {
+ "epoch": 1.864775413711584,
+ "grad_norm": 2.901599884033203,
+ "learning_rate": 3.935445341734212e-06,
+ "loss": 0.5842,
+ "step": 3944
+ },
+ {
+ "epoch": 1.8652482269503547,
+ "grad_norm": 2.554485321044922,
+ "learning_rate": 3.934934550940285e-06,
+ "loss": 0.4941,
+ "step": 3945
+ },
+ {
+ "epoch": 1.8657210401891253,
+ "grad_norm": 2.357203245162964,
+ "learning_rate": 3.934423670799275e-06,
+ "loss": 0.4402,
+ "step": 3946
+ },
+ {
+ "epoch": 1.866193853427896,
+ "grad_norm": 2.7036049365997314,
+ "learning_rate": 3.933912701342993e-06,
+ "loss": 0.4966,
+ "step": 3947
+ },
+ {
+ "epoch": 1.8666666666666667,
+ "grad_norm": 2.7817211151123047,
+ "learning_rate": 3.933401642603255e-06,
+ "loss": 0.4908,
+ "step": 3948
+ },
+ {
+ "epoch": 1.8671394799054375,
+ "grad_norm": 2.439490795135498,
+ "learning_rate": 3.932890494611882e-06,
+ "loss": 0.4322,
+ "step": 3949
+ },
+ {
+ "epoch": 1.867612293144208,
+ "grad_norm": 3.187152147293091,
+ "learning_rate": 3.9323792574007e-06,
+ "loss": 0.501,
+ "step": 3950
+ },
+ {
+ "epoch": 1.8680851063829786,
+ "grad_norm": 2.405773401260376,
+ "learning_rate": 3.931867931001543e-06,
+ "loss": 0.4477,
+ "step": 3951
+ },
+ {
+ "epoch": 1.8685579196217494,
+ "grad_norm": 2.4922525882720947,
+ "learning_rate": 3.931356515446248e-06,
+ "loss": 0.5098,
+ "step": 3952
+ },
+ {
+ "epoch": 1.8690307328605202,
+ "grad_norm": 2.7781267166137695,
+ "learning_rate": 3.93084501076666e-06,
+ "loss": 0.5815,
+ "step": 3953
+ },
+ {
+ "epoch": 1.8695035460992908,
+ "grad_norm": 2.74621844291687,
+ "learning_rate": 3.930333416994626e-06,
+ "loss": 0.5605,
+ "step": 3954
+ },
+ {
+ "epoch": 1.8699763593380614,
+ "grad_norm": 2.5527689456939697,
+ "learning_rate": 3.929821734162004e-06,
+ "loss": 0.5141,
+ "step": 3955
+ },
+ {
+ "epoch": 1.8704491725768322,
+ "grad_norm": 2.5730628967285156,
+ "learning_rate": 3.92930996230065e-06,
+ "loss": 0.5446,
+ "step": 3956
+ },
+ {
+ "epoch": 1.870921985815603,
+ "grad_norm": 2.7053353786468506,
+ "learning_rate": 3.9287981014424334e-06,
+ "loss": 0.4722,
+ "step": 3957
+ },
+ {
+ "epoch": 1.8713947990543736,
+ "grad_norm": 2.7591893672943115,
+ "learning_rate": 3.928286151619224e-06,
+ "loss": 0.509,
+ "step": 3958
+ },
+ {
+ "epoch": 1.8718676122931441,
+ "grad_norm": 2.6233739852905273,
+ "learning_rate": 3.927774112862898e-06,
+ "loss": 0.5266,
+ "step": 3959
+ },
+ {
+ "epoch": 1.872340425531915,
+ "grad_norm": 2.7715370655059814,
+ "learning_rate": 3.9272619852053396e-06,
+ "loss": 0.5612,
+ "step": 3960
+ },
+ {
+ "epoch": 1.8728132387706857,
+ "grad_norm": 2.4815211296081543,
+ "learning_rate": 3.926749768678435e-06,
+ "loss": 0.5498,
+ "step": 3961
+ },
+ {
+ "epoch": 1.8732860520094563,
+ "grad_norm": 2.6819605827331543,
+ "learning_rate": 3.926237463314078e-06,
+ "loss": 0.5499,
+ "step": 3962
+ },
+ {
+ "epoch": 1.8737588652482269,
+ "grad_norm": 2.638664722442627,
+ "learning_rate": 3.925725069144168e-06,
+ "loss": 0.5429,
+ "step": 3963
+ },
+ {
+ "epoch": 1.8742316784869977,
+ "grad_norm": 2.527294874191284,
+ "learning_rate": 3.925212586200611e-06,
+ "loss": 0.5451,
+ "step": 3964
+ },
+ {
+ "epoch": 1.8747044917257685,
+ "grad_norm": 2.831638813018799,
+ "learning_rate": 3.924700014515315e-06,
+ "loss": 0.5276,
+ "step": 3965
+ },
+ {
+ "epoch": 1.875177304964539,
+ "grad_norm": 2.5906996726989746,
+ "learning_rate": 3.924187354120196e-06,
+ "loss": 0.5323,
+ "step": 3966
+ },
+ {
+ "epoch": 1.8756501182033096,
+ "grad_norm": 2.5482442378997803,
+ "learning_rate": 3.923674605047175e-06,
+ "loss": 0.4882,
+ "step": 3967
+ },
+ {
+ "epoch": 1.8761229314420804,
+ "grad_norm": 2.56402850151062,
+ "learning_rate": 3.923161767328179e-06,
+ "loss": 0.5111,
+ "step": 3968
+ },
+ {
+ "epoch": 1.8765957446808512,
+ "grad_norm": 3.223782539367676,
+ "learning_rate": 3.9226488409951405e-06,
+ "loss": 0.5829,
+ "step": 3969
+ },
+ {
+ "epoch": 1.8770685579196218,
+ "grad_norm": 2.665964365005493,
+ "learning_rate": 3.922135826079997e-06,
+ "loss": 0.4739,
+ "step": 3970
+ },
+ {
+ "epoch": 1.8775413711583924,
+ "grad_norm": 2.602696418762207,
+ "learning_rate": 3.921622722614691e-06,
+ "loss": 0.5199,
+ "step": 3971
+ },
+ {
+ "epoch": 1.8780141843971632,
+ "grad_norm": 2.5384418964385986,
+ "learning_rate": 3.921109530631172e-06,
+ "loss": 0.5086,
+ "step": 3972
+ },
+ {
+ "epoch": 1.878486997635934,
+ "grad_norm": 2.7961080074310303,
+ "learning_rate": 3.920596250161394e-06,
+ "loss": 0.5454,
+ "step": 3973
+ },
+ {
+ "epoch": 1.8789598108747045,
+ "grad_norm": 3.022007465362549,
+ "learning_rate": 3.920082881237317e-06,
+ "loss": 0.5537,
+ "step": 3974
+ },
+ {
+ "epoch": 1.8794326241134751,
+ "grad_norm": 2.699885129928589,
+ "learning_rate": 3.9195694238909045e-06,
+ "loss": 0.5274,
+ "step": 3975
+ },
+ {
+ "epoch": 1.879905437352246,
+ "grad_norm": 2.3994593620300293,
+ "learning_rate": 3.919055878154129e-06,
+ "loss": 0.4134,
+ "step": 3976
+ },
+ {
+ "epoch": 1.8803782505910167,
+ "grad_norm": 4.093045711517334,
+ "learning_rate": 3.918542244058967e-06,
+ "loss": 0.5305,
+ "step": 3977
+ },
+ {
+ "epoch": 1.8808510638297873,
+ "grad_norm": 3.011643171310425,
+ "learning_rate": 3.9180285216374e-06,
+ "loss": 0.5481,
+ "step": 3978
+ },
+ {
+ "epoch": 1.8813238770685579,
+ "grad_norm": 2.6426854133605957,
+ "learning_rate": 3.917514710921414e-06,
+ "loss": 0.5415,
+ "step": 3979
+ },
+ {
+ "epoch": 1.8817966903073287,
+ "grad_norm": 2.4379019737243652,
+ "learning_rate": 3.917000811943002e-06,
+ "loss": 0.4566,
+ "step": 3980
+ },
+ {
+ "epoch": 1.8822695035460995,
+ "grad_norm": 3.18522047996521,
+ "learning_rate": 3.9164868247341634e-06,
+ "loss": 0.6079,
+ "step": 3981
+ },
+ {
+ "epoch": 1.88274231678487,
+ "grad_norm": 2.6451141834259033,
+ "learning_rate": 3.915972749326903e-06,
+ "loss": 0.515,
+ "step": 3982
+ },
+ {
+ "epoch": 1.8832151300236406,
+ "grad_norm": 2.565598726272583,
+ "learning_rate": 3.915458585753226e-06,
+ "loss": 0.4799,
+ "step": 3983
+ },
+ {
+ "epoch": 1.8836879432624114,
+ "grad_norm": 2.711651563644409,
+ "learning_rate": 3.91494433404515e-06,
+ "loss": 0.5595,
+ "step": 3984
+ },
+ {
+ "epoch": 1.8841607565011822,
+ "grad_norm": 2.749328851699829,
+ "learning_rate": 3.914429994234695e-06,
+ "loss": 0.495,
+ "step": 3985
+ },
+ {
+ "epoch": 1.8846335697399526,
+ "grad_norm": 2.9492287635803223,
+ "learning_rate": 3.913915566353886e-06,
+ "loss": 0.5683,
+ "step": 3986
+ },
+ {
+ "epoch": 1.8851063829787233,
+ "grad_norm": 3.07747745513916,
+ "learning_rate": 3.913401050434756e-06,
+ "loss": 0.4953,
+ "step": 3987
+ },
+ {
+ "epoch": 1.8855791962174941,
+ "grad_norm": 2.8746345043182373,
+ "learning_rate": 3.912886446509338e-06,
+ "loss": 0.4752,
+ "step": 3988
+ },
+ {
+ "epoch": 1.8860520094562647,
+ "grad_norm": 2.772954225540161,
+ "learning_rate": 3.912371754609677e-06,
+ "loss": 0.5473,
+ "step": 3989
+ },
+ {
+ "epoch": 1.8865248226950353,
+ "grad_norm": 2.8906044960021973,
+ "learning_rate": 3.911856974767821e-06,
+ "loss": 0.5285,
+ "step": 3990
+ },
+ {
+ "epoch": 1.886997635933806,
+ "grad_norm": 2.8992726802825928,
+ "learning_rate": 3.9113421070158206e-06,
+ "loss": 0.571,
+ "step": 3991
+ },
+ {
+ "epoch": 1.887470449172577,
+ "grad_norm": 2.624662160873413,
+ "learning_rate": 3.910827151385737e-06,
+ "loss": 0.5183,
+ "step": 3992
+ },
+ {
+ "epoch": 1.8879432624113475,
+ "grad_norm": 2.4491732120513916,
+ "learning_rate": 3.910312107909632e-06,
+ "loss": 0.4205,
+ "step": 3993
+ },
+ {
+ "epoch": 1.888416075650118,
+ "grad_norm": 2.278259515762329,
+ "learning_rate": 3.909796976619575e-06,
+ "loss": 0.4464,
+ "step": 3994
+ },
+ {
+ "epoch": 1.8888888888888888,
+ "grad_norm": 2.6481523513793945,
+ "learning_rate": 3.909281757547644e-06,
+ "loss": 0.5023,
+ "step": 3995
+ },
+ {
+ "epoch": 1.8893617021276596,
+ "grad_norm": 2.6687493324279785,
+ "learning_rate": 3.908766450725917e-06,
+ "loss": 0.495,
+ "step": 3996
+ },
+ {
+ "epoch": 1.8898345153664302,
+ "grad_norm": 2.507525682449341,
+ "learning_rate": 3.908251056186481e-06,
+ "loss": 0.4155,
+ "step": 3997
+ },
+ {
+ "epoch": 1.8903073286052008,
+ "grad_norm": 2.7048323154449463,
+ "learning_rate": 3.907735573961426e-06,
+ "loss": 0.4601,
+ "step": 3998
+ },
+ {
+ "epoch": 1.8907801418439716,
+ "grad_norm": 2.6825389862060547,
+ "learning_rate": 3.907220004082848e-06,
+ "loss": 0.5067,
+ "step": 3999
+ },
+ {
+ "epoch": 1.8912529550827424,
+ "grad_norm": 2.775696039199829,
+ "learning_rate": 3.906704346582852e-06,
+ "loss": 0.5411,
+ "step": 4000
+ },
+ {
+ "epoch": 1.891725768321513,
+ "grad_norm": 2.4492077827453613,
+ "learning_rate": 3.906188601493545e-06,
+ "loss": 0.4931,
+ "step": 4001
+ },
+ {
+ "epoch": 1.8921985815602835,
+ "grad_norm": 2.320810556411743,
+ "learning_rate": 3.905672768847041e-06,
+ "loss": 0.4908,
+ "step": 4002
+ },
+ {
+ "epoch": 1.8926713947990543,
+ "grad_norm": 2.455162525177002,
+ "learning_rate": 3.905156848675455e-06,
+ "loss": 0.508,
+ "step": 4003
+ },
+ {
+ "epoch": 1.8931442080378251,
+ "grad_norm": 2.515921115875244,
+ "learning_rate": 3.904640841010915e-06,
+ "loss": 0.5318,
+ "step": 4004
+ },
+ {
+ "epoch": 1.8936170212765957,
+ "grad_norm": 2.7230770587921143,
+ "learning_rate": 3.904124745885548e-06,
+ "loss": 0.4793,
+ "step": 4005
+ },
+ {
+ "epoch": 1.8940898345153663,
+ "grad_norm": 2.519934892654419,
+ "learning_rate": 3.903608563331491e-06,
+ "loss": 0.5013,
+ "step": 4006
+ },
+ {
+ "epoch": 1.894562647754137,
+ "grad_norm": 2.719674587249756,
+ "learning_rate": 3.903092293380883e-06,
+ "loss": 0.516,
+ "step": 4007
+ },
+ {
+ "epoch": 1.8950354609929079,
+ "grad_norm": 3.2107343673706055,
+ "learning_rate": 3.902575936065869e-06,
+ "loss": 0.6297,
+ "step": 4008
+ },
+ {
+ "epoch": 1.8955082742316784,
+ "grad_norm": 2.9773149490356445,
+ "learning_rate": 3.902059491418603e-06,
+ "loss": 0.566,
+ "step": 4009
+ },
+ {
+ "epoch": 1.895981087470449,
+ "grad_norm": 2.6754770278930664,
+ "learning_rate": 3.90154295947124e-06,
+ "loss": 0.5187,
+ "step": 4010
+ },
+ {
+ "epoch": 1.8964539007092198,
+ "grad_norm": 2.457303762435913,
+ "learning_rate": 3.901026340255943e-06,
+ "loss": 0.5757,
+ "step": 4011
+ },
+ {
+ "epoch": 1.8969267139479906,
+ "grad_norm": 2.5944161415100098,
+ "learning_rate": 3.900509633804878e-06,
+ "loss": 0.5049,
+ "step": 4012
+ },
+ {
+ "epoch": 1.8973995271867612,
+ "grad_norm": 2.610445022583008,
+ "learning_rate": 3.89999284015022e-06,
+ "loss": 0.521,
+ "step": 4013
+ },
+ {
+ "epoch": 1.8978723404255318,
+ "grad_norm": 2.6949338912963867,
+ "learning_rate": 3.899475959324146e-06,
+ "loss": 0.5619,
+ "step": 4014
+ },
+ {
+ "epoch": 1.8983451536643026,
+ "grad_norm": 2.7889559268951416,
+ "learning_rate": 3.898958991358841e-06,
+ "loss": 0.5223,
+ "step": 4015
+ },
+ {
+ "epoch": 1.8988179669030734,
+ "grad_norm": 2.569265842437744,
+ "learning_rate": 3.898441936286493e-06,
+ "loss": 0.5724,
+ "step": 4016
+ },
+ {
+ "epoch": 1.899290780141844,
+ "grad_norm": 2.3567774295806885,
+ "learning_rate": 3.897924794139299e-06,
+ "loss": 0.4784,
+ "step": 4017
+ },
+ {
+ "epoch": 1.8997635933806145,
+ "grad_norm": 2.9176526069641113,
+ "learning_rate": 3.897407564949457e-06,
+ "loss": 0.646,
+ "step": 4018
+ },
+ {
+ "epoch": 1.9002364066193853,
+ "grad_norm": 2.7870090007781982,
+ "learning_rate": 3.896890248749174e-06,
+ "loss": 0.4922,
+ "step": 4019
+ },
+ {
+ "epoch": 1.900709219858156,
+ "grad_norm": 2.8310980796813965,
+ "learning_rate": 3.89637284557066e-06,
+ "loss": 0.4746,
+ "step": 4020
+ },
+ {
+ "epoch": 1.9011820330969267,
+ "grad_norm": 2.434915542602539,
+ "learning_rate": 3.895855355446131e-06,
+ "loss": 0.4537,
+ "step": 4021
+ },
+ {
+ "epoch": 1.9016548463356973,
+ "grad_norm": 3.0547034740448,
+ "learning_rate": 3.89533777840781e-06,
+ "loss": 0.6161,
+ "step": 4022
+ },
+ {
+ "epoch": 1.902127659574468,
+ "grad_norm": 3.416774272918701,
+ "learning_rate": 3.894820114487925e-06,
+ "loss": 0.5448,
+ "step": 4023
+ },
+ {
+ "epoch": 1.9026004728132389,
+ "grad_norm": 2.606951951980591,
+ "learning_rate": 3.894302363718707e-06,
+ "loss": 0.5501,
+ "step": 4024
+ },
+ {
+ "epoch": 1.9030732860520094,
+ "grad_norm": 3.082165002822876,
+ "learning_rate": 3.8937845261323945e-06,
+ "loss": 0.6035,
+ "step": 4025
+ },
+ {
+ "epoch": 1.90354609929078,
+ "grad_norm": 2.616093397140503,
+ "learning_rate": 3.893266601761231e-06,
+ "loss": 0.5294,
+ "step": 4026
+ },
+ {
+ "epoch": 1.9040189125295508,
+ "grad_norm": 2.7141637802124023,
+ "learning_rate": 3.8927485906374654e-06,
+ "loss": 0.5481,
+ "step": 4027
+ },
+ {
+ "epoch": 1.9044917257683216,
+ "grad_norm": 2.5129404067993164,
+ "learning_rate": 3.892230492793352e-06,
+ "loss": 0.4958,
+ "step": 4028
+ },
+ {
+ "epoch": 1.9049645390070922,
+ "grad_norm": 2.703403949737549,
+ "learning_rate": 3.891712308261151e-06,
+ "loss": 0.4852,
+ "step": 4029
+ },
+ {
+ "epoch": 1.9054373522458627,
+ "grad_norm": 2.881058931350708,
+ "learning_rate": 3.891194037073127e-06,
+ "loss": 0.4662,
+ "step": 4030
+ },
+ {
+ "epoch": 1.9059101654846335,
+ "grad_norm": 3.216769218444824,
+ "learning_rate": 3.8906756792615505e-06,
+ "loss": 0.5076,
+ "step": 4031
+ },
+ {
+ "epoch": 1.9063829787234043,
+ "grad_norm": 2.442265748977661,
+ "learning_rate": 3.890157234858697e-06,
+ "loss": 0.4748,
+ "step": 4032
+ },
+ {
+ "epoch": 1.906855791962175,
+ "grad_norm": 3.088672399520874,
+ "learning_rate": 3.889638703896849e-06,
+ "loss": 0.5729,
+ "step": 4033
+ },
+ {
+ "epoch": 1.9073286052009455,
+ "grad_norm": 2.9304986000061035,
+ "learning_rate": 3.889120086408291e-06,
+ "loss": 0.603,
+ "step": 4034
+ },
+ {
+ "epoch": 1.9078014184397163,
+ "grad_norm": 2.686093807220459,
+ "learning_rate": 3.888601382425318e-06,
+ "loss": 0.4978,
+ "step": 4035
+ },
+ {
+ "epoch": 1.908274231678487,
+ "grad_norm": 2.5668389797210693,
+ "learning_rate": 3.888082591980225e-06,
+ "loss": 0.5086,
+ "step": 4036
+ },
+ {
+ "epoch": 1.9087470449172577,
+ "grad_norm": 2.530996561050415,
+ "learning_rate": 3.887563715105315e-06,
+ "loss": 0.4678,
+ "step": 4037
+ },
+ {
+ "epoch": 1.9092198581560282,
+ "grad_norm": 3.043342351913452,
+ "learning_rate": 3.887044751832897e-06,
+ "loss": 0.5452,
+ "step": 4038
+ },
+ {
+ "epoch": 1.909692671394799,
+ "grad_norm": 2.799734115600586,
+ "learning_rate": 3.886525702195284e-06,
+ "loss": 0.5265,
+ "step": 4039
+ },
+ {
+ "epoch": 1.9101654846335698,
+ "grad_norm": 2.890022039413452,
+ "learning_rate": 3.886006566224796e-06,
+ "loss": 0.4634,
+ "step": 4040
+ },
+ {
+ "epoch": 1.9106382978723404,
+ "grad_norm": 2.6804237365722656,
+ "learning_rate": 3.8854873439537555e-06,
+ "loss": 0.5031,
+ "step": 4041
+ },
+ {
+ "epoch": 1.911111111111111,
+ "grad_norm": 2.43038272857666,
+ "learning_rate": 3.884968035414495e-06,
+ "loss": 0.5098,
+ "step": 4042
+ },
+ {
+ "epoch": 1.9115839243498818,
+ "grad_norm": 2.589583396911621,
+ "learning_rate": 3.884448640639346e-06,
+ "loss": 0.498,
+ "step": 4043
+ },
+ {
+ "epoch": 1.9120567375886526,
+ "grad_norm": 2.4565231800079346,
+ "learning_rate": 3.8839291596606524e-06,
+ "loss": 0.4318,
+ "step": 4044
+ },
+ {
+ "epoch": 1.9125295508274232,
+ "grad_norm": 2.66762638092041,
+ "learning_rate": 3.8834095925107575e-06,
+ "loss": 0.5441,
+ "step": 4045
+ },
+ {
+ "epoch": 1.9130023640661937,
+ "grad_norm": 2.7334461212158203,
+ "learning_rate": 3.882889939222013e-06,
+ "loss": 0.5209,
+ "step": 4046
+ },
+ {
+ "epoch": 1.9134751773049645,
+ "grad_norm": 2.6398537158966064,
+ "learning_rate": 3.8823701998267765e-06,
+ "loss": 0.4874,
+ "step": 4047
+ },
+ {
+ "epoch": 1.9139479905437353,
+ "grad_norm": 2.82405161857605,
+ "learning_rate": 3.881850374357409e-06,
+ "loss": 0.4519,
+ "step": 4048
+ },
+ {
+ "epoch": 1.914420803782506,
+ "grad_norm": 2.7552523612976074,
+ "learning_rate": 3.8813304628462776e-06,
+ "loss": 0.547,
+ "step": 4049
+ },
+ {
+ "epoch": 1.9148936170212765,
+ "grad_norm": 2.5287928581237793,
+ "learning_rate": 3.880810465325755e-06,
+ "loss": 0.5226,
+ "step": 4050
+ },
+ {
+ "epoch": 1.9153664302600473,
+ "grad_norm": 2.7597358226776123,
+ "learning_rate": 3.88029038182822e-06,
+ "loss": 0.5171,
+ "step": 4051
+ },
+ {
+ "epoch": 1.915839243498818,
+ "grad_norm": 2.563899278640747,
+ "learning_rate": 3.879770212386055e-06,
+ "loss": 0.4911,
+ "step": 4052
+ },
+ {
+ "epoch": 1.9163120567375886,
+ "grad_norm": 2.499404191970825,
+ "learning_rate": 3.879249957031649e-06,
+ "loss": 0.5072,
+ "step": 4053
+ },
+ {
+ "epoch": 1.9167848699763592,
+ "grad_norm": 2.817713499069214,
+ "learning_rate": 3.878729615797396e-06,
+ "loss": 0.5452,
+ "step": 4054
+ },
+ {
+ "epoch": 1.91725768321513,
+ "grad_norm": 2.7152490615844727,
+ "learning_rate": 3.878209188715696e-06,
+ "loss": 0.4917,
+ "step": 4055
+ },
+ {
+ "epoch": 1.9177304964539008,
+ "grad_norm": 2.384265661239624,
+ "learning_rate": 3.877688675818953e-06,
+ "loss": 0.4823,
+ "step": 4056
+ },
+ {
+ "epoch": 1.9182033096926714,
+ "grad_norm": 2.61059308052063,
+ "learning_rate": 3.877168077139577e-06,
+ "loss": 0.478,
+ "step": 4057
+ },
+ {
+ "epoch": 1.918676122931442,
+ "grad_norm": 2.6107938289642334,
+ "learning_rate": 3.8766473927099824e-06,
+ "loss": 0.5202,
+ "step": 4058
+ },
+ {
+ "epoch": 1.9191489361702128,
+ "grad_norm": 2.2339766025543213,
+ "learning_rate": 3.876126622562592e-06,
+ "loss": 0.547,
+ "step": 4059
+ },
+ {
+ "epoch": 1.9196217494089836,
+ "grad_norm": 2.4324610233306885,
+ "learning_rate": 3.8756057667298304e-06,
+ "loss": 0.5333,
+ "step": 4060
+ },
+ {
+ "epoch": 1.9200945626477541,
+ "grad_norm": 2.5521230697631836,
+ "learning_rate": 3.875084825244131e-06,
+ "loss": 0.5503,
+ "step": 4061
+ },
+ {
+ "epoch": 1.9205673758865247,
+ "grad_norm": 2.6985747814178467,
+ "learning_rate": 3.874563798137928e-06,
+ "loss": 0.4944,
+ "step": 4062
+ },
+ {
+ "epoch": 1.9210401891252955,
+ "grad_norm": 2.422332525253296,
+ "learning_rate": 3.874042685443664e-06,
+ "loss": 0.4807,
+ "step": 4063
+ },
+ {
+ "epoch": 1.9215130023640663,
+ "grad_norm": 2.914553165435791,
+ "learning_rate": 3.873521487193788e-06,
+ "loss": 0.4439,
+ "step": 4064
+ },
+ {
+ "epoch": 1.9219858156028369,
+ "grad_norm": 2.8098697662353516,
+ "learning_rate": 3.873000203420752e-06,
+ "loss": 0.5433,
+ "step": 4065
+ },
+ {
+ "epoch": 1.9224586288416075,
+ "grad_norm": 2.6124703884124756,
+ "learning_rate": 3.872478834157013e-06,
+ "loss": 0.4812,
+ "step": 4066
+ },
+ {
+ "epoch": 1.9229314420803783,
+ "grad_norm": 2.511059522628784,
+ "learning_rate": 3.871957379435035e-06,
+ "loss": 0.4666,
+ "step": 4067
+ },
+ {
+ "epoch": 1.923404255319149,
+ "grad_norm": 2.950542688369751,
+ "learning_rate": 3.871435839287287e-06,
+ "loss": 0.5687,
+ "step": 4068
+ },
+ {
+ "epoch": 1.9238770685579196,
+ "grad_norm": 2.4969422817230225,
+ "learning_rate": 3.870914213746243e-06,
+ "loss": 0.5235,
+ "step": 4069
+ },
+ {
+ "epoch": 1.9243498817966902,
+ "grad_norm": 2.512152910232544,
+ "learning_rate": 3.870392502844382e-06,
+ "loss": 0.4524,
+ "step": 4070
+ },
+ {
+ "epoch": 1.924822695035461,
+ "grad_norm": 3.0212557315826416,
+ "learning_rate": 3.86987070661419e-06,
+ "loss": 0.4868,
+ "step": 4071
+ },
+ {
+ "epoch": 1.9252955082742318,
+ "grad_norm": 2.8949966430664062,
+ "learning_rate": 3.869348825088154e-06,
+ "loss": 0.5556,
+ "step": 4072
+ },
+ {
+ "epoch": 1.9257683215130024,
+ "grad_norm": 2.402043581008911,
+ "learning_rate": 3.868826858298772e-06,
+ "loss": 0.5307,
+ "step": 4073
+ },
+ {
+ "epoch": 1.926241134751773,
+ "grad_norm": 2.980992078781128,
+ "learning_rate": 3.868304806278543e-06,
+ "loss": 0.6313,
+ "step": 4074
+ },
+ {
+ "epoch": 1.9267139479905437,
+ "grad_norm": 2.7140514850616455,
+ "learning_rate": 3.867782669059975e-06,
+ "loss": 0.5359,
+ "step": 4075
+ },
+ {
+ "epoch": 1.9271867612293145,
+ "grad_norm": 2.499631643295288,
+ "learning_rate": 3.867260446675577e-06,
+ "loss": 0.4873,
+ "step": 4076
+ },
+ {
+ "epoch": 1.9276595744680851,
+ "grad_norm": 2.915583610534668,
+ "learning_rate": 3.866738139157866e-06,
+ "loss": 0.5736,
+ "step": 4077
+ },
+ {
+ "epoch": 1.9281323877068557,
+ "grad_norm": 2.4231131076812744,
+ "learning_rate": 3.866215746539363e-06,
+ "loss": 0.5096,
+ "step": 4078
+ },
+ {
+ "epoch": 1.9286052009456265,
+ "grad_norm": 2.360074996948242,
+ "learning_rate": 3.865693268852599e-06,
+ "loss": 0.4907,
+ "step": 4079
+ },
+ {
+ "epoch": 1.9290780141843973,
+ "grad_norm": 2.5410032272338867,
+ "learning_rate": 3.865170706130101e-06,
+ "loss": 0.473,
+ "step": 4080
+ },
+ {
+ "epoch": 1.9295508274231679,
+ "grad_norm": 2.780090808868408,
+ "learning_rate": 3.86464805840441e-06,
+ "loss": 0.5213,
+ "step": 4081
+ },
+ {
+ "epoch": 1.9300236406619384,
+ "grad_norm": 2.7318382263183594,
+ "learning_rate": 3.864125325708068e-06,
+ "loss": 0.5617,
+ "step": 4082
+ },
+ {
+ "epoch": 1.9304964539007092,
+ "grad_norm": 2.76509165763855,
+ "learning_rate": 3.863602508073623e-06,
+ "loss": 0.52,
+ "step": 4083
+ },
+ {
+ "epoch": 1.93096926713948,
+ "grad_norm": 2.8041110038757324,
+ "learning_rate": 3.863079605533631e-06,
+ "loss": 0.5343,
+ "step": 4084
+ },
+ {
+ "epoch": 1.9314420803782506,
+ "grad_norm": 2.4462404251098633,
+ "learning_rate": 3.862556618120647e-06,
+ "loss": 0.4657,
+ "step": 4085
+ },
+ {
+ "epoch": 1.9319148936170212,
+ "grad_norm": 2.460864305496216,
+ "learning_rate": 3.862033545867238e-06,
+ "loss": 0.517,
+ "step": 4086
+ },
+ {
+ "epoch": 1.932387706855792,
+ "grad_norm": 2.6480276584625244,
+ "learning_rate": 3.8615103888059715e-06,
+ "loss": 0.4702,
+ "step": 4087
+ },
+ {
+ "epoch": 1.9328605200945628,
+ "grad_norm": 2.7175381183624268,
+ "learning_rate": 3.860987146969424e-06,
+ "loss": 0.5073,
+ "step": 4088
+ },
+ {
+ "epoch": 1.9333333333333333,
+ "grad_norm": 2.4963486194610596,
+ "learning_rate": 3.860463820390175e-06,
+ "loss": 0.4491,
+ "step": 4089
+ },
+ {
+ "epoch": 1.933806146572104,
+ "grad_norm": 2.548135757446289,
+ "learning_rate": 3.8599404091008075e-06,
+ "loss": 0.5134,
+ "step": 4090
+ },
+ {
+ "epoch": 1.9342789598108747,
+ "grad_norm": 2.8693668842315674,
+ "learning_rate": 3.859416913133916e-06,
+ "loss": 0.5467,
+ "step": 4091
+ },
+ {
+ "epoch": 1.9347517730496455,
+ "grad_norm": 2.711273670196533,
+ "learning_rate": 3.858893332522092e-06,
+ "loss": 0.6287,
+ "step": 4092
+ },
+ {
+ "epoch": 1.935224586288416,
+ "grad_norm": 2.8604533672332764,
+ "learning_rate": 3.858369667297941e-06,
+ "loss": 0.5661,
+ "step": 4093
+ },
+ {
+ "epoch": 1.9356973995271867,
+ "grad_norm": 2.936988353729248,
+ "learning_rate": 3.857845917494066e-06,
+ "loss": 0.5311,
+ "step": 4094
+ },
+ {
+ "epoch": 1.9361702127659575,
+ "grad_norm": 2.414093494415283,
+ "learning_rate": 3.857322083143079e-06,
+ "loss": 0.505,
+ "step": 4095
+ },
+ {
+ "epoch": 1.9366430260047283,
+ "grad_norm": 2.5528934001922607,
+ "learning_rate": 3.856798164277599e-06,
+ "loss": 0.4759,
+ "step": 4096
+ },
+ {
+ "epoch": 1.9371158392434988,
+ "grad_norm": 2.592893600463867,
+ "learning_rate": 3.8562741609302456e-06,
+ "loss": 0.4932,
+ "step": 4097
+ },
+ {
+ "epoch": 1.9375886524822694,
+ "grad_norm": 2.9619107246398926,
+ "learning_rate": 3.855750073133648e-06,
+ "loss": 0.5563,
+ "step": 4098
+ },
+ {
+ "epoch": 1.9380614657210402,
+ "grad_norm": 2.864889621734619,
+ "learning_rate": 3.855225900920438e-06,
+ "loss": 0.5069,
+ "step": 4099
+ },
+ {
+ "epoch": 1.938534278959811,
+ "grad_norm": 2.3951032161712646,
+ "learning_rate": 3.854701644323253e-06,
+ "loss": 0.4883,
+ "step": 4100
+ },
+ {
+ "epoch": 1.9390070921985816,
+ "grad_norm": 2.6339633464813232,
+ "learning_rate": 3.854177303374737e-06,
+ "loss": 0.5207,
+ "step": 4101
+ },
+ {
+ "epoch": 1.9394799054373522,
+ "grad_norm": 2.6435508728027344,
+ "learning_rate": 3.853652878107539e-06,
+ "loss": 0.4679,
+ "step": 4102
+ },
+ {
+ "epoch": 1.939952718676123,
+ "grad_norm": 2.4635629653930664,
+ "learning_rate": 3.853128368554311e-06,
+ "loss": 0.5639,
+ "step": 4103
+ },
+ {
+ "epoch": 1.9404255319148938,
+ "grad_norm": 2.664635419845581,
+ "learning_rate": 3.852603774747714e-06,
+ "loss": 0.5697,
+ "step": 4104
+ },
+ {
+ "epoch": 1.9408983451536643,
+ "grad_norm": 2.7020363807678223,
+ "learning_rate": 3.8520790967204095e-06,
+ "loss": 0.5462,
+ "step": 4105
+ },
+ {
+ "epoch": 1.941371158392435,
+ "grad_norm": 3.529282331466675,
+ "learning_rate": 3.851554334505069e-06,
+ "loss": 0.54,
+ "step": 4106
+ },
+ {
+ "epoch": 1.9418439716312057,
+ "grad_norm": 2.7125768661499023,
+ "learning_rate": 3.851029488134367e-06,
+ "loss": 0.5355,
+ "step": 4107
+ },
+ {
+ "epoch": 1.9423167848699765,
+ "grad_norm": 2.5226643085479736,
+ "learning_rate": 3.850504557640981e-06,
+ "loss": 0.5106,
+ "step": 4108
+ },
+ {
+ "epoch": 1.942789598108747,
+ "grad_norm": 2.834352731704712,
+ "learning_rate": 3.8499795430575995e-06,
+ "loss": 0.6069,
+ "step": 4109
+ },
+ {
+ "epoch": 1.9432624113475176,
+ "grad_norm": 2.8484177589416504,
+ "learning_rate": 3.849454444416911e-06,
+ "loss": 0.5542,
+ "step": 4110
+ },
+ {
+ "epoch": 1.9437352245862884,
+ "grad_norm": 2.402539014816284,
+ "learning_rate": 3.848929261751612e-06,
+ "loss": 0.47,
+ "step": 4111
+ },
+ {
+ "epoch": 1.9442080378250592,
+ "grad_norm": 2.7010042667388916,
+ "learning_rate": 3.848403995094402e-06,
+ "loss": 0.5263,
+ "step": 4112
+ },
+ {
+ "epoch": 1.9446808510638298,
+ "grad_norm": 2.441689968109131,
+ "learning_rate": 3.847878644477988e-06,
+ "loss": 0.5607,
+ "step": 4113
+ },
+ {
+ "epoch": 1.9451536643026004,
+ "grad_norm": 2.5994722843170166,
+ "learning_rate": 3.847353209935081e-06,
+ "loss": 0.5103,
+ "step": 4114
+ },
+ {
+ "epoch": 1.9456264775413712,
+ "grad_norm": 2.452242136001587,
+ "learning_rate": 3.8468276914983975e-06,
+ "loss": 0.4409,
+ "step": 4115
+ },
+ {
+ "epoch": 1.946099290780142,
+ "grad_norm": 2.421023368835449,
+ "learning_rate": 3.84630208920066e-06,
+ "loss": 0.4429,
+ "step": 4116
+ },
+ {
+ "epoch": 1.9465721040189126,
+ "grad_norm": 2.696399688720703,
+ "learning_rate": 3.8457764030745945e-06,
+ "loss": 0.5352,
+ "step": 4117
+ },
+ {
+ "epoch": 1.9470449172576831,
+ "grad_norm": 2.3963489532470703,
+ "learning_rate": 3.845250633152933e-06,
+ "loss": 0.4505,
+ "step": 4118
+ },
+ {
+ "epoch": 1.947517730496454,
+ "grad_norm": 2.610649585723877,
+ "learning_rate": 3.8447247794684135e-06,
+ "loss": 0.501,
+ "step": 4119
+ },
+ {
+ "epoch": 1.9479905437352247,
+ "grad_norm": 2.740412712097168,
+ "learning_rate": 3.8441988420537775e-06,
+ "loss": 0.5362,
+ "step": 4120
+ },
+ {
+ "epoch": 1.9484633569739953,
+ "grad_norm": 2.2614004611968994,
+ "learning_rate": 3.8436728209417755e-06,
+ "loss": 0.4199,
+ "step": 4121
+ },
+ {
+ "epoch": 1.9489361702127659,
+ "grad_norm": 3.0683481693267822,
+ "learning_rate": 3.843146716165158e-06,
+ "loss": 0.5248,
+ "step": 4122
+ },
+ {
+ "epoch": 1.9494089834515367,
+ "grad_norm": 3.005174398422241,
+ "learning_rate": 3.842620527756684e-06,
+ "loss": 0.5246,
+ "step": 4123
+ },
+ {
+ "epoch": 1.9498817966903075,
+ "grad_norm": 2.672896385192871,
+ "learning_rate": 3.842094255749117e-06,
+ "loss": 0.5586,
+ "step": 4124
+ },
+ {
+ "epoch": 1.950354609929078,
+ "grad_norm": 2.5481197834014893,
+ "learning_rate": 3.8415679001752255e-06,
+ "loss": 0.5061,
+ "step": 4125
+ },
+ {
+ "epoch": 1.9508274231678486,
+ "grad_norm": 2.515789270401001,
+ "learning_rate": 3.8410414610677835e-06,
+ "loss": 0.4645,
+ "step": 4126
+ },
+ {
+ "epoch": 1.9513002364066194,
+ "grad_norm": 2.7236077785491943,
+ "learning_rate": 3.84051493845957e-06,
+ "loss": 0.5623,
+ "step": 4127
+ },
+ {
+ "epoch": 1.9517730496453902,
+ "grad_norm": 2.6252009868621826,
+ "learning_rate": 3.839988332383369e-06,
+ "loss": 0.5078,
+ "step": 4128
+ },
+ {
+ "epoch": 1.9522458628841608,
+ "grad_norm": 2.719196081161499,
+ "learning_rate": 3.83946164287197e-06,
+ "loss": 0.5481,
+ "step": 4129
+ },
+ {
+ "epoch": 1.9527186761229314,
+ "grad_norm": 2.484163284301758,
+ "learning_rate": 3.838934869958169e-06,
+ "loss": 0.5332,
+ "step": 4130
+ },
+ {
+ "epoch": 1.9531914893617022,
+ "grad_norm": 2.615382671356201,
+ "learning_rate": 3.838408013674764e-06,
+ "loss": 0.4742,
+ "step": 4131
+ },
+ {
+ "epoch": 1.953664302600473,
+ "grad_norm": 2.735321044921875,
+ "learning_rate": 3.83788107405456e-06,
+ "loss": 0.421,
+ "step": 4132
+ },
+ {
+ "epoch": 1.9541371158392435,
+ "grad_norm": 2.892652750015259,
+ "learning_rate": 3.837354051130369e-06,
+ "loss": 0.5326,
+ "step": 4133
+ },
+ {
+ "epoch": 1.9546099290780141,
+ "grad_norm": 2.6800546646118164,
+ "learning_rate": 3.8368269449350055e-06,
+ "loss": 0.5041,
+ "step": 4134
+ },
+ {
+ "epoch": 1.955082742316785,
+ "grad_norm": 2.362470865249634,
+ "learning_rate": 3.836299755501289e-06,
+ "loss": 0.4697,
+ "step": 4135
+ },
+ {
+ "epoch": 1.9555555555555557,
+ "grad_norm": 2.3855135440826416,
+ "learning_rate": 3.835772482862047e-06,
+ "loss": 0.5148,
+ "step": 4136
+ },
+ {
+ "epoch": 1.9560283687943263,
+ "grad_norm": 2.3338418006896973,
+ "learning_rate": 3.83524512705011e-06,
+ "loss": 0.4643,
+ "step": 4137
+ },
+ {
+ "epoch": 1.9565011820330969,
+ "grad_norm": 2.261355400085449,
+ "learning_rate": 3.834717688098313e-06,
+ "loss": 0.5573,
+ "step": 4138
+ },
+ {
+ "epoch": 1.9569739952718677,
+ "grad_norm": 2.8166391849517822,
+ "learning_rate": 3.834190166039498e-06,
+ "loss": 0.4868,
+ "step": 4139
+ },
+ {
+ "epoch": 1.9574468085106385,
+ "grad_norm": 2.4155869483947754,
+ "learning_rate": 3.833662560906512e-06,
+ "loss": 0.4923,
+ "step": 4140
+ },
+ {
+ "epoch": 1.957919621749409,
+ "grad_norm": 2.3977696895599365,
+ "learning_rate": 3.833134872732206e-06,
+ "loss": 0.5106,
+ "step": 4141
+ },
+ {
+ "epoch": 1.9583924349881796,
+ "grad_norm": 2.9541378021240234,
+ "learning_rate": 3.832607101549438e-06,
+ "loss": 0.4683,
+ "step": 4142
+ },
+ {
+ "epoch": 1.9588652482269504,
+ "grad_norm": 2.5862700939178467,
+ "learning_rate": 3.832079247391068e-06,
+ "loss": 0.4453,
+ "step": 4143
+ },
+ {
+ "epoch": 1.9593380614657212,
+ "grad_norm": 2.7459371089935303,
+ "learning_rate": 3.8315513102899644e-06,
+ "loss": 0.5511,
+ "step": 4144
+ },
+ {
+ "epoch": 1.9598108747044918,
+ "grad_norm": 2.904869556427002,
+ "learning_rate": 3.831023290279e-06,
+ "loss": 0.5348,
+ "step": 4145
+ },
+ {
+ "epoch": 1.9602836879432624,
+ "grad_norm": 3.092846632003784,
+ "learning_rate": 3.830495187391051e-06,
+ "loss": 0.5664,
+ "step": 4146
+ },
+ {
+ "epoch": 1.9607565011820332,
+ "grad_norm": 3.2838528156280518,
+ "learning_rate": 3.829967001659001e-06,
+ "loss": 0.5115,
+ "step": 4147
+ },
+ {
+ "epoch": 1.961229314420804,
+ "grad_norm": 2.7799549102783203,
+ "learning_rate": 3.829438733115738e-06,
+ "loss": 0.5145,
+ "step": 4148
+ },
+ {
+ "epoch": 1.9617021276595743,
+ "grad_norm": 2.436084270477295,
+ "learning_rate": 3.828910381794154e-06,
+ "loss": 0.4718,
+ "step": 4149
+ },
+ {
+ "epoch": 1.962174940898345,
+ "grad_norm": 2.6662371158599854,
+ "learning_rate": 3.828381947727148e-06,
+ "loss": 0.6129,
+ "step": 4150
+ },
+ {
+ "epoch": 1.962647754137116,
+ "grad_norm": 2.937000036239624,
+ "learning_rate": 3.827853430947622e-06,
+ "loss": 0.522,
+ "step": 4151
+ },
+ {
+ "epoch": 1.9631205673758865,
+ "grad_norm": 2.5737369060516357,
+ "learning_rate": 3.827324831488486e-06,
+ "loss": 0.4916,
+ "step": 4152
+ },
+ {
+ "epoch": 1.963593380614657,
+ "grad_norm": 2.70232892036438,
+ "learning_rate": 3.826796149382653e-06,
+ "loss": 0.4726,
+ "step": 4153
+ },
+ {
+ "epoch": 1.9640661938534278,
+ "grad_norm": 2.6899707317352295,
+ "learning_rate": 3.826267384663042e-06,
+ "loss": 0.529,
+ "step": 4154
+ },
+ {
+ "epoch": 1.9645390070921986,
+ "grad_norm": 2.6142728328704834,
+ "learning_rate": 3.825738537362575e-06,
+ "loss": 0.4999,
+ "step": 4155
+ },
+ {
+ "epoch": 1.9650118203309692,
+ "grad_norm": 2.43949818611145,
+ "learning_rate": 3.825209607514183e-06,
+ "loss": 0.5035,
+ "step": 4156
+ },
+ {
+ "epoch": 1.9654846335697398,
+ "grad_norm": 2.3735458850860596,
+ "learning_rate": 3.824680595150801e-06,
+ "loss": 0.4779,
+ "step": 4157
+ },
+ {
+ "epoch": 1.9659574468085106,
+ "grad_norm": 2.444307565689087,
+ "learning_rate": 3.824151500305365e-06,
+ "loss": 0.4825,
+ "step": 4158
+ },
+ {
+ "epoch": 1.9664302600472814,
+ "grad_norm": 2.8219668865203857,
+ "learning_rate": 3.8236223230108224e-06,
+ "loss": 0.5354,
+ "step": 4159
+ },
+ {
+ "epoch": 1.966903073286052,
+ "grad_norm": 2.720721483230591,
+ "learning_rate": 3.823093063300121e-06,
+ "loss": 0.5064,
+ "step": 4160
+ },
+ {
+ "epoch": 1.9673758865248225,
+ "grad_norm": 2.324190616607666,
+ "learning_rate": 3.822563721206217e-06,
+ "loss": 0.5348,
+ "step": 4161
+ },
+ {
+ "epoch": 1.9678486997635933,
+ "grad_norm": 2.702155351638794,
+ "learning_rate": 3.8220342967620695e-06,
+ "loss": 0.5388,
+ "step": 4162
+ },
+ {
+ "epoch": 1.9683215130023641,
+ "grad_norm": 2.4956369400024414,
+ "learning_rate": 3.821504790000642e-06,
+ "loss": 0.5071,
+ "step": 4163
+ },
+ {
+ "epoch": 1.9687943262411347,
+ "grad_norm": 2.568039655685425,
+ "learning_rate": 3.820975200954906e-06,
+ "loss": 0.5133,
+ "step": 4164
+ },
+ {
+ "epoch": 1.9692671394799053,
+ "grad_norm": 2.810868978500366,
+ "learning_rate": 3.820445529657837e-06,
+ "loss": 0.4856,
+ "step": 4165
+ },
+ {
+ "epoch": 1.969739952718676,
+ "grad_norm": 2.66365647315979,
+ "learning_rate": 3.819915776142415e-06,
+ "loss": 0.5235,
+ "step": 4166
+ },
+ {
+ "epoch": 1.9702127659574469,
+ "grad_norm": 2.2982139587402344,
+ "learning_rate": 3.8193859404416265e-06,
+ "loss": 0.4361,
+ "step": 4167
+ },
+ {
+ "epoch": 1.9706855791962175,
+ "grad_norm": 2.585672378540039,
+ "learning_rate": 3.818856022588458e-06,
+ "loss": 0.4842,
+ "step": 4168
+ },
+ {
+ "epoch": 1.971158392434988,
+ "grad_norm": 2.57857346534729,
+ "learning_rate": 3.81832602261591e-06,
+ "loss": 0.5249,
+ "step": 4169
+ },
+ {
+ "epoch": 1.9716312056737588,
+ "grad_norm": 2.6947224140167236,
+ "learning_rate": 3.817795940556981e-06,
+ "loss": 0.5234,
+ "step": 4170
+ },
+ {
+ "epoch": 1.9721040189125296,
+ "grad_norm": 2.7453415393829346,
+ "learning_rate": 3.8172657764446764e-06,
+ "loss": 0.5219,
+ "step": 4171
+ },
+ {
+ "epoch": 1.9725768321513002,
+ "grad_norm": 8.424073219299316,
+ "learning_rate": 3.816735530312009e-06,
+ "loss": 0.5162,
+ "step": 4172
+ },
+ {
+ "epoch": 1.9730496453900708,
+ "grad_norm": 2.8229739665985107,
+ "learning_rate": 3.816205202191993e-06,
+ "loss": 0.4621,
+ "step": 4173
+ },
+ {
+ "epoch": 1.9735224586288416,
+ "grad_norm": 2.5969009399414062,
+ "learning_rate": 3.815674792117651e-06,
+ "loss": 0.5044,
+ "step": 4174
+ },
+ {
+ "epoch": 1.9739952718676124,
+ "grad_norm": 2.646024227142334,
+ "learning_rate": 3.815144300122009e-06,
+ "loss": 0.5094,
+ "step": 4175
+ },
+ {
+ "epoch": 1.974468085106383,
+ "grad_norm": 2.4950616359710693,
+ "learning_rate": 3.814613726238097e-06,
+ "loss": 0.4827,
+ "step": 4176
+ },
+ {
+ "epoch": 1.9749408983451535,
+ "grad_norm": 2.5636119842529297,
+ "learning_rate": 3.8140830704989535e-06,
+ "loss": 0.5241,
+ "step": 4177
+ },
+ {
+ "epoch": 1.9754137115839243,
+ "grad_norm": 2.7936553955078125,
+ "learning_rate": 3.813552332937619e-06,
+ "loss": 0.5344,
+ "step": 4178
+ },
+ {
+ "epoch": 1.9758865248226951,
+ "grad_norm": 2.8085341453552246,
+ "learning_rate": 3.8130215135871405e-06,
+ "loss": 0.5647,
+ "step": 4179
+ },
+ {
+ "epoch": 1.9763593380614657,
+ "grad_norm": 2.4776322841644287,
+ "learning_rate": 3.8124906124805694e-06,
+ "loss": 0.542,
+ "step": 4180
+ },
+ {
+ "epoch": 1.9768321513002363,
+ "grad_norm": 2.3227856159210205,
+ "learning_rate": 3.8119596296509635e-06,
+ "loss": 0.4618,
+ "step": 4181
+ },
+ {
+ "epoch": 1.977304964539007,
+ "grad_norm": 2.5157814025878906,
+ "learning_rate": 3.8114285651313848e-06,
+ "loss": 0.538,
+ "step": 4182
+ },
+ {
+ "epoch": 1.9777777777777779,
+ "grad_norm": 2.5630218982696533,
+ "learning_rate": 3.8108974189548987e-06,
+ "loss": 0.5254,
+ "step": 4183
+ },
+ {
+ "epoch": 1.9782505910165484,
+ "grad_norm": 2.703237533569336,
+ "learning_rate": 3.8103661911545787e-06,
+ "loss": 0.4859,
+ "step": 4184
+ },
+ {
+ "epoch": 1.978723404255319,
+ "grad_norm": 2.8808000087738037,
+ "learning_rate": 3.809834881763502e-06,
+ "loss": 0.5585,
+ "step": 4185
+ },
+ {
+ "epoch": 1.9791962174940898,
+ "grad_norm": 2.9047577381134033,
+ "learning_rate": 3.8093034908147507e-06,
+ "loss": 0.5022,
+ "step": 4186
+ },
+ {
+ "epoch": 1.9796690307328606,
+ "grad_norm": 2.7417640686035156,
+ "learning_rate": 3.8087720183414125e-06,
+ "loss": 0.5275,
+ "step": 4187
+ },
+ {
+ "epoch": 1.9801418439716312,
+ "grad_norm": 2.952012062072754,
+ "learning_rate": 3.8082404643765786e-06,
+ "loss": 0.543,
+ "step": 4188
+ },
+ {
+ "epoch": 1.9806146572104018,
+ "grad_norm": 2.538376569747925,
+ "learning_rate": 3.807708828953348e-06,
+ "loss": 0.4969,
+ "step": 4189
+ },
+ {
+ "epoch": 1.9810874704491725,
+ "grad_norm": 2.3476181030273438,
+ "learning_rate": 3.807177112104823e-06,
+ "loss": 0.4979,
+ "step": 4190
+ },
+ {
+ "epoch": 1.9815602836879433,
+ "grad_norm": 2.6480464935302734,
+ "learning_rate": 3.80664531386411e-06,
+ "loss": 0.4894,
+ "step": 4191
+ },
+ {
+ "epoch": 1.982033096926714,
+ "grad_norm": 2.792916774749756,
+ "learning_rate": 3.8061134342643235e-06,
+ "loss": 0.5468,
+ "step": 4192
+ },
+ {
+ "epoch": 1.9825059101654845,
+ "grad_norm": 2.368736743927002,
+ "learning_rate": 3.805581473338581e-06,
+ "loss": 0.4672,
+ "step": 4193
+ },
+ {
+ "epoch": 1.9829787234042553,
+ "grad_norm": 2.379084348678589,
+ "learning_rate": 3.8050494311200037e-06,
+ "loss": 0.4577,
+ "step": 4194
+ },
+ {
+ "epoch": 1.983451536643026,
+ "grad_norm": 2.722471237182617,
+ "learning_rate": 3.804517307641722e-06,
+ "loss": 0.4988,
+ "step": 4195
+ },
+ {
+ "epoch": 1.9839243498817967,
+ "grad_norm": 2.356649875640869,
+ "learning_rate": 3.8039851029368674e-06,
+ "loss": 0.4933,
+ "step": 4196
+ },
+ {
+ "epoch": 1.9843971631205672,
+ "grad_norm": 2.9182281494140625,
+ "learning_rate": 3.8034528170385776e-06,
+ "loss": 0.4873,
+ "step": 4197
+ },
+ {
+ "epoch": 1.984869976359338,
+ "grad_norm": 2.6232199668884277,
+ "learning_rate": 3.8029204499799976e-06,
+ "loss": 0.4425,
+ "step": 4198
+ },
+ {
+ "epoch": 1.9853427895981088,
+ "grad_norm": 2.667541980743408,
+ "learning_rate": 3.802388001794274e-06,
+ "loss": 0.5022,
+ "step": 4199
+ },
+ {
+ "epoch": 1.9858156028368794,
+ "grad_norm": 3.168470621109009,
+ "learning_rate": 3.8018554725145596e-06,
+ "loss": 0.5505,
+ "step": 4200
+ },
+ {
+ "epoch": 1.98628841607565,
+ "grad_norm": 2.716625452041626,
+ "learning_rate": 3.8013228621740132e-06,
+ "loss": 0.4937,
+ "step": 4201
+ },
+ {
+ "epoch": 1.9867612293144208,
+ "grad_norm": 2.3014442920684814,
+ "learning_rate": 3.800790170805799e-06,
+ "loss": 0.4734,
+ "step": 4202
+ },
+ {
+ "epoch": 1.9872340425531916,
+ "grad_norm": 2.9426841735839844,
+ "learning_rate": 3.8002573984430847e-06,
+ "loss": 0.4983,
+ "step": 4203
+ },
+ {
+ "epoch": 1.9877068557919622,
+ "grad_norm": 2.5598278045654297,
+ "learning_rate": 3.7997245451190435e-06,
+ "loss": 0.4834,
+ "step": 4204
+ },
+ {
+ "epoch": 1.9881796690307327,
+ "grad_norm": 2.86458420753479,
+ "learning_rate": 3.7991916108668538e-06,
+ "loss": 0.5613,
+ "step": 4205
+ },
+ {
+ "epoch": 1.9886524822695035,
+ "grad_norm": 2.842914342880249,
+ "learning_rate": 3.7986585957196997e-06,
+ "loss": 0.4951,
+ "step": 4206
+ },
+ {
+ "epoch": 1.9891252955082743,
+ "grad_norm": 3.1828150749206543,
+ "learning_rate": 3.7981254997107686e-06,
+ "loss": 0.5913,
+ "step": 4207
+ },
+ {
+ "epoch": 1.989598108747045,
+ "grad_norm": 2.5765931606292725,
+ "learning_rate": 3.7975923228732547e-06,
+ "loss": 0.5544,
+ "step": 4208
+ },
+ {
+ "epoch": 1.9900709219858155,
+ "grad_norm": 2.492234945297241,
+ "learning_rate": 3.797059065240357e-06,
+ "loss": 0.5046,
+ "step": 4209
+ },
+ {
+ "epoch": 1.9905437352245863,
+ "grad_norm": 2.870346784591675,
+ "learning_rate": 3.7965257268452795e-06,
+ "loss": 0.5354,
+ "step": 4210
+ },
+ {
+ "epoch": 1.991016548463357,
+ "grad_norm": 2.4989993572235107,
+ "learning_rate": 3.795992307721229e-06,
+ "loss": 0.4677,
+ "step": 4211
+ },
+ {
+ "epoch": 1.9914893617021276,
+ "grad_norm": 2.931114673614502,
+ "learning_rate": 3.7954588079014206e-06,
+ "loss": 0.5504,
+ "step": 4212
+ },
+ {
+ "epoch": 1.9919621749408982,
+ "grad_norm": 2.5247652530670166,
+ "learning_rate": 3.794925227419073e-06,
+ "loss": 0.4736,
+ "step": 4213
+ },
+ {
+ "epoch": 1.992434988179669,
+ "grad_norm": 2.6238436698913574,
+ "learning_rate": 3.794391566307409e-06,
+ "loss": 0.4591,
+ "step": 4214
+ },
+ {
+ "epoch": 1.9929078014184398,
+ "grad_norm": 2.654886245727539,
+ "learning_rate": 3.7938578245996584e-06,
+ "loss": 0.5149,
+ "step": 4215
+ },
+ {
+ "epoch": 1.9933806146572104,
+ "grad_norm": 2.509164810180664,
+ "learning_rate": 3.793324002329054e-06,
+ "loss": 0.4951,
+ "step": 4216
+ },
+ {
+ "epoch": 1.993853427895981,
+ "grad_norm": 2.909632921218872,
+ "learning_rate": 3.7927900995288345e-06,
+ "loss": 0.5131,
+ "step": 4217
+ },
+ {
+ "epoch": 1.9943262411347518,
+ "grad_norm": 2.4354615211486816,
+ "learning_rate": 3.7922561162322456e-06,
+ "loss": 0.4716,
+ "step": 4218
+ },
+ {
+ "epoch": 1.9947990543735226,
+ "grad_norm": 2.6514649391174316,
+ "learning_rate": 3.791722052472534e-06,
+ "loss": 0.5714,
+ "step": 4219
+ },
+ {
+ "epoch": 1.9952718676122931,
+ "grad_norm": 2.77089262008667,
+ "learning_rate": 3.791187908282954e-06,
+ "loss": 0.5736,
+ "step": 4220
+ },
+ {
+ "epoch": 1.9957446808510637,
+ "grad_norm": 2.7651021480560303,
+ "learning_rate": 3.7906536836967657e-06,
+ "loss": 0.4948,
+ "step": 4221
+ },
+ {
+ "epoch": 1.9962174940898345,
+ "grad_norm": 2.7536795139312744,
+ "learning_rate": 3.7901193787472306e-06,
+ "loss": 0.512,
+ "step": 4222
+ },
+ {
+ "epoch": 1.9966903073286053,
+ "grad_norm": 2.684893846511841,
+ "learning_rate": 3.78958499346762e-06,
+ "loss": 0.5118,
+ "step": 4223
+ },
+ {
+ "epoch": 1.9971631205673759,
+ "grad_norm": 2.7616753578186035,
+ "learning_rate": 3.7890505278912054e-06,
+ "loss": 0.4516,
+ "step": 4224
+ },
+ {
+ "epoch": 1.9976359338061465,
+ "grad_norm": 2.4731967449188232,
+ "learning_rate": 3.7885159820512666e-06,
+ "loss": 0.4736,
+ "step": 4225
+ },
+ {
+ "epoch": 1.9981087470449173,
+ "grad_norm": 2.366631031036377,
+ "learning_rate": 3.7879813559810884e-06,
+ "loss": 0.4999,
+ "step": 4226
+ },
+ {
+ "epoch": 1.998581560283688,
+ "grad_norm": 2.994624137878418,
+ "learning_rate": 3.7874466497139582e-06,
+ "loss": 0.5273,
+ "step": 4227
+ },
+ {
+ "epoch": 1.9990543735224586,
+ "grad_norm": 2.4499242305755615,
+ "learning_rate": 3.7869118632831712e-06,
+ "loss": 0.5761,
+ "step": 4228
+ },
+ {
+ "epoch": 1.9995271867612292,
+ "grad_norm": 2.3370113372802734,
+ "learning_rate": 3.7863769967220243e-06,
+ "loss": 0.4673,
+ "step": 4229
+ },
+ {
+ "epoch": 2.0,
+ "grad_norm": 3.1131203174591064,
+ "learning_rate": 3.7858420500638236e-06,
+ "loss": 0.5118,
+ "step": 4230
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 12690,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 6,
+ "save_steps": 2115,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.0684503718121964e+19,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-4230/training_args.bin b/checkpoint-4230/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3885e6a27b204c834ff6fbc6f3a5a4c7f69fe2cc
--- /dev/null
+++ b/checkpoint-4230/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:51188d498a3fe1bf9ddf234271786a31e25e89b3aa68721a97f8264e1013c9b6
+size 8056
diff --git a/checkpoint-4230/zero_to_fp32.py b/checkpoint-4230/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8
--- /dev/null
+++ b/checkpoint-4230/zero_to_fp32.py
@@ -0,0 +1,604 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+ buffers: dict()
+ param_shapes: dict()
+ shared_params: list
+ ds_version: int
+ frozen_param_shapes: dict()
+ frozen_param_fragments: dict()
+
+
+debug = 0
+
+# load to cpu
+device = torch.device('cpu')
+
+
+def atoi(text):
+ return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+ '''
+ alist.sort(key=natural_keys) sorts in human order
+ http://nedbatchelder.com/blog/200712/human_sorting.html
+ (See Toothy's implementation in the comments)
+ '''
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+ if not os.path.isdir(checkpoint_dir):
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+ # there should be only one file
+ if zero_stage <= 2:
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+ elif zero_stage == 3:
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+ if not os.path.exists(file):
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+ return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+ # XXX: need to test that this simple glob rule works for multi-node setup too
+ ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+ if len(ckpt_files) == 0:
+ raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+ return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+ zero_model_states = []
+ for file in files:
+ state_dict = torch.load(file, map_location=device)
+
+ if BUFFER_NAMES not in state_dict:
+ raise ValueError(f"{file} is not a model state checkpoint")
+ buffer_names = state_dict[BUFFER_NAMES]
+ if debug:
+ print("Found buffers:", buffer_names)
+
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
+ buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+ param_shapes = state_dict[PARAM_SHAPES]
+
+ # collect parameters that are included in param_shapes
+ param_names = []
+ for s in param_shapes:
+ for name in s.keys():
+ param_names.append(name)
+
+ # update with frozen parameters
+ frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+ if frozen_param_shapes is not None:
+ if debug:
+ print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+ param_names += list(frozen_param_shapes.keys())
+
+ # handle shared params
+ shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+ ds_version = state_dict.get(DS_VERSION, None)
+
+ frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+ z_model_state = zero_model_state(buffers=buffers,
+ param_shapes=param_shapes,
+ shared_params=shared_params,
+ ds_version=ds_version,
+ frozen_param_shapes=frozen_param_shapes,
+ frozen_param_fragments=frozen_param_fragments)
+ zero_model_states.append(z_model_state)
+
+ return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+
+ total_files = len(files)
+ state_dicts = []
+ for f in files:
+ state_dict = torch.load(f, map_location=device)
+ # immediately discard the two potentially huge optimizer states, as we only care about the fp32 master weights,
+ # and also handle the case where they were already removed by another helper script
+ state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+ state_dicts.append(state_dict)
+
+ if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
+ # use the max of the partition_count to get the dp world_size.
+
+ if type(world_size) is list:
+ world_size = max(world_size)
+
+ if world_size != total_files:
+ raise ValueError(
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+ )
+
+ # the groups are named differently in each stage
+ if zero_stage <= 2:
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+ elif zero_stage == 3:
+ fp32_groups_key = FP32_FLAT_GROUPS
+ else:
+ raise ValueError(f"unknown zero stage {zero_stage}")
+
+ if zero_stage <= 2:
+ fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+ elif zero_stage == 3:
+ # if there is more than one param group, there will be multiple flattened tensors - one
+ # flattened tensor per group - for simplicity merge them into a single tensor
+ #
+ # XXX: could make the script more memory efficient for when there are multiple groups - it
+ # will require matching the sub-lists of param_shapes for each param group flattened tensor
+
+ fp32_flat_groups = [
+ torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+ ]
+
+ return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+ """
+ Returns fp32 state_dict reconstructed from ds checkpoint
+
+ Args:
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+ """
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+ optim_files = get_optim_files(ds_checkpoint_dir)
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+ print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+ model_files = get_model_state_files(ds_checkpoint_dir)
+
+ zero_model_states = parse_model_states(model_files)
+ print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+ if zero_stage <= 2:
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+ elif zero_stage == 3:
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+ return
+
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+ frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+ if debug:
+ num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+ print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+ wanted_params = len(frozen_param_shapes)
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+ avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+ print(f'Frozen params: Have {avail_numel} numels to process.')
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+ total_params = 0
+ total_numel = 0
+ for name, shape in frozen_param_shapes.items():
+ total_params += 1
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+
+ state_dict[name] = frozen_param_fragments[name]
+
+ if debug:
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+ attr = getattr(obj, fn, None)
+ return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+ param_shapes = zero_model_states[0].param_shapes
+
+ # Reconstruction protocol:
+ #
+ # XXX: document this
+
+ if debug:
+ for i in range(world_size):
+ for j in range(len(fp32_flat_groups[0])):
+ print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+ # XXX: memory usage doubles here (zero2)
+ num_param_groups = len(fp32_flat_groups[0])
+ merged_single_partition_of_fp32_groups = []
+ for i in range(num_param_groups):
+ merged_partitions = [sd[i] for sd in fp32_flat_groups]
+ full_single_fp32_vector = torch.cat(merged_partitions, 0)
+ merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+ avail_numel = sum(
+ [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+ if debug:
+ wanted_params = sum([len(shapes) for shapes in param_shapes])
+ wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+ # not asserting if there is a mismatch due to possible padding
+ print(f"Have {avail_numel} numels to process.")
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+ # params
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+ # out-of-core computing solution
+ total_numel = 0
+ total_params = 0
+ for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+ offset = 0
+ avail_numel = full_single_fp32_vector.numel()
+ for name, shape in shapes.items():
+
+ unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+ total_numel += unpartitioned_numel
+ total_params += 1
+
+ if debug:
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+ state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+ offset += unpartitioned_numel
+
+ # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+ # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+ # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+ # live optimizer object, so we are checking that the numbers are within the right range
+ align_to = 2 * world_size
+
+ def zero2_align(x):
+ return align_to * math.ceil(x / align_to)
+
+ if debug:
+ print(f"original offset={offset}, avail_numel={avail_numel}")
+
+ offset = zero2_align(offset)
+ avail_numel = zero2_align(avail_numel)
+
+ if debug:
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
+
+ # Sanity check
+ if offset != avail_numel:
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+ print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters):
+ state_dict = OrderedDict()
+
+ # buffers
+ buffers = zero_model_states[0].buffers
+ state_dict.update(buffers)
+ if debug:
+ print(f"added {len(buffers)} buffers")
+
+ if not exclude_frozen_parameters:
+ _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+ _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+ # recover shared parameters
+ for pair in zero_model_states[0].shared_params:
+ if pair[1] in state_dict:
+ state_dict[pair[0]] = state_dict[pair[1]]
+
+ return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+ remainder = unpartitioned_numel % world_size
+ padding_numel = (world_size - remainder) if remainder else 0
+ partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+ return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+ return
+
+ if debug:
+ for i in range(world_size):
+ num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+ print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+ wanted_params = len(frozen_param_shapes)
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+ avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+ print(f'Frozen params: Have {avail_numel} numels to process.')
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+ total_params = 0
+ total_numel = 0
+ for name, shape in zero_model_states[0].frozen_param_shapes.items():
+ total_params += 1
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+
+ param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+ state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+ if debug:
+ print(
+ f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+ )
+
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+ param_shapes = zero_model_states[0].param_shapes
+ avail_numel = fp32_flat_groups[0].numel() * world_size
+ # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+ # param, re-consolidating each param, while dealing with padding if any
+
+ # merge list of dicts, preserving order
+ param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+ if debug:
+ for i in range(world_size):
+ print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+ wanted_params = len(param_shapes)
+ wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+ # not asserting if there is a mismatch due to possible padding
+ avail_numel = fp32_flat_groups[0].numel() * world_size
+ print(f"Trainable params: Have {avail_numel} numels to process.")
+ print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+ # params
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+ # out-of-core computing solution
+ offset = 0
+ total_numel = 0
+ total_params = 0
+ for name, shape in param_shapes.items():
+
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+ total_params += 1
+
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+ if debug:
+ print(
+ f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+ )
+
+ # XXX: memory usage doubles here
+ state_dict[name] = torch.cat(
+ tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+ 0).narrow(0, 0, unpartitioned_numel).view(shape)
+ offset += partitioned_numel
+
+ offset *= world_size
+
+ # Sanity check
+ if offset != avail_numel:
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+ print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters):
+ state_dict = OrderedDict()
+
+ # buffers
+ buffers = zero_model_states[0].buffers
+ state_dict.update(buffers)
+ if debug:
+ print(f"added {len(buffers)} buffers")
+
+ if not exclude_frozen_parameters:
+ _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+ _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+ # recover shared parameters
+ for pair in zero_model_states[0].shared_params:
+ if pair[1] in state_dict:
+ state_dict[pair[0]] = state_dict[pair[1]]
+
+ return state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
+ """
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+ ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+ via a model hub.
+
+ Args:
+ - ``checkpoint_dir``: path to the desired checkpoint folder
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+ - ``exclude_frozen_parameters``: exclude frozen parameters
+
+ Returns:
+ - pytorch ``state_dict``
+
+    Note: this approach may not work if your application doesn't have sufficient free CPU memory;
+    in that case you may need to use the offline approach, using the ``zero_to_fp32.py`` script that
+    is saved with the checkpoint.
+
+ A typical usage might be ::
+
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+ # do the training and checkpoint saving
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+ model = model.cpu() # move to cpu
+ model.load_state_dict(state_dict)
+ # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application, i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+ If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+ """
+ if tag is None:
+ latest_path = os.path.join(checkpoint_dir, 'latest')
+ if os.path.isfile(latest_path):
+ with open(latest_path, 'r') as fd:
+ tag = fd.read().strip()
+ else:
+ raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+ ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+ if not os.path.isdir(ds_checkpoint_dir):
+ raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+ return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
+ """
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+ loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+ Args:
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+ - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+ - ``exclude_frozen_parameters``: exclude frozen parameters
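+
+    A typical usage might be (paths are illustrative) ::
+
+        from deepspeed.utils.zero_to_fp32 import convert_zero_checkpoint_to_fp32_state_dict
+        # consolidate the latest ZeRO checkpoint into a single fp32 file on disk
+        convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, "pytorch_model.bin")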
+ """
+
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
+ print(f"Saving fp32 state dict to {output_file}")
+ torch.save(state_dict, output_file)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+ """
+ 1. Put the provided model to cpu
+ 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+ 3. Load it into the provided model
+
+ Args:
+ - ``model``: the model object to update
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+ Returns:
+    - ``model``: modified model
+
+ Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough, use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+ conveniently placed for you in the checkpoint folder.
+
+ A typical usage might be ::
+
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+ model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+ # submit to model hub or save the model to share with others
+
+    Note that once this is run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application, i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+ """
+    logger.info("Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info("Overwriting model with fp32 weights")
+ model = model.cpu()
+ model.load_state_dict(state_dict, strict=False)
+
+ return model
+
+
+if __name__ == "__main__":
+
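+    # Example invocation (illustrative paths, matching the checkpoint layout in this repo):
+    #   python zero_to_fp32.py checkpoint-6345 checkpoint-6345/pytorch_model.bin -t global_step6345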
+ parser = argparse.ArgumentParser()
+ parser.add_argument("checkpoint_dir",
+ type=str,
+ help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+ parser.add_argument(
+ "output_file",
+ type=str,
+ help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+ parser.add_argument("-t",
+ "--tag",
+ type=str,
+ default=None,
+ help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+ parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+ parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+ args = parser.parse_args()
+
+ debug = args.debug
+
+ convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+ args.output_file,
+ tag=args.tag,
+ exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/checkpoint-6345/README.md b/checkpoint-6345/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..049d467664ca6172b7ffbe6ba60b3eac7479cac4
--- /dev/null
+++ b/checkpoint-6345/README.md
@@ -0,0 +1,202 @@
+---
+base_model: meta-llama/Llama-3.1-8B
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
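+
+A minimal sketch (assuming the PEFT adapter stored in this checkpoint folder on top of the base model; the local path below is illustrative):
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from peft import PeftModel
+
+base = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B")
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
+# attach the LoRA adapter weights from this checkpoint directory
+model = PeftModel.from_pretrained(base, "./checkpoint-6345")
+```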
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+
+### Framework versions
+
+- PEFT 0.14.0
\ No newline at end of file
diff --git a/checkpoint-6345/adapter_config.json b/checkpoint-6345/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..38890ca21f7e1854f7350049a9113a9eec8a443a
--- /dev/null
+++ b/checkpoint-6345/adapter_config.json
@@ -0,0 +1,40 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "meta-llama/Llama-3.1-8B",
+ "bias": "none",
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 512,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": [
+ "embed_tokens",
+ "lm_head"
+ ],
+ "peft_type": "LORA",
+ "r": 256,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "v_proj",
+ "q_proj",
+ "gate_proj",
+ "k_proj",
+ "down_proj",
+ "up_proj",
+ "o_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-6345/adapter_model.safetensors b/checkpoint-6345/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a53ac6eab0e0f6a4c9380ba3986fc09997e7c966
--- /dev/null
+++ b/checkpoint-6345/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3db9057267cf2d082b32710b5fe435b23b00f8298e615fe927c3a0adc70c4767
+size 3443586272
diff --git a/checkpoint-6345/global_step6345/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-6345/global_step6345/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..d16de72224564ee5a203b10247bcc7645b7fbf44
--- /dev/null
+++ b/checkpoint-6345/global_step6345/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d957c7c6d3a677cf4693895155fa37c363f329306246b9a663e31cae9a23672
+size 20661195036
diff --git a/checkpoint-6345/global_step6345/mp_rank_00_model_states.pt b/checkpoint-6345/global_step6345/mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..157a53c213e78312f6246ec75059d2ac3e78ff5b
--- /dev/null
+++ b/checkpoint-6345/global_step6345/mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e51765e012d7c7d3bbdd93b2bc0bf866d582a03c65957556bf9c5fa12d9ab138
+size 3555326841
diff --git a/checkpoint-6345/latest b/checkpoint-6345/latest
new file mode 100644
index 0000000000000000000000000000000000000000..fd3df6b13e10c2bf305cd21c1bc31f479846db0c
--- /dev/null
+++ b/checkpoint-6345/latest
@@ -0,0 +1 @@
+global_step6345
\ No newline at end of file
diff --git a/checkpoint-6345/rng_state.pth b/checkpoint-6345/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..130c6b431a345102d115f1e8198f9da92ce28bc5
--- /dev/null
+++ b/checkpoint-6345/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8dfe287174d835b24c21de3b28711ba5592cb0f747c67f0d46f820548100d9cb
+size 14244
diff --git a/checkpoint-6345/scheduler.pt b/checkpoint-6345/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2bca9b33032354b22350732af9e832e85170752f
--- /dev/null
+++ b/checkpoint-6345/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:db2cf1b2eb6f1e141df847d489467e491cfee5e0bebdaddda4c4564e7cfa498b
+size 1064
diff --git a/checkpoint-6345/special_tokens_map.json b/checkpoint-6345/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..e5b39b6305d89284b04934011c68dbb26bf588ca
--- /dev/null
+++ b/checkpoint-6345/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-6345/tokenizer.json b/checkpoint-6345/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2
--- /dev/null
+++ b/checkpoint-6345/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
diff --git a/checkpoint-6345/tokenizer_config.json b/checkpoint-6345/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..81dd14db6632ad5b35b9d447732e37ac074873a5
--- /dev/null
+++ b/checkpoint-6345/tokenizer_config.json
@@ -0,0 +1,2063 @@
+{
+ "added_tokens_decoder": {
+ "128000": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128001": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128002": {
+ "content": "<|reserved_special_token_0|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128003": {
+ "content": "<|reserved_special_token_1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128004": {
+ "content": "<|finetune_right_pad_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128005": {
+ "content": "<|reserved_special_token_2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128006": {
+ "content": "<|start_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128007": {
+ "content": "<|end_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128008": {
+ "content": "<|eom_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128009": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128010": {
+ "content": "<|python_tag|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128011": {
+ "content": "<|reserved_special_token_3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128012": {
+ "content": "<|reserved_special_token_4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128013": {
+ "content": "<|reserved_special_token_5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128014": {
+ "content": "<|reserved_special_token_6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128015": {
+ "content": "<|reserved_special_token_7|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128016": {
+ "content": "<|reserved_special_token_8|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128017": {
+ "content": "<|reserved_special_token_9|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128018": {
+ "content": "<|reserved_special_token_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128019": {
+ "content": "<|reserved_special_token_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128020": {
+ "content": "<|reserved_special_token_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128021": {
+ "content": "<|reserved_special_token_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128022": {
+ "content": "<|reserved_special_token_14|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128023": {
+ "content": "<|reserved_special_token_15|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128024": {
+ "content": "<|reserved_special_token_16|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128025": {
+ "content": "<|reserved_special_token_17|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128026": {
+ "content": "<|reserved_special_token_18|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128027": {
+ "content": "<|reserved_special_token_19|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128028": {
+ "content": "<|reserved_special_token_20|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128029": {
+ "content": "<|reserved_special_token_21|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128030": {
+ "content": "<|reserved_special_token_22|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128031": {
+ "content": "<|reserved_special_token_23|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128032": {
+ "content": "<|reserved_special_token_24|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128033": {
+ "content": "<|reserved_special_token_25|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128034": {
+ "content": "<|reserved_special_token_26|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128035": {
+ "content": "<|reserved_special_token_27|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128036": {
+ "content": "<|reserved_special_token_28|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128037": {
+ "content": "<|reserved_special_token_29|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128038": {
+ "content": "<|reserved_special_token_30|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128039": {
+ "content": "<|reserved_special_token_31|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128040": {
+ "content": "<|reserved_special_token_32|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128041": {
+ "content": "<|reserved_special_token_33|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128042": {
+ "content": "<|reserved_special_token_34|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128043": {
+ "content": "<|reserved_special_token_35|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128044": {
+ "content": "<|reserved_special_token_36|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128045": {
+ "content": "<|reserved_special_token_37|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128046": {
+ "content": "<|reserved_special_token_38|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128047": {
+ "content": "<|reserved_special_token_39|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128048": {
+ "content": "<|reserved_special_token_40|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128049": {
+ "content": "<|reserved_special_token_41|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128050": {
+ "content": "<|reserved_special_token_42|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128051": {
+ "content": "<|reserved_special_token_43|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128052": {
+ "content": "<|reserved_special_token_44|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128053": {
+ "content": "<|reserved_special_token_45|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128054": {
+ "content": "<|reserved_special_token_46|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128055": {
+ "content": "<|reserved_special_token_47|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128056": {
+ "content": "<|reserved_special_token_48|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128057": {
+ "content": "<|reserved_special_token_49|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128058": {
+ "content": "<|reserved_special_token_50|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128059": {
+ "content": "<|reserved_special_token_51|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128060": {
+ "content": "<|reserved_special_token_52|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128061": {
+ "content": "<|reserved_special_token_53|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128062": {
+ "content": "<|reserved_special_token_54|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128063": {
+ "content": "<|reserved_special_token_55|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128064": {
+ "content": "<|reserved_special_token_56|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128065": {
+ "content": "<|reserved_special_token_57|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128066": {
+ "content": "<|reserved_special_token_58|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128067": {
+ "content": "<|reserved_special_token_59|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128068": {
+ "content": "<|reserved_special_token_60|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128069": {
+ "content": "<|reserved_special_token_61|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128070": {
+ "content": "<|reserved_special_token_62|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128071": {
+ "content": "<|reserved_special_token_63|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128072": {
+ "content": "<|reserved_special_token_64|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128073": {
+ "content": "<|reserved_special_token_65|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128074": {
+ "content": "<|reserved_special_token_66|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128075": {
+ "content": "<|reserved_special_token_67|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128076": {
+ "content": "<|reserved_special_token_68|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128077": {
+ "content": "<|reserved_special_token_69|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128078": {
+ "content": "<|reserved_special_token_70|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128079": {
+ "content": "<|reserved_special_token_71|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128080": {
+ "content": "<|reserved_special_token_72|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128081": {
+ "content": "<|reserved_special_token_73|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128082": {
+ "content": "<|reserved_special_token_74|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128083": {
+ "content": "<|reserved_special_token_75|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128084": {
+ "content": "<|reserved_special_token_76|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128085": {
+ "content": "<|reserved_special_token_77|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128086": {
+ "content": "<|reserved_special_token_78|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128087": {
+ "content": "<|reserved_special_token_79|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128088": {
+ "content": "<|reserved_special_token_80|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128089": {
+ "content": "<|reserved_special_token_81|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128090": {
+ "content": "<|reserved_special_token_82|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128091": {
+ "content": "<|reserved_special_token_83|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128092": {
+ "content": "<|reserved_special_token_84|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128093": {
+ "content": "<|reserved_special_token_85|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128094": {
+ "content": "<|reserved_special_token_86|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128095": {
+ "content": "<|reserved_special_token_87|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128096": {
+ "content": "<|reserved_special_token_88|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128097": {
+ "content": "<|reserved_special_token_89|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128098": {
+ "content": "<|reserved_special_token_90|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128099": {
+ "content": "<|reserved_special_token_91|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128100": {
+ "content": "<|reserved_special_token_92|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128101": {
+ "content": "<|reserved_special_token_93|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128102": {
+ "content": "<|reserved_special_token_94|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128103": {
+ "content": "<|reserved_special_token_95|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128104": {
+ "content": "<|reserved_special_token_96|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128105": {
+ "content": "<|reserved_special_token_97|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128106": {
+ "content": "<|reserved_special_token_98|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128107": {
+ "content": "<|reserved_special_token_99|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128108": {
+ "content": "<|reserved_special_token_100|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128109": {
+ "content": "<|reserved_special_token_101|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128110": {
+ "content": "<|reserved_special_token_102|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128111": {
+ "content": "<|reserved_special_token_103|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128112": {
+ "content": "<|reserved_special_token_104|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128113": {
+ "content": "<|reserved_special_token_105|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128114": {
+ "content": "<|reserved_special_token_106|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128115": {
+ "content": "<|reserved_special_token_107|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128116": {
+ "content": "<|reserved_special_token_108|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128117": {
+ "content": "<|reserved_special_token_109|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128118": {
+ "content": "<|reserved_special_token_110|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128119": {
+ "content": "<|reserved_special_token_111|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128120": {
+ "content": "<|reserved_special_token_112|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128121": {
+ "content": "<|reserved_special_token_113|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128122": {
+ "content": "<|reserved_special_token_114|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128123": {
+ "content": "<|reserved_special_token_115|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128124": {
+ "content": "<|reserved_special_token_116|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128125": {
+ "content": "<|reserved_special_token_117|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128126": {
+ "content": "<|reserved_special_token_118|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128127": {
+ "content": "<|reserved_special_token_119|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128128": {
+ "content": "<|reserved_special_token_120|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128129": {
+ "content": "<|reserved_special_token_121|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128130": {
+ "content": "<|reserved_special_token_122|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128131": {
+ "content": "<|reserved_special_token_123|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128132": {
+ "content": "<|reserved_special_token_124|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128133": {
+ "content": "<|reserved_special_token_125|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128134": {
+ "content": "<|reserved_special_token_126|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128135": {
+ "content": "<|reserved_special_token_127|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128136": {
+ "content": "<|reserved_special_token_128|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128137": {
+ "content": "<|reserved_special_token_129|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128138": {
+ "content": "<|reserved_special_token_130|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128139": {
+ "content": "<|reserved_special_token_131|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128140": {
+ "content": "<|reserved_special_token_132|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128141": {
+ "content": "<|reserved_special_token_133|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128142": {
+ "content": "<|reserved_special_token_134|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128143": {
+ "content": "<|reserved_special_token_135|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128144": {
+ "content": "<|reserved_special_token_136|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128145": {
+ "content": "<|reserved_special_token_137|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128146": {
+ "content": "<|reserved_special_token_138|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128147": {
+ "content": "<|reserved_special_token_139|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128148": {
+ "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128149": {
+ "content": "<|reserved_special_token_141|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128150": {
+ "content": "<|reserved_special_token_142|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128151": {
+ "content": "<|reserved_special_token_143|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128152": {
+ "content": "<|reserved_special_token_144|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128153": {
+ "content": "<|reserved_special_token_145|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128154": {
+ "content": "<|reserved_special_token_146|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128155": {
+ "content": "<|reserved_special_token_147|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128156": {
+ "content": "<|reserved_special_token_148|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128157": {
+ "content": "<|reserved_special_token_149|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128158": {
+ "content": "<|reserved_special_token_150|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128159": {
+ "content": "<|reserved_special_token_151|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128160": {
+ "content": "<|reserved_special_token_152|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128161": {
+ "content": "<|reserved_special_token_153|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128162": {
+ "content": "<|reserved_special_token_154|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128163": {
+ "content": "<|reserved_special_token_155|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128164": {
+ "content": "<|reserved_special_token_156|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128165": {
+ "content": "<|reserved_special_token_157|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128166": {
+ "content": "<|reserved_special_token_158|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128167": {
+ "content": "<|reserved_special_token_159|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128168": {
+ "content": "<|reserved_special_token_160|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128169": {
+ "content": "<|reserved_special_token_161|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128170": {
+ "content": "<|reserved_special_token_162|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128171": {
+ "content": "<|reserved_special_token_163|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128172": {
+ "content": "<|reserved_special_token_164|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128173": {
+ "content": "<|reserved_special_token_165|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128174": {
+ "content": "<|reserved_special_token_166|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128175": {
+ "content": "<|reserved_special_token_167|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128176": {
+ "content": "<|reserved_special_token_168|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128177": {
+ "content": "<|reserved_special_token_169|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128178": {
+ "content": "<|reserved_special_token_170|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128179": {
+ "content": "<|reserved_special_token_171|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128180": {
+ "content": "<|reserved_special_token_172|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128181": {
+ "content": "<|reserved_special_token_173|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128182": {
+ "content": "<|reserved_special_token_174|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128183": {
+ "content": "<|reserved_special_token_175|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128184": {
+ "content": "<|reserved_special_token_176|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128185": {
+ "content": "<|reserved_special_token_177|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128186": {
+ "content": "<|reserved_special_token_178|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128187": {
+ "content": "<|reserved_special_token_179|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128188": {
+ "content": "<|reserved_special_token_180|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128189": {
+ "content": "<|reserved_special_token_181|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128190": {
+ "content": "<|reserved_special_token_182|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128191": {
+ "content": "<|reserved_special_token_183|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128192": {
+ "content": "<|reserved_special_token_184|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128193": {
+ "content": "<|reserved_special_token_185|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128194": {
+ "content": "<|reserved_special_token_186|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128195": {
+ "content": "<|reserved_special_token_187|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128196": {
+ "content": "<|reserved_special_token_188|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128197": {
+ "content": "<|reserved_special_token_189|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128198": {
+ "content": "<|reserved_special_token_190|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128199": {
+ "content": "<|reserved_special_token_191|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128200": {
+ "content": "<|reserved_special_token_192|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128201": {
+ "content": "<|reserved_special_token_193|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128202": {
+ "content": "<|reserved_special_token_194|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128203": {
+ "content": "<|reserved_special_token_195|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128204": {
+ "content": "<|reserved_special_token_196|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128205": {
+ "content": "<|reserved_special_token_197|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128206": {
+ "content": "<|reserved_special_token_198|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128207": {
+ "content": "<|reserved_special_token_199|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128208": {
+ "content": "<|reserved_special_token_200|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128209": {
+ "content": "<|reserved_special_token_201|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128210": {
+ "content": "<|reserved_special_token_202|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128211": {
+ "content": "<|reserved_special_token_203|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128212": {
+ "content": "<|reserved_special_token_204|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128213": {
+ "content": "<|reserved_special_token_205|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128214": {
+ "content": "<|reserved_special_token_206|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128215": {
+ "content": "<|reserved_special_token_207|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128216": {
+ "content": "<|reserved_special_token_208|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128217": {
+ "content": "<|reserved_special_token_209|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128218": {
+ "content": "<|reserved_special_token_210|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128219": {
+ "content": "<|reserved_special_token_211|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128220": {
+ "content": "<|reserved_special_token_212|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128221": {
+ "content": "<|reserved_special_token_213|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128222": {
+ "content": "<|reserved_special_token_214|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128223": {
+ "content": "<|reserved_special_token_215|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128224": {
+ "content": "<|reserved_special_token_216|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128225": {
+ "content": "<|reserved_special_token_217|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128226": {
+ "content": "<|reserved_special_token_218|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128227": {
+ "content": "<|reserved_special_token_219|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128228": {
+ "content": "<|reserved_special_token_220|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128229": {
+ "content": "<|reserved_special_token_221|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128230": {
+ "content": "<|reserved_special_token_222|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128231": {
+ "content": "<|reserved_special_token_223|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128232": {
+ "content": "<|reserved_special_token_224|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128233": {
+ "content": "<|reserved_special_token_225|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128234": {
+ "content": "<|reserved_special_token_226|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128235": {
+ "content": "<|reserved_special_token_227|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128236": {
+ "content": "<|reserved_special_token_228|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128237": {
+ "content": "<|reserved_special_token_229|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128238": {
+ "content": "<|reserved_special_token_230|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128239": {
+ "content": "<|reserved_special_token_231|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128240": {
+ "content": "<|reserved_special_token_232|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128241": {
+ "content": "<|reserved_special_token_233|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128242": {
+ "content": "<|reserved_special_token_234|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128243": {
+ "content": "<|reserved_special_token_235|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128244": {
+ "content": "<|reserved_special_token_236|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128245": {
+ "content": "<|reserved_special_token_237|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128246": {
+ "content": "<|reserved_special_token_238|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128247": {
+ "content": "<|reserved_special_token_239|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128248": {
+ "content": "<|reserved_special_token_240|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128249": {
+ "content": "<|reserved_special_token_241|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128250": {
+ "content": "<|reserved_special_token_242|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128251": {
+ "content": "<|reserved_special_token_243|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128252": {
+ "content": "<|reserved_special_token_244|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128253": {
+ "content": "<|reserved_special_token_245|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128254": {
+ "content": "<|reserved_special_token_246|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128255": {
+ "content": "<|reserved_special_token_247|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|begin_of_text|>",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|end_of_text|>",
+ "extra_special_tokens": {},
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 131072,
+ "pad_token": "<|end_of_text|>",
+ "tokenizer_class": "PreTrainedTokenizer"
+}
diff --git a/checkpoint-6345/trainer_state.json b/checkpoint-6345/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..f6b3d8eecb764dad678e9a60e3411302a3f63f2c
--- /dev/null
+++ b/checkpoint-6345/trainer_state.json
@@ -0,0 +1,44448 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 3.0,
+ "eval_steps": 500,
+ "global_step": 6345,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.00047281323877068556,
+ "grad_norm": 5.163570880889893,
+ "learning_rate": 5.0000000000000004e-08,
+ "loss": 1.4628,
+ "step": 1
+ },
+ {
+ "epoch": 0.0009456264775413711,
+ "grad_norm": 6.298020839691162,
+ "learning_rate": 1.0000000000000001e-07,
+ "loss": 1.5003,
+ "step": 2
+ },
+ {
+ "epoch": 0.0014184397163120568,
+ "grad_norm": 5.853623390197754,
+ "learning_rate": 1.5000000000000002e-07,
+ "loss": 1.4495,
+ "step": 3
+ },
+ {
+ "epoch": 0.0018912529550827422,
+ "grad_norm": 5.456025123596191,
+ "learning_rate": 2.0000000000000002e-07,
+ "loss": 1.3798,
+ "step": 4
+ },
+ {
+ "epoch": 0.002364066193853428,
+ "grad_norm": 5.757407188415527,
+ "learning_rate": 2.5000000000000004e-07,
+ "loss": 1.4515,
+ "step": 5
+ },
+ {
+ "epoch": 0.0028368794326241137,
+ "grad_norm": 5.872277736663818,
+ "learning_rate": 3.0000000000000004e-07,
+ "loss": 1.4424,
+ "step": 6
+ },
+ {
+ "epoch": 0.003309692671394799,
+ "grad_norm": 6.7816009521484375,
+ "learning_rate": 3.5000000000000004e-07,
+ "loss": 1.4004,
+ "step": 7
+ },
+ {
+ "epoch": 0.0037825059101654845,
+ "grad_norm": 6.229667663574219,
+ "learning_rate": 4.0000000000000003e-07,
+ "loss": 1.4494,
+ "step": 8
+ },
+ {
+ "epoch": 0.00425531914893617,
+ "grad_norm": 5.336202621459961,
+ "learning_rate": 4.5000000000000003e-07,
+ "loss": 1.3916,
+ "step": 9
+ },
+ {
+ "epoch": 0.004728132387706856,
+ "grad_norm": 5.589445114135742,
+ "learning_rate": 5.000000000000001e-07,
+ "loss": 1.2318,
+ "step": 10
+ },
+ {
+ "epoch": 0.005200945626477541,
+ "grad_norm": 5.720539569854736,
+ "learning_rate": 5.5e-07,
+ "loss": 1.4367,
+ "step": 11
+ },
+ {
+ "epoch": 0.005673758865248227,
+ "grad_norm": 5.913913726806641,
+ "learning_rate": 6.000000000000001e-07,
+ "loss": 1.342,
+ "step": 12
+ },
+ {
+ "epoch": 0.006146572104018913,
+ "grad_norm": 5.899744987487793,
+ "learning_rate": 6.5e-07,
+ "loss": 1.4307,
+ "step": 13
+ },
+ {
+ "epoch": 0.006619385342789598,
+ "grad_norm": 5.571037292480469,
+ "learning_rate": 7.000000000000001e-07,
+ "loss": 1.3372,
+ "step": 14
+ },
+ {
+ "epoch": 0.0070921985815602835,
+ "grad_norm": 5.480010509490967,
+ "learning_rate": 7.5e-07,
+ "loss": 1.3923,
+ "step": 15
+ },
+ {
+ "epoch": 0.007565011820330969,
+ "grad_norm": 5.254702091217041,
+ "learning_rate": 8.000000000000001e-07,
+ "loss": 1.2928,
+ "step": 16
+ },
+ {
+ "epoch": 0.008037825059101654,
+ "grad_norm": 6.090312480926514,
+ "learning_rate": 8.500000000000001e-07,
+ "loss": 1.4984,
+ "step": 17
+ },
+ {
+ "epoch": 0.00851063829787234,
+ "grad_norm": 5.689319610595703,
+ "learning_rate": 9.000000000000001e-07,
+ "loss": 1.4108,
+ "step": 18
+ },
+ {
+ "epoch": 0.008983451536643027,
+ "grad_norm": 5.386685848236084,
+ "learning_rate": 9.500000000000001e-07,
+ "loss": 1.425,
+ "step": 19
+ },
+ {
+ "epoch": 0.009456264775413711,
+ "grad_norm": 6.451584815979004,
+ "learning_rate": 1.0000000000000002e-06,
+ "loss": 1.5507,
+ "step": 20
+ },
+ {
+ "epoch": 0.009929078014184398,
+ "grad_norm": 5.37647008895874,
+ "learning_rate": 1.0500000000000001e-06,
+ "loss": 1.4109,
+ "step": 21
+ },
+ {
+ "epoch": 0.010401891252955082,
+ "grad_norm": 4.716553211212158,
+ "learning_rate": 1.1e-06,
+ "loss": 1.2028,
+ "step": 22
+ },
+ {
+ "epoch": 0.010874704491725768,
+ "grad_norm": 4.950989723205566,
+ "learning_rate": 1.1500000000000002e-06,
+ "loss": 1.3043,
+ "step": 23
+ },
+ {
+ "epoch": 0.011347517730496455,
+ "grad_norm": 4.688975811004639,
+ "learning_rate": 1.2000000000000002e-06,
+ "loss": 1.2708,
+ "step": 24
+ },
+ {
+ "epoch": 0.01182033096926714,
+ "grad_norm": 4.905868053436279,
+ "learning_rate": 1.25e-06,
+ "loss": 1.3268,
+ "step": 25
+ },
+ {
+ "epoch": 0.012293144208037825,
+ "grad_norm": 4.503395080566406,
+ "learning_rate": 1.3e-06,
+ "loss": 1.1799,
+ "step": 26
+ },
+ {
+ "epoch": 0.01276595744680851,
+ "grad_norm": 4.77382230758667,
+ "learning_rate": 1.3500000000000002e-06,
+ "loss": 1.3882,
+ "step": 27
+ },
+ {
+ "epoch": 0.013238770685579196,
+ "grad_norm": 4.734329700469971,
+ "learning_rate": 1.4000000000000001e-06,
+ "loss": 1.3476,
+ "step": 28
+ },
+ {
+ "epoch": 0.013711583924349883,
+ "grad_norm": 4.775066375732422,
+ "learning_rate": 1.45e-06,
+ "loss": 1.2429,
+ "step": 29
+ },
+ {
+ "epoch": 0.014184397163120567,
+ "grad_norm": 4.978334426879883,
+ "learning_rate": 1.5e-06,
+ "loss": 1.2119,
+ "step": 30
+ },
+ {
+ "epoch": 0.014657210401891253,
+ "grad_norm": 4.506785869598389,
+ "learning_rate": 1.5500000000000002e-06,
+ "loss": 1.3157,
+ "step": 31
+ },
+ {
+ "epoch": 0.015130023640661938,
+ "grad_norm": 4.007757186889648,
+ "learning_rate": 1.6000000000000001e-06,
+ "loss": 1.1451,
+ "step": 32
+ },
+ {
+ "epoch": 0.015602836879432624,
+ "grad_norm": 3.6621618270874023,
+ "learning_rate": 1.6500000000000003e-06,
+ "loss": 1.093,
+ "step": 33
+ },
+ {
+ "epoch": 0.01607565011820331,
+ "grad_norm": 3.8733766078948975,
+ "learning_rate": 1.7000000000000002e-06,
+ "loss": 1.2289,
+ "step": 34
+ },
+ {
+ "epoch": 0.016548463356973995,
+ "grad_norm": 4.3391900062561035,
+ "learning_rate": 1.75e-06,
+ "loss": 1.1453,
+ "step": 35
+ },
+ {
+ "epoch": 0.01702127659574468,
+ "grad_norm": 3.287623643875122,
+ "learning_rate": 1.8000000000000001e-06,
+ "loss": 1.0257,
+ "step": 36
+ },
+ {
+ "epoch": 0.017494089834515367,
+ "grad_norm": 3.591721773147583,
+ "learning_rate": 1.85e-06,
+ "loss": 0.9976,
+ "step": 37
+ },
+ {
+ "epoch": 0.017966903073286054,
+ "grad_norm": 4.028271675109863,
+ "learning_rate": 1.9000000000000002e-06,
+ "loss": 1.0773,
+ "step": 38
+ },
+ {
+ "epoch": 0.018439716312056736,
+ "grad_norm": 3.3543951511383057,
+ "learning_rate": 1.9500000000000004e-06,
+ "loss": 1.1677,
+ "step": 39
+ },
+ {
+ "epoch": 0.018912529550827423,
+ "grad_norm": 3.807624340057373,
+ "learning_rate": 2.0000000000000003e-06,
+ "loss": 1.1232,
+ "step": 40
+ },
+ {
+ "epoch": 0.01938534278959811,
+ "grad_norm": 4.242797374725342,
+ "learning_rate": 2.05e-06,
+ "loss": 1.1819,
+ "step": 41
+ },
+ {
+ "epoch": 0.019858156028368795,
+ "grad_norm": 3.4574992656707764,
+ "learning_rate": 2.1000000000000002e-06,
+ "loss": 0.9878,
+ "step": 42
+ },
+ {
+ "epoch": 0.02033096926713948,
+ "grad_norm": 3.906695604324341,
+ "learning_rate": 2.15e-06,
+ "loss": 1.0592,
+ "step": 43
+ },
+ {
+ "epoch": 0.020803782505910164,
+ "grad_norm": 3.7543163299560547,
+ "learning_rate": 2.2e-06,
+ "loss": 1.0309,
+ "step": 44
+ },
+ {
+ "epoch": 0.02127659574468085,
+ "grad_norm": 3.3777148723602295,
+ "learning_rate": 2.25e-06,
+ "loss": 1.0664,
+ "step": 45
+ },
+ {
+ "epoch": 0.021749408983451537,
+ "grad_norm": 3.6003634929656982,
+ "learning_rate": 2.3000000000000004e-06,
+ "loss": 1.0482,
+ "step": 46
+ },
+ {
+ "epoch": 0.022222222222222223,
+ "grad_norm": 3.3961377143859863,
+ "learning_rate": 2.35e-06,
+ "loss": 1.0252,
+ "step": 47
+ },
+ {
+ "epoch": 0.02269503546099291,
+ "grad_norm": 3.1601035594940186,
+ "learning_rate": 2.4000000000000003e-06,
+ "loss": 1.0435,
+ "step": 48
+ },
+ {
+ "epoch": 0.023167848699763592,
+ "grad_norm": 3.4192967414855957,
+ "learning_rate": 2.4500000000000003e-06,
+ "loss": 1.0935,
+ "step": 49
+ },
+ {
+ "epoch": 0.02364066193853428,
+ "grad_norm": 3.1225922107696533,
+ "learning_rate": 2.5e-06,
+ "loss": 0.8988,
+ "step": 50
+ },
+ {
+ "epoch": 0.024113475177304965,
+ "grad_norm": 3.1423380374908447,
+ "learning_rate": 2.55e-06,
+ "loss": 1.0159,
+ "step": 51
+ },
+ {
+ "epoch": 0.02458628841607565,
+ "grad_norm": 3.4782402515411377,
+ "learning_rate": 2.6e-06,
+ "loss": 1.0231,
+ "step": 52
+ },
+ {
+ "epoch": 0.025059101654846337,
+ "grad_norm": 3.8362693786621094,
+ "learning_rate": 2.6500000000000005e-06,
+ "loss": 1.0725,
+ "step": 53
+ },
+ {
+ "epoch": 0.02553191489361702,
+ "grad_norm": 3.033294916152954,
+ "learning_rate": 2.7000000000000004e-06,
+ "loss": 0.9377,
+ "step": 54
+ },
+ {
+ "epoch": 0.026004728132387706,
+ "grad_norm": 3.849741220474243,
+ "learning_rate": 2.7500000000000004e-06,
+ "loss": 1.0046,
+ "step": 55
+ },
+ {
+ "epoch": 0.026477541371158392,
+ "grad_norm": 3.141876220703125,
+ "learning_rate": 2.8000000000000003e-06,
+ "loss": 0.9226,
+ "step": 56
+ },
+ {
+ "epoch": 0.02695035460992908,
+ "grad_norm": 2.773594856262207,
+ "learning_rate": 2.85e-06,
+ "loss": 0.8662,
+ "step": 57
+ },
+ {
+ "epoch": 0.027423167848699765,
+ "grad_norm": 3.1460225582122803,
+ "learning_rate": 2.9e-06,
+ "loss": 0.9304,
+ "step": 58
+ },
+ {
+ "epoch": 0.027895981087470448,
+ "grad_norm": 3.293583631515503,
+ "learning_rate": 2.95e-06,
+ "loss": 1.0374,
+ "step": 59
+ },
+ {
+ "epoch": 0.028368794326241134,
+ "grad_norm": 3.8190863132476807,
+ "learning_rate": 3e-06,
+ "loss": 0.971,
+ "step": 60
+ },
+ {
+ "epoch": 0.02884160756501182,
+ "grad_norm": 3.4566776752471924,
+ "learning_rate": 3.05e-06,
+ "loss": 0.9631,
+ "step": 61
+ },
+ {
+ "epoch": 0.029314420803782507,
+ "grad_norm": 3.355741500854492,
+ "learning_rate": 3.1000000000000004e-06,
+ "loss": 1.0097,
+ "step": 62
+ },
+ {
+ "epoch": 0.029787234042553193,
+ "grad_norm": 3.29746675491333,
+ "learning_rate": 3.1500000000000003e-06,
+ "loss": 0.9459,
+ "step": 63
+ },
+ {
+ "epoch": 0.030260047281323876,
+ "grad_norm": 3.3122968673706055,
+ "learning_rate": 3.2000000000000003e-06,
+ "loss": 0.8594,
+ "step": 64
+ },
+ {
+ "epoch": 0.030732860520094562,
+ "grad_norm": 3.477701187133789,
+ "learning_rate": 3.2500000000000002e-06,
+ "loss": 0.9197,
+ "step": 65
+ },
+ {
+ "epoch": 0.031205673758865248,
+ "grad_norm": 3.3363406658172607,
+ "learning_rate": 3.3000000000000006e-06,
+ "loss": 0.9478,
+ "step": 66
+ },
+ {
+ "epoch": 0.03167848699763593,
+ "grad_norm": 4.143295764923096,
+ "learning_rate": 3.3500000000000005e-06,
+ "loss": 1.0534,
+ "step": 67
+ },
+ {
+ "epoch": 0.03215130023640662,
+ "grad_norm": 3.2363274097442627,
+ "learning_rate": 3.4000000000000005e-06,
+ "loss": 0.9454,
+ "step": 68
+ },
+ {
+ "epoch": 0.032624113475177303,
+ "grad_norm": 3.198746681213379,
+ "learning_rate": 3.45e-06,
+ "loss": 0.9388,
+ "step": 69
+ },
+ {
+ "epoch": 0.03309692671394799,
+ "grad_norm": 3.5751023292541504,
+ "learning_rate": 3.5e-06,
+ "loss": 0.9444,
+ "step": 70
+ },
+ {
+ "epoch": 0.033569739952718676,
+ "grad_norm": 3.1745729446411133,
+ "learning_rate": 3.5500000000000003e-06,
+ "loss": 0.8683,
+ "step": 71
+ },
+ {
+ "epoch": 0.03404255319148936,
+ "grad_norm": 3.3210883140563965,
+ "learning_rate": 3.6000000000000003e-06,
+ "loss": 0.8811,
+ "step": 72
+ },
+ {
+ "epoch": 0.03451536643026005,
+ "grad_norm": 3.2502429485321045,
+ "learning_rate": 3.65e-06,
+ "loss": 1.0012,
+ "step": 73
+ },
+ {
+ "epoch": 0.034988179669030735,
+ "grad_norm": 3.44598126411438,
+ "learning_rate": 3.7e-06,
+ "loss": 0.9217,
+ "step": 74
+ },
+ {
+ "epoch": 0.03546099290780142,
+ "grad_norm": 3.439117431640625,
+ "learning_rate": 3.7500000000000005e-06,
+ "loss": 0.8976,
+ "step": 75
+ },
+ {
+ "epoch": 0.03593380614657211,
+ "grad_norm": 3.523627758026123,
+ "learning_rate": 3.8000000000000005e-06,
+ "loss": 0.8996,
+ "step": 76
+ },
+ {
+ "epoch": 0.03640661938534279,
+ "grad_norm": 3.3716015815734863,
+ "learning_rate": 3.85e-06,
+ "loss": 0.9061,
+ "step": 77
+ },
+ {
+ "epoch": 0.03687943262411347,
+ "grad_norm": 3.33518385887146,
+ "learning_rate": 3.900000000000001e-06,
+ "loss": 0.9371,
+ "step": 78
+ },
+ {
+ "epoch": 0.03735224586288416,
+ "grad_norm": 3.833829879760742,
+ "learning_rate": 3.95e-06,
+ "loss": 0.9669,
+ "step": 79
+ },
+ {
+ "epoch": 0.037825059101654845,
+ "grad_norm": 3.260446786880493,
+ "learning_rate": 4.000000000000001e-06,
+ "loss": 0.9449,
+ "step": 80
+ },
+ {
+ "epoch": 0.03829787234042553,
+ "grad_norm": 3.532451629638672,
+ "learning_rate": 4.05e-06,
+ "loss": 0.897,
+ "step": 81
+ },
+ {
+ "epoch": 0.03877068557919622,
+ "grad_norm": 3.1156492233276367,
+ "learning_rate": 4.1e-06,
+ "loss": 0.8463,
+ "step": 82
+ },
+ {
+ "epoch": 0.039243498817966904,
+ "grad_norm": 2.8801751136779785,
+ "learning_rate": 4.15e-06,
+ "loss": 0.8616,
+ "step": 83
+ },
+ {
+ "epoch": 0.03971631205673759,
+ "grad_norm": 3.072476863861084,
+ "learning_rate": 4.2000000000000004e-06,
+ "loss": 0.8387,
+ "step": 84
+ },
+ {
+ "epoch": 0.04018912529550828,
+ "grad_norm": 2.9601376056671143,
+ "learning_rate": 4.25e-06,
+ "loss": 0.8538,
+ "step": 85
+ },
+ {
+ "epoch": 0.04066193853427896,
+ "grad_norm": 3.521664619445801,
+ "learning_rate": 4.3e-06,
+ "loss": 0.8894,
+ "step": 86
+ },
+ {
+ "epoch": 0.04113475177304964,
+ "grad_norm": 3.2670981884002686,
+ "learning_rate": 4.350000000000001e-06,
+ "loss": 0.8387,
+ "step": 87
+ },
+ {
+ "epoch": 0.04160756501182033,
+ "grad_norm": 3.422089099884033,
+ "learning_rate": 4.4e-06,
+ "loss": 0.7728,
+ "step": 88
+ },
+ {
+ "epoch": 0.042080378250591015,
+ "grad_norm": 3.414034128189087,
+ "learning_rate": 4.450000000000001e-06,
+ "loss": 0.7968,
+ "step": 89
+ },
+ {
+ "epoch": 0.0425531914893617,
+ "grad_norm": 4.234285354614258,
+ "learning_rate": 4.5e-06,
+ "loss": 0.8502,
+ "step": 90
+ },
+ {
+ "epoch": 0.04302600472813239,
+ "grad_norm": 3.1446919441223145,
+ "learning_rate": 4.5500000000000005e-06,
+ "loss": 0.8236,
+ "step": 91
+ },
+ {
+ "epoch": 0.043498817966903074,
+ "grad_norm": 3.683443307876587,
+ "learning_rate": 4.600000000000001e-06,
+ "loss": 0.9792,
+ "step": 92
+ },
+ {
+ "epoch": 0.04397163120567376,
+ "grad_norm": 3.664219617843628,
+ "learning_rate": 4.65e-06,
+ "loss": 0.8743,
+ "step": 93
+ },
+ {
+ "epoch": 0.044444444444444446,
+ "grad_norm": 3.369479179382324,
+ "learning_rate": 4.7e-06,
+ "loss": 0.8741,
+ "step": 94
+ },
+ {
+ "epoch": 0.04491725768321513,
+ "grad_norm": 3.694949150085449,
+ "learning_rate": 4.75e-06,
+ "loss": 0.7574,
+ "step": 95
+ },
+ {
+ "epoch": 0.04539007092198582,
+ "grad_norm": 3.5144498348236084,
+ "learning_rate": 4.800000000000001e-06,
+ "loss": 0.9934,
+ "step": 96
+ },
+ {
+ "epoch": 0.0458628841607565,
+ "grad_norm": 3.164451837539673,
+ "learning_rate": 4.85e-06,
+ "loss": 0.7463,
+ "step": 97
+ },
+ {
+ "epoch": 0.046335697399527184,
+ "grad_norm": 3.222785472869873,
+ "learning_rate": 4.9000000000000005e-06,
+ "loss": 0.7698,
+ "step": 98
+ },
+ {
+ "epoch": 0.04680851063829787,
+ "grad_norm": 2.9129555225372314,
+ "learning_rate": 4.95e-06,
+ "loss": 0.7856,
+ "step": 99
+ },
+ {
+ "epoch": 0.04728132387706856,
+ "grad_norm": 3.5061235427856445,
+ "learning_rate": 5e-06,
+ "loss": 0.8588,
+ "step": 100
+ },
+ {
+ "epoch": 0.04775413711583924,
+ "grad_norm": 3.2805044651031494,
+ "learning_rate": 4.999999922167982e-06,
+ "loss": 0.7643,
+ "step": 101
+ },
+ {
+ "epoch": 0.04822695035460993,
+ "grad_norm": 3.5461678504943848,
+ "learning_rate": 4.999999688671929e-06,
+ "loss": 0.8253,
+ "step": 102
+ },
+ {
+ "epoch": 0.048699763593380616,
+ "grad_norm": 3.2238264083862305,
+ "learning_rate": 4.99999929951186e-06,
+ "loss": 0.7622,
+ "step": 103
+ },
+ {
+ "epoch": 0.0491725768321513,
+ "grad_norm": 3.818955898284912,
+ "learning_rate": 4.999998754687795e-06,
+ "loss": 0.8471,
+ "step": 104
+ },
+ {
+ "epoch": 0.04964539007092199,
+ "grad_norm": 3.1252424716949463,
+ "learning_rate": 4.99999805419977e-06,
+ "loss": 0.8409,
+ "step": 105
+ },
+ {
+ "epoch": 0.050118203309692674,
+ "grad_norm": 3.604283571243286,
+ "learning_rate": 4.999997198047828e-06,
+ "loss": 0.9027,
+ "step": 106
+ },
+ {
+ "epoch": 0.050591016548463354,
+ "grad_norm": 3.6752424240112305,
+ "learning_rate": 4.999996186232023e-06,
+ "loss": 0.9336,
+ "step": 107
+ },
+ {
+ "epoch": 0.05106382978723404,
+ "grad_norm": 3.517557144165039,
+ "learning_rate": 4.9999950187524184e-06,
+ "loss": 0.8351,
+ "step": 108
+ },
+ {
+ "epoch": 0.051536643026004726,
+ "grad_norm": 3.427285671234131,
+ "learning_rate": 4.999993695609085e-06,
+ "loss": 0.8457,
+ "step": 109
+ },
+ {
+ "epoch": 0.05200945626477541,
+ "grad_norm": 3.2792510986328125,
+ "learning_rate": 4.999992216802107e-06,
+ "loss": 0.8391,
+ "step": 110
+ },
+ {
+ "epoch": 0.0524822695035461,
+ "grad_norm": 3.581094741821289,
+ "learning_rate": 4.999990582331576e-06,
+ "loss": 0.7533,
+ "step": 111
+ },
+ {
+ "epoch": 0.052955082742316785,
+ "grad_norm": 3.1667377948760986,
+ "learning_rate": 4.999988792197593e-06,
+ "loss": 0.9562,
+ "step": 112
+ },
+ {
+ "epoch": 0.05342789598108747,
+ "grad_norm": 3.3609890937805176,
+ "learning_rate": 4.99998684640027e-06,
+ "loss": 0.8181,
+ "step": 113
+ },
+ {
+ "epoch": 0.05390070921985816,
+ "grad_norm": 3.260627269744873,
+ "learning_rate": 4.999984744939729e-06,
+ "loss": 0.8012,
+ "step": 114
+ },
+ {
+ "epoch": 0.054373522458628844,
+ "grad_norm": 3.4535653591156006,
+ "learning_rate": 4.9999824878160985e-06,
+ "loss": 0.919,
+ "step": 115
+ },
+ {
+ "epoch": 0.05484633569739953,
+ "grad_norm": 3.4880740642547607,
+ "learning_rate": 4.999980075029522e-06,
+ "loss": 0.8114,
+ "step": 116
+ },
+ {
+ "epoch": 0.05531914893617021,
+ "grad_norm": 3.2546932697296143,
+ "learning_rate": 4.999977506580147e-06,
+ "loss": 0.8274,
+ "step": 117
+ },
+ {
+ "epoch": 0.055791962174940896,
+ "grad_norm": 3.2762744426727295,
+ "learning_rate": 4.999974782468136e-06,
+ "loss": 0.9018,
+ "step": 118
+ },
+ {
+ "epoch": 0.05626477541371158,
+ "grad_norm": 3.42825984954834,
+ "learning_rate": 4.999971902693657e-06,
+ "loss": 0.8262,
+ "step": 119
+ },
+ {
+ "epoch": 0.05673758865248227,
+ "grad_norm": 3.082496404647827,
+ "learning_rate": 4.99996886725689e-06,
+ "loss": 0.8181,
+ "step": 120
+ },
+ {
+ "epoch": 0.057210401891252954,
+ "grad_norm": 3.322869300842285,
+ "learning_rate": 4.9999656761580225e-06,
+ "loss": 0.8382,
+ "step": 121
+ },
+ {
+ "epoch": 0.05768321513002364,
+ "grad_norm": 3.6365339756011963,
+ "learning_rate": 4.9999623293972555e-06,
+ "loss": 0.7489,
+ "step": 122
+ },
+ {
+ "epoch": 0.05815602836879433,
+ "grad_norm": 3.376352548599243,
+ "learning_rate": 4.999958826974796e-06,
+ "loss": 0.9012,
+ "step": 123
+ },
+ {
+ "epoch": 0.05862884160756501,
+ "grad_norm": 3.49088716506958,
+ "learning_rate": 4.999955168890862e-06,
+ "loss": 0.8999,
+ "step": 124
+ },
+ {
+ "epoch": 0.0591016548463357,
+ "grad_norm": 3.3265068531036377,
+ "learning_rate": 4.999951355145682e-06,
+ "loss": 0.8161,
+ "step": 125
+ },
+ {
+ "epoch": 0.059574468085106386,
+ "grad_norm": 3.697282314300537,
+ "learning_rate": 4.999947385739493e-06,
+ "loss": 0.9623,
+ "step": 126
+ },
+ {
+ "epoch": 0.06004728132387707,
+ "grad_norm": 2.7901928424835205,
+ "learning_rate": 4.999943260672542e-06,
+ "loss": 0.7371,
+ "step": 127
+ },
+ {
+ "epoch": 0.06052009456264775,
+ "grad_norm": 3.110319137573242,
+ "learning_rate": 4.999938979945086e-06,
+ "loss": 0.715,
+ "step": 128
+ },
+ {
+ "epoch": 0.06099290780141844,
+ "grad_norm": 3.2211520671844482,
+ "learning_rate": 4.999934543557392e-06,
+ "loss": 0.8888,
+ "step": 129
+ },
+ {
+ "epoch": 0.061465721040189124,
+ "grad_norm": 3.2466187477111816,
+ "learning_rate": 4.999929951509735e-06,
+ "loss": 0.9389,
+ "step": 130
+ },
+ {
+ "epoch": 0.06193853427895981,
+ "grad_norm": 3.3574399948120117,
+ "learning_rate": 4.999925203802403e-06,
+ "loss": 0.8263,
+ "step": 131
+ },
+ {
+ "epoch": 0.062411347517730496,
+ "grad_norm": 3.275601625442505,
+ "learning_rate": 4.99992030043569e-06,
+ "loss": 0.8338,
+ "step": 132
+ },
+ {
+ "epoch": 0.06288416075650118,
+ "grad_norm": 3.6011312007904053,
+ "learning_rate": 4.999915241409902e-06,
+ "loss": 0.8351,
+ "step": 133
+ },
+ {
+ "epoch": 0.06335697399527186,
+ "grad_norm": 2.969011068344116,
+ "learning_rate": 4.999910026725352e-06,
+ "loss": 0.79,
+ "step": 134
+ },
+ {
+ "epoch": 0.06382978723404255,
+ "grad_norm": 3.690784454345703,
+ "learning_rate": 4.999904656382369e-06,
+ "loss": 0.8209,
+ "step": 135
+ },
+ {
+ "epoch": 0.06430260047281323,
+ "grad_norm": 3.3363115787506104,
+ "learning_rate": 4.999899130381283e-06,
+ "loss": 0.858,
+ "step": 136
+ },
+ {
+ "epoch": 0.06477541371158392,
+ "grad_norm": 3.206881523132324,
+ "learning_rate": 4.9998934487224405e-06,
+ "loss": 0.834,
+ "step": 137
+ },
+ {
+ "epoch": 0.06524822695035461,
+ "grad_norm": 2.773146152496338,
+ "learning_rate": 4.999887611406195e-06,
+ "loss": 0.7576,
+ "step": 138
+ },
+ {
+ "epoch": 0.0657210401891253,
+ "grad_norm": 3.307725667953491,
+ "learning_rate": 4.999881618432908e-06,
+ "loss": 0.7487,
+ "step": 139
+ },
+ {
+ "epoch": 0.06619385342789598,
+ "grad_norm": 4.273657321929932,
+ "learning_rate": 4.999875469802956e-06,
+ "loss": 0.8176,
+ "step": 140
+ },
+ {
+ "epoch": 0.06666666666666667,
+ "grad_norm": 3.0898005962371826,
+ "learning_rate": 4.999869165516719e-06,
+ "loss": 0.7578,
+ "step": 141
+ },
+ {
+ "epoch": 0.06713947990543735,
+ "grad_norm": 3.25150990486145,
+ "learning_rate": 4.9998627055745915e-06,
+ "loss": 0.7873,
+ "step": 142
+ },
+ {
+ "epoch": 0.06761229314420804,
+ "grad_norm": 2.9705755710601807,
+ "learning_rate": 4.999856089976974e-06,
+ "loss": 0.6473,
+ "step": 143
+ },
+ {
+ "epoch": 0.06808510638297872,
+ "grad_norm": 3.5658507347106934,
+ "learning_rate": 4.9998493187242804e-06,
+ "loss": 0.855,
+ "step": 144
+ },
+ {
+ "epoch": 0.06855791962174941,
+ "grad_norm": 3.3994076251983643,
+ "learning_rate": 4.99984239181693e-06,
+ "loss": 0.7926,
+ "step": 145
+ },
+ {
+ "epoch": 0.0690307328605201,
+ "grad_norm": 2.8266260623931885,
+ "learning_rate": 4.999835309255357e-06,
+ "loss": 0.7564,
+ "step": 146
+ },
+ {
+ "epoch": 0.06950354609929078,
+ "grad_norm": 3.1143875122070312,
+ "learning_rate": 4.999828071039999e-06,
+ "loss": 0.8398,
+ "step": 147
+ },
+ {
+ "epoch": 0.06997635933806147,
+ "grad_norm": 2.9364278316497803,
+ "learning_rate": 4.99982067717131e-06,
+ "loss": 0.7381,
+ "step": 148
+ },
+ {
+ "epoch": 0.07044917257683216,
+ "grad_norm": 3.4155616760253906,
+ "learning_rate": 4.999813127649748e-06,
+ "loss": 0.7933,
+ "step": 149
+ },
+ {
+ "epoch": 0.07092198581560284,
+ "grad_norm": 4.371236324310303,
+ "learning_rate": 4.999805422475784e-06,
+ "loss": 0.8292,
+ "step": 150
+ },
+ {
+ "epoch": 0.07139479905437353,
+ "grad_norm": 3.3967185020446777,
+ "learning_rate": 4.999797561649897e-06,
+ "loss": 0.8712,
+ "step": 151
+ },
+ {
+ "epoch": 0.07186761229314421,
+ "grad_norm": 3.343303680419922,
+ "learning_rate": 4.999789545172578e-06,
+ "loss": 0.8177,
+ "step": 152
+ },
+ {
+ "epoch": 0.07234042553191489,
+ "grad_norm": 3.040235757827759,
+ "learning_rate": 4.999781373044325e-06,
+ "loss": 0.7379,
+ "step": 153
+ },
+ {
+ "epoch": 0.07281323877068557,
+ "grad_norm": 3.4069204330444336,
+ "learning_rate": 4.999773045265647e-06,
+ "loss": 0.7939,
+ "step": 154
+ },
+ {
+ "epoch": 0.07328605200945626,
+ "grad_norm": 3.1939475536346436,
+ "learning_rate": 4.999764561837063e-06,
+ "loss": 0.8037,
+ "step": 155
+ },
+ {
+ "epoch": 0.07375886524822695,
+ "grad_norm": 4.452004909515381,
+ "learning_rate": 4.999755922759101e-06,
+ "loss": 0.8421,
+ "step": 156
+ },
+ {
+ "epoch": 0.07423167848699763,
+ "grad_norm": 3.2031240463256836,
+ "learning_rate": 4.999747128032298e-06,
+ "loss": 0.794,
+ "step": 157
+ },
+ {
+ "epoch": 0.07470449172576832,
+ "grad_norm": 3.175920009613037,
+ "learning_rate": 4.999738177657203e-06,
+ "loss": 0.759,
+ "step": 158
+ },
+ {
+ "epoch": 0.075177304964539,
+ "grad_norm": 3.7679688930511475,
+ "learning_rate": 4.9997290716343725e-06,
+ "loss": 0.8174,
+ "step": 159
+ },
+ {
+ "epoch": 0.07565011820330969,
+ "grad_norm": 3.7020037174224854,
+ "learning_rate": 4.999719809964373e-06,
+ "loss": 0.7116,
+ "step": 160
+ },
+ {
+ "epoch": 0.07612293144208038,
+ "grad_norm": 4.357471942901611,
+ "learning_rate": 4.999710392647783e-06,
+ "loss": 0.7649,
+ "step": 161
+ },
+ {
+ "epoch": 0.07659574468085106,
+ "grad_norm": 3.3439087867736816,
+ "learning_rate": 4.999700819685187e-06,
+ "loss": 0.7907,
+ "step": 162
+ },
+ {
+ "epoch": 0.07706855791962175,
+ "grad_norm": 3.210815191268921,
+ "learning_rate": 4.999691091077182e-06,
+ "loss": 0.8446,
+ "step": 163
+ },
+ {
+ "epoch": 0.07754137115839244,
+ "grad_norm": 3.1029553413391113,
+ "learning_rate": 4.9996812068243735e-06,
+ "loss": 0.7232,
+ "step": 164
+ },
+ {
+ "epoch": 0.07801418439716312,
+ "grad_norm": 2.9389400482177734,
+ "learning_rate": 4.999671166927378e-06,
+ "loss": 0.7413,
+ "step": 165
+ },
+ {
+ "epoch": 0.07848699763593381,
+ "grad_norm": 3.7062697410583496,
+ "learning_rate": 4.9996609713868185e-06,
+ "loss": 0.8773,
+ "step": 166
+ },
+ {
+ "epoch": 0.0789598108747045,
+ "grad_norm": 3.2768924236297607,
+ "learning_rate": 4.999650620203332e-06,
+ "loss": 0.8046,
+ "step": 167
+ },
+ {
+ "epoch": 0.07943262411347518,
+ "grad_norm": 3.380373001098633,
+ "learning_rate": 4.999640113377561e-06,
+ "loss": 0.7529,
+ "step": 168
+ },
+ {
+ "epoch": 0.07990543735224587,
+ "grad_norm": 3.520022392272949,
+ "learning_rate": 4.999629450910162e-06,
+ "loss": 0.7352,
+ "step": 169
+ },
+ {
+ "epoch": 0.08037825059101655,
+ "grad_norm": 3.43269419670105,
+ "learning_rate": 4.999618632801796e-06,
+ "loss": 0.9371,
+ "step": 170
+ },
+ {
+ "epoch": 0.08085106382978724,
+ "grad_norm": 3.555877923965454,
+ "learning_rate": 4.99960765905314e-06,
+ "loss": 0.8276,
+ "step": 171
+ },
+ {
+ "epoch": 0.08132387706855793,
+ "grad_norm": 3.597050189971924,
+ "learning_rate": 4.999596529664874e-06,
+ "loss": 0.8164,
+ "step": 172
+ },
+ {
+ "epoch": 0.0817966903073286,
+ "grad_norm": 3.2002956867218018,
+ "learning_rate": 4.999585244637693e-06,
+ "loss": 0.7824,
+ "step": 173
+ },
+ {
+ "epoch": 0.08226950354609928,
+ "grad_norm": 3.527275562286377,
+ "learning_rate": 4.999573803972299e-06,
+ "loss": 0.8033,
+ "step": 174
+ },
+ {
+ "epoch": 0.08274231678486997,
+ "grad_norm": 3.5184452533721924,
+ "learning_rate": 4.999562207669405e-06,
+ "loss": 0.724,
+ "step": 175
+ },
+ {
+ "epoch": 0.08321513002364066,
+ "grad_norm": 3.6635067462921143,
+ "learning_rate": 4.999550455729732e-06,
+ "loss": 0.819,
+ "step": 176
+ },
+ {
+ "epoch": 0.08368794326241134,
+ "grad_norm": 3.192399740219116,
+ "learning_rate": 4.999538548154012e-06,
+ "loss": 0.7999,
+ "step": 177
+ },
+ {
+ "epoch": 0.08416075650118203,
+ "grad_norm": 3.0946953296661377,
+ "learning_rate": 4.999526484942988e-06,
+ "loss": 0.7367,
+ "step": 178
+ },
+ {
+ "epoch": 0.08463356973995272,
+ "grad_norm": 2.847198009490967,
+ "learning_rate": 4.99951426609741e-06,
+ "loss": 0.7536,
+ "step": 179
+ },
+ {
+ "epoch": 0.0851063829787234,
+ "grad_norm": 2.7674827575683594,
+ "learning_rate": 4.999501891618037e-06,
+ "loss": 0.701,
+ "step": 180
+ },
+ {
+ "epoch": 0.08557919621749409,
+ "grad_norm": 3.357933521270752,
+ "learning_rate": 4.999489361505643e-06,
+ "loss": 0.8331,
+ "step": 181
+ },
+ {
+ "epoch": 0.08605200945626477,
+ "grad_norm": 3.1464426517486572,
+ "learning_rate": 4.999476675761004e-06,
+ "loss": 0.7931,
+ "step": 182
+ },
+ {
+ "epoch": 0.08652482269503546,
+ "grad_norm": 3.310697078704834,
+ "learning_rate": 4.999463834384915e-06,
+ "loss": 0.753,
+ "step": 183
+ },
+ {
+ "epoch": 0.08699763593380615,
+ "grad_norm": 2.9794881343841553,
+ "learning_rate": 4.999450837378171e-06,
+ "loss": 0.7091,
+ "step": 184
+ },
+ {
+ "epoch": 0.08747044917257683,
+ "grad_norm": 3.0776889324188232,
+ "learning_rate": 4.999437684741584e-06,
+ "loss": 0.7226,
+ "step": 185
+ },
+ {
+ "epoch": 0.08794326241134752,
+ "grad_norm": 3.6657519340515137,
+ "learning_rate": 4.999424376475972e-06,
+ "loss": 0.845,
+ "step": 186
+ },
+ {
+ "epoch": 0.0884160756501182,
+ "grad_norm": 3.872718572616577,
+ "learning_rate": 4.999410912582164e-06,
+ "loss": 0.812,
+ "step": 187
+ },
+ {
+ "epoch": 0.08888888888888889,
+ "grad_norm": 2.9184508323669434,
+ "learning_rate": 4.9993972930609976e-06,
+ "loss": 0.6823,
+ "step": 188
+ },
+ {
+ "epoch": 0.08936170212765958,
+ "grad_norm": 3.5567142963409424,
+ "learning_rate": 4.999383517913321e-06,
+ "loss": 0.7614,
+ "step": 189
+ },
+ {
+ "epoch": 0.08983451536643026,
+ "grad_norm": 3.3688533306121826,
+ "learning_rate": 4.999369587139992e-06,
+ "loss": 0.858,
+ "step": 190
+ },
+ {
+ "epoch": 0.09030732860520095,
+ "grad_norm": 2.893223524093628,
+ "learning_rate": 4.99935550074188e-06,
+ "loss": 0.6761,
+ "step": 191
+ },
+ {
+ "epoch": 0.09078014184397164,
+ "grad_norm": 3.400225877761841,
+ "learning_rate": 4.999341258719859e-06,
+ "loss": 0.7531,
+ "step": 192
+ },
+ {
+ "epoch": 0.09125295508274232,
+ "grad_norm": 3.6167714595794678,
+ "learning_rate": 4.999326861074817e-06,
+ "loss": 0.8164,
+ "step": 193
+ },
+ {
+ "epoch": 0.091725768321513,
+ "grad_norm": 4.325016498565674,
+ "learning_rate": 4.9993123078076506e-06,
+ "loss": 0.7069,
+ "step": 194
+ },
+ {
+ "epoch": 0.09219858156028368,
+ "grad_norm": 3.195317029953003,
+ "learning_rate": 4.999297598919266e-06,
+ "loss": 0.726,
+ "step": 195
+ },
+ {
+ "epoch": 0.09267139479905437,
+ "grad_norm": 3.146530866622925,
+ "learning_rate": 4.999282734410579e-06,
+ "loss": 0.7888,
+ "step": 196
+ },
+ {
+ "epoch": 0.09314420803782505,
+ "grad_norm": 3.5166752338409424,
+ "learning_rate": 4.999267714282515e-06,
+ "loss": 0.8473,
+ "step": 197
+ },
+ {
+ "epoch": 0.09361702127659574,
+ "grad_norm": 3.3140196800231934,
+ "learning_rate": 4.99925253853601e-06,
+ "loss": 0.7233,
+ "step": 198
+ },
+ {
+ "epoch": 0.09408983451536643,
+ "grad_norm": 3.0318164825439453,
+ "learning_rate": 4.999237207172008e-06,
+ "loss": 0.7543,
+ "step": 199
+ },
+ {
+ "epoch": 0.09456264775413711,
+ "grad_norm": 3.662214756011963,
+ "learning_rate": 4.999221720191464e-06,
+ "loss": 0.7783,
+ "step": 200
+ },
+ {
+ "epoch": 0.0950354609929078,
+ "grad_norm": 3.452078104019165,
+ "learning_rate": 4.9992060775953425e-06,
+ "loss": 0.7868,
+ "step": 201
+ },
+ {
+ "epoch": 0.09550827423167849,
+ "grad_norm": 3.4051287174224854,
+ "learning_rate": 4.999190279384617e-06,
+ "loss": 0.7849,
+ "step": 202
+ },
+ {
+ "epoch": 0.09598108747044917,
+ "grad_norm": 3.1377196311950684,
+ "learning_rate": 4.999174325560271e-06,
+ "loss": 0.8364,
+ "step": 203
+ },
+ {
+ "epoch": 0.09645390070921986,
+ "grad_norm": 3.129473924636841,
+ "learning_rate": 4.999158216123299e-06,
+ "loss": 0.7458,
+ "step": 204
+ },
+ {
+ "epoch": 0.09692671394799054,
+ "grad_norm": 3.169548749923706,
+ "learning_rate": 4.999141951074703e-06,
+ "loss": 0.7256,
+ "step": 205
+ },
+ {
+ "epoch": 0.09739952718676123,
+ "grad_norm": 3.186009168624878,
+ "learning_rate": 4.999125530415495e-06,
+ "loss": 0.783,
+ "step": 206
+ },
+ {
+ "epoch": 0.09787234042553192,
+ "grad_norm": 3.0995123386383057,
+ "learning_rate": 4.9991089541467e-06,
+ "loss": 0.7519,
+ "step": 207
+ },
+ {
+ "epoch": 0.0983451536643026,
+ "grad_norm": 3.1854088306427,
+ "learning_rate": 4.999092222269348e-06,
+ "loss": 0.7444,
+ "step": 208
+ },
+ {
+ "epoch": 0.09881796690307329,
+ "grad_norm": 3.1512246131896973,
+ "learning_rate": 4.999075334784482e-06,
+ "loss": 0.7882,
+ "step": 209
+ },
+ {
+ "epoch": 0.09929078014184398,
+ "grad_norm": 3.6199698448181152,
+ "learning_rate": 4.999058291693153e-06,
+ "loss": 0.8048,
+ "step": 210
+ },
+ {
+ "epoch": 0.09976359338061466,
+ "grad_norm": 2.956907272338867,
+ "learning_rate": 4.999041092996422e-06,
+ "loss": 0.7663,
+ "step": 211
+ },
+ {
+ "epoch": 0.10023640661938535,
+ "grad_norm": 3.3493971824645996,
+ "learning_rate": 4.99902373869536e-06,
+ "loss": 0.7639,
+ "step": 212
+ },
+ {
+ "epoch": 0.10070921985815603,
+ "grad_norm": 3.144812822341919,
+ "learning_rate": 4.9990062287910475e-06,
+ "loss": 0.7953,
+ "step": 213
+ },
+ {
+ "epoch": 0.10118203309692671,
+ "grad_norm": 3.5986971855163574,
+ "learning_rate": 4.998988563284576e-06,
+ "loss": 0.8297,
+ "step": 214
+ },
+ {
+ "epoch": 0.1016548463356974,
+ "grad_norm": 3.447584867477417,
+ "learning_rate": 4.998970742177044e-06,
+ "loss": 0.808,
+ "step": 215
+ },
+ {
+ "epoch": 0.10212765957446808,
+ "grad_norm": 3.791353940963745,
+ "learning_rate": 4.998952765469562e-06,
+ "loss": 0.8005,
+ "step": 216
+ },
+ {
+ "epoch": 0.10260047281323877,
+ "grad_norm": 3.4490807056427,
+ "learning_rate": 4.998934633163247e-06,
+ "loss": 0.8135,
+ "step": 217
+ },
+ {
+ "epoch": 0.10307328605200945,
+ "grad_norm": 3.1053314208984375,
+ "learning_rate": 4.998916345259232e-06,
+ "loss": 0.7888,
+ "step": 218
+ },
+ {
+ "epoch": 0.10354609929078014,
+ "grad_norm": 3.407862663269043,
+ "learning_rate": 4.9988979017586514e-06,
+ "loss": 0.7099,
+ "step": 219
+ },
+ {
+ "epoch": 0.10401891252955082,
+ "grad_norm": 3.116656541824341,
+ "learning_rate": 4.998879302662658e-06,
+ "loss": 0.8344,
+ "step": 220
+ },
+ {
+ "epoch": 0.10449172576832151,
+ "grad_norm": 3.339264154434204,
+ "learning_rate": 4.998860547972406e-06,
+ "loss": 0.8496,
+ "step": 221
+ },
+ {
+ "epoch": 0.1049645390070922,
+ "grad_norm": 3.251892566680908,
+ "learning_rate": 4.998841637689066e-06,
+ "loss": 0.7455,
+ "step": 222
+ },
+ {
+ "epoch": 0.10543735224586288,
+ "grad_norm": 4.098135471343994,
+ "learning_rate": 4.998822571813814e-06,
+ "loss": 0.7772,
+ "step": 223
+ },
+ {
+ "epoch": 0.10591016548463357,
+ "grad_norm": 3.9871134757995605,
+ "learning_rate": 4.998803350347837e-06,
+ "loss": 0.8261,
+ "step": 224
+ },
+ {
+ "epoch": 0.10638297872340426,
+ "grad_norm": 3.2822303771972656,
+ "learning_rate": 4.998783973292333e-06,
+ "loss": 0.8623,
+ "step": 225
+ },
+ {
+ "epoch": 0.10685579196217494,
+ "grad_norm": 3.0356857776641846,
+ "learning_rate": 4.998764440648507e-06,
+ "loss": 0.7426,
+ "step": 226
+ },
+ {
+ "epoch": 0.10732860520094563,
+ "grad_norm": 2.8932785987854004,
+ "learning_rate": 4.998744752417576e-06,
+ "loss": 0.6741,
+ "step": 227
+ },
+ {
+ "epoch": 0.10780141843971631,
+ "grad_norm": 3.085820436477661,
+ "learning_rate": 4.998724908600767e-06,
+ "loss": 0.6549,
+ "step": 228
+ },
+ {
+ "epoch": 0.108274231678487,
+ "grad_norm": 3.135829210281372,
+ "learning_rate": 4.998704909199314e-06,
+ "loss": 0.6702,
+ "step": 229
+ },
+ {
+ "epoch": 0.10874704491725769,
+ "grad_norm": 5.016134262084961,
+ "learning_rate": 4.9986847542144625e-06,
+ "loss": 0.7852,
+ "step": 230
+ },
+ {
+ "epoch": 0.10921985815602837,
+ "grad_norm": 3.9056200981140137,
+ "learning_rate": 4.998664443647468e-06,
+ "loss": 0.9654,
+ "step": 231
+ },
+ {
+ "epoch": 0.10969267139479906,
+ "grad_norm": 3.0880749225616455,
+ "learning_rate": 4.998643977499595e-06,
+ "loss": 0.7579,
+ "step": 232
+ },
+ {
+ "epoch": 0.11016548463356975,
+ "grad_norm": 3.6893601417541504,
+ "learning_rate": 4.998623355772118e-06,
+ "loss": 0.713,
+ "step": 233
+ },
+ {
+ "epoch": 0.11063829787234042,
+ "grad_norm": 4.181536674499512,
+ "learning_rate": 4.998602578466319e-06,
+ "loss": 0.7331,
+ "step": 234
+ },
+ {
+ "epoch": 0.1111111111111111,
+ "grad_norm": 3.036386728286743,
+ "learning_rate": 4.998581645583496e-06,
+ "loss": 0.7115,
+ "step": 235
+ },
+ {
+ "epoch": 0.11158392434988179,
+ "grad_norm": 3.6333255767822266,
+ "learning_rate": 4.998560557124948e-06,
+ "loss": 0.7544,
+ "step": 236
+ },
+ {
+ "epoch": 0.11205673758865248,
+ "grad_norm": 2.926417827606201,
+ "learning_rate": 4.9985393130919915e-06,
+ "loss": 0.715,
+ "step": 237
+ },
+ {
+ "epoch": 0.11252955082742316,
+ "grad_norm": 2.969158172607422,
+ "learning_rate": 4.998517913485946e-06,
+ "loss": 0.7304,
+ "step": 238
+ },
+ {
+ "epoch": 0.11300236406619385,
+ "grad_norm": 3.5254971981048584,
+ "learning_rate": 4.9984963583081466e-06,
+ "loss": 0.7725,
+ "step": 239
+ },
+ {
+ "epoch": 0.11347517730496454,
+ "grad_norm": 3.7840335369110107,
+ "learning_rate": 4.998474647559936e-06,
+ "loss": 0.8685,
+ "step": 240
+ },
+ {
+ "epoch": 0.11394799054373522,
+ "grad_norm": 3.0333125591278076,
+ "learning_rate": 4.9984527812426625e-06,
+ "loss": 0.7793,
+ "step": 241
+ },
+ {
+ "epoch": 0.11442080378250591,
+ "grad_norm": 3.290159225463867,
+ "learning_rate": 4.99843075935769e-06,
+ "loss": 0.7158,
+ "step": 242
+ },
+ {
+ "epoch": 0.1148936170212766,
+ "grad_norm": 3.3935494422912598,
+ "learning_rate": 4.99840858190639e-06,
+ "loss": 0.7643,
+ "step": 243
+ },
+ {
+ "epoch": 0.11536643026004728,
+ "grad_norm": 3.333965539932251,
+ "learning_rate": 4.998386248890142e-06,
+ "loss": 0.7255,
+ "step": 244
+ },
+ {
+ "epoch": 0.11583924349881797,
+ "grad_norm": 2.8129613399505615,
+ "learning_rate": 4.998363760310339e-06,
+ "loss": 0.768,
+ "step": 245
+ },
+ {
+ "epoch": 0.11631205673758865,
+ "grad_norm": 2.8678107261657715,
+ "learning_rate": 4.998341116168378e-06,
+ "loss": 0.7403,
+ "step": 246
+ },
+ {
+ "epoch": 0.11678486997635934,
+ "grad_norm": 2.8898239135742188,
+ "learning_rate": 4.998318316465672e-06,
+ "loss": 0.6844,
+ "step": 247
+ },
+ {
+ "epoch": 0.11725768321513003,
+ "grad_norm": 3.139777898788452,
+ "learning_rate": 4.998295361203637e-06,
+ "loss": 0.7936,
+ "step": 248
+ },
+ {
+ "epoch": 0.11773049645390071,
+ "grad_norm": 3.393721103668213,
+ "learning_rate": 4.998272250383707e-06,
+ "loss": 0.8173,
+ "step": 249
+ },
+ {
+ "epoch": 0.1182033096926714,
+ "grad_norm": 3.240973949432373,
+ "learning_rate": 4.998248984007318e-06,
+ "loss": 0.8252,
+ "step": 250
+ },
+ {
+ "epoch": 0.11867612293144209,
+ "grad_norm": 3.384855031967163,
+ "learning_rate": 4.998225562075918e-06,
+ "loss": 0.7244,
+ "step": 251
+ },
+ {
+ "epoch": 0.11914893617021277,
+ "grad_norm": 3.1881816387176514,
+ "learning_rate": 4.9982019845909675e-06,
+ "loss": 0.6818,
+ "step": 252
+ },
+ {
+ "epoch": 0.11962174940898346,
+ "grad_norm": 2.888364553451538,
+ "learning_rate": 4.998178251553934e-06,
+ "loss": 0.6753,
+ "step": 253
+ },
+ {
+ "epoch": 0.12009456264775414,
+ "grad_norm": 3.630093812942505,
+ "learning_rate": 4.9981543629662944e-06,
+ "loss": 0.7995,
+ "step": 254
+ },
+ {
+ "epoch": 0.12056737588652482,
+ "grad_norm": 2.9820947647094727,
+ "learning_rate": 4.998130318829537e-06,
+ "loss": 0.7478,
+ "step": 255
+ },
+ {
+ "epoch": 0.1210401891252955,
+ "grad_norm": 2.7094738483428955,
+ "learning_rate": 4.998106119145159e-06,
+ "loss": 0.7237,
+ "step": 256
+ },
+ {
+ "epoch": 0.12151300236406619,
+ "grad_norm": 3.1808104515075684,
+ "learning_rate": 4.9980817639146665e-06,
+ "loss": 0.7915,
+ "step": 257
+ },
+ {
+ "epoch": 0.12198581560283688,
+ "grad_norm": 3.1661291122436523,
+ "learning_rate": 4.998057253139575e-06,
+ "loss": 0.8053,
+ "step": 258
+ },
+ {
+ "epoch": 0.12245862884160756,
+ "grad_norm": 3.528749942779541,
+ "learning_rate": 4.998032586821413e-06,
+ "loss": 0.7946,
+ "step": 259
+ },
+ {
+ "epoch": 0.12293144208037825,
+ "grad_norm": 3.125964879989624,
+ "learning_rate": 4.998007764961716e-06,
+ "loss": 0.7569,
+ "step": 260
+ },
+ {
+ "epoch": 0.12340425531914893,
+ "grad_norm": 3.0778942108154297,
+ "learning_rate": 4.997982787562029e-06,
+ "loss": 0.7184,
+ "step": 261
+ },
+ {
+ "epoch": 0.12387706855791962,
+ "grad_norm": 3.3531930446624756,
+ "learning_rate": 4.997957654623906e-06,
+ "loss": 0.7586,
+ "step": 262
+ },
+ {
+ "epoch": 0.1243498817966903,
+ "grad_norm": 3.229278564453125,
+ "learning_rate": 4.997932366148913e-06,
+ "loss": 0.6092,
+ "step": 263
+ },
+ {
+ "epoch": 0.12482269503546099,
+ "grad_norm": 3.7286155223846436,
+ "learning_rate": 4.997906922138626e-06,
+ "loss": 0.7965,
+ "step": 264
+ },
+ {
+ "epoch": 0.12529550827423167,
+ "grad_norm": 3.300311803817749,
+ "learning_rate": 4.997881322594628e-06,
+ "loss": 0.7665,
+ "step": 265
+ },
+ {
+ "epoch": 0.12576832151300235,
+ "grad_norm": 3.411482572555542,
+ "learning_rate": 4.9978555675185115e-06,
+ "loss": 0.7253,
+ "step": 266
+ },
+ {
+ "epoch": 0.12624113475177304,
+ "grad_norm": 3.0884511470794678,
+ "learning_rate": 4.9978296569118825e-06,
+ "loss": 0.659,
+ "step": 267
+ },
+ {
+ "epoch": 0.12671394799054372,
+ "grad_norm": 3.0652925968170166,
+ "learning_rate": 4.9978035907763535e-06,
+ "loss": 0.6739,
+ "step": 268
+ },
+ {
+ "epoch": 0.1271867612293144,
+ "grad_norm": 3.280555009841919,
+ "learning_rate": 4.997777369113547e-06,
+ "loss": 0.8003,
+ "step": 269
+ },
+ {
+ "epoch": 0.1276595744680851,
+ "grad_norm": 2.980860948562622,
+ "learning_rate": 4.997750991925096e-06,
+ "loss": 0.7097,
+ "step": 270
+ },
+ {
+ "epoch": 0.12813238770685578,
+ "grad_norm": 3.301760673522949,
+ "learning_rate": 4.997724459212644e-06,
+ "loss": 0.7894,
+ "step": 271
+ },
+ {
+ "epoch": 0.12860520094562647,
+ "grad_norm": 2.9584903717041016,
+ "learning_rate": 4.997697770977841e-06,
+ "loss": 0.733,
+ "step": 272
+ },
+ {
+ "epoch": 0.12907801418439716,
+ "grad_norm": 3.5632214546203613,
+ "learning_rate": 4.99767092722235e-06,
+ "loss": 0.7228,
+ "step": 273
+ },
+ {
+ "epoch": 0.12955082742316784,
+ "grad_norm": 3.5900983810424805,
+ "learning_rate": 4.997643927947843e-06,
+ "loss": 0.7634,
+ "step": 274
+ },
+ {
+ "epoch": 0.13002364066193853,
+ "grad_norm": 3.332650661468506,
+ "learning_rate": 4.997616773156e-06,
+ "loss": 0.797,
+ "step": 275
+ },
+ {
+ "epoch": 0.13049645390070921,
+ "grad_norm": 3.1094167232513428,
+ "learning_rate": 4.997589462848512e-06,
+ "loss": 0.7849,
+ "step": 276
+ },
+ {
+ "epoch": 0.1309692671394799,
+ "grad_norm": 3.5359463691711426,
+ "learning_rate": 4.99756199702708e-06,
+ "loss": 0.6871,
+ "step": 277
+ },
+ {
+ "epoch": 0.1314420803782506,
+ "grad_norm": 3.190441846847534,
+ "learning_rate": 4.997534375693414e-06,
+ "loss": 0.6883,
+ "step": 278
+ },
+ {
+ "epoch": 0.13191489361702127,
+ "grad_norm": 3.063518762588501,
+ "learning_rate": 4.997506598849234e-06,
+ "loss": 0.7586,
+ "step": 279
+ },
+ {
+ "epoch": 0.13238770685579196,
+ "grad_norm": 3.4112050533294678,
+ "learning_rate": 4.997478666496269e-06,
+ "loss": 0.796,
+ "step": 280
+ },
+ {
+ "epoch": 0.13286052009456265,
+ "grad_norm": 3.231886386871338,
+ "learning_rate": 4.997450578636259e-06,
+ "loss": 0.7714,
+ "step": 281
+ },
+ {
+ "epoch": 0.13333333333333333,
+ "grad_norm": 3.279425621032715,
+ "learning_rate": 4.9974223352709515e-06,
+ "loss": 0.7793,
+ "step": 282
+ },
+ {
+ "epoch": 0.13380614657210402,
+ "grad_norm": 3.2154316902160645,
+ "learning_rate": 4.9973939364021075e-06,
+ "loss": 0.791,
+ "step": 283
+ },
+ {
+ "epoch": 0.1342789598108747,
+ "grad_norm": 3.2090768814086914,
+ "learning_rate": 4.9973653820314925e-06,
+ "loss": 0.6433,
+ "step": 284
+ },
+ {
+ "epoch": 0.1347517730496454,
+ "grad_norm": 3.1712026596069336,
+ "learning_rate": 4.997336672160886e-06,
+ "loss": 0.8128,
+ "step": 285
+ },
+ {
+ "epoch": 0.13522458628841608,
+ "grad_norm": 2.929229497909546,
+ "learning_rate": 4.997307806792076e-06,
+ "loss": 0.7594,
+ "step": 286
+ },
+ {
+ "epoch": 0.13569739952718676,
+ "grad_norm": 3.0363314151763916,
+ "learning_rate": 4.997278785926859e-06,
+ "loss": 0.7336,
+ "step": 287
+ },
+ {
+ "epoch": 0.13617021276595745,
+ "grad_norm": 3.1352357864379883,
+ "learning_rate": 4.997249609567042e-06,
+ "loss": 0.7225,
+ "step": 288
+ },
+ {
+ "epoch": 0.13664302600472814,
+ "grad_norm": 3.3171157836914062,
+ "learning_rate": 4.997220277714442e-06,
+ "loss": 0.7777,
+ "step": 289
+ },
+ {
+ "epoch": 0.13711583924349882,
+ "grad_norm": 3.050717353820801,
+ "learning_rate": 4.997190790370885e-06,
+ "loss": 0.6836,
+ "step": 290
+ },
+ {
+ "epoch": 0.1375886524822695,
+ "grad_norm": 3.0297694206237793,
+ "learning_rate": 4.997161147538208e-06,
+ "loss": 0.6883,
+ "step": 291
+ },
+ {
+ "epoch": 0.1380614657210402,
+ "grad_norm": 3.0566554069519043,
+ "learning_rate": 4.997131349218256e-06,
+ "loss": 0.6674,
+ "step": 292
+ },
+ {
+ "epoch": 0.13853427895981088,
+ "grad_norm": 3.799111843109131,
+ "learning_rate": 4.997101395412885e-06,
+ "loss": 0.8256,
+ "step": 293
+ },
+ {
+ "epoch": 0.13900709219858157,
+ "grad_norm": 3.1394248008728027,
+ "learning_rate": 4.9970712861239576e-06,
+ "loss": 0.7306,
+ "step": 294
+ },
+ {
+ "epoch": 0.13947990543735225,
+ "grad_norm": 3.0605666637420654,
+ "learning_rate": 4.997041021353352e-06,
+ "loss": 0.7212,
+ "step": 295
+ },
+ {
+ "epoch": 0.13995271867612294,
+ "grad_norm": 3.8813397884368896,
+ "learning_rate": 4.997010601102951e-06,
+ "loss": 0.769,
+ "step": 296
+ },
+ {
+ "epoch": 0.14042553191489363,
+ "grad_norm": 3.0514819622039795,
+ "learning_rate": 4.996980025374649e-06,
+ "loss": 0.7422,
+ "step": 297
+ },
+ {
+ "epoch": 0.1408983451536643,
+ "grad_norm": 2.9544146060943604,
+ "learning_rate": 4.99694929417035e-06,
+ "loss": 0.6912,
+ "step": 298
+ },
+ {
+ "epoch": 0.141371158392435,
+ "grad_norm": 3.2635602951049805,
+ "learning_rate": 4.996918407491966e-06,
+ "loss": 0.7395,
+ "step": 299
+ },
+ {
+ "epoch": 0.14184397163120568,
+ "grad_norm": 3.373882532119751,
+ "learning_rate": 4.996887365341423e-06,
+ "loss": 0.7799,
+ "step": 300
+ },
+ {
+ "epoch": 0.14231678486997637,
+ "grad_norm": 3.001128673553467,
+ "learning_rate": 4.996856167720652e-06,
+ "loss": 0.7168,
+ "step": 301
+ },
+ {
+ "epoch": 0.14278959810874706,
+ "grad_norm": 3.1026835441589355,
+ "learning_rate": 4.996824814631595e-06,
+ "loss": 0.7492,
+ "step": 302
+ },
+ {
+ "epoch": 0.14326241134751774,
+ "grad_norm": 3.41947603225708,
+ "learning_rate": 4.996793306076205e-06,
+ "loss": 0.6659,
+ "step": 303
+ },
+ {
+ "epoch": 0.14373522458628843,
+ "grad_norm": 3.2272400856018066,
+ "learning_rate": 4.996761642056444e-06,
+ "loss": 0.7184,
+ "step": 304
+ },
+ {
+ "epoch": 0.14420803782505912,
+ "grad_norm": 2.9488935470581055,
+ "learning_rate": 4.996729822574284e-06,
+ "loss": 0.7451,
+ "step": 305
+ },
+ {
+ "epoch": 0.14468085106382977,
+ "grad_norm": 3.268231153488159,
+ "learning_rate": 4.9966978476317065e-06,
+ "loss": 0.7798,
+ "step": 306
+ },
+ {
+ "epoch": 0.14515366430260046,
+ "grad_norm": 3.9086556434631348,
+ "learning_rate": 4.996665717230701e-06,
+ "loss": 0.7871,
+ "step": 307
+ },
+ {
+ "epoch": 0.14562647754137115,
+ "grad_norm": 3.3483879566192627,
+ "learning_rate": 4.996633431373269e-06,
+ "loss": 0.7415,
+ "step": 308
+ },
+ {
+ "epoch": 0.14609929078014183,
+ "grad_norm": 2.839400053024292,
+ "learning_rate": 4.99660099006142e-06,
+ "loss": 0.7192,
+ "step": 309
+ },
+ {
+ "epoch": 0.14657210401891252,
+ "grad_norm": 3.177302598953247,
+ "learning_rate": 4.996568393297175e-06,
+ "loss": 0.755,
+ "step": 310
+ },
+ {
+ "epoch": 0.1470449172576832,
+ "grad_norm": 3.5477044582366943,
+ "learning_rate": 4.996535641082563e-06,
+ "loss": 0.7531,
+ "step": 311
+ },
+ {
+ "epoch": 0.1475177304964539,
+ "grad_norm": 3.418576717376709,
+ "learning_rate": 4.996502733419624e-06,
+ "loss": 0.8009,
+ "step": 312
+ },
+ {
+ "epoch": 0.14799054373522458,
+ "grad_norm": 3.711341619491577,
+ "learning_rate": 4.996469670310407e-06,
+ "loss": 0.7362,
+ "step": 313
+ },
+ {
+ "epoch": 0.14846335697399526,
+ "grad_norm": 3.2419373989105225,
+ "learning_rate": 4.99643645175697e-06,
+ "loss": 0.7761,
+ "step": 314
+ },
+ {
+ "epoch": 0.14893617021276595,
+ "grad_norm": 3.121858835220337,
+ "learning_rate": 4.996403077761381e-06,
+ "loss": 0.6495,
+ "step": 315
+ },
+ {
+ "epoch": 0.14940898345153664,
+ "grad_norm": 3.123054265975952,
+ "learning_rate": 4.996369548325719e-06,
+ "loss": 0.7444,
+ "step": 316
+ },
+ {
+ "epoch": 0.14988179669030732,
+ "grad_norm": 2.780880928039551,
+ "learning_rate": 4.996335863452072e-06,
+ "loss": 0.672,
+ "step": 317
+ },
+ {
+ "epoch": 0.150354609929078,
+ "grad_norm": 3.3738629817962646,
+ "learning_rate": 4.996302023142536e-06,
+ "loss": 0.7972,
+ "step": 318
+ },
+ {
+ "epoch": 0.1508274231678487,
+ "grad_norm": 3.4874777793884277,
+ "learning_rate": 4.99626802739922e-06,
+ "loss": 0.8252,
+ "step": 319
+ },
+ {
+ "epoch": 0.15130023640661938,
+ "grad_norm": 3.7074787616729736,
+ "learning_rate": 4.9962338762242395e-06,
+ "loss": 0.8216,
+ "step": 320
+ },
+ {
+ "epoch": 0.15177304964539007,
+ "grad_norm": 3.281912326812744,
+ "learning_rate": 4.996199569619721e-06,
+ "loss": 0.8175,
+ "step": 321
+ },
+ {
+ "epoch": 0.15224586288416075,
+ "grad_norm": 2.9485340118408203,
+ "learning_rate": 4.996165107587801e-06,
+ "loss": 0.707,
+ "step": 322
+ },
+ {
+ "epoch": 0.15271867612293144,
+ "grad_norm": 3.3757646083831787,
+ "learning_rate": 4.996130490130625e-06,
+ "loss": 0.7955,
+ "step": 323
+ },
+ {
+ "epoch": 0.15319148936170213,
+ "grad_norm": 2.962181568145752,
+ "learning_rate": 4.996095717250349e-06,
+ "loss": 0.7067,
+ "step": 324
+ },
+ {
+ "epoch": 0.1536643026004728,
+ "grad_norm": 3.114272356033325,
+ "learning_rate": 4.996060788949136e-06,
+ "loss": 0.7486,
+ "step": 325
+ },
+ {
+ "epoch": 0.1541371158392435,
+ "grad_norm": 3.0621590614318848,
+ "learning_rate": 4.996025705229165e-06,
+ "loss": 0.6547,
+ "step": 326
+ },
+ {
+ "epoch": 0.15460992907801419,
+ "grad_norm": 2.8745882511138916,
+ "learning_rate": 4.995990466092616e-06,
+ "loss": 0.6435,
+ "step": 327
+ },
+ {
+ "epoch": 0.15508274231678487,
+ "grad_norm": 2.90841007232666,
+ "learning_rate": 4.995955071541686e-06,
+ "loss": 0.7331,
+ "step": 328
+ },
+ {
+ "epoch": 0.15555555555555556,
+ "grad_norm": 2.694580316543579,
+ "learning_rate": 4.9959195215785784e-06,
+ "loss": 0.6731,
+ "step": 329
+ },
+ {
+ "epoch": 0.15602836879432624,
+ "grad_norm": 3.158083438873291,
+ "learning_rate": 4.995883816205507e-06,
+ "loss": 0.7257,
+ "step": 330
+ },
+ {
+ "epoch": 0.15650118203309693,
+ "grad_norm": 3.3234715461730957,
+ "learning_rate": 4.995847955424694e-06,
+ "loss": 0.7389,
+ "step": 331
+ },
+ {
+ "epoch": 0.15697399527186762,
+ "grad_norm": 2.9406495094299316,
+ "learning_rate": 4.995811939238373e-06,
+ "loss": 0.643,
+ "step": 332
+ },
+ {
+ "epoch": 0.1574468085106383,
+ "grad_norm": 3.3191726207733154,
+ "learning_rate": 4.995775767648785e-06,
+ "loss": 0.7879,
+ "step": 333
+ },
+ {
+ "epoch": 0.157919621749409,
+ "grad_norm": 3.711925745010376,
+ "learning_rate": 4.995739440658185e-06,
+ "loss": 0.7586,
+ "step": 334
+ },
+ {
+ "epoch": 0.15839243498817968,
+ "grad_norm": 9.573421478271484,
+ "learning_rate": 4.995702958268833e-06,
+ "loss": 0.7842,
+ "step": 335
+ },
+ {
+ "epoch": 0.15886524822695036,
+ "grad_norm": 3.4154508113861084,
+ "learning_rate": 4.995666320483001e-06,
+ "loss": 0.6735,
+ "step": 336
+ },
+ {
+ "epoch": 0.15933806146572105,
+ "grad_norm": 3.4169859886169434,
+ "learning_rate": 4.995629527302971e-06,
+ "loss": 0.741,
+ "step": 337
+ },
+ {
+ "epoch": 0.15981087470449173,
+ "grad_norm": 3.287503242492676,
+ "learning_rate": 4.9955925787310335e-06,
+ "loss": 0.7139,
+ "step": 338
+ },
+ {
+ "epoch": 0.16028368794326242,
+ "grad_norm": 3.288409471511841,
+ "learning_rate": 4.995555474769488e-06,
+ "loss": 0.7636,
+ "step": 339
+ },
+ {
+ "epoch": 0.1607565011820331,
+ "grad_norm": 2.8021693229675293,
+ "learning_rate": 4.995518215420646e-06,
+ "loss": 0.5883,
+ "step": 340
+ },
+ {
+ "epoch": 0.1612293144208038,
+ "grad_norm": 2.7038564682006836,
+ "learning_rate": 4.995480800686827e-06,
+ "loss": 0.657,
+ "step": 341
+ },
+ {
+ "epoch": 0.16170212765957448,
+ "grad_norm": 3.2370235919952393,
+ "learning_rate": 4.9954432305703615e-06,
+ "loss": 0.6999,
+ "step": 342
+ },
+ {
+ "epoch": 0.16217494089834517,
+ "grad_norm": 2.8666412830352783,
+ "learning_rate": 4.995405505073588e-06,
+ "loss": 0.7199,
+ "step": 343
+ },
+ {
+ "epoch": 0.16264775413711585,
+ "grad_norm": 3.6467232704162598,
+ "learning_rate": 4.995367624198856e-06,
+ "loss": 0.7317,
+ "step": 344
+ },
+ {
+ "epoch": 0.16312056737588654,
+ "grad_norm": 2.7576327323913574,
+ "learning_rate": 4.9953295879485246e-06,
+ "loss": 0.647,
+ "step": 345
+ },
+ {
+ "epoch": 0.1635933806146572,
+ "grad_norm": 2.922232151031494,
+ "learning_rate": 4.995291396324959e-06,
+ "loss": 0.6686,
+ "step": 346
+ },
+ {
+ "epoch": 0.16406619385342788,
+ "grad_norm": 2.8693501949310303,
+ "learning_rate": 4.995253049330542e-06,
+ "loss": 0.6756,
+ "step": 347
+ },
+ {
+ "epoch": 0.16453900709219857,
+ "grad_norm": 3.671865701675415,
+ "learning_rate": 4.995214546967658e-06,
+ "loss": 0.7347,
+ "step": 348
+ },
+ {
+ "epoch": 0.16501182033096926,
+ "grad_norm": 3.024219274520874,
+ "learning_rate": 4.995175889238706e-06,
+ "loss": 0.7547,
+ "step": 349
+ },
+ {
+ "epoch": 0.16548463356973994,
+ "grad_norm": 2.8470778465270996,
+ "learning_rate": 4.995137076146091e-06,
+ "loss": 0.6764,
+ "step": 350
+ },
+ {
+ "epoch": 0.16595744680851063,
+ "grad_norm": 2.905057907104492,
+ "learning_rate": 4.9950981076922324e-06,
+ "loss": 0.6814,
+ "step": 351
+ },
+ {
+ "epoch": 0.16643026004728131,
+ "grad_norm": 3.504377841949463,
+ "learning_rate": 4.995058983879555e-06,
+ "loss": 0.7145,
+ "step": 352
+ },
+ {
+ "epoch": 0.166903073286052,
+ "grad_norm": 3.0029661655426025,
+ "learning_rate": 4.995019704710495e-06,
+ "loss": 0.7114,
+ "step": 353
+ },
+ {
+ "epoch": 0.1673758865248227,
+ "grad_norm": 2.8666274547576904,
+ "learning_rate": 4.994980270187499e-06,
+ "loss": 0.7416,
+ "step": 354
+ },
+ {
+ "epoch": 0.16784869976359337,
+ "grad_norm": 3.1644718647003174,
+ "learning_rate": 4.994940680313021e-06,
+ "loss": 0.661,
+ "step": 355
+ },
+ {
+ "epoch": 0.16832151300236406,
+ "grad_norm": 3.050391674041748,
+ "learning_rate": 4.994900935089527e-06,
+ "loss": 0.7243,
+ "step": 356
+ },
+ {
+ "epoch": 0.16879432624113475,
+ "grad_norm": 2.985466480255127,
+ "learning_rate": 4.994861034519491e-06,
+ "loss": 0.6917,
+ "step": 357
+ },
+ {
+ "epoch": 0.16926713947990543,
+ "grad_norm": 2.909342050552368,
+ "learning_rate": 4.9948209786053995e-06,
+ "loss": 0.6636,
+ "step": 358
+ },
+ {
+ "epoch": 0.16973995271867612,
+ "grad_norm": 3.2214784622192383,
+ "learning_rate": 4.9947807673497435e-06,
+ "loss": 0.7903,
+ "step": 359
+ },
+ {
+ "epoch": 0.1702127659574468,
+ "grad_norm": 2.5654983520507812,
+ "learning_rate": 4.994740400755029e-06,
+ "loss": 0.6129,
+ "step": 360
+ },
+ {
+ "epoch": 0.1706855791962175,
+ "grad_norm": 3.775646448135376,
+ "learning_rate": 4.99469987882377e-06,
+ "loss": 0.7145,
+ "step": 361
+ },
+ {
+ "epoch": 0.17115839243498818,
+ "grad_norm": 2.8965413570404053,
+ "learning_rate": 4.994659201558487e-06,
+ "loss": 0.7177,
+ "step": 362
+ },
+ {
+ "epoch": 0.17163120567375886,
+ "grad_norm": 3.485597848892212,
+ "learning_rate": 4.9946183689617146e-06,
+ "loss": 0.8107,
+ "step": 363
+ },
+ {
+ "epoch": 0.17210401891252955,
+ "grad_norm": 3.277839183807373,
+ "learning_rate": 4.994577381035995e-06,
+ "loss": 0.691,
+ "step": 364
+ },
+ {
+ "epoch": 0.17257683215130024,
+ "grad_norm": 2.8807685375213623,
+ "learning_rate": 4.99453623778388e-06,
+ "loss": 0.7627,
+ "step": 365
+ },
+ {
+ "epoch": 0.17304964539007092,
+ "grad_norm": 3.0659940242767334,
+ "learning_rate": 4.994494939207932e-06,
+ "loss": 0.6858,
+ "step": 366
+ },
+ {
+ "epoch": 0.1735224586288416,
+ "grad_norm": 3.0881855487823486,
+ "learning_rate": 4.994453485310723e-06,
+ "loss": 0.8212,
+ "step": 367
+ },
+ {
+ "epoch": 0.1739952718676123,
+ "grad_norm": 2.7199201583862305,
+ "learning_rate": 4.994411876094832e-06,
+ "loss": 0.6516,
+ "step": 368
+ },
+ {
+ "epoch": 0.17446808510638298,
+ "grad_norm": 2.955889940261841,
+ "learning_rate": 4.994370111562851e-06,
+ "loss": 0.6579,
+ "step": 369
+ },
+ {
+ "epoch": 0.17494089834515367,
+ "grad_norm": 3.1321663856506348,
+ "learning_rate": 4.994328191717382e-06,
+ "loss": 0.6891,
+ "step": 370
+ },
+ {
+ "epoch": 0.17541371158392435,
+ "grad_norm": 3.0560388565063477,
+ "learning_rate": 4.994286116561034e-06,
+ "loss": 0.7243,
+ "step": 371
+ },
+ {
+ "epoch": 0.17588652482269504,
+ "grad_norm": 3.1560704708099365,
+ "learning_rate": 4.994243886096425e-06,
+ "loss": 0.7262,
+ "step": 372
+ },
+ {
+ "epoch": 0.17635933806146573,
+ "grad_norm": 2.913541316986084,
+ "learning_rate": 4.994201500326187e-06,
+ "loss": 0.7318,
+ "step": 373
+ },
+ {
+ "epoch": 0.1768321513002364,
+ "grad_norm": 3.098376512527466,
+ "learning_rate": 4.994158959252958e-06,
+ "loss": 0.6419,
+ "step": 374
+ },
+ {
+ "epoch": 0.1773049645390071,
+ "grad_norm": 2.977508544921875,
+ "learning_rate": 4.994116262879387e-06,
+ "loss": 0.6709,
+ "step": 375
+ },
+ {
+ "epoch": 0.17777777777777778,
+ "grad_norm": 3.168186902999878,
+ "learning_rate": 4.994073411208133e-06,
+ "loss": 0.6608,
+ "step": 376
+ },
+ {
+ "epoch": 0.17825059101654847,
+ "grad_norm": 3.436844825744629,
+ "learning_rate": 4.994030404241864e-06,
+ "loss": 0.7227,
+ "step": 377
+ },
+ {
+ "epoch": 0.17872340425531916,
+ "grad_norm": 2.8998289108276367,
+ "learning_rate": 4.993987241983258e-06,
+ "loss": 0.6512,
+ "step": 378
+ },
+ {
+ "epoch": 0.17919621749408984,
+ "grad_norm": 3.407191514968872,
+ "learning_rate": 4.993943924435002e-06,
+ "loss": 0.616,
+ "step": 379
+ },
+ {
+ "epoch": 0.17966903073286053,
+ "grad_norm": 3.744858741760254,
+ "learning_rate": 4.993900451599793e-06,
+ "loss": 0.8599,
+ "step": 380
+ },
+ {
+ "epoch": 0.18014184397163122,
+ "grad_norm": 3.486283779144287,
+ "learning_rate": 4.993856823480338e-06,
+ "loss": 0.6634,
+ "step": 381
+ },
+ {
+ "epoch": 0.1806146572104019,
+ "grad_norm": 2.895719051361084,
+ "learning_rate": 4.993813040079355e-06,
+ "loss": 0.6972,
+ "step": 382
+ },
+ {
+ "epoch": 0.1810874704491726,
+ "grad_norm": 2.814133882522583,
+ "learning_rate": 4.993769101399569e-06,
+ "loss": 0.6271,
+ "step": 383
+ },
+ {
+ "epoch": 0.18156028368794327,
+ "grad_norm": 2.8609800338745117,
+ "learning_rate": 4.993725007443715e-06,
+ "loss": 0.6481,
+ "step": 384
+ },
+ {
+ "epoch": 0.18203309692671396,
+ "grad_norm": 3.2829644680023193,
+ "learning_rate": 4.99368075821454e-06,
+ "loss": 0.7999,
+ "step": 385
+ },
+ {
+ "epoch": 0.18250591016548465,
+ "grad_norm": 3.1417458057403564,
+ "learning_rate": 4.993636353714798e-06,
+ "loss": 0.6972,
+ "step": 386
+ },
+ {
+ "epoch": 0.1829787234042553,
+ "grad_norm": 3.0679385662078857,
+ "learning_rate": 4.993591793947256e-06,
+ "loss": 0.667,
+ "step": 387
+ },
+ {
+ "epoch": 0.183451536643026,
+ "grad_norm": 3.1387410163879395,
+ "learning_rate": 4.993547078914686e-06,
+ "loss": 0.7618,
+ "step": 388
+ },
+ {
+ "epoch": 0.18392434988179668,
+ "grad_norm": 2.9181406497955322,
+ "learning_rate": 4.993502208619872e-06,
+ "loss": 0.7391,
+ "step": 389
+ },
+ {
+ "epoch": 0.18439716312056736,
+ "grad_norm": 2.8952157497406006,
+ "learning_rate": 4.993457183065611e-06,
+ "loss": 0.6988,
+ "step": 390
+ },
+ {
+ "epoch": 0.18486997635933805,
+ "grad_norm": 3.2274813652038574,
+ "learning_rate": 4.993412002254704e-06,
+ "loss": 0.688,
+ "step": 391
+ },
+ {
+ "epoch": 0.18534278959810874,
+ "grad_norm": 3.4693779945373535,
+ "learning_rate": 4.993366666189965e-06,
+ "loss": 0.6634,
+ "step": 392
+ },
+ {
+ "epoch": 0.18581560283687942,
+ "grad_norm": 3.5358526706695557,
+ "learning_rate": 4.993321174874217e-06,
+ "loss": 0.7343,
+ "step": 393
+ },
+ {
+ "epoch": 0.1862884160756501,
+ "grad_norm": 3.013338088989258,
+ "learning_rate": 4.993275528310292e-06,
+ "loss": 0.7579,
+ "step": 394
+ },
+ {
+ "epoch": 0.1867612293144208,
+ "grad_norm": 2.694772720336914,
+ "learning_rate": 4.993229726501033e-06,
+ "loss": 0.718,
+ "step": 395
+ },
+ {
+ "epoch": 0.18723404255319148,
+ "grad_norm": 3.070612907409668,
+ "learning_rate": 4.9931837694492915e-06,
+ "loss": 0.6438,
+ "step": 396
+ },
+ {
+ "epoch": 0.18770685579196217,
+ "grad_norm": 2.9193027019500732,
+ "learning_rate": 4.993137657157928e-06,
+ "loss": 0.6788,
+ "step": 397
+ },
+ {
+ "epoch": 0.18817966903073285,
+ "grad_norm": 3.047682046890259,
+ "learning_rate": 4.993091389629816e-06,
+ "loss": 0.6826,
+ "step": 398
+ },
+ {
+ "epoch": 0.18865248226950354,
+ "grad_norm": 2.9629905223846436,
+ "learning_rate": 4.993044966867834e-06,
+ "loss": 0.7196,
+ "step": 399
+ },
+ {
+ "epoch": 0.18912529550827423,
+ "grad_norm": 3.0692050457000732,
+ "learning_rate": 4.992998388874874e-06,
+ "loss": 0.7015,
+ "step": 400
+ },
+ {
+ "epoch": 0.1895981087470449,
+ "grad_norm": 3.5427212715148926,
+ "learning_rate": 4.992951655653836e-06,
+ "loss": 0.8292,
+ "step": 401
+ },
+ {
+ "epoch": 0.1900709219858156,
+ "grad_norm": 2.643526554107666,
+ "learning_rate": 4.992904767207629e-06,
+ "loss": 0.624,
+ "step": 402
+ },
+ {
+ "epoch": 0.19054373522458629,
+ "grad_norm": 3.1185996532440186,
+ "learning_rate": 4.992857723539173e-06,
+ "loss": 0.7354,
+ "step": 403
+ },
+ {
+ "epoch": 0.19101654846335697,
+ "grad_norm": 3.006856679916382,
+ "learning_rate": 4.992810524651398e-06,
+ "loss": 0.7752,
+ "step": 404
+ },
+ {
+ "epoch": 0.19148936170212766,
+ "grad_norm": 2.9913275241851807,
+ "learning_rate": 4.9927631705472425e-06,
+ "loss": 0.7306,
+ "step": 405
+ },
+ {
+ "epoch": 0.19196217494089834,
+ "grad_norm": 2.6794071197509766,
+ "learning_rate": 4.992715661229655e-06,
+ "loss": 0.6136,
+ "step": 406
+ },
+ {
+ "epoch": 0.19243498817966903,
+ "grad_norm": 3.5933966636657715,
+ "learning_rate": 4.992667996701593e-06,
+ "loss": 0.7024,
+ "step": 407
+ },
+ {
+ "epoch": 0.19290780141843972,
+ "grad_norm": 2.862187623977661,
+ "learning_rate": 4.992620176966025e-06,
+ "loss": 0.692,
+ "step": 408
+ },
+ {
+ "epoch": 0.1933806146572104,
+ "grad_norm": 3.076845407485962,
+ "learning_rate": 4.9925722020259286e-06,
+ "loss": 0.7475,
+ "step": 409
+ },
+ {
+ "epoch": 0.1938534278959811,
+ "grad_norm": 3.372919797897339,
+ "learning_rate": 4.9925240718842895e-06,
+ "loss": 0.6886,
+ "step": 410
+ },
+ {
+ "epoch": 0.19432624113475178,
+ "grad_norm": 2.922977924346924,
+ "learning_rate": 4.992475786544108e-06,
+ "loss": 0.7049,
+ "step": 411
+ },
+ {
+ "epoch": 0.19479905437352246,
+ "grad_norm": 2.908034324645996,
+ "learning_rate": 4.992427346008387e-06,
+ "loss": 0.6498,
+ "step": 412
+ },
+ {
+ "epoch": 0.19527186761229315,
+ "grad_norm": 3.096723794937134,
+ "learning_rate": 4.992378750280144e-06,
+ "loss": 0.7151,
+ "step": 413
+ },
+ {
+ "epoch": 0.19574468085106383,
+ "grad_norm": 2.895237684249878,
+ "learning_rate": 4.992329999362405e-06,
+ "loss": 0.7277,
+ "step": 414
+ },
+ {
+ "epoch": 0.19621749408983452,
+ "grad_norm": 2.718230724334717,
+ "learning_rate": 4.9922810932582065e-06,
+ "loss": 0.6375,
+ "step": 415
+ },
+ {
+ "epoch": 0.1966903073286052,
+ "grad_norm": 3.187743663787842,
+ "learning_rate": 4.992232031970592e-06,
+ "loss": 0.6528,
+ "step": 416
+ },
+ {
+ "epoch": 0.1971631205673759,
+ "grad_norm": 2.996406316757202,
+ "learning_rate": 4.992182815502616e-06,
+ "loss": 0.6552,
+ "step": 417
+ },
+ {
+ "epoch": 0.19763593380614658,
+ "grad_norm": 3.301084041595459,
+ "learning_rate": 4.992133443857345e-06,
+ "loss": 0.7061,
+ "step": 418
+ },
+ {
+ "epoch": 0.19810874704491727,
+ "grad_norm": 3.7874677181243896,
+ "learning_rate": 4.992083917037853e-06,
+ "loss": 0.7859,
+ "step": 419
+ },
+ {
+ "epoch": 0.19858156028368795,
+ "grad_norm": 3.124253511428833,
+ "learning_rate": 4.992034235047222e-06,
+ "loss": 0.7615,
+ "step": 420
+ },
+ {
+ "epoch": 0.19905437352245864,
+ "grad_norm": 3.0488970279693604,
+ "learning_rate": 4.991984397888546e-06,
+ "loss": 0.6916,
+ "step": 421
+ },
+ {
+ "epoch": 0.19952718676122932,
+ "grad_norm": 3.1241321563720703,
+ "learning_rate": 4.991934405564929e-06,
+ "loss": 0.7055,
+ "step": 422
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 3.396632432937622,
+ "learning_rate": 4.991884258079484e-06,
+ "loss": 0.7675,
+ "step": 423
+ },
+ {
+ "epoch": 0.2004728132387707,
+ "grad_norm": 3.7776873111724854,
+ "learning_rate": 4.9918339554353316e-06,
+ "loss": 0.7371,
+ "step": 424
+ },
+ {
+ "epoch": 0.20094562647754138,
+ "grad_norm": 3.3356032371520996,
+ "learning_rate": 4.991783497635606e-06,
+ "loss": 0.6778,
+ "step": 425
+ },
+ {
+ "epoch": 0.20141843971631207,
+ "grad_norm": 2.988856792449951,
+ "learning_rate": 4.9917328846834474e-06,
+ "loss": 0.6795,
+ "step": 426
+ },
+ {
+ "epoch": 0.20189125295508276,
+ "grad_norm": 3.264183282852173,
+ "learning_rate": 4.99168211658201e-06,
+ "loss": 0.7707,
+ "step": 427
+ },
+ {
+ "epoch": 0.20236406619385341,
+ "grad_norm": 3.878068208694458,
+ "learning_rate": 4.991631193334451e-06,
+ "loss": 0.857,
+ "step": 428
+ },
+ {
+ "epoch": 0.2028368794326241,
+ "grad_norm": 3.6377553939819336,
+ "learning_rate": 4.991580114943943e-06,
+ "loss": 0.8033,
+ "step": 429
+ },
+ {
+ "epoch": 0.2033096926713948,
+ "grad_norm": 2.95393967628479,
+ "learning_rate": 4.991528881413667e-06,
+ "loss": 0.6809,
+ "step": 430
+ },
+ {
+ "epoch": 0.20378250591016547,
+ "grad_norm": 3.058704376220703,
+ "learning_rate": 4.9914774927468125e-06,
+ "loss": 0.6664,
+ "step": 431
+ },
+ {
+ "epoch": 0.20425531914893616,
+ "grad_norm": 2.7783217430114746,
+ "learning_rate": 4.9914259489465795e-06,
+ "loss": 0.6478,
+ "step": 432
+ },
+ {
+ "epoch": 0.20472813238770685,
+ "grad_norm": 2.4825217723846436,
+ "learning_rate": 4.991374250016177e-06,
+ "loss": 0.6598,
+ "step": 433
+ },
+ {
+ "epoch": 0.20520094562647753,
+ "grad_norm": 2.8753600120544434,
+ "learning_rate": 4.991322395958824e-06,
+ "loss": 0.6947,
+ "step": 434
+ },
+ {
+ "epoch": 0.20567375886524822,
+ "grad_norm": 3.2339367866516113,
+ "learning_rate": 4.99127038677775e-06,
+ "loss": 0.8201,
+ "step": 435
+ },
+ {
+ "epoch": 0.2061465721040189,
+ "grad_norm": 2.9065537452697754,
+ "learning_rate": 4.991218222476193e-06,
+ "loss": 0.6679,
+ "step": 436
+ },
+ {
+ "epoch": 0.2066193853427896,
+ "grad_norm": 3.283228874206543,
+ "learning_rate": 4.991165903057401e-06,
+ "loss": 0.8039,
+ "step": 437
+ },
+ {
+ "epoch": 0.20709219858156028,
+ "grad_norm": 3.429872751235962,
+ "learning_rate": 4.991113428524631e-06,
+ "loss": 0.7392,
+ "step": 438
+ },
+ {
+ "epoch": 0.20756501182033096,
+ "grad_norm": 3.118943452835083,
+ "learning_rate": 4.991060798881152e-06,
+ "loss": 0.6794,
+ "step": 439
+ },
+ {
+ "epoch": 0.20803782505910165,
+ "grad_norm": 3.395970106124878,
+ "learning_rate": 4.99100801413024e-06,
+ "loss": 0.6862,
+ "step": 440
+ },
+ {
+ "epoch": 0.20851063829787234,
+ "grad_norm": 2.869191884994507,
+ "learning_rate": 4.99095507427518e-06,
+ "loss": 0.6076,
+ "step": 441
+ },
+ {
+ "epoch": 0.20898345153664302,
+ "grad_norm": 3.1934661865234375,
+ "learning_rate": 4.990901979319272e-06,
+ "loss": 0.6927,
+ "step": 442
+ },
+ {
+ "epoch": 0.2094562647754137,
+ "grad_norm": 2.9068603515625,
+ "learning_rate": 4.990848729265819e-06,
+ "loss": 0.6864,
+ "step": 443
+ },
+ {
+ "epoch": 0.2099290780141844,
+ "grad_norm": 3.0535948276519775,
+ "learning_rate": 4.9907953241181375e-06,
+ "loss": 0.6396,
+ "step": 444
+ },
+ {
+ "epoch": 0.21040189125295508,
+ "grad_norm": 2.871511459350586,
+ "learning_rate": 4.990741763879554e-06,
+ "loss": 0.6743,
+ "step": 445
+ },
+ {
+ "epoch": 0.21087470449172577,
+ "grad_norm": 2.9184393882751465,
+ "learning_rate": 4.9906880485534015e-06,
+ "loss": 0.6786,
+ "step": 446
+ },
+ {
+ "epoch": 0.21134751773049645,
+ "grad_norm": 3.0628271102905273,
+ "learning_rate": 4.990634178143026e-06,
+ "loss": 0.6326,
+ "step": 447
+ },
+ {
+ "epoch": 0.21182033096926714,
+ "grad_norm": 3.7878305912017822,
+ "learning_rate": 4.990580152651782e-06,
+ "loss": 0.7944,
+ "step": 448
+ },
+ {
+ "epoch": 0.21229314420803783,
+ "grad_norm": 2.8577189445495605,
+ "learning_rate": 4.990525972083031e-06,
+ "loss": 0.71,
+ "step": 449
+ },
+ {
+ "epoch": 0.2127659574468085,
+ "grad_norm": 3.307769775390625,
+ "learning_rate": 4.99047163644015e-06,
+ "loss": 0.6893,
+ "step": 450
+ },
+ {
+ "epoch": 0.2132387706855792,
+ "grad_norm": 2.7391717433929443,
+ "learning_rate": 4.990417145726519e-06,
+ "loss": 0.712,
+ "step": 451
+ },
+ {
+ "epoch": 0.21371158392434988,
+ "grad_norm": 2.938044786453247,
+ "learning_rate": 4.990362499945534e-06,
+ "loss": 0.7516,
+ "step": 452
+ },
+ {
+ "epoch": 0.21418439716312057,
+ "grad_norm": 2.7831056118011475,
+ "learning_rate": 4.990307699100595e-06,
+ "loss": 0.6168,
+ "step": 453
+ },
+ {
+ "epoch": 0.21465721040189126,
+ "grad_norm": 2.907977342605591,
+ "learning_rate": 4.990252743195116e-06,
+ "loss": 0.6706,
+ "step": 454
+ },
+ {
+ "epoch": 0.21513002364066194,
+ "grad_norm": 3.7882161140441895,
+ "learning_rate": 4.990197632232517e-06,
+ "loss": 0.6847,
+ "step": 455
+ },
+ {
+ "epoch": 0.21560283687943263,
+ "grad_norm": 2.899716854095459,
+ "learning_rate": 4.990142366216232e-06,
+ "loss": 0.6699,
+ "step": 456
+ },
+ {
+ "epoch": 0.21607565011820332,
+ "grad_norm": 2.907003879547119,
+ "learning_rate": 4.990086945149701e-06,
+ "loss": 0.6864,
+ "step": 457
+ },
+ {
+ "epoch": 0.216548463356974,
+ "grad_norm": 3.2407333850860596,
+ "learning_rate": 4.9900313690363736e-06,
+ "loss": 0.692,
+ "step": 458
+ },
+ {
+ "epoch": 0.2170212765957447,
+ "grad_norm": 2.9055583477020264,
+ "learning_rate": 4.989975637879712e-06,
+ "loss": 0.7113,
+ "step": 459
+ },
+ {
+ "epoch": 0.21749408983451538,
+ "grad_norm": 2.9836206436157227,
+ "learning_rate": 4.989919751683184e-06,
+ "loss": 0.6673,
+ "step": 460
+ },
+ {
+ "epoch": 0.21796690307328606,
+ "grad_norm": 3.371035575866699,
+ "learning_rate": 4.989863710450273e-06,
+ "loss": 0.7181,
+ "step": 461
+ },
+ {
+ "epoch": 0.21843971631205675,
+ "grad_norm": 2.9636635780334473,
+ "learning_rate": 4.989807514184465e-06,
+ "loss": 0.6082,
+ "step": 462
+ },
+ {
+ "epoch": 0.21891252955082743,
+ "grad_norm": 2.9634664058685303,
+ "learning_rate": 4.9897511628892615e-06,
+ "loss": 0.7086,
+ "step": 463
+ },
+ {
+ "epoch": 0.21938534278959812,
+ "grad_norm": 3.154763698577881,
+ "learning_rate": 4.98969465656817e-06,
+ "loss": 0.7027,
+ "step": 464
+ },
+ {
+ "epoch": 0.2198581560283688,
+ "grad_norm": 2.9959890842437744,
+ "learning_rate": 4.98963799522471e-06,
+ "loss": 0.6498,
+ "step": 465
+ },
+ {
+ "epoch": 0.2203309692671395,
+ "grad_norm": 3.5470590591430664,
+ "learning_rate": 4.989581178862408e-06,
+ "loss": 0.7199,
+ "step": 466
+ },
+ {
+ "epoch": 0.22080378250591018,
+ "grad_norm": 7.1873369216918945,
+ "learning_rate": 4.989524207484802e-06,
+ "loss": 0.6676,
+ "step": 467
+ },
+ {
+ "epoch": 0.22127659574468084,
+ "grad_norm": 3.1099541187286377,
+ "learning_rate": 4.98946708109544e-06,
+ "loss": 0.6785,
+ "step": 468
+ },
+ {
+ "epoch": 0.22174940898345152,
+ "grad_norm": 2.830991506576538,
+ "learning_rate": 4.9894097996978795e-06,
+ "loss": 0.6456,
+ "step": 469
+ },
+ {
+ "epoch": 0.2222222222222222,
+ "grad_norm": 3.0212316513061523,
+ "learning_rate": 4.989352363295687e-06,
+ "loss": 0.6048,
+ "step": 470
+ },
+ {
+ "epoch": 0.2226950354609929,
+ "grad_norm": 3.18776798248291,
+ "learning_rate": 4.989294771892437e-06,
+ "loss": 0.7078,
+ "step": 471
+ },
+ {
+ "epoch": 0.22316784869976358,
+ "grad_norm": 2.9972598552703857,
+ "learning_rate": 4.989237025491717e-06,
+ "loss": 0.7082,
+ "step": 472
+ },
+ {
+ "epoch": 0.22364066193853427,
+ "grad_norm": 3.4935688972473145,
+ "learning_rate": 4.989179124097123e-06,
+ "loss": 0.8199,
+ "step": 473
+ },
+ {
+ "epoch": 0.22411347517730495,
+ "grad_norm": 2.6485543251037598,
+ "learning_rate": 4.9891210677122595e-06,
+ "loss": 0.6371,
+ "step": 474
+ },
+ {
+ "epoch": 0.22458628841607564,
+ "grad_norm": 2.969233512878418,
+ "learning_rate": 4.989062856340742e-06,
+ "loss": 0.6879,
+ "step": 475
+ },
+ {
+ "epoch": 0.22505910165484633,
+ "grad_norm": 2.881875514984131,
+ "learning_rate": 4.989004489986194e-06,
+ "loss": 0.7415,
+ "step": 476
+ },
+ {
+ "epoch": 0.225531914893617,
+ "grad_norm": 2.624540090560913,
+ "learning_rate": 4.98894596865225e-06,
+ "loss": 0.6522,
+ "step": 477
+ },
+ {
+ "epoch": 0.2260047281323877,
+ "grad_norm": 3.61075496673584,
+ "learning_rate": 4.988887292342555e-06,
+ "loss": 0.7109,
+ "step": 478
+ },
+ {
+ "epoch": 0.2264775413711584,
+ "grad_norm": 2.9368972778320312,
+ "learning_rate": 4.988828461060762e-06,
+ "loss": 0.6843,
+ "step": 479
+ },
+ {
+ "epoch": 0.22695035460992907,
+ "grad_norm": 3.0670197010040283,
+ "learning_rate": 4.988769474810533e-06,
+ "loss": 0.6807,
+ "step": 480
+ },
+ {
+ "epoch": 0.22742316784869976,
+ "grad_norm": 2.9662792682647705,
+ "learning_rate": 4.988710333595542e-06,
+ "loss": 0.6796,
+ "step": 481
+ },
+ {
+ "epoch": 0.22789598108747045,
+ "grad_norm": 2.971235752105713,
+ "learning_rate": 4.988651037419472e-06,
+ "loss": 0.696,
+ "step": 482
+ },
+ {
+ "epoch": 0.22836879432624113,
+ "grad_norm": 2.931884527206421,
+ "learning_rate": 4.988591586286013e-06,
+ "loss": 0.7323,
+ "step": 483
+ },
+ {
+ "epoch": 0.22884160756501182,
+ "grad_norm": 2.8114213943481445,
+ "learning_rate": 4.988531980198868e-06,
+ "loss": 0.6584,
+ "step": 484
+ },
+ {
+ "epoch": 0.2293144208037825,
+ "grad_norm": 3.2785916328430176,
+ "learning_rate": 4.98847221916175e-06,
+ "loss": 0.7514,
+ "step": 485
+ },
+ {
+ "epoch": 0.2297872340425532,
+ "grad_norm": 3.0520215034484863,
+ "learning_rate": 4.988412303178377e-06,
+ "loss": 0.7564,
+ "step": 486
+ },
+ {
+ "epoch": 0.23026004728132388,
+ "grad_norm": 3.181002616882324,
+ "learning_rate": 4.988352232252483e-06,
+ "loss": 0.6768,
+ "step": 487
+ },
+ {
+ "epoch": 0.23073286052009456,
+ "grad_norm": 3.4953625202178955,
+ "learning_rate": 4.988292006387805e-06,
+ "loss": 0.7143,
+ "step": 488
+ },
+ {
+ "epoch": 0.23120567375886525,
+ "grad_norm": 3.326571226119995,
+ "learning_rate": 4.988231625588096e-06,
+ "loss": 0.7318,
+ "step": 489
+ },
+ {
+ "epoch": 0.23167848699763594,
+ "grad_norm": 3.09614634513855,
+ "learning_rate": 4.988171089857113e-06,
+ "loss": 0.6574,
+ "step": 490
+ },
+ {
+ "epoch": 0.23215130023640662,
+ "grad_norm": 2.7439446449279785,
+ "learning_rate": 4.9881103991986265e-06,
+ "loss": 0.6637,
+ "step": 491
+ },
+ {
+ "epoch": 0.2326241134751773,
+ "grad_norm": 3.0681190490722656,
+ "learning_rate": 4.988049553616416e-06,
+ "loss": 0.6326,
+ "step": 492
+ },
+ {
+ "epoch": 0.233096926713948,
+ "grad_norm": 3.0757341384887695,
+ "learning_rate": 4.98798855311427e-06,
+ "loss": 0.695,
+ "step": 493
+ },
+ {
+ "epoch": 0.23356973995271868,
+ "grad_norm": 2.8637635707855225,
+ "learning_rate": 4.987927397695985e-06,
+ "loss": 0.6598,
+ "step": 494
+ },
+ {
+ "epoch": 0.23404255319148937,
+ "grad_norm": 3.3641068935394287,
+ "learning_rate": 4.9878660873653715e-06,
+ "loss": 0.7435,
+ "step": 495
+ },
+ {
+ "epoch": 0.23451536643026005,
+ "grad_norm": 3.5025596618652344,
+ "learning_rate": 4.987804622126245e-06,
+ "loss": 0.735,
+ "step": 496
+ },
+ {
+ "epoch": 0.23498817966903074,
+ "grad_norm": 2.9298837184906006,
+ "learning_rate": 4.987743001982434e-06,
+ "loss": 0.7063,
+ "step": 497
+ },
+ {
+ "epoch": 0.23546099290780143,
+ "grad_norm": 2.70358943939209,
+ "learning_rate": 4.987681226937774e-06,
+ "loss": 0.6799,
+ "step": 498
+ },
+ {
+ "epoch": 0.2359338061465721,
+ "grad_norm": 3.027871608734131,
+ "learning_rate": 4.9876192969961125e-06,
+ "loss": 0.6881,
+ "step": 499
+ },
+ {
+ "epoch": 0.2364066193853428,
+ "grad_norm": 3.362306594848633,
+ "learning_rate": 4.987557212161304e-06,
+ "loss": 0.7906,
+ "step": 500
+ },
+ {
+ "epoch": 0.23687943262411348,
+ "grad_norm": 3.3136050701141357,
+ "learning_rate": 4.987494972437217e-06,
+ "loss": 0.6878,
+ "step": 501
+ },
+ {
+ "epoch": 0.23735224586288417,
+ "grad_norm": 3.017089605331421,
+ "learning_rate": 4.9874325778277255e-06,
+ "loss": 0.7279,
+ "step": 502
+ },
+ {
+ "epoch": 0.23782505910165486,
+ "grad_norm": 2.8300516605377197,
+ "learning_rate": 4.987370028336714e-06,
+ "loss": 0.6864,
+ "step": 503
+ },
+ {
+ "epoch": 0.23829787234042554,
+ "grad_norm": 3.201860189437866,
+ "learning_rate": 4.987307323968077e-06,
+ "loss": 0.7531,
+ "step": 504
+ },
+ {
+ "epoch": 0.23877068557919623,
+ "grad_norm": 2.685396194458008,
+ "learning_rate": 4.987244464725721e-06,
+ "loss": 0.5849,
+ "step": 505
+ },
+ {
+ "epoch": 0.23924349881796692,
+ "grad_norm": 2.8715312480926514,
+ "learning_rate": 4.987181450613557e-06,
+ "loss": 0.675,
+ "step": 506
+ },
+ {
+ "epoch": 0.2397163120567376,
+ "grad_norm": 2.813908815383911,
+ "learning_rate": 4.987118281635511e-06,
+ "loss": 0.6841,
+ "step": 507
+ },
+ {
+ "epoch": 0.2401891252955083,
+ "grad_norm": 3.2738473415374756,
+ "learning_rate": 4.987054957795514e-06,
+ "loss": 0.7158,
+ "step": 508
+ },
+ {
+ "epoch": 0.24066193853427895,
+ "grad_norm": 2.896134376525879,
+ "learning_rate": 4.986991479097511e-06,
+ "loss": 0.7542,
+ "step": 509
+ },
+ {
+ "epoch": 0.24113475177304963,
+ "grad_norm": 3.0390403270721436,
+ "learning_rate": 4.986927845545454e-06,
+ "loss": 0.6733,
+ "step": 510
+ },
+ {
+ "epoch": 0.24160756501182032,
+ "grad_norm": 3.0300254821777344,
+ "learning_rate": 4.9868640571433044e-06,
+ "loss": 0.722,
+ "step": 511
+ },
+ {
+ "epoch": 0.242080378250591,
+ "grad_norm": 3.3037352561950684,
+ "learning_rate": 4.986800113895035e-06,
+ "loss": 0.6811,
+ "step": 512
+ },
+ {
+ "epoch": 0.2425531914893617,
+ "grad_norm": 3.0358474254608154,
+ "learning_rate": 4.986736015804627e-06,
+ "loss": 0.7348,
+ "step": 513
+ },
+ {
+ "epoch": 0.24302600472813238,
+ "grad_norm": 3.108792304992676,
+ "learning_rate": 4.986671762876071e-06,
+ "loss": 0.6096,
+ "step": 514
+ },
+ {
+ "epoch": 0.24349881796690306,
+ "grad_norm": 3.1316237449645996,
+ "learning_rate": 4.986607355113367e-06,
+ "loss": 0.6357,
+ "step": 515
+ },
+ {
+ "epoch": 0.24397163120567375,
+ "grad_norm": 3.3095219135284424,
+ "learning_rate": 4.986542792520528e-06,
+ "loss": 0.7515,
+ "step": 516
+ },
+ {
+ "epoch": 0.24444444444444444,
+ "grad_norm": 3.4775984287261963,
+ "learning_rate": 4.986478075101572e-06,
+ "loss": 0.7104,
+ "step": 517
+ },
+ {
+ "epoch": 0.24491725768321512,
+ "grad_norm": 3.341708183288574,
+ "learning_rate": 4.986413202860528e-06,
+ "loss": 0.7339,
+ "step": 518
+ },
+ {
+ "epoch": 0.2453900709219858,
+ "grad_norm": 2.9646966457366943,
+ "learning_rate": 4.986348175801438e-06,
+ "loss": 0.6032,
+ "step": 519
+ },
+ {
+ "epoch": 0.2458628841607565,
+ "grad_norm": 3.1853902339935303,
+ "learning_rate": 4.986282993928349e-06,
+ "loss": 0.6925,
+ "step": 520
+ },
+ {
+ "epoch": 0.24633569739952718,
+ "grad_norm": 3.286909818649292,
+ "learning_rate": 4.98621765724532e-06,
+ "loss": 0.7447,
+ "step": 521
+ },
+ {
+ "epoch": 0.24680851063829787,
+ "grad_norm": 3.2255051136016846,
+ "learning_rate": 4.986152165756419e-06,
+ "loss": 0.7747,
+ "step": 522
+ },
+ {
+ "epoch": 0.24728132387706855,
+ "grad_norm": 3.002352237701416,
+ "learning_rate": 4.986086519465724e-06,
+ "loss": 0.6472,
+ "step": 523
+ },
+ {
+ "epoch": 0.24775413711583924,
+ "grad_norm": 3.4738974571228027,
+ "learning_rate": 4.986020718377322e-06,
+ "loss": 0.7381,
+ "step": 524
+ },
+ {
+ "epoch": 0.24822695035460993,
+ "grad_norm": 3.4470200538635254,
+ "learning_rate": 4.985954762495312e-06,
+ "loss": 0.6878,
+ "step": 525
+ },
+ {
+ "epoch": 0.2486997635933806,
+ "grad_norm": 2.9219350814819336,
+ "learning_rate": 4.985888651823799e-06,
+ "loss": 0.6317,
+ "step": 526
+ },
+ {
+ "epoch": 0.2491725768321513,
+ "grad_norm": 3.061767101287842,
+ "learning_rate": 4.985822386366899e-06,
+ "loss": 0.6842,
+ "step": 527
+ },
+ {
+ "epoch": 0.24964539007092199,
+ "grad_norm": 3.0291247367858887,
+ "learning_rate": 4.985755966128742e-06,
+ "loss": 0.6852,
+ "step": 528
+ },
+ {
+ "epoch": 0.25011820330969264,
+ "grad_norm": 2.964280843734741,
+ "learning_rate": 4.985689391113457e-06,
+ "loss": 0.7738,
+ "step": 529
+ },
+ {
+ "epoch": 0.25059101654846333,
+ "grad_norm": 3.058302164077759,
+ "learning_rate": 4.9856226613251955e-06,
+ "loss": 0.6677,
+ "step": 530
+ },
+ {
+ "epoch": 0.251063829787234,
+ "grad_norm": 3.345141649246216,
+ "learning_rate": 4.985555776768109e-06,
+ "loss": 0.7837,
+ "step": 531
+ },
+ {
+ "epoch": 0.2515366430260047,
+ "grad_norm": 3.565031051635742,
+ "learning_rate": 4.9854887374463636e-06,
+ "loss": 0.7231,
+ "step": 532
+ },
+ {
+ "epoch": 0.2520094562647754,
+ "grad_norm": 2.7953789234161377,
+ "learning_rate": 4.985421543364132e-06,
+ "loss": 0.6102,
+ "step": 533
+ },
+ {
+ "epoch": 0.2524822695035461,
+ "grad_norm": 2.887606620788574,
+ "learning_rate": 4.9853541945256e-06,
+ "loss": 0.6289,
+ "step": 534
+ },
+ {
+ "epoch": 0.25295508274231676,
+ "grad_norm": 3.1480495929718018,
+ "learning_rate": 4.985286690934961e-06,
+ "loss": 0.6348,
+ "step": 535
+ },
+ {
+ "epoch": 0.25342789598108745,
+ "grad_norm": 2.8912761211395264,
+ "learning_rate": 4.985219032596416e-06,
+ "loss": 0.595,
+ "step": 536
+ },
+ {
+ "epoch": 0.25390070921985813,
+ "grad_norm": 2.947936534881592,
+ "learning_rate": 4.98515121951418e-06,
+ "loss": 0.6196,
+ "step": 537
+ },
+ {
+ "epoch": 0.2543735224586288,
+ "grad_norm": 3.1085827350616455,
+ "learning_rate": 4.985083251692474e-06,
+ "loss": 0.6387,
+ "step": 538
+ },
+ {
+ "epoch": 0.2548463356973995,
+ "grad_norm": 3.1688334941864014,
+ "learning_rate": 4.985015129135531e-06,
+ "loss": 0.7055,
+ "step": 539
+ },
+ {
+ "epoch": 0.2553191489361702,
+ "grad_norm": 3.075042963027954,
+ "learning_rate": 4.984946851847593e-06,
+ "loss": 0.7515,
+ "step": 540
+ },
+ {
+ "epoch": 0.2557919621749409,
+ "grad_norm": 3.1933093070983887,
+ "learning_rate": 4.98487841983291e-06,
+ "loss": 0.7054,
+ "step": 541
+ },
+ {
+ "epoch": 0.25626477541371157,
+ "grad_norm": 3.043473958969116,
+ "learning_rate": 4.984809833095744e-06,
+ "loss": 0.6281,
+ "step": 542
+ },
+ {
+ "epoch": 0.25673758865248225,
+ "grad_norm": 3.0532584190368652,
+ "learning_rate": 4.9847410916403645e-06,
+ "loss": 0.6155,
+ "step": 543
+ },
+ {
+ "epoch": 0.25721040189125294,
+ "grad_norm": 3.608480215072632,
+ "learning_rate": 4.984672195471053e-06,
+ "loss": 0.7363,
+ "step": 544
+ },
+ {
+ "epoch": 0.2576832151300236,
+ "grad_norm": 2.7491862773895264,
+ "learning_rate": 4.9846031445921e-06,
+ "loss": 0.6594,
+ "step": 545
+ },
+ {
+ "epoch": 0.2581560283687943,
+ "grad_norm": 2.8602418899536133,
+ "learning_rate": 4.984533939007802e-06,
+ "loss": 0.6742,
+ "step": 546
+ },
+ {
+ "epoch": 0.258628841607565,
+ "grad_norm": 3.1782007217407227,
+ "learning_rate": 4.98446457872247e-06,
+ "loss": 0.731,
+ "step": 547
+ },
+ {
+ "epoch": 0.2591016548463357,
+ "grad_norm": 2.796147584915161,
+ "learning_rate": 4.984395063740423e-06,
+ "loss": 0.6617,
+ "step": 548
+ },
+ {
+ "epoch": 0.25957446808510637,
+ "grad_norm": 2.8392202854156494,
+ "learning_rate": 4.984325394065991e-06,
+ "loss": 0.6753,
+ "step": 549
+ },
+ {
+ "epoch": 0.26004728132387706,
+ "grad_norm": 3.134672164916992,
+ "learning_rate": 4.984255569703508e-06,
+ "loss": 0.7222,
+ "step": 550
+ },
+ {
+ "epoch": 0.26052009456264774,
+ "grad_norm": 2.734330177307129,
+ "learning_rate": 4.984185590657325e-06,
+ "loss": 0.6098,
+ "step": 551
+ },
+ {
+ "epoch": 0.26099290780141843,
+ "grad_norm": 3.739010810852051,
+ "learning_rate": 4.984115456931798e-06,
+ "loss": 0.7457,
+ "step": 552
+ },
+ {
+ "epoch": 0.2614657210401891,
+ "grad_norm": 2.8412528038024902,
+ "learning_rate": 4.9840451685312925e-06,
+ "loss": 0.6972,
+ "step": 553
+ },
+ {
+ "epoch": 0.2619385342789598,
+ "grad_norm": 3.017395496368408,
+ "learning_rate": 4.983974725460188e-06,
+ "loss": 0.6887,
+ "step": 554
+ },
+ {
+ "epoch": 0.2624113475177305,
+ "grad_norm": 3.2746949195861816,
+ "learning_rate": 4.98390412772287e-06,
+ "loss": 0.7047,
+ "step": 555
+ },
+ {
+ "epoch": 0.2628841607565012,
+ "grad_norm": 3.1561965942382812,
+ "learning_rate": 4.983833375323732e-06,
+ "loss": 0.7726,
+ "step": 556
+ },
+ {
+ "epoch": 0.26335697399527186,
+ "grad_norm": 3.2367217540740967,
+ "learning_rate": 4.9837624682671816e-06,
+ "loss": 0.6348,
+ "step": 557
+ },
+ {
+ "epoch": 0.26382978723404255,
+ "grad_norm": 2.8195858001708984,
+ "learning_rate": 4.983691406557633e-06,
+ "loss": 0.6387,
+ "step": 558
+ },
+ {
+ "epoch": 0.26430260047281323,
+ "grad_norm": 3.349820852279663,
+ "learning_rate": 4.983620190199511e-06,
+ "loss": 0.6776,
+ "step": 559
+ },
+ {
+ "epoch": 0.2647754137115839,
+ "grad_norm": 2.8025588989257812,
+ "learning_rate": 4.98354881919725e-06,
+ "loss": 0.6512,
+ "step": 560
+ },
+ {
+ "epoch": 0.2652482269503546,
+ "grad_norm": 2.9125499725341797,
+ "learning_rate": 4.983477293555295e-06,
+ "loss": 0.7024,
+ "step": 561
+ },
+ {
+ "epoch": 0.2657210401891253,
+ "grad_norm": 3.3479275703430176,
+ "learning_rate": 4.983405613278098e-06,
+ "loss": 0.688,
+ "step": 562
+ },
+ {
+ "epoch": 0.266193853427896,
+ "grad_norm": 3.123971462249756,
+ "learning_rate": 4.983333778370123e-06,
+ "loss": 0.6743,
+ "step": 563
+ },
+ {
+ "epoch": 0.26666666666666666,
+ "grad_norm": 2.891625165939331,
+ "learning_rate": 4.983261788835843e-06,
+ "loss": 0.5971,
+ "step": 564
+ },
+ {
+ "epoch": 0.26713947990543735,
+ "grad_norm": 3.5066864490509033,
+ "learning_rate": 4.98318964467974e-06,
+ "loss": 0.6958,
+ "step": 565
+ },
+ {
+ "epoch": 0.26761229314420804,
+ "grad_norm": 2.570547342300415,
+ "learning_rate": 4.983117345906306e-06,
+ "loss": 0.609,
+ "step": 566
+ },
+ {
+ "epoch": 0.2680851063829787,
+ "grad_norm": 3.005106210708618,
+ "learning_rate": 4.983044892520044e-06,
+ "loss": 0.6791,
+ "step": 567
+ },
+ {
+ "epoch": 0.2685579196217494,
+ "grad_norm": 3.429675340652466,
+ "learning_rate": 4.982972284525463e-06,
+ "loss": 0.6625,
+ "step": 568
+ },
+ {
+ "epoch": 0.2690307328605201,
+ "grad_norm": 3.825657367706299,
+ "learning_rate": 4.982899521927086e-06,
+ "loss": 0.6368,
+ "step": 569
+ },
+ {
+ "epoch": 0.2695035460992908,
+ "grad_norm": 2.8699095249176025,
+ "learning_rate": 4.982826604729443e-06,
+ "loss": 0.6425,
+ "step": 570
+ },
+ {
+ "epoch": 0.26997635933806147,
+ "grad_norm": 3.1688714027404785,
+ "learning_rate": 4.982753532937074e-06,
+ "loss": 0.6904,
+ "step": 571
+ },
+ {
+ "epoch": 0.27044917257683215,
+ "grad_norm": 3.3889992237091064,
+ "learning_rate": 4.98268030655453e-06,
+ "loss": 0.7575,
+ "step": 572
+ },
+ {
+ "epoch": 0.27092198581560284,
+ "grad_norm": 3.108315944671631,
+ "learning_rate": 4.982606925586367e-06,
+ "loss": 0.6648,
+ "step": 573
+ },
+ {
+ "epoch": 0.2713947990543735,
+ "grad_norm": 3.209831953048706,
+ "learning_rate": 4.982533390037159e-06,
+ "loss": 0.657,
+ "step": 574
+ },
+ {
+ "epoch": 0.2718676122931442,
+ "grad_norm": 3.1740927696228027,
+ "learning_rate": 4.982459699911482e-06,
+ "loss": 0.7262,
+ "step": 575
+ },
+ {
+ "epoch": 0.2723404255319149,
+ "grad_norm": 3.0190417766571045,
+ "learning_rate": 4.982385855213924e-06,
+ "loss": 0.6368,
+ "step": 576
+ },
+ {
+ "epoch": 0.2728132387706856,
+ "grad_norm": 3.05049467086792,
+ "learning_rate": 4.982311855949084e-06,
+ "loss": 0.72,
+ "step": 577
+ },
+ {
+ "epoch": 0.27328605200945627,
+ "grad_norm": 2.984816551208496,
+ "learning_rate": 4.98223770212157e-06,
+ "loss": 0.6856,
+ "step": 578
+ },
+ {
+ "epoch": 0.27375886524822696,
+ "grad_norm": 2.744969606399536,
+ "learning_rate": 4.982163393735998e-06,
+ "loss": 0.6023,
+ "step": 579
+ },
+ {
+ "epoch": 0.27423167848699764,
+ "grad_norm": 3.170564889907837,
+ "learning_rate": 4.982088930796996e-06,
+ "loss": 0.6678,
+ "step": 580
+ },
+ {
+ "epoch": 0.27470449172576833,
+ "grad_norm": 2.8686118125915527,
+ "learning_rate": 4.982014313309199e-06,
+ "loss": 0.6157,
+ "step": 581
+ },
+ {
+ "epoch": 0.275177304964539,
+ "grad_norm": 2.8768694400787354,
+ "learning_rate": 4.981939541277254e-06,
+ "loss": 0.6566,
+ "step": 582
+ },
+ {
+ "epoch": 0.2756501182033097,
+ "grad_norm": 2.621481418609619,
+ "learning_rate": 4.981864614705818e-06,
+ "loss": 0.7372,
+ "step": 583
+ },
+ {
+ "epoch": 0.2761229314420804,
+ "grad_norm": 3.527374267578125,
+ "learning_rate": 4.981789533599554e-06,
+ "loss": 0.6485,
+ "step": 584
+ },
+ {
+ "epoch": 0.2765957446808511,
+ "grad_norm": 3.3141074180603027,
+ "learning_rate": 4.981714297963138e-06,
+ "loss": 0.6816,
+ "step": 585
+ },
+ {
+ "epoch": 0.27706855791962176,
+ "grad_norm": 2.9247069358825684,
+ "learning_rate": 4.981638907801255e-06,
+ "loss": 0.7217,
+ "step": 586
+ },
+ {
+ "epoch": 0.27754137115839245,
+ "grad_norm": 2.875236749649048,
+ "learning_rate": 4.981563363118599e-06,
+ "loss": 0.6662,
+ "step": 587
+ },
+ {
+ "epoch": 0.27801418439716313,
+ "grad_norm": 2.9540364742279053,
+ "learning_rate": 4.981487663919874e-06,
+ "loss": 0.7225,
+ "step": 588
+ },
+ {
+ "epoch": 0.2784869976359338,
+ "grad_norm": 2.90889310836792,
+ "learning_rate": 4.981411810209793e-06,
+ "loss": 0.6054,
+ "step": 589
+ },
+ {
+ "epoch": 0.2789598108747045,
+ "grad_norm": 2.8541409969329834,
+ "learning_rate": 4.981335801993078e-06,
+ "loss": 0.6539,
+ "step": 590
+ },
+ {
+ "epoch": 0.2794326241134752,
+ "grad_norm": 3.1600730419158936,
+ "learning_rate": 4.981259639274465e-06,
+ "loss": 0.6415,
+ "step": 591
+ },
+ {
+ "epoch": 0.2799054373522459,
+ "grad_norm": 3.569376230239868,
+ "learning_rate": 4.981183322058693e-06,
+ "loss": 0.6944,
+ "step": 592
+ },
+ {
+ "epoch": 0.28037825059101656,
+ "grad_norm": 3.067667007446289,
+ "learning_rate": 4.981106850350515e-06,
+ "loss": 0.7378,
+ "step": 593
+ },
+ {
+ "epoch": 0.28085106382978725,
+ "grad_norm": 3.082073450088501,
+ "learning_rate": 4.981030224154693e-06,
+ "loss": 0.693,
+ "step": 594
+ },
+ {
+ "epoch": 0.28132387706855794,
+ "grad_norm": 2.902932643890381,
+ "learning_rate": 4.980953443475998e-06,
+ "loss": 0.6549,
+ "step": 595
+ },
+ {
+ "epoch": 0.2817966903073286,
+ "grad_norm": 2.6821181774139404,
+ "learning_rate": 4.980876508319211e-06,
+ "loss": 0.6231,
+ "step": 596
+ },
+ {
+ "epoch": 0.2822695035460993,
+ "grad_norm": 3.1747355461120605,
+ "learning_rate": 4.9807994186891215e-06,
+ "loss": 0.6826,
+ "step": 597
+ },
+ {
+ "epoch": 0.28274231678487,
+ "grad_norm": 2.6975860595703125,
+ "learning_rate": 4.980722174590531e-06,
+ "loss": 0.6669,
+ "step": 598
+ },
+ {
+ "epoch": 0.2832151300236407,
+ "grad_norm": 2.924285650253296,
+ "learning_rate": 4.9806447760282486e-06,
+ "loss": 0.689,
+ "step": 599
+ },
+ {
+ "epoch": 0.28368794326241137,
+ "grad_norm": 2.941417694091797,
+ "learning_rate": 4.980567223007093e-06,
+ "loss": 0.6672,
+ "step": 600
+ },
+ {
+ "epoch": 0.28416075650118205,
+ "grad_norm": 2.8582186698913574,
+ "learning_rate": 4.980489515531892e-06,
+ "loss": 0.6229,
+ "step": 601
+ },
+ {
+ "epoch": 0.28463356973995274,
+ "grad_norm": 2.6462013721466064,
+ "learning_rate": 4.9804116536074865e-06,
+ "loss": 0.606,
+ "step": 602
+ },
+ {
+ "epoch": 0.2851063829787234,
+ "grad_norm": 2.9029998779296875,
+ "learning_rate": 4.980333637238723e-06,
+ "loss": 0.5915,
+ "step": 603
+ },
+ {
+ "epoch": 0.2855791962174941,
+ "grad_norm": 3.9359042644500732,
+ "learning_rate": 4.980255466430462e-06,
+ "loss": 0.7035,
+ "step": 604
+ },
+ {
+ "epoch": 0.2860520094562648,
+ "grad_norm": 3.200524091720581,
+ "learning_rate": 4.980177141187566e-06,
+ "loss": 0.7156,
+ "step": 605
+ },
+ {
+ "epoch": 0.2865248226950355,
+ "grad_norm": 3.1708686351776123,
+ "learning_rate": 4.980098661514916e-06,
+ "loss": 0.746,
+ "step": 606
+ },
+ {
+ "epoch": 0.28699763593380617,
+ "grad_norm": 2.8926830291748047,
+ "learning_rate": 4.980020027417397e-06,
+ "loss": 0.6282,
+ "step": 607
+ },
+ {
+ "epoch": 0.28747044917257686,
+ "grad_norm": 3.0526294708251953,
+ "learning_rate": 4.979941238899906e-06,
+ "loss": 0.6594,
+ "step": 608
+ },
+ {
+ "epoch": 0.28794326241134754,
+ "grad_norm": 2.9869306087493896,
+ "learning_rate": 4.9798622959673486e-06,
+ "loss": 0.7771,
+ "step": 609
+ },
+ {
+ "epoch": 0.28841607565011823,
+ "grad_norm": 2.7894513607025146,
+ "learning_rate": 4.979783198624638e-06,
+ "loss": 0.6819,
+ "step": 610
+ },
+ {
+ "epoch": 0.28888888888888886,
+ "grad_norm": 2.958575963973999,
+ "learning_rate": 4.9797039468767025e-06,
+ "loss": 0.6474,
+ "step": 611
+ },
+ {
+ "epoch": 0.28936170212765955,
+ "grad_norm": 3.423748016357422,
+ "learning_rate": 4.979624540728475e-06,
+ "loss": 0.7389,
+ "step": 612
+ },
+ {
+ "epoch": 0.28983451536643023,
+ "grad_norm": 2.9641635417938232,
+ "learning_rate": 4.9795449801849e-06,
+ "loss": 0.6005,
+ "step": 613
+ },
+ {
+ "epoch": 0.2903073286052009,
+ "grad_norm": 3.02274227142334,
+ "learning_rate": 4.979465265250933e-06,
+ "loss": 0.6358,
+ "step": 614
+ },
+ {
+ "epoch": 0.2907801418439716,
+ "grad_norm": 3.0562758445739746,
+ "learning_rate": 4.979385395931534e-06,
+ "loss": 0.6313,
+ "step": 615
+ },
+ {
+ "epoch": 0.2912529550827423,
+ "grad_norm": 3.301816701889038,
+ "learning_rate": 4.97930537223168e-06,
+ "loss": 0.7264,
+ "step": 616
+ },
+ {
+ "epoch": 0.291725768321513,
+ "grad_norm": 2.975360870361328,
+ "learning_rate": 4.979225194156351e-06,
+ "loss": 0.613,
+ "step": 617
+ },
+ {
+ "epoch": 0.29219858156028367,
+ "grad_norm": 2.9245030879974365,
+ "learning_rate": 4.97914486171054e-06,
+ "loss": 0.6646,
+ "step": 618
+ },
+ {
+ "epoch": 0.29267139479905435,
+ "grad_norm": 3.1336188316345215,
+ "learning_rate": 4.979064374899249e-06,
+ "loss": 0.6421,
+ "step": 619
+ },
+ {
+ "epoch": 0.29314420803782504,
+ "grad_norm": 3.6298763751983643,
+ "learning_rate": 4.978983733727491e-06,
+ "loss": 0.6433,
+ "step": 620
+ },
+ {
+ "epoch": 0.2936170212765957,
+ "grad_norm": 2.919597625732422,
+ "learning_rate": 4.9789029382002845e-06,
+ "loss": 0.6288,
+ "step": 621
+ },
+ {
+ "epoch": 0.2940898345153664,
+ "grad_norm": 3.2206127643585205,
+ "learning_rate": 4.978821988322662e-06,
+ "loss": 0.7102,
+ "step": 622
+ },
+ {
+ "epoch": 0.2945626477541371,
+ "grad_norm": 3.1767101287841797,
+ "learning_rate": 4.978740884099664e-06,
+ "loss": 0.6722,
+ "step": 623
+ },
+ {
+ "epoch": 0.2950354609929078,
+ "grad_norm": 3.3425452709198,
+ "learning_rate": 4.97865962553634e-06,
+ "loss": 0.6492,
+ "step": 624
+ },
+ {
+ "epoch": 0.29550827423167847,
+ "grad_norm": 3.0408358573913574,
+ "learning_rate": 4.97857821263775e-06,
+ "loss": 0.6522,
+ "step": 625
+ },
+ {
+ "epoch": 0.29598108747044916,
+ "grad_norm": 2.8144783973693848,
+ "learning_rate": 4.978496645408963e-06,
+ "loss": 0.7237,
+ "step": 626
+ },
+ {
+ "epoch": 0.29645390070921984,
+ "grad_norm": 3.7010560035705566,
+ "learning_rate": 4.978414923855057e-06,
+ "loss": 0.7509,
+ "step": 627
+ },
+ {
+ "epoch": 0.29692671394799053,
+ "grad_norm": 2.9438371658325195,
+ "learning_rate": 4.978333047981122e-06,
+ "loss": 0.6244,
+ "step": 628
+ },
+ {
+ "epoch": 0.2973995271867612,
+ "grad_norm": 3.285982370376587,
+ "learning_rate": 4.978251017792255e-06,
+ "loss": 0.7553,
+ "step": 629
+ },
+ {
+ "epoch": 0.2978723404255319,
+ "grad_norm": 3.7021138668060303,
+ "learning_rate": 4.978168833293564e-06,
+ "loss": 0.7859,
+ "step": 630
+ },
+ {
+ "epoch": 0.2983451536643026,
+ "grad_norm": 3.481858730316162,
+ "learning_rate": 4.9780864944901654e-06,
+ "loss": 0.7146,
+ "step": 631
+ },
+ {
+ "epoch": 0.2988179669030733,
+ "grad_norm": 3.693824529647827,
+ "learning_rate": 4.978004001387188e-06,
+ "loss": 0.6608,
+ "step": 632
+ },
+ {
+ "epoch": 0.29929078014184396,
+ "grad_norm": 3.0069146156311035,
+ "learning_rate": 4.9779213539897665e-06,
+ "loss": 0.6506,
+ "step": 633
+ },
+ {
+ "epoch": 0.29976359338061465,
+ "grad_norm": 3.037644147872925,
+ "learning_rate": 4.977838552303048e-06,
+ "loss": 0.6487,
+ "step": 634
+ },
+ {
+ "epoch": 0.30023640661938533,
+ "grad_norm": 3.018554449081421,
+ "learning_rate": 4.977755596332188e-06,
+ "loss": 0.6128,
+ "step": 635
+ },
+ {
+ "epoch": 0.300709219858156,
+ "grad_norm": 3.000312089920044,
+ "learning_rate": 4.977672486082351e-06,
+ "loss": 0.6431,
+ "step": 636
+ },
+ {
+ "epoch": 0.3011820330969267,
+ "grad_norm": 2.836803913116455,
+ "learning_rate": 4.977589221558713e-06,
+ "loss": 0.5914,
+ "step": 637
+ },
+ {
+ "epoch": 0.3016548463356974,
+ "grad_norm": 3.080469846725464,
+ "learning_rate": 4.977505802766457e-06,
+ "loss": 0.7265,
+ "step": 638
+ },
+ {
+ "epoch": 0.3021276595744681,
+ "grad_norm": 3.2245471477508545,
+ "learning_rate": 4.97742222971078e-06,
+ "loss": 0.6895,
+ "step": 639
+ },
+ {
+ "epoch": 0.30260047281323876,
+ "grad_norm": 3.559006452560425,
+ "learning_rate": 4.977338502396882e-06,
+ "loss": 0.7439,
+ "step": 640
+ },
+ {
+ "epoch": 0.30307328605200945,
+ "grad_norm": 2.9116289615631104,
+ "learning_rate": 4.9772546208299795e-06,
+ "loss": 0.6907,
+ "step": 641
+ },
+ {
+ "epoch": 0.30354609929078014,
+ "grad_norm": 3.3645524978637695,
+ "learning_rate": 4.977170585015295e-06,
+ "loss": 0.6983,
+ "step": 642
+ },
+ {
+ "epoch": 0.3040189125295508,
+ "grad_norm": 3.080148458480835,
+ "learning_rate": 4.977086394958058e-06,
+ "loss": 0.7016,
+ "step": 643
+ },
+ {
+ "epoch": 0.3044917257683215,
+ "grad_norm": 2.9276750087738037,
+ "learning_rate": 4.977002050663515e-06,
+ "loss": 0.6509,
+ "step": 644
+ },
+ {
+ "epoch": 0.3049645390070922,
+ "grad_norm": 3.183609962463379,
+ "learning_rate": 4.976917552136914e-06,
+ "loss": 0.6814,
+ "step": 645
+ },
+ {
+ "epoch": 0.3054373522458629,
+ "grad_norm": 3.0980000495910645,
+ "learning_rate": 4.976832899383519e-06,
+ "loss": 0.6319,
+ "step": 646
+ },
+ {
+ "epoch": 0.30591016548463357,
+ "grad_norm": 3.211376190185547,
+ "learning_rate": 4.9767480924086e-06,
+ "loss": 0.6365,
+ "step": 647
+ },
+ {
+ "epoch": 0.30638297872340425,
+ "grad_norm": 3.214430093765259,
+ "learning_rate": 4.976663131217437e-06,
+ "loss": 0.6006,
+ "step": 648
+ },
+ {
+ "epoch": 0.30685579196217494,
+ "grad_norm": 3.0914318561553955,
+ "learning_rate": 4.976578015815321e-06,
+ "loss": 0.7162,
+ "step": 649
+ },
+ {
+ "epoch": 0.3073286052009456,
+ "grad_norm": 2.7644500732421875,
+ "learning_rate": 4.976492746207551e-06,
+ "loss": 0.6045,
+ "step": 650
+ },
+ {
+ "epoch": 0.3078014184397163,
+ "grad_norm": 3.1913280487060547,
+ "learning_rate": 4.9764073223994374e-06,
+ "loss": 0.6796,
+ "step": 651
+ },
+ {
+ "epoch": 0.308274231678487,
+ "grad_norm": 2.8919692039489746,
+ "learning_rate": 4.976321744396299e-06,
+ "loss": 0.6683,
+ "step": 652
+ },
+ {
+ "epoch": 0.3087470449172577,
+ "grad_norm": 2.862234115600586,
+ "learning_rate": 4.976236012203463e-06,
+ "loss": 0.6631,
+ "step": 653
+ },
+ {
+ "epoch": 0.30921985815602837,
+ "grad_norm": 2.9708092212677,
+ "learning_rate": 4.976150125826268e-06,
+ "loss": 0.6326,
+ "step": 654
+ },
+ {
+ "epoch": 0.30969267139479906,
+ "grad_norm": 2.892465353012085,
+ "learning_rate": 4.976064085270063e-06,
+ "loss": 0.6574,
+ "step": 655
+ },
+ {
+ "epoch": 0.31016548463356974,
+ "grad_norm": 3.9215126037597656,
+ "learning_rate": 4.975977890540205e-06,
+ "loss": 0.7351,
+ "step": 656
+ },
+ {
+ "epoch": 0.31063829787234043,
+ "grad_norm": 2.9544081687927246,
+ "learning_rate": 4.975891541642059e-06,
+ "loss": 0.7264,
+ "step": 657
+ },
+ {
+ "epoch": 0.3111111111111111,
+ "grad_norm": 2.995035409927368,
+ "learning_rate": 4.975805038581005e-06,
+ "loss": 0.7405,
+ "step": 658
+ },
+ {
+ "epoch": 0.3115839243498818,
+ "grad_norm": 2.9653120040893555,
+ "learning_rate": 4.975718381362427e-06,
+ "loss": 0.679,
+ "step": 659
+ },
+ {
+ "epoch": 0.3120567375886525,
+ "grad_norm": 2.93976092338562,
+ "learning_rate": 4.9756315699917205e-06,
+ "loss": 0.627,
+ "step": 660
+ },
+ {
+ "epoch": 0.3125295508274232,
+ "grad_norm": 3.106522560119629,
+ "learning_rate": 4.9755446044742915e-06,
+ "loss": 0.6329,
+ "step": 661
+ },
+ {
+ "epoch": 0.31300236406619386,
+ "grad_norm": 3.0238280296325684,
+ "learning_rate": 4.975457484815554e-06,
+ "loss": 0.6643,
+ "step": 662
+ },
+ {
+ "epoch": 0.31347517730496455,
+ "grad_norm": 2.943528175354004,
+ "learning_rate": 4.9753702110209356e-06,
+ "loss": 0.668,
+ "step": 663
+ },
+ {
+ "epoch": 0.31394799054373523,
+ "grad_norm": 2.6840121746063232,
+ "learning_rate": 4.9752827830958676e-06,
+ "loss": 0.5482,
+ "step": 664
+ },
+ {
+ "epoch": 0.3144208037825059,
+ "grad_norm": 2.823875904083252,
+ "learning_rate": 4.975195201045794e-06,
+ "loss": 0.7017,
+ "step": 665
+ },
+ {
+ "epoch": 0.3148936170212766,
+ "grad_norm": 3.148181200027466,
+ "learning_rate": 4.975107464876168e-06,
+ "loss": 0.747,
+ "step": 666
+ },
+ {
+ "epoch": 0.3153664302600473,
+ "grad_norm": 2.630584478378296,
+ "learning_rate": 4.9750195745924545e-06,
+ "loss": 0.5987,
+ "step": 667
+ },
+ {
+ "epoch": 0.315839243498818,
+ "grad_norm": 3.075866460800171,
+ "learning_rate": 4.974931530200124e-06,
+ "loss": 0.664,
+ "step": 668
+ },
+ {
+ "epoch": 0.31631205673758866,
+ "grad_norm": 2.947197914123535,
+ "learning_rate": 4.974843331704659e-06,
+ "loss": 0.631,
+ "step": 669
+ },
+ {
+ "epoch": 0.31678486997635935,
+ "grad_norm": 3.519646644592285,
+ "learning_rate": 4.974754979111552e-06,
+ "loss": 0.7154,
+ "step": 670
+ },
+ {
+ "epoch": 0.31725768321513004,
+ "grad_norm": 2.8687186241149902,
+ "learning_rate": 4.974666472426305e-06,
+ "loss": 0.6366,
+ "step": 671
+ },
+ {
+ "epoch": 0.3177304964539007,
+ "grad_norm": 2.6966612339019775,
+ "learning_rate": 4.974577811654426e-06,
+ "loss": 0.7112,
+ "step": 672
+ },
+ {
+ "epoch": 0.3182033096926714,
+ "grad_norm": 3.1390228271484375,
+ "learning_rate": 4.974488996801439e-06,
+ "loss": 0.6882,
+ "step": 673
+ },
+ {
+ "epoch": 0.3186761229314421,
+ "grad_norm": 3.4667599201202393,
+ "learning_rate": 4.974400027872871e-06,
+ "loss": 0.7153,
+ "step": 674
+ },
+ {
+ "epoch": 0.3191489361702128,
+ "grad_norm": 2.9632184505462646,
+ "learning_rate": 4.974310904874265e-06,
+ "loss": 0.7081,
+ "step": 675
+ },
+ {
+ "epoch": 0.31962174940898347,
+ "grad_norm": 3.46150279045105,
+ "learning_rate": 4.9742216278111666e-06,
+ "loss": 0.6242,
+ "step": 676
+ },
+ {
+ "epoch": 0.32009456264775416,
+ "grad_norm": 3.380403757095337,
+ "learning_rate": 4.974132196689137e-06,
+ "loss": 0.6863,
+ "step": 677
+ },
+ {
+ "epoch": 0.32056737588652484,
+ "grad_norm": 3.4279606342315674,
+ "learning_rate": 4.974042611513746e-06,
+ "loss": 0.6388,
+ "step": 678
+ },
+ {
+ "epoch": 0.3210401891252955,
+ "grad_norm": 2.634523391723633,
+ "learning_rate": 4.973952872290568e-06,
+ "loss": 0.6038,
+ "step": 679
+ },
+ {
+ "epoch": 0.3215130023640662,
+ "grad_norm": 3.19693922996521,
+ "learning_rate": 4.973862979025194e-06,
+ "loss": 0.6383,
+ "step": 680
+ },
+ {
+ "epoch": 0.3219858156028369,
+ "grad_norm": 3.437692165374756,
+ "learning_rate": 4.973772931723218e-06,
+ "loss": 0.7288,
+ "step": 681
+ },
+ {
+ "epoch": 0.3224586288416076,
+ "grad_norm": 2.506301164627075,
+ "learning_rate": 4.97368273039025e-06,
+ "loss": 0.5707,
+ "step": 682
+ },
+ {
+ "epoch": 0.3229314420803783,
+ "grad_norm": 3.0942845344543457,
+ "learning_rate": 4.9735923750319044e-06,
+ "loss": 0.6348,
+ "step": 683
+ },
+ {
+ "epoch": 0.32340425531914896,
+ "grad_norm": 3.0889835357666016,
+ "learning_rate": 4.973501865653809e-06,
+ "loss": 0.6697,
+ "step": 684
+ },
+ {
+ "epoch": 0.32387706855791965,
+ "grad_norm": 3.0391931533813477,
+ "learning_rate": 4.973411202261598e-06,
+ "loss": 0.7091,
+ "step": 685
+ },
+ {
+ "epoch": 0.32434988179669033,
+ "grad_norm": 3.0333497524261475,
+ "learning_rate": 4.973320384860917e-06,
+ "loss": 0.6403,
+ "step": 686
+ },
+ {
+ "epoch": 0.324822695035461,
+ "grad_norm": 2.9714622497558594,
+ "learning_rate": 4.973229413457421e-06,
+ "loss": 0.6977,
+ "step": 687
+ },
+ {
+ "epoch": 0.3252955082742317,
+ "grad_norm": 3.057558298110962,
+ "learning_rate": 4.973138288056774e-06,
+ "loss": 0.7236,
+ "step": 688
+ },
+ {
+ "epoch": 0.3257683215130024,
+ "grad_norm": 2.921093463897705,
+ "learning_rate": 4.97304700866465e-06,
+ "loss": 0.576,
+ "step": 689
+ },
+ {
+ "epoch": 0.3262411347517731,
+ "grad_norm": 3.0287256240844727,
+ "learning_rate": 4.972955575286732e-06,
+ "loss": 0.7077,
+ "step": 690
+ },
+ {
+ "epoch": 0.32671394799054376,
+ "grad_norm": 2.8621346950531006,
+ "learning_rate": 4.972863987928716e-06,
+ "loss": 0.6952,
+ "step": 691
+ },
+ {
+ "epoch": 0.3271867612293144,
+ "grad_norm": 2.631359100341797,
+ "learning_rate": 4.9727722465963006e-06,
+ "loss": 0.6931,
+ "step": 692
+ },
+ {
+ "epoch": 0.3276595744680851,
+ "grad_norm": 2.8484320640563965,
+ "learning_rate": 4.972680351295201e-06,
+ "loss": 0.6292,
+ "step": 693
+ },
+ {
+ "epoch": 0.32813238770685577,
+ "grad_norm": 2.593001365661621,
+ "learning_rate": 4.972588302031138e-06,
+ "loss": 0.5942,
+ "step": 694
+ },
+ {
+ "epoch": 0.32860520094562645,
+ "grad_norm": 2.6321065425872803,
+ "learning_rate": 4.972496098809844e-06,
+ "loss": 0.65,
+ "step": 695
+ },
+ {
+ "epoch": 0.32907801418439714,
+ "grad_norm": 3.2516732215881348,
+ "learning_rate": 4.972403741637059e-06,
+ "loss": 0.7385,
+ "step": 696
+ },
+ {
+ "epoch": 0.3295508274231678,
+ "grad_norm": 3.180854320526123,
+ "learning_rate": 4.972311230518535e-06,
+ "loss": 0.6569,
+ "step": 697
+ },
+ {
+ "epoch": 0.3300236406619385,
+ "grad_norm": 4.161016941070557,
+ "learning_rate": 4.972218565460031e-06,
+ "loss": 0.6416,
+ "step": 698
+ },
+ {
+ "epoch": 0.3304964539007092,
+ "grad_norm": 3.153897762298584,
+ "learning_rate": 4.972125746467317e-06,
+ "loss": 0.7196,
+ "step": 699
+ },
+ {
+ "epoch": 0.3309692671394799,
+ "grad_norm": 2.9595556259155273,
+ "learning_rate": 4.972032773546173e-06,
+ "loss": 0.7093,
+ "step": 700
+ },
+ {
+ "epoch": 0.33144208037825057,
+ "grad_norm": 3.1086833477020264,
+ "learning_rate": 4.9719396467023875e-06,
+ "loss": 0.6963,
+ "step": 701
+ },
+ {
+ "epoch": 0.33191489361702126,
+ "grad_norm": 2.958921432495117,
+ "learning_rate": 4.971846365941759e-06,
+ "loss": 0.6518,
+ "step": 702
+ },
+ {
+ "epoch": 0.33238770685579194,
+ "grad_norm": 2.8745479583740234,
+ "learning_rate": 4.971752931270096e-06,
+ "loss": 0.696,
+ "step": 703
+ },
+ {
+ "epoch": 0.33286052009456263,
+ "grad_norm": 3.224358558654785,
+ "learning_rate": 4.971659342693217e-06,
+ "loss": 0.6769,
+ "step": 704
+ },
+ {
+ "epoch": 0.3333333333333333,
+ "grad_norm": 2.696319580078125,
+ "learning_rate": 4.9715656002169486e-06,
+ "loss": 0.6833,
+ "step": 705
+ },
+ {
+ "epoch": 0.333806146572104,
+ "grad_norm": 2.9283502101898193,
+ "learning_rate": 4.971471703847127e-06,
+ "loss": 0.6784,
+ "step": 706
+ },
+ {
+ "epoch": 0.3342789598108747,
+ "grad_norm": 2.654914140701294,
+ "learning_rate": 4.9713776535896e-06,
+ "loss": 0.6337,
+ "step": 707
+ },
+ {
+ "epoch": 0.3347517730496454,
+ "grad_norm": 3.041555643081665,
+ "learning_rate": 4.971283449450224e-06,
+ "loss": 0.6227,
+ "step": 708
+ },
+ {
+ "epoch": 0.33522458628841606,
+ "grad_norm": 2.893008232116699,
+ "learning_rate": 4.971189091434863e-06,
+ "loss": 0.655,
+ "step": 709
+ },
+ {
+ "epoch": 0.33569739952718675,
+ "grad_norm": 2.8806653022766113,
+ "learning_rate": 4.971094579549393e-06,
+ "loss": 0.7077,
+ "step": 710
+ },
+ {
+ "epoch": 0.33617021276595743,
+ "grad_norm": 3.4830048084259033,
+ "learning_rate": 4.9709999137996986e-06,
+ "loss": 0.7461,
+ "step": 711
+ },
+ {
+ "epoch": 0.3366430260047281,
+ "grad_norm": 3.155444860458374,
+ "learning_rate": 4.970905094191674e-06,
+ "loss": 0.652,
+ "step": 712
+ },
+ {
+ "epoch": 0.3371158392434988,
+ "grad_norm": 2.7608706951141357,
+ "learning_rate": 4.970810120731225e-06,
+ "loss": 0.684,
+ "step": 713
+ },
+ {
+ "epoch": 0.3375886524822695,
+ "grad_norm": 2.8209474086761475,
+ "learning_rate": 4.970714993424265e-06,
+ "loss": 0.6009,
+ "step": 714
+ },
+ {
+ "epoch": 0.3380614657210402,
+ "grad_norm": 3.6532654762268066,
+ "learning_rate": 4.9706197122767145e-06,
+ "loss": 0.702,
+ "step": 715
+ },
+ {
+ "epoch": 0.33853427895981086,
+ "grad_norm": 2.6276566982269287,
+ "learning_rate": 4.970524277294508e-06,
+ "loss": 0.6338,
+ "step": 716
+ },
+ {
+ "epoch": 0.33900709219858155,
+ "grad_norm": 3.509871482849121,
+ "learning_rate": 4.970428688483589e-06,
+ "loss": 0.6853,
+ "step": 717
+ },
+ {
+ "epoch": 0.33947990543735224,
+ "grad_norm": 5.332682132720947,
+ "learning_rate": 4.970332945849906e-06,
+ "loss": 0.6684,
+ "step": 718
+ },
+ {
+ "epoch": 0.3399527186761229,
+ "grad_norm": 2.718801975250244,
+ "learning_rate": 4.970237049399424e-06,
+ "loss": 0.6676,
+ "step": 719
+ },
+ {
+ "epoch": 0.3404255319148936,
+ "grad_norm": 3.891003131866455,
+ "learning_rate": 4.970140999138112e-06,
+ "loss": 0.7043,
+ "step": 720
+ },
+ {
+ "epoch": 0.3408983451536643,
+ "grad_norm": 2.8863155841827393,
+ "learning_rate": 4.970044795071951e-06,
+ "loss": 0.6563,
+ "step": 721
+ },
+ {
+ "epoch": 0.341371158392435,
+ "grad_norm": 3.2527518272399902,
+ "learning_rate": 4.969948437206932e-06,
+ "loss": 0.7244,
+ "step": 722
+ },
+ {
+ "epoch": 0.34184397163120567,
+ "grad_norm": 2.9726758003234863,
+ "learning_rate": 4.969851925549054e-06,
+ "loss": 0.6548,
+ "step": 723
+ },
+ {
+ "epoch": 0.34231678486997635,
+ "grad_norm": 3.118309497833252,
+ "learning_rate": 4.969755260104327e-06,
+ "loss": 0.7293,
+ "step": 724
+ },
+ {
+ "epoch": 0.34278959810874704,
+ "grad_norm": 3.373068332672119,
+ "learning_rate": 4.969658440878769e-06,
+ "loss": 0.6444,
+ "step": 725
+ },
+ {
+ "epoch": 0.3432624113475177,
+ "grad_norm": 2.7157437801361084,
+ "learning_rate": 4.969561467878409e-06,
+ "loss": 0.642,
+ "step": 726
+ },
+ {
+ "epoch": 0.3437352245862884,
+ "grad_norm": 2.58929705619812,
+ "learning_rate": 4.969464341109285e-06,
+ "loss": 0.6165,
+ "step": 727
+ },
+ {
+ "epoch": 0.3442080378250591,
+ "grad_norm": 2.8811306953430176,
+ "learning_rate": 4.969367060577445e-06,
+ "loss": 0.7127,
+ "step": 728
+ },
+ {
+ "epoch": 0.3446808510638298,
+ "grad_norm": 3.494358539581299,
+ "learning_rate": 4.969269626288946e-06,
+ "loss": 0.7103,
+ "step": 729
+ },
+ {
+ "epoch": 0.34515366430260047,
+ "grad_norm": 2.9753928184509277,
+ "learning_rate": 4.969172038249855e-06,
+ "loss": 0.6911,
+ "step": 730
+ },
+ {
+ "epoch": 0.34562647754137116,
+ "grad_norm": 3.2885913848876953,
+ "learning_rate": 4.969074296466247e-06,
+ "loss": 0.6968,
+ "step": 731
+ },
+ {
+ "epoch": 0.34609929078014184,
+ "grad_norm": 2.7564568519592285,
+ "learning_rate": 4.968976400944211e-06,
+ "loss": 0.6843,
+ "step": 732
+ },
+ {
+ "epoch": 0.34657210401891253,
+ "grad_norm": 2.9255006313323975,
+ "learning_rate": 4.96887835168984e-06,
+ "loss": 0.6024,
+ "step": 733
+ },
+ {
+ "epoch": 0.3470449172576832,
+ "grad_norm": 3.1808290481567383,
+ "learning_rate": 4.968780148709239e-06,
+ "loss": 0.7377,
+ "step": 734
+ },
+ {
+ "epoch": 0.3475177304964539,
+ "grad_norm": 2.956666946411133,
+ "learning_rate": 4.968681792008523e-06,
+ "loss": 0.65,
+ "step": 735
+ },
+ {
+ "epoch": 0.3479905437352246,
+ "grad_norm": 2.9631855487823486,
+ "learning_rate": 4.9685832815938175e-06,
+ "loss": 0.677,
+ "step": 736
+ },
+ {
+ "epoch": 0.3484633569739953,
+ "grad_norm": 2.501917600631714,
+ "learning_rate": 4.968484617471256e-06,
+ "loss": 0.6282,
+ "step": 737
+ },
+ {
+ "epoch": 0.34893617021276596,
+ "grad_norm": 2.750779628753662,
+ "learning_rate": 4.968385799646981e-06,
+ "loss": 0.6507,
+ "step": 738
+ },
+ {
+ "epoch": 0.34940898345153665,
+ "grad_norm": 2.872300624847412,
+ "learning_rate": 4.968286828127146e-06,
+ "loss": 0.5949,
+ "step": 739
+ },
+ {
+ "epoch": 0.34988179669030733,
+ "grad_norm": 2.6316142082214355,
+ "learning_rate": 4.9681877029179124e-06,
+ "loss": 0.6328,
+ "step": 740
+ },
+ {
+ "epoch": 0.350354609929078,
+ "grad_norm": 3.244364023208618,
+ "learning_rate": 4.968088424025454e-06,
+ "loss": 0.7393,
+ "step": 741
+ },
+ {
+ "epoch": 0.3508274231678487,
+ "grad_norm": 2.620465040206909,
+ "learning_rate": 4.967988991455951e-06,
+ "loss": 0.6797,
+ "step": 742
+ },
+ {
+ "epoch": 0.3513002364066194,
+ "grad_norm": 2.854513645172119,
+ "learning_rate": 4.967889405215596e-06,
+ "loss": 0.6368,
+ "step": 743
+ },
+ {
+ "epoch": 0.3517730496453901,
+ "grad_norm": 2.579854726791382,
+ "learning_rate": 4.9677896653105886e-06,
+ "loss": 0.6489,
+ "step": 744
+ },
+ {
+ "epoch": 0.35224586288416077,
+ "grad_norm": 3.0697381496429443,
+ "learning_rate": 4.96768977174714e-06,
+ "loss": 0.6313,
+ "step": 745
+ },
+ {
+ "epoch": 0.35271867612293145,
+ "grad_norm": 3.369338035583496,
+ "learning_rate": 4.96758972453147e-06,
+ "loss": 0.7416,
+ "step": 746
+ },
+ {
+ "epoch": 0.35319148936170214,
+ "grad_norm": 2.836221933364868,
+ "learning_rate": 4.967489523669807e-06,
+ "loss": 0.6422,
+ "step": 747
+ },
+ {
+ "epoch": 0.3536643026004728,
+ "grad_norm": 2.929579496383667,
+ "learning_rate": 4.967389169168392e-06,
+ "loss": 0.6482,
+ "step": 748
+ },
+ {
+ "epoch": 0.3541371158392435,
+ "grad_norm": 2.9243831634521484,
+ "learning_rate": 4.967288661033472e-06,
+ "loss": 0.5813,
+ "step": 749
+ },
+ {
+ "epoch": 0.3546099290780142,
+ "grad_norm": 3.7555336952209473,
+ "learning_rate": 4.967187999271306e-06,
+ "loss": 0.6501,
+ "step": 750
+ },
+ {
+ "epoch": 0.3550827423167849,
+ "grad_norm": 3.4279143810272217,
+ "learning_rate": 4.9670871838881615e-06,
+ "loss": 0.6326,
+ "step": 751
+ },
+ {
+ "epoch": 0.35555555555555557,
+ "grad_norm": 2.875066041946411,
+ "learning_rate": 4.9669862148903166e-06,
+ "loss": 0.664,
+ "step": 752
+ },
+ {
+ "epoch": 0.35602836879432626,
+ "grad_norm": 3.130394697189331,
+ "learning_rate": 4.966885092284057e-06,
+ "loss": 0.706,
+ "step": 753
+ },
+ {
+ "epoch": 0.35650118203309694,
+ "grad_norm": 2.9606287479400635,
+ "learning_rate": 4.96678381607568e-06,
+ "loss": 0.693,
+ "step": 754
+ },
+ {
+ "epoch": 0.35697399527186763,
+ "grad_norm": 3.0584909915924072,
+ "learning_rate": 4.966682386271491e-06,
+ "loss": 0.6034,
+ "step": 755
+ },
+ {
+ "epoch": 0.3574468085106383,
+ "grad_norm": 2.8215200901031494,
+ "learning_rate": 4.966580802877805e-06,
+ "loss": 0.6217,
+ "step": 756
+ },
+ {
+ "epoch": 0.357919621749409,
+ "grad_norm": 2.7348055839538574,
+ "learning_rate": 4.966479065900949e-06,
+ "loss": 0.6194,
+ "step": 757
+ },
+ {
+ "epoch": 0.3583924349881797,
+ "grad_norm": 3.2347466945648193,
+ "learning_rate": 4.966377175347257e-06,
+ "loss": 0.6377,
+ "step": 758
+ },
+ {
+ "epoch": 0.3588652482269504,
+ "grad_norm": 3.311845302581787,
+ "learning_rate": 4.966275131223072e-06,
+ "loss": 0.6234,
+ "step": 759
+ },
+ {
+ "epoch": 0.35933806146572106,
+ "grad_norm": 3.0384368896484375,
+ "learning_rate": 4.96617293353475e-06,
+ "loss": 0.609,
+ "step": 760
+ },
+ {
+ "epoch": 0.35981087470449175,
+ "grad_norm": 3.516854763031006,
+ "learning_rate": 4.966070582288653e-06,
+ "loss": 0.6627,
+ "step": 761
+ },
+ {
+ "epoch": 0.36028368794326243,
+ "grad_norm": 3.2425215244293213,
+ "learning_rate": 4.9659680774911534e-06,
+ "loss": 0.7355,
+ "step": 762
+ },
+ {
+ "epoch": 0.3607565011820331,
+ "grad_norm": 3.2665750980377197,
+ "learning_rate": 4.965865419148636e-06,
+ "loss": 0.6787,
+ "step": 763
+ },
+ {
+ "epoch": 0.3612293144208038,
+ "grad_norm": 2.729428291320801,
+ "learning_rate": 4.96576260726749e-06,
+ "loss": 0.6272,
+ "step": 764
+ },
+ {
+ "epoch": 0.3617021276595745,
+ "grad_norm": 3.299969434738159,
+ "learning_rate": 4.965659641854119e-06,
+ "loss": 0.6552,
+ "step": 765
+ },
+ {
+ "epoch": 0.3621749408983452,
+ "grad_norm": 2.7090916633605957,
+ "learning_rate": 4.965556522914934e-06,
+ "loss": 0.6661,
+ "step": 766
+ },
+ {
+ "epoch": 0.36264775413711586,
+ "grad_norm": 2.488846778869629,
+ "learning_rate": 4.965453250456355e-06,
+ "loss": 0.5821,
+ "step": 767
+ },
+ {
+ "epoch": 0.36312056737588655,
+ "grad_norm": 2.5267233848571777,
+ "learning_rate": 4.965349824484813e-06,
+ "loss": 0.5593,
+ "step": 768
+ },
+ {
+ "epoch": 0.36359338061465724,
+ "grad_norm": 3.0646679401397705,
+ "learning_rate": 4.965246245006748e-06,
+ "loss": 0.6341,
+ "step": 769
+ },
+ {
+ "epoch": 0.3640661938534279,
+ "grad_norm": 2.9877712726593018,
+ "learning_rate": 4.965142512028609e-06,
+ "loss": 0.7202,
+ "step": 770
+ },
+ {
+ "epoch": 0.3645390070921986,
+ "grad_norm": 3.7494113445281982,
+ "learning_rate": 4.965038625556854e-06,
+ "loss": 0.7643,
+ "step": 771
+ },
+ {
+ "epoch": 0.3650118203309693,
+ "grad_norm": 2.8382890224456787,
+ "learning_rate": 4.964934585597954e-06,
+ "loss": 0.6522,
+ "step": 772
+ },
+ {
+ "epoch": 0.3654846335697399,
+ "grad_norm": 3.091655731201172,
+ "learning_rate": 4.9648303921583854e-06,
+ "loss": 0.7117,
+ "step": 773
+ },
+ {
+ "epoch": 0.3659574468085106,
+ "grad_norm": 3.0608325004577637,
+ "learning_rate": 4.964726045244635e-06,
+ "loss": 0.6538,
+ "step": 774
+ },
+ {
+ "epoch": 0.3664302600472813,
+ "grad_norm": 2.8492867946624756,
+ "learning_rate": 4.964621544863203e-06,
+ "loss": 0.6079,
+ "step": 775
+ },
+ {
+ "epoch": 0.366903073286052,
+ "grad_norm": 3.0669894218444824,
+ "learning_rate": 4.964516891020594e-06,
+ "loss": 0.6223,
+ "step": 776
+ },
+ {
+ "epoch": 0.36737588652482267,
+ "grad_norm": 3.089984893798828,
+ "learning_rate": 4.964412083723325e-06,
+ "loss": 0.671,
+ "step": 777
+ },
+ {
+ "epoch": 0.36784869976359336,
+ "grad_norm": 2.905242443084717,
+ "learning_rate": 4.964307122977921e-06,
+ "loss": 0.62,
+ "step": 778
+ },
+ {
+ "epoch": 0.36832151300236404,
+ "grad_norm": 3.954436779022217,
+ "learning_rate": 4.964202008790918e-06,
+ "loss": 0.6535,
+ "step": 779
+ },
+ {
+ "epoch": 0.36879432624113473,
+ "grad_norm": 2.6026058197021484,
+ "learning_rate": 4.9640967411688615e-06,
+ "loss": 0.5865,
+ "step": 780
+ },
+ {
+ "epoch": 0.3692671394799054,
+ "grad_norm": 2.9876346588134766,
+ "learning_rate": 4.963991320118306e-06,
+ "loss": 0.6698,
+ "step": 781
+ },
+ {
+ "epoch": 0.3697399527186761,
+ "grad_norm": 2.9411263465881348,
+ "learning_rate": 4.963885745645815e-06,
+ "loss": 0.6173,
+ "step": 782
+ },
+ {
+ "epoch": 0.3702127659574468,
+ "grad_norm": 2.5679805278778076,
+ "learning_rate": 4.963780017757962e-06,
+ "loss": 0.6285,
+ "step": 783
+ },
+ {
+ "epoch": 0.3706855791962175,
+ "grad_norm": 3.3100640773773193,
+ "learning_rate": 4.963674136461332e-06,
+ "loss": 0.5968,
+ "step": 784
+ },
+ {
+ "epoch": 0.37115839243498816,
+ "grad_norm": 3.1293699741363525,
+ "learning_rate": 4.963568101762515e-06,
+ "loss": 0.697,
+ "step": 785
+ },
+ {
+ "epoch": 0.37163120567375885,
+ "grad_norm": 3.043853759765625,
+ "learning_rate": 4.963461913668115e-06,
+ "loss": 0.5881,
+ "step": 786
+ },
+ {
+ "epoch": 0.37210401891252953,
+ "grad_norm": 3.07351016998291,
+ "learning_rate": 4.963355572184744e-06,
+ "loss": 0.6307,
+ "step": 787
+ },
+ {
+ "epoch": 0.3725768321513002,
+ "grad_norm": 2.7381317615509033,
+ "learning_rate": 4.9632490773190225e-06,
+ "loss": 0.716,
+ "step": 788
+ },
+ {
+ "epoch": 0.3730496453900709,
+ "grad_norm": 2.892221450805664,
+ "learning_rate": 4.963142429077582e-06,
+ "loss": 0.6867,
+ "step": 789
+ },
+ {
+ "epoch": 0.3735224586288416,
+ "grad_norm": 3.133122205734253,
+ "learning_rate": 4.963035627467064e-06,
+ "loss": 0.659,
+ "step": 790
+ },
+ {
+ "epoch": 0.3739952718676123,
+ "grad_norm": 3.032599925994873,
+ "learning_rate": 4.962928672494116e-06,
+ "loss": 0.6848,
+ "step": 791
+ },
+ {
+ "epoch": 0.37446808510638296,
+ "grad_norm": 3.0076355934143066,
+ "learning_rate": 4.9628215641654e-06,
+ "loss": 0.6549,
+ "step": 792
+ },
+ {
+ "epoch": 0.37494089834515365,
+ "grad_norm": 2.8904454708099365,
+ "learning_rate": 4.962714302487585e-06,
+ "loss": 0.6484,
+ "step": 793
+ },
+ {
+ "epoch": 0.37541371158392434,
+ "grad_norm": 2.881364107131958,
+ "learning_rate": 4.9626068874673486e-06,
+ "loss": 0.721,
+ "step": 794
+ },
+ {
+ "epoch": 0.375886524822695,
+ "grad_norm": 3.11668062210083,
+ "learning_rate": 4.962499319111379e-06,
+ "loss": 0.7824,
+ "step": 795
+ },
+ {
+ "epoch": 0.3763593380614657,
+ "grad_norm": 2.9201436042785645,
+ "learning_rate": 4.962391597426374e-06,
+ "loss": 0.6911,
+ "step": 796
+ },
+ {
+ "epoch": 0.3768321513002364,
+ "grad_norm": 2.926598072052002,
+ "learning_rate": 4.962283722419043e-06,
+ "loss": 0.6715,
+ "step": 797
+ },
+ {
+ "epoch": 0.3773049645390071,
+ "grad_norm": 2.7267675399780273,
+ "learning_rate": 4.962175694096101e-06,
+ "loss": 0.6111,
+ "step": 798
+ },
+ {
+ "epoch": 0.37777777777777777,
+ "grad_norm": 3.194031000137329,
+ "learning_rate": 4.962067512464275e-06,
+ "loss": 0.6558,
+ "step": 799
+ },
+ {
+ "epoch": 0.37825059101654845,
+ "grad_norm": 2.6249136924743652,
+ "learning_rate": 4.9619591775303e-06,
+ "loss": 0.6166,
+ "step": 800
+ },
+ {
+ "epoch": 0.37872340425531914,
+ "grad_norm": 2.6356167793273926,
+ "learning_rate": 4.961850689300923e-06,
+ "loss": 0.6112,
+ "step": 801
+ },
+ {
+ "epoch": 0.3791962174940898,
+ "grad_norm": 3.030724287033081,
+ "learning_rate": 4.961742047782898e-06,
+ "loss": 0.6511,
+ "step": 802
+ },
+ {
+ "epoch": 0.3796690307328605,
+ "grad_norm": 3.4987757205963135,
+ "learning_rate": 4.96163325298299e-06,
+ "loss": 0.5888,
+ "step": 803
+ },
+ {
+ "epoch": 0.3801418439716312,
+ "grad_norm": 3.0371780395507812,
+ "learning_rate": 4.961524304907974e-06,
+ "loss": 0.6385,
+ "step": 804
+ },
+ {
+ "epoch": 0.3806146572104019,
+ "grad_norm": 3.302570104598999,
+ "learning_rate": 4.961415203564632e-06,
+ "loss": 0.6515,
+ "step": 805
+ },
+ {
+ "epoch": 0.38108747044917257,
+ "grad_norm": 2.7597038745880127,
+ "learning_rate": 4.961305948959759e-06,
+ "loss": 0.6126,
+ "step": 806
+ },
+ {
+ "epoch": 0.38156028368794326,
+ "grad_norm": 2.789811849594116,
+ "learning_rate": 4.9611965411001575e-06,
+ "loss": 0.6601,
+ "step": 807
+ },
+ {
+ "epoch": 0.38203309692671394,
+ "grad_norm": 3.0403921604156494,
+ "learning_rate": 4.961086979992639e-06,
+ "loss": 0.6947,
+ "step": 808
+ },
+ {
+ "epoch": 0.38250591016548463,
+ "grad_norm": 3.2139980792999268,
+ "learning_rate": 4.960977265644026e-06,
+ "loss": 0.6876,
+ "step": 809
+ },
+ {
+ "epoch": 0.3829787234042553,
+ "grad_norm": 2.918515205383301,
+ "learning_rate": 4.960867398061149e-06,
+ "loss": 0.5997,
+ "step": 810
+ },
+ {
+ "epoch": 0.383451536643026,
+ "grad_norm": 3.197636604309082,
+ "learning_rate": 4.9607573772508495e-06,
+ "loss": 0.5754,
+ "step": 811
+ },
+ {
+ "epoch": 0.3839243498817967,
+ "grad_norm": 2.8848466873168945,
+ "learning_rate": 4.960647203219979e-06,
+ "loss": 0.6424,
+ "step": 812
+ },
+ {
+ "epoch": 0.3843971631205674,
+ "grad_norm": 3.4810187816619873,
+ "learning_rate": 4.960536875975397e-06,
+ "loss": 0.6851,
+ "step": 813
+ },
+ {
+ "epoch": 0.38486997635933806,
+ "grad_norm": 3.713934898376465,
+ "learning_rate": 4.960426395523972e-06,
+ "loss": 0.6122,
+ "step": 814
+ },
+ {
+ "epoch": 0.38534278959810875,
+ "grad_norm": 2.862600803375244,
+ "learning_rate": 4.960315761872585e-06,
+ "loss": 0.6493,
+ "step": 815
+ },
+ {
+ "epoch": 0.38581560283687943,
+ "grad_norm": 3.133882522583008,
+ "learning_rate": 4.960204975028123e-06,
+ "loss": 0.7535,
+ "step": 816
+ },
+ {
+ "epoch": 0.3862884160756501,
+ "grad_norm": 3.1526732444763184,
+ "learning_rate": 4.960094034997485e-06,
+ "loss": 0.6512,
+ "step": 817
+ },
+ {
+ "epoch": 0.3867612293144208,
+ "grad_norm": 2.7213544845581055,
+ "learning_rate": 4.959982941787579e-06,
+ "loss": 0.6121,
+ "step": 818
+ },
+ {
+ "epoch": 0.3872340425531915,
+ "grad_norm": 3.4935851097106934,
+ "learning_rate": 4.9598716954053214e-06,
+ "loss": 0.7852,
+ "step": 819
+ },
+ {
+ "epoch": 0.3877068557919622,
+ "grad_norm": 2.691016435623169,
+ "learning_rate": 4.9597602958576395e-06,
+ "loss": 0.6861,
+ "step": 820
+ },
+ {
+ "epoch": 0.38817966903073287,
+ "grad_norm": 2.8621015548706055,
+ "learning_rate": 4.959648743151469e-06,
+ "loss": 0.6262,
+ "step": 821
+ },
+ {
+ "epoch": 0.38865248226950355,
+ "grad_norm": 3.3887462615966797,
+ "learning_rate": 4.959537037293758e-06,
+ "loss": 0.7103,
+ "step": 822
+ },
+ {
+ "epoch": 0.38912529550827424,
+ "grad_norm": 2.7565438747406006,
+ "learning_rate": 4.95942517829146e-06,
+ "loss": 0.6471,
+ "step": 823
+ },
+ {
+ "epoch": 0.3895981087470449,
+ "grad_norm": 2.7920358180999756,
+ "learning_rate": 4.959313166151541e-06,
+ "loss": 0.6239,
+ "step": 824
+ },
+ {
+ "epoch": 0.3900709219858156,
+ "grad_norm": 3.18904185295105,
+ "learning_rate": 4.959201000880973e-06,
+ "loss": 0.7461,
+ "step": 825
+ },
+ {
+ "epoch": 0.3905437352245863,
+ "grad_norm": 2.727872371673584,
+ "learning_rate": 4.959088682486743e-06,
+ "loss": 0.6333,
+ "step": 826
+ },
+ {
+ "epoch": 0.391016548463357,
+ "grad_norm": 2.906378746032715,
+ "learning_rate": 4.958976210975844e-06,
+ "loss": 0.7547,
+ "step": 827
+ },
+ {
+ "epoch": 0.39148936170212767,
+ "grad_norm": 2.96482515335083,
+ "learning_rate": 4.958863586355278e-06,
+ "loss": 0.6312,
+ "step": 828
+ },
+ {
+ "epoch": 0.39196217494089836,
+ "grad_norm": 3.2890889644622803,
+ "learning_rate": 4.958750808632059e-06,
+ "loss": 0.6943,
+ "step": 829
+ },
+ {
+ "epoch": 0.39243498817966904,
+ "grad_norm": 2.7004311084747314,
+ "learning_rate": 4.958637877813207e-06,
+ "loss": 0.5918,
+ "step": 830
+ },
+ {
+ "epoch": 0.39290780141843973,
+ "grad_norm": 2.7487950325012207,
+ "learning_rate": 4.9585247939057566e-06,
+ "loss": 0.6201,
+ "step": 831
+ },
+ {
+ "epoch": 0.3933806146572104,
+ "grad_norm": 2.7873897552490234,
+ "learning_rate": 4.958411556916747e-06,
+ "loss": 0.6268,
+ "step": 832
+ },
+ {
+ "epoch": 0.3938534278959811,
+ "grad_norm": 2.8501343727111816,
+ "learning_rate": 4.958298166853229e-06,
+ "loss": 0.7119,
+ "step": 833
+ },
+ {
+ "epoch": 0.3943262411347518,
+ "grad_norm": 3.0391547679901123,
+ "learning_rate": 4.958184623722265e-06,
+ "loss": 0.6375,
+ "step": 834
+ },
+ {
+ "epoch": 0.3947990543735225,
+ "grad_norm": 2.850520133972168,
+ "learning_rate": 4.958070927530922e-06,
+ "loss": 0.5962,
+ "step": 835
+ },
+ {
+ "epoch": 0.39527186761229316,
+ "grad_norm": 3.351914644241333,
+ "learning_rate": 4.957957078286281e-06,
+ "loss": 0.7247,
+ "step": 836
+ },
+ {
+ "epoch": 0.39574468085106385,
+ "grad_norm": 2.9559543132781982,
+ "learning_rate": 4.957843075995431e-06,
+ "loss": 0.6571,
+ "step": 837
+ },
+ {
+ "epoch": 0.39621749408983453,
+ "grad_norm": 3.225785255432129,
+ "learning_rate": 4.95772892066547e-06,
+ "loss": 0.7074,
+ "step": 838
+ },
+ {
+ "epoch": 0.3966903073286052,
+ "grad_norm": 2.7842373847961426,
+ "learning_rate": 4.957614612303505e-06,
+ "loss": 0.6469,
+ "step": 839
+ },
+ {
+ "epoch": 0.3971631205673759,
+ "grad_norm": 4.249724864959717,
+ "learning_rate": 4.957500150916655e-06,
+ "loss": 0.741,
+ "step": 840
+ },
+ {
+ "epoch": 0.3976359338061466,
+ "grad_norm": 3.138221263885498,
+ "learning_rate": 4.957385536512046e-06,
+ "loss": 0.6676,
+ "step": 841
+ },
+ {
+ "epoch": 0.3981087470449173,
+ "grad_norm": 3.456423759460449,
+ "learning_rate": 4.957270769096816e-06,
+ "loss": 0.6877,
+ "step": 842
+ },
+ {
+ "epoch": 0.39858156028368796,
+ "grad_norm": 2.8676278591156006,
+ "learning_rate": 4.957155848678109e-06,
+ "loss": 0.5986,
+ "step": 843
+ },
+ {
+ "epoch": 0.39905437352245865,
+ "grad_norm": 2.705324411392212,
+ "learning_rate": 4.957040775263082e-06,
+ "loss": 0.6356,
+ "step": 844
+ },
+ {
+ "epoch": 0.39952718676122934,
+ "grad_norm": 3.0767486095428467,
+ "learning_rate": 4.9569255488589e-06,
+ "loss": 0.6844,
+ "step": 845
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 2.7787704467773438,
+ "learning_rate": 4.956810169472736e-06,
+ "loss": 0.6641,
+ "step": 846
+ },
+ {
+ "epoch": 0.4004728132387707,
+ "grad_norm": 2.584277868270874,
+ "learning_rate": 4.956694637111777e-06,
+ "loss": 0.6256,
+ "step": 847
+ },
+ {
+ "epoch": 0.4009456264775414,
+ "grad_norm": 2.751641273498535,
+ "learning_rate": 4.956578951783215e-06,
+ "loss": 0.5954,
+ "step": 848
+ },
+ {
+ "epoch": 0.4014184397163121,
+ "grad_norm": 3.0181658267974854,
+ "learning_rate": 4.956463113494253e-06,
+ "loss": 0.6569,
+ "step": 849
+ },
+ {
+ "epoch": 0.40189125295508277,
+ "grad_norm": 3.0933220386505127,
+ "learning_rate": 4.956347122252104e-06,
+ "loss": 0.6248,
+ "step": 850
+ },
+ {
+ "epoch": 0.40236406619385345,
+ "grad_norm": 3.3767428398132324,
+ "learning_rate": 4.956230978063991e-06,
+ "loss": 0.719,
+ "step": 851
+ },
+ {
+ "epoch": 0.40283687943262414,
+ "grad_norm": 3.7666573524475098,
+ "learning_rate": 4.956114680937145e-06,
+ "loss": 0.6467,
+ "step": 852
+ },
+ {
+ "epoch": 0.4033096926713948,
+ "grad_norm": 2.9836843013763428,
+ "learning_rate": 4.955998230878808e-06,
+ "loss": 0.6993,
+ "step": 853
+ },
+ {
+ "epoch": 0.4037825059101655,
+ "grad_norm": 2.981497049331665,
+ "learning_rate": 4.955881627896229e-06,
+ "loss": 0.6578,
+ "step": 854
+ },
+ {
+ "epoch": 0.40425531914893614,
+ "grad_norm": 3.1369056701660156,
+ "learning_rate": 4.955764871996672e-06,
+ "loss": 0.6763,
+ "step": 855
+ },
+ {
+ "epoch": 0.40472813238770683,
+ "grad_norm": 2.7675817012786865,
+ "learning_rate": 4.9556479631874036e-06,
+ "loss": 0.6488,
+ "step": 856
+ },
+ {
+ "epoch": 0.4052009456264775,
+ "grad_norm": 3.035334825515747,
+ "learning_rate": 4.9555309014757034e-06,
+ "loss": 0.7076,
+ "step": 857
+ },
+ {
+ "epoch": 0.4056737588652482,
+ "grad_norm": 3.493704319000244,
+ "learning_rate": 4.955413686868862e-06,
+ "loss": 0.6773,
+ "step": 858
+ },
+ {
+ "epoch": 0.4061465721040189,
+ "grad_norm": 3.245487928390503,
+ "learning_rate": 4.9552963193741765e-06,
+ "loss": 0.6915,
+ "step": 859
+ },
+ {
+ "epoch": 0.4066193853427896,
+ "grad_norm": 3.189969539642334,
+ "learning_rate": 4.955178798998956e-06,
+ "loss": 0.7318,
+ "step": 860
+ },
+ {
+ "epoch": 0.40709219858156026,
+ "grad_norm": 2.7987146377563477,
+ "learning_rate": 4.955061125750517e-06,
+ "loss": 0.6162,
+ "step": 861
+ },
+ {
+ "epoch": 0.40756501182033095,
+ "grad_norm": 3.020118474960327,
+ "learning_rate": 4.954943299636187e-06,
+ "loss": 0.6678,
+ "step": 862
+ },
+ {
+ "epoch": 0.40803782505910163,
+ "grad_norm": 2.715463876724243,
+ "learning_rate": 4.954825320663302e-06,
+ "loss": 0.668,
+ "step": 863
+ },
+ {
+ "epoch": 0.4085106382978723,
+ "grad_norm": 2.595050096511841,
+ "learning_rate": 4.9547071888392085e-06,
+ "loss": 0.6557,
+ "step": 864
+ },
+ {
+ "epoch": 0.408983451536643,
+ "grad_norm": 3.131596088409424,
+ "learning_rate": 4.954588904171261e-06,
+ "loss": 0.6548,
+ "step": 865
+ },
+ {
+ "epoch": 0.4094562647754137,
+ "grad_norm": 2.5742313861846924,
+ "learning_rate": 4.954470466666827e-06,
+ "loss": 0.6592,
+ "step": 866
+ },
+ {
+ "epoch": 0.4099290780141844,
+ "grad_norm": 2.8612802028656006,
+ "learning_rate": 4.9543518763332785e-06,
+ "loss": 0.5391,
+ "step": 867
+ },
+ {
+ "epoch": 0.41040189125295506,
+ "grad_norm": 2.8973186016082764,
+ "learning_rate": 4.954233133178001e-06,
+ "loss": 0.6649,
+ "step": 868
+ },
+ {
+ "epoch": 0.41087470449172575,
+ "grad_norm": 2.802525043487549,
+ "learning_rate": 4.954114237208388e-06,
+ "loss": 0.6212,
+ "step": 869
+ },
+ {
+ "epoch": 0.41134751773049644,
+ "grad_norm": 2.5919506549835205,
+ "learning_rate": 4.953995188431843e-06,
+ "loss": 0.6596,
+ "step": 870
+ },
+ {
+ "epoch": 0.4118203309692671,
+ "grad_norm": 3.139169454574585,
+ "learning_rate": 4.953875986855777e-06,
+ "loss": 0.6799,
+ "step": 871
+ },
+ {
+ "epoch": 0.4122931442080378,
+ "grad_norm": 3.99727725982666,
+ "learning_rate": 4.953756632487614e-06,
+ "loss": 0.6519,
+ "step": 872
+ },
+ {
+ "epoch": 0.4127659574468085,
+ "grad_norm": 3.238706350326538,
+ "learning_rate": 4.953637125334784e-06,
+ "loss": 0.7361,
+ "step": 873
+ },
+ {
+ "epoch": 0.4132387706855792,
+ "grad_norm": 2.780019998550415,
+ "learning_rate": 4.9535174654047295e-06,
+ "loss": 0.6406,
+ "step": 874
+ },
+ {
+ "epoch": 0.41371158392434987,
+ "grad_norm": 2.7629551887512207,
+ "learning_rate": 4.953397652704901e-06,
+ "loss": 0.6131,
+ "step": 875
+ },
+ {
+ "epoch": 0.41418439716312055,
+ "grad_norm": 2.8008246421813965,
+ "learning_rate": 4.9532776872427585e-06,
+ "loss": 0.6464,
+ "step": 876
+ },
+ {
+ "epoch": 0.41465721040189124,
+ "grad_norm": 3.0970115661621094,
+ "learning_rate": 4.953157569025772e-06,
+ "loss": 0.7066,
+ "step": 877
+ },
+ {
+ "epoch": 0.4151300236406619,
+ "grad_norm": 2.8375589847564697,
+ "learning_rate": 4.9530372980614195e-06,
+ "loss": 0.6551,
+ "step": 878
+ },
+ {
+ "epoch": 0.4156028368794326,
+ "grad_norm": 2.718843936920166,
+ "learning_rate": 4.952916874357191e-06,
+ "loss": 0.5947,
+ "step": 879
+ },
+ {
+ "epoch": 0.4160756501182033,
+ "grad_norm": 2.7104697227478027,
+ "learning_rate": 4.952796297920585e-06,
+ "loss": 0.6708,
+ "step": 880
+ },
+ {
+ "epoch": 0.416548463356974,
+ "grad_norm": 2.8223445415496826,
+ "learning_rate": 4.952675568759108e-06,
+ "loss": 0.6214,
+ "step": 881
+ },
+ {
+ "epoch": 0.41702127659574467,
+ "grad_norm": 2.6598153114318848,
+ "learning_rate": 4.952554686880279e-06,
+ "loss": 0.6116,
+ "step": 882
+ },
+ {
+ "epoch": 0.41749408983451536,
+ "grad_norm": 2.8639824390411377,
+ "learning_rate": 4.952433652291623e-06,
+ "loss": 0.5971,
+ "step": 883
+ },
+ {
+ "epoch": 0.41796690307328604,
+ "grad_norm": 2.9578304290771484,
+ "learning_rate": 4.952312465000677e-06,
+ "loss": 0.6785,
+ "step": 884
+ },
+ {
+ "epoch": 0.41843971631205673,
+ "grad_norm": 2.872144937515259,
+ "learning_rate": 4.952191125014987e-06,
+ "loss": 0.6772,
+ "step": 885
+ },
+ {
+ "epoch": 0.4189125295508274,
+ "grad_norm": 2.7513675689697266,
+ "learning_rate": 4.952069632342108e-06,
+ "loss": 0.702,
+ "step": 886
+ },
+ {
+ "epoch": 0.4193853427895981,
+ "grad_norm": 2.9275078773498535,
+ "learning_rate": 4.951947986989606e-06,
+ "loss": 0.589,
+ "step": 887
+ },
+ {
+ "epoch": 0.4198581560283688,
+ "grad_norm": 2.740549325942993,
+ "learning_rate": 4.951826188965053e-06,
+ "loss": 0.5942,
+ "step": 888
+ },
+ {
+ "epoch": 0.4203309692671395,
+ "grad_norm": 2.92452073097229,
+ "learning_rate": 4.951704238276035e-06,
+ "loss": 0.6819,
+ "step": 889
+ },
+ {
+ "epoch": 0.42080378250591016,
+ "grad_norm": 2.842491865158081,
+ "learning_rate": 4.951582134930144e-06,
+ "loss": 0.6304,
+ "step": 890
+ },
+ {
+ "epoch": 0.42127659574468085,
+ "grad_norm": 2.613478422164917,
+ "learning_rate": 4.951459878934983e-06,
+ "loss": 0.6912,
+ "step": 891
+ },
+ {
+ "epoch": 0.42174940898345153,
+ "grad_norm": 3.2408607006073,
+ "learning_rate": 4.951337470298165e-06,
+ "loss": 0.6755,
+ "step": 892
+ },
+ {
+ "epoch": 0.4222222222222222,
+ "grad_norm": 3.1022439002990723,
+ "learning_rate": 4.9512149090273125e-06,
+ "loss": 0.6138,
+ "step": 893
+ },
+ {
+ "epoch": 0.4226950354609929,
+ "grad_norm": 2.6418895721435547,
+ "learning_rate": 4.951092195130055e-06,
+ "loss": 0.639,
+ "step": 894
+ },
+ {
+ "epoch": 0.4231678486997636,
+ "grad_norm": 3.010744333267212,
+ "learning_rate": 4.950969328614035e-06,
+ "loss": 0.7102,
+ "step": 895
+ },
+ {
+ "epoch": 0.4236406619385343,
+ "grad_norm": 2.673292636871338,
+ "learning_rate": 4.950846309486901e-06,
+ "loss": 0.5676,
+ "step": 896
+ },
+ {
+ "epoch": 0.42411347517730497,
+ "grad_norm": 3.6974737644195557,
+ "learning_rate": 4.950723137756314e-06,
+ "loss": 0.5722,
+ "step": 897
+ },
+ {
+ "epoch": 0.42458628841607565,
+ "grad_norm": 3.69028902053833,
+ "learning_rate": 4.9505998134299435e-06,
+ "loss": 0.6337,
+ "step": 898
+ },
+ {
+ "epoch": 0.42505910165484634,
+ "grad_norm": 3.2136125564575195,
+ "learning_rate": 4.950476336515469e-06,
+ "loss": 0.6469,
+ "step": 899
+ },
+ {
+ "epoch": 0.425531914893617,
+ "grad_norm": 2.7396016120910645,
+ "learning_rate": 4.950352707020577e-06,
+ "loss": 0.6656,
+ "step": 900
+ },
+ {
+ "epoch": 0.4260047281323877,
+ "grad_norm": 2.825416088104248,
+ "learning_rate": 4.950228924952967e-06,
+ "loss": 0.6298,
+ "step": 901
+ },
+ {
+ "epoch": 0.4264775413711584,
+ "grad_norm": 3.401658535003662,
+ "learning_rate": 4.950104990320345e-06,
+ "loss": 0.778,
+ "step": 902
+ },
+ {
+ "epoch": 0.4269503546099291,
+ "grad_norm": 2.7002272605895996,
+ "learning_rate": 4.9499809031304294e-06,
+ "loss": 0.6536,
+ "step": 903
+ },
+ {
+ "epoch": 0.42742316784869977,
+ "grad_norm": 2.62386417388916,
+ "learning_rate": 4.949856663390945e-06,
+ "loss": 0.6629,
+ "step": 904
+ },
+ {
+ "epoch": 0.42789598108747046,
+ "grad_norm": 2.584247589111328,
+ "learning_rate": 4.94973227110963e-06,
+ "loss": 0.5813,
+ "step": 905
+ },
+ {
+ "epoch": 0.42836879432624114,
+ "grad_norm": 3.4365768432617188,
+ "learning_rate": 4.9496077262942265e-06,
+ "loss": 0.7648,
+ "step": 906
+ },
+ {
+ "epoch": 0.42884160756501183,
+ "grad_norm": 2.8993639945983887,
+ "learning_rate": 4.949483028952492e-06,
+ "loss": 0.6696,
+ "step": 907
+ },
+ {
+ "epoch": 0.4293144208037825,
+ "grad_norm": 2.922809362411499,
+ "learning_rate": 4.94935817909219e-06,
+ "loss": 0.6892,
+ "step": 908
+ },
+ {
+ "epoch": 0.4297872340425532,
+ "grad_norm": 2.85478138923645,
+ "learning_rate": 4.9492331767210944e-06,
+ "loss": 0.536,
+ "step": 909
+ },
+ {
+ "epoch": 0.4302600472813239,
+ "grad_norm": 2.8639259338378906,
+ "learning_rate": 4.949108021846988e-06,
+ "loss": 0.634,
+ "step": 910
+ },
+ {
+ "epoch": 0.4307328605200946,
+ "grad_norm": 3.0533697605133057,
+ "learning_rate": 4.948982714477664e-06,
+ "loss": 0.6318,
+ "step": 911
+ },
+ {
+ "epoch": 0.43120567375886526,
+ "grad_norm": 2.331674814224243,
+ "learning_rate": 4.9488572546209255e-06,
+ "loss": 0.6562,
+ "step": 912
+ },
+ {
+ "epoch": 0.43167848699763595,
+ "grad_norm": 3.0154623985290527,
+ "learning_rate": 4.9487316422845835e-06,
+ "loss": 0.6675,
+ "step": 913
+ },
+ {
+ "epoch": 0.43215130023640663,
+ "grad_norm": 2.7354514598846436,
+ "learning_rate": 4.948605877476459e-06,
+ "loss": 0.6012,
+ "step": 914
+ },
+ {
+ "epoch": 0.4326241134751773,
+ "grad_norm": 2.863736629486084,
+ "learning_rate": 4.948479960204383e-06,
+ "loss": 0.6062,
+ "step": 915
+ },
+ {
+ "epoch": 0.433096926713948,
+ "grad_norm": 3.01998233795166,
+ "learning_rate": 4.948353890476197e-06,
+ "loss": 0.6749,
+ "step": 916
+ },
+ {
+ "epoch": 0.4335697399527187,
+ "grad_norm": 2.7550456523895264,
+ "learning_rate": 4.94822766829975e-06,
+ "loss": 0.6507,
+ "step": 917
+ },
+ {
+ "epoch": 0.4340425531914894,
+ "grad_norm": 3.370572805404663,
+ "learning_rate": 4.948101293682901e-06,
+ "loss": 0.714,
+ "step": 918
+ },
+ {
+ "epoch": 0.43451536643026006,
+ "grad_norm": 2.9736790657043457,
+ "learning_rate": 4.947974766633519e-06,
+ "loss": 0.729,
+ "step": 919
+ },
+ {
+ "epoch": 0.43498817966903075,
+ "grad_norm": 3.1036548614501953,
+ "learning_rate": 4.947848087159483e-06,
+ "loss": 0.7547,
+ "step": 920
+ },
+ {
+ "epoch": 0.43546099290780144,
+ "grad_norm": 2.895094871520996,
+ "learning_rate": 4.947721255268679e-06,
+ "loss": 0.6089,
+ "step": 921
+ },
+ {
+ "epoch": 0.4359338061465721,
+ "grad_norm": 2.798476219177246,
+ "learning_rate": 4.947594270969005e-06,
+ "loss": 0.5432,
+ "step": 922
+ },
+ {
+ "epoch": 0.4364066193853428,
+ "grad_norm": 2.7675702571868896,
+ "learning_rate": 4.94746713426837e-06,
+ "loss": 0.5693,
+ "step": 923
+ },
+ {
+ "epoch": 0.4368794326241135,
+ "grad_norm": 2.6851553916931152,
+ "learning_rate": 4.947339845174687e-06,
+ "loss": 0.6503,
+ "step": 924
+ },
+ {
+ "epoch": 0.4373522458628842,
+ "grad_norm": 2.909635543823242,
+ "learning_rate": 4.947212403695883e-06,
+ "loss": 0.6494,
+ "step": 925
+ },
+ {
+ "epoch": 0.43782505910165487,
+ "grad_norm": 2.604526996612549,
+ "learning_rate": 4.947084809839894e-06,
+ "loss": 0.6349,
+ "step": 926
+ },
+ {
+ "epoch": 0.43829787234042555,
+ "grad_norm": 3.118149518966675,
+ "learning_rate": 4.946957063614664e-06,
+ "loss": 0.6219,
+ "step": 927
+ },
+ {
+ "epoch": 0.43877068557919624,
+ "grad_norm": 2.7452616691589355,
+ "learning_rate": 4.9468291650281465e-06,
+ "loss": 0.6096,
+ "step": 928
+ },
+ {
+ "epoch": 0.4392434988179669,
+ "grad_norm": 3.30098819732666,
+ "learning_rate": 4.946701114088307e-06,
+ "loss": 0.6277,
+ "step": 929
+ },
+ {
+ "epoch": 0.4397163120567376,
+ "grad_norm": 2.789482593536377,
+ "learning_rate": 4.946572910803116e-06,
+ "loss": 0.7,
+ "step": 930
+ },
+ {
+ "epoch": 0.4401891252955083,
+ "grad_norm": 2.7283935546875,
+ "learning_rate": 4.946444555180559e-06,
+ "loss": 0.5375,
+ "step": 931
+ },
+ {
+ "epoch": 0.440661938534279,
+ "grad_norm": 3.101304054260254,
+ "learning_rate": 4.946316047228627e-06,
+ "loss": 0.6131,
+ "step": 932
+ },
+ {
+ "epoch": 0.44113475177304967,
+ "grad_norm": 3.573908805847168,
+ "learning_rate": 4.946187386955321e-06,
+ "loss": 0.7073,
+ "step": 933
+ },
+ {
+ "epoch": 0.44160756501182036,
+ "grad_norm": 3.214979648590088,
+ "learning_rate": 4.946058574368653e-06,
+ "loss": 0.6508,
+ "step": 934
+ },
+ {
+ "epoch": 0.44208037825059104,
+ "grad_norm": 3.145082712173462,
+ "learning_rate": 4.945929609476643e-06,
+ "loss": 0.64,
+ "step": 935
+ },
+ {
+ "epoch": 0.4425531914893617,
+ "grad_norm": 2.991780996322632,
+ "learning_rate": 4.945800492287321e-06,
+ "loss": 0.6315,
+ "step": 936
+ },
+ {
+ "epoch": 0.44302600472813236,
+ "grad_norm": 3.2441139221191406,
+ "learning_rate": 4.945671222808727e-06,
+ "loss": 0.7144,
+ "step": 937
+ },
+ {
+ "epoch": 0.44349881796690305,
+ "grad_norm": 2.9397029876708984,
+ "learning_rate": 4.94554180104891e-06,
+ "loss": 0.6818,
+ "step": 938
+ },
+ {
+ "epoch": 0.44397163120567373,
+ "grad_norm": 3.2471461296081543,
+ "learning_rate": 4.945412227015929e-06,
+ "loss": 0.6921,
+ "step": 939
+ },
+ {
+ "epoch": 0.4444444444444444,
+ "grad_norm": 3.0882487297058105,
+ "learning_rate": 4.945282500717851e-06,
+ "loss": 0.718,
+ "step": 940
+ },
+ {
+ "epoch": 0.4449172576832151,
+ "grad_norm": 2.6035783290863037,
+ "learning_rate": 4.945152622162753e-06,
+ "loss": 0.621,
+ "step": 941
+ },
+ {
+ "epoch": 0.4453900709219858,
+ "grad_norm": 2.83659029006958,
+ "learning_rate": 4.945022591358724e-06,
+ "loss": 0.6403,
+ "step": 942
+ },
+ {
+ "epoch": 0.4458628841607565,
+ "grad_norm": 2.824463129043579,
+ "learning_rate": 4.944892408313859e-06,
+ "loss": 0.6594,
+ "step": 943
+ },
+ {
+ "epoch": 0.44633569739952716,
+ "grad_norm": 2.753735065460205,
+ "learning_rate": 4.9447620730362645e-06,
+ "loss": 0.6116,
+ "step": 944
+ },
+ {
+ "epoch": 0.44680851063829785,
+ "grad_norm": 3.0659725666046143,
+ "learning_rate": 4.944631585534056e-06,
+ "loss": 0.5983,
+ "step": 945
+ },
+ {
+ "epoch": 0.44728132387706854,
+ "grad_norm": 2.969113349914551,
+ "learning_rate": 4.944500945815357e-06,
+ "loss": 0.6859,
+ "step": 946
+ },
+ {
+ "epoch": 0.4477541371158392,
+ "grad_norm": 2.810303211212158,
+ "learning_rate": 4.944370153888303e-06,
+ "loss": 0.7025,
+ "step": 947
+ },
+ {
+ "epoch": 0.4482269503546099,
+ "grad_norm": 3.027721643447876,
+ "learning_rate": 4.944239209761038e-06,
+ "loss": 0.7268,
+ "step": 948
+ },
+ {
+ "epoch": 0.4486997635933806,
+ "grad_norm": 2.661503314971924,
+ "learning_rate": 4.944108113441716e-06,
+ "loss": 0.6702,
+ "step": 949
+ },
+ {
+ "epoch": 0.4491725768321513,
+ "grad_norm": 2.738591432571411,
+ "learning_rate": 4.943976864938498e-06,
+ "loss": 0.6728,
+ "step": 950
+ },
+ {
+ "epoch": 0.44964539007092197,
+ "grad_norm": 3.447505474090576,
+ "learning_rate": 4.943845464259557e-06,
+ "loss": 0.6586,
+ "step": 951
+ },
+ {
+ "epoch": 0.45011820330969265,
+ "grad_norm": 3.0968854427337646,
+ "learning_rate": 4.943713911413075e-06,
+ "loss": 0.7666,
+ "step": 952
+ },
+ {
+ "epoch": 0.45059101654846334,
+ "grad_norm": 2.4113779067993164,
+ "learning_rate": 4.943582206407244e-06,
+ "loss": 0.6173,
+ "step": 953
+ },
+ {
+ "epoch": 0.451063829787234,
+ "grad_norm": 2.6357979774475098,
+ "learning_rate": 4.943450349250263e-06,
+ "loss": 0.5589,
+ "step": 954
+ },
+ {
+ "epoch": 0.4515366430260047,
+ "grad_norm": 2.9182233810424805,
+ "learning_rate": 4.9433183399503425e-06,
+ "loss": 0.6252,
+ "step": 955
+ },
+ {
+ "epoch": 0.4520094562647754,
+ "grad_norm": 2.832740306854248,
+ "learning_rate": 4.943186178515703e-06,
+ "loss": 0.6882,
+ "step": 956
+ },
+ {
+ "epoch": 0.4524822695035461,
+ "grad_norm": 2.9508981704711914,
+ "learning_rate": 4.943053864954574e-06,
+ "loss": 0.5722,
+ "step": 957
+ },
+ {
+ "epoch": 0.4529550827423168,
+ "grad_norm": 3.044729471206665,
+ "learning_rate": 4.9429213992751925e-06,
+ "loss": 0.6772,
+ "step": 958
+ },
+ {
+ "epoch": 0.45342789598108746,
+ "grad_norm": 2.606003522872925,
+ "learning_rate": 4.9427887814858075e-06,
+ "loss": 0.6445,
+ "step": 959
+ },
+ {
+ "epoch": 0.45390070921985815,
+ "grad_norm": 2.4634225368499756,
+ "learning_rate": 4.942656011594676e-06,
+ "loss": 0.6151,
+ "step": 960
+ },
+ {
+ "epoch": 0.45437352245862883,
+ "grad_norm": 2.8872334957122803,
+ "learning_rate": 4.942523089610066e-06,
+ "loss": 0.6255,
+ "step": 961
+ },
+ {
+ "epoch": 0.4548463356973995,
+ "grad_norm": 2.870605707168579,
+ "learning_rate": 4.942390015540253e-06,
+ "loss": 0.7481,
+ "step": 962
+ },
+ {
+ "epoch": 0.4553191489361702,
+ "grad_norm": 2.952680826187134,
+ "learning_rate": 4.942256789393524e-06,
+ "loss": 0.5556,
+ "step": 963
+ },
+ {
+ "epoch": 0.4557919621749409,
+ "grad_norm": 2.623680353164673,
+ "learning_rate": 4.9421234111781725e-06,
+ "loss": 0.6115,
+ "step": 964
+ },
+ {
+ "epoch": 0.4562647754137116,
+ "grad_norm": 2.6933600902557373,
+ "learning_rate": 4.941989880902505e-06,
+ "loss": 0.6102,
+ "step": 965
+ },
+ {
+ "epoch": 0.45673758865248226,
+ "grad_norm": 2.6047189235687256,
+ "learning_rate": 4.941856198574836e-06,
+ "loss": 0.612,
+ "step": 966
+ },
+ {
+ "epoch": 0.45721040189125295,
+ "grad_norm": 2.779186725616455,
+ "learning_rate": 4.9417223642034885e-06,
+ "loss": 0.5424,
+ "step": 967
+ },
+ {
+ "epoch": 0.45768321513002364,
+ "grad_norm": 2.6177165508270264,
+ "learning_rate": 4.941588377796795e-06,
+ "loss": 0.4661,
+ "step": 968
+ },
+ {
+ "epoch": 0.4581560283687943,
+ "grad_norm": 2.959676742553711,
+ "learning_rate": 4.941454239363101e-06,
+ "loss": 0.6966,
+ "step": 969
+ },
+ {
+ "epoch": 0.458628841607565,
+ "grad_norm": 2.9788379669189453,
+ "learning_rate": 4.941319948910756e-06,
+ "loss": 0.6181,
+ "step": 970
+ },
+ {
+ "epoch": 0.4591016548463357,
+ "grad_norm": 4.642750263214111,
+ "learning_rate": 4.941185506448122e-06,
+ "loss": 0.5602,
+ "step": 971
+ },
+ {
+ "epoch": 0.4595744680851064,
+ "grad_norm": 2.793002128601074,
+ "learning_rate": 4.941050911983572e-06,
+ "loss": 0.602,
+ "step": 972
+ },
+ {
+ "epoch": 0.46004728132387707,
+ "grad_norm": 2.6833035945892334,
+ "learning_rate": 4.9409161655254845e-06,
+ "loss": 0.5549,
+ "step": 973
+ },
+ {
+ "epoch": 0.46052009456264775,
+ "grad_norm": 3.905032157897949,
+ "learning_rate": 4.94078126708225e-06,
+ "loss": 0.6335,
+ "step": 974
+ },
+ {
+ "epoch": 0.46099290780141844,
+ "grad_norm": 2.922609329223633,
+ "learning_rate": 4.94064621666227e-06,
+ "loss": 0.5839,
+ "step": 975
+ },
+ {
+ "epoch": 0.4614657210401891,
+ "grad_norm": 2.8277416229248047,
+ "learning_rate": 4.940511014273952e-06,
+ "loss": 0.629,
+ "step": 976
+ },
+ {
+ "epoch": 0.4619385342789598,
+ "grad_norm": 3.07511043548584,
+ "learning_rate": 4.940375659925714e-06,
+ "loss": 0.7058,
+ "step": 977
+ },
+ {
+ "epoch": 0.4624113475177305,
+ "grad_norm": 3.65043044090271,
+ "learning_rate": 4.940240153625984e-06,
+ "loss": 0.7174,
+ "step": 978
+ },
+ {
+ "epoch": 0.4628841607565012,
+ "grad_norm": 2.755167245864868,
+ "learning_rate": 4.9401044953832e-06,
+ "loss": 0.6548,
+ "step": 979
+ },
+ {
+ "epoch": 0.46335697399527187,
+ "grad_norm": 2.9881057739257812,
+ "learning_rate": 4.939968685205808e-06,
+ "loss": 0.6245,
+ "step": 980
+ },
+ {
+ "epoch": 0.46382978723404256,
+ "grad_norm": 2.9484212398529053,
+ "learning_rate": 4.939832723102266e-06,
+ "loss": 0.655,
+ "step": 981
+ },
+ {
+ "epoch": 0.46430260047281324,
+ "grad_norm": 2.898918628692627,
+ "learning_rate": 4.939696609081038e-06,
+ "loss": 0.6178,
+ "step": 982
+ },
+ {
+ "epoch": 0.46477541371158393,
+ "grad_norm": 2.7052435874938965,
+ "learning_rate": 4.9395603431506e-06,
+ "loss": 0.6393,
+ "step": 983
+ },
+ {
+ "epoch": 0.4652482269503546,
+ "grad_norm": 2.5610013008117676,
+ "learning_rate": 4.939423925319436e-06,
+ "loss": 0.4847,
+ "step": 984
+ },
+ {
+ "epoch": 0.4657210401891253,
+ "grad_norm": 3.229083299636841,
+ "learning_rate": 4.939287355596042e-06,
+ "loss": 0.6473,
+ "step": 985
+ },
+ {
+ "epoch": 0.466193853427896,
+ "grad_norm": 2.907097816467285,
+ "learning_rate": 4.9391506339889195e-06,
+ "loss": 0.652,
+ "step": 986
+ },
+ {
+ "epoch": 0.4666666666666667,
+ "grad_norm": 2.6929478645324707,
+ "learning_rate": 4.939013760506582e-06,
+ "loss": 0.6175,
+ "step": 987
+ },
+ {
+ "epoch": 0.46713947990543736,
+ "grad_norm": 3.414813280105591,
+ "learning_rate": 4.938876735157554e-06,
+ "loss": 0.7597,
+ "step": 988
+ },
+ {
+ "epoch": 0.46761229314420805,
+ "grad_norm": 3.297360420227051,
+ "learning_rate": 4.938739557950365e-06,
+ "loss": 0.6824,
+ "step": 989
+ },
+ {
+ "epoch": 0.46808510638297873,
+ "grad_norm": 3.083155393600464,
+ "learning_rate": 4.938602228893557e-06,
+ "loss": 0.6505,
+ "step": 990
+ },
+ {
+ "epoch": 0.4685579196217494,
+ "grad_norm": 2.9781153202056885,
+ "learning_rate": 4.938464747995681e-06,
+ "loss": 0.666,
+ "step": 991
+ },
+ {
+ "epoch": 0.4690307328605201,
+ "grad_norm": 3.1494534015655518,
+ "learning_rate": 4.9383271152652975e-06,
+ "loss": 0.6422,
+ "step": 992
+ },
+ {
+ "epoch": 0.4695035460992908,
+ "grad_norm": 2.547868490219116,
+ "learning_rate": 4.938189330710976e-06,
+ "loss": 0.5766,
+ "step": 993
+ },
+ {
+ "epoch": 0.4699763593380615,
+ "grad_norm": 2.684736967086792,
+ "learning_rate": 4.938051394341297e-06,
+ "loss": 0.6407,
+ "step": 994
+ },
+ {
+ "epoch": 0.47044917257683216,
+ "grad_norm": 2.9619693756103516,
+ "learning_rate": 4.937913306164847e-06,
+ "loss": 0.6936,
+ "step": 995
+ },
+ {
+ "epoch": 0.47092198581560285,
+ "grad_norm": 2.9698498249053955,
+ "learning_rate": 4.937775066190227e-06,
+ "loss": 0.6464,
+ "step": 996
+ },
+ {
+ "epoch": 0.47139479905437354,
+ "grad_norm": 3.121049642562866,
+ "learning_rate": 4.937636674426042e-06,
+ "loss": 0.6383,
+ "step": 997
+ },
+ {
+ "epoch": 0.4718676122931442,
+ "grad_norm": 3.113672971725464,
+ "learning_rate": 4.93749813088091e-06,
+ "loss": 0.6892,
+ "step": 998
+ },
+ {
+ "epoch": 0.4723404255319149,
+ "grad_norm": 3.126113176345825,
+ "learning_rate": 4.937359435563458e-06,
+ "loss": 0.6728,
+ "step": 999
+ },
+ {
+ "epoch": 0.4728132387706856,
+ "grad_norm": 3.353966236114502,
+ "learning_rate": 4.937220588482321e-06,
+ "loss": 0.6041,
+ "step": 1000
+ },
+ {
+ "epoch": 0.4732860520094563,
+ "grad_norm": 2.8860628604888916,
+ "learning_rate": 4.937081589646144e-06,
+ "loss": 0.6798,
+ "step": 1001
+ },
+ {
+ "epoch": 0.47375886524822697,
+ "grad_norm": 3.0510590076446533,
+ "learning_rate": 4.936942439063584e-06,
+ "loss": 0.5841,
+ "step": 1002
+ },
+ {
+ "epoch": 0.47423167848699765,
+ "grad_norm": 2.6998369693756104,
+ "learning_rate": 4.936803136743303e-06,
+ "loss": 0.6403,
+ "step": 1003
+ },
+ {
+ "epoch": 0.47470449172576834,
+ "grad_norm": 2.875347137451172,
+ "learning_rate": 4.9366636826939765e-06,
+ "loss": 0.5811,
+ "step": 1004
+ },
+ {
+ "epoch": 0.475177304964539,
+ "grad_norm": 2.9122262001037598,
+ "learning_rate": 4.936524076924287e-06,
+ "loss": 0.6852,
+ "step": 1005
+ },
+ {
+ "epoch": 0.4756501182033097,
+ "grad_norm": 2.5167057514190674,
+ "learning_rate": 4.9363843194429265e-06,
+ "loss": 0.5367,
+ "step": 1006
+ },
+ {
+ "epoch": 0.4761229314420804,
+ "grad_norm": 2.5745551586151123,
+ "learning_rate": 4.9362444102585985e-06,
+ "loss": 0.6241,
+ "step": 1007
+ },
+ {
+ "epoch": 0.4765957446808511,
+ "grad_norm": 2.5024216175079346,
+ "learning_rate": 4.9361043493800125e-06,
+ "loss": 0.6133,
+ "step": 1008
+ },
+ {
+ "epoch": 0.47706855791962177,
+ "grad_norm": 2.7281384468078613,
+ "learning_rate": 4.935964136815892e-06,
+ "loss": 0.6834,
+ "step": 1009
+ },
+ {
+ "epoch": 0.47754137115839246,
+ "grad_norm": 3.0118913650512695,
+ "learning_rate": 4.935823772574965e-06,
+ "loss": 0.6922,
+ "step": 1010
+ },
+ {
+ "epoch": 0.47801418439716314,
+ "grad_norm": 3.016216993331909,
+ "learning_rate": 4.935683256665973e-06,
+ "loss": 0.6653,
+ "step": 1011
+ },
+ {
+ "epoch": 0.47848699763593383,
+ "grad_norm": 2.9526784420013428,
+ "learning_rate": 4.9355425890976636e-06,
+ "loss": 0.6423,
+ "step": 1012
+ },
+ {
+ "epoch": 0.4789598108747045,
+ "grad_norm": 6.222797393798828,
+ "learning_rate": 4.9354017698787985e-06,
+ "loss": 0.5884,
+ "step": 1013
+ },
+ {
+ "epoch": 0.4794326241134752,
+ "grad_norm": 2.6553597450256348,
+ "learning_rate": 4.935260799018143e-06,
+ "loss": 0.6624,
+ "step": 1014
+ },
+ {
+ "epoch": 0.4799054373522459,
+ "grad_norm": 3.0942065715789795,
+ "learning_rate": 4.935119676524475e-06,
+ "loss": 0.6623,
+ "step": 1015
+ },
+ {
+ "epoch": 0.4803782505910166,
+ "grad_norm": 2.626359224319458,
+ "learning_rate": 4.934978402406585e-06,
+ "loss": 0.6195,
+ "step": 1016
+ },
+ {
+ "epoch": 0.4808510638297872,
+ "grad_norm": 2.7954699993133545,
+ "learning_rate": 4.934836976673265e-06,
+ "loss": 0.5545,
+ "step": 1017
+ },
+ {
+ "epoch": 0.4813238770685579,
+ "grad_norm": 2.913557291030884,
+ "learning_rate": 4.934695399333324e-06,
+ "loss": 0.6288,
+ "step": 1018
+ },
+ {
+ "epoch": 0.4817966903073286,
+ "grad_norm": 3.1043739318847656,
+ "learning_rate": 4.9345536703955746e-06,
+ "loss": 0.6771,
+ "step": 1019
+ },
+ {
+ "epoch": 0.48226950354609927,
+ "grad_norm": 2.789357900619507,
+ "learning_rate": 4.934411789868845e-06,
+ "loss": 0.6227,
+ "step": 1020
+ },
+ {
+ "epoch": 0.48274231678486995,
+ "grad_norm": 2.480609655380249,
+ "learning_rate": 4.934269757761967e-06,
+ "loss": 0.5779,
+ "step": 1021
+ },
+ {
+ "epoch": 0.48321513002364064,
+ "grad_norm": 2.7946252822875977,
+ "learning_rate": 4.934127574083785e-06,
+ "loss": 0.6166,
+ "step": 1022
+ },
+ {
+ "epoch": 0.4836879432624113,
+ "grad_norm": 3.0670509338378906,
+ "learning_rate": 4.933985238843153e-06,
+ "loss": 0.7766,
+ "step": 1023
+ },
+ {
+ "epoch": 0.484160756501182,
+ "grad_norm": 2.8567559719085693,
+ "learning_rate": 4.933842752048932e-06,
+ "loss": 0.5088,
+ "step": 1024
+ },
+ {
+ "epoch": 0.4846335697399527,
+ "grad_norm": 2.5674657821655273,
+ "learning_rate": 4.933700113709996e-06,
+ "loss": 0.6036,
+ "step": 1025
+ },
+ {
+ "epoch": 0.4851063829787234,
+ "grad_norm": 2.782339096069336,
+ "learning_rate": 4.933557323835224e-06,
+ "loss": 0.5335,
+ "step": 1026
+ },
+ {
+ "epoch": 0.48557919621749407,
+ "grad_norm": 2.6334071159362793,
+ "learning_rate": 4.93341438243351e-06,
+ "loss": 0.6327,
+ "step": 1027
+ },
+ {
+ "epoch": 0.48605200945626476,
+ "grad_norm": 3.0853965282440186,
+ "learning_rate": 4.933271289513751e-06,
+ "loss": 0.7102,
+ "step": 1028
+ },
+ {
+ "epoch": 0.48652482269503544,
+ "grad_norm": 2.619997501373291,
+ "learning_rate": 4.933128045084859e-06,
+ "loss": 0.6138,
+ "step": 1029
+ },
+ {
+ "epoch": 0.48699763593380613,
+ "grad_norm": 2.8316116333007812,
+ "learning_rate": 4.932984649155753e-06,
+ "loss": 0.6346,
+ "step": 1030
+ },
+ {
+ "epoch": 0.4874704491725768,
+ "grad_norm": 3.153486490249634,
+ "learning_rate": 4.932841101735361e-06,
+ "loss": 0.7626,
+ "step": 1031
+ },
+ {
+ "epoch": 0.4879432624113475,
+ "grad_norm": 3.1831274032592773,
+ "learning_rate": 4.9326974028326214e-06,
+ "loss": 0.6607,
+ "step": 1032
+ },
+ {
+ "epoch": 0.4884160756501182,
+ "grad_norm": 2.791078567504883,
+ "learning_rate": 4.932553552456481e-06,
+ "loss": 0.6141,
+ "step": 1033
+ },
+ {
+ "epoch": 0.4888888888888889,
+ "grad_norm": 2.627263307571411,
+ "learning_rate": 4.932409550615898e-06,
+ "loss": 0.6777,
+ "step": 1034
+ },
+ {
+ "epoch": 0.48936170212765956,
+ "grad_norm": 2.8550007343292236,
+ "learning_rate": 4.932265397319838e-06,
+ "loss": 0.6379,
+ "step": 1035
+ },
+ {
+ "epoch": 0.48983451536643025,
+ "grad_norm": 4.505824089050293,
+ "learning_rate": 4.932121092577276e-06,
+ "loss": 0.5892,
+ "step": 1036
+ },
+ {
+ "epoch": 0.49030732860520093,
+ "grad_norm": 3.100191116333008,
+ "learning_rate": 4.931976636397199e-06,
+ "loss": 0.6443,
+ "step": 1037
+ },
+ {
+ "epoch": 0.4907801418439716,
+ "grad_norm": 2.921494245529175,
+ "learning_rate": 4.9318320287886e-06,
+ "loss": 0.6821,
+ "step": 1038
+ },
+ {
+ "epoch": 0.4912529550827423,
+ "grad_norm": 4.577807903289795,
+ "learning_rate": 4.931687269760485e-06,
+ "loss": 0.5946,
+ "step": 1039
+ },
+ {
+ "epoch": 0.491725768321513,
+ "grad_norm": 2.7347636222839355,
+ "learning_rate": 4.931542359321865e-06,
+ "loss": 0.5689,
+ "step": 1040
+ },
+ {
+ "epoch": 0.4921985815602837,
+ "grad_norm": 2.5289158821105957,
+ "learning_rate": 4.931397297481765e-06,
+ "loss": 0.5632,
+ "step": 1041
+ },
+ {
+ "epoch": 0.49267139479905436,
+ "grad_norm": 3.3518471717834473,
+ "learning_rate": 4.9312520842492165e-06,
+ "loss": 0.6349,
+ "step": 1042
+ },
+ {
+ "epoch": 0.49314420803782505,
+ "grad_norm": 3.0469748973846436,
+ "learning_rate": 4.931106719633261e-06,
+ "loss": 0.5734,
+ "step": 1043
+ },
+ {
+ "epoch": 0.49361702127659574,
+ "grad_norm": 3.104682445526123,
+ "learning_rate": 4.930961203642951e-06,
+ "loss": 0.6101,
+ "step": 1044
+ },
+ {
+ "epoch": 0.4940898345153664,
+ "grad_norm": 2.776705503463745,
+ "learning_rate": 4.930815536287346e-06,
+ "loss": 0.6397,
+ "step": 1045
+ },
+ {
+ "epoch": 0.4945626477541371,
+ "grad_norm": 2.760380983352661,
+ "learning_rate": 4.930669717575516e-06,
+ "loss": 0.668,
+ "step": 1046
+ },
+ {
+ "epoch": 0.4950354609929078,
+ "grad_norm": 2.70084547996521,
+ "learning_rate": 4.930523747516541e-06,
+ "loss": 0.5729,
+ "step": 1047
+ },
+ {
+ "epoch": 0.4955082742316785,
+ "grad_norm": 2.7319583892822266,
+ "learning_rate": 4.930377626119511e-06,
+ "loss": 0.6258,
+ "step": 1048
+ },
+ {
+ "epoch": 0.49598108747044917,
+ "grad_norm": 3.2515223026275635,
+ "learning_rate": 4.930231353393521e-06,
+ "loss": 0.7412,
+ "step": 1049
+ },
+ {
+ "epoch": 0.49645390070921985,
+ "grad_norm": 3.0646486282348633,
+ "learning_rate": 4.930084929347682e-06,
+ "loss": 0.5809,
+ "step": 1050
+ },
+ {
+ "epoch": 0.49692671394799054,
+ "grad_norm": 3.1621921062469482,
+ "learning_rate": 4.9299383539911096e-06,
+ "loss": 0.6282,
+ "step": 1051
+ },
+ {
+ "epoch": 0.4973995271867612,
+ "grad_norm": 2.864713191986084,
+ "learning_rate": 4.929791627332931e-06,
+ "loss": 0.6263,
+ "step": 1052
+ },
+ {
+ "epoch": 0.4978723404255319,
+ "grad_norm": 3.181016683578491,
+ "learning_rate": 4.929644749382283e-06,
+ "loss": 0.5697,
+ "step": 1053
+ },
+ {
+ "epoch": 0.4983451536643026,
+ "grad_norm": 2.9064836502075195,
+ "learning_rate": 4.929497720148309e-06,
+ "loss": 0.6161,
+ "step": 1054
+ },
+ {
+ "epoch": 0.4988179669030733,
+ "grad_norm": 3.058112859725952,
+ "learning_rate": 4.9293505396401655e-06,
+ "loss": 0.6477,
+ "step": 1055
+ },
+ {
+ "epoch": 0.49929078014184397,
+ "grad_norm": 2.5227596759796143,
+ "learning_rate": 4.929203207867016e-06,
+ "loss": 0.5819,
+ "step": 1056
+ },
+ {
+ "epoch": 0.49976359338061466,
+ "grad_norm": 3.386862277984619,
+ "learning_rate": 4.929055724838035e-06,
+ "loss": 0.7342,
+ "step": 1057
+ },
+ {
+ "epoch": 0.5002364066193853,
+ "grad_norm": 3.368346929550171,
+ "learning_rate": 4.928908090562404e-06,
+ "loss": 0.6622,
+ "step": 1058
+ },
+ {
+ "epoch": 0.500709219858156,
+ "grad_norm": 2.9108314514160156,
+ "learning_rate": 4.928760305049317e-06,
+ "loss": 0.6598,
+ "step": 1059
+ },
+ {
+ "epoch": 0.5011820330969267,
+ "grad_norm": 2.822305917739868,
+ "learning_rate": 4.928612368307977e-06,
+ "loss": 0.5841,
+ "step": 1060
+ },
+ {
+ "epoch": 0.5016548463356973,
+ "grad_norm": 2.689131259918213,
+ "learning_rate": 4.928464280347592e-06,
+ "loss": 0.6631,
+ "step": 1061
+ },
+ {
+ "epoch": 0.502127659574468,
+ "grad_norm": 3.337214946746826,
+ "learning_rate": 4.9283160411773864e-06,
+ "loss": 0.6105,
+ "step": 1062
+ },
+ {
+ "epoch": 0.5026004728132387,
+ "grad_norm": 3.035911798477173,
+ "learning_rate": 4.928167650806588e-06,
+ "loss": 0.6981,
+ "step": 1063
+ },
+ {
+ "epoch": 0.5030732860520094,
+ "grad_norm": 2.8820855617523193,
+ "learning_rate": 4.9280191092444375e-06,
+ "loss": 0.6408,
+ "step": 1064
+ },
+ {
+ "epoch": 0.5035460992907801,
+ "grad_norm": 3.080432415008545,
+ "learning_rate": 4.927870416500183e-06,
+ "loss": 0.6398,
+ "step": 1065
+ },
+ {
+ "epoch": 0.5040189125295508,
+ "grad_norm": 2.761612892150879,
+ "learning_rate": 4.927721572583084e-06,
+ "loss": 0.6126,
+ "step": 1066
+ },
+ {
+ "epoch": 0.5044917257683215,
+ "grad_norm": 2.8561882972717285,
+ "learning_rate": 4.927572577502408e-06,
+ "loss": 0.584,
+ "step": 1067
+ },
+ {
+ "epoch": 0.5049645390070922,
+ "grad_norm": 3.3386311531066895,
+ "learning_rate": 4.927423431267432e-06,
+ "loss": 0.6666,
+ "step": 1068
+ },
+ {
+ "epoch": 0.5054373522458628,
+ "grad_norm": 2.632906675338745,
+ "learning_rate": 4.927274133887443e-06,
+ "loss": 0.632,
+ "step": 1069
+ },
+ {
+ "epoch": 0.5059101654846335,
+ "grad_norm": 2.8737308979034424,
+ "learning_rate": 4.927124685371737e-06,
+ "loss": 0.6051,
+ "step": 1070
+ },
+ {
+ "epoch": 0.5063829787234042,
+ "grad_norm": 3.042222738265991,
+ "learning_rate": 4.926975085729619e-06,
+ "loss": 0.6954,
+ "step": 1071
+ },
+ {
+ "epoch": 0.5068557919621749,
+ "grad_norm": 3.3341481685638428,
+ "learning_rate": 4.926825334970404e-06,
+ "loss": 0.7148,
+ "step": 1072
+ },
+ {
+ "epoch": 0.5073286052009456,
+ "grad_norm": 2.7415387630462646,
+ "learning_rate": 4.926675433103418e-06,
+ "loss": 0.5456,
+ "step": 1073
+ },
+ {
+ "epoch": 0.5078014184397163,
+ "grad_norm": 2.7545325756073,
+ "learning_rate": 4.926525380137993e-06,
+ "loss": 0.6213,
+ "step": 1074
+ },
+ {
+ "epoch": 0.508274231678487,
+ "grad_norm": 2.9153690338134766,
+ "learning_rate": 4.926375176083472e-06,
+ "loss": 0.6466,
+ "step": 1075
+ },
+ {
+ "epoch": 0.5087470449172576,
+ "grad_norm": 4.210638523101807,
+ "learning_rate": 4.926224820949209e-06,
+ "loss": 0.6192,
+ "step": 1076
+ },
+ {
+ "epoch": 0.5092198581560283,
+ "grad_norm": 2.4357898235321045,
+ "learning_rate": 4.926074314744565e-06,
+ "loss": 0.594,
+ "step": 1077
+ },
+ {
+ "epoch": 0.509692671394799,
+ "grad_norm": 2.8004701137542725,
+ "learning_rate": 4.92592365747891e-06,
+ "loss": 0.6276,
+ "step": 1078
+ },
+ {
+ "epoch": 0.5101654846335697,
+ "grad_norm": 2.920675039291382,
+ "learning_rate": 4.925772849161628e-06,
+ "loss": 0.6043,
+ "step": 1079
+ },
+ {
+ "epoch": 0.5106382978723404,
+ "grad_norm": 2.791555404663086,
+ "learning_rate": 4.9256218898021055e-06,
+ "loss": 0.6837,
+ "step": 1080
+ },
+ {
+ "epoch": 0.5111111111111111,
+ "grad_norm": 3.1702463626861572,
+ "learning_rate": 4.925470779409746e-06,
+ "loss": 0.668,
+ "step": 1081
+ },
+ {
+ "epoch": 0.5115839243498818,
+ "grad_norm": 2.7149479389190674,
+ "learning_rate": 4.925319517993955e-06,
+ "loss": 0.5842,
+ "step": 1082
+ },
+ {
+ "epoch": 0.5120567375886524,
+ "grad_norm": 2.916311025619507,
+ "learning_rate": 4.925168105564153e-06,
+ "loss": 0.6893,
+ "step": 1083
+ },
+ {
+ "epoch": 0.5125295508274231,
+ "grad_norm": 2.917654514312744,
+ "learning_rate": 4.925016542129767e-06,
+ "loss": 0.6513,
+ "step": 1084
+ },
+ {
+ "epoch": 0.5130023640661938,
+ "grad_norm": 2.5568928718566895,
+ "learning_rate": 4.924864827700234e-06,
+ "loss": 0.6177,
+ "step": 1085
+ },
+ {
+ "epoch": 0.5134751773049645,
+ "grad_norm": 2.816720485687256,
+ "learning_rate": 4.924712962285001e-06,
+ "loss": 0.5833,
+ "step": 1086
+ },
+ {
+ "epoch": 0.5139479905437352,
+ "grad_norm": 2.6989188194274902,
+ "learning_rate": 4.9245609458935235e-06,
+ "loss": 0.6332,
+ "step": 1087
+ },
+ {
+ "epoch": 0.5144208037825059,
+ "grad_norm": 2.959599494934082,
+ "learning_rate": 4.924408778535268e-06,
+ "loss": 0.626,
+ "step": 1088
+ },
+ {
+ "epoch": 0.5148936170212766,
+ "grad_norm": 2.872814416885376,
+ "learning_rate": 4.924256460219708e-06,
+ "loss": 0.6407,
+ "step": 1089
+ },
+ {
+ "epoch": 0.5153664302600472,
+ "grad_norm": 2.6989097595214844,
+ "learning_rate": 4.924103990956329e-06,
+ "loss": 0.6391,
+ "step": 1090
+ },
+ {
+ "epoch": 0.5158392434988179,
+ "grad_norm": 2.986492156982422,
+ "learning_rate": 4.9239513707546235e-06,
+ "loss": 0.6911,
+ "step": 1091
+ },
+ {
+ "epoch": 0.5163120567375886,
+ "grad_norm": 3.069920301437378,
+ "learning_rate": 4.9237985996240954e-06,
+ "loss": 0.671,
+ "step": 1092
+ },
+ {
+ "epoch": 0.5167848699763593,
+ "grad_norm": 2.8214917182922363,
+ "learning_rate": 4.9236456775742555e-06,
+ "loss": 0.5885,
+ "step": 1093
+ },
+ {
+ "epoch": 0.51725768321513,
+ "grad_norm": 2.9416961669921875,
+ "learning_rate": 4.923492604614627e-06,
+ "loss": 0.6293,
+ "step": 1094
+ },
+ {
+ "epoch": 0.5177304964539007,
+ "grad_norm": 2.761780023574829,
+ "learning_rate": 4.923339380754741e-06,
+ "loss": 0.649,
+ "step": 1095
+ },
+ {
+ "epoch": 0.5182033096926714,
+ "grad_norm": 2.7648792266845703,
+ "learning_rate": 4.923186006004138e-06,
+ "loss": 0.5906,
+ "step": 1096
+ },
+ {
+ "epoch": 0.518676122931442,
+ "grad_norm": 3.5535428524017334,
+ "learning_rate": 4.923032480372367e-06,
+ "loss": 0.7138,
+ "step": 1097
+ },
+ {
+ "epoch": 0.5191489361702127,
+ "grad_norm": 2.6252479553222656,
+ "learning_rate": 4.922878803868988e-06,
+ "loss": 0.5499,
+ "step": 1098
+ },
+ {
+ "epoch": 0.5196217494089834,
+ "grad_norm": 2.901002883911133,
+ "learning_rate": 4.9227249765035715e-06,
+ "loss": 0.6991,
+ "step": 1099
+ },
+ {
+ "epoch": 0.5200945626477541,
+ "grad_norm": 2.621877431869507,
+ "learning_rate": 4.9225709982856925e-06,
+ "loss": 0.6269,
+ "step": 1100
+ },
+ {
+ "epoch": 0.5205673758865248,
+ "grad_norm": 2.872483015060425,
+ "learning_rate": 4.92241686922494e-06,
+ "loss": 0.6657,
+ "step": 1101
+ },
+ {
+ "epoch": 0.5210401891252955,
+ "grad_norm": 2.730447769165039,
+ "learning_rate": 4.922262589330912e-06,
+ "loss": 0.6061,
+ "step": 1102
+ },
+ {
+ "epoch": 0.5215130023640662,
+ "grad_norm": 2.646247386932373,
+ "learning_rate": 4.922108158613213e-06,
+ "loss": 0.5923,
+ "step": 1103
+ },
+ {
+ "epoch": 0.5219858156028369,
+ "grad_norm": 2.6488895416259766,
+ "learning_rate": 4.92195357708146e-06,
+ "loss": 0.6293,
+ "step": 1104
+ },
+ {
+ "epoch": 0.5224586288416075,
+ "grad_norm": 2.756338357925415,
+ "learning_rate": 4.921798844745278e-06,
+ "loss": 0.6374,
+ "step": 1105
+ },
+ {
+ "epoch": 0.5229314420803782,
+ "grad_norm": 3.1441280841827393,
+ "learning_rate": 4.921643961614301e-06,
+ "loss": 0.6652,
+ "step": 1106
+ },
+ {
+ "epoch": 0.5234042553191489,
+ "grad_norm": 3.050002098083496,
+ "learning_rate": 4.921488927698172e-06,
+ "loss": 0.6809,
+ "step": 1107
+ },
+ {
+ "epoch": 0.5238770685579196,
+ "grad_norm": 2.71750807762146,
+ "learning_rate": 4.921333743006547e-06,
+ "loss": 0.6266,
+ "step": 1108
+ },
+ {
+ "epoch": 0.5243498817966903,
+ "grad_norm": 2.8439245223999023,
+ "learning_rate": 4.921178407549086e-06,
+ "loss": 0.5663,
+ "step": 1109
+ },
+ {
+ "epoch": 0.524822695035461,
+ "grad_norm": 3.0722241401672363,
+ "learning_rate": 4.921022921335464e-06,
+ "loss": 0.6791,
+ "step": 1110
+ },
+ {
+ "epoch": 0.5252955082742317,
+ "grad_norm": 3.4381656646728516,
+ "learning_rate": 4.920867284375358e-06,
+ "loss": 0.6687,
+ "step": 1111
+ },
+ {
+ "epoch": 0.5257683215130023,
+ "grad_norm": 2.819812774658203,
+ "learning_rate": 4.920711496678463e-06,
+ "loss": 0.6299,
+ "step": 1112
+ },
+ {
+ "epoch": 0.526241134751773,
+ "grad_norm": 3.6587414741516113,
+ "learning_rate": 4.9205555582544765e-06,
+ "loss": 0.7392,
+ "step": 1113
+ },
+ {
+ "epoch": 0.5267139479905437,
+ "grad_norm": 2.774296522140503,
+ "learning_rate": 4.920399469113109e-06,
+ "loss": 0.6652,
+ "step": 1114
+ },
+ {
+ "epoch": 0.5271867612293144,
+ "grad_norm": 2.7480580806732178,
+ "learning_rate": 4.920243229264081e-06,
+ "loss": 0.596,
+ "step": 1115
+ },
+ {
+ "epoch": 0.5276595744680851,
+ "grad_norm": 3.213057518005371,
+ "learning_rate": 4.920086838717119e-06,
+ "loss": 0.6986,
+ "step": 1116
+ },
+ {
+ "epoch": 0.5281323877068558,
+ "grad_norm": 2.940546989440918,
+ "learning_rate": 4.919930297481962e-06,
+ "loss": 0.6481,
+ "step": 1117
+ },
+ {
+ "epoch": 0.5286052009456265,
+ "grad_norm": 2.5970494747161865,
+ "learning_rate": 4.9197736055683555e-06,
+ "loss": 0.5658,
+ "step": 1118
+ },
+ {
+ "epoch": 0.5290780141843971,
+ "grad_norm": 4.49385404586792,
+ "learning_rate": 4.919616762986057e-06,
+ "loss": 0.605,
+ "step": 1119
+ },
+ {
+ "epoch": 0.5295508274231678,
+ "grad_norm": 2.971857786178589,
+ "learning_rate": 4.919459769744833e-06,
+ "loss": 0.6539,
+ "step": 1120
+ },
+ {
+ "epoch": 0.5300236406619385,
+ "grad_norm": 2.6192965507507324,
+ "learning_rate": 4.919302625854457e-06,
+ "loss": 0.6226,
+ "step": 1121
+ },
+ {
+ "epoch": 0.5304964539007092,
+ "grad_norm": 2.665088176727295,
+ "learning_rate": 4.919145331324716e-06,
+ "loss": 0.6647,
+ "step": 1122
+ },
+ {
+ "epoch": 0.5309692671394799,
+ "grad_norm": 2.612126111984253,
+ "learning_rate": 4.918987886165403e-06,
+ "loss": 0.6965,
+ "step": 1123
+ },
+ {
+ "epoch": 0.5314420803782506,
+ "grad_norm": 3.80017352104187,
+ "learning_rate": 4.9188302903863205e-06,
+ "loss": 0.7396,
+ "step": 1124
+ },
+ {
+ "epoch": 0.5319148936170213,
+ "grad_norm": 2.781752824783325,
+ "learning_rate": 4.918672543997282e-06,
+ "loss": 0.5985,
+ "step": 1125
+ },
+ {
+ "epoch": 0.532387706855792,
+ "grad_norm": 2.6067914962768555,
+ "learning_rate": 4.91851464700811e-06,
+ "loss": 0.6159,
+ "step": 1126
+ },
+ {
+ "epoch": 0.5328605200945626,
+ "grad_norm": 2.670807123184204,
+ "learning_rate": 4.918356599428636e-06,
+ "loss": 0.5958,
+ "step": 1127
+ },
+ {
+ "epoch": 0.5333333333333333,
+ "grad_norm": 2.608611822128296,
+ "learning_rate": 4.9181984012687e-06,
+ "loss": 0.5768,
+ "step": 1128
+ },
+ {
+ "epoch": 0.533806146572104,
+ "grad_norm": 2.586764097213745,
+ "learning_rate": 4.918040052538154e-06,
+ "loss": 0.661,
+ "step": 1129
+ },
+ {
+ "epoch": 0.5342789598108747,
+ "grad_norm": 3.1317451000213623,
+ "learning_rate": 4.917881553246856e-06,
+ "loss": 0.6626,
+ "step": 1130
+ },
+ {
+ "epoch": 0.5347517730496454,
+ "grad_norm": 2.7135281562805176,
+ "learning_rate": 4.917722903404676e-06,
+ "loss": 0.6572,
+ "step": 1131
+ },
+ {
+ "epoch": 0.5352245862884161,
+ "grad_norm": 3.4546358585357666,
+ "learning_rate": 4.917564103021493e-06,
+ "loss": 0.5597,
+ "step": 1132
+ },
+ {
+ "epoch": 0.5356973995271868,
+ "grad_norm": 3.0943493843078613,
+ "learning_rate": 4.917405152107193e-06,
+ "loss": 0.7258,
+ "step": 1133
+ },
+ {
+ "epoch": 0.5361702127659574,
+ "grad_norm": 2.6069352626800537,
+ "learning_rate": 4.917246050671674e-06,
+ "loss": 0.6209,
+ "step": 1134
+ },
+ {
+ "epoch": 0.5366430260047281,
+ "grad_norm": 2.584883689880371,
+ "learning_rate": 4.917086798724844e-06,
+ "loss": 0.658,
+ "step": 1135
+ },
+ {
+ "epoch": 0.5371158392434988,
+ "grad_norm": 3.001976490020752,
+ "learning_rate": 4.9169273962766166e-06,
+ "loss": 0.6306,
+ "step": 1136
+ },
+ {
+ "epoch": 0.5375886524822695,
+ "grad_norm": 2.5013928413391113,
+ "learning_rate": 4.916767843336918e-06,
+ "loss": 0.572,
+ "step": 1137
+ },
+ {
+ "epoch": 0.5380614657210402,
+ "grad_norm": 2.9114553928375244,
+ "learning_rate": 4.916608139915684e-06,
+ "loss": 0.5841,
+ "step": 1138
+ },
+ {
+ "epoch": 0.5385342789598109,
+ "grad_norm": 2.8878467082977295,
+ "learning_rate": 4.9164482860228564e-06,
+ "loss": 0.6654,
+ "step": 1139
+ },
+ {
+ "epoch": 0.5390070921985816,
+ "grad_norm": 2.9827866554260254,
+ "learning_rate": 4.91628828166839e-06,
+ "loss": 0.6674,
+ "step": 1140
+ },
+ {
+ "epoch": 0.5394799054373522,
+ "grad_norm": 3.8696281909942627,
+ "learning_rate": 4.916128126862248e-06,
+ "loss": 0.6241,
+ "step": 1141
+ },
+ {
+ "epoch": 0.5399527186761229,
+ "grad_norm": 2.9556291103363037,
+ "learning_rate": 4.915967821614402e-06,
+ "loss": 0.6478,
+ "step": 1142
+ },
+ {
+ "epoch": 0.5404255319148936,
+ "grad_norm": 2.392942428588867,
+ "learning_rate": 4.915807365934834e-06,
+ "loss": 0.6097,
+ "step": 1143
+ },
+ {
+ "epoch": 0.5408983451536643,
+ "grad_norm": 3.032235860824585,
+ "learning_rate": 4.915646759833534e-06,
+ "loss": 0.7193,
+ "step": 1144
+ },
+ {
+ "epoch": 0.541371158392435,
+ "grad_norm": 2.840416193008423,
+ "learning_rate": 4.915486003320501e-06,
+ "loss": 0.5506,
+ "step": 1145
+ },
+ {
+ "epoch": 0.5418439716312057,
+ "grad_norm": 2.5438895225524902,
+ "learning_rate": 4.915325096405747e-06,
+ "loss": 0.6487,
+ "step": 1146
+ },
+ {
+ "epoch": 0.5423167848699764,
+ "grad_norm": 2.544334650039673,
+ "learning_rate": 4.9151640390992905e-06,
+ "loss": 0.6168,
+ "step": 1147
+ },
+ {
+ "epoch": 0.542789598108747,
+ "grad_norm": 2.8535678386688232,
+ "learning_rate": 4.91500283141116e-06,
+ "loss": 0.678,
+ "step": 1148
+ },
+ {
+ "epoch": 0.5432624113475177,
+ "grad_norm": 2.8086955547332764,
+ "learning_rate": 4.9148414733513915e-06,
+ "loss": 0.6473,
+ "step": 1149
+ },
+ {
+ "epoch": 0.5437352245862884,
+ "grad_norm": 2.4709885120391846,
+ "learning_rate": 4.914679964930034e-06,
+ "loss": 0.6797,
+ "step": 1150
+ },
+ {
+ "epoch": 0.5442080378250591,
+ "grad_norm": 2.8546934127807617,
+ "learning_rate": 4.9145183061571435e-06,
+ "loss": 0.6247,
+ "step": 1151
+ },
+ {
+ "epoch": 0.5446808510638298,
+ "grad_norm": 2.991184711456299,
+ "learning_rate": 4.9143564970427844e-06,
+ "loss": 0.5977,
+ "step": 1152
+ },
+ {
+ "epoch": 0.5451536643026005,
+ "grad_norm": 3.011216402053833,
+ "learning_rate": 4.914194537597033e-06,
+ "loss": 0.7005,
+ "step": 1153
+ },
+ {
+ "epoch": 0.5456264775413712,
+ "grad_norm": 2.807521343231201,
+ "learning_rate": 4.9140324278299744e-06,
+ "loss": 0.5412,
+ "step": 1154
+ },
+ {
+ "epoch": 0.5460992907801419,
+ "grad_norm": 3.0401229858398438,
+ "learning_rate": 4.913870167751701e-06,
+ "loss": 0.6394,
+ "step": 1155
+ },
+ {
+ "epoch": 0.5465721040189125,
+ "grad_norm": 2.853914976119995,
+ "learning_rate": 4.913707757372317e-06,
+ "loss": 0.6745,
+ "step": 1156
+ },
+ {
+ "epoch": 0.5470449172576832,
+ "grad_norm": 4.505620956420898,
+ "learning_rate": 4.913545196701935e-06,
+ "loss": 0.6668,
+ "step": 1157
+ },
+ {
+ "epoch": 0.5475177304964539,
+ "grad_norm": 3.0505781173706055,
+ "learning_rate": 4.913382485750676e-06,
+ "loss": 0.6926,
+ "step": 1158
+ },
+ {
+ "epoch": 0.5479905437352246,
+ "grad_norm": 2.798435688018799,
+ "learning_rate": 4.913219624528672e-06,
+ "loss": 0.605,
+ "step": 1159
+ },
+ {
+ "epoch": 0.5484633569739953,
+ "grad_norm": 2.7814908027648926,
+ "learning_rate": 4.913056613046065e-06,
+ "loss": 0.6678,
+ "step": 1160
+ },
+ {
+ "epoch": 0.548936170212766,
+ "grad_norm": 3.2089321613311768,
+ "learning_rate": 4.9128934513130025e-06,
+ "loss": 0.5995,
+ "step": 1161
+ },
+ {
+ "epoch": 0.5494089834515367,
+ "grad_norm": 2.7699952125549316,
+ "learning_rate": 4.9127301393396455e-06,
+ "loss": 0.7062,
+ "step": 1162
+ },
+ {
+ "epoch": 0.5498817966903073,
+ "grad_norm": 2.859368324279785,
+ "learning_rate": 4.912566677136162e-06,
+ "loss": 0.6063,
+ "step": 1163
+ },
+ {
+ "epoch": 0.550354609929078,
+ "grad_norm": 2.727334499359131,
+ "learning_rate": 4.91240306471273e-06,
+ "loss": 0.6848,
+ "step": 1164
+ },
+ {
+ "epoch": 0.5508274231678487,
+ "grad_norm": 2.6017510890960693,
+ "learning_rate": 4.912239302079537e-06,
+ "loss": 0.5808,
+ "step": 1165
+ },
+ {
+ "epoch": 0.5513002364066194,
+ "grad_norm": 3.539583206176758,
+ "learning_rate": 4.912075389246781e-06,
+ "loss": 0.7053,
+ "step": 1166
+ },
+ {
+ "epoch": 0.5517730496453901,
+ "grad_norm": 2.918280601501465,
+ "learning_rate": 4.911911326224666e-06,
+ "loss": 0.5904,
+ "step": 1167
+ },
+ {
+ "epoch": 0.5522458628841608,
+ "grad_norm": 3.0067362785339355,
+ "learning_rate": 4.9117471130234095e-06,
+ "loss": 0.6392,
+ "step": 1168
+ },
+ {
+ "epoch": 0.5527186761229315,
+ "grad_norm": 2.4374797344207764,
+ "learning_rate": 4.911582749653236e-06,
+ "loss": 0.5793,
+ "step": 1169
+ },
+ {
+ "epoch": 0.5531914893617021,
+ "grad_norm": 3.121182918548584,
+ "learning_rate": 4.911418236124378e-06,
+ "loss": 0.6636,
+ "step": 1170
+ },
+ {
+ "epoch": 0.5536643026004728,
+ "grad_norm": 3.1289851665496826,
+ "learning_rate": 4.91125357244708e-06,
+ "loss": 0.656,
+ "step": 1171
+ },
+ {
+ "epoch": 0.5541371158392435,
+ "grad_norm": 2.7034592628479004,
+ "learning_rate": 4.911088758631596e-06,
+ "loss": 0.6001,
+ "step": 1172
+ },
+ {
+ "epoch": 0.5546099290780142,
+ "grad_norm": 2.710146188735962,
+ "learning_rate": 4.910923794688187e-06,
+ "loss": 0.6007,
+ "step": 1173
+ },
+ {
+ "epoch": 0.5550827423167849,
+ "grad_norm": 2.5424487590789795,
+ "learning_rate": 4.910758680627124e-06,
+ "loss": 0.5193,
+ "step": 1174
+ },
+ {
+ "epoch": 0.5555555555555556,
+ "grad_norm": 2.615893602371216,
+ "learning_rate": 4.91059341645869e-06,
+ "loss": 0.5525,
+ "step": 1175
+ },
+ {
+ "epoch": 0.5560283687943263,
+ "grad_norm": 3.3179728984832764,
+ "learning_rate": 4.910428002193174e-06,
+ "loss": 0.7285,
+ "step": 1176
+ },
+ {
+ "epoch": 0.556501182033097,
+ "grad_norm": 2.7234175205230713,
+ "learning_rate": 4.910262437840875e-06,
+ "loss": 0.574,
+ "step": 1177
+ },
+ {
+ "epoch": 0.5569739952718676,
+ "grad_norm": 3.0416605472564697,
+ "learning_rate": 4.9100967234121034e-06,
+ "loss": 0.5623,
+ "step": 1178
+ },
+ {
+ "epoch": 0.5574468085106383,
+ "grad_norm": 3.067786455154419,
+ "learning_rate": 4.909930858917177e-06,
+ "loss": 0.6491,
+ "step": 1179
+ },
+ {
+ "epoch": 0.557919621749409,
+ "grad_norm": 3.0037379264831543,
+ "learning_rate": 4.909764844366422e-06,
+ "loss": 0.5696,
+ "step": 1180
+ },
+ {
+ "epoch": 0.5583924349881797,
+ "grad_norm": 2.966179609298706,
+ "learning_rate": 4.909598679770178e-06,
+ "loss": 0.6042,
+ "step": 1181
+ },
+ {
+ "epoch": 0.5588652482269504,
+ "grad_norm": 2.6000657081604004,
+ "learning_rate": 4.909432365138789e-06,
+ "loss": 0.5883,
+ "step": 1182
+ },
+ {
+ "epoch": 0.5593380614657211,
+ "grad_norm": 2.6794495582580566,
+ "learning_rate": 4.909265900482612e-06,
+ "loss": 0.6809,
+ "step": 1183
+ },
+ {
+ "epoch": 0.5598108747044918,
+ "grad_norm": 2.6765122413635254,
+ "learning_rate": 4.9090992858120115e-06,
+ "loss": 0.6601,
+ "step": 1184
+ },
+ {
+ "epoch": 0.5602836879432624,
+ "grad_norm": 2.6051928997039795,
+ "learning_rate": 4.908932521137363e-06,
+ "loss": 0.5946,
+ "step": 1185
+ },
+ {
+ "epoch": 0.5607565011820331,
+ "grad_norm": 3.0405542850494385,
+ "learning_rate": 4.908765606469048e-06,
+ "loss": 0.6998,
+ "step": 1186
+ },
+ {
+ "epoch": 0.5612293144208038,
+ "grad_norm": 2.7975668907165527,
+ "learning_rate": 4.908598541817462e-06,
+ "loss": 0.6218,
+ "step": 1187
+ },
+ {
+ "epoch": 0.5617021276595745,
+ "grad_norm": 2.5367627143859863,
+ "learning_rate": 4.908431327193005e-06,
+ "loss": 0.6354,
+ "step": 1188
+ },
+ {
+ "epoch": 0.5621749408983452,
+ "grad_norm": 3.7939631938934326,
+ "learning_rate": 4.908263962606091e-06,
+ "loss": 0.6376,
+ "step": 1189
+ },
+ {
+ "epoch": 0.5626477541371159,
+ "grad_norm": 2.864079475402832,
+ "learning_rate": 4.908096448067139e-06,
+ "loss": 0.5485,
+ "step": 1190
+ },
+ {
+ "epoch": 0.5631205673758866,
+ "grad_norm": 2.7855563163757324,
+ "learning_rate": 4.9079287835865804e-06,
+ "loss": 0.6645,
+ "step": 1191
+ },
+ {
+ "epoch": 0.5635933806146572,
+ "grad_norm": 2.6156625747680664,
+ "learning_rate": 4.9077609691748556e-06,
+ "loss": 0.5751,
+ "step": 1192
+ },
+ {
+ "epoch": 0.5640661938534279,
+ "grad_norm": 3.0475659370422363,
+ "learning_rate": 4.907593004842412e-06,
+ "loss": 0.6739,
+ "step": 1193
+ },
+ {
+ "epoch": 0.5645390070921986,
+ "grad_norm": 2.9176738262176514,
+ "learning_rate": 4.9074248905997104e-06,
+ "loss": 0.6493,
+ "step": 1194
+ },
+ {
+ "epoch": 0.5650118203309693,
+ "grad_norm": 2.6168384552001953,
+ "learning_rate": 4.907256626457216e-06,
+ "loss": 0.6154,
+ "step": 1195
+ },
+ {
+ "epoch": 0.56548463356974,
+ "grad_norm": 2.893980026245117,
+ "learning_rate": 4.907088212425408e-06,
+ "loss": 0.5808,
+ "step": 1196
+ },
+ {
+ "epoch": 0.5659574468085107,
+ "grad_norm": 3.3832836151123047,
+ "learning_rate": 4.90691964851477e-06,
+ "loss": 0.7888,
+ "step": 1197
+ },
+ {
+ "epoch": 0.5664302600472814,
+ "grad_norm": 3.088932752609253,
+ "learning_rate": 4.906750934735801e-06,
+ "loss": 0.6516,
+ "step": 1198
+ },
+ {
+ "epoch": 0.566903073286052,
+ "grad_norm": 2.494471549987793,
+ "learning_rate": 4.906582071099004e-06,
+ "loss": 0.6286,
+ "step": 1199
+ },
+ {
+ "epoch": 0.5673758865248227,
+ "grad_norm": 2.716550588607788,
+ "learning_rate": 4.906413057614895e-06,
+ "loss": 0.5939,
+ "step": 1200
+ },
+ {
+ "epoch": 0.5678486997635934,
+ "grad_norm": 2.5821073055267334,
+ "learning_rate": 4.906243894293995e-06,
+ "loss": 0.6668,
+ "step": 1201
+ },
+ {
+ "epoch": 0.5683215130023641,
+ "grad_norm": 3.651787042617798,
+ "learning_rate": 4.90607458114684e-06,
+ "loss": 0.6124,
+ "step": 1202
+ },
+ {
+ "epoch": 0.5687943262411348,
+ "grad_norm": 2.7567858695983887,
+ "learning_rate": 4.9059051181839705e-06,
+ "loss": 0.6656,
+ "step": 1203
+ },
+ {
+ "epoch": 0.5692671394799055,
+ "grad_norm": 2.8067586421966553,
+ "learning_rate": 4.90573550541594e-06,
+ "loss": 0.6306,
+ "step": 1204
+ },
+ {
+ "epoch": 0.5697399527186762,
+ "grad_norm": 2.6136393547058105,
+ "learning_rate": 4.905565742853307e-06,
+ "loss": 0.5992,
+ "step": 1205
+ },
+ {
+ "epoch": 0.5702127659574469,
+ "grad_norm": 2.899049758911133,
+ "learning_rate": 4.905395830506644e-06,
+ "loss": 0.621,
+ "step": 1206
+ },
+ {
+ "epoch": 0.5706855791962175,
+ "grad_norm": 3.036583185195923,
+ "learning_rate": 4.9052257683865294e-06,
+ "loss": 0.652,
+ "step": 1207
+ },
+ {
+ "epoch": 0.5711583924349882,
+ "grad_norm": 2.7947216033935547,
+ "learning_rate": 4.905055556503553e-06,
+ "loss": 0.6636,
+ "step": 1208
+ },
+ {
+ "epoch": 0.5716312056737589,
+ "grad_norm": 3.1646955013275146,
+ "learning_rate": 4.9048851948683135e-06,
+ "loss": 0.6376,
+ "step": 1209
+ },
+ {
+ "epoch": 0.5721040189125296,
+ "grad_norm": 2.8175766468048096,
+ "learning_rate": 4.904714683491417e-06,
+ "loss": 0.5929,
+ "step": 1210
+ },
+ {
+ "epoch": 0.5725768321513003,
+ "grad_norm": 2.923923969268799,
+ "learning_rate": 4.904544022383483e-06,
+ "loss": 0.6633,
+ "step": 1211
+ },
+ {
+ "epoch": 0.573049645390071,
+ "grad_norm": 2.7471134662628174,
+ "learning_rate": 4.9043732115551356e-06,
+ "loss": 0.6551,
+ "step": 1212
+ },
+ {
+ "epoch": 0.5735224586288417,
+ "grad_norm": 2.8660807609558105,
+ "learning_rate": 4.90420225101701e-06,
+ "loss": 0.6423,
+ "step": 1213
+ },
+ {
+ "epoch": 0.5739952718676123,
+ "grad_norm": 2.769247531890869,
+ "learning_rate": 4.904031140779754e-06,
+ "loss": 0.5982,
+ "step": 1214
+ },
+ {
+ "epoch": 0.574468085106383,
+ "grad_norm": 2.9043145179748535,
+ "learning_rate": 4.90385988085402e-06,
+ "loss": 0.5843,
+ "step": 1215
+ },
+ {
+ "epoch": 0.5749408983451537,
+ "grad_norm": 2.6639609336853027,
+ "learning_rate": 4.903688471250471e-06,
+ "loss": 0.5858,
+ "step": 1216
+ },
+ {
+ "epoch": 0.5754137115839244,
+ "grad_norm": 2.6967573165893555,
+ "learning_rate": 4.903516911979781e-06,
+ "loss": 0.5755,
+ "step": 1217
+ },
+ {
+ "epoch": 0.5758865248226951,
+ "grad_norm": 2.8865857124328613,
+ "learning_rate": 4.903345203052633e-06,
+ "loss": 0.6051,
+ "step": 1218
+ },
+ {
+ "epoch": 0.5763593380614658,
+ "grad_norm": 2.381979465484619,
+ "learning_rate": 4.903173344479717e-06,
+ "loss": 0.5727,
+ "step": 1219
+ },
+ {
+ "epoch": 0.5768321513002365,
+ "grad_norm": 2.7717981338500977,
+ "learning_rate": 4.903001336271734e-06,
+ "loss": 0.6406,
+ "step": 1220
+ },
+ {
+ "epoch": 0.577304964539007,
+ "grad_norm": 2.6431570053100586,
+ "learning_rate": 4.902829178439395e-06,
+ "loss": 0.6226,
+ "step": 1221
+ },
+ {
+ "epoch": 0.5777777777777777,
+ "grad_norm": 2.8090415000915527,
+ "learning_rate": 4.902656870993419e-06,
+ "loss": 0.5761,
+ "step": 1222
+ },
+ {
+ "epoch": 0.5782505910165484,
+ "grad_norm": 2.4769368171691895,
+ "learning_rate": 4.902484413944535e-06,
+ "loss": 0.5602,
+ "step": 1223
+ },
+ {
+ "epoch": 0.5787234042553191,
+ "grad_norm": 2.693316698074341,
+ "learning_rate": 4.902311807303481e-06,
+ "loss": 0.5222,
+ "step": 1224
+ },
+ {
+ "epoch": 0.5791962174940898,
+ "grad_norm": 2.7623913288116455,
+ "learning_rate": 4.902139051081004e-06,
+ "loss": 0.6978,
+ "step": 1225
+ },
+ {
+ "epoch": 0.5796690307328605,
+ "grad_norm": 2.6133766174316406,
+ "learning_rate": 4.901966145287863e-06,
+ "loss": 0.5802,
+ "step": 1226
+ },
+ {
+ "epoch": 0.5801418439716312,
+ "grad_norm": 2.7345972061157227,
+ "learning_rate": 4.901793089934821e-06,
+ "loss": 0.6294,
+ "step": 1227
+ },
+ {
+ "epoch": 0.5806146572104018,
+ "grad_norm": 2.7545835971832275,
+ "learning_rate": 4.9016198850326555e-06,
+ "loss": 0.6085,
+ "step": 1228
+ },
+ {
+ "epoch": 0.5810874704491725,
+ "grad_norm": 2.6947758197784424,
+ "learning_rate": 4.90144653059215e-06,
+ "loss": 0.6025,
+ "step": 1229
+ },
+ {
+ "epoch": 0.5815602836879432,
+ "grad_norm": 2.692967414855957,
+ "learning_rate": 4.901273026624099e-06,
+ "loss": 0.5715,
+ "step": 1230
+ },
+ {
+ "epoch": 0.5820330969267139,
+ "grad_norm": 2.78347110748291,
+ "learning_rate": 4.901099373139307e-06,
+ "loss": 0.6063,
+ "step": 1231
+ },
+ {
+ "epoch": 0.5825059101654846,
+ "grad_norm": 2.346496343612671,
+ "learning_rate": 4.900925570148585e-06,
+ "loss": 0.5869,
+ "step": 1232
+ },
+ {
+ "epoch": 0.5829787234042553,
+ "grad_norm": 2.606639862060547,
+ "learning_rate": 4.900751617662755e-06,
+ "loss": 0.6197,
+ "step": 1233
+ },
+ {
+ "epoch": 0.583451536643026,
+ "grad_norm": 2.5825929641723633,
+ "learning_rate": 4.900577515692649e-06,
+ "loss": 0.6721,
+ "step": 1234
+ },
+ {
+ "epoch": 0.5839243498817966,
+ "grad_norm": 2.731349468231201,
+ "learning_rate": 4.900403264249107e-06,
+ "loss": 0.6273,
+ "step": 1235
+ },
+ {
+ "epoch": 0.5843971631205673,
+ "grad_norm": 3.2133874893188477,
+ "learning_rate": 4.90022886334298e-06,
+ "loss": 0.6231,
+ "step": 1236
+ },
+ {
+ "epoch": 0.584869976359338,
+ "grad_norm": 2.9213852882385254,
+ "learning_rate": 4.900054312985127e-06,
+ "loss": 0.6677,
+ "step": 1237
+ },
+ {
+ "epoch": 0.5853427895981087,
+ "grad_norm": 2.815425157546997,
+ "learning_rate": 4.899879613186414e-06,
+ "loss": 0.6405,
+ "step": 1238
+ },
+ {
+ "epoch": 0.5858156028368794,
+ "grad_norm": 2.730782985687256,
+ "learning_rate": 4.899704763957721e-06,
+ "loss": 0.6233,
+ "step": 1239
+ },
+ {
+ "epoch": 0.5862884160756501,
+ "grad_norm": 2.6432766914367676,
+ "learning_rate": 4.899529765309936e-06,
+ "loss": 0.6267,
+ "step": 1240
+ },
+ {
+ "epoch": 0.5867612293144208,
+ "grad_norm": 2.616215229034424,
+ "learning_rate": 4.899354617253953e-06,
+ "loss": 0.6268,
+ "step": 1241
+ },
+ {
+ "epoch": 0.5872340425531914,
+ "grad_norm": 2.7630255222320557,
+ "learning_rate": 4.899179319800679e-06,
+ "loss": 0.6348,
+ "step": 1242
+ },
+ {
+ "epoch": 0.5877068557919621,
+ "grad_norm": 2.785095453262329,
+ "learning_rate": 4.899003872961029e-06,
+ "loss": 0.5839,
+ "step": 1243
+ },
+ {
+ "epoch": 0.5881796690307328,
+ "grad_norm": 2.9050328731536865,
+ "learning_rate": 4.898828276745927e-06,
+ "loss": 0.651,
+ "step": 1244
+ },
+ {
+ "epoch": 0.5886524822695035,
+ "grad_norm": 2.958092212677002,
+ "learning_rate": 4.8986525311663065e-06,
+ "loss": 0.6395,
+ "step": 1245
+ },
+ {
+ "epoch": 0.5891252955082742,
+ "grad_norm": 2.952310800552368,
+ "learning_rate": 4.898476636233111e-06,
+ "loss": 0.6731,
+ "step": 1246
+ },
+ {
+ "epoch": 0.5895981087470449,
+ "grad_norm": 2.9876346588134766,
+ "learning_rate": 4.898300591957293e-06,
+ "loss": 0.7015,
+ "step": 1247
+ },
+ {
+ "epoch": 0.5900709219858156,
+ "grad_norm": 2.8941752910614014,
+ "learning_rate": 4.898124398349813e-06,
+ "loss": 0.6452,
+ "step": 1248
+ },
+ {
+ "epoch": 0.5905437352245863,
+ "grad_norm": 2.9809536933898926,
+ "learning_rate": 4.897948055421642e-06,
+ "loss": 0.5736,
+ "step": 1249
+ },
+ {
+ "epoch": 0.5910165484633569,
+ "grad_norm": 2.927046775817871,
+ "learning_rate": 4.897771563183761e-06,
+ "loss": 0.5918,
+ "step": 1250
+ },
+ {
+ "epoch": 0.5914893617021276,
+ "grad_norm": 2.865020275115967,
+ "learning_rate": 4.897594921647158e-06,
+ "loss": 0.6924,
+ "step": 1251
+ },
+ {
+ "epoch": 0.5919621749408983,
+ "grad_norm": 2.7406699657440186,
+ "learning_rate": 4.897418130822832e-06,
+ "loss": 0.509,
+ "step": 1252
+ },
+ {
+ "epoch": 0.592434988179669,
+ "grad_norm": 2.781606912612915,
+ "learning_rate": 4.897241190721791e-06,
+ "loss": 0.5555,
+ "step": 1253
+ },
+ {
+ "epoch": 0.5929078014184397,
+ "grad_norm": 2.79209303855896,
+ "learning_rate": 4.8970641013550535e-06,
+ "loss": 0.6722,
+ "step": 1254
+ },
+ {
+ "epoch": 0.5933806146572104,
+ "grad_norm": 3.0672268867492676,
+ "learning_rate": 4.896886862733645e-06,
+ "loss": 0.6366,
+ "step": 1255
+ },
+ {
+ "epoch": 0.5938534278959811,
+ "grad_norm": 2.7456953525543213,
+ "learning_rate": 4.896709474868602e-06,
+ "loss": 0.6246,
+ "step": 1256
+ },
+ {
+ "epoch": 0.5943262411347517,
+ "grad_norm": 3.6731202602386475,
+ "learning_rate": 4.896531937770968e-06,
+ "loss": 0.668,
+ "step": 1257
+ },
+ {
+ "epoch": 0.5947990543735224,
+ "grad_norm": 2.6056087017059326,
+ "learning_rate": 4.8963542514518e-06,
+ "loss": 0.5815,
+ "step": 1258
+ },
+ {
+ "epoch": 0.5952718676122931,
+ "grad_norm": 2.719698905944824,
+ "learning_rate": 4.89617641592216e-06,
+ "loss": 0.6058,
+ "step": 1259
+ },
+ {
+ "epoch": 0.5957446808510638,
+ "grad_norm": 2.625838279724121,
+ "learning_rate": 4.895998431193121e-06,
+ "loss": 0.6143,
+ "step": 1260
+ },
+ {
+ "epoch": 0.5962174940898345,
+ "grad_norm": 2.7166085243225098,
+ "learning_rate": 4.895820297275767e-06,
+ "loss": 0.5187,
+ "step": 1261
+ },
+ {
+ "epoch": 0.5966903073286052,
+ "grad_norm": 2.7544102668762207,
+ "learning_rate": 4.8956420141811875e-06,
+ "loss": 0.5928,
+ "step": 1262
+ },
+ {
+ "epoch": 0.5971631205673759,
+ "grad_norm": 2.6678333282470703,
+ "learning_rate": 4.895463581920484e-06,
+ "loss": 0.611,
+ "step": 1263
+ },
+ {
+ "epoch": 0.5976359338061465,
+ "grad_norm": 2.853384494781494,
+ "learning_rate": 4.895285000504768e-06,
+ "loss": 0.642,
+ "step": 1264
+ },
+ {
+ "epoch": 0.5981087470449172,
+ "grad_norm": 2.637852430343628,
+ "learning_rate": 4.895106269945158e-06,
+ "loss": 0.6308,
+ "step": 1265
+ },
+ {
+ "epoch": 0.5985815602836879,
+ "grad_norm": 2.9880387783050537,
+ "learning_rate": 4.8949273902527826e-06,
+ "loss": 0.5781,
+ "step": 1266
+ },
+ {
+ "epoch": 0.5990543735224586,
+ "grad_norm": 3.5984015464782715,
+ "learning_rate": 4.89474836143878e-06,
+ "loss": 0.5865,
+ "step": 1267
+ },
+ {
+ "epoch": 0.5995271867612293,
+ "grad_norm": 2.719855546951294,
+ "learning_rate": 4.8945691835142975e-06,
+ "loss": 0.6393,
+ "step": 1268
+ },
+ {
+ "epoch": 0.6,
+ "grad_norm": 2.7885141372680664,
+ "learning_rate": 4.894389856490492e-06,
+ "loss": 0.66,
+ "step": 1269
+ },
+ {
+ "epoch": 0.6004728132387707,
+ "grad_norm": 2.698819875717163,
+ "learning_rate": 4.894210380378529e-06,
+ "loss": 0.6144,
+ "step": 1270
+ },
+ {
+ "epoch": 0.6009456264775414,
+ "grad_norm": 2.278045654296875,
+ "learning_rate": 4.894030755189584e-06,
+ "loss": 0.5609,
+ "step": 1271
+ },
+ {
+ "epoch": 0.601418439716312,
+ "grad_norm": 2.8729357719421387,
+ "learning_rate": 4.893850980934841e-06,
+ "loss": 0.6715,
+ "step": 1272
+ },
+ {
+ "epoch": 0.6018912529550827,
+ "grad_norm": 2.8541221618652344,
+ "learning_rate": 4.893671057625495e-06,
+ "loss": 0.6787,
+ "step": 1273
+ },
+ {
+ "epoch": 0.6023640661938534,
+ "grad_norm": 2.4561476707458496,
+ "learning_rate": 4.893490985272748e-06,
+ "loss": 0.6331,
+ "step": 1274
+ },
+ {
+ "epoch": 0.6028368794326241,
+ "grad_norm": 2.565739154815674,
+ "learning_rate": 4.893310763887812e-06,
+ "loss": 0.587,
+ "step": 1275
+ },
+ {
+ "epoch": 0.6033096926713948,
+ "grad_norm": 2.384951591491699,
+ "learning_rate": 4.8931303934819095e-06,
+ "loss": 0.5358,
+ "step": 1276
+ },
+ {
+ "epoch": 0.6037825059101655,
+ "grad_norm": 2.380808115005493,
+ "learning_rate": 4.89294987406627e-06,
+ "loss": 0.5402,
+ "step": 1277
+ },
+ {
+ "epoch": 0.6042553191489362,
+ "grad_norm": 2.764815092086792,
+ "learning_rate": 4.892769205652136e-06,
+ "loss": 0.6103,
+ "step": 1278
+ },
+ {
+ "epoch": 0.6047281323877068,
+ "grad_norm": 2.463350296020508,
+ "learning_rate": 4.892588388250754e-06,
+ "loss": 0.5937,
+ "step": 1279
+ },
+ {
+ "epoch": 0.6052009456264775,
+ "grad_norm": 3.099689245223999,
+ "learning_rate": 4.8924074218733855e-06,
+ "loss": 0.6354,
+ "step": 1280
+ },
+ {
+ "epoch": 0.6056737588652482,
+ "grad_norm": 2.804450035095215,
+ "learning_rate": 4.892226306531297e-06,
+ "loss": 0.6595,
+ "step": 1281
+ },
+ {
+ "epoch": 0.6061465721040189,
+ "grad_norm": 3.1559767723083496,
+ "learning_rate": 4.892045042235765e-06,
+ "loss": 0.6664,
+ "step": 1282
+ },
+ {
+ "epoch": 0.6066193853427896,
+ "grad_norm": 2.844341993331909,
+ "learning_rate": 4.891863628998079e-06,
+ "loss": 0.7454,
+ "step": 1283
+ },
+ {
+ "epoch": 0.6070921985815603,
+ "grad_norm": 2.686602830886841,
+ "learning_rate": 4.891682066829532e-06,
+ "loss": 0.6755,
+ "step": 1284
+ },
+ {
+ "epoch": 0.607565011820331,
+ "grad_norm": 2.736457347869873,
+ "learning_rate": 4.8915003557414285e-06,
+ "loss": 0.6305,
+ "step": 1285
+ },
+ {
+ "epoch": 0.6080378250591016,
+ "grad_norm": 2.661362409591675,
+ "learning_rate": 4.891318495745086e-06,
+ "loss": 0.5958,
+ "step": 1286
+ },
+ {
+ "epoch": 0.6085106382978723,
+ "grad_norm": 2.707348108291626,
+ "learning_rate": 4.8911364868518255e-06,
+ "loss": 0.5824,
+ "step": 1287
+ },
+ {
+ "epoch": 0.608983451536643,
+ "grad_norm": 2.9798858165740967,
+ "learning_rate": 4.890954329072981e-06,
+ "loss": 0.5981,
+ "step": 1288
+ },
+ {
+ "epoch": 0.6094562647754137,
+ "grad_norm": 2.6285455226898193,
+ "learning_rate": 4.890772022419895e-06,
+ "loss": 0.6194,
+ "step": 1289
+ },
+ {
+ "epoch": 0.6099290780141844,
+ "grad_norm": 2.9254322052001953,
+ "learning_rate": 4.890589566903917e-06,
+ "loss": 0.6002,
+ "step": 1290
+ },
+ {
+ "epoch": 0.6104018912529551,
+ "grad_norm": 2.6458325386047363,
+ "learning_rate": 4.89040696253641e-06,
+ "loss": 0.5457,
+ "step": 1291
+ },
+ {
+ "epoch": 0.6108747044917258,
+ "grad_norm": 2.508242607116699,
+ "learning_rate": 4.890224209328743e-06,
+ "loss": 0.6168,
+ "step": 1292
+ },
+ {
+ "epoch": 0.6113475177304964,
+ "grad_norm": 3.034785509109497,
+ "learning_rate": 4.890041307292296e-06,
+ "loss": 0.664,
+ "step": 1293
+ },
+ {
+ "epoch": 0.6118203309692671,
+ "grad_norm": 3.52469539642334,
+ "learning_rate": 4.889858256438455e-06,
+ "loss": 0.7301,
+ "step": 1294
+ },
+ {
+ "epoch": 0.6122931442080378,
+ "grad_norm": 2.9145348072052,
+ "learning_rate": 4.889675056778622e-06,
+ "loss": 0.6494,
+ "step": 1295
+ },
+ {
+ "epoch": 0.6127659574468085,
+ "grad_norm": 2.831829071044922,
+ "learning_rate": 4.8894917083242e-06,
+ "loss": 0.6064,
+ "step": 1296
+ },
+ {
+ "epoch": 0.6132387706855792,
+ "grad_norm": 2.6883130073547363,
+ "learning_rate": 4.889308211086608e-06,
+ "loss": 0.5642,
+ "step": 1297
+ },
+ {
+ "epoch": 0.6137115839243499,
+ "grad_norm": 3.0605485439300537,
+ "learning_rate": 4.889124565077269e-06,
+ "loss": 0.6695,
+ "step": 1298
+ },
+ {
+ "epoch": 0.6141843971631206,
+ "grad_norm": 3.44062876701355,
+ "learning_rate": 4.88894077030762e-06,
+ "loss": 0.6415,
+ "step": 1299
+ },
+ {
+ "epoch": 0.6146572104018913,
+ "grad_norm": 2.5970818996429443,
+ "learning_rate": 4.888756826789105e-06,
+ "loss": 0.6518,
+ "step": 1300
+ },
+ {
+ "epoch": 0.6151300236406619,
+ "grad_norm": 4.2233567237854,
+ "learning_rate": 4.8885727345331755e-06,
+ "loss": 0.6555,
+ "step": 1301
+ },
+ {
+ "epoch": 0.6156028368794326,
+ "grad_norm": 2.645385503768921,
+ "learning_rate": 4.888388493551297e-06,
+ "loss": 0.6762,
+ "step": 1302
+ },
+ {
+ "epoch": 0.6160756501182033,
+ "grad_norm": 2.907954454421997,
+ "learning_rate": 4.8882041038549385e-06,
+ "loss": 0.6526,
+ "step": 1303
+ },
+ {
+ "epoch": 0.616548463356974,
+ "grad_norm": 2.482771873474121,
+ "learning_rate": 4.888019565455583e-06,
+ "loss": 0.628,
+ "step": 1304
+ },
+ {
+ "epoch": 0.6170212765957447,
+ "grad_norm": 2.7165915966033936,
+ "learning_rate": 4.88783487836472e-06,
+ "loss": 0.5743,
+ "step": 1305
+ },
+ {
+ "epoch": 0.6174940898345154,
+ "grad_norm": 3.095627546310425,
+ "learning_rate": 4.88765004259385e-06,
+ "loss": 0.627,
+ "step": 1306
+ },
+ {
+ "epoch": 0.6179669030732861,
+ "grad_norm": 2.5018465518951416,
+ "learning_rate": 4.8874650581544805e-06,
+ "loss": 0.5215,
+ "step": 1307
+ },
+ {
+ "epoch": 0.6184397163120567,
+ "grad_norm": 3.094337224960327,
+ "learning_rate": 4.8872799250581316e-06,
+ "loss": 0.6979,
+ "step": 1308
+ },
+ {
+ "epoch": 0.6189125295508274,
+ "grad_norm": 3.1002209186553955,
+ "learning_rate": 4.887094643316329e-06,
+ "loss": 0.6565,
+ "step": 1309
+ },
+ {
+ "epoch": 0.6193853427895981,
+ "grad_norm": 2.551431894302368,
+ "learning_rate": 4.88690921294061e-06,
+ "loss": 0.5748,
+ "step": 1310
+ },
+ {
+ "epoch": 0.6198581560283688,
+ "grad_norm": 2.8282904624938965,
+ "learning_rate": 4.886723633942521e-06,
+ "loss": 0.676,
+ "step": 1311
+ },
+ {
+ "epoch": 0.6203309692671395,
+ "grad_norm": 2.8887810707092285,
+ "learning_rate": 4.886537906333617e-06,
+ "loss": 0.5971,
+ "step": 1312
+ },
+ {
+ "epoch": 0.6208037825059102,
+ "grad_norm": 2.9989118576049805,
+ "learning_rate": 4.886352030125462e-06,
+ "loss": 0.6341,
+ "step": 1313
+ },
+ {
+ "epoch": 0.6212765957446809,
+ "grad_norm": 2.8042776584625244,
+ "learning_rate": 4.886166005329629e-06,
+ "loss": 0.6578,
+ "step": 1314
+ },
+ {
+ "epoch": 0.6217494089834515,
+ "grad_norm": 2.4980967044830322,
+ "learning_rate": 4.8859798319577026e-06,
+ "loss": 0.6711,
+ "step": 1315
+ },
+ {
+ "epoch": 0.6222222222222222,
+ "grad_norm": 2.762369155883789,
+ "learning_rate": 4.885793510021274e-06,
+ "loss": 0.5747,
+ "step": 1316
+ },
+ {
+ "epoch": 0.6226950354609929,
+ "grad_norm": 3.136327028274536,
+ "learning_rate": 4.885607039531945e-06,
+ "loss": 0.7544,
+ "step": 1317
+ },
+ {
+ "epoch": 0.6231678486997636,
+ "grad_norm": 2.8736963272094727,
+ "learning_rate": 4.885420420501327e-06,
+ "loss": 0.6603,
+ "step": 1318
+ },
+ {
+ "epoch": 0.6236406619385343,
+ "grad_norm": 2.766237497329712,
+ "learning_rate": 4.885233652941039e-06,
+ "loss": 0.581,
+ "step": 1319
+ },
+ {
+ "epoch": 0.624113475177305,
+ "grad_norm": 2.4740939140319824,
+ "learning_rate": 4.88504673686271e-06,
+ "loss": 0.6335,
+ "step": 1320
+ },
+ {
+ "epoch": 0.6245862884160757,
+ "grad_norm": 3.324795961380005,
+ "learning_rate": 4.884859672277978e-06,
+ "loss": 0.6019,
+ "step": 1321
+ },
+ {
+ "epoch": 0.6250591016548463,
+ "grad_norm": 3.521327257156372,
+ "learning_rate": 4.884672459198493e-06,
+ "loss": 0.6104,
+ "step": 1322
+ },
+ {
+ "epoch": 0.625531914893617,
+ "grad_norm": 2.7728071212768555,
+ "learning_rate": 4.884485097635909e-06,
+ "loss": 0.6714,
+ "step": 1323
+ },
+ {
+ "epoch": 0.6260047281323877,
+ "grad_norm": 3.0738155841827393,
+ "learning_rate": 4.884297587601895e-06,
+ "loss": 0.604,
+ "step": 1324
+ },
+ {
+ "epoch": 0.6264775413711584,
+ "grad_norm": 2.719240427017212,
+ "learning_rate": 4.884109929108124e-06,
+ "loss": 0.6795,
+ "step": 1325
+ },
+ {
+ "epoch": 0.6269503546099291,
+ "grad_norm": 2.4108200073242188,
+ "learning_rate": 4.883922122166282e-06,
+ "loss": 0.5846,
+ "step": 1326
+ },
+ {
+ "epoch": 0.6274231678486998,
+ "grad_norm": 2.393899917602539,
+ "learning_rate": 4.883734166788063e-06,
+ "loss": 0.6188,
+ "step": 1327
+ },
+ {
+ "epoch": 0.6278959810874705,
+ "grad_norm": 4.555255889892578,
+ "learning_rate": 4.883546062985169e-06,
+ "loss": 0.5962,
+ "step": 1328
+ },
+ {
+ "epoch": 0.6283687943262412,
+ "grad_norm": 2.571075439453125,
+ "learning_rate": 4.883357810769315e-06,
+ "loss": 0.6165,
+ "step": 1329
+ },
+ {
+ "epoch": 0.6288416075650118,
+ "grad_norm": 2.553115129470825,
+ "learning_rate": 4.8831694101522185e-06,
+ "loss": 0.6787,
+ "step": 1330
+ },
+ {
+ "epoch": 0.6293144208037825,
+ "grad_norm": 3.2564642429351807,
+ "learning_rate": 4.882980861145614e-06,
+ "loss": 0.659,
+ "step": 1331
+ },
+ {
+ "epoch": 0.6297872340425532,
+ "grad_norm": 2.535216808319092,
+ "learning_rate": 4.882792163761241e-06,
+ "loss": 0.6176,
+ "step": 1332
+ },
+ {
+ "epoch": 0.6302600472813239,
+ "grad_norm": 3.097921848297119,
+ "learning_rate": 4.882603318010847e-06,
+ "loss": 0.6822,
+ "step": 1333
+ },
+ {
+ "epoch": 0.6307328605200946,
+ "grad_norm": 2.8135175704956055,
+ "learning_rate": 4.882414323906192e-06,
+ "loss": 0.6782,
+ "step": 1334
+ },
+ {
+ "epoch": 0.6312056737588653,
+ "grad_norm": 2.724634885787964,
+ "learning_rate": 4.882225181459044e-06,
+ "loss": 0.6545,
+ "step": 1335
+ },
+ {
+ "epoch": 0.631678486997636,
+ "grad_norm": 2.9585227966308594,
+ "learning_rate": 4.882035890681179e-06,
+ "loss": 0.6218,
+ "step": 1336
+ },
+ {
+ "epoch": 0.6321513002364066,
+ "grad_norm": 2.6952011585235596,
+ "learning_rate": 4.881846451584385e-06,
+ "loss": 0.6,
+ "step": 1337
+ },
+ {
+ "epoch": 0.6326241134751773,
+ "grad_norm": 3.1400704383850098,
+ "learning_rate": 4.881656864180455e-06,
+ "loss": 0.6687,
+ "step": 1338
+ },
+ {
+ "epoch": 0.633096926713948,
+ "grad_norm": 2.8382487297058105,
+ "learning_rate": 4.881467128481197e-06,
+ "loss": 0.574,
+ "step": 1339
+ },
+ {
+ "epoch": 0.6335697399527187,
+ "grad_norm": 2.8520095348358154,
+ "learning_rate": 4.881277244498422e-06,
+ "loss": 0.6582,
+ "step": 1340
+ },
+ {
+ "epoch": 0.6340425531914894,
+ "grad_norm": 2.703498363494873,
+ "learning_rate": 4.881087212243956e-06,
+ "loss": 0.7224,
+ "step": 1341
+ },
+ {
+ "epoch": 0.6345153664302601,
+ "grad_norm": 3.697205066680908,
+ "learning_rate": 4.880897031729629e-06,
+ "loss": 0.6582,
+ "step": 1342
+ },
+ {
+ "epoch": 0.6349881796690308,
+ "grad_norm": 2.7625808715820312,
+ "learning_rate": 4.880706702967284e-06,
+ "loss": 0.574,
+ "step": 1343
+ },
+ {
+ "epoch": 0.6354609929078014,
+ "grad_norm": 2.949984073638916,
+ "learning_rate": 4.880516225968771e-06,
+ "loss": 0.66,
+ "step": 1344
+ },
+ {
+ "epoch": 0.6359338061465721,
+ "grad_norm": 2.548269748687744,
+ "learning_rate": 4.8803256007459525e-06,
+ "loss": 0.642,
+ "step": 1345
+ },
+ {
+ "epoch": 0.6364066193853428,
+ "grad_norm": 2.5102174282073975,
+ "learning_rate": 4.8801348273106945e-06,
+ "loss": 0.6238,
+ "step": 1346
+ },
+ {
+ "epoch": 0.6368794326241135,
+ "grad_norm": 2.9847946166992188,
+ "learning_rate": 4.8799439056748786e-06,
+ "loss": 0.5416,
+ "step": 1347
+ },
+ {
+ "epoch": 0.6373522458628842,
+ "grad_norm": 2.8711049556732178,
+ "learning_rate": 4.879752835850391e-06,
+ "loss": 0.6427,
+ "step": 1348
+ },
+ {
+ "epoch": 0.6378250591016549,
+ "grad_norm": 2.7901716232299805,
+ "learning_rate": 4.879561617849129e-06,
+ "loss": 0.6026,
+ "step": 1349
+ },
+ {
+ "epoch": 0.6382978723404256,
+ "grad_norm": 2.659778356552124,
+ "learning_rate": 4.879370251682999e-06,
+ "loss": 0.6623,
+ "step": 1350
+ },
+ {
+ "epoch": 0.6387706855791963,
+ "grad_norm": 3.224386692047119,
+ "learning_rate": 4.879178737363917e-06,
+ "loss": 0.6485,
+ "step": 1351
+ },
+ {
+ "epoch": 0.6392434988179669,
+ "grad_norm": 2.6385605335235596,
+ "learning_rate": 4.8789870749038076e-06,
+ "loss": 0.5866,
+ "step": 1352
+ },
+ {
+ "epoch": 0.6397163120567376,
+ "grad_norm": 2.807713270187378,
+ "learning_rate": 4.8787952643146045e-06,
+ "loss": 0.6537,
+ "step": 1353
+ },
+ {
+ "epoch": 0.6401891252955083,
+ "grad_norm": 2.5689280033111572,
+ "learning_rate": 4.878603305608251e-06,
+ "loss": 0.6216,
+ "step": 1354
+ },
+ {
+ "epoch": 0.640661938534279,
+ "grad_norm": 2.7347843647003174,
+ "learning_rate": 4.8784111987967e-06,
+ "loss": 0.6318,
+ "step": 1355
+ },
+ {
+ "epoch": 0.6411347517730497,
+ "grad_norm": 2.5210378170013428,
+ "learning_rate": 4.878218943891911e-06,
+ "loss": 0.5472,
+ "step": 1356
+ },
+ {
+ "epoch": 0.6416075650118204,
+ "grad_norm": 2.866785764694214,
+ "learning_rate": 4.878026540905858e-06,
+ "loss": 0.7108,
+ "step": 1357
+ },
+ {
+ "epoch": 0.642080378250591,
+ "grad_norm": 2.923314332962036,
+ "learning_rate": 4.877833989850519e-06,
+ "loss": 0.5557,
+ "step": 1358
+ },
+ {
+ "epoch": 0.6425531914893617,
+ "grad_norm": 2.925463914871216,
+ "learning_rate": 4.8776412907378845e-06,
+ "loss": 0.6382,
+ "step": 1359
+ },
+ {
+ "epoch": 0.6430260047281324,
+ "grad_norm": 2.909644365310669,
+ "learning_rate": 4.877448443579952e-06,
+ "loss": 0.5603,
+ "step": 1360
+ },
+ {
+ "epoch": 0.6434988179669031,
+ "grad_norm": 3.501148223876953,
+ "learning_rate": 4.8772554483887306e-06,
+ "loss": 0.6722,
+ "step": 1361
+ },
+ {
+ "epoch": 0.6439716312056738,
+ "grad_norm": 2.823765516281128,
+ "learning_rate": 4.877062305176235e-06,
+ "loss": 0.6408,
+ "step": 1362
+ },
+ {
+ "epoch": 0.6444444444444445,
+ "grad_norm": 2.9807584285736084,
+ "learning_rate": 4.8768690139544935e-06,
+ "loss": 0.5984,
+ "step": 1363
+ },
+ {
+ "epoch": 0.6449172576832152,
+ "grad_norm": 2.8411378860473633,
+ "learning_rate": 4.8766755747355405e-06,
+ "loss": 0.6231,
+ "step": 1364
+ },
+ {
+ "epoch": 0.6453900709219859,
+ "grad_norm": 3.158952236175537,
+ "learning_rate": 4.8764819875314215e-06,
+ "loss": 0.6441,
+ "step": 1365
+ },
+ {
+ "epoch": 0.6458628841607565,
+ "grad_norm": 2.9614369869232178,
+ "learning_rate": 4.876288252354189e-06,
+ "loss": 0.6308,
+ "step": 1366
+ },
+ {
+ "epoch": 0.6463356973995272,
+ "grad_norm": 3.073805570602417,
+ "learning_rate": 4.876094369215907e-06,
+ "loss": 0.6046,
+ "step": 1367
+ },
+ {
+ "epoch": 0.6468085106382979,
+ "grad_norm": 2.719189405441284,
+ "learning_rate": 4.875900338128648e-06,
+ "loss": 0.6082,
+ "step": 1368
+ },
+ {
+ "epoch": 0.6472813238770686,
+ "grad_norm": 2.676726818084717,
+ "learning_rate": 4.8757061591044914e-06,
+ "loss": 0.6344,
+ "step": 1369
+ },
+ {
+ "epoch": 0.6477541371158393,
+ "grad_norm": 2.955256938934326,
+ "learning_rate": 4.87551183215553e-06,
+ "loss": 0.6506,
+ "step": 1370
+ },
+ {
+ "epoch": 0.64822695035461,
+ "grad_norm": 2.5672218799591064,
+ "learning_rate": 4.875317357293864e-06,
+ "loss": 0.5284,
+ "step": 1371
+ },
+ {
+ "epoch": 0.6486997635933807,
+ "grad_norm": 2.5860238075256348,
+ "learning_rate": 4.875122734531602e-06,
+ "loss": 0.667,
+ "step": 1372
+ },
+ {
+ "epoch": 0.6491725768321513,
+ "grad_norm": 3.1037003993988037,
+ "learning_rate": 4.8749279638808605e-06,
+ "loss": 0.6902,
+ "step": 1373
+ },
+ {
+ "epoch": 0.649645390070922,
+ "grad_norm": 2.7715282440185547,
+ "learning_rate": 4.874733045353769e-06,
+ "loss": 0.6291,
+ "step": 1374
+ },
+ {
+ "epoch": 0.6501182033096927,
+ "grad_norm": 2.527071475982666,
+ "learning_rate": 4.874537978962463e-06,
+ "loss": 0.5565,
+ "step": 1375
+ },
+ {
+ "epoch": 0.6505910165484634,
+ "grad_norm": 2.722092628479004,
+ "learning_rate": 4.874342764719091e-06,
+ "loss": 0.5724,
+ "step": 1376
+ },
+ {
+ "epoch": 0.6510638297872341,
+ "grad_norm": 2.6342411041259766,
+ "learning_rate": 4.874147402635805e-06,
+ "loss": 0.6308,
+ "step": 1377
+ },
+ {
+ "epoch": 0.6515366430260048,
+ "grad_norm": 2.3850719928741455,
+ "learning_rate": 4.8739518927247695e-06,
+ "loss": 0.5692,
+ "step": 1378
+ },
+ {
+ "epoch": 0.6520094562647755,
+ "grad_norm": 2.9787259101867676,
+ "learning_rate": 4.873756234998161e-06,
+ "loss": 0.6953,
+ "step": 1379
+ },
+ {
+ "epoch": 0.6524822695035462,
+ "grad_norm": 2.634141683578491,
+ "learning_rate": 4.873560429468159e-06,
+ "loss": 0.6077,
+ "step": 1380
+ },
+ {
+ "epoch": 0.6529550827423168,
+ "grad_norm": 2.803046941757202,
+ "learning_rate": 4.873364476146958e-06,
+ "loss": 0.6657,
+ "step": 1381
+ },
+ {
+ "epoch": 0.6534278959810875,
+ "grad_norm": 2.762827157974243,
+ "learning_rate": 4.8731683750467574e-06,
+ "loss": 0.6061,
+ "step": 1382
+ },
+ {
+ "epoch": 0.6539007092198581,
+ "grad_norm": 2.6654391288757324,
+ "learning_rate": 4.872972126179768e-06,
+ "loss": 0.6387,
+ "step": 1383
+ },
+ {
+ "epoch": 0.6543735224586288,
+ "grad_norm": 2.4363625049591064,
+ "learning_rate": 4.872775729558209e-06,
+ "loss": 0.5623,
+ "step": 1384
+ },
+ {
+ "epoch": 0.6548463356973995,
+ "grad_norm": 2.528959035873413,
+ "learning_rate": 4.87257918519431e-06,
+ "loss": 0.5609,
+ "step": 1385
+ },
+ {
+ "epoch": 0.6553191489361702,
+ "grad_norm": 2.718383312225342,
+ "learning_rate": 4.872382493100309e-06,
+ "loss": 0.5575,
+ "step": 1386
+ },
+ {
+ "epoch": 0.6557919621749408,
+ "grad_norm": 2.660841226577759,
+ "learning_rate": 4.872185653288453e-06,
+ "loss": 0.6106,
+ "step": 1387
+ },
+ {
+ "epoch": 0.6562647754137115,
+ "grad_norm": 2.508753538131714,
+ "learning_rate": 4.871988665770997e-06,
+ "loss": 0.5705,
+ "step": 1388
+ },
+ {
+ "epoch": 0.6567375886524822,
+ "grad_norm": 2.5134334564208984,
+ "learning_rate": 4.871791530560208e-06,
+ "loss": 0.5592,
+ "step": 1389
+ },
+ {
+ "epoch": 0.6572104018912529,
+ "grad_norm": 2.7475597858428955,
+ "learning_rate": 4.871594247668361e-06,
+ "loss": 0.6277,
+ "step": 1390
+ },
+ {
+ "epoch": 0.6576832151300236,
+ "grad_norm": 2.793616533279419,
+ "learning_rate": 4.871396817107739e-06,
+ "loss": 0.595,
+ "step": 1391
+ },
+ {
+ "epoch": 0.6581560283687943,
+ "grad_norm": 2.8285086154937744,
+ "learning_rate": 4.871199238890635e-06,
+ "loss": 0.6094,
+ "step": 1392
+ },
+ {
+ "epoch": 0.658628841607565,
+ "grad_norm": 2.74124813079834,
+ "learning_rate": 4.871001513029352e-06,
+ "loss": 0.6296,
+ "step": 1393
+ },
+ {
+ "epoch": 0.6591016548463356,
+ "grad_norm": 2.761237621307373,
+ "learning_rate": 4.870803639536202e-06,
+ "loss": 0.5702,
+ "step": 1394
+ },
+ {
+ "epoch": 0.6595744680851063,
+ "grad_norm": 2.761038064956665,
+ "learning_rate": 4.870605618423504e-06,
+ "loss": 0.6195,
+ "step": 1395
+ },
+ {
+ "epoch": 0.660047281323877,
+ "grad_norm": 2.8812482357025146,
+ "learning_rate": 4.870407449703589e-06,
+ "loss": 0.616,
+ "step": 1396
+ },
+ {
+ "epoch": 0.6605200945626477,
+ "grad_norm": 2.9966578483581543,
+ "learning_rate": 4.870209133388797e-06,
+ "loss": 0.6547,
+ "step": 1397
+ },
+ {
+ "epoch": 0.6609929078014184,
+ "grad_norm": 2.7969017028808594,
+ "learning_rate": 4.870010669491474e-06,
+ "loss": 0.5762,
+ "step": 1398
+ },
+ {
+ "epoch": 0.6614657210401891,
+ "grad_norm": 2.557783842086792,
+ "learning_rate": 4.86981205802398e-06,
+ "loss": 0.6184,
+ "step": 1399
+ },
+ {
+ "epoch": 0.6619385342789598,
+ "grad_norm": 2.5393927097320557,
+ "learning_rate": 4.86961329899868e-06,
+ "loss": 0.5953,
+ "step": 1400
+ },
+ {
+ "epoch": 0.6624113475177305,
+ "grad_norm": 2.7745981216430664,
+ "learning_rate": 4.86941439242795e-06,
+ "loss": 0.5967,
+ "step": 1401
+ },
+ {
+ "epoch": 0.6628841607565011,
+ "grad_norm": 2.650381326675415,
+ "learning_rate": 4.869215338324176e-06,
+ "loss": 0.5667,
+ "step": 1402
+ },
+ {
+ "epoch": 0.6633569739952718,
+ "grad_norm": 2.583169937133789,
+ "learning_rate": 4.869016136699751e-06,
+ "loss": 0.549,
+ "step": 1403
+ },
+ {
+ "epoch": 0.6638297872340425,
+ "grad_norm": 2.984978437423706,
+ "learning_rate": 4.868816787567079e-06,
+ "loss": 0.5931,
+ "step": 1404
+ },
+ {
+ "epoch": 0.6643026004728132,
+ "grad_norm": 3.1947181224823,
+ "learning_rate": 4.868617290938573e-06,
+ "loss": 0.5473,
+ "step": 1405
+ },
+ {
+ "epoch": 0.6647754137115839,
+ "grad_norm": 2.562927007675171,
+ "learning_rate": 4.868417646826654e-06,
+ "loss": 0.6878,
+ "step": 1406
+ },
+ {
+ "epoch": 0.6652482269503546,
+ "grad_norm": 2.8741261959075928,
+ "learning_rate": 4.868217855243754e-06,
+ "loss": 0.6312,
+ "step": 1407
+ },
+ {
+ "epoch": 0.6657210401891253,
+ "grad_norm": 2.9834797382354736,
+ "learning_rate": 4.868017916202312e-06,
+ "loss": 0.5624,
+ "step": 1408
+ },
+ {
+ "epoch": 0.6661938534278959,
+ "grad_norm": 2.6935982704162598,
+ "learning_rate": 4.8678178297147785e-06,
+ "loss": 0.5857,
+ "step": 1409
+ },
+ {
+ "epoch": 0.6666666666666666,
+ "grad_norm": 2.8200576305389404,
+ "learning_rate": 4.86761759579361e-06,
+ "loss": 0.6153,
+ "step": 1410
+ },
+ {
+ "epoch": 0.6671394799054373,
+ "grad_norm": 2.831425189971924,
+ "learning_rate": 4.867417214451276e-06,
+ "loss": 0.6495,
+ "step": 1411
+ },
+ {
+ "epoch": 0.667612293144208,
+ "grad_norm": 2.733565092086792,
+ "learning_rate": 4.867216685700253e-06,
+ "loss": 0.6036,
+ "step": 1412
+ },
+ {
+ "epoch": 0.6680851063829787,
+ "grad_norm": 3.0609400272369385,
+ "learning_rate": 4.867016009553027e-06,
+ "loss": 0.6773,
+ "step": 1413
+ },
+ {
+ "epoch": 0.6685579196217494,
+ "grad_norm": 2.665452241897583,
+ "learning_rate": 4.866815186022093e-06,
+ "loss": 0.6256,
+ "step": 1414
+ },
+ {
+ "epoch": 0.6690307328605201,
+ "grad_norm": 2.9480721950531006,
+ "learning_rate": 4.866614215119956e-06,
+ "loss": 0.535,
+ "step": 1415
+ },
+ {
+ "epoch": 0.6695035460992907,
+ "grad_norm": 2.5514180660247803,
+ "learning_rate": 4.866413096859128e-06,
+ "loss": 0.6588,
+ "step": 1416
+ },
+ {
+ "epoch": 0.6699763593380614,
+ "grad_norm": 3.3442373275756836,
+ "learning_rate": 4.866211831252134e-06,
+ "loss": 0.5754,
+ "step": 1417
+ },
+ {
+ "epoch": 0.6704491725768321,
+ "grad_norm": 2.521467685699463,
+ "learning_rate": 4.866010418311504e-06,
+ "loss": 0.5546,
+ "step": 1418
+ },
+ {
+ "epoch": 0.6709219858156028,
+ "grad_norm": 2.930706262588501,
+ "learning_rate": 4.865808858049781e-06,
+ "loss": 0.589,
+ "step": 1419
+ },
+ {
+ "epoch": 0.6713947990543735,
+ "grad_norm": 2.6298375129699707,
+ "learning_rate": 4.865607150479513e-06,
+ "loss": 0.5915,
+ "step": 1420
+ },
+ {
+ "epoch": 0.6718676122931442,
+ "grad_norm": 2.9554293155670166,
+ "learning_rate": 4.8654052956132615e-06,
+ "loss": 0.6654,
+ "step": 1421
+ },
+ {
+ "epoch": 0.6723404255319149,
+ "grad_norm": 3.2706902027130127,
+ "learning_rate": 4.865203293463593e-06,
+ "loss": 0.7115,
+ "step": 1422
+ },
+ {
+ "epoch": 0.6728132387706856,
+ "grad_norm": 3.041539430618286,
+ "learning_rate": 4.865001144043088e-06,
+ "loss": 0.5818,
+ "step": 1423
+ },
+ {
+ "epoch": 0.6732860520094562,
+ "grad_norm": 3.1314544677734375,
+ "learning_rate": 4.864798847364331e-06,
+ "loss": 0.5822,
+ "step": 1424
+ },
+ {
+ "epoch": 0.6737588652482269,
+ "grad_norm": 2.5301461219787598,
+ "learning_rate": 4.86459640343992e-06,
+ "loss": 0.5525,
+ "step": 1425
+ },
+ {
+ "epoch": 0.6742316784869976,
+ "grad_norm": 2.809295892715454,
+ "learning_rate": 4.864393812282458e-06,
+ "loss": 0.6768,
+ "step": 1426
+ },
+ {
+ "epoch": 0.6747044917257683,
+ "grad_norm": 2.794664144515991,
+ "learning_rate": 4.864191073904562e-06,
+ "loss": 0.5793,
+ "step": 1427
+ },
+ {
+ "epoch": 0.675177304964539,
+ "grad_norm": 2.7771105766296387,
+ "learning_rate": 4.863988188318854e-06,
+ "loss": 0.6453,
+ "step": 1428
+ },
+ {
+ "epoch": 0.6756501182033097,
+ "grad_norm": 2.6431946754455566,
+ "learning_rate": 4.863785155537967e-06,
+ "loss": 0.5877,
+ "step": 1429
+ },
+ {
+ "epoch": 0.6761229314420804,
+ "grad_norm": 2.951353073120117,
+ "learning_rate": 4.863581975574544e-06,
+ "loss": 0.6793,
+ "step": 1430
+ },
+ {
+ "epoch": 0.676595744680851,
+ "grad_norm": 3.1336071491241455,
+ "learning_rate": 4.863378648441235e-06,
+ "loss": 0.6695,
+ "step": 1431
+ },
+ {
+ "epoch": 0.6770685579196217,
+ "grad_norm": 2.735982656478882,
+ "learning_rate": 4.8631751741507e-06,
+ "loss": 0.5239,
+ "step": 1432
+ },
+ {
+ "epoch": 0.6775413711583924,
+ "grad_norm": 2.7085206508636475,
+ "learning_rate": 4.862971552715611e-06,
+ "loss": 0.6837,
+ "step": 1433
+ },
+ {
+ "epoch": 0.6780141843971631,
+ "grad_norm": 3.136528730392456,
+ "learning_rate": 4.8627677841486436e-06,
+ "loss": 0.683,
+ "step": 1434
+ },
+ {
+ "epoch": 0.6784869976359338,
+ "grad_norm": 2.7879369258880615,
+ "learning_rate": 4.862563868462486e-06,
+ "loss": 0.608,
+ "step": 1435
+ },
+ {
+ "epoch": 0.6789598108747045,
+ "grad_norm": 2.7937729358673096,
+ "learning_rate": 4.862359805669837e-06,
+ "loss": 0.6131,
+ "step": 1436
+ },
+ {
+ "epoch": 0.6794326241134752,
+ "grad_norm": 2.5988364219665527,
+ "learning_rate": 4.862155595783401e-06,
+ "loss": 0.6303,
+ "step": 1437
+ },
+ {
+ "epoch": 0.6799054373522458,
+ "grad_norm": 3.251070499420166,
+ "learning_rate": 4.861951238815894e-06,
+ "loss": 0.7246,
+ "step": 1438
+ },
+ {
+ "epoch": 0.6803782505910165,
+ "grad_norm": 2.646759271621704,
+ "learning_rate": 4.861746734780039e-06,
+ "loss": 0.6313,
+ "step": 1439
+ },
+ {
+ "epoch": 0.6808510638297872,
+ "grad_norm": 2.773866891860962,
+ "learning_rate": 4.861542083688573e-06,
+ "loss": 0.6463,
+ "step": 1440
+ },
+ {
+ "epoch": 0.6813238770685579,
+ "grad_norm": 2.759965658187866,
+ "learning_rate": 4.861337285554235e-06,
+ "loss": 0.5428,
+ "step": 1441
+ },
+ {
+ "epoch": 0.6817966903073286,
+ "grad_norm": 3.3250818252563477,
+ "learning_rate": 4.861132340389779e-06,
+ "loss": 0.6522,
+ "step": 1442
+ },
+ {
+ "epoch": 0.6822695035460993,
+ "grad_norm": 2.661797523498535,
+ "learning_rate": 4.860927248207965e-06,
+ "loss": 0.5871,
+ "step": 1443
+ },
+ {
+ "epoch": 0.68274231678487,
+ "grad_norm": 2.706289052963257,
+ "learning_rate": 4.860722009021563e-06,
+ "loss": 0.6651,
+ "step": 1444
+ },
+ {
+ "epoch": 0.6832151300236406,
+ "grad_norm": 2.8459298610687256,
+ "learning_rate": 4.860516622843354e-06,
+ "loss": 0.5827,
+ "step": 1445
+ },
+ {
+ "epoch": 0.6836879432624113,
+ "grad_norm": 3.1041831970214844,
+ "learning_rate": 4.860311089686125e-06,
+ "loss": 0.6727,
+ "step": 1446
+ },
+ {
+ "epoch": 0.684160756501182,
+ "grad_norm": 2.9382801055908203,
+ "learning_rate": 4.8601054095626746e-06,
+ "loss": 0.6002,
+ "step": 1447
+ },
+ {
+ "epoch": 0.6846335697399527,
+ "grad_norm": 2.782475471496582,
+ "learning_rate": 4.859899582485808e-06,
+ "loss": 0.6951,
+ "step": 1448
+ },
+ {
+ "epoch": 0.6851063829787234,
+ "grad_norm": 3.313894510269165,
+ "learning_rate": 4.859693608468343e-06,
+ "loss": 0.6363,
+ "step": 1449
+ },
+ {
+ "epoch": 0.6855791962174941,
+ "grad_norm": 3.1639695167541504,
+ "learning_rate": 4.8594874875231045e-06,
+ "loss": 0.7002,
+ "step": 1450
+ },
+ {
+ "epoch": 0.6860520094562648,
+ "grad_norm": 2.6762218475341797,
+ "learning_rate": 4.859281219662926e-06,
+ "loss": 0.6246,
+ "step": 1451
+ },
+ {
+ "epoch": 0.6865248226950355,
+ "grad_norm": 2.8368663787841797,
+ "learning_rate": 4.85907480490065e-06,
+ "loss": 0.5906,
+ "step": 1452
+ },
+ {
+ "epoch": 0.6869976359338061,
+ "grad_norm": 2.887373208999634,
+ "learning_rate": 4.858868243249131e-06,
+ "loss": 0.5931,
+ "step": 1453
+ },
+ {
+ "epoch": 0.6874704491725768,
+ "grad_norm": 2.8115322589874268,
+ "learning_rate": 4.858661534721229e-06,
+ "loss": 0.6337,
+ "step": 1454
+ },
+ {
+ "epoch": 0.6879432624113475,
+ "grad_norm": 2.8470499515533447,
+ "learning_rate": 4.8584546793298174e-06,
+ "loss": 0.632,
+ "step": 1455
+ },
+ {
+ "epoch": 0.6884160756501182,
+ "grad_norm": 2.8229613304138184,
+ "learning_rate": 4.8582476770877725e-06,
+ "loss": 0.6494,
+ "step": 1456
+ },
+ {
+ "epoch": 0.6888888888888889,
+ "grad_norm": 2.4235479831695557,
+ "learning_rate": 4.858040528007987e-06,
+ "loss": 0.5709,
+ "step": 1457
+ },
+ {
+ "epoch": 0.6893617021276596,
+ "grad_norm": 2.9348199367523193,
+ "learning_rate": 4.857833232103356e-06,
+ "loss": 0.5404,
+ "step": 1458
+ },
+ {
+ "epoch": 0.6898345153664303,
+ "grad_norm": 2.8274219036102295,
+ "learning_rate": 4.857625789386789e-06,
+ "loss": 0.701,
+ "step": 1459
+ },
+ {
+ "epoch": 0.6903073286052009,
+ "grad_norm": 3.136929988861084,
+ "learning_rate": 4.857418199871203e-06,
+ "loss": 0.6971,
+ "step": 1460
+ },
+ {
+ "epoch": 0.6907801418439716,
+ "grad_norm": 2.8987185955047607,
+ "learning_rate": 4.8572104635695214e-06,
+ "loss": 0.6613,
+ "step": 1461
+ },
+ {
+ "epoch": 0.6912529550827423,
+ "grad_norm": 2.5073442459106445,
+ "learning_rate": 4.857002580494681e-06,
+ "loss": 0.6032,
+ "step": 1462
+ },
+ {
+ "epoch": 0.691725768321513,
+ "grad_norm": 2.7019522190093994,
+ "learning_rate": 4.856794550659625e-06,
+ "loss": 0.567,
+ "step": 1463
+ },
+ {
+ "epoch": 0.6921985815602837,
+ "grad_norm": 2.4795594215393066,
+ "learning_rate": 4.8565863740773054e-06,
+ "loss": 0.5777,
+ "step": 1464
+ },
+ {
+ "epoch": 0.6926713947990544,
+ "grad_norm": 3.032506227493286,
+ "learning_rate": 4.856378050760687e-06,
+ "loss": 0.607,
+ "step": 1465
+ },
+ {
+ "epoch": 0.6931442080378251,
+ "grad_norm": 3.052091121673584,
+ "learning_rate": 4.85616958072274e-06,
+ "loss": 0.591,
+ "step": 1466
+ },
+ {
+ "epoch": 0.6936170212765957,
+ "grad_norm": 2.704831838607788,
+ "learning_rate": 4.855960963976443e-06,
+ "loss": 0.6528,
+ "step": 1467
+ },
+ {
+ "epoch": 0.6940898345153664,
+ "grad_norm": 2.680995225906372,
+ "learning_rate": 4.855752200534788e-06,
+ "loss": 0.6294,
+ "step": 1468
+ },
+ {
+ "epoch": 0.6945626477541371,
+ "grad_norm": 2.3948659896850586,
+ "learning_rate": 4.855543290410774e-06,
+ "loss": 0.6091,
+ "step": 1469
+ },
+ {
+ "epoch": 0.6950354609929078,
+ "grad_norm": 2.6407411098480225,
+ "learning_rate": 4.855334233617407e-06,
+ "loss": 0.5572,
+ "step": 1470
+ },
+ {
+ "epoch": 0.6955082742316785,
+ "grad_norm": 2.5526835918426514,
+ "learning_rate": 4.8551250301677064e-06,
+ "loss": 0.5432,
+ "step": 1471
+ },
+ {
+ "epoch": 0.6959810874704492,
+ "grad_norm": 3.1237430572509766,
+ "learning_rate": 4.8549156800746965e-06,
+ "loss": 0.5944,
+ "step": 1472
+ },
+ {
+ "epoch": 0.6964539007092199,
+ "grad_norm": 2.8112540245056152,
+ "learning_rate": 4.854706183351412e-06,
+ "loss": 0.604,
+ "step": 1473
+ },
+ {
+ "epoch": 0.6969267139479906,
+ "grad_norm": 2.664644479751587,
+ "learning_rate": 4.8544965400109e-06,
+ "loss": 0.5647,
+ "step": 1474
+ },
+ {
+ "epoch": 0.6973995271867612,
+ "grad_norm": 3.26310133934021,
+ "learning_rate": 4.854286750066212e-06,
+ "loss": 0.6999,
+ "step": 1475
+ },
+ {
+ "epoch": 0.6978723404255319,
+ "grad_norm": 2.9717442989349365,
+ "learning_rate": 4.8540768135304115e-06,
+ "loss": 0.6655,
+ "step": 1476
+ },
+ {
+ "epoch": 0.6983451536643026,
+ "grad_norm": 2.5302982330322266,
+ "learning_rate": 4.85386673041657e-06,
+ "loss": 0.6384,
+ "step": 1477
+ },
+ {
+ "epoch": 0.6988179669030733,
+ "grad_norm": 2.864877700805664,
+ "learning_rate": 4.853656500737769e-06,
+ "loss": 0.6834,
+ "step": 1478
+ },
+ {
+ "epoch": 0.699290780141844,
+ "grad_norm": 2.5522031784057617,
+ "learning_rate": 4.853446124507098e-06,
+ "loss": 0.5929,
+ "step": 1479
+ },
+ {
+ "epoch": 0.6997635933806147,
+ "grad_norm": 3.096477746963501,
+ "learning_rate": 4.853235601737656e-06,
+ "loss": 0.5737,
+ "step": 1480
+ },
+ {
+ "epoch": 0.7002364066193854,
+ "grad_norm": 2.884779214859009,
+ "learning_rate": 4.853024932442552e-06,
+ "loss": 0.6362,
+ "step": 1481
+ },
+ {
+ "epoch": 0.700709219858156,
+ "grad_norm": 3.368558406829834,
+ "learning_rate": 4.852814116634903e-06,
+ "loss": 0.6721,
+ "step": 1482
+ },
+ {
+ "epoch": 0.7011820330969267,
+ "grad_norm": 2.742414951324463,
+ "learning_rate": 4.852603154327837e-06,
+ "loss": 0.6212,
+ "step": 1483
+ },
+ {
+ "epoch": 0.7016548463356974,
+ "grad_norm": 2.53454852104187,
+ "learning_rate": 4.8523920455344864e-06,
+ "loss": 0.6675,
+ "step": 1484
+ },
+ {
+ "epoch": 0.7021276595744681,
+ "grad_norm": 2.9354238510131836,
+ "learning_rate": 4.852180790267999e-06,
+ "loss": 0.6692,
+ "step": 1485
+ },
+ {
+ "epoch": 0.7026004728132388,
+ "grad_norm": 2.585070848464966,
+ "learning_rate": 4.8519693885415274e-06,
+ "loss": 0.6215,
+ "step": 1486
+ },
+ {
+ "epoch": 0.7030732860520095,
+ "grad_norm": 2.9047999382019043,
+ "learning_rate": 4.851757840368235e-06,
+ "loss": 0.6231,
+ "step": 1487
+ },
+ {
+ "epoch": 0.7035460992907802,
+ "grad_norm": 3.0930933952331543,
+ "learning_rate": 4.851546145761295e-06,
+ "loss": 0.7267,
+ "step": 1488
+ },
+ {
+ "epoch": 0.7040189125295508,
+ "grad_norm": 3.0224719047546387,
+ "learning_rate": 4.8513343047338875e-06,
+ "loss": 0.6293,
+ "step": 1489
+ },
+ {
+ "epoch": 0.7044917257683215,
+ "grad_norm": 2.5758471488952637,
+ "learning_rate": 4.851122317299203e-06,
+ "loss": 0.5855,
+ "step": 1490
+ },
+ {
+ "epoch": 0.7049645390070922,
+ "grad_norm": 2.579272508621216,
+ "learning_rate": 4.850910183470441e-06,
+ "loss": 0.582,
+ "step": 1491
+ },
+ {
+ "epoch": 0.7054373522458629,
+ "grad_norm": 2.8148300647735596,
+ "learning_rate": 4.85069790326081e-06,
+ "loss": 0.6396,
+ "step": 1492
+ },
+ {
+ "epoch": 0.7059101654846336,
+ "grad_norm": 2.6380527019500732,
+ "learning_rate": 4.850485476683528e-06,
+ "loss": 0.6114,
+ "step": 1493
+ },
+ {
+ "epoch": 0.7063829787234043,
+ "grad_norm": 2.7736263275146484,
+ "learning_rate": 4.850272903751823e-06,
+ "loss": 0.6683,
+ "step": 1494
+ },
+ {
+ "epoch": 0.706855791962175,
+ "grad_norm": 3.1958179473876953,
+ "learning_rate": 4.8500601844789285e-06,
+ "loss": 0.6265,
+ "step": 1495
+ },
+ {
+ "epoch": 0.7073286052009456,
+ "grad_norm": 3.783212423324585,
+ "learning_rate": 4.8498473188780916e-06,
+ "loss": 0.6078,
+ "step": 1496
+ },
+ {
+ "epoch": 0.7078014184397163,
+ "grad_norm": 2.6656646728515625,
+ "learning_rate": 4.849634306962566e-06,
+ "loss": 0.5756,
+ "step": 1497
+ },
+ {
+ "epoch": 0.708274231678487,
+ "grad_norm": 2.757141590118408,
+ "learning_rate": 4.849421148745615e-06,
+ "loss": 0.5596,
+ "step": 1498
+ },
+ {
+ "epoch": 0.7087470449172577,
+ "grad_norm": 3.0391886234283447,
+ "learning_rate": 4.849207844240511e-06,
+ "loss": 0.5293,
+ "step": 1499
+ },
+ {
+ "epoch": 0.7092198581560284,
+ "grad_norm": 2.981912851333618,
+ "learning_rate": 4.848994393460535e-06,
+ "loss": 0.598,
+ "step": 1500
+ },
+ {
+ "epoch": 0.7096926713947991,
+ "grad_norm": 2.5470798015594482,
+ "learning_rate": 4.848780796418978e-06,
+ "loss": 0.6266,
+ "step": 1501
+ },
+ {
+ "epoch": 0.7101654846335698,
+ "grad_norm": 2.8394415378570557,
+ "learning_rate": 4.8485670531291415e-06,
+ "loss": 0.6844,
+ "step": 1502
+ },
+ {
+ "epoch": 0.7106382978723405,
+ "grad_norm": 3.2023508548736572,
+ "learning_rate": 4.848353163604331e-06,
+ "loss": 0.6134,
+ "step": 1503
+ },
+ {
+ "epoch": 0.7111111111111111,
+ "grad_norm": 2.98245906829834,
+ "learning_rate": 4.848139127857867e-06,
+ "loss": 0.7084,
+ "step": 1504
+ },
+ {
+ "epoch": 0.7115839243498818,
+ "grad_norm": 2.5917441844940186,
+ "learning_rate": 4.847924945903076e-06,
+ "loss": 0.5676,
+ "step": 1505
+ },
+ {
+ "epoch": 0.7120567375886525,
+ "grad_norm": 2.8736681938171387,
+ "learning_rate": 4.847710617753294e-06,
+ "loss": 0.6304,
+ "step": 1506
+ },
+ {
+ "epoch": 0.7125295508274232,
+ "grad_norm": 2.7832682132720947,
+ "learning_rate": 4.847496143421866e-06,
+ "loss": 0.5705,
+ "step": 1507
+ },
+ {
+ "epoch": 0.7130023640661939,
+ "grad_norm": 2.480560779571533,
+ "learning_rate": 4.847281522922147e-06,
+ "loss": 0.5595,
+ "step": 1508
+ },
+ {
+ "epoch": 0.7134751773049646,
+ "grad_norm": 2.357675313949585,
+ "learning_rate": 4.847066756267499e-06,
+ "loss": 0.5065,
+ "step": 1509
+ },
+ {
+ "epoch": 0.7139479905437353,
+ "grad_norm": 2.632669448852539,
+ "learning_rate": 4.846851843471296e-06,
+ "loss": 0.6949,
+ "step": 1510
+ },
+ {
+ "epoch": 0.7144208037825059,
+ "grad_norm": 2.7691073417663574,
+ "learning_rate": 4.84663678454692e-06,
+ "loss": 0.6638,
+ "step": 1511
+ },
+ {
+ "epoch": 0.7148936170212766,
+ "grad_norm": 2.5647685527801514,
+ "learning_rate": 4.846421579507761e-06,
+ "loss": 0.6098,
+ "step": 1512
+ },
+ {
+ "epoch": 0.7153664302600473,
+ "grad_norm": 2.476701021194458,
+ "learning_rate": 4.846206228367218e-06,
+ "loss": 0.592,
+ "step": 1513
+ },
+ {
+ "epoch": 0.715839243498818,
+ "grad_norm": 2.805727958679199,
+ "learning_rate": 4.845990731138702e-06,
+ "loss": 0.5466,
+ "step": 1514
+ },
+ {
+ "epoch": 0.7163120567375887,
+ "grad_norm": 2.551392078399658,
+ "learning_rate": 4.84577508783563e-06,
+ "loss": 0.6039,
+ "step": 1515
+ },
+ {
+ "epoch": 0.7167848699763594,
+ "grad_norm": 2.6861350536346436,
+ "learning_rate": 4.845559298471429e-06,
+ "loss": 0.6427,
+ "step": 1516
+ },
+ {
+ "epoch": 0.7172576832151301,
+ "grad_norm": 3.1908371448516846,
+ "learning_rate": 4.845343363059535e-06,
+ "loss": 0.5447,
+ "step": 1517
+ },
+ {
+ "epoch": 0.7177304964539007,
+ "grad_norm": 2.9021761417388916,
+ "learning_rate": 4.845127281613394e-06,
+ "loss": 0.5836,
+ "step": 1518
+ },
+ {
+ "epoch": 0.7182033096926714,
+ "grad_norm": 2.476670742034912,
+ "learning_rate": 4.844911054146461e-06,
+ "loss": 0.5863,
+ "step": 1519
+ },
+ {
+ "epoch": 0.7186761229314421,
+ "grad_norm": 2.662935495376587,
+ "learning_rate": 4.844694680672198e-06,
+ "loss": 0.5678,
+ "step": 1520
+ },
+ {
+ "epoch": 0.7191489361702128,
+ "grad_norm": 2.677896738052368,
+ "learning_rate": 4.844478161204079e-06,
+ "loss": 0.6195,
+ "step": 1521
+ },
+ {
+ "epoch": 0.7196217494089835,
+ "grad_norm": 2.781921863555908,
+ "learning_rate": 4.844261495755585e-06,
+ "loss": 0.643,
+ "step": 1522
+ },
+ {
+ "epoch": 0.7200945626477542,
+ "grad_norm": 3.0157392024993896,
+ "learning_rate": 4.844044684340206e-06,
+ "loss": 0.7559,
+ "step": 1523
+ },
+ {
+ "epoch": 0.7205673758865249,
+ "grad_norm": 2.8109354972839355,
+ "learning_rate": 4.843827726971444e-06,
+ "loss": 0.6264,
+ "step": 1524
+ },
+ {
+ "epoch": 0.7210401891252955,
+ "grad_norm": 3.0953569412231445,
+ "learning_rate": 4.8436106236628064e-06,
+ "loss": 0.6429,
+ "step": 1525
+ },
+ {
+ "epoch": 0.7215130023640662,
+ "grad_norm": 2.6850643157958984,
+ "learning_rate": 4.843393374427812e-06,
+ "loss": 0.6598,
+ "step": 1526
+ },
+ {
+ "epoch": 0.7219858156028369,
+ "grad_norm": 3.043480634689331,
+ "learning_rate": 4.8431759792799874e-06,
+ "loss": 0.6331,
+ "step": 1527
+ },
+ {
+ "epoch": 0.7224586288416076,
+ "grad_norm": 2.723870038986206,
+ "learning_rate": 4.842958438232868e-06,
+ "loss": 0.6259,
+ "step": 1528
+ },
+ {
+ "epoch": 0.7229314420803783,
+ "grad_norm": 2.822492837905884,
+ "learning_rate": 4.842740751300002e-06,
+ "loss": 0.6554,
+ "step": 1529
+ },
+ {
+ "epoch": 0.723404255319149,
+ "grad_norm": 2.7866315841674805,
+ "learning_rate": 4.842522918494941e-06,
+ "loss": 0.6991,
+ "step": 1530
+ },
+ {
+ "epoch": 0.7238770685579197,
+ "grad_norm": 2.8881826400756836,
+ "learning_rate": 4.84230493983125e-06,
+ "loss": 0.5876,
+ "step": 1531
+ },
+ {
+ "epoch": 0.7243498817966904,
+ "grad_norm": 2.7456939220428467,
+ "learning_rate": 4.8420868153225e-06,
+ "loss": 0.6188,
+ "step": 1532
+ },
+ {
+ "epoch": 0.724822695035461,
+ "grad_norm": 3.0257532596588135,
+ "learning_rate": 4.841868544982274e-06,
+ "loss": 0.63,
+ "step": 1533
+ },
+ {
+ "epoch": 0.7252955082742317,
+ "grad_norm": 3.1581954956054688,
+ "learning_rate": 4.841650128824164e-06,
+ "loss": 0.7214,
+ "step": 1534
+ },
+ {
+ "epoch": 0.7257683215130024,
+ "grad_norm": 2.9174306392669678,
+ "learning_rate": 4.841431566861767e-06,
+ "loss": 0.704,
+ "step": 1535
+ },
+ {
+ "epoch": 0.7262411347517731,
+ "grad_norm": 2.5019054412841797,
+ "learning_rate": 4.8412128591086935e-06,
+ "loss": 0.6298,
+ "step": 1536
+ },
+ {
+ "epoch": 0.7267139479905438,
+ "grad_norm": 2.724285125732422,
+ "learning_rate": 4.840994005578562e-06,
+ "loss": 0.6289,
+ "step": 1537
+ },
+ {
+ "epoch": 0.7271867612293145,
+ "grad_norm": 2.5882341861724854,
+ "learning_rate": 4.840775006284998e-06,
+ "loss": 0.6355,
+ "step": 1538
+ },
+ {
+ "epoch": 0.7276595744680852,
+ "grad_norm": 3.1281991004943848,
+ "learning_rate": 4.840555861241638e-06,
+ "loss": 0.5551,
+ "step": 1539
+ },
+ {
+ "epoch": 0.7281323877068558,
+ "grad_norm": 2.6064817905426025,
+ "learning_rate": 4.840336570462127e-06,
+ "loss": 0.5543,
+ "step": 1540
+ },
+ {
+ "epoch": 0.7286052009456265,
+ "grad_norm": 2.67112398147583,
+ "learning_rate": 4.840117133960122e-06,
+ "loss": 0.6044,
+ "step": 1541
+ },
+ {
+ "epoch": 0.7290780141843972,
+ "grad_norm": 2.838022232055664,
+ "learning_rate": 4.839897551749282e-06,
+ "loss": 0.6814,
+ "step": 1542
+ },
+ {
+ "epoch": 0.7295508274231679,
+ "grad_norm": 2.8897151947021484,
+ "learning_rate": 4.839677823843283e-06,
+ "loss": 0.593,
+ "step": 1543
+ },
+ {
+ "epoch": 0.7300236406619386,
+ "grad_norm": 2.9238014221191406,
+ "learning_rate": 4.839457950255805e-06,
+ "loss": 0.5544,
+ "step": 1544
+ },
+ {
+ "epoch": 0.7304964539007093,
+ "grad_norm": 3.016876459121704,
+ "learning_rate": 4.839237931000538e-06,
+ "loss": 0.6099,
+ "step": 1545
+ },
+ {
+ "epoch": 0.7309692671394799,
+ "grad_norm": 2.9415392875671387,
+ "learning_rate": 4.839017766091182e-06,
+ "loss": 0.6413,
+ "step": 1546
+ },
+ {
+ "epoch": 0.7314420803782505,
+ "grad_norm": 2.658067226409912,
+ "learning_rate": 4.838797455541446e-06,
+ "loss": 0.6534,
+ "step": 1547
+ },
+ {
+ "epoch": 0.7319148936170212,
+ "grad_norm": 2.460358142852783,
+ "learning_rate": 4.838576999365049e-06,
+ "loss": 0.5307,
+ "step": 1548
+ },
+ {
+ "epoch": 0.7323877068557919,
+ "grad_norm": 2.5818674564361572,
+ "learning_rate": 4.838356397575716e-06,
+ "loss": 0.6265,
+ "step": 1549
+ },
+ {
+ "epoch": 0.7328605200945626,
+ "grad_norm": 3.009197473526001,
+ "learning_rate": 4.838135650187183e-06,
+ "loss": 0.6957,
+ "step": 1550
+ },
+ {
+ "epoch": 0.7333333333333333,
+ "grad_norm": 2.738543748855591,
+ "learning_rate": 4.837914757213196e-06,
+ "loss": 0.646,
+ "step": 1551
+ },
+ {
+ "epoch": 0.733806146572104,
+ "grad_norm": 2.8208494186401367,
+ "learning_rate": 4.837693718667508e-06,
+ "loss": 0.5936,
+ "step": 1552
+ },
+ {
+ "epoch": 0.7342789598108747,
+ "grad_norm": 3.1574649810791016,
+ "learning_rate": 4.837472534563883e-06,
+ "loss": 0.6455,
+ "step": 1553
+ },
+ {
+ "epoch": 0.7347517730496453,
+ "grad_norm": 2.6737420558929443,
+ "learning_rate": 4.837251204916093e-06,
+ "loss": 0.5921,
+ "step": 1554
+ },
+ {
+ "epoch": 0.735224586288416,
+ "grad_norm": 2.424983024597168,
+ "learning_rate": 4.837029729737918e-06,
+ "loss": 0.6346,
+ "step": 1555
+ },
+ {
+ "epoch": 0.7356973995271867,
+ "grad_norm": 2.5163493156433105,
+ "learning_rate": 4.836808109043151e-06,
+ "loss": 0.6061,
+ "step": 1556
+ },
+ {
+ "epoch": 0.7361702127659574,
+ "grad_norm": 2.8377044200897217,
+ "learning_rate": 4.836586342845588e-06,
+ "loss": 0.611,
+ "step": 1557
+ },
+ {
+ "epoch": 0.7366430260047281,
+ "grad_norm": 2.5929181575775146,
+ "learning_rate": 4.83636443115904e-06,
+ "loss": 0.5496,
+ "step": 1558
+ },
+ {
+ "epoch": 0.7371158392434988,
+ "grad_norm": 2.5017223358154297,
+ "learning_rate": 4.836142373997323e-06,
+ "loss": 0.6235,
+ "step": 1559
+ },
+ {
+ "epoch": 0.7375886524822695,
+ "grad_norm": 2.822500228881836,
+ "learning_rate": 4.835920171374265e-06,
+ "loss": 0.6147,
+ "step": 1560
+ },
+ {
+ "epoch": 0.7380614657210401,
+ "grad_norm": 2.7234230041503906,
+ "learning_rate": 4.8356978233037e-06,
+ "loss": 0.6228,
+ "step": 1561
+ },
+ {
+ "epoch": 0.7385342789598108,
+ "grad_norm": 2.9565515518188477,
+ "learning_rate": 4.835475329799472e-06,
+ "loss": 0.5728,
+ "step": 1562
+ },
+ {
+ "epoch": 0.7390070921985815,
+ "grad_norm": 2.4356038570404053,
+ "learning_rate": 4.835252690875438e-06,
+ "loss": 0.6723,
+ "step": 1563
+ },
+ {
+ "epoch": 0.7394799054373522,
+ "grad_norm": 2.765913248062134,
+ "learning_rate": 4.835029906545458e-06,
+ "loss": 0.5805,
+ "step": 1564
+ },
+ {
+ "epoch": 0.7399527186761229,
+ "grad_norm": 2.4481914043426514,
+ "learning_rate": 4.834806976823405e-06,
+ "loss": 0.599,
+ "step": 1565
+ },
+ {
+ "epoch": 0.7404255319148936,
+ "grad_norm": 2.620779514312744,
+ "learning_rate": 4.834583901723158e-06,
+ "loss": 0.63,
+ "step": 1566
+ },
+ {
+ "epoch": 0.7408983451536643,
+ "grad_norm": 2.654426097869873,
+ "learning_rate": 4.83436068125861e-06,
+ "loss": 0.6544,
+ "step": 1567
+ },
+ {
+ "epoch": 0.741371158392435,
+ "grad_norm": 2.589623212814331,
+ "learning_rate": 4.834137315443656e-06,
+ "loss": 0.5596,
+ "step": 1568
+ },
+ {
+ "epoch": 0.7418439716312056,
+ "grad_norm": 2.572883129119873,
+ "learning_rate": 4.833913804292209e-06,
+ "loss": 0.5974,
+ "step": 1569
+ },
+ {
+ "epoch": 0.7423167848699763,
+ "grad_norm": 2.8744914531707764,
+ "learning_rate": 4.833690147818181e-06,
+ "loss": 0.5364,
+ "step": 1570
+ },
+ {
+ "epoch": 0.742789598108747,
+ "grad_norm": 2.9800851345062256,
+ "learning_rate": 4.833466346035502e-06,
+ "loss": 0.6287,
+ "step": 1571
+ },
+ {
+ "epoch": 0.7432624113475177,
+ "grad_norm": 2.627784490585327,
+ "learning_rate": 4.833242398958105e-06,
+ "loss": 0.621,
+ "step": 1572
+ },
+ {
+ "epoch": 0.7437352245862884,
+ "grad_norm": 2.5187721252441406,
+ "learning_rate": 4.833018306599933e-06,
+ "loss": 0.5901,
+ "step": 1573
+ },
+ {
+ "epoch": 0.7442080378250591,
+ "grad_norm": 2.4843688011169434,
+ "learning_rate": 4.832794068974944e-06,
+ "loss": 0.6336,
+ "step": 1574
+ },
+ {
+ "epoch": 0.7446808510638298,
+ "grad_norm": 2.774911880493164,
+ "learning_rate": 4.832569686097096e-06,
+ "loss": 0.6091,
+ "step": 1575
+ },
+ {
+ "epoch": 0.7451536643026004,
+ "grad_norm": 3.2562527656555176,
+ "learning_rate": 4.8323451579803615e-06,
+ "loss": 0.7686,
+ "step": 1576
+ },
+ {
+ "epoch": 0.7456264775413711,
+ "grad_norm": 2.799570083618164,
+ "learning_rate": 4.832120484638721e-06,
+ "loss": 0.6233,
+ "step": 1577
+ },
+ {
+ "epoch": 0.7460992907801418,
+ "grad_norm": 2.661893367767334,
+ "learning_rate": 4.831895666086164e-06,
+ "loss": 0.5841,
+ "step": 1578
+ },
+ {
+ "epoch": 0.7465721040189125,
+ "grad_norm": 3.0382652282714844,
+ "learning_rate": 4.831670702336689e-06,
+ "loss": 0.5769,
+ "step": 1579
+ },
+ {
+ "epoch": 0.7470449172576832,
+ "grad_norm": 2.676398515701294,
+ "learning_rate": 4.831445593404304e-06,
+ "loss": 0.619,
+ "step": 1580
+ },
+ {
+ "epoch": 0.7475177304964539,
+ "grad_norm": 2.717916965484619,
+ "learning_rate": 4.831220339303024e-06,
+ "loss": 0.5787,
+ "step": 1581
+ },
+ {
+ "epoch": 0.7479905437352246,
+ "grad_norm": 2.3918066024780273,
+ "learning_rate": 4.830994940046876e-06,
+ "loss": 0.5108,
+ "step": 1582
+ },
+ {
+ "epoch": 0.7484633569739952,
+ "grad_norm": 2.709144115447998,
+ "learning_rate": 4.830769395649895e-06,
+ "loss": 0.6875,
+ "step": 1583
+ },
+ {
+ "epoch": 0.7489361702127659,
+ "grad_norm": 2.8711116313934326,
+ "learning_rate": 4.830543706126123e-06,
+ "loss": 0.6745,
+ "step": 1584
+ },
+ {
+ "epoch": 0.7494089834515366,
+ "grad_norm": 2.612339496612549,
+ "learning_rate": 4.830317871489614e-06,
+ "loss": 0.5738,
+ "step": 1585
+ },
+ {
+ "epoch": 0.7498817966903073,
+ "grad_norm": 2.4355857372283936,
+ "learning_rate": 4.830091891754429e-06,
+ "loss": 0.5907,
+ "step": 1586
+ },
+ {
+ "epoch": 0.750354609929078,
+ "grad_norm": 2.676051378250122,
+ "learning_rate": 4.829865766934638e-06,
+ "loss": 0.6628,
+ "step": 1587
+ },
+ {
+ "epoch": 0.7508274231678487,
+ "grad_norm": 2.66489839553833,
+ "learning_rate": 4.829639497044323e-06,
+ "loss": 0.5984,
+ "step": 1588
+ },
+ {
+ "epoch": 0.7513002364066194,
+ "grad_norm": 2.5358035564422607,
+ "learning_rate": 4.829413082097572e-06,
+ "loss": 0.5867,
+ "step": 1589
+ },
+ {
+ "epoch": 0.75177304964539,
+ "grad_norm": 2.6530144214630127,
+ "learning_rate": 4.8291865221084815e-06,
+ "loss": 0.5917,
+ "step": 1590
+ },
+ {
+ "epoch": 0.7522458628841607,
+ "grad_norm": 2.5160958766937256,
+ "learning_rate": 4.82895981709116e-06,
+ "loss": 0.6347,
+ "step": 1591
+ },
+ {
+ "epoch": 0.7527186761229314,
+ "grad_norm": 2.61592698097229,
+ "learning_rate": 4.8287329670597225e-06,
+ "loss": 0.5472,
+ "step": 1592
+ },
+ {
+ "epoch": 0.7531914893617021,
+ "grad_norm": 2.7528622150421143,
+ "learning_rate": 4.828505972028296e-06,
+ "loss": 0.5842,
+ "step": 1593
+ },
+ {
+ "epoch": 0.7536643026004728,
+ "grad_norm": 2.8154072761535645,
+ "learning_rate": 4.828278832011011e-06,
+ "loss": 0.5757,
+ "step": 1594
+ },
+ {
+ "epoch": 0.7541371158392435,
+ "grad_norm": 3.118515729904175,
+ "learning_rate": 4.828051547022013e-06,
+ "loss": 0.6472,
+ "step": 1595
+ },
+ {
+ "epoch": 0.7546099290780142,
+ "grad_norm": 2.452033758163452,
+ "learning_rate": 4.827824117075453e-06,
+ "loss": 0.5571,
+ "step": 1596
+ },
+ {
+ "epoch": 0.7550827423167848,
+ "grad_norm": 2.984388828277588,
+ "learning_rate": 4.827596542185492e-06,
+ "loss": 0.6656,
+ "step": 1597
+ },
+ {
+ "epoch": 0.7555555555555555,
+ "grad_norm": 2.61356782913208,
+ "learning_rate": 4.8273688223663014e-06,
+ "loss": 0.6444,
+ "step": 1598
+ },
+ {
+ "epoch": 0.7560283687943262,
+ "grad_norm": 2.8967196941375732,
+ "learning_rate": 4.8271409576320595e-06,
+ "loss": 0.6457,
+ "step": 1599
+ },
+ {
+ "epoch": 0.7565011820330969,
+ "grad_norm": 2.852367639541626,
+ "learning_rate": 4.826912947996954e-06,
+ "loss": 0.5629,
+ "step": 1600
+ },
+ {
+ "epoch": 0.7569739952718676,
+ "grad_norm": 2.905280590057373,
+ "learning_rate": 4.826684793475182e-06,
+ "loss": 0.6245,
+ "step": 1601
+ },
+ {
+ "epoch": 0.7574468085106383,
+ "grad_norm": 2.6156530380249023,
+ "learning_rate": 4.826456494080951e-06,
+ "loss": 0.5869,
+ "step": 1602
+ },
+ {
+ "epoch": 0.757919621749409,
+ "grad_norm": 2.6490228176116943,
+ "learning_rate": 4.826228049828475e-06,
+ "loss": 0.5461,
+ "step": 1603
+ },
+ {
+ "epoch": 0.7583924349881797,
+ "grad_norm": 2.9626693725585938,
+ "learning_rate": 4.825999460731978e-06,
+ "loss": 0.6842,
+ "step": 1604
+ },
+ {
+ "epoch": 0.7588652482269503,
+ "grad_norm": 2.6866023540496826,
+ "learning_rate": 4.825770726805695e-06,
+ "loss": 0.5726,
+ "step": 1605
+ },
+ {
+ "epoch": 0.759338061465721,
+ "grad_norm": 2.5525858402252197,
+ "learning_rate": 4.825541848063866e-06,
+ "loss": 0.6061,
+ "step": 1606
+ },
+ {
+ "epoch": 0.7598108747044917,
+ "grad_norm": 2.703977584838867,
+ "learning_rate": 4.825312824520743e-06,
+ "loss": 0.6726,
+ "step": 1607
+ },
+ {
+ "epoch": 0.7602836879432624,
+ "grad_norm": 2.856534957885742,
+ "learning_rate": 4.825083656190588e-06,
+ "loss": 0.625,
+ "step": 1608
+ },
+ {
+ "epoch": 0.7607565011820331,
+ "grad_norm": 2.8564887046813965,
+ "learning_rate": 4.824854343087668e-06,
+ "loss": 0.7251,
+ "step": 1609
+ },
+ {
+ "epoch": 0.7612293144208038,
+ "grad_norm": 2.327650308609009,
+ "learning_rate": 4.824624885226262e-06,
+ "loss": 0.526,
+ "step": 1610
+ },
+ {
+ "epoch": 0.7617021276595745,
+ "grad_norm": 3.0025737285614014,
+ "learning_rate": 4.824395282620659e-06,
+ "loss": 0.6043,
+ "step": 1611
+ },
+ {
+ "epoch": 0.7621749408983451,
+ "grad_norm": 2.5441737174987793,
+ "learning_rate": 4.824165535285152e-06,
+ "loss": 0.6276,
+ "step": 1612
+ },
+ {
+ "epoch": 0.7626477541371158,
+ "grad_norm": 2.4177372455596924,
+ "learning_rate": 4.823935643234049e-06,
+ "loss": 0.6419,
+ "step": 1613
+ },
+ {
+ "epoch": 0.7631205673758865,
+ "grad_norm": 2.9210550785064697,
+ "learning_rate": 4.823705606481664e-06,
+ "loss": 0.5663,
+ "step": 1614
+ },
+ {
+ "epoch": 0.7635933806146572,
+ "grad_norm": 2.6353724002838135,
+ "learning_rate": 4.82347542504232e-06,
+ "loss": 0.5669,
+ "step": 1615
+ },
+ {
+ "epoch": 0.7640661938534279,
+ "grad_norm": 2.419081926345825,
+ "learning_rate": 4.823245098930349e-06,
+ "loss": 0.5777,
+ "step": 1616
+ },
+ {
+ "epoch": 0.7645390070921986,
+ "grad_norm": 2.5077571868896484,
+ "learning_rate": 4.823014628160093e-06,
+ "loss": 0.5924,
+ "step": 1617
+ },
+ {
+ "epoch": 0.7650118203309693,
+ "grad_norm": 2.816056251525879,
+ "learning_rate": 4.822784012745902e-06,
+ "loss": 0.7273,
+ "step": 1618
+ },
+ {
+ "epoch": 0.76548463356974,
+ "grad_norm": 2.7163147926330566,
+ "learning_rate": 4.8225532527021366e-06,
+ "loss": 0.5545,
+ "step": 1619
+ },
+ {
+ "epoch": 0.7659574468085106,
+ "grad_norm": 2.4784302711486816,
+ "learning_rate": 4.822322348043164e-06,
+ "loss": 0.556,
+ "step": 1620
+ },
+ {
+ "epoch": 0.7664302600472813,
+ "grad_norm": 2.712467670440674,
+ "learning_rate": 4.822091298783361e-06,
+ "loss": 0.6501,
+ "step": 1621
+ },
+ {
+ "epoch": 0.766903073286052,
+ "grad_norm": 2.7217724323272705,
+ "learning_rate": 4.821860104937115e-06,
+ "loss": 0.5989,
+ "step": 1622
+ },
+ {
+ "epoch": 0.7673758865248227,
+ "grad_norm": 2.5622854232788086,
+ "learning_rate": 4.821628766518821e-06,
+ "loss": 0.5263,
+ "step": 1623
+ },
+ {
+ "epoch": 0.7678486997635934,
+ "grad_norm": 3.230923891067505,
+ "learning_rate": 4.821397283542884e-06,
+ "loss": 0.6707,
+ "step": 1624
+ },
+ {
+ "epoch": 0.7683215130023641,
+ "grad_norm": 2.37929105758667,
+ "learning_rate": 4.821165656023718e-06,
+ "loss": 0.6124,
+ "step": 1625
+ },
+ {
+ "epoch": 0.7687943262411348,
+ "grad_norm": 2.9811325073242188,
+ "learning_rate": 4.820933883975745e-06,
+ "loss": 0.6435,
+ "step": 1626
+ },
+ {
+ "epoch": 0.7692671394799054,
+ "grad_norm": 2.887380838394165,
+ "learning_rate": 4.820701967413395e-06,
+ "loss": 0.621,
+ "step": 1627
+ },
+ {
+ "epoch": 0.7697399527186761,
+ "grad_norm": 2.6762876510620117,
+ "learning_rate": 4.820469906351109e-06,
+ "loss": 0.5713,
+ "step": 1628
+ },
+ {
+ "epoch": 0.7702127659574468,
+ "grad_norm": 2.7347512245178223,
+ "learning_rate": 4.820237700803337e-06,
+ "loss": 0.6136,
+ "step": 1629
+ },
+ {
+ "epoch": 0.7706855791962175,
+ "grad_norm": 2.7244746685028076,
+ "learning_rate": 4.820005350784539e-06,
+ "loss": 0.5816,
+ "step": 1630
+ },
+ {
+ "epoch": 0.7711583924349882,
+ "grad_norm": 2.9293999671936035,
+ "learning_rate": 4.8197728563091795e-06,
+ "loss": 0.6649,
+ "step": 1631
+ },
+ {
+ "epoch": 0.7716312056737589,
+ "grad_norm": 2.4402127265930176,
+ "learning_rate": 4.819540217391736e-06,
+ "loss": 0.6481,
+ "step": 1632
+ },
+ {
+ "epoch": 0.7721040189125296,
+ "grad_norm": 3.083941698074341,
+ "learning_rate": 4.819307434046694e-06,
+ "loss": 0.6951,
+ "step": 1633
+ },
+ {
+ "epoch": 0.7725768321513002,
+ "grad_norm": 2.544952392578125,
+ "learning_rate": 4.819074506288548e-06,
+ "loss": 0.539,
+ "step": 1634
+ },
+ {
+ "epoch": 0.7730496453900709,
+ "grad_norm": 2.7791268825531006,
+ "learning_rate": 4.818841434131801e-06,
+ "loss": 0.5827,
+ "step": 1635
+ },
+ {
+ "epoch": 0.7735224586288416,
+ "grad_norm": 2.7349796295166016,
+ "learning_rate": 4.818608217590967e-06,
+ "loss": 0.5584,
+ "step": 1636
+ },
+ {
+ "epoch": 0.7739952718676123,
+ "grad_norm": 2.637652635574341,
+ "learning_rate": 4.818374856680565e-06,
+ "loss": 0.6386,
+ "step": 1637
+ },
+ {
+ "epoch": 0.774468085106383,
+ "grad_norm": 2.9821584224700928,
+ "learning_rate": 4.818141351415127e-06,
+ "loss": 0.6734,
+ "step": 1638
+ },
+ {
+ "epoch": 0.7749408983451537,
+ "grad_norm": 2.992938995361328,
+ "learning_rate": 4.817907701809192e-06,
+ "loss": 0.5899,
+ "step": 1639
+ },
+ {
+ "epoch": 0.7754137115839244,
+ "grad_norm": 4.35719633102417,
+ "learning_rate": 4.8176739078773076e-06,
+ "loss": 0.6281,
+ "step": 1640
+ },
+ {
+ "epoch": 0.775886524822695,
+ "grad_norm": 2.838146209716797,
+ "learning_rate": 4.8174399696340315e-06,
+ "loss": 0.5766,
+ "step": 1641
+ },
+ {
+ "epoch": 0.7763593380614657,
+ "grad_norm": 3.3116989135742188,
+ "learning_rate": 4.81720588709393e-06,
+ "loss": 0.6409,
+ "step": 1642
+ },
+ {
+ "epoch": 0.7768321513002364,
+ "grad_norm": 2.9843590259552,
+ "learning_rate": 4.816971660271579e-06,
+ "loss": 0.6108,
+ "step": 1643
+ },
+ {
+ "epoch": 0.7773049645390071,
+ "grad_norm": 2.843770742416382,
+ "learning_rate": 4.816737289181562e-06,
+ "loss": 0.6053,
+ "step": 1644
+ },
+ {
+ "epoch": 0.7777777777777778,
+ "grad_norm": 2.7608556747436523,
+ "learning_rate": 4.816502773838473e-06,
+ "loss": 0.5854,
+ "step": 1645
+ },
+ {
+ "epoch": 0.7782505910165485,
+ "grad_norm": 3.343682289123535,
+ "learning_rate": 4.816268114256914e-06,
+ "loss": 0.6329,
+ "step": 1646
+ },
+ {
+ "epoch": 0.7787234042553192,
+ "grad_norm": 2.769768476486206,
+ "learning_rate": 4.816033310451496e-06,
+ "loss": 0.6242,
+ "step": 1647
+ },
+ {
+ "epoch": 0.7791962174940898,
+ "grad_norm": 2.989851713180542,
+ "learning_rate": 4.815798362436838e-06,
+ "loss": 0.6493,
+ "step": 1648
+ },
+ {
+ "epoch": 0.7796690307328605,
+ "grad_norm": 3.170736312866211,
+ "learning_rate": 4.8155632702275716e-06,
+ "loss": 0.6341,
+ "step": 1649
+ },
+ {
+ "epoch": 0.7801418439716312,
+ "grad_norm": 2.7372522354125977,
+ "learning_rate": 4.815328033838334e-06,
+ "loss": 0.5445,
+ "step": 1650
+ },
+ {
+ "epoch": 0.7806146572104019,
+ "grad_norm": 2.6947238445281982,
+ "learning_rate": 4.8150926532837715e-06,
+ "loss": 0.6437,
+ "step": 1651
+ },
+ {
+ "epoch": 0.7810874704491726,
+ "grad_norm": 2.472323179244995,
+ "learning_rate": 4.81485712857854e-06,
+ "loss": 0.5751,
+ "step": 1652
+ },
+ {
+ "epoch": 0.7815602836879433,
+ "grad_norm": 2.791114091873169,
+ "learning_rate": 4.814621459737308e-06,
+ "loss": 0.5996,
+ "step": 1653
+ },
+ {
+ "epoch": 0.782033096926714,
+ "grad_norm": 3.1957521438598633,
+ "learning_rate": 4.814385646774745e-06,
+ "loss": 0.5803,
+ "step": 1654
+ },
+ {
+ "epoch": 0.7825059101654847,
+ "grad_norm": 2.4120798110961914,
+ "learning_rate": 4.8141496897055364e-06,
+ "loss": 0.5814,
+ "step": 1655
+ },
+ {
+ "epoch": 0.7829787234042553,
+ "grad_norm": 2.9262423515319824,
+ "learning_rate": 4.813913588544374e-06,
+ "loss": 0.6292,
+ "step": 1656
+ },
+ {
+ "epoch": 0.783451536643026,
+ "grad_norm": 2.8251047134399414,
+ "learning_rate": 4.813677343305959e-06,
+ "loss": 0.6787,
+ "step": 1657
+ },
+ {
+ "epoch": 0.7839243498817967,
+ "grad_norm": 2.931659698486328,
+ "learning_rate": 4.8134409540050005e-06,
+ "loss": 0.6163,
+ "step": 1658
+ },
+ {
+ "epoch": 0.7843971631205674,
+ "grad_norm": 2.7160706520080566,
+ "learning_rate": 4.813204420656219e-06,
+ "loss": 0.6831,
+ "step": 1659
+ },
+ {
+ "epoch": 0.7848699763593381,
+ "grad_norm": 3.2134454250335693,
+ "learning_rate": 4.81296774327434e-06,
+ "loss": 0.6002,
+ "step": 1660
+ },
+ {
+ "epoch": 0.7853427895981088,
+ "grad_norm": 2.4002513885498047,
+ "learning_rate": 4.812730921874103e-06,
+ "loss": 0.5488,
+ "step": 1661
+ },
+ {
+ "epoch": 0.7858156028368795,
+ "grad_norm": 2.5559282302856445,
+ "learning_rate": 4.812493956470251e-06,
+ "loss": 0.5802,
+ "step": 1662
+ },
+ {
+ "epoch": 0.7862884160756501,
+ "grad_norm": 2.57478404045105,
+ "learning_rate": 4.812256847077541e-06,
+ "loss": 0.646,
+ "step": 1663
+ },
+ {
+ "epoch": 0.7867612293144208,
+ "grad_norm": 2.811851978302002,
+ "learning_rate": 4.812019593710736e-06,
+ "loss": 0.6245,
+ "step": 1664
+ },
+ {
+ "epoch": 0.7872340425531915,
+ "grad_norm": 2.5228829383850098,
+ "learning_rate": 4.811782196384609e-06,
+ "loss": 0.5949,
+ "step": 1665
+ },
+ {
+ "epoch": 0.7877068557919622,
+ "grad_norm": 2.744096040725708,
+ "learning_rate": 4.8115446551139415e-06,
+ "loss": 0.6006,
+ "step": 1666
+ },
+ {
+ "epoch": 0.7881796690307329,
+ "grad_norm": 3.129242420196533,
+ "learning_rate": 4.811306969913524e-06,
+ "loss": 0.7251,
+ "step": 1667
+ },
+ {
+ "epoch": 0.7886524822695036,
+ "grad_norm": 2.7855660915374756,
+ "learning_rate": 4.811069140798156e-06,
+ "loss": 0.6534,
+ "step": 1668
+ },
+ {
+ "epoch": 0.7891252955082743,
+ "grad_norm": 2.836603879928589,
+ "learning_rate": 4.810831167782647e-06,
+ "loss": 0.6661,
+ "step": 1669
+ },
+ {
+ "epoch": 0.789598108747045,
+ "grad_norm": 2.5339887142181396,
+ "learning_rate": 4.810593050881813e-06,
+ "loss": 0.5354,
+ "step": 1670
+ },
+ {
+ "epoch": 0.7900709219858156,
+ "grad_norm": 2.9553709030151367,
+ "learning_rate": 4.810354790110482e-06,
+ "loss": 0.6001,
+ "step": 1671
+ },
+ {
+ "epoch": 0.7905437352245863,
+ "grad_norm": 2.6581788063049316,
+ "learning_rate": 4.8101163854834885e-06,
+ "loss": 0.6802,
+ "step": 1672
+ },
+ {
+ "epoch": 0.791016548463357,
+ "grad_norm": 3.2002551555633545,
+ "learning_rate": 4.809877837015677e-06,
+ "loss": 0.6641,
+ "step": 1673
+ },
+ {
+ "epoch": 0.7914893617021277,
+ "grad_norm": 2.918792963027954,
+ "learning_rate": 4.809639144721902e-06,
+ "loss": 0.6758,
+ "step": 1674
+ },
+ {
+ "epoch": 0.7919621749408984,
+ "grad_norm": 2.7993946075439453,
+ "learning_rate": 4.8094003086170245e-06,
+ "loss": 0.5889,
+ "step": 1675
+ },
+ {
+ "epoch": 0.7924349881796691,
+ "grad_norm": 2.3698952198028564,
+ "learning_rate": 4.809161328715916e-06,
+ "loss": 0.6244,
+ "step": 1676
+ },
+ {
+ "epoch": 0.7929078014184398,
+ "grad_norm": 2.8891594409942627,
+ "learning_rate": 4.808922205033458e-06,
+ "loss": 0.5835,
+ "step": 1677
+ },
+ {
+ "epoch": 0.7933806146572104,
+ "grad_norm": 2.838345766067505,
+ "learning_rate": 4.808682937584537e-06,
+ "loss": 0.6907,
+ "step": 1678
+ },
+ {
+ "epoch": 0.7938534278959811,
+ "grad_norm": 2.8443174362182617,
+ "learning_rate": 4.808443526384053e-06,
+ "loss": 0.6692,
+ "step": 1679
+ },
+ {
+ "epoch": 0.7943262411347518,
+ "grad_norm": 2.7355034351348877,
+ "learning_rate": 4.808203971446913e-06,
+ "loss": 0.5799,
+ "step": 1680
+ },
+ {
+ "epoch": 0.7947990543735225,
+ "grad_norm": 2.7108020782470703,
+ "learning_rate": 4.807964272788033e-06,
+ "loss": 0.652,
+ "step": 1681
+ },
+ {
+ "epoch": 0.7952718676122932,
+ "grad_norm": 2.397650957107544,
+ "learning_rate": 4.807724430422338e-06,
+ "loss": 0.5418,
+ "step": 1682
+ },
+ {
+ "epoch": 0.7957446808510639,
+ "grad_norm": 2.4981582164764404,
+ "learning_rate": 4.807484444364762e-06,
+ "loss": 0.5731,
+ "step": 1683
+ },
+ {
+ "epoch": 0.7962174940898346,
+ "grad_norm": 2.7943713665008545,
+ "learning_rate": 4.8072443146302475e-06,
+ "loss": 0.5913,
+ "step": 1684
+ },
+ {
+ "epoch": 0.7966903073286052,
+ "grad_norm": 2.5691423416137695,
+ "learning_rate": 4.807004041233746e-06,
+ "loss": 0.6475,
+ "step": 1685
+ },
+ {
+ "epoch": 0.7971631205673759,
+ "grad_norm": 3.2367498874664307,
+ "learning_rate": 4.8067636241902195e-06,
+ "loss": 0.675,
+ "step": 1686
+ },
+ {
+ "epoch": 0.7976359338061466,
+ "grad_norm": 3.000595808029175,
+ "learning_rate": 4.806523063514637e-06,
+ "loss": 0.5481,
+ "step": 1687
+ },
+ {
+ "epoch": 0.7981087470449173,
+ "grad_norm": 2.702014207839966,
+ "learning_rate": 4.806282359221976e-06,
+ "loss": 0.5993,
+ "step": 1688
+ },
+ {
+ "epoch": 0.798581560283688,
+ "grad_norm": 2.383671998977661,
+ "learning_rate": 4.806041511327226e-06,
+ "loss": 0.562,
+ "step": 1689
+ },
+ {
+ "epoch": 0.7990543735224587,
+ "grad_norm": 2.6965041160583496,
+ "learning_rate": 4.8058005198453834e-06,
+ "loss": 0.5955,
+ "step": 1690
+ },
+ {
+ "epoch": 0.7995271867612294,
+ "grad_norm": 2.5906765460968018,
+ "learning_rate": 4.805559384791453e-06,
+ "loss": 0.5151,
+ "step": 1691
+ },
+ {
+ "epoch": 0.8,
+ "grad_norm": 2.5454652309417725,
+ "learning_rate": 4.8053181061804475e-06,
+ "loss": 0.5843,
+ "step": 1692
+ },
+ {
+ "epoch": 0.8004728132387707,
+ "grad_norm": 2.661343812942505,
+ "learning_rate": 4.8050766840273935e-06,
+ "loss": 0.5995,
+ "step": 1693
+ },
+ {
+ "epoch": 0.8009456264775414,
+ "grad_norm": 2.5635924339294434,
+ "learning_rate": 4.8048351183473215e-06,
+ "loss": 0.5676,
+ "step": 1694
+ },
+ {
+ "epoch": 0.8014184397163121,
+ "grad_norm": 2.5936667919158936,
+ "learning_rate": 4.804593409155274e-06,
+ "loss": 0.6291,
+ "step": 1695
+ },
+ {
+ "epoch": 0.8018912529550828,
+ "grad_norm": 2.6902432441711426,
+ "learning_rate": 4.804351556466299e-06,
+ "loss": 0.6114,
+ "step": 1696
+ },
+ {
+ "epoch": 0.8023640661938535,
+ "grad_norm": 2.7764673233032227,
+ "learning_rate": 4.804109560295457e-06,
+ "loss": 0.5768,
+ "step": 1697
+ },
+ {
+ "epoch": 0.8028368794326242,
+ "grad_norm": 2.9587221145629883,
+ "learning_rate": 4.803867420657816e-06,
+ "loss": 0.6048,
+ "step": 1698
+ },
+ {
+ "epoch": 0.8033096926713948,
+ "grad_norm": 2.9238998889923096,
+ "learning_rate": 4.803625137568453e-06,
+ "loss": 0.6329,
+ "step": 1699
+ },
+ {
+ "epoch": 0.8037825059101655,
+ "grad_norm": 2.70473313331604,
+ "learning_rate": 4.803382711042455e-06,
+ "loss": 0.5427,
+ "step": 1700
+ },
+ {
+ "epoch": 0.8042553191489362,
+ "grad_norm": 3.1604979038238525,
+ "learning_rate": 4.803140141094914e-06,
+ "loss": 0.626,
+ "step": 1701
+ },
+ {
+ "epoch": 0.8047281323877069,
+ "grad_norm": 2.9567699432373047,
+ "learning_rate": 4.802897427740936e-06,
+ "loss": 0.5319,
+ "step": 1702
+ },
+ {
+ "epoch": 0.8052009456264776,
+ "grad_norm": 2.90983247756958,
+ "learning_rate": 4.802654570995632e-06,
+ "loss": 0.586,
+ "step": 1703
+ },
+ {
+ "epoch": 0.8056737588652483,
+ "grad_norm": 2.783480167388916,
+ "learning_rate": 4.8024115708741255e-06,
+ "loss": 0.5773,
+ "step": 1704
+ },
+ {
+ "epoch": 0.806146572104019,
+ "grad_norm": 3.3307793140411377,
+ "learning_rate": 4.802168427391547e-06,
+ "loss": 0.6257,
+ "step": 1705
+ },
+ {
+ "epoch": 0.8066193853427897,
+ "grad_norm": 3.0475001335144043,
+ "learning_rate": 4.801925140563034e-06,
+ "loss": 0.6612,
+ "step": 1706
+ },
+ {
+ "epoch": 0.8070921985815603,
+ "grad_norm": 2.8278894424438477,
+ "learning_rate": 4.8016817104037375e-06,
+ "loss": 0.6449,
+ "step": 1707
+ },
+ {
+ "epoch": 0.807565011820331,
+ "grad_norm": 2.760244369506836,
+ "learning_rate": 4.801438136928812e-06,
+ "loss": 0.7007,
+ "step": 1708
+ },
+ {
+ "epoch": 0.8080378250591016,
+ "grad_norm": 2.827634572982788,
+ "learning_rate": 4.801194420153427e-06,
+ "loss": 0.6418,
+ "step": 1709
+ },
+ {
+ "epoch": 0.8085106382978723,
+ "grad_norm": 2.8655009269714355,
+ "learning_rate": 4.800950560092754e-06,
+ "loss": 0.6231,
+ "step": 1710
+ },
+ {
+ "epoch": 0.808983451536643,
+ "grad_norm": 2.738112688064575,
+ "learning_rate": 4.800706556761981e-06,
+ "loss": 0.6463,
+ "step": 1711
+ },
+ {
+ "epoch": 0.8094562647754137,
+ "grad_norm": 2.4781179428100586,
+ "learning_rate": 4.800462410176296e-06,
+ "loss": 0.5365,
+ "step": 1712
+ },
+ {
+ "epoch": 0.8099290780141843,
+ "grad_norm": 2.6049838066101074,
+ "learning_rate": 4.800218120350906e-06,
+ "loss": 0.6035,
+ "step": 1713
+ },
+ {
+ "epoch": 0.810401891252955,
+ "grad_norm": 2.9089980125427246,
+ "learning_rate": 4.79997368730102e-06,
+ "loss": 0.5828,
+ "step": 1714
+ },
+ {
+ "epoch": 0.8108747044917257,
+ "grad_norm": 2.831871747970581,
+ "learning_rate": 4.799729111041857e-06,
+ "loss": 0.5953,
+ "step": 1715
+ },
+ {
+ "epoch": 0.8113475177304964,
+ "grad_norm": 2.5611300468444824,
+ "learning_rate": 4.799484391588647e-06,
+ "loss": 0.6302,
+ "step": 1716
+ },
+ {
+ "epoch": 0.8118203309692671,
+ "grad_norm": 2.744070053100586,
+ "learning_rate": 4.799239528956625e-06,
+ "loss": 0.5561,
+ "step": 1717
+ },
+ {
+ "epoch": 0.8122931442080378,
+ "grad_norm": 2.7344231605529785,
+ "learning_rate": 4.798994523161041e-06,
+ "loss": 0.6317,
+ "step": 1718
+ },
+ {
+ "epoch": 0.8127659574468085,
+ "grad_norm": 2.3420889377593994,
+ "learning_rate": 4.798749374217149e-06,
+ "loss": 0.5415,
+ "step": 1719
+ },
+ {
+ "epoch": 0.8132387706855791,
+ "grad_norm": 2.57384991645813,
+ "learning_rate": 4.798504082140212e-06,
+ "loss": 0.6383,
+ "step": 1720
+ },
+ {
+ "epoch": 0.8137115839243498,
+ "grad_norm": 2.8819844722747803,
+ "learning_rate": 4.798258646945505e-06,
+ "loss": 0.6355,
+ "step": 1721
+ },
+ {
+ "epoch": 0.8141843971631205,
+ "grad_norm": 2.908123254776001,
+ "learning_rate": 4.79801306864831e-06,
+ "loss": 0.701,
+ "step": 1722
+ },
+ {
+ "epoch": 0.8146572104018912,
+ "grad_norm": 2.6500701904296875,
+ "learning_rate": 4.797767347263917e-06,
+ "loss": 0.6152,
+ "step": 1723
+ },
+ {
+ "epoch": 0.8151300236406619,
+ "grad_norm": 2.5513017177581787,
+ "learning_rate": 4.797521482807628e-06,
+ "loss": 0.6241,
+ "step": 1724
+ },
+ {
+ "epoch": 0.8156028368794326,
+ "grad_norm": 2.6239185333251953,
+ "learning_rate": 4.7972754752947495e-06,
+ "loss": 0.6072,
+ "step": 1725
+ },
+ {
+ "epoch": 0.8160756501182033,
+ "grad_norm": 2.673436403274536,
+ "learning_rate": 4.797029324740601e-06,
+ "loss": 0.5802,
+ "step": 1726
+ },
+ {
+ "epoch": 0.816548463356974,
+ "grad_norm": 2.533831834793091,
+ "learning_rate": 4.796783031160508e-06,
+ "loss": 0.5566,
+ "step": 1727
+ },
+ {
+ "epoch": 0.8170212765957446,
+ "grad_norm": 2.9806582927703857,
+ "learning_rate": 4.796536594569807e-06,
+ "loss": 0.6945,
+ "step": 1728
+ },
+ {
+ "epoch": 0.8174940898345153,
+ "grad_norm": 2.7093560695648193,
+ "learning_rate": 4.796290014983842e-06,
+ "loss": 0.7143,
+ "step": 1729
+ },
+ {
+ "epoch": 0.817966903073286,
+ "grad_norm": 2.814507246017456,
+ "learning_rate": 4.796043292417967e-06,
+ "loss": 0.6122,
+ "step": 1730
+ },
+ {
+ "epoch": 0.8184397163120567,
+ "grad_norm": 2.537156820297241,
+ "learning_rate": 4.795796426887543e-06,
+ "loss": 0.6229,
+ "step": 1731
+ },
+ {
+ "epoch": 0.8189125295508274,
+ "grad_norm": 2.4878013134002686,
+ "learning_rate": 4.795549418407944e-06,
+ "loss": 0.5442,
+ "step": 1732
+ },
+ {
+ "epoch": 0.8193853427895981,
+ "grad_norm": 2.839383363723755,
+ "learning_rate": 4.795302266994548e-06,
+ "loss": 0.6717,
+ "step": 1733
+ },
+ {
+ "epoch": 0.8198581560283688,
+ "grad_norm": 3.1981801986694336,
+ "learning_rate": 4.795054972662744e-06,
+ "loss": 0.6596,
+ "step": 1734
+ },
+ {
+ "epoch": 0.8203309692671394,
+ "grad_norm": 2.781730890274048,
+ "learning_rate": 4.79480753542793e-06,
+ "loss": 0.5845,
+ "step": 1735
+ },
+ {
+ "epoch": 0.8208037825059101,
+ "grad_norm": 2.689948558807373,
+ "learning_rate": 4.794559955305513e-06,
+ "loss": 0.5928,
+ "step": 1736
+ },
+ {
+ "epoch": 0.8212765957446808,
+ "grad_norm": 2.7267637252807617,
+ "learning_rate": 4.7943122323109105e-06,
+ "loss": 0.5224,
+ "step": 1737
+ },
+ {
+ "epoch": 0.8217494089834515,
+ "grad_norm": 2.4346601963043213,
+ "learning_rate": 4.794064366459544e-06,
+ "loss": 0.6431,
+ "step": 1738
+ },
+ {
+ "epoch": 0.8222222222222222,
+ "grad_norm": 2.7440176010131836,
+ "learning_rate": 4.793816357766849e-06,
+ "loss": 0.6083,
+ "step": 1739
+ },
+ {
+ "epoch": 0.8226950354609929,
+ "grad_norm": 2.6558027267456055,
+ "learning_rate": 4.793568206248268e-06,
+ "loss": 0.698,
+ "step": 1740
+ },
+ {
+ "epoch": 0.8231678486997636,
+ "grad_norm": 2.591658353805542,
+ "learning_rate": 4.793319911919251e-06,
+ "loss": 0.6601,
+ "step": 1741
+ },
+ {
+ "epoch": 0.8236406619385342,
+ "grad_norm": 2.5431172847747803,
+ "learning_rate": 4.79307147479526e-06,
+ "loss": 0.5917,
+ "step": 1742
+ },
+ {
+ "epoch": 0.8241134751773049,
+ "grad_norm": 2.7335588932037354,
+ "learning_rate": 4.792822894891762e-06,
+ "loss": 0.5925,
+ "step": 1743
+ },
+ {
+ "epoch": 0.8245862884160756,
+ "grad_norm": 2.2500839233398438,
+ "learning_rate": 4.792574172224237e-06,
+ "loss": 0.4984,
+ "step": 1744
+ },
+ {
+ "epoch": 0.8250591016548463,
+ "grad_norm": 2.691343069076538,
+ "learning_rate": 4.79232530680817e-06,
+ "loss": 0.6262,
+ "step": 1745
+ },
+ {
+ "epoch": 0.825531914893617,
+ "grad_norm": 2.612204074859619,
+ "learning_rate": 4.792076298659058e-06,
+ "loss": 0.5822,
+ "step": 1746
+ },
+ {
+ "epoch": 0.8260047281323877,
+ "grad_norm": 3.0163519382476807,
+ "learning_rate": 4.791827147792406e-06,
+ "loss": 0.6263,
+ "step": 1747
+ },
+ {
+ "epoch": 0.8264775413711584,
+ "grad_norm": 2.742183208465576,
+ "learning_rate": 4.791577854223727e-06,
+ "loss": 0.6628,
+ "step": 1748
+ },
+ {
+ "epoch": 0.826950354609929,
+ "grad_norm": 2.872213840484619,
+ "learning_rate": 4.791328417968542e-06,
+ "loss": 0.6332,
+ "step": 1749
+ },
+ {
+ "epoch": 0.8274231678486997,
+ "grad_norm": 2.725006580352783,
+ "learning_rate": 4.7910788390423844e-06,
+ "loss": 0.6266,
+ "step": 1750
+ },
+ {
+ "epoch": 0.8278959810874704,
+ "grad_norm": 3.0366697311401367,
+ "learning_rate": 4.790829117460793e-06,
+ "loss": 0.6403,
+ "step": 1751
+ },
+ {
+ "epoch": 0.8283687943262411,
+ "grad_norm": 2.594881772994995,
+ "learning_rate": 4.790579253239318e-06,
+ "loss": 0.521,
+ "step": 1752
+ },
+ {
+ "epoch": 0.8288416075650118,
+ "grad_norm": 2.4496347904205322,
+ "learning_rate": 4.790329246393517e-06,
+ "loss": 0.54,
+ "step": 1753
+ },
+ {
+ "epoch": 0.8293144208037825,
+ "grad_norm": 3.102278470993042,
+ "learning_rate": 4.790079096938956e-06,
+ "loss": 0.6142,
+ "step": 1754
+ },
+ {
+ "epoch": 0.8297872340425532,
+ "grad_norm": 2.4645912647247314,
+ "learning_rate": 4.789828804891212e-06,
+ "loss": 0.5212,
+ "step": 1755
+ },
+ {
+ "epoch": 0.8302600472813239,
+ "grad_norm": 2.7482516765594482,
+ "learning_rate": 4.789578370265868e-06,
+ "loss": 0.6712,
+ "step": 1756
+ },
+ {
+ "epoch": 0.8307328605200945,
+ "grad_norm": 2.61360502243042,
+ "learning_rate": 4.7893277930785195e-06,
+ "loss": 0.6367,
+ "step": 1757
+ },
+ {
+ "epoch": 0.8312056737588652,
+ "grad_norm": 2.79028058052063,
+ "learning_rate": 4.789077073344767e-06,
+ "loss": 0.5099,
+ "step": 1758
+ },
+ {
+ "epoch": 0.8316784869976359,
+ "grad_norm": 2.647662401199341,
+ "learning_rate": 4.788826211080222e-06,
+ "loss": 0.6698,
+ "step": 1759
+ },
+ {
+ "epoch": 0.8321513002364066,
+ "grad_norm": 3.0214831829071045,
+ "learning_rate": 4.7885752063005055e-06,
+ "loss": 0.6121,
+ "step": 1760
+ },
+ {
+ "epoch": 0.8326241134751773,
+ "grad_norm": 2.8244032859802246,
+ "learning_rate": 4.788324059021247e-06,
+ "loss": 0.6921,
+ "step": 1761
+ },
+ {
+ "epoch": 0.833096926713948,
+ "grad_norm": 3.1501076221466064,
+ "learning_rate": 4.788072769258082e-06,
+ "loss": 0.6872,
+ "step": 1762
+ },
+ {
+ "epoch": 0.8335697399527187,
+ "grad_norm": 2.6989903450012207,
+ "learning_rate": 4.7878213370266594e-06,
+ "loss": 0.5884,
+ "step": 1763
+ },
+ {
+ "epoch": 0.8340425531914893,
+ "grad_norm": 2.6982665061950684,
+ "learning_rate": 4.787569762342633e-06,
+ "loss": 0.6112,
+ "step": 1764
+ },
+ {
+ "epoch": 0.83451536643026,
+ "grad_norm": 2.6918323040008545,
+ "learning_rate": 4.7873180452216685e-06,
+ "loss": 0.5315,
+ "step": 1765
+ },
+ {
+ "epoch": 0.8349881796690307,
+ "grad_norm": 2.5494401454925537,
+ "learning_rate": 4.78706618567944e-06,
+ "loss": 0.5909,
+ "step": 1766
+ },
+ {
+ "epoch": 0.8354609929078014,
+ "grad_norm": 2.7532095909118652,
+ "learning_rate": 4.786814183731627e-06,
+ "loss": 0.5566,
+ "step": 1767
+ },
+ {
+ "epoch": 0.8359338061465721,
+ "grad_norm": 2.550865888595581,
+ "learning_rate": 4.786562039393923e-06,
+ "loss": 0.555,
+ "step": 1768
+ },
+ {
+ "epoch": 0.8364066193853428,
+ "grad_norm": 2.4477791786193848,
+ "learning_rate": 4.786309752682028e-06,
+ "loss": 0.5844,
+ "step": 1769
+ },
+ {
+ "epoch": 0.8368794326241135,
+ "grad_norm": 2.6982262134552,
+ "learning_rate": 4.7860573236116485e-06,
+ "loss": 0.6136,
+ "step": 1770
+ },
+ {
+ "epoch": 0.8373522458628841,
+ "grad_norm": 2.456263542175293,
+ "learning_rate": 4.785804752198503e-06,
+ "loss": 0.5055,
+ "step": 1771
+ },
+ {
+ "epoch": 0.8378250591016548,
+ "grad_norm": 2.428544521331787,
+ "learning_rate": 4.78555203845832e-06,
+ "loss": 0.5859,
+ "step": 1772
+ },
+ {
+ "epoch": 0.8382978723404255,
+ "grad_norm": 2.1782307624816895,
+ "learning_rate": 4.785299182406833e-06,
+ "loss": 0.5325,
+ "step": 1773
+ },
+ {
+ "epoch": 0.8387706855791962,
+ "grad_norm": 3.137956142425537,
+ "learning_rate": 4.785046184059786e-06,
+ "loss": 0.6097,
+ "step": 1774
+ },
+ {
+ "epoch": 0.8392434988179669,
+ "grad_norm": 2.6269001960754395,
+ "learning_rate": 4.7847930434329336e-06,
+ "loss": 0.5972,
+ "step": 1775
+ },
+ {
+ "epoch": 0.8397163120567376,
+ "grad_norm": 2.732659339904785,
+ "learning_rate": 4.784539760542037e-06,
+ "loss": 0.6054,
+ "step": 1776
+ },
+ {
+ "epoch": 0.8401891252955083,
+ "grad_norm": 2.5346736907958984,
+ "learning_rate": 4.784286335402866e-06,
+ "loss": 0.5521,
+ "step": 1777
+ },
+ {
+ "epoch": 0.840661938534279,
+ "grad_norm": 3.1420228481292725,
+ "learning_rate": 4.784032768031202e-06,
+ "loss": 0.6165,
+ "step": 1778
+ },
+ {
+ "epoch": 0.8411347517730496,
+ "grad_norm": 3.073793411254883,
+ "learning_rate": 4.783779058442831e-06,
+ "loss": 0.6414,
+ "step": 1779
+ },
+ {
+ "epoch": 0.8416075650118203,
+ "grad_norm": 2.6621336936950684,
+ "learning_rate": 4.783525206653554e-06,
+ "loss": 0.5836,
+ "step": 1780
+ },
+ {
+ "epoch": 0.842080378250591,
+ "grad_norm": 2.7029049396514893,
+ "learning_rate": 4.7832712126791745e-06,
+ "loss": 0.5897,
+ "step": 1781
+ },
+ {
+ "epoch": 0.8425531914893617,
+ "grad_norm": 2.4733822345733643,
+ "learning_rate": 4.783017076535509e-06,
+ "loss": 0.5913,
+ "step": 1782
+ },
+ {
+ "epoch": 0.8430260047281324,
+ "grad_norm": 2.8119473457336426,
+ "learning_rate": 4.782762798238381e-06,
+ "loss": 0.6105,
+ "step": 1783
+ },
+ {
+ "epoch": 0.8434988179669031,
+ "grad_norm": 2.5290818214416504,
+ "learning_rate": 4.782508377803622e-06,
+ "loss": 0.6119,
+ "step": 1784
+ },
+ {
+ "epoch": 0.8439716312056738,
+ "grad_norm": 3.193472385406494,
+ "learning_rate": 4.782253815247076e-06,
+ "loss": 0.6665,
+ "step": 1785
+ },
+ {
+ "epoch": 0.8444444444444444,
+ "grad_norm": 3.206759452819824,
+ "learning_rate": 4.781999110584592e-06,
+ "loss": 0.6012,
+ "step": 1786
+ },
+ {
+ "epoch": 0.8449172576832151,
+ "grad_norm": 2.6227457523345947,
+ "learning_rate": 4.781744263832029e-06,
+ "loss": 0.5845,
+ "step": 1787
+ },
+ {
+ "epoch": 0.8453900709219858,
+ "grad_norm": 2.838365316390991,
+ "learning_rate": 4.781489275005257e-06,
+ "loss": 0.5695,
+ "step": 1788
+ },
+ {
+ "epoch": 0.8458628841607565,
+ "grad_norm": 2.8348326683044434,
+ "learning_rate": 4.78123414412015e-06,
+ "loss": 0.6136,
+ "step": 1789
+ },
+ {
+ "epoch": 0.8463356973995272,
+ "grad_norm": 2.5698344707489014,
+ "learning_rate": 4.780978871192597e-06,
+ "loss": 0.6576,
+ "step": 1790
+ },
+ {
+ "epoch": 0.8468085106382979,
+ "grad_norm": 2.5198330879211426,
+ "learning_rate": 4.780723456238492e-06,
+ "loss": 0.5521,
+ "step": 1791
+ },
+ {
+ "epoch": 0.8472813238770686,
+ "grad_norm": 3.001325845718384,
+ "learning_rate": 4.780467899273737e-06,
+ "loss": 0.6075,
+ "step": 1792
+ },
+ {
+ "epoch": 0.8477541371158392,
+ "grad_norm": 2.7732746601104736,
+ "learning_rate": 4.780212200314247e-06,
+ "loss": 0.6245,
+ "step": 1793
+ },
+ {
+ "epoch": 0.8482269503546099,
+ "grad_norm": 2.6950337886810303,
+ "learning_rate": 4.77995635937594e-06,
+ "loss": 0.5723,
+ "step": 1794
+ },
+ {
+ "epoch": 0.8486997635933806,
+ "grad_norm": 2.82051420211792,
+ "learning_rate": 4.779700376474749e-06,
+ "loss": 0.6184,
+ "step": 1795
+ },
+ {
+ "epoch": 0.8491725768321513,
+ "grad_norm": 2.757791757583618,
+ "learning_rate": 4.779444251626611e-06,
+ "loss": 0.608,
+ "step": 1796
+ },
+ {
+ "epoch": 0.849645390070922,
+ "grad_norm": 2.394108533859253,
+ "learning_rate": 4.779187984847475e-06,
+ "loss": 0.6174,
+ "step": 1797
+ },
+ {
+ "epoch": 0.8501182033096927,
+ "grad_norm": 2.427562713623047,
+ "learning_rate": 4.778931576153296e-06,
+ "loss": 0.5618,
+ "step": 1798
+ },
+ {
+ "epoch": 0.8505910165484634,
+ "grad_norm": 2.891268491744995,
+ "learning_rate": 4.778675025560042e-06,
+ "loss": 0.6865,
+ "step": 1799
+ },
+ {
+ "epoch": 0.851063829787234,
+ "grad_norm": 2.665534257888794,
+ "learning_rate": 4.778418333083685e-06,
+ "loss": 0.5852,
+ "step": 1800
+ },
+ {
+ "epoch": 0.8515366430260047,
+ "grad_norm": 2.5492889881134033,
+ "learning_rate": 4.7781614987402095e-06,
+ "loss": 0.5161,
+ "step": 1801
+ },
+ {
+ "epoch": 0.8520094562647754,
+ "grad_norm": 2.400177001953125,
+ "learning_rate": 4.777904522545607e-06,
+ "loss": 0.5128,
+ "step": 1802
+ },
+ {
+ "epoch": 0.8524822695035461,
+ "grad_norm": 2.3949809074401855,
+ "learning_rate": 4.777647404515878e-06,
+ "loss": 0.571,
+ "step": 1803
+ },
+ {
+ "epoch": 0.8529550827423168,
+ "grad_norm": 2.3624472618103027,
+ "learning_rate": 4.7773901446670325e-06,
+ "loss": 0.5486,
+ "step": 1804
+ },
+ {
+ "epoch": 0.8534278959810875,
+ "grad_norm": 2.711366891860962,
+ "learning_rate": 4.7771327430150885e-06,
+ "loss": 0.5667,
+ "step": 1805
+ },
+ {
+ "epoch": 0.8539007092198582,
+ "grad_norm": 2.7681493759155273,
+ "learning_rate": 4.776875199576073e-06,
+ "loss": 0.5686,
+ "step": 1806
+ },
+ {
+ "epoch": 0.8543735224586289,
+ "grad_norm": 3.0369436740875244,
+ "learning_rate": 4.776617514366023e-06,
+ "loss": 0.6635,
+ "step": 1807
+ },
+ {
+ "epoch": 0.8548463356973995,
+ "grad_norm": 2.919649600982666,
+ "learning_rate": 4.776359687400983e-06,
+ "loss": 0.5749,
+ "step": 1808
+ },
+ {
+ "epoch": 0.8553191489361702,
+ "grad_norm": 2.7986185550689697,
+ "learning_rate": 4.776101718697007e-06,
+ "loss": 0.559,
+ "step": 1809
+ },
+ {
+ "epoch": 0.8557919621749409,
+ "grad_norm": 2.5951223373413086,
+ "learning_rate": 4.775843608270158e-06,
+ "loss": 0.5654,
+ "step": 1810
+ },
+ {
+ "epoch": 0.8562647754137116,
+ "grad_norm": 2.674138069152832,
+ "learning_rate": 4.775585356136505e-06,
+ "loss": 0.5286,
+ "step": 1811
+ },
+ {
+ "epoch": 0.8567375886524823,
+ "grad_norm": 3.045437812805176,
+ "learning_rate": 4.775326962312131e-06,
+ "loss": 0.6185,
+ "step": 1812
+ },
+ {
+ "epoch": 0.857210401891253,
+ "grad_norm": 2.6145293712615967,
+ "learning_rate": 4.775068426813124e-06,
+ "loss": 0.6075,
+ "step": 1813
+ },
+ {
+ "epoch": 0.8576832151300237,
+ "grad_norm": 2.6320106983184814,
+ "learning_rate": 4.7748097496555824e-06,
+ "loss": 0.561,
+ "step": 1814
+ },
+ {
+ "epoch": 0.8581560283687943,
+ "grad_norm": 2.5038623809814453,
+ "learning_rate": 4.774550930855612e-06,
+ "loss": 0.593,
+ "step": 1815
+ },
+ {
+ "epoch": 0.858628841607565,
+ "grad_norm": 2.8168089389801025,
+ "learning_rate": 4.774291970429329e-06,
+ "loss": 0.5196,
+ "step": 1816
+ },
+ {
+ "epoch": 0.8591016548463357,
+ "grad_norm": 2.778130292892456,
+ "learning_rate": 4.774032868392858e-06,
+ "loss": 0.5984,
+ "step": 1817
+ },
+ {
+ "epoch": 0.8595744680851064,
+ "grad_norm": 2.536458730697632,
+ "learning_rate": 4.7737736247623305e-06,
+ "loss": 0.568,
+ "step": 1818
+ },
+ {
+ "epoch": 0.8600472813238771,
+ "grad_norm": 2.6669719219207764,
+ "learning_rate": 4.77351423955389e-06,
+ "loss": 0.6233,
+ "step": 1819
+ },
+ {
+ "epoch": 0.8605200945626478,
+ "grad_norm": 2.578242540359497,
+ "learning_rate": 4.773254712783687e-06,
+ "loss": 0.579,
+ "step": 1820
+ },
+ {
+ "epoch": 0.8609929078014185,
+ "grad_norm": 2.816664457321167,
+ "learning_rate": 4.772995044467881e-06,
+ "loss": 0.6635,
+ "step": 1821
+ },
+ {
+ "epoch": 0.8614657210401891,
+ "grad_norm": 3.1111979484558105,
+ "learning_rate": 4.77273523462264e-06,
+ "loss": 0.6372,
+ "step": 1822
+ },
+ {
+ "epoch": 0.8619385342789598,
+ "grad_norm": 2.764552354812622,
+ "learning_rate": 4.772475283264142e-06,
+ "loss": 0.6216,
+ "step": 1823
+ },
+ {
+ "epoch": 0.8624113475177305,
+ "grad_norm": 2.9126830101013184,
+ "learning_rate": 4.772215190408572e-06,
+ "loss": 0.6396,
+ "step": 1824
+ },
+ {
+ "epoch": 0.8628841607565012,
+ "grad_norm": 2.7502307891845703,
+ "learning_rate": 4.7719549560721264e-06,
+ "loss": 0.6186,
+ "step": 1825
+ },
+ {
+ "epoch": 0.8633569739952719,
+ "grad_norm": 2.6279006004333496,
+ "learning_rate": 4.771694580271007e-06,
+ "loss": 0.5557,
+ "step": 1826
+ },
+ {
+ "epoch": 0.8638297872340426,
+ "grad_norm": 2.996563196182251,
+ "learning_rate": 4.7714340630214276e-06,
+ "loss": 0.6259,
+ "step": 1827
+ },
+ {
+ "epoch": 0.8643026004728133,
+ "grad_norm": 3.231323480606079,
+ "learning_rate": 4.771173404339609e-06,
+ "loss": 0.5473,
+ "step": 1828
+ },
+ {
+ "epoch": 0.864775413711584,
+ "grad_norm": 3.143519878387451,
+ "learning_rate": 4.770912604241781e-06,
+ "loss": 0.593,
+ "step": 1829
+ },
+ {
+ "epoch": 0.8652482269503546,
+ "grad_norm": 2.515484094619751,
+ "learning_rate": 4.770651662744184e-06,
+ "loss": 0.538,
+ "step": 1830
+ },
+ {
+ "epoch": 0.8657210401891253,
+ "grad_norm": 2.629058837890625,
+ "learning_rate": 4.770390579863064e-06,
+ "loss": 0.5745,
+ "step": 1831
+ },
+ {
+ "epoch": 0.866193853427896,
+ "grad_norm": 2.5826802253723145,
+ "learning_rate": 4.770129355614677e-06,
+ "loss": 0.6397,
+ "step": 1832
+ },
+ {
+ "epoch": 0.8666666666666667,
+ "grad_norm": 2.954623222351074,
+ "learning_rate": 4.769867990015289e-06,
+ "loss": 0.6106,
+ "step": 1833
+ },
+ {
+ "epoch": 0.8671394799054374,
+ "grad_norm": 2.742192268371582,
+ "learning_rate": 4.769606483081175e-06,
+ "loss": 0.6902,
+ "step": 1834
+ },
+ {
+ "epoch": 0.8676122931442081,
+ "grad_norm": 2.2619097232818604,
+ "learning_rate": 4.769344834828618e-06,
+ "loss": 0.5414,
+ "step": 1835
+ },
+ {
+ "epoch": 0.8680851063829788,
+ "grad_norm": 2.7384188175201416,
+ "learning_rate": 4.769083045273908e-06,
+ "loss": 0.5787,
+ "step": 1836
+ },
+ {
+ "epoch": 0.8685579196217494,
+ "grad_norm": 2.6734485626220703,
+ "learning_rate": 4.768821114433346e-06,
+ "loss": 0.5923,
+ "step": 1837
+ },
+ {
+ "epoch": 0.8690307328605201,
+ "grad_norm": 2.286140203475952,
+ "learning_rate": 4.768559042323243e-06,
+ "loss": 0.5822,
+ "step": 1838
+ },
+ {
+ "epoch": 0.8695035460992908,
+ "grad_norm": 3.0243725776672363,
+ "learning_rate": 4.768296828959915e-06,
+ "loss": 0.6623,
+ "step": 1839
+ },
+ {
+ "epoch": 0.8699763593380615,
+ "grad_norm": 2.4026312828063965,
+ "learning_rate": 4.768034474359689e-06,
+ "loss": 0.5554,
+ "step": 1840
+ },
+ {
+ "epoch": 0.8704491725768322,
+ "grad_norm": 2.7469029426574707,
+ "learning_rate": 4.767771978538903e-06,
+ "loss": 0.6316,
+ "step": 1841
+ },
+ {
+ "epoch": 0.8709219858156029,
+ "grad_norm": 2.729659080505371,
+ "learning_rate": 4.767509341513899e-06,
+ "loss": 0.5807,
+ "step": 1842
+ },
+ {
+ "epoch": 0.8713947990543736,
+ "grad_norm": 2.5336945056915283,
+ "learning_rate": 4.76724656330103e-06,
+ "loss": 0.6109,
+ "step": 1843
+ },
+ {
+ "epoch": 0.8718676122931442,
+ "grad_norm": 2.519880533218384,
+ "learning_rate": 4.76698364391666e-06,
+ "loss": 0.5313,
+ "step": 1844
+ },
+ {
+ "epoch": 0.8723404255319149,
+ "grad_norm": 2.698862075805664,
+ "learning_rate": 4.766720583377159e-06,
+ "loss": 0.5953,
+ "step": 1845
+ },
+ {
+ "epoch": 0.8728132387706856,
+ "grad_norm": 3.0195560455322266,
+ "learning_rate": 4.766457381698907e-06,
+ "loss": 0.5965,
+ "step": 1846
+ },
+ {
+ "epoch": 0.8732860520094563,
+ "grad_norm": 2.5972697734832764,
+ "learning_rate": 4.766194038898291e-06,
+ "loss": 0.6014,
+ "step": 1847
+ },
+ {
+ "epoch": 0.873758865248227,
+ "grad_norm": 2.7132294178009033,
+ "learning_rate": 4.76593055499171e-06,
+ "loss": 0.5638,
+ "step": 1848
+ },
+ {
+ "epoch": 0.8742316784869977,
+ "grad_norm": 2.7134575843811035,
+ "learning_rate": 4.765666929995568e-06,
+ "loss": 0.52,
+ "step": 1849
+ },
+ {
+ "epoch": 0.8747044917257684,
+ "grad_norm": 2.3804993629455566,
+ "learning_rate": 4.765403163926282e-06,
+ "loss": 0.5435,
+ "step": 1850
+ },
+ {
+ "epoch": 0.875177304964539,
+ "grad_norm": 2.8782761096954346,
+ "learning_rate": 4.765139256800274e-06,
+ "loss": 0.5843,
+ "step": 1851
+ },
+ {
+ "epoch": 0.8756501182033097,
+ "grad_norm": 2.836209774017334,
+ "learning_rate": 4.764875208633977e-06,
+ "loss": 0.6667,
+ "step": 1852
+ },
+ {
+ "epoch": 0.8761229314420804,
+ "grad_norm": 2.608851194381714,
+ "learning_rate": 4.764611019443831e-06,
+ "loss": 0.5436,
+ "step": 1853
+ },
+ {
+ "epoch": 0.8765957446808511,
+ "grad_norm": 2.788738965988159,
+ "learning_rate": 4.764346689246288e-06,
+ "loss": 0.7331,
+ "step": 1854
+ },
+ {
+ "epoch": 0.8770685579196218,
+ "grad_norm": 2.524277687072754,
+ "learning_rate": 4.764082218057805e-06,
+ "loss": 0.5067,
+ "step": 1855
+ },
+ {
+ "epoch": 0.8775413711583925,
+ "grad_norm": 3.7559316158294678,
+ "learning_rate": 4.763817605894851e-06,
+ "loss": 0.6809,
+ "step": 1856
+ },
+ {
+ "epoch": 0.8780141843971632,
+ "grad_norm": 2.9070613384246826,
+ "learning_rate": 4.763552852773899e-06,
+ "loss": 0.5913,
+ "step": 1857
+ },
+ {
+ "epoch": 0.8784869976359339,
+ "grad_norm": 2.7050609588623047,
+ "learning_rate": 4.7632879587114386e-06,
+ "loss": 0.6074,
+ "step": 1858
+ },
+ {
+ "epoch": 0.8789598108747045,
+ "grad_norm": 2.891134262084961,
+ "learning_rate": 4.76302292372396e-06,
+ "loss": 0.5939,
+ "step": 1859
+ },
+ {
+ "epoch": 0.8794326241134752,
+ "grad_norm": 2.8581702709198,
+ "learning_rate": 4.762757747827968e-06,
+ "loss": 0.5972,
+ "step": 1860
+ },
+ {
+ "epoch": 0.8799054373522459,
+ "grad_norm": 2.8266196250915527,
+ "learning_rate": 4.762492431039971e-06,
+ "loss": 0.5993,
+ "step": 1861
+ },
+ {
+ "epoch": 0.8803782505910166,
+ "grad_norm": 2.4853954315185547,
+ "learning_rate": 4.762226973376493e-06,
+ "loss": 0.6388,
+ "step": 1862
+ },
+ {
+ "epoch": 0.8808510638297873,
+ "grad_norm": 3.2212886810302734,
+ "learning_rate": 4.761961374854059e-06,
+ "loss": 0.6698,
+ "step": 1863
+ },
+ {
+ "epoch": 0.881323877068558,
+ "grad_norm": 3.1254501342773438,
+ "learning_rate": 4.761695635489211e-06,
+ "loss": 0.5263,
+ "step": 1864
+ },
+ {
+ "epoch": 0.8817966903073287,
+ "grad_norm": 2.6891462802886963,
+ "learning_rate": 4.761429755298491e-06,
+ "loss": 0.5359,
+ "step": 1865
+ },
+ {
+ "epoch": 0.8822695035460993,
+ "grad_norm": 2.8557538986206055,
+ "learning_rate": 4.761163734298457e-06,
+ "loss": 0.5933,
+ "step": 1866
+ },
+ {
+ "epoch": 0.88274231678487,
+ "grad_norm": 2.53548264503479,
+ "learning_rate": 4.7608975725056724e-06,
+ "loss": 0.6397,
+ "step": 1867
+ },
+ {
+ "epoch": 0.8832151300236407,
+ "grad_norm": 3.0237956047058105,
+ "learning_rate": 4.76063126993671e-06,
+ "loss": 0.6845,
+ "step": 1868
+ },
+ {
+ "epoch": 0.8836879432624114,
+ "grad_norm": 3.222886800765991,
+ "learning_rate": 4.76036482660815e-06,
+ "loss": 0.6055,
+ "step": 1869
+ },
+ {
+ "epoch": 0.8841607565011821,
+ "grad_norm": 3.1867551803588867,
+ "learning_rate": 4.760098242536584e-06,
+ "loss": 0.6592,
+ "step": 1870
+ },
+ {
+ "epoch": 0.8846335697399527,
+ "grad_norm": 2.782209873199463,
+ "learning_rate": 4.7598315177386115e-06,
+ "loss": 0.5833,
+ "step": 1871
+ },
+ {
+ "epoch": 0.8851063829787233,
+ "grad_norm": 2.899871587753296,
+ "learning_rate": 4.759564652230838e-06,
+ "loss": 0.6129,
+ "step": 1872
+ },
+ {
+ "epoch": 0.885579196217494,
+ "grad_norm": 2.5690579414367676,
+ "learning_rate": 4.759297646029882e-06,
+ "loss": 0.5827,
+ "step": 1873
+ },
+ {
+ "epoch": 0.8860520094562647,
+ "grad_norm": 2.666130304336548,
+ "learning_rate": 4.759030499152368e-06,
+ "loss": 0.5272,
+ "step": 1874
+ },
+ {
+ "epoch": 0.8865248226950354,
+ "grad_norm": 2.7030911445617676,
+ "learning_rate": 4.758763211614932e-06,
+ "loss": 0.6415,
+ "step": 1875
+ },
+ {
+ "epoch": 0.8869976359338061,
+ "grad_norm": 2.717512845993042,
+ "learning_rate": 4.7584957834342135e-06,
+ "loss": 0.5827,
+ "step": 1876
+ },
+ {
+ "epoch": 0.8874704491725768,
+ "grad_norm": 2.665823459625244,
+ "learning_rate": 4.758228214626867e-06,
+ "loss": 0.6209,
+ "step": 1877
+ },
+ {
+ "epoch": 0.8879432624113475,
+ "grad_norm": 2.636653184890747,
+ "learning_rate": 4.75796050520955e-06,
+ "loss": 0.6413,
+ "step": 1878
+ },
+ {
+ "epoch": 0.8884160756501182,
+ "grad_norm": 2.585115671157837,
+ "learning_rate": 4.7576926551989345e-06,
+ "loss": 0.5518,
+ "step": 1879
+ },
+ {
+ "epoch": 0.8888888888888888,
+ "grad_norm": 2.808526039123535,
+ "learning_rate": 4.757424664611697e-06,
+ "loss": 0.5717,
+ "step": 1880
+ },
+ {
+ "epoch": 0.8893617021276595,
+ "grad_norm": 3.5957939624786377,
+ "learning_rate": 4.757156533464524e-06,
+ "loss": 0.6323,
+ "step": 1881
+ },
+ {
+ "epoch": 0.8898345153664302,
+ "grad_norm": 2.5003883838653564,
+ "learning_rate": 4.756888261774111e-06,
+ "loss": 0.5937,
+ "step": 1882
+ },
+ {
+ "epoch": 0.8903073286052009,
+ "grad_norm": 2.749061346054077,
+ "learning_rate": 4.756619849557161e-06,
+ "loss": 0.6642,
+ "step": 1883
+ },
+ {
+ "epoch": 0.8907801418439716,
+ "grad_norm": 2.6757891178131104,
+ "learning_rate": 4.756351296830389e-06,
+ "loss": 0.5887,
+ "step": 1884
+ },
+ {
+ "epoch": 0.8912529550827423,
+ "grad_norm": 2.811925172805786,
+ "learning_rate": 4.756082603610516e-06,
+ "loss": 0.6571,
+ "step": 1885
+ },
+ {
+ "epoch": 0.891725768321513,
+ "grad_norm": 2.5054616928100586,
+ "learning_rate": 4.755813769914271e-06,
+ "loss": 0.6312,
+ "step": 1886
+ },
+ {
+ "epoch": 0.8921985815602836,
+ "grad_norm": 2.7518467903137207,
+ "learning_rate": 4.755544795758395e-06,
+ "loss": 0.6685,
+ "step": 1887
+ },
+ {
+ "epoch": 0.8926713947990543,
+ "grad_norm": 2.7527287006378174,
+ "learning_rate": 4.755275681159634e-06,
+ "loss": 0.5886,
+ "step": 1888
+ },
+ {
+ "epoch": 0.893144208037825,
+ "grad_norm": 2.6162452697753906,
+ "learning_rate": 4.755006426134745e-06,
+ "loss": 0.546,
+ "step": 1889
+ },
+ {
+ "epoch": 0.8936170212765957,
+ "grad_norm": 2.4016737937927246,
+ "learning_rate": 4.754737030700495e-06,
+ "loss": 0.5726,
+ "step": 1890
+ },
+ {
+ "epoch": 0.8940898345153664,
+ "grad_norm": 2.528327703475952,
+ "learning_rate": 4.754467494873656e-06,
+ "loss": 0.5682,
+ "step": 1891
+ },
+ {
+ "epoch": 0.8945626477541371,
+ "grad_norm": 2.3139286041259766,
+ "learning_rate": 4.7541978186710115e-06,
+ "loss": 0.6108,
+ "step": 1892
+ },
+ {
+ "epoch": 0.8950354609929078,
+ "grad_norm": 2.7269136905670166,
+ "learning_rate": 4.753928002109354e-06,
+ "loss": 0.5875,
+ "step": 1893
+ },
+ {
+ "epoch": 0.8955082742316784,
+ "grad_norm": 4.425495147705078,
+ "learning_rate": 4.753658045205482e-06,
+ "loss": 0.5572,
+ "step": 1894
+ },
+ {
+ "epoch": 0.8959810874704491,
+ "grad_norm": 2.535409927368164,
+ "learning_rate": 4.753387947976206e-06,
+ "loss": 0.5868,
+ "step": 1895
+ },
+ {
+ "epoch": 0.8964539007092198,
+ "grad_norm": 2.722458600997925,
+ "learning_rate": 4.753117710438343e-06,
+ "loss": 0.5935,
+ "step": 1896
+ },
+ {
+ "epoch": 0.8969267139479905,
+ "grad_norm": 2.743861436843872,
+ "learning_rate": 4.75284733260872e-06,
+ "loss": 0.572,
+ "step": 1897
+ },
+ {
+ "epoch": 0.8973995271867612,
+ "grad_norm": 2.60640549659729,
+ "learning_rate": 4.752576814504173e-06,
+ "loss": 0.567,
+ "step": 1898
+ },
+ {
+ "epoch": 0.8978723404255319,
+ "grad_norm": 2.7486042976379395,
+ "learning_rate": 4.7523061561415435e-06,
+ "loss": 0.5768,
+ "step": 1899
+ },
+ {
+ "epoch": 0.8983451536643026,
+ "grad_norm": 3.8410251140594482,
+ "learning_rate": 4.752035357537686e-06,
+ "loss": 0.6034,
+ "step": 1900
+ },
+ {
+ "epoch": 0.8988179669030733,
+ "grad_norm": 3.0935890674591064,
+ "learning_rate": 4.751764418709462e-06,
+ "loss": 0.5644,
+ "step": 1901
+ },
+ {
+ "epoch": 0.8992907801418439,
+ "grad_norm": 2.7989892959594727,
+ "learning_rate": 4.751493339673742e-06,
+ "loss": 0.656,
+ "step": 1902
+ },
+ {
+ "epoch": 0.8997635933806146,
+ "grad_norm": 3.6940557956695557,
+ "learning_rate": 4.751222120447403e-06,
+ "loss": 0.6632,
+ "step": 1903
+ },
+ {
+ "epoch": 0.9002364066193853,
+ "grad_norm": 2.3428797721862793,
+ "learning_rate": 4.750950761047335e-06,
+ "loss": 0.4485,
+ "step": 1904
+ },
+ {
+ "epoch": 0.900709219858156,
+ "grad_norm": 2.622544050216675,
+ "learning_rate": 4.750679261490432e-06,
+ "loss": 0.5857,
+ "step": 1905
+ },
+ {
+ "epoch": 0.9011820330969267,
+ "grad_norm": 2.4911322593688965,
+ "learning_rate": 4.750407621793601e-06,
+ "loss": 0.5618,
+ "step": 1906
+ },
+ {
+ "epoch": 0.9016548463356974,
+ "grad_norm": 2.6434662342071533,
+ "learning_rate": 4.750135841973755e-06,
+ "loss": 0.6057,
+ "step": 1907
+ },
+ {
+ "epoch": 0.902127659574468,
+ "grad_norm": 3.115443706512451,
+ "learning_rate": 4.749863922047817e-06,
+ "loss": 0.6064,
+ "step": 1908
+ },
+ {
+ "epoch": 0.9026004728132387,
+ "grad_norm": 2.5671091079711914,
+ "learning_rate": 4.749591862032718e-06,
+ "loss": 0.5625,
+ "step": 1909
+ },
+ {
+ "epoch": 0.9030732860520094,
+ "grad_norm": 3.2008655071258545,
+ "learning_rate": 4.749319661945398e-06,
+ "loss": 0.5547,
+ "step": 1910
+ },
+ {
+ "epoch": 0.9035460992907801,
+ "grad_norm": 2.905987024307251,
+ "learning_rate": 4.749047321802805e-06,
+ "loss": 0.6033,
+ "step": 1911
+ },
+ {
+ "epoch": 0.9040189125295508,
+ "grad_norm": 3.1456053256988525,
+ "learning_rate": 4.748774841621897e-06,
+ "loss": 0.5651,
+ "step": 1912
+ },
+ {
+ "epoch": 0.9044917257683215,
+ "grad_norm": 2.8116416931152344,
+ "learning_rate": 4.748502221419641e-06,
+ "loss": 0.5853,
+ "step": 1913
+ },
+ {
+ "epoch": 0.9049645390070922,
+ "grad_norm": 3.123835325241089,
+ "learning_rate": 4.748229461213011e-06,
+ "loss": 0.5427,
+ "step": 1914
+ },
+ {
+ "epoch": 0.9054373522458629,
+ "grad_norm": 2.4750146865844727,
+ "learning_rate": 4.747956561018989e-06,
+ "loss": 0.6517,
+ "step": 1915
+ },
+ {
+ "epoch": 0.9059101654846335,
+ "grad_norm": 2.6174299716949463,
+ "learning_rate": 4.7476835208545705e-06,
+ "loss": 0.6119,
+ "step": 1916
+ },
+ {
+ "epoch": 0.9063829787234042,
+ "grad_norm": 2.7390382289886475,
+ "learning_rate": 4.747410340736755e-06,
+ "loss": 0.5664,
+ "step": 1917
+ },
+ {
+ "epoch": 0.9068557919621749,
+ "grad_norm": 2.7940444946289062,
+ "learning_rate": 4.747137020682552e-06,
+ "loss": 0.5628,
+ "step": 1918
+ },
+ {
+ "epoch": 0.9073286052009456,
+ "grad_norm": 2.477365016937256,
+ "learning_rate": 4.7468635607089795e-06,
+ "loss": 0.5261,
+ "step": 1919
+ },
+ {
+ "epoch": 0.9078014184397163,
+ "grad_norm": 2.7016685009002686,
+ "learning_rate": 4.746589960833066e-06,
+ "loss": 0.5576,
+ "step": 1920
+ },
+ {
+ "epoch": 0.908274231678487,
+ "grad_norm": 2.8806519508361816,
+ "learning_rate": 4.746316221071846e-06,
+ "loss": 0.5925,
+ "step": 1921
+ },
+ {
+ "epoch": 0.9087470449172577,
+ "grad_norm": 3.0315234661102295,
+ "learning_rate": 4.746042341442365e-06,
+ "loss": 0.6142,
+ "step": 1922
+ },
+ {
+ "epoch": 0.9092198581560283,
+ "grad_norm": 4.2446160316467285,
+ "learning_rate": 4.745768321961676e-06,
+ "loss": 0.5352,
+ "step": 1923
+ },
+ {
+ "epoch": 0.909692671394799,
+ "grad_norm": 2.6517012119293213,
+ "learning_rate": 4.745494162646841e-06,
+ "loss": 0.6118,
+ "step": 1924
+ },
+ {
+ "epoch": 0.9101654846335697,
+ "grad_norm": 2.774900197982788,
+ "learning_rate": 4.7452198635149304e-06,
+ "loss": 0.572,
+ "step": 1925
+ },
+ {
+ "epoch": 0.9106382978723404,
+ "grad_norm": 3.0133683681488037,
+ "learning_rate": 4.744945424583024e-06,
+ "loss": 0.5897,
+ "step": 1926
+ },
+ {
+ "epoch": 0.9111111111111111,
+ "grad_norm": 2.7344839572906494,
+ "learning_rate": 4.744670845868211e-06,
+ "loss": 0.6207,
+ "step": 1927
+ },
+ {
+ "epoch": 0.9115839243498818,
+ "grad_norm": 2.636578321456909,
+ "learning_rate": 4.744396127387586e-06,
+ "loss": 0.6687,
+ "step": 1928
+ },
+ {
+ "epoch": 0.9120567375886525,
+ "grad_norm": 2.8663458824157715,
+ "learning_rate": 4.744121269158255e-06,
+ "loss": 0.5002,
+ "step": 1929
+ },
+ {
+ "epoch": 0.9125295508274232,
+ "grad_norm": 2.661079168319702,
+ "learning_rate": 4.743846271197333e-06,
+ "loss": 0.5848,
+ "step": 1930
+ },
+ {
+ "epoch": 0.9130023640661938,
+ "grad_norm": 2.881256341934204,
+ "learning_rate": 4.743571133521943e-06,
+ "loss": 0.5911,
+ "step": 1931
+ },
+ {
+ "epoch": 0.9134751773049645,
+ "grad_norm": 2.5540573596954346,
+ "learning_rate": 4.743295856149217e-06,
+ "loss": 0.5647,
+ "step": 1932
+ },
+ {
+ "epoch": 0.9139479905437352,
+ "grad_norm": 2.7060387134552,
+ "learning_rate": 4.743020439096293e-06,
+ "loss": 0.6267,
+ "step": 1933
+ },
+ {
+ "epoch": 0.9144208037825059,
+ "grad_norm": 2.694481372833252,
+ "learning_rate": 4.742744882380323e-06,
+ "loss": 0.6283,
+ "step": 1934
+ },
+ {
+ "epoch": 0.9148936170212766,
+ "grad_norm": 2.711555242538452,
+ "learning_rate": 4.7424691860184625e-06,
+ "loss": 0.5784,
+ "step": 1935
+ },
+ {
+ "epoch": 0.9153664302600473,
+ "grad_norm": 2.9077224731445312,
+ "learning_rate": 4.742193350027879e-06,
+ "loss": 0.5948,
+ "step": 1936
+ },
+ {
+ "epoch": 0.915839243498818,
+ "grad_norm": 2.9824187755584717,
+ "learning_rate": 4.7419173744257476e-06,
+ "loss": 0.6115,
+ "step": 1937
+ },
+ {
+ "epoch": 0.9163120567375886,
+ "grad_norm": 2.5127830505371094,
+ "learning_rate": 4.7416412592292515e-06,
+ "loss": 0.5803,
+ "step": 1938
+ },
+ {
+ "epoch": 0.9167848699763593,
+ "grad_norm": 3.1307175159454346,
+ "learning_rate": 4.741365004455583e-06,
+ "loss": 0.5657,
+ "step": 1939
+ },
+ {
+ "epoch": 0.91725768321513,
+ "grad_norm": 2.8205273151397705,
+ "learning_rate": 4.741088610121944e-06,
+ "loss": 0.6145,
+ "step": 1940
+ },
+ {
+ "epoch": 0.9177304964539007,
+ "grad_norm": 2.6119720935821533,
+ "learning_rate": 4.7408120762455444e-06,
+ "loss": 0.6058,
+ "step": 1941
+ },
+ {
+ "epoch": 0.9182033096926714,
+ "grad_norm": 2.421276092529297,
+ "learning_rate": 4.7405354028436025e-06,
+ "loss": 0.5973,
+ "step": 1942
+ },
+ {
+ "epoch": 0.9186761229314421,
+ "grad_norm": 2.9846808910369873,
+ "learning_rate": 4.740258589933346e-06,
+ "loss": 0.6892,
+ "step": 1943
+ },
+ {
+ "epoch": 0.9191489361702128,
+ "grad_norm": 2.6899871826171875,
+ "learning_rate": 4.739981637532009e-06,
+ "loss": 0.5705,
+ "step": 1944
+ },
+ {
+ "epoch": 0.9196217494089834,
+ "grad_norm": 2.8636131286621094,
+ "learning_rate": 4.739704545656839e-06,
+ "loss": 0.5775,
+ "step": 1945
+ },
+ {
+ "epoch": 0.9200945626477541,
+ "grad_norm": 2.7659449577331543,
+ "learning_rate": 4.739427314325087e-06,
+ "loss": 0.5823,
+ "step": 1946
+ },
+ {
+ "epoch": 0.9205673758865248,
+ "grad_norm": 4.71295166015625,
+ "learning_rate": 4.739149943554016e-06,
+ "loss": 0.5601,
+ "step": 1947
+ },
+ {
+ "epoch": 0.9210401891252955,
+ "grad_norm": 2.642636775970459,
+ "learning_rate": 4.738872433360896e-06,
+ "loss": 0.5278,
+ "step": 1948
+ },
+ {
+ "epoch": 0.9215130023640662,
+ "grad_norm": 2.4658217430114746,
+ "learning_rate": 4.7385947837630065e-06,
+ "loss": 0.6392,
+ "step": 1949
+ },
+ {
+ "epoch": 0.9219858156028369,
+ "grad_norm": 2.851602792739868,
+ "learning_rate": 4.738316994777636e-06,
+ "loss": 0.6164,
+ "step": 1950
+ },
+ {
+ "epoch": 0.9224586288416076,
+ "grad_norm": 2.394226551055908,
+ "learning_rate": 4.738039066422081e-06,
+ "loss": 0.5556,
+ "step": 1951
+ },
+ {
+ "epoch": 0.9229314420803783,
+ "grad_norm": 2.7985100746154785,
+ "learning_rate": 4.737760998713647e-06,
+ "loss": 0.5799,
+ "step": 1952
+ },
+ {
+ "epoch": 0.9234042553191489,
+ "grad_norm": 2.5974674224853516,
+ "learning_rate": 4.737482791669648e-06,
+ "loss": 0.6984,
+ "step": 1953
+ },
+ {
+ "epoch": 0.9238770685579196,
+ "grad_norm": 2.707636594772339,
+ "learning_rate": 4.737204445307406e-06,
+ "loss": 0.5548,
+ "step": 1954
+ },
+ {
+ "epoch": 0.9243498817966903,
+ "grad_norm": 2.7882707118988037,
+ "learning_rate": 4.736925959644254e-06,
+ "loss": 0.6026,
+ "step": 1955
+ },
+ {
+ "epoch": 0.924822695035461,
+ "grad_norm": 2.474482774734497,
+ "learning_rate": 4.7366473346975304e-06,
+ "loss": 0.5832,
+ "step": 1956
+ },
+ {
+ "epoch": 0.9252955082742317,
+ "grad_norm": 2.6196324825286865,
+ "learning_rate": 4.736368570484585e-06,
+ "loss": 0.5861,
+ "step": 1957
+ },
+ {
+ "epoch": 0.9257683215130024,
+ "grad_norm": 2.826864004135132,
+ "learning_rate": 4.736089667022775e-06,
+ "loss": 0.6173,
+ "step": 1958
+ },
+ {
+ "epoch": 0.926241134751773,
+ "grad_norm": 2.414473056793213,
+ "learning_rate": 4.735810624329466e-06,
+ "loss": 0.5753,
+ "step": 1959
+ },
+ {
+ "epoch": 0.9267139479905437,
+ "grad_norm": 2.8037970066070557,
+ "learning_rate": 4.7355314424220335e-06,
+ "loss": 0.6207,
+ "step": 1960
+ },
+ {
+ "epoch": 0.9271867612293144,
+ "grad_norm": 2.645458698272705,
+ "learning_rate": 4.735252121317861e-06,
+ "loss": 0.5959,
+ "step": 1961
+ },
+ {
+ "epoch": 0.9276595744680851,
+ "grad_norm": 2.7983884811401367,
+ "learning_rate": 4.734972661034339e-06,
+ "loss": 0.5696,
+ "step": 1962
+ },
+ {
+ "epoch": 0.9281323877068558,
+ "grad_norm": 3.0568997859954834,
+ "learning_rate": 4.73469306158887e-06,
+ "loss": 0.6194,
+ "step": 1963
+ },
+ {
+ "epoch": 0.9286052009456265,
+ "grad_norm": 2.7205135822296143,
+ "learning_rate": 4.734413322998863e-06,
+ "loss": 0.5292,
+ "step": 1964
+ },
+ {
+ "epoch": 0.9290780141843972,
+ "grad_norm": 3.3168489933013916,
+ "learning_rate": 4.734133445281735e-06,
+ "loss": 0.5654,
+ "step": 1965
+ },
+ {
+ "epoch": 0.9295508274231679,
+ "grad_norm": 3.0095653533935547,
+ "learning_rate": 4.733853428454916e-06,
+ "loss": 0.6508,
+ "step": 1966
+ },
+ {
+ "epoch": 0.9300236406619385,
+ "grad_norm": 2.7726712226867676,
+ "learning_rate": 4.733573272535838e-06,
+ "loss": 0.644,
+ "step": 1967
+ },
+ {
+ "epoch": 0.9304964539007092,
+ "grad_norm": 2.474397659301758,
+ "learning_rate": 4.7332929775419456e-06,
+ "loss": 0.5479,
+ "step": 1968
+ },
+ {
+ "epoch": 0.9309692671394799,
+ "grad_norm": 2.4518635272979736,
+ "learning_rate": 4.733012543490693e-06,
+ "loss": 0.6,
+ "step": 1969
+ },
+ {
+ "epoch": 0.9314420803782506,
+ "grad_norm": 2.9292192459106445,
+ "learning_rate": 4.73273197039954e-06,
+ "loss": 0.6647,
+ "step": 1970
+ },
+ {
+ "epoch": 0.9319148936170213,
+ "grad_norm": 2.425004720687866,
+ "learning_rate": 4.732451258285958e-06,
+ "loss": 0.6338,
+ "step": 1971
+ },
+ {
+ "epoch": 0.932387706855792,
+ "grad_norm": 2.904479503631592,
+ "learning_rate": 4.7321704071674255e-06,
+ "loss": 0.5923,
+ "step": 1972
+ },
+ {
+ "epoch": 0.9328605200945627,
+ "grad_norm": 2.477085590362549,
+ "learning_rate": 4.731889417061428e-06,
+ "loss": 0.5984,
+ "step": 1973
+ },
+ {
+ "epoch": 0.9333333333333333,
+ "grad_norm": 2.585240364074707,
+ "learning_rate": 4.731608287985465e-06,
+ "loss": 0.558,
+ "step": 1974
+ },
+ {
+ "epoch": 0.933806146572104,
+ "grad_norm": 2.658714532852173,
+ "learning_rate": 4.731327019957039e-06,
+ "loss": 0.5567,
+ "step": 1975
+ },
+ {
+ "epoch": 0.9342789598108747,
+ "grad_norm": 2.7593026161193848,
+ "learning_rate": 4.731045612993662e-06,
+ "loss": 0.5772,
+ "step": 1976
+ },
+ {
+ "epoch": 0.9347517730496454,
+ "grad_norm": 2.4386026859283447,
+ "learning_rate": 4.7307640671128585e-06,
+ "loss": 0.6199,
+ "step": 1977
+ },
+ {
+ "epoch": 0.9352245862884161,
+ "grad_norm": 2.681910514831543,
+ "learning_rate": 4.730482382332158e-06,
+ "loss": 0.5971,
+ "step": 1978
+ },
+ {
+ "epoch": 0.9356973995271868,
+ "grad_norm": 3.7593860626220703,
+ "learning_rate": 4.7302005586691e-06,
+ "loss": 0.6346,
+ "step": 1979
+ },
+ {
+ "epoch": 0.9361702127659575,
+ "grad_norm": 2.5789096355438232,
+ "learning_rate": 4.729918596141232e-06,
+ "loss": 0.5684,
+ "step": 1980
+ },
+ {
+ "epoch": 0.9366430260047282,
+ "grad_norm": 3.0607335567474365,
+ "learning_rate": 4.729636494766111e-06,
+ "loss": 0.6223,
+ "step": 1981
+ },
+ {
+ "epoch": 0.9371158392434988,
+ "grad_norm": 2.906643867492676,
+ "learning_rate": 4.729354254561303e-06,
+ "loss": 0.6513,
+ "step": 1982
+ },
+ {
+ "epoch": 0.9375886524822695,
+ "grad_norm": 3.192430019378662,
+ "learning_rate": 4.7290718755443795e-06,
+ "loss": 0.5095,
+ "step": 1983
+ },
+ {
+ "epoch": 0.9380614657210402,
+ "grad_norm": 2.661536931991577,
+ "learning_rate": 4.7287893577329255e-06,
+ "loss": 0.5525,
+ "step": 1984
+ },
+ {
+ "epoch": 0.9385342789598109,
+ "grad_norm": 2.8436734676361084,
+ "learning_rate": 4.728506701144531e-06,
+ "loss": 0.6323,
+ "step": 1985
+ },
+ {
+ "epoch": 0.9390070921985816,
+ "grad_norm": 2.75544810295105,
+ "learning_rate": 4.728223905796796e-06,
+ "loss": 0.6018,
+ "step": 1986
+ },
+ {
+ "epoch": 0.9394799054373523,
+ "grad_norm": 3.0652759075164795,
+ "learning_rate": 4.727940971707329e-06,
+ "loss": 0.62,
+ "step": 1987
+ },
+ {
+ "epoch": 0.939952718676123,
+ "grad_norm": 2.802567720413208,
+ "learning_rate": 4.727657898893747e-06,
+ "loss": 0.5809,
+ "step": 1988
+ },
+ {
+ "epoch": 0.9404255319148936,
+ "grad_norm": 2.6208512783050537,
+ "learning_rate": 4.7273746873736745e-06,
+ "loss": 0.5762,
+ "step": 1989
+ },
+ {
+ "epoch": 0.9408983451536643,
+ "grad_norm": 2.5901873111724854,
+ "learning_rate": 4.727091337164748e-06,
+ "loss": 0.6111,
+ "step": 1990
+ },
+ {
+ "epoch": 0.941371158392435,
+ "grad_norm": 3.002347707748413,
+ "learning_rate": 4.726807848284609e-06,
+ "loss": 0.6419,
+ "step": 1991
+ },
+ {
+ "epoch": 0.9418439716312057,
+ "grad_norm": 2.522151470184326,
+ "learning_rate": 4.72652422075091e-06,
+ "loss": 0.642,
+ "step": 1992
+ },
+ {
+ "epoch": 0.9423167848699764,
+ "grad_norm": 2.5571532249450684,
+ "learning_rate": 4.726240454581311e-06,
+ "loss": 0.5729,
+ "step": 1993
+ },
+ {
+ "epoch": 0.9427895981087471,
+ "grad_norm": 2.7704918384552,
+ "learning_rate": 4.72595654979348e-06,
+ "loss": 0.6816,
+ "step": 1994
+ },
+ {
+ "epoch": 0.9432624113475178,
+ "grad_norm": 2.517040491104126,
+ "learning_rate": 4.7256725064050955e-06,
+ "loss": 0.5782,
+ "step": 1995
+ },
+ {
+ "epoch": 0.9437352245862884,
+ "grad_norm": 2.613955020904541,
+ "learning_rate": 4.725388324433843e-06,
+ "loss": 0.6291,
+ "step": 1996
+ },
+ {
+ "epoch": 0.9442080378250591,
+ "grad_norm": 2.848891258239746,
+ "learning_rate": 4.725104003897418e-06,
+ "loss": 0.6544,
+ "step": 1997
+ },
+ {
+ "epoch": 0.9446808510638298,
+ "grad_norm": 3.0162429809570312,
+ "learning_rate": 4.724819544813523e-06,
+ "loss": 0.6301,
+ "step": 1998
+ },
+ {
+ "epoch": 0.9451536643026005,
+ "grad_norm": 2.613614559173584,
+ "learning_rate": 4.72453494719987e-06,
+ "loss": 0.5829,
+ "step": 1999
+ },
+ {
+ "epoch": 0.9456264775413712,
+ "grad_norm": 2.4838767051696777,
+ "learning_rate": 4.724250211074182e-06,
+ "loss": 0.6042,
+ "step": 2000
+ },
+ {
+ "epoch": 0.9460992907801419,
+ "grad_norm": 2.526470899581909,
+ "learning_rate": 4.723965336454185e-06,
+ "loss": 0.6167,
+ "step": 2001
+ },
+ {
+ "epoch": 0.9465721040189126,
+ "grad_norm": 2.504506826400757,
+ "learning_rate": 4.723680323357618e-06,
+ "loss": 0.6061,
+ "step": 2002
+ },
+ {
+ "epoch": 0.9470449172576832,
+ "grad_norm": 3.0547544956207275,
+ "learning_rate": 4.723395171802228e-06,
+ "loss": 0.6619,
+ "step": 2003
+ },
+ {
+ "epoch": 0.9475177304964539,
+ "grad_norm": 2.8692407608032227,
+ "learning_rate": 4.723109881805771e-06,
+ "loss": 0.5985,
+ "step": 2004
+ },
+ {
+ "epoch": 0.9479905437352246,
+ "grad_norm": 2.7929654121398926,
+ "learning_rate": 4.7228244533860094e-06,
+ "loss": 0.5869,
+ "step": 2005
+ },
+ {
+ "epoch": 0.9484633569739953,
+ "grad_norm": 2.764869451522827,
+ "learning_rate": 4.7225388865607146e-06,
+ "loss": 0.6288,
+ "step": 2006
+ },
+ {
+ "epoch": 0.948936170212766,
+ "grad_norm": 2.7656404972076416,
+ "learning_rate": 4.722253181347671e-06,
+ "loss": 0.5831,
+ "step": 2007
+ },
+ {
+ "epoch": 0.9494089834515367,
+ "grad_norm": 2.6698336601257324,
+ "learning_rate": 4.7219673377646635e-06,
+ "loss": 0.6087,
+ "step": 2008
+ },
+ {
+ "epoch": 0.9498817966903074,
+ "grad_norm": 2.524935722351074,
+ "learning_rate": 4.7216813558294946e-06,
+ "loss": 0.5675,
+ "step": 2009
+ },
+ {
+ "epoch": 0.950354609929078,
+ "grad_norm": 2.5998785495758057,
+ "learning_rate": 4.721395235559969e-06,
+ "loss": 0.5667,
+ "step": 2010
+ },
+ {
+ "epoch": 0.9508274231678487,
+ "grad_norm": 2.758021354675293,
+ "learning_rate": 4.721108976973902e-06,
+ "loss": 0.4931,
+ "step": 2011
+ },
+ {
+ "epoch": 0.9513002364066194,
+ "grad_norm": 2.767695903778076,
+ "learning_rate": 4.72082258008912e-06,
+ "loss": 0.5778,
+ "step": 2012
+ },
+ {
+ "epoch": 0.9517730496453901,
+ "grad_norm": 2.982314348220825,
+ "learning_rate": 4.720536044923453e-06,
+ "loss": 0.6096,
+ "step": 2013
+ },
+ {
+ "epoch": 0.9522458628841608,
+ "grad_norm": 2.7608799934387207,
+ "learning_rate": 4.720249371494743e-06,
+ "loss": 0.6242,
+ "step": 2014
+ },
+ {
+ "epoch": 0.9527186761229315,
+ "grad_norm": 2.60054349899292,
+ "learning_rate": 4.71996255982084e-06,
+ "loss": 0.6249,
+ "step": 2015
+ },
+ {
+ "epoch": 0.9531914893617022,
+ "grad_norm": 2.654355764389038,
+ "learning_rate": 4.719675609919603e-06,
+ "loss": 0.6327,
+ "step": 2016
+ },
+ {
+ "epoch": 0.9536643026004729,
+ "grad_norm": 2.589404582977295,
+ "learning_rate": 4.719388521808899e-06,
+ "loss": 0.6357,
+ "step": 2017
+ },
+ {
+ "epoch": 0.9541371158392435,
+ "grad_norm": 2.8016581535339355,
+ "learning_rate": 4.719101295506603e-06,
+ "loss": 0.5901,
+ "step": 2018
+ },
+ {
+ "epoch": 0.9546099290780142,
+ "grad_norm": 3.1408045291900635,
+ "learning_rate": 4.7188139310306e-06,
+ "loss": 0.598,
+ "step": 2019
+ },
+ {
+ "epoch": 0.9550827423167849,
+ "grad_norm": 2.7432665824890137,
+ "learning_rate": 4.718526428398783e-06,
+ "loss": 0.5508,
+ "step": 2020
+ },
+ {
+ "epoch": 0.9555555555555556,
+ "grad_norm": 2.947800874710083,
+ "learning_rate": 4.718238787629053e-06,
+ "loss": 0.6439,
+ "step": 2021
+ },
+ {
+ "epoch": 0.9560283687943263,
+ "grad_norm": 2.50828218460083,
+ "learning_rate": 4.71795100873932e-06,
+ "loss": 0.5441,
+ "step": 2022
+ },
+ {
+ "epoch": 0.956501182033097,
+ "grad_norm": 2.8558974266052246,
+ "learning_rate": 4.717663091747503e-06,
+ "loss": 0.5416,
+ "step": 2023
+ },
+ {
+ "epoch": 0.9569739952718677,
+ "grad_norm": 2.4803316593170166,
+ "learning_rate": 4.71737503667153e-06,
+ "loss": 0.5317,
+ "step": 2024
+ },
+ {
+ "epoch": 0.9574468085106383,
+ "grad_norm": 4.36754035949707,
+ "learning_rate": 4.717086843529336e-06,
+ "loss": 0.5808,
+ "step": 2025
+ },
+ {
+ "epoch": 0.957919621749409,
+ "grad_norm": 2.730185031890869,
+ "learning_rate": 4.7167985123388665e-06,
+ "loss": 0.5257,
+ "step": 2026
+ },
+ {
+ "epoch": 0.9583924349881797,
+ "grad_norm": 2.8136069774627686,
+ "learning_rate": 4.716510043118074e-06,
+ "loss": 0.5836,
+ "step": 2027
+ },
+ {
+ "epoch": 0.9588652482269504,
+ "grad_norm": 2.793975353240967,
+ "learning_rate": 4.71622143588492e-06,
+ "loss": 0.5706,
+ "step": 2028
+ },
+ {
+ "epoch": 0.9593380614657211,
+ "grad_norm": 2.3883821964263916,
+ "learning_rate": 4.7159326906573745e-06,
+ "loss": 0.5291,
+ "step": 2029
+ },
+ {
+ "epoch": 0.9598108747044918,
+ "grad_norm": 2.6135976314544678,
+ "learning_rate": 4.715643807453417e-06,
+ "loss": 0.6199,
+ "step": 2030
+ },
+ {
+ "epoch": 0.9602836879432625,
+ "grad_norm": 2.6245670318603516,
+ "learning_rate": 4.715354786291035e-06,
+ "loss": 0.5585,
+ "step": 2031
+ },
+ {
+ "epoch": 0.9607565011820332,
+ "grad_norm": 2.7870967388153076,
+ "learning_rate": 4.715065627188225e-06,
+ "loss": 0.6196,
+ "step": 2032
+ },
+ {
+ "epoch": 0.9612293144208038,
+ "grad_norm": 2.6983911991119385,
+ "learning_rate": 4.714776330162991e-06,
+ "loss": 0.6424,
+ "step": 2033
+ },
+ {
+ "epoch": 0.9617021276595744,
+ "grad_norm": 2.3221919536590576,
+ "learning_rate": 4.7144868952333465e-06,
+ "loss": 0.568,
+ "step": 2034
+ },
+ {
+ "epoch": 0.9621749408983451,
+ "grad_norm": 2.9408178329467773,
+ "learning_rate": 4.714197322417314e-06,
+ "loss": 0.6175,
+ "step": 2035
+ },
+ {
+ "epoch": 0.9626477541371158,
+ "grad_norm": 2.404057264328003,
+ "learning_rate": 4.713907611732921e-06,
+ "loss": 0.4943,
+ "step": 2036
+ },
+ {
+ "epoch": 0.9631205673758865,
+ "grad_norm": 3.547607660293579,
+ "learning_rate": 4.71361776319821e-06,
+ "loss": 0.5488,
+ "step": 2037
+ },
+ {
+ "epoch": 0.9635933806146572,
+ "grad_norm": 2.679614543914795,
+ "learning_rate": 4.713327776831227e-06,
+ "loss": 0.6234,
+ "step": 2038
+ },
+ {
+ "epoch": 0.9640661938534278,
+ "grad_norm": 2.526914119720459,
+ "learning_rate": 4.7130376526500286e-06,
+ "loss": 0.5891,
+ "step": 2039
+ },
+ {
+ "epoch": 0.9645390070921985,
+ "grad_norm": 2.6953470706939697,
+ "learning_rate": 4.71274739067268e-06,
+ "loss": 0.69,
+ "step": 2040
+ },
+ {
+ "epoch": 0.9650118203309692,
+ "grad_norm": 2.546660900115967,
+ "learning_rate": 4.712456990917254e-06,
+ "loss": 0.6185,
+ "step": 2041
+ },
+ {
+ "epoch": 0.9654846335697399,
+ "grad_norm": 3.3920490741729736,
+ "learning_rate": 4.712166453401832e-06,
+ "loss": 0.587,
+ "step": 2042
+ },
+ {
+ "epoch": 0.9659574468085106,
+ "grad_norm": 2.5961573123931885,
+ "learning_rate": 4.711875778144504e-06,
+ "loss": 0.6105,
+ "step": 2043
+ },
+ {
+ "epoch": 0.9664302600472813,
+ "grad_norm": 2.5111498832702637,
+ "learning_rate": 4.711584965163372e-06,
+ "loss": 0.5533,
+ "step": 2044
+ },
+ {
+ "epoch": 0.966903073286052,
+ "grad_norm": 2.4878132343292236,
+ "learning_rate": 4.7112940144765405e-06,
+ "loss": 0.5604,
+ "step": 2045
+ },
+ {
+ "epoch": 0.9673758865248226,
+ "grad_norm": 2.5714077949523926,
+ "learning_rate": 4.711002926102128e-06,
+ "loss": 0.5794,
+ "step": 2046
+ },
+ {
+ "epoch": 0.9678486997635933,
+ "grad_norm": 2.7069091796875,
+ "learning_rate": 4.710711700058257e-06,
+ "loss": 0.594,
+ "step": 2047
+ },
+ {
+ "epoch": 0.968321513002364,
+ "grad_norm": 2.8104631900787354,
+ "learning_rate": 4.710420336363063e-06,
+ "loss": 0.6247,
+ "step": 2048
+ },
+ {
+ "epoch": 0.9687943262411347,
+ "grad_norm": 2.8464386463165283,
+ "learning_rate": 4.7101288350346865e-06,
+ "loss": 0.6162,
+ "step": 2049
+ },
+ {
+ "epoch": 0.9692671394799054,
+ "grad_norm": 2.7187976837158203,
+ "learning_rate": 4.709837196091279e-06,
+ "loss": 0.6109,
+ "step": 2050
+ },
+ {
+ "epoch": 0.9697399527186761,
+ "grad_norm": 2.556734085083008,
+ "learning_rate": 4.709545419550999e-06,
+ "loss": 0.6297,
+ "step": 2051
+ },
+ {
+ "epoch": 0.9702127659574468,
+ "grad_norm": 2.937195062637329,
+ "learning_rate": 4.709253505432014e-06,
+ "loss": 0.6862,
+ "step": 2052
+ },
+ {
+ "epoch": 0.9706855791962175,
+ "grad_norm": 2.792175531387329,
+ "learning_rate": 4.7089614537525015e-06,
+ "loss": 0.6105,
+ "step": 2053
+ },
+ {
+ "epoch": 0.9711583924349881,
+ "grad_norm": 2.625636100769043,
+ "learning_rate": 4.708669264530644e-06,
+ "loss": 0.5849,
+ "step": 2054
+ },
+ {
+ "epoch": 0.9716312056737588,
+ "grad_norm": 2.6752610206604004,
+ "learning_rate": 4.708376937784637e-06,
+ "loss": 0.5949,
+ "step": 2055
+ },
+ {
+ "epoch": 0.9721040189125295,
+ "grad_norm": 2.6072793006896973,
+ "learning_rate": 4.708084473532681e-06,
+ "loss": 0.5776,
+ "step": 2056
+ },
+ {
+ "epoch": 0.9725768321513002,
+ "grad_norm": 2.728632926940918,
+ "learning_rate": 4.707791871792988e-06,
+ "loss": 0.6352,
+ "step": 2057
+ },
+ {
+ "epoch": 0.9730496453900709,
+ "grad_norm": 2.5841758251190186,
+ "learning_rate": 4.707499132583775e-06,
+ "loss": 0.5488,
+ "step": 2058
+ },
+ {
+ "epoch": 0.9735224586288416,
+ "grad_norm": 2.8464293479919434,
+ "learning_rate": 4.707206255923271e-06,
+ "loss": 0.7051,
+ "step": 2059
+ },
+ {
+ "epoch": 0.9739952718676123,
+ "grad_norm": 2.547297239303589,
+ "learning_rate": 4.706913241829712e-06,
+ "loss": 0.5937,
+ "step": 2060
+ },
+ {
+ "epoch": 0.9744680851063829,
+ "grad_norm": 2.6572306156158447,
+ "learning_rate": 4.706620090321341e-06,
+ "loss": 0.6041,
+ "step": 2061
+ },
+ {
+ "epoch": 0.9749408983451536,
+ "grad_norm": 2.3262805938720703,
+ "learning_rate": 4.706326801416414e-06,
+ "loss": 0.5144,
+ "step": 2062
+ },
+ {
+ "epoch": 0.9754137115839243,
+ "grad_norm": 2.9693965911865234,
+ "learning_rate": 4.706033375133191e-06,
+ "loss": 0.551,
+ "step": 2063
+ },
+ {
+ "epoch": 0.975886524822695,
+ "grad_norm": 2.5993731021881104,
+ "learning_rate": 4.7057398114899435e-06,
+ "loss": 0.6143,
+ "step": 2064
+ },
+ {
+ "epoch": 0.9763593380614657,
+ "grad_norm": 2.453336477279663,
+ "learning_rate": 4.70544611050495e-06,
+ "loss": 0.6093,
+ "step": 2065
+ },
+ {
+ "epoch": 0.9768321513002364,
+ "grad_norm": 2.898629665374756,
+ "learning_rate": 4.705152272196497e-06,
+ "loss": 0.6007,
+ "step": 2066
+ },
+ {
+ "epoch": 0.9773049645390071,
+ "grad_norm": 2.7990612983703613,
+ "learning_rate": 4.7048582965828815e-06,
+ "loss": 0.6687,
+ "step": 2067
+ },
+ {
+ "epoch": 0.9777777777777777,
+ "grad_norm": 2.635284423828125,
+ "learning_rate": 4.704564183682408e-06,
+ "loss": 0.5564,
+ "step": 2068
+ },
+ {
+ "epoch": 0.9782505910165484,
+ "grad_norm": 3.014547109603882,
+ "learning_rate": 4.704269933513389e-06,
+ "loss": 0.6084,
+ "step": 2069
+ },
+ {
+ "epoch": 0.9787234042553191,
+ "grad_norm": 2.659357786178589,
+ "learning_rate": 4.703975546094147e-06,
+ "loss": 0.6031,
+ "step": 2070
+ },
+ {
+ "epoch": 0.9791962174940898,
+ "grad_norm": 2.326932668685913,
+ "learning_rate": 4.703681021443013e-06,
+ "loss": 0.5859,
+ "step": 2071
+ },
+ {
+ "epoch": 0.9796690307328605,
+ "grad_norm": 2.958803653717041,
+ "learning_rate": 4.7033863595783235e-06,
+ "loss": 0.5586,
+ "step": 2072
+ },
+ {
+ "epoch": 0.9801418439716312,
+ "grad_norm": 2.921386957168579,
+ "learning_rate": 4.703091560518427e-06,
+ "loss": 0.6126,
+ "step": 2073
+ },
+ {
+ "epoch": 0.9806146572104019,
+ "grad_norm": 2.6500775814056396,
+ "learning_rate": 4.702796624281679e-06,
+ "loss": 0.5678,
+ "step": 2074
+ },
+ {
+ "epoch": 0.9810874704491725,
+ "grad_norm": 2.7740228176116943,
+ "learning_rate": 4.702501550886445e-06,
+ "loss": 0.6067,
+ "step": 2075
+ },
+ {
+ "epoch": 0.9815602836879432,
+ "grad_norm": 2.3296213150024414,
+ "learning_rate": 4.702206340351096e-06,
+ "loss": 0.5247,
+ "step": 2076
+ },
+ {
+ "epoch": 0.9820330969267139,
+ "grad_norm": 2.748300790786743,
+ "learning_rate": 4.701910992694016e-06,
+ "loss": 0.5197,
+ "step": 2077
+ },
+ {
+ "epoch": 0.9825059101654846,
+ "grad_norm": 2.250985622406006,
+ "learning_rate": 4.7016155079335926e-06,
+ "loss": 0.5214,
+ "step": 2078
+ },
+ {
+ "epoch": 0.9829787234042553,
+ "grad_norm": 2.389845848083496,
+ "learning_rate": 4.701319886088226e-06,
+ "loss": 0.519,
+ "step": 2079
+ },
+ {
+ "epoch": 0.983451536643026,
+ "grad_norm": 2.818220853805542,
+ "learning_rate": 4.701024127176322e-06,
+ "loss": 0.607,
+ "step": 2080
+ },
+ {
+ "epoch": 0.9839243498817967,
+ "grad_norm": 3.4058034420013428,
+ "learning_rate": 4.700728231216297e-06,
+ "loss": 0.5711,
+ "step": 2081
+ },
+ {
+ "epoch": 0.9843971631205674,
+ "grad_norm": 2.5297787189483643,
+ "learning_rate": 4.700432198226575e-06,
+ "loss": 0.5979,
+ "step": 2082
+ },
+ {
+ "epoch": 0.984869976359338,
+ "grad_norm": 3.0548105239868164,
+ "learning_rate": 4.7001360282255885e-06,
+ "loss": 0.6041,
+ "step": 2083
+ },
+ {
+ "epoch": 0.9853427895981087,
+ "grad_norm": 2.8983733654022217,
+ "learning_rate": 4.699839721231779e-06,
+ "loss": 0.5926,
+ "step": 2084
+ },
+ {
+ "epoch": 0.9858156028368794,
+ "grad_norm": 3.2717764377593994,
+ "learning_rate": 4.699543277263596e-06,
+ "loss": 0.6477,
+ "step": 2085
+ },
+ {
+ "epoch": 0.9862884160756501,
+ "grad_norm": 3.03729248046875,
+ "learning_rate": 4.699246696339497e-06,
+ "loss": 0.6786,
+ "step": 2086
+ },
+ {
+ "epoch": 0.9867612293144208,
+ "grad_norm": 2.852301597595215,
+ "learning_rate": 4.698949978477951e-06,
+ "loss": 0.6565,
+ "step": 2087
+ },
+ {
+ "epoch": 0.9872340425531915,
+ "grad_norm": 2.843485116958618,
+ "learning_rate": 4.698653123697431e-06,
+ "loss": 0.6627,
+ "step": 2088
+ },
+ {
+ "epoch": 0.9877068557919622,
+ "grad_norm": 2.6315064430236816,
+ "learning_rate": 4.698356132016423e-06,
+ "loss": 0.6577,
+ "step": 2089
+ },
+ {
+ "epoch": 0.9881796690307328,
+ "grad_norm": 2.7482151985168457,
+ "learning_rate": 4.698059003453417e-06,
+ "loss": 0.5514,
+ "step": 2090
+ },
+ {
+ "epoch": 0.9886524822695035,
+ "grad_norm": 2.826673746109009,
+ "learning_rate": 4.6977617380269145e-06,
+ "loss": 0.565,
+ "step": 2091
+ },
+ {
+ "epoch": 0.9891252955082742,
+ "grad_norm": 3.0273752212524414,
+ "learning_rate": 4.697464335755427e-06,
+ "loss": 0.6331,
+ "step": 2092
+ },
+ {
+ "epoch": 0.9895981087470449,
+ "grad_norm": 2.7551653385162354,
+ "learning_rate": 4.6971667966574695e-06,
+ "loss": 0.6486,
+ "step": 2093
+ },
+ {
+ "epoch": 0.9900709219858156,
+ "grad_norm": 2.656299114227295,
+ "learning_rate": 4.696869120751571e-06,
+ "loss": 0.6562,
+ "step": 2094
+ },
+ {
+ "epoch": 0.9905437352245863,
+ "grad_norm": 2.785322904586792,
+ "learning_rate": 4.696571308056265e-06,
+ "loss": 0.5892,
+ "step": 2095
+ },
+ {
+ "epoch": 0.991016548463357,
+ "grad_norm": 2.9334635734558105,
+ "learning_rate": 4.696273358590095e-06,
+ "loss": 0.6346,
+ "step": 2096
+ },
+ {
+ "epoch": 0.9914893617021276,
+ "grad_norm": 2.7944300174713135,
+ "learning_rate": 4.695975272371613e-06,
+ "loss": 0.5859,
+ "step": 2097
+ },
+ {
+ "epoch": 0.9919621749408983,
+ "grad_norm": 2.5416972637176514,
+ "learning_rate": 4.695677049419381e-06,
+ "loss": 0.5658,
+ "step": 2098
+ },
+ {
+ "epoch": 0.992434988179669,
+ "grad_norm": 2.4056856632232666,
+ "learning_rate": 4.695378689751966e-06,
+ "loss": 0.5121,
+ "step": 2099
+ },
+ {
+ "epoch": 0.9929078014184397,
+ "grad_norm": 2.614548683166504,
+ "learning_rate": 4.695080193387948e-06,
+ "loss": 0.5961,
+ "step": 2100
+ },
+ {
+ "epoch": 0.9933806146572104,
+ "grad_norm": 2.8966517448425293,
+ "learning_rate": 4.69478156034591e-06,
+ "loss": 0.5985,
+ "step": 2101
+ },
+ {
+ "epoch": 0.9938534278959811,
+ "grad_norm": 2.9514098167419434,
+ "learning_rate": 4.694482790644448e-06,
+ "loss": 0.5677,
+ "step": 2102
+ },
+ {
+ "epoch": 0.9943262411347518,
+ "grad_norm": 2.4326791763305664,
+ "learning_rate": 4.694183884302165e-06,
+ "loss": 0.5698,
+ "step": 2103
+ },
+ {
+ "epoch": 0.9947990543735225,
+ "grad_norm": 2.9242892265319824,
+ "learning_rate": 4.6938848413376735e-06,
+ "loss": 0.6245,
+ "step": 2104
+ },
+ {
+ "epoch": 0.9952718676122931,
+ "grad_norm": 2.9134104251861572,
+ "learning_rate": 4.693585661769593e-06,
+ "loss": 0.6164,
+ "step": 2105
+ },
+ {
+ "epoch": 0.9957446808510638,
+ "grad_norm": 2.472564458847046,
+ "learning_rate": 4.693286345616551e-06,
+ "loss": 0.5616,
+ "step": 2106
+ },
+ {
+ "epoch": 0.9962174940898345,
+ "grad_norm": 3.2456448078155518,
+ "learning_rate": 4.692986892897186e-06,
+ "loss": 0.6977,
+ "step": 2107
+ },
+ {
+ "epoch": 0.9966903073286052,
+ "grad_norm": 3.4032769203186035,
+ "learning_rate": 4.692687303630143e-06,
+ "loss": 0.643,
+ "step": 2108
+ },
+ {
+ "epoch": 0.9971631205673759,
+ "grad_norm": 2.722200870513916,
+ "learning_rate": 4.692387577834076e-06,
+ "loss": 0.5873,
+ "step": 2109
+ },
+ {
+ "epoch": 0.9976359338061466,
+ "grad_norm": 2.687532663345337,
+ "learning_rate": 4.692087715527648e-06,
+ "loss": 0.5423,
+ "step": 2110
+ },
+ {
+ "epoch": 0.9981087470449173,
+ "grad_norm": 2.578613042831421,
+ "learning_rate": 4.6917877167295305e-06,
+ "loss": 0.5689,
+ "step": 2111
+ },
+ {
+ "epoch": 0.9985815602836879,
+ "grad_norm": 3.1806094646453857,
+ "learning_rate": 4.691487581458402e-06,
+ "loss": 0.6133,
+ "step": 2112
+ },
+ {
+ "epoch": 0.9990543735224586,
+ "grad_norm": 2.4449520111083984,
+ "learning_rate": 4.691187309732952e-06,
+ "loss": 0.5841,
+ "step": 2113
+ },
+ {
+ "epoch": 0.9995271867612293,
+ "grad_norm": 2.908749580383301,
+ "learning_rate": 4.690886901571875e-06,
+ "loss": 0.534,
+ "step": 2114
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 4.019968032836914,
+ "learning_rate": 4.6905863569938785e-06,
+ "loss": 0.596,
+ "step": 2115
+ },
+ {
+ "epoch": 1.0004728132387706,
+ "grad_norm": 2.4319307804107666,
+ "learning_rate": 4.690285676017675e-06,
+ "loss": 0.4973,
+ "step": 2116
+ },
+ {
+ "epoch": 1.0009456264775414,
+ "grad_norm": 2.6366477012634277,
+ "learning_rate": 4.689984858661986e-06,
+ "loss": 0.5682,
+ "step": 2117
+ },
+ {
+ "epoch": 1.001418439716312,
+ "grad_norm": 2.815114974975586,
+ "learning_rate": 4.689683904945542e-06,
+ "loss": 0.5616,
+ "step": 2118
+ },
+ {
+ "epoch": 1.0018912529550827,
+ "grad_norm": 2.6680490970611572,
+ "learning_rate": 4.689382814887084e-06,
+ "loss": 0.5161,
+ "step": 2119
+ },
+ {
+ "epoch": 1.0023640661938533,
+ "grad_norm": 2.7406351566314697,
+ "learning_rate": 4.689081588505358e-06,
+ "loss": 0.4937,
+ "step": 2120
+ },
+ {
+ "epoch": 1.0028368794326241,
+ "grad_norm": 2.2832298278808594,
+ "learning_rate": 4.68878022581912e-06,
+ "loss": 0.4986,
+ "step": 2121
+ },
+ {
+ "epoch": 1.0033096926713947,
+ "grad_norm": 2.5525307655334473,
+ "learning_rate": 4.688478726847136e-06,
+ "loss": 0.4909,
+ "step": 2122
+ },
+ {
+ "epoch": 1.0037825059101655,
+ "grad_norm": 2.9843199253082275,
+ "learning_rate": 4.688177091608176e-06,
+ "loss": 0.6046,
+ "step": 2123
+ },
+ {
+ "epoch": 1.004255319148936,
+ "grad_norm": 2.5231106281280518,
+ "learning_rate": 4.687875320121024e-06,
+ "loss": 0.5423,
+ "step": 2124
+ },
+ {
+ "epoch": 1.0047281323877069,
+ "grad_norm": 2.567599058151245,
+ "learning_rate": 4.68757341240447e-06,
+ "loss": 0.5092,
+ "step": 2125
+ },
+ {
+ "epoch": 1.0052009456264774,
+ "grad_norm": 2.768111228942871,
+ "learning_rate": 4.687271368477311e-06,
+ "loss": 0.5175,
+ "step": 2126
+ },
+ {
+ "epoch": 1.0056737588652482,
+ "grad_norm": 2.7223286628723145,
+ "learning_rate": 4.686969188358355e-06,
+ "loss": 0.5412,
+ "step": 2127
+ },
+ {
+ "epoch": 1.0061465721040188,
+ "grad_norm": 2.488299608230591,
+ "learning_rate": 4.686666872066418e-06,
+ "loss": 0.5288,
+ "step": 2128
+ },
+ {
+ "epoch": 1.0066193853427896,
+ "grad_norm": 2.882981777191162,
+ "learning_rate": 4.6863644196203215e-06,
+ "loss": 0.6117,
+ "step": 2129
+ },
+ {
+ "epoch": 1.0070921985815602,
+ "grad_norm": 3.0019447803497314,
+ "learning_rate": 4.686061831038901e-06,
+ "loss": 0.5308,
+ "step": 2130
+ },
+ {
+ "epoch": 1.007565011820331,
+ "grad_norm": 3.0056138038635254,
+ "learning_rate": 4.685759106340996e-06,
+ "loss": 0.5833,
+ "step": 2131
+ },
+ {
+ "epoch": 1.0080378250591016,
+ "grad_norm": 2.5709075927734375,
+ "learning_rate": 4.685456245545454e-06,
+ "loss": 0.5071,
+ "step": 2132
+ },
+ {
+ "epoch": 1.0085106382978724,
+ "grad_norm": 2.4641504287719727,
+ "learning_rate": 4.685153248671136e-06,
+ "loss": 0.4813,
+ "step": 2133
+ },
+ {
+ "epoch": 1.008983451536643,
+ "grad_norm": 2.374413013458252,
+ "learning_rate": 4.684850115736906e-06,
+ "loss": 0.5179,
+ "step": 2134
+ },
+ {
+ "epoch": 1.0094562647754137,
+ "grad_norm": 2.6504571437835693,
+ "learning_rate": 4.684546846761641e-06,
+ "loss": 0.437,
+ "step": 2135
+ },
+ {
+ "epoch": 1.0099290780141843,
+ "grad_norm": 2.5977871417999268,
+ "learning_rate": 4.684243441764221e-06,
+ "loss": 0.497,
+ "step": 2136
+ },
+ {
+ "epoch": 1.010401891252955,
+ "grad_norm": 2.4950785636901855,
+ "learning_rate": 4.683939900763541e-06,
+ "loss": 0.5624,
+ "step": 2137
+ },
+ {
+ "epoch": 1.0108747044917257,
+ "grad_norm": 3.065718412399292,
+ "learning_rate": 4.6836362237785e-06,
+ "loss": 0.512,
+ "step": 2138
+ },
+ {
+ "epoch": 1.0113475177304965,
+ "grad_norm": 2.7419207096099854,
+ "learning_rate": 4.6833324108280045e-06,
+ "loss": 0.5585,
+ "step": 2139
+ },
+ {
+ "epoch": 1.011820330969267,
+ "grad_norm": 2.623610496520996,
+ "learning_rate": 4.6830284619309744e-06,
+ "loss": 0.5163,
+ "step": 2140
+ },
+ {
+ "epoch": 1.0122931442080378,
+ "grad_norm": 2.774322986602783,
+ "learning_rate": 4.682724377106334e-06,
+ "loss": 0.527,
+ "step": 2141
+ },
+ {
+ "epoch": 1.0127659574468084,
+ "grad_norm": 2.959935188293457,
+ "learning_rate": 4.682420156373017e-06,
+ "loss": 0.6166,
+ "step": 2142
+ },
+ {
+ "epoch": 1.0132387706855792,
+ "grad_norm": 2.584026336669922,
+ "learning_rate": 4.682115799749968e-06,
+ "loss": 0.5086,
+ "step": 2143
+ },
+ {
+ "epoch": 1.0137115839243498,
+ "grad_norm": 2.6039700508117676,
+ "learning_rate": 4.6818113072561346e-06,
+ "loss": 0.49,
+ "step": 2144
+ },
+ {
+ "epoch": 1.0141843971631206,
+ "grad_norm": 2.466381072998047,
+ "learning_rate": 4.681506678910479e-06,
+ "loss": 0.4959,
+ "step": 2145
+ },
+ {
+ "epoch": 1.0146572104018912,
+ "grad_norm": 2.432636260986328,
+ "learning_rate": 4.681201914731969e-06,
+ "loss": 0.5057,
+ "step": 2146
+ },
+ {
+ "epoch": 1.015130023640662,
+ "grad_norm": 2.6134090423583984,
+ "learning_rate": 4.680897014739579e-06,
+ "loss": 0.4874,
+ "step": 2147
+ },
+ {
+ "epoch": 1.0156028368794325,
+ "grad_norm": 2.774481773376465,
+ "learning_rate": 4.680591978952295e-06,
+ "loss": 0.4967,
+ "step": 2148
+ },
+ {
+ "epoch": 1.0160756501182033,
+ "grad_norm": 2.66050124168396,
+ "learning_rate": 4.68028680738911e-06,
+ "loss": 0.4932,
+ "step": 2149
+ },
+ {
+ "epoch": 1.016548463356974,
+ "grad_norm": 3.020594835281372,
+ "learning_rate": 4.679981500069026e-06,
+ "loss": 0.5788,
+ "step": 2150
+ },
+ {
+ "epoch": 1.0170212765957447,
+ "grad_norm": 2.697758436203003,
+ "learning_rate": 4.679676057011053e-06,
+ "loss": 0.5441,
+ "step": 2151
+ },
+ {
+ "epoch": 1.0174940898345153,
+ "grad_norm": 6.986445903778076,
+ "learning_rate": 4.679370478234209e-06,
+ "loss": 0.6483,
+ "step": 2152
+ },
+ {
+ "epoch": 1.017966903073286,
+ "grad_norm": 2.6637115478515625,
+ "learning_rate": 4.679064763757522e-06,
+ "loss": 0.5859,
+ "step": 2153
+ },
+ {
+ "epoch": 1.0184397163120567,
+ "grad_norm": 2.7501862049102783,
+ "learning_rate": 4.678758913600027e-06,
+ "loss": 0.5745,
+ "step": 2154
+ },
+ {
+ "epoch": 1.0189125295508275,
+ "grad_norm": 2.7959372997283936,
+ "learning_rate": 4.678452927780768e-06,
+ "loss": 0.5076,
+ "step": 2155
+ },
+ {
+ "epoch": 1.019385342789598,
+ "grad_norm": 2.4377388954162598,
+ "learning_rate": 4.678146806318798e-06,
+ "loss": 0.5061,
+ "step": 2156
+ },
+ {
+ "epoch": 1.0198581560283688,
+ "grad_norm": 2.5478947162628174,
+ "learning_rate": 4.677840549233176e-06,
+ "loss": 0.4941,
+ "step": 2157
+ },
+ {
+ "epoch": 1.0203309692671394,
+ "grad_norm": 3.0956528186798096,
+ "learning_rate": 4.677534156542973e-06,
+ "loss": 0.5879,
+ "step": 2158
+ },
+ {
+ "epoch": 1.0208037825059102,
+ "grad_norm": 2.5247607231140137,
+ "learning_rate": 4.6772276282672666e-06,
+ "loss": 0.5532,
+ "step": 2159
+ },
+ {
+ "epoch": 1.0212765957446808,
+ "grad_norm": 3.1972787380218506,
+ "learning_rate": 4.676920964425143e-06,
+ "loss": 0.6081,
+ "step": 2160
+ },
+ {
+ "epoch": 1.0217494089834516,
+ "grad_norm": 2.6173388957977295,
+ "learning_rate": 4.6766141650356955e-06,
+ "loss": 0.5001,
+ "step": 2161
+ },
+ {
+ "epoch": 1.0222222222222221,
+ "grad_norm": 2.9914398193359375,
+ "learning_rate": 4.676307230118029e-06,
+ "loss": 0.5566,
+ "step": 2162
+ },
+ {
+ "epoch": 1.022695035460993,
+ "grad_norm": 2.8011834621429443,
+ "learning_rate": 4.676000159691254e-06,
+ "loss": 0.4909,
+ "step": 2163
+ },
+ {
+ "epoch": 1.0231678486997635,
+ "grad_norm": 2.6049559116363525,
+ "learning_rate": 4.67569295377449e-06,
+ "loss": 0.5018,
+ "step": 2164
+ },
+ {
+ "epoch": 1.0236406619385343,
+ "grad_norm": 2.8175013065338135,
+ "learning_rate": 4.675385612386866e-06,
+ "loss": 0.5309,
+ "step": 2165
+ },
+ {
+ "epoch": 1.0241134751773049,
+ "grad_norm": 2.854696750640869,
+ "learning_rate": 4.675078135547519e-06,
+ "loss": 0.5627,
+ "step": 2166
+ },
+ {
+ "epoch": 1.0245862884160757,
+ "grad_norm": 3.1856436729431152,
+ "learning_rate": 4.674770523275594e-06,
+ "loss": 0.5475,
+ "step": 2167
+ },
+ {
+ "epoch": 1.0250591016548463,
+ "grad_norm": 2.8289129734039307,
+ "learning_rate": 4.674462775590244e-06,
+ "loss": 0.5878,
+ "step": 2168
+ },
+ {
+ "epoch": 1.025531914893617,
+ "grad_norm": 2.8824517726898193,
+ "learning_rate": 4.6741548925106325e-06,
+ "loss": 0.4392,
+ "step": 2169
+ },
+ {
+ "epoch": 1.0260047281323876,
+ "grad_norm": 2.7044589519500732,
+ "learning_rate": 4.673846874055928e-06,
+ "loss": 0.5264,
+ "step": 2170
+ },
+ {
+ "epoch": 1.0264775413711584,
+ "grad_norm": 2.575035810470581,
+ "learning_rate": 4.673538720245312e-06,
+ "loss": 0.4615,
+ "step": 2171
+ },
+ {
+ "epoch": 1.026950354609929,
+ "grad_norm": 2.48168683052063,
+ "learning_rate": 4.67323043109797e-06,
+ "loss": 0.4404,
+ "step": 2172
+ },
+ {
+ "epoch": 1.0274231678486998,
+ "grad_norm": 2.926593065261841,
+ "learning_rate": 4.672922006633098e-06,
+ "loss": 0.54,
+ "step": 2173
+ },
+ {
+ "epoch": 1.0278959810874704,
+ "grad_norm": 2.4610698223114014,
+ "learning_rate": 4.672613446869901e-06,
+ "loss": 0.5555,
+ "step": 2174
+ },
+ {
+ "epoch": 1.0283687943262412,
+ "grad_norm": 3.026901960372925,
+ "learning_rate": 4.672304751827592e-06,
+ "loss": 0.62,
+ "step": 2175
+ },
+ {
+ "epoch": 1.0288416075650118,
+ "grad_norm": 2.3946213722229004,
+ "learning_rate": 4.671995921525391e-06,
+ "loss": 0.5228,
+ "step": 2176
+ },
+ {
+ "epoch": 1.0293144208037825,
+ "grad_norm": 2.985020399093628,
+ "learning_rate": 4.671686955982528e-06,
+ "loss": 0.6256,
+ "step": 2177
+ },
+ {
+ "epoch": 1.0297872340425531,
+ "grad_norm": 3.0910139083862305,
+ "learning_rate": 4.671377855218239e-06,
+ "loss": 0.5893,
+ "step": 2178
+ },
+ {
+ "epoch": 1.030260047281324,
+ "grad_norm": 2.507805109024048,
+ "learning_rate": 4.6710686192517744e-06,
+ "loss": 0.5329,
+ "step": 2179
+ },
+ {
+ "epoch": 1.0307328605200945,
+ "grad_norm": 2.4514641761779785,
+ "learning_rate": 4.670759248102386e-06,
+ "loss": 0.4585,
+ "step": 2180
+ },
+ {
+ "epoch": 1.0312056737588653,
+ "grad_norm": 2.742838144302368,
+ "learning_rate": 4.670449741789337e-06,
+ "loss": 0.6255,
+ "step": 2181
+ },
+ {
+ "epoch": 1.0316784869976359,
+ "grad_norm": 2.374349594116211,
+ "learning_rate": 4.670140100331901e-06,
+ "loss": 0.5049,
+ "step": 2182
+ },
+ {
+ "epoch": 1.0321513002364067,
+ "grad_norm": 2.78894305229187,
+ "learning_rate": 4.669830323749356e-06,
+ "loss": 0.6061,
+ "step": 2183
+ },
+ {
+ "epoch": 1.0326241134751772,
+ "grad_norm": 2.7195091247558594,
+ "learning_rate": 4.6695204120609905e-06,
+ "loss": 0.592,
+ "step": 2184
+ },
+ {
+ "epoch": 1.033096926713948,
+ "grad_norm": 2.824411630630493,
+ "learning_rate": 4.6692103652861035e-06,
+ "loss": 0.5666,
+ "step": 2185
+ },
+ {
+ "epoch": 1.0335697399527186,
+ "grad_norm": 2.4981014728546143,
+ "learning_rate": 4.6689001834439975e-06,
+ "loss": 0.5045,
+ "step": 2186
+ },
+ {
+ "epoch": 1.0340425531914894,
+ "grad_norm": 2.7375214099884033,
+ "learning_rate": 4.668589866553988e-06,
+ "loss": 0.5305,
+ "step": 2187
+ },
+ {
+ "epoch": 1.03451536643026,
+ "grad_norm": 2.625345468521118,
+ "learning_rate": 4.668279414635396e-06,
+ "loss": 0.4819,
+ "step": 2188
+ },
+ {
+ "epoch": 1.0349881796690308,
+ "grad_norm": 2.60479736328125,
+ "learning_rate": 4.667968827707553e-06,
+ "loss": 0.55,
+ "step": 2189
+ },
+ {
+ "epoch": 1.0354609929078014,
+ "grad_norm": 2.642014741897583,
+ "learning_rate": 4.667658105789797e-06,
+ "loss": 0.5264,
+ "step": 2190
+ },
+ {
+ "epoch": 1.0359338061465722,
+ "grad_norm": 2.5439083576202393,
+ "learning_rate": 4.667347248901476e-06,
+ "loss": 0.4657,
+ "step": 2191
+ },
+ {
+ "epoch": 1.0364066193853427,
+ "grad_norm": 2.5537586212158203,
+ "learning_rate": 4.667036257061945e-06,
+ "loss": 0.527,
+ "step": 2192
+ },
+ {
+ "epoch": 1.0368794326241135,
+ "grad_norm": 2.595466375350952,
+ "learning_rate": 4.666725130290569e-06,
+ "loss": 0.5336,
+ "step": 2193
+ },
+ {
+ "epoch": 1.037352245862884,
+ "grad_norm": 3.5106313228607178,
+ "learning_rate": 4.666413868606719e-06,
+ "loss": 0.5176,
+ "step": 2194
+ },
+ {
+ "epoch": 1.037825059101655,
+ "grad_norm": 2.931553363800049,
+ "learning_rate": 4.666102472029778e-06,
+ "loss": 0.549,
+ "step": 2195
+ },
+ {
+ "epoch": 1.0382978723404255,
+ "grad_norm": 2.4325125217437744,
+ "learning_rate": 4.665790940579133e-06,
+ "loss": 0.5095,
+ "step": 2196
+ },
+ {
+ "epoch": 1.0387706855791963,
+ "grad_norm": 2.708477258682251,
+ "learning_rate": 4.665479274274184e-06,
+ "loss": 0.5264,
+ "step": 2197
+ },
+ {
+ "epoch": 1.0392434988179668,
+ "grad_norm": 2.905977487564087,
+ "learning_rate": 4.665167473134335e-06,
+ "loss": 0.5575,
+ "step": 2198
+ },
+ {
+ "epoch": 1.0397163120567376,
+ "grad_norm": 2.428938865661621,
+ "learning_rate": 4.664855537179003e-06,
+ "loss": 0.5099,
+ "step": 2199
+ },
+ {
+ "epoch": 1.0401891252955082,
+ "grad_norm": 2.8432137966156006,
+ "learning_rate": 4.6645434664276075e-06,
+ "loss": 0.5331,
+ "step": 2200
+ },
+ {
+ "epoch": 1.040661938534279,
+ "grad_norm": 2.5185136795043945,
+ "learning_rate": 4.6642312608995825e-06,
+ "loss": 0.5217,
+ "step": 2201
+ },
+ {
+ "epoch": 1.0411347517730496,
+ "grad_norm": 2.556607723236084,
+ "learning_rate": 4.663918920614366e-06,
+ "loss": 0.4431,
+ "step": 2202
+ },
+ {
+ "epoch": 1.0416075650118204,
+ "grad_norm": 3.1271166801452637,
+ "learning_rate": 4.663606445591407e-06,
+ "loss": 0.5398,
+ "step": 2203
+ },
+ {
+ "epoch": 1.042080378250591,
+ "grad_norm": 2.573680877685547,
+ "learning_rate": 4.663293835850162e-06,
+ "loss": 0.4713,
+ "step": 2204
+ },
+ {
+ "epoch": 1.0425531914893618,
+ "grad_norm": 2.5230324268341064,
+ "learning_rate": 4.662981091410096e-06,
+ "loss": 0.5571,
+ "step": 2205
+ },
+ {
+ "epoch": 1.0430260047281323,
+ "grad_norm": 2.552182912826538,
+ "learning_rate": 4.662668212290681e-06,
+ "loss": 0.5173,
+ "step": 2206
+ },
+ {
+ "epoch": 1.0434988179669031,
+ "grad_norm": 2.832345724105835,
+ "learning_rate": 4.6623551985113995e-06,
+ "loss": 0.525,
+ "step": 2207
+ },
+ {
+ "epoch": 1.0439716312056737,
+ "grad_norm": 2.9729080200195312,
+ "learning_rate": 4.6620420500917416e-06,
+ "loss": 0.6308,
+ "step": 2208
+ },
+ {
+ "epoch": 1.0444444444444445,
+ "grad_norm": 2.618187665939331,
+ "learning_rate": 4.661728767051206e-06,
+ "loss": 0.4942,
+ "step": 2209
+ },
+ {
+ "epoch": 1.044917257683215,
+ "grad_norm": 2.515566349029541,
+ "learning_rate": 4.661415349409299e-06,
+ "loss": 0.5229,
+ "step": 2210
+ },
+ {
+ "epoch": 1.0453900709219859,
+ "grad_norm": 2.8651459217071533,
+ "learning_rate": 4.6611017971855356e-06,
+ "loss": 0.5029,
+ "step": 2211
+ },
+ {
+ "epoch": 1.0458628841607565,
+ "grad_norm": 2.502405881881714,
+ "learning_rate": 4.660788110399439e-06,
+ "loss": 0.4732,
+ "step": 2212
+ },
+ {
+ "epoch": 1.0463356973995273,
+ "grad_norm": 2.540668249130249,
+ "learning_rate": 4.660474289070541e-06,
+ "loss": 0.547,
+ "step": 2213
+ },
+ {
+ "epoch": 1.0468085106382978,
+ "grad_norm": 2.803469181060791,
+ "learning_rate": 4.660160333218384e-06,
+ "loss": 0.5441,
+ "step": 2214
+ },
+ {
+ "epoch": 1.0472813238770686,
+ "grad_norm": 3.233325481414795,
+ "learning_rate": 4.659846242862514e-06,
+ "loss": 0.4457,
+ "step": 2215
+ },
+ {
+ "epoch": 1.0477541371158392,
+ "grad_norm": 2.549548387527466,
+ "learning_rate": 4.659532018022489e-06,
+ "loss": 0.5684,
+ "step": 2216
+ },
+ {
+ "epoch": 1.04822695035461,
+ "grad_norm": 2.6112852096557617,
+ "learning_rate": 4.659217658717875e-06,
+ "loss": 0.5323,
+ "step": 2217
+ },
+ {
+ "epoch": 1.0486997635933806,
+ "grad_norm": 2.347418785095215,
+ "learning_rate": 4.658903164968245e-06,
+ "loss": 0.5349,
+ "step": 2218
+ },
+ {
+ "epoch": 1.0491725768321514,
+ "grad_norm": 2.695502281188965,
+ "learning_rate": 4.658588536793182e-06,
+ "loss": 0.4883,
+ "step": 2219
+ },
+ {
+ "epoch": 1.049645390070922,
+ "grad_norm": 2.7575674057006836,
+ "learning_rate": 4.658273774212275e-06,
+ "loss": 0.5517,
+ "step": 2220
+ },
+ {
+ "epoch": 1.0501182033096927,
+ "grad_norm": 2.787855386734009,
+ "learning_rate": 4.6579588772451245e-06,
+ "loss": 0.5744,
+ "step": 2221
+ },
+ {
+ "epoch": 1.0505910165484633,
+ "grad_norm": 3.0699398517608643,
+ "learning_rate": 4.657643845911337e-06,
+ "loss": 0.5258,
+ "step": 2222
+ },
+ {
+ "epoch": 1.0510638297872341,
+ "grad_norm": 2.652040719985962,
+ "learning_rate": 4.657328680230527e-06,
+ "loss": 0.5141,
+ "step": 2223
+ },
+ {
+ "epoch": 1.0515366430260047,
+ "grad_norm": 2.6896369457244873,
+ "learning_rate": 4.657013380222322e-06,
+ "loss": 0.5139,
+ "step": 2224
+ },
+ {
+ "epoch": 1.0520094562647755,
+ "grad_norm": 2.551839590072632,
+ "learning_rate": 4.65669794590635e-06,
+ "loss": 0.5099,
+ "step": 2225
+ },
+ {
+ "epoch": 1.052482269503546,
+ "grad_norm": 2.8543262481689453,
+ "learning_rate": 4.656382377302255e-06,
+ "loss": 0.6085,
+ "step": 2226
+ },
+ {
+ "epoch": 1.0529550827423169,
+ "grad_norm": 2.871469259262085,
+ "learning_rate": 4.656066674429685e-06,
+ "loss": 0.6108,
+ "step": 2227
+ },
+ {
+ "epoch": 1.0534278959810874,
+ "grad_norm": 2.4840824604034424,
+ "learning_rate": 4.655750837308296e-06,
+ "loss": 0.4994,
+ "step": 2228
+ },
+ {
+ "epoch": 1.0539007092198582,
+ "grad_norm": 2.5203280448913574,
+ "learning_rate": 4.6554348659577555e-06,
+ "loss": 0.4928,
+ "step": 2229
+ },
+ {
+ "epoch": 1.0543735224586288,
+ "grad_norm": 2.9327683448791504,
+ "learning_rate": 4.655118760397737e-06,
+ "loss": 0.6324,
+ "step": 2230
+ },
+ {
+ "epoch": 1.0548463356973996,
+ "grad_norm": 2.6766855716705322,
+ "learning_rate": 4.654802520647924e-06,
+ "loss": 0.5178,
+ "step": 2231
+ },
+ {
+ "epoch": 1.0553191489361702,
+ "grad_norm": 2.8438873291015625,
+ "learning_rate": 4.654486146728006e-06,
+ "loss": 0.509,
+ "step": 2232
+ },
+ {
+ "epoch": 1.055791962174941,
+ "grad_norm": 2.538661241531372,
+ "learning_rate": 4.6541696386576826e-06,
+ "loss": 0.5463,
+ "step": 2233
+ },
+ {
+ "epoch": 1.0562647754137116,
+ "grad_norm": 2.829030990600586,
+ "learning_rate": 4.653852996456662e-06,
+ "loss": 0.5404,
+ "step": 2234
+ },
+ {
+ "epoch": 1.0567375886524824,
+ "grad_norm": 2.5657269954681396,
+ "learning_rate": 4.653536220144659e-06,
+ "loss": 0.5479,
+ "step": 2235
+ },
+ {
+ "epoch": 1.057210401891253,
+ "grad_norm": 2.6641297340393066,
+ "learning_rate": 4.653219309741399e-06,
+ "loss": 0.5503,
+ "step": 2236
+ },
+ {
+ "epoch": 1.0576832151300237,
+ "grad_norm": 2.966350555419922,
+ "learning_rate": 4.652902265266615e-06,
+ "loss": 0.6404,
+ "step": 2237
+ },
+ {
+ "epoch": 1.0581560283687943,
+ "grad_norm": 2.462430000305176,
+ "learning_rate": 4.6525850867400455e-06,
+ "loss": 0.4885,
+ "step": 2238
+ },
+ {
+ "epoch": 1.058628841607565,
+ "grad_norm": 2.1791880130767822,
+ "learning_rate": 4.652267774181443e-06,
+ "loss": 0.4405,
+ "step": 2239
+ },
+ {
+ "epoch": 1.0591016548463357,
+ "grad_norm": 2.5473732948303223,
+ "learning_rate": 4.651950327610563e-06,
+ "loss": 0.5295,
+ "step": 2240
+ },
+ {
+ "epoch": 1.0595744680851065,
+ "grad_norm": 2.70904803276062,
+ "learning_rate": 4.651632747047172e-06,
+ "loss": 0.5169,
+ "step": 2241
+ },
+ {
+ "epoch": 1.060047281323877,
+ "grad_norm": 3.8442928791046143,
+ "learning_rate": 4.651315032511045e-06,
+ "loss": 0.5473,
+ "step": 2242
+ },
+ {
+ "epoch": 1.0605200945626478,
+ "grad_norm": 2.8613383769989014,
+ "learning_rate": 4.650997184021963e-06,
+ "loss": 0.5445,
+ "step": 2243
+ },
+ {
+ "epoch": 1.0609929078014184,
+ "grad_norm": 2.5995829105377197,
+ "learning_rate": 4.6506792015997184e-06,
+ "loss": 0.5525,
+ "step": 2244
+ },
+ {
+ "epoch": 1.0614657210401892,
+ "grad_norm": 2.5465996265411377,
+ "learning_rate": 4.650361085264111e-06,
+ "loss": 0.5093,
+ "step": 2245
+ },
+ {
+ "epoch": 1.0619385342789598,
+ "grad_norm": 2.46553111076355,
+ "learning_rate": 4.650042835034948e-06,
+ "loss": 0.5375,
+ "step": 2246
+ },
+ {
+ "epoch": 1.0624113475177306,
+ "grad_norm": 2.6907830238342285,
+ "learning_rate": 4.649724450932045e-06,
+ "loss": 0.572,
+ "step": 2247
+ },
+ {
+ "epoch": 1.0628841607565012,
+ "grad_norm": 3.0671346187591553,
+ "learning_rate": 4.649405932975226e-06,
+ "loss": 0.4974,
+ "step": 2248
+ },
+ {
+ "epoch": 1.063356973995272,
+ "grad_norm": 2.5392491817474365,
+ "learning_rate": 4.649087281184325e-06,
+ "loss": 0.524,
+ "step": 2249
+ },
+ {
+ "epoch": 1.0638297872340425,
+ "grad_norm": 2.7498562335968018,
+ "learning_rate": 4.648768495579183e-06,
+ "loss": 0.5801,
+ "step": 2250
+ },
+ {
+ "epoch": 1.0643026004728133,
+ "grad_norm": 2.8536248207092285,
+ "learning_rate": 4.648449576179649e-06,
+ "loss": 0.5384,
+ "step": 2251
+ },
+ {
+ "epoch": 1.064775413711584,
+ "grad_norm": 2.7062792778015137,
+ "learning_rate": 4.64813052300558e-06,
+ "loss": 0.5262,
+ "step": 2252
+ },
+ {
+ "epoch": 1.0652482269503547,
+ "grad_norm": 2.798650026321411,
+ "learning_rate": 4.647811336076841e-06,
+ "loss": 0.5719,
+ "step": 2253
+ },
+ {
+ "epoch": 1.0657210401891253,
+ "grad_norm": 2.9793951511383057,
+ "learning_rate": 4.647492015413311e-06,
+ "loss": 0.5377,
+ "step": 2254
+ },
+ {
+ "epoch": 1.066193853427896,
+ "grad_norm": 2.572129011154175,
+ "learning_rate": 4.647172561034868e-06,
+ "loss": 0.4791,
+ "step": 2255
+ },
+ {
+ "epoch": 1.0666666666666667,
+ "grad_norm": 3.7490930557250977,
+ "learning_rate": 4.646852972961405e-06,
+ "loss": 0.5423,
+ "step": 2256
+ },
+ {
+ "epoch": 1.0671394799054374,
+ "grad_norm": 2.626255750656128,
+ "learning_rate": 4.646533251212821e-06,
+ "loss": 0.5558,
+ "step": 2257
+ },
+ {
+ "epoch": 1.067612293144208,
+ "grad_norm": 2.8408126831054688,
+ "learning_rate": 4.646213395809023e-06,
+ "loss": 0.55,
+ "step": 2258
+ },
+ {
+ "epoch": 1.0680851063829788,
+ "grad_norm": 3.255606174468994,
+ "learning_rate": 4.645893406769929e-06,
+ "loss": 0.547,
+ "step": 2259
+ },
+ {
+ "epoch": 1.0685579196217494,
+ "grad_norm": 2.4352102279663086,
+ "learning_rate": 4.645573284115461e-06,
+ "loss": 0.4898,
+ "step": 2260
+ },
+ {
+ "epoch": 1.0690307328605202,
+ "grad_norm": 2.408634662628174,
+ "learning_rate": 4.6452530278655535e-06,
+ "loss": 0.5264,
+ "step": 2261
+ },
+ {
+ "epoch": 1.0695035460992908,
+ "grad_norm": 2.4220449924468994,
+ "learning_rate": 4.644932638040146e-06,
+ "loss": 0.5166,
+ "step": 2262
+ },
+ {
+ "epoch": 1.0699763593380616,
+ "grad_norm": 2.9188082218170166,
+ "learning_rate": 4.644612114659188e-06,
+ "loss": 0.5611,
+ "step": 2263
+ },
+ {
+ "epoch": 1.0704491725768321,
+ "grad_norm": 2.906557083129883,
+ "learning_rate": 4.644291457742638e-06,
+ "loss": 0.5515,
+ "step": 2264
+ },
+ {
+ "epoch": 1.070921985815603,
+ "grad_norm": 2.9039015769958496,
+ "learning_rate": 4.643970667310462e-06,
+ "loss": 0.5732,
+ "step": 2265
+ },
+ {
+ "epoch": 1.0713947990543735,
+ "grad_norm": 2.9985480308532715,
+ "learning_rate": 4.643649743382632e-06,
+ "loss": 0.563,
+ "step": 2266
+ },
+ {
+ "epoch": 1.0718676122931443,
+ "grad_norm": 2.5780906677246094,
+ "learning_rate": 4.6433286859791335e-06,
+ "loss": 0.502,
+ "step": 2267
+ },
+ {
+ "epoch": 1.0723404255319149,
+ "grad_norm": 2.590209722518921,
+ "learning_rate": 4.643007495119955e-06,
+ "loss": 0.4995,
+ "step": 2268
+ },
+ {
+ "epoch": 1.0728132387706855,
+ "grad_norm": 2.378894805908203,
+ "learning_rate": 4.642686170825097e-06,
+ "loss": 0.4886,
+ "step": 2269
+ },
+ {
+ "epoch": 1.0732860520094563,
+ "grad_norm": 2.6826229095458984,
+ "learning_rate": 4.642364713114567e-06,
+ "loss": 0.465,
+ "step": 2270
+ },
+ {
+ "epoch": 1.073758865248227,
+ "grad_norm": 2.627819538116455,
+ "learning_rate": 4.64204312200838e-06,
+ "loss": 0.4954,
+ "step": 2271
+ },
+ {
+ "epoch": 1.0742316784869976,
+ "grad_norm": 2.993021249771118,
+ "learning_rate": 4.641721397526561e-06,
+ "loss": 0.5073,
+ "step": 2272
+ },
+ {
+ "epoch": 1.0747044917257682,
+ "grad_norm": 2.719052791595459,
+ "learning_rate": 4.64139953968914e-06,
+ "loss": 0.538,
+ "step": 2273
+ },
+ {
+ "epoch": 1.075177304964539,
+ "grad_norm": 2.729252576828003,
+ "learning_rate": 4.6410775485161605e-06,
+ "loss": 0.552,
+ "step": 2274
+ },
+ {
+ "epoch": 1.0756501182033098,
+ "grad_norm": 2.924142599105835,
+ "learning_rate": 4.640755424027671e-06,
+ "loss": 0.522,
+ "step": 2275
+ },
+ {
+ "epoch": 1.0761229314420804,
+ "grad_norm": 3.329162120819092,
+ "learning_rate": 4.640433166243728e-06,
+ "loss": 0.5965,
+ "step": 2276
+ },
+ {
+ "epoch": 1.076595744680851,
+ "grad_norm": 2.9810245037078857,
+ "learning_rate": 4.640110775184396e-06,
+ "loss": 0.5653,
+ "step": 2277
+ },
+ {
+ "epoch": 1.0770685579196217,
+ "grad_norm": 2.61772084236145,
+ "learning_rate": 4.639788250869751e-06,
+ "loss": 0.5382,
+ "step": 2278
+ },
+ {
+ "epoch": 1.0775413711583925,
+ "grad_norm": 2.741225004196167,
+ "learning_rate": 4.639465593319874e-06,
+ "loss": 0.4866,
+ "step": 2279
+ },
+ {
+ "epoch": 1.0780141843971631,
+ "grad_norm": 2.7945218086242676,
+ "learning_rate": 4.639142802554856e-06,
+ "loss": 0.4711,
+ "step": 2280
+ },
+ {
+ "epoch": 1.0784869976359337,
+ "grad_norm": 2.4282329082489014,
+ "learning_rate": 4.638819878594795e-06,
+ "loss": 0.4911,
+ "step": 2281
+ },
+ {
+ "epoch": 1.0789598108747045,
+ "grad_norm": 2.551741361618042,
+ "learning_rate": 4.638496821459799e-06,
+ "loss": 0.453,
+ "step": 2282
+ },
+ {
+ "epoch": 1.0794326241134753,
+ "grad_norm": 2.5622754096984863,
+ "learning_rate": 4.638173631169983e-06,
+ "loss": 0.5983,
+ "step": 2283
+ },
+ {
+ "epoch": 1.0799054373522459,
+ "grad_norm": 2.7748284339904785,
+ "learning_rate": 4.6378503077454715e-06,
+ "loss": 0.5143,
+ "step": 2284
+ },
+ {
+ "epoch": 1.0803782505910164,
+ "grad_norm": 2.7693238258361816,
+ "learning_rate": 4.637526851206394e-06,
+ "loss": 0.5929,
+ "step": 2285
+ },
+ {
+ "epoch": 1.0808510638297872,
+ "grad_norm": 2.705548048019409,
+ "learning_rate": 4.637203261572893e-06,
+ "loss": 0.5577,
+ "step": 2286
+ },
+ {
+ "epoch": 1.081323877068558,
+ "grad_norm": 2.739307165145874,
+ "learning_rate": 4.636879538865117e-06,
+ "loss": 0.5676,
+ "step": 2287
+ },
+ {
+ "epoch": 1.0817966903073286,
+ "grad_norm": 2.514059543609619,
+ "learning_rate": 4.636555683103221e-06,
+ "loss": 0.5001,
+ "step": 2288
+ },
+ {
+ "epoch": 1.0822695035460992,
+ "grad_norm": 2.7166874408721924,
+ "learning_rate": 4.636231694307372e-06,
+ "loss": 0.5411,
+ "step": 2289
+ },
+ {
+ "epoch": 1.08274231678487,
+ "grad_norm": 2.7661683559417725,
+ "learning_rate": 4.635907572497741e-06,
+ "loss": 0.6353,
+ "step": 2290
+ },
+ {
+ "epoch": 1.0832151300236406,
+ "grad_norm": 2.598381996154785,
+ "learning_rate": 4.635583317694512e-06,
+ "loss": 0.5213,
+ "step": 2291
+ },
+ {
+ "epoch": 1.0836879432624114,
+ "grad_norm": 2.821491003036499,
+ "learning_rate": 4.6352589299178744e-06,
+ "loss": 0.6172,
+ "step": 2292
+ },
+ {
+ "epoch": 1.084160756501182,
+ "grad_norm": 2.5422823429107666,
+ "learning_rate": 4.634934409188025e-06,
+ "loss": 0.5245,
+ "step": 2293
+ },
+ {
+ "epoch": 1.0846335697399527,
+ "grad_norm": 2.8264620304107666,
+ "learning_rate": 4.634609755525173e-06,
+ "loss": 0.5004,
+ "step": 2294
+ },
+ {
+ "epoch": 1.0851063829787233,
+ "grad_norm": 2.3286643028259277,
+ "learning_rate": 4.63428496894953e-06,
+ "loss": 0.4561,
+ "step": 2295
+ },
+ {
+ "epoch": 1.085579196217494,
+ "grad_norm": 2.462005376815796,
+ "learning_rate": 4.633960049481321e-06,
+ "loss": 0.4948,
+ "step": 2296
+ },
+ {
+ "epoch": 1.0860520094562647,
+ "grad_norm": 2.760258913040161,
+ "learning_rate": 4.633634997140777e-06,
+ "loss": 0.5407,
+ "step": 2297
+ },
+ {
+ "epoch": 1.0865248226950355,
+ "grad_norm": 3.0234217643737793,
+ "learning_rate": 4.633309811948138e-06,
+ "loss": 0.4914,
+ "step": 2298
+ },
+ {
+ "epoch": 1.086997635933806,
+ "grad_norm": 2.8380849361419678,
+ "learning_rate": 4.63298449392365e-06,
+ "loss": 0.5562,
+ "step": 2299
+ },
+ {
+ "epoch": 1.0874704491725768,
+ "grad_norm": 2.6201648712158203,
+ "learning_rate": 4.632659043087572e-06,
+ "loss": 0.5882,
+ "step": 2300
+ },
+ {
+ "epoch": 1.0879432624113474,
+ "grad_norm": 2.586339235305786,
+ "learning_rate": 4.632333459460165e-06,
+ "loss": 0.4991,
+ "step": 2301
+ },
+ {
+ "epoch": 1.0884160756501182,
+ "grad_norm": 2.500115394592285,
+ "learning_rate": 4.632007743061705e-06,
+ "loss": 0.552,
+ "step": 2302
+ },
+ {
+ "epoch": 1.0888888888888888,
+ "grad_norm": 2.816390037536621,
+ "learning_rate": 4.63168189391247e-06,
+ "loss": 0.5301,
+ "step": 2303
+ },
+ {
+ "epoch": 1.0893617021276596,
+ "grad_norm": 2.975400924682617,
+ "learning_rate": 4.631355912032753e-06,
+ "loss": 0.6056,
+ "step": 2304
+ },
+ {
+ "epoch": 1.0898345153664302,
+ "grad_norm": 2.747985363006592,
+ "learning_rate": 4.631029797442846e-06,
+ "loss": 0.5335,
+ "step": 2305
+ },
+ {
+ "epoch": 1.090307328605201,
+ "grad_norm": 2.609281539916992,
+ "learning_rate": 4.630703550163059e-06,
+ "loss": 0.5189,
+ "step": 2306
+ },
+ {
+ "epoch": 1.0907801418439715,
+ "grad_norm": 2.624131202697754,
+ "learning_rate": 4.630377170213705e-06,
+ "loss": 0.5646,
+ "step": 2307
+ },
+ {
+ "epoch": 1.0912529550827423,
+ "grad_norm": 2.6186959743499756,
+ "learning_rate": 4.630050657615107e-06,
+ "loss": 0.5187,
+ "step": 2308
+ },
+ {
+ "epoch": 1.091725768321513,
+ "grad_norm": 2.9961764812469482,
+ "learning_rate": 4.629724012387594e-06,
+ "loss": 0.6207,
+ "step": 2309
+ },
+ {
+ "epoch": 1.0921985815602837,
+ "grad_norm": 2.665799140930176,
+ "learning_rate": 4.629397234551505e-06,
+ "loss": 0.5046,
+ "step": 2310
+ },
+ {
+ "epoch": 1.0926713947990543,
+ "grad_norm": 2.6154725551605225,
+ "learning_rate": 4.629070324127187e-06,
+ "loss": 0.5553,
+ "step": 2311
+ },
+ {
+ "epoch": 1.093144208037825,
+ "grad_norm": 2.702967643737793,
+ "learning_rate": 4.628743281134996e-06,
+ "loss": 0.5159,
+ "step": 2312
+ },
+ {
+ "epoch": 1.0936170212765957,
+ "grad_norm": 2.578080177307129,
+ "learning_rate": 4.628416105595295e-06,
+ "loss": 0.4934,
+ "step": 2313
+ },
+ {
+ "epoch": 1.0940898345153665,
+ "grad_norm": 2.8763060569763184,
+ "learning_rate": 4.628088797528456e-06,
+ "loss": 0.5404,
+ "step": 2314
+ },
+ {
+ "epoch": 1.094562647754137,
+ "grad_norm": 2.5301198959350586,
+ "learning_rate": 4.6277613569548585e-06,
+ "loss": 0.524,
+ "step": 2315
+ },
+ {
+ "epoch": 1.0950354609929078,
+ "grad_norm": 2.559903144836426,
+ "learning_rate": 4.627433783894892e-06,
+ "loss": 0.5177,
+ "step": 2316
+ },
+ {
+ "epoch": 1.0955082742316784,
+ "grad_norm": 2.430863380432129,
+ "learning_rate": 4.627106078368952e-06,
+ "loss": 0.5368,
+ "step": 2317
+ },
+ {
+ "epoch": 1.0959810874704492,
+ "grad_norm": 2.687567949295044,
+ "learning_rate": 4.626778240397444e-06,
+ "loss": 0.5385,
+ "step": 2318
+ },
+ {
+ "epoch": 1.0964539007092198,
+ "grad_norm": 3.053466558456421,
+ "learning_rate": 4.62645027000078e-06,
+ "loss": 0.5814,
+ "step": 2319
+ },
+ {
+ "epoch": 1.0969267139479906,
+ "grad_norm": 2.4612979888916016,
+ "learning_rate": 4.6261221671993815e-06,
+ "loss": 0.5069,
+ "step": 2320
+ },
+ {
+ "epoch": 1.0973995271867611,
+ "grad_norm": 2.6153628826141357,
+ "learning_rate": 4.625793932013679e-06,
+ "loss": 0.5422,
+ "step": 2321
+ },
+ {
+ "epoch": 1.097872340425532,
+ "grad_norm": 2.8918874263763428,
+ "learning_rate": 4.62546556446411e-06,
+ "loss": 0.5326,
+ "step": 2322
+ },
+ {
+ "epoch": 1.0983451536643025,
+ "grad_norm": 3.62565279006958,
+ "learning_rate": 4.625137064571119e-06,
+ "loss": 0.5164,
+ "step": 2323
+ },
+ {
+ "epoch": 1.0988179669030733,
+ "grad_norm": 2.4285085201263428,
+ "learning_rate": 4.624808432355164e-06,
+ "loss": 0.5084,
+ "step": 2324
+ },
+ {
+ "epoch": 1.099290780141844,
+ "grad_norm": 2.593979835510254,
+ "learning_rate": 4.624479667836702e-06,
+ "loss": 0.4986,
+ "step": 2325
+ },
+ {
+ "epoch": 1.0997635933806147,
+ "grad_norm": 2.490752935409546,
+ "learning_rate": 4.624150771036208e-06,
+ "loss": 0.5296,
+ "step": 2326
+ },
+ {
+ "epoch": 1.1002364066193853,
+ "grad_norm": 2.67694091796875,
+ "learning_rate": 4.6238217419741595e-06,
+ "loss": 0.5229,
+ "step": 2327
+ },
+ {
+ "epoch": 1.100709219858156,
+ "grad_norm": 2.594147205352783,
+ "learning_rate": 4.623492580671044e-06,
+ "loss": 0.4916,
+ "step": 2328
+ },
+ {
+ "epoch": 1.1011820330969266,
+ "grad_norm": 2.943472385406494,
+ "learning_rate": 4.623163287147356e-06,
+ "loss": 0.5591,
+ "step": 2329
+ },
+ {
+ "epoch": 1.1016548463356974,
+ "grad_norm": 2.569410562515259,
+ "learning_rate": 4.622833861423601e-06,
+ "loss": 0.4648,
+ "step": 2330
+ },
+ {
+ "epoch": 1.102127659574468,
+ "grad_norm": 2.5490405559539795,
+ "learning_rate": 4.6225043035202886e-06,
+ "loss": 0.5493,
+ "step": 2331
+ },
+ {
+ "epoch": 1.1026004728132388,
+ "grad_norm": 2.5964598655700684,
+ "learning_rate": 4.622174613457941e-06,
+ "loss": 0.5358,
+ "step": 2332
+ },
+ {
+ "epoch": 1.1030732860520094,
+ "grad_norm": 2.6456820964813232,
+ "learning_rate": 4.621844791257085e-06,
+ "loss": 0.5864,
+ "step": 2333
+ },
+ {
+ "epoch": 1.1035460992907802,
+ "grad_norm": 2.861180067062378,
+ "learning_rate": 4.621514836938259e-06,
+ "loss": 0.6064,
+ "step": 2334
+ },
+ {
+ "epoch": 1.1040189125295508,
+ "grad_norm": 2.8199548721313477,
+ "learning_rate": 4.621184750522005e-06,
+ "loss": 0.5244,
+ "step": 2335
+ },
+ {
+ "epoch": 1.1044917257683216,
+ "grad_norm": 2.7398853302001953,
+ "learning_rate": 4.6208545320288795e-06,
+ "loss": 0.5496,
+ "step": 2336
+ },
+ {
+ "epoch": 1.1049645390070921,
+ "grad_norm": 2.7941031455993652,
+ "learning_rate": 4.620524181479441e-06,
+ "loss": 0.5496,
+ "step": 2337
+ },
+ {
+ "epoch": 1.105437352245863,
+ "grad_norm": 2.973785161972046,
+ "learning_rate": 4.620193698894259e-06,
+ "loss": 0.5492,
+ "step": 2338
+ },
+ {
+ "epoch": 1.1059101654846335,
+ "grad_norm": 2.650355815887451,
+ "learning_rate": 4.6198630842939144e-06,
+ "loss": 0.5392,
+ "step": 2339
+ },
+ {
+ "epoch": 1.1063829787234043,
+ "grad_norm": 2.9092214107513428,
+ "learning_rate": 4.61953233769899e-06,
+ "loss": 0.5305,
+ "step": 2340
+ },
+ {
+ "epoch": 1.1068557919621749,
+ "grad_norm": 2.6329731941223145,
+ "learning_rate": 4.61920145913008e-06,
+ "loss": 0.5031,
+ "step": 2341
+ },
+ {
+ "epoch": 1.1073286052009457,
+ "grad_norm": 2.7214207649230957,
+ "learning_rate": 4.618870448607788e-06,
+ "loss": 0.5536,
+ "step": 2342
+ },
+ {
+ "epoch": 1.1078014184397162,
+ "grad_norm": 2.873119592666626,
+ "learning_rate": 4.618539306152724e-06,
+ "loss": 0.4531,
+ "step": 2343
+ },
+ {
+ "epoch": 1.108274231678487,
+ "grad_norm": 2.701042413711548,
+ "learning_rate": 4.618208031785507e-06,
+ "loss": 0.5217,
+ "step": 2344
+ },
+ {
+ "epoch": 1.1087470449172576,
+ "grad_norm": 2.7189881801605225,
+ "learning_rate": 4.6178766255267635e-06,
+ "loss": 0.6205,
+ "step": 2345
+ },
+ {
+ "epoch": 1.1092198581560284,
+ "grad_norm": 2.546382188796997,
+ "learning_rate": 4.61754508739713e-06,
+ "loss": 0.5475,
+ "step": 2346
+ },
+ {
+ "epoch": 1.109692671394799,
+ "grad_norm": 2.8429276943206787,
+ "learning_rate": 4.617213417417249e-06,
+ "loss": 0.4809,
+ "step": 2347
+ },
+ {
+ "epoch": 1.1101654846335698,
+ "grad_norm": 2.9515812397003174,
+ "learning_rate": 4.616881615607772e-06,
+ "loss": 0.5067,
+ "step": 2348
+ },
+ {
+ "epoch": 1.1106382978723404,
+ "grad_norm": 2.5910723209381104,
+ "learning_rate": 4.616549681989358e-06,
+ "loss": 0.5368,
+ "step": 2349
+ },
+ {
+ "epoch": 1.1111111111111112,
+ "grad_norm": 2.80855655670166,
+ "learning_rate": 4.616217616582678e-06,
+ "loss": 0.5827,
+ "step": 2350
+ },
+ {
+ "epoch": 1.1115839243498817,
+ "grad_norm": 2.604383945465088,
+ "learning_rate": 4.6158854194084044e-06,
+ "loss": 0.5716,
+ "step": 2351
+ },
+ {
+ "epoch": 1.1120567375886525,
+ "grad_norm": 3.0585904121398926,
+ "learning_rate": 4.6155530904872246e-06,
+ "loss": 0.4998,
+ "step": 2352
+ },
+ {
+ "epoch": 1.112529550827423,
+ "grad_norm": 2.660961627960205,
+ "learning_rate": 4.61522062983983e-06,
+ "loss": 0.4533,
+ "step": 2353
+ },
+ {
+ "epoch": 1.113002364066194,
+ "grad_norm": 2.8042070865631104,
+ "learning_rate": 4.614888037486923e-06,
+ "loss": 0.5592,
+ "step": 2354
+ },
+ {
+ "epoch": 1.1134751773049645,
+ "grad_norm": 2.681664228439331,
+ "learning_rate": 4.61455531344921e-06,
+ "loss": 0.5439,
+ "step": 2355
+ },
+ {
+ "epoch": 1.1139479905437353,
+ "grad_norm": 2.905054807662964,
+ "learning_rate": 4.61422245774741e-06,
+ "loss": 0.5497,
+ "step": 2356
+ },
+ {
+ "epoch": 1.1144208037825059,
+ "grad_norm": 2.7979753017425537,
+ "learning_rate": 4.6138894704022484e-06,
+ "loss": 0.5374,
+ "step": 2357
+ },
+ {
+ "epoch": 1.1148936170212767,
+ "grad_norm": 2.965611696243286,
+ "learning_rate": 4.613556351434458e-06,
+ "loss": 0.5145,
+ "step": 2358
+ },
+ {
+ "epoch": 1.1153664302600472,
+ "grad_norm": 2.583134889602661,
+ "learning_rate": 4.613223100864782e-06,
+ "loss": 0.535,
+ "step": 2359
+ },
+ {
+ "epoch": 1.115839243498818,
+ "grad_norm": 2.5979621410369873,
+ "learning_rate": 4.61288971871397e-06,
+ "loss": 0.5514,
+ "step": 2360
+ },
+ {
+ "epoch": 1.1163120567375886,
+ "grad_norm": 3.0117669105529785,
+ "learning_rate": 4.612556205002779e-06,
+ "loss": 0.5266,
+ "step": 2361
+ },
+ {
+ "epoch": 1.1167848699763594,
+ "grad_norm": 2.425133466720581,
+ "learning_rate": 4.612222559751976e-06,
+ "loss": 0.4838,
+ "step": 2362
+ },
+ {
+ "epoch": 1.11725768321513,
+ "grad_norm": 2.5102691650390625,
+ "learning_rate": 4.611888782982337e-06,
+ "loss": 0.3947,
+ "step": 2363
+ },
+ {
+ "epoch": 1.1177304964539008,
+ "grad_norm": 3.0327367782592773,
+ "learning_rate": 4.611554874714645e-06,
+ "loss": 0.5753,
+ "step": 2364
+ },
+ {
+ "epoch": 1.1182033096926713,
+ "grad_norm": 2.4561009407043457,
+ "learning_rate": 4.6112208349696875e-06,
+ "loss": 0.5054,
+ "step": 2365
+ },
+ {
+ "epoch": 1.1186761229314421,
+ "grad_norm": 3.3898050785064697,
+ "learning_rate": 4.610886663768267e-06,
+ "loss": 0.5946,
+ "step": 2366
+ },
+ {
+ "epoch": 1.1191489361702127,
+ "grad_norm": 2.8112242221832275,
+ "learning_rate": 4.61055236113119e-06,
+ "loss": 0.5475,
+ "step": 2367
+ },
+ {
+ "epoch": 1.1196217494089835,
+ "grad_norm": 3.152946710586548,
+ "learning_rate": 4.610217927079272e-06,
+ "loss": 0.5165,
+ "step": 2368
+ },
+ {
+ "epoch": 1.120094562647754,
+ "grad_norm": 2.7847867012023926,
+ "learning_rate": 4.609883361633336e-06,
+ "loss": 0.5533,
+ "step": 2369
+ },
+ {
+ "epoch": 1.1205673758865249,
+ "grad_norm": 2.6376686096191406,
+ "learning_rate": 4.6095486648142155e-06,
+ "loss": 0.4942,
+ "step": 2370
+ },
+ {
+ "epoch": 1.1210401891252955,
+ "grad_norm": 3.123072862625122,
+ "learning_rate": 4.609213836642749e-06,
+ "loss": 0.616,
+ "step": 2371
+ },
+ {
+ "epoch": 1.1215130023640663,
+ "grad_norm": 2.802694320678711,
+ "learning_rate": 4.608878877139786e-06,
+ "loss": 0.5323,
+ "step": 2372
+ },
+ {
+ "epoch": 1.1219858156028368,
+ "grad_norm": 2.3567938804626465,
+ "learning_rate": 4.6085437863261825e-06,
+ "loss": 0.4822,
+ "step": 2373
+ },
+ {
+ "epoch": 1.1224586288416076,
+ "grad_norm": 2.553112030029297,
+ "learning_rate": 4.608208564222804e-06,
+ "loss": 0.5447,
+ "step": 2374
+ },
+ {
+ "epoch": 1.1229314420803782,
+ "grad_norm": 3.0020132064819336,
+ "learning_rate": 4.607873210850521e-06,
+ "loss": 0.6486,
+ "step": 2375
+ },
+ {
+ "epoch": 1.123404255319149,
+ "grad_norm": 2.832442045211792,
+ "learning_rate": 4.607537726230216e-06,
+ "loss": 0.5257,
+ "step": 2376
+ },
+ {
+ "epoch": 1.1238770685579196,
+ "grad_norm": 2.471527099609375,
+ "learning_rate": 4.607202110382778e-06,
+ "loss": 0.4816,
+ "step": 2377
+ },
+ {
+ "epoch": 1.1243498817966904,
+ "grad_norm": 2.4232118129730225,
+ "learning_rate": 4.606866363329105e-06,
+ "loss": 0.5533,
+ "step": 2378
+ },
+ {
+ "epoch": 1.124822695035461,
+ "grad_norm": 2.477506637573242,
+ "learning_rate": 4.6065304850901025e-06,
+ "loss": 0.5223,
+ "step": 2379
+ },
+ {
+ "epoch": 1.1252955082742317,
+ "grad_norm": 3.54127836227417,
+ "learning_rate": 4.6061944756866824e-06,
+ "loss": 0.6514,
+ "step": 2380
+ },
+ {
+ "epoch": 1.1257683215130023,
+ "grad_norm": 2.5148677825927734,
+ "learning_rate": 4.605858335139768e-06,
+ "loss": 0.4864,
+ "step": 2381
+ },
+ {
+ "epoch": 1.1262411347517731,
+ "grad_norm": 2.8363659381866455,
+ "learning_rate": 4.605522063470289e-06,
+ "loss": 0.5034,
+ "step": 2382
+ },
+ {
+ "epoch": 1.1267139479905437,
+ "grad_norm": 2.4996654987335205,
+ "learning_rate": 4.605185660699184e-06,
+ "loss": 0.4126,
+ "step": 2383
+ },
+ {
+ "epoch": 1.1271867612293145,
+ "grad_norm": 2.352543830871582,
+ "learning_rate": 4.604849126847398e-06,
+ "loss": 0.5224,
+ "step": 2384
+ },
+ {
+ "epoch": 1.127659574468085,
+ "grad_norm": 2.60101056098938,
+ "learning_rate": 4.6045124619358875e-06,
+ "loss": 0.4867,
+ "step": 2385
+ },
+ {
+ "epoch": 1.1281323877068559,
+ "grad_norm": 2.9471068382263184,
+ "learning_rate": 4.604175665985613e-06,
+ "loss": 0.6474,
+ "step": 2386
+ },
+ {
+ "epoch": 1.1286052009456264,
+ "grad_norm": 2.5933351516723633,
+ "learning_rate": 4.603838739017546e-06,
+ "loss": 0.5081,
+ "step": 2387
+ },
+ {
+ "epoch": 1.1290780141843972,
+ "grad_norm": 2.3740346431732178,
+ "learning_rate": 4.6035016810526665e-06,
+ "loss": 0.4438,
+ "step": 2388
+ },
+ {
+ "epoch": 1.1295508274231678,
+ "grad_norm": 2.675020217895508,
+ "learning_rate": 4.6031644921119614e-06,
+ "loss": 0.4968,
+ "step": 2389
+ },
+ {
+ "epoch": 1.1300236406619386,
+ "grad_norm": 2.599472999572754,
+ "learning_rate": 4.602827172216424e-06,
+ "loss": 0.5131,
+ "step": 2390
+ },
+ {
+ "epoch": 1.1304964539007092,
+ "grad_norm": 2.8176097869873047,
+ "learning_rate": 4.602489721387061e-06,
+ "loss": 0.5549,
+ "step": 2391
+ },
+ {
+ "epoch": 1.13096926713948,
+ "grad_norm": 2.466914176940918,
+ "learning_rate": 4.602152139644881e-06,
+ "loss": 0.5052,
+ "step": 2392
+ },
+ {
+ "epoch": 1.1314420803782506,
+ "grad_norm": 2.8938796520233154,
+ "learning_rate": 4.601814427010905e-06,
+ "loss": 0.6181,
+ "step": 2393
+ },
+ {
+ "epoch": 1.1319148936170214,
+ "grad_norm": 2.7390825748443604,
+ "learning_rate": 4.601476583506161e-06,
+ "loss": 0.5178,
+ "step": 2394
+ },
+ {
+ "epoch": 1.132387706855792,
+ "grad_norm": 3.180112838745117,
+ "learning_rate": 4.601138609151685e-06,
+ "loss": 0.6071,
+ "step": 2395
+ },
+ {
+ "epoch": 1.1328605200945627,
+ "grad_norm": 2.9282350540161133,
+ "learning_rate": 4.600800503968521e-06,
+ "loss": 0.5557,
+ "step": 2396
+ },
+ {
+ "epoch": 1.1333333333333333,
+ "grad_norm": 2.6689717769622803,
+ "learning_rate": 4.6004622679777215e-06,
+ "loss": 0.4679,
+ "step": 2397
+ },
+ {
+ "epoch": 1.133806146572104,
+ "grad_norm": 2.651582956314087,
+ "learning_rate": 4.600123901200347e-06,
+ "loss": 0.4907,
+ "step": 2398
+ },
+ {
+ "epoch": 1.1342789598108747,
+ "grad_norm": 2.5702924728393555,
+ "learning_rate": 4.599785403657464e-06,
+ "loss": 0.4919,
+ "step": 2399
+ },
+ {
+ "epoch": 1.1347517730496455,
+ "grad_norm": 2.636812448501587,
+ "learning_rate": 4.599446775370153e-06,
+ "loss": 0.5091,
+ "step": 2400
+ },
+ {
+ "epoch": 1.135224586288416,
+ "grad_norm": 2.5965442657470703,
+ "learning_rate": 4.599108016359497e-06,
+ "loss": 0.5035,
+ "step": 2401
+ },
+ {
+ "epoch": 1.1356973995271868,
+ "grad_norm": 2.689732313156128,
+ "learning_rate": 4.5987691266465885e-06,
+ "loss": 0.5307,
+ "step": 2402
+ },
+ {
+ "epoch": 1.1361702127659574,
+ "grad_norm": 2.7256956100463867,
+ "learning_rate": 4.59843010625253e-06,
+ "loss": 0.5066,
+ "step": 2403
+ },
+ {
+ "epoch": 1.1366430260047282,
+ "grad_norm": 2.726020574569702,
+ "learning_rate": 4.59809095519843e-06,
+ "loss": 0.4805,
+ "step": 2404
+ },
+ {
+ "epoch": 1.1371158392434988,
+ "grad_norm": 2.703339099884033,
+ "learning_rate": 4.597751673505406e-06,
+ "loss": 0.4992,
+ "step": 2405
+ },
+ {
+ "epoch": 1.1375886524822696,
+ "grad_norm": 2.54455304145813,
+ "learning_rate": 4.5974122611945835e-06,
+ "loss": 0.5251,
+ "step": 2406
+ },
+ {
+ "epoch": 1.1380614657210402,
+ "grad_norm": 2.623507022857666,
+ "learning_rate": 4.597072718287096e-06,
+ "loss": 0.4831,
+ "step": 2407
+ },
+ {
+ "epoch": 1.138534278959811,
+ "grad_norm": 2.653590202331543,
+ "learning_rate": 4.596733044804086e-06,
+ "loss": 0.5646,
+ "step": 2408
+ },
+ {
+ "epoch": 1.1390070921985815,
+ "grad_norm": 2.8230600357055664,
+ "learning_rate": 4.5963932407667035e-06,
+ "loss": 0.514,
+ "step": 2409
+ },
+ {
+ "epoch": 1.1394799054373523,
+ "grad_norm": 2.6077451705932617,
+ "learning_rate": 4.5960533061961065e-06,
+ "loss": 0.4713,
+ "step": 2410
+ },
+ {
+ "epoch": 1.139952718676123,
+ "grad_norm": 2.3945798873901367,
+ "learning_rate": 4.595713241113461e-06,
+ "loss": 0.466,
+ "step": 2411
+ },
+ {
+ "epoch": 1.1404255319148937,
+ "grad_norm": 2.8100006580352783,
+ "learning_rate": 4.595373045539941e-06,
+ "loss": 0.5365,
+ "step": 2412
+ },
+ {
+ "epoch": 1.1408983451536643,
+ "grad_norm": 2.6825881004333496,
+ "learning_rate": 4.59503271949673e-06,
+ "loss": 0.4457,
+ "step": 2413
+ },
+ {
+ "epoch": 1.141371158392435,
+ "grad_norm": 2.969435691833496,
+ "learning_rate": 4.594692263005016e-06,
+ "loss": 0.5459,
+ "step": 2414
+ },
+ {
+ "epoch": 1.1418439716312057,
+ "grad_norm": 2.4103164672851562,
+ "learning_rate": 4.594351676086002e-06,
+ "loss": 0.4573,
+ "step": 2415
+ },
+ {
+ "epoch": 1.1423167848699765,
+ "grad_norm": 2.9450128078460693,
+ "learning_rate": 4.594010958760892e-06,
+ "loss": 0.5529,
+ "step": 2416
+ },
+ {
+ "epoch": 1.142789598108747,
+ "grad_norm": 2.6416335105895996,
+ "learning_rate": 4.593670111050901e-06,
+ "loss": 0.5153,
+ "step": 2417
+ },
+ {
+ "epoch": 1.1432624113475178,
+ "grad_norm": 2.473177194595337,
+ "learning_rate": 4.593329132977253e-06,
+ "loss": 0.4962,
+ "step": 2418
+ },
+ {
+ "epoch": 1.1437352245862884,
+ "grad_norm": 2.4494502544403076,
+ "learning_rate": 4.592988024561179e-06,
+ "loss": 0.5182,
+ "step": 2419
+ },
+ {
+ "epoch": 1.1442080378250592,
+ "grad_norm": 2.773930311203003,
+ "learning_rate": 4.592646785823918e-06,
+ "loss": 0.4442,
+ "step": 2420
+ },
+ {
+ "epoch": 1.1446808510638298,
+ "grad_norm": 2.4733314514160156,
+ "learning_rate": 4.592305416786718e-06,
+ "loss": 0.5106,
+ "step": 2421
+ },
+ {
+ "epoch": 1.1451536643026006,
+ "grad_norm": 2.6870038509368896,
+ "learning_rate": 4.591963917470834e-06,
+ "loss": 0.5316,
+ "step": 2422
+ },
+ {
+ "epoch": 1.1456264775413711,
+ "grad_norm": 2.8989531993865967,
+ "learning_rate": 4.591622287897529e-06,
+ "loss": 0.5906,
+ "step": 2423
+ },
+ {
+ "epoch": 1.1460992907801417,
+ "grad_norm": 2.6349124908447266,
+ "learning_rate": 4.591280528088077e-06,
+ "loss": 0.6225,
+ "step": 2424
+ },
+ {
+ "epoch": 1.1465721040189125,
+ "grad_norm": 3.19022274017334,
+ "learning_rate": 4.5909386380637555e-06,
+ "loss": 0.555,
+ "step": 2425
+ },
+ {
+ "epoch": 1.1470449172576833,
+ "grad_norm": 3.1473541259765625,
+ "learning_rate": 4.5905966178458535e-06,
+ "loss": 0.537,
+ "step": 2426
+ },
+ {
+ "epoch": 1.147517730496454,
+ "grad_norm": 2.6996145248413086,
+ "learning_rate": 4.590254467455667e-06,
+ "loss": 0.565,
+ "step": 2427
+ },
+ {
+ "epoch": 1.1479905437352245,
+ "grad_norm": 2.830188274383545,
+ "learning_rate": 4.5899121869145015e-06,
+ "loss": 0.6773,
+ "step": 2428
+ },
+ {
+ "epoch": 1.1484633569739953,
+ "grad_norm": 2.4937260150909424,
+ "learning_rate": 4.589569776243667e-06,
+ "loss": 0.5484,
+ "step": 2429
+ },
+ {
+ "epoch": 1.148936170212766,
+ "grad_norm": 2.54011869430542,
+ "learning_rate": 4.589227235464486e-06,
+ "loss": 0.5307,
+ "step": 2430
+ },
+ {
+ "epoch": 1.1494089834515366,
+ "grad_norm": 2.8764214515686035,
+ "learning_rate": 4.5888845645982845e-06,
+ "loss": 0.5296,
+ "step": 2431
+ },
+ {
+ "epoch": 1.1498817966903072,
+ "grad_norm": 2.637033462524414,
+ "learning_rate": 4.588541763666402e-06,
+ "loss": 0.5975,
+ "step": 2432
+ },
+ {
+ "epoch": 1.150354609929078,
+ "grad_norm": 2.8534255027770996,
+ "learning_rate": 4.5881988326901815e-06,
+ "loss": 0.5431,
+ "step": 2433
+ },
+ {
+ "epoch": 1.1508274231678488,
+ "grad_norm": 2.8546559810638428,
+ "learning_rate": 4.587855771690976e-06,
+ "loss": 0.469,
+ "step": 2434
+ },
+ {
+ "epoch": 1.1513002364066194,
+ "grad_norm": 2.9084973335266113,
+ "learning_rate": 4.587512580690146e-06,
+ "loss": 0.5566,
+ "step": 2435
+ },
+ {
+ "epoch": 1.15177304964539,
+ "grad_norm": 3.0993130207061768,
+ "learning_rate": 4.587169259709063e-06,
+ "loss": 0.5612,
+ "step": 2436
+ },
+ {
+ "epoch": 1.1522458628841608,
+ "grad_norm": 10.847400665283203,
+ "learning_rate": 4.5868258087691e-06,
+ "loss": 0.4678,
+ "step": 2437
+ },
+ {
+ "epoch": 1.1527186761229316,
+ "grad_norm": 2.6648571491241455,
+ "learning_rate": 4.586482227891645e-06,
+ "loss": 0.5951,
+ "step": 2438
+ },
+ {
+ "epoch": 1.1531914893617021,
+ "grad_norm": 2.529043197631836,
+ "learning_rate": 4.586138517098091e-06,
+ "loss": 0.5048,
+ "step": 2439
+ },
+ {
+ "epoch": 1.1536643026004727,
+ "grad_norm": 2.833904504776001,
+ "learning_rate": 4.585794676409839e-06,
+ "loss": 0.536,
+ "step": 2440
+ },
+ {
+ "epoch": 1.1541371158392435,
+ "grad_norm": 3.507657766342163,
+ "learning_rate": 4.585450705848298e-06,
+ "loss": 0.5954,
+ "step": 2441
+ },
+ {
+ "epoch": 1.1546099290780143,
+ "grad_norm": 2.6108388900756836,
+ "learning_rate": 4.585106605434887e-06,
+ "loss": 0.5684,
+ "step": 2442
+ },
+ {
+ "epoch": 1.1550827423167849,
+ "grad_norm": 2.490708589553833,
+ "learning_rate": 4.58476237519103e-06,
+ "loss": 0.4678,
+ "step": 2443
+ },
+ {
+ "epoch": 1.1555555555555554,
+ "grad_norm": 2.8192343711853027,
+ "learning_rate": 4.584418015138161e-06,
+ "loss": 0.5291,
+ "step": 2444
+ },
+ {
+ "epoch": 1.1560283687943262,
+ "grad_norm": 3.0878679752349854,
+ "learning_rate": 4.584073525297722e-06,
+ "loss": 0.5691,
+ "step": 2445
+ },
+ {
+ "epoch": 1.156501182033097,
+ "grad_norm": 3.1444318294525146,
+ "learning_rate": 4.583728905691163e-06,
+ "loss": 0.5643,
+ "step": 2446
+ },
+ {
+ "epoch": 1.1569739952718676,
+ "grad_norm": 3.02382230758667,
+ "learning_rate": 4.583384156339942e-06,
+ "loss": 0.6008,
+ "step": 2447
+ },
+ {
+ "epoch": 1.1574468085106382,
+ "grad_norm": 2.5942490100860596,
+ "learning_rate": 4.583039277265525e-06,
+ "loss": 0.5105,
+ "step": 2448
+ },
+ {
+ "epoch": 1.157919621749409,
+ "grad_norm": 2.938608407974243,
+ "learning_rate": 4.582694268489386e-06,
+ "loss": 0.5123,
+ "step": 2449
+ },
+ {
+ "epoch": 1.1583924349881798,
+ "grad_norm": 2.4622268676757812,
+ "learning_rate": 4.5823491300330075e-06,
+ "loss": 0.4538,
+ "step": 2450
+ },
+ {
+ "epoch": 1.1588652482269504,
+ "grad_norm": 2.4380505084991455,
+ "learning_rate": 4.5820038619178795e-06,
+ "loss": 0.4682,
+ "step": 2451
+ },
+ {
+ "epoch": 1.159338061465721,
+ "grad_norm": 2.479896068572998,
+ "learning_rate": 4.581658464165501e-06,
+ "loss": 0.4877,
+ "step": 2452
+ },
+ {
+ "epoch": 1.1598108747044917,
+ "grad_norm": 2.3373546600341797,
+ "learning_rate": 4.5813129367973765e-06,
+ "loss": 0.445,
+ "step": 2453
+ },
+ {
+ "epoch": 1.1602836879432625,
+ "grad_norm": 2.8586013317108154,
+ "learning_rate": 4.5809672798350214e-06,
+ "loss": 0.5232,
+ "step": 2454
+ },
+ {
+ "epoch": 1.160756501182033,
+ "grad_norm": 3.2302439212799072,
+ "learning_rate": 4.5806214932999595e-06,
+ "loss": 0.5336,
+ "step": 2455
+ },
+ {
+ "epoch": 1.1612293144208037,
+ "grad_norm": 3.1005783081054688,
+ "learning_rate": 4.580275577213721e-06,
+ "loss": 0.5123,
+ "step": 2456
+ },
+ {
+ "epoch": 1.1617021276595745,
+ "grad_norm": 2.7131073474884033,
+ "learning_rate": 4.579929531597842e-06,
+ "loss": 0.5648,
+ "step": 2457
+ },
+ {
+ "epoch": 1.1621749408983453,
+ "grad_norm": 2.5067050457000732,
+ "learning_rate": 4.579583356473874e-06,
+ "loss": 0.5324,
+ "step": 2458
+ },
+ {
+ "epoch": 1.1626477541371159,
+ "grad_norm": 2.7870543003082275,
+ "learning_rate": 4.579237051863366e-06,
+ "loss": 0.5094,
+ "step": 2459
+ },
+ {
+ "epoch": 1.1631205673758864,
+ "grad_norm": 2.739196300506592,
+ "learning_rate": 4.578890617787887e-06,
+ "loss": 0.5103,
+ "step": 2460
+ },
+ {
+ "epoch": 1.1635933806146572,
+ "grad_norm": 2.7108185291290283,
+ "learning_rate": 4.578544054269003e-06,
+ "loss": 0.533,
+ "step": 2461
+ },
+ {
+ "epoch": 1.1640661938534278,
+ "grad_norm": 3.028005361557007,
+ "learning_rate": 4.578197361328295e-06,
+ "loss": 0.636,
+ "step": 2462
+ },
+ {
+ "epoch": 1.1645390070921986,
+ "grad_norm": 2.4855129718780518,
+ "learning_rate": 4.5778505389873505e-06,
+ "loss": 0.501,
+ "step": 2463
+ },
+ {
+ "epoch": 1.1650118203309692,
+ "grad_norm": 2.6314198970794678,
+ "learning_rate": 4.577503587267764e-06,
+ "loss": 0.5812,
+ "step": 2464
+ },
+ {
+ "epoch": 1.16548463356974,
+ "grad_norm": 2.4209671020507812,
+ "learning_rate": 4.5771565061911385e-06,
+ "loss": 0.5168,
+ "step": 2465
+ },
+ {
+ "epoch": 1.1659574468085105,
+ "grad_norm": 2.526388645172119,
+ "learning_rate": 4.576809295779085e-06,
+ "loss": 0.5047,
+ "step": 2466
+ },
+ {
+ "epoch": 1.1664302600472813,
+ "grad_norm": 2.8278191089630127,
+ "learning_rate": 4.576461956053224e-06,
+ "loss": 0.4759,
+ "step": 2467
+ },
+ {
+ "epoch": 1.166903073286052,
+ "grad_norm": 2.7862167358398438,
+ "learning_rate": 4.576114487035182e-06,
+ "loss": 0.5492,
+ "step": 2468
+ },
+ {
+ "epoch": 1.1673758865248227,
+ "grad_norm": 2.6303019523620605,
+ "learning_rate": 4.575766888746594e-06,
+ "loss": 0.5538,
+ "step": 2469
+ },
+ {
+ "epoch": 1.1678486997635933,
+ "grad_norm": 2.613104820251465,
+ "learning_rate": 4.5754191612091034e-06,
+ "loss": 0.5114,
+ "step": 2470
+ },
+ {
+ "epoch": 1.168321513002364,
+ "grad_norm": 2.653958320617676,
+ "learning_rate": 4.5750713044443625e-06,
+ "loss": 0.5858,
+ "step": 2471
+ },
+ {
+ "epoch": 1.1687943262411347,
+ "grad_norm": 3.1143975257873535,
+ "learning_rate": 4.574723318474031e-06,
+ "loss": 0.5193,
+ "step": 2472
+ },
+ {
+ "epoch": 1.1692671394799055,
+ "grad_norm": 3.05454421043396,
+ "learning_rate": 4.574375203319775e-06,
+ "loss": 0.464,
+ "step": 2473
+ },
+ {
+ "epoch": 1.169739952718676,
+ "grad_norm": 2.66626238822937,
+ "learning_rate": 4.574026959003272e-06,
+ "loss": 0.4988,
+ "step": 2474
+ },
+ {
+ "epoch": 1.1702127659574468,
+ "grad_norm": 2.8871963024139404,
+ "learning_rate": 4.573678585546203e-06,
+ "loss": 0.5557,
+ "step": 2475
+ },
+ {
+ "epoch": 1.1706855791962174,
+ "grad_norm": 2.592949628829956,
+ "learning_rate": 4.573330082970262e-06,
+ "loss": 0.5178,
+ "step": 2476
+ },
+ {
+ "epoch": 1.1711583924349882,
+ "grad_norm": 2.9111456871032715,
+ "learning_rate": 4.572981451297148e-06,
+ "loss": 0.5712,
+ "step": 2477
+ },
+ {
+ "epoch": 1.1716312056737588,
+ "grad_norm": 2.8152248859405518,
+ "learning_rate": 4.57263269054857e-06,
+ "loss": 0.5548,
+ "step": 2478
+ },
+ {
+ "epoch": 1.1721040189125296,
+ "grad_norm": 3.0292418003082275,
+ "learning_rate": 4.572283800746241e-06,
+ "loss": 0.5937,
+ "step": 2479
+ },
+ {
+ "epoch": 1.1725768321513002,
+ "grad_norm": 3.454618215560913,
+ "learning_rate": 4.571934781911886e-06,
+ "loss": 0.5537,
+ "step": 2480
+ },
+ {
+ "epoch": 1.173049645390071,
+ "grad_norm": 2.7817866802215576,
+ "learning_rate": 4.571585634067239e-06,
+ "loss": 0.5649,
+ "step": 2481
+ },
+ {
+ "epoch": 1.1735224586288415,
+ "grad_norm": 2.7989349365234375,
+ "learning_rate": 4.571236357234037e-06,
+ "loss": 0.5448,
+ "step": 2482
+ },
+ {
+ "epoch": 1.1739952718676123,
+ "grad_norm": 2.8863933086395264,
+ "learning_rate": 4.57088695143403e-06,
+ "loss": 0.63,
+ "step": 2483
+ },
+ {
+ "epoch": 1.174468085106383,
+ "grad_norm": 2.5738039016723633,
+ "learning_rate": 4.570537416688972e-06,
+ "loss": 0.4702,
+ "step": 2484
+ },
+ {
+ "epoch": 1.1749408983451537,
+ "grad_norm": 3.003643274307251,
+ "learning_rate": 4.570187753020629e-06,
+ "loss": 0.5918,
+ "step": 2485
+ },
+ {
+ "epoch": 1.1754137115839243,
+ "grad_norm": 2.8619167804718018,
+ "learning_rate": 4.569837960450772e-06,
+ "loss": 0.5268,
+ "step": 2486
+ },
+ {
+ "epoch": 1.175886524822695,
+ "grad_norm": 2.876077175140381,
+ "learning_rate": 4.569488039001181e-06,
+ "loss": 0.4915,
+ "step": 2487
+ },
+ {
+ "epoch": 1.1763593380614656,
+ "grad_norm": 3.407115936279297,
+ "learning_rate": 4.569137988693644e-06,
+ "loss": 0.5761,
+ "step": 2488
+ },
+ {
+ "epoch": 1.1768321513002364,
+ "grad_norm": 2.7292826175689697,
+ "learning_rate": 4.568787809549958e-06,
+ "loss": 0.541,
+ "step": 2489
+ },
+ {
+ "epoch": 1.177304964539007,
+ "grad_norm": 2.8805999755859375,
+ "learning_rate": 4.568437501591926e-06,
+ "loss": 0.6223,
+ "step": 2490
+ },
+ {
+ "epoch": 1.1777777777777778,
+ "grad_norm": 2.9264373779296875,
+ "learning_rate": 4.56808706484136e-06,
+ "loss": 0.6081,
+ "step": 2491
+ },
+ {
+ "epoch": 1.1782505910165484,
+ "grad_norm": 2.5167033672332764,
+ "learning_rate": 4.567736499320082e-06,
+ "loss": 0.5393,
+ "step": 2492
+ },
+ {
+ "epoch": 1.1787234042553192,
+ "grad_norm": 3.4647862911224365,
+ "learning_rate": 4.567385805049918e-06,
+ "loss": 0.4826,
+ "step": 2493
+ },
+ {
+ "epoch": 1.1791962174940898,
+ "grad_norm": 2.9824202060699463,
+ "learning_rate": 4.5670349820527055e-06,
+ "loss": 0.541,
+ "step": 2494
+ },
+ {
+ "epoch": 1.1796690307328606,
+ "grad_norm": 2.997105836868286,
+ "learning_rate": 4.5666840303502885e-06,
+ "loss": 0.5771,
+ "step": 2495
+ },
+ {
+ "epoch": 1.1801418439716311,
+ "grad_norm": 2.8728017807006836,
+ "learning_rate": 4.56633294996452e-06,
+ "loss": 0.4877,
+ "step": 2496
+ },
+ {
+ "epoch": 1.180614657210402,
+ "grad_norm": 2.626498222351074,
+ "learning_rate": 4.5659817409172565e-06,
+ "loss": 0.5296,
+ "step": 2497
+ },
+ {
+ "epoch": 1.1810874704491725,
+ "grad_norm": 2.87037992477417,
+ "learning_rate": 4.565630403230371e-06,
+ "loss": 0.539,
+ "step": 2498
+ },
+ {
+ "epoch": 1.1815602836879433,
+ "grad_norm": 2.5719685554504395,
+ "learning_rate": 4.5652789369257375e-06,
+ "loss": 0.5653,
+ "step": 2499
+ },
+ {
+ "epoch": 1.1820330969267139,
+ "grad_norm": 2.4842135906219482,
+ "learning_rate": 4.56492734202524e-06,
+ "loss": 0.515,
+ "step": 2500
+ },
+ {
+ "epoch": 1.1825059101654847,
+ "grad_norm": 2.640951156616211,
+ "learning_rate": 4.564575618550773e-06,
+ "loss": 0.5601,
+ "step": 2501
+ },
+ {
+ "epoch": 1.1829787234042553,
+ "grad_norm": 2.624394655227661,
+ "learning_rate": 4.564223766524234e-06,
+ "loss": 0.5551,
+ "step": 2502
+ },
+ {
+ "epoch": 1.183451536643026,
+ "grad_norm": 3.014537811279297,
+ "learning_rate": 4.563871785967533e-06,
+ "loss": 0.5212,
+ "step": 2503
+ },
+ {
+ "epoch": 1.1839243498817966,
+ "grad_norm": 2.8756890296936035,
+ "learning_rate": 4.563519676902585e-06,
+ "loss": 0.5132,
+ "step": 2504
+ },
+ {
+ "epoch": 1.1843971631205674,
+ "grad_norm": 2.636781692504883,
+ "learning_rate": 4.5631674393513145e-06,
+ "loss": 0.5323,
+ "step": 2505
+ },
+ {
+ "epoch": 1.184869976359338,
+ "grad_norm": 2.7233786582946777,
+ "learning_rate": 4.562815073335655e-06,
+ "loss": 0.5608,
+ "step": 2506
+ },
+ {
+ "epoch": 1.1853427895981088,
+ "grad_norm": 2.7158713340759277,
+ "learning_rate": 4.562462578877546e-06,
+ "loss": 0.5373,
+ "step": 2507
+ },
+ {
+ "epoch": 1.1858156028368794,
+ "grad_norm": 2.9754762649536133,
+ "learning_rate": 4.562109955998936e-06,
+ "loss": 0.5712,
+ "step": 2508
+ },
+ {
+ "epoch": 1.1862884160756502,
+ "grad_norm": 2.8815054893493652,
+ "learning_rate": 4.561757204721781e-06,
+ "loss": 0.6126,
+ "step": 2509
+ },
+ {
+ "epoch": 1.1867612293144207,
+ "grad_norm": 2.866319417953491,
+ "learning_rate": 4.561404325068045e-06,
+ "loss": 0.506,
+ "step": 2510
+ },
+ {
+ "epoch": 1.1872340425531915,
+ "grad_norm": 2.6187376976013184,
+ "learning_rate": 4.561051317059701e-06,
+ "loss": 0.4674,
+ "step": 2511
+ },
+ {
+ "epoch": 1.1877068557919621,
+ "grad_norm": 2.642552137374878,
+ "learning_rate": 4.560698180718729e-06,
+ "loss": 0.4793,
+ "step": 2512
+ },
+ {
+ "epoch": 1.188179669030733,
+ "grad_norm": 2.7815041542053223,
+ "learning_rate": 4.560344916067117e-06,
+ "loss": 0.5034,
+ "step": 2513
+ },
+ {
+ "epoch": 1.1886524822695035,
+ "grad_norm": 2.70853590965271,
+ "learning_rate": 4.559991523126862e-06,
+ "loss": 0.4811,
+ "step": 2514
+ },
+ {
+ "epoch": 1.1891252955082743,
+ "grad_norm": 2.7049436569213867,
+ "learning_rate": 4.559638001919967e-06,
+ "loss": 0.547,
+ "step": 2515
+ },
+ {
+ "epoch": 1.1895981087470449,
+ "grad_norm": 2.766773223876953,
+ "learning_rate": 4.559284352468445e-06,
+ "loss": 0.5362,
+ "step": 2516
+ },
+ {
+ "epoch": 1.1900709219858157,
+ "grad_norm": 3.0064334869384766,
+ "learning_rate": 4.558930574794316e-06,
+ "loss": 0.5915,
+ "step": 2517
+ },
+ {
+ "epoch": 1.1905437352245862,
+ "grad_norm": 2.4899885654449463,
+ "learning_rate": 4.558576668919609e-06,
+ "loss": 0.4379,
+ "step": 2518
+ },
+ {
+ "epoch": 1.191016548463357,
+ "grad_norm": 2.925963878631592,
+ "learning_rate": 4.558222634866358e-06,
+ "loss": 0.5389,
+ "step": 2519
+ },
+ {
+ "epoch": 1.1914893617021276,
+ "grad_norm": 6.087667465209961,
+ "learning_rate": 4.55786847265661e-06,
+ "loss": 0.4777,
+ "step": 2520
+ },
+ {
+ "epoch": 1.1919621749408984,
+ "grad_norm": 2.4560582637786865,
+ "learning_rate": 4.5575141823124145e-06,
+ "loss": 0.5576,
+ "step": 2521
+ },
+ {
+ "epoch": 1.192434988179669,
+ "grad_norm": 3.184252977371216,
+ "learning_rate": 4.557159763855834e-06,
+ "loss": 0.5151,
+ "step": 2522
+ },
+ {
+ "epoch": 1.1929078014184398,
+ "grad_norm": 2.359722137451172,
+ "learning_rate": 4.556805217308935e-06,
+ "loss": 0.478,
+ "step": 2523
+ },
+ {
+ "epoch": 1.1933806146572103,
+ "grad_norm": 3.0821568965911865,
+ "learning_rate": 4.5564505426937935e-06,
+ "loss": 0.5784,
+ "step": 2524
+ },
+ {
+ "epoch": 1.1938534278959811,
+ "grad_norm": 2.9905128479003906,
+ "learning_rate": 4.5560957400324936e-06,
+ "loss": 0.6087,
+ "step": 2525
+ },
+ {
+ "epoch": 1.1943262411347517,
+ "grad_norm": 2.462102174758911,
+ "learning_rate": 4.555740809347128e-06,
+ "loss": 0.4739,
+ "step": 2526
+ },
+ {
+ "epoch": 1.1947990543735225,
+ "grad_norm": 2.7931067943573,
+ "learning_rate": 4.555385750659796e-06,
+ "loss": 0.4961,
+ "step": 2527
+ },
+ {
+ "epoch": 1.195271867612293,
+ "grad_norm": 2.660320997238159,
+ "learning_rate": 4.555030563992607e-06,
+ "loss": 0.487,
+ "step": 2528
+ },
+ {
+ "epoch": 1.195744680851064,
+ "grad_norm": 2.8135557174682617,
+ "learning_rate": 4.554675249367675e-06,
+ "loss": 0.5269,
+ "step": 2529
+ },
+ {
+ "epoch": 1.1962174940898345,
+ "grad_norm": 2.661933422088623,
+ "learning_rate": 4.554319806807126e-06,
+ "loss": 0.4723,
+ "step": 2530
+ },
+ {
+ "epoch": 1.1966903073286053,
+ "grad_norm": 2.568176507949829,
+ "learning_rate": 4.553964236333089e-06,
+ "loss": 0.5258,
+ "step": 2531
+ },
+ {
+ "epoch": 1.1971631205673758,
+ "grad_norm": 2.6890947818756104,
+ "learning_rate": 4.553608537967705e-06,
+ "loss": 0.4965,
+ "step": 2532
+ },
+ {
+ "epoch": 1.1976359338061466,
+ "grad_norm": 3.133470058441162,
+ "learning_rate": 4.553252711733124e-06,
+ "loss": 0.5423,
+ "step": 2533
+ },
+ {
+ "epoch": 1.1981087470449172,
+ "grad_norm": 2.7086687088012695,
+ "learning_rate": 4.552896757651498e-06,
+ "loss": 0.5326,
+ "step": 2534
+ },
+ {
+ "epoch": 1.198581560283688,
+ "grad_norm": 2.8411715030670166,
+ "learning_rate": 4.552540675744994e-06,
+ "loss": 0.5793,
+ "step": 2535
+ },
+ {
+ "epoch": 1.1990543735224586,
+ "grad_norm": 3.041077136993408,
+ "learning_rate": 4.552184466035782e-06,
+ "loss": 0.5068,
+ "step": 2536
+ },
+ {
+ "epoch": 1.1995271867612294,
+ "grad_norm": 2.5921192169189453,
+ "learning_rate": 4.551828128546041e-06,
+ "loss": 0.5189,
+ "step": 2537
+ },
+ {
+ "epoch": 1.2,
+ "grad_norm": 2.923305034637451,
+ "learning_rate": 4.5514716632979605e-06,
+ "loss": 0.516,
+ "step": 2538
+ },
+ {
+ "epoch": 1.2004728132387708,
+ "grad_norm": 2.7083024978637695,
+ "learning_rate": 4.551115070313734e-06,
+ "loss": 0.4825,
+ "step": 2539
+ },
+ {
+ "epoch": 1.2009456264775413,
+ "grad_norm": 2.746842384338379,
+ "learning_rate": 4.550758349615567e-06,
+ "loss": 0.5691,
+ "step": 2540
+ },
+ {
+ "epoch": 1.2014184397163121,
+ "grad_norm": 2.6596429347991943,
+ "learning_rate": 4.550401501225669e-06,
+ "loss": 0.5983,
+ "step": 2541
+ },
+ {
+ "epoch": 1.2018912529550827,
+ "grad_norm": 2.9057931900024414,
+ "learning_rate": 4.550044525166261e-06,
+ "loss": 0.5069,
+ "step": 2542
+ },
+ {
+ "epoch": 1.2023640661938535,
+ "grad_norm": 2.6139039993286133,
+ "learning_rate": 4.5496874214595686e-06,
+ "loss": 0.5102,
+ "step": 2543
+ },
+ {
+ "epoch": 1.202836879432624,
+ "grad_norm": 2.630286455154419,
+ "learning_rate": 4.5493301901278285e-06,
+ "loss": 0.4902,
+ "step": 2544
+ },
+ {
+ "epoch": 1.2033096926713949,
+ "grad_norm": 2.639174222946167,
+ "learning_rate": 4.548972831193284e-06,
+ "loss": 0.4566,
+ "step": 2545
+ },
+ {
+ "epoch": 1.2037825059101654,
+ "grad_norm": 2.9569664001464844,
+ "learning_rate": 4.548615344678186e-06,
+ "loss": 0.5636,
+ "step": 2546
+ },
+ {
+ "epoch": 1.2042553191489362,
+ "grad_norm": 2.981734037399292,
+ "learning_rate": 4.5482577306047924e-06,
+ "loss": 0.4884,
+ "step": 2547
+ },
+ {
+ "epoch": 1.2047281323877068,
+ "grad_norm": 2.6760342121124268,
+ "learning_rate": 4.547899988995371e-06,
+ "loss": 0.5426,
+ "step": 2548
+ },
+ {
+ "epoch": 1.2052009456264776,
+ "grad_norm": 2.825805902481079,
+ "learning_rate": 4.547542119872198e-06,
+ "loss": 0.4989,
+ "step": 2549
+ },
+ {
+ "epoch": 1.2056737588652482,
+ "grad_norm": 2.856426954269409,
+ "learning_rate": 4.547184123257555e-06,
+ "loss": 0.5734,
+ "step": 2550
+ },
+ {
+ "epoch": 1.206146572104019,
+ "grad_norm": 2.555682420730591,
+ "learning_rate": 4.5468259991737334e-06,
+ "loss": 0.5299,
+ "step": 2551
+ },
+ {
+ "epoch": 1.2066193853427896,
+ "grad_norm": 2.6324024200439453,
+ "learning_rate": 4.546467747643032e-06,
+ "loss": 0.5906,
+ "step": 2552
+ },
+ {
+ "epoch": 1.2070921985815604,
+ "grad_norm": 3.4145350456237793,
+ "learning_rate": 4.546109368687757e-06,
+ "loss": 0.5153,
+ "step": 2553
+ },
+ {
+ "epoch": 1.207565011820331,
+ "grad_norm": 2.658691644668579,
+ "learning_rate": 4.545750862330225e-06,
+ "loss": 0.5759,
+ "step": 2554
+ },
+ {
+ "epoch": 1.2080378250591017,
+ "grad_norm": 3.162605047225952,
+ "learning_rate": 4.545392228592755e-06,
+ "loss": 0.5379,
+ "step": 2555
+ },
+ {
+ "epoch": 1.2085106382978723,
+ "grad_norm": 2.8631198406219482,
+ "learning_rate": 4.545033467497681e-06,
+ "loss": 0.5959,
+ "step": 2556
+ },
+ {
+ "epoch": 1.208983451536643,
+ "grad_norm": 2.457109212875366,
+ "learning_rate": 4.54467457906734e-06,
+ "loss": 0.4864,
+ "step": 2557
+ },
+ {
+ "epoch": 1.2094562647754137,
+ "grad_norm": 2.5307061672210693,
+ "learning_rate": 4.544315563324078e-06,
+ "loss": 0.5308,
+ "step": 2558
+ },
+ {
+ "epoch": 1.2099290780141845,
+ "grad_norm": 2.8482773303985596,
+ "learning_rate": 4.543956420290251e-06,
+ "loss": 0.5126,
+ "step": 2559
+ },
+ {
+ "epoch": 1.210401891252955,
+ "grad_norm": 2.4990832805633545,
+ "learning_rate": 4.5435971499882195e-06,
+ "loss": 0.4534,
+ "step": 2560
+ },
+ {
+ "epoch": 1.2108747044917259,
+ "grad_norm": 2.6292665004730225,
+ "learning_rate": 4.543237752440354e-06,
+ "loss": 0.4434,
+ "step": 2561
+ },
+ {
+ "epoch": 1.2113475177304964,
+ "grad_norm": 2.865983247756958,
+ "learning_rate": 4.542878227669033e-06,
+ "loss": 0.5667,
+ "step": 2562
+ },
+ {
+ "epoch": 1.2118203309692672,
+ "grad_norm": 2.745614528656006,
+ "learning_rate": 4.542518575696644e-06,
+ "loss": 0.4724,
+ "step": 2563
+ },
+ {
+ "epoch": 1.2122931442080378,
+ "grad_norm": 2.8562581539154053,
+ "learning_rate": 4.5421587965455785e-06,
+ "loss": 0.5405,
+ "step": 2564
+ },
+ {
+ "epoch": 1.2127659574468086,
+ "grad_norm": 2.6670095920562744,
+ "learning_rate": 4.5417988902382385e-06,
+ "loss": 0.5432,
+ "step": 2565
+ },
+ {
+ "epoch": 1.2132387706855792,
+ "grad_norm": 2.9320743083953857,
+ "learning_rate": 4.541438856797036e-06,
+ "loss": 0.5862,
+ "step": 2566
+ },
+ {
+ "epoch": 1.21371158392435,
+ "grad_norm": 2.577505588531494,
+ "learning_rate": 4.541078696244386e-06,
+ "loss": 0.4742,
+ "step": 2567
+ },
+ {
+ "epoch": 1.2141843971631205,
+ "grad_norm": 3.4476120471954346,
+ "learning_rate": 4.540718408602717e-06,
+ "loss": 0.5903,
+ "step": 2568
+ },
+ {
+ "epoch": 1.2146572104018913,
+ "grad_norm": 2.816210985183716,
+ "learning_rate": 4.540357993894459e-06,
+ "loss": 0.5033,
+ "step": 2569
+ },
+ {
+ "epoch": 1.215130023640662,
+ "grad_norm": 3.0806639194488525,
+ "learning_rate": 4.539997452142058e-06,
+ "loss": 0.6064,
+ "step": 2570
+ },
+ {
+ "epoch": 1.2156028368794327,
+ "grad_norm": 2.563060760498047,
+ "learning_rate": 4.5396367833679586e-06,
+ "loss": 0.5597,
+ "step": 2571
+ },
+ {
+ "epoch": 1.2160756501182033,
+ "grad_norm": 3.1014397144317627,
+ "learning_rate": 4.5392759875946215e-06,
+ "loss": 0.54,
+ "step": 2572
+ },
+ {
+ "epoch": 1.216548463356974,
+ "grad_norm": 3.124190330505371,
+ "learning_rate": 4.53891506484451e-06,
+ "loss": 0.5122,
+ "step": 2573
+ },
+ {
+ "epoch": 1.2170212765957447,
+ "grad_norm": 2.6688716411590576,
+ "learning_rate": 4.538554015140097e-06,
+ "loss": 0.5615,
+ "step": 2574
+ },
+ {
+ "epoch": 1.2174940898345155,
+ "grad_norm": 2.775543689727783,
+ "learning_rate": 4.538192838503866e-06,
+ "loss": 0.496,
+ "step": 2575
+ },
+ {
+ "epoch": 1.217966903073286,
+ "grad_norm": 2.7877283096313477,
+ "learning_rate": 4.537831534958303e-06,
+ "loss": 0.4995,
+ "step": 2576
+ },
+ {
+ "epoch": 1.2184397163120568,
+ "grad_norm": 2.824810028076172,
+ "learning_rate": 4.537470104525906e-06,
+ "loss": 0.5481,
+ "step": 2577
+ },
+ {
+ "epoch": 1.2189125295508274,
+ "grad_norm": 2.801269292831421,
+ "learning_rate": 4.53710854722918e-06,
+ "loss": 0.5628,
+ "step": 2578
+ },
+ {
+ "epoch": 1.2193853427895982,
+ "grad_norm": 2.7780683040618896,
+ "learning_rate": 4.536746863090637e-06,
+ "loss": 0.4845,
+ "step": 2579
+ },
+ {
+ "epoch": 1.2198581560283688,
+ "grad_norm": 2.536010265350342,
+ "learning_rate": 4.536385052132798e-06,
+ "loss": 0.4771,
+ "step": 2580
+ },
+ {
+ "epoch": 1.2203309692671396,
+ "grad_norm": 2.768775701522827,
+ "learning_rate": 4.536023114378191e-06,
+ "loss": 0.5366,
+ "step": 2581
+ },
+ {
+ "epoch": 1.2208037825059102,
+ "grad_norm": 2.658125877380371,
+ "learning_rate": 4.535661049849352e-06,
+ "loss": 0.524,
+ "step": 2582
+ },
+ {
+ "epoch": 1.2212765957446807,
+ "grad_norm": 2.558696746826172,
+ "learning_rate": 4.535298858568825e-06,
+ "loss": 0.5482,
+ "step": 2583
+ },
+ {
+ "epoch": 1.2217494089834515,
+ "grad_norm": 2.5284535884857178,
+ "learning_rate": 4.534936540559164e-06,
+ "loss": 0.4454,
+ "step": 2584
+ },
+ {
+ "epoch": 1.2222222222222223,
+ "grad_norm": 7.617330074310303,
+ "learning_rate": 4.534574095842927e-06,
+ "loss": 0.5615,
+ "step": 2585
+ },
+ {
+ "epoch": 1.222695035460993,
+ "grad_norm": 2.9120311737060547,
+ "learning_rate": 4.534211524442682e-06,
+ "loss": 0.5624,
+ "step": 2586
+ },
+ {
+ "epoch": 1.2231678486997635,
+ "grad_norm": 2.5004289150238037,
+ "learning_rate": 4.533848826381005e-06,
+ "loss": 0.4743,
+ "step": 2587
+ },
+ {
+ "epoch": 1.2236406619385343,
+ "grad_norm": 2.8395533561706543,
+ "learning_rate": 4.53348600168048e-06,
+ "loss": 0.4457,
+ "step": 2588
+ },
+ {
+ "epoch": 1.224113475177305,
+ "grad_norm": 2.832211494445801,
+ "learning_rate": 4.533123050363699e-06,
+ "loss": 0.5559,
+ "step": 2589
+ },
+ {
+ "epoch": 1.2245862884160756,
+ "grad_norm": 2.6318583488464355,
+ "learning_rate": 4.53275997245326e-06,
+ "loss": 0.5281,
+ "step": 2590
+ },
+ {
+ "epoch": 1.2250591016548462,
+ "grad_norm": 3.0509233474731445,
+ "learning_rate": 4.532396767971771e-06,
+ "loss": 0.6003,
+ "step": 2591
+ },
+ {
+ "epoch": 1.225531914893617,
+ "grad_norm": 2.6863620281219482,
+ "learning_rate": 4.532033436941847e-06,
+ "loss": 0.5219,
+ "step": 2592
+ },
+ {
+ "epoch": 1.2260047281323878,
+ "grad_norm": 2.401463747024536,
+ "learning_rate": 4.5316699793861104e-06,
+ "loss": 0.5994,
+ "step": 2593
+ },
+ {
+ "epoch": 1.2264775413711584,
+ "grad_norm": 2.613517999649048,
+ "learning_rate": 4.531306395327194e-06,
+ "loss": 0.5785,
+ "step": 2594
+ },
+ {
+ "epoch": 1.226950354609929,
+ "grad_norm": 2.5016374588012695,
+ "learning_rate": 4.530942684787735e-06,
+ "loss": 0.5695,
+ "step": 2595
+ },
+ {
+ "epoch": 1.2274231678486998,
+ "grad_norm": 2.576464891433716,
+ "learning_rate": 4.53057884779038e-06,
+ "loss": 0.4427,
+ "step": 2596
+ },
+ {
+ "epoch": 1.2278959810874706,
+ "grad_norm": 2.5688700675964355,
+ "learning_rate": 4.530214884357785e-06,
+ "loss": 0.4966,
+ "step": 2597
+ },
+ {
+ "epoch": 1.2283687943262411,
+ "grad_norm": 3.179013729095459,
+ "learning_rate": 4.52985079451261e-06,
+ "loss": 0.5239,
+ "step": 2598
+ },
+ {
+ "epoch": 1.2288416075650117,
+ "grad_norm": 2.6015284061431885,
+ "learning_rate": 4.529486578277527e-06,
+ "loss": 0.5135,
+ "step": 2599
+ },
+ {
+ "epoch": 1.2293144208037825,
+ "grad_norm": 2.3029589653015137,
+ "learning_rate": 4.529122235675214e-06,
+ "loss": 0.4044,
+ "step": 2600
+ },
+ {
+ "epoch": 1.2297872340425533,
+ "grad_norm": 2.994093656539917,
+ "learning_rate": 4.528757766728357e-06,
+ "loss": 0.5419,
+ "step": 2601
+ },
+ {
+ "epoch": 1.2302600472813239,
+ "grad_norm": 2.6297390460968018,
+ "learning_rate": 4.52839317145965e-06,
+ "loss": 0.488,
+ "step": 2602
+ },
+ {
+ "epoch": 1.2307328605200945,
+ "grad_norm": 2.4814043045043945,
+ "learning_rate": 4.528028449891793e-06,
+ "loss": 0.4917,
+ "step": 2603
+ },
+ {
+ "epoch": 1.2312056737588652,
+ "grad_norm": 3.6052863597869873,
+ "learning_rate": 4.527663602047499e-06,
+ "loss": 0.5301,
+ "step": 2604
+ },
+ {
+ "epoch": 1.231678486997636,
+ "grad_norm": 2.6984751224517822,
+ "learning_rate": 4.5272986279494825e-06,
+ "loss": 0.5253,
+ "step": 2605
+ },
+ {
+ "epoch": 1.2321513002364066,
+ "grad_norm": 2.514000415802002,
+ "learning_rate": 4.526933527620469e-06,
+ "loss": 0.5661,
+ "step": 2606
+ },
+ {
+ "epoch": 1.2326241134751772,
+ "grad_norm": 2.890921115875244,
+ "learning_rate": 4.526568301083195e-06,
+ "loss": 0.5585,
+ "step": 2607
+ },
+ {
+ "epoch": 1.233096926713948,
+ "grad_norm": 2.6390011310577393,
+ "learning_rate": 4.526202948360397e-06,
+ "loss": 0.5168,
+ "step": 2608
+ },
+ {
+ "epoch": 1.2335697399527188,
+ "grad_norm": 2.7370636463165283,
+ "learning_rate": 4.5258374694748266e-06,
+ "loss": 0.5453,
+ "step": 2609
+ },
+ {
+ "epoch": 1.2340425531914894,
+ "grad_norm": 2.8203976154327393,
+ "learning_rate": 4.52547186444924e-06,
+ "loss": 0.5763,
+ "step": 2610
+ },
+ {
+ "epoch": 1.23451536643026,
+ "grad_norm": 2.7567849159240723,
+ "learning_rate": 4.5251061333064025e-06,
+ "loss": 0.5194,
+ "step": 2611
+ },
+ {
+ "epoch": 1.2349881796690307,
+ "grad_norm": 2.767519474029541,
+ "learning_rate": 4.524740276069085e-06,
+ "loss": 0.5355,
+ "step": 2612
+ },
+ {
+ "epoch": 1.2354609929078015,
+ "grad_norm": 3.072035312652588,
+ "learning_rate": 4.5243742927600695e-06,
+ "loss": 0.5391,
+ "step": 2613
+ },
+ {
+ "epoch": 1.2359338061465721,
+ "grad_norm": 2.5957462787628174,
+ "learning_rate": 4.524008183402143e-06,
+ "loss": 0.5645,
+ "step": 2614
+ },
+ {
+ "epoch": 1.2364066193853427,
+ "grad_norm": 2.774897575378418,
+ "learning_rate": 4.523641948018101e-06,
+ "loss": 0.5576,
+ "step": 2615
+ },
+ {
+ "epoch": 1.2368794326241135,
+ "grad_norm": 2.635887622833252,
+ "learning_rate": 4.5232755866307496e-06,
+ "loss": 0.5254,
+ "step": 2616
+ },
+ {
+ "epoch": 1.2373522458628843,
+ "grad_norm": 2.4860997200012207,
+ "learning_rate": 4.522909099262899e-06,
+ "loss": 0.4692,
+ "step": 2617
+ },
+ {
+ "epoch": 1.2378250591016549,
+ "grad_norm": 2.595513105392456,
+ "learning_rate": 4.522542485937369e-06,
+ "loss": 0.5166,
+ "step": 2618
+ },
+ {
+ "epoch": 1.2382978723404254,
+ "grad_norm": 2.961474895477295,
+ "learning_rate": 4.522175746676986e-06,
+ "loss": 0.5455,
+ "step": 2619
+ },
+ {
+ "epoch": 1.2387706855791962,
+ "grad_norm": 2.813889741897583,
+ "learning_rate": 4.521808881504588e-06,
+ "loss": 0.5249,
+ "step": 2620
+ },
+ {
+ "epoch": 1.239243498817967,
+ "grad_norm": 2.8434813022613525,
+ "learning_rate": 4.521441890443015e-06,
+ "loss": 0.472,
+ "step": 2621
+ },
+ {
+ "epoch": 1.2397163120567376,
+ "grad_norm": 2.4264845848083496,
+ "learning_rate": 4.521074773515119e-06,
+ "loss": 0.4783,
+ "step": 2622
+ },
+ {
+ "epoch": 1.2401891252955082,
+ "grad_norm": 2.615169048309326,
+ "learning_rate": 4.520707530743761e-06,
+ "loss": 0.5324,
+ "step": 2623
+ },
+ {
+ "epoch": 1.240661938534279,
+ "grad_norm": 2.6772537231445312,
+ "learning_rate": 4.520340162151803e-06,
+ "loss": 0.5224,
+ "step": 2624
+ },
+ {
+ "epoch": 1.2411347517730495,
+ "grad_norm": 2.683393955230713,
+ "learning_rate": 4.519972667762124e-06,
+ "loss": 0.4863,
+ "step": 2625
+ },
+ {
+ "epoch": 1.2416075650118203,
+ "grad_norm": 3.0335750579833984,
+ "learning_rate": 4.519605047597603e-06,
+ "loss": 0.544,
+ "step": 2626
+ },
+ {
+ "epoch": 1.242080378250591,
+ "grad_norm": 2.8694353103637695,
+ "learning_rate": 4.519237301681132e-06,
+ "loss": 0.5576,
+ "step": 2627
+ },
+ {
+ "epoch": 1.2425531914893617,
+ "grad_norm": 3.217808246612549,
+ "learning_rate": 4.518869430035609e-06,
+ "loss": 0.5459,
+ "step": 2628
+ },
+ {
+ "epoch": 1.2430260047281323,
+ "grad_norm": 2.7700083255767822,
+ "learning_rate": 4.518501432683937e-06,
+ "loss": 0.5579,
+ "step": 2629
+ },
+ {
+ "epoch": 1.243498817966903,
+ "grad_norm": 2.4759175777435303,
+ "learning_rate": 4.5181333096490335e-06,
+ "loss": 0.5049,
+ "step": 2630
+ },
+ {
+ "epoch": 1.2439716312056737,
+ "grad_norm": 2.8652584552764893,
+ "learning_rate": 4.517765060953818e-06,
+ "loss": 0.5366,
+ "step": 2631
+ },
+ {
+ "epoch": 1.2444444444444445,
+ "grad_norm": 2.776334524154663,
+ "learning_rate": 4.517396686621218e-06,
+ "loss": 0.5677,
+ "step": 2632
+ },
+ {
+ "epoch": 1.244917257683215,
+ "grad_norm": 2.676708221435547,
+ "learning_rate": 4.517028186674174e-06,
+ "loss": 0.5055,
+ "step": 2633
+ },
+ {
+ "epoch": 1.2453900709219858,
+ "grad_norm": 2.6851537227630615,
+ "learning_rate": 4.516659561135629e-06,
+ "loss": 0.5537,
+ "step": 2634
+ },
+ {
+ "epoch": 1.2458628841607564,
+ "grad_norm": 2.619971513748169,
+ "learning_rate": 4.516290810028536e-06,
+ "loss": 0.5765,
+ "step": 2635
+ },
+ {
+ "epoch": 1.2463356973995272,
+ "grad_norm": 2.7302334308624268,
+ "learning_rate": 4.515921933375855e-06,
+ "loss": 0.5611,
+ "step": 2636
+ },
+ {
+ "epoch": 1.2468085106382978,
+ "grad_norm": 2.5005829334259033,
+ "learning_rate": 4.5155529312005554e-06,
+ "loss": 0.442,
+ "step": 2637
+ },
+ {
+ "epoch": 1.2472813238770686,
+ "grad_norm": 2.713587522506714,
+ "learning_rate": 4.515183803525612e-06,
+ "loss": 0.5023,
+ "step": 2638
+ },
+ {
+ "epoch": 1.2477541371158392,
+ "grad_norm": 2.5146236419677734,
+ "learning_rate": 4.514814550374009e-06,
+ "loss": 0.5195,
+ "step": 2639
+ },
+ {
+ "epoch": 1.24822695035461,
+ "grad_norm": 2.761060953140259,
+ "learning_rate": 4.51444517176874e-06,
+ "loss": 0.5138,
+ "step": 2640
+ },
+ {
+ "epoch": 1.2486997635933805,
+ "grad_norm": 3.082329273223877,
+ "learning_rate": 4.5140756677328026e-06,
+ "loss": 0.6105,
+ "step": 2641
+ },
+ {
+ "epoch": 1.2491725768321513,
+ "grad_norm": 2.6933493614196777,
+ "learning_rate": 4.513706038289205e-06,
+ "loss": 0.5185,
+ "step": 2642
+ },
+ {
+ "epoch": 1.249645390070922,
+ "grad_norm": 2.515856981277466,
+ "learning_rate": 4.513336283460962e-06,
+ "loss": 0.5375,
+ "step": 2643
+ },
+ {
+ "epoch": 1.2501182033096927,
+ "grad_norm": 2.8553731441497803,
+ "learning_rate": 4.512966403271096e-06,
+ "loss": 0.5582,
+ "step": 2644
+ },
+ {
+ "epoch": 1.2505910165484633,
+ "grad_norm": 2.640880823135376,
+ "learning_rate": 4.5125963977426405e-06,
+ "loss": 0.5125,
+ "step": 2645
+ },
+ {
+ "epoch": 1.251063829787234,
+ "grad_norm": 2.9845943450927734,
+ "learning_rate": 4.512226266898631e-06,
+ "loss": 0.4749,
+ "step": 2646
+ },
+ {
+ "epoch": 1.2515366430260046,
+ "grad_norm": 2.5131032466888428,
+ "learning_rate": 4.511856010762116e-06,
+ "loss": 0.4764,
+ "step": 2647
+ },
+ {
+ "epoch": 1.2520094562647754,
+ "grad_norm": 2.370638370513916,
+ "learning_rate": 4.511485629356148e-06,
+ "loss": 0.5153,
+ "step": 2648
+ },
+ {
+ "epoch": 1.252482269503546,
+ "grad_norm": 2.912461996078491,
+ "learning_rate": 4.511115122703791e-06,
+ "loss": 0.6117,
+ "step": 2649
+ },
+ {
+ "epoch": 1.2529550827423168,
+ "grad_norm": 2.7308082580566406,
+ "learning_rate": 4.510744490828113e-06,
+ "loss": 0.5076,
+ "step": 2650
+ },
+ {
+ "epoch": 1.2534278959810874,
+ "grad_norm": 2.8524296283721924,
+ "learning_rate": 4.510373733752193e-06,
+ "loss": 0.542,
+ "step": 2651
+ },
+ {
+ "epoch": 1.2539007092198582,
+ "grad_norm": 2.799377202987671,
+ "learning_rate": 4.5100028514991145e-06,
+ "loss": 0.486,
+ "step": 2652
+ },
+ {
+ "epoch": 1.2543735224586288,
+ "grad_norm": 2.7248027324676514,
+ "learning_rate": 4.509631844091973e-06,
+ "loss": 0.4972,
+ "step": 2653
+ },
+ {
+ "epoch": 1.2548463356973996,
+ "grad_norm": 2.8041458129882812,
+ "learning_rate": 4.5092607115538686e-06,
+ "loss": 0.588,
+ "step": 2654
+ },
+ {
+ "epoch": 1.2553191489361701,
+ "grad_norm": 2.679417133331299,
+ "learning_rate": 4.50888945390791e-06,
+ "loss": 0.4639,
+ "step": 2655
+ },
+ {
+ "epoch": 1.255791962174941,
+ "grad_norm": 3.1049270629882812,
+ "learning_rate": 4.508518071177214e-06,
+ "loss": 0.5857,
+ "step": 2656
+ },
+ {
+ "epoch": 1.2562647754137115,
+ "grad_norm": 2.8590362071990967,
+ "learning_rate": 4.508146563384904e-06,
+ "loss": 0.5451,
+ "step": 2657
+ },
+ {
+ "epoch": 1.2567375886524823,
+ "grad_norm": 2.9774081707000732,
+ "learning_rate": 4.507774930554114e-06,
+ "loss": 0.5493,
+ "step": 2658
+ },
+ {
+ "epoch": 1.2572104018912529,
+ "grad_norm": 2.617643356323242,
+ "learning_rate": 4.507403172707983e-06,
+ "loss": 0.5472,
+ "step": 2659
+ },
+ {
+ "epoch": 1.2576832151300237,
+ "grad_norm": 2.9195587635040283,
+ "learning_rate": 4.507031289869658e-06,
+ "loss": 0.5403,
+ "step": 2660
+ },
+ {
+ "epoch": 1.2581560283687943,
+ "grad_norm": 2.706089496612549,
+ "learning_rate": 4.506659282062295e-06,
+ "loss": 0.4899,
+ "step": 2661
+ },
+ {
+ "epoch": 1.258628841607565,
+ "grad_norm": 2.8229358196258545,
+ "learning_rate": 4.506287149309057e-06,
+ "loss": 0.5336,
+ "step": 2662
+ },
+ {
+ "epoch": 1.2591016548463356,
+ "grad_norm": 2.5295674800872803,
+ "learning_rate": 4.505914891633117e-06,
+ "loss": 0.4806,
+ "step": 2663
+ },
+ {
+ "epoch": 1.2595744680851064,
+ "grad_norm": 3.098208427429199,
+ "learning_rate": 4.505542509057651e-06,
+ "loss": 0.6039,
+ "step": 2664
+ },
+ {
+ "epoch": 1.260047281323877,
+ "grad_norm": 2.5118041038513184,
+ "learning_rate": 4.5051700016058475e-06,
+ "loss": 0.5279,
+ "step": 2665
+ },
+ {
+ "epoch": 1.2605200945626478,
+ "grad_norm": 2.6901369094848633,
+ "learning_rate": 4.5047973693009005e-06,
+ "loss": 0.5515,
+ "step": 2666
+ },
+ {
+ "epoch": 1.2609929078014184,
+ "grad_norm": 2.5622377395629883,
+ "learning_rate": 4.504424612166012e-06,
+ "loss": 0.5405,
+ "step": 2667
+ },
+ {
+ "epoch": 1.2614657210401892,
+ "grad_norm": 2.685751438140869,
+ "learning_rate": 4.5040517302243915e-06,
+ "loss": 0.5797,
+ "step": 2668
+ },
+ {
+ "epoch": 1.2619385342789597,
+ "grad_norm": 2.8525350093841553,
+ "learning_rate": 4.503678723499259e-06,
+ "loss": 0.5561,
+ "step": 2669
+ },
+ {
+ "epoch": 1.2624113475177305,
+ "grad_norm": 2.803386926651001,
+ "learning_rate": 4.503305592013836e-06,
+ "loss": 0.5376,
+ "step": 2670
+ },
+ {
+ "epoch": 1.2628841607565011,
+ "grad_norm": 2.78633189201355,
+ "learning_rate": 4.502932335791359e-06,
+ "loss": 0.4739,
+ "step": 2671
+ },
+ {
+ "epoch": 1.263356973995272,
+ "grad_norm": 2.8337297439575195,
+ "learning_rate": 4.502558954855069e-06,
+ "loss": 0.5406,
+ "step": 2672
+ },
+ {
+ "epoch": 1.2638297872340425,
+ "grad_norm": 2.610275983810425,
+ "learning_rate": 4.502185449228213e-06,
+ "loss": 0.5343,
+ "step": 2673
+ },
+ {
+ "epoch": 1.2643026004728133,
+ "grad_norm": 2.7842252254486084,
+ "learning_rate": 4.501811818934048e-06,
+ "loss": 0.532,
+ "step": 2674
+ },
+ {
+ "epoch": 1.2647754137115839,
+ "grad_norm": 2.4472389221191406,
+ "learning_rate": 4.501438063995839e-06,
+ "loss": 0.4976,
+ "step": 2675
+ },
+ {
+ "epoch": 1.2652482269503547,
+ "grad_norm": 3.076580762863159,
+ "learning_rate": 4.501064184436858e-06,
+ "loss": 0.507,
+ "step": 2676
+ },
+ {
+ "epoch": 1.2657210401891252,
+ "grad_norm": 2.5952908992767334,
+ "learning_rate": 4.500690180280384e-06,
+ "loss": 0.5498,
+ "step": 2677
+ },
+ {
+ "epoch": 1.266193853427896,
+ "grad_norm": 2.476943016052246,
+ "learning_rate": 4.500316051549706e-06,
+ "loss": 0.557,
+ "step": 2678
+ },
+ {
+ "epoch": 1.2666666666666666,
+ "grad_norm": 2.730579376220703,
+ "learning_rate": 4.499941798268118e-06,
+ "loss": 0.4975,
+ "step": 2679
+ },
+ {
+ "epoch": 1.2671394799054374,
+ "grad_norm": 2.7916698455810547,
+ "learning_rate": 4.499567420458924e-06,
+ "loss": 0.5673,
+ "step": 2680
+ },
+ {
+ "epoch": 1.267612293144208,
+ "grad_norm": 2.4249091148376465,
+ "learning_rate": 4.4991929181454355e-06,
+ "loss": 0.4836,
+ "step": 2681
+ },
+ {
+ "epoch": 1.2680851063829788,
+ "grad_norm": 2.661911725997925,
+ "learning_rate": 4.498818291350969e-06,
+ "loss": 0.5332,
+ "step": 2682
+ },
+ {
+ "epoch": 1.2685579196217494,
+ "grad_norm": 2.693657875061035,
+ "learning_rate": 4.498443540098852e-06,
+ "loss": 0.5257,
+ "step": 2683
+ },
+ {
+ "epoch": 1.2690307328605201,
+ "grad_norm": 2.609386682510376,
+ "learning_rate": 4.4980686644124195e-06,
+ "loss": 0.4918,
+ "step": 2684
+ },
+ {
+ "epoch": 1.2695035460992907,
+ "grad_norm": 3.2104930877685547,
+ "learning_rate": 4.4976936643150124e-06,
+ "loss": 0.6097,
+ "step": 2685
+ },
+ {
+ "epoch": 1.2699763593380615,
+ "grad_norm": 2.707860231399536,
+ "learning_rate": 4.49731853982998e-06,
+ "loss": 0.5109,
+ "step": 2686
+ },
+ {
+ "epoch": 1.270449172576832,
+ "grad_norm": 3.5046379566192627,
+ "learning_rate": 4.49694329098068e-06,
+ "loss": 0.5883,
+ "step": 2687
+ },
+ {
+ "epoch": 1.270921985815603,
+ "grad_norm": 2.5362324714660645,
+ "learning_rate": 4.496567917790477e-06,
+ "loss": 0.5301,
+ "step": 2688
+ },
+ {
+ "epoch": 1.2713947990543735,
+ "grad_norm": 2.7095518112182617,
+ "learning_rate": 4.496192420282746e-06,
+ "loss": 0.4772,
+ "step": 2689
+ },
+ {
+ "epoch": 1.2718676122931443,
+ "grad_norm": 2.416433095932007,
+ "learning_rate": 4.495816798480865e-06,
+ "loss": 0.5012,
+ "step": 2690
+ },
+ {
+ "epoch": 1.2723404255319148,
+ "grad_norm": 2.5362391471862793,
+ "learning_rate": 4.495441052408224e-06,
+ "loss": 0.5197,
+ "step": 2691
+ },
+ {
+ "epoch": 1.2728132387706856,
+ "grad_norm": 2.9093947410583496,
+ "learning_rate": 4.495065182088218e-06,
+ "loss": 0.4893,
+ "step": 2692
+ },
+ {
+ "epoch": 1.2732860520094562,
+ "grad_norm": 2.520470142364502,
+ "learning_rate": 4.494689187544251e-06,
+ "loss": 0.5072,
+ "step": 2693
+ },
+ {
+ "epoch": 1.273758865248227,
+ "grad_norm": 2.4385125637054443,
+ "learning_rate": 4.494313068799735e-06,
+ "loss": 0.4923,
+ "step": 2694
+ },
+ {
+ "epoch": 1.2742316784869976,
+ "grad_norm": 2.636852502822876,
+ "learning_rate": 4.493936825878089e-06,
+ "loss": 0.5409,
+ "step": 2695
+ },
+ {
+ "epoch": 1.2747044917257684,
+ "grad_norm": 2.7027053833007812,
+ "learning_rate": 4.493560458802741e-06,
+ "loss": 0.5906,
+ "step": 2696
+ },
+ {
+ "epoch": 1.275177304964539,
+ "grad_norm": 2.58752179145813,
+ "learning_rate": 4.493183967597123e-06,
+ "loss": 0.5292,
+ "step": 2697
+ },
+ {
+ "epoch": 1.2756501182033098,
+ "grad_norm": 2.7658379077911377,
+ "learning_rate": 4.49280735228468e-06,
+ "loss": 0.5613,
+ "step": 2698
+ },
+ {
+ "epoch": 1.2761229314420803,
+ "grad_norm": 3.272688388824463,
+ "learning_rate": 4.492430612888861e-06,
+ "loss": 0.5654,
+ "step": 2699
+ },
+ {
+ "epoch": 1.2765957446808511,
+ "grad_norm": 2.806819438934326,
+ "learning_rate": 4.492053749433125e-06,
+ "loss": 0.5388,
+ "step": 2700
+ },
+ {
+ "epoch": 1.2770685579196217,
+ "grad_norm": 2.879727602005005,
+ "learning_rate": 4.491676761940936e-06,
+ "loss": 0.5033,
+ "step": 2701
+ },
+ {
+ "epoch": 1.2775413711583925,
+ "grad_norm": 2.733347177505493,
+ "learning_rate": 4.4912996504357695e-06,
+ "loss": 0.5113,
+ "step": 2702
+ },
+ {
+ "epoch": 1.278014184397163,
+ "grad_norm": 2.7431252002716064,
+ "learning_rate": 4.490922414941104e-06,
+ "loss": 0.5417,
+ "step": 2703
+ },
+ {
+ "epoch": 1.2784869976359339,
+ "grad_norm": 2.9287240505218506,
+ "learning_rate": 4.490545055480431e-06,
+ "loss": 0.5875,
+ "step": 2704
+ },
+ {
+ "epoch": 1.2789598108747045,
+ "grad_norm": 2.576775550842285,
+ "learning_rate": 4.490167572077244e-06,
+ "loss": 0.5176,
+ "step": 2705
+ },
+ {
+ "epoch": 1.2794326241134752,
+ "grad_norm": 2.4335594177246094,
+ "learning_rate": 4.4897899647550505e-06,
+ "loss": 0.4749,
+ "step": 2706
+ },
+ {
+ "epoch": 1.2799054373522458,
+ "grad_norm": 2.6798062324523926,
+ "learning_rate": 4.489412233537361e-06,
+ "loss": 0.5439,
+ "step": 2707
+ },
+ {
+ "epoch": 1.2803782505910166,
+ "grad_norm": 2.8440675735473633,
+ "learning_rate": 4.489034378447693e-06,
+ "loss": 0.552,
+ "step": 2708
+ },
+ {
+ "epoch": 1.2808510638297872,
+ "grad_norm": 2.9059503078460693,
+ "learning_rate": 4.488656399509577e-06,
+ "loss": 0.5667,
+ "step": 2709
+ },
+ {
+ "epoch": 1.281323877068558,
+ "grad_norm": 2.7415006160736084,
+ "learning_rate": 4.488278296746548e-06,
+ "loss": 0.5676,
+ "step": 2710
+ },
+ {
+ "epoch": 1.2817966903073286,
+ "grad_norm": 2.4584875106811523,
+ "learning_rate": 4.487900070182147e-06,
+ "loss": 0.4787,
+ "step": 2711
+ },
+ {
+ "epoch": 1.2822695035460994,
+ "grad_norm": 2.990940809249878,
+ "learning_rate": 4.487521719839924e-06,
+ "loss": 0.5239,
+ "step": 2712
+ },
+ {
+ "epoch": 1.28274231678487,
+ "grad_norm": 3.075201988220215,
+ "learning_rate": 4.487143245743441e-06,
+ "loss": 0.5103,
+ "step": 2713
+ },
+ {
+ "epoch": 1.2832151300236407,
+ "grad_norm": 2.543341875076294,
+ "learning_rate": 4.486764647916259e-06,
+ "loss": 0.5475,
+ "step": 2714
+ },
+ {
+ "epoch": 1.2836879432624113,
+ "grad_norm": 2.9927213191986084,
+ "learning_rate": 4.486385926381957e-06,
+ "loss": 0.4923,
+ "step": 2715
+ },
+ {
+ "epoch": 1.284160756501182,
+ "grad_norm": 2.4220657348632812,
+ "learning_rate": 4.486007081164111e-06,
+ "loss": 0.543,
+ "step": 2716
+ },
+ {
+ "epoch": 1.2846335697399527,
+ "grad_norm": 2.468214988708496,
+ "learning_rate": 4.4856281122863134e-06,
+ "loss": 0.5248,
+ "step": 2717
+ },
+ {
+ "epoch": 1.2851063829787235,
+ "grad_norm": 2.633711099624634,
+ "learning_rate": 4.48524901977216e-06,
+ "loss": 0.4764,
+ "step": 2718
+ },
+ {
+ "epoch": 1.285579196217494,
+ "grad_norm": 2.8399546146392822,
+ "learning_rate": 4.484869803645254e-06,
+ "loss": 0.5503,
+ "step": 2719
+ },
+ {
+ "epoch": 1.2860520094562649,
+ "grad_norm": 2.769063949584961,
+ "learning_rate": 4.484490463929209e-06,
+ "loss": 0.5468,
+ "step": 2720
+ },
+ {
+ "epoch": 1.2865248226950354,
+ "grad_norm": 2.617863893508911,
+ "learning_rate": 4.4841110006476465e-06,
+ "loss": 0.5906,
+ "step": 2721
+ },
+ {
+ "epoch": 1.2869976359338062,
+ "grad_norm": 2.7639541625976562,
+ "learning_rate": 4.4837314138241905e-06,
+ "loss": 0.552,
+ "step": 2722
+ },
+ {
+ "epoch": 1.2874704491725768,
+ "grad_norm": 2.7711129188537598,
+ "learning_rate": 4.483351703482478e-06,
+ "loss": 0.5229,
+ "step": 2723
+ },
+ {
+ "epoch": 1.2879432624113476,
+ "grad_norm": 2.611205577850342,
+ "learning_rate": 4.482971869646152e-06,
+ "loss": 0.5055,
+ "step": 2724
+ },
+ {
+ "epoch": 1.2884160756501182,
+ "grad_norm": 2.8602211475372314,
+ "learning_rate": 4.482591912338862e-06,
+ "loss": 0.5561,
+ "step": 2725
+ },
+ {
+ "epoch": 1.2888888888888888,
+ "grad_norm": 2.5882298946380615,
+ "learning_rate": 4.4822118315842675e-06,
+ "loss": 0.5555,
+ "step": 2726
+ },
+ {
+ "epoch": 1.2893617021276595,
+ "grad_norm": 2.7533531188964844,
+ "learning_rate": 4.481831627406033e-06,
+ "loss": 0.5346,
+ "step": 2727
+ },
+ {
+ "epoch": 1.2898345153664303,
+ "grad_norm": 2.4296958446502686,
+ "learning_rate": 4.481451299827835e-06,
+ "loss": 0.4915,
+ "step": 2728
+ },
+ {
+ "epoch": 1.290307328605201,
+ "grad_norm": 2.4403445720672607,
+ "learning_rate": 4.481070848873352e-06,
+ "loss": 0.5648,
+ "step": 2729
+ },
+ {
+ "epoch": 1.2907801418439715,
+ "grad_norm": 2.473224401473999,
+ "learning_rate": 4.480690274566274e-06,
+ "loss": 0.4849,
+ "step": 2730
+ },
+ {
+ "epoch": 1.2912529550827423,
+ "grad_norm": 2.637899875640869,
+ "learning_rate": 4.480309576930297e-06,
+ "loss": 0.4968,
+ "step": 2731
+ },
+ {
+ "epoch": 1.291725768321513,
+ "grad_norm": 2.7156927585601807,
+ "learning_rate": 4.479928755989127e-06,
+ "loss": 0.4759,
+ "step": 2732
+ },
+ {
+ "epoch": 1.2921985815602837,
+ "grad_norm": 2.632786989212036,
+ "learning_rate": 4.479547811766475e-06,
+ "loss": 0.5468,
+ "step": 2733
+ },
+ {
+ "epoch": 1.2926713947990542,
+ "grad_norm": 2.529218912124634,
+ "learning_rate": 4.479166744286061e-06,
+ "loss": 0.4852,
+ "step": 2734
+ },
+ {
+ "epoch": 1.293144208037825,
+ "grad_norm": 2.561978340148926,
+ "learning_rate": 4.4787855535716115e-06,
+ "loss": 0.546,
+ "step": 2735
+ },
+ {
+ "epoch": 1.2936170212765958,
+ "grad_norm": 2.3684909343719482,
+ "learning_rate": 4.478404239646862e-06,
+ "loss": 0.5369,
+ "step": 2736
+ },
+ {
+ "epoch": 1.2940898345153664,
+ "grad_norm": 2.8940367698669434,
+ "learning_rate": 4.4780228025355566e-06,
+ "loss": 0.568,
+ "step": 2737
+ },
+ {
+ "epoch": 1.294562647754137,
+ "grad_norm": 2.6950316429138184,
+ "learning_rate": 4.477641242261445e-06,
+ "loss": 0.4576,
+ "step": 2738
+ },
+ {
+ "epoch": 1.2950354609929078,
+ "grad_norm": 2.4211716651916504,
+ "learning_rate": 4.4772595588482835e-06,
+ "loss": 0.4341,
+ "step": 2739
+ },
+ {
+ "epoch": 1.2955082742316786,
+ "grad_norm": 3.141097068786621,
+ "learning_rate": 4.47687775231984e-06,
+ "loss": 0.5944,
+ "step": 2740
+ },
+ {
+ "epoch": 1.2959810874704492,
+ "grad_norm": 3.077522039413452,
+ "learning_rate": 4.476495822699887e-06,
+ "loss": 0.5786,
+ "step": 2741
+ },
+ {
+ "epoch": 1.2964539007092197,
+ "grad_norm": 2.708139419555664,
+ "learning_rate": 4.476113770012206e-06,
+ "loss": 0.5014,
+ "step": 2742
+ },
+ {
+ "epoch": 1.2969267139479905,
+ "grad_norm": 2.7572035789489746,
+ "learning_rate": 4.475731594280586e-06,
+ "loss": 0.594,
+ "step": 2743
+ },
+ {
+ "epoch": 1.2973995271867613,
+ "grad_norm": 2.673126459121704,
+ "learning_rate": 4.475349295528822e-06,
+ "loss": 0.5317,
+ "step": 2744
+ },
+ {
+ "epoch": 1.297872340425532,
+ "grad_norm": 2.6757819652557373,
+ "learning_rate": 4.4749668737807195e-06,
+ "loss": 0.5614,
+ "step": 2745
+ },
+ {
+ "epoch": 1.2983451536643025,
+ "grad_norm": 2.7077620029449463,
+ "learning_rate": 4.47458432906009e-06,
+ "loss": 0.4916,
+ "step": 2746
+ },
+ {
+ "epoch": 1.2988179669030733,
+ "grad_norm": 2.446570873260498,
+ "learning_rate": 4.474201661390752e-06,
+ "loss": 0.5005,
+ "step": 2747
+ },
+ {
+ "epoch": 1.299290780141844,
+ "grad_norm": 2.642695665359497,
+ "learning_rate": 4.473818870796533e-06,
+ "loss": 0.5048,
+ "step": 2748
+ },
+ {
+ "epoch": 1.2997635933806146,
+ "grad_norm": 2.519824743270874,
+ "learning_rate": 4.4734359573012686e-06,
+ "loss": 0.5131,
+ "step": 2749
+ },
+ {
+ "epoch": 1.3002364066193852,
+ "grad_norm": 2.5901925563812256,
+ "learning_rate": 4.4730529209287995e-06,
+ "loss": 0.4582,
+ "step": 2750
+ },
+ {
+ "epoch": 1.300709219858156,
+ "grad_norm": 2.6789121627807617,
+ "learning_rate": 4.472669761702978e-06,
+ "loss": 0.5685,
+ "step": 2751
+ },
+ {
+ "epoch": 1.3011820330969268,
+ "grad_norm": 2.408003807067871,
+ "learning_rate": 4.472286479647659e-06,
+ "loss": 0.4329,
+ "step": 2752
+ },
+ {
+ "epoch": 1.3016548463356974,
+ "grad_norm": 2.681403398513794,
+ "learning_rate": 4.47190307478671e-06,
+ "loss": 0.4853,
+ "step": 2753
+ },
+ {
+ "epoch": 1.302127659574468,
+ "grad_norm": 2.9923183917999268,
+ "learning_rate": 4.4715195471440025e-06,
+ "loss": 0.5184,
+ "step": 2754
+ },
+ {
+ "epoch": 1.3026004728132388,
+ "grad_norm": 2.5100321769714355,
+ "learning_rate": 4.471135896743418e-06,
+ "loss": 0.5148,
+ "step": 2755
+ },
+ {
+ "epoch": 1.3030732860520096,
+ "grad_norm": 2.267881393432617,
+ "learning_rate": 4.4707521236088444e-06,
+ "loss": 0.5028,
+ "step": 2756
+ },
+ {
+ "epoch": 1.3035460992907801,
+ "grad_norm": 2.7779829502105713,
+ "learning_rate": 4.4703682277641775e-06,
+ "loss": 0.5724,
+ "step": 2757
+ },
+ {
+ "epoch": 1.3040189125295507,
+ "grad_norm": 2.4262194633483887,
+ "learning_rate": 4.4699842092333205e-06,
+ "loss": 0.5341,
+ "step": 2758
+ },
+ {
+ "epoch": 1.3044917257683215,
+ "grad_norm": 2.8682050704956055,
+ "learning_rate": 4.469600068040185e-06,
+ "loss": 0.6114,
+ "step": 2759
+ },
+ {
+ "epoch": 1.3049645390070923,
+ "grad_norm": 2.647853374481201,
+ "learning_rate": 4.46921580420869e-06,
+ "loss": 0.5107,
+ "step": 2760
+ },
+ {
+ "epoch": 1.3054373522458629,
+ "grad_norm": 2.561998128890991,
+ "learning_rate": 4.468831417762762e-06,
+ "loss": 0.6019,
+ "step": 2761
+ },
+ {
+ "epoch": 1.3059101654846335,
+ "grad_norm": 2.763425350189209,
+ "learning_rate": 4.468446908726334e-06,
+ "loss": 0.572,
+ "step": 2762
+ },
+ {
+ "epoch": 1.3063829787234043,
+ "grad_norm": 2.7052934169769287,
+ "learning_rate": 4.468062277123348e-06,
+ "loss": 0.4876,
+ "step": 2763
+ },
+ {
+ "epoch": 1.306855791962175,
+ "grad_norm": 2.997845411300659,
+ "learning_rate": 4.467677522977755e-06,
+ "loss": 0.5683,
+ "step": 2764
+ },
+ {
+ "epoch": 1.3073286052009456,
+ "grad_norm": 2.503129005432129,
+ "learning_rate": 4.46729264631351e-06,
+ "loss": 0.4951,
+ "step": 2765
+ },
+ {
+ "epoch": 1.3078014184397162,
+ "grad_norm": 2.617492437362671,
+ "learning_rate": 4.466907647154578e-06,
+ "loss": 0.5054,
+ "step": 2766
+ },
+ {
+ "epoch": 1.308274231678487,
+ "grad_norm": 2.934967279434204,
+ "learning_rate": 4.4665225255249315e-06,
+ "loss": 0.5299,
+ "step": 2767
+ },
+ {
+ "epoch": 1.3087470449172578,
+ "grad_norm": 2.787252187728882,
+ "learning_rate": 4.46613728144855e-06,
+ "loss": 0.4652,
+ "step": 2768
+ },
+ {
+ "epoch": 1.3092198581560284,
+ "grad_norm": 2.567439556121826,
+ "learning_rate": 4.465751914949422e-06,
+ "loss": 0.538,
+ "step": 2769
+ },
+ {
+ "epoch": 1.309692671394799,
+ "grad_norm": 2.6386024951934814,
+ "learning_rate": 4.4653664260515416e-06,
+ "loss": 0.464,
+ "step": 2770
+ },
+ {
+ "epoch": 1.3101654846335697,
+ "grad_norm": 2.966848134994507,
+ "learning_rate": 4.464980814778912e-06,
+ "loss": 0.4889,
+ "step": 2771
+ },
+ {
+ "epoch": 1.3106382978723405,
+ "grad_norm": 2.571256637573242,
+ "learning_rate": 4.464595081155542e-06,
+ "loss": 0.4979,
+ "step": 2772
+ },
+ {
+ "epoch": 1.3111111111111111,
+ "grad_norm": 2.774203062057495,
+ "learning_rate": 4.4642092252054515e-06,
+ "loss": 0.5366,
+ "step": 2773
+ },
+ {
+ "epoch": 1.3115839243498817,
+ "grad_norm": 2.682969331741333,
+ "learning_rate": 4.463823246952666e-06,
+ "loss": 0.5118,
+ "step": 2774
+ },
+ {
+ "epoch": 1.3120567375886525,
+ "grad_norm": 2.4873905181884766,
+ "learning_rate": 4.463437146421217e-06,
+ "loss": 0.5548,
+ "step": 2775
+ },
+ {
+ "epoch": 1.3125295508274233,
+ "grad_norm": 2.6769661903381348,
+ "learning_rate": 4.463050923635147e-06,
+ "loss": 0.5023,
+ "step": 2776
+ },
+ {
+ "epoch": 1.3130023640661939,
+ "grad_norm": 2.7190892696380615,
+ "learning_rate": 4.462664578618503e-06,
+ "loss": 0.5546,
+ "step": 2777
+ },
+ {
+ "epoch": 1.3134751773049644,
+ "grad_norm": 2.8193624019622803,
+ "learning_rate": 4.462278111395343e-06,
+ "loss": 0.5265,
+ "step": 2778
+ },
+ {
+ "epoch": 1.3139479905437352,
+ "grad_norm": 2.7324538230895996,
+ "learning_rate": 4.461891521989728e-06,
+ "loss": 0.5449,
+ "step": 2779
+ },
+ {
+ "epoch": 1.314420803782506,
+ "grad_norm": 2.87320876121521,
+ "learning_rate": 4.4615048104257305e-06,
+ "loss": 0.5367,
+ "step": 2780
+ },
+ {
+ "epoch": 1.3148936170212766,
+ "grad_norm": 2.6777031421661377,
+ "learning_rate": 4.4611179767274306e-06,
+ "loss": 0.5026,
+ "step": 2781
+ },
+ {
+ "epoch": 1.3153664302600472,
+ "grad_norm": 3.714524269104004,
+ "learning_rate": 4.460731020918913e-06,
+ "loss": 0.569,
+ "step": 2782
+ },
+ {
+ "epoch": 1.315839243498818,
+ "grad_norm": 2.7493600845336914,
+ "learning_rate": 4.460343943024273e-06,
+ "loss": 0.5826,
+ "step": 2783
+ },
+ {
+ "epoch": 1.3163120567375888,
+ "grad_norm": 2.6544079780578613,
+ "learning_rate": 4.459956743067609e-06,
+ "loss": 0.5399,
+ "step": 2784
+ },
+ {
+ "epoch": 1.3167848699763594,
+ "grad_norm": 2.4338037967681885,
+ "learning_rate": 4.459569421073036e-06,
+ "loss": 0.5186,
+ "step": 2785
+ },
+ {
+ "epoch": 1.31725768321513,
+ "grad_norm": 2.9312374591827393,
+ "learning_rate": 4.459181977064665e-06,
+ "loss": 0.5571,
+ "step": 2786
+ },
+ {
+ "epoch": 1.3177304964539007,
+ "grad_norm": 2.5988922119140625,
+ "learning_rate": 4.458794411066624e-06,
+ "loss": 0.5926,
+ "step": 2787
+ },
+ {
+ "epoch": 1.3182033096926715,
+ "grad_norm": 2.5193772315979004,
+ "learning_rate": 4.458406723103044e-06,
+ "loss": 0.5243,
+ "step": 2788
+ },
+ {
+ "epoch": 1.318676122931442,
+ "grad_norm": 2.8653743267059326,
+ "learning_rate": 4.458018913198066e-06,
+ "loss": 0.5421,
+ "step": 2789
+ },
+ {
+ "epoch": 1.3191489361702127,
+ "grad_norm": 2.486245632171631,
+ "learning_rate": 4.457630981375834e-06,
+ "loss": 0.4862,
+ "step": 2790
+ },
+ {
+ "epoch": 1.3196217494089835,
+ "grad_norm": 3.155435800552368,
+ "learning_rate": 4.457242927660506e-06,
+ "loss": 0.5386,
+ "step": 2791
+ },
+ {
+ "epoch": 1.3200945626477543,
+ "grad_norm": 3.102023124694824,
+ "learning_rate": 4.456854752076242e-06,
+ "loss": 0.5527,
+ "step": 2792
+ },
+ {
+ "epoch": 1.3205673758865248,
+ "grad_norm": 2.7995986938476562,
+ "learning_rate": 4.456466454647215e-06,
+ "loss": 0.4364,
+ "step": 2793
+ },
+ {
+ "epoch": 1.3210401891252954,
+ "grad_norm": 2.8328311443328857,
+ "learning_rate": 4.456078035397599e-06,
+ "loss": 0.5516,
+ "step": 2794
+ },
+ {
+ "epoch": 1.3215130023640662,
+ "grad_norm": 2.606161594390869,
+ "learning_rate": 4.455689494351581e-06,
+ "loss": 0.5042,
+ "step": 2795
+ },
+ {
+ "epoch": 1.321985815602837,
+ "grad_norm": 2.6344757080078125,
+ "learning_rate": 4.455300831533354e-06,
+ "loss": 0.4807,
+ "step": 2796
+ },
+ {
+ "epoch": 1.3224586288416076,
+ "grad_norm": 2.8539786338806152,
+ "learning_rate": 4.454912046967118e-06,
+ "loss": 0.4694,
+ "step": 2797
+ },
+ {
+ "epoch": 1.3229314420803782,
+ "grad_norm": 2.849066734313965,
+ "learning_rate": 4.454523140677081e-06,
+ "loss": 0.5037,
+ "step": 2798
+ },
+ {
+ "epoch": 1.323404255319149,
+ "grad_norm": 2.6803371906280518,
+ "learning_rate": 4.454134112687458e-06,
+ "loss": 0.4959,
+ "step": 2799
+ },
+ {
+ "epoch": 1.3238770685579198,
+ "grad_norm": 3.0546066761016846,
+ "learning_rate": 4.453744963022473e-06,
+ "loss": 0.5935,
+ "step": 2800
+ },
+ {
+ "epoch": 1.3243498817966903,
+ "grad_norm": 2.625602960586548,
+ "learning_rate": 4.453355691706356e-06,
+ "loss": 0.5349,
+ "step": 2801
+ },
+ {
+ "epoch": 1.324822695035461,
+ "grad_norm": 2.7568554878234863,
+ "learning_rate": 4.452966298763345e-06,
+ "loss": 0.5012,
+ "step": 2802
+ },
+ {
+ "epoch": 1.3252955082742317,
+ "grad_norm": 2.940427303314209,
+ "learning_rate": 4.452576784217686e-06,
+ "loss": 0.5246,
+ "step": 2803
+ },
+ {
+ "epoch": 1.3257683215130025,
+ "grad_norm": 2.5485289096832275,
+ "learning_rate": 4.452187148093633e-06,
+ "loss": 0.5282,
+ "step": 2804
+ },
+ {
+ "epoch": 1.326241134751773,
+ "grad_norm": 2.8152987957000732,
+ "learning_rate": 4.4517973904154455e-06,
+ "loss": 0.5468,
+ "step": 2805
+ },
+ {
+ "epoch": 1.3267139479905437,
+ "grad_norm": 2.9399688243865967,
+ "learning_rate": 4.451407511207393e-06,
+ "loss": 0.5586,
+ "step": 2806
+ },
+ {
+ "epoch": 1.3271867612293144,
+ "grad_norm": 2.3870036602020264,
+ "learning_rate": 4.451017510493751e-06,
+ "loss": 0.4807,
+ "step": 2807
+ },
+ {
+ "epoch": 1.327659574468085,
+ "grad_norm": 3.4667887687683105,
+ "learning_rate": 4.450627388298805e-06,
+ "loss": 0.5571,
+ "step": 2808
+ },
+ {
+ "epoch": 1.3281323877068558,
+ "grad_norm": 2.685986042022705,
+ "learning_rate": 4.450237144646844e-06,
+ "loss": 0.5525,
+ "step": 2809
+ },
+ {
+ "epoch": 1.3286052009456264,
+ "grad_norm": 2.8529131412506104,
+ "learning_rate": 4.449846779562168e-06,
+ "loss": 0.491,
+ "step": 2810
+ },
+ {
+ "epoch": 1.3290780141843972,
+ "grad_norm": 2.7360332012176514,
+ "learning_rate": 4.449456293069082e-06,
+ "loss": 0.5574,
+ "step": 2811
+ },
+ {
+ "epoch": 1.3295508274231678,
+ "grad_norm": 2.4656026363372803,
+ "learning_rate": 4.4490656851919015e-06,
+ "loss": 0.4678,
+ "step": 2812
+ },
+ {
+ "epoch": 1.3300236406619386,
+ "grad_norm": 2.602651357650757,
+ "learning_rate": 4.448674955954947e-06,
+ "loss": 0.5118,
+ "step": 2813
+ },
+ {
+ "epoch": 1.3304964539007091,
+ "grad_norm": 3.0129756927490234,
+ "learning_rate": 4.448284105382548e-06,
+ "loss": 0.6136,
+ "step": 2814
+ },
+ {
+ "epoch": 1.33096926713948,
+ "grad_norm": 2.8499927520751953,
+ "learning_rate": 4.447893133499039e-06,
+ "loss": 0.5286,
+ "step": 2815
+ },
+ {
+ "epoch": 1.3314420803782505,
+ "grad_norm": 2.8320744037628174,
+ "learning_rate": 4.447502040328767e-06,
+ "loss": 0.5186,
+ "step": 2816
+ },
+ {
+ "epoch": 1.3319148936170213,
+ "grad_norm": 2.499950885772705,
+ "learning_rate": 4.447110825896084e-06,
+ "loss": 0.5338,
+ "step": 2817
+ },
+ {
+ "epoch": 1.3323877068557919,
+ "grad_norm": 2.530895233154297,
+ "learning_rate": 4.446719490225346e-06,
+ "loss": 0.5151,
+ "step": 2818
+ },
+ {
+ "epoch": 1.3328605200945627,
+ "grad_norm": 2.5276098251342773,
+ "learning_rate": 4.446328033340921e-06,
+ "loss": 0.5424,
+ "step": 2819
+ },
+ {
+ "epoch": 1.3333333333333333,
+ "grad_norm": 2.90218186378479,
+ "learning_rate": 4.4459364552671845e-06,
+ "loss": 0.5747,
+ "step": 2820
+ },
+ {
+ "epoch": 1.333806146572104,
+ "grad_norm": 2.500943183898926,
+ "learning_rate": 4.445544756028518e-06,
+ "loss": 0.5459,
+ "step": 2821
+ },
+ {
+ "epoch": 1.3342789598108746,
+ "grad_norm": 2.960374355316162,
+ "learning_rate": 4.44515293564931e-06,
+ "loss": 0.6092,
+ "step": 2822
+ },
+ {
+ "epoch": 1.3347517730496454,
+ "grad_norm": 2.813671827316284,
+ "learning_rate": 4.444760994153958e-06,
+ "loss": 0.5536,
+ "step": 2823
+ },
+ {
+ "epoch": 1.335224586288416,
+ "grad_norm": 2.7147483825683594,
+ "learning_rate": 4.444368931566867e-06,
+ "loss": 0.5291,
+ "step": 2824
+ },
+ {
+ "epoch": 1.3356973995271868,
+ "grad_norm": 2.710101842880249,
+ "learning_rate": 4.443976747912447e-06,
+ "loss": 0.5138,
+ "step": 2825
+ },
+ {
+ "epoch": 1.3361702127659574,
+ "grad_norm": 2.711419105529785,
+ "learning_rate": 4.443584443215121e-06,
+ "loss": 0.5223,
+ "step": 2826
+ },
+ {
+ "epoch": 1.3366430260047282,
+ "grad_norm": 2.887472152709961,
+ "learning_rate": 4.443192017499313e-06,
+ "loss": 0.5464,
+ "step": 2827
+ },
+ {
+ "epoch": 1.3371158392434987,
+ "grad_norm": 2.8867223262786865,
+ "learning_rate": 4.4427994707894585e-06,
+ "loss": 0.5748,
+ "step": 2828
+ },
+ {
+ "epoch": 1.3375886524822695,
+ "grad_norm": 2.407247543334961,
+ "learning_rate": 4.44240680311e-06,
+ "loss": 0.4727,
+ "step": 2829
+ },
+ {
+ "epoch": 1.3380614657210401,
+ "grad_norm": 2.578420877456665,
+ "learning_rate": 4.4420140144853865e-06,
+ "loss": 0.5129,
+ "step": 2830
+ },
+ {
+ "epoch": 1.338534278959811,
+ "grad_norm": 2.884373426437378,
+ "learning_rate": 4.441621104940077e-06,
+ "loss": 0.5366,
+ "step": 2831
+ },
+ {
+ "epoch": 1.3390070921985815,
+ "grad_norm": 2.8652374744415283,
+ "learning_rate": 4.441228074498534e-06,
+ "loss": 0.5045,
+ "step": 2832
+ },
+ {
+ "epoch": 1.3394799054373523,
+ "grad_norm": 2.5380210876464844,
+ "learning_rate": 4.440834923185231e-06,
+ "loss": 0.509,
+ "step": 2833
+ },
+ {
+ "epoch": 1.3399527186761229,
+ "grad_norm": 2.415734052658081,
+ "learning_rate": 4.440441651024648e-06,
+ "loss": 0.5066,
+ "step": 2834
+ },
+ {
+ "epoch": 1.3404255319148937,
+ "grad_norm": 2.503051996231079,
+ "learning_rate": 4.440048258041272e-06,
+ "loss": 0.5118,
+ "step": 2835
+ },
+ {
+ "epoch": 1.3408983451536642,
+ "grad_norm": 3.351001024246216,
+ "learning_rate": 4.439654744259598e-06,
+ "loss": 0.5758,
+ "step": 2836
+ },
+ {
+ "epoch": 1.341371158392435,
+ "grad_norm": 2.7368781566619873,
+ "learning_rate": 4.439261109704129e-06,
+ "loss": 0.5674,
+ "step": 2837
+ },
+ {
+ "epoch": 1.3418439716312056,
+ "grad_norm": 3.008199453353882,
+ "learning_rate": 4.438867354399372e-06,
+ "loss": 0.5891,
+ "step": 2838
+ },
+ {
+ "epoch": 1.3423167848699764,
+ "grad_norm": 2.538907766342163,
+ "learning_rate": 4.438473478369847e-06,
+ "loss": 0.5102,
+ "step": 2839
+ },
+ {
+ "epoch": 1.342789598108747,
+ "grad_norm": 2.7169063091278076,
+ "learning_rate": 4.438079481640079e-06,
+ "loss": 0.6131,
+ "step": 2840
+ },
+ {
+ "epoch": 1.3432624113475178,
+ "grad_norm": 2.7411608695983887,
+ "learning_rate": 4.437685364234601e-06,
+ "loss": 0.5337,
+ "step": 2841
+ },
+ {
+ "epoch": 1.3437352245862884,
+ "grad_norm": 3.2374939918518066,
+ "learning_rate": 4.43729112617795e-06,
+ "loss": 0.5401,
+ "step": 2842
+ },
+ {
+ "epoch": 1.3442080378250592,
+ "grad_norm": 2.4712226390838623,
+ "learning_rate": 4.436896767494676e-06,
+ "loss": 0.5365,
+ "step": 2843
+ },
+ {
+ "epoch": 1.3446808510638297,
+ "grad_norm": 2.661619186401367,
+ "learning_rate": 4.436502288209334e-06,
+ "loss": 0.4919,
+ "step": 2844
+ },
+ {
+ "epoch": 1.3451536643026005,
+ "grad_norm": 2.5943779945373535,
+ "learning_rate": 4.4361076883464845e-06,
+ "loss": 0.5253,
+ "step": 2845
+ },
+ {
+ "epoch": 1.345626477541371,
+ "grad_norm": 2.672297477722168,
+ "learning_rate": 4.4357129679307e-06,
+ "loss": 0.541,
+ "step": 2846
+ },
+ {
+ "epoch": 1.346099290780142,
+ "grad_norm": 2.6830925941467285,
+ "learning_rate": 4.435318126986557e-06,
+ "loss": 0.5641,
+ "step": 2847
+ },
+ {
+ "epoch": 1.3465721040189125,
+ "grad_norm": 2.7394626140594482,
+ "learning_rate": 4.434923165538639e-06,
+ "loss": 0.5591,
+ "step": 2848
+ },
+ {
+ "epoch": 1.3470449172576833,
+ "grad_norm": 2.9656317234039307,
+ "learning_rate": 4.434528083611541e-06,
+ "loss": 0.515,
+ "step": 2849
+ },
+ {
+ "epoch": 1.3475177304964538,
+ "grad_norm": 3.30155086517334,
+ "learning_rate": 4.434132881229861e-06,
+ "loss": 0.5871,
+ "step": 2850
+ },
+ {
+ "epoch": 1.3479905437352246,
+ "grad_norm": 2.6222476959228516,
+ "learning_rate": 4.433737558418209e-06,
+ "loss": 0.5143,
+ "step": 2851
+ },
+ {
+ "epoch": 1.3484633569739952,
+ "grad_norm": 2.903158187866211,
+ "learning_rate": 4.4333421152011965e-06,
+ "loss": 0.4484,
+ "step": 2852
+ },
+ {
+ "epoch": 1.348936170212766,
+ "grad_norm": 2.863116979598999,
+ "learning_rate": 4.432946551603449e-06,
+ "loss": 0.5213,
+ "step": 2853
+ },
+ {
+ "epoch": 1.3494089834515366,
+ "grad_norm": 2.8253962993621826,
+ "learning_rate": 4.432550867649596e-06,
+ "loss": 0.5713,
+ "step": 2854
+ },
+ {
+ "epoch": 1.3498817966903074,
+ "grad_norm": 2.652493953704834,
+ "learning_rate": 4.432155063364273e-06,
+ "loss": 0.5559,
+ "step": 2855
+ },
+ {
+ "epoch": 1.350354609929078,
+ "grad_norm": 2.4289376735687256,
+ "learning_rate": 4.431759138772127e-06,
+ "loss": 0.5122,
+ "step": 2856
+ },
+ {
+ "epoch": 1.3508274231678488,
+ "grad_norm": 2.6329853534698486,
+ "learning_rate": 4.43136309389781e-06,
+ "loss": 0.5332,
+ "step": 2857
+ },
+ {
+ "epoch": 1.3513002364066193,
+ "grad_norm": 2.431103229522705,
+ "learning_rate": 4.430966928765982e-06,
+ "loss": 0.4863,
+ "step": 2858
+ },
+ {
+ "epoch": 1.3517730496453901,
+ "grad_norm": 2.7529025077819824,
+ "learning_rate": 4.4305706434013106e-06,
+ "loss": 0.5263,
+ "step": 2859
+ },
+ {
+ "epoch": 1.3522458628841607,
+ "grad_norm": 2.884605646133423,
+ "learning_rate": 4.43017423782847e-06,
+ "loss": 0.564,
+ "step": 2860
+ },
+ {
+ "epoch": 1.3527186761229315,
+ "grad_norm": 3.027771234512329,
+ "learning_rate": 4.4297777120721435e-06,
+ "loss": 0.5846,
+ "step": 2861
+ },
+ {
+ "epoch": 1.353191489361702,
+ "grad_norm": 3.0140626430511475,
+ "learning_rate": 4.4293810661570205e-06,
+ "loss": 0.6621,
+ "step": 2862
+ },
+ {
+ "epoch": 1.3536643026004729,
+ "grad_norm": 2.721799612045288,
+ "learning_rate": 4.428984300107799e-06,
+ "loss": 0.5566,
+ "step": 2863
+ },
+ {
+ "epoch": 1.3541371158392435,
+ "grad_norm": 3.0016496181488037,
+ "learning_rate": 4.428587413949183e-06,
+ "loss": 0.5525,
+ "step": 2864
+ },
+ {
+ "epoch": 1.3546099290780143,
+ "grad_norm": 2.77138614654541,
+ "learning_rate": 4.428190407705886e-06,
+ "loss": 0.6016,
+ "step": 2865
+ },
+ {
+ "epoch": 1.3550827423167848,
+ "grad_norm": 2.9783477783203125,
+ "learning_rate": 4.427793281402627e-06,
+ "loss": 0.5556,
+ "step": 2866
+ },
+ {
+ "epoch": 1.3555555555555556,
+ "grad_norm": 2.2490382194519043,
+ "learning_rate": 4.427396035064132e-06,
+ "loss": 0.5138,
+ "step": 2867
+ },
+ {
+ "epoch": 1.3560283687943262,
+ "grad_norm": 2.442225217819214,
+ "learning_rate": 4.426998668715139e-06,
+ "loss": 0.4843,
+ "step": 2868
+ },
+ {
+ "epoch": 1.356501182033097,
+ "grad_norm": 2.74040150642395,
+ "learning_rate": 4.426601182380388e-06,
+ "loss": 0.54,
+ "step": 2869
+ },
+ {
+ "epoch": 1.3569739952718676,
+ "grad_norm": 2.4434332847595215,
+ "learning_rate": 4.426203576084629e-06,
+ "loss": 0.5199,
+ "step": 2870
+ },
+ {
+ "epoch": 1.3574468085106384,
+ "grad_norm": 2.6380388736724854,
+ "learning_rate": 4.42580584985262e-06,
+ "loss": 0.5049,
+ "step": 2871
+ },
+ {
+ "epoch": 1.357919621749409,
+ "grad_norm": 2.7324254512786865,
+ "learning_rate": 4.425408003709125e-06,
+ "loss": 0.5036,
+ "step": 2872
+ },
+ {
+ "epoch": 1.3583924349881797,
+ "grad_norm": 2.661012649536133,
+ "learning_rate": 4.425010037678916e-06,
+ "loss": 0.4965,
+ "step": 2873
+ },
+ {
+ "epoch": 1.3588652482269503,
+ "grad_norm": 2.5380208492279053,
+ "learning_rate": 4.424611951786773e-06,
+ "loss": 0.4293,
+ "step": 2874
+ },
+ {
+ "epoch": 1.3593380614657211,
+ "grad_norm": 2.6060714721679688,
+ "learning_rate": 4.424213746057483e-06,
+ "loss": 0.5335,
+ "step": 2875
+ },
+ {
+ "epoch": 1.3598108747044917,
+ "grad_norm": 2.98282527923584,
+ "learning_rate": 4.423815420515841e-06,
+ "loss": 0.5626,
+ "step": 2876
+ },
+ {
+ "epoch": 1.3602836879432625,
+ "grad_norm": 2.779371500015259,
+ "learning_rate": 4.423416975186647e-06,
+ "loss": 0.5353,
+ "step": 2877
+ },
+ {
+ "epoch": 1.360756501182033,
+ "grad_norm": 2.8033530712127686,
+ "learning_rate": 4.423018410094713e-06,
+ "loss": 0.538,
+ "step": 2878
+ },
+ {
+ "epoch": 1.3612293144208039,
+ "grad_norm": 3.225177764892578,
+ "learning_rate": 4.422619725264855e-06,
+ "loss": 0.5441,
+ "step": 2879
+ },
+ {
+ "epoch": 1.3617021276595744,
+ "grad_norm": 2.959135055541992,
+ "learning_rate": 4.422220920721896e-06,
+ "loss": 0.5293,
+ "step": 2880
+ },
+ {
+ "epoch": 1.3621749408983452,
+ "grad_norm": 2.5558884143829346,
+ "learning_rate": 4.4218219964906704e-06,
+ "loss": 0.442,
+ "step": 2881
+ },
+ {
+ "epoch": 1.3626477541371158,
+ "grad_norm": 2.694899797439575,
+ "learning_rate": 4.421422952596015e-06,
+ "loss": 0.5318,
+ "step": 2882
+ },
+ {
+ "epoch": 1.3631205673758866,
+ "grad_norm": 2.7909531593322754,
+ "learning_rate": 4.421023789062777e-06,
+ "loss": 0.6648,
+ "step": 2883
+ },
+ {
+ "epoch": 1.3635933806146572,
+ "grad_norm": 2.421995162963867,
+ "learning_rate": 4.420624505915813e-06,
+ "loss": 0.4644,
+ "step": 2884
+ },
+ {
+ "epoch": 1.364066193853428,
+ "grad_norm": 2.5876688957214355,
+ "learning_rate": 4.420225103179981e-06,
+ "loss": 0.5743,
+ "step": 2885
+ },
+ {
+ "epoch": 1.3645390070921986,
+ "grad_norm": 2.89341139793396,
+ "learning_rate": 4.419825580880152e-06,
+ "loss": 0.5454,
+ "step": 2886
+ },
+ {
+ "epoch": 1.3650118203309693,
+ "grad_norm": 2.534708261489868,
+ "learning_rate": 4.419425939041203e-06,
+ "loss": 0.5572,
+ "step": 2887
+ },
+ {
+ "epoch": 1.36548463356974,
+ "grad_norm": 2.6052141189575195,
+ "learning_rate": 4.419026177688017e-06,
+ "loss": 0.4763,
+ "step": 2888
+ },
+ {
+ "epoch": 1.3659574468085105,
+ "grad_norm": 2.723720073699951,
+ "learning_rate": 4.4186262968454854e-06,
+ "loss": 0.5659,
+ "step": 2889
+ },
+ {
+ "epoch": 1.3664302600472813,
+ "grad_norm": 2.8909599781036377,
+ "learning_rate": 4.418226296538507e-06,
+ "loss": 0.4996,
+ "step": 2890
+ },
+ {
+ "epoch": 1.366903073286052,
+ "grad_norm": 2.551375389099121,
+ "learning_rate": 4.417826176791988e-06,
+ "loss": 0.5259,
+ "step": 2891
+ },
+ {
+ "epoch": 1.3673758865248227,
+ "grad_norm": 3.360267162322998,
+ "learning_rate": 4.417425937630843e-06,
+ "loss": 0.5381,
+ "step": 2892
+ },
+ {
+ "epoch": 1.3678486997635932,
+ "grad_norm": 2.7611942291259766,
+ "learning_rate": 4.417025579079992e-06,
+ "loss": 0.6022,
+ "step": 2893
+ },
+ {
+ "epoch": 1.368321513002364,
+ "grad_norm": 2.5931224822998047,
+ "learning_rate": 4.416625101164365e-06,
+ "loss": 0.5102,
+ "step": 2894
+ },
+ {
+ "epoch": 1.3687943262411348,
+ "grad_norm": 2.5888102054595947,
+ "learning_rate": 4.416224503908897e-06,
+ "loss": 0.4955,
+ "step": 2895
+ },
+ {
+ "epoch": 1.3692671394799054,
+ "grad_norm": 2.6262896060943604,
+ "learning_rate": 4.41582378733853e-06,
+ "loss": 0.5101,
+ "step": 2896
+ },
+ {
+ "epoch": 1.369739952718676,
+ "grad_norm": 3.339170217514038,
+ "learning_rate": 4.415422951478218e-06,
+ "loss": 0.4939,
+ "step": 2897
+ },
+ {
+ "epoch": 1.3702127659574468,
+ "grad_norm": 2.940866708755493,
+ "learning_rate": 4.415021996352917e-06,
+ "loss": 0.5157,
+ "step": 2898
+ },
+ {
+ "epoch": 1.3706855791962176,
+ "grad_norm": 2.7423818111419678,
+ "learning_rate": 4.414620921987594e-06,
+ "loss": 0.5308,
+ "step": 2899
+ },
+ {
+ "epoch": 1.3711583924349882,
+ "grad_norm": 2.7177040576934814,
+ "learning_rate": 4.414219728407221e-06,
+ "loss": 0.5429,
+ "step": 2900
+ },
+ {
+ "epoch": 1.3716312056737587,
+ "grad_norm": 2.560774087905884,
+ "learning_rate": 4.4138184156367794e-06,
+ "loss": 0.5266,
+ "step": 2901
+ },
+ {
+ "epoch": 1.3721040189125295,
+ "grad_norm": 2.5649116039276123,
+ "learning_rate": 4.413416983701256e-06,
+ "loss": 0.4718,
+ "step": 2902
+ },
+ {
+ "epoch": 1.3725768321513003,
+ "grad_norm": 2.8547167778015137,
+ "learning_rate": 4.413015432625648e-06,
+ "loss": 0.5129,
+ "step": 2903
+ },
+ {
+ "epoch": 1.373049645390071,
+ "grad_norm": 2.5413618087768555,
+ "learning_rate": 4.412613762434958e-06,
+ "loss": 0.5738,
+ "step": 2904
+ },
+ {
+ "epoch": 1.3735224586288415,
+ "grad_norm": 3.3252241611480713,
+ "learning_rate": 4.412211973154195e-06,
+ "loss": 0.5639,
+ "step": 2905
+ },
+ {
+ "epoch": 1.3739952718676123,
+ "grad_norm": 2.869102954864502,
+ "learning_rate": 4.411810064808376e-06,
+ "loss": 0.5384,
+ "step": 2906
+ },
+ {
+ "epoch": 1.374468085106383,
+ "grad_norm": 2.703199863433838,
+ "learning_rate": 4.411408037422529e-06,
+ "loss": 0.5742,
+ "step": 2907
+ },
+ {
+ "epoch": 1.3749408983451537,
+ "grad_norm": 2.685450792312622,
+ "learning_rate": 4.411005891021684e-06,
+ "loss": 0.5121,
+ "step": 2908
+ },
+ {
+ "epoch": 1.3754137115839242,
+ "grad_norm": 2.9572203159332275,
+ "learning_rate": 4.410603625630882e-06,
+ "loss": 0.5444,
+ "step": 2909
+ },
+ {
+ "epoch": 1.375886524822695,
+ "grad_norm": 2.707002878189087,
+ "learning_rate": 4.410201241275169e-06,
+ "loss": 0.5125,
+ "step": 2910
+ },
+ {
+ "epoch": 1.3763593380614658,
+ "grad_norm": 3.0158939361572266,
+ "learning_rate": 4.409798737979602e-06,
+ "loss": 0.5299,
+ "step": 2911
+ },
+ {
+ "epoch": 1.3768321513002364,
+ "grad_norm": 2.7932698726654053,
+ "learning_rate": 4.4093961157692415e-06,
+ "loss": 0.5437,
+ "step": 2912
+ },
+ {
+ "epoch": 1.377304964539007,
+ "grad_norm": 2.459510326385498,
+ "learning_rate": 4.408993374669156e-06,
+ "loss": 0.5548,
+ "step": 2913
+ },
+ {
+ "epoch": 1.3777777777777778,
+ "grad_norm": 2.7500696182250977,
+ "learning_rate": 4.408590514704425e-06,
+ "loss": 0.5186,
+ "step": 2914
+ },
+ {
+ "epoch": 1.3782505910165486,
+ "grad_norm": 2.7824268341064453,
+ "learning_rate": 4.4081875359001315e-06,
+ "loss": 0.4762,
+ "step": 2915
+ },
+ {
+ "epoch": 1.3787234042553191,
+ "grad_norm": 2.4202158451080322,
+ "learning_rate": 4.4077844382813675e-06,
+ "loss": 0.5005,
+ "step": 2916
+ },
+ {
+ "epoch": 1.3791962174940897,
+ "grad_norm": 2.5566670894622803,
+ "learning_rate": 4.4073812218732316e-06,
+ "loss": 0.5377,
+ "step": 2917
+ },
+ {
+ "epoch": 1.3796690307328605,
+ "grad_norm": 3.400874376296997,
+ "learning_rate": 4.406977886700831e-06,
+ "loss": 0.6637,
+ "step": 2918
+ },
+ {
+ "epoch": 1.3801418439716313,
+ "grad_norm": 2.8187878131866455,
+ "learning_rate": 4.406574432789278e-06,
+ "loss": 0.5033,
+ "step": 2919
+ },
+ {
+ "epoch": 1.3806146572104019,
+ "grad_norm": 2.5578041076660156,
+ "learning_rate": 4.406170860163697e-06,
+ "loss": 0.5293,
+ "step": 2920
+ },
+ {
+ "epoch": 1.3810874704491725,
+ "grad_norm": 2.6709718704223633,
+ "learning_rate": 4.405767168849213e-06,
+ "loss": 0.5144,
+ "step": 2921
+ },
+ {
+ "epoch": 1.3815602836879433,
+ "grad_norm": 3.049365997314453,
+ "learning_rate": 4.405363358870965e-06,
+ "loss": 0.4894,
+ "step": 2922
+ },
+ {
+ "epoch": 1.382033096926714,
+ "grad_norm": 2.5569891929626465,
+ "learning_rate": 4.404959430254095e-06,
+ "loss": 0.4929,
+ "step": 2923
+ },
+ {
+ "epoch": 1.3825059101654846,
+ "grad_norm": 2.8288230895996094,
+ "learning_rate": 4.404555383023754e-06,
+ "loss": 0.5438,
+ "step": 2924
+ },
+ {
+ "epoch": 1.3829787234042552,
+ "grad_norm": 2.8363358974456787,
+ "learning_rate": 4.404151217205102e-06,
+ "loss": 0.545,
+ "step": 2925
+ },
+ {
+ "epoch": 1.383451536643026,
+ "grad_norm": 2.720972776412964,
+ "learning_rate": 4.403746932823302e-06,
+ "loss": 0.5732,
+ "step": 2926
+ },
+ {
+ "epoch": 1.3839243498817968,
+ "grad_norm": 2.728043794631958,
+ "learning_rate": 4.403342529903528e-06,
+ "loss": 0.4944,
+ "step": 2927
+ },
+ {
+ "epoch": 1.3843971631205674,
+ "grad_norm": 2.4366135597229004,
+ "learning_rate": 4.402938008470961e-06,
+ "loss": 0.4441,
+ "step": 2928
+ },
+ {
+ "epoch": 1.384869976359338,
+ "grad_norm": 2.858454704284668,
+ "learning_rate": 4.402533368550788e-06,
+ "loss": 0.5359,
+ "step": 2929
+ },
+ {
+ "epoch": 1.3853427895981087,
+ "grad_norm": 2.805795907974243,
+ "learning_rate": 4.402128610168205e-06,
+ "loss": 0.4954,
+ "step": 2930
+ },
+ {
+ "epoch": 1.3858156028368795,
+ "grad_norm": 3.3514177799224854,
+ "learning_rate": 4.401723733348413e-06,
+ "loss": 0.579,
+ "step": 2931
+ },
+ {
+ "epoch": 1.3862884160756501,
+ "grad_norm": 2.6255125999450684,
+ "learning_rate": 4.401318738116624e-06,
+ "loss": 0.5002,
+ "step": 2932
+ },
+ {
+ "epoch": 1.3867612293144207,
+ "grad_norm": 2.3480796813964844,
+ "learning_rate": 4.400913624498054e-06,
+ "loss": 0.4688,
+ "step": 2933
+ },
+ {
+ "epoch": 1.3872340425531915,
+ "grad_norm": 2.710165023803711,
+ "learning_rate": 4.400508392517927e-06,
+ "loss": 0.5099,
+ "step": 2934
+ },
+ {
+ "epoch": 1.3877068557919623,
+ "grad_norm": 2.5820295810699463,
+ "learning_rate": 4.400103042201477e-06,
+ "loss": 0.512,
+ "step": 2935
+ },
+ {
+ "epoch": 1.3881796690307329,
+ "grad_norm": 2.750596523284912,
+ "learning_rate": 4.399697573573942e-06,
+ "loss": 0.463,
+ "step": 2936
+ },
+ {
+ "epoch": 1.3886524822695034,
+ "grad_norm": 3.497537612915039,
+ "learning_rate": 4.399291986660569e-06,
+ "loss": 0.5676,
+ "step": 2937
+ },
+ {
+ "epoch": 1.3891252955082742,
+ "grad_norm": 2.4046003818511963,
+ "learning_rate": 4.398886281486612e-06,
+ "loss": 0.5408,
+ "step": 2938
+ },
+ {
+ "epoch": 1.389598108747045,
+ "grad_norm": 2.941606283187866,
+ "learning_rate": 4.398480458077332e-06,
+ "loss": 0.5734,
+ "step": 2939
+ },
+ {
+ "epoch": 1.3900709219858156,
+ "grad_norm": 3.030214309692383,
+ "learning_rate": 4.398074516458e-06,
+ "loss": 0.5353,
+ "step": 2940
+ },
+ {
+ "epoch": 1.3905437352245862,
+ "grad_norm": 2.9991626739501953,
+ "learning_rate": 4.397668456653889e-06,
+ "loss": 0.5989,
+ "step": 2941
+ },
+ {
+ "epoch": 1.391016548463357,
+ "grad_norm": 4.163141250610352,
+ "learning_rate": 4.397262278690285e-06,
+ "loss": 0.5436,
+ "step": 2942
+ },
+ {
+ "epoch": 1.3914893617021278,
+ "grad_norm": 2.6576037406921387,
+ "learning_rate": 4.396855982592478e-06,
+ "loss": 0.5206,
+ "step": 2943
+ },
+ {
+ "epoch": 1.3919621749408984,
+ "grad_norm": 2.7729203701019287,
+ "learning_rate": 4.396449568385768e-06,
+ "loss": 0.5403,
+ "step": 2944
+ },
+ {
+ "epoch": 1.392434988179669,
+ "grad_norm": 2.4560446739196777,
+ "learning_rate": 4.396043036095457e-06,
+ "loss": 0.4924,
+ "step": 2945
+ },
+ {
+ "epoch": 1.3929078014184397,
+ "grad_norm": 2.6370556354522705,
+ "learning_rate": 4.39563638574686e-06,
+ "loss": 0.5543,
+ "step": 2946
+ },
+ {
+ "epoch": 1.3933806146572105,
+ "grad_norm": 2.593914270401001,
+ "learning_rate": 4.395229617365298e-06,
+ "loss": 0.5133,
+ "step": 2947
+ },
+ {
+ "epoch": 1.393853427895981,
+ "grad_norm": 2.3583998680114746,
+ "learning_rate": 4.394822730976099e-06,
+ "loss": 0.4436,
+ "step": 2948
+ },
+ {
+ "epoch": 1.3943262411347517,
+ "grad_norm": 3.2768537998199463,
+ "learning_rate": 4.394415726604596e-06,
+ "loss": 0.5489,
+ "step": 2949
+ },
+ {
+ "epoch": 1.3947990543735225,
+ "grad_norm": 2.88662052154541,
+ "learning_rate": 4.394008604276133e-06,
+ "loss": 0.5194,
+ "step": 2950
+ },
+ {
+ "epoch": 1.3952718676122933,
+ "grad_norm": 2.46610426902771,
+ "learning_rate": 4.393601364016059e-06,
+ "loss": 0.5255,
+ "step": 2951
+ },
+ {
+ "epoch": 1.3957446808510638,
+ "grad_norm": 3.122509241104126,
+ "learning_rate": 4.393194005849731e-06,
+ "loss": 0.6046,
+ "step": 2952
+ },
+ {
+ "epoch": 1.3962174940898344,
+ "grad_norm": 2.724926471710205,
+ "learning_rate": 4.392786529802513e-06,
+ "loss": 0.4958,
+ "step": 2953
+ },
+ {
+ "epoch": 1.3966903073286052,
+ "grad_norm": 2.491485595703125,
+ "learning_rate": 4.3923789358997785e-06,
+ "loss": 0.5209,
+ "step": 2954
+ },
+ {
+ "epoch": 1.397163120567376,
+ "grad_norm": 2.61110520362854,
+ "learning_rate": 4.3919712241669056e-06,
+ "loss": 0.5202,
+ "step": 2955
+ },
+ {
+ "epoch": 1.3976359338061466,
+ "grad_norm": 2.3814501762390137,
+ "learning_rate": 4.39156339462928e-06,
+ "loss": 0.4966,
+ "step": 2956
+ },
+ {
+ "epoch": 1.3981087470449172,
+ "grad_norm": 2.762498617172241,
+ "learning_rate": 4.391155447312296e-06,
+ "loss": 0.6025,
+ "step": 2957
+ },
+ {
+ "epoch": 1.398581560283688,
+ "grad_norm": 2.964975595474243,
+ "learning_rate": 4.390747382241355e-06,
+ "loss": 0.4845,
+ "step": 2958
+ },
+ {
+ "epoch": 1.3990543735224588,
+ "grad_norm": 3.0117249488830566,
+ "learning_rate": 4.3903391994418655e-06,
+ "loss": 0.5326,
+ "step": 2959
+ },
+ {
+ "epoch": 1.3995271867612293,
+ "grad_norm": 2.578626871109009,
+ "learning_rate": 4.389930898939243e-06,
+ "loss": 0.5271,
+ "step": 2960
+ },
+ {
+ "epoch": 1.4,
+ "grad_norm": 2.747441053390503,
+ "learning_rate": 4.38952248075891e-06,
+ "loss": 0.5553,
+ "step": 2961
+ },
+ {
+ "epoch": 1.4004728132387707,
+ "grad_norm": 2.8273086547851562,
+ "learning_rate": 4.389113944926297e-06,
+ "loss": 0.5475,
+ "step": 2962
+ },
+ {
+ "epoch": 1.4009456264775415,
+ "grad_norm": 2.55238676071167,
+ "learning_rate": 4.388705291466843e-06,
+ "loss": 0.4864,
+ "step": 2963
+ },
+ {
+ "epoch": 1.401418439716312,
+ "grad_norm": 2.597214460372925,
+ "learning_rate": 4.388296520405992e-06,
+ "loss": 0.4845,
+ "step": 2964
+ },
+ {
+ "epoch": 1.4018912529550827,
+ "grad_norm": 2.608962297439575,
+ "learning_rate": 4.387887631769196e-06,
+ "loss": 0.5544,
+ "step": 2965
+ },
+ {
+ "epoch": 1.4023640661938535,
+ "grad_norm": 2.2754876613616943,
+ "learning_rate": 4.3874786255819165e-06,
+ "loss": 0.5045,
+ "step": 2966
+ },
+ {
+ "epoch": 1.4028368794326243,
+ "grad_norm": 2.9900264739990234,
+ "learning_rate": 4.387069501869618e-06,
+ "loss": 0.562,
+ "step": 2967
+ },
+ {
+ "epoch": 1.4033096926713948,
+ "grad_norm": 2.8069417476654053,
+ "learning_rate": 4.386660260657778e-06,
+ "loss": 0.5284,
+ "step": 2968
+ },
+ {
+ "epoch": 1.4037825059101654,
+ "grad_norm": 2.68894624710083,
+ "learning_rate": 4.386250901971875e-06,
+ "loss": 0.5879,
+ "step": 2969
+ },
+ {
+ "epoch": 1.4042553191489362,
+ "grad_norm": 2.614485025405884,
+ "learning_rate": 4.385841425837399e-06,
+ "loss": 0.4771,
+ "step": 2970
+ },
+ {
+ "epoch": 1.4047281323877068,
+ "grad_norm": 2.487950325012207,
+ "learning_rate": 4.385431832279848e-06,
+ "loss": 0.5552,
+ "step": 2971
+ },
+ {
+ "epoch": 1.4052009456264776,
+ "grad_norm": 2.5098392963409424,
+ "learning_rate": 4.385022121324723e-06,
+ "loss": 0.5267,
+ "step": 2972
+ },
+ {
+ "epoch": 1.4056737588652481,
+ "grad_norm": 2.825838565826416,
+ "learning_rate": 4.384612292997537e-06,
+ "loss": 0.5336,
+ "step": 2973
+ },
+ {
+ "epoch": 1.406146572104019,
+ "grad_norm": 2.898188829421997,
+ "learning_rate": 4.384202347323806e-06,
+ "loss": 0.5685,
+ "step": 2974
+ },
+ {
+ "epoch": 1.4066193853427895,
+ "grad_norm": 2.8722569942474365,
+ "learning_rate": 4.383792284329057e-06,
+ "loss": 0.5977,
+ "step": 2975
+ },
+ {
+ "epoch": 1.4070921985815603,
+ "grad_norm": 2.832951307296753,
+ "learning_rate": 4.3833821040388235e-06,
+ "loss": 0.5766,
+ "step": 2976
+ },
+ {
+ "epoch": 1.407565011820331,
+ "grad_norm": 2.7353670597076416,
+ "learning_rate": 4.3829718064786446e-06,
+ "loss": 0.5461,
+ "step": 2977
+ },
+ {
+ "epoch": 1.4080378250591017,
+ "grad_norm": 2.6050429344177246,
+ "learning_rate": 4.3825613916740675e-06,
+ "loss": 0.5501,
+ "step": 2978
+ },
+ {
+ "epoch": 1.4085106382978723,
+ "grad_norm": 2.79719877243042,
+ "learning_rate": 4.382150859650647e-06,
+ "loss": 0.502,
+ "step": 2979
+ },
+ {
+ "epoch": 1.408983451536643,
+ "grad_norm": 2.5538079738616943,
+ "learning_rate": 4.381740210433946e-06,
+ "loss": 0.4762,
+ "step": 2980
+ },
+ {
+ "epoch": 1.4094562647754136,
+ "grad_norm": 2.7256062030792236,
+ "learning_rate": 4.381329444049533e-06,
+ "loss": 0.4692,
+ "step": 2981
+ },
+ {
+ "epoch": 1.4099290780141844,
+ "grad_norm": 2.7778146266937256,
+ "learning_rate": 4.3809185605229855e-06,
+ "loss": 0.5366,
+ "step": 2982
+ },
+ {
+ "epoch": 1.410401891252955,
+ "grad_norm": 2.6289451122283936,
+ "learning_rate": 4.380507559879887e-06,
+ "loss": 0.5412,
+ "step": 2983
+ },
+ {
+ "epoch": 1.4108747044917258,
+ "grad_norm": 2.697204828262329,
+ "learning_rate": 4.380096442145827e-06,
+ "loss": 0.5065,
+ "step": 2984
+ },
+ {
+ "epoch": 1.4113475177304964,
+ "grad_norm": 2.4709219932556152,
+ "learning_rate": 4.379685207346407e-06,
+ "loss": 0.568,
+ "step": 2985
+ },
+ {
+ "epoch": 1.4118203309692672,
+ "grad_norm": 2.9740655422210693,
+ "learning_rate": 4.379273855507231e-06,
+ "loss": 0.5512,
+ "step": 2986
+ },
+ {
+ "epoch": 1.4122931442080378,
+ "grad_norm": 3.0090627670288086,
+ "learning_rate": 4.378862386653911e-06,
+ "loss": 0.5459,
+ "step": 2987
+ },
+ {
+ "epoch": 1.4127659574468086,
+ "grad_norm": 2.8835368156433105,
+ "learning_rate": 4.378450800812071e-06,
+ "loss": 0.5357,
+ "step": 2988
+ },
+ {
+ "epoch": 1.4132387706855791,
+ "grad_norm": 2.558824062347412,
+ "learning_rate": 4.378039098007335e-06,
+ "loss": 0.536,
+ "step": 2989
+ },
+ {
+ "epoch": 1.41371158392435,
+ "grad_norm": 2.5572092533111572,
+ "learning_rate": 4.377627278265339e-06,
+ "loss": 0.5183,
+ "step": 2990
+ },
+ {
+ "epoch": 1.4141843971631205,
+ "grad_norm": 2.7356579303741455,
+ "learning_rate": 4.377215341611727e-06,
+ "loss": 0.5087,
+ "step": 2991
+ },
+ {
+ "epoch": 1.4146572104018913,
+ "grad_norm": 2.7541024684906006,
+ "learning_rate": 4.376803288072146e-06,
+ "loss": 0.4509,
+ "step": 2992
+ },
+ {
+ "epoch": 1.4151300236406619,
+ "grad_norm": 2.7548446655273438,
+ "learning_rate": 4.376391117672254e-06,
+ "loss": 0.5532,
+ "step": 2993
+ },
+ {
+ "epoch": 1.4156028368794327,
+ "grad_norm": 2.9107465744018555,
+ "learning_rate": 4.375978830437715e-06,
+ "loss": 0.5719,
+ "step": 2994
+ },
+ {
+ "epoch": 1.4160756501182032,
+ "grad_norm": 2.7077393531799316,
+ "learning_rate": 4.3755664263942e-06,
+ "loss": 0.5084,
+ "step": 2995
+ },
+ {
+ "epoch": 1.416548463356974,
+ "grad_norm": 2.764209270477295,
+ "learning_rate": 4.375153905567388e-06,
+ "loss": 0.5976,
+ "step": 2996
+ },
+ {
+ "epoch": 1.4170212765957446,
+ "grad_norm": 2.7792932987213135,
+ "learning_rate": 4.374741267982964e-06,
+ "loss": 0.5358,
+ "step": 2997
+ },
+ {
+ "epoch": 1.4174940898345154,
+ "grad_norm": 2.459212064743042,
+ "learning_rate": 4.374328513666622e-06,
+ "loss": 0.5181,
+ "step": 2998
+ },
+ {
+ "epoch": 1.417966903073286,
+ "grad_norm": 2.548546552658081,
+ "learning_rate": 4.373915642644062e-06,
+ "loss": 0.528,
+ "step": 2999
+ },
+ {
+ "epoch": 1.4184397163120568,
+ "grad_norm": 2.998138189315796,
+ "learning_rate": 4.373502654940992e-06,
+ "loss": 0.5233,
+ "step": 3000
+ },
+ {
+ "epoch": 1.4189125295508274,
+ "grad_norm": 2.604341983795166,
+ "learning_rate": 4.373089550583126e-06,
+ "loss": 0.5274,
+ "step": 3001
+ },
+ {
+ "epoch": 1.4193853427895982,
+ "grad_norm": 2.6792588233947754,
+ "learning_rate": 4.372676329596188e-06,
+ "loss": 0.5061,
+ "step": 3002
+ },
+ {
+ "epoch": 1.4198581560283687,
+ "grad_norm": 2.5182368755340576,
+ "learning_rate": 4.372262992005906e-06,
+ "loss": 0.541,
+ "step": 3003
+ },
+ {
+ "epoch": 1.4203309692671395,
+ "grad_norm": 2.690718173980713,
+ "learning_rate": 4.371849537838018e-06,
+ "loss": 0.5308,
+ "step": 3004
+ },
+ {
+ "epoch": 1.42080378250591,
+ "grad_norm": 2.6797590255737305,
+ "learning_rate": 4.371435967118266e-06,
+ "loss": 0.5728,
+ "step": 3005
+ },
+ {
+ "epoch": 1.421276595744681,
+ "grad_norm": 2.847900152206421,
+ "learning_rate": 4.371022279872403e-06,
+ "loss": 0.5053,
+ "step": 3006
+ },
+ {
+ "epoch": 1.4217494089834515,
+ "grad_norm": 2.497810125350952,
+ "learning_rate": 4.370608476126186e-06,
+ "loss": 0.5057,
+ "step": 3007
+ },
+ {
+ "epoch": 1.4222222222222223,
+ "grad_norm": 2.5259225368499756,
+ "learning_rate": 4.370194555905382e-06,
+ "loss": 0.5508,
+ "step": 3008
+ },
+ {
+ "epoch": 1.4226950354609929,
+ "grad_norm": 2.774118423461914,
+ "learning_rate": 4.369780519235763e-06,
+ "loss": 0.5419,
+ "step": 3009
+ },
+ {
+ "epoch": 1.4231678486997636,
+ "grad_norm": 2.2764663696289062,
+ "learning_rate": 4.369366366143111e-06,
+ "loss": 0.5032,
+ "step": 3010
+ },
+ {
+ "epoch": 1.4236406619385342,
+ "grad_norm": 2.736347198486328,
+ "learning_rate": 4.368952096653211e-06,
+ "loss": 0.5184,
+ "step": 3011
+ },
+ {
+ "epoch": 1.424113475177305,
+ "grad_norm": 2.476762056350708,
+ "learning_rate": 4.36853771079186e-06,
+ "loss": 0.5331,
+ "step": 3012
+ },
+ {
+ "epoch": 1.4245862884160756,
+ "grad_norm": 2.8006162643432617,
+ "learning_rate": 4.3681232085848585e-06,
+ "loss": 0.5331,
+ "step": 3013
+ },
+ {
+ "epoch": 1.4250591016548464,
+ "grad_norm": 2.509143590927124,
+ "learning_rate": 4.367708590058016e-06,
+ "loss": 0.5127,
+ "step": 3014
+ },
+ {
+ "epoch": 1.425531914893617,
+ "grad_norm": 3.030137538909912,
+ "learning_rate": 4.3672938552371505e-06,
+ "loss": 0.5555,
+ "step": 3015
+ },
+ {
+ "epoch": 1.4260047281323878,
+ "grad_norm": 3.0536904335021973,
+ "learning_rate": 4.3668790041480835e-06,
+ "loss": 0.5241,
+ "step": 3016
+ },
+ {
+ "epoch": 1.4264775413711583,
+ "grad_norm": 2.6400439739227295,
+ "learning_rate": 4.366464036816647e-06,
+ "loss": 0.4946,
+ "step": 3017
+ },
+ {
+ "epoch": 1.4269503546099291,
+ "grad_norm": 2.7302589416503906,
+ "learning_rate": 4.366048953268679e-06,
+ "loss": 0.5105,
+ "step": 3018
+ },
+ {
+ "epoch": 1.4274231678486997,
+ "grad_norm": 2.504549264907837,
+ "learning_rate": 4.365633753530026e-06,
+ "loss": 0.4844,
+ "step": 3019
+ },
+ {
+ "epoch": 1.4278959810874705,
+ "grad_norm": 2.3872320652008057,
+ "learning_rate": 4.365218437626539e-06,
+ "loss": 0.4402,
+ "step": 3020
+ },
+ {
+ "epoch": 1.428368794326241,
+ "grad_norm": 2.531649351119995,
+ "learning_rate": 4.364803005584078e-06,
+ "loss": 0.4913,
+ "step": 3021
+ },
+ {
+ "epoch": 1.4288416075650119,
+ "grad_norm": 2.4683783054351807,
+ "learning_rate": 4.364387457428512e-06,
+ "loss": 0.515,
+ "step": 3022
+ },
+ {
+ "epoch": 1.4293144208037825,
+ "grad_norm": 2.632336378097534,
+ "learning_rate": 4.363971793185713e-06,
+ "loss": 0.5398,
+ "step": 3023
+ },
+ {
+ "epoch": 1.4297872340425533,
+ "grad_norm": 2.7456719875335693,
+ "learning_rate": 4.363556012881565e-06,
+ "loss": 0.5254,
+ "step": 3024
+ },
+ {
+ "epoch": 1.4302600472813238,
+ "grad_norm": 2.607177972793579,
+ "learning_rate": 4.363140116541955e-06,
+ "loss": 0.5266,
+ "step": 3025
+ },
+ {
+ "epoch": 1.4307328605200946,
+ "grad_norm": 2.640127420425415,
+ "learning_rate": 4.3627241041927796e-06,
+ "loss": 0.5157,
+ "step": 3026
+ },
+ {
+ "epoch": 1.4312056737588652,
+ "grad_norm": 2.4210736751556396,
+ "learning_rate": 4.362307975859941e-06,
+ "loss": 0.4599,
+ "step": 3027
+ },
+ {
+ "epoch": 1.431678486997636,
+ "grad_norm": 2.6007790565490723,
+ "learning_rate": 4.361891731569352e-06,
+ "loss": 0.5298,
+ "step": 3028
+ },
+ {
+ "epoch": 1.4321513002364066,
+ "grad_norm": 2.5352046489715576,
+ "learning_rate": 4.361475371346928e-06,
+ "loss": 0.5128,
+ "step": 3029
+ },
+ {
+ "epoch": 1.4326241134751774,
+ "grad_norm": 2.4204049110412598,
+ "learning_rate": 4.361058895218596e-06,
+ "loss": 0.4669,
+ "step": 3030
+ },
+ {
+ "epoch": 1.433096926713948,
+ "grad_norm": 2.525240182876587,
+ "learning_rate": 4.360642303210286e-06,
+ "loss": 0.4925,
+ "step": 3031
+ },
+ {
+ "epoch": 1.4335697399527187,
+ "grad_norm": 2.839646339416504,
+ "learning_rate": 4.360225595347939e-06,
+ "loss": 0.5868,
+ "step": 3032
+ },
+ {
+ "epoch": 1.4340425531914893,
+ "grad_norm": 2.5043296813964844,
+ "learning_rate": 4.359808771657501e-06,
+ "loss": 0.4951,
+ "step": 3033
+ },
+ {
+ "epoch": 1.4345153664302601,
+ "grad_norm": 2.9082300662994385,
+ "learning_rate": 4.359391832164927e-06,
+ "loss": 0.5259,
+ "step": 3034
+ },
+ {
+ "epoch": 1.4349881796690307,
+ "grad_norm": 2.6651999950408936,
+ "learning_rate": 4.3589747768961745e-06,
+ "loss": 0.537,
+ "step": 3035
+ },
+ {
+ "epoch": 1.4354609929078015,
+ "grad_norm": 2.577077865600586,
+ "learning_rate": 4.358557605877216e-06,
+ "loss": 0.5186,
+ "step": 3036
+ },
+ {
+ "epoch": 1.435933806146572,
+ "grad_norm": 2.7445287704467773,
+ "learning_rate": 4.3581403191340236e-06,
+ "loss": 0.5573,
+ "step": 3037
+ },
+ {
+ "epoch": 1.4364066193853429,
+ "grad_norm": 2.502086639404297,
+ "learning_rate": 4.357722916692582e-06,
+ "loss": 0.5039,
+ "step": 3038
+ },
+ {
+ "epoch": 1.4368794326241134,
+ "grad_norm": 2.4476163387298584,
+ "learning_rate": 4.357305398578879e-06,
+ "loss": 0.5638,
+ "step": 3039
+ },
+ {
+ "epoch": 1.4373522458628842,
+ "grad_norm": 2.7705588340759277,
+ "learning_rate": 4.356887764818915e-06,
+ "loss": 0.5485,
+ "step": 3040
+ },
+ {
+ "epoch": 1.4378250591016548,
+ "grad_norm": 2.498225450515747,
+ "learning_rate": 4.356470015438691e-06,
+ "loss": 0.5486,
+ "step": 3041
+ },
+ {
+ "epoch": 1.4382978723404256,
+ "grad_norm": 2.394320011138916,
+ "learning_rate": 4.356052150464219e-06,
+ "loss": 0.512,
+ "step": 3042
+ },
+ {
+ "epoch": 1.4387706855791962,
+ "grad_norm": 2.8725767135620117,
+ "learning_rate": 4.3556341699215185e-06,
+ "loss": 0.5202,
+ "step": 3043
+ },
+ {
+ "epoch": 1.439243498817967,
+ "grad_norm": 3.1707918643951416,
+ "learning_rate": 4.355216073836615e-06,
+ "loss": 0.5229,
+ "step": 3044
+ },
+ {
+ "epoch": 1.4397163120567376,
+ "grad_norm": 2.532578468322754,
+ "learning_rate": 4.3547978622355415e-06,
+ "loss": 0.4569,
+ "step": 3045
+ },
+ {
+ "epoch": 1.4401891252955084,
+ "grad_norm": 3.0111029148101807,
+ "learning_rate": 4.354379535144338e-06,
+ "loss": 0.5801,
+ "step": 3046
+ },
+ {
+ "epoch": 1.440661938534279,
+ "grad_norm": 2.9554224014282227,
+ "learning_rate": 4.353961092589052e-06,
+ "loss": 0.5968,
+ "step": 3047
+ },
+ {
+ "epoch": 1.4411347517730497,
+ "grad_norm": 2.7562637329101562,
+ "learning_rate": 4.353542534595738e-06,
+ "loss": 0.5005,
+ "step": 3048
+ },
+ {
+ "epoch": 1.4416075650118203,
+ "grad_norm": 3.083254337310791,
+ "learning_rate": 4.3531238611904595e-06,
+ "loss": 0.5389,
+ "step": 3049
+ },
+ {
+ "epoch": 1.442080378250591,
+ "grad_norm": 2.7778005599975586,
+ "learning_rate": 4.352705072399282e-06,
+ "loss": 0.5342,
+ "step": 3050
+ },
+ {
+ "epoch": 1.4425531914893617,
+ "grad_norm": 2.6673996448516846,
+ "learning_rate": 4.3522861682482845e-06,
+ "loss": 0.5213,
+ "step": 3051
+ },
+ {
+ "epoch": 1.4430260047281322,
+ "grad_norm": 2.637605905532837,
+ "learning_rate": 4.351867148763548e-06,
+ "loss": 0.4893,
+ "step": 3052
+ },
+ {
+ "epoch": 1.443498817966903,
+ "grad_norm": 2.834469795227051,
+ "learning_rate": 4.351448013971166e-06,
+ "loss": 0.5391,
+ "step": 3053
+ },
+ {
+ "epoch": 1.4439716312056738,
+ "grad_norm": 2.824153184890747,
+ "learning_rate": 4.351028763897234e-06,
+ "loss": 0.6403,
+ "step": 3054
+ },
+ {
+ "epoch": 1.4444444444444444,
+ "grad_norm": 2.558966875076294,
+ "learning_rate": 4.350609398567857e-06,
+ "loss": 0.4912,
+ "step": 3055
+ },
+ {
+ "epoch": 1.444917257683215,
+ "grad_norm": 2.281726360321045,
+ "learning_rate": 4.3501899180091475e-06,
+ "loss": 0.4655,
+ "step": 3056
+ },
+ {
+ "epoch": 1.4453900709219858,
+ "grad_norm": 2.499472141265869,
+ "learning_rate": 4.349770322247225e-06,
+ "loss": 0.4878,
+ "step": 3057
+ },
+ {
+ "epoch": 1.4458628841607566,
+ "grad_norm": 2.578615188598633,
+ "learning_rate": 4.349350611308215e-06,
+ "loss": 0.4855,
+ "step": 3058
+ },
+ {
+ "epoch": 1.4463356973995272,
+ "grad_norm": 2.7111165523529053,
+ "learning_rate": 4.348930785218252e-06,
+ "loss": 0.5415,
+ "step": 3059
+ },
+ {
+ "epoch": 1.4468085106382977,
+ "grad_norm": 2.8081610202789307,
+ "learning_rate": 4.348510844003476e-06,
+ "loss": 0.4881,
+ "step": 3060
+ },
+ {
+ "epoch": 1.4472813238770685,
+ "grad_norm": 2.9439868927001953,
+ "learning_rate": 4.348090787690036e-06,
+ "loss": 0.5485,
+ "step": 3061
+ },
+ {
+ "epoch": 1.4477541371158393,
+ "grad_norm": 2.592532157897949,
+ "learning_rate": 4.347670616304085e-06,
+ "loss": 0.4912,
+ "step": 3062
+ },
+ {
+ "epoch": 1.44822695035461,
+ "grad_norm": 2.960592746734619,
+ "learning_rate": 4.347250329871787e-06,
+ "loss": 0.5473,
+ "step": 3063
+ },
+ {
+ "epoch": 1.4486997635933805,
+ "grad_norm": 2.5786688327789307,
+ "learning_rate": 4.3468299284193116e-06,
+ "loss": 0.5348,
+ "step": 3064
+ },
+ {
+ "epoch": 1.4491725768321513,
+ "grad_norm": 2.6084046363830566,
+ "learning_rate": 4.346409411972834e-06,
+ "loss": 0.527,
+ "step": 3065
+ },
+ {
+ "epoch": 1.449645390070922,
+ "grad_norm": 2.489748239517212,
+ "learning_rate": 4.3459887805585385e-06,
+ "loss": 0.4943,
+ "step": 3066
+ },
+ {
+ "epoch": 1.4501182033096927,
+ "grad_norm": 2.452131986618042,
+ "learning_rate": 4.345568034202617e-06,
+ "loss": 0.4886,
+ "step": 3067
+ },
+ {
+ "epoch": 1.4505910165484632,
+ "grad_norm": 2.4034671783447266,
+ "learning_rate": 4.345147172931266e-06,
+ "loss": 0.4689,
+ "step": 3068
+ },
+ {
+ "epoch": 1.451063829787234,
+ "grad_norm": 2.6045448780059814,
+ "learning_rate": 4.344726196770691e-06,
+ "loss": 0.5842,
+ "step": 3069
+ },
+ {
+ "epoch": 1.4515366430260048,
+ "grad_norm": 2.697593927383423,
+ "learning_rate": 4.3443051057471045e-06,
+ "loss": 0.5358,
+ "step": 3070
+ },
+ {
+ "epoch": 1.4520094562647754,
+ "grad_norm": 2.6080820560455322,
+ "learning_rate": 4.343883899886727e-06,
+ "loss": 0.5361,
+ "step": 3071
+ },
+ {
+ "epoch": 1.452482269503546,
+ "grad_norm": 2.4605307579040527,
+ "learning_rate": 4.343462579215783e-06,
+ "loss": 0.4941,
+ "step": 3072
+ },
+ {
+ "epoch": 1.4529550827423168,
+ "grad_norm": 2.8025355339050293,
+ "learning_rate": 4.343041143760509e-06,
+ "loss": 0.5116,
+ "step": 3073
+ },
+ {
+ "epoch": 1.4534278959810876,
+ "grad_norm": 2.432515859603882,
+ "learning_rate": 4.3426195935471434e-06,
+ "loss": 0.4991,
+ "step": 3074
+ },
+ {
+ "epoch": 1.4539007092198581,
+ "grad_norm": 2.5838661193847656,
+ "learning_rate": 4.342197928601935e-06,
+ "loss": 0.4994,
+ "step": 3075
+ },
+ {
+ "epoch": 1.4543735224586287,
+ "grad_norm": 2.421692371368408,
+ "learning_rate": 4.341776148951141e-06,
+ "loss": 0.4945,
+ "step": 3076
+ },
+ {
+ "epoch": 1.4548463356973995,
+ "grad_norm": 2.5354676246643066,
+ "learning_rate": 4.341354254621021e-06,
+ "loss": 0.4859,
+ "step": 3077
+ },
+ {
+ "epoch": 1.4553191489361703,
+ "grad_norm": 2.7316789627075195,
+ "learning_rate": 4.340932245637846e-06,
+ "loss": 0.5136,
+ "step": 3078
+ },
+ {
+ "epoch": 1.455791962174941,
+ "grad_norm": 3.5903496742248535,
+ "learning_rate": 4.340510122027891e-06,
+ "loss": 0.6451,
+ "step": 3079
+ },
+ {
+ "epoch": 1.4562647754137115,
+ "grad_norm": 2.95190167427063,
+ "learning_rate": 4.340087883817442e-06,
+ "loss": 0.6354,
+ "step": 3080
+ },
+ {
+ "epoch": 1.4567375886524823,
+ "grad_norm": 2.8659214973449707,
+ "learning_rate": 4.339665531032789e-06,
+ "loss": 0.5514,
+ "step": 3081
+ },
+ {
+ "epoch": 1.457210401891253,
+ "grad_norm": 2.5681674480438232,
+ "learning_rate": 4.339243063700231e-06,
+ "loss": 0.5135,
+ "step": 3082
+ },
+ {
+ "epoch": 1.4576832151300236,
+ "grad_norm": 2.7353906631469727,
+ "learning_rate": 4.338820481846072e-06,
+ "loss": 0.4608,
+ "step": 3083
+ },
+ {
+ "epoch": 1.4581560283687942,
+ "grad_norm": 2.6116466522216797,
+ "learning_rate": 4.3383977854966245e-06,
+ "loss": 0.4924,
+ "step": 3084
+ },
+ {
+ "epoch": 1.458628841607565,
+ "grad_norm": 2.6676487922668457,
+ "learning_rate": 4.337974974678207e-06,
+ "loss": 0.5747,
+ "step": 3085
+ },
+ {
+ "epoch": 1.4591016548463358,
+ "grad_norm": 2.909031629562378,
+ "learning_rate": 4.337552049417147e-06,
+ "loss": 0.4618,
+ "step": 3086
+ },
+ {
+ "epoch": 1.4595744680851064,
+ "grad_norm": 2.7614190578460693,
+ "learning_rate": 4.33712900973978e-06,
+ "loss": 0.5154,
+ "step": 3087
+ },
+ {
+ "epoch": 1.460047281323877,
+ "grad_norm": 2.452188014984131,
+ "learning_rate": 4.336705855672444e-06,
+ "loss": 0.542,
+ "step": 3088
+ },
+ {
+ "epoch": 1.4605200945626478,
+ "grad_norm": 3.0004117488861084,
+ "learning_rate": 4.336282587241488e-06,
+ "loss": 0.5857,
+ "step": 3089
+ },
+ {
+ "epoch": 1.4609929078014185,
+ "grad_norm": 2.870783567428589,
+ "learning_rate": 4.335859204473268e-06,
+ "loss": 0.5506,
+ "step": 3090
+ },
+ {
+ "epoch": 1.4614657210401891,
+ "grad_norm": 3.1078689098358154,
+ "learning_rate": 4.335435707394145e-06,
+ "loss": 0.5138,
+ "step": 3091
+ },
+ {
+ "epoch": 1.4619385342789597,
+ "grad_norm": 2.8516197204589844,
+ "learning_rate": 4.335012096030488e-06,
+ "loss": 0.5842,
+ "step": 3092
+ },
+ {
+ "epoch": 1.4624113475177305,
+ "grad_norm": 2.615922212600708,
+ "learning_rate": 4.334588370408675e-06,
+ "loss": 0.4896,
+ "step": 3093
+ },
+ {
+ "epoch": 1.4628841607565013,
+ "grad_norm": 3.1911802291870117,
+ "learning_rate": 4.334164530555088e-06,
+ "loss": 0.4974,
+ "step": 3094
+ },
+ {
+ "epoch": 1.4633569739952719,
+ "grad_norm": 3.075051784515381,
+ "learning_rate": 4.3337405764961186e-06,
+ "loss": 0.567,
+ "step": 3095
+ },
+ {
+ "epoch": 1.4638297872340424,
+ "grad_norm": 2.550625801086426,
+ "learning_rate": 4.333316508258163e-06,
+ "loss": 0.4887,
+ "step": 3096
+ },
+ {
+ "epoch": 1.4643026004728132,
+ "grad_norm": 2.3986475467681885,
+ "learning_rate": 4.332892325867629e-06,
+ "loss": 0.5047,
+ "step": 3097
+ },
+ {
+ "epoch": 1.464775413711584,
+ "grad_norm": 2.5045125484466553,
+ "learning_rate": 4.332468029350926e-06,
+ "loss": 0.4721,
+ "step": 3098
+ },
+ {
+ "epoch": 1.4652482269503546,
+ "grad_norm": 2.347365617752075,
+ "learning_rate": 4.332043618734474e-06,
+ "loss": 0.4913,
+ "step": 3099
+ },
+ {
+ "epoch": 1.4657210401891252,
+ "grad_norm": 2.459928512573242,
+ "learning_rate": 4.331619094044699e-06,
+ "loss": 0.523,
+ "step": 3100
+ },
+ {
+ "epoch": 1.466193853427896,
+ "grad_norm": 2.5771310329437256,
+ "learning_rate": 4.331194455308035e-06,
+ "loss": 0.593,
+ "step": 3101
+ },
+ {
+ "epoch": 1.4666666666666668,
+ "grad_norm": 3.1351823806762695,
+ "learning_rate": 4.330769702550921e-06,
+ "loss": 0.5852,
+ "step": 3102
+ },
+ {
+ "epoch": 1.4671394799054374,
+ "grad_norm": 2.589817523956299,
+ "learning_rate": 4.330344835799806e-06,
+ "loss": 0.508,
+ "step": 3103
+ },
+ {
+ "epoch": 1.467612293144208,
+ "grad_norm": 3.1140341758728027,
+ "learning_rate": 4.329919855081144e-06,
+ "loss": 0.469,
+ "step": 3104
+ },
+ {
+ "epoch": 1.4680851063829787,
+ "grad_norm": 2.8186635971069336,
+ "learning_rate": 4.329494760421396e-06,
+ "loss": 0.5088,
+ "step": 3105
+ },
+ {
+ "epoch": 1.4685579196217495,
+ "grad_norm": 2.676077365875244,
+ "learning_rate": 4.329069551847031e-06,
+ "loss": 0.52,
+ "step": 3106
+ },
+ {
+ "epoch": 1.46903073286052,
+ "grad_norm": 2.5543313026428223,
+ "learning_rate": 4.328644229384526e-06,
+ "loss": 0.5066,
+ "step": 3107
+ },
+ {
+ "epoch": 1.4695035460992907,
+ "grad_norm": 2.8176217079162598,
+ "learning_rate": 4.328218793060362e-06,
+ "loss": 0.6404,
+ "step": 3108
+ },
+ {
+ "epoch": 1.4699763593380615,
+ "grad_norm": 2.485217332839966,
+ "learning_rate": 4.3277932429010314e-06,
+ "loss": 0.4578,
+ "step": 3109
+ },
+ {
+ "epoch": 1.4704491725768323,
+ "grad_norm": 2.6741621494293213,
+ "learning_rate": 4.327367578933031e-06,
+ "loss": 0.5068,
+ "step": 3110
+ },
+ {
+ "epoch": 1.4709219858156029,
+ "grad_norm": 2.377242088317871,
+ "learning_rate": 4.326941801182863e-06,
+ "loss": 0.5249,
+ "step": 3111
+ },
+ {
+ "epoch": 1.4713947990543734,
+ "grad_norm": 2.790046215057373,
+ "learning_rate": 4.32651590967704e-06,
+ "loss": 0.5532,
+ "step": 3112
+ },
+ {
+ "epoch": 1.4718676122931442,
+ "grad_norm": 2.78019642829895,
+ "learning_rate": 4.326089904442081e-06,
+ "loss": 0.5362,
+ "step": 3113
+ },
+ {
+ "epoch": 1.472340425531915,
+ "grad_norm": 2.5661380290985107,
+ "learning_rate": 4.32566378550451e-06,
+ "loss": 0.5041,
+ "step": 3114
+ },
+ {
+ "epoch": 1.4728132387706856,
+ "grad_norm": 2.522153615951538,
+ "learning_rate": 4.3252375528908605e-06,
+ "loss": 0.5074,
+ "step": 3115
+ },
+ {
+ "epoch": 1.4732860520094562,
+ "grad_norm": 2.874688148498535,
+ "learning_rate": 4.3248112066276725e-06,
+ "loss": 0.59,
+ "step": 3116
+ },
+ {
+ "epoch": 1.473758865248227,
+ "grad_norm": 3.067866802215576,
+ "learning_rate": 4.324384746741492e-06,
+ "loss": 0.5924,
+ "step": 3117
+ },
+ {
+ "epoch": 1.4742316784869978,
+ "grad_norm": 3.359463930130005,
+ "learning_rate": 4.323958173258873e-06,
+ "loss": 0.6346,
+ "step": 3118
+ },
+ {
+ "epoch": 1.4747044917257683,
+ "grad_norm": 2.193024158477783,
+ "learning_rate": 4.323531486206376e-06,
+ "loss": 0.4594,
+ "step": 3119
+ },
+ {
+ "epoch": 1.475177304964539,
+ "grad_norm": 2.886889934539795,
+ "learning_rate": 4.323104685610569e-06,
+ "loss": 0.523,
+ "step": 3120
+ },
+ {
+ "epoch": 1.4756501182033097,
+ "grad_norm": 2.7558681964874268,
+ "learning_rate": 4.322677771498028e-06,
+ "loss": 0.5387,
+ "step": 3121
+ },
+ {
+ "epoch": 1.4761229314420805,
+ "grad_norm": 2.639277935028076,
+ "learning_rate": 4.322250743895335e-06,
+ "loss": 0.5599,
+ "step": 3122
+ },
+ {
+ "epoch": 1.476595744680851,
+ "grad_norm": 2.786198616027832,
+ "learning_rate": 4.321823602829078e-06,
+ "loss": 0.5405,
+ "step": 3123
+ },
+ {
+ "epoch": 1.4770685579196217,
+ "grad_norm": 2.582315683364868,
+ "learning_rate": 4.321396348325853e-06,
+ "loss": 0.4452,
+ "step": 3124
+ },
+ {
+ "epoch": 1.4775413711583925,
+ "grad_norm": 2.8574297428131104,
+ "learning_rate": 4.320968980412265e-06,
+ "loss": 0.4846,
+ "step": 3125
+ },
+ {
+ "epoch": 1.4780141843971633,
+ "grad_norm": 2.705281972885132,
+ "learning_rate": 4.320541499114922e-06,
+ "loss": 0.5548,
+ "step": 3126
+ },
+ {
+ "epoch": 1.4784869976359338,
+ "grad_norm": 2.3152754306793213,
+ "learning_rate": 4.320113904460444e-06,
+ "loss": 0.5216,
+ "step": 3127
+ },
+ {
+ "epoch": 1.4789598108747044,
+ "grad_norm": 3.230764150619507,
+ "learning_rate": 4.319686196475453e-06,
+ "loss": 0.6192,
+ "step": 3128
+ },
+ {
+ "epoch": 1.4794326241134752,
+ "grad_norm": 2.463380813598633,
+ "learning_rate": 4.319258375186583e-06,
+ "loss": 0.4872,
+ "step": 3129
+ },
+ {
+ "epoch": 1.479905437352246,
+ "grad_norm": 2.8477656841278076,
+ "learning_rate": 4.31883044062047e-06,
+ "loss": 0.5371,
+ "step": 3130
+ },
+ {
+ "epoch": 1.4803782505910166,
+ "grad_norm": 2.393911123275757,
+ "learning_rate": 4.318402392803762e-06,
+ "loss": 0.5334,
+ "step": 3131
+ },
+ {
+ "epoch": 1.4808510638297872,
+ "grad_norm": 2.6113736629486084,
+ "learning_rate": 4.317974231763109e-06,
+ "loss": 0.5572,
+ "step": 3132
+ },
+ {
+ "epoch": 1.481323877068558,
+ "grad_norm": 2.3941731452941895,
+ "learning_rate": 4.317545957525173e-06,
+ "loss": 0.4849,
+ "step": 3133
+ },
+ {
+ "epoch": 1.4817966903073285,
+ "grad_norm": 2.9536755084991455,
+ "learning_rate": 4.317117570116619e-06,
+ "loss": 0.6058,
+ "step": 3134
+ },
+ {
+ "epoch": 1.4822695035460993,
+ "grad_norm": 2.595754623413086,
+ "learning_rate": 4.316689069564123e-06,
+ "loss": 0.5193,
+ "step": 3135
+ },
+ {
+ "epoch": 1.48274231678487,
+ "grad_norm": 2.569833993911743,
+ "learning_rate": 4.316260455894364e-06,
+ "loss": 0.543,
+ "step": 3136
+ },
+ {
+ "epoch": 1.4832151300236407,
+ "grad_norm": 2.5137455463409424,
+ "learning_rate": 4.315831729134031e-06,
+ "loss": 0.5415,
+ "step": 3137
+ },
+ {
+ "epoch": 1.4836879432624113,
+ "grad_norm": 2.5582292079925537,
+ "learning_rate": 4.3154028893098176e-06,
+ "loss": 0.5338,
+ "step": 3138
+ },
+ {
+ "epoch": 1.484160756501182,
+ "grad_norm": 2.666426181793213,
+ "learning_rate": 4.3149739364484265e-06,
+ "loss": 0.5435,
+ "step": 3139
+ },
+ {
+ "epoch": 1.4846335697399526,
+ "grad_norm": 2.790851354598999,
+ "learning_rate": 4.314544870576568e-06,
+ "loss": 0.5746,
+ "step": 3140
+ },
+ {
+ "epoch": 1.4851063829787234,
+ "grad_norm": 2.620326042175293,
+ "learning_rate": 4.314115691720956e-06,
+ "loss": 0.5076,
+ "step": 3141
+ },
+ {
+ "epoch": 1.485579196217494,
+ "grad_norm": 3.075674533843994,
+ "learning_rate": 4.313686399908314e-06,
+ "loss": 0.5486,
+ "step": 3142
+ },
+ {
+ "epoch": 1.4860520094562648,
+ "grad_norm": 3.1347315311431885,
+ "learning_rate": 4.3132569951653745e-06,
+ "loss": 0.531,
+ "step": 3143
+ },
+ {
+ "epoch": 1.4865248226950354,
+ "grad_norm": 2.5783653259277344,
+ "learning_rate": 4.312827477518871e-06,
+ "loss": 0.5818,
+ "step": 3144
+ },
+ {
+ "epoch": 1.4869976359338062,
+ "grad_norm": 3.0247137546539307,
+ "learning_rate": 4.3123978469955505e-06,
+ "loss": 0.5347,
+ "step": 3145
+ },
+ {
+ "epoch": 1.4874704491725768,
+ "grad_norm": 2.4789345264434814,
+ "learning_rate": 4.311968103622163e-06,
+ "loss": 0.5,
+ "step": 3146
+ },
+ {
+ "epoch": 1.4879432624113476,
+ "grad_norm": 2.663341522216797,
+ "learning_rate": 4.311538247425466e-06,
+ "loss": 0.4825,
+ "step": 3147
+ },
+ {
+ "epoch": 1.4884160756501181,
+ "grad_norm": 2.633711099624634,
+ "learning_rate": 4.311108278432226e-06,
+ "loss": 0.5244,
+ "step": 3148
+ },
+ {
+ "epoch": 1.488888888888889,
+ "grad_norm": 2.51312518119812,
+ "learning_rate": 4.310678196669216e-06,
+ "loss": 0.513,
+ "step": 3149
+ },
+ {
+ "epoch": 1.4893617021276595,
+ "grad_norm": 2.5263755321502686,
+ "learning_rate": 4.310248002163214e-06,
+ "loss": 0.5236,
+ "step": 3150
+ },
+ {
+ "epoch": 1.4898345153664303,
+ "grad_norm": 2.559216260910034,
+ "learning_rate": 4.309817694941007e-06,
+ "loss": 0.5107,
+ "step": 3151
+ },
+ {
+ "epoch": 1.4903073286052009,
+ "grad_norm": 2.5023303031921387,
+ "learning_rate": 4.309387275029386e-06,
+ "loss": 0.4685,
+ "step": 3152
+ },
+ {
+ "epoch": 1.4907801418439717,
+ "grad_norm": 3.0314254760742188,
+ "learning_rate": 4.308956742455155e-06,
+ "loss": 0.5462,
+ "step": 3153
+ },
+ {
+ "epoch": 1.4912529550827422,
+ "grad_norm": 2.675295114517212,
+ "learning_rate": 4.308526097245119e-06,
+ "loss": 0.5398,
+ "step": 3154
+ },
+ {
+ "epoch": 1.491725768321513,
+ "grad_norm": 2.6613399982452393,
+ "learning_rate": 4.308095339426094e-06,
+ "loss": 0.5376,
+ "step": 3155
+ },
+ {
+ "epoch": 1.4921985815602836,
+ "grad_norm": 2.58937668800354,
+ "learning_rate": 4.307664469024899e-06,
+ "loss": 0.5385,
+ "step": 3156
+ },
+ {
+ "epoch": 1.4926713947990544,
+ "grad_norm": 2.583631992340088,
+ "learning_rate": 4.3072334860683655e-06,
+ "loss": 0.4927,
+ "step": 3157
+ },
+ {
+ "epoch": 1.493144208037825,
+ "grad_norm": 2.5889222621917725,
+ "learning_rate": 4.306802390583327e-06,
+ "loss": 0.47,
+ "step": 3158
+ },
+ {
+ "epoch": 1.4936170212765958,
+ "grad_norm": 2.9362716674804688,
+ "learning_rate": 4.3063711825966244e-06,
+ "loss": 0.4902,
+ "step": 3159
+ },
+ {
+ "epoch": 1.4940898345153664,
+ "grad_norm": 2.5385425090789795,
+ "learning_rate": 4.305939862135111e-06,
+ "loss": 0.5396,
+ "step": 3160
+ },
+ {
+ "epoch": 1.4945626477541372,
+ "grad_norm": 2.776326894760132,
+ "learning_rate": 4.305508429225641e-06,
+ "loss": 0.5169,
+ "step": 3161
+ },
+ {
+ "epoch": 1.4950354609929077,
+ "grad_norm": 2.575063467025757,
+ "learning_rate": 4.305076883895076e-06,
+ "loss": 0.4938,
+ "step": 3162
+ },
+ {
+ "epoch": 1.4955082742316785,
+ "grad_norm": 2.7552313804626465,
+ "learning_rate": 4.304645226170291e-06,
+ "loss": 0.6211,
+ "step": 3163
+ },
+ {
+ "epoch": 1.4959810874704491,
+ "grad_norm": 2.57149338722229,
+ "learning_rate": 4.30421345607816e-06,
+ "loss": 0.5241,
+ "step": 3164
+ },
+ {
+ "epoch": 1.49645390070922,
+ "grad_norm": 2.8142426013946533,
+ "learning_rate": 4.303781573645568e-06,
+ "loss": 0.5699,
+ "step": 3165
+ },
+ {
+ "epoch": 1.4969267139479905,
+ "grad_norm": 2.6344845294952393,
+ "learning_rate": 4.303349578899407e-06,
+ "loss": 0.5049,
+ "step": 3166
+ },
+ {
+ "epoch": 1.4973995271867613,
+ "grad_norm": 2.554410934448242,
+ "learning_rate": 4.302917471866575e-06,
+ "loss": 0.4404,
+ "step": 3167
+ },
+ {
+ "epoch": 1.4978723404255319,
+ "grad_norm": 2.896240711212158,
+ "learning_rate": 4.302485252573978e-06,
+ "loss": 0.602,
+ "step": 3168
+ },
+ {
+ "epoch": 1.4983451536643027,
+ "grad_norm": 2.4044477939605713,
+ "learning_rate": 4.302052921048527e-06,
+ "loss": 0.4857,
+ "step": 3169
+ },
+ {
+ "epoch": 1.4988179669030732,
+ "grad_norm": 2.7447879314422607,
+ "learning_rate": 4.301620477317144e-06,
+ "loss": 0.5438,
+ "step": 3170
+ },
+ {
+ "epoch": 1.499290780141844,
+ "grad_norm": 2.851820945739746,
+ "learning_rate": 4.301187921406752e-06,
+ "loss": 0.5245,
+ "step": 3171
+ },
+ {
+ "epoch": 1.4997635933806146,
+ "grad_norm": 3.247114419937134,
+ "learning_rate": 4.300755253344287e-06,
+ "loss": 0.504,
+ "step": 3172
+ },
+ {
+ "epoch": 1.5002364066193854,
+ "grad_norm": 3.117490291595459,
+ "learning_rate": 4.300322473156688e-06,
+ "loss": 0.4627,
+ "step": 3173
+ },
+ {
+ "epoch": 1.500709219858156,
+ "grad_norm": 2.558319330215454,
+ "learning_rate": 4.299889580870904e-06,
+ "loss": 0.5721,
+ "step": 3174
+ },
+ {
+ "epoch": 1.5011820330969265,
+ "grad_norm": 2.8983113765716553,
+ "learning_rate": 4.2994565765138865e-06,
+ "loss": 0.5257,
+ "step": 3175
+ },
+ {
+ "epoch": 1.5016548463356973,
+ "grad_norm": 2.744056463241577,
+ "learning_rate": 4.299023460112599e-06,
+ "loss": 0.4892,
+ "step": 3176
+ },
+ {
+ "epoch": 1.5021276595744681,
+ "grad_norm": 2.5506751537323,
+ "learning_rate": 4.29859023169401e-06,
+ "loss": 0.4933,
+ "step": 3177
+ },
+ {
+ "epoch": 1.5026004728132387,
+ "grad_norm": 2.842615842819214,
+ "learning_rate": 4.298156891285092e-06,
+ "loss": 0.6124,
+ "step": 3178
+ },
+ {
+ "epoch": 1.5030732860520093,
+ "grad_norm": 2.5355329513549805,
+ "learning_rate": 4.2977234389128305e-06,
+ "loss": 0.641,
+ "step": 3179
+ },
+ {
+ "epoch": 1.50354609929078,
+ "grad_norm": 2.674781084060669,
+ "learning_rate": 4.297289874604213e-06,
+ "loss": 0.475,
+ "step": 3180
+ },
+ {
+ "epoch": 1.5040189125295509,
+ "grad_norm": 2.6845548152923584,
+ "learning_rate": 4.296856198386235e-06,
+ "loss": 0.5328,
+ "step": 3181
+ },
+ {
+ "epoch": 1.5044917257683215,
+ "grad_norm": 2.9686241149902344,
+ "learning_rate": 4.296422410285902e-06,
+ "loss": 0.6216,
+ "step": 3182
+ },
+ {
+ "epoch": 1.504964539007092,
+ "grad_norm": 2.5095980167388916,
+ "learning_rate": 4.295988510330222e-06,
+ "loss": 0.4993,
+ "step": 3183
+ },
+ {
+ "epoch": 1.5054373522458628,
+ "grad_norm": 2.4906392097473145,
+ "learning_rate": 4.2955544985462125e-06,
+ "loss": 0.4795,
+ "step": 3184
+ },
+ {
+ "epoch": 1.5059101654846336,
+ "grad_norm": 2.5593366622924805,
+ "learning_rate": 4.295120374960897e-06,
+ "loss": 0.5527,
+ "step": 3185
+ },
+ {
+ "epoch": 1.5063829787234042,
+ "grad_norm": 2.691495180130005,
+ "learning_rate": 4.294686139601308e-06,
+ "loss": 0.5646,
+ "step": 3186
+ },
+ {
+ "epoch": 1.5068557919621748,
+ "grad_norm": 2.74320387840271,
+ "learning_rate": 4.294251792494483e-06,
+ "loss": 0.6149,
+ "step": 3187
+ },
+ {
+ "epoch": 1.5073286052009456,
+ "grad_norm": 2.8827052116394043,
+ "learning_rate": 4.293817333667465e-06,
+ "loss": 0.5414,
+ "step": 3188
+ },
+ {
+ "epoch": 1.5078014184397164,
+ "grad_norm": 2.5652425289154053,
+ "learning_rate": 4.293382763147308e-06,
+ "loss": 0.5006,
+ "step": 3189
+ },
+ {
+ "epoch": 1.508274231678487,
+ "grad_norm": 2.729295253753662,
+ "learning_rate": 4.29294808096107e-06,
+ "loss": 0.522,
+ "step": 3190
+ },
+ {
+ "epoch": 1.5087470449172575,
+ "grad_norm": 2.348118305206299,
+ "learning_rate": 4.292513287135817e-06,
+ "loss": 0.4125,
+ "step": 3191
+ },
+ {
+ "epoch": 1.5092198581560283,
+ "grad_norm": 2.809551954269409,
+ "learning_rate": 4.292078381698621e-06,
+ "loss": 0.5577,
+ "step": 3192
+ },
+ {
+ "epoch": 1.5096926713947991,
+ "grad_norm": 2.6925361156463623,
+ "learning_rate": 4.291643364676563e-06,
+ "loss": 0.62,
+ "step": 3193
+ },
+ {
+ "epoch": 1.5101654846335697,
+ "grad_norm": 2.4200620651245117,
+ "learning_rate": 4.291208236096729e-06,
+ "loss": 0.5464,
+ "step": 3194
+ },
+ {
+ "epoch": 1.5106382978723403,
+ "grad_norm": 2.5659191608428955,
+ "learning_rate": 4.290772995986211e-06,
+ "loss": 0.5402,
+ "step": 3195
+ },
+ {
+ "epoch": 1.511111111111111,
+ "grad_norm": 2.3877315521240234,
+ "learning_rate": 4.290337644372113e-06,
+ "loss": 0.463,
+ "step": 3196
+ },
+ {
+ "epoch": 1.5115839243498819,
+ "grad_norm": 2.7063233852386475,
+ "learning_rate": 4.289902181281538e-06,
+ "loss": 0.5253,
+ "step": 3197
+ },
+ {
+ "epoch": 1.5120567375886524,
+ "grad_norm": 2.56788969039917,
+ "learning_rate": 4.289466606741603e-06,
+ "loss": 0.5012,
+ "step": 3198
+ },
+ {
+ "epoch": 1.512529550827423,
+ "grad_norm": 2.637164831161499,
+ "learning_rate": 4.28903092077943e-06,
+ "loss": 0.5236,
+ "step": 3199
+ },
+ {
+ "epoch": 1.5130023640661938,
+ "grad_norm": 2.767526865005493,
+ "learning_rate": 4.288595123422146e-06,
+ "loss": 0.5832,
+ "step": 3200
+ },
+ {
+ "epoch": 1.5134751773049646,
+ "grad_norm": 2.33365535736084,
+ "learning_rate": 4.2881592146968866e-06,
+ "loss": 0.4548,
+ "step": 3201
+ },
+ {
+ "epoch": 1.5139479905437352,
+ "grad_norm": 2.544189453125,
+ "learning_rate": 4.287723194630793e-06,
+ "loss": 0.5115,
+ "step": 3202
+ },
+ {
+ "epoch": 1.5144208037825058,
+ "grad_norm": 2.588793992996216,
+ "learning_rate": 4.2872870632510155e-06,
+ "loss": 0.4766,
+ "step": 3203
+ },
+ {
+ "epoch": 1.5148936170212766,
+ "grad_norm": 2.5382184982299805,
+ "learning_rate": 4.286850820584709e-06,
+ "loss": 0.5401,
+ "step": 3204
+ },
+ {
+ "epoch": 1.5153664302600474,
+ "grad_norm": 2.597930669784546,
+ "learning_rate": 4.286414466659038e-06,
+ "loss": 0.5346,
+ "step": 3205
+ },
+ {
+ "epoch": 1.515839243498818,
+ "grad_norm": 2.8522393703460693,
+ "learning_rate": 4.28597800150117e-06,
+ "loss": 0.486,
+ "step": 3206
+ },
+ {
+ "epoch": 1.5163120567375885,
+ "grad_norm": 2.4801454544067383,
+ "learning_rate": 4.285541425138285e-06,
+ "loss": 0.5162,
+ "step": 3207
+ },
+ {
+ "epoch": 1.5167848699763593,
+ "grad_norm": 2.353665351867676,
+ "learning_rate": 4.285104737597563e-06,
+ "loss": 0.5066,
+ "step": 3208
+ },
+ {
+ "epoch": 1.51725768321513,
+ "grad_norm": 2.767976760864258,
+ "learning_rate": 4.2846679389061975e-06,
+ "loss": 0.5331,
+ "step": 3209
+ },
+ {
+ "epoch": 1.5177304964539007,
+ "grad_norm": 2.9307682514190674,
+ "learning_rate": 4.284231029091385e-06,
+ "loss": 0.5291,
+ "step": 3210
+ },
+ {
+ "epoch": 1.5182033096926713,
+ "grad_norm": 2.39719820022583,
+ "learning_rate": 4.283794008180329e-06,
+ "loss": 0.4759,
+ "step": 3211
+ },
+ {
+ "epoch": 1.518676122931442,
+ "grad_norm": 2.452244758605957,
+ "learning_rate": 4.283356876200242e-06,
+ "loss": 0.4283,
+ "step": 3212
+ },
+ {
+ "epoch": 1.5191489361702128,
+ "grad_norm": 2.4911608695983887,
+ "learning_rate": 4.282919633178343e-06,
+ "loss": 0.4812,
+ "step": 3213
+ },
+ {
+ "epoch": 1.5196217494089834,
+ "grad_norm": 2.5813944339752197,
+ "learning_rate": 4.282482279141856e-06,
+ "loss": 0.4911,
+ "step": 3214
+ },
+ {
+ "epoch": 1.520094562647754,
+ "grad_norm": 2.503542184829712,
+ "learning_rate": 4.282044814118013e-06,
+ "loss": 0.4969,
+ "step": 3215
+ },
+ {
+ "epoch": 1.5205673758865248,
+ "grad_norm": 2.5090713500976562,
+ "learning_rate": 4.281607238134053e-06,
+ "loss": 0.5293,
+ "step": 3216
+ },
+ {
+ "epoch": 1.5210401891252956,
+ "grad_norm": 2.425994396209717,
+ "learning_rate": 4.281169551217223e-06,
+ "loss": 0.5365,
+ "step": 3217
+ },
+ {
+ "epoch": 1.5215130023640662,
+ "grad_norm": 2.637655258178711,
+ "learning_rate": 4.2807317533947765e-06,
+ "loss": 0.5589,
+ "step": 3218
+ },
+ {
+ "epoch": 1.5219858156028367,
+ "grad_norm": 2.9335296154022217,
+ "learning_rate": 4.28029384469397e-06,
+ "loss": 0.6071,
+ "step": 3219
+ },
+ {
+ "epoch": 1.5224586288416075,
+ "grad_norm": 2.898683547973633,
+ "learning_rate": 4.279855825142073e-06,
+ "loss": 0.5392,
+ "step": 3220
+ },
+ {
+ "epoch": 1.5229314420803783,
+ "grad_norm": 2.613914966583252,
+ "learning_rate": 4.279417694766359e-06,
+ "loss": 0.4968,
+ "step": 3221
+ },
+ {
+ "epoch": 1.523404255319149,
+ "grad_norm": 2.500682830810547,
+ "learning_rate": 4.278979453594106e-06,
+ "loss": 0.471,
+ "step": 3222
+ },
+ {
+ "epoch": 1.5238770685579195,
+ "grad_norm": 2.5269598960876465,
+ "learning_rate": 4.278541101652605e-06,
+ "loss": 0.471,
+ "step": 3223
+ },
+ {
+ "epoch": 1.5243498817966903,
+ "grad_norm": 2.8153114318847656,
+ "learning_rate": 4.2781026389691465e-06,
+ "loss": 0.5742,
+ "step": 3224
+ },
+ {
+ "epoch": 1.524822695035461,
+ "grad_norm": 2.5648019313812256,
+ "learning_rate": 4.277664065571034e-06,
+ "loss": 0.5315,
+ "step": 3225
+ },
+ {
+ "epoch": 1.5252955082742317,
+ "grad_norm": 2.778355836868286,
+ "learning_rate": 4.277225381485575e-06,
+ "loss": 0.5543,
+ "step": 3226
+ },
+ {
+ "epoch": 1.5257683215130022,
+ "grad_norm": 2.6736745834350586,
+ "learning_rate": 4.2767865867400846e-06,
+ "loss": 0.4947,
+ "step": 3227
+ },
+ {
+ "epoch": 1.526241134751773,
+ "grad_norm": 2.9560294151306152,
+ "learning_rate": 4.276347681361884e-06,
+ "loss": 0.5835,
+ "step": 3228
+ },
+ {
+ "epoch": 1.5267139479905438,
+ "grad_norm": 2.5580296516418457,
+ "learning_rate": 4.275908665378302e-06,
+ "loss": 0.4751,
+ "step": 3229
+ },
+ {
+ "epoch": 1.5271867612293144,
+ "grad_norm": 3.0705175399780273,
+ "learning_rate": 4.2754695388166755e-06,
+ "loss": 0.5327,
+ "step": 3230
+ },
+ {
+ "epoch": 1.527659574468085,
+ "grad_norm": 2.664652109146118,
+ "learning_rate": 4.275030301704346e-06,
+ "loss": 0.4934,
+ "step": 3231
+ },
+ {
+ "epoch": 1.5281323877068558,
+ "grad_norm": 2.308499813079834,
+ "learning_rate": 4.274590954068663e-06,
+ "loss": 0.4412,
+ "step": 3232
+ },
+ {
+ "epoch": 1.5286052009456266,
+ "grad_norm": 2.871189594268799,
+ "learning_rate": 4.2741514959369815e-06,
+ "loss": 0.5001,
+ "step": 3233
+ },
+ {
+ "epoch": 1.5290780141843971,
+ "grad_norm": 2.5274453163146973,
+ "learning_rate": 4.273711927336666e-06,
+ "loss": 0.4938,
+ "step": 3234
+ },
+ {
+ "epoch": 1.5295508274231677,
+ "grad_norm": 2.8848133087158203,
+ "learning_rate": 4.273272248295087e-06,
+ "loss": 0.5397,
+ "step": 3235
+ },
+ {
+ "epoch": 1.5300236406619385,
+ "grad_norm": 2.3927090167999268,
+ "learning_rate": 4.27283245883962e-06,
+ "loss": 0.5497,
+ "step": 3236
+ },
+ {
+ "epoch": 1.5304964539007093,
+ "grad_norm": 2.5413873195648193,
+ "learning_rate": 4.27239255899765e-06,
+ "loss": 0.5108,
+ "step": 3237
+ },
+ {
+ "epoch": 1.53096926713948,
+ "grad_norm": 2.7692389488220215,
+ "learning_rate": 4.271952548796567e-06,
+ "loss": 0.5768,
+ "step": 3238
+ },
+ {
+ "epoch": 1.5314420803782505,
+ "grad_norm": 2.4621126651763916,
+ "learning_rate": 4.271512428263768e-06,
+ "loss": 0.4698,
+ "step": 3239
+ },
+ {
+ "epoch": 1.5319148936170213,
+ "grad_norm": 2.6423375606536865,
+ "learning_rate": 4.271072197426659e-06,
+ "loss": 0.4929,
+ "step": 3240
+ },
+ {
+ "epoch": 1.532387706855792,
+ "grad_norm": 2.7097692489624023,
+ "learning_rate": 4.270631856312649e-06,
+ "loss": 0.4836,
+ "step": 3241
+ },
+ {
+ "epoch": 1.5328605200945626,
+ "grad_norm": 2.545706272125244,
+ "learning_rate": 4.270191404949158e-06,
+ "loss": 0.4636,
+ "step": 3242
+ },
+ {
+ "epoch": 1.5333333333333332,
+ "grad_norm": 3.138781785964966,
+ "learning_rate": 4.26975084336361e-06,
+ "loss": 0.5988,
+ "step": 3243
+ },
+ {
+ "epoch": 1.533806146572104,
+ "grad_norm": 2.492715835571289,
+ "learning_rate": 4.269310171583438e-06,
+ "loss": 0.5095,
+ "step": 3244
+ },
+ {
+ "epoch": 1.5342789598108748,
+ "grad_norm": 2.5705838203430176,
+ "learning_rate": 4.268869389636077e-06,
+ "loss": 0.4818,
+ "step": 3245
+ },
+ {
+ "epoch": 1.5347517730496454,
+ "grad_norm": 2.7633554935455322,
+ "learning_rate": 4.268428497548979e-06,
+ "loss": 0.547,
+ "step": 3246
+ },
+ {
+ "epoch": 1.535224586288416,
+ "grad_norm": 2.654528856277466,
+ "learning_rate": 4.2679874953495905e-06,
+ "loss": 0.5261,
+ "step": 3247
+ },
+ {
+ "epoch": 1.5356973995271868,
+ "grad_norm": 2.5039751529693604,
+ "learning_rate": 4.2675463830653744e-06,
+ "loss": 0.4941,
+ "step": 3248
+ },
+ {
+ "epoch": 1.5361702127659576,
+ "grad_norm": 2.897268295288086,
+ "learning_rate": 4.267105160723794e-06,
+ "loss": 0.5404,
+ "step": 3249
+ },
+ {
+ "epoch": 1.5366430260047281,
+ "grad_norm": 2.500732421875,
+ "learning_rate": 4.266663828352324e-06,
+ "loss": 0.5375,
+ "step": 3250
+ },
+ {
+ "epoch": 1.5371158392434987,
+ "grad_norm": 2.6310064792633057,
+ "learning_rate": 4.266222385978444e-06,
+ "loss": 0.5217,
+ "step": 3251
+ },
+ {
+ "epoch": 1.5375886524822695,
+ "grad_norm": 2.7440476417541504,
+ "learning_rate": 4.265780833629642e-06,
+ "loss": 0.5419,
+ "step": 3252
+ },
+ {
+ "epoch": 1.5380614657210403,
+ "grad_norm": 2.7037577629089355,
+ "learning_rate": 4.2653391713334095e-06,
+ "loss": 0.5634,
+ "step": 3253
+ },
+ {
+ "epoch": 1.5385342789598109,
+ "grad_norm": 2.548525810241699,
+ "learning_rate": 4.264897399117248e-06,
+ "loss": 0.535,
+ "step": 3254
+ },
+ {
+ "epoch": 1.5390070921985815,
+ "grad_norm": 2.6127355098724365,
+ "learning_rate": 4.264455517008663e-06,
+ "loss": 0.4619,
+ "step": 3255
+ },
+ {
+ "epoch": 1.5394799054373522,
+ "grad_norm": 2.5597004890441895,
+ "learning_rate": 4.264013525035171e-06,
+ "loss": 0.4477,
+ "step": 3256
+ },
+ {
+ "epoch": 1.539952718676123,
+ "grad_norm": 2.642432689666748,
+ "learning_rate": 4.263571423224292e-06,
+ "loss": 0.4749,
+ "step": 3257
+ },
+ {
+ "epoch": 1.5404255319148936,
+ "grad_norm": 2.5121877193450928,
+ "learning_rate": 4.2631292116035526e-06,
+ "loss": 0.4693,
+ "step": 3258
+ },
+ {
+ "epoch": 1.5408983451536642,
+ "grad_norm": 2.390292167663574,
+ "learning_rate": 4.262686890200489e-06,
+ "loss": 0.4872,
+ "step": 3259
+ },
+ {
+ "epoch": 1.541371158392435,
+ "grad_norm": 2.5898337364196777,
+ "learning_rate": 4.2622444590426405e-06,
+ "loss": 0.5193,
+ "step": 3260
+ },
+ {
+ "epoch": 1.5418439716312058,
+ "grad_norm": 2.508821487426758,
+ "learning_rate": 4.261801918157558e-06,
+ "loss": 0.511,
+ "step": 3261
+ },
+ {
+ "epoch": 1.5423167848699764,
+ "grad_norm": 2.6992101669311523,
+ "learning_rate": 4.261359267572795e-06,
+ "loss": 0.5069,
+ "step": 3262
+ },
+ {
+ "epoch": 1.542789598108747,
+ "grad_norm": 2.6011030673980713,
+ "learning_rate": 4.2609165073159145e-06,
+ "loss": 0.5887,
+ "step": 3263
+ },
+ {
+ "epoch": 1.5432624113475177,
+ "grad_norm": 2.887053966522217,
+ "learning_rate": 4.260473637414483e-06,
+ "loss": 0.5556,
+ "step": 3264
+ },
+ {
+ "epoch": 1.5437352245862885,
+ "grad_norm": 2.6433887481689453,
+ "learning_rate": 4.260030657896079e-06,
+ "loss": 0.4728,
+ "step": 3265
+ },
+ {
+ "epoch": 1.544208037825059,
+ "grad_norm": 2.6134607791900635,
+ "learning_rate": 4.259587568788282e-06,
+ "loss": 0.483,
+ "step": 3266
+ },
+ {
+ "epoch": 1.5446808510638297,
+ "grad_norm": 2.5308640003204346,
+ "learning_rate": 4.259144370118684e-06,
+ "loss": 0.5115,
+ "step": 3267
+ },
+ {
+ "epoch": 1.5451536643026005,
+ "grad_norm": 2.8256733417510986,
+ "learning_rate": 4.258701061914879e-06,
+ "loss": 0.5414,
+ "step": 3268
+ },
+ {
+ "epoch": 1.5456264775413713,
+ "grad_norm": 2.8648319244384766,
+ "learning_rate": 4.258257644204471e-06,
+ "loss": 0.5695,
+ "step": 3269
+ },
+ {
+ "epoch": 1.5460992907801419,
+ "grad_norm": 2.8568081855773926,
+ "learning_rate": 4.257814117015069e-06,
+ "loss": 0.5264,
+ "step": 3270
+ },
+ {
+ "epoch": 1.5465721040189124,
+ "grad_norm": 2.6065011024475098,
+ "learning_rate": 4.257370480374289e-06,
+ "loss": 0.5646,
+ "step": 3271
+ },
+ {
+ "epoch": 1.5470449172576832,
+ "grad_norm": 2.7840216159820557,
+ "learning_rate": 4.256926734309756e-06,
+ "loss": 0.5191,
+ "step": 3272
+ },
+ {
+ "epoch": 1.547517730496454,
+ "grad_norm": 2.85906982421875,
+ "learning_rate": 4.256482878849099e-06,
+ "loss": 0.5911,
+ "step": 3273
+ },
+ {
+ "epoch": 1.5479905437352246,
+ "grad_norm": 2.916029930114746,
+ "learning_rate": 4.256038914019954e-06,
+ "loss": 0.5589,
+ "step": 3274
+ },
+ {
+ "epoch": 1.5484633569739952,
+ "grad_norm": 2.6748716831207275,
+ "learning_rate": 4.255594839849967e-06,
+ "loss": 0.5323,
+ "step": 3275
+ },
+ {
+ "epoch": 1.548936170212766,
+ "grad_norm": 2.717212200164795,
+ "learning_rate": 4.255150656366787e-06,
+ "loss": 0.453,
+ "step": 3276
+ },
+ {
+ "epoch": 1.5494089834515368,
+ "grad_norm": 2.4974849224090576,
+ "learning_rate": 4.254706363598072e-06,
+ "loss": 0.4516,
+ "step": 3277
+ },
+ {
+ "epoch": 1.5498817966903073,
+ "grad_norm": 2.648151397705078,
+ "learning_rate": 4.254261961571485e-06,
+ "loss": 0.5452,
+ "step": 3278
+ },
+ {
+ "epoch": 1.550354609929078,
+ "grad_norm": 2.932905435562134,
+ "learning_rate": 4.253817450314699e-06,
+ "loss": 0.4813,
+ "step": 3279
+ },
+ {
+ "epoch": 1.5508274231678487,
+ "grad_norm": 2.862912178039551,
+ "learning_rate": 4.25337282985539e-06,
+ "loss": 0.5689,
+ "step": 3280
+ },
+ {
+ "epoch": 1.5513002364066195,
+ "grad_norm": 2.532156467437744,
+ "learning_rate": 4.2529281002212436e-06,
+ "loss": 0.485,
+ "step": 3281
+ },
+ {
+ "epoch": 1.55177304964539,
+ "grad_norm": 2.583299160003662,
+ "learning_rate": 4.25248326143995e-06,
+ "loss": 0.4661,
+ "step": 3282
+ },
+ {
+ "epoch": 1.5522458628841607,
+ "grad_norm": 2.5790653228759766,
+ "learning_rate": 4.252038313539209e-06,
+ "loss": 0.5455,
+ "step": 3283
+ },
+ {
+ "epoch": 1.5527186761229315,
+ "grad_norm": 2.872864007949829,
+ "learning_rate": 4.251593256546724e-06,
+ "loss": 0.5317,
+ "step": 3284
+ },
+ {
+ "epoch": 1.5531914893617023,
+ "grad_norm": 3.0382463932037354,
+ "learning_rate": 4.251148090490208e-06,
+ "loss": 0.5131,
+ "step": 3285
+ },
+ {
+ "epoch": 1.5536643026004728,
+ "grad_norm": 2.574399709701538,
+ "learning_rate": 4.250702815397379e-06,
+ "loss": 0.5399,
+ "step": 3286
+ },
+ {
+ "epoch": 1.5541371158392434,
+ "grad_norm": 2.9784770011901855,
+ "learning_rate": 4.250257431295962e-06,
+ "loss": 0.5209,
+ "step": 3287
+ },
+ {
+ "epoch": 1.5546099290780142,
+ "grad_norm": 2.6482062339782715,
+ "learning_rate": 4.249811938213689e-06,
+ "loss": 0.5416,
+ "step": 3288
+ },
+ {
+ "epoch": 1.555082742316785,
+ "grad_norm": 2.82142972946167,
+ "learning_rate": 4.2493663361783e-06,
+ "loss": 0.594,
+ "step": 3289
+ },
+ {
+ "epoch": 1.5555555555555556,
+ "grad_norm": 2.815595865249634,
+ "learning_rate": 4.24892062521754e-06,
+ "loss": 0.5381,
+ "step": 3290
+ },
+ {
+ "epoch": 1.5560283687943262,
+ "grad_norm": 2.689764976501465,
+ "learning_rate": 4.248474805359161e-06,
+ "loss": 0.5141,
+ "step": 3291
+ },
+ {
+ "epoch": 1.556501182033097,
+ "grad_norm": 2.7718515396118164,
+ "learning_rate": 4.248028876630922e-06,
+ "loss": 0.5324,
+ "step": 3292
+ },
+ {
+ "epoch": 1.5569739952718678,
+ "grad_norm": 3.0196774005889893,
+ "learning_rate": 4.247582839060591e-06,
+ "loss": 0.4971,
+ "step": 3293
+ },
+ {
+ "epoch": 1.5574468085106383,
+ "grad_norm": 2.608475923538208,
+ "learning_rate": 4.247136692675939e-06,
+ "loss": 0.5795,
+ "step": 3294
+ },
+ {
+ "epoch": 1.557919621749409,
+ "grad_norm": 2.4912326335906982,
+ "learning_rate": 4.246690437504746e-06,
+ "loss": 0.5348,
+ "step": 3295
+ },
+ {
+ "epoch": 1.5583924349881797,
+ "grad_norm": 2.519303560256958,
+ "learning_rate": 4.246244073574799e-06,
+ "loss": 0.4953,
+ "step": 3296
+ },
+ {
+ "epoch": 1.5588652482269505,
+ "grad_norm": 2.5667171478271484,
+ "learning_rate": 4.24579760091389e-06,
+ "loss": 0.5353,
+ "step": 3297
+ },
+ {
+ "epoch": 1.559338061465721,
+ "grad_norm": 2.8835761547088623,
+ "learning_rate": 4.24535101954982e-06,
+ "loss": 0.578,
+ "step": 3298
+ },
+ {
+ "epoch": 1.5598108747044916,
+ "grad_norm": 3.0506930351257324,
+ "learning_rate": 4.244904329510395e-06,
+ "loss": 0.6418,
+ "step": 3299
+ },
+ {
+ "epoch": 1.5602836879432624,
+ "grad_norm": 2.579446315765381,
+ "learning_rate": 4.244457530823428e-06,
+ "loss": 0.5027,
+ "step": 3300
+ },
+ {
+ "epoch": 1.5607565011820332,
+ "grad_norm": 2.72012996673584,
+ "learning_rate": 4.24401062351674e-06,
+ "loss": 0.5438,
+ "step": 3301
+ },
+ {
+ "epoch": 1.5612293144208038,
+ "grad_norm": 2.527007818222046,
+ "learning_rate": 4.243563607618158e-06,
+ "loss": 0.5303,
+ "step": 3302
+ },
+ {
+ "epoch": 1.5617021276595744,
+ "grad_norm": 2.4415159225463867,
+ "learning_rate": 4.243116483155516e-06,
+ "loss": 0.4893,
+ "step": 3303
+ },
+ {
+ "epoch": 1.5621749408983452,
+ "grad_norm": 2.462256669998169,
+ "learning_rate": 4.242669250156653e-06,
+ "loss": 0.5671,
+ "step": 3304
+ },
+ {
+ "epoch": 1.562647754137116,
+ "grad_norm": 2.479865074157715,
+ "learning_rate": 4.242221908649418e-06,
+ "loss": 0.5038,
+ "step": 3305
+ },
+ {
+ "epoch": 1.5631205673758866,
+ "grad_norm": 2.74670672416687,
+ "learning_rate": 4.241774458661662e-06,
+ "loss": 0.5689,
+ "step": 3306
+ },
+ {
+ "epoch": 1.5635933806146571,
+ "grad_norm": 2.55938982963562,
+ "learning_rate": 4.24132690022125e-06,
+ "loss": 0.492,
+ "step": 3307
+ },
+ {
+ "epoch": 1.564066193853428,
+ "grad_norm": 2.634956121444702,
+ "learning_rate": 4.240879233356048e-06,
+ "loss": 0.503,
+ "step": 3308
+ },
+ {
+ "epoch": 1.5645390070921987,
+ "grad_norm": 2.381775140762329,
+ "learning_rate": 4.240431458093928e-06,
+ "loss": 0.4939,
+ "step": 3309
+ },
+ {
+ "epoch": 1.5650118203309693,
+ "grad_norm": 2.8176610469818115,
+ "learning_rate": 4.239983574462774e-06,
+ "loss": 0.5609,
+ "step": 3310
+ },
+ {
+ "epoch": 1.5654846335697399,
+ "grad_norm": 3.0268442630767822,
+ "learning_rate": 4.239535582490471e-06,
+ "loss": 0.5427,
+ "step": 3311
+ },
+ {
+ "epoch": 1.5659574468085107,
+ "grad_norm": 2.5881481170654297,
+ "learning_rate": 4.239087482204916e-06,
+ "loss": 0.5538,
+ "step": 3312
+ },
+ {
+ "epoch": 1.5664302600472815,
+ "grad_norm": 2.5317704677581787,
+ "learning_rate": 4.238639273634008e-06,
+ "loss": 0.4915,
+ "step": 3313
+ },
+ {
+ "epoch": 1.566903073286052,
+ "grad_norm": 2.9608731269836426,
+ "learning_rate": 4.238190956805658e-06,
+ "loss": 0.564,
+ "step": 3314
+ },
+ {
+ "epoch": 1.5673758865248226,
+ "grad_norm": 3.022686243057251,
+ "learning_rate": 4.237742531747777e-06,
+ "loss": 0.5503,
+ "step": 3315
+ },
+ {
+ "epoch": 1.5678486997635934,
+ "grad_norm": 2.763622283935547,
+ "learning_rate": 4.23729399848829e-06,
+ "loss": 0.5241,
+ "step": 3316
+ },
+ {
+ "epoch": 1.5683215130023642,
+ "grad_norm": 2.6112794876098633,
+ "learning_rate": 4.236845357055122e-06,
+ "loss": 0.4919,
+ "step": 3317
+ },
+ {
+ "epoch": 1.5687943262411348,
+ "grad_norm": 2.649829149246216,
+ "learning_rate": 4.23639660747621e-06,
+ "loss": 0.5472,
+ "step": 3318
+ },
+ {
+ "epoch": 1.5692671394799054,
+ "grad_norm": 2.8888115882873535,
+ "learning_rate": 4.2359477497794955e-06,
+ "loss": 0.5077,
+ "step": 3319
+ },
+ {
+ "epoch": 1.5697399527186762,
+ "grad_norm": 2.5666911602020264,
+ "learning_rate": 4.235498783992927e-06,
+ "loss": 0.5365,
+ "step": 3320
+ },
+ {
+ "epoch": 1.570212765957447,
+ "grad_norm": 2.448758363723755,
+ "learning_rate": 4.2350497101444575e-06,
+ "loss": 0.5043,
+ "step": 3321
+ },
+ {
+ "epoch": 1.5706855791962175,
+ "grad_norm": 2.595207691192627,
+ "learning_rate": 4.234600528262052e-06,
+ "loss": 0.5303,
+ "step": 3322
+ },
+ {
+ "epoch": 1.5711583924349881,
+ "grad_norm": 2.7814228534698486,
+ "learning_rate": 4.234151238373676e-06,
+ "loss": 0.4521,
+ "step": 3323
+ },
+ {
+ "epoch": 1.571631205673759,
+ "grad_norm": 2.781538724899292,
+ "learning_rate": 4.233701840507308e-06,
+ "loss": 0.5193,
+ "step": 3324
+ },
+ {
+ "epoch": 1.5721040189125297,
+ "grad_norm": 2.771907329559326,
+ "learning_rate": 4.233252334690928e-06,
+ "loss": 0.497,
+ "step": 3325
+ },
+ {
+ "epoch": 1.5725768321513003,
+ "grad_norm": 2.5557498931884766,
+ "learning_rate": 4.232802720952525e-06,
+ "loss": 0.4913,
+ "step": 3326
+ },
+ {
+ "epoch": 1.5730496453900709,
+ "grad_norm": 2.478267192840576,
+ "learning_rate": 4.232352999320094e-06,
+ "loss": 0.4967,
+ "step": 3327
+ },
+ {
+ "epoch": 1.5735224586288417,
+ "grad_norm": 3.1548502445220947,
+ "learning_rate": 4.231903169821639e-06,
+ "loss": 0.5009,
+ "step": 3328
+ },
+ {
+ "epoch": 1.5739952718676125,
+ "grad_norm": 2.634824275970459,
+ "learning_rate": 4.231453232485168e-06,
+ "loss": 0.5223,
+ "step": 3329
+ },
+ {
+ "epoch": 1.574468085106383,
+ "grad_norm": 2.579102039337158,
+ "learning_rate": 4.231003187338695e-06,
+ "loss": 0.5513,
+ "step": 3330
+ },
+ {
+ "epoch": 1.5749408983451536,
+ "grad_norm": 2.8477070331573486,
+ "learning_rate": 4.230553034410245e-06,
+ "loss": 0.561,
+ "step": 3331
+ },
+ {
+ "epoch": 1.5754137115839244,
+ "grad_norm": 2.6714725494384766,
+ "learning_rate": 4.2301027737278446e-06,
+ "loss": 0.4687,
+ "step": 3332
+ },
+ {
+ "epoch": 1.5758865248226952,
+ "grad_norm": 2.6562764644622803,
+ "learning_rate": 4.229652405319532e-06,
+ "loss": 0.5925,
+ "step": 3333
+ },
+ {
+ "epoch": 1.5763593380614658,
+ "grad_norm": 2.750946283340454,
+ "learning_rate": 4.229201929213348e-06,
+ "loss": 0.4748,
+ "step": 3334
+ },
+ {
+ "epoch": 1.5768321513002364,
+ "grad_norm": 2.760470151901245,
+ "learning_rate": 4.228751345437342e-06,
+ "loss": 0.5989,
+ "step": 3335
+ },
+ {
+ "epoch": 1.5773049645390071,
+ "grad_norm": 3.1451845169067383,
+ "learning_rate": 4.2283006540195706e-06,
+ "loss": 0.562,
+ "step": 3336
+ },
+ {
+ "epoch": 1.5777777777777777,
+ "grad_norm": 2.563011407852173,
+ "learning_rate": 4.227849854988095e-06,
+ "loss": 0.5473,
+ "step": 3337
+ },
+ {
+ "epoch": 1.5782505910165483,
+ "grad_norm": 2.310469388961792,
+ "learning_rate": 4.2273989483709856e-06,
+ "loss": 0.5033,
+ "step": 3338
+ },
+ {
+ "epoch": 1.578723404255319,
+ "grad_norm": 2.677978754043579,
+ "learning_rate": 4.226947934196318e-06,
+ "loss": 0.5291,
+ "step": 3339
+ },
+ {
+ "epoch": 1.57919621749409,
+ "grad_norm": 3.0423545837402344,
+ "learning_rate": 4.226496812492176e-06,
+ "loss": 0.5201,
+ "step": 3340
+ },
+ {
+ "epoch": 1.5796690307328605,
+ "grad_norm": 2.357513904571533,
+ "learning_rate": 4.226045583286647e-06,
+ "loss": 0.4421,
+ "step": 3341
+ },
+ {
+ "epoch": 1.580141843971631,
+ "grad_norm": 2.719860315322876,
+ "learning_rate": 4.225594246607828e-06,
+ "loss": 0.4855,
+ "step": 3342
+ },
+ {
+ "epoch": 1.5806146572104018,
+ "grad_norm": 3.2645058631896973,
+ "learning_rate": 4.2251428024838215e-06,
+ "loss": 0.6654,
+ "step": 3343
+ },
+ {
+ "epoch": 1.5810874704491726,
+ "grad_norm": 2.2997004985809326,
+ "learning_rate": 4.224691250942737e-06,
+ "loss": 0.4565,
+ "step": 3344
+ },
+ {
+ "epoch": 1.5815602836879432,
+ "grad_norm": 2.8103034496307373,
+ "learning_rate": 4.2242395920126926e-06,
+ "loss": 0.5543,
+ "step": 3345
+ },
+ {
+ "epoch": 1.5820330969267138,
+ "grad_norm": 2.720254898071289,
+ "learning_rate": 4.223787825721808e-06,
+ "loss": 0.5028,
+ "step": 3346
+ },
+ {
+ "epoch": 1.5825059101654846,
+ "grad_norm": 2.735544204711914,
+ "learning_rate": 4.223335952098214e-06,
+ "loss": 0.5169,
+ "step": 3347
+ },
+ {
+ "epoch": 1.5829787234042554,
+ "grad_norm": 2.784254550933838,
+ "learning_rate": 4.222883971170047e-06,
+ "loss": 0.4989,
+ "step": 3348
+ },
+ {
+ "epoch": 1.583451536643026,
+ "grad_norm": 2.7192094326019287,
+ "learning_rate": 4.22243188296545e-06,
+ "loss": 0.502,
+ "step": 3349
+ },
+ {
+ "epoch": 1.5839243498817965,
+ "grad_norm": 2.716501474380493,
+ "learning_rate": 4.221979687512573e-06,
+ "loss": 0.5687,
+ "step": 3350
+ },
+ {
+ "epoch": 1.5843971631205673,
+ "grad_norm": 2.8420114517211914,
+ "learning_rate": 4.22152738483957e-06,
+ "loss": 0.5903,
+ "step": 3351
+ },
+ {
+ "epoch": 1.5848699763593381,
+ "grad_norm": 2.734872579574585,
+ "learning_rate": 4.2210749749746065e-06,
+ "loss": 0.5397,
+ "step": 3352
+ },
+ {
+ "epoch": 1.5853427895981087,
+ "grad_norm": 2.4343836307525635,
+ "learning_rate": 4.220622457945851e-06,
+ "loss": 0.436,
+ "step": 3353
+ },
+ {
+ "epoch": 1.5858156028368793,
+ "grad_norm": 2.728177547454834,
+ "learning_rate": 4.2201698337814785e-06,
+ "loss": 0.5703,
+ "step": 3354
+ },
+ {
+ "epoch": 1.58628841607565,
+ "grad_norm": 2.502098560333252,
+ "learning_rate": 4.219717102509674e-06,
+ "loss": 0.5275,
+ "step": 3355
+ },
+ {
+ "epoch": 1.5867612293144209,
+ "grad_norm": 2.6595494747161865,
+ "learning_rate": 4.219264264158627e-06,
+ "loss": 0.4659,
+ "step": 3356
+ },
+ {
+ "epoch": 1.5872340425531914,
+ "grad_norm": 2.5307185649871826,
+ "learning_rate": 4.218811318756532e-06,
+ "loss": 0.5048,
+ "step": 3357
+ },
+ {
+ "epoch": 1.587706855791962,
+ "grad_norm": 2.9300129413604736,
+ "learning_rate": 4.218358266331593e-06,
+ "loss": 0.5137,
+ "step": 3358
+ },
+ {
+ "epoch": 1.5881796690307328,
+ "grad_norm": 2.686586618423462,
+ "learning_rate": 4.21790510691202e-06,
+ "loss": 0.4529,
+ "step": 3359
+ },
+ {
+ "epoch": 1.5886524822695036,
+ "grad_norm": 2.9981517791748047,
+ "learning_rate": 4.217451840526029e-06,
+ "loss": 0.6054,
+ "step": 3360
+ },
+ {
+ "epoch": 1.5891252955082742,
+ "grad_norm": 2.6943674087524414,
+ "learning_rate": 4.216998467201841e-06,
+ "loss": 0.5153,
+ "step": 3361
+ },
+ {
+ "epoch": 1.5895981087470448,
+ "grad_norm": 2.707084894180298,
+ "learning_rate": 4.216544986967689e-06,
+ "loss": 0.5235,
+ "step": 3362
+ },
+ {
+ "epoch": 1.5900709219858156,
+ "grad_norm": 2.6553728580474854,
+ "learning_rate": 4.216091399851808e-06,
+ "loss": 0.5275,
+ "step": 3363
+ },
+ {
+ "epoch": 1.5905437352245864,
+ "grad_norm": 2.9136953353881836,
+ "learning_rate": 4.215637705882439e-06,
+ "loss": 0.5834,
+ "step": 3364
+ },
+ {
+ "epoch": 1.591016548463357,
+ "grad_norm": 2.7647159099578857,
+ "learning_rate": 4.2151839050878325e-06,
+ "loss": 0.5641,
+ "step": 3365
+ },
+ {
+ "epoch": 1.5914893617021275,
+ "grad_norm": 2.4556827545166016,
+ "learning_rate": 4.214729997496246e-06,
+ "loss": 0.5636,
+ "step": 3366
+ },
+ {
+ "epoch": 1.5919621749408983,
+ "grad_norm": 2.6111652851104736,
+ "learning_rate": 4.2142759831359414e-06,
+ "loss": 0.5097,
+ "step": 3367
+ },
+ {
+ "epoch": 1.592434988179669,
+ "grad_norm": 2.4886903762817383,
+ "learning_rate": 4.213821862035189e-06,
+ "loss": 0.531,
+ "step": 3368
+ },
+ {
+ "epoch": 1.5929078014184397,
+ "grad_norm": 2.5245840549468994,
+ "learning_rate": 4.213367634222263e-06,
+ "loss": 0.5085,
+ "step": 3369
+ },
+ {
+ "epoch": 1.5933806146572103,
+ "grad_norm": 2.970214605331421,
+ "learning_rate": 4.212913299725447e-06,
+ "loss": 0.5851,
+ "step": 3370
+ },
+ {
+ "epoch": 1.593853427895981,
+ "grad_norm": 2.5433361530303955,
+ "learning_rate": 4.212458858573032e-06,
+ "loss": 0.48,
+ "step": 3371
+ },
+ {
+ "epoch": 1.5943262411347519,
+ "grad_norm": 2.3550102710723877,
+ "learning_rate": 4.212004310793312e-06,
+ "loss": 0.4405,
+ "step": 3372
+ },
+ {
+ "epoch": 1.5947990543735224,
+ "grad_norm": 2.4824719429016113,
+ "learning_rate": 4.2115496564145896e-06,
+ "loss": 0.4634,
+ "step": 3373
+ },
+ {
+ "epoch": 1.595271867612293,
+ "grad_norm": 2.4751930236816406,
+ "learning_rate": 4.211094895465176e-06,
+ "loss": 0.5662,
+ "step": 3374
+ },
+ {
+ "epoch": 1.5957446808510638,
+ "grad_norm": 2.4193356037139893,
+ "learning_rate": 4.210640027973386e-06,
+ "loss": 0.4441,
+ "step": 3375
+ },
+ {
+ "epoch": 1.5962174940898346,
+ "grad_norm": 2.4477498531341553,
+ "learning_rate": 4.210185053967543e-06,
+ "loss": 0.5205,
+ "step": 3376
+ },
+ {
+ "epoch": 1.5966903073286052,
+ "grad_norm": 2.7954161167144775,
+ "learning_rate": 4.209729973475976e-06,
+ "loss": 0.4951,
+ "step": 3377
+ },
+ {
+ "epoch": 1.5971631205673757,
+ "grad_norm": 3.1907570362091064,
+ "learning_rate": 4.209274786527019e-06,
+ "loss": 0.6024,
+ "step": 3378
+ },
+ {
+ "epoch": 1.5976359338061465,
+ "grad_norm": 2.485245704650879,
+ "learning_rate": 4.2088194931490165e-06,
+ "loss": 0.5652,
+ "step": 3379
+ },
+ {
+ "epoch": 1.5981087470449173,
+ "grad_norm": 2.589310884475708,
+ "learning_rate": 4.208364093370317e-06,
+ "loss": 0.5085,
+ "step": 3380
+ },
+ {
+ "epoch": 1.598581560283688,
+ "grad_norm": 2.8941214084625244,
+ "learning_rate": 4.207908587219276e-06,
+ "loss": 0.53,
+ "step": 3381
+ },
+ {
+ "epoch": 1.5990543735224585,
+ "grad_norm": 2.480509042739868,
+ "learning_rate": 4.207452974724258e-06,
+ "loss": 0.4543,
+ "step": 3382
+ },
+ {
+ "epoch": 1.5995271867612293,
+ "grad_norm": 2.7884905338287354,
+ "learning_rate": 4.206997255913629e-06,
+ "loss": 0.5483,
+ "step": 3383
+ },
+ {
+ "epoch": 1.6,
+ "grad_norm": 2.7976696491241455,
+ "learning_rate": 4.206541430815766e-06,
+ "loss": 0.4734,
+ "step": 3384
+ },
+ {
+ "epoch": 1.6004728132387707,
+ "grad_norm": 2.5463132858276367,
+ "learning_rate": 4.206085499459051e-06,
+ "loss": 0.4931,
+ "step": 3385
+ },
+ {
+ "epoch": 1.6009456264775412,
+ "grad_norm": 2.8384251594543457,
+ "learning_rate": 4.205629461871871e-06,
+ "loss": 0.5066,
+ "step": 3386
+ },
+ {
+ "epoch": 1.601418439716312,
+ "grad_norm": 2.8578574657440186,
+ "learning_rate": 4.205173318082626e-06,
+ "loss": 0.458,
+ "step": 3387
+ },
+ {
+ "epoch": 1.6018912529550828,
+ "grad_norm": 2.7779932022094727,
+ "learning_rate": 4.204717068119715e-06,
+ "loss": 0.5293,
+ "step": 3388
+ },
+ {
+ "epoch": 1.6023640661938534,
+ "grad_norm": 2.9123778343200684,
+ "learning_rate": 4.204260712011546e-06,
+ "loss": 0.4866,
+ "step": 3389
+ },
+ {
+ "epoch": 1.602836879432624,
+ "grad_norm": 2.757922887802124,
+ "learning_rate": 4.203804249786537e-06,
+ "loss": 0.4925,
+ "step": 3390
+ },
+ {
+ "epoch": 1.6033096926713948,
+ "grad_norm": 3.287733316421509,
+ "learning_rate": 4.203347681473107e-06,
+ "loss": 0.6694,
+ "step": 3391
+ },
+ {
+ "epoch": 1.6037825059101656,
+ "grad_norm": 3.2117912769317627,
+ "learning_rate": 4.202891007099687e-06,
+ "loss": 0.5269,
+ "step": 3392
+ },
+ {
+ "epoch": 1.6042553191489362,
+ "grad_norm": 2.8489456176757812,
+ "learning_rate": 4.20243422669471e-06,
+ "loss": 0.5073,
+ "step": 3393
+ },
+ {
+ "epoch": 1.6047281323877067,
+ "grad_norm": 2.7660224437713623,
+ "learning_rate": 4.201977340286619e-06,
+ "loss": 0.5014,
+ "step": 3394
+ },
+ {
+ "epoch": 1.6052009456264775,
+ "grad_norm": 2.68182110786438,
+ "learning_rate": 4.201520347903862e-06,
+ "loss": 0.4542,
+ "step": 3395
+ },
+ {
+ "epoch": 1.6056737588652483,
+ "grad_norm": 2.7546045780181885,
+ "learning_rate": 4.2010632495748934e-06,
+ "loss": 0.516,
+ "step": 3396
+ },
+ {
+ "epoch": 1.606146572104019,
+ "grad_norm": 2.744668483734131,
+ "learning_rate": 4.200606045328176e-06,
+ "loss": 0.5243,
+ "step": 3397
+ },
+ {
+ "epoch": 1.6066193853427895,
+ "grad_norm": 2.935343027114868,
+ "learning_rate": 4.200148735192177e-06,
+ "loss": 0.5624,
+ "step": 3398
+ },
+ {
+ "epoch": 1.6070921985815603,
+ "grad_norm": 2.7392852306365967,
+ "learning_rate": 4.19969131919537e-06,
+ "loss": 0.5796,
+ "step": 3399
+ },
+ {
+ "epoch": 1.607565011820331,
+ "grad_norm": 2.864750385284424,
+ "learning_rate": 4.199233797366239e-06,
+ "loss": 0.549,
+ "step": 3400
+ },
+ {
+ "epoch": 1.6080378250591016,
+ "grad_norm": 2.684157371520996,
+ "learning_rate": 4.198776169733269e-06,
+ "loss": 0.5532,
+ "step": 3401
+ },
+ {
+ "epoch": 1.6085106382978722,
+ "grad_norm": 2.4717135429382324,
+ "learning_rate": 4.198318436324957e-06,
+ "loss": 0.5174,
+ "step": 3402
+ },
+ {
+ "epoch": 1.608983451536643,
+ "grad_norm": 2.640242338180542,
+ "learning_rate": 4.197860597169802e-06,
+ "loss": 0.5117,
+ "step": 3403
+ },
+ {
+ "epoch": 1.6094562647754138,
+ "grad_norm": 2.4957473278045654,
+ "learning_rate": 4.197402652296313e-06,
+ "loss": 0.474,
+ "step": 3404
+ },
+ {
+ "epoch": 1.6099290780141844,
+ "grad_norm": 2.416138172149658,
+ "learning_rate": 4.196944601733004e-06,
+ "loss": 0.4858,
+ "step": 3405
+ },
+ {
+ "epoch": 1.610401891252955,
+ "grad_norm": 2.4498109817504883,
+ "learning_rate": 4.196486445508395e-06,
+ "loss": 0.5048,
+ "step": 3406
+ },
+ {
+ "epoch": 1.6108747044917258,
+ "grad_norm": 2.415895938873291,
+ "learning_rate": 4.196028183651014e-06,
+ "loss": 0.4745,
+ "step": 3407
+ },
+ {
+ "epoch": 1.6113475177304966,
+ "grad_norm": 2.843665838241577,
+ "learning_rate": 4.195569816189395e-06,
+ "loss": 0.5219,
+ "step": 3408
+ },
+ {
+ "epoch": 1.6118203309692671,
+ "grad_norm": 2.608579158782959,
+ "learning_rate": 4.195111343152079e-06,
+ "loss": 0.4941,
+ "step": 3409
+ },
+ {
+ "epoch": 1.6122931442080377,
+ "grad_norm": 2.643789529800415,
+ "learning_rate": 4.194652764567611e-06,
+ "loss": 0.515,
+ "step": 3410
+ },
+ {
+ "epoch": 1.6127659574468085,
+ "grad_norm": 2.8099429607391357,
+ "learning_rate": 4.194194080464547e-06,
+ "loss": 0.4935,
+ "step": 3411
+ },
+ {
+ "epoch": 1.6132387706855793,
+ "grad_norm": 2.595628261566162,
+ "learning_rate": 4.193735290871446e-06,
+ "loss": 0.5571,
+ "step": 3412
+ },
+ {
+ "epoch": 1.6137115839243499,
+ "grad_norm": 2.7903778553009033,
+ "learning_rate": 4.193276395816876e-06,
+ "loss": 0.5228,
+ "step": 3413
+ },
+ {
+ "epoch": 1.6141843971631205,
+ "grad_norm": 2.83910870552063,
+ "learning_rate": 4.192817395329409e-06,
+ "loss": 0.6124,
+ "step": 3414
+ },
+ {
+ "epoch": 1.6146572104018913,
+ "grad_norm": 2.6155734062194824,
+ "learning_rate": 4.192358289437626e-06,
+ "loss": 0.552,
+ "step": 3415
+ },
+ {
+ "epoch": 1.615130023640662,
+ "grad_norm": 2.795832872390747,
+ "learning_rate": 4.191899078170113e-06,
+ "loss": 0.5561,
+ "step": 3416
+ },
+ {
+ "epoch": 1.6156028368794326,
+ "grad_norm": 2.3402161598205566,
+ "learning_rate": 4.191439761555464e-06,
+ "loss": 0.4889,
+ "step": 3417
+ },
+ {
+ "epoch": 1.6160756501182032,
+ "grad_norm": 3.1183433532714844,
+ "learning_rate": 4.190980339622276e-06,
+ "loss": 0.5337,
+ "step": 3418
+ },
+ {
+ "epoch": 1.616548463356974,
+ "grad_norm": 2.6262872219085693,
+ "learning_rate": 4.190520812399158e-06,
+ "loss": 0.525,
+ "step": 3419
+ },
+ {
+ "epoch": 1.6170212765957448,
+ "grad_norm": 2.578340530395508,
+ "learning_rate": 4.190061179914722e-06,
+ "loss": 0.4975,
+ "step": 3420
+ },
+ {
+ "epoch": 1.6174940898345154,
+ "grad_norm": 3.19482159614563,
+ "learning_rate": 4.189601442197586e-06,
+ "loss": 0.5832,
+ "step": 3421
+ },
+ {
+ "epoch": 1.617966903073286,
+ "grad_norm": 2.6398792266845703,
+ "learning_rate": 4.189141599276378e-06,
+ "loss": 0.4676,
+ "step": 3422
+ },
+ {
+ "epoch": 1.6184397163120567,
+ "grad_norm": 2.624865770339966,
+ "learning_rate": 4.1886816511797275e-06,
+ "loss": 0.4507,
+ "step": 3423
+ },
+ {
+ "epoch": 1.6189125295508275,
+ "grad_norm": 2.4136857986450195,
+ "learning_rate": 4.1882215979362775e-06,
+ "loss": 0.4616,
+ "step": 3424
+ },
+ {
+ "epoch": 1.6193853427895981,
+ "grad_norm": 2.6906614303588867,
+ "learning_rate": 4.18776143957467e-06,
+ "loss": 0.5142,
+ "step": 3425
+ },
+ {
+ "epoch": 1.6198581560283687,
+ "grad_norm": 2.5149154663085938,
+ "learning_rate": 4.187301176123558e-06,
+ "loss": 0.5252,
+ "step": 3426
+ },
+ {
+ "epoch": 1.6203309692671395,
+ "grad_norm": 2.677405834197998,
+ "learning_rate": 4.186840807611602e-06,
+ "loss": 0.4635,
+ "step": 3427
+ },
+ {
+ "epoch": 1.6208037825059103,
+ "grad_norm": 2.7164649963378906,
+ "learning_rate": 4.186380334067464e-06,
+ "loss": 0.5634,
+ "step": 3428
+ },
+ {
+ "epoch": 1.6212765957446809,
+ "grad_norm": 2.8299832344055176,
+ "learning_rate": 4.185919755519817e-06,
+ "loss": 0.5166,
+ "step": 3429
+ },
+ {
+ "epoch": 1.6217494089834514,
+ "grad_norm": 2.465848207473755,
+ "learning_rate": 4.18545907199734e-06,
+ "loss": 0.4696,
+ "step": 3430
+ },
+ {
+ "epoch": 1.6222222222222222,
+ "grad_norm": 2.407616376876831,
+ "learning_rate": 4.1849982835287175e-06,
+ "loss": 0.5111,
+ "step": 3431
+ },
+ {
+ "epoch": 1.622695035460993,
+ "grad_norm": 2.452146291732788,
+ "learning_rate": 4.184537390142639e-06,
+ "loss": 0.4574,
+ "step": 3432
+ },
+ {
+ "epoch": 1.6231678486997636,
+ "grad_norm": 2.653071165084839,
+ "learning_rate": 4.1840763918678055e-06,
+ "loss": 0.5611,
+ "step": 3433
+ },
+ {
+ "epoch": 1.6236406619385342,
+ "grad_norm": 2.5920350551605225,
+ "learning_rate": 4.183615288732919e-06,
+ "loss": 0.5437,
+ "step": 3434
+ },
+ {
+ "epoch": 1.624113475177305,
+ "grad_norm": 2.782900810241699,
+ "learning_rate": 4.18315408076669e-06,
+ "loss": 0.5824,
+ "step": 3435
+ },
+ {
+ "epoch": 1.6245862884160758,
+ "grad_norm": 2.8769774436950684,
+ "learning_rate": 4.1826927679978365e-06,
+ "loss": 0.5271,
+ "step": 3436
+ },
+ {
+ "epoch": 1.6250591016548463,
+ "grad_norm": 2.488598585128784,
+ "learning_rate": 4.182231350455084e-06,
+ "loss": 0.4684,
+ "step": 3437
+ },
+ {
+ "epoch": 1.625531914893617,
+ "grad_norm": 2.6472036838531494,
+ "learning_rate": 4.181769828167161e-06,
+ "loss": 0.5372,
+ "step": 3438
+ },
+ {
+ "epoch": 1.6260047281323877,
+ "grad_norm": 2.6498794555664062,
+ "learning_rate": 4.1813082011628045e-06,
+ "loss": 0.4805,
+ "step": 3439
+ },
+ {
+ "epoch": 1.6264775413711585,
+ "grad_norm": 2.5386533737182617,
+ "learning_rate": 4.1808464694707595e-06,
+ "loss": 0.5015,
+ "step": 3440
+ },
+ {
+ "epoch": 1.626950354609929,
+ "grad_norm": 2.8812551498413086,
+ "learning_rate": 4.180384633119775e-06,
+ "loss": 0.5225,
+ "step": 3441
+ },
+ {
+ "epoch": 1.6274231678486997,
+ "grad_norm": 2.870124578475952,
+ "learning_rate": 4.179922692138609e-06,
+ "loss": 0.537,
+ "step": 3442
+ },
+ {
+ "epoch": 1.6278959810874705,
+ "grad_norm": 2.5759785175323486,
+ "learning_rate": 4.179460646556021e-06,
+ "loss": 0.5142,
+ "step": 3443
+ },
+ {
+ "epoch": 1.6283687943262413,
+ "grad_norm": 2.629347324371338,
+ "learning_rate": 4.1789984964007836e-06,
+ "loss": 0.5007,
+ "step": 3444
+ },
+ {
+ "epoch": 1.6288416075650118,
+ "grad_norm": 2.751128673553467,
+ "learning_rate": 4.178536241701672e-06,
+ "loss": 0.5677,
+ "step": 3445
+ },
+ {
+ "epoch": 1.6293144208037824,
+ "grad_norm": 2.7582364082336426,
+ "learning_rate": 4.178073882487469e-06,
+ "loss": 0.499,
+ "step": 3446
+ },
+ {
+ "epoch": 1.6297872340425532,
+ "grad_norm": 3.136711359024048,
+ "learning_rate": 4.177611418786963e-06,
+ "loss": 0.5294,
+ "step": 3447
+ },
+ {
+ "epoch": 1.630260047281324,
+ "grad_norm": 2.7363100051879883,
+ "learning_rate": 4.17714885062895e-06,
+ "loss": 0.5264,
+ "step": 3448
+ },
+ {
+ "epoch": 1.6307328605200946,
+ "grad_norm": 2.7305946350097656,
+ "learning_rate": 4.176686178042233e-06,
+ "loss": 0.5235,
+ "step": 3449
+ },
+ {
+ "epoch": 1.6312056737588652,
+ "grad_norm": 2.6500556468963623,
+ "learning_rate": 4.176223401055619e-06,
+ "loss": 0.5463,
+ "step": 3450
+ },
+ {
+ "epoch": 1.631678486997636,
+ "grad_norm": 2.756321907043457,
+ "learning_rate": 4.175760519697924e-06,
+ "loss": 0.545,
+ "step": 3451
+ },
+ {
+ "epoch": 1.6321513002364068,
+ "grad_norm": 2.6234960556030273,
+ "learning_rate": 4.17529753399797e-06,
+ "loss": 0.4927,
+ "step": 3452
+ },
+ {
+ "epoch": 1.6326241134751773,
+ "grad_norm": 2.6358842849731445,
+ "learning_rate": 4.174834443984584e-06,
+ "loss": 0.5445,
+ "step": 3453
+ },
+ {
+ "epoch": 1.633096926713948,
+ "grad_norm": 2.541147470474243,
+ "learning_rate": 4.174371249686601e-06,
+ "loss": 0.4691,
+ "step": 3454
+ },
+ {
+ "epoch": 1.6335697399527187,
+ "grad_norm": 2.566981077194214,
+ "learning_rate": 4.173907951132863e-06,
+ "loss": 0.4932,
+ "step": 3455
+ },
+ {
+ "epoch": 1.6340425531914895,
+ "grad_norm": 2.670940399169922,
+ "learning_rate": 4.173444548352216e-06,
+ "loss": 0.4979,
+ "step": 3456
+ },
+ {
+ "epoch": 1.63451536643026,
+ "grad_norm": 2.5440268516540527,
+ "learning_rate": 4.172981041373515e-06,
+ "loss": 0.4716,
+ "step": 3457
+ },
+ {
+ "epoch": 1.6349881796690307,
+ "grad_norm": 2.3801631927490234,
+ "learning_rate": 4.17251743022562e-06,
+ "loss": 0.5126,
+ "step": 3458
+ },
+ {
+ "epoch": 1.6354609929078014,
+ "grad_norm": 2.5051121711730957,
+ "learning_rate": 4.1720537149373985e-06,
+ "loss": 0.4964,
+ "step": 3459
+ },
+ {
+ "epoch": 1.6359338061465722,
+ "grad_norm": 3.5521697998046875,
+ "learning_rate": 4.171589895537724e-06,
+ "loss": 0.5447,
+ "step": 3460
+ },
+ {
+ "epoch": 1.6364066193853428,
+ "grad_norm": 2.6041572093963623,
+ "learning_rate": 4.171125972055477e-06,
+ "loss": 0.4637,
+ "step": 3461
+ },
+ {
+ "epoch": 1.6368794326241134,
+ "grad_norm": 2.2297258377075195,
+ "learning_rate": 4.170661944519543e-06,
+ "loss": 0.4702,
+ "step": 3462
+ },
+ {
+ "epoch": 1.6373522458628842,
+ "grad_norm": 2.6764535903930664,
+ "learning_rate": 4.170197812958815e-06,
+ "loss": 0.5111,
+ "step": 3463
+ },
+ {
+ "epoch": 1.637825059101655,
+ "grad_norm": 2.86892032623291,
+ "learning_rate": 4.169733577402193e-06,
+ "loss": 0.5437,
+ "step": 3464
+ },
+ {
+ "epoch": 1.6382978723404256,
+ "grad_norm": 2.9007070064544678,
+ "learning_rate": 4.1692692378785825e-06,
+ "loss": 0.5425,
+ "step": 3465
+ },
+ {
+ "epoch": 1.6387706855791961,
+ "grad_norm": 2.5902905464172363,
+ "learning_rate": 4.168804794416896e-06,
+ "loss": 0.5252,
+ "step": 3466
+ },
+ {
+ "epoch": 1.639243498817967,
+ "grad_norm": 2.821183681488037,
+ "learning_rate": 4.168340247046053e-06,
+ "loss": 0.5265,
+ "step": 3467
+ },
+ {
+ "epoch": 1.6397163120567377,
+ "grad_norm": 2.7928314208984375,
+ "learning_rate": 4.167875595794978e-06,
+ "loss": 0.5151,
+ "step": 3468
+ },
+ {
+ "epoch": 1.6401891252955083,
+ "grad_norm": 2.3130412101745605,
+ "learning_rate": 4.167410840692603e-06,
+ "loss": 0.4941,
+ "step": 3469
+ },
+ {
+ "epoch": 1.6406619385342789,
+ "grad_norm": 2.6078619956970215,
+ "learning_rate": 4.1669459817678655e-06,
+ "loss": 0.493,
+ "step": 3470
+ },
+ {
+ "epoch": 1.6411347517730497,
+ "grad_norm": 2.5335731506347656,
+ "learning_rate": 4.166481019049712e-06,
+ "loss": 0.4969,
+ "step": 3471
+ },
+ {
+ "epoch": 1.6416075650118205,
+ "grad_norm": 2.8181469440460205,
+ "learning_rate": 4.166015952567093e-06,
+ "loss": 0.5062,
+ "step": 3472
+ },
+ {
+ "epoch": 1.642080378250591,
+ "grad_norm": 2.7256782054901123,
+ "learning_rate": 4.165550782348966e-06,
+ "loss": 0.5397,
+ "step": 3473
+ },
+ {
+ "epoch": 1.6425531914893616,
+ "grad_norm": 2.284345865249634,
+ "learning_rate": 4.1650855084242946e-06,
+ "loss": 0.4448,
+ "step": 3474
+ },
+ {
+ "epoch": 1.6430260047281324,
+ "grad_norm": 3.0383145809173584,
+ "learning_rate": 4.164620130822049e-06,
+ "loss": 0.5873,
+ "step": 3475
+ },
+ {
+ "epoch": 1.6434988179669032,
+ "grad_norm": 2.754448652267456,
+ "learning_rate": 4.1641546495712085e-06,
+ "loss": 0.4852,
+ "step": 3476
+ },
+ {
+ "epoch": 1.6439716312056738,
+ "grad_norm": 2.6820101737976074,
+ "learning_rate": 4.1636890647007535e-06,
+ "loss": 0.5325,
+ "step": 3477
+ },
+ {
+ "epoch": 1.6444444444444444,
+ "grad_norm": 2.6396398544311523,
+ "learning_rate": 4.163223376239676e-06,
+ "loss": 0.466,
+ "step": 3478
+ },
+ {
+ "epoch": 1.6449172576832152,
+ "grad_norm": 2.395049810409546,
+ "learning_rate": 4.162757584216972e-06,
+ "loss": 0.4531,
+ "step": 3479
+ },
+ {
+ "epoch": 1.645390070921986,
+ "grad_norm": 2.596670627593994,
+ "learning_rate": 4.162291688661645e-06,
+ "loss": 0.5207,
+ "step": 3480
+ },
+ {
+ "epoch": 1.6458628841607565,
+ "grad_norm": 2.4391872882843018,
+ "learning_rate": 4.161825689602703e-06,
+ "loss": 0.5133,
+ "step": 3481
+ },
+ {
+ "epoch": 1.6463356973995271,
+ "grad_norm": 2.6169841289520264,
+ "learning_rate": 4.161359587069162e-06,
+ "loss": 0.5096,
+ "step": 3482
+ },
+ {
+ "epoch": 1.646808510638298,
+ "grad_norm": 2.634089946746826,
+ "learning_rate": 4.1608933810900445e-06,
+ "loss": 0.4921,
+ "step": 3483
+ },
+ {
+ "epoch": 1.6472813238770687,
+ "grad_norm": 2.815877914428711,
+ "learning_rate": 4.160427071694379e-06,
+ "loss": 0.5045,
+ "step": 3484
+ },
+ {
+ "epoch": 1.6477541371158393,
+ "grad_norm": 2.417525053024292,
+ "learning_rate": 4.159960658911199e-06,
+ "loss": 0.4997,
+ "step": 3485
+ },
+ {
+ "epoch": 1.6482269503546099,
+ "grad_norm": 2.5713605880737305,
+ "learning_rate": 4.15949414276955e-06,
+ "loss": 0.5246,
+ "step": 3486
+ },
+ {
+ "epoch": 1.6486997635933807,
+ "grad_norm": 3.49833607673645,
+ "learning_rate": 4.159027523298475e-06,
+ "loss": 0.4901,
+ "step": 3487
+ },
+ {
+ "epoch": 1.6491725768321515,
+ "grad_norm": 2.985464334487915,
+ "learning_rate": 4.158560800527033e-06,
+ "loss": 0.5726,
+ "step": 3488
+ },
+ {
+ "epoch": 1.649645390070922,
+ "grad_norm": 2.72745680809021,
+ "learning_rate": 4.158093974484282e-06,
+ "loss": 0.5119,
+ "step": 3489
+ },
+ {
+ "epoch": 1.6501182033096926,
+ "grad_norm": 2.4885571002960205,
+ "learning_rate": 4.157627045199289e-06,
+ "loss": 0.4838,
+ "step": 3490
+ },
+ {
+ "epoch": 1.6505910165484634,
+ "grad_norm": 2.7622628211975098,
+ "learning_rate": 4.157160012701128e-06,
+ "loss": 0.5269,
+ "step": 3491
+ },
+ {
+ "epoch": 1.6510638297872342,
+ "grad_norm": 2.615122079849243,
+ "learning_rate": 4.156692877018879e-06,
+ "loss": 0.5501,
+ "step": 3492
+ },
+ {
+ "epoch": 1.6515366430260048,
+ "grad_norm": 2.827753782272339,
+ "learning_rate": 4.156225638181631e-06,
+ "loss": 0.5452,
+ "step": 3493
+ },
+ {
+ "epoch": 1.6520094562647754,
+ "grad_norm": 2.724820137023926,
+ "learning_rate": 4.155758296218474e-06,
+ "loss": 0.5155,
+ "step": 3494
+ },
+ {
+ "epoch": 1.6524822695035462,
+ "grad_norm": 2.5806174278259277,
+ "learning_rate": 4.155290851158508e-06,
+ "loss": 0.5292,
+ "step": 3495
+ },
+ {
+ "epoch": 1.652955082742317,
+ "grad_norm": 2.5655179023742676,
+ "learning_rate": 4.154823303030838e-06,
+ "loss": 0.4959,
+ "step": 3496
+ },
+ {
+ "epoch": 1.6534278959810875,
+ "grad_norm": 2.656548261642456,
+ "learning_rate": 4.154355651864579e-06,
+ "loss": 0.5703,
+ "step": 3497
+ },
+ {
+ "epoch": 1.653900709219858,
+ "grad_norm": 2.9085004329681396,
+ "learning_rate": 4.153887897688847e-06,
+ "loss": 0.5061,
+ "step": 3498
+ },
+ {
+ "epoch": 1.654373522458629,
+ "grad_norm": 2.608010768890381,
+ "learning_rate": 4.1534200405327665e-06,
+ "loss": 0.5165,
+ "step": 3499
+ },
+ {
+ "epoch": 1.6548463356973995,
+ "grad_norm": 2.600463628768921,
+ "learning_rate": 4.152952080425471e-06,
+ "loss": 0.4946,
+ "step": 3500
+ },
+ {
+ "epoch": 1.65531914893617,
+ "grad_norm": 2.5561563968658447,
+ "learning_rate": 4.152484017396098e-06,
+ "loss": 0.4804,
+ "step": 3501
+ },
+ {
+ "epoch": 1.6557919621749408,
+ "grad_norm": 2.788594961166382,
+ "learning_rate": 4.152015851473791e-06,
+ "loss": 0.5635,
+ "step": 3502
+ },
+ {
+ "epoch": 1.6562647754137116,
+ "grad_norm": 2.693302631378174,
+ "learning_rate": 4.151547582687699e-06,
+ "loss": 0.5139,
+ "step": 3503
+ },
+ {
+ "epoch": 1.6567375886524822,
+ "grad_norm": 2.7887485027313232,
+ "learning_rate": 4.1510792110669825e-06,
+ "loss": 0.4952,
+ "step": 3504
+ },
+ {
+ "epoch": 1.6572104018912528,
+ "grad_norm": 2.8982298374176025,
+ "learning_rate": 4.150610736640803e-06,
+ "loss": 0.4136,
+ "step": 3505
+ },
+ {
+ "epoch": 1.6576832151300236,
+ "grad_norm": 2.7569408416748047,
+ "learning_rate": 4.150142159438331e-06,
+ "loss": 0.5272,
+ "step": 3506
+ },
+ {
+ "epoch": 1.6581560283687944,
+ "grad_norm": 2.531648874282837,
+ "learning_rate": 4.149673479488742e-06,
+ "loss": 0.5016,
+ "step": 3507
+ },
+ {
+ "epoch": 1.658628841607565,
+ "grad_norm": 2.7706353664398193,
+ "learning_rate": 4.149204696821219e-06,
+ "loss": 0.5512,
+ "step": 3508
+ },
+ {
+ "epoch": 1.6591016548463355,
+ "grad_norm": 2.7307450771331787,
+ "learning_rate": 4.148735811464951e-06,
+ "loss": 0.4968,
+ "step": 3509
+ },
+ {
+ "epoch": 1.6595744680851063,
+ "grad_norm": 3.0097429752349854,
+ "learning_rate": 4.1482668234491335e-06,
+ "loss": 0.4797,
+ "step": 3510
+ },
+ {
+ "epoch": 1.6600472813238771,
+ "grad_norm": 2.6045308113098145,
+ "learning_rate": 4.147797732802969e-06,
+ "loss": 0.5496,
+ "step": 3511
+ },
+ {
+ "epoch": 1.6605200945626477,
+ "grad_norm": 2.702061176300049,
+ "learning_rate": 4.147328539555664e-06,
+ "loss": 0.5302,
+ "step": 3512
+ },
+ {
+ "epoch": 1.6609929078014183,
+ "grad_norm": 3.3724892139434814,
+ "learning_rate": 4.1468592437364356e-06,
+ "loss": 0.5124,
+ "step": 3513
+ },
+ {
+ "epoch": 1.661465721040189,
+ "grad_norm": 2.5117242336273193,
+ "learning_rate": 4.146389845374502e-06,
+ "loss": 0.4953,
+ "step": 3514
+ },
+ {
+ "epoch": 1.6619385342789599,
+ "grad_norm": 2.86547589302063,
+ "learning_rate": 4.145920344499092e-06,
+ "loss": 0.5337,
+ "step": 3515
+ },
+ {
+ "epoch": 1.6624113475177305,
+ "grad_norm": 2.745149850845337,
+ "learning_rate": 4.14545074113944e-06,
+ "loss": 0.5187,
+ "step": 3516
+ },
+ {
+ "epoch": 1.662884160756501,
+ "grad_norm": 2.5560994148254395,
+ "learning_rate": 4.1449810353247855e-06,
+ "loss": 0.5183,
+ "step": 3517
+ },
+ {
+ "epoch": 1.6633569739952718,
+ "grad_norm": 2.2318122386932373,
+ "learning_rate": 4.144511227084374e-06,
+ "loss": 0.4452,
+ "step": 3518
+ },
+ {
+ "epoch": 1.6638297872340426,
+ "grad_norm": 2.6980903148651123,
+ "learning_rate": 4.14404131644746e-06,
+ "loss": 0.4974,
+ "step": 3519
+ },
+ {
+ "epoch": 1.6643026004728132,
+ "grad_norm": 2.6875357627868652,
+ "learning_rate": 4.1435713034433025e-06,
+ "loss": 0.4582,
+ "step": 3520
+ },
+ {
+ "epoch": 1.6647754137115838,
+ "grad_norm": 2.9430019855499268,
+ "learning_rate": 4.143101188101166e-06,
+ "loss": 0.5004,
+ "step": 3521
+ },
+ {
+ "epoch": 1.6652482269503546,
+ "grad_norm": 2.4447221755981445,
+ "learning_rate": 4.142630970450323e-06,
+ "loss": 0.5436,
+ "step": 3522
+ },
+ {
+ "epoch": 1.6657210401891254,
+ "grad_norm": 2.571023941040039,
+ "learning_rate": 4.142160650520053e-06,
+ "loss": 0.5307,
+ "step": 3523
+ },
+ {
+ "epoch": 1.666193853427896,
+ "grad_norm": 2.9725306034088135,
+ "learning_rate": 4.14169022833964e-06,
+ "loss": 0.5918,
+ "step": 3524
+ },
+ {
+ "epoch": 1.6666666666666665,
+ "grad_norm": 2.5958926677703857,
+ "learning_rate": 4.141219703938375e-06,
+ "loss": 0.5036,
+ "step": 3525
+ },
+ {
+ "epoch": 1.6671394799054373,
+ "grad_norm": 2.935788631439209,
+ "learning_rate": 4.140749077345556e-06,
+ "loss": 0.5773,
+ "step": 3526
+ },
+ {
+ "epoch": 1.6676122931442081,
+ "grad_norm": 2.5460526943206787,
+ "learning_rate": 4.140278348590485e-06,
+ "loss": 0.4762,
+ "step": 3527
+ },
+ {
+ "epoch": 1.6680851063829787,
+ "grad_norm": 2.5729143619537354,
+ "learning_rate": 4.139807517702475e-06,
+ "loss": 0.5515,
+ "step": 3528
+ },
+ {
+ "epoch": 1.6685579196217493,
+ "grad_norm": 2.4377381801605225,
+ "learning_rate": 4.13933658471084e-06,
+ "loss": 0.5383,
+ "step": 3529
+ },
+ {
+ "epoch": 1.66903073286052,
+ "grad_norm": 2.6284425258636475,
+ "learning_rate": 4.138865549644905e-06,
+ "loss": 0.5396,
+ "step": 3530
+ },
+ {
+ "epoch": 1.6695035460992909,
+ "grad_norm": 2.857250928878784,
+ "learning_rate": 4.138394412533998e-06,
+ "loss": 0.5861,
+ "step": 3531
+ },
+ {
+ "epoch": 1.6699763593380614,
+ "grad_norm": 2.9226012229919434,
+ "learning_rate": 4.137923173407456e-06,
+ "loss": 0.5262,
+ "step": 3532
+ },
+ {
+ "epoch": 1.670449172576832,
+ "grad_norm": 4.839131832122803,
+ "learning_rate": 4.137451832294619e-06,
+ "loss": 0.651,
+ "step": 3533
+ },
+ {
+ "epoch": 1.6709219858156028,
+ "grad_norm": 2.4727771282196045,
+ "learning_rate": 4.1369803892248375e-06,
+ "loss": 0.5149,
+ "step": 3534
+ },
+ {
+ "epoch": 1.6713947990543736,
+ "grad_norm": 2.5391688346862793,
+ "learning_rate": 4.1365088442274635e-06,
+ "loss": 0.4907,
+ "step": 3535
+ },
+ {
+ "epoch": 1.6718676122931442,
+ "grad_norm": 2.5168209075927734,
+ "learning_rate": 4.136037197331862e-06,
+ "loss": 0.5091,
+ "step": 3536
+ },
+ {
+ "epoch": 1.6723404255319148,
+ "grad_norm": 2.6278600692749023,
+ "learning_rate": 4.135565448567396e-06,
+ "loss": 0.4357,
+ "step": 3537
+ },
+ {
+ "epoch": 1.6728132387706856,
+ "grad_norm": 2.835184097290039,
+ "learning_rate": 4.135093597963441e-06,
+ "loss": 0.4786,
+ "step": 3538
+ },
+ {
+ "epoch": 1.6732860520094563,
+ "grad_norm": 2.385328531265259,
+ "learning_rate": 4.134621645549379e-06,
+ "loss": 0.4849,
+ "step": 3539
+ },
+ {
+ "epoch": 1.673758865248227,
+ "grad_norm": 2.6504149436950684,
+ "learning_rate": 4.134149591354593e-06,
+ "loss": 0.6037,
+ "step": 3540
+ },
+ {
+ "epoch": 1.6742316784869975,
+ "grad_norm": 2.945634126663208,
+ "learning_rate": 4.1336774354084786e-06,
+ "loss": 0.532,
+ "step": 3541
+ },
+ {
+ "epoch": 1.6747044917257683,
+ "grad_norm": 2.8373215198516846,
+ "learning_rate": 4.133205177740434e-06,
+ "loss": 0.5138,
+ "step": 3542
+ },
+ {
+ "epoch": 1.675177304964539,
+ "grad_norm": 2.6616621017456055,
+ "learning_rate": 4.1327328183798634e-06,
+ "loss": 0.5543,
+ "step": 3543
+ },
+ {
+ "epoch": 1.6756501182033097,
+ "grad_norm": 3.0843071937561035,
+ "learning_rate": 4.13226035735618e-06,
+ "loss": 0.6585,
+ "step": 3544
+ },
+ {
+ "epoch": 1.6761229314420802,
+ "grad_norm": 2.2214272022247314,
+ "learning_rate": 4.131787794698802e-06,
+ "loss": 0.5413,
+ "step": 3545
+ },
+ {
+ "epoch": 1.676595744680851,
+ "grad_norm": 2.4515018463134766,
+ "learning_rate": 4.131315130437152e-06,
+ "loss": 0.4966,
+ "step": 3546
+ },
+ {
+ "epoch": 1.6770685579196218,
+ "grad_norm": 2.647414207458496,
+ "learning_rate": 4.130842364600663e-06,
+ "loss": 0.5401,
+ "step": 3547
+ },
+ {
+ "epoch": 1.6775413711583924,
+ "grad_norm": 2.648941993713379,
+ "learning_rate": 4.13036949721877e-06,
+ "loss": 0.4796,
+ "step": 3548
+ },
+ {
+ "epoch": 1.678014184397163,
+ "grad_norm": 2.7835679054260254,
+ "learning_rate": 4.129896528320919e-06,
+ "loss": 0.5653,
+ "step": 3549
+ },
+ {
+ "epoch": 1.6784869976359338,
+ "grad_norm": 2.995964288711548,
+ "learning_rate": 4.129423457936556e-06,
+ "loss": 0.4999,
+ "step": 3550
+ },
+ {
+ "epoch": 1.6789598108747046,
+ "grad_norm": 2.5980007648468018,
+ "learning_rate": 4.1289502860951405e-06,
+ "loss": 0.5177,
+ "step": 3551
+ },
+ {
+ "epoch": 1.6794326241134752,
+ "grad_norm": 2.442254066467285,
+ "learning_rate": 4.128477012826133e-06,
+ "loss": 0.5062,
+ "step": 3552
+ },
+ {
+ "epoch": 1.6799054373522457,
+ "grad_norm": 2.3007538318634033,
+ "learning_rate": 4.1280036381590025e-06,
+ "loss": 0.5029,
+ "step": 3553
+ },
+ {
+ "epoch": 1.6803782505910165,
+ "grad_norm": 2.4169347286224365,
+ "learning_rate": 4.1275301621232245e-06,
+ "loss": 0.515,
+ "step": 3554
+ },
+ {
+ "epoch": 1.6808510638297873,
+ "grad_norm": 2.6456379890441895,
+ "learning_rate": 4.127056584748279e-06,
+ "loss": 0.5343,
+ "step": 3555
+ },
+ {
+ "epoch": 1.681323877068558,
+ "grad_norm": 2.6406595706939697,
+ "learning_rate": 4.1265829060636546e-06,
+ "loss": 0.5047,
+ "step": 3556
+ },
+ {
+ "epoch": 1.6817966903073285,
+ "grad_norm": 2.9344475269317627,
+ "learning_rate": 4.126109126098846e-06,
+ "loss": 0.5501,
+ "step": 3557
+ },
+ {
+ "epoch": 1.6822695035460993,
+ "grad_norm": 2.3292455673217773,
+ "learning_rate": 4.125635244883351e-06,
+ "loss": 0.463,
+ "step": 3558
+ },
+ {
+ "epoch": 1.68274231678487,
+ "grad_norm": 2.4150657653808594,
+ "learning_rate": 4.125161262446677e-06,
+ "loss": 0.4802,
+ "step": 3559
+ },
+ {
+ "epoch": 1.6832151300236406,
+ "grad_norm": 2.604292392730713,
+ "learning_rate": 4.124687178818339e-06,
+ "loss": 0.5683,
+ "step": 3560
+ },
+ {
+ "epoch": 1.6836879432624112,
+ "grad_norm": 2.5676791667938232,
+ "learning_rate": 4.1242129940278544e-06,
+ "loss": 0.5519,
+ "step": 3561
+ },
+ {
+ "epoch": 1.684160756501182,
+ "grad_norm": 3.078514814376831,
+ "learning_rate": 4.123738708104748e-06,
+ "loss": 0.5194,
+ "step": 3562
+ },
+ {
+ "epoch": 1.6846335697399528,
+ "grad_norm": 2.893577814102173,
+ "learning_rate": 4.123264321078552e-06,
+ "loss": 0.5107,
+ "step": 3563
+ },
+ {
+ "epoch": 1.6851063829787234,
+ "grad_norm": 2.772413730621338,
+ "learning_rate": 4.122789832978804e-06,
+ "loss": 0.6147,
+ "step": 3564
+ },
+ {
+ "epoch": 1.685579196217494,
+ "grad_norm": 2.5804643630981445,
+ "learning_rate": 4.12231524383505e-06,
+ "loss": 0.5057,
+ "step": 3565
+ },
+ {
+ "epoch": 1.6860520094562648,
+ "grad_norm": 2.599571466445923,
+ "learning_rate": 4.121840553676839e-06,
+ "loss": 0.5591,
+ "step": 3566
+ },
+ {
+ "epoch": 1.6865248226950356,
+ "grad_norm": 2.9124577045440674,
+ "learning_rate": 4.1213657625337275e-06,
+ "loss": 0.565,
+ "step": 3567
+ },
+ {
+ "epoch": 1.6869976359338061,
+ "grad_norm": 2.6582155227661133,
+ "learning_rate": 4.120890870435281e-06,
+ "loss": 0.4607,
+ "step": 3568
+ },
+ {
+ "epoch": 1.6874704491725767,
+ "grad_norm": 2.929227590560913,
+ "learning_rate": 4.120415877411066e-06,
+ "loss": 0.5705,
+ "step": 3569
+ },
+ {
+ "epoch": 1.6879432624113475,
+ "grad_norm": 2.4443247318267822,
+ "learning_rate": 4.11994078349066e-06,
+ "loss": 0.4592,
+ "step": 3570
+ },
+ {
+ "epoch": 1.6884160756501183,
+ "grad_norm": 2.4799163341522217,
+ "learning_rate": 4.119465588703645e-06,
+ "loss": 0.5361,
+ "step": 3571
+ },
+ {
+ "epoch": 1.6888888888888889,
+ "grad_norm": 2.9408936500549316,
+ "learning_rate": 4.1189902930796085e-06,
+ "loss": 0.5347,
+ "step": 3572
+ },
+ {
+ "epoch": 1.6893617021276595,
+ "grad_norm": 3.3348076343536377,
+ "learning_rate": 4.118514896648146e-06,
+ "loss": 0.5612,
+ "step": 3573
+ },
+ {
+ "epoch": 1.6898345153664303,
+ "grad_norm": 2.764889717102051,
+ "learning_rate": 4.118039399438857e-06,
+ "loss": 0.4745,
+ "step": 3574
+ },
+ {
+ "epoch": 1.690307328605201,
+ "grad_norm": 2.7023751735687256,
+ "learning_rate": 4.11756380148135e-06,
+ "loss": 0.5106,
+ "step": 3575
+ },
+ {
+ "epoch": 1.6907801418439716,
+ "grad_norm": 2.8816208839416504,
+ "learning_rate": 4.117088102805238e-06,
+ "loss": 0.6016,
+ "step": 3576
+ },
+ {
+ "epoch": 1.6912529550827422,
+ "grad_norm": 2.215733289718628,
+ "learning_rate": 4.11661230344014e-06,
+ "loss": 0.4404,
+ "step": 3577
+ },
+ {
+ "epoch": 1.691725768321513,
+ "grad_norm": 2.8190999031066895,
+ "learning_rate": 4.116136403415683e-06,
+ "loss": 0.5038,
+ "step": 3578
+ },
+ {
+ "epoch": 1.6921985815602838,
+ "grad_norm": 2.616424083709717,
+ "learning_rate": 4.115660402761499e-06,
+ "loss": 0.5493,
+ "step": 3579
+ },
+ {
+ "epoch": 1.6926713947990544,
+ "grad_norm": 2.7738113403320312,
+ "learning_rate": 4.115184301507226e-06,
+ "loss": 0.5416,
+ "step": 3580
+ },
+ {
+ "epoch": 1.693144208037825,
+ "grad_norm": 2.4793593883514404,
+ "learning_rate": 4.114708099682509e-06,
+ "loss": 0.4526,
+ "step": 3581
+ },
+ {
+ "epoch": 1.6936170212765957,
+ "grad_norm": 2.390652894973755,
+ "learning_rate": 4.114231797316999e-06,
+ "loss": 0.4908,
+ "step": 3582
+ },
+ {
+ "epoch": 1.6940898345153665,
+ "grad_norm": 2.513197660446167,
+ "learning_rate": 4.113755394440352e-06,
+ "loss": 0.4738,
+ "step": 3583
+ },
+ {
+ "epoch": 1.6945626477541371,
+ "grad_norm": 2.504497766494751,
+ "learning_rate": 4.113278891082234e-06,
+ "loss": 0.4661,
+ "step": 3584
+ },
+ {
+ "epoch": 1.6950354609929077,
+ "grad_norm": 2.4966917037963867,
+ "learning_rate": 4.112802287272314e-06,
+ "loss": 0.4979,
+ "step": 3585
+ },
+ {
+ "epoch": 1.6955082742316785,
+ "grad_norm": 2.3129689693450928,
+ "learning_rate": 4.112325583040265e-06,
+ "loss": 0.4933,
+ "step": 3586
+ },
+ {
+ "epoch": 1.6959810874704493,
+ "grad_norm": 2.822136878967285,
+ "learning_rate": 4.111848778415774e-06,
+ "loss": 0.5087,
+ "step": 3587
+ },
+ {
+ "epoch": 1.6964539007092199,
+ "grad_norm": 2.5181210041046143,
+ "learning_rate": 4.111371873428527e-06,
+ "loss": 0.4836,
+ "step": 3588
+ },
+ {
+ "epoch": 1.6969267139479904,
+ "grad_norm": 2.7564687728881836,
+ "learning_rate": 4.110894868108218e-06,
+ "loss": 0.5224,
+ "step": 3589
+ },
+ {
+ "epoch": 1.6973995271867612,
+ "grad_norm": 2.424421787261963,
+ "learning_rate": 4.11041776248455e-06,
+ "loss": 0.4552,
+ "step": 3590
+ },
+ {
+ "epoch": 1.697872340425532,
+ "grad_norm": 2.7013823986053467,
+ "learning_rate": 4.10994055658723e-06,
+ "loss": 0.5535,
+ "step": 3591
+ },
+ {
+ "epoch": 1.6983451536643026,
+ "grad_norm": 2.5660946369171143,
+ "learning_rate": 4.10946325044597e-06,
+ "loss": 0.5351,
+ "step": 3592
+ },
+ {
+ "epoch": 1.6988179669030732,
+ "grad_norm": 2.5598108768463135,
+ "learning_rate": 4.10898584409049e-06,
+ "loss": 0.5246,
+ "step": 3593
+ },
+ {
+ "epoch": 1.699290780141844,
+ "grad_norm": 2.6318907737731934,
+ "learning_rate": 4.108508337550518e-06,
+ "loss": 0.5002,
+ "step": 3594
+ },
+ {
+ "epoch": 1.6997635933806148,
+ "grad_norm": 2.527099132537842,
+ "learning_rate": 4.108030730855784e-06,
+ "loss": 0.5366,
+ "step": 3595
+ },
+ {
+ "epoch": 1.7002364066193854,
+ "grad_norm": 2.8629603385925293,
+ "learning_rate": 4.107553024036029e-06,
+ "loss": 0.5742,
+ "step": 3596
+ },
+ {
+ "epoch": 1.700709219858156,
+ "grad_norm": 2.8084018230438232,
+ "learning_rate": 4.107075217120994e-06,
+ "loss": 0.5618,
+ "step": 3597
+ },
+ {
+ "epoch": 1.7011820330969267,
+ "grad_norm": 3.6470065116882324,
+ "learning_rate": 4.1065973101404325e-06,
+ "loss": 0.508,
+ "step": 3598
+ },
+ {
+ "epoch": 1.7016548463356975,
+ "grad_norm": 3.0332422256469727,
+ "learning_rate": 4.106119303124102e-06,
+ "loss": 0.51,
+ "step": 3599
+ },
+ {
+ "epoch": 1.702127659574468,
+ "grad_norm": 2.4887590408325195,
+ "learning_rate": 4.105641196101765e-06,
+ "loss": 0.5109,
+ "step": 3600
+ },
+ {
+ "epoch": 1.7026004728132387,
+ "grad_norm": 2.6102066040039062,
+ "learning_rate": 4.105162989103191e-06,
+ "loss": 0.5278,
+ "step": 3601
+ },
+ {
+ "epoch": 1.7030732860520095,
+ "grad_norm": 2.771578073501587,
+ "learning_rate": 4.104684682158156e-06,
+ "loss": 0.498,
+ "step": 3602
+ },
+ {
+ "epoch": 1.7035460992907803,
+ "grad_norm": 2.5452702045440674,
+ "learning_rate": 4.1042062752964425e-06,
+ "loss": 0.4939,
+ "step": 3603
+ },
+ {
+ "epoch": 1.7040189125295508,
+ "grad_norm": 2.4287021160125732,
+ "learning_rate": 4.103727768547838e-06,
+ "loss": 0.4819,
+ "step": 3604
+ },
+ {
+ "epoch": 1.7044917257683214,
+ "grad_norm": 2.412280321121216,
+ "learning_rate": 4.103249161942138e-06,
+ "loss": 0.5196,
+ "step": 3605
+ },
+ {
+ "epoch": 1.7049645390070922,
+ "grad_norm": 2.8850717544555664,
+ "learning_rate": 4.102770455509142e-06,
+ "loss": 0.5724,
+ "step": 3606
+ },
+ {
+ "epoch": 1.705437352245863,
+ "grad_norm": 2.7979609966278076,
+ "learning_rate": 4.102291649278659e-06,
+ "loss": 0.5295,
+ "step": 3607
+ },
+ {
+ "epoch": 1.7059101654846336,
+ "grad_norm": 2.762238025665283,
+ "learning_rate": 4.1018127432805e-06,
+ "loss": 0.5166,
+ "step": 3608
+ },
+ {
+ "epoch": 1.7063829787234042,
+ "grad_norm": 2.921586513519287,
+ "learning_rate": 4.101333737544485e-06,
+ "loss": 0.5607,
+ "step": 3609
+ },
+ {
+ "epoch": 1.706855791962175,
+ "grad_norm": 3.001929998397827,
+ "learning_rate": 4.100854632100439e-06,
+ "loss": 0.6255,
+ "step": 3610
+ },
+ {
+ "epoch": 1.7073286052009458,
+ "grad_norm": 2.752713918685913,
+ "learning_rate": 4.100375426978196e-06,
+ "loss": 0.5732,
+ "step": 3611
+ },
+ {
+ "epoch": 1.7078014184397163,
+ "grad_norm": 2.6496472358703613,
+ "learning_rate": 4.099896122207593e-06,
+ "loss": 0.5138,
+ "step": 3612
+ },
+ {
+ "epoch": 1.708274231678487,
+ "grad_norm": 3.0079452991485596,
+ "learning_rate": 4.099416717818473e-06,
+ "loss": 0.5746,
+ "step": 3613
+ },
+ {
+ "epoch": 1.7087470449172577,
+ "grad_norm": 2.5762360095977783,
+ "learning_rate": 4.098937213840687e-06,
+ "loss": 0.5308,
+ "step": 3614
+ },
+ {
+ "epoch": 1.7092198581560285,
+ "grad_norm": 2.6026158332824707,
+ "learning_rate": 4.098457610304092e-06,
+ "loss": 0.4857,
+ "step": 3615
+ },
+ {
+ "epoch": 1.709692671394799,
+ "grad_norm": 2.587583541870117,
+ "learning_rate": 4.097977907238551e-06,
+ "loss": 0.4591,
+ "step": 3616
+ },
+ {
+ "epoch": 1.7101654846335697,
+ "grad_norm": 2.6996991634368896,
+ "learning_rate": 4.097498104673932e-06,
+ "loss": 0.5298,
+ "step": 3617
+ },
+ {
+ "epoch": 1.7106382978723405,
+ "grad_norm": 2.600029945373535,
+ "learning_rate": 4.097018202640111e-06,
+ "loss": 0.4726,
+ "step": 3618
+ },
+ {
+ "epoch": 1.7111111111111112,
+ "grad_norm": 2.8261220455169678,
+ "learning_rate": 4.096538201166969e-06,
+ "loss": 0.5242,
+ "step": 3619
+ },
+ {
+ "epoch": 1.7115839243498818,
+ "grad_norm": 3.053027629852295,
+ "learning_rate": 4.096058100284394e-06,
+ "loss": 0.5568,
+ "step": 3620
+ },
+ {
+ "epoch": 1.7120567375886524,
+ "grad_norm": 2.9638442993164062,
+ "learning_rate": 4.0955779000222805e-06,
+ "loss": 0.5325,
+ "step": 3621
+ },
+ {
+ "epoch": 1.7125295508274232,
+ "grad_norm": 2.731095790863037,
+ "learning_rate": 4.095097600410527e-06,
+ "loss": 0.4733,
+ "step": 3622
+ },
+ {
+ "epoch": 1.713002364066194,
+ "grad_norm": 2.632490873336792,
+ "learning_rate": 4.09461720147904e-06,
+ "loss": 0.5253,
+ "step": 3623
+ },
+ {
+ "epoch": 1.7134751773049646,
+ "grad_norm": 2.847689390182495,
+ "learning_rate": 4.094136703257732e-06,
+ "loss": 0.57,
+ "step": 3624
+ },
+ {
+ "epoch": 1.7139479905437351,
+ "grad_norm": 3.1078696250915527,
+ "learning_rate": 4.0936561057765215e-06,
+ "loss": 0.5368,
+ "step": 3625
+ },
+ {
+ "epoch": 1.714420803782506,
+ "grad_norm": 2.696349620819092,
+ "learning_rate": 4.0931754090653334e-06,
+ "loss": 0.491,
+ "step": 3626
+ },
+ {
+ "epoch": 1.7148936170212767,
+ "grad_norm": 2.712958812713623,
+ "learning_rate": 4.092694613154099e-06,
+ "loss": 0.5768,
+ "step": 3627
+ },
+ {
+ "epoch": 1.7153664302600473,
+ "grad_norm": 2.5421478748321533,
+ "learning_rate": 4.092213718072754e-06,
+ "loss": 0.4839,
+ "step": 3628
+ },
+ {
+ "epoch": 1.715839243498818,
+ "grad_norm": 2.5176162719726562,
+ "learning_rate": 4.091732723851243e-06,
+ "loss": 0.5049,
+ "step": 3629
+ },
+ {
+ "epoch": 1.7163120567375887,
+ "grad_norm": 2.642185926437378,
+ "learning_rate": 4.091251630519514e-06,
+ "loss": 0.589,
+ "step": 3630
+ },
+ {
+ "epoch": 1.7167848699763595,
+ "grad_norm": 2.587348461151123,
+ "learning_rate": 4.0907704381075245e-06,
+ "loss": 0.5281,
+ "step": 3631
+ },
+ {
+ "epoch": 1.71725768321513,
+ "grad_norm": 2.4628195762634277,
+ "learning_rate": 4.090289146645234e-06,
+ "loss": 0.5592,
+ "step": 3632
+ },
+ {
+ "epoch": 1.7177304964539006,
+ "grad_norm": 2.2751028537750244,
+ "learning_rate": 4.0898077561626125e-06,
+ "loss": 0.502,
+ "step": 3633
+ },
+ {
+ "epoch": 1.7182033096926714,
+ "grad_norm": 2.7712769508361816,
+ "learning_rate": 4.089326266689632e-06,
+ "loss": 0.5143,
+ "step": 3634
+ },
+ {
+ "epoch": 1.7186761229314422,
+ "grad_norm": 2.5297727584838867,
+ "learning_rate": 4.088844678256275e-06,
+ "loss": 0.5035,
+ "step": 3635
+ },
+ {
+ "epoch": 1.7191489361702128,
+ "grad_norm": 2.739130735397339,
+ "learning_rate": 4.088362990892527e-06,
+ "loss": 0.5959,
+ "step": 3636
+ },
+ {
+ "epoch": 1.7196217494089834,
+ "grad_norm": 2.3708314895629883,
+ "learning_rate": 4.08788120462838e-06,
+ "loss": 0.4796,
+ "step": 3637
+ },
+ {
+ "epoch": 1.7200945626477542,
+ "grad_norm": 2.7664241790771484,
+ "learning_rate": 4.087399319493832e-06,
+ "loss": 0.6052,
+ "step": 3638
+ },
+ {
+ "epoch": 1.720567375886525,
+ "grad_norm": 2.5900204181671143,
+ "learning_rate": 4.0869173355188895e-06,
+ "loss": 0.4955,
+ "step": 3639
+ },
+ {
+ "epoch": 1.7210401891252955,
+ "grad_norm": 2.6771862506866455,
+ "learning_rate": 4.0864352527335635e-06,
+ "loss": 0.4889,
+ "step": 3640
+ },
+ {
+ "epoch": 1.7215130023640661,
+ "grad_norm": 2.888479471206665,
+ "learning_rate": 4.085953071167871e-06,
+ "loss": 0.5719,
+ "step": 3641
+ },
+ {
+ "epoch": 1.721985815602837,
+ "grad_norm": 2.5967187881469727,
+ "learning_rate": 4.085470790851833e-06,
+ "loss": 0.4959,
+ "step": 3642
+ },
+ {
+ "epoch": 1.7224586288416077,
+ "grad_norm": 2.5317695140838623,
+ "learning_rate": 4.084988411815483e-06,
+ "loss": 0.4596,
+ "step": 3643
+ },
+ {
+ "epoch": 1.7229314420803783,
+ "grad_norm": 2.6531455516815186,
+ "learning_rate": 4.084505934088853e-06,
+ "loss": 0.5346,
+ "step": 3644
+ },
+ {
+ "epoch": 1.7234042553191489,
+ "grad_norm": 2.6525208950042725,
+ "learning_rate": 4.084023357701987e-06,
+ "loss": 0.5178,
+ "step": 3645
+ },
+ {
+ "epoch": 1.7238770685579197,
+ "grad_norm": 2.461954116821289,
+ "learning_rate": 4.083540682684932e-06,
+ "loss": 0.4802,
+ "step": 3646
+ },
+ {
+ "epoch": 1.7243498817966905,
+ "grad_norm": 2.794696807861328,
+ "learning_rate": 4.083057909067743e-06,
+ "loss": 0.5148,
+ "step": 3647
+ },
+ {
+ "epoch": 1.724822695035461,
+ "grad_norm": 2.867572546005249,
+ "learning_rate": 4.082575036880479e-06,
+ "loss": 0.5352,
+ "step": 3648
+ },
+ {
+ "epoch": 1.7252955082742316,
+ "grad_norm": 2.642820358276367,
+ "learning_rate": 4.082092066153207e-06,
+ "loss": 0.4652,
+ "step": 3649
+ },
+ {
+ "epoch": 1.7257683215130024,
+ "grad_norm": 2.782142400741577,
+ "learning_rate": 4.081608996915999e-06,
+ "loss": 0.5591,
+ "step": 3650
+ },
+ {
+ "epoch": 1.7262411347517732,
+ "grad_norm": 2.327331304550171,
+ "learning_rate": 4.081125829198934e-06,
+ "loss": 0.4339,
+ "step": 3651
+ },
+ {
+ "epoch": 1.7267139479905438,
+ "grad_norm": 2.7959988117218018,
+ "learning_rate": 4.0806425630320965e-06,
+ "loss": 0.5783,
+ "step": 3652
+ },
+ {
+ "epoch": 1.7271867612293144,
+ "grad_norm": 2.595053195953369,
+ "learning_rate": 4.080159198445578e-06,
+ "loss": 0.4602,
+ "step": 3653
+ },
+ {
+ "epoch": 1.7276595744680852,
+ "grad_norm": 3.0968129634857178,
+ "learning_rate": 4.079675735469475e-06,
+ "loss": 0.5775,
+ "step": 3654
+ },
+ {
+ "epoch": 1.728132387706856,
+ "grad_norm": 2.628044605255127,
+ "learning_rate": 4.07919217413389e-06,
+ "loss": 0.486,
+ "step": 3655
+ },
+ {
+ "epoch": 1.7286052009456265,
+ "grad_norm": 2.782799005508423,
+ "learning_rate": 4.078708514468933e-06,
+ "loss": 0.5282,
+ "step": 3656
+ },
+ {
+ "epoch": 1.729078014184397,
+ "grad_norm": 2.655365467071533,
+ "learning_rate": 4.0782247565047205e-06,
+ "loss": 0.4873,
+ "step": 3657
+ },
+ {
+ "epoch": 1.729550827423168,
+ "grad_norm": 2.9461584091186523,
+ "learning_rate": 4.077740900271371e-06,
+ "loss": 0.548,
+ "step": 3658
+ },
+ {
+ "epoch": 1.7300236406619387,
+ "grad_norm": 2.5094761848449707,
+ "learning_rate": 4.077256945799015e-06,
+ "loss": 0.5437,
+ "step": 3659
+ },
+ {
+ "epoch": 1.7304964539007093,
+ "grad_norm": 2.555793285369873,
+ "learning_rate": 4.0767728931177845e-06,
+ "loss": 0.5268,
+ "step": 3660
+ },
+ {
+ "epoch": 1.7309692671394799,
+ "grad_norm": 2.4433486461639404,
+ "learning_rate": 4.07628874225782e-06,
+ "loss": 0.5211,
+ "step": 3661
+ },
+ {
+ "epoch": 1.7314420803782506,
+ "grad_norm": 2.365206003189087,
+ "learning_rate": 4.075804493249267e-06,
+ "loss": 0.5084,
+ "step": 3662
+ },
+ {
+ "epoch": 1.7319148936170212,
+ "grad_norm": 2.514305830001831,
+ "learning_rate": 4.075320146122278e-06,
+ "loss": 0.4693,
+ "step": 3663
+ },
+ {
+ "epoch": 1.7323877068557918,
+ "grad_norm": 2.9270083904266357,
+ "learning_rate": 4.074835700907012e-06,
+ "loss": 0.5724,
+ "step": 3664
+ },
+ {
+ "epoch": 1.7328605200945626,
+ "grad_norm": 2.938692569732666,
+ "learning_rate": 4.0743511576336315e-06,
+ "loss": 0.5361,
+ "step": 3665
+ },
+ {
+ "epoch": 1.7333333333333334,
+ "grad_norm": 3.1978867053985596,
+ "learning_rate": 4.073866516332307e-06,
+ "loss": 0.6277,
+ "step": 3666
+ },
+ {
+ "epoch": 1.733806146572104,
+ "grad_norm": 2.3477370738983154,
+ "learning_rate": 4.073381777033217e-06,
+ "loss": 0.5139,
+ "step": 3667
+ },
+ {
+ "epoch": 1.7342789598108745,
+ "grad_norm": 2.5954184532165527,
+ "learning_rate": 4.072896939766543e-06,
+ "loss": 0.537,
+ "step": 3668
+ },
+ {
+ "epoch": 1.7347517730496453,
+ "grad_norm": 2.8999998569488525,
+ "learning_rate": 4.072412004562472e-06,
+ "loss": 0.5486,
+ "step": 3669
+ },
+ {
+ "epoch": 1.7352245862884161,
+ "grad_norm": 2.7320556640625,
+ "learning_rate": 4.071926971451201e-06,
+ "loss": 0.6025,
+ "step": 3670
+ },
+ {
+ "epoch": 1.7356973995271867,
+ "grad_norm": 2.499234676361084,
+ "learning_rate": 4.0714418404629304e-06,
+ "loss": 0.456,
+ "step": 3671
+ },
+ {
+ "epoch": 1.7361702127659573,
+ "grad_norm": 2.485924243927002,
+ "learning_rate": 4.070956611627867e-06,
+ "loss": 0.5097,
+ "step": 3672
+ },
+ {
+ "epoch": 1.736643026004728,
+ "grad_norm": 2.513723373413086,
+ "learning_rate": 4.070471284976225e-06,
+ "loss": 0.4744,
+ "step": 3673
+ },
+ {
+ "epoch": 1.7371158392434989,
+ "grad_norm": 2.281977653503418,
+ "learning_rate": 4.06998586053822e-06,
+ "loss": 0.5124,
+ "step": 3674
+ },
+ {
+ "epoch": 1.7375886524822695,
+ "grad_norm": 2.3683905601501465,
+ "learning_rate": 4.069500338344081e-06,
+ "loss": 0.4816,
+ "step": 3675
+ },
+ {
+ "epoch": 1.73806146572104,
+ "grad_norm": 2.5635924339294434,
+ "learning_rate": 4.069014718424038e-06,
+ "loss": 0.5665,
+ "step": 3676
+ },
+ {
+ "epoch": 1.7385342789598108,
+ "grad_norm": 2.7308456897735596,
+ "learning_rate": 4.068529000808328e-06,
+ "loss": 0.534,
+ "step": 3677
+ },
+ {
+ "epoch": 1.7390070921985816,
+ "grad_norm": 2.788452625274658,
+ "learning_rate": 4.068043185527196e-06,
+ "loss": 0.5609,
+ "step": 3678
+ },
+ {
+ "epoch": 1.7394799054373522,
+ "grad_norm": 2.832368850708008,
+ "learning_rate": 4.067557272610889e-06,
+ "loss": 0.553,
+ "step": 3679
+ },
+ {
+ "epoch": 1.7399527186761228,
+ "grad_norm": 2.9987435340881348,
+ "learning_rate": 4.067071262089665e-06,
+ "loss": 0.5,
+ "step": 3680
+ },
+ {
+ "epoch": 1.7404255319148936,
+ "grad_norm": 3.04913067817688,
+ "learning_rate": 4.066585153993785e-06,
+ "loss": 0.5158,
+ "step": 3681
+ },
+ {
+ "epoch": 1.7408983451536644,
+ "grad_norm": 2.5177130699157715,
+ "learning_rate": 4.066098948353516e-06,
+ "loss": 0.4508,
+ "step": 3682
+ },
+ {
+ "epoch": 1.741371158392435,
+ "grad_norm": 2.8991222381591797,
+ "learning_rate": 4.065612645199133e-06,
+ "loss": 0.5268,
+ "step": 3683
+ },
+ {
+ "epoch": 1.7418439716312055,
+ "grad_norm": 2.4928159713745117,
+ "learning_rate": 4.0651262445609156e-06,
+ "loss": 0.5024,
+ "step": 3684
+ },
+ {
+ "epoch": 1.7423167848699763,
+ "grad_norm": 2.9737319946289062,
+ "learning_rate": 4.06463974646915e-06,
+ "loss": 0.5429,
+ "step": 3685
+ },
+ {
+ "epoch": 1.7427895981087471,
+ "grad_norm": 2.6485493183135986,
+ "learning_rate": 4.064153150954128e-06,
+ "loss": 0.5619,
+ "step": 3686
+ },
+ {
+ "epoch": 1.7432624113475177,
+ "grad_norm": 2.564861297607422,
+ "learning_rate": 4.063666458046148e-06,
+ "loss": 0.4878,
+ "step": 3687
+ },
+ {
+ "epoch": 1.7437352245862883,
+ "grad_norm": 2.6048383712768555,
+ "learning_rate": 4.063179667775514e-06,
+ "loss": 0.4836,
+ "step": 3688
+ },
+ {
+ "epoch": 1.744208037825059,
+ "grad_norm": 2.751638650894165,
+ "learning_rate": 4.062692780172536e-06,
+ "loss": 0.5558,
+ "step": 3689
+ },
+ {
+ "epoch": 1.7446808510638299,
+ "grad_norm": 3.3866634368896484,
+ "learning_rate": 4.062205795267531e-06,
+ "loss": 0.4825,
+ "step": 3690
+ },
+ {
+ "epoch": 1.7451536643026004,
+ "grad_norm": 3.0112249851226807,
+ "learning_rate": 4.061718713090822e-06,
+ "loss": 0.5732,
+ "step": 3691
+ },
+ {
+ "epoch": 1.745626477541371,
+ "grad_norm": 2.5889365673065186,
+ "learning_rate": 4.061231533672736e-06,
+ "loss": 0.483,
+ "step": 3692
+ },
+ {
+ "epoch": 1.7460992907801418,
+ "grad_norm": 2.624598979949951,
+ "learning_rate": 4.0607442570436085e-06,
+ "loss": 0.5706,
+ "step": 3693
+ },
+ {
+ "epoch": 1.7465721040189126,
+ "grad_norm": 2.9219250679016113,
+ "learning_rate": 4.060256883233779e-06,
+ "loss": 0.5153,
+ "step": 3694
+ },
+ {
+ "epoch": 1.7470449172576832,
+ "grad_norm": 3.2219252586364746,
+ "learning_rate": 4.059769412273595e-06,
+ "loss": 0.5184,
+ "step": 3695
+ },
+ {
+ "epoch": 1.7475177304964538,
+ "grad_norm": 2.890697956085205,
+ "learning_rate": 4.05928184419341e-06,
+ "loss": 0.5312,
+ "step": 3696
+ },
+ {
+ "epoch": 1.7479905437352246,
+ "grad_norm": 2.673809289932251,
+ "learning_rate": 4.0587941790235816e-06,
+ "loss": 0.4893,
+ "step": 3697
+ },
+ {
+ "epoch": 1.7484633569739954,
+ "grad_norm": 2.5339348316192627,
+ "learning_rate": 4.058306416794474e-06,
+ "loss": 0.5115,
+ "step": 3698
+ },
+ {
+ "epoch": 1.748936170212766,
+ "grad_norm": 2.6525840759277344,
+ "learning_rate": 4.05781855753646e-06,
+ "loss": 0.5256,
+ "step": 3699
+ },
+ {
+ "epoch": 1.7494089834515365,
+ "grad_norm": 2.7868754863739014,
+ "learning_rate": 4.057330601279914e-06,
+ "loss": 0.5227,
+ "step": 3700
+ },
+ {
+ "epoch": 1.7498817966903073,
+ "grad_norm": 3.1629884243011475,
+ "learning_rate": 4.056842548055221e-06,
+ "loss": 0.5617,
+ "step": 3701
+ },
+ {
+ "epoch": 1.750354609929078,
+ "grad_norm": 2.9350688457489014,
+ "learning_rate": 4.056354397892769e-06,
+ "loss": 0.4753,
+ "step": 3702
+ },
+ {
+ "epoch": 1.7508274231678487,
+ "grad_norm": 2.9688615798950195,
+ "learning_rate": 4.0558661508229525e-06,
+ "loss": 0.596,
+ "step": 3703
+ },
+ {
+ "epoch": 1.7513002364066192,
+ "grad_norm": 2.802205801010132,
+ "learning_rate": 4.055377806876174e-06,
+ "loss": 0.5793,
+ "step": 3704
+ },
+ {
+ "epoch": 1.75177304964539,
+ "grad_norm": 2.4933416843414307,
+ "learning_rate": 4.054889366082839e-06,
+ "loss": 0.4824,
+ "step": 3705
+ },
+ {
+ "epoch": 1.7522458628841608,
+ "grad_norm": 3.7904608249664307,
+ "learning_rate": 4.054400828473361e-06,
+ "loss": 0.5124,
+ "step": 3706
+ },
+ {
+ "epoch": 1.7527186761229314,
+ "grad_norm": 2.694838762283325,
+ "learning_rate": 4.053912194078159e-06,
+ "loss": 0.5604,
+ "step": 3707
+ },
+ {
+ "epoch": 1.753191489361702,
+ "grad_norm": 2.3721256256103516,
+ "learning_rate": 4.053423462927659e-06,
+ "loss": 0.4978,
+ "step": 3708
+ },
+ {
+ "epoch": 1.7536643026004728,
+ "grad_norm": 2.718512773513794,
+ "learning_rate": 4.052934635052292e-06,
+ "loss": 0.5029,
+ "step": 3709
+ },
+ {
+ "epoch": 1.7541371158392436,
+ "grad_norm": 3.061558246612549,
+ "learning_rate": 4.052445710482493e-06,
+ "loss": 0.4886,
+ "step": 3710
+ },
+ {
+ "epoch": 1.7546099290780142,
+ "grad_norm": 3.0490729808807373,
+ "learning_rate": 4.051956689248709e-06,
+ "loss": 0.5363,
+ "step": 3711
+ },
+ {
+ "epoch": 1.7550827423167847,
+ "grad_norm": 2.611661672592163,
+ "learning_rate": 4.051467571381385e-06,
+ "loss": 0.5397,
+ "step": 3712
+ },
+ {
+ "epoch": 1.7555555555555555,
+ "grad_norm": 2.7829177379608154,
+ "learning_rate": 4.050978356910979e-06,
+ "loss": 0.4973,
+ "step": 3713
+ },
+ {
+ "epoch": 1.7560283687943263,
+ "grad_norm": 2.6228256225585938,
+ "learning_rate": 4.0504890458679525e-06,
+ "loss": 0.4551,
+ "step": 3714
+ },
+ {
+ "epoch": 1.756501182033097,
+ "grad_norm": 2.6801326274871826,
+ "learning_rate": 4.049999638282771e-06,
+ "loss": 0.5581,
+ "step": 3715
+ },
+ {
+ "epoch": 1.7569739952718675,
+ "grad_norm": 2.4476819038391113,
+ "learning_rate": 4.049510134185908e-06,
+ "loss": 0.5226,
+ "step": 3716
+ },
+ {
+ "epoch": 1.7574468085106383,
+ "grad_norm": 2.5661075115203857,
+ "learning_rate": 4.049020533607844e-06,
+ "loss": 0.5163,
+ "step": 3717
+ },
+ {
+ "epoch": 1.757919621749409,
+ "grad_norm": 2.3923349380493164,
+ "learning_rate": 4.048530836579065e-06,
+ "loss": 0.5076,
+ "step": 3718
+ },
+ {
+ "epoch": 1.7583924349881797,
+ "grad_norm": 2.8204405307769775,
+ "learning_rate": 4.0480410431300585e-06,
+ "loss": 0.5883,
+ "step": 3719
+ },
+ {
+ "epoch": 1.7588652482269502,
+ "grad_norm": 2.323107957839966,
+ "learning_rate": 4.047551153291325e-06,
+ "loss": 0.5116,
+ "step": 3720
+ },
+ {
+ "epoch": 1.759338061465721,
+ "grad_norm": 2.8306009769439697,
+ "learning_rate": 4.047061167093368e-06,
+ "loss": 0.5094,
+ "step": 3721
+ },
+ {
+ "epoch": 1.7598108747044918,
+ "grad_norm": 2.568765640258789,
+ "learning_rate": 4.046571084566695e-06,
+ "loss": 0.4725,
+ "step": 3722
+ },
+ {
+ "epoch": 1.7602836879432624,
+ "grad_norm": 2.7212061882019043,
+ "learning_rate": 4.046080905741822e-06,
+ "loss": 0.4741,
+ "step": 3723
+ },
+ {
+ "epoch": 1.760756501182033,
+ "grad_norm": 2.802917003631592,
+ "learning_rate": 4.04559063064927e-06,
+ "loss": 0.5691,
+ "step": 3724
+ },
+ {
+ "epoch": 1.7612293144208038,
+ "grad_norm": 3.1044139862060547,
+ "learning_rate": 4.0451002593195675e-06,
+ "loss": 0.5472,
+ "step": 3725
+ },
+ {
+ "epoch": 1.7617021276595746,
+ "grad_norm": 2.5855562686920166,
+ "learning_rate": 4.044609791783246e-06,
+ "loss": 0.4852,
+ "step": 3726
+ },
+ {
+ "epoch": 1.7621749408983451,
+ "grad_norm": 2.6235129833221436,
+ "learning_rate": 4.0441192280708465e-06,
+ "loss": 0.5269,
+ "step": 3727
+ },
+ {
+ "epoch": 1.7626477541371157,
+ "grad_norm": 3.535630464553833,
+ "learning_rate": 4.043628568212914e-06,
+ "loss": 0.5266,
+ "step": 3728
+ },
+ {
+ "epoch": 1.7631205673758865,
+ "grad_norm": 2.7783355712890625,
+ "learning_rate": 4.043137812239998e-06,
+ "loss": 0.5609,
+ "step": 3729
+ },
+ {
+ "epoch": 1.7635933806146573,
+ "grad_norm": 2.9344944953918457,
+ "learning_rate": 4.042646960182657e-06,
+ "loss": 0.5056,
+ "step": 3730
+ },
+ {
+ "epoch": 1.7640661938534279,
+ "grad_norm": 2.6205739974975586,
+ "learning_rate": 4.042156012071453e-06,
+ "loss": 0.4914,
+ "step": 3731
+ },
+ {
+ "epoch": 1.7645390070921985,
+ "grad_norm": 2.8004493713378906,
+ "learning_rate": 4.041664967936958e-06,
+ "loss": 0.4901,
+ "step": 3732
+ },
+ {
+ "epoch": 1.7650118203309693,
+ "grad_norm": 2.944589138031006,
+ "learning_rate": 4.041173827809745e-06,
+ "loss": 0.5572,
+ "step": 3733
+ },
+ {
+ "epoch": 1.76548463356974,
+ "grad_norm": 2.5021605491638184,
+ "learning_rate": 4.040682591720397e-06,
+ "loss": 0.4637,
+ "step": 3734
+ },
+ {
+ "epoch": 1.7659574468085106,
+ "grad_norm": 2.448030948638916,
+ "learning_rate": 4.040191259699497e-06,
+ "loss": 0.4785,
+ "step": 3735
+ },
+ {
+ "epoch": 1.7664302600472812,
+ "grad_norm": 2.7171032428741455,
+ "learning_rate": 4.039699831777643e-06,
+ "loss": 0.4919,
+ "step": 3736
+ },
+ {
+ "epoch": 1.766903073286052,
+ "grad_norm": 2.453118324279785,
+ "learning_rate": 4.03920830798543e-06,
+ "loss": 0.4326,
+ "step": 3737
+ },
+ {
+ "epoch": 1.7673758865248228,
+ "grad_norm": 3.112877368927002,
+ "learning_rate": 4.038716688353466e-06,
+ "loss": 0.5375,
+ "step": 3738
+ },
+ {
+ "epoch": 1.7678486997635934,
+ "grad_norm": 2.742239236831665,
+ "learning_rate": 4.038224972912361e-06,
+ "loss": 0.5267,
+ "step": 3739
+ },
+ {
+ "epoch": 1.768321513002364,
+ "grad_norm": 2.544785737991333,
+ "learning_rate": 4.037733161692731e-06,
+ "loss": 0.5032,
+ "step": 3740
+ },
+ {
+ "epoch": 1.7687943262411348,
+ "grad_norm": 2.4639062881469727,
+ "learning_rate": 4.037241254725201e-06,
+ "loss": 0.5532,
+ "step": 3741
+ },
+ {
+ "epoch": 1.7692671394799055,
+ "grad_norm": 2.866290330886841,
+ "learning_rate": 4.036749252040398e-06,
+ "loss": 0.5503,
+ "step": 3742
+ },
+ {
+ "epoch": 1.7697399527186761,
+ "grad_norm": 2.3466262817382812,
+ "learning_rate": 4.0362571536689575e-06,
+ "loss": 0.5286,
+ "step": 3743
+ },
+ {
+ "epoch": 1.7702127659574467,
+ "grad_norm": 2.246464967727661,
+ "learning_rate": 4.03576495964152e-06,
+ "loss": 0.4656,
+ "step": 3744
+ },
+ {
+ "epoch": 1.7706855791962175,
+ "grad_norm": 2.667558431625366,
+ "learning_rate": 4.035272669988733e-06,
+ "loss": 0.5205,
+ "step": 3745
+ },
+ {
+ "epoch": 1.7711583924349883,
+ "grad_norm": 2.974666118621826,
+ "learning_rate": 4.034780284741249e-06,
+ "loss": 0.6007,
+ "step": 3746
+ },
+ {
+ "epoch": 1.7716312056737589,
+ "grad_norm": 2.7164433002471924,
+ "learning_rate": 4.034287803929726e-06,
+ "loss": 0.4913,
+ "step": 3747
+ },
+ {
+ "epoch": 1.7721040189125294,
+ "grad_norm": 2.5923962593078613,
+ "learning_rate": 4.033795227584829e-06,
+ "loss": 0.5275,
+ "step": 3748
+ },
+ {
+ "epoch": 1.7725768321513002,
+ "grad_norm": 2.606027126312256,
+ "learning_rate": 4.033302555737229e-06,
+ "loss": 0.4869,
+ "step": 3749
+ },
+ {
+ "epoch": 1.773049645390071,
+ "grad_norm": 3.0110089778900146,
+ "learning_rate": 4.032809788417602e-06,
+ "loss": 0.4956,
+ "step": 3750
+ },
+ {
+ "epoch": 1.7735224586288416,
+ "grad_norm": 3.004598617553711,
+ "learning_rate": 4.032316925656632e-06,
+ "loss": 0.5159,
+ "step": 3751
+ },
+ {
+ "epoch": 1.7739952718676122,
+ "grad_norm": 2.731539249420166,
+ "learning_rate": 4.031823967485005e-06,
+ "loss": 0.5237,
+ "step": 3752
+ },
+ {
+ "epoch": 1.774468085106383,
+ "grad_norm": 2.7466373443603516,
+ "learning_rate": 4.0313309139334155e-06,
+ "loss": 0.4948,
+ "step": 3753
+ },
+ {
+ "epoch": 1.7749408983451538,
+ "grad_norm": 2.8596460819244385,
+ "learning_rate": 4.030837765032565e-06,
+ "loss": 0.5016,
+ "step": 3754
+ },
+ {
+ "epoch": 1.7754137115839244,
+ "grad_norm": 3.2886788845062256,
+ "learning_rate": 4.03034452081316e-06,
+ "loss": 0.5377,
+ "step": 3755
+ },
+ {
+ "epoch": 1.775886524822695,
+ "grad_norm": 2.5629258155822754,
+ "learning_rate": 4.029851181305912e-06,
+ "loss": 0.519,
+ "step": 3756
+ },
+ {
+ "epoch": 1.7763593380614657,
+ "grad_norm": 2.5988714694976807,
+ "learning_rate": 4.029357746541539e-06,
+ "loss": 0.5521,
+ "step": 3757
+ },
+ {
+ "epoch": 1.7768321513002365,
+ "grad_norm": 2.987884759902954,
+ "learning_rate": 4.028864216550765e-06,
+ "loss": 0.6225,
+ "step": 3758
+ },
+ {
+ "epoch": 1.777304964539007,
+ "grad_norm": 2.6875851154327393,
+ "learning_rate": 4.02837059136432e-06,
+ "loss": 0.5321,
+ "step": 3759
+ },
+ {
+ "epoch": 1.7777777777777777,
+ "grad_norm": 2.6414570808410645,
+ "learning_rate": 4.02787687101294e-06,
+ "loss": 0.4831,
+ "step": 3760
+ },
+ {
+ "epoch": 1.7782505910165485,
+ "grad_norm": 2.581475019454956,
+ "learning_rate": 4.027383055527368e-06,
+ "loss": 0.5204,
+ "step": 3761
+ },
+ {
+ "epoch": 1.7787234042553193,
+ "grad_norm": 2.811298131942749,
+ "learning_rate": 4.026889144938349e-06,
+ "loss": 0.5486,
+ "step": 3762
+ },
+ {
+ "epoch": 1.7791962174940898,
+ "grad_norm": 3.1589081287384033,
+ "learning_rate": 4.026395139276639e-06,
+ "loss": 0.4979,
+ "step": 3763
+ },
+ {
+ "epoch": 1.7796690307328604,
+ "grad_norm": 2.3773093223571777,
+ "learning_rate": 4.025901038572996e-06,
+ "loss": 0.503,
+ "step": 3764
+ },
+ {
+ "epoch": 1.7801418439716312,
+ "grad_norm": 2.962541341781616,
+ "learning_rate": 4.025406842858187e-06,
+ "loss": 0.4613,
+ "step": 3765
+ },
+ {
+ "epoch": 1.780614657210402,
+ "grad_norm": 2.603092908859253,
+ "learning_rate": 4.024912552162982e-06,
+ "loss": 0.5142,
+ "step": 3766
+ },
+ {
+ "epoch": 1.7810874704491726,
+ "grad_norm": 2.648927927017212,
+ "learning_rate": 4.024418166518159e-06,
+ "loss": 0.4491,
+ "step": 3767
+ },
+ {
+ "epoch": 1.7815602836879432,
+ "grad_norm": 3.3239917755126953,
+ "learning_rate": 4.023923685954502e-06,
+ "loss": 0.6272,
+ "step": 3768
+ },
+ {
+ "epoch": 1.782033096926714,
+ "grad_norm": 2.672821283340454,
+ "learning_rate": 4.023429110502798e-06,
+ "loss": 0.5171,
+ "step": 3769
+ },
+ {
+ "epoch": 1.7825059101654848,
+ "grad_norm": 2.364332437515259,
+ "learning_rate": 4.022934440193844e-06,
+ "loss": 0.4513,
+ "step": 3770
+ },
+ {
+ "epoch": 1.7829787234042553,
+ "grad_norm": 3.03108549118042,
+ "learning_rate": 4.022439675058441e-06,
+ "loss": 0.4324,
+ "step": 3771
+ },
+ {
+ "epoch": 1.783451536643026,
+ "grad_norm": 2.647557020187378,
+ "learning_rate": 4.021944815127393e-06,
+ "loss": 0.5162,
+ "step": 3772
+ },
+ {
+ "epoch": 1.7839243498817967,
+ "grad_norm": 2.4111907482147217,
+ "learning_rate": 4.021449860431517e-06,
+ "loss": 0.4712,
+ "step": 3773
+ },
+ {
+ "epoch": 1.7843971631205675,
+ "grad_norm": 2.796175718307495,
+ "learning_rate": 4.020954811001629e-06,
+ "loss": 0.5131,
+ "step": 3774
+ },
+ {
+ "epoch": 1.784869976359338,
+ "grad_norm": 2.4594924449920654,
+ "learning_rate": 4.020459666868553e-06,
+ "loss": 0.4739,
+ "step": 3775
+ },
+ {
+ "epoch": 1.7853427895981087,
+ "grad_norm": 2.5735671520233154,
+ "learning_rate": 4.0199644280631215e-06,
+ "loss": 0.4716,
+ "step": 3776
+ },
+ {
+ "epoch": 1.7858156028368795,
+ "grad_norm": 2.419990062713623,
+ "learning_rate": 4.01946909461617e-06,
+ "loss": 0.4866,
+ "step": 3777
+ },
+ {
+ "epoch": 1.7862884160756503,
+ "grad_norm": 2.5597951412200928,
+ "learning_rate": 4.01897366655854e-06,
+ "loss": 0.5569,
+ "step": 3778
+ },
+ {
+ "epoch": 1.7867612293144208,
+ "grad_norm": 2.462383985519409,
+ "learning_rate": 4.018478143921081e-06,
+ "loss": 0.4588,
+ "step": 3779
+ },
+ {
+ "epoch": 1.7872340425531914,
+ "grad_norm": 2.536701202392578,
+ "learning_rate": 4.017982526734646e-06,
+ "loss": 0.5278,
+ "step": 3780
+ },
+ {
+ "epoch": 1.7877068557919622,
+ "grad_norm": 2.691077470779419,
+ "learning_rate": 4.017486815030095e-06,
+ "loss": 0.4815,
+ "step": 3781
+ },
+ {
+ "epoch": 1.788179669030733,
+ "grad_norm": 2.4277288913726807,
+ "learning_rate": 4.016991008838294e-06,
+ "loss": 0.4877,
+ "step": 3782
+ },
+ {
+ "epoch": 1.7886524822695036,
+ "grad_norm": 2.6740009784698486,
+ "learning_rate": 4.016495108190115e-06,
+ "loss": 0.572,
+ "step": 3783
+ },
+ {
+ "epoch": 1.7891252955082741,
+ "grad_norm": 3.179232120513916,
+ "learning_rate": 4.0159991131164355e-06,
+ "loss": 0.4821,
+ "step": 3784
+ },
+ {
+ "epoch": 1.789598108747045,
+ "grad_norm": 3.2747793197631836,
+ "learning_rate": 4.015503023648138e-06,
+ "loss": 0.5517,
+ "step": 3785
+ },
+ {
+ "epoch": 1.7900709219858157,
+ "grad_norm": 2.671367645263672,
+ "learning_rate": 4.015006839816113e-06,
+ "loss": 0.5158,
+ "step": 3786
+ },
+ {
+ "epoch": 1.7905437352245863,
+ "grad_norm": 2.6600193977355957,
+ "learning_rate": 4.014510561651256e-06,
+ "loss": 0.535,
+ "step": 3787
+ },
+ {
+ "epoch": 1.791016548463357,
+ "grad_norm": 2.481509208679199,
+ "learning_rate": 4.014014189184466e-06,
+ "loss": 0.5596,
+ "step": 3788
+ },
+ {
+ "epoch": 1.7914893617021277,
+ "grad_norm": 2.759816884994507,
+ "learning_rate": 4.013517722446652e-06,
+ "loss": 0.5201,
+ "step": 3789
+ },
+ {
+ "epoch": 1.7919621749408985,
+ "grad_norm": 2.6913561820983887,
+ "learning_rate": 4.013021161468724e-06,
+ "loss": 0.5758,
+ "step": 3790
+ },
+ {
+ "epoch": 1.792434988179669,
+ "grad_norm": 2.775087594985962,
+ "learning_rate": 4.0125245062816044e-06,
+ "loss": 0.499,
+ "step": 3791
+ },
+ {
+ "epoch": 1.7929078014184396,
+ "grad_norm": 2.6134777069091797,
+ "learning_rate": 4.012027756916216e-06,
+ "loss": 0.5659,
+ "step": 3792
+ },
+ {
+ "epoch": 1.7933806146572104,
+ "grad_norm": 2.7109756469726562,
+ "learning_rate": 4.0115309134034895e-06,
+ "loss": 0.5337,
+ "step": 3793
+ },
+ {
+ "epoch": 1.7938534278959812,
+ "grad_norm": 2.5389950275421143,
+ "learning_rate": 4.0110339757743595e-06,
+ "loss": 0.4501,
+ "step": 3794
+ },
+ {
+ "epoch": 1.7943262411347518,
+ "grad_norm": 2.634648561477661,
+ "learning_rate": 4.010536944059771e-06,
+ "loss": 0.4411,
+ "step": 3795
+ },
+ {
+ "epoch": 1.7947990543735224,
+ "grad_norm": 2.527070999145508,
+ "learning_rate": 4.0100398182906695e-06,
+ "loss": 0.5145,
+ "step": 3796
+ },
+ {
+ "epoch": 1.7952718676122932,
+ "grad_norm": 2.62988543510437,
+ "learning_rate": 4.0095425984980105e-06,
+ "loss": 0.4981,
+ "step": 3797
+ },
+ {
+ "epoch": 1.795744680851064,
+ "grad_norm": 2.6032519340515137,
+ "learning_rate": 4.009045284712752e-06,
+ "loss": 0.453,
+ "step": 3798
+ },
+ {
+ "epoch": 1.7962174940898346,
+ "grad_norm": 2.735173463821411,
+ "learning_rate": 4.008547876965863e-06,
+ "loss": 0.5925,
+ "step": 3799
+ },
+ {
+ "epoch": 1.7966903073286051,
+ "grad_norm": 2.6296730041503906,
+ "learning_rate": 4.00805037528831e-06,
+ "loss": 0.5651,
+ "step": 3800
+ },
+ {
+ "epoch": 1.797163120567376,
+ "grad_norm": 2.641214370727539,
+ "learning_rate": 4.0075527797110735e-06,
+ "loss": 0.4973,
+ "step": 3801
+ },
+ {
+ "epoch": 1.7976359338061467,
+ "grad_norm": 2.6104819774627686,
+ "learning_rate": 4.007055090265136e-06,
+ "loss": 0.4432,
+ "step": 3802
+ },
+ {
+ "epoch": 1.7981087470449173,
+ "grad_norm": 2.8200619220733643,
+ "learning_rate": 4.0065573069814865e-06,
+ "loss": 0.4899,
+ "step": 3803
+ },
+ {
+ "epoch": 1.7985815602836879,
+ "grad_norm": 2.982354164123535,
+ "learning_rate": 4.006059429891119e-06,
+ "loss": 0.5488,
+ "step": 3804
+ },
+ {
+ "epoch": 1.7990543735224587,
+ "grad_norm": 2.7561678886413574,
+ "learning_rate": 4.005561459025034e-06,
+ "loss": 0.5637,
+ "step": 3805
+ },
+ {
+ "epoch": 1.7995271867612295,
+ "grad_norm": 2.702212333679199,
+ "learning_rate": 4.005063394414241e-06,
+ "loss": 0.4804,
+ "step": 3806
+ },
+ {
+ "epoch": 1.8,
+ "grad_norm": 2.8655319213867188,
+ "learning_rate": 4.004565236089748e-06,
+ "loss": 0.5759,
+ "step": 3807
+ },
+ {
+ "epoch": 1.8004728132387706,
+ "grad_norm": 2.703676223754883,
+ "learning_rate": 4.0040669840825756e-06,
+ "loss": 0.4728,
+ "step": 3808
+ },
+ {
+ "epoch": 1.8009456264775414,
+ "grad_norm": 2.802645683288574,
+ "learning_rate": 4.003568638423747e-06,
+ "loss": 0.5421,
+ "step": 3809
+ },
+ {
+ "epoch": 1.8014184397163122,
+ "grad_norm": 2.4723124504089355,
+ "learning_rate": 4.003070199144292e-06,
+ "loss": 0.4944,
+ "step": 3810
+ },
+ {
+ "epoch": 1.8018912529550828,
+ "grad_norm": 2.4889068603515625,
+ "learning_rate": 4.0025716662752475e-06,
+ "loss": 0.4774,
+ "step": 3811
+ },
+ {
+ "epoch": 1.8023640661938534,
+ "grad_norm": 2.5408077239990234,
+ "learning_rate": 4.002073039847653e-06,
+ "loss": 0.5233,
+ "step": 3812
+ },
+ {
+ "epoch": 1.8028368794326242,
+ "grad_norm": 2.734602689743042,
+ "learning_rate": 4.001574319892557e-06,
+ "loss": 0.5403,
+ "step": 3813
+ },
+ {
+ "epoch": 1.803309692671395,
+ "grad_norm": 3.3786163330078125,
+ "learning_rate": 4.001075506441012e-06,
+ "loss": 0.6969,
+ "step": 3814
+ },
+ {
+ "epoch": 1.8037825059101655,
+ "grad_norm": 2.7375378608703613,
+ "learning_rate": 4.000576599524078e-06,
+ "loss": 0.4907,
+ "step": 3815
+ },
+ {
+ "epoch": 1.804255319148936,
+ "grad_norm": 3.041804075241089,
+ "learning_rate": 4.000077599172818e-06,
+ "loss": 0.6021,
+ "step": 3816
+ },
+ {
+ "epoch": 1.804728132387707,
+ "grad_norm": 2.697599411010742,
+ "learning_rate": 3.999578505418305e-06,
+ "loss": 0.4743,
+ "step": 3817
+ },
+ {
+ "epoch": 1.8052009456264777,
+ "grad_norm": 2.276921272277832,
+ "learning_rate": 3.999079318291612e-06,
+ "loss": 0.4885,
+ "step": 3818
+ },
+ {
+ "epoch": 1.8056737588652483,
+ "grad_norm": 2.4896953105926514,
+ "learning_rate": 3.998580037823825e-06,
+ "loss": 0.503,
+ "step": 3819
+ },
+ {
+ "epoch": 1.8061465721040189,
+ "grad_norm": 2.6232175827026367,
+ "learning_rate": 3.998080664046029e-06,
+ "loss": 0.5058,
+ "step": 3820
+ },
+ {
+ "epoch": 1.8066193853427897,
+ "grad_norm": 2.695861339569092,
+ "learning_rate": 3.997581196989319e-06,
+ "loss": 0.4949,
+ "step": 3821
+ },
+ {
+ "epoch": 1.8070921985815604,
+ "grad_norm": 2.912886142730713,
+ "learning_rate": 3.997081636684795e-06,
+ "loss": 0.4971,
+ "step": 3822
+ },
+ {
+ "epoch": 1.807565011820331,
+ "grad_norm": 2.876500368118286,
+ "learning_rate": 3.996581983163561e-06,
+ "loss": 0.5584,
+ "step": 3823
+ },
+ {
+ "epoch": 1.8080378250591016,
+ "grad_norm": 2.857069730758667,
+ "learning_rate": 3.99608223645673e-06,
+ "loss": 0.5457,
+ "step": 3824
+ },
+ {
+ "epoch": 1.8085106382978724,
+ "grad_norm": 2.486743211746216,
+ "learning_rate": 3.995582396595419e-06,
+ "loss": 0.5291,
+ "step": 3825
+ },
+ {
+ "epoch": 1.808983451536643,
+ "grad_norm": 2.509441375732422,
+ "learning_rate": 3.9950824636107486e-06,
+ "loss": 0.4747,
+ "step": 3826
+ },
+ {
+ "epoch": 1.8094562647754135,
+ "grad_norm": 2.931394100189209,
+ "learning_rate": 3.99458243753385e-06,
+ "loss": 0.5116,
+ "step": 3827
+ },
+ {
+ "epoch": 1.8099290780141843,
+ "grad_norm": 2.4868650436401367,
+ "learning_rate": 3.994082318395856e-06,
+ "loss": 0.4671,
+ "step": 3828
+ },
+ {
+ "epoch": 1.8104018912529551,
+ "grad_norm": 2.5554752349853516,
+ "learning_rate": 3.993582106227907e-06,
+ "loss": 0.4969,
+ "step": 3829
+ },
+ {
+ "epoch": 1.8108747044917257,
+ "grad_norm": 2.8367133140563965,
+ "learning_rate": 3.99308180106115e-06,
+ "loss": 0.5507,
+ "step": 3830
+ },
+ {
+ "epoch": 1.8113475177304963,
+ "grad_norm": 2.68245792388916,
+ "learning_rate": 3.992581402926737e-06,
+ "loss": 0.5115,
+ "step": 3831
+ },
+ {
+ "epoch": 1.811820330969267,
+ "grad_norm": 2.406674385070801,
+ "learning_rate": 3.992080911855824e-06,
+ "loss": 0.545,
+ "step": 3832
+ },
+ {
+ "epoch": 1.8122931442080379,
+ "grad_norm": 2.5003464221954346,
+ "learning_rate": 3.991580327879575e-06,
+ "loss": 0.4331,
+ "step": 3833
+ },
+ {
+ "epoch": 1.8127659574468085,
+ "grad_norm": 2.49320912361145,
+ "learning_rate": 3.99107965102916e-06,
+ "loss": 0.5118,
+ "step": 3834
+ },
+ {
+ "epoch": 1.813238770685579,
+ "grad_norm": 2.6183295249938965,
+ "learning_rate": 3.990578881335752e-06,
+ "loss": 0.5286,
+ "step": 3835
+ },
+ {
+ "epoch": 1.8137115839243498,
+ "grad_norm": 3.1999518871307373,
+ "learning_rate": 3.990078018830534e-06,
+ "loss": 0.5048,
+ "step": 3836
+ },
+ {
+ "epoch": 1.8141843971631206,
+ "grad_norm": 2.4351117610931396,
+ "learning_rate": 3.9895770635446915e-06,
+ "loss": 0.514,
+ "step": 3837
+ },
+ {
+ "epoch": 1.8146572104018912,
+ "grad_norm": 2.6859259605407715,
+ "learning_rate": 3.989076015509416e-06,
+ "loss": 0.5575,
+ "step": 3838
+ },
+ {
+ "epoch": 1.8151300236406618,
+ "grad_norm": 2.790421962738037,
+ "learning_rate": 3.988574874755909e-06,
+ "loss": 0.5467,
+ "step": 3839
+ },
+ {
+ "epoch": 1.8156028368794326,
+ "grad_norm": 2.5202765464782715,
+ "learning_rate": 3.988073641315369e-06,
+ "loss": 0.5229,
+ "step": 3840
+ },
+ {
+ "epoch": 1.8160756501182034,
+ "grad_norm": 2.623652219772339,
+ "learning_rate": 3.987572315219009e-06,
+ "loss": 0.509,
+ "step": 3841
+ },
+ {
+ "epoch": 1.816548463356974,
+ "grad_norm": 2.6038360595703125,
+ "learning_rate": 3.987070896498044e-06,
+ "loss": 0.5304,
+ "step": 3842
+ },
+ {
+ "epoch": 1.8170212765957445,
+ "grad_norm": 2.9378011226654053,
+ "learning_rate": 3.9865693851836955e-06,
+ "loss": 0.5845,
+ "step": 3843
+ },
+ {
+ "epoch": 1.8174940898345153,
+ "grad_norm": 2.4061124324798584,
+ "learning_rate": 3.98606778130719e-06,
+ "loss": 0.4333,
+ "step": 3844
+ },
+ {
+ "epoch": 1.8179669030732861,
+ "grad_norm": 2.483489751815796,
+ "learning_rate": 3.985566084899759e-06,
+ "loss": 0.4827,
+ "step": 3845
+ },
+ {
+ "epoch": 1.8184397163120567,
+ "grad_norm": 2.7774932384490967,
+ "learning_rate": 3.985064295992642e-06,
+ "loss": 0.5016,
+ "step": 3846
+ },
+ {
+ "epoch": 1.8189125295508273,
+ "grad_norm": 2.5936765670776367,
+ "learning_rate": 3.984562414617083e-06,
+ "loss": 0.4448,
+ "step": 3847
+ },
+ {
+ "epoch": 1.819385342789598,
+ "grad_norm": 2.8608627319335938,
+ "learning_rate": 3.9840604408043325e-06,
+ "loss": 0.5735,
+ "step": 3848
+ },
+ {
+ "epoch": 1.8198581560283689,
+ "grad_norm": 2.6212472915649414,
+ "learning_rate": 3.983558374585646e-06,
+ "loss": 0.5091,
+ "step": 3849
+ },
+ {
+ "epoch": 1.8203309692671394,
+ "grad_norm": 2.832460641860962,
+ "learning_rate": 3.983056215992284e-06,
+ "loss": 0.5169,
+ "step": 3850
+ },
+ {
+ "epoch": 1.82080378250591,
+ "grad_norm": 2.5293610095977783,
+ "learning_rate": 3.982553965055514e-06,
+ "loss": 0.4708,
+ "step": 3851
+ },
+ {
+ "epoch": 1.8212765957446808,
+ "grad_norm": 2.9362871646881104,
+ "learning_rate": 3.982051621806611e-06,
+ "loss": 0.575,
+ "step": 3852
+ },
+ {
+ "epoch": 1.8217494089834516,
+ "grad_norm": 2.69073486328125,
+ "learning_rate": 3.98154918627685e-06,
+ "loss": 0.5278,
+ "step": 3853
+ },
+ {
+ "epoch": 1.8222222222222222,
+ "grad_norm": 2.6711034774780273,
+ "learning_rate": 3.98104665849752e-06,
+ "loss": 0.4918,
+ "step": 3854
+ },
+ {
+ "epoch": 1.8226950354609928,
+ "grad_norm": 2.571110963821411,
+ "learning_rate": 3.980544038499907e-06,
+ "loss": 0.5234,
+ "step": 3855
+ },
+ {
+ "epoch": 1.8231678486997636,
+ "grad_norm": 3.2603371143341064,
+ "learning_rate": 3.980041326315309e-06,
+ "loss": 0.5996,
+ "step": 3856
+ },
+ {
+ "epoch": 1.8236406619385344,
+ "grad_norm": 2.8472323417663574,
+ "learning_rate": 3.979538521975028e-06,
+ "loss": 0.4769,
+ "step": 3857
+ },
+ {
+ "epoch": 1.824113475177305,
+ "grad_norm": 2.6714751720428467,
+ "learning_rate": 3.979035625510371e-06,
+ "loss": 0.4826,
+ "step": 3858
+ },
+ {
+ "epoch": 1.8245862884160755,
+ "grad_norm": 2.6816468238830566,
+ "learning_rate": 3.97853263695265e-06,
+ "loss": 0.5127,
+ "step": 3859
+ },
+ {
+ "epoch": 1.8250591016548463,
+ "grad_norm": 2.6464123725891113,
+ "learning_rate": 3.978029556333185e-06,
+ "loss": 0.4925,
+ "step": 3860
+ },
+ {
+ "epoch": 1.825531914893617,
+ "grad_norm": 2.5317227840423584,
+ "learning_rate": 3.977526383683301e-06,
+ "loss": 0.4765,
+ "step": 3861
+ },
+ {
+ "epoch": 1.8260047281323877,
+ "grad_norm": 2.5052425861358643,
+ "learning_rate": 3.977023119034328e-06,
+ "loss": 0.4804,
+ "step": 3862
+ },
+ {
+ "epoch": 1.8264775413711583,
+ "grad_norm": 2.7022836208343506,
+ "learning_rate": 3.976519762417602e-06,
+ "loss": 0.4824,
+ "step": 3863
+ },
+ {
+ "epoch": 1.826950354609929,
+ "grad_norm": 2.7445900440216064,
+ "learning_rate": 3.976016313864464e-06,
+ "loss": 0.5698,
+ "step": 3864
+ },
+ {
+ "epoch": 1.8274231678486998,
+ "grad_norm": 2.442518711090088,
+ "learning_rate": 3.975512773406262e-06,
+ "loss": 0.5133,
+ "step": 3865
+ },
+ {
+ "epoch": 1.8278959810874704,
+ "grad_norm": 2.4100050926208496,
+ "learning_rate": 3.975009141074351e-06,
+ "loss": 0.5044,
+ "step": 3866
+ },
+ {
+ "epoch": 1.828368794326241,
+ "grad_norm": 2.9507648944854736,
+ "learning_rate": 3.974505416900088e-06,
+ "loss": 0.5367,
+ "step": 3867
+ },
+ {
+ "epoch": 1.8288416075650118,
+ "grad_norm": 2.5662600994110107,
+ "learning_rate": 3.974001600914837e-06,
+ "loss": 0.5878,
+ "step": 3868
+ },
+ {
+ "epoch": 1.8293144208037826,
+ "grad_norm": 2.4306657314300537,
+ "learning_rate": 3.973497693149971e-06,
+ "loss": 0.4647,
+ "step": 3869
+ },
+ {
+ "epoch": 1.8297872340425532,
+ "grad_norm": 2.974686622619629,
+ "learning_rate": 3.972993693636864e-06,
+ "loss": 0.4911,
+ "step": 3870
+ },
+ {
+ "epoch": 1.8302600472813237,
+ "grad_norm": 2.5711987018585205,
+ "learning_rate": 3.972489602406899e-06,
+ "loss": 0.5089,
+ "step": 3871
+ },
+ {
+ "epoch": 1.8307328605200945,
+ "grad_norm": 3.259617328643799,
+ "learning_rate": 3.971985419491463e-06,
+ "loss": 0.5966,
+ "step": 3872
+ },
+ {
+ "epoch": 1.8312056737588653,
+ "grad_norm": 2.7437000274658203,
+ "learning_rate": 3.971481144921949e-06,
+ "loss": 0.5097,
+ "step": 3873
+ },
+ {
+ "epoch": 1.831678486997636,
+ "grad_norm": 2.9597461223602295,
+ "learning_rate": 3.970976778729757e-06,
+ "loss": 0.5672,
+ "step": 3874
+ },
+ {
+ "epoch": 1.8321513002364065,
+ "grad_norm": 2.5775723457336426,
+ "learning_rate": 3.970472320946291e-06,
+ "loss": 0.4749,
+ "step": 3875
+ },
+ {
+ "epoch": 1.8326241134751773,
+ "grad_norm": 2.7381200790405273,
+ "learning_rate": 3.969967771602961e-06,
+ "loss": 0.5255,
+ "step": 3876
+ },
+ {
+ "epoch": 1.833096926713948,
+ "grad_norm": 2.651698350906372,
+ "learning_rate": 3.969463130731183e-06,
+ "loss": 0.5098,
+ "step": 3877
+ },
+ {
+ "epoch": 1.8335697399527187,
+ "grad_norm": 2.7277021408081055,
+ "learning_rate": 3.968958398362381e-06,
+ "loss": 0.5251,
+ "step": 3878
+ },
+ {
+ "epoch": 1.8340425531914892,
+ "grad_norm": 2.5184953212738037,
+ "learning_rate": 3.968453574527978e-06,
+ "loss": 0.5086,
+ "step": 3879
+ },
+ {
+ "epoch": 1.83451536643026,
+ "grad_norm": 2.8227882385253906,
+ "learning_rate": 3.967948659259412e-06,
+ "loss": 0.5742,
+ "step": 3880
+ },
+ {
+ "epoch": 1.8349881796690308,
+ "grad_norm": 2.547922134399414,
+ "learning_rate": 3.967443652588119e-06,
+ "loss": 0.5411,
+ "step": 3881
+ },
+ {
+ "epoch": 1.8354609929078014,
+ "grad_norm": 2.6572835445404053,
+ "learning_rate": 3.966938554545545e-06,
+ "loss": 0.4854,
+ "step": 3882
+ },
+ {
+ "epoch": 1.835933806146572,
+ "grad_norm": 2.9416658878326416,
+ "learning_rate": 3.966433365163139e-06,
+ "loss": 0.5236,
+ "step": 3883
+ },
+ {
+ "epoch": 1.8364066193853428,
+ "grad_norm": 2.344325304031372,
+ "learning_rate": 3.965928084472357e-06,
+ "loss": 0.4916,
+ "step": 3884
+ },
+ {
+ "epoch": 1.8368794326241136,
+ "grad_norm": 2.890418291091919,
+ "learning_rate": 3.965422712504662e-06,
+ "loss": 0.5287,
+ "step": 3885
+ },
+ {
+ "epoch": 1.8373522458628841,
+ "grad_norm": 2.6063363552093506,
+ "learning_rate": 3.96491724929152e-06,
+ "loss": 0.4842,
+ "step": 3886
+ },
+ {
+ "epoch": 1.8378250591016547,
+ "grad_norm": 2.5582427978515625,
+ "learning_rate": 3.964411694864404e-06,
+ "loss": 0.4768,
+ "step": 3887
+ },
+ {
+ "epoch": 1.8382978723404255,
+ "grad_norm": 2.84356951713562,
+ "learning_rate": 3.963906049254793e-06,
+ "loss": 0.5284,
+ "step": 3888
+ },
+ {
+ "epoch": 1.8387706855791963,
+ "grad_norm": 2.7048516273498535,
+ "learning_rate": 3.963400312494172e-06,
+ "loss": 0.5271,
+ "step": 3889
+ },
+ {
+ "epoch": 1.839243498817967,
+ "grad_norm": 2.5401699542999268,
+ "learning_rate": 3.962894484614031e-06,
+ "loss": 0.4734,
+ "step": 3890
+ },
+ {
+ "epoch": 1.8397163120567375,
+ "grad_norm": 2.208256244659424,
+ "learning_rate": 3.962388565645864e-06,
+ "loss": 0.4113,
+ "step": 3891
+ },
+ {
+ "epoch": 1.8401891252955083,
+ "grad_norm": 2.775139331817627,
+ "learning_rate": 3.961882555621173e-06,
+ "loss": 0.5172,
+ "step": 3892
+ },
+ {
+ "epoch": 1.840661938534279,
+ "grad_norm": 2.7540855407714844,
+ "learning_rate": 3.961376454571466e-06,
+ "loss": 0.5252,
+ "step": 3893
+ },
+ {
+ "epoch": 1.8411347517730496,
+ "grad_norm": 2.6731574535369873,
+ "learning_rate": 3.960870262528255e-06,
+ "loss": 0.4495,
+ "step": 3894
+ },
+ {
+ "epoch": 1.8416075650118202,
+ "grad_norm": 2.791492223739624,
+ "learning_rate": 3.960363979523058e-06,
+ "loss": 0.5457,
+ "step": 3895
+ },
+ {
+ "epoch": 1.842080378250591,
+ "grad_norm": 2.9280290603637695,
+ "learning_rate": 3.959857605587401e-06,
+ "loss": 0.5373,
+ "step": 3896
+ },
+ {
+ "epoch": 1.8425531914893618,
+ "grad_norm": 2.5652217864990234,
+ "learning_rate": 3.95935114075281e-06,
+ "loss": 0.5191,
+ "step": 3897
+ },
+ {
+ "epoch": 1.8430260047281324,
+ "grad_norm": 2.7297749519348145,
+ "learning_rate": 3.958844585050824e-06,
+ "loss": 0.5366,
+ "step": 3898
+ },
+ {
+ "epoch": 1.843498817966903,
+ "grad_norm": 2.5302982330322266,
+ "learning_rate": 3.958337938512983e-06,
+ "loss": 0.569,
+ "step": 3899
+ },
+ {
+ "epoch": 1.8439716312056738,
+ "grad_norm": 2.644777297973633,
+ "learning_rate": 3.957831201170832e-06,
+ "loss": 0.521,
+ "step": 3900
+ },
+ {
+ "epoch": 1.8444444444444446,
+ "grad_norm": 2.8375515937805176,
+ "learning_rate": 3.957324373055925e-06,
+ "loss": 0.573,
+ "step": 3901
+ },
+ {
+ "epoch": 1.8449172576832151,
+ "grad_norm": 2.512296676635742,
+ "learning_rate": 3.956817454199819e-06,
+ "loss": 0.5081,
+ "step": 3902
+ },
+ {
+ "epoch": 1.8453900709219857,
+ "grad_norm": 2.3662109375,
+ "learning_rate": 3.956310444634079e-06,
+ "loss": 0.4989,
+ "step": 3903
+ },
+ {
+ "epoch": 1.8458628841607565,
+ "grad_norm": 2.6849682331085205,
+ "learning_rate": 3.955803344390272e-06,
+ "loss": 0.5459,
+ "step": 3904
+ },
+ {
+ "epoch": 1.8463356973995273,
+ "grad_norm": 2.8364317417144775,
+ "learning_rate": 3.9552961534999756e-06,
+ "loss": 0.5704,
+ "step": 3905
+ },
+ {
+ "epoch": 1.8468085106382979,
+ "grad_norm": 2.6006948947906494,
+ "learning_rate": 3.954788871994768e-06,
+ "loss": 0.5696,
+ "step": 3906
+ },
+ {
+ "epoch": 1.8472813238770684,
+ "grad_norm": 2.558300018310547,
+ "learning_rate": 3.9542814999062375e-06,
+ "loss": 0.5047,
+ "step": 3907
+ },
+ {
+ "epoch": 1.8477541371158392,
+ "grad_norm": 2.6343321800231934,
+ "learning_rate": 3.953774037265974e-06,
+ "loss": 0.525,
+ "step": 3908
+ },
+ {
+ "epoch": 1.84822695035461,
+ "grad_norm": 2.5050008296966553,
+ "learning_rate": 3.953266484105576e-06,
+ "loss": 0.4867,
+ "step": 3909
+ },
+ {
+ "epoch": 1.8486997635933806,
+ "grad_norm": 2.3775103092193604,
+ "learning_rate": 3.952758840456647e-06,
+ "loss": 0.4349,
+ "step": 3910
+ },
+ {
+ "epoch": 1.8491725768321512,
+ "grad_norm": 2.508376359939575,
+ "learning_rate": 3.952251106350794e-06,
+ "loss": 0.539,
+ "step": 3911
+ },
+ {
+ "epoch": 1.849645390070922,
+ "grad_norm": 2.7403106689453125,
+ "learning_rate": 3.951743281819633e-06,
+ "loss": 0.4478,
+ "step": 3912
+ },
+ {
+ "epoch": 1.8501182033096928,
+ "grad_norm": 2.5332062244415283,
+ "learning_rate": 3.951235366894784e-06,
+ "loss": 0.4658,
+ "step": 3913
+ },
+ {
+ "epoch": 1.8505910165484634,
+ "grad_norm": 3.0137248039245605,
+ "learning_rate": 3.950727361607872e-06,
+ "loss": 0.5047,
+ "step": 3914
+ },
+ {
+ "epoch": 1.851063829787234,
+ "grad_norm": 2.5820653438568115,
+ "learning_rate": 3.950219265990528e-06,
+ "loss": 0.542,
+ "step": 3915
+ },
+ {
+ "epoch": 1.8515366430260047,
+ "grad_norm": 2.555133819580078,
+ "learning_rate": 3.949711080074389e-06,
+ "loss": 0.5253,
+ "step": 3916
+ },
+ {
+ "epoch": 1.8520094562647755,
+ "grad_norm": 2.876882791519165,
+ "learning_rate": 3.949202803891099e-06,
+ "loss": 0.5242,
+ "step": 3917
+ },
+ {
+ "epoch": 1.852482269503546,
+ "grad_norm": 2.5929203033447266,
+ "learning_rate": 3.948694437472305e-06,
+ "loss": 0.5358,
+ "step": 3918
+ },
+ {
+ "epoch": 1.8529550827423167,
+ "grad_norm": 2.468513250350952,
+ "learning_rate": 3.948185980849659e-06,
+ "loss": 0.5119,
+ "step": 3919
+ },
+ {
+ "epoch": 1.8534278959810875,
+ "grad_norm": 2.9259560108184814,
+ "learning_rate": 3.947677434054824e-06,
+ "loss": 0.4756,
+ "step": 3920
+ },
+ {
+ "epoch": 1.8539007092198583,
+ "grad_norm": 2.5247011184692383,
+ "learning_rate": 3.947168797119462e-06,
+ "loss": 0.4627,
+ "step": 3921
+ },
+ {
+ "epoch": 1.8543735224586289,
+ "grad_norm": 2.7396671772003174,
+ "learning_rate": 3.946660070075245e-06,
+ "loss": 0.5013,
+ "step": 3922
+ },
+ {
+ "epoch": 1.8548463356973994,
+ "grad_norm": 2.7059738636016846,
+ "learning_rate": 3.946151252953849e-06,
+ "loss": 0.5875,
+ "step": 3923
+ },
+ {
+ "epoch": 1.8553191489361702,
+ "grad_norm": 2.5638437271118164,
+ "learning_rate": 3.945642345786955e-06,
+ "loss": 0.5063,
+ "step": 3924
+ },
+ {
+ "epoch": 1.855791962174941,
+ "grad_norm": 2.6647839546203613,
+ "learning_rate": 3.945133348606251e-06,
+ "loss": 0.5421,
+ "step": 3925
+ },
+ {
+ "epoch": 1.8562647754137116,
+ "grad_norm": 3.7235286235809326,
+ "learning_rate": 3.944624261443431e-06,
+ "loss": 0.5958,
+ "step": 3926
+ },
+ {
+ "epoch": 1.8567375886524822,
+ "grad_norm": 2.769984245300293,
+ "learning_rate": 3.944115084330192e-06,
+ "loss": 0.5678,
+ "step": 3927
+ },
+ {
+ "epoch": 1.857210401891253,
+ "grad_norm": 2.567249059677124,
+ "learning_rate": 3.9436058172982395e-06,
+ "loss": 0.4767,
+ "step": 3928
+ },
+ {
+ "epoch": 1.8576832151300238,
+ "grad_norm": 2.6196048259735107,
+ "learning_rate": 3.943096460379283e-06,
+ "loss": 0.5345,
+ "step": 3929
+ },
+ {
+ "epoch": 1.8581560283687943,
+ "grad_norm": 2.5999555587768555,
+ "learning_rate": 3.942587013605037e-06,
+ "loss": 0.5482,
+ "step": 3930
+ },
+ {
+ "epoch": 1.858628841607565,
+ "grad_norm": 2.630387783050537,
+ "learning_rate": 3.942077477007224e-06,
+ "loss": 0.6023,
+ "step": 3931
+ },
+ {
+ "epoch": 1.8591016548463357,
+ "grad_norm": 2.543503761291504,
+ "learning_rate": 3.941567850617569e-06,
+ "loss": 0.5157,
+ "step": 3932
+ },
+ {
+ "epoch": 1.8595744680851065,
+ "grad_norm": 2.5109236240386963,
+ "learning_rate": 3.941058134467805e-06,
+ "loss": 0.4774,
+ "step": 3933
+ },
+ {
+ "epoch": 1.860047281323877,
+ "grad_norm": 2.5110230445861816,
+ "learning_rate": 3.94054832858967e-06,
+ "loss": 0.5064,
+ "step": 3934
+ },
+ {
+ "epoch": 1.8605200945626477,
+ "grad_norm": 2.4780776500701904,
+ "learning_rate": 3.940038433014908e-06,
+ "loss": 0.5216,
+ "step": 3935
+ },
+ {
+ "epoch": 1.8609929078014185,
+ "grad_norm": 2.4398856163024902,
+ "learning_rate": 3.939528447775266e-06,
+ "loss": 0.4958,
+ "step": 3936
+ },
+ {
+ "epoch": 1.8614657210401893,
+ "grad_norm": 2.449498176574707,
+ "learning_rate": 3.9390183729025e-06,
+ "loss": 0.5165,
+ "step": 3937
+ },
+ {
+ "epoch": 1.8619385342789598,
+ "grad_norm": 2.982544422149658,
+ "learning_rate": 3.938508208428371e-06,
+ "loss": 0.4803,
+ "step": 3938
+ },
+ {
+ "epoch": 1.8624113475177304,
+ "grad_norm": 2.6574015617370605,
+ "learning_rate": 3.937997954384641e-06,
+ "loss": 0.4797,
+ "step": 3939
+ },
+ {
+ "epoch": 1.8628841607565012,
+ "grad_norm": 2.7773542404174805,
+ "learning_rate": 3.937487610803086e-06,
+ "loss": 0.4843,
+ "step": 3940
+ },
+ {
+ "epoch": 1.863356973995272,
+ "grad_norm": 2.588937759399414,
+ "learning_rate": 3.9369771777154805e-06,
+ "loss": 0.5426,
+ "step": 3941
+ },
+ {
+ "epoch": 1.8638297872340426,
+ "grad_norm": 2.855442523956299,
+ "learning_rate": 3.936466655153607e-06,
+ "loss": 0.5443,
+ "step": 3942
+ },
+ {
+ "epoch": 1.8643026004728132,
+ "grad_norm": 2.554676055908203,
+ "learning_rate": 3.935956043149253e-06,
+ "loss": 0.5334,
+ "step": 3943
+ },
+ {
+ "epoch": 1.864775413711584,
+ "grad_norm": 2.901599884033203,
+ "learning_rate": 3.935445341734212e-06,
+ "loss": 0.5842,
+ "step": 3944
+ },
+ {
+ "epoch": 1.8652482269503547,
+ "grad_norm": 2.554485321044922,
+ "learning_rate": 3.934934550940285e-06,
+ "loss": 0.4941,
+ "step": 3945
+ },
+ {
+ "epoch": 1.8657210401891253,
+ "grad_norm": 2.357203245162964,
+ "learning_rate": 3.934423670799275e-06,
+ "loss": 0.4402,
+ "step": 3946
+ },
+ {
+ "epoch": 1.866193853427896,
+ "grad_norm": 2.7036049365997314,
+ "learning_rate": 3.933912701342993e-06,
+ "loss": 0.4966,
+ "step": 3947
+ },
+ {
+ "epoch": 1.8666666666666667,
+ "grad_norm": 2.7817211151123047,
+ "learning_rate": 3.933401642603255e-06,
+ "loss": 0.4908,
+ "step": 3948
+ },
+ {
+ "epoch": 1.8671394799054375,
+ "grad_norm": 2.439490795135498,
+ "learning_rate": 3.932890494611882e-06,
+ "loss": 0.4322,
+ "step": 3949
+ },
+ {
+ "epoch": 1.867612293144208,
+ "grad_norm": 3.187152147293091,
+ "learning_rate": 3.9323792574007e-06,
+ "loss": 0.501,
+ "step": 3950
+ },
+ {
+ "epoch": 1.8680851063829786,
+ "grad_norm": 2.405773401260376,
+ "learning_rate": 3.931867931001543e-06,
+ "loss": 0.4477,
+ "step": 3951
+ },
+ {
+ "epoch": 1.8685579196217494,
+ "grad_norm": 2.4922525882720947,
+ "learning_rate": 3.931356515446248e-06,
+ "loss": 0.5098,
+ "step": 3952
+ },
+ {
+ "epoch": 1.8690307328605202,
+ "grad_norm": 2.7781267166137695,
+ "learning_rate": 3.93084501076666e-06,
+ "loss": 0.5815,
+ "step": 3953
+ },
+ {
+ "epoch": 1.8695035460992908,
+ "grad_norm": 2.74621844291687,
+ "learning_rate": 3.930333416994626e-06,
+ "loss": 0.5605,
+ "step": 3954
+ },
+ {
+ "epoch": 1.8699763593380614,
+ "grad_norm": 2.5527689456939697,
+ "learning_rate": 3.929821734162004e-06,
+ "loss": 0.5141,
+ "step": 3955
+ },
+ {
+ "epoch": 1.8704491725768322,
+ "grad_norm": 2.5730628967285156,
+ "learning_rate": 3.92930996230065e-06,
+ "loss": 0.5446,
+ "step": 3956
+ },
+ {
+ "epoch": 1.870921985815603,
+ "grad_norm": 2.7053353786468506,
+ "learning_rate": 3.9287981014424334e-06,
+ "loss": 0.4722,
+ "step": 3957
+ },
+ {
+ "epoch": 1.8713947990543736,
+ "grad_norm": 2.7591893672943115,
+ "learning_rate": 3.928286151619224e-06,
+ "loss": 0.509,
+ "step": 3958
+ },
+ {
+ "epoch": 1.8718676122931441,
+ "grad_norm": 2.6233739852905273,
+ "learning_rate": 3.927774112862898e-06,
+ "loss": 0.5266,
+ "step": 3959
+ },
+ {
+ "epoch": 1.872340425531915,
+ "grad_norm": 2.7715370655059814,
+ "learning_rate": 3.9272619852053396e-06,
+ "loss": 0.5612,
+ "step": 3960
+ },
+ {
+ "epoch": 1.8728132387706857,
+ "grad_norm": 2.4815211296081543,
+ "learning_rate": 3.926749768678435e-06,
+ "loss": 0.5498,
+ "step": 3961
+ },
+ {
+ "epoch": 1.8732860520094563,
+ "grad_norm": 2.6819605827331543,
+ "learning_rate": 3.926237463314078e-06,
+ "loss": 0.5499,
+ "step": 3962
+ },
+ {
+ "epoch": 1.8737588652482269,
+ "grad_norm": 2.638664722442627,
+ "learning_rate": 3.925725069144168e-06,
+ "loss": 0.5429,
+ "step": 3963
+ },
+ {
+ "epoch": 1.8742316784869977,
+ "grad_norm": 2.527294874191284,
+ "learning_rate": 3.925212586200611e-06,
+ "loss": 0.5451,
+ "step": 3964
+ },
+ {
+ "epoch": 1.8747044917257685,
+ "grad_norm": 2.831638813018799,
+ "learning_rate": 3.924700014515315e-06,
+ "loss": 0.5276,
+ "step": 3965
+ },
+ {
+ "epoch": 1.875177304964539,
+ "grad_norm": 2.5906996726989746,
+ "learning_rate": 3.924187354120196e-06,
+ "loss": 0.5323,
+ "step": 3966
+ },
+ {
+ "epoch": 1.8756501182033096,
+ "grad_norm": 2.5482442378997803,
+ "learning_rate": 3.923674605047175e-06,
+ "loss": 0.4882,
+ "step": 3967
+ },
+ {
+ "epoch": 1.8761229314420804,
+ "grad_norm": 2.56402850151062,
+ "learning_rate": 3.923161767328179e-06,
+ "loss": 0.5111,
+ "step": 3968
+ },
+ {
+ "epoch": 1.8765957446808512,
+ "grad_norm": 3.223782539367676,
+ "learning_rate": 3.9226488409951405e-06,
+ "loss": 0.5829,
+ "step": 3969
+ },
+ {
+ "epoch": 1.8770685579196218,
+ "grad_norm": 2.665964365005493,
+ "learning_rate": 3.922135826079997e-06,
+ "loss": 0.4739,
+ "step": 3970
+ },
+ {
+ "epoch": 1.8775413711583924,
+ "grad_norm": 2.602696418762207,
+ "learning_rate": 3.921622722614691e-06,
+ "loss": 0.5199,
+ "step": 3971
+ },
+ {
+ "epoch": 1.8780141843971632,
+ "grad_norm": 2.5384418964385986,
+ "learning_rate": 3.921109530631172e-06,
+ "loss": 0.5086,
+ "step": 3972
+ },
+ {
+ "epoch": 1.878486997635934,
+ "grad_norm": 2.7961080074310303,
+ "learning_rate": 3.920596250161394e-06,
+ "loss": 0.5454,
+ "step": 3973
+ },
+ {
+ "epoch": 1.8789598108747045,
+ "grad_norm": 3.022007465362549,
+ "learning_rate": 3.920082881237317e-06,
+ "loss": 0.5537,
+ "step": 3974
+ },
+ {
+ "epoch": 1.8794326241134751,
+ "grad_norm": 2.699885129928589,
+ "learning_rate": 3.9195694238909045e-06,
+ "loss": 0.5274,
+ "step": 3975
+ },
+ {
+ "epoch": 1.879905437352246,
+ "grad_norm": 2.3994593620300293,
+ "learning_rate": 3.919055878154129e-06,
+ "loss": 0.4134,
+ "step": 3976
+ },
+ {
+ "epoch": 1.8803782505910167,
+ "grad_norm": 4.093045711517334,
+ "learning_rate": 3.918542244058967e-06,
+ "loss": 0.5305,
+ "step": 3977
+ },
+ {
+ "epoch": 1.8808510638297873,
+ "grad_norm": 3.011643171310425,
+ "learning_rate": 3.9180285216374e-06,
+ "loss": 0.5481,
+ "step": 3978
+ },
+ {
+ "epoch": 1.8813238770685579,
+ "grad_norm": 2.6426854133605957,
+ "learning_rate": 3.917514710921414e-06,
+ "loss": 0.5415,
+ "step": 3979
+ },
+ {
+ "epoch": 1.8817966903073287,
+ "grad_norm": 2.4379019737243652,
+ "learning_rate": 3.917000811943002e-06,
+ "loss": 0.4566,
+ "step": 3980
+ },
+ {
+ "epoch": 1.8822695035460995,
+ "grad_norm": 3.18522047996521,
+ "learning_rate": 3.9164868247341634e-06,
+ "loss": 0.6079,
+ "step": 3981
+ },
+ {
+ "epoch": 1.88274231678487,
+ "grad_norm": 2.6451141834259033,
+ "learning_rate": 3.915972749326903e-06,
+ "loss": 0.515,
+ "step": 3982
+ },
+ {
+ "epoch": 1.8832151300236406,
+ "grad_norm": 2.565598726272583,
+ "learning_rate": 3.915458585753226e-06,
+ "loss": 0.4799,
+ "step": 3983
+ },
+ {
+ "epoch": 1.8836879432624114,
+ "grad_norm": 2.711651563644409,
+ "learning_rate": 3.91494433404515e-06,
+ "loss": 0.5595,
+ "step": 3984
+ },
+ {
+ "epoch": 1.8841607565011822,
+ "grad_norm": 2.749328851699829,
+ "learning_rate": 3.914429994234695e-06,
+ "loss": 0.495,
+ "step": 3985
+ },
+ {
+ "epoch": 1.8846335697399526,
+ "grad_norm": 2.9492287635803223,
+ "learning_rate": 3.913915566353886e-06,
+ "loss": 0.5683,
+ "step": 3986
+ },
+ {
+ "epoch": 1.8851063829787233,
+ "grad_norm": 3.07747745513916,
+ "learning_rate": 3.913401050434756e-06,
+ "loss": 0.4953,
+ "step": 3987
+ },
+ {
+ "epoch": 1.8855791962174941,
+ "grad_norm": 2.8746345043182373,
+ "learning_rate": 3.912886446509338e-06,
+ "loss": 0.4752,
+ "step": 3988
+ },
+ {
+ "epoch": 1.8860520094562647,
+ "grad_norm": 2.772954225540161,
+ "learning_rate": 3.912371754609677e-06,
+ "loss": 0.5473,
+ "step": 3989
+ },
+ {
+ "epoch": 1.8865248226950353,
+ "grad_norm": 2.8906044960021973,
+ "learning_rate": 3.911856974767821e-06,
+ "loss": 0.5285,
+ "step": 3990
+ },
+ {
+ "epoch": 1.886997635933806,
+ "grad_norm": 2.8992726802825928,
+ "learning_rate": 3.9113421070158206e-06,
+ "loss": 0.571,
+ "step": 3991
+ },
+ {
+ "epoch": 1.887470449172577,
+ "grad_norm": 2.624662160873413,
+ "learning_rate": 3.910827151385737e-06,
+ "loss": 0.5183,
+ "step": 3992
+ },
+ {
+ "epoch": 1.8879432624113475,
+ "grad_norm": 2.4491732120513916,
+ "learning_rate": 3.910312107909632e-06,
+ "loss": 0.4205,
+ "step": 3993
+ },
+ {
+ "epoch": 1.888416075650118,
+ "grad_norm": 2.278259515762329,
+ "learning_rate": 3.909796976619575e-06,
+ "loss": 0.4464,
+ "step": 3994
+ },
+ {
+ "epoch": 1.8888888888888888,
+ "grad_norm": 2.6481523513793945,
+ "learning_rate": 3.909281757547644e-06,
+ "loss": 0.5023,
+ "step": 3995
+ },
+ {
+ "epoch": 1.8893617021276596,
+ "grad_norm": 2.6687493324279785,
+ "learning_rate": 3.908766450725917e-06,
+ "loss": 0.495,
+ "step": 3996
+ },
+ {
+ "epoch": 1.8898345153664302,
+ "grad_norm": 2.507525682449341,
+ "learning_rate": 3.908251056186481e-06,
+ "loss": 0.4155,
+ "step": 3997
+ },
+ {
+ "epoch": 1.8903073286052008,
+ "grad_norm": 2.7048323154449463,
+ "learning_rate": 3.907735573961426e-06,
+ "loss": 0.4601,
+ "step": 3998
+ },
+ {
+ "epoch": 1.8907801418439716,
+ "grad_norm": 2.6825389862060547,
+ "learning_rate": 3.907220004082848e-06,
+ "loss": 0.5067,
+ "step": 3999
+ },
+ {
+ "epoch": 1.8912529550827424,
+ "grad_norm": 2.775696039199829,
+ "learning_rate": 3.906704346582852e-06,
+ "loss": 0.5411,
+ "step": 4000
+ },
+ {
+ "epoch": 1.891725768321513,
+ "grad_norm": 2.4492077827453613,
+ "learning_rate": 3.906188601493545e-06,
+ "loss": 0.4931,
+ "step": 4001
+ },
+ {
+ "epoch": 1.8921985815602835,
+ "grad_norm": 2.320810556411743,
+ "learning_rate": 3.905672768847041e-06,
+ "loss": 0.4908,
+ "step": 4002
+ },
+ {
+ "epoch": 1.8926713947990543,
+ "grad_norm": 2.455162525177002,
+ "learning_rate": 3.905156848675455e-06,
+ "loss": 0.508,
+ "step": 4003
+ },
+ {
+ "epoch": 1.8931442080378251,
+ "grad_norm": 2.515921115875244,
+ "learning_rate": 3.904640841010915e-06,
+ "loss": 0.5318,
+ "step": 4004
+ },
+ {
+ "epoch": 1.8936170212765957,
+ "grad_norm": 2.7230770587921143,
+ "learning_rate": 3.904124745885548e-06,
+ "loss": 0.4793,
+ "step": 4005
+ },
+ {
+ "epoch": 1.8940898345153663,
+ "grad_norm": 2.519934892654419,
+ "learning_rate": 3.903608563331491e-06,
+ "loss": 0.5013,
+ "step": 4006
+ },
+ {
+ "epoch": 1.894562647754137,
+ "grad_norm": 2.719674587249756,
+ "learning_rate": 3.903092293380883e-06,
+ "loss": 0.516,
+ "step": 4007
+ },
+ {
+ "epoch": 1.8950354609929079,
+ "grad_norm": 3.2107343673706055,
+ "learning_rate": 3.902575936065869e-06,
+ "loss": 0.6297,
+ "step": 4008
+ },
+ {
+ "epoch": 1.8955082742316784,
+ "grad_norm": 2.9773149490356445,
+ "learning_rate": 3.902059491418603e-06,
+ "loss": 0.566,
+ "step": 4009
+ },
+ {
+ "epoch": 1.895981087470449,
+ "grad_norm": 2.6754770278930664,
+ "learning_rate": 3.90154295947124e-06,
+ "loss": 0.5187,
+ "step": 4010
+ },
+ {
+ "epoch": 1.8964539007092198,
+ "grad_norm": 2.457303762435913,
+ "learning_rate": 3.901026340255943e-06,
+ "loss": 0.5757,
+ "step": 4011
+ },
+ {
+ "epoch": 1.8969267139479906,
+ "grad_norm": 2.5944161415100098,
+ "learning_rate": 3.900509633804878e-06,
+ "loss": 0.5049,
+ "step": 4012
+ },
+ {
+ "epoch": 1.8973995271867612,
+ "grad_norm": 2.610445022583008,
+ "learning_rate": 3.89999284015022e-06,
+ "loss": 0.521,
+ "step": 4013
+ },
+ {
+ "epoch": 1.8978723404255318,
+ "grad_norm": 2.6949338912963867,
+ "learning_rate": 3.899475959324146e-06,
+ "loss": 0.5619,
+ "step": 4014
+ },
+ {
+ "epoch": 1.8983451536643026,
+ "grad_norm": 2.7889559268951416,
+ "learning_rate": 3.898958991358841e-06,
+ "loss": 0.5223,
+ "step": 4015
+ },
+ {
+ "epoch": 1.8988179669030734,
+ "grad_norm": 2.569265842437744,
+ "learning_rate": 3.898441936286493e-06,
+ "loss": 0.5724,
+ "step": 4016
+ },
+ {
+ "epoch": 1.899290780141844,
+ "grad_norm": 2.3567774295806885,
+ "learning_rate": 3.897924794139299e-06,
+ "loss": 0.4784,
+ "step": 4017
+ },
+ {
+ "epoch": 1.8997635933806145,
+ "grad_norm": 2.9176526069641113,
+ "learning_rate": 3.897407564949457e-06,
+ "loss": 0.646,
+ "step": 4018
+ },
+ {
+ "epoch": 1.9002364066193853,
+ "grad_norm": 2.7870090007781982,
+ "learning_rate": 3.896890248749174e-06,
+ "loss": 0.4922,
+ "step": 4019
+ },
+ {
+ "epoch": 1.900709219858156,
+ "grad_norm": 2.8310980796813965,
+ "learning_rate": 3.89637284557066e-06,
+ "loss": 0.4746,
+ "step": 4020
+ },
+ {
+ "epoch": 1.9011820330969267,
+ "grad_norm": 2.434915542602539,
+ "learning_rate": 3.895855355446131e-06,
+ "loss": 0.4537,
+ "step": 4021
+ },
+ {
+ "epoch": 1.9016548463356973,
+ "grad_norm": 3.0547034740448,
+ "learning_rate": 3.89533777840781e-06,
+ "loss": 0.6161,
+ "step": 4022
+ },
+ {
+ "epoch": 1.902127659574468,
+ "grad_norm": 3.416774272918701,
+ "learning_rate": 3.894820114487925e-06,
+ "loss": 0.5448,
+ "step": 4023
+ },
+ {
+ "epoch": 1.9026004728132389,
+ "grad_norm": 2.606951951980591,
+ "learning_rate": 3.894302363718707e-06,
+ "loss": 0.5501,
+ "step": 4024
+ },
+ {
+ "epoch": 1.9030732860520094,
+ "grad_norm": 3.082165002822876,
+ "learning_rate": 3.8937845261323945e-06,
+ "loss": 0.6035,
+ "step": 4025
+ },
+ {
+ "epoch": 1.90354609929078,
+ "grad_norm": 2.616093397140503,
+ "learning_rate": 3.893266601761231e-06,
+ "loss": 0.5294,
+ "step": 4026
+ },
+ {
+ "epoch": 1.9040189125295508,
+ "grad_norm": 2.7141637802124023,
+ "learning_rate": 3.8927485906374654e-06,
+ "loss": 0.5481,
+ "step": 4027
+ },
+ {
+ "epoch": 1.9044917257683216,
+ "grad_norm": 2.5129404067993164,
+ "learning_rate": 3.892230492793352e-06,
+ "loss": 0.4958,
+ "step": 4028
+ },
+ {
+ "epoch": 1.9049645390070922,
+ "grad_norm": 2.703403949737549,
+ "learning_rate": 3.891712308261151e-06,
+ "loss": 0.4852,
+ "step": 4029
+ },
+ {
+ "epoch": 1.9054373522458627,
+ "grad_norm": 2.881058931350708,
+ "learning_rate": 3.891194037073127e-06,
+ "loss": 0.4662,
+ "step": 4030
+ },
+ {
+ "epoch": 1.9059101654846335,
+ "grad_norm": 3.216769218444824,
+ "learning_rate": 3.8906756792615505e-06,
+ "loss": 0.5076,
+ "step": 4031
+ },
+ {
+ "epoch": 1.9063829787234043,
+ "grad_norm": 2.442265748977661,
+ "learning_rate": 3.890157234858697e-06,
+ "loss": 0.4748,
+ "step": 4032
+ },
+ {
+ "epoch": 1.906855791962175,
+ "grad_norm": 3.088672399520874,
+ "learning_rate": 3.889638703896849e-06,
+ "loss": 0.5729,
+ "step": 4033
+ },
+ {
+ "epoch": 1.9073286052009455,
+ "grad_norm": 2.9304986000061035,
+ "learning_rate": 3.889120086408291e-06,
+ "loss": 0.603,
+ "step": 4034
+ },
+ {
+ "epoch": 1.9078014184397163,
+ "grad_norm": 2.686093807220459,
+ "learning_rate": 3.888601382425318e-06,
+ "loss": 0.4978,
+ "step": 4035
+ },
+ {
+ "epoch": 1.908274231678487,
+ "grad_norm": 2.5668389797210693,
+ "learning_rate": 3.888082591980225e-06,
+ "loss": 0.5086,
+ "step": 4036
+ },
+ {
+ "epoch": 1.9087470449172577,
+ "grad_norm": 2.530996561050415,
+ "learning_rate": 3.887563715105315e-06,
+ "loss": 0.4678,
+ "step": 4037
+ },
+ {
+ "epoch": 1.9092198581560282,
+ "grad_norm": 3.043342351913452,
+ "learning_rate": 3.887044751832897e-06,
+ "loss": 0.5452,
+ "step": 4038
+ },
+ {
+ "epoch": 1.909692671394799,
+ "grad_norm": 2.799734115600586,
+ "learning_rate": 3.886525702195284e-06,
+ "loss": 0.5265,
+ "step": 4039
+ },
+ {
+ "epoch": 1.9101654846335698,
+ "grad_norm": 2.890022039413452,
+ "learning_rate": 3.886006566224796e-06,
+ "loss": 0.4634,
+ "step": 4040
+ },
+ {
+ "epoch": 1.9106382978723404,
+ "grad_norm": 2.6804237365722656,
+ "learning_rate": 3.8854873439537555e-06,
+ "loss": 0.5031,
+ "step": 4041
+ },
+ {
+ "epoch": 1.911111111111111,
+ "grad_norm": 2.43038272857666,
+ "learning_rate": 3.884968035414495e-06,
+ "loss": 0.5098,
+ "step": 4042
+ },
+ {
+ "epoch": 1.9115839243498818,
+ "grad_norm": 2.589583396911621,
+ "learning_rate": 3.884448640639346e-06,
+ "loss": 0.498,
+ "step": 4043
+ },
+ {
+ "epoch": 1.9120567375886526,
+ "grad_norm": 2.4565231800079346,
+ "learning_rate": 3.8839291596606524e-06,
+ "loss": 0.4318,
+ "step": 4044
+ },
+ {
+ "epoch": 1.9125295508274232,
+ "grad_norm": 2.66762638092041,
+ "learning_rate": 3.8834095925107575e-06,
+ "loss": 0.5441,
+ "step": 4045
+ },
+ {
+ "epoch": 1.9130023640661937,
+ "grad_norm": 2.7334461212158203,
+ "learning_rate": 3.882889939222013e-06,
+ "loss": 0.5209,
+ "step": 4046
+ },
+ {
+ "epoch": 1.9134751773049645,
+ "grad_norm": 2.6398537158966064,
+ "learning_rate": 3.8823701998267765e-06,
+ "loss": 0.4874,
+ "step": 4047
+ },
+ {
+ "epoch": 1.9139479905437353,
+ "grad_norm": 2.82405161857605,
+ "learning_rate": 3.881850374357409e-06,
+ "loss": 0.4519,
+ "step": 4048
+ },
+ {
+ "epoch": 1.914420803782506,
+ "grad_norm": 2.7552523612976074,
+ "learning_rate": 3.8813304628462776e-06,
+ "loss": 0.547,
+ "step": 4049
+ },
+ {
+ "epoch": 1.9148936170212765,
+ "grad_norm": 2.5287928581237793,
+ "learning_rate": 3.880810465325755e-06,
+ "loss": 0.5226,
+ "step": 4050
+ },
+ {
+ "epoch": 1.9153664302600473,
+ "grad_norm": 2.7597358226776123,
+ "learning_rate": 3.88029038182822e-06,
+ "loss": 0.5171,
+ "step": 4051
+ },
+ {
+ "epoch": 1.915839243498818,
+ "grad_norm": 2.563899278640747,
+ "learning_rate": 3.879770212386055e-06,
+ "loss": 0.4911,
+ "step": 4052
+ },
+ {
+ "epoch": 1.9163120567375886,
+ "grad_norm": 2.499404191970825,
+ "learning_rate": 3.879249957031649e-06,
+ "loss": 0.5072,
+ "step": 4053
+ },
+ {
+ "epoch": 1.9167848699763592,
+ "grad_norm": 2.817713499069214,
+ "learning_rate": 3.878729615797396e-06,
+ "loss": 0.5452,
+ "step": 4054
+ },
+ {
+ "epoch": 1.91725768321513,
+ "grad_norm": 2.7152490615844727,
+ "learning_rate": 3.878209188715696e-06,
+ "loss": 0.4917,
+ "step": 4055
+ },
+ {
+ "epoch": 1.9177304964539008,
+ "grad_norm": 2.384265661239624,
+ "learning_rate": 3.877688675818953e-06,
+ "loss": 0.4823,
+ "step": 4056
+ },
+ {
+ "epoch": 1.9182033096926714,
+ "grad_norm": 2.61059308052063,
+ "learning_rate": 3.877168077139577e-06,
+ "loss": 0.478,
+ "step": 4057
+ },
+ {
+ "epoch": 1.918676122931442,
+ "grad_norm": 2.6107938289642334,
+ "learning_rate": 3.8766473927099824e-06,
+ "loss": 0.5202,
+ "step": 4058
+ },
+ {
+ "epoch": 1.9191489361702128,
+ "grad_norm": 2.2339766025543213,
+ "learning_rate": 3.876126622562592e-06,
+ "loss": 0.547,
+ "step": 4059
+ },
+ {
+ "epoch": 1.9196217494089836,
+ "grad_norm": 2.4324610233306885,
+ "learning_rate": 3.8756057667298304e-06,
+ "loss": 0.5333,
+ "step": 4060
+ },
+ {
+ "epoch": 1.9200945626477541,
+ "grad_norm": 2.5521230697631836,
+ "learning_rate": 3.875084825244131e-06,
+ "loss": 0.5503,
+ "step": 4061
+ },
+ {
+ "epoch": 1.9205673758865247,
+ "grad_norm": 2.6985747814178467,
+ "learning_rate": 3.874563798137928e-06,
+ "loss": 0.4944,
+ "step": 4062
+ },
+ {
+ "epoch": 1.9210401891252955,
+ "grad_norm": 2.422332525253296,
+ "learning_rate": 3.874042685443664e-06,
+ "loss": 0.4807,
+ "step": 4063
+ },
+ {
+ "epoch": 1.9215130023640663,
+ "grad_norm": 2.914553165435791,
+ "learning_rate": 3.873521487193788e-06,
+ "loss": 0.4439,
+ "step": 4064
+ },
+ {
+ "epoch": 1.9219858156028369,
+ "grad_norm": 2.8098697662353516,
+ "learning_rate": 3.873000203420752e-06,
+ "loss": 0.5433,
+ "step": 4065
+ },
+ {
+ "epoch": 1.9224586288416075,
+ "grad_norm": 2.6124703884124756,
+ "learning_rate": 3.872478834157013e-06,
+ "loss": 0.4812,
+ "step": 4066
+ },
+ {
+ "epoch": 1.9229314420803783,
+ "grad_norm": 2.511059522628784,
+ "learning_rate": 3.871957379435035e-06,
+ "loss": 0.4666,
+ "step": 4067
+ },
+ {
+ "epoch": 1.923404255319149,
+ "grad_norm": 2.950542688369751,
+ "learning_rate": 3.871435839287287e-06,
+ "loss": 0.5687,
+ "step": 4068
+ },
+ {
+ "epoch": 1.9238770685579196,
+ "grad_norm": 2.4969422817230225,
+ "learning_rate": 3.870914213746243e-06,
+ "loss": 0.5235,
+ "step": 4069
+ },
+ {
+ "epoch": 1.9243498817966902,
+ "grad_norm": 2.512152910232544,
+ "learning_rate": 3.870392502844382e-06,
+ "loss": 0.4524,
+ "step": 4070
+ },
+ {
+ "epoch": 1.924822695035461,
+ "grad_norm": 3.0212557315826416,
+ "learning_rate": 3.86987070661419e-06,
+ "loss": 0.4868,
+ "step": 4071
+ },
+ {
+ "epoch": 1.9252955082742318,
+ "grad_norm": 2.8949966430664062,
+ "learning_rate": 3.869348825088154e-06,
+ "loss": 0.5556,
+ "step": 4072
+ },
+ {
+ "epoch": 1.9257683215130024,
+ "grad_norm": 2.402043581008911,
+ "learning_rate": 3.868826858298772e-06,
+ "loss": 0.5307,
+ "step": 4073
+ },
+ {
+ "epoch": 1.926241134751773,
+ "grad_norm": 2.980992078781128,
+ "learning_rate": 3.868304806278543e-06,
+ "loss": 0.6313,
+ "step": 4074
+ },
+ {
+ "epoch": 1.9267139479905437,
+ "grad_norm": 2.7140514850616455,
+ "learning_rate": 3.867782669059975e-06,
+ "loss": 0.5359,
+ "step": 4075
+ },
+ {
+ "epoch": 1.9271867612293145,
+ "grad_norm": 2.499631643295288,
+ "learning_rate": 3.867260446675577e-06,
+ "loss": 0.4873,
+ "step": 4076
+ },
+ {
+ "epoch": 1.9276595744680851,
+ "grad_norm": 2.915583610534668,
+ "learning_rate": 3.866738139157866e-06,
+ "loss": 0.5736,
+ "step": 4077
+ },
+ {
+ "epoch": 1.9281323877068557,
+ "grad_norm": 2.4231131076812744,
+ "learning_rate": 3.866215746539363e-06,
+ "loss": 0.5096,
+ "step": 4078
+ },
+ {
+ "epoch": 1.9286052009456265,
+ "grad_norm": 2.360074996948242,
+ "learning_rate": 3.865693268852599e-06,
+ "loss": 0.4907,
+ "step": 4079
+ },
+ {
+ "epoch": 1.9290780141843973,
+ "grad_norm": 2.5410032272338867,
+ "learning_rate": 3.865170706130101e-06,
+ "loss": 0.473,
+ "step": 4080
+ },
+ {
+ "epoch": 1.9295508274231679,
+ "grad_norm": 2.780090808868408,
+ "learning_rate": 3.86464805840441e-06,
+ "loss": 0.5213,
+ "step": 4081
+ },
+ {
+ "epoch": 1.9300236406619384,
+ "grad_norm": 2.7318382263183594,
+ "learning_rate": 3.864125325708068e-06,
+ "loss": 0.5617,
+ "step": 4082
+ },
+ {
+ "epoch": 1.9304964539007092,
+ "grad_norm": 2.76509165763855,
+ "learning_rate": 3.863602508073623e-06,
+ "loss": 0.52,
+ "step": 4083
+ },
+ {
+ "epoch": 1.93096926713948,
+ "grad_norm": 2.8041110038757324,
+ "learning_rate": 3.863079605533631e-06,
+ "loss": 0.5343,
+ "step": 4084
+ },
+ {
+ "epoch": 1.9314420803782506,
+ "grad_norm": 2.4462404251098633,
+ "learning_rate": 3.862556618120647e-06,
+ "loss": 0.4657,
+ "step": 4085
+ },
+ {
+ "epoch": 1.9319148936170212,
+ "grad_norm": 2.460864305496216,
+ "learning_rate": 3.862033545867238e-06,
+ "loss": 0.517,
+ "step": 4086
+ },
+ {
+ "epoch": 1.932387706855792,
+ "grad_norm": 2.6480276584625244,
+ "learning_rate": 3.8615103888059715e-06,
+ "loss": 0.4702,
+ "step": 4087
+ },
+ {
+ "epoch": 1.9328605200945628,
+ "grad_norm": 2.7175381183624268,
+ "learning_rate": 3.860987146969424e-06,
+ "loss": 0.5073,
+ "step": 4088
+ },
+ {
+ "epoch": 1.9333333333333333,
+ "grad_norm": 2.4963486194610596,
+ "learning_rate": 3.860463820390175e-06,
+ "loss": 0.4491,
+ "step": 4089
+ },
+ {
+ "epoch": 1.933806146572104,
+ "grad_norm": 2.548135757446289,
+ "learning_rate": 3.8599404091008075e-06,
+ "loss": 0.5134,
+ "step": 4090
+ },
+ {
+ "epoch": 1.9342789598108747,
+ "grad_norm": 2.8693668842315674,
+ "learning_rate": 3.859416913133916e-06,
+ "loss": 0.5467,
+ "step": 4091
+ },
+ {
+ "epoch": 1.9347517730496455,
+ "grad_norm": 2.711273670196533,
+ "learning_rate": 3.858893332522092e-06,
+ "loss": 0.6287,
+ "step": 4092
+ },
+ {
+ "epoch": 1.935224586288416,
+ "grad_norm": 2.8604533672332764,
+ "learning_rate": 3.858369667297941e-06,
+ "loss": 0.5661,
+ "step": 4093
+ },
+ {
+ "epoch": 1.9356973995271867,
+ "grad_norm": 2.936988353729248,
+ "learning_rate": 3.857845917494066e-06,
+ "loss": 0.5311,
+ "step": 4094
+ },
+ {
+ "epoch": 1.9361702127659575,
+ "grad_norm": 2.414093494415283,
+ "learning_rate": 3.857322083143079e-06,
+ "loss": 0.505,
+ "step": 4095
+ },
+ {
+ "epoch": 1.9366430260047283,
+ "grad_norm": 2.5528934001922607,
+ "learning_rate": 3.856798164277599e-06,
+ "loss": 0.4759,
+ "step": 4096
+ },
+ {
+ "epoch": 1.9371158392434988,
+ "grad_norm": 2.592893600463867,
+ "learning_rate": 3.8562741609302456e-06,
+ "loss": 0.4932,
+ "step": 4097
+ },
+ {
+ "epoch": 1.9375886524822694,
+ "grad_norm": 2.9619107246398926,
+ "learning_rate": 3.855750073133648e-06,
+ "loss": 0.5563,
+ "step": 4098
+ },
+ {
+ "epoch": 1.9380614657210402,
+ "grad_norm": 2.864889621734619,
+ "learning_rate": 3.855225900920438e-06,
+ "loss": 0.5069,
+ "step": 4099
+ },
+ {
+ "epoch": 1.938534278959811,
+ "grad_norm": 2.3951032161712646,
+ "learning_rate": 3.854701644323253e-06,
+ "loss": 0.4883,
+ "step": 4100
+ },
+ {
+ "epoch": 1.9390070921985816,
+ "grad_norm": 2.6339633464813232,
+ "learning_rate": 3.854177303374737e-06,
+ "loss": 0.5207,
+ "step": 4101
+ },
+ {
+ "epoch": 1.9394799054373522,
+ "grad_norm": 2.6435508728027344,
+ "learning_rate": 3.853652878107539e-06,
+ "loss": 0.4679,
+ "step": 4102
+ },
+ {
+ "epoch": 1.939952718676123,
+ "grad_norm": 2.4635629653930664,
+ "learning_rate": 3.853128368554311e-06,
+ "loss": 0.5639,
+ "step": 4103
+ },
+ {
+ "epoch": 1.9404255319148938,
+ "grad_norm": 2.664635419845581,
+ "learning_rate": 3.852603774747714e-06,
+ "loss": 0.5697,
+ "step": 4104
+ },
+ {
+ "epoch": 1.9408983451536643,
+ "grad_norm": 2.7020363807678223,
+ "learning_rate": 3.8520790967204095e-06,
+ "loss": 0.5462,
+ "step": 4105
+ },
+ {
+ "epoch": 1.941371158392435,
+ "grad_norm": 3.529282331466675,
+ "learning_rate": 3.851554334505069e-06,
+ "loss": 0.54,
+ "step": 4106
+ },
+ {
+ "epoch": 1.9418439716312057,
+ "grad_norm": 2.7125768661499023,
+ "learning_rate": 3.851029488134367e-06,
+ "loss": 0.5355,
+ "step": 4107
+ },
+ {
+ "epoch": 1.9423167848699765,
+ "grad_norm": 2.5226643085479736,
+ "learning_rate": 3.850504557640981e-06,
+ "loss": 0.5106,
+ "step": 4108
+ },
+ {
+ "epoch": 1.942789598108747,
+ "grad_norm": 2.834352731704712,
+ "learning_rate": 3.8499795430575995e-06,
+ "loss": 0.6069,
+ "step": 4109
+ },
+ {
+ "epoch": 1.9432624113475176,
+ "grad_norm": 2.8484177589416504,
+ "learning_rate": 3.849454444416911e-06,
+ "loss": 0.5542,
+ "step": 4110
+ },
+ {
+ "epoch": 1.9437352245862884,
+ "grad_norm": 2.402539014816284,
+ "learning_rate": 3.848929261751612e-06,
+ "loss": 0.47,
+ "step": 4111
+ },
+ {
+ "epoch": 1.9442080378250592,
+ "grad_norm": 2.7010042667388916,
+ "learning_rate": 3.848403995094402e-06,
+ "loss": 0.5263,
+ "step": 4112
+ },
+ {
+ "epoch": 1.9446808510638298,
+ "grad_norm": 2.441689968109131,
+ "learning_rate": 3.847878644477988e-06,
+ "loss": 0.5607,
+ "step": 4113
+ },
+ {
+ "epoch": 1.9451536643026004,
+ "grad_norm": 2.5994722843170166,
+ "learning_rate": 3.847353209935081e-06,
+ "loss": 0.5103,
+ "step": 4114
+ },
+ {
+ "epoch": 1.9456264775413712,
+ "grad_norm": 2.452242136001587,
+ "learning_rate": 3.8468276914983975e-06,
+ "loss": 0.4409,
+ "step": 4115
+ },
+ {
+ "epoch": 1.946099290780142,
+ "grad_norm": 2.421023368835449,
+ "learning_rate": 3.84630208920066e-06,
+ "loss": 0.4429,
+ "step": 4116
+ },
+ {
+ "epoch": 1.9465721040189126,
+ "grad_norm": 2.696399688720703,
+ "learning_rate": 3.8457764030745945e-06,
+ "loss": 0.5352,
+ "step": 4117
+ },
+ {
+ "epoch": 1.9470449172576831,
+ "grad_norm": 2.3963489532470703,
+ "learning_rate": 3.845250633152933e-06,
+ "loss": 0.4505,
+ "step": 4118
+ },
+ {
+ "epoch": 1.947517730496454,
+ "grad_norm": 2.610649585723877,
+ "learning_rate": 3.8447247794684135e-06,
+ "loss": 0.501,
+ "step": 4119
+ },
+ {
+ "epoch": 1.9479905437352247,
+ "grad_norm": 2.740412712097168,
+ "learning_rate": 3.8441988420537775e-06,
+ "loss": 0.5362,
+ "step": 4120
+ },
+ {
+ "epoch": 1.9484633569739953,
+ "grad_norm": 2.2614004611968994,
+ "learning_rate": 3.8436728209417755e-06,
+ "loss": 0.4199,
+ "step": 4121
+ },
+ {
+ "epoch": 1.9489361702127659,
+ "grad_norm": 3.0683481693267822,
+ "learning_rate": 3.843146716165158e-06,
+ "loss": 0.5248,
+ "step": 4122
+ },
+ {
+ "epoch": 1.9494089834515367,
+ "grad_norm": 3.005174398422241,
+ "learning_rate": 3.842620527756684e-06,
+ "loss": 0.5246,
+ "step": 4123
+ },
+ {
+ "epoch": 1.9498817966903075,
+ "grad_norm": 2.672896385192871,
+ "learning_rate": 3.842094255749117e-06,
+ "loss": 0.5586,
+ "step": 4124
+ },
+ {
+ "epoch": 1.950354609929078,
+ "grad_norm": 2.5481197834014893,
+ "learning_rate": 3.8415679001752255e-06,
+ "loss": 0.5061,
+ "step": 4125
+ },
+ {
+ "epoch": 1.9508274231678486,
+ "grad_norm": 2.515789270401001,
+ "learning_rate": 3.8410414610677835e-06,
+ "loss": 0.4645,
+ "step": 4126
+ },
+ {
+ "epoch": 1.9513002364066194,
+ "grad_norm": 2.7236077785491943,
+ "learning_rate": 3.84051493845957e-06,
+ "loss": 0.5623,
+ "step": 4127
+ },
+ {
+ "epoch": 1.9517730496453902,
+ "grad_norm": 2.6252009868621826,
+ "learning_rate": 3.839988332383369e-06,
+ "loss": 0.5078,
+ "step": 4128
+ },
+ {
+ "epoch": 1.9522458628841608,
+ "grad_norm": 2.719196081161499,
+ "learning_rate": 3.83946164287197e-06,
+ "loss": 0.5481,
+ "step": 4129
+ },
+ {
+ "epoch": 1.9527186761229314,
+ "grad_norm": 2.484163284301758,
+ "learning_rate": 3.838934869958169e-06,
+ "loss": 0.5332,
+ "step": 4130
+ },
+ {
+ "epoch": 1.9531914893617022,
+ "grad_norm": 2.615382671356201,
+ "learning_rate": 3.838408013674764e-06,
+ "loss": 0.4742,
+ "step": 4131
+ },
+ {
+ "epoch": 1.953664302600473,
+ "grad_norm": 2.735321044921875,
+ "learning_rate": 3.83788107405456e-06,
+ "loss": 0.421,
+ "step": 4132
+ },
+ {
+ "epoch": 1.9541371158392435,
+ "grad_norm": 2.892652750015259,
+ "learning_rate": 3.837354051130369e-06,
+ "loss": 0.5326,
+ "step": 4133
+ },
+ {
+ "epoch": 1.9546099290780141,
+ "grad_norm": 2.6800546646118164,
+ "learning_rate": 3.8368269449350055e-06,
+ "loss": 0.5041,
+ "step": 4134
+ },
+ {
+ "epoch": 1.955082742316785,
+ "grad_norm": 2.362470865249634,
+ "learning_rate": 3.836299755501289e-06,
+ "loss": 0.4697,
+ "step": 4135
+ },
+ {
+ "epoch": 1.9555555555555557,
+ "grad_norm": 2.3855135440826416,
+ "learning_rate": 3.835772482862047e-06,
+ "loss": 0.5148,
+ "step": 4136
+ },
+ {
+ "epoch": 1.9560283687943263,
+ "grad_norm": 2.3338418006896973,
+ "learning_rate": 3.83524512705011e-06,
+ "loss": 0.4643,
+ "step": 4137
+ },
+ {
+ "epoch": 1.9565011820330969,
+ "grad_norm": 2.261355400085449,
+ "learning_rate": 3.834717688098313e-06,
+ "loss": 0.5573,
+ "step": 4138
+ },
+ {
+ "epoch": 1.9569739952718677,
+ "grad_norm": 2.8166391849517822,
+ "learning_rate": 3.834190166039498e-06,
+ "loss": 0.4868,
+ "step": 4139
+ },
+ {
+ "epoch": 1.9574468085106385,
+ "grad_norm": 2.4155869483947754,
+ "learning_rate": 3.833662560906512e-06,
+ "loss": 0.4923,
+ "step": 4140
+ },
+ {
+ "epoch": 1.957919621749409,
+ "grad_norm": 2.3977696895599365,
+ "learning_rate": 3.833134872732206e-06,
+ "loss": 0.5106,
+ "step": 4141
+ },
+ {
+ "epoch": 1.9583924349881796,
+ "grad_norm": 2.9541378021240234,
+ "learning_rate": 3.832607101549438e-06,
+ "loss": 0.4683,
+ "step": 4142
+ },
+ {
+ "epoch": 1.9588652482269504,
+ "grad_norm": 2.5862700939178467,
+ "learning_rate": 3.832079247391068e-06,
+ "loss": 0.4453,
+ "step": 4143
+ },
+ {
+ "epoch": 1.9593380614657212,
+ "grad_norm": 2.7459371089935303,
+ "learning_rate": 3.8315513102899644e-06,
+ "loss": 0.5511,
+ "step": 4144
+ },
+ {
+ "epoch": 1.9598108747044918,
+ "grad_norm": 2.904869556427002,
+ "learning_rate": 3.831023290279e-06,
+ "loss": 0.5348,
+ "step": 4145
+ },
+ {
+ "epoch": 1.9602836879432624,
+ "grad_norm": 3.092846632003784,
+ "learning_rate": 3.830495187391051e-06,
+ "loss": 0.5664,
+ "step": 4146
+ },
+ {
+ "epoch": 1.9607565011820332,
+ "grad_norm": 3.2838528156280518,
+ "learning_rate": 3.829967001659001e-06,
+ "loss": 0.5115,
+ "step": 4147
+ },
+ {
+ "epoch": 1.961229314420804,
+ "grad_norm": 2.7799549102783203,
+ "learning_rate": 3.829438733115738e-06,
+ "loss": 0.5145,
+ "step": 4148
+ },
+ {
+ "epoch": 1.9617021276595743,
+ "grad_norm": 2.436084270477295,
+ "learning_rate": 3.828910381794154e-06,
+ "loss": 0.4718,
+ "step": 4149
+ },
+ {
+ "epoch": 1.962174940898345,
+ "grad_norm": 2.6662371158599854,
+ "learning_rate": 3.828381947727148e-06,
+ "loss": 0.6129,
+ "step": 4150
+ },
+ {
+ "epoch": 1.962647754137116,
+ "grad_norm": 2.937000036239624,
+ "learning_rate": 3.827853430947622e-06,
+ "loss": 0.522,
+ "step": 4151
+ },
+ {
+ "epoch": 1.9631205673758865,
+ "grad_norm": 2.5737369060516357,
+ "learning_rate": 3.827324831488486e-06,
+ "loss": 0.4916,
+ "step": 4152
+ },
+ {
+ "epoch": 1.963593380614657,
+ "grad_norm": 2.70232892036438,
+ "learning_rate": 3.826796149382653e-06,
+ "loss": 0.4726,
+ "step": 4153
+ },
+ {
+ "epoch": 1.9640661938534278,
+ "grad_norm": 2.6899707317352295,
+ "learning_rate": 3.826267384663042e-06,
+ "loss": 0.529,
+ "step": 4154
+ },
+ {
+ "epoch": 1.9645390070921986,
+ "grad_norm": 2.6142728328704834,
+ "learning_rate": 3.825738537362575e-06,
+ "loss": 0.4999,
+ "step": 4155
+ },
+ {
+ "epoch": 1.9650118203309692,
+ "grad_norm": 2.43949818611145,
+ "learning_rate": 3.825209607514183e-06,
+ "loss": 0.5035,
+ "step": 4156
+ },
+ {
+ "epoch": 1.9654846335697398,
+ "grad_norm": 2.3735458850860596,
+ "learning_rate": 3.824680595150801e-06,
+ "loss": 0.4779,
+ "step": 4157
+ },
+ {
+ "epoch": 1.9659574468085106,
+ "grad_norm": 2.444307565689087,
+ "learning_rate": 3.824151500305365e-06,
+ "loss": 0.4825,
+ "step": 4158
+ },
+ {
+ "epoch": 1.9664302600472814,
+ "grad_norm": 2.8219668865203857,
+ "learning_rate": 3.8236223230108224e-06,
+ "loss": 0.5354,
+ "step": 4159
+ },
+ {
+ "epoch": 1.966903073286052,
+ "grad_norm": 2.720721483230591,
+ "learning_rate": 3.823093063300121e-06,
+ "loss": 0.5064,
+ "step": 4160
+ },
+ {
+ "epoch": 1.9673758865248225,
+ "grad_norm": 2.324190616607666,
+ "learning_rate": 3.822563721206217e-06,
+ "loss": 0.5348,
+ "step": 4161
+ },
+ {
+ "epoch": 1.9678486997635933,
+ "grad_norm": 2.702155351638794,
+ "learning_rate": 3.8220342967620695e-06,
+ "loss": 0.5388,
+ "step": 4162
+ },
+ {
+ "epoch": 1.9683215130023641,
+ "grad_norm": 2.4956369400024414,
+ "learning_rate": 3.821504790000642e-06,
+ "loss": 0.5071,
+ "step": 4163
+ },
+ {
+ "epoch": 1.9687943262411347,
+ "grad_norm": 2.568039655685425,
+ "learning_rate": 3.820975200954906e-06,
+ "loss": 0.5133,
+ "step": 4164
+ },
+ {
+ "epoch": 1.9692671394799053,
+ "grad_norm": 2.810868978500366,
+ "learning_rate": 3.820445529657837e-06,
+ "loss": 0.4856,
+ "step": 4165
+ },
+ {
+ "epoch": 1.969739952718676,
+ "grad_norm": 2.66365647315979,
+ "learning_rate": 3.819915776142415e-06,
+ "loss": 0.5235,
+ "step": 4166
+ },
+ {
+ "epoch": 1.9702127659574469,
+ "grad_norm": 2.2982139587402344,
+ "learning_rate": 3.8193859404416265e-06,
+ "loss": 0.4361,
+ "step": 4167
+ },
+ {
+ "epoch": 1.9706855791962175,
+ "grad_norm": 2.585672378540039,
+ "learning_rate": 3.818856022588458e-06,
+ "loss": 0.4842,
+ "step": 4168
+ },
+ {
+ "epoch": 1.971158392434988,
+ "grad_norm": 2.57857346534729,
+ "learning_rate": 3.81832602261591e-06,
+ "loss": 0.5249,
+ "step": 4169
+ },
+ {
+ "epoch": 1.9716312056737588,
+ "grad_norm": 2.6947224140167236,
+ "learning_rate": 3.817795940556981e-06,
+ "loss": 0.5234,
+ "step": 4170
+ },
+ {
+ "epoch": 1.9721040189125296,
+ "grad_norm": 2.7453415393829346,
+ "learning_rate": 3.8172657764446764e-06,
+ "loss": 0.5219,
+ "step": 4171
+ },
+ {
+ "epoch": 1.9725768321513002,
+ "grad_norm": 8.424073219299316,
+ "learning_rate": 3.816735530312009e-06,
+ "loss": 0.5162,
+ "step": 4172
+ },
+ {
+ "epoch": 1.9730496453900708,
+ "grad_norm": 2.8229739665985107,
+ "learning_rate": 3.816205202191993e-06,
+ "loss": 0.4621,
+ "step": 4173
+ },
+ {
+ "epoch": 1.9735224586288416,
+ "grad_norm": 2.5969009399414062,
+ "learning_rate": 3.815674792117651e-06,
+ "loss": 0.5044,
+ "step": 4174
+ },
+ {
+ "epoch": 1.9739952718676124,
+ "grad_norm": 2.646024227142334,
+ "learning_rate": 3.815144300122009e-06,
+ "loss": 0.5094,
+ "step": 4175
+ },
+ {
+ "epoch": 1.974468085106383,
+ "grad_norm": 2.4950616359710693,
+ "learning_rate": 3.814613726238097e-06,
+ "loss": 0.4827,
+ "step": 4176
+ },
+ {
+ "epoch": 1.9749408983451535,
+ "grad_norm": 2.5636119842529297,
+ "learning_rate": 3.8140830704989535e-06,
+ "loss": 0.5241,
+ "step": 4177
+ },
+ {
+ "epoch": 1.9754137115839243,
+ "grad_norm": 2.7936553955078125,
+ "learning_rate": 3.813552332937619e-06,
+ "loss": 0.5344,
+ "step": 4178
+ },
+ {
+ "epoch": 1.9758865248226951,
+ "grad_norm": 2.8085341453552246,
+ "learning_rate": 3.8130215135871405e-06,
+ "loss": 0.5647,
+ "step": 4179
+ },
+ {
+ "epoch": 1.9763593380614657,
+ "grad_norm": 2.4776322841644287,
+ "learning_rate": 3.8124906124805694e-06,
+ "loss": 0.542,
+ "step": 4180
+ },
+ {
+ "epoch": 1.9768321513002363,
+ "grad_norm": 2.3227856159210205,
+ "learning_rate": 3.8119596296509635e-06,
+ "loss": 0.4618,
+ "step": 4181
+ },
+ {
+ "epoch": 1.977304964539007,
+ "grad_norm": 2.5157814025878906,
+ "learning_rate": 3.8114285651313848e-06,
+ "loss": 0.538,
+ "step": 4182
+ },
+ {
+ "epoch": 1.9777777777777779,
+ "grad_norm": 2.5630218982696533,
+ "learning_rate": 3.8108974189548987e-06,
+ "loss": 0.5254,
+ "step": 4183
+ },
+ {
+ "epoch": 1.9782505910165484,
+ "grad_norm": 2.703237533569336,
+ "learning_rate": 3.8103661911545787e-06,
+ "loss": 0.4859,
+ "step": 4184
+ },
+ {
+ "epoch": 1.978723404255319,
+ "grad_norm": 2.8808000087738037,
+ "learning_rate": 3.809834881763502e-06,
+ "loss": 0.5585,
+ "step": 4185
+ },
+ {
+ "epoch": 1.9791962174940898,
+ "grad_norm": 2.9047577381134033,
+ "learning_rate": 3.8093034908147507e-06,
+ "loss": 0.5022,
+ "step": 4186
+ },
+ {
+ "epoch": 1.9796690307328606,
+ "grad_norm": 2.7417640686035156,
+ "learning_rate": 3.8087720183414125e-06,
+ "loss": 0.5275,
+ "step": 4187
+ },
+ {
+ "epoch": 1.9801418439716312,
+ "grad_norm": 2.952012062072754,
+ "learning_rate": 3.8082404643765786e-06,
+ "loss": 0.543,
+ "step": 4188
+ },
+ {
+ "epoch": 1.9806146572104018,
+ "grad_norm": 2.538376569747925,
+ "learning_rate": 3.807708828953348e-06,
+ "loss": 0.4969,
+ "step": 4189
+ },
+ {
+ "epoch": 1.9810874704491725,
+ "grad_norm": 2.3476181030273438,
+ "learning_rate": 3.807177112104823e-06,
+ "loss": 0.4979,
+ "step": 4190
+ },
+ {
+ "epoch": 1.9815602836879433,
+ "grad_norm": 2.6480464935302734,
+ "learning_rate": 3.80664531386411e-06,
+ "loss": 0.4894,
+ "step": 4191
+ },
+ {
+ "epoch": 1.982033096926714,
+ "grad_norm": 2.792916774749756,
+ "learning_rate": 3.8061134342643235e-06,
+ "loss": 0.5468,
+ "step": 4192
+ },
+ {
+ "epoch": 1.9825059101654845,
+ "grad_norm": 2.368736743927002,
+ "learning_rate": 3.805581473338581e-06,
+ "loss": 0.4672,
+ "step": 4193
+ },
+ {
+ "epoch": 1.9829787234042553,
+ "grad_norm": 2.379084348678589,
+ "learning_rate": 3.8050494311200037e-06,
+ "loss": 0.4577,
+ "step": 4194
+ },
+ {
+ "epoch": 1.983451536643026,
+ "grad_norm": 2.722471237182617,
+ "learning_rate": 3.804517307641722e-06,
+ "loss": 0.4988,
+ "step": 4195
+ },
+ {
+ "epoch": 1.9839243498817967,
+ "grad_norm": 2.356649875640869,
+ "learning_rate": 3.8039851029368674e-06,
+ "loss": 0.4933,
+ "step": 4196
+ },
+ {
+ "epoch": 1.9843971631205672,
+ "grad_norm": 2.9182281494140625,
+ "learning_rate": 3.8034528170385776e-06,
+ "loss": 0.4873,
+ "step": 4197
+ },
+ {
+ "epoch": 1.984869976359338,
+ "grad_norm": 2.6232199668884277,
+ "learning_rate": 3.8029204499799976e-06,
+ "loss": 0.4425,
+ "step": 4198
+ },
+ {
+ "epoch": 1.9853427895981088,
+ "grad_norm": 2.667541980743408,
+ "learning_rate": 3.802388001794274e-06,
+ "loss": 0.5022,
+ "step": 4199
+ },
+ {
+ "epoch": 1.9858156028368794,
+ "grad_norm": 3.168470621109009,
+ "learning_rate": 3.8018554725145596e-06,
+ "loss": 0.5505,
+ "step": 4200
+ },
+ {
+ "epoch": 1.98628841607565,
+ "grad_norm": 2.716625452041626,
+ "learning_rate": 3.8013228621740132e-06,
+ "loss": 0.4937,
+ "step": 4201
+ },
+ {
+ "epoch": 1.9867612293144208,
+ "grad_norm": 2.3014442920684814,
+ "learning_rate": 3.800790170805799e-06,
+ "loss": 0.4734,
+ "step": 4202
+ },
+ {
+ "epoch": 1.9872340425531916,
+ "grad_norm": 2.9426841735839844,
+ "learning_rate": 3.8002573984430847e-06,
+ "loss": 0.4983,
+ "step": 4203
+ },
+ {
+ "epoch": 1.9877068557919622,
+ "grad_norm": 2.5598278045654297,
+ "learning_rate": 3.7997245451190435e-06,
+ "loss": 0.4834,
+ "step": 4204
+ },
+ {
+ "epoch": 1.9881796690307327,
+ "grad_norm": 2.86458420753479,
+ "learning_rate": 3.7991916108668538e-06,
+ "loss": 0.5613,
+ "step": 4205
+ },
+ {
+ "epoch": 1.9886524822695035,
+ "grad_norm": 2.842914342880249,
+ "learning_rate": 3.7986585957196997e-06,
+ "loss": 0.4951,
+ "step": 4206
+ },
+ {
+ "epoch": 1.9891252955082743,
+ "grad_norm": 3.1828150749206543,
+ "learning_rate": 3.7981254997107686e-06,
+ "loss": 0.5913,
+ "step": 4207
+ },
+ {
+ "epoch": 1.989598108747045,
+ "grad_norm": 2.5765931606292725,
+ "learning_rate": 3.7975923228732547e-06,
+ "loss": 0.5544,
+ "step": 4208
+ },
+ {
+ "epoch": 1.9900709219858155,
+ "grad_norm": 2.492234945297241,
+ "learning_rate": 3.797059065240357e-06,
+ "loss": 0.5046,
+ "step": 4209
+ },
+ {
+ "epoch": 1.9905437352245863,
+ "grad_norm": 2.870346784591675,
+ "learning_rate": 3.7965257268452795e-06,
+ "loss": 0.5354,
+ "step": 4210
+ },
+ {
+ "epoch": 1.991016548463357,
+ "grad_norm": 2.4989993572235107,
+ "learning_rate": 3.795992307721229e-06,
+ "loss": 0.4677,
+ "step": 4211
+ },
+ {
+ "epoch": 1.9914893617021276,
+ "grad_norm": 2.931114673614502,
+ "learning_rate": 3.7954588079014206e-06,
+ "loss": 0.5504,
+ "step": 4212
+ },
+ {
+ "epoch": 1.9919621749408982,
+ "grad_norm": 2.5247652530670166,
+ "learning_rate": 3.794925227419073e-06,
+ "loss": 0.4736,
+ "step": 4213
+ },
+ {
+ "epoch": 1.992434988179669,
+ "grad_norm": 2.6238436698913574,
+ "learning_rate": 3.794391566307409e-06,
+ "loss": 0.4591,
+ "step": 4214
+ },
+ {
+ "epoch": 1.9929078014184398,
+ "grad_norm": 2.654886245727539,
+ "learning_rate": 3.7938578245996584e-06,
+ "loss": 0.5149,
+ "step": 4215
+ },
+ {
+ "epoch": 1.9933806146572104,
+ "grad_norm": 2.509164810180664,
+ "learning_rate": 3.793324002329054e-06,
+ "loss": 0.4951,
+ "step": 4216
+ },
+ {
+ "epoch": 1.993853427895981,
+ "grad_norm": 2.909632921218872,
+ "learning_rate": 3.7927900995288345e-06,
+ "loss": 0.5131,
+ "step": 4217
+ },
+ {
+ "epoch": 1.9943262411347518,
+ "grad_norm": 2.4354615211486816,
+ "learning_rate": 3.7922561162322456e-06,
+ "loss": 0.4716,
+ "step": 4218
+ },
+ {
+ "epoch": 1.9947990543735226,
+ "grad_norm": 2.6514649391174316,
+ "learning_rate": 3.791722052472534e-06,
+ "loss": 0.5714,
+ "step": 4219
+ },
+ {
+ "epoch": 1.9952718676122931,
+ "grad_norm": 2.77089262008667,
+ "learning_rate": 3.791187908282954e-06,
+ "loss": 0.5736,
+ "step": 4220
+ },
+ {
+ "epoch": 1.9957446808510637,
+ "grad_norm": 2.7651021480560303,
+ "learning_rate": 3.7906536836967657e-06,
+ "loss": 0.4948,
+ "step": 4221
+ },
+ {
+ "epoch": 1.9962174940898345,
+ "grad_norm": 2.7536795139312744,
+ "learning_rate": 3.7901193787472306e-06,
+ "loss": 0.512,
+ "step": 4222
+ },
+ {
+ "epoch": 1.9966903073286053,
+ "grad_norm": 2.684893846511841,
+ "learning_rate": 3.78958499346762e-06,
+ "loss": 0.5118,
+ "step": 4223
+ },
+ {
+ "epoch": 1.9971631205673759,
+ "grad_norm": 2.7616753578186035,
+ "learning_rate": 3.7890505278912054e-06,
+ "loss": 0.4516,
+ "step": 4224
+ },
+ {
+ "epoch": 1.9976359338061465,
+ "grad_norm": 2.4731967449188232,
+ "learning_rate": 3.7885159820512666e-06,
+ "loss": 0.4736,
+ "step": 4225
+ },
+ {
+ "epoch": 1.9981087470449173,
+ "grad_norm": 2.366631031036377,
+ "learning_rate": 3.7879813559810884e-06,
+ "loss": 0.4999,
+ "step": 4226
+ },
+ {
+ "epoch": 1.998581560283688,
+ "grad_norm": 2.994624137878418,
+ "learning_rate": 3.7874466497139582e-06,
+ "loss": 0.5273,
+ "step": 4227
+ },
+ {
+ "epoch": 1.9990543735224586,
+ "grad_norm": 2.4499242305755615,
+ "learning_rate": 3.7869118632831712e-06,
+ "loss": 0.5761,
+ "step": 4228
+ },
+ {
+ "epoch": 1.9995271867612292,
+ "grad_norm": 2.3370113372802734,
+ "learning_rate": 3.7863769967220243e-06,
+ "loss": 0.4673,
+ "step": 4229
+ },
+ {
+ "epoch": 2.0,
+ "grad_norm": 3.1131203174591064,
+ "learning_rate": 3.7858420500638236e-06,
+ "loss": 0.5118,
+ "step": 4230
+ },
+ {
+ "epoch": 2.000472813238771,
+ "grad_norm": 2.2747561931610107,
+ "learning_rate": 3.785307023341876e-06,
+ "loss": 0.4166,
+ "step": 4231
+ },
+ {
+ "epoch": 2.000945626477541,
+ "grad_norm": 2.4347424507141113,
+ "learning_rate": 3.7847719165894963e-06,
+ "loss": 0.4161,
+ "step": 4232
+ },
+ {
+ "epoch": 2.001418439716312,
+ "grad_norm": 2.398805618286133,
+ "learning_rate": 3.784236729840003e-06,
+ "loss": 0.4652,
+ "step": 4233
+ },
+ {
+ "epoch": 2.0018912529550827,
+ "grad_norm": 2.1904916763305664,
+ "learning_rate": 3.783701463126719e-06,
+ "loss": 0.4554,
+ "step": 4234
+ },
+ {
+ "epoch": 2.0023640661938535,
+ "grad_norm": 2.237330913543701,
+ "learning_rate": 3.7831661164829735e-06,
+ "loss": 0.4471,
+ "step": 4235
+ },
+ {
+ "epoch": 2.002836879432624,
+ "grad_norm": 2.3656628131866455,
+ "learning_rate": 3.7826306899421016e-06,
+ "loss": 0.4052,
+ "step": 4236
+ },
+ {
+ "epoch": 2.0033096926713947,
+ "grad_norm": 2.615489959716797,
+ "learning_rate": 3.7820951835374405e-06,
+ "loss": 0.4847,
+ "step": 4237
+ },
+ {
+ "epoch": 2.0037825059101655,
+ "grad_norm": 2.453036308288574,
+ "learning_rate": 3.7815595973023347e-06,
+ "loss": 0.4672,
+ "step": 4238
+ },
+ {
+ "epoch": 2.0042553191489363,
+ "grad_norm": 2.537468671798706,
+ "learning_rate": 3.7810239312701306e-06,
+ "loss": 0.467,
+ "step": 4239
+ },
+ {
+ "epoch": 2.0047281323877066,
+ "grad_norm": 2.3321666717529297,
+ "learning_rate": 3.780488185474184e-06,
+ "loss": 0.3557,
+ "step": 4240
+ },
+ {
+ "epoch": 2.0052009456264774,
+ "grad_norm": 2.9051828384399414,
+ "learning_rate": 3.779952359947854e-06,
+ "loss": 0.5474,
+ "step": 4241
+ },
+ {
+ "epoch": 2.0056737588652482,
+ "grad_norm": 2.7458817958831787,
+ "learning_rate": 3.7794164547245015e-06,
+ "loss": 0.4659,
+ "step": 4242
+ },
+ {
+ "epoch": 2.006146572104019,
+ "grad_norm": 2.627046585083008,
+ "learning_rate": 3.778880469837497e-06,
+ "loss": 0.4179,
+ "step": 4243
+ },
+ {
+ "epoch": 2.0066193853427894,
+ "grad_norm": 2.4186174869537354,
+ "learning_rate": 3.7783444053202135e-06,
+ "loss": 0.3976,
+ "step": 4244
+ },
+ {
+ "epoch": 2.00709219858156,
+ "grad_norm": 3.109376907348633,
+ "learning_rate": 3.7778082612060296e-06,
+ "loss": 0.4095,
+ "step": 4245
+ },
+ {
+ "epoch": 2.007565011820331,
+ "grad_norm": 2.583376169204712,
+ "learning_rate": 3.7772720375283282e-06,
+ "loss": 0.4325,
+ "step": 4246
+ },
+ {
+ "epoch": 2.0080378250591018,
+ "grad_norm": 2.6199896335601807,
+ "learning_rate": 3.776735734320497e-06,
+ "loss": 0.4207,
+ "step": 4247
+ },
+ {
+ "epoch": 2.008510638297872,
+ "grad_norm": 2.545353651046753,
+ "learning_rate": 3.77619935161593e-06,
+ "loss": 0.4483,
+ "step": 4248
+ },
+ {
+ "epoch": 2.008983451536643,
+ "grad_norm": 2.770266056060791,
+ "learning_rate": 3.7756628894480263e-06,
+ "loss": 0.457,
+ "step": 4249
+ },
+ {
+ "epoch": 2.0094562647754137,
+ "grad_norm": 2.903254985809326,
+ "learning_rate": 3.7751263478501878e-06,
+ "loss": 0.4171,
+ "step": 4250
+ },
+ {
+ "epoch": 2.0099290780141845,
+ "grad_norm": 2.5576963424682617,
+ "learning_rate": 3.774589726855822e-06,
+ "loss": 0.3631,
+ "step": 4251
+ },
+ {
+ "epoch": 2.010401891252955,
+ "grad_norm": 3.7584285736083984,
+ "learning_rate": 3.7740530264983434e-06,
+ "loss": 0.4827,
+ "step": 4252
+ },
+ {
+ "epoch": 2.0108747044917257,
+ "grad_norm": 3.3116581439971924,
+ "learning_rate": 3.77351624681117e-06,
+ "loss": 0.5071,
+ "step": 4253
+ },
+ {
+ "epoch": 2.0113475177304965,
+ "grad_norm": 3.1370885372161865,
+ "learning_rate": 3.772979387827723e-06,
+ "loss": 0.4963,
+ "step": 4254
+ },
+ {
+ "epoch": 2.0118203309692673,
+ "grad_norm": 2.4832639694213867,
+ "learning_rate": 3.772442449581432e-06,
+ "loss": 0.4442,
+ "step": 4255
+ },
+ {
+ "epoch": 2.0122931442080376,
+ "grad_norm": 2.7645785808563232,
+ "learning_rate": 3.7719054321057293e-06,
+ "loss": 0.4572,
+ "step": 4256
+ },
+ {
+ "epoch": 2.0127659574468084,
+ "grad_norm": 2.7962236404418945,
+ "learning_rate": 3.7713683354340515e-06,
+ "loss": 0.4906,
+ "step": 4257
+ },
+ {
+ "epoch": 2.013238770685579,
+ "grad_norm": 2.647991895675659,
+ "learning_rate": 3.7708311595998425e-06,
+ "loss": 0.4027,
+ "step": 4258
+ },
+ {
+ "epoch": 2.01371158392435,
+ "grad_norm": 2.3780267238616943,
+ "learning_rate": 3.7702939046365504e-06,
+ "loss": 0.4285,
+ "step": 4259
+ },
+ {
+ "epoch": 2.0141843971631204,
+ "grad_norm": 2.5185933113098145,
+ "learning_rate": 3.7697565705776266e-06,
+ "loss": 0.4834,
+ "step": 4260
+ },
+ {
+ "epoch": 2.014657210401891,
+ "grad_norm": 2.432507276535034,
+ "learning_rate": 3.7692191574565294e-06,
+ "loss": 0.3695,
+ "step": 4261
+ },
+ {
+ "epoch": 2.015130023640662,
+ "grad_norm": 2.8010706901550293,
+ "learning_rate": 3.76868166530672e-06,
+ "loss": 0.478,
+ "step": 4262
+ },
+ {
+ "epoch": 2.0156028368794328,
+ "grad_norm": 2.32817006111145,
+ "learning_rate": 3.768144094161666e-06,
+ "loss": 0.4154,
+ "step": 4263
+ },
+ {
+ "epoch": 2.016075650118203,
+ "grad_norm": 3.062812328338623,
+ "learning_rate": 3.7676064440548405e-06,
+ "loss": 0.5015,
+ "step": 4264
+ },
+ {
+ "epoch": 2.016548463356974,
+ "grad_norm": 2.6129536628723145,
+ "learning_rate": 3.7670687150197194e-06,
+ "loss": 0.3843,
+ "step": 4265
+ },
+ {
+ "epoch": 2.0170212765957447,
+ "grad_norm": 2.838259696960449,
+ "learning_rate": 3.766530907089786e-06,
+ "loss": 0.4937,
+ "step": 4266
+ },
+ {
+ "epoch": 2.0174940898345155,
+ "grad_norm": 2.601203680038452,
+ "learning_rate": 3.7659930202985263e-06,
+ "loss": 0.4644,
+ "step": 4267
+ },
+ {
+ "epoch": 2.017966903073286,
+ "grad_norm": 2.5964133739471436,
+ "learning_rate": 3.7654550546794322e-06,
+ "loss": 0.4365,
+ "step": 4268
+ },
+ {
+ "epoch": 2.0184397163120567,
+ "grad_norm": 3.0028915405273438,
+ "learning_rate": 3.764917010266001e-06,
+ "loss": 0.434,
+ "step": 4269
+ },
+ {
+ "epoch": 2.0189125295508275,
+ "grad_norm": 2.719252586364746,
+ "learning_rate": 3.764378887091734e-06,
+ "loss": 0.4401,
+ "step": 4270
+ },
+ {
+ "epoch": 2.0193853427895982,
+ "grad_norm": 2.400254011154175,
+ "learning_rate": 3.7638406851901377e-06,
+ "loss": 0.4904,
+ "step": 4271
+ },
+ {
+ "epoch": 2.0198581560283686,
+ "grad_norm": 2.8015363216400146,
+ "learning_rate": 3.763302404594724e-06,
+ "loss": 0.4569,
+ "step": 4272
+ },
+ {
+ "epoch": 2.0203309692671394,
+ "grad_norm": 2.718416452407837,
+ "learning_rate": 3.762764045339009e-06,
+ "loss": 0.5124,
+ "step": 4273
+ },
+ {
+ "epoch": 2.02080378250591,
+ "grad_norm": 2.484049081802368,
+ "learning_rate": 3.762225607456514e-06,
+ "loss": 0.4255,
+ "step": 4274
+ },
+ {
+ "epoch": 2.021276595744681,
+ "grad_norm": 2.6377930641174316,
+ "learning_rate": 3.7616870909807645e-06,
+ "loss": 0.5044,
+ "step": 4275
+ },
+ {
+ "epoch": 2.0217494089834513,
+ "grad_norm": 2.8845038414001465,
+ "learning_rate": 3.7611484959452927e-06,
+ "loss": 0.4924,
+ "step": 4276
+ },
+ {
+ "epoch": 2.022222222222222,
+ "grad_norm": 2.5939974784851074,
+ "learning_rate": 3.7606098223836342e-06,
+ "loss": 0.4873,
+ "step": 4277
+ },
+ {
+ "epoch": 2.022695035460993,
+ "grad_norm": 2.499826431274414,
+ "learning_rate": 3.76007107032933e-06,
+ "loss": 0.4515,
+ "step": 4278
+ },
+ {
+ "epoch": 2.0231678486997637,
+ "grad_norm": 3.0318663120269775,
+ "learning_rate": 3.759532239815924e-06,
+ "loss": 0.4901,
+ "step": 4279
+ },
+ {
+ "epoch": 2.023640661938534,
+ "grad_norm": 2.857977867126465,
+ "learning_rate": 3.758993330876969e-06,
+ "loss": 0.4659,
+ "step": 4280
+ },
+ {
+ "epoch": 2.024113475177305,
+ "grad_norm": 2.47918438911438,
+ "learning_rate": 3.7584543435460196e-06,
+ "loss": 0.4323,
+ "step": 4281
+ },
+ {
+ "epoch": 2.0245862884160757,
+ "grad_norm": 2.6033785343170166,
+ "learning_rate": 3.757915277856637e-06,
+ "loss": 0.4437,
+ "step": 4282
+ },
+ {
+ "epoch": 2.0250591016548465,
+ "grad_norm": 2.799781322479248,
+ "learning_rate": 3.757376133842386e-06,
+ "loss": 0.4523,
+ "step": 4283
+ },
+ {
+ "epoch": 2.025531914893617,
+ "grad_norm": 2.6092529296875,
+ "learning_rate": 3.756836911536836e-06,
+ "loss": 0.3898,
+ "step": 4284
+ },
+ {
+ "epoch": 2.0260047281323876,
+ "grad_norm": 2.66229248046875,
+ "learning_rate": 3.7562976109735627e-06,
+ "loss": 0.4731,
+ "step": 4285
+ },
+ {
+ "epoch": 2.0264775413711584,
+ "grad_norm": 2.90142822265625,
+ "learning_rate": 3.7557582321861463e-06,
+ "loss": 0.4285,
+ "step": 4286
+ },
+ {
+ "epoch": 2.0269503546099292,
+ "grad_norm": 2.5138802528381348,
+ "learning_rate": 3.7552187752081707e-06,
+ "loss": 0.4467,
+ "step": 4287
+ },
+ {
+ "epoch": 2.0274231678486996,
+ "grad_norm": 3.0656235218048096,
+ "learning_rate": 3.754679240073226e-06,
+ "loss": 0.4718,
+ "step": 4288
+ },
+ {
+ "epoch": 2.0278959810874704,
+ "grad_norm": 2.9633383750915527,
+ "learning_rate": 3.754139626814907e-06,
+ "loss": 0.4741,
+ "step": 4289
+ },
+ {
+ "epoch": 2.028368794326241,
+ "grad_norm": 2.5925145149230957,
+ "learning_rate": 3.753599935466812e-06,
+ "loss": 0.4281,
+ "step": 4290
+ },
+ {
+ "epoch": 2.028841607565012,
+ "grad_norm": 2.837740659713745,
+ "learning_rate": 3.7530601660625456e-06,
+ "loss": 0.4757,
+ "step": 4291
+ },
+ {
+ "epoch": 2.0293144208037823,
+ "grad_norm": 2.3995790481567383,
+ "learning_rate": 3.752520318635718e-06,
+ "loss": 0.4148,
+ "step": 4292
+ },
+ {
+ "epoch": 2.029787234042553,
+ "grad_norm": 2.572601795196533,
+ "learning_rate": 3.7519803932199424e-06,
+ "loss": 0.4051,
+ "step": 4293
+ },
+ {
+ "epoch": 2.030260047281324,
+ "grad_norm": 2.6780295372009277,
+ "learning_rate": 3.751440389848837e-06,
+ "loss": 0.4626,
+ "step": 4294
+ },
+ {
+ "epoch": 2.0307328605200947,
+ "grad_norm": 2.8666839599609375,
+ "learning_rate": 3.7509003085560257e-06,
+ "loss": 0.4255,
+ "step": 4295
+ },
+ {
+ "epoch": 2.031205673758865,
+ "grad_norm": 2.4398207664489746,
+ "learning_rate": 3.750360149375138e-06,
+ "loss": 0.4235,
+ "step": 4296
+ },
+ {
+ "epoch": 2.031678486997636,
+ "grad_norm": 2.436840534210205,
+ "learning_rate": 3.7498199123398062e-06,
+ "loss": 0.3907,
+ "step": 4297
+ },
+ {
+ "epoch": 2.0321513002364067,
+ "grad_norm": 3.3945820331573486,
+ "learning_rate": 3.7492795974836683e-06,
+ "loss": 0.465,
+ "step": 4298
+ },
+ {
+ "epoch": 2.0326241134751775,
+ "grad_norm": 2.6693103313446045,
+ "learning_rate": 3.7487392048403678e-06,
+ "loss": 0.4948,
+ "step": 4299
+ },
+ {
+ "epoch": 2.033096926713948,
+ "grad_norm": 2.7642734050750732,
+ "learning_rate": 3.748198734443553e-06,
+ "loss": 0.4538,
+ "step": 4300
+ },
+ {
+ "epoch": 2.0335697399527186,
+ "grad_norm": 3.1436543464660645,
+ "learning_rate": 3.747658186326876e-06,
+ "loss": 0.5137,
+ "step": 4301
+ },
+ {
+ "epoch": 2.0340425531914894,
+ "grad_norm": 3.482678174972534,
+ "learning_rate": 3.7471175605239947e-06,
+ "loss": 0.4982,
+ "step": 4302
+ },
+ {
+ "epoch": 2.03451536643026,
+ "grad_norm": 2.712557077407837,
+ "learning_rate": 3.746576857068571e-06,
+ "loss": 0.4459,
+ "step": 4303
+ },
+ {
+ "epoch": 2.0349881796690306,
+ "grad_norm": 3.147440195083618,
+ "learning_rate": 3.7460360759942726e-06,
+ "loss": 0.5063,
+ "step": 4304
+ },
+ {
+ "epoch": 2.0354609929078014,
+ "grad_norm": 2.840672492980957,
+ "learning_rate": 3.7454952173347714e-06,
+ "loss": 0.5041,
+ "step": 4305
+ },
+ {
+ "epoch": 2.035933806146572,
+ "grad_norm": 2.584122657775879,
+ "learning_rate": 3.744954281123745e-06,
+ "loss": 0.4487,
+ "step": 4306
+ },
+ {
+ "epoch": 2.036406619385343,
+ "grad_norm": 2.9869542121887207,
+ "learning_rate": 3.7444132673948737e-06,
+ "loss": 0.479,
+ "step": 4307
+ },
+ {
+ "epoch": 2.0368794326241133,
+ "grad_norm": 2.478459358215332,
+ "learning_rate": 3.7438721761818446e-06,
+ "loss": 0.4636,
+ "step": 4308
+ },
+ {
+ "epoch": 2.037352245862884,
+ "grad_norm": 2.5524215698242188,
+ "learning_rate": 3.7433310075183504e-06,
+ "loss": 0.4601,
+ "step": 4309
+ },
+ {
+ "epoch": 2.037825059101655,
+ "grad_norm": 2.3709988594055176,
+ "learning_rate": 3.742789761438086e-06,
+ "loss": 0.4163,
+ "step": 4310
+ },
+ {
+ "epoch": 2.0382978723404257,
+ "grad_norm": 3.140355348587036,
+ "learning_rate": 3.742248437974752e-06,
+ "loss": 0.4433,
+ "step": 4311
+ },
+ {
+ "epoch": 2.038770685579196,
+ "grad_norm": 2.940948486328125,
+ "learning_rate": 3.741707037162055e-06,
+ "loss": 0.4299,
+ "step": 4312
+ },
+ {
+ "epoch": 2.039243498817967,
+ "grad_norm": 3.009157419204712,
+ "learning_rate": 3.7411655590337055e-06,
+ "loss": 0.463,
+ "step": 4313
+ },
+ {
+ "epoch": 2.0397163120567376,
+ "grad_norm": 2.672945737838745,
+ "learning_rate": 3.7406240036234185e-06,
+ "loss": 0.4696,
+ "step": 4314
+ },
+ {
+ "epoch": 2.0401891252955084,
+ "grad_norm": 2.745962142944336,
+ "learning_rate": 3.740082370964916e-06,
+ "loss": 0.4931,
+ "step": 4315
+ },
+ {
+ "epoch": 2.040661938534279,
+ "grad_norm": 2.3939316272735596,
+ "learning_rate": 3.7395406610919217e-06,
+ "loss": 0.4396,
+ "step": 4316
+ },
+ {
+ "epoch": 2.0411347517730496,
+ "grad_norm": 2.4364447593688965,
+ "learning_rate": 3.738998874038165e-06,
+ "loss": 0.4807,
+ "step": 4317
+ },
+ {
+ "epoch": 2.0416075650118204,
+ "grad_norm": 2.360489845275879,
+ "learning_rate": 3.738457009837381e-06,
+ "loss": 0.4426,
+ "step": 4318
+ },
+ {
+ "epoch": 2.042080378250591,
+ "grad_norm": 2.5494935512542725,
+ "learning_rate": 3.7379150685233108e-06,
+ "loss": 0.4189,
+ "step": 4319
+ },
+ {
+ "epoch": 2.0425531914893615,
+ "grad_norm": 2.635472059249878,
+ "learning_rate": 3.7373730501296963e-06,
+ "loss": 0.5014,
+ "step": 4320
+ },
+ {
+ "epoch": 2.0430260047281323,
+ "grad_norm": 2.4982943534851074,
+ "learning_rate": 3.7368309546902876e-06,
+ "loss": 0.4658,
+ "step": 4321
+ },
+ {
+ "epoch": 2.043498817966903,
+ "grad_norm": 2.692742109298706,
+ "learning_rate": 3.736288782238839e-06,
+ "loss": 0.4454,
+ "step": 4322
+ },
+ {
+ "epoch": 2.043971631205674,
+ "grad_norm": 2.6774091720581055,
+ "learning_rate": 3.7357465328091086e-06,
+ "loss": 0.5002,
+ "step": 4323
+ },
+ {
+ "epoch": 2.0444444444444443,
+ "grad_norm": 2.695138692855835,
+ "learning_rate": 3.735204206434861e-06,
+ "loss": 0.448,
+ "step": 4324
+ },
+ {
+ "epoch": 2.044917257683215,
+ "grad_norm": 2.5383570194244385,
+ "learning_rate": 3.7346618031498635e-06,
+ "loss": 0.4352,
+ "step": 4325
+ },
+ {
+ "epoch": 2.045390070921986,
+ "grad_norm": 2.267277240753174,
+ "learning_rate": 3.7341193229878886e-06,
+ "loss": 0.4162,
+ "step": 4326
+ },
+ {
+ "epoch": 2.0458628841607567,
+ "grad_norm": 2.6037328243255615,
+ "learning_rate": 3.733576765982715e-06,
+ "loss": 0.4471,
+ "step": 4327
+ },
+ {
+ "epoch": 2.046335697399527,
+ "grad_norm": 3.261385440826416,
+ "learning_rate": 3.7330341321681253e-06,
+ "loss": 0.4618,
+ "step": 4328
+ },
+ {
+ "epoch": 2.046808510638298,
+ "grad_norm": 2.440650463104248,
+ "learning_rate": 3.7324914215779072e-06,
+ "loss": 0.4476,
+ "step": 4329
+ },
+ {
+ "epoch": 2.0472813238770686,
+ "grad_norm": 2.5940682888031006,
+ "learning_rate": 3.731948634245853e-06,
+ "loss": 0.4389,
+ "step": 4330
+ },
+ {
+ "epoch": 2.0477541371158394,
+ "grad_norm": 2.7428150177001953,
+ "learning_rate": 3.7314057702057582e-06,
+ "loss": 0.4477,
+ "step": 4331
+ },
+ {
+ "epoch": 2.0482269503546098,
+ "grad_norm": 2.3546223640441895,
+ "learning_rate": 3.730862829491427e-06,
+ "loss": 0.4047,
+ "step": 4332
+ },
+ {
+ "epoch": 2.0486997635933806,
+ "grad_norm": 2.552422523498535,
+ "learning_rate": 3.7303198121366637e-06,
+ "loss": 0.4438,
+ "step": 4333
+ },
+ {
+ "epoch": 2.0491725768321514,
+ "grad_norm": 2.99226713180542,
+ "learning_rate": 3.729776718175281e-06,
+ "loss": 0.491,
+ "step": 4334
+ },
+ {
+ "epoch": 2.049645390070922,
+ "grad_norm": 3.2003321647644043,
+ "learning_rate": 3.7292335476410935e-06,
+ "loss": 0.5458,
+ "step": 4335
+ },
+ {
+ "epoch": 2.0501182033096925,
+ "grad_norm": 2.739847183227539,
+ "learning_rate": 3.7286903005679237e-06,
+ "loss": 0.4499,
+ "step": 4336
+ },
+ {
+ "epoch": 2.0505910165484633,
+ "grad_norm": 2.5917470455169678,
+ "learning_rate": 3.7281469769895963e-06,
+ "loss": 0.4714,
+ "step": 4337
+ },
+ {
+ "epoch": 2.051063829787234,
+ "grad_norm": 2.8029327392578125,
+ "learning_rate": 3.7276035769399422e-06,
+ "loss": 0.42,
+ "step": 4338
+ },
+ {
+ "epoch": 2.051536643026005,
+ "grad_norm": 2.484879493713379,
+ "learning_rate": 3.727060100452796e-06,
+ "loss": 0.4163,
+ "step": 4339
+ },
+ {
+ "epoch": 2.0520094562647753,
+ "grad_norm": 2.7126030921936035,
+ "learning_rate": 3.7265165475619973e-06,
+ "loss": 0.4112,
+ "step": 4340
+ },
+ {
+ "epoch": 2.052482269503546,
+ "grad_norm": 2.618267774581909,
+ "learning_rate": 3.7259729183013927e-06,
+ "loss": 0.4281,
+ "step": 4341
+ },
+ {
+ "epoch": 2.052955082742317,
+ "grad_norm": 2.703270673751831,
+ "learning_rate": 3.7254292127048293e-06,
+ "loss": 0.4437,
+ "step": 4342
+ },
+ {
+ "epoch": 2.0534278959810877,
+ "grad_norm": 2.429150104522705,
+ "learning_rate": 3.7248854308061623e-06,
+ "loss": 0.3971,
+ "step": 4343
+ },
+ {
+ "epoch": 2.053900709219858,
+ "grad_norm": 2.54354190826416,
+ "learning_rate": 3.7243415726392508e-06,
+ "loss": 0.4485,
+ "step": 4344
+ },
+ {
+ "epoch": 2.054373522458629,
+ "grad_norm": 2.9515016078948975,
+ "learning_rate": 3.723797638237957e-06,
+ "loss": 0.4386,
+ "step": 4345
+ },
+ {
+ "epoch": 2.0548463356973996,
+ "grad_norm": 2.9129958152770996,
+ "learning_rate": 3.7232536276361514e-06,
+ "loss": 0.4595,
+ "step": 4346
+ },
+ {
+ "epoch": 2.0553191489361704,
+ "grad_norm": 2.5397512912750244,
+ "learning_rate": 3.722709540867706e-06,
+ "loss": 0.3681,
+ "step": 4347
+ },
+ {
+ "epoch": 2.0557919621749408,
+ "grad_norm": 2.79884672164917,
+ "learning_rate": 3.722165377966499e-06,
+ "loss": 0.4576,
+ "step": 4348
+ },
+ {
+ "epoch": 2.0562647754137116,
+ "grad_norm": 2.669936180114746,
+ "learning_rate": 3.7216211389664137e-06,
+ "loss": 0.3692,
+ "step": 4349
+ },
+ {
+ "epoch": 2.0567375886524824,
+ "grad_norm": 2.512326240539551,
+ "learning_rate": 3.7210768239013355e-06,
+ "loss": 0.4554,
+ "step": 4350
+ },
+ {
+ "epoch": 2.057210401891253,
+ "grad_norm": 2.913693904876709,
+ "learning_rate": 3.7205324328051583e-06,
+ "loss": 0.5282,
+ "step": 4351
+ },
+ {
+ "epoch": 2.0576832151300235,
+ "grad_norm": 3.040891170501709,
+ "learning_rate": 3.719987965711778e-06,
+ "loss": 0.4778,
+ "step": 4352
+ },
+ {
+ "epoch": 2.0581560283687943,
+ "grad_norm": 2.7504117488861084,
+ "learning_rate": 3.7194434226550966e-06,
+ "loss": 0.4217,
+ "step": 4353
+ },
+ {
+ "epoch": 2.058628841607565,
+ "grad_norm": 2.5522971153259277,
+ "learning_rate": 3.718898803669021e-06,
+ "loss": 0.437,
+ "step": 4354
+ },
+ {
+ "epoch": 2.059101654846336,
+ "grad_norm": 2.8531908988952637,
+ "learning_rate": 3.718354108787461e-06,
+ "loss": 0.4251,
+ "step": 4355
+ },
+ {
+ "epoch": 2.0595744680851062,
+ "grad_norm": 2.5812065601348877,
+ "learning_rate": 3.7178093380443337e-06,
+ "loss": 0.4374,
+ "step": 4356
+ },
+ {
+ "epoch": 2.060047281323877,
+ "grad_norm": 2.627871513366699,
+ "learning_rate": 3.7172644914735583e-06,
+ "loss": 0.436,
+ "step": 4357
+ },
+ {
+ "epoch": 2.060520094562648,
+ "grad_norm": 2.7146239280700684,
+ "learning_rate": 3.7167195691090607e-06,
+ "loss": 0.4204,
+ "step": 4358
+ },
+ {
+ "epoch": 2.0609929078014186,
+ "grad_norm": 2.486483573913574,
+ "learning_rate": 3.7161745709847706e-06,
+ "loss": 0.4015,
+ "step": 4359
+ },
+ {
+ "epoch": 2.061465721040189,
+ "grad_norm": 2.866049289703369,
+ "learning_rate": 3.7156294971346226e-06,
+ "loss": 0.4087,
+ "step": 4360
+ },
+ {
+ "epoch": 2.06193853427896,
+ "grad_norm": 2.9345552921295166,
+ "learning_rate": 3.715084347592556e-06,
+ "loss": 0.5074,
+ "step": 4361
+ },
+ {
+ "epoch": 2.0624113475177306,
+ "grad_norm": 2.502455711364746,
+ "learning_rate": 3.7145391223925155e-06,
+ "loss": 0.469,
+ "step": 4362
+ },
+ {
+ "epoch": 2.0628841607565014,
+ "grad_norm": 2.6419875621795654,
+ "learning_rate": 3.713993821568449e-06,
+ "loss": 0.4493,
+ "step": 4363
+ },
+ {
+ "epoch": 2.0633569739952717,
+ "grad_norm": 3.812079429626465,
+ "learning_rate": 3.7134484451543114e-06,
+ "loss": 0.4764,
+ "step": 4364
+ },
+ {
+ "epoch": 2.0638297872340425,
+ "grad_norm": 2.581780195236206,
+ "learning_rate": 3.712902993184059e-06,
+ "loss": 0.3994,
+ "step": 4365
+ },
+ {
+ "epoch": 2.0643026004728133,
+ "grad_norm": 2.282508134841919,
+ "learning_rate": 3.712357465691656e-06,
+ "loss": 0.4252,
+ "step": 4366
+ },
+ {
+ "epoch": 2.064775413711584,
+ "grad_norm": 2.4727818965911865,
+ "learning_rate": 3.71181186271107e-06,
+ "loss": 0.4558,
+ "step": 4367
+ },
+ {
+ "epoch": 2.0652482269503545,
+ "grad_norm": 2.7661173343658447,
+ "learning_rate": 3.711266184276272e-06,
+ "loss": 0.505,
+ "step": 4368
+ },
+ {
+ "epoch": 2.0657210401891253,
+ "grad_norm": 2.6264543533325195,
+ "learning_rate": 3.71072043042124e-06,
+ "loss": 0.4297,
+ "step": 4369
+ },
+ {
+ "epoch": 2.066193853427896,
+ "grad_norm": 2.773699998855591,
+ "learning_rate": 3.7101746011799565e-06,
+ "loss": 0.4267,
+ "step": 4370
+ },
+ {
+ "epoch": 2.066666666666667,
+ "grad_norm": 2.686955213546753,
+ "learning_rate": 3.709628696586407e-06,
+ "loss": 0.4099,
+ "step": 4371
+ },
+ {
+ "epoch": 2.0671394799054372,
+ "grad_norm": 2.6066620349884033,
+ "learning_rate": 3.709082716674582e-06,
+ "loss": 0.4146,
+ "step": 4372
+ },
+ {
+ "epoch": 2.067612293144208,
+ "grad_norm": 2.7769250869750977,
+ "learning_rate": 3.7085366614784784e-06,
+ "loss": 0.4047,
+ "step": 4373
+ },
+ {
+ "epoch": 2.068085106382979,
+ "grad_norm": 2.4986939430236816,
+ "learning_rate": 3.7079905310320957e-06,
+ "loss": 0.4021,
+ "step": 4374
+ },
+ {
+ "epoch": 2.0685579196217496,
+ "grad_norm": 2.5456206798553467,
+ "learning_rate": 3.7074443253694402e-06,
+ "loss": 0.3569,
+ "step": 4375
+ },
+ {
+ "epoch": 2.06903073286052,
+ "grad_norm": 2.4079296588897705,
+ "learning_rate": 3.70689804452452e-06,
+ "loss": 0.4308,
+ "step": 4376
+ },
+ {
+ "epoch": 2.0695035460992908,
+ "grad_norm": 2.86014723777771,
+ "learning_rate": 3.7063516885313513e-06,
+ "loss": 0.4577,
+ "step": 4377
+ },
+ {
+ "epoch": 2.0699763593380616,
+ "grad_norm": 2.8025779724121094,
+ "learning_rate": 3.7058052574239523e-06,
+ "loss": 0.4615,
+ "step": 4378
+ },
+ {
+ "epoch": 2.0704491725768324,
+ "grad_norm": 2.902676820755005,
+ "learning_rate": 3.7052587512363475e-06,
+ "loss": 0.4765,
+ "step": 4379
+ },
+ {
+ "epoch": 2.0709219858156027,
+ "grad_norm": 2.814509391784668,
+ "learning_rate": 3.704712170002566e-06,
+ "loss": 0.434,
+ "step": 4380
+ },
+ {
+ "epoch": 2.0713947990543735,
+ "grad_norm": 2.7923502922058105,
+ "learning_rate": 3.704165513756639e-06,
+ "loss": 0.4626,
+ "step": 4381
+ },
+ {
+ "epoch": 2.0718676122931443,
+ "grad_norm": 2.6802031993865967,
+ "learning_rate": 3.703618782532606e-06,
+ "loss": 0.4835,
+ "step": 4382
+ },
+ {
+ "epoch": 2.072340425531915,
+ "grad_norm": 3.0963687896728516,
+ "learning_rate": 3.7030719763645085e-06,
+ "loss": 0.4813,
+ "step": 4383
+ },
+ {
+ "epoch": 2.0728132387706855,
+ "grad_norm": 2.5658695697784424,
+ "learning_rate": 3.7025250952863956e-06,
+ "loss": 0.4428,
+ "step": 4384
+ },
+ {
+ "epoch": 2.0732860520094563,
+ "grad_norm": 2.7738289833068848,
+ "learning_rate": 3.7019781393323167e-06,
+ "loss": 0.4376,
+ "step": 4385
+ },
+ {
+ "epoch": 2.073758865248227,
+ "grad_norm": 2.6446938514709473,
+ "learning_rate": 3.7014311085363303e-06,
+ "loss": 0.4208,
+ "step": 4386
+ },
+ {
+ "epoch": 2.0742316784869974,
+ "grad_norm": 2.7556118965148926,
+ "learning_rate": 3.7008840029324967e-06,
+ "loss": 0.3831,
+ "step": 4387
+ },
+ {
+ "epoch": 2.074704491725768,
+ "grad_norm": 2.573141574859619,
+ "learning_rate": 3.700336822554882e-06,
+ "loss": 0.4396,
+ "step": 4388
+ },
+ {
+ "epoch": 2.075177304964539,
+ "grad_norm": 2.762319803237915,
+ "learning_rate": 3.6997895674375566e-06,
+ "loss": 0.4579,
+ "step": 4389
+ },
+ {
+ "epoch": 2.07565011820331,
+ "grad_norm": 2.729780435562134,
+ "learning_rate": 3.699242237614596e-06,
+ "loss": 0.4262,
+ "step": 4390
+ },
+ {
+ "epoch": 2.0761229314420806,
+ "grad_norm": 2.657480001449585,
+ "learning_rate": 3.698694833120079e-06,
+ "loss": 0.4176,
+ "step": 4391
+ },
+ {
+ "epoch": 2.076595744680851,
+ "grad_norm": 2.8433303833007812,
+ "learning_rate": 3.6981473539880914e-06,
+ "loss": 0.457,
+ "step": 4392
+ },
+ {
+ "epoch": 2.0770685579196217,
+ "grad_norm": 2.819047212600708,
+ "learning_rate": 3.6975998002527225e-06,
+ "loss": 0.4244,
+ "step": 4393
+ },
+ {
+ "epoch": 2.0775413711583925,
+ "grad_norm": 2.6565003395080566,
+ "learning_rate": 3.697052171948064e-06,
+ "loss": 0.4384,
+ "step": 4394
+ },
+ {
+ "epoch": 2.078014184397163,
+ "grad_norm": 2.5795063972473145,
+ "learning_rate": 3.696504469108216e-06,
+ "loss": 0.4958,
+ "step": 4395
+ },
+ {
+ "epoch": 2.0784869976359337,
+ "grad_norm": 2.455730676651001,
+ "learning_rate": 3.6959566917672822e-06,
+ "loss": 0.4191,
+ "step": 4396
+ },
+ {
+ "epoch": 2.0789598108747045,
+ "grad_norm": 2.6706607341766357,
+ "learning_rate": 3.6954088399593684e-06,
+ "loss": 0.4709,
+ "step": 4397
+ },
+ {
+ "epoch": 2.0794326241134753,
+ "grad_norm": 2.3758466243743896,
+ "learning_rate": 3.694860913718589e-06,
+ "loss": 0.4231,
+ "step": 4398
+ },
+ {
+ "epoch": 2.079905437352246,
+ "grad_norm": 2.3488340377807617,
+ "learning_rate": 3.6943129130790583e-06,
+ "loss": 0.4321,
+ "step": 4399
+ },
+ {
+ "epoch": 2.0803782505910164,
+ "grad_norm": 2.6438148021698,
+ "learning_rate": 3.6937648380748996e-06,
+ "loss": 0.4907,
+ "step": 4400
+ },
+ {
+ "epoch": 2.0808510638297872,
+ "grad_norm": 2.9826784133911133,
+ "learning_rate": 3.6932166887402395e-06,
+ "loss": 0.4404,
+ "step": 4401
+ },
+ {
+ "epoch": 2.081323877068558,
+ "grad_norm": 2.5203495025634766,
+ "learning_rate": 3.6926684651092076e-06,
+ "loss": 0.4337,
+ "step": 4402
+ },
+ {
+ "epoch": 2.0817966903073284,
+ "grad_norm": 2.7704148292541504,
+ "learning_rate": 3.692120167215941e-06,
+ "loss": 0.4195,
+ "step": 4403
+ },
+ {
+ "epoch": 2.082269503546099,
+ "grad_norm": 2.879430055618286,
+ "learning_rate": 3.6915717950945782e-06,
+ "loss": 0.4498,
+ "step": 4404
+ },
+ {
+ "epoch": 2.08274231678487,
+ "grad_norm": 2.7659497261047363,
+ "learning_rate": 3.6910233487792655e-06,
+ "loss": 0.4017,
+ "step": 4405
+ },
+ {
+ "epoch": 2.083215130023641,
+ "grad_norm": 3.4017205238342285,
+ "learning_rate": 3.6904748283041503e-06,
+ "loss": 0.4733,
+ "step": 4406
+ },
+ {
+ "epoch": 2.083687943262411,
+ "grad_norm": 2.706223249435425,
+ "learning_rate": 3.6899262337033887e-06,
+ "loss": 0.4926,
+ "step": 4407
+ },
+ {
+ "epoch": 2.084160756501182,
+ "grad_norm": 2.644932508468628,
+ "learning_rate": 3.6893775650111372e-06,
+ "loss": 0.3904,
+ "step": 4408
+ },
+ {
+ "epoch": 2.0846335697399527,
+ "grad_norm": 2.666585683822632,
+ "learning_rate": 3.6888288222615603e-06,
+ "loss": 0.4698,
+ "step": 4409
+ },
+ {
+ "epoch": 2.0851063829787235,
+ "grad_norm": 3.0058486461639404,
+ "learning_rate": 3.688280005488826e-06,
+ "loss": 0.5291,
+ "step": 4410
+ },
+ {
+ "epoch": 2.085579196217494,
+ "grad_norm": 2.533088445663452,
+ "learning_rate": 3.687731114727105e-06,
+ "loss": 0.393,
+ "step": 4411
+ },
+ {
+ "epoch": 2.0860520094562647,
+ "grad_norm": 2.921687364578247,
+ "learning_rate": 3.6871821500105763e-06,
+ "loss": 0.4719,
+ "step": 4412
+ },
+ {
+ "epoch": 2.0865248226950355,
+ "grad_norm": 2.291804313659668,
+ "learning_rate": 3.686633111373421e-06,
+ "loss": 0.4105,
+ "step": 4413
+ },
+ {
+ "epoch": 2.0869976359338063,
+ "grad_norm": 2.496333122253418,
+ "learning_rate": 3.6860839988498255e-06,
+ "loss": 0.4704,
+ "step": 4414
+ },
+ {
+ "epoch": 2.0874704491725766,
+ "grad_norm": 2.8059427738189697,
+ "learning_rate": 3.6855348124739787e-06,
+ "loss": 0.4961,
+ "step": 4415
+ },
+ {
+ "epoch": 2.0879432624113474,
+ "grad_norm": 2.683922290802002,
+ "learning_rate": 3.6849855522800795e-06,
+ "loss": 0.4838,
+ "step": 4416
+ },
+ {
+ "epoch": 2.088416075650118,
+ "grad_norm": 2.694148540496826,
+ "learning_rate": 3.684436218302324e-06,
+ "loss": 0.4812,
+ "step": 4417
+ },
+ {
+ "epoch": 2.088888888888889,
+ "grad_norm": 2.724531888961792,
+ "learning_rate": 3.683886810574919e-06,
+ "loss": 0.4495,
+ "step": 4418
+ },
+ {
+ "epoch": 2.0893617021276594,
+ "grad_norm": 2.6176564693450928,
+ "learning_rate": 3.6833373291320746e-06,
+ "loss": 0.4698,
+ "step": 4419
+ },
+ {
+ "epoch": 2.08983451536643,
+ "grad_norm": 2.534116268157959,
+ "learning_rate": 3.6827877740080032e-06,
+ "loss": 0.3912,
+ "step": 4420
+ },
+ {
+ "epoch": 2.090307328605201,
+ "grad_norm": 2.5747432708740234,
+ "learning_rate": 3.682238145236924e-06,
+ "loss": 0.4072,
+ "step": 4421
+ },
+ {
+ "epoch": 2.0907801418439718,
+ "grad_norm": 2.5947659015655518,
+ "learning_rate": 3.6816884428530588e-06,
+ "loss": 0.4638,
+ "step": 4422
+ },
+ {
+ "epoch": 2.091252955082742,
+ "grad_norm": 2.811992883682251,
+ "learning_rate": 3.6811386668906353e-06,
+ "loss": 0.4345,
+ "step": 4423
+ },
+ {
+ "epoch": 2.091725768321513,
+ "grad_norm": 2.7482287883758545,
+ "learning_rate": 3.680588817383886e-06,
+ "loss": 0.4541,
+ "step": 4424
+ },
+ {
+ "epoch": 2.0921985815602837,
+ "grad_norm": 2.987131357192993,
+ "learning_rate": 3.6800388943670484e-06,
+ "loss": 0.4571,
+ "step": 4425
+ },
+ {
+ "epoch": 2.0926713947990545,
+ "grad_norm": 3.1918671131134033,
+ "learning_rate": 3.6794888978743637e-06,
+ "loss": 0.5722,
+ "step": 4426
+ },
+ {
+ "epoch": 2.093144208037825,
+ "grad_norm": 2.5654571056365967,
+ "learning_rate": 3.678938827940076e-06,
+ "loss": 0.4686,
+ "step": 4427
+ },
+ {
+ "epoch": 2.0936170212765957,
+ "grad_norm": 2.942084789276123,
+ "learning_rate": 3.6783886845984383e-06,
+ "loss": 0.4512,
+ "step": 4428
+ },
+ {
+ "epoch": 2.0940898345153665,
+ "grad_norm": 2.74847674369812,
+ "learning_rate": 3.677838467883703e-06,
+ "loss": 0.4506,
+ "step": 4429
+ },
+ {
+ "epoch": 2.0945626477541373,
+ "grad_norm": 2.7569334506988525,
+ "learning_rate": 3.6772881778301322e-06,
+ "loss": 0.502,
+ "step": 4430
+ },
+ {
+ "epoch": 2.0950354609929076,
+ "grad_norm": 2.969966173171997,
+ "learning_rate": 3.6767378144719884e-06,
+ "loss": 0.4772,
+ "step": 4431
+ },
+ {
+ "epoch": 2.0955082742316784,
+ "grad_norm": 2.773524522781372,
+ "learning_rate": 3.67618737784354e-06,
+ "loss": 0.5183,
+ "step": 4432
+ },
+ {
+ "epoch": 2.095981087470449,
+ "grad_norm": 2.6760106086730957,
+ "learning_rate": 3.6756368679790617e-06,
+ "loss": 0.4787,
+ "step": 4433
+ },
+ {
+ "epoch": 2.09645390070922,
+ "grad_norm": 2.8758978843688965,
+ "learning_rate": 3.6750862849128304e-06,
+ "loss": 0.4275,
+ "step": 4434
+ },
+ {
+ "epoch": 2.0969267139479904,
+ "grad_norm": 2.670509099960327,
+ "learning_rate": 3.6745356286791288e-06,
+ "loss": 0.4401,
+ "step": 4435
+ },
+ {
+ "epoch": 2.097399527186761,
+ "grad_norm": 2.8453969955444336,
+ "learning_rate": 3.673984899312244e-06,
+ "loss": 0.4303,
+ "step": 4436
+ },
+ {
+ "epoch": 2.097872340425532,
+ "grad_norm": 2.6212339401245117,
+ "learning_rate": 3.673434096846468e-06,
+ "loss": 0.4675,
+ "step": 4437
+ },
+ {
+ "epoch": 2.0983451536643027,
+ "grad_norm": 2.8211941719055176,
+ "learning_rate": 3.672883221316095e-06,
+ "loss": 0.4678,
+ "step": 4438
+ },
+ {
+ "epoch": 2.098817966903073,
+ "grad_norm": 2.4838058948516846,
+ "learning_rate": 3.672332272755427e-06,
+ "loss": 0.4128,
+ "step": 4439
+ },
+ {
+ "epoch": 2.099290780141844,
+ "grad_norm": 2.596660852432251,
+ "learning_rate": 3.671781251198769e-06,
+ "loss": 0.423,
+ "step": 4440
+ },
+ {
+ "epoch": 2.0997635933806147,
+ "grad_norm": 2.9979989528656006,
+ "learning_rate": 3.67123015668043e-06,
+ "loss": 0.4493,
+ "step": 4441
+ },
+ {
+ "epoch": 2.1002364066193855,
+ "grad_norm": 2.6232850551605225,
+ "learning_rate": 3.670678989234725e-06,
+ "loss": 0.4237,
+ "step": 4442
+ },
+ {
+ "epoch": 2.100709219858156,
+ "grad_norm": 2.575039863586426,
+ "learning_rate": 3.670127748895973e-06,
+ "loss": 0.4464,
+ "step": 4443
+ },
+ {
+ "epoch": 2.1011820330969266,
+ "grad_norm": 2.3381190299987793,
+ "learning_rate": 3.669576435698497e-06,
+ "loss": 0.4208,
+ "step": 4444
+ },
+ {
+ "epoch": 2.1016548463356974,
+ "grad_norm": 2.9645180702209473,
+ "learning_rate": 3.669025049676625e-06,
+ "loss": 0.5272,
+ "step": 4445
+ },
+ {
+ "epoch": 2.1021276595744682,
+ "grad_norm": 2.719320297241211,
+ "learning_rate": 3.668473590864689e-06,
+ "loss": 0.4485,
+ "step": 4446
+ },
+ {
+ "epoch": 2.1026004728132386,
+ "grad_norm": 2.8665547370910645,
+ "learning_rate": 3.6679220592970254e-06,
+ "loss": 0.4433,
+ "step": 4447
+ },
+ {
+ "epoch": 2.1030732860520094,
+ "grad_norm": 2.6922879219055176,
+ "learning_rate": 3.667370455007977e-06,
+ "loss": 0.502,
+ "step": 4448
+ },
+ {
+ "epoch": 2.10354609929078,
+ "grad_norm": 3.018228530883789,
+ "learning_rate": 3.6668187780318894e-06,
+ "loss": 0.4939,
+ "step": 4449
+ },
+ {
+ "epoch": 2.104018912529551,
+ "grad_norm": 3.187901735305786,
+ "learning_rate": 3.666267028403112e-06,
+ "loss": 0.4151,
+ "step": 4450
+ },
+ {
+ "epoch": 2.1044917257683213,
+ "grad_norm": 2.9521446228027344,
+ "learning_rate": 3.6657152061560012e-06,
+ "loss": 0.4343,
+ "step": 4451
+ },
+ {
+ "epoch": 2.104964539007092,
+ "grad_norm": 2.5125739574432373,
+ "learning_rate": 3.6651633113249164e-06,
+ "loss": 0.4071,
+ "step": 4452
+ },
+ {
+ "epoch": 2.105437352245863,
+ "grad_norm": 2.9164133071899414,
+ "learning_rate": 3.664611343944221e-06,
+ "loss": 0.4173,
+ "step": 4453
+ },
+ {
+ "epoch": 2.1059101654846337,
+ "grad_norm": 2.680893898010254,
+ "learning_rate": 3.6640593040482834e-06,
+ "loss": 0.4917,
+ "step": 4454
+ },
+ {
+ "epoch": 2.106382978723404,
+ "grad_norm": 2.6823534965515137,
+ "learning_rate": 3.6635071916714774e-06,
+ "loss": 0.4668,
+ "step": 4455
+ },
+ {
+ "epoch": 2.106855791962175,
+ "grad_norm": 2.6221907138824463,
+ "learning_rate": 3.6629550068481806e-06,
+ "loss": 0.4956,
+ "step": 4456
+ },
+ {
+ "epoch": 2.1073286052009457,
+ "grad_norm": 3.096370220184326,
+ "learning_rate": 3.6624027496127745e-06,
+ "loss": 0.3995,
+ "step": 4457
+ },
+ {
+ "epoch": 2.1078014184397165,
+ "grad_norm": 2.752885341644287,
+ "learning_rate": 3.661850419999647e-06,
+ "loss": 0.4838,
+ "step": 4458
+ },
+ {
+ "epoch": 2.108274231678487,
+ "grad_norm": 2.6806766986846924,
+ "learning_rate": 3.661298018043188e-06,
+ "loss": 0.4817,
+ "step": 4459
+ },
+ {
+ "epoch": 2.1087470449172576,
+ "grad_norm": 2.6317873001098633,
+ "learning_rate": 3.660745543777794e-06,
+ "loss": 0.4777,
+ "step": 4460
+ },
+ {
+ "epoch": 2.1092198581560284,
+ "grad_norm": 2.4939377307891846,
+ "learning_rate": 3.6601929972378634e-06,
+ "loss": 0.4525,
+ "step": 4461
+ },
+ {
+ "epoch": 2.109692671394799,
+ "grad_norm": 2.4902873039245605,
+ "learning_rate": 3.659640378457803e-06,
+ "loss": 0.4392,
+ "step": 4462
+ },
+ {
+ "epoch": 2.1101654846335696,
+ "grad_norm": 2.5082345008850098,
+ "learning_rate": 3.6590876874720216e-06,
+ "loss": 0.4224,
+ "step": 4463
+ },
+ {
+ "epoch": 2.1106382978723404,
+ "grad_norm": 2.658407211303711,
+ "learning_rate": 3.6585349243149313e-06,
+ "loss": 0.4316,
+ "step": 4464
+ },
+ {
+ "epoch": 2.111111111111111,
+ "grad_norm": 2.562883138656616,
+ "learning_rate": 3.6579820890209515e-06,
+ "loss": 0.4491,
+ "step": 4465
+ },
+ {
+ "epoch": 2.111583924349882,
+ "grad_norm": 2.5719261169433594,
+ "learning_rate": 3.657429181624505e-06,
+ "loss": 0.4406,
+ "step": 4466
+ },
+ {
+ "epoch": 2.1120567375886523,
+ "grad_norm": 2.8840596675872803,
+ "learning_rate": 3.6568762021600184e-06,
+ "loss": 0.4267,
+ "step": 4467
+ },
+ {
+ "epoch": 2.112529550827423,
+ "grad_norm": 2.660304546356201,
+ "learning_rate": 3.656323150661924e-06,
+ "loss": 0.4502,
+ "step": 4468
+ },
+ {
+ "epoch": 2.113002364066194,
+ "grad_norm": 2.610996961593628,
+ "learning_rate": 3.655770027164657e-06,
+ "loss": 0.3934,
+ "step": 4469
+ },
+ {
+ "epoch": 2.1134751773049647,
+ "grad_norm": 2.6000053882598877,
+ "learning_rate": 3.655216831702658e-06,
+ "loss": 0.4582,
+ "step": 4470
+ },
+ {
+ "epoch": 2.113947990543735,
+ "grad_norm": 2.73124098777771,
+ "learning_rate": 3.654663564310372e-06,
+ "loss": 0.4748,
+ "step": 4471
+ },
+ {
+ "epoch": 2.114420803782506,
+ "grad_norm": 2.711091995239258,
+ "learning_rate": 3.6541102250222495e-06,
+ "loss": 0.4145,
+ "step": 4472
+ },
+ {
+ "epoch": 2.1148936170212767,
+ "grad_norm": 2.655996561050415,
+ "learning_rate": 3.6535568138727438e-06,
+ "loss": 0.4407,
+ "step": 4473
+ },
+ {
+ "epoch": 2.1153664302600474,
+ "grad_norm": 2.7630865573883057,
+ "learning_rate": 3.653003330896313e-06,
+ "loss": 0.4298,
+ "step": 4474
+ },
+ {
+ "epoch": 2.115839243498818,
+ "grad_norm": 2.554415464401245,
+ "learning_rate": 3.6524497761274214e-06,
+ "loss": 0.44,
+ "step": 4475
+ },
+ {
+ "epoch": 2.1163120567375886,
+ "grad_norm": 2.790328025817871,
+ "learning_rate": 3.651896149600535e-06,
+ "loss": 0.5061,
+ "step": 4476
+ },
+ {
+ "epoch": 2.1167848699763594,
+ "grad_norm": 2.755267381668091,
+ "learning_rate": 3.651342451350127e-06,
+ "loss": 0.4588,
+ "step": 4477
+ },
+ {
+ "epoch": 2.11725768321513,
+ "grad_norm": 2.8936638832092285,
+ "learning_rate": 3.6507886814106722e-06,
+ "loss": 0.468,
+ "step": 4478
+ },
+ {
+ "epoch": 2.1177304964539005,
+ "grad_norm": 2.7394332885742188,
+ "learning_rate": 3.6502348398166525e-06,
+ "loss": 0.383,
+ "step": 4479
+ },
+ {
+ "epoch": 2.1182033096926713,
+ "grad_norm": 2.3359546661376953,
+ "learning_rate": 3.649680926602553e-06,
+ "loss": 0.3903,
+ "step": 4480
+ },
+ {
+ "epoch": 2.118676122931442,
+ "grad_norm": 3.102202892303467,
+ "learning_rate": 3.6491269418028637e-06,
+ "loss": 0.4525,
+ "step": 4481
+ },
+ {
+ "epoch": 2.119148936170213,
+ "grad_norm": 2.467970848083496,
+ "learning_rate": 3.648572885452078e-06,
+ "loss": 0.414,
+ "step": 4482
+ },
+ {
+ "epoch": 2.1196217494089833,
+ "grad_norm": 2.8984131813049316,
+ "learning_rate": 3.6480187575846952e-06,
+ "loss": 0.4571,
+ "step": 4483
+ },
+ {
+ "epoch": 2.120094562647754,
+ "grad_norm": 2.674834966659546,
+ "learning_rate": 3.6474645582352187e-06,
+ "loss": 0.455,
+ "step": 4484
+ },
+ {
+ "epoch": 2.120567375886525,
+ "grad_norm": 2.8713369369506836,
+ "learning_rate": 3.6469102874381552e-06,
+ "loss": 0.4567,
+ "step": 4485
+ },
+ {
+ "epoch": 2.1210401891252957,
+ "grad_norm": 3.174814462661743,
+ "learning_rate": 3.646355945228017e-06,
+ "loss": 0.5295,
+ "step": 4486
+ },
+ {
+ "epoch": 2.121513002364066,
+ "grad_norm": 2.6409823894500732,
+ "learning_rate": 3.6458015316393215e-06,
+ "loss": 0.4308,
+ "step": 4487
+ },
+ {
+ "epoch": 2.121985815602837,
+ "grad_norm": 2.4228954315185547,
+ "learning_rate": 3.645247046706588e-06,
+ "loss": 0.4042,
+ "step": 4488
+ },
+ {
+ "epoch": 2.1224586288416076,
+ "grad_norm": 2.553551435470581,
+ "learning_rate": 3.6446924904643427e-06,
+ "loss": 0.3925,
+ "step": 4489
+ },
+ {
+ "epoch": 2.1229314420803784,
+ "grad_norm": 2.8019237518310547,
+ "learning_rate": 3.6441378629471157e-06,
+ "loss": 0.4079,
+ "step": 4490
+ },
+ {
+ "epoch": 2.123404255319149,
+ "grad_norm": 2.993251085281372,
+ "learning_rate": 3.643583164189441e-06,
+ "loss": 0.4558,
+ "step": 4491
+ },
+ {
+ "epoch": 2.1238770685579196,
+ "grad_norm": 2.4531471729278564,
+ "learning_rate": 3.643028394225857e-06,
+ "loss": 0.4167,
+ "step": 4492
+ },
+ {
+ "epoch": 2.1243498817966904,
+ "grad_norm": 2.6827852725982666,
+ "learning_rate": 3.6424735530909065e-06,
+ "loss": 0.4311,
+ "step": 4493
+ },
+ {
+ "epoch": 2.124822695035461,
+ "grad_norm": 3.1232128143310547,
+ "learning_rate": 3.6419186408191377e-06,
+ "loss": 0.4537,
+ "step": 4494
+ },
+ {
+ "epoch": 2.1252955082742315,
+ "grad_norm": 2.816348075866699,
+ "learning_rate": 3.641363657445103e-06,
+ "loss": 0.4869,
+ "step": 4495
+ },
+ {
+ "epoch": 2.1257683215130023,
+ "grad_norm": 2.6269683837890625,
+ "learning_rate": 3.6408086030033575e-06,
+ "loss": 0.4066,
+ "step": 4496
+ },
+ {
+ "epoch": 2.126241134751773,
+ "grad_norm": 4.6375956535339355,
+ "learning_rate": 3.640253477528462e-06,
+ "loss": 0.4488,
+ "step": 4497
+ },
+ {
+ "epoch": 2.126713947990544,
+ "grad_norm": 3.020970582962036,
+ "learning_rate": 3.639698281054983e-06,
+ "loss": 0.4197,
+ "step": 4498
+ },
+ {
+ "epoch": 2.1271867612293143,
+ "grad_norm": 2.87904691696167,
+ "learning_rate": 3.6391430136174892e-06,
+ "loss": 0.4743,
+ "step": 4499
+ },
+ {
+ "epoch": 2.127659574468085,
+ "grad_norm": 2.719892978668213,
+ "learning_rate": 3.6385876752505554e-06,
+ "loss": 0.388,
+ "step": 4500
+ },
+ {
+ "epoch": 2.128132387706856,
+ "grad_norm": 2.7321808338165283,
+ "learning_rate": 3.638032265988759e-06,
+ "loss": 0.4857,
+ "step": 4501
+ },
+ {
+ "epoch": 2.1286052009456267,
+ "grad_norm": 2.700814723968506,
+ "learning_rate": 3.6374767858666836e-06,
+ "loss": 0.4819,
+ "step": 4502
+ },
+ {
+ "epoch": 2.129078014184397,
+ "grad_norm": 2.658423662185669,
+ "learning_rate": 3.6369212349189164e-06,
+ "loss": 0.4113,
+ "step": 4503
+ },
+ {
+ "epoch": 2.129550827423168,
+ "grad_norm": 2.673877716064453,
+ "learning_rate": 3.63636561318005e-06,
+ "loss": 0.3745,
+ "step": 4504
+ },
+ {
+ "epoch": 2.1300236406619386,
+ "grad_norm": 2.607758045196533,
+ "learning_rate": 3.6358099206846787e-06,
+ "loss": 0.4409,
+ "step": 4505
+ },
+ {
+ "epoch": 2.1304964539007094,
+ "grad_norm": 2.8117682933807373,
+ "learning_rate": 3.6352541574674044e-06,
+ "loss": 0.426,
+ "step": 4506
+ },
+ {
+ "epoch": 2.1309692671394798,
+ "grad_norm": 2.6970250606536865,
+ "learning_rate": 3.634698323562832e-06,
+ "loss": 0.4295,
+ "step": 4507
+ },
+ {
+ "epoch": 2.1314420803782506,
+ "grad_norm": 2.7133560180664062,
+ "learning_rate": 3.6341424190055696e-06,
+ "loss": 0.4443,
+ "step": 4508
+ },
+ {
+ "epoch": 2.1319148936170214,
+ "grad_norm": 2.57181715965271,
+ "learning_rate": 3.6335864438302328e-06,
+ "loss": 0.3995,
+ "step": 4509
+ },
+ {
+ "epoch": 2.132387706855792,
+ "grad_norm": 2.8618004322052,
+ "learning_rate": 3.633030398071438e-06,
+ "loss": 0.5075,
+ "step": 4510
+ },
+ {
+ "epoch": 2.1328605200945625,
+ "grad_norm": 2.7586729526519775,
+ "learning_rate": 3.6324742817638087e-06,
+ "loss": 0.4322,
+ "step": 4511
+ },
+ {
+ "epoch": 2.1333333333333333,
+ "grad_norm": 2.913256883621216,
+ "learning_rate": 3.631918094941972e-06,
+ "loss": 0.4708,
+ "step": 4512
+ },
+ {
+ "epoch": 2.133806146572104,
+ "grad_norm": 2.7715728282928467,
+ "learning_rate": 3.6313618376405585e-06,
+ "loss": 0.5194,
+ "step": 4513
+ },
+ {
+ "epoch": 2.134278959810875,
+ "grad_norm": 2.7986366748809814,
+ "learning_rate": 3.6308055098942042e-06,
+ "loss": 0.4419,
+ "step": 4514
+ },
+ {
+ "epoch": 2.1347517730496453,
+ "grad_norm": 3.043549060821533,
+ "learning_rate": 3.6302491117375492e-06,
+ "loss": 0.4441,
+ "step": 4515
+ },
+ {
+ "epoch": 2.135224586288416,
+ "grad_norm": 2.771761417388916,
+ "learning_rate": 3.629692643205238e-06,
+ "loss": 0.4752,
+ "step": 4516
+ },
+ {
+ "epoch": 2.135697399527187,
+ "grad_norm": 2.804941415786743,
+ "learning_rate": 3.6291361043319202e-06,
+ "loss": 0.4089,
+ "step": 4517
+ },
+ {
+ "epoch": 2.1361702127659576,
+ "grad_norm": 2.9897940158843994,
+ "learning_rate": 3.628579495152248e-06,
+ "loss": 0.4829,
+ "step": 4518
+ },
+ {
+ "epoch": 2.136643026004728,
+ "grad_norm": 2.9273486137390137,
+ "learning_rate": 3.6280228157008784e-06,
+ "loss": 0.4469,
+ "step": 4519
+ },
+ {
+ "epoch": 2.137115839243499,
+ "grad_norm": 2.584373950958252,
+ "learning_rate": 3.627466066012475e-06,
+ "loss": 0.4277,
+ "step": 4520
+ },
+ {
+ "epoch": 2.1375886524822696,
+ "grad_norm": 3.009333848953247,
+ "learning_rate": 3.626909246121703e-06,
+ "loss": 0.4025,
+ "step": 4521
+ },
+ {
+ "epoch": 2.1380614657210404,
+ "grad_norm": 2.634615659713745,
+ "learning_rate": 3.626352356063234e-06,
+ "loss": 0.4046,
+ "step": 4522
+ },
+ {
+ "epoch": 2.1385342789598107,
+ "grad_norm": 2.87310528755188,
+ "learning_rate": 3.625795395871743e-06,
+ "loss": 0.4426,
+ "step": 4523
+ },
+ {
+ "epoch": 2.1390070921985815,
+ "grad_norm": 2.94985032081604,
+ "learning_rate": 3.625238365581909e-06,
+ "loss": 0.445,
+ "step": 4524
+ },
+ {
+ "epoch": 2.1394799054373523,
+ "grad_norm": 2.470189332962036,
+ "learning_rate": 3.624681265228416e-06,
+ "loss": 0.4082,
+ "step": 4525
+ },
+ {
+ "epoch": 2.139952718676123,
+ "grad_norm": 2.5304040908813477,
+ "learning_rate": 3.624124094845952e-06,
+ "loss": 0.403,
+ "step": 4526
+ },
+ {
+ "epoch": 2.1404255319148935,
+ "grad_norm": 2.6148900985717773,
+ "learning_rate": 3.62356685446921e-06,
+ "loss": 0.3867,
+ "step": 4527
+ },
+ {
+ "epoch": 2.1408983451536643,
+ "grad_norm": 2.885549783706665,
+ "learning_rate": 3.623009544132886e-06,
+ "loss": 0.4706,
+ "step": 4528
+ },
+ {
+ "epoch": 2.141371158392435,
+ "grad_norm": 3.00490665435791,
+ "learning_rate": 3.6224521638716827e-06,
+ "loss": 0.4733,
+ "step": 4529
+ },
+ {
+ "epoch": 2.141843971631206,
+ "grad_norm": 2.925879716873169,
+ "learning_rate": 3.6218947137203043e-06,
+ "loss": 0.4581,
+ "step": 4530
+ },
+ {
+ "epoch": 2.1423167848699762,
+ "grad_norm": 3.10861873626709,
+ "learning_rate": 3.621337193713462e-06,
+ "loss": 0.4579,
+ "step": 4531
+ },
+ {
+ "epoch": 2.142789598108747,
+ "grad_norm": 2.7386577129364014,
+ "learning_rate": 3.6207796038858693e-06,
+ "loss": 0.4248,
+ "step": 4532
+ },
+ {
+ "epoch": 2.143262411347518,
+ "grad_norm": 2.601836681365967,
+ "learning_rate": 3.6202219442722453e-06,
+ "loss": 0.4928,
+ "step": 4533
+ },
+ {
+ "epoch": 2.1437352245862886,
+ "grad_norm": 2.598778247833252,
+ "learning_rate": 3.6196642149073123e-06,
+ "loss": 0.4415,
+ "step": 4534
+ },
+ {
+ "epoch": 2.144208037825059,
+ "grad_norm": 2.443995714187622,
+ "learning_rate": 3.619106415825798e-06,
+ "loss": 0.3917,
+ "step": 4535
+ },
+ {
+ "epoch": 2.1446808510638298,
+ "grad_norm": 2.84643816947937,
+ "learning_rate": 3.6185485470624354e-06,
+ "loss": 0.4162,
+ "step": 4536
+ },
+ {
+ "epoch": 2.1451536643026006,
+ "grad_norm": 2.4568188190460205,
+ "learning_rate": 3.617990608651959e-06,
+ "loss": 0.4298,
+ "step": 4537
+ },
+ {
+ "epoch": 2.145626477541371,
+ "grad_norm": 2.968804359436035,
+ "learning_rate": 3.61743260062911e-06,
+ "loss": 0.4696,
+ "step": 4538
+ },
+ {
+ "epoch": 2.1460992907801417,
+ "grad_norm": 2.629075288772583,
+ "learning_rate": 3.6168745230286327e-06,
+ "loss": 0.4234,
+ "step": 4539
+ },
+ {
+ "epoch": 2.1465721040189125,
+ "grad_norm": 2.7680578231811523,
+ "learning_rate": 3.6163163758852754e-06,
+ "loss": 0.4669,
+ "step": 4540
+ },
+ {
+ "epoch": 2.1470449172576833,
+ "grad_norm": 2.782825469970703,
+ "learning_rate": 3.615758159233793e-06,
+ "loss": 0.4552,
+ "step": 4541
+ },
+ {
+ "epoch": 2.147517730496454,
+ "grad_norm": 2.653047561645508,
+ "learning_rate": 3.615199873108942e-06,
+ "loss": 0.4393,
+ "step": 4542
+ },
+ {
+ "epoch": 2.1479905437352245,
+ "grad_norm": 2.4175806045532227,
+ "learning_rate": 3.6146415175454852e-06,
+ "loss": 0.4114,
+ "step": 4543
+ },
+ {
+ "epoch": 2.1484633569739953,
+ "grad_norm": 2.627943515777588,
+ "learning_rate": 3.614083092578189e-06,
+ "loss": 0.4215,
+ "step": 4544
+ },
+ {
+ "epoch": 2.148936170212766,
+ "grad_norm": 2.8934123516082764,
+ "learning_rate": 3.6135245982418227e-06,
+ "loss": 0.4815,
+ "step": 4545
+ },
+ {
+ "epoch": 2.1494089834515364,
+ "grad_norm": 2.8535244464874268,
+ "learning_rate": 3.612966034571164e-06,
+ "loss": 0.4683,
+ "step": 4546
+ },
+ {
+ "epoch": 2.149881796690307,
+ "grad_norm": 2.7826647758483887,
+ "learning_rate": 3.6124074016009893e-06,
+ "loss": 0.4351,
+ "step": 4547
+ },
+ {
+ "epoch": 2.150354609929078,
+ "grad_norm": 2.6906018257141113,
+ "learning_rate": 3.6118486993660834e-06,
+ "loss": 0.4585,
+ "step": 4548
+ },
+ {
+ "epoch": 2.150827423167849,
+ "grad_norm": 2.726766586303711,
+ "learning_rate": 3.6112899279012346e-06,
+ "loss": 0.4753,
+ "step": 4549
+ },
+ {
+ "epoch": 2.1513002364066196,
+ "grad_norm": 3.0193991661071777,
+ "learning_rate": 3.6107310872412348e-06,
+ "loss": 0.4827,
+ "step": 4550
+ },
+ {
+ "epoch": 2.15177304964539,
+ "grad_norm": 2.6788697242736816,
+ "learning_rate": 3.610172177420881e-06,
+ "loss": 0.4333,
+ "step": 4551
+ },
+ {
+ "epoch": 2.1522458628841608,
+ "grad_norm": 2.865410327911377,
+ "learning_rate": 3.609613198474973e-06,
+ "loss": 0.4569,
+ "step": 4552
+ },
+ {
+ "epoch": 2.1527186761229316,
+ "grad_norm": 2.9199366569519043,
+ "learning_rate": 3.609054150438317e-06,
+ "loss": 0.5097,
+ "step": 4553
+ },
+ {
+ "epoch": 2.153191489361702,
+ "grad_norm": 2.761035203933716,
+ "learning_rate": 3.6084950333457215e-06,
+ "loss": 0.5002,
+ "step": 4554
+ },
+ {
+ "epoch": 2.1536643026004727,
+ "grad_norm": 2.514223337173462,
+ "learning_rate": 3.607935847232002e-06,
+ "loss": 0.4171,
+ "step": 4555
+ },
+ {
+ "epoch": 2.1541371158392435,
+ "grad_norm": 2.5167524814605713,
+ "learning_rate": 3.6073765921319747e-06,
+ "loss": 0.4494,
+ "step": 4556
+ },
+ {
+ "epoch": 2.1546099290780143,
+ "grad_norm": 2.7540643215179443,
+ "learning_rate": 3.606817268080463e-06,
+ "loss": 0.4472,
+ "step": 4557
+ },
+ {
+ "epoch": 2.155082742316785,
+ "grad_norm": 2.7728664875030518,
+ "learning_rate": 3.6062578751122936e-06,
+ "loss": 0.4669,
+ "step": 4558
+ },
+ {
+ "epoch": 2.1555555555555554,
+ "grad_norm": 2.7788400650024414,
+ "learning_rate": 3.605698413262296e-06,
+ "loss": 0.4613,
+ "step": 4559
+ },
+ {
+ "epoch": 2.1560283687943262,
+ "grad_norm": 2.7811810970306396,
+ "learning_rate": 3.605138882565308e-06,
+ "loss": 0.4242,
+ "step": 4560
+ },
+ {
+ "epoch": 2.156501182033097,
+ "grad_norm": 2.7819995880126953,
+ "learning_rate": 3.6045792830561664e-06,
+ "loss": 0.443,
+ "step": 4561
+ },
+ {
+ "epoch": 2.1569739952718674,
+ "grad_norm": 2.671259641647339,
+ "learning_rate": 3.6040196147697166e-06,
+ "loss": 0.4336,
+ "step": 4562
+ },
+ {
+ "epoch": 2.157446808510638,
+ "grad_norm": 2.9296300411224365,
+ "learning_rate": 3.603459877740807e-06,
+ "loss": 0.479,
+ "step": 4563
+ },
+ {
+ "epoch": 2.157919621749409,
+ "grad_norm": 2.834937334060669,
+ "learning_rate": 3.602900072004289e-06,
+ "loss": 0.4603,
+ "step": 4564
+ },
+ {
+ "epoch": 2.15839243498818,
+ "grad_norm": 2.8434760570526123,
+ "learning_rate": 3.602340197595019e-06,
+ "loss": 0.4288,
+ "step": 4565
+ },
+ {
+ "epoch": 2.1588652482269506,
+ "grad_norm": 2.7245426177978516,
+ "learning_rate": 3.6017802545478593e-06,
+ "loss": 0.4194,
+ "step": 4566
+ },
+ {
+ "epoch": 2.159338061465721,
+ "grad_norm": 2.7795023918151855,
+ "learning_rate": 3.6012202428976735e-06,
+ "loss": 0.4481,
+ "step": 4567
+ },
+ {
+ "epoch": 2.1598108747044917,
+ "grad_norm": 2.9482083320617676,
+ "learning_rate": 3.6006601626793325e-06,
+ "loss": 0.468,
+ "step": 4568
+ },
+ {
+ "epoch": 2.1602836879432625,
+ "grad_norm": 2.9563326835632324,
+ "learning_rate": 3.6001000139277094e-06,
+ "loss": 0.4427,
+ "step": 4569
+ },
+ {
+ "epoch": 2.160756501182033,
+ "grad_norm": 2.7755916118621826,
+ "learning_rate": 3.599539796677682e-06,
+ "loss": 0.4258,
+ "step": 4570
+ },
+ {
+ "epoch": 2.1612293144208037,
+ "grad_norm": 2.961045265197754,
+ "learning_rate": 3.5989795109641333e-06,
+ "loss": 0.4645,
+ "step": 4571
+ },
+ {
+ "epoch": 2.1617021276595745,
+ "grad_norm": 3.0184407234191895,
+ "learning_rate": 3.5984191568219482e-06,
+ "loss": 0.4192,
+ "step": 4572
+ },
+ {
+ "epoch": 2.1621749408983453,
+ "grad_norm": 2.9811131954193115,
+ "learning_rate": 3.5978587342860192e-06,
+ "loss": 0.408,
+ "step": 4573
+ },
+ {
+ "epoch": 2.162647754137116,
+ "grad_norm": 2.9172329902648926,
+ "learning_rate": 3.597298243391242e-06,
+ "loss": 0.4528,
+ "step": 4574
+ },
+ {
+ "epoch": 2.1631205673758864,
+ "grad_norm": 2.7798452377319336,
+ "learning_rate": 3.596737684172513e-06,
+ "loss": 0.391,
+ "step": 4575
+ },
+ {
+ "epoch": 2.1635933806146572,
+ "grad_norm": 2.526277542114258,
+ "learning_rate": 3.596177056664738e-06,
+ "loss": 0.3699,
+ "step": 4576
+ },
+ {
+ "epoch": 2.164066193853428,
+ "grad_norm": 2.856269121170044,
+ "learning_rate": 3.5956163609028244e-06,
+ "loss": 0.4082,
+ "step": 4577
+ },
+ {
+ "epoch": 2.1645390070921984,
+ "grad_norm": 2.7681572437286377,
+ "learning_rate": 3.5950555969216845e-06,
+ "loss": 0.4064,
+ "step": 4578
+ },
+ {
+ "epoch": 2.165011820330969,
+ "grad_norm": 2.2924954891204834,
+ "learning_rate": 3.5944947647562333e-06,
+ "loss": 0.416,
+ "step": 4579
+ },
+ {
+ "epoch": 2.16548463356974,
+ "grad_norm": 2.439929485321045,
+ "learning_rate": 3.5939338644413936e-06,
+ "loss": 0.4476,
+ "step": 4580
+ },
+ {
+ "epoch": 2.1659574468085108,
+ "grad_norm": 2.786442518234253,
+ "learning_rate": 3.5933728960120877e-06,
+ "loss": 0.4525,
+ "step": 4581
+ },
+ {
+ "epoch": 2.166430260047281,
+ "grad_norm": 2.5910253524780273,
+ "learning_rate": 3.5928118595032465e-06,
+ "loss": 0.4441,
+ "step": 4582
+ },
+ {
+ "epoch": 2.166903073286052,
+ "grad_norm": 2.8144876956939697,
+ "learning_rate": 3.5922507549498024e-06,
+ "loss": 0.497,
+ "step": 4583
+ },
+ {
+ "epoch": 2.1673758865248227,
+ "grad_norm": 2.5714170932769775,
+ "learning_rate": 3.591689582386694e-06,
+ "loss": 0.4625,
+ "step": 4584
+ },
+ {
+ "epoch": 2.1678486997635935,
+ "grad_norm": 2.878187894821167,
+ "learning_rate": 3.591128341848861e-06,
+ "loss": 0.4835,
+ "step": 4585
+ },
+ {
+ "epoch": 2.168321513002364,
+ "grad_norm": 2.4946508407592773,
+ "learning_rate": 3.5905670333712504e-06,
+ "loss": 0.4278,
+ "step": 4586
+ },
+ {
+ "epoch": 2.1687943262411347,
+ "grad_norm": 2.9186196327209473,
+ "learning_rate": 3.590005656988814e-06,
+ "loss": 0.465,
+ "step": 4587
+ },
+ {
+ "epoch": 2.1692671394799055,
+ "grad_norm": 3.136807441711426,
+ "learning_rate": 3.5894442127365046e-06,
+ "loss": 0.4146,
+ "step": 4588
+ },
+ {
+ "epoch": 2.1697399527186763,
+ "grad_norm": 2.8106343746185303,
+ "learning_rate": 3.5888827006492804e-06,
+ "loss": 0.4737,
+ "step": 4589
+ },
+ {
+ "epoch": 2.1702127659574466,
+ "grad_norm": 2.874553680419922,
+ "learning_rate": 3.5883211207621047e-06,
+ "loss": 0.3962,
+ "step": 4590
+ },
+ {
+ "epoch": 2.1706855791962174,
+ "grad_norm": 2.7914116382598877,
+ "learning_rate": 3.587759473109946e-06,
+ "loss": 0.4705,
+ "step": 4591
+ },
+ {
+ "epoch": 2.171158392434988,
+ "grad_norm": 2.7273290157318115,
+ "learning_rate": 3.5871977577277745e-06,
+ "loss": 0.4827,
+ "step": 4592
+ },
+ {
+ "epoch": 2.171631205673759,
+ "grad_norm": 2.4167256355285645,
+ "learning_rate": 3.5866359746505653e-06,
+ "loss": 0.4181,
+ "step": 4593
+ },
+ {
+ "epoch": 2.1721040189125294,
+ "grad_norm": 2.8929779529571533,
+ "learning_rate": 3.586074123913299e-06,
+ "loss": 0.4006,
+ "step": 4594
+ },
+ {
+ "epoch": 2.1725768321513,
+ "grad_norm": 2.6996190547943115,
+ "learning_rate": 3.5855122055509593e-06,
+ "loss": 0.4792,
+ "step": 4595
+ },
+ {
+ "epoch": 2.173049645390071,
+ "grad_norm": 2.9341464042663574,
+ "learning_rate": 3.584950219598534e-06,
+ "loss": 0.3903,
+ "step": 4596
+ },
+ {
+ "epoch": 2.1735224586288417,
+ "grad_norm": 2.799330234527588,
+ "learning_rate": 3.5843881660910166e-06,
+ "loss": 0.4717,
+ "step": 4597
+ },
+ {
+ "epoch": 2.173995271867612,
+ "grad_norm": 2.5028693675994873,
+ "learning_rate": 3.5838260450634028e-06,
+ "loss": 0.4462,
+ "step": 4598
+ },
+ {
+ "epoch": 2.174468085106383,
+ "grad_norm": 2.5845541954040527,
+ "learning_rate": 3.583263856550693e-06,
+ "loss": 0.4327,
+ "step": 4599
+ },
+ {
+ "epoch": 2.1749408983451537,
+ "grad_norm": 2.4804906845092773,
+ "learning_rate": 3.5827016005878933e-06,
+ "loss": 0.4555,
+ "step": 4600
+ },
+ {
+ "epoch": 2.1754137115839245,
+ "grad_norm": 2.625746011734009,
+ "learning_rate": 3.5821392772100125e-06,
+ "loss": 0.455,
+ "step": 4601
+ },
+ {
+ "epoch": 2.175886524822695,
+ "grad_norm": 2.6230757236480713,
+ "learning_rate": 3.581576886452064e-06,
+ "loss": 0.4422,
+ "step": 4602
+ },
+ {
+ "epoch": 2.1763593380614656,
+ "grad_norm": 3.3104100227355957,
+ "learning_rate": 3.5810144283490656e-06,
+ "loss": 0.4212,
+ "step": 4603
+ },
+ {
+ "epoch": 2.1768321513002364,
+ "grad_norm": 2.6799755096435547,
+ "learning_rate": 3.5804519029360384e-06,
+ "loss": 0.4575,
+ "step": 4604
+ },
+ {
+ "epoch": 2.1773049645390072,
+ "grad_norm": 2.462216854095459,
+ "learning_rate": 3.5798893102480085e-06,
+ "loss": 0.4096,
+ "step": 4605
+ },
+ {
+ "epoch": 2.1777777777777776,
+ "grad_norm": 2.8600878715515137,
+ "learning_rate": 3.5793266503200074e-06,
+ "loss": 0.4798,
+ "step": 4606
+ },
+ {
+ "epoch": 2.1782505910165484,
+ "grad_norm": 2.935746431350708,
+ "learning_rate": 3.5787639231870673e-06,
+ "loss": 0.4021,
+ "step": 4607
+ },
+ {
+ "epoch": 2.178723404255319,
+ "grad_norm": 2.8655526638031006,
+ "learning_rate": 3.578201128884229e-06,
+ "loss": 0.4553,
+ "step": 4608
+ },
+ {
+ "epoch": 2.17919621749409,
+ "grad_norm": 3.219498634338379,
+ "learning_rate": 3.577638267446533e-06,
+ "loss": 0.4692,
+ "step": 4609
+ },
+ {
+ "epoch": 2.1796690307328603,
+ "grad_norm": 3.0449860095977783,
+ "learning_rate": 3.5770753389090283e-06,
+ "loss": 0.4675,
+ "step": 4610
+ },
+ {
+ "epoch": 2.180141843971631,
+ "grad_norm": 2.7045507431030273,
+ "learning_rate": 3.576512343306765e-06,
+ "loss": 0.4773,
+ "step": 4611
+ },
+ {
+ "epoch": 2.180614657210402,
+ "grad_norm": 2.601499557495117,
+ "learning_rate": 3.5759492806747985e-06,
+ "loss": 0.4112,
+ "step": 4612
+ },
+ {
+ "epoch": 2.1810874704491727,
+ "grad_norm": 2.987741470336914,
+ "learning_rate": 3.575386151048188e-06,
+ "loss": 0.4651,
+ "step": 4613
+ },
+ {
+ "epoch": 2.181560283687943,
+ "grad_norm": 2.961228847503662,
+ "learning_rate": 3.5748229544619973e-06,
+ "loss": 0.5116,
+ "step": 4614
+ },
+ {
+ "epoch": 2.182033096926714,
+ "grad_norm": 2.8008430004119873,
+ "learning_rate": 3.574259690951295e-06,
+ "loss": 0.4152,
+ "step": 4615
+ },
+ {
+ "epoch": 2.1825059101654847,
+ "grad_norm": 2.5429348945617676,
+ "learning_rate": 3.573696360551151e-06,
+ "loss": 0.4188,
+ "step": 4616
+ },
+ {
+ "epoch": 2.1829787234042555,
+ "grad_norm": 2.9566478729248047,
+ "learning_rate": 3.5731329632966428e-06,
+ "loss": 0.5156,
+ "step": 4617
+ },
+ {
+ "epoch": 2.183451536643026,
+ "grad_norm": 2.5302467346191406,
+ "learning_rate": 3.572569499222851e-06,
+ "loss": 0.4361,
+ "step": 4618
+ },
+ {
+ "epoch": 2.1839243498817966,
+ "grad_norm": 3.206803560256958,
+ "learning_rate": 3.5720059683648593e-06,
+ "loss": 0.5149,
+ "step": 4619
+ },
+ {
+ "epoch": 2.1843971631205674,
+ "grad_norm": 2.9432034492492676,
+ "learning_rate": 3.5714423707577573e-06,
+ "loss": 0.4411,
+ "step": 4620
+ },
+ {
+ "epoch": 2.184869976359338,
+ "grad_norm": 2.9412078857421875,
+ "learning_rate": 3.5708787064366358e-06,
+ "loss": 0.4372,
+ "step": 4621
+ },
+ {
+ "epoch": 2.1853427895981086,
+ "grad_norm": 3.1702330112457275,
+ "learning_rate": 3.5703149754365935e-06,
+ "loss": 0.4761,
+ "step": 4622
+ },
+ {
+ "epoch": 2.1858156028368794,
+ "grad_norm": 3.1240456104278564,
+ "learning_rate": 3.569751177792731e-06,
+ "loss": 0.4854,
+ "step": 4623
+ },
+ {
+ "epoch": 2.18628841607565,
+ "grad_norm": 2.7221994400024414,
+ "learning_rate": 3.5691873135401534e-06,
+ "loss": 0.4048,
+ "step": 4624
+ },
+ {
+ "epoch": 2.186761229314421,
+ "grad_norm": 2.74397873878479,
+ "learning_rate": 3.5686233827139695e-06,
+ "loss": 0.4747,
+ "step": 4625
+ },
+ {
+ "epoch": 2.1872340425531913,
+ "grad_norm": 2.7379889488220215,
+ "learning_rate": 3.5680593853492932e-06,
+ "loss": 0.4963,
+ "step": 4626
+ },
+ {
+ "epoch": 2.187706855791962,
+ "grad_norm": 3.040205478668213,
+ "learning_rate": 3.5674953214812435e-06,
+ "loss": 0.4917,
+ "step": 4627
+ },
+ {
+ "epoch": 2.188179669030733,
+ "grad_norm": 2.95302677154541,
+ "learning_rate": 3.56693119114494e-06,
+ "loss": 0.4758,
+ "step": 4628
+ },
+ {
+ "epoch": 2.1886524822695037,
+ "grad_norm": 2.5488312244415283,
+ "learning_rate": 3.56636699437551e-06,
+ "loss": 0.4057,
+ "step": 4629
+ },
+ {
+ "epoch": 2.189125295508274,
+ "grad_norm": 2.8379666805267334,
+ "learning_rate": 3.565802731208083e-06,
+ "loss": 0.4755,
+ "step": 4630
+ },
+ {
+ "epoch": 2.189598108747045,
+ "grad_norm": 2.8765869140625,
+ "learning_rate": 3.565238401677793e-06,
+ "loss": 0.4232,
+ "step": 4631
+ },
+ {
+ "epoch": 2.1900709219858157,
+ "grad_norm": 2.9091262817382812,
+ "learning_rate": 3.5646740058197784e-06,
+ "loss": 0.3874,
+ "step": 4632
+ },
+ {
+ "epoch": 2.1905437352245865,
+ "grad_norm": 2.7067387104034424,
+ "learning_rate": 3.5641095436691826e-06,
+ "loss": 0.4771,
+ "step": 4633
+ },
+ {
+ "epoch": 2.191016548463357,
+ "grad_norm": 2.403043508529663,
+ "learning_rate": 3.563545015261151e-06,
+ "loss": 0.4062,
+ "step": 4634
+ },
+ {
+ "epoch": 2.1914893617021276,
+ "grad_norm": 2.8059732913970947,
+ "learning_rate": 3.562980420630836e-06,
+ "loss": 0.4635,
+ "step": 4635
+ },
+ {
+ "epoch": 2.1919621749408984,
+ "grad_norm": 2.5467724800109863,
+ "learning_rate": 3.56241575981339e-06,
+ "loss": 0.4552,
+ "step": 4636
+ },
+ {
+ "epoch": 2.192434988179669,
+ "grad_norm": 2.651024103164673,
+ "learning_rate": 3.561851032843973e-06,
+ "loss": 0.38,
+ "step": 4637
+ },
+ {
+ "epoch": 2.1929078014184396,
+ "grad_norm": 2.5529849529266357,
+ "learning_rate": 3.5612862397577496e-06,
+ "loss": 0.4106,
+ "step": 4638
+ },
+ {
+ "epoch": 2.1933806146572103,
+ "grad_norm": 3.069258451461792,
+ "learning_rate": 3.5607213805898844e-06,
+ "loss": 0.461,
+ "step": 4639
+ },
+ {
+ "epoch": 2.193853427895981,
+ "grad_norm": 2.5652637481689453,
+ "learning_rate": 3.56015645537555e-06,
+ "loss": 0.4497,
+ "step": 4640
+ },
+ {
+ "epoch": 2.194326241134752,
+ "grad_norm": 2.699101209640503,
+ "learning_rate": 3.5595914641499224e-06,
+ "loss": 0.4887,
+ "step": 4641
+ },
+ {
+ "epoch": 2.1947990543735223,
+ "grad_norm": 2.9292235374450684,
+ "learning_rate": 3.5590264069481805e-06,
+ "loss": 0.4462,
+ "step": 4642
+ },
+ {
+ "epoch": 2.195271867612293,
+ "grad_norm": 2.6151106357574463,
+ "learning_rate": 3.5584612838055077e-06,
+ "loss": 0.4334,
+ "step": 4643
+ },
+ {
+ "epoch": 2.195744680851064,
+ "grad_norm": 2.895798444747925,
+ "learning_rate": 3.5578960947570923e-06,
+ "loss": 0.4448,
+ "step": 4644
+ },
+ {
+ "epoch": 2.1962174940898347,
+ "grad_norm": 2.627631425857544,
+ "learning_rate": 3.557330839838125e-06,
+ "loss": 0.436,
+ "step": 4645
+ },
+ {
+ "epoch": 2.196690307328605,
+ "grad_norm": 2.8803584575653076,
+ "learning_rate": 3.556765519083803e-06,
+ "loss": 0.4698,
+ "step": 4646
+ },
+ {
+ "epoch": 2.197163120567376,
+ "grad_norm": 2.436609983444214,
+ "learning_rate": 3.5562001325293265e-06,
+ "loss": 0.4043,
+ "step": 4647
+ },
+ {
+ "epoch": 2.1976359338061466,
+ "grad_norm": 2.5090718269348145,
+ "learning_rate": 3.5556346802098985e-06,
+ "loss": 0.4505,
+ "step": 4648
+ },
+ {
+ "epoch": 2.1981087470449174,
+ "grad_norm": 2.792783737182617,
+ "learning_rate": 3.5550691621607277e-06,
+ "loss": 0.43,
+ "step": 4649
+ },
+ {
+ "epoch": 2.198581560283688,
+ "grad_norm": 2.74153470993042,
+ "learning_rate": 3.554503578417026e-06,
+ "loss": 0.4496,
+ "step": 4650
+ },
+ {
+ "epoch": 2.1990543735224586,
+ "grad_norm": 3.0262627601623535,
+ "learning_rate": 3.5539379290140114e-06,
+ "loss": 0.4503,
+ "step": 4651
+ },
+ {
+ "epoch": 2.1995271867612294,
+ "grad_norm": 2.783811330795288,
+ "learning_rate": 3.553372213986903e-06,
+ "loss": 0.432,
+ "step": 4652
+ },
+ {
+ "epoch": 2.2,
+ "grad_norm": 3.091191053390503,
+ "learning_rate": 3.5528064333709255e-06,
+ "loss": 0.4658,
+ "step": 4653
+ },
+ {
+ "epoch": 2.2004728132387705,
+ "grad_norm": 2.814634084701538,
+ "learning_rate": 3.5522405872013076e-06,
+ "loss": 0.4473,
+ "step": 4654
+ },
+ {
+ "epoch": 2.2009456264775413,
+ "grad_norm": 2.6918299198150635,
+ "learning_rate": 3.5516746755132824e-06,
+ "loss": 0.5323,
+ "step": 4655
+ },
+ {
+ "epoch": 2.201418439716312,
+ "grad_norm": 2.9902455806732178,
+ "learning_rate": 3.5511086983420867e-06,
+ "loss": 0.5166,
+ "step": 4656
+ },
+ {
+ "epoch": 2.201891252955083,
+ "grad_norm": 2.932699203491211,
+ "learning_rate": 3.5505426557229616e-06,
+ "loss": 0.5197,
+ "step": 4657
+ },
+ {
+ "epoch": 2.2023640661938533,
+ "grad_norm": 2.585712432861328,
+ "learning_rate": 3.549976547691152e-06,
+ "loss": 0.425,
+ "step": 4658
+ },
+ {
+ "epoch": 2.202836879432624,
+ "grad_norm": 3.1019949913024902,
+ "learning_rate": 3.5494103742819065e-06,
+ "loss": 0.485,
+ "step": 4659
+ },
+ {
+ "epoch": 2.203309692671395,
+ "grad_norm": 2.3169195652008057,
+ "learning_rate": 3.548844135530478e-06,
+ "loss": 0.4064,
+ "step": 4660
+ },
+ {
+ "epoch": 2.2037825059101657,
+ "grad_norm": 2.779240846633911,
+ "learning_rate": 3.5482778314721257e-06,
+ "loss": 0.427,
+ "step": 4661
+ },
+ {
+ "epoch": 2.204255319148936,
+ "grad_norm": 2.765423059463501,
+ "learning_rate": 3.5477114621421078e-06,
+ "loss": 0.5125,
+ "step": 4662
+ },
+ {
+ "epoch": 2.204728132387707,
+ "grad_norm": 2.5590033531188965,
+ "learning_rate": 3.5471450275756913e-06,
+ "loss": 0.4009,
+ "step": 4663
+ },
+ {
+ "epoch": 2.2052009456264776,
+ "grad_norm": 2.706068515777588,
+ "learning_rate": 3.546578527808146e-06,
+ "loss": 0.4604,
+ "step": 4664
+ },
+ {
+ "epoch": 2.2056737588652484,
+ "grad_norm": 2.7995102405548096,
+ "learning_rate": 3.546011962874745e-06,
+ "loss": 0.4088,
+ "step": 4665
+ },
+ {
+ "epoch": 2.2061465721040188,
+ "grad_norm": 2.6369729042053223,
+ "learning_rate": 3.5454453328107656e-06,
+ "loss": 0.4634,
+ "step": 4666
+ },
+ {
+ "epoch": 2.2066193853427896,
+ "grad_norm": 3.1426475048065186,
+ "learning_rate": 3.54487863765149e-06,
+ "loss": 0.4761,
+ "step": 4667
+ },
+ {
+ "epoch": 2.2070921985815604,
+ "grad_norm": 2.7739460468292236,
+ "learning_rate": 3.5443118774322027e-06,
+ "loss": 0.467,
+ "step": 4668
+ },
+ {
+ "epoch": 2.207565011820331,
+ "grad_norm": 2.559105157852173,
+ "learning_rate": 3.5437450521881934e-06,
+ "loss": 0.4268,
+ "step": 4669
+ },
+ {
+ "epoch": 2.2080378250591015,
+ "grad_norm": 2.726593017578125,
+ "learning_rate": 3.543178161954758e-06,
+ "loss": 0.462,
+ "step": 4670
+ },
+ {
+ "epoch": 2.2085106382978723,
+ "grad_norm": 2.796109199523926,
+ "learning_rate": 3.5426112067671907e-06,
+ "loss": 0.4571,
+ "step": 4671
+ },
+ {
+ "epoch": 2.208983451536643,
+ "grad_norm": 2.7989072799682617,
+ "learning_rate": 3.5420441866607964e-06,
+ "loss": 0.4648,
+ "step": 4672
+ },
+ {
+ "epoch": 2.209456264775414,
+ "grad_norm": 2.6750967502593994,
+ "learning_rate": 3.5414771016708795e-06,
+ "loss": 0.4717,
+ "step": 4673
+ },
+ {
+ "epoch": 2.2099290780141843,
+ "grad_norm": 2.705659866333008,
+ "learning_rate": 3.5409099518327507e-06,
+ "loss": 0.4738,
+ "step": 4674
+ },
+ {
+ "epoch": 2.210401891252955,
+ "grad_norm": 2.79276442527771,
+ "learning_rate": 3.5403427371817234e-06,
+ "loss": 0.4625,
+ "step": 4675
+ },
+ {
+ "epoch": 2.210874704491726,
+ "grad_norm": 2.781339406967163,
+ "learning_rate": 3.539775457753115e-06,
+ "loss": 0.438,
+ "step": 4676
+ },
+ {
+ "epoch": 2.2113475177304966,
+ "grad_norm": 3.0088918209075928,
+ "learning_rate": 3.5392081135822488e-06,
+ "loss": 0.4776,
+ "step": 4677
+ },
+ {
+ "epoch": 2.211820330969267,
+ "grad_norm": 3.0291390419006348,
+ "learning_rate": 3.538640704704449e-06,
+ "loss": 0.4634,
+ "step": 4678
+ },
+ {
+ "epoch": 2.212293144208038,
+ "grad_norm": 2.967867374420166,
+ "learning_rate": 3.5380732311550477e-06,
+ "loss": 0.4776,
+ "step": 4679
+ },
+ {
+ "epoch": 2.2127659574468086,
+ "grad_norm": 2.6268832683563232,
+ "learning_rate": 3.5375056929693787e-06,
+ "loss": 0.4646,
+ "step": 4680
+ },
+ {
+ "epoch": 2.2132387706855794,
+ "grad_norm": 2.6688554286956787,
+ "learning_rate": 3.536938090182778e-06,
+ "loss": 0.3975,
+ "step": 4681
+ },
+ {
+ "epoch": 2.2137115839243497,
+ "grad_norm": 3.0079736709594727,
+ "learning_rate": 3.5363704228305906e-06,
+ "loss": 0.4724,
+ "step": 4682
+ },
+ {
+ "epoch": 2.2141843971631205,
+ "grad_norm": 2.4287586212158203,
+ "learning_rate": 3.535802690948161e-06,
+ "loss": 0.4371,
+ "step": 4683
+ },
+ {
+ "epoch": 2.2146572104018913,
+ "grad_norm": 2.960679531097412,
+ "learning_rate": 3.53523489457084e-06,
+ "loss": 0.4347,
+ "step": 4684
+ },
+ {
+ "epoch": 2.215130023640662,
+ "grad_norm": 2.9646008014678955,
+ "learning_rate": 3.5346670337339807e-06,
+ "loss": 0.4803,
+ "step": 4685
+ },
+ {
+ "epoch": 2.2156028368794325,
+ "grad_norm": 3.0518898963928223,
+ "learning_rate": 3.534099108472942e-06,
+ "loss": 0.4712,
+ "step": 4686
+ },
+ {
+ "epoch": 2.2160756501182033,
+ "grad_norm": 2.776681900024414,
+ "learning_rate": 3.533531118823086e-06,
+ "loss": 0.4347,
+ "step": 4687
+ },
+ {
+ "epoch": 2.216548463356974,
+ "grad_norm": 2.18019437789917,
+ "learning_rate": 3.53296306481978e-06,
+ "loss": 0.3551,
+ "step": 4688
+ },
+ {
+ "epoch": 2.217021276595745,
+ "grad_norm": 2.9400811195373535,
+ "learning_rate": 3.5323949464983937e-06,
+ "loss": 0.4912,
+ "step": 4689
+ },
+ {
+ "epoch": 2.2174940898345152,
+ "grad_norm": 2.798386812210083,
+ "learning_rate": 3.5318267638943e-06,
+ "loss": 0.3967,
+ "step": 4690
+ },
+ {
+ "epoch": 2.217966903073286,
+ "grad_norm": 2.5452775955200195,
+ "learning_rate": 3.531258517042879e-06,
+ "loss": 0.3773,
+ "step": 4691
+ },
+ {
+ "epoch": 2.218439716312057,
+ "grad_norm": 2.711137294769287,
+ "learning_rate": 3.5306902059795113e-06,
+ "loss": 0.4123,
+ "step": 4692
+ },
+ {
+ "epoch": 2.2189125295508276,
+ "grad_norm": 3.0022387504577637,
+ "learning_rate": 3.530121830739584e-06,
+ "loss": 0.4898,
+ "step": 4693
+ },
+ {
+ "epoch": 2.219385342789598,
+ "grad_norm": 2.871814250946045,
+ "learning_rate": 3.5295533913584877e-06,
+ "loss": 0.4497,
+ "step": 4694
+ },
+ {
+ "epoch": 2.219858156028369,
+ "grad_norm": 2.9782521724700928,
+ "learning_rate": 3.528984887871616e-06,
+ "loss": 0.4797,
+ "step": 4695
+ },
+ {
+ "epoch": 2.2203309692671396,
+ "grad_norm": 2.6896398067474365,
+ "learning_rate": 3.5284163203143673e-06,
+ "loss": 0.439,
+ "step": 4696
+ },
+ {
+ "epoch": 2.2208037825059104,
+ "grad_norm": 2.7898833751678467,
+ "learning_rate": 3.5278476887221436e-06,
+ "loss": 0.4656,
+ "step": 4697
+ },
+ {
+ "epoch": 2.2212765957446807,
+ "grad_norm": 2.800416946411133,
+ "learning_rate": 3.527278993130352e-06,
+ "loss": 0.4452,
+ "step": 4698
+ },
+ {
+ "epoch": 2.2217494089834515,
+ "grad_norm": 3.653228998184204,
+ "learning_rate": 3.526710233574401e-06,
+ "loss": 0.4189,
+ "step": 4699
+ },
+ {
+ "epoch": 2.2222222222222223,
+ "grad_norm": 2.856956958770752,
+ "learning_rate": 3.5261414100897064e-06,
+ "loss": 0.4298,
+ "step": 4700
+ },
+ {
+ "epoch": 2.222695035460993,
+ "grad_norm": 2.8576223850250244,
+ "learning_rate": 3.5255725227116854e-06,
+ "loss": 0.4425,
+ "step": 4701
+ },
+ {
+ "epoch": 2.2231678486997635,
+ "grad_norm": 3.1161351203918457,
+ "learning_rate": 3.5250035714757603e-06,
+ "loss": 0.4609,
+ "step": 4702
+ },
+ {
+ "epoch": 2.2236406619385343,
+ "grad_norm": 2.843379259109497,
+ "learning_rate": 3.5244345564173578e-06,
+ "loss": 0.3589,
+ "step": 4703
+ },
+ {
+ "epoch": 2.224113475177305,
+ "grad_norm": 2.877157211303711,
+ "learning_rate": 3.5238654775719068e-06,
+ "loss": 0.4591,
+ "step": 4704
+ },
+ {
+ "epoch": 2.2245862884160754,
+ "grad_norm": 3.488954782485962,
+ "learning_rate": 3.5232963349748424e-06,
+ "loss": 0.4836,
+ "step": 4705
+ },
+ {
+ "epoch": 2.225059101654846,
+ "grad_norm": 2.929037570953369,
+ "learning_rate": 3.5227271286616025e-06,
+ "loss": 0.5293,
+ "step": 4706
+ },
+ {
+ "epoch": 2.225531914893617,
+ "grad_norm": 2.6230576038360596,
+ "learning_rate": 3.5221578586676286e-06,
+ "loss": 0.4235,
+ "step": 4707
+ },
+ {
+ "epoch": 2.226004728132388,
+ "grad_norm": 2.529998302459717,
+ "learning_rate": 3.5215885250283664e-06,
+ "loss": 0.4369,
+ "step": 4708
+ },
+ {
+ "epoch": 2.2264775413711586,
+ "grad_norm": 2.817279577255249,
+ "learning_rate": 3.521019127779267e-06,
+ "loss": 0.481,
+ "step": 4709
+ },
+ {
+ "epoch": 2.226950354609929,
+ "grad_norm": 3.1513843536376953,
+ "learning_rate": 3.5204496669557833e-06,
+ "loss": 0.463,
+ "step": 4710
+ },
+ {
+ "epoch": 2.2274231678486998,
+ "grad_norm": 2.9403610229492188,
+ "learning_rate": 3.5198801425933725e-06,
+ "loss": 0.455,
+ "step": 4711
+ },
+ {
+ "epoch": 2.2278959810874706,
+ "grad_norm": 2.648346424102783,
+ "learning_rate": 3.5193105547274987e-06,
+ "loss": 0.4441,
+ "step": 4712
+ },
+ {
+ "epoch": 2.228368794326241,
+ "grad_norm": 2.791898727416992,
+ "learning_rate": 3.5187409033936252e-06,
+ "loss": 0.4682,
+ "step": 4713
+ },
+ {
+ "epoch": 2.2288416075650117,
+ "grad_norm": 2.8157432079315186,
+ "learning_rate": 3.5181711886272242e-06,
+ "loss": 0.4572,
+ "step": 4714
+ },
+ {
+ "epoch": 2.2293144208037825,
+ "grad_norm": 3.250319480895996,
+ "learning_rate": 3.5176014104637665e-06,
+ "loss": 0.4599,
+ "step": 4715
+ },
+ {
+ "epoch": 2.2297872340425533,
+ "grad_norm": 2.6747050285339355,
+ "learning_rate": 3.5170315689387307e-06,
+ "loss": 0.4328,
+ "step": 4716
+ },
+ {
+ "epoch": 2.230260047281324,
+ "grad_norm": 2.584094762802124,
+ "learning_rate": 3.5164616640875993e-06,
+ "loss": 0.4268,
+ "step": 4717
+ },
+ {
+ "epoch": 2.2307328605200945,
+ "grad_norm": 2.480710506439209,
+ "learning_rate": 3.5158916959458573e-06,
+ "loss": 0.438,
+ "step": 4718
+ },
+ {
+ "epoch": 2.2312056737588652,
+ "grad_norm": 2.9338483810424805,
+ "learning_rate": 3.515321664548993e-06,
+ "loss": 0.4937,
+ "step": 4719
+ },
+ {
+ "epoch": 2.231678486997636,
+ "grad_norm": 2.7880783081054688,
+ "learning_rate": 3.5147515699325013e-06,
+ "loss": 0.4624,
+ "step": 4720
+ },
+ {
+ "epoch": 2.2321513002364064,
+ "grad_norm": 2.740841865539551,
+ "learning_rate": 3.5141814121318797e-06,
+ "loss": 0.3689,
+ "step": 4721
+ },
+ {
+ "epoch": 2.232624113475177,
+ "grad_norm": 2.9541244506835938,
+ "learning_rate": 3.5136111911826277e-06,
+ "loss": 0.4092,
+ "step": 4722
+ },
+ {
+ "epoch": 2.233096926713948,
+ "grad_norm": 2.7205398082733154,
+ "learning_rate": 3.5130409071202515e-06,
+ "loss": 0.445,
+ "step": 4723
+ },
+ {
+ "epoch": 2.233569739952719,
+ "grad_norm": 2.563406229019165,
+ "learning_rate": 3.51247055998026e-06,
+ "loss": 0.4335,
+ "step": 4724
+ },
+ {
+ "epoch": 2.2340425531914896,
+ "grad_norm": 2.4249489307403564,
+ "learning_rate": 3.5119001497981666e-06,
+ "loss": 0.4671,
+ "step": 4725
+ },
+ {
+ "epoch": 2.23451536643026,
+ "grad_norm": 2.711630344390869,
+ "learning_rate": 3.5113296766094875e-06,
+ "loss": 0.4177,
+ "step": 4726
+ },
+ {
+ "epoch": 2.2349881796690307,
+ "grad_norm": 3.0257632732391357,
+ "learning_rate": 3.5107591404497443e-06,
+ "loss": 0.4976,
+ "step": 4727
+ },
+ {
+ "epoch": 2.2354609929078015,
+ "grad_norm": 2.717303991317749,
+ "learning_rate": 3.5101885413544614e-06,
+ "loss": 0.4621,
+ "step": 4728
+ },
+ {
+ "epoch": 2.235933806146572,
+ "grad_norm": 3.2846004962921143,
+ "learning_rate": 3.509617879359167e-06,
+ "loss": 0.4284,
+ "step": 4729
+ },
+ {
+ "epoch": 2.2364066193853427,
+ "grad_norm": 2.7217819690704346,
+ "learning_rate": 3.5090471544993953e-06,
+ "loss": 0.4247,
+ "step": 4730
+ },
+ {
+ "epoch": 2.2368794326241135,
+ "grad_norm": 2.5003223419189453,
+ "learning_rate": 3.5084763668106812e-06,
+ "loss": 0.4096,
+ "step": 4731
+ },
+ {
+ "epoch": 2.2373522458628843,
+ "grad_norm": 2.7312731742858887,
+ "learning_rate": 3.5079055163285658e-06,
+ "loss": 0.4741,
+ "step": 4732
+ },
+ {
+ "epoch": 2.237825059101655,
+ "grad_norm": 2.84940767288208,
+ "learning_rate": 3.5073346030885934e-06,
+ "loss": 0.4887,
+ "step": 4733
+ },
+ {
+ "epoch": 2.2382978723404254,
+ "grad_norm": 3.1188511848449707,
+ "learning_rate": 3.506763627126313e-06,
+ "loss": 0.5335,
+ "step": 4734
+ },
+ {
+ "epoch": 2.2387706855791962,
+ "grad_norm": 2.6741397380828857,
+ "learning_rate": 3.5061925884772753e-06,
+ "loss": 0.4137,
+ "step": 4735
+ },
+ {
+ "epoch": 2.239243498817967,
+ "grad_norm": 3.1542465686798096,
+ "learning_rate": 3.505621487177037e-06,
+ "loss": 0.5303,
+ "step": 4736
+ },
+ {
+ "epoch": 2.2397163120567374,
+ "grad_norm": 5.448268890380859,
+ "learning_rate": 3.505050323261159e-06,
+ "loss": 0.4995,
+ "step": 4737
+ },
+ {
+ "epoch": 2.240189125295508,
+ "grad_norm": 2.7317898273468018,
+ "learning_rate": 3.5044790967652037e-06,
+ "loss": 0.4595,
+ "step": 4738
+ },
+ {
+ "epoch": 2.240661938534279,
+ "grad_norm": 2.8135695457458496,
+ "learning_rate": 3.50390780772474e-06,
+ "loss": 0.4593,
+ "step": 4739
+ },
+ {
+ "epoch": 2.2411347517730498,
+ "grad_norm": 3.1391844749450684,
+ "learning_rate": 3.5033364561753393e-06,
+ "loss": 0.4902,
+ "step": 4740
+ },
+ {
+ "epoch": 2.24160756501182,
+ "grad_norm": 2.6383132934570312,
+ "learning_rate": 3.5027650421525762e-06,
+ "loss": 0.3832,
+ "step": 4741
+ },
+ {
+ "epoch": 2.242080378250591,
+ "grad_norm": 2.742546558380127,
+ "learning_rate": 3.5021935656920314e-06,
+ "loss": 0.4012,
+ "step": 4742
+ },
+ {
+ "epoch": 2.2425531914893617,
+ "grad_norm": 3.1243674755096436,
+ "learning_rate": 3.5016220268292873e-06,
+ "loss": 0.4271,
+ "step": 4743
+ },
+ {
+ "epoch": 2.2430260047281325,
+ "grad_norm": 2.794717788696289,
+ "learning_rate": 3.501050425599932e-06,
+ "loss": 0.4604,
+ "step": 4744
+ },
+ {
+ "epoch": 2.243498817966903,
+ "grad_norm": 2.8481621742248535,
+ "learning_rate": 3.5004787620395565e-06,
+ "loss": 0.4814,
+ "step": 4745
+ },
+ {
+ "epoch": 2.2439716312056737,
+ "grad_norm": 2.8842051029205322,
+ "learning_rate": 3.499907036183755e-06,
+ "loss": 0.4987,
+ "step": 4746
+ },
+ {
+ "epoch": 2.2444444444444445,
+ "grad_norm": 3.074805974960327,
+ "learning_rate": 3.4993352480681265e-06,
+ "loss": 0.4966,
+ "step": 4747
+ },
+ {
+ "epoch": 2.2449172576832153,
+ "grad_norm": 2.7204246520996094,
+ "learning_rate": 3.4987633977282742e-06,
+ "loss": 0.4,
+ "step": 4748
+ },
+ {
+ "epoch": 2.2453900709219856,
+ "grad_norm": 2.685884952545166,
+ "learning_rate": 3.4981914851998055e-06,
+ "loss": 0.4285,
+ "step": 4749
+ },
+ {
+ "epoch": 2.2458628841607564,
+ "grad_norm": 2.1666336059570312,
+ "learning_rate": 3.4976195105183287e-06,
+ "loss": 0.3756,
+ "step": 4750
+ },
+ {
+ "epoch": 2.246335697399527,
+ "grad_norm": 2.863006353378296,
+ "learning_rate": 3.49704747371946e-06,
+ "loss": 0.4535,
+ "step": 4751
+ },
+ {
+ "epoch": 2.246808510638298,
+ "grad_norm": 2.5558736324310303,
+ "learning_rate": 3.496475374838817e-06,
+ "loss": 0.4129,
+ "step": 4752
+ },
+ {
+ "epoch": 2.2472813238770684,
+ "grad_norm": 2.9780309200286865,
+ "learning_rate": 3.495903213912022e-06,
+ "loss": 0.4871,
+ "step": 4753
+ },
+ {
+ "epoch": 2.247754137115839,
+ "grad_norm": 2.951779365539551,
+ "learning_rate": 3.4953309909747e-06,
+ "loss": 0.5162,
+ "step": 4754
+ },
+ {
+ "epoch": 2.24822695035461,
+ "grad_norm": 2.7654693126678467,
+ "learning_rate": 3.4947587060624834e-06,
+ "loss": 0.4662,
+ "step": 4755
+ },
+ {
+ "epoch": 2.2486997635933808,
+ "grad_norm": 2.708247184753418,
+ "learning_rate": 3.494186359211002e-06,
+ "loss": 0.4279,
+ "step": 4756
+ },
+ {
+ "epoch": 2.249172576832151,
+ "grad_norm": 3.09916615486145,
+ "learning_rate": 3.4936139504558963e-06,
+ "loss": 0.4085,
+ "step": 4757
+ },
+ {
+ "epoch": 2.249645390070922,
+ "grad_norm": 2.913806200027466,
+ "learning_rate": 3.493041479832807e-06,
+ "loss": 0.4653,
+ "step": 4758
+ },
+ {
+ "epoch": 2.2501182033096927,
+ "grad_norm": 3.2903928756713867,
+ "learning_rate": 3.4924689473773787e-06,
+ "loss": 0.5167,
+ "step": 4759
+ },
+ {
+ "epoch": 2.2505910165484635,
+ "grad_norm": 3.1302902698516846,
+ "learning_rate": 3.4918963531252607e-06,
+ "loss": 0.5398,
+ "step": 4760
+ },
+ {
+ "epoch": 2.251063829787234,
+ "grad_norm": 2.8858273029327393,
+ "learning_rate": 3.4913236971121063e-06,
+ "loss": 0.4395,
+ "step": 4761
+ },
+ {
+ "epoch": 2.2515366430260046,
+ "grad_norm": 3.194521903991699,
+ "learning_rate": 3.4907509793735727e-06,
+ "loss": 0.5258,
+ "step": 4762
+ },
+ {
+ "epoch": 2.2520094562647754,
+ "grad_norm": 2.8640544414520264,
+ "learning_rate": 3.49017819994532e-06,
+ "loss": 0.4073,
+ "step": 4763
+ },
+ {
+ "epoch": 2.2524822695035462,
+ "grad_norm": 3.139995813369751,
+ "learning_rate": 3.489605358863011e-06,
+ "loss": 0.4653,
+ "step": 4764
+ },
+ {
+ "epoch": 2.2529550827423166,
+ "grad_norm": 2.6228537559509277,
+ "learning_rate": 3.489032456162317e-06,
+ "loss": 0.4546,
+ "step": 4765
+ },
+ {
+ "epoch": 2.2534278959810874,
+ "grad_norm": 2.8197672367095947,
+ "learning_rate": 3.4884594918789083e-06,
+ "loss": 0.479,
+ "step": 4766
+ },
+ {
+ "epoch": 2.253900709219858,
+ "grad_norm": 2.7839298248291016,
+ "learning_rate": 3.4878864660484612e-06,
+ "loss": 0.5081,
+ "step": 4767
+ },
+ {
+ "epoch": 2.254373522458629,
+ "grad_norm": 2.8630709648132324,
+ "learning_rate": 3.487313378706656e-06,
+ "loss": 0.4345,
+ "step": 4768
+ },
+ {
+ "epoch": 2.2548463356973993,
+ "grad_norm": 2.5661563873291016,
+ "learning_rate": 3.4867402298891755e-06,
+ "loss": 0.4266,
+ "step": 4769
+ },
+ {
+ "epoch": 2.25531914893617,
+ "grad_norm": 2.6274025440216064,
+ "learning_rate": 3.4861670196317084e-06,
+ "loss": 0.4645,
+ "step": 4770
+ },
+ {
+ "epoch": 2.255791962174941,
+ "grad_norm": 2.578702449798584,
+ "learning_rate": 3.485593747969944e-06,
+ "loss": 0.4242,
+ "step": 4771
+ },
+ {
+ "epoch": 2.2562647754137117,
+ "grad_norm": 2.322476625442505,
+ "learning_rate": 3.48502041493958e-06,
+ "loss": 0.3975,
+ "step": 4772
+ },
+ {
+ "epoch": 2.256737588652482,
+ "grad_norm": 2.8412630558013916,
+ "learning_rate": 3.484447020576313e-06,
+ "loss": 0.4276,
+ "step": 4773
+ },
+ {
+ "epoch": 2.257210401891253,
+ "grad_norm": 2.6090497970581055,
+ "learning_rate": 3.483873564915847e-06,
+ "loss": 0.429,
+ "step": 4774
+ },
+ {
+ "epoch": 2.2576832151300237,
+ "grad_norm": 2.692458152770996,
+ "learning_rate": 3.4833000479938877e-06,
+ "loss": 0.4211,
+ "step": 4775
+ },
+ {
+ "epoch": 2.2581560283687945,
+ "grad_norm": 2.5546815395355225,
+ "learning_rate": 3.482726469846146e-06,
+ "loss": 0.4751,
+ "step": 4776
+ },
+ {
+ "epoch": 2.258628841607565,
+ "grad_norm": 2.8409626483917236,
+ "learning_rate": 3.4821528305083376e-06,
+ "loss": 0.4821,
+ "step": 4777
+ },
+ {
+ "epoch": 2.2591016548463356,
+ "grad_norm": 2.722966432571411,
+ "learning_rate": 3.4815791300161785e-06,
+ "loss": 0.5029,
+ "step": 4778
+ },
+ {
+ "epoch": 2.2595744680851064,
+ "grad_norm": 2.691603899002075,
+ "learning_rate": 3.48100536840539e-06,
+ "loss": 0.4242,
+ "step": 4779
+ },
+ {
+ "epoch": 2.260047281323877,
+ "grad_norm": 2.64035964012146,
+ "learning_rate": 3.4804315457116992e-06,
+ "loss": 0.4033,
+ "step": 4780
+ },
+ {
+ "epoch": 2.2605200945626476,
+ "grad_norm": 2.758819580078125,
+ "learning_rate": 3.4798576619708357e-06,
+ "loss": 0.4321,
+ "step": 4781
+ },
+ {
+ "epoch": 2.2609929078014184,
+ "grad_norm": 2.8204405307769775,
+ "learning_rate": 3.4792837172185324e-06,
+ "loss": 0.4309,
+ "step": 4782
+ },
+ {
+ "epoch": 2.261465721040189,
+ "grad_norm": 2.529771327972412,
+ "learning_rate": 3.478709711490525e-06,
+ "loss": 0.4398,
+ "step": 4783
+ },
+ {
+ "epoch": 2.26193853427896,
+ "grad_norm": 2.8156251907348633,
+ "learning_rate": 3.4781356448225557e-06,
+ "loss": 0.447,
+ "step": 4784
+ },
+ {
+ "epoch": 2.2624113475177303,
+ "grad_norm": 2.689528703689575,
+ "learning_rate": 3.477561517250369e-06,
+ "loss": 0.3907,
+ "step": 4785
+ },
+ {
+ "epoch": 2.262884160756501,
+ "grad_norm": 2.9148027896881104,
+ "learning_rate": 3.476987328809713e-06,
+ "loss": 0.4287,
+ "step": 4786
+ },
+ {
+ "epoch": 2.263356973995272,
+ "grad_norm": 2.933021306991577,
+ "learning_rate": 3.4764130795363404e-06,
+ "loss": 0.4847,
+ "step": 4787
+ },
+ {
+ "epoch": 2.2638297872340427,
+ "grad_norm": 2.8559257984161377,
+ "learning_rate": 3.4758387694660064e-06,
+ "loss": 0.4554,
+ "step": 4788
+ },
+ {
+ "epoch": 2.264302600472813,
+ "grad_norm": 3.0355522632598877,
+ "learning_rate": 3.4752643986344707e-06,
+ "loss": 0.4286,
+ "step": 4789
+ },
+ {
+ "epoch": 2.264775413711584,
+ "grad_norm": 2.9768362045288086,
+ "learning_rate": 3.474689967077498e-06,
+ "loss": 0.4917,
+ "step": 4790
+ },
+ {
+ "epoch": 2.2652482269503547,
+ "grad_norm": 2.827971935272217,
+ "learning_rate": 3.474115474830855e-06,
+ "loss": 0.4542,
+ "step": 4791
+ },
+ {
+ "epoch": 2.2657210401891255,
+ "grad_norm": 2.559659719467163,
+ "learning_rate": 3.4735409219303123e-06,
+ "loss": 0.4168,
+ "step": 4792
+ },
+ {
+ "epoch": 2.266193853427896,
+ "grad_norm": 2.3172824382781982,
+ "learning_rate": 3.472966308411645e-06,
+ "loss": 0.3535,
+ "step": 4793
+ },
+ {
+ "epoch": 2.2666666666666666,
+ "grad_norm": 2.6779656410217285,
+ "learning_rate": 3.4723916343106327e-06,
+ "loss": 0.4599,
+ "step": 4794
+ },
+ {
+ "epoch": 2.2671394799054374,
+ "grad_norm": 2.55780291557312,
+ "learning_rate": 3.4718168996630573e-06,
+ "loss": 0.4185,
+ "step": 4795
+ },
+ {
+ "epoch": 2.267612293144208,
+ "grad_norm": 2.4929800033569336,
+ "learning_rate": 3.471242104504704e-06,
+ "loss": 0.4008,
+ "step": 4796
+ },
+ {
+ "epoch": 2.2680851063829786,
+ "grad_norm": 2.849475145339966,
+ "learning_rate": 3.4706672488713642e-06,
+ "loss": 0.396,
+ "step": 4797
+ },
+ {
+ "epoch": 2.2685579196217494,
+ "grad_norm": 2.4830739498138428,
+ "learning_rate": 3.4700923327988306e-06,
+ "loss": 0.4087,
+ "step": 4798
+ },
+ {
+ "epoch": 2.26903073286052,
+ "grad_norm": 3.2748119831085205,
+ "learning_rate": 3.469517356322901e-06,
+ "loss": 0.4496,
+ "step": 4799
+ },
+ {
+ "epoch": 2.269503546099291,
+ "grad_norm": 3.0440170764923096,
+ "learning_rate": 3.468942319479378e-06,
+ "loss": 0.4903,
+ "step": 4800
+ },
+ {
+ "epoch": 2.2699763593380613,
+ "grad_norm": 2.8200504779815674,
+ "learning_rate": 3.4683672223040645e-06,
+ "loss": 0.4588,
+ "step": 4801
+ },
+ {
+ "epoch": 2.270449172576832,
+ "grad_norm": 2.675206184387207,
+ "learning_rate": 3.4677920648327707e-06,
+ "loss": 0.4257,
+ "step": 4802
+ },
+ {
+ "epoch": 2.270921985815603,
+ "grad_norm": 2.862675905227661,
+ "learning_rate": 3.4672168471013084e-06,
+ "loss": 0.466,
+ "step": 4803
+ },
+ {
+ "epoch": 2.2713947990543737,
+ "grad_norm": 2.65663743019104,
+ "learning_rate": 3.4666415691454947e-06,
+ "loss": 0.4784,
+ "step": 4804
+ },
+ {
+ "epoch": 2.271867612293144,
+ "grad_norm": 2.5610506534576416,
+ "learning_rate": 3.4660662310011483e-06,
+ "loss": 0.4429,
+ "step": 4805
+ },
+ {
+ "epoch": 2.272340425531915,
+ "grad_norm": 2.6459643840789795,
+ "learning_rate": 3.465490832704094e-06,
+ "loss": 0.4345,
+ "step": 4806
+ },
+ {
+ "epoch": 2.2728132387706856,
+ "grad_norm": 2.426013469696045,
+ "learning_rate": 3.4649153742901585e-06,
+ "loss": 0.4533,
+ "step": 4807
+ },
+ {
+ "epoch": 2.2732860520094564,
+ "grad_norm": 2.6714842319488525,
+ "learning_rate": 3.4643398557951745e-06,
+ "loss": 0.4409,
+ "step": 4808
+ },
+ {
+ "epoch": 2.273758865248227,
+ "grad_norm": 2.703629493713379,
+ "learning_rate": 3.463764277254976e-06,
+ "loss": 0.3656,
+ "step": 4809
+ },
+ {
+ "epoch": 2.2742316784869976,
+ "grad_norm": 2.811753988265991,
+ "learning_rate": 3.4631886387054025e-06,
+ "loss": 0.4957,
+ "step": 4810
+ },
+ {
+ "epoch": 2.2747044917257684,
+ "grad_norm": 2.9469289779663086,
+ "learning_rate": 3.462612940182295e-06,
+ "loss": 0.4582,
+ "step": 4811
+ },
+ {
+ "epoch": 2.275177304964539,
+ "grad_norm": 2.6287801265716553,
+ "learning_rate": 3.462037181721501e-06,
+ "loss": 0.4072,
+ "step": 4812
+ },
+ {
+ "epoch": 2.2756501182033095,
+ "grad_norm": 2.7104952335357666,
+ "learning_rate": 3.46146136335887e-06,
+ "loss": 0.4998,
+ "step": 4813
+ },
+ {
+ "epoch": 2.2761229314420803,
+ "grad_norm": 3.170363187789917,
+ "learning_rate": 3.460885485130256e-06,
+ "loss": 0.4722,
+ "step": 4814
+ },
+ {
+ "epoch": 2.276595744680851,
+ "grad_norm": 2.7315151691436768,
+ "learning_rate": 3.460309547071516e-06,
+ "loss": 0.4482,
+ "step": 4815
+ },
+ {
+ "epoch": 2.277068557919622,
+ "grad_norm": 2.685988187789917,
+ "learning_rate": 3.4597335492185113e-06,
+ "loss": 0.4419,
+ "step": 4816
+ },
+ {
+ "epoch": 2.2775413711583923,
+ "grad_norm": 2.532790184020996,
+ "learning_rate": 3.459157491607107e-06,
+ "loss": 0.3961,
+ "step": 4817
+ },
+ {
+ "epoch": 2.278014184397163,
+ "grad_norm": 2.920729875564575,
+ "learning_rate": 3.458581374273171e-06,
+ "loss": 0.4767,
+ "step": 4818
+ },
+ {
+ "epoch": 2.278486997635934,
+ "grad_norm": 3.2481250762939453,
+ "learning_rate": 3.458005197252577e-06,
+ "loss": 0.4985,
+ "step": 4819
+ },
+ {
+ "epoch": 2.2789598108747047,
+ "grad_norm": 2.373809814453125,
+ "learning_rate": 3.4574289605811994e-06,
+ "loss": 0.4259,
+ "step": 4820
+ },
+ {
+ "epoch": 2.279432624113475,
+ "grad_norm": 2.7851033210754395,
+ "learning_rate": 3.4568526642949184e-06,
+ "loss": 0.4829,
+ "step": 4821
+ },
+ {
+ "epoch": 2.279905437352246,
+ "grad_norm": 2.9777133464813232,
+ "learning_rate": 3.456276308429618e-06,
+ "loss": 0.4896,
+ "step": 4822
+ },
+ {
+ "epoch": 2.2803782505910166,
+ "grad_norm": 2.7922022342681885,
+ "learning_rate": 3.4556998930211853e-06,
+ "loss": 0.4908,
+ "step": 4823
+ },
+ {
+ "epoch": 2.2808510638297874,
+ "grad_norm": 2.699180841445923,
+ "learning_rate": 3.4551234181055104e-06,
+ "loss": 0.4518,
+ "step": 4824
+ },
+ {
+ "epoch": 2.2813238770685578,
+ "grad_norm": 3.1200520992279053,
+ "learning_rate": 3.4545468837184885e-06,
+ "loss": 0.4877,
+ "step": 4825
+ },
+ {
+ "epoch": 2.2817966903073286,
+ "grad_norm": 2.56782603263855,
+ "learning_rate": 3.453970289896018e-06,
+ "loss": 0.4281,
+ "step": 4826
+ },
+ {
+ "epoch": 2.2822695035460994,
+ "grad_norm": 3.241356372833252,
+ "learning_rate": 3.4533936366740007e-06,
+ "loss": 0.4338,
+ "step": 4827
+ },
+ {
+ "epoch": 2.28274231678487,
+ "grad_norm": 3.560295343399048,
+ "learning_rate": 3.452816924088342e-06,
+ "loss": 0.4121,
+ "step": 4828
+ },
+ {
+ "epoch": 2.2832151300236405,
+ "grad_norm": 2.8512449264526367,
+ "learning_rate": 3.452240152174951e-06,
+ "loss": 0.4357,
+ "step": 4829
+ },
+ {
+ "epoch": 2.2836879432624113,
+ "grad_norm": 3.0332651138305664,
+ "learning_rate": 3.4516633209697408e-06,
+ "loss": 0.4985,
+ "step": 4830
+ },
+ {
+ "epoch": 2.284160756501182,
+ "grad_norm": 2.520930528640747,
+ "learning_rate": 3.451086430508629e-06,
+ "loss": 0.4021,
+ "step": 4831
+ },
+ {
+ "epoch": 2.284633569739953,
+ "grad_norm": 2.508227825164795,
+ "learning_rate": 3.4505094808275363e-06,
+ "loss": 0.3935,
+ "step": 4832
+ },
+ {
+ "epoch": 2.2851063829787233,
+ "grad_norm": 2.56752610206604,
+ "learning_rate": 3.449932471962385e-06,
+ "loss": 0.4689,
+ "step": 4833
+ },
+ {
+ "epoch": 2.285579196217494,
+ "grad_norm": 2.7757534980773926,
+ "learning_rate": 3.449355403949105e-06,
+ "loss": 0.4565,
+ "step": 4834
+ },
+ {
+ "epoch": 2.286052009456265,
+ "grad_norm": 3.364821195602417,
+ "learning_rate": 3.448778276823626e-06,
+ "loss": 0.4729,
+ "step": 4835
+ },
+ {
+ "epoch": 2.2865248226950357,
+ "grad_norm": 3.0045557022094727,
+ "learning_rate": 3.448201090621884e-06,
+ "loss": 0.4834,
+ "step": 4836
+ },
+ {
+ "epoch": 2.286997635933806,
+ "grad_norm": 2.9451794624328613,
+ "learning_rate": 3.4476238453798183e-06,
+ "loss": 0.489,
+ "step": 4837
+ },
+ {
+ "epoch": 2.287470449172577,
+ "grad_norm": 2.8307435512542725,
+ "learning_rate": 3.4470465411333708e-06,
+ "loss": 0.5079,
+ "step": 4838
+ },
+ {
+ "epoch": 2.2879432624113476,
+ "grad_norm": 2.7118136882781982,
+ "learning_rate": 3.4464691779184876e-06,
+ "loss": 0.4794,
+ "step": 4839
+ },
+ {
+ "epoch": 2.2884160756501184,
+ "grad_norm": 2.6724441051483154,
+ "learning_rate": 3.445891755771119e-06,
+ "loss": 0.4619,
+ "step": 4840
+ },
+ {
+ "epoch": 2.2888888888888888,
+ "grad_norm": 2.8161258697509766,
+ "learning_rate": 3.445314274727218e-06,
+ "loss": 0.4287,
+ "step": 4841
+ },
+ {
+ "epoch": 2.2893617021276595,
+ "grad_norm": 2.5681750774383545,
+ "learning_rate": 3.4447367348227433e-06,
+ "loss": 0.4167,
+ "step": 4842
+ },
+ {
+ "epoch": 2.2898345153664303,
+ "grad_norm": 2.8136284351348877,
+ "learning_rate": 3.444159136093654e-06,
+ "loss": 0.4195,
+ "step": 4843
+ },
+ {
+ "epoch": 2.290307328605201,
+ "grad_norm": 3.153651714324951,
+ "learning_rate": 3.443581478575915e-06,
+ "loss": 0.4821,
+ "step": 4844
+ },
+ {
+ "epoch": 2.2907801418439715,
+ "grad_norm": 2.980883836746216,
+ "learning_rate": 3.4430037623054953e-06,
+ "loss": 0.4627,
+ "step": 4845
+ },
+ {
+ "epoch": 2.2912529550827423,
+ "grad_norm": 2.786182403564453,
+ "learning_rate": 3.4424259873183664e-06,
+ "loss": 0.4342,
+ "step": 4846
+ },
+ {
+ "epoch": 2.291725768321513,
+ "grad_norm": 2.8938279151916504,
+ "learning_rate": 3.4418481536505026e-06,
+ "loss": 0.3997,
+ "step": 4847
+ },
+ {
+ "epoch": 2.2921985815602834,
+ "grad_norm": 2.5534510612487793,
+ "learning_rate": 3.4412702613378844e-06,
+ "loss": 0.3982,
+ "step": 4848
+ },
+ {
+ "epoch": 2.2926713947990542,
+ "grad_norm": 2.7907063961029053,
+ "learning_rate": 3.4406923104164956e-06,
+ "loss": 0.4484,
+ "step": 4849
+ },
+ {
+ "epoch": 2.293144208037825,
+ "grad_norm": 3.162702798843384,
+ "learning_rate": 3.4401143009223203e-06,
+ "loss": 0.4528,
+ "step": 4850
+ },
+ {
+ "epoch": 2.293617021276596,
+ "grad_norm": 2.4647393226623535,
+ "learning_rate": 3.4395362328913505e-06,
+ "loss": 0.3759,
+ "step": 4851
+ },
+ {
+ "epoch": 2.2940898345153666,
+ "grad_norm": 2.8219876289367676,
+ "learning_rate": 3.438958106359579e-06,
+ "loss": 0.4903,
+ "step": 4852
+ },
+ {
+ "epoch": 2.294562647754137,
+ "grad_norm": 2.827073097229004,
+ "learning_rate": 3.438379921363003e-06,
+ "loss": 0.4315,
+ "step": 4853
+ },
+ {
+ "epoch": 2.295035460992908,
+ "grad_norm": 2.472470283508301,
+ "learning_rate": 3.4378016779376244e-06,
+ "loss": 0.4478,
+ "step": 4854
+ },
+ {
+ "epoch": 2.2955082742316786,
+ "grad_norm": 3.3994734287261963,
+ "learning_rate": 3.4372233761194473e-06,
+ "loss": 0.5086,
+ "step": 4855
+ },
+ {
+ "epoch": 2.295981087470449,
+ "grad_norm": 3.030465602874756,
+ "learning_rate": 3.4366450159444796e-06,
+ "loss": 0.4159,
+ "step": 4856
+ },
+ {
+ "epoch": 2.2964539007092197,
+ "grad_norm": 2.5460705757141113,
+ "learning_rate": 3.4360665974487346e-06,
+ "loss": 0.4097,
+ "step": 4857
+ },
+ {
+ "epoch": 2.2969267139479905,
+ "grad_norm": 2.884469509124756,
+ "learning_rate": 3.4354881206682273e-06,
+ "loss": 0.4478,
+ "step": 4858
+ },
+ {
+ "epoch": 2.2973995271867613,
+ "grad_norm": 2.5139710903167725,
+ "learning_rate": 3.4349095856389765e-06,
+ "loss": 0.4286,
+ "step": 4859
+ },
+ {
+ "epoch": 2.297872340425532,
+ "grad_norm": 3.1628260612487793,
+ "learning_rate": 3.4343309923970053e-06,
+ "loss": 0.4617,
+ "step": 4860
+ },
+ {
+ "epoch": 2.2983451536643025,
+ "grad_norm": 2.6141695976257324,
+ "learning_rate": 3.4337523409783395e-06,
+ "loss": 0.3841,
+ "step": 4861
+ },
+ {
+ "epoch": 2.2988179669030733,
+ "grad_norm": 2.766834259033203,
+ "learning_rate": 3.43317363141901e-06,
+ "loss": 0.4484,
+ "step": 4862
+ },
+ {
+ "epoch": 2.299290780141844,
+ "grad_norm": 2.785491943359375,
+ "learning_rate": 3.4325948637550503e-06,
+ "loss": 0.4363,
+ "step": 4863
+ },
+ {
+ "epoch": 2.2997635933806144,
+ "grad_norm": 2.624929189682007,
+ "learning_rate": 3.4320160380224988e-06,
+ "loss": 0.4518,
+ "step": 4864
+ },
+ {
+ "epoch": 2.300236406619385,
+ "grad_norm": 2.895413398742676,
+ "learning_rate": 3.4314371542573944e-06,
+ "loss": 0.4745,
+ "step": 4865
+ },
+ {
+ "epoch": 2.300709219858156,
+ "grad_norm": 2.603816270828247,
+ "learning_rate": 3.430858212495783e-06,
+ "loss": 0.4444,
+ "step": 4866
+ },
+ {
+ "epoch": 2.301182033096927,
+ "grad_norm": 3.387360095977783,
+ "learning_rate": 3.4302792127737116e-06,
+ "loss": 0.4169,
+ "step": 4867
+ },
+ {
+ "epoch": 2.3016548463356976,
+ "grad_norm": 2.894054651260376,
+ "learning_rate": 3.4297001551272334e-06,
+ "loss": 0.4493,
+ "step": 4868
+ },
+ {
+ "epoch": 2.302127659574468,
+ "grad_norm": 3.0432028770446777,
+ "learning_rate": 3.4291210395924035e-06,
+ "loss": 0.4854,
+ "step": 4869
+ },
+ {
+ "epoch": 2.3026004728132388,
+ "grad_norm": 2.5144734382629395,
+ "learning_rate": 3.42854186620528e-06,
+ "loss": 0.4556,
+ "step": 4870
+ },
+ {
+ "epoch": 2.3030732860520096,
+ "grad_norm": 2.964812755584717,
+ "learning_rate": 3.427962635001926e-06,
+ "loss": 0.495,
+ "step": 4871
+ },
+ {
+ "epoch": 2.30354609929078,
+ "grad_norm": 2.9991118907928467,
+ "learning_rate": 3.4273833460184077e-06,
+ "loss": 0.4787,
+ "step": 4872
+ },
+ {
+ "epoch": 2.3040189125295507,
+ "grad_norm": 2.9424328804016113,
+ "learning_rate": 3.4268039992907955e-06,
+ "loss": 0.5006,
+ "step": 4873
+ },
+ {
+ "epoch": 2.3044917257683215,
+ "grad_norm": 2.792880058288574,
+ "learning_rate": 3.426224594855162e-06,
+ "loss": 0.4399,
+ "step": 4874
+ },
+ {
+ "epoch": 2.3049645390070923,
+ "grad_norm": 2.5308053493499756,
+ "learning_rate": 3.4256451327475838e-06,
+ "loss": 0.4843,
+ "step": 4875
+ },
+ {
+ "epoch": 2.305437352245863,
+ "grad_norm": 2.7937564849853516,
+ "learning_rate": 3.425065613004142e-06,
+ "loss": 0.4428,
+ "step": 4876
+ },
+ {
+ "epoch": 2.3059101654846335,
+ "grad_norm": 2.4231557846069336,
+ "learning_rate": 3.424486035660921e-06,
+ "loss": 0.4054,
+ "step": 4877
+ },
+ {
+ "epoch": 2.3063829787234043,
+ "grad_norm": 3.0622596740722656,
+ "learning_rate": 3.423906400754009e-06,
+ "loss": 0.4623,
+ "step": 4878
+ },
+ {
+ "epoch": 2.306855791962175,
+ "grad_norm": 2.6532933712005615,
+ "learning_rate": 3.4233267083194955e-06,
+ "loss": 0.4387,
+ "step": 4879
+ },
+ {
+ "epoch": 2.3073286052009454,
+ "grad_norm": 2.793325185775757,
+ "learning_rate": 3.422746958393477e-06,
+ "loss": 0.4047,
+ "step": 4880
+ },
+ {
+ "epoch": 2.307801418439716,
+ "grad_norm": 2.9178314208984375,
+ "learning_rate": 3.422167151012052e-06,
+ "loss": 0.4397,
+ "step": 4881
+ },
+ {
+ "epoch": 2.308274231678487,
+ "grad_norm": 3.463913917541504,
+ "learning_rate": 3.4215872862113214e-06,
+ "loss": 0.4347,
+ "step": 4882
+ },
+ {
+ "epoch": 2.308747044917258,
+ "grad_norm": 3.228403091430664,
+ "learning_rate": 3.421007364027392e-06,
+ "loss": 0.4405,
+ "step": 4883
+ },
+ {
+ "epoch": 2.3092198581560286,
+ "grad_norm": 2.896933078765869,
+ "learning_rate": 3.420427384496372e-06,
+ "loss": 0.4429,
+ "step": 4884
+ },
+ {
+ "epoch": 2.309692671394799,
+ "grad_norm": 2.5559937953948975,
+ "learning_rate": 3.4198473476543755e-06,
+ "loss": 0.4281,
+ "step": 4885
+ },
+ {
+ "epoch": 2.3101654846335697,
+ "grad_norm": 3.457918167114258,
+ "learning_rate": 3.419267253537517e-06,
+ "loss": 0.4495,
+ "step": 4886
+ },
+ {
+ "epoch": 2.3106382978723405,
+ "grad_norm": 2.6554839611053467,
+ "learning_rate": 3.418687102181918e-06,
+ "loss": 0.4682,
+ "step": 4887
+ },
+ {
+ "epoch": 2.311111111111111,
+ "grad_norm": 2.8171639442443848,
+ "learning_rate": 3.4181068936237024e-06,
+ "loss": 0.4184,
+ "step": 4888
+ },
+ {
+ "epoch": 2.3115839243498817,
+ "grad_norm": 2.9272499084472656,
+ "learning_rate": 3.4175266278989955e-06,
+ "loss": 0.5445,
+ "step": 4889
+ },
+ {
+ "epoch": 2.3120567375886525,
+ "grad_norm": 2.5928499698638916,
+ "learning_rate": 3.4169463050439284e-06,
+ "loss": 0.3808,
+ "step": 4890
+ },
+ {
+ "epoch": 2.3125295508274233,
+ "grad_norm": 2.6624577045440674,
+ "learning_rate": 3.4163659250946356e-06,
+ "loss": 0.4678,
+ "step": 4891
+ },
+ {
+ "epoch": 2.313002364066194,
+ "grad_norm": 2.666555643081665,
+ "learning_rate": 3.4157854880872553e-06,
+ "loss": 0.457,
+ "step": 4892
+ },
+ {
+ "epoch": 2.3134751773049644,
+ "grad_norm": 3.2987406253814697,
+ "learning_rate": 3.4152049940579278e-06,
+ "loss": 0.551,
+ "step": 4893
+ },
+ {
+ "epoch": 2.3139479905437352,
+ "grad_norm": 2.728119134902954,
+ "learning_rate": 3.414624443042799e-06,
+ "loss": 0.3935,
+ "step": 4894
+ },
+ {
+ "epoch": 2.314420803782506,
+ "grad_norm": 3.133005380630493,
+ "learning_rate": 3.4140438350780157e-06,
+ "loss": 0.4981,
+ "step": 4895
+ },
+ {
+ "epoch": 2.3148936170212764,
+ "grad_norm": 2.591252565383911,
+ "learning_rate": 3.4134631701997312e-06,
+ "loss": 0.4251,
+ "step": 4896
+ },
+ {
+ "epoch": 2.315366430260047,
+ "grad_norm": 3.007136344909668,
+ "learning_rate": 3.412882448444101e-06,
+ "loss": 0.4492,
+ "step": 4897
+ },
+ {
+ "epoch": 2.315839243498818,
+ "grad_norm": 2.6391026973724365,
+ "learning_rate": 3.412301669847284e-06,
+ "loss": 0.5151,
+ "step": 4898
+ },
+ {
+ "epoch": 2.3163120567375888,
+ "grad_norm": 7.453699111938477,
+ "learning_rate": 3.411720834445441e-06,
+ "loss": 0.4983,
+ "step": 4899
+ },
+ {
+ "epoch": 2.3167848699763596,
+ "grad_norm": 2.667712688446045,
+ "learning_rate": 3.41113994227474e-06,
+ "loss": 0.4581,
+ "step": 4900
+ },
+ {
+ "epoch": 2.31725768321513,
+ "grad_norm": 2.7727627754211426,
+ "learning_rate": 3.41055899337135e-06,
+ "loss": 0.4731,
+ "step": 4901
+ },
+ {
+ "epoch": 2.3177304964539007,
+ "grad_norm": 3.0096890926361084,
+ "learning_rate": 3.409977987771444e-06,
+ "loss": 0.4996,
+ "step": 4902
+ },
+ {
+ "epoch": 2.3182033096926715,
+ "grad_norm": 2.725830078125,
+ "learning_rate": 3.4093969255111993e-06,
+ "loss": 0.4544,
+ "step": 4903
+ },
+ {
+ "epoch": 2.318676122931442,
+ "grad_norm": 2.7596993446350098,
+ "learning_rate": 3.4088158066267945e-06,
+ "loss": 0.4846,
+ "step": 4904
+ },
+ {
+ "epoch": 2.3191489361702127,
+ "grad_norm": 2.702620029449463,
+ "learning_rate": 3.4082346311544156e-06,
+ "loss": 0.4849,
+ "step": 4905
+ },
+ {
+ "epoch": 2.3196217494089835,
+ "grad_norm": 2.725374460220337,
+ "learning_rate": 3.407653399130249e-06,
+ "loss": 0.4116,
+ "step": 4906
+ },
+ {
+ "epoch": 2.3200945626477543,
+ "grad_norm": 2.6770219802856445,
+ "learning_rate": 3.4070721105904847e-06,
+ "loss": 0.4606,
+ "step": 4907
+ },
+ {
+ "epoch": 2.320567375886525,
+ "grad_norm": 2.9249117374420166,
+ "learning_rate": 3.406490765571317e-06,
+ "loss": 0.461,
+ "step": 4908
+ },
+ {
+ "epoch": 2.3210401891252954,
+ "grad_norm": 2.7568278312683105,
+ "learning_rate": 3.405909364108944e-06,
+ "loss": 0.4065,
+ "step": 4909
+ },
+ {
+ "epoch": 2.321513002364066,
+ "grad_norm": 2.7231340408325195,
+ "learning_rate": 3.4053279062395676e-06,
+ "loss": 0.4173,
+ "step": 4910
+ },
+ {
+ "epoch": 2.321985815602837,
+ "grad_norm": 3.1401100158691406,
+ "learning_rate": 3.404746391999393e-06,
+ "loss": 0.4287,
+ "step": 4911
+ },
+ {
+ "epoch": 2.3224586288416074,
+ "grad_norm": 2.714853525161743,
+ "learning_rate": 3.404164821424627e-06,
+ "loss": 0.4552,
+ "step": 4912
+ },
+ {
+ "epoch": 2.322931442080378,
+ "grad_norm": 3.1509978771209717,
+ "learning_rate": 3.4035831945514825e-06,
+ "loss": 0.5296,
+ "step": 4913
+ },
+ {
+ "epoch": 2.323404255319149,
+ "grad_norm": 2.567194938659668,
+ "learning_rate": 3.403001511416174e-06,
+ "loss": 0.4306,
+ "step": 4914
+ },
+ {
+ "epoch": 2.3238770685579198,
+ "grad_norm": 2.7473888397216797,
+ "learning_rate": 3.402419772054922e-06,
+ "loss": 0.4009,
+ "step": 4915
+ },
+ {
+ "epoch": 2.3243498817966906,
+ "grad_norm": 2.8617780208587646,
+ "learning_rate": 3.401837976503947e-06,
+ "loss": 0.4545,
+ "step": 4916
+ },
+ {
+ "epoch": 2.324822695035461,
+ "grad_norm": 2.3650572299957275,
+ "learning_rate": 3.401256124799475e-06,
+ "loss": 0.4046,
+ "step": 4917
+ },
+ {
+ "epoch": 2.3252955082742317,
+ "grad_norm": 2.418407678604126,
+ "learning_rate": 3.4006742169777364e-06,
+ "loss": 0.4222,
+ "step": 4918
+ },
+ {
+ "epoch": 2.3257683215130025,
+ "grad_norm": 2.7232494354248047,
+ "learning_rate": 3.400092253074964e-06,
+ "loss": 0.4373,
+ "step": 4919
+ },
+ {
+ "epoch": 2.326241134751773,
+ "grad_norm": 2.702965497970581,
+ "learning_rate": 3.399510233127394e-06,
+ "loss": 0.437,
+ "step": 4920
+ },
+ {
+ "epoch": 2.3267139479905437,
+ "grad_norm": 2.8381760120391846,
+ "learning_rate": 3.3989281571712664e-06,
+ "loss": 0.4294,
+ "step": 4921
+ },
+ {
+ "epoch": 2.3271867612293144,
+ "grad_norm": 2.767131805419922,
+ "learning_rate": 3.398346025242823e-06,
+ "loss": 0.4673,
+ "step": 4922
+ },
+ {
+ "epoch": 2.3276595744680852,
+ "grad_norm": 2.5261805057525635,
+ "learning_rate": 3.3977638373783123e-06,
+ "loss": 0.4147,
+ "step": 4923
+ },
+ {
+ "epoch": 2.3281323877068556,
+ "grad_norm": 2.7176897525787354,
+ "learning_rate": 3.3971815936139836e-06,
+ "loss": 0.3885,
+ "step": 4924
+ },
+ {
+ "epoch": 2.3286052009456264,
+ "grad_norm": 2.849043130874634,
+ "learning_rate": 3.396599293986092e-06,
+ "loss": 0.4842,
+ "step": 4925
+ },
+ {
+ "epoch": 2.329078014184397,
+ "grad_norm": 2.550673484802246,
+ "learning_rate": 3.3960169385308927e-06,
+ "loss": 0.4049,
+ "step": 4926
+ },
+ {
+ "epoch": 2.329550827423168,
+ "grad_norm": 3.0821585655212402,
+ "learning_rate": 3.3954345272846477e-06,
+ "loss": 0.53,
+ "step": 4927
+ },
+ {
+ "epoch": 2.3300236406619383,
+ "grad_norm": 2.68658185005188,
+ "learning_rate": 3.3948520602836223e-06,
+ "loss": 0.4592,
+ "step": 4928
+ },
+ {
+ "epoch": 2.330496453900709,
+ "grad_norm": 2.7391903400421143,
+ "learning_rate": 3.394269537564082e-06,
+ "loss": 0.4773,
+ "step": 4929
+ },
+ {
+ "epoch": 2.33096926713948,
+ "grad_norm": 2.665114164352417,
+ "learning_rate": 3.393686959162299e-06,
+ "loss": 0.4671,
+ "step": 4930
+ },
+ {
+ "epoch": 2.3314420803782507,
+ "grad_norm": 2.6827399730682373,
+ "learning_rate": 3.3931043251145477e-06,
+ "loss": 0.4669,
+ "step": 4931
+ },
+ {
+ "epoch": 2.331914893617021,
+ "grad_norm": 3.1760666370391846,
+ "learning_rate": 3.392521635457106e-06,
+ "loss": 0.4729,
+ "step": 4932
+ },
+ {
+ "epoch": 2.332387706855792,
+ "grad_norm": 2.9686226844787598,
+ "learning_rate": 3.3919388902262555e-06,
+ "loss": 0.5017,
+ "step": 4933
+ },
+ {
+ "epoch": 2.3328605200945627,
+ "grad_norm": 2.471325397491455,
+ "learning_rate": 3.3913560894582818e-06,
+ "loss": 0.4195,
+ "step": 4934
+ },
+ {
+ "epoch": 2.3333333333333335,
+ "grad_norm": 2.4062955379486084,
+ "learning_rate": 3.3907732331894732e-06,
+ "loss": 0.3666,
+ "step": 4935
+ },
+ {
+ "epoch": 2.333806146572104,
+ "grad_norm": 2.6800320148468018,
+ "learning_rate": 3.3901903214561206e-06,
+ "loss": 0.4774,
+ "step": 4936
+ },
+ {
+ "epoch": 2.3342789598108746,
+ "grad_norm": 2.923741102218628,
+ "learning_rate": 3.389607354294521e-06,
+ "loss": 0.4546,
+ "step": 4937
+ },
+ {
+ "epoch": 2.3347517730496454,
+ "grad_norm": 3.0034096240997314,
+ "learning_rate": 3.3890243317409716e-06,
+ "loss": 0.5373,
+ "step": 4938
+ },
+ {
+ "epoch": 2.3352245862884162,
+ "grad_norm": 3.0757339000701904,
+ "learning_rate": 3.388441253831775e-06,
+ "loss": 0.4655,
+ "step": 4939
+ },
+ {
+ "epoch": 2.3356973995271866,
+ "grad_norm": 2.5352041721343994,
+ "learning_rate": 3.3878581206032373e-06,
+ "loss": 0.4391,
+ "step": 4940
+ },
+ {
+ "epoch": 2.3361702127659574,
+ "grad_norm": 2.9332237243652344,
+ "learning_rate": 3.3872749320916675e-06,
+ "loss": 0.4685,
+ "step": 4941
+ },
+ {
+ "epoch": 2.336643026004728,
+ "grad_norm": 2.4871222972869873,
+ "learning_rate": 3.386691688333379e-06,
+ "loss": 0.3952,
+ "step": 4942
+ },
+ {
+ "epoch": 2.337115839243499,
+ "grad_norm": 2.6384918689727783,
+ "learning_rate": 3.386108389364687e-06,
+ "loss": 0.4044,
+ "step": 4943
+ },
+ {
+ "epoch": 2.3375886524822693,
+ "grad_norm": 2.3545165061950684,
+ "learning_rate": 3.3855250352219102e-06,
+ "loss": 0.426,
+ "step": 4944
+ },
+ {
+ "epoch": 2.33806146572104,
+ "grad_norm": 2.972242593765259,
+ "learning_rate": 3.3849416259413735e-06,
+ "loss": 0.5033,
+ "step": 4945
+ },
+ {
+ "epoch": 2.338534278959811,
+ "grad_norm": 3.117351770401001,
+ "learning_rate": 3.384358161559401e-06,
+ "loss": 0.4695,
+ "step": 4946
+ },
+ {
+ "epoch": 2.3390070921985817,
+ "grad_norm": 2.888916492462158,
+ "learning_rate": 3.383774642112324e-06,
+ "loss": 0.437,
+ "step": 4947
+ },
+ {
+ "epoch": 2.339479905437352,
+ "grad_norm": 3.0677435398101807,
+ "learning_rate": 3.3831910676364753e-06,
+ "loss": 0.4293,
+ "step": 4948
+ },
+ {
+ "epoch": 2.339952718676123,
+ "grad_norm": 2.8571784496307373,
+ "learning_rate": 3.3826074381681916e-06,
+ "loss": 0.4574,
+ "step": 4949
+ },
+ {
+ "epoch": 2.3404255319148937,
+ "grad_norm": 2.907276153564453,
+ "learning_rate": 3.3820237537438127e-06,
+ "loss": 0.4731,
+ "step": 4950
+ },
+ {
+ "epoch": 2.3408983451536645,
+ "grad_norm": 2.923762559890747,
+ "learning_rate": 3.3814400143996823e-06,
+ "loss": 0.4648,
+ "step": 4951
+ },
+ {
+ "epoch": 2.341371158392435,
+ "grad_norm": 2.6206982135772705,
+ "learning_rate": 3.3808562201721473e-06,
+ "loss": 0.436,
+ "step": 4952
+ },
+ {
+ "epoch": 2.3418439716312056,
+ "grad_norm": 6.279088973999023,
+ "learning_rate": 3.380272371097558e-06,
+ "loss": 0.4461,
+ "step": 4953
+ },
+ {
+ "epoch": 2.3423167848699764,
+ "grad_norm": 2.785297155380249,
+ "learning_rate": 3.3796884672122684e-06,
+ "loss": 0.4619,
+ "step": 4954
+ },
+ {
+ "epoch": 2.342789598108747,
+ "grad_norm": 2.6241793632507324,
+ "learning_rate": 3.379104508552634e-06,
+ "loss": 0.4323,
+ "step": 4955
+ },
+ {
+ "epoch": 2.3432624113475176,
+ "grad_norm": 2.6052167415618896,
+ "learning_rate": 3.378520495155017e-06,
+ "loss": 0.3943,
+ "step": 4956
+ },
+ {
+ "epoch": 2.3437352245862884,
+ "grad_norm": 2.8247411251068115,
+ "learning_rate": 3.3779364270557818e-06,
+ "loss": 0.4689,
+ "step": 4957
+ },
+ {
+ "epoch": 2.344208037825059,
+ "grad_norm": 2.5348927974700928,
+ "learning_rate": 3.377352304291294e-06,
+ "loss": 0.4619,
+ "step": 4958
+ },
+ {
+ "epoch": 2.34468085106383,
+ "grad_norm": 2.906648874282837,
+ "learning_rate": 3.376768126897926e-06,
+ "loss": 0.5191,
+ "step": 4959
+ },
+ {
+ "epoch": 2.3451536643026003,
+ "grad_norm": 2.796870470046997,
+ "learning_rate": 3.3761838949120514e-06,
+ "loss": 0.4227,
+ "step": 4960
+ },
+ {
+ "epoch": 2.345626477541371,
+ "grad_norm": 2.789635419845581,
+ "learning_rate": 3.3755996083700464e-06,
+ "loss": 0.3927,
+ "step": 4961
+ },
+ {
+ "epoch": 2.346099290780142,
+ "grad_norm": 2.86641263961792,
+ "learning_rate": 3.375015267308295e-06,
+ "loss": 0.4097,
+ "step": 4962
+ },
+ {
+ "epoch": 2.3465721040189127,
+ "grad_norm": 2.8374414443969727,
+ "learning_rate": 3.374430871763178e-06,
+ "loss": 0.4566,
+ "step": 4963
+ },
+ {
+ "epoch": 2.347044917257683,
+ "grad_norm": 2.71951961517334,
+ "learning_rate": 3.3738464217710854e-06,
+ "loss": 0.4748,
+ "step": 4964
+ },
+ {
+ "epoch": 2.347517730496454,
+ "grad_norm": 2.6939785480499268,
+ "learning_rate": 3.373261917368408e-06,
+ "loss": 0.4499,
+ "step": 4965
+ },
+ {
+ "epoch": 2.3479905437352246,
+ "grad_norm": 2.862661600112915,
+ "learning_rate": 3.37267735859154e-06,
+ "loss": 0.415,
+ "step": 4966
+ },
+ {
+ "epoch": 2.3484633569739954,
+ "grad_norm": 2.3657119274139404,
+ "learning_rate": 3.3720927454768793e-06,
+ "loss": 0.4112,
+ "step": 4967
+ },
+ {
+ "epoch": 2.348936170212766,
+ "grad_norm": 3.701571464538574,
+ "learning_rate": 3.3715080780608277e-06,
+ "loss": 0.4735,
+ "step": 4968
+ },
+ {
+ "epoch": 2.3494089834515366,
+ "grad_norm": 2.894350528717041,
+ "learning_rate": 3.3709233563797895e-06,
+ "loss": 0.4278,
+ "step": 4969
+ },
+ {
+ "epoch": 2.3498817966903074,
+ "grad_norm": 3.0072877407073975,
+ "learning_rate": 3.3703385804701727e-06,
+ "loss": 0.4718,
+ "step": 4970
+ },
+ {
+ "epoch": 2.350354609929078,
+ "grad_norm": 2.9920408725738525,
+ "learning_rate": 3.369753750368389e-06,
+ "loss": 0.4636,
+ "step": 4971
+ },
+ {
+ "epoch": 2.3508274231678485,
+ "grad_norm": 2.381770372390747,
+ "learning_rate": 3.369168866110853e-06,
+ "loss": 0.3841,
+ "step": 4972
+ },
+ {
+ "epoch": 2.3513002364066193,
+ "grad_norm": 2.6195342540740967,
+ "learning_rate": 3.3685839277339825e-06,
+ "loss": 0.4422,
+ "step": 4973
+ },
+ {
+ "epoch": 2.35177304964539,
+ "grad_norm": 2.885852575302124,
+ "learning_rate": 3.3679989352741992e-06,
+ "loss": 0.4798,
+ "step": 4974
+ },
+ {
+ "epoch": 2.352245862884161,
+ "grad_norm": 2.820004940032959,
+ "learning_rate": 3.367413888767929e-06,
+ "loss": 0.4498,
+ "step": 4975
+ },
+ {
+ "epoch": 2.3527186761229313,
+ "grad_norm": 2.579680919647217,
+ "learning_rate": 3.366828788251599e-06,
+ "loss": 0.4894,
+ "step": 4976
+ },
+ {
+ "epoch": 2.353191489361702,
+ "grad_norm": 2.7509915828704834,
+ "learning_rate": 3.366243633761642e-06,
+ "loss": 0.4354,
+ "step": 4977
+ },
+ {
+ "epoch": 2.353664302600473,
+ "grad_norm": 3.061767339706421,
+ "learning_rate": 3.3656584253344917e-06,
+ "loss": 0.4651,
+ "step": 4978
+ },
+ {
+ "epoch": 2.3541371158392437,
+ "grad_norm": 2.6109485626220703,
+ "learning_rate": 3.365073163006587e-06,
+ "loss": 0.44,
+ "step": 4979
+ },
+ {
+ "epoch": 2.354609929078014,
+ "grad_norm": 3.4247376918792725,
+ "learning_rate": 3.36448784681437e-06,
+ "loss": 0.3993,
+ "step": 4980
+ },
+ {
+ "epoch": 2.355082742316785,
+ "grad_norm": 2.953695297241211,
+ "learning_rate": 3.363902476794285e-06,
+ "loss": 0.4763,
+ "step": 4981
+ },
+ {
+ "epoch": 2.3555555555555556,
+ "grad_norm": 2.836543083190918,
+ "learning_rate": 3.3633170529827806e-06,
+ "loss": 0.4755,
+ "step": 4982
+ },
+ {
+ "epoch": 2.3560283687943264,
+ "grad_norm": 2.944082021713257,
+ "learning_rate": 3.36273157541631e-06,
+ "loss": 0.472,
+ "step": 4983
+ },
+ {
+ "epoch": 2.3565011820330968,
+ "grad_norm": 2.891716957092285,
+ "learning_rate": 3.3621460441313262e-06,
+ "loss": 0.5259,
+ "step": 4984
+ },
+ {
+ "epoch": 2.3569739952718676,
+ "grad_norm": 2.8448829650878906,
+ "learning_rate": 3.3615604591642896e-06,
+ "loss": 0.4587,
+ "step": 4985
+ },
+ {
+ "epoch": 2.3574468085106384,
+ "grad_norm": 3.114393711090088,
+ "learning_rate": 3.36097482055166e-06,
+ "loss": 0.4352,
+ "step": 4986
+ },
+ {
+ "epoch": 2.357919621749409,
+ "grad_norm": 2.964851140975952,
+ "learning_rate": 3.360389128329904e-06,
+ "loss": 0.5015,
+ "step": 4987
+ },
+ {
+ "epoch": 2.3583924349881795,
+ "grad_norm": 2.4819815158843994,
+ "learning_rate": 3.3598033825354893e-06,
+ "loss": 0.3459,
+ "step": 4988
+ },
+ {
+ "epoch": 2.3588652482269503,
+ "grad_norm": 2.635754346847534,
+ "learning_rate": 3.359217583204889e-06,
+ "loss": 0.4367,
+ "step": 4989
+ },
+ {
+ "epoch": 2.359338061465721,
+ "grad_norm": 2.542482376098633,
+ "learning_rate": 3.358631730374576e-06,
+ "loss": 0.3978,
+ "step": 4990
+ },
+ {
+ "epoch": 2.359810874704492,
+ "grad_norm": 2.614018678665161,
+ "learning_rate": 3.358045824081031e-06,
+ "loss": 0.424,
+ "step": 4991
+ },
+ {
+ "epoch": 2.3602836879432623,
+ "grad_norm": 2.775373697280884,
+ "learning_rate": 3.3574598643607354e-06,
+ "loss": 0.4901,
+ "step": 4992
+ },
+ {
+ "epoch": 2.360756501182033,
+ "grad_norm": 3.091381311416626,
+ "learning_rate": 3.356873851250173e-06,
+ "loss": 0.4954,
+ "step": 4993
+ },
+ {
+ "epoch": 2.361229314420804,
+ "grad_norm": 2.440023422241211,
+ "learning_rate": 3.3562877847858337e-06,
+ "loss": 0.4053,
+ "step": 4994
+ },
+ {
+ "epoch": 2.3617021276595747,
+ "grad_norm": 2.8879518508911133,
+ "learning_rate": 3.3557016650042084e-06,
+ "loss": 0.4766,
+ "step": 4995
+ },
+ {
+ "epoch": 2.362174940898345,
+ "grad_norm": 3.1298391819000244,
+ "learning_rate": 3.355115491941793e-06,
+ "loss": 0.4743,
+ "step": 4996
+ },
+ {
+ "epoch": 2.362647754137116,
+ "grad_norm": 3.3325259685516357,
+ "learning_rate": 3.3545292656350845e-06,
+ "loss": 0.4703,
+ "step": 4997
+ },
+ {
+ "epoch": 2.3631205673758866,
+ "grad_norm": 2.7935359477996826,
+ "learning_rate": 3.353942986120587e-06,
+ "loss": 0.432,
+ "step": 4998
+ },
+ {
+ "epoch": 2.3635933806146574,
+ "grad_norm": 2.623624324798584,
+ "learning_rate": 3.3533566534348033e-06,
+ "loss": 0.4302,
+ "step": 4999
+ },
+ {
+ "epoch": 2.3640661938534278,
+ "grad_norm": 3.1467108726501465,
+ "learning_rate": 3.3527702676142426e-06,
+ "loss": 0.4661,
+ "step": 5000
+ },
+ {
+ "epoch": 2.3645390070921986,
+ "grad_norm": 2.5364840030670166,
+ "learning_rate": 3.352183828695418e-06,
+ "loss": 0.4134,
+ "step": 5001
+ },
+ {
+ "epoch": 2.3650118203309693,
+ "grad_norm": 3.002777338027954,
+ "learning_rate": 3.3515973367148415e-06,
+ "loss": 0.3771,
+ "step": 5002
+ },
+ {
+ "epoch": 2.36548463356974,
+ "grad_norm": 2.660043954849243,
+ "learning_rate": 3.3510107917090335e-06,
+ "loss": 0.4254,
+ "step": 5003
+ },
+ {
+ "epoch": 2.3659574468085105,
+ "grad_norm": 2.7041075229644775,
+ "learning_rate": 3.3504241937145148e-06,
+ "loss": 0.4651,
+ "step": 5004
+ },
+ {
+ "epoch": 2.3664302600472813,
+ "grad_norm": 2.7387280464172363,
+ "learning_rate": 3.349837542767811e-06,
+ "loss": 0.3874,
+ "step": 5005
+ },
+ {
+ "epoch": 2.366903073286052,
+ "grad_norm": 3.012188196182251,
+ "learning_rate": 3.349250838905449e-06,
+ "loss": 0.4508,
+ "step": 5006
+ },
+ {
+ "epoch": 2.3673758865248224,
+ "grad_norm": 2.3108484745025635,
+ "learning_rate": 3.3486640821639616e-06,
+ "loss": 0.3783,
+ "step": 5007
+ },
+ {
+ "epoch": 2.3678486997635932,
+ "grad_norm": 3.2188332080841064,
+ "learning_rate": 3.3480772725798837e-06,
+ "loss": 0.4879,
+ "step": 5008
+ },
+ {
+ "epoch": 2.368321513002364,
+ "grad_norm": 2.566087484359741,
+ "learning_rate": 3.3474904101897526e-06,
+ "loss": 0.3847,
+ "step": 5009
+ },
+ {
+ "epoch": 2.368794326241135,
+ "grad_norm": 2.5581698417663574,
+ "learning_rate": 3.3469034950301092e-06,
+ "loss": 0.4201,
+ "step": 5010
+ },
+ {
+ "epoch": 2.3692671394799056,
+ "grad_norm": 2.900296926498413,
+ "learning_rate": 3.3463165271374992e-06,
+ "loss": 0.4568,
+ "step": 5011
+ },
+ {
+ "epoch": 2.369739952718676,
+ "grad_norm": 2.8239312171936035,
+ "learning_rate": 3.34572950654847e-06,
+ "loss": 0.4583,
+ "step": 5012
+ },
+ {
+ "epoch": 2.370212765957447,
+ "grad_norm": 3.219465970993042,
+ "learning_rate": 3.3451424332995723e-06,
+ "loss": 0.5435,
+ "step": 5013
+ },
+ {
+ "epoch": 2.3706855791962176,
+ "grad_norm": 3.3111915588378906,
+ "learning_rate": 3.344555307427362e-06,
+ "loss": 0.435,
+ "step": 5014
+ },
+ {
+ "epoch": 2.371158392434988,
+ "grad_norm": 3.296668529510498,
+ "learning_rate": 3.3439681289683946e-06,
+ "loss": 0.4738,
+ "step": 5015
+ },
+ {
+ "epoch": 2.3716312056737587,
+ "grad_norm": 3.005722761154175,
+ "learning_rate": 3.343380897959234e-06,
+ "loss": 0.4267,
+ "step": 5016
+ },
+ {
+ "epoch": 2.3721040189125295,
+ "grad_norm": 2.7844085693359375,
+ "learning_rate": 3.3427936144364425e-06,
+ "loss": 0.4558,
+ "step": 5017
+ },
+ {
+ "epoch": 2.3725768321513003,
+ "grad_norm": 2.7532076835632324,
+ "learning_rate": 3.3422062784365884e-06,
+ "loss": 0.4144,
+ "step": 5018
+ },
+ {
+ "epoch": 2.373049645390071,
+ "grad_norm": 2.835764169692993,
+ "learning_rate": 3.3416188899962413e-06,
+ "loss": 0.4945,
+ "step": 5019
+ },
+ {
+ "epoch": 2.3735224586288415,
+ "grad_norm": 3.1513726711273193,
+ "learning_rate": 3.3410314491519767e-06,
+ "loss": 0.4971,
+ "step": 5020
+ },
+ {
+ "epoch": 2.3739952718676123,
+ "grad_norm": 3.0162220001220703,
+ "learning_rate": 3.3404439559403723e-06,
+ "loss": 0.4477,
+ "step": 5021
+ },
+ {
+ "epoch": 2.374468085106383,
+ "grad_norm": 2.676391363143921,
+ "learning_rate": 3.3398564103980073e-06,
+ "loss": 0.432,
+ "step": 5022
+ },
+ {
+ "epoch": 2.3749408983451534,
+ "grad_norm": 2.7806248664855957,
+ "learning_rate": 3.3392688125614663e-06,
+ "loss": 0.4818,
+ "step": 5023
+ },
+ {
+ "epoch": 2.3754137115839242,
+ "grad_norm": 2.968806505203247,
+ "learning_rate": 3.3386811624673373e-06,
+ "loss": 0.4893,
+ "step": 5024
+ },
+ {
+ "epoch": 2.375886524822695,
+ "grad_norm": 2.992684841156006,
+ "learning_rate": 3.3380934601522087e-06,
+ "loss": 0.4423,
+ "step": 5025
+ },
+ {
+ "epoch": 2.376359338061466,
+ "grad_norm": 2.578420639038086,
+ "learning_rate": 3.3375057056526762e-06,
+ "loss": 0.3682,
+ "step": 5026
+ },
+ {
+ "epoch": 2.3768321513002366,
+ "grad_norm": 2.7683115005493164,
+ "learning_rate": 3.336917899005335e-06,
+ "loss": 0.4038,
+ "step": 5027
+ },
+ {
+ "epoch": 2.377304964539007,
+ "grad_norm": 2.838812828063965,
+ "learning_rate": 3.336330040246786e-06,
+ "loss": 0.442,
+ "step": 5028
+ },
+ {
+ "epoch": 2.3777777777777778,
+ "grad_norm": 2.766136646270752,
+ "learning_rate": 3.335742129413633e-06,
+ "loss": 0.4745,
+ "step": 5029
+ },
+ {
+ "epoch": 2.3782505910165486,
+ "grad_norm": 2.862656593322754,
+ "learning_rate": 3.3351541665424812e-06,
+ "loss": 0.4324,
+ "step": 5030
+ },
+ {
+ "epoch": 2.378723404255319,
+ "grad_norm": 2.71425199508667,
+ "learning_rate": 3.3345661516699433e-06,
+ "loss": 0.4013,
+ "step": 5031
+ },
+ {
+ "epoch": 2.3791962174940897,
+ "grad_norm": 2.8404030799865723,
+ "learning_rate": 3.333978084832629e-06,
+ "loss": 0.5038,
+ "step": 5032
+ },
+ {
+ "epoch": 2.3796690307328605,
+ "grad_norm": 2.965851068496704,
+ "learning_rate": 3.3333899660671574e-06,
+ "loss": 0.4668,
+ "step": 5033
+ },
+ {
+ "epoch": 2.3801418439716313,
+ "grad_norm": 2.686452627182007,
+ "learning_rate": 3.3328017954101464e-06,
+ "loss": 0.4167,
+ "step": 5034
+ },
+ {
+ "epoch": 2.380614657210402,
+ "grad_norm": 2.8676156997680664,
+ "learning_rate": 3.3322135728982197e-06,
+ "loss": 0.4531,
+ "step": 5035
+ },
+ {
+ "epoch": 2.3810874704491725,
+ "grad_norm": 2.4456300735473633,
+ "learning_rate": 3.3316252985680026e-06,
+ "loss": 0.4173,
+ "step": 5036
+ },
+ {
+ "epoch": 2.3815602836879433,
+ "grad_norm": 2.5472559928894043,
+ "learning_rate": 3.331036972456124e-06,
+ "loss": 0.3926,
+ "step": 5037
+ },
+ {
+ "epoch": 2.382033096926714,
+ "grad_norm": 2.81900954246521,
+ "learning_rate": 3.330448594599218e-06,
+ "loss": 0.4785,
+ "step": 5038
+ },
+ {
+ "epoch": 2.3825059101654844,
+ "grad_norm": 3.0930590629577637,
+ "learning_rate": 3.329860165033919e-06,
+ "loss": 0.4587,
+ "step": 5039
+ },
+ {
+ "epoch": 2.382978723404255,
+ "grad_norm": 3.0553040504455566,
+ "learning_rate": 3.3292716837968673e-06,
+ "loss": 0.5285,
+ "step": 5040
+ },
+ {
+ "epoch": 2.383451536643026,
+ "grad_norm": 2.577580690383911,
+ "learning_rate": 3.328683150924704e-06,
+ "loss": 0.4184,
+ "step": 5041
+ },
+ {
+ "epoch": 2.383924349881797,
+ "grad_norm": 2.6430366039276123,
+ "learning_rate": 3.3280945664540735e-06,
+ "loss": 0.4636,
+ "step": 5042
+ },
+ {
+ "epoch": 2.3843971631205676,
+ "grad_norm": 3.228360891342163,
+ "learning_rate": 3.3275059304216255e-06,
+ "loss": 0.455,
+ "step": 5043
+ },
+ {
+ "epoch": 2.384869976359338,
+ "grad_norm": 2.776142120361328,
+ "learning_rate": 3.3269172428640125e-06,
+ "loss": 0.4785,
+ "step": 5044
+ },
+ {
+ "epoch": 2.3853427895981087,
+ "grad_norm": 2.755671739578247,
+ "learning_rate": 3.3263285038178882e-06,
+ "loss": 0.4625,
+ "step": 5045
+ },
+ {
+ "epoch": 2.3858156028368795,
+ "grad_norm": 3.061004400253296,
+ "learning_rate": 3.3257397133199114e-06,
+ "loss": 0.4641,
+ "step": 5046
+ },
+ {
+ "epoch": 2.38628841607565,
+ "grad_norm": 2.8391458988189697,
+ "learning_rate": 3.3251508714067432e-06,
+ "loss": 0.5003,
+ "step": 5047
+ },
+ {
+ "epoch": 2.3867612293144207,
+ "grad_norm": 2.390810966491699,
+ "learning_rate": 3.324561978115049e-06,
+ "loss": 0.4446,
+ "step": 5048
+ },
+ {
+ "epoch": 2.3872340425531915,
+ "grad_norm": 2.7760825157165527,
+ "learning_rate": 3.323973033481496e-06,
+ "loss": 0.4443,
+ "step": 5049
+ },
+ {
+ "epoch": 2.3877068557919623,
+ "grad_norm": 3.157893419265747,
+ "learning_rate": 3.3233840375427552e-06,
+ "loss": 0.4934,
+ "step": 5050
+ },
+ {
+ "epoch": 2.388179669030733,
+ "grad_norm": 2.7245349884033203,
+ "learning_rate": 3.3227949903355e-06,
+ "loss": 0.4254,
+ "step": 5051
+ },
+ {
+ "epoch": 2.3886524822695034,
+ "grad_norm": 2.6674044132232666,
+ "learning_rate": 3.322205891896409e-06,
+ "loss": 0.4116,
+ "step": 5052
+ },
+ {
+ "epoch": 2.3891252955082742,
+ "grad_norm": 3.1490554809570312,
+ "learning_rate": 3.3216167422621627e-06,
+ "loss": 0.4604,
+ "step": 5053
+ },
+ {
+ "epoch": 2.389598108747045,
+ "grad_norm": 2.725731134414673,
+ "learning_rate": 3.321027541469444e-06,
+ "loss": 0.4836,
+ "step": 5054
+ },
+ {
+ "epoch": 2.3900709219858154,
+ "grad_norm": 2.5378828048706055,
+ "learning_rate": 3.3204382895549407e-06,
+ "loss": 0.4228,
+ "step": 5055
+ },
+ {
+ "epoch": 2.390543735224586,
+ "grad_norm": 2.8191192150115967,
+ "learning_rate": 3.3198489865553427e-06,
+ "loss": 0.4371,
+ "step": 5056
+ },
+ {
+ "epoch": 2.391016548463357,
+ "grad_norm": 2.5676498413085938,
+ "learning_rate": 3.3192596325073433e-06,
+ "loss": 0.4463,
+ "step": 5057
+ },
+ {
+ "epoch": 2.391489361702128,
+ "grad_norm": 3.0846121311187744,
+ "learning_rate": 3.3186702274476397e-06,
+ "loss": 0.5049,
+ "step": 5058
+ },
+ {
+ "epoch": 2.3919621749408986,
+ "grad_norm": 2.6085152626037598,
+ "learning_rate": 3.3180807714129293e-06,
+ "loss": 0.4376,
+ "step": 5059
+ },
+ {
+ "epoch": 2.392434988179669,
+ "grad_norm": 3.0218591690063477,
+ "learning_rate": 3.3174912644399172e-06,
+ "loss": 0.4734,
+ "step": 5060
+ },
+ {
+ "epoch": 2.3929078014184397,
+ "grad_norm": 2.5904781818389893,
+ "learning_rate": 3.316901706565308e-06,
+ "loss": 0.4924,
+ "step": 5061
+ },
+ {
+ "epoch": 2.3933806146572105,
+ "grad_norm": 2.675478458404541,
+ "learning_rate": 3.3163120978258123e-06,
+ "loss": 0.4072,
+ "step": 5062
+ },
+ {
+ "epoch": 2.393853427895981,
+ "grad_norm": 2.7944445610046387,
+ "learning_rate": 3.3157224382581415e-06,
+ "loss": 0.4328,
+ "step": 5063
+ },
+ {
+ "epoch": 2.3943262411347517,
+ "grad_norm": 2.846224546432495,
+ "learning_rate": 3.315132727899012e-06,
+ "loss": 0.4447,
+ "step": 5064
+ },
+ {
+ "epoch": 2.3947990543735225,
+ "grad_norm": 2.6825828552246094,
+ "learning_rate": 3.3145429667851402e-06,
+ "loss": 0.4528,
+ "step": 5065
+ },
+ {
+ "epoch": 2.3952718676122933,
+ "grad_norm": 3.0305285453796387,
+ "learning_rate": 3.3139531549532505e-06,
+ "loss": 0.4538,
+ "step": 5066
+ },
+ {
+ "epoch": 2.395744680851064,
+ "grad_norm": 2.707540988922119,
+ "learning_rate": 3.313363292440067e-06,
+ "loss": 0.4412,
+ "step": 5067
+ },
+ {
+ "epoch": 2.3962174940898344,
+ "grad_norm": 3.0458385944366455,
+ "learning_rate": 3.3127733792823173e-06,
+ "loss": 0.4587,
+ "step": 5068
+ },
+ {
+ "epoch": 2.396690307328605,
+ "grad_norm": 2.7711992263793945,
+ "learning_rate": 3.312183415516733e-06,
+ "loss": 0.4157,
+ "step": 5069
+ },
+ {
+ "epoch": 2.397163120567376,
+ "grad_norm": 2.6953988075256348,
+ "learning_rate": 3.3115934011800494e-06,
+ "loss": 0.3828,
+ "step": 5070
+ },
+ {
+ "epoch": 2.3976359338061464,
+ "grad_norm": 3.033721923828125,
+ "learning_rate": 3.311003336309003e-06,
+ "loss": 0.5204,
+ "step": 5071
+ },
+ {
+ "epoch": 2.398108747044917,
+ "grad_norm": 2.6134517192840576,
+ "learning_rate": 3.3104132209403355e-06,
+ "loss": 0.4181,
+ "step": 5072
+ },
+ {
+ "epoch": 2.398581560283688,
+ "grad_norm": 2.8800251483917236,
+ "learning_rate": 3.30982305511079e-06,
+ "loss": 0.466,
+ "step": 5073
+ },
+ {
+ "epoch": 2.3990543735224588,
+ "grad_norm": 2.5043210983276367,
+ "learning_rate": 3.309232838857114e-06,
+ "loss": 0.4161,
+ "step": 5074
+ },
+ {
+ "epoch": 2.3995271867612296,
+ "grad_norm": 2.6577322483062744,
+ "learning_rate": 3.308642572216057e-06,
+ "loss": 0.465,
+ "step": 5075
+ },
+ {
+ "epoch": 2.4,
+ "grad_norm": 2.549098253250122,
+ "learning_rate": 3.3080522552243734e-06,
+ "loss": 0.4571,
+ "step": 5076
+ },
+ {
+ "epoch": 2.4004728132387707,
+ "grad_norm": 2.881958246231079,
+ "learning_rate": 3.3074618879188186e-06,
+ "loss": 0.4443,
+ "step": 5077
+ },
+ {
+ "epoch": 2.4009456264775415,
+ "grad_norm": 2.608397960662842,
+ "learning_rate": 3.3068714703361528e-06,
+ "loss": 0.3843,
+ "step": 5078
+ },
+ {
+ "epoch": 2.401418439716312,
+ "grad_norm": 2.8666789531707764,
+ "learning_rate": 3.306281002513139e-06,
+ "loss": 0.4857,
+ "step": 5079
+ },
+ {
+ "epoch": 2.4018912529550827,
+ "grad_norm": 2.9008588790893555,
+ "learning_rate": 3.3056904844865422e-06,
+ "loss": 0.4454,
+ "step": 5080
+ },
+ {
+ "epoch": 2.4023640661938535,
+ "grad_norm": 2.7446060180664062,
+ "learning_rate": 3.3050999162931315e-06,
+ "loss": 0.4522,
+ "step": 5081
+ },
+ {
+ "epoch": 2.4028368794326243,
+ "grad_norm": 2.787116765975952,
+ "learning_rate": 3.3045092979696804e-06,
+ "loss": 0.4714,
+ "step": 5082
+ },
+ {
+ "epoch": 2.403309692671395,
+ "grad_norm": 2.7494192123413086,
+ "learning_rate": 3.3039186295529613e-06,
+ "loss": 0.4107,
+ "step": 5083
+ },
+ {
+ "epoch": 2.4037825059101654,
+ "grad_norm": 2.733794927597046,
+ "learning_rate": 3.303327911079755e-06,
+ "loss": 0.4169,
+ "step": 5084
+ },
+ {
+ "epoch": 2.404255319148936,
+ "grad_norm": 2.7313334941864014,
+ "learning_rate": 3.3027371425868422e-06,
+ "loss": 0.4287,
+ "step": 5085
+ },
+ {
+ "epoch": 2.404728132387707,
+ "grad_norm": 2.7832977771759033,
+ "learning_rate": 3.3021463241110075e-06,
+ "loss": 0.5307,
+ "step": 5086
+ },
+ {
+ "epoch": 2.4052009456264773,
+ "grad_norm": 2.6615281105041504,
+ "learning_rate": 3.301555455689038e-06,
+ "loss": 0.4519,
+ "step": 5087
+ },
+ {
+ "epoch": 2.405673758865248,
+ "grad_norm": 2.343921422958374,
+ "learning_rate": 3.3009645373577264e-06,
+ "loss": 0.46,
+ "step": 5088
+ },
+ {
+ "epoch": 2.406146572104019,
+ "grad_norm": 2.6115355491638184,
+ "learning_rate": 3.300373569153864e-06,
+ "loss": 0.4782,
+ "step": 5089
+ },
+ {
+ "epoch": 2.4066193853427897,
+ "grad_norm": 2.730625629425049,
+ "learning_rate": 3.299782551114249e-06,
+ "loss": 0.4632,
+ "step": 5090
+ },
+ {
+ "epoch": 2.40709219858156,
+ "grad_norm": 2.4495043754577637,
+ "learning_rate": 3.2991914832756824e-06,
+ "loss": 0.4243,
+ "step": 5091
+ },
+ {
+ "epoch": 2.407565011820331,
+ "grad_norm": 2.8731648921966553,
+ "learning_rate": 3.2986003656749654e-06,
+ "loss": 0.4262,
+ "step": 5092
+ },
+ {
+ "epoch": 2.4080378250591017,
+ "grad_norm": 2.870342969894409,
+ "learning_rate": 3.2980091983489053e-06,
+ "loss": 0.4735,
+ "step": 5093
+ },
+ {
+ "epoch": 2.4085106382978725,
+ "grad_norm": 2.500786542892456,
+ "learning_rate": 3.297417981334312e-06,
+ "loss": 0.4007,
+ "step": 5094
+ },
+ {
+ "epoch": 2.408983451536643,
+ "grad_norm": 2.7787322998046875,
+ "learning_rate": 3.2968267146679978e-06,
+ "loss": 0.493,
+ "step": 5095
+ },
+ {
+ "epoch": 2.4094562647754136,
+ "grad_norm": 2.5229599475860596,
+ "learning_rate": 3.2962353983867783e-06,
+ "loss": 0.3676,
+ "step": 5096
+ },
+ {
+ "epoch": 2.4099290780141844,
+ "grad_norm": 3.1955904960632324,
+ "learning_rate": 3.2956440325274715e-06,
+ "loss": 0.4888,
+ "step": 5097
+ },
+ {
+ "epoch": 2.4104018912529552,
+ "grad_norm": 2.8580288887023926,
+ "learning_rate": 3.2950526171268995e-06,
+ "loss": 0.4892,
+ "step": 5098
+ },
+ {
+ "epoch": 2.4108747044917256,
+ "grad_norm": 2.6321749687194824,
+ "learning_rate": 3.294461152221887e-06,
+ "loss": 0.3823,
+ "step": 5099
+ },
+ {
+ "epoch": 2.4113475177304964,
+ "grad_norm": 2.881127119064331,
+ "learning_rate": 3.293869637849263e-06,
+ "loss": 0.4569,
+ "step": 5100
+ },
+ {
+ "epoch": 2.411820330969267,
+ "grad_norm": 2.7742316722869873,
+ "learning_rate": 3.293278074045857e-06,
+ "loss": 0.4445,
+ "step": 5101
+ },
+ {
+ "epoch": 2.412293144208038,
+ "grad_norm": 2.546701431274414,
+ "learning_rate": 3.2926864608485037e-06,
+ "loss": 0.3995,
+ "step": 5102
+ },
+ {
+ "epoch": 2.4127659574468083,
+ "grad_norm": 2.588226318359375,
+ "learning_rate": 3.292094798294041e-06,
+ "loss": 0.4081,
+ "step": 5103
+ },
+ {
+ "epoch": 2.413238770685579,
+ "grad_norm": 2.968689441680908,
+ "learning_rate": 3.2915030864193077e-06,
+ "loss": 0.4475,
+ "step": 5104
+ },
+ {
+ "epoch": 2.41371158392435,
+ "grad_norm": 2.9249184131622314,
+ "learning_rate": 3.290911325261148e-06,
+ "loss": 0.4763,
+ "step": 5105
+ },
+ {
+ "epoch": 2.4141843971631207,
+ "grad_norm": 2.817596673965454,
+ "learning_rate": 3.2903195148564083e-06,
+ "loss": 0.4451,
+ "step": 5106
+ },
+ {
+ "epoch": 2.414657210401891,
+ "grad_norm": 2.6465954780578613,
+ "learning_rate": 3.2897276552419377e-06,
+ "loss": 0.4665,
+ "step": 5107
+ },
+ {
+ "epoch": 2.415130023640662,
+ "grad_norm": 2.8613853454589844,
+ "learning_rate": 3.2891357464545885e-06,
+ "loss": 0.4398,
+ "step": 5108
+ },
+ {
+ "epoch": 2.4156028368794327,
+ "grad_norm": 2.756321907043457,
+ "learning_rate": 3.2885437885312175e-06,
+ "loss": 0.4634,
+ "step": 5109
+ },
+ {
+ "epoch": 2.4160756501182035,
+ "grad_norm": 2.8965282440185547,
+ "learning_rate": 3.287951781508682e-06,
+ "loss": 0.4319,
+ "step": 5110
+ },
+ {
+ "epoch": 2.416548463356974,
+ "grad_norm": 2.896756172180176,
+ "learning_rate": 3.287359725423844e-06,
+ "loss": 0.4771,
+ "step": 5111
+ },
+ {
+ "epoch": 2.4170212765957446,
+ "grad_norm": 2.952911376953125,
+ "learning_rate": 3.286767620313569e-06,
+ "loss": 0.5026,
+ "step": 5112
+ },
+ {
+ "epoch": 2.4174940898345154,
+ "grad_norm": 3.850515604019165,
+ "learning_rate": 3.2861754662147234e-06,
+ "loss": 0.4387,
+ "step": 5113
+ },
+ {
+ "epoch": 2.417966903073286,
+ "grad_norm": 3.0072689056396484,
+ "learning_rate": 3.2855832631641794e-06,
+ "loss": 0.4586,
+ "step": 5114
+ },
+ {
+ "epoch": 2.4184397163120566,
+ "grad_norm": 3.166790246963501,
+ "learning_rate": 3.2849910111988092e-06,
+ "loss": 0.4842,
+ "step": 5115
+ },
+ {
+ "epoch": 2.4189125295508274,
+ "grad_norm": 3.5397679805755615,
+ "learning_rate": 3.284398710355492e-06,
+ "loss": 0.5138,
+ "step": 5116
+ },
+ {
+ "epoch": 2.419385342789598,
+ "grad_norm": 2.779609441757202,
+ "learning_rate": 3.283806360671106e-06,
+ "loss": 0.4049,
+ "step": 5117
+ },
+ {
+ "epoch": 2.419858156028369,
+ "grad_norm": 2.5924575328826904,
+ "learning_rate": 3.283213962182535e-06,
+ "loss": 0.433,
+ "step": 5118
+ },
+ {
+ "epoch": 2.4203309692671393,
+ "grad_norm": 2.7429699897766113,
+ "learning_rate": 3.282621514926665e-06,
+ "loss": 0.4674,
+ "step": 5119
+ },
+ {
+ "epoch": 2.42080378250591,
+ "grad_norm": 2.8113889694213867,
+ "learning_rate": 3.2820290189403846e-06,
+ "loss": 0.3898,
+ "step": 5120
+ },
+ {
+ "epoch": 2.421276595744681,
+ "grad_norm": 2.867105722427368,
+ "learning_rate": 3.2814364742605863e-06,
+ "loss": 0.4439,
+ "step": 5121
+ },
+ {
+ "epoch": 2.4217494089834517,
+ "grad_norm": 2.428597927093506,
+ "learning_rate": 3.2808438809241654e-06,
+ "loss": 0.4339,
+ "step": 5122
+ },
+ {
+ "epoch": 2.422222222222222,
+ "grad_norm": 3.071735143661499,
+ "learning_rate": 3.2802512389680203e-06,
+ "loss": 0.4583,
+ "step": 5123
+ },
+ {
+ "epoch": 2.422695035460993,
+ "grad_norm": 3.046313762664795,
+ "learning_rate": 3.279658548429051e-06,
+ "loss": 0.5351,
+ "step": 5124
+ },
+ {
+ "epoch": 2.4231678486997636,
+ "grad_norm": 2.8412697315216064,
+ "learning_rate": 3.279065809344163e-06,
+ "loss": 0.5258,
+ "step": 5125
+ },
+ {
+ "epoch": 2.4236406619385344,
+ "grad_norm": 2.887169122695923,
+ "learning_rate": 3.278473021750263e-06,
+ "loss": 0.4568,
+ "step": 5126
+ },
+ {
+ "epoch": 2.424113475177305,
+ "grad_norm": 2.8316574096679688,
+ "learning_rate": 3.2778801856842624e-06,
+ "loss": 0.46,
+ "step": 5127
+ },
+ {
+ "epoch": 2.4245862884160756,
+ "grad_norm": 2.7660772800445557,
+ "learning_rate": 3.277287301183073e-06,
+ "loss": 0.4323,
+ "step": 5128
+ },
+ {
+ "epoch": 2.4250591016548464,
+ "grad_norm": 2.737682819366455,
+ "learning_rate": 3.276694368283611e-06,
+ "loss": 0.4296,
+ "step": 5129
+ },
+ {
+ "epoch": 2.425531914893617,
+ "grad_norm": 2.8807425498962402,
+ "learning_rate": 3.276101387022797e-06,
+ "loss": 0.4673,
+ "step": 5130
+ },
+ {
+ "epoch": 2.4260047281323875,
+ "grad_norm": 2.530526876449585,
+ "learning_rate": 3.275508357437552e-06,
+ "loss": 0.416,
+ "step": 5131
+ },
+ {
+ "epoch": 2.4264775413711583,
+ "grad_norm": 3.1189746856689453,
+ "learning_rate": 3.274915279564803e-06,
+ "loss": 0.4171,
+ "step": 5132
+ },
+ {
+ "epoch": 2.426950354609929,
+ "grad_norm": 2.6612462997436523,
+ "learning_rate": 3.274322153441477e-06,
+ "loss": 0.4104,
+ "step": 5133
+ },
+ {
+ "epoch": 2.4274231678487,
+ "grad_norm": 2.717973470687866,
+ "learning_rate": 3.2737289791045064e-06,
+ "loss": 0.479,
+ "step": 5134
+ },
+ {
+ "epoch": 2.4278959810874703,
+ "grad_norm": 2.764216661453247,
+ "learning_rate": 3.2731357565908247e-06,
+ "loss": 0.481,
+ "step": 5135
+ },
+ {
+ "epoch": 2.428368794326241,
+ "grad_norm": 2.5081393718719482,
+ "learning_rate": 3.272542485937369e-06,
+ "loss": 0.4592,
+ "step": 5136
+ },
+ {
+ "epoch": 2.428841607565012,
+ "grad_norm": 3.1380364894866943,
+ "learning_rate": 3.271949167181081e-06,
+ "loss": 0.4179,
+ "step": 5137
+ },
+ {
+ "epoch": 2.4293144208037827,
+ "grad_norm": 2.9275963306427,
+ "learning_rate": 3.2713558003589026e-06,
+ "loss": 0.5196,
+ "step": 5138
+ },
+ {
+ "epoch": 2.429787234042553,
+ "grad_norm": 2.8215506076812744,
+ "learning_rate": 3.270762385507781e-06,
+ "loss": 0.4081,
+ "step": 5139
+ },
+ {
+ "epoch": 2.430260047281324,
+ "grad_norm": 2.9185614585876465,
+ "learning_rate": 3.270168922664665e-06,
+ "loss": 0.4936,
+ "step": 5140
+ },
+ {
+ "epoch": 2.4307328605200946,
+ "grad_norm": 2.6507248878479004,
+ "learning_rate": 3.269575411866507e-06,
+ "loss": 0.4834,
+ "step": 5141
+ },
+ {
+ "epoch": 2.4312056737588654,
+ "grad_norm": 2.864741563796997,
+ "learning_rate": 3.2689818531502637e-06,
+ "loss": 0.4562,
+ "step": 5142
+ },
+ {
+ "epoch": 2.431678486997636,
+ "grad_norm": 2.806919813156128,
+ "learning_rate": 3.2683882465528917e-06,
+ "loss": 0.4645,
+ "step": 5143
+ },
+ {
+ "epoch": 2.4321513002364066,
+ "grad_norm": 2.733372211456299,
+ "learning_rate": 3.267794592111353e-06,
+ "loss": 0.4123,
+ "step": 5144
+ },
+ {
+ "epoch": 2.4326241134751774,
+ "grad_norm": 2.8005833625793457,
+ "learning_rate": 3.2672008898626116e-06,
+ "loss": 0.4343,
+ "step": 5145
+ },
+ {
+ "epoch": 2.433096926713948,
+ "grad_norm": 3.2339670658111572,
+ "learning_rate": 3.2666071398436354e-06,
+ "loss": 0.4017,
+ "step": 5146
+ },
+ {
+ "epoch": 2.4335697399527185,
+ "grad_norm": 2.510251760482788,
+ "learning_rate": 3.2660133420913932e-06,
+ "loss": 0.3882,
+ "step": 5147
+ },
+ {
+ "epoch": 2.4340425531914893,
+ "grad_norm": 3.5633628368377686,
+ "learning_rate": 3.26541949664286e-06,
+ "loss": 0.4766,
+ "step": 5148
+ },
+ {
+ "epoch": 2.43451536643026,
+ "grad_norm": 2.8246724605560303,
+ "learning_rate": 3.26482560353501e-06,
+ "loss": 0.3728,
+ "step": 5149
+ },
+ {
+ "epoch": 2.434988179669031,
+ "grad_norm": 2.4923641681671143,
+ "learning_rate": 3.264231662804823e-06,
+ "loss": 0.4346,
+ "step": 5150
+ },
+ {
+ "epoch": 2.4354609929078013,
+ "grad_norm": 3.180874824523926,
+ "learning_rate": 3.2636376744892827e-06,
+ "loss": 0.4351,
+ "step": 5151
+ },
+ {
+ "epoch": 2.435933806146572,
+ "grad_norm": 2.6933515071868896,
+ "learning_rate": 3.263043638625373e-06,
+ "loss": 0.4293,
+ "step": 5152
+ },
+ {
+ "epoch": 2.436406619385343,
+ "grad_norm": 2.584132194519043,
+ "learning_rate": 3.262449555250081e-06,
+ "loss": 0.4589,
+ "step": 5153
+ },
+ {
+ "epoch": 2.4368794326241137,
+ "grad_norm": 2.8103036880493164,
+ "learning_rate": 3.2618554244003985e-06,
+ "loss": 0.463,
+ "step": 5154
+ },
+ {
+ "epoch": 2.437352245862884,
+ "grad_norm": 2.809070587158203,
+ "learning_rate": 3.2612612461133197e-06,
+ "loss": 0.4629,
+ "step": 5155
+ },
+ {
+ "epoch": 2.437825059101655,
+ "grad_norm": 2.98148512840271,
+ "learning_rate": 3.2606670204258405e-06,
+ "loss": 0.451,
+ "step": 5156
+ },
+ {
+ "epoch": 2.4382978723404256,
+ "grad_norm": 2.691047191619873,
+ "learning_rate": 3.2600727473749614e-06,
+ "loss": 0.3878,
+ "step": 5157
+ },
+ {
+ "epoch": 2.4387706855791964,
+ "grad_norm": 2.900360345840454,
+ "learning_rate": 3.2594784269976856e-06,
+ "loss": 0.4216,
+ "step": 5158
+ },
+ {
+ "epoch": 2.4392434988179668,
+ "grad_norm": 2.8449952602386475,
+ "learning_rate": 3.258884059331019e-06,
+ "loss": 0.4268,
+ "step": 5159
+ },
+ {
+ "epoch": 2.4397163120567376,
+ "grad_norm": 2.7226388454437256,
+ "learning_rate": 3.258289644411969e-06,
+ "loss": 0.4381,
+ "step": 5160
+ },
+ {
+ "epoch": 2.4401891252955084,
+ "grad_norm": 2.513946056365967,
+ "learning_rate": 3.257695182277547e-06,
+ "loss": 0.4566,
+ "step": 5161
+ },
+ {
+ "epoch": 2.440661938534279,
+ "grad_norm": 2.9941394329071045,
+ "learning_rate": 3.2571006729647693e-06,
+ "loss": 0.4395,
+ "step": 5162
+ },
+ {
+ "epoch": 2.4411347517730495,
+ "grad_norm": 2.699094533920288,
+ "learning_rate": 3.2565061165106523e-06,
+ "loss": 0.4274,
+ "step": 5163
+ },
+ {
+ "epoch": 2.4416075650118203,
+ "grad_norm": 2.574193000793457,
+ "learning_rate": 3.255911512952216e-06,
+ "loss": 0.4187,
+ "step": 5164
+ },
+ {
+ "epoch": 2.442080378250591,
+ "grad_norm": 2.920766592025757,
+ "learning_rate": 3.2553168623264854e-06,
+ "loss": 0.4911,
+ "step": 5165
+ },
+ {
+ "epoch": 2.4425531914893615,
+ "grad_norm": 2.728421926498413,
+ "learning_rate": 3.2547221646704853e-06,
+ "loss": 0.4466,
+ "step": 5166
+ },
+ {
+ "epoch": 2.4430260047281322,
+ "grad_norm": 2.8171417713165283,
+ "learning_rate": 3.254127420021246e-06,
+ "loss": 0.4331,
+ "step": 5167
+ },
+ {
+ "epoch": 2.443498817966903,
+ "grad_norm": 2.4069135189056396,
+ "learning_rate": 3.2535326284157975e-06,
+ "loss": 0.389,
+ "step": 5168
+ },
+ {
+ "epoch": 2.443971631205674,
+ "grad_norm": 2.912405490875244,
+ "learning_rate": 3.2529377898911777e-06,
+ "loss": 0.4681,
+ "step": 5169
+ },
+ {
+ "epoch": 2.4444444444444446,
+ "grad_norm": 2.987558126449585,
+ "learning_rate": 3.2523429044844228e-06,
+ "loss": 0.4715,
+ "step": 5170
+ },
+ {
+ "epoch": 2.444917257683215,
+ "grad_norm": 2.5117199420928955,
+ "learning_rate": 3.251747972232574e-06,
+ "loss": 0.4531,
+ "step": 5171
+ },
+ {
+ "epoch": 2.445390070921986,
+ "grad_norm": 2.5405385494232178,
+ "learning_rate": 3.2511529931726752e-06,
+ "loss": 0.4323,
+ "step": 5172
+ },
+ {
+ "epoch": 2.4458628841607566,
+ "grad_norm": 2.989932060241699,
+ "learning_rate": 3.250557967341773e-06,
+ "loss": 0.4039,
+ "step": 5173
+ },
+ {
+ "epoch": 2.446335697399527,
+ "grad_norm": 2.6331627368927,
+ "learning_rate": 3.2499628947769186e-06,
+ "loss": 0.5147,
+ "step": 5174
+ },
+ {
+ "epoch": 2.4468085106382977,
+ "grad_norm": 2.71699857711792,
+ "learning_rate": 3.249367775515162e-06,
+ "loss": 0.3748,
+ "step": 5175
+ },
+ {
+ "epoch": 2.4472813238770685,
+ "grad_norm": 2.9508471488952637,
+ "learning_rate": 3.2487726095935606e-06,
+ "loss": 0.5145,
+ "step": 5176
+ },
+ {
+ "epoch": 2.4477541371158393,
+ "grad_norm": 2.8276431560516357,
+ "learning_rate": 3.2481773970491713e-06,
+ "loss": 0.4295,
+ "step": 5177
+ },
+ {
+ "epoch": 2.44822695035461,
+ "grad_norm": 2.5500540733337402,
+ "learning_rate": 3.2475821379190565e-06,
+ "loss": 0.4246,
+ "step": 5178
+ },
+ {
+ "epoch": 2.4486997635933805,
+ "grad_norm": 2.845641613006592,
+ "learning_rate": 3.246986832240281e-06,
+ "loss": 0.4211,
+ "step": 5179
+ },
+ {
+ "epoch": 2.4491725768321513,
+ "grad_norm": 3.1215856075286865,
+ "learning_rate": 3.2463914800499097e-06,
+ "loss": 0.4378,
+ "step": 5180
+ },
+ {
+ "epoch": 2.449645390070922,
+ "grad_norm": 2.4685606956481934,
+ "learning_rate": 3.2457960813850137e-06,
+ "loss": 0.4836,
+ "step": 5181
+ },
+ {
+ "epoch": 2.4501182033096924,
+ "grad_norm": 2.508028268814087,
+ "learning_rate": 3.245200636282666e-06,
+ "loss": 0.4377,
+ "step": 5182
+ },
+ {
+ "epoch": 2.4505910165484632,
+ "grad_norm": 2.899949312210083,
+ "learning_rate": 3.244605144779943e-06,
+ "loss": 0.501,
+ "step": 5183
+ },
+ {
+ "epoch": 2.451063829787234,
+ "grad_norm": 2.6494483947753906,
+ "learning_rate": 3.244009606913923e-06,
+ "loss": 0.4255,
+ "step": 5184
+ },
+ {
+ "epoch": 2.451536643026005,
+ "grad_norm": 2.4363760948181152,
+ "learning_rate": 3.243414022721686e-06,
+ "loss": 0.4402,
+ "step": 5185
+ },
+ {
+ "epoch": 2.4520094562647756,
+ "grad_norm": 2.4725022315979004,
+ "learning_rate": 3.242818392240317e-06,
+ "loss": 0.4388,
+ "step": 5186
+ },
+ {
+ "epoch": 2.452482269503546,
+ "grad_norm": 2.7010514736175537,
+ "learning_rate": 3.242222715506905e-06,
+ "loss": 0.4388,
+ "step": 5187
+ },
+ {
+ "epoch": 2.4529550827423168,
+ "grad_norm": 2.811464548110962,
+ "learning_rate": 3.241626992558539e-06,
+ "loss": 0.4634,
+ "step": 5188
+ },
+ {
+ "epoch": 2.4534278959810876,
+ "grad_norm": 2.6473052501678467,
+ "learning_rate": 3.2410312234323123e-06,
+ "loss": 0.4752,
+ "step": 5189
+ },
+ {
+ "epoch": 2.453900709219858,
+ "grad_norm": 2.5587213039398193,
+ "learning_rate": 3.24043540816532e-06,
+ "loss": 0.4458,
+ "step": 5190
+ },
+ {
+ "epoch": 2.4543735224586287,
+ "grad_norm": 2.6306557655334473,
+ "learning_rate": 3.239839546794662e-06,
+ "loss": 0.4081,
+ "step": 5191
+ },
+ {
+ "epoch": 2.4548463356973995,
+ "grad_norm": 2.4613633155822754,
+ "learning_rate": 3.23924363935744e-06,
+ "loss": 0.4165,
+ "step": 5192
+ },
+ {
+ "epoch": 2.4553191489361703,
+ "grad_norm": 2.7189204692840576,
+ "learning_rate": 3.238647685890757e-06,
+ "loss": 0.4822,
+ "step": 5193
+ },
+ {
+ "epoch": 2.455791962174941,
+ "grad_norm": 3.015977382659912,
+ "learning_rate": 3.238051686431722e-06,
+ "loss": 0.4964,
+ "step": 5194
+ },
+ {
+ "epoch": 2.4562647754137115,
+ "grad_norm": 2.8868937492370605,
+ "learning_rate": 3.2374556410174445e-06,
+ "loss": 0.4514,
+ "step": 5195
+ },
+ {
+ "epoch": 2.4567375886524823,
+ "grad_norm": 2.7959537506103516,
+ "learning_rate": 3.2368595496850375e-06,
+ "loss": 0.475,
+ "step": 5196
+ },
+ {
+ "epoch": 2.457210401891253,
+ "grad_norm": 3.0086777210235596,
+ "learning_rate": 3.2362634124716187e-06,
+ "loss": 0.4913,
+ "step": 5197
+ },
+ {
+ "epoch": 2.4576832151300234,
+ "grad_norm": 2.621335506439209,
+ "learning_rate": 3.2356672294143044e-06,
+ "loss": 0.4259,
+ "step": 5198
+ },
+ {
+ "epoch": 2.458156028368794,
+ "grad_norm": 3.1620380878448486,
+ "learning_rate": 3.235071000550218e-06,
+ "loss": 0.451,
+ "step": 5199
+ },
+ {
+ "epoch": 2.458628841607565,
+ "grad_norm": 2.7663278579711914,
+ "learning_rate": 3.234474725916484e-06,
+ "loss": 0.3854,
+ "step": 5200
+ },
+ {
+ "epoch": 2.459101654846336,
+ "grad_norm": 2.5187132358551025,
+ "learning_rate": 3.2338784055502288e-06,
+ "loss": 0.4068,
+ "step": 5201
+ },
+ {
+ "epoch": 2.4595744680851066,
+ "grad_norm": 2.6022701263427734,
+ "learning_rate": 3.233282039488583e-06,
+ "loss": 0.4484,
+ "step": 5202
+ },
+ {
+ "epoch": 2.460047281323877,
+ "grad_norm": 2.874750852584839,
+ "learning_rate": 3.2326856277686807e-06,
+ "loss": 0.45,
+ "step": 5203
+ },
+ {
+ "epoch": 2.4605200945626478,
+ "grad_norm": 2.671008586883545,
+ "learning_rate": 3.232089170427656e-06,
+ "loss": 0.4446,
+ "step": 5204
+ },
+ {
+ "epoch": 2.4609929078014185,
+ "grad_norm": 2.7365503311157227,
+ "learning_rate": 3.2314926675026498e-06,
+ "loss": 0.4402,
+ "step": 5205
+ },
+ {
+ "epoch": 2.461465721040189,
+ "grad_norm": 2.8163657188415527,
+ "learning_rate": 3.230896119030803e-06,
+ "loss": 0.3881,
+ "step": 5206
+ },
+ {
+ "epoch": 2.4619385342789597,
+ "grad_norm": 2.812433958053589,
+ "learning_rate": 3.2302995250492584e-06,
+ "loss": 0.4897,
+ "step": 5207
+ },
+ {
+ "epoch": 2.4624113475177305,
+ "grad_norm": 2.786033868789673,
+ "learning_rate": 3.2297028855951664e-06,
+ "loss": 0.4069,
+ "step": 5208
+ },
+ {
+ "epoch": 2.4628841607565013,
+ "grad_norm": 3.0247974395751953,
+ "learning_rate": 3.229106200705674e-06,
+ "loss": 0.4048,
+ "step": 5209
+ },
+ {
+ "epoch": 2.463356973995272,
+ "grad_norm": 3.3280487060546875,
+ "learning_rate": 3.2285094704179353e-06,
+ "loss": 0.5613,
+ "step": 5210
+ },
+ {
+ "epoch": 2.4638297872340424,
+ "grad_norm": 2.603219985961914,
+ "learning_rate": 3.2279126947691073e-06,
+ "loss": 0.432,
+ "step": 5211
+ },
+ {
+ "epoch": 2.4643026004728132,
+ "grad_norm": 3.1532180309295654,
+ "learning_rate": 3.2273158737963472e-06,
+ "loss": 0.4602,
+ "step": 5212
+ },
+ {
+ "epoch": 2.464775413711584,
+ "grad_norm": 2.7512969970703125,
+ "learning_rate": 3.2267190075368164e-06,
+ "loss": 0.5064,
+ "step": 5213
+ },
+ {
+ "epoch": 2.4652482269503544,
+ "grad_norm": 2.926992177963257,
+ "learning_rate": 3.22612209602768e-06,
+ "loss": 0.4753,
+ "step": 5214
+ },
+ {
+ "epoch": 2.465721040189125,
+ "grad_norm": 4.052840709686279,
+ "learning_rate": 3.2255251393061047e-06,
+ "loss": 0.5235,
+ "step": 5215
+ },
+ {
+ "epoch": 2.466193853427896,
+ "grad_norm": 2.8266959190368652,
+ "learning_rate": 3.2249281374092606e-06,
+ "loss": 0.3931,
+ "step": 5216
+ },
+ {
+ "epoch": 2.466666666666667,
+ "grad_norm": 2.564359426498413,
+ "learning_rate": 3.2243310903743196e-06,
+ "loss": 0.4146,
+ "step": 5217
+ },
+ {
+ "epoch": 2.4671394799054376,
+ "grad_norm": 2.387925148010254,
+ "learning_rate": 3.2237339982384576e-06,
+ "loss": 0.4142,
+ "step": 5218
+ },
+ {
+ "epoch": 2.467612293144208,
+ "grad_norm": 2.7045164108276367,
+ "learning_rate": 3.223136861038853e-06,
+ "loss": 0.4345,
+ "step": 5219
+ },
+ {
+ "epoch": 2.4680851063829787,
+ "grad_norm": 2.6963284015655518,
+ "learning_rate": 3.2225396788126872e-06,
+ "loss": 0.4243,
+ "step": 5220
+ },
+ {
+ "epoch": 2.4685579196217495,
+ "grad_norm": 2.8247268199920654,
+ "learning_rate": 3.221942451597144e-06,
+ "loss": 0.3919,
+ "step": 5221
+ },
+ {
+ "epoch": 2.46903073286052,
+ "grad_norm": 3.843836784362793,
+ "learning_rate": 3.2213451794294093e-06,
+ "loss": 0.4183,
+ "step": 5222
+ },
+ {
+ "epoch": 2.4695035460992907,
+ "grad_norm": 2.8579909801483154,
+ "learning_rate": 3.220747862346674e-06,
+ "loss": 0.4844,
+ "step": 5223
+ },
+ {
+ "epoch": 2.4699763593380615,
+ "grad_norm": 3.744027853012085,
+ "learning_rate": 3.2201505003861294e-06,
+ "loss": 0.4563,
+ "step": 5224
+ },
+ {
+ "epoch": 2.4704491725768323,
+ "grad_norm": 2.835108995437622,
+ "learning_rate": 3.219553093584971e-06,
+ "loss": 0.4394,
+ "step": 5225
+ },
+ {
+ "epoch": 2.470921985815603,
+ "grad_norm": 2.5681865215301514,
+ "learning_rate": 3.218955641980397e-06,
+ "loss": 0.3907,
+ "step": 5226
+ },
+ {
+ "epoch": 2.4713947990543734,
+ "grad_norm": 2.963172674179077,
+ "learning_rate": 3.2183581456096067e-06,
+ "loss": 0.5163,
+ "step": 5227
+ },
+ {
+ "epoch": 2.4718676122931442,
+ "grad_norm": 2.7840685844421387,
+ "learning_rate": 3.2177606045098047e-06,
+ "loss": 0.411,
+ "step": 5228
+ },
+ {
+ "epoch": 2.472340425531915,
+ "grad_norm": 2.7849979400634766,
+ "learning_rate": 3.2171630187181977e-06,
+ "loss": 0.4671,
+ "step": 5229
+ },
+ {
+ "epoch": 2.4728132387706854,
+ "grad_norm": 2.736406087875366,
+ "learning_rate": 3.216565388271994e-06,
+ "loss": 0.5225,
+ "step": 5230
+ },
+ {
+ "epoch": 2.473286052009456,
+ "grad_norm": 2.978271007537842,
+ "learning_rate": 3.215967713208406e-06,
+ "loss": 0.4668,
+ "step": 5231
+ },
+ {
+ "epoch": 2.473758865248227,
+ "grad_norm": 2.687560796737671,
+ "learning_rate": 3.2153699935646475e-06,
+ "loss": 0.4683,
+ "step": 5232
+ },
+ {
+ "epoch": 2.4742316784869978,
+ "grad_norm": 2.7096521854400635,
+ "learning_rate": 3.214772229377936e-06,
+ "loss": 0.4999,
+ "step": 5233
+ },
+ {
+ "epoch": 2.4747044917257686,
+ "grad_norm": 3.1861157417297363,
+ "learning_rate": 3.214174420685493e-06,
+ "loss": 0.4365,
+ "step": 5234
+ },
+ {
+ "epoch": 2.475177304964539,
+ "grad_norm": 2.623061418533325,
+ "learning_rate": 3.2135765675245394e-06,
+ "loss": 0.3717,
+ "step": 5235
+ },
+ {
+ "epoch": 2.4756501182033097,
+ "grad_norm": 2.680921792984009,
+ "learning_rate": 3.2129786699323016e-06,
+ "loss": 0.4688,
+ "step": 5236
+ },
+ {
+ "epoch": 2.4761229314420805,
+ "grad_norm": 2.80426025390625,
+ "learning_rate": 3.2123807279460096e-06,
+ "loss": 0.5043,
+ "step": 5237
+ },
+ {
+ "epoch": 2.476595744680851,
+ "grad_norm": 2.676156997680664,
+ "learning_rate": 3.211782741602893e-06,
+ "loss": 0.4486,
+ "step": 5238
+ },
+ {
+ "epoch": 2.4770685579196217,
+ "grad_norm": 2.700822591781616,
+ "learning_rate": 3.2111847109401855e-06,
+ "loss": 0.4097,
+ "step": 5239
+ },
+ {
+ "epoch": 2.4775413711583925,
+ "grad_norm": 2.735387086868286,
+ "learning_rate": 3.2105866359951254e-06,
+ "loss": 0.4357,
+ "step": 5240
+ },
+ {
+ "epoch": 2.4780141843971633,
+ "grad_norm": 2.961874485015869,
+ "learning_rate": 3.2099885168049507e-06,
+ "loss": 0.4942,
+ "step": 5241
+ },
+ {
+ "epoch": 2.478486997635934,
+ "grad_norm": 2.546588659286499,
+ "learning_rate": 3.209390353406904e-06,
+ "loss": 0.3852,
+ "step": 5242
+ },
+ {
+ "epoch": 2.4789598108747044,
+ "grad_norm": 2.6269772052764893,
+ "learning_rate": 3.208792145838231e-06,
+ "loss": 0.3935,
+ "step": 5243
+ },
+ {
+ "epoch": 2.479432624113475,
+ "grad_norm": 2.9009883403778076,
+ "learning_rate": 3.208193894136179e-06,
+ "loss": 0.4003,
+ "step": 5244
+ },
+ {
+ "epoch": 2.479905437352246,
+ "grad_norm": 2.772834300994873,
+ "learning_rate": 3.2075955983379982e-06,
+ "loss": 0.4742,
+ "step": 5245
+ },
+ {
+ "epoch": 2.4803782505910164,
+ "grad_norm": 2.728703737258911,
+ "learning_rate": 3.2069972584809423e-06,
+ "loss": 0.4405,
+ "step": 5246
+ },
+ {
+ "epoch": 2.480851063829787,
+ "grad_norm": 2.72868275642395,
+ "learning_rate": 3.206398874602268e-06,
+ "loss": 0.4714,
+ "step": 5247
+ },
+ {
+ "epoch": 2.481323877068558,
+ "grad_norm": 2.6804213523864746,
+ "learning_rate": 3.2058004467392323e-06,
+ "loss": 0.4106,
+ "step": 5248
+ },
+ {
+ "epoch": 2.4817966903073287,
+ "grad_norm": 2.6740739345550537,
+ "learning_rate": 3.205201974929098e-06,
+ "loss": 0.3855,
+ "step": 5249
+ },
+ {
+ "epoch": 2.482269503546099,
+ "grad_norm": 2.8131754398345947,
+ "learning_rate": 3.204603459209129e-06,
+ "loss": 0.418,
+ "step": 5250
+ },
+ {
+ "epoch": 2.48274231678487,
+ "grad_norm": 2.5242888927459717,
+ "learning_rate": 3.204004899616592e-06,
+ "loss": 0.4914,
+ "step": 5251
+ },
+ {
+ "epoch": 2.4832151300236407,
+ "grad_norm": 2.969191551208496,
+ "learning_rate": 3.2034062961887567e-06,
+ "loss": 0.4634,
+ "step": 5252
+ },
+ {
+ "epoch": 2.4836879432624115,
+ "grad_norm": 2.967968463897705,
+ "learning_rate": 3.2028076489628963e-06,
+ "loss": 0.456,
+ "step": 5253
+ },
+ {
+ "epoch": 2.484160756501182,
+ "grad_norm": 2.9006540775299072,
+ "learning_rate": 3.2022089579762845e-06,
+ "loss": 0.4203,
+ "step": 5254
+ },
+ {
+ "epoch": 2.4846335697399526,
+ "grad_norm": 2.6377336978912354,
+ "learning_rate": 3.2016102232662003e-06,
+ "loss": 0.4518,
+ "step": 5255
+ },
+ {
+ "epoch": 2.4851063829787234,
+ "grad_norm": 2.757749319076538,
+ "learning_rate": 3.201011444869925e-06,
+ "loss": 0.4314,
+ "step": 5256
+ },
+ {
+ "epoch": 2.4855791962174942,
+ "grad_norm": 2.571560859680176,
+ "learning_rate": 3.20041262282474e-06,
+ "loss": 0.427,
+ "step": 5257
+ },
+ {
+ "epoch": 2.4860520094562646,
+ "grad_norm": 3.1367194652557373,
+ "learning_rate": 3.1998137571679316e-06,
+ "loss": 0.4901,
+ "step": 5258
+ },
+ {
+ "epoch": 2.4865248226950354,
+ "grad_norm": 3.194042205810547,
+ "learning_rate": 3.1992148479367896e-06,
+ "loss": 0.466,
+ "step": 5259
+ },
+ {
+ "epoch": 2.486997635933806,
+ "grad_norm": 2.5546324253082275,
+ "learning_rate": 3.1986158951686052e-06,
+ "loss": 0.4182,
+ "step": 5260
+ },
+ {
+ "epoch": 2.487470449172577,
+ "grad_norm": 2.919783115386963,
+ "learning_rate": 3.198016898900672e-06,
+ "loss": 0.4234,
+ "step": 5261
+ },
+ {
+ "epoch": 2.4879432624113473,
+ "grad_norm": 2.865248918533325,
+ "learning_rate": 3.1974178591702877e-06,
+ "loss": 0.4291,
+ "step": 5262
+ },
+ {
+ "epoch": 2.488416075650118,
+ "grad_norm": 2.685737133026123,
+ "learning_rate": 3.196818776014752e-06,
+ "loss": 0.4548,
+ "step": 5263
+ },
+ {
+ "epoch": 2.488888888888889,
+ "grad_norm": 2.826974630355835,
+ "learning_rate": 3.196219649471365e-06,
+ "loss": 0.4152,
+ "step": 5264
+ },
+ {
+ "epoch": 2.4893617021276597,
+ "grad_norm": 2.764975070953369,
+ "learning_rate": 3.1956204795774336e-06,
+ "loss": 0.5209,
+ "step": 5265
+ },
+ {
+ "epoch": 2.48983451536643,
+ "grad_norm": 2.4184255599975586,
+ "learning_rate": 3.1950212663702662e-06,
+ "loss": 0.3969,
+ "step": 5266
+ },
+ {
+ "epoch": 2.490307328605201,
+ "grad_norm": 2.9361133575439453,
+ "learning_rate": 3.1944220098871713e-06,
+ "loss": 0.4589,
+ "step": 5267
+ },
+ {
+ "epoch": 2.4907801418439717,
+ "grad_norm": 2.377051830291748,
+ "learning_rate": 3.193822710165463e-06,
+ "loss": 0.4328,
+ "step": 5268
+ },
+ {
+ "epoch": 2.4912529550827425,
+ "grad_norm": 3.1302497386932373,
+ "learning_rate": 3.1932233672424563e-06,
+ "loss": 0.3918,
+ "step": 5269
+ },
+ {
+ "epoch": 2.491725768321513,
+ "grad_norm": 2.89577579498291,
+ "learning_rate": 3.192623981155471e-06,
+ "loss": 0.5004,
+ "step": 5270
+ },
+ {
+ "epoch": 2.4921985815602836,
+ "grad_norm": 2.7735235691070557,
+ "learning_rate": 3.1920245519418273e-06,
+ "loss": 0.4206,
+ "step": 5271
+ },
+ {
+ "epoch": 2.4926713947990544,
+ "grad_norm": 2.5424516201019287,
+ "learning_rate": 3.1914250796388493e-06,
+ "loss": 0.4419,
+ "step": 5272
+ },
+ {
+ "epoch": 2.493144208037825,
+ "grad_norm": 3.1216981410980225,
+ "learning_rate": 3.1908255642838628e-06,
+ "loss": 0.4552,
+ "step": 5273
+ },
+ {
+ "epoch": 2.4936170212765956,
+ "grad_norm": 3.044045925140381,
+ "learning_rate": 3.1902260059141978e-06,
+ "loss": 0.4967,
+ "step": 5274
+ },
+ {
+ "epoch": 2.4940898345153664,
+ "grad_norm": 2.5630741119384766,
+ "learning_rate": 3.189626404567186e-06,
+ "loss": 0.3908,
+ "step": 5275
+ },
+ {
+ "epoch": 2.494562647754137,
+ "grad_norm": 2.7177648544311523,
+ "learning_rate": 3.189026760280162e-06,
+ "loss": 0.4915,
+ "step": 5276
+ },
+ {
+ "epoch": 2.495035460992908,
+ "grad_norm": 2.653416395187378,
+ "learning_rate": 3.1884270730904632e-06,
+ "loss": 0.4633,
+ "step": 5277
+ },
+ {
+ "epoch": 2.4955082742316783,
+ "grad_norm": 3.7212321758270264,
+ "learning_rate": 3.1878273430354284e-06,
+ "loss": 0.4549,
+ "step": 5278
+ },
+ {
+ "epoch": 2.495981087470449,
+ "grad_norm": 2.4152729511260986,
+ "learning_rate": 3.187227570152402e-06,
+ "loss": 0.4674,
+ "step": 5279
+ },
+ {
+ "epoch": 2.49645390070922,
+ "grad_norm": 2.5354862213134766,
+ "learning_rate": 3.1866277544787284e-06,
+ "loss": 0.4135,
+ "step": 5280
+ },
+ {
+ "epoch": 2.4969267139479907,
+ "grad_norm": 3.1766583919525146,
+ "learning_rate": 3.186027896051754e-06,
+ "loss": 0.5656,
+ "step": 5281
+ },
+ {
+ "epoch": 2.497399527186761,
+ "grad_norm": 2.5636754035949707,
+ "learning_rate": 3.1854279949088313e-06,
+ "loss": 0.4138,
+ "step": 5282
+ },
+ {
+ "epoch": 2.497872340425532,
+ "grad_norm": 2.7615602016448975,
+ "learning_rate": 3.1848280510873124e-06,
+ "loss": 0.4936,
+ "step": 5283
+ },
+ {
+ "epoch": 2.4983451536643027,
+ "grad_norm": 2.964721918106079,
+ "learning_rate": 3.1842280646245543e-06,
+ "loss": 0.4865,
+ "step": 5284
+ },
+ {
+ "epoch": 2.4988179669030735,
+ "grad_norm": 2.6915178298950195,
+ "learning_rate": 3.1836280355579152e-06,
+ "loss": 0.4179,
+ "step": 5285
+ },
+ {
+ "epoch": 2.499290780141844,
+ "grad_norm": 2.820451259613037,
+ "learning_rate": 3.183027963924755e-06,
+ "loss": 0.4785,
+ "step": 5286
+ },
+ {
+ "epoch": 2.4997635933806146,
+ "grad_norm": 2.841719627380371,
+ "learning_rate": 3.1824278497624393e-06,
+ "loss": 0.4535,
+ "step": 5287
+ },
+ {
+ "epoch": 2.5002364066193854,
+ "grad_norm": 2.459167957305908,
+ "learning_rate": 3.181827693108333e-06,
+ "loss": 0.4353,
+ "step": 5288
+ },
+ {
+ "epoch": 2.500709219858156,
+ "grad_norm": 3.2538363933563232,
+ "learning_rate": 3.1812274939998066e-06,
+ "loss": 0.4037,
+ "step": 5289
+ },
+ {
+ "epoch": 2.5011820330969265,
+ "grad_norm": 2.6980504989624023,
+ "learning_rate": 3.180627252474231e-06,
+ "loss": 0.4181,
+ "step": 5290
+ },
+ {
+ "epoch": 2.5016548463356973,
+ "grad_norm": 2.9400012493133545,
+ "learning_rate": 3.1800269685689804e-06,
+ "loss": 0.4642,
+ "step": 5291
+ },
+ {
+ "epoch": 2.502127659574468,
+ "grad_norm": 2.7832958698272705,
+ "learning_rate": 3.1794266423214328e-06,
+ "loss": 0.3936,
+ "step": 5292
+ },
+ {
+ "epoch": 2.5026004728132385,
+ "grad_norm": 2.4017868041992188,
+ "learning_rate": 3.178826273768967e-06,
+ "loss": 0.3984,
+ "step": 5293
+ },
+ {
+ "epoch": 2.5030732860520093,
+ "grad_norm": 2.398120641708374,
+ "learning_rate": 3.1782258629489665e-06,
+ "loss": 0.4219,
+ "step": 5294
+ },
+ {
+ "epoch": 2.50354609929078,
+ "grad_norm": 2.973947763442993,
+ "learning_rate": 3.177625409898815e-06,
+ "loss": 0.4192,
+ "step": 5295
+ },
+ {
+ "epoch": 2.504018912529551,
+ "grad_norm": 3.1169888973236084,
+ "learning_rate": 3.1770249146559006e-06,
+ "loss": 0.5098,
+ "step": 5296
+ },
+ {
+ "epoch": 2.5044917257683217,
+ "grad_norm": 2.816964864730835,
+ "learning_rate": 3.1764243772576132e-06,
+ "loss": 0.4228,
+ "step": 5297
+ },
+ {
+ "epoch": 2.504964539007092,
+ "grad_norm": 2.5624163150787354,
+ "learning_rate": 3.1758237977413452e-06,
+ "loss": 0.4389,
+ "step": 5298
+ },
+ {
+ "epoch": 2.505437352245863,
+ "grad_norm": 2.7477777004241943,
+ "learning_rate": 3.175223176144494e-06,
+ "loss": 0.4564,
+ "step": 5299
+ },
+ {
+ "epoch": 2.5059101654846336,
+ "grad_norm": 3.1478309631347656,
+ "learning_rate": 3.174622512504456e-06,
+ "loss": 0.4859,
+ "step": 5300
+ },
+ {
+ "epoch": 2.506382978723404,
+ "grad_norm": 2.8400418758392334,
+ "learning_rate": 3.1740218068586315e-06,
+ "loss": 0.4476,
+ "step": 5301
+ },
+ {
+ "epoch": 2.506855791962175,
+ "grad_norm": 2.7097036838531494,
+ "learning_rate": 3.173421059244426e-06,
+ "loss": 0.4559,
+ "step": 5302
+ },
+ {
+ "epoch": 2.5073286052009456,
+ "grad_norm": 2.864760637283325,
+ "learning_rate": 3.172820269699243e-06,
+ "loss": 0.5124,
+ "step": 5303
+ },
+ {
+ "epoch": 2.5078014184397164,
+ "grad_norm": 2.877110004425049,
+ "learning_rate": 3.1722194382604926e-06,
+ "loss": 0.5083,
+ "step": 5304
+ },
+ {
+ "epoch": 2.508274231678487,
+ "grad_norm": 3.2369656562805176,
+ "learning_rate": 3.1716185649655844e-06,
+ "loss": 0.4894,
+ "step": 5305
+ },
+ {
+ "epoch": 2.5087470449172575,
+ "grad_norm": 2.7377753257751465,
+ "learning_rate": 3.171017649851934e-06,
+ "loss": 0.4324,
+ "step": 5306
+ },
+ {
+ "epoch": 2.5092198581560283,
+ "grad_norm": 2.883364200592041,
+ "learning_rate": 3.1704166929569564e-06,
+ "loss": 0.3731,
+ "step": 5307
+ },
+ {
+ "epoch": 2.509692671394799,
+ "grad_norm": 2.5724737644195557,
+ "learning_rate": 3.1698156943180716e-06,
+ "loss": 0.4768,
+ "step": 5308
+ },
+ {
+ "epoch": 2.5101654846335695,
+ "grad_norm": 2.7532460689544678,
+ "learning_rate": 3.1692146539727e-06,
+ "loss": 0.4385,
+ "step": 5309
+ },
+ {
+ "epoch": 2.5106382978723403,
+ "grad_norm": 2.786505699157715,
+ "learning_rate": 3.168613571958267e-06,
+ "loss": 0.4241,
+ "step": 5310
+ },
+ {
+ "epoch": 2.511111111111111,
+ "grad_norm": 3.1674118041992188,
+ "learning_rate": 3.1680124483121975e-06,
+ "loss": 0.4445,
+ "step": 5311
+ },
+ {
+ "epoch": 2.511583924349882,
+ "grad_norm": 2.7861545085906982,
+ "learning_rate": 3.167411283071923e-06,
+ "loss": 0.4264,
+ "step": 5312
+ },
+ {
+ "epoch": 2.5120567375886527,
+ "grad_norm": 2.7412493228912354,
+ "learning_rate": 3.1668100762748745e-06,
+ "loss": 0.4725,
+ "step": 5313
+ },
+ {
+ "epoch": 2.512529550827423,
+ "grad_norm": 2.710019588470459,
+ "learning_rate": 3.1662088279584858e-06,
+ "loss": 0.5207,
+ "step": 5314
+ },
+ {
+ "epoch": 2.513002364066194,
+ "grad_norm": 2.694812297821045,
+ "learning_rate": 3.165607538160194e-06,
+ "loss": 0.3666,
+ "step": 5315
+ },
+ {
+ "epoch": 2.5134751773049646,
+ "grad_norm": 2.4390623569488525,
+ "learning_rate": 3.1650062069174405e-06,
+ "loss": 0.4025,
+ "step": 5316
+ },
+ {
+ "epoch": 2.513947990543735,
+ "grad_norm": 3.055738925933838,
+ "learning_rate": 3.1644048342676663e-06,
+ "loss": 0.4288,
+ "step": 5317
+ },
+ {
+ "epoch": 2.5144208037825058,
+ "grad_norm": 3.065824508666992,
+ "learning_rate": 3.163803420248316e-06,
+ "loss": 0.4592,
+ "step": 5318
+ },
+ {
+ "epoch": 2.5148936170212766,
+ "grad_norm": 2.6011085510253906,
+ "learning_rate": 3.163201964896838e-06,
+ "loss": 0.4081,
+ "step": 5319
+ },
+ {
+ "epoch": 2.5153664302600474,
+ "grad_norm": 2.4833033084869385,
+ "learning_rate": 3.162600468250681e-06,
+ "loss": 0.4343,
+ "step": 5320
+ },
+ {
+ "epoch": 2.515839243498818,
+ "grad_norm": 2.9035534858703613,
+ "learning_rate": 3.161998930347299e-06,
+ "loss": 0.4972,
+ "step": 5321
+ },
+ {
+ "epoch": 2.5163120567375885,
+ "grad_norm": 2.788752317428589,
+ "learning_rate": 3.161397351224146e-06,
+ "loss": 0.4597,
+ "step": 5322
+ },
+ {
+ "epoch": 2.5167848699763593,
+ "grad_norm": 2.4344491958618164,
+ "learning_rate": 3.16079573091868e-06,
+ "loss": 0.359,
+ "step": 5323
+ },
+ {
+ "epoch": 2.51725768321513,
+ "grad_norm": 2.750150680541992,
+ "learning_rate": 3.160194069468361e-06,
+ "loss": 0.4596,
+ "step": 5324
+ },
+ {
+ "epoch": 2.5177304964539005,
+ "grad_norm": 2.826902389526367,
+ "learning_rate": 3.1595923669106526e-06,
+ "loss": 0.4377,
+ "step": 5325
+ },
+ {
+ "epoch": 2.5182033096926713,
+ "grad_norm": 2.554439067840576,
+ "learning_rate": 3.15899062328302e-06,
+ "loss": 0.4517,
+ "step": 5326
+ },
+ {
+ "epoch": 2.518676122931442,
+ "grad_norm": 3.0882742404937744,
+ "learning_rate": 3.158388838622931e-06,
+ "loss": 0.47,
+ "step": 5327
+ },
+ {
+ "epoch": 2.519148936170213,
+ "grad_norm": 2.918947696685791,
+ "learning_rate": 3.157787012967856e-06,
+ "loss": 0.522,
+ "step": 5328
+ },
+ {
+ "epoch": 2.5196217494089836,
+ "grad_norm": 2.8057637214660645,
+ "learning_rate": 3.1571851463552674e-06,
+ "loss": 0.4837,
+ "step": 5329
+ },
+ {
+ "epoch": 2.520094562647754,
+ "grad_norm": 2.66241455078125,
+ "learning_rate": 3.156583238822641e-06,
+ "loss": 0.3988,
+ "step": 5330
+ },
+ {
+ "epoch": 2.520567375886525,
+ "grad_norm": 2.9793803691864014,
+ "learning_rate": 3.155981290407456e-06,
+ "loss": 0.4737,
+ "step": 5331
+ },
+ {
+ "epoch": 2.5210401891252956,
+ "grad_norm": 2.847522258758545,
+ "learning_rate": 3.1553793011471924e-06,
+ "loss": 0.4394,
+ "step": 5332
+ },
+ {
+ "epoch": 2.521513002364066,
+ "grad_norm": 2.9561474323272705,
+ "learning_rate": 3.154777271079333e-06,
+ "loss": 0.47,
+ "step": 5333
+ },
+ {
+ "epoch": 2.5219858156028367,
+ "grad_norm": 2.8353018760681152,
+ "learning_rate": 3.154175200241365e-06,
+ "loss": 0.4015,
+ "step": 5334
+ },
+ {
+ "epoch": 2.5224586288416075,
+ "grad_norm": 2.609049081802368,
+ "learning_rate": 3.153573088670775e-06,
+ "loss": 0.4723,
+ "step": 5335
+ },
+ {
+ "epoch": 2.5229314420803783,
+ "grad_norm": 2.8538455963134766,
+ "learning_rate": 3.1529709364050556e-06,
+ "loss": 0.4665,
+ "step": 5336
+ },
+ {
+ "epoch": 2.523404255319149,
+ "grad_norm": 2.768310785293579,
+ "learning_rate": 3.1523687434816978e-06,
+ "loss": 0.4933,
+ "step": 5337
+ },
+ {
+ "epoch": 2.5238770685579195,
+ "grad_norm": 2.9300906658172607,
+ "learning_rate": 3.1517665099382e-06,
+ "loss": 0.4651,
+ "step": 5338
+ },
+ {
+ "epoch": 2.5243498817966903,
+ "grad_norm": 2.6984703540802,
+ "learning_rate": 3.1511642358120585e-06,
+ "loss": 0.4442,
+ "step": 5339
+ },
+ {
+ "epoch": 2.524822695035461,
+ "grad_norm": 2.8148467540740967,
+ "learning_rate": 3.1505619211407762e-06,
+ "loss": 0.4611,
+ "step": 5340
+ },
+ {
+ "epoch": 2.5252955082742314,
+ "grad_norm": 2.816436290740967,
+ "learning_rate": 3.1499595659618556e-06,
+ "loss": 0.5291,
+ "step": 5341
+ },
+ {
+ "epoch": 2.5257683215130022,
+ "grad_norm": 2.902805805206299,
+ "learning_rate": 3.149357170312802e-06,
+ "loss": 0.4394,
+ "step": 5342
+ },
+ {
+ "epoch": 2.526241134751773,
+ "grad_norm": 2.6443474292755127,
+ "learning_rate": 3.148754734231126e-06,
+ "loss": 0.4444,
+ "step": 5343
+ },
+ {
+ "epoch": 2.526713947990544,
+ "grad_norm": 2.6818583011627197,
+ "learning_rate": 3.148152257754336e-06,
+ "loss": 0.4256,
+ "step": 5344
+ },
+ {
+ "epoch": 2.5271867612293146,
+ "grad_norm": 2.5266945362091064,
+ "learning_rate": 3.1475497409199485e-06,
+ "loss": 0.4087,
+ "step": 5345
+ },
+ {
+ "epoch": 2.527659574468085,
+ "grad_norm": 2.6326711177825928,
+ "learning_rate": 3.146947183765477e-06,
+ "loss": 0.3842,
+ "step": 5346
+ },
+ {
+ "epoch": 2.5281323877068558,
+ "grad_norm": 3.122880697250366,
+ "learning_rate": 3.1463445863284413e-06,
+ "loss": 0.482,
+ "step": 5347
+ },
+ {
+ "epoch": 2.5286052009456266,
+ "grad_norm": 2.819258213043213,
+ "learning_rate": 3.145741948646362e-06,
+ "loss": 0.4628,
+ "step": 5348
+ },
+ {
+ "epoch": 2.529078014184397,
+ "grad_norm": 2.5842230319976807,
+ "learning_rate": 3.145139270756764e-06,
+ "loss": 0.4479,
+ "step": 5349
+ },
+ {
+ "epoch": 2.5295508274231677,
+ "grad_norm": 2.7257237434387207,
+ "learning_rate": 3.144536552697172e-06,
+ "loss": 0.473,
+ "step": 5350
+ },
+ {
+ "epoch": 2.5300236406619385,
+ "grad_norm": 2.6876981258392334,
+ "learning_rate": 3.143933794505115e-06,
+ "loss": 0.4615,
+ "step": 5351
+ },
+ {
+ "epoch": 2.5304964539007093,
+ "grad_norm": 2.7942895889282227,
+ "learning_rate": 3.143330996218124e-06,
+ "loss": 0.4982,
+ "step": 5352
+ },
+ {
+ "epoch": 2.53096926713948,
+ "grad_norm": 2.3150579929351807,
+ "learning_rate": 3.1427281578737327e-06,
+ "loss": 0.3905,
+ "step": 5353
+ },
+ {
+ "epoch": 2.5314420803782505,
+ "grad_norm": 2.7326138019561768,
+ "learning_rate": 3.142125279509478e-06,
+ "loss": 0.4076,
+ "step": 5354
+ },
+ {
+ "epoch": 2.5319148936170213,
+ "grad_norm": 2.46362566947937,
+ "learning_rate": 3.1415223611628976e-06,
+ "loss": 0.4043,
+ "step": 5355
+ },
+ {
+ "epoch": 2.532387706855792,
+ "grad_norm": 2.6670427322387695,
+ "learning_rate": 3.1409194028715323e-06,
+ "loss": 0.484,
+ "step": 5356
+ },
+ {
+ "epoch": 2.5328605200945624,
+ "grad_norm": 2.917771100997925,
+ "learning_rate": 3.140316404672926e-06,
+ "loss": 0.4539,
+ "step": 5357
+ },
+ {
+ "epoch": 2.533333333333333,
+ "grad_norm": 2.7964110374450684,
+ "learning_rate": 3.1397133666046254e-06,
+ "loss": 0.4706,
+ "step": 5358
+ },
+ {
+ "epoch": 2.533806146572104,
+ "grad_norm": 2.6481330394744873,
+ "learning_rate": 3.139110288704179e-06,
+ "loss": 0.4101,
+ "step": 5359
+ },
+ {
+ "epoch": 2.534278959810875,
+ "grad_norm": 2.859452962875366,
+ "learning_rate": 3.1385071710091365e-06,
+ "loss": 0.4842,
+ "step": 5360
+ },
+ {
+ "epoch": 2.5347517730496456,
+ "grad_norm": 2.686077356338501,
+ "learning_rate": 3.137904013557052e-06,
+ "loss": 0.4073,
+ "step": 5361
+ },
+ {
+ "epoch": 2.535224586288416,
+ "grad_norm": 3.7147045135498047,
+ "learning_rate": 3.137300816385482e-06,
+ "loss": 0.4536,
+ "step": 5362
+ },
+ {
+ "epoch": 2.5356973995271868,
+ "grad_norm": 2.51054048538208,
+ "learning_rate": 3.1366975795319856e-06,
+ "loss": 0.4171,
+ "step": 5363
+ },
+ {
+ "epoch": 2.5361702127659576,
+ "grad_norm": 3.043149471282959,
+ "learning_rate": 3.136094303034121e-06,
+ "loss": 0.5179,
+ "step": 5364
+ },
+ {
+ "epoch": 2.536643026004728,
+ "grad_norm": 2.398878812789917,
+ "learning_rate": 3.1354909869294548e-06,
+ "loss": 0.4144,
+ "step": 5365
+ },
+ {
+ "epoch": 2.5371158392434987,
+ "grad_norm": 2.969712257385254,
+ "learning_rate": 3.134887631255551e-06,
+ "loss": 0.3983,
+ "step": 5366
+ },
+ {
+ "epoch": 2.5375886524822695,
+ "grad_norm": 2.7707982063293457,
+ "learning_rate": 3.134284236049978e-06,
+ "loss": 0.4405,
+ "step": 5367
+ },
+ {
+ "epoch": 2.5380614657210403,
+ "grad_norm": 2.579742193222046,
+ "learning_rate": 3.1336808013503073e-06,
+ "loss": 0.4402,
+ "step": 5368
+ },
+ {
+ "epoch": 2.538534278959811,
+ "grad_norm": 2.6041927337646484,
+ "learning_rate": 3.1330773271941113e-06,
+ "loss": 0.396,
+ "step": 5369
+ },
+ {
+ "epoch": 2.5390070921985815,
+ "grad_norm": 2.7383856773376465,
+ "learning_rate": 3.1324738136189658e-06,
+ "loss": 0.4424,
+ "step": 5370
+ },
+ {
+ "epoch": 2.5394799054373522,
+ "grad_norm": 3.053644895553589,
+ "learning_rate": 3.13187026066245e-06,
+ "loss": 0.473,
+ "step": 5371
+ },
+ {
+ "epoch": 2.539952718676123,
+ "grad_norm": 2.684244155883789,
+ "learning_rate": 3.1312666683621428e-06,
+ "loss": 0.3963,
+ "step": 5372
+ },
+ {
+ "epoch": 2.5404255319148934,
+ "grad_norm": 2.6505017280578613,
+ "learning_rate": 3.130663036755629e-06,
+ "loss": 0.4292,
+ "step": 5373
+ },
+ {
+ "epoch": 2.540898345153664,
+ "grad_norm": 3.025965929031372,
+ "learning_rate": 3.1300593658804935e-06,
+ "loss": 0.4539,
+ "step": 5374
+ },
+ {
+ "epoch": 2.541371158392435,
+ "grad_norm": 2.72106671333313,
+ "learning_rate": 3.1294556557743237e-06,
+ "loss": 0.4519,
+ "step": 5375
+ },
+ {
+ "epoch": 2.541843971631206,
+ "grad_norm": 2.759995222091675,
+ "learning_rate": 3.12885190647471e-06,
+ "loss": 0.451,
+ "step": 5376
+ },
+ {
+ "epoch": 2.5423167848699766,
+ "grad_norm": 2.697950601577759,
+ "learning_rate": 3.1282481180192457e-06,
+ "loss": 0.4328,
+ "step": 5377
+ },
+ {
+ "epoch": 2.542789598108747,
+ "grad_norm": 2.6970415115356445,
+ "learning_rate": 3.127644290445526e-06,
+ "loss": 0.4489,
+ "step": 5378
+ },
+ {
+ "epoch": 2.5432624113475177,
+ "grad_norm": 2.5856997966766357,
+ "learning_rate": 3.127040423791148e-06,
+ "loss": 0.3848,
+ "step": 5379
+ },
+ {
+ "epoch": 2.5437352245862885,
+ "grad_norm": 2.9798166751861572,
+ "learning_rate": 3.1264365180937127e-06,
+ "loss": 0.5038,
+ "step": 5380
+ },
+ {
+ "epoch": 2.544208037825059,
+ "grad_norm": 3.413175106048584,
+ "learning_rate": 3.1258325733908224e-06,
+ "loss": 0.5247,
+ "step": 5381
+ },
+ {
+ "epoch": 2.5446808510638297,
+ "grad_norm": 2.838517904281616,
+ "learning_rate": 3.1252285897200818e-06,
+ "loss": 0.4652,
+ "step": 5382
+ },
+ {
+ "epoch": 2.5451536643026005,
+ "grad_norm": 2.8342528343200684,
+ "learning_rate": 3.1246245671190983e-06,
+ "loss": 0.4245,
+ "step": 5383
+ },
+ {
+ "epoch": 2.5456264775413713,
+ "grad_norm": 3.06026029586792,
+ "learning_rate": 3.124020505625482e-06,
+ "loss": 0.469,
+ "step": 5384
+ },
+ {
+ "epoch": 2.546099290780142,
+ "grad_norm": 2.633894681930542,
+ "learning_rate": 3.1234164052768452e-06,
+ "loss": 0.4509,
+ "step": 5385
+ },
+ {
+ "epoch": 2.5465721040189124,
+ "grad_norm": 2.634819984436035,
+ "learning_rate": 3.1228122661108023e-06,
+ "loss": 0.4879,
+ "step": 5386
+ },
+ {
+ "epoch": 2.5470449172576832,
+ "grad_norm": 3.9843504428863525,
+ "learning_rate": 3.1222080881649707e-06,
+ "loss": 0.4472,
+ "step": 5387
+ },
+ {
+ "epoch": 2.547517730496454,
+ "grad_norm": 2.5480258464813232,
+ "learning_rate": 3.1216038714769694e-06,
+ "loss": 0.4396,
+ "step": 5388
+ },
+ {
+ "epoch": 2.5479905437352244,
+ "grad_norm": 2.7461917400360107,
+ "learning_rate": 3.12099961608442e-06,
+ "loss": 0.4735,
+ "step": 5389
+ },
+ {
+ "epoch": 2.548463356973995,
+ "grad_norm": 3.167769193649292,
+ "learning_rate": 3.1203953220249493e-06,
+ "loss": 0.4196,
+ "step": 5390
+ },
+ {
+ "epoch": 2.548936170212766,
+ "grad_norm": 2.721696615219116,
+ "learning_rate": 3.1197909893361814e-06,
+ "loss": 0.4571,
+ "step": 5391
+ },
+ {
+ "epoch": 2.5494089834515368,
+ "grad_norm": 2.726668119430542,
+ "learning_rate": 3.1191866180557463e-06,
+ "loss": 0.4856,
+ "step": 5392
+ },
+ {
+ "epoch": 2.5498817966903076,
+ "grad_norm": 2.602205276489258,
+ "learning_rate": 3.1185822082212754e-06,
+ "loss": 0.4631,
+ "step": 5393
+ },
+ {
+ "epoch": 2.550354609929078,
+ "grad_norm": 2.7715859413146973,
+ "learning_rate": 3.1179777598704025e-06,
+ "loss": 0.4136,
+ "step": 5394
+ },
+ {
+ "epoch": 2.5508274231678487,
+ "grad_norm": 2.8081955909729004,
+ "learning_rate": 3.1173732730407647e-06,
+ "loss": 0.4963,
+ "step": 5395
+ },
+ {
+ "epoch": 2.5513002364066195,
+ "grad_norm": 2.946772336959839,
+ "learning_rate": 3.1167687477700006e-06,
+ "loss": 0.4443,
+ "step": 5396
+ },
+ {
+ "epoch": 2.55177304964539,
+ "grad_norm": 2.89345383644104,
+ "learning_rate": 3.1161641840957503e-06,
+ "loss": 0.4377,
+ "step": 5397
+ },
+ {
+ "epoch": 2.5522458628841607,
+ "grad_norm": 2.908317804336548,
+ "learning_rate": 3.115559582055659e-06,
+ "loss": 0.4702,
+ "step": 5398
+ },
+ {
+ "epoch": 2.5527186761229315,
+ "grad_norm": 2.554417848587036,
+ "learning_rate": 3.1149549416873704e-06,
+ "loss": 0.3738,
+ "step": 5399
+ },
+ {
+ "epoch": 2.5531914893617023,
+ "grad_norm": 2.3132457733154297,
+ "learning_rate": 3.1143502630285356e-06,
+ "loss": 0.4074,
+ "step": 5400
+ },
+ {
+ "epoch": 2.553664302600473,
+ "grad_norm": 2.751666784286499,
+ "learning_rate": 3.1137455461168026e-06,
+ "loss": 0.4697,
+ "step": 5401
+ },
+ {
+ "epoch": 2.5541371158392434,
+ "grad_norm": 2.7088871002197266,
+ "learning_rate": 3.113140790989826e-06,
+ "loss": 0.4754,
+ "step": 5402
+ },
+ {
+ "epoch": 2.554609929078014,
+ "grad_norm": 3.0633046627044678,
+ "learning_rate": 3.1125359976852605e-06,
+ "loss": 0.4874,
+ "step": 5403
+ },
+ {
+ "epoch": 2.555082742316785,
+ "grad_norm": 3.399456024169922,
+ "learning_rate": 3.111931166240764e-06,
+ "loss": 0.5529,
+ "step": 5404
+ },
+ {
+ "epoch": 2.5555555555555554,
+ "grad_norm": 2.7729690074920654,
+ "learning_rate": 3.1113262966939985e-06,
+ "loss": 0.4677,
+ "step": 5405
+ },
+ {
+ "epoch": 2.556028368794326,
+ "grad_norm": 2.81025767326355,
+ "learning_rate": 3.1107213890826244e-06,
+ "loss": 0.4954,
+ "step": 5406
+ },
+ {
+ "epoch": 2.556501182033097,
+ "grad_norm": 2.4837241172790527,
+ "learning_rate": 3.110116443444307e-06,
+ "loss": 0.3681,
+ "step": 5407
+ },
+ {
+ "epoch": 2.5569739952718678,
+ "grad_norm": 2.6406874656677246,
+ "learning_rate": 3.109511459816714e-06,
+ "loss": 0.4569,
+ "step": 5408
+ },
+ {
+ "epoch": 2.5574468085106385,
+ "grad_norm": 2.6093738079071045,
+ "learning_rate": 3.1089064382375155e-06,
+ "loss": 0.413,
+ "step": 5409
+ },
+ {
+ "epoch": 2.557919621749409,
+ "grad_norm": 2.6629011631011963,
+ "learning_rate": 3.108301378744383e-06,
+ "loss": 0.4286,
+ "step": 5410
+ },
+ {
+ "epoch": 2.5583924349881797,
+ "grad_norm": 2.694796323776245,
+ "learning_rate": 3.10769628137499e-06,
+ "loss": 0.4316,
+ "step": 5411
+ },
+ {
+ "epoch": 2.5588652482269505,
+ "grad_norm": 2.88023042678833,
+ "learning_rate": 3.107091146167015e-06,
+ "loss": 0.4378,
+ "step": 5412
+ },
+ {
+ "epoch": 2.559338061465721,
+ "grad_norm": 2.8804919719696045,
+ "learning_rate": 3.1064859731581365e-06,
+ "loss": 0.4971,
+ "step": 5413
+ },
+ {
+ "epoch": 2.5598108747044916,
+ "grad_norm": 2.850468397140503,
+ "learning_rate": 3.1058807623860353e-06,
+ "loss": 0.4686,
+ "step": 5414
+ },
+ {
+ "epoch": 2.5602836879432624,
+ "grad_norm": 3.0548019409179688,
+ "learning_rate": 3.1052755138883963e-06,
+ "loss": 0.4497,
+ "step": 5415
+ },
+ {
+ "epoch": 2.5607565011820332,
+ "grad_norm": 3.10168719291687,
+ "learning_rate": 3.1046702277029046e-06,
+ "loss": 0.569,
+ "step": 5416
+ },
+ {
+ "epoch": 2.561229314420804,
+ "grad_norm": 2.5887374877929688,
+ "learning_rate": 3.1040649038672494e-06,
+ "loss": 0.3812,
+ "step": 5417
+ },
+ {
+ "epoch": 2.5617021276595744,
+ "grad_norm": 2.9928438663482666,
+ "learning_rate": 3.1034595424191212e-06,
+ "loss": 0.4308,
+ "step": 5418
+ },
+ {
+ "epoch": 2.562174940898345,
+ "grad_norm": 2.7003073692321777,
+ "learning_rate": 3.102854143396214e-06,
+ "loss": 0.4967,
+ "step": 5419
+ },
+ {
+ "epoch": 2.562647754137116,
+ "grad_norm": 3.172868490219116,
+ "learning_rate": 3.102248706836222e-06,
+ "loss": 0.5311,
+ "step": 5420
+ },
+ {
+ "epoch": 2.5631205673758863,
+ "grad_norm": 3.0146191120147705,
+ "learning_rate": 3.101643232776844e-06,
+ "loss": 0.4714,
+ "step": 5421
+ },
+ {
+ "epoch": 2.563593380614657,
+ "grad_norm": 3.0683791637420654,
+ "learning_rate": 3.1010377212557806e-06,
+ "loss": 0.4047,
+ "step": 5422
+ },
+ {
+ "epoch": 2.564066193853428,
+ "grad_norm": 2.8260676860809326,
+ "learning_rate": 3.1004321723107334e-06,
+ "loss": 0.5282,
+ "step": 5423
+ },
+ {
+ "epoch": 2.5645390070921987,
+ "grad_norm": 3.0792388916015625,
+ "learning_rate": 3.0998265859794074e-06,
+ "loss": 0.5323,
+ "step": 5424
+ },
+ {
+ "epoch": 2.5650118203309695,
+ "grad_norm": 2.7332866191864014,
+ "learning_rate": 3.09922096229951e-06,
+ "loss": 0.4401,
+ "step": 5425
+ },
+ {
+ "epoch": 2.56548463356974,
+ "grad_norm": 2.9366047382354736,
+ "learning_rate": 3.098615301308751e-06,
+ "loss": 0.4495,
+ "step": 5426
+ },
+ {
+ "epoch": 2.5659574468085107,
+ "grad_norm": 2.982088565826416,
+ "learning_rate": 3.098009603044842e-06,
+ "loss": 0.495,
+ "step": 5427
+ },
+ {
+ "epoch": 2.5664302600472815,
+ "grad_norm": 3.1204755306243896,
+ "learning_rate": 3.0974038675454976e-06,
+ "loss": 0.4354,
+ "step": 5428
+ },
+ {
+ "epoch": 2.566903073286052,
+ "grad_norm": 2.835238218307495,
+ "learning_rate": 3.0967980948484333e-06,
+ "loss": 0.4161,
+ "step": 5429
+ },
+ {
+ "epoch": 2.5673758865248226,
+ "grad_norm": 2.8104958534240723,
+ "learning_rate": 3.096192284991369e-06,
+ "loss": 0.5045,
+ "step": 5430
+ },
+ {
+ "epoch": 2.5678486997635934,
+ "grad_norm": 3.1636080741882324,
+ "learning_rate": 3.0955864380120247e-06,
+ "loss": 0.4533,
+ "step": 5431
+ },
+ {
+ "epoch": 2.568321513002364,
+ "grad_norm": 2.980112314224243,
+ "learning_rate": 3.0949805539481247e-06,
+ "loss": 0.3998,
+ "step": 5432
+ },
+ {
+ "epoch": 2.568794326241135,
+ "grad_norm": 2.6379945278167725,
+ "learning_rate": 3.0943746328373953e-06,
+ "loss": 0.3785,
+ "step": 5433
+ },
+ {
+ "epoch": 2.5692671394799054,
+ "grad_norm": 2.780930757522583,
+ "learning_rate": 3.0937686747175627e-06,
+ "loss": 0.4801,
+ "step": 5434
+ },
+ {
+ "epoch": 2.569739952718676,
+ "grad_norm": 2.6608550548553467,
+ "learning_rate": 3.0931626796263585e-06,
+ "loss": 0.4047,
+ "step": 5435
+ },
+ {
+ "epoch": 2.570212765957447,
+ "grad_norm": 3.130584716796875,
+ "learning_rate": 3.0925566476015156e-06,
+ "loss": 0.5049,
+ "step": 5436
+ },
+ {
+ "epoch": 2.5706855791962173,
+ "grad_norm": 2.9699313640594482,
+ "learning_rate": 3.0919505786807687e-06,
+ "loss": 0.3847,
+ "step": 5437
+ },
+ {
+ "epoch": 2.571158392434988,
+ "grad_norm": 2.919260025024414,
+ "learning_rate": 3.091344472901855e-06,
+ "loss": 0.4631,
+ "step": 5438
+ },
+ {
+ "epoch": 2.571631205673759,
+ "grad_norm": 2.956587553024292,
+ "learning_rate": 3.0907383303025134e-06,
+ "loss": 0.4974,
+ "step": 5439
+ },
+ {
+ "epoch": 2.5721040189125297,
+ "grad_norm": 2.758542776107788,
+ "learning_rate": 3.090132150920486e-06,
+ "loss": 0.4785,
+ "step": 5440
+ },
+ {
+ "epoch": 2.5725768321513005,
+ "grad_norm": 2.678469657897949,
+ "learning_rate": 3.0895259347935175e-06,
+ "loss": 0.4453,
+ "step": 5441
+ },
+ {
+ "epoch": 2.573049645390071,
+ "grad_norm": 2.6508545875549316,
+ "learning_rate": 3.088919681959355e-06,
+ "loss": 0.4426,
+ "step": 5442
+ },
+ {
+ "epoch": 2.5735224586288417,
+ "grad_norm": 2.6156187057495117,
+ "learning_rate": 3.0883133924557453e-06,
+ "loss": 0.4445,
+ "step": 5443
+ },
+ {
+ "epoch": 2.5739952718676125,
+ "grad_norm": 2.484374761581421,
+ "learning_rate": 3.08770706632044e-06,
+ "loss": 0.4155,
+ "step": 5444
+ },
+ {
+ "epoch": 2.574468085106383,
+ "grad_norm": 2.7465295791625977,
+ "learning_rate": 3.087100703591193e-06,
+ "loss": 0.4085,
+ "step": 5445
+ },
+ {
+ "epoch": 2.5749408983451536,
+ "grad_norm": 2.771740198135376,
+ "learning_rate": 3.08649430430576e-06,
+ "loss": 0.4313,
+ "step": 5446
+ },
+ {
+ "epoch": 2.5754137115839244,
+ "grad_norm": 2.7480874061584473,
+ "learning_rate": 3.0858878685018984e-06,
+ "loss": 0.3471,
+ "step": 5447
+ },
+ {
+ "epoch": 2.575886524822695,
+ "grad_norm": 2.894913673400879,
+ "learning_rate": 3.085281396217368e-06,
+ "loss": 0.4888,
+ "step": 5448
+ },
+ {
+ "epoch": 2.576359338061466,
+ "grad_norm": 3.037628173828125,
+ "learning_rate": 3.0846748874899306e-06,
+ "loss": 0.3976,
+ "step": 5449
+ },
+ {
+ "epoch": 2.5768321513002364,
+ "grad_norm": 2.4811434745788574,
+ "learning_rate": 3.0840683423573526e-06,
+ "loss": 0.4822,
+ "step": 5450
+ },
+ {
+ "epoch": 2.577304964539007,
+ "grad_norm": 3.0078725814819336,
+ "learning_rate": 3.0834617608573998e-06,
+ "loss": 0.4999,
+ "step": 5451
+ },
+ {
+ "epoch": 2.5777777777777775,
+ "grad_norm": 3.174154043197632,
+ "learning_rate": 3.0828551430278413e-06,
+ "loss": 0.4626,
+ "step": 5452
+ },
+ {
+ "epoch": 2.5782505910165483,
+ "grad_norm": 2.8277535438537598,
+ "learning_rate": 3.082248488906449e-06,
+ "loss": 0.4633,
+ "step": 5453
+ },
+ {
+ "epoch": 2.578723404255319,
+ "grad_norm": 2.731767416000366,
+ "learning_rate": 3.0816417985309966e-06,
+ "loss": 0.4148,
+ "step": 5454
+ },
+ {
+ "epoch": 2.57919621749409,
+ "grad_norm": 2.5480549335479736,
+ "learning_rate": 3.0810350719392597e-06,
+ "loss": 0.4773,
+ "step": 5455
+ },
+ {
+ "epoch": 2.5796690307328607,
+ "grad_norm": 2.9755172729492188,
+ "learning_rate": 3.080428309169017e-06,
+ "loss": 0.5107,
+ "step": 5456
+ },
+ {
+ "epoch": 2.580141843971631,
+ "grad_norm": 2.6499290466308594,
+ "learning_rate": 3.079821510258048e-06,
+ "loss": 0.3982,
+ "step": 5457
+ },
+ {
+ "epoch": 2.580614657210402,
+ "grad_norm": 2.663214921951294,
+ "learning_rate": 3.079214675244136e-06,
+ "loss": 0.4419,
+ "step": 5458
+ },
+ {
+ "epoch": 2.5810874704491726,
+ "grad_norm": 2.595489263534546,
+ "learning_rate": 3.078607804165066e-06,
+ "loss": 0.3958,
+ "step": 5459
+ },
+ {
+ "epoch": 2.581560283687943,
+ "grad_norm": 3.031458854675293,
+ "learning_rate": 3.0780008970586255e-06,
+ "loss": 0.518,
+ "step": 5460
+ },
+ {
+ "epoch": 2.582033096926714,
+ "grad_norm": 2.827071189880371,
+ "learning_rate": 3.077393953962603e-06,
+ "loss": 0.4397,
+ "step": 5461
+ },
+ {
+ "epoch": 2.5825059101654846,
+ "grad_norm": 2.656111240386963,
+ "learning_rate": 3.0767869749147917e-06,
+ "loss": 0.4912,
+ "step": 5462
+ },
+ {
+ "epoch": 2.5829787234042554,
+ "grad_norm": 2.545365333557129,
+ "learning_rate": 3.076179959952984e-06,
+ "loss": 0.3991,
+ "step": 5463
+ },
+ {
+ "epoch": 2.583451536643026,
+ "grad_norm": 2.5794365406036377,
+ "learning_rate": 3.075572909114977e-06,
+ "loss": 0.4499,
+ "step": 5464
+ },
+ {
+ "epoch": 2.5839243498817965,
+ "grad_norm": 2.787140369415283,
+ "learning_rate": 3.074965822438568e-06,
+ "loss": 0.386,
+ "step": 5465
+ },
+ {
+ "epoch": 2.5843971631205673,
+ "grad_norm": 2.6406853199005127,
+ "learning_rate": 3.0743586999615594e-06,
+ "loss": 0.4853,
+ "step": 5466
+ },
+ {
+ "epoch": 2.584869976359338,
+ "grad_norm": 2.8082082271575928,
+ "learning_rate": 3.073751541721752e-06,
+ "loss": 0.4669,
+ "step": 5467
+ },
+ {
+ "epoch": 2.5853427895981085,
+ "grad_norm": 2.8808975219726562,
+ "learning_rate": 3.073144347756952e-06,
+ "loss": 0.4193,
+ "step": 5468
+ },
+ {
+ "epoch": 2.5858156028368793,
+ "grad_norm": 2.823352813720703,
+ "learning_rate": 3.072537118104968e-06,
+ "loss": 0.482,
+ "step": 5469
+ },
+ {
+ "epoch": 2.58628841607565,
+ "grad_norm": 2.6454555988311768,
+ "learning_rate": 3.0719298528036073e-06,
+ "loss": 0.4667,
+ "step": 5470
+ },
+ {
+ "epoch": 2.586761229314421,
+ "grad_norm": 2.871145486831665,
+ "learning_rate": 3.0713225518906826e-06,
+ "loss": 0.5125,
+ "step": 5471
+ },
+ {
+ "epoch": 2.5872340425531917,
+ "grad_norm": 3.1301417350769043,
+ "learning_rate": 3.070715215404007e-06,
+ "loss": 0.4827,
+ "step": 5472
+ },
+ {
+ "epoch": 2.587706855791962,
+ "grad_norm": 2.31062912940979,
+ "learning_rate": 3.070107843381398e-06,
+ "loss": 0.3954,
+ "step": 5473
+ },
+ {
+ "epoch": 2.588179669030733,
+ "grad_norm": 2.8366353511810303,
+ "learning_rate": 3.069500435860674e-06,
+ "loss": 0.4597,
+ "step": 5474
+ },
+ {
+ "epoch": 2.5886524822695036,
+ "grad_norm": 2.900143623352051,
+ "learning_rate": 3.068892992879654e-06,
+ "loss": 0.4294,
+ "step": 5475
+ },
+ {
+ "epoch": 2.589125295508274,
+ "grad_norm": 2.923313617706299,
+ "learning_rate": 3.0682855144761626e-06,
+ "loss": 0.505,
+ "step": 5476
+ },
+ {
+ "epoch": 2.5895981087470448,
+ "grad_norm": 2.726475954055786,
+ "learning_rate": 3.0676780006880242e-06,
+ "loss": 0.4208,
+ "step": 5477
+ },
+ {
+ "epoch": 2.5900709219858156,
+ "grad_norm": 4.115052223205566,
+ "learning_rate": 3.0670704515530654e-06,
+ "loss": 0.466,
+ "step": 5478
+ },
+ {
+ "epoch": 2.5905437352245864,
+ "grad_norm": 2.6018717288970947,
+ "learning_rate": 3.0664628671091163e-06,
+ "loss": 0.4697,
+ "step": 5479
+ },
+ {
+ "epoch": 2.591016548463357,
+ "grad_norm": 2.7393722534179688,
+ "learning_rate": 3.0658552473940085e-06,
+ "loss": 0.4618,
+ "step": 5480
+ },
+ {
+ "epoch": 2.5914893617021275,
+ "grad_norm": 2.8406929969787598,
+ "learning_rate": 3.065247592445575e-06,
+ "loss": 0.4806,
+ "step": 5481
+ },
+ {
+ "epoch": 2.5919621749408983,
+ "grad_norm": 2.9773001670837402,
+ "learning_rate": 3.0646399023016525e-06,
+ "loss": 0.4764,
+ "step": 5482
+ },
+ {
+ "epoch": 2.592434988179669,
+ "grad_norm": 3.374643325805664,
+ "learning_rate": 3.0640321770000804e-06,
+ "loss": 0.4481,
+ "step": 5483
+ },
+ {
+ "epoch": 2.5929078014184395,
+ "grad_norm": 2.5742013454437256,
+ "learning_rate": 3.0634244165786965e-06,
+ "loss": 0.432,
+ "step": 5484
+ },
+ {
+ "epoch": 2.5933806146572103,
+ "grad_norm": 2.9390289783477783,
+ "learning_rate": 3.062816621075346e-06,
+ "loss": 0.3941,
+ "step": 5485
+ },
+ {
+ "epoch": 2.593853427895981,
+ "grad_norm": 2.683414936065674,
+ "learning_rate": 3.062208790527871e-06,
+ "loss": 0.4268,
+ "step": 5486
+ },
+ {
+ "epoch": 2.594326241134752,
+ "grad_norm": 2.689647674560547,
+ "learning_rate": 3.06160092497412e-06,
+ "loss": 0.4569,
+ "step": 5487
+ },
+ {
+ "epoch": 2.5947990543735227,
+ "grad_norm": 3.1170310974121094,
+ "learning_rate": 3.060993024451943e-06,
+ "loss": 0.4387,
+ "step": 5488
+ },
+ {
+ "epoch": 2.595271867612293,
+ "grad_norm": 2.8732447624206543,
+ "learning_rate": 3.0603850889991894e-06,
+ "loss": 0.451,
+ "step": 5489
+ },
+ {
+ "epoch": 2.595744680851064,
+ "grad_norm": 3.0444157123565674,
+ "learning_rate": 3.0597771186537135e-06,
+ "loss": 0.4691,
+ "step": 5490
+ },
+ {
+ "epoch": 2.5962174940898346,
+ "grad_norm": 2.3791720867156982,
+ "learning_rate": 3.0591691134533714e-06,
+ "loss": 0.4771,
+ "step": 5491
+ },
+ {
+ "epoch": 2.596690307328605,
+ "grad_norm": 3.0677225589752197,
+ "learning_rate": 3.05856107343602e-06,
+ "loss": 0.459,
+ "step": 5492
+ },
+ {
+ "epoch": 2.5971631205673757,
+ "grad_norm": 3.1702635288238525,
+ "learning_rate": 3.05795299863952e-06,
+ "loss": 0.4816,
+ "step": 5493
+ },
+ {
+ "epoch": 2.5976359338061465,
+ "grad_norm": 2.964869499206543,
+ "learning_rate": 3.057344889101734e-06,
+ "loss": 0.4369,
+ "step": 5494
+ },
+ {
+ "epoch": 2.5981087470449173,
+ "grad_norm": 3.1333882808685303,
+ "learning_rate": 3.056736744860525e-06,
+ "loss": 0.4178,
+ "step": 5495
+ },
+ {
+ "epoch": 2.598581560283688,
+ "grad_norm": 2.4340405464172363,
+ "learning_rate": 3.05612856595376e-06,
+ "loss": 0.4359,
+ "step": 5496
+ },
+ {
+ "epoch": 2.5990543735224585,
+ "grad_norm": 2.638620615005493,
+ "learning_rate": 3.0555203524193083e-06,
+ "loss": 0.3915,
+ "step": 5497
+ },
+ {
+ "epoch": 2.5995271867612293,
+ "grad_norm": 2.8218815326690674,
+ "learning_rate": 3.054912104295039e-06,
+ "loss": 0.4684,
+ "step": 5498
+ },
+ {
+ "epoch": 2.6,
+ "grad_norm": 2.6696009635925293,
+ "learning_rate": 3.054303821618827e-06,
+ "loss": 0.4073,
+ "step": 5499
+ },
+ {
+ "epoch": 2.6004728132387704,
+ "grad_norm": 2.3880512714385986,
+ "learning_rate": 3.0536955044285465e-06,
+ "loss": 0.3576,
+ "step": 5500
+ },
+ {
+ "epoch": 2.6009456264775412,
+ "grad_norm": 2.762890100479126,
+ "learning_rate": 3.053087152762075e-06,
+ "loss": 0.3857,
+ "step": 5501
+ },
+ {
+ "epoch": 2.601418439716312,
+ "grad_norm": 2.729033946990967,
+ "learning_rate": 3.052478766657292e-06,
+ "loss": 0.3935,
+ "step": 5502
+ },
+ {
+ "epoch": 2.601891252955083,
+ "grad_norm": 2.630490303039551,
+ "learning_rate": 3.051870346152078e-06,
+ "loss": 0.3932,
+ "step": 5503
+ },
+ {
+ "epoch": 2.6023640661938536,
+ "grad_norm": 3.0335981845855713,
+ "learning_rate": 3.051261891284318e-06,
+ "loss": 0.4313,
+ "step": 5504
+ },
+ {
+ "epoch": 2.602836879432624,
+ "grad_norm": 2.969888687133789,
+ "learning_rate": 3.0506534020918963e-06,
+ "loss": 0.4698,
+ "step": 5505
+ },
+ {
+ "epoch": 2.603309692671395,
+ "grad_norm": 3.093996524810791,
+ "learning_rate": 3.050044878612703e-06,
+ "loss": 0.5338,
+ "step": 5506
+ },
+ {
+ "epoch": 2.6037825059101656,
+ "grad_norm": 2.759993314743042,
+ "learning_rate": 3.049436320884626e-06,
+ "loss": 0.4429,
+ "step": 5507
+ },
+ {
+ "epoch": 2.604255319148936,
+ "grad_norm": 2.979422092437744,
+ "learning_rate": 3.0488277289455587e-06,
+ "loss": 0.4489,
+ "step": 5508
+ },
+ {
+ "epoch": 2.6047281323877067,
+ "grad_norm": 2.8266701698303223,
+ "learning_rate": 3.048219102833396e-06,
+ "loss": 0.489,
+ "step": 5509
+ },
+ {
+ "epoch": 2.6052009456264775,
+ "grad_norm": 2.2582461833953857,
+ "learning_rate": 3.047610442586033e-06,
+ "loss": 0.3759,
+ "step": 5510
+ },
+ {
+ "epoch": 2.6056737588652483,
+ "grad_norm": 3.078152894973755,
+ "learning_rate": 3.0470017482413694e-06,
+ "loss": 0.5059,
+ "step": 5511
+ },
+ {
+ "epoch": 2.606146572104019,
+ "grad_norm": 2.7895498275756836,
+ "learning_rate": 3.0463930198373047e-06,
+ "loss": 0.4752,
+ "step": 5512
+ },
+ {
+ "epoch": 2.6066193853427895,
+ "grad_norm": 3.2307958602905273,
+ "learning_rate": 3.045784257411743e-06,
+ "loss": 0.4847,
+ "step": 5513
+ },
+ {
+ "epoch": 2.6070921985815603,
+ "grad_norm": 2.793661594390869,
+ "learning_rate": 3.0451754610025884e-06,
+ "loss": 0.4492,
+ "step": 5514
+ },
+ {
+ "epoch": 2.607565011820331,
+ "grad_norm": 2.4443132877349854,
+ "learning_rate": 3.0445666306477484e-06,
+ "loss": 0.4174,
+ "step": 5515
+ },
+ {
+ "epoch": 2.6080378250591014,
+ "grad_norm": 2.628769636154175,
+ "learning_rate": 3.0439577663851326e-06,
+ "loss": 0.3889,
+ "step": 5516
+ },
+ {
+ "epoch": 2.608510638297872,
+ "grad_norm": 2.9367563724517822,
+ "learning_rate": 3.0433488682526525e-06,
+ "loss": 0.437,
+ "step": 5517
+ },
+ {
+ "epoch": 2.608983451536643,
+ "grad_norm": 3.171353340148926,
+ "learning_rate": 3.04273993628822e-06,
+ "loss": 0.47,
+ "step": 5518
+ },
+ {
+ "epoch": 2.609456264775414,
+ "grad_norm": 2.856576442718506,
+ "learning_rate": 3.0421309705297513e-06,
+ "loss": 0.4797,
+ "step": 5519
+ },
+ {
+ "epoch": 2.6099290780141846,
+ "grad_norm": 2.4926068782806396,
+ "learning_rate": 3.041521971015165e-06,
+ "loss": 0.4294,
+ "step": 5520
+ },
+ {
+ "epoch": 2.610401891252955,
+ "grad_norm": 2.7897613048553467,
+ "learning_rate": 3.040912937782379e-06,
+ "loss": 0.4388,
+ "step": 5521
+ },
+ {
+ "epoch": 2.6108747044917258,
+ "grad_norm": 3.588188886642456,
+ "learning_rate": 3.0403038708693173e-06,
+ "loss": 0.4027,
+ "step": 5522
+ },
+ {
+ "epoch": 2.6113475177304966,
+ "grad_norm": 3.5394980907440186,
+ "learning_rate": 3.0396947703139017e-06,
+ "loss": 0.4866,
+ "step": 5523
+ },
+ {
+ "epoch": 2.611820330969267,
+ "grad_norm": 3.086865186691284,
+ "learning_rate": 3.03908563615406e-06,
+ "loss": 0.4344,
+ "step": 5524
+ },
+ {
+ "epoch": 2.6122931442080377,
+ "grad_norm": 2.649564504623413,
+ "learning_rate": 3.0384764684277194e-06,
+ "loss": 0.4571,
+ "step": 5525
+ },
+ {
+ "epoch": 2.6127659574468085,
+ "grad_norm": 2.945234775543213,
+ "learning_rate": 3.0378672671728105e-06,
+ "loss": 0.4885,
+ "step": 5526
+ },
+ {
+ "epoch": 2.6132387706855793,
+ "grad_norm": 2.625424861907959,
+ "learning_rate": 3.037258032427265e-06,
+ "loss": 0.4095,
+ "step": 5527
+ },
+ {
+ "epoch": 2.61371158392435,
+ "grad_norm": 2.7597248554229736,
+ "learning_rate": 3.0366487642290175e-06,
+ "loss": 0.4393,
+ "step": 5528
+ },
+ {
+ "epoch": 2.6141843971631205,
+ "grad_norm": 2.721189260482788,
+ "learning_rate": 3.0360394626160043e-06,
+ "loss": 0.3865,
+ "step": 5529
+ },
+ {
+ "epoch": 2.6146572104018913,
+ "grad_norm": 2.624056339263916,
+ "learning_rate": 3.0354301276261656e-06,
+ "loss": 0.4273,
+ "step": 5530
+ },
+ {
+ "epoch": 2.615130023640662,
+ "grad_norm": 2.7764177322387695,
+ "learning_rate": 3.034820759297439e-06,
+ "loss": 0.4756,
+ "step": 5531
+ },
+ {
+ "epoch": 2.6156028368794324,
+ "grad_norm": 3.0841729640960693,
+ "learning_rate": 3.0342113576677696e-06,
+ "loss": 0.4907,
+ "step": 5532
+ },
+ {
+ "epoch": 2.616075650118203,
+ "grad_norm": 2.678715705871582,
+ "learning_rate": 3.0336019227751017e-06,
+ "loss": 0.4478,
+ "step": 5533
+ },
+ {
+ "epoch": 2.616548463356974,
+ "grad_norm": 2.378679037094116,
+ "learning_rate": 3.032992454657382e-06,
+ "loss": 0.3678,
+ "step": 5534
+ },
+ {
+ "epoch": 2.617021276595745,
+ "grad_norm": 2.792079210281372,
+ "learning_rate": 3.0323829533525583e-06,
+ "loss": 0.4115,
+ "step": 5535
+ },
+ {
+ "epoch": 2.6174940898345156,
+ "grad_norm": 2.738133192062378,
+ "learning_rate": 3.0317734188985832e-06,
+ "loss": 0.4152,
+ "step": 5536
+ },
+ {
+ "epoch": 2.617966903073286,
+ "grad_norm": 2.6963796615600586,
+ "learning_rate": 3.0311638513334084e-06,
+ "loss": 0.4096,
+ "step": 5537
+ },
+ {
+ "epoch": 2.6184397163120567,
+ "grad_norm": 2.694145679473877,
+ "learning_rate": 3.03055425069499e-06,
+ "loss": 0.3793,
+ "step": 5538
+ },
+ {
+ "epoch": 2.6189125295508275,
+ "grad_norm": 2.762403964996338,
+ "learning_rate": 3.0299446170212855e-06,
+ "loss": 0.459,
+ "step": 5539
+ },
+ {
+ "epoch": 2.619385342789598,
+ "grad_norm": 2.804382562637329,
+ "learning_rate": 3.0293349503502522e-06,
+ "loss": 0.4853,
+ "step": 5540
+ },
+ {
+ "epoch": 2.6198581560283687,
+ "grad_norm": 2.7768518924713135,
+ "learning_rate": 3.0287252507198537e-06,
+ "loss": 0.4496,
+ "step": 5541
+ },
+ {
+ "epoch": 2.6203309692671395,
+ "grad_norm": 2.9075138568878174,
+ "learning_rate": 3.028115518168052e-06,
+ "loss": 0.4498,
+ "step": 5542
+ },
+ {
+ "epoch": 2.6208037825059103,
+ "grad_norm": 2.8966822624206543,
+ "learning_rate": 3.0275057527328126e-06,
+ "loss": 0.4434,
+ "step": 5543
+ },
+ {
+ "epoch": 2.621276595744681,
+ "grad_norm": 2.8140156269073486,
+ "learning_rate": 3.0268959544521027e-06,
+ "loss": 0.3935,
+ "step": 5544
+ },
+ {
+ "epoch": 2.6217494089834514,
+ "grad_norm": 2.8606276512145996,
+ "learning_rate": 3.0262861233638924e-06,
+ "loss": 0.4222,
+ "step": 5545
+ },
+ {
+ "epoch": 2.6222222222222222,
+ "grad_norm": 3.003610134124756,
+ "learning_rate": 3.0256762595061522e-06,
+ "loss": 0.428,
+ "step": 5546
+ },
+ {
+ "epoch": 2.622695035460993,
+ "grad_norm": 2.725907802581787,
+ "learning_rate": 3.025066362916857e-06,
+ "loss": 0.3975,
+ "step": 5547
+ },
+ {
+ "epoch": 2.6231678486997634,
+ "grad_norm": 2.5247902870178223,
+ "learning_rate": 3.024456433633982e-06,
+ "loss": 0.4584,
+ "step": 5548
+ },
+ {
+ "epoch": 2.623640661938534,
+ "grad_norm": 2.932798147201538,
+ "learning_rate": 3.0238464716955045e-06,
+ "loss": 0.4991,
+ "step": 5549
+ },
+ {
+ "epoch": 2.624113475177305,
+ "grad_norm": 2.693547010421753,
+ "learning_rate": 3.023236477139404e-06,
+ "loss": 0.4405,
+ "step": 5550
+ },
+ {
+ "epoch": 2.6245862884160758,
+ "grad_norm": 3.2600035667419434,
+ "learning_rate": 3.022626450003662e-06,
+ "loss": 0.4904,
+ "step": 5551
+ },
+ {
+ "epoch": 2.6250591016548466,
+ "grad_norm": 2.9471960067749023,
+ "learning_rate": 3.0220163903262627e-06,
+ "loss": 0.4487,
+ "step": 5552
+ },
+ {
+ "epoch": 2.625531914893617,
+ "grad_norm": 2.583944082260132,
+ "learning_rate": 3.0214062981451926e-06,
+ "loss": 0.3552,
+ "step": 5553
+ },
+ {
+ "epoch": 2.6260047281323877,
+ "grad_norm": 2.675062656402588,
+ "learning_rate": 3.0207961734984377e-06,
+ "loss": 0.4524,
+ "step": 5554
+ },
+ {
+ "epoch": 2.6264775413711585,
+ "grad_norm": 3.0126802921295166,
+ "learning_rate": 3.0201860164239887e-06,
+ "loss": 0.4124,
+ "step": 5555
+ },
+ {
+ "epoch": 2.626950354609929,
+ "grad_norm": 2.490734577178955,
+ "learning_rate": 3.019575826959838e-06,
+ "loss": 0.4095,
+ "step": 5556
+ },
+ {
+ "epoch": 2.6274231678486997,
+ "grad_norm": 2.72817063331604,
+ "learning_rate": 3.018965605143978e-06,
+ "loss": 0.4298,
+ "step": 5557
+ },
+ {
+ "epoch": 2.6278959810874705,
+ "grad_norm": 3.1298327445983887,
+ "learning_rate": 3.0183553510144064e-06,
+ "loss": 0.4961,
+ "step": 5558
+ },
+ {
+ "epoch": 2.6283687943262413,
+ "grad_norm": 3.2379956245422363,
+ "learning_rate": 3.0177450646091195e-06,
+ "loss": 0.4943,
+ "step": 5559
+ },
+ {
+ "epoch": 2.628841607565012,
+ "grad_norm": 2.5040571689605713,
+ "learning_rate": 3.017134745966117e-06,
+ "loss": 0.3701,
+ "step": 5560
+ },
+ {
+ "epoch": 2.6293144208037824,
+ "grad_norm": 3.047184944152832,
+ "learning_rate": 3.0165243951234025e-06,
+ "loss": 0.4587,
+ "step": 5561
+ },
+ {
+ "epoch": 2.629787234042553,
+ "grad_norm": 2.4926774501800537,
+ "learning_rate": 3.0159140121189783e-06,
+ "loss": 0.3723,
+ "step": 5562
+ },
+ {
+ "epoch": 2.630260047281324,
+ "grad_norm": 2.5434961318969727,
+ "learning_rate": 3.015303596990851e-06,
+ "loss": 0.4176,
+ "step": 5563
+ },
+ {
+ "epoch": 2.6307328605200944,
+ "grad_norm": 2.5117976665496826,
+ "learning_rate": 3.0146931497770284e-06,
+ "loss": 0.4218,
+ "step": 5564
+ },
+ {
+ "epoch": 2.631205673758865,
+ "grad_norm": 2.9408798217773438,
+ "learning_rate": 3.0140826705155196e-06,
+ "loss": 0.4473,
+ "step": 5565
+ },
+ {
+ "epoch": 2.631678486997636,
+ "grad_norm": 2.996422052383423,
+ "learning_rate": 3.0134721592443385e-06,
+ "loss": 0.4513,
+ "step": 5566
+ },
+ {
+ "epoch": 2.6321513002364068,
+ "grad_norm": 2.984356164932251,
+ "learning_rate": 3.0128616160014955e-06,
+ "loss": 0.4749,
+ "step": 5567
+ },
+ {
+ "epoch": 2.6326241134751776,
+ "grad_norm": 2.6075069904327393,
+ "learning_rate": 3.0122510408250095e-06,
+ "loss": 0.4707,
+ "step": 5568
+ },
+ {
+ "epoch": 2.633096926713948,
+ "grad_norm": 2.9463071823120117,
+ "learning_rate": 3.0116404337528972e-06,
+ "loss": 0.5125,
+ "step": 5569
+ },
+ {
+ "epoch": 2.6335697399527187,
+ "grad_norm": 2.98574161529541,
+ "learning_rate": 3.0110297948231787e-06,
+ "loss": 0.4487,
+ "step": 5570
+ },
+ {
+ "epoch": 2.6340425531914895,
+ "grad_norm": 2.6039397716522217,
+ "learning_rate": 3.010419124073876e-06,
+ "loss": 0.4516,
+ "step": 5571
+ },
+ {
+ "epoch": 2.63451536643026,
+ "grad_norm": 2.8480236530303955,
+ "learning_rate": 3.0098084215430124e-06,
+ "loss": 0.4962,
+ "step": 5572
+ },
+ {
+ "epoch": 2.6349881796690307,
+ "grad_norm": 2.527597427368164,
+ "learning_rate": 3.0091976872686133e-06,
+ "loss": 0.435,
+ "step": 5573
+ },
+ {
+ "epoch": 2.6354609929078014,
+ "grad_norm": 2.898303508758545,
+ "learning_rate": 3.0085869212887076e-06,
+ "loss": 0.4473,
+ "step": 5574
+ },
+ {
+ "epoch": 2.6359338061465722,
+ "grad_norm": 2.981414318084717,
+ "learning_rate": 3.007976123641324e-06,
+ "loss": 0.4203,
+ "step": 5575
+ },
+ {
+ "epoch": 2.636406619385343,
+ "grad_norm": 3.219064474105835,
+ "learning_rate": 3.0073652943644947e-06,
+ "loss": 0.4596,
+ "step": 5576
+ },
+ {
+ "epoch": 2.6368794326241134,
+ "grad_norm": 2.7287049293518066,
+ "learning_rate": 3.0067544334962532e-06,
+ "loss": 0.433,
+ "step": 5577
+ },
+ {
+ "epoch": 2.637352245862884,
+ "grad_norm": 2.6232664585113525,
+ "learning_rate": 3.0061435410746352e-06,
+ "loss": 0.4254,
+ "step": 5578
+ },
+ {
+ "epoch": 2.637825059101655,
+ "grad_norm": 2.908311605453491,
+ "learning_rate": 3.0055326171376788e-06,
+ "loss": 0.4349,
+ "step": 5579
+ },
+ {
+ "epoch": 2.6382978723404253,
+ "grad_norm": 2.8369064331054688,
+ "learning_rate": 3.0049216617234224e-06,
+ "loss": 0.4675,
+ "step": 5580
+ },
+ {
+ "epoch": 2.638770685579196,
+ "grad_norm": 2.659499406814575,
+ "learning_rate": 3.0043106748699085e-06,
+ "loss": 0.4073,
+ "step": 5581
+ },
+ {
+ "epoch": 2.639243498817967,
+ "grad_norm": 2.579765558242798,
+ "learning_rate": 3.00369965661518e-06,
+ "loss": 0.4536,
+ "step": 5582
+ },
+ {
+ "epoch": 2.6397163120567377,
+ "grad_norm": 3.572861909866333,
+ "learning_rate": 3.0030886069972827e-06,
+ "loss": 0.5227,
+ "step": 5583
+ },
+ {
+ "epoch": 2.6401891252955085,
+ "grad_norm": 2.6523196697235107,
+ "learning_rate": 3.002477526054263e-06,
+ "loss": 0.3846,
+ "step": 5584
+ },
+ {
+ "epoch": 2.640661938534279,
+ "grad_norm": 3.072181463241577,
+ "learning_rate": 3.001866413824173e-06,
+ "loss": 0.5399,
+ "step": 5585
+ },
+ {
+ "epoch": 2.6411347517730497,
+ "grad_norm": 2.7304325103759766,
+ "learning_rate": 3.0012552703450597e-06,
+ "loss": 0.4048,
+ "step": 5586
+ },
+ {
+ "epoch": 2.6416075650118205,
+ "grad_norm": 3.039491891860962,
+ "learning_rate": 3.0006440956549798e-06,
+ "loss": 0.5035,
+ "step": 5587
+ },
+ {
+ "epoch": 2.642080378250591,
+ "grad_norm": 2.7623798847198486,
+ "learning_rate": 3.000032889791988e-06,
+ "loss": 0.4369,
+ "step": 5588
+ },
+ {
+ "epoch": 2.6425531914893616,
+ "grad_norm": 3.391052722930908,
+ "learning_rate": 2.9994216527941394e-06,
+ "loss": 0.5308,
+ "step": 5589
+ },
+ {
+ "epoch": 2.6430260047281324,
+ "grad_norm": 3.0263915061950684,
+ "learning_rate": 2.9988103846994954e-06,
+ "loss": 0.4319,
+ "step": 5590
+ },
+ {
+ "epoch": 2.6434988179669032,
+ "grad_norm": 2.786607027053833,
+ "learning_rate": 2.998199085546115e-06,
+ "loss": 0.4695,
+ "step": 5591
+ },
+ {
+ "epoch": 2.643971631205674,
+ "grad_norm": 2.884674310684204,
+ "learning_rate": 2.9975877553720627e-06,
+ "loss": 0.4615,
+ "step": 5592
+ },
+ {
+ "epoch": 2.6444444444444444,
+ "grad_norm": 2.6100499629974365,
+ "learning_rate": 2.996976394215402e-06,
+ "loss": 0.4784,
+ "step": 5593
+ },
+ {
+ "epoch": 2.644917257683215,
+ "grad_norm": 2.6978676319122314,
+ "learning_rate": 2.9963650021142018e-06,
+ "loss": 0.3911,
+ "step": 5594
+ },
+ {
+ "epoch": 2.645390070921986,
+ "grad_norm": 2.8080835342407227,
+ "learning_rate": 2.9957535791065284e-06,
+ "loss": 0.4997,
+ "step": 5595
+ },
+ {
+ "epoch": 2.6458628841607563,
+ "grad_norm": 2.6639578342437744,
+ "learning_rate": 2.9951421252304537e-06,
+ "loss": 0.4066,
+ "step": 5596
+ },
+ {
+ "epoch": 2.646335697399527,
+ "grad_norm": 3.102456569671631,
+ "learning_rate": 2.9945306405240505e-06,
+ "loss": 0.5554,
+ "step": 5597
+ },
+ {
+ "epoch": 2.646808510638298,
+ "grad_norm": 2.6524150371551514,
+ "learning_rate": 2.993919125025392e-06,
+ "loss": 0.3881,
+ "step": 5598
+ },
+ {
+ "epoch": 2.6472813238770687,
+ "grad_norm": 2.926316499710083,
+ "learning_rate": 2.993307578772556e-06,
+ "loss": 0.4845,
+ "step": 5599
+ },
+ {
+ "epoch": 2.6477541371158395,
+ "grad_norm": 3.346550703048706,
+ "learning_rate": 2.9926960018036195e-06,
+ "loss": 0.4481,
+ "step": 5600
+ },
+ {
+ "epoch": 2.64822695035461,
+ "grad_norm": 2.6211020946502686,
+ "learning_rate": 2.9920843941566634e-06,
+ "loss": 0.4355,
+ "step": 5601
+ },
+ {
+ "epoch": 2.6486997635933807,
+ "grad_norm": 2.7479333877563477,
+ "learning_rate": 2.99147275586977e-06,
+ "loss": 0.4373,
+ "step": 5602
+ },
+ {
+ "epoch": 2.6491725768321515,
+ "grad_norm": 2.523385524749756,
+ "learning_rate": 2.9908610869810235e-06,
+ "loss": 0.4467,
+ "step": 5603
+ },
+ {
+ "epoch": 2.649645390070922,
+ "grad_norm": 2.93886137008667,
+ "learning_rate": 2.9902493875285086e-06,
+ "loss": 0.4956,
+ "step": 5604
+ },
+ {
+ "epoch": 2.6501182033096926,
+ "grad_norm": 2.7630443572998047,
+ "learning_rate": 2.989637657550315e-06,
+ "loss": 0.5012,
+ "step": 5605
+ },
+ {
+ "epoch": 2.6505910165484634,
+ "grad_norm": 2.6733906269073486,
+ "learning_rate": 2.989025897084531e-06,
+ "loss": 0.446,
+ "step": 5606
+ },
+ {
+ "epoch": 2.651063829787234,
+ "grad_norm": 2.8411107063293457,
+ "learning_rate": 2.9884141061692484e-06,
+ "loss": 0.4817,
+ "step": 5607
+ },
+ {
+ "epoch": 2.651536643026005,
+ "grad_norm": 2.8667192459106445,
+ "learning_rate": 2.987802284842562e-06,
+ "loss": 0.3909,
+ "step": 5608
+ },
+ {
+ "epoch": 2.6520094562647754,
+ "grad_norm": 3.4640755653381348,
+ "learning_rate": 2.987190433142565e-06,
+ "loss": 0.4379,
+ "step": 5609
+ },
+ {
+ "epoch": 2.652482269503546,
+ "grad_norm": 2.675121307373047,
+ "learning_rate": 2.9865785511073565e-06,
+ "loss": 0.4833,
+ "step": 5610
+ },
+ {
+ "epoch": 2.652955082742317,
+ "grad_norm": 2.4375529289245605,
+ "learning_rate": 2.9859666387750353e-06,
+ "loss": 0.3949,
+ "step": 5611
+ },
+ {
+ "epoch": 2.6534278959810873,
+ "grad_norm": 2.7312581539154053,
+ "learning_rate": 2.9853546961837026e-06,
+ "loss": 0.4546,
+ "step": 5612
+ },
+ {
+ "epoch": 2.653900709219858,
+ "grad_norm": 2.7695999145507812,
+ "learning_rate": 2.9847427233714617e-06,
+ "loss": 0.4696,
+ "step": 5613
+ },
+ {
+ "epoch": 2.654373522458629,
+ "grad_norm": 2.6313109397888184,
+ "learning_rate": 2.984130720376416e-06,
+ "loss": 0.4733,
+ "step": 5614
+ },
+ {
+ "epoch": 2.6548463356973997,
+ "grad_norm": 2.656864881515503,
+ "learning_rate": 2.9835186872366733e-06,
+ "loss": 0.3806,
+ "step": 5615
+ },
+ {
+ "epoch": 2.65531914893617,
+ "grad_norm": 2.720075845718384,
+ "learning_rate": 2.982906623990342e-06,
+ "loss": 0.4041,
+ "step": 5616
+ },
+ {
+ "epoch": 2.655791962174941,
+ "grad_norm": 2.6684951782226562,
+ "learning_rate": 2.9822945306755334e-06,
+ "loss": 0.4552,
+ "step": 5617
+ },
+ {
+ "epoch": 2.6562647754137116,
+ "grad_norm": 2.567751884460449,
+ "learning_rate": 2.9816824073303585e-06,
+ "loss": 0.465,
+ "step": 5618
+ },
+ {
+ "epoch": 2.656737588652482,
+ "grad_norm": 2.7490367889404297,
+ "learning_rate": 2.981070253992933e-06,
+ "loss": 0.4647,
+ "step": 5619
+ },
+ {
+ "epoch": 2.657210401891253,
+ "grad_norm": 2.548656463623047,
+ "learning_rate": 2.9804580707013715e-06,
+ "loss": 0.4226,
+ "step": 5620
+ },
+ {
+ "epoch": 2.6576832151300236,
+ "grad_norm": 2.5484731197357178,
+ "learning_rate": 2.9798458574937927e-06,
+ "loss": 0.382,
+ "step": 5621
+ },
+ {
+ "epoch": 2.6581560283687944,
+ "grad_norm": 2.7293949127197266,
+ "learning_rate": 2.979233614408317e-06,
+ "loss": 0.4418,
+ "step": 5622
+ },
+ {
+ "epoch": 2.658628841607565,
+ "grad_norm": 2.645036458969116,
+ "learning_rate": 2.9786213414830646e-06,
+ "loss": 0.414,
+ "step": 5623
+ },
+ {
+ "epoch": 2.6591016548463355,
+ "grad_norm": 2.5287609100341797,
+ "learning_rate": 2.9780090387561604e-06,
+ "loss": 0.3914,
+ "step": 5624
+ },
+ {
+ "epoch": 2.6595744680851063,
+ "grad_norm": 2.5570411682128906,
+ "learning_rate": 2.9773967062657293e-06,
+ "loss": 0.4431,
+ "step": 5625
+ },
+ {
+ "epoch": 2.660047281323877,
+ "grad_norm": 2.681749105453491,
+ "learning_rate": 2.9767843440498983e-06,
+ "loss": 0.4245,
+ "step": 5626
+ },
+ {
+ "epoch": 2.6605200945626475,
+ "grad_norm": 2.8629777431488037,
+ "learning_rate": 2.976171952146798e-06,
+ "loss": 0.4643,
+ "step": 5627
+ },
+ {
+ "epoch": 2.6609929078014183,
+ "grad_norm": 2.577148199081421,
+ "learning_rate": 2.9755595305945573e-06,
+ "loss": 0.43,
+ "step": 5628
+ },
+ {
+ "epoch": 2.661465721040189,
+ "grad_norm": 2.747218370437622,
+ "learning_rate": 2.97494707943131e-06,
+ "loss": 0.5194,
+ "step": 5629
+ },
+ {
+ "epoch": 2.66193853427896,
+ "grad_norm": 2.535604953765869,
+ "learning_rate": 2.9743345986951904e-06,
+ "loss": 0.4401,
+ "step": 5630
+ },
+ {
+ "epoch": 2.6624113475177307,
+ "grad_norm": 3.3341166973114014,
+ "learning_rate": 2.973722088424336e-06,
+ "loss": 0.4925,
+ "step": 5631
+ },
+ {
+ "epoch": 2.662884160756501,
+ "grad_norm": 2.9264349937438965,
+ "learning_rate": 2.973109548656884e-06,
+ "loss": 0.4787,
+ "step": 5632
+ },
+ {
+ "epoch": 2.663356973995272,
+ "grad_norm": 2.7132506370544434,
+ "learning_rate": 2.9724969794309742e-06,
+ "loss": 0.4138,
+ "step": 5633
+ },
+ {
+ "epoch": 2.6638297872340426,
+ "grad_norm": 2.7970192432403564,
+ "learning_rate": 2.9718843807847497e-06,
+ "loss": 0.4896,
+ "step": 5634
+ },
+ {
+ "epoch": 2.664302600472813,
+ "grad_norm": 2.610208749771118,
+ "learning_rate": 2.9712717527563545e-06,
+ "loss": 0.3997,
+ "step": 5635
+ },
+ {
+ "epoch": 2.6647754137115838,
+ "grad_norm": 3.5483577251434326,
+ "learning_rate": 2.9706590953839335e-06,
+ "loss": 0.5109,
+ "step": 5636
+ },
+ {
+ "epoch": 2.6652482269503546,
+ "grad_norm": 2.746933698654175,
+ "learning_rate": 2.9700464087056345e-06,
+ "loss": 0.4672,
+ "step": 5637
+ },
+ {
+ "epoch": 2.6657210401891254,
+ "grad_norm": 2.704436779022217,
+ "learning_rate": 2.969433692759607e-06,
+ "loss": 0.4402,
+ "step": 5638
+ },
+ {
+ "epoch": 2.666193853427896,
+ "grad_norm": 2.859520196914673,
+ "learning_rate": 2.9688209475840005e-06,
+ "loss": 0.4679,
+ "step": 5639
+ },
+ {
+ "epoch": 2.6666666666666665,
+ "grad_norm": 2.518580436706543,
+ "learning_rate": 2.968208173216971e-06,
+ "loss": 0.3772,
+ "step": 5640
+ },
+ {
+ "epoch": 2.6671394799054373,
+ "grad_norm": 2.7624926567077637,
+ "learning_rate": 2.967595369696671e-06,
+ "loss": 0.4753,
+ "step": 5641
+ },
+ {
+ "epoch": 2.667612293144208,
+ "grad_norm": 2.654003620147705,
+ "learning_rate": 2.966982537061257e-06,
+ "loss": 0.4583,
+ "step": 5642
+ },
+ {
+ "epoch": 2.6680851063829785,
+ "grad_norm": 2.8473968505859375,
+ "learning_rate": 2.966369675348888e-06,
+ "loss": 0.4623,
+ "step": 5643
+ },
+ {
+ "epoch": 2.6685579196217493,
+ "grad_norm": 2.5587947368621826,
+ "learning_rate": 2.9657567845977253e-06,
+ "loss": 0.4014,
+ "step": 5644
+ },
+ {
+ "epoch": 2.66903073286052,
+ "grad_norm": 2.572220802307129,
+ "learning_rate": 2.96514386484593e-06,
+ "loss": 0.4249,
+ "step": 5645
+ },
+ {
+ "epoch": 2.669503546099291,
+ "grad_norm": 2.7995707988739014,
+ "learning_rate": 2.964530916131665e-06,
+ "loss": 0.4575,
+ "step": 5646
+ },
+ {
+ "epoch": 2.6699763593380617,
+ "grad_norm": 2.8712687492370605,
+ "learning_rate": 2.963917938493097e-06,
+ "loss": 0.4353,
+ "step": 5647
+ },
+ {
+ "epoch": 2.670449172576832,
+ "grad_norm": 2.856473207473755,
+ "learning_rate": 2.963304931968393e-06,
+ "loss": 0.4345,
+ "step": 5648
+ },
+ {
+ "epoch": 2.670921985815603,
+ "grad_norm": 2.709198474884033,
+ "learning_rate": 2.9626918965957224e-06,
+ "loss": 0.4116,
+ "step": 5649
+ },
+ {
+ "epoch": 2.6713947990543736,
+ "grad_norm": 2.8144607543945312,
+ "learning_rate": 2.962078832413257e-06,
+ "loss": 0.4575,
+ "step": 5650
+ },
+ {
+ "epoch": 2.671867612293144,
+ "grad_norm": 3.131911039352417,
+ "learning_rate": 2.961465739459168e-06,
+ "loss": 0.4743,
+ "step": 5651
+ },
+ {
+ "epoch": 2.6723404255319148,
+ "grad_norm": 2.8487515449523926,
+ "learning_rate": 2.9608526177716316e-06,
+ "loss": 0.4314,
+ "step": 5652
+ },
+ {
+ "epoch": 2.6728132387706856,
+ "grad_norm": 2.613229751586914,
+ "learning_rate": 2.960239467388823e-06,
+ "loss": 0.4807,
+ "step": 5653
+ },
+ {
+ "epoch": 2.6732860520094563,
+ "grad_norm": 2.5049116611480713,
+ "learning_rate": 2.9596262883489213e-06,
+ "loss": 0.4708,
+ "step": 5654
+ },
+ {
+ "epoch": 2.673758865248227,
+ "grad_norm": 2.6347460746765137,
+ "learning_rate": 2.9590130806901052e-06,
+ "loss": 0.3689,
+ "step": 5655
+ },
+ {
+ "epoch": 2.6742316784869975,
+ "grad_norm": 3.3290371894836426,
+ "learning_rate": 2.9583998444505578e-06,
+ "loss": 0.4674,
+ "step": 5656
+ },
+ {
+ "epoch": 2.6747044917257683,
+ "grad_norm": 2.748403549194336,
+ "learning_rate": 2.957786579668462e-06,
+ "loss": 0.3852,
+ "step": 5657
+ },
+ {
+ "epoch": 2.675177304964539,
+ "grad_norm": 2.837573766708374,
+ "learning_rate": 2.957173286382003e-06,
+ "loss": 0.4541,
+ "step": 5658
+ },
+ {
+ "epoch": 2.6756501182033094,
+ "grad_norm": 3.0976510047912598,
+ "learning_rate": 2.9565599646293686e-06,
+ "loss": 0.4669,
+ "step": 5659
+ },
+ {
+ "epoch": 2.6761229314420802,
+ "grad_norm": 2.7059597969055176,
+ "learning_rate": 2.955946614448747e-06,
+ "loss": 0.3935,
+ "step": 5660
+ },
+ {
+ "epoch": 2.676595744680851,
+ "grad_norm": 2.6700541973114014,
+ "learning_rate": 2.9553332358783294e-06,
+ "loss": 0.4322,
+ "step": 5661
+ },
+ {
+ "epoch": 2.677068557919622,
+ "grad_norm": 2.9782698154449463,
+ "learning_rate": 2.9547198289563068e-06,
+ "loss": 0.4338,
+ "step": 5662
+ },
+ {
+ "epoch": 2.6775413711583926,
+ "grad_norm": 2.637876510620117,
+ "learning_rate": 2.9541063937208755e-06,
+ "loss": 0.4289,
+ "step": 5663
+ },
+ {
+ "epoch": 2.678014184397163,
+ "grad_norm": 3.421949863433838,
+ "learning_rate": 2.953492930210229e-06,
+ "loss": 0.5458,
+ "step": 5664
+ },
+ {
+ "epoch": 2.678486997635934,
+ "grad_norm": 2.8273842334747314,
+ "learning_rate": 2.952879438462567e-06,
+ "loss": 0.4529,
+ "step": 5665
+ },
+ {
+ "epoch": 2.6789598108747046,
+ "grad_norm": 2.9090168476104736,
+ "learning_rate": 2.9522659185160873e-06,
+ "loss": 0.444,
+ "step": 5666
+ },
+ {
+ "epoch": 2.679432624113475,
+ "grad_norm": 2.646710157394409,
+ "learning_rate": 2.9516523704089927e-06,
+ "loss": 0.4226,
+ "step": 5667
+ },
+ {
+ "epoch": 2.6799054373522457,
+ "grad_norm": 2.65915584564209,
+ "learning_rate": 2.951038794179486e-06,
+ "loss": 0.4307,
+ "step": 5668
+ },
+ {
+ "epoch": 2.6803782505910165,
+ "grad_norm": 3.004507303237915,
+ "learning_rate": 2.950425189865771e-06,
+ "loss": 0.4799,
+ "step": 5669
+ },
+ {
+ "epoch": 2.6808510638297873,
+ "grad_norm": 2.5210134983062744,
+ "learning_rate": 2.949811557506054e-06,
+ "loss": 0.3842,
+ "step": 5670
+ },
+ {
+ "epoch": 2.681323877068558,
+ "grad_norm": 2.8072893619537354,
+ "learning_rate": 2.9491978971385436e-06,
+ "loss": 0.435,
+ "step": 5671
+ },
+ {
+ "epoch": 2.6817966903073285,
+ "grad_norm": 2.5701990127563477,
+ "learning_rate": 2.9485842088014498e-06,
+ "loss": 0.4932,
+ "step": 5672
+ },
+ {
+ "epoch": 2.6822695035460993,
+ "grad_norm": 2.9368457794189453,
+ "learning_rate": 2.9479704925329854e-06,
+ "loss": 0.455,
+ "step": 5673
+ },
+ {
+ "epoch": 2.68274231678487,
+ "grad_norm": 2.8576247692108154,
+ "learning_rate": 2.947356748371362e-06,
+ "loss": 0.4254,
+ "step": 5674
+ },
+ {
+ "epoch": 2.6832151300236404,
+ "grad_norm": 2.8999195098876953,
+ "learning_rate": 2.946742976354795e-06,
+ "loss": 0.4159,
+ "step": 5675
+ },
+ {
+ "epoch": 2.6836879432624112,
+ "grad_norm": 2.8439736366271973,
+ "learning_rate": 2.946129176521502e-06,
+ "loss": 0.4035,
+ "step": 5676
+ },
+ {
+ "epoch": 2.684160756501182,
+ "grad_norm": 2.8525729179382324,
+ "learning_rate": 2.945515348909702e-06,
+ "loss": 0.4137,
+ "step": 5677
+ },
+ {
+ "epoch": 2.684633569739953,
+ "grad_norm": 2.6573562622070312,
+ "learning_rate": 2.9449014935576147e-06,
+ "loss": 0.4203,
+ "step": 5678
+ },
+ {
+ "epoch": 2.6851063829787236,
+ "grad_norm": 2.765794277191162,
+ "learning_rate": 2.9442876105034616e-06,
+ "loss": 0.5184,
+ "step": 5679
+ },
+ {
+ "epoch": 2.685579196217494,
+ "grad_norm": 2.694617748260498,
+ "learning_rate": 2.943673699785467e-06,
+ "loss": 0.417,
+ "step": 5680
+ },
+ {
+ "epoch": 2.6860520094562648,
+ "grad_norm": 2.740774393081665,
+ "learning_rate": 2.943059761441857e-06,
+ "loss": 0.4431,
+ "step": 5681
+ },
+ {
+ "epoch": 2.6865248226950356,
+ "grad_norm": 2.670642614364624,
+ "learning_rate": 2.942445795510859e-06,
+ "loss": 0.4298,
+ "step": 5682
+ },
+ {
+ "epoch": 2.686997635933806,
+ "grad_norm": 2.838907241821289,
+ "learning_rate": 2.9418318020307e-06,
+ "loss": 0.4529,
+ "step": 5683
+ },
+ {
+ "epoch": 2.6874704491725767,
+ "grad_norm": 2.562317371368408,
+ "learning_rate": 2.9412177810396135e-06,
+ "loss": 0.4251,
+ "step": 5684
+ },
+ {
+ "epoch": 2.6879432624113475,
+ "grad_norm": 2.5805928707122803,
+ "learning_rate": 2.9406037325758298e-06,
+ "loss": 0.4405,
+ "step": 5685
+ },
+ {
+ "epoch": 2.6884160756501183,
+ "grad_norm": 2.5701205730438232,
+ "learning_rate": 2.939989656677583e-06,
+ "loss": 0.4184,
+ "step": 5686
+ },
+ {
+ "epoch": 2.688888888888889,
+ "grad_norm": 2.7990400791168213,
+ "learning_rate": 2.939375553383111e-06,
+ "loss": 0.4866,
+ "step": 5687
+ },
+ {
+ "epoch": 2.6893617021276595,
+ "grad_norm": 3.063319206237793,
+ "learning_rate": 2.9387614227306487e-06,
+ "loss": 0.4202,
+ "step": 5688
+ },
+ {
+ "epoch": 2.6898345153664303,
+ "grad_norm": 3.0891315937042236,
+ "learning_rate": 2.938147264758437e-06,
+ "loss": 0.4344,
+ "step": 5689
+ },
+ {
+ "epoch": 2.690307328605201,
+ "grad_norm": 2.8982670307159424,
+ "learning_rate": 2.9375330795047165e-06,
+ "loss": 0.4548,
+ "step": 5690
+ },
+ {
+ "epoch": 2.6907801418439714,
+ "grad_norm": 2.7947235107421875,
+ "learning_rate": 2.9369188670077293e-06,
+ "loss": 0.5028,
+ "step": 5691
+ },
+ {
+ "epoch": 2.691252955082742,
+ "grad_norm": 3.1615960597991943,
+ "learning_rate": 2.9363046273057206e-06,
+ "loss": 0.4855,
+ "step": 5692
+ },
+ {
+ "epoch": 2.691725768321513,
+ "grad_norm": 2.669516086578369,
+ "learning_rate": 2.935690360436935e-06,
+ "loss": 0.3813,
+ "step": 5693
+ },
+ {
+ "epoch": 2.692198581560284,
+ "grad_norm": 2.8743274211883545,
+ "learning_rate": 2.935076066439622e-06,
+ "loss": 0.4302,
+ "step": 5694
+ },
+ {
+ "epoch": 2.6926713947990546,
+ "grad_norm": 2.6829612255096436,
+ "learning_rate": 2.9344617453520295e-06,
+ "loss": 0.4063,
+ "step": 5695
+ },
+ {
+ "epoch": 2.693144208037825,
+ "grad_norm": 2.776447057723999,
+ "learning_rate": 2.9338473972124097e-06,
+ "loss": 0.4921,
+ "step": 5696
+ },
+ {
+ "epoch": 2.6936170212765957,
+ "grad_norm": 2.7865772247314453,
+ "learning_rate": 2.9332330220590143e-06,
+ "loss": 0.4939,
+ "step": 5697
+ },
+ {
+ "epoch": 2.6940898345153665,
+ "grad_norm": 3.020526170730591,
+ "learning_rate": 2.932618619930098e-06,
+ "loss": 0.4839,
+ "step": 5698
+ },
+ {
+ "epoch": 2.694562647754137,
+ "grad_norm": 2.637057065963745,
+ "learning_rate": 2.932004190863918e-06,
+ "loss": 0.4343,
+ "step": 5699
+ },
+ {
+ "epoch": 2.6950354609929077,
+ "grad_norm": 2.7426512241363525,
+ "learning_rate": 2.9313897348987314e-06,
+ "loss": 0.3609,
+ "step": 5700
+ },
+ {
+ "epoch": 2.6955082742316785,
+ "grad_norm": 2.767186164855957,
+ "learning_rate": 2.9307752520727974e-06,
+ "loss": 0.3793,
+ "step": 5701
+ },
+ {
+ "epoch": 2.6959810874704493,
+ "grad_norm": 2.4791622161865234,
+ "learning_rate": 2.930160742424377e-06,
+ "loss": 0.4192,
+ "step": 5702
+ },
+ {
+ "epoch": 2.69645390070922,
+ "grad_norm": 2.661461591720581,
+ "learning_rate": 2.9295462059917336e-06,
+ "loss": 0.4758,
+ "step": 5703
+ },
+ {
+ "epoch": 2.6969267139479904,
+ "grad_norm": 2.896242380142212,
+ "learning_rate": 2.928931642813131e-06,
+ "loss": 0.42,
+ "step": 5704
+ },
+ {
+ "epoch": 2.6973995271867612,
+ "grad_norm": 2.783813238143921,
+ "learning_rate": 2.9283170529268366e-06,
+ "loss": 0.4726,
+ "step": 5705
+ },
+ {
+ "epoch": 2.697872340425532,
+ "grad_norm": 2.4347333908081055,
+ "learning_rate": 2.927702436371117e-06,
+ "loss": 0.4199,
+ "step": 5706
+ },
+ {
+ "epoch": 2.6983451536643024,
+ "grad_norm": 2.4643805027008057,
+ "learning_rate": 2.927087793184242e-06,
+ "loss": 0.3578,
+ "step": 5707
+ },
+ {
+ "epoch": 2.698817966903073,
+ "grad_norm": 2.6396660804748535,
+ "learning_rate": 2.9264731234044835e-06,
+ "loss": 0.4509,
+ "step": 5708
+ },
+ {
+ "epoch": 2.699290780141844,
+ "grad_norm": 2.7341182231903076,
+ "learning_rate": 2.925858427070113e-06,
+ "loss": 0.4331,
+ "step": 5709
+ },
+ {
+ "epoch": 2.699763593380615,
+ "grad_norm": 2.7578938007354736,
+ "learning_rate": 2.9252437042194058e-06,
+ "loss": 0.4508,
+ "step": 5710
+ },
+ {
+ "epoch": 2.7002364066193856,
+ "grad_norm": 2.557788133621216,
+ "learning_rate": 2.9246289548906375e-06,
+ "loss": 0.3775,
+ "step": 5711
+ },
+ {
+ "epoch": 2.700709219858156,
+ "grad_norm": 2.802851676940918,
+ "learning_rate": 2.924014179122086e-06,
+ "loss": 0.4518,
+ "step": 5712
+ },
+ {
+ "epoch": 2.7011820330969267,
+ "grad_norm": 2.4773001670837402,
+ "learning_rate": 2.9233993769520313e-06,
+ "loss": 0.4019,
+ "step": 5713
+ },
+ {
+ "epoch": 2.7016548463356975,
+ "grad_norm": 3.108971357345581,
+ "learning_rate": 2.922784548418754e-06,
+ "loss": 0.4715,
+ "step": 5714
+ },
+ {
+ "epoch": 2.702127659574468,
+ "grad_norm": 2.8596770763397217,
+ "learning_rate": 2.9221696935605366e-06,
+ "loss": 0.4361,
+ "step": 5715
+ },
+ {
+ "epoch": 2.7026004728132387,
+ "grad_norm": 2.570604085922241,
+ "learning_rate": 2.9215548124156633e-06,
+ "loss": 0.3982,
+ "step": 5716
+ },
+ {
+ "epoch": 2.7030732860520095,
+ "grad_norm": 2.3157799243927,
+ "learning_rate": 2.9209399050224206e-06,
+ "loss": 0.456,
+ "step": 5717
+ },
+ {
+ "epoch": 2.7035460992907803,
+ "grad_norm": 2.6865758895874023,
+ "learning_rate": 2.9203249714190952e-06,
+ "loss": 0.4441,
+ "step": 5718
+ },
+ {
+ "epoch": 2.704018912529551,
+ "grad_norm": 2.76723313331604,
+ "learning_rate": 2.919710011643978e-06,
+ "loss": 0.464,
+ "step": 5719
+ },
+ {
+ "epoch": 2.7044917257683214,
+ "grad_norm": 2.648792028427124,
+ "learning_rate": 2.9190950257353578e-06,
+ "loss": 0.3426,
+ "step": 5720
+ },
+ {
+ "epoch": 2.704964539007092,
+ "grad_norm": 2.878739833831787,
+ "learning_rate": 2.9184800137315276e-06,
+ "loss": 0.4431,
+ "step": 5721
+ },
+ {
+ "epoch": 2.705437352245863,
+ "grad_norm": 2.670567274093628,
+ "learning_rate": 2.917864975670783e-06,
+ "loss": 0.4347,
+ "step": 5722
+ },
+ {
+ "epoch": 2.7059101654846334,
+ "grad_norm": 2.7031569480895996,
+ "learning_rate": 2.9172499115914184e-06,
+ "loss": 0.4557,
+ "step": 5723
+ },
+ {
+ "epoch": 2.706382978723404,
+ "grad_norm": 2.5225696563720703,
+ "learning_rate": 2.9166348215317314e-06,
+ "loss": 0.4159,
+ "step": 5724
+ },
+ {
+ "epoch": 2.706855791962175,
+ "grad_norm": 2.8676085472106934,
+ "learning_rate": 2.916019705530021e-06,
+ "loss": 0.5018,
+ "step": 5725
+ },
+ {
+ "epoch": 2.7073286052009458,
+ "grad_norm": 2.576463460922241,
+ "learning_rate": 2.915404563624587e-06,
+ "loss": 0.4317,
+ "step": 5726
+ },
+ {
+ "epoch": 2.7078014184397166,
+ "grad_norm": 3.155565023422241,
+ "learning_rate": 2.9147893958537328e-06,
+ "loss": 0.5029,
+ "step": 5727
+ },
+ {
+ "epoch": 2.708274231678487,
+ "grad_norm": 2.604079008102417,
+ "learning_rate": 2.9141742022557622e-06,
+ "loss": 0.4324,
+ "step": 5728
+ },
+ {
+ "epoch": 2.7087470449172577,
+ "grad_norm": 2.6597228050231934,
+ "learning_rate": 2.913558982868979e-06,
+ "loss": 0.4335,
+ "step": 5729
+ },
+ {
+ "epoch": 2.7092198581560285,
+ "grad_norm": 2.811384439468384,
+ "learning_rate": 2.9129437377316923e-06,
+ "loss": 0.4031,
+ "step": 5730
+ },
+ {
+ "epoch": 2.709692671394799,
+ "grad_norm": 3.1041207313537598,
+ "learning_rate": 2.91232846688221e-06,
+ "loss": 0.481,
+ "step": 5731
+ },
+ {
+ "epoch": 2.7101654846335697,
+ "grad_norm": 2.5992188453674316,
+ "learning_rate": 2.9117131703588414e-06,
+ "loss": 0.4266,
+ "step": 5732
+ },
+ {
+ "epoch": 2.7106382978723405,
+ "grad_norm": 2.7726242542266846,
+ "learning_rate": 2.911097848199899e-06,
+ "loss": 0.4464,
+ "step": 5733
+ },
+ {
+ "epoch": 2.7111111111111112,
+ "grad_norm": 2.8683483600616455,
+ "learning_rate": 2.9104825004436966e-06,
+ "loss": 0.4248,
+ "step": 5734
+ },
+ {
+ "epoch": 2.711583924349882,
+ "grad_norm": 2.776386022567749,
+ "learning_rate": 2.9098671271285484e-06,
+ "loss": 0.4556,
+ "step": 5735
+ },
+ {
+ "epoch": 2.7120567375886524,
+ "grad_norm": 2.7612528800964355,
+ "learning_rate": 2.909251728292771e-06,
+ "loss": 0.455,
+ "step": 5736
+ },
+ {
+ "epoch": 2.712529550827423,
+ "grad_norm": 2.9223551750183105,
+ "learning_rate": 2.908636303974684e-06,
+ "loss": 0.4302,
+ "step": 5737
+ },
+ {
+ "epoch": 2.713002364066194,
+ "grad_norm": 2.898226022720337,
+ "learning_rate": 2.908020854212606e-06,
+ "loss": 0.4827,
+ "step": 5738
+ },
+ {
+ "epoch": 2.7134751773049643,
+ "grad_norm": 2.706361770629883,
+ "learning_rate": 2.9074053790448576e-06,
+ "loss": 0.4444,
+ "step": 5739
+ },
+ {
+ "epoch": 2.713947990543735,
+ "grad_norm": 2.8227248191833496,
+ "learning_rate": 2.9067898785097637e-06,
+ "loss": 0.4661,
+ "step": 5740
+ },
+ {
+ "epoch": 2.714420803782506,
+ "grad_norm": 2.597837448120117,
+ "learning_rate": 2.9061743526456474e-06,
+ "loss": 0.4646,
+ "step": 5741
+ },
+ {
+ "epoch": 2.7148936170212767,
+ "grad_norm": 2.5525131225585938,
+ "learning_rate": 2.9055588014908354e-06,
+ "loss": 0.4172,
+ "step": 5742
+ },
+ {
+ "epoch": 2.7153664302600475,
+ "grad_norm": 2.713071823120117,
+ "learning_rate": 2.904943225083655e-06,
+ "loss": 0.4893,
+ "step": 5743
+ },
+ {
+ "epoch": 2.715839243498818,
+ "grad_norm": 2.538623571395874,
+ "learning_rate": 2.9043276234624353e-06,
+ "loss": 0.3905,
+ "step": 5744
+ },
+ {
+ "epoch": 2.7163120567375887,
+ "grad_norm": 2.5190389156341553,
+ "learning_rate": 2.9037119966655076e-06,
+ "loss": 0.4318,
+ "step": 5745
+ },
+ {
+ "epoch": 2.7167848699763595,
+ "grad_norm": 2.6587612628936768,
+ "learning_rate": 2.903096344731204e-06,
+ "loss": 0.4153,
+ "step": 5746
+ },
+ {
+ "epoch": 2.71725768321513,
+ "grad_norm": 2.836731433868408,
+ "learning_rate": 2.902480667697859e-06,
+ "loss": 0.4779,
+ "step": 5747
+ },
+ {
+ "epoch": 2.7177304964539006,
+ "grad_norm": 2.8076045513153076,
+ "learning_rate": 2.9018649656038074e-06,
+ "loss": 0.5126,
+ "step": 5748
+ },
+ {
+ "epoch": 2.7182033096926714,
+ "grad_norm": 2.8930516242980957,
+ "learning_rate": 2.9012492384873865e-06,
+ "loss": 0.4561,
+ "step": 5749
+ },
+ {
+ "epoch": 2.7186761229314422,
+ "grad_norm": 2.7000370025634766,
+ "learning_rate": 2.9006334863869343e-06,
+ "loss": 0.4659,
+ "step": 5750
+ },
+ {
+ "epoch": 2.719148936170213,
+ "grad_norm": 2.927011251449585,
+ "learning_rate": 2.9000177093407926e-06,
+ "loss": 0.5123,
+ "step": 5751
+ },
+ {
+ "epoch": 2.7196217494089834,
+ "grad_norm": 3.0102779865264893,
+ "learning_rate": 2.8994019073873015e-06,
+ "loss": 0.3972,
+ "step": 5752
+ },
+ {
+ "epoch": 2.720094562647754,
+ "grad_norm": 2.778838634490967,
+ "learning_rate": 2.8987860805648054e-06,
+ "loss": 0.4922,
+ "step": 5753
+ },
+ {
+ "epoch": 2.720567375886525,
+ "grad_norm": 2.6150314807891846,
+ "learning_rate": 2.898170228911648e-06,
+ "loss": 0.4425,
+ "step": 5754
+ },
+ {
+ "epoch": 2.7210401891252953,
+ "grad_norm": 2.9329984188079834,
+ "learning_rate": 2.8975543524661777e-06,
+ "loss": 0.4872,
+ "step": 5755
+ },
+ {
+ "epoch": 2.721513002364066,
+ "grad_norm": 2.756803512573242,
+ "learning_rate": 2.8969384512667404e-06,
+ "loss": 0.4362,
+ "step": 5756
+ },
+ {
+ "epoch": 2.721985815602837,
+ "grad_norm": 2.600877285003662,
+ "learning_rate": 2.896322525351686e-06,
+ "loss": 0.4802,
+ "step": 5757
+ },
+ {
+ "epoch": 2.7224586288416077,
+ "grad_norm": 2.647069215774536,
+ "learning_rate": 2.8957065747593655e-06,
+ "loss": 0.4649,
+ "step": 5758
+ },
+ {
+ "epoch": 2.7229314420803785,
+ "grad_norm": 2.845388174057007,
+ "learning_rate": 2.895090599528132e-06,
+ "loss": 0.4533,
+ "step": 5759
+ },
+ {
+ "epoch": 2.723404255319149,
+ "grad_norm": 2.973881721496582,
+ "learning_rate": 2.8944745996963397e-06,
+ "loss": 0.4959,
+ "step": 5760
+ },
+ {
+ "epoch": 2.7238770685579197,
+ "grad_norm": 2.8995487689971924,
+ "learning_rate": 2.8938585753023435e-06,
+ "loss": 0.4597,
+ "step": 5761
+ },
+ {
+ "epoch": 2.7243498817966905,
+ "grad_norm": 2.903693437576294,
+ "learning_rate": 2.8932425263845004e-06,
+ "loss": 0.4521,
+ "step": 5762
+ },
+ {
+ "epoch": 2.724822695035461,
+ "grad_norm": 2.7609009742736816,
+ "learning_rate": 2.8926264529811702e-06,
+ "loss": 0.4399,
+ "step": 5763
+ },
+ {
+ "epoch": 2.7252955082742316,
+ "grad_norm": 2.788787603378296,
+ "learning_rate": 2.892010355130712e-06,
+ "loss": 0.4614,
+ "step": 5764
+ },
+ {
+ "epoch": 2.7257683215130024,
+ "grad_norm": 2.786498785018921,
+ "learning_rate": 2.8913942328714887e-06,
+ "loss": 0.4798,
+ "step": 5765
+ },
+ {
+ "epoch": 2.726241134751773,
+ "grad_norm": 2.9809393882751465,
+ "learning_rate": 2.8907780862418616e-06,
+ "loss": 0.5108,
+ "step": 5766
+ },
+ {
+ "epoch": 2.726713947990544,
+ "grad_norm": 2.6621177196502686,
+ "learning_rate": 2.8901619152801967e-06,
+ "loss": 0.4031,
+ "step": 5767
+ },
+ {
+ "epoch": 2.7271867612293144,
+ "grad_norm": 3.3092098236083984,
+ "learning_rate": 2.8895457200248607e-06,
+ "loss": 0.4671,
+ "step": 5768
+ },
+ {
+ "epoch": 2.727659574468085,
+ "grad_norm": 2.866306781768799,
+ "learning_rate": 2.8889295005142204e-06,
+ "loss": 0.4434,
+ "step": 5769
+ },
+ {
+ "epoch": 2.728132387706856,
+ "grad_norm": 2.6861231327056885,
+ "learning_rate": 2.888313256786646e-06,
+ "loss": 0.429,
+ "step": 5770
+ },
+ {
+ "epoch": 2.7286052009456263,
+ "grad_norm": 2.873180389404297,
+ "learning_rate": 2.8876969888805072e-06,
+ "loss": 0.4412,
+ "step": 5771
+ },
+ {
+ "epoch": 2.729078014184397,
+ "grad_norm": 2.511678695678711,
+ "learning_rate": 2.887080696834178e-06,
+ "loss": 0.4024,
+ "step": 5772
+ },
+ {
+ "epoch": 2.729550827423168,
+ "grad_norm": 2.6502726078033447,
+ "learning_rate": 2.88646438068603e-06,
+ "loss": 0.4357,
+ "step": 5773
+ },
+ {
+ "epoch": 2.7300236406619387,
+ "grad_norm": 2.7156145572662354,
+ "learning_rate": 2.8858480404744403e-06,
+ "loss": 0.4511,
+ "step": 5774
+ },
+ {
+ "epoch": 2.7304964539007095,
+ "grad_norm": 2.882582187652588,
+ "learning_rate": 2.8852316762377842e-06,
+ "loss": 0.4822,
+ "step": 5775
+ },
+ {
+ "epoch": 2.73096926713948,
+ "grad_norm": 2.7139666080474854,
+ "learning_rate": 2.8846152880144413e-06,
+ "loss": 0.4666,
+ "step": 5776
+ },
+ {
+ "epoch": 2.7314420803782506,
+ "grad_norm": 2.7453949451446533,
+ "learning_rate": 2.8839988758427907e-06,
+ "loss": 0.3927,
+ "step": 5777
+ },
+ {
+ "epoch": 2.731914893617021,
+ "grad_norm": 2.7859580516815186,
+ "learning_rate": 2.883382439761214e-06,
+ "loss": 0.4466,
+ "step": 5778
+ },
+ {
+ "epoch": 2.732387706855792,
+ "grad_norm": 2.695234537124634,
+ "learning_rate": 2.882765979808094e-06,
+ "loss": 0.4227,
+ "step": 5779
+ },
+ {
+ "epoch": 2.7328605200945626,
+ "grad_norm": 2.8081552982330322,
+ "learning_rate": 2.8821494960218148e-06,
+ "loss": 0.447,
+ "step": 5780
+ },
+ {
+ "epoch": 2.7333333333333334,
+ "grad_norm": 2.887643337249756,
+ "learning_rate": 2.881532988440762e-06,
+ "loss": 0.5018,
+ "step": 5781
+ },
+ {
+ "epoch": 2.733806146572104,
+ "grad_norm": 3.108212471008301,
+ "learning_rate": 2.8809164571033233e-06,
+ "loss": 0.4132,
+ "step": 5782
+ },
+ {
+ "epoch": 2.7342789598108745,
+ "grad_norm": 2.874328374862671,
+ "learning_rate": 2.880299902047886e-06,
+ "loss": 0.4618,
+ "step": 5783
+ },
+ {
+ "epoch": 2.7347517730496453,
+ "grad_norm": 3.089132308959961,
+ "learning_rate": 2.879683323312843e-06,
+ "loss": 0.4956,
+ "step": 5784
+ },
+ {
+ "epoch": 2.735224586288416,
+ "grad_norm": 2.5173206329345703,
+ "learning_rate": 2.879066720936583e-06,
+ "loss": 0.4087,
+ "step": 5785
+ },
+ {
+ "epoch": 2.7356973995271865,
+ "grad_norm": 2.6401286125183105,
+ "learning_rate": 2.8784500949575014e-06,
+ "loss": 0.3995,
+ "step": 5786
+ },
+ {
+ "epoch": 2.7361702127659573,
+ "grad_norm": 2.9371910095214844,
+ "learning_rate": 2.877833445413991e-06,
+ "loss": 0.5209,
+ "step": 5787
+ },
+ {
+ "epoch": 2.736643026004728,
+ "grad_norm": 3.218158006668091,
+ "learning_rate": 2.8772167723444498e-06,
+ "loss": 0.4275,
+ "step": 5788
+ },
+ {
+ "epoch": 2.737115839243499,
+ "grad_norm": 2.9072160720825195,
+ "learning_rate": 2.8766000757872736e-06,
+ "loss": 0.4244,
+ "step": 5789
+ },
+ {
+ "epoch": 2.7375886524822697,
+ "grad_norm": 3.0378096103668213,
+ "learning_rate": 2.8759833557808614e-06,
+ "loss": 0.507,
+ "step": 5790
+ },
+ {
+ "epoch": 2.73806146572104,
+ "grad_norm": 2.728353977203369,
+ "learning_rate": 2.8753666123636148e-06,
+ "loss": 0.413,
+ "step": 5791
+ },
+ {
+ "epoch": 2.738534278959811,
+ "grad_norm": 2.6869957447052,
+ "learning_rate": 2.874749845573935e-06,
+ "loss": 0.44,
+ "step": 5792
+ },
+ {
+ "epoch": 2.7390070921985816,
+ "grad_norm": 2.6381702423095703,
+ "learning_rate": 2.8741330554502263e-06,
+ "loss": 0.4708,
+ "step": 5793
+ },
+ {
+ "epoch": 2.739479905437352,
+ "grad_norm": 2.6944689750671387,
+ "learning_rate": 2.873516242030892e-06,
+ "loss": 0.4555,
+ "step": 5794
+ },
+ {
+ "epoch": 2.739952718676123,
+ "grad_norm": 3.168473243713379,
+ "learning_rate": 2.8728994053543396e-06,
+ "loss": 0.4538,
+ "step": 5795
+ },
+ {
+ "epoch": 2.7404255319148936,
+ "grad_norm": 2.7504515647888184,
+ "learning_rate": 2.872282545458976e-06,
+ "loss": 0.4628,
+ "step": 5796
+ },
+ {
+ "epoch": 2.7408983451536644,
+ "grad_norm": 2.896462917327881,
+ "learning_rate": 2.8716656623832114e-06,
+ "loss": 0.4946,
+ "step": 5797
+ },
+ {
+ "epoch": 2.741371158392435,
+ "grad_norm": 2.8053417205810547,
+ "learning_rate": 2.8710487561654547e-06,
+ "loss": 0.4893,
+ "step": 5798
+ },
+ {
+ "epoch": 2.7418439716312055,
+ "grad_norm": 2.63171124458313,
+ "learning_rate": 2.870431826844119e-06,
+ "loss": 0.4257,
+ "step": 5799
+ },
+ {
+ "epoch": 2.7423167848699763,
+ "grad_norm": 3.0963807106018066,
+ "learning_rate": 2.869814874457618e-06,
+ "loss": 0.5404,
+ "step": 5800
+ },
+ {
+ "epoch": 2.742789598108747,
+ "grad_norm": 2.591132164001465,
+ "learning_rate": 2.8691978990443664e-06,
+ "loss": 0.4015,
+ "step": 5801
+ },
+ {
+ "epoch": 2.7432624113475175,
+ "grad_norm": 3.0319552421569824,
+ "learning_rate": 2.8685809006427812e-06,
+ "loss": 0.4411,
+ "step": 5802
+ },
+ {
+ "epoch": 2.7437352245862883,
+ "grad_norm": 2.7791874408721924,
+ "learning_rate": 2.8679638792912784e-06,
+ "loss": 0.43,
+ "step": 5803
+ },
+ {
+ "epoch": 2.744208037825059,
+ "grad_norm": 3.530632495880127,
+ "learning_rate": 2.867346835028279e-06,
+ "loss": 0.4581,
+ "step": 5804
+ },
+ {
+ "epoch": 2.74468085106383,
+ "grad_norm": 3.2043099403381348,
+ "learning_rate": 2.8667297678922024e-06,
+ "loss": 0.4375,
+ "step": 5805
+ },
+ {
+ "epoch": 2.7451536643026007,
+ "grad_norm": 2.8442344665527344,
+ "learning_rate": 2.8661126779214716e-06,
+ "loss": 0.4059,
+ "step": 5806
+ },
+ {
+ "epoch": 2.745626477541371,
+ "grad_norm": 2.7561380863189697,
+ "learning_rate": 2.86549556515451e-06,
+ "loss": 0.4391,
+ "step": 5807
+ },
+ {
+ "epoch": 2.746099290780142,
+ "grad_norm": 3.229663848876953,
+ "learning_rate": 2.8648784296297418e-06,
+ "loss": 0.4579,
+ "step": 5808
+ },
+ {
+ "epoch": 2.7465721040189126,
+ "grad_norm": 2.8375027179718018,
+ "learning_rate": 2.864261271385593e-06,
+ "loss": 0.4566,
+ "step": 5809
+ },
+ {
+ "epoch": 2.747044917257683,
+ "grad_norm": 2.392998695373535,
+ "learning_rate": 2.863644090460493e-06,
+ "loss": 0.4123,
+ "step": 5810
+ },
+ {
+ "epoch": 2.7475177304964538,
+ "grad_norm": 2.707610607147217,
+ "learning_rate": 2.86302688689287e-06,
+ "loss": 0.4299,
+ "step": 5811
+ },
+ {
+ "epoch": 2.7479905437352246,
+ "grad_norm": 2.824042797088623,
+ "learning_rate": 2.8624096607211547e-06,
+ "loss": 0.3799,
+ "step": 5812
+ },
+ {
+ "epoch": 2.7484633569739954,
+ "grad_norm": 2.933102607727051,
+ "learning_rate": 2.861792411983779e-06,
+ "loss": 0.4416,
+ "step": 5813
+ },
+ {
+ "epoch": 2.748936170212766,
+ "grad_norm": 2.7000277042388916,
+ "learning_rate": 2.8611751407191757e-06,
+ "loss": 0.4346,
+ "step": 5814
+ },
+ {
+ "epoch": 2.7494089834515365,
+ "grad_norm": 2.6757142543792725,
+ "learning_rate": 2.86055784696578e-06,
+ "loss": 0.4559,
+ "step": 5815
+ },
+ {
+ "epoch": 2.7498817966903073,
+ "grad_norm": 2.5791053771972656,
+ "learning_rate": 2.8599405307620287e-06,
+ "loss": 0.4619,
+ "step": 5816
+ },
+ {
+ "epoch": 2.750354609929078,
+ "grad_norm": 3.0327374935150146,
+ "learning_rate": 2.859323192146359e-06,
+ "loss": 0.4163,
+ "step": 5817
+ },
+ {
+ "epoch": 2.7508274231678485,
+ "grad_norm": 2.5580220222473145,
+ "learning_rate": 2.8587058311572084e-06,
+ "loss": 0.4005,
+ "step": 5818
+ },
+ {
+ "epoch": 2.7513002364066192,
+ "grad_norm": 2.592179536819458,
+ "learning_rate": 2.85808844783302e-06,
+ "loss": 0.4404,
+ "step": 5819
+ },
+ {
+ "epoch": 2.75177304964539,
+ "grad_norm": 3.2779927253723145,
+ "learning_rate": 2.8574710422122342e-06,
+ "loss": 0.54,
+ "step": 5820
+ },
+ {
+ "epoch": 2.752245862884161,
+ "grad_norm": 2.4804370403289795,
+ "learning_rate": 2.8568536143332933e-06,
+ "loss": 0.4476,
+ "step": 5821
+ },
+ {
+ "epoch": 2.7527186761229316,
+ "grad_norm": 2.649477481842041,
+ "learning_rate": 2.8562361642346427e-06,
+ "loss": 0.4336,
+ "step": 5822
+ },
+ {
+ "epoch": 2.753191489361702,
+ "grad_norm": 3.138587474822998,
+ "learning_rate": 2.855618691954728e-06,
+ "loss": 0.5042,
+ "step": 5823
+ },
+ {
+ "epoch": 2.753664302600473,
+ "grad_norm": 2.75093412399292,
+ "learning_rate": 2.855001197531997e-06,
+ "loss": 0.4327,
+ "step": 5824
+ },
+ {
+ "epoch": 2.7541371158392436,
+ "grad_norm": 2.678809642791748,
+ "learning_rate": 2.854383681004898e-06,
+ "loss": 0.4409,
+ "step": 5825
+ },
+ {
+ "epoch": 2.754609929078014,
+ "grad_norm": 2.965386390686035,
+ "learning_rate": 2.853766142411881e-06,
+ "loss": 0.4716,
+ "step": 5826
+ },
+ {
+ "epoch": 2.7550827423167847,
+ "grad_norm": 2.6419436931610107,
+ "learning_rate": 2.853148581791398e-06,
+ "loss": 0.4367,
+ "step": 5827
+ },
+ {
+ "epoch": 2.7555555555555555,
+ "grad_norm": 3.205794095993042,
+ "learning_rate": 2.8525309991819004e-06,
+ "loss": 0.4869,
+ "step": 5828
+ },
+ {
+ "epoch": 2.7560283687943263,
+ "grad_norm": 3.041008472442627,
+ "learning_rate": 2.851913394621844e-06,
+ "loss": 0.5087,
+ "step": 5829
+ },
+ {
+ "epoch": 2.756501182033097,
+ "grad_norm": 2.6525566577911377,
+ "learning_rate": 2.851295768149684e-06,
+ "loss": 0.3951,
+ "step": 5830
+ },
+ {
+ "epoch": 2.7569739952718675,
+ "grad_norm": 2.732220411300659,
+ "learning_rate": 2.850678119803876e-06,
+ "loss": 0.4797,
+ "step": 5831
+ },
+ {
+ "epoch": 2.7574468085106383,
+ "grad_norm": 2.8965251445770264,
+ "learning_rate": 2.8500604496228797e-06,
+ "loss": 0.4938,
+ "step": 5832
+ },
+ {
+ "epoch": 2.757919621749409,
+ "grad_norm": 2.48020076751709,
+ "learning_rate": 2.849442757645154e-06,
+ "loss": 0.4172,
+ "step": 5833
+ },
+ {
+ "epoch": 2.7583924349881794,
+ "grad_norm": 2.4764912128448486,
+ "learning_rate": 2.8488250439091603e-06,
+ "loss": 0.4123,
+ "step": 5834
+ },
+ {
+ "epoch": 2.7588652482269502,
+ "grad_norm": 2.4547016620635986,
+ "learning_rate": 2.84820730845336e-06,
+ "loss": 0.4116,
+ "step": 5835
+ },
+ {
+ "epoch": 2.759338061465721,
+ "grad_norm": 2.55476975440979,
+ "learning_rate": 2.847589551316218e-06,
+ "loss": 0.4744,
+ "step": 5836
+ },
+ {
+ "epoch": 2.759810874704492,
+ "grad_norm": 2.3866238594055176,
+ "learning_rate": 2.846971772536199e-06,
+ "loss": 0.4406,
+ "step": 5837
+ },
+ {
+ "epoch": 2.7602836879432626,
+ "grad_norm": 2.855318784713745,
+ "learning_rate": 2.8463539721517687e-06,
+ "loss": 0.4517,
+ "step": 5838
+ },
+ {
+ "epoch": 2.760756501182033,
+ "grad_norm": 2.527198314666748,
+ "learning_rate": 2.8457361502013954e-06,
+ "loss": 0.3588,
+ "step": 5839
+ },
+ {
+ "epoch": 2.7612293144208038,
+ "grad_norm": 2.6761462688446045,
+ "learning_rate": 2.8451183067235476e-06,
+ "loss": 0.4192,
+ "step": 5840
+ },
+ {
+ "epoch": 2.7617021276595746,
+ "grad_norm": 2.5692319869995117,
+ "learning_rate": 2.8445004417566967e-06,
+ "loss": 0.4108,
+ "step": 5841
+ },
+ {
+ "epoch": 2.762174940898345,
+ "grad_norm": 2.5721096992492676,
+ "learning_rate": 2.8438825553393133e-06,
+ "loss": 0.3941,
+ "step": 5842
+ },
+ {
+ "epoch": 2.7626477541371157,
+ "grad_norm": 2.699430227279663,
+ "learning_rate": 2.843264647509872e-06,
+ "loss": 0.4418,
+ "step": 5843
+ },
+ {
+ "epoch": 2.7631205673758865,
+ "grad_norm": 2.6943318843841553,
+ "learning_rate": 2.842646718306846e-06,
+ "loss": 0.4505,
+ "step": 5844
+ },
+ {
+ "epoch": 2.7635933806146573,
+ "grad_norm": 2.661656379699707,
+ "learning_rate": 2.8420287677687107e-06,
+ "loss": 0.4413,
+ "step": 5845
+ },
+ {
+ "epoch": 2.764066193853428,
+ "grad_norm": 2.830467939376831,
+ "learning_rate": 2.8414107959339444e-06,
+ "loss": 0.5095,
+ "step": 5846
+ },
+ {
+ "epoch": 2.7645390070921985,
+ "grad_norm": 2.598053455352783,
+ "learning_rate": 2.840792802841024e-06,
+ "loss": 0.4029,
+ "step": 5847
+ },
+ {
+ "epoch": 2.7650118203309693,
+ "grad_norm": 2.641700029373169,
+ "learning_rate": 2.8401747885284316e-06,
+ "loss": 0.4237,
+ "step": 5848
+ },
+ {
+ "epoch": 2.76548463356974,
+ "grad_norm": 2.6672768592834473,
+ "learning_rate": 2.8395567530346454e-06,
+ "loss": 0.4181,
+ "step": 5849
+ },
+ {
+ "epoch": 2.7659574468085104,
+ "grad_norm": 2.5851705074310303,
+ "learning_rate": 2.838938696398149e-06,
+ "loss": 0.4165,
+ "step": 5850
+ },
+ {
+ "epoch": 2.766430260047281,
+ "grad_norm": 2.318120002746582,
+ "learning_rate": 2.8383206186574276e-06,
+ "loss": 0.3578,
+ "step": 5851
+ },
+ {
+ "epoch": 2.766903073286052,
+ "grad_norm": 2.6199793815612793,
+ "learning_rate": 2.8377025198509635e-06,
+ "loss": 0.4719,
+ "step": 5852
+ },
+ {
+ "epoch": 2.767375886524823,
+ "grad_norm": 2.7186086177825928,
+ "learning_rate": 2.837084400017245e-06,
+ "loss": 0.41,
+ "step": 5853
+ },
+ {
+ "epoch": 2.7678486997635936,
+ "grad_norm": 2.702514886856079,
+ "learning_rate": 2.8364662591947583e-06,
+ "loss": 0.4659,
+ "step": 5854
+ },
+ {
+ "epoch": 2.768321513002364,
+ "grad_norm": 2.612375259399414,
+ "learning_rate": 2.835848097421993e-06,
+ "loss": 0.4252,
+ "step": 5855
+ },
+ {
+ "epoch": 2.7687943262411348,
+ "grad_norm": 3.0127978324890137,
+ "learning_rate": 2.8352299147374394e-06,
+ "loss": 0.4084,
+ "step": 5856
+ },
+ {
+ "epoch": 2.7692671394799055,
+ "grad_norm": 2.6460049152374268,
+ "learning_rate": 2.83461171117959e-06,
+ "loss": 0.4035,
+ "step": 5857
+ },
+ {
+ "epoch": 2.769739952718676,
+ "grad_norm": 2.9844725131988525,
+ "learning_rate": 2.8339934867869357e-06,
+ "loss": 0.4912,
+ "step": 5858
+ },
+ {
+ "epoch": 2.7702127659574467,
+ "grad_norm": 2.731217861175537,
+ "learning_rate": 2.833375241597972e-06,
+ "loss": 0.4112,
+ "step": 5859
+ },
+ {
+ "epoch": 2.7706855791962175,
+ "grad_norm": 2.731194496154785,
+ "learning_rate": 2.832756975651193e-06,
+ "loss": 0.4516,
+ "step": 5860
+ },
+ {
+ "epoch": 2.7711583924349883,
+ "grad_norm": 3.0532076358795166,
+ "learning_rate": 2.8321386889850965e-06,
+ "loss": 0.3959,
+ "step": 5861
+ },
+ {
+ "epoch": 2.771631205673759,
+ "grad_norm": 3.5437800884246826,
+ "learning_rate": 2.831520381638181e-06,
+ "loss": 0.6055,
+ "step": 5862
+ },
+ {
+ "epoch": 2.7721040189125294,
+ "grad_norm": 2.4297714233398438,
+ "learning_rate": 2.830902053648944e-06,
+ "loss": 0.4038,
+ "step": 5863
+ },
+ {
+ "epoch": 2.7725768321513002,
+ "grad_norm": 2.696768045425415,
+ "learning_rate": 2.8302837050558876e-06,
+ "loss": 0.3983,
+ "step": 5864
+ },
+ {
+ "epoch": 2.773049645390071,
+ "grad_norm": 2.6574649810791016,
+ "learning_rate": 2.8296653358975122e-06,
+ "loss": 0.4937,
+ "step": 5865
+ },
+ {
+ "epoch": 2.7735224586288414,
+ "grad_norm": 2.9393341541290283,
+ "learning_rate": 2.8290469462123234e-06,
+ "loss": 0.4603,
+ "step": 5866
+ },
+ {
+ "epoch": 2.773995271867612,
+ "grad_norm": 2.7630696296691895,
+ "learning_rate": 2.828428536038824e-06,
+ "loss": 0.4663,
+ "step": 5867
+ },
+ {
+ "epoch": 2.774468085106383,
+ "grad_norm": 2.7354233264923096,
+ "learning_rate": 2.8278101054155183e-06,
+ "loss": 0.4444,
+ "step": 5868
+ },
+ {
+ "epoch": 2.774940898345154,
+ "grad_norm": 3.0489425659179688,
+ "learning_rate": 2.827191654380915e-06,
+ "loss": 0.4684,
+ "step": 5869
+ },
+ {
+ "epoch": 2.7754137115839246,
+ "grad_norm": 2.9602572917938232,
+ "learning_rate": 2.8265731829735226e-06,
+ "loss": 0.4571,
+ "step": 5870
+ },
+ {
+ "epoch": 2.775886524822695,
+ "grad_norm": 2.774132013320923,
+ "learning_rate": 2.825954691231851e-06,
+ "loss": 0.4458,
+ "step": 5871
+ },
+ {
+ "epoch": 2.7763593380614657,
+ "grad_norm": 2.696622133255005,
+ "learning_rate": 2.825336179194409e-06,
+ "loss": 0.4933,
+ "step": 5872
+ },
+ {
+ "epoch": 2.7768321513002365,
+ "grad_norm": 2.742184638977051,
+ "learning_rate": 2.8247176468997096e-06,
+ "loss": 0.4464,
+ "step": 5873
+ },
+ {
+ "epoch": 2.777304964539007,
+ "grad_norm": 2.7033183574676514,
+ "learning_rate": 2.824099094386266e-06,
+ "loss": 0.4369,
+ "step": 5874
+ },
+ {
+ "epoch": 2.7777777777777777,
+ "grad_norm": 2.7264044284820557,
+ "learning_rate": 2.8234805216925935e-06,
+ "loss": 0.4621,
+ "step": 5875
+ },
+ {
+ "epoch": 2.7782505910165485,
+ "grad_norm": 2.6417739391326904,
+ "learning_rate": 2.822861928857208e-06,
+ "loss": 0.4254,
+ "step": 5876
+ },
+ {
+ "epoch": 2.7787234042553193,
+ "grad_norm": 3.17209529876709,
+ "learning_rate": 2.8222433159186245e-06,
+ "loss": 0.5011,
+ "step": 5877
+ },
+ {
+ "epoch": 2.77919621749409,
+ "grad_norm": 3.1434381008148193,
+ "learning_rate": 2.8216246829153633e-06,
+ "loss": 0.4567,
+ "step": 5878
+ },
+ {
+ "epoch": 2.7796690307328604,
+ "grad_norm": 2.781608819961548,
+ "learning_rate": 2.821006029885943e-06,
+ "loss": 0.4723,
+ "step": 5879
+ },
+ {
+ "epoch": 2.780141843971631,
+ "grad_norm": 3.00079345703125,
+ "learning_rate": 2.820387356868885e-06,
+ "loss": 0.4796,
+ "step": 5880
+ },
+ {
+ "epoch": 2.780614657210402,
+ "grad_norm": 2.703555107116699,
+ "learning_rate": 2.819768663902712e-06,
+ "loss": 0.4577,
+ "step": 5881
+ },
+ {
+ "epoch": 2.7810874704491724,
+ "grad_norm": 2.5741801261901855,
+ "learning_rate": 2.8191499510259453e-06,
+ "loss": 0.4255,
+ "step": 5882
+ },
+ {
+ "epoch": 2.781560283687943,
+ "grad_norm": 2.9871208667755127,
+ "learning_rate": 2.8185312182771112e-06,
+ "loss": 0.4495,
+ "step": 5883
+ },
+ {
+ "epoch": 2.782033096926714,
+ "grad_norm": 2.525317668914795,
+ "learning_rate": 2.8179124656947343e-06,
+ "loss": 0.4428,
+ "step": 5884
+ },
+ {
+ "epoch": 2.7825059101654848,
+ "grad_norm": 2.525092840194702,
+ "learning_rate": 2.817293693317343e-06,
+ "loss": 0.4348,
+ "step": 5885
+ },
+ {
+ "epoch": 2.7829787234042556,
+ "grad_norm": 2.8485171794891357,
+ "learning_rate": 2.816674901183464e-06,
+ "loss": 0.4206,
+ "step": 5886
+ },
+ {
+ "epoch": 2.783451536643026,
+ "grad_norm": 2.6612746715545654,
+ "learning_rate": 2.8160560893316272e-06,
+ "loss": 0.396,
+ "step": 5887
+ },
+ {
+ "epoch": 2.7839243498817967,
+ "grad_norm": 2.7093865871429443,
+ "learning_rate": 2.815437257800364e-06,
+ "loss": 0.4468,
+ "step": 5888
+ },
+ {
+ "epoch": 2.7843971631205675,
+ "grad_norm": 2.6130900382995605,
+ "learning_rate": 2.814818406628206e-06,
+ "loss": 0.443,
+ "step": 5889
+ },
+ {
+ "epoch": 2.784869976359338,
+ "grad_norm": 2.8147552013397217,
+ "learning_rate": 2.8141995358536866e-06,
+ "loss": 0.4454,
+ "step": 5890
+ },
+ {
+ "epoch": 2.7853427895981087,
+ "grad_norm": 2.5621275901794434,
+ "learning_rate": 2.8135806455153395e-06,
+ "loss": 0.439,
+ "step": 5891
+ },
+ {
+ "epoch": 2.7858156028368795,
+ "grad_norm": 2.880228281021118,
+ "learning_rate": 2.812961735651701e-06,
+ "loss": 0.3895,
+ "step": 5892
+ },
+ {
+ "epoch": 2.7862884160756503,
+ "grad_norm": 2.5861377716064453,
+ "learning_rate": 2.8123428063013068e-06,
+ "loss": 0.4402,
+ "step": 5893
+ },
+ {
+ "epoch": 2.786761229314421,
+ "grad_norm": 2.9707765579223633,
+ "learning_rate": 2.811723857502696e-06,
+ "loss": 0.4461,
+ "step": 5894
+ },
+ {
+ "epoch": 2.7872340425531914,
+ "grad_norm": 2.923999309539795,
+ "learning_rate": 2.811104889294408e-06,
+ "loss": 0.4395,
+ "step": 5895
+ },
+ {
+ "epoch": 2.787706855791962,
+ "grad_norm": 2.846933603286743,
+ "learning_rate": 2.810485901714981e-06,
+ "loss": 0.5168,
+ "step": 5896
+ },
+ {
+ "epoch": 2.788179669030733,
+ "grad_norm": 4.1052350997924805,
+ "learning_rate": 2.8098668948029597e-06,
+ "loss": 0.5152,
+ "step": 5897
+ },
+ {
+ "epoch": 2.7886524822695034,
+ "grad_norm": 2.7391018867492676,
+ "learning_rate": 2.8092478685968856e-06,
+ "loss": 0.4515,
+ "step": 5898
+ },
+ {
+ "epoch": 2.789125295508274,
+ "grad_norm": 2.976088285446167,
+ "learning_rate": 2.8086288231353027e-06,
+ "loss": 0.5156,
+ "step": 5899
+ },
+ {
+ "epoch": 2.789598108747045,
+ "grad_norm": 2.6139633655548096,
+ "learning_rate": 2.8080097584567562e-06,
+ "loss": 0.4237,
+ "step": 5900
+ },
+ {
+ "epoch": 2.7900709219858157,
+ "grad_norm": 2.501654624938965,
+ "learning_rate": 2.807390674599792e-06,
+ "loss": 0.4349,
+ "step": 5901
+ },
+ {
+ "epoch": 2.7905437352245865,
+ "grad_norm": 2.8814525604248047,
+ "learning_rate": 2.8067715716029586e-06,
+ "loss": 0.4866,
+ "step": 5902
+ },
+ {
+ "epoch": 2.791016548463357,
+ "grad_norm": 2.7953200340270996,
+ "learning_rate": 2.8061524495048046e-06,
+ "loss": 0.3964,
+ "step": 5903
+ },
+ {
+ "epoch": 2.7914893617021277,
+ "grad_norm": 2.7362849712371826,
+ "learning_rate": 2.8055333083438808e-06,
+ "loss": 0.4181,
+ "step": 5904
+ },
+ {
+ "epoch": 2.7919621749408985,
+ "grad_norm": 2.9740512371063232,
+ "learning_rate": 2.8049141481587366e-06,
+ "loss": 0.4784,
+ "step": 5905
+ },
+ {
+ "epoch": 2.792434988179669,
+ "grad_norm": 2.595813274383545,
+ "learning_rate": 2.8042949689879262e-06,
+ "loss": 0.4421,
+ "step": 5906
+ },
+ {
+ "epoch": 2.7929078014184396,
+ "grad_norm": 2.886899948120117,
+ "learning_rate": 2.803675770870002e-06,
+ "loss": 0.4435,
+ "step": 5907
+ },
+ {
+ "epoch": 2.7933806146572104,
+ "grad_norm": 2.6057486534118652,
+ "learning_rate": 2.8030565538435196e-06,
+ "loss": 0.4472,
+ "step": 5908
+ },
+ {
+ "epoch": 2.7938534278959812,
+ "grad_norm": 2.7422802448272705,
+ "learning_rate": 2.802437317947034e-06,
+ "loss": 0.4799,
+ "step": 5909
+ },
+ {
+ "epoch": 2.794326241134752,
+ "grad_norm": 2.3904244899749756,
+ "learning_rate": 2.801818063219102e-06,
+ "loss": 0.4508,
+ "step": 5910
+ },
+ {
+ "epoch": 2.7947990543735224,
+ "grad_norm": 2.8434207439422607,
+ "learning_rate": 2.8011987896982835e-06,
+ "loss": 0.4473,
+ "step": 5911
+ },
+ {
+ "epoch": 2.795271867612293,
+ "grad_norm": 2.916088819503784,
+ "learning_rate": 2.8005794974231366e-06,
+ "loss": 0.464,
+ "step": 5912
+ },
+ {
+ "epoch": 2.795744680851064,
+ "grad_norm": 2.6483397483825684,
+ "learning_rate": 2.7999601864322236e-06,
+ "loss": 0.441,
+ "step": 5913
+ },
+ {
+ "epoch": 2.7962174940898343,
+ "grad_norm": 2.9287428855895996,
+ "learning_rate": 2.7993408567641033e-06,
+ "loss": 0.4551,
+ "step": 5914
+ },
+ {
+ "epoch": 2.796690307328605,
+ "grad_norm": 2.575024127960205,
+ "learning_rate": 2.798721508457342e-06,
+ "loss": 0.4494,
+ "step": 5915
+ },
+ {
+ "epoch": 2.797163120567376,
+ "grad_norm": 2.7156829833984375,
+ "learning_rate": 2.7981021415505015e-06,
+ "loss": 0.419,
+ "step": 5916
+ },
+ {
+ "epoch": 2.7976359338061467,
+ "grad_norm": 2.850553035736084,
+ "learning_rate": 2.7974827560821482e-06,
+ "loss": 0.4709,
+ "step": 5917
+ },
+ {
+ "epoch": 2.7981087470449175,
+ "grad_norm": 2.673846483230591,
+ "learning_rate": 2.796863352090847e-06,
+ "loss": 0.4224,
+ "step": 5918
+ },
+ {
+ "epoch": 2.798581560283688,
+ "grad_norm": 2.9093217849731445,
+ "learning_rate": 2.796243929615168e-06,
+ "loss": 0.468,
+ "step": 5919
+ },
+ {
+ "epoch": 2.7990543735224587,
+ "grad_norm": 2.4853813648223877,
+ "learning_rate": 2.7956244886936775e-06,
+ "loss": 0.4723,
+ "step": 5920
+ },
+ {
+ "epoch": 2.7995271867612295,
+ "grad_norm": 3.026428461074829,
+ "learning_rate": 2.795005029364946e-06,
+ "loss": 0.4721,
+ "step": 5921
+ },
+ {
+ "epoch": 2.8,
+ "grad_norm": 2.886295795440674,
+ "learning_rate": 2.794385551667546e-06,
+ "loss": 0.456,
+ "step": 5922
+ },
+ {
+ "epoch": 2.8004728132387706,
+ "grad_norm": 3.2260656356811523,
+ "learning_rate": 2.7937660556400486e-06,
+ "loss": 0.4499,
+ "step": 5923
+ },
+ {
+ "epoch": 2.8009456264775414,
+ "grad_norm": 2.7971982955932617,
+ "learning_rate": 2.793146541321027e-06,
+ "loss": 0.3982,
+ "step": 5924
+ },
+ {
+ "epoch": 2.801418439716312,
+ "grad_norm": 2.85461163520813,
+ "learning_rate": 2.7925270087490546e-06,
+ "loss": 0.4841,
+ "step": 5925
+ },
+ {
+ "epoch": 2.801891252955083,
+ "grad_norm": 3.0642316341400146,
+ "learning_rate": 2.7919074579627086e-06,
+ "loss": 0.4538,
+ "step": 5926
+ },
+ {
+ "epoch": 2.8023640661938534,
+ "grad_norm": 2.9053616523742676,
+ "learning_rate": 2.7912878890005657e-06,
+ "loss": 0.434,
+ "step": 5927
+ },
+ {
+ "epoch": 2.802836879432624,
+ "grad_norm": 2.7649240493774414,
+ "learning_rate": 2.7906683019012027e-06,
+ "loss": 0.414,
+ "step": 5928
+ },
+ {
+ "epoch": 2.803309692671395,
+ "grad_norm": 2.8717660903930664,
+ "learning_rate": 2.7900486967031987e-06,
+ "loss": 0.4337,
+ "step": 5929
+ },
+ {
+ "epoch": 2.8037825059101653,
+ "grad_norm": 2.6860995292663574,
+ "learning_rate": 2.789429073445135e-06,
+ "loss": 0.447,
+ "step": 5930
+ },
+ {
+ "epoch": 2.804255319148936,
+ "grad_norm": 2.67509126663208,
+ "learning_rate": 2.7888094321655918e-06,
+ "loss": 0.4955,
+ "step": 5931
+ },
+ {
+ "epoch": 2.804728132387707,
+ "grad_norm": 2.7426326274871826,
+ "learning_rate": 2.7881897729031514e-06,
+ "loss": 0.4564,
+ "step": 5932
+ },
+ {
+ "epoch": 2.8052009456264777,
+ "grad_norm": 2.7087252140045166,
+ "learning_rate": 2.7875700956963973e-06,
+ "loss": 0.4571,
+ "step": 5933
+ },
+ {
+ "epoch": 2.8056737588652485,
+ "grad_norm": 2.513526439666748,
+ "learning_rate": 2.7869504005839147e-06,
+ "loss": 0.4361,
+ "step": 5934
+ },
+ {
+ "epoch": 2.806146572104019,
+ "grad_norm": 3.2246084213256836,
+ "learning_rate": 2.7863306876042885e-06,
+ "loss": 0.4612,
+ "step": 5935
+ },
+ {
+ "epoch": 2.8066193853427897,
+ "grad_norm": 3.226325511932373,
+ "learning_rate": 2.7857109567961066e-06,
+ "loss": 0.4528,
+ "step": 5936
+ },
+ {
+ "epoch": 2.8070921985815604,
+ "grad_norm": 2.8861422538757324,
+ "learning_rate": 2.785091208197956e-06,
+ "loss": 0.5049,
+ "step": 5937
+ },
+ {
+ "epoch": 2.807565011820331,
+ "grad_norm": 2.76279616355896,
+ "learning_rate": 2.7844714418484257e-06,
+ "loss": 0.4714,
+ "step": 5938
+ },
+ {
+ "epoch": 2.8080378250591016,
+ "grad_norm": 2.9591920375823975,
+ "learning_rate": 2.7838516577861063e-06,
+ "loss": 0.4633,
+ "step": 5939
+ },
+ {
+ "epoch": 2.8085106382978724,
+ "grad_norm": 2.536916971206665,
+ "learning_rate": 2.7832318560495885e-06,
+ "loss": 0.4108,
+ "step": 5940
+ },
+ {
+ "epoch": 2.808983451536643,
+ "grad_norm": 3.2484991550445557,
+ "learning_rate": 2.7826120366774657e-06,
+ "loss": 0.4888,
+ "step": 5941
+ },
+ {
+ "epoch": 2.8094562647754135,
+ "grad_norm": 2.7129359245300293,
+ "learning_rate": 2.781992199708329e-06,
+ "loss": 0.4008,
+ "step": 5942
+ },
+ {
+ "epoch": 2.8099290780141843,
+ "grad_norm": 2.4176113605499268,
+ "learning_rate": 2.781372345180776e-06,
+ "loss": 0.3864,
+ "step": 5943
+ },
+ {
+ "epoch": 2.810401891252955,
+ "grad_norm": 2.6557252407073975,
+ "learning_rate": 2.7807524731334e-06,
+ "loss": 0.4295,
+ "step": 5944
+ },
+ {
+ "epoch": 2.8108747044917255,
+ "grad_norm": 2.9191324710845947,
+ "learning_rate": 2.7801325836047993e-06,
+ "loss": 0.4854,
+ "step": 5945
+ },
+ {
+ "epoch": 2.8113475177304963,
+ "grad_norm": 2.6325371265411377,
+ "learning_rate": 2.7795126766335705e-06,
+ "loss": 0.4332,
+ "step": 5946
+ },
+ {
+ "epoch": 2.811820330969267,
+ "grad_norm": 2.658337116241455,
+ "learning_rate": 2.778892752258314e-06,
+ "loss": 0.4276,
+ "step": 5947
+ },
+ {
+ "epoch": 2.812293144208038,
+ "grad_norm": 2.763782262802124,
+ "learning_rate": 2.778272810517627e-06,
+ "loss": 0.4246,
+ "step": 5948
+ },
+ {
+ "epoch": 2.8127659574468087,
+ "grad_norm": 2.407607078552246,
+ "learning_rate": 2.777652851450113e-06,
+ "loss": 0.3788,
+ "step": 5949
+ },
+ {
+ "epoch": 2.813238770685579,
+ "grad_norm": 3.0339951515197754,
+ "learning_rate": 2.7770328750943736e-06,
+ "loss": 0.477,
+ "step": 5950
+ },
+ {
+ "epoch": 2.81371158392435,
+ "grad_norm": 2.3475773334503174,
+ "learning_rate": 2.776412881489012e-06,
+ "loss": 0.4206,
+ "step": 5951
+ },
+ {
+ "epoch": 2.8141843971631206,
+ "grad_norm": 3.0455260276794434,
+ "learning_rate": 2.7757928706726318e-06,
+ "loss": 0.4301,
+ "step": 5952
+ },
+ {
+ "epoch": 2.814657210401891,
+ "grad_norm": 2.803920030593872,
+ "learning_rate": 2.7751728426838386e-06,
+ "loss": 0.3738,
+ "step": 5953
+ },
+ {
+ "epoch": 2.815130023640662,
+ "grad_norm": 3.1083319187164307,
+ "learning_rate": 2.77455279756124e-06,
+ "loss": 0.5365,
+ "step": 5954
+ },
+ {
+ "epoch": 2.8156028368794326,
+ "grad_norm": 3.180809497833252,
+ "learning_rate": 2.7739327353434427e-06,
+ "loss": 0.4789,
+ "step": 5955
+ },
+ {
+ "epoch": 2.8160756501182034,
+ "grad_norm": 2.975043773651123,
+ "learning_rate": 2.7733126560690543e-06,
+ "loss": 0.4798,
+ "step": 5956
+ },
+ {
+ "epoch": 2.816548463356974,
+ "grad_norm": 2.765475034713745,
+ "learning_rate": 2.772692559776685e-06,
+ "loss": 0.4206,
+ "step": 5957
+ },
+ {
+ "epoch": 2.8170212765957445,
+ "grad_norm": 2.48612380027771,
+ "learning_rate": 2.7720724465049463e-06,
+ "loss": 0.4234,
+ "step": 5958
+ },
+ {
+ "epoch": 2.8174940898345153,
+ "grad_norm": 2.7145729064941406,
+ "learning_rate": 2.77145231629245e-06,
+ "loss": 0.4713,
+ "step": 5959
+ },
+ {
+ "epoch": 2.817966903073286,
+ "grad_norm": 2.5993762016296387,
+ "learning_rate": 2.7708321691778074e-06,
+ "loss": 0.4144,
+ "step": 5960
+ },
+ {
+ "epoch": 2.8184397163120565,
+ "grad_norm": 3.0902538299560547,
+ "learning_rate": 2.770212005199633e-06,
+ "loss": 0.4822,
+ "step": 5961
+ },
+ {
+ "epoch": 2.8189125295508273,
+ "grad_norm": 2.849757671356201,
+ "learning_rate": 2.7695918243965424e-06,
+ "loss": 0.4449,
+ "step": 5962
+ },
+ {
+ "epoch": 2.819385342789598,
+ "grad_norm": 2.77148699760437,
+ "learning_rate": 2.768971626807151e-06,
+ "loss": 0.4448,
+ "step": 5963
+ },
+ {
+ "epoch": 2.819858156028369,
+ "grad_norm": 2.7865898609161377,
+ "learning_rate": 2.7683514124700757e-06,
+ "loss": 0.4944,
+ "step": 5964
+ },
+ {
+ "epoch": 2.8203309692671397,
+ "grad_norm": 2.9057955741882324,
+ "learning_rate": 2.767731181423934e-06,
+ "loss": 0.5074,
+ "step": 5965
+ },
+ {
+ "epoch": 2.82080378250591,
+ "grad_norm": 2.725837469100952,
+ "learning_rate": 2.7671109337073465e-06,
+ "loss": 0.4207,
+ "step": 5966
+ },
+ {
+ "epoch": 2.821276595744681,
+ "grad_norm": 3.078531265258789,
+ "learning_rate": 2.7664906693589315e-06,
+ "loss": 0.4835,
+ "step": 5967
+ },
+ {
+ "epoch": 2.8217494089834516,
+ "grad_norm": 2.8692002296447754,
+ "learning_rate": 2.765870388417312e-06,
+ "loss": 0.4284,
+ "step": 5968
+ },
+ {
+ "epoch": 2.822222222222222,
+ "grad_norm": 2.8519723415374756,
+ "learning_rate": 2.765250090921109e-06,
+ "loss": 0.541,
+ "step": 5969
+ },
+ {
+ "epoch": 2.8226950354609928,
+ "grad_norm": 3.2037532329559326,
+ "learning_rate": 2.7646297769089457e-06,
+ "loss": 0.4276,
+ "step": 5970
+ },
+ {
+ "epoch": 2.8231678486997636,
+ "grad_norm": 2.8637137413024902,
+ "learning_rate": 2.7640094464194468e-06,
+ "loss": 0.4904,
+ "step": 5971
+ },
+ {
+ "epoch": 2.8236406619385344,
+ "grad_norm": 2.681516408920288,
+ "learning_rate": 2.7633890994912372e-06,
+ "loss": 0.4942,
+ "step": 5972
+ },
+ {
+ "epoch": 2.824113475177305,
+ "grad_norm": 3.0035219192504883,
+ "learning_rate": 2.7627687361629434e-06,
+ "loss": 0.4556,
+ "step": 5973
+ },
+ {
+ "epoch": 2.8245862884160755,
+ "grad_norm": 2.8107759952545166,
+ "learning_rate": 2.7621483564731923e-06,
+ "loss": 0.4225,
+ "step": 5974
+ },
+ {
+ "epoch": 2.8250591016548463,
+ "grad_norm": 2.87276029586792,
+ "learning_rate": 2.7615279604606126e-06,
+ "loss": 0.5045,
+ "step": 5975
+ },
+ {
+ "epoch": 2.825531914893617,
+ "grad_norm": 2.687953233718872,
+ "learning_rate": 2.760907548163833e-06,
+ "loss": 0.4018,
+ "step": 5976
+ },
+ {
+ "epoch": 2.8260047281323875,
+ "grad_norm": 2.587979555130005,
+ "learning_rate": 2.760287119621486e-06,
+ "loss": 0.4407,
+ "step": 5977
+ },
+ {
+ "epoch": 2.8264775413711583,
+ "grad_norm": 2.805602550506592,
+ "learning_rate": 2.7596666748722e-06,
+ "loss": 0.4559,
+ "step": 5978
+ },
+ {
+ "epoch": 2.826950354609929,
+ "grad_norm": 2.320763111114502,
+ "learning_rate": 2.759046213954609e-06,
+ "loss": 0.3847,
+ "step": 5979
+ },
+ {
+ "epoch": 2.8274231678487,
+ "grad_norm": 2.6876401901245117,
+ "learning_rate": 2.758425736907347e-06,
+ "loss": 0.4528,
+ "step": 5980
+ },
+ {
+ "epoch": 2.8278959810874706,
+ "grad_norm": 2.6852915287017822,
+ "learning_rate": 2.757805243769046e-06,
+ "loss": 0.395,
+ "step": 5981
+ },
+ {
+ "epoch": 2.828368794326241,
+ "grad_norm": 2.808326005935669,
+ "learning_rate": 2.7571847345783447e-06,
+ "loss": 0.4647,
+ "step": 5982
+ },
+ {
+ "epoch": 2.828841607565012,
+ "grad_norm": 2.641479015350342,
+ "learning_rate": 2.7565642093738766e-06,
+ "loss": 0.3798,
+ "step": 5983
+ },
+ {
+ "epoch": 2.8293144208037826,
+ "grad_norm": 2.8066110610961914,
+ "learning_rate": 2.7559436681942803e-06,
+ "loss": 0.5072,
+ "step": 5984
+ },
+ {
+ "epoch": 2.829787234042553,
+ "grad_norm": 2.898375988006592,
+ "learning_rate": 2.7553231110781936e-06,
+ "loss": 0.5182,
+ "step": 5985
+ },
+ {
+ "epoch": 2.8302600472813237,
+ "grad_norm": 2.704890489578247,
+ "learning_rate": 2.7547025380642574e-06,
+ "loss": 0.3999,
+ "step": 5986
+ },
+ {
+ "epoch": 2.8307328605200945,
+ "grad_norm": 2.6024270057678223,
+ "learning_rate": 2.7540819491911106e-06,
+ "loss": 0.4302,
+ "step": 5987
+ },
+ {
+ "epoch": 2.8312056737588653,
+ "grad_norm": 2.8006081581115723,
+ "learning_rate": 2.7534613444973946e-06,
+ "loss": 0.4492,
+ "step": 5988
+ },
+ {
+ "epoch": 2.831678486997636,
+ "grad_norm": 2.9532058238983154,
+ "learning_rate": 2.752840724021752e-06,
+ "loss": 0.4552,
+ "step": 5989
+ },
+ {
+ "epoch": 2.8321513002364065,
+ "grad_norm": 3.1830217838287354,
+ "learning_rate": 2.7522200878028265e-06,
+ "loss": 0.5013,
+ "step": 5990
+ },
+ {
+ "epoch": 2.8326241134751773,
+ "grad_norm": 2.716176748275757,
+ "learning_rate": 2.7515994358792624e-06,
+ "loss": 0.4569,
+ "step": 5991
+ },
+ {
+ "epoch": 2.833096926713948,
+ "grad_norm": 2.6852715015411377,
+ "learning_rate": 2.7509787682897044e-06,
+ "loss": 0.4764,
+ "step": 5992
+ },
+ {
+ "epoch": 2.8335697399527184,
+ "grad_norm": 2.9383316040039062,
+ "learning_rate": 2.7503580850727985e-06,
+ "loss": 0.5205,
+ "step": 5993
+ },
+ {
+ "epoch": 2.8340425531914892,
+ "grad_norm": 2.703132152557373,
+ "learning_rate": 2.749737386267193e-06,
+ "loss": 0.4543,
+ "step": 5994
+ },
+ {
+ "epoch": 2.83451536643026,
+ "grad_norm": 2.4304885864257812,
+ "learning_rate": 2.7491166719115354e-06,
+ "loss": 0.4479,
+ "step": 5995
+ },
+ {
+ "epoch": 2.834988179669031,
+ "grad_norm": 2.975722551345825,
+ "learning_rate": 2.748495942044475e-06,
+ "loss": 0.4074,
+ "step": 5996
+ },
+ {
+ "epoch": 2.8354609929078016,
+ "grad_norm": 3.440208911895752,
+ "learning_rate": 2.7478751967046617e-06,
+ "loss": 0.4497,
+ "step": 5997
+ },
+ {
+ "epoch": 2.835933806146572,
+ "grad_norm": 2.734673261642456,
+ "learning_rate": 2.747254435930747e-06,
+ "loss": 0.437,
+ "step": 5998
+ },
+ {
+ "epoch": 2.8364066193853428,
+ "grad_norm": 3.1918959617614746,
+ "learning_rate": 2.7466336597613826e-06,
+ "loss": 0.4197,
+ "step": 5999
+ },
+ {
+ "epoch": 2.8368794326241136,
+ "grad_norm": 3.1440329551696777,
+ "learning_rate": 2.7460128682352216e-06,
+ "loss": 0.4425,
+ "step": 6000
+ },
+ {
+ "epoch": 2.837352245862884,
+ "grad_norm": 2.582993507385254,
+ "learning_rate": 2.7453920613909183e-06,
+ "loss": 0.4475,
+ "step": 6001
+ },
+ {
+ "epoch": 2.8378250591016547,
+ "grad_norm": 3.2682149410247803,
+ "learning_rate": 2.744771239267128e-06,
+ "loss": 0.4615,
+ "step": 6002
+ },
+ {
+ "epoch": 2.8382978723404255,
+ "grad_norm": 2.848477840423584,
+ "learning_rate": 2.7441504019025046e-06,
+ "loss": 0.4093,
+ "step": 6003
+ },
+ {
+ "epoch": 2.8387706855791963,
+ "grad_norm": 2.3582282066345215,
+ "learning_rate": 2.7435295493357067e-06,
+ "loss": 0.3911,
+ "step": 6004
+ },
+ {
+ "epoch": 2.839243498817967,
+ "grad_norm": 2.7707207202911377,
+ "learning_rate": 2.742908681605392e-06,
+ "loss": 0.4069,
+ "step": 6005
+ },
+ {
+ "epoch": 2.8397163120567375,
+ "grad_norm": 3.0763752460479736,
+ "learning_rate": 2.7422877987502183e-06,
+ "loss": 0.512,
+ "step": 6006
+ },
+ {
+ "epoch": 2.8401891252955083,
+ "grad_norm": 2.8027124404907227,
+ "learning_rate": 2.741666900808846e-06,
+ "loss": 0.4922,
+ "step": 6007
+ },
+ {
+ "epoch": 2.840661938534279,
+ "grad_norm": 2.487982988357544,
+ "learning_rate": 2.7410459878199353e-06,
+ "loss": 0.4368,
+ "step": 6008
+ },
+ {
+ "epoch": 2.8411347517730494,
+ "grad_norm": 2.8727993965148926,
+ "learning_rate": 2.7404250598221484e-06,
+ "loss": 0.4639,
+ "step": 6009
+ },
+ {
+ "epoch": 2.84160756501182,
+ "grad_norm": 2.5556678771972656,
+ "learning_rate": 2.739804116854147e-06,
+ "loss": 0.4217,
+ "step": 6010
+ },
+ {
+ "epoch": 2.842080378250591,
+ "grad_norm": 2.6306912899017334,
+ "learning_rate": 2.7391831589545948e-06,
+ "loss": 0.4816,
+ "step": 6011
+ },
+ {
+ "epoch": 2.842553191489362,
+ "grad_norm": 2.7340946197509766,
+ "learning_rate": 2.7385621861621557e-06,
+ "loss": 0.4113,
+ "step": 6012
+ },
+ {
+ "epoch": 2.8430260047281326,
+ "grad_norm": 2.834190607070923,
+ "learning_rate": 2.737941198515495e-06,
+ "loss": 0.4691,
+ "step": 6013
+ },
+ {
+ "epoch": 2.843498817966903,
+ "grad_norm": 2.7139697074890137,
+ "learning_rate": 2.737320196053281e-06,
+ "loss": 0.3798,
+ "step": 6014
+ },
+ {
+ "epoch": 2.8439716312056738,
+ "grad_norm": 2.7934985160827637,
+ "learning_rate": 2.736699178814177e-06,
+ "loss": 0.446,
+ "step": 6015
+ },
+ {
+ "epoch": 2.8444444444444446,
+ "grad_norm": 2.6941518783569336,
+ "learning_rate": 2.7360781468368534e-06,
+ "loss": 0.4787,
+ "step": 6016
+ },
+ {
+ "epoch": 2.844917257683215,
+ "grad_norm": 3.1530468463897705,
+ "learning_rate": 2.7354571001599792e-06,
+ "loss": 0.474,
+ "step": 6017
+ },
+ {
+ "epoch": 2.8453900709219857,
+ "grad_norm": 2.613875389099121,
+ "learning_rate": 2.7348360388222243e-06,
+ "loss": 0.4297,
+ "step": 6018
+ },
+ {
+ "epoch": 2.8458628841607565,
+ "grad_norm": 2.5481486320495605,
+ "learning_rate": 2.7342149628622587e-06,
+ "loss": 0.3762,
+ "step": 6019
+ },
+ {
+ "epoch": 2.8463356973995273,
+ "grad_norm": 2.6425609588623047,
+ "learning_rate": 2.7335938723187544e-06,
+ "loss": 0.4077,
+ "step": 6020
+ },
+ {
+ "epoch": 2.846808510638298,
+ "grad_norm": 2.6281731128692627,
+ "learning_rate": 2.7329727672303836e-06,
+ "loss": 0.466,
+ "step": 6021
+ },
+ {
+ "epoch": 2.8472813238770684,
+ "grad_norm": 2.8862180709838867,
+ "learning_rate": 2.7323516476358197e-06,
+ "loss": 0.4191,
+ "step": 6022
+ },
+ {
+ "epoch": 2.8477541371158392,
+ "grad_norm": 2.907731533050537,
+ "learning_rate": 2.7317305135737383e-06,
+ "loss": 0.4867,
+ "step": 6023
+ },
+ {
+ "epoch": 2.84822695035461,
+ "grad_norm": 2.825593948364258,
+ "learning_rate": 2.731109365082814e-06,
+ "loss": 0.4888,
+ "step": 6024
+ },
+ {
+ "epoch": 2.8486997635933804,
+ "grad_norm": 2.478163003921509,
+ "learning_rate": 2.730488202201722e-06,
+ "loss": 0.4714,
+ "step": 6025
+ },
+ {
+ "epoch": 2.849172576832151,
+ "grad_norm": 2.928899049758911,
+ "learning_rate": 2.7298670249691418e-06,
+ "loss": 0.4671,
+ "step": 6026
+ },
+ {
+ "epoch": 2.849645390070922,
+ "grad_norm": 2.778256893157959,
+ "learning_rate": 2.7292458334237488e-06,
+ "loss": 0.429,
+ "step": 6027
+ },
+ {
+ "epoch": 2.850118203309693,
+ "grad_norm": 3.0689055919647217,
+ "learning_rate": 2.7286246276042234e-06,
+ "loss": 0.4727,
+ "step": 6028
+ },
+ {
+ "epoch": 2.8505910165484636,
+ "grad_norm": 2.582066774368286,
+ "learning_rate": 2.7280034075492447e-06,
+ "loss": 0.4025,
+ "step": 6029
+ },
+ {
+ "epoch": 2.851063829787234,
+ "grad_norm": 3.6679015159606934,
+ "learning_rate": 2.7273821732974936e-06,
+ "loss": 0.4856,
+ "step": 6030
+ },
+ {
+ "epoch": 2.8515366430260047,
+ "grad_norm": 2.7222588062286377,
+ "learning_rate": 2.7267609248876516e-06,
+ "loss": 0.4255,
+ "step": 6031
+ },
+ {
+ "epoch": 2.8520094562647755,
+ "grad_norm": 2.455038547515869,
+ "learning_rate": 2.726139662358401e-06,
+ "loss": 0.4234,
+ "step": 6032
+ },
+ {
+ "epoch": 2.852482269503546,
+ "grad_norm": 2.8277318477630615,
+ "learning_rate": 2.7255183857484253e-06,
+ "loss": 0.4146,
+ "step": 6033
+ },
+ {
+ "epoch": 2.8529550827423167,
+ "grad_norm": 2.523615837097168,
+ "learning_rate": 2.724897095096409e-06,
+ "loss": 0.4227,
+ "step": 6034
+ },
+ {
+ "epoch": 2.8534278959810875,
+ "grad_norm": 3.353646755218506,
+ "learning_rate": 2.724275790441036e-06,
+ "loss": 0.5041,
+ "step": 6035
+ },
+ {
+ "epoch": 2.8539007092198583,
+ "grad_norm": 2.753981828689575,
+ "learning_rate": 2.7236544718209934e-06,
+ "loss": 0.4646,
+ "step": 6036
+ },
+ {
+ "epoch": 2.854373522458629,
+ "grad_norm": 2.954744577407837,
+ "learning_rate": 2.723033139274967e-06,
+ "loss": 0.5182,
+ "step": 6037
+ },
+ {
+ "epoch": 2.8548463356973994,
+ "grad_norm": 2.4814131259918213,
+ "learning_rate": 2.7224117928416462e-06,
+ "loss": 0.4626,
+ "step": 6038
+ },
+ {
+ "epoch": 2.8553191489361702,
+ "grad_norm": 2.7414886951446533,
+ "learning_rate": 2.721790432559717e-06,
+ "loss": 0.4111,
+ "step": 6039
+ },
+ {
+ "epoch": 2.855791962174941,
+ "grad_norm": 2.8743896484375,
+ "learning_rate": 2.7211690584678706e-06,
+ "loss": 0.4986,
+ "step": 6040
+ },
+ {
+ "epoch": 2.8562647754137114,
+ "grad_norm": 3.0691921710968018,
+ "learning_rate": 2.720547670604797e-06,
+ "loss": 0.4743,
+ "step": 6041
+ },
+ {
+ "epoch": 2.856737588652482,
+ "grad_norm": 2.7273411750793457,
+ "learning_rate": 2.7199262690091872e-06,
+ "loss": 0.4403,
+ "step": 6042
+ },
+ {
+ "epoch": 2.857210401891253,
+ "grad_norm": 2.8022944927215576,
+ "learning_rate": 2.7193048537197325e-06,
+ "loss": 0.4413,
+ "step": 6043
+ },
+ {
+ "epoch": 2.8576832151300238,
+ "grad_norm": 2.4883248805999756,
+ "learning_rate": 2.718683424775126e-06,
+ "loss": 0.4485,
+ "step": 6044
+ },
+ {
+ "epoch": 2.8581560283687946,
+ "grad_norm": 2.457249879837036,
+ "learning_rate": 2.718061982214062e-06,
+ "loss": 0.4167,
+ "step": 6045
+ },
+ {
+ "epoch": 2.858628841607565,
+ "grad_norm": 2.7210328578948975,
+ "learning_rate": 2.717440526075234e-06,
+ "loss": 0.4419,
+ "step": 6046
+ },
+ {
+ "epoch": 2.8591016548463357,
+ "grad_norm": 2.684483766555786,
+ "learning_rate": 2.7168190563973386e-06,
+ "loss": 0.4449,
+ "step": 6047
+ },
+ {
+ "epoch": 2.8595744680851065,
+ "grad_norm": 2.5305230617523193,
+ "learning_rate": 2.7161975732190706e-06,
+ "loss": 0.3829,
+ "step": 6048
+ },
+ {
+ "epoch": 2.860047281323877,
+ "grad_norm": 3.0284602642059326,
+ "learning_rate": 2.7155760765791278e-06,
+ "loss": 0.5164,
+ "step": 6049
+ },
+ {
+ "epoch": 2.8605200945626477,
+ "grad_norm": 3.154599189758301,
+ "learning_rate": 2.7149545665162085e-06,
+ "loss": 0.527,
+ "step": 6050
+ },
+ {
+ "epoch": 2.8609929078014185,
+ "grad_norm": 2.6798126697540283,
+ "learning_rate": 2.7143330430690113e-06,
+ "loss": 0.4379,
+ "step": 6051
+ },
+ {
+ "epoch": 2.8614657210401893,
+ "grad_norm": 2.9531302452087402,
+ "learning_rate": 2.7137115062762344e-06,
+ "loss": 0.4549,
+ "step": 6052
+ },
+ {
+ "epoch": 2.86193853427896,
+ "grad_norm": 2.779531240463257,
+ "learning_rate": 2.7130899561765787e-06,
+ "loss": 0.4037,
+ "step": 6053
+ },
+ {
+ "epoch": 2.8624113475177304,
+ "grad_norm": 2.786763906478882,
+ "learning_rate": 2.7124683928087466e-06,
+ "loss": 0.3986,
+ "step": 6054
+ },
+ {
+ "epoch": 2.862884160756501,
+ "grad_norm": 2.430415630340576,
+ "learning_rate": 2.7118468162114385e-06,
+ "loss": 0.4402,
+ "step": 6055
+ },
+ {
+ "epoch": 2.863356973995272,
+ "grad_norm": 3.027268409729004,
+ "learning_rate": 2.7112252264233596e-06,
+ "loss": 0.4737,
+ "step": 6056
+ },
+ {
+ "epoch": 2.8638297872340424,
+ "grad_norm": 3.024935483932495,
+ "learning_rate": 2.710603623483211e-06,
+ "loss": 0.3997,
+ "step": 6057
+ },
+ {
+ "epoch": 2.864302600472813,
+ "grad_norm": 2.8862195014953613,
+ "learning_rate": 2.7099820074296985e-06,
+ "loss": 0.4896,
+ "step": 6058
+ },
+ {
+ "epoch": 2.864775413711584,
+ "grad_norm": 2.595579147338867,
+ "learning_rate": 2.709360378301527e-06,
+ "loss": 0.4387,
+ "step": 6059
+ },
+ {
+ "epoch": 2.8652482269503547,
+ "grad_norm": 2.8046188354492188,
+ "learning_rate": 2.708738736137403e-06,
+ "loss": 0.4726,
+ "step": 6060
+ },
+ {
+ "epoch": 2.8657210401891255,
+ "grad_norm": 3.040304660797119,
+ "learning_rate": 2.708117080976033e-06,
+ "loss": 0.4642,
+ "step": 6061
+ },
+ {
+ "epoch": 2.866193853427896,
+ "grad_norm": 2.618128538131714,
+ "learning_rate": 2.7074954128561248e-06,
+ "loss": 0.3171,
+ "step": 6062
+ },
+ {
+ "epoch": 2.8666666666666667,
+ "grad_norm": 2.7966055870056152,
+ "learning_rate": 2.706873731816387e-06,
+ "loss": 0.4893,
+ "step": 6063
+ },
+ {
+ "epoch": 2.8671394799054375,
+ "grad_norm": 2.9198038578033447,
+ "learning_rate": 2.706252037895529e-06,
+ "loss": 0.4428,
+ "step": 6064
+ },
+ {
+ "epoch": 2.867612293144208,
+ "grad_norm": 2.417705774307251,
+ "learning_rate": 2.7056303311322617e-06,
+ "loss": 0.3704,
+ "step": 6065
+ },
+ {
+ "epoch": 2.8680851063829786,
+ "grad_norm": 3.143918752670288,
+ "learning_rate": 2.7050086115652953e-06,
+ "loss": 0.5247,
+ "step": 6066
+ },
+ {
+ "epoch": 2.8685579196217494,
+ "grad_norm": 2.620781183242798,
+ "learning_rate": 2.704386879233341e-06,
+ "loss": 0.4131,
+ "step": 6067
+ },
+ {
+ "epoch": 2.8690307328605202,
+ "grad_norm": 2.6929845809936523,
+ "learning_rate": 2.703765134175112e-06,
+ "loss": 0.4833,
+ "step": 6068
+ },
+ {
+ "epoch": 2.869503546099291,
+ "grad_norm": 2.695920944213867,
+ "learning_rate": 2.7031433764293214e-06,
+ "loss": 0.435,
+ "step": 6069
+ },
+ {
+ "epoch": 2.8699763593380614,
+ "grad_norm": 2.6184475421905518,
+ "learning_rate": 2.702521606034684e-06,
+ "loss": 0.3898,
+ "step": 6070
+ },
+ {
+ "epoch": 2.870449172576832,
+ "grad_norm": 3.130624532699585,
+ "learning_rate": 2.7018998230299136e-06,
+ "loss": 0.4934,
+ "step": 6071
+ },
+ {
+ "epoch": 2.870921985815603,
+ "grad_norm": 2.947936534881592,
+ "learning_rate": 2.701278027453727e-06,
+ "loss": 0.4167,
+ "step": 6072
+ },
+ {
+ "epoch": 2.8713947990543733,
+ "grad_norm": 2.389263391494751,
+ "learning_rate": 2.7006562193448406e-06,
+ "loss": 0.3854,
+ "step": 6073
+ },
+ {
+ "epoch": 2.871867612293144,
+ "grad_norm": 2.9040684700012207,
+ "learning_rate": 2.700034398741971e-06,
+ "loss": 0.4656,
+ "step": 6074
+ },
+ {
+ "epoch": 2.872340425531915,
+ "grad_norm": 2.8671910762786865,
+ "learning_rate": 2.6994125656838365e-06,
+ "loss": 0.4642,
+ "step": 6075
+ },
+ {
+ "epoch": 2.8728132387706857,
+ "grad_norm": 2.6957180500030518,
+ "learning_rate": 2.698790720209156e-06,
+ "loss": 0.4894,
+ "step": 6076
+ },
+ {
+ "epoch": 2.8732860520094565,
+ "grad_norm": 2.748342514038086,
+ "learning_rate": 2.698168862356648e-06,
+ "loss": 0.4552,
+ "step": 6077
+ },
+ {
+ "epoch": 2.873758865248227,
+ "grad_norm": 2.7459912300109863,
+ "learning_rate": 2.6975469921650344e-06,
+ "loss": 0.4244,
+ "step": 6078
+ },
+ {
+ "epoch": 2.8742316784869977,
+ "grad_norm": 2.515650987625122,
+ "learning_rate": 2.6969251096730366e-06,
+ "loss": 0.4178,
+ "step": 6079
+ },
+ {
+ "epoch": 2.8747044917257685,
+ "grad_norm": 2.747373342514038,
+ "learning_rate": 2.696303214919375e-06,
+ "loss": 0.4623,
+ "step": 6080
+ },
+ {
+ "epoch": 2.875177304964539,
+ "grad_norm": 2.72092604637146,
+ "learning_rate": 2.695681307942773e-06,
+ "loss": 0.4227,
+ "step": 6081
+ },
+ {
+ "epoch": 2.8756501182033096,
+ "grad_norm": 2.6925108432769775,
+ "learning_rate": 2.695059388781955e-06,
+ "loss": 0.3807,
+ "step": 6082
+ },
+ {
+ "epoch": 2.8761229314420804,
+ "grad_norm": 2.673546314239502,
+ "learning_rate": 2.6944374574756427e-06,
+ "loss": 0.424,
+ "step": 6083
+ },
+ {
+ "epoch": 2.876595744680851,
+ "grad_norm": 2.7018187046051025,
+ "learning_rate": 2.6938155140625636e-06,
+ "loss": 0.4367,
+ "step": 6084
+ },
+ {
+ "epoch": 2.877068557919622,
+ "grad_norm": 2.9420957565307617,
+ "learning_rate": 2.6931935585814416e-06,
+ "loss": 0.4223,
+ "step": 6085
+ },
+ {
+ "epoch": 2.8775413711583924,
+ "grad_norm": 2.6523385047912598,
+ "learning_rate": 2.6925715910710036e-06,
+ "loss": 0.4074,
+ "step": 6086
+ },
+ {
+ "epoch": 2.878014184397163,
+ "grad_norm": 2.6104063987731934,
+ "learning_rate": 2.691949611569978e-06,
+ "loss": 0.423,
+ "step": 6087
+ },
+ {
+ "epoch": 2.878486997635934,
+ "grad_norm": 2.6463685035705566,
+ "learning_rate": 2.691327620117091e-06,
+ "loss": 0.4354,
+ "step": 6088
+ },
+ {
+ "epoch": 2.8789598108747043,
+ "grad_norm": 2.5863583087921143,
+ "learning_rate": 2.6907056167510725e-06,
+ "loss": 0.4177,
+ "step": 6089
+ },
+ {
+ "epoch": 2.879432624113475,
+ "grad_norm": 2.6946942806243896,
+ "learning_rate": 2.690083601510651e-06,
+ "loss": 0.4176,
+ "step": 6090
+ },
+ {
+ "epoch": 2.879905437352246,
+ "grad_norm": 3.0649454593658447,
+ "learning_rate": 2.6894615744345575e-06,
+ "loss": 0.4827,
+ "step": 6091
+ },
+ {
+ "epoch": 2.8803782505910167,
+ "grad_norm": 2.6454906463623047,
+ "learning_rate": 2.6888395355615226e-06,
+ "loss": 0.4757,
+ "step": 6092
+ },
+ {
+ "epoch": 2.8808510638297875,
+ "grad_norm": 3.251805067062378,
+ "learning_rate": 2.688217484930278e-06,
+ "loss": 0.5651,
+ "step": 6093
+ },
+ {
+ "epoch": 2.881323877068558,
+ "grad_norm": 2.543999433517456,
+ "learning_rate": 2.687595422579555e-06,
+ "loss": 0.4196,
+ "step": 6094
+ },
+ {
+ "epoch": 2.8817966903073287,
+ "grad_norm": 3.1502909660339355,
+ "learning_rate": 2.686973348548088e-06,
+ "loss": 0.4376,
+ "step": 6095
+ },
+ {
+ "epoch": 2.8822695035460995,
+ "grad_norm": 2.7800376415252686,
+ "learning_rate": 2.686351262874611e-06,
+ "loss": 0.444,
+ "step": 6096
+ },
+ {
+ "epoch": 2.88274231678487,
+ "grad_norm": 3.1529603004455566,
+ "learning_rate": 2.685729165597858e-06,
+ "loss": 0.5137,
+ "step": 6097
+ },
+ {
+ "epoch": 2.8832151300236406,
+ "grad_norm": 2.6079602241516113,
+ "learning_rate": 2.685107056756564e-06,
+ "loss": 0.4213,
+ "step": 6098
+ },
+ {
+ "epoch": 2.8836879432624114,
+ "grad_norm": 2.8969249725341797,
+ "learning_rate": 2.6844849363894648e-06,
+ "loss": 0.4679,
+ "step": 6099
+ },
+ {
+ "epoch": 2.884160756501182,
+ "grad_norm": 2.5882437229156494,
+ "learning_rate": 2.6838628045352977e-06,
+ "loss": 0.3891,
+ "step": 6100
+ },
+ {
+ "epoch": 2.8846335697399526,
+ "grad_norm": 2.9458062648773193,
+ "learning_rate": 2.6832406612328007e-06,
+ "loss": 0.4802,
+ "step": 6101
+ },
+ {
+ "epoch": 2.8851063829787233,
+ "grad_norm": 2.8463058471679688,
+ "learning_rate": 2.6826185065207105e-06,
+ "loss": 0.4332,
+ "step": 6102
+ },
+ {
+ "epoch": 2.885579196217494,
+ "grad_norm": 2.8799285888671875,
+ "learning_rate": 2.6819963404377667e-06,
+ "loss": 0.4474,
+ "step": 6103
+ },
+ {
+ "epoch": 2.8860520094562645,
+ "grad_norm": 2.846860408782959,
+ "learning_rate": 2.681374163022709e-06,
+ "loss": 0.4317,
+ "step": 6104
+ },
+ {
+ "epoch": 2.8865248226950353,
+ "grad_norm": 2.7918877601623535,
+ "learning_rate": 2.6807519743142775e-06,
+ "loss": 0.4243,
+ "step": 6105
+ },
+ {
+ "epoch": 2.886997635933806,
+ "grad_norm": 2.9351487159729004,
+ "learning_rate": 2.6801297743512127e-06,
+ "loss": 0.5253,
+ "step": 6106
+ },
+ {
+ "epoch": 2.887470449172577,
+ "grad_norm": 2.9422426223754883,
+ "learning_rate": 2.6795075631722576e-06,
+ "loss": 0.4887,
+ "step": 6107
+ },
+ {
+ "epoch": 2.8879432624113477,
+ "grad_norm": 2.6837220191955566,
+ "learning_rate": 2.678885340816153e-06,
+ "loss": 0.4761,
+ "step": 6108
+ },
+ {
+ "epoch": 2.888416075650118,
+ "grad_norm": 2.6800777912139893,
+ "learning_rate": 2.6782631073216425e-06,
+ "loss": 0.4248,
+ "step": 6109
+ },
+ {
+ "epoch": 2.888888888888889,
+ "grad_norm": 2.9654436111450195,
+ "learning_rate": 2.6776408627274702e-06,
+ "loss": 0.487,
+ "step": 6110
+ },
+ {
+ "epoch": 2.8893617021276596,
+ "grad_norm": 2.7725181579589844,
+ "learning_rate": 2.6770186070723804e-06,
+ "loss": 0.4166,
+ "step": 6111
+ },
+ {
+ "epoch": 2.88983451536643,
+ "grad_norm": 2.6547815799713135,
+ "learning_rate": 2.676396340395118e-06,
+ "loss": 0.4039,
+ "step": 6112
+ },
+ {
+ "epoch": 2.890307328605201,
+ "grad_norm": 2.690997838973999,
+ "learning_rate": 2.6757740627344292e-06,
+ "loss": 0.4639,
+ "step": 6113
+ },
+ {
+ "epoch": 2.8907801418439716,
+ "grad_norm": 2.4693069458007812,
+ "learning_rate": 2.67515177412906e-06,
+ "loss": 0.4052,
+ "step": 6114
+ },
+ {
+ "epoch": 2.8912529550827424,
+ "grad_norm": 2.7137033939361572,
+ "learning_rate": 2.6745294746177576e-06,
+ "loss": 0.4442,
+ "step": 6115
+ },
+ {
+ "epoch": 2.891725768321513,
+ "grad_norm": 3.7417004108428955,
+ "learning_rate": 2.6739071642392712e-06,
+ "loss": 0.4809,
+ "step": 6116
+ },
+ {
+ "epoch": 2.8921985815602835,
+ "grad_norm": 2.707094669342041,
+ "learning_rate": 2.673284843032347e-06,
+ "loss": 0.411,
+ "step": 6117
+ },
+ {
+ "epoch": 2.8926713947990543,
+ "grad_norm": 2.7864158153533936,
+ "learning_rate": 2.672662511035736e-06,
+ "loss": 0.4939,
+ "step": 6118
+ },
+ {
+ "epoch": 2.893144208037825,
+ "grad_norm": 2.8753504753112793,
+ "learning_rate": 2.672040168288187e-06,
+ "loss": 0.4396,
+ "step": 6119
+ },
+ {
+ "epoch": 2.8936170212765955,
+ "grad_norm": 2.7581071853637695,
+ "learning_rate": 2.6714178148284516e-06,
+ "loss": 0.427,
+ "step": 6120
+ },
+ {
+ "epoch": 2.8940898345153663,
+ "grad_norm": 2.9754791259765625,
+ "learning_rate": 2.6707954506952803e-06,
+ "loss": 0.4255,
+ "step": 6121
+ },
+ {
+ "epoch": 2.894562647754137,
+ "grad_norm": 2.876939296722412,
+ "learning_rate": 2.670173075927426e-06,
+ "loss": 0.4699,
+ "step": 6122
+ },
+ {
+ "epoch": 2.895035460992908,
+ "grad_norm": 2.4875400066375732,
+ "learning_rate": 2.6695506905636397e-06,
+ "loss": 0.3568,
+ "step": 6123
+ },
+ {
+ "epoch": 2.8955082742316787,
+ "grad_norm": 2.703606128692627,
+ "learning_rate": 2.668928294642675e-06,
+ "loss": 0.3646,
+ "step": 6124
+ },
+ {
+ "epoch": 2.895981087470449,
+ "grad_norm": 2.8618338108062744,
+ "learning_rate": 2.6683058882032868e-06,
+ "loss": 0.378,
+ "step": 6125
+ },
+ {
+ "epoch": 2.89645390070922,
+ "grad_norm": 2.9756760597229004,
+ "learning_rate": 2.667683471284229e-06,
+ "loss": 0.4348,
+ "step": 6126
+ },
+ {
+ "epoch": 2.8969267139479906,
+ "grad_norm": 2.7861104011535645,
+ "learning_rate": 2.667061043924256e-06,
+ "loss": 0.4435,
+ "step": 6127
+ },
+ {
+ "epoch": 2.897399527186761,
+ "grad_norm": 2.7932238578796387,
+ "learning_rate": 2.6664386061621243e-06,
+ "loss": 0.4824,
+ "step": 6128
+ },
+ {
+ "epoch": 2.8978723404255318,
+ "grad_norm": 2.85483455657959,
+ "learning_rate": 2.6658161580365917e-06,
+ "loss": 0.4925,
+ "step": 6129
+ },
+ {
+ "epoch": 2.8983451536643026,
+ "grad_norm": 2.4242141246795654,
+ "learning_rate": 2.6651936995864136e-06,
+ "loss": 0.3466,
+ "step": 6130
+ },
+ {
+ "epoch": 2.8988179669030734,
+ "grad_norm": 3.385214328765869,
+ "learning_rate": 2.6645712308503473e-06,
+ "loss": 0.4751,
+ "step": 6131
+ },
+ {
+ "epoch": 2.899290780141844,
+ "grad_norm": 2.7109622955322266,
+ "learning_rate": 2.6639487518671525e-06,
+ "loss": 0.4469,
+ "step": 6132
+ },
+ {
+ "epoch": 2.8997635933806145,
+ "grad_norm": 2.6537814140319824,
+ "learning_rate": 2.6633262626755877e-06,
+ "loss": 0.4678,
+ "step": 6133
+ },
+ {
+ "epoch": 2.9002364066193853,
+ "grad_norm": 2.5992231369018555,
+ "learning_rate": 2.6627037633144124e-06,
+ "loss": 0.4206,
+ "step": 6134
+ },
+ {
+ "epoch": 2.900709219858156,
+ "grad_norm": 2.988940954208374,
+ "learning_rate": 2.6620812538223885e-06,
+ "loss": 0.4554,
+ "step": 6135
+ },
+ {
+ "epoch": 2.9011820330969265,
+ "grad_norm": 3.0678138732910156,
+ "learning_rate": 2.661458734238274e-06,
+ "loss": 0.4671,
+ "step": 6136
+ },
+ {
+ "epoch": 2.9016548463356973,
+ "grad_norm": 2.6902482509613037,
+ "learning_rate": 2.6608362046008335e-06,
+ "loss": 0.372,
+ "step": 6137
+ },
+ {
+ "epoch": 2.902127659574468,
+ "grad_norm": 3.031597375869751,
+ "learning_rate": 2.660213664948827e-06,
+ "loss": 0.4424,
+ "step": 6138
+ },
+ {
+ "epoch": 2.902600472813239,
+ "grad_norm": 2.8376755714416504,
+ "learning_rate": 2.6595911153210187e-06,
+ "loss": 0.4599,
+ "step": 6139
+ },
+ {
+ "epoch": 2.9030732860520096,
+ "grad_norm": 3.3164854049682617,
+ "learning_rate": 2.6589685557561707e-06,
+ "loss": 0.3897,
+ "step": 6140
+ },
+ {
+ "epoch": 2.90354609929078,
+ "grad_norm": 2.9535014629364014,
+ "learning_rate": 2.658345986293048e-06,
+ "loss": 0.4957,
+ "step": 6141
+ },
+ {
+ "epoch": 2.904018912529551,
+ "grad_norm": 2.821276903152466,
+ "learning_rate": 2.657723406970415e-06,
+ "loss": 0.4453,
+ "step": 6142
+ },
+ {
+ "epoch": 2.9044917257683216,
+ "grad_norm": 2.7314651012420654,
+ "learning_rate": 2.657100817827037e-06,
+ "loss": 0.4406,
+ "step": 6143
+ },
+ {
+ "epoch": 2.904964539007092,
+ "grad_norm": 2.9509520530700684,
+ "learning_rate": 2.6564782189016804e-06,
+ "loss": 0.4629,
+ "step": 6144
+ },
+ {
+ "epoch": 2.9054373522458627,
+ "grad_norm": 2.6234960556030273,
+ "learning_rate": 2.655855610233111e-06,
+ "loss": 0.4306,
+ "step": 6145
+ },
+ {
+ "epoch": 2.9059101654846335,
+ "grad_norm": 2.7209644317626953,
+ "learning_rate": 2.6552329918600962e-06,
+ "loss": 0.3643,
+ "step": 6146
+ },
+ {
+ "epoch": 2.9063829787234043,
+ "grad_norm": 2.9797747135162354,
+ "learning_rate": 2.654610363821404e-06,
+ "loss": 0.4616,
+ "step": 6147
+ },
+ {
+ "epoch": 2.906855791962175,
+ "grad_norm": 2.8179666996002197,
+ "learning_rate": 2.6539877261558016e-06,
+ "loss": 0.4526,
+ "step": 6148
+ },
+ {
+ "epoch": 2.9073286052009455,
+ "grad_norm": 2.7492244243621826,
+ "learning_rate": 2.653365078902059e-06,
+ "loss": 0.4862,
+ "step": 6149
+ },
+ {
+ "epoch": 2.9078014184397163,
+ "grad_norm": 3.0262451171875,
+ "learning_rate": 2.6527424220989457e-06,
+ "loss": 0.3728,
+ "step": 6150
+ },
+ {
+ "epoch": 2.908274231678487,
+ "grad_norm": 2.8092808723449707,
+ "learning_rate": 2.6521197557852315e-06,
+ "loss": 0.4668,
+ "step": 6151
+ },
+ {
+ "epoch": 2.9087470449172574,
+ "grad_norm": 2.915719985961914,
+ "learning_rate": 2.651497079999687e-06,
+ "loss": 0.5124,
+ "step": 6152
+ },
+ {
+ "epoch": 2.9092198581560282,
+ "grad_norm": 2.9794204235076904,
+ "learning_rate": 2.6508743947810834e-06,
+ "loss": 0.5207,
+ "step": 6153
+ },
+ {
+ "epoch": 2.909692671394799,
+ "grad_norm": 2.882453680038452,
+ "learning_rate": 2.650251700168193e-06,
+ "loss": 0.4382,
+ "step": 6154
+ },
+ {
+ "epoch": 2.91016548463357,
+ "grad_norm": 3.183680534362793,
+ "learning_rate": 2.6496289961997886e-06,
+ "loss": 0.5134,
+ "step": 6155
+ },
+ {
+ "epoch": 2.9106382978723406,
+ "grad_norm": 2.9374759197235107,
+ "learning_rate": 2.649006282914642e-06,
+ "loss": 0.4748,
+ "step": 6156
+ },
+ {
+ "epoch": 2.911111111111111,
+ "grad_norm": 2.8096041679382324,
+ "learning_rate": 2.648383560351527e-06,
+ "loss": 0.4672,
+ "step": 6157
+ },
+ {
+ "epoch": 2.911583924349882,
+ "grad_norm": 2.8799238204956055,
+ "learning_rate": 2.6477608285492196e-06,
+ "loss": 0.4679,
+ "step": 6158
+ },
+ {
+ "epoch": 2.9120567375886526,
+ "grad_norm": 2.689310073852539,
+ "learning_rate": 2.6471380875464923e-06,
+ "loss": 0.4069,
+ "step": 6159
+ },
+ {
+ "epoch": 2.912529550827423,
+ "grad_norm": 2.909323215484619,
+ "learning_rate": 2.6465153373821216e-06,
+ "loss": 0.4463,
+ "step": 6160
+ },
+ {
+ "epoch": 2.9130023640661937,
+ "grad_norm": 2.797724962234497,
+ "learning_rate": 2.6458925780948845e-06,
+ "loss": 0.4269,
+ "step": 6161
+ },
+ {
+ "epoch": 2.9134751773049645,
+ "grad_norm": 2.7533204555511475,
+ "learning_rate": 2.645269809723556e-06,
+ "loss": 0.453,
+ "step": 6162
+ },
+ {
+ "epoch": 2.9139479905437353,
+ "grad_norm": 2.6615989208221436,
+ "learning_rate": 2.6446470323069122e-06,
+ "loss": 0.3921,
+ "step": 6163
+ },
+ {
+ "epoch": 2.914420803782506,
+ "grad_norm": 3.0493314266204834,
+ "learning_rate": 2.644024245883733e-06,
+ "loss": 0.4779,
+ "step": 6164
+ },
+ {
+ "epoch": 2.9148936170212765,
+ "grad_norm": 2.649845600128174,
+ "learning_rate": 2.643401450492795e-06,
+ "loss": 0.454,
+ "step": 6165
+ },
+ {
+ "epoch": 2.9153664302600473,
+ "grad_norm": 2.7931838035583496,
+ "learning_rate": 2.642778646172877e-06,
+ "loss": 0.504,
+ "step": 6166
+ },
+ {
+ "epoch": 2.915839243498818,
+ "grad_norm": 2.9518136978149414,
+ "learning_rate": 2.64215583296276e-06,
+ "loss": 0.4767,
+ "step": 6167
+ },
+ {
+ "epoch": 2.9163120567375884,
+ "grad_norm": 2.6047427654266357,
+ "learning_rate": 2.6415330109012216e-06,
+ "loss": 0.4316,
+ "step": 6168
+ },
+ {
+ "epoch": 2.916784869976359,
+ "grad_norm": 2.7732112407684326,
+ "learning_rate": 2.640910180027044e-06,
+ "loss": 0.4213,
+ "step": 6169
+ },
+ {
+ "epoch": 2.91725768321513,
+ "grad_norm": 3.1157236099243164,
+ "learning_rate": 2.6402873403790068e-06,
+ "loss": 0.4559,
+ "step": 6170
+ },
+ {
+ "epoch": 2.917730496453901,
+ "grad_norm": 2.68424129486084,
+ "learning_rate": 2.6396644919958917e-06,
+ "loss": 0.3456,
+ "step": 6171
+ },
+ {
+ "epoch": 2.9182033096926716,
+ "grad_norm": 3.1093270778656006,
+ "learning_rate": 2.639041634916482e-06,
+ "loss": 0.4172,
+ "step": 6172
+ },
+ {
+ "epoch": 2.918676122931442,
+ "grad_norm": 2.9844655990600586,
+ "learning_rate": 2.6384187691795594e-06,
+ "loss": 0.4844,
+ "step": 6173
+ },
+ {
+ "epoch": 2.9191489361702128,
+ "grad_norm": 2.907151222229004,
+ "learning_rate": 2.637795894823906e-06,
+ "loss": 0.5126,
+ "step": 6174
+ },
+ {
+ "epoch": 2.9196217494089836,
+ "grad_norm": 2.804105520248413,
+ "learning_rate": 2.637173011888307e-06,
+ "loss": 0.3919,
+ "step": 6175
+ },
+ {
+ "epoch": 2.920094562647754,
+ "grad_norm": 2.8809266090393066,
+ "learning_rate": 2.636550120411547e-06,
+ "loss": 0.4468,
+ "step": 6176
+ },
+ {
+ "epoch": 2.9205673758865247,
+ "grad_norm": 2.686290979385376,
+ "learning_rate": 2.6359272204324087e-06,
+ "loss": 0.4352,
+ "step": 6177
+ },
+ {
+ "epoch": 2.9210401891252955,
+ "grad_norm": 2.448101758956909,
+ "learning_rate": 2.635304311989678e-06,
+ "loss": 0.4218,
+ "step": 6178
+ },
+ {
+ "epoch": 2.9215130023640663,
+ "grad_norm": 2.81024169921875,
+ "learning_rate": 2.6346813951221416e-06,
+ "loss": 0.5177,
+ "step": 6179
+ },
+ {
+ "epoch": 2.921985815602837,
+ "grad_norm": 2.7590086460113525,
+ "learning_rate": 2.6340584698685856e-06,
+ "loss": 0.3897,
+ "step": 6180
+ },
+ {
+ "epoch": 2.9224586288416075,
+ "grad_norm": 3.1226227283477783,
+ "learning_rate": 2.6334355362677965e-06,
+ "loss": 0.4595,
+ "step": 6181
+ },
+ {
+ "epoch": 2.9229314420803783,
+ "grad_norm": 2.673828125,
+ "learning_rate": 2.6328125943585607e-06,
+ "loss": 0.4932,
+ "step": 6182
+ },
+ {
+ "epoch": 2.923404255319149,
+ "grad_norm": 2.8297293186187744,
+ "learning_rate": 2.632189644179668e-06,
+ "loss": 0.3819,
+ "step": 6183
+ },
+ {
+ "epoch": 2.9238770685579194,
+ "grad_norm": 2.9661548137664795,
+ "learning_rate": 2.6315666857699056e-06,
+ "loss": 0.4419,
+ "step": 6184
+ },
+ {
+ "epoch": 2.92434988179669,
+ "grad_norm": 2.9745798110961914,
+ "learning_rate": 2.6309437191680627e-06,
+ "loss": 0.4423,
+ "step": 6185
+ },
+ {
+ "epoch": 2.924822695035461,
+ "grad_norm": 2.8351712226867676,
+ "learning_rate": 2.6303207444129285e-06,
+ "loss": 0.5043,
+ "step": 6186
+ },
+ {
+ "epoch": 2.925295508274232,
+ "grad_norm": 2.6442384719848633,
+ "learning_rate": 2.6296977615432927e-06,
+ "loss": 0.4431,
+ "step": 6187
+ },
+ {
+ "epoch": 2.9257683215130026,
+ "grad_norm": 2.4128029346466064,
+ "learning_rate": 2.6290747705979457e-06,
+ "loss": 0.3603,
+ "step": 6188
+ },
+ {
+ "epoch": 2.926241134751773,
+ "grad_norm": 2.730424642562866,
+ "learning_rate": 2.6284517716156786e-06,
+ "loss": 0.439,
+ "step": 6189
+ },
+ {
+ "epoch": 2.9267139479905437,
+ "grad_norm": 2.6215405464172363,
+ "learning_rate": 2.627828764635284e-06,
+ "loss": 0.4117,
+ "step": 6190
+ },
+ {
+ "epoch": 2.9271867612293145,
+ "grad_norm": 2.56585955619812,
+ "learning_rate": 2.627205749695552e-06,
+ "loss": 0.4404,
+ "step": 6191
+ },
+ {
+ "epoch": 2.927659574468085,
+ "grad_norm": 2.9587886333465576,
+ "learning_rate": 2.6265827268352763e-06,
+ "loss": 0.4295,
+ "step": 6192
+ },
+ {
+ "epoch": 2.9281323877068557,
+ "grad_norm": 2.6611828804016113,
+ "learning_rate": 2.625959696093249e-06,
+ "loss": 0.4441,
+ "step": 6193
+ },
+ {
+ "epoch": 2.9286052009456265,
+ "grad_norm": 2.4391369819641113,
+ "learning_rate": 2.6253366575082634e-06,
+ "loss": 0.4447,
+ "step": 6194
+ },
+ {
+ "epoch": 2.9290780141843973,
+ "grad_norm": 2.710763454437256,
+ "learning_rate": 2.6247136111191144e-06,
+ "loss": 0.4662,
+ "step": 6195
+ },
+ {
+ "epoch": 2.929550827423168,
+ "grad_norm": 2.770697593688965,
+ "learning_rate": 2.6240905569645952e-06,
+ "loss": 0.4263,
+ "step": 6196
+ },
+ {
+ "epoch": 2.9300236406619384,
+ "grad_norm": 2.5885732173919678,
+ "learning_rate": 2.623467495083501e-06,
+ "loss": 0.4303,
+ "step": 6197
+ },
+ {
+ "epoch": 2.9304964539007092,
+ "grad_norm": 2.5716748237609863,
+ "learning_rate": 2.6228444255146274e-06,
+ "loss": 0.3714,
+ "step": 6198
+ },
+ {
+ "epoch": 2.93096926713948,
+ "grad_norm": 3.0437910556793213,
+ "learning_rate": 2.6222213482967703e-06,
+ "loss": 0.4077,
+ "step": 6199
+ },
+ {
+ "epoch": 2.9314420803782504,
+ "grad_norm": 2.7861344814300537,
+ "learning_rate": 2.6215982634687253e-06,
+ "loss": 0.4157,
+ "step": 6200
+ },
+ {
+ "epoch": 2.931914893617021,
+ "grad_norm": 2.5265355110168457,
+ "learning_rate": 2.6209751710692905e-06,
+ "loss": 0.4586,
+ "step": 6201
+ },
+ {
+ "epoch": 2.932387706855792,
+ "grad_norm": 2.940112590789795,
+ "learning_rate": 2.6203520711372615e-06,
+ "loss": 0.4208,
+ "step": 6202
+ },
+ {
+ "epoch": 2.9328605200945628,
+ "grad_norm": 2.7124581336975098,
+ "learning_rate": 2.6197289637114363e-06,
+ "loss": 0.4173,
+ "step": 6203
+ },
+ {
+ "epoch": 2.9333333333333336,
+ "grad_norm": 2.818523406982422,
+ "learning_rate": 2.619105848830615e-06,
+ "loss": 0.4349,
+ "step": 6204
+ },
+ {
+ "epoch": 2.933806146572104,
+ "grad_norm": 2.7630393505096436,
+ "learning_rate": 2.6184827265335937e-06,
+ "loss": 0.5078,
+ "step": 6205
+ },
+ {
+ "epoch": 2.9342789598108747,
+ "grad_norm": 3.0554699897766113,
+ "learning_rate": 2.6178595968591726e-06,
+ "loss": 0.4712,
+ "step": 6206
+ },
+ {
+ "epoch": 2.9347517730496455,
+ "grad_norm": 2.721992254257202,
+ "learning_rate": 2.6172364598461507e-06,
+ "loss": 0.4847,
+ "step": 6207
+ },
+ {
+ "epoch": 2.935224586288416,
+ "grad_norm": 2.809663772583008,
+ "learning_rate": 2.6166133155333303e-06,
+ "loss": 0.4447,
+ "step": 6208
+ },
+ {
+ "epoch": 2.9356973995271867,
+ "grad_norm": 2.568394660949707,
+ "learning_rate": 2.6159901639595088e-06,
+ "loss": 0.4543,
+ "step": 6209
+ },
+ {
+ "epoch": 2.9361702127659575,
+ "grad_norm": 3.3670637607574463,
+ "learning_rate": 2.6153670051634884e-06,
+ "loss": 0.4901,
+ "step": 6210
+ },
+ {
+ "epoch": 2.9366430260047283,
+ "grad_norm": 3.082508087158203,
+ "learning_rate": 2.614743839184071e-06,
+ "loss": 0.4862,
+ "step": 6211
+ },
+ {
+ "epoch": 2.937115839243499,
+ "grad_norm": 2.692139148712158,
+ "learning_rate": 2.6141206660600566e-06,
+ "loss": 0.5199,
+ "step": 6212
+ },
+ {
+ "epoch": 2.9375886524822694,
+ "grad_norm": 3.231433391571045,
+ "learning_rate": 2.6134974858302504e-06,
+ "loss": 0.464,
+ "step": 6213
+ },
+ {
+ "epoch": 2.93806146572104,
+ "grad_norm": 3.224238157272339,
+ "learning_rate": 2.612874298533452e-06,
+ "loss": 0.4507,
+ "step": 6214
+ },
+ {
+ "epoch": 2.938534278959811,
+ "grad_norm": 2.812755584716797,
+ "learning_rate": 2.6122511042084663e-06,
+ "loss": 0.4527,
+ "step": 6215
+ },
+ {
+ "epoch": 2.9390070921985814,
+ "grad_norm": 2.837811231613159,
+ "learning_rate": 2.611627902894098e-06,
+ "loss": 0.4782,
+ "step": 6216
+ },
+ {
+ "epoch": 2.939479905437352,
+ "grad_norm": 3.093817710876465,
+ "learning_rate": 2.6110046946291476e-06,
+ "loss": 0.4933,
+ "step": 6217
+ },
+ {
+ "epoch": 2.939952718676123,
+ "grad_norm": 2.950119733810425,
+ "learning_rate": 2.6103814794524235e-06,
+ "loss": 0.4884,
+ "step": 6218
+ },
+ {
+ "epoch": 2.9404255319148938,
+ "grad_norm": 2.469681978225708,
+ "learning_rate": 2.6097582574027274e-06,
+ "loss": 0.4135,
+ "step": 6219
+ },
+ {
+ "epoch": 2.9408983451536646,
+ "grad_norm": 2.779238224029541,
+ "learning_rate": 2.609135028518866e-06,
+ "loss": 0.5165,
+ "step": 6220
+ },
+ {
+ "epoch": 2.941371158392435,
+ "grad_norm": 2.807705879211426,
+ "learning_rate": 2.608511792839645e-06,
+ "loss": 0.4046,
+ "step": 6221
+ },
+ {
+ "epoch": 2.9418439716312057,
+ "grad_norm": 2.6067750453948975,
+ "learning_rate": 2.607888550403871e-06,
+ "loss": 0.406,
+ "step": 6222
+ },
+ {
+ "epoch": 2.9423167848699765,
+ "grad_norm": 2.865766763687134,
+ "learning_rate": 2.607265301250349e-06,
+ "loss": 0.471,
+ "step": 6223
+ },
+ {
+ "epoch": 2.942789598108747,
+ "grad_norm": 2.977681875228882,
+ "learning_rate": 2.6066420454178876e-06,
+ "loss": 0.4666,
+ "step": 6224
+ },
+ {
+ "epoch": 2.9432624113475176,
+ "grad_norm": 2.870884418487549,
+ "learning_rate": 2.606018782945294e-06,
+ "loss": 0.4768,
+ "step": 6225
+ },
+ {
+ "epoch": 2.9437352245862884,
+ "grad_norm": 2.992851495742798,
+ "learning_rate": 2.6053955138713756e-06,
+ "loss": 0.4657,
+ "step": 6226
+ },
+ {
+ "epoch": 2.9442080378250592,
+ "grad_norm": 2.7279815673828125,
+ "learning_rate": 2.6047722382349406e-06,
+ "loss": 0.4087,
+ "step": 6227
+ },
+ {
+ "epoch": 2.94468085106383,
+ "grad_norm": 2.8587028980255127,
+ "learning_rate": 2.604148956074797e-06,
+ "loss": 0.4452,
+ "step": 6228
+ },
+ {
+ "epoch": 2.9451536643026004,
+ "grad_norm": 3.001694679260254,
+ "learning_rate": 2.6035256674297555e-06,
+ "loss": 0.4852,
+ "step": 6229
+ },
+ {
+ "epoch": 2.945626477541371,
+ "grad_norm": 2.858069896697998,
+ "learning_rate": 2.6029023723386237e-06,
+ "loss": 0.4281,
+ "step": 6230
+ },
+ {
+ "epoch": 2.946099290780142,
+ "grad_norm": 2.675856828689575,
+ "learning_rate": 2.602279070840213e-06,
+ "loss": 0.4545,
+ "step": 6231
+ },
+ {
+ "epoch": 2.9465721040189123,
+ "grad_norm": 2.530245065689087,
+ "learning_rate": 2.6016557629733334e-06,
+ "loss": 0.4619,
+ "step": 6232
+ },
+ {
+ "epoch": 2.947044917257683,
+ "grad_norm": 2.7533743381500244,
+ "learning_rate": 2.601032448776795e-06,
+ "loss": 0.4879,
+ "step": 6233
+ },
+ {
+ "epoch": 2.947517730496454,
+ "grad_norm": 3.130453109741211,
+ "learning_rate": 2.600409128289409e-06,
+ "loss": 0.4056,
+ "step": 6234
+ },
+ {
+ "epoch": 2.9479905437352247,
+ "grad_norm": 3.4736509323120117,
+ "learning_rate": 2.5997858015499867e-06,
+ "loss": 0.5063,
+ "step": 6235
+ },
+ {
+ "epoch": 2.9484633569739955,
+ "grad_norm": 2.871978282928467,
+ "learning_rate": 2.5991624685973406e-06,
+ "loss": 0.4562,
+ "step": 6236
+ },
+ {
+ "epoch": 2.948936170212766,
+ "grad_norm": 2.976503372192383,
+ "learning_rate": 2.5985391294702817e-06,
+ "loss": 0.5079,
+ "step": 6237
+ },
+ {
+ "epoch": 2.9494089834515367,
+ "grad_norm": 2.578122615814209,
+ "learning_rate": 2.597915784207623e-06,
+ "loss": 0.4069,
+ "step": 6238
+ },
+ {
+ "epoch": 2.9498817966903075,
+ "grad_norm": 2.885911226272583,
+ "learning_rate": 2.597292432848178e-06,
+ "loss": 0.4382,
+ "step": 6239
+ },
+ {
+ "epoch": 2.950354609929078,
+ "grad_norm": 2.9301681518554688,
+ "learning_rate": 2.5966690754307605e-06,
+ "loss": 0.4888,
+ "step": 6240
+ },
+ {
+ "epoch": 2.9508274231678486,
+ "grad_norm": 2.9912192821502686,
+ "learning_rate": 2.5960457119941834e-06,
+ "loss": 0.4699,
+ "step": 6241
+ },
+ {
+ "epoch": 2.9513002364066194,
+ "grad_norm": 2.6612601280212402,
+ "learning_rate": 2.5954223425772607e-06,
+ "loss": 0.3736,
+ "step": 6242
+ },
+ {
+ "epoch": 2.9517730496453902,
+ "grad_norm": 2.9325380325317383,
+ "learning_rate": 2.5947989672188067e-06,
+ "loss": 0.4771,
+ "step": 6243
+ },
+ {
+ "epoch": 2.952245862884161,
+ "grad_norm": 2.8143959045410156,
+ "learning_rate": 2.594175585957637e-06,
+ "loss": 0.5103,
+ "step": 6244
+ },
+ {
+ "epoch": 2.9527186761229314,
+ "grad_norm": 2.355078935623169,
+ "learning_rate": 2.5935521988325674e-06,
+ "loss": 0.44,
+ "step": 6245
+ },
+ {
+ "epoch": 2.953191489361702,
+ "grad_norm": 2.733156442642212,
+ "learning_rate": 2.5929288058824114e-06,
+ "loss": 0.4306,
+ "step": 6246
+ },
+ {
+ "epoch": 2.953664302600473,
+ "grad_norm": 3.182563304901123,
+ "learning_rate": 2.5923054071459865e-06,
+ "loss": 0.417,
+ "step": 6247
+ },
+ {
+ "epoch": 2.9541371158392433,
+ "grad_norm": 2.4162323474884033,
+ "learning_rate": 2.5916820026621094e-06,
+ "loss": 0.3802,
+ "step": 6248
+ },
+ {
+ "epoch": 2.954609929078014,
+ "grad_norm": 2.772706985473633,
+ "learning_rate": 2.591058592469595e-06,
+ "loss": 0.4654,
+ "step": 6249
+ },
+ {
+ "epoch": 2.955082742316785,
+ "grad_norm": 2.6011102199554443,
+ "learning_rate": 2.5904351766072616e-06,
+ "loss": 0.4619,
+ "step": 6250
+ },
+ {
+ "epoch": 2.9555555555555557,
+ "grad_norm": 2.5700361728668213,
+ "learning_rate": 2.589811755113926e-06,
+ "loss": 0.3991,
+ "step": 6251
+ },
+ {
+ "epoch": 2.9560283687943265,
+ "grad_norm": 2.6444971561431885,
+ "learning_rate": 2.589188328028407e-06,
+ "loss": 0.4388,
+ "step": 6252
+ },
+ {
+ "epoch": 2.956501182033097,
+ "grad_norm": 2.739567279815674,
+ "learning_rate": 2.588564895389521e-06,
+ "loss": 0.4193,
+ "step": 6253
+ },
+ {
+ "epoch": 2.9569739952718677,
+ "grad_norm": 2.7070045471191406,
+ "learning_rate": 2.5879414572360877e-06,
+ "loss": 0.4347,
+ "step": 6254
+ },
+ {
+ "epoch": 2.9574468085106385,
+ "grad_norm": 2.7811532020568848,
+ "learning_rate": 2.587318013606926e-06,
+ "loss": 0.43,
+ "step": 6255
+ },
+ {
+ "epoch": 2.957919621749409,
+ "grad_norm": 3.0036091804504395,
+ "learning_rate": 2.5866945645408537e-06,
+ "loss": 0.4855,
+ "step": 6256
+ },
+ {
+ "epoch": 2.9583924349881796,
+ "grad_norm": 2.948573112487793,
+ "learning_rate": 2.5860711100766918e-06,
+ "loss": 0.4594,
+ "step": 6257
+ },
+ {
+ "epoch": 2.9588652482269504,
+ "grad_norm": 2.6371593475341797,
+ "learning_rate": 2.5854476502532583e-06,
+ "loss": 0.446,
+ "step": 6258
+ },
+ {
+ "epoch": 2.959338061465721,
+ "grad_norm": 2.668677806854248,
+ "learning_rate": 2.5848241851093754e-06,
+ "loss": 0.3991,
+ "step": 6259
+ },
+ {
+ "epoch": 2.959810874704492,
+ "grad_norm": 3.1640663146972656,
+ "learning_rate": 2.5842007146838614e-06,
+ "loss": 0.5146,
+ "step": 6260
+ },
+ {
+ "epoch": 2.9602836879432624,
+ "grad_norm": 2.9412102699279785,
+ "learning_rate": 2.5835772390155382e-06,
+ "loss": 0.4798,
+ "step": 6261
+ },
+ {
+ "epoch": 2.960756501182033,
+ "grad_norm": 2.7674343585968018,
+ "learning_rate": 2.582953758143227e-06,
+ "loss": 0.4262,
+ "step": 6262
+ },
+ {
+ "epoch": 2.961229314420804,
+ "grad_norm": 3.5219457149505615,
+ "learning_rate": 2.582330272105749e-06,
+ "loss": 0.4905,
+ "step": 6263
+ },
+ {
+ "epoch": 2.9617021276595743,
+ "grad_norm": 2.4274468421936035,
+ "learning_rate": 2.5817067809419267e-06,
+ "loss": 0.4048,
+ "step": 6264
+ },
+ {
+ "epoch": 2.962174940898345,
+ "grad_norm": 2.6907944679260254,
+ "learning_rate": 2.5810832846905814e-06,
+ "loss": 0.388,
+ "step": 6265
+ },
+ {
+ "epoch": 2.962647754137116,
+ "grad_norm": 2.603151321411133,
+ "learning_rate": 2.5804597833905347e-06,
+ "loss": 0.4377,
+ "step": 6266
+ },
+ {
+ "epoch": 2.9631205673758867,
+ "grad_norm": 2.685837507247925,
+ "learning_rate": 2.57983627708061e-06,
+ "loss": 0.4409,
+ "step": 6267
+ },
+ {
+ "epoch": 2.963593380614657,
+ "grad_norm": 2.8281500339508057,
+ "learning_rate": 2.579212765799631e-06,
+ "loss": 0.4567,
+ "step": 6268
+ },
+ {
+ "epoch": 2.964066193853428,
+ "grad_norm": 2.6387875080108643,
+ "learning_rate": 2.57858924958642e-06,
+ "loss": 0.4061,
+ "step": 6269
+ },
+ {
+ "epoch": 2.9645390070921986,
+ "grad_norm": 2.64139986038208,
+ "learning_rate": 2.5779657284798017e-06,
+ "loss": 0.4539,
+ "step": 6270
+ },
+ {
+ "epoch": 2.965011820330969,
+ "grad_norm": 2.7384836673736572,
+ "learning_rate": 2.5773422025185983e-06,
+ "loss": 0.408,
+ "step": 6271
+ },
+ {
+ "epoch": 2.96548463356974,
+ "grad_norm": 2.262514352798462,
+ "learning_rate": 2.576718671741636e-06,
+ "loss": 0.3726,
+ "step": 6272
+ },
+ {
+ "epoch": 2.9659574468085106,
+ "grad_norm": 2.53800106048584,
+ "learning_rate": 2.5760951361877384e-06,
+ "loss": 0.4716,
+ "step": 6273
+ },
+ {
+ "epoch": 2.9664302600472814,
+ "grad_norm": 3.256701707839966,
+ "learning_rate": 2.57547159589573e-06,
+ "loss": 0.518,
+ "step": 6274
+ },
+ {
+ "epoch": 2.966903073286052,
+ "grad_norm": 2.9427342414855957,
+ "learning_rate": 2.574848050904436e-06,
+ "loss": 0.4255,
+ "step": 6275
+ },
+ {
+ "epoch": 2.9673758865248225,
+ "grad_norm": 2.5794098377227783,
+ "learning_rate": 2.574224501252682e-06,
+ "loss": 0.4412,
+ "step": 6276
+ },
+ {
+ "epoch": 2.9678486997635933,
+ "grad_norm": 2.5894877910614014,
+ "learning_rate": 2.573600946979294e-06,
+ "loss": 0.4356,
+ "step": 6277
+ },
+ {
+ "epoch": 2.968321513002364,
+ "grad_norm": 2.9597361087799072,
+ "learning_rate": 2.572977388123098e-06,
+ "loss": 0.4376,
+ "step": 6278
+ },
+ {
+ "epoch": 2.9687943262411345,
+ "grad_norm": 2.779303550720215,
+ "learning_rate": 2.5723538247229197e-06,
+ "loss": 0.3985,
+ "step": 6279
+ },
+ {
+ "epoch": 2.9692671394799053,
+ "grad_norm": 2.9173855781555176,
+ "learning_rate": 2.5717302568175866e-06,
+ "loss": 0.4581,
+ "step": 6280
+ },
+ {
+ "epoch": 2.969739952718676,
+ "grad_norm": 2.703721284866333,
+ "learning_rate": 2.5711066844459242e-06,
+ "loss": 0.3705,
+ "step": 6281
+ },
+ {
+ "epoch": 2.970212765957447,
+ "grad_norm": 2.5415029525756836,
+ "learning_rate": 2.5704831076467613e-06,
+ "loss": 0.4089,
+ "step": 6282
+ },
+ {
+ "epoch": 2.9706855791962177,
+ "grad_norm": 2.791780948638916,
+ "learning_rate": 2.5698595264589234e-06,
+ "loss": 0.4357,
+ "step": 6283
+ },
+ {
+ "epoch": 2.971158392434988,
+ "grad_norm": 2.887662887573242,
+ "learning_rate": 2.5692359409212392e-06,
+ "loss": 0.4093,
+ "step": 6284
+ },
+ {
+ "epoch": 2.971631205673759,
+ "grad_norm": 3.0309557914733887,
+ "learning_rate": 2.5686123510725364e-06,
+ "loss": 0.4461,
+ "step": 6285
+ },
+ {
+ "epoch": 2.9721040189125296,
+ "grad_norm": 2.6861515045166016,
+ "learning_rate": 2.5679887569516437e-06,
+ "loss": 0.4199,
+ "step": 6286
+ },
+ {
+ "epoch": 2.9725768321513,
+ "grad_norm": 2.7014012336730957,
+ "learning_rate": 2.5673651585973897e-06,
+ "loss": 0.4373,
+ "step": 6287
+ },
+ {
+ "epoch": 2.9730496453900708,
+ "grad_norm": 2.951265811920166,
+ "learning_rate": 2.5667415560486026e-06,
+ "loss": 0.4426,
+ "step": 6288
+ },
+ {
+ "epoch": 2.9735224586288416,
+ "grad_norm": 2.7664504051208496,
+ "learning_rate": 2.5661179493441106e-06,
+ "loss": 0.474,
+ "step": 6289
+ },
+ {
+ "epoch": 2.9739952718676124,
+ "grad_norm": 2.6081087589263916,
+ "learning_rate": 2.5654943385227445e-06,
+ "loss": 0.4058,
+ "step": 6290
+ },
+ {
+ "epoch": 2.974468085106383,
+ "grad_norm": 2.9416966438293457,
+ "learning_rate": 2.564870723623333e-06,
+ "loss": 0.506,
+ "step": 6291
+ },
+ {
+ "epoch": 2.9749408983451535,
+ "grad_norm": 2.9441659450531006,
+ "learning_rate": 2.564247104684706e-06,
+ "loss": 0.4505,
+ "step": 6292
+ },
+ {
+ "epoch": 2.9754137115839243,
+ "grad_norm": 2.7110862731933594,
+ "learning_rate": 2.563623481745693e-06,
+ "loss": 0.4493,
+ "step": 6293
+ },
+ {
+ "epoch": 2.975886524822695,
+ "grad_norm": 2.88459849357605,
+ "learning_rate": 2.562999854845125e-06,
+ "loss": 0.4462,
+ "step": 6294
+ },
+ {
+ "epoch": 2.9763593380614655,
+ "grad_norm": 3.0491793155670166,
+ "learning_rate": 2.5623762240218327e-06,
+ "loss": 0.4928,
+ "step": 6295
+ },
+ {
+ "epoch": 2.9768321513002363,
+ "grad_norm": 2.9475483894348145,
+ "learning_rate": 2.561752589314646e-06,
+ "loss": 0.4535,
+ "step": 6296
+ },
+ {
+ "epoch": 2.977304964539007,
+ "grad_norm": 2.879495859146118,
+ "learning_rate": 2.561128950762397e-06,
+ "loss": 0.4393,
+ "step": 6297
+ },
+ {
+ "epoch": 2.977777777777778,
+ "grad_norm": 2.8478336334228516,
+ "learning_rate": 2.560505308403916e-06,
+ "loss": 0.4363,
+ "step": 6298
+ },
+ {
+ "epoch": 2.9782505910165487,
+ "grad_norm": 2.5475094318389893,
+ "learning_rate": 2.5598816622780343e-06,
+ "loss": 0.3825,
+ "step": 6299
+ },
+ {
+ "epoch": 2.978723404255319,
+ "grad_norm": 2.85430908203125,
+ "learning_rate": 2.5592580124235838e-06,
+ "loss": 0.4226,
+ "step": 6300
+ },
+ {
+ "epoch": 2.97919621749409,
+ "grad_norm": 2.569775104522705,
+ "learning_rate": 2.5586343588793975e-06,
+ "loss": 0.4045,
+ "step": 6301
+ },
+ {
+ "epoch": 2.9796690307328606,
+ "grad_norm": 2.4482202529907227,
+ "learning_rate": 2.558010701684307e-06,
+ "loss": 0.4625,
+ "step": 6302
+ },
+ {
+ "epoch": 2.980141843971631,
+ "grad_norm": 2.9301230907440186,
+ "learning_rate": 2.5573870408771436e-06,
+ "loss": 0.4358,
+ "step": 6303
+ },
+ {
+ "epoch": 2.9806146572104018,
+ "grad_norm": 2.9865870475769043,
+ "learning_rate": 2.5567633764967416e-06,
+ "loss": 0.497,
+ "step": 6304
+ },
+ {
+ "epoch": 2.9810874704491725,
+ "grad_norm": 2.523524522781372,
+ "learning_rate": 2.556139708581933e-06,
+ "loss": 0.4141,
+ "step": 6305
+ },
+ {
+ "epoch": 2.9815602836879433,
+ "grad_norm": 2.8489344120025635,
+ "learning_rate": 2.5555160371715504e-06,
+ "loss": 0.4205,
+ "step": 6306
+ },
+ {
+ "epoch": 2.982033096926714,
+ "grad_norm": 2.417759895324707,
+ "learning_rate": 2.5548923623044274e-06,
+ "loss": 0.44,
+ "step": 6307
+ },
+ {
+ "epoch": 2.9825059101654845,
+ "grad_norm": 2.7626900672912598,
+ "learning_rate": 2.554268684019398e-06,
+ "loss": 0.4646,
+ "step": 6308
+ },
+ {
+ "epoch": 2.9829787234042553,
+ "grad_norm": 3.0916266441345215,
+ "learning_rate": 2.5536450023552956e-06,
+ "loss": 0.4443,
+ "step": 6309
+ },
+ {
+ "epoch": 2.983451536643026,
+ "grad_norm": 2.721992015838623,
+ "learning_rate": 2.5530213173509542e-06,
+ "loss": 0.4008,
+ "step": 6310
+ },
+ {
+ "epoch": 2.9839243498817964,
+ "grad_norm": 2.825334072113037,
+ "learning_rate": 2.552397629045208e-06,
+ "loss": 0.4513,
+ "step": 6311
+ },
+ {
+ "epoch": 2.9843971631205672,
+ "grad_norm": 2.912050485610962,
+ "learning_rate": 2.5517739374768915e-06,
+ "loss": 0.4104,
+ "step": 6312
+ },
+ {
+ "epoch": 2.984869976359338,
+ "grad_norm": 2.760650634765625,
+ "learning_rate": 2.551150242684838e-06,
+ "loss": 0.4372,
+ "step": 6313
+ },
+ {
+ "epoch": 2.985342789598109,
+ "grad_norm": 2.8926033973693848,
+ "learning_rate": 2.5505265447078838e-06,
+ "loss": 0.475,
+ "step": 6314
+ },
+ {
+ "epoch": 2.9858156028368796,
+ "grad_norm": 2.6279892921447754,
+ "learning_rate": 2.5499028435848633e-06,
+ "loss": 0.4589,
+ "step": 6315
+ },
+ {
+ "epoch": 2.98628841607565,
+ "grad_norm": 3.2147316932678223,
+ "learning_rate": 2.549279139354611e-06,
+ "loss": 0.4968,
+ "step": 6316
+ },
+ {
+ "epoch": 2.986761229314421,
+ "grad_norm": 2.4510674476623535,
+ "learning_rate": 2.5486554320559626e-06,
+ "loss": 0.4291,
+ "step": 6317
+ },
+ {
+ "epoch": 2.9872340425531916,
+ "grad_norm": 2.6919643878936768,
+ "learning_rate": 2.5480317217277544e-06,
+ "loss": 0.4704,
+ "step": 6318
+ },
+ {
+ "epoch": 2.987706855791962,
+ "grad_norm": 2.9832234382629395,
+ "learning_rate": 2.5474080084088215e-06,
+ "loss": 0.4129,
+ "step": 6319
+ },
+ {
+ "epoch": 2.9881796690307327,
+ "grad_norm": 2.893209218978882,
+ "learning_rate": 2.5467842921380004e-06,
+ "loss": 0.5099,
+ "step": 6320
+ },
+ {
+ "epoch": 2.9886524822695035,
+ "grad_norm": 2.6734580993652344,
+ "learning_rate": 2.5461605729541254e-06,
+ "loss": 0.4588,
+ "step": 6321
+ },
+ {
+ "epoch": 2.9891252955082743,
+ "grad_norm": 2.5591681003570557,
+ "learning_rate": 2.5455368508960343e-06,
+ "loss": 0.4162,
+ "step": 6322
+ },
+ {
+ "epoch": 2.989598108747045,
+ "grad_norm": 3.2619881629943848,
+ "learning_rate": 2.5449131260025626e-06,
+ "loss": 0.4412,
+ "step": 6323
+ },
+ {
+ "epoch": 2.9900709219858155,
+ "grad_norm": 2.897914409637451,
+ "learning_rate": 2.544289398312549e-06,
+ "loss": 0.5079,
+ "step": 6324
+ },
+ {
+ "epoch": 2.9905437352245863,
+ "grad_norm": 2.7891685962677,
+ "learning_rate": 2.5436656678648274e-06,
+ "loss": 0.42,
+ "step": 6325
+ },
+ {
+ "epoch": 2.991016548463357,
+ "grad_norm": 3.022341728210449,
+ "learning_rate": 2.5430419346982367e-06,
+ "loss": 0.4739,
+ "step": 6326
+ },
+ {
+ "epoch": 2.9914893617021274,
+ "grad_norm": 3.395775556564331,
+ "learning_rate": 2.542418198851614e-06,
+ "loss": 0.4822,
+ "step": 6327
+ },
+ {
+ "epoch": 2.9919621749408982,
+ "grad_norm": 3.0200490951538086,
+ "learning_rate": 2.541794460363795e-06,
+ "loss": 0.4755,
+ "step": 6328
+ },
+ {
+ "epoch": 2.992434988179669,
+ "grad_norm": 3.302020311355591,
+ "learning_rate": 2.541170719273619e-06,
+ "loss": 0.4603,
+ "step": 6329
+ },
+ {
+ "epoch": 2.99290780141844,
+ "grad_norm": 2.5985910892486572,
+ "learning_rate": 2.5405469756199226e-06,
+ "loss": 0.4475,
+ "step": 6330
+ },
+ {
+ "epoch": 2.9933806146572106,
+ "grad_norm": 2.9413928985595703,
+ "learning_rate": 2.5399232294415434e-06,
+ "loss": 0.4695,
+ "step": 6331
+ },
+ {
+ "epoch": 2.993853427895981,
+ "grad_norm": 2.942777156829834,
+ "learning_rate": 2.53929948077732e-06,
+ "loss": 0.4462,
+ "step": 6332
+ },
+ {
+ "epoch": 2.9943262411347518,
+ "grad_norm": 2.971120595932007,
+ "learning_rate": 2.53867572966609e-06,
+ "loss": 0.4546,
+ "step": 6333
+ },
+ {
+ "epoch": 2.9947990543735226,
+ "grad_norm": 2.8248138427734375,
+ "learning_rate": 2.5380519761466927e-06,
+ "loss": 0.453,
+ "step": 6334
+ },
+ {
+ "epoch": 2.995271867612293,
+ "grad_norm": 3.0819008350372314,
+ "learning_rate": 2.5374282202579647e-06,
+ "loss": 0.4774,
+ "step": 6335
+ },
+ {
+ "epoch": 2.9957446808510637,
+ "grad_norm": 2.742570161819458,
+ "learning_rate": 2.5368044620387466e-06,
+ "loss": 0.5059,
+ "step": 6336
+ },
+ {
+ "epoch": 2.9962174940898345,
+ "grad_norm": 2.9087419509887695,
+ "learning_rate": 2.5361807015278757e-06,
+ "loss": 0.3606,
+ "step": 6337
+ },
+ {
+ "epoch": 2.9966903073286053,
+ "grad_norm": 2.6887354850769043,
+ "learning_rate": 2.5355569387641908e-06,
+ "loss": 0.4247,
+ "step": 6338
+ },
+ {
+ "epoch": 2.997163120567376,
+ "grad_norm": 2.8516008853912354,
+ "learning_rate": 2.534933173786531e-06,
+ "loss": 0.4502,
+ "step": 6339
+ },
+ {
+ "epoch": 2.9976359338061465,
+ "grad_norm": 2.4463164806365967,
+ "learning_rate": 2.5343094066337366e-06,
+ "loss": 0.3883,
+ "step": 6340
+ },
+ {
+ "epoch": 2.9981087470449173,
+ "grad_norm": 2.87025785446167,
+ "learning_rate": 2.533685637344645e-06,
+ "loss": 0.4534,
+ "step": 6341
+ },
+ {
+ "epoch": 2.998581560283688,
+ "grad_norm": 3.0706169605255127,
+ "learning_rate": 2.5330618659580967e-06,
+ "loss": 0.5426,
+ "step": 6342
+ },
+ {
+ "epoch": 2.9990543735224584,
+ "grad_norm": 2.7185773849487305,
+ "learning_rate": 2.532438092512931e-06,
+ "loss": 0.497,
+ "step": 6343
+ },
+ {
+ "epoch": 2.999527186761229,
+ "grad_norm": 2.840207815170288,
+ "learning_rate": 2.531814317047988e-06,
+ "loss": 0.4073,
+ "step": 6344
+ },
+ {
+ "epoch": 3.0,
+ "grad_norm": 3.1592655181884766,
+ "learning_rate": 2.5311905396021063e-06,
+ "loss": 0.4728,
+ "step": 6345
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 12690,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 6,
+ "save_steps": 2115,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.602707133177004e+19,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-6345/training_args.bin b/checkpoint-6345/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3885e6a27b204c834ff6fbc6f3a5a4c7f69fe2cc
--- /dev/null
+++ b/checkpoint-6345/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:51188d498a3fe1bf9ddf234271786a31e25e89b3aa68721a97f8264e1013c9b6
+size 8056
diff --git a/checkpoint-6345/zero_to_fp32.py b/checkpoint-6345/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8
--- /dev/null
+++ b/checkpoint-6345/zero_to_fp32.py
@@ -0,0 +1,604 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from ZeRO 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# While this script doesn't use DeepSpeed to recover data, the checkpoints are pickled with
+# DeepSpeed data structures, so DeepSpeed has to be available in the current Python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+ buffers: dict()
+ param_shapes: dict()
+ shared_params: list
+ ds_version: int
+ frozen_param_shapes: dict()
+ frozen_param_fragments: dict()
+
+
+debug = 0
+
+# load to cpu
+device = torch.device('cpu')
+
+
+def atoi(text):
+ return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+ '''
+ alist.sort(key=natural_keys) sorts in human order
+ http://nedbatchelder.com/blog/200712/human_sorting.html
+ (See Toothy's implementation in the comments)
+ '''
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+ if not os.path.isdir(checkpoint_dir):
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+ # there should be only one file
+ if zero_stage <= 2:
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+ elif zero_stage == 3:
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+ if not os.path.exists(file):
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+ return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+ # XXX: need to test that this simple glob rule works for multi-node setup too
+ ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+ if len(ckpt_files) == 0:
+ raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+ return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+ zero_model_states = []
+ for file in files:
+ state_dict = torch.load(file, map_location=device)
+
+ if BUFFER_NAMES not in state_dict:
+ raise ValueError(f"{file} is not a model state checkpoint")
+ buffer_names = state_dict[BUFFER_NAMES]
+ if debug:
+ print("Found buffers:", buffer_names)
+
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
+ buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+ param_shapes = state_dict[PARAM_SHAPES]
+
+ # collect parameters that are included in param_shapes
+ param_names = []
+ for s in param_shapes:
+ for name in s.keys():
+ param_names.append(name)
+
+ # update with frozen parameters
+ frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+ if frozen_param_shapes is not None:
+ if debug:
+ print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+ param_names += list(frozen_param_shapes.keys())
+
+ # handle shared params
+ shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+ ds_version = state_dict.get(DS_VERSION, None)
+
+ frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+ z_model_state = zero_model_state(buffers=buffers,
+ param_shapes=param_shapes,
+ shared_params=shared_params,
+ ds_version=ds_version,
+ frozen_param_shapes=frozen_param_shapes,
+ frozen_param_fragments=frozen_param_fragments)
+ zero_model_states.append(z_model_state)
+
+ return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+
+ total_files = len(files)
+ state_dicts = []
+ for f in files:
+ state_dict = torch.load(f, map_location=device)
+ # immediately discard the two potentially huge optimizer states, as we only care about the fp32 master weights
+ # and also handle the case where it was already removed by another helper script
+ state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+ state_dicts.append(state_dict)
+
+ if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
+ # use the max of the partition_count to get the dp world_size.
+
+ if type(world_size) is list:
+ world_size = max(world_size)
+
+ if world_size != total_files:
+ raise ValueError(
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+ )
+
+ # the groups are named differently in each stage
+ if zero_stage <= 2:
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+ elif zero_stage == 3:
+ fp32_groups_key = FP32_FLAT_GROUPS
+ else:
+ raise ValueError(f"unknown zero stage {zero_stage}")
+
+ if zero_stage <= 2:
+ fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+ elif zero_stage == 3:
+ # if there is more than one param group, there will be multiple flattened tensors - one
+ # flattened tensor per group - for simplicity merge them into a single tensor
+ #
+ # XXX: could make the script more memory efficient for when there are multiple groups - it
+ # will require matching the sub-lists of param_shapes for each param group flattened tensor
+
+ fp32_flat_groups = [
+ torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+ ]
+
+ return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+ """
+ Returns fp32 state_dict reconstructed from ds checkpoint
+
+ Args:
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+ """
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+ optim_files = get_optim_files(ds_checkpoint_dir)
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+ print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+ model_files = get_model_state_files(ds_checkpoint_dir)
+
+ zero_model_states = parse_model_states(model_files)
+ print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+ if zero_stage <= 2:
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+ elif zero_stage == 3:
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+ return
+
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+ frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+ if debug:
+ num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+ print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+ wanted_params = len(frozen_param_shapes)
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+ avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+ print(f'Frozen params: Have {avail_numel} numels to process.')
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+ total_params = 0
+ total_numel = 0
+ for name, shape in frozen_param_shapes.items():
+ total_params += 1
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+
+ state_dict[name] = frozen_param_fragments[name]
+
+ if debug:
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+ attr = getattr(obj, fn, None)
+ return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+ param_shapes = zero_model_states[0].param_shapes
+
+ # Reconstruction protocol:
+ #
+ # XXX: document this
+
+ if debug:
+ for i in range(world_size):
+ for j in range(len(fp32_flat_groups[0])):
+ print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+ # XXX: memory usage doubles here (zero2)
+ num_param_groups = len(fp32_flat_groups[0])
+ merged_single_partition_of_fp32_groups = []
+ for i in range(num_param_groups):
+ merged_partitions = [sd[i] for sd in fp32_flat_groups]
+ full_single_fp32_vector = torch.cat(merged_partitions, 0)
+ merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+ avail_numel = sum(
+ [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+ if debug:
+ wanted_params = sum([len(shapes) for shapes in param_shapes])
+ wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+ # not asserting if there is a mismatch due to possible padding
+ print(f"Have {avail_numel} numels to process.")
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+ # params
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+ # out-of-core computing solution
+ total_numel = 0
+ total_params = 0
+ for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+ offset = 0
+ avail_numel = full_single_fp32_vector.numel()
+ for name, shape in shapes.items():
+
+ unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+ total_numel += unpartitioned_numel
+ total_params += 1
+
+ if debug:
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+ state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+ offset += unpartitioned_numel
+
+ # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+ # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+ # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+ # live optimizer object, so we are checking that the numbers are within the right range
+ align_to = 2 * world_size
+
+ def zero2_align(x):
+ return align_to * math.ceil(x / align_to)
+
+ if debug:
+ print(f"original offset={offset}, avail_numel={avail_numel}")
+
+ offset = zero2_align(offset)
+ avail_numel = zero2_align(avail_numel)
+
+ if debug:
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
+
+ # Sanity check
+ if offset != avail_numel:
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+ print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters):
+ state_dict = OrderedDict()
+
+ # buffers
+ buffers = zero_model_states[0].buffers
+ state_dict.update(buffers)
+ if debug:
+ print(f"added {len(buffers)} buffers")
+
+ if not exclude_frozen_parameters:
+ _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+ _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+ # recover shared parameters
+ for pair in zero_model_states[0].shared_params:
+ if pair[1] in state_dict:
+ state_dict[pair[0]] = state_dict[pair[1]]
+
+ return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+ remainder = unpartitioned_numel % world_size
+ padding_numel = (world_size - remainder) if remainder else 0
+ partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+ return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+ return
+
+ if debug:
+ for i in range(world_size):
+ num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+ print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+ wanted_params = len(frozen_param_shapes)
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+ avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+ print(f'Frozen params: Have {avail_numel} numels to process.')
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+ total_params = 0
+ total_numel = 0
+ for name, shape in zero_model_states[0].frozen_param_shapes.items():
+ total_params += 1
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+
+ param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+ state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+ if debug:
+ print(
+ f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+ )
+
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+ param_shapes = zero_model_states[0].param_shapes
+ avail_numel = fp32_flat_groups[0].numel() * world_size
+ # Reconstruction protocol: For zero3 we need to zip the partitions together at the boundary of each
+ # param, re-consolidating each param while dealing with any padding
+
+ # merge list of dicts, preserving order
+ param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+ if debug:
+ for i in range(world_size):
+ print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+ wanted_params = len(param_shapes)
+ wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+ # not asserting if there is a mismatch due to possible padding
+ avail_numel = fp32_flat_groups[0].numel() * world_size
+ print(f"Trainable params: Have {avail_numel} numels to process.")
+ print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+ # params
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+ # out-of-core computing solution
+ offset = 0
+ total_numel = 0
+ total_params = 0
+ for name, shape in param_shapes.items():
+
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+ total_params += 1
+
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+ if debug:
+ print(
+ f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+ )
+
+ # XXX: memory usage doubles here
+ state_dict[name] = torch.cat(
+ tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+ 0).narrow(0, 0, unpartitioned_numel).view(shape)
+ offset += partitioned_numel
+
+ offset *= world_size
+
+ # Sanity check
+ if offset != avail_numel:
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+ print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters):
+ state_dict = OrderedDict()
+
+ # buffers
+ buffers = zero_model_states[0].buffers
+ state_dict.update(buffers)
+ if debug:
+ print(f"added {len(buffers)} buffers")
+
+ if not exclude_frozen_parameters:
+ _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+ _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+ # recover shared parameters
+ for pair in zero_model_states[0].shared_params:
+ if pair[1] in state_dict:
+ state_dict[pair[0]] = state_dict[pair[1]]
+
+ return state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
+ """
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+ ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+ via a model hub.
+
+ Args:
+ - ``checkpoint_dir``: path to the desired checkpoint folder
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+ - ``exclude_frozen_parameters``: exclude frozen parameters
+
+ Returns:
+ - pytorch ``state_dict``
+
+ Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+ you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+ the checkpoint.
+
+ A typical usage might be ::
+
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+ # do the training and checkpoint saving
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+ model = model.cpu() # move to cpu
+ model.load_state_dict(state_dict)
+ # submit to model hub or save the model to share with others
+
+ In this example the ``model`` will no longer be usable in the deepspeed context of the same
+ application. i.e. you will need to re-initialize the deepspeed engine, since
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+ If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+ """
+ if tag is None:
+ latest_path = os.path.join(checkpoint_dir, 'latest')
+ if os.path.isfile(latest_path):
+ with open(latest_path, 'r') as fd:
+ tag = fd.read().strip()
+ else:
+ raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+ ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+ if not os.path.isdir(ds_checkpoint_dir):
+ raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+ return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
+ """
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+ loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+ Args:
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+ - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+        - ``tag``: checkpoint tag used as a unique identifier for the checkpoint. If not provided, the tag is read from the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+ - ``exclude_frozen_parameters``: exclude frozen parameters
+ """
+
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
+ print(f"Saving fp32 state dict to {output_file}")
+ torch.save(state_dict, output_file)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+ """
+    1. Put the provided model on cpu
+    2. Convert a ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+ 3. Load it into the provided model
+
+ Args:
+ - ``model``: the model object to update
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for the checkpoint. If not provided, the tag is read from the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+ Returns:
+        - ``model``: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough, use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+ A typical usage might be ::
+
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+ model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+ # submit to model hub or save the model to share with others
+
+    Note that once this has been run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application, i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+ """
+    logger.info("Extracting fp32 weights")
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info("Overwriting model with fp32 weights")
+ model = model.cpu()
+ model.load_state_dict(state_dict, strict=False)
+
+ return model
+
+
+if __name__ == "__main__":
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("checkpoint_dir",
+ type=str,
+ help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+ parser.add_argument(
+ "output_file",
+ type=str,
+ help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+ parser.add_argument("-t",
+ "--tag",
+ type=str,
+ default=None,
+ help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+ parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+ parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+ args = parser.parse_args()
+
+ debug = args.debug
+
+ convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+ args.output_file,
+ tag=args.tag,
+ exclude_frozen_parameters=args.exclude_frozen_parameters)
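As a point of reference, the following is a minimal sketch of how the functions above can be invoked to consolidate one of this run's DeepSpeed checkpoints (e.g. `checkpoint-8460`, which contains a `latest` file and a `global_step8460` folder) into a single fp32 `state_dict`. The paths are illustrative assumptions, not part of the script itself.

```python
# Illustrative usage of zero_to_fp32.py; paths are assumptions based on this repository's layout.
from zero_to_fp32 import (
    convert_zero_checkpoint_to_fp32_state_dict,
    get_fp32_state_dict_from_zero_checkpoint,
)

checkpoint_dir = "checkpoint-8460"  # folder containing the 'latest' file and global_step8460/

# Option 1: write a consolidated fp32 checkpoint file to disk.
convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, "checkpoint-8460/pytorch_model_fp32.bin")

# Option 2: build the fp32 state_dict in memory (requires enough free CPU RAM).
state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir)
print(f"reconstructed {len(state_dict)} tensors")
```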
diff --git a/checkpoint-8460/README.md b/checkpoint-8460/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..049d467664ca6172b7ffbe6ba60b3eac7479cac4
--- /dev/null
+++ b/checkpoint-8460/README.md
@@ -0,0 +1,202 @@
+---
+base_model: meta-llama/Llama-3.1-8B
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
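+
+Below is an illustrative sketch (not an official recipe) of loading the `meta-llama/Llama-3.1-8B` base model and attaching this checkpoint's LoRA adapter with `peft`; the adapter path and the prompt are placeholders.
+
+```python
+# Sketch only: paths, dtype and prompt are assumptions, not verified settings.
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from peft import PeftModel
+
+base_model_id = "meta-llama/Llama-3.1-8B"
+adapter_path = "./checkpoint-8460"  # placeholder: local path to this adapter checkpoint
+
+tokenizer = AutoTokenizer.from_pretrained(adapter_path)
+base_model = AutoModelForCausalLM.from_pretrained(base_model_id, torch_dtype=torch.bfloat16)
+model = PeftModel.from_pretrained(base_model, adapter_path)
+
+prompt = "Summarize the eligibility criteria of the following clinical trial: ..."  # illustrative
+inputs = tokenizer(prompt, return_tensors="pt")
+outputs = model.generate(**inputs, max_new_tokens=128)
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+```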
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+
+### Framework versions
+
+- PEFT 0.14.0
\ No newline at end of file
diff --git a/checkpoint-8460/adapter_config.json b/checkpoint-8460/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..38890ca21f7e1854f7350049a9113a9eec8a443a
--- /dev/null
+++ b/checkpoint-8460/adapter_config.json
@@ -0,0 +1,40 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "meta-llama/Llama-3.1-8B",
+ "bias": "none",
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 512,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": [
+ "embed_tokens",
+ "lm_head"
+ ],
+ "peft_type": "LORA",
+ "r": 256,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "v_proj",
+ "q_proj",
+ "gate_proj",
+ "k_proj",
+ "down_proj",
+ "up_proj",
+ "o_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
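For readability, here is a sketch of the same adapter configuration expressed as a `peft.LoraConfig`; it simply mirrors the JSON above (the checkpoint reports PEFT 0.14.0) and is not the exact object used during training.

```python
# Sketch: adapter_config.json above rewritten as a peft.LoraConfig for readability.
from peft import LoraConfig

lora_config = LoraConfig(
    r=256,
    lora_alpha=512,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "up_proj", "down_proj", "gate_proj"],
    modules_to_save=["embed_tokens", "lm_head"],
    task_type="CAUSAL_LM",
)
```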
diff --git a/checkpoint-8460/adapter_model.safetensors b/checkpoint-8460/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..fa4326d337a0367fb307452fbb18e0d6811353a3
--- /dev/null
+++ b/checkpoint-8460/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5cf66a7393d44ed5793e47a8d373bf1fe6cfc2b1ae0d9bb705e40621dae6ac59
+size 3443586272
diff --git a/checkpoint-8460/global_step8460/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-8460/global_step8460/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..bffc1cb862cdaaf2f5be13c9b935ea0e155d5e0c
--- /dev/null
+++ b/checkpoint-8460/global_step8460/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:db80595bb7524b5c711427518049c54e03a91cd0eb192779cec37d5357e16f3d
+size 20661195036
diff --git a/checkpoint-8460/global_step8460/mp_rank_00_model_states.pt b/checkpoint-8460/global_step8460/mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..d61389e9281218f5df2f414c63a04e7489340bc6
--- /dev/null
+++ b/checkpoint-8460/global_step8460/mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1b51c058c0ea1479af01a9aa78fb7ce7e200463b1ac38fb017b8dd09bfada7c4
+size 3555326841
diff --git a/checkpoint-8460/latest b/checkpoint-8460/latest
new file mode 100644
index 0000000000000000000000000000000000000000..54f6ed5439ebecfdabdb8ffb050bd657ed115c6c
--- /dev/null
+++ b/checkpoint-8460/latest
@@ -0,0 +1 @@
+global_step8460
\ No newline at end of file
diff --git a/checkpoint-8460/rng_state.pth b/checkpoint-8460/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..65526098b19e2db0331412cbe0cbbf7b88169aae
--- /dev/null
+++ b/checkpoint-8460/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e725b02912a27d6bc1153bf1e82bf617b65b24cf1135b112005175c489e7e308
+size 14244
diff --git a/checkpoint-8460/scheduler.pt b/checkpoint-8460/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..4dc69344bcc497bb307eafd8dedd38d029ef1020
--- /dev/null
+++ b/checkpoint-8460/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f00190c6fe6ba125384231887614f34f1cef1f3f4a39aba19b287c2903fc371e
+size 1064
diff --git a/checkpoint-8460/special_tokens_map.json b/checkpoint-8460/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..e5b39b6305d89284b04934011c68dbb26bf588ca
--- /dev/null
+++ b/checkpoint-8460/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-8460/tokenizer.json b/checkpoint-8460/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2
--- /dev/null
+++ b/checkpoint-8460/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
diff --git a/checkpoint-8460/tokenizer_config.json b/checkpoint-8460/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..81dd14db6632ad5b35b9d447732e37ac074873a5
--- /dev/null
+++ b/checkpoint-8460/tokenizer_config.json
@@ -0,0 +1,2063 @@
+{
+ "added_tokens_decoder": {
+ "128000": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128001": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128002": {
+ "content": "<|reserved_special_token_0|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128003": {
+ "content": "<|reserved_special_token_1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128004": {
+ "content": "<|finetune_right_pad_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128005": {
+ "content": "<|reserved_special_token_2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128006": {
+ "content": "<|start_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128007": {
+ "content": "<|end_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128008": {
+ "content": "<|eom_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128009": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128010": {
+ "content": "<|python_tag|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128011": {
+ "content": "<|reserved_special_token_3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128012": {
+ "content": "<|reserved_special_token_4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128013": {
+ "content": "<|reserved_special_token_5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128014": {
+ "content": "<|reserved_special_token_6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128015": {
+ "content": "<|reserved_special_token_7|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128016": {
+ "content": "<|reserved_special_token_8|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128017": {
+ "content": "<|reserved_special_token_9|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128018": {
+ "content": "<|reserved_special_token_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128019": {
+ "content": "<|reserved_special_token_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128020": {
+ "content": "<|reserved_special_token_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128021": {
+ "content": "<|reserved_special_token_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128022": {
+ "content": "<|reserved_special_token_14|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128023": {
+ "content": "<|reserved_special_token_15|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128024": {
+ "content": "<|reserved_special_token_16|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128025": {
+ "content": "<|reserved_special_token_17|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128026": {
+ "content": "<|reserved_special_token_18|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128027": {
+ "content": "<|reserved_special_token_19|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128028": {
+ "content": "<|reserved_special_token_20|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128029": {
+ "content": "<|reserved_special_token_21|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128030": {
+ "content": "<|reserved_special_token_22|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128031": {
+ "content": "<|reserved_special_token_23|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128032": {
+ "content": "<|reserved_special_token_24|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128033": {
+ "content": "<|reserved_special_token_25|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128034": {
+ "content": "<|reserved_special_token_26|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128035": {
+ "content": "<|reserved_special_token_27|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128036": {
+ "content": "<|reserved_special_token_28|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128037": {
+ "content": "<|reserved_special_token_29|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128038": {
+ "content": "<|reserved_special_token_30|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128039": {
+ "content": "<|reserved_special_token_31|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128040": {
+ "content": "<|reserved_special_token_32|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128041": {
+ "content": "<|reserved_special_token_33|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128042": {
+ "content": "<|reserved_special_token_34|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128043": {
+ "content": "<|reserved_special_token_35|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128044": {
+ "content": "<|reserved_special_token_36|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128045": {
+ "content": "<|reserved_special_token_37|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128046": {
+ "content": "<|reserved_special_token_38|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128047": {
+ "content": "<|reserved_special_token_39|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128048": {
+ "content": "<|reserved_special_token_40|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128049": {
+ "content": "<|reserved_special_token_41|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128050": {
+ "content": "<|reserved_special_token_42|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128051": {
+ "content": "<|reserved_special_token_43|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128052": {
+ "content": "<|reserved_special_token_44|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128053": {
+ "content": "<|reserved_special_token_45|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128054": {
+ "content": "<|reserved_special_token_46|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128055": {
+ "content": "<|reserved_special_token_47|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128056": {
+ "content": "<|reserved_special_token_48|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128057": {
+ "content": "<|reserved_special_token_49|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128058": {
+ "content": "<|reserved_special_token_50|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128059": {
+ "content": "<|reserved_special_token_51|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128060": {
+ "content": "<|reserved_special_token_52|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128061": {
+ "content": "<|reserved_special_token_53|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128062": {
+ "content": "<|reserved_special_token_54|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128063": {
+ "content": "<|reserved_special_token_55|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128064": {
+ "content": "<|reserved_special_token_56|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128065": {
+ "content": "<|reserved_special_token_57|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128066": {
+ "content": "<|reserved_special_token_58|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128067": {
+ "content": "<|reserved_special_token_59|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128068": {
+ "content": "<|reserved_special_token_60|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128069": {
+ "content": "<|reserved_special_token_61|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128070": {
+ "content": "<|reserved_special_token_62|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128071": {
+ "content": "<|reserved_special_token_63|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128072": {
+ "content": "<|reserved_special_token_64|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128073": {
+ "content": "<|reserved_special_token_65|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128074": {
+ "content": "<|reserved_special_token_66|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128075": {
+ "content": "<|reserved_special_token_67|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128076": {
+ "content": "<|reserved_special_token_68|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128077": {
+ "content": "<|reserved_special_token_69|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128078": {
+ "content": "<|reserved_special_token_70|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128079": {
+ "content": "<|reserved_special_token_71|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128080": {
+ "content": "<|reserved_special_token_72|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128081": {
+ "content": "<|reserved_special_token_73|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128082": {
+ "content": "<|reserved_special_token_74|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128083": {
+ "content": "<|reserved_special_token_75|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128084": {
+ "content": "<|reserved_special_token_76|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128085": {
+ "content": "<|reserved_special_token_77|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128086": {
+ "content": "<|reserved_special_token_78|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128087": {
+ "content": "<|reserved_special_token_79|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128088": {
+ "content": "<|reserved_special_token_80|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128089": {
+ "content": "<|reserved_special_token_81|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128090": {
+ "content": "<|reserved_special_token_82|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128091": {
+ "content": "<|reserved_special_token_83|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128092": {
+ "content": "<|reserved_special_token_84|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128093": {
+ "content": "<|reserved_special_token_85|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128094": {
+ "content": "<|reserved_special_token_86|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128095": {
+ "content": "<|reserved_special_token_87|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128096": {
+ "content": "<|reserved_special_token_88|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128097": {
+ "content": "<|reserved_special_token_89|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128098": {
+ "content": "<|reserved_special_token_90|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128099": {
+ "content": "<|reserved_special_token_91|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128100": {
+ "content": "<|reserved_special_token_92|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128101": {
+ "content": "<|reserved_special_token_93|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128102": {
+ "content": "<|reserved_special_token_94|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128103": {
+ "content": "<|reserved_special_token_95|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128104": {
+ "content": "<|reserved_special_token_96|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128105": {
+ "content": "<|reserved_special_token_97|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128106": {
+ "content": "<|reserved_special_token_98|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128107": {
+ "content": "<|reserved_special_token_99|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128108": {
+ "content": "<|reserved_special_token_100|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128109": {
+ "content": "<|reserved_special_token_101|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128110": {
+ "content": "<|reserved_special_token_102|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128111": {
+ "content": "<|reserved_special_token_103|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128112": {
+ "content": "<|reserved_special_token_104|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128113": {
+ "content": "<|reserved_special_token_105|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128114": {
+ "content": "<|reserved_special_token_106|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128115": {
+ "content": "<|reserved_special_token_107|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128116": {
+ "content": "<|reserved_special_token_108|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128117": {
+ "content": "<|reserved_special_token_109|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128118": {
+ "content": "<|reserved_special_token_110|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128119": {
+ "content": "<|reserved_special_token_111|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128120": {
+ "content": "<|reserved_special_token_112|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128121": {
+ "content": "<|reserved_special_token_113|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128122": {
+ "content": "<|reserved_special_token_114|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128123": {
+ "content": "<|reserved_special_token_115|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128124": {
+ "content": "<|reserved_special_token_116|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128125": {
+ "content": "<|reserved_special_token_117|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128126": {
+ "content": "<|reserved_special_token_118|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128127": {
+ "content": "<|reserved_special_token_119|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128128": {
+ "content": "<|reserved_special_token_120|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128129": {
+ "content": "<|reserved_special_token_121|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128130": {
+ "content": "<|reserved_special_token_122|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128131": {
+ "content": "<|reserved_special_token_123|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128132": {
+ "content": "<|reserved_special_token_124|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128133": {
+ "content": "<|reserved_special_token_125|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128134": {
+ "content": "<|reserved_special_token_126|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128135": {
+ "content": "<|reserved_special_token_127|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128136": {
+ "content": "<|reserved_special_token_128|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128137": {
+ "content": "<|reserved_special_token_129|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128138": {
+ "content": "<|reserved_special_token_130|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128139": {
+ "content": "<|reserved_special_token_131|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128140": {
+ "content": "<|reserved_special_token_132|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128141": {
+ "content": "<|reserved_special_token_133|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128142": {
+ "content": "<|reserved_special_token_134|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128143": {
+ "content": "<|reserved_special_token_135|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128144": {
+ "content": "<|reserved_special_token_136|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128145": {
+ "content": "<|reserved_special_token_137|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128146": {
+ "content": "<|reserved_special_token_138|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128147": {
+ "content": "<|reserved_special_token_139|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128148": {
+ "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128149": {
+ "content": "<|reserved_special_token_141|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128150": {
+ "content": "<|reserved_special_token_142|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128151": {
+ "content": "<|reserved_special_token_143|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128152": {
+ "content": "<|reserved_special_token_144|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128153": {
+ "content": "<|reserved_special_token_145|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128154": {
+ "content": "<|reserved_special_token_146|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128155": {
+ "content": "<|reserved_special_token_147|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128156": {
+ "content": "<|reserved_special_token_148|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128157": {
+ "content": "<|reserved_special_token_149|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128158": {
+ "content": "<|reserved_special_token_150|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128159": {
+ "content": "<|reserved_special_token_151|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128160": {
+ "content": "<|reserved_special_token_152|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128161": {
+ "content": "<|reserved_special_token_153|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128162": {
+ "content": "<|reserved_special_token_154|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128163": {
+ "content": "<|reserved_special_token_155|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128164": {
+ "content": "<|reserved_special_token_156|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128165": {
+ "content": "<|reserved_special_token_157|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128166": {
+ "content": "<|reserved_special_token_158|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128167": {
+ "content": "<|reserved_special_token_159|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128168": {
+ "content": "<|reserved_special_token_160|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128169": {
+ "content": "<|reserved_special_token_161|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128170": {
+ "content": "<|reserved_special_token_162|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128171": {
+ "content": "<|reserved_special_token_163|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128172": {
+ "content": "<|reserved_special_token_164|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128173": {
+ "content": "<|reserved_special_token_165|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128174": {
+ "content": "<|reserved_special_token_166|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128175": {
+ "content": "<|reserved_special_token_167|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128176": {
+ "content": "<|reserved_special_token_168|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128177": {
+ "content": "<|reserved_special_token_169|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128178": {
+ "content": "<|reserved_special_token_170|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128179": {
+ "content": "<|reserved_special_token_171|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128180": {
+ "content": "<|reserved_special_token_172|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128181": {
+ "content": "<|reserved_special_token_173|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128182": {
+ "content": "<|reserved_special_token_174|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128183": {
+ "content": "<|reserved_special_token_175|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128184": {
+ "content": "<|reserved_special_token_176|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128185": {
+ "content": "<|reserved_special_token_177|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128186": {
+ "content": "<|reserved_special_token_178|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128187": {
+ "content": "<|reserved_special_token_179|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128188": {
+ "content": "<|reserved_special_token_180|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128189": {
+ "content": "<|reserved_special_token_181|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128190": {
+ "content": "<|reserved_special_token_182|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128191": {
+ "content": "<|reserved_special_token_183|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128192": {
+ "content": "<|reserved_special_token_184|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128193": {
+ "content": "<|reserved_special_token_185|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128194": {
+ "content": "<|reserved_special_token_186|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128195": {
+ "content": "<|reserved_special_token_187|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128196": {
+ "content": "<|reserved_special_token_188|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128197": {
+ "content": "<|reserved_special_token_189|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128198": {
+ "content": "<|reserved_special_token_190|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128199": {
+ "content": "<|reserved_special_token_191|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128200": {
+ "content": "<|reserved_special_token_192|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128201": {
+ "content": "<|reserved_special_token_193|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128202": {
+ "content": "<|reserved_special_token_194|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128203": {
+ "content": "<|reserved_special_token_195|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128204": {
+ "content": "<|reserved_special_token_196|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128205": {
+ "content": "<|reserved_special_token_197|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128206": {
+ "content": "<|reserved_special_token_198|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128207": {
+ "content": "<|reserved_special_token_199|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128208": {
+ "content": "<|reserved_special_token_200|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128209": {
+ "content": "<|reserved_special_token_201|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128210": {
+ "content": "<|reserved_special_token_202|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128211": {
+ "content": "<|reserved_special_token_203|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128212": {
+ "content": "<|reserved_special_token_204|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128213": {
+ "content": "<|reserved_special_token_205|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128214": {
+ "content": "<|reserved_special_token_206|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128215": {
+ "content": "<|reserved_special_token_207|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128216": {
+ "content": "<|reserved_special_token_208|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128217": {
+ "content": "<|reserved_special_token_209|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128218": {
+ "content": "<|reserved_special_token_210|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128219": {
+ "content": "<|reserved_special_token_211|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128220": {
+ "content": "<|reserved_special_token_212|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128221": {
+ "content": "<|reserved_special_token_213|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128222": {
+ "content": "<|reserved_special_token_214|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128223": {
+ "content": "<|reserved_special_token_215|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128224": {
+ "content": "<|reserved_special_token_216|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128225": {
+ "content": "<|reserved_special_token_217|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128226": {
+ "content": "<|reserved_special_token_218|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128227": {
+ "content": "<|reserved_special_token_219|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128228": {
+ "content": "<|reserved_special_token_220|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128229": {
+ "content": "<|reserved_special_token_221|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128230": {
+ "content": "<|reserved_special_token_222|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128231": {
+ "content": "<|reserved_special_token_223|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128232": {
+ "content": "<|reserved_special_token_224|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128233": {
+ "content": "<|reserved_special_token_225|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128234": {
+ "content": "<|reserved_special_token_226|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128235": {
+ "content": "<|reserved_special_token_227|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128236": {
+ "content": "<|reserved_special_token_228|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128237": {
+ "content": "<|reserved_special_token_229|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128238": {
+ "content": "<|reserved_special_token_230|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128239": {
+ "content": "<|reserved_special_token_231|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128240": {
+ "content": "<|reserved_special_token_232|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128241": {
+ "content": "<|reserved_special_token_233|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128242": {
+ "content": "<|reserved_special_token_234|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128243": {
+ "content": "<|reserved_special_token_235|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128244": {
+ "content": "<|reserved_special_token_236|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128245": {
+ "content": "<|reserved_special_token_237|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128246": {
+ "content": "<|reserved_special_token_238|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128247": {
+ "content": "<|reserved_special_token_239|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128248": {
+ "content": "<|reserved_special_token_240|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128249": {
+ "content": "<|reserved_special_token_241|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128250": {
+ "content": "<|reserved_special_token_242|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128251": {
+ "content": "<|reserved_special_token_243|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128252": {
+ "content": "<|reserved_special_token_244|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128253": {
+ "content": "<|reserved_special_token_245|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128254": {
+ "content": "<|reserved_special_token_246|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128255": {
+ "content": "<|reserved_special_token_247|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|begin_of_text|>",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|end_of_text|>",
+ "extra_special_tokens": {},
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 131072,
+ "pad_token": "<|end_of_text|>",
+ "tokenizer_class": "PreTrainedTokenizer"
+}
diff --git a/checkpoint-8460/trainer_state.json b/checkpoint-8460/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..51dba668b24d65764f94a1d2c69d6bbcd2f54ae2
--- /dev/null
+++ b/checkpoint-8460/trainer_state.json
@@ -0,0 +1,59253 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 4.0,
+ "eval_steps": 500,
+ "global_step": 8460,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.00047281323877068556,
+ "grad_norm": 5.163570880889893,
+ "learning_rate": 5.0000000000000004e-08,
+ "loss": 1.4628,
+ "step": 1
+ },
+ {
+ "epoch": 0.0009456264775413711,
+ "grad_norm": 6.298020839691162,
+ "learning_rate": 1.0000000000000001e-07,
+ "loss": 1.5003,
+ "step": 2
+ },
+ {
+ "epoch": 0.0014184397163120568,
+ "grad_norm": 5.853623390197754,
+ "learning_rate": 1.5000000000000002e-07,
+ "loss": 1.4495,
+ "step": 3
+ },
+ {
+ "epoch": 0.0018912529550827422,
+ "grad_norm": 5.456025123596191,
+ "learning_rate": 2.0000000000000002e-07,
+ "loss": 1.3798,
+ "step": 4
+ },
+ {
+ "epoch": 0.002364066193853428,
+ "grad_norm": 5.757407188415527,
+ "learning_rate": 2.5000000000000004e-07,
+ "loss": 1.4515,
+ "step": 5
+ },
+ {
+ "epoch": 0.0028368794326241137,
+ "grad_norm": 5.872277736663818,
+ "learning_rate": 3.0000000000000004e-07,
+ "loss": 1.4424,
+ "step": 6
+ },
+ {
+ "epoch": 0.003309692671394799,
+ "grad_norm": 6.7816009521484375,
+ "learning_rate": 3.5000000000000004e-07,
+ "loss": 1.4004,
+ "step": 7
+ },
+ {
+ "epoch": 0.0037825059101654845,
+ "grad_norm": 6.229667663574219,
+ "learning_rate": 4.0000000000000003e-07,
+ "loss": 1.4494,
+ "step": 8
+ },
+ {
+ "epoch": 0.00425531914893617,
+ "grad_norm": 5.336202621459961,
+ "learning_rate": 4.5000000000000003e-07,
+ "loss": 1.3916,
+ "step": 9
+ },
+ {
+ "epoch": 0.004728132387706856,
+ "grad_norm": 5.589445114135742,
+ "learning_rate": 5.000000000000001e-07,
+ "loss": 1.2318,
+ "step": 10
+ },
+ {
+ "epoch": 0.005200945626477541,
+ "grad_norm": 5.720539569854736,
+ "learning_rate": 5.5e-07,
+ "loss": 1.4367,
+ "step": 11
+ },
+ {
+ "epoch": 0.005673758865248227,
+ "grad_norm": 5.913913726806641,
+ "learning_rate": 6.000000000000001e-07,
+ "loss": 1.342,
+ "step": 12
+ },
+ {
+ "epoch": 0.006146572104018913,
+ "grad_norm": 5.899744987487793,
+ "learning_rate": 6.5e-07,
+ "loss": 1.4307,
+ "step": 13
+ },
+ {
+ "epoch": 0.006619385342789598,
+ "grad_norm": 5.571037292480469,
+ "learning_rate": 7.000000000000001e-07,
+ "loss": 1.3372,
+ "step": 14
+ },
+ {
+ "epoch": 0.0070921985815602835,
+ "grad_norm": 5.480010509490967,
+ "learning_rate": 7.5e-07,
+ "loss": 1.3923,
+ "step": 15
+ },
+ {
+ "epoch": 0.007565011820330969,
+ "grad_norm": 5.254702091217041,
+ "learning_rate": 8.000000000000001e-07,
+ "loss": 1.2928,
+ "step": 16
+ },
+ {
+ "epoch": 0.008037825059101654,
+ "grad_norm": 6.090312480926514,
+ "learning_rate": 8.500000000000001e-07,
+ "loss": 1.4984,
+ "step": 17
+ },
+ {
+ "epoch": 0.00851063829787234,
+ "grad_norm": 5.689319610595703,
+ "learning_rate": 9.000000000000001e-07,
+ "loss": 1.4108,
+ "step": 18
+ },
+ {
+ "epoch": 0.008983451536643027,
+ "grad_norm": 5.386685848236084,
+ "learning_rate": 9.500000000000001e-07,
+ "loss": 1.425,
+ "step": 19
+ },
+ {
+ "epoch": 0.009456264775413711,
+ "grad_norm": 6.451584815979004,
+ "learning_rate": 1.0000000000000002e-06,
+ "loss": 1.5507,
+ "step": 20
+ },
+ {
+ "epoch": 0.009929078014184398,
+ "grad_norm": 5.37647008895874,
+ "learning_rate": 1.0500000000000001e-06,
+ "loss": 1.4109,
+ "step": 21
+ },
+ {
+ "epoch": 0.010401891252955082,
+ "grad_norm": 4.716553211212158,
+ "learning_rate": 1.1e-06,
+ "loss": 1.2028,
+ "step": 22
+ },
+ {
+ "epoch": 0.010874704491725768,
+ "grad_norm": 4.950989723205566,
+ "learning_rate": 1.1500000000000002e-06,
+ "loss": 1.3043,
+ "step": 23
+ },
+ {
+ "epoch": 0.011347517730496455,
+ "grad_norm": 4.688975811004639,
+ "learning_rate": 1.2000000000000002e-06,
+ "loss": 1.2708,
+ "step": 24
+ },
+ {
+ "epoch": 0.01182033096926714,
+ "grad_norm": 4.905868053436279,
+ "learning_rate": 1.25e-06,
+ "loss": 1.3268,
+ "step": 25
+ },
+ {
+ "epoch": 0.012293144208037825,
+ "grad_norm": 4.503395080566406,
+ "learning_rate": 1.3e-06,
+ "loss": 1.1799,
+ "step": 26
+ },
+ {
+ "epoch": 0.01276595744680851,
+ "grad_norm": 4.77382230758667,
+ "learning_rate": 1.3500000000000002e-06,
+ "loss": 1.3882,
+ "step": 27
+ },
+ {
+ "epoch": 0.013238770685579196,
+ "grad_norm": 4.734329700469971,
+ "learning_rate": 1.4000000000000001e-06,
+ "loss": 1.3476,
+ "step": 28
+ },
+ {
+ "epoch": 0.013711583924349883,
+ "grad_norm": 4.775066375732422,
+ "learning_rate": 1.45e-06,
+ "loss": 1.2429,
+ "step": 29
+ },
+ {
+ "epoch": 0.014184397163120567,
+ "grad_norm": 4.978334426879883,
+ "learning_rate": 1.5e-06,
+ "loss": 1.2119,
+ "step": 30
+ },
+ {
+ "epoch": 0.014657210401891253,
+ "grad_norm": 4.506785869598389,
+ "learning_rate": 1.5500000000000002e-06,
+ "loss": 1.3157,
+ "step": 31
+ },
+ {
+ "epoch": 0.015130023640661938,
+ "grad_norm": 4.007757186889648,
+ "learning_rate": 1.6000000000000001e-06,
+ "loss": 1.1451,
+ "step": 32
+ },
+ {
+ "epoch": 0.015602836879432624,
+ "grad_norm": 3.6621618270874023,
+ "learning_rate": 1.6500000000000003e-06,
+ "loss": 1.093,
+ "step": 33
+ },
+ {
+ "epoch": 0.01607565011820331,
+ "grad_norm": 3.8733766078948975,
+ "learning_rate": 1.7000000000000002e-06,
+ "loss": 1.2289,
+ "step": 34
+ },
+ {
+ "epoch": 0.016548463356973995,
+ "grad_norm": 4.3391900062561035,
+ "learning_rate": 1.75e-06,
+ "loss": 1.1453,
+ "step": 35
+ },
+ {
+ "epoch": 0.01702127659574468,
+ "grad_norm": 3.287623643875122,
+ "learning_rate": 1.8000000000000001e-06,
+ "loss": 1.0257,
+ "step": 36
+ },
+ {
+ "epoch": 0.017494089834515367,
+ "grad_norm": 3.591721773147583,
+ "learning_rate": 1.85e-06,
+ "loss": 0.9976,
+ "step": 37
+ },
+ {
+ "epoch": 0.017966903073286054,
+ "grad_norm": 4.028271675109863,
+ "learning_rate": 1.9000000000000002e-06,
+ "loss": 1.0773,
+ "step": 38
+ },
+ {
+ "epoch": 0.018439716312056736,
+ "grad_norm": 3.3543951511383057,
+ "learning_rate": 1.9500000000000004e-06,
+ "loss": 1.1677,
+ "step": 39
+ },
+ {
+ "epoch": 0.018912529550827423,
+ "grad_norm": 3.807624340057373,
+ "learning_rate": 2.0000000000000003e-06,
+ "loss": 1.1232,
+ "step": 40
+ },
+ {
+ "epoch": 0.01938534278959811,
+ "grad_norm": 4.242797374725342,
+ "learning_rate": 2.05e-06,
+ "loss": 1.1819,
+ "step": 41
+ },
+ {
+ "epoch": 0.019858156028368795,
+ "grad_norm": 3.4574992656707764,
+ "learning_rate": 2.1000000000000002e-06,
+ "loss": 0.9878,
+ "step": 42
+ },
+ {
+ "epoch": 0.02033096926713948,
+ "grad_norm": 3.906695604324341,
+ "learning_rate": 2.15e-06,
+ "loss": 1.0592,
+ "step": 43
+ },
+ {
+ "epoch": 0.020803782505910164,
+ "grad_norm": 3.7543163299560547,
+ "learning_rate": 2.2e-06,
+ "loss": 1.0309,
+ "step": 44
+ },
+ {
+ "epoch": 0.02127659574468085,
+ "grad_norm": 3.3777148723602295,
+ "learning_rate": 2.25e-06,
+ "loss": 1.0664,
+ "step": 45
+ },
+ {
+ "epoch": 0.021749408983451537,
+ "grad_norm": 3.6003634929656982,
+ "learning_rate": 2.3000000000000004e-06,
+ "loss": 1.0482,
+ "step": 46
+ },
+ {
+ "epoch": 0.022222222222222223,
+ "grad_norm": 3.3961377143859863,
+ "learning_rate": 2.35e-06,
+ "loss": 1.0252,
+ "step": 47
+ },
+ {
+ "epoch": 0.02269503546099291,
+ "grad_norm": 3.1601035594940186,
+ "learning_rate": 2.4000000000000003e-06,
+ "loss": 1.0435,
+ "step": 48
+ },
+ {
+ "epoch": 0.023167848699763592,
+ "grad_norm": 3.4192967414855957,
+ "learning_rate": 2.4500000000000003e-06,
+ "loss": 1.0935,
+ "step": 49
+ },
+ {
+ "epoch": 0.02364066193853428,
+ "grad_norm": 3.1225922107696533,
+ "learning_rate": 2.5e-06,
+ "loss": 0.8988,
+ "step": 50
+ },
+ {
+ "epoch": 0.024113475177304965,
+ "grad_norm": 3.1423380374908447,
+ "learning_rate": 2.55e-06,
+ "loss": 1.0159,
+ "step": 51
+ },
+ {
+ "epoch": 0.02458628841607565,
+ "grad_norm": 3.4782402515411377,
+ "learning_rate": 2.6e-06,
+ "loss": 1.0231,
+ "step": 52
+ },
+ {
+ "epoch": 0.025059101654846337,
+ "grad_norm": 3.8362693786621094,
+ "learning_rate": 2.6500000000000005e-06,
+ "loss": 1.0725,
+ "step": 53
+ },
+ {
+ "epoch": 0.02553191489361702,
+ "grad_norm": 3.033294916152954,
+ "learning_rate": 2.7000000000000004e-06,
+ "loss": 0.9377,
+ "step": 54
+ },
+ {
+ "epoch": 0.026004728132387706,
+ "grad_norm": 3.849741220474243,
+ "learning_rate": 2.7500000000000004e-06,
+ "loss": 1.0046,
+ "step": 55
+ },
+ {
+ "epoch": 0.026477541371158392,
+ "grad_norm": 3.141876220703125,
+ "learning_rate": 2.8000000000000003e-06,
+ "loss": 0.9226,
+ "step": 56
+ },
+ {
+ "epoch": 0.02695035460992908,
+ "grad_norm": 2.773594856262207,
+ "learning_rate": 2.85e-06,
+ "loss": 0.8662,
+ "step": 57
+ },
+ {
+ "epoch": 0.027423167848699765,
+ "grad_norm": 3.1460225582122803,
+ "learning_rate": 2.9e-06,
+ "loss": 0.9304,
+ "step": 58
+ },
+ {
+ "epoch": 0.027895981087470448,
+ "grad_norm": 3.293583631515503,
+ "learning_rate": 2.95e-06,
+ "loss": 1.0374,
+ "step": 59
+ },
+ {
+ "epoch": 0.028368794326241134,
+ "grad_norm": 3.8190863132476807,
+ "learning_rate": 3e-06,
+ "loss": 0.971,
+ "step": 60
+ },
+ {
+ "epoch": 0.02884160756501182,
+ "grad_norm": 3.4566776752471924,
+ "learning_rate": 3.05e-06,
+ "loss": 0.9631,
+ "step": 61
+ },
+ {
+ "epoch": 0.029314420803782507,
+ "grad_norm": 3.355741500854492,
+ "learning_rate": 3.1000000000000004e-06,
+ "loss": 1.0097,
+ "step": 62
+ },
+ {
+ "epoch": 0.029787234042553193,
+ "grad_norm": 3.29746675491333,
+ "learning_rate": 3.1500000000000003e-06,
+ "loss": 0.9459,
+ "step": 63
+ },
+ {
+ "epoch": 0.030260047281323876,
+ "grad_norm": 3.3122968673706055,
+ "learning_rate": 3.2000000000000003e-06,
+ "loss": 0.8594,
+ "step": 64
+ },
+ {
+ "epoch": 0.030732860520094562,
+ "grad_norm": 3.477701187133789,
+ "learning_rate": 3.2500000000000002e-06,
+ "loss": 0.9197,
+ "step": 65
+ },
+ {
+ "epoch": 0.031205673758865248,
+ "grad_norm": 3.3363406658172607,
+ "learning_rate": 3.3000000000000006e-06,
+ "loss": 0.9478,
+ "step": 66
+ },
+ {
+ "epoch": 0.03167848699763593,
+ "grad_norm": 4.143295764923096,
+ "learning_rate": 3.3500000000000005e-06,
+ "loss": 1.0534,
+ "step": 67
+ },
+ {
+ "epoch": 0.03215130023640662,
+ "grad_norm": 3.2363274097442627,
+ "learning_rate": 3.4000000000000005e-06,
+ "loss": 0.9454,
+ "step": 68
+ },
+ {
+ "epoch": 0.032624113475177303,
+ "grad_norm": 3.198746681213379,
+ "learning_rate": 3.45e-06,
+ "loss": 0.9388,
+ "step": 69
+ },
+ {
+ "epoch": 0.03309692671394799,
+ "grad_norm": 3.5751023292541504,
+ "learning_rate": 3.5e-06,
+ "loss": 0.9444,
+ "step": 70
+ },
+ {
+ "epoch": 0.033569739952718676,
+ "grad_norm": 3.1745729446411133,
+ "learning_rate": 3.5500000000000003e-06,
+ "loss": 0.8683,
+ "step": 71
+ },
+ {
+ "epoch": 0.03404255319148936,
+ "grad_norm": 3.3210883140563965,
+ "learning_rate": 3.6000000000000003e-06,
+ "loss": 0.8811,
+ "step": 72
+ },
+ {
+ "epoch": 0.03451536643026005,
+ "grad_norm": 3.2502429485321045,
+ "learning_rate": 3.65e-06,
+ "loss": 1.0012,
+ "step": 73
+ },
+ {
+ "epoch": 0.034988179669030735,
+ "grad_norm": 3.44598126411438,
+ "learning_rate": 3.7e-06,
+ "loss": 0.9217,
+ "step": 74
+ },
+ {
+ "epoch": 0.03546099290780142,
+ "grad_norm": 3.439117431640625,
+ "learning_rate": 3.7500000000000005e-06,
+ "loss": 0.8976,
+ "step": 75
+ },
+ {
+ "epoch": 0.03593380614657211,
+ "grad_norm": 3.523627758026123,
+ "learning_rate": 3.8000000000000005e-06,
+ "loss": 0.8996,
+ "step": 76
+ },
+ {
+ "epoch": 0.03640661938534279,
+ "grad_norm": 3.3716015815734863,
+ "learning_rate": 3.85e-06,
+ "loss": 0.9061,
+ "step": 77
+ },
+ {
+ "epoch": 0.03687943262411347,
+ "grad_norm": 3.33518385887146,
+ "learning_rate": 3.900000000000001e-06,
+ "loss": 0.9371,
+ "step": 78
+ },
+ {
+ "epoch": 0.03735224586288416,
+ "grad_norm": 3.833829879760742,
+ "learning_rate": 3.95e-06,
+ "loss": 0.9669,
+ "step": 79
+ },
+ {
+ "epoch": 0.037825059101654845,
+ "grad_norm": 3.260446786880493,
+ "learning_rate": 4.000000000000001e-06,
+ "loss": 0.9449,
+ "step": 80
+ },
+ {
+ "epoch": 0.03829787234042553,
+ "grad_norm": 3.532451629638672,
+ "learning_rate": 4.05e-06,
+ "loss": 0.897,
+ "step": 81
+ },
+ {
+ "epoch": 0.03877068557919622,
+ "grad_norm": 3.1156492233276367,
+ "learning_rate": 4.1e-06,
+ "loss": 0.8463,
+ "step": 82
+ },
+ {
+ "epoch": 0.039243498817966904,
+ "grad_norm": 2.8801751136779785,
+ "learning_rate": 4.15e-06,
+ "loss": 0.8616,
+ "step": 83
+ },
+ {
+ "epoch": 0.03971631205673759,
+ "grad_norm": 3.072476863861084,
+ "learning_rate": 4.2000000000000004e-06,
+ "loss": 0.8387,
+ "step": 84
+ },
+ {
+ "epoch": 0.04018912529550828,
+ "grad_norm": 2.9601376056671143,
+ "learning_rate": 4.25e-06,
+ "loss": 0.8538,
+ "step": 85
+ },
+ {
+ "epoch": 0.04066193853427896,
+ "grad_norm": 3.521664619445801,
+ "learning_rate": 4.3e-06,
+ "loss": 0.8894,
+ "step": 86
+ },
+ {
+ "epoch": 0.04113475177304964,
+ "grad_norm": 3.2670981884002686,
+ "learning_rate": 4.350000000000001e-06,
+ "loss": 0.8387,
+ "step": 87
+ },
+ {
+ "epoch": 0.04160756501182033,
+ "grad_norm": 3.422089099884033,
+ "learning_rate": 4.4e-06,
+ "loss": 0.7728,
+ "step": 88
+ },
+ {
+ "epoch": 0.042080378250591015,
+ "grad_norm": 3.414034128189087,
+ "learning_rate": 4.450000000000001e-06,
+ "loss": 0.7968,
+ "step": 89
+ },
+ {
+ "epoch": 0.0425531914893617,
+ "grad_norm": 4.234285354614258,
+ "learning_rate": 4.5e-06,
+ "loss": 0.8502,
+ "step": 90
+ },
+ {
+ "epoch": 0.04302600472813239,
+ "grad_norm": 3.1446919441223145,
+ "learning_rate": 4.5500000000000005e-06,
+ "loss": 0.8236,
+ "step": 91
+ },
+ {
+ "epoch": 0.043498817966903074,
+ "grad_norm": 3.683443307876587,
+ "learning_rate": 4.600000000000001e-06,
+ "loss": 0.9792,
+ "step": 92
+ },
+ {
+ "epoch": 0.04397163120567376,
+ "grad_norm": 3.664219617843628,
+ "learning_rate": 4.65e-06,
+ "loss": 0.8743,
+ "step": 93
+ },
+ {
+ "epoch": 0.044444444444444446,
+ "grad_norm": 3.369479179382324,
+ "learning_rate": 4.7e-06,
+ "loss": 0.8741,
+ "step": 94
+ },
+ {
+ "epoch": 0.04491725768321513,
+ "grad_norm": 3.694949150085449,
+ "learning_rate": 4.75e-06,
+ "loss": 0.7574,
+ "step": 95
+ },
+ {
+ "epoch": 0.04539007092198582,
+ "grad_norm": 3.5144498348236084,
+ "learning_rate": 4.800000000000001e-06,
+ "loss": 0.9934,
+ "step": 96
+ },
+ {
+ "epoch": 0.0458628841607565,
+ "grad_norm": 3.164451837539673,
+ "learning_rate": 4.85e-06,
+ "loss": 0.7463,
+ "step": 97
+ },
+ {
+ "epoch": 0.046335697399527184,
+ "grad_norm": 3.222785472869873,
+ "learning_rate": 4.9000000000000005e-06,
+ "loss": 0.7698,
+ "step": 98
+ },
+ {
+ "epoch": 0.04680851063829787,
+ "grad_norm": 2.9129555225372314,
+ "learning_rate": 4.95e-06,
+ "loss": 0.7856,
+ "step": 99
+ },
+ {
+ "epoch": 0.04728132387706856,
+ "grad_norm": 3.5061235427856445,
+ "learning_rate": 5e-06,
+ "loss": 0.8588,
+ "step": 100
+ },
+ {
+ "epoch": 0.04775413711583924,
+ "grad_norm": 3.2805044651031494,
+ "learning_rate": 4.999999922167982e-06,
+ "loss": 0.7643,
+ "step": 101
+ },
+ {
+ "epoch": 0.04822695035460993,
+ "grad_norm": 3.5461678504943848,
+ "learning_rate": 4.999999688671929e-06,
+ "loss": 0.8253,
+ "step": 102
+ },
+ {
+ "epoch": 0.048699763593380616,
+ "grad_norm": 3.2238264083862305,
+ "learning_rate": 4.99999929951186e-06,
+ "loss": 0.7622,
+ "step": 103
+ },
+ {
+ "epoch": 0.0491725768321513,
+ "grad_norm": 3.818955898284912,
+ "learning_rate": 4.999998754687795e-06,
+ "loss": 0.8471,
+ "step": 104
+ },
+ {
+ "epoch": 0.04964539007092199,
+ "grad_norm": 3.1252424716949463,
+ "learning_rate": 4.99999805419977e-06,
+ "loss": 0.8409,
+ "step": 105
+ },
+ {
+ "epoch": 0.050118203309692674,
+ "grad_norm": 3.604283571243286,
+ "learning_rate": 4.999997198047828e-06,
+ "loss": 0.9027,
+ "step": 106
+ },
+ {
+ "epoch": 0.050591016548463354,
+ "grad_norm": 3.6752424240112305,
+ "learning_rate": 4.999996186232023e-06,
+ "loss": 0.9336,
+ "step": 107
+ },
+ {
+ "epoch": 0.05106382978723404,
+ "grad_norm": 3.517557144165039,
+ "learning_rate": 4.9999950187524184e-06,
+ "loss": 0.8351,
+ "step": 108
+ },
+ {
+ "epoch": 0.051536643026004726,
+ "grad_norm": 3.427285671234131,
+ "learning_rate": 4.999993695609085e-06,
+ "loss": 0.8457,
+ "step": 109
+ },
+ {
+ "epoch": 0.05200945626477541,
+ "grad_norm": 3.2792510986328125,
+ "learning_rate": 4.999992216802107e-06,
+ "loss": 0.8391,
+ "step": 110
+ },
+ {
+ "epoch": 0.0524822695035461,
+ "grad_norm": 3.581094741821289,
+ "learning_rate": 4.999990582331576e-06,
+ "loss": 0.7533,
+ "step": 111
+ },
+ {
+ "epoch": 0.052955082742316785,
+ "grad_norm": 3.1667377948760986,
+ "learning_rate": 4.999988792197593e-06,
+ "loss": 0.9562,
+ "step": 112
+ },
+ {
+ "epoch": 0.05342789598108747,
+ "grad_norm": 3.3609890937805176,
+ "learning_rate": 4.99998684640027e-06,
+ "loss": 0.8181,
+ "step": 113
+ },
+ {
+ "epoch": 0.05390070921985816,
+ "grad_norm": 3.260627269744873,
+ "learning_rate": 4.999984744939729e-06,
+ "loss": 0.8012,
+ "step": 114
+ },
+ {
+ "epoch": 0.054373522458628844,
+ "grad_norm": 3.4535653591156006,
+ "learning_rate": 4.9999824878160985e-06,
+ "loss": 0.919,
+ "step": 115
+ },
+ {
+ "epoch": 0.05484633569739953,
+ "grad_norm": 3.4880740642547607,
+ "learning_rate": 4.999980075029522e-06,
+ "loss": 0.8114,
+ "step": 116
+ },
+ {
+ "epoch": 0.05531914893617021,
+ "grad_norm": 3.2546932697296143,
+ "learning_rate": 4.999977506580147e-06,
+ "loss": 0.8274,
+ "step": 117
+ },
+ {
+ "epoch": 0.055791962174940896,
+ "grad_norm": 3.2762744426727295,
+ "learning_rate": 4.999974782468136e-06,
+ "loss": 0.9018,
+ "step": 118
+ },
+ {
+ "epoch": 0.05626477541371158,
+ "grad_norm": 3.42825984954834,
+ "learning_rate": 4.999971902693657e-06,
+ "loss": 0.8262,
+ "step": 119
+ },
+ {
+ "epoch": 0.05673758865248227,
+ "grad_norm": 3.082496404647827,
+ "learning_rate": 4.99996886725689e-06,
+ "loss": 0.8181,
+ "step": 120
+ },
+ {
+ "epoch": 0.057210401891252954,
+ "grad_norm": 3.322869300842285,
+ "learning_rate": 4.9999656761580225e-06,
+ "loss": 0.8382,
+ "step": 121
+ },
+ {
+ "epoch": 0.05768321513002364,
+ "grad_norm": 3.6365339756011963,
+ "learning_rate": 4.9999623293972555e-06,
+ "loss": 0.7489,
+ "step": 122
+ },
+ {
+ "epoch": 0.05815602836879433,
+ "grad_norm": 3.376352548599243,
+ "learning_rate": 4.999958826974796e-06,
+ "loss": 0.9012,
+ "step": 123
+ },
+ {
+ "epoch": 0.05862884160756501,
+ "grad_norm": 3.49088716506958,
+ "learning_rate": 4.999955168890862e-06,
+ "loss": 0.8999,
+ "step": 124
+ },
+ {
+ "epoch": 0.0591016548463357,
+ "grad_norm": 3.3265068531036377,
+ "learning_rate": 4.999951355145682e-06,
+ "loss": 0.8161,
+ "step": 125
+ },
+ {
+ "epoch": 0.059574468085106386,
+ "grad_norm": 3.697282314300537,
+ "learning_rate": 4.999947385739493e-06,
+ "loss": 0.9623,
+ "step": 126
+ },
+ {
+ "epoch": 0.06004728132387707,
+ "grad_norm": 2.7901928424835205,
+ "learning_rate": 4.999943260672542e-06,
+ "loss": 0.7371,
+ "step": 127
+ },
+ {
+ "epoch": 0.06052009456264775,
+ "grad_norm": 3.110319137573242,
+ "learning_rate": 4.999938979945086e-06,
+ "loss": 0.715,
+ "step": 128
+ },
+ {
+ "epoch": 0.06099290780141844,
+ "grad_norm": 3.2211520671844482,
+ "learning_rate": 4.999934543557392e-06,
+ "loss": 0.8888,
+ "step": 129
+ },
+ {
+ "epoch": 0.061465721040189124,
+ "grad_norm": 3.2466187477111816,
+ "learning_rate": 4.999929951509735e-06,
+ "loss": 0.9389,
+ "step": 130
+ },
+ {
+ "epoch": 0.06193853427895981,
+ "grad_norm": 3.3574399948120117,
+ "learning_rate": 4.999925203802403e-06,
+ "loss": 0.8263,
+ "step": 131
+ },
+ {
+ "epoch": 0.062411347517730496,
+ "grad_norm": 3.275601625442505,
+ "learning_rate": 4.99992030043569e-06,
+ "loss": 0.8338,
+ "step": 132
+ },
+ {
+ "epoch": 0.06288416075650118,
+ "grad_norm": 3.6011312007904053,
+ "learning_rate": 4.999915241409902e-06,
+ "loss": 0.8351,
+ "step": 133
+ },
+ {
+ "epoch": 0.06335697399527186,
+ "grad_norm": 2.969011068344116,
+ "learning_rate": 4.999910026725352e-06,
+ "loss": 0.79,
+ "step": 134
+ },
+ {
+ "epoch": 0.06382978723404255,
+ "grad_norm": 3.690784454345703,
+ "learning_rate": 4.999904656382369e-06,
+ "loss": 0.8209,
+ "step": 135
+ },
+ {
+ "epoch": 0.06430260047281323,
+ "grad_norm": 3.3363115787506104,
+ "learning_rate": 4.999899130381283e-06,
+ "loss": 0.858,
+ "step": 136
+ },
+ {
+ "epoch": 0.06477541371158392,
+ "grad_norm": 3.206881523132324,
+ "learning_rate": 4.9998934487224405e-06,
+ "loss": 0.834,
+ "step": 137
+ },
+ {
+ "epoch": 0.06524822695035461,
+ "grad_norm": 2.773146152496338,
+ "learning_rate": 4.999887611406195e-06,
+ "loss": 0.7576,
+ "step": 138
+ },
+ {
+ "epoch": 0.0657210401891253,
+ "grad_norm": 3.307725667953491,
+ "learning_rate": 4.999881618432908e-06,
+ "loss": 0.7487,
+ "step": 139
+ },
+ {
+ "epoch": 0.06619385342789598,
+ "grad_norm": 4.273657321929932,
+ "learning_rate": 4.999875469802956e-06,
+ "loss": 0.8176,
+ "step": 140
+ },
+ {
+ "epoch": 0.06666666666666667,
+ "grad_norm": 3.0898005962371826,
+ "learning_rate": 4.999869165516719e-06,
+ "loss": 0.7578,
+ "step": 141
+ },
+ {
+ "epoch": 0.06713947990543735,
+ "grad_norm": 3.25150990486145,
+ "learning_rate": 4.9998627055745915e-06,
+ "loss": 0.7873,
+ "step": 142
+ },
+ {
+ "epoch": 0.06761229314420804,
+ "grad_norm": 2.9705755710601807,
+ "learning_rate": 4.999856089976974e-06,
+ "loss": 0.6473,
+ "step": 143
+ },
+ {
+ "epoch": 0.06808510638297872,
+ "grad_norm": 3.5658507347106934,
+ "learning_rate": 4.9998493187242804e-06,
+ "loss": 0.855,
+ "step": 144
+ },
+ {
+ "epoch": 0.06855791962174941,
+ "grad_norm": 3.3994076251983643,
+ "learning_rate": 4.99984239181693e-06,
+ "loss": 0.7926,
+ "step": 145
+ },
+ {
+ "epoch": 0.0690307328605201,
+ "grad_norm": 2.8266260623931885,
+ "learning_rate": 4.999835309255357e-06,
+ "loss": 0.7564,
+ "step": 146
+ },
+ {
+ "epoch": 0.06950354609929078,
+ "grad_norm": 3.1143875122070312,
+ "learning_rate": 4.999828071039999e-06,
+ "loss": 0.8398,
+ "step": 147
+ },
+ {
+ "epoch": 0.06997635933806147,
+ "grad_norm": 2.9364278316497803,
+ "learning_rate": 4.99982067717131e-06,
+ "loss": 0.7381,
+ "step": 148
+ },
+ {
+ "epoch": 0.07044917257683216,
+ "grad_norm": 3.4155616760253906,
+ "learning_rate": 4.999813127649748e-06,
+ "loss": 0.7933,
+ "step": 149
+ },
+ {
+ "epoch": 0.07092198581560284,
+ "grad_norm": 4.371236324310303,
+ "learning_rate": 4.999805422475784e-06,
+ "loss": 0.8292,
+ "step": 150
+ },
+ {
+ "epoch": 0.07139479905437353,
+ "grad_norm": 3.3967185020446777,
+ "learning_rate": 4.999797561649897e-06,
+ "loss": 0.8712,
+ "step": 151
+ },
+ {
+ "epoch": 0.07186761229314421,
+ "grad_norm": 3.343303680419922,
+ "learning_rate": 4.999789545172578e-06,
+ "loss": 0.8177,
+ "step": 152
+ },
+ {
+ "epoch": 0.07234042553191489,
+ "grad_norm": 3.040235757827759,
+ "learning_rate": 4.999781373044325e-06,
+ "loss": 0.7379,
+ "step": 153
+ },
+ {
+ "epoch": 0.07281323877068557,
+ "grad_norm": 3.4069204330444336,
+ "learning_rate": 4.999773045265647e-06,
+ "loss": 0.7939,
+ "step": 154
+ },
+ {
+ "epoch": 0.07328605200945626,
+ "grad_norm": 3.1939475536346436,
+ "learning_rate": 4.999764561837063e-06,
+ "loss": 0.8037,
+ "step": 155
+ },
+ {
+ "epoch": 0.07375886524822695,
+ "grad_norm": 4.452004909515381,
+ "learning_rate": 4.999755922759101e-06,
+ "loss": 0.8421,
+ "step": 156
+ },
+ {
+ "epoch": 0.07423167848699763,
+ "grad_norm": 3.2031240463256836,
+ "learning_rate": 4.999747128032298e-06,
+ "loss": 0.794,
+ "step": 157
+ },
+ {
+ "epoch": 0.07470449172576832,
+ "grad_norm": 3.175920009613037,
+ "learning_rate": 4.999738177657203e-06,
+ "loss": 0.759,
+ "step": 158
+ },
+ {
+ "epoch": 0.075177304964539,
+ "grad_norm": 3.7679688930511475,
+ "learning_rate": 4.9997290716343725e-06,
+ "loss": 0.8174,
+ "step": 159
+ },
+ {
+ "epoch": 0.07565011820330969,
+ "grad_norm": 3.7020037174224854,
+ "learning_rate": 4.999719809964373e-06,
+ "loss": 0.7116,
+ "step": 160
+ },
+ {
+ "epoch": 0.07612293144208038,
+ "grad_norm": 4.357471942901611,
+ "learning_rate": 4.999710392647783e-06,
+ "loss": 0.7649,
+ "step": 161
+ },
+ {
+ "epoch": 0.07659574468085106,
+ "grad_norm": 3.3439087867736816,
+ "learning_rate": 4.999700819685187e-06,
+ "loss": 0.7907,
+ "step": 162
+ },
+ {
+ "epoch": 0.07706855791962175,
+ "grad_norm": 3.210815191268921,
+ "learning_rate": 4.999691091077182e-06,
+ "loss": 0.8446,
+ "step": 163
+ },
+ {
+ "epoch": 0.07754137115839244,
+ "grad_norm": 3.1029553413391113,
+ "learning_rate": 4.9996812068243735e-06,
+ "loss": 0.7232,
+ "step": 164
+ },
+ {
+ "epoch": 0.07801418439716312,
+ "grad_norm": 2.9389400482177734,
+ "learning_rate": 4.999671166927378e-06,
+ "loss": 0.7413,
+ "step": 165
+ },
+ {
+ "epoch": 0.07848699763593381,
+ "grad_norm": 3.7062697410583496,
+ "learning_rate": 4.9996609713868185e-06,
+ "loss": 0.8773,
+ "step": 166
+ },
+ {
+ "epoch": 0.0789598108747045,
+ "grad_norm": 3.2768924236297607,
+ "learning_rate": 4.999650620203332e-06,
+ "loss": 0.8046,
+ "step": 167
+ },
+ {
+ "epoch": 0.07943262411347518,
+ "grad_norm": 3.380373001098633,
+ "learning_rate": 4.999640113377561e-06,
+ "loss": 0.7529,
+ "step": 168
+ },
+ {
+ "epoch": 0.07990543735224587,
+ "grad_norm": 3.520022392272949,
+ "learning_rate": 4.999629450910162e-06,
+ "loss": 0.7352,
+ "step": 169
+ },
+ {
+ "epoch": 0.08037825059101655,
+ "grad_norm": 3.43269419670105,
+ "learning_rate": 4.999618632801796e-06,
+ "loss": 0.9371,
+ "step": 170
+ },
+ {
+ "epoch": 0.08085106382978724,
+ "grad_norm": 3.555877923965454,
+ "learning_rate": 4.99960765905314e-06,
+ "loss": 0.8276,
+ "step": 171
+ },
+ {
+ "epoch": 0.08132387706855793,
+ "grad_norm": 3.597050189971924,
+ "learning_rate": 4.999596529664874e-06,
+ "loss": 0.8164,
+ "step": 172
+ },
+ {
+ "epoch": 0.0817966903073286,
+ "grad_norm": 3.2002956867218018,
+ "learning_rate": 4.999585244637693e-06,
+ "loss": 0.7824,
+ "step": 173
+ },
+ {
+ "epoch": 0.08226950354609928,
+ "grad_norm": 3.527275562286377,
+ "learning_rate": 4.999573803972299e-06,
+ "loss": 0.8033,
+ "step": 174
+ },
+ {
+ "epoch": 0.08274231678486997,
+ "grad_norm": 3.5184452533721924,
+ "learning_rate": 4.999562207669405e-06,
+ "loss": 0.724,
+ "step": 175
+ },
+ {
+ "epoch": 0.08321513002364066,
+ "grad_norm": 3.6635067462921143,
+ "learning_rate": 4.999550455729732e-06,
+ "loss": 0.819,
+ "step": 176
+ },
+ {
+ "epoch": 0.08368794326241134,
+ "grad_norm": 3.192399740219116,
+ "learning_rate": 4.999538548154012e-06,
+ "loss": 0.7999,
+ "step": 177
+ },
+ {
+ "epoch": 0.08416075650118203,
+ "grad_norm": 3.0946953296661377,
+ "learning_rate": 4.999526484942988e-06,
+ "loss": 0.7367,
+ "step": 178
+ },
+ {
+ "epoch": 0.08463356973995272,
+ "grad_norm": 2.847198009490967,
+ "learning_rate": 4.99951426609741e-06,
+ "loss": 0.7536,
+ "step": 179
+ },
+ {
+ "epoch": 0.0851063829787234,
+ "grad_norm": 2.7674827575683594,
+ "learning_rate": 4.999501891618037e-06,
+ "loss": 0.701,
+ "step": 180
+ },
+ {
+ "epoch": 0.08557919621749409,
+ "grad_norm": 3.357933521270752,
+ "learning_rate": 4.999489361505643e-06,
+ "loss": 0.8331,
+ "step": 181
+ },
+ {
+ "epoch": 0.08605200945626477,
+ "grad_norm": 3.1464426517486572,
+ "learning_rate": 4.999476675761004e-06,
+ "loss": 0.7931,
+ "step": 182
+ },
+ {
+ "epoch": 0.08652482269503546,
+ "grad_norm": 3.310697078704834,
+ "learning_rate": 4.999463834384915e-06,
+ "loss": 0.753,
+ "step": 183
+ },
+ {
+ "epoch": 0.08699763593380615,
+ "grad_norm": 2.9794881343841553,
+ "learning_rate": 4.999450837378171e-06,
+ "loss": 0.7091,
+ "step": 184
+ },
+ {
+ "epoch": 0.08747044917257683,
+ "grad_norm": 3.0776889324188232,
+ "learning_rate": 4.999437684741584e-06,
+ "loss": 0.7226,
+ "step": 185
+ },
+ {
+ "epoch": 0.08794326241134752,
+ "grad_norm": 3.6657519340515137,
+ "learning_rate": 4.999424376475972e-06,
+ "loss": 0.845,
+ "step": 186
+ },
+ {
+ "epoch": 0.0884160756501182,
+ "grad_norm": 3.872718572616577,
+ "learning_rate": 4.999410912582164e-06,
+ "loss": 0.812,
+ "step": 187
+ },
+ {
+ "epoch": 0.08888888888888889,
+ "grad_norm": 2.9184508323669434,
+ "learning_rate": 4.9993972930609976e-06,
+ "loss": 0.6823,
+ "step": 188
+ },
+ {
+ "epoch": 0.08936170212765958,
+ "grad_norm": 3.5567142963409424,
+ "learning_rate": 4.999383517913321e-06,
+ "loss": 0.7614,
+ "step": 189
+ },
+ {
+ "epoch": 0.08983451536643026,
+ "grad_norm": 3.3688533306121826,
+ "learning_rate": 4.999369587139992e-06,
+ "loss": 0.858,
+ "step": 190
+ },
+ {
+ "epoch": 0.09030732860520095,
+ "grad_norm": 2.893223524093628,
+ "learning_rate": 4.99935550074188e-06,
+ "loss": 0.6761,
+ "step": 191
+ },
+ {
+ "epoch": 0.09078014184397164,
+ "grad_norm": 3.400225877761841,
+ "learning_rate": 4.999341258719859e-06,
+ "loss": 0.7531,
+ "step": 192
+ },
+ {
+ "epoch": 0.09125295508274232,
+ "grad_norm": 3.6167714595794678,
+ "learning_rate": 4.999326861074817e-06,
+ "loss": 0.8164,
+ "step": 193
+ },
+ {
+ "epoch": 0.091725768321513,
+ "grad_norm": 4.325016498565674,
+ "learning_rate": 4.9993123078076506e-06,
+ "loss": 0.7069,
+ "step": 194
+ },
+ {
+ "epoch": 0.09219858156028368,
+ "grad_norm": 3.195317029953003,
+ "learning_rate": 4.999297598919266e-06,
+ "loss": 0.726,
+ "step": 195
+ },
+ {
+ "epoch": 0.09267139479905437,
+ "grad_norm": 3.146530866622925,
+ "learning_rate": 4.999282734410579e-06,
+ "loss": 0.7888,
+ "step": 196
+ },
+ {
+ "epoch": 0.09314420803782505,
+ "grad_norm": 3.5166752338409424,
+ "learning_rate": 4.999267714282515e-06,
+ "loss": 0.8473,
+ "step": 197
+ },
+ {
+ "epoch": 0.09361702127659574,
+ "grad_norm": 3.3140196800231934,
+ "learning_rate": 4.99925253853601e-06,
+ "loss": 0.7233,
+ "step": 198
+ },
+ {
+ "epoch": 0.09408983451536643,
+ "grad_norm": 3.0318164825439453,
+ "learning_rate": 4.999237207172008e-06,
+ "loss": 0.7543,
+ "step": 199
+ },
+ {
+ "epoch": 0.09456264775413711,
+ "grad_norm": 3.662214756011963,
+ "learning_rate": 4.999221720191464e-06,
+ "loss": 0.7783,
+ "step": 200
+ },
+ {
+ "epoch": 0.0950354609929078,
+ "grad_norm": 3.452078104019165,
+ "learning_rate": 4.9992060775953425e-06,
+ "loss": 0.7868,
+ "step": 201
+ },
+ {
+ "epoch": 0.09550827423167849,
+ "grad_norm": 3.4051287174224854,
+ "learning_rate": 4.999190279384617e-06,
+ "loss": 0.7849,
+ "step": 202
+ },
+ {
+ "epoch": 0.09598108747044917,
+ "grad_norm": 3.1377196311950684,
+ "learning_rate": 4.999174325560271e-06,
+ "loss": 0.8364,
+ "step": 203
+ },
+ {
+ "epoch": 0.09645390070921986,
+ "grad_norm": 3.129473924636841,
+ "learning_rate": 4.999158216123299e-06,
+ "loss": 0.7458,
+ "step": 204
+ },
+ {
+ "epoch": 0.09692671394799054,
+ "grad_norm": 3.169548749923706,
+ "learning_rate": 4.999141951074703e-06,
+ "loss": 0.7256,
+ "step": 205
+ },
+ {
+ "epoch": 0.09739952718676123,
+ "grad_norm": 3.186009168624878,
+ "learning_rate": 4.999125530415495e-06,
+ "loss": 0.783,
+ "step": 206
+ },
+ {
+ "epoch": 0.09787234042553192,
+ "grad_norm": 3.0995123386383057,
+ "learning_rate": 4.9991089541467e-06,
+ "loss": 0.7519,
+ "step": 207
+ },
+ {
+ "epoch": 0.0983451536643026,
+ "grad_norm": 3.1854088306427,
+ "learning_rate": 4.999092222269348e-06,
+ "loss": 0.7444,
+ "step": 208
+ },
+ {
+ "epoch": 0.09881796690307329,
+ "grad_norm": 3.1512246131896973,
+ "learning_rate": 4.999075334784482e-06,
+ "loss": 0.7882,
+ "step": 209
+ },
+ {
+ "epoch": 0.09929078014184398,
+ "grad_norm": 3.6199698448181152,
+ "learning_rate": 4.999058291693153e-06,
+ "loss": 0.8048,
+ "step": 210
+ },
+ {
+ "epoch": 0.09976359338061466,
+ "grad_norm": 2.956907272338867,
+ "learning_rate": 4.999041092996422e-06,
+ "loss": 0.7663,
+ "step": 211
+ },
+ {
+ "epoch": 0.10023640661938535,
+ "grad_norm": 3.3493971824645996,
+ "learning_rate": 4.99902373869536e-06,
+ "loss": 0.7639,
+ "step": 212
+ },
+ {
+ "epoch": 0.10070921985815603,
+ "grad_norm": 3.144812822341919,
+ "learning_rate": 4.9990062287910475e-06,
+ "loss": 0.7953,
+ "step": 213
+ },
+ {
+ "epoch": 0.10118203309692671,
+ "grad_norm": 3.5986971855163574,
+ "learning_rate": 4.998988563284576e-06,
+ "loss": 0.8297,
+ "step": 214
+ },
+ {
+ "epoch": 0.1016548463356974,
+ "grad_norm": 3.447584867477417,
+ "learning_rate": 4.998970742177044e-06,
+ "loss": 0.808,
+ "step": 215
+ },
+ {
+ "epoch": 0.10212765957446808,
+ "grad_norm": 3.791353940963745,
+ "learning_rate": 4.998952765469562e-06,
+ "loss": 0.8005,
+ "step": 216
+ },
+ {
+ "epoch": 0.10260047281323877,
+ "grad_norm": 3.4490807056427,
+ "learning_rate": 4.998934633163247e-06,
+ "loss": 0.8135,
+ "step": 217
+ },
+ {
+ "epoch": 0.10307328605200945,
+ "grad_norm": 3.1053314208984375,
+ "learning_rate": 4.998916345259232e-06,
+ "loss": 0.7888,
+ "step": 218
+ },
+ {
+ "epoch": 0.10354609929078014,
+ "grad_norm": 3.407862663269043,
+ "learning_rate": 4.9988979017586514e-06,
+ "loss": 0.7099,
+ "step": 219
+ },
+ {
+ "epoch": 0.10401891252955082,
+ "grad_norm": 3.116656541824341,
+ "learning_rate": 4.998879302662658e-06,
+ "loss": 0.8344,
+ "step": 220
+ },
+ {
+ "epoch": 0.10449172576832151,
+ "grad_norm": 3.339264154434204,
+ "learning_rate": 4.998860547972406e-06,
+ "loss": 0.8496,
+ "step": 221
+ },
+ {
+ "epoch": 0.1049645390070922,
+ "grad_norm": 3.251892566680908,
+ "learning_rate": 4.998841637689066e-06,
+ "loss": 0.7455,
+ "step": 222
+ },
+ {
+ "epoch": 0.10543735224586288,
+ "grad_norm": 4.098135471343994,
+ "learning_rate": 4.998822571813814e-06,
+ "loss": 0.7772,
+ "step": 223
+ },
+ {
+ "epoch": 0.10591016548463357,
+ "grad_norm": 3.9871134757995605,
+ "learning_rate": 4.998803350347837e-06,
+ "loss": 0.8261,
+ "step": 224
+ },
+ {
+ "epoch": 0.10638297872340426,
+ "grad_norm": 3.2822303771972656,
+ "learning_rate": 4.998783973292333e-06,
+ "loss": 0.8623,
+ "step": 225
+ },
+ {
+ "epoch": 0.10685579196217494,
+ "grad_norm": 3.0356857776641846,
+ "learning_rate": 4.998764440648507e-06,
+ "loss": 0.7426,
+ "step": 226
+ },
+ {
+ "epoch": 0.10732860520094563,
+ "grad_norm": 2.8932785987854004,
+ "learning_rate": 4.998744752417576e-06,
+ "loss": 0.6741,
+ "step": 227
+ },
+ {
+ "epoch": 0.10780141843971631,
+ "grad_norm": 3.085820436477661,
+ "learning_rate": 4.998724908600767e-06,
+ "loss": 0.6549,
+ "step": 228
+ },
+ {
+ "epoch": 0.108274231678487,
+ "grad_norm": 3.135829210281372,
+ "learning_rate": 4.998704909199314e-06,
+ "loss": 0.6702,
+ "step": 229
+ },
+ {
+ "epoch": 0.10874704491725769,
+ "grad_norm": 5.016134262084961,
+ "learning_rate": 4.9986847542144625e-06,
+ "loss": 0.7852,
+ "step": 230
+ },
+ {
+ "epoch": 0.10921985815602837,
+ "grad_norm": 3.9056200981140137,
+ "learning_rate": 4.998664443647468e-06,
+ "loss": 0.9654,
+ "step": 231
+ },
+ {
+ "epoch": 0.10969267139479906,
+ "grad_norm": 3.0880749225616455,
+ "learning_rate": 4.998643977499595e-06,
+ "loss": 0.7579,
+ "step": 232
+ },
+ {
+ "epoch": 0.11016548463356975,
+ "grad_norm": 3.6893601417541504,
+ "learning_rate": 4.998623355772118e-06,
+ "loss": 0.713,
+ "step": 233
+ },
+ {
+ "epoch": 0.11063829787234042,
+ "grad_norm": 4.181536674499512,
+ "learning_rate": 4.998602578466319e-06,
+ "loss": 0.7331,
+ "step": 234
+ },
+ {
+ "epoch": 0.1111111111111111,
+ "grad_norm": 3.036386728286743,
+ "learning_rate": 4.998581645583496e-06,
+ "loss": 0.7115,
+ "step": 235
+ },
+ {
+ "epoch": 0.11158392434988179,
+ "grad_norm": 3.6333255767822266,
+ "learning_rate": 4.998560557124948e-06,
+ "loss": 0.7544,
+ "step": 236
+ },
+ {
+ "epoch": 0.11205673758865248,
+ "grad_norm": 2.926417827606201,
+ "learning_rate": 4.9985393130919915e-06,
+ "loss": 0.715,
+ "step": 237
+ },
+ {
+ "epoch": 0.11252955082742316,
+ "grad_norm": 2.969158172607422,
+ "learning_rate": 4.998517913485946e-06,
+ "loss": 0.7304,
+ "step": 238
+ },
+ {
+ "epoch": 0.11300236406619385,
+ "grad_norm": 3.5254971981048584,
+ "learning_rate": 4.9984963583081466e-06,
+ "loss": 0.7725,
+ "step": 239
+ },
+ {
+ "epoch": 0.11347517730496454,
+ "grad_norm": 3.7840335369110107,
+ "learning_rate": 4.998474647559936e-06,
+ "loss": 0.8685,
+ "step": 240
+ },
+ {
+ "epoch": 0.11394799054373522,
+ "grad_norm": 3.0333125591278076,
+ "learning_rate": 4.9984527812426625e-06,
+ "loss": 0.7793,
+ "step": 241
+ },
+ {
+ "epoch": 0.11442080378250591,
+ "grad_norm": 3.290159225463867,
+ "learning_rate": 4.99843075935769e-06,
+ "loss": 0.7158,
+ "step": 242
+ },
+ {
+ "epoch": 0.1148936170212766,
+ "grad_norm": 3.3935494422912598,
+ "learning_rate": 4.99840858190639e-06,
+ "loss": 0.7643,
+ "step": 243
+ },
+ {
+ "epoch": 0.11536643026004728,
+ "grad_norm": 3.333965539932251,
+ "learning_rate": 4.998386248890142e-06,
+ "loss": 0.7255,
+ "step": 244
+ },
+ {
+ "epoch": 0.11583924349881797,
+ "grad_norm": 2.8129613399505615,
+ "learning_rate": 4.998363760310339e-06,
+ "loss": 0.768,
+ "step": 245
+ },
+ {
+ "epoch": 0.11631205673758865,
+ "grad_norm": 2.8678107261657715,
+ "learning_rate": 4.998341116168378e-06,
+ "loss": 0.7403,
+ "step": 246
+ },
+ {
+ "epoch": 0.11678486997635934,
+ "grad_norm": 2.8898239135742188,
+ "learning_rate": 4.998318316465672e-06,
+ "loss": 0.6844,
+ "step": 247
+ },
+ {
+ "epoch": 0.11725768321513003,
+ "grad_norm": 3.139777898788452,
+ "learning_rate": 4.998295361203637e-06,
+ "loss": 0.7936,
+ "step": 248
+ },
+ {
+ "epoch": 0.11773049645390071,
+ "grad_norm": 3.393721103668213,
+ "learning_rate": 4.998272250383707e-06,
+ "loss": 0.8173,
+ "step": 249
+ },
+ {
+ "epoch": 0.1182033096926714,
+ "grad_norm": 3.240973949432373,
+ "learning_rate": 4.998248984007318e-06,
+ "loss": 0.8252,
+ "step": 250
+ },
+ {
+ "epoch": 0.11867612293144209,
+ "grad_norm": 3.384855031967163,
+ "learning_rate": 4.998225562075918e-06,
+ "loss": 0.7244,
+ "step": 251
+ },
+ {
+ "epoch": 0.11914893617021277,
+ "grad_norm": 3.1881816387176514,
+ "learning_rate": 4.9982019845909675e-06,
+ "loss": 0.6818,
+ "step": 252
+ },
+ {
+ "epoch": 0.11962174940898346,
+ "grad_norm": 2.888364553451538,
+ "learning_rate": 4.998178251553934e-06,
+ "loss": 0.6753,
+ "step": 253
+ },
+ {
+ "epoch": 0.12009456264775414,
+ "grad_norm": 3.630093812942505,
+ "learning_rate": 4.9981543629662944e-06,
+ "loss": 0.7995,
+ "step": 254
+ },
+ {
+ "epoch": 0.12056737588652482,
+ "grad_norm": 2.9820947647094727,
+ "learning_rate": 4.998130318829537e-06,
+ "loss": 0.7478,
+ "step": 255
+ },
+ {
+ "epoch": 0.1210401891252955,
+ "grad_norm": 2.7094738483428955,
+ "learning_rate": 4.998106119145159e-06,
+ "loss": 0.7237,
+ "step": 256
+ },
+ {
+ "epoch": 0.12151300236406619,
+ "grad_norm": 3.1808104515075684,
+ "learning_rate": 4.9980817639146665e-06,
+ "loss": 0.7915,
+ "step": 257
+ },
+ {
+ "epoch": 0.12198581560283688,
+ "grad_norm": 3.1661291122436523,
+ "learning_rate": 4.998057253139575e-06,
+ "loss": 0.8053,
+ "step": 258
+ },
+ {
+ "epoch": 0.12245862884160756,
+ "grad_norm": 3.528749942779541,
+ "learning_rate": 4.998032586821413e-06,
+ "loss": 0.7946,
+ "step": 259
+ },
+ {
+ "epoch": 0.12293144208037825,
+ "grad_norm": 3.125964879989624,
+ "learning_rate": 4.998007764961716e-06,
+ "loss": 0.7569,
+ "step": 260
+ },
+ {
+ "epoch": 0.12340425531914893,
+ "grad_norm": 3.0778942108154297,
+ "learning_rate": 4.997982787562029e-06,
+ "loss": 0.7184,
+ "step": 261
+ },
+ {
+ "epoch": 0.12387706855791962,
+ "grad_norm": 3.3531930446624756,
+ "learning_rate": 4.997957654623906e-06,
+ "loss": 0.7586,
+ "step": 262
+ },
+ {
+ "epoch": 0.1243498817966903,
+ "grad_norm": 3.229278564453125,
+ "learning_rate": 4.997932366148913e-06,
+ "loss": 0.6092,
+ "step": 263
+ },
+ {
+ "epoch": 0.12482269503546099,
+ "grad_norm": 3.7286155223846436,
+ "learning_rate": 4.997906922138626e-06,
+ "loss": 0.7965,
+ "step": 264
+ },
+ {
+ "epoch": 0.12529550827423167,
+ "grad_norm": 3.300311803817749,
+ "learning_rate": 4.997881322594628e-06,
+ "loss": 0.7665,
+ "step": 265
+ },
+ {
+ "epoch": 0.12576832151300235,
+ "grad_norm": 3.411482572555542,
+ "learning_rate": 4.9978555675185115e-06,
+ "loss": 0.7253,
+ "step": 266
+ },
+ {
+ "epoch": 0.12624113475177304,
+ "grad_norm": 3.0884511470794678,
+ "learning_rate": 4.9978296569118825e-06,
+ "loss": 0.659,
+ "step": 267
+ },
+ {
+ "epoch": 0.12671394799054372,
+ "grad_norm": 3.0652925968170166,
+ "learning_rate": 4.9978035907763535e-06,
+ "loss": 0.6739,
+ "step": 268
+ },
+ {
+ "epoch": 0.1271867612293144,
+ "grad_norm": 3.280555009841919,
+ "learning_rate": 4.997777369113547e-06,
+ "loss": 0.8003,
+ "step": 269
+ },
+ {
+ "epoch": 0.1276595744680851,
+ "grad_norm": 2.980860948562622,
+ "learning_rate": 4.997750991925096e-06,
+ "loss": 0.7097,
+ "step": 270
+ },
+ {
+ "epoch": 0.12813238770685578,
+ "grad_norm": 3.301760673522949,
+ "learning_rate": 4.997724459212644e-06,
+ "loss": 0.7894,
+ "step": 271
+ },
+ {
+ "epoch": 0.12860520094562647,
+ "grad_norm": 2.9584903717041016,
+ "learning_rate": 4.997697770977841e-06,
+ "loss": 0.733,
+ "step": 272
+ },
+ {
+ "epoch": 0.12907801418439716,
+ "grad_norm": 3.5632214546203613,
+ "learning_rate": 4.99767092722235e-06,
+ "loss": 0.7228,
+ "step": 273
+ },
+ {
+ "epoch": 0.12955082742316784,
+ "grad_norm": 3.5900983810424805,
+ "learning_rate": 4.997643927947843e-06,
+ "loss": 0.7634,
+ "step": 274
+ },
+ {
+ "epoch": 0.13002364066193853,
+ "grad_norm": 3.332650661468506,
+ "learning_rate": 4.997616773156e-06,
+ "loss": 0.797,
+ "step": 275
+ },
+ {
+ "epoch": 0.13049645390070921,
+ "grad_norm": 3.1094167232513428,
+ "learning_rate": 4.997589462848512e-06,
+ "loss": 0.7849,
+ "step": 276
+ },
+ {
+ "epoch": 0.1309692671394799,
+ "grad_norm": 3.5359463691711426,
+ "learning_rate": 4.99756199702708e-06,
+ "loss": 0.6871,
+ "step": 277
+ },
+ {
+ "epoch": 0.1314420803782506,
+ "grad_norm": 3.190441846847534,
+ "learning_rate": 4.997534375693414e-06,
+ "loss": 0.6883,
+ "step": 278
+ },
+ {
+ "epoch": 0.13191489361702127,
+ "grad_norm": 3.063518762588501,
+ "learning_rate": 4.997506598849234e-06,
+ "loss": 0.7586,
+ "step": 279
+ },
+ {
+ "epoch": 0.13238770685579196,
+ "grad_norm": 3.4112050533294678,
+ "learning_rate": 4.997478666496269e-06,
+ "loss": 0.796,
+ "step": 280
+ },
+ {
+ "epoch": 0.13286052009456265,
+ "grad_norm": 3.231886386871338,
+ "learning_rate": 4.997450578636259e-06,
+ "loss": 0.7714,
+ "step": 281
+ },
+ {
+ "epoch": 0.13333333333333333,
+ "grad_norm": 3.279425621032715,
+ "learning_rate": 4.9974223352709515e-06,
+ "loss": 0.7793,
+ "step": 282
+ },
+ {
+ "epoch": 0.13380614657210402,
+ "grad_norm": 3.2154316902160645,
+ "learning_rate": 4.9973939364021075e-06,
+ "loss": 0.791,
+ "step": 283
+ },
+ {
+ "epoch": 0.1342789598108747,
+ "grad_norm": 3.2090768814086914,
+ "learning_rate": 4.9973653820314925e-06,
+ "loss": 0.6433,
+ "step": 284
+ },
+ {
+ "epoch": 0.1347517730496454,
+ "grad_norm": 3.1712026596069336,
+ "learning_rate": 4.997336672160886e-06,
+ "loss": 0.8128,
+ "step": 285
+ },
+ {
+ "epoch": 0.13522458628841608,
+ "grad_norm": 2.929229497909546,
+ "learning_rate": 4.997307806792076e-06,
+ "loss": 0.7594,
+ "step": 286
+ },
+ {
+ "epoch": 0.13569739952718676,
+ "grad_norm": 3.0363314151763916,
+ "learning_rate": 4.997278785926859e-06,
+ "loss": 0.7336,
+ "step": 287
+ },
+ {
+ "epoch": 0.13617021276595745,
+ "grad_norm": 3.1352357864379883,
+ "learning_rate": 4.997249609567042e-06,
+ "loss": 0.7225,
+ "step": 288
+ },
+ {
+ "epoch": 0.13664302600472814,
+ "grad_norm": 3.3171157836914062,
+ "learning_rate": 4.997220277714442e-06,
+ "loss": 0.7777,
+ "step": 289
+ },
+ {
+ "epoch": 0.13711583924349882,
+ "grad_norm": 3.050717353820801,
+ "learning_rate": 4.997190790370885e-06,
+ "loss": 0.6836,
+ "step": 290
+ },
+ {
+ "epoch": 0.1375886524822695,
+ "grad_norm": 3.0297694206237793,
+ "learning_rate": 4.997161147538208e-06,
+ "loss": 0.6883,
+ "step": 291
+ },
+ {
+ "epoch": 0.1380614657210402,
+ "grad_norm": 3.0566554069519043,
+ "learning_rate": 4.997131349218256e-06,
+ "loss": 0.6674,
+ "step": 292
+ },
+ {
+ "epoch": 0.13853427895981088,
+ "grad_norm": 3.799111843109131,
+ "learning_rate": 4.997101395412885e-06,
+ "loss": 0.8256,
+ "step": 293
+ },
+ {
+ "epoch": 0.13900709219858157,
+ "grad_norm": 3.1394248008728027,
+ "learning_rate": 4.9970712861239576e-06,
+ "loss": 0.7306,
+ "step": 294
+ },
+ {
+ "epoch": 0.13947990543735225,
+ "grad_norm": 3.0605666637420654,
+ "learning_rate": 4.997041021353352e-06,
+ "loss": 0.7212,
+ "step": 295
+ },
+ {
+ "epoch": 0.13995271867612294,
+ "grad_norm": 3.8813397884368896,
+ "learning_rate": 4.997010601102951e-06,
+ "loss": 0.769,
+ "step": 296
+ },
+ {
+ "epoch": 0.14042553191489363,
+ "grad_norm": 3.0514819622039795,
+ "learning_rate": 4.996980025374649e-06,
+ "loss": 0.7422,
+ "step": 297
+ },
+ {
+ "epoch": 0.1408983451536643,
+ "grad_norm": 2.9544146060943604,
+ "learning_rate": 4.99694929417035e-06,
+ "loss": 0.6912,
+ "step": 298
+ },
+ {
+ "epoch": 0.141371158392435,
+ "grad_norm": 3.2635602951049805,
+ "learning_rate": 4.996918407491966e-06,
+ "loss": 0.7395,
+ "step": 299
+ },
+ {
+ "epoch": 0.14184397163120568,
+ "grad_norm": 3.373882532119751,
+ "learning_rate": 4.996887365341423e-06,
+ "loss": 0.7799,
+ "step": 300
+ },
+ {
+ "epoch": 0.14231678486997637,
+ "grad_norm": 3.001128673553467,
+ "learning_rate": 4.996856167720652e-06,
+ "loss": 0.7168,
+ "step": 301
+ },
+ {
+ "epoch": 0.14278959810874706,
+ "grad_norm": 3.1026835441589355,
+ "learning_rate": 4.996824814631595e-06,
+ "loss": 0.7492,
+ "step": 302
+ },
+ {
+ "epoch": 0.14326241134751774,
+ "grad_norm": 3.41947603225708,
+ "learning_rate": 4.996793306076205e-06,
+ "loss": 0.6659,
+ "step": 303
+ },
+ {
+ "epoch": 0.14373522458628843,
+ "grad_norm": 3.2272400856018066,
+ "learning_rate": 4.996761642056444e-06,
+ "loss": 0.7184,
+ "step": 304
+ },
+ {
+ "epoch": 0.14420803782505912,
+ "grad_norm": 2.9488935470581055,
+ "learning_rate": 4.996729822574284e-06,
+ "loss": 0.7451,
+ "step": 305
+ },
+ {
+ "epoch": 0.14468085106382977,
+ "grad_norm": 3.268231153488159,
+ "learning_rate": 4.9966978476317065e-06,
+ "loss": 0.7798,
+ "step": 306
+ },
+ {
+ "epoch": 0.14515366430260046,
+ "grad_norm": 3.9086556434631348,
+ "learning_rate": 4.996665717230701e-06,
+ "loss": 0.7871,
+ "step": 307
+ },
+ {
+ "epoch": 0.14562647754137115,
+ "grad_norm": 3.3483879566192627,
+ "learning_rate": 4.996633431373269e-06,
+ "loss": 0.7415,
+ "step": 308
+ },
+ {
+ "epoch": 0.14609929078014183,
+ "grad_norm": 2.839400053024292,
+ "learning_rate": 4.99660099006142e-06,
+ "loss": 0.7192,
+ "step": 309
+ },
+ {
+ "epoch": 0.14657210401891252,
+ "grad_norm": 3.177302598953247,
+ "learning_rate": 4.996568393297175e-06,
+ "loss": 0.755,
+ "step": 310
+ },
+ {
+ "epoch": 0.1470449172576832,
+ "grad_norm": 3.5477044582366943,
+ "learning_rate": 4.996535641082563e-06,
+ "loss": 0.7531,
+ "step": 311
+ },
+ {
+ "epoch": 0.1475177304964539,
+ "grad_norm": 3.418576717376709,
+ "learning_rate": 4.996502733419624e-06,
+ "loss": 0.8009,
+ "step": 312
+ },
+ {
+ "epoch": 0.14799054373522458,
+ "grad_norm": 3.711341619491577,
+ "learning_rate": 4.996469670310407e-06,
+ "loss": 0.7362,
+ "step": 313
+ },
+ {
+ "epoch": 0.14846335697399526,
+ "grad_norm": 3.2419373989105225,
+ "learning_rate": 4.99643645175697e-06,
+ "loss": 0.7761,
+ "step": 314
+ },
+ {
+ "epoch": 0.14893617021276595,
+ "grad_norm": 3.121858835220337,
+ "learning_rate": 4.996403077761381e-06,
+ "loss": 0.6495,
+ "step": 315
+ },
+ {
+ "epoch": 0.14940898345153664,
+ "grad_norm": 3.123054265975952,
+ "learning_rate": 4.996369548325719e-06,
+ "loss": 0.7444,
+ "step": 316
+ },
+ {
+ "epoch": 0.14988179669030732,
+ "grad_norm": 2.780880928039551,
+ "learning_rate": 4.996335863452072e-06,
+ "loss": 0.672,
+ "step": 317
+ },
+ {
+ "epoch": 0.150354609929078,
+ "grad_norm": 3.3738629817962646,
+ "learning_rate": 4.996302023142536e-06,
+ "loss": 0.7972,
+ "step": 318
+ },
+ {
+ "epoch": 0.1508274231678487,
+ "grad_norm": 3.4874777793884277,
+ "learning_rate": 4.99626802739922e-06,
+ "loss": 0.8252,
+ "step": 319
+ },
+ {
+ "epoch": 0.15130023640661938,
+ "grad_norm": 3.7074787616729736,
+ "learning_rate": 4.9962338762242395e-06,
+ "loss": 0.8216,
+ "step": 320
+ },
+ {
+ "epoch": 0.15177304964539007,
+ "grad_norm": 3.281912326812744,
+ "learning_rate": 4.996199569619721e-06,
+ "loss": 0.8175,
+ "step": 321
+ },
+ {
+ "epoch": 0.15224586288416075,
+ "grad_norm": 2.9485340118408203,
+ "learning_rate": 4.996165107587801e-06,
+ "loss": 0.707,
+ "step": 322
+ },
+ {
+ "epoch": 0.15271867612293144,
+ "grad_norm": 3.3757646083831787,
+ "learning_rate": 4.996130490130625e-06,
+ "loss": 0.7955,
+ "step": 323
+ },
+ {
+ "epoch": 0.15319148936170213,
+ "grad_norm": 2.962181568145752,
+ "learning_rate": 4.996095717250349e-06,
+ "loss": 0.7067,
+ "step": 324
+ },
+ {
+ "epoch": 0.1536643026004728,
+ "grad_norm": 3.114272356033325,
+ "learning_rate": 4.996060788949136e-06,
+ "loss": 0.7486,
+ "step": 325
+ },
+ {
+ "epoch": 0.1541371158392435,
+ "grad_norm": 3.0621590614318848,
+ "learning_rate": 4.996025705229165e-06,
+ "loss": 0.6547,
+ "step": 326
+ },
+ {
+ "epoch": 0.15460992907801419,
+ "grad_norm": 2.8745882511138916,
+ "learning_rate": 4.995990466092616e-06,
+ "loss": 0.6435,
+ "step": 327
+ },
+ {
+ "epoch": 0.15508274231678487,
+ "grad_norm": 2.90841007232666,
+ "learning_rate": 4.995955071541686e-06,
+ "loss": 0.7331,
+ "step": 328
+ },
+ {
+ "epoch": 0.15555555555555556,
+ "grad_norm": 2.694580316543579,
+ "learning_rate": 4.9959195215785784e-06,
+ "loss": 0.6731,
+ "step": 329
+ },
+ {
+ "epoch": 0.15602836879432624,
+ "grad_norm": 3.158083438873291,
+ "learning_rate": 4.995883816205507e-06,
+ "loss": 0.7257,
+ "step": 330
+ },
+ {
+ "epoch": 0.15650118203309693,
+ "grad_norm": 3.3234715461730957,
+ "learning_rate": 4.995847955424694e-06,
+ "loss": 0.7389,
+ "step": 331
+ },
+ {
+ "epoch": 0.15697399527186762,
+ "grad_norm": 2.9406495094299316,
+ "learning_rate": 4.995811939238373e-06,
+ "loss": 0.643,
+ "step": 332
+ },
+ {
+ "epoch": 0.1574468085106383,
+ "grad_norm": 3.3191726207733154,
+ "learning_rate": 4.995775767648785e-06,
+ "loss": 0.7879,
+ "step": 333
+ },
+ {
+ "epoch": 0.157919621749409,
+ "grad_norm": 3.711925745010376,
+ "learning_rate": 4.995739440658185e-06,
+ "loss": 0.7586,
+ "step": 334
+ },
+ {
+ "epoch": 0.15839243498817968,
+ "grad_norm": 9.573421478271484,
+ "learning_rate": 4.995702958268833e-06,
+ "loss": 0.7842,
+ "step": 335
+ },
+ {
+ "epoch": 0.15886524822695036,
+ "grad_norm": 3.4154508113861084,
+ "learning_rate": 4.995666320483001e-06,
+ "loss": 0.6735,
+ "step": 336
+ },
+ {
+ "epoch": 0.15933806146572105,
+ "grad_norm": 3.4169859886169434,
+ "learning_rate": 4.995629527302971e-06,
+ "loss": 0.741,
+ "step": 337
+ },
+ {
+ "epoch": 0.15981087470449173,
+ "grad_norm": 3.287503242492676,
+ "learning_rate": 4.9955925787310335e-06,
+ "loss": 0.7139,
+ "step": 338
+ },
+ {
+ "epoch": 0.16028368794326242,
+ "grad_norm": 3.288409471511841,
+ "learning_rate": 4.995555474769488e-06,
+ "loss": 0.7636,
+ "step": 339
+ },
+ {
+ "epoch": 0.1607565011820331,
+ "grad_norm": 2.8021693229675293,
+ "learning_rate": 4.995518215420646e-06,
+ "loss": 0.5883,
+ "step": 340
+ },
+ {
+ "epoch": 0.1612293144208038,
+ "grad_norm": 2.7038564682006836,
+ "learning_rate": 4.995480800686827e-06,
+ "loss": 0.657,
+ "step": 341
+ },
+ {
+ "epoch": 0.16170212765957448,
+ "grad_norm": 3.2370235919952393,
+ "learning_rate": 4.9954432305703615e-06,
+ "loss": 0.6999,
+ "step": 342
+ },
+ {
+ "epoch": 0.16217494089834517,
+ "grad_norm": 2.8666412830352783,
+ "learning_rate": 4.995405505073588e-06,
+ "loss": 0.7199,
+ "step": 343
+ },
+ {
+ "epoch": 0.16264775413711585,
+ "grad_norm": 3.6467232704162598,
+ "learning_rate": 4.995367624198856e-06,
+ "loss": 0.7317,
+ "step": 344
+ },
+ {
+ "epoch": 0.16312056737588654,
+ "grad_norm": 2.7576327323913574,
+ "learning_rate": 4.9953295879485246e-06,
+ "loss": 0.647,
+ "step": 345
+ },
+ {
+ "epoch": 0.1635933806146572,
+ "grad_norm": 2.922232151031494,
+ "learning_rate": 4.995291396324959e-06,
+ "loss": 0.6686,
+ "step": 346
+ },
+ {
+ "epoch": 0.16406619385342788,
+ "grad_norm": 2.8693501949310303,
+ "learning_rate": 4.995253049330542e-06,
+ "loss": 0.6756,
+ "step": 347
+ },
+ {
+ "epoch": 0.16453900709219857,
+ "grad_norm": 3.671865701675415,
+ "learning_rate": 4.995214546967658e-06,
+ "loss": 0.7347,
+ "step": 348
+ },
+ {
+ "epoch": 0.16501182033096926,
+ "grad_norm": 3.024219274520874,
+ "learning_rate": 4.995175889238706e-06,
+ "loss": 0.7547,
+ "step": 349
+ },
+ {
+ "epoch": 0.16548463356973994,
+ "grad_norm": 2.8470778465270996,
+ "learning_rate": 4.995137076146091e-06,
+ "loss": 0.6764,
+ "step": 350
+ },
+ {
+ "epoch": 0.16595744680851063,
+ "grad_norm": 2.905057907104492,
+ "learning_rate": 4.9950981076922324e-06,
+ "loss": 0.6814,
+ "step": 351
+ },
+ {
+ "epoch": 0.16643026004728131,
+ "grad_norm": 3.504377841949463,
+ "learning_rate": 4.995058983879555e-06,
+ "loss": 0.7145,
+ "step": 352
+ },
+ {
+ "epoch": 0.166903073286052,
+ "grad_norm": 3.0029661655426025,
+ "learning_rate": 4.995019704710495e-06,
+ "loss": 0.7114,
+ "step": 353
+ },
+ {
+ "epoch": 0.1673758865248227,
+ "grad_norm": 2.8666274547576904,
+ "learning_rate": 4.994980270187499e-06,
+ "loss": 0.7416,
+ "step": 354
+ },
+ {
+ "epoch": 0.16784869976359337,
+ "grad_norm": 3.1644718647003174,
+ "learning_rate": 4.994940680313021e-06,
+ "loss": 0.661,
+ "step": 355
+ },
+ {
+ "epoch": 0.16832151300236406,
+ "grad_norm": 3.050391674041748,
+ "learning_rate": 4.994900935089527e-06,
+ "loss": 0.7243,
+ "step": 356
+ },
+ {
+ "epoch": 0.16879432624113475,
+ "grad_norm": 2.985466480255127,
+ "learning_rate": 4.994861034519491e-06,
+ "loss": 0.6917,
+ "step": 357
+ },
+ {
+ "epoch": 0.16926713947990543,
+ "grad_norm": 2.909342050552368,
+ "learning_rate": 4.9948209786053995e-06,
+ "loss": 0.6636,
+ "step": 358
+ },
+ {
+ "epoch": 0.16973995271867612,
+ "grad_norm": 3.2214784622192383,
+ "learning_rate": 4.9947807673497435e-06,
+ "loss": 0.7903,
+ "step": 359
+ },
+ {
+ "epoch": 0.1702127659574468,
+ "grad_norm": 2.5654983520507812,
+ "learning_rate": 4.994740400755029e-06,
+ "loss": 0.6129,
+ "step": 360
+ },
+ {
+ "epoch": 0.1706855791962175,
+ "grad_norm": 3.775646448135376,
+ "learning_rate": 4.99469987882377e-06,
+ "loss": 0.7145,
+ "step": 361
+ },
+ {
+ "epoch": 0.17115839243498818,
+ "grad_norm": 2.8965413570404053,
+ "learning_rate": 4.994659201558487e-06,
+ "loss": 0.7177,
+ "step": 362
+ },
+ {
+ "epoch": 0.17163120567375886,
+ "grad_norm": 3.485597848892212,
+ "learning_rate": 4.9946183689617146e-06,
+ "loss": 0.8107,
+ "step": 363
+ },
+ {
+ "epoch": 0.17210401891252955,
+ "grad_norm": 3.277839183807373,
+ "learning_rate": 4.994577381035995e-06,
+ "loss": 0.691,
+ "step": 364
+ },
+ {
+ "epoch": 0.17257683215130024,
+ "grad_norm": 2.8807685375213623,
+ "learning_rate": 4.99453623778388e-06,
+ "loss": 0.7627,
+ "step": 365
+ },
+ {
+ "epoch": 0.17304964539007092,
+ "grad_norm": 3.0659940242767334,
+ "learning_rate": 4.994494939207932e-06,
+ "loss": 0.6858,
+ "step": 366
+ },
+ {
+ "epoch": 0.1735224586288416,
+ "grad_norm": 3.0881855487823486,
+ "learning_rate": 4.994453485310723e-06,
+ "loss": 0.8212,
+ "step": 367
+ },
+ {
+ "epoch": 0.1739952718676123,
+ "grad_norm": 2.7199201583862305,
+ "learning_rate": 4.994411876094832e-06,
+ "loss": 0.6516,
+ "step": 368
+ },
+ {
+ "epoch": 0.17446808510638298,
+ "grad_norm": 2.955889940261841,
+ "learning_rate": 4.994370111562851e-06,
+ "loss": 0.6579,
+ "step": 369
+ },
+ {
+ "epoch": 0.17494089834515367,
+ "grad_norm": 3.1321663856506348,
+ "learning_rate": 4.994328191717382e-06,
+ "loss": 0.6891,
+ "step": 370
+ },
+ {
+ "epoch": 0.17541371158392435,
+ "grad_norm": 3.0560388565063477,
+ "learning_rate": 4.994286116561034e-06,
+ "loss": 0.7243,
+ "step": 371
+ },
+ {
+ "epoch": 0.17588652482269504,
+ "grad_norm": 3.1560704708099365,
+ "learning_rate": 4.994243886096425e-06,
+ "loss": 0.7262,
+ "step": 372
+ },
+ {
+ "epoch": 0.17635933806146573,
+ "grad_norm": 2.913541316986084,
+ "learning_rate": 4.994201500326187e-06,
+ "loss": 0.7318,
+ "step": 373
+ },
+ {
+ "epoch": 0.1768321513002364,
+ "grad_norm": 3.098376512527466,
+ "learning_rate": 4.994158959252958e-06,
+ "loss": 0.6419,
+ "step": 374
+ },
+ {
+ "epoch": 0.1773049645390071,
+ "grad_norm": 2.977508544921875,
+ "learning_rate": 4.994116262879387e-06,
+ "loss": 0.6709,
+ "step": 375
+ },
+ {
+ "epoch": 0.17777777777777778,
+ "grad_norm": 3.168186902999878,
+ "learning_rate": 4.994073411208133e-06,
+ "loss": 0.6608,
+ "step": 376
+ },
+ {
+ "epoch": 0.17825059101654847,
+ "grad_norm": 3.436844825744629,
+ "learning_rate": 4.994030404241864e-06,
+ "loss": 0.7227,
+ "step": 377
+ },
+ {
+ "epoch": 0.17872340425531916,
+ "grad_norm": 2.8998289108276367,
+ "learning_rate": 4.993987241983258e-06,
+ "loss": 0.6512,
+ "step": 378
+ },
+ {
+ "epoch": 0.17919621749408984,
+ "grad_norm": 3.407191514968872,
+ "learning_rate": 4.993943924435002e-06,
+ "loss": 0.616,
+ "step": 379
+ },
+ {
+ "epoch": 0.17966903073286053,
+ "grad_norm": 3.744858741760254,
+ "learning_rate": 4.993900451599793e-06,
+ "loss": 0.8599,
+ "step": 380
+ },
+ {
+ "epoch": 0.18014184397163122,
+ "grad_norm": 3.486283779144287,
+ "learning_rate": 4.993856823480338e-06,
+ "loss": 0.6634,
+ "step": 381
+ },
+ {
+ "epoch": 0.1806146572104019,
+ "grad_norm": 2.895719051361084,
+ "learning_rate": 4.993813040079355e-06,
+ "loss": 0.6972,
+ "step": 382
+ },
+ {
+ "epoch": 0.1810874704491726,
+ "grad_norm": 2.814133882522583,
+ "learning_rate": 4.993769101399569e-06,
+ "loss": 0.6271,
+ "step": 383
+ },
+ {
+ "epoch": 0.18156028368794327,
+ "grad_norm": 2.8609800338745117,
+ "learning_rate": 4.993725007443715e-06,
+ "loss": 0.6481,
+ "step": 384
+ },
+ {
+ "epoch": 0.18203309692671396,
+ "grad_norm": 3.2829644680023193,
+ "learning_rate": 4.99368075821454e-06,
+ "loss": 0.7999,
+ "step": 385
+ },
+ {
+ "epoch": 0.18250591016548465,
+ "grad_norm": 3.1417458057403564,
+ "learning_rate": 4.993636353714798e-06,
+ "loss": 0.6972,
+ "step": 386
+ },
+ {
+ "epoch": 0.1829787234042553,
+ "grad_norm": 3.0679385662078857,
+ "learning_rate": 4.993591793947256e-06,
+ "loss": 0.667,
+ "step": 387
+ },
+ {
+ "epoch": 0.183451536643026,
+ "grad_norm": 3.1387410163879395,
+ "learning_rate": 4.993547078914686e-06,
+ "loss": 0.7618,
+ "step": 388
+ },
+ {
+ "epoch": 0.18392434988179668,
+ "grad_norm": 2.9181406497955322,
+ "learning_rate": 4.993502208619872e-06,
+ "loss": 0.7391,
+ "step": 389
+ },
+ {
+ "epoch": 0.18439716312056736,
+ "grad_norm": 2.8952157497406006,
+ "learning_rate": 4.993457183065611e-06,
+ "loss": 0.6988,
+ "step": 390
+ },
+ {
+ "epoch": 0.18486997635933805,
+ "grad_norm": 3.2274813652038574,
+ "learning_rate": 4.993412002254704e-06,
+ "loss": 0.688,
+ "step": 391
+ },
+ {
+ "epoch": 0.18534278959810874,
+ "grad_norm": 3.4693779945373535,
+ "learning_rate": 4.993366666189965e-06,
+ "loss": 0.6634,
+ "step": 392
+ },
+ {
+ "epoch": 0.18581560283687942,
+ "grad_norm": 3.5358526706695557,
+ "learning_rate": 4.993321174874217e-06,
+ "loss": 0.7343,
+ "step": 393
+ },
+ {
+ "epoch": 0.1862884160756501,
+ "grad_norm": 3.013338088989258,
+ "learning_rate": 4.993275528310292e-06,
+ "loss": 0.7579,
+ "step": 394
+ },
+ {
+ "epoch": 0.1867612293144208,
+ "grad_norm": 2.694772720336914,
+ "learning_rate": 4.993229726501033e-06,
+ "loss": 0.718,
+ "step": 395
+ },
+ {
+ "epoch": 0.18723404255319148,
+ "grad_norm": 3.070612907409668,
+ "learning_rate": 4.9931837694492915e-06,
+ "loss": 0.6438,
+ "step": 396
+ },
+ {
+ "epoch": 0.18770685579196217,
+ "grad_norm": 2.9193027019500732,
+ "learning_rate": 4.993137657157928e-06,
+ "loss": 0.6788,
+ "step": 397
+ },
+ {
+ "epoch": 0.18817966903073285,
+ "grad_norm": 3.047682046890259,
+ "learning_rate": 4.993091389629816e-06,
+ "loss": 0.6826,
+ "step": 398
+ },
+ {
+ "epoch": 0.18865248226950354,
+ "grad_norm": 2.9629905223846436,
+ "learning_rate": 4.993044966867834e-06,
+ "loss": 0.7196,
+ "step": 399
+ },
+ {
+ "epoch": 0.18912529550827423,
+ "grad_norm": 3.0692050457000732,
+ "learning_rate": 4.992998388874874e-06,
+ "loss": 0.7015,
+ "step": 400
+ },
+ {
+ "epoch": 0.1895981087470449,
+ "grad_norm": 3.5427212715148926,
+ "learning_rate": 4.992951655653836e-06,
+ "loss": 0.8292,
+ "step": 401
+ },
+ {
+ "epoch": 0.1900709219858156,
+ "grad_norm": 2.643526554107666,
+ "learning_rate": 4.992904767207629e-06,
+ "loss": 0.624,
+ "step": 402
+ },
+ {
+ "epoch": 0.19054373522458629,
+ "grad_norm": 3.1185996532440186,
+ "learning_rate": 4.992857723539173e-06,
+ "loss": 0.7354,
+ "step": 403
+ },
+ {
+ "epoch": 0.19101654846335697,
+ "grad_norm": 3.006856679916382,
+ "learning_rate": 4.992810524651398e-06,
+ "loss": 0.7752,
+ "step": 404
+ },
+ {
+ "epoch": 0.19148936170212766,
+ "grad_norm": 2.9913275241851807,
+ "learning_rate": 4.9927631705472425e-06,
+ "loss": 0.7306,
+ "step": 405
+ },
+ {
+ "epoch": 0.19196217494089834,
+ "grad_norm": 2.6794071197509766,
+ "learning_rate": 4.992715661229655e-06,
+ "loss": 0.6136,
+ "step": 406
+ },
+ {
+ "epoch": 0.19243498817966903,
+ "grad_norm": 3.5933966636657715,
+ "learning_rate": 4.992667996701593e-06,
+ "loss": 0.7024,
+ "step": 407
+ },
+ {
+ "epoch": 0.19290780141843972,
+ "grad_norm": 2.862187623977661,
+ "learning_rate": 4.992620176966025e-06,
+ "loss": 0.692,
+ "step": 408
+ },
+ {
+ "epoch": 0.1933806146572104,
+ "grad_norm": 3.076845407485962,
+ "learning_rate": 4.9925722020259286e-06,
+ "loss": 0.7475,
+ "step": 409
+ },
+ {
+ "epoch": 0.1938534278959811,
+ "grad_norm": 3.372919797897339,
+ "learning_rate": 4.9925240718842895e-06,
+ "loss": 0.6886,
+ "step": 410
+ },
+ {
+ "epoch": 0.19432624113475178,
+ "grad_norm": 2.922977924346924,
+ "learning_rate": 4.992475786544108e-06,
+ "loss": 0.7049,
+ "step": 411
+ },
+ {
+ "epoch": 0.19479905437352246,
+ "grad_norm": 2.908034324645996,
+ "learning_rate": 4.992427346008387e-06,
+ "loss": 0.6498,
+ "step": 412
+ },
+ {
+ "epoch": 0.19527186761229315,
+ "grad_norm": 3.096723794937134,
+ "learning_rate": 4.992378750280144e-06,
+ "loss": 0.7151,
+ "step": 413
+ },
+ {
+ "epoch": 0.19574468085106383,
+ "grad_norm": 2.895237684249878,
+ "learning_rate": 4.992329999362405e-06,
+ "loss": 0.7277,
+ "step": 414
+ },
+ {
+ "epoch": 0.19621749408983452,
+ "grad_norm": 2.718230724334717,
+ "learning_rate": 4.9922810932582065e-06,
+ "loss": 0.6375,
+ "step": 415
+ },
+ {
+ "epoch": 0.1966903073286052,
+ "grad_norm": 3.187743663787842,
+ "learning_rate": 4.992232031970592e-06,
+ "loss": 0.6528,
+ "step": 416
+ },
+ {
+ "epoch": 0.1971631205673759,
+ "grad_norm": 2.996406316757202,
+ "learning_rate": 4.992182815502616e-06,
+ "loss": 0.6552,
+ "step": 417
+ },
+ {
+ "epoch": 0.19763593380614658,
+ "grad_norm": 3.301084041595459,
+ "learning_rate": 4.992133443857345e-06,
+ "loss": 0.7061,
+ "step": 418
+ },
+ {
+ "epoch": 0.19810874704491727,
+ "grad_norm": 3.7874677181243896,
+ "learning_rate": 4.992083917037853e-06,
+ "loss": 0.7859,
+ "step": 419
+ },
+ {
+ "epoch": 0.19858156028368795,
+ "grad_norm": 3.124253511428833,
+ "learning_rate": 4.992034235047222e-06,
+ "loss": 0.7615,
+ "step": 420
+ },
+ {
+ "epoch": 0.19905437352245864,
+ "grad_norm": 3.0488970279693604,
+ "learning_rate": 4.991984397888546e-06,
+ "loss": 0.6916,
+ "step": 421
+ },
+ {
+ "epoch": 0.19952718676122932,
+ "grad_norm": 3.1241321563720703,
+ "learning_rate": 4.991934405564929e-06,
+ "loss": 0.7055,
+ "step": 422
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 3.396632432937622,
+ "learning_rate": 4.991884258079484e-06,
+ "loss": 0.7675,
+ "step": 423
+ },
+ {
+ "epoch": 0.2004728132387707,
+ "grad_norm": 3.7776873111724854,
+ "learning_rate": 4.9918339554353316e-06,
+ "loss": 0.7371,
+ "step": 424
+ },
+ {
+ "epoch": 0.20094562647754138,
+ "grad_norm": 3.3356032371520996,
+ "learning_rate": 4.991783497635606e-06,
+ "loss": 0.6778,
+ "step": 425
+ },
+ {
+ "epoch": 0.20141843971631207,
+ "grad_norm": 2.988856792449951,
+ "learning_rate": 4.9917328846834474e-06,
+ "loss": 0.6795,
+ "step": 426
+ },
+ {
+ "epoch": 0.20189125295508276,
+ "grad_norm": 3.264183282852173,
+ "learning_rate": 4.99168211658201e-06,
+ "loss": 0.7707,
+ "step": 427
+ },
+ {
+ "epoch": 0.20236406619385341,
+ "grad_norm": 3.878068208694458,
+ "learning_rate": 4.991631193334451e-06,
+ "loss": 0.857,
+ "step": 428
+ },
+ {
+ "epoch": 0.2028368794326241,
+ "grad_norm": 3.6377553939819336,
+ "learning_rate": 4.991580114943943e-06,
+ "loss": 0.8033,
+ "step": 429
+ },
+ {
+ "epoch": 0.2033096926713948,
+ "grad_norm": 2.95393967628479,
+ "learning_rate": 4.991528881413667e-06,
+ "loss": 0.6809,
+ "step": 430
+ },
+ {
+ "epoch": 0.20378250591016547,
+ "grad_norm": 3.058704376220703,
+ "learning_rate": 4.9914774927468125e-06,
+ "loss": 0.6664,
+ "step": 431
+ },
+ {
+ "epoch": 0.20425531914893616,
+ "grad_norm": 2.7783217430114746,
+ "learning_rate": 4.9914259489465795e-06,
+ "loss": 0.6478,
+ "step": 432
+ },
+ {
+ "epoch": 0.20472813238770685,
+ "grad_norm": 2.4825217723846436,
+ "learning_rate": 4.991374250016177e-06,
+ "loss": 0.6598,
+ "step": 433
+ },
+ {
+ "epoch": 0.20520094562647753,
+ "grad_norm": 2.8753600120544434,
+ "learning_rate": 4.991322395958824e-06,
+ "loss": 0.6947,
+ "step": 434
+ },
+ {
+ "epoch": 0.20567375886524822,
+ "grad_norm": 3.2339367866516113,
+ "learning_rate": 4.99127038677775e-06,
+ "loss": 0.8201,
+ "step": 435
+ },
+ {
+ "epoch": 0.2061465721040189,
+ "grad_norm": 2.9065537452697754,
+ "learning_rate": 4.991218222476193e-06,
+ "loss": 0.6679,
+ "step": 436
+ },
+ {
+ "epoch": 0.2066193853427896,
+ "grad_norm": 3.283228874206543,
+ "learning_rate": 4.991165903057401e-06,
+ "loss": 0.8039,
+ "step": 437
+ },
+ {
+ "epoch": 0.20709219858156028,
+ "grad_norm": 3.429872751235962,
+ "learning_rate": 4.991113428524631e-06,
+ "loss": 0.7392,
+ "step": 438
+ },
+ {
+ "epoch": 0.20756501182033096,
+ "grad_norm": 3.118943452835083,
+ "learning_rate": 4.991060798881152e-06,
+ "loss": 0.6794,
+ "step": 439
+ },
+ {
+ "epoch": 0.20803782505910165,
+ "grad_norm": 3.395970106124878,
+ "learning_rate": 4.99100801413024e-06,
+ "loss": 0.6862,
+ "step": 440
+ },
+ {
+ "epoch": 0.20851063829787234,
+ "grad_norm": 2.869191884994507,
+ "learning_rate": 4.99095507427518e-06,
+ "loss": 0.6076,
+ "step": 441
+ },
+ {
+ "epoch": 0.20898345153664302,
+ "grad_norm": 3.1934661865234375,
+ "learning_rate": 4.990901979319272e-06,
+ "loss": 0.6927,
+ "step": 442
+ },
+ {
+ "epoch": 0.2094562647754137,
+ "grad_norm": 2.9068603515625,
+ "learning_rate": 4.990848729265819e-06,
+ "loss": 0.6864,
+ "step": 443
+ },
+ {
+ "epoch": 0.2099290780141844,
+ "grad_norm": 3.0535948276519775,
+ "learning_rate": 4.9907953241181375e-06,
+ "loss": 0.6396,
+ "step": 444
+ },
+ {
+ "epoch": 0.21040189125295508,
+ "grad_norm": 2.871511459350586,
+ "learning_rate": 4.990741763879554e-06,
+ "loss": 0.6743,
+ "step": 445
+ },
+ {
+ "epoch": 0.21087470449172577,
+ "grad_norm": 2.9184393882751465,
+ "learning_rate": 4.9906880485534015e-06,
+ "loss": 0.6786,
+ "step": 446
+ },
+ {
+ "epoch": 0.21134751773049645,
+ "grad_norm": 3.0628271102905273,
+ "learning_rate": 4.990634178143026e-06,
+ "loss": 0.6326,
+ "step": 447
+ },
+ {
+ "epoch": 0.21182033096926714,
+ "grad_norm": 3.7878305912017822,
+ "learning_rate": 4.990580152651782e-06,
+ "loss": 0.7944,
+ "step": 448
+ },
+ {
+ "epoch": 0.21229314420803783,
+ "grad_norm": 2.8577189445495605,
+ "learning_rate": 4.990525972083031e-06,
+ "loss": 0.71,
+ "step": 449
+ },
+ {
+ "epoch": 0.2127659574468085,
+ "grad_norm": 3.307769775390625,
+ "learning_rate": 4.99047163644015e-06,
+ "loss": 0.6893,
+ "step": 450
+ },
+ {
+ "epoch": 0.2132387706855792,
+ "grad_norm": 2.7391717433929443,
+ "learning_rate": 4.990417145726519e-06,
+ "loss": 0.712,
+ "step": 451
+ },
+ {
+ "epoch": 0.21371158392434988,
+ "grad_norm": 2.938044786453247,
+ "learning_rate": 4.990362499945534e-06,
+ "loss": 0.7516,
+ "step": 452
+ },
+ {
+ "epoch": 0.21418439716312057,
+ "grad_norm": 2.7831056118011475,
+ "learning_rate": 4.990307699100595e-06,
+ "loss": 0.6168,
+ "step": 453
+ },
+ {
+ "epoch": 0.21465721040189126,
+ "grad_norm": 2.907977342605591,
+ "learning_rate": 4.990252743195116e-06,
+ "loss": 0.6706,
+ "step": 454
+ },
+ {
+ "epoch": 0.21513002364066194,
+ "grad_norm": 3.7882161140441895,
+ "learning_rate": 4.990197632232517e-06,
+ "loss": 0.6847,
+ "step": 455
+ },
+ {
+ "epoch": 0.21560283687943263,
+ "grad_norm": 2.899716854095459,
+ "learning_rate": 4.990142366216232e-06,
+ "loss": 0.6699,
+ "step": 456
+ },
+ {
+ "epoch": 0.21607565011820332,
+ "grad_norm": 2.907003879547119,
+ "learning_rate": 4.990086945149701e-06,
+ "loss": 0.6864,
+ "step": 457
+ },
+ {
+ "epoch": 0.216548463356974,
+ "grad_norm": 3.2407333850860596,
+ "learning_rate": 4.9900313690363736e-06,
+ "loss": 0.692,
+ "step": 458
+ },
+ {
+ "epoch": 0.2170212765957447,
+ "grad_norm": 2.9055583477020264,
+ "learning_rate": 4.989975637879712e-06,
+ "loss": 0.7113,
+ "step": 459
+ },
+ {
+ "epoch": 0.21749408983451538,
+ "grad_norm": 2.9836206436157227,
+ "learning_rate": 4.989919751683184e-06,
+ "loss": 0.6673,
+ "step": 460
+ },
+ {
+ "epoch": 0.21796690307328606,
+ "grad_norm": 3.371035575866699,
+ "learning_rate": 4.989863710450273e-06,
+ "loss": 0.7181,
+ "step": 461
+ },
+ {
+ "epoch": 0.21843971631205675,
+ "grad_norm": 2.9636635780334473,
+ "learning_rate": 4.989807514184465e-06,
+ "loss": 0.6082,
+ "step": 462
+ },
+ {
+ "epoch": 0.21891252955082743,
+ "grad_norm": 2.9634664058685303,
+ "learning_rate": 4.9897511628892615e-06,
+ "loss": 0.7086,
+ "step": 463
+ },
+ {
+ "epoch": 0.21938534278959812,
+ "grad_norm": 3.154763698577881,
+ "learning_rate": 4.98969465656817e-06,
+ "loss": 0.7027,
+ "step": 464
+ },
+ {
+ "epoch": 0.2198581560283688,
+ "grad_norm": 2.9959890842437744,
+ "learning_rate": 4.98963799522471e-06,
+ "loss": 0.6498,
+ "step": 465
+ },
+ {
+ "epoch": 0.2203309692671395,
+ "grad_norm": 3.5470590591430664,
+ "learning_rate": 4.989581178862408e-06,
+ "loss": 0.7199,
+ "step": 466
+ },
+ {
+ "epoch": 0.22080378250591018,
+ "grad_norm": 7.1873369216918945,
+ "learning_rate": 4.989524207484802e-06,
+ "loss": 0.6676,
+ "step": 467
+ },
+ {
+ "epoch": 0.22127659574468084,
+ "grad_norm": 3.1099541187286377,
+ "learning_rate": 4.98946708109544e-06,
+ "loss": 0.6785,
+ "step": 468
+ },
+ {
+ "epoch": 0.22174940898345152,
+ "grad_norm": 2.830991506576538,
+ "learning_rate": 4.9894097996978795e-06,
+ "loss": 0.6456,
+ "step": 469
+ },
+ {
+ "epoch": 0.2222222222222222,
+ "grad_norm": 3.0212316513061523,
+ "learning_rate": 4.989352363295687e-06,
+ "loss": 0.6048,
+ "step": 470
+ },
+ {
+ "epoch": 0.2226950354609929,
+ "grad_norm": 3.18776798248291,
+ "learning_rate": 4.989294771892437e-06,
+ "loss": 0.7078,
+ "step": 471
+ },
+ {
+ "epoch": 0.22316784869976358,
+ "grad_norm": 2.9972598552703857,
+ "learning_rate": 4.989237025491717e-06,
+ "loss": 0.7082,
+ "step": 472
+ },
+ {
+ "epoch": 0.22364066193853427,
+ "grad_norm": 3.4935688972473145,
+ "learning_rate": 4.989179124097123e-06,
+ "loss": 0.8199,
+ "step": 473
+ },
+ {
+ "epoch": 0.22411347517730495,
+ "grad_norm": 2.6485543251037598,
+ "learning_rate": 4.9891210677122595e-06,
+ "loss": 0.6371,
+ "step": 474
+ },
+ {
+ "epoch": 0.22458628841607564,
+ "grad_norm": 2.969233512878418,
+ "learning_rate": 4.989062856340742e-06,
+ "loss": 0.6879,
+ "step": 475
+ },
+ {
+ "epoch": 0.22505910165484633,
+ "grad_norm": 2.881875514984131,
+ "learning_rate": 4.989004489986194e-06,
+ "loss": 0.7415,
+ "step": 476
+ },
+ {
+ "epoch": 0.225531914893617,
+ "grad_norm": 2.624540090560913,
+ "learning_rate": 4.98894596865225e-06,
+ "loss": 0.6522,
+ "step": 477
+ },
+ {
+ "epoch": 0.2260047281323877,
+ "grad_norm": 3.61075496673584,
+ "learning_rate": 4.988887292342555e-06,
+ "loss": 0.7109,
+ "step": 478
+ },
+ {
+ "epoch": 0.2264775413711584,
+ "grad_norm": 2.9368972778320312,
+ "learning_rate": 4.988828461060762e-06,
+ "loss": 0.6843,
+ "step": 479
+ },
+ {
+ "epoch": 0.22695035460992907,
+ "grad_norm": 3.0670197010040283,
+ "learning_rate": 4.988769474810533e-06,
+ "loss": 0.6807,
+ "step": 480
+ },
+ {
+ "epoch": 0.22742316784869976,
+ "grad_norm": 2.9662792682647705,
+ "learning_rate": 4.988710333595542e-06,
+ "loss": 0.6796,
+ "step": 481
+ },
+ {
+ "epoch": 0.22789598108747045,
+ "grad_norm": 2.971235752105713,
+ "learning_rate": 4.988651037419472e-06,
+ "loss": 0.696,
+ "step": 482
+ },
+ {
+ "epoch": 0.22836879432624113,
+ "grad_norm": 2.931884527206421,
+ "learning_rate": 4.988591586286013e-06,
+ "loss": 0.7323,
+ "step": 483
+ },
+ {
+ "epoch": 0.22884160756501182,
+ "grad_norm": 2.8114213943481445,
+ "learning_rate": 4.988531980198868e-06,
+ "loss": 0.6584,
+ "step": 484
+ },
+ {
+ "epoch": 0.2293144208037825,
+ "grad_norm": 3.2785916328430176,
+ "learning_rate": 4.98847221916175e-06,
+ "loss": 0.7514,
+ "step": 485
+ },
+ {
+ "epoch": 0.2297872340425532,
+ "grad_norm": 3.0520215034484863,
+ "learning_rate": 4.988412303178377e-06,
+ "loss": 0.7564,
+ "step": 486
+ },
+ {
+ "epoch": 0.23026004728132388,
+ "grad_norm": 3.181002616882324,
+ "learning_rate": 4.988352232252483e-06,
+ "loss": 0.6768,
+ "step": 487
+ },
+ {
+ "epoch": 0.23073286052009456,
+ "grad_norm": 3.4953625202178955,
+ "learning_rate": 4.988292006387805e-06,
+ "loss": 0.7143,
+ "step": 488
+ },
+ {
+ "epoch": 0.23120567375886525,
+ "grad_norm": 3.326571226119995,
+ "learning_rate": 4.988231625588096e-06,
+ "loss": 0.7318,
+ "step": 489
+ },
+ {
+ "epoch": 0.23167848699763594,
+ "grad_norm": 3.09614634513855,
+ "learning_rate": 4.988171089857113e-06,
+ "loss": 0.6574,
+ "step": 490
+ },
+ {
+ "epoch": 0.23215130023640662,
+ "grad_norm": 2.7439446449279785,
+ "learning_rate": 4.9881103991986265e-06,
+ "loss": 0.6637,
+ "step": 491
+ },
+ {
+ "epoch": 0.2326241134751773,
+ "grad_norm": 3.0681190490722656,
+ "learning_rate": 4.988049553616416e-06,
+ "loss": 0.6326,
+ "step": 492
+ },
+ {
+ "epoch": 0.233096926713948,
+ "grad_norm": 3.0757341384887695,
+ "learning_rate": 4.98798855311427e-06,
+ "loss": 0.695,
+ "step": 493
+ },
+ {
+ "epoch": 0.23356973995271868,
+ "grad_norm": 2.8637635707855225,
+ "learning_rate": 4.987927397695985e-06,
+ "loss": 0.6598,
+ "step": 494
+ },
+ {
+ "epoch": 0.23404255319148937,
+ "grad_norm": 3.3641068935394287,
+ "learning_rate": 4.9878660873653715e-06,
+ "loss": 0.7435,
+ "step": 495
+ },
+ {
+ "epoch": 0.23451536643026005,
+ "grad_norm": 3.5025596618652344,
+ "learning_rate": 4.987804622126245e-06,
+ "loss": 0.735,
+ "step": 496
+ },
+ {
+ "epoch": 0.23498817966903074,
+ "grad_norm": 2.9298837184906006,
+ "learning_rate": 4.987743001982434e-06,
+ "loss": 0.7063,
+ "step": 497
+ },
+ {
+ "epoch": 0.23546099290780143,
+ "grad_norm": 2.70358943939209,
+ "learning_rate": 4.987681226937774e-06,
+ "loss": 0.6799,
+ "step": 498
+ },
+ {
+ "epoch": 0.2359338061465721,
+ "grad_norm": 3.027871608734131,
+ "learning_rate": 4.9876192969961125e-06,
+ "loss": 0.6881,
+ "step": 499
+ },
+ {
+ "epoch": 0.2364066193853428,
+ "grad_norm": 3.362306594848633,
+ "learning_rate": 4.987557212161304e-06,
+ "loss": 0.7906,
+ "step": 500
+ },
+ {
+ "epoch": 0.23687943262411348,
+ "grad_norm": 3.3136050701141357,
+ "learning_rate": 4.987494972437217e-06,
+ "loss": 0.6878,
+ "step": 501
+ },
+ {
+ "epoch": 0.23735224586288417,
+ "grad_norm": 3.017089605331421,
+ "learning_rate": 4.9874325778277255e-06,
+ "loss": 0.7279,
+ "step": 502
+ },
+ {
+ "epoch": 0.23782505910165486,
+ "grad_norm": 2.8300516605377197,
+ "learning_rate": 4.987370028336714e-06,
+ "loss": 0.6864,
+ "step": 503
+ },
+ {
+ "epoch": 0.23829787234042554,
+ "grad_norm": 3.201860189437866,
+ "learning_rate": 4.987307323968077e-06,
+ "loss": 0.7531,
+ "step": 504
+ },
+ {
+ "epoch": 0.23877068557919623,
+ "grad_norm": 2.685396194458008,
+ "learning_rate": 4.987244464725721e-06,
+ "loss": 0.5849,
+ "step": 505
+ },
+ {
+ "epoch": 0.23924349881796692,
+ "grad_norm": 2.8715312480926514,
+ "learning_rate": 4.987181450613557e-06,
+ "loss": 0.675,
+ "step": 506
+ },
+ {
+ "epoch": 0.2397163120567376,
+ "grad_norm": 2.813908815383911,
+ "learning_rate": 4.987118281635511e-06,
+ "loss": 0.6841,
+ "step": 507
+ },
+ {
+ "epoch": 0.2401891252955083,
+ "grad_norm": 3.2738473415374756,
+ "learning_rate": 4.987054957795514e-06,
+ "loss": 0.7158,
+ "step": 508
+ },
+ {
+ "epoch": 0.24066193853427895,
+ "grad_norm": 2.896134376525879,
+ "learning_rate": 4.986991479097511e-06,
+ "loss": 0.7542,
+ "step": 509
+ },
+ {
+ "epoch": 0.24113475177304963,
+ "grad_norm": 3.0390403270721436,
+ "learning_rate": 4.986927845545454e-06,
+ "loss": 0.6733,
+ "step": 510
+ },
+ {
+ "epoch": 0.24160756501182032,
+ "grad_norm": 3.0300254821777344,
+ "learning_rate": 4.9868640571433044e-06,
+ "loss": 0.722,
+ "step": 511
+ },
+ {
+ "epoch": 0.242080378250591,
+ "grad_norm": 3.3037352561950684,
+ "learning_rate": 4.986800113895035e-06,
+ "loss": 0.6811,
+ "step": 512
+ },
+ {
+ "epoch": 0.2425531914893617,
+ "grad_norm": 3.0358474254608154,
+ "learning_rate": 4.986736015804627e-06,
+ "loss": 0.7348,
+ "step": 513
+ },
+ {
+ "epoch": 0.24302600472813238,
+ "grad_norm": 3.108792304992676,
+ "learning_rate": 4.986671762876071e-06,
+ "loss": 0.6096,
+ "step": 514
+ },
+ {
+ "epoch": 0.24349881796690306,
+ "grad_norm": 3.1316237449645996,
+ "learning_rate": 4.986607355113367e-06,
+ "loss": 0.6357,
+ "step": 515
+ },
+ {
+ "epoch": 0.24397163120567375,
+ "grad_norm": 3.3095219135284424,
+ "learning_rate": 4.986542792520528e-06,
+ "loss": 0.7515,
+ "step": 516
+ },
+ {
+ "epoch": 0.24444444444444444,
+ "grad_norm": 3.4775984287261963,
+ "learning_rate": 4.986478075101572e-06,
+ "loss": 0.7104,
+ "step": 517
+ },
+ {
+ "epoch": 0.24491725768321512,
+ "grad_norm": 3.341708183288574,
+ "learning_rate": 4.986413202860528e-06,
+ "loss": 0.7339,
+ "step": 518
+ },
+ {
+ "epoch": 0.2453900709219858,
+ "grad_norm": 2.9646966457366943,
+ "learning_rate": 4.986348175801438e-06,
+ "loss": 0.6032,
+ "step": 519
+ },
+ {
+ "epoch": 0.2458628841607565,
+ "grad_norm": 3.1853902339935303,
+ "learning_rate": 4.986282993928349e-06,
+ "loss": 0.6925,
+ "step": 520
+ },
+ {
+ "epoch": 0.24633569739952718,
+ "grad_norm": 3.286909818649292,
+ "learning_rate": 4.98621765724532e-06,
+ "loss": 0.7447,
+ "step": 521
+ },
+ {
+ "epoch": 0.24680851063829787,
+ "grad_norm": 3.2255051136016846,
+ "learning_rate": 4.986152165756419e-06,
+ "loss": 0.7747,
+ "step": 522
+ },
+ {
+ "epoch": 0.24728132387706855,
+ "grad_norm": 3.002352237701416,
+ "learning_rate": 4.986086519465724e-06,
+ "loss": 0.6472,
+ "step": 523
+ },
+ {
+ "epoch": 0.24775413711583924,
+ "grad_norm": 3.4738974571228027,
+ "learning_rate": 4.986020718377322e-06,
+ "loss": 0.7381,
+ "step": 524
+ },
+ {
+ "epoch": 0.24822695035460993,
+ "grad_norm": 3.4470200538635254,
+ "learning_rate": 4.985954762495312e-06,
+ "loss": 0.6878,
+ "step": 525
+ },
+ {
+ "epoch": 0.2486997635933806,
+ "grad_norm": 2.9219350814819336,
+ "learning_rate": 4.985888651823799e-06,
+ "loss": 0.6317,
+ "step": 526
+ },
+ {
+ "epoch": 0.2491725768321513,
+ "grad_norm": 3.061767101287842,
+ "learning_rate": 4.985822386366899e-06,
+ "loss": 0.6842,
+ "step": 527
+ },
+ {
+ "epoch": 0.24964539007092199,
+ "grad_norm": 3.0291247367858887,
+ "learning_rate": 4.985755966128742e-06,
+ "loss": 0.6852,
+ "step": 528
+ },
+ {
+ "epoch": 0.25011820330969264,
+ "grad_norm": 2.964280843734741,
+ "learning_rate": 4.985689391113457e-06,
+ "loss": 0.7738,
+ "step": 529
+ },
+ {
+ "epoch": 0.25059101654846333,
+ "grad_norm": 3.058302164077759,
+ "learning_rate": 4.9856226613251955e-06,
+ "loss": 0.6677,
+ "step": 530
+ },
+ {
+ "epoch": 0.251063829787234,
+ "grad_norm": 3.345141649246216,
+ "learning_rate": 4.985555776768109e-06,
+ "loss": 0.7837,
+ "step": 531
+ },
+ {
+ "epoch": 0.2515366430260047,
+ "grad_norm": 3.565031051635742,
+ "learning_rate": 4.9854887374463636e-06,
+ "loss": 0.7231,
+ "step": 532
+ },
+ {
+ "epoch": 0.2520094562647754,
+ "grad_norm": 2.7953789234161377,
+ "learning_rate": 4.985421543364132e-06,
+ "loss": 0.6102,
+ "step": 533
+ },
+ {
+ "epoch": 0.2524822695035461,
+ "grad_norm": 2.887606620788574,
+ "learning_rate": 4.9853541945256e-06,
+ "loss": 0.6289,
+ "step": 534
+ },
+ {
+ "epoch": 0.25295508274231676,
+ "grad_norm": 3.1480495929718018,
+ "learning_rate": 4.985286690934961e-06,
+ "loss": 0.6348,
+ "step": 535
+ },
+ {
+ "epoch": 0.25342789598108745,
+ "grad_norm": 2.8912761211395264,
+ "learning_rate": 4.985219032596416e-06,
+ "loss": 0.595,
+ "step": 536
+ },
+ {
+ "epoch": 0.25390070921985813,
+ "grad_norm": 2.947936534881592,
+ "learning_rate": 4.98515121951418e-06,
+ "loss": 0.6196,
+ "step": 537
+ },
+ {
+ "epoch": 0.2543735224586288,
+ "grad_norm": 3.1085827350616455,
+ "learning_rate": 4.985083251692474e-06,
+ "loss": 0.6387,
+ "step": 538
+ },
+ {
+ "epoch": 0.2548463356973995,
+ "grad_norm": 3.1688334941864014,
+ "learning_rate": 4.985015129135531e-06,
+ "loss": 0.7055,
+ "step": 539
+ },
+ {
+ "epoch": 0.2553191489361702,
+ "grad_norm": 3.075042963027954,
+ "learning_rate": 4.984946851847593e-06,
+ "loss": 0.7515,
+ "step": 540
+ },
+ {
+ "epoch": 0.2557919621749409,
+ "grad_norm": 3.1933093070983887,
+ "learning_rate": 4.98487841983291e-06,
+ "loss": 0.7054,
+ "step": 541
+ },
+ {
+ "epoch": 0.25626477541371157,
+ "grad_norm": 3.043473958969116,
+ "learning_rate": 4.984809833095744e-06,
+ "loss": 0.6281,
+ "step": 542
+ },
+ {
+ "epoch": 0.25673758865248225,
+ "grad_norm": 3.0532584190368652,
+ "learning_rate": 4.9847410916403645e-06,
+ "loss": 0.6155,
+ "step": 543
+ },
+ {
+ "epoch": 0.25721040189125294,
+ "grad_norm": 3.608480215072632,
+ "learning_rate": 4.984672195471053e-06,
+ "loss": 0.7363,
+ "step": 544
+ },
+ {
+ "epoch": 0.2576832151300236,
+ "grad_norm": 2.7491862773895264,
+ "learning_rate": 4.9846031445921e-06,
+ "loss": 0.6594,
+ "step": 545
+ },
+ {
+ "epoch": 0.2581560283687943,
+ "grad_norm": 2.8602418899536133,
+ "learning_rate": 4.984533939007802e-06,
+ "loss": 0.6742,
+ "step": 546
+ },
+ {
+ "epoch": 0.258628841607565,
+ "grad_norm": 3.1782007217407227,
+ "learning_rate": 4.98446457872247e-06,
+ "loss": 0.731,
+ "step": 547
+ },
+ {
+ "epoch": 0.2591016548463357,
+ "grad_norm": 2.796147584915161,
+ "learning_rate": 4.984395063740423e-06,
+ "loss": 0.6617,
+ "step": 548
+ },
+ {
+ "epoch": 0.25957446808510637,
+ "grad_norm": 2.8392202854156494,
+ "learning_rate": 4.984325394065991e-06,
+ "loss": 0.6753,
+ "step": 549
+ },
+ {
+ "epoch": 0.26004728132387706,
+ "grad_norm": 3.134672164916992,
+ "learning_rate": 4.984255569703508e-06,
+ "loss": 0.7222,
+ "step": 550
+ },
+ {
+ "epoch": 0.26052009456264774,
+ "grad_norm": 2.734330177307129,
+ "learning_rate": 4.984185590657325e-06,
+ "loss": 0.6098,
+ "step": 551
+ },
+ {
+ "epoch": 0.26099290780141843,
+ "grad_norm": 3.739010810852051,
+ "learning_rate": 4.984115456931798e-06,
+ "loss": 0.7457,
+ "step": 552
+ },
+ {
+ "epoch": 0.2614657210401891,
+ "grad_norm": 2.8412528038024902,
+ "learning_rate": 4.9840451685312925e-06,
+ "loss": 0.6972,
+ "step": 553
+ },
+ {
+ "epoch": 0.2619385342789598,
+ "grad_norm": 3.017395496368408,
+ "learning_rate": 4.983974725460188e-06,
+ "loss": 0.6887,
+ "step": 554
+ },
+ {
+ "epoch": 0.2624113475177305,
+ "grad_norm": 3.2746949195861816,
+ "learning_rate": 4.98390412772287e-06,
+ "loss": 0.7047,
+ "step": 555
+ },
+ {
+ "epoch": 0.2628841607565012,
+ "grad_norm": 3.1561965942382812,
+ "learning_rate": 4.983833375323732e-06,
+ "loss": 0.7726,
+ "step": 556
+ },
+ {
+ "epoch": 0.26335697399527186,
+ "grad_norm": 3.2367217540740967,
+ "learning_rate": 4.9837624682671816e-06,
+ "loss": 0.6348,
+ "step": 557
+ },
+ {
+ "epoch": 0.26382978723404255,
+ "grad_norm": 2.8195858001708984,
+ "learning_rate": 4.983691406557633e-06,
+ "loss": 0.6387,
+ "step": 558
+ },
+ {
+ "epoch": 0.26430260047281323,
+ "grad_norm": 3.349820852279663,
+ "learning_rate": 4.983620190199511e-06,
+ "loss": 0.6776,
+ "step": 559
+ },
+ {
+ "epoch": 0.2647754137115839,
+ "grad_norm": 2.8025588989257812,
+ "learning_rate": 4.98354881919725e-06,
+ "loss": 0.6512,
+ "step": 560
+ },
+ {
+ "epoch": 0.2652482269503546,
+ "grad_norm": 2.9125499725341797,
+ "learning_rate": 4.983477293555295e-06,
+ "loss": 0.7024,
+ "step": 561
+ },
+ {
+ "epoch": 0.2657210401891253,
+ "grad_norm": 3.3479275703430176,
+ "learning_rate": 4.983405613278098e-06,
+ "loss": 0.688,
+ "step": 562
+ },
+ {
+ "epoch": 0.266193853427896,
+ "grad_norm": 3.123971462249756,
+ "learning_rate": 4.983333778370123e-06,
+ "loss": 0.6743,
+ "step": 563
+ },
+ {
+ "epoch": 0.26666666666666666,
+ "grad_norm": 2.891625165939331,
+ "learning_rate": 4.983261788835843e-06,
+ "loss": 0.5971,
+ "step": 564
+ },
+ {
+ "epoch": 0.26713947990543735,
+ "grad_norm": 3.5066864490509033,
+ "learning_rate": 4.98318964467974e-06,
+ "loss": 0.6958,
+ "step": 565
+ },
+ {
+ "epoch": 0.26761229314420804,
+ "grad_norm": 2.570547342300415,
+ "learning_rate": 4.983117345906306e-06,
+ "loss": 0.609,
+ "step": 566
+ },
+ {
+ "epoch": 0.2680851063829787,
+ "grad_norm": 3.005106210708618,
+ "learning_rate": 4.983044892520044e-06,
+ "loss": 0.6791,
+ "step": 567
+ },
+ {
+ "epoch": 0.2685579196217494,
+ "grad_norm": 3.429675340652466,
+ "learning_rate": 4.982972284525463e-06,
+ "loss": 0.6625,
+ "step": 568
+ },
+ {
+ "epoch": 0.2690307328605201,
+ "grad_norm": 3.825657367706299,
+ "learning_rate": 4.982899521927086e-06,
+ "loss": 0.6368,
+ "step": 569
+ },
+ {
+ "epoch": 0.2695035460992908,
+ "grad_norm": 2.8699095249176025,
+ "learning_rate": 4.982826604729443e-06,
+ "loss": 0.6425,
+ "step": 570
+ },
+ {
+ "epoch": 0.26997635933806147,
+ "grad_norm": 3.1688714027404785,
+ "learning_rate": 4.982753532937074e-06,
+ "loss": 0.6904,
+ "step": 571
+ },
+ {
+ "epoch": 0.27044917257683215,
+ "grad_norm": 3.3889992237091064,
+ "learning_rate": 4.98268030655453e-06,
+ "loss": 0.7575,
+ "step": 572
+ },
+ {
+ "epoch": 0.27092198581560284,
+ "grad_norm": 3.108315944671631,
+ "learning_rate": 4.982606925586367e-06,
+ "loss": 0.6648,
+ "step": 573
+ },
+ {
+ "epoch": 0.2713947990543735,
+ "grad_norm": 3.209831953048706,
+ "learning_rate": 4.982533390037159e-06,
+ "loss": 0.657,
+ "step": 574
+ },
+ {
+ "epoch": 0.2718676122931442,
+ "grad_norm": 3.1740927696228027,
+ "learning_rate": 4.982459699911482e-06,
+ "loss": 0.7262,
+ "step": 575
+ },
+ {
+ "epoch": 0.2723404255319149,
+ "grad_norm": 3.0190417766571045,
+ "learning_rate": 4.982385855213924e-06,
+ "loss": 0.6368,
+ "step": 576
+ },
+ {
+ "epoch": 0.2728132387706856,
+ "grad_norm": 3.05049467086792,
+ "learning_rate": 4.982311855949084e-06,
+ "loss": 0.72,
+ "step": 577
+ },
+ {
+ "epoch": 0.27328605200945627,
+ "grad_norm": 2.984816551208496,
+ "learning_rate": 4.98223770212157e-06,
+ "loss": 0.6856,
+ "step": 578
+ },
+ {
+ "epoch": 0.27375886524822696,
+ "grad_norm": 2.744969606399536,
+ "learning_rate": 4.982163393735998e-06,
+ "loss": 0.6023,
+ "step": 579
+ },
+ {
+ "epoch": 0.27423167848699764,
+ "grad_norm": 3.170564889907837,
+ "learning_rate": 4.982088930796996e-06,
+ "loss": 0.6678,
+ "step": 580
+ },
+ {
+ "epoch": 0.27470449172576833,
+ "grad_norm": 2.8686118125915527,
+ "learning_rate": 4.982014313309199e-06,
+ "loss": 0.6157,
+ "step": 581
+ },
+ {
+ "epoch": 0.275177304964539,
+ "grad_norm": 2.8768694400787354,
+ "learning_rate": 4.981939541277254e-06,
+ "loss": 0.6566,
+ "step": 582
+ },
+ {
+ "epoch": 0.2756501182033097,
+ "grad_norm": 2.621481418609619,
+ "learning_rate": 4.981864614705818e-06,
+ "loss": 0.7372,
+ "step": 583
+ },
+ {
+ "epoch": 0.2761229314420804,
+ "grad_norm": 3.527374267578125,
+ "learning_rate": 4.981789533599554e-06,
+ "loss": 0.6485,
+ "step": 584
+ },
+ {
+ "epoch": 0.2765957446808511,
+ "grad_norm": 3.3141074180603027,
+ "learning_rate": 4.981714297963138e-06,
+ "loss": 0.6816,
+ "step": 585
+ },
+ {
+ "epoch": 0.27706855791962176,
+ "grad_norm": 2.9247069358825684,
+ "learning_rate": 4.981638907801255e-06,
+ "loss": 0.7217,
+ "step": 586
+ },
+ {
+ "epoch": 0.27754137115839245,
+ "grad_norm": 2.875236749649048,
+ "learning_rate": 4.981563363118599e-06,
+ "loss": 0.6662,
+ "step": 587
+ },
+ {
+ "epoch": 0.27801418439716313,
+ "grad_norm": 2.9540364742279053,
+ "learning_rate": 4.981487663919874e-06,
+ "loss": 0.7225,
+ "step": 588
+ },
+ {
+ "epoch": 0.2784869976359338,
+ "grad_norm": 2.90889310836792,
+ "learning_rate": 4.981411810209793e-06,
+ "loss": 0.6054,
+ "step": 589
+ },
+ {
+ "epoch": 0.2789598108747045,
+ "grad_norm": 2.8541409969329834,
+ "learning_rate": 4.981335801993078e-06,
+ "loss": 0.6539,
+ "step": 590
+ },
+ {
+ "epoch": 0.2794326241134752,
+ "grad_norm": 3.1600730419158936,
+ "learning_rate": 4.981259639274465e-06,
+ "loss": 0.6415,
+ "step": 591
+ },
+ {
+ "epoch": 0.2799054373522459,
+ "grad_norm": 3.569376230239868,
+ "learning_rate": 4.981183322058693e-06,
+ "loss": 0.6944,
+ "step": 592
+ },
+ {
+ "epoch": 0.28037825059101656,
+ "grad_norm": 3.067667007446289,
+ "learning_rate": 4.981106850350515e-06,
+ "loss": 0.7378,
+ "step": 593
+ },
+ {
+ "epoch": 0.28085106382978725,
+ "grad_norm": 3.082073450088501,
+ "learning_rate": 4.981030224154693e-06,
+ "loss": 0.693,
+ "step": 594
+ },
+ {
+ "epoch": 0.28132387706855794,
+ "grad_norm": 2.902932643890381,
+ "learning_rate": 4.980953443475998e-06,
+ "loss": 0.6549,
+ "step": 595
+ },
+ {
+ "epoch": 0.2817966903073286,
+ "grad_norm": 2.6821181774139404,
+ "learning_rate": 4.980876508319211e-06,
+ "loss": 0.6231,
+ "step": 596
+ },
+ {
+ "epoch": 0.2822695035460993,
+ "grad_norm": 3.1747355461120605,
+ "learning_rate": 4.9807994186891215e-06,
+ "loss": 0.6826,
+ "step": 597
+ },
+ {
+ "epoch": 0.28274231678487,
+ "grad_norm": 2.6975860595703125,
+ "learning_rate": 4.980722174590531e-06,
+ "loss": 0.6669,
+ "step": 598
+ },
+ {
+ "epoch": 0.2832151300236407,
+ "grad_norm": 2.924285650253296,
+ "learning_rate": 4.9806447760282486e-06,
+ "loss": 0.689,
+ "step": 599
+ },
+ {
+ "epoch": 0.28368794326241137,
+ "grad_norm": 2.941417694091797,
+ "learning_rate": 4.980567223007093e-06,
+ "loss": 0.6672,
+ "step": 600
+ },
+ {
+ "epoch": 0.28416075650118205,
+ "grad_norm": 2.8582186698913574,
+ "learning_rate": 4.980489515531892e-06,
+ "loss": 0.6229,
+ "step": 601
+ },
+ {
+ "epoch": 0.28463356973995274,
+ "grad_norm": 2.6462013721466064,
+ "learning_rate": 4.9804116536074865e-06,
+ "loss": 0.606,
+ "step": 602
+ },
+ {
+ "epoch": 0.2851063829787234,
+ "grad_norm": 2.9029998779296875,
+ "learning_rate": 4.980333637238723e-06,
+ "loss": 0.5915,
+ "step": 603
+ },
+ {
+ "epoch": 0.2855791962174941,
+ "grad_norm": 3.9359042644500732,
+ "learning_rate": 4.980255466430462e-06,
+ "loss": 0.7035,
+ "step": 604
+ },
+ {
+ "epoch": 0.2860520094562648,
+ "grad_norm": 3.200524091720581,
+ "learning_rate": 4.980177141187566e-06,
+ "loss": 0.7156,
+ "step": 605
+ },
+ {
+ "epoch": 0.2865248226950355,
+ "grad_norm": 3.1708686351776123,
+ "learning_rate": 4.980098661514916e-06,
+ "loss": 0.746,
+ "step": 606
+ },
+ {
+ "epoch": 0.28699763593380617,
+ "grad_norm": 2.8926830291748047,
+ "learning_rate": 4.980020027417397e-06,
+ "loss": 0.6282,
+ "step": 607
+ },
+ {
+ "epoch": 0.28747044917257686,
+ "grad_norm": 3.0526294708251953,
+ "learning_rate": 4.979941238899906e-06,
+ "loss": 0.6594,
+ "step": 608
+ },
+ {
+ "epoch": 0.28794326241134754,
+ "grad_norm": 2.9869306087493896,
+ "learning_rate": 4.9798622959673486e-06,
+ "loss": 0.7771,
+ "step": 609
+ },
+ {
+ "epoch": 0.28841607565011823,
+ "grad_norm": 2.7894513607025146,
+ "learning_rate": 4.979783198624638e-06,
+ "loss": 0.6819,
+ "step": 610
+ },
+ {
+ "epoch": 0.28888888888888886,
+ "grad_norm": 2.958575963973999,
+ "learning_rate": 4.9797039468767025e-06,
+ "loss": 0.6474,
+ "step": 611
+ },
+ {
+ "epoch": 0.28936170212765955,
+ "grad_norm": 3.423748016357422,
+ "learning_rate": 4.979624540728475e-06,
+ "loss": 0.7389,
+ "step": 612
+ },
+ {
+ "epoch": 0.28983451536643023,
+ "grad_norm": 2.9641635417938232,
+ "learning_rate": 4.9795449801849e-06,
+ "loss": 0.6005,
+ "step": 613
+ },
+ {
+ "epoch": 0.2903073286052009,
+ "grad_norm": 3.02274227142334,
+ "learning_rate": 4.979465265250933e-06,
+ "loss": 0.6358,
+ "step": 614
+ },
+ {
+ "epoch": 0.2907801418439716,
+ "grad_norm": 3.0562758445739746,
+ "learning_rate": 4.979385395931534e-06,
+ "loss": 0.6313,
+ "step": 615
+ },
+ {
+ "epoch": 0.2912529550827423,
+ "grad_norm": 3.301816701889038,
+ "learning_rate": 4.97930537223168e-06,
+ "loss": 0.7264,
+ "step": 616
+ },
+ {
+ "epoch": 0.291725768321513,
+ "grad_norm": 2.975360870361328,
+ "learning_rate": 4.979225194156351e-06,
+ "loss": 0.613,
+ "step": 617
+ },
+ {
+ "epoch": 0.29219858156028367,
+ "grad_norm": 2.9245030879974365,
+ "learning_rate": 4.97914486171054e-06,
+ "loss": 0.6646,
+ "step": 618
+ },
+ {
+ "epoch": 0.29267139479905435,
+ "grad_norm": 3.1336188316345215,
+ "learning_rate": 4.979064374899249e-06,
+ "loss": 0.6421,
+ "step": 619
+ },
+ {
+ "epoch": 0.29314420803782504,
+ "grad_norm": 3.6298763751983643,
+ "learning_rate": 4.978983733727491e-06,
+ "loss": 0.6433,
+ "step": 620
+ },
+ {
+ "epoch": 0.2936170212765957,
+ "grad_norm": 2.919597625732422,
+ "learning_rate": 4.9789029382002845e-06,
+ "loss": 0.6288,
+ "step": 621
+ },
+ {
+ "epoch": 0.2940898345153664,
+ "grad_norm": 3.2206127643585205,
+ "learning_rate": 4.978821988322662e-06,
+ "loss": 0.7102,
+ "step": 622
+ },
+ {
+ "epoch": 0.2945626477541371,
+ "grad_norm": 3.1767101287841797,
+ "learning_rate": 4.978740884099664e-06,
+ "loss": 0.6722,
+ "step": 623
+ },
+ {
+ "epoch": 0.2950354609929078,
+ "grad_norm": 3.3425452709198,
+ "learning_rate": 4.97865962553634e-06,
+ "loss": 0.6492,
+ "step": 624
+ },
+ {
+ "epoch": 0.29550827423167847,
+ "grad_norm": 3.0408358573913574,
+ "learning_rate": 4.97857821263775e-06,
+ "loss": 0.6522,
+ "step": 625
+ },
+ {
+ "epoch": 0.29598108747044916,
+ "grad_norm": 2.8144783973693848,
+ "learning_rate": 4.978496645408963e-06,
+ "loss": 0.7237,
+ "step": 626
+ },
+ {
+ "epoch": 0.29645390070921984,
+ "grad_norm": 3.7010560035705566,
+ "learning_rate": 4.978414923855057e-06,
+ "loss": 0.7509,
+ "step": 627
+ },
+ {
+ "epoch": 0.29692671394799053,
+ "grad_norm": 2.9438371658325195,
+ "learning_rate": 4.978333047981122e-06,
+ "loss": 0.6244,
+ "step": 628
+ },
+ {
+ "epoch": 0.2973995271867612,
+ "grad_norm": 3.285982370376587,
+ "learning_rate": 4.978251017792255e-06,
+ "loss": 0.7553,
+ "step": 629
+ },
+ {
+ "epoch": 0.2978723404255319,
+ "grad_norm": 3.7021138668060303,
+ "learning_rate": 4.978168833293564e-06,
+ "loss": 0.7859,
+ "step": 630
+ },
+ {
+ "epoch": 0.2983451536643026,
+ "grad_norm": 3.481858730316162,
+ "learning_rate": 4.9780864944901654e-06,
+ "loss": 0.7146,
+ "step": 631
+ },
+ {
+ "epoch": 0.2988179669030733,
+ "grad_norm": 3.693824529647827,
+ "learning_rate": 4.978004001387188e-06,
+ "loss": 0.6608,
+ "step": 632
+ },
+ {
+ "epoch": 0.29929078014184396,
+ "grad_norm": 3.0069146156311035,
+ "learning_rate": 4.9779213539897665e-06,
+ "loss": 0.6506,
+ "step": 633
+ },
+ {
+ "epoch": 0.29976359338061465,
+ "grad_norm": 3.037644147872925,
+ "learning_rate": 4.977838552303048e-06,
+ "loss": 0.6487,
+ "step": 634
+ },
+ {
+ "epoch": 0.30023640661938533,
+ "grad_norm": 3.018554449081421,
+ "learning_rate": 4.977755596332188e-06,
+ "loss": 0.6128,
+ "step": 635
+ },
+ {
+ "epoch": 0.300709219858156,
+ "grad_norm": 3.000312089920044,
+ "learning_rate": 4.977672486082351e-06,
+ "loss": 0.6431,
+ "step": 636
+ },
+ {
+ "epoch": 0.3011820330969267,
+ "grad_norm": 2.836803913116455,
+ "learning_rate": 4.977589221558713e-06,
+ "loss": 0.5914,
+ "step": 637
+ },
+ {
+ "epoch": 0.3016548463356974,
+ "grad_norm": 3.080469846725464,
+ "learning_rate": 4.977505802766457e-06,
+ "loss": 0.7265,
+ "step": 638
+ },
+ {
+ "epoch": 0.3021276595744681,
+ "grad_norm": 3.2245471477508545,
+ "learning_rate": 4.97742222971078e-06,
+ "loss": 0.6895,
+ "step": 639
+ },
+ {
+ "epoch": 0.30260047281323876,
+ "grad_norm": 3.559006452560425,
+ "learning_rate": 4.977338502396882e-06,
+ "loss": 0.7439,
+ "step": 640
+ },
+ {
+ "epoch": 0.30307328605200945,
+ "grad_norm": 2.9116289615631104,
+ "learning_rate": 4.9772546208299795e-06,
+ "loss": 0.6907,
+ "step": 641
+ },
+ {
+ "epoch": 0.30354609929078014,
+ "grad_norm": 3.3645524978637695,
+ "learning_rate": 4.977170585015295e-06,
+ "loss": 0.6983,
+ "step": 642
+ },
+ {
+ "epoch": 0.3040189125295508,
+ "grad_norm": 3.080148458480835,
+ "learning_rate": 4.977086394958058e-06,
+ "loss": 0.7016,
+ "step": 643
+ },
+ {
+ "epoch": 0.3044917257683215,
+ "grad_norm": 2.9276750087738037,
+ "learning_rate": 4.977002050663515e-06,
+ "loss": 0.6509,
+ "step": 644
+ },
+ {
+ "epoch": 0.3049645390070922,
+ "grad_norm": 3.183609962463379,
+ "learning_rate": 4.976917552136914e-06,
+ "loss": 0.6814,
+ "step": 645
+ },
+ {
+ "epoch": 0.3054373522458629,
+ "grad_norm": 3.0980000495910645,
+ "learning_rate": 4.976832899383519e-06,
+ "loss": 0.6319,
+ "step": 646
+ },
+ {
+ "epoch": 0.30591016548463357,
+ "grad_norm": 3.211376190185547,
+ "learning_rate": 4.9767480924086e-06,
+ "loss": 0.6365,
+ "step": 647
+ },
+ {
+ "epoch": 0.30638297872340425,
+ "grad_norm": 3.214430093765259,
+ "learning_rate": 4.976663131217437e-06,
+ "loss": 0.6006,
+ "step": 648
+ },
+ {
+ "epoch": 0.30685579196217494,
+ "grad_norm": 3.0914318561553955,
+ "learning_rate": 4.976578015815321e-06,
+ "loss": 0.7162,
+ "step": 649
+ },
+ {
+ "epoch": 0.3073286052009456,
+ "grad_norm": 2.7644500732421875,
+ "learning_rate": 4.976492746207551e-06,
+ "loss": 0.6045,
+ "step": 650
+ },
+ {
+ "epoch": 0.3078014184397163,
+ "grad_norm": 3.1913280487060547,
+ "learning_rate": 4.9764073223994374e-06,
+ "loss": 0.6796,
+ "step": 651
+ },
+ {
+ "epoch": 0.308274231678487,
+ "grad_norm": 2.8919692039489746,
+ "learning_rate": 4.976321744396299e-06,
+ "loss": 0.6683,
+ "step": 652
+ },
+ {
+ "epoch": 0.3087470449172577,
+ "grad_norm": 2.862234115600586,
+ "learning_rate": 4.976236012203463e-06,
+ "loss": 0.6631,
+ "step": 653
+ },
+ {
+ "epoch": 0.30921985815602837,
+ "grad_norm": 2.9708092212677,
+ "learning_rate": 4.976150125826268e-06,
+ "loss": 0.6326,
+ "step": 654
+ },
+ {
+ "epoch": 0.30969267139479906,
+ "grad_norm": 2.892465353012085,
+ "learning_rate": 4.976064085270063e-06,
+ "loss": 0.6574,
+ "step": 655
+ },
+ {
+ "epoch": 0.31016548463356974,
+ "grad_norm": 3.9215126037597656,
+ "learning_rate": 4.975977890540205e-06,
+ "loss": 0.7351,
+ "step": 656
+ },
+ {
+ "epoch": 0.31063829787234043,
+ "grad_norm": 2.9544081687927246,
+ "learning_rate": 4.975891541642059e-06,
+ "loss": 0.7264,
+ "step": 657
+ },
+ {
+ "epoch": 0.3111111111111111,
+ "grad_norm": 2.995035409927368,
+ "learning_rate": 4.975805038581005e-06,
+ "loss": 0.7405,
+ "step": 658
+ },
+ {
+ "epoch": 0.3115839243498818,
+ "grad_norm": 2.9653120040893555,
+ "learning_rate": 4.975718381362427e-06,
+ "loss": 0.679,
+ "step": 659
+ },
+ {
+ "epoch": 0.3120567375886525,
+ "grad_norm": 2.93976092338562,
+ "learning_rate": 4.9756315699917205e-06,
+ "loss": 0.627,
+ "step": 660
+ },
+ {
+ "epoch": 0.3125295508274232,
+ "grad_norm": 3.106522560119629,
+ "learning_rate": 4.9755446044742915e-06,
+ "loss": 0.6329,
+ "step": 661
+ },
+ {
+ "epoch": 0.31300236406619386,
+ "grad_norm": 3.0238280296325684,
+ "learning_rate": 4.975457484815554e-06,
+ "loss": 0.6643,
+ "step": 662
+ },
+ {
+ "epoch": 0.31347517730496455,
+ "grad_norm": 2.943528175354004,
+ "learning_rate": 4.9753702110209356e-06,
+ "loss": 0.668,
+ "step": 663
+ },
+ {
+ "epoch": 0.31394799054373523,
+ "grad_norm": 2.6840121746063232,
+ "learning_rate": 4.9752827830958676e-06,
+ "loss": 0.5482,
+ "step": 664
+ },
+ {
+ "epoch": 0.3144208037825059,
+ "grad_norm": 2.823875904083252,
+ "learning_rate": 4.975195201045794e-06,
+ "loss": 0.7017,
+ "step": 665
+ },
+ {
+ "epoch": 0.3148936170212766,
+ "grad_norm": 3.148181200027466,
+ "learning_rate": 4.975107464876168e-06,
+ "loss": 0.747,
+ "step": 666
+ },
+ {
+ "epoch": 0.3153664302600473,
+ "grad_norm": 2.630584478378296,
+ "learning_rate": 4.9750195745924545e-06,
+ "loss": 0.5987,
+ "step": 667
+ },
+ {
+ "epoch": 0.315839243498818,
+ "grad_norm": 3.075866460800171,
+ "learning_rate": 4.974931530200124e-06,
+ "loss": 0.664,
+ "step": 668
+ },
+ {
+ "epoch": 0.31631205673758866,
+ "grad_norm": 2.947197914123535,
+ "learning_rate": 4.974843331704659e-06,
+ "loss": 0.631,
+ "step": 669
+ },
+ {
+ "epoch": 0.31678486997635935,
+ "grad_norm": 3.519646644592285,
+ "learning_rate": 4.974754979111552e-06,
+ "loss": 0.7154,
+ "step": 670
+ },
+ {
+ "epoch": 0.31725768321513004,
+ "grad_norm": 2.8687186241149902,
+ "learning_rate": 4.974666472426305e-06,
+ "loss": 0.6366,
+ "step": 671
+ },
+ {
+ "epoch": 0.3177304964539007,
+ "grad_norm": 2.6966612339019775,
+ "learning_rate": 4.974577811654426e-06,
+ "loss": 0.7112,
+ "step": 672
+ },
+ {
+ "epoch": 0.3182033096926714,
+ "grad_norm": 3.1390228271484375,
+ "learning_rate": 4.974488996801439e-06,
+ "loss": 0.6882,
+ "step": 673
+ },
+ {
+ "epoch": 0.3186761229314421,
+ "grad_norm": 3.4667599201202393,
+ "learning_rate": 4.974400027872871e-06,
+ "loss": 0.7153,
+ "step": 674
+ },
+ {
+ "epoch": 0.3191489361702128,
+ "grad_norm": 2.9632184505462646,
+ "learning_rate": 4.974310904874265e-06,
+ "loss": 0.7081,
+ "step": 675
+ },
+ {
+ "epoch": 0.31962174940898347,
+ "grad_norm": 3.46150279045105,
+ "learning_rate": 4.9742216278111666e-06,
+ "loss": 0.6242,
+ "step": 676
+ },
+ {
+ "epoch": 0.32009456264775416,
+ "grad_norm": 3.380403757095337,
+ "learning_rate": 4.974132196689137e-06,
+ "loss": 0.6863,
+ "step": 677
+ },
+ {
+ "epoch": 0.32056737588652484,
+ "grad_norm": 3.4279606342315674,
+ "learning_rate": 4.974042611513746e-06,
+ "loss": 0.6388,
+ "step": 678
+ },
+ {
+ "epoch": 0.3210401891252955,
+ "grad_norm": 2.634523391723633,
+ "learning_rate": 4.973952872290568e-06,
+ "loss": 0.6038,
+ "step": 679
+ },
+ {
+ "epoch": 0.3215130023640662,
+ "grad_norm": 3.19693922996521,
+ "learning_rate": 4.973862979025194e-06,
+ "loss": 0.6383,
+ "step": 680
+ },
+ {
+ "epoch": 0.3219858156028369,
+ "grad_norm": 3.437692165374756,
+ "learning_rate": 4.973772931723218e-06,
+ "loss": 0.7288,
+ "step": 681
+ },
+ {
+ "epoch": 0.3224586288416076,
+ "grad_norm": 2.506301164627075,
+ "learning_rate": 4.97368273039025e-06,
+ "loss": 0.5707,
+ "step": 682
+ },
+ {
+ "epoch": 0.3229314420803783,
+ "grad_norm": 3.0942845344543457,
+ "learning_rate": 4.9735923750319044e-06,
+ "loss": 0.6348,
+ "step": 683
+ },
+ {
+ "epoch": 0.32340425531914896,
+ "grad_norm": 3.0889835357666016,
+ "learning_rate": 4.973501865653809e-06,
+ "loss": 0.6697,
+ "step": 684
+ },
+ {
+ "epoch": 0.32387706855791965,
+ "grad_norm": 3.0391931533813477,
+ "learning_rate": 4.973411202261598e-06,
+ "loss": 0.7091,
+ "step": 685
+ },
+ {
+ "epoch": 0.32434988179669033,
+ "grad_norm": 3.0333497524261475,
+ "learning_rate": 4.973320384860917e-06,
+ "loss": 0.6403,
+ "step": 686
+ },
+ {
+ "epoch": 0.324822695035461,
+ "grad_norm": 2.9714622497558594,
+ "learning_rate": 4.973229413457421e-06,
+ "loss": 0.6977,
+ "step": 687
+ },
+ {
+ "epoch": 0.3252955082742317,
+ "grad_norm": 3.057558298110962,
+ "learning_rate": 4.973138288056774e-06,
+ "loss": 0.7236,
+ "step": 688
+ },
+ {
+ "epoch": 0.3257683215130024,
+ "grad_norm": 2.921093463897705,
+ "learning_rate": 4.97304700866465e-06,
+ "loss": 0.576,
+ "step": 689
+ },
+ {
+ "epoch": 0.3262411347517731,
+ "grad_norm": 3.0287256240844727,
+ "learning_rate": 4.972955575286732e-06,
+ "loss": 0.7077,
+ "step": 690
+ },
+ {
+ "epoch": 0.32671394799054376,
+ "grad_norm": 2.8621346950531006,
+ "learning_rate": 4.972863987928716e-06,
+ "loss": 0.6952,
+ "step": 691
+ },
+ {
+ "epoch": 0.3271867612293144,
+ "grad_norm": 2.631359100341797,
+ "learning_rate": 4.9727722465963006e-06,
+ "loss": 0.6931,
+ "step": 692
+ },
+ {
+ "epoch": 0.3276595744680851,
+ "grad_norm": 2.8484320640563965,
+ "learning_rate": 4.972680351295201e-06,
+ "loss": 0.6292,
+ "step": 693
+ },
+ {
+ "epoch": 0.32813238770685577,
+ "grad_norm": 2.593001365661621,
+ "learning_rate": 4.972588302031138e-06,
+ "loss": 0.5942,
+ "step": 694
+ },
+ {
+ "epoch": 0.32860520094562645,
+ "grad_norm": 2.6321065425872803,
+ "learning_rate": 4.972496098809844e-06,
+ "loss": 0.65,
+ "step": 695
+ },
+ {
+ "epoch": 0.32907801418439714,
+ "grad_norm": 3.2516732215881348,
+ "learning_rate": 4.972403741637059e-06,
+ "loss": 0.7385,
+ "step": 696
+ },
+ {
+ "epoch": 0.3295508274231678,
+ "grad_norm": 3.180854320526123,
+ "learning_rate": 4.972311230518535e-06,
+ "loss": 0.6569,
+ "step": 697
+ },
+ {
+ "epoch": 0.3300236406619385,
+ "grad_norm": 4.161016941070557,
+ "learning_rate": 4.972218565460031e-06,
+ "loss": 0.6416,
+ "step": 698
+ },
+ {
+ "epoch": 0.3304964539007092,
+ "grad_norm": 3.153897762298584,
+ "learning_rate": 4.972125746467317e-06,
+ "loss": 0.7196,
+ "step": 699
+ },
+ {
+ "epoch": 0.3309692671394799,
+ "grad_norm": 2.9595556259155273,
+ "learning_rate": 4.972032773546173e-06,
+ "loss": 0.7093,
+ "step": 700
+ },
+ {
+ "epoch": 0.33144208037825057,
+ "grad_norm": 3.1086833477020264,
+ "learning_rate": 4.9719396467023875e-06,
+ "loss": 0.6963,
+ "step": 701
+ },
+ {
+ "epoch": 0.33191489361702126,
+ "grad_norm": 2.958921432495117,
+ "learning_rate": 4.971846365941759e-06,
+ "loss": 0.6518,
+ "step": 702
+ },
+ {
+ "epoch": 0.33238770685579194,
+ "grad_norm": 2.8745479583740234,
+ "learning_rate": 4.971752931270096e-06,
+ "loss": 0.696,
+ "step": 703
+ },
+ {
+ "epoch": 0.33286052009456263,
+ "grad_norm": 3.224358558654785,
+ "learning_rate": 4.971659342693217e-06,
+ "loss": 0.6769,
+ "step": 704
+ },
+ {
+ "epoch": 0.3333333333333333,
+ "grad_norm": 2.696319580078125,
+ "learning_rate": 4.9715656002169486e-06,
+ "loss": 0.6833,
+ "step": 705
+ },
+ {
+ "epoch": 0.333806146572104,
+ "grad_norm": 2.9283502101898193,
+ "learning_rate": 4.971471703847127e-06,
+ "loss": 0.6784,
+ "step": 706
+ },
+ {
+ "epoch": 0.3342789598108747,
+ "grad_norm": 2.654914140701294,
+ "learning_rate": 4.9713776535896e-06,
+ "loss": 0.6337,
+ "step": 707
+ },
+ {
+ "epoch": 0.3347517730496454,
+ "grad_norm": 3.041555643081665,
+ "learning_rate": 4.971283449450224e-06,
+ "loss": 0.6227,
+ "step": 708
+ },
+ {
+ "epoch": 0.33522458628841606,
+ "grad_norm": 2.893008232116699,
+ "learning_rate": 4.971189091434863e-06,
+ "loss": 0.655,
+ "step": 709
+ },
+ {
+ "epoch": 0.33569739952718675,
+ "grad_norm": 2.8806653022766113,
+ "learning_rate": 4.971094579549393e-06,
+ "loss": 0.7077,
+ "step": 710
+ },
+ {
+ "epoch": 0.33617021276595743,
+ "grad_norm": 3.4830048084259033,
+ "learning_rate": 4.9709999137996986e-06,
+ "loss": 0.7461,
+ "step": 711
+ },
+ {
+ "epoch": 0.3366430260047281,
+ "grad_norm": 3.155444860458374,
+ "learning_rate": 4.970905094191674e-06,
+ "loss": 0.652,
+ "step": 712
+ },
+ {
+ "epoch": 0.3371158392434988,
+ "grad_norm": 2.7608706951141357,
+ "learning_rate": 4.970810120731225e-06,
+ "loss": 0.684,
+ "step": 713
+ },
+ {
+ "epoch": 0.3375886524822695,
+ "grad_norm": 2.8209474086761475,
+ "learning_rate": 4.970714993424265e-06,
+ "loss": 0.6009,
+ "step": 714
+ },
+ {
+ "epoch": 0.3380614657210402,
+ "grad_norm": 3.6532654762268066,
+ "learning_rate": 4.9706197122767145e-06,
+ "loss": 0.702,
+ "step": 715
+ },
+ {
+ "epoch": 0.33853427895981086,
+ "grad_norm": 2.6276566982269287,
+ "learning_rate": 4.970524277294508e-06,
+ "loss": 0.6338,
+ "step": 716
+ },
+ {
+ "epoch": 0.33900709219858155,
+ "grad_norm": 3.509871482849121,
+ "learning_rate": 4.970428688483589e-06,
+ "loss": 0.6853,
+ "step": 717
+ },
+ {
+ "epoch": 0.33947990543735224,
+ "grad_norm": 5.332682132720947,
+ "learning_rate": 4.970332945849906e-06,
+ "loss": 0.6684,
+ "step": 718
+ },
+ {
+ "epoch": 0.3399527186761229,
+ "grad_norm": 2.718801975250244,
+ "learning_rate": 4.970237049399424e-06,
+ "loss": 0.6676,
+ "step": 719
+ },
+ {
+ "epoch": 0.3404255319148936,
+ "grad_norm": 3.891003131866455,
+ "learning_rate": 4.970140999138112e-06,
+ "loss": 0.7043,
+ "step": 720
+ },
+ {
+ "epoch": 0.3408983451536643,
+ "grad_norm": 2.8863155841827393,
+ "learning_rate": 4.970044795071951e-06,
+ "loss": 0.6563,
+ "step": 721
+ },
+ {
+ "epoch": 0.341371158392435,
+ "grad_norm": 3.2527518272399902,
+ "learning_rate": 4.969948437206932e-06,
+ "loss": 0.7244,
+ "step": 722
+ },
+ {
+ "epoch": 0.34184397163120567,
+ "grad_norm": 2.9726758003234863,
+ "learning_rate": 4.969851925549054e-06,
+ "loss": 0.6548,
+ "step": 723
+ },
+ {
+ "epoch": 0.34231678486997635,
+ "grad_norm": 3.118309497833252,
+ "learning_rate": 4.969755260104327e-06,
+ "loss": 0.7293,
+ "step": 724
+ },
+ {
+ "epoch": 0.34278959810874704,
+ "grad_norm": 3.373068332672119,
+ "learning_rate": 4.969658440878769e-06,
+ "loss": 0.6444,
+ "step": 725
+ },
+ {
+ "epoch": 0.3432624113475177,
+ "grad_norm": 2.7157437801361084,
+ "learning_rate": 4.969561467878409e-06,
+ "loss": 0.642,
+ "step": 726
+ },
+ {
+ "epoch": 0.3437352245862884,
+ "grad_norm": 2.58929705619812,
+ "learning_rate": 4.969464341109285e-06,
+ "loss": 0.6165,
+ "step": 727
+ },
+ {
+ "epoch": 0.3442080378250591,
+ "grad_norm": 2.8811306953430176,
+ "learning_rate": 4.969367060577445e-06,
+ "loss": 0.7127,
+ "step": 728
+ },
+ {
+ "epoch": 0.3446808510638298,
+ "grad_norm": 3.494358539581299,
+ "learning_rate": 4.969269626288946e-06,
+ "loss": 0.7103,
+ "step": 729
+ },
+ {
+ "epoch": 0.34515366430260047,
+ "grad_norm": 2.9753928184509277,
+ "learning_rate": 4.969172038249855e-06,
+ "loss": 0.6911,
+ "step": 730
+ },
+ {
+ "epoch": 0.34562647754137116,
+ "grad_norm": 3.2885913848876953,
+ "learning_rate": 4.969074296466247e-06,
+ "loss": 0.6968,
+ "step": 731
+ },
+ {
+ "epoch": 0.34609929078014184,
+ "grad_norm": 2.7564568519592285,
+ "learning_rate": 4.968976400944211e-06,
+ "loss": 0.6843,
+ "step": 732
+ },
+ {
+ "epoch": 0.34657210401891253,
+ "grad_norm": 2.9255006313323975,
+ "learning_rate": 4.96887835168984e-06,
+ "loss": 0.6024,
+ "step": 733
+ },
+ {
+ "epoch": 0.3470449172576832,
+ "grad_norm": 3.1808290481567383,
+ "learning_rate": 4.968780148709239e-06,
+ "loss": 0.7377,
+ "step": 734
+ },
+ {
+ "epoch": 0.3475177304964539,
+ "grad_norm": 2.956666946411133,
+ "learning_rate": 4.968681792008523e-06,
+ "loss": 0.65,
+ "step": 735
+ },
+ {
+ "epoch": 0.3479905437352246,
+ "grad_norm": 2.9631855487823486,
+ "learning_rate": 4.9685832815938175e-06,
+ "loss": 0.677,
+ "step": 736
+ },
+ {
+ "epoch": 0.3484633569739953,
+ "grad_norm": 2.501917600631714,
+ "learning_rate": 4.968484617471256e-06,
+ "loss": 0.6282,
+ "step": 737
+ },
+ {
+ "epoch": 0.34893617021276596,
+ "grad_norm": 2.750779628753662,
+ "learning_rate": 4.968385799646981e-06,
+ "loss": 0.6507,
+ "step": 738
+ },
+ {
+ "epoch": 0.34940898345153665,
+ "grad_norm": 2.872300624847412,
+ "learning_rate": 4.968286828127146e-06,
+ "loss": 0.5949,
+ "step": 739
+ },
+ {
+ "epoch": 0.34988179669030733,
+ "grad_norm": 2.6316142082214355,
+ "learning_rate": 4.9681877029179124e-06,
+ "loss": 0.6328,
+ "step": 740
+ },
+ {
+ "epoch": 0.350354609929078,
+ "grad_norm": 3.244364023208618,
+ "learning_rate": 4.968088424025454e-06,
+ "loss": 0.7393,
+ "step": 741
+ },
+ {
+ "epoch": 0.3508274231678487,
+ "grad_norm": 2.620465040206909,
+ "learning_rate": 4.967988991455951e-06,
+ "loss": 0.6797,
+ "step": 742
+ },
+ {
+ "epoch": 0.3513002364066194,
+ "grad_norm": 2.854513645172119,
+ "learning_rate": 4.967889405215596e-06,
+ "loss": 0.6368,
+ "step": 743
+ },
+ {
+ "epoch": 0.3517730496453901,
+ "grad_norm": 2.579854726791382,
+ "learning_rate": 4.9677896653105886e-06,
+ "loss": 0.6489,
+ "step": 744
+ },
+ {
+ "epoch": 0.35224586288416077,
+ "grad_norm": 3.0697381496429443,
+ "learning_rate": 4.96768977174714e-06,
+ "loss": 0.6313,
+ "step": 745
+ },
+ {
+ "epoch": 0.35271867612293145,
+ "grad_norm": 3.369338035583496,
+ "learning_rate": 4.96758972453147e-06,
+ "loss": 0.7416,
+ "step": 746
+ },
+ {
+ "epoch": 0.35319148936170214,
+ "grad_norm": 2.836221933364868,
+ "learning_rate": 4.967489523669807e-06,
+ "loss": 0.6422,
+ "step": 747
+ },
+ {
+ "epoch": 0.3536643026004728,
+ "grad_norm": 2.929579496383667,
+ "learning_rate": 4.967389169168392e-06,
+ "loss": 0.6482,
+ "step": 748
+ },
+ {
+ "epoch": 0.3541371158392435,
+ "grad_norm": 2.9243831634521484,
+ "learning_rate": 4.967288661033472e-06,
+ "loss": 0.5813,
+ "step": 749
+ },
+ {
+ "epoch": 0.3546099290780142,
+ "grad_norm": 3.7555336952209473,
+ "learning_rate": 4.967187999271306e-06,
+ "loss": 0.6501,
+ "step": 750
+ },
+ {
+ "epoch": 0.3550827423167849,
+ "grad_norm": 3.4279143810272217,
+ "learning_rate": 4.9670871838881615e-06,
+ "loss": 0.6326,
+ "step": 751
+ },
+ {
+ "epoch": 0.35555555555555557,
+ "grad_norm": 2.875066041946411,
+ "learning_rate": 4.9669862148903166e-06,
+ "loss": 0.664,
+ "step": 752
+ },
+ {
+ "epoch": 0.35602836879432626,
+ "grad_norm": 3.130394697189331,
+ "learning_rate": 4.966885092284057e-06,
+ "loss": 0.706,
+ "step": 753
+ },
+ {
+ "epoch": 0.35650118203309694,
+ "grad_norm": 2.9606287479400635,
+ "learning_rate": 4.96678381607568e-06,
+ "loss": 0.693,
+ "step": 754
+ },
+ {
+ "epoch": 0.35697399527186763,
+ "grad_norm": 3.0584909915924072,
+ "learning_rate": 4.966682386271491e-06,
+ "loss": 0.6034,
+ "step": 755
+ },
+ {
+ "epoch": 0.3574468085106383,
+ "grad_norm": 2.8215200901031494,
+ "learning_rate": 4.966580802877805e-06,
+ "loss": 0.6217,
+ "step": 756
+ },
+ {
+ "epoch": 0.357919621749409,
+ "grad_norm": 2.7348055839538574,
+ "learning_rate": 4.966479065900949e-06,
+ "loss": 0.6194,
+ "step": 757
+ },
+ {
+ "epoch": 0.3583924349881797,
+ "grad_norm": 3.2347466945648193,
+ "learning_rate": 4.966377175347257e-06,
+ "loss": 0.6377,
+ "step": 758
+ },
+ {
+ "epoch": 0.3588652482269504,
+ "grad_norm": 3.311845302581787,
+ "learning_rate": 4.966275131223072e-06,
+ "loss": 0.6234,
+ "step": 759
+ },
+ {
+ "epoch": 0.35933806146572106,
+ "grad_norm": 3.0384368896484375,
+ "learning_rate": 4.96617293353475e-06,
+ "loss": 0.609,
+ "step": 760
+ },
+ {
+ "epoch": 0.35981087470449175,
+ "grad_norm": 3.516854763031006,
+ "learning_rate": 4.966070582288653e-06,
+ "loss": 0.6627,
+ "step": 761
+ },
+ {
+ "epoch": 0.36028368794326243,
+ "grad_norm": 3.2425215244293213,
+ "learning_rate": 4.9659680774911534e-06,
+ "loss": 0.7355,
+ "step": 762
+ },
+ {
+ "epoch": 0.3607565011820331,
+ "grad_norm": 3.2665750980377197,
+ "learning_rate": 4.965865419148636e-06,
+ "loss": 0.6787,
+ "step": 763
+ },
+ {
+ "epoch": 0.3612293144208038,
+ "grad_norm": 2.729428291320801,
+ "learning_rate": 4.96576260726749e-06,
+ "loss": 0.6272,
+ "step": 764
+ },
+ {
+ "epoch": 0.3617021276595745,
+ "grad_norm": 3.299969434738159,
+ "learning_rate": 4.965659641854119e-06,
+ "loss": 0.6552,
+ "step": 765
+ },
+ {
+ "epoch": 0.3621749408983452,
+ "grad_norm": 2.7090916633605957,
+ "learning_rate": 4.965556522914934e-06,
+ "loss": 0.6661,
+ "step": 766
+ },
+ {
+ "epoch": 0.36264775413711586,
+ "grad_norm": 2.488846778869629,
+ "learning_rate": 4.965453250456355e-06,
+ "loss": 0.5821,
+ "step": 767
+ },
+ {
+ "epoch": 0.36312056737588655,
+ "grad_norm": 2.5267233848571777,
+ "learning_rate": 4.965349824484813e-06,
+ "loss": 0.5593,
+ "step": 768
+ },
+ {
+ "epoch": 0.36359338061465724,
+ "grad_norm": 3.0646679401397705,
+ "learning_rate": 4.965246245006748e-06,
+ "loss": 0.6341,
+ "step": 769
+ },
+ {
+ "epoch": 0.3640661938534279,
+ "grad_norm": 2.9877712726593018,
+ "learning_rate": 4.965142512028609e-06,
+ "loss": 0.7202,
+ "step": 770
+ },
+ {
+ "epoch": 0.3645390070921986,
+ "grad_norm": 3.7494113445281982,
+ "learning_rate": 4.965038625556854e-06,
+ "loss": 0.7643,
+ "step": 771
+ },
+ {
+ "epoch": 0.3650118203309693,
+ "grad_norm": 2.8382890224456787,
+ "learning_rate": 4.964934585597954e-06,
+ "loss": 0.6522,
+ "step": 772
+ },
+ {
+ "epoch": 0.3654846335697399,
+ "grad_norm": 3.091655731201172,
+ "learning_rate": 4.9648303921583854e-06,
+ "loss": 0.7117,
+ "step": 773
+ },
+ {
+ "epoch": 0.3659574468085106,
+ "grad_norm": 3.0608325004577637,
+ "learning_rate": 4.964726045244635e-06,
+ "loss": 0.6538,
+ "step": 774
+ },
+ {
+ "epoch": 0.3664302600472813,
+ "grad_norm": 2.8492867946624756,
+ "learning_rate": 4.964621544863203e-06,
+ "loss": 0.6079,
+ "step": 775
+ },
+ {
+ "epoch": 0.366903073286052,
+ "grad_norm": 3.0669894218444824,
+ "learning_rate": 4.964516891020594e-06,
+ "loss": 0.6223,
+ "step": 776
+ },
+ {
+ "epoch": 0.36737588652482267,
+ "grad_norm": 3.089984893798828,
+ "learning_rate": 4.964412083723325e-06,
+ "loss": 0.671,
+ "step": 777
+ },
+ {
+ "epoch": 0.36784869976359336,
+ "grad_norm": 2.905242443084717,
+ "learning_rate": 4.964307122977921e-06,
+ "loss": 0.62,
+ "step": 778
+ },
+ {
+ "epoch": 0.36832151300236404,
+ "grad_norm": 3.954436779022217,
+ "learning_rate": 4.964202008790918e-06,
+ "loss": 0.6535,
+ "step": 779
+ },
+ {
+ "epoch": 0.36879432624113473,
+ "grad_norm": 2.6026058197021484,
+ "learning_rate": 4.9640967411688615e-06,
+ "loss": 0.5865,
+ "step": 780
+ },
+ {
+ "epoch": 0.3692671394799054,
+ "grad_norm": 2.9876346588134766,
+ "learning_rate": 4.963991320118306e-06,
+ "loss": 0.6698,
+ "step": 781
+ },
+ {
+ "epoch": 0.3697399527186761,
+ "grad_norm": 2.9411263465881348,
+ "learning_rate": 4.963885745645815e-06,
+ "loss": 0.6173,
+ "step": 782
+ },
+ {
+ "epoch": 0.3702127659574468,
+ "grad_norm": 2.5679805278778076,
+ "learning_rate": 4.963780017757962e-06,
+ "loss": 0.6285,
+ "step": 783
+ },
+ {
+ "epoch": 0.3706855791962175,
+ "grad_norm": 3.3100640773773193,
+ "learning_rate": 4.963674136461332e-06,
+ "loss": 0.5968,
+ "step": 784
+ },
+ {
+ "epoch": 0.37115839243498816,
+ "grad_norm": 3.1293699741363525,
+ "learning_rate": 4.963568101762515e-06,
+ "loss": 0.697,
+ "step": 785
+ },
+ {
+ "epoch": 0.37163120567375885,
+ "grad_norm": 3.043853759765625,
+ "learning_rate": 4.963461913668115e-06,
+ "loss": 0.5881,
+ "step": 786
+ },
+ {
+ "epoch": 0.37210401891252953,
+ "grad_norm": 3.07351016998291,
+ "learning_rate": 4.963355572184744e-06,
+ "loss": 0.6307,
+ "step": 787
+ },
+ {
+ "epoch": 0.3725768321513002,
+ "grad_norm": 2.7381317615509033,
+ "learning_rate": 4.9632490773190225e-06,
+ "loss": 0.716,
+ "step": 788
+ },
+ {
+ "epoch": 0.3730496453900709,
+ "grad_norm": 2.892221450805664,
+ "learning_rate": 4.963142429077582e-06,
+ "loss": 0.6867,
+ "step": 789
+ },
+ {
+ "epoch": 0.3735224586288416,
+ "grad_norm": 3.133122205734253,
+ "learning_rate": 4.963035627467064e-06,
+ "loss": 0.659,
+ "step": 790
+ },
+ {
+ "epoch": 0.3739952718676123,
+ "grad_norm": 3.032599925994873,
+ "learning_rate": 4.962928672494116e-06,
+ "loss": 0.6848,
+ "step": 791
+ },
+ {
+ "epoch": 0.37446808510638296,
+ "grad_norm": 3.0076355934143066,
+ "learning_rate": 4.9628215641654e-06,
+ "loss": 0.6549,
+ "step": 792
+ },
+ {
+ "epoch": 0.37494089834515365,
+ "grad_norm": 2.8904454708099365,
+ "learning_rate": 4.962714302487585e-06,
+ "loss": 0.6484,
+ "step": 793
+ },
+ {
+ "epoch": 0.37541371158392434,
+ "grad_norm": 2.881364107131958,
+ "learning_rate": 4.9626068874673486e-06,
+ "loss": 0.721,
+ "step": 794
+ },
+ {
+ "epoch": 0.375886524822695,
+ "grad_norm": 3.11668062210083,
+ "learning_rate": 4.962499319111379e-06,
+ "loss": 0.7824,
+ "step": 795
+ },
+ {
+ "epoch": 0.3763593380614657,
+ "grad_norm": 2.9201436042785645,
+ "learning_rate": 4.962391597426374e-06,
+ "loss": 0.6911,
+ "step": 796
+ },
+ {
+ "epoch": 0.3768321513002364,
+ "grad_norm": 2.926598072052002,
+ "learning_rate": 4.962283722419043e-06,
+ "loss": 0.6715,
+ "step": 797
+ },
+ {
+ "epoch": 0.3773049645390071,
+ "grad_norm": 2.7267675399780273,
+ "learning_rate": 4.962175694096101e-06,
+ "loss": 0.6111,
+ "step": 798
+ },
+ {
+ "epoch": 0.37777777777777777,
+ "grad_norm": 3.194031000137329,
+ "learning_rate": 4.962067512464275e-06,
+ "loss": 0.6558,
+ "step": 799
+ },
+ {
+ "epoch": 0.37825059101654845,
+ "grad_norm": 2.6249136924743652,
+ "learning_rate": 4.9619591775303e-06,
+ "loss": 0.6166,
+ "step": 800
+ },
+ {
+ "epoch": 0.37872340425531914,
+ "grad_norm": 2.6356167793273926,
+ "learning_rate": 4.961850689300923e-06,
+ "loss": 0.6112,
+ "step": 801
+ },
+ {
+ "epoch": 0.3791962174940898,
+ "grad_norm": 3.030724287033081,
+ "learning_rate": 4.961742047782898e-06,
+ "loss": 0.6511,
+ "step": 802
+ },
+ {
+ "epoch": 0.3796690307328605,
+ "grad_norm": 3.4987757205963135,
+ "learning_rate": 4.96163325298299e-06,
+ "loss": 0.5888,
+ "step": 803
+ },
+ {
+ "epoch": 0.3801418439716312,
+ "grad_norm": 3.0371780395507812,
+ "learning_rate": 4.961524304907974e-06,
+ "loss": 0.6385,
+ "step": 804
+ },
+ {
+ "epoch": 0.3806146572104019,
+ "grad_norm": 3.302570104598999,
+ "learning_rate": 4.961415203564632e-06,
+ "loss": 0.6515,
+ "step": 805
+ },
+ {
+ "epoch": 0.38108747044917257,
+ "grad_norm": 2.7597038745880127,
+ "learning_rate": 4.961305948959759e-06,
+ "loss": 0.6126,
+ "step": 806
+ },
+ {
+ "epoch": 0.38156028368794326,
+ "grad_norm": 2.789811849594116,
+ "learning_rate": 4.9611965411001575e-06,
+ "loss": 0.6601,
+ "step": 807
+ },
+ {
+ "epoch": 0.38203309692671394,
+ "grad_norm": 3.0403921604156494,
+ "learning_rate": 4.961086979992639e-06,
+ "loss": 0.6947,
+ "step": 808
+ },
+ {
+ "epoch": 0.38250591016548463,
+ "grad_norm": 3.2139980792999268,
+ "learning_rate": 4.960977265644026e-06,
+ "loss": 0.6876,
+ "step": 809
+ },
+ {
+ "epoch": 0.3829787234042553,
+ "grad_norm": 2.918515205383301,
+ "learning_rate": 4.960867398061149e-06,
+ "loss": 0.5997,
+ "step": 810
+ },
+ {
+ "epoch": 0.383451536643026,
+ "grad_norm": 3.197636604309082,
+ "learning_rate": 4.9607573772508495e-06,
+ "loss": 0.5754,
+ "step": 811
+ },
+ {
+ "epoch": 0.3839243498817967,
+ "grad_norm": 2.8848466873168945,
+ "learning_rate": 4.960647203219979e-06,
+ "loss": 0.6424,
+ "step": 812
+ },
+ {
+ "epoch": 0.3843971631205674,
+ "grad_norm": 3.4810187816619873,
+ "learning_rate": 4.960536875975397e-06,
+ "loss": 0.6851,
+ "step": 813
+ },
+ {
+ "epoch": 0.38486997635933806,
+ "grad_norm": 3.713934898376465,
+ "learning_rate": 4.960426395523972e-06,
+ "loss": 0.6122,
+ "step": 814
+ },
+ {
+ "epoch": 0.38534278959810875,
+ "grad_norm": 2.862600803375244,
+ "learning_rate": 4.960315761872585e-06,
+ "loss": 0.6493,
+ "step": 815
+ },
+ {
+ "epoch": 0.38581560283687943,
+ "grad_norm": 3.133882522583008,
+ "learning_rate": 4.960204975028123e-06,
+ "loss": 0.7535,
+ "step": 816
+ },
+ {
+ "epoch": 0.3862884160756501,
+ "grad_norm": 3.1526732444763184,
+ "learning_rate": 4.960094034997485e-06,
+ "loss": 0.6512,
+ "step": 817
+ },
+ {
+ "epoch": 0.3867612293144208,
+ "grad_norm": 2.7213544845581055,
+ "learning_rate": 4.959982941787579e-06,
+ "loss": 0.6121,
+ "step": 818
+ },
+ {
+ "epoch": 0.3872340425531915,
+ "grad_norm": 3.4935851097106934,
+ "learning_rate": 4.9598716954053214e-06,
+ "loss": 0.7852,
+ "step": 819
+ },
+ {
+ "epoch": 0.3877068557919622,
+ "grad_norm": 2.691016435623169,
+ "learning_rate": 4.9597602958576395e-06,
+ "loss": 0.6861,
+ "step": 820
+ },
+ {
+ "epoch": 0.38817966903073287,
+ "grad_norm": 2.8621015548706055,
+ "learning_rate": 4.959648743151469e-06,
+ "loss": 0.6262,
+ "step": 821
+ },
+ {
+ "epoch": 0.38865248226950355,
+ "grad_norm": 3.3887462615966797,
+ "learning_rate": 4.959537037293758e-06,
+ "loss": 0.7103,
+ "step": 822
+ },
+ {
+ "epoch": 0.38912529550827424,
+ "grad_norm": 2.7565438747406006,
+ "learning_rate": 4.95942517829146e-06,
+ "loss": 0.6471,
+ "step": 823
+ },
+ {
+ "epoch": 0.3895981087470449,
+ "grad_norm": 2.7920358180999756,
+ "learning_rate": 4.959313166151541e-06,
+ "loss": 0.6239,
+ "step": 824
+ },
+ {
+ "epoch": 0.3900709219858156,
+ "grad_norm": 3.18904185295105,
+ "learning_rate": 4.959201000880973e-06,
+ "loss": 0.7461,
+ "step": 825
+ },
+ {
+ "epoch": 0.3905437352245863,
+ "grad_norm": 2.727872371673584,
+ "learning_rate": 4.959088682486743e-06,
+ "loss": 0.6333,
+ "step": 826
+ },
+ {
+ "epoch": 0.391016548463357,
+ "grad_norm": 2.906378746032715,
+ "learning_rate": 4.958976210975844e-06,
+ "loss": 0.7547,
+ "step": 827
+ },
+ {
+ "epoch": 0.39148936170212767,
+ "grad_norm": 2.96482515335083,
+ "learning_rate": 4.958863586355278e-06,
+ "loss": 0.6312,
+ "step": 828
+ },
+ {
+ "epoch": 0.39196217494089836,
+ "grad_norm": 3.2890889644622803,
+ "learning_rate": 4.958750808632059e-06,
+ "loss": 0.6943,
+ "step": 829
+ },
+ {
+ "epoch": 0.39243498817966904,
+ "grad_norm": 2.7004311084747314,
+ "learning_rate": 4.958637877813207e-06,
+ "loss": 0.5918,
+ "step": 830
+ },
+ {
+ "epoch": 0.39290780141843973,
+ "grad_norm": 2.7487950325012207,
+ "learning_rate": 4.9585247939057566e-06,
+ "loss": 0.6201,
+ "step": 831
+ },
+ {
+ "epoch": 0.3933806146572104,
+ "grad_norm": 2.7873897552490234,
+ "learning_rate": 4.958411556916747e-06,
+ "loss": 0.6268,
+ "step": 832
+ },
+ {
+ "epoch": 0.3938534278959811,
+ "grad_norm": 2.8501343727111816,
+ "learning_rate": 4.958298166853229e-06,
+ "loss": 0.7119,
+ "step": 833
+ },
+ {
+ "epoch": 0.3943262411347518,
+ "grad_norm": 3.0391547679901123,
+ "learning_rate": 4.958184623722265e-06,
+ "loss": 0.6375,
+ "step": 834
+ },
+ {
+ "epoch": 0.3947990543735225,
+ "grad_norm": 2.850520133972168,
+ "learning_rate": 4.958070927530922e-06,
+ "loss": 0.5962,
+ "step": 835
+ },
+ {
+ "epoch": 0.39527186761229316,
+ "grad_norm": 3.351914644241333,
+ "learning_rate": 4.957957078286281e-06,
+ "loss": 0.7247,
+ "step": 836
+ },
+ {
+ "epoch": 0.39574468085106385,
+ "grad_norm": 2.9559543132781982,
+ "learning_rate": 4.957843075995431e-06,
+ "loss": 0.6571,
+ "step": 837
+ },
+ {
+ "epoch": 0.39621749408983453,
+ "grad_norm": 3.225785255432129,
+ "learning_rate": 4.95772892066547e-06,
+ "loss": 0.7074,
+ "step": 838
+ },
+ {
+ "epoch": 0.3966903073286052,
+ "grad_norm": 2.7842373847961426,
+ "learning_rate": 4.957614612303505e-06,
+ "loss": 0.6469,
+ "step": 839
+ },
+ {
+ "epoch": 0.3971631205673759,
+ "grad_norm": 4.249724864959717,
+ "learning_rate": 4.957500150916655e-06,
+ "loss": 0.741,
+ "step": 840
+ },
+ {
+ "epoch": 0.3976359338061466,
+ "grad_norm": 3.138221263885498,
+ "learning_rate": 4.957385536512046e-06,
+ "loss": 0.6676,
+ "step": 841
+ },
+ {
+ "epoch": 0.3981087470449173,
+ "grad_norm": 3.456423759460449,
+ "learning_rate": 4.957270769096816e-06,
+ "loss": 0.6877,
+ "step": 842
+ },
+ {
+ "epoch": 0.39858156028368796,
+ "grad_norm": 2.8676278591156006,
+ "learning_rate": 4.957155848678109e-06,
+ "loss": 0.5986,
+ "step": 843
+ },
+ {
+ "epoch": 0.39905437352245865,
+ "grad_norm": 2.705324411392212,
+ "learning_rate": 4.957040775263082e-06,
+ "loss": 0.6356,
+ "step": 844
+ },
+ {
+ "epoch": 0.39952718676122934,
+ "grad_norm": 3.0767486095428467,
+ "learning_rate": 4.9569255488589e-06,
+ "loss": 0.6844,
+ "step": 845
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 2.7787704467773438,
+ "learning_rate": 4.956810169472736e-06,
+ "loss": 0.6641,
+ "step": 846
+ },
+ {
+ "epoch": 0.4004728132387707,
+ "grad_norm": 2.584277868270874,
+ "learning_rate": 4.956694637111777e-06,
+ "loss": 0.6256,
+ "step": 847
+ },
+ {
+ "epoch": 0.4009456264775414,
+ "grad_norm": 2.751641273498535,
+ "learning_rate": 4.956578951783215e-06,
+ "loss": 0.5954,
+ "step": 848
+ },
+ {
+ "epoch": 0.4014184397163121,
+ "grad_norm": 3.0181658267974854,
+ "learning_rate": 4.956463113494253e-06,
+ "loss": 0.6569,
+ "step": 849
+ },
+ {
+ "epoch": 0.40189125295508277,
+ "grad_norm": 3.0933220386505127,
+ "learning_rate": 4.956347122252104e-06,
+ "loss": 0.6248,
+ "step": 850
+ },
+ {
+ "epoch": 0.40236406619385345,
+ "grad_norm": 3.3767428398132324,
+ "learning_rate": 4.956230978063991e-06,
+ "loss": 0.719,
+ "step": 851
+ },
+ {
+ "epoch": 0.40283687943262414,
+ "grad_norm": 3.7666573524475098,
+ "learning_rate": 4.956114680937145e-06,
+ "loss": 0.6467,
+ "step": 852
+ },
+ {
+ "epoch": 0.4033096926713948,
+ "grad_norm": 2.9836843013763428,
+ "learning_rate": 4.955998230878808e-06,
+ "loss": 0.6993,
+ "step": 853
+ },
+ {
+ "epoch": 0.4037825059101655,
+ "grad_norm": 2.981497049331665,
+ "learning_rate": 4.955881627896229e-06,
+ "loss": 0.6578,
+ "step": 854
+ },
+ {
+ "epoch": 0.40425531914893614,
+ "grad_norm": 3.1369056701660156,
+ "learning_rate": 4.955764871996672e-06,
+ "loss": 0.6763,
+ "step": 855
+ },
+ {
+ "epoch": 0.40472813238770683,
+ "grad_norm": 2.7675817012786865,
+ "learning_rate": 4.9556479631874036e-06,
+ "loss": 0.6488,
+ "step": 856
+ },
+ {
+ "epoch": 0.4052009456264775,
+ "grad_norm": 3.035334825515747,
+ "learning_rate": 4.9555309014757034e-06,
+ "loss": 0.7076,
+ "step": 857
+ },
+ {
+ "epoch": 0.4056737588652482,
+ "grad_norm": 3.493704319000244,
+ "learning_rate": 4.955413686868862e-06,
+ "loss": 0.6773,
+ "step": 858
+ },
+ {
+ "epoch": 0.4061465721040189,
+ "grad_norm": 3.245487928390503,
+ "learning_rate": 4.9552963193741765e-06,
+ "loss": 0.6915,
+ "step": 859
+ },
+ {
+ "epoch": 0.4066193853427896,
+ "grad_norm": 3.189969539642334,
+ "learning_rate": 4.955178798998956e-06,
+ "loss": 0.7318,
+ "step": 860
+ },
+ {
+ "epoch": 0.40709219858156026,
+ "grad_norm": 2.7987146377563477,
+ "learning_rate": 4.955061125750517e-06,
+ "loss": 0.6162,
+ "step": 861
+ },
+ {
+ "epoch": 0.40756501182033095,
+ "grad_norm": 3.020118474960327,
+ "learning_rate": 4.954943299636187e-06,
+ "loss": 0.6678,
+ "step": 862
+ },
+ {
+ "epoch": 0.40803782505910163,
+ "grad_norm": 2.715463876724243,
+ "learning_rate": 4.954825320663302e-06,
+ "loss": 0.668,
+ "step": 863
+ },
+ {
+ "epoch": 0.4085106382978723,
+ "grad_norm": 2.595050096511841,
+ "learning_rate": 4.9547071888392085e-06,
+ "loss": 0.6557,
+ "step": 864
+ },
+ {
+ "epoch": 0.408983451536643,
+ "grad_norm": 3.131596088409424,
+ "learning_rate": 4.954588904171261e-06,
+ "loss": 0.6548,
+ "step": 865
+ },
+ {
+ "epoch": 0.4094562647754137,
+ "grad_norm": 2.5742313861846924,
+ "learning_rate": 4.954470466666827e-06,
+ "loss": 0.6592,
+ "step": 866
+ },
+ {
+ "epoch": 0.4099290780141844,
+ "grad_norm": 2.8612802028656006,
+ "learning_rate": 4.9543518763332785e-06,
+ "loss": 0.5391,
+ "step": 867
+ },
+ {
+ "epoch": 0.41040189125295506,
+ "grad_norm": 2.8973186016082764,
+ "learning_rate": 4.954233133178001e-06,
+ "loss": 0.6649,
+ "step": 868
+ },
+ {
+ "epoch": 0.41087470449172575,
+ "grad_norm": 2.802525043487549,
+ "learning_rate": 4.954114237208388e-06,
+ "loss": 0.6212,
+ "step": 869
+ },
+ {
+ "epoch": 0.41134751773049644,
+ "grad_norm": 2.5919506549835205,
+ "learning_rate": 4.953995188431843e-06,
+ "loss": 0.6596,
+ "step": 870
+ },
+ {
+ "epoch": 0.4118203309692671,
+ "grad_norm": 3.139169454574585,
+ "learning_rate": 4.953875986855777e-06,
+ "loss": 0.6799,
+ "step": 871
+ },
+ {
+ "epoch": 0.4122931442080378,
+ "grad_norm": 3.99727725982666,
+ "learning_rate": 4.953756632487614e-06,
+ "loss": 0.6519,
+ "step": 872
+ },
+ {
+ "epoch": 0.4127659574468085,
+ "grad_norm": 3.238706350326538,
+ "learning_rate": 4.953637125334784e-06,
+ "loss": 0.7361,
+ "step": 873
+ },
+ {
+ "epoch": 0.4132387706855792,
+ "grad_norm": 2.780019998550415,
+ "learning_rate": 4.9535174654047295e-06,
+ "loss": 0.6406,
+ "step": 874
+ },
+ {
+ "epoch": 0.41371158392434987,
+ "grad_norm": 2.7629551887512207,
+ "learning_rate": 4.953397652704901e-06,
+ "loss": 0.6131,
+ "step": 875
+ },
+ {
+ "epoch": 0.41418439716312055,
+ "grad_norm": 2.8008246421813965,
+ "learning_rate": 4.9532776872427585e-06,
+ "loss": 0.6464,
+ "step": 876
+ },
+ {
+ "epoch": 0.41465721040189124,
+ "grad_norm": 3.0970115661621094,
+ "learning_rate": 4.953157569025772e-06,
+ "loss": 0.7066,
+ "step": 877
+ },
+ {
+ "epoch": 0.4151300236406619,
+ "grad_norm": 2.8375589847564697,
+ "learning_rate": 4.9530372980614195e-06,
+ "loss": 0.6551,
+ "step": 878
+ },
+ {
+ "epoch": 0.4156028368794326,
+ "grad_norm": 2.718843936920166,
+ "learning_rate": 4.952916874357191e-06,
+ "loss": 0.5947,
+ "step": 879
+ },
+ {
+ "epoch": 0.4160756501182033,
+ "grad_norm": 2.7104697227478027,
+ "learning_rate": 4.952796297920585e-06,
+ "loss": 0.6708,
+ "step": 880
+ },
+ {
+ "epoch": 0.416548463356974,
+ "grad_norm": 2.8223445415496826,
+ "learning_rate": 4.952675568759108e-06,
+ "loss": 0.6214,
+ "step": 881
+ },
+ {
+ "epoch": 0.41702127659574467,
+ "grad_norm": 2.6598153114318848,
+ "learning_rate": 4.952554686880279e-06,
+ "loss": 0.6116,
+ "step": 882
+ },
+ {
+ "epoch": 0.41749408983451536,
+ "grad_norm": 2.8639824390411377,
+ "learning_rate": 4.952433652291623e-06,
+ "loss": 0.5971,
+ "step": 883
+ },
+ {
+ "epoch": 0.41796690307328604,
+ "grad_norm": 2.9578304290771484,
+ "learning_rate": 4.952312465000677e-06,
+ "loss": 0.6785,
+ "step": 884
+ },
+ {
+ "epoch": 0.41843971631205673,
+ "grad_norm": 2.872144937515259,
+ "learning_rate": 4.952191125014987e-06,
+ "loss": 0.6772,
+ "step": 885
+ },
+ {
+ "epoch": 0.4189125295508274,
+ "grad_norm": 2.7513675689697266,
+ "learning_rate": 4.952069632342108e-06,
+ "loss": 0.702,
+ "step": 886
+ },
+ {
+ "epoch": 0.4193853427895981,
+ "grad_norm": 2.9275078773498535,
+ "learning_rate": 4.951947986989606e-06,
+ "loss": 0.589,
+ "step": 887
+ },
+ {
+ "epoch": 0.4198581560283688,
+ "grad_norm": 2.740549325942993,
+ "learning_rate": 4.951826188965053e-06,
+ "loss": 0.5942,
+ "step": 888
+ },
+ {
+ "epoch": 0.4203309692671395,
+ "grad_norm": 2.92452073097229,
+ "learning_rate": 4.951704238276035e-06,
+ "loss": 0.6819,
+ "step": 889
+ },
+ {
+ "epoch": 0.42080378250591016,
+ "grad_norm": 2.842491865158081,
+ "learning_rate": 4.951582134930144e-06,
+ "loss": 0.6304,
+ "step": 890
+ },
+ {
+ "epoch": 0.42127659574468085,
+ "grad_norm": 2.613478422164917,
+ "learning_rate": 4.951459878934983e-06,
+ "loss": 0.6912,
+ "step": 891
+ },
+ {
+ "epoch": 0.42174940898345153,
+ "grad_norm": 3.2408607006073,
+ "learning_rate": 4.951337470298165e-06,
+ "loss": 0.6755,
+ "step": 892
+ },
+ {
+ "epoch": 0.4222222222222222,
+ "grad_norm": 3.1022439002990723,
+ "learning_rate": 4.9512149090273125e-06,
+ "loss": 0.6138,
+ "step": 893
+ },
+ {
+ "epoch": 0.4226950354609929,
+ "grad_norm": 2.6418895721435547,
+ "learning_rate": 4.951092195130055e-06,
+ "loss": 0.639,
+ "step": 894
+ },
+ {
+ "epoch": 0.4231678486997636,
+ "grad_norm": 3.010744333267212,
+ "learning_rate": 4.950969328614035e-06,
+ "loss": 0.7102,
+ "step": 895
+ },
+ {
+ "epoch": 0.4236406619385343,
+ "grad_norm": 2.673292636871338,
+ "learning_rate": 4.950846309486901e-06,
+ "loss": 0.5676,
+ "step": 896
+ },
+ {
+ "epoch": 0.42411347517730497,
+ "grad_norm": 3.6974737644195557,
+ "learning_rate": 4.950723137756314e-06,
+ "loss": 0.5722,
+ "step": 897
+ },
+ {
+ "epoch": 0.42458628841607565,
+ "grad_norm": 3.69028902053833,
+ "learning_rate": 4.9505998134299435e-06,
+ "loss": 0.6337,
+ "step": 898
+ },
+ {
+ "epoch": 0.42505910165484634,
+ "grad_norm": 3.2136125564575195,
+ "learning_rate": 4.950476336515469e-06,
+ "loss": 0.6469,
+ "step": 899
+ },
+ {
+ "epoch": 0.425531914893617,
+ "grad_norm": 2.7396016120910645,
+ "learning_rate": 4.950352707020577e-06,
+ "loss": 0.6656,
+ "step": 900
+ },
+ {
+ "epoch": 0.4260047281323877,
+ "grad_norm": 2.825416088104248,
+ "learning_rate": 4.950228924952967e-06,
+ "loss": 0.6298,
+ "step": 901
+ },
+ {
+ "epoch": 0.4264775413711584,
+ "grad_norm": 3.401658535003662,
+ "learning_rate": 4.950104990320345e-06,
+ "loss": 0.778,
+ "step": 902
+ },
+ {
+ "epoch": 0.4269503546099291,
+ "grad_norm": 2.7002272605895996,
+ "learning_rate": 4.9499809031304294e-06,
+ "loss": 0.6536,
+ "step": 903
+ },
+ {
+ "epoch": 0.42742316784869977,
+ "grad_norm": 2.62386417388916,
+ "learning_rate": 4.949856663390945e-06,
+ "loss": 0.6629,
+ "step": 904
+ },
+ {
+ "epoch": 0.42789598108747046,
+ "grad_norm": 2.584247589111328,
+ "learning_rate": 4.94973227110963e-06,
+ "loss": 0.5813,
+ "step": 905
+ },
+ {
+ "epoch": 0.42836879432624114,
+ "grad_norm": 3.4365768432617188,
+ "learning_rate": 4.9496077262942265e-06,
+ "loss": 0.7648,
+ "step": 906
+ },
+ {
+ "epoch": 0.42884160756501183,
+ "grad_norm": 2.8993639945983887,
+ "learning_rate": 4.949483028952492e-06,
+ "loss": 0.6696,
+ "step": 907
+ },
+ {
+ "epoch": 0.4293144208037825,
+ "grad_norm": 2.922809362411499,
+ "learning_rate": 4.94935817909219e-06,
+ "loss": 0.6892,
+ "step": 908
+ },
+ {
+ "epoch": 0.4297872340425532,
+ "grad_norm": 2.85478138923645,
+ "learning_rate": 4.9492331767210944e-06,
+ "loss": 0.536,
+ "step": 909
+ },
+ {
+ "epoch": 0.4302600472813239,
+ "grad_norm": 2.8639259338378906,
+ "learning_rate": 4.949108021846988e-06,
+ "loss": 0.634,
+ "step": 910
+ },
+ {
+ "epoch": 0.4307328605200946,
+ "grad_norm": 3.0533697605133057,
+ "learning_rate": 4.948982714477664e-06,
+ "loss": 0.6318,
+ "step": 911
+ },
+ {
+ "epoch": 0.43120567375886526,
+ "grad_norm": 2.331674814224243,
+ "learning_rate": 4.9488572546209255e-06,
+ "loss": 0.6562,
+ "step": 912
+ },
+ {
+ "epoch": 0.43167848699763595,
+ "grad_norm": 3.0154623985290527,
+ "learning_rate": 4.9487316422845835e-06,
+ "loss": 0.6675,
+ "step": 913
+ },
+ {
+ "epoch": 0.43215130023640663,
+ "grad_norm": 2.7354514598846436,
+ "learning_rate": 4.948605877476459e-06,
+ "loss": 0.6012,
+ "step": 914
+ },
+ {
+ "epoch": 0.4326241134751773,
+ "grad_norm": 2.863736629486084,
+ "learning_rate": 4.948479960204383e-06,
+ "loss": 0.6062,
+ "step": 915
+ },
+ {
+ "epoch": 0.433096926713948,
+ "grad_norm": 3.01998233795166,
+ "learning_rate": 4.948353890476197e-06,
+ "loss": 0.6749,
+ "step": 916
+ },
+ {
+ "epoch": 0.4335697399527187,
+ "grad_norm": 2.7550456523895264,
+ "learning_rate": 4.94822766829975e-06,
+ "loss": 0.6507,
+ "step": 917
+ },
+ {
+ "epoch": 0.4340425531914894,
+ "grad_norm": 3.370572805404663,
+ "learning_rate": 4.948101293682901e-06,
+ "loss": 0.714,
+ "step": 918
+ },
+ {
+ "epoch": 0.43451536643026006,
+ "grad_norm": 2.9736790657043457,
+ "learning_rate": 4.947974766633519e-06,
+ "loss": 0.729,
+ "step": 919
+ },
+ {
+ "epoch": 0.43498817966903075,
+ "grad_norm": 3.1036548614501953,
+ "learning_rate": 4.947848087159483e-06,
+ "loss": 0.7547,
+ "step": 920
+ },
+ {
+ "epoch": 0.43546099290780144,
+ "grad_norm": 2.895094871520996,
+ "learning_rate": 4.947721255268679e-06,
+ "loss": 0.6089,
+ "step": 921
+ },
+ {
+ "epoch": 0.4359338061465721,
+ "grad_norm": 2.798476219177246,
+ "learning_rate": 4.947594270969005e-06,
+ "loss": 0.5432,
+ "step": 922
+ },
+ {
+ "epoch": 0.4364066193853428,
+ "grad_norm": 2.7675702571868896,
+ "learning_rate": 4.94746713426837e-06,
+ "loss": 0.5693,
+ "step": 923
+ },
+ {
+ "epoch": 0.4368794326241135,
+ "grad_norm": 2.6851553916931152,
+ "learning_rate": 4.947339845174687e-06,
+ "loss": 0.6503,
+ "step": 924
+ },
+ {
+ "epoch": 0.4373522458628842,
+ "grad_norm": 2.909635543823242,
+ "learning_rate": 4.947212403695883e-06,
+ "loss": 0.6494,
+ "step": 925
+ },
+ {
+ "epoch": 0.43782505910165487,
+ "grad_norm": 2.604526996612549,
+ "learning_rate": 4.947084809839894e-06,
+ "loss": 0.6349,
+ "step": 926
+ },
+ {
+ "epoch": 0.43829787234042555,
+ "grad_norm": 3.118149518966675,
+ "learning_rate": 4.946957063614664e-06,
+ "loss": 0.6219,
+ "step": 927
+ },
+ {
+ "epoch": 0.43877068557919624,
+ "grad_norm": 2.7452616691589355,
+ "learning_rate": 4.9468291650281465e-06,
+ "loss": 0.6096,
+ "step": 928
+ },
+ {
+ "epoch": 0.4392434988179669,
+ "grad_norm": 3.30098819732666,
+ "learning_rate": 4.946701114088307e-06,
+ "loss": 0.6277,
+ "step": 929
+ },
+ {
+ "epoch": 0.4397163120567376,
+ "grad_norm": 2.789482593536377,
+ "learning_rate": 4.946572910803116e-06,
+ "loss": 0.7,
+ "step": 930
+ },
+ {
+ "epoch": 0.4401891252955083,
+ "grad_norm": 2.7283935546875,
+ "learning_rate": 4.946444555180559e-06,
+ "loss": 0.5375,
+ "step": 931
+ },
+ {
+ "epoch": 0.440661938534279,
+ "grad_norm": 3.101304054260254,
+ "learning_rate": 4.946316047228627e-06,
+ "loss": 0.6131,
+ "step": 932
+ },
+ {
+ "epoch": 0.44113475177304967,
+ "grad_norm": 3.573908805847168,
+ "learning_rate": 4.946187386955321e-06,
+ "loss": 0.7073,
+ "step": 933
+ },
+ {
+ "epoch": 0.44160756501182036,
+ "grad_norm": 3.214979648590088,
+ "learning_rate": 4.946058574368653e-06,
+ "loss": 0.6508,
+ "step": 934
+ },
+ {
+ "epoch": 0.44208037825059104,
+ "grad_norm": 3.145082712173462,
+ "learning_rate": 4.945929609476643e-06,
+ "loss": 0.64,
+ "step": 935
+ },
+ {
+ "epoch": 0.4425531914893617,
+ "grad_norm": 2.991780996322632,
+ "learning_rate": 4.945800492287321e-06,
+ "loss": 0.6315,
+ "step": 936
+ },
+ {
+ "epoch": 0.44302600472813236,
+ "grad_norm": 3.2441139221191406,
+ "learning_rate": 4.945671222808727e-06,
+ "loss": 0.7144,
+ "step": 937
+ },
+ {
+ "epoch": 0.44349881796690305,
+ "grad_norm": 2.9397029876708984,
+ "learning_rate": 4.94554180104891e-06,
+ "loss": 0.6818,
+ "step": 938
+ },
+ {
+ "epoch": 0.44397163120567373,
+ "grad_norm": 3.2471461296081543,
+ "learning_rate": 4.945412227015929e-06,
+ "loss": 0.6921,
+ "step": 939
+ },
+ {
+ "epoch": 0.4444444444444444,
+ "grad_norm": 3.0882487297058105,
+ "learning_rate": 4.945282500717851e-06,
+ "loss": 0.718,
+ "step": 940
+ },
+ {
+ "epoch": 0.4449172576832151,
+ "grad_norm": 2.6035783290863037,
+ "learning_rate": 4.945152622162753e-06,
+ "loss": 0.621,
+ "step": 941
+ },
+ {
+ "epoch": 0.4453900709219858,
+ "grad_norm": 2.83659029006958,
+ "learning_rate": 4.945022591358724e-06,
+ "loss": 0.6403,
+ "step": 942
+ },
+ {
+ "epoch": 0.4458628841607565,
+ "grad_norm": 2.824463129043579,
+ "learning_rate": 4.944892408313859e-06,
+ "loss": 0.6594,
+ "step": 943
+ },
+ {
+ "epoch": 0.44633569739952716,
+ "grad_norm": 2.753735065460205,
+ "learning_rate": 4.9447620730362645e-06,
+ "loss": 0.6116,
+ "step": 944
+ },
+ {
+ "epoch": 0.44680851063829785,
+ "grad_norm": 3.0659725666046143,
+ "learning_rate": 4.944631585534056e-06,
+ "loss": 0.5983,
+ "step": 945
+ },
+ {
+ "epoch": 0.44728132387706854,
+ "grad_norm": 2.969113349914551,
+ "learning_rate": 4.944500945815357e-06,
+ "loss": 0.6859,
+ "step": 946
+ },
+ {
+ "epoch": 0.4477541371158392,
+ "grad_norm": 2.810303211212158,
+ "learning_rate": 4.944370153888303e-06,
+ "loss": 0.7025,
+ "step": 947
+ },
+ {
+ "epoch": 0.4482269503546099,
+ "grad_norm": 3.027721643447876,
+ "learning_rate": 4.944239209761038e-06,
+ "loss": 0.7268,
+ "step": 948
+ },
+ {
+ "epoch": 0.4486997635933806,
+ "grad_norm": 2.661503314971924,
+ "learning_rate": 4.944108113441716e-06,
+ "loss": 0.6702,
+ "step": 949
+ },
+ {
+ "epoch": 0.4491725768321513,
+ "grad_norm": 2.738591432571411,
+ "learning_rate": 4.943976864938498e-06,
+ "loss": 0.6728,
+ "step": 950
+ },
+ {
+ "epoch": 0.44964539007092197,
+ "grad_norm": 3.447505474090576,
+ "learning_rate": 4.943845464259557e-06,
+ "loss": 0.6586,
+ "step": 951
+ },
+ {
+ "epoch": 0.45011820330969265,
+ "grad_norm": 3.0968854427337646,
+ "learning_rate": 4.943713911413075e-06,
+ "loss": 0.7666,
+ "step": 952
+ },
+ {
+ "epoch": 0.45059101654846334,
+ "grad_norm": 2.4113779067993164,
+ "learning_rate": 4.943582206407244e-06,
+ "loss": 0.6173,
+ "step": 953
+ },
+ {
+ "epoch": 0.451063829787234,
+ "grad_norm": 2.6357979774475098,
+ "learning_rate": 4.943450349250263e-06,
+ "loss": 0.5589,
+ "step": 954
+ },
+ {
+ "epoch": 0.4515366430260047,
+ "grad_norm": 2.9182233810424805,
+ "learning_rate": 4.9433183399503425e-06,
+ "loss": 0.6252,
+ "step": 955
+ },
+ {
+ "epoch": 0.4520094562647754,
+ "grad_norm": 2.832740306854248,
+ "learning_rate": 4.943186178515703e-06,
+ "loss": 0.6882,
+ "step": 956
+ },
+ {
+ "epoch": 0.4524822695035461,
+ "grad_norm": 2.9508981704711914,
+ "learning_rate": 4.943053864954574e-06,
+ "loss": 0.5722,
+ "step": 957
+ },
+ {
+ "epoch": 0.4529550827423168,
+ "grad_norm": 3.044729471206665,
+ "learning_rate": 4.9429213992751925e-06,
+ "loss": 0.6772,
+ "step": 958
+ },
+ {
+ "epoch": 0.45342789598108746,
+ "grad_norm": 2.606003522872925,
+ "learning_rate": 4.9427887814858075e-06,
+ "loss": 0.6445,
+ "step": 959
+ },
+ {
+ "epoch": 0.45390070921985815,
+ "grad_norm": 2.4634225368499756,
+ "learning_rate": 4.942656011594676e-06,
+ "loss": 0.6151,
+ "step": 960
+ },
+ {
+ "epoch": 0.45437352245862883,
+ "grad_norm": 2.8872334957122803,
+ "learning_rate": 4.942523089610066e-06,
+ "loss": 0.6255,
+ "step": 961
+ },
+ {
+ "epoch": 0.4548463356973995,
+ "grad_norm": 2.870605707168579,
+ "learning_rate": 4.942390015540253e-06,
+ "loss": 0.7481,
+ "step": 962
+ },
+ {
+ "epoch": 0.4553191489361702,
+ "grad_norm": 2.952680826187134,
+ "learning_rate": 4.942256789393524e-06,
+ "loss": 0.5556,
+ "step": 963
+ },
+ {
+ "epoch": 0.4557919621749409,
+ "grad_norm": 2.623680353164673,
+ "learning_rate": 4.9421234111781725e-06,
+ "loss": 0.6115,
+ "step": 964
+ },
+ {
+ "epoch": 0.4562647754137116,
+ "grad_norm": 2.6933600902557373,
+ "learning_rate": 4.941989880902505e-06,
+ "loss": 0.6102,
+ "step": 965
+ },
+ {
+ "epoch": 0.45673758865248226,
+ "grad_norm": 2.6047189235687256,
+ "learning_rate": 4.941856198574836e-06,
+ "loss": 0.612,
+ "step": 966
+ },
+ {
+ "epoch": 0.45721040189125295,
+ "grad_norm": 2.779186725616455,
+ "learning_rate": 4.9417223642034885e-06,
+ "loss": 0.5424,
+ "step": 967
+ },
+ {
+ "epoch": 0.45768321513002364,
+ "grad_norm": 2.6177165508270264,
+ "learning_rate": 4.941588377796795e-06,
+ "loss": 0.4661,
+ "step": 968
+ },
+ {
+ "epoch": 0.4581560283687943,
+ "grad_norm": 2.959676742553711,
+ "learning_rate": 4.941454239363101e-06,
+ "loss": 0.6966,
+ "step": 969
+ },
+ {
+ "epoch": 0.458628841607565,
+ "grad_norm": 2.9788379669189453,
+ "learning_rate": 4.941319948910756e-06,
+ "loss": 0.6181,
+ "step": 970
+ },
+ {
+ "epoch": 0.4591016548463357,
+ "grad_norm": 4.642750263214111,
+ "learning_rate": 4.941185506448122e-06,
+ "loss": 0.5602,
+ "step": 971
+ },
+ {
+ "epoch": 0.4595744680851064,
+ "grad_norm": 2.793002128601074,
+ "learning_rate": 4.941050911983572e-06,
+ "loss": 0.602,
+ "step": 972
+ },
+ {
+ "epoch": 0.46004728132387707,
+ "grad_norm": 2.6833035945892334,
+ "learning_rate": 4.9409161655254845e-06,
+ "loss": 0.5549,
+ "step": 973
+ },
+ {
+ "epoch": 0.46052009456264775,
+ "grad_norm": 3.905032157897949,
+ "learning_rate": 4.94078126708225e-06,
+ "loss": 0.6335,
+ "step": 974
+ },
+ {
+ "epoch": 0.46099290780141844,
+ "grad_norm": 2.922609329223633,
+ "learning_rate": 4.94064621666227e-06,
+ "loss": 0.5839,
+ "step": 975
+ },
+ {
+ "epoch": 0.4614657210401891,
+ "grad_norm": 2.8277416229248047,
+ "learning_rate": 4.940511014273952e-06,
+ "loss": 0.629,
+ "step": 976
+ },
+ {
+ "epoch": 0.4619385342789598,
+ "grad_norm": 3.07511043548584,
+ "learning_rate": 4.940375659925714e-06,
+ "loss": 0.7058,
+ "step": 977
+ },
+ {
+ "epoch": 0.4624113475177305,
+ "grad_norm": 3.65043044090271,
+ "learning_rate": 4.940240153625984e-06,
+ "loss": 0.7174,
+ "step": 978
+ },
+ {
+ "epoch": 0.4628841607565012,
+ "grad_norm": 2.755167245864868,
+ "learning_rate": 4.9401044953832e-06,
+ "loss": 0.6548,
+ "step": 979
+ },
+ {
+ "epoch": 0.46335697399527187,
+ "grad_norm": 2.9881057739257812,
+ "learning_rate": 4.939968685205808e-06,
+ "loss": 0.6245,
+ "step": 980
+ },
+ {
+ "epoch": 0.46382978723404256,
+ "grad_norm": 2.9484212398529053,
+ "learning_rate": 4.939832723102266e-06,
+ "loss": 0.655,
+ "step": 981
+ },
+ {
+ "epoch": 0.46430260047281324,
+ "grad_norm": 2.898918628692627,
+ "learning_rate": 4.939696609081038e-06,
+ "loss": 0.6178,
+ "step": 982
+ },
+ {
+ "epoch": 0.46477541371158393,
+ "grad_norm": 2.7052435874938965,
+ "learning_rate": 4.9395603431506e-06,
+ "loss": 0.6393,
+ "step": 983
+ },
+ {
+ "epoch": 0.4652482269503546,
+ "grad_norm": 2.5610013008117676,
+ "learning_rate": 4.939423925319436e-06,
+ "loss": 0.4847,
+ "step": 984
+ },
+ {
+ "epoch": 0.4657210401891253,
+ "grad_norm": 3.229083299636841,
+ "learning_rate": 4.939287355596042e-06,
+ "loss": 0.6473,
+ "step": 985
+ },
+ {
+ "epoch": 0.466193853427896,
+ "grad_norm": 2.907097816467285,
+ "learning_rate": 4.9391506339889195e-06,
+ "loss": 0.652,
+ "step": 986
+ },
+ {
+ "epoch": 0.4666666666666667,
+ "grad_norm": 2.6929478645324707,
+ "learning_rate": 4.939013760506582e-06,
+ "loss": 0.6175,
+ "step": 987
+ },
+ {
+ "epoch": 0.46713947990543736,
+ "grad_norm": 3.414813280105591,
+ "learning_rate": 4.938876735157554e-06,
+ "loss": 0.7597,
+ "step": 988
+ },
+ {
+ "epoch": 0.46761229314420805,
+ "grad_norm": 3.297360420227051,
+ "learning_rate": 4.938739557950365e-06,
+ "loss": 0.6824,
+ "step": 989
+ },
+ {
+ "epoch": 0.46808510638297873,
+ "grad_norm": 3.083155393600464,
+ "learning_rate": 4.938602228893557e-06,
+ "loss": 0.6505,
+ "step": 990
+ },
+ {
+ "epoch": 0.4685579196217494,
+ "grad_norm": 2.9781153202056885,
+ "learning_rate": 4.938464747995681e-06,
+ "loss": 0.666,
+ "step": 991
+ },
+ {
+ "epoch": 0.4690307328605201,
+ "grad_norm": 3.1494534015655518,
+ "learning_rate": 4.9383271152652975e-06,
+ "loss": 0.6422,
+ "step": 992
+ },
+ {
+ "epoch": 0.4695035460992908,
+ "grad_norm": 2.547868490219116,
+ "learning_rate": 4.938189330710976e-06,
+ "loss": 0.5766,
+ "step": 993
+ },
+ {
+ "epoch": 0.4699763593380615,
+ "grad_norm": 2.684736967086792,
+ "learning_rate": 4.938051394341297e-06,
+ "loss": 0.6407,
+ "step": 994
+ },
+ {
+ "epoch": 0.47044917257683216,
+ "grad_norm": 2.9619693756103516,
+ "learning_rate": 4.937913306164847e-06,
+ "loss": 0.6936,
+ "step": 995
+ },
+ {
+ "epoch": 0.47092198581560285,
+ "grad_norm": 2.9698498249053955,
+ "learning_rate": 4.937775066190227e-06,
+ "loss": 0.6464,
+ "step": 996
+ },
+ {
+ "epoch": 0.47139479905437354,
+ "grad_norm": 3.121049642562866,
+ "learning_rate": 4.937636674426042e-06,
+ "loss": 0.6383,
+ "step": 997
+ },
+ {
+ "epoch": 0.4718676122931442,
+ "grad_norm": 3.113672971725464,
+ "learning_rate": 4.93749813088091e-06,
+ "loss": 0.6892,
+ "step": 998
+ },
+ {
+ "epoch": 0.4723404255319149,
+ "grad_norm": 3.126113176345825,
+ "learning_rate": 4.937359435563458e-06,
+ "loss": 0.6728,
+ "step": 999
+ },
+ {
+ "epoch": 0.4728132387706856,
+ "grad_norm": 3.353966236114502,
+ "learning_rate": 4.937220588482321e-06,
+ "loss": 0.6041,
+ "step": 1000
+ },
+ {
+ "epoch": 0.4732860520094563,
+ "grad_norm": 2.8860628604888916,
+ "learning_rate": 4.937081589646144e-06,
+ "loss": 0.6798,
+ "step": 1001
+ },
+ {
+ "epoch": 0.47375886524822697,
+ "grad_norm": 3.0510590076446533,
+ "learning_rate": 4.936942439063584e-06,
+ "loss": 0.5841,
+ "step": 1002
+ },
+ {
+ "epoch": 0.47423167848699765,
+ "grad_norm": 2.6998369693756104,
+ "learning_rate": 4.936803136743303e-06,
+ "loss": 0.6403,
+ "step": 1003
+ },
+ {
+ "epoch": 0.47470449172576834,
+ "grad_norm": 2.875347137451172,
+ "learning_rate": 4.9366636826939765e-06,
+ "loss": 0.5811,
+ "step": 1004
+ },
+ {
+ "epoch": 0.475177304964539,
+ "grad_norm": 2.9122262001037598,
+ "learning_rate": 4.936524076924287e-06,
+ "loss": 0.6852,
+ "step": 1005
+ },
+ {
+ "epoch": 0.4756501182033097,
+ "grad_norm": 2.5167057514190674,
+ "learning_rate": 4.9363843194429265e-06,
+ "loss": 0.5367,
+ "step": 1006
+ },
+ {
+ "epoch": 0.4761229314420804,
+ "grad_norm": 2.5745551586151123,
+ "learning_rate": 4.9362444102585985e-06,
+ "loss": 0.6241,
+ "step": 1007
+ },
+ {
+ "epoch": 0.4765957446808511,
+ "grad_norm": 2.5024216175079346,
+ "learning_rate": 4.9361043493800125e-06,
+ "loss": 0.6133,
+ "step": 1008
+ },
+ {
+ "epoch": 0.47706855791962177,
+ "grad_norm": 2.7281384468078613,
+ "learning_rate": 4.935964136815892e-06,
+ "loss": 0.6834,
+ "step": 1009
+ },
+ {
+ "epoch": 0.47754137115839246,
+ "grad_norm": 3.0118913650512695,
+ "learning_rate": 4.935823772574965e-06,
+ "loss": 0.6922,
+ "step": 1010
+ },
+ {
+ "epoch": 0.47801418439716314,
+ "grad_norm": 3.016216993331909,
+ "learning_rate": 4.935683256665973e-06,
+ "loss": 0.6653,
+ "step": 1011
+ },
+ {
+ "epoch": 0.47848699763593383,
+ "grad_norm": 2.9526784420013428,
+ "learning_rate": 4.9355425890976636e-06,
+ "loss": 0.6423,
+ "step": 1012
+ },
+ {
+ "epoch": 0.4789598108747045,
+ "grad_norm": 6.222797393798828,
+ "learning_rate": 4.9354017698787985e-06,
+ "loss": 0.5884,
+ "step": 1013
+ },
+ {
+ "epoch": 0.4794326241134752,
+ "grad_norm": 2.6553597450256348,
+ "learning_rate": 4.935260799018143e-06,
+ "loss": 0.6624,
+ "step": 1014
+ },
+ {
+ "epoch": 0.4799054373522459,
+ "grad_norm": 3.0942065715789795,
+ "learning_rate": 4.935119676524475e-06,
+ "loss": 0.6623,
+ "step": 1015
+ },
+ {
+ "epoch": 0.4803782505910166,
+ "grad_norm": 2.626359224319458,
+ "learning_rate": 4.934978402406585e-06,
+ "loss": 0.6195,
+ "step": 1016
+ },
+ {
+ "epoch": 0.4808510638297872,
+ "grad_norm": 2.7954699993133545,
+ "learning_rate": 4.934836976673265e-06,
+ "loss": 0.5545,
+ "step": 1017
+ },
+ {
+ "epoch": 0.4813238770685579,
+ "grad_norm": 2.913557291030884,
+ "learning_rate": 4.934695399333324e-06,
+ "loss": 0.6288,
+ "step": 1018
+ },
+ {
+ "epoch": 0.4817966903073286,
+ "grad_norm": 3.1043739318847656,
+ "learning_rate": 4.9345536703955746e-06,
+ "loss": 0.6771,
+ "step": 1019
+ },
+ {
+ "epoch": 0.48226950354609927,
+ "grad_norm": 2.789357900619507,
+ "learning_rate": 4.934411789868845e-06,
+ "loss": 0.6227,
+ "step": 1020
+ },
+ {
+ "epoch": 0.48274231678486995,
+ "grad_norm": 2.480609655380249,
+ "learning_rate": 4.934269757761967e-06,
+ "loss": 0.5779,
+ "step": 1021
+ },
+ {
+ "epoch": 0.48321513002364064,
+ "grad_norm": 2.7946252822875977,
+ "learning_rate": 4.934127574083785e-06,
+ "loss": 0.6166,
+ "step": 1022
+ },
+ {
+ "epoch": 0.4836879432624113,
+ "grad_norm": 3.0670509338378906,
+ "learning_rate": 4.933985238843153e-06,
+ "loss": 0.7766,
+ "step": 1023
+ },
+ {
+ "epoch": 0.484160756501182,
+ "grad_norm": 2.8567559719085693,
+ "learning_rate": 4.933842752048932e-06,
+ "loss": 0.5088,
+ "step": 1024
+ },
+ {
+ "epoch": 0.4846335697399527,
+ "grad_norm": 2.5674657821655273,
+ "learning_rate": 4.933700113709996e-06,
+ "loss": 0.6036,
+ "step": 1025
+ },
+ {
+ "epoch": 0.4851063829787234,
+ "grad_norm": 2.782339096069336,
+ "learning_rate": 4.933557323835224e-06,
+ "loss": 0.5335,
+ "step": 1026
+ },
+ {
+ "epoch": 0.48557919621749407,
+ "grad_norm": 2.6334071159362793,
+ "learning_rate": 4.93341438243351e-06,
+ "loss": 0.6327,
+ "step": 1027
+ },
+ {
+ "epoch": 0.48605200945626476,
+ "grad_norm": 3.0853965282440186,
+ "learning_rate": 4.933271289513751e-06,
+ "loss": 0.7102,
+ "step": 1028
+ },
+ {
+ "epoch": 0.48652482269503544,
+ "grad_norm": 2.619997501373291,
+ "learning_rate": 4.933128045084859e-06,
+ "loss": 0.6138,
+ "step": 1029
+ },
+ {
+ "epoch": 0.48699763593380613,
+ "grad_norm": 2.8316116333007812,
+ "learning_rate": 4.932984649155753e-06,
+ "loss": 0.6346,
+ "step": 1030
+ },
+ {
+ "epoch": 0.4874704491725768,
+ "grad_norm": 3.153486490249634,
+ "learning_rate": 4.932841101735361e-06,
+ "loss": 0.7626,
+ "step": 1031
+ },
+ {
+ "epoch": 0.4879432624113475,
+ "grad_norm": 3.1831274032592773,
+ "learning_rate": 4.9326974028326214e-06,
+ "loss": 0.6607,
+ "step": 1032
+ },
+ {
+ "epoch": 0.4884160756501182,
+ "grad_norm": 2.791078567504883,
+ "learning_rate": 4.932553552456481e-06,
+ "loss": 0.6141,
+ "step": 1033
+ },
+ {
+ "epoch": 0.4888888888888889,
+ "grad_norm": 2.627263307571411,
+ "learning_rate": 4.932409550615898e-06,
+ "loss": 0.6777,
+ "step": 1034
+ },
+ {
+ "epoch": 0.48936170212765956,
+ "grad_norm": 2.8550007343292236,
+ "learning_rate": 4.932265397319838e-06,
+ "loss": 0.6379,
+ "step": 1035
+ },
+ {
+ "epoch": 0.48983451536643025,
+ "grad_norm": 4.505824089050293,
+ "learning_rate": 4.932121092577276e-06,
+ "loss": 0.5892,
+ "step": 1036
+ },
+ {
+ "epoch": 0.49030732860520093,
+ "grad_norm": 3.100191116333008,
+ "learning_rate": 4.931976636397199e-06,
+ "loss": 0.6443,
+ "step": 1037
+ },
+ {
+ "epoch": 0.4907801418439716,
+ "grad_norm": 2.921494245529175,
+ "learning_rate": 4.9318320287886e-06,
+ "loss": 0.6821,
+ "step": 1038
+ },
+ {
+ "epoch": 0.4912529550827423,
+ "grad_norm": 4.577807903289795,
+ "learning_rate": 4.931687269760485e-06,
+ "loss": 0.5946,
+ "step": 1039
+ },
+ {
+ "epoch": 0.491725768321513,
+ "grad_norm": 2.7347636222839355,
+ "learning_rate": 4.931542359321865e-06,
+ "loss": 0.5689,
+ "step": 1040
+ },
+ {
+ "epoch": 0.4921985815602837,
+ "grad_norm": 2.5289158821105957,
+ "learning_rate": 4.931397297481765e-06,
+ "loss": 0.5632,
+ "step": 1041
+ },
+ {
+ "epoch": 0.49267139479905436,
+ "grad_norm": 3.3518471717834473,
+ "learning_rate": 4.9312520842492165e-06,
+ "loss": 0.6349,
+ "step": 1042
+ },
+ {
+ "epoch": 0.49314420803782505,
+ "grad_norm": 3.0469748973846436,
+ "learning_rate": 4.931106719633261e-06,
+ "loss": 0.5734,
+ "step": 1043
+ },
+ {
+ "epoch": 0.49361702127659574,
+ "grad_norm": 3.104682445526123,
+ "learning_rate": 4.930961203642951e-06,
+ "loss": 0.6101,
+ "step": 1044
+ },
+ {
+ "epoch": 0.4940898345153664,
+ "grad_norm": 2.776705503463745,
+ "learning_rate": 4.930815536287346e-06,
+ "loss": 0.6397,
+ "step": 1045
+ },
+ {
+ "epoch": 0.4945626477541371,
+ "grad_norm": 2.760380983352661,
+ "learning_rate": 4.930669717575516e-06,
+ "loss": 0.668,
+ "step": 1046
+ },
+ {
+ "epoch": 0.4950354609929078,
+ "grad_norm": 2.70084547996521,
+ "learning_rate": 4.930523747516541e-06,
+ "loss": 0.5729,
+ "step": 1047
+ },
+ {
+ "epoch": 0.4955082742316785,
+ "grad_norm": 2.7319583892822266,
+ "learning_rate": 4.930377626119511e-06,
+ "loss": 0.6258,
+ "step": 1048
+ },
+ {
+ "epoch": 0.49598108747044917,
+ "grad_norm": 3.2515223026275635,
+ "learning_rate": 4.930231353393521e-06,
+ "loss": 0.7412,
+ "step": 1049
+ },
+ {
+ "epoch": 0.49645390070921985,
+ "grad_norm": 3.0646486282348633,
+ "learning_rate": 4.930084929347682e-06,
+ "loss": 0.5809,
+ "step": 1050
+ },
+ {
+ "epoch": 0.49692671394799054,
+ "grad_norm": 3.1621921062469482,
+ "learning_rate": 4.9299383539911096e-06,
+ "loss": 0.6282,
+ "step": 1051
+ },
+ {
+ "epoch": 0.4973995271867612,
+ "grad_norm": 2.864713191986084,
+ "learning_rate": 4.929791627332931e-06,
+ "loss": 0.6263,
+ "step": 1052
+ },
+ {
+ "epoch": 0.4978723404255319,
+ "grad_norm": 3.181016683578491,
+ "learning_rate": 4.929644749382283e-06,
+ "loss": 0.5697,
+ "step": 1053
+ },
+ {
+ "epoch": 0.4983451536643026,
+ "grad_norm": 2.9064836502075195,
+ "learning_rate": 4.929497720148309e-06,
+ "loss": 0.6161,
+ "step": 1054
+ },
+ {
+ "epoch": 0.4988179669030733,
+ "grad_norm": 3.058112859725952,
+ "learning_rate": 4.9293505396401655e-06,
+ "loss": 0.6477,
+ "step": 1055
+ },
+ {
+ "epoch": 0.49929078014184397,
+ "grad_norm": 2.5227596759796143,
+ "learning_rate": 4.929203207867016e-06,
+ "loss": 0.5819,
+ "step": 1056
+ },
+ {
+ "epoch": 0.49976359338061466,
+ "grad_norm": 3.386862277984619,
+ "learning_rate": 4.929055724838035e-06,
+ "loss": 0.7342,
+ "step": 1057
+ },
+ {
+ "epoch": 0.5002364066193853,
+ "grad_norm": 3.368346929550171,
+ "learning_rate": 4.928908090562404e-06,
+ "loss": 0.6622,
+ "step": 1058
+ },
+ {
+ "epoch": 0.500709219858156,
+ "grad_norm": 2.9108314514160156,
+ "learning_rate": 4.928760305049317e-06,
+ "loss": 0.6598,
+ "step": 1059
+ },
+ {
+ "epoch": 0.5011820330969267,
+ "grad_norm": 2.822305917739868,
+ "learning_rate": 4.928612368307977e-06,
+ "loss": 0.5841,
+ "step": 1060
+ },
+ {
+ "epoch": 0.5016548463356973,
+ "grad_norm": 2.689131259918213,
+ "learning_rate": 4.928464280347592e-06,
+ "loss": 0.6631,
+ "step": 1061
+ },
+ {
+ "epoch": 0.502127659574468,
+ "grad_norm": 3.337214946746826,
+ "learning_rate": 4.9283160411773864e-06,
+ "loss": 0.6105,
+ "step": 1062
+ },
+ {
+ "epoch": 0.5026004728132387,
+ "grad_norm": 3.035911798477173,
+ "learning_rate": 4.928167650806588e-06,
+ "loss": 0.6981,
+ "step": 1063
+ },
+ {
+ "epoch": 0.5030732860520094,
+ "grad_norm": 2.8820855617523193,
+ "learning_rate": 4.9280191092444375e-06,
+ "loss": 0.6408,
+ "step": 1064
+ },
+ {
+ "epoch": 0.5035460992907801,
+ "grad_norm": 3.080432415008545,
+ "learning_rate": 4.927870416500183e-06,
+ "loss": 0.6398,
+ "step": 1065
+ },
+ {
+ "epoch": 0.5040189125295508,
+ "grad_norm": 2.761612892150879,
+ "learning_rate": 4.927721572583084e-06,
+ "loss": 0.6126,
+ "step": 1066
+ },
+ {
+ "epoch": 0.5044917257683215,
+ "grad_norm": 2.8561882972717285,
+ "learning_rate": 4.927572577502408e-06,
+ "loss": 0.584,
+ "step": 1067
+ },
+ {
+ "epoch": 0.5049645390070922,
+ "grad_norm": 3.3386311531066895,
+ "learning_rate": 4.927423431267432e-06,
+ "loss": 0.6666,
+ "step": 1068
+ },
+ {
+ "epoch": 0.5054373522458628,
+ "grad_norm": 2.632906675338745,
+ "learning_rate": 4.927274133887443e-06,
+ "loss": 0.632,
+ "step": 1069
+ },
+ {
+ "epoch": 0.5059101654846335,
+ "grad_norm": 2.8737308979034424,
+ "learning_rate": 4.927124685371737e-06,
+ "loss": 0.6051,
+ "step": 1070
+ },
+ {
+ "epoch": 0.5063829787234042,
+ "grad_norm": 3.042222738265991,
+ "learning_rate": 4.926975085729619e-06,
+ "loss": 0.6954,
+ "step": 1071
+ },
+ {
+ "epoch": 0.5068557919621749,
+ "grad_norm": 3.3341481685638428,
+ "learning_rate": 4.926825334970404e-06,
+ "loss": 0.7148,
+ "step": 1072
+ },
+ {
+ "epoch": 0.5073286052009456,
+ "grad_norm": 2.7415387630462646,
+ "learning_rate": 4.926675433103418e-06,
+ "loss": 0.5456,
+ "step": 1073
+ },
+ {
+ "epoch": 0.5078014184397163,
+ "grad_norm": 2.7545325756073,
+ "learning_rate": 4.926525380137993e-06,
+ "loss": 0.6213,
+ "step": 1074
+ },
+ {
+ "epoch": 0.508274231678487,
+ "grad_norm": 2.9153690338134766,
+ "learning_rate": 4.926375176083472e-06,
+ "loss": 0.6466,
+ "step": 1075
+ },
+ {
+ "epoch": 0.5087470449172576,
+ "grad_norm": 4.210638523101807,
+ "learning_rate": 4.926224820949209e-06,
+ "loss": 0.6192,
+ "step": 1076
+ },
+ {
+ "epoch": 0.5092198581560283,
+ "grad_norm": 2.4357898235321045,
+ "learning_rate": 4.926074314744565e-06,
+ "loss": 0.594,
+ "step": 1077
+ },
+ {
+ "epoch": 0.509692671394799,
+ "grad_norm": 2.8004701137542725,
+ "learning_rate": 4.92592365747891e-06,
+ "loss": 0.6276,
+ "step": 1078
+ },
+ {
+ "epoch": 0.5101654846335697,
+ "grad_norm": 2.920675039291382,
+ "learning_rate": 4.925772849161628e-06,
+ "loss": 0.6043,
+ "step": 1079
+ },
+ {
+ "epoch": 0.5106382978723404,
+ "grad_norm": 2.791555404663086,
+ "learning_rate": 4.9256218898021055e-06,
+ "loss": 0.6837,
+ "step": 1080
+ },
+ {
+ "epoch": 0.5111111111111111,
+ "grad_norm": 3.1702463626861572,
+ "learning_rate": 4.925470779409746e-06,
+ "loss": 0.668,
+ "step": 1081
+ },
+ {
+ "epoch": 0.5115839243498818,
+ "grad_norm": 2.7149479389190674,
+ "learning_rate": 4.925319517993955e-06,
+ "loss": 0.5842,
+ "step": 1082
+ },
+ {
+ "epoch": 0.5120567375886524,
+ "grad_norm": 2.916311025619507,
+ "learning_rate": 4.925168105564153e-06,
+ "loss": 0.6893,
+ "step": 1083
+ },
+ {
+ "epoch": 0.5125295508274231,
+ "grad_norm": 2.917654514312744,
+ "learning_rate": 4.925016542129767e-06,
+ "loss": 0.6513,
+ "step": 1084
+ },
+ {
+ "epoch": 0.5130023640661938,
+ "grad_norm": 2.5568928718566895,
+ "learning_rate": 4.924864827700234e-06,
+ "loss": 0.6177,
+ "step": 1085
+ },
+ {
+ "epoch": 0.5134751773049645,
+ "grad_norm": 2.816720485687256,
+ "learning_rate": 4.924712962285001e-06,
+ "loss": 0.5833,
+ "step": 1086
+ },
+ {
+ "epoch": 0.5139479905437352,
+ "grad_norm": 2.6989188194274902,
+ "learning_rate": 4.9245609458935235e-06,
+ "loss": 0.6332,
+ "step": 1087
+ },
+ {
+ "epoch": 0.5144208037825059,
+ "grad_norm": 2.959599494934082,
+ "learning_rate": 4.924408778535268e-06,
+ "loss": 0.626,
+ "step": 1088
+ },
+ {
+ "epoch": 0.5148936170212766,
+ "grad_norm": 2.872814416885376,
+ "learning_rate": 4.924256460219708e-06,
+ "loss": 0.6407,
+ "step": 1089
+ },
+ {
+ "epoch": 0.5153664302600472,
+ "grad_norm": 2.6989097595214844,
+ "learning_rate": 4.924103990956329e-06,
+ "loss": 0.6391,
+ "step": 1090
+ },
+ {
+ "epoch": 0.5158392434988179,
+ "grad_norm": 2.986492156982422,
+ "learning_rate": 4.9239513707546235e-06,
+ "loss": 0.6911,
+ "step": 1091
+ },
+ {
+ "epoch": 0.5163120567375886,
+ "grad_norm": 3.069920301437378,
+ "learning_rate": 4.9237985996240954e-06,
+ "loss": 0.671,
+ "step": 1092
+ },
+ {
+ "epoch": 0.5167848699763593,
+ "grad_norm": 2.8214917182922363,
+ "learning_rate": 4.9236456775742555e-06,
+ "loss": 0.5885,
+ "step": 1093
+ },
+ {
+ "epoch": 0.51725768321513,
+ "grad_norm": 2.9416961669921875,
+ "learning_rate": 4.923492604614627e-06,
+ "loss": 0.6293,
+ "step": 1094
+ },
+ {
+ "epoch": 0.5177304964539007,
+ "grad_norm": 2.761780023574829,
+ "learning_rate": 4.923339380754741e-06,
+ "loss": 0.649,
+ "step": 1095
+ },
+ {
+ "epoch": 0.5182033096926714,
+ "grad_norm": 2.7648792266845703,
+ "learning_rate": 4.923186006004138e-06,
+ "loss": 0.5906,
+ "step": 1096
+ },
+ {
+ "epoch": 0.518676122931442,
+ "grad_norm": 3.5535428524017334,
+ "learning_rate": 4.923032480372367e-06,
+ "loss": 0.7138,
+ "step": 1097
+ },
+ {
+ "epoch": 0.5191489361702127,
+ "grad_norm": 2.6252479553222656,
+ "learning_rate": 4.922878803868988e-06,
+ "loss": 0.5499,
+ "step": 1098
+ },
+ {
+ "epoch": 0.5196217494089834,
+ "grad_norm": 2.901002883911133,
+ "learning_rate": 4.9227249765035715e-06,
+ "loss": 0.6991,
+ "step": 1099
+ },
+ {
+ "epoch": 0.5200945626477541,
+ "grad_norm": 2.621877431869507,
+ "learning_rate": 4.9225709982856925e-06,
+ "loss": 0.6269,
+ "step": 1100
+ },
+ {
+ "epoch": 0.5205673758865248,
+ "grad_norm": 2.872483015060425,
+ "learning_rate": 4.92241686922494e-06,
+ "loss": 0.6657,
+ "step": 1101
+ },
+ {
+ "epoch": 0.5210401891252955,
+ "grad_norm": 2.730447769165039,
+ "learning_rate": 4.922262589330912e-06,
+ "loss": 0.6061,
+ "step": 1102
+ },
+ {
+ "epoch": 0.5215130023640662,
+ "grad_norm": 2.646247386932373,
+ "learning_rate": 4.922108158613213e-06,
+ "loss": 0.5923,
+ "step": 1103
+ },
+ {
+ "epoch": 0.5219858156028369,
+ "grad_norm": 2.6488895416259766,
+ "learning_rate": 4.92195357708146e-06,
+ "loss": 0.6293,
+ "step": 1104
+ },
+ {
+ "epoch": 0.5224586288416075,
+ "grad_norm": 2.756338357925415,
+ "learning_rate": 4.921798844745278e-06,
+ "loss": 0.6374,
+ "step": 1105
+ },
+ {
+ "epoch": 0.5229314420803782,
+ "grad_norm": 3.1441280841827393,
+ "learning_rate": 4.921643961614301e-06,
+ "loss": 0.6652,
+ "step": 1106
+ },
+ {
+ "epoch": 0.5234042553191489,
+ "grad_norm": 3.050002098083496,
+ "learning_rate": 4.921488927698172e-06,
+ "loss": 0.6809,
+ "step": 1107
+ },
+ {
+ "epoch": 0.5238770685579196,
+ "grad_norm": 2.71750807762146,
+ "learning_rate": 4.921333743006547e-06,
+ "loss": 0.6266,
+ "step": 1108
+ },
+ {
+ "epoch": 0.5243498817966903,
+ "grad_norm": 2.8439245223999023,
+ "learning_rate": 4.921178407549086e-06,
+ "loss": 0.5663,
+ "step": 1109
+ },
+ {
+ "epoch": 0.524822695035461,
+ "grad_norm": 3.0722241401672363,
+ "learning_rate": 4.921022921335464e-06,
+ "loss": 0.6791,
+ "step": 1110
+ },
+ {
+ "epoch": 0.5252955082742317,
+ "grad_norm": 3.4381656646728516,
+ "learning_rate": 4.920867284375358e-06,
+ "loss": 0.6687,
+ "step": 1111
+ },
+ {
+ "epoch": 0.5257683215130023,
+ "grad_norm": 2.819812774658203,
+ "learning_rate": 4.920711496678463e-06,
+ "loss": 0.6299,
+ "step": 1112
+ },
+ {
+ "epoch": 0.526241134751773,
+ "grad_norm": 3.6587414741516113,
+ "learning_rate": 4.9205555582544765e-06,
+ "loss": 0.7392,
+ "step": 1113
+ },
+ {
+ "epoch": 0.5267139479905437,
+ "grad_norm": 2.774296522140503,
+ "learning_rate": 4.920399469113109e-06,
+ "loss": 0.6652,
+ "step": 1114
+ },
+ {
+ "epoch": 0.5271867612293144,
+ "grad_norm": 2.7480580806732178,
+ "learning_rate": 4.920243229264081e-06,
+ "loss": 0.596,
+ "step": 1115
+ },
+ {
+ "epoch": 0.5276595744680851,
+ "grad_norm": 3.213057518005371,
+ "learning_rate": 4.920086838717119e-06,
+ "loss": 0.6986,
+ "step": 1116
+ },
+ {
+ "epoch": 0.5281323877068558,
+ "grad_norm": 2.940546989440918,
+ "learning_rate": 4.919930297481962e-06,
+ "loss": 0.6481,
+ "step": 1117
+ },
+ {
+ "epoch": 0.5286052009456265,
+ "grad_norm": 2.5970494747161865,
+ "learning_rate": 4.9197736055683555e-06,
+ "loss": 0.5658,
+ "step": 1118
+ },
+ {
+ "epoch": 0.5290780141843971,
+ "grad_norm": 4.49385404586792,
+ "learning_rate": 4.919616762986057e-06,
+ "loss": 0.605,
+ "step": 1119
+ },
+ {
+ "epoch": 0.5295508274231678,
+ "grad_norm": 2.971857786178589,
+ "learning_rate": 4.919459769744833e-06,
+ "loss": 0.6539,
+ "step": 1120
+ },
+ {
+ "epoch": 0.5300236406619385,
+ "grad_norm": 2.6192965507507324,
+ "learning_rate": 4.919302625854457e-06,
+ "loss": 0.6226,
+ "step": 1121
+ },
+ {
+ "epoch": 0.5304964539007092,
+ "grad_norm": 2.665088176727295,
+ "learning_rate": 4.919145331324716e-06,
+ "loss": 0.6647,
+ "step": 1122
+ },
+ {
+ "epoch": 0.5309692671394799,
+ "grad_norm": 2.612126111984253,
+ "learning_rate": 4.918987886165403e-06,
+ "loss": 0.6965,
+ "step": 1123
+ },
+ {
+ "epoch": 0.5314420803782506,
+ "grad_norm": 3.80017352104187,
+ "learning_rate": 4.9188302903863205e-06,
+ "loss": 0.7396,
+ "step": 1124
+ },
+ {
+ "epoch": 0.5319148936170213,
+ "grad_norm": 2.781752824783325,
+ "learning_rate": 4.918672543997282e-06,
+ "loss": 0.5985,
+ "step": 1125
+ },
+ {
+ "epoch": 0.532387706855792,
+ "grad_norm": 2.6067914962768555,
+ "learning_rate": 4.91851464700811e-06,
+ "loss": 0.6159,
+ "step": 1126
+ },
+ {
+ "epoch": 0.5328605200945626,
+ "grad_norm": 2.670807123184204,
+ "learning_rate": 4.918356599428636e-06,
+ "loss": 0.5958,
+ "step": 1127
+ },
+ {
+ "epoch": 0.5333333333333333,
+ "grad_norm": 2.608611822128296,
+ "learning_rate": 4.9181984012687e-06,
+ "loss": 0.5768,
+ "step": 1128
+ },
+ {
+ "epoch": 0.533806146572104,
+ "grad_norm": 2.586764097213745,
+ "learning_rate": 4.918040052538154e-06,
+ "loss": 0.661,
+ "step": 1129
+ },
+ {
+ "epoch": 0.5342789598108747,
+ "grad_norm": 3.1317451000213623,
+ "learning_rate": 4.917881553246856e-06,
+ "loss": 0.6626,
+ "step": 1130
+ },
+ {
+ "epoch": 0.5347517730496454,
+ "grad_norm": 2.7135281562805176,
+ "learning_rate": 4.917722903404676e-06,
+ "loss": 0.6572,
+ "step": 1131
+ },
+ {
+ "epoch": 0.5352245862884161,
+ "grad_norm": 3.4546358585357666,
+ "learning_rate": 4.917564103021493e-06,
+ "loss": 0.5597,
+ "step": 1132
+ },
+ {
+ "epoch": 0.5356973995271868,
+ "grad_norm": 3.0943493843078613,
+ "learning_rate": 4.917405152107193e-06,
+ "loss": 0.7258,
+ "step": 1133
+ },
+ {
+ "epoch": 0.5361702127659574,
+ "grad_norm": 2.6069352626800537,
+ "learning_rate": 4.917246050671674e-06,
+ "loss": 0.6209,
+ "step": 1134
+ },
+ {
+ "epoch": 0.5366430260047281,
+ "grad_norm": 2.584883689880371,
+ "learning_rate": 4.917086798724844e-06,
+ "loss": 0.658,
+ "step": 1135
+ },
+ {
+ "epoch": 0.5371158392434988,
+ "grad_norm": 3.001976490020752,
+ "learning_rate": 4.9169273962766166e-06,
+ "loss": 0.6306,
+ "step": 1136
+ },
+ {
+ "epoch": 0.5375886524822695,
+ "grad_norm": 2.5013928413391113,
+ "learning_rate": 4.916767843336918e-06,
+ "loss": 0.572,
+ "step": 1137
+ },
+ {
+ "epoch": 0.5380614657210402,
+ "grad_norm": 2.9114553928375244,
+ "learning_rate": 4.916608139915684e-06,
+ "loss": 0.5841,
+ "step": 1138
+ },
+ {
+ "epoch": 0.5385342789598109,
+ "grad_norm": 2.8878467082977295,
+ "learning_rate": 4.9164482860228564e-06,
+ "loss": 0.6654,
+ "step": 1139
+ },
+ {
+ "epoch": 0.5390070921985816,
+ "grad_norm": 2.9827866554260254,
+ "learning_rate": 4.91628828166839e-06,
+ "loss": 0.6674,
+ "step": 1140
+ },
+ {
+ "epoch": 0.5394799054373522,
+ "grad_norm": 3.8696281909942627,
+ "learning_rate": 4.916128126862248e-06,
+ "loss": 0.6241,
+ "step": 1141
+ },
+ {
+ "epoch": 0.5399527186761229,
+ "grad_norm": 2.9556291103363037,
+ "learning_rate": 4.915967821614402e-06,
+ "loss": 0.6478,
+ "step": 1142
+ },
+ {
+ "epoch": 0.5404255319148936,
+ "grad_norm": 2.392942428588867,
+ "learning_rate": 4.915807365934834e-06,
+ "loss": 0.6097,
+ "step": 1143
+ },
+ {
+ "epoch": 0.5408983451536643,
+ "grad_norm": 3.032235860824585,
+ "learning_rate": 4.915646759833534e-06,
+ "loss": 0.7193,
+ "step": 1144
+ },
+ {
+ "epoch": 0.541371158392435,
+ "grad_norm": 2.840416193008423,
+ "learning_rate": 4.915486003320501e-06,
+ "loss": 0.5506,
+ "step": 1145
+ },
+ {
+ "epoch": 0.5418439716312057,
+ "grad_norm": 2.5438895225524902,
+ "learning_rate": 4.915325096405747e-06,
+ "loss": 0.6487,
+ "step": 1146
+ },
+ {
+ "epoch": 0.5423167848699764,
+ "grad_norm": 2.544334650039673,
+ "learning_rate": 4.9151640390992905e-06,
+ "loss": 0.6168,
+ "step": 1147
+ },
+ {
+ "epoch": 0.542789598108747,
+ "grad_norm": 2.8535678386688232,
+ "learning_rate": 4.91500283141116e-06,
+ "loss": 0.678,
+ "step": 1148
+ },
+ {
+ "epoch": 0.5432624113475177,
+ "grad_norm": 2.8086955547332764,
+ "learning_rate": 4.9148414733513915e-06,
+ "loss": 0.6473,
+ "step": 1149
+ },
+ {
+ "epoch": 0.5437352245862884,
+ "grad_norm": 2.4709885120391846,
+ "learning_rate": 4.914679964930034e-06,
+ "loss": 0.6797,
+ "step": 1150
+ },
+ {
+ "epoch": 0.5442080378250591,
+ "grad_norm": 2.8546934127807617,
+ "learning_rate": 4.9145183061571435e-06,
+ "loss": 0.6247,
+ "step": 1151
+ },
+ {
+ "epoch": 0.5446808510638298,
+ "grad_norm": 2.991184711456299,
+ "learning_rate": 4.9143564970427844e-06,
+ "loss": 0.5977,
+ "step": 1152
+ },
+ {
+ "epoch": 0.5451536643026005,
+ "grad_norm": 3.011216402053833,
+ "learning_rate": 4.914194537597033e-06,
+ "loss": 0.7005,
+ "step": 1153
+ },
+ {
+ "epoch": 0.5456264775413712,
+ "grad_norm": 2.807521343231201,
+ "learning_rate": 4.9140324278299744e-06,
+ "loss": 0.5412,
+ "step": 1154
+ },
+ {
+ "epoch": 0.5460992907801419,
+ "grad_norm": 3.0401229858398438,
+ "learning_rate": 4.913870167751701e-06,
+ "loss": 0.6394,
+ "step": 1155
+ },
+ {
+ "epoch": 0.5465721040189125,
+ "grad_norm": 2.853914976119995,
+ "learning_rate": 4.913707757372317e-06,
+ "loss": 0.6745,
+ "step": 1156
+ },
+ {
+ "epoch": 0.5470449172576832,
+ "grad_norm": 4.505620956420898,
+ "learning_rate": 4.913545196701935e-06,
+ "loss": 0.6668,
+ "step": 1157
+ },
+ {
+ "epoch": 0.5475177304964539,
+ "grad_norm": 3.0505781173706055,
+ "learning_rate": 4.913382485750676e-06,
+ "loss": 0.6926,
+ "step": 1158
+ },
+ {
+ "epoch": 0.5479905437352246,
+ "grad_norm": 2.798435688018799,
+ "learning_rate": 4.913219624528672e-06,
+ "loss": 0.605,
+ "step": 1159
+ },
+ {
+ "epoch": 0.5484633569739953,
+ "grad_norm": 2.7814908027648926,
+ "learning_rate": 4.913056613046065e-06,
+ "loss": 0.6678,
+ "step": 1160
+ },
+ {
+ "epoch": 0.548936170212766,
+ "grad_norm": 3.2089321613311768,
+ "learning_rate": 4.9128934513130025e-06,
+ "loss": 0.5995,
+ "step": 1161
+ },
+ {
+ "epoch": 0.5494089834515367,
+ "grad_norm": 2.7699952125549316,
+ "learning_rate": 4.9127301393396455e-06,
+ "loss": 0.7062,
+ "step": 1162
+ },
+ {
+ "epoch": 0.5498817966903073,
+ "grad_norm": 2.859368324279785,
+ "learning_rate": 4.912566677136162e-06,
+ "loss": 0.6063,
+ "step": 1163
+ },
+ {
+ "epoch": 0.550354609929078,
+ "grad_norm": 2.727334499359131,
+ "learning_rate": 4.91240306471273e-06,
+ "loss": 0.6848,
+ "step": 1164
+ },
+ {
+ "epoch": 0.5508274231678487,
+ "grad_norm": 2.6017510890960693,
+ "learning_rate": 4.912239302079537e-06,
+ "loss": 0.5808,
+ "step": 1165
+ },
+ {
+ "epoch": 0.5513002364066194,
+ "grad_norm": 3.539583206176758,
+ "learning_rate": 4.912075389246781e-06,
+ "loss": 0.7053,
+ "step": 1166
+ },
+ {
+ "epoch": 0.5517730496453901,
+ "grad_norm": 2.918280601501465,
+ "learning_rate": 4.911911326224666e-06,
+ "loss": 0.5904,
+ "step": 1167
+ },
+ {
+ "epoch": 0.5522458628841608,
+ "grad_norm": 3.0067362785339355,
+ "learning_rate": 4.9117471130234095e-06,
+ "loss": 0.6392,
+ "step": 1168
+ },
+ {
+ "epoch": 0.5527186761229315,
+ "grad_norm": 2.4374797344207764,
+ "learning_rate": 4.911582749653236e-06,
+ "loss": 0.5793,
+ "step": 1169
+ },
+ {
+ "epoch": 0.5531914893617021,
+ "grad_norm": 3.121182918548584,
+ "learning_rate": 4.911418236124378e-06,
+ "loss": 0.6636,
+ "step": 1170
+ },
+ {
+ "epoch": 0.5536643026004728,
+ "grad_norm": 3.1289851665496826,
+ "learning_rate": 4.91125357244708e-06,
+ "loss": 0.656,
+ "step": 1171
+ },
+ {
+ "epoch": 0.5541371158392435,
+ "grad_norm": 2.7034592628479004,
+ "learning_rate": 4.911088758631596e-06,
+ "loss": 0.6001,
+ "step": 1172
+ },
+ {
+ "epoch": 0.5546099290780142,
+ "grad_norm": 2.710146188735962,
+ "learning_rate": 4.910923794688187e-06,
+ "loss": 0.6007,
+ "step": 1173
+ },
+ {
+ "epoch": 0.5550827423167849,
+ "grad_norm": 2.5424487590789795,
+ "learning_rate": 4.910758680627124e-06,
+ "loss": 0.5193,
+ "step": 1174
+ },
+ {
+ "epoch": 0.5555555555555556,
+ "grad_norm": 2.615893602371216,
+ "learning_rate": 4.91059341645869e-06,
+ "loss": 0.5525,
+ "step": 1175
+ },
+ {
+ "epoch": 0.5560283687943263,
+ "grad_norm": 3.3179728984832764,
+ "learning_rate": 4.910428002193174e-06,
+ "loss": 0.7285,
+ "step": 1176
+ },
+ {
+ "epoch": 0.556501182033097,
+ "grad_norm": 2.7234175205230713,
+ "learning_rate": 4.910262437840875e-06,
+ "loss": 0.574,
+ "step": 1177
+ },
+ {
+ "epoch": 0.5569739952718676,
+ "grad_norm": 3.0416605472564697,
+ "learning_rate": 4.9100967234121034e-06,
+ "loss": 0.5623,
+ "step": 1178
+ },
+ {
+ "epoch": 0.5574468085106383,
+ "grad_norm": 3.067786455154419,
+ "learning_rate": 4.909930858917177e-06,
+ "loss": 0.6491,
+ "step": 1179
+ },
+ {
+ "epoch": 0.557919621749409,
+ "grad_norm": 3.0037379264831543,
+ "learning_rate": 4.909764844366422e-06,
+ "loss": 0.5696,
+ "step": 1180
+ },
+ {
+ "epoch": 0.5583924349881797,
+ "grad_norm": 2.966179609298706,
+ "learning_rate": 4.909598679770178e-06,
+ "loss": 0.6042,
+ "step": 1181
+ },
+ {
+ "epoch": 0.5588652482269504,
+ "grad_norm": 2.6000657081604004,
+ "learning_rate": 4.909432365138789e-06,
+ "loss": 0.5883,
+ "step": 1182
+ },
+ {
+ "epoch": 0.5593380614657211,
+ "grad_norm": 2.6794495582580566,
+ "learning_rate": 4.909265900482612e-06,
+ "loss": 0.6809,
+ "step": 1183
+ },
+ {
+ "epoch": 0.5598108747044918,
+ "grad_norm": 2.6765122413635254,
+ "learning_rate": 4.9090992858120115e-06,
+ "loss": 0.6601,
+ "step": 1184
+ },
+ {
+ "epoch": 0.5602836879432624,
+ "grad_norm": 2.6051928997039795,
+ "learning_rate": 4.908932521137363e-06,
+ "loss": 0.5946,
+ "step": 1185
+ },
+ {
+ "epoch": 0.5607565011820331,
+ "grad_norm": 3.0405542850494385,
+ "learning_rate": 4.908765606469048e-06,
+ "loss": 0.6998,
+ "step": 1186
+ },
+ {
+ "epoch": 0.5612293144208038,
+ "grad_norm": 2.7975668907165527,
+ "learning_rate": 4.908598541817462e-06,
+ "loss": 0.6218,
+ "step": 1187
+ },
+ {
+ "epoch": 0.5617021276595745,
+ "grad_norm": 2.5367627143859863,
+ "learning_rate": 4.908431327193005e-06,
+ "loss": 0.6354,
+ "step": 1188
+ },
+ {
+ "epoch": 0.5621749408983452,
+ "grad_norm": 3.7939631938934326,
+ "learning_rate": 4.908263962606091e-06,
+ "loss": 0.6376,
+ "step": 1189
+ },
+ {
+ "epoch": 0.5626477541371159,
+ "grad_norm": 2.864079475402832,
+ "learning_rate": 4.908096448067139e-06,
+ "loss": 0.5485,
+ "step": 1190
+ },
+ {
+ "epoch": 0.5631205673758866,
+ "grad_norm": 2.7855563163757324,
+ "learning_rate": 4.9079287835865804e-06,
+ "loss": 0.6645,
+ "step": 1191
+ },
+ {
+ "epoch": 0.5635933806146572,
+ "grad_norm": 2.6156625747680664,
+ "learning_rate": 4.9077609691748556e-06,
+ "loss": 0.5751,
+ "step": 1192
+ },
+ {
+ "epoch": 0.5640661938534279,
+ "grad_norm": 3.0475659370422363,
+ "learning_rate": 4.907593004842412e-06,
+ "loss": 0.6739,
+ "step": 1193
+ },
+ {
+ "epoch": 0.5645390070921986,
+ "grad_norm": 2.9176738262176514,
+ "learning_rate": 4.9074248905997104e-06,
+ "loss": 0.6493,
+ "step": 1194
+ },
+ {
+ "epoch": 0.5650118203309693,
+ "grad_norm": 2.6168384552001953,
+ "learning_rate": 4.907256626457216e-06,
+ "loss": 0.6154,
+ "step": 1195
+ },
+ {
+ "epoch": 0.56548463356974,
+ "grad_norm": 2.893980026245117,
+ "learning_rate": 4.907088212425408e-06,
+ "loss": 0.5808,
+ "step": 1196
+ },
+ {
+ "epoch": 0.5659574468085107,
+ "grad_norm": 3.3832836151123047,
+ "learning_rate": 4.90691964851477e-06,
+ "loss": 0.7888,
+ "step": 1197
+ },
+ {
+ "epoch": 0.5664302600472814,
+ "grad_norm": 3.088932752609253,
+ "learning_rate": 4.906750934735801e-06,
+ "loss": 0.6516,
+ "step": 1198
+ },
+ {
+ "epoch": 0.566903073286052,
+ "grad_norm": 2.494471549987793,
+ "learning_rate": 4.906582071099004e-06,
+ "loss": 0.6286,
+ "step": 1199
+ },
+ {
+ "epoch": 0.5673758865248227,
+ "grad_norm": 2.716550588607788,
+ "learning_rate": 4.906413057614895e-06,
+ "loss": 0.5939,
+ "step": 1200
+ },
+ {
+ "epoch": 0.5678486997635934,
+ "grad_norm": 2.5821073055267334,
+ "learning_rate": 4.906243894293995e-06,
+ "loss": 0.6668,
+ "step": 1201
+ },
+ {
+ "epoch": 0.5683215130023641,
+ "grad_norm": 3.651787042617798,
+ "learning_rate": 4.90607458114684e-06,
+ "loss": 0.6124,
+ "step": 1202
+ },
+ {
+ "epoch": 0.5687943262411348,
+ "grad_norm": 2.7567858695983887,
+ "learning_rate": 4.9059051181839705e-06,
+ "loss": 0.6656,
+ "step": 1203
+ },
+ {
+ "epoch": 0.5692671394799055,
+ "grad_norm": 2.8067586421966553,
+ "learning_rate": 4.90573550541594e-06,
+ "loss": 0.6306,
+ "step": 1204
+ },
+ {
+ "epoch": 0.5697399527186762,
+ "grad_norm": 2.6136393547058105,
+ "learning_rate": 4.905565742853307e-06,
+ "loss": 0.5992,
+ "step": 1205
+ },
+ {
+ "epoch": 0.5702127659574469,
+ "grad_norm": 2.899049758911133,
+ "learning_rate": 4.905395830506644e-06,
+ "loss": 0.621,
+ "step": 1206
+ },
+ {
+ "epoch": 0.5706855791962175,
+ "grad_norm": 3.036583185195923,
+ "learning_rate": 4.9052257683865294e-06,
+ "loss": 0.652,
+ "step": 1207
+ },
+ {
+ "epoch": 0.5711583924349882,
+ "grad_norm": 2.7947216033935547,
+ "learning_rate": 4.905055556503553e-06,
+ "loss": 0.6636,
+ "step": 1208
+ },
+ {
+ "epoch": 0.5716312056737589,
+ "grad_norm": 3.1646955013275146,
+ "learning_rate": 4.9048851948683135e-06,
+ "loss": 0.6376,
+ "step": 1209
+ },
+ {
+ "epoch": 0.5721040189125296,
+ "grad_norm": 2.8175766468048096,
+ "learning_rate": 4.904714683491417e-06,
+ "loss": 0.5929,
+ "step": 1210
+ },
+ {
+ "epoch": 0.5725768321513003,
+ "grad_norm": 2.923923969268799,
+ "learning_rate": 4.904544022383483e-06,
+ "loss": 0.6633,
+ "step": 1211
+ },
+ {
+ "epoch": 0.573049645390071,
+ "grad_norm": 2.7471134662628174,
+ "learning_rate": 4.9043732115551356e-06,
+ "loss": 0.6551,
+ "step": 1212
+ },
+ {
+ "epoch": 0.5735224586288417,
+ "grad_norm": 2.8660807609558105,
+ "learning_rate": 4.90420225101701e-06,
+ "loss": 0.6423,
+ "step": 1213
+ },
+ {
+ "epoch": 0.5739952718676123,
+ "grad_norm": 2.769247531890869,
+ "learning_rate": 4.904031140779754e-06,
+ "loss": 0.5982,
+ "step": 1214
+ },
+ {
+ "epoch": 0.574468085106383,
+ "grad_norm": 2.9043145179748535,
+ "learning_rate": 4.90385988085402e-06,
+ "loss": 0.5843,
+ "step": 1215
+ },
+ {
+ "epoch": 0.5749408983451537,
+ "grad_norm": 2.6639609336853027,
+ "learning_rate": 4.903688471250471e-06,
+ "loss": 0.5858,
+ "step": 1216
+ },
+ {
+ "epoch": 0.5754137115839244,
+ "grad_norm": 2.6967573165893555,
+ "learning_rate": 4.903516911979781e-06,
+ "loss": 0.5755,
+ "step": 1217
+ },
+ {
+ "epoch": 0.5758865248226951,
+ "grad_norm": 2.8865857124328613,
+ "learning_rate": 4.903345203052633e-06,
+ "loss": 0.6051,
+ "step": 1218
+ },
+ {
+ "epoch": 0.5763593380614658,
+ "grad_norm": 2.381979465484619,
+ "learning_rate": 4.903173344479717e-06,
+ "loss": 0.5727,
+ "step": 1219
+ },
+ {
+ "epoch": 0.5768321513002365,
+ "grad_norm": 2.7717981338500977,
+ "learning_rate": 4.903001336271734e-06,
+ "loss": 0.6406,
+ "step": 1220
+ },
+ {
+ "epoch": 0.577304964539007,
+ "grad_norm": 2.6431570053100586,
+ "learning_rate": 4.902829178439395e-06,
+ "loss": 0.6226,
+ "step": 1221
+ },
+ {
+ "epoch": 0.5777777777777777,
+ "grad_norm": 2.8090415000915527,
+ "learning_rate": 4.902656870993419e-06,
+ "loss": 0.5761,
+ "step": 1222
+ },
+ {
+ "epoch": 0.5782505910165484,
+ "grad_norm": 2.4769368171691895,
+ "learning_rate": 4.902484413944535e-06,
+ "loss": 0.5602,
+ "step": 1223
+ },
+ {
+ "epoch": 0.5787234042553191,
+ "grad_norm": 2.693316698074341,
+ "learning_rate": 4.902311807303481e-06,
+ "loss": 0.5222,
+ "step": 1224
+ },
+ {
+ "epoch": 0.5791962174940898,
+ "grad_norm": 2.7623913288116455,
+ "learning_rate": 4.902139051081004e-06,
+ "loss": 0.6978,
+ "step": 1225
+ },
+ {
+ "epoch": 0.5796690307328605,
+ "grad_norm": 2.6133766174316406,
+ "learning_rate": 4.901966145287863e-06,
+ "loss": 0.5802,
+ "step": 1226
+ },
+ {
+ "epoch": 0.5801418439716312,
+ "grad_norm": 2.7345972061157227,
+ "learning_rate": 4.901793089934821e-06,
+ "loss": 0.6294,
+ "step": 1227
+ },
+ {
+ "epoch": 0.5806146572104018,
+ "grad_norm": 2.7545835971832275,
+ "learning_rate": 4.9016198850326555e-06,
+ "loss": 0.6085,
+ "step": 1228
+ },
+ {
+ "epoch": 0.5810874704491725,
+ "grad_norm": 2.6947758197784424,
+ "learning_rate": 4.90144653059215e-06,
+ "loss": 0.6025,
+ "step": 1229
+ },
+ {
+ "epoch": 0.5815602836879432,
+ "grad_norm": 2.692967414855957,
+ "learning_rate": 4.901273026624099e-06,
+ "loss": 0.5715,
+ "step": 1230
+ },
+ {
+ "epoch": 0.5820330969267139,
+ "grad_norm": 2.78347110748291,
+ "learning_rate": 4.901099373139307e-06,
+ "loss": 0.6063,
+ "step": 1231
+ },
+ {
+ "epoch": 0.5825059101654846,
+ "grad_norm": 2.346496343612671,
+ "learning_rate": 4.900925570148585e-06,
+ "loss": 0.5869,
+ "step": 1232
+ },
+ {
+ "epoch": 0.5829787234042553,
+ "grad_norm": 2.606639862060547,
+ "learning_rate": 4.900751617662755e-06,
+ "loss": 0.6197,
+ "step": 1233
+ },
+ {
+ "epoch": 0.583451536643026,
+ "grad_norm": 2.5825929641723633,
+ "learning_rate": 4.900577515692649e-06,
+ "loss": 0.6721,
+ "step": 1234
+ },
+ {
+ "epoch": 0.5839243498817966,
+ "grad_norm": 2.731349468231201,
+ "learning_rate": 4.900403264249107e-06,
+ "loss": 0.6273,
+ "step": 1235
+ },
+ {
+ "epoch": 0.5843971631205673,
+ "grad_norm": 3.2133874893188477,
+ "learning_rate": 4.90022886334298e-06,
+ "loss": 0.6231,
+ "step": 1236
+ },
+ {
+ "epoch": 0.584869976359338,
+ "grad_norm": 2.9213852882385254,
+ "learning_rate": 4.900054312985127e-06,
+ "loss": 0.6677,
+ "step": 1237
+ },
+ {
+ "epoch": 0.5853427895981087,
+ "grad_norm": 2.815425157546997,
+ "learning_rate": 4.899879613186414e-06,
+ "loss": 0.6405,
+ "step": 1238
+ },
+ {
+ "epoch": 0.5858156028368794,
+ "grad_norm": 2.730782985687256,
+ "learning_rate": 4.899704763957721e-06,
+ "loss": 0.6233,
+ "step": 1239
+ },
+ {
+ "epoch": 0.5862884160756501,
+ "grad_norm": 2.6432766914367676,
+ "learning_rate": 4.899529765309936e-06,
+ "loss": 0.6267,
+ "step": 1240
+ },
+ {
+ "epoch": 0.5867612293144208,
+ "grad_norm": 2.616215229034424,
+ "learning_rate": 4.899354617253953e-06,
+ "loss": 0.6268,
+ "step": 1241
+ },
+ {
+ "epoch": 0.5872340425531914,
+ "grad_norm": 2.7630255222320557,
+ "learning_rate": 4.899179319800679e-06,
+ "loss": 0.6348,
+ "step": 1242
+ },
+ {
+ "epoch": 0.5877068557919621,
+ "grad_norm": 2.785095453262329,
+ "learning_rate": 4.899003872961029e-06,
+ "loss": 0.5839,
+ "step": 1243
+ },
+ {
+ "epoch": 0.5881796690307328,
+ "grad_norm": 2.9050328731536865,
+ "learning_rate": 4.898828276745927e-06,
+ "loss": 0.651,
+ "step": 1244
+ },
+ {
+ "epoch": 0.5886524822695035,
+ "grad_norm": 2.958092212677002,
+ "learning_rate": 4.8986525311663065e-06,
+ "loss": 0.6395,
+ "step": 1245
+ },
+ {
+ "epoch": 0.5891252955082742,
+ "grad_norm": 2.952310800552368,
+ "learning_rate": 4.898476636233111e-06,
+ "loss": 0.6731,
+ "step": 1246
+ },
+ {
+ "epoch": 0.5895981087470449,
+ "grad_norm": 2.9876346588134766,
+ "learning_rate": 4.898300591957293e-06,
+ "loss": 0.7015,
+ "step": 1247
+ },
+ {
+ "epoch": 0.5900709219858156,
+ "grad_norm": 2.8941752910614014,
+ "learning_rate": 4.898124398349813e-06,
+ "loss": 0.6452,
+ "step": 1248
+ },
+ {
+ "epoch": 0.5905437352245863,
+ "grad_norm": 2.9809536933898926,
+ "learning_rate": 4.897948055421642e-06,
+ "loss": 0.5736,
+ "step": 1249
+ },
+ {
+ "epoch": 0.5910165484633569,
+ "grad_norm": 2.927046775817871,
+ "learning_rate": 4.897771563183761e-06,
+ "loss": 0.5918,
+ "step": 1250
+ },
+ {
+ "epoch": 0.5914893617021276,
+ "grad_norm": 2.865020275115967,
+ "learning_rate": 4.897594921647158e-06,
+ "loss": 0.6924,
+ "step": 1251
+ },
+ {
+ "epoch": 0.5919621749408983,
+ "grad_norm": 2.7406699657440186,
+ "learning_rate": 4.897418130822832e-06,
+ "loss": 0.509,
+ "step": 1252
+ },
+ {
+ "epoch": 0.592434988179669,
+ "grad_norm": 2.781606912612915,
+ "learning_rate": 4.897241190721791e-06,
+ "loss": 0.5555,
+ "step": 1253
+ },
+ {
+ "epoch": 0.5929078014184397,
+ "grad_norm": 2.79209303855896,
+ "learning_rate": 4.8970641013550535e-06,
+ "loss": 0.6722,
+ "step": 1254
+ },
+ {
+ "epoch": 0.5933806146572104,
+ "grad_norm": 3.0672268867492676,
+ "learning_rate": 4.896886862733645e-06,
+ "loss": 0.6366,
+ "step": 1255
+ },
+ {
+ "epoch": 0.5938534278959811,
+ "grad_norm": 2.7456953525543213,
+ "learning_rate": 4.896709474868602e-06,
+ "loss": 0.6246,
+ "step": 1256
+ },
+ {
+ "epoch": 0.5943262411347517,
+ "grad_norm": 3.6731202602386475,
+ "learning_rate": 4.896531937770968e-06,
+ "loss": 0.668,
+ "step": 1257
+ },
+ {
+ "epoch": 0.5947990543735224,
+ "grad_norm": 2.6056087017059326,
+ "learning_rate": 4.8963542514518e-06,
+ "loss": 0.5815,
+ "step": 1258
+ },
+ {
+ "epoch": 0.5952718676122931,
+ "grad_norm": 2.719698905944824,
+ "learning_rate": 4.89617641592216e-06,
+ "loss": 0.6058,
+ "step": 1259
+ },
+ {
+ "epoch": 0.5957446808510638,
+ "grad_norm": 2.625838279724121,
+ "learning_rate": 4.895998431193121e-06,
+ "loss": 0.6143,
+ "step": 1260
+ },
+ {
+ "epoch": 0.5962174940898345,
+ "grad_norm": 2.7166085243225098,
+ "learning_rate": 4.895820297275767e-06,
+ "loss": 0.5187,
+ "step": 1261
+ },
+ {
+ "epoch": 0.5966903073286052,
+ "grad_norm": 2.7544102668762207,
+ "learning_rate": 4.8956420141811875e-06,
+ "loss": 0.5928,
+ "step": 1262
+ },
+ {
+ "epoch": 0.5971631205673759,
+ "grad_norm": 2.6678333282470703,
+ "learning_rate": 4.895463581920484e-06,
+ "loss": 0.611,
+ "step": 1263
+ },
+ {
+ "epoch": 0.5976359338061465,
+ "grad_norm": 2.853384494781494,
+ "learning_rate": 4.895285000504768e-06,
+ "loss": 0.642,
+ "step": 1264
+ },
+ {
+ "epoch": 0.5981087470449172,
+ "grad_norm": 2.637852430343628,
+ "learning_rate": 4.895106269945158e-06,
+ "loss": 0.6308,
+ "step": 1265
+ },
+ {
+ "epoch": 0.5985815602836879,
+ "grad_norm": 2.9880387783050537,
+ "learning_rate": 4.8949273902527826e-06,
+ "loss": 0.5781,
+ "step": 1266
+ },
+ {
+ "epoch": 0.5990543735224586,
+ "grad_norm": 3.5984015464782715,
+ "learning_rate": 4.89474836143878e-06,
+ "loss": 0.5865,
+ "step": 1267
+ },
+ {
+ "epoch": 0.5995271867612293,
+ "grad_norm": 2.719855546951294,
+ "learning_rate": 4.8945691835142975e-06,
+ "loss": 0.6393,
+ "step": 1268
+ },
+ {
+ "epoch": 0.6,
+ "grad_norm": 2.7885141372680664,
+ "learning_rate": 4.894389856490492e-06,
+ "loss": 0.66,
+ "step": 1269
+ },
+ {
+ "epoch": 0.6004728132387707,
+ "grad_norm": 2.698819875717163,
+ "learning_rate": 4.894210380378529e-06,
+ "loss": 0.6144,
+ "step": 1270
+ },
+ {
+ "epoch": 0.6009456264775414,
+ "grad_norm": 2.278045654296875,
+ "learning_rate": 4.894030755189584e-06,
+ "loss": 0.5609,
+ "step": 1271
+ },
+ {
+ "epoch": 0.601418439716312,
+ "grad_norm": 2.8729357719421387,
+ "learning_rate": 4.893850980934841e-06,
+ "loss": 0.6715,
+ "step": 1272
+ },
+ {
+ "epoch": 0.6018912529550827,
+ "grad_norm": 2.8541221618652344,
+ "learning_rate": 4.893671057625495e-06,
+ "loss": 0.6787,
+ "step": 1273
+ },
+ {
+ "epoch": 0.6023640661938534,
+ "grad_norm": 2.4561476707458496,
+ "learning_rate": 4.893490985272748e-06,
+ "loss": 0.6331,
+ "step": 1274
+ },
+ {
+ "epoch": 0.6028368794326241,
+ "grad_norm": 2.565739154815674,
+ "learning_rate": 4.893310763887812e-06,
+ "loss": 0.587,
+ "step": 1275
+ },
+ {
+ "epoch": 0.6033096926713948,
+ "grad_norm": 2.384951591491699,
+ "learning_rate": 4.8931303934819095e-06,
+ "loss": 0.5358,
+ "step": 1276
+ },
+ {
+ "epoch": 0.6037825059101655,
+ "grad_norm": 2.380808115005493,
+ "learning_rate": 4.89294987406627e-06,
+ "loss": 0.5402,
+ "step": 1277
+ },
+ {
+ "epoch": 0.6042553191489362,
+ "grad_norm": 2.764815092086792,
+ "learning_rate": 4.892769205652136e-06,
+ "loss": 0.6103,
+ "step": 1278
+ },
+ {
+ "epoch": 0.6047281323877068,
+ "grad_norm": 2.463350296020508,
+ "learning_rate": 4.892588388250754e-06,
+ "loss": 0.5937,
+ "step": 1279
+ },
+ {
+ "epoch": 0.6052009456264775,
+ "grad_norm": 3.099689245223999,
+ "learning_rate": 4.8924074218733855e-06,
+ "loss": 0.6354,
+ "step": 1280
+ },
+ {
+ "epoch": 0.6056737588652482,
+ "grad_norm": 2.804450035095215,
+ "learning_rate": 4.892226306531297e-06,
+ "loss": 0.6595,
+ "step": 1281
+ },
+ {
+ "epoch": 0.6061465721040189,
+ "grad_norm": 3.1559767723083496,
+ "learning_rate": 4.892045042235765e-06,
+ "loss": 0.6664,
+ "step": 1282
+ },
+ {
+ "epoch": 0.6066193853427896,
+ "grad_norm": 2.844341993331909,
+ "learning_rate": 4.891863628998079e-06,
+ "loss": 0.7454,
+ "step": 1283
+ },
+ {
+ "epoch": 0.6070921985815603,
+ "grad_norm": 2.686602830886841,
+ "learning_rate": 4.891682066829532e-06,
+ "loss": 0.6755,
+ "step": 1284
+ },
+ {
+ "epoch": 0.607565011820331,
+ "grad_norm": 2.736457347869873,
+ "learning_rate": 4.8915003557414285e-06,
+ "loss": 0.6305,
+ "step": 1285
+ },
+ {
+ "epoch": 0.6080378250591016,
+ "grad_norm": 2.661362409591675,
+ "learning_rate": 4.891318495745086e-06,
+ "loss": 0.5958,
+ "step": 1286
+ },
+ {
+ "epoch": 0.6085106382978723,
+ "grad_norm": 2.707348108291626,
+ "learning_rate": 4.8911364868518255e-06,
+ "loss": 0.5824,
+ "step": 1287
+ },
+ {
+ "epoch": 0.608983451536643,
+ "grad_norm": 2.9798858165740967,
+ "learning_rate": 4.890954329072981e-06,
+ "loss": 0.5981,
+ "step": 1288
+ },
+ {
+ "epoch": 0.6094562647754137,
+ "grad_norm": 2.6285455226898193,
+ "learning_rate": 4.890772022419895e-06,
+ "loss": 0.6194,
+ "step": 1289
+ },
+ {
+ "epoch": 0.6099290780141844,
+ "grad_norm": 2.9254322052001953,
+ "learning_rate": 4.890589566903917e-06,
+ "loss": 0.6002,
+ "step": 1290
+ },
+ {
+ "epoch": 0.6104018912529551,
+ "grad_norm": 2.6458325386047363,
+ "learning_rate": 4.89040696253641e-06,
+ "loss": 0.5457,
+ "step": 1291
+ },
+ {
+ "epoch": 0.6108747044917258,
+ "grad_norm": 2.508242607116699,
+ "learning_rate": 4.890224209328743e-06,
+ "loss": 0.6168,
+ "step": 1292
+ },
+ {
+ "epoch": 0.6113475177304964,
+ "grad_norm": 3.034785509109497,
+ "learning_rate": 4.890041307292296e-06,
+ "loss": 0.664,
+ "step": 1293
+ },
+ {
+ "epoch": 0.6118203309692671,
+ "grad_norm": 3.52469539642334,
+ "learning_rate": 4.889858256438455e-06,
+ "loss": 0.7301,
+ "step": 1294
+ },
+ {
+ "epoch": 0.6122931442080378,
+ "grad_norm": 2.9145348072052,
+ "learning_rate": 4.889675056778622e-06,
+ "loss": 0.6494,
+ "step": 1295
+ },
+ {
+ "epoch": 0.6127659574468085,
+ "grad_norm": 2.831829071044922,
+ "learning_rate": 4.8894917083242e-06,
+ "loss": 0.6064,
+ "step": 1296
+ },
+ {
+ "epoch": 0.6132387706855792,
+ "grad_norm": 2.6883130073547363,
+ "learning_rate": 4.889308211086608e-06,
+ "loss": 0.5642,
+ "step": 1297
+ },
+ {
+ "epoch": 0.6137115839243499,
+ "grad_norm": 3.0605485439300537,
+ "learning_rate": 4.889124565077269e-06,
+ "loss": 0.6695,
+ "step": 1298
+ },
+ {
+ "epoch": 0.6141843971631206,
+ "grad_norm": 3.44062876701355,
+ "learning_rate": 4.88894077030762e-06,
+ "loss": 0.6415,
+ "step": 1299
+ },
+ {
+ "epoch": 0.6146572104018913,
+ "grad_norm": 2.5970818996429443,
+ "learning_rate": 4.888756826789105e-06,
+ "loss": 0.6518,
+ "step": 1300
+ },
+ {
+ "epoch": 0.6151300236406619,
+ "grad_norm": 4.2233567237854,
+ "learning_rate": 4.8885727345331755e-06,
+ "loss": 0.6555,
+ "step": 1301
+ },
+ {
+ "epoch": 0.6156028368794326,
+ "grad_norm": 2.645385503768921,
+ "learning_rate": 4.888388493551297e-06,
+ "loss": 0.6762,
+ "step": 1302
+ },
+ {
+ "epoch": 0.6160756501182033,
+ "grad_norm": 2.907954454421997,
+ "learning_rate": 4.8882041038549385e-06,
+ "loss": 0.6526,
+ "step": 1303
+ },
+ {
+ "epoch": 0.616548463356974,
+ "grad_norm": 2.482771873474121,
+ "learning_rate": 4.888019565455583e-06,
+ "loss": 0.628,
+ "step": 1304
+ },
+ {
+ "epoch": 0.6170212765957447,
+ "grad_norm": 2.7165915966033936,
+ "learning_rate": 4.88783487836472e-06,
+ "loss": 0.5743,
+ "step": 1305
+ },
+ {
+ "epoch": 0.6174940898345154,
+ "grad_norm": 3.095627546310425,
+ "learning_rate": 4.88765004259385e-06,
+ "loss": 0.627,
+ "step": 1306
+ },
+ {
+ "epoch": 0.6179669030732861,
+ "grad_norm": 2.5018465518951416,
+ "learning_rate": 4.8874650581544805e-06,
+ "loss": 0.5215,
+ "step": 1307
+ },
+ {
+ "epoch": 0.6184397163120567,
+ "grad_norm": 3.094337224960327,
+ "learning_rate": 4.8872799250581316e-06,
+ "loss": 0.6979,
+ "step": 1308
+ },
+ {
+ "epoch": 0.6189125295508274,
+ "grad_norm": 3.1002209186553955,
+ "learning_rate": 4.887094643316329e-06,
+ "loss": 0.6565,
+ "step": 1309
+ },
+ {
+ "epoch": 0.6193853427895981,
+ "grad_norm": 2.551431894302368,
+ "learning_rate": 4.88690921294061e-06,
+ "loss": 0.5748,
+ "step": 1310
+ },
+ {
+ "epoch": 0.6198581560283688,
+ "grad_norm": 2.8282904624938965,
+ "learning_rate": 4.886723633942521e-06,
+ "loss": 0.676,
+ "step": 1311
+ },
+ {
+ "epoch": 0.6203309692671395,
+ "grad_norm": 2.8887810707092285,
+ "learning_rate": 4.886537906333617e-06,
+ "loss": 0.5971,
+ "step": 1312
+ },
+ {
+ "epoch": 0.6208037825059102,
+ "grad_norm": 2.9989118576049805,
+ "learning_rate": 4.886352030125462e-06,
+ "loss": 0.6341,
+ "step": 1313
+ },
+ {
+ "epoch": 0.6212765957446809,
+ "grad_norm": 2.8042776584625244,
+ "learning_rate": 4.886166005329629e-06,
+ "loss": 0.6578,
+ "step": 1314
+ },
+ {
+ "epoch": 0.6217494089834515,
+ "grad_norm": 2.4980967044830322,
+ "learning_rate": 4.8859798319577026e-06,
+ "loss": 0.6711,
+ "step": 1315
+ },
+ {
+ "epoch": 0.6222222222222222,
+ "grad_norm": 2.762369155883789,
+ "learning_rate": 4.885793510021274e-06,
+ "loss": 0.5747,
+ "step": 1316
+ },
+ {
+ "epoch": 0.6226950354609929,
+ "grad_norm": 3.136327028274536,
+ "learning_rate": 4.885607039531945e-06,
+ "loss": 0.7544,
+ "step": 1317
+ },
+ {
+ "epoch": 0.6231678486997636,
+ "grad_norm": 2.8736963272094727,
+ "learning_rate": 4.885420420501327e-06,
+ "loss": 0.6603,
+ "step": 1318
+ },
+ {
+ "epoch": 0.6236406619385343,
+ "grad_norm": 2.766237497329712,
+ "learning_rate": 4.885233652941039e-06,
+ "loss": 0.581,
+ "step": 1319
+ },
+ {
+ "epoch": 0.624113475177305,
+ "grad_norm": 2.4740939140319824,
+ "learning_rate": 4.88504673686271e-06,
+ "loss": 0.6335,
+ "step": 1320
+ },
+ {
+ "epoch": 0.6245862884160757,
+ "grad_norm": 3.324795961380005,
+ "learning_rate": 4.884859672277978e-06,
+ "loss": 0.6019,
+ "step": 1321
+ },
+ {
+ "epoch": 0.6250591016548463,
+ "grad_norm": 3.521327257156372,
+ "learning_rate": 4.884672459198493e-06,
+ "loss": 0.6104,
+ "step": 1322
+ },
+ {
+ "epoch": 0.625531914893617,
+ "grad_norm": 2.7728071212768555,
+ "learning_rate": 4.884485097635909e-06,
+ "loss": 0.6714,
+ "step": 1323
+ },
+ {
+ "epoch": 0.6260047281323877,
+ "grad_norm": 3.0738155841827393,
+ "learning_rate": 4.884297587601895e-06,
+ "loss": 0.604,
+ "step": 1324
+ },
+ {
+ "epoch": 0.6264775413711584,
+ "grad_norm": 2.719240427017212,
+ "learning_rate": 4.884109929108124e-06,
+ "loss": 0.6795,
+ "step": 1325
+ },
+ {
+ "epoch": 0.6269503546099291,
+ "grad_norm": 2.4108200073242188,
+ "learning_rate": 4.883922122166282e-06,
+ "loss": 0.5846,
+ "step": 1326
+ },
+ {
+ "epoch": 0.6274231678486998,
+ "grad_norm": 2.393899917602539,
+ "learning_rate": 4.883734166788063e-06,
+ "loss": 0.6188,
+ "step": 1327
+ },
+ {
+ "epoch": 0.6278959810874705,
+ "grad_norm": 4.555255889892578,
+ "learning_rate": 4.883546062985169e-06,
+ "loss": 0.5962,
+ "step": 1328
+ },
+ {
+ "epoch": 0.6283687943262412,
+ "grad_norm": 2.571075439453125,
+ "learning_rate": 4.883357810769315e-06,
+ "loss": 0.6165,
+ "step": 1329
+ },
+ {
+ "epoch": 0.6288416075650118,
+ "grad_norm": 2.553115129470825,
+ "learning_rate": 4.8831694101522185e-06,
+ "loss": 0.6787,
+ "step": 1330
+ },
+ {
+ "epoch": 0.6293144208037825,
+ "grad_norm": 3.2564642429351807,
+ "learning_rate": 4.882980861145614e-06,
+ "loss": 0.659,
+ "step": 1331
+ },
+ {
+ "epoch": 0.6297872340425532,
+ "grad_norm": 2.535216808319092,
+ "learning_rate": 4.882792163761241e-06,
+ "loss": 0.6176,
+ "step": 1332
+ },
+ {
+ "epoch": 0.6302600472813239,
+ "grad_norm": 3.097921848297119,
+ "learning_rate": 4.882603318010847e-06,
+ "loss": 0.6822,
+ "step": 1333
+ },
+ {
+ "epoch": 0.6307328605200946,
+ "grad_norm": 2.8135175704956055,
+ "learning_rate": 4.882414323906192e-06,
+ "loss": 0.6782,
+ "step": 1334
+ },
+ {
+ "epoch": 0.6312056737588653,
+ "grad_norm": 2.724634885787964,
+ "learning_rate": 4.882225181459044e-06,
+ "loss": 0.6545,
+ "step": 1335
+ },
+ {
+ "epoch": 0.631678486997636,
+ "grad_norm": 2.9585227966308594,
+ "learning_rate": 4.882035890681179e-06,
+ "loss": 0.6218,
+ "step": 1336
+ },
+ {
+ "epoch": 0.6321513002364066,
+ "grad_norm": 2.6952011585235596,
+ "learning_rate": 4.881846451584385e-06,
+ "loss": 0.6,
+ "step": 1337
+ },
+ {
+ "epoch": 0.6326241134751773,
+ "grad_norm": 3.1400704383850098,
+ "learning_rate": 4.881656864180455e-06,
+ "loss": 0.6687,
+ "step": 1338
+ },
+ {
+ "epoch": 0.633096926713948,
+ "grad_norm": 2.8382487297058105,
+ "learning_rate": 4.881467128481197e-06,
+ "loss": 0.574,
+ "step": 1339
+ },
+ {
+ "epoch": 0.6335697399527187,
+ "grad_norm": 2.8520095348358154,
+ "learning_rate": 4.881277244498422e-06,
+ "loss": 0.6582,
+ "step": 1340
+ },
+ {
+ "epoch": 0.6340425531914894,
+ "grad_norm": 2.703498363494873,
+ "learning_rate": 4.881087212243956e-06,
+ "loss": 0.7224,
+ "step": 1341
+ },
+ {
+ "epoch": 0.6345153664302601,
+ "grad_norm": 3.697205066680908,
+ "learning_rate": 4.880897031729629e-06,
+ "loss": 0.6582,
+ "step": 1342
+ },
+ {
+ "epoch": 0.6349881796690308,
+ "grad_norm": 2.7625808715820312,
+ "learning_rate": 4.880706702967284e-06,
+ "loss": 0.574,
+ "step": 1343
+ },
+ {
+ "epoch": 0.6354609929078014,
+ "grad_norm": 2.949984073638916,
+ "learning_rate": 4.880516225968771e-06,
+ "loss": 0.66,
+ "step": 1344
+ },
+ {
+ "epoch": 0.6359338061465721,
+ "grad_norm": 2.548269748687744,
+ "learning_rate": 4.8803256007459525e-06,
+ "loss": 0.642,
+ "step": 1345
+ },
+ {
+ "epoch": 0.6364066193853428,
+ "grad_norm": 2.5102174282073975,
+ "learning_rate": 4.8801348273106945e-06,
+ "loss": 0.6238,
+ "step": 1346
+ },
+ {
+ "epoch": 0.6368794326241135,
+ "grad_norm": 2.9847946166992188,
+ "learning_rate": 4.8799439056748786e-06,
+ "loss": 0.5416,
+ "step": 1347
+ },
+ {
+ "epoch": 0.6373522458628842,
+ "grad_norm": 2.8711049556732178,
+ "learning_rate": 4.879752835850391e-06,
+ "loss": 0.6427,
+ "step": 1348
+ },
+ {
+ "epoch": 0.6378250591016549,
+ "grad_norm": 2.7901716232299805,
+ "learning_rate": 4.879561617849129e-06,
+ "loss": 0.6026,
+ "step": 1349
+ },
+ {
+ "epoch": 0.6382978723404256,
+ "grad_norm": 2.659778356552124,
+ "learning_rate": 4.879370251682999e-06,
+ "loss": 0.6623,
+ "step": 1350
+ },
+ {
+ "epoch": 0.6387706855791963,
+ "grad_norm": 3.224386692047119,
+ "learning_rate": 4.879178737363917e-06,
+ "loss": 0.6485,
+ "step": 1351
+ },
+ {
+ "epoch": 0.6392434988179669,
+ "grad_norm": 2.6385605335235596,
+ "learning_rate": 4.8789870749038076e-06,
+ "loss": 0.5866,
+ "step": 1352
+ },
+ {
+ "epoch": 0.6397163120567376,
+ "grad_norm": 2.807713270187378,
+ "learning_rate": 4.8787952643146045e-06,
+ "loss": 0.6537,
+ "step": 1353
+ },
+ {
+ "epoch": 0.6401891252955083,
+ "grad_norm": 2.5689280033111572,
+ "learning_rate": 4.878603305608251e-06,
+ "loss": 0.6216,
+ "step": 1354
+ },
+ {
+ "epoch": 0.640661938534279,
+ "grad_norm": 2.7347843647003174,
+ "learning_rate": 4.8784111987967e-06,
+ "loss": 0.6318,
+ "step": 1355
+ },
+ {
+ "epoch": 0.6411347517730497,
+ "grad_norm": 2.5210378170013428,
+ "learning_rate": 4.878218943891911e-06,
+ "loss": 0.5472,
+ "step": 1356
+ },
+ {
+ "epoch": 0.6416075650118204,
+ "grad_norm": 2.866785764694214,
+ "learning_rate": 4.878026540905858e-06,
+ "loss": 0.7108,
+ "step": 1357
+ },
+ {
+ "epoch": 0.642080378250591,
+ "grad_norm": 2.923314332962036,
+ "learning_rate": 4.877833989850519e-06,
+ "loss": 0.5557,
+ "step": 1358
+ },
+ {
+ "epoch": 0.6425531914893617,
+ "grad_norm": 2.925463914871216,
+ "learning_rate": 4.8776412907378845e-06,
+ "loss": 0.6382,
+ "step": 1359
+ },
+ {
+ "epoch": 0.6430260047281324,
+ "grad_norm": 2.909644365310669,
+ "learning_rate": 4.877448443579952e-06,
+ "loss": 0.5603,
+ "step": 1360
+ },
+ {
+ "epoch": 0.6434988179669031,
+ "grad_norm": 3.501148223876953,
+ "learning_rate": 4.8772554483887306e-06,
+ "loss": 0.6722,
+ "step": 1361
+ },
+ {
+ "epoch": 0.6439716312056738,
+ "grad_norm": 2.823765516281128,
+ "learning_rate": 4.877062305176235e-06,
+ "loss": 0.6408,
+ "step": 1362
+ },
+ {
+ "epoch": 0.6444444444444445,
+ "grad_norm": 2.9807584285736084,
+ "learning_rate": 4.8768690139544935e-06,
+ "loss": 0.5984,
+ "step": 1363
+ },
+ {
+ "epoch": 0.6449172576832152,
+ "grad_norm": 2.8411378860473633,
+ "learning_rate": 4.8766755747355405e-06,
+ "loss": 0.6231,
+ "step": 1364
+ },
+ {
+ "epoch": 0.6453900709219859,
+ "grad_norm": 3.158952236175537,
+ "learning_rate": 4.8764819875314215e-06,
+ "loss": 0.6441,
+ "step": 1365
+ },
+ {
+ "epoch": 0.6458628841607565,
+ "grad_norm": 2.9614369869232178,
+ "learning_rate": 4.876288252354189e-06,
+ "loss": 0.6308,
+ "step": 1366
+ },
+ {
+ "epoch": 0.6463356973995272,
+ "grad_norm": 3.073805570602417,
+ "learning_rate": 4.876094369215907e-06,
+ "loss": 0.6046,
+ "step": 1367
+ },
+ {
+ "epoch": 0.6468085106382979,
+ "grad_norm": 2.719189405441284,
+ "learning_rate": 4.875900338128648e-06,
+ "loss": 0.6082,
+ "step": 1368
+ },
+ {
+ "epoch": 0.6472813238770686,
+ "grad_norm": 2.676726818084717,
+ "learning_rate": 4.8757061591044914e-06,
+ "loss": 0.6344,
+ "step": 1369
+ },
+ {
+ "epoch": 0.6477541371158393,
+ "grad_norm": 2.955256938934326,
+ "learning_rate": 4.87551183215553e-06,
+ "loss": 0.6506,
+ "step": 1370
+ },
+ {
+ "epoch": 0.64822695035461,
+ "grad_norm": 2.5672218799591064,
+ "learning_rate": 4.875317357293864e-06,
+ "loss": 0.5284,
+ "step": 1371
+ },
+ {
+ "epoch": 0.6486997635933807,
+ "grad_norm": 2.5860238075256348,
+ "learning_rate": 4.875122734531602e-06,
+ "loss": 0.667,
+ "step": 1372
+ },
+ {
+ "epoch": 0.6491725768321513,
+ "grad_norm": 3.1037003993988037,
+ "learning_rate": 4.8749279638808605e-06,
+ "loss": 0.6902,
+ "step": 1373
+ },
+ {
+ "epoch": 0.649645390070922,
+ "grad_norm": 2.7715282440185547,
+ "learning_rate": 4.874733045353769e-06,
+ "loss": 0.6291,
+ "step": 1374
+ },
+ {
+ "epoch": 0.6501182033096927,
+ "grad_norm": 2.527071475982666,
+ "learning_rate": 4.874537978962463e-06,
+ "loss": 0.5565,
+ "step": 1375
+ },
+ {
+ "epoch": 0.6505910165484634,
+ "grad_norm": 2.722092628479004,
+ "learning_rate": 4.874342764719091e-06,
+ "loss": 0.5724,
+ "step": 1376
+ },
+ {
+ "epoch": 0.6510638297872341,
+ "grad_norm": 2.6342411041259766,
+ "learning_rate": 4.874147402635805e-06,
+ "loss": 0.6308,
+ "step": 1377
+ },
+ {
+ "epoch": 0.6515366430260048,
+ "grad_norm": 2.3850719928741455,
+ "learning_rate": 4.8739518927247695e-06,
+ "loss": 0.5692,
+ "step": 1378
+ },
+ {
+ "epoch": 0.6520094562647755,
+ "grad_norm": 2.9787259101867676,
+ "learning_rate": 4.873756234998161e-06,
+ "loss": 0.6953,
+ "step": 1379
+ },
+ {
+ "epoch": 0.6524822695035462,
+ "grad_norm": 2.634141683578491,
+ "learning_rate": 4.873560429468159e-06,
+ "loss": 0.6077,
+ "step": 1380
+ },
+ {
+ "epoch": 0.6529550827423168,
+ "grad_norm": 2.803046941757202,
+ "learning_rate": 4.873364476146958e-06,
+ "loss": 0.6657,
+ "step": 1381
+ },
+ {
+ "epoch": 0.6534278959810875,
+ "grad_norm": 2.762827157974243,
+ "learning_rate": 4.8731683750467574e-06,
+ "loss": 0.6061,
+ "step": 1382
+ },
+ {
+ "epoch": 0.6539007092198581,
+ "grad_norm": 2.6654391288757324,
+ "learning_rate": 4.872972126179768e-06,
+ "loss": 0.6387,
+ "step": 1383
+ },
+ {
+ "epoch": 0.6543735224586288,
+ "grad_norm": 2.4363625049591064,
+ "learning_rate": 4.872775729558209e-06,
+ "loss": 0.5623,
+ "step": 1384
+ },
+ {
+ "epoch": 0.6548463356973995,
+ "grad_norm": 2.528959035873413,
+ "learning_rate": 4.87257918519431e-06,
+ "loss": 0.5609,
+ "step": 1385
+ },
+ {
+ "epoch": 0.6553191489361702,
+ "grad_norm": 2.718383312225342,
+ "learning_rate": 4.872382493100309e-06,
+ "loss": 0.5575,
+ "step": 1386
+ },
+ {
+ "epoch": 0.6557919621749408,
+ "grad_norm": 2.660841226577759,
+ "learning_rate": 4.872185653288453e-06,
+ "loss": 0.6106,
+ "step": 1387
+ },
+ {
+ "epoch": 0.6562647754137115,
+ "grad_norm": 2.508753538131714,
+ "learning_rate": 4.871988665770997e-06,
+ "loss": 0.5705,
+ "step": 1388
+ },
+ {
+ "epoch": 0.6567375886524822,
+ "grad_norm": 2.5134334564208984,
+ "learning_rate": 4.871791530560208e-06,
+ "loss": 0.5592,
+ "step": 1389
+ },
+ {
+ "epoch": 0.6572104018912529,
+ "grad_norm": 2.7475597858428955,
+ "learning_rate": 4.871594247668361e-06,
+ "loss": 0.6277,
+ "step": 1390
+ },
+ {
+ "epoch": 0.6576832151300236,
+ "grad_norm": 2.793616533279419,
+ "learning_rate": 4.871396817107739e-06,
+ "loss": 0.595,
+ "step": 1391
+ },
+ {
+ "epoch": 0.6581560283687943,
+ "grad_norm": 2.8285086154937744,
+ "learning_rate": 4.871199238890635e-06,
+ "loss": 0.6094,
+ "step": 1392
+ },
+ {
+ "epoch": 0.658628841607565,
+ "grad_norm": 2.74124813079834,
+ "learning_rate": 4.871001513029352e-06,
+ "loss": 0.6296,
+ "step": 1393
+ },
+ {
+ "epoch": 0.6591016548463356,
+ "grad_norm": 2.761237621307373,
+ "learning_rate": 4.870803639536202e-06,
+ "loss": 0.5702,
+ "step": 1394
+ },
+ {
+ "epoch": 0.6595744680851063,
+ "grad_norm": 2.761038064956665,
+ "learning_rate": 4.870605618423504e-06,
+ "loss": 0.6195,
+ "step": 1395
+ },
+ {
+ "epoch": 0.660047281323877,
+ "grad_norm": 2.8812482357025146,
+ "learning_rate": 4.870407449703589e-06,
+ "loss": 0.616,
+ "step": 1396
+ },
+ {
+ "epoch": 0.6605200945626477,
+ "grad_norm": 2.9966578483581543,
+ "learning_rate": 4.870209133388797e-06,
+ "loss": 0.6547,
+ "step": 1397
+ },
+ {
+ "epoch": 0.6609929078014184,
+ "grad_norm": 2.7969017028808594,
+ "learning_rate": 4.870010669491474e-06,
+ "loss": 0.5762,
+ "step": 1398
+ },
+ {
+ "epoch": 0.6614657210401891,
+ "grad_norm": 2.557783842086792,
+ "learning_rate": 4.86981205802398e-06,
+ "loss": 0.6184,
+ "step": 1399
+ },
+ {
+ "epoch": 0.6619385342789598,
+ "grad_norm": 2.5393927097320557,
+ "learning_rate": 4.86961329899868e-06,
+ "loss": 0.5953,
+ "step": 1400
+ },
+ {
+ "epoch": 0.6624113475177305,
+ "grad_norm": 2.7745981216430664,
+ "learning_rate": 4.86941439242795e-06,
+ "loss": 0.5967,
+ "step": 1401
+ },
+ {
+ "epoch": 0.6628841607565011,
+ "grad_norm": 2.650381326675415,
+ "learning_rate": 4.869215338324176e-06,
+ "loss": 0.5667,
+ "step": 1402
+ },
+ {
+ "epoch": 0.6633569739952718,
+ "grad_norm": 2.583169937133789,
+ "learning_rate": 4.869016136699751e-06,
+ "loss": 0.549,
+ "step": 1403
+ },
+ {
+ "epoch": 0.6638297872340425,
+ "grad_norm": 2.984978437423706,
+ "learning_rate": 4.868816787567079e-06,
+ "loss": 0.5931,
+ "step": 1404
+ },
+ {
+ "epoch": 0.6643026004728132,
+ "grad_norm": 3.1947181224823,
+ "learning_rate": 4.868617290938573e-06,
+ "loss": 0.5473,
+ "step": 1405
+ },
+ {
+ "epoch": 0.6647754137115839,
+ "grad_norm": 2.562927007675171,
+ "learning_rate": 4.868417646826654e-06,
+ "loss": 0.6878,
+ "step": 1406
+ },
+ {
+ "epoch": 0.6652482269503546,
+ "grad_norm": 2.8741261959075928,
+ "learning_rate": 4.868217855243754e-06,
+ "loss": 0.6312,
+ "step": 1407
+ },
+ {
+ "epoch": 0.6657210401891253,
+ "grad_norm": 2.9834797382354736,
+ "learning_rate": 4.868017916202312e-06,
+ "loss": 0.5624,
+ "step": 1408
+ },
+ {
+ "epoch": 0.6661938534278959,
+ "grad_norm": 2.6935982704162598,
+ "learning_rate": 4.8678178297147785e-06,
+ "loss": 0.5857,
+ "step": 1409
+ },
+ {
+ "epoch": 0.6666666666666666,
+ "grad_norm": 2.8200576305389404,
+ "learning_rate": 4.86761759579361e-06,
+ "loss": 0.6153,
+ "step": 1410
+ },
+ {
+ "epoch": 0.6671394799054373,
+ "grad_norm": 2.831425189971924,
+ "learning_rate": 4.867417214451276e-06,
+ "loss": 0.6495,
+ "step": 1411
+ },
+ {
+ "epoch": 0.667612293144208,
+ "grad_norm": 2.733565092086792,
+ "learning_rate": 4.867216685700253e-06,
+ "loss": 0.6036,
+ "step": 1412
+ },
+ {
+ "epoch": 0.6680851063829787,
+ "grad_norm": 3.0609400272369385,
+ "learning_rate": 4.867016009553027e-06,
+ "loss": 0.6773,
+ "step": 1413
+ },
+ {
+ "epoch": 0.6685579196217494,
+ "grad_norm": 2.665452241897583,
+ "learning_rate": 4.866815186022093e-06,
+ "loss": 0.6256,
+ "step": 1414
+ },
+ {
+ "epoch": 0.6690307328605201,
+ "grad_norm": 2.9480721950531006,
+ "learning_rate": 4.866614215119956e-06,
+ "loss": 0.535,
+ "step": 1415
+ },
+ {
+ "epoch": 0.6695035460992907,
+ "grad_norm": 2.5514180660247803,
+ "learning_rate": 4.866413096859128e-06,
+ "loss": 0.6588,
+ "step": 1416
+ },
+ {
+ "epoch": 0.6699763593380614,
+ "grad_norm": 3.3442373275756836,
+ "learning_rate": 4.866211831252134e-06,
+ "loss": 0.5754,
+ "step": 1417
+ },
+ {
+ "epoch": 0.6704491725768321,
+ "grad_norm": 2.521467685699463,
+ "learning_rate": 4.866010418311504e-06,
+ "loss": 0.5546,
+ "step": 1418
+ },
+ {
+ "epoch": 0.6709219858156028,
+ "grad_norm": 2.930706262588501,
+ "learning_rate": 4.865808858049781e-06,
+ "loss": 0.589,
+ "step": 1419
+ },
+ {
+ "epoch": 0.6713947990543735,
+ "grad_norm": 2.6298375129699707,
+ "learning_rate": 4.865607150479513e-06,
+ "loss": 0.5915,
+ "step": 1420
+ },
+ {
+ "epoch": 0.6718676122931442,
+ "grad_norm": 2.9554293155670166,
+ "learning_rate": 4.8654052956132615e-06,
+ "loss": 0.6654,
+ "step": 1421
+ },
+ {
+ "epoch": 0.6723404255319149,
+ "grad_norm": 3.2706902027130127,
+ "learning_rate": 4.865203293463593e-06,
+ "loss": 0.7115,
+ "step": 1422
+ },
+ {
+ "epoch": 0.6728132387706856,
+ "grad_norm": 3.041539430618286,
+ "learning_rate": 4.865001144043088e-06,
+ "loss": 0.5818,
+ "step": 1423
+ },
+ {
+ "epoch": 0.6732860520094562,
+ "grad_norm": 3.1314544677734375,
+ "learning_rate": 4.864798847364331e-06,
+ "loss": 0.5822,
+ "step": 1424
+ },
+ {
+ "epoch": 0.6737588652482269,
+ "grad_norm": 2.5301461219787598,
+ "learning_rate": 4.86459640343992e-06,
+ "loss": 0.5525,
+ "step": 1425
+ },
+ {
+ "epoch": 0.6742316784869976,
+ "grad_norm": 2.809295892715454,
+ "learning_rate": 4.864393812282458e-06,
+ "loss": 0.6768,
+ "step": 1426
+ },
+ {
+ "epoch": 0.6747044917257683,
+ "grad_norm": 2.794664144515991,
+ "learning_rate": 4.864191073904562e-06,
+ "loss": 0.5793,
+ "step": 1427
+ },
+ {
+ "epoch": 0.675177304964539,
+ "grad_norm": 2.7771105766296387,
+ "learning_rate": 4.863988188318854e-06,
+ "loss": 0.6453,
+ "step": 1428
+ },
+ {
+ "epoch": 0.6756501182033097,
+ "grad_norm": 2.6431946754455566,
+ "learning_rate": 4.863785155537967e-06,
+ "loss": 0.5877,
+ "step": 1429
+ },
+ {
+ "epoch": 0.6761229314420804,
+ "grad_norm": 2.951353073120117,
+ "learning_rate": 4.863581975574544e-06,
+ "loss": 0.6793,
+ "step": 1430
+ },
+ {
+ "epoch": 0.676595744680851,
+ "grad_norm": 3.1336071491241455,
+ "learning_rate": 4.863378648441235e-06,
+ "loss": 0.6695,
+ "step": 1431
+ },
+ {
+ "epoch": 0.6770685579196217,
+ "grad_norm": 2.735982656478882,
+ "learning_rate": 4.8631751741507e-06,
+ "loss": 0.5239,
+ "step": 1432
+ },
+ {
+ "epoch": 0.6775413711583924,
+ "grad_norm": 2.7085206508636475,
+ "learning_rate": 4.862971552715611e-06,
+ "loss": 0.6837,
+ "step": 1433
+ },
+ {
+ "epoch": 0.6780141843971631,
+ "grad_norm": 3.136528730392456,
+ "learning_rate": 4.8627677841486436e-06,
+ "loss": 0.683,
+ "step": 1434
+ },
+ {
+ "epoch": 0.6784869976359338,
+ "grad_norm": 2.7879369258880615,
+ "learning_rate": 4.862563868462486e-06,
+ "loss": 0.608,
+ "step": 1435
+ },
+ {
+ "epoch": 0.6789598108747045,
+ "grad_norm": 2.7937729358673096,
+ "learning_rate": 4.862359805669837e-06,
+ "loss": 0.6131,
+ "step": 1436
+ },
+ {
+ "epoch": 0.6794326241134752,
+ "grad_norm": 2.5988364219665527,
+ "learning_rate": 4.862155595783401e-06,
+ "loss": 0.6303,
+ "step": 1437
+ },
+ {
+ "epoch": 0.6799054373522458,
+ "grad_norm": 3.251070499420166,
+ "learning_rate": 4.861951238815894e-06,
+ "loss": 0.7246,
+ "step": 1438
+ },
+ {
+ "epoch": 0.6803782505910165,
+ "grad_norm": 2.646759271621704,
+ "learning_rate": 4.861746734780039e-06,
+ "loss": 0.6313,
+ "step": 1439
+ },
+ {
+ "epoch": 0.6808510638297872,
+ "grad_norm": 2.773866891860962,
+ "learning_rate": 4.861542083688573e-06,
+ "loss": 0.6463,
+ "step": 1440
+ },
+ {
+ "epoch": 0.6813238770685579,
+ "grad_norm": 2.759965658187866,
+ "learning_rate": 4.861337285554235e-06,
+ "loss": 0.5428,
+ "step": 1441
+ },
+ {
+ "epoch": 0.6817966903073286,
+ "grad_norm": 3.3250818252563477,
+ "learning_rate": 4.861132340389779e-06,
+ "loss": 0.6522,
+ "step": 1442
+ },
+ {
+ "epoch": 0.6822695035460993,
+ "grad_norm": 2.661797523498535,
+ "learning_rate": 4.860927248207965e-06,
+ "loss": 0.5871,
+ "step": 1443
+ },
+ {
+ "epoch": 0.68274231678487,
+ "grad_norm": 2.706289052963257,
+ "learning_rate": 4.860722009021563e-06,
+ "loss": 0.6651,
+ "step": 1444
+ },
+ {
+ "epoch": 0.6832151300236406,
+ "grad_norm": 2.8459298610687256,
+ "learning_rate": 4.860516622843354e-06,
+ "loss": 0.5827,
+ "step": 1445
+ },
+ {
+ "epoch": 0.6836879432624113,
+ "grad_norm": 3.1041831970214844,
+ "learning_rate": 4.860311089686125e-06,
+ "loss": 0.6727,
+ "step": 1446
+ },
+ {
+ "epoch": 0.684160756501182,
+ "grad_norm": 2.9382801055908203,
+ "learning_rate": 4.8601054095626746e-06,
+ "loss": 0.6002,
+ "step": 1447
+ },
+ {
+ "epoch": 0.6846335697399527,
+ "grad_norm": 2.782475471496582,
+ "learning_rate": 4.859899582485808e-06,
+ "loss": 0.6951,
+ "step": 1448
+ },
+ {
+ "epoch": 0.6851063829787234,
+ "grad_norm": 3.313894510269165,
+ "learning_rate": 4.859693608468343e-06,
+ "loss": 0.6363,
+ "step": 1449
+ },
+ {
+ "epoch": 0.6855791962174941,
+ "grad_norm": 3.1639695167541504,
+ "learning_rate": 4.8594874875231045e-06,
+ "loss": 0.7002,
+ "step": 1450
+ },
+ {
+ "epoch": 0.6860520094562648,
+ "grad_norm": 2.6762218475341797,
+ "learning_rate": 4.859281219662926e-06,
+ "loss": 0.6246,
+ "step": 1451
+ },
+ {
+ "epoch": 0.6865248226950355,
+ "grad_norm": 2.8368663787841797,
+ "learning_rate": 4.85907480490065e-06,
+ "loss": 0.5906,
+ "step": 1452
+ },
+ {
+ "epoch": 0.6869976359338061,
+ "grad_norm": 2.887373208999634,
+ "learning_rate": 4.858868243249131e-06,
+ "loss": 0.5931,
+ "step": 1453
+ },
+ {
+ "epoch": 0.6874704491725768,
+ "grad_norm": 2.8115322589874268,
+ "learning_rate": 4.858661534721229e-06,
+ "loss": 0.6337,
+ "step": 1454
+ },
+ {
+ "epoch": 0.6879432624113475,
+ "grad_norm": 2.8470499515533447,
+ "learning_rate": 4.8584546793298174e-06,
+ "loss": 0.632,
+ "step": 1455
+ },
+ {
+ "epoch": 0.6884160756501182,
+ "grad_norm": 2.8229613304138184,
+ "learning_rate": 4.8582476770877725e-06,
+ "loss": 0.6494,
+ "step": 1456
+ },
+ {
+ "epoch": 0.6888888888888889,
+ "grad_norm": 2.4235479831695557,
+ "learning_rate": 4.858040528007987e-06,
+ "loss": 0.5709,
+ "step": 1457
+ },
+ {
+ "epoch": 0.6893617021276596,
+ "grad_norm": 2.9348199367523193,
+ "learning_rate": 4.857833232103356e-06,
+ "loss": 0.5404,
+ "step": 1458
+ },
+ {
+ "epoch": 0.6898345153664303,
+ "grad_norm": 2.8274219036102295,
+ "learning_rate": 4.857625789386789e-06,
+ "loss": 0.701,
+ "step": 1459
+ },
+ {
+ "epoch": 0.6903073286052009,
+ "grad_norm": 3.136929988861084,
+ "learning_rate": 4.857418199871203e-06,
+ "loss": 0.6971,
+ "step": 1460
+ },
+ {
+ "epoch": 0.6907801418439716,
+ "grad_norm": 2.8987185955047607,
+ "learning_rate": 4.8572104635695214e-06,
+ "loss": 0.6613,
+ "step": 1461
+ },
+ {
+ "epoch": 0.6912529550827423,
+ "grad_norm": 2.5073442459106445,
+ "learning_rate": 4.857002580494681e-06,
+ "loss": 0.6032,
+ "step": 1462
+ },
+ {
+ "epoch": 0.691725768321513,
+ "grad_norm": 2.7019522190093994,
+ "learning_rate": 4.856794550659625e-06,
+ "loss": 0.567,
+ "step": 1463
+ },
+ {
+ "epoch": 0.6921985815602837,
+ "grad_norm": 2.4795594215393066,
+ "learning_rate": 4.8565863740773054e-06,
+ "loss": 0.5777,
+ "step": 1464
+ },
+ {
+ "epoch": 0.6926713947990544,
+ "grad_norm": 3.032506227493286,
+ "learning_rate": 4.856378050760687e-06,
+ "loss": 0.607,
+ "step": 1465
+ },
+ {
+ "epoch": 0.6931442080378251,
+ "grad_norm": 3.052091121673584,
+ "learning_rate": 4.85616958072274e-06,
+ "loss": 0.591,
+ "step": 1466
+ },
+ {
+ "epoch": 0.6936170212765957,
+ "grad_norm": 2.704831838607788,
+ "learning_rate": 4.855960963976443e-06,
+ "loss": 0.6528,
+ "step": 1467
+ },
+ {
+ "epoch": 0.6940898345153664,
+ "grad_norm": 2.680995225906372,
+ "learning_rate": 4.855752200534788e-06,
+ "loss": 0.6294,
+ "step": 1468
+ },
+ {
+ "epoch": 0.6945626477541371,
+ "grad_norm": 2.3948659896850586,
+ "learning_rate": 4.855543290410774e-06,
+ "loss": 0.6091,
+ "step": 1469
+ },
+ {
+ "epoch": 0.6950354609929078,
+ "grad_norm": 2.6407411098480225,
+ "learning_rate": 4.855334233617407e-06,
+ "loss": 0.5572,
+ "step": 1470
+ },
+ {
+ "epoch": 0.6955082742316785,
+ "grad_norm": 2.5526835918426514,
+ "learning_rate": 4.8551250301677064e-06,
+ "loss": 0.5432,
+ "step": 1471
+ },
+ {
+ "epoch": 0.6959810874704492,
+ "grad_norm": 3.1237430572509766,
+ "learning_rate": 4.8549156800746965e-06,
+ "loss": 0.5944,
+ "step": 1472
+ },
+ {
+ "epoch": 0.6964539007092199,
+ "grad_norm": 2.8112540245056152,
+ "learning_rate": 4.854706183351412e-06,
+ "loss": 0.604,
+ "step": 1473
+ },
+ {
+ "epoch": 0.6969267139479906,
+ "grad_norm": 2.664644479751587,
+ "learning_rate": 4.8544965400109e-06,
+ "loss": 0.5647,
+ "step": 1474
+ },
+ {
+ "epoch": 0.6973995271867612,
+ "grad_norm": 3.26310133934021,
+ "learning_rate": 4.854286750066212e-06,
+ "loss": 0.6999,
+ "step": 1475
+ },
+ {
+ "epoch": 0.6978723404255319,
+ "grad_norm": 2.9717442989349365,
+ "learning_rate": 4.8540768135304115e-06,
+ "loss": 0.6655,
+ "step": 1476
+ },
+ {
+ "epoch": 0.6983451536643026,
+ "grad_norm": 2.5302982330322266,
+ "learning_rate": 4.85386673041657e-06,
+ "loss": 0.6384,
+ "step": 1477
+ },
+ {
+ "epoch": 0.6988179669030733,
+ "grad_norm": 2.864877700805664,
+ "learning_rate": 4.853656500737769e-06,
+ "loss": 0.6834,
+ "step": 1478
+ },
+ {
+ "epoch": 0.699290780141844,
+ "grad_norm": 2.5522031784057617,
+ "learning_rate": 4.853446124507098e-06,
+ "loss": 0.5929,
+ "step": 1479
+ },
+ {
+ "epoch": 0.6997635933806147,
+ "grad_norm": 3.096477746963501,
+ "learning_rate": 4.853235601737656e-06,
+ "loss": 0.5737,
+ "step": 1480
+ },
+ {
+ "epoch": 0.7002364066193854,
+ "grad_norm": 2.884779214859009,
+ "learning_rate": 4.853024932442552e-06,
+ "loss": 0.6362,
+ "step": 1481
+ },
+ {
+ "epoch": 0.700709219858156,
+ "grad_norm": 3.368558406829834,
+ "learning_rate": 4.852814116634903e-06,
+ "loss": 0.6721,
+ "step": 1482
+ },
+ {
+ "epoch": 0.7011820330969267,
+ "grad_norm": 2.742414951324463,
+ "learning_rate": 4.852603154327837e-06,
+ "loss": 0.6212,
+ "step": 1483
+ },
+ {
+ "epoch": 0.7016548463356974,
+ "grad_norm": 2.53454852104187,
+ "learning_rate": 4.8523920455344864e-06,
+ "loss": 0.6675,
+ "step": 1484
+ },
+ {
+ "epoch": 0.7021276595744681,
+ "grad_norm": 2.9354238510131836,
+ "learning_rate": 4.852180790267999e-06,
+ "loss": 0.6692,
+ "step": 1485
+ },
+ {
+ "epoch": 0.7026004728132388,
+ "grad_norm": 2.585070848464966,
+ "learning_rate": 4.8519693885415274e-06,
+ "loss": 0.6215,
+ "step": 1486
+ },
+ {
+ "epoch": 0.7030732860520095,
+ "grad_norm": 2.9047999382019043,
+ "learning_rate": 4.851757840368235e-06,
+ "loss": 0.6231,
+ "step": 1487
+ },
+ {
+ "epoch": 0.7035460992907802,
+ "grad_norm": 3.0930933952331543,
+ "learning_rate": 4.851546145761295e-06,
+ "loss": 0.7267,
+ "step": 1488
+ },
+ {
+ "epoch": 0.7040189125295508,
+ "grad_norm": 3.0224719047546387,
+ "learning_rate": 4.8513343047338875e-06,
+ "loss": 0.6293,
+ "step": 1489
+ },
+ {
+ "epoch": 0.7044917257683215,
+ "grad_norm": 2.5758471488952637,
+ "learning_rate": 4.851122317299203e-06,
+ "loss": 0.5855,
+ "step": 1490
+ },
+ {
+ "epoch": 0.7049645390070922,
+ "grad_norm": 2.579272508621216,
+ "learning_rate": 4.850910183470441e-06,
+ "loss": 0.582,
+ "step": 1491
+ },
+ {
+ "epoch": 0.7054373522458629,
+ "grad_norm": 2.8148300647735596,
+ "learning_rate": 4.85069790326081e-06,
+ "loss": 0.6396,
+ "step": 1492
+ },
+ {
+ "epoch": 0.7059101654846336,
+ "grad_norm": 2.6380527019500732,
+ "learning_rate": 4.850485476683528e-06,
+ "loss": 0.6114,
+ "step": 1493
+ },
+ {
+ "epoch": 0.7063829787234043,
+ "grad_norm": 2.7736263275146484,
+ "learning_rate": 4.850272903751823e-06,
+ "loss": 0.6683,
+ "step": 1494
+ },
+ {
+ "epoch": 0.706855791962175,
+ "grad_norm": 3.1958179473876953,
+ "learning_rate": 4.8500601844789285e-06,
+ "loss": 0.6265,
+ "step": 1495
+ },
+ {
+ "epoch": 0.7073286052009456,
+ "grad_norm": 3.783212423324585,
+ "learning_rate": 4.8498473188780916e-06,
+ "loss": 0.6078,
+ "step": 1496
+ },
+ {
+ "epoch": 0.7078014184397163,
+ "grad_norm": 2.6656646728515625,
+ "learning_rate": 4.849634306962566e-06,
+ "loss": 0.5756,
+ "step": 1497
+ },
+ {
+ "epoch": 0.708274231678487,
+ "grad_norm": 2.757141590118408,
+ "learning_rate": 4.849421148745615e-06,
+ "loss": 0.5596,
+ "step": 1498
+ },
+ {
+ "epoch": 0.7087470449172577,
+ "grad_norm": 3.0391886234283447,
+ "learning_rate": 4.849207844240511e-06,
+ "loss": 0.5293,
+ "step": 1499
+ },
+ {
+ "epoch": 0.7092198581560284,
+ "grad_norm": 2.981912851333618,
+ "learning_rate": 4.848994393460535e-06,
+ "loss": 0.598,
+ "step": 1500
+ },
+ {
+ "epoch": 0.7096926713947991,
+ "grad_norm": 2.5470798015594482,
+ "learning_rate": 4.848780796418978e-06,
+ "loss": 0.6266,
+ "step": 1501
+ },
+ {
+ "epoch": 0.7101654846335698,
+ "grad_norm": 2.8394415378570557,
+ "learning_rate": 4.8485670531291415e-06,
+ "loss": 0.6844,
+ "step": 1502
+ },
+ {
+ "epoch": 0.7106382978723405,
+ "grad_norm": 3.2023508548736572,
+ "learning_rate": 4.848353163604331e-06,
+ "loss": 0.6134,
+ "step": 1503
+ },
+ {
+ "epoch": 0.7111111111111111,
+ "grad_norm": 2.98245906829834,
+ "learning_rate": 4.848139127857867e-06,
+ "loss": 0.7084,
+ "step": 1504
+ },
+ {
+ "epoch": 0.7115839243498818,
+ "grad_norm": 2.5917441844940186,
+ "learning_rate": 4.847924945903076e-06,
+ "loss": 0.5676,
+ "step": 1505
+ },
+ {
+ "epoch": 0.7120567375886525,
+ "grad_norm": 2.8736681938171387,
+ "learning_rate": 4.847710617753294e-06,
+ "loss": 0.6304,
+ "step": 1506
+ },
+ {
+ "epoch": 0.7125295508274232,
+ "grad_norm": 2.7832682132720947,
+ "learning_rate": 4.847496143421866e-06,
+ "loss": 0.5705,
+ "step": 1507
+ },
+ {
+ "epoch": 0.7130023640661939,
+ "grad_norm": 2.480560779571533,
+ "learning_rate": 4.847281522922147e-06,
+ "loss": 0.5595,
+ "step": 1508
+ },
+ {
+ "epoch": 0.7134751773049646,
+ "grad_norm": 2.357675313949585,
+ "learning_rate": 4.847066756267499e-06,
+ "loss": 0.5065,
+ "step": 1509
+ },
+ {
+ "epoch": 0.7139479905437353,
+ "grad_norm": 2.632669448852539,
+ "learning_rate": 4.846851843471296e-06,
+ "loss": 0.6949,
+ "step": 1510
+ },
+ {
+ "epoch": 0.7144208037825059,
+ "grad_norm": 2.7691073417663574,
+ "learning_rate": 4.84663678454692e-06,
+ "loss": 0.6638,
+ "step": 1511
+ },
+ {
+ "epoch": 0.7148936170212766,
+ "grad_norm": 2.5647685527801514,
+ "learning_rate": 4.846421579507761e-06,
+ "loss": 0.6098,
+ "step": 1512
+ },
+ {
+ "epoch": 0.7153664302600473,
+ "grad_norm": 2.476701021194458,
+ "learning_rate": 4.846206228367218e-06,
+ "loss": 0.592,
+ "step": 1513
+ },
+ {
+ "epoch": 0.715839243498818,
+ "grad_norm": 2.805727958679199,
+ "learning_rate": 4.845990731138702e-06,
+ "loss": 0.5466,
+ "step": 1514
+ },
+ {
+ "epoch": 0.7163120567375887,
+ "grad_norm": 2.551392078399658,
+ "learning_rate": 4.84577508783563e-06,
+ "loss": 0.6039,
+ "step": 1515
+ },
+ {
+ "epoch": 0.7167848699763594,
+ "grad_norm": 2.6861350536346436,
+ "learning_rate": 4.845559298471429e-06,
+ "loss": 0.6427,
+ "step": 1516
+ },
+ {
+ "epoch": 0.7172576832151301,
+ "grad_norm": 3.1908371448516846,
+ "learning_rate": 4.845343363059535e-06,
+ "loss": 0.5447,
+ "step": 1517
+ },
+ {
+ "epoch": 0.7177304964539007,
+ "grad_norm": 2.9021761417388916,
+ "learning_rate": 4.845127281613394e-06,
+ "loss": 0.5836,
+ "step": 1518
+ },
+ {
+ "epoch": 0.7182033096926714,
+ "grad_norm": 2.476670742034912,
+ "learning_rate": 4.844911054146461e-06,
+ "loss": 0.5863,
+ "step": 1519
+ },
+ {
+ "epoch": 0.7186761229314421,
+ "grad_norm": 2.662935495376587,
+ "learning_rate": 4.844694680672198e-06,
+ "loss": 0.5678,
+ "step": 1520
+ },
+ {
+ "epoch": 0.7191489361702128,
+ "grad_norm": 2.677896738052368,
+ "learning_rate": 4.844478161204079e-06,
+ "loss": 0.6195,
+ "step": 1521
+ },
+ {
+ "epoch": 0.7196217494089835,
+ "grad_norm": 2.781921863555908,
+ "learning_rate": 4.844261495755585e-06,
+ "loss": 0.643,
+ "step": 1522
+ },
+ {
+ "epoch": 0.7200945626477542,
+ "grad_norm": 3.0157392024993896,
+ "learning_rate": 4.844044684340206e-06,
+ "loss": 0.7559,
+ "step": 1523
+ },
+ {
+ "epoch": 0.7205673758865249,
+ "grad_norm": 2.8109354972839355,
+ "learning_rate": 4.843827726971444e-06,
+ "loss": 0.6264,
+ "step": 1524
+ },
+ {
+ "epoch": 0.7210401891252955,
+ "grad_norm": 3.0953569412231445,
+ "learning_rate": 4.8436106236628064e-06,
+ "loss": 0.6429,
+ "step": 1525
+ },
+ {
+ "epoch": 0.7215130023640662,
+ "grad_norm": 2.6850643157958984,
+ "learning_rate": 4.843393374427812e-06,
+ "loss": 0.6598,
+ "step": 1526
+ },
+ {
+ "epoch": 0.7219858156028369,
+ "grad_norm": 3.043480634689331,
+ "learning_rate": 4.8431759792799874e-06,
+ "loss": 0.6331,
+ "step": 1527
+ },
+ {
+ "epoch": 0.7224586288416076,
+ "grad_norm": 2.723870038986206,
+ "learning_rate": 4.842958438232868e-06,
+ "loss": 0.6259,
+ "step": 1528
+ },
+ {
+ "epoch": 0.7229314420803783,
+ "grad_norm": 2.822492837905884,
+ "learning_rate": 4.842740751300002e-06,
+ "loss": 0.6554,
+ "step": 1529
+ },
+ {
+ "epoch": 0.723404255319149,
+ "grad_norm": 2.7866315841674805,
+ "learning_rate": 4.842522918494941e-06,
+ "loss": 0.6991,
+ "step": 1530
+ },
+ {
+ "epoch": 0.7238770685579197,
+ "grad_norm": 2.8881826400756836,
+ "learning_rate": 4.84230493983125e-06,
+ "loss": 0.5876,
+ "step": 1531
+ },
+ {
+ "epoch": 0.7243498817966904,
+ "grad_norm": 2.7456939220428467,
+ "learning_rate": 4.8420868153225e-06,
+ "loss": 0.6188,
+ "step": 1532
+ },
+ {
+ "epoch": 0.724822695035461,
+ "grad_norm": 3.0257532596588135,
+ "learning_rate": 4.841868544982274e-06,
+ "loss": 0.63,
+ "step": 1533
+ },
+ {
+ "epoch": 0.7252955082742317,
+ "grad_norm": 3.1581954956054688,
+ "learning_rate": 4.841650128824164e-06,
+ "loss": 0.7214,
+ "step": 1534
+ },
+ {
+ "epoch": 0.7257683215130024,
+ "grad_norm": 2.9174306392669678,
+ "learning_rate": 4.841431566861767e-06,
+ "loss": 0.704,
+ "step": 1535
+ },
+ {
+ "epoch": 0.7262411347517731,
+ "grad_norm": 2.5019054412841797,
+ "learning_rate": 4.8412128591086935e-06,
+ "loss": 0.6298,
+ "step": 1536
+ },
+ {
+ "epoch": 0.7267139479905438,
+ "grad_norm": 2.724285125732422,
+ "learning_rate": 4.840994005578562e-06,
+ "loss": 0.6289,
+ "step": 1537
+ },
+ {
+ "epoch": 0.7271867612293145,
+ "grad_norm": 2.5882341861724854,
+ "learning_rate": 4.840775006284998e-06,
+ "loss": 0.6355,
+ "step": 1538
+ },
+ {
+ "epoch": 0.7276595744680852,
+ "grad_norm": 3.1281991004943848,
+ "learning_rate": 4.840555861241638e-06,
+ "loss": 0.5551,
+ "step": 1539
+ },
+ {
+ "epoch": 0.7281323877068558,
+ "grad_norm": 2.6064817905426025,
+ "learning_rate": 4.840336570462127e-06,
+ "loss": 0.5543,
+ "step": 1540
+ },
+ {
+ "epoch": 0.7286052009456265,
+ "grad_norm": 2.67112398147583,
+ "learning_rate": 4.840117133960122e-06,
+ "loss": 0.6044,
+ "step": 1541
+ },
+ {
+ "epoch": 0.7290780141843972,
+ "grad_norm": 2.838022232055664,
+ "learning_rate": 4.839897551749282e-06,
+ "loss": 0.6814,
+ "step": 1542
+ },
+ {
+ "epoch": 0.7295508274231679,
+ "grad_norm": 2.8897151947021484,
+ "learning_rate": 4.839677823843283e-06,
+ "loss": 0.593,
+ "step": 1543
+ },
+ {
+ "epoch": 0.7300236406619386,
+ "grad_norm": 2.9238014221191406,
+ "learning_rate": 4.839457950255805e-06,
+ "loss": 0.5544,
+ "step": 1544
+ },
+ {
+ "epoch": 0.7304964539007093,
+ "grad_norm": 3.016876459121704,
+ "learning_rate": 4.839237931000538e-06,
+ "loss": 0.6099,
+ "step": 1545
+ },
+ {
+ "epoch": 0.7309692671394799,
+ "grad_norm": 2.9415392875671387,
+ "learning_rate": 4.839017766091182e-06,
+ "loss": 0.6413,
+ "step": 1546
+ },
+ {
+ "epoch": 0.7314420803782505,
+ "grad_norm": 2.658067226409912,
+ "learning_rate": 4.838797455541446e-06,
+ "loss": 0.6534,
+ "step": 1547
+ },
+ {
+ "epoch": 0.7319148936170212,
+ "grad_norm": 2.460358142852783,
+ "learning_rate": 4.838576999365049e-06,
+ "loss": 0.5307,
+ "step": 1548
+ },
+ {
+ "epoch": 0.7323877068557919,
+ "grad_norm": 2.5818674564361572,
+ "learning_rate": 4.838356397575716e-06,
+ "loss": 0.6265,
+ "step": 1549
+ },
+ {
+ "epoch": 0.7328605200945626,
+ "grad_norm": 3.009197473526001,
+ "learning_rate": 4.838135650187183e-06,
+ "loss": 0.6957,
+ "step": 1550
+ },
+ {
+ "epoch": 0.7333333333333333,
+ "grad_norm": 2.738543748855591,
+ "learning_rate": 4.837914757213196e-06,
+ "loss": 0.646,
+ "step": 1551
+ },
+ {
+ "epoch": 0.733806146572104,
+ "grad_norm": 2.8208494186401367,
+ "learning_rate": 4.837693718667508e-06,
+ "loss": 0.5936,
+ "step": 1552
+ },
+ {
+ "epoch": 0.7342789598108747,
+ "grad_norm": 3.1574649810791016,
+ "learning_rate": 4.837472534563883e-06,
+ "loss": 0.6455,
+ "step": 1553
+ },
+ {
+ "epoch": 0.7347517730496453,
+ "grad_norm": 2.6737420558929443,
+ "learning_rate": 4.837251204916093e-06,
+ "loss": 0.5921,
+ "step": 1554
+ },
+ {
+ "epoch": 0.735224586288416,
+ "grad_norm": 2.424983024597168,
+ "learning_rate": 4.837029729737918e-06,
+ "loss": 0.6346,
+ "step": 1555
+ },
+ {
+ "epoch": 0.7356973995271867,
+ "grad_norm": 2.5163493156433105,
+ "learning_rate": 4.836808109043151e-06,
+ "loss": 0.6061,
+ "step": 1556
+ },
+ {
+ "epoch": 0.7361702127659574,
+ "grad_norm": 2.8377044200897217,
+ "learning_rate": 4.836586342845588e-06,
+ "loss": 0.611,
+ "step": 1557
+ },
+ {
+ "epoch": 0.7366430260047281,
+ "grad_norm": 2.5929181575775146,
+ "learning_rate": 4.83636443115904e-06,
+ "loss": 0.5496,
+ "step": 1558
+ },
+ {
+ "epoch": 0.7371158392434988,
+ "grad_norm": 2.5017223358154297,
+ "learning_rate": 4.836142373997323e-06,
+ "loss": 0.6235,
+ "step": 1559
+ },
+ {
+ "epoch": 0.7375886524822695,
+ "grad_norm": 2.822500228881836,
+ "learning_rate": 4.835920171374265e-06,
+ "loss": 0.6147,
+ "step": 1560
+ },
+ {
+ "epoch": 0.7380614657210401,
+ "grad_norm": 2.7234230041503906,
+ "learning_rate": 4.8356978233037e-06,
+ "loss": 0.6228,
+ "step": 1561
+ },
+ {
+ "epoch": 0.7385342789598108,
+ "grad_norm": 2.9565515518188477,
+ "learning_rate": 4.835475329799472e-06,
+ "loss": 0.5728,
+ "step": 1562
+ },
+ {
+ "epoch": 0.7390070921985815,
+ "grad_norm": 2.4356038570404053,
+ "learning_rate": 4.835252690875438e-06,
+ "loss": 0.6723,
+ "step": 1563
+ },
+ {
+ "epoch": 0.7394799054373522,
+ "grad_norm": 2.765913248062134,
+ "learning_rate": 4.835029906545458e-06,
+ "loss": 0.5805,
+ "step": 1564
+ },
+ {
+ "epoch": 0.7399527186761229,
+ "grad_norm": 2.4481914043426514,
+ "learning_rate": 4.834806976823405e-06,
+ "loss": 0.599,
+ "step": 1565
+ },
+ {
+ "epoch": 0.7404255319148936,
+ "grad_norm": 2.620779514312744,
+ "learning_rate": 4.834583901723158e-06,
+ "loss": 0.63,
+ "step": 1566
+ },
+ {
+ "epoch": 0.7408983451536643,
+ "grad_norm": 2.654426097869873,
+ "learning_rate": 4.83436068125861e-06,
+ "loss": 0.6544,
+ "step": 1567
+ },
+ {
+ "epoch": 0.741371158392435,
+ "grad_norm": 2.589623212814331,
+ "learning_rate": 4.834137315443656e-06,
+ "loss": 0.5596,
+ "step": 1568
+ },
+ {
+ "epoch": 0.7418439716312056,
+ "grad_norm": 2.572883129119873,
+ "learning_rate": 4.833913804292209e-06,
+ "loss": 0.5974,
+ "step": 1569
+ },
+ {
+ "epoch": 0.7423167848699763,
+ "grad_norm": 2.8744914531707764,
+ "learning_rate": 4.833690147818181e-06,
+ "loss": 0.5364,
+ "step": 1570
+ },
+ {
+ "epoch": 0.742789598108747,
+ "grad_norm": 2.9800851345062256,
+ "learning_rate": 4.833466346035502e-06,
+ "loss": 0.6287,
+ "step": 1571
+ },
+ {
+ "epoch": 0.7432624113475177,
+ "grad_norm": 2.627784490585327,
+ "learning_rate": 4.833242398958105e-06,
+ "loss": 0.621,
+ "step": 1572
+ },
+ {
+ "epoch": 0.7437352245862884,
+ "grad_norm": 2.5187721252441406,
+ "learning_rate": 4.833018306599933e-06,
+ "loss": 0.5901,
+ "step": 1573
+ },
+ {
+ "epoch": 0.7442080378250591,
+ "grad_norm": 2.4843688011169434,
+ "learning_rate": 4.832794068974944e-06,
+ "loss": 0.6336,
+ "step": 1574
+ },
+ {
+ "epoch": 0.7446808510638298,
+ "grad_norm": 2.774911880493164,
+ "learning_rate": 4.832569686097096e-06,
+ "loss": 0.6091,
+ "step": 1575
+ },
+ {
+ "epoch": 0.7451536643026004,
+ "grad_norm": 3.2562527656555176,
+ "learning_rate": 4.8323451579803615e-06,
+ "loss": 0.7686,
+ "step": 1576
+ },
+ {
+ "epoch": 0.7456264775413711,
+ "grad_norm": 2.799570083618164,
+ "learning_rate": 4.832120484638721e-06,
+ "loss": 0.6233,
+ "step": 1577
+ },
+ {
+ "epoch": 0.7460992907801418,
+ "grad_norm": 2.661893367767334,
+ "learning_rate": 4.831895666086164e-06,
+ "loss": 0.5841,
+ "step": 1578
+ },
+ {
+ "epoch": 0.7465721040189125,
+ "grad_norm": 3.0382652282714844,
+ "learning_rate": 4.831670702336689e-06,
+ "loss": 0.5769,
+ "step": 1579
+ },
+ {
+ "epoch": 0.7470449172576832,
+ "grad_norm": 2.676398515701294,
+ "learning_rate": 4.831445593404304e-06,
+ "loss": 0.619,
+ "step": 1580
+ },
+ {
+ "epoch": 0.7475177304964539,
+ "grad_norm": 2.717916965484619,
+ "learning_rate": 4.831220339303024e-06,
+ "loss": 0.5787,
+ "step": 1581
+ },
+ {
+ "epoch": 0.7479905437352246,
+ "grad_norm": 2.3918066024780273,
+ "learning_rate": 4.830994940046876e-06,
+ "loss": 0.5108,
+ "step": 1582
+ },
+ {
+ "epoch": 0.7484633569739952,
+ "grad_norm": 2.709144115447998,
+ "learning_rate": 4.830769395649895e-06,
+ "loss": 0.6875,
+ "step": 1583
+ },
+ {
+ "epoch": 0.7489361702127659,
+ "grad_norm": 2.8711116313934326,
+ "learning_rate": 4.830543706126123e-06,
+ "loss": 0.6745,
+ "step": 1584
+ },
+ {
+ "epoch": 0.7494089834515366,
+ "grad_norm": 2.612339496612549,
+ "learning_rate": 4.830317871489614e-06,
+ "loss": 0.5738,
+ "step": 1585
+ },
+ {
+ "epoch": 0.7498817966903073,
+ "grad_norm": 2.4355857372283936,
+ "learning_rate": 4.830091891754429e-06,
+ "loss": 0.5907,
+ "step": 1586
+ },
+ {
+ "epoch": 0.750354609929078,
+ "grad_norm": 2.676051378250122,
+ "learning_rate": 4.829865766934638e-06,
+ "loss": 0.6628,
+ "step": 1587
+ },
+ {
+ "epoch": 0.7508274231678487,
+ "grad_norm": 2.66489839553833,
+ "learning_rate": 4.829639497044323e-06,
+ "loss": 0.5984,
+ "step": 1588
+ },
+ {
+ "epoch": 0.7513002364066194,
+ "grad_norm": 2.5358035564422607,
+ "learning_rate": 4.829413082097572e-06,
+ "loss": 0.5867,
+ "step": 1589
+ },
+ {
+ "epoch": 0.75177304964539,
+ "grad_norm": 2.6530144214630127,
+ "learning_rate": 4.8291865221084815e-06,
+ "loss": 0.5917,
+ "step": 1590
+ },
+ {
+ "epoch": 0.7522458628841607,
+ "grad_norm": 2.5160958766937256,
+ "learning_rate": 4.82895981709116e-06,
+ "loss": 0.6347,
+ "step": 1591
+ },
+ {
+ "epoch": 0.7527186761229314,
+ "grad_norm": 2.61592698097229,
+ "learning_rate": 4.8287329670597225e-06,
+ "loss": 0.5472,
+ "step": 1592
+ },
+ {
+ "epoch": 0.7531914893617021,
+ "grad_norm": 2.7528622150421143,
+ "learning_rate": 4.828505972028296e-06,
+ "loss": 0.5842,
+ "step": 1593
+ },
+ {
+ "epoch": 0.7536643026004728,
+ "grad_norm": 2.8154072761535645,
+ "learning_rate": 4.828278832011011e-06,
+ "loss": 0.5757,
+ "step": 1594
+ },
+ {
+ "epoch": 0.7541371158392435,
+ "grad_norm": 3.118515729904175,
+ "learning_rate": 4.828051547022013e-06,
+ "loss": 0.6472,
+ "step": 1595
+ },
+ {
+ "epoch": 0.7546099290780142,
+ "grad_norm": 2.452033758163452,
+ "learning_rate": 4.827824117075453e-06,
+ "loss": 0.5571,
+ "step": 1596
+ },
+ {
+ "epoch": 0.7550827423167848,
+ "grad_norm": 2.984388828277588,
+ "learning_rate": 4.827596542185492e-06,
+ "loss": 0.6656,
+ "step": 1597
+ },
+ {
+ "epoch": 0.7555555555555555,
+ "grad_norm": 2.61356782913208,
+ "learning_rate": 4.8273688223663014e-06,
+ "loss": 0.6444,
+ "step": 1598
+ },
+ {
+ "epoch": 0.7560283687943262,
+ "grad_norm": 2.8967196941375732,
+ "learning_rate": 4.8271409576320595e-06,
+ "loss": 0.6457,
+ "step": 1599
+ },
+ {
+ "epoch": 0.7565011820330969,
+ "grad_norm": 2.852367639541626,
+ "learning_rate": 4.826912947996954e-06,
+ "loss": 0.5629,
+ "step": 1600
+ },
+ {
+ "epoch": 0.7569739952718676,
+ "grad_norm": 2.905280590057373,
+ "learning_rate": 4.826684793475182e-06,
+ "loss": 0.6245,
+ "step": 1601
+ },
+ {
+ "epoch": 0.7574468085106383,
+ "grad_norm": 2.6156530380249023,
+ "learning_rate": 4.826456494080951e-06,
+ "loss": 0.5869,
+ "step": 1602
+ },
+ {
+ "epoch": 0.757919621749409,
+ "grad_norm": 2.6490228176116943,
+ "learning_rate": 4.826228049828475e-06,
+ "loss": 0.5461,
+ "step": 1603
+ },
+ {
+ "epoch": 0.7583924349881797,
+ "grad_norm": 2.9626693725585938,
+ "learning_rate": 4.825999460731978e-06,
+ "loss": 0.6842,
+ "step": 1604
+ },
+ {
+ "epoch": 0.7588652482269503,
+ "grad_norm": 2.6866023540496826,
+ "learning_rate": 4.825770726805695e-06,
+ "loss": 0.5726,
+ "step": 1605
+ },
+ {
+ "epoch": 0.759338061465721,
+ "grad_norm": 2.5525858402252197,
+ "learning_rate": 4.825541848063866e-06,
+ "loss": 0.6061,
+ "step": 1606
+ },
+ {
+ "epoch": 0.7598108747044917,
+ "grad_norm": 2.703977584838867,
+ "learning_rate": 4.825312824520743e-06,
+ "loss": 0.6726,
+ "step": 1607
+ },
+ {
+ "epoch": 0.7602836879432624,
+ "grad_norm": 2.856534957885742,
+ "learning_rate": 4.825083656190588e-06,
+ "loss": 0.625,
+ "step": 1608
+ },
+ {
+ "epoch": 0.7607565011820331,
+ "grad_norm": 2.8564887046813965,
+ "learning_rate": 4.824854343087668e-06,
+ "loss": 0.7251,
+ "step": 1609
+ },
+ {
+ "epoch": 0.7612293144208038,
+ "grad_norm": 2.327650308609009,
+ "learning_rate": 4.824624885226262e-06,
+ "loss": 0.526,
+ "step": 1610
+ },
+ {
+ "epoch": 0.7617021276595745,
+ "grad_norm": 3.0025737285614014,
+ "learning_rate": 4.824395282620659e-06,
+ "loss": 0.6043,
+ "step": 1611
+ },
+ {
+ "epoch": 0.7621749408983451,
+ "grad_norm": 2.5441737174987793,
+ "learning_rate": 4.824165535285152e-06,
+ "loss": 0.6276,
+ "step": 1612
+ },
+ {
+ "epoch": 0.7626477541371158,
+ "grad_norm": 2.4177372455596924,
+ "learning_rate": 4.823935643234049e-06,
+ "loss": 0.6419,
+ "step": 1613
+ },
+ {
+ "epoch": 0.7631205673758865,
+ "grad_norm": 2.9210550785064697,
+ "learning_rate": 4.823705606481664e-06,
+ "loss": 0.5663,
+ "step": 1614
+ },
+ {
+ "epoch": 0.7635933806146572,
+ "grad_norm": 2.6353724002838135,
+ "learning_rate": 4.82347542504232e-06,
+ "loss": 0.5669,
+ "step": 1615
+ },
+ {
+ "epoch": 0.7640661938534279,
+ "grad_norm": 2.419081926345825,
+ "learning_rate": 4.823245098930349e-06,
+ "loss": 0.5777,
+ "step": 1616
+ },
+ {
+ "epoch": 0.7645390070921986,
+ "grad_norm": 2.5077571868896484,
+ "learning_rate": 4.823014628160093e-06,
+ "loss": 0.5924,
+ "step": 1617
+ },
+ {
+ "epoch": 0.7650118203309693,
+ "grad_norm": 2.816056251525879,
+ "learning_rate": 4.822784012745902e-06,
+ "loss": 0.7273,
+ "step": 1618
+ },
+ {
+ "epoch": 0.76548463356974,
+ "grad_norm": 2.7163147926330566,
+ "learning_rate": 4.8225532527021366e-06,
+ "loss": 0.5545,
+ "step": 1619
+ },
+ {
+ "epoch": 0.7659574468085106,
+ "grad_norm": 2.4784302711486816,
+ "learning_rate": 4.822322348043164e-06,
+ "loss": 0.556,
+ "step": 1620
+ },
+ {
+ "epoch": 0.7664302600472813,
+ "grad_norm": 2.712467670440674,
+ "learning_rate": 4.822091298783361e-06,
+ "loss": 0.6501,
+ "step": 1621
+ },
+ {
+ "epoch": 0.766903073286052,
+ "grad_norm": 2.7217724323272705,
+ "learning_rate": 4.821860104937115e-06,
+ "loss": 0.5989,
+ "step": 1622
+ },
+ {
+ "epoch": 0.7673758865248227,
+ "grad_norm": 2.5622854232788086,
+ "learning_rate": 4.821628766518821e-06,
+ "loss": 0.5263,
+ "step": 1623
+ },
+ {
+ "epoch": 0.7678486997635934,
+ "grad_norm": 3.230923891067505,
+ "learning_rate": 4.821397283542884e-06,
+ "loss": 0.6707,
+ "step": 1624
+ },
+ {
+ "epoch": 0.7683215130023641,
+ "grad_norm": 2.37929105758667,
+ "learning_rate": 4.821165656023718e-06,
+ "loss": 0.6124,
+ "step": 1625
+ },
+ {
+ "epoch": 0.7687943262411348,
+ "grad_norm": 2.9811325073242188,
+ "learning_rate": 4.820933883975745e-06,
+ "loss": 0.6435,
+ "step": 1626
+ },
+ {
+ "epoch": 0.7692671394799054,
+ "grad_norm": 2.887380838394165,
+ "learning_rate": 4.820701967413395e-06,
+ "loss": 0.621,
+ "step": 1627
+ },
+ {
+ "epoch": 0.7697399527186761,
+ "grad_norm": 2.6762876510620117,
+ "learning_rate": 4.820469906351109e-06,
+ "loss": 0.5713,
+ "step": 1628
+ },
+ {
+ "epoch": 0.7702127659574468,
+ "grad_norm": 2.7347512245178223,
+ "learning_rate": 4.820237700803337e-06,
+ "loss": 0.6136,
+ "step": 1629
+ },
+ {
+ "epoch": 0.7706855791962175,
+ "grad_norm": 2.7244746685028076,
+ "learning_rate": 4.820005350784539e-06,
+ "loss": 0.5816,
+ "step": 1630
+ },
+ {
+ "epoch": 0.7711583924349882,
+ "grad_norm": 2.9293999671936035,
+ "learning_rate": 4.8197728563091795e-06,
+ "loss": 0.6649,
+ "step": 1631
+ },
+ {
+ "epoch": 0.7716312056737589,
+ "grad_norm": 2.4402127265930176,
+ "learning_rate": 4.819540217391736e-06,
+ "loss": 0.6481,
+ "step": 1632
+ },
+ {
+ "epoch": 0.7721040189125296,
+ "grad_norm": 3.083941698074341,
+ "learning_rate": 4.819307434046694e-06,
+ "loss": 0.6951,
+ "step": 1633
+ },
+ {
+ "epoch": 0.7725768321513002,
+ "grad_norm": 2.544952392578125,
+ "learning_rate": 4.819074506288548e-06,
+ "loss": 0.539,
+ "step": 1634
+ },
+ {
+ "epoch": 0.7730496453900709,
+ "grad_norm": 2.7791268825531006,
+ "learning_rate": 4.818841434131801e-06,
+ "loss": 0.5827,
+ "step": 1635
+ },
+ {
+ "epoch": 0.7735224586288416,
+ "grad_norm": 2.7349796295166016,
+ "learning_rate": 4.818608217590967e-06,
+ "loss": 0.5584,
+ "step": 1636
+ },
+ {
+ "epoch": 0.7739952718676123,
+ "grad_norm": 2.637652635574341,
+ "learning_rate": 4.818374856680565e-06,
+ "loss": 0.6386,
+ "step": 1637
+ },
+ {
+ "epoch": 0.774468085106383,
+ "grad_norm": 2.9821584224700928,
+ "learning_rate": 4.818141351415127e-06,
+ "loss": 0.6734,
+ "step": 1638
+ },
+ {
+ "epoch": 0.7749408983451537,
+ "grad_norm": 2.992938995361328,
+ "learning_rate": 4.817907701809192e-06,
+ "loss": 0.5899,
+ "step": 1639
+ },
+ {
+ "epoch": 0.7754137115839244,
+ "grad_norm": 4.35719633102417,
+ "learning_rate": 4.8176739078773076e-06,
+ "loss": 0.6281,
+ "step": 1640
+ },
+ {
+ "epoch": 0.775886524822695,
+ "grad_norm": 2.838146209716797,
+ "learning_rate": 4.8174399696340315e-06,
+ "loss": 0.5766,
+ "step": 1641
+ },
+ {
+ "epoch": 0.7763593380614657,
+ "grad_norm": 3.3116989135742188,
+ "learning_rate": 4.81720588709393e-06,
+ "loss": 0.6409,
+ "step": 1642
+ },
+ {
+ "epoch": 0.7768321513002364,
+ "grad_norm": 2.9843590259552,
+ "learning_rate": 4.816971660271579e-06,
+ "loss": 0.6108,
+ "step": 1643
+ },
+ {
+ "epoch": 0.7773049645390071,
+ "grad_norm": 2.843770742416382,
+ "learning_rate": 4.816737289181562e-06,
+ "loss": 0.6053,
+ "step": 1644
+ },
+ {
+ "epoch": 0.7777777777777778,
+ "grad_norm": 2.7608556747436523,
+ "learning_rate": 4.816502773838473e-06,
+ "loss": 0.5854,
+ "step": 1645
+ },
+ {
+ "epoch": 0.7782505910165485,
+ "grad_norm": 3.343682289123535,
+ "learning_rate": 4.816268114256914e-06,
+ "loss": 0.6329,
+ "step": 1646
+ },
+ {
+ "epoch": 0.7787234042553192,
+ "grad_norm": 2.769768476486206,
+ "learning_rate": 4.816033310451496e-06,
+ "loss": 0.6242,
+ "step": 1647
+ },
+ {
+ "epoch": 0.7791962174940898,
+ "grad_norm": 2.989851713180542,
+ "learning_rate": 4.815798362436838e-06,
+ "loss": 0.6493,
+ "step": 1648
+ },
+ {
+ "epoch": 0.7796690307328605,
+ "grad_norm": 3.170736312866211,
+ "learning_rate": 4.8155632702275716e-06,
+ "loss": 0.6341,
+ "step": 1649
+ },
+ {
+ "epoch": 0.7801418439716312,
+ "grad_norm": 2.7372522354125977,
+ "learning_rate": 4.815328033838334e-06,
+ "loss": 0.5445,
+ "step": 1650
+ },
+ {
+ "epoch": 0.7806146572104019,
+ "grad_norm": 2.6947238445281982,
+ "learning_rate": 4.8150926532837715e-06,
+ "loss": 0.6437,
+ "step": 1651
+ },
+ {
+ "epoch": 0.7810874704491726,
+ "grad_norm": 2.472323179244995,
+ "learning_rate": 4.81485712857854e-06,
+ "loss": 0.5751,
+ "step": 1652
+ },
+ {
+ "epoch": 0.7815602836879433,
+ "grad_norm": 2.791114091873169,
+ "learning_rate": 4.814621459737308e-06,
+ "loss": 0.5996,
+ "step": 1653
+ },
+ {
+ "epoch": 0.782033096926714,
+ "grad_norm": 3.1957521438598633,
+ "learning_rate": 4.814385646774745e-06,
+ "loss": 0.5803,
+ "step": 1654
+ },
+ {
+ "epoch": 0.7825059101654847,
+ "grad_norm": 2.4120798110961914,
+ "learning_rate": 4.8141496897055364e-06,
+ "loss": 0.5814,
+ "step": 1655
+ },
+ {
+ "epoch": 0.7829787234042553,
+ "grad_norm": 2.9262423515319824,
+ "learning_rate": 4.813913588544374e-06,
+ "loss": 0.6292,
+ "step": 1656
+ },
+ {
+ "epoch": 0.783451536643026,
+ "grad_norm": 2.8251047134399414,
+ "learning_rate": 4.813677343305959e-06,
+ "loss": 0.6787,
+ "step": 1657
+ },
+ {
+ "epoch": 0.7839243498817967,
+ "grad_norm": 2.931659698486328,
+ "learning_rate": 4.8134409540050005e-06,
+ "loss": 0.6163,
+ "step": 1658
+ },
+ {
+ "epoch": 0.7843971631205674,
+ "grad_norm": 2.7160706520080566,
+ "learning_rate": 4.813204420656219e-06,
+ "loss": 0.6831,
+ "step": 1659
+ },
+ {
+ "epoch": 0.7848699763593381,
+ "grad_norm": 3.2134454250335693,
+ "learning_rate": 4.81296774327434e-06,
+ "loss": 0.6002,
+ "step": 1660
+ },
+ {
+ "epoch": 0.7853427895981088,
+ "grad_norm": 2.4002513885498047,
+ "learning_rate": 4.812730921874103e-06,
+ "loss": 0.5488,
+ "step": 1661
+ },
+ {
+ "epoch": 0.7858156028368795,
+ "grad_norm": 2.5559282302856445,
+ "learning_rate": 4.812493956470251e-06,
+ "loss": 0.5802,
+ "step": 1662
+ },
+ {
+ "epoch": 0.7862884160756501,
+ "grad_norm": 2.57478404045105,
+ "learning_rate": 4.812256847077541e-06,
+ "loss": 0.646,
+ "step": 1663
+ },
+ {
+ "epoch": 0.7867612293144208,
+ "grad_norm": 2.811851978302002,
+ "learning_rate": 4.812019593710736e-06,
+ "loss": 0.6245,
+ "step": 1664
+ },
+ {
+ "epoch": 0.7872340425531915,
+ "grad_norm": 2.5228829383850098,
+ "learning_rate": 4.811782196384609e-06,
+ "loss": 0.5949,
+ "step": 1665
+ },
+ {
+ "epoch": 0.7877068557919622,
+ "grad_norm": 2.744096040725708,
+ "learning_rate": 4.8115446551139415e-06,
+ "loss": 0.6006,
+ "step": 1666
+ },
+ {
+ "epoch": 0.7881796690307329,
+ "grad_norm": 3.129242420196533,
+ "learning_rate": 4.811306969913524e-06,
+ "loss": 0.7251,
+ "step": 1667
+ },
+ {
+ "epoch": 0.7886524822695036,
+ "grad_norm": 2.7855660915374756,
+ "learning_rate": 4.811069140798156e-06,
+ "loss": 0.6534,
+ "step": 1668
+ },
+ {
+ "epoch": 0.7891252955082743,
+ "grad_norm": 2.836603879928589,
+ "learning_rate": 4.810831167782647e-06,
+ "loss": 0.6661,
+ "step": 1669
+ },
+ {
+ "epoch": 0.789598108747045,
+ "grad_norm": 2.5339887142181396,
+ "learning_rate": 4.810593050881813e-06,
+ "loss": 0.5354,
+ "step": 1670
+ },
+ {
+ "epoch": 0.7900709219858156,
+ "grad_norm": 2.9553709030151367,
+ "learning_rate": 4.810354790110482e-06,
+ "loss": 0.6001,
+ "step": 1671
+ },
+ {
+ "epoch": 0.7905437352245863,
+ "grad_norm": 2.6581788063049316,
+ "learning_rate": 4.8101163854834885e-06,
+ "loss": 0.6802,
+ "step": 1672
+ },
+ {
+ "epoch": 0.791016548463357,
+ "grad_norm": 3.2002551555633545,
+ "learning_rate": 4.809877837015677e-06,
+ "loss": 0.6641,
+ "step": 1673
+ },
+ {
+ "epoch": 0.7914893617021277,
+ "grad_norm": 2.918792963027954,
+ "learning_rate": 4.809639144721902e-06,
+ "loss": 0.6758,
+ "step": 1674
+ },
+ {
+ "epoch": 0.7919621749408984,
+ "grad_norm": 2.7993946075439453,
+ "learning_rate": 4.8094003086170245e-06,
+ "loss": 0.5889,
+ "step": 1675
+ },
+ {
+ "epoch": 0.7924349881796691,
+ "grad_norm": 2.3698952198028564,
+ "learning_rate": 4.809161328715916e-06,
+ "loss": 0.6244,
+ "step": 1676
+ },
+ {
+ "epoch": 0.7929078014184398,
+ "grad_norm": 2.8891594409942627,
+ "learning_rate": 4.808922205033458e-06,
+ "loss": 0.5835,
+ "step": 1677
+ },
+ {
+ "epoch": 0.7933806146572104,
+ "grad_norm": 2.838345766067505,
+ "learning_rate": 4.808682937584537e-06,
+ "loss": 0.6907,
+ "step": 1678
+ },
+ {
+ "epoch": 0.7938534278959811,
+ "grad_norm": 2.8443174362182617,
+ "learning_rate": 4.808443526384053e-06,
+ "loss": 0.6692,
+ "step": 1679
+ },
+ {
+ "epoch": 0.7943262411347518,
+ "grad_norm": 2.7355034351348877,
+ "learning_rate": 4.808203971446913e-06,
+ "loss": 0.5799,
+ "step": 1680
+ },
+ {
+ "epoch": 0.7947990543735225,
+ "grad_norm": 2.7108020782470703,
+ "learning_rate": 4.807964272788033e-06,
+ "loss": 0.652,
+ "step": 1681
+ },
+ {
+ "epoch": 0.7952718676122932,
+ "grad_norm": 2.397650957107544,
+ "learning_rate": 4.807724430422338e-06,
+ "loss": 0.5418,
+ "step": 1682
+ },
+ {
+ "epoch": 0.7957446808510639,
+ "grad_norm": 2.4981582164764404,
+ "learning_rate": 4.807484444364762e-06,
+ "loss": 0.5731,
+ "step": 1683
+ },
+ {
+ "epoch": 0.7962174940898346,
+ "grad_norm": 2.7943713665008545,
+ "learning_rate": 4.8072443146302475e-06,
+ "loss": 0.5913,
+ "step": 1684
+ },
+ {
+ "epoch": 0.7966903073286052,
+ "grad_norm": 2.5691423416137695,
+ "learning_rate": 4.807004041233746e-06,
+ "loss": 0.6475,
+ "step": 1685
+ },
+ {
+ "epoch": 0.7971631205673759,
+ "grad_norm": 3.2367498874664307,
+ "learning_rate": 4.8067636241902195e-06,
+ "loss": 0.675,
+ "step": 1686
+ },
+ {
+ "epoch": 0.7976359338061466,
+ "grad_norm": 3.000595808029175,
+ "learning_rate": 4.806523063514637e-06,
+ "loss": 0.5481,
+ "step": 1687
+ },
+ {
+ "epoch": 0.7981087470449173,
+ "grad_norm": 2.702014207839966,
+ "learning_rate": 4.806282359221976e-06,
+ "loss": 0.5993,
+ "step": 1688
+ },
+ {
+ "epoch": 0.798581560283688,
+ "grad_norm": 2.383671998977661,
+ "learning_rate": 4.806041511327226e-06,
+ "loss": 0.562,
+ "step": 1689
+ },
+ {
+ "epoch": 0.7990543735224587,
+ "grad_norm": 2.6965041160583496,
+ "learning_rate": 4.8058005198453834e-06,
+ "loss": 0.5955,
+ "step": 1690
+ },
+ {
+ "epoch": 0.7995271867612294,
+ "grad_norm": 2.5906765460968018,
+ "learning_rate": 4.805559384791453e-06,
+ "loss": 0.5151,
+ "step": 1691
+ },
+ {
+ "epoch": 0.8,
+ "grad_norm": 2.5454652309417725,
+ "learning_rate": 4.8053181061804475e-06,
+ "loss": 0.5843,
+ "step": 1692
+ },
+ {
+ "epoch": 0.8004728132387707,
+ "grad_norm": 2.661343812942505,
+ "learning_rate": 4.8050766840273935e-06,
+ "loss": 0.5995,
+ "step": 1693
+ },
+ {
+ "epoch": 0.8009456264775414,
+ "grad_norm": 2.5635924339294434,
+ "learning_rate": 4.8048351183473215e-06,
+ "loss": 0.5676,
+ "step": 1694
+ },
+ {
+ "epoch": 0.8014184397163121,
+ "grad_norm": 2.5936667919158936,
+ "learning_rate": 4.804593409155274e-06,
+ "loss": 0.6291,
+ "step": 1695
+ },
+ {
+ "epoch": 0.8018912529550828,
+ "grad_norm": 2.6902432441711426,
+ "learning_rate": 4.804351556466299e-06,
+ "loss": 0.6114,
+ "step": 1696
+ },
+ {
+ "epoch": 0.8023640661938535,
+ "grad_norm": 2.7764673233032227,
+ "learning_rate": 4.804109560295457e-06,
+ "loss": 0.5768,
+ "step": 1697
+ },
+ {
+ "epoch": 0.8028368794326242,
+ "grad_norm": 2.9587221145629883,
+ "learning_rate": 4.803867420657816e-06,
+ "loss": 0.6048,
+ "step": 1698
+ },
+ {
+ "epoch": 0.8033096926713948,
+ "grad_norm": 2.9238998889923096,
+ "learning_rate": 4.803625137568453e-06,
+ "loss": 0.6329,
+ "step": 1699
+ },
+ {
+ "epoch": 0.8037825059101655,
+ "grad_norm": 2.70473313331604,
+ "learning_rate": 4.803382711042455e-06,
+ "loss": 0.5427,
+ "step": 1700
+ },
+ {
+ "epoch": 0.8042553191489362,
+ "grad_norm": 3.1604979038238525,
+ "learning_rate": 4.803140141094914e-06,
+ "loss": 0.626,
+ "step": 1701
+ },
+ {
+ "epoch": 0.8047281323877069,
+ "grad_norm": 2.9567699432373047,
+ "learning_rate": 4.802897427740936e-06,
+ "loss": 0.5319,
+ "step": 1702
+ },
+ {
+ "epoch": 0.8052009456264776,
+ "grad_norm": 2.90983247756958,
+ "learning_rate": 4.802654570995632e-06,
+ "loss": 0.586,
+ "step": 1703
+ },
+ {
+ "epoch": 0.8056737588652483,
+ "grad_norm": 2.783480167388916,
+ "learning_rate": 4.8024115708741255e-06,
+ "loss": 0.5773,
+ "step": 1704
+ },
+ {
+ "epoch": 0.806146572104019,
+ "grad_norm": 3.3307793140411377,
+ "learning_rate": 4.802168427391547e-06,
+ "loss": 0.6257,
+ "step": 1705
+ },
+ {
+ "epoch": 0.8066193853427897,
+ "grad_norm": 3.0475001335144043,
+ "learning_rate": 4.801925140563034e-06,
+ "loss": 0.6612,
+ "step": 1706
+ },
+ {
+ "epoch": 0.8070921985815603,
+ "grad_norm": 2.8278894424438477,
+ "learning_rate": 4.8016817104037375e-06,
+ "loss": 0.6449,
+ "step": 1707
+ },
+ {
+ "epoch": 0.807565011820331,
+ "grad_norm": 2.760244369506836,
+ "learning_rate": 4.801438136928812e-06,
+ "loss": 0.7007,
+ "step": 1708
+ },
+ {
+ "epoch": 0.8080378250591016,
+ "grad_norm": 2.827634572982788,
+ "learning_rate": 4.801194420153427e-06,
+ "loss": 0.6418,
+ "step": 1709
+ },
+ {
+ "epoch": 0.8085106382978723,
+ "grad_norm": 2.8655009269714355,
+ "learning_rate": 4.800950560092754e-06,
+ "loss": 0.6231,
+ "step": 1710
+ },
+ {
+ "epoch": 0.808983451536643,
+ "grad_norm": 2.738112688064575,
+ "learning_rate": 4.800706556761981e-06,
+ "loss": 0.6463,
+ "step": 1711
+ },
+ {
+ "epoch": 0.8094562647754137,
+ "grad_norm": 2.4781179428100586,
+ "learning_rate": 4.800462410176296e-06,
+ "loss": 0.5365,
+ "step": 1712
+ },
+ {
+ "epoch": 0.8099290780141843,
+ "grad_norm": 2.6049838066101074,
+ "learning_rate": 4.800218120350906e-06,
+ "loss": 0.6035,
+ "step": 1713
+ },
+ {
+ "epoch": 0.810401891252955,
+ "grad_norm": 2.9089980125427246,
+ "learning_rate": 4.79997368730102e-06,
+ "loss": 0.5828,
+ "step": 1714
+ },
+ {
+ "epoch": 0.8108747044917257,
+ "grad_norm": 2.831871747970581,
+ "learning_rate": 4.799729111041857e-06,
+ "loss": 0.5953,
+ "step": 1715
+ },
+ {
+ "epoch": 0.8113475177304964,
+ "grad_norm": 2.5611300468444824,
+ "learning_rate": 4.799484391588647e-06,
+ "loss": 0.6302,
+ "step": 1716
+ },
+ {
+ "epoch": 0.8118203309692671,
+ "grad_norm": 2.744070053100586,
+ "learning_rate": 4.799239528956625e-06,
+ "loss": 0.5561,
+ "step": 1717
+ },
+ {
+ "epoch": 0.8122931442080378,
+ "grad_norm": 2.7344231605529785,
+ "learning_rate": 4.798994523161041e-06,
+ "loss": 0.6317,
+ "step": 1718
+ },
+ {
+ "epoch": 0.8127659574468085,
+ "grad_norm": 2.3420889377593994,
+ "learning_rate": 4.798749374217149e-06,
+ "loss": 0.5415,
+ "step": 1719
+ },
+ {
+ "epoch": 0.8132387706855791,
+ "grad_norm": 2.57384991645813,
+ "learning_rate": 4.798504082140212e-06,
+ "loss": 0.6383,
+ "step": 1720
+ },
+ {
+ "epoch": 0.8137115839243498,
+ "grad_norm": 2.8819844722747803,
+ "learning_rate": 4.798258646945505e-06,
+ "loss": 0.6355,
+ "step": 1721
+ },
+ {
+ "epoch": 0.8141843971631205,
+ "grad_norm": 2.908123254776001,
+ "learning_rate": 4.79801306864831e-06,
+ "loss": 0.701,
+ "step": 1722
+ },
+ {
+ "epoch": 0.8146572104018912,
+ "grad_norm": 2.6500701904296875,
+ "learning_rate": 4.797767347263917e-06,
+ "loss": 0.6152,
+ "step": 1723
+ },
+ {
+ "epoch": 0.8151300236406619,
+ "grad_norm": 2.5513017177581787,
+ "learning_rate": 4.797521482807628e-06,
+ "loss": 0.6241,
+ "step": 1724
+ },
+ {
+ "epoch": 0.8156028368794326,
+ "grad_norm": 2.6239185333251953,
+ "learning_rate": 4.7972754752947495e-06,
+ "loss": 0.6072,
+ "step": 1725
+ },
+ {
+ "epoch": 0.8160756501182033,
+ "grad_norm": 2.673436403274536,
+ "learning_rate": 4.797029324740601e-06,
+ "loss": 0.5802,
+ "step": 1726
+ },
+ {
+ "epoch": 0.816548463356974,
+ "grad_norm": 2.533831834793091,
+ "learning_rate": 4.796783031160508e-06,
+ "loss": 0.5566,
+ "step": 1727
+ },
+ {
+ "epoch": 0.8170212765957446,
+ "grad_norm": 2.9806582927703857,
+ "learning_rate": 4.796536594569807e-06,
+ "loss": 0.6945,
+ "step": 1728
+ },
+ {
+ "epoch": 0.8174940898345153,
+ "grad_norm": 2.7093560695648193,
+ "learning_rate": 4.796290014983842e-06,
+ "loss": 0.7143,
+ "step": 1729
+ },
+ {
+ "epoch": 0.817966903073286,
+ "grad_norm": 2.814507246017456,
+ "learning_rate": 4.796043292417967e-06,
+ "loss": 0.6122,
+ "step": 1730
+ },
+ {
+ "epoch": 0.8184397163120567,
+ "grad_norm": 2.537156820297241,
+ "learning_rate": 4.795796426887543e-06,
+ "loss": 0.6229,
+ "step": 1731
+ },
+ {
+ "epoch": 0.8189125295508274,
+ "grad_norm": 2.4878013134002686,
+ "learning_rate": 4.795549418407944e-06,
+ "loss": 0.5442,
+ "step": 1732
+ },
+ {
+ "epoch": 0.8193853427895981,
+ "grad_norm": 2.839383363723755,
+ "learning_rate": 4.795302266994548e-06,
+ "loss": 0.6717,
+ "step": 1733
+ },
+ {
+ "epoch": 0.8198581560283688,
+ "grad_norm": 3.1981801986694336,
+ "learning_rate": 4.795054972662744e-06,
+ "loss": 0.6596,
+ "step": 1734
+ },
+ {
+ "epoch": 0.8203309692671394,
+ "grad_norm": 2.781730890274048,
+ "learning_rate": 4.79480753542793e-06,
+ "loss": 0.5845,
+ "step": 1735
+ },
+ {
+ "epoch": 0.8208037825059101,
+ "grad_norm": 2.689948558807373,
+ "learning_rate": 4.794559955305513e-06,
+ "loss": 0.5928,
+ "step": 1736
+ },
+ {
+ "epoch": 0.8212765957446808,
+ "grad_norm": 2.7267637252807617,
+ "learning_rate": 4.7943122323109105e-06,
+ "loss": 0.5224,
+ "step": 1737
+ },
+ {
+ "epoch": 0.8217494089834515,
+ "grad_norm": 2.4346601963043213,
+ "learning_rate": 4.794064366459544e-06,
+ "loss": 0.6431,
+ "step": 1738
+ },
+ {
+ "epoch": 0.8222222222222222,
+ "grad_norm": 2.7440176010131836,
+ "learning_rate": 4.793816357766849e-06,
+ "loss": 0.6083,
+ "step": 1739
+ },
+ {
+ "epoch": 0.8226950354609929,
+ "grad_norm": 2.6558027267456055,
+ "learning_rate": 4.793568206248268e-06,
+ "loss": 0.698,
+ "step": 1740
+ },
+ {
+ "epoch": 0.8231678486997636,
+ "grad_norm": 2.591658353805542,
+ "learning_rate": 4.793319911919251e-06,
+ "loss": 0.6601,
+ "step": 1741
+ },
+ {
+ "epoch": 0.8236406619385342,
+ "grad_norm": 2.5431172847747803,
+ "learning_rate": 4.79307147479526e-06,
+ "loss": 0.5917,
+ "step": 1742
+ },
+ {
+ "epoch": 0.8241134751773049,
+ "grad_norm": 2.7335588932037354,
+ "learning_rate": 4.792822894891762e-06,
+ "loss": 0.5925,
+ "step": 1743
+ },
+ {
+ "epoch": 0.8245862884160756,
+ "grad_norm": 2.2500839233398438,
+ "learning_rate": 4.792574172224237e-06,
+ "loss": 0.4984,
+ "step": 1744
+ },
+ {
+ "epoch": 0.8250591016548463,
+ "grad_norm": 2.691343069076538,
+ "learning_rate": 4.79232530680817e-06,
+ "loss": 0.6262,
+ "step": 1745
+ },
+ {
+ "epoch": 0.825531914893617,
+ "grad_norm": 2.612204074859619,
+ "learning_rate": 4.792076298659058e-06,
+ "loss": 0.5822,
+ "step": 1746
+ },
+ {
+ "epoch": 0.8260047281323877,
+ "grad_norm": 3.0163519382476807,
+ "learning_rate": 4.791827147792406e-06,
+ "loss": 0.6263,
+ "step": 1747
+ },
+ {
+ "epoch": 0.8264775413711584,
+ "grad_norm": 2.742183208465576,
+ "learning_rate": 4.791577854223727e-06,
+ "loss": 0.6628,
+ "step": 1748
+ },
+ {
+ "epoch": 0.826950354609929,
+ "grad_norm": 2.872213840484619,
+ "learning_rate": 4.791328417968542e-06,
+ "loss": 0.6332,
+ "step": 1749
+ },
+ {
+ "epoch": 0.8274231678486997,
+ "grad_norm": 2.725006580352783,
+ "learning_rate": 4.7910788390423844e-06,
+ "loss": 0.6266,
+ "step": 1750
+ },
+ {
+ "epoch": 0.8278959810874704,
+ "grad_norm": 3.0366697311401367,
+ "learning_rate": 4.790829117460793e-06,
+ "loss": 0.6403,
+ "step": 1751
+ },
+ {
+ "epoch": 0.8283687943262411,
+ "grad_norm": 2.594881772994995,
+ "learning_rate": 4.790579253239318e-06,
+ "loss": 0.521,
+ "step": 1752
+ },
+ {
+ "epoch": 0.8288416075650118,
+ "grad_norm": 2.4496347904205322,
+ "learning_rate": 4.790329246393517e-06,
+ "loss": 0.54,
+ "step": 1753
+ },
+ {
+ "epoch": 0.8293144208037825,
+ "grad_norm": 3.102278470993042,
+ "learning_rate": 4.790079096938956e-06,
+ "loss": 0.6142,
+ "step": 1754
+ },
+ {
+ "epoch": 0.8297872340425532,
+ "grad_norm": 2.4645912647247314,
+ "learning_rate": 4.789828804891212e-06,
+ "loss": 0.5212,
+ "step": 1755
+ },
+ {
+ "epoch": 0.8302600472813239,
+ "grad_norm": 2.7482516765594482,
+ "learning_rate": 4.789578370265868e-06,
+ "loss": 0.6712,
+ "step": 1756
+ },
+ {
+ "epoch": 0.8307328605200945,
+ "grad_norm": 2.61360502243042,
+ "learning_rate": 4.7893277930785195e-06,
+ "loss": 0.6367,
+ "step": 1757
+ },
+ {
+ "epoch": 0.8312056737588652,
+ "grad_norm": 2.79028058052063,
+ "learning_rate": 4.789077073344767e-06,
+ "loss": 0.5099,
+ "step": 1758
+ },
+ {
+ "epoch": 0.8316784869976359,
+ "grad_norm": 2.647662401199341,
+ "learning_rate": 4.788826211080222e-06,
+ "loss": 0.6698,
+ "step": 1759
+ },
+ {
+ "epoch": 0.8321513002364066,
+ "grad_norm": 3.0214831829071045,
+ "learning_rate": 4.7885752063005055e-06,
+ "loss": 0.6121,
+ "step": 1760
+ },
+ {
+ "epoch": 0.8326241134751773,
+ "grad_norm": 2.8244032859802246,
+ "learning_rate": 4.788324059021247e-06,
+ "loss": 0.6921,
+ "step": 1761
+ },
+ {
+ "epoch": 0.833096926713948,
+ "grad_norm": 3.1501076221466064,
+ "learning_rate": 4.788072769258082e-06,
+ "loss": 0.6872,
+ "step": 1762
+ },
+ {
+ "epoch": 0.8335697399527187,
+ "grad_norm": 2.6989903450012207,
+ "learning_rate": 4.7878213370266594e-06,
+ "loss": 0.5884,
+ "step": 1763
+ },
+ {
+ "epoch": 0.8340425531914893,
+ "grad_norm": 2.6982665061950684,
+ "learning_rate": 4.787569762342633e-06,
+ "loss": 0.6112,
+ "step": 1764
+ },
+ {
+ "epoch": 0.83451536643026,
+ "grad_norm": 2.6918323040008545,
+ "learning_rate": 4.7873180452216685e-06,
+ "loss": 0.5315,
+ "step": 1765
+ },
+ {
+ "epoch": 0.8349881796690307,
+ "grad_norm": 2.5494401454925537,
+ "learning_rate": 4.78706618567944e-06,
+ "loss": 0.5909,
+ "step": 1766
+ },
+ {
+ "epoch": 0.8354609929078014,
+ "grad_norm": 2.7532095909118652,
+ "learning_rate": 4.786814183731627e-06,
+ "loss": 0.5566,
+ "step": 1767
+ },
+ {
+ "epoch": 0.8359338061465721,
+ "grad_norm": 2.550865888595581,
+ "learning_rate": 4.786562039393923e-06,
+ "loss": 0.555,
+ "step": 1768
+ },
+ {
+ "epoch": 0.8364066193853428,
+ "grad_norm": 2.4477791786193848,
+ "learning_rate": 4.786309752682028e-06,
+ "loss": 0.5844,
+ "step": 1769
+ },
+ {
+ "epoch": 0.8368794326241135,
+ "grad_norm": 2.6982262134552,
+ "learning_rate": 4.7860573236116485e-06,
+ "loss": 0.6136,
+ "step": 1770
+ },
+ {
+ "epoch": 0.8373522458628841,
+ "grad_norm": 2.456263542175293,
+ "learning_rate": 4.785804752198503e-06,
+ "loss": 0.5055,
+ "step": 1771
+ },
+ {
+ "epoch": 0.8378250591016548,
+ "grad_norm": 2.428544521331787,
+ "learning_rate": 4.78555203845832e-06,
+ "loss": 0.5859,
+ "step": 1772
+ },
+ {
+ "epoch": 0.8382978723404255,
+ "grad_norm": 2.1782307624816895,
+ "learning_rate": 4.785299182406833e-06,
+ "loss": 0.5325,
+ "step": 1773
+ },
+ {
+ "epoch": 0.8387706855791962,
+ "grad_norm": 3.137956142425537,
+ "learning_rate": 4.785046184059786e-06,
+ "loss": 0.6097,
+ "step": 1774
+ },
+ {
+ "epoch": 0.8392434988179669,
+ "grad_norm": 2.6269001960754395,
+ "learning_rate": 4.7847930434329336e-06,
+ "loss": 0.5972,
+ "step": 1775
+ },
+ {
+ "epoch": 0.8397163120567376,
+ "grad_norm": 2.732659339904785,
+ "learning_rate": 4.784539760542037e-06,
+ "loss": 0.6054,
+ "step": 1776
+ },
+ {
+ "epoch": 0.8401891252955083,
+ "grad_norm": 2.5346736907958984,
+ "learning_rate": 4.784286335402866e-06,
+ "loss": 0.5521,
+ "step": 1777
+ },
+ {
+ "epoch": 0.840661938534279,
+ "grad_norm": 3.1420228481292725,
+ "learning_rate": 4.784032768031202e-06,
+ "loss": 0.6165,
+ "step": 1778
+ },
+ {
+ "epoch": 0.8411347517730496,
+ "grad_norm": 3.073793411254883,
+ "learning_rate": 4.783779058442831e-06,
+ "loss": 0.6414,
+ "step": 1779
+ },
+ {
+ "epoch": 0.8416075650118203,
+ "grad_norm": 2.6621336936950684,
+ "learning_rate": 4.783525206653554e-06,
+ "loss": 0.5836,
+ "step": 1780
+ },
+ {
+ "epoch": 0.842080378250591,
+ "grad_norm": 2.7029049396514893,
+ "learning_rate": 4.7832712126791745e-06,
+ "loss": 0.5897,
+ "step": 1781
+ },
+ {
+ "epoch": 0.8425531914893617,
+ "grad_norm": 2.4733822345733643,
+ "learning_rate": 4.783017076535509e-06,
+ "loss": 0.5913,
+ "step": 1782
+ },
+ {
+ "epoch": 0.8430260047281324,
+ "grad_norm": 2.8119473457336426,
+ "learning_rate": 4.782762798238381e-06,
+ "loss": 0.6105,
+ "step": 1783
+ },
+ {
+ "epoch": 0.8434988179669031,
+ "grad_norm": 2.5290818214416504,
+ "learning_rate": 4.782508377803622e-06,
+ "loss": 0.6119,
+ "step": 1784
+ },
+ {
+ "epoch": 0.8439716312056738,
+ "grad_norm": 3.193472385406494,
+ "learning_rate": 4.782253815247076e-06,
+ "loss": 0.6665,
+ "step": 1785
+ },
+ {
+ "epoch": 0.8444444444444444,
+ "grad_norm": 3.206759452819824,
+ "learning_rate": 4.781999110584592e-06,
+ "loss": 0.6012,
+ "step": 1786
+ },
+ {
+ "epoch": 0.8449172576832151,
+ "grad_norm": 2.6227457523345947,
+ "learning_rate": 4.781744263832029e-06,
+ "loss": 0.5845,
+ "step": 1787
+ },
+ {
+ "epoch": 0.8453900709219858,
+ "grad_norm": 2.838365316390991,
+ "learning_rate": 4.781489275005257e-06,
+ "loss": 0.5695,
+ "step": 1788
+ },
+ {
+ "epoch": 0.8458628841607565,
+ "grad_norm": 2.8348326683044434,
+ "learning_rate": 4.78123414412015e-06,
+ "loss": 0.6136,
+ "step": 1789
+ },
+ {
+ "epoch": 0.8463356973995272,
+ "grad_norm": 2.5698344707489014,
+ "learning_rate": 4.780978871192597e-06,
+ "loss": 0.6576,
+ "step": 1790
+ },
+ {
+ "epoch": 0.8468085106382979,
+ "grad_norm": 2.5198330879211426,
+ "learning_rate": 4.780723456238492e-06,
+ "loss": 0.5521,
+ "step": 1791
+ },
+ {
+ "epoch": 0.8472813238770686,
+ "grad_norm": 3.001325845718384,
+ "learning_rate": 4.780467899273737e-06,
+ "loss": 0.6075,
+ "step": 1792
+ },
+ {
+ "epoch": 0.8477541371158392,
+ "grad_norm": 2.7732746601104736,
+ "learning_rate": 4.780212200314247e-06,
+ "loss": 0.6245,
+ "step": 1793
+ },
+ {
+ "epoch": 0.8482269503546099,
+ "grad_norm": 2.6950337886810303,
+ "learning_rate": 4.77995635937594e-06,
+ "loss": 0.5723,
+ "step": 1794
+ },
+ {
+ "epoch": 0.8486997635933806,
+ "grad_norm": 2.82051420211792,
+ "learning_rate": 4.779700376474749e-06,
+ "loss": 0.6184,
+ "step": 1795
+ },
+ {
+ "epoch": 0.8491725768321513,
+ "grad_norm": 2.757791757583618,
+ "learning_rate": 4.779444251626611e-06,
+ "loss": 0.608,
+ "step": 1796
+ },
+ {
+ "epoch": 0.849645390070922,
+ "grad_norm": 2.394108533859253,
+ "learning_rate": 4.779187984847475e-06,
+ "loss": 0.6174,
+ "step": 1797
+ },
+ {
+ "epoch": 0.8501182033096927,
+ "grad_norm": 2.427562713623047,
+ "learning_rate": 4.778931576153296e-06,
+ "loss": 0.5618,
+ "step": 1798
+ },
+ {
+ "epoch": 0.8505910165484634,
+ "grad_norm": 2.891268491744995,
+ "learning_rate": 4.778675025560042e-06,
+ "loss": 0.6865,
+ "step": 1799
+ },
+ {
+ "epoch": 0.851063829787234,
+ "grad_norm": 2.665534257888794,
+ "learning_rate": 4.778418333083685e-06,
+ "loss": 0.5852,
+ "step": 1800
+ },
+ {
+ "epoch": 0.8515366430260047,
+ "grad_norm": 2.5492889881134033,
+ "learning_rate": 4.7781614987402095e-06,
+ "loss": 0.5161,
+ "step": 1801
+ },
+ {
+ "epoch": 0.8520094562647754,
+ "grad_norm": 2.400177001953125,
+ "learning_rate": 4.777904522545607e-06,
+ "loss": 0.5128,
+ "step": 1802
+ },
+ {
+ "epoch": 0.8524822695035461,
+ "grad_norm": 2.3949809074401855,
+ "learning_rate": 4.777647404515878e-06,
+ "loss": 0.571,
+ "step": 1803
+ },
+ {
+ "epoch": 0.8529550827423168,
+ "grad_norm": 2.3624472618103027,
+ "learning_rate": 4.7773901446670325e-06,
+ "loss": 0.5486,
+ "step": 1804
+ },
+ {
+ "epoch": 0.8534278959810875,
+ "grad_norm": 2.711366891860962,
+ "learning_rate": 4.7771327430150885e-06,
+ "loss": 0.5667,
+ "step": 1805
+ },
+ {
+ "epoch": 0.8539007092198582,
+ "grad_norm": 2.7681493759155273,
+ "learning_rate": 4.776875199576073e-06,
+ "loss": 0.5686,
+ "step": 1806
+ },
+ {
+ "epoch": 0.8543735224586289,
+ "grad_norm": 3.0369436740875244,
+ "learning_rate": 4.776617514366023e-06,
+ "loss": 0.6635,
+ "step": 1807
+ },
+ {
+ "epoch": 0.8548463356973995,
+ "grad_norm": 2.919649600982666,
+ "learning_rate": 4.776359687400983e-06,
+ "loss": 0.5749,
+ "step": 1808
+ },
+ {
+ "epoch": 0.8553191489361702,
+ "grad_norm": 2.7986185550689697,
+ "learning_rate": 4.776101718697007e-06,
+ "loss": 0.559,
+ "step": 1809
+ },
+ {
+ "epoch": 0.8557919621749409,
+ "grad_norm": 2.5951223373413086,
+ "learning_rate": 4.775843608270158e-06,
+ "loss": 0.5654,
+ "step": 1810
+ },
+ {
+ "epoch": 0.8562647754137116,
+ "grad_norm": 2.674138069152832,
+ "learning_rate": 4.775585356136505e-06,
+ "loss": 0.5286,
+ "step": 1811
+ },
+ {
+ "epoch": 0.8567375886524823,
+ "grad_norm": 3.045437812805176,
+ "learning_rate": 4.775326962312131e-06,
+ "loss": 0.6185,
+ "step": 1812
+ },
+ {
+ "epoch": 0.857210401891253,
+ "grad_norm": 2.6145293712615967,
+ "learning_rate": 4.775068426813124e-06,
+ "loss": 0.6075,
+ "step": 1813
+ },
+ {
+ "epoch": 0.8576832151300237,
+ "grad_norm": 2.6320106983184814,
+ "learning_rate": 4.7748097496555824e-06,
+ "loss": 0.561,
+ "step": 1814
+ },
+ {
+ "epoch": 0.8581560283687943,
+ "grad_norm": 2.5038623809814453,
+ "learning_rate": 4.774550930855612e-06,
+ "loss": 0.593,
+ "step": 1815
+ },
+ {
+ "epoch": 0.858628841607565,
+ "grad_norm": 2.8168089389801025,
+ "learning_rate": 4.774291970429329e-06,
+ "loss": 0.5196,
+ "step": 1816
+ },
+ {
+ "epoch": 0.8591016548463357,
+ "grad_norm": 2.778130292892456,
+ "learning_rate": 4.774032868392858e-06,
+ "loss": 0.5984,
+ "step": 1817
+ },
+ {
+ "epoch": 0.8595744680851064,
+ "grad_norm": 2.536458730697632,
+ "learning_rate": 4.7737736247623305e-06,
+ "loss": 0.568,
+ "step": 1818
+ },
+ {
+ "epoch": 0.8600472813238771,
+ "grad_norm": 2.6669719219207764,
+ "learning_rate": 4.77351423955389e-06,
+ "loss": 0.6233,
+ "step": 1819
+ },
+ {
+ "epoch": 0.8605200945626478,
+ "grad_norm": 2.578242540359497,
+ "learning_rate": 4.773254712783687e-06,
+ "loss": 0.579,
+ "step": 1820
+ },
+ {
+ "epoch": 0.8609929078014185,
+ "grad_norm": 2.816664457321167,
+ "learning_rate": 4.772995044467881e-06,
+ "loss": 0.6635,
+ "step": 1821
+ },
+ {
+ "epoch": 0.8614657210401891,
+ "grad_norm": 3.1111979484558105,
+ "learning_rate": 4.77273523462264e-06,
+ "loss": 0.6372,
+ "step": 1822
+ },
+ {
+ "epoch": 0.8619385342789598,
+ "grad_norm": 2.764552354812622,
+ "learning_rate": 4.772475283264142e-06,
+ "loss": 0.6216,
+ "step": 1823
+ },
+ {
+ "epoch": 0.8624113475177305,
+ "grad_norm": 2.9126830101013184,
+ "learning_rate": 4.772215190408572e-06,
+ "loss": 0.6396,
+ "step": 1824
+ },
+ {
+ "epoch": 0.8628841607565012,
+ "grad_norm": 2.7502307891845703,
+ "learning_rate": 4.7719549560721264e-06,
+ "loss": 0.6186,
+ "step": 1825
+ },
+ {
+ "epoch": 0.8633569739952719,
+ "grad_norm": 2.6279006004333496,
+ "learning_rate": 4.771694580271007e-06,
+ "loss": 0.5557,
+ "step": 1826
+ },
+ {
+ "epoch": 0.8638297872340426,
+ "grad_norm": 2.996563196182251,
+ "learning_rate": 4.7714340630214276e-06,
+ "loss": 0.6259,
+ "step": 1827
+ },
+ {
+ "epoch": 0.8643026004728133,
+ "grad_norm": 3.231323480606079,
+ "learning_rate": 4.771173404339609e-06,
+ "loss": 0.5473,
+ "step": 1828
+ },
+ {
+ "epoch": 0.864775413711584,
+ "grad_norm": 3.143519878387451,
+ "learning_rate": 4.770912604241781e-06,
+ "loss": 0.593,
+ "step": 1829
+ },
+ {
+ "epoch": 0.8652482269503546,
+ "grad_norm": 2.515484094619751,
+ "learning_rate": 4.770651662744184e-06,
+ "loss": 0.538,
+ "step": 1830
+ },
+ {
+ "epoch": 0.8657210401891253,
+ "grad_norm": 2.629058837890625,
+ "learning_rate": 4.770390579863064e-06,
+ "loss": 0.5745,
+ "step": 1831
+ },
+ {
+ "epoch": 0.866193853427896,
+ "grad_norm": 2.5826802253723145,
+ "learning_rate": 4.770129355614677e-06,
+ "loss": 0.6397,
+ "step": 1832
+ },
+ {
+ "epoch": 0.8666666666666667,
+ "grad_norm": 2.954623222351074,
+ "learning_rate": 4.769867990015289e-06,
+ "loss": 0.6106,
+ "step": 1833
+ },
+ {
+ "epoch": 0.8671394799054374,
+ "grad_norm": 2.742192268371582,
+ "learning_rate": 4.769606483081175e-06,
+ "loss": 0.6902,
+ "step": 1834
+ },
+ {
+ "epoch": 0.8676122931442081,
+ "grad_norm": 2.2619097232818604,
+ "learning_rate": 4.769344834828618e-06,
+ "loss": 0.5414,
+ "step": 1835
+ },
+ {
+ "epoch": 0.8680851063829788,
+ "grad_norm": 2.7384188175201416,
+ "learning_rate": 4.769083045273908e-06,
+ "loss": 0.5787,
+ "step": 1836
+ },
+ {
+ "epoch": 0.8685579196217494,
+ "grad_norm": 2.6734485626220703,
+ "learning_rate": 4.768821114433346e-06,
+ "loss": 0.5923,
+ "step": 1837
+ },
+ {
+ "epoch": 0.8690307328605201,
+ "grad_norm": 2.286140203475952,
+ "learning_rate": 4.768559042323243e-06,
+ "loss": 0.5822,
+ "step": 1838
+ },
+ {
+ "epoch": 0.8695035460992908,
+ "grad_norm": 3.0243725776672363,
+ "learning_rate": 4.768296828959915e-06,
+ "loss": 0.6623,
+ "step": 1839
+ },
+ {
+ "epoch": 0.8699763593380615,
+ "grad_norm": 2.4026312828063965,
+ "learning_rate": 4.768034474359689e-06,
+ "loss": 0.5554,
+ "step": 1840
+ },
+ {
+ "epoch": 0.8704491725768322,
+ "grad_norm": 2.7469029426574707,
+ "learning_rate": 4.767771978538903e-06,
+ "loss": 0.6316,
+ "step": 1841
+ },
+ {
+ "epoch": 0.8709219858156029,
+ "grad_norm": 2.729659080505371,
+ "learning_rate": 4.767509341513899e-06,
+ "loss": 0.5807,
+ "step": 1842
+ },
+ {
+ "epoch": 0.8713947990543736,
+ "grad_norm": 2.5336945056915283,
+ "learning_rate": 4.76724656330103e-06,
+ "loss": 0.6109,
+ "step": 1843
+ },
+ {
+ "epoch": 0.8718676122931442,
+ "grad_norm": 2.519880533218384,
+ "learning_rate": 4.76698364391666e-06,
+ "loss": 0.5313,
+ "step": 1844
+ },
+ {
+ "epoch": 0.8723404255319149,
+ "grad_norm": 2.698862075805664,
+ "learning_rate": 4.766720583377159e-06,
+ "loss": 0.5953,
+ "step": 1845
+ },
+ {
+ "epoch": 0.8728132387706856,
+ "grad_norm": 3.0195560455322266,
+ "learning_rate": 4.766457381698907e-06,
+ "loss": 0.5965,
+ "step": 1846
+ },
+ {
+ "epoch": 0.8732860520094563,
+ "grad_norm": 2.5972697734832764,
+ "learning_rate": 4.766194038898291e-06,
+ "loss": 0.6014,
+ "step": 1847
+ },
+ {
+ "epoch": 0.873758865248227,
+ "grad_norm": 2.7132294178009033,
+ "learning_rate": 4.76593055499171e-06,
+ "loss": 0.5638,
+ "step": 1848
+ },
+ {
+ "epoch": 0.8742316784869977,
+ "grad_norm": 2.7134575843811035,
+ "learning_rate": 4.765666929995568e-06,
+ "loss": 0.52,
+ "step": 1849
+ },
+ {
+ "epoch": 0.8747044917257684,
+ "grad_norm": 2.3804993629455566,
+ "learning_rate": 4.765403163926282e-06,
+ "loss": 0.5435,
+ "step": 1850
+ },
+ {
+ "epoch": 0.875177304964539,
+ "grad_norm": 2.8782761096954346,
+ "learning_rate": 4.765139256800274e-06,
+ "loss": 0.5843,
+ "step": 1851
+ },
+ {
+ "epoch": 0.8756501182033097,
+ "grad_norm": 2.836209774017334,
+ "learning_rate": 4.764875208633977e-06,
+ "loss": 0.6667,
+ "step": 1852
+ },
+ {
+ "epoch": 0.8761229314420804,
+ "grad_norm": 2.608851194381714,
+ "learning_rate": 4.764611019443831e-06,
+ "loss": 0.5436,
+ "step": 1853
+ },
+ {
+ "epoch": 0.8765957446808511,
+ "grad_norm": 2.788738965988159,
+ "learning_rate": 4.764346689246288e-06,
+ "loss": 0.7331,
+ "step": 1854
+ },
+ {
+ "epoch": 0.8770685579196218,
+ "grad_norm": 2.524277687072754,
+ "learning_rate": 4.764082218057805e-06,
+ "loss": 0.5067,
+ "step": 1855
+ },
+ {
+ "epoch": 0.8775413711583925,
+ "grad_norm": 3.7559316158294678,
+ "learning_rate": 4.763817605894851e-06,
+ "loss": 0.6809,
+ "step": 1856
+ },
+ {
+ "epoch": 0.8780141843971632,
+ "grad_norm": 2.9070613384246826,
+ "learning_rate": 4.763552852773899e-06,
+ "loss": 0.5913,
+ "step": 1857
+ },
+ {
+ "epoch": 0.8784869976359339,
+ "grad_norm": 2.7050609588623047,
+ "learning_rate": 4.7632879587114386e-06,
+ "loss": 0.6074,
+ "step": 1858
+ },
+ {
+ "epoch": 0.8789598108747045,
+ "grad_norm": 2.891134262084961,
+ "learning_rate": 4.76302292372396e-06,
+ "loss": 0.5939,
+ "step": 1859
+ },
+ {
+ "epoch": 0.8794326241134752,
+ "grad_norm": 2.8581702709198,
+ "learning_rate": 4.762757747827968e-06,
+ "loss": 0.5972,
+ "step": 1860
+ },
+ {
+ "epoch": 0.8799054373522459,
+ "grad_norm": 2.8266196250915527,
+ "learning_rate": 4.762492431039971e-06,
+ "loss": 0.5993,
+ "step": 1861
+ },
+ {
+ "epoch": 0.8803782505910166,
+ "grad_norm": 2.4853954315185547,
+ "learning_rate": 4.762226973376493e-06,
+ "loss": 0.6388,
+ "step": 1862
+ },
+ {
+ "epoch": 0.8808510638297873,
+ "grad_norm": 3.2212886810302734,
+ "learning_rate": 4.761961374854059e-06,
+ "loss": 0.6698,
+ "step": 1863
+ },
+ {
+ "epoch": 0.881323877068558,
+ "grad_norm": 3.1254501342773438,
+ "learning_rate": 4.761695635489211e-06,
+ "loss": 0.5263,
+ "step": 1864
+ },
+ {
+ "epoch": 0.8817966903073287,
+ "grad_norm": 2.6891462802886963,
+ "learning_rate": 4.761429755298491e-06,
+ "loss": 0.5359,
+ "step": 1865
+ },
+ {
+ "epoch": 0.8822695035460993,
+ "grad_norm": 2.8557538986206055,
+ "learning_rate": 4.761163734298457e-06,
+ "loss": 0.5933,
+ "step": 1866
+ },
+ {
+ "epoch": 0.88274231678487,
+ "grad_norm": 2.53548264503479,
+ "learning_rate": 4.7608975725056724e-06,
+ "loss": 0.6397,
+ "step": 1867
+ },
+ {
+ "epoch": 0.8832151300236407,
+ "grad_norm": 3.0237956047058105,
+ "learning_rate": 4.76063126993671e-06,
+ "loss": 0.6845,
+ "step": 1868
+ },
+ {
+ "epoch": 0.8836879432624114,
+ "grad_norm": 3.222886800765991,
+ "learning_rate": 4.76036482660815e-06,
+ "loss": 0.6055,
+ "step": 1869
+ },
+ {
+ "epoch": 0.8841607565011821,
+ "grad_norm": 3.1867551803588867,
+ "learning_rate": 4.760098242536584e-06,
+ "loss": 0.6592,
+ "step": 1870
+ },
+ {
+ "epoch": 0.8846335697399527,
+ "grad_norm": 2.782209873199463,
+ "learning_rate": 4.7598315177386115e-06,
+ "loss": 0.5833,
+ "step": 1871
+ },
+ {
+ "epoch": 0.8851063829787233,
+ "grad_norm": 2.899871587753296,
+ "learning_rate": 4.759564652230838e-06,
+ "loss": 0.6129,
+ "step": 1872
+ },
+ {
+ "epoch": 0.885579196217494,
+ "grad_norm": 2.5690579414367676,
+ "learning_rate": 4.759297646029882e-06,
+ "loss": 0.5827,
+ "step": 1873
+ },
+ {
+ "epoch": 0.8860520094562647,
+ "grad_norm": 2.666130304336548,
+ "learning_rate": 4.759030499152368e-06,
+ "loss": 0.5272,
+ "step": 1874
+ },
+ {
+ "epoch": 0.8865248226950354,
+ "grad_norm": 2.7030911445617676,
+ "learning_rate": 4.758763211614932e-06,
+ "loss": 0.6415,
+ "step": 1875
+ },
+ {
+ "epoch": 0.8869976359338061,
+ "grad_norm": 2.717512845993042,
+ "learning_rate": 4.7584957834342135e-06,
+ "loss": 0.5827,
+ "step": 1876
+ },
+ {
+ "epoch": 0.8874704491725768,
+ "grad_norm": 2.665823459625244,
+ "learning_rate": 4.758228214626867e-06,
+ "loss": 0.6209,
+ "step": 1877
+ },
+ {
+ "epoch": 0.8879432624113475,
+ "grad_norm": 2.636653184890747,
+ "learning_rate": 4.75796050520955e-06,
+ "loss": 0.6413,
+ "step": 1878
+ },
+ {
+ "epoch": 0.8884160756501182,
+ "grad_norm": 2.585115671157837,
+ "learning_rate": 4.7576926551989345e-06,
+ "loss": 0.5518,
+ "step": 1879
+ },
+ {
+ "epoch": 0.8888888888888888,
+ "grad_norm": 2.808526039123535,
+ "learning_rate": 4.757424664611697e-06,
+ "loss": 0.5717,
+ "step": 1880
+ },
+ {
+ "epoch": 0.8893617021276595,
+ "grad_norm": 3.5957939624786377,
+ "learning_rate": 4.757156533464524e-06,
+ "loss": 0.6323,
+ "step": 1881
+ },
+ {
+ "epoch": 0.8898345153664302,
+ "grad_norm": 2.5003883838653564,
+ "learning_rate": 4.756888261774111e-06,
+ "loss": 0.5937,
+ "step": 1882
+ },
+ {
+ "epoch": 0.8903073286052009,
+ "grad_norm": 2.749061346054077,
+ "learning_rate": 4.756619849557161e-06,
+ "loss": 0.6642,
+ "step": 1883
+ },
+ {
+ "epoch": 0.8907801418439716,
+ "grad_norm": 2.6757891178131104,
+ "learning_rate": 4.756351296830389e-06,
+ "loss": 0.5887,
+ "step": 1884
+ },
+ {
+ "epoch": 0.8912529550827423,
+ "grad_norm": 2.811925172805786,
+ "learning_rate": 4.756082603610516e-06,
+ "loss": 0.6571,
+ "step": 1885
+ },
+ {
+ "epoch": 0.891725768321513,
+ "grad_norm": 2.5054616928100586,
+ "learning_rate": 4.755813769914271e-06,
+ "loss": 0.6312,
+ "step": 1886
+ },
+ {
+ "epoch": 0.8921985815602836,
+ "grad_norm": 2.7518467903137207,
+ "learning_rate": 4.755544795758395e-06,
+ "loss": 0.6685,
+ "step": 1887
+ },
+ {
+ "epoch": 0.8926713947990543,
+ "grad_norm": 2.7527287006378174,
+ "learning_rate": 4.755275681159634e-06,
+ "loss": 0.5886,
+ "step": 1888
+ },
+ {
+ "epoch": 0.893144208037825,
+ "grad_norm": 2.6162452697753906,
+ "learning_rate": 4.755006426134745e-06,
+ "loss": 0.546,
+ "step": 1889
+ },
+ {
+ "epoch": 0.8936170212765957,
+ "grad_norm": 2.4016737937927246,
+ "learning_rate": 4.754737030700495e-06,
+ "loss": 0.5726,
+ "step": 1890
+ },
+ {
+ "epoch": 0.8940898345153664,
+ "grad_norm": 2.528327703475952,
+ "learning_rate": 4.754467494873656e-06,
+ "loss": 0.5682,
+ "step": 1891
+ },
+ {
+ "epoch": 0.8945626477541371,
+ "grad_norm": 2.3139286041259766,
+ "learning_rate": 4.7541978186710115e-06,
+ "loss": 0.6108,
+ "step": 1892
+ },
+ {
+ "epoch": 0.8950354609929078,
+ "grad_norm": 2.7269136905670166,
+ "learning_rate": 4.753928002109354e-06,
+ "loss": 0.5875,
+ "step": 1893
+ },
+ {
+ "epoch": 0.8955082742316784,
+ "grad_norm": 4.425495147705078,
+ "learning_rate": 4.753658045205482e-06,
+ "loss": 0.5572,
+ "step": 1894
+ },
+ {
+ "epoch": 0.8959810874704491,
+ "grad_norm": 2.535409927368164,
+ "learning_rate": 4.753387947976206e-06,
+ "loss": 0.5868,
+ "step": 1895
+ },
+ {
+ "epoch": 0.8964539007092198,
+ "grad_norm": 2.722458600997925,
+ "learning_rate": 4.753117710438343e-06,
+ "loss": 0.5935,
+ "step": 1896
+ },
+ {
+ "epoch": 0.8969267139479905,
+ "grad_norm": 2.743861436843872,
+ "learning_rate": 4.75284733260872e-06,
+ "loss": 0.572,
+ "step": 1897
+ },
+ {
+ "epoch": 0.8973995271867612,
+ "grad_norm": 2.60640549659729,
+ "learning_rate": 4.752576814504173e-06,
+ "loss": 0.567,
+ "step": 1898
+ },
+ {
+ "epoch": 0.8978723404255319,
+ "grad_norm": 2.7486042976379395,
+ "learning_rate": 4.7523061561415435e-06,
+ "loss": 0.5768,
+ "step": 1899
+ },
+ {
+ "epoch": 0.8983451536643026,
+ "grad_norm": 3.8410251140594482,
+ "learning_rate": 4.752035357537686e-06,
+ "loss": 0.6034,
+ "step": 1900
+ },
+ {
+ "epoch": 0.8988179669030733,
+ "grad_norm": 3.0935890674591064,
+ "learning_rate": 4.751764418709462e-06,
+ "loss": 0.5644,
+ "step": 1901
+ },
+ {
+ "epoch": 0.8992907801418439,
+ "grad_norm": 2.7989892959594727,
+ "learning_rate": 4.751493339673742e-06,
+ "loss": 0.656,
+ "step": 1902
+ },
+ {
+ "epoch": 0.8997635933806146,
+ "grad_norm": 3.6940557956695557,
+ "learning_rate": 4.751222120447403e-06,
+ "loss": 0.6632,
+ "step": 1903
+ },
+ {
+ "epoch": 0.9002364066193853,
+ "grad_norm": 2.3428797721862793,
+ "learning_rate": 4.750950761047335e-06,
+ "loss": 0.4485,
+ "step": 1904
+ },
+ {
+ "epoch": 0.900709219858156,
+ "grad_norm": 2.622544050216675,
+ "learning_rate": 4.750679261490432e-06,
+ "loss": 0.5857,
+ "step": 1905
+ },
+ {
+ "epoch": 0.9011820330969267,
+ "grad_norm": 2.4911322593688965,
+ "learning_rate": 4.750407621793601e-06,
+ "loss": 0.5618,
+ "step": 1906
+ },
+ {
+ "epoch": 0.9016548463356974,
+ "grad_norm": 2.6434662342071533,
+ "learning_rate": 4.750135841973755e-06,
+ "loss": 0.6057,
+ "step": 1907
+ },
+ {
+ "epoch": 0.902127659574468,
+ "grad_norm": 3.115443706512451,
+ "learning_rate": 4.749863922047817e-06,
+ "loss": 0.6064,
+ "step": 1908
+ },
+ {
+ "epoch": 0.9026004728132387,
+ "grad_norm": 2.5671091079711914,
+ "learning_rate": 4.749591862032718e-06,
+ "loss": 0.5625,
+ "step": 1909
+ },
+ {
+ "epoch": 0.9030732860520094,
+ "grad_norm": 3.2008655071258545,
+ "learning_rate": 4.749319661945398e-06,
+ "loss": 0.5547,
+ "step": 1910
+ },
+ {
+ "epoch": 0.9035460992907801,
+ "grad_norm": 2.905987024307251,
+ "learning_rate": 4.749047321802805e-06,
+ "loss": 0.6033,
+ "step": 1911
+ },
+ {
+ "epoch": 0.9040189125295508,
+ "grad_norm": 3.1456053256988525,
+ "learning_rate": 4.748774841621897e-06,
+ "loss": 0.5651,
+ "step": 1912
+ },
+ {
+ "epoch": 0.9044917257683215,
+ "grad_norm": 2.8116416931152344,
+ "learning_rate": 4.748502221419641e-06,
+ "loss": 0.5853,
+ "step": 1913
+ },
+ {
+ "epoch": 0.9049645390070922,
+ "grad_norm": 3.123835325241089,
+ "learning_rate": 4.748229461213011e-06,
+ "loss": 0.5427,
+ "step": 1914
+ },
+ {
+ "epoch": 0.9054373522458629,
+ "grad_norm": 2.4750146865844727,
+ "learning_rate": 4.747956561018989e-06,
+ "loss": 0.6517,
+ "step": 1915
+ },
+ {
+ "epoch": 0.9059101654846335,
+ "grad_norm": 2.6174299716949463,
+ "learning_rate": 4.7476835208545705e-06,
+ "loss": 0.6119,
+ "step": 1916
+ },
+ {
+ "epoch": 0.9063829787234042,
+ "grad_norm": 2.7390382289886475,
+ "learning_rate": 4.747410340736755e-06,
+ "loss": 0.5664,
+ "step": 1917
+ },
+ {
+ "epoch": 0.9068557919621749,
+ "grad_norm": 2.7940444946289062,
+ "learning_rate": 4.747137020682552e-06,
+ "loss": 0.5628,
+ "step": 1918
+ },
+ {
+ "epoch": 0.9073286052009456,
+ "grad_norm": 2.477365016937256,
+ "learning_rate": 4.7468635607089795e-06,
+ "loss": 0.5261,
+ "step": 1919
+ },
+ {
+ "epoch": 0.9078014184397163,
+ "grad_norm": 2.7016685009002686,
+ "learning_rate": 4.746589960833066e-06,
+ "loss": 0.5576,
+ "step": 1920
+ },
+ {
+ "epoch": 0.908274231678487,
+ "grad_norm": 2.8806519508361816,
+ "learning_rate": 4.746316221071846e-06,
+ "loss": 0.5925,
+ "step": 1921
+ },
+ {
+ "epoch": 0.9087470449172577,
+ "grad_norm": 3.0315234661102295,
+ "learning_rate": 4.746042341442365e-06,
+ "loss": 0.6142,
+ "step": 1922
+ },
+ {
+ "epoch": 0.9092198581560283,
+ "grad_norm": 4.2446160316467285,
+ "learning_rate": 4.745768321961676e-06,
+ "loss": 0.5352,
+ "step": 1923
+ },
+ {
+ "epoch": 0.909692671394799,
+ "grad_norm": 2.6517012119293213,
+ "learning_rate": 4.745494162646841e-06,
+ "loss": 0.6118,
+ "step": 1924
+ },
+ {
+ "epoch": 0.9101654846335697,
+ "grad_norm": 2.774900197982788,
+ "learning_rate": 4.7452198635149304e-06,
+ "loss": 0.572,
+ "step": 1925
+ },
+ {
+ "epoch": 0.9106382978723404,
+ "grad_norm": 3.0133683681488037,
+ "learning_rate": 4.744945424583024e-06,
+ "loss": 0.5897,
+ "step": 1926
+ },
+ {
+ "epoch": 0.9111111111111111,
+ "grad_norm": 2.7344839572906494,
+ "learning_rate": 4.744670845868211e-06,
+ "loss": 0.6207,
+ "step": 1927
+ },
+ {
+ "epoch": 0.9115839243498818,
+ "grad_norm": 2.636578321456909,
+ "learning_rate": 4.744396127387586e-06,
+ "loss": 0.6687,
+ "step": 1928
+ },
+ {
+ "epoch": 0.9120567375886525,
+ "grad_norm": 2.8663458824157715,
+ "learning_rate": 4.744121269158255e-06,
+ "loss": 0.5002,
+ "step": 1929
+ },
+ {
+ "epoch": 0.9125295508274232,
+ "grad_norm": 2.661079168319702,
+ "learning_rate": 4.743846271197333e-06,
+ "loss": 0.5848,
+ "step": 1930
+ },
+ {
+ "epoch": 0.9130023640661938,
+ "grad_norm": 2.881256341934204,
+ "learning_rate": 4.743571133521943e-06,
+ "loss": 0.5911,
+ "step": 1931
+ },
+ {
+ "epoch": 0.9134751773049645,
+ "grad_norm": 2.5540573596954346,
+ "learning_rate": 4.743295856149217e-06,
+ "loss": 0.5647,
+ "step": 1932
+ },
+ {
+ "epoch": 0.9139479905437352,
+ "grad_norm": 2.7060387134552,
+ "learning_rate": 4.743020439096293e-06,
+ "loss": 0.6267,
+ "step": 1933
+ },
+ {
+ "epoch": 0.9144208037825059,
+ "grad_norm": 2.694481372833252,
+ "learning_rate": 4.742744882380323e-06,
+ "loss": 0.6283,
+ "step": 1934
+ },
+ {
+ "epoch": 0.9148936170212766,
+ "grad_norm": 2.711555242538452,
+ "learning_rate": 4.7424691860184625e-06,
+ "loss": 0.5784,
+ "step": 1935
+ },
+ {
+ "epoch": 0.9153664302600473,
+ "grad_norm": 2.9077224731445312,
+ "learning_rate": 4.742193350027879e-06,
+ "loss": 0.5948,
+ "step": 1936
+ },
+ {
+ "epoch": 0.915839243498818,
+ "grad_norm": 2.9824187755584717,
+ "learning_rate": 4.7419173744257476e-06,
+ "loss": 0.6115,
+ "step": 1937
+ },
+ {
+ "epoch": 0.9163120567375886,
+ "grad_norm": 2.5127830505371094,
+ "learning_rate": 4.7416412592292515e-06,
+ "loss": 0.5803,
+ "step": 1938
+ },
+ {
+ "epoch": 0.9167848699763593,
+ "grad_norm": 3.1307175159454346,
+ "learning_rate": 4.741365004455583e-06,
+ "loss": 0.5657,
+ "step": 1939
+ },
+ {
+ "epoch": 0.91725768321513,
+ "grad_norm": 2.8205273151397705,
+ "learning_rate": 4.741088610121944e-06,
+ "loss": 0.6145,
+ "step": 1940
+ },
+ {
+ "epoch": 0.9177304964539007,
+ "grad_norm": 2.6119720935821533,
+ "learning_rate": 4.7408120762455444e-06,
+ "loss": 0.6058,
+ "step": 1941
+ },
+ {
+ "epoch": 0.9182033096926714,
+ "grad_norm": 2.421276092529297,
+ "learning_rate": 4.7405354028436025e-06,
+ "loss": 0.5973,
+ "step": 1942
+ },
+ {
+ "epoch": 0.9186761229314421,
+ "grad_norm": 2.9846808910369873,
+ "learning_rate": 4.740258589933346e-06,
+ "loss": 0.6892,
+ "step": 1943
+ },
+ {
+ "epoch": 0.9191489361702128,
+ "grad_norm": 2.6899871826171875,
+ "learning_rate": 4.739981637532009e-06,
+ "loss": 0.5705,
+ "step": 1944
+ },
+ {
+ "epoch": 0.9196217494089834,
+ "grad_norm": 2.8636131286621094,
+ "learning_rate": 4.739704545656839e-06,
+ "loss": 0.5775,
+ "step": 1945
+ },
+ {
+ "epoch": 0.9200945626477541,
+ "grad_norm": 2.7659449577331543,
+ "learning_rate": 4.739427314325087e-06,
+ "loss": 0.5823,
+ "step": 1946
+ },
+ {
+ "epoch": 0.9205673758865248,
+ "grad_norm": 4.71295166015625,
+ "learning_rate": 4.739149943554016e-06,
+ "loss": 0.5601,
+ "step": 1947
+ },
+ {
+ "epoch": 0.9210401891252955,
+ "grad_norm": 2.642636775970459,
+ "learning_rate": 4.738872433360896e-06,
+ "loss": 0.5278,
+ "step": 1948
+ },
+ {
+ "epoch": 0.9215130023640662,
+ "grad_norm": 2.4658217430114746,
+ "learning_rate": 4.7385947837630065e-06,
+ "loss": 0.6392,
+ "step": 1949
+ },
+ {
+ "epoch": 0.9219858156028369,
+ "grad_norm": 2.851602792739868,
+ "learning_rate": 4.738316994777636e-06,
+ "loss": 0.6164,
+ "step": 1950
+ },
+ {
+ "epoch": 0.9224586288416076,
+ "grad_norm": 2.394226551055908,
+ "learning_rate": 4.738039066422081e-06,
+ "loss": 0.5556,
+ "step": 1951
+ },
+ {
+ "epoch": 0.9229314420803783,
+ "grad_norm": 2.7985100746154785,
+ "learning_rate": 4.737760998713647e-06,
+ "loss": 0.5799,
+ "step": 1952
+ },
+ {
+ "epoch": 0.9234042553191489,
+ "grad_norm": 2.5974674224853516,
+ "learning_rate": 4.737482791669648e-06,
+ "loss": 0.6984,
+ "step": 1953
+ },
+ {
+ "epoch": 0.9238770685579196,
+ "grad_norm": 2.707636594772339,
+ "learning_rate": 4.737204445307406e-06,
+ "loss": 0.5548,
+ "step": 1954
+ },
+ {
+ "epoch": 0.9243498817966903,
+ "grad_norm": 2.7882707118988037,
+ "learning_rate": 4.736925959644254e-06,
+ "loss": 0.6026,
+ "step": 1955
+ },
+ {
+ "epoch": 0.924822695035461,
+ "grad_norm": 2.474482774734497,
+ "learning_rate": 4.7366473346975304e-06,
+ "loss": 0.5832,
+ "step": 1956
+ },
+ {
+ "epoch": 0.9252955082742317,
+ "grad_norm": 2.6196324825286865,
+ "learning_rate": 4.736368570484585e-06,
+ "loss": 0.5861,
+ "step": 1957
+ },
+ {
+ "epoch": 0.9257683215130024,
+ "grad_norm": 2.826864004135132,
+ "learning_rate": 4.736089667022775e-06,
+ "loss": 0.6173,
+ "step": 1958
+ },
+ {
+ "epoch": 0.926241134751773,
+ "grad_norm": 2.414473056793213,
+ "learning_rate": 4.735810624329466e-06,
+ "loss": 0.5753,
+ "step": 1959
+ },
+ {
+ "epoch": 0.9267139479905437,
+ "grad_norm": 2.8037970066070557,
+ "learning_rate": 4.7355314424220335e-06,
+ "loss": 0.6207,
+ "step": 1960
+ },
+ {
+ "epoch": 0.9271867612293144,
+ "grad_norm": 2.645458698272705,
+ "learning_rate": 4.735252121317861e-06,
+ "loss": 0.5959,
+ "step": 1961
+ },
+ {
+ "epoch": 0.9276595744680851,
+ "grad_norm": 2.7983884811401367,
+ "learning_rate": 4.734972661034339e-06,
+ "loss": 0.5696,
+ "step": 1962
+ },
+ {
+ "epoch": 0.9281323877068558,
+ "grad_norm": 3.0568997859954834,
+ "learning_rate": 4.73469306158887e-06,
+ "loss": 0.6194,
+ "step": 1963
+ },
+ {
+ "epoch": 0.9286052009456265,
+ "grad_norm": 2.7205135822296143,
+ "learning_rate": 4.734413322998863e-06,
+ "loss": 0.5292,
+ "step": 1964
+ },
+ {
+ "epoch": 0.9290780141843972,
+ "grad_norm": 3.3168489933013916,
+ "learning_rate": 4.734133445281735e-06,
+ "loss": 0.5654,
+ "step": 1965
+ },
+ {
+ "epoch": 0.9295508274231679,
+ "grad_norm": 3.0095653533935547,
+ "learning_rate": 4.733853428454916e-06,
+ "loss": 0.6508,
+ "step": 1966
+ },
+ {
+ "epoch": 0.9300236406619385,
+ "grad_norm": 2.7726712226867676,
+ "learning_rate": 4.733573272535838e-06,
+ "loss": 0.644,
+ "step": 1967
+ },
+ {
+ "epoch": 0.9304964539007092,
+ "grad_norm": 2.474397659301758,
+ "learning_rate": 4.7332929775419456e-06,
+ "loss": 0.5479,
+ "step": 1968
+ },
+ {
+ "epoch": 0.9309692671394799,
+ "grad_norm": 2.4518635272979736,
+ "learning_rate": 4.733012543490693e-06,
+ "loss": 0.6,
+ "step": 1969
+ },
+ {
+ "epoch": 0.9314420803782506,
+ "grad_norm": 2.9292192459106445,
+ "learning_rate": 4.73273197039954e-06,
+ "loss": 0.6647,
+ "step": 1970
+ },
+ {
+ "epoch": 0.9319148936170213,
+ "grad_norm": 2.425004720687866,
+ "learning_rate": 4.732451258285958e-06,
+ "loss": 0.6338,
+ "step": 1971
+ },
+ {
+ "epoch": 0.932387706855792,
+ "grad_norm": 2.904479503631592,
+ "learning_rate": 4.7321704071674255e-06,
+ "loss": 0.5923,
+ "step": 1972
+ },
+ {
+ "epoch": 0.9328605200945627,
+ "grad_norm": 2.477085590362549,
+ "learning_rate": 4.731889417061428e-06,
+ "loss": 0.5984,
+ "step": 1973
+ },
+ {
+ "epoch": 0.9333333333333333,
+ "grad_norm": 2.585240364074707,
+ "learning_rate": 4.731608287985465e-06,
+ "loss": 0.558,
+ "step": 1974
+ },
+ {
+ "epoch": 0.933806146572104,
+ "grad_norm": 2.658714532852173,
+ "learning_rate": 4.731327019957039e-06,
+ "loss": 0.5567,
+ "step": 1975
+ },
+ {
+ "epoch": 0.9342789598108747,
+ "grad_norm": 2.7593026161193848,
+ "learning_rate": 4.731045612993662e-06,
+ "loss": 0.5772,
+ "step": 1976
+ },
+ {
+ "epoch": 0.9347517730496454,
+ "grad_norm": 2.4386026859283447,
+ "learning_rate": 4.7307640671128585e-06,
+ "loss": 0.6199,
+ "step": 1977
+ },
+ {
+ "epoch": 0.9352245862884161,
+ "grad_norm": 2.681910514831543,
+ "learning_rate": 4.730482382332158e-06,
+ "loss": 0.5971,
+ "step": 1978
+ },
+ {
+ "epoch": 0.9356973995271868,
+ "grad_norm": 3.7593860626220703,
+ "learning_rate": 4.7302005586691e-06,
+ "loss": 0.6346,
+ "step": 1979
+ },
+ {
+ "epoch": 0.9361702127659575,
+ "grad_norm": 2.5789096355438232,
+ "learning_rate": 4.729918596141232e-06,
+ "loss": 0.5684,
+ "step": 1980
+ },
+ {
+ "epoch": 0.9366430260047282,
+ "grad_norm": 3.0607335567474365,
+ "learning_rate": 4.729636494766111e-06,
+ "loss": 0.6223,
+ "step": 1981
+ },
+ {
+ "epoch": 0.9371158392434988,
+ "grad_norm": 2.906643867492676,
+ "learning_rate": 4.729354254561303e-06,
+ "loss": 0.6513,
+ "step": 1982
+ },
+ {
+ "epoch": 0.9375886524822695,
+ "grad_norm": 3.192430019378662,
+ "learning_rate": 4.7290718755443795e-06,
+ "loss": 0.5095,
+ "step": 1983
+ },
+ {
+ "epoch": 0.9380614657210402,
+ "grad_norm": 2.661536931991577,
+ "learning_rate": 4.7287893577329255e-06,
+ "loss": 0.5525,
+ "step": 1984
+ },
+ {
+ "epoch": 0.9385342789598109,
+ "grad_norm": 2.8436734676361084,
+ "learning_rate": 4.728506701144531e-06,
+ "loss": 0.6323,
+ "step": 1985
+ },
+ {
+ "epoch": 0.9390070921985816,
+ "grad_norm": 2.75544810295105,
+ "learning_rate": 4.728223905796796e-06,
+ "loss": 0.6018,
+ "step": 1986
+ },
+ {
+ "epoch": 0.9394799054373523,
+ "grad_norm": 3.0652759075164795,
+ "learning_rate": 4.727940971707329e-06,
+ "loss": 0.62,
+ "step": 1987
+ },
+ {
+ "epoch": 0.939952718676123,
+ "grad_norm": 2.802567720413208,
+ "learning_rate": 4.727657898893747e-06,
+ "loss": 0.5809,
+ "step": 1988
+ },
+ {
+ "epoch": 0.9404255319148936,
+ "grad_norm": 2.6208512783050537,
+ "learning_rate": 4.7273746873736745e-06,
+ "loss": 0.5762,
+ "step": 1989
+ },
+ {
+ "epoch": 0.9408983451536643,
+ "grad_norm": 2.5901873111724854,
+ "learning_rate": 4.727091337164748e-06,
+ "loss": 0.6111,
+ "step": 1990
+ },
+ {
+ "epoch": 0.941371158392435,
+ "grad_norm": 3.002347707748413,
+ "learning_rate": 4.726807848284609e-06,
+ "loss": 0.6419,
+ "step": 1991
+ },
+ {
+ "epoch": 0.9418439716312057,
+ "grad_norm": 2.522151470184326,
+ "learning_rate": 4.72652422075091e-06,
+ "loss": 0.642,
+ "step": 1992
+ },
+ {
+ "epoch": 0.9423167848699764,
+ "grad_norm": 2.5571532249450684,
+ "learning_rate": 4.726240454581311e-06,
+ "loss": 0.5729,
+ "step": 1993
+ },
+ {
+ "epoch": 0.9427895981087471,
+ "grad_norm": 2.7704918384552,
+ "learning_rate": 4.72595654979348e-06,
+ "loss": 0.6816,
+ "step": 1994
+ },
+ {
+ "epoch": 0.9432624113475178,
+ "grad_norm": 2.517040491104126,
+ "learning_rate": 4.7256725064050955e-06,
+ "loss": 0.5782,
+ "step": 1995
+ },
+ {
+ "epoch": 0.9437352245862884,
+ "grad_norm": 2.613955020904541,
+ "learning_rate": 4.725388324433843e-06,
+ "loss": 0.6291,
+ "step": 1996
+ },
+ {
+ "epoch": 0.9442080378250591,
+ "grad_norm": 2.848891258239746,
+ "learning_rate": 4.725104003897418e-06,
+ "loss": 0.6544,
+ "step": 1997
+ },
+ {
+ "epoch": 0.9446808510638298,
+ "grad_norm": 3.0162429809570312,
+ "learning_rate": 4.724819544813523e-06,
+ "loss": 0.6301,
+ "step": 1998
+ },
+ {
+ "epoch": 0.9451536643026005,
+ "grad_norm": 2.613614559173584,
+ "learning_rate": 4.72453494719987e-06,
+ "loss": 0.5829,
+ "step": 1999
+ },
+ {
+ "epoch": 0.9456264775413712,
+ "grad_norm": 2.4838767051696777,
+ "learning_rate": 4.724250211074182e-06,
+ "loss": 0.6042,
+ "step": 2000
+ },
+ {
+ "epoch": 0.9460992907801419,
+ "grad_norm": 2.526470899581909,
+ "learning_rate": 4.723965336454185e-06,
+ "loss": 0.6167,
+ "step": 2001
+ },
+ {
+ "epoch": 0.9465721040189126,
+ "grad_norm": 2.504506826400757,
+ "learning_rate": 4.723680323357618e-06,
+ "loss": 0.6061,
+ "step": 2002
+ },
+ {
+ "epoch": 0.9470449172576832,
+ "grad_norm": 3.0547544956207275,
+ "learning_rate": 4.723395171802228e-06,
+ "loss": 0.6619,
+ "step": 2003
+ },
+ {
+ "epoch": 0.9475177304964539,
+ "grad_norm": 2.8692407608032227,
+ "learning_rate": 4.723109881805771e-06,
+ "loss": 0.5985,
+ "step": 2004
+ },
+ {
+ "epoch": 0.9479905437352246,
+ "grad_norm": 2.7929654121398926,
+ "learning_rate": 4.7228244533860094e-06,
+ "loss": 0.5869,
+ "step": 2005
+ },
+ {
+ "epoch": 0.9484633569739953,
+ "grad_norm": 2.764869451522827,
+ "learning_rate": 4.7225388865607146e-06,
+ "loss": 0.6288,
+ "step": 2006
+ },
+ {
+ "epoch": 0.948936170212766,
+ "grad_norm": 2.7656404972076416,
+ "learning_rate": 4.722253181347671e-06,
+ "loss": 0.5831,
+ "step": 2007
+ },
+ {
+ "epoch": 0.9494089834515367,
+ "grad_norm": 2.6698336601257324,
+ "learning_rate": 4.7219673377646635e-06,
+ "loss": 0.6087,
+ "step": 2008
+ },
+ {
+ "epoch": 0.9498817966903074,
+ "grad_norm": 2.524935722351074,
+ "learning_rate": 4.7216813558294946e-06,
+ "loss": 0.5675,
+ "step": 2009
+ },
+ {
+ "epoch": 0.950354609929078,
+ "grad_norm": 2.5998785495758057,
+ "learning_rate": 4.721395235559969e-06,
+ "loss": 0.5667,
+ "step": 2010
+ },
+ {
+ "epoch": 0.9508274231678487,
+ "grad_norm": 2.758021354675293,
+ "learning_rate": 4.721108976973902e-06,
+ "loss": 0.4931,
+ "step": 2011
+ },
+ {
+ "epoch": 0.9513002364066194,
+ "grad_norm": 2.767695903778076,
+ "learning_rate": 4.72082258008912e-06,
+ "loss": 0.5778,
+ "step": 2012
+ },
+ {
+ "epoch": 0.9517730496453901,
+ "grad_norm": 2.982314348220825,
+ "learning_rate": 4.720536044923453e-06,
+ "loss": 0.6096,
+ "step": 2013
+ },
+ {
+ "epoch": 0.9522458628841608,
+ "grad_norm": 2.7608799934387207,
+ "learning_rate": 4.720249371494743e-06,
+ "loss": 0.6242,
+ "step": 2014
+ },
+ {
+ "epoch": 0.9527186761229315,
+ "grad_norm": 2.60054349899292,
+ "learning_rate": 4.71996255982084e-06,
+ "loss": 0.6249,
+ "step": 2015
+ },
+ {
+ "epoch": 0.9531914893617022,
+ "grad_norm": 2.654355764389038,
+ "learning_rate": 4.719675609919603e-06,
+ "loss": 0.6327,
+ "step": 2016
+ },
+ {
+ "epoch": 0.9536643026004729,
+ "grad_norm": 2.589404582977295,
+ "learning_rate": 4.719388521808899e-06,
+ "loss": 0.6357,
+ "step": 2017
+ },
+ {
+ "epoch": 0.9541371158392435,
+ "grad_norm": 2.8016581535339355,
+ "learning_rate": 4.719101295506603e-06,
+ "loss": 0.5901,
+ "step": 2018
+ },
+ {
+ "epoch": 0.9546099290780142,
+ "grad_norm": 3.1408045291900635,
+ "learning_rate": 4.7188139310306e-06,
+ "loss": 0.598,
+ "step": 2019
+ },
+ {
+ "epoch": 0.9550827423167849,
+ "grad_norm": 2.7432665824890137,
+ "learning_rate": 4.718526428398783e-06,
+ "loss": 0.5508,
+ "step": 2020
+ },
+ {
+ "epoch": 0.9555555555555556,
+ "grad_norm": 2.947800874710083,
+ "learning_rate": 4.718238787629053e-06,
+ "loss": 0.6439,
+ "step": 2021
+ },
+ {
+ "epoch": 0.9560283687943263,
+ "grad_norm": 2.50828218460083,
+ "learning_rate": 4.71795100873932e-06,
+ "loss": 0.5441,
+ "step": 2022
+ },
+ {
+ "epoch": 0.956501182033097,
+ "grad_norm": 2.8558974266052246,
+ "learning_rate": 4.717663091747503e-06,
+ "loss": 0.5416,
+ "step": 2023
+ },
+ {
+ "epoch": 0.9569739952718677,
+ "grad_norm": 2.4803316593170166,
+ "learning_rate": 4.71737503667153e-06,
+ "loss": 0.5317,
+ "step": 2024
+ },
+ {
+ "epoch": 0.9574468085106383,
+ "grad_norm": 4.36754035949707,
+ "learning_rate": 4.717086843529336e-06,
+ "loss": 0.5808,
+ "step": 2025
+ },
+ {
+ "epoch": 0.957919621749409,
+ "grad_norm": 2.730185031890869,
+ "learning_rate": 4.7167985123388665e-06,
+ "loss": 0.5257,
+ "step": 2026
+ },
+ {
+ "epoch": 0.9583924349881797,
+ "grad_norm": 2.8136069774627686,
+ "learning_rate": 4.716510043118074e-06,
+ "loss": 0.5836,
+ "step": 2027
+ },
+ {
+ "epoch": 0.9588652482269504,
+ "grad_norm": 2.793975353240967,
+ "learning_rate": 4.71622143588492e-06,
+ "loss": 0.5706,
+ "step": 2028
+ },
+ {
+ "epoch": 0.9593380614657211,
+ "grad_norm": 2.3883821964263916,
+ "learning_rate": 4.7159326906573745e-06,
+ "loss": 0.5291,
+ "step": 2029
+ },
+ {
+ "epoch": 0.9598108747044918,
+ "grad_norm": 2.6135976314544678,
+ "learning_rate": 4.715643807453417e-06,
+ "loss": 0.6199,
+ "step": 2030
+ },
+ {
+ "epoch": 0.9602836879432625,
+ "grad_norm": 2.6245670318603516,
+ "learning_rate": 4.715354786291035e-06,
+ "loss": 0.5585,
+ "step": 2031
+ },
+ {
+ "epoch": 0.9607565011820332,
+ "grad_norm": 2.7870967388153076,
+ "learning_rate": 4.715065627188225e-06,
+ "loss": 0.6196,
+ "step": 2032
+ },
+ {
+ "epoch": 0.9612293144208038,
+ "grad_norm": 2.6983911991119385,
+ "learning_rate": 4.714776330162991e-06,
+ "loss": 0.6424,
+ "step": 2033
+ },
+ {
+ "epoch": 0.9617021276595744,
+ "grad_norm": 2.3221919536590576,
+ "learning_rate": 4.7144868952333465e-06,
+ "loss": 0.568,
+ "step": 2034
+ },
+ {
+ "epoch": 0.9621749408983451,
+ "grad_norm": 2.9408178329467773,
+ "learning_rate": 4.714197322417314e-06,
+ "loss": 0.6175,
+ "step": 2035
+ },
+ {
+ "epoch": 0.9626477541371158,
+ "grad_norm": 2.404057264328003,
+ "learning_rate": 4.713907611732921e-06,
+ "loss": 0.4943,
+ "step": 2036
+ },
+ {
+ "epoch": 0.9631205673758865,
+ "grad_norm": 3.547607660293579,
+ "learning_rate": 4.71361776319821e-06,
+ "loss": 0.5488,
+ "step": 2037
+ },
+ {
+ "epoch": 0.9635933806146572,
+ "grad_norm": 2.679614543914795,
+ "learning_rate": 4.713327776831227e-06,
+ "loss": 0.6234,
+ "step": 2038
+ },
+ {
+ "epoch": 0.9640661938534278,
+ "grad_norm": 2.526914119720459,
+ "learning_rate": 4.7130376526500286e-06,
+ "loss": 0.5891,
+ "step": 2039
+ },
+ {
+ "epoch": 0.9645390070921985,
+ "grad_norm": 2.6953470706939697,
+ "learning_rate": 4.71274739067268e-06,
+ "loss": 0.69,
+ "step": 2040
+ },
+ {
+ "epoch": 0.9650118203309692,
+ "grad_norm": 2.546660900115967,
+ "learning_rate": 4.712456990917254e-06,
+ "loss": 0.6185,
+ "step": 2041
+ },
+ {
+ "epoch": 0.9654846335697399,
+ "grad_norm": 3.3920490741729736,
+ "learning_rate": 4.712166453401832e-06,
+ "loss": 0.587,
+ "step": 2042
+ },
+ {
+ "epoch": 0.9659574468085106,
+ "grad_norm": 2.5961573123931885,
+ "learning_rate": 4.711875778144504e-06,
+ "loss": 0.6105,
+ "step": 2043
+ },
+ {
+ "epoch": 0.9664302600472813,
+ "grad_norm": 2.5111498832702637,
+ "learning_rate": 4.711584965163372e-06,
+ "loss": 0.5533,
+ "step": 2044
+ },
+ {
+ "epoch": 0.966903073286052,
+ "grad_norm": 2.4878132343292236,
+ "learning_rate": 4.7112940144765405e-06,
+ "loss": 0.5604,
+ "step": 2045
+ },
+ {
+ "epoch": 0.9673758865248226,
+ "grad_norm": 2.5714077949523926,
+ "learning_rate": 4.711002926102128e-06,
+ "loss": 0.5794,
+ "step": 2046
+ },
+ {
+ "epoch": 0.9678486997635933,
+ "grad_norm": 2.7069091796875,
+ "learning_rate": 4.710711700058257e-06,
+ "loss": 0.594,
+ "step": 2047
+ },
+ {
+ "epoch": 0.968321513002364,
+ "grad_norm": 2.8104631900787354,
+ "learning_rate": 4.710420336363063e-06,
+ "loss": 0.6247,
+ "step": 2048
+ },
+ {
+ "epoch": 0.9687943262411347,
+ "grad_norm": 2.8464386463165283,
+ "learning_rate": 4.7101288350346865e-06,
+ "loss": 0.6162,
+ "step": 2049
+ },
+ {
+ "epoch": 0.9692671394799054,
+ "grad_norm": 2.7187976837158203,
+ "learning_rate": 4.709837196091279e-06,
+ "loss": 0.6109,
+ "step": 2050
+ },
+ {
+ "epoch": 0.9697399527186761,
+ "grad_norm": 2.556734085083008,
+ "learning_rate": 4.709545419550999e-06,
+ "loss": 0.6297,
+ "step": 2051
+ },
+ {
+ "epoch": 0.9702127659574468,
+ "grad_norm": 2.937195062637329,
+ "learning_rate": 4.709253505432014e-06,
+ "loss": 0.6862,
+ "step": 2052
+ },
+ {
+ "epoch": 0.9706855791962175,
+ "grad_norm": 2.792175531387329,
+ "learning_rate": 4.7089614537525015e-06,
+ "loss": 0.6105,
+ "step": 2053
+ },
+ {
+ "epoch": 0.9711583924349881,
+ "grad_norm": 2.625636100769043,
+ "learning_rate": 4.708669264530644e-06,
+ "loss": 0.5849,
+ "step": 2054
+ },
+ {
+ "epoch": 0.9716312056737588,
+ "grad_norm": 2.6752610206604004,
+ "learning_rate": 4.708376937784637e-06,
+ "loss": 0.5949,
+ "step": 2055
+ },
+ {
+ "epoch": 0.9721040189125295,
+ "grad_norm": 2.6072793006896973,
+ "learning_rate": 4.708084473532681e-06,
+ "loss": 0.5776,
+ "step": 2056
+ },
+ {
+ "epoch": 0.9725768321513002,
+ "grad_norm": 2.728632926940918,
+ "learning_rate": 4.707791871792988e-06,
+ "loss": 0.6352,
+ "step": 2057
+ },
+ {
+ "epoch": 0.9730496453900709,
+ "grad_norm": 2.5841758251190186,
+ "learning_rate": 4.707499132583775e-06,
+ "loss": 0.5488,
+ "step": 2058
+ },
+ {
+ "epoch": 0.9735224586288416,
+ "grad_norm": 2.8464293479919434,
+ "learning_rate": 4.707206255923271e-06,
+ "loss": 0.7051,
+ "step": 2059
+ },
+ {
+ "epoch": 0.9739952718676123,
+ "grad_norm": 2.547297239303589,
+ "learning_rate": 4.706913241829712e-06,
+ "loss": 0.5937,
+ "step": 2060
+ },
+ {
+ "epoch": 0.9744680851063829,
+ "grad_norm": 2.6572306156158447,
+ "learning_rate": 4.706620090321341e-06,
+ "loss": 0.6041,
+ "step": 2061
+ },
+ {
+ "epoch": 0.9749408983451536,
+ "grad_norm": 2.3262805938720703,
+ "learning_rate": 4.706326801416414e-06,
+ "loss": 0.5144,
+ "step": 2062
+ },
+ {
+ "epoch": 0.9754137115839243,
+ "grad_norm": 2.9693965911865234,
+ "learning_rate": 4.706033375133191e-06,
+ "loss": 0.551,
+ "step": 2063
+ },
+ {
+ "epoch": 0.975886524822695,
+ "grad_norm": 2.5993731021881104,
+ "learning_rate": 4.7057398114899435e-06,
+ "loss": 0.6143,
+ "step": 2064
+ },
+ {
+ "epoch": 0.9763593380614657,
+ "grad_norm": 2.453336477279663,
+ "learning_rate": 4.70544611050495e-06,
+ "loss": 0.6093,
+ "step": 2065
+ },
+ {
+ "epoch": 0.9768321513002364,
+ "grad_norm": 2.898629665374756,
+ "learning_rate": 4.705152272196497e-06,
+ "loss": 0.6007,
+ "step": 2066
+ },
+ {
+ "epoch": 0.9773049645390071,
+ "grad_norm": 2.7990612983703613,
+ "learning_rate": 4.7048582965828815e-06,
+ "loss": 0.6687,
+ "step": 2067
+ },
+ {
+ "epoch": 0.9777777777777777,
+ "grad_norm": 2.635284423828125,
+ "learning_rate": 4.704564183682408e-06,
+ "loss": 0.5564,
+ "step": 2068
+ },
+ {
+ "epoch": 0.9782505910165484,
+ "grad_norm": 3.014547109603882,
+ "learning_rate": 4.704269933513389e-06,
+ "loss": 0.6084,
+ "step": 2069
+ },
+ {
+ "epoch": 0.9787234042553191,
+ "grad_norm": 2.659357786178589,
+ "learning_rate": 4.703975546094147e-06,
+ "loss": 0.6031,
+ "step": 2070
+ },
+ {
+ "epoch": 0.9791962174940898,
+ "grad_norm": 2.326932668685913,
+ "learning_rate": 4.703681021443013e-06,
+ "loss": 0.5859,
+ "step": 2071
+ },
+ {
+ "epoch": 0.9796690307328605,
+ "grad_norm": 2.958803653717041,
+ "learning_rate": 4.7033863595783235e-06,
+ "loss": 0.5586,
+ "step": 2072
+ },
+ {
+ "epoch": 0.9801418439716312,
+ "grad_norm": 2.921386957168579,
+ "learning_rate": 4.703091560518427e-06,
+ "loss": 0.6126,
+ "step": 2073
+ },
+ {
+ "epoch": 0.9806146572104019,
+ "grad_norm": 2.6500775814056396,
+ "learning_rate": 4.702796624281679e-06,
+ "loss": 0.5678,
+ "step": 2074
+ },
+ {
+ "epoch": 0.9810874704491725,
+ "grad_norm": 2.7740228176116943,
+ "learning_rate": 4.702501550886445e-06,
+ "loss": 0.6067,
+ "step": 2075
+ },
+ {
+ "epoch": 0.9815602836879432,
+ "grad_norm": 2.3296213150024414,
+ "learning_rate": 4.702206340351096e-06,
+ "loss": 0.5247,
+ "step": 2076
+ },
+ {
+ "epoch": 0.9820330969267139,
+ "grad_norm": 2.748300790786743,
+ "learning_rate": 4.701910992694016e-06,
+ "loss": 0.5197,
+ "step": 2077
+ },
+ {
+ "epoch": 0.9825059101654846,
+ "grad_norm": 2.250985622406006,
+ "learning_rate": 4.7016155079335926e-06,
+ "loss": 0.5214,
+ "step": 2078
+ },
+ {
+ "epoch": 0.9829787234042553,
+ "grad_norm": 2.389845848083496,
+ "learning_rate": 4.701319886088226e-06,
+ "loss": 0.519,
+ "step": 2079
+ },
+ {
+ "epoch": 0.983451536643026,
+ "grad_norm": 2.818220853805542,
+ "learning_rate": 4.701024127176322e-06,
+ "loss": 0.607,
+ "step": 2080
+ },
+ {
+ "epoch": 0.9839243498817967,
+ "grad_norm": 3.4058034420013428,
+ "learning_rate": 4.700728231216297e-06,
+ "loss": 0.5711,
+ "step": 2081
+ },
+ {
+ "epoch": 0.9843971631205674,
+ "grad_norm": 2.5297787189483643,
+ "learning_rate": 4.700432198226575e-06,
+ "loss": 0.5979,
+ "step": 2082
+ },
+ {
+ "epoch": 0.984869976359338,
+ "grad_norm": 3.0548105239868164,
+ "learning_rate": 4.7001360282255885e-06,
+ "loss": 0.6041,
+ "step": 2083
+ },
+ {
+ "epoch": 0.9853427895981087,
+ "grad_norm": 2.8983733654022217,
+ "learning_rate": 4.699839721231779e-06,
+ "loss": 0.5926,
+ "step": 2084
+ },
+ {
+ "epoch": 0.9858156028368794,
+ "grad_norm": 3.2717764377593994,
+ "learning_rate": 4.699543277263596e-06,
+ "loss": 0.6477,
+ "step": 2085
+ },
+ {
+ "epoch": 0.9862884160756501,
+ "grad_norm": 3.03729248046875,
+ "learning_rate": 4.699246696339497e-06,
+ "loss": 0.6786,
+ "step": 2086
+ },
+ {
+ "epoch": 0.9867612293144208,
+ "grad_norm": 2.852301597595215,
+ "learning_rate": 4.698949978477951e-06,
+ "loss": 0.6565,
+ "step": 2087
+ },
+ {
+ "epoch": 0.9872340425531915,
+ "grad_norm": 2.843485116958618,
+ "learning_rate": 4.698653123697431e-06,
+ "loss": 0.6627,
+ "step": 2088
+ },
+ {
+ "epoch": 0.9877068557919622,
+ "grad_norm": 2.6315064430236816,
+ "learning_rate": 4.698356132016423e-06,
+ "loss": 0.6577,
+ "step": 2089
+ },
+ {
+ "epoch": 0.9881796690307328,
+ "grad_norm": 2.7482151985168457,
+ "learning_rate": 4.698059003453417e-06,
+ "loss": 0.5514,
+ "step": 2090
+ },
+ {
+ "epoch": 0.9886524822695035,
+ "grad_norm": 2.826673746109009,
+ "learning_rate": 4.6977617380269145e-06,
+ "loss": 0.565,
+ "step": 2091
+ },
+ {
+ "epoch": 0.9891252955082742,
+ "grad_norm": 3.0273752212524414,
+ "learning_rate": 4.697464335755427e-06,
+ "loss": 0.6331,
+ "step": 2092
+ },
+ {
+ "epoch": 0.9895981087470449,
+ "grad_norm": 2.7551653385162354,
+ "learning_rate": 4.6971667966574695e-06,
+ "loss": 0.6486,
+ "step": 2093
+ },
+ {
+ "epoch": 0.9900709219858156,
+ "grad_norm": 2.656299114227295,
+ "learning_rate": 4.696869120751571e-06,
+ "loss": 0.6562,
+ "step": 2094
+ },
+ {
+ "epoch": 0.9905437352245863,
+ "grad_norm": 2.785322904586792,
+ "learning_rate": 4.696571308056265e-06,
+ "loss": 0.5892,
+ "step": 2095
+ },
+ {
+ "epoch": 0.991016548463357,
+ "grad_norm": 2.9334635734558105,
+ "learning_rate": 4.696273358590095e-06,
+ "loss": 0.6346,
+ "step": 2096
+ },
+ {
+ "epoch": 0.9914893617021276,
+ "grad_norm": 2.7944300174713135,
+ "learning_rate": 4.695975272371613e-06,
+ "loss": 0.5859,
+ "step": 2097
+ },
+ {
+ "epoch": 0.9919621749408983,
+ "grad_norm": 2.5416972637176514,
+ "learning_rate": 4.695677049419381e-06,
+ "loss": 0.5658,
+ "step": 2098
+ },
+ {
+ "epoch": 0.992434988179669,
+ "grad_norm": 2.4056856632232666,
+ "learning_rate": 4.695378689751966e-06,
+ "loss": 0.5121,
+ "step": 2099
+ },
+ {
+ "epoch": 0.9929078014184397,
+ "grad_norm": 2.614548683166504,
+ "learning_rate": 4.695080193387948e-06,
+ "loss": 0.5961,
+ "step": 2100
+ },
+ {
+ "epoch": 0.9933806146572104,
+ "grad_norm": 2.8966517448425293,
+ "learning_rate": 4.69478156034591e-06,
+ "loss": 0.5985,
+ "step": 2101
+ },
+ {
+ "epoch": 0.9938534278959811,
+ "grad_norm": 2.9514098167419434,
+ "learning_rate": 4.694482790644448e-06,
+ "loss": 0.5677,
+ "step": 2102
+ },
+ {
+ "epoch": 0.9943262411347518,
+ "grad_norm": 2.4326791763305664,
+ "learning_rate": 4.694183884302165e-06,
+ "loss": 0.5698,
+ "step": 2103
+ },
+ {
+ "epoch": 0.9947990543735225,
+ "grad_norm": 2.9242892265319824,
+ "learning_rate": 4.6938848413376735e-06,
+ "loss": 0.6245,
+ "step": 2104
+ },
+ {
+ "epoch": 0.9952718676122931,
+ "grad_norm": 2.9134104251861572,
+ "learning_rate": 4.693585661769593e-06,
+ "loss": 0.6164,
+ "step": 2105
+ },
+ {
+ "epoch": 0.9957446808510638,
+ "grad_norm": 2.472564458847046,
+ "learning_rate": 4.693286345616551e-06,
+ "loss": 0.5616,
+ "step": 2106
+ },
+ {
+ "epoch": 0.9962174940898345,
+ "grad_norm": 3.2456448078155518,
+ "learning_rate": 4.692986892897186e-06,
+ "loss": 0.6977,
+ "step": 2107
+ },
+ {
+ "epoch": 0.9966903073286052,
+ "grad_norm": 3.4032769203186035,
+ "learning_rate": 4.692687303630143e-06,
+ "loss": 0.643,
+ "step": 2108
+ },
+ {
+ "epoch": 0.9971631205673759,
+ "grad_norm": 2.722200870513916,
+ "learning_rate": 4.692387577834076e-06,
+ "loss": 0.5873,
+ "step": 2109
+ },
+ {
+ "epoch": 0.9976359338061466,
+ "grad_norm": 2.687532663345337,
+ "learning_rate": 4.692087715527648e-06,
+ "loss": 0.5423,
+ "step": 2110
+ },
+ {
+ "epoch": 0.9981087470449173,
+ "grad_norm": 2.578613042831421,
+ "learning_rate": 4.6917877167295305e-06,
+ "loss": 0.5689,
+ "step": 2111
+ },
+ {
+ "epoch": 0.9985815602836879,
+ "grad_norm": 3.1806094646453857,
+ "learning_rate": 4.691487581458402e-06,
+ "loss": 0.6133,
+ "step": 2112
+ },
+ {
+ "epoch": 0.9990543735224586,
+ "grad_norm": 2.4449520111083984,
+ "learning_rate": 4.691187309732952e-06,
+ "loss": 0.5841,
+ "step": 2113
+ },
+ {
+ "epoch": 0.9995271867612293,
+ "grad_norm": 2.908749580383301,
+ "learning_rate": 4.690886901571875e-06,
+ "loss": 0.534,
+ "step": 2114
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 4.019968032836914,
+ "learning_rate": 4.6905863569938785e-06,
+ "loss": 0.596,
+ "step": 2115
+ },
+ {
+ "epoch": 1.0004728132387706,
+ "grad_norm": 2.4319307804107666,
+ "learning_rate": 4.690285676017675e-06,
+ "loss": 0.4973,
+ "step": 2116
+ },
+ {
+ "epoch": 1.0009456264775414,
+ "grad_norm": 2.6366477012634277,
+ "learning_rate": 4.689984858661986e-06,
+ "loss": 0.5682,
+ "step": 2117
+ },
+ {
+ "epoch": 1.001418439716312,
+ "grad_norm": 2.815114974975586,
+ "learning_rate": 4.689683904945542e-06,
+ "loss": 0.5616,
+ "step": 2118
+ },
+ {
+ "epoch": 1.0018912529550827,
+ "grad_norm": 2.6680490970611572,
+ "learning_rate": 4.689382814887084e-06,
+ "loss": 0.5161,
+ "step": 2119
+ },
+ {
+ "epoch": 1.0023640661938533,
+ "grad_norm": 2.7406351566314697,
+ "learning_rate": 4.689081588505358e-06,
+ "loss": 0.4937,
+ "step": 2120
+ },
+ {
+ "epoch": 1.0028368794326241,
+ "grad_norm": 2.2832298278808594,
+ "learning_rate": 4.68878022581912e-06,
+ "loss": 0.4986,
+ "step": 2121
+ },
+ {
+ "epoch": 1.0033096926713947,
+ "grad_norm": 2.5525307655334473,
+ "learning_rate": 4.688478726847136e-06,
+ "loss": 0.4909,
+ "step": 2122
+ },
+ {
+ "epoch": 1.0037825059101655,
+ "grad_norm": 2.9843199253082275,
+ "learning_rate": 4.688177091608176e-06,
+ "loss": 0.6046,
+ "step": 2123
+ },
+ {
+ "epoch": 1.004255319148936,
+ "grad_norm": 2.5231106281280518,
+ "learning_rate": 4.687875320121024e-06,
+ "loss": 0.5423,
+ "step": 2124
+ },
+ {
+ "epoch": 1.0047281323877069,
+ "grad_norm": 2.567599058151245,
+ "learning_rate": 4.68757341240447e-06,
+ "loss": 0.5092,
+ "step": 2125
+ },
+ {
+ "epoch": 1.0052009456264774,
+ "grad_norm": 2.768111228942871,
+ "learning_rate": 4.687271368477311e-06,
+ "loss": 0.5175,
+ "step": 2126
+ },
+ {
+ "epoch": 1.0056737588652482,
+ "grad_norm": 2.7223286628723145,
+ "learning_rate": 4.686969188358355e-06,
+ "loss": 0.5412,
+ "step": 2127
+ },
+ {
+ "epoch": 1.0061465721040188,
+ "grad_norm": 2.488299608230591,
+ "learning_rate": 4.686666872066418e-06,
+ "loss": 0.5288,
+ "step": 2128
+ },
+ {
+ "epoch": 1.0066193853427896,
+ "grad_norm": 2.882981777191162,
+ "learning_rate": 4.6863644196203215e-06,
+ "loss": 0.6117,
+ "step": 2129
+ },
+ {
+ "epoch": 1.0070921985815602,
+ "grad_norm": 3.0019447803497314,
+ "learning_rate": 4.686061831038901e-06,
+ "loss": 0.5308,
+ "step": 2130
+ },
+ {
+ "epoch": 1.007565011820331,
+ "grad_norm": 3.0056138038635254,
+ "learning_rate": 4.685759106340996e-06,
+ "loss": 0.5833,
+ "step": 2131
+ },
+ {
+ "epoch": 1.0080378250591016,
+ "grad_norm": 2.5709075927734375,
+ "learning_rate": 4.685456245545454e-06,
+ "loss": 0.5071,
+ "step": 2132
+ },
+ {
+ "epoch": 1.0085106382978724,
+ "grad_norm": 2.4641504287719727,
+ "learning_rate": 4.685153248671136e-06,
+ "loss": 0.4813,
+ "step": 2133
+ },
+ {
+ "epoch": 1.008983451536643,
+ "grad_norm": 2.374413013458252,
+ "learning_rate": 4.684850115736906e-06,
+ "loss": 0.5179,
+ "step": 2134
+ },
+ {
+ "epoch": 1.0094562647754137,
+ "grad_norm": 2.6504571437835693,
+ "learning_rate": 4.684546846761641e-06,
+ "loss": 0.437,
+ "step": 2135
+ },
+ {
+ "epoch": 1.0099290780141843,
+ "grad_norm": 2.5977871417999268,
+ "learning_rate": 4.684243441764221e-06,
+ "loss": 0.497,
+ "step": 2136
+ },
+ {
+ "epoch": 1.010401891252955,
+ "grad_norm": 2.4950785636901855,
+ "learning_rate": 4.683939900763541e-06,
+ "loss": 0.5624,
+ "step": 2137
+ },
+ {
+ "epoch": 1.0108747044917257,
+ "grad_norm": 3.065718412399292,
+ "learning_rate": 4.6836362237785e-06,
+ "loss": 0.512,
+ "step": 2138
+ },
+ {
+ "epoch": 1.0113475177304965,
+ "grad_norm": 2.7419207096099854,
+ "learning_rate": 4.6833324108280045e-06,
+ "loss": 0.5585,
+ "step": 2139
+ },
+ {
+ "epoch": 1.011820330969267,
+ "grad_norm": 2.623610496520996,
+ "learning_rate": 4.6830284619309744e-06,
+ "loss": 0.5163,
+ "step": 2140
+ },
+ {
+ "epoch": 1.0122931442080378,
+ "grad_norm": 2.774322986602783,
+ "learning_rate": 4.682724377106334e-06,
+ "loss": 0.527,
+ "step": 2141
+ },
+ {
+ "epoch": 1.0127659574468084,
+ "grad_norm": 2.959935188293457,
+ "learning_rate": 4.682420156373017e-06,
+ "loss": 0.6166,
+ "step": 2142
+ },
+ {
+ "epoch": 1.0132387706855792,
+ "grad_norm": 2.584026336669922,
+ "learning_rate": 4.682115799749968e-06,
+ "loss": 0.5086,
+ "step": 2143
+ },
+ {
+ "epoch": 1.0137115839243498,
+ "grad_norm": 2.6039700508117676,
+ "learning_rate": 4.6818113072561346e-06,
+ "loss": 0.49,
+ "step": 2144
+ },
+ {
+ "epoch": 1.0141843971631206,
+ "grad_norm": 2.466381072998047,
+ "learning_rate": 4.681506678910479e-06,
+ "loss": 0.4959,
+ "step": 2145
+ },
+ {
+ "epoch": 1.0146572104018912,
+ "grad_norm": 2.432636260986328,
+ "learning_rate": 4.681201914731969e-06,
+ "loss": 0.5057,
+ "step": 2146
+ },
+ {
+ "epoch": 1.015130023640662,
+ "grad_norm": 2.6134090423583984,
+ "learning_rate": 4.680897014739579e-06,
+ "loss": 0.4874,
+ "step": 2147
+ },
+ {
+ "epoch": 1.0156028368794325,
+ "grad_norm": 2.774481773376465,
+ "learning_rate": 4.680591978952295e-06,
+ "loss": 0.4967,
+ "step": 2148
+ },
+ {
+ "epoch": 1.0160756501182033,
+ "grad_norm": 2.66050124168396,
+ "learning_rate": 4.68028680738911e-06,
+ "loss": 0.4932,
+ "step": 2149
+ },
+ {
+ "epoch": 1.016548463356974,
+ "grad_norm": 3.020594835281372,
+ "learning_rate": 4.679981500069026e-06,
+ "loss": 0.5788,
+ "step": 2150
+ },
+ {
+ "epoch": 1.0170212765957447,
+ "grad_norm": 2.697758436203003,
+ "learning_rate": 4.679676057011053e-06,
+ "loss": 0.5441,
+ "step": 2151
+ },
+ {
+ "epoch": 1.0174940898345153,
+ "grad_norm": 6.986445903778076,
+ "learning_rate": 4.679370478234209e-06,
+ "loss": 0.6483,
+ "step": 2152
+ },
+ {
+ "epoch": 1.017966903073286,
+ "grad_norm": 2.6637115478515625,
+ "learning_rate": 4.679064763757522e-06,
+ "loss": 0.5859,
+ "step": 2153
+ },
+ {
+ "epoch": 1.0184397163120567,
+ "grad_norm": 2.7501862049102783,
+ "learning_rate": 4.678758913600027e-06,
+ "loss": 0.5745,
+ "step": 2154
+ },
+ {
+ "epoch": 1.0189125295508275,
+ "grad_norm": 2.7959372997283936,
+ "learning_rate": 4.678452927780768e-06,
+ "loss": 0.5076,
+ "step": 2155
+ },
+ {
+ "epoch": 1.019385342789598,
+ "grad_norm": 2.4377388954162598,
+ "learning_rate": 4.678146806318798e-06,
+ "loss": 0.5061,
+ "step": 2156
+ },
+ {
+ "epoch": 1.0198581560283688,
+ "grad_norm": 2.5478947162628174,
+ "learning_rate": 4.677840549233176e-06,
+ "loss": 0.4941,
+ "step": 2157
+ },
+ {
+ "epoch": 1.0203309692671394,
+ "grad_norm": 3.0956528186798096,
+ "learning_rate": 4.677534156542973e-06,
+ "loss": 0.5879,
+ "step": 2158
+ },
+ {
+ "epoch": 1.0208037825059102,
+ "grad_norm": 2.5247607231140137,
+ "learning_rate": 4.6772276282672666e-06,
+ "loss": 0.5532,
+ "step": 2159
+ },
+ {
+ "epoch": 1.0212765957446808,
+ "grad_norm": 3.1972787380218506,
+ "learning_rate": 4.676920964425143e-06,
+ "loss": 0.6081,
+ "step": 2160
+ },
+ {
+ "epoch": 1.0217494089834516,
+ "grad_norm": 2.6173388957977295,
+ "learning_rate": 4.6766141650356955e-06,
+ "loss": 0.5001,
+ "step": 2161
+ },
+ {
+ "epoch": 1.0222222222222221,
+ "grad_norm": 2.9914398193359375,
+ "learning_rate": 4.676307230118029e-06,
+ "loss": 0.5566,
+ "step": 2162
+ },
+ {
+ "epoch": 1.022695035460993,
+ "grad_norm": 2.8011834621429443,
+ "learning_rate": 4.676000159691254e-06,
+ "loss": 0.4909,
+ "step": 2163
+ },
+ {
+ "epoch": 1.0231678486997635,
+ "grad_norm": 2.6049559116363525,
+ "learning_rate": 4.67569295377449e-06,
+ "loss": 0.5018,
+ "step": 2164
+ },
+ {
+ "epoch": 1.0236406619385343,
+ "grad_norm": 2.8175013065338135,
+ "learning_rate": 4.675385612386866e-06,
+ "loss": 0.5309,
+ "step": 2165
+ },
+ {
+ "epoch": 1.0241134751773049,
+ "grad_norm": 2.854696750640869,
+ "learning_rate": 4.675078135547519e-06,
+ "loss": 0.5627,
+ "step": 2166
+ },
+ {
+ "epoch": 1.0245862884160757,
+ "grad_norm": 3.1856436729431152,
+ "learning_rate": 4.674770523275594e-06,
+ "loss": 0.5475,
+ "step": 2167
+ },
+ {
+ "epoch": 1.0250591016548463,
+ "grad_norm": 2.8289129734039307,
+ "learning_rate": 4.674462775590244e-06,
+ "loss": 0.5878,
+ "step": 2168
+ },
+ {
+ "epoch": 1.025531914893617,
+ "grad_norm": 2.8824517726898193,
+ "learning_rate": 4.6741548925106325e-06,
+ "loss": 0.4392,
+ "step": 2169
+ },
+ {
+ "epoch": 1.0260047281323876,
+ "grad_norm": 2.7044589519500732,
+ "learning_rate": 4.673846874055928e-06,
+ "loss": 0.5264,
+ "step": 2170
+ },
+ {
+ "epoch": 1.0264775413711584,
+ "grad_norm": 2.575035810470581,
+ "learning_rate": 4.673538720245312e-06,
+ "loss": 0.4615,
+ "step": 2171
+ },
+ {
+ "epoch": 1.026950354609929,
+ "grad_norm": 2.48168683052063,
+ "learning_rate": 4.67323043109797e-06,
+ "loss": 0.4404,
+ "step": 2172
+ },
+ {
+ "epoch": 1.0274231678486998,
+ "grad_norm": 2.926593065261841,
+ "learning_rate": 4.672922006633098e-06,
+ "loss": 0.54,
+ "step": 2173
+ },
+ {
+ "epoch": 1.0278959810874704,
+ "grad_norm": 2.4610698223114014,
+ "learning_rate": 4.672613446869901e-06,
+ "loss": 0.5555,
+ "step": 2174
+ },
+ {
+ "epoch": 1.0283687943262412,
+ "grad_norm": 3.026901960372925,
+ "learning_rate": 4.672304751827592e-06,
+ "loss": 0.62,
+ "step": 2175
+ },
+ {
+ "epoch": 1.0288416075650118,
+ "grad_norm": 2.3946213722229004,
+ "learning_rate": 4.671995921525391e-06,
+ "loss": 0.5228,
+ "step": 2176
+ },
+ {
+ "epoch": 1.0293144208037825,
+ "grad_norm": 2.985020399093628,
+ "learning_rate": 4.671686955982528e-06,
+ "loss": 0.6256,
+ "step": 2177
+ },
+ {
+ "epoch": 1.0297872340425531,
+ "grad_norm": 3.0910139083862305,
+ "learning_rate": 4.671377855218239e-06,
+ "loss": 0.5893,
+ "step": 2178
+ },
+ {
+ "epoch": 1.030260047281324,
+ "grad_norm": 2.507805109024048,
+ "learning_rate": 4.6710686192517744e-06,
+ "loss": 0.5329,
+ "step": 2179
+ },
+ {
+ "epoch": 1.0307328605200945,
+ "grad_norm": 2.4514641761779785,
+ "learning_rate": 4.670759248102386e-06,
+ "loss": 0.4585,
+ "step": 2180
+ },
+ {
+ "epoch": 1.0312056737588653,
+ "grad_norm": 2.742838144302368,
+ "learning_rate": 4.670449741789337e-06,
+ "loss": 0.6255,
+ "step": 2181
+ },
+ {
+ "epoch": 1.0316784869976359,
+ "grad_norm": 2.374349594116211,
+ "learning_rate": 4.670140100331901e-06,
+ "loss": 0.5049,
+ "step": 2182
+ },
+ {
+ "epoch": 1.0321513002364067,
+ "grad_norm": 2.78894305229187,
+ "learning_rate": 4.669830323749356e-06,
+ "loss": 0.6061,
+ "step": 2183
+ },
+ {
+ "epoch": 1.0326241134751772,
+ "grad_norm": 2.7195091247558594,
+ "learning_rate": 4.6695204120609905e-06,
+ "loss": 0.592,
+ "step": 2184
+ },
+ {
+ "epoch": 1.033096926713948,
+ "grad_norm": 2.824411630630493,
+ "learning_rate": 4.6692103652861035e-06,
+ "loss": 0.5666,
+ "step": 2185
+ },
+ {
+ "epoch": 1.0335697399527186,
+ "grad_norm": 2.4981014728546143,
+ "learning_rate": 4.6689001834439975e-06,
+ "loss": 0.5045,
+ "step": 2186
+ },
+ {
+ "epoch": 1.0340425531914894,
+ "grad_norm": 2.7375214099884033,
+ "learning_rate": 4.668589866553988e-06,
+ "loss": 0.5305,
+ "step": 2187
+ },
+ {
+ "epoch": 1.03451536643026,
+ "grad_norm": 2.625345468521118,
+ "learning_rate": 4.668279414635396e-06,
+ "loss": 0.4819,
+ "step": 2188
+ },
+ {
+ "epoch": 1.0349881796690308,
+ "grad_norm": 2.60479736328125,
+ "learning_rate": 4.667968827707553e-06,
+ "loss": 0.55,
+ "step": 2189
+ },
+ {
+ "epoch": 1.0354609929078014,
+ "grad_norm": 2.642014741897583,
+ "learning_rate": 4.667658105789797e-06,
+ "loss": 0.5264,
+ "step": 2190
+ },
+ {
+ "epoch": 1.0359338061465722,
+ "grad_norm": 2.5439083576202393,
+ "learning_rate": 4.667347248901476e-06,
+ "loss": 0.4657,
+ "step": 2191
+ },
+ {
+ "epoch": 1.0364066193853427,
+ "grad_norm": 2.5537586212158203,
+ "learning_rate": 4.667036257061945e-06,
+ "loss": 0.527,
+ "step": 2192
+ },
+ {
+ "epoch": 1.0368794326241135,
+ "grad_norm": 2.595466375350952,
+ "learning_rate": 4.666725130290569e-06,
+ "loss": 0.5336,
+ "step": 2193
+ },
+ {
+ "epoch": 1.037352245862884,
+ "grad_norm": 3.5106313228607178,
+ "learning_rate": 4.666413868606719e-06,
+ "loss": 0.5176,
+ "step": 2194
+ },
+ {
+ "epoch": 1.037825059101655,
+ "grad_norm": 2.931553363800049,
+ "learning_rate": 4.666102472029778e-06,
+ "loss": 0.549,
+ "step": 2195
+ },
+ {
+ "epoch": 1.0382978723404255,
+ "grad_norm": 2.4325125217437744,
+ "learning_rate": 4.665790940579133e-06,
+ "loss": 0.5095,
+ "step": 2196
+ },
+ {
+ "epoch": 1.0387706855791963,
+ "grad_norm": 2.708477258682251,
+ "learning_rate": 4.665479274274184e-06,
+ "loss": 0.5264,
+ "step": 2197
+ },
+ {
+ "epoch": 1.0392434988179668,
+ "grad_norm": 2.905977487564087,
+ "learning_rate": 4.665167473134335e-06,
+ "loss": 0.5575,
+ "step": 2198
+ },
+ {
+ "epoch": 1.0397163120567376,
+ "grad_norm": 2.428938865661621,
+ "learning_rate": 4.664855537179003e-06,
+ "loss": 0.5099,
+ "step": 2199
+ },
+ {
+ "epoch": 1.0401891252955082,
+ "grad_norm": 2.8432137966156006,
+ "learning_rate": 4.6645434664276075e-06,
+ "loss": 0.5331,
+ "step": 2200
+ },
+ {
+ "epoch": 1.040661938534279,
+ "grad_norm": 2.5185136795043945,
+ "learning_rate": 4.6642312608995825e-06,
+ "loss": 0.5217,
+ "step": 2201
+ },
+ {
+ "epoch": 1.0411347517730496,
+ "grad_norm": 2.556607723236084,
+ "learning_rate": 4.663918920614366e-06,
+ "loss": 0.4431,
+ "step": 2202
+ },
+ {
+ "epoch": 1.0416075650118204,
+ "grad_norm": 3.1271166801452637,
+ "learning_rate": 4.663606445591407e-06,
+ "loss": 0.5398,
+ "step": 2203
+ },
+ {
+ "epoch": 1.042080378250591,
+ "grad_norm": 2.573680877685547,
+ "learning_rate": 4.663293835850162e-06,
+ "loss": 0.4713,
+ "step": 2204
+ },
+ {
+ "epoch": 1.0425531914893618,
+ "grad_norm": 2.5230324268341064,
+ "learning_rate": 4.662981091410096e-06,
+ "loss": 0.5571,
+ "step": 2205
+ },
+ {
+ "epoch": 1.0430260047281323,
+ "grad_norm": 2.552182912826538,
+ "learning_rate": 4.662668212290681e-06,
+ "loss": 0.5173,
+ "step": 2206
+ },
+ {
+ "epoch": 1.0434988179669031,
+ "grad_norm": 2.832345724105835,
+ "learning_rate": 4.6623551985113995e-06,
+ "loss": 0.525,
+ "step": 2207
+ },
+ {
+ "epoch": 1.0439716312056737,
+ "grad_norm": 2.9729080200195312,
+ "learning_rate": 4.6620420500917416e-06,
+ "loss": 0.6308,
+ "step": 2208
+ },
+ {
+ "epoch": 1.0444444444444445,
+ "grad_norm": 2.618187665939331,
+ "learning_rate": 4.661728767051206e-06,
+ "loss": 0.4942,
+ "step": 2209
+ },
+ {
+ "epoch": 1.044917257683215,
+ "grad_norm": 2.515566349029541,
+ "learning_rate": 4.661415349409299e-06,
+ "loss": 0.5229,
+ "step": 2210
+ },
+ {
+ "epoch": 1.0453900709219859,
+ "grad_norm": 2.8651459217071533,
+ "learning_rate": 4.6611017971855356e-06,
+ "loss": 0.5029,
+ "step": 2211
+ },
+ {
+ "epoch": 1.0458628841607565,
+ "grad_norm": 2.502405881881714,
+ "learning_rate": 4.660788110399439e-06,
+ "loss": 0.4732,
+ "step": 2212
+ },
+ {
+ "epoch": 1.0463356973995273,
+ "grad_norm": 2.540668249130249,
+ "learning_rate": 4.660474289070541e-06,
+ "loss": 0.547,
+ "step": 2213
+ },
+ {
+ "epoch": 1.0468085106382978,
+ "grad_norm": 2.803469181060791,
+ "learning_rate": 4.660160333218384e-06,
+ "loss": 0.5441,
+ "step": 2214
+ },
+ {
+ "epoch": 1.0472813238770686,
+ "grad_norm": 3.233325481414795,
+ "learning_rate": 4.659846242862514e-06,
+ "loss": 0.4457,
+ "step": 2215
+ },
+ {
+ "epoch": 1.0477541371158392,
+ "grad_norm": 2.549548387527466,
+ "learning_rate": 4.659532018022489e-06,
+ "loss": 0.5684,
+ "step": 2216
+ },
+ {
+ "epoch": 1.04822695035461,
+ "grad_norm": 2.6112852096557617,
+ "learning_rate": 4.659217658717875e-06,
+ "loss": 0.5323,
+ "step": 2217
+ },
+ {
+ "epoch": 1.0486997635933806,
+ "grad_norm": 2.347418785095215,
+ "learning_rate": 4.658903164968245e-06,
+ "loss": 0.5349,
+ "step": 2218
+ },
+ {
+ "epoch": 1.0491725768321514,
+ "grad_norm": 2.695502281188965,
+ "learning_rate": 4.658588536793182e-06,
+ "loss": 0.4883,
+ "step": 2219
+ },
+ {
+ "epoch": 1.049645390070922,
+ "grad_norm": 2.7575674057006836,
+ "learning_rate": 4.658273774212275e-06,
+ "loss": 0.5517,
+ "step": 2220
+ },
+ {
+ "epoch": 1.0501182033096927,
+ "grad_norm": 2.787855386734009,
+ "learning_rate": 4.6579588772451245e-06,
+ "loss": 0.5744,
+ "step": 2221
+ },
+ {
+ "epoch": 1.0505910165484633,
+ "grad_norm": 3.0699398517608643,
+ "learning_rate": 4.657643845911337e-06,
+ "loss": 0.5258,
+ "step": 2222
+ },
+ {
+ "epoch": 1.0510638297872341,
+ "grad_norm": 2.652040719985962,
+ "learning_rate": 4.657328680230527e-06,
+ "loss": 0.5141,
+ "step": 2223
+ },
+ {
+ "epoch": 1.0515366430260047,
+ "grad_norm": 2.6896369457244873,
+ "learning_rate": 4.657013380222322e-06,
+ "loss": 0.5139,
+ "step": 2224
+ },
+ {
+ "epoch": 1.0520094562647755,
+ "grad_norm": 2.551839590072632,
+ "learning_rate": 4.65669794590635e-06,
+ "loss": 0.5099,
+ "step": 2225
+ },
+ {
+ "epoch": 1.052482269503546,
+ "grad_norm": 2.8543262481689453,
+ "learning_rate": 4.656382377302255e-06,
+ "loss": 0.6085,
+ "step": 2226
+ },
+ {
+ "epoch": 1.0529550827423169,
+ "grad_norm": 2.871469259262085,
+ "learning_rate": 4.656066674429685e-06,
+ "loss": 0.6108,
+ "step": 2227
+ },
+ {
+ "epoch": 1.0534278959810874,
+ "grad_norm": 2.4840824604034424,
+ "learning_rate": 4.655750837308296e-06,
+ "loss": 0.4994,
+ "step": 2228
+ },
+ {
+ "epoch": 1.0539007092198582,
+ "grad_norm": 2.5203280448913574,
+ "learning_rate": 4.6554348659577555e-06,
+ "loss": 0.4928,
+ "step": 2229
+ },
+ {
+ "epoch": 1.0543735224586288,
+ "grad_norm": 2.9327683448791504,
+ "learning_rate": 4.655118760397737e-06,
+ "loss": 0.6324,
+ "step": 2230
+ },
+ {
+ "epoch": 1.0548463356973996,
+ "grad_norm": 2.6766855716705322,
+ "learning_rate": 4.654802520647924e-06,
+ "loss": 0.5178,
+ "step": 2231
+ },
+ {
+ "epoch": 1.0553191489361702,
+ "grad_norm": 2.8438873291015625,
+ "learning_rate": 4.654486146728006e-06,
+ "loss": 0.509,
+ "step": 2232
+ },
+ {
+ "epoch": 1.055791962174941,
+ "grad_norm": 2.538661241531372,
+ "learning_rate": 4.6541696386576826e-06,
+ "loss": 0.5463,
+ "step": 2233
+ },
+ {
+ "epoch": 1.0562647754137116,
+ "grad_norm": 2.829030990600586,
+ "learning_rate": 4.653852996456662e-06,
+ "loss": 0.5404,
+ "step": 2234
+ },
+ {
+ "epoch": 1.0567375886524824,
+ "grad_norm": 2.5657269954681396,
+ "learning_rate": 4.653536220144659e-06,
+ "loss": 0.5479,
+ "step": 2235
+ },
+ {
+ "epoch": 1.057210401891253,
+ "grad_norm": 2.6641297340393066,
+ "learning_rate": 4.653219309741399e-06,
+ "loss": 0.5503,
+ "step": 2236
+ },
+ {
+ "epoch": 1.0576832151300237,
+ "grad_norm": 2.966350555419922,
+ "learning_rate": 4.652902265266615e-06,
+ "loss": 0.6404,
+ "step": 2237
+ },
+ {
+ "epoch": 1.0581560283687943,
+ "grad_norm": 2.462430000305176,
+ "learning_rate": 4.6525850867400455e-06,
+ "loss": 0.4885,
+ "step": 2238
+ },
+ {
+ "epoch": 1.058628841607565,
+ "grad_norm": 2.1791880130767822,
+ "learning_rate": 4.652267774181443e-06,
+ "loss": 0.4405,
+ "step": 2239
+ },
+ {
+ "epoch": 1.0591016548463357,
+ "grad_norm": 2.5473732948303223,
+ "learning_rate": 4.651950327610563e-06,
+ "loss": 0.5295,
+ "step": 2240
+ },
+ {
+ "epoch": 1.0595744680851065,
+ "grad_norm": 2.70904803276062,
+ "learning_rate": 4.651632747047172e-06,
+ "loss": 0.5169,
+ "step": 2241
+ },
+ {
+ "epoch": 1.060047281323877,
+ "grad_norm": 3.8442928791046143,
+ "learning_rate": 4.651315032511045e-06,
+ "loss": 0.5473,
+ "step": 2242
+ },
+ {
+ "epoch": 1.0605200945626478,
+ "grad_norm": 2.8613383769989014,
+ "learning_rate": 4.650997184021963e-06,
+ "loss": 0.5445,
+ "step": 2243
+ },
+ {
+ "epoch": 1.0609929078014184,
+ "grad_norm": 2.5995829105377197,
+ "learning_rate": 4.6506792015997184e-06,
+ "loss": 0.5525,
+ "step": 2244
+ },
+ {
+ "epoch": 1.0614657210401892,
+ "grad_norm": 2.5465996265411377,
+ "learning_rate": 4.650361085264111e-06,
+ "loss": 0.5093,
+ "step": 2245
+ },
+ {
+ "epoch": 1.0619385342789598,
+ "grad_norm": 2.46553111076355,
+ "learning_rate": 4.650042835034948e-06,
+ "loss": 0.5375,
+ "step": 2246
+ },
+ {
+ "epoch": 1.0624113475177306,
+ "grad_norm": 2.6907830238342285,
+ "learning_rate": 4.649724450932045e-06,
+ "loss": 0.572,
+ "step": 2247
+ },
+ {
+ "epoch": 1.0628841607565012,
+ "grad_norm": 3.0671346187591553,
+ "learning_rate": 4.649405932975226e-06,
+ "loss": 0.4974,
+ "step": 2248
+ },
+ {
+ "epoch": 1.063356973995272,
+ "grad_norm": 2.5392491817474365,
+ "learning_rate": 4.649087281184325e-06,
+ "loss": 0.524,
+ "step": 2249
+ },
+ {
+ "epoch": 1.0638297872340425,
+ "grad_norm": 2.7498562335968018,
+ "learning_rate": 4.648768495579183e-06,
+ "loss": 0.5801,
+ "step": 2250
+ },
+ {
+ "epoch": 1.0643026004728133,
+ "grad_norm": 2.8536248207092285,
+ "learning_rate": 4.648449576179649e-06,
+ "loss": 0.5384,
+ "step": 2251
+ },
+ {
+ "epoch": 1.064775413711584,
+ "grad_norm": 2.7062792778015137,
+ "learning_rate": 4.64813052300558e-06,
+ "loss": 0.5262,
+ "step": 2252
+ },
+ {
+ "epoch": 1.0652482269503547,
+ "grad_norm": 2.798650026321411,
+ "learning_rate": 4.647811336076841e-06,
+ "loss": 0.5719,
+ "step": 2253
+ },
+ {
+ "epoch": 1.0657210401891253,
+ "grad_norm": 2.9793951511383057,
+ "learning_rate": 4.647492015413311e-06,
+ "loss": 0.5377,
+ "step": 2254
+ },
+ {
+ "epoch": 1.066193853427896,
+ "grad_norm": 2.572129011154175,
+ "learning_rate": 4.647172561034868e-06,
+ "loss": 0.4791,
+ "step": 2255
+ },
+ {
+ "epoch": 1.0666666666666667,
+ "grad_norm": 3.7490930557250977,
+ "learning_rate": 4.646852972961405e-06,
+ "loss": 0.5423,
+ "step": 2256
+ },
+ {
+ "epoch": 1.0671394799054374,
+ "grad_norm": 2.626255750656128,
+ "learning_rate": 4.646533251212821e-06,
+ "loss": 0.5558,
+ "step": 2257
+ },
+ {
+ "epoch": 1.067612293144208,
+ "grad_norm": 2.8408126831054688,
+ "learning_rate": 4.646213395809023e-06,
+ "loss": 0.55,
+ "step": 2258
+ },
+ {
+ "epoch": 1.0680851063829788,
+ "grad_norm": 3.255606174468994,
+ "learning_rate": 4.645893406769929e-06,
+ "loss": 0.547,
+ "step": 2259
+ },
+ {
+ "epoch": 1.0685579196217494,
+ "grad_norm": 2.4352102279663086,
+ "learning_rate": 4.645573284115461e-06,
+ "loss": 0.4898,
+ "step": 2260
+ },
+ {
+ "epoch": 1.0690307328605202,
+ "grad_norm": 2.408634662628174,
+ "learning_rate": 4.6452530278655535e-06,
+ "loss": 0.5264,
+ "step": 2261
+ },
+ {
+ "epoch": 1.0695035460992908,
+ "grad_norm": 2.4220449924468994,
+ "learning_rate": 4.644932638040146e-06,
+ "loss": 0.5166,
+ "step": 2262
+ },
+ {
+ "epoch": 1.0699763593380616,
+ "grad_norm": 2.9188082218170166,
+ "learning_rate": 4.644612114659188e-06,
+ "loss": 0.5611,
+ "step": 2263
+ },
+ {
+ "epoch": 1.0704491725768321,
+ "grad_norm": 2.906557083129883,
+ "learning_rate": 4.644291457742638e-06,
+ "loss": 0.5515,
+ "step": 2264
+ },
+ {
+ "epoch": 1.070921985815603,
+ "grad_norm": 2.9039015769958496,
+ "learning_rate": 4.643970667310462e-06,
+ "loss": 0.5732,
+ "step": 2265
+ },
+ {
+ "epoch": 1.0713947990543735,
+ "grad_norm": 2.9985480308532715,
+ "learning_rate": 4.643649743382632e-06,
+ "loss": 0.563,
+ "step": 2266
+ },
+ {
+ "epoch": 1.0718676122931443,
+ "grad_norm": 2.5780906677246094,
+ "learning_rate": 4.6433286859791335e-06,
+ "loss": 0.502,
+ "step": 2267
+ },
+ {
+ "epoch": 1.0723404255319149,
+ "grad_norm": 2.590209722518921,
+ "learning_rate": 4.643007495119955e-06,
+ "loss": 0.4995,
+ "step": 2268
+ },
+ {
+ "epoch": 1.0728132387706855,
+ "grad_norm": 2.378894805908203,
+ "learning_rate": 4.642686170825097e-06,
+ "loss": 0.4886,
+ "step": 2269
+ },
+ {
+ "epoch": 1.0732860520094563,
+ "grad_norm": 2.6826229095458984,
+ "learning_rate": 4.642364713114567e-06,
+ "loss": 0.465,
+ "step": 2270
+ },
+ {
+ "epoch": 1.073758865248227,
+ "grad_norm": 2.627819538116455,
+ "learning_rate": 4.64204312200838e-06,
+ "loss": 0.4954,
+ "step": 2271
+ },
+ {
+ "epoch": 1.0742316784869976,
+ "grad_norm": 2.993021249771118,
+ "learning_rate": 4.641721397526561e-06,
+ "loss": 0.5073,
+ "step": 2272
+ },
+ {
+ "epoch": 1.0747044917257682,
+ "grad_norm": 2.719052791595459,
+ "learning_rate": 4.64139953968914e-06,
+ "loss": 0.538,
+ "step": 2273
+ },
+ {
+ "epoch": 1.075177304964539,
+ "grad_norm": 2.729252576828003,
+ "learning_rate": 4.6410775485161605e-06,
+ "loss": 0.552,
+ "step": 2274
+ },
+ {
+ "epoch": 1.0756501182033098,
+ "grad_norm": 2.924142599105835,
+ "learning_rate": 4.640755424027671e-06,
+ "loss": 0.522,
+ "step": 2275
+ },
+ {
+ "epoch": 1.0761229314420804,
+ "grad_norm": 3.329162120819092,
+ "learning_rate": 4.640433166243728e-06,
+ "loss": 0.5965,
+ "step": 2276
+ },
+ {
+ "epoch": 1.076595744680851,
+ "grad_norm": 2.9810245037078857,
+ "learning_rate": 4.640110775184396e-06,
+ "loss": 0.5653,
+ "step": 2277
+ },
+ {
+ "epoch": 1.0770685579196217,
+ "grad_norm": 2.61772084236145,
+ "learning_rate": 4.639788250869751e-06,
+ "loss": 0.5382,
+ "step": 2278
+ },
+ {
+ "epoch": 1.0775413711583925,
+ "grad_norm": 2.741225004196167,
+ "learning_rate": 4.639465593319874e-06,
+ "loss": 0.4866,
+ "step": 2279
+ },
+ {
+ "epoch": 1.0780141843971631,
+ "grad_norm": 2.7945218086242676,
+ "learning_rate": 4.639142802554856e-06,
+ "loss": 0.4711,
+ "step": 2280
+ },
+ {
+ "epoch": 1.0784869976359337,
+ "grad_norm": 2.4282329082489014,
+ "learning_rate": 4.638819878594795e-06,
+ "loss": 0.4911,
+ "step": 2281
+ },
+ {
+ "epoch": 1.0789598108747045,
+ "grad_norm": 2.551741361618042,
+ "learning_rate": 4.638496821459799e-06,
+ "loss": 0.453,
+ "step": 2282
+ },
+ {
+ "epoch": 1.0794326241134753,
+ "grad_norm": 2.5622754096984863,
+ "learning_rate": 4.638173631169983e-06,
+ "loss": 0.5983,
+ "step": 2283
+ },
+ {
+ "epoch": 1.0799054373522459,
+ "grad_norm": 2.7748284339904785,
+ "learning_rate": 4.6378503077454715e-06,
+ "loss": 0.5143,
+ "step": 2284
+ },
+ {
+ "epoch": 1.0803782505910164,
+ "grad_norm": 2.7693238258361816,
+ "learning_rate": 4.637526851206394e-06,
+ "loss": 0.5929,
+ "step": 2285
+ },
+ {
+ "epoch": 1.0808510638297872,
+ "grad_norm": 2.705548048019409,
+ "learning_rate": 4.637203261572893e-06,
+ "loss": 0.5577,
+ "step": 2286
+ },
+ {
+ "epoch": 1.081323877068558,
+ "grad_norm": 2.739307165145874,
+ "learning_rate": 4.636879538865117e-06,
+ "loss": 0.5676,
+ "step": 2287
+ },
+ {
+ "epoch": 1.0817966903073286,
+ "grad_norm": 2.514059543609619,
+ "learning_rate": 4.636555683103221e-06,
+ "loss": 0.5001,
+ "step": 2288
+ },
+ {
+ "epoch": 1.0822695035460992,
+ "grad_norm": 2.7166874408721924,
+ "learning_rate": 4.636231694307372e-06,
+ "loss": 0.5411,
+ "step": 2289
+ },
+ {
+ "epoch": 1.08274231678487,
+ "grad_norm": 2.7661683559417725,
+ "learning_rate": 4.635907572497741e-06,
+ "loss": 0.6353,
+ "step": 2290
+ },
+ {
+ "epoch": 1.0832151300236406,
+ "grad_norm": 2.598381996154785,
+ "learning_rate": 4.635583317694512e-06,
+ "loss": 0.5213,
+ "step": 2291
+ },
+ {
+ "epoch": 1.0836879432624114,
+ "grad_norm": 2.821491003036499,
+ "learning_rate": 4.6352589299178744e-06,
+ "loss": 0.6172,
+ "step": 2292
+ },
+ {
+ "epoch": 1.084160756501182,
+ "grad_norm": 2.5422823429107666,
+ "learning_rate": 4.634934409188025e-06,
+ "loss": 0.5245,
+ "step": 2293
+ },
+ {
+ "epoch": 1.0846335697399527,
+ "grad_norm": 2.8264620304107666,
+ "learning_rate": 4.634609755525173e-06,
+ "loss": 0.5004,
+ "step": 2294
+ },
+ {
+ "epoch": 1.0851063829787233,
+ "grad_norm": 2.3286643028259277,
+ "learning_rate": 4.63428496894953e-06,
+ "loss": 0.4561,
+ "step": 2295
+ },
+ {
+ "epoch": 1.085579196217494,
+ "grad_norm": 2.462005376815796,
+ "learning_rate": 4.633960049481321e-06,
+ "loss": 0.4948,
+ "step": 2296
+ },
+ {
+ "epoch": 1.0860520094562647,
+ "grad_norm": 2.760258913040161,
+ "learning_rate": 4.633634997140777e-06,
+ "loss": 0.5407,
+ "step": 2297
+ },
+ {
+ "epoch": 1.0865248226950355,
+ "grad_norm": 3.0234217643737793,
+ "learning_rate": 4.633309811948138e-06,
+ "loss": 0.4914,
+ "step": 2298
+ },
+ {
+ "epoch": 1.086997635933806,
+ "grad_norm": 2.8380849361419678,
+ "learning_rate": 4.63298449392365e-06,
+ "loss": 0.5562,
+ "step": 2299
+ },
+ {
+ "epoch": 1.0874704491725768,
+ "grad_norm": 2.6201648712158203,
+ "learning_rate": 4.632659043087572e-06,
+ "loss": 0.5882,
+ "step": 2300
+ },
+ {
+ "epoch": 1.0879432624113474,
+ "grad_norm": 2.586339235305786,
+ "learning_rate": 4.632333459460165e-06,
+ "loss": 0.4991,
+ "step": 2301
+ },
+ {
+ "epoch": 1.0884160756501182,
+ "grad_norm": 2.500115394592285,
+ "learning_rate": 4.632007743061705e-06,
+ "loss": 0.552,
+ "step": 2302
+ },
+ {
+ "epoch": 1.0888888888888888,
+ "grad_norm": 2.816390037536621,
+ "learning_rate": 4.63168189391247e-06,
+ "loss": 0.5301,
+ "step": 2303
+ },
+ {
+ "epoch": 1.0893617021276596,
+ "grad_norm": 2.975400924682617,
+ "learning_rate": 4.631355912032753e-06,
+ "loss": 0.6056,
+ "step": 2304
+ },
+ {
+ "epoch": 1.0898345153664302,
+ "grad_norm": 2.747985363006592,
+ "learning_rate": 4.631029797442846e-06,
+ "loss": 0.5335,
+ "step": 2305
+ },
+ {
+ "epoch": 1.090307328605201,
+ "grad_norm": 2.609281539916992,
+ "learning_rate": 4.630703550163059e-06,
+ "loss": 0.5189,
+ "step": 2306
+ },
+ {
+ "epoch": 1.0907801418439715,
+ "grad_norm": 2.624131202697754,
+ "learning_rate": 4.630377170213705e-06,
+ "loss": 0.5646,
+ "step": 2307
+ },
+ {
+ "epoch": 1.0912529550827423,
+ "grad_norm": 2.6186959743499756,
+ "learning_rate": 4.630050657615107e-06,
+ "loss": 0.5187,
+ "step": 2308
+ },
+ {
+ "epoch": 1.091725768321513,
+ "grad_norm": 2.9961764812469482,
+ "learning_rate": 4.629724012387594e-06,
+ "loss": 0.6207,
+ "step": 2309
+ },
+ {
+ "epoch": 1.0921985815602837,
+ "grad_norm": 2.665799140930176,
+ "learning_rate": 4.629397234551505e-06,
+ "loss": 0.5046,
+ "step": 2310
+ },
+ {
+ "epoch": 1.0926713947990543,
+ "grad_norm": 2.6154725551605225,
+ "learning_rate": 4.629070324127187e-06,
+ "loss": 0.5553,
+ "step": 2311
+ },
+ {
+ "epoch": 1.093144208037825,
+ "grad_norm": 2.702967643737793,
+ "learning_rate": 4.628743281134996e-06,
+ "loss": 0.5159,
+ "step": 2312
+ },
+ {
+ "epoch": 1.0936170212765957,
+ "grad_norm": 2.578080177307129,
+ "learning_rate": 4.628416105595295e-06,
+ "loss": 0.4934,
+ "step": 2313
+ },
+ {
+ "epoch": 1.0940898345153665,
+ "grad_norm": 2.8763060569763184,
+ "learning_rate": 4.628088797528456e-06,
+ "loss": 0.5404,
+ "step": 2314
+ },
+ {
+ "epoch": 1.094562647754137,
+ "grad_norm": 2.5301198959350586,
+ "learning_rate": 4.6277613569548585e-06,
+ "loss": 0.524,
+ "step": 2315
+ },
+ {
+ "epoch": 1.0950354609929078,
+ "grad_norm": 2.559903144836426,
+ "learning_rate": 4.627433783894892e-06,
+ "loss": 0.5177,
+ "step": 2316
+ },
+ {
+ "epoch": 1.0955082742316784,
+ "grad_norm": 2.430863380432129,
+ "learning_rate": 4.627106078368952e-06,
+ "loss": 0.5368,
+ "step": 2317
+ },
+ {
+ "epoch": 1.0959810874704492,
+ "grad_norm": 2.687567949295044,
+ "learning_rate": 4.626778240397444e-06,
+ "loss": 0.5385,
+ "step": 2318
+ },
+ {
+ "epoch": 1.0964539007092198,
+ "grad_norm": 3.053466558456421,
+ "learning_rate": 4.62645027000078e-06,
+ "loss": 0.5814,
+ "step": 2319
+ },
+ {
+ "epoch": 1.0969267139479906,
+ "grad_norm": 2.4612979888916016,
+ "learning_rate": 4.6261221671993815e-06,
+ "loss": 0.5069,
+ "step": 2320
+ },
+ {
+ "epoch": 1.0973995271867611,
+ "grad_norm": 2.6153628826141357,
+ "learning_rate": 4.625793932013679e-06,
+ "loss": 0.5422,
+ "step": 2321
+ },
+ {
+ "epoch": 1.097872340425532,
+ "grad_norm": 2.8918874263763428,
+ "learning_rate": 4.62546556446411e-06,
+ "loss": 0.5326,
+ "step": 2322
+ },
+ {
+ "epoch": 1.0983451536643025,
+ "grad_norm": 3.62565279006958,
+ "learning_rate": 4.625137064571119e-06,
+ "loss": 0.5164,
+ "step": 2323
+ },
+ {
+ "epoch": 1.0988179669030733,
+ "grad_norm": 2.4285085201263428,
+ "learning_rate": 4.624808432355164e-06,
+ "loss": 0.5084,
+ "step": 2324
+ },
+ {
+ "epoch": 1.099290780141844,
+ "grad_norm": 2.593979835510254,
+ "learning_rate": 4.624479667836702e-06,
+ "loss": 0.4986,
+ "step": 2325
+ },
+ {
+ "epoch": 1.0997635933806147,
+ "grad_norm": 2.490752935409546,
+ "learning_rate": 4.624150771036208e-06,
+ "loss": 0.5296,
+ "step": 2326
+ },
+ {
+ "epoch": 1.1002364066193853,
+ "grad_norm": 2.67694091796875,
+ "learning_rate": 4.6238217419741595e-06,
+ "loss": 0.5229,
+ "step": 2327
+ },
+ {
+ "epoch": 1.100709219858156,
+ "grad_norm": 2.594147205352783,
+ "learning_rate": 4.623492580671044e-06,
+ "loss": 0.4916,
+ "step": 2328
+ },
+ {
+ "epoch": 1.1011820330969266,
+ "grad_norm": 2.943472385406494,
+ "learning_rate": 4.623163287147356e-06,
+ "loss": 0.5591,
+ "step": 2329
+ },
+ {
+ "epoch": 1.1016548463356974,
+ "grad_norm": 2.569410562515259,
+ "learning_rate": 4.622833861423601e-06,
+ "loss": 0.4648,
+ "step": 2330
+ },
+ {
+ "epoch": 1.102127659574468,
+ "grad_norm": 2.5490405559539795,
+ "learning_rate": 4.6225043035202886e-06,
+ "loss": 0.5493,
+ "step": 2331
+ },
+ {
+ "epoch": 1.1026004728132388,
+ "grad_norm": 2.5964598655700684,
+ "learning_rate": 4.622174613457941e-06,
+ "loss": 0.5358,
+ "step": 2332
+ },
+ {
+ "epoch": 1.1030732860520094,
+ "grad_norm": 2.6456820964813232,
+ "learning_rate": 4.621844791257085e-06,
+ "loss": 0.5864,
+ "step": 2333
+ },
+ {
+ "epoch": 1.1035460992907802,
+ "grad_norm": 2.861180067062378,
+ "learning_rate": 4.621514836938259e-06,
+ "loss": 0.6064,
+ "step": 2334
+ },
+ {
+ "epoch": 1.1040189125295508,
+ "grad_norm": 2.8199548721313477,
+ "learning_rate": 4.621184750522005e-06,
+ "loss": 0.5244,
+ "step": 2335
+ },
+ {
+ "epoch": 1.1044917257683216,
+ "grad_norm": 2.7398853302001953,
+ "learning_rate": 4.6208545320288795e-06,
+ "loss": 0.5496,
+ "step": 2336
+ },
+ {
+ "epoch": 1.1049645390070921,
+ "grad_norm": 2.7941031455993652,
+ "learning_rate": 4.620524181479441e-06,
+ "loss": 0.5496,
+ "step": 2337
+ },
+ {
+ "epoch": 1.105437352245863,
+ "grad_norm": 2.973785161972046,
+ "learning_rate": 4.620193698894259e-06,
+ "loss": 0.5492,
+ "step": 2338
+ },
+ {
+ "epoch": 1.1059101654846335,
+ "grad_norm": 2.650355815887451,
+ "learning_rate": 4.6198630842939144e-06,
+ "loss": 0.5392,
+ "step": 2339
+ },
+ {
+ "epoch": 1.1063829787234043,
+ "grad_norm": 2.9092214107513428,
+ "learning_rate": 4.61953233769899e-06,
+ "loss": 0.5305,
+ "step": 2340
+ },
+ {
+ "epoch": 1.1068557919621749,
+ "grad_norm": 2.6329731941223145,
+ "learning_rate": 4.61920145913008e-06,
+ "loss": 0.5031,
+ "step": 2341
+ },
+ {
+ "epoch": 1.1073286052009457,
+ "grad_norm": 2.7214207649230957,
+ "learning_rate": 4.618870448607788e-06,
+ "loss": 0.5536,
+ "step": 2342
+ },
+ {
+ "epoch": 1.1078014184397162,
+ "grad_norm": 2.873119592666626,
+ "learning_rate": 4.618539306152724e-06,
+ "loss": 0.4531,
+ "step": 2343
+ },
+ {
+ "epoch": 1.108274231678487,
+ "grad_norm": 2.701042413711548,
+ "learning_rate": 4.618208031785507e-06,
+ "loss": 0.5217,
+ "step": 2344
+ },
+ {
+ "epoch": 1.1087470449172576,
+ "grad_norm": 2.7189881801605225,
+ "learning_rate": 4.6178766255267635e-06,
+ "loss": 0.6205,
+ "step": 2345
+ },
+ {
+ "epoch": 1.1092198581560284,
+ "grad_norm": 2.546382188796997,
+ "learning_rate": 4.61754508739713e-06,
+ "loss": 0.5475,
+ "step": 2346
+ },
+ {
+ "epoch": 1.109692671394799,
+ "grad_norm": 2.8429276943206787,
+ "learning_rate": 4.617213417417249e-06,
+ "loss": 0.4809,
+ "step": 2347
+ },
+ {
+ "epoch": 1.1101654846335698,
+ "grad_norm": 2.9515812397003174,
+ "learning_rate": 4.616881615607772e-06,
+ "loss": 0.5067,
+ "step": 2348
+ },
+ {
+ "epoch": 1.1106382978723404,
+ "grad_norm": 2.5910723209381104,
+ "learning_rate": 4.616549681989358e-06,
+ "loss": 0.5368,
+ "step": 2349
+ },
+ {
+ "epoch": 1.1111111111111112,
+ "grad_norm": 2.80855655670166,
+ "learning_rate": 4.616217616582678e-06,
+ "loss": 0.5827,
+ "step": 2350
+ },
+ {
+ "epoch": 1.1115839243498817,
+ "grad_norm": 2.604383945465088,
+ "learning_rate": 4.6158854194084044e-06,
+ "loss": 0.5716,
+ "step": 2351
+ },
+ {
+ "epoch": 1.1120567375886525,
+ "grad_norm": 3.0585904121398926,
+ "learning_rate": 4.6155530904872246e-06,
+ "loss": 0.4998,
+ "step": 2352
+ },
+ {
+ "epoch": 1.112529550827423,
+ "grad_norm": 2.660961627960205,
+ "learning_rate": 4.61522062983983e-06,
+ "loss": 0.4533,
+ "step": 2353
+ },
+ {
+ "epoch": 1.113002364066194,
+ "grad_norm": 2.8042070865631104,
+ "learning_rate": 4.614888037486923e-06,
+ "loss": 0.5592,
+ "step": 2354
+ },
+ {
+ "epoch": 1.1134751773049645,
+ "grad_norm": 2.681664228439331,
+ "learning_rate": 4.61455531344921e-06,
+ "loss": 0.5439,
+ "step": 2355
+ },
+ {
+ "epoch": 1.1139479905437353,
+ "grad_norm": 2.905054807662964,
+ "learning_rate": 4.61422245774741e-06,
+ "loss": 0.5497,
+ "step": 2356
+ },
+ {
+ "epoch": 1.1144208037825059,
+ "grad_norm": 2.7979753017425537,
+ "learning_rate": 4.6138894704022484e-06,
+ "loss": 0.5374,
+ "step": 2357
+ },
+ {
+ "epoch": 1.1148936170212767,
+ "grad_norm": 2.965611696243286,
+ "learning_rate": 4.613556351434458e-06,
+ "loss": 0.5145,
+ "step": 2358
+ },
+ {
+ "epoch": 1.1153664302600472,
+ "grad_norm": 2.583134889602661,
+ "learning_rate": 4.613223100864782e-06,
+ "loss": 0.535,
+ "step": 2359
+ },
+ {
+ "epoch": 1.115839243498818,
+ "grad_norm": 2.5979621410369873,
+ "learning_rate": 4.61288971871397e-06,
+ "loss": 0.5514,
+ "step": 2360
+ },
+ {
+ "epoch": 1.1163120567375886,
+ "grad_norm": 3.0117669105529785,
+ "learning_rate": 4.612556205002779e-06,
+ "loss": 0.5266,
+ "step": 2361
+ },
+ {
+ "epoch": 1.1167848699763594,
+ "grad_norm": 2.425133466720581,
+ "learning_rate": 4.612222559751976e-06,
+ "loss": 0.4838,
+ "step": 2362
+ },
+ {
+ "epoch": 1.11725768321513,
+ "grad_norm": 2.5102691650390625,
+ "learning_rate": 4.611888782982337e-06,
+ "loss": 0.3947,
+ "step": 2363
+ },
+ {
+ "epoch": 1.1177304964539008,
+ "grad_norm": 3.0327367782592773,
+ "learning_rate": 4.611554874714645e-06,
+ "loss": 0.5753,
+ "step": 2364
+ },
+ {
+ "epoch": 1.1182033096926713,
+ "grad_norm": 2.4561009407043457,
+ "learning_rate": 4.6112208349696875e-06,
+ "loss": 0.5054,
+ "step": 2365
+ },
+ {
+ "epoch": 1.1186761229314421,
+ "grad_norm": 3.3898050785064697,
+ "learning_rate": 4.610886663768267e-06,
+ "loss": 0.5946,
+ "step": 2366
+ },
+ {
+ "epoch": 1.1191489361702127,
+ "grad_norm": 2.8112242221832275,
+ "learning_rate": 4.61055236113119e-06,
+ "loss": 0.5475,
+ "step": 2367
+ },
+ {
+ "epoch": 1.1196217494089835,
+ "grad_norm": 3.152946710586548,
+ "learning_rate": 4.610217927079272e-06,
+ "loss": 0.5165,
+ "step": 2368
+ },
+ {
+ "epoch": 1.120094562647754,
+ "grad_norm": 2.7847867012023926,
+ "learning_rate": 4.609883361633336e-06,
+ "loss": 0.5533,
+ "step": 2369
+ },
+ {
+ "epoch": 1.1205673758865249,
+ "grad_norm": 2.6376686096191406,
+ "learning_rate": 4.6095486648142155e-06,
+ "loss": 0.4942,
+ "step": 2370
+ },
+ {
+ "epoch": 1.1210401891252955,
+ "grad_norm": 3.123072862625122,
+ "learning_rate": 4.609213836642749e-06,
+ "loss": 0.616,
+ "step": 2371
+ },
+ {
+ "epoch": 1.1215130023640663,
+ "grad_norm": 2.802694320678711,
+ "learning_rate": 4.608878877139786e-06,
+ "loss": 0.5323,
+ "step": 2372
+ },
+ {
+ "epoch": 1.1219858156028368,
+ "grad_norm": 2.3567938804626465,
+ "learning_rate": 4.6085437863261825e-06,
+ "loss": 0.4822,
+ "step": 2373
+ },
+ {
+ "epoch": 1.1224586288416076,
+ "grad_norm": 2.553112030029297,
+ "learning_rate": 4.608208564222804e-06,
+ "loss": 0.5447,
+ "step": 2374
+ },
+ {
+ "epoch": 1.1229314420803782,
+ "grad_norm": 3.0020132064819336,
+ "learning_rate": 4.607873210850521e-06,
+ "loss": 0.6486,
+ "step": 2375
+ },
+ {
+ "epoch": 1.123404255319149,
+ "grad_norm": 2.832442045211792,
+ "learning_rate": 4.607537726230216e-06,
+ "loss": 0.5257,
+ "step": 2376
+ },
+ {
+ "epoch": 1.1238770685579196,
+ "grad_norm": 2.471527099609375,
+ "learning_rate": 4.607202110382778e-06,
+ "loss": 0.4816,
+ "step": 2377
+ },
+ {
+ "epoch": 1.1243498817966904,
+ "grad_norm": 2.4232118129730225,
+ "learning_rate": 4.606866363329105e-06,
+ "loss": 0.5533,
+ "step": 2378
+ },
+ {
+ "epoch": 1.124822695035461,
+ "grad_norm": 2.477506637573242,
+ "learning_rate": 4.6065304850901025e-06,
+ "loss": 0.5223,
+ "step": 2379
+ },
+ {
+ "epoch": 1.1252955082742317,
+ "grad_norm": 3.54127836227417,
+ "learning_rate": 4.6061944756866824e-06,
+ "loss": 0.6514,
+ "step": 2380
+ },
+ {
+ "epoch": 1.1257683215130023,
+ "grad_norm": 2.5148677825927734,
+ "learning_rate": 4.605858335139768e-06,
+ "loss": 0.4864,
+ "step": 2381
+ },
+ {
+ "epoch": 1.1262411347517731,
+ "grad_norm": 2.8363659381866455,
+ "learning_rate": 4.605522063470289e-06,
+ "loss": 0.5034,
+ "step": 2382
+ },
+ {
+ "epoch": 1.1267139479905437,
+ "grad_norm": 2.4996654987335205,
+ "learning_rate": 4.605185660699184e-06,
+ "loss": 0.4126,
+ "step": 2383
+ },
+ {
+ "epoch": 1.1271867612293145,
+ "grad_norm": 2.352543830871582,
+ "learning_rate": 4.604849126847398e-06,
+ "loss": 0.5224,
+ "step": 2384
+ },
+ {
+ "epoch": 1.127659574468085,
+ "grad_norm": 2.60101056098938,
+ "learning_rate": 4.6045124619358875e-06,
+ "loss": 0.4867,
+ "step": 2385
+ },
+ {
+ "epoch": 1.1281323877068559,
+ "grad_norm": 2.9471068382263184,
+ "learning_rate": 4.604175665985613e-06,
+ "loss": 0.6474,
+ "step": 2386
+ },
+ {
+ "epoch": 1.1286052009456264,
+ "grad_norm": 2.5933351516723633,
+ "learning_rate": 4.603838739017546e-06,
+ "loss": 0.5081,
+ "step": 2387
+ },
+ {
+ "epoch": 1.1290780141843972,
+ "grad_norm": 2.3740346431732178,
+ "learning_rate": 4.6035016810526665e-06,
+ "loss": 0.4438,
+ "step": 2388
+ },
+ {
+ "epoch": 1.1295508274231678,
+ "grad_norm": 2.675020217895508,
+ "learning_rate": 4.6031644921119614e-06,
+ "loss": 0.4968,
+ "step": 2389
+ },
+ {
+ "epoch": 1.1300236406619386,
+ "grad_norm": 2.599472999572754,
+ "learning_rate": 4.602827172216424e-06,
+ "loss": 0.5131,
+ "step": 2390
+ },
+ {
+ "epoch": 1.1304964539007092,
+ "grad_norm": 2.8176097869873047,
+ "learning_rate": 4.602489721387061e-06,
+ "loss": 0.5549,
+ "step": 2391
+ },
+ {
+ "epoch": 1.13096926713948,
+ "grad_norm": 2.466914176940918,
+ "learning_rate": 4.602152139644881e-06,
+ "loss": 0.5052,
+ "step": 2392
+ },
+ {
+ "epoch": 1.1314420803782506,
+ "grad_norm": 2.8938796520233154,
+ "learning_rate": 4.601814427010905e-06,
+ "loss": 0.6181,
+ "step": 2393
+ },
+ {
+ "epoch": 1.1319148936170214,
+ "grad_norm": 2.7390825748443604,
+ "learning_rate": 4.601476583506161e-06,
+ "loss": 0.5178,
+ "step": 2394
+ },
+ {
+ "epoch": 1.132387706855792,
+ "grad_norm": 3.180112838745117,
+ "learning_rate": 4.601138609151685e-06,
+ "loss": 0.6071,
+ "step": 2395
+ },
+ {
+ "epoch": 1.1328605200945627,
+ "grad_norm": 2.9282350540161133,
+ "learning_rate": 4.600800503968521e-06,
+ "loss": 0.5557,
+ "step": 2396
+ },
+ {
+ "epoch": 1.1333333333333333,
+ "grad_norm": 2.6689717769622803,
+ "learning_rate": 4.6004622679777215e-06,
+ "loss": 0.4679,
+ "step": 2397
+ },
+ {
+ "epoch": 1.133806146572104,
+ "grad_norm": 2.651582956314087,
+ "learning_rate": 4.600123901200347e-06,
+ "loss": 0.4907,
+ "step": 2398
+ },
+ {
+ "epoch": 1.1342789598108747,
+ "grad_norm": 2.5702924728393555,
+ "learning_rate": 4.599785403657464e-06,
+ "loss": 0.4919,
+ "step": 2399
+ },
+ {
+ "epoch": 1.1347517730496455,
+ "grad_norm": 2.636812448501587,
+ "learning_rate": 4.599446775370153e-06,
+ "loss": 0.5091,
+ "step": 2400
+ },
+ {
+ "epoch": 1.135224586288416,
+ "grad_norm": 2.5965442657470703,
+ "learning_rate": 4.599108016359497e-06,
+ "loss": 0.5035,
+ "step": 2401
+ },
+ {
+ "epoch": 1.1356973995271868,
+ "grad_norm": 2.689732313156128,
+ "learning_rate": 4.5987691266465885e-06,
+ "loss": 0.5307,
+ "step": 2402
+ },
+ {
+ "epoch": 1.1361702127659574,
+ "grad_norm": 2.7256956100463867,
+ "learning_rate": 4.59843010625253e-06,
+ "loss": 0.5066,
+ "step": 2403
+ },
+ {
+ "epoch": 1.1366430260047282,
+ "grad_norm": 2.726020574569702,
+ "learning_rate": 4.59809095519843e-06,
+ "loss": 0.4805,
+ "step": 2404
+ },
+ {
+ "epoch": 1.1371158392434988,
+ "grad_norm": 2.703339099884033,
+ "learning_rate": 4.597751673505406e-06,
+ "loss": 0.4992,
+ "step": 2405
+ },
+ {
+ "epoch": 1.1375886524822696,
+ "grad_norm": 2.54455304145813,
+ "learning_rate": 4.5974122611945835e-06,
+ "loss": 0.5251,
+ "step": 2406
+ },
+ {
+ "epoch": 1.1380614657210402,
+ "grad_norm": 2.623507022857666,
+ "learning_rate": 4.597072718287096e-06,
+ "loss": 0.4831,
+ "step": 2407
+ },
+ {
+ "epoch": 1.138534278959811,
+ "grad_norm": 2.653590202331543,
+ "learning_rate": 4.596733044804086e-06,
+ "loss": 0.5646,
+ "step": 2408
+ },
+ {
+ "epoch": 1.1390070921985815,
+ "grad_norm": 2.8230600357055664,
+ "learning_rate": 4.5963932407667035e-06,
+ "loss": 0.514,
+ "step": 2409
+ },
+ {
+ "epoch": 1.1394799054373523,
+ "grad_norm": 2.6077451705932617,
+ "learning_rate": 4.5960533061961065e-06,
+ "loss": 0.4713,
+ "step": 2410
+ },
+ {
+ "epoch": 1.139952718676123,
+ "grad_norm": 2.3945798873901367,
+ "learning_rate": 4.595713241113461e-06,
+ "loss": 0.466,
+ "step": 2411
+ },
+ {
+ "epoch": 1.1404255319148937,
+ "grad_norm": 2.8100006580352783,
+ "learning_rate": 4.595373045539941e-06,
+ "loss": 0.5365,
+ "step": 2412
+ },
+ {
+ "epoch": 1.1408983451536643,
+ "grad_norm": 2.6825881004333496,
+ "learning_rate": 4.59503271949673e-06,
+ "loss": 0.4457,
+ "step": 2413
+ },
+ {
+ "epoch": 1.141371158392435,
+ "grad_norm": 2.969435691833496,
+ "learning_rate": 4.594692263005016e-06,
+ "loss": 0.5459,
+ "step": 2414
+ },
+ {
+ "epoch": 1.1418439716312057,
+ "grad_norm": 2.4103164672851562,
+ "learning_rate": 4.594351676086002e-06,
+ "loss": 0.4573,
+ "step": 2415
+ },
+ {
+ "epoch": 1.1423167848699765,
+ "grad_norm": 2.9450128078460693,
+ "learning_rate": 4.594010958760892e-06,
+ "loss": 0.5529,
+ "step": 2416
+ },
+ {
+ "epoch": 1.142789598108747,
+ "grad_norm": 2.6416335105895996,
+ "learning_rate": 4.593670111050901e-06,
+ "loss": 0.5153,
+ "step": 2417
+ },
+ {
+ "epoch": 1.1432624113475178,
+ "grad_norm": 2.473177194595337,
+ "learning_rate": 4.593329132977253e-06,
+ "loss": 0.4962,
+ "step": 2418
+ },
+ {
+ "epoch": 1.1437352245862884,
+ "grad_norm": 2.4494502544403076,
+ "learning_rate": 4.592988024561179e-06,
+ "loss": 0.5182,
+ "step": 2419
+ },
+ {
+ "epoch": 1.1442080378250592,
+ "grad_norm": 2.773930311203003,
+ "learning_rate": 4.592646785823918e-06,
+ "loss": 0.4442,
+ "step": 2420
+ },
+ {
+ "epoch": 1.1446808510638298,
+ "grad_norm": 2.4733314514160156,
+ "learning_rate": 4.592305416786718e-06,
+ "loss": 0.5106,
+ "step": 2421
+ },
+ {
+ "epoch": 1.1451536643026006,
+ "grad_norm": 2.6870038509368896,
+ "learning_rate": 4.591963917470834e-06,
+ "loss": 0.5316,
+ "step": 2422
+ },
+ {
+ "epoch": 1.1456264775413711,
+ "grad_norm": 2.8989531993865967,
+ "learning_rate": 4.591622287897529e-06,
+ "loss": 0.5906,
+ "step": 2423
+ },
+ {
+ "epoch": 1.1460992907801417,
+ "grad_norm": 2.6349124908447266,
+ "learning_rate": 4.591280528088077e-06,
+ "loss": 0.6225,
+ "step": 2424
+ },
+ {
+ "epoch": 1.1465721040189125,
+ "grad_norm": 3.19022274017334,
+ "learning_rate": 4.5909386380637555e-06,
+ "loss": 0.555,
+ "step": 2425
+ },
+ {
+ "epoch": 1.1470449172576833,
+ "grad_norm": 3.1473541259765625,
+ "learning_rate": 4.5905966178458535e-06,
+ "loss": 0.537,
+ "step": 2426
+ },
+ {
+ "epoch": 1.147517730496454,
+ "grad_norm": 2.6996145248413086,
+ "learning_rate": 4.590254467455667e-06,
+ "loss": 0.565,
+ "step": 2427
+ },
+ {
+ "epoch": 1.1479905437352245,
+ "grad_norm": 2.830188274383545,
+ "learning_rate": 4.5899121869145015e-06,
+ "loss": 0.6773,
+ "step": 2428
+ },
+ {
+ "epoch": 1.1484633569739953,
+ "grad_norm": 2.4937260150909424,
+ "learning_rate": 4.589569776243667e-06,
+ "loss": 0.5484,
+ "step": 2429
+ },
+ {
+ "epoch": 1.148936170212766,
+ "grad_norm": 2.54011869430542,
+ "learning_rate": 4.589227235464486e-06,
+ "loss": 0.5307,
+ "step": 2430
+ },
+ {
+ "epoch": 1.1494089834515366,
+ "grad_norm": 2.8764214515686035,
+ "learning_rate": 4.5888845645982845e-06,
+ "loss": 0.5296,
+ "step": 2431
+ },
+ {
+ "epoch": 1.1498817966903072,
+ "grad_norm": 2.637033462524414,
+ "learning_rate": 4.588541763666402e-06,
+ "loss": 0.5975,
+ "step": 2432
+ },
+ {
+ "epoch": 1.150354609929078,
+ "grad_norm": 2.8534255027770996,
+ "learning_rate": 4.5881988326901815e-06,
+ "loss": 0.5431,
+ "step": 2433
+ },
+ {
+ "epoch": 1.1508274231678488,
+ "grad_norm": 2.8546559810638428,
+ "learning_rate": 4.587855771690976e-06,
+ "loss": 0.469,
+ "step": 2434
+ },
+ {
+ "epoch": 1.1513002364066194,
+ "grad_norm": 2.9084973335266113,
+ "learning_rate": 4.587512580690146e-06,
+ "loss": 0.5566,
+ "step": 2435
+ },
+ {
+ "epoch": 1.15177304964539,
+ "grad_norm": 3.0993130207061768,
+ "learning_rate": 4.587169259709063e-06,
+ "loss": 0.5612,
+ "step": 2436
+ },
+ {
+ "epoch": 1.1522458628841608,
+ "grad_norm": 10.847400665283203,
+ "learning_rate": 4.5868258087691e-06,
+ "loss": 0.4678,
+ "step": 2437
+ },
+ {
+ "epoch": 1.1527186761229316,
+ "grad_norm": 2.6648571491241455,
+ "learning_rate": 4.586482227891645e-06,
+ "loss": 0.5951,
+ "step": 2438
+ },
+ {
+ "epoch": 1.1531914893617021,
+ "grad_norm": 2.529043197631836,
+ "learning_rate": 4.586138517098091e-06,
+ "loss": 0.5048,
+ "step": 2439
+ },
+ {
+ "epoch": 1.1536643026004727,
+ "grad_norm": 2.833904504776001,
+ "learning_rate": 4.585794676409839e-06,
+ "loss": 0.536,
+ "step": 2440
+ },
+ {
+ "epoch": 1.1541371158392435,
+ "grad_norm": 3.507657766342163,
+ "learning_rate": 4.585450705848298e-06,
+ "loss": 0.5954,
+ "step": 2441
+ },
+ {
+ "epoch": 1.1546099290780143,
+ "grad_norm": 2.6108388900756836,
+ "learning_rate": 4.585106605434887e-06,
+ "loss": 0.5684,
+ "step": 2442
+ },
+ {
+ "epoch": 1.1550827423167849,
+ "grad_norm": 2.490708589553833,
+ "learning_rate": 4.58476237519103e-06,
+ "loss": 0.4678,
+ "step": 2443
+ },
+ {
+ "epoch": 1.1555555555555554,
+ "grad_norm": 2.8192343711853027,
+ "learning_rate": 4.584418015138161e-06,
+ "loss": 0.5291,
+ "step": 2444
+ },
+ {
+ "epoch": 1.1560283687943262,
+ "grad_norm": 3.0878679752349854,
+ "learning_rate": 4.584073525297722e-06,
+ "loss": 0.5691,
+ "step": 2445
+ },
+ {
+ "epoch": 1.156501182033097,
+ "grad_norm": 3.1444318294525146,
+ "learning_rate": 4.583728905691163e-06,
+ "loss": 0.5643,
+ "step": 2446
+ },
+ {
+ "epoch": 1.1569739952718676,
+ "grad_norm": 3.02382230758667,
+ "learning_rate": 4.583384156339942e-06,
+ "loss": 0.6008,
+ "step": 2447
+ },
+ {
+ "epoch": 1.1574468085106382,
+ "grad_norm": 2.5942490100860596,
+ "learning_rate": 4.583039277265525e-06,
+ "loss": 0.5105,
+ "step": 2448
+ },
+ {
+ "epoch": 1.157919621749409,
+ "grad_norm": 2.938608407974243,
+ "learning_rate": 4.582694268489386e-06,
+ "loss": 0.5123,
+ "step": 2449
+ },
+ {
+ "epoch": 1.1583924349881798,
+ "grad_norm": 2.4622268676757812,
+ "learning_rate": 4.5823491300330075e-06,
+ "loss": 0.4538,
+ "step": 2450
+ },
+ {
+ "epoch": 1.1588652482269504,
+ "grad_norm": 2.4380505084991455,
+ "learning_rate": 4.5820038619178795e-06,
+ "loss": 0.4682,
+ "step": 2451
+ },
+ {
+ "epoch": 1.159338061465721,
+ "grad_norm": 2.479896068572998,
+ "learning_rate": 4.581658464165501e-06,
+ "loss": 0.4877,
+ "step": 2452
+ },
+ {
+ "epoch": 1.1598108747044917,
+ "grad_norm": 2.3373546600341797,
+ "learning_rate": 4.5813129367973765e-06,
+ "loss": 0.445,
+ "step": 2453
+ },
+ {
+ "epoch": 1.1602836879432625,
+ "grad_norm": 2.8586013317108154,
+ "learning_rate": 4.5809672798350214e-06,
+ "loss": 0.5232,
+ "step": 2454
+ },
+ {
+ "epoch": 1.160756501182033,
+ "grad_norm": 3.2302439212799072,
+ "learning_rate": 4.5806214932999595e-06,
+ "loss": 0.5336,
+ "step": 2455
+ },
+ {
+ "epoch": 1.1612293144208037,
+ "grad_norm": 3.1005783081054688,
+ "learning_rate": 4.580275577213721e-06,
+ "loss": 0.5123,
+ "step": 2456
+ },
+ {
+ "epoch": 1.1617021276595745,
+ "grad_norm": 2.7131073474884033,
+ "learning_rate": 4.579929531597842e-06,
+ "loss": 0.5648,
+ "step": 2457
+ },
+ {
+ "epoch": 1.1621749408983453,
+ "grad_norm": 2.5067050457000732,
+ "learning_rate": 4.579583356473874e-06,
+ "loss": 0.5324,
+ "step": 2458
+ },
+ {
+ "epoch": 1.1626477541371159,
+ "grad_norm": 2.7870543003082275,
+ "learning_rate": 4.579237051863366e-06,
+ "loss": 0.5094,
+ "step": 2459
+ },
+ {
+ "epoch": 1.1631205673758864,
+ "grad_norm": 2.739196300506592,
+ "learning_rate": 4.578890617787887e-06,
+ "loss": 0.5103,
+ "step": 2460
+ },
+ {
+ "epoch": 1.1635933806146572,
+ "grad_norm": 2.7108185291290283,
+ "learning_rate": 4.578544054269003e-06,
+ "loss": 0.533,
+ "step": 2461
+ },
+ {
+ "epoch": 1.1640661938534278,
+ "grad_norm": 3.028005361557007,
+ "learning_rate": 4.578197361328295e-06,
+ "loss": 0.636,
+ "step": 2462
+ },
+ {
+ "epoch": 1.1645390070921986,
+ "grad_norm": 2.4855129718780518,
+ "learning_rate": 4.5778505389873505e-06,
+ "loss": 0.501,
+ "step": 2463
+ },
+ {
+ "epoch": 1.1650118203309692,
+ "grad_norm": 2.6314198970794678,
+ "learning_rate": 4.577503587267764e-06,
+ "loss": 0.5812,
+ "step": 2464
+ },
+ {
+ "epoch": 1.16548463356974,
+ "grad_norm": 2.4209671020507812,
+ "learning_rate": 4.5771565061911385e-06,
+ "loss": 0.5168,
+ "step": 2465
+ },
+ {
+ "epoch": 1.1659574468085105,
+ "grad_norm": 2.526388645172119,
+ "learning_rate": 4.576809295779085e-06,
+ "loss": 0.5047,
+ "step": 2466
+ },
+ {
+ "epoch": 1.1664302600472813,
+ "grad_norm": 2.8278191089630127,
+ "learning_rate": 4.576461956053224e-06,
+ "loss": 0.4759,
+ "step": 2467
+ },
+ {
+ "epoch": 1.166903073286052,
+ "grad_norm": 2.7862167358398438,
+ "learning_rate": 4.576114487035182e-06,
+ "loss": 0.5492,
+ "step": 2468
+ },
+ {
+ "epoch": 1.1673758865248227,
+ "grad_norm": 2.6303019523620605,
+ "learning_rate": 4.575766888746594e-06,
+ "loss": 0.5538,
+ "step": 2469
+ },
+ {
+ "epoch": 1.1678486997635933,
+ "grad_norm": 2.613104820251465,
+ "learning_rate": 4.5754191612091034e-06,
+ "loss": 0.5114,
+ "step": 2470
+ },
+ {
+ "epoch": 1.168321513002364,
+ "grad_norm": 2.653958320617676,
+ "learning_rate": 4.5750713044443625e-06,
+ "loss": 0.5858,
+ "step": 2471
+ },
+ {
+ "epoch": 1.1687943262411347,
+ "grad_norm": 3.1143975257873535,
+ "learning_rate": 4.574723318474031e-06,
+ "loss": 0.5193,
+ "step": 2472
+ },
+ {
+ "epoch": 1.1692671394799055,
+ "grad_norm": 3.05454421043396,
+ "learning_rate": 4.574375203319775e-06,
+ "loss": 0.464,
+ "step": 2473
+ },
+ {
+ "epoch": 1.169739952718676,
+ "grad_norm": 2.66626238822937,
+ "learning_rate": 4.574026959003272e-06,
+ "loss": 0.4988,
+ "step": 2474
+ },
+ {
+ "epoch": 1.1702127659574468,
+ "grad_norm": 2.8871963024139404,
+ "learning_rate": 4.573678585546203e-06,
+ "loss": 0.5557,
+ "step": 2475
+ },
+ {
+ "epoch": 1.1706855791962174,
+ "grad_norm": 2.592949628829956,
+ "learning_rate": 4.573330082970262e-06,
+ "loss": 0.5178,
+ "step": 2476
+ },
+ {
+ "epoch": 1.1711583924349882,
+ "grad_norm": 2.9111456871032715,
+ "learning_rate": 4.572981451297148e-06,
+ "loss": 0.5712,
+ "step": 2477
+ },
+ {
+ "epoch": 1.1716312056737588,
+ "grad_norm": 2.8152248859405518,
+ "learning_rate": 4.57263269054857e-06,
+ "loss": 0.5548,
+ "step": 2478
+ },
+ {
+ "epoch": 1.1721040189125296,
+ "grad_norm": 3.0292418003082275,
+ "learning_rate": 4.572283800746241e-06,
+ "loss": 0.5937,
+ "step": 2479
+ },
+ {
+ "epoch": 1.1725768321513002,
+ "grad_norm": 3.454618215560913,
+ "learning_rate": 4.571934781911886e-06,
+ "loss": 0.5537,
+ "step": 2480
+ },
+ {
+ "epoch": 1.173049645390071,
+ "grad_norm": 2.7817866802215576,
+ "learning_rate": 4.571585634067239e-06,
+ "loss": 0.5649,
+ "step": 2481
+ },
+ {
+ "epoch": 1.1735224586288415,
+ "grad_norm": 2.7989349365234375,
+ "learning_rate": 4.571236357234037e-06,
+ "loss": 0.5448,
+ "step": 2482
+ },
+ {
+ "epoch": 1.1739952718676123,
+ "grad_norm": 2.8863933086395264,
+ "learning_rate": 4.57088695143403e-06,
+ "loss": 0.63,
+ "step": 2483
+ },
+ {
+ "epoch": 1.174468085106383,
+ "grad_norm": 2.5738039016723633,
+ "learning_rate": 4.570537416688972e-06,
+ "loss": 0.4702,
+ "step": 2484
+ },
+ {
+ "epoch": 1.1749408983451537,
+ "grad_norm": 3.003643274307251,
+ "learning_rate": 4.570187753020629e-06,
+ "loss": 0.5918,
+ "step": 2485
+ },
+ {
+ "epoch": 1.1754137115839243,
+ "grad_norm": 2.8619167804718018,
+ "learning_rate": 4.569837960450772e-06,
+ "loss": 0.5268,
+ "step": 2486
+ },
+ {
+ "epoch": 1.175886524822695,
+ "grad_norm": 2.876077175140381,
+ "learning_rate": 4.569488039001181e-06,
+ "loss": 0.4915,
+ "step": 2487
+ },
+ {
+ "epoch": 1.1763593380614656,
+ "grad_norm": 3.407115936279297,
+ "learning_rate": 4.569137988693644e-06,
+ "loss": 0.5761,
+ "step": 2488
+ },
+ {
+ "epoch": 1.1768321513002364,
+ "grad_norm": 2.7292826175689697,
+ "learning_rate": 4.568787809549958e-06,
+ "loss": 0.541,
+ "step": 2489
+ },
+ {
+ "epoch": 1.177304964539007,
+ "grad_norm": 2.8805999755859375,
+ "learning_rate": 4.568437501591926e-06,
+ "loss": 0.6223,
+ "step": 2490
+ },
+ {
+ "epoch": 1.1777777777777778,
+ "grad_norm": 2.9264373779296875,
+ "learning_rate": 4.56808706484136e-06,
+ "loss": 0.6081,
+ "step": 2491
+ },
+ {
+ "epoch": 1.1782505910165484,
+ "grad_norm": 2.5167033672332764,
+ "learning_rate": 4.567736499320082e-06,
+ "loss": 0.5393,
+ "step": 2492
+ },
+ {
+ "epoch": 1.1787234042553192,
+ "grad_norm": 3.4647862911224365,
+ "learning_rate": 4.567385805049918e-06,
+ "loss": 0.4826,
+ "step": 2493
+ },
+ {
+ "epoch": 1.1791962174940898,
+ "grad_norm": 2.9824202060699463,
+ "learning_rate": 4.5670349820527055e-06,
+ "loss": 0.541,
+ "step": 2494
+ },
+ {
+ "epoch": 1.1796690307328606,
+ "grad_norm": 2.997105836868286,
+ "learning_rate": 4.5666840303502885e-06,
+ "loss": 0.5771,
+ "step": 2495
+ },
+ {
+ "epoch": 1.1801418439716311,
+ "grad_norm": 2.8728017807006836,
+ "learning_rate": 4.56633294996452e-06,
+ "loss": 0.4877,
+ "step": 2496
+ },
+ {
+ "epoch": 1.180614657210402,
+ "grad_norm": 2.626498222351074,
+ "learning_rate": 4.5659817409172565e-06,
+ "loss": 0.5296,
+ "step": 2497
+ },
+ {
+ "epoch": 1.1810874704491725,
+ "grad_norm": 2.87037992477417,
+ "learning_rate": 4.565630403230371e-06,
+ "loss": 0.539,
+ "step": 2498
+ },
+ {
+ "epoch": 1.1815602836879433,
+ "grad_norm": 2.5719685554504395,
+ "learning_rate": 4.5652789369257375e-06,
+ "loss": 0.5653,
+ "step": 2499
+ },
+ {
+ "epoch": 1.1820330969267139,
+ "grad_norm": 2.4842135906219482,
+ "learning_rate": 4.56492734202524e-06,
+ "loss": 0.515,
+ "step": 2500
+ },
+ {
+ "epoch": 1.1825059101654847,
+ "grad_norm": 2.640951156616211,
+ "learning_rate": 4.564575618550773e-06,
+ "loss": 0.5601,
+ "step": 2501
+ },
+ {
+ "epoch": 1.1829787234042553,
+ "grad_norm": 2.624394655227661,
+ "learning_rate": 4.564223766524234e-06,
+ "loss": 0.5551,
+ "step": 2502
+ },
+ {
+ "epoch": 1.183451536643026,
+ "grad_norm": 3.014537811279297,
+ "learning_rate": 4.563871785967533e-06,
+ "loss": 0.5212,
+ "step": 2503
+ },
+ {
+ "epoch": 1.1839243498817966,
+ "grad_norm": 2.8756890296936035,
+ "learning_rate": 4.563519676902585e-06,
+ "loss": 0.5132,
+ "step": 2504
+ },
+ {
+ "epoch": 1.1843971631205674,
+ "grad_norm": 2.636781692504883,
+ "learning_rate": 4.5631674393513145e-06,
+ "loss": 0.5323,
+ "step": 2505
+ },
+ {
+ "epoch": 1.184869976359338,
+ "grad_norm": 2.7233786582946777,
+ "learning_rate": 4.562815073335655e-06,
+ "loss": 0.5608,
+ "step": 2506
+ },
+ {
+ "epoch": 1.1853427895981088,
+ "grad_norm": 2.7158713340759277,
+ "learning_rate": 4.562462578877546e-06,
+ "loss": 0.5373,
+ "step": 2507
+ },
+ {
+ "epoch": 1.1858156028368794,
+ "grad_norm": 2.9754762649536133,
+ "learning_rate": 4.562109955998936e-06,
+ "loss": 0.5712,
+ "step": 2508
+ },
+ {
+ "epoch": 1.1862884160756502,
+ "grad_norm": 2.8815054893493652,
+ "learning_rate": 4.561757204721781e-06,
+ "loss": 0.6126,
+ "step": 2509
+ },
+ {
+ "epoch": 1.1867612293144207,
+ "grad_norm": 2.866319417953491,
+ "learning_rate": 4.561404325068045e-06,
+ "loss": 0.506,
+ "step": 2510
+ },
+ {
+ "epoch": 1.1872340425531915,
+ "grad_norm": 2.6187376976013184,
+ "learning_rate": 4.561051317059701e-06,
+ "loss": 0.4674,
+ "step": 2511
+ },
+ {
+ "epoch": 1.1877068557919621,
+ "grad_norm": 2.642552137374878,
+ "learning_rate": 4.560698180718729e-06,
+ "loss": 0.4793,
+ "step": 2512
+ },
+ {
+ "epoch": 1.188179669030733,
+ "grad_norm": 2.7815041542053223,
+ "learning_rate": 4.560344916067117e-06,
+ "loss": 0.5034,
+ "step": 2513
+ },
+ {
+ "epoch": 1.1886524822695035,
+ "grad_norm": 2.70853590965271,
+ "learning_rate": 4.559991523126862e-06,
+ "loss": 0.4811,
+ "step": 2514
+ },
+ {
+ "epoch": 1.1891252955082743,
+ "grad_norm": 2.7049436569213867,
+ "learning_rate": 4.559638001919967e-06,
+ "loss": 0.547,
+ "step": 2515
+ },
+ {
+ "epoch": 1.1895981087470449,
+ "grad_norm": 2.766773223876953,
+ "learning_rate": 4.559284352468445e-06,
+ "loss": 0.5362,
+ "step": 2516
+ },
+ {
+ "epoch": 1.1900709219858157,
+ "grad_norm": 3.0064334869384766,
+ "learning_rate": 4.558930574794316e-06,
+ "loss": 0.5915,
+ "step": 2517
+ },
+ {
+ "epoch": 1.1905437352245862,
+ "grad_norm": 2.4899885654449463,
+ "learning_rate": 4.558576668919609e-06,
+ "loss": 0.4379,
+ "step": 2518
+ },
+ {
+ "epoch": 1.191016548463357,
+ "grad_norm": 2.925963878631592,
+ "learning_rate": 4.558222634866358e-06,
+ "loss": 0.5389,
+ "step": 2519
+ },
+ {
+ "epoch": 1.1914893617021276,
+ "grad_norm": 6.087667465209961,
+ "learning_rate": 4.55786847265661e-06,
+ "loss": 0.4777,
+ "step": 2520
+ },
+ {
+ "epoch": 1.1919621749408984,
+ "grad_norm": 2.4560582637786865,
+ "learning_rate": 4.5575141823124145e-06,
+ "loss": 0.5576,
+ "step": 2521
+ },
+ {
+ "epoch": 1.192434988179669,
+ "grad_norm": 3.184252977371216,
+ "learning_rate": 4.557159763855834e-06,
+ "loss": 0.5151,
+ "step": 2522
+ },
+ {
+ "epoch": 1.1929078014184398,
+ "grad_norm": 2.359722137451172,
+ "learning_rate": 4.556805217308935e-06,
+ "loss": 0.478,
+ "step": 2523
+ },
+ {
+ "epoch": 1.1933806146572103,
+ "grad_norm": 3.0821568965911865,
+ "learning_rate": 4.5564505426937935e-06,
+ "loss": 0.5784,
+ "step": 2524
+ },
+ {
+ "epoch": 1.1938534278959811,
+ "grad_norm": 2.9905128479003906,
+ "learning_rate": 4.5560957400324936e-06,
+ "loss": 0.6087,
+ "step": 2525
+ },
+ {
+ "epoch": 1.1943262411347517,
+ "grad_norm": 2.462102174758911,
+ "learning_rate": 4.555740809347128e-06,
+ "loss": 0.4739,
+ "step": 2526
+ },
+ {
+ "epoch": 1.1947990543735225,
+ "grad_norm": 2.7931067943573,
+ "learning_rate": 4.555385750659796e-06,
+ "loss": 0.4961,
+ "step": 2527
+ },
+ {
+ "epoch": 1.195271867612293,
+ "grad_norm": 2.660320997238159,
+ "learning_rate": 4.555030563992607e-06,
+ "loss": 0.487,
+ "step": 2528
+ },
+ {
+ "epoch": 1.195744680851064,
+ "grad_norm": 2.8135557174682617,
+ "learning_rate": 4.554675249367675e-06,
+ "loss": 0.5269,
+ "step": 2529
+ },
+ {
+ "epoch": 1.1962174940898345,
+ "grad_norm": 2.661933422088623,
+ "learning_rate": 4.554319806807126e-06,
+ "loss": 0.4723,
+ "step": 2530
+ },
+ {
+ "epoch": 1.1966903073286053,
+ "grad_norm": 2.568176507949829,
+ "learning_rate": 4.553964236333089e-06,
+ "loss": 0.5258,
+ "step": 2531
+ },
+ {
+ "epoch": 1.1971631205673758,
+ "grad_norm": 2.6890947818756104,
+ "learning_rate": 4.553608537967705e-06,
+ "loss": 0.4965,
+ "step": 2532
+ },
+ {
+ "epoch": 1.1976359338061466,
+ "grad_norm": 3.133470058441162,
+ "learning_rate": 4.553252711733124e-06,
+ "loss": 0.5423,
+ "step": 2533
+ },
+ {
+ "epoch": 1.1981087470449172,
+ "grad_norm": 2.7086687088012695,
+ "learning_rate": 4.552896757651498e-06,
+ "loss": 0.5326,
+ "step": 2534
+ },
+ {
+ "epoch": 1.198581560283688,
+ "grad_norm": 2.8411715030670166,
+ "learning_rate": 4.552540675744994e-06,
+ "loss": 0.5793,
+ "step": 2535
+ },
+ {
+ "epoch": 1.1990543735224586,
+ "grad_norm": 3.041077136993408,
+ "learning_rate": 4.552184466035782e-06,
+ "loss": 0.5068,
+ "step": 2536
+ },
+ {
+ "epoch": 1.1995271867612294,
+ "grad_norm": 2.5921192169189453,
+ "learning_rate": 4.551828128546041e-06,
+ "loss": 0.5189,
+ "step": 2537
+ },
+ {
+ "epoch": 1.2,
+ "grad_norm": 2.923305034637451,
+ "learning_rate": 4.5514716632979605e-06,
+ "loss": 0.516,
+ "step": 2538
+ },
+ {
+ "epoch": 1.2004728132387708,
+ "grad_norm": 2.7083024978637695,
+ "learning_rate": 4.551115070313734e-06,
+ "loss": 0.4825,
+ "step": 2539
+ },
+ {
+ "epoch": 1.2009456264775413,
+ "grad_norm": 2.746842384338379,
+ "learning_rate": 4.550758349615567e-06,
+ "loss": 0.5691,
+ "step": 2540
+ },
+ {
+ "epoch": 1.2014184397163121,
+ "grad_norm": 2.6596429347991943,
+ "learning_rate": 4.550401501225669e-06,
+ "loss": 0.5983,
+ "step": 2541
+ },
+ {
+ "epoch": 1.2018912529550827,
+ "grad_norm": 2.9057931900024414,
+ "learning_rate": 4.550044525166261e-06,
+ "loss": 0.5069,
+ "step": 2542
+ },
+ {
+ "epoch": 1.2023640661938535,
+ "grad_norm": 2.6139039993286133,
+ "learning_rate": 4.5496874214595686e-06,
+ "loss": 0.5102,
+ "step": 2543
+ },
+ {
+ "epoch": 1.202836879432624,
+ "grad_norm": 2.630286455154419,
+ "learning_rate": 4.5493301901278285e-06,
+ "loss": 0.4902,
+ "step": 2544
+ },
+ {
+ "epoch": 1.2033096926713949,
+ "grad_norm": 2.639174222946167,
+ "learning_rate": 4.548972831193284e-06,
+ "loss": 0.4566,
+ "step": 2545
+ },
+ {
+ "epoch": 1.2037825059101654,
+ "grad_norm": 2.9569664001464844,
+ "learning_rate": 4.548615344678186e-06,
+ "loss": 0.5636,
+ "step": 2546
+ },
+ {
+ "epoch": 1.2042553191489362,
+ "grad_norm": 2.981734037399292,
+ "learning_rate": 4.5482577306047924e-06,
+ "loss": 0.4884,
+ "step": 2547
+ },
+ {
+ "epoch": 1.2047281323877068,
+ "grad_norm": 2.6760342121124268,
+ "learning_rate": 4.547899988995371e-06,
+ "loss": 0.5426,
+ "step": 2548
+ },
+ {
+ "epoch": 1.2052009456264776,
+ "grad_norm": 2.825805902481079,
+ "learning_rate": 4.547542119872198e-06,
+ "loss": 0.4989,
+ "step": 2549
+ },
+ {
+ "epoch": 1.2056737588652482,
+ "grad_norm": 2.856426954269409,
+ "learning_rate": 4.547184123257555e-06,
+ "loss": 0.5734,
+ "step": 2550
+ },
+ {
+ "epoch": 1.206146572104019,
+ "grad_norm": 2.555682420730591,
+ "learning_rate": 4.5468259991737334e-06,
+ "loss": 0.5299,
+ "step": 2551
+ },
+ {
+ "epoch": 1.2066193853427896,
+ "grad_norm": 2.6324024200439453,
+ "learning_rate": 4.546467747643032e-06,
+ "loss": 0.5906,
+ "step": 2552
+ },
+ {
+ "epoch": 1.2070921985815604,
+ "grad_norm": 3.4145350456237793,
+ "learning_rate": 4.546109368687757e-06,
+ "loss": 0.5153,
+ "step": 2553
+ },
+ {
+ "epoch": 1.207565011820331,
+ "grad_norm": 2.658691644668579,
+ "learning_rate": 4.545750862330225e-06,
+ "loss": 0.5759,
+ "step": 2554
+ },
+ {
+ "epoch": 1.2080378250591017,
+ "grad_norm": 3.162605047225952,
+ "learning_rate": 4.545392228592755e-06,
+ "loss": 0.5379,
+ "step": 2555
+ },
+ {
+ "epoch": 1.2085106382978723,
+ "grad_norm": 2.8631198406219482,
+ "learning_rate": 4.545033467497681e-06,
+ "loss": 0.5959,
+ "step": 2556
+ },
+ {
+ "epoch": 1.208983451536643,
+ "grad_norm": 2.457109212875366,
+ "learning_rate": 4.54467457906734e-06,
+ "loss": 0.4864,
+ "step": 2557
+ },
+ {
+ "epoch": 1.2094562647754137,
+ "grad_norm": 2.5307061672210693,
+ "learning_rate": 4.544315563324078e-06,
+ "loss": 0.5308,
+ "step": 2558
+ },
+ {
+ "epoch": 1.2099290780141845,
+ "grad_norm": 2.8482773303985596,
+ "learning_rate": 4.543956420290251e-06,
+ "loss": 0.5126,
+ "step": 2559
+ },
+ {
+ "epoch": 1.210401891252955,
+ "grad_norm": 2.4990832805633545,
+ "learning_rate": 4.5435971499882195e-06,
+ "loss": 0.4534,
+ "step": 2560
+ },
+ {
+ "epoch": 1.2108747044917259,
+ "grad_norm": 2.6292665004730225,
+ "learning_rate": 4.543237752440354e-06,
+ "loss": 0.4434,
+ "step": 2561
+ },
+ {
+ "epoch": 1.2113475177304964,
+ "grad_norm": 2.865983247756958,
+ "learning_rate": 4.542878227669033e-06,
+ "loss": 0.5667,
+ "step": 2562
+ },
+ {
+ "epoch": 1.2118203309692672,
+ "grad_norm": 2.745614528656006,
+ "learning_rate": 4.542518575696644e-06,
+ "loss": 0.4724,
+ "step": 2563
+ },
+ {
+ "epoch": 1.2122931442080378,
+ "grad_norm": 2.8562581539154053,
+ "learning_rate": 4.5421587965455785e-06,
+ "loss": 0.5405,
+ "step": 2564
+ },
+ {
+ "epoch": 1.2127659574468086,
+ "grad_norm": 2.6670095920562744,
+ "learning_rate": 4.5417988902382385e-06,
+ "loss": 0.5432,
+ "step": 2565
+ },
+ {
+ "epoch": 1.2132387706855792,
+ "grad_norm": 2.9320743083953857,
+ "learning_rate": 4.541438856797036e-06,
+ "loss": 0.5862,
+ "step": 2566
+ },
+ {
+ "epoch": 1.21371158392435,
+ "grad_norm": 2.577505588531494,
+ "learning_rate": 4.541078696244386e-06,
+ "loss": 0.4742,
+ "step": 2567
+ },
+ {
+ "epoch": 1.2141843971631205,
+ "grad_norm": 3.4476120471954346,
+ "learning_rate": 4.540718408602717e-06,
+ "loss": 0.5903,
+ "step": 2568
+ },
+ {
+ "epoch": 1.2146572104018913,
+ "grad_norm": 2.816210985183716,
+ "learning_rate": 4.540357993894459e-06,
+ "loss": 0.5033,
+ "step": 2569
+ },
+ {
+ "epoch": 1.215130023640662,
+ "grad_norm": 3.0806639194488525,
+ "learning_rate": 4.539997452142058e-06,
+ "loss": 0.6064,
+ "step": 2570
+ },
+ {
+ "epoch": 1.2156028368794327,
+ "grad_norm": 2.563060760498047,
+ "learning_rate": 4.5396367833679586e-06,
+ "loss": 0.5597,
+ "step": 2571
+ },
+ {
+ "epoch": 1.2160756501182033,
+ "grad_norm": 3.1014397144317627,
+ "learning_rate": 4.5392759875946215e-06,
+ "loss": 0.54,
+ "step": 2572
+ },
+ {
+ "epoch": 1.216548463356974,
+ "grad_norm": 3.124190330505371,
+ "learning_rate": 4.53891506484451e-06,
+ "loss": 0.5122,
+ "step": 2573
+ },
+ {
+ "epoch": 1.2170212765957447,
+ "grad_norm": 2.6688716411590576,
+ "learning_rate": 4.538554015140097e-06,
+ "loss": 0.5615,
+ "step": 2574
+ },
+ {
+ "epoch": 1.2174940898345155,
+ "grad_norm": 2.775543689727783,
+ "learning_rate": 4.538192838503866e-06,
+ "loss": 0.496,
+ "step": 2575
+ },
+ {
+ "epoch": 1.217966903073286,
+ "grad_norm": 2.7877283096313477,
+ "learning_rate": 4.537831534958303e-06,
+ "loss": 0.4995,
+ "step": 2576
+ },
+ {
+ "epoch": 1.2184397163120568,
+ "grad_norm": 2.824810028076172,
+ "learning_rate": 4.537470104525906e-06,
+ "loss": 0.5481,
+ "step": 2577
+ },
+ {
+ "epoch": 1.2189125295508274,
+ "grad_norm": 2.801269292831421,
+ "learning_rate": 4.53710854722918e-06,
+ "loss": 0.5628,
+ "step": 2578
+ },
+ {
+ "epoch": 1.2193853427895982,
+ "grad_norm": 2.7780683040618896,
+ "learning_rate": 4.536746863090637e-06,
+ "loss": 0.4845,
+ "step": 2579
+ },
+ {
+ "epoch": 1.2198581560283688,
+ "grad_norm": 2.536010265350342,
+ "learning_rate": 4.536385052132798e-06,
+ "loss": 0.4771,
+ "step": 2580
+ },
+ {
+ "epoch": 1.2203309692671396,
+ "grad_norm": 2.768775701522827,
+ "learning_rate": 4.536023114378191e-06,
+ "loss": 0.5366,
+ "step": 2581
+ },
+ {
+ "epoch": 1.2208037825059102,
+ "grad_norm": 2.658125877380371,
+ "learning_rate": 4.535661049849352e-06,
+ "loss": 0.524,
+ "step": 2582
+ },
+ {
+ "epoch": 1.2212765957446807,
+ "grad_norm": 2.558696746826172,
+ "learning_rate": 4.535298858568825e-06,
+ "loss": 0.5482,
+ "step": 2583
+ },
+ {
+ "epoch": 1.2217494089834515,
+ "grad_norm": 2.5284535884857178,
+ "learning_rate": 4.534936540559164e-06,
+ "loss": 0.4454,
+ "step": 2584
+ },
+ {
+ "epoch": 1.2222222222222223,
+ "grad_norm": 7.617330074310303,
+ "learning_rate": 4.534574095842927e-06,
+ "loss": 0.5615,
+ "step": 2585
+ },
+ {
+ "epoch": 1.222695035460993,
+ "grad_norm": 2.9120311737060547,
+ "learning_rate": 4.534211524442682e-06,
+ "loss": 0.5624,
+ "step": 2586
+ },
+ {
+ "epoch": 1.2231678486997635,
+ "grad_norm": 2.5004289150238037,
+ "learning_rate": 4.533848826381005e-06,
+ "loss": 0.4743,
+ "step": 2587
+ },
+ {
+ "epoch": 1.2236406619385343,
+ "grad_norm": 2.8395533561706543,
+ "learning_rate": 4.53348600168048e-06,
+ "loss": 0.4457,
+ "step": 2588
+ },
+ {
+ "epoch": 1.224113475177305,
+ "grad_norm": 2.832211494445801,
+ "learning_rate": 4.533123050363699e-06,
+ "loss": 0.5559,
+ "step": 2589
+ },
+ {
+ "epoch": 1.2245862884160756,
+ "grad_norm": 2.6318583488464355,
+ "learning_rate": 4.53275997245326e-06,
+ "loss": 0.5281,
+ "step": 2590
+ },
+ {
+ "epoch": 1.2250591016548462,
+ "grad_norm": 3.0509233474731445,
+ "learning_rate": 4.532396767971771e-06,
+ "loss": 0.6003,
+ "step": 2591
+ },
+ {
+ "epoch": 1.225531914893617,
+ "grad_norm": 2.6863620281219482,
+ "learning_rate": 4.532033436941847e-06,
+ "loss": 0.5219,
+ "step": 2592
+ },
+ {
+ "epoch": 1.2260047281323878,
+ "grad_norm": 2.401463747024536,
+ "learning_rate": 4.5316699793861104e-06,
+ "loss": 0.5994,
+ "step": 2593
+ },
+ {
+ "epoch": 1.2264775413711584,
+ "grad_norm": 2.613517999649048,
+ "learning_rate": 4.531306395327194e-06,
+ "loss": 0.5785,
+ "step": 2594
+ },
+ {
+ "epoch": 1.226950354609929,
+ "grad_norm": 2.5016374588012695,
+ "learning_rate": 4.530942684787735e-06,
+ "loss": 0.5695,
+ "step": 2595
+ },
+ {
+ "epoch": 1.2274231678486998,
+ "grad_norm": 2.576464891433716,
+ "learning_rate": 4.53057884779038e-06,
+ "loss": 0.4427,
+ "step": 2596
+ },
+ {
+ "epoch": 1.2278959810874706,
+ "grad_norm": 2.5688700675964355,
+ "learning_rate": 4.530214884357785e-06,
+ "loss": 0.4966,
+ "step": 2597
+ },
+ {
+ "epoch": 1.2283687943262411,
+ "grad_norm": 3.179013729095459,
+ "learning_rate": 4.52985079451261e-06,
+ "loss": 0.5239,
+ "step": 2598
+ },
+ {
+ "epoch": 1.2288416075650117,
+ "grad_norm": 2.6015284061431885,
+ "learning_rate": 4.529486578277527e-06,
+ "loss": 0.5135,
+ "step": 2599
+ },
+ {
+ "epoch": 1.2293144208037825,
+ "grad_norm": 2.3029589653015137,
+ "learning_rate": 4.529122235675214e-06,
+ "loss": 0.4044,
+ "step": 2600
+ },
+ {
+ "epoch": 1.2297872340425533,
+ "grad_norm": 2.994093656539917,
+ "learning_rate": 4.528757766728357e-06,
+ "loss": 0.5419,
+ "step": 2601
+ },
+ {
+ "epoch": 1.2302600472813239,
+ "grad_norm": 2.6297390460968018,
+ "learning_rate": 4.52839317145965e-06,
+ "loss": 0.488,
+ "step": 2602
+ },
+ {
+ "epoch": 1.2307328605200945,
+ "grad_norm": 2.4814043045043945,
+ "learning_rate": 4.528028449891793e-06,
+ "loss": 0.4917,
+ "step": 2603
+ },
+ {
+ "epoch": 1.2312056737588652,
+ "grad_norm": 3.6052863597869873,
+ "learning_rate": 4.527663602047499e-06,
+ "loss": 0.5301,
+ "step": 2604
+ },
+ {
+ "epoch": 1.231678486997636,
+ "grad_norm": 2.6984751224517822,
+ "learning_rate": 4.5272986279494825e-06,
+ "loss": 0.5253,
+ "step": 2605
+ },
+ {
+ "epoch": 1.2321513002364066,
+ "grad_norm": 2.514000415802002,
+ "learning_rate": 4.526933527620469e-06,
+ "loss": 0.5661,
+ "step": 2606
+ },
+ {
+ "epoch": 1.2326241134751772,
+ "grad_norm": 2.890921115875244,
+ "learning_rate": 4.526568301083195e-06,
+ "loss": 0.5585,
+ "step": 2607
+ },
+ {
+ "epoch": 1.233096926713948,
+ "grad_norm": 2.6390011310577393,
+ "learning_rate": 4.526202948360397e-06,
+ "loss": 0.5168,
+ "step": 2608
+ },
+ {
+ "epoch": 1.2335697399527188,
+ "grad_norm": 2.7370636463165283,
+ "learning_rate": 4.5258374694748266e-06,
+ "loss": 0.5453,
+ "step": 2609
+ },
+ {
+ "epoch": 1.2340425531914894,
+ "grad_norm": 2.8203976154327393,
+ "learning_rate": 4.52547186444924e-06,
+ "loss": 0.5763,
+ "step": 2610
+ },
+ {
+ "epoch": 1.23451536643026,
+ "grad_norm": 2.7567849159240723,
+ "learning_rate": 4.5251061333064025e-06,
+ "loss": 0.5194,
+ "step": 2611
+ },
+ {
+ "epoch": 1.2349881796690307,
+ "grad_norm": 2.767519474029541,
+ "learning_rate": 4.524740276069085e-06,
+ "loss": 0.5355,
+ "step": 2612
+ },
+ {
+ "epoch": 1.2354609929078015,
+ "grad_norm": 3.072035312652588,
+ "learning_rate": 4.5243742927600695e-06,
+ "loss": 0.5391,
+ "step": 2613
+ },
+ {
+ "epoch": 1.2359338061465721,
+ "grad_norm": 2.5957462787628174,
+ "learning_rate": 4.524008183402143e-06,
+ "loss": 0.5645,
+ "step": 2614
+ },
+ {
+ "epoch": 1.2364066193853427,
+ "grad_norm": 2.774897575378418,
+ "learning_rate": 4.523641948018101e-06,
+ "loss": 0.5576,
+ "step": 2615
+ },
+ {
+ "epoch": 1.2368794326241135,
+ "grad_norm": 2.635887622833252,
+ "learning_rate": 4.5232755866307496e-06,
+ "loss": 0.5254,
+ "step": 2616
+ },
+ {
+ "epoch": 1.2373522458628843,
+ "grad_norm": 2.4860997200012207,
+ "learning_rate": 4.522909099262899e-06,
+ "loss": 0.4692,
+ "step": 2617
+ },
+ {
+ "epoch": 1.2378250591016549,
+ "grad_norm": 2.595513105392456,
+ "learning_rate": 4.522542485937369e-06,
+ "loss": 0.5166,
+ "step": 2618
+ },
+ {
+ "epoch": 1.2382978723404254,
+ "grad_norm": 2.961474895477295,
+ "learning_rate": 4.522175746676986e-06,
+ "loss": 0.5455,
+ "step": 2619
+ },
+ {
+ "epoch": 1.2387706855791962,
+ "grad_norm": 2.813889741897583,
+ "learning_rate": 4.521808881504588e-06,
+ "loss": 0.5249,
+ "step": 2620
+ },
+ {
+ "epoch": 1.239243498817967,
+ "grad_norm": 2.8434813022613525,
+ "learning_rate": 4.521441890443015e-06,
+ "loss": 0.472,
+ "step": 2621
+ },
+ {
+ "epoch": 1.2397163120567376,
+ "grad_norm": 2.4264845848083496,
+ "learning_rate": 4.521074773515119e-06,
+ "loss": 0.4783,
+ "step": 2622
+ },
+ {
+ "epoch": 1.2401891252955082,
+ "grad_norm": 2.615169048309326,
+ "learning_rate": 4.520707530743761e-06,
+ "loss": 0.5324,
+ "step": 2623
+ },
+ {
+ "epoch": 1.240661938534279,
+ "grad_norm": 2.6772537231445312,
+ "learning_rate": 4.520340162151803e-06,
+ "loss": 0.5224,
+ "step": 2624
+ },
+ {
+ "epoch": 1.2411347517730495,
+ "grad_norm": 2.683393955230713,
+ "learning_rate": 4.519972667762124e-06,
+ "loss": 0.4863,
+ "step": 2625
+ },
+ {
+ "epoch": 1.2416075650118203,
+ "grad_norm": 3.0335750579833984,
+ "learning_rate": 4.519605047597603e-06,
+ "loss": 0.544,
+ "step": 2626
+ },
+ {
+ "epoch": 1.242080378250591,
+ "grad_norm": 2.8694353103637695,
+ "learning_rate": 4.519237301681132e-06,
+ "loss": 0.5576,
+ "step": 2627
+ },
+ {
+ "epoch": 1.2425531914893617,
+ "grad_norm": 3.217808246612549,
+ "learning_rate": 4.518869430035609e-06,
+ "loss": 0.5459,
+ "step": 2628
+ },
+ {
+ "epoch": 1.2430260047281323,
+ "grad_norm": 2.7700083255767822,
+ "learning_rate": 4.518501432683937e-06,
+ "loss": 0.5579,
+ "step": 2629
+ },
+ {
+ "epoch": 1.243498817966903,
+ "grad_norm": 2.4759175777435303,
+ "learning_rate": 4.5181333096490335e-06,
+ "loss": 0.5049,
+ "step": 2630
+ },
+ {
+ "epoch": 1.2439716312056737,
+ "grad_norm": 2.8652584552764893,
+ "learning_rate": 4.517765060953818e-06,
+ "loss": 0.5366,
+ "step": 2631
+ },
+ {
+ "epoch": 1.2444444444444445,
+ "grad_norm": 2.776334524154663,
+ "learning_rate": 4.517396686621218e-06,
+ "loss": 0.5677,
+ "step": 2632
+ },
+ {
+ "epoch": 1.244917257683215,
+ "grad_norm": 2.676708221435547,
+ "learning_rate": 4.517028186674174e-06,
+ "loss": 0.5055,
+ "step": 2633
+ },
+ {
+ "epoch": 1.2453900709219858,
+ "grad_norm": 2.6851537227630615,
+ "learning_rate": 4.516659561135629e-06,
+ "loss": 0.5537,
+ "step": 2634
+ },
+ {
+ "epoch": 1.2458628841607564,
+ "grad_norm": 2.619971513748169,
+ "learning_rate": 4.516290810028536e-06,
+ "loss": 0.5765,
+ "step": 2635
+ },
+ {
+ "epoch": 1.2463356973995272,
+ "grad_norm": 2.7302334308624268,
+ "learning_rate": 4.515921933375855e-06,
+ "loss": 0.5611,
+ "step": 2636
+ },
+ {
+ "epoch": 1.2468085106382978,
+ "grad_norm": 2.5005829334259033,
+ "learning_rate": 4.5155529312005554e-06,
+ "loss": 0.442,
+ "step": 2637
+ },
+ {
+ "epoch": 1.2472813238770686,
+ "grad_norm": 2.713587522506714,
+ "learning_rate": 4.515183803525612e-06,
+ "loss": 0.5023,
+ "step": 2638
+ },
+ {
+ "epoch": 1.2477541371158392,
+ "grad_norm": 2.5146236419677734,
+ "learning_rate": 4.514814550374009e-06,
+ "loss": 0.5195,
+ "step": 2639
+ },
+ {
+ "epoch": 1.24822695035461,
+ "grad_norm": 2.761060953140259,
+ "learning_rate": 4.51444517176874e-06,
+ "loss": 0.5138,
+ "step": 2640
+ },
+ {
+ "epoch": 1.2486997635933805,
+ "grad_norm": 3.082329273223877,
+ "learning_rate": 4.5140756677328026e-06,
+ "loss": 0.6105,
+ "step": 2641
+ },
+ {
+ "epoch": 1.2491725768321513,
+ "grad_norm": 2.6933493614196777,
+ "learning_rate": 4.513706038289205e-06,
+ "loss": 0.5185,
+ "step": 2642
+ },
+ {
+ "epoch": 1.249645390070922,
+ "grad_norm": 2.515856981277466,
+ "learning_rate": 4.513336283460962e-06,
+ "loss": 0.5375,
+ "step": 2643
+ },
+ {
+ "epoch": 1.2501182033096927,
+ "grad_norm": 2.8553731441497803,
+ "learning_rate": 4.512966403271096e-06,
+ "loss": 0.5582,
+ "step": 2644
+ },
+ {
+ "epoch": 1.2505910165484633,
+ "grad_norm": 2.640880823135376,
+ "learning_rate": 4.5125963977426405e-06,
+ "loss": 0.5125,
+ "step": 2645
+ },
+ {
+ "epoch": 1.251063829787234,
+ "grad_norm": 2.9845943450927734,
+ "learning_rate": 4.512226266898631e-06,
+ "loss": 0.4749,
+ "step": 2646
+ },
+ {
+ "epoch": 1.2515366430260046,
+ "grad_norm": 2.5131032466888428,
+ "learning_rate": 4.511856010762116e-06,
+ "loss": 0.4764,
+ "step": 2647
+ },
+ {
+ "epoch": 1.2520094562647754,
+ "grad_norm": 2.370638370513916,
+ "learning_rate": 4.511485629356148e-06,
+ "loss": 0.5153,
+ "step": 2648
+ },
+ {
+ "epoch": 1.252482269503546,
+ "grad_norm": 2.912461996078491,
+ "learning_rate": 4.511115122703791e-06,
+ "loss": 0.6117,
+ "step": 2649
+ },
+ {
+ "epoch": 1.2529550827423168,
+ "grad_norm": 2.7308082580566406,
+ "learning_rate": 4.510744490828113e-06,
+ "loss": 0.5076,
+ "step": 2650
+ },
+ {
+ "epoch": 1.2534278959810874,
+ "grad_norm": 2.8524296283721924,
+ "learning_rate": 4.510373733752193e-06,
+ "loss": 0.542,
+ "step": 2651
+ },
+ {
+ "epoch": 1.2539007092198582,
+ "grad_norm": 2.799377202987671,
+ "learning_rate": 4.5100028514991145e-06,
+ "loss": 0.486,
+ "step": 2652
+ },
+ {
+ "epoch": 1.2543735224586288,
+ "grad_norm": 2.7248027324676514,
+ "learning_rate": 4.509631844091973e-06,
+ "loss": 0.4972,
+ "step": 2653
+ },
+ {
+ "epoch": 1.2548463356973996,
+ "grad_norm": 2.8041458129882812,
+ "learning_rate": 4.5092607115538686e-06,
+ "loss": 0.588,
+ "step": 2654
+ },
+ {
+ "epoch": 1.2553191489361701,
+ "grad_norm": 2.679417133331299,
+ "learning_rate": 4.50888945390791e-06,
+ "loss": 0.4639,
+ "step": 2655
+ },
+ {
+ "epoch": 1.255791962174941,
+ "grad_norm": 3.1049270629882812,
+ "learning_rate": 4.508518071177214e-06,
+ "loss": 0.5857,
+ "step": 2656
+ },
+ {
+ "epoch": 1.2562647754137115,
+ "grad_norm": 2.8590362071990967,
+ "learning_rate": 4.508146563384904e-06,
+ "loss": 0.5451,
+ "step": 2657
+ },
+ {
+ "epoch": 1.2567375886524823,
+ "grad_norm": 2.9774081707000732,
+ "learning_rate": 4.507774930554114e-06,
+ "loss": 0.5493,
+ "step": 2658
+ },
+ {
+ "epoch": 1.2572104018912529,
+ "grad_norm": 2.617643356323242,
+ "learning_rate": 4.507403172707983e-06,
+ "loss": 0.5472,
+ "step": 2659
+ },
+ {
+ "epoch": 1.2576832151300237,
+ "grad_norm": 2.9195587635040283,
+ "learning_rate": 4.507031289869658e-06,
+ "loss": 0.5403,
+ "step": 2660
+ },
+ {
+ "epoch": 1.2581560283687943,
+ "grad_norm": 2.706089496612549,
+ "learning_rate": 4.506659282062295e-06,
+ "loss": 0.4899,
+ "step": 2661
+ },
+ {
+ "epoch": 1.258628841607565,
+ "grad_norm": 2.8229358196258545,
+ "learning_rate": 4.506287149309057e-06,
+ "loss": 0.5336,
+ "step": 2662
+ },
+ {
+ "epoch": 1.2591016548463356,
+ "grad_norm": 2.5295674800872803,
+ "learning_rate": 4.505914891633117e-06,
+ "loss": 0.4806,
+ "step": 2663
+ },
+ {
+ "epoch": 1.2595744680851064,
+ "grad_norm": 3.098208427429199,
+ "learning_rate": 4.505542509057651e-06,
+ "loss": 0.6039,
+ "step": 2664
+ },
+ {
+ "epoch": 1.260047281323877,
+ "grad_norm": 2.5118041038513184,
+ "learning_rate": 4.5051700016058475e-06,
+ "loss": 0.5279,
+ "step": 2665
+ },
+ {
+ "epoch": 1.2605200945626478,
+ "grad_norm": 2.6901369094848633,
+ "learning_rate": 4.5047973693009005e-06,
+ "loss": 0.5515,
+ "step": 2666
+ },
+ {
+ "epoch": 1.2609929078014184,
+ "grad_norm": 2.5622377395629883,
+ "learning_rate": 4.504424612166012e-06,
+ "loss": 0.5405,
+ "step": 2667
+ },
+ {
+ "epoch": 1.2614657210401892,
+ "grad_norm": 2.685751438140869,
+ "learning_rate": 4.5040517302243915e-06,
+ "loss": 0.5797,
+ "step": 2668
+ },
+ {
+ "epoch": 1.2619385342789597,
+ "grad_norm": 2.8525350093841553,
+ "learning_rate": 4.503678723499259e-06,
+ "loss": 0.5561,
+ "step": 2669
+ },
+ {
+ "epoch": 1.2624113475177305,
+ "grad_norm": 2.803386926651001,
+ "learning_rate": 4.503305592013836e-06,
+ "loss": 0.5376,
+ "step": 2670
+ },
+ {
+ "epoch": 1.2628841607565011,
+ "grad_norm": 2.78633189201355,
+ "learning_rate": 4.502932335791359e-06,
+ "loss": 0.4739,
+ "step": 2671
+ },
+ {
+ "epoch": 1.263356973995272,
+ "grad_norm": 2.8337297439575195,
+ "learning_rate": 4.502558954855069e-06,
+ "loss": 0.5406,
+ "step": 2672
+ },
+ {
+ "epoch": 1.2638297872340425,
+ "grad_norm": 2.610275983810425,
+ "learning_rate": 4.502185449228213e-06,
+ "loss": 0.5343,
+ "step": 2673
+ },
+ {
+ "epoch": 1.2643026004728133,
+ "grad_norm": 2.7842252254486084,
+ "learning_rate": 4.501811818934048e-06,
+ "loss": 0.532,
+ "step": 2674
+ },
+ {
+ "epoch": 1.2647754137115839,
+ "grad_norm": 2.4472389221191406,
+ "learning_rate": 4.501438063995839e-06,
+ "loss": 0.4976,
+ "step": 2675
+ },
+ {
+ "epoch": 1.2652482269503547,
+ "grad_norm": 3.076580762863159,
+ "learning_rate": 4.501064184436858e-06,
+ "loss": 0.507,
+ "step": 2676
+ },
+ {
+ "epoch": 1.2657210401891252,
+ "grad_norm": 2.5952908992767334,
+ "learning_rate": 4.500690180280384e-06,
+ "loss": 0.5498,
+ "step": 2677
+ },
+ {
+ "epoch": 1.266193853427896,
+ "grad_norm": 2.476943016052246,
+ "learning_rate": 4.500316051549706e-06,
+ "loss": 0.557,
+ "step": 2678
+ },
+ {
+ "epoch": 1.2666666666666666,
+ "grad_norm": 2.730579376220703,
+ "learning_rate": 4.499941798268118e-06,
+ "loss": 0.4975,
+ "step": 2679
+ },
+ {
+ "epoch": 1.2671394799054374,
+ "grad_norm": 2.7916698455810547,
+ "learning_rate": 4.499567420458924e-06,
+ "loss": 0.5673,
+ "step": 2680
+ },
+ {
+ "epoch": 1.267612293144208,
+ "grad_norm": 2.4249091148376465,
+ "learning_rate": 4.4991929181454355e-06,
+ "loss": 0.4836,
+ "step": 2681
+ },
+ {
+ "epoch": 1.2680851063829788,
+ "grad_norm": 2.661911725997925,
+ "learning_rate": 4.498818291350969e-06,
+ "loss": 0.5332,
+ "step": 2682
+ },
+ {
+ "epoch": 1.2685579196217494,
+ "grad_norm": 2.693657875061035,
+ "learning_rate": 4.498443540098852e-06,
+ "loss": 0.5257,
+ "step": 2683
+ },
+ {
+ "epoch": 1.2690307328605201,
+ "grad_norm": 2.609386682510376,
+ "learning_rate": 4.4980686644124195e-06,
+ "loss": 0.4918,
+ "step": 2684
+ },
+ {
+ "epoch": 1.2695035460992907,
+ "grad_norm": 3.2104930877685547,
+ "learning_rate": 4.4976936643150124e-06,
+ "loss": 0.6097,
+ "step": 2685
+ },
+ {
+ "epoch": 1.2699763593380615,
+ "grad_norm": 2.707860231399536,
+ "learning_rate": 4.49731853982998e-06,
+ "loss": 0.5109,
+ "step": 2686
+ },
+ {
+ "epoch": 1.270449172576832,
+ "grad_norm": 3.5046379566192627,
+ "learning_rate": 4.49694329098068e-06,
+ "loss": 0.5883,
+ "step": 2687
+ },
+ {
+ "epoch": 1.270921985815603,
+ "grad_norm": 2.5362324714660645,
+ "learning_rate": 4.496567917790477e-06,
+ "loss": 0.5301,
+ "step": 2688
+ },
+ {
+ "epoch": 1.2713947990543735,
+ "grad_norm": 2.7095518112182617,
+ "learning_rate": 4.496192420282746e-06,
+ "loss": 0.4772,
+ "step": 2689
+ },
+ {
+ "epoch": 1.2718676122931443,
+ "grad_norm": 2.416433095932007,
+ "learning_rate": 4.495816798480865e-06,
+ "loss": 0.5012,
+ "step": 2690
+ },
+ {
+ "epoch": 1.2723404255319148,
+ "grad_norm": 2.5362391471862793,
+ "learning_rate": 4.495441052408224e-06,
+ "loss": 0.5197,
+ "step": 2691
+ },
+ {
+ "epoch": 1.2728132387706856,
+ "grad_norm": 2.9093947410583496,
+ "learning_rate": 4.495065182088218e-06,
+ "loss": 0.4893,
+ "step": 2692
+ },
+ {
+ "epoch": 1.2732860520094562,
+ "grad_norm": 2.520470142364502,
+ "learning_rate": 4.494689187544251e-06,
+ "loss": 0.5072,
+ "step": 2693
+ },
+ {
+ "epoch": 1.273758865248227,
+ "grad_norm": 2.4385125637054443,
+ "learning_rate": 4.494313068799735e-06,
+ "loss": 0.4923,
+ "step": 2694
+ },
+ {
+ "epoch": 1.2742316784869976,
+ "grad_norm": 2.636852502822876,
+ "learning_rate": 4.493936825878089e-06,
+ "loss": 0.5409,
+ "step": 2695
+ },
+ {
+ "epoch": 1.2747044917257684,
+ "grad_norm": 2.7027053833007812,
+ "learning_rate": 4.493560458802741e-06,
+ "loss": 0.5906,
+ "step": 2696
+ },
+ {
+ "epoch": 1.275177304964539,
+ "grad_norm": 2.58752179145813,
+ "learning_rate": 4.493183967597123e-06,
+ "loss": 0.5292,
+ "step": 2697
+ },
+ {
+ "epoch": 1.2756501182033098,
+ "grad_norm": 2.7658379077911377,
+ "learning_rate": 4.49280735228468e-06,
+ "loss": 0.5613,
+ "step": 2698
+ },
+ {
+ "epoch": 1.2761229314420803,
+ "grad_norm": 3.272688388824463,
+ "learning_rate": 4.492430612888861e-06,
+ "loss": 0.5654,
+ "step": 2699
+ },
+ {
+ "epoch": 1.2765957446808511,
+ "grad_norm": 2.806819438934326,
+ "learning_rate": 4.492053749433125e-06,
+ "loss": 0.5388,
+ "step": 2700
+ },
+ {
+ "epoch": 1.2770685579196217,
+ "grad_norm": 2.879727602005005,
+ "learning_rate": 4.491676761940936e-06,
+ "loss": 0.5033,
+ "step": 2701
+ },
+ {
+ "epoch": 1.2775413711583925,
+ "grad_norm": 2.733347177505493,
+ "learning_rate": 4.4912996504357695e-06,
+ "loss": 0.5113,
+ "step": 2702
+ },
+ {
+ "epoch": 1.278014184397163,
+ "grad_norm": 2.7431252002716064,
+ "learning_rate": 4.490922414941104e-06,
+ "loss": 0.5417,
+ "step": 2703
+ },
+ {
+ "epoch": 1.2784869976359339,
+ "grad_norm": 2.9287240505218506,
+ "learning_rate": 4.490545055480431e-06,
+ "loss": 0.5875,
+ "step": 2704
+ },
+ {
+ "epoch": 1.2789598108747045,
+ "grad_norm": 2.576775550842285,
+ "learning_rate": 4.490167572077244e-06,
+ "loss": 0.5176,
+ "step": 2705
+ },
+ {
+ "epoch": 1.2794326241134752,
+ "grad_norm": 2.4335594177246094,
+ "learning_rate": 4.4897899647550505e-06,
+ "loss": 0.4749,
+ "step": 2706
+ },
+ {
+ "epoch": 1.2799054373522458,
+ "grad_norm": 2.6798062324523926,
+ "learning_rate": 4.489412233537361e-06,
+ "loss": 0.5439,
+ "step": 2707
+ },
+ {
+ "epoch": 1.2803782505910166,
+ "grad_norm": 2.8440675735473633,
+ "learning_rate": 4.489034378447693e-06,
+ "loss": 0.552,
+ "step": 2708
+ },
+ {
+ "epoch": 1.2808510638297872,
+ "grad_norm": 2.9059503078460693,
+ "learning_rate": 4.488656399509577e-06,
+ "loss": 0.5667,
+ "step": 2709
+ },
+ {
+ "epoch": 1.281323877068558,
+ "grad_norm": 2.7415006160736084,
+ "learning_rate": 4.488278296746548e-06,
+ "loss": 0.5676,
+ "step": 2710
+ },
+ {
+ "epoch": 1.2817966903073286,
+ "grad_norm": 2.4584875106811523,
+ "learning_rate": 4.487900070182147e-06,
+ "loss": 0.4787,
+ "step": 2711
+ },
+ {
+ "epoch": 1.2822695035460994,
+ "grad_norm": 2.990940809249878,
+ "learning_rate": 4.487521719839924e-06,
+ "loss": 0.5239,
+ "step": 2712
+ },
+ {
+ "epoch": 1.28274231678487,
+ "grad_norm": 3.075201988220215,
+ "learning_rate": 4.487143245743441e-06,
+ "loss": 0.5103,
+ "step": 2713
+ },
+ {
+ "epoch": 1.2832151300236407,
+ "grad_norm": 2.543341875076294,
+ "learning_rate": 4.486764647916259e-06,
+ "loss": 0.5475,
+ "step": 2714
+ },
+ {
+ "epoch": 1.2836879432624113,
+ "grad_norm": 2.9927213191986084,
+ "learning_rate": 4.486385926381957e-06,
+ "loss": 0.4923,
+ "step": 2715
+ },
+ {
+ "epoch": 1.284160756501182,
+ "grad_norm": 2.4220657348632812,
+ "learning_rate": 4.486007081164111e-06,
+ "loss": 0.543,
+ "step": 2716
+ },
+ {
+ "epoch": 1.2846335697399527,
+ "grad_norm": 2.468214988708496,
+ "learning_rate": 4.4856281122863134e-06,
+ "loss": 0.5248,
+ "step": 2717
+ },
+ {
+ "epoch": 1.2851063829787235,
+ "grad_norm": 2.633711099624634,
+ "learning_rate": 4.48524901977216e-06,
+ "loss": 0.4764,
+ "step": 2718
+ },
+ {
+ "epoch": 1.285579196217494,
+ "grad_norm": 2.8399546146392822,
+ "learning_rate": 4.484869803645254e-06,
+ "loss": 0.5503,
+ "step": 2719
+ },
+ {
+ "epoch": 1.2860520094562649,
+ "grad_norm": 2.769063949584961,
+ "learning_rate": 4.484490463929209e-06,
+ "loss": 0.5468,
+ "step": 2720
+ },
+ {
+ "epoch": 1.2865248226950354,
+ "grad_norm": 2.617863893508911,
+ "learning_rate": 4.4841110006476465e-06,
+ "loss": 0.5906,
+ "step": 2721
+ },
+ {
+ "epoch": 1.2869976359338062,
+ "grad_norm": 2.7639541625976562,
+ "learning_rate": 4.4837314138241905e-06,
+ "loss": 0.552,
+ "step": 2722
+ },
+ {
+ "epoch": 1.2874704491725768,
+ "grad_norm": 2.7711129188537598,
+ "learning_rate": 4.483351703482478e-06,
+ "loss": 0.5229,
+ "step": 2723
+ },
+ {
+ "epoch": 1.2879432624113476,
+ "grad_norm": 2.611205577850342,
+ "learning_rate": 4.482971869646152e-06,
+ "loss": 0.5055,
+ "step": 2724
+ },
+ {
+ "epoch": 1.2884160756501182,
+ "grad_norm": 2.8602211475372314,
+ "learning_rate": 4.482591912338862e-06,
+ "loss": 0.5561,
+ "step": 2725
+ },
+ {
+ "epoch": 1.2888888888888888,
+ "grad_norm": 2.5882298946380615,
+ "learning_rate": 4.4822118315842675e-06,
+ "loss": 0.5555,
+ "step": 2726
+ },
+ {
+ "epoch": 1.2893617021276595,
+ "grad_norm": 2.7533531188964844,
+ "learning_rate": 4.481831627406033e-06,
+ "loss": 0.5346,
+ "step": 2727
+ },
+ {
+ "epoch": 1.2898345153664303,
+ "grad_norm": 2.4296958446502686,
+ "learning_rate": 4.481451299827835e-06,
+ "loss": 0.4915,
+ "step": 2728
+ },
+ {
+ "epoch": 1.290307328605201,
+ "grad_norm": 2.4403445720672607,
+ "learning_rate": 4.481070848873352e-06,
+ "loss": 0.5648,
+ "step": 2729
+ },
+ {
+ "epoch": 1.2907801418439715,
+ "grad_norm": 2.473224401473999,
+ "learning_rate": 4.480690274566274e-06,
+ "loss": 0.4849,
+ "step": 2730
+ },
+ {
+ "epoch": 1.2912529550827423,
+ "grad_norm": 2.637899875640869,
+ "learning_rate": 4.480309576930297e-06,
+ "loss": 0.4968,
+ "step": 2731
+ },
+ {
+ "epoch": 1.291725768321513,
+ "grad_norm": 2.7156927585601807,
+ "learning_rate": 4.479928755989127e-06,
+ "loss": 0.4759,
+ "step": 2732
+ },
+ {
+ "epoch": 1.2921985815602837,
+ "grad_norm": 2.632786989212036,
+ "learning_rate": 4.479547811766475e-06,
+ "loss": 0.5468,
+ "step": 2733
+ },
+ {
+ "epoch": 1.2926713947990542,
+ "grad_norm": 2.529218912124634,
+ "learning_rate": 4.479166744286061e-06,
+ "loss": 0.4852,
+ "step": 2734
+ },
+ {
+ "epoch": 1.293144208037825,
+ "grad_norm": 2.561978340148926,
+ "learning_rate": 4.4787855535716115e-06,
+ "loss": 0.546,
+ "step": 2735
+ },
+ {
+ "epoch": 1.2936170212765958,
+ "grad_norm": 2.3684909343719482,
+ "learning_rate": 4.478404239646862e-06,
+ "loss": 0.5369,
+ "step": 2736
+ },
+ {
+ "epoch": 1.2940898345153664,
+ "grad_norm": 2.8940367698669434,
+ "learning_rate": 4.4780228025355566e-06,
+ "loss": 0.568,
+ "step": 2737
+ },
+ {
+ "epoch": 1.294562647754137,
+ "grad_norm": 2.6950316429138184,
+ "learning_rate": 4.477641242261445e-06,
+ "loss": 0.4576,
+ "step": 2738
+ },
+ {
+ "epoch": 1.2950354609929078,
+ "grad_norm": 2.4211716651916504,
+ "learning_rate": 4.4772595588482835e-06,
+ "loss": 0.4341,
+ "step": 2739
+ },
+ {
+ "epoch": 1.2955082742316786,
+ "grad_norm": 3.141097068786621,
+ "learning_rate": 4.47687775231984e-06,
+ "loss": 0.5944,
+ "step": 2740
+ },
+ {
+ "epoch": 1.2959810874704492,
+ "grad_norm": 3.077522039413452,
+ "learning_rate": 4.476495822699887e-06,
+ "loss": 0.5786,
+ "step": 2741
+ },
+ {
+ "epoch": 1.2964539007092197,
+ "grad_norm": 2.708139419555664,
+ "learning_rate": 4.476113770012206e-06,
+ "loss": 0.5014,
+ "step": 2742
+ },
+ {
+ "epoch": 1.2969267139479905,
+ "grad_norm": 2.7572035789489746,
+ "learning_rate": 4.475731594280586e-06,
+ "loss": 0.594,
+ "step": 2743
+ },
+ {
+ "epoch": 1.2973995271867613,
+ "grad_norm": 2.673126459121704,
+ "learning_rate": 4.475349295528822e-06,
+ "loss": 0.5317,
+ "step": 2744
+ },
+ {
+ "epoch": 1.297872340425532,
+ "grad_norm": 2.6757819652557373,
+ "learning_rate": 4.4749668737807195e-06,
+ "loss": 0.5614,
+ "step": 2745
+ },
+ {
+ "epoch": 1.2983451536643025,
+ "grad_norm": 2.7077620029449463,
+ "learning_rate": 4.47458432906009e-06,
+ "loss": 0.4916,
+ "step": 2746
+ },
+ {
+ "epoch": 1.2988179669030733,
+ "grad_norm": 2.446570873260498,
+ "learning_rate": 4.474201661390752e-06,
+ "loss": 0.5005,
+ "step": 2747
+ },
+ {
+ "epoch": 1.299290780141844,
+ "grad_norm": 2.642695665359497,
+ "learning_rate": 4.473818870796533e-06,
+ "loss": 0.5048,
+ "step": 2748
+ },
+ {
+ "epoch": 1.2997635933806146,
+ "grad_norm": 2.519824743270874,
+ "learning_rate": 4.4734359573012686e-06,
+ "loss": 0.5131,
+ "step": 2749
+ },
+ {
+ "epoch": 1.3002364066193852,
+ "grad_norm": 2.5901925563812256,
+ "learning_rate": 4.4730529209287995e-06,
+ "loss": 0.4582,
+ "step": 2750
+ },
+ {
+ "epoch": 1.300709219858156,
+ "grad_norm": 2.6789121627807617,
+ "learning_rate": 4.472669761702978e-06,
+ "loss": 0.5685,
+ "step": 2751
+ },
+ {
+ "epoch": 1.3011820330969268,
+ "grad_norm": 2.408003807067871,
+ "learning_rate": 4.472286479647659e-06,
+ "loss": 0.4329,
+ "step": 2752
+ },
+ {
+ "epoch": 1.3016548463356974,
+ "grad_norm": 2.681403398513794,
+ "learning_rate": 4.47190307478671e-06,
+ "loss": 0.4853,
+ "step": 2753
+ },
+ {
+ "epoch": 1.302127659574468,
+ "grad_norm": 2.9923183917999268,
+ "learning_rate": 4.4715195471440025e-06,
+ "loss": 0.5184,
+ "step": 2754
+ },
+ {
+ "epoch": 1.3026004728132388,
+ "grad_norm": 2.5100321769714355,
+ "learning_rate": 4.471135896743418e-06,
+ "loss": 0.5148,
+ "step": 2755
+ },
+ {
+ "epoch": 1.3030732860520096,
+ "grad_norm": 2.267881393432617,
+ "learning_rate": 4.4707521236088444e-06,
+ "loss": 0.5028,
+ "step": 2756
+ },
+ {
+ "epoch": 1.3035460992907801,
+ "grad_norm": 2.7779829502105713,
+ "learning_rate": 4.4703682277641775e-06,
+ "loss": 0.5724,
+ "step": 2757
+ },
+ {
+ "epoch": 1.3040189125295507,
+ "grad_norm": 2.4262194633483887,
+ "learning_rate": 4.4699842092333205e-06,
+ "loss": 0.5341,
+ "step": 2758
+ },
+ {
+ "epoch": 1.3044917257683215,
+ "grad_norm": 2.8682050704956055,
+ "learning_rate": 4.469600068040185e-06,
+ "loss": 0.6114,
+ "step": 2759
+ },
+ {
+ "epoch": 1.3049645390070923,
+ "grad_norm": 2.647853374481201,
+ "learning_rate": 4.46921580420869e-06,
+ "loss": 0.5107,
+ "step": 2760
+ },
+ {
+ "epoch": 1.3054373522458629,
+ "grad_norm": 2.561998128890991,
+ "learning_rate": 4.468831417762762e-06,
+ "loss": 0.6019,
+ "step": 2761
+ },
+ {
+ "epoch": 1.3059101654846335,
+ "grad_norm": 2.763425350189209,
+ "learning_rate": 4.468446908726334e-06,
+ "loss": 0.572,
+ "step": 2762
+ },
+ {
+ "epoch": 1.3063829787234043,
+ "grad_norm": 2.7052934169769287,
+ "learning_rate": 4.468062277123348e-06,
+ "loss": 0.4876,
+ "step": 2763
+ },
+ {
+ "epoch": 1.306855791962175,
+ "grad_norm": 2.997845411300659,
+ "learning_rate": 4.467677522977755e-06,
+ "loss": 0.5683,
+ "step": 2764
+ },
+ {
+ "epoch": 1.3073286052009456,
+ "grad_norm": 2.503129005432129,
+ "learning_rate": 4.46729264631351e-06,
+ "loss": 0.4951,
+ "step": 2765
+ },
+ {
+ "epoch": 1.3078014184397162,
+ "grad_norm": 2.617492437362671,
+ "learning_rate": 4.466907647154578e-06,
+ "loss": 0.5054,
+ "step": 2766
+ },
+ {
+ "epoch": 1.308274231678487,
+ "grad_norm": 2.934967279434204,
+ "learning_rate": 4.4665225255249315e-06,
+ "loss": 0.5299,
+ "step": 2767
+ },
+ {
+ "epoch": 1.3087470449172578,
+ "grad_norm": 2.787252187728882,
+ "learning_rate": 4.46613728144855e-06,
+ "loss": 0.4652,
+ "step": 2768
+ },
+ {
+ "epoch": 1.3092198581560284,
+ "grad_norm": 2.567439556121826,
+ "learning_rate": 4.465751914949422e-06,
+ "loss": 0.538,
+ "step": 2769
+ },
+ {
+ "epoch": 1.309692671394799,
+ "grad_norm": 2.6386024951934814,
+ "learning_rate": 4.4653664260515416e-06,
+ "loss": 0.464,
+ "step": 2770
+ },
+ {
+ "epoch": 1.3101654846335697,
+ "grad_norm": 2.966848134994507,
+ "learning_rate": 4.464980814778912e-06,
+ "loss": 0.4889,
+ "step": 2771
+ },
+ {
+ "epoch": 1.3106382978723405,
+ "grad_norm": 2.571256637573242,
+ "learning_rate": 4.464595081155542e-06,
+ "loss": 0.4979,
+ "step": 2772
+ },
+ {
+ "epoch": 1.3111111111111111,
+ "grad_norm": 2.774203062057495,
+ "learning_rate": 4.4642092252054515e-06,
+ "loss": 0.5366,
+ "step": 2773
+ },
+ {
+ "epoch": 1.3115839243498817,
+ "grad_norm": 2.682969331741333,
+ "learning_rate": 4.463823246952666e-06,
+ "loss": 0.5118,
+ "step": 2774
+ },
+ {
+ "epoch": 1.3120567375886525,
+ "grad_norm": 2.4873905181884766,
+ "learning_rate": 4.463437146421217e-06,
+ "loss": 0.5548,
+ "step": 2775
+ },
+ {
+ "epoch": 1.3125295508274233,
+ "grad_norm": 2.6769661903381348,
+ "learning_rate": 4.463050923635147e-06,
+ "loss": 0.5023,
+ "step": 2776
+ },
+ {
+ "epoch": 1.3130023640661939,
+ "grad_norm": 2.7190892696380615,
+ "learning_rate": 4.462664578618503e-06,
+ "loss": 0.5546,
+ "step": 2777
+ },
+ {
+ "epoch": 1.3134751773049644,
+ "grad_norm": 2.8193624019622803,
+ "learning_rate": 4.462278111395343e-06,
+ "loss": 0.5265,
+ "step": 2778
+ },
+ {
+ "epoch": 1.3139479905437352,
+ "grad_norm": 2.7324538230895996,
+ "learning_rate": 4.461891521989728e-06,
+ "loss": 0.5449,
+ "step": 2779
+ },
+ {
+ "epoch": 1.314420803782506,
+ "grad_norm": 2.87320876121521,
+ "learning_rate": 4.4615048104257305e-06,
+ "loss": 0.5367,
+ "step": 2780
+ },
+ {
+ "epoch": 1.3148936170212766,
+ "grad_norm": 2.6777031421661377,
+ "learning_rate": 4.4611179767274306e-06,
+ "loss": 0.5026,
+ "step": 2781
+ },
+ {
+ "epoch": 1.3153664302600472,
+ "grad_norm": 3.714524269104004,
+ "learning_rate": 4.460731020918913e-06,
+ "loss": 0.569,
+ "step": 2782
+ },
+ {
+ "epoch": 1.315839243498818,
+ "grad_norm": 2.7493600845336914,
+ "learning_rate": 4.460343943024273e-06,
+ "loss": 0.5826,
+ "step": 2783
+ },
+ {
+ "epoch": 1.3163120567375888,
+ "grad_norm": 2.6544079780578613,
+ "learning_rate": 4.459956743067609e-06,
+ "loss": 0.5399,
+ "step": 2784
+ },
+ {
+ "epoch": 1.3167848699763594,
+ "grad_norm": 2.4338037967681885,
+ "learning_rate": 4.459569421073036e-06,
+ "loss": 0.5186,
+ "step": 2785
+ },
+ {
+ "epoch": 1.31725768321513,
+ "grad_norm": 2.9312374591827393,
+ "learning_rate": 4.459181977064665e-06,
+ "loss": 0.5571,
+ "step": 2786
+ },
+ {
+ "epoch": 1.3177304964539007,
+ "grad_norm": 2.5988922119140625,
+ "learning_rate": 4.458794411066624e-06,
+ "loss": 0.5926,
+ "step": 2787
+ },
+ {
+ "epoch": 1.3182033096926715,
+ "grad_norm": 2.5193772315979004,
+ "learning_rate": 4.458406723103044e-06,
+ "loss": 0.5243,
+ "step": 2788
+ },
+ {
+ "epoch": 1.318676122931442,
+ "grad_norm": 2.8653743267059326,
+ "learning_rate": 4.458018913198066e-06,
+ "loss": 0.5421,
+ "step": 2789
+ },
+ {
+ "epoch": 1.3191489361702127,
+ "grad_norm": 2.486245632171631,
+ "learning_rate": 4.457630981375834e-06,
+ "loss": 0.4862,
+ "step": 2790
+ },
+ {
+ "epoch": 1.3196217494089835,
+ "grad_norm": 3.155435800552368,
+ "learning_rate": 4.457242927660506e-06,
+ "loss": 0.5386,
+ "step": 2791
+ },
+ {
+ "epoch": 1.3200945626477543,
+ "grad_norm": 3.102023124694824,
+ "learning_rate": 4.456854752076242e-06,
+ "loss": 0.5527,
+ "step": 2792
+ },
+ {
+ "epoch": 1.3205673758865248,
+ "grad_norm": 2.7995986938476562,
+ "learning_rate": 4.456466454647215e-06,
+ "loss": 0.4364,
+ "step": 2793
+ },
+ {
+ "epoch": 1.3210401891252954,
+ "grad_norm": 2.8328311443328857,
+ "learning_rate": 4.456078035397599e-06,
+ "loss": 0.5516,
+ "step": 2794
+ },
+ {
+ "epoch": 1.3215130023640662,
+ "grad_norm": 2.606161594390869,
+ "learning_rate": 4.455689494351581e-06,
+ "loss": 0.5042,
+ "step": 2795
+ },
+ {
+ "epoch": 1.321985815602837,
+ "grad_norm": 2.6344757080078125,
+ "learning_rate": 4.455300831533354e-06,
+ "loss": 0.4807,
+ "step": 2796
+ },
+ {
+ "epoch": 1.3224586288416076,
+ "grad_norm": 2.8539786338806152,
+ "learning_rate": 4.454912046967118e-06,
+ "loss": 0.4694,
+ "step": 2797
+ },
+ {
+ "epoch": 1.3229314420803782,
+ "grad_norm": 2.849066734313965,
+ "learning_rate": 4.454523140677081e-06,
+ "loss": 0.5037,
+ "step": 2798
+ },
+ {
+ "epoch": 1.323404255319149,
+ "grad_norm": 2.6803371906280518,
+ "learning_rate": 4.454134112687458e-06,
+ "loss": 0.4959,
+ "step": 2799
+ },
+ {
+ "epoch": 1.3238770685579198,
+ "grad_norm": 3.0546066761016846,
+ "learning_rate": 4.453744963022473e-06,
+ "loss": 0.5935,
+ "step": 2800
+ },
+ {
+ "epoch": 1.3243498817966903,
+ "grad_norm": 2.625602960586548,
+ "learning_rate": 4.453355691706356e-06,
+ "loss": 0.5349,
+ "step": 2801
+ },
+ {
+ "epoch": 1.324822695035461,
+ "grad_norm": 2.7568554878234863,
+ "learning_rate": 4.452966298763345e-06,
+ "loss": 0.5012,
+ "step": 2802
+ },
+ {
+ "epoch": 1.3252955082742317,
+ "grad_norm": 2.940427303314209,
+ "learning_rate": 4.452576784217686e-06,
+ "loss": 0.5246,
+ "step": 2803
+ },
+ {
+ "epoch": 1.3257683215130025,
+ "grad_norm": 2.5485289096832275,
+ "learning_rate": 4.452187148093633e-06,
+ "loss": 0.5282,
+ "step": 2804
+ },
+ {
+ "epoch": 1.326241134751773,
+ "grad_norm": 2.8152987957000732,
+ "learning_rate": 4.4517973904154455e-06,
+ "loss": 0.5468,
+ "step": 2805
+ },
+ {
+ "epoch": 1.3267139479905437,
+ "grad_norm": 2.9399688243865967,
+ "learning_rate": 4.451407511207393e-06,
+ "loss": 0.5586,
+ "step": 2806
+ },
+ {
+ "epoch": 1.3271867612293144,
+ "grad_norm": 2.3870036602020264,
+ "learning_rate": 4.451017510493751e-06,
+ "loss": 0.4807,
+ "step": 2807
+ },
+ {
+ "epoch": 1.327659574468085,
+ "grad_norm": 3.4667887687683105,
+ "learning_rate": 4.450627388298805e-06,
+ "loss": 0.5571,
+ "step": 2808
+ },
+ {
+ "epoch": 1.3281323877068558,
+ "grad_norm": 2.685986042022705,
+ "learning_rate": 4.450237144646844e-06,
+ "loss": 0.5525,
+ "step": 2809
+ },
+ {
+ "epoch": 1.3286052009456264,
+ "grad_norm": 2.8529131412506104,
+ "learning_rate": 4.449846779562168e-06,
+ "loss": 0.491,
+ "step": 2810
+ },
+ {
+ "epoch": 1.3290780141843972,
+ "grad_norm": 2.7360332012176514,
+ "learning_rate": 4.449456293069082e-06,
+ "loss": 0.5574,
+ "step": 2811
+ },
+ {
+ "epoch": 1.3295508274231678,
+ "grad_norm": 2.4656026363372803,
+ "learning_rate": 4.4490656851919015e-06,
+ "loss": 0.4678,
+ "step": 2812
+ },
+ {
+ "epoch": 1.3300236406619386,
+ "grad_norm": 2.602651357650757,
+ "learning_rate": 4.448674955954947e-06,
+ "loss": 0.5118,
+ "step": 2813
+ },
+ {
+ "epoch": 1.3304964539007091,
+ "grad_norm": 3.0129756927490234,
+ "learning_rate": 4.448284105382548e-06,
+ "loss": 0.6136,
+ "step": 2814
+ },
+ {
+ "epoch": 1.33096926713948,
+ "grad_norm": 2.8499927520751953,
+ "learning_rate": 4.447893133499039e-06,
+ "loss": 0.5286,
+ "step": 2815
+ },
+ {
+ "epoch": 1.3314420803782505,
+ "grad_norm": 2.8320744037628174,
+ "learning_rate": 4.447502040328767e-06,
+ "loss": 0.5186,
+ "step": 2816
+ },
+ {
+ "epoch": 1.3319148936170213,
+ "grad_norm": 2.499950885772705,
+ "learning_rate": 4.447110825896084e-06,
+ "loss": 0.5338,
+ "step": 2817
+ },
+ {
+ "epoch": 1.3323877068557919,
+ "grad_norm": 2.530895233154297,
+ "learning_rate": 4.446719490225346e-06,
+ "loss": 0.5151,
+ "step": 2818
+ },
+ {
+ "epoch": 1.3328605200945627,
+ "grad_norm": 2.5276098251342773,
+ "learning_rate": 4.446328033340921e-06,
+ "loss": 0.5424,
+ "step": 2819
+ },
+ {
+ "epoch": 1.3333333333333333,
+ "grad_norm": 2.90218186378479,
+ "learning_rate": 4.4459364552671845e-06,
+ "loss": 0.5747,
+ "step": 2820
+ },
+ {
+ "epoch": 1.333806146572104,
+ "grad_norm": 2.500943183898926,
+ "learning_rate": 4.445544756028518e-06,
+ "loss": 0.5459,
+ "step": 2821
+ },
+ {
+ "epoch": 1.3342789598108746,
+ "grad_norm": 2.960374355316162,
+ "learning_rate": 4.44515293564931e-06,
+ "loss": 0.6092,
+ "step": 2822
+ },
+ {
+ "epoch": 1.3347517730496454,
+ "grad_norm": 2.813671827316284,
+ "learning_rate": 4.444760994153958e-06,
+ "loss": 0.5536,
+ "step": 2823
+ },
+ {
+ "epoch": 1.335224586288416,
+ "grad_norm": 2.7147483825683594,
+ "learning_rate": 4.444368931566867e-06,
+ "loss": 0.5291,
+ "step": 2824
+ },
+ {
+ "epoch": 1.3356973995271868,
+ "grad_norm": 2.710101842880249,
+ "learning_rate": 4.443976747912447e-06,
+ "loss": 0.5138,
+ "step": 2825
+ },
+ {
+ "epoch": 1.3361702127659574,
+ "grad_norm": 2.711419105529785,
+ "learning_rate": 4.443584443215121e-06,
+ "loss": 0.5223,
+ "step": 2826
+ },
+ {
+ "epoch": 1.3366430260047282,
+ "grad_norm": 2.887472152709961,
+ "learning_rate": 4.443192017499313e-06,
+ "loss": 0.5464,
+ "step": 2827
+ },
+ {
+ "epoch": 1.3371158392434987,
+ "grad_norm": 2.8867223262786865,
+ "learning_rate": 4.4427994707894585e-06,
+ "loss": 0.5748,
+ "step": 2828
+ },
+ {
+ "epoch": 1.3375886524822695,
+ "grad_norm": 2.407247543334961,
+ "learning_rate": 4.44240680311e-06,
+ "loss": 0.4727,
+ "step": 2829
+ },
+ {
+ "epoch": 1.3380614657210401,
+ "grad_norm": 2.578420877456665,
+ "learning_rate": 4.4420140144853865e-06,
+ "loss": 0.5129,
+ "step": 2830
+ },
+ {
+ "epoch": 1.338534278959811,
+ "grad_norm": 2.884373426437378,
+ "learning_rate": 4.441621104940077e-06,
+ "loss": 0.5366,
+ "step": 2831
+ },
+ {
+ "epoch": 1.3390070921985815,
+ "grad_norm": 2.8652374744415283,
+ "learning_rate": 4.441228074498534e-06,
+ "loss": 0.5045,
+ "step": 2832
+ },
+ {
+ "epoch": 1.3394799054373523,
+ "grad_norm": 2.5380210876464844,
+ "learning_rate": 4.440834923185231e-06,
+ "loss": 0.509,
+ "step": 2833
+ },
+ {
+ "epoch": 1.3399527186761229,
+ "grad_norm": 2.415734052658081,
+ "learning_rate": 4.440441651024648e-06,
+ "loss": 0.5066,
+ "step": 2834
+ },
+ {
+ "epoch": 1.3404255319148937,
+ "grad_norm": 2.503051996231079,
+ "learning_rate": 4.440048258041272e-06,
+ "loss": 0.5118,
+ "step": 2835
+ },
+ {
+ "epoch": 1.3408983451536642,
+ "grad_norm": 3.351001024246216,
+ "learning_rate": 4.439654744259598e-06,
+ "loss": 0.5758,
+ "step": 2836
+ },
+ {
+ "epoch": 1.341371158392435,
+ "grad_norm": 2.7368781566619873,
+ "learning_rate": 4.439261109704129e-06,
+ "loss": 0.5674,
+ "step": 2837
+ },
+ {
+ "epoch": 1.3418439716312056,
+ "grad_norm": 3.008199453353882,
+ "learning_rate": 4.438867354399372e-06,
+ "loss": 0.5891,
+ "step": 2838
+ },
+ {
+ "epoch": 1.3423167848699764,
+ "grad_norm": 2.538907766342163,
+ "learning_rate": 4.438473478369847e-06,
+ "loss": 0.5102,
+ "step": 2839
+ },
+ {
+ "epoch": 1.342789598108747,
+ "grad_norm": 2.7169063091278076,
+ "learning_rate": 4.438079481640079e-06,
+ "loss": 0.6131,
+ "step": 2840
+ },
+ {
+ "epoch": 1.3432624113475178,
+ "grad_norm": 2.7411608695983887,
+ "learning_rate": 4.437685364234601e-06,
+ "loss": 0.5337,
+ "step": 2841
+ },
+ {
+ "epoch": 1.3437352245862884,
+ "grad_norm": 3.2374939918518066,
+ "learning_rate": 4.43729112617795e-06,
+ "loss": 0.5401,
+ "step": 2842
+ },
+ {
+ "epoch": 1.3442080378250592,
+ "grad_norm": 2.4712226390838623,
+ "learning_rate": 4.436896767494676e-06,
+ "loss": 0.5365,
+ "step": 2843
+ },
+ {
+ "epoch": 1.3446808510638297,
+ "grad_norm": 2.661619186401367,
+ "learning_rate": 4.436502288209334e-06,
+ "loss": 0.4919,
+ "step": 2844
+ },
+ {
+ "epoch": 1.3451536643026005,
+ "grad_norm": 2.5943779945373535,
+ "learning_rate": 4.4361076883464845e-06,
+ "loss": 0.5253,
+ "step": 2845
+ },
+ {
+ "epoch": 1.345626477541371,
+ "grad_norm": 2.672297477722168,
+ "learning_rate": 4.4357129679307e-06,
+ "loss": 0.541,
+ "step": 2846
+ },
+ {
+ "epoch": 1.346099290780142,
+ "grad_norm": 2.6830925941467285,
+ "learning_rate": 4.435318126986557e-06,
+ "loss": 0.5641,
+ "step": 2847
+ },
+ {
+ "epoch": 1.3465721040189125,
+ "grad_norm": 2.7394626140594482,
+ "learning_rate": 4.434923165538639e-06,
+ "loss": 0.5591,
+ "step": 2848
+ },
+ {
+ "epoch": 1.3470449172576833,
+ "grad_norm": 2.9656317234039307,
+ "learning_rate": 4.434528083611541e-06,
+ "loss": 0.515,
+ "step": 2849
+ },
+ {
+ "epoch": 1.3475177304964538,
+ "grad_norm": 3.30155086517334,
+ "learning_rate": 4.434132881229861e-06,
+ "loss": 0.5871,
+ "step": 2850
+ },
+ {
+ "epoch": 1.3479905437352246,
+ "grad_norm": 2.6222476959228516,
+ "learning_rate": 4.433737558418209e-06,
+ "loss": 0.5143,
+ "step": 2851
+ },
+ {
+ "epoch": 1.3484633569739952,
+ "grad_norm": 2.903158187866211,
+ "learning_rate": 4.4333421152011965e-06,
+ "loss": 0.4484,
+ "step": 2852
+ },
+ {
+ "epoch": 1.348936170212766,
+ "grad_norm": 2.863116979598999,
+ "learning_rate": 4.432946551603449e-06,
+ "loss": 0.5213,
+ "step": 2853
+ },
+ {
+ "epoch": 1.3494089834515366,
+ "grad_norm": 2.8253962993621826,
+ "learning_rate": 4.432550867649596e-06,
+ "loss": 0.5713,
+ "step": 2854
+ },
+ {
+ "epoch": 1.3498817966903074,
+ "grad_norm": 2.652493953704834,
+ "learning_rate": 4.432155063364273e-06,
+ "loss": 0.5559,
+ "step": 2855
+ },
+ {
+ "epoch": 1.350354609929078,
+ "grad_norm": 2.4289376735687256,
+ "learning_rate": 4.431759138772127e-06,
+ "loss": 0.5122,
+ "step": 2856
+ },
+ {
+ "epoch": 1.3508274231678488,
+ "grad_norm": 2.6329853534698486,
+ "learning_rate": 4.43136309389781e-06,
+ "loss": 0.5332,
+ "step": 2857
+ },
+ {
+ "epoch": 1.3513002364066193,
+ "grad_norm": 2.431103229522705,
+ "learning_rate": 4.430966928765982e-06,
+ "loss": 0.4863,
+ "step": 2858
+ },
+ {
+ "epoch": 1.3517730496453901,
+ "grad_norm": 2.7529025077819824,
+ "learning_rate": 4.4305706434013106e-06,
+ "loss": 0.5263,
+ "step": 2859
+ },
+ {
+ "epoch": 1.3522458628841607,
+ "grad_norm": 2.884605646133423,
+ "learning_rate": 4.43017423782847e-06,
+ "loss": 0.564,
+ "step": 2860
+ },
+ {
+ "epoch": 1.3527186761229315,
+ "grad_norm": 3.027771234512329,
+ "learning_rate": 4.4297777120721435e-06,
+ "loss": 0.5846,
+ "step": 2861
+ },
+ {
+ "epoch": 1.353191489361702,
+ "grad_norm": 3.0140626430511475,
+ "learning_rate": 4.4293810661570205e-06,
+ "loss": 0.6621,
+ "step": 2862
+ },
+ {
+ "epoch": 1.3536643026004729,
+ "grad_norm": 2.721799612045288,
+ "learning_rate": 4.428984300107799e-06,
+ "loss": 0.5566,
+ "step": 2863
+ },
+ {
+ "epoch": 1.3541371158392435,
+ "grad_norm": 3.0016496181488037,
+ "learning_rate": 4.428587413949183e-06,
+ "loss": 0.5525,
+ "step": 2864
+ },
+ {
+ "epoch": 1.3546099290780143,
+ "grad_norm": 2.77138614654541,
+ "learning_rate": 4.428190407705886e-06,
+ "loss": 0.6016,
+ "step": 2865
+ },
+ {
+ "epoch": 1.3550827423167848,
+ "grad_norm": 2.9783477783203125,
+ "learning_rate": 4.427793281402627e-06,
+ "loss": 0.5556,
+ "step": 2866
+ },
+ {
+ "epoch": 1.3555555555555556,
+ "grad_norm": 2.2490382194519043,
+ "learning_rate": 4.427396035064132e-06,
+ "loss": 0.5138,
+ "step": 2867
+ },
+ {
+ "epoch": 1.3560283687943262,
+ "grad_norm": 2.442225217819214,
+ "learning_rate": 4.426998668715139e-06,
+ "loss": 0.4843,
+ "step": 2868
+ },
+ {
+ "epoch": 1.356501182033097,
+ "grad_norm": 2.74040150642395,
+ "learning_rate": 4.426601182380388e-06,
+ "loss": 0.54,
+ "step": 2869
+ },
+ {
+ "epoch": 1.3569739952718676,
+ "grad_norm": 2.4434332847595215,
+ "learning_rate": 4.426203576084629e-06,
+ "loss": 0.5199,
+ "step": 2870
+ },
+ {
+ "epoch": 1.3574468085106384,
+ "grad_norm": 2.6380388736724854,
+ "learning_rate": 4.42580584985262e-06,
+ "loss": 0.5049,
+ "step": 2871
+ },
+ {
+ "epoch": 1.357919621749409,
+ "grad_norm": 2.7324254512786865,
+ "learning_rate": 4.425408003709125e-06,
+ "loss": 0.5036,
+ "step": 2872
+ },
+ {
+ "epoch": 1.3583924349881797,
+ "grad_norm": 2.661012649536133,
+ "learning_rate": 4.425010037678916e-06,
+ "loss": 0.4965,
+ "step": 2873
+ },
+ {
+ "epoch": 1.3588652482269503,
+ "grad_norm": 2.5380208492279053,
+ "learning_rate": 4.424611951786773e-06,
+ "loss": 0.4293,
+ "step": 2874
+ },
+ {
+ "epoch": 1.3593380614657211,
+ "grad_norm": 2.6060714721679688,
+ "learning_rate": 4.424213746057483e-06,
+ "loss": 0.5335,
+ "step": 2875
+ },
+ {
+ "epoch": 1.3598108747044917,
+ "grad_norm": 2.98282527923584,
+ "learning_rate": 4.423815420515841e-06,
+ "loss": 0.5626,
+ "step": 2876
+ },
+ {
+ "epoch": 1.3602836879432625,
+ "grad_norm": 2.779371500015259,
+ "learning_rate": 4.423416975186647e-06,
+ "loss": 0.5353,
+ "step": 2877
+ },
+ {
+ "epoch": 1.360756501182033,
+ "grad_norm": 2.8033530712127686,
+ "learning_rate": 4.423018410094713e-06,
+ "loss": 0.538,
+ "step": 2878
+ },
+ {
+ "epoch": 1.3612293144208039,
+ "grad_norm": 3.225177764892578,
+ "learning_rate": 4.422619725264855e-06,
+ "loss": 0.5441,
+ "step": 2879
+ },
+ {
+ "epoch": 1.3617021276595744,
+ "grad_norm": 2.959135055541992,
+ "learning_rate": 4.422220920721896e-06,
+ "loss": 0.5293,
+ "step": 2880
+ },
+ {
+ "epoch": 1.3621749408983452,
+ "grad_norm": 2.5558884143829346,
+ "learning_rate": 4.4218219964906704e-06,
+ "loss": 0.442,
+ "step": 2881
+ },
+ {
+ "epoch": 1.3626477541371158,
+ "grad_norm": 2.694899797439575,
+ "learning_rate": 4.421422952596015e-06,
+ "loss": 0.5318,
+ "step": 2882
+ },
+ {
+ "epoch": 1.3631205673758866,
+ "grad_norm": 2.7909531593322754,
+ "learning_rate": 4.421023789062777e-06,
+ "loss": 0.6648,
+ "step": 2883
+ },
+ {
+ "epoch": 1.3635933806146572,
+ "grad_norm": 2.421995162963867,
+ "learning_rate": 4.420624505915813e-06,
+ "loss": 0.4644,
+ "step": 2884
+ },
+ {
+ "epoch": 1.364066193853428,
+ "grad_norm": 2.5876688957214355,
+ "learning_rate": 4.420225103179981e-06,
+ "loss": 0.5743,
+ "step": 2885
+ },
+ {
+ "epoch": 1.3645390070921986,
+ "grad_norm": 2.89341139793396,
+ "learning_rate": 4.419825580880152e-06,
+ "loss": 0.5454,
+ "step": 2886
+ },
+ {
+ "epoch": 1.3650118203309693,
+ "grad_norm": 2.534708261489868,
+ "learning_rate": 4.419425939041203e-06,
+ "loss": 0.5572,
+ "step": 2887
+ },
+ {
+ "epoch": 1.36548463356974,
+ "grad_norm": 2.6052141189575195,
+ "learning_rate": 4.419026177688017e-06,
+ "loss": 0.4763,
+ "step": 2888
+ },
+ {
+ "epoch": 1.3659574468085105,
+ "grad_norm": 2.723720073699951,
+ "learning_rate": 4.4186262968454854e-06,
+ "loss": 0.5659,
+ "step": 2889
+ },
+ {
+ "epoch": 1.3664302600472813,
+ "grad_norm": 2.8909599781036377,
+ "learning_rate": 4.418226296538507e-06,
+ "loss": 0.4996,
+ "step": 2890
+ },
+ {
+ "epoch": 1.366903073286052,
+ "grad_norm": 2.551375389099121,
+ "learning_rate": 4.417826176791988e-06,
+ "loss": 0.5259,
+ "step": 2891
+ },
+ {
+ "epoch": 1.3673758865248227,
+ "grad_norm": 3.360267162322998,
+ "learning_rate": 4.417425937630843e-06,
+ "loss": 0.5381,
+ "step": 2892
+ },
+ {
+ "epoch": 1.3678486997635932,
+ "grad_norm": 2.7611942291259766,
+ "learning_rate": 4.417025579079992e-06,
+ "loss": 0.6022,
+ "step": 2893
+ },
+ {
+ "epoch": 1.368321513002364,
+ "grad_norm": 2.5931224822998047,
+ "learning_rate": 4.416625101164365e-06,
+ "loss": 0.5102,
+ "step": 2894
+ },
+ {
+ "epoch": 1.3687943262411348,
+ "grad_norm": 2.5888102054595947,
+ "learning_rate": 4.416224503908897e-06,
+ "loss": 0.4955,
+ "step": 2895
+ },
+ {
+ "epoch": 1.3692671394799054,
+ "grad_norm": 2.6262896060943604,
+ "learning_rate": 4.41582378733853e-06,
+ "loss": 0.5101,
+ "step": 2896
+ },
+ {
+ "epoch": 1.369739952718676,
+ "grad_norm": 3.339170217514038,
+ "learning_rate": 4.415422951478218e-06,
+ "loss": 0.4939,
+ "step": 2897
+ },
+ {
+ "epoch": 1.3702127659574468,
+ "grad_norm": 2.940866708755493,
+ "learning_rate": 4.415021996352917e-06,
+ "loss": 0.5157,
+ "step": 2898
+ },
+ {
+ "epoch": 1.3706855791962176,
+ "grad_norm": 2.7423818111419678,
+ "learning_rate": 4.414620921987594e-06,
+ "loss": 0.5308,
+ "step": 2899
+ },
+ {
+ "epoch": 1.3711583924349882,
+ "grad_norm": 2.7177040576934814,
+ "learning_rate": 4.414219728407221e-06,
+ "loss": 0.5429,
+ "step": 2900
+ },
+ {
+ "epoch": 1.3716312056737587,
+ "grad_norm": 2.560774087905884,
+ "learning_rate": 4.4138184156367794e-06,
+ "loss": 0.5266,
+ "step": 2901
+ },
+ {
+ "epoch": 1.3721040189125295,
+ "grad_norm": 2.5649116039276123,
+ "learning_rate": 4.413416983701256e-06,
+ "loss": 0.4718,
+ "step": 2902
+ },
+ {
+ "epoch": 1.3725768321513003,
+ "grad_norm": 2.8547167778015137,
+ "learning_rate": 4.413015432625648e-06,
+ "loss": 0.5129,
+ "step": 2903
+ },
+ {
+ "epoch": 1.373049645390071,
+ "grad_norm": 2.5413618087768555,
+ "learning_rate": 4.412613762434958e-06,
+ "loss": 0.5738,
+ "step": 2904
+ },
+ {
+ "epoch": 1.3735224586288415,
+ "grad_norm": 3.3252241611480713,
+ "learning_rate": 4.412211973154195e-06,
+ "loss": 0.5639,
+ "step": 2905
+ },
+ {
+ "epoch": 1.3739952718676123,
+ "grad_norm": 2.869102954864502,
+ "learning_rate": 4.411810064808376e-06,
+ "loss": 0.5384,
+ "step": 2906
+ },
+ {
+ "epoch": 1.374468085106383,
+ "grad_norm": 2.703199863433838,
+ "learning_rate": 4.411408037422529e-06,
+ "loss": 0.5742,
+ "step": 2907
+ },
+ {
+ "epoch": 1.3749408983451537,
+ "grad_norm": 2.685450792312622,
+ "learning_rate": 4.411005891021684e-06,
+ "loss": 0.5121,
+ "step": 2908
+ },
+ {
+ "epoch": 1.3754137115839242,
+ "grad_norm": 2.9572203159332275,
+ "learning_rate": 4.410603625630882e-06,
+ "loss": 0.5444,
+ "step": 2909
+ },
+ {
+ "epoch": 1.375886524822695,
+ "grad_norm": 2.707002878189087,
+ "learning_rate": 4.410201241275169e-06,
+ "loss": 0.5125,
+ "step": 2910
+ },
+ {
+ "epoch": 1.3763593380614658,
+ "grad_norm": 3.0158939361572266,
+ "learning_rate": 4.409798737979602e-06,
+ "loss": 0.5299,
+ "step": 2911
+ },
+ {
+ "epoch": 1.3768321513002364,
+ "grad_norm": 2.7932698726654053,
+ "learning_rate": 4.4093961157692415e-06,
+ "loss": 0.5437,
+ "step": 2912
+ },
+ {
+ "epoch": 1.377304964539007,
+ "grad_norm": 2.459510326385498,
+ "learning_rate": 4.408993374669156e-06,
+ "loss": 0.5548,
+ "step": 2913
+ },
+ {
+ "epoch": 1.3777777777777778,
+ "grad_norm": 2.7500696182250977,
+ "learning_rate": 4.408590514704425e-06,
+ "loss": 0.5186,
+ "step": 2914
+ },
+ {
+ "epoch": 1.3782505910165486,
+ "grad_norm": 2.7824268341064453,
+ "learning_rate": 4.4081875359001315e-06,
+ "loss": 0.4762,
+ "step": 2915
+ },
+ {
+ "epoch": 1.3787234042553191,
+ "grad_norm": 2.4202158451080322,
+ "learning_rate": 4.4077844382813675e-06,
+ "loss": 0.5005,
+ "step": 2916
+ },
+ {
+ "epoch": 1.3791962174940897,
+ "grad_norm": 2.5566670894622803,
+ "learning_rate": 4.4073812218732316e-06,
+ "loss": 0.5377,
+ "step": 2917
+ },
+ {
+ "epoch": 1.3796690307328605,
+ "grad_norm": 3.400874376296997,
+ "learning_rate": 4.406977886700831e-06,
+ "loss": 0.6637,
+ "step": 2918
+ },
+ {
+ "epoch": 1.3801418439716313,
+ "grad_norm": 2.8187878131866455,
+ "learning_rate": 4.406574432789278e-06,
+ "loss": 0.5033,
+ "step": 2919
+ },
+ {
+ "epoch": 1.3806146572104019,
+ "grad_norm": 2.5578041076660156,
+ "learning_rate": 4.406170860163697e-06,
+ "loss": 0.5293,
+ "step": 2920
+ },
+ {
+ "epoch": 1.3810874704491725,
+ "grad_norm": 2.6709718704223633,
+ "learning_rate": 4.405767168849213e-06,
+ "loss": 0.5144,
+ "step": 2921
+ },
+ {
+ "epoch": 1.3815602836879433,
+ "grad_norm": 3.049365997314453,
+ "learning_rate": 4.405363358870965e-06,
+ "loss": 0.4894,
+ "step": 2922
+ },
+ {
+ "epoch": 1.382033096926714,
+ "grad_norm": 2.5569891929626465,
+ "learning_rate": 4.404959430254095e-06,
+ "loss": 0.4929,
+ "step": 2923
+ },
+ {
+ "epoch": 1.3825059101654846,
+ "grad_norm": 2.8288230895996094,
+ "learning_rate": 4.404555383023754e-06,
+ "loss": 0.5438,
+ "step": 2924
+ },
+ {
+ "epoch": 1.3829787234042552,
+ "grad_norm": 2.8363358974456787,
+ "learning_rate": 4.404151217205102e-06,
+ "loss": 0.545,
+ "step": 2925
+ },
+ {
+ "epoch": 1.383451536643026,
+ "grad_norm": 2.720972776412964,
+ "learning_rate": 4.403746932823302e-06,
+ "loss": 0.5732,
+ "step": 2926
+ },
+ {
+ "epoch": 1.3839243498817968,
+ "grad_norm": 2.728043794631958,
+ "learning_rate": 4.403342529903528e-06,
+ "loss": 0.4944,
+ "step": 2927
+ },
+ {
+ "epoch": 1.3843971631205674,
+ "grad_norm": 2.4366135597229004,
+ "learning_rate": 4.402938008470961e-06,
+ "loss": 0.4441,
+ "step": 2928
+ },
+ {
+ "epoch": 1.384869976359338,
+ "grad_norm": 2.858454704284668,
+ "learning_rate": 4.402533368550788e-06,
+ "loss": 0.5359,
+ "step": 2929
+ },
+ {
+ "epoch": 1.3853427895981087,
+ "grad_norm": 2.805795907974243,
+ "learning_rate": 4.402128610168205e-06,
+ "loss": 0.4954,
+ "step": 2930
+ },
+ {
+ "epoch": 1.3858156028368795,
+ "grad_norm": 3.3514177799224854,
+ "learning_rate": 4.401723733348413e-06,
+ "loss": 0.579,
+ "step": 2931
+ },
+ {
+ "epoch": 1.3862884160756501,
+ "grad_norm": 2.6255125999450684,
+ "learning_rate": 4.401318738116624e-06,
+ "loss": 0.5002,
+ "step": 2932
+ },
+ {
+ "epoch": 1.3867612293144207,
+ "grad_norm": 2.3480796813964844,
+ "learning_rate": 4.400913624498054e-06,
+ "loss": 0.4688,
+ "step": 2933
+ },
+ {
+ "epoch": 1.3872340425531915,
+ "grad_norm": 2.710165023803711,
+ "learning_rate": 4.400508392517927e-06,
+ "loss": 0.5099,
+ "step": 2934
+ },
+ {
+ "epoch": 1.3877068557919623,
+ "grad_norm": 2.5820295810699463,
+ "learning_rate": 4.400103042201477e-06,
+ "loss": 0.512,
+ "step": 2935
+ },
+ {
+ "epoch": 1.3881796690307329,
+ "grad_norm": 2.750596523284912,
+ "learning_rate": 4.399697573573942e-06,
+ "loss": 0.463,
+ "step": 2936
+ },
+ {
+ "epoch": 1.3886524822695034,
+ "grad_norm": 3.497537612915039,
+ "learning_rate": 4.399291986660569e-06,
+ "loss": 0.5676,
+ "step": 2937
+ },
+ {
+ "epoch": 1.3891252955082742,
+ "grad_norm": 2.4046003818511963,
+ "learning_rate": 4.398886281486612e-06,
+ "loss": 0.5408,
+ "step": 2938
+ },
+ {
+ "epoch": 1.389598108747045,
+ "grad_norm": 2.941606283187866,
+ "learning_rate": 4.398480458077332e-06,
+ "loss": 0.5734,
+ "step": 2939
+ },
+ {
+ "epoch": 1.3900709219858156,
+ "grad_norm": 3.030214309692383,
+ "learning_rate": 4.398074516458e-06,
+ "loss": 0.5353,
+ "step": 2940
+ },
+ {
+ "epoch": 1.3905437352245862,
+ "grad_norm": 2.9991626739501953,
+ "learning_rate": 4.397668456653889e-06,
+ "loss": 0.5989,
+ "step": 2941
+ },
+ {
+ "epoch": 1.391016548463357,
+ "grad_norm": 4.163141250610352,
+ "learning_rate": 4.397262278690285e-06,
+ "loss": 0.5436,
+ "step": 2942
+ },
+ {
+ "epoch": 1.3914893617021278,
+ "grad_norm": 2.6576037406921387,
+ "learning_rate": 4.396855982592478e-06,
+ "loss": 0.5206,
+ "step": 2943
+ },
+ {
+ "epoch": 1.3919621749408984,
+ "grad_norm": 2.7729203701019287,
+ "learning_rate": 4.396449568385768e-06,
+ "loss": 0.5403,
+ "step": 2944
+ },
+ {
+ "epoch": 1.392434988179669,
+ "grad_norm": 2.4560446739196777,
+ "learning_rate": 4.396043036095457e-06,
+ "loss": 0.4924,
+ "step": 2945
+ },
+ {
+ "epoch": 1.3929078014184397,
+ "grad_norm": 2.6370556354522705,
+ "learning_rate": 4.39563638574686e-06,
+ "loss": 0.5543,
+ "step": 2946
+ },
+ {
+ "epoch": 1.3933806146572105,
+ "grad_norm": 2.593914270401001,
+ "learning_rate": 4.395229617365298e-06,
+ "loss": 0.5133,
+ "step": 2947
+ },
+ {
+ "epoch": 1.393853427895981,
+ "grad_norm": 2.3583998680114746,
+ "learning_rate": 4.394822730976099e-06,
+ "loss": 0.4436,
+ "step": 2948
+ },
+ {
+ "epoch": 1.3943262411347517,
+ "grad_norm": 3.2768537998199463,
+ "learning_rate": 4.394415726604596e-06,
+ "loss": 0.5489,
+ "step": 2949
+ },
+ {
+ "epoch": 1.3947990543735225,
+ "grad_norm": 2.88662052154541,
+ "learning_rate": 4.394008604276133e-06,
+ "loss": 0.5194,
+ "step": 2950
+ },
+ {
+ "epoch": 1.3952718676122933,
+ "grad_norm": 2.46610426902771,
+ "learning_rate": 4.393601364016059e-06,
+ "loss": 0.5255,
+ "step": 2951
+ },
+ {
+ "epoch": 1.3957446808510638,
+ "grad_norm": 3.122509241104126,
+ "learning_rate": 4.393194005849731e-06,
+ "loss": 0.6046,
+ "step": 2952
+ },
+ {
+ "epoch": 1.3962174940898344,
+ "grad_norm": 2.724926471710205,
+ "learning_rate": 4.392786529802513e-06,
+ "loss": 0.4958,
+ "step": 2953
+ },
+ {
+ "epoch": 1.3966903073286052,
+ "grad_norm": 2.491485595703125,
+ "learning_rate": 4.3923789358997785e-06,
+ "loss": 0.5209,
+ "step": 2954
+ },
+ {
+ "epoch": 1.397163120567376,
+ "grad_norm": 2.61110520362854,
+ "learning_rate": 4.3919712241669056e-06,
+ "loss": 0.5202,
+ "step": 2955
+ },
+ {
+ "epoch": 1.3976359338061466,
+ "grad_norm": 2.3814501762390137,
+ "learning_rate": 4.39156339462928e-06,
+ "loss": 0.4966,
+ "step": 2956
+ },
+ {
+ "epoch": 1.3981087470449172,
+ "grad_norm": 2.762498617172241,
+ "learning_rate": 4.391155447312296e-06,
+ "loss": 0.6025,
+ "step": 2957
+ },
+ {
+ "epoch": 1.398581560283688,
+ "grad_norm": 2.964975595474243,
+ "learning_rate": 4.390747382241355e-06,
+ "loss": 0.4845,
+ "step": 2958
+ },
+ {
+ "epoch": 1.3990543735224588,
+ "grad_norm": 3.0117249488830566,
+ "learning_rate": 4.3903391994418655e-06,
+ "loss": 0.5326,
+ "step": 2959
+ },
+ {
+ "epoch": 1.3995271867612293,
+ "grad_norm": 2.578626871109009,
+ "learning_rate": 4.389930898939243e-06,
+ "loss": 0.5271,
+ "step": 2960
+ },
+ {
+ "epoch": 1.4,
+ "grad_norm": 2.747441053390503,
+ "learning_rate": 4.38952248075891e-06,
+ "loss": 0.5553,
+ "step": 2961
+ },
+ {
+ "epoch": 1.4004728132387707,
+ "grad_norm": 2.8273086547851562,
+ "learning_rate": 4.389113944926297e-06,
+ "loss": 0.5475,
+ "step": 2962
+ },
+ {
+ "epoch": 1.4009456264775415,
+ "grad_norm": 2.55238676071167,
+ "learning_rate": 4.388705291466843e-06,
+ "loss": 0.4864,
+ "step": 2963
+ },
+ {
+ "epoch": 1.401418439716312,
+ "grad_norm": 2.597214460372925,
+ "learning_rate": 4.388296520405992e-06,
+ "loss": 0.4845,
+ "step": 2964
+ },
+ {
+ "epoch": 1.4018912529550827,
+ "grad_norm": 2.608962297439575,
+ "learning_rate": 4.387887631769196e-06,
+ "loss": 0.5544,
+ "step": 2965
+ },
+ {
+ "epoch": 1.4023640661938535,
+ "grad_norm": 2.2754876613616943,
+ "learning_rate": 4.3874786255819165e-06,
+ "loss": 0.5045,
+ "step": 2966
+ },
+ {
+ "epoch": 1.4028368794326243,
+ "grad_norm": 2.9900264739990234,
+ "learning_rate": 4.387069501869618e-06,
+ "loss": 0.562,
+ "step": 2967
+ },
+ {
+ "epoch": 1.4033096926713948,
+ "grad_norm": 2.8069417476654053,
+ "learning_rate": 4.386660260657778e-06,
+ "loss": 0.5284,
+ "step": 2968
+ },
+ {
+ "epoch": 1.4037825059101654,
+ "grad_norm": 2.68894624710083,
+ "learning_rate": 4.386250901971875e-06,
+ "loss": 0.5879,
+ "step": 2969
+ },
+ {
+ "epoch": 1.4042553191489362,
+ "grad_norm": 2.614485025405884,
+ "learning_rate": 4.385841425837399e-06,
+ "loss": 0.4771,
+ "step": 2970
+ },
+ {
+ "epoch": 1.4047281323877068,
+ "grad_norm": 2.487950325012207,
+ "learning_rate": 4.385431832279848e-06,
+ "loss": 0.5552,
+ "step": 2971
+ },
+ {
+ "epoch": 1.4052009456264776,
+ "grad_norm": 2.5098392963409424,
+ "learning_rate": 4.385022121324723e-06,
+ "loss": 0.5267,
+ "step": 2972
+ },
+ {
+ "epoch": 1.4056737588652481,
+ "grad_norm": 2.825838565826416,
+ "learning_rate": 4.384612292997537e-06,
+ "loss": 0.5336,
+ "step": 2973
+ },
+ {
+ "epoch": 1.406146572104019,
+ "grad_norm": 2.898188829421997,
+ "learning_rate": 4.384202347323806e-06,
+ "loss": 0.5685,
+ "step": 2974
+ },
+ {
+ "epoch": 1.4066193853427895,
+ "grad_norm": 2.8722569942474365,
+ "learning_rate": 4.383792284329057e-06,
+ "loss": 0.5977,
+ "step": 2975
+ },
+ {
+ "epoch": 1.4070921985815603,
+ "grad_norm": 2.832951307296753,
+ "learning_rate": 4.3833821040388235e-06,
+ "loss": 0.5766,
+ "step": 2976
+ },
+ {
+ "epoch": 1.407565011820331,
+ "grad_norm": 2.7353670597076416,
+ "learning_rate": 4.3829718064786446e-06,
+ "loss": 0.5461,
+ "step": 2977
+ },
+ {
+ "epoch": 1.4080378250591017,
+ "grad_norm": 2.6050429344177246,
+ "learning_rate": 4.3825613916740675e-06,
+ "loss": 0.5501,
+ "step": 2978
+ },
+ {
+ "epoch": 1.4085106382978723,
+ "grad_norm": 2.79719877243042,
+ "learning_rate": 4.382150859650647e-06,
+ "loss": 0.502,
+ "step": 2979
+ },
+ {
+ "epoch": 1.408983451536643,
+ "grad_norm": 2.5538079738616943,
+ "learning_rate": 4.381740210433946e-06,
+ "loss": 0.4762,
+ "step": 2980
+ },
+ {
+ "epoch": 1.4094562647754136,
+ "grad_norm": 2.7256062030792236,
+ "learning_rate": 4.381329444049533e-06,
+ "loss": 0.4692,
+ "step": 2981
+ },
+ {
+ "epoch": 1.4099290780141844,
+ "grad_norm": 2.7778146266937256,
+ "learning_rate": 4.3809185605229855e-06,
+ "loss": 0.5366,
+ "step": 2982
+ },
+ {
+ "epoch": 1.410401891252955,
+ "grad_norm": 2.6289451122283936,
+ "learning_rate": 4.380507559879887e-06,
+ "loss": 0.5412,
+ "step": 2983
+ },
+ {
+ "epoch": 1.4108747044917258,
+ "grad_norm": 2.697204828262329,
+ "learning_rate": 4.380096442145827e-06,
+ "loss": 0.5065,
+ "step": 2984
+ },
+ {
+ "epoch": 1.4113475177304964,
+ "grad_norm": 2.4709219932556152,
+ "learning_rate": 4.379685207346407e-06,
+ "loss": 0.568,
+ "step": 2985
+ },
+ {
+ "epoch": 1.4118203309692672,
+ "grad_norm": 2.9740655422210693,
+ "learning_rate": 4.379273855507231e-06,
+ "loss": 0.5512,
+ "step": 2986
+ },
+ {
+ "epoch": 1.4122931442080378,
+ "grad_norm": 3.0090627670288086,
+ "learning_rate": 4.378862386653911e-06,
+ "loss": 0.5459,
+ "step": 2987
+ },
+ {
+ "epoch": 1.4127659574468086,
+ "grad_norm": 2.8835368156433105,
+ "learning_rate": 4.378450800812071e-06,
+ "loss": 0.5357,
+ "step": 2988
+ },
+ {
+ "epoch": 1.4132387706855791,
+ "grad_norm": 2.558824062347412,
+ "learning_rate": 4.378039098007335e-06,
+ "loss": 0.536,
+ "step": 2989
+ },
+ {
+ "epoch": 1.41371158392435,
+ "grad_norm": 2.5572092533111572,
+ "learning_rate": 4.377627278265339e-06,
+ "loss": 0.5183,
+ "step": 2990
+ },
+ {
+ "epoch": 1.4141843971631205,
+ "grad_norm": 2.7356579303741455,
+ "learning_rate": 4.377215341611727e-06,
+ "loss": 0.5087,
+ "step": 2991
+ },
+ {
+ "epoch": 1.4146572104018913,
+ "grad_norm": 2.7541024684906006,
+ "learning_rate": 4.376803288072146e-06,
+ "loss": 0.4509,
+ "step": 2992
+ },
+ {
+ "epoch": 1.4151300236406619,
+ "grad_norm": 2.7548446655273438,
+ "learning_rate": 4.376391117672254e-06,
+ "loss": 0.5532,
+ "step": 2993
+ },
+ {
+ "epoch": 1.4156028368794327,
+ "grad_norm": 2.9107465744018555,
+ "learning_rate": 4.375978830437715e-06,
+ "loss": 0.5719,
+ "step": 2994
+ },
+ {
+ "epoch": 1.4160756501182032,
+ "grad_norm": 2.7077393531799316,
+ "learning_rate": 4.3755664263942e-06,
+ "loss": 0.5084,
+ "step": 2995
+ },
+ {
+ "epoch": 1.416548463356974,
+ "grad_norm": 2.764209270477295,
+ "learning_rate": 4.375153905567388e-06,
+ "loss": 0.5976,
+ "step": 2996
+ },
+ {
+ "epoch": 1.4170212765957446,
+ "grad_norm": 2.7792932987213135,
+ "learning_rate": 4.374741267982964e-06,
+ "loss": 0.5358,
+ "step": 2997
+ },
+ {
+ "epoch": 1.4174940898345154,
+ "grad_norm": 2.459212064743042,
+ "learning_rate": 4.374328513666622e-06,
+ "loss": 0.5181,
+ "step": 2998
+ },
+ {
+ "epoch": 1.417966903073286,
+ "grad_norm": 2.548546552658081,
+ "learning_rate": 4.373915642644062e-06,
+ "loss": 0.528,
+ "step": 2999
+ },
+ {
+ "epoch": 1.4184397163120568,
+ "grad_norm": 2.998138189315796,
+ "learning_rate": 4.373502654940992e-06,
+ "loss": 0.5233,
+ "step": 3000
+ },
+ {
+ "epoch": 1.4189125295508274,
+ "grad_norm": 2.604341983795166,
+ "learning_rate": 4.373089550583126e-06,
+ "loss": 0.5274,
+ "step": 3001
+ },
+ {
+ "epoch": 1.4193853427895982,
+ "grad_norm": 2.6792588233947754,
+ "learning_rate": 4.372676329596188e-06,
+ "loss": 0.5061,
+ "step": 3002
+ },
+ {
+ "epoch": 1.4198581560283687,
+ "grad_norm": 2.5182368755340576,
+ "learning_rate": 4.372262992005906e-06,
+ "loss": 0.541,
+ "step": 3003
+ },
+ {
+ "epoch": 1.4203309692671395,
+ "grad_norm": 2.690718173980713,
+ "learning_rate": 4.371849537838018e-06,
+ "loss": 0.5308,
+ "step": 3004
+ },
+ {
+ "epoch": 1.42080378250591,
+ "grad_norm": 2.6797590255737305,
+ "learning_rate": 4.371435967118266e-06,
+ "loss": 0.5728,
+ "step": 3005
+ },
+ {
+ "epoch": 1.421276595744681,
+ "grad_norm": 2.847900152206421,
+ "learning_rate": 4.371022279872403e-06,
+ "loss": 0.5053,
+ "step": 3006
+ },
+ {
+ "epoch": 1.4217494089834515,
+ "grad_norm": 2.497810125350952,
+ "learning_rate": 4.370608476126186e-06,
+ "loss": 0.5057,
+ "step": 3007
+ },
+ {
+ "epoch": 1.4222222222222223,
+ "grad_norm": 2.5259225368499756,
+ "learning_rate": 4.370194555905382e-06,
+ "loss": 0.5508,
+ "step": 3008
+ },
+ {
+ "epoch": 1.4226950354609929,
+ "grad_norm": 2.774118423461914,
+ "learning_rate": 4.369780519235763e-06,
+ "loss": 0.5419,
+ "step": 3009
+ },
+ {
+ "epoch": 1.4231678486997636,
+ "grad_norm": 2.2764663696289062,
+ "learning_rate": 4.369366366143111e-06,
+ "loss": 0.5032,
+ "step": 3010
+ },
+ {
+ "epoch": 1.4236406619385342,
+ "grad_norm": 2.736347198486328,
+ "learning_rate": 4.368952096653211e-06,
+ "loss": 0.5184,
+ "step": 3011
+ },
+ {
+ "epoch": 1.424113475177305,
+ "grad_norm": 2.476762056350708,
+ "learning_rate": 4.36853771079186e-06,
+ "loss": 0.5331,
+ "step": 3012
+ },
+ {
+ "epoch": 1.4245862884160756,
+ "grad_norm": 2.8006162643432617,
+ "learning_rate": 4.3681232085848585e-06,
+ "loss": 0.5331,
+ "step": 3013
+ },
+ {
+ "epoch": 1.4250591016548464,
+ "grad_norm": 2.509143590927124,
+ "learning_rate": 4.367708590058016e-06,
+ "loss": 0.5127,
+ "step": 3014
+ },
+ {
+ "epoch": 1.425531914893617,
+ "grad_norm": 3.030137538909912,
+ "learning_rate": 4.3672938552371505e-06,
+ "loss": 0.5555,
+ "step": 3015
+ },
+ {
+ "epoch": 1.4260047281323878,
+ "grad_norm": 3.0536904335021973,
+ "learning_rate": 4.3668790041480835e-06,
+ "loss": 0.5241,
+ "step": 3016
+ },
+ {
+ "epoch": 1.4264775413711583,
+ "grad_norm": 2.6400439739227295,
+ "learning_rate": 4.366464036816647e-06,
+ "loss": 0.4946,
+ "step": 3017
+ },
+ {
+ "epoch": 1.4269503546099291,
+ "grad_norm": 2.7302589416503906,
+ "learning_rate": 4.366048953268679e-06,
+ "loss": 0.5105,
+ "step": 3018
+ },
+ {
+ "epoch": 1.4274231678486997,
+ "grad_norm": 2.504549264907837,
+ "learning_rate": 4.365633753530026e-06,
+ "loss": 0.4844,
+ "step": 3019
+ },
+ {
+ "epoch": 1.4278959810874705,
+ "grad_norm": 2.3872320652008057,
+ "learning_rate": 4.365218437626539e-06,
+ "loss": 0.4402,
+ "step": 3020
+ },
+ {
+ "epoch": 1.428368794326241,
+ "grad_norm": 2.531649351119995,
+ "learning_rate": 4.364803005584078e-06,
+ "loss": 0.4913,
+ "step": 3021
+ },
+ {
+ "epoch": 1.4288416075650119,
+ "grad_norm": 2.4683783054351807,
+ "learning_rate": 4.364387457428512e-06,
+ "loss": 0.515,
+ "step": 3022
+ },
+ {
+ "epoch": 1.4293144208037825,
+ "grad_norm": 2.632336378097534,
+ "learning_rate": 4.363971793185713e-06,
+ "loss": 0.5398,
+ "step": 3023
+ },
+ {
+ "epoch": 1.4297872340425533,
+ "grad_norm": 2.7456719875335693,
+ "learning_rate": 4.363556012881565e-06,
+ "loss": 0.5254,
+ "step": 3024
+ },
+ {
+ "epoch": 1.4302600472813238,
+ "grad_norm": 2.607177972793579,
+ "learning_rate": 4.363140116541955e-06,
+ "loss": 0.5266,
+ "step": 3025
+ },
+ {
+ "epoch": 1.4307328605200946,
+ "grad_norm": 2.640127420425415,
+ "learning_rate": 4.3627241041927796e-06,
+ "loss": 0.5157,
+ "step": 3026
+ },
+ {
+ "epoch": 1.4312056737588652,
+ "grad_norm": 2.4210736751556396,
+ "learning_rate": 4.362307975859941e-06,
+ "loss": 0.4599,
+ "step": 3027
+ },
+ {
+ "epoch": 1.431678486997636,
+ "grad_norm": 2.6007790565490723,
+ "learning_rate": 4.361891731569352e-06,
+ "loss": 0.5298,
+ "step": 3028
+ },
+ {
+ "epoch": 1.4321513002364066,
+ "grad_norm": 2.5352046489715576,
+ "learning_rate": 4.361475371346928e-06,
+ "loss": 0.5128,
+ "step": 3029
+ },
+ {
+ "epoch": 1.4326241134751774,
+ "grad_norm": 2.4204049110412598,
+ "learning_rate": 4.361058895218596e-06,
+ "loss": 0.4669,
+ "step": 3030
+ },
+ {
+ "epoch": 1.433096926713948,
+ "grad_norm": 2.525240182876587,
+ "learning_rate": 4.360642303210286e-06,
+ "loss": 0.4925,
+ "step": 3031
+ },
+ {
+ "epoch": 1.4335697399527187,
+ "grad_norm": 2.839646339416504,
+ "learning_rate": 4.360225595347939e-06,
+ "loss": 0.5868,
+ "step": 3032
+ },
+ {
+ "epoch": 1.4340425531914893,
+ "grad_norm": 2.5043296813964844,
+ "learning_rate": 4.359808771657501e-06,
+ "loss": 0.4951,
+ "step": 3033
+ },
+ {
+ "epoch": 1.4345153664302601,
+ "grad_norm": 2.9082300662994385,
+ "learning_rate": 4.359391832164927e-06,
+ "loss": 0.5259,
+ "step": 3034
+ },
+ {
+ "epoch": 1.4349881796690307,
+ "grad_norm": 2.6651999950408936,
+ "learning_rate": 4.3589747768961745e-06,
+ "loss": 0.537,
+ "step": 3035
+ },
+ {
+ "epoch": 1.4354609929078015,
+ "grad_norm": 2.577077865600586,
+ "learning_rate": 4.358557605877216e-06,
+ "loss": 0.5186,
+ "step": 3036
+ },
+ {
+ "epoch": 1.435933806146572,
+ "grad_norm": 2.7445287704467773,
+ "learning_rate": 4.3581403191340236e-06,
+ "loss": 0.5573,
+ "step": 3037
+ },
+ {
+ "epoch": 1.4364066193853429,
+ "grad_norm": 2.502086639404297,
+ "learning_rate": 4.357722916692582e-06,
+ "loss": 0.5039,
+ "step": 3038
+ },
+ {
+ "epoch": 1.4368794326241134,
+ "grad_norm": 2.4476163387298584,
+ "learning_rate": 4.357305398578879e-06,
+ "loss": 0.5638,
+ "step": 3039
+ },
+ {
+ "epoch": 1.4373522458628842,
+ "grad_norm": 2.7705588340759277,
+ "learning_rate": 4.356887764818915e-06,
+ "loss": 0.5485,
+ "step": 3040
+ },
+ {
+ "epoch": 1.4378250591016548,
+ "grad_norm": 2.498225450515747,
+ "learning_rate": 4.356470015438691e-06,
+ "loss": 0.5486,
+ "step": 3041
+ },
+ {
+ "epoch": 1.4382978723404256,
+ "grad_norm": 2.394320011138916,
+ "learning_rate": 4.356052150464219e-06,
+ "loss": 0.512,
+ "step": 3042
+ },
+ {
+ "epoch": 1.4387706855791962,
+ "grad_norm": 2.8725767135620117,
+ "learning_rate": 4.3556341699215185e-06,
+ "loss": 0.5202,
+ "step": 3043
+ },
+ {
+ "epoch": 1.439243498817967,
+ "grad_norm": 3.1707918643951416,
+ "learning_rate": 4.355216073836615e-06,
+ "loss": 0.5229,
+ "step": 3044
+ },
+ {
+ "epoch": 1.4397163120567376,
+ "grad_norm": 2.532578468322754,
+ "learning_rate": 4.3547978622355415e-06,
+ "loss": 0.4569,
+ "step": 3045
+ },
+ {
+ "epoch": 1.4401891252955084,
+ "grad_norm": 3.0111029148101807,
+ "learning_rate": 4.354379535144338e-06,
+ "loss": 0.5801,
+ "step": 3046
+ },
+ {
+ "epoch": 1.440661938534279,
+ "grad_norm": 2.9554224014282227,
+ "learning_rate": 4.353961092589052e-06,
+ "loss": 0.5968,
+ "step": 3047
+ },
+ {
+ "epoch": 1.4411347517730497,
+ "grad_norm": 2.7562637329101562,
+ "learning_rate": 4.353542534595738e-06,
+ "loss": 0.5005,
+ "step": 3048
+ },
+ {
+ "epoch": 1.4416075650118203,
+ "grad_norm": 3.083254337310791,
+ "learning_rate": 4.3531238611904595e-06,
+ "loss": 0.5389,
+ "step": 3049
+ },
+ {
+ "epoch": 1.442080378250591,
+ "grad_norm": 2.7778005599975586,
+ "learning_rate": 4.352705072399282e-06,
+ "loss": 0.5342,
+ "step": 3050
+ },
+ {
+ "epoch": 1.4425531914893617,
+ "grad_norm": 2.6673996448516846,
+ "learning_rate": 4.3522861682482845e-06,
+ "loss": 0.5213,
+ "step": 3051
+ },
+ {
+ "epoch": 1.4430260047281322,
+ "grad_norm": 2.637605905532837,
+ "learning_rate": 4.351867148763548e-06,
+ "loss": 0.4893,
+ "step": 3052
+ },
+ {
+ "epoch": 1.443498817966903,
+ "grad_norm": 2.834469795227051,
+ "learning_rate": 4.351448013971166e-06,
+ "loss": 0.5391,
+ "step": 3053
+ },
+ {
+ "epoch": 1.4439716312056738,
+ "grad_norm": 2.824153184890747,
+ "learning_rate": 4.351028763897234e-06,
+ "loss": 0.6403,
+ "step": 3054
+ },
+ {
+ "epoch": 1.4444444444444444,
+ "grad_norm": 2.558966875076294,
+ "learning_rate": 4.350609398567857e-06,
+ "loss": 0.4912,
+ "step": 3055
+ },
+ {
+ "epoch": 1.444917257683215,
+ "grad_norm": 2.281726360321045,
+ "learning_rate": 4.3501899180091475e-06,
+ "loss": 0.4655,
+ "step": 3056
+ },
+ {
+ "epoch": 1.4453900709219858,
+ "grad_norm": 2.499472141265869,
+ "learning_rate": 4.349770322247225e-06,
+ "loss": 0.4878,
+ "step": 3057
+ },
+ {
+ "epoch": 1.4458628841607566,
+ "grad_norm": 2.578615188598633,
+ "learning_rate": 4.349350611308215e-06,
+ "loss": 0.4855,
+ "step": 3058
+ },
+ {
+ "epoch": 1.4463356973995272,
+ "grad_norm": 2.7111165523529053,
+ "learning_rate": 4.348930785218252e-06,
+ "loss": 0.5415,
+ "step": 3059
+ },
+ {
+ "epoch": 1.4468085106382977,
+ "grad_norm": 2.8081610202789307,
+ "learning_rate": 4.348510844003476e-06,
+ "loss": 0.4881,
+ "step": 3060
+ },
+ {
+ "epoch": 1.4472813238770685,
+ "grad_norm": 2.9439868927001953,
+ "learning_rate": 4.348090787690036e-06,
+ "loss": 0.5485,
+ "step": 3061
+ },
+ {
+ "epoch": 1.4477541371158393,
+ "grad_norm": 2.592532157897949,
+ "learning_rate": 4.347670616304085e-06,
+ "loss": 0.4912,
+ "step": 3062
+ },
+ {
+ "epoch": 1.44822695035461,
+ "grad_norm": 2.960592746734619,
+ "learning_rate": 4.347250329871787e-06,
+ "loss": 0.5473,
+ "step": 3063
+ },
+ {
+ "epoch": 1.4486997635933805,
+ "grad_norm": 2.5786688327789307,
+ "learning_rate": 4.3468299284193116e-06,
+ "loss": 0.5348,
+ "step": 3064
+ },
+ {
+ "epoch": 1.4491725768321513,
+ "grad_norm": 2.6084046363830566,
+ "learning_rate": 4.346409411972834e-06,
+ "loss": 0.527,
+ "step": 3065
+ },
+ {
+ "epoch": 1.449645390070922,
+ "grad_norm": 2.489748239517212,
+ "learning_rate": 4.3459887805585385e-06,
+ "loss": 0.4943,
+ "step": 3066
+ },
+ {
+ "epoch": 1.4501182033096927,
+ "grad_norm": 2.452131986618042,
+ "learning_rate": 4.345568034202617e-06,
+ "loss": 0.4886,
+ "step": 3067
+ },
+ {
+ "epoch": 1.4505910165484632,
+ "grad_norm": 2.4034671783447266,
+ "learning_rate": 4.345147172931266e-06,
+ "loss": 0.4689,
+ "step": 3068
+ },
+ {
+ "epoch": 1.451063829787234,
+ "grad_norm": 2.6045448780059814,
+ "learning_rate": 4.344726196770691e-06,
+ "loss": 0.5842,
+ "step": 3069
+ },
+ {
+ "epoch": 1.4515366430260048,
+ "grad_norm": 2.697593927383423,
+ "learning_rate": 4.3443051057471045e-06,
+ "loss": 0.5358,
+ "step": 3070
+ },
+ {
+ "epoch": 1.4520094562647754,
+ "grad_norm": 2.6080820560455322,
+ "learning_rate": 4.343883899886727e-06,
+ "loss": 0.5361,
+ "step": 3071
+ },
+ {
+ "epoch": 1.452482269503546,
+ "grad_norm": 2.4605307579040527,
+ "learning_rate": 4.343462579215783e-06,
+ "loss": 0.4941,
+ "step": 3072
+ },
+ {
+ "epoch": 1.4529550827423168,
+ "grad_norm": 2.8025355339050293,
+ "learning_rate": 4.343041143760509e-06,
+ "loss": 0.5116,
+ "step": 3073
+ },
+ {
+ "epoch": 1.4534278959810876,
+ "grad_norm": 2.432515859603882,
+ "learning_rate": 4.3426195935471434e-06,
+ "loss": 0.4991,
+ "step": 3074
+ },
+ {
+ "epoch": 1.4539007092198581,
+ "grad_norm": 2.5838661193847656,
+ "learning_rate": 4.342197928601935e-06,
+ "loss": 0.4994,
+ "step": 3075
+ },
+ {
+ "epoch": 1.4543735224586287,
+ "grad_norm": 2.421692371368408,
+ "learning_rate": 4.341776148951141e-06,
+ "loss": 0.4945,
+ "step": 3076
+ },
+ {
+ "epoch": 1.4548463356973995,
+ "grad_norm": 2.5354676246643066,
+ "learning_rate": 4.341354254621021e-06,
+ "loss": 0.4859,
+ "step": 3077
+ },
+ {
+ "epoch": 1.4553191489361703,
+ "grad_norm": 2.7316789627075195,
+ "learning_rate": 4.340932245637846e-06,
+ "loss": 0.5136,
+ "step": 3078
+ },
+ {
+ "epoch": 1.455791962174941,
+ "grad_norm": 3.5903496742248535,
+ "learning_rate": 4.340510122027891e-06,
+ "loss": 0.6451,
+ "step": 3079
+ },
+ {
+ "epoch": 1.4562647754137115,
+ "grad_norm": 2.95190167427063,
+ "learning_rate": 4.340087883817442e-06,
+ "loss": 0.6354,
+ "step": 3080
+ },
+ {
+ "epoch": 1.4567375886524823,
+ "grad_norm": 2.8659214973449707,
+ "learning_rate": 4.339665531032789e-06,
+ "loss": 0.5514,
+ "step": 3081
+ },
+ {
+ "epoch": 1.457210401891253,
+ "grad_norm": 2.5681674480438232,
+ "learning_rate": 4.339243063700231e-06,
+ "loss": 0.5135,
+ "step": 3082
+ },
+ {
+ "epoch": 1.4576832151300236,
+ "grad_norm": 2.7353906631469727,
+ "learning_rate": 4.338820481846072e-06,
+ "loss": 0.4608,
+ "step": 3083
+ },
+ {
+ "epoch": 1.4581560283687942,
+ "grad_norm": 2.6116466522216797,
+ "learning_rate": 4.3383977854966245e-06,
+ "loss": 0.4924,
+ "step": 3084
+ },
+ {
+ "epoch": 1.458628841607565,
+ "grad_norm": 2.6676487922668457,
+ "learning_rate": 4.337974974678207e-06,
+ "loss": 0.5747,
+ "step": 3085
+ },
+ {
+ "epoch": 1.4591016548463358,
+ "grad_norm": 2.909031629562378,
+ "learning_rate": 4.337552049417147e-06,
+ "loss": 0.4618,
+ "step": 3086
+ },
+ {
+ "epoch": 1.4595744680851064,
+ "grad_norm": 2.7614190578460693,
+ "learning_rate": 4.33712900973978e-06,
+ "loss": 0.5154,
+ "step": 3087
+ },
+ {
+ "epoch": 1.460047281323877,
+ "grad_norm": 2.452188014984131,
+ "learning_rate": 4.336705855672444e-06,
+ "loss": 0.542,
+ "step": 3088
+ },
+ {
+ "epoch": 1.4605200945626478,
+ "grad_norm": 3.0004117488861084,
+ "learning_rate": 4.336282587241488e-06,
+ "loss": 0.5857,
+ "step": 3089
+ },
+ {
+ "epoch": 1.4609929078014185,
+ "grad_norm": 2.870783567428589,
+ "learning_rate": 4.335859204473268e-06,
+ "loss": 0.5506,
+ "step": 3090
+ },
+ {
+ "epoch": 1.4614657210401891,
+ "grad_norm": 3.1078689098358154,
+ "learning_rate": 4.335435707394145e-06,
+ "loss": 0.5138,
+ "step": 3091
+ },
+ {
+ "epoch": 1.4619385342789597,
+ "grad_norm": 2.8516197204589844,
+ "learning_rate": 4.335012096030488e-06,
+ "loss": 0.5842,
+ "step": 3092
+ },
+ {
+ "epoch": 1.4624113475177305,
+ "grad_norm": 2.615922212600708,
+ "learning_rate": 4.334588370408675e-06,
+ "loss": 0.4896,
+ "step": 3093
+ },
+ {
+ "epoch": 1.4628841607565013,
+ "grad_norm": 3.1911802291870117,
+ "learning_rate": 4.334164530555088e-06,
+ "loss": 0.4974,
+ "step": 3094
+ },
+ {
+ "epoch": 1.4633569739952719,
+ "grad_norm": 3.075051784515381,
+ "learning_rate": 4.3337405764961186e-06,
+ "loss": 0.567,
+ "step": 3095
+ },
+ {
+ "epoch": 1.4638297872340424,
+ "grad_norm": 2.550625801086426,
+ "learning_rate": 4.333316508258163e-06,
+ "loss": 0.4887,
+ "step": 3096
+ },
+ {
+ "epoch": 1.4643026004728132,
+ "grad_norm": 2.3986475467681885,
+ "learning_rate": 4.332892325867629e-06,
+ "loss": 0.5047,
+ "step": 3097
+ },
+ {
+ "epoch": 1.464775413711584,
+ "grad_norm": 2.5045125484466553,
+ "learning_rate": 4.332468029350926e-06,
+ "loss": 0.4721,
+ "step": 3098
+ },
+ {
+ "epoch": 1.4652482269503546,
+ "grad_norm": 2.347365617752075,
+ "learning_rate": 4.332043618734474e-06,
+ "loss": 0.4913,
+ "step": 3099
+ },
+ {
+ "epoch": 1.4657210401891252,
+ "grad_norm": 2.459928512573242,
+ "learning_rate": 4.331619094044699e-06,
+ "loss": 0.523,
+ "step": 3100
+ },
+ {
+ "epoch": 1.466193853427896,
+ "grad_norm": 2.5771310329437256,
+ "learning_rate": 4.331194455308035e-06,
+ "loss": 0.593,
+ "step": 3101
+ },
+ {
+ "epoch": 1.4666666666666668,
+ "grad_norm": 3.1351823806762695,
+ "learning_rate": 4.330769702550921e-06,
+ "loss": 0.5852,
+ "step": 3102
+ },
+ {
+ "epoch": 1.4671394799054374,
+ "grad_norm": 2.589817523956299,
+ "learning_rate": 4.330344835799806e-06,
+ "loss": 0.508,
+ "step": 3103
+ },
+ {
+ "epoch": 1.467612293144208,
+ "grad_norm": 3.1140341758728027,
+ "learning_rate": 4.329919855081144e-06,
+ "loss": 0.469,
+ "step": 3104
+ },
+ {
+ "epoch": 1.4680851063829787,
+ "grad_norm": 2.8186635971069336,
+ "learning_rate": 4.329494760421396e-06,
+ "loss": 0.5088,
+ "step": 3105
+ },
+ {
+ "epoch": 1.4685579196217495,
+ "grad_norm": 2.676077365875244,
+ "learning_rate": 4.329069551847031e-06,
+ "loss": 0.52,
+ "step": 3106
+ },
+ {
+ "epoch": 1.46903073286052,
+ "grad_norm": 2.5543313026428223,
+ "learning_rate": 4.328644229384526e-06,
+ "loss": 0.5066,
+ "step": 3107
+ },
+ {
+ "epoch": 1.4695035460992907,
+ "grad_norm": 2.8176217079162598,
+ "learning_rate": 4.328218793060362e-06,
+ "loss": 0.6404,
+ "step": 3108
+ },
+ {
+ "epoch": 1.4699763593380615,
+ "grad_norm": 2.485217332839966,
+ "learning_rate": 4.3277932429010314e-06,
+ "loss": 0.4578,
+ "step": 3109
+ },
+ {
+ "epoch": 1.4704491725768323,
+ "grad_norm": 2.6741621494293213,
+ "learning_rate": 4.327367578933031e-06,
+ "loss": 0.5068,
+ "step": 3110
+ },
+ {
+ "epoch": 1.4709219858156029,
+ "grad_norm": 2.377242088317871,
+ "learning_rate": 4.326941801182863e-06,
+ "loss": 0.5249,
+ "step": 3111
+ },
+ {
+ "epoch": 1.4713947990543734,
+ "grad_norm": 2.790046215057373,
+ "learning_rate": 4.32651590967704e-06,
+ "loss": 0.5532,
+ "step": 3112
+ },
+ {
+ "epoch": 1.4718676122931442,
+ "grad_norm": 2.78019642829895,
+ "learning_rate": 4.326089904442081e-06,
+ "loss": 0.5362,
+ "step": 3113
+ },
+ {
+ "epoch": 1.472340425531915,
+ "grad_norm": 2.5661380290985107,
+ "learning_rate": 4.32566378550451e-06,
+ "loss": 0.5041,
+ "step": 3114
+ },
+ {
+ "epoch": 1.4728132387706856,
+ "grad_norm": 2.522153615951538,
+ "learning_rate": 4.3252375528908605e-06,
+ "loss": 0.5074,
+ "step": 3115
+ },
+ {
+ "epoch": 1.4732860520094562,
+ "grad_norm": 2.874688148498535,
+ "learning_rate": 4.3248112066276725e-06,
+ "loss": 0.59,
+ "step": 3116
+ },
+ {
+ "epoch": 1.473758865248227,
+ "grad_norm": 3.067866802215576,
+ "learning_rate": 4.324384746741492e-06,
+ "loss": 0.5924,
+ "step": 3117
+ },
+ {
+ "epoch": 1.4742316784869978,
+ "grad_norm": 3.359463930130005,
+ "learning_rate": 4.323958173258873e-06,
+ "loss": 0.6346,
+ "step": 3118
+ },
+ {
+ "epoch": 1.4747044917257683,
+ "grad_norm": 2.193024158477783,
+ "learning_rate": 4.323531486206376e-06,
+ "loss": 0.4594,
+ "step": 3119
+ },
+ {
+ "epoch": 1.475177304964539,
+ "grad_norm": 2.886889934539795,
+ "learning_rate": 4.323104685610569e-06,
+ "loss": 0.523,
+ "step": 3120
+ },
+ {
+ "epoch": 1.4756501182033097,
+ "grad_norm": 2.7558681964874268,
+ "learning_rate": 4.322677771498028e-06,
+ "loss": 0.5387,
+ "step": 3121
+ },
+ {
+ "epoch": 1.4761229314420805,
+ "grad_norm": 2.639277935028076,
+ "learning_rate": 4.322250743895335e-06,
+ "loss": 0.5599,
+ "step": 3122
+ },
+ {
+ "epoch": 1.476595744680851,
+ "grad_norm": 2.786198616027832,
+ "learning_rate": 4.321823602829078e-06,
+ "loss": 0.5405,
+ "step": 3123
+ },
+ {
+ "epoch": 1.4770685579196217,
+ "grad_norm": 2.582315683364868,
+ "learning_rate": 4.321396348325853e-06,
+ "loss": 0.4452,
+ "step": 3124
+ },
+ {
+ "epoch": 1.4775413711583925,
+ "grad_norm": 2.8574297428131104,
+ "learning_rate": 4.320968980412265e-06,
+ "loss": 0.4846,
+ "step": 3125
+ },
+ {
+ "epoch": 1.4780141843971633,
+ "grad_norm": 2.705281972885132,
+ "learning_rate": 4.320541499114922e-06,
+ "loss": 0.5548,
+ "step": 3126
+ },
+ {
+ "epoch": 1.4784869976359338,
+ "grad_norm": 2.3152754306793213,
+ "learning_rate": 4.320113904460444e-06,
+ "loss": 0.5216,
+ "step": 3127
+ },
+ {
+ "epoch": 1.4789598108747044,
+ "grad_norm": 3.230764150619507,
+ "learning_rate": 4.319686196475453e-06,
+ "loss": 0.6192,
+ "step": 3128
+ },
+ {
+ "epoch": 1.4794326241134752,
+ "grad_norm": 2.463380813598633,
+ "learning_rate": 4.319258375186583e-06,
+ "loss": 0.4872,
+ "step": 3129
+ },
+ {
+ "epoch": 1.479905437352246,
+ "grad_norm": 2.8477656841278076,
+ "learning_rate": 4.31883044062047e-06,
+ "loss": 0.5371,
+ "step": 3130
+ },
+ {
+ "epoch": 1.4803782505910166,
+ "grad_norm": 2.393911123275757,
+ "learning_rate": 4.318402392803762e-06,
+ "loss": 0.5334,
+ "step": 3131
+ },
+ {
+ "epoch": 1.4808510638297872,
+ "grad_norm": 2.6113736629486084,
+ "learning_rate": 4.317974231763109e-06,
+ "loss": 0.5572,
+ "step": 3132
+ },
+ {
+ "epoch": 1.481323877068558,
+ "grad_norm": 2.3941731452941895,
+ "learning_rate": 4.317545957525173e-06,
+ "loss": 0.4849,
+ "step": 3133
+ },
+ {
+ "epoch": 1.4817966903073285,
+ "grad_norm": 2.9536755084991455,
+ "learning_rate": 4.317117570116619e-06,
+ "loss": 0.6058,
+ "step": 3134
+ },
+ {
+ "epoch": 1.4822695035460993,
+ "grad_norm": 2.595754623413086,
+ "learning_rate": 4.316689069564123e-06,
+ "loss": 0.5193,
+ "step": 3135
+ },
+ {
+ "epoch": 1.48274231678487,
+ "grad_norm": 2.569833993911743,
+ "learning_rate": 4.316260455894364e-06,
+ "loss": 0.543,
+ "step": 3136
+ },
+ {
+ "epoch": 1.4832151300236407,
+ "grad_norm": 2.5137455463409424,
+ "learning_rate": 4.315831729134031e-06,
+ "loss": 0.5415,
+ "step": 3137
+ },
+ {
+ "epoch": 1.4836879432624113,
+ "grad_norm": 2.5582292079925537,
+ "learning_rate": 4.3154028893098176e-06,
+ "loss": 0.5338,
+ "step": 3138
+ },
+ {
+ "epoch": 1.484160756501182,
+ "grad_norm": 2.666426181793213,
+ "learning_rate": 4.3149739364484265e-06,
+ "loss": 0.5435,
+ "step": 3139
+ },
+ {
+ "epoch": 1.4846335697399526,
+ "grad_norm": 2.790851354598999,
+ "learning_rate": 4.314544870576568e-06,
+ "loss": 0.5746,
+ "step": 3140
+ },
+ {
+ "epoch": 1.4851063829787234,
+ "grad_norm": 2.620326042175293,
+ "learning_rate": 4.314115691720956e-06,
+ "loss": 0.5076,
+ "step": 3141
+ },
+ {
+ "epoch": 1.485579196217494,
+ "grad_norm": 3.075674533843994,
+ "learning_rate": 4.313686399908314e-06,
+ "loss": 0.5486,
+ "step": 3142
+ },
+ {
+ "epoch": 1.4860520094562648,
+ "grad_norm": 3.1347315311431885,
+ "learning_rate": 4.3132569951653745e-06,
+ "loss": 0.531,
+ "step": 3143
+ },
+ {
+ "epoch": 1.4865248226950354,
+ "grad_norm": 2.5783653259277344,
+ "learning_rate": 4.312827477518871e-06,
+ "loss": 0.5818,
+ "step": 3144
+ },
+ {
+ "epoch": 1.4869976359338062,
+ "grad_norm": 3.0247137546539307,
+ "learning_rate": 4.3123978469955505e-06,
+ "loss": 0.5347,
+ "step": 3145
+ },
+ {
+ "epoch": 1.4874704491725768,
+ "grad_norm": 2.4789345264434814,
+ "learning_rate": 4.311968103622163e-06,
+ "loss": 0.5,
+ "step": 3146
+ },
+ {
+ "epoch": 1.4879432624113476,
+ "grad_norm": 2.663341522216797,
+ "learning_rate": 4.311538247425466e-06,
+ "loss": 0.4825,
+ "step": 3147
+ },
+ {
+ "epoch": 1.4884160756501181,
+ "grad_norm": 2.633711099624634,
+ "learning_rate": 4.311108278432226e-06,
+ "loss": 0.5244,
+ "step": 3148
+ },
+ {
+ "epoch": 1.488888888888889,
+ "grad_norm": 2.51312518119812,
+ "learning_rate": 4.310678196669216e-06,
+ "loss": 0.513,
+ "step": 3149
+ },
+ {
+ "epoch": 1.4893617021276595,
+ "grad_norm": 2.5263755321502686,
+ "learning_rate": 4.310248002163214e-06,
+ "loss": 0.5236,
+ "step": 3150
+ },
+ {
+ "epoch": 1.4898345153664303,
+ "grad_norm": 2.559216260910034,
+ "learning_rate": 4.309817694941007e-06,
+ "loss": 0.5107,
+ "step": 3151
+ },
+ {
+ "epoch": 1.4903073286052009,
+ "grad_norm": 2.5023303031921387,
+ "learning_rate": 4.309387275029386e-06,
+ "loss": 0.4685,
+ "step": 3152
+ },
+ {
+ "epoch": 1.4907801418439717,
+ "grad_norm": 3.0314254760742188,
+ "learning_rate": 4.308956742455155e-06,
+ "loss": 0.5462,
+ "step": 3153
+ },
+ {
+ "epoch": 1.4912529550827422,
+ "grad_norm": 2.675295114517212,
+ "learning_rate": 4.308526097245119e-06,
+ "loss": 0.5398,
+ "step": 3154
+ },
+ {
+ "epoch": 1.491725768321513,
+ "grad_norm": 2.6613399982452393,
+ "learning_rate": 4.308095339426094e-06,
+ "loss": 0.5376,
+ "step": 3155
+ },
+ {
+ "epoch": 1.4921985815602836,
+ "grad_norm": 2.58937668800354,
+ "learning_rate": 4.307664469024899e-06,
+ "loss": 0.5385,
+ "step": 3156
+ },
+ {
+ "epoch": 1.4926713947990544,
+ "grad_norm": 2.583631992340088,
+ "learning_rate": 4.3072334860683655e-06,
+ "loss": 0.4927,
+ "step": 3157
+ },
+ {
+ "epoch": 1.493144208037825,
+ "grad_norm": 2.5889222621917725,
+ "learning_rate": 4.306802390583327e-06,
+ "loss": 0.47,
+ "step": 3158
+ },
+ {
+ "epoch": 1.4936170212765958,
+ "grad_norm": 2.9362716674804688,
+ "learning_rate": 4.3063711825966244e-06,
+ "loss": 0.4902,
+ "step": 3159
+ },
+ {
+ "epoch": 1.4940898345153664,
+ "grad_norm": 2.5385425090789795,
+ "learning_rate": 4.305939862135111e-06,
+ "loss": 0.5396,
+ "step": 3160
+ },
+ {
+ "epoch": 1.4945626477541372,
+ "grad_norm": 2.776326894760132,
+ "learning_rate": 4.305508429225641e-06,
+ "loss": 0.5169,
+ "step": 3161
+ },
+ {
+ "epoch": 1.4950354609929077,
+ "grad_norm": 2.575063467025757,
+ "learning_rate": 4.305076883895076e-06,
+ "loss": 0.4938,
+ "step": 3162
+ },
+ {
+ "epoch": 1.4955082742316785,
+ "grad_norm": 2.7552313804626465,
+ "learning_rate": 4.304645226170291e-06,
+ "loss": 0.6211,
+ "step": 3163
+ },
+ {
+ "epoch": 1.4959810874704491,
+ "grad_norm": 2.57149338722229,
+ "learning_rate": 4.30421345607816e-06,
+ "loss": 0.5241,
+ "step": 3164
+ },
+ {
+ "epoch": 1.49645390070922,
+ "grad_norm": 2.8142426013946533,
+ "learning_rate": 4.303781573645568e-06,
+ "loss": 0.5699,
+ "step": 3165
+ },
+ {
+ "epoch": 1.4969267139479905,
+ "grad_norm": 2.6344845294952393,
+ "learning_rate": 4.303349578899407e-06,
+ "loss": 0.5049,
+ "step": 3166
+ },
+ {
+ "epoch": 1.4973995271867613,
+ "grad_norm": 2.554410934448242,
+ "learning_rate": 4.302917471866575e-06,
+ "loss": 0.4404,
+ "step": 3167
+ },
+ {
+ "epoch": 1.4978723404255319,
+ "grad_norm": 2.896240711212158,
+ "learning_rate": 4.302485252573978e-06,
+ "loss": 0.602,
+ "step": 3168
+ },
+ {
+ "epoch": 1.4983451536643027,
+ "grad_norm": 2.4044477939605713,
+ "learning_rate": 4.302052921048527e-06,
+ "loss": 0.4857,
+ "step": 3169
+ },
+ {
+ "epoch": 1.4988179669030732,
+ "grad_norm": 2.7447879314422607,
+ "learning_rate": 4.301620477317144e-06,
+ "loss": 0.5438,
+ "step": 3170
+ },
+ {
+ "epoch": 1.499290780141844,
+ "grad_norm": 2.851820945739746,
+ "learning_rate": 4.301187921406752e-06,
+ "loss": 0.5245,
+ "step": 3171
+ },
+ {
+ "epoch": 1.4997635933806146,
+ "grad_norm": 3.247114419937134,
+ "learning_rate": 4.300755253344287e-06,
+ "loss": 0.504,
+ "step": 3172
+ },
+ {
+ "epoch": 1.5002364066193854,
+ "grad_norm": 3.117490291595459,
+ "learning_rate": 4.300322473156688e-06,
+ "loss": 0.4627,
+ "step": 3173
+ },
+ {
+ "epoch": 1.500709219858156,
+ "grad_norm": 2.558319330215454,
+ "learning_rate": 4.299889580870904e-06,
+ "loss": 0.5721,
+ "step": 3174
+ },
+ {
+ "epoch": 1.5011820330969265,
+ "grad_norm": 2.8983113765716553,
+ "learning_rate": 4.2994565765138865e-06,
+ "loss": 0.5257,
+ "step": 3175
+ },
+ {
+ "epoch": 1.5016548463356973,
+ "grad_norm": 2.744056463241577,
+ "learning_rate": 4.299023460112599e-06,
+ "loss": 0.4892,
+ "step": 3176
+ },
+ {
+ "epoch": 1.5021276595744681,
+ "grad_norm": 2.5506751537323,
+ "learning_rate": 4.29859023169401e-06,
+ "loss": 0.4933,
+ "step": 3177
+ },
+ {
+ "epoch": 1.5026004728132387,
+ "grad_norm": 2.842615842819214,
+ "learning_rate": 4.298156891285092e-06,
+ "loss": 0.6124,
+ "step": 3178
+ },
+ {
+ "epoch": 1.5030732860520093,
+ "grad_norm": 2.5355329513549805,
+ "learning_rate": 4.2977234389128305e-06,
+ "loss": 0.641,
+ "step": 3179
+ },
+ {
+ "epoch": 1.50354609929078,
+ "grad_norm": 2.674781084060669,
+ "learning_rate": 4.297289874604213e-06,
+ "loss": 0.475,
+ "step": 3180
+ },
+ {
+ "epoch": 1.5040189125295509,
+ "grad_norm": 2.6845548152923584,
+ "learning_rate": 4.296856198386235e-06,
+ "loss": 0.5328,
+ "step": 3181
+ },
+ {
+ "epoch": 1.5044917257683215,
+ "grad_norm": 2.9686241149902344,
+ "learning_rate": 4.296422410285902e-06,
+ "loss": 0.6216,
+ "step": 3182
+ },
+ {
+ "epoch": 1.504964539007092,
+ "grad_norm": 2.5095980167388916,
+ "learning_rate": 4.295988510330222e-06,
+ "loss": 0.4993,
+ "step": 3183
+ },
+ {
+ "epoch": 1.5054373522458628,
+ "grad_norm": 2.4906392097473145,
+ "learning_rate": 4.2955544985462125e-06,
+ "loss": 0.4795,
+ "step": 3184
+ },
+ {
+ "epoch": 1.5059101654846336,
+ "grad_norm": 2.5593366622924805,
+ "learning_rate": 4.295120374960897e-06,
+ "loss": 0.5527,
+ "step": 3185
+ },
+ {
+ "epoch": 1.5063829787234042,
+ "grad_norm": 2.691495180130005,
+ "learning_rate": 4.294686139601308e-06,
+ "loss": 0.5646,
+ "step": 3186
+ },
+ {
+ "epoch": 1.5068557919621748,
+ "grad_norm": 2.74320387840271,
+ "learning_rate": 4.294251792494483e-06,
+ "loss": 0.6149,
+ "step": 3187
+ },
+ {
+ "epoch": 1.5073286052009456,
+ "grad_norm": 2.8827052116394043,
+ "learning_rate": 4.293817333667465e-06,
+ "loss": 0.5414,
+ "step": 3188
+ },
+ {
+ "epoch": 1.5078014184397164,
+ "grad_norm": 2.5652425289154053,
+ "learning_rate": 4.293382763147308e-06,
+ "loss": 0.5006,
+ "step": 3189
+ },
+ {
+ "epoch": 1.508274231678487,
+ "grad_norm": 2.729295253753662,
+ "learning_rate": 4.29294808096107e-06,
+ "loss": 0.522,
+ "step": 3190
+ },
+ {
+ "epoch": 1.5087470449172575,
+ "grad_norm": 2.348118305206299,
+ "learning_rate": 4.292513287135817e-06,
+ "loss": 0.4125,
+ "step": 3191
+ },
+ {
+ "epoch": 1.5092198581560283,
+ "grad_norm": 2.809551954269409,
+ "learning_rate": 4.292078381698621e-06,
+ "loss": 0.5577,
+ "step": 3192
+ },
+ {
+ "epoch": 1.5096926713947991,
+ "grad_norm": 2.6925361156463623,
+ "learning_rate": 4.291643364676563e-06,
+ "loss": 0.62,
+ "step": 3193
+ },
+ {
+ "epoch": 1.5101654846335697,
+ "grad_norm": 2.4200620651245117,
+ "learning_rate": 4.291208236096729e-06,
+ "loss": 0.5464,
+ "step": 3194
+ },
+ {
+ "epoch": 1.5106382978723403,
+ "grad_norm": 2.5659191608428955,
+ "learning_rate": 4.290772995986211e-06,
+ "loss": 0.5402,
+ "step": 3195
+ },
+ {
+ "epoch": 1.511111111111111,
+ "grad_norm": 2.3877315521240234,
+ "learning_rate": 4.290337644372113e-06,
+ "loss": 0.463,
+ "step": 3196
+ },
+ {
+ "epoch": 1.5115839243498819,
+ "grad_norm": 2.7063233852386475,
+ "learning_rate": 4.289902181281538e-06,
+ "loss": 0.5253,
+ "step": 3197
+ },
+ {
+ "epoch": 1.5120567375886524,
+ "grad_norm": 2.56788969039917,
+ "learning_rate": 4.289466606741603e-06,
+ "loss": 0.5012,
+ "step": 3198
+ },
+ {
+ "epoch": 1.512529550827423,
+ "grad_norm": 2.637164831161499,
+ "learning_rate": 4.28903092077943e-06,
+ "loss": 0.5236,
+ "step": 3199
+ },
+ {
+ "epoch": 1.5130023640661938,
+ "grad_norm": 2.767526865005493,
+ "learning_rate": 4.288595123422146e-06,
+ "loss": 0.5832,
+ "step": 3200
+ },
+ {
+ "epoch": 1.5134751773049646,
+ "grad_norm": 2.33365535736084,
+ "learning_rate": 4.2881592146968866e-06,
+ "loss": 0.4548,
+ "step": 3201
+ },
+ {
+ "epoch": 1.5139479905437352,
+ "grad_norm": 2.544189453125,
+ "learning_rate": 4.287723194630793e-06,
+ "loss": 0.5115,
+ "step": 3202
+ },
+ {
+ "epoch": 1.5144208037825058,
+ "grad_norm": 2.588793992996216,
+ "learning_rate": 4.2872870632510155e-06,
+ "loss": 0.4766,
+ "step": 3203
+ },
+ {
+ "epoch": 1.5148936170212766,
+ "grad_norm": 2.5382184982299805,
+ "learning_rate": 4.286850820584709e-06,
+ "loss": 0.5401,
+ "step": 3204
+ },
+ {
+ "epoch": 1.5153664302600474,
+ "grad_norm": 2.597930669784546,
+ "learning_rate": 4.286414466659038e-06,
+ "loss": 0.5346,
+ "step": 3205
+ },
+ {
+ "epoch": 1.515839243498818,
+ "grad_norm": 2.8522393703460693,
+ "learning_rate": 4.28597800150117e-06,
+ "loss": 0.486,
+ "step": 3206
+ },
+ {
+ "epoch": 1.5163120567375885,
+ "grad_norm": 2.4801454544067383,
+ "learning_rate": 4.285541425138285e-06,
+ "loss": 0.5162,
+ "step": 3207
+ },
+ {
+ "epoch": 1.5167848699763593,
+ "grad_norm": 2.353665351867676,
+ "learning_rate": 4.285104737597563e-06,
+ "loss": 0.5066,
+ "step": 3208
+ },
+ {
+ "epoch": 1.51725768321513,
+ "grad_norm": 2.767976760864258,
+ "learning_rate": 4.2846679389061975e-06,
+ "loss": 0.5331,
+ "step": 3209
+ },
+ {
+ "epoch": 1.5177304964539007,
+ "grad_norm": 2.9307682514190674,
+ "learning_rate": 4.284231029091385e-06,
+ "loss": 0.5291,
+ "step": 3210
+ },
+ {
+ "epoch": 1.5182033096926713,
+ "grad_norm": 2.39719820022583,
+ "learning_rate": 4.283794008180329e-06,
+ "loss": 0.4759,
+ "step": 3211
+ },
+ {
+ "epoch": 1.518676122931442,
+ "grad_norm": 2.452244758605957,
+ "learning_rate": 4.283356876200242e-06,
+ "loss": 0.4283,
+ "step": 3212
+ },
+ {
+ "epoch": 1.5191489361702128,
+ "grad_norm": 2.4911608695983887,
+ "learning_rate": 4.282919633178343e-06,
+ "loss": 0.4812,
+ "step": 3213
+ },
+ {
+ "epoch": 1.5196217494089834,
+ "grad_norm": 2.5813944339752197,
+ "learning_rate": 4.282482279141856e-06,
+ "loss": 0.4911,
+ "step": 3214
+ },
+ {
+ "epoch": 1.520094562647754,
+ "grad_norm": 2.503542184829712,
+ "learning_rate": 4.282044814118013e-06,
+ "loss": 0.4969,
+ "step": 3215
+ },
+ {
+ "epoch": 1.5205673758865248,
+ "grad_norm": 2.5090713500976562,
+ "learning_rate": 4.281607238134053e-06,
+ "loss": 0.5293,
+ "step": 3216
+ },
+ {
+ "epoch": 1.5210401891252956,
+ "grad_norm": 2.425994396209717,
+ "learning_rate": 4.281169551217223e-06,
+ "loss": 0.5365,
+ "step": 3217
+ },
+ {
+ "epoch": 1.5215130023640662,
+ "grad_norm": 2.637655258178711,
+ "learning_rate": 4.2807317533947765e-06,
+ "loss": 0.5589,
+ "step": 3218
+ },
+ {
+ "epoch": 1.5219858156028367,
+ "grad_norm": 2.9335296154022217,
+ "learning_rate": 4.28029384469397e-06,
+ "loss": 0.6071,
+ "step": 3219
+ },
+ {
+ "epoch": 1.5224586288416075,
+ "grad_norm": 2.898683547973633,
+ "learning_rate": 4.279855825142073e-06,
+ "loss": 0.5392,
+ "step": 3220
+ },
+ {
+ "epoch": 1.5229314420803783,
+ "grad_norm": 2.613914966583252,
+ "learning_rate": 4.279417694766359e-06,
+ "loss": 0.4968,
+ "step": 3221
+ },
+ {
+ "epoch": 1.523404255319149,
+ "grad_norm": 2.500682830810547,
+ "learning_rate": 4.278979453594106e-06,
+ "loss": 0.471,
+ "step": 3222
+ },
+ {
+ "epoch": 1.5238770685579195,
+ "grad_norm": 2.5269598960876465,
+ "learning_rate": 4.278541101652605e-06,
+ "loss": 0.471,
+ "step": 3223
+ },
+ {
+ "epoch": 1.5243498817966903,
+ "grad_norm": 2.8153114318847656,
+ "learning_rate": 4.2781026389691465e-06,
+ "loss": 0.5742,
+ "step": 3224
+ },
+ {
+ "epoch": 1.524822695035461,
+ "grad_norm": 2.5648019313812256,
+ "learning_rate": 4.277664065571034e-06,
+ "loss": 0.5315,
+ "step": 3225
+ },
+ {
+ "epoch": 1.5252955082742317,
+ "grad_norm": 2.778355836868286,
+ "learning_rate": 4.277225381485575e-06,
+ "loss": 0.5543,
+ "step": 3226
+ },
+ {
+ "epoch": 1.5257683215130022,
+ "grad_norm": 2.6736745834350586,
+ "learning_rate": 4.2767865867400846e-06,
+ "loss": 0.4947,
+ "step": 3227
+ },
+ {
+ "epoch": 1.526241134751773,
+ "grad_norm": 2.9560294151306152,
+ "learning_rate": 4.276347681361884e-06,
+ "loss": 0.5835,
+ "step": 3228
+ },
+ {
+ "epoch": 1.5267139479905438,
+ "grad_norm": 2.5580296516418457,
+ "learning_rate": 4.275908665378302e-06,
+ "loss": 0.4751,
+ "step": 3229
+ },
+ {
+ "epoch": 1.5271867612293144,
+ "grad_norm": 3.0705175399780273,
+ "learning_rate": 4.2754695388166755e-06,
+ "loss": 0.5327,
+ "step": 3230
+ },
+ {
+ "epoch": 1.527659574468085,
+ "grad_norm": 2.664652109146118,
+ "learning_rate": 4.275030301704346e-06,
+ "loss": 0.4934,
+ "step": 3231
+ },
+ {
+ "epoch": 1.5281323877068558,
+ "grad_norm": 2.308499813079834,
+ "learning_rate": 4.274590954068663e-06,
+ "loss": 0.4412,
+ "step": 3232
+ },
+ {
+ "epoch": 1.5286052009456266,
+ "grad_norm": 2.871189594268799,
+ "learning_rate": 4.2741514959369815e-06,
+ "loss": 0.5001,
+ "step": 3233
+ },
+ {
+ "epoch": 1.5290780141843971,
+ "grad_norm": 2.5274453163146973,
+ "learning_rate": 4.273711927336666e-06,
+ "loss": 0.4938,
+ "step": 3234
+ },
+ {
+ "epoch": 1.5295508274231677,
+ "grad_norm": 2.8848133087158203,
+ "learning_rate": 4.273272248295087e-06,
+ "loss": 0.5397,
+ "step": 3235
+ },
+ {
+ "epoch": 1.5300236406619385,
+ "grad_norm": 2.3927090167999268,
+ "learning_rate": 4.27283245883962e-06,
+ "loss": 0.5497,
+ "step": 3236
+ },
+ {
+ "epoch": 1.5304964539007093,
+ "grad_norm": 2.5413873195648193,
+ "learning_rate": 4.27239255899765e-06,
+ "loss": 0.5108,
+ "step": 3237
+ },
+ {
+ "epoch": 1.53096926713948,
+ "grad_norm": 2.7692389488220215,
+ "learning_rate": 4.271952548796567e-06,
+ "loss": 0.5768,
+ "step": 3238
+ },
+ {
+ "epoch": 1.5314420803782505,
+ "grad_norm": 2.4621126651763916,
+ "learning_rate": 4.271512428263768e-06,
+ "loss": 0.4698,
+ "step": 3239
+ },
+ {
+ "epoch": 1.5319148936170213,
+ "grad_norm": 2.6423375606536865,
+ "learning_rate": 4.271072197426659e-06,
+ "loss": 0.4929,
+ "step": 3240
+ },
+ {
+ "epoch": 1.532387706855792,
+ "grad_norm": 2.7097692489624023,
+ "learning_rate": 4.270631856312649e-06,
+ "loss": 0.4836,
+ "step": 3241
+ },
+ {
+ "epoch": 1.5328605200945626,
+ "grad_norm": 2.545706272125244,
+ "learning_rate": 4.270191404949158e-06,
+ "loss": 0.4636,
+ "step": 3242
+ },
+ {
+ "epoch": 1.5333333333333332,
+ "grad_norm": 3.138781785964966,
+ "learning_rate": 4.26975084336361e-06,
+ "loss": 0.5988,
+ "step": 3243
+ },
+ {
+ "epoch": 1.533806146572104,
+ "grad_norm": 2.492715835571289,
+ "learning_rate": 4.269310171583438e-06,
+ "loss": 0.5095,
+ "step": 3244
+ },
+ {
+ "epoch": 1.5342789598108748,
+ "grad_norm": 2.5705838203430176,
+ "learning_rate": 4.268869389636077e-06,
+ "loss": 0.4818,
+ "step": 3245
+ },
+ {
+ "epoch": 1.5347517730496454,
+ "grad_norm": 2.7633554935455322,
+ "learning_rate": 4.268428497548979e-06,
+ "loss": 0.547,
+ "step": 3246
+ },
+ {
+ "epoch": 1.535224586288416,
+ "grad_norm": 2.654528856277466,
+ "learning_rate": 4.2679874953495905e-06,
+ "loss": 0.5261,
+ "step": 3247
+ },
+ {
+ "epoch": 1.5356973995271868,
+ "grad_norm": 2.5039751529693604,
+ "learning_rate": 4.2675463830653744e-06,
+ "loss": 0.4941,
+ "step": 3248
+ },
+ {
+ "epoch": 1.5361702127659576,
+ "grad_norm": 2.897268295288086,
+ "learning_rate": 4.267105160723794e-06,
+ "loss": 0.5404,
+ "step": 3249
+ },
+ {
+ "epoch": 1.5366430260047281,
+ "grad_norm": 2.500732421875,
+ "learning_rate": 4.266663828352324e-06,
+ "loss": 0.5375,
+ "step": 3250
+ },
+ {
+ "epoch": 1.5371158392434987,
+ "grad_norm": 2.6310064792633057,
+ "learning_rate": 4.266222385978444e-06,
+ "loss": 0.5217,
+ "step": 3251
+ },
+ {
+ "epoch": 1.5375886524822695,
+ "grad_norm": 2.7440476417541504,
+ "learning_rate": 4.265780833629642e-06,
+ "loss": 0.5419,
+ "step": 3252
+ },
+ {
+ "epoch": 1.5380614657210403,
+ "grad_norm": 2.7037577629089355,
+ "learning_rate": 4.2653391713334095e-06,
+ "loss": 0.5634,
+ "step": 3253
+ },
+ {
+ "epoch": 1.5385342789598109,
+ "grad_norm": 2.548525810241699,
+ "learning_rate": 4.264897399117248e-06,
+ "loss": 0.535,
+ "step": 3254
+ },
+ {
+ "epoch": 1.5390070921985815,
+ "grad_norm": 2.6127355098724365,
+ "learning_rate": 4.264455517008663e-06,
+ "loss": 0.4619,
+ "step": 3255
+ },
+ {
+ "epoch": 1.5394799054373522,
+ "grad_norm": 2.5597004890441895,
+ "learning_rate": 4.264013525035171e-06,
+ "loss": 0.4477,
+ "step": 3256
+ },
+ {
+ "epoch": 1.539952718676123,
+ "grad_norm": 2.642432689666748,
+ "learning_rate": 4.263571423224292e-06,
+ "loss": 0.4749,
+ "step": 3257
+ },
+ {
+ "epoch": 1.5404255319148936,
+ "grad_norm": 2.5121877193450928,
+ "learning_rate": 4.2631292116035526e-06,
+ "loss": 0.4693,
+ "step": 3258
+ },
+ {
+ "epoch": 1.5408983451536642,
+ "grad_norm": 2.390292167663574,
+ "learning_rate": 4.262686890200489e-06,
+ "loss": 0.4872,
+ "step": 3259
+ },
+ {
+ "epoch": 1.541371158392435,
+ "grad_norm": 2.5898337364196777,
+ "learning_rate": 4.2622444590426405e-06,
+ "loss": 0.5193,
+ "step": 3260
+ },
+ {
+ "epoch": 1.5418439716312058,
+ "grad_norm": 2.508821487426758,
+ "learning_rate": 4.261801918157558e-06,
+ "loss": 0.511,
+ "step": 3261
+ },
+ {
+ "epoch": 1.5423167848699764,
+ "grad_norm": 2.6992101669311523,
+ "learning_rate": 4.261359267572795e-06,
+ "loss": 0.5069,
+ "step": 3262
+ },
+ {
+ "epoch": 1.542789598108747,
+ "grad_norm": 2.6011030673980713,
+ "learning_rate": 4.2609165073159145e-06,
+ "loss": 0.5887,
+ "step": 3263
+ },
+ {
+ "epoch": 1.5432624113475177,
+ "grad_norm": 2.887053966522217,
+ "learning_rate": 4.260473637414483e-06,
+ "loss": 0.5556,
+ "step": 3264
+ },
+ {
+ "epoch": 1.5437352245862885,
+ "grad_norm": 2.6433887481689453,
+ "learning_rate": 4.260030657896079e-06,
+ "loss": 0.4728,
+ "step": 3265
+ },
+ {
+ "epoch": 1.544208037825059,
+ "grad_norm": 2.6134607791900635,
+ "learning_rate": 4.259587568788282e-06,
+ "loss": 0.483,
+ "step": 3266
+ },
+ {
+ "epoch": 1.5446808510638297,
+ "grad_norm": 2.5308640003204346,
+ "learning_rate": 4.259144370118684e-06,
+ "loss": 0.5115,
+ "step": 3267
+ },
+ {
+ "epoch": 1.5451536643026005,
+ "grad_norm": 2.8256733417510986,
+ "learning_rate": 4.258701061914879e-06,
+ "loss": 0.5414,
+ "step": 3268
+ },
+ {
+ "epoch": 1.5456264775413713,
+ "grad_norm": 2.8648319244384766,
+ "learning_rate": 4.258257644204471e-06,
+ "loss": 0.5695,
+ "step": 3269
+ },
+ {
+ "epoch": 1.5460992907801419,
+ "grad_norm": 2.8568081855773926,
+ "learning_rate": 4.257814117015069e-06,
+ "loss": 0.5264,
+ "step": 3270
+ },
+ {
+ "epoch": 1.5465721040189124,
+ "grad_norm": 2.6065011024475098,
+ "learning_rate": 4.257370480374289e-06,
+ "loss": 0.5646,
+ "step": 3271
+ },
+ {
+ "epoch": 1.5470449172576832,
+ "grad_norm": 2.7840216159820557,
+ "learning_rate": 4.256926734309756e-06,
+ "loss": 0.5191,
+ "step": 3272
+ },
+ {
+ "epoch": 1.547517730496454,
+ "grad_norm": 2.85906982421875,
+ "learning_rate": 4.256482878849099e-06,
+ "loss": 0.5911,
+ "step": 3273
+ },
+ {
+ "epoch": 1.5479905437352246,
+ "grad_norm": 2.916029930114746,
+ "learning_rate": 4.256038914019954e-06,
+ "loss": 0.5589,
+ "step": 3274
+ },
+ {
+ "epoch": 1.5484633569739952,
+ "grad_norm": 2.6748716831207275,
+ "learning_rate": 4.255594839849967e-06,
+ "loss": 0.5323,
+ "step": 3275
+ },
+ {
+ "epoch": 1.548936170212766,
+ "grad_norm": 2.717212200164795,
+ "learning_rate": 4.255150656366787e-06,
+ "loss": 0.453,
+ "step": 3276
+ },
+ {
+ "epoch": 1.5494089834515368,
+ "grad_norm": 2.4974849224090576,
+ "learning_rate": 4.254706363598072e-06,
+ "loss": 0.4516,
+ "step": 3277
+ },
+ {
+ "epoch": 1.5498817966903073,
+ "grad_norm": 2.648151397705078,
+ "learning_rate": 4.254261961571485e-06,
+ "loss": 0.5452,
+ "step": 3278
+ },
+ {
+ "epoch": 1.550354609929078,
+ "grad_norm": 2.932905435562134,
+ "learning_rate": 4.253817450314699e-06,
+ "loss": 0.4813,
+ "step": 3279
+ },
+ {
+ "epoch": 1.5508274231678487,
+ "grad_norm": 2.862912178039551,
+ "learning_rate": 4.25337282985539e-06,
+ "loss": 0.5689,
+ "step": 3280
+ },
+ {
+ "epoch": 1.5513002364066195,
+ "grad_norm": 2.532156467437744,
+ "learning_rate": 4.2529281002212436e-06,
+ "loss": 0.485,
+ "step": 3281
+ },
+ {
+ "epoch": 1.55177304964539,
+ "grad_norm": 2.583299160003662,
+ "learning_rate": 4.25248326143995e-06,
+ "loss": 0.4661,
+ "step": 3282
+ },
+ {
+ "epoch": 1.5522458628841607,
+ "grad_norm": 2.5790653228759766,
+ "learning_rate": 4.252038313539209e-06,
+ "loss": 0.5455,
+ "step": 3283
+ },
+ {
+ "epoch": 1.5527186761229315,
+ "grad_norm": 2.872864007949829,
+ "learning_rate": 4.251593256546724e-06,
+ "loss": 0.5317,
+ "step": 3284
+ },
+ {
+ "epoch": 1.5531914893617023,
+ "grad_norm": 3.0382463932037354,
+ "learning_rate": 4.251148090490208e-06,
+ "loss": 0.5131,
+ "step": 3285
+ },
+ {
+ "epoch": 1.5536643026004728,
+ "grad_norm": 2.574399709701538,
+ "learning_rate": 4.250702815397379e-06,
+ "loss": 0.5399,
+ "step": 3286
+ },
+ {
+ "epoch": 1.5541371158392434,
+ "grad_norm": 2.9784770011901855,
+ "learning_rate": 4.250257431295962e-06,
+ "loss": 0.5209,
+ "step": 3287
+ },
+ {
+ "epoch": 1.5546099290780142,
+ "grad_norm": 2.6482062339782715,
+ "learning_rate": 4.249811938213689e-06,
+ "loss": 0.5416,
+ "step": 3288
+ },
+ {
+ "epoch": 1.555082742316785,
+ "grad_norm": 2.82142972946167,
+ "learning_rate": 4.2493663361783e-06,
+ "loss": 0.594,
+ "step": 3289
+ },
+ {
+ "epoch": 1.5555555555555556,
+ "grad_norm": 2.815595865249634,
+ "learning_rate": 4.24892062521754e-06,
+ "loss": 0.5381,
+ "step": 3290
+ },
+ {
+ "epoch": 1.5560283687943262,
+ "grad_norm": 2.689764976501465,
+ "learning_rate": 4.248474805359161e-06,
+ "loss": 0.5141,
+ "step": 3291
+ },
+ {
+ "epoch": 1.556501182033097,
+ "grad_norm": 2.7718515396118164,
+ "learning_rate": 4.248028876630922e-06,
+ "loss": 0.5324,
+ "step": 3292
+ },
+ {
+ "epoch": 1.5569739952718678,
+ "grad_norm": 3.0196774005889893,
+ "learning_rate": 4.247582839060591e-06,
+ "loss": 0.4971,
+ "step": 3293
+ },
+ {
+ "epoch": 1.5574468085106383,
+ "grad_norm": 2.608475923538208,
+ "learning_rate": 4.247136692675939e-06,
+ "loss": 0.5795,
+ "step": 3294
+ },
+ {
+ "epoch": 1.557919621749409,
+ "grad_norm": 2.4912326335906982,
+ "learning_rate": 4.246690437504746e-06,
+ "loss": 0.5348,
+ "step": 3295
+ },
+ {
+ "epoch": 1.5583924349881797,
+ "grad_norm": 2.519303560256958,
+ "learning_rate": 4.246244073574799e-06,
+ "loss": 0.4953,
+ "step": 3296
+ },
+ {
+ "epoch": 1.5588652482269505,
+ "grad_norm": 2.5667171478271484,
+ "learning_rate": 4.24579760091389e-06,
+ "loss": 0.5353,
+ "step": 3297
+ },
+ {
+ "epoch": 1.559338061465721,
+ "grad_norm": 2.8835761547088623,
+ "learning_rate": 4.24535101954982e-06,
+ "loss": 0.578,
+ "step": 3298
+ },
+ {
+ "epoch": 1.5598108747044916,
+ "grad_norm": 3.0506930351257324,
+ "learning_rate": 4.244904329510395e-06,
+ "loss": 0.6418,
+ "step": 3299
+ },
+ {
+ "epoch": 1.5602836879432624,
+ "grad_norm": 2.579446315765381,
+ "learning_rate": 4.244457530823428e-06,
+ "loss": 0.5027,
+ "step": 3300
+ },
+ {
+ "epoch": 1.5607565011820332,
+ "grad_norm": 2.72012996673584,
+ "learning_rate": 4.24401062351674e-06,
+ "loss": 0.5438,
+ "step": 3301
+ },
+ {
+ "epoch": 1.5612293144208038,
+ "grad_norm": 2.527007818222046,
+ "learning_rate": 4.243563607618158e-06,
+ "loss": 0.5303,
+ "step": 3302
+ },
+ {
+ "epoch": 1.5617021276595744,
+ "grad_norm": 2.4415159225463867,
+ "learning_rate": 4.243116483155516e-06,
+ "loss": 0.4893,
+ "step": 3303
+ },
+ {
+ "epoch": 1.5621749408983452,
+ "grad_norm": 2.462256669998169,
+ "learning_rate": 4.242669250156653e-06,
+ "loss": 0.5671,
+ "step": 3304
+ },
+ {
+ "epoch": 1.562647754137116,
+ "grad_norm": 2.479865074157715,
+ "learning_rate": 4.242221908649418e-06,
+ "loss": 0.5038,
+ "step": 3305
+ },
+ {
+ "epoch": 1.5631205673758866,
+ "grad_norm": 2.74670672416687,
+ "learning_rate": 4.241774458661662e-06,
+ "loss": 0.5689,
+ "step": 3306
+ },
+ {
+ "epoch": 1.5635933806146571,
+ "grad_norm": 2.55938982963562,
+ "learning_rate": 4.24132690022125e-06,
+ "loss": 0.492,
+ "step": 3307
+ },
+ {
+ "epoch": 1.564066193853428,
+ "grad_norm": 2.634956121444702,
+ "learning_rate": 4.240879233356048e-06,
+ "loss": 0.503,
+ "step": 3308
+ },
+ {
+ "epoch": 1.5645390070921987,
+ "grad_norm": 2.381775140762329,
+ "learning_rate": 4.240431458093928e-06,
+ "loss": 0.4939,
+ "step": 3309
+ },
+ {
+ "epoch": 1.5650118203309693,
+ "grad_norm": 2.8176610469818115,
+ "learning_rate": 4.239983574462774e-06,
+ "loss": 0.5609,
+ "step": 3310
+ },
+ {
+ "epoch": 1.5654846335697399,
+ "grad_norm": 3.0268442630767822,
+ "learning_rate": 4.239535582490471e-06,
+ "loss": 0.5427,
+ "step": 3311
+ },
+ {
+ "epoch": 1.5659574468085107,
+ "grad_norm": 2.5881481170654297,
+ "learning_rate": 4.239087482204916e-06,
+ "loss": 0.5538,
+ "step": 3312
+ },
+ {
+ "epoch": 1.5664302600472815,
+ "grad_norm": 2.5317704677581787,
+ "learning_rate": 4.238639273634008e-06,
+ "loss": 0.4915,
+ "step": 3313
+ },
+ {
+ "epoch": 1.566903073286052,
+ "grad_norm": 2.9608731269836426,
+ "learning_rate": 4.238190956805658e-06,
+ "loss": 0.564,
+ "step": 3314
+ },
+ {
+ "epoch": 1.5673758865248226,
+ "grad_norm": 3.022686243057251,
+ "learning_rate": 4.237742531747777e-06,
+ "loss": 0.5503,
+ "step": 3315
+ },
+ {
+ "epoch": 1.5678486997635934,
+ "grad_norm": 2.763622283935547,
+ "learning_rate": 4.23729399848829e-06,
+ "loss": 0.5241,
+ "step": 3316
+ },
+ {
+ "epoch": 1.5683215130023642,
+ "grad_norm": 2.6112794876098633,
+ "learning_rate": 4.236845357055122e-06,
+ "loss": 0.4919,
+ "step": 3317
+ },
+ {
+ "epoch": 1.5687943262411348,
+ "grad_norm": 2.649829149246216,
+ "learning_rate": 4.23639660747621e-06,
+ "loss": 0.5472,
+ "step": 3318
+ },
+ {
+ "epoch": 1.5692671394799054,
+ "grad_norm": 2.8888115882873535,
+ "learning_rate": 4.2359477497794955e-06,
+ "loss": 0.5077,
+ "step": 3319
+ },
+ {
+ "epoch": 1.5697399527186762,
+ "grad_norm": 2.5666911602020264,
+ "learning_rate": 4.235498783992927e-06,
+ "loss": 0.5365,
+ "step": 3320
+ },
+ {
+ "epoch": 1.570212765957447,
+ "grad_norm": 2.448758363723755,
+ "learning_rate": 4.2350497101444575e-06,
+ "loss": 0.5043,
+ "step": 3321
+ },
+ {
+ "epoch": 1.5706855791962175,
+ "grad_norm": 2.595207691192627,
+ "learning_rate": 4.234600528262052e-06,
+ "loss": 0.5303,
+ "step": 3322
+ },
+ {
+ "epoch": 1.5711583924349881,
+ "grad_norm": 2.7814228534698486,
+ "learning_rate": 4.234151238373676e-06,
+ "loss": 0.4521,
+ "step": 3323
+ },
+ {
+ "epoch": 1.571631205673759,
+ "grad_norm": 2.781538724899292,
+ "learning_rate": 4.233701840507308e-06,
+ "loss": 0.5193,
+ "step": 3324
+ },
+ {
+ "epoch": 1.5721040189125297,
+ "grad_norm": 2.771907329559326,
+ "learning_rate": 4.233252334690928e-06,
+ "loss": 0.497,
+ "step": 3325
+ },
+ {
+ "epoch": 1.5725768321513003,
+ "grad_norm": 2.5557498931884766,
+ "learning_rate": 4.232802720952525e-06,
+ "loss": 0.4913,
+ "step": 3326
+ },
+ {
+ "epoch": 1.5730496453900709,
+ "grad_norm": 2.478267192840576,
+ "learning_rate": 4.232352999320094e-06,
+ "loss": 0.4967,
+ "step": 3327
+ },
+ {
+ "epoch": 1.5735224586288417,
+ "grad_norm": 3.1548502445220947,
+ "learning_rate": 4.231903169821639e-06,
+ "loss": 0.5009,
+ "step": 3328
+ },
+ {
+ "epoch": 1.5739952718676125,
+ "grad_norm": 2.634824275970459,
+ "learning_rate": 4.231453232485168e-06,
+ "loss": 0.5223,
+ "step": 3329
+ },
+ {
+ "epoch": 1.574468085106383,
+ "grad_norm": 2.579102039337158,
+ "learning_rate": 4.231003187338695e-06,
+ "loss": 0.5513,
+ "step": 3330
+ },
+ {
+ "epoch": 1.5749408983451536,
+ "grad_norm": 2.8477070331573486,
+ "learning_rate": 4.230553034410245e-06,
+ "loss": 0.561,
+ "step": 3331
+ },
+ {
+ "epoch": 1.5754137115839244,
+ "grad_norm": 2.6714725494384766,
+ "learning_rate": 4.2301027737278446e-06,
+ "loss": 0.4687,
+ "step": 3332
+ },
+ {
+ "epoch": 1.5758865248226952,
+ "grad_norm": 2.6562764644622803,
+ "learning_rate": 4.229652405319532e-06,
+ "loss": 0.5925,
+ "step": 3333
+ },
+ {
+ "epoch": 1.5763593380614658,
+ "grad_norm": 2.750946283340454,
+ "learning_rate": 4.229201929213348e-06,
+ "loss": 0.4748,
+ "step": 3334
+ },
+ {
+ "epoch": 1.5768321513002364,
+ "grad_norm": 2.760470151901245,
+ "learning_rate": 4.228751345437342e-06,
+ "loss": 0.5989,
+ "step": 3335
+ },
+ {
+ "epoch": 1.5773049645390071,
+ "grad_norm": 3.1451845169067383,
+ "learning_rate": 4.2283006540195706e-06,
+ "loss": 0.562,
+ "step": 3336
+ },
+ {
+ "epoch": 1.5777777777777777,
+ "grad_norm": 2.563011407852173,
+ "learning_rate": 4.227849854988095e-06,
+ "loss": 0.5473,
+ "step": 3337
+ },
+ {
+ "epoch": 1.5782505910165483,
+ "grad_norm": 2.310469388961792,
+ "learning_rate": 4.2273989483709856e-06,
+ "loss": 0.5033,
+ "step": 3338
+ },
+ {
+ "epoch": 1.578723404255319,
+ "grad_norm": 2.677978754043579,
+ "learning_rate": 4.226947934196318e-06,
+ "loss": 0.5291,
+ "step": 3339
+ },
+ {
+ "epoch": 1.57919621749409,
+ "grad_norm": 3.0423545837402344,
+ "learning_rate": 4.226496812492176e-06,
+ "loss": 0.5201,
+ "step": 3340
+ },
+ {
+ "epoch": 1.5796690307328605,
+ "grad_norm": 2.357513904571533,
+ "learning_rate": 4.226045583286647e-06,
+ "loss": 0.4421,
+ "step": 3341
+ },
+ {
+ "epoch": 1.580141843971631,
+ "grad_norm": 2.719860315322876,
+ "learning_rate": 4.225594246607828e-06,
+ "loss": 0.4855,
+ "step": 3342
+ },
+ {
+ "epoch": 1.5806146572104018,
+ "grad_norm": 3.2645058631896973,
+ "learning_rate": 4.2251428024838215e-06,
+ "loss": 0.6654,
+ "step": 3343
+ },
+ {
+ "epoch": 1.5810874704491726,
+ "grad_norm": 2.2997004985809326,
+ "learning_rate": 4.224691250942737e-06,
+ "loss": 0.4565,
+ "step": 3344
+ },
+ {
+ "epoch": 1.5815602836879432,
+ "grad_norm": 2.8103034496307373,
+ "learning_rate": 4.2242395920126926e-06,
+ "loss": 0.5543,
+ "step": 3345
+ },
+ {
+ "epoch": 1.5820330969267138,
+ "grad_norm": 2.720254898071289,
+ "learning_rate": 4.223787825721808e-06,
+ "loss": 0.5028,
+ "step": 3346
+ },
+ {
+ "epoch": 1.5825059101654846,
+ "grad_norm": 2.735544204711914,
+ "learning_rate": 4.223335952098214e-06,
+ "loss": 0.5169,
+ "step": 3347
+ },
+ {
+ "epoch": 1.5829787234042554,
+ "grad_norm": 2.784254550933838,
+ "learning_rate": 4.222883971170047e-06,
+ "loss": 0.4989,
+ "step": 3348
+ },
+ {
+ "epoch": 1.583451536643026,
+ "grad_norm": 2.7192094326019287,
+ "learning_rate": 4.22243188296545e-06,
+ "loss": 0.502,
+ "step": 3349
+ },
+ {
+ "epoch": 1.5839243498817965,
+ "grad_norm": 2.716501474380493,
+ "learning_rate": 4.221979687512573e-06,
+ "loss": 0.5687,
+ "step": 3350
+ },
+ {
+ "epoch": 1.5843971631205673,
+ "grad_norm": 2.8420114517211914,
+ "learning_rate": 4.22152738483957e-06,
+ "loss": 0.5903,
+ "step": 3351
+ },
+ {
+ "epoch": 1.5848699763593381,
+ "grad_norm": 2.734872579574585,
+ "learning_rate": 4.2210749749746065e-06,
+ "loss": 0.5397,
+ "step": 3352
+ },
+ {
+ "epoch": 1.5853427895981087,
+ "grad_norm": 2.4343836307525635,
+ "learning_rate": 4.220622457945851e-06,
+ "loss": 0.436,
+ "step": 3353
+ },
+ {
+ "epoch": 1.5858156028368793,
+ "grad_norm": 2.728177547454834,
+ "learning_rate": 4.2201698337814785e-06,
+ "loss": 0.5703,
+ "step": 3354
+ },
+ {
+ "epoch": 1.58628841607565,
+ "grad_norm": 2.502098560333252,
+ "learning_rate": 4.219717102509674e-06,
+ "loss": 0.5275,
+ "step": 3355
+ },
+ {
+ "epoch": 1.5867612293144209,
+ "grad_norm": 2.6595494747161865,
+ "learning_rate": 4.219264264158627e-06,
+ "loss": 0.4659,
+ "step": 3356
+ },
+ {
+ "epoch": 1.5872340425531914,
+ "grad_norm": 2.5307185649871826,
+ "learning_rate": 4.218811318756532e-06,
+ "loss": 0.5048,
+ "step": 3357
+ },
+ {
+ "epoch": 1.587706855791962,
+ "grad_norm": 2.9300129413604736,
+ "learning_rate": 4.218358266331593e-06,
+ "loss": 0.5137,
+ "step": 3358
+ },
+ {
+ "epoch": 1.5881796690307328,
+ "grad_norm": 2.686586618423462,
+ "learning_rate": 4.21790510691202e-06,
+ "loss": 0.4529,
+ "step": 3359
+ },
+ {
+ "epoch": 1.5886524822695036,
+ "grad_norm": 2.9981517791748047,
+ "learning_rate": 4.217451840526029e-06,
+ "loss": 0.6054,
+ "step": 3360
+ },
+ {
+ "epoch": 1.5891252955082742,
+ "grad_norm": 2.6943674087524414,
+ "learning_rate": 4.216998467201841e-06,
+ "loss": 0.5153,
+ "step": 3361
+ },
+ {
+ "epoch": 1.5895981087470448,
+ "grad_norm": 2.707084894180298,
+ "learning_rate": 4.216544986967689e-06,
+ "loss": 0.5235,
+ "step": 3362
+ },
+ {
+ "epoch": 1.5900709219858156,
+ "grad_norm": 2.6553728580474854,
+ "learning_rate": 4.216091399851808e-06,
+ "loss": 0.5275,
+ "step": 3363
+ },
+ {
+ "epoch": 1.5905437352245864,
+ "grad_norm": 2.9136953353881836,
+ "learning_rate": 4.215637705882439e-06,
+ "loss": 0.5834,
+ "step": 3364
+ },
+ {
+ "epoch": 1.591016548463357,
+ "grad_norm": 2.7647159099578857,
+ "learning_rate": 4.2151839050878325e-06,
+ "loss": 0.5641,
+ "step": 3365
+ },
+ {
+ "epoch": 1.5914893617021275,
+ "grad_norm": 2.4556827545166016,
+ "learning_rate": 4.214729997496246e-06,
+ "loss": 0.5636,
+ "step": 3366
+ },
+ {
+ "epoch": 1.5919621749408983,
+ "grad_norm": 2.6111652851104736,
+ "learning_rate": 4.2142759831359414e-06,
+ "loss": 0.5097,
+ "step": 3367
+ },
+ {
+ "epoch": 1.592434988179669,
+ "grad_norm": 2.4886903762817383,
+ "learning_rate": 4.213821862035189e-06,
+ "loss": 0.531,
+ "step": 3368
+ },
+ {
+ "epoch": 1.5929078014184397,
+ "grad_norm": 2.5245840549468994,
+ "learning_rate": 4.213367634222263e-06,
+ "loss": 0.5085,
+ "step": 3369
+ },
+ {
+ "epoch": 1.5933806146572103,
+ "grad_norm": 2.970214605331421,
+ "learning_rate": 4.212913299725447e-06,
+ "loss": 0.5851,
+ "step": 3370
+ },
+ {
+ "epoch": 1.593853427895981,
+ "grad_norm": 2.5433361530303955,
+ "learning_rate": 4.212458858573032e-06,
+ "loss": 0.48,
+ "step": 3371
+ },
+ {
+ "epoch": 1.5943262411347519,
+ "grad_norm": 2.3550102710723877,
+ "learning_rate": 4.212004310793312e-06,
+ "loss": 0.4405,
+ "step": 3372
+ },
+ {
+ "epoch": 1.5947990543735224,
+ "grad_norm": 2.4824719429016113,
+ "learning_rate": 4.2115496564145896e-06,
+ "loss": 0.4634,
+ "step": 3373
+ },
+ {
+ "epoch": 1.595271867612293,
+ "grad_norm": 2.4751930236816406,
+ "learning_rate": 4.211094895465176e-06,
+ "loss": 0.5662,
+ "step": 3374
+ },
+ {
+ "epoch": 1.5957446808510638,
+ "grad_norm": 2.4193356037139893,
+ "learning_rate": 4.210640027973386e-06,
+ "loss": 0.4441,
+ "step": 3375
+ },
+ {
+ "epoch": 1.5962174940898346,
+ "grad_norm": 2.4477498531341553,
+ "learning_rate": 4.210185053967543e-06,
+ "loss": 0.5205,
+ "step": 3376
+ },
+ {
+ "epoch": 1.5966903073286052,
+ "grad_norm": 2.7954161167144775,
+ "learning_rate": 4.209729973475976e-06,
+ "loss": 0.4951,
+ "step": 3377
+ },
+ {
+ "epoch": 1.5971631205673757,
+ "grad_norm": 3.1907570362091064,
+ "learning_rate": 4.209274786527019e-06,
+ "loss": 0.6024,
+ "step": 3378
+ },
+ {
+ "epoch": 1.5976359338061465,
+ "grad_norm": 2.485245704650879,
+ "learning_rate": 4.2088194931490165e-06,
+ "loss": 0.5652,
+ "step": 3379
+ },
+ {
+ "epoch": 1.5981087470449173,
+ "grad_norm": 2.589310884475708,
+ "learning_rate": 4.208364093370317e-06,
+ "loss": 0.5085,
+ "step": 3380
+ },
+ {
+ "epoch": 1.598581560283688,
+ "grad_norm": 2.8941214084625244,
+ "learning_rate": 4.207908587219276e-06,
+ "loss": 0.53,
+ "step": 3381
+ },
+ {
+ "epoch": 1.5990543735224585,
+ "grad_norm": 2.480509042739868,
+ "learning_rate": 4.207452974724258e-06,
+ "loss": 0.4543,
+ "step": 3382
+ },
+ {
+ "epoch": 1.5995271867612293,
+ "grad_norm": 2.7884905338287354,
+ "learning_rate": 4.206997255913629e-06,
+ "loss": 0.5483,
+ "step": 3383
+ },
+ {
+ "epoch": 1.6,
+ "grad_norm": 2.7976696491241455,
+ "learning_rate": 4.206541430815766e-06,
+ "loss": 0.4734,
+ "step": 3384
+ },
+ {
+ "epoch": 1.6004728132387707,
+ "grad_norm": 2.5463132858276367,
+ "learning_rate": 4.206085499459051e-06,
+ "loss": 0.4931,
+ "step": 3385
+ },
+ {
+ "epoch": 1.6009456264775412,
+ "grad_norm": 2.8384251594543457,
+ "learning_rate": 4.205629461871871e-06,
+ "loss": 0.5066,
+ "step": 3386
+ },
+ {
+ "epoch": 1.601418439716312,
+ "grad_norm": 2.8578574657440186,
+ "learning_rate": 4.205173318082626e-06,
+ "loss": 0.458,
+ "step": 3387
+ },
+ {
+ "epoch": 1.6018912529550828,
+ "grad_norm": 2.7779932022094727,
+ "learning_rate": 4.204717068119715e-06,
+ "loss": 0.5293,
+ "step": 3388
+ },
+ {
+ "epoch": 1.6023640661938534,
+ "grad_norm": 2.9123778343200684,
+ "learning_rate": 4.204260712011546e-06,
+ "loss": 0.4866,
+ "step": 3389
+ },
+ {
+ "epoch": 1.602836879432624,
+ "grad_norm": 2.757922887802124,
+ "learning_rate": 4.203804249786537e-06,
+ "loss": 0.4925,
+ "step": 3390
+ },
+ {
+ "epoch": 1.6033096926713948,
+ "grad_norm": 3.287733316421509,
+ "learning_rate": 4.203347681473107e-06,
+ "loss": 0.6694,
+ "step": 3391
+ },
+ {
+ "epoch": 1.6037825059101656,
+ "grad_norm": 3.2117912769317627,
+ "learning_rate": 4.202891007099687e-06,
+ "loss": 0.5269,
+ "step": 3392
+ },
+ {
+ "epoch": 1.6042553191489362,
+ "grad_norm": 2.8489456176757812,
+ "learning_rate": 4.20243422669471e-06,
+ "loss": 0.5073,
+ "step": 3393
+ },
+ {
+ "epoch": 1.6047281323877067,
+ "grad_norm": 2.7660224437713623,
+ "learning_rate": 4.201977340286619e-06,
+ "loss": 0.5014,
+ "step": 3394
+ },
+ {
+ "epoch": 1.6052009456264775,
+ "grad_norm": 2.68182110786438,
+ "learning_rate": 4.201520347903862e-06,
+ "loss": 0.4542,
+ "step": 3395
+ },
+ {
+ "epoch": 1.6056737588652483,
+ "grad_norm": 2.7546045780181885,
+ "learning_rate": 4.2010632495748934e-06,
+ "loss": 0.516,
+ "step": 3396
+ },
+ {
+ "epoch": 1.606146572104019,
+ "grad_norm": 2.744668483734131,
+ "learning_rate": 4.200606045328176e-06,
+ "loss": 0.5243,
+ "step": 3397
+ },
+ {
+ "epoch": 1.6066193853427895,
+ "grad_norm": 2.935343027114868,
+ "learning_rate": 4.200148735192177e-06,
+ "loss": 0.5624,
+ "step": 3398
+ },
+ {
+ "epoch": 1.6070921985815603,
+ "grad_norm": 2.7392852306365967,
+ "learning_rate": 4.19969131919537e-06,
+ "loss": 0.5796,
+ "step": 3399
+ },
+ {
+ "epoch": 1.607565011820331,
+ "grad_norm": 2.864750385284424,
+ "learning_rate": 4.199233797366239e-06,
+ "loss": 0.549,
+ "step": 3400
+ },
+ {
+ "epoch": 1.6080378250591016,
+ "grad_norm": 2.684157371520996,
+ "learning_rate": 4.198776169733269e-06,
+ "loss": 0.5532,
+ "step": 3401
+ },
+ {
+ "epoch": 1.6085106382978722,
+ "grad_norm": 2.4717135429382324,
+ "learning_rate": 4.198318436324957e-06,
+ "loss": 0.5174,
+ "step": 3402
+ },
+ {
+ "epoch": 1.608983451536643,
+ "grad_norm": 2.640242338180542,
+ "learning_rate": 4.197860597169802e-06,
+ "loss": 0.5117,
+ "step": 3403
+ },
+ {
+ "epoch": 1.6094562647754138,
+ "grad_norm": 2.4957473278045654,
+ "learning_rate": 4.197402652296313e-06,
+ "loss": 0.474,
+ "step": 3404
+ },
+ {
+ "epoch": 1.6099290780141844,
+ "grad_norm": 2.416138172149658,
+ "learning_rate": 4.196944601733004e-06,
+ "loss": 0.4858,
+ "step": 3405
+ },
+ {
+ "epoch": 1.610401891252955,
+ "grad_norm": 2.4498109817504883,
+ "learning_rate": 4.196486445508395e-06,
+ "loss": 0.5048,
+ "step": 3406
+ },
+ {
+ "epoch": 1.6108747044917258,
+ "grad_norm": 2.415895938873291,
+ "learning_rate": 4.196028183651014e-06,
+ "loss": 0.4745,
+ "step": 3407
+ },
+ {
+ "epoch": 1.6113475177304966,
+ "grad_norm": 2.843665838241577,
+ "learning_rate": 4.195569816189395e-06,
+ "loss": 0.5219,
+ "step": 3408
+ },
+ {
+ "epoch": 1.6118203309692671,
+ "grad_norm": 2.608579158782959,
+ "learning_rate": 4.195111343152079e-06,
+ "loss": 0.4941,
+ "step": 3409
+ },
+ {
+ "epoch": 1.6122931442080377,
+ "grad_norm": 2.643789529800415,
+ "learning_rate": 4.194652764567611e-06,
+ "loss": 0.515,
+ "step": 3410
+ },
+ {
+ "epoch": 1.6127659574468085,
+ "grad_norm": 2.8099429607391357,
+ "learning_rate": 4.194194080464547e-06,
+ "loss": 0.4935,
+ "step": 3411
+ },
+ {
+ "epoch": 1.6132387706855793,
+ "grad_norm": 2.595628261566162,
+ "learning_rate": 4.193735290871446e-06,
+ "loss": 0.5571,
+ "step": 3412
+ },
+ {
+ "epoch": 1.6137115839243499,
+ "grad_norm": 2.7903778553009033,
+ "learning_rate": 4.193276395816876e-06,
+ "loss": 0.5228,
+ "step": 3413
+ },
+ {
+ "epoch": 1.6141843971631205,
+ "grad_norm": 2.83910870552063,
+ "learning_rate": 4.192817395329409e-06,
+ "loss": 0.6124,
+ "step": 3414
+ },
+ {
+ "epoch": 1.6146572104018913,
+ "grad_norm": 2.6155734062194824,
+ "learning_rate": 4.192358289437626e-06,
+ "loss": 0.552,
+ "step": 3415
+ },
+ {
+ "epoch": 1.615130023640662,
+ "grad_norm": 2.795832872390747,
+ "learning_rate": 4.191899078170113e-06,
+ "loss": 0.5561,
+ "step": 3416
+ },
+ {
+ "epoch": 1.6156028368794326,
+ "grad_norm": 2.3402161598205566,
+ "learning_rate": 4.191439761555464e-06,
+ "loss": 0.4889,
+ "step": 3417
+ },
+ {
+ "epoch": 1.6160756501182032,
+ "grad_norm": 3.1183433532714844,
+ "learning_rate": 4.190980339622276e-06,
+ "loss": 0.5337,
+ "step": 3418
+ },
+ {
+ "epoch": 1.616548463356974,
+ "grad_norm": 2.6262872219085693,
+ "learning_rate": 4.190520812399158e-06,
+ "loss": 0.525,
+ "step": 3419
+ },
+ {
+ "epoch": 1.6170212765957448,
+ "grad_norm": 2.578340530395508,
+ "learning_rate": 4.190061179914722e-06,
+ "loss": 0.4975,
+ "step": 3420
+ },
+ {
+ "epoch": 1.6174940898345154,
+ "grad_norm": 3.19482159614563,
+ "learning_rate": 4.189601442197586e-06,
+ "loss": 0.5832,
+ "step": 3421
+ },
+ {
+ "epoch": 1.617966903073286,
+ "grad_norm": 2.6398792266845703,
+ "learning_rate": 4.189141599276378e-06,
+ "loss": 0.4676,
+ "step": 3422
+ },
+ {
+ "epoch": 1.6184397163120567,
+ "grad_norm": 2.624865770339966,
+ "learning_rate": 4.1886816511797275e-06,
+ "loss": 0.4507,
+ "step": 3423
+ },
+ {
+ "epoch": 1.6189125295508275,
+ "grad_norm": 2.4136857986450195,
+ "learning_rate": 4.1882215979362775e-06,
+ "loss": 0.4616,
+ "step": 3424
+ },
+ {
+ "epoch": 1.6193853427895981,
+ "grad_norm": 2.6906614303588867,
+ "learning_rate": 4.18776143957467e-06,
+ "loss": 0.5142,
+ "step": 3425
+ },
+ {
+ "epoch": 1.6198581560283687,
+ "grad_norm": 2.5149154663085938,
+ "learning_rate": 4.187301176123558e-06,
+ "loss": 0.5252,
+ "step": 3426
+ },
+ {
+ "epoch": 1.6203309692671395,
+ "grad_norm": 2.677405834197998,
+ "learning_rate": 4.186840807611602e-06,
+ "loss": 0.4635,
+ "step": 3427
+ },
+ {
+ "epoch": 1.6208037825059103,
+ "grad_norm": 2.7164649963378906,
+ "learning_rate": 4.186380334067464e-06,
+ "loss": 0.5634,
+ "step": 3428
+ },
+ {
+ "epoch": 1.6212765957446809,
+ "grad_norm": 2.8299832344055176,
+ "learning_rate": 4.185919755519817e-06,
+ "loss": 0.5166,
+ "step": 3429
+ },
+ {
+ "epoch": 1.6217494089834514,
+ "grad_norm": 2.465848207473755,
+ "learning_rate": 4.18545907199734e-06,
+ "loss": 0.4696,
+ "step": 3430
+ },
+ {
+ "epoch": 1.6222222222222222,
+ "grad_norm": 2.407616376876831,
+ "learning_rate": 4.1849982835287175e-06,
+ "loss": 0.5111,
+ "step": 3431
+ },
+ {
+ "epoch": 1.622695035460993,
+ "grad_norm": 2.452146291732788,
+ "learning_rate": 4.184537390142639e-06,
+ "loss": 0.4574,
+ "step": 3432
+ },
+ {
+ "epoch": 1.6231678486997636,
+ "grad_norm": 2.653071165084839,
+ "learning_rate": 4.1840763918678055e-06,
+ "loss": 0.5611,
+ "step": 3433
+ },
+ {
+ "epoch": 1.6236406619385342,
+ "grad_norm": 2.5920350551605225,
+ "learning_rate": 4.183615288732919e-06,
+ "loss": 0.5437,
+ "step": 3434
+ },
+ {
+ "epoch": 1.624113475177305,
+ "grad_norm": 2.782900810241699,
+ "learning_rate": 4.18315408076669e-06,
+ "loss": 0.5824,
+ "step": 3435
+ },
+ {
+ "epoch": 1.6245862884160758,
+ "grad_norm": 2.8769774436950684,
+ "learning_rate": 4.1826927679978365e-06,
+ "loss": 0.5271,
+ "step": 3436
+ },
+ {
+ "epoch": 1.6250591016548463,
+ "grad_norm": 2.488598585128784,
+ "learning_rate": 4.182231350455084e-06,
+ "loss": 0.4684,
+ "step": 3437
+ },
+ {
+ "epoch": 1.625531914893617,
+ "grad_norm": 2.6472036838531494,
+ "learning_rate": 4.181769828167161e-06,
+ "loss": 0.5372,
+ "step": 3438
+ },
+ {
+ "epoch": 1.6260047281323877,
+ "grad_norm": 2.6498794555664062,
+ "learning_rate": 4.1813082011628045e-06,
+ "loss": 0.4805,
+ "step": 3439
+ },
+ {
+ "epoch": 1.6264775413711585,
+ "grad_norm": 2.5386533737182617,
+ "learning_rate": 4.1808464694707595e-06,
+ "loss": 0.5015,
+ "step": 3440
+ },
+ {
+ "epoch": 1.626950354609929,
+ "grad_norm": 2.8812551498413086,
+ "learning_rate": 4.180384633119775e-06,
+ "loss": 0.5225,
+ "step": 3441
+ },
+ {
+ "epoch": 1.6274231678486997,
+ "grad_norm": 2.870124578475952,
+ "learning_rate": 4.179922692138609e-06,
+ "loss": 0.537,
+ "step": 3442
+ },
+ {
+ "epoch": 1.6278959810874705,
+ "grad_norm": 2.5759785175323486,
+ "learning_rate": 4.179460646556021e-06,
+ "loss": 0.5142,
+ "step": 3443
+ },
+ {
+ "epoch": 1.6283687943262413,
+ "grad_norm": 2.629347324371338,
+ "learning_rate": 4.1789984964007836e-06,
+ "loss": 0.5007,
+ "step": 3444
+ },
+ {
+ "epoch": 1.6288416075650118,
+ "grad_norm": 2.751128673553467,
+ "learning_rate": 4.178536241701672e-06,
+ "loss": 0.5677,
+ "step": 3445
+ },
+ {
+ "epoch": 1.6293144208037824,
+ "grad_norm": 2.7582364082336426,
+ "learning_rate": 4.178073882487469e-06,
+ "loss": 0.499,
+ "step": 3446
+ },
+ {
+ "epoch": 1.6297872340425532,
+ "grad_norm": 3.136711359024048,
+ "learning_rate": 4.177611418786963e-06,
+ "loss": 0.5294,
+ "step": 3447
+ },
+ {
+ "epoch": 1.630260047281324,
+ "grad_norm": 2.7363100051879883,
+ "learning_rate": 4.17714885062895e-06,
+ "loss": 0.5264,
+ "step": 3448
+ },
+ {
+ "epoch": 1.6307328605200946,
+ "grad_norm": 2.7305946350097656,
+ "learning_rate": 4.176686178042233e-06,
+ "loss": 0.5235,
+ "step": 3449
+ },
+ {
+ "epoch": 1.6312056737588652,
+ "grad_norm": 2.6500556468963623,
+ "learning_rate": 4.176223401055619e-06,
+ "loss": 0.5463,
+ "step": 3450
+ },
+ {
+ "epoch": 1.631678486997636,
+ "grad_norm": 2.756321907043457,
+ "learning_rate": 4.175760519697924e-06,
+ "loss": 0.545,
+ "step": 3451
+ },
+ {
+ "epoch": 1.6321513002364068,
+ "grad_norm": 2.6234960556030273,
+ "learning_rate": 4.17529753399797e-06,
+ "loss": 0.4927,
+ "step": 3452
+ },
+ {
+ "epoch": 1.6326241134751773,
+ "grad_norm": 2.6358842849731445,
+ "learning_rate": 4.174834443984584e-06,
+ "loss": 0.5445,
+ "step": 3453
+ },
+ {
+ "epoch": 1.633096926713948,
+ "grad_norm": 2.541147470474243,
+ "learning_rate": 4.174371249686601e-06,
+ "loss": 0.4691,
+ "step": 3454
+ },
+ {
+ "epoch": 1.6335697399527187,
+ "grad_norm": 2.566981077194214,
+ "learning_rate": 4.173907951132863e-06,
+ "loss": 0.4932,
+ "step": 3455
+ },
+ {
+ "epoch": 1.6340425531914895,
+ "grad_norm": 2.670940399169922,
+ "learning_rate": 4.173444548352216e-06,
+ "loss": 0.4979,
+ "step": 3456
+ },
+ {
+ "epoch": 1.63451536643026,
+ "grad_norm": 2.5440268516540527,
+ "learning_rate": 4.172981041373515e-06,
+ "loss": 0.4716,
+ "step": 3457
+ },
+ {
+ "epoch": 1.6349881796690307,
+ "grad_norm": 2.3801631927490234,
+ "learning_rate": 4.17251743022562e-06,
+ "loss": 0.5126,
+ "step": 3458
+ },
+ {
+ "epoch": 1.6354609929078014,
+ "grad_norm": 2.5051121711730957,
+ "learning_rate": 4.1720537149373985e-06,
+ "loss": 0.4964,
+ "step": 3459
+ },
+ {
+ "epoch": 1.6359338061465722,
+ "grad_norm": 3.5521697998046875,
+ "learning_rate": 4.171589895537724e-06,
+ "loss": 0.5447,
+ "step": 3460
+ },
+ {
+ "epoch": 1.6364066193853428,
+ "grad_norm": 2.6041572093963623,
+ "learning_rate": 4.171125972055477e-06,
+ "loss": 0.4637,
+ "step": 3461
+ },
+ {
+ "epoch": 1.6368794326241134,
+ "grad_norm": 2.2297258377075195,
+ "learning_rate": 4.170661944519543e-06,
+ "loss": 0.4702,
+ "step": 3462
+ },
+ {
+ "epoch": 1.6373522458628842,
+ "grad_norm": 2.6764535903930664,
+ "learning_rate": 4.170197812958815e-06,
+ "loss": 0.5111,
+ "step": 3463
+ },
+ {
+ "epoch": 1.637825059101655,
+ "grad_norm": 2.86892032623291,
+ "learning_rate": 4.169733577402193e-06,
+ "loss": 0.5437,
+ "step": 3464
+ },
+ {
+ "epoch": 1.6382978723404256,
+ "grad_norm": 2.9007070064544678,
+ "learning_rate": 4.1692692378785825e-06,
+ "loss": 0.5425,
+ "step": 3465
+ },
+ {
+ "epoch": 1.6387706855791961,
+ "grad_norm": 2.5902905464172363,
+ "learning_rate": 4.168804794416896e-06,
+ "loss": 0.5252,
+ "step": 3466
+ },
+ {
+ "epoch": 1.639243498817967,
+ "grad_norm": 2.821183681488037,
+ "learning_rate": 4.168340247046053e-06,
+ "loss": 0.5265,
+ "step": 3467
+ },
+ {
+ "epoch": 1.6397163120567377,
+ "grad_norm": 2.7928314208984375,
+ "learning_rate": 4.167875595794978e-06,
+ "loss": 0.5151,
+ "step": 3468
+ },
+ {
+ "epoch": 1.6401891252955083,
+ "grad_norm": 2.3130412101745605,
+ "learning_rate": 4.167410840692603e-06,
+ "loss": 0.4941,
+ "step": 3469
+ },
+ {
+ "epoch": 1.6406619385342789,
+ "grad_norm": 2.6078619956970215,
+ "learning_rate": 4.1669459817678655e-06,
+ "loss": 0.493,
+ "step": 3470
+ },
+ {
+ "epoch": 1.6411347517730497,
+ "grad_norm": 2.5335731506347656,
+ "learning_rate": 4.166481019049712e-06,
+ "loss": 0.4969,
+ "step": 3471
+ },
+ {
+ "epoch": 1.6416075650118205,
+ "grad_norm": 2.8181469440460205,
+ "learning_rate": 4.166015952567093e-06,
+ "loss": 0.5062,
+ "step": 3472
+ },
+ {
+ "epoch": 1.642080378250591,
+ "grad_norm": 2.7256782054901123,
+ "learning_rate": 4.165550782348966e-06,
+ "loss": 0.5397,
+ "step": 3473
+ },
+ {
+ "epoch": 1.6425531914893616,
+ "grad_norm": 2.284345865249634,
+ "learning_rate": 4.1650855084242946e-06,
+ "loss": 0.4448,
+ "step": 3474
+ },
+ {
+ "epoch": 1.6430260047281324,
+ "grad_norm": 3.0383145809173584,
+ "learning_rate": 4.164620130822049e-06,
+ "loss": 0.5873,
+ "step": 3475
+ },
+ {
+ "epoch": 1.6434988179669032,
+ "grad_norm": 2.754448652267456,
+ "learning_rate": 4.1641546495712085e-06,
+ "loss": 0.4852,
+ "step": 3476
+ },
+ {
+ "epoch": 1.6439716312056738,
+ "grad_norm": 2.6820101737976074,
+ "learning_rate": 4.1636890647007535e-06,
+ "loss": 0.5325,
+ "step": 3477
+ },
+ {
+ "epoch": 1.6444444444444444,
+ "grad_norm": 2.6396398544311523,
+ "learning_rate": 4.163223376239676e-06,
+ "loss": 0.466,
+ "step": 3478
+ },
+ {
+ "epoch": 1.6449172576832152,
+ "grad_norm": 2.395049810409546,
+ "learning_rate": 4.162757584216972e-06,
+ "loss": 0.4531,
+ "step": 3479
+ },
+ {
+ "epoch": 1.645390070921986,
+ "grad_norm": 2.596670627593994,
+ "learning_rate": 4.162291688661645e-06,
+ "loss": 0.5207,
+ "step": 3480
+ },
+ {
+ "epoch": 1.6458628841607565,
+ "grad_norm": 2.4391872882843018,
+ "learning_rate": 4.161825689602703e-06,
+ "loss": 0.5133,
+ "step": 3481
+ },
+ {
+ "epoch": 1.6463356973995271,
+ "grad_norm": 2.6169841289520264,
+ "learning_rate": 4.161359587069162e-06,
+ "loss": 0.5096,
+ "step": 3482
+ },
+ {
+ "epoch": 1.646808510638298,
+ "grad_norm": 2.634089946746826,
+ "learning_rate": 4.1608933810900445e-06,
+ "loss": 0.4921,
+ "step": 3483
+ },
+ {
+ "epoch": 1.6472813238770687,
+ "grad_norm": 2.815877914428711,
+ "learning_rate": 4.160427071694379e-06,
+ "loss": 0.5045,
+ "step": 3484
+ },
+ {
+ "epoch": 1.6477541371158393,
+ "grad_norm": 2.417525053024292,
+ "learning_rate": 4.159960658911199e-06,
+ "loss": 0.4997,
+ "step": 3485
+ },
+ {
+ "epoch": 1.6482269503546099,
+ "grad_norm": 2.5713605880737305,
+ "learning_rate": 4.15949414276955e-06,
+ "loss": 0.5246,
+ "step": 3486
+ },
+ {
+ "epoch": 1.6486997635933807,
+ "grad_norm": 3.49833607673645,
+ "learning_rate": 4.159027523298475e-06,
+ "loss": 0.4901,
+ "step": 3487
+ },
+ {
+ "epoch": 1.6491725768321515,
+ "grad_norm": 2.985464334487915,
+ "learning_rate": 4.158560800527033e-06,
+ "loss": 0.5726,
+ "step": 3488
+ },
+ {
+ "epoch": 1.649645390070922,
+ "grad_norm": 2.72745680809021,
+ "learning_rate": 4.158093974484282e-06,
+ "loss": 0.5119,
+ "step": 3489
+ },
+ {
+ "epoch": 1.6501182033096926,
+ "grad_norm": 2.4885571002960205,
+ "learning_rate": 4.157627045199289e-06,
+ "loss": 0.4838,
+ "step": 3490
+ },
+ {
+ "epoch": 1.6505910165484634,
+ "grad_norm": 2.7622628211975098,
+ "learning_rate": 4.157160012701128e-06,
+ "loss": 0.5269,
+ "step": 3491
+ },
+ {
+ "epoch": 1.6510638297872342,
+ "grad_norm": 2.615122079849243,
+ "learning_rate": 4.156692877018879e-06,
+ "loss": 0.5501,
+ "step": 3492
+ },
+ {
+ "epoch": 1.6515366430260048,
+ "grad_norm": 2.827753782272339,
+ "learning_rate": 4.156225638181631e-06,
+ "loss": 0.5452,
+ "step": 3493
+ },
+ {
+ "epoch": 1.6520094562647754,
+ "grad_norm": 2.724820137023926,
+ "learning_rate": 4.155758296218474e-06,
+ "loss": 0.5155,
+ "step": 3494
+ },
+ {
+ "epoch": 1.6524822695035462,
+ "grad_norm": 2.5806174278259277,
+ "learning_rate": 4.155290851158508e-06,
+ "loss": 0.5292,
+ "step": 3495
+ },
+ {
+ "epoch": 1.652955082742317,
+ "grad_norm": 2.5655179023742676,
+ "learning_rate": 4.154823303030838e-06,
+ "loss": 0.4959,
+ "step": 3496
+ },
+ {
+ "epoch": 1.6534278959810875,
+ "grad_norm": 2.656548261642456,
+ "learning_rate": 4.154355651864579e-06,
+ "loss": 0.5703,
+ "step": 3497
+ },
+ {
+ "epoch": 1.653900709219858,
+ "grad_norm": 2.9085004329681396,
+ "learning_rate": 4.153887897688847e-06,
+ "loss": 0.5061,
+ "step": 3498
+ },
+ {
+ "epoch": 1.654373522458629,
+ "grad_norm": 2.608010768890381,
+ "learning_rate": 4.1534200405327665e-06,
+ "loss": 0.5165,
+ "step": 3499
+ },
+ {
+ "epoch": 1.6548463356973995,
+ "grad_norm": 2.600463628768921,
+ "learning_rate": 4.152952080425471e-06,
+ "loss": 0.4946,
+ "step": 3500
+ },
+ {
+ "epoch": 1.65531914893617,
+ "grad_norm": 2.5561563968658447,
+ "learning_rate": 4.152484017396098e-06,
+ "loss": 0.4804,
+ "step": 3501
+ },
+ {
+ "epoch": 1.6557919621749408,
+ "grad_norm": 2.788594961166382,
+ "learning_rate": 4.152015851473791e-06,
+ "loss": 0.5635,
+ "step": 3502
+ },
+ {
+ "epoch": 1.6562647754137116,
+ "grad_norm": 2.693302631378174,
+ "learning_rate": 4.151547582687699e-06,
+ "loss": 0.5139,
+ "step": 3503
+ },
+ {
+ "epoch": 1.6567375886524822,
+ "grad_norm": 2.7887485027313232,
+ "learning_rate": 4.1510792110669825e-06,
+ "loss": 0.4952,
+ "step": 3504
+ },
+ {
+ "epoch": 1.6572104018912528,
+ "grad_norm": 2.8982298374176025,
+ "learning_rate": 4.150610736640803e-06,
+ "loss": 0.4136,
+ "step": 3505
+ },
+ {
+ "epoch": 1.6576832151300236,
+ "grad_norm": 2.7569408416748047,
+ "learning_rate": 4.150142159438331e-06,
+ "loss": 0.5272,
+ "step": 3506
+ },
+ {
+ "epoch": 1.6581560283687944,
+ "grad_norm": 2.531648874282837,
+ "learning_rate": 4.149673479488742e-06,
+ "loss": 0.5016,
+ "step": 3507
+ },
+ {
+ "epoch": 1.658628841607565,
+ "grad_norm": 2.7706353664398193,
+ "learning_rate": 4.149204696821219e-06,
+ "loss": 0.5512,
+ "step": 3508
+ },
+ {
+ "epoch": 1.6591016548463355,
+ "grad_norm": 2.7307450771331787,
+ "learning_rate": 4.148735811464951e-06,
+ "loss": 0.4968,
+ "step": 3509
+ },
+ {
+ "epoch": 1.6595744680851063,
+ "grad_norm": 3.0097429752349854,
+ "learning_rate": 4.1482668234491335e-06,
+ "loss": 0.4797,
+ "step": 3510
+ },
+ {
+ "epoch": 1.6600472813238771,
+ "grad_norm": 2.6045308113098145,
+ "learning_rate": 4.147797732802969e-06,
+ "loss": 0.5496,
+ "step": 3511
+ },
+ {
+ "epoch": 1.6605200945626477,
+ "grad_norm": 2.702061176300049,
+ "learning_rate": 4.147328539555664e-06,
+ "loss": 0.5302,
+ "step": 3512
+ },
+ {
+ "epoch": 1.6609929078014183,
+ "grad_norm": 3.3724892139434814,
+ "learning_rate": 4.1468592437364356e-06,
+ "loss": 0.5124,
+ "step": 3513
+ },
+ {
+ "epoch": 1.661465721040189,
+ "grad_norm": 2.5117242336273193,
+ "learning_rate": 4.146389845374502e-06,
+ "loss": 0.4953,
+ "step": 3514
+ },
+ {
+ "epoch": 1.6619385342789599,
+ "grad_norm": 2.86547589302063,
+ "learning_rate": 4.145920344499092e-06,
+ "loss": 0.5337,
+ "step": 3515
+ },
+ {
+ "epoch": 1.6624113475177305,
+ "grad_norm": 2.745149850845337,
+ "learning_rate": 4.14545074113944e-06,
+ "loss": 0.5187,
+ "step": 3516
+ },
+ {
+ "epoch": 1.662884160756501,
+ "grad_norm": 2.5560994148254395,
+ "learning_rate": 4.1449810353247855e-06,
+ "loss": 0.5183,
+ "step": 3517
+ },
+ {
+ "epoch": 1.6633569739952718,
+ "grad_norm": 2.2318122386932373,
+ "learning_rate": 4.144511227084374e-06,
+ "loss": 0.4452,
+ "step": 3518
+ },
+ {
+ "epoch": 1.6638297872340426,
+ "grad_norm": 2.6980903148651123,
+ "learning_rate": 4.14404131644746e-06,
+ "loss": 0.4974,
+ "step": 3519
+ },
+ {
+ "epoch": 1.6643026004728132,
+ "grad_norm": 2.6875357627868652,
+ "learning_rate": 4.1435713034433025e-06,
+ "loss": 0.4582,
+ "step": 3520
+ },
+ {
+ "epoch": 1.6647754137115838,
+ "grad_norm": 2.9430019855499268,
+ "learning_rate": 4.143101188101166e-06,
+ "loss": 0.5004,
+ "step": 3521
+ },
+ {
+ "epoch": 1.6652482269503546,
+ "grad_norm": 2.4447221755981445,
+ "learning_rate": 4.142630970450323e-06,
+ "loss": 0.5436,
+ "step": 3522
+ },
+ {
+ "epoch": 1.6657210401891254,
+ "grad_norm": 2.571023941040039,
+ "learning_rate": 4.142160650520053e-06,
+ "loss": 0.5307,
+ "step": 3523
+ },
+ {
+ "epoch": 1.666193853427896,
+ "grad_norm": 2.9725306034088135,
+ "learning_rate": 4.14169022833964e-06,
+ "loss": 0.5918,
+ "step": 3524
+ },
+ {
+ "epoch": 1.6666666666666665,
+ "grad_norm": 2.5958926677703857,
+ "learning_rate": 4.141219703938375e-06,
+ "loss": 0.5036,
+ "step": 3525
+ },
+ {
+ "epoch": 1.6671394799054373,
+ "grad_norm": 2.935788631439209,
+ "learning_rate": 4.140749077345556e-06,
+ "loss": 0.5773,
+ "step": 3526
+ },
+ {
+ "epoch": 1.6676122931442081,
+ "grad_norm": 2.5460526943206787,
+ "learning_rate": 4.140278348590485e-06,
+ "loss": 0.4762,
+ "step": 3527
+ },
+ {
+ "epoch": 1.6680851063829787,
+ "grad_norm": 2.5729143619537354,
+ "learning_rate": 4.139807517702475e-06,
+ "loss": 0.5515,
+ "step": 3528
+ },
+ {
+ "epoch": 1.6685579196217493,
+ "grad_norm": 2.4377381801605225,
+ "learning_rate": 4.13933658471084e-06,
+ "loss": 0.5383,
+ "step": 3529
+ },
+ {
+ "epoch": 1.66903073286052,
+ "grad_norm": 2.6284425258636475,
+ "learning_rate": 4.138865549644905e-06,
+ "loss": 0.5396,
+ "step": 3530
+ },
+ {
+ "epoch": 1.6695035460992909,
+ "grad_norm": 2.857250928878784,
+ "learning_rate": 4.138394412533998e-06,
+ "loss": 0.5861,
+ "step": 3531
+ },
+ {
+ "epoch": 1.6699763593380614,
+ "grad_norm": 2.9226012229919434,
+ "learning_rate": 4.137923173407456e-06,
+ "loss": 0.5262,
+ "step": 3532
+ },
+ {
+ "epoch": 1.670449172576832,
+ "grad_norm": 4.839131832122803,
+ "learning_rate": 4.137451832294619e-06,
+ "loss": 0.651,
+ "step": 3533
+ },
+ {
+ "epoch": 1.6709219858156028,
+ "grad_norm": 2.4727771282196045,
+ "learning_rate": 4.1369803892248375e-06,
+ "loss": 0.5149,
+ "step": 3534
+ },
+ {
+ "epoch": 1.6713947990543736,
+ "grad_norm": 2.5391688346862793,
+ "learning_rate": 4.1365088442274635e-06,
+ "loss": 0.4907,
+ "step": 3535
+ },
+ {
+ "epoch": 1.6718676122931442,
+ "grad_norm": 2.5168209075927734,
+ "learning_rate": 4.136037197331862e-06,
+ "loss": 0.5091,
+ "step": 3536
+ },
+ {
+ "epoch": 1.6723404255319148,
+ "grad_norm": 2.6278600692749023,
+ "learning_rate": 4.135565448567396e-06,
+ "loss": 0.4357,
+ "step": 3537
+ },
+ {
+ "epoch": 1.6728132387706856,
+ "grad_norm": 2.835184097290039,
+ "learning_rate": 4.135093597963441e-06,
+ "loss": 0.4786,
+ "step": 3538
+ },
+ {
+ "epoch": 1.6732860520094563,
+ "grad_norm": 2.385328531265259,
+ "learning_rate": 4.134621645549379e-06,
+ "loss": 0.4849,
+ "step": 3539
+ },
+ {
+ "epoch": 1.673758865248227,
+ "grad_norm": 2.6504149436950684,
+ "learning_rate": 4.134149591354593e-06,
+ "loss": 0.6037,
+ "step": 3540
+ },
+ {
+ "epoch": 1.6742316784869975,
+ "grad_norm": 2.945634126663208,
+ "learning_rate": 4.1336774354084786e-06,
+ "loss": 0.532,
+ "step": 3541
+ },
+ {
+ "epoch": 1.6747044917257683,
+ "grad_norm": 2.8373215198516846,
+ "learning_rate": 4.133205177740434e-06,
+ "loss": 0.5138,
+ "step": 3542
+ },
+ {
+ "epoch": 1.675177304964539,
+ "grad_norm": 2.6616621017456055,
+ "learning_rate": 4.1327328183798634e-06,
+ "loss": 0.5543,
+ "step": 3543
+ },
+ {
+ "epoch": 1.6756501182033097,
+ "grad_norm": 3.0843071937561035,
+ "learning_rate": 4.13226035735618e-06,
+ "loss": 0.6585,
+ "step": 3544
+ },
+ {
+ "epoch": 1.6761229314420802,
+ "grad_norm": 2.2214272022247314,
+ "learning_rate": 4.131787794698802e-06,
+ "loss": 0.5413,
+ "step": 3545
+ },
+ {
+ "epoch": 1.676595744680851,
+ "grad_norm": 2.4515018463134766,
+ "learning_rate": 4.131315130437152e-06,
+ "loss": 0.4966,
+ "step": 3546
+ },
+ {
+ "epoch": 1.6770685579196218,
+ "grad_norm": 2.647414207458496,
+ "learning_rate": 4.130842364600663e-06,
+ "loss": 0.5401,
+ "step": 3547
+ },
+ {
+ "epoch": 1.6775413711583924,
+ "grad_norm": 2.648941993713379,
+ "learning_rate": 4.13036949721877e-06,
+ "loss": 0.4796,
+ "step": 3548
+ },
+ {
+ "epoch": 1.678014184397163,
+ "grad_norm": 2.7835679054260254,
+ "learning_rate": 4.129896528320919e-06,
+ "loss": 0.5653,
+ "step": 3549
+ },
+ {
+ "epoch": 1.6784869976359338,
+ "grad_norm": 2.995964288711548,
+ "learning_rate": 4.129423457936556e-06,
+ "loss": 0.4999,
+ "step": 3550
+ },
+ {
+ "epoch": 1.6789598108747046,
+ "grad_norm": 2.5980007648468018,
+ "learning_rate": 4.1289502860951405e-06,
+ "loss": 0.5177,
+ "step": 3551
+ },
+ {
+ "epoch": 1.6794326241134752,
+ "grad_norm": 2.442254066467285,
+ "learning_rate": 4.128477012826133e-06,
+ "loss": 0.5062,
+ "step": 3552
+ },
+ {
+ "epoch": 1.6799054373522457,
+ "grad_norm": 2.3007538318634033,
+ "learning_rate": 4.1280036381590025e-06,
+ "loss": 0.5029,
+ "step": 3553
+ },
+ {
+ "epoch": 1.6803782505910165,
+ "grad_norm": 2.4169347286224365,
+ "learning_rate": 4.1275301621232245e-06,
+ "loss": 0.515,
+ "step": 3554
+ },
+ {
+ "epoch": 1.6808510638297873,
+ "grad_norm": 2.6456379890441895,
+ "learning_rate": 4.127056584748279e-06,
+ "loss": 0.5343,
+ "step": 3555
+ },
+ {
+ "epoch": 1.681323877068558,
+ "grad_norm": 2.6406595706939697,
+ "learning_rate": 4.1265829060636546e-06,
+ "loss": 0.5047,
+ "step": 3556
+ },
+ {
+ "epoch": 1.6817966903073285,
+ "grad_norm": 2.9344475269317627,
+ "learning_rate": 4.126109126098846e-06,
+ "loss": 0.5501,
+ "step": 3557
+ },
+ {
+ "epoch": 1.6822695035460993,
+ "grad_norm": 2.3292455673217773,
+ "learning_rate": 4.125635244883351e-06,
+ "loss": 0.463,
+ "step": 3558
+ },
+ {
+ "epoch": 1.68274231678487,
+ "grad_norm": 2.4150657653808594,
+ "learning_rate": 4.125161262446677e-06,
+ "loss": 0.4802,
+ "step": 3559
+ },
+ {
+ "epoch": 1.6832151300236406,
+ "grad_norm": 2.604292392730713,
+ "learning_rate": 4.124687178818339e-06,
+ "loss": 0.5683,
+ "step": 3560
+ },
+ {
+ "epoch": 1.6836879432624112,
+ "grad_norm": 2.5676791667938232,
+ "learning_rate": 4.1242129940278544e-06,
+ "loss": 0.5519,
+ "step": 3561
+ },
+ {
+ "epoch": 1.684160756501182,
+ "grad_norm": 3.078514814376831,
+ "learning_rate": 4.123738708104748e-06,
+ "loss": 0.5194,
+ "step": 3562
+ },
+ {
+ "epoch": 1.6846335697399528,
+ "grad_norm": 2.893577814102173,
+ "learning_rate": 4.123264321078552e-06,
+ "loss": 0.5107,
+ "step": 3563
+ },
+ {
+ "epoch": 1.6851063829787234,
+ "grad_norm": 2.772413730621338,
+ "learning_rate": 4.122789832978804e-06,
+ "loss": 0.6147,
+ "step": 3564
+ },
+ {
+ "epoch": 1.685579196217494,
+ "grad_norm": 2.5804643630981445,
+ "learning_rate": 4.12231524383505e-06,
+ "loss": 0.5057,
+ "step": 3565
+ },
+ {
+ "epoch": 1.6860520094562648,
+ "grad_norm": 2.599571466445923,
+ "learning_rate": 4.121840553676839e-06,
+ "loss": 0.5591,
+ "step": 3566
+ },
+ {
+ "epoch": 1.6865248226950356,
+ "grad_norm": 2.9124577045440674,
+ "learning_rate": 4.1213657625337275e-06,
+ "loss": 0.565,
+ "step": 3567
+ },
+ {
+ "epoch": 1.6869976359338061,
+ "grad_norm": 2.6582155227661133,
+ "learning_rate": 4.120890870435281e-06,
+ "loss": 0.4607,
+ "step": 3568
+ },
+ {
+ "epoch": 1.6874704491725767,
+ "grad_norm": 2.929227590560913,
+ "learning_rate": 4.120415877411066e-06,
+ "loss": 0.5705,
+ "step": 3569
+ },
+ {
+ "epoch": 1.6879432624113475,
+ "grad_norm": 2.4443247318267822,
+ "learning_rate": 4.11994078349066e-06,
+ "loss": 0.4592,
+ "step": 3570
+ },
+ {
+ "epoch": 1.6884160756501183,
+ "grad_norm": 2.4799163341522217,
+ "learning_rate": 4.119465588703645e-06,
+ "loss": 0.5361,
+ "step": 3571
+ },
+ {
+ "epoch": 1.6888888888888889,
+ "grad_norm": 2.9408936500549316,
+ "learning_rate": 4.1189902930796085e-06,
+ "loss": 0.5347,
+ "step": 3572
+ },
+ {
+ "epoch": 1.6893617021276595,
+ "grad_norm": 3.3348076343536377,
+ "learning_rate": 4.118514896648146e-06,
+ "loss": 0.5612,
+ "step": 3573
+ },
+ {
+ "epoch": 1.6898345153664303,
+ "grad_norm": 2.764889717102051,
+ "learning_rate": 4.118039399438857e-06,
+ "loss": 0.4745,
+ "step": 3574
+ },
+ {
+ "epoch": 1.690307328605201,
+ "grad_norm": 2.7023751735687256,
+ "learning_rate": 4.11756380148135e-06,
+ "loss": 0.5106,
+ "step": 3575
+ },
+ {
+ "epoch": 1.6907801418439716,
+ "grad_norm": 2.8816208839416504,
+ "learning_rate": 4.117088102805238e-06,
+ "loss": 0.6016,
+ "step": 3576
+ },
+ {
+ "epoch": 1.6912529550827422,
+ "grad_norm": 2.215733289718628,
+ "learning_rate": 4.11661230344014e-06,
+ "loss": 0.4404,
+ "step": 3577
+ },
+ {
+ "epoch": 1.691725768321513,
+ "grad_norm": 2.8190999031066895,
+ "learning_rate": 4.116136403415683e-06,
+ "loss": 0.5038,
+ "step": 3578
+ },
+ {
+ "epoch": 1.6921985815602838,
+ "grad_norm": 2.616424083709717,
+ "learning_rate": 4.115660402761499e-06,
+ "loss": 0.5493,
+ "step": 3579
+ },
+ {
+ "epoch": 1.6926713947990544,
+ "grad_norm": 2.7738113403320312,
+ "learning_rate": 4.115184301507226e-06,
+ "loss": 0.5416,
+ "step": 3580
+ },
+ {
+ "epoch": 1.693144208037825,
+ "grad_norm": 2.4793593883514404,
+ "learning_rate": 4.114708099682509e-06,
+ "loss": 0.4526,
+ "step": 3581
+ },
+ {
+ "epoch": 1.6936170212765957,
+ "grad_norm": 2.390652894973755,
+ "learning_rate": 4.114231797316999e-06,
+ "loss": 0.4908,
+ "step": 3582
+ },
+ {
+ "epoch": 1.6940898345153665,
+ "grad_norm": 2.513197660446167,
+ "learning_rate": 4.113755394440352e-06,
+ "loss": 0.4738,
+ "step": 3583
+ },
+ {
+ "epoch": 1.6945626477541371,
+ "grad_norm": 2.504497766494751,
+ "learning_rate": 4.113278891082234e-06,
+ "loss": 0.4661,
+ "step": 3584
+ },
+ {
+ "epoch": 1.6950354609929077,
+ "grad_norm": 2.4966917037963867,
+ "learning_rate": 4.112802287272314e-06,
+ "loss": 0.4979,
+ "step": 3585
+ },
+ {
+ "epoch": 1.6955082742316785,
+ "grad_norm": 2.3129689693450928,
+ "learning_rate": 4.112325583040265e-06,
+ "loss": 0.4933,
+ "step": 3586
+ },
+ {
+ "epoch": 1.6959810874704493,
+ "grad_norm": 2.822136878967285,
+ "learning_rate": 4.111848778415774e-06,
+ "loss": 0.5087,
+ "step": 3587
+ },
+ {
+ "epoch": 1.6964539007092199,
+ "grad_norm": 2.5181210041046143,
+ "learning_rate": 4.111371873428527e-06,
+ "loss": 0.4836,
+ "step": 3588
+ },
+ {
+ "epoch": 1.6969267139479904,
+ "grad_norm": 2.7564687728881836,
+ "learning_rate": 4.110894868108218e-06,
+ "loss": 0.5224,
+ "step": 3589
+ },
+ {
+ "epoch": 1.6973995271867612,
+ "grad_norm": 2.424421787261963,
+ "learning_rate": 4.11041776248455e-06,
+ "loss": 0.4552,
+ "step": 3590
+ },
+ {
+ "epoch": 1.697872340425532,
+ "grad_norm": 2.7013823986053467,
+ "learning_rate": 4.10994055658723e-06,
+ "loss": 0.5535,
+ "step": 3591
+ },
+ {
+ "epoch": 1.6983451536643026,
+ "grad_norm": 2.5660946369171143,
+ "learning_rate": 4.10946325044597e-06,
+ "loss": 0.5351,
+ "step": 3592
+ },
+ {
+ "epoch": 1.6988179669030732,
+ "grad_norm": 2.5598108768463135,
+ "learning_rate": 4.10898584409049e-06,
+ "loss": 0.5246,
+ "step": 3593
+ },
+ {
+ "epoch": 1.699290780141844,
+ "grad_norm": 2.6318907737731934,
+ "learning_rate": 4.108508337550518e-06,
+ "loss": 0.5002,
+ "step": 3594
+ },
+ {
+ "epoch": 1.6997635933806148,
+ "grad_norm": 2.527099132537842,
+ "learning_rate": 4.108030730855784e-06,
+ "loss": 0.5366,
+ "step": 3595
+ },
+ {
+ "epoch": 1.7002364066193854,
+ "grad_norm": 2.8629603385925293,
+ "learning_rate": 4.107553024036029e-06,
+ "loss": 0.5742,
+ "step": 3596
+ },
+ {
+ "epoch": 1.700709219858156,
+ "grad_norm": 2.8084018230438232,
+ "learning_rate": 4.107075217120994e-06,
+ "loss": 0.5618,
+ "step": 3597
+ },
+ {
+ "epoch": 1.7011820330969267,
+ "grad_norm": 3.6470065116882324,
+ "learning_rate": 4.1065973101404325e-06,
+ "loss": 0.508,
+ "step": 3598
+ },
+ {
+ "epoch": 1.7016548463356975,
+ "grad_norm": 3.0332422256469727,
+ "learning_rate": 4.106119303124102e-06,
+ "loss": 0.51,
+ "step": 3599
+ },
+ {
+ "epoch": 1.702127659574468,
+ "grad_norm": 2.4887590408325195,
+ "learning_rate": 4.105641196101765e-06,
+ "loss": 0.5109,
+ "step": 3600
+ },
+ {
+ "epoch": 1.7026004728132387,
+ "grad_norm": 2.6102066040039062,
+ "learning_rate": 4.105162989103191e-06,
+ "loss": 0.5278,
+ "step": 3601
+ },
+ {
+ "epoch": 1.7030732860520095,
+ "grad_norm": 2.771578073501587,
+ "learning_rate": 4.104684682158156e-06,
+ "loss": 0.498,
+ "step": 3602
+ },
+ {
+ "epoch": 1.7035460992907803,
+ "grad_norm": 2.5452702045440674,
+ "learning_rate": 4.1042062752964425e-06,
+ "loss": 0.4939,
+ "step": 3603
+ },
+ {
+ "epoch": 1.7040189125295508,
+ "grad_norm": 2.4287021160125732,
+ "learning_rate": 4.103727768547838e-06,
+ "loss": 0.4819,
+ "step": 3604
+ },
+ {
+ "epoch": 1.7044917257683214,
+ "grad_norm": 2.412280321121216,
+ "learning_rate": 4.103249161942138e-06,
+ "loss": 0.5196,
+ "step": 3605
+ },
+ {
+ "epoch": 1.7049645390070922,
+ "grad_norm": 2.8850717544555664,
+ "learning_rate": 4.102770455509142e-06,
+ "loss": 0.5724,
+ "step": 3606
+ },
+ {
+ "epoch": 1.705437352245863,
+ "grad_norm": 2.7979609966278076,
+ "learning_rate": 4.102291649278659e-06,
+ "loss": 0.5295,
+ "step": 3607
+ },
+ {
+ "epoch": 1.7059101654846336,
+ "grad_norm": 2.762238025665283,
+ "learning_rate": 4.1018127432805e-06,
+ "loss": 0.5166,
+ "step": 3608
+ },
+ {
+ "epoch": 1.7063829787234042,
+ "grad_norm": 2.921586513519287,
+ "learning_rate": 4.101333737544485e-06,
+ "loss": 0.5607,
+ "step": 3609
+ },
+ {
+ "epoch": 1.706855791962175,
+ "grad_norm": 3.001929998397827,
+ "learning_rate": 4.100854632100439e-06,
+ "loss": 0.6255,
+ "step": 3610
+ },
+ {
+ "epoch": 1.7073286052009458,
+ "grad_norm": 2.752713918685913,
+ "learning_rate": 4.100375426978196e-06,
+ "loss": 0.5732,
+ "step": 3611
+ },
+ {
+ "epoch": 1.7078014184397163,
+ "grad_norm": 2.6496472358703613,
+ "learning_rate": 4.099896122207593e-06,
+ "loss": 0.5138,
+ "step": 3612
+ },
+ {
+ "epoch": 1.708274231678487,
+ "grad_norm": 3.0079452991485596,
+ "learning_rate": 4.099416717818473e-06,
+ "loss": 0.5746,
+ "step": 3613
+ },
+ {
+ "epoch": 1.7087470449172577,
+ "grad_norm": 2.5762360095977783,
+ "learning_rate": 4.098937213840687e-06,
+ "loss": 0.5308,
+ "step": 3614
+ },
+ {
+ "epoch": 1.7092198581560285,
+ "grad_norm": 2.6026158332824707,
+ "learning_rate": 4.098457610304092e-06,
+ "loss": 0.4857,
+ "step": 3615
+ },
+ {
+ "epoch": 1.709692671394799,
+ "grad_norm": 2.587583541870117,
+ "learning_rate": 4.097977907238551e-06,
+ "loss": 0.4591,
+ "step": 3616
+ },
+ {
+ "epoch": 1.7101654846335697,
+ "grad_norm": 2.6996991634368896,
+ "learning_rate": 4.097498104673932e-06,
+ "loss": 0.5298,
+ "step": 3617
+ },
+ {
+ "epoch": 1.7106382978723405,
+ "grad_norm": 2.600029945373535,
+ "learning_rate": 4.097018202640111e-06,
+ "loss": 0.4726,
+ "step": 3618
+ },
+ {
+ "epoch": 1.7111111111111112,
+ "grad_norm": 2.8261220455169678,
+ "learning_rate": 4.096538201166969e-06,
+ "loss": 0.5242,
+ "step": 3619
+ },
+ {
+ "epoch": 1.7115839243498818,
+ "grad_norm": 3.053027629852295,
+ "learning_rate": 4.096058100284394e-06,
+ "loss": 0.5568,
+ "step": 3620
+ },
+ {
+ "epoch": 1.7120567375886524,
+ "grad_norm": 2.9638442993164062,
+ "learning_rate": 4.0955779000222805e-06,
+ "loss": 0.5325,
+ "step": 3621
+ },
+ {
+ "epoch": 1.7125295508274232,
+ "grad_norm": 2.731095790863037,
+ "learning_rate": 4.095097600410527e-06,
+ "loss": 0.4733,
+ "step": 3622
+ },
+ {
+ "epoch": 1.713002364066194,
+ "grad_norm": 2.632490873336792,
+ "learning_rate": 4.09461720147904e-06,
+ "loss": 0.5253,
+ "step": 3623
+ },
+ {
+ "epoch": 1.7134751773049646,
+ "grad_norm": 2.847689390182495,
+ "learning_rate": 4.094136703257732e-06,
+ "loss": 0.57,
+ "step": 3624
+ },
+ {
+ "epoch": 1.7139479905437351,
+ "grad_norm": 3.1078696250915527,
+ "learning_rate": 4.0936561057765215e-06,
+ "loss": 0.5368,
+ "step": 3625
+ },
+ {
+ "epoch": 1.714420803782506,
+ "grad_norm": 2.696349620819092,
+ "learning_rate": 4.0931754090653334e-06,
+ "loss": 0.491,
+ "step": 3626
+ },
+ {
+ "epoch": 1.7148936170212767,
+ "grad_norm": 2.712958812713623,
+ "learning_rate": 4.092694613154099e-06,
+ "loss": 0.5768,
+ "step": 3627
+ },
+ {
+ "epoch": 1.7153664302600473,
+ "grad_norm": 2.5421478748321533,
+ "learning_rate": 4.092213718072754e-06,
+ "loss": 0.4839,
+ "step": 3628
+ },
+ {
+ "epoch": 1.715839243498818,
+ "grad_norm": 2.5176162719726562,
+ "learning_rate": 4.091732723851243e-06,
+ "loss": 0.5049,
+ "step": 3629
+ },
+ {
+ "epoch": 1.7163120567375887,
+ "grad_norm": 2.642185926437378,
+ "learning_rate": 4.091251630519514e-06,
+ "loss": 0.589,
+ "step": 3630
+ },
+ {
+ "epoch": 1.7167848699763595,
+ "grad_norm": 2.587348461151123,
+ "learning_rate": 4.0907704381075245e-06,
+ "loss": 0.5281,
+ "step": 3631
+ },
+ {
+ "epoch": 1.71725768321513,
+ "grad_norm": 2.4628195762634277,
+ "learning_rate": 4.090289146645234e-06,
+ "loss": 0.5592,
+ "step": 3632
+ },
+ {
+ "epoch": 1.7177304964539006,
+ "grad_norm": 2.2751028537750244,
+ "learning_rate": 4.0898077561626125e-06,
+ "loss": 0.502,
+ "step": 3633
+ },
+ {
+ "epoch": 1.7182033096926714,
+ "grad_norm": 2.7712769508361816,
+ "learning_rate": 4.089326266689632e-06,
+ "loss": 0.5143,
+ "step": 3634
+ },
+ {
+ "epoch": 1.7186761229314422,
+ "grad_norm": 2.5297727584838867,
+ "learning_rate": 4.088844678256275e-06,
+ "loss": 0.5035,
+ "step": 3635
+ },
+ {
+ "epoch": 1.7191489361702128,
+ "grad_norm": 2.739130735397339,
+ "learning_rate": 4.088362990892527e-06,
+ "loss": 0.5959,
+ "step": 3636
+ },
+ {
+ "epoch": 1.7196217494089834,
+ "grad_norm": 2.3708314895629883,
+ "learning_rate": 4.08788120462838e-06,
+ "loss": 0.4796,
+ "step": 3637
+ },
+ {
+ "epoch": 1.7200945626477542,
+ "grad_norm": 2.7664241790771484,
+ "learning_rate": 4.087399319493832e-06,
+ "loss": 0.6052,
+ "step": 3638
+ },
+ {
+ "epoch": 1.720567375886525,
+ "grad_norm": 2.5900204181671143,
+ "learning_rate": 4.0869173355188895e-06,
+ "loss": 0.4955,
+ "step": 3639
+ },
+ {
+ "epoch": 1.7210401891252955,
+ "grad_norm": 2.6771862506866455,
+ "learning_rate": 4.0864352527335635e-06,
+ "loss": 0.4889,
+ "step": 3640
+ },
+ {
+ "epoch": 1.7215130023640661,
+ "grad_norm": 2.888479471206665,
+ "learning_rate": 4.085953071167871e-06,
+ "loss": 0.5719,
+ "step": 3641
+ },
+ {
+ "epoch": 1.721985815602837,
+ "grad_norm": 2.5967187881469727,
+ "learning_rate": 4.085470790851833e-06,
+ "loss": 0.4959,
+ "step": 3642
+ },
+ {
+ "epoch": 1.7224586288416077,
+ "grad_norm": 2.5317695140838623,
+ "learning_rate": 4.084988411815483e-06,
+ "loss": 0.4596,
+ "step": 3643
+ },
+ {
+ "epoch": 1.7229314420803783,
+ "grad_norm": 2.6531455516815186,
+ "learning_rate": 4.084505934088853e-06,
+ "loss": 0.5346,
+ "step": 3644
+ },
+ {
+ "epoch": 1.7234042553191489,
+ "grad_norm": 2.6525208950042725,
+ "learning_rate": 4.084023357701987e-06,
+ "loss": 0.5178,
+ "step": 3645
+ },
+ {
+ "epoch": 1.7238770685579197,
+ "grad_norm": 2.461954116821289,
+ "learning_rate": 4.083540682684932e-06,
+ "loss": 0.4802,
+ "step": 3646
+ },
+ {
+ "epoch": 1.7243498817966905,
+ "grad_norm": 2.794696807861328,
+ "learning_rate": 4.083057909067743e-06,
+ "loss": 0.5148,
+ "step": 3647
+ },
+ {
+ "epoch": 1.724822695035461,
+ "grad_norm": 2.867572546005249,
+ "learning_rate": 4.082575036880479e-06,
+ "loss": 0.5352,
+ "step": 3648
+ },
+ {
+ "epoch": 1.7252955082742316,
+ "grad_norm": 2.642820358276367,
+ "learning_rate": 4.082092066153207e-06,
+ "loss": 0.4652,
+ "step": 3649
+ },
+ {
+ "epoch": 1.7257683215130024,
+ "grad_norm": 2.782142400741577,
+ "learning_rate": 4.081608996915999e-06,
+ "loss": 0.5591,
+ "step": 3650
+ },
+ {
+ "epoch": 1.7262411347517732,
+ "grad_norm": 2.327331304550171,
+ "learning_rate": 4.081125829198934e-06,
+ "loss": 0.4339,
+ "step": 3651
+ },
+ {
+ "epoch": 1.7267139479905438,
+ "grad_norm": 2.7959988117218018,
+ "learning_rate": 4.0806425630320965e-06,
+ "loss": 0.5783,
+ "step": 3652
+ },
+ {
+ "epoch": 1.7271867612293144,
+ "grad_norm": 2.595053195953369,
+ "learning_rate": 4.080159198445578e-06,
+ "loss": 0.4602,
+ "step": 3653
+ },
+ {
+ "epoch": 1.7276595744680852,
+ "grad_norm": 3.0968129634857178,
+ "learning_rate": 4.079675735469475e-06,
+ "loss": 0.5775,
+ "step": 3654
+ },
+ {
+ "epoch": 1.728132387706856,
+ "grad_norm": 2.628044605255127,
+ "learning_rate": 4.07919217413389e-06,
+ "loss": 0.486,
+ "step": 3655
+ },
+ {
+ "epoch": 1.7286052009456265,
+ "grad_norm": 2.782799005508423,
+ "learning_rate": 4.078708514468933e-06,
+ "loss": 0.5282,
+ "step": 3656
+ },
+ {
+ "epoch": 1.729078014184397,
+ "grad_norm": 2.655365467071533,
+ "learning_rate": 4.0782247565047205e-06,
+ "loss": 0.4873,
+ "step": 3657
+ },
+ {
+ "epoch": 1.729550827423168,
+ "grad_norm": 2.9461584091186523,
+ "learning_rate": 4.077740900271371e-06,
+ "loss": 0.548,
+ "step": 3658
+ },
+ {
+ "epoch": 1.7300236406619387,
+ "grad_norm": 2.5094761848449707,
+ "learning_rate": 4.077256945799015e-06,
+ "loss": 0.5437,
+ "step": 3659
+ },
+ {
+ "epoch": 1.7304964539007093,
+ "grad_norm": 2.555793285369873,
+ "learning_rate": 4.0767728931177845e-06,
+ "loss": 0.5268,
+ "step": 3660
+ },
+ {
+ "epoch": 1.7309692671394799,
+ "grad_norm": 2.4433486461639404,
+ "learning_rate": 4.07628874225782e-06,
+ "loss": 0.5211,
+ "step": 3661
+ },
+ {
+ "epoch": 1.7314420803782506,
+ "grad_norm": 2.365206003189087,
+ "learning_rate": 4.075804493249267e-06,
+ "loss": 0.5084,
+ "step": 3662
+ },
+ {
+ "epoch": 1.7319148936170212,
+ "grad_norm": 2.514305830001831,
+ "learning_rate": 4.075320146122278e-06,
+ "loss": 0.4693,
+ "step": 3663
+ },
+ {
+ "epoch": 1.7323877068557918,
+ "grad_norm": 2.9270083904266357,
+ "learning_rate": 4.074835700907012e-06,
+ "loss": 0.5724,
+ "step": 3664
+ },
+ {
+ "epoch": 1.7328605200945626,
+ "grad_norm": 2.938692569732666,
+ "learning_rate": 4.0743511576336315e-06,
+ "loss": 0.5361,
+ "step": 3665
+ },
+ {
+ "epoch": 1.7333333333333334,
+ "grad_norm": 3.1978867053985596,
+ "learning_rate": 4.073866516332307e-06,
+ "loss": 0.6277,
+ "step": 3666
+ },
+ {
+ "epoch": 1.733806146572104,
+ "grad_norm": 2.3477370738983154,
+ "learning_rate": 4.073381777033217e-06,
+ "loss": 0.5139,
+ "step": 3667
+ },
+ {
+ "epoch": 1.7342789598108745,
+ "grad_norm": 2.5954184532165527,
+ "learning_rate": 4.072896939766543e-06,
+ "loss": 0.537,
+ "step": 3668
+ },
+ {
+ "epoch": 1.7347517730496453,
+ "grad_norm": 2.8999998569488525,
+ "learning_rate": 4.072412004562472e-06,
+ "loss": 0.5486,
+ "step": 3669
+ },
+ {
+ "epoch": 1.7352245862884161,
+ "grad_norm": 2.7320556640625,
+ "learning_rate": 4.071926971451201e-06,
+ "loss": 0.6025,
+ "step": 3670
+ },
+ {
+ "epoch": 1.7356973995271867,
+ "grad_norm": 2.499234676361084,
+ "learning_rate": 4.0714418404629304e-06,
+ "loss": 0.456,
+ "step": 3671
+ },
+ {
+ "epoch": 1.7361702127659573,
+ "grad_norm": 2.485924243927002,
+ "learning_rate": 4.070956611627867e-06,
+ "loss": 0.5097,
+ "step": 3672
+ },
+ {
+ "epoch": 1.736643026004728,
+ "grad_norm": 2.513723373413086,
+ "learning_rate": 4.070471284976225e-06,
+ "loss": 0.4744,
+ "step": 3673
+ },
+ {
+ "epoch": 1.7371158392434989,
+ "grad_norm": 2.281977653503418,
+ "learning_rate": 4.06998586053822e-06,
+ "loss": 0.5124,
+ "step": 3674
+ },
+ {
+ "epoch": 1.7375886524822695,
+ "grad_norm": 2.3683905601501465,
+ "learning_rate": 4.069500338344081e-06,
+ "loss": 0.4816,
+ "step": 3675
+ },
+ {
+ "epoch": 1.73806146572104,
+ "grad_norm": 2.5635924339294434,
+ "learning_rate": 4.069014718424038e-06,
+ "loss": 0.5665,
+ "step": 3676
+ },
+ {
+ "epoch": 1.7385342789598108,
+ "grad_norm": 2.7308456897735596,
+ "learning_rate": 4.068529000808328e-06,
+ "loss": 0.534,
+ "step": 3677
+ },
+ {
+ "epoch": 1.7390070921985816,
+ "grad_norm": 2.788452625274658,
+ "learning_rate": 4.068043185527196e-06,
+ "loss": 0.5609,
+ "step": 3678
+ },
+ {
+ "epoch": 1.7394799054373522,
+ "grad_norm": 2.832368850708008,
+ "learning_rate": 4.067557272610889e-06,
+ "loss": 0.553,
+ "step": 3679
+ },
+ {
+ "epoch": 1.7399527186761228,
+ "grad_norm": 2.9987435340881348,
+ "learning_rate": 4.067071262089665e-06,
+ "loss": 0.5,
+ "step": 3680
+ },
+ {
+ "epoch": 1.7404255319148936,
+ "grad_norm": 3.04913067817688,
+ "learning_rate": 4.066585153993785e-06,
+ "loss": 0.5158,
+ "step": 3681
+ },
+ {
+ "epoch": 1.7408983451536644,
+ "grad_norm": 2.5177130699157715,
+ "learning_rate": 4.066098948353516e-06,
+ "loss": 0.4508,
+ "step": 3682
+ },
+ {
+ "epoch": 1.741371158392435,
+ "grad_norm": 2.8991222381591797,
+ "learning_rate": 4.065612645199133e-06,
+ "loss": 0.5268,
+ "step": 3683
+ },
+ {
+ "epoch": 1.7418439716312055,
+ "grad_norm": 2.4928159713745117,
+ "learning_rate": 4.0651262445609156e-06,
+ "loss": 0.5024,
+ "step": 3684
+ },
+ {
+ "epoch": 1.7423167848699763,
+ "grad_norm": 2.9737319946289062,
+ "learning_rate": 4.06463974646915e-06,
+ "loss": 0.5429,
+ "step": 3685
+ },
+ {
+ "epoch": 1.7427895981087471,
+ "grad_norm": 2.6485493183135986,
+ "learning_rate": 4.064153150954128e-06,
+ "loss": 0.5619,
+ "step": 3686
+ },
+ {
+ "epoch": 1.7432624113475177,
+ "grad_norm": 2.564861297607422,
+ "learning_rate": 4.063666458046148e-06,
+ "loss": 0.4878,
+ "step": 3687
+ },
+ {
+ "epoch": 1.7437352245862883,
+ "grad_norm": 2.6048383712768555,
+ "learning_rate": 4.063179667775514e-06,
+ "loss": 0.4836,
+ "step": 3688
+ },
+ {
+ "epoch": 1.744208037825059,
+ "grad_norm": 2.751638650894165,
+ "learning_rate": 4.062692780172536e-06,
+ "loss": 0.5558,
+ "step": 3689
+ },
+ {
+ "epoch": 1.7446808510638299,
+ "grad_norm": 3.3866634368896484,
+ "learning_rate": 4.062205795267531e-06,
+ "loss": 0.4825,
+ "step": 3690
+ },
+ {
+ "epoch": 1.7451536643026004,
+ "grad_norm": 3.0112249851226807,
+ "learning_rate": 4.061718713090822e-06,
+ "loss": 0.5732,
+ "step": 3691
+ },
+ {
+ "epoch": 1.745626477541371,
+ "grad_norm": 2.5889365673065186,
+ "learning_rate": 4.061231533672736e-06,
+ "loss": 0.483,
+ "step": 3692
+ },
+ {
+ "epoch": 1.7460992907801418,
+ "grad_norm": 2.624598979949951,
+ "learning_rate": 4.0607442570436085e-06,
+ "loss": 0.5706,
+ "step": 3693
+ },
+ {
+ "epoch": 1.7465721040189126,
+ "grad_norm": 2.9219250679016113,
+ "learning_rate": 4.060256883233779e-06,
+ "loss": 0.5153,
+ "step": 3694
+ },
+ {
+ "epoch": 1.7470449172576832,
+ "grad_norm": 3.2219252586364746,
+ "learning_rate": 4.059769412273595e-06,
+ "loss": 0.5184,
+ "step": 3695
+ },
+ {
+ "epoch": 1.7475177304964538,
+ "grad_norm": 2.890697956085205,
+ "learning_rate": 4.05928184419341e-06,
+ "loss": 0.5312,
+ "step": 3696
+ },
+ {
+ "epoch": 1.7479905437352246,
+ "grad_norm": 2.673809289932251,
+ "learning_rate": 4.0587941790235816e-06,
+ "loss": 0.4893,
+ "step": 3697
+ },
+ {
+ "epoch": 1.7484633569739954,
+ "grad_norm": 2.5339348316192627,
+ "learning_rate": 4.058306416794474e-06,
+ "loss": 0.5115,
+ "step": 3698
+ },
+ {
+ "epoch": 1.748936170212766,
+ "grad_norm": 2.6525840759277344,
+ "learning_rate": 4.05781855753646e-06,
+ "loss": 0.5256,
+ "step": 3699
+ },
+ {
+ "epoch": 1.7494089834515365,
+ "grad_norm": 2.7868754863739014,
+ "learning_rate": 4.057330601279914e-06,
+ "loss": 0.5227,
+ "step": 3700
+ },
+ {
+ "epoch": 1.7498817966903073,
+ "grad_norm": 3.1629884243011475,
+ "learning_rate": 4.056842548055221e-06,
+ "loss": 0.5617,
+ "step": 3701
+ },
+ {
+ "epoch": 1.750354609929078,
+ "grad_norm": 2.9350688457489014,
+ "learning_rate": 4.056354397892769e-06,
+ "loss": 0.4753,
+ "step": 3702
+ },
+ {
+ "epoch": 1.7508274231678487,
+ "grad_norm": 2.9688615798950195,
+ "learning_rate": 4.0558661508229525e-06,
+ "loss": 0.596,
+ "step": 3703
+ },
+ {
+ "epoch": 1.7513002364066192,
+ "grad_norm": 2.802205801010132,
+ "learning_rate": 4.055377806876174e-06,
+ "loss": 0.5793,
+ "step": 3704
+ },
+ {
+ "epoch": 1.75177304964539,
+ "grad_norm": 2.4933416843414307,
+ "learning_rate": 4.054889366082839e-06,
+ "loss": 0.4824,
+ "step": 3705
+ },
+ {
+ "epoch": 1.7522458628841608,
+ "grad_norm": 3.7904608249664307,
+ "learning_rate": 4.054400828473361e-06,
+ "loss": 0.5124,
+ "step": 3706
+ },
+ {
+ "epoch": 1.7527186761229314,
+ "grad_norm": 2.694838762283325,
+ "learning_rate": 4.053912194078159e-06,
+ "loss": 0.5604,
+ "step": 3707
+ },
+ {
+ "epoch": 1.753191489361702,
+ "grad_norm": 2.3721256256103516,
+ "learning_rate": 4.053423462927659e-06,
+ "loss": 0.4978,
+ "step": 3708
+ },
+ {
+ "epoch": 1.7536643026004728,
+ "grad_norm": 2.718512773513794,
+ "learning_rate": 4.052934635052292e-06,
+ "loss": 0.5029,
+ "step": 3709
+ },
+ {
+ "epoch": 1.7541371158392436,
+ "grad_norm": 3.061558246612549,
+ "learning_rate": 4.052445710482493e-06,
+ "loss": 0.4886,
+ "step": 3710
+ },
+ {
+ "epoch": 1.7546099290780142,
+ "grad_norm": 3.0490729808807373,
+ "learning_rate": 4.051956689248709e-06,
+ "loss": 0.5363,
+ "step": 3711
+ },
+ {
+ "epoch": 1.7550827423167847,
+ "grad_norm": 2.611661672592163,
+ "learning_rate": 4.051467571381385e-06,
+ "loss": 0.5397,
+ "step": 3712
+ },
+ {
+ "epoch": 1.7555555555555555,
+ "grad_norm": 2.7829177379608154,
+ "learning_rate": 4.050978356910979e-06,
+ "loss": 0.4973,
+ "step": 3713
+ },
+ {
+ "epoch": 1.7560283687943263,
+ "grad_norm": 2.6228256225585938,
+ "learning_rate": 4.0504890458679525e-06,
+ "loss": 0.4551,
+ "step": 3714
+ },
+ {
+ "epoch": 1.756501182033097,
+ "grad_norm": 2.6801326274871826,
+ "learning_rate": 4.049999638282771e-06,
+ "loss": 0.5581,
+ "step": 3715
+ },
+ {
+ "epoch": 1.7569739952718675,
+ "grad_norm": 2.4476819038391113,
+ "learning_rate": 4.049510134185908e-06,
+ "loss": 0.5226,
+ "step": 3716
+ },
+ {
+ "epoch": 1.7574468085106383,
+ "grad_norm": 2.5661075115203857,
+ "learning_rate": 4.049020533607844e-06,
+ "loss": 0.5163,
+ "step": 3717
+ },
+ {
+ "epoch": 1.757919621749409,
+ "grad_norm": 2.3923349380493164,
+ "learning_rate": 4.048530836579065e-06,
+ "loss": 0.5076,
+ "step": 3718
+ },
+ {
+ "epoch": 1.7583924349881797,
+ "grad_norm": 2.8204405307769775,
+ "learning_rate": 4.0480410431300585e-06,
+ "loss": 0.5883,
+ "step": 3719
+ },
+ {
+ "epoch": 1.7588652482269502,
+ "grad_norm": 2.323107957839966,
+ "learning_rate": 4.047551153291325e-06,
+ "loss": 0.5116,
+ "step": 3720
+ },
+ {
+ "epoch": 1.759338061465721,
+ "grad_norm": 2.8306009769439697,
+ "learning_rate": 4.047061167093368e-06,
+ "loss": 0.5094,
+ "step": 3721
+ },
+ {
+ "epoch": 1.7598108747044918,
+ "grad_norm": 2.568765640258789,
+ "learning_rate": 4.046571084566695e-06,
+ "loss": 0.4725,
+ "step": 3722
+ },
+ {
+ "epoch": 1.7602836879432624,
+ "grad_norm": 2.7212061882019043,
+ "learning_rate": 4.046080905741822e-06,
+ "loss": 0.4741,
+ "step": 3723
+ },
+ {
+ "epoch": 1.760756501182033,
+ "grad_norm": 2.802917003631592,
+ "learning_rate": 4.04559063064927e-06,
+ "loss": 0.5691,
+ "step": 3724
+ },
+ {
+ "epoch": 1.7612293144208038,
+ "grad_norm": 3.1044139862060547,
+ "learning_rate": 4.0451002593195675e-06,
+ "loss": 0.5472,
+ "step": 3725
+ },
+ {
+ "epoch": 1.7617021276595746,
+ "grad_norm": 2.5855562686920166,
+ "learning_rate": 4.044609791783246e-06,
+ "loss": 0.4852,
+ "step": 3726
+ },
+ {
+ "epoch": 1.7621749408983451,
+ "grad_norm": 2.6235129833221436,
+ "learning_rate": 4.0441192280708465e-06,
+ "loss": 0.5269,
+ "step": 3727
+ },
+ {
+ "epoch": 1.7626477541371157,
+ "grad_norm": 3.535630464553833,
+ "learning_rate": 4.043628568212914e-06,
+ "loss": 0.5266,
+ "step": 3728
+ },
+ {
+ "epoch": 1.7631205673758865,
+ "grad_norm": 2.7783355712890625,
+ "learning_rate": 4.043137812239998e-06,
+ "loss": 0.5609,
+ "step": 3729
+ },
+ {
+ "epoch": 1.7635933806146573,
+ "grad_norm": 2.9344944953918457,
+ "learning_rate": 4.042646960182657e-06,
+ "loss": 0.5056,
+ "step": 3730
+ },
+ {
+ "epoch": 1.7640661938534279,
+ "grad_norm": 2.6205739974975586,
+ "learning_rate": 4.042156012071453e-06,
+ "loss": 0.4914,
+ "step": 3731
+ },
+ {
+ "epoch": 1.7645390070921985,
+ "grad_norm": 2.8004493713378906,
+ "learning_rate": 4.041664967936958e-06,
+ "loss": 0.4901,
+ "step": 3732
+ },
+ {
+ "epoch": 1.7650118203309693,
+ "grad_norm": 2.944589138031006,
+ "learning_rate": 4.041173827809745e-06,
+ "loss": 0.5572,
+ "step": 3733
+ },
+ {
+ "epoch": 1.76548463356974,
+ "grad_norm": 2.5021605491638184,
+ "learning_rate": 4.040682591720397e-06,
+ "loss": 0.4637,
+ "step": 3734
+ },
+ {
+ "epoch": 1.7659574468085106,
+ "grad_norm": 2.448030948638916,
+ "learning_rate": 4.040191259699497e-06,
+ "loss": 0.4785,
+ "step": 3735
+ },
+ {
+ "epoch": 1.7664302600472812,
+ "grad_norm": 2.7171032428741455,
+ "learning_rate": 4.039699831777643e-06,
+ "loss": 0.4919,
+ "step": 3736
+ },
+ {
+ "epoch": 1.766903073286052,
+ "grad_norm": 2.453118324279785,
+ "learning_rate": 4.03920830798543e-06,
+ "loss": 0.4326,
+ "step": 3737
+ },
+ {
+ "epoch": 1.7673758865248228,
+ "grad_norm": 3.112877368927002,
+ "learning_rate": 4.038716688353466e-06,
+ "loss": 0.5375,
+ "step": 3738
+ },
+ {
+ "epoch": 1.7678486997635934,
+ "grad_norm": 2.742239236831665,
+ "learning_rate": 4.038224972912361e-06,
+ "loss": 0.5267,
+ "step": 3739
+ },
+ {
+ "epoch": 1.768321513002364,
+ "grad_norm": 2.544785737991333,
+ "learning_rate": 4.037733161692731e-06,
+ "loss": 0.5032,
+ "step": 3740
+ },
+ {
+ "epoch": 1.7687943262411348,
+ "grad_norm": 2.4639062881469727,
+ "learning_rate": 4.037241254725201e-06,
+ "loss": 0.5532,
+ "step": 3741
+ },
+ {
+ "epoch": 1.7692671394799055,
+ "grad_norm": 2.866290330886841,
+ "learning_rate": 4.036749252040398e-06,
+ "loss": 0.5503,
+ "step": 3742
+ },
+ {
+ "epoch": 1.7697399527186761,
+ "grad_norm": 2.3466262817382812,
+ "learning_rate": 4.0362571536689575e-06,
+ "loss": 0.5286,
+ "step": 3743
+ },
+ {
+ "epoch": 1.7702127659574467,
+ "grad_norm": 2.246464967727661,
+ "learning_rate": 4.03576495964152e-06,
+ "loss": 0.4656,
+ "step": 3744
+ },
+ {
+ "epoch": 1.7706855791962175,
+ "grad_norm": 2.667558431625366,
+ "learning_rate": 4.035272669988733e-06,
+ "loss": 0.5205,
+ "step": 3745
+ },
+ {
+ "epoch": 1.7711583924349883,
+ "grad_norm": 2.974666118621826,
+ "learning_rate": 4.034780284741249e-06,
+ "loss": 0.6007,
+ "step": 3746
+ },
+ {
+ "epoch": 1.7716312056737589,
+ "grad_norm": 2.7164433002471924,
+ "learning_rate": 4.034287803929726e-06,
+ "loss": 0.4913,
+ "step": 3747
+ },
+ {
+ "epoch": 1.7721040189125294,
+ "grad_norm": 2.5923962593078613,
+ "learning_rate": 4.033795227584829e-06,
+ "loss": 0.5275,
+ "step": 3748
+ },
+ {
+ "epoch": 1.7725768321513002,
+ "grad_norm": 2.606027126312256,
+ "learning_rate": 4.033302555737229e-06,
+ "loss": 0.4869,
+ "step": 3749
+ },
+ {
+ "epoch": 1.773049645390071,
+ "grad_norm": 3.0110089778900146,
+ "learning_rate": 4.032809788417602e-06,
+ "loss": 0.4956,
+ "step": 3750
+ },
+ {
+ "epoch": 1.7735224586288416,
+ "grad_norm": 3.004598617553711,
+ "learning_rate": 4.032316925656632e-06,
+ "loss": 0.5159,
+ "step": 3751
+ },
+ {
+ "epoch": 1.7739952718676122,
+ "grad_norm": 2.731539249420166,
+ "learning_rate": 4.031823967485005e-06,
+ "loss": 0.5237,
+ "step": 3752
+ },
+ {
+ "epoch": 1.774468085106383,
+ "grad_norm": 2.7466373443603516,
+ "learning_rate": 4.0313309139334155e-06,
+ "loss": 0.4948,
+ "step": 3753
+ },
+ {
+ "epoch": 1.7749408983451538,
+ "grad_norm": 2.8596460819244385,
+ "learning_rate": 4.030837765032565e-06,
+ "loss": 0.5016,
+ "step": 3754
+ },
+ {
+ "epoch": 1.7754137115839244,
+ "grad_norm": 3.2886788845062256,
+ "learning_rate": 4.03034452081316e-06,
+ "loss": 0.5377,
+ "step": 3755
+ },
+ {
+ "epoch": 1.775886524822695,
+ "grad_norm": 2.5629258155822754,
+ "learning_rate": 4.029851181305912e-06,
+ "loss": 0.519,
+ "step": 3756
+ },
+ {
+ "epoch": 1.7763593380614657,
+ "grad_norm": 2.5988714694976807,
+ "learning_rate": 4.029357746541539e-06,
+ "loss": 0.5521,
+ "step": 3757
+ },
+ {
+ "epoch": 1.7768321513002365,
+ "grad_norm": 2.987884759902954,
+ "learning_rate": 4.028864216550765e-06,
+ "loss": 0.6225,
+ "step": 3758
+ },
+ {
+ "epoch": 1.777304964539007,
+ "grad_norm": 2.6875851154327393,
+ "learning_rate": 4.02837059136432e-06,
+ "loss": 0.5321,
+ "step": 3759
+ },
+ {
+ "epoch": 1.7777777777777777,
+ "grad_norm": 2.6414570808410645,
+ "learning_rate": 4.02787687101294e-06,
+ "loss": 0.4831,
+ "step": 3760
+ },
+ {
+ "epoch": 1.7782505910165485,
+ "grad_norm": 2.581475019454956,
+ "learning_rate": 4.027383055527368e-06,
+ "loss": 0.5204,
+ "step": 3761
+ },
+ {
+ "epoch": 1.7787234042553193,
+ "grad_norm": 2.811298131942749,
+ "learning_rate": 4.026889144938349e-06,
+ "loss": 0.5486,
+ "step": 3762
+ },
+ {
+ "epoch": 1.7791962174940898,
+ "grad_norm": 3.1589081287384033,
+ "learning_rate": 4.026395139276639e-06,
+ "loss": 0.4979,
+ "step": 3763
+ },
+ {
+ "epoch": 1.7796690307328604,
+ "grad_norm": 2.3773093223571777,
+ "learning_rate": 4.025901038572996e-06,
+ "loss": 0.503,
+ "step": 3764
+ },
+ {
+ "epoch": 1.7801418439716312,
+ "grad_norm": 2.962541341781616,
+ "learning_rate": 4.025406842858187e-06,
+ "loss": 0.4613,
+ "step": 3765
+ },
+ {
+ "epoch": 1.780614657210402,
+ "grad_norm": 2.603092908859253,
+ "learning_rate": 4.024912552162982e-06,
+ "loss": 0.5142,
+ "step": 3766
+ },
+ {
+ "epoch": 1.7810874704491726,
+ "grad_norm": 2.648927927017212,
+ "learning_rate": 4.024418166518159e-06,
+ "loss": 0.4491,
+ "step": 3767
+ },
+ {
+ "epoch": 1.7815602836879432,
+ "grad_norm": 3.3239917755126953,
+ "learning_rate": 4.023923685954502e-06,
+ "loss": 0.6272,
+ "step": 3768
+ },
+ {
+ "epoch": 1.782033096926714,
+ "grad_norm": 2.672821283340454,
+ "learning_rate": 4.023429110502798e-06,
+ "loss": 0.5171,
+ "step": 3769
+ },
+ {
+ "epoch": 1.7825059101654848,
+ "grad_norm": 2.364332437515259,
+ "learning_rate": 4.022934440193844e-06,
+ "loss": 0.4513,
+ "step": 3770
+ },
+ {
+ "epoch": 1.7829787234042553,
+ "grad_norm": 3.03108549118042,
+ "learning_rate": 4.022439675058441e-06,
+ "loss": 0.4324,
+ "step": 3771
+ },
+ {
+ "epoch": 1.783451536643026,
+ "grad_norm": 2.647557020187378,
+ "learning_rate": 4.021944815127393e-06,
+ "loss": 0.5162,
+ "step": 3772
+ },
+ {
+ "epoch": 1.7839243498817967,
+ "grad_norm": 2.4111907482147217,
+ "learning_rate": 4.021449860431517e-06,
+ "loss": 0.4712,
+ "step": 3773
+ },
+ {
+ "epoch": 1.7843971631205675,
+ "grad_norm": 2.796175718307495,
+ "learning_rate": 4.020954811001629e-06,
+ "loss": 0.5131,
+ "step": 3774
+ },
+ {
+ "epoch": 1.784869976359338,
+ "grad_norm": 2.4594924449920654,
+ "learning_rate": 4.020459666868553e-06,
+ "loss": 0.4739,
+ "step": 3775
+ },
+ {
+ "epoch": 1.7853427895981087,
+ "grad_norm": 2.5735671520233154,
+ "learning_rate": 4.0199644280631215e-06,
+ "loss": 0.4716,
+ "step": 3776
+ },
+ {
+ "epoch": 1.7858156028368795,
+ "grad_norm": 2.419990062713623,
+ "learning_rate": 4.01946909461617e-06,
+ "loss": 0.4866,
+ "step": 3777
+ },
+ {
+ "epoch": 1.7862884160756503,
+ "grad_norm": 2.5597951412200928,
+ "learning_rate": 4.01897366655854e-06,
+ "loss": 0.5569,
+ "step": 3778
+ },
+ {
+ "epoch": 1.7867612293144208,
+ "grad_norm": 2.462383985519409,
+ "learning_rate": 4.018478143921081e-06,
+ "loss": 0.4588,
+ "step": 3779
+ },
+ {
+ "epoch": 1.7872340425531914,
+ "grad_norm": 2.536701202392578,
+ "learning_rate": 4.017982526734646e-06,
+ "loss": 0.5278,
+ "step": 3780
+ },
+ {
+ "epoch": 1.7877068557919622,
+ "grad_norm": 2.691077470779419,
+ "learning_rate": 4.017486815030095e-06,
+ "loss": 0.4815,
+ "step": 3781
+ },
+ {
+ "epoch": 1.788179669030733,
+ "grad_norm": 2.4277288913726807,
+ "learning_rate": 4.016991008838294e-06,
+ "loss": 0.4877,
+ "step": 3782
+ },
+ {
+ "epoch": 1.7886524822695036,
+ "grad_norm": 2.6740009784698486,
+ "learning_rate": 4.016495108190115e-06,
+ "loss": 0.572,
+ "step": 3783
+ },
+ {
+ "epoch": 1.7891252955082741,
+ "grad_norm": 3.179232120513916,
+ "learning_rate": 4.0159991131164355e-06,
+ "loss": 0.4821,
+ "step": 3784
+ },
+ {
+ "epoch": 1.789598108747045,
+ "grad_norm": 3.2747793197631836,
+ "learning_rate": 4.015503023648138e-06,
+ "loss": 0.5517,
+ "step": 3785
+ },
+ {
+ "epoch": 1.7900709219858157,
+ "grad_norm": 2.671367645263672,
+ "learning_rate": 4.015006839816113e-06,
+ "loss": 0.5158,
+ "step": 3786
+ },
+ {
+ "epoch": 1.7905437352245863,
+ "grad_norm": 2.6600193977355957,
+ "learning_rate": 4.014510561651256e-06,
+ "loss": 0.535,
+ "step": 3787
+ },
+ {
+ "epoch": 1.791016548463357,
+ "grad_norm": 2.481509208679199,
+ "learning_rate": 4.014014189184466e-06,
+ "loss": 0.5596,
+ "step": 3788
+ },
+ {
+ "epoch": 1.7914893617021277,
+ "grad_norm": 2.759816884994507,
+ "learning_rate": 4.013517722446652e-06,
+ "loss": 0.5201,
+ "step": 3789
+ },
+ {
+ "epoch": 1.7919621749408985,
+ "grad_norm": 2.6913561820983887,
+ "learning_rate": 4.013021161468724e-06,
+ "loss": 0.5758,
+ "step": 3790
+ },
+ {
+ "epoch": 1.792434988179669,
+ "grad_norm": 2.775087594985962,
+ "learning_rate": 4.0125245062816044e-06,
+ "loss": 0.499,
+ "step": 3791
+ },
+ {
+ "epoch": 1.7929078014184396,
+ "grad_norm": 2.6134777069091797,
+ "learning_rate": 4.012027756916216e-06,
+ "loss": 0.5659,
+ "step": 3792
+ },
+ {
+ "epoch": 1.7933806146572104,
+ "grad_norm": 2.7109756469726562,
+ "learning_rate": 4.0115309134034895e-06,
+ "loss": 0.5337,
+ "step": 3793
+ },
+ {
+ "epoch": 1.7938534278959812,
+ "grad_norm": 2.5389950275421143,
+ "learning_rate": 4.0110339757743595e-06,
+ "loss": 0.4501,
+ "step": 3794
+ },
+ {
+ "epoch": 1.7943262411347518,
+ "grad_norm": 2.634648561477661,
+ "learning_rate": 4.010536944059771e-06,
+ "loss": 0.4411,
+ "step": 3795
+ },
+ {
+ "epoch": 1.7947990543735224,
+ "grad_norm": 2.527070999145508,
+ "learning_rate": 4.0100398182906695e-06,
+ "loss": 0.5145,
+ "step": 3796
+ },
+ {
+ "epoch": 1.7952718676122932,
+ "grad_norm": 2.62988543510437,
+ "learning_rate": 4.0095425984980105e-06,
+ "loss": 0.4981,
+ "step": 3797
+ },
+ {
+ "epoch": 1.795744680851064,
+ "grad_norm": 2.6032519340515137,
+ "learning_rate": 4.009045284712752e-06,
+ "loss": 0.453,
+ "step": 3798
+ },
+ {
+ "epoch": 1.7962174940898346,
+ "grad_norm": 2.735173463821411,
+ "learning_rate": 4.008547876965863e-06,
+ "loss": 0.5925,
+ "step": 3799
+ },
+ {
+ "epoch": 1.7966903073286051,
+ "grad_norm": 2.6296730041503906,
+ "learning_rate": 4.00805037528831e-06,
+ "loss": 0.5651,
+ "step": 3800
+ },
+ {
+ "epoch": 1.797163120567376,
+ "grad_norm": 2.641214370727539,
+ "learning_rate": 4.0075527797110735e-06,
+ "loss": 0.4973,
+ "step": 3801
+ },
+ {
+ "epoch": 1.7976359338061467,
+ "grad_norm": 2.6104819774627686,
+ "learning_rate": 4.007055090265136e-06,
+ "loss": 0.4432,
+ "step": 3802
+ },
+ {
+ "epoch": 1.7981087470449173,
+ "grad_norm": 2.8200619220733643,
+ "learning_rate": 4.0065573069814865e-06,
+ "loss": 0.4899,
+ "step": 3803
+ },
+ {
+ "epoch": 1.7985815602836879,
+ "grad_norm": 2.982354164123535,
+ "learning_rate": 4.006059429891119e-06,
+ "loss": 0.5488,
+ "step": 3804
+ },
+ {
+ "epoch": 1.7990543735224587,
+ "grad_norm": 2.7561678886413574,
+ "learning_rate": 4.005561459025034e-06,
+ "loss": 0.5637,
+ "step": 3805
+ },
+ {
+ "epoch": 1.7995271867612295,
+ "grad_norm": 2.702212333679199,
+ "learning_rate": 4.005063394414241e-06,
+ "loss": 0.4804,
+ "step": 3806
+ },
+ {
+ "epoch": 1.8,
+ "grad_norm": 2.8655319213867188,
+ "learning_rate": 4.004565236089748e-06,
+ "loss": 0.5759,
+ "step": 3807
+ },
+ {
+ "epoch": 1.8004728132387706,
+ "grad_norm": 2.703676223754883,
+ "learning_rate": 4.0040669840825756e-06,
+ "loss": 0.4728,
+ "step": 3808
+ },
+ {
+ "epoch": 1.8009456264775414,
+ "grad_norm": 2.802645683288574,
+ "learning_rate": 4.003568638423747e-06,
+ "loss": 0.5421,
+ "step": 3809
+ },
+ {
+ "epoch": 1.8014184397163122,
+ "grad_norm": 2.4723124504089355,
+ "learning_rate": 4.003070199144292e-06,
+ "loss": 0.4944,
+ "step": 3810
+ },
+ {
+ "epoch": 1.8018912529550828,
+ "grad_norm": 2.4889068603515625,
+ "learning_rate": 4.0025716662752475e-06,
+ "loss": 0.4774,
+ "step": 3811
+ },
+ {
+ "epoch": 1.8023640661938534,
+ "grad_norm": 2.5408077239990234,
+ "learning_rate": 4.002073039847653e-06,
+ "loss": 0.5233,
+ "step": 3812
+ },
+ {
+ "epoch": 1.8028368794326242,
+ "grad_norm": 2.734602689743042,
+ "learning_rate": 4.001574319892557e-06,
+ "loss": 0.5403,
+ "step": 3813
+ },
+ {
+ "epoch": 1.803309692671395,
+ "grad_norm": 3.3786163330078125,
+ "learning_rate": 4.001075506441012e-06,
+ "loss": 0.6969,
+ "step": 3814
+ },
+ {
+ "epoch": 1.8037825059101655,
+ "grad_norm": 2.7375378608703613,
+ "learning_rate": 4.000576599524078e-06,
+ "loss": 0.4907,
+ "step": 3815
+ },
+ {
+ "epoch": 1.804255319148936,
+ "grad_norm": 3.041804075241089,
+ "learning_rate": 4.000077599172818e-06,
+ "loss": 0.6021,
+ "step": 3816
+ },
+ {
+ "epoch": 1.804728132387707,
+ "grad_norm": 2.697599411010742,
+ "learning_rate": 3.999578505418305e-06,
+ "loss": 0.4743,
+ "step": 3817
+ },
+ {
+ "epoch": 1.8052009456264777,
+ "grad_norm": 2.276921272277832,
+ "learning_rate": 3.999079318291612e-06,
+ "loss": 0.4885,
+ "step": 3818
+ },
+ {
+ "epoch": 1.8056737588652483,
+ "grad_norm": 2.4896953105926514,
+ "learning_rate": 3.998580037823825e-06,
+ "loss": 0.503,
+ "step": 3819
+ },
+ {
+ "epoch": 1.8061465721040189,
+ "grad_norm": 2.6232175827026367,
+ "learning_rate": 3.998080664046029e-06,
+ "loss": 0.5058,
+ "step": 3820
+ },
+ {
+ "epoch": 1.8066193853427897,
+ "grad_norm": 2.695861339569092,
+ "learning_rate": 3.997581196989319e-06,
+ "loss": 0.4949,
+ "step": 3821
+ },
+ {
+ "epoch": 1.8070921985815604,
+ "grad_norm": 2.912886142730713,
+ "learning_rate": 3.997081636684795e-06,
+ "loss": 0.4971,
+ "step": 3822
+ },
+ {
+ "epoch": 1.807565011820331,
+ "grad_norm": 2.876500368118286,
+ "learning_rate": 3.996581983163561e-06,
+ "loss": 0.5584,
+ "step": 3823
+ },
+ {
+ "epoch": 1.8080378250591016,
+ "grad_norm": 2.857069730758667,
+ "learning_rate": 3.99608223645673e-06,
+ "loss": 0.5457,
+ "step": 3824
+ },
+ {
+ "epoch": 1.8085106382978724,
+ "grad_norm": 2.486743211746216,
+ "learning_rate": 3.995582396595419e-06,
+ "loss": 0.5291,
+ "step": 3825
+ },
+ {
+ "epoch": 1.808983451536643,
+ "grad_norm": 2.509441375732422,
+ "learning_rate": 3.9950824636107486e-06,
+ "loss": 0.4747,
+ "step": 3826
+ },
+ {
+ "epoch": 1.8094562647754135,
+ "grad_norm": 2.931394100189209,
+ "learning_rate": 3.99458243753385e-06,
+ "loss": 0.5116,
+ "step": 3827
+ },
+ {
+ "epoch": 1.8099290780141843,
+ "grad_norm": 2.4868650436401367,
+ "learning_rate": 3.994082318395856e-06,
+ "loss": 0.4671,
+ "step": 3828
+ },
+ {
+ "epoch": 1.8104018912529551,
+ "grad_norm": 2.5554752349853516,
+ "learning_rate": 3.993582106227907e-06,
+ "loss": 0.4969,
+ "step": 3829
+ },
+ {
+ "epoch": 1.8108747044917257,
+ "grad_norm": 2.8367133140563965,
+ "learning_rate": 3.99308180106115e-06,
+ "loss": 0.5507,
+ "step": 3830
+ },
+ {
+ "epoch": 1.8113475177304963,
+ "grad_norm": 2.68245792388916,
+ "learning_rate": 3.992581402926737e-06,
+ "loss": 0.5115,
+ "step": 3831
+ },
+ {
+ "epoch": 1.811820330969267,
+ "grad_norm": 2.406674385070801,
+ "learning_rate": 3.992080911855824e-06,
+ "loss": 0.545,
+ "step": 3832
+ },
+ {
+ "epoch": 1.8122931442080379,
+ "grad_norm": 2.5003464221954346,
+ "learning_rate": 3.991580327879575e-06,
+ "loss": 0.4331,
+ "step": 3833
+ },
+ {
+ "epoch": 1.8127659574468085,
+ "grad_norm": 2.49320912361145,
+ "learning_rate": 3.99107965102916e-06,
+ "loss": 0.5118,
+ "step": 3834
+ },
+ {
+ "epoch": 1.813238770685579,
+ "grad_norm": 2.6183295249938965,
+ "learning_rate": 3.990578881335752e-06,
+ "loss": 0.5286,
+ "step": 3835
+ },
+ {
+ "epoch": 1.8137115839243498,
+ "grad_norm": 3.1999518871307373,
+ "learning_rate": 3.990078018830534e-06,
+ "loss": 0.5048,
+ "step": 3836
+ },
+ {
+ "epoch": 1.8141843971631206,
+ "grad_norm": 2.4351117610931396,
+ "learning_rate": 3.9895770635446915e-06,
+ "loss": 0.514,
+ "step": 3837
+ },
+ {
+ "epoch": 1.8146572104018912,
+ "grad_norm": 2.6859259605407715,
+ "learning_rate": 3.989076015509416e-06,
+ "loss": 0.5575,
+ "step": 3838
+ },
+ {
+ "epoch": 1.8151300236406618,
+ "grad_norm": 2.790421962738037,
+ "learning_rate": 3.988574874755909e-06,
+ "loss": 0.5467,
+ "step": 3839
+ },
+ {
+ "epoch": 1.8156028368794326,
+ "grad_norm": 2.5202765464782715,
+ "learning_rate": 3.988073641315369e-06,
+ "loss": 0.5229,
+ "step": 3840
+ },
+ {
+ "epoch": 1.8160756501182034,
+ "grad_norm": 2.623652219772339,
+ "learning_rate": 3.987572315219009e-06,
+ "loss": 0.509,
+ "step": 3841
+ },
+ {
+ "epoch": 1.816548463356974,
+ "grad_norm": 2.6038360595703125,
+ "learning_rate": 3.987070896498044e-06,
+ "loss": 0.5304,
+ "step": 3842
+ },
+ {
+ "epoch": 1.8170212765957445,
+ "grad_norm": 2.9378011226654053,
+ "learning_rate": 3.9865693851836955e-06,
+ "loss": 0.5845,
+ "step": 3843
+ },
+ {
+ "epoch": 1.8174940898345153,
+ "grad_norm": 2.4061124324798584,
+ "learning_rate": 3.98606778130719e-06,
+ "loss": 0.4333,
+ "step": 3844
+ },
+ {
+ "epoch": 1.8179669030732861,
+ "grad_norm": 2.483489751815796,
+ "learning_rate": 3.985566084899759e-06,
+ "loss": 0.4827,
+ "step": 3845
+ },
+ {
+ "epoch": 1.8184397163120567,
+ "grad_norm": 2.7774932384490967,
+ "learning_rate": 3.985064295992642e-06,
+ "loss": 0.5016,
+ "step": 3846
+ },
+ {
+ "epoch": 1.8189125295508273,
+ "grad_norm": 2.5936765670776367,
+ "learning_rate": 3.984562414617083e-06,
+ "loss": 0.4448,
+ "step": 3847
+ },
+ {
+ "epoch": 1.819385342789598,
+ "grad_norm": 2.8608627319335938,
+ "learning_rate": 3.9840604408043325e-06,
+ "loss": 0.5735,
+ "step": 3848
+ },
+ {
+ "epoch": 1.8198581560283689,
+ "grad_norm": 2.6212472915649414,
+ "learning_rate": 3.983558374585646e-06,
+ "loss": 0.5091,
+ "step": 3849
+ },
+ {
+ "epoch": 1.8203309692671394,
+ "grad_norm": 2.832460641860962,
+ "learning_rate": 3.983056215992284e-06,
+ "loss": 0.5169,
+ "step": 3850
+ },
+ {
+ "epoch": 1.82080378250591,
+ "grad_norm": 2.5293610095977783,
+ "learning_rate": 3.982553965055514e-06,
+ "loss": 0.4708,
+ "step": 3851
+ },
+ {
+ "epoch": 1.8212765957446808,
+ "grad_norm": 2.9362871646881104,
+ "learning_rate": 3.982051621806611e-06,
+ "loss": 0.575,
+ "step": 3852
+ },
+ {
+ "epoch": 1.8217494089834516,
+ "grad_norm": 2.69073486328125,
+ "learning_rate": 3.98154918627685e-06,
+ "loss": 0.5278,
+ "step": 3853
+ },
+ {
+ "epoch": 1.8222222222222222,
+ "grad_norm": 2.6711034774780273,
+ "learning_rate": 3.98104665849752e-06,
+ "loss": 0.4918,
+ "step": 3854
+ },
+ {
+ "epoch": 1.8226950354609928,
+ "grad_norm": 2.571110963821411,
+ "learning_rate": 3.980544038499907e-06,
+ "loss": 0.5234,
+ "step": 3855
+ },
+ {
+ "epoch": 1.8231678486997636,
+ "grad_norm": 3.2603371143341064,
+ "learning_rate": 3.980041326315309e-06,
+ "loss": 0.5996,
+ "step": 3856
+ },
+ {
+ "epoch": 1.8236406619385344,
+ "grad_norm": 2.8472323417663574,
+ "learning_rate": 3.979538521975028e-06,
+ "loss": 0.4769,
+ "step": 3857
+ },
+ {
+ "epoch": 1.824113475177305,
+ "grad_norm": 2.6714751720428467,
+ "learning_rate": 3.979035625510371e-06,
+ "loss": 0.4826,
+ "step": 3858
+ },
+ {
+ "epoch": 1.8245862884160755,
+ "grad_norm": 2.6816468238830566,
+ "learning_rate": 3.97853263695265e-06,
+ "loss": 0.5127,
+ "step": 3859
+ },
+ {
+ "epoch": 1.8250591016548463,
+ "grad_norm": 2.6464123725891113,
+ "learning_rate": 3.978029556333185e-06,
+ "loss": 0.4925,
+ "step": 3860
+ },
+ {
+ "epoch": 1.825531914893617,
+ "grad_norm": 2.5317227840423584,
+ "learning_rate": 3.977526383683301e-06,
+ "loss": 0.4765,
+ "step": 3861
+ },
+ {
+ "epoch": 1.8260047281323877,
+ "grad_norm": 2.5052425861358643,
+ "learning_rate": 3.977023119034328e-06,
+ "loss": 0.4804,
+ "step": 3862
+ },
+ {
+ "epoch": 1.8264775413711583,
+ "grad_norm": 2.7022836208343506,
+ "learning_rate": 3.976519762417602e-06,
+ "loss": 0.4824,
+ "step": 3863
+ },
+ {
+ "epoch": 1.826950354609929,
+ "grad_norm": 2.7445900440216064,
+ "learning_rate": 3.976016313864464e-06,
+ "loss": 0.5698,
+ "step": 3864
+ },
+ {
+ "epoch": 1.8274231678486998,
+ "grad_norm": 2.442518711090088,
+ "learning_rate": 3.975512773406262e-06,
+ "loss": 0.5133,
+ "step": 3865
+ },
+ {
+ "epoch": 1.8278959810874704,
+ "grad_norm": 2.4100050926208496,
+ "learning_rate": 3.975009141074351e-06,
+ "loss": 0.5044,
+ "step": 3866
+ },
+ {
+ "epoch": 1.828368794326241,
+ "grad_norm": 2.9507648944854736,
+ "learning_rate": 3.974505416900088e-06,
+ "loss": 0.5367,
+ "step": 3867
+ },
+ {
+ "epoch": 1.8288416075650118,
+ "grad_norm": 2.5662600994110107,
+ "learning_rate": 3.974001600914837e-06,
+ "loss": 0.5878,
+ "step": 3868
+ },
+ {
+ "epoch": 1.8293144208037826,
+ "grad_norm": 2.4306657314300537,
+ "learning_rate": 3.973497693149971e-06,
+ "loss": 0.4647,
+ "step": 3869
+ },
+ {
+ "epoch": 1.8297872340425532,
+ "grad_norm": 2.974686622619629,
+ "learning_rate": 3.972993693636864e-06,
+ "loss": 0.4911,
+ "step": 3870
+ },
+ {
+ "epoch": 1.8302600472813237,
+ "grad_norm": 2.5711987018585205,
+ "learning_rate": 3.972489602406899e-06,
+ "loss": 0.5089,
+ "step": 3871
+ },
+ {
+ "epoch": 1.8307328605200945,
+ "grad_norm": 3.259617328643799,
+ "learning_rate": 3.971985419491463e-06,
+ "loss": 0.5966,
+ "step": 3872
+ },
+ {
+ "epoch": 1.8312056737588653,
+ "grad_norm": 2.7437000274658203,
+ "learning_rate": 3.971481144921949e-06,
+ "loss": 0.5097,
+ "step": 3873
+ },
+ {
+ "epoch": 1.831678486997636,
+ "grad_norm": 2.9597461223602295,
+ "learning_rate": 3.970976778729757e-06,
+ "loss": 0.5672,
+ "step": 3874
+ },
+ {
+ "epoch": 1.8321513002364065,
+ "grad_norm": 2.5775723457336426,
+ "learning_rate": 3.970472320946291e-06,
+ "loss": 0.4749,
+ "step": 3875
+ },
+ {
+ "epoch": 1.8326241134751773,
+ "grad_norm": 2.7381200790405273,
+ "learning_rate": 3.969967771602961e-06,
+ "loss": 0.5255,
+ "step": 3876
+ },
+ {
+ "epoch": 1.833096926713948,
+ "grad_norm": 2.651698350906372,
+ "learning_rate": 3.969463130731183e-06,
+ "loss": 0.5098,
+ "step": 3877
+ },
+ {
+ "epoch": 1.8335697399527187,
+ "grad_norm": 2.7277021408081055,
+ "learning_rate": 3.968958398362381e-06,
+ "loss": 0.5251,
+ "step": 3878
+ },
+ {
+ "epoch": 1.8340425531914892,
+ "grad_norm": 2.5184953212738037,
+ "learning_rate": 3.968453574527978e-06,
+ "loss": 0.5086,
+ "step": 3879
+ },
+ {
+ "epoch": 1.83451536643026,
+ "grad_norm": 2.8227882385253906,
+ "learning_rate": 3.967948659259412e-06,
+ "loss": 0.5742,
+ "step": 3880
+ },
+ {
+ "epoch": 1.8349881796690308,
+ "grad_norm": 2.547922134399414,
+ "learning_rate": 3.967443652588119e-06,
+ "loss": 0.5411,
+ "step": 3881
+ },
+ {
+ "epoch": 1.8354609929078014,
+ "grad_norm": 2.6572835445404053,
+ "learning_rate": 3.966938554545545e-06,
+ "loss": 0.4854,
+ "step": 3882
+ },
+ {
+ "epoch": 1.835933806146572,
+ "grad_norm": 2.9416658878326416,
+ "learning_rate": 3.966433365163139e-06,
+ "loss": 0.5236,
+ "step": 3883
+ },
+ {
+ "epoch": 1.8364066193853428,
+ "grad_norm": 2.344325304031372,
+ "learning_rate": 3.965928084472357e-06,
+ "loss": 0.4916,
+ "step": 3884
+ },
+ {
+ "epoch": 1.8368794326241136,
+ "grad_norm": 2.890418291091919,
+ "learning_rate": 3.965422712504662e-06,
+ "loss": 0.5287,
+ "step": 3885
+ },
+ {
+ "epoch": 1.8373522458628841,
+ "grad_norm": 2.6063363552093506,
+ "learning_rate": 3.96491724929152e-06,
+ "loss": 0.4842,
+ "step": 3886
+ },
+ {
+ "epoch": 1.8378250591016547,
+ "grad_norm": 2.5582427978515625,
+ "learning_rate": 3.964411694864404e-06,
+ "loss": 0.4768,
+ "step": 3887
+ },
+ {
+ "epoch": 1.8382978723404255,
+ "grad_norm": 2.84356951713562,
+ "learning_rate": 3.963906049254793e-06,
+ "loss": 0.5284,
+ "step": 3888
+ },
+ {
+ "epoch": 1.8387706855791963,
+ "grad_norm": 2.7048516273498535,
+ "learning_rate": 3.963400312494172e-06,
+ "loss": 0.5271,
+ "step": 3889
+ },
+ {
+ "epoch": 1.839243498817967,
+ "grad_norm": 2.5401699542999268,
+ "learning_rate": 3.962894484614031e-06,
+ "loss": 0.4734,
+ "step": 3890
+ },
+ {
+ "epoch": 1.8397163120567375,
+ "grad_norm": 2.208256244659424,
+ "learning_rate": 3.962388565645864e-06,
+ "loss": 0.4113,
+ "step": 3891
+ },
+ {
+ "epoch": 1.8401891252955083,
+ "grad_norm": 2.775139331817627,
+ "learning_rate": 3.961882555621173e-06,
+ "loss": 0.5172,
+ "step": 3892
+ },
+ {
+ "epoch": 1.840661938534279,
+ "grad_norm": 2.7540855407714844,
+ "learning_rate": 3.961376454571466e-06,
+ "loss": 0.5252,
+ "step": 3893
+ },
+ {
+ "epoch": 1.8411347517730496,
+ "grad_norm": 2.6731574535369873,
+ "learning_rate": 3.960870262528255e-06,
+ "loss": 0.4495,
+ "step": 3894
+ },
+ {
+ "epoch": 1.8416075650118202,
+ "grad_norm": 2.791492223739624,
+ "learning_rate": 3.960363979523058e-06,
+ "loss": 0.5457,
+ "step": 3895
+ },
+ {
+ "epoch": 1.842080378250591,
+ "grad_norm": 2.9280290603637695,
+ "learning_rate": 3.959857605587401e-06,
+ "loss": 0.5373,
+ "step": 3896
+ },
+ {
+ "epoch": 1.8425531914893618,
+ "grad_norm": 2.5652217864990234,
+ "learning_rate": 3.95935114075281e-06,
+ "loss": 0.5191,
+ "step": 3897
+ },
+ {
+ "epoch": 1.8430260047281324,
+ "grad_norm": 2.7297749519348145,
+ "learning_rate": 3.958844585050824e-06,
+ "loss": 0.5366,
+ "step": 3898
+ },
+ {
+ "epoch": 1.843498817966903,
+ "grad_norm": 2.5302982330322266,
+ "learning_rate": 3.958337938512983e-06,
+ "loss": 0.569,
+ "step": 3899
+ },
+ {
+ "epoch": 1.8439716312056738,
+ "grad_norm": 2.644777297973633,
+ "learning_rate": 3.957831201170832e-06,
+ "loss": 0.521,
+ "step": 3900
+ },
+ {
+ "epoch": 1.8444444444444446,
+ "grad_norm": 2.8375515937805176,
+ "learning_rate": 3.957324373055925e-06,
+ "loss": 0.573,
+ "step": 3901
+ },
+ {
+ "epoch": 1.8449172576832151,
+ "grad_norm": 2.512296676635742,
+ "learning_rate": 3.956817454199819e-06,
+ "loss": 0.5081,
+ "step": 3902
+ },
+ {
+ "epoch": 1.8453900709219857,
+ "grad_norm": 2.3662109375,
+ "learning_rate": 3.956310444634079e-06,
+ "loss": 0.4989,
+ "step": 3903
+ },
+ {
+ "epoch": 1.8458628841607565,
+ "grad_norm": 2.6849682331085205,
+ "learning_rate": 3.955803344390272e-06,
+ "loss": 0.5459,
+ "step": 3904
+ },
+ {
+ "epoch": 1.8463356973995273,
+ "grad_norm": 2.8364317417144775,
+ "learning_rate": 3.9552961534999756e-06,
+ "loss": 0.5704,
+ "step": 3905
+ },
+ {
+ "epoch": 1.8468085106382979,
+ "grad_norm": 2.6006948947906494,
+ "learning_rate": 3.954788871994768e-06,
+ "loss": 0.5696,
+ "step": 3906
+ },
+ {
+ "epoch": 1.8472813238770684,
+ "grad_norm": 2.558300018310547,
+ "learning_rate": 3.9542814999062375e-06,
+ "loss": 0.5047,
+ "step": 3907
+ },
+ {
+ "epoch": 1.8477541371158392,
+ "grad_norm": 2.6343321800231934,
+ "learning_rate": 3.953774037265974e-06,
+ "loss": 0.525,
+ "step": 3908
+ },
+ {
+ "epoch": 1.84822695035461,
+ "grad_norm": 2.5050008296966553,
+ "learning_rate": 3.953266484105576e-06,
+ "loss": 0.4867,
+ "step": 3909
+ },
+ {
+ "epoch": 1.8486997635933806,
+ "grad_norm": 2.3775103092193604,
+ "learning_rate": 3.952758840456647e-06,
+ "loss": 0.4349,
+ "step": 3910
+ },
+ {
+ "epoch": 1.8491725768321512,
+ "grad_norm": 2.508376359939575,
+ "learning_rate": 3.952251106350794e-06,
+ "loss": 0.539,
+ "step": 3911
+ },
+ {
+ "epoch": 1.849645390070922,
+ "grad_norm": 2.7403106689453125,
+ "learning_rate": 3.951743281819633e-06,
+ "loss": 0.4478,
+ "step": 3912
+ },
+ {
+ "epoch": 1.8501182033096928,
+ "grad_norm": 2.5332062244415283,
+ "learning_rate": 3.951235366894784e-06,
+ "loss": 0.4658,
+ "step": 3913
+ },
+ {
+ "epoch": 1.8505910165484634,
+ "grad_norm": 3.0137248039245605,
+ "learning_rate": 3.950727361607872e-06,
+ "loss": 0.5047,
+ "step": 3914
+ },
+ {
+ "epoch": 1.851063829787234,
+ "grad_norm": 2.5820653438568115,
+ "learning_rate": 3.950219265990528e-06,
+ "loss": 0.542,
+ "step": 3915
+ },
+ {
+ "epoch": 1.8515366430260047,
+ "grad_norm": 2.555133819580078,
+ "learning_rate": 3.949711080074389e-06,
+ "loss": 0.5253,
+ "step": 3916
+ },
+ {
+ "epoch": 1.8520094562647755,
+ "grad_norm": 2.876882791519165,
+ "learning_rate": 3.949202803891099e-06,
+ "loss": 0.5242,
+ "step": 3917
+ },
+ {
+ "epoch": 1.852482269503546,
+ "grad_norm": 2.5929203033447266,
+ "learning_rate": 3.948694437472305e-06,
+ "loss": 0.5358,
+ "step": 3918
+ },
+ {
+ "epoch": 1.8529550827423167,
+ "grad_norm": 2.468513250350952,
+ "learning_rate": 3.948185980849659e-06,
+ "loss": 0.5119,
+ "step": 3919
+ },
+ {
+ "epoch": 1.8534278959810875,
+ "grad_norm": 2.9259560108184814,
+ "learning_rate": 3.947677434054824e-06,
+ "loss": 0.4756,
+ "step": 3920
+ },
+ {
+ "epoch": 1.8539007092198583,
+ "grad_norm": 2.5247011184692383,
+ "learning_rate": 3.947168797119462e-06,
+ "loss": 0.4627,
+ "step": 3921
+ },
+ {
+ "epoch": 1.8543735224586289,
+ "grad_norm": 2.7396671772003174,
+ "learning_rate": 3.946660070075245e-06,
+ "loss": 0.5013,
+ "step": 3922
+ },
+ {
+ "epoch": 1.8548463356973994,
+ "grad_norm": 2.7059738636016846,
+ "learning_rate": 3.946151252953849e-06,
+ "loss": 0.5875,
+ "step": 3923
+ },
+ {
+ "epoch": 1.8553191489361702,
+ "grad_norm": 2.5638437271118164,
+ "learning_rate": 3.945642345786955e-06,
+ "loss": 0.5063,
+ "step": 3924
+ },
+ {
+ "epoch": 1.855791962174941,
+ "grad_norm": 2.6647839546203613,
+ "learning_rate": 3.945133348606251e-06,
+ "loss": 0.5421,
+ "step": 3925
+ },
+ {
+ "epoch": 1.8562647754137116,
+ "grad_norm": 3.7235286235809326,
+ "learning_rate": 3.944624261443431e-06,
+ "loss": 0.5958,
+ "step": 3926
+ },
+ {
+ "epoch": 1.8567375886524822,
+ "grad_norm": 2.769984245300293,
+ "learning_rate": 3.944115084330192e-06,
+ "loss": 0.5678,
+ "step": 3927
+ },
+ {
+ "epoch": 1.857210401891253,
+ "grad_norm": 2.567249059677124,
+ "learning_rate": 3.9436058172982395e-06,
+ "loss": 0.4767,
+ "step": 3928
+ },
+ {
+ "epoch": 1.8576832151300238,
+ "grad_norm": 2.6196048259735107,
+ "learning_rate": 3.943096460379283e-06,
+ "loss": 0.5345,
+ "step": 3929
+ },
+ {
+ "epoch": 1.8581560283687943,
+ "grad_norm": 2.5999555587768555,
+ "learning_rate": 3.942587013605037e-06,
+ "loss": 0.5482,
+ "step": 3930
+ },
+ {
+ "epoch": 1.858628841607565,
+ "grad_norm": 2.630387783050537,
+ "learning_rate": 3.942077477007224e-06,
+ "loss": 0.6023,
+ "step": 3931
+ },
+ {
+ "epoch": 1.8591016548463357,
+ "grad_norm": 2.543503761291504,
+ "learning_rate": 3.941567850617569e-06,
+ "loss": 0.5157,
+ "step": 3932
+ },
+ {
+ "epoch": 1.8595744680851065,
+ "grad_norm": 2.5109236240386963,
+ "learning_rate": 3.941058134467805e-06,
+ "loss": 0.4774,
+ "step": 3933
+ },
+ {
+ "epoch": 1.860047281323877,
+ "grad_norm": 2.5110230445861816,
+ "learning_rate": 3.94054832858967e-06,
+ "loss": 0.5064,
+ "step": 3934
+ },
+ {
+ "epoch": 1.8605200945626477,
+ "grad_norm": 2.4780776500701904,
+ "learning_rate": 3.940038433014908e-06,
+ "loss": 0.5216,
+ "step": 3935
+ },
+ {
+ "epoch": 1.8609929078014185,
+ "grad_norm": 2.4398856163024902,
+ "learning_rate": 3.939528447775266e-06,
+ "loss": 0.4958,
+ "step": 3936
+ },
+ {
+ "epoch": 1.8614657210401893,
+ "grad_norm": 2.449498176574707,
+ "learning_rate": 3.9390183729025e-06,
+ "loss": 0.5165,
+ "step": 3937
+ },
+ {
+ "epoch": 1.8619385342789598,
+ "grad_norm": 2.982544422149658,
+ "learning_rate": 3.938508208428371e-06,
+ "loss": 0.4803,
+ "step": 3938
+ },
+ {
+ "epoch": 1.8624113475177304,
+ "grad_norm": 2.6574015617370605,
+ "learning_rate": 3.937997954384641e-06,
+ "loss": 0.4797,
+ "step": 3939
+ },
+ {
+ "epoch": 1.8628841607565012,
+ "grad_norm": 2.7773542404174805,
+ "learning_rate": 3.937487610803086e-06,
+ "loss": 0.4843,
+ "step": 3940
+ },
+ {
+ "epoch": 1.863356973995272,
+ "grad_norm": 2.588937759399414,
+ "learning_rate": 3.9369771777154805e-06,
+ "loss": 0.5426,
+ "step": 3941
+ },
+ {
+ "epoch": 1.8638297872340426,
+ "grad_norm": 2.855442523956299,
+ "learning_rate": 3.936466655153607e-06,
+ "loss": 0.5443,
+ "step": 3942
+ },
+ {
+ "epoch": 1.8643026004728132,
+ "grad_norm": 2.554676055908203,
+ "learning_rate": 3.935956043149253e-06,
+ "loss": 0.5334,
+ "step": 3943
+ },
+ {
+ "epoch": 1.864775413711584,
+ "grad_norm": 2.901599884033203,
+ "learning_rate": 3.935445341734212e-06,
+ "loss": 0.5842,
+ "step": 3944
+ },
+ {
+ "epoch": 1.8652482269503547,
+ "grad_norm": 2.554485321044922,
+ "learning_rate": 3.934934550940285e-06,
+ "loss": 0.4941,
+ "step": 3945
+ },
+ {
+ "epoch": 1.8657210401891253,
+ "grad_norm": 2.357203245162964,
+ "learning_rate": 3.934423670799275e-06,
+ "loss": 0.4402,
+ "step": 3946
+ },
+ {
+ "epoch": 1.866193853427896,
+ "grad_norm": 2.7036049365997314,
+ "learning_rate": 3.933912701342993e-06,
+ "loss": 0.4966,
+ "step": 3947
+ },
+ {
+ "epoch": 1.8666666666666667,
+ "grad_norm": 2.7817211151123047,
+ "learning_rate": 3.933401642603255e-06,
+ "loss": 0.4908,
+ "step": 3948
+ },
+ {
+ "epoch": 1.8671394799054375,
+ "grad_norm": 2.439490795135498,
+ "learning_rate": 3.932890494611882e-06,
+ "loss": 0.4322,
+ "step": 3949
+ },
+ {
+ "epoch": 1.867612293144208,
+ "grad_norm": 3.187152147293091,
+ "learning_rate": 3.9323792574007e-06,
+ "loss": 0.501,
+ "step": 3950
+ },
+ {
+ "epoch": 1.8680851063829786,
+ "grad_norm": 2.405773401260376,
+ "learning_rate": 3.931867931001543e-06,
+ "loss": 0.4477,
+ "step": 3951
+ },
+ {
+ "epoch": 1.8685579196217494,
+ "grad_norm": 2.4922525882720947,
+ "learning_rate": 3.931356515446248e-06,
+ "loss": 0.5098,
+ "step": 3952
+ },
+ {
+ "epoch": 1.8690307328605202,
+ "grad_norm": 2.7781267166137695,
+ "learning_rate": 3.93084501076666e-06,
+ "loss": 0.5815,
+ "step": 3953
+ },
+ {
+ "epoch": 1.8695035460992908,
+ "grad_norm": 2.74621844291687,
+ "learning_rate": 3.930333416994626e-06,
+ "loss": 0.5605,
+ "step": 3954
+ },
+ {
+ "epoch": 1.8699763593380614,
+ "grad_norm": 2.5527689456939697,
+ "learning_rate": 3.929821734162004e-06,
+ "loss": 0.5141,
+ "step": 3955
+ },
+ {
+ "epoch": 1.8704491725768322,
+ "grad_norm": 2.5730628967285156,
+ "learning_rate": 3.92930996230065e-06,
+ "loss": 0.5446,
+ "step": 3956
+ },
+ {
+ "epoch": 1.870921985815603,
+ "grad_norm": 2.7053353786468506,
+ "learning_rate": 3.9287981014424334e-06,
+ "loss": 0.4722,
+ "step": 3957
+ },
+ {
+ "epoch": 1.8713947990543736,
+ "grad_norm": 2.7591893672943115,
+ "learning_rate": 3.928286151619224e-06,
+ "loss": 0.509,
+ "step": 3958
+ },
+ {
+ "epoch": 1.8718676122931441,
+ "grad_norm": 2.6233739852905273,
+ "learning_rate": 3.927774112862898e-06,
+ "loss": 0.5266,
+ "step": 3959
+ },
+ {
+ "epoch": 1.872340425531915,
+ "grad_norm": 2.7715370655059814,
+ "learning_rate": 3.9272619852053396e-06,
+ "loss": 0.5612,
+ "step": 3960
+ },
+ {
+ "epoch": 1.8728132387706857,
+ "grad_norm": 2.4815211296081543,
+ "learning_rate": 3.926749768678435e-06,
+ "loss": 0.5498,
+ "step": 3961
+ },
+ {
+ "epoch": 1.8732860520094563,
+ "grad_norm": 2.6819605827331543,
+ "learning_rate": 3.926237463314078e-06,
+ "loss": 0.5499,
+ "step": 3962
+ },
+ {
+ "epoch": 1.8737588652482269,
+ "grad_norm": 2.638664722442627,
+ "learning_rate": 3.925725069144168e-06,
+ "loss": 0.5429,
+ "step": 3963
+ },
+ {
+ "epoch": 1.8742316784869977,
+ "grad_norm": 2.527294874191284,
+ "learning_rate": 3.925212586200611e-06,
+ "loss": 0.5451,
+ "step": 3964
+ },
+ {
+ "epoch": 1.8747044917257685,
+ "grad_norm": 2.831638813018799,
+ "learning_rate": 3.924700014515315e-06,
+ "loss": 0.5276,
+ "step": 3965
+ },
+ {
+ "epoch": 1.875177304964539,
+ "grad_norm": 2.5906996726989746,
+ "learning_rate": 3.924187354120196e-06,
+ "loss": 0.5323,
+ "step": 3966
+ },
+ {
+ "epoch": 1.8756501182033096,
+ "grad_norm": 2.5482442378997803,
+ "learning_rate": 3.923674605047175e-06,
+ "loss": 0.4882,
+ "step": 3967
+ },
+ {
+ "epoch": 1.8761229314420804,
+ "grad_norm": 2.56402850151062,
+ "learning_rate": 3.923161767328179e-06,
+ "loss": 0.5111,
+ "step": 3968
+ },
+ {
+ "epoch": 1.8765957446808512,
+ "grad_norm": 3.223782539367676,
+ "learning_rate": 3.9226488409951405e-06,
+ "loss": 0.5829,
+ "step": 3969
+ },
+ {
+ "epoch": 1.8770685579196218,
+ "grad_norm": 2.665964365005493,
+ "learning_rate": 3.922135826079997e-06,
+ "loss": 0.4739,
+ "step": 3970
+ },
+ {
+ "epoch": 1.8775413711583924,
+ "grad_norm": 2.602696418762207,
+ "learning_rate": 3.921622722614691e-06,
+ "loss": 0.5199,
+ "step": 3971
+ },
+ {
+ "epoch": 1.8780141843971632,
+ "grad_norm": 2.5384418964385986,
+ "learning_rate": 3.921109530631172e-06,
+ "loss": 0.5086,
+ "step": 3972
+ },
+ {
+ "epoch": 1.878486997635934,
+ "grad_norm": 2.7961080074310303,
+ "learning_rate": 3.920596250161394e-06,
+ "loss": 0.5454,
+ "step": 3973
+ },
+ {
+ "epoch": 1.8789598108747045,
+ "grad_norm": 3.022007465362549,
+ "learning_rate": 3.920082881237317e-06,
+ "loss": 0.5537,
+ "step": 3974
+ },
+ {
+ "epoch": 1.8794326241134751,
+ "grad_norm": 2.699885129928589,
+ "learning_rate": 3.9195694238909045e-06,
+ "loss": 0.5274,
+ "step": 3975
+ },
+ {
+ "epoch": 1.879905437352246,
+ "grad_norm": 2.3994593620300293,
+ "learning_rate": 3.919055878154129e-06,
+ "loss": 0.4134,
+ "step": 3976
+ },
+ {
+ "epoch": 1.8803782505910167,
+ "grad_norm": 4.093045711517334,
+ "learning_rate": 3.918542244058967e-06,
+ "loss": 0.5305,
+ "step": 3977
+ },
+ {
+ "epoch": 1.8808510638297873,
+ "grad_norm": 3.011643171310425,
+ "learning_rate": 3.9180285216374e-06,
+ "loss": 0.5481,
+ "step": 3978
+ },
+ {
+ "epoch": 1.8813238770685579,
+ "grad_norm": 2.6426854133605957,
+ "learning_rate": 3.917514710921414e-06,
+ "loss": 0.5415,
+ "step": 3979
+ },
+ {
+ "epoch": 1.8817966903073287,
+ "grad_norm": 2.4379019737243652,
+ "learning_rate": 3.917000811943002e-06,
+ "loss": 0.4566,
+ "step": 3980
+ },
+ {
+ "epoch": 1.8822695035460995,
+ "grad_norm": 3.18522047996521,
+ "learning_rate": 3.9164868247341634e-06,
+ "loss": 0.6079,
+ "step": 3981
+ },
+ {
+ "epoch": 1.88274231678487,
+ "grad_norm": 2.6451141834259033,
+ "learning_rate": 3.915972749326903e-06,
+ "loss": 0.515,
+ "step": 3982
+ },
+ {
+ "epoch": 1.8832151300236406,
+ "grad_norm": 2.565598726272583,
+ "learning_rate": 3.915458585753226e-06,
+ "loss": 0.4799,
+ "step": 3983
+ },
+ {
+ "epoch": 1.8836879432624114,
+ "grad_norm": 2.711651563644409,
+ "learning_rate": 3.91494433404515e-06,
+ "loss": 0.5595,
+ "step": 3984
+ },
+ {
+ "epoch": 1.8841607565011822,
+ "grad_norm": 2.749328851699829,
+ "learning_rate": 3.914429994234695e-06,
+ "loss": 0.495,
+ "step": 3985
+ },
+ {
+ "epoch": 1.8846335697399526,
+ "grad_norm": 2.9492287635803223,
+ "learning_rate": 3.913915566353886e-06,
+ "loss": 0.5683,
+ "step": 3986
+ },
+ {
+ "epoch": 1.8851063829787233,
+ "grad_norm": 3.07747745513916,
+ "learning_rate": 3.913401050434756e-06,
+ "loss": 0.4953,
+ "step": 3987
+ },
+ {
+ "epoch": 1.8855791962174941,
+ "grad_norm": 2.8746345043182373,
+ "learning_rate": 3.912886446509338e-06,
+ "loss": 0.4752,
+ "step": 3988
+ },
+ {
+ "epoch": 1.8860520094562647,
+ "grad_norm": 2.772954225540161,
+ "learning_rate": 3.912371754609677e-06,
+ "loss": 0.5473,
+ "step": 3989
+ },
+ {
+ "epoch": 1.8865248226950353,
+ "grad_norm": 2.8906044960021973,
+ "learning_rate": 3.911856974767821e-06,
+ "loss": 0.5285,
+ "step": 3990
+ },
+ {
+ "epoch": 1.886997635933806,
+ "grad_norm": 2.8992726802825928,
+ "learning_rate": 3.9113421070158206e-06,
+ "loss": 0.571,
+ "step": 3991
+ },
+ {
+ "epoch": 1.887470449172577,
+ "grad_norm": 2.624662160873413,
+ "learning_rate": 3.910827151385737e-06,
+ "loss": 0.5183,
+ "step": 3992
+ },
+ {
+ "epoch": 1.8879432624113475,
+ "grad_norm": 2.4491732120513916,
+ "learning_rate": 3.910312107909632e-06,
+ "loss": 0.4205,
+ "step": 3993
+ },
+ {
+ "epoch": 1.888416075650118,
+ "grad_norm": 2.278259515762329,
+ "learning_rate": 3.909796976619575e-06,
+ "loss": 0.4464,
+ "step": 3994
+ },
+ {
+ "epoch": 1.8888888888888888,
+ "grad_norm": 2.6481523513793945,
+ "learning_rate": 3.909281757547644e-06,
+ "loss": 0.5023,
+ "step": 3995
+ },
+ {
+ "epoch": 1.8893617021276596,
+ "grad_norm": 2.6687493324279785,
+ "learning_rate": 3.908766450725917e-06,
+ "loss": 0.495,
+ "step": 3996
+ },
+ {
+ "epoch": 1.8898345153664302,
+ "grad_norm": 2.507525682449341,
+ "learning_rate": 3.908251056186481e-06,
+ "loss": 0.4155,
+ "step": 3997
+ },
+ {
+ "epoch": 1.8903073286052008,
+ "grad_norm": 2.7048323154449463,
+ "learning_rate": 3.907735573961426e-06,
+ "loss": 0.4601,
+ "step": 3998
+ },
+ {
+ "epoch": 1.8907801418439716,
+ "grad_norm": 2.6825389862060547,
+ "learning_rate": 3.907220004082848e-06,
+ "loss": 0.5067,
+ "step": 3999
+ },
+ {
+ "epoch": 1.8912529550827424,
+ "grad_norm": 2.775696039199829,
+ "learning_rate": 3.906704346582852e-06,
+ "loss": 0.5411,
+ "step": 4000
+ },
+ {
+ "epoch": 1.891725768321513,
+ "grad_norm": 2.4492077827453613,
+ "learning_rate": 3.906188601493545e-06,
+ "loss": 0.4931,
+ "step": 4001
+ },
+ {
+ "epoch": 1.8921985815602835,
+ "grad_norm": 2.320810556411743,
+ "learning_rate": 3.905672768847041e-06,
+ "loss": 0.4908,
+ "step": 4002
+ },
+ {
+ "epoch": 1.8926713947990543,
+ "grad_norm": 2.455162525177002,
+ "learning_rate": 3.905156848675455e-06,
+ "loss": 0.508,
+ "step": 4003
+ },
+ {
+ "epoch": 1.8931442080378251,
+ "grad_norm": 2.515921115875244,
+ "learning_rate": 3.904640841010915e-06,
+ "loss": 0.5318,
+ "step": 4004
+ },
+ {
+ "epoch": 1.8936170212765957,
+ "grad_norm": 2.7230770587921143,
+ "learning_rate": 3.904124745885548e-06,
+ "loss": 0.4793,
+ "step": 4005
+ },
+ {
+ "epoch": 1.8940898345153663,
+ "grad_norm": 2.519934892654419,
+ "learning_rate": 3.903608563331491e-06,
+ "loss": 0.5013,
+ "step": 4006
+ },
+ {
+ "epoch": 1.894562647754137,
+ "grad_norm": 2.719674587249756,
+ "learning_rate": 3.903092293380883e-06,
+ "loss": 0.516,
+ "step": 4007
+ },
+ {
+ "epoch": 1.8950354609929079,
+ "grad_norm": 3.2107343673706055,
+ "learning_rate": 3.902575936065869e-06,
+ "loss": 0.6297,
+ "step": 4008
+ },
+ {
+ "epoch": 1.8955082742316784,
+ "grad_norm": 2.9773149490356445,
+ "learning_rate": 3.902059491418603e-06,
+ "loss": 0.566,
+ "step": 4009
+ },
+ {
+ "epoch": 1.895981087470449,
+ "grad_norm": 2.6754770278930664,
+ "learning_rate": 3.90154295947124e-06,
+ "loss": 0.5187,
+ "step": 4010
+ },
+ {
+ "epoch": 1.8964539007092198,
+ "grad_norm": 2.457303762435913,
+ "learning_rate": 3.901026340255943e-06,
+ "loss": 0.5757,
+ "step": 4011
+ },
+ {
+ "epoch": 1.8969267139479906,
+ "grad_norm": 2.5944161415100098,
+ "learning_rate": 3.900509633804878e-06,
+ "loss": 0.5049,
+ "step": 4012
+ },
+ {
+ "epoch": 1.8973995271867612,
+ "grad_norm": 2.610445022583008,
+ "learning_rate": 3.89999284015022e-06,
+ "loss": 0.521,
+ "step": 4013
+ },
+ {
+ "epoch": 1.8978723404255318,
+ "grad_norm": 2.6949338912963867,
+ "learning_rate": 3.899475959324146e-06,
+ "loss": 0.5619,
+ "step": 4014
+ },
+ {
+ "epoch": 1.8983451536643026,
+ "grad_norm": 2.7889559268951416,
+ "learning_rate": 3.898958991358841e-06,
+ "loss": 0.5223,
+ "step": 4015
+ },
+ {
+ "epoch": 1.8988179669030734,
+ "grad_norm": 2.569265842437744,
+ "learning_rate": 3.898441936286493e-06,
+ "loss": 0.5724,
+ "step": 4016
+ },
+ {
+ "epoch": 1.899290780141844,
+ "grad_norm": 2.3567774295806885,
+ "learning_rate": 3.897924794139299e-06,
+ "loss": 0.4784,
+ "step": 4017
+ },
+ {
+ "epoch": 1.8997635933806145,
+ "grad_norm": 2.9176526069641113,
+ "learning_rate": 3.897407564949457e-06,
+ "loss": 0.646,
+ "step": 4018
+ },
+ {
+ "epoch": 1.9002364066193853,
+ "grad_norm": 2.7870090007781982,
+ "learning_rate": 3.896890248749174e-06,
+ "loss": 0.4922,
+ "step": 4019
+ },
+ {
+ "epoch": 1.900709219858156,
+ "grad_norm": 2.8310980796813965,
+ "learning_rate": 3.89637284557066e-06,
+ "loss": 0.4746,
+ "step": 4020
+ },
+ {
+ "epoch": 1.9011820330969267,
+ "grad_norm": 2.434915542602539,
+ "learning_rate": 3.895855355446131e-06,
+ "loss": 0.4537,
+ "step": 4021
+ },
+ {
+ "epoch": 1.9016548463356973,
+ "grad_norm": 3.0547034740448,
+ "learning_rate": 3.89533777840781e-06,
+ "loss": 0.6161,
+ "step": 4022
+ },
+ {
+ "epoch": 1.902127659574468,
+ "grad_norm": 3.416774272918701,
+ "learning_rate": 3.894820114487925e-06,
+ "loss": 0.5448,
+ "step": 4023
+ },
+ {
+ "epoch": 1.9026004728132389,
+ "grad_norm": 2.606951951980591,
+ "learning_rate": 3.894302363718707e-06,
+ "loss": 0.5501,
+ "step": 4024
+ },
+ {
+ "epoch": 1.9030732860520094,
+ "grad_norm": 3.082165002822876,
+ "learning_rate": 3.8937845261323945e-06,
+ "loss": 0.6035,
+ "step": 4025
+ },
+ {
+ "epoch": 1.90354609929078,
+ "grad_norm": 2.616093397140503,
+ "learning_rate": 3.893266601761231e-06,
+ "loss": 0.5294,
+ "step": 4026
+ },
+ {
+ "epoch": 1.9040189125295508,
+ "grad_norm": 2.7141637802124023,
+ "learning_rate": 3.8927485906374654e-06,
+ "loss": 0.5481,
+ "step": 4027
+ },
+ {
+ "epoch": 1.9044917257683216,
+ "grad_norm": 2.5129404067993164,
+ "learning_rate": 3.892230492793352e-06,
+ "loss": 0.4958,
+ "step": 4028
+ },
+ {
+ "epoch": 1.9049645390070922,
+ "grad_norm": 2.703403949737549,
+ "learning_rate": 3.891712308261151e-06,
+ "loss": 0.4852,
+ "step": 4029
+ },
+ {
+ "epoch": 1.9054373522458627,
+ "grad_norm": 2.881058931350708,
+ "learning_rate": 3.891194037073127e-06,
+ "loss": 0.4662,
+ "step": 4030
+ },
+ {
+ "epoch": 1.9059101654846335,
+ "grad_norm": 3.216769218444824,
+ "learning_rate": 3.8906756792615505e-06,
+ "loss": 0.5076,
+ "step": 4031
+ },
+ {
+ "epoch": 1.9063829787234043,
+ "grad_norm": 2.442265748977661,
+ "learning_rate": 3.890157234858697e-06,
+ "loss": 0.4748,
+ "step": 4032
+ },
+ {
+ "epoch": 1.906855791962175,
+ "grad_norm": 3.088672399520874,
+ "learning_rate": 3.889638703896849e-06,
+ "loss": 0.5729,
+ "step": 4033
+ },
+ {
+ "epoch": 1.9073286052009455,
+ "grad_norm": 2.9304986000061035,
+ "learning_rate": 3.889120086408291e-06,
+ "loss": 0.603,
+ "step": 4034
+ },
+ {
+ "epoch": 1.9078014184397163,
+ "grad_norm": 2.686093807220459,
+ "learning_rate": 3.888601382425318e-06,
+ "loss": 0.4978,
+ "step": 4035
+ },
+ {
+ "epoch": 1.908274231678487,
+ "grad_norm": 2.5668389797210693,
+ "learning_rate": 3.888082591980225e-06,
+ "loss": 0.5086,
+ "step": 4036
+ },
+ {
+ "epoch": 1.9087470449172577,
+ "grad_norm": 2.530996561050415,
+ "learning_rate": 3.887563715105315e-06,
+ "loss": 0.4678,
+ "step": 4037
+ },
+ {
+ "epoch": 1.9092198581560282,
+ "grad_norm": 3.043342351913452,
+ "learning_rate": 3.887044751832897e-06,
+ "loss": 0.5452,
+ "step": 4038
+ },
+ {
+ "epoch": 1.909692671394799,
+ "grad_norm": 2.799734115600586,
+ "learning_rate": 3.886525702195284e-06,
+ "loss": 0.5265,
+ "step": 4039
+ },
+ {
+ "epoch": 1.9101654846335698,
+ "grad_norm": 2.890022039413452,
+ "learning_rate": 3.886006566224796e-06,
+ "loss": 0.4634,
+ "step": 4040
+ },
+ {
+ "epoch": 1.9106382978723404,
+ "grad_norm": 2.6804237365722656,
+ "learning_rate": 3.8854873439537555e-06,
+ "loss": 0.5031,
+ "step": 4041
+ },
+ {
+ "epoch": 1.911111111111111,
+ "grad_norm": 2.43038272857666,
+ "learning_rate": 3.884968035414495e-06,
+ "loss": 0.5098,
+ "step": 4042
+ },
+ {
+ "epoch": 1.9115839243498818,
+ "grad_norm": 2.589583396911621,
+ "learning_rate": 3.884448640639346e-06,
+ "loss": 0.498,
+ "step": 4043
+ },
+ {
+ "epoch": 1.9120567375886526,
+ "grad_norm": 2.4565231800079346,
+ "learning_rate": 3.8839291596606524e-06,
+ "loss": 0.4318,
+ "step": 4044
+ },
+ {
+ "epoch": 1.9125295508274232,
+ "grad_norm": 2.66762638092041,
+ "learning_rate": 3.8834095925107575e-06,
+ "loss": 0.5441,
+ "step": 4045
+ },
+ {
+ "epoch": 1.9130023640661937,
+ "grad_norm": 2.7334461212158203,
+ "learning_rate": 3.882889939222013e-06,
+ "loss": 0.5209,
+ "step": 4046
+ },
+ {
+ "epoch": 1.9134751773049645,
+ "grad_norm": 2.6398537158966064,
+ "learning_rate": 3.8823701998267765e-06,
+ "loss": 0.4874,
+ "step": 4047
+ },
+ {
+ "epoch": 1.9139479905437353,
+ "grad_norm": 2.82405161857605,
+ "learning_rate": 3.881850374357409e-06,
+ "loss": 0.4519,
+ "step": 4048
+ },
+ {
+ "epoch": 1.914420803782506,
+ "grad_norm": 2.7552523612976074,
+ "learning_rate": 3.8813304628462776e-06,
+ "loss": 0.547,
+ "step": 4049
+ },
+ {
+ "epoch": 1.9148936170212765,
+ "grad_norm": 2.5287928581237793,
+ "learning_rate": 3.880810465325755e-06,
+ "loss": 0.5226,
+ "step": 4050
+ },
+ {
+ "epoch": 1.9153664302600473,
+ "grad_norm": 2.7597358226776123,
+ "learning_rate": 3.88029038182822e-06,
+ "loss": 0.5171,
+ "step": 4051
+ },
+ {
+ "epoch": 1.915839243498818,
+ "grad_norm": 2.563899278640747,
+ "learning_rate": 3.879770212386055e-06,
+ "loss": 0.4911,
+ "step": 4052
+ },
+ {
+ "epoch": 1.9163120567375886,
+ "grad_norm": 2.499404191970825,
+ "learning_rate": 3.879249957031649e-06,
+ "loss": 0.5072,
+ "step": 4053
+ },
+ {
+ "epoch": 1.9167848699763592,
+ "grad_norm": 2.817713499069214,
+ "learning_rate": 3.878729615797396e-06,
+ "loss": 0.5452,
+ "step": 4054
+ },
+ {
+ "epoch": 1.91725768321513,
+ "grad_norm": 2.7152490615844727,
+ "learning_rate": 3.878209188715696e-06,
+ "loss": 0.4917,
+ "step": 4055
+ },
+ {
+ "epoch": 1.9177304964539008,
+ "grad_norm": 2.384265661239624,
+ "learning_rate": 3.877688675818953e-06,
+ "loss": 0.4823,
+ "step": 4056
+ },
+ {
+ "epoch": 1.9182033096926714,
+ "grad_norm": 2.61059308052063,
+ "learning_rate": 3.877168077139577e-06,
+ "loss": 0.478,
+ "step": 4057
+ },
+ {
+ "epoch": 1.918676122931442,
+ "grad_norm": 2.6107938289642334,
+ "learning_rate": 3.8766473927099824e-06,
+ "loss": 0.5202,
+ "step": 4058
+ },
+ {
+ "epoch": 1.9191489361702128,
+ "grad_norm": 2.2339766025543213,
+ "learning_rate": 3.876126622562592e-06,
+ "loss": 0.547,
+ "step": 4059
+ },
+ {
+ "epoch": 1.9196217494089836,
+ "grad_norm": 2.4324610233306885,
+ "learning_rate": 3.8756057667298304e-06,
+ "loss": 0.5333,
+ "step": 4060
+ },
+ {
+ "epoch": 1.9200945626477541,
+ "grad_norm": 2.5521230697631836,
+ "learning_rate": 3.875084825244131e-06,
+ "loss": 0.5503,
+ "step": 4061
+ },
+ {
+ "epoch": 1.9205673758865247,
+ "grad_norm": 2.6985747814178467,
+ "learning_rate": 3.874563798137928e-06,
+ "loss": 0.4944,
+ "step": 4062
+ },
+ {
+ "epoch": 1.9210401891252955,
+ "grad_norm": 2.422332525253296,
+ "learning_rate": 3.874042685443664e-06,
+ "loss": 0.4807,
+ "step": 4063
+ },
+ {
+ "epoch": 1.9215130023640663,
+ "grad_norm": 2.914553165435791,
+ "learning_rate": 3.873521487193788e-06,
+ "loss": 0.4439,
+ "step": 4064
+ },
+ {
+ "epoch": 1.9219858156028369,
+ "grad_norm": 2.8098697662353516,
+ "learning_rate": 3.873000203420752e-06,
+ "loss": 0.5433,
+ "step": 4065
+ },
+ {
+ "epoch": 1.9224586288416075,
+ "grad_norm": 2.6124703884124756,
+ "learning_rate": 3.872478834157013e-06,
+ "loss": 0.4812,
+ "step": 4066
+ },
+ {
+ "epoch": 1.9229314420803783,
+ "grad_norm": 2.511059522628784,
+ "learning_rate": 3.871957379435035e-06,
+ "loss": 0.4666,
+ "step": 4067
+ },
+ {
+ "epoch": 1.923404255319149,
+ "grad_norm": 2.950542688369751,
+ "learning_rate": 3.871435839287287e-06,
+ "loss": 0.5687,
+ "step": 4068
+ },
+ {
+ "epoch": 1.9238770685579196,
+ "grad_norm": 2.4969422817230225,
+ "learning_rate": 3.870914213746243e-06,
+ "loss": 0.5235,
+ "step": 4069
+ },
+ {
+ "epoch": 1.9243498817966902,
+ "grad_norm": 2.512152910232544,
+ "learning_rate": 3.870392502844382e-06,
+ "loss": 0.4524,
+ "step": 4070
+ },
+ {
+ "epoch": 1.924822695035461,
+ "grad_norm": 3.0212557315826416,
+ "learning_rate": 3.86987070661419e-06,
+ "loss": 0.4868,
+ "step": 4071
+ },
+ {
+ "epoch": 1.9252955082742318,
+ "grad_norm": 2.8949966430664062,
+ "learning_rate": 3.869348825088154e-06,
+ "loss": 0.5556,
+ "step": 4072
+ },
+ {
+ "epoch": 1.9257683215130024,
+ "grad_norm": 2.402043581008911,
+ "learning_rate": 3.868826858298772e-06,
+ "loss": 0.5307,
+ "step": 4073
+ },
+ {
+ "epoch": 1.926241134751773,
+ "grad_norm": 2.980992078781128,
+ "learning_rate": 3.868304806278543e-06,
+ "loss": 0.6313,
+ "step": 4074
+ },
+ {
+ "epoch": 1.9267139479905437,
+ "grad_norm": 2.7140514850616455,
+ "learning_rate": 3.867782669059975e-06,
+ "loss": 0.5359,
+ "step": 4075
+ },
+ {
+ "epoch": 1.9271867612293145,
+ "grad_norm": 2.499631643295288,
+ "learning_rate": 3.867260446675577e-06,
+ "loss": 0.4873,
+ "step": 4076
+ },
+ {
+ "epoch": 1.9276595744680851,
+ "grad_norm": 2.915583610534668,
+ "learning_rate": 3.866738139157866e-06,
+ "loss": 0.5736,
+ "step": 4077
+ },
+ {
+ "epoch": 1.9281323877068557,
+ "grad_norm": 2.4231131076812744,
+ "learning_rate": 3.866215746539363e-06,
+ "loss": 0.5096,
+ "step": 4078
+ },
+ {
+ "epoch": 1.9286052009456265,
+ "grad_norm": 2.360074996948242,
+ "learning_rate": 3.865693268852599e-06,
+ "loss": 0.4907,
+ "step": 4079
+ },
+ {
+ "epoch": 1.9290780141843973,
+ "grad_norm": 2.5410032272338867,
+ "learning_rate": 3.865170706130101e-06,
+ "loss": 0.473,
+ "step": 4080
+ },
+ {
+ "epoch": 1.9295508274231679,
+ "grad_norm": 2.780090808868408,
+ "learning_rate": 3.86464805840441e-06,
+ "loss": 0.5213,
+ "step": 4081
+ },
+ {
+ "epoch": 1.9300236406619384,
+ "grad_norm": 2.7318382263183594,
+ "learning_rate": 3.864125325708068e-06,
+ "loss": 0.5617,
+ "step": 4082
+ },
+ {
+ "epoch": 1.9304964539007092,
+ "grad_norm": 2.76509165763855,
+ "learning_rate": 3.863602508073623e-06,
+ "loss": 0.52,
+ "step": 4083
+ },
+ {
+ "epoch": 1.93096926713948,
+ "grad_norm": 2.8041110038757324,
+ "learning_rate": 3.863079605533631e-06,
+ "loss": 0.5343,
+ "step": 4084
+ },
+ {
+ "epoch": 1.9314420803782506,
+ "grad_norm": 2.4462404251098633,
+ "learning_rate": 3.862556618120647e-06,
+ "loss": 0.4657,
+ "step": 4085
+ },
+ {
+ "epoch": 1.9319148936170212,
+ "grad_norm": 2.460864305496216,
+ "learning_rate": 3.862033545867238e-06,
+ "loss": 0.517,
+ "step": 4086
+ },
+ {
+ "epoch": 1.932387706855792,
+ "grad_norm": 2.6480276584625244,
+ "learning_rate": 3.8615103888059715e-06,
+ "loss": 0.4702,
+ "step": 4087
+ },
+ {
+ "epoch": 1.9328605200945628,
+ "grad_norm": 2.7175381183624268,
+ "learning_rate": 3.860987146969424e-06,
+ "loss": 0.5073,
+ "step": 4088
+ },
+ {
+ "epoch": 1.9333333333333333,
+ "grad_norm": 2.4963486194610596,
+ "learning_rate": 3.860463820390175e-06,
+ "loss": 0.4491,
+ "step": 4089
+ },
+ {
+ "epoch": 1.933806146572104,
+ "grad_norm": 2.548135757446289,
+ "learning_rate": 3.8599404091008075e-06,
+ "loss": 0.5134,
+ "step": 4090
+ },
+ {
+ "epoch": 1.9342789598108747,
+ "grad_norm": 2.8693668842315674,
+ "learning_rate": 3.859416913133916e-06,
+ "loss": 0.5467,
+ "step": 4091
+ },
+ {
+ "epoch": 1.9347517730496455,
+ "grad_norm": 2.711273670196533,
+ "learning_rate": 3.858893332522092e-06,
+ "loss": 0.6287,
+ "step": 4092
+ },
+ {
+ "epoch": 1.935224586288416,
+ "grad_norm": 2.8604533672332764,
+ "learning_rate": 3.858369667297941e-06,
+ "loss": 0.5661,
+ "step": 4093
+ },
+ {
+ "epoch": 1.9356973995271867,
+ "grad_norm": 2.936988353729248,
+ "learning_rate": 3.857845917494066e-06,
+ "loss": 0.5311,
+ "step": 4094
+ },
+ {
+ "epoch": 1.9361702127659575,
+ "grad_norm": 2.414093494415283,
+ "learning_rate": 3.857322083143079e-06,
+ "loss": 0.505,
+ "step": 4095
+ },
+ {
+ "epoch": 1.9366430260047283,
+ "grad_norm": 2.5528934001922607,
+ "learning_rate": 3.856798164277599e-06,
+ "loss": 0.4759,
+ "step": 4096
+ },
+ {
+ "epoch": 1.9371158392434988,
+ "grad_norm": 2.592893600463867,
+ "learning_rate": 3.8562741609302456e-06,
+ "loss": 0.4932,
+ "step": 4097
+ },
+ {
+ "epoch": 1.9375886524822694,
+ "grad_norm": 2.9619107246398926,
+ "learning_rate": 3.855750073133648e-06,
+ "loss": 0.5563,
+ "step": 4098
+ },
+ {
+ "epoch": 1.9380614657210402,
+ "grad_norm": 2.864889621734619,
+ "learning_rate": 3.855225900920438e-06,
+ "loss": 0.5069,
+ "step": 4099
+ },
+ {
+ "epoch": 1.938534278959811,
+ "grad_norm": 2.3951032161712646,
+ "learning_rate": 3.854701644323253e-06,
+ "loss": 0.4883,
+ "step": 4100
+ },
+ {
+ "epoch": 1.9390070921985816,
+ "grad_norm": 2.6339633464813232,
+ "learning_rate": 3.854177303374737e-06,
+ "loss": 0.5207,
+ "step": 4101
+ },
+ {
+ "epoch": 1.9394799054373522,
+ "grad_norm": 2.6435508728027344,
+ "learning_rate": 3.853652878107539e-06,
+ "loss": 0.4679,
+ "step": 4102
+ },
+ {
+ "epoch": 1.939952718676123,
+ "grad_norm": 2.4635629653930664,
+ "learning_rate": 3.853128368554311e-06,
+ "loss": 0.5639,
+ "step": 4103
+ },
+ {
+ "epoch": 1.9404255319148938,
+ "grad_norm": 2.664635419845581,
+ "learning_rate": 3.852603774747714e-06,
+ "loss": 0.5697,
+ "step": 4104
+ },
+ {
+ "epoch": 1.9408983451536643,
+ "grad_norm": 2.7020363807678223,
+ "learning_rate": 3.8520790967204095e-06,
+ "loss": 0.5462,
+ "step": 4105
+ },
+ {
+ "epoch": 1.941371158392435,
+ "grad_norm": 3.529282331466675,
+ "learning_rate": 3.851554334505069e-06,
+ "loss": 0.54,
+ "step": 4106
+ },
+ {
+ "epoch": 1.9418439716312057,
+ "grad_norm": 2.7125768661499023,
+ "learning_rate": 3.851029488134367e-06,
+ "loss": 0.5355,
+ "step": 4107
+ },
+ {
+ "epoch": 1.9423167848699765,
+ "grad_norm": 2.5226643085479736,
+ "learning_rate": 3.850504557640981e-06,
+ "loss": 0.5106,
+ "step": 4108
+ },
+ {
+ "epoch": 1.942789598108747,
+ "grad_norm": 2.834352731704712,
+ "learning_rate": 3.8499795430575995e-06,
+ "loss": 0.6069,
+ "step": 4109
+ },
+ {
+ "epoch": 1.9432624113475176,
+ "grad_norm": 2.8484177589416504,
+ "learning_rate": 3.849454444416911e-06,
+ "loss": 0.5542,
+ "step": 4110
+ },
+ {
+ "epoch": 1.9437352245862884,
+ "grad_norm": 2.402539014816284,
+ "learning_rate": 3.848929261751612e-06,
+ "loss": 0.47,
+ "step": 4111
+ },
+ {
+ "epoch": 1.9442080378250592,
+ "grad_norm": 2.7010042667388916,
+ "learning_rate": 3.848403995094402e-06,
+ "loss": 0.5263,
+ "step": 4112
+ },
+ {
+ "epoch": 1.9446808510638298,
+ "grad_norm": 2.441689968109131,
+ "learning_rate": 3.847878644477988e-06,
+ "loss": 0.5607,
+ "step": 4113
+ },
+ {
+ "epoch": 1.9451536643026004,
+ "grad_norm": 2.5994722843170166,
+ "learning_rate": 3.847353209935081e-06,
+ "loss": 0.5103,
+ "step": 4114
+ },
+ {
+ "epoch": 1.9456264775413712,
+ "grad_norm": 2.452242136001587,
+ "learning_rate": 3.8468276914983975e-06,
+ "loss": 0.4409,
+ "step": 4115
+ },
+ {
+ "epoch": 1.946099290780142,
+ "grad_norm": 2.421023368835449,
+ "learning_rate": 3.84630208920066e-06,
+ "loss": 0.4429,
+ "step": 4116
+ },
+ {
+ "epoch": 1.9465721040189126,
+ "grad_norm": 2.696399688720703,
+ "learning_rate": 3.8457764030745945e-06,
+ "loss": 0.5352,
+ "step": 4117
+ },
+ {
+ "epoch": 1.9470449172576831,
+ "grad_norm": 2.3963489532470703,
+ "learning_rate": 3.845250633152933e-06,
+ "loss": 0.4505,
+ "step": 4118
+ },
+ {
+ "epoch": 1.947517730496454,
+ "grad_norm": 2.610649585723877,
+ "learning_rate": 3.8447247794684135e-06,
+ "loss": 0.501,
+ "step": 4119
+ },
+ {
+ "epoch": 1.9479905437352247,
+ "grad_norm": 2.740412712097168,
+ "learning_rate": 3.8441988420537775e-06,
+ "loss": 0.5362,
+ "step": 4120
+ },
+ {
+ "epoch": 1.9484633569739953,
+ "grad_norm": 2.2614004611968994,
+ "learning_rate": 3.8436728209417755e-06,
+ "loss": 0.4199,
+ "step": 4121
+ },
+ {
+ "epoch": 1.9489361702127659,
+ "grad_norm": 3.0683481693267822,
+ "learning_rate": 3.843146716165158e-06,
+ "loss": 0.5248,
+ "step": 4122
+ },
+ {
+ "epoch": 1.9494089834515367,
+ "grad_norm": 3.005174398422241,
+ "learning_rate": 3.842620527756684e-06,
+ "loss": 0.5246,
+ "step": 4123
+ },
+ {
+ "epoch": 1.9498817966903075,
+ "grad_norm": 2.672896385192871,
+ "learning_rate": 3.842094255749117e-06,
+ "loss": 0.5586,
+ "step": 4124
+ },
+ {
+ "epoch": 1.950354609929078,
+ "grad_norm": 2.5481197834014893,
+ "learning_rate": 3.8415679001752255e-06,
+ "loss": 0.5061,
+ "step": 4125
+ },
+ {
+ "epoch": 1.9508274231678486,
+ "grad_norm": 2.515789270401001,
+ "learning_rate": 3.8410414610677835e-06,
+ "loss": 0.4645,
+ "step": 4126
+ },
+ {
+ "epoch": 1.9513002364066194,
+ "grad_norm": 2.7236077785491943,
+ "learning_rate": 3.84051493845957e-06,
+ "loss": 0.5623,
+ "step": 4127
+ },
+ {
+ "epoch": 1.9517730496453902,
+ "grad_norm": 2.6252009868621826,
+ "learning_rate": 3.839988332383369e-06,
+ "loss": 0.5078,
+ "step": 4128
+ },
+ {
+ "epoch": 1.9522458628841608,
+ "grad_norm": 2.719196081161499,
+ "learning_rate": 3.83946164287197e-06,
+ "loss": 0.5481,
+ "step": 4129
+ },
+ {
+ "epoch": 1.9527186761229314,
+ "grad_norm": 2.484163284301758,
+ "learning_rate": 3.838934869958169e-06,
+ "loss": 0.5332,
+ "step": 4130
+ },
+ {
+ "epoch": 1.9531914893617022,
+ "grad_norm": 2.615382671356201,
+ "learning_rate": 3.838408013674764e-06,
+ "loss": 0.4742,
+ "step": 4131
+ },
+ {
+ "epoch": 1.953664302600473,
+ "grad_norm": 2.735321044921875,
+ "learning_rate": 3.83788107405456e-06,
+ "loss": 0.421,
+ "step": 4132
+ },
+ {
+ "epoch": 1.9541371158392435,
+ "grad_norm": 2.892652750015259,
+ "learning_rate": 3.837354051130369e-06,
+ "loss": 0.5326,
+ "step": 4133
+ },
+ {
+ "epoch": 1.9546099290780141,
+ "grad_norm": 2.6800546646118164,
+ "learning_rate": 3.8368269449350055e-06,
+ "loss": 0.5041,
+ "step": 4134
+ },
+ {
+ "epoch": 1.955082742316785,
+ "grad_norm": 2.362470865249634,
+ "learning_rate": 3.836299755501289e-06,
+ "loss": 0.4697,
+ "step": 4135
+ },
+ {
+ "epoch": 1.9555555555555557,
+ "grad_norm": 2.3855135440826416,
+ "learning_rate": 3.835772482862047e-06,
+ "loss": 0.5148,
+ "step": 4136
+ },
+ {
+ "epoch": 1.9560283687943263,
+ "grad_norm": 2.3338418006896973,
+ "learning_rate": 3.83524512705011e-06,
+ "loss": 0.4643,
+ "step": 4137
+ },
+ {
+ "epoch": 1.9565011820330969,
+ "grad_norm": 2.261355400085449,
+ "learning_rate": 3.834717688098313e-06,
+ "loss": 0.5573,
+ "step": 4138
+ },
+ {
+ "epoch": 1.9569739952718677,
+ "grad_norm": 2.8166391849517822,
+ "learning_rate": 3.834190166039498e-06,
+ "loss": 0.4868,
+ "step": 4139
+ },
+ {
+ "epoch": 1.9574468085106385,
+ "grad_norm": 2.4155869483947754,
+ "learning_rate": 3.833662560906512e-06,
+ "loss": 0.4923,
+ "step": 4140
+ },
+ {
+ "epoch": 1.957919621749409,
+ "grad_norm": 2.3977696895599365,
+ "learning_rate": 3.833134872732206e-06,
+ "loss": 0.5106,
+ "step": 4141
+ },
+ {
+ "epoch": 1.9583924349881796,
+ "grad_norm": 2.9541378021240234,
+ "learning_rate": 3.832607101549438e-06,
+ "loss": 0.4683,
+ "step": 4142
+ },
+ {
+ "epoch": 1.9588652482269504,
+ "grad_norm": 2.5862700939178467,
+ "learning_rate": 3.832079247391068e-06,
+ "loss": 0.4453,
+ "step": 4143
+ },
+ {
+ "epoch": 1.9593380614657212,
+ "grad_norm": 2.7459371089935303,
+ "learning_rate": 3.8315513102899644e-06,
+ "loss": 0.5511,
+ "step": 4144
+ },
+ {
+ "epoch": 1.9598108747044918,
+ "grad_norm": 2.904869556427002,
+ "learning_rate": 3.831023290279e-06,
+ "loss": 0.5348,
+ "step": 4145
+ },
+ {
+ "epoch": 1.9602836879432624,
+ "grad_norm": 3.092846632003784,
+ "learning_rate": 3.830495187391051e-06,
+ "loss": 0.5664,
+ "step": 4146
+ },
+ {
+ "epoch": 1.9607565011820332,
+ "grad_norm": 3.2838528156280518,
+ "learning_rate": 3.829967001659001e-06,
+ "loss": 0.5115,
+ "step": 4147
+ },
+ {
+ "epoch": 1.961229314420804,
+ "grad_norm": 2.7799549102783203,
+ "learning_rate": 3.829438733115738e-06,
+ "loss": 0.5145,
+ "step": 4148
+ },
+ {
+ "epoch": 1.9617021276595743,
+ "grad_norm": 2.436084270477295,
+ "learning_rate": 3.828910381794154e-06,
+ "loss": 0.4718,
+ "step": 4149
+ },
+ {
+ "epoch": 1.962174940898345,
+ "grad_norm": 2.6662371158599854,
+ "learning_rate": 3.828381947727148e-06,
+ "loss": 0.6129,
+ "step": 4150
+ },
+ {
+ "epoch": 1.962647754137116,
+ "grad_norm": 2.937000036239624,
+ "learning_rate": 3.827853430947622e-06,
+ "loss": 0.522,
+ "step": 4151
+ },
+ {
+ "epoch": 1.9631205673758865,
+ "grad_norm": 2.5737369060516357,
+ "learning_rate": 3.827324831488486e-06,
+ "loss": 0.4916,
+ "step": 4152
+ },
+ {
+ "epoch": 1.963593380614657,
+ "grad_norm": 2.70232892036438,
+ "learning_rate": 3.826796149382653e-06,
+ "loss": 0.4726,
+ "step": 4153
+ },
+ {
+ "epoch": 1.9640661938534278,
+ "grad_norm": 2.6899707317352295,
+ "learning_rate": 3.826267384663042e-06,
+ "loss": 0.529,
+ "step": 4154
+ },
+ {
+ "epoch": 1.9645390070921986,
+ "grad_norm": 2.6142728328704834,
+ "learning_rate": 3.825738537362575e-06,
+ "loss": 0.4999,
+ "step": 4155
+ },
+ {
+ "epoch": 1.9650118203309692,
+ "grad_norm": 2.43949818611145,
+ "learning_rate": 3.825209607514183e-06,
+ "loss": 0.5035,
+ "step": 4156
+ },
+ {
+ "epoch": 1.9654846335697398,
+ "grad_norm": 2.3735458850860596,
+ "learning_rate": 3.824680595150801e-06,
+ "loss": 0.4779,
+ "step": 4157
+ },
+ {
+ "epoch": 1.9659574468085106,
+ "grad_norm": 2.444307565689087,
+ "learning_rate": 3.824151500305365e-06,
+ "loss": 0.4825,
+ "step": 4158
+ },
+ {
+ "epoch": 1.9664302600472814,
+ "grad_norm": 2.8219668865203857,
+ "learning_rate": 3.8236223230108224e-06,
+ "loss": 0.5354,
+ "step": 4159
+ },
+ {
+ "epoch": 1.966903073286052,
+ "grad_norm": 2.720721483230591,
+ "learning_rate": 3.823093063300121e-06,
+ "loss": 0.5064,
+ "step": 4160
+ },
+ {
+ "epoch": 1.9673758865248225,
+ "grad_norm": 2.324190616607666,
+ "learning_rate": 3.822563721206217e-06,
+ "loss": 0.5348,
+ "step": 4161
+ },
+ {
+ "epoch": 1.9678486997635933,
+ "grad_norm": 2.702155351638794,
+ "learning_rate": 3.8220342967620695e-06,
+ "loss": 0.5388,
+ "step": 4162
+ },
+ {
+ "epoch": 1.9683215130023641,
+ "grad_norm": 2.4956369400024414,
+ "learning_rate": 3.821504790000642e-06,
+ "loss": 0.5071,
+ "step": 4163
+ },
+ {
+ "epoch": 1.9687943262411347,
+ "grad_norm": 2.568039655685425,
+ "learning_rate": 3.820975200954906e-06,
+ "loss": 0.5133,
+ "step": 4164
+ },
+ {
+ "epoch": 1.9692671394799053,
+ "grad_norm": 2.810868978500366,
+ "learning_rate": 3.820445529657837e-06,
+ "loss": 0.4856,
+ "step": 4165
+ },
+ {
+ "epoch": 1.969739952718676,
+ "grad_norm": 2.66365647315979,
+ "learning_rate": 3.819915776142415e-06,
+ "loss": 0.5235,
+ "step": 4166
+ },
+ {
+ "epoch": 1.9702127659574469,
+ "grad_norm": 2.2982139587402344,
+ "learning_rate": 3.8193859404416265e-06,
+ "loss": 0.4361,
+ "step": 4167
+ },
+ {
+ "epoch": 1.9706855791962175,
+ "grad_norm": 2.585672378540039,
+ "learning_rate": 3.818856022588458e-06,
+ "loss": 0.4842,
+ "step": 4168
+ },
+ {
+ "epoch": 1.971158392434988,
+ "grad_norm": 2.57857346534729,
+ "learning_rate": 3.81832602261591e-06,
+ "loss": 0.5249,
+ "step": 4169
+ },
+ {
+ "epoch": 1.9716312056737588,
+ "grad_norm": 2.6947224140167236,
+ "learning_rate": 3.817795940556981e-06,
+ "loss": 0.5234,
+ "step": 4170
+ },
+ {
+ "epoch": 1.9721040189125296,
+ "grad_norm": 2.7453415393829346,
+ "learning_rate": 3.8172657764446764e-06,
+ "loss": 0.5219,
+ "step": 4171
+ },
+ {
+ "epoch": 1.9725768321513002,
+ "grad_norm": 8.424073219299316,
+ "learning_rate": 3.816735530312009e-06,
+ "loss": 0.5162,
+ "step": 4172
+ },
+ {
+ "epoch": 1.9730496453900708,
+ "grad_norm": 2.8229739665985107,
+ "learning_rate": 3.816205202191993e-06,
+ "loss": 0.4621,
+ "step": 4173
+ },
+ {
+ "epoch": 1.9735224586288416,
+ "grad_norm": 2.5969009399414062,
+ "learning_rate": 3.815674792117651e-06,
+ "loss": 0.5044,
+ "step": 4174
+ },
+ {
+ "epoch": 1.9739952718676124,
+ "grad_norm": 2.646024227142334,
+ "learning_rate": 3.815144300122009e-06,
+ "loss": 0.5094,
+ "step": 4175
+ },
+ {
+ "epoch": 1.974468085106383,
+ "grad_norm": 2.4950616359710693,
+ "learning_rate": 3.814613726238097e-06,
+ "loss": 0.4827,
+ "step": 4176
+ },
+ {
+ "epoch": 1.9749408983451535,
+ "grad_norm": 2.5636119842529297,
+ "learning_rate": 3.8140830704989535e-06,
+ "loss": 0.5241,
+ "step": 4177
+ },
+ {
+ "epoch": 1.9754137115839243,
+ "grad_norm": 2.7936553955078125,
+ "learning_rate": 3.813552332937619e-06,
+ "loss": 0.5344,
+ "step": 4178
+ },
+ {
+ "epoch": 1.9758865248226951,
+ "grad_norm": 2.8085341453552246,
+ "learning_rate": 3.8130215135871405e-06,
+ "loss": 0.5647,
+ "step": 4179
+ },
+ {
+ "epoch": 1.9763593380614657,
+ "grad_norm": 2.4776322841644287,
+ "learning_rate": 3.8124906124805694e-06,
+ "loss": 0.542,
+ "step": 4180
+ },
+ {
+ "epoch": 1.9768321513002363,
+ "grad_norm": 2.3227856159210205,
+ "learning_rate": 3.8119596296509635e-06,
+ "loss": 0.4618,
+ "step": 4181
+ },
+ {
+ "epoch": 1.977304964539007,
+ "grad_norm": 2.5157814025878906,
+ "learning_rate": 3.8114285651313848e-06,
+ "loss": 0.538,
+ "step": 4182
+ },
+ {
+ "epoch": 1.9777777777777779,
+ "grad_norm": 2.5630218982696533,
+ "learning_rate": 3.8108974189548987e-06,
+ "loss": 0.5254,
+ "step": 4183
+ },
+ {
+ "epoch": 1.9782505910165484,
+ "grad_norm": 2.703237533569336,
+ "learning_rate": 3.8103661911545787e-06,
+ "loss": 0.4859,
+ "step": 4184
+ },
+ {
+ "epoch": 1.978723404255319,
+ "grad_norm": 2.8808000087738037,
+ "learning_rate": 3.809834881763502e-06,
+ "loss": 0.5585,
+ "step": 4185
+ },
+ {
+ "epoch": 1.9791962174940898,
+ "grad_norm": 2.9047577381134033,
+ "learning_rate": 3.8093034908147507e-06,
+ "loss": 0.5022,
+ "step": 4186
+ },
+ {
+ "epoch": 1.9796690307328606,
+ "grad_norm": 2.7417640686035156,
+ "learning_rate": 3.8087720183414125e-06,
+ "loss": 0.5275,
+ "step": 4187
+ },
+ {
+ "epoch": 1.9801418439716312,
+ "grad_norm": 2.952012062072754,
+ "learning_rate": 3.8082404643765786e-06,
+ "loss": 0.543,
+ "step": 4188
+ },
+ {
+ "epoch": 1.9806146572104018,
+ "grad_norm": 2.538376569747925,
+ "learning_rate": 3.807708828953348e-06,
+ "loss": 0.4969,
+ "step": 4189
+ },
+ {
+ "epoch": 1.9810874704491725,
+ "grad_norm": 2.3476181030273438,
+ "learning_rate": 3.807177112104823e-06,
+ "loss": 0.4979,
+ "step": 4190
+ },
+ {
+ "epoch": 1.9815602836879433,
+ "grad_norm": 2.6480464935302734,
+ "learning_rate": 3.80664531386411e-06,
+ "loss": 0.4894,
+ "step": 4191
+ },
+ {
+ "epoch": 1.982033096926714,
+ "grad_norm": 2.792916774749756,
+ "learning_rate": 3.8061134342643235e-06,
+ "loss": 0.5468,
+ "step": 4192
+ },
+ {
+ "epoch": 1.9825059101654845,
+ "grad_norm": 2.368736743927002,
+ "learning_rate": 3.805581473338581e-06,
+ "loss": 0.4672,
+ "step": 4193
+ },
+ {
+ "epoch": 1.9829787234042553,
+ "grad_norm": 2.379084348678589,
+ "learning_rate": 3.8050494311200037e-06,
+ "loss": 0.4577,
+ "step": 4194
+ },
+ {
+ "epoch": 1.983451536643026,
+ "grad_norm": 2.722471237182617,
+ "learning_rate": 3.804517307641722e-06,
+ "loss": 0.4988,
+ "step": 4195
+ },
+ {
+ "epoch": 1.9839243498817967,
+ "grad_norm": 2.356649875640869,
+ "learning_rate": 3.8039851029368674e-06,
+ "loss": 0.4933,
+ "step": 4196
+ },
+ {
+ "epoch": 1.9843971631205672,
+ "grad_norm": 2.9182281494140625,
+ "learning_rate": 3.8034528170385776e-06,
+ "loss": 0.4873,
+ "step": 4197
+ },
+ {
+ "epoch": 1.984869976359338,
+ "grad_norm": 2.6232199668884277,
+ "learning_rate": 3.8029204499799976e-06,
+ "loss": 0.4425,
+ "step": 4198
+ },
+ {
+ "epoch": 1.9853427895981088,
+ "grad_norm": 2.667541980743408,
+ "learning_rate": 3.802388001794274e-06,
+ "loss": 0.5022,
+ "step": 4199
+ },
+ {
+ "epoch": 1.9858156028368794,
+ "grad_norm": 3.168470621109009,
+ "learning_rate": 3.8018554725145596e-06,
+ "loss": 0.5505,
+ "step": 4200
+ },
+ {
+ "epoch": 1.98628841607565,
+ "grad_norm": 2.716625452041626,
+ "learning_rate": 3.8013228621740132e-06,
+ "loss": 0.4937,
+ "step": 4201
+ },
+ {
+ "epoch": 1.9867612293144208,
+ "grad_norm": 2.3014442920684814,
+ "learning_rate": 3.800790170805799e-06,
+ "loss": 0.4734,
+ "step": 4202
+ },
+ {
+ "epoch": 1.9872340425531916,
+ "grad_norm": 2.9426841735839844,
+ "learning_rate": 3.8002573984430847e-06,
+ "loss": 0.4983,
+ "step": 4203
+ },
+ {
+ "epoch": 1.9877068557919622,
+ "grad_norm": 2.5598278045654297,
+ "learning_rate": 3.7997245451190435e-06,
+ "loss": 0.4834,
+ "step": 4204
+ },
+ {
+ "epoch": 1.9881796690307327,
+ "grad_norm": 2.86458420753479,
+ "learning_rate": 3.7991916108668538e-06,
+ "loss": 0.5613,
+ "step": 4205
+ },
+ {
+ "epoch": 1.9886524822695035,
+ "grad_norm": 2.842914342880249,
+ "learning_rate": 3.7986585957196997e-06,
+ "loss": 0.4951,
+ "step": 4206
+ },
+ {
+ "epoch": 1.9891252955082743,
+ "grad_norm": 3.1828150749206543,
+ "learning_rate": 3.7981254997107686e-06,
+ "loss": 0.5913,
+ "step": 4207
+ },
+ {
+ "epoch": 1.989598108747045,
+ "grad_norm": 2.5765931606292725,
+ "learning_rate": 3.7975923228732547e-06,
+ "loss": 0.5544,
+ "step": 4208
+ },
+ {
+ "epoch": 1.9900709219858155,
+ "grad_norm": 2.492234945297241,
+ "learning_rate": 3.797059065240357e-06,
+ "loss": 0.5046,
+ "step": 4209
+ },
+ {
+ "epoch": 1.9905437352245863,
+ "grad_norm": 2.870346784591675,
+ "learning_rate": 3.7965257268452795e-06,
+ "loss": 0.5354,
+ "step": 4210
+ },
+ {
+ "epoch": 1.991016548463357,
+ "grad_norm": 2.4989993572235107,
+ "learning_rate": 3.795992307721229e-06,
+ "loss": 0.4677,
+ "step": 4211
+ },
+ {
+ "epoch": 1.9914893617021276,
+ "grad_norm": 2.931114673614502,
+ "learning_rate": 3.7954588079014206e-06,
+ "loss": 0.5504,
+ "step": 4212
+ },
+ {
+ "epoch": 1.9919621749408982,
+ "grad_norm": 2.5247652530670166,
+ "learning_rate": 3.794925227419073e-06,
+ "loss": 0.4736,
+ "step": 4213
+ },
+ {
+ "epoch": 1.992434988179669,
+ "grad_norm": 2.6238436698913574,
+ "learning_rate": 3.794391566307409e-06,
+ "loss": 0.4591,
+ "step": 4214
+ },
+ {
+ "epoch": 1.9929078014184398,
+ "grad_norm": 2.654886245727539,
+ "learning_rate": 3.7938578245996584e-06,
+ "loss": 0.5149,
+ "step": 4215
+ },
+ {
+ "epoch": 1.9933806146572104,
+ "grad_norm": 2.509164810180664,
+ "learning_rate": 3.793324002329054e-06,
+ "loss": 0.4951,
+ "step": 4216
+ },
+ {
+ "epoch": 1.993853427895981,
+ "grad_norm": 2.909632921218872,
+ "learning_rate": 3.7927900995288345e-06,
+ "loss": 0.5131,
+ "step": 4217
+ },
+ {
+ "epoch": 1.9943262411347518,
+ "grad_norm": 2.4354615211486816,
+ "learning_rate": 3.7922561162322456e-06,
+ "loss": 0.4716,
+ "step": 4218
+ },
+ {
+ "epoch": 1.9947990543735226,
+ "grad_norm": 2.6514649391174316,
+ "learning_rate": 3.791722052472534e-06,
+ "loss": 0.5714,
+ "step": 4219
+ },
+ {
+ "epoch": 1.9952718676122931,
+ "grad_norm": 2.77089262008667,
+ "learning_rate": 3.791187908282954e-06,
+ "loss": 0.5736,
+ "step": 4220
+ },
+ {
+ "epoch": 1.9957446808510637,
+ "grad_norm": 2.7651021480560303,
+ "learning_rate": 3.7906536836967657e-06,
+ "loss": 0.4948,
+ "step": 4221
+ },
+ {
+ "epoch": 1.9962174940898345,
+ "grad_norm": 2.7536795139312744,
+ "learning_rate": 3.7901193787472306e-06,
+ "loss": 0.512,
+ "step": 4222
+ },
+ {
+ "epoch": 1.9966903073286053,
+ "grad_norm": 2.684893846511841,
+ "learning_rate": 3.78958499346762e-06,
+ "loss": 0.5118,
+ "step": 4223
+ },
+ {
+ "epoch": 1.9971631205673759,
+ "grad_norm": 2.7616753578186035,
+ "learning_rate": 3.7890505278912054e-06,
+ "loss": 0.4516,
+ "step": 4224
+ },
+ {
+ "epoch": 1.9976359338061465,
+ "grad_norm": 2.4731967449188232,
+ "learning_rate": 3.7885159820512666e-06,
+ "loss": 0.4736,
+ "step": 4225
+ },
+ {
+ "epoch": 1.9981087470449173,
+ "grad_norm": 2.366631031036377,
+ "learning_rate": 3.7879813559810884e-06,
+ "loss": 0.4999,
+ "step": 4226
+ },
+ {
+ "epoch": 1.998581560283688,
+ "grad_norm": 2.994624137878418,
+ "learning_rate": 3.7874466497139582e-06,
+ "loss": 0.5273,
+ "step": 4227
+ },
+ {
+ "epoch": 1.9990543735224586,
+ "grad_norm": 2.4499242305755615,
+ "learning_rate": 3.7869118632831712e-06,
+ "loss": 0.5761,
+ "step": 4228
+ },
+ {
+ "epoch": 1.9995271867612292,
+ "grad_norm": 2.3370113372802734,
+ "learning_rate": 3.7863769967220243e-06,
+ "loss": 0.4673,
+ "step": 4229
+ },
+ {
+ "epoch": 2.0,
+ "grad_norm": 3.1131203174591064,
+ "learning_rate": 3.7858420500638236e-06,
+ "loss": 0.5118,
+ "step": 4230
+ },
+ {
+ "epoch": 2.000472813238771,
+ "grad_norm": 2.2747561931610107,
+ "learning_rate": 3.785307023341876e-06,
+ "loss": 0.4166,
+ "step": 4231
+ },
+ {
+ "epoch": 2.000945626477541,
+ "grad_norm": 2.4347424507141113,
+ "learning_rate": 3.7847719165894963e-06,
+ "loss": 0.4161,
+ "step": 4232
+ },
+ {
+ "epoch": 2.001418439716312,
+ "grad_norm": 2.398805618286133,
+ "learning_rate": 3.784236729840003e-06,
+ "loss": 0.4652,
+ "step": 4233
+ },
+ {
+ "epoch": 2.0018912529550827,
+ "grad_norm": 2.1904916763305664,
+ "learning_rate": 3.783701463126719e-06,
+ "loss": 0.4554,
+ "step": 4234
+ },
+ {
+ "epoch": 2.0023640661938535,
+ "grad_norm": 2.237330913543701,
+ "learning_rate": 3.7831661164829735e-06,
+ "loss": 0.4471,
+ "step": 4235
+ },
+ {
+ "epoch": 2.002836879432624,
+ "grad_norm": 2.3656628131866455,
+ "learning_rate": 3.7826306899421016e-06,
+ "loss": 0.4052,
+ "step": 4236
+ },
+ {
+ "epoch": 2.0033096926713947,
+ "grad_norm": 2.615489959716797,
+ "learning_rate": 3.7820951835374405e-06,
+ "loss": 0.4847,
+ "step": 4237
+ },
+ {
+ "epoch": 2.0037825059101655,
+ "grad_norm": 2.453036308288574,
+ "learning_rate": 3.7815595973023347e-06,
+ "loss": 0.4672,
+ "step": 4238
+ },
+ {
+ "epoch": 2.0042553191489363,
+ "grad_norm": 2.537468671798706,
+ "learning_rate": 3.7810239312701306e-06,
+ "loss": 0.467,
+ "step": 4239
+ },
+ {
+ "epoch": 2.0047281323877066,
+ "grad_norm": 2.3321666717529297,
+ "learning_rate": 3.780488185474184e-06,
+ "loss": 0.3557,
+ "step": 4240
+ },
+ {
+ "epoch": 2.0052009456264774,
+ "grad_norm": 2.9051828384399414,
+ "learning_rate": 3.779952359947854e-06,
+ "loss": 0.5474,
+ "step": 4241
+ },
+ {
+ "epoch": 2.0056737588652482,
+ "grad_norm": 2.7458817958831787,
+ "learning_rate": 3.7794164547245015e-06,
+ "loss": 0.4659,
+ "step": 4242
+ },
+ {
+ "epoch": 2.006146572104019,
+ "grad_norm": 2.627046585083008,
+ "learning_rate": 3.778880469837497e-06,
+ "loss": 0.4179,
+ "step": 4243
+ },
+ {
+ "epoch": 2.0066193853427894,
+ "grad_norm": 2.4186174869537354,
+ "learning_rate": 3.7783444053202135e-06,
+ "loss": 0.3976,
+ "step": 4244
+ },
+ {
+ "epoch": 2.00709219858156,
+ "grad_norm": 3.109376907348633,
+ "learning_rate": 3.7778082612060296e-06,
+ "loss": 0.4095,
+ "step": 4245
+ },
+ {
+ "epoch": 2.007565011820331,
+ "grad_norm": 2.583376169204712,
+ "learning_rate": 3.7772720375283282e-06,
+ "loss": 0.4325,
+ "step": 4246
+ },
+ {
+ "epoch": 2.0080378250591018,
+ "grad_norm": 2.6199896335601807,
+ "learning_rate": 3.776735734320497e-06,
+ "loss": 0.4207,
+ "step": 4247
+ },
+ {
+ "epoch": 2.008510638297872,
+ "grad_norm": 2.545353651046753,
+ "learning_rate": 3.77619935161593e-06,
+ "loss": 0.4483,
+ "step": 4248
+ },
+ {
+ "epoch": 2.008983451536643,
+ "grad_norm": 2.770266056060791,
+ "learning_rate": 3.7756628894480263e-06,
+ "loss": 0.457,
+ "step": 4249
+ },
+ {
+ "epoch": 2.0094562647754137,
+ "grad_norm": 2.903254985809326,
+ "learning_rate": 3.7751263478501878e-06,
+ "loss": 0.4171,
+ "step": 4250
+ },
+ {
+ "epoch": 2.0099290780141845,
+ "grad_norm": 2.5576963424682617,
+ "learning_rate": 3.774589726855822e-06,
+ "loss": 0.3631,
+ "step": 4251
+ },
+ {
+ "epoch": 2.010401891252955,
+ "grad_norm": 3.7584285736083984,
+ "learning_rate": 3.7740530264983434e-06,
+ "loss": 0.4827,
+ "step": 4252
+ },
+ {
+ "epoch": 2.0108747044917257,
+ "grad_norm": 3.3116581439971924,
+ "learning_rate": 3.77351624681117e-06,
+ "loss": 0.5071,
+ "step": 4253
+ },
+ {
+ "epoch": 2.0113475177304965,
+ "grad_norm": 3.1370885372161865,
+ "learning_rate": 3.772979387827723e-06,
+ "loss": 0.4963,
+ "step": 4254
+ },
+ {
+ "epoch": 2.0118203309692673,
+ "grad_norm": 2.4832639694213867,
+ "learning_rate": 3.772442449581432e-06,
+ "loss": 0.4442,
+ "step": 4255
+ },
+ {
+ "epoch": 2.0122931442080376,
+ "grad_norm": 2.7645785808563232,
+ "learning_rate": 3.7719054321057293e-06,
+ "loss": 0.4572,
+ "step": 4256
+ },
+ {
+ "epoch": 2.0127659574468084,
+ "grad_norm": 2.7962236404418945,
+ "learning_rate": 3.7713683354340515e-06,
+ "loss": 0.4906,
+ "step": 4257
+ },
+ {
+ "epoch": 2.013238770685579,
+ "grad_norm": 2.647991895675659,
+ "learning_rate": 3.7708311595998425e-06,
+ "loss": 0.4027,
+ "step": 4258
+ },
+ {
+ "epoch": 2.01371158392435,
+ "grad_norm": 2.3780267238616943,
+ "learning_rate": 3.7702939046365504e-06,
+ "loss": 0.4285,
+ "step": 4259
+ },
+ {
+ "epoch": 2.0141843971631204,
+ "grad_norm": 2.5185933113098145,
+ "learning_rate": 3.7697565705776266e-06,
+ "loss": 0.4834,
+ "step": 4260
+ },
+ {
+ "epoch": 2.014657210401891,
+ "grad_norm": 2.432507276535034,
+ "learning_rate": 3.7692191574565294e-06,
+ "loss": 0.3695,
+ "step": 4261
+ },
+ {
+ "epoch": 2.015130023640662,
+ "grad_norm": 2.8010706901550293,
+ "learning_rate": 3.76868166530672e-06,
+ "loss": 0.478,
+ "step": 4262
+ },
+ {
+ "epoch": 2.0156028368794328,
+ "grad_norm": 2.32817006111145,
+ "learning_rate": 3.768144094161666e-06,
+ "loss": 0.4154,
+ "step": 4263
+ },
+ {
+ "epoch": 2.016075650118203,
+ "grad_norm": 3.062812328338623,
+ "learning_rate": 3.7676064440548405e-06,
+ "loss": 0.5015,
+ "step": 4264
+ },
+ {
+ "epoch": 2.016548463356974,
+ "grad_norm": 2.6129536628723145,
+ "learning_rate": 3.7670687150197194e-06,
+ "loss": 0.3843,
+ "step": 4265
+ },
+ {
+ "epoch": 2.0170212765957447,
+ "grad_norm": 2.838259696960449,
+ "learning_rate": 3.766530907089786e-06,
+ "loss": 0.4937,
+ "step": 4266
+ },
+ {
+ "epoch": 2.0174940898345155,
+ "grad_norm": 2.601203680038452,
+ "learning_rate": 3.7659930202985263e-06,
+ "loss": 0.4644,
+ "step": 4267
+ },
+ {
+ "epoch": 2.017966903073286,
+ "grad_norm": 2.5964133739471436,
+ "learning_rate": 3.7654550546794322e-06,
+ "loss": 0.4365,
+ "step": 4268
+ },
+ {
+ "epoch": 2.0184397163120567,
+ "grad_norm": 3.0028915405273438,
+ "learning_rate": 3.764917010266001e-06,
+ "loss": 0.434,
+ "step": 4269
+ },
+ {
+ "epoch": 2.0189125295508275,
+ "grad_norm": 2.719252586364746,
+ "learning_rate": 3.764378887091734e-06,
+ "loss": 0.4401,
+ "step": 4270
+ },
+ {
+ "epoch": 2.0193853427895982,
+ "grad_norm": 2.400254011154175,
+ "learning_rate": 3.7638406851901377e-06,
+ "loss": 0.4904,
+ "step": 4271
+ },
+ {
+ "epoch": 2.0198581560283686,
+ "grad_norm": 2.8015363216400146,
+ "learning_rate": 3.763302404594724e-06,
+ "loss": 0.4569,
+ "step": 4272
+ },
+ {
+ "epoch": 2.0203309692671394,
+ "grad_norm": 2.718416452407837,
+ "learning_rate": 3.762764045339009e-06,
+ "loss": 0.5124,
+ "step": 4273
+ },
+ {
+ "epoch": 2.02080378250591,
+ "grad_norm": 2.484049081802368,
+ "learning_rate": 3.762225607456514e-06,
+ "loss": 0.4255,
+ "step": 4274
+ },
+ {
+ "epoch": 2.021276595744681,
+ "grad_norm": 2.6377930641174316,
+ "learning_rate": 3.7616870909807645e-06,
+ "loss": 0.5044,
+ "step": 4275
+ },
+ {
+ "epoch": 2.0217494089834513,
+ "grad_norm": 2.8845038414001465,
+ "learning_rate": 3.7611484959452927e-06,
+ "loss": 0.4924,
+ "step": 4276
+ },
+ {
+ "epoch": 2.022222222222222,
+ "grad_norm": 2.5939974784851074,
+ "learning_rate": 3.7606098223836342e-06,
+ "loss": 0.4873,
+ "step": 4277
+ },
+ {
+ "epoch": 2.022695035460993,
+ "grad_norm": 2.499826431274414,
+ "learning_rate": 3.76007107032933e-06,
+ "loss": 0.4515,
+ "step": 4278
+ },
+ {
+ "epoch": 2.0231678486997637,
+ "grad_norm": 3.0318663120269775,
+ "learning_rate": 3.759532239815924e-06,
+ "loss": 0.4901,
+ "step": 4279
+ },
+ {
+ "epoch": 2.023640661938534,
+ "grad_norm": 2.857977867126465,
+ "learning_rate": 3.758993330876969e-06,
+ "loss": 0.4659,
+ "step": 4280
+ },
+ {
+ "epoch": 2.024113475177305,
+ "grad_norm": 2.47918438911438,
+ "learning_rate": 3.7584543435460196e-06,
+ "loss": 0.4323,
+ "step": 4281
+ },
+ {
+ "epoch": 2.0245862884160757,
+ "grad_norm": 2.6033785343170166,
+ "learning_rate": 3.757915277856637e-06,
+ "loss": 0.4437,
+ "step": 4282
+ },
+ {
+ "epoch": 2.0250591016548465,
+ "grad_norm": 2.799781322479248,
+ "learning_rate": 3.757376133842386e-06,
+ "loss": 0.4523,
+ "step": 4283
+ },
+ {
+ "epoch": 2.025531914893617,
+ "grad_norm": 2.6092529296875,
+ "learning_rate": 3.756836911536836e-06,
+ "loss": 0.3898,
+ "step": 4284
+ },
+ {
+ "epoch": 2.0260047281323876,
+ "grad_norm": 2.66229248046875,
+ "learning_rate": 3.7562976109735627e-06,
+ "loss": 0.4731,
+ "step": 4285
+ },
+ {
+ "epoch": 2.0264775413711584,
+ "grad_norm": 2.90142822265625,
+ "learning_rate": 3.7557582321861463e-06,
+ "loss": 0.4285,
+ "step": 4286
+ },
+ {
+ "epoch": 2.0269503546099292,
+ "grad_norm": 2.5138802528381348,
+ "learning_rate": 3.7552187752081707e-06,
+ "loss": 0.4467,
+ "step": 4287
+ },
+ {
+ "epoch": 2.0274231678486996,
+ "grad_norm": 3.0656235218048096,
+ "learning_rate": 3.754679240073226e-06,
+ "loss": 0.4718,
+ "step": 4288
+ },
+ {
+ "epoch": 2.0278959810874704,
+ "grad_norm": 2.9633383750915527,
+ "learning_rate": 3.754139626814907e-06,
+ "loss": 0.4741,
+ "step": 4289
+ },
+ {
+ "epoch": 2.028368794326241,
+ "grad_norm": 2.5925145149230957,
+ "learning_rate": 3.753599935466812e-06,
+ "loss": 0.4281,
+ "step": 4290
+ },
+ {
+ "epoch": 2.028841607565012,
+ "grad_norm": 2.837740659713745,
+ "learning_rate": 3.7530601660625456e-06,
+ "loss": 0.4757,
+ "step": 4291
+ },
+ {
+ "epoch": 2.0293144208037823,
+ "grad_norm": 2.3995790481567383,
+ "learning_rate": 3.752520318635718e-06,
+ "loss": 0.4148,
+ "step": 4292
+ },
+ {
+ "epoch": 2.029787234042553,
+ "grad_norm": 2.572601795196533,
+ "learning_rate": 3.7519803932199424e-06,
+ "loss": 0.4051,
+ "step": 4293
+ },
+ {
+ "epoch": 2.030260047281324,
+ "grad_norm": 2.6780295372009277,
+ "learning_rate": 3.751440389848837e-06,
+ "loss": 0.4626,
+ "step": 4294
+ },
+ {
+ "epoch": 2.0307328605200947,
+ "grad_norm": 2.8666839599609375,
+ "learning_rate": 3.7509003085560257e-06,
+ "loss": 0.4255,
+ "step": 4295
+ },
+ {
+ "epoch": 2.031205673758865,
+ "grad_norm": 2.4398207664489746,
+ "learning_rate": 3.750360149375138e-06,
+ "loss": 0.4235,
+ "step": 4296
+ },
+ {
+ "epoch": 2.031678486997636,
+ "grad_norm": 2.436840534210205,
+ "learning_rate": 3.7498199123398062e-06,
+ "loss": 0.3907,
+ "step": 4297
+ },
+ {
+ "epoch": 2.0321513002364067,
+ "grad_norm": 3.3945820331573486,
+ "learning_rate": 3.7492795974836683e-06,
+ "loss": 0.465,
+ "step": 4298
+ },
+ {
+ "epoch": 2.0326241134751775,
+ "grad_norm": 2.6693103313446045,
+ "learning_rate": 3.7487392048403678e-06,
+ "loss": 0.4948,
+ "step": 4299
+ },
+ {
+ "epoch": 2.033096926713948,
+ "grad_norm": 2.7642734050750732,
+ "learning_rate": 3.748198734443553e-06,
+ "loss": 0.4538,
+ "step": 4300
+ },
+ {
+ "epoch": 2.0335697399527186,
+ "grad_norm": 3.1436543464660645,
+ "learning_rate": 3.747658186326876e-06,
+ "loss": 0.5137,
+ "step": 4301
+ },
+ {
+ "epoch": 2.0340425531914894,
+ "grad_norm": 3.482678174972534,
+ "learning_rate": 3.7471175605239947e-06,
+ "loss": 0.4982,
+ "step": 4302
+ },
+ {
+ "epoch": 2.03451536643026,
+ "grad_norm": 2.712557077407837,
+ "learning_rate": 3.746576857068571e-06,
+ "loss": 0.4459,
+ "step": 4303
+ },
+ {
+ "epoch": 2.0349881796690306,
+ "grad_norm": 3.147440195083618,
+ "learning_rate": 3.7460360759942726e-06,
+ "loss": 0.5063,
+ "step": 4304
+ },
+ {
+ "epoch": 2.0354609929078014,
+ "grad_norm": 2.840672492980957,
+ "learning_rate": 3.7454952173347714e-06,
+ "loss": 0.5041,
+ "step": 4305
+ },
+ {
+ "epoch": 2.035933806146572,
+ "grad_norm": 2.584122657775879,
+ "learning_rate": 3.744954281123745e-06,
+ "loss": 0.4487,
+ "step": 4306
+ },
+ {
+ "epoch": 2.036406619385343,
+ "grad_norm": 2.9869542121887207,
+ "learning_rate": 3.7444132673948737e-06,
+ "loss": 0.479,
+ "step": 4307
+ },
+ {
+ "epoch": 2.0368794326241133,
+ "grad_norm": 2.478459358215332,
+ "learning_rate": 3.7438721761818446e-06,
+ "loss": 0.4636,
+ "step": 4308
+ },
+ {
+ "epoch": 2.037352245862884,
+ "grad_norm": 2.5524215698242188,
+ "learning_rate": 3.7433310075183504e-06,
+ "loss": 0.4601,
+ "step": 4309
+ },
+ {
+ "epoch": 2.037825059101655,
+ "grad_norm": 2.3709988594055176,
+ "learning_rate": 3.742789761438086e-06,
+ "loss": 0.4163,
+ "step": 4310
+ },
+ {
+ "epoch": 2.0382978723404257,
+ "grad_norm": 3.140355348587036,
+ "learning_rate": 3.742248437974752e-06,
+ "loss": 0.4433,
+ "step": 4311
+ },
+ {
+ "epoch": 2.038770685579196,
+ "grad_norm": 2.940948486328125,
+ "learning_rate": 3.741707037162055e-06,
+ "loss": 0.4299,
+ "step": 4312
+ },
+ {
+ "epoch": 2.039243498817967,
+ "grad_norm": 3.009157419204712,
+ "learning_rate": 3.7411655590337055e-06,
+ "loss": 0.463,
+ "step": 4313
+ },
+ {
+ "epoch": 2.0397163120567376,
+ "grad_norm": 2.672945737838745,
+ "learning_rate": 3.7406240036234185e-06,
+ "loss": 0.4696,
+ "step": 4314
+ },
+ {
+ "epoch": 2.0401891252955084,
+ "grad_norm": 2.745962142944336,
+ "learning_rate": 3.740082370964916e-06,
+ "loss": 0.4931,
+ "step": 4315
+ },
+ {
+ "epoch": 2.040661938534279,
+ "grad_norm": 2.3939316272735596,
+ "learning_rate": 3.7395406610919217e-06,
+ "loss": 0.4396,
+ "step": 4316
+ },
+ {
+ "epoch": 2.0411347517730496,
+ "grad_norm": 2.4364447593688965,
+ "learning_rate": 3.738998874038165e-06,
+ "loss": 0.4807,
+ "step": 4317
+ },
+ {
+ "epoch": 2.0416075650118204,
+ "grad_norm": 2.360489845275879,
+ "learning_rate": 3.738457009837381e-06,
+ "loss": 0.4426,
+ "step": 4318
+ },
+ {
+ "epoch": 2.042080378250591,
+ "grad_norm": 2.5494935512542725,
+ "learning_rate": 3.7379150685233108e-06,
+ "loss": 0.4189,
+ "step": 4319
+ },
+ {
+ "epoch": 2.0425531914893615,
+ "grad_norm": 2.635472059249878,
+ "learning_rate": 3.7373730501296963e-06,
+ "loss": 0.5014,
+ "step": 4320
+ },
+ {
+ "epoch": 2.0430260047281323,
+ "grad_norm": 2.4982943534851074,
+ "learning_rate": 3.7368309546902876e-06,
+ "loss": 0.4658,
+ "step": 4321
+ },
+ {
+ "epoch": 2.043498817966903,
+ "grad_norm": 2.692742109298706,
+ "learning_rate": 3.736288782238839e-06,
+ "loss": 0.4454,
+ "step": 4322
+ },
+ {
+ "epoch": 2.043971631205674,
+ "grad_norm": 2.6774091720581055,
+ "learning_rate": 3.7357465328091086e-06,
+ "loss": 0.5002,
+ "step": 4323
+ },
+ {
+ "epoch": 2.0444444444444443,
+ "grad_norm": 2.695138692855835,
+ "learning_rate": 3.735204206434861e-06,
+ "loss": 0.448,
+ "step": 4324
+ },
+ {
+ "epoch": 2.044917257683215,
+ "grad_norm": 2.5383570194244385,
+ "learning_rate": 3.7346618031498635e-06,
+ "loss": 0.4352,
+ "step": 4325
+ },
+ {
+ "epoch": 2.045390070921986,
+ "grad_norm": 2.267277240753174,
+ "learning_rate": 3.7341193229878886e-06,
+ "loss": 0.4162,
+ "step": 4326
+ },
+ {
+ "epoch": 2.0458628841607567,
+ "grad_norm": 2.6037328243255615,
+ "learning_rate": 3.733576765982715e-06,
+ "loss": 0.4471,
+ "step": 4327
+ },
+ {
+ "epoch": 2.046335697399527,
+ "grad_norm": 3.261385440826416,
+ "learning_rate": 3.7330341321681253e-06,
+ "loss": 0.4618,
+ "step": 4328
+ },
+ {
+ "epoch": 2.046808510638298,
+ "grad_norm": 2.440650463104248,
+ "learning_rate": 3.7324914215779072e-06,
+ "loss": 0.4476,
+ "step": 4329
+ },
+ {
+ "epoch": 2.0472813238770686,
+ "grad_norm": 2.5940682888031006,
+ "learning_rate": 3.731948634245853e-06,
+ "loss": 0.4389,
+ "step": 4330
+ },
+ {
+ "epoch": 2.0477541371158394,
+ "grad_norm": 2.7428150177001953,
+ "learning_rate": 3.7314057702057582e-06,
+ "loss": 0.4477,
+ "step": 4331
+ },
+ {
+ "epoch": 2.0482269503546098,
+ "grad_norm": 2.3546223640441895,
+ "learning_rate": 3.730862829491427e-06,
+ "loss": 0.4047,
+ "step": 4332
+ },
+ {
+ "epoch": 2.0486997635933806,
+ "grad_norm": 2.552422523498535,
+ "learning_rate": 3.7303198121366637e-06,
+ "loss": 0.4438,
+ "step": 4333
+ },
+ {
+ "epoch": 2.0491725768321514,
+ "grad_norm": 2.99226713180542,
+ "learning_rate": 3.729776718175281e-06,
+ "loss": 0.491,
+ "step": 4334
+ },
+ {
+ "epoch": 2.049645390070922,
+ "grad_norm": 3.2003321647644043,
+ "learning_rate": 3.7292335476410935e-06,
+ "loss": 0.5458,
+ "step": 4335
+ },
+ {
+ "epoch": 2.0501182033096925,
+ "grad_norm": 2.739847183227539,
+ "learning_rate": 3.7286903005679237e-06,
+ "loss": 0.4499,
+ "step": 4336
+ },
+ {
+ "epoch": 2.0505910165484633,
+ "grad_norm": 2.5917470455169678,
+ "learning_rate": 3.7281469769895963e-06,
+ "loss": 0.4714,
+ "step": 4337
+ },
+ {
+ "epoch": 2.051063829787234,
+ "grad_norm": 2.8029327392578125,
+ "learning_rate": 3.7276035769399422e-06,
+ "loss": 0.42,
+ "step": 4338
+ },
+ {
+ "epoch": 2.051536643026005,
+ "grad_norm": 2.484879493713379,
+ "learning_rate": 3.727060100452796e-06,
+ "loss": 0.4163,
+ "step": 4339
+ },
+ {
+ "epoch": 2.0520094562647753,
+ "grad_norm": 2.7126030921936035,
+ "learning_rate": 3.7265165475619973e-06,
+ "loss": 0.4112,
+ "step": 4340
+ },
+ {
+ "epoch": 2.052482269503546,
+ "grad_norm": 2.618267774581909,
+ "learning_rate": 3.7259729183013927e-06,
+ "loss": 0.4281,
+ "step": 4341
+ },
+ {
+ "epoch": 2.052955082742317,
+ "grad_norm": 2.703270673751831,
+ "learning_rate": 3.7254292127048293e-06,
+ "loss": 0.4437,
+ "step": 4342
+ },
+ {
+ "epoch": 2.0534278959810877,
+ "grad_norm": 2.429150104522705,
+ "learning_rate": 3.7248854308061623e-06,
+ "loss": 0.3971,
+ "step": 4343
+ },
+ {
+ "epoch": 2.053900709219858,
+ "grad_norm": 2.54354190826416,
+ "learning_rate": 3.7243415726392508e-06,
+ "loss": 0.4485,
+ "step": 4344
+ },
+ {
+ "epoch": 2.054373522458629,
+ "grad_norm": 2.9515016078948975,
+ "learning_rate": 3.723797638237957e-06,
+ "loss": 0.4386,
+ "step": 4345
+ },
+ {
+ "epoch": 2.0548463356973996,
+ "grad_norm": 2.9129958152770996,
+ "learning_rate": 3.7232536276361514e-06,
+ "loss": 0.4595,
+ "step": 4346
+ },
+ {
+ "epoch": 2.0553191489361704,
+ "grad_norm": 2.5397512912750244,
+ "learning_rate": 3.722709540867706e-06,
+ "loss": 0.3681,
+ "step": 4347
+ },
+ {
+ "epoch": 2.0557919621749408,
+ "grad_norm": 2.79884672164917,
+ "learning_rate": 3.722165377966499e-06,
+ "loss": 0.4576,
+ "step": 4348
+ },
+ {
+ "epoch": 2.0562647754137116,
+ "grad_norm": 2.669936180114746,
+ "learning_rate": 3.7216211389664137e-06,
+ "loss": 0.3692,
+ "step": 4349
+ },
+ {
+ "epoch": 2.0567375886524824,
+ "grad_norm": 2.512326240539551,
+ "learning_rate": 3.7210768239013355e-06,
+ "loss": 0.4554,
+ "step": 4350
+ },
+ {
+ "epoch": 2.057210401891253,
+ "grad_norm": 2.913693904876709,
+ "learning_rate": 3.7205324328051583e-06,
+ "loss": 0.5282,
+ "step": 4351
+ },
+ {
+ "epoch": 2.0576832151300235,
+ "grad_norm": 3.040891170501709,
+ "learning_rate": 3.719987965711778e-06,
+ "loss": 0.4778,
+ "step": 4352
+ },
+ {
+ "epoch": 2.0581560283687943,
+ "grad_norm": 2.7504117488861084,
+ "learning_rate": 3.7194434226550966e-06,
+ "loss": 0.4217,
+ "step": 4353
+ },
+ {
+ "epoch": 2.058628841607565,
+ "grad_norm": 2.5522971153259277,
+ "learning_rate": 3.718898803669021e-06,
+ "loss": 0.437,
+ "step": 4354
+ },
+ {
+ "epoch": 2.059101654846336,
+ "grad_norm": 2.8531908988952637,
+ "learning_rate": 3.718354108787461e-06,
+ "loss": 0.4251,
+ "step": 4355
+ },
+ {
+ "epoch": 2.0595744680851062,
+ "grad_norm": 2.5812065601348877,
+ "learning_rate": 3.7178093380443337e-06,
+ "loss": 0.4374,
+ "step": 4356
+ },
+ {
+ "epoch": 2.060047281323877,
+ "grad_norm": 2.627871513366699,
+ "learning_rate": 3.7172644914735583e-06,
+ "loss": 0.436,
+ "step": 4357
+ },
+ {
+ "epoch": 2.060520094562648,
+ "grad_norm": 2.7146239280700684,
+ "learning_rate": 3.7167195691090607e-06,
+ "loss": 0.4204,
+ "step": 4358
+ },
+ {
+ "epoch": 2.0609929078014186,
+ "grad_norm": 2.486483573913574,
+ "learning_rate": 3.7161745709847706e-06,
+ "loss": 0.4015,
+ "step": 4359
+ },
+ {
+ "epoch": 2.061465721040189,
+ "grad_norm": 2.866049289703369,
+ "learning_rate": 3.7156294971346226e-06,
+ "loss": 0.4087,
+ "step": 4360
+ },
+ {
+ "epoch": 2.06193853427896,
+ "grad_norm": 2.9345552921295166,
+ "learning_rate": 3.715084347592556e-06,
+ "loss": 0.5074,
+ "step": 4361
+ },
+ {
+ "epoch": 2.0624113475177306,
+ "grad_norm": 2.502455711364746,
+ "learning_rate": 3.7145391223925155e-06,
+ "loss": 0.469,
+ "step": 4362
+ },
+ {
+ "epoch": 2.0628841607565014,
+ "grad_norm": 2.6419875621795654,
+ "learning_rate": 3.713993821568449e-06,
+ "loss": 0.4493,
+ "step": 4363
+ },
+ {
+ "epoch": 2.0633569739952717,
+ "grad_norm": 3.812079429626465,
+ "learning_rate": 3.7134484451543114e-06,
+ "loss": 0.4764,
+ "step": 4364
+ },
+ {
+ "epoch": 2.0638297872340425,
+ "grad_norm": 2.581780195236206,
+ "learning_rate": 3.712902993184059e-06,
+ "loss": 0.3994,
+ "step": 4365
+ },
+ {
+ "epoch": 2.0643026004728133,
+ "grad_norm": 2.282508134841919,
+ "learning_rate": 3.712357465691656e-06,
+ "loss": 0.4252,
+ "step": 4366
+ },
+ {
+ "epoch": 2.064775413711584,
+ "grad_norm": 2.4727818965911865,
+ "learning_rate": 3.71181186271107e-06,
+ "loss": 0.4558,
+ "step": 4367
+ },
+ {
+ "epoch": 2.0652482269503545,
+ "grad_norm": 2.7661173343658447,
+ "learning_rate": 3.711266184276272e-06,
+ "loss": 0.505,
+ "step": 4368
+ },
+ {
+ "epoch": 2.0657210401891253,
+ "grad_norm": 2.6264543533325195,
+ "learning_rate": 3.71072043042124e-06,
+ "loss": 0.4297,
+ "step": 4369
+ },
+ {
+ "epoch": 2.066193853427896,
+ "grad_norm": 2.773699998855591,
+ "learning_rate": 3.7101746011799565e-06,
+ "loss": 0.4267,
+ "step": 4370
+ },
+ {
+ "epoch": 2.066666666666667,
+ "grad_norm": 2.686955213546753,
+ "learning_rate": 3.709628696586407e-06,
+ "loss": 0.4099,
+ "step": 4371
+ },
+ {
+ "epoch": 2.0671394799054372,
+ "grad_norm": 2.6066620349884033,
+ "learning_rate": 3.709082716674582e-06,
+ "loss": 0.4146,
+ "step": 4372
+ },
+ {
+ "epoch": 2.067612293144208,
+ "grad_norm": 2.7769250869750977,
+ "learning_rate": 3.7085366614784784e-06,
+ "loss": 0.4047,
+ "step": 4373
+ },
+ {
+ "epoch": 2.068085106382979,
+ "grad_norm": 2.4986939430236816,
+ "learning_rate": 3.7079905310320957e-06,
+ "loss": 0.4021,
+ "step": 4374
+ },
+ {
+ "epoch": 2.0685579196217496,
+ "grad_norm": 2.5456206798553467,
+ "learning_rate": 3.7074443253694402e-06,
+ "loss": 0.3569,
+ "step": 4375
+ },
+ {
+ "epoch": 2.06903073286052,
+ "grad_norm": 2.4079296588897705,
+ "learning_rate": 3.70689804452452e-06,
+ "loss": 0.4308,
+ "step": 4376
+ },
+ {
+ "epoch": 2.0695035460992908,
+ "grad_norm": 2.86014723777771,
+ "learning_rate": 3.7063516885313513e-06,
+ "loss": 0.4577,
+ "step": 4377
+ },
+ {
+ "epoch": 2.0699763593380616,
+ "grad_norm": 2.8025779724121094,
+ "learning_rate": 3.7058052574239523e-06,
+ "loss": 0.4615,
+ "step": 4378
+ },
+ {
+ "epoch": 2.0704491725768324,
+ "grad_norm": 2.902676820755005,
+ "learning_rate": 3.7052587512363475e-06,
+ "loss": 0.4765,
+ "step": 4379
+ },
+ {
+ "epoch": 2.0709219858156027,
+ "grad_norm": 2.814509391784668,
+ "learning_rate": 3.704712170002566e-06,
+ "loss": 0.434,
+ "step": 4380
+ },
+ {
+ "epoch": 2.0713947990543735,
+ "grad_norm": 2.7923502922058105,
+ "learning_rate": 3.704165513756639e-06,
+ "loss": 0.4626,
+ "step": 4381
+ },
+ {
+ "epoch": 2.0718676122931443,
+ "grad_norm": 2.6802031993865967,
+ "learning_rate": 3.703618782532606e-06,
+ "loss": 0.4835,
+ "step": 4382
+ },
+ {
+ "epoch": 2.072340425531915,
+ "grad_norm": 3.0963687896728516,
+ "learning_rate": 3.7030719763645085e-06,
+ "loss": 0.4813,
+ "step": 4383
+ },
+ {
+ "epoch": 2.0728132387706855,
+ "grad_norm": 2.5658695697784424,
+ "learning_rate": 3.7025250952863956e-06,
+ "loss": 0.4428,
+ "step": 4384
+ },
+ {
+ "epoch": 2.0732860520094563,
+ "grad_norm": 2.7738289833068848,
+ "learning_rate": 3.7019781393323167e-06,
+ "loss": 0.4376,
+ "step": 4385
+ },
+ {
+ "epoch": 2.073758865248227,
+ "grad_norm": 2.6446938514709473,
+ "learning_rate": 3.7014311085363303e-06,
+ "loss": 0.4208,
+ "step": 4386
+ },
+ {
+ "epoch": 2.0742316784869974,
+ "grad_norm": 2.7556118965148926,
+ "learning_rate": 3.7008840029324967e-06,
+ "loss": 0.3831,
+ "step": 4387
+ },
+ {
+ "epoch": 2.074704491725768,
+ "grad_norm": 2.573141574859619,
+ "learning_rate": 3.700336822554882e-06,
+ "loss": 0.4396,
+ "step": 4388
+ },
+ {
+ "epoch": 2.075177304964539,
+ "grad_norm": 2.762319803237915,
+ "learning_rate": 3.6997895674375566e-06,
+ "loss": 0.4579,
+ "step": 4389
+ },
+ {
+ "epoch": 2.07565011820331,
+ "grad_norm": 2.729780435562134,
+ "learning_rate": 3.699242237614596e-06,
+ "loss": 0.4262,
+ "step": 4390
+ },
+ {
+ "epoch": 2.0761229314420806,
+ "grad_norm": 2.657480001449585,
+ "learning_rate": 3.698694833120079e-06,
+ "loss": 0.4176,
+ "step": 4391
+ },
+ {
+ "epoch": 2.076595744680851,
+ "grad_norm": 2.8433303833007812,
+ "learning_rate": 3.6981473539880914e-06,
+ "loss": 0.457,
+ "step": 4392
+ },
+ {
+ "epoch": 2.0770685579196217,
+ "grad_norm": 2.819047212600708,
+ "learning_rate": 3.6975998002527225e-06,
+ "loss": 0.4244,
+ "step": 4393
+ },
+ {
+ "epoch": 2.0775413711583925,
+ "grad_norm": 2.6565003395080566,
+ "learning_rate": 3.697052171948064e-06,
+ "loss": 0.4384,
+ "step": 4394
+ },
+ {
+ "epoch": 2.078014184397163,
+ "grad_norm": 2.5795063972473145,
+ "learning_rate": 3.696504469108216e-06,
+ "loss": 0.4958,
+ "step": 4395
+ },
+ {
+ "epoch": 2.0784869976359337,
+ "grad_norm": 2.455730676651001,
+ "learning_rate": 3.6959566917672822e-06,
+ "loss": 0.4191,
+ "step": 4396
+ },
+ {
+ "epoch": 2.0789598108747045,
+ "grad_norm": 2.6706607341766357,
+ "learning_rate": 3.6954088399593684e-06,
+ "loss": 0.4709,
+ "step": 4397
+ },
+ {
+ "epoch": 2.0794326241134753,
+ "grad_norm": 2.3758466243743896,
+ "learning_rate": 3.694860913718589e-06,
+ "loss": 0.4231,
+ "step": 4398
+ },
+ {
+ "epoch": 2.079905437352246,
+ "grad_norm": 2.3488340377807617,
+ "learning_rate": 3.6943129130790583e-06,
+ "loss": 0.4321,
+ "step": 4399
+ },
+ {
+ "epoch": 2.0803782505910164,
+ "grad_norm": 2.6438148021698,
+ "learning_rate": 3.6937648380748996e-06,
+ "loss": 0.4907,
+ "step": 4400
+ },
+ {
+ "epoch": 2.0808510638297872,
+ "grad_norm": 2.9826784133911133,
+ "learning_rate": 3.6932166887402395e-06,
+ "loss": 0.4404,
+ "step": 4401
+ },
+ {
+ "epoch": 2.081323877068558,
+ "grad_norm": 2.5203495025634766,
+ "learning_rate": 3.6926684651092076e-06,
+ "loss": 0.4337,
+ "step": 4402
+ },
+ {
+ "epoch": 2.0817966903073284,
+ "grad_norm": 2.7704148292541504,
+ "learning_rate": 3.692120167215941e-06,
+ "loss": 0.4195,
+ "step": 4403
+ },
+ {
+ "epoch": 2.082269503546099,
+ "grad_norm": 2.879430055618286,
+ "learning_rate": 3.6915717950945782e-06,
+ "loss": 0.4498,
+ "step": 4404
+ },
+ {
+ "epoch": 2.08274231678487,
+ "grad_norm": 2.7659497261047363,
+ "learning_rate": 3.6910233487792655e-06,
+ "loss": 0.4017,
+ "step": 4405
+ },
+ {
+ "epoch": 2.083215130023641,
+ "grad_norm": 3.4017205238342285,
+ "learning_rate": 3.6904748283041503e-06,
+ "loss": 0.4733,
+ "step": 4406
+ },
+ {
+ "epoch": 2.083687943262411,
+ "grad_norm": 2.706223249435425,
+ "learning_rate": 3.6899262337033887e-06,
+ "loss": 0.4926,
+ "step": 4407
+ },
+ {
+ "epoch": 2.084160756501182,
+ "grad_norm": 2.644932508468628,
+ "learning_rate": 3.6893775650111372e-06,
+ "loss": 0.3904,
+ "step": 4408
+ },
+ {
+ "epoch": 2.0846335697399527,
+ "grad_norm": 2.666585683822632,
+ "learning_rate": 3.6888288222615603e-06,
+ "loss": 0.4698,
+ "step": 4409
+ },
+ {
+ "epoch": 2.0851063829787235,
+ "grad_norm": 3.0058486461639404,
+ "learning_rate": 3.688280005488826e-06,
+ "loss": 0.5291,
+ "step": 4410
+ },
+ {
+ "epoch": 2.085579196217494,
+ "grad_norm": 2.533088445663452,
+ "learning_rate": 3.687731114727105e-06,
+ "loss": 0.393,
+ "step": 4411
+ },
+ {
+ "epoch": 2.0860520094562647,
+ "grad_norm": 2.921687364578247,
+ "learning_rate": 3.6871821500105763e-06,
+ "loss": 0.4719,
+ "step": 4412
+ },
+ {
+ "epoch": 2.0865248226950355,
+ "grad_norm": 2.291804313659668,
+ "learning_rate": 3.686633111373421e-06,
+ "loss": 0.4105,
+ "step": 4413
+ },
+ {
+ "epoch": 2.0869976359338063,
+ "grad_norm": 2.496333122253418,
+ "learning_rate": 3.6860839988498255e-06,
+ "loss": 0.4704,
+ "step": 4414
+ },
+ {
+ "epoch": 2.0874704491725766,
+ "grad_norm": 2.8059427738189697,
+ "learning_rate": 3.6855348124739787e-06,
+ "loss": 0.4961,
+ "step": 4415
+ },
+ {
+ "epoch": 2.0879432624113474,
+ "grad_norm": 2.683922290802002,
+ "learning_rate": 3.6849855522800795e-06,
+ "loss": 0.4838,
+ "step": 4416
+ },
+ {
+ "epoch": 2.088416075650118,
+ "grad_norm": 2.694148540496826,
+ "learning_rate": 3.684436218302324e-06,
+ "loss": 0.4812,
+ "step": 4417
+ },
+ {
+ "epoch": 2.088888888888889,
+ "grad_norm": 2.724531888961792,
+ "learning_rate": 3.683886810574919e-06,
+ "loss": 0.4495,
+ "step": 4418
+ },
+ {
+ "epoch": 2.0893617021276594,
+ "grad_norm": 2.6176564693450928,
+ "learning_rate": 3.6833373291320746e-06,
+ "loss": 0.4698,
+ "step": 4419
+ },
+ {
+ "epoch": 2.08983451536643,
+ "grad_norm": 2.534116268157959,
+ "learning_rate": 3.6827877740080032e-06,
+ "loss": 0.3912,
+ "step": 4420
+ },
+ {
+ "epoch": 2.090307328605201,
+ "grad_norm": 2.5747432708740234,
+ "learning_rate": 3.682238145236924e-06,
+ "loss": 0.4072,
+ "step": 4421
+ },
+ {
+ "epoch": 2.0907801418439718,
+ "grad_norm": 2.5947659015655518,
+ "learning_rate": 3.6816884428530588e-06,
+ "loss": 0.4638,
+ "step": 4422
+ },
+ {
+ "epoch": 2.091252955082742,
+ "grad_norm": 2.811992883682251,
+ "learning_rate": 3.6811386668906353e-06,
+ "loss": 0.4345,
+ "step": 4423
+ },
+ {
+ "epoch": 2.091725768321513,
+ "grad_norm": 2.7482287883758545,
+ "learning_rate": 3.680588817383886e-06,
+ "loss": 0.4541,
+ "step": 4424
+ },
+ {
+ "epoch": 2.0921985815602837,
+ "grad_norm": 2.987131357192993,
+ "learning_rate": 3.6800388943670484e-06,
+ "loss": 0.4571,
+ "step": 4425
+ },
+ {
+ "epoch": 2.0926713947990545,
+ "grad_norm": 3.1918671131134033,
+ "learning_rate": 3.6794888978743637e-06,
+ "loss": 0.5722,
+ "step": 4426
+ },
+ {
+ "epoch": 2.093144208037825,
+ "grad_norm": 2.5654571056365967,
+ "learning_rate": 3.678938827940076e-06,
+ "loss": 0.4686,
+ "step": 4427
+ },
+ {
+ "epoch": 2.0936170212765957,
+ "grad_norm": 2.942084789276123,
+ "learning_rate": 3.6783886845984383e-06,
+ "loss": 0.4512,
+ "step": 4428
+ },
+ {
+ "epoch": 2.0940898345153665,
+ "grad_norm": 2.74847674369812,
+ "learning_rate": 3.677838467883703e-06,
+ "loss": 0.4506,
+ "step": 4429
+ },
+ {
+ "epoch": 2.0945626477541373,
+ "grad_norm": 2.7569334506988525,
+ "learning_rate": 3.6772881778301322e-06,
+ "loss": 0.502,
+ "step": 4430
+ },
+ {
+ "epoch": 2.0950354609929076,
+ "grad_norm": 2.969966173171997,
+ "learning_rate": 3.6767378144719884e-06,
+ "loss": 0.4772,
+ "step": 4431
+ },
+ {
+ "epoch": 2.0955082742316784,
+ "grad_norm": 2.773524522781372,
+ "learning_rate": 3.67618737784354e-06,
+ "loss": 0.5183,
+ "step": 4432
+ },
+ {
+ "epoch": 2.095981087470449,
+ "grad_norm": 2.6760106086730957,
+ "learning_rate": 3.6756368679790617e-06,
+ "loss": 0.4787,
+ "step": 4433
+ },
+ {
+ "epoch": 2.09645390070922,
+ "grad_norm": 2.8758978843688965,
+ "learning_rate": 3.6750862849128304e-06,
+ "loss": 0.4275,
+ "step": 4434
+ },
+ {
+ "epoch": 2.0969267139479904,
+ "grad_norm": 2.670509099960327,
+ "learning_rate": 3.6745356286791288e-06,
+ "loss": 0.4401,
+ "step": 4435
+ },
+ {
+ "epoch": 2.097399527186761,
+ "grad_norm": 2.8453969955444336,
+ "learning_rate": 3.673984899312244e-06,
+ "loss": 0.4303,
+ "step": 4436
+ },
+ {
+ "epoch": 2.097872340425532,
+ "grad_norm": 2.6212339401245117,
+ "learning_rate": 3.673434096846468e-06,
+ "loss": 0.4675,
+ "step": 4437
+ },
+ {
+ "epoch": 2.0983451536643027,
+ "grad_norm": 2.8211941719055176,
+ "learning_rate": 3.672883221316095e-06,
+ "loss": 0.4678,
+ "step": 4438
+ },
+ {
+ "epoch": 2.098817966903073,
+ "grad_norm": 2.4838058948516846,
+ "learning_rate": 3.672332272755427e-06,
+ "loss": 0.4128,
+ "step": 4439
+ },
+ {
+ "epoch": 2.099290780141844,
+ "grad_norm": 2.596660852432251,
+ "learning_rate": 3.671781251198769e-06,
+ "loss": 0.423,
+ "step": 4440
+ },
+ {
+ "epoch": 2.0997635933806147,
+ "grad_norm": 2.9979989528656006,
+ "learning_rate": 3.67123015668043e-06,
+ "loss": 0.4493,
+ "step": 4441
+ },
+ {
+ "epoch": 2.1002364066193855,
+ "grad_norm": 2.6232850551605225,
+ "learning_rate": 3.670678989234725e-06,
+ "loss": 0.4237,
+ "step": 4442
+ },
+ {
+ "epoch": 2.100709219858156,
+ "grad_norm": 2.575039863586426,
+ "learning_rate": 3.670127748895973e-06,
+ "loss": 0.4464,
+ "step": 4443
+ },
+ {
+ "epoch": 2.1011820330969266,
+ "grad_norm": 2.3381190299987793,
+ "learning_rate": 3.669576435698497e-06,
+ "loss": 0.4208,
+ "step": 4444
+ },
+ {
+ "epoch": 2.1016548463356974,
+ "grad_norm": 2.9645180702209473,
+ "learning_rate": 3.669025049676625e-06,
+ "loss": 0.5272,
+ "step": 4445
+ },
+ {
+ "epoch": 2.1021276595744682,
+ "grad_norm": 2.719320297241211,
+ "learning_rate": 3.668473590864689e-06,
+ "loss": 0.4485,
+ "step": 4446
+ },
+ {
+ "epoch": 2.1026004728132386,
+ "grad_norm": 2.8665547370910645,
+ "learning_rate": 3.6679220592970254e-06,
+ "loss": 0.4433,
+ "step": 4447
+ },
+ {
+ "epoch": 2.1030732860520094,
+ "grad_norm": 2.6922879219055176,
+ "learning_rate": 3.667370455007977e-06,
+ "loss": 0.502,
+ "step": 4448
+ },
+ {
+ "epoch": 2.10354609929078,
+ "grad_norm": 3.018228530883789,
+ "learning_rate": 3.6668187780318894e-06,
+ "loss": 0.4939,
+ "step": 4449
+ },
+ {
+ "epoch": 2.104018912529551,
+ "grad_norm": 3.187901735305786,
+ "learning_rate": 3.666267028403112e-06,
+ "loss": 0.4151,
+ "step": 4450
+ },
+ {
+ "epoch": 2.1044917257683213,
+ "grad_norm": 2.9521446228027344,
+ "learning_rate": 3.6657152061560012e-06,
+ "loss": 0.4343,
+ "step": 4451
+ },
+ {
+ "epoch": 2.104964539007092,
+ "grad_norm": 2.5125739574432373,
+ "learning_rate": 3.6651633113249164e-06,
+ "loss": 0.4071,
+ "step": 4452
+ },
+ {
+ "epoch": 2.105437352245863,
+ "grad_norm": 2.9164133071899414,
+ "learning_rate": 3.664611343944221e-06,
+ "loss": 0.4173,
+ "step": 4453
+ },
+ {
+ "epoch": 2.1059101654846337,
+ "grad_norm": 2.680893898010254,
+ "learning_rate": 3.6640593040482834e-06,
+ "loss": 0.4917,
+ "step": 4454
+ },
+ {
+ "epoch": 2.106382978723404,
+ "grad_norm": 2.6823534965515137,
+ "learning_rate": 3.6635071916714774e-06,
+ "loss": 0.4668,
+ "step": 4455
+ },
+ {
+ "epoch": 2.106855791962175,
+ "grad_norm": 2.6221907138824463,
+ "learning_rate": 3.6629550068481806e-06,
+ "loss": 0.4956,
+ "step": 4456
+ },
+ {
+ "epoch": 2.1073286052009457,
+ "grad_norm": 3.096370220184326,
+ "learning_rate": 3.6624027496127745e-06,
+ "loss": 0.3995,
+ "step": 4457
+ },
+ {
+ "epoch": 2.1078014184397165,
+ "grad_norm": 2.752885341644287,
+ "learning_rate": 3.661850419999647e-06,
+ "loss": 0.4838,
+ "step": 4458
+ },
+ {
+ "epoch": 2.108274231678487,
+ "grad_norm": 2.6806766986846924,
+ "learning_rate": 3.661298018043188e-06,
+ "loss": 0.4817,
+ "step": 4459
+ },
+ {
+ "epoch": 2.1087470449172576,
+ "grad_norm": 2.6317873001098633,
+ "learning_rate": 3.660745543777794e-06,
+ "loss": 0.4777,
+ "step": 4460
+ },
+ {
+ "epoch": 2.1092198581560284,
+ "grad_norm": 2.4939377307891846,
+ "learning_rate": 3.6601929972378634e-06,
+ "loss": 0.4525,
+ "step": 4461
+ },
+ {
+ "epoch": 2.109692671394799,
+ "grad_norm": 2.4902873039245605,
+ "learning_rate": 3.659640378457803e-06,
+ "loss": 0.4392,
+ "step": 4462
+ },
+ {
+ "epoch": 2.1101654846335696,
+ "grad_norm": 2.5082345008850098,
+ "learning_rate": 3.6590876874720216e-06,
+ "loss": 0.4224,
+ "step": 4463
+ },
+ {
+ "epoch": 2.1106382978723404,
+ "grad_norm": 2.658407211303711,
+ "learning_rate": 3.6585349243149313e-06,
+ "loss": 0.4316,
+ "step": 4464
+ },
+ {
+ "epoch": 2.111111111111111,
+ "grad_norm": 2.562883138656616,
+ "learning_rate": 3.6579820890209515e-06,
+ "loss": 0.4491,
+ "step": 4465
+ },
+ {
+ "epoch": 2.111583924349882,
+ "grad_norm": 2.5719261169433594,
+ "learning_rate": 3.657429181624505e-06,
+ "loss": 0.4406,
+ "step": 4466
+ },
+ {
+ "epoch": 2.1120567375886523,
+ "grad_norm": 2.8840596675872803,
+ "learning_rate": 3.6568762021600184e-06,
+ "loss": 0.4267,
+ "step": 4467
+ },
+ {
+ "epoch": 2.112529550827423,
+ "grad_norm": 2.660304546356201,
+ "learning_rate": 3.656323150661924e-06,
+ "loss": 0.4502,
+ "step": 4468
+ },
+ {
+ "epoch": 2.113002364066194,
+ "grad_norm": 2.610996961593628,
+ "learning_rate": 3.655770027164657e-06,
+ "loss": 0.3934,
+ "step": 4469
+ },
+ {
+ "epoch": 2.1134751773049647,
+ "grad_norm": 2.6000053882598877,
+ "learning_rate": 3.655216831702658e-06,
+ "loss": 0.4582,
+ "step": 4470
+ },
+ {
+ "epoch": 2.113947990543735,
+ "grad_norm": 2.73124098777771,
+ "learning_rate": 3.654663564310372e-06,
+ "loss": 0.4748,
+ "step": 4471
+ },
+ {
+ "epoch": 2.114420803782506,
+ "grad_norm": 2.711091995239258,
+ "learning_rate": 3.6541102250222495e-06,
+ "loss": 0.4145,
+ "step": 4472
+ },
+ {
+ "epoch": 2.1148936170212767,
+ "grad_norm": 2.655996561050415,
+ "learning_rate": 3.6535568138727438e-06,
+ "loss": 0.4407,
+ "step": 4473
+ },
+ {
+ "epoch": 2.1153664302600474,
+ "grad_norm": 2.7630865573883057,
+ "learning_rate": 3.653003330896313e-06,
+ "loss": 0.4298,
+ "step": 4474
+ },
+ {
+ "epoch": 2.115839243498818,
+ "grad_norm": 2.554415464401245,
+ "learning_rate": 3.6524497761274214e-06,
+ "loss": 0.44,
+ "step": 4475
+ },
+ {
+ "epoch": 2.1163120567375886,
+ "grad_norm": 2.790328025817871,
+ "learning_rate": 3.651896149600535e-06,
+ "loss": 0.5061,
+ "step": 4476
+ },
+ {
+ "epoch": 2.1167848699763594,
+ "grad_norm": 2.755267381668091,
+ "learning_rate": 3.651342451350127e-06,
+ "loss": 0.4588,
+ "step": 4477
+ },
+ {
+ "epoch": 2.11725768321513,
+ "grad_norm": 2.8936638832092285,
+ "learning_rate": 3.6507886814106722e-06,
+ "loss": 0.468,
+ "step": 4478
+ },
+ {
+ "epoch": 2.1177304964539005,
+ "grad_norm": 2.7394332885742188,
+ "learning_rate": 3.6502348398166525e-06,
+ "loss": 0.383,
+ "step": 4479
+ },
+ {
+ "epoch": 2.1182033096926713,
+ "grad_norm": 2.3359546661376953,
+ "learning_rate": 3.649680926602553e-06,
+ "loss": 0.3903,
+ "step": 4480
+ },
+ {
+ "epoch": 2.118676122931442,
+ "grad_norm": 3.102202892303467,
+ "learning_rate": 3.6491269418028637e-06,
+ "loss": 0.4525,
+ "step": 4481
+ },
+ {
+ "epoch": 2.119148936170213,
+ "grad_norm": 2.467970848083496,
+ "learning_rate": 3.648572885452078e-06,
+ "loss": 0.414,
+ "step": 4482
+ },
+ {
+ "epoch": 2.1196217494089833,
+ "grad_norm": 2.8984131813049316,
+ "learning_rate": 3.6480187575846952e-06,
+ "loss": 0.4571,
+ "step": 4483
+ },
+ {
+ "epoch": 2.120094562647754,
+ "grad_norm": 2.674834966659546,
+ "learning_rate": 3.6474645582352187e-06,
+ "loss": 0.455,
+ "step": 4484
+ },
+ {
+ "epoch": 2.120567375886525,
+ "grad_norm": 2.8713369369506836,
+ "learning_rate": 3.6469102874381552e-06,
+ "loss": 0.4567,
+ "step": 4485
+ },
+ {
+ "epoch": 2.1210401891252957,
+ "grad_norm": 3.174814462661743,
+ "learning_rate": 3.646355945228017e-06,
+ "loss": 0.5295,
+ "step": 4486
+ },
+ {
+ "epoch": 2.121513002364066,
+ "grad_norm": 2.6409823894500732,
+ "learning_rate": 3.6458015316393215e-06,
+ "loss": 0.4308,
+ "step": 4487
+ },
+ {
+ "epoch": 2.121985815602837,
+ "grad_norm": 2.4228954315185547,
+ "learning_rate": 3.645247046706588e-06,
+ "loss": 0.4042,
+ "step": 4488
+ },
+ {
+ "epoch": 2.1224586288416076,
+ "grad_norm": 2.553551435470581,
+ "learning_rate": 3.6446924904643427e-06,
+ "loss": 0.3925,
+ "step": 4489
+ },
+ {
+ "epoch": 2.1229314420803784,
+ "grad_norm": 2.8019237518310547,
+ "learning_rate": 3.6441378629471157e-06,
+ "loss": 0.4079,
+ "step": 4490
+ },
+ {
+ "epoch": 2.123404255319149,
+ "grad_norm": 2.993251085281372,
+ "learning_rate": 3.643583164189441e-06,
+ "loss": 0.4558,
+ "step": 4491
+ },
+ {
+ "epoch": 2.1238770685579196,
+ "grad_norm": 2.4531471729278564,
+ "learning_rate": 3.643028394225857e-06,
+ "loss": 0.4167,
+ "step": 4492
+ },
+ {
+ "epoch": 2.1243498817966904,
+ "grad_norm": 2.6827852725982666,
+ "learning_rate": 3.6424735530909065e-06,
+ "loss": 0.4311,
+ "step": 4493
+ },
+ {
+ "epoch": 2.124822695035461,
+ "grad_norm": 3.1232128143310547,
+ "learning_rate": 3.6419186408191377e-06,
+ "loss": 0.4537,
+ "step": 4494
+ },
+ {
+ "epoch": 2.1252955082742315,
+ "grad_norm": 2.816348075866699,
+ "learning_rate": 3.641363657445103e-06,
+ "loss": 0.4869,
+ "step": 4495
+ },
+ {
+ "epoch": 2.1257683215130023,
+ "grad_norm": 2.6269683837890625,
+ "learning_rate": 3.6408086030033575e-06,
+ "loss": 0.4066,
+ "step": 4496
+ },
+ {
+ "epoch": 2.126241134751773,
+ "grad_norm": 4.6375956535339355,
+ "learning_rate": 3.640253477528462e-06,
+ "loss": 0.4488,
+ "step": 4497
+ },
+ {
+ "epoch": 2.126713947990544,
+ "grad_norm": 3.020970582962036,
+ "learning_rate": 3.639698281054983e-06,
+ "loss": 0.4197,
+ "step": 4498
+ },
+ {
+ "epoch": 2.1271867612293143,
+ "grad_norm": 2.87904691696167,
+ "learning_rate": 3.6391430136174892e-06,
+ "loss": 0.4743,
+ "step": 4499
+ },
+ {
+ "epoch": 2.127659574468085,
+ "grad_norm": 2.719892978668213,
+ "learning_rate": 3.6385876752505554e-06,
+ "loss": 0.388,
+ "step": 4500
+ },
+ {
+ "epoch": 2.128132387706856,
+ "grad_norm": 2.7321808338165283,
+ "learning_rate": 3.638032265988759e-06,
+ "loss": 0.4857,
+ "step": 4501
+ },
+ {
+ "epoch": 2.1286052009456267,
+ "grad_norm": 2.700814723968506,
+ "learning_rate": 3.6374767858666836e-06,
+ "loss": 0.4819,
+ "step": 4502
+ },
+ {
+ "epoch": 2.129078014184397,
+ "grad_norm": 2.658423662185669,
+ "learning_rate": 3.6369212349189164e-06,
+ "loss": 0.4113,
+ "step": 4503
+ },
+ {
+ "epoch": 2.129550827423168,
+ "grad_norm": 2.673877716064453,
+ "learning_rate": 3.63636561318005e-06,
+ "loss": 0.3745,
+ "step": 4504
+ },
+ {
+ "epoch": 2.1300236406619386,
+ "grad_norm": 2.607758045196533,
+ "learning_rate": 3.6358099206846787e-06,
+ "loss": 0.4409,
+ "step": 4505
+ },
+ {
+ "epoch": 2.1304964539007094,
+ "grad_norm": 2.8117682933807373,
+ "learning_rate": 3.6352541574674044e-06,
+ "loss": 0.426,
+ "step": 4506
+ },
+ {
+ "epoch": 2.1309692671394798,
+ "grad_norm": 2.6970250606536865,
+ "learning_rate": 3.634698323562832e-06,
+ "loss": 0.4295,
+ "step": 4507
+ },
+ {
+ "epoch": 2.1314420803782506,
+ "grad_norm": 2.7133560180664062,
+ "learning_rate": 3.6341424190055696e-06,
+ "loss": 0.4443,
+ "step": 4508
+ },
+ {
+ "epoch": 2.1319148936170214,
+ "grad_norm": 2.57181715965271,
+ "learning_rate": 3.6335864438302328e-06,
+ "loss": 0.3995,
+ "step": 4509
+ },
+ {
+ "epoch": 2.132387706855792,
+ "grad_norm": 2.8618004322052,
+ "learning_rate": 3.633030398071438e-06,
+ "loss": 0.5075,
+ "step": 4510
+ },
+ {
+ "epoch": 2.1328605200945625,
+ "grad_norm": 2.7586729526519775,
+ "learning_rate": 3.6324742817638087e-06,
+ "loss": 0.4322,
+ "step": 4511
+ },
+ {
+ "epoch": 2.1333333333333333,
+ "grad_norm": 2.913256883621216,
+ "learning_rate": 3.631918094941972e-06,
+ "loss": 0.4708,
+ "step": 4512
+ },
+ {
+ "epoch": 2.133806146572104,
+ "grad_norm": 2.7715728282928467,
+ "learning_rate": 3.6313618376405585e-06,
+ "loss": 0.5194,
+ "step": 4513
+ },
+ {
+ "epoch": 2.134278959810875,
+ "grad_norm": 2.7986366748809814,
+ "learning_rate": 3.6308055098942042e-06,
+ "loss": 0.4419,
+ "step": 4514
+ },
+ {
+ "epoch": 2.1347517730496453,
+ "grad_norm": 3.043549060821533,
+ "learning_rate": 3.6302491117375492e-06,
+ "loss": 0.4441,
+ "step": 4515
+ },
+ {
+ "epoch": 2.135224586288416,
+ "grad_norm": 2.771761417388916,
+ "learning_rate": 3.629692643205238e-06,
+ "loss": 0.4752,
+ "step": 4516
+ },
+ {
+ "epoch": 2.135697399527187,
+ "grad_norm": 2.804941415786743,
+ "learning_rate": 3.6291361043319202e-06,
+ "loss": 0.4089,
+ "step": 4517
+ },
+ {
+ "epoch": 2.1361702127659576,
+ "grad_norm": 2.9897940158843994,
+ "learning_rate": 3.628579495152248e-06,
+ "loss": 0.4829,
+ "step": 4518
+ },
+ {
+ "epoch": 2.136643026004728,
+ "grad_norm": 2.9273486137390137,
+ "learning_rate": 3.6280228157008784e-06,
+ "loss": 0.4469,
+ "step": 4519
+ },
+ {
+ "epoch": 2.137115839243499,
+ "grad_norm": 2.584373950958252,
+ "learning_rate": 3.627466066012475e-06,
+ "loss": 0.4277,
+ "step": 4520
+ },
+ {
+ "epoch": 2.1375886524822696,
+ "grad_norm": 3.009333848953247,
+ "learning_rate": 3.626909246121703e-06,
+ "loss": 0.4025,
+ "step": 4521
+ },
+ {
+ "epoch": 2.1380614657210404,
+ "grad_norm": 2.634615659713745,
+ "learning_rate": 3.626352356063234e-06,
+ "loss": 0.4046,
+ "step": 4522
+ },
+ {
+ "epoch": 2.1385342789598107,
+ "grad_norm": 2.87310528755188,
+ "learning_rate": 3.625795395871743e-06,
+ "loss": 0.4426,
+ "step": 4523
+ },
+ {
+ "epoch": 2.1390070921985815,
+ "grad_norm": 2.94985032081604,
+ "learning_rate": 3.625238365581909e-06,
+ "loss": 0.445,
+ "step": 4524
+ },
+ {
+ "epoch": 2.1394799054373523,
+ "grad_norm": 2.470189332962036,
+ "learning_rate": 3.624681265228416e-06,
+ "loss": 0.4082,
+ "step": 4525
+ },
+ {
+ "epoch": 2.139952718676123,
+ "grad_norm": 2.5304040908813477,
+ "learning_rate": 3.624124094845952e-06,
+ "loss": 0.403,
+ "step": 4526
+ },
+ {
+ "epoch": 2.1404255319148935,
+ "grad_norm": 2.6148900985717773,
+ "learning_rate": 3.62356685446921e-06,
+ "loss": 0.3867,
+ "step": 4527
+ },
+ {
+ "epoch": 2.1408983451536643,
+ "grad_norm": 2.885549783706665,
+ "learning_rate": 3.623009544132886e-06,
+ "loss": 0.4706,
+ "step": 4528
+ },
+ {
+ "epoch": 2.141371158392435,
+ "grad_norm": 3.00490665435791,
+ "learning_rate": 3.6224521638716827e-06,
+ "loss": 0.4733,
+ "step": 4529
+ },
+ {
+ "epoch": 2.141843971631206,
+ "grad_norm": 2.925879716873169,
+ "learning_rate": 3.6218947137203043e-06,
+ "loss": 0.4581,
+ "step": 4530
+ },
+ {
+ "epoch": 2.1423167848699762,
+ "grad_norm": 3.10861873626709,
+ "learning_rate": 3.621337193713462e-06,
+ "loss": 0.4579,
+ "step": 4531
+ },
+ {
+ "epoch": 2.142789598108747,
+ "grad_norm": 2.7386577129364014,
+ "learning_rate": 3.6207796038858693e-06,
+ "loss": 0.4248,
+ "step": 4532
+ },
+ {
+ "epoch": 2.143262411347518,
+ "grad_norm": 2.601836681365967,
+ "learning_rate": 3.6202219442722453e-06,
+ "loss": 0.4928,
+ "step": 4533
+ },
+ {
+ "epoch": 2.1437352245862886,
+ "grad_norm": 2.598778247833252,
+ "learning_rate": 3.6196642149073123e-06,
+ "loss": 0.4415,
+ "step": 4534
+ },
+ {
+ "epoch": 2.144208037825059,
+ "grad_norm": 2.443995714187622,
+ "learning_rate": 3.619106415825798e-06,
+ "loss": 0.3917,
+ "step": 4535
+ },
+ {
+ "epoch": 2.1446808510638298,
+ "grad_norm": 2.84643816947937,
+ "learning_rate": 3.6185485470624354e-06,
+ "loss": 0.4162,
+ "step": 4536
+ },
+ {
+ "epoch": 2.1451536643026006,
+ "grad_norm": 2.4568188190460205,
+ "learning_rate": 3.617990608651959e-06,
+ "loss": 0.4298,
+ "step": 4537
+ },
+ {
+ "epoch": 2.145626477541371,
+ "grad_norm": 2.968804359436035,
+ "learning_rate": 3.61743260062911e-06,
+ "loss": 0.4696,
+ "step": 4538
+ },
+ {
+ "epoch": 2.1460992907801417,
+ "grad_norm": 2.629075288772583,
+ "learning_rate": 3.6168745230286327e-06,
+ "loss": 0.4234,
+ "step": 4539
+ },
+ {
+ "epoch": 2.1465721040189125,
+ "grad_norm": 2.7680578231811523,
+ "learning_rate": 3.6163163758852754e-06,
+ "loss": 0.4669,
+ "step": 4540
+ },
+ {
+ "epoch": 2.1470449172576833,
+ "grad_norm": 2.782825469970703,
+ "learning_rate": 3.615758159233793e-06,
+ "loss": 0.4552,
+ "step": 4541
+ },
+ {
+ "epoch": 2.147517730496454,
+ "grad_norm": 2.653047561645508,
+ "learning_rate": 3.615199873108942e-06,
+ "loss": 0.4393,
+ "step": 4542
+ },
+ {
+ "epoch": 2.1479905437352245,
+ "grad_norm": 2.4175806045532227,
+ "learning_rate": 3.6146415175454852e-06,
+ "loss": 0.4114,
+ "step": 4543
+ },
+ {
+ "epoch": 2.1484633569739953,
+ "grad_norm": 2.627943515777588,
+ "learning_rate": 3.614083092578189e-06,
+ "loss": 0.4215,
+ "step": 4544
+ },
+ {
+ "epoch": 2.148936170212766,
+ "grad_norm": 2.8934123516082764,
+ "learning_rate": 3.6135245982418227e-06,
+ "loss": 0.4815,
+ "step": 4545
+ },
+ {
+ "epoch": 2.1494089834515364,
+ "grad_norm": 2.8535244464874268,
+ "learning_rate": 3.612966034571164e-06,
+ "loss": 0.4683,
+ "step": 4546
+ },
+ {
+ "epoch": 2.149881796690307,
+ "grad_norm": 2.7826647758483887,
+ "learning_rate": 3.6124074016009893e-06,
+ "loss": 0.4351,
+ "step": 4547
+ },
+ {
+ "epoch": 2.150354609929078,
+ "grad_norm": 2.6906018257141113,
+ "learning_rate": 3.6118486993660834e-06,
+ "loss": 0.4585,
+ "step": 4548
+ },
+ {
+ "epoch": 2.150827423167849,
+ "grad_norm": 2.726766586303711,
+ "learning_rate": 3.6112899279012346e-06,
+ "loss": 0.4753,
+ "step": 4549
+ },
+ {
+ "epoch": 2.1513002364066196,
+ "grad_norm": 3.0193991661071777,
+ "learning_rate": 3.6107310872412348e-06,
+ "loss": 0.4827,
+ "step": 4550
+ },
+ {
+ "epoch": 2.15177304964539,
+ "grad_norm": 2.6788697242736816,
+ "learning_rate": 3.610172177420881e-06,
+ "loss": 0.4333,
+ "step": 4551
+ },
+ {
+ "epoch": 2.1522458628841608,
+ "grad_norm": 2.865410327911377,
+ "learning_rate": 3.609613198474973e-06,
+ "loss": 0.4569,
+ "step": 4552
+ },
+ {
+ "epoch": 2.1527186761229316,
+ "grad_norm": 2.9199366569519043,
+ "learning_rate": 3.609054150438317e-06,
+ "loss": 0.5097,
+ "step": 4553
+ },
+ {
+ "epoch": 2.153191489361702,
+ "grad_norm": 2.761035203933716,
+ "learning_rate": 3.6084950333457215e-06,
+ "loss": 0.5002,
+ "step": 4554
+ },
+ {
+ "epoch": 2.1536643026004727,
+ "grad_norm": 2.514223337173462,
+ "learning_rate": 3.607935847232002e-06,
+ "loss": 0.4171,
+ "step": 4555
+ },
+ {
+ "epoch": 2.1541371158392435,
+ "grad_norm": 2.5167524814605713,
+ "learning_rate": 3.6073765921319747e-06,
+ "loss": 0.4494,
+ "step": 4556
+ },
+ {
+ "epoch": 2.1546099290780143,
+ "grad_norm": 2.7540643215179443,
+ "learning_rate": 3.606817268080463e-06,
+ "loss": 0.4472,
+ "step": 4557
+ },
+ {
+ "epoch": 2.155082742316785,
+ "grad_norm": 2.7728664875030518,
+ "learning_rate": 3.6062578751122936e-06,
+ "loss": 0.4669,
+ "step": 4558
+ },
+ {
+ "epoch": 2.1555555555555554,
+ "grad_norm": 2.7788400650024414,
+ "learning_rate": 3.605698413262296e-06,
+ "loss": 0.4613,
+ "step": 4559
+ },
+ {
+ "epoch": 2.1560283687943262,
+ "grad_norm": 2.7811810970306396,
+ "learning_rate": 3.605138882565308e-06,
+ "loss": 0.4242,
+ "step": 4560
+ },
+ {
+ "epoch": 2.156501182033097,
+ "grad_norm": 2.7819995880126953,
+ "learning_rate": 3.6045792830561664e-06,
+ "loss": 0.443,
+ "step": 4561
+ },
+ {
+ "epoch": 2.1569739952718674,
+ "grad_norm": 2.671259641647339,
+ "learning_rate": 3.6040196147697166e-06,
+ "loss": 0.4336,
+ "step": 4562
+ },
+ {
+ "epoch": 2.157446808510638,
+ "grad_norm": 2.9296300411224365,
+ "learning_rate": 3.603459877740807e-06,
+ "loss": 0.479,
+ "step": 4563
+ },
+ {
+ "epoch": 2.157919621749409,
+ "grad_norm": 2.834937334060669,
+ "learning_rate": 3.602900072004289e-06,
+ "loss": 0.4603,
+ "step": 4564
+ },
+ {
+ "epoch": 2.15839243498818,
+ "grad_norm": 2.8434760570526123,
+ "learning_rate": 3.602340197595019e-06,
+ "loss": 0.4288,
+ "step": 4565
+ },
+ {
+ "epoch": 2.1588652482269506,
+ "grad_norm": 2.7245426177978516,
+ "learning_rate": 3.6017802545478593e-06,
+ "loss": 0.4194,
+ "step": 4566
+ },
+ {
+ "epoch": 2.159338061465721,
+ "grad_norm": 2.7795023918151855,
+ "learning_rate": 3.6012202428976735e-06,
+ "loss": 0.4481,
+ "step": 4567
+ },
+ {
+ "epoch": 2.1598108747044917,
+ "grad_norm": 2.9482083320617676,
+ "learning_rate": 3.6006601626793325e-06,
+ "loss": 0.468,
+ "step": 4568
+ },
+ {
+ "epoch": 2.1602836879432625,
+ "grad_norm": 2.9563326835632324,
+ "learning_rate": 3.6001000139277094e-06,
+ "loss": 0.4427,
+ "step": 4569
+ },
+ {
+ "epoch": 2.160756501182033,
+ "grad_norm": 2.7755916118621826,
+ "learning_rate": 3.599539796677682e-06,
+ "loss": 0.4258,
+ "step": 4570
+ },
+ {
+ "epoch": 2.1612293144208037,
+ "grad_norm": 2.961045265197754,
+ "learning_rate": 3.5989795109641333e-06,
+ "loss": 0.4645,
+ "step": 4571
+ },
+ {
+ "epoch": 2.1617021276595745,
+ "grad_norm": 3.0184407234191895,
+ "learning_rate": 3.5984191568219482e-06,
+ "loss": 0.4192,
+ "step": 4572
+ },
+ {
+ "epoch": 2.1621749408983453,
+ "grad_norm": 2.9811131954193115,
+ "learning_rate": 3.5978587342860192e-06,
+ "loss": 0.408,
+ "step": 4573
+ },
+ {
+ "epoch": 2.162647754137116,
+ "grad_norm": 2.9172329902648926,
+ "learning_rate": 3.597298243391242e-06,
+ "loss": 0.4528,
+ "step": 4574
+ },
+ {
+ "epoch": 2.1631205673758864,
+ "grad_norm": 2.7798452377319336,
+ "learning_rate": 3.596737684172513e-06,
+ "loss": 0.391,
+ "step": 4575
+ },
+ {
+ "epoch": 2.1635933806146572,
+ "grad_norm": 2.526277542114258,
+ "learning_rate": 3.596177056664738e-06,
+ "loss": 0.3699,
+ "step": 4576
+ },
+ {
+ "epoch": 2.164066193853428,
+ "grad_norm": 2.856269121170044,
+ "learning_rate": 3.5956163609028244e-06,
+ "loss": 0.4082,
+ "step": 4577
+ },
+ {
+ "epoch": 2.1645390070921984,
+ "grad_norm": 2.7681572437286377,
+ "learning_rate": 3.5950555969216845e-06,
+ "loss": 0.4064,
+ "step": 4578
+ },
+ {
+ "epoch": 2.165011820330969,
+ "grad_norm": 2.2924954891204834,
+ "learning_rate": 3.5944947647562333e-06,
+ "loss": 0.416,
+ "step": 4579
+ },
+ {
+ "epoch": 2.16548463356974,
+ "grad_norm": 2.439929485321045,
+ "learning_rate": 3.5939338644413936e-06,
+ "loss": 0.4476,
+ "step": 4580
+ },
+ {
+ "epoch": 2.1659574468085108,
+ "grad_norm": 2.786442518234253,
+ "learning_rate": 3.5933728960120877e-06,
+ "loss": 0.4525,
+ "step": 4581
+ },
+ {
+ "epoch": 2.166430260047281,
+ "grad_norm": 2.5910253524780273,
+ "learning_rate": 3.5928118595032465e-06,
+ "loss": 0.4441,
+ "step": 4582
+ },
+ {
+ "epoch": 2.166903073286052,
+ "grad_norm": 2.8144876956939697,
+ "learning_rate": 3.5922507549498024e-06,
+ "loss": 0.497,
+ "step": 4583
+ },
+ {
+ "epoch": 2.1673758865248227,
+ "grad_norm": 2.5714170932769775,
+ "learning_rate": 3.591689582386694e-06,
+ "loss": 0.4625,
+ "step": 4584
+ },
+ {
+ "epoch": 2.1678486997635935,
+ "grad_norm": 2.878187894821167,
+ "learning_rate": 3.591128341848861e-06,
+ "loss": 0.4835,
+ "step": 4585
+ },
+ {
+ "epoch": 2.168321513002364,
+ "grad_norm": 2.4946508407592773,
+ "learning_rate": 3.5905670333712504e-06,
+ "loss": 0.4278,
+ "step": 4586
+ },
+ {
+ "epoch": 2.1687943262411347,
+ "grad_norm": 2.9186196327209473,
+ "learning_rate": 3.590005656988814e-06,
+ "loss": 0.465,
+ "step": 4587
+ },
+ {
+ "epoch": 2.1692671394799055,
+ "grad_norm": 3.136807441711426,
+ "learning_rate": 3.5894442127365046e-06,
+ "loss": 0.4146,
+ "step": 4588
+ },
+ {
+ "epoch": 2.1697399527186763,
+ "grad_norm": 2.8106343746185303,
+ "learning_rate": 3.5888827006492804e-06,
+ "loss": 0.4737,
+ "step": 4589
+ },
+ {
+ "epoch": 2.1702127659574466,
+ "grad_norm": 2.874553680419922,
+ "learning_rate": 3.5883211207621047e-06,
+ "loss": 0.3962,
+ "step": 4590
+ },
+ {
+ "epoch": 2.1706855791962174,
+ "grad_norm": 2.7914116382598877,
+ "learning_rate": 3.587759473109946e-06,
+ "loss": 0.4705,
+ "step": 4591
+ },
+ {
+ "epoch": 2.171158392434988,
+ "grad_norm": 2.7273290157318115,
+ "learning_rate": 3.5871977577277745e-06,
+ "loss": 0.4827,
+ "step": 4592
+ },
+ {
+ "epoch": 2.171631205673759,
+ "grad_norm": 2.4167256355285645,
+ "learning_rate": 3.5866359746505653e-06,
+ "loss": 0.4181,
+ "step": 4593
+ },
+ {
+ "epoch": 2.1721040189125294,
+ "grad_norm": 2.8929779529571533,
+ "learning_rate": 3.586074123913299e-06,
+ "loss": 0.4006,
+ "step": 4594
+ },
+ {
+ "epoch": 2.1725768321513,
+ "grad_norm": 2.6996190547943115,
+ "learning_rate": 3.5855122055509593e-06,
+ "loss": 0.4792,
+ "step": 4595
+ },
+ {
+ "epoch": 2.173049645390071,
+ "grad_norm": 2.9341464042663574,
+ "learning_rate": 3.584950219598534e-06,
+ "loss": 0.3903,
+ "step": 4596
+ },
+ {
+ "epoch": 2.1735224586288417,
+ "grad_norm": 2.799330234527588,
+ "learning_rate": 3.5843881660910166e-06,
+ "loss": 0.4717,
+ "step": 4597
+ },
+ {
+ "epoch": 2.173995271867612,
+ "grad_norm": 2.5028693675994873,
+ "learning_rate": 3.5838260450634028e-06,
+ "loss": 0.4462,
+ "step": 4598
+ },
+ {
+ "epoch": 2.174468085106383,
+ "grad_norm": 2.5845541954040527,
+ "learning_rate": 3.583263856550693e-06,
+ "loss": 0.4327,
+ "step": 4599
+ },
+ {
+ "epoch": 2.1749408983451537,
+ "grad_norm": 2.4804906845092773,
+ "learning_rate": 3.5827016005878933e-06,
+ "loss": 0.4555,
+ "step": 4600
+ },
+ {
+ "epoch": 2.1754137115839245,
+ "grad_norm": 2.625746011734009,
+ "learning_rate": 3.5821392772100125e-06,
+ "loss": 0.455,
+ "step": 4601
+ },
+ {
+ "epoch": 2.175886524822695,
+ "grad_norm": 2.6230757236480713,
+ "learning_rate": 3.581576886452064e-06,
+ "loss": 0.4422,
+ "step": 4602
+ },
+ {
+ "epoch": 2.1763593380614656,
+ "grad_norm": 3.3104100227355957,
+ "learning_rate": 3.5810144283490656e-06,
+ "loss": 0.4212,
+ "step": 4603
+ },
+ {
+ "epoch": 2.1768321513002364,
+ "grad_norm": 2.6799755096435547,
+ "learning_rate": 3.5804519029360384e-06,
+ "loss": 0.4575,
+ "step": 4604
+ },
+ {
+ "epoch": 2.1773049645390072,
+ "grad_norm": 2.462216854095459,
+ "learning_rate": 3.5798893102480085e-06,
+ "loss": 0.4096,
+ "step": 4605
+ },
+ {
+ "epoch": 2.1777777777777776,
+ "grad_norm": 2.8600878715515137,
+ "learning_rate": 3.5793266503200074e-06,
+ "loss": 0.4798,
+ "step": 4606
+ },
+ {
+ "epoch": 2.1782505910165484,
+ "grad_norm": 2.935746431350708,
+ "learning_rate": 3.5787639231870673e-06,
+ "loss": 0.4021,
+ "step": 4607
+ },
+ {
+ "epoch": 2.178723404255319,
+ "grad_norm": 2.8655526638031006,
+ "learning_rate": 3.578201128884229e-06,
+ "loss": 0.4553,
+ "step": 4608
+ },
+ {
+ "epoch": 2.17919621749409,
+ "grad_norm": 3.219498634338379,
+ "learning_rate": 3.577638267446533e-06,
+ "loss": 0.4692,
+ "step": 4609
+ },
+ {
+ "epoch": 2.1796690307328603,
+ "grad_norm": 3.0449860095977783,
+ "learning_rate": 3.5770753389090283e-06,
+ "loss": 0.4675,
+ "step": 4610
+ },
+ {
+ "epoch": 2.180141843971631,
+ "grad_norm": 2.7045507431030273,
+ "learning_rate": 3.576512343306765e-06,
+ "loss": 0.4773,
+ "step": 4611
+ },
+ {
+ "epoch": 2.180614657210402,
+ "grad_norm": 2.601499557495117,
+ "learning_rate": 3.5759492806747985e-06,
+ "loss": 0.4112,
+ "step": 4612
+ },
+ {
+ "epoch": 2.1810874704491727,
+ "grad_norm": 2.987741470336914,
+ "learning_rate": 3.575386151048188e-06,
+ "loss": 0.4651,
+ "step": 4613
+ },
+ {
+ "epoch": 2.181560283687943,
+ "grad_norm": 2.961228847503662,
+ "learning_rate": 3.5748229544619973e-06,
+ "loss": 0.5116,
+ "step": 4614
+ },
+ {
+ "epoch": 2.182033096926714,
+ "grad_norm": 2.8008430004119873,
+ "learning_rate": 3.574259690951295e-06,
+ "loss": 0.4152,
+ "step": 4615
+ },
+ {
+ "epoch": 2.1825059101654847,
+ "grad_norm": 2.5429348945617676,
+ "learning_rate": 3.573696360551151e-06,
+ "loss": 0.4188,
+ "step": 4616
+ },
+ {
+ "epoch": 2.1829787234042555,
+ "grad_norm": 2.9566478729248047,
+ "learning_rate": 3.5731329632966428e-06,
+ "loss": 0.5156,
+ "step": 4617
+ },
+ {
+ "epoch": 2.183451536643026,
+ "grad_norm": 2.5302467346191406,
+ "learning_rate": 3.572569499222851e-06,
+ "loss": 0.4361,
+ "step": 4618
+ },
+ {
+ "epoch": 2.1839243498817966,
+ "grad_norm": 3.206803560256958,
+ "learning_rate": 3.5720059683648593e-06,
+ "loss": 0.5149,
+ "step": 4619
+ },
+ {
+ "epoch": 2.1843971631205674,
+ "grad_norm": 2.9432034492492676,
+ "learning_rate": 3.5714423707577573e-06,
+ "loss": 0.4411,
+ "step": 4620
+ },
+ {
+ "epoch": 2.184869976359338,
+ "grad_norm": 2.9412078857421875,
+ "learning_rate": 3.5708787064366358e-06,
+ "loss": 0.4372,
+ "step": 4621
+ },
+ {
+ "epoch": 2.1853427895981086,
+ "grad_norm": 3.1702330112457275,
+ "learning_rate": 3.5703149754365935e-06,
+ "loss": 0.4761,
+ "step": 4622
+ },
+ {
+ "epoch": 2.1858156028368794,
+ "grad_norm": 3.1240456104278564,
+ "learning_rate": 3.569751177792731e-06,
+ "loss": 0.4854,
+ "step": 4623
+ },
+ {
+ "epoch": 2.18628841607565,
+ "grad_norm": 2.7221994400024414,
+ "learning_rate": 3.5691873135401534e-06,
+ "loss": 0.4048,
+ "step": 4624
+ },
+ {
+ "epoch": 2.186761229314421,
+ "grad_norm": 2.74397873878479,
+ "learning_rate": 3.5686233827139695e-06,
+ "loss": 0.4747,
+ "step": 4625
+ },
+ {
+ "epoch": 2.1872340425531913,
+ "grad_norm": 2.7379889488220215,
+ "learning_rate": 3.5680593853492932e-06,
+ "loss": 0.4963,
+ "step": 4626
+ },
+ {
+ "epoch": 2.187706855791962,
+ "grad_norm": 3.040205478668213,
+ "learning_rate": 3.5674953214812435e-06,
+ "loss": 0.4917,
+ "step": 4627
+ },
+ {
+ "epoch": 2.188179669030733,
+ "grad_norm": 2.95302677154541,
+ "learning_rate": 3.56693119114494e-06,
+ "loss": 0.4758,
+ "step": 4628
+ },
+ {
+ "epoch": 2.1886524822695037,
+ "grad_norm": 2.5488312244415283,
+ "learning_rate": 3.56636699437551e-06,
+ "loss": 0.4057,
+ "step": 4629
+ },
+ {
+ "epoch": 2.189125295508274,
+ "grad_norm": 2.8379666805267334,
+ "learning_rate": 3.565802731208083e-06,
+ "loss": 0.4755,
+ "step": 4630
+ },
+ {
+ "epoch": 2.189598108747045,
+ "grad_norm": 2.8765869140625,
+ "learning_rate": 3.565238401677793e-06,
+ "loss": 0.4232,
+ "step": 4631
+ },
+ {
+ "epoch": 2.1900709219858157,
+ "grad_norm": 2.9091262817382812,
+ "learning_rate": 3.5646740058197784e-06,
+ "loss": 0.3874,
+ "step": 4632
+ },
+ {
+ "epoch": 2.1905437352245865,
+ "grad_norm": 2.7067387104034424,
+ "learning_rate": 3.5641095436691826e-06,
+ "loss": 0.4771,
+ "step": 4633
+ },
+ {
+ "epoch": 2.191016548463357,
+ "grad_norm": 2.403043508529663,
+ "learning_rate": 3.563545015261151e-06,
+ "loss": 0.4062,
+ "step": 4634
+ },
+ {
+ "epoch": 2.1914893617021276,
+ "grad_norm": 2.8059732913970947,
+ "learning_rate": 3.562980420630836e-06,
+ "loss": 0.4635,
+ "step": 4635
+ },
+ {
+ "epoch": 2.1919621749408984,
+ "grad_norm": 2.5467724800109863,
+ "learning_rate": 3.56241575981339e-06,
+ "loss": 0.4552,
+ "step": 4636
+ },
+ {
+ "epoch": 2.192434988179669,
+ "grad_norm": 2.651024103164673,
+ "learning_rate": 3.561851032843973e-06,
+ "loss": 0.38,
+ "step": 4637
+ },
+ {
+ "epoch": 2.1929078014184396,
+ "grad_norm": 2.5529849529266357,
+ "learning_rate": 3.5612862397577496e-06,
+ "loss": 0.4106,
+ "step": 4638
+ },
+ {
+ "epoch": 2.1933806146572103,
+ "grad_norm": 3.069258451461792,
+ "learning_rate": 3.5607213805898844e-06,
+ "loss": 0.461,
+ "step": 4639
+ },
+ {
+ "epoch": 2.193853427895981,
+ "grad_norm": 2.5652637481689453,
+ "learning_rate": 3.56015645537555e-06,
+ "loss": 0.4497,
+ "step": 4640
+ },
+ {
+ "epoch": 2.194326241134752,
+ "grad_norm": 2.699101209640503,
+ "learning_rate": 3.5595914641499224e-06,
+ "loss": 0.4887,
+ "step": 4641
+ },
+ {
+ "epoch": 2.1947990543735223,
+ "grad_norm": 2.9292235374450684,
+ "learning_rate": 3.5590264069481805e-06,
+ "loss": 0.4462,
+ "step": 4642
+ },
+ {
+ "epoch": 2.195271867612293,
+ "grad_norm": 2.6151106357574463,
+ "learning_rate": 3.5584612838055077e-06,
+ "loss": 0.4334,
+ "step": 4643
+ },
+ {
+ "epoch": 2.195744680851064,
+ "grad_norm": 2.895798444747925,
+ "learning_rate": 3.5578960947570923e-06,
+ "loss": 0.4448,
+ "step": 4644
+ },
+ {
+ "epoch": 2.1962174940898347,
+ "grad_norm": 2.627631425857544,
+ "learning_rate": 3.557330839838125e-06,
+ "loss": 0.436,
+ "step": 4645
+ },
+ {
+ "epoch": 2.196690307328605,
+ "grad_norm": 2.8803584575653076,
+ "learning_rate": 3.556765519083803e-06,
+ "loss": 0.4698,
+ "step": 4646
+ },
+ {
+ "epoch": 2.197163120567376,
+ "grad_norm": 2.436609983444214,
+ "learning_rate": 3.5562001325293265e-06,
+ "loss": 0.4043,
+ "step": 4647
+ },
+ {
+ "epoch": 2.1976359338061466,
+ "grad_norm": 2.5090718269348145,
+ "learning_rate": 3.5556346802098985e-06,
+ "loss": 0.4505,
+ "step": 4648
+ },
+ {
+ "epoch": 2.1981087470449174,
+ "grad_norm": 2.792783737182617,
+ "learning_rate": 3.5550691621607277e-06,
+ "loss": 0.43,
+ "step": 4649
+ },
+ {
+ "epoch": 2.198581560283688,
+ "grad_norm": 2.74153470993042,
+ "learning_rate": 3.554503578417026e-06,
+ "loss": 0.4496,
+ "step": 4650
+ },
+ {
+ "epoch": 2.1990543735224586,
+ "grad_norm": 3.0262627601623535,
+ "learning_rate": 3.5539379290140114e-06,
+ "loss": 0.4503,
+ "step": 4651
+ },
+ {
+ "epoch": 2.1995271867612294,
+ "grad_norm": 2.783811330795288,
+ "learning_rate": 3.553372213986903e-06,
+ "loss": 0.432,
+ "step": 4652
+ },
+ {
+ "epoch": 2.2,
+ "grad_norm": 3.091191053390503,
+ "learning_rate": 3.5528064333709255e-06,
+ "loss": 0.4658,
+ "step": 4653
+ },
+ {
+ "epoch": 2.2004728132387705,
+ "grad_norm": 2.814634084701538,
+ "learning_rate": 3.5522405872013076e-06,
+ "loss": 0.4473,
+ "step": 4654
+ },
+ {
+ "epoch": 2.2009456264775413,
+ "grad_norm": 2.6918299198150635,
+ "learning_rate": 3.5516746755132824e-06,
+ "loss": 0.5323,
+ "step": 4655
+ },
+ {
+ "epoch": 2.201418439716312,
+ "grad_norm": 2.9902455806732178,
+ "learning_rate": 3.5511086983420867e-06,
+ "loss": 0.5166,
+ "step": 4656
+ },
+ {
+ "epoch": 2.201891252955083,
+ "grad_norm": 2.932699203491211,
+ "learning_rate": 3.5505426557229616e-06,
+ "loss": 0.5197,
+ "step": 4657
+ },
+ {
+ "epoch": 2.2023640661938533,
+ "grad_norm": 2.585712432861328,
+ "learning_rate": 3.549976547691152e-06,
+ "loss": 0.425,
+ "step": 4658
+ },
+ {
+ "epoch": 2.202836879432624,
+ "grad_norm": 3.1019949913024902,
+ "learning_rate": 3.5494103742819065e-06,
+ "loss": 0.485,
+ "step": 4659
+ },
+ {
+ "epoch": 2.203309692671395,
+ "grad_norm": 2.3169195652008057,
+ "learning_rate": 3.548844135530478e-06,
+ "loss": 0.4064,
+ "step": 4660
+ },
+ {
+ "epoch": 2.2037825059101657,
+ "grad_norm": 2.779240846633911,
+ "learning_rate": 3.5482778314721257e-06,
+ "loss": 0.427,
+ "step": 4661
+ },
+ {
+ "epoch": 2.204255319148936,
+ "grad_norm": 2.765423059463501,
+ "learning_rate": 3.5477114621421078e-06,
+ "loss": 0.5125,
+ "step": 4662
+ },
+ {
+ "epoch": 2.204728132387707,
+ "grad_norm": 2.5590033531188965,
+ "learning_rate": 3.5471450275756913e-06,
+ "loss": 0.4009,
+ "step": 4663
+ },
+ {
+ "epoch": 2.2052009456264776,
+ "grad_norm": 2.706068515777588,
+ "learning_rate": 3.546578527808146e-06,
+ "loss": 0.4604,
+ "step": 4664
+ },
+ {
+ "epoch": 2.2056737588652484,
+ "grad_norm": 2.7995102405548096,
+ "learning_rate": 3.546011962874745e-06,
+ "loss": 0.4088,
+ "step": 4665
+ },
+ {
+ "epoch": 2.2061465721040188,
+ "grad_norm": 2.6369729042053223,
+ "learning_rate": 3.5454453328107656e-06,
+ "loss": 0.4634,
+ "step": 4666
+ },
+ {
+ "epoch": 2.2066193853427896,
+ "grad_norm": 3.1426475048065186,
+ "learning_rate": 3.54487863765149e-06,
+ "loss": 0.4761,
+ "step": 4667
+ },
+ {
+ "epoch": 2.2070921985815604,
+ "grad_norm": 2.7739460468292236,
+ "learning_rate": 3.5443118774322027e-06,
+ "loss": 0.467,
+ "step": 4668
+ },
+ {
+ "epoch": 2.207565011820331,
+ "grad_norm": 2.559105157852173,
+ "learning_rate": 3.5437450521881934e-06,
+ "loss": 0.4268,
+ "step": 4669
+ },
+ {
+ "epoch": 2.2080378250591015,
+ "grad_norm": 2.726593017578125,
+ "learning_rate": 3.543178161954758e-06,
+ "loss": 0.462,
+ "step": 4670
+ },
+ {
+ "epoch": 2.2085106382978723,
+ "grad_norm": 2.796109199523926,
+ "learning_rate": 3.5426112067671907e-06,
+ "loss": 0.4571,
+ "step": 4671
+ },
+ {
+ "epoch": 2.208983451536643,
+ "grad_norm": 2.7989072799682617,
+ "learning_rate": 3.5420441866607964e-06,
+ "loss": 0.4648,
+ "step": 4672
+ },
+ {
+ "epoch": 2.209456264775414,
+ "grad_norm": 2.6750967502593994,
+ "learning_rate": 3.5414771016708795e-06,
+ "loss": 0.4717,
+ "step": 4673
+ },
+ {
+ "epoch": 2.2099290780141843,
+ "grad_norm": 2.705659866333008,
+ "learning_rate": 3.5409099518327507e-06,
+ "loss": 0.4738,
+ "step": 4674
+ },
+ {
+ "epoch": 2.210401891252955,
+ "grad_norm": 2.79276442527771,
+ "learning_rate": 3.5403427371817234e-06,
+ "loss": 0.4625,
+ "step": 4675
+ },
+ {
+ "epoch": 2.210874704491726,
+ "grad_norm": 2.781339406967163,
+ "learning_rate": 3.539775457753115e-06,
+ "loss": 0.438,
+ "step": 4676
+ },
+ {
+ "epoch": 2.2113475177304966,
+ "grad_norm": 3.0088918209075928,
+ "learning_rate": 3.5392081135822488e-06,
+ "loss": 0.4776,
+ "step": 4677
+ },
+ {
+ "epoch": 2.211820330969267,
+ "grad_norm": 3.0291390419006348,
+ "learning_rate": 3.538640704704449e-06,
+ "loss": 0.4634,
+ "step": 4678
+ },
+ {
+ "epoch": 2.212293144208038,
+ "grad_norm": 2.967867374420166,
+ "learning_rate": 3.5380732311550477e-06,
+ "loss": 0.4776,
+ "step": 4679
+ },
+ {
+ "epoch": 2.2127659574468086,
+ "grad_norm": 2.6268832683563232,
+ "learning_rate": 3.5375056929693787e-06,
+ "loss": 0.4646,
+ "step": 4680
+ },
+ {
+ "epoch": 2.2132387706855794,
+ "grad_norm": 2.6688554286956787,
+ "learning_rate": 3.536938090182778e-06,
+ "loss": 0.3975,
+ "step": 4681
+ },
+ {
+ "epoch": 2.2137115839243497,
+ "grad_norm": 3.0079736709594727,
+ "learning_rate": 3.5363704228305906e-06,
+ "loss": 0.4724,
+ "step": 4682
+ },
+ {
+ "epoch": 2.2141843971631205,
+ "grad_norm": 2.4287586212158203,
+ "learning_rate": 3.535802690948161e-06,
+ "loss": 0.4371,
+ "step": 4683
+ },
+ {
+ "epoch": 2.2146572104018913,
+ "grad_norm": 2.960679531097412,
+ "learning_rate": 3.53523489457084e-06,
+ "loss": 0.4347,
+ "step": 4684
+ },
+ {
+ "epoch": 2.215130023640662,
+ "grad_norm": 2.9646008014678955,
+ "learning_rate": 3.5346670337339807e-06,
+ "loss": 0.4803,
+ "step": 4685
+ },
+ {
+ "epoch": 2.2156028368794325,
+ "grad_norm": 3.0518898963928223,
+ "learning_rate": 3.534099108472942e-06,
+ "loss": 0.4712,
+ "step": 4686
+ },
+ {
+ "epoch": 2.2160756501182033,
+ "grad_norm": 2.776681900024414,
+ "learning_rate": 3.533531118823086e-06,
+ "loss": 0.4347,
+ "step": 4687
+ },
+ {
+ "epoch": 2.216548463356974,
+ "grad_norm": 2.18019437789917,
+ "learning_rate": 3.53296306481978e-06,
+ "loss": 0.3551,
+ "step": 4688
+ },
+ {
+ "epoch": 2.217021276595745,
+ "grad_norm": 2.9400811195373535,
+ "learning_rate": 3.5323949464983937e-06,
+ "loss": 0.4912,
+ "step": 4689
+ },
+ {
+ "epoch": 2.2174940898345152,
+ "grad_norm": 2.798386812210083,
+ "learning_rate": 3.5318267638943e-06,
+ "loss": 0.3967,
+ "step": 4690
+ },
+ {
+ "epoch": 2.217966903073286,
+ "grad_norm": 2.5452775955200195,
+ "learning_rate": 3.531258517042879e-06,
+ "loss": 0.3773,
+ "step": 4691
+ },
+ {
+ "epoch": 2.218439716312057,
+ "grad_norm": 2.711137294769287,
+ "learning_rate": 3.5306902059795113e-06,
+ "loss": 0.4123,
+ "step": 4692
+ },
+ {
+ "epoch": 2.2189125295508276,
+ "grad_norm": 3.0022387504577637,
+ "learning_rate": 3.530121830739584e-06,
+ "loss": 0.4898,
+ "step": 4693
+ },
+ {
+ "epoch": 2.219385342789598,
+ "grad_norm": 2.871814250946045,
+ "learning_rate": 3.5295533913584877e-06,
+ "loss": 0.4497,
+ "step": 4694
+ },
+ {
+ "epoch": 2.219858156028369,
+ "grad_norm": 2.9782521724700928,
+ "learning_rate": 3.528984887871616e-06,
+ "loss": 0.4797,
+ "step": 4695
+ },
+ {
+ "epoch": 2.2203309692671396,
+ "grad_norm": 2.6896398067474365,
+ "learning_rate": 3.5284163203143673e-06,
+ "loss": 0.439,
+ "step": 4696
+ },
+ {
+ "epoch": 2.2208037825059104,
+ "grad_norm": 2.7898833751678467,
+ "learning_rate": 3.5278476887221436e-06,
+ "loss": 0.4656,
+ "step": 4697
+ },
+ {
+ "epoch": 2.2212765957446807,
+ "grad_norm": 2.800416946411133,
+ "learning_rate": 3.527278993130352e-06,
+ "loss": 0.4452,
+ "step": 4698
+ },
+ {
+ "epoch": 2.2217494089834515,
+ "grad_norm": 3.653228998184204,
+ "learning_rate": 3.526710233574401e-06,
+ "loss": 0.4189,
+ "step": 4699
+ },
+ {
+ "epoch": 2.2222222222222223,
+ "grad_norm": 2.856956958770752,
+ "learning_rate": 3.5261414100897064e-06,
+ "loss": 0.4298,
+ "step": 4700
+ },
+ {
+ "epoch": 2.222695035460993,
+ "grad_norm": 2.8576223850250244,
+ "learning_rate": 3.5255725227116854e-06,
+ "loss": 0.4425,
+ "step": 4701
+ },
+ {
+ "epoch": 2.2231678486997635,
+ "grad_norm": 3.1161351203918457,
+ "learning_rate": 3.5250035714757603e-06,
+ "loss": 0.4609,
+ "step": 4702
+ },
+ {
+ "epoch": 2.2236406619385343,
+ "grad_norm": 2.843379259109497,
+ "learning_rate": 3.5244345564173578e-06,
+ "loss": 0.3589,
+ "step": 4703
+ },
+ {
+ "epoch": 2.224113475177305,
+ "grad_norm": 2.877157211303711,
+ "learning_rate": 3.5238654775719068e-06,
+ "loss": 0.4591,
+ "step": 4704
+ },
+ {
+ "epoch": 2.2245862884160754,
+ "grad_norm": 3.488954782485962,
+ "learning_rate": 3.5232963349748424e-06,
+ "loss": 0.4836,
+ "step": 4705
+ },
+ {
+ "epoch": 2.225059101654846,
+ "grad_norm": 2.929037570953369,
+ "learning_rate": 3.5227271286616025e-06,
+ "loss": 0.5293,
+ "step": 4706
+ },
+ {
+ "epoch": 2.225531914893617,
+ "grad_norm": 2.6230576038360596,
+ "learning_rate": 3.5221578586676286e-06,
+ "loss": 0.4235,
+ "step": 4707
+ },
+ {
+ "epoch": 2.226004728132388,
+ "grad_norm": 2.529998302459717,
+ "learning_rate": 3.5215885250283664e-06,
+ "loss": 0.4369,
+ "step": 4708
+ },
+ {
+ "epoch": 2.2264775413711586,
+ "grad_norm": 2.817279577255249,
+ "learning_rate": 3.521019127779267e-06,
+ "loss": 0.481,
+ "step": 4709
+ },
+ {
+ "epoch": 2.226950354609929,
+ "grad_norm": 3.1513843536376953,
+ "learning_rate": 3.5204496669557833e-06,
+ "loss": 0.463,
+ "step": 4710
+ },
+ {
+ "epoch": 2.2274231678486998,
+ "grad_norm": 2.9403610229492188,
+ "learning_rate": 3.5198801425933725e-06,
+ "loss": 0.455,
+ "step": 4711
+ },
+ {
+ "epoch": 2.2278959810874706,
+ "grad_norm": 2.648346424102783,
+ "learning_rate": 3.5193105547274987e-06,
+ "loss": 0.4441,
+ "step": 4712
+ },
+ {
+ "epoch": 2.228368794326241,
+ "grad_norm": 2.791898727416992,
+ "learning_rate": 3.5187409033936252e-06,
+ "loss": 0.4682,
+ "step": 4713
+ },
+ {
+ "epoch": 2.2288416075650117,
+ "grad_norm": 2.8157432079315186,
+ "learning_rate": 3.5181711886272242e-06,
+ "loss": 0.4572,
+ "step": 4714
+ },
+ {
+ "epoch": 2.2293144208037825,
+ "grad_norm": 3.250319480895996,
+ "learning_rate": 3.5176014104637665e-06,
+ "loss": 0.4599,
+ "step": 4715
+ },
+ {
+ "epoch": 2.2297872340425533,
+ "grad_norm": 2.6747050285339355,
+ "learning_rate": 3.5170315689387307e-06,
+ "loss": 0.4328,
+ "step": 4716
+ },
+ {
+ "epoch": 2.230260047281324,
+ "grad_norm": 2.584094762802124,
+ "learning_rate": 3.5164616640875993e-06,
+ "loss": 0.4268,
+ "step": 4717
+ },
+ {
+ "epoch": 2.2307328605200945,
+ "grad_norm": 2.480710506439209,
+ "learning_rate": 3.5158916959458573e-06,
+ "loss": 0.438,
+ "step": 4718
+ },
+ {
+ "epoch": 2.2312056737588652,
+ "grad_norm": 2.9338483810424805,
+ "learning_rate": 3.515321664548993e-06,
+ "loss": 0.4937,
+ "step": 4719
+ },
+ {
+ "epoch": 2.231678486997636,
+ "grad_norm": 2.7880783081054688,
+ "learning_rate": 3.5147515699325013e-06,
+ "loss": 0.4624,
+ "step": 4720
+ },
+ {
+ "epoch": 2.2321513002364064,
+ "grad_norm": 2.740841865539551,
+ "learning_rate": 3.5141814121318797e-06,
+ "loss": 0.3689,
+ "step": 4721
+ },
+ {
+ "epoch": 2.232624113475177,
+ "grad_norm": 2.9541244506835938,
+ "learning_rate": 3.5136111911826277e-06,
+ "loss": 0.4092,
+ "step": 4722
+ },
+ {
+ "epoch": 2.233096926713948,
+ "grad_norm": 2.7205398082733154,
+ "learning_rate": 3.5130409071202515e-06,
+ "loss": 0.445,
+ "step": 4723
+ },
+ {
+ "epoch": 2.233569739952719,
+ "grad_norm": 2.563406229019165,
+ "learning_rate": 3.51247055998026e-06,
+ "loss": 0.4335,
+ "step": 4724
+ },
+ {
+ "epoch": 2.2340425531914896,
+ "grad_norm": 2.4249489307403564,
+ "learning_rate": 3.5119001497981666e-06,
+ "loss": 0.4671,
+ "step": 4725
+ },
+ {
+ "epoch": 2.23451536643026,
+ "grad_norm": 2.711630344390869,
+ "learning_rate": 3.5113296766094875e-06,
+ "loss": 0.4177,
+ "step": 4726
+ },
+ {
+ "epoch": 2.2349881796690307,
+ "grad_norm": 3.0257632732391357,
+ "learning_rate": 3.5107591404497443e-06,
+ "loss": 0.4976,
+ "step": 4727
+ },
+ {
+ "epoch": 2.2354609929078015,
+ "grad_norm": 2.717303991317749,
+ "learning_rate": 3.5101885413544614e-06,
+ "loss": 0.4621,
+ "step": 4728
+ },
+ {
+ "epoch": 2.235933806146572,
+ "grad_norm": 3.2846004962921143,
+ "learning_rate": 3.509617879359167e-06,
+ "loss": 0.4284,
+ "step": 4729
+ },
+ {
+ "epoch": 2.2364066193853427,
+ "grad_norm": 2.7217819690704346,
+ "learning_rate": 3.5090471544993953e-06,
+ "loss": 0.4247,
+ "step": 4730
+ },
+ {
+ "epoch": 2.2368794326241135,
+ "grad_norm": 2.5003223419189453,
+ "learning_rate": 3.5084763668106812e-06,
+ "loss": 0.4096,
+ "step": 4731
+ },
+ {
+ "epoch": 2.2373522458628843,
+ "grad_norm": 2.7312731742858887,
+ "learning_rate": 3.5079055163285658e-06,
+ "loss": 0.4741,
+ "step": 4732
+ },
+ {
+ "epoch": 2.237825059101655,
+ "grad_norm": 2.84940767288208,
+ "learning_rate": 3.5073346030885934e-06,
+ "loss": 0.4887,
+ "step": 4733
+ },
+ {
+ "epoch": 2.2382978723404254,
+ "grad_norm": 3.1188511848449707,
+ "learning_rate": 3.506763627126313e-06,
+ "loss": 0.5335,
+ "step": 4734
+ },
+ {
+ "epoch": 2.2387706855791962,
+ "grad_norm": 2.6741397380828857,
+ "learning_rate": 3.5061925884772753e-06,
+ "loss": 0.4137,
+ "step": 4735
+ },
+ {
+ "epoch": 2.239243498817967,
+ "grad_norm": 3.1542465686798096,
+ "learning_rate": 3.505621487177037e-06,
+ "loss": 0.5303,
+ "step": 4736
+ },
+ {
+ "epoch": 2.2397163120567374,
+ "grad_norm": 5.448268890380859,
+ "learning_rate": 3.505050323261159e-06,
+ "loss": 0.4995,
+ "step": 4737
+ },
+ {
+ "epoch": 2.240189125295508,
+ "grad_norm": 2.7317898273468018,
+ "learning_rate": 3.5044790967652037e-06,
+ "loss": 0.4595,
+ "step": 4738
+ },
+ {
+ "epoch": 2.240661938534279,
+ "grad_norm": 2.8135695457458496,
+ "learning_rate": 3.50390780772474e-06,
+ "loss": 0.4593,
+ "step": 4739
+ },
+ {
+ "epoch": 2.2411347517730498,
+ "grad_norm": 3.1391844749450684,
+ "learning_rate": 3.5033364561753393e-06,
+ "loss": 0.4902,
+ "step": 4740
+ },
+ {
+ "epoch": 2.24160756501182,
+ "grad_norm": 2.6383132934570312,
+ "learning_rate": 3.5027650421525762e-06,
+ "loss": 0.3832,
+ "step": 4741
+ },
+ {
+ "epoch": 2.242080378250591,
+ "grad_norm": 2.742546558380127,
+ "learning_rate": 3.5021935656920314e-06,
+ "loss": 0.4012,
+ "step": 4742
+ },
+ {
+ "epoch": 2.2425531914893617,
+ "grad_norm": 3.1243674755096436,
+ "learning_rate": 3.5016220268292873e-06,
+ "loss": 0.4271,
+ "step": 4743
+ },
+ {
+ "epoch": 2.2430260047281325,
+ "grad_norm": 2.794717788696289,
+ "learning_rate": 3.501050425599932e-06,
+ "loss": 0.4604,
+ "step": 4744
+ },
+ {
+ "epoch": 2.243498817966903,
+ "grad_norm": 2.8481621742248535,
+ "learning_rate": 3.5004787620395565e-06,
+ "loss": 0.4814,
+ "step": 4745
+ },
+ {
+ "epoch": 2.2439716312056737,
+ "grad_norm": 2.8842051029205322,
+ "learning_rate": 3.499907036183755e-06,
+ "loss": 0.4987,
+ "step": 4746
+ },
+ {
+ "epoch": 2.2444444444444445,
+ "grad_norm": 3.074805974960327,
+ "learning_rate": 3.4993352480681265e-06,
+ "loss": 0.4966,
+ "step": 4747
+ },
+ {
+ "epoch": 2.2449172576832153,
+ "grad_norm": 2.7204246520996094,
+ "learning_rate": 3.4987633977282742e-06,
+ "loss": 0.4,
+ "step": 4748
+ },
+ {
+ "epoch": 2.2453900709219856,
+ "grad_norm": 2.685884952545166,
+ "learning_rate": 3.4981914851998055e-06,
+ "loss": 0.4285,
+ "step": 4749
+ },
+ {
+ "epoch": 2.2458628841607564,
+ "grad_norm": 2.1666336059570312,
+ "learning_rate": 3.4976195105183287e-06,
+ "loss": 0.3756,
+ "step": 4750
+ },
+ {
+ "epoch": 2.246335697399527,
+ "grad_norm": 2.863006353378296,
+ "learning_rate": 3.49704747371946e-06,
+ "loss": 0.4535,
+ "step": 4751
+ },
+ {
+ "epoch": 2.246808510638298,
+ "grad_norm": 2.5558736324310303,
+ "learning_rate": 3.496475374838817e-06,
+ "loss": 0.4129,
+ "step": 4752
+ },
+ {
+ "epoch": 2.2472813238770684,
+ "grad_norm": 2.9780309200286865,
+ "learning_rate": 3.495903213912022e-06,
+ "loss": 0.4871,
+ "step": 4753
+ },
+ {
+ "epoch": 2.247754137115839,
+ "grad_norm": 2.951779365539551,
+ "learning_rate": 3.4953309909747e-06,
+ "loss": 0.5162,
+ "step": 4754
+ },
+ {
+ "epoch": 2.24822695035461,
+ "grad_norm": 2.7654693126678467,
+ "learning_rate": 3.4947587060624834e-06,
+ "loss": 0.4662,
+ "step": 4755
+ },
+ {
+ "epoch": 2.2486997635933808,
+ "grad_norm": 2.708247184753418,
+ "learning_rate": 3.494186359211002e-06,
+ "loss": 0.4279,
+ "step": 4756
+ },
+ {
+ "epoch": 2.249172576832151,
+ "grad_norm": 3.09916615486145,
+ "learning_rate": 3.4936139504558963e-06,
+ "loss": 0.4085,
+ "step": 4757
+ },
+ {
+ "epoch": 2.249645390070922,
+ "grad_norm": 2.913806200027466,
+ "learning_rate": 3.493041479832807e-06,
+ "loss": 0.4653,
+ "step": 4758
+ },
+ {
+ "epoch": 2.2501182033096927,
+ "grad_norm": 3.2903928756713867,
+ "learning_rate": 3.4924689473773787e-06,
+ "loss": 0.5167,
+ "step": 4759
+ },
+ {
+ "epoch": 2.2505910165484635,
+ "grad_norm": 3.1302902698516846,
+ "learning_rate": 3.4918963531252607e-06,
+ "loss": 0.5398,
+ "step": 4760
+ },
+ {
+ "epoch": 2.251063829787234,
+ "grad_norm": 2.8858273029327393,
+ "learning_rate": 3.4913236971121063e-06,
+ "loss": 0.4395,
+ "step": 4761
+ },
+ {
+ "epoch": 2.2515366430260046,
+ "grad_norm": 3.194521903991699,
+ "learning_rate": 3.4907509793735727e-06,
+ "loss": 0.5258,
+ "step": 4762
+ },
+ {
+ "epoch": 2.2520094562647754,
+ "grad_norm": 2.8640544414520264,
+ "learning_rate": 3.49017819994532e-06,
+ "loss": 0.4073,
+ "step": 4763
+ },
+ {
+ "epoch": 2.2524822695035462,
+ "grad_norm": 3.139995813369751,
+ "learning_rate": 3.489605358863011e-06,
+ "loss": 0.4653,
+ "step": 4764
+ },
+ {
+ "epoch": 2.2529550827423166,
+ "grad_norm": 2.6228537559509277,
+ "learning_rate": 3.489032456162317e-06,
+ "loss": 0.4546,
+ "step": 4765
+ },
+ {
+ "epoch": 2.2534278959810874,
+ "grad_norm": 2.8197672367095947,
+ "learning_rate": 3.4884594918789083e-06,
+ "loss": 0.479,
+ "step": 4766
+ },
+ {
+ "epoch": 2.253900709219858,
+ "grad_norm": 2.7839298248291016,
+ "learning_rate": 3.4878864660484612e-06,
+ "loss": 0.5081,
+ "step": 4767
+ },
+ {
+ "epoch": 2.254373522458629,
+ "grad_norm": 2.8630709648132324,
+ "learning_rate": 3.487313378706656e-06,
+ "loss": 0.4345,
+ "step": 4768
+ },
+ {
+ "epoch": 2.2548463356973993,
+ "grad_norm": 2.5661563873291016,
+ "learning_rate": 3.4867402298891755e-06,
+ "loss": 0.4266,
+ "step": 4769
+ },
+ {
+ "epoch": 2.25531914893617,
+ "grad_norm": 2.6274025440216064,
+ "learning_rate": 3.4861670196317084e-06,
+ "loss": 0.4645,
+ "step": 4770
+ },
+ {
+ "epoch": 2.255791962174941,
+ "grad_norm": 2.578702449798584,
+ "learning_rate": 3.485593747969944e-06,
+ "loss": 0.4242,
+ "step": 4771
+ },
+ {
+ "epoch": 2.2562647754137117,
+ "grad_norm": 2.322476625442505,
+ "learning_rate": 3.48502041493958e-06,
+ "loss": 0.3975,
+ "step": 4772
+ },
+ {
+ "epoch": 2.256737588652482,
+ "grad_norm": 2.8412630558013916,
+ "learning_rate": 3.484447020576313e-06,
+ "loss": 0.4276,
+ "step": 4773
+ },
+ {
+ "epoch": 2.257210401891253,
+ "grad_norm": 2.6090497970581055,
+ "learning_rate": 3.483873564915847e-06,
+ "loss": 0.429,
+ "step": 4774
+ },
+ {
+ "epoch": 2.2576832151300237,
+ "grad_norm": 2.692458152770996,
+ "learning_rate": 3.4833000479938877e-06,
+ "loss": 0.4211,
+ "step": 4775
+ },
+ {
+ "epoch": 2.2581560283687945,
+ "grad_norm": 2.5546815395355225,
+ "learning_rate": 3.482726469846146e-06,
+ "loss": 0.4751,
+ "step": 4776
+ },
+ {
+ "epoch": 2.258628841607565,
+ "grad_norm": 2.8409626483917236,
+ "learning_rate": 3.4821528305083376e-06,
+ "loss": 0.4821,
+ "step": 4777
+ },
+ {
+ "epoch": 2.2591016548463356,
+ "grad_norm": 2.722966432571411,
+ "learning_rate": 3.4815791300161785e-06,
+ "loss": 0.5029,
+ "step": 4778
+ },
+ {
+ "epoch": 2.2595744680851064,
+ "grad_norm": 2.691603899002075,
+ "learning_rate": 3.48100536840539e-06,
+ "loss": 0.4242,
+ "step": 4779
+ },
+ {
+ "epoch": 2.260047281323877,
+ "grad_norm": 2.64035964012146,
+ "learning_rate": 3.4804315457116992e-06,
+ "loss": 0.4033,
+ "step": 4780
+ },
+ {
+ "epoch": 2.2605200945626476,
+ "grad_norm": 2.758819580078125,
+ "learning_rate": 3.4798576619708357e-06,
+ "loss": 0.4321,
+ "step": 4781
+ },
+ {
+ "epoch": 2.2609929078014184,
+ "grad_norm": 2.8204405307769775,
+ "learning_rate": 3.4792837172185324e-06,
+ "loss": 0.4309,
+ "step": 4782
+ },
+ {
+ "epoch": 2.261465721040189,
+ "grad_norm": 2.529771327972412,
+ "learning_rate": 3.478709711490525e-06,
+ "loss": 0.4398,
+ "step": 4783
+ },
+ {
+ "epoch": 2.26193853427896,
+ "grad_norm": 2.8156251907348633,
+ "learning_rate": 3.4781356448225557e-06,
+ "loss": 0.447,
+ "step": 4784
+ },
+ {
+ "epoch": 2.2624113475177303,
+ "grad_norm": 2.689528703689575,
+ "learning_rate": 3.477561517250369e-06,
+ "loss": 0.3907,
+ "step": 4785
+ },
+ {
+ "epoch": 2.262884160756501,
+ "grad_norm": 2.9148027896881104,
+ "learning_rate": 3.476987328809713e-06,
+ "loss": 0.4287,
+ "step": 4786
+ },
+ {
+ "epoch": 2.263356973995272,
+ "grad_norm": 2.933021306991577,
+ "learning_rate": 3.4764130795363404e-06,
+ "loss": 0.4847,
+ "step": 4787
+ },
+ {
+ "epoch": 2.2638297872340427,
+ "grad_norm": 2.8559257984161377,
+ "learning_rate": 3.4758387694660064e-06,
+ "loss": 0.4554,
+ "step": 4788
+ },
+ {
+ "epoch": 2.264302600472813,
+ "grad_norm": 3.0355522632598877,
+ "learning_rate": 3.4752643986344707e-06,
+ "loss": 0.4286,
+ "step": 4789
+ },
+ {
+ "epoch": 2.264775413711584,
+ "grad_norm": 2.9768362045288086,
+ "learning_rate": 3.474689967077498e-06,
+ "loss": 0.4917,
+ "step": 4790
+ },
+ {
+ "epoch": 2.2652482269503547,
+ "grad_norm": 2.827971935272217,
+ "learning_rate": 3.474115474830855e-06,
+ "loss": 0.4542,
+ "step": 4791
+ },
+ {
+ "epoch": 2.2657210401891255,
+ "grad_norm": 2.559659719467163,
+ "learning_rate": 3.4735409219303123e-06,
+ "loss": 0.4168,
+ "step": 4792
+ },
+ {
+ "epoch": 2.266193853427896,
+ "grad_norm": 2.3172824382781982,
+ "learning_rate": 3.472966308411645e-06,
+ "loss": 0.3535,
+ "step": 4793
+ },
+ {
+ "epoch": 2.2666666666666666,
+ "grad_norm": 2.6779656410217285,
+ "learning_rate": 3.4723916343106327e-06,
+ "loss": 0.4599,
+ "step": 4794
+ },
+ {
+ "epoch": 2.2671394799054374,
+ "grad_norm": 2.55780291557312,
+ "learning_rate": 3.4718168996630573e-06,
+ "loss": 0.4185,
+ "step": 4795
+ },
+ {
+ "epoch": 2.267612293144208,
+ "grad_norm": 2.4929800033569336,
+ "learning_rate": 3.471242104504704e-06,
+ "loss": 0.4008,
+ "step": 4796
+ },
+ {
+ "epoch": 2.2680851063829786,
+ "grad_norm": 2.849475145339966,
+ "learning_rate": 3.4706672488713642e-06,
+ "loss": 0.396,
+ "step": 4797
+ },
+ {
+ "epoch": 2.2685579196217494,
+ "grad_norm": 2.4830739498138428,
+ "learning_rate": 3.4700923327988306e-06,
+ "loss": 0.4087,
+ "step": 4798
+ },
+ {
+ "epoch": 2.26903073286052,
+ "grad_norm": 3.2748119831085205,
+ "learning_rate": 3.469517356322901e-06,
+ "loss": 0.4496,
+ "step": 4799
+ },
+ {
+ "epoch": 2.269503546099291,
+ "grad_norm": 3.0440170764923096,
+ "learning_rate": 3.468942319479378e-06,
+ "loss": 0.4903,
+ "step": 4800
+ },
+ {
+ "epoch": 2.2699763593380613,
+ "grad_norm": 2.8200504779815674,
+ "learning_rate": 3.4683672223040645e-06,
+ "loss": 0.4588,
+ "step": 4801
+ },
+ {
+ "epoch": 2.270449172576832,
+ "grad_norm": 2.675206184387207,
+ "learning_rate": 3.4677920648327707e-06,
+ "loss": 0.4257,
+ "step": 4802
+ },
+ {
+ "epoch": 2.270921985815603,
+ "grad_norm": 2.862675905227661,
+ "learning_rate": 3.4672168471013084e-06,
+ "loss": 0.466,
+ "step": 4803
+ },
+ {
+ "epoch": 2.2713947990543737,
+ "grad_norm": 2.65663743019104,
+ "learning_rate": 3.4666415691454947e-06,
+ "loss": 0.4784,
+ "step": 4804
+ },
+ {
+ "epoch": 2.271867612293144,
+ "grad_norm": 2.5610506534576416,
+ "learning_rate": 3.4660662310011483e-06,
+ "loss": 0.4429,
+ "step": 4805
+ },
+ {
+ "epoch": 2.272340425531915,
+ "grad_norm": 2.6459643840789795,
+ "learning_rate": 3.465490832704094e-06,
+ "loss": 0.4345,
+ "step": 4806
+ },
+ {
+ "epoch": 2.2728132387706856,
+ "grad_norm": 2.426013469696045,
+ "learning_rate": 3.4649153742901585e-06,
+ "loss": 0.4533,
+ "step": 4807
+ },
+ {
+ "epoch": 2.2732860520094564,
+ "grad_norm": 2.6714842319488525,
+ "learning_rate": 3.4643398557951745e-06,
+ "loss": 0.4409,
+ "step": 4808
+ },
+ {
+ "epoch": 2.273758865248227,
+ "grad_norm": 2.703629493713379,
+ "learning_rate": 3.463764277254976e-06,
+ "loss": 0.3656,
+ "step": 4809
+ },
+ {
+ "epoch": 2.2742316784869976,
+ "grad_norm": 2.811753988265991,
+ "learning_rate": 3.4631886387054025e-06,
+ "loss": 0.4957,
+ "step": 4810
+ },
+ {
+ "epoch": 2.2747044917257684,
+ "grad_norm": 2.9469289779663086,
+ "learning_rate": 3.462612940182295e-06,
+ "loss": 0.4582,
+ "step": 4811
+ },
+ {
+ "epoch": 2.275177304964539,
+ "grad_norm": 2.6287801265716553,
+ "learning_rate": 3.462037181721501e-06,
+ "loss": 0.4072,
+ "step": 4812
+ },
+ {
+ "epoch": 2.2756501182033095,
+ "grad_norm": 2.7104952335357666,
+ "learning_rate": 3.46146136335887e-06,
+ "loss": 0.4998,
+ "step": 4813
+ },
+ {
+ "epoch": 2.2761229314420803,
+ "grad_norm": 3.170363187789917,
+ "learning_rate": 3.460885485130256e-06,
+ "loss": 0.4722,
+ "step": 4814
+ },
+ {
+ "epoch": 2.276595744680851,
+ "grad_norm": 2.7315151691436768,
+ "learning_rate": 3.460309547071516e-06,
+ "loss": 0.4482,
+ "step": 4815
+ },
+ {
+ "epoch": 2.277068557919622,
+ "grad_norm": 2.685988187789917,
+ "learning_rate": 3.4597335492185113e-06,
+ "loss": 0.4419,
+ "step": 4816
+ },
+ {
+ "epoch": 2.2775413711583923,
+ "grad_norm": 2.532790184020996,
+ "learning_rate": 3.459157491607107e-06,
+ "loss": 0.3961,
+ "step": 4817
+ },
+ {
+ "epoch": 2.278014184397163,
+ "grad_norm": 2.920729875564575,
+ "learning_rate": 3.458581374273171e-06,
+ "loss": 0.4767,
+ "step": 4818
+ },
+ {
+ "epoch": 2.278486997635934,
+ "grad_norm": 3.2481250762939453,
+ "learning_rate": 3.458005197252577e-06,
+ "loss": 0.4985,
+ "step": 4819
+ },
+ {
+ "epoch": 2.2789598108747047,
+ "grad_norm": 2.373809814453125,
+ "learning_rate": 3.4574289605811994e-06,
+ "loss": 0.4259,
+ "step": 4820
+ },
+ {
+ "epoch": 2.279432624113475,
+ "grad_norm": 2.7851033210754395,
+ "learning_rate": 3.4568526642949184e-06,
+ "loss": 0.4829,
+ "step": 4821
+ },
+ {
+ "epoch": 2.279905437352246,
+ "grad_norm": 2.9777133464813232,
+ "learning_rate": 3.456276308429618e-06,
+ "loss": 0.4896,
+ "step": 4822
+ },
+ {
+ "epoch": 2.2803782505910166,
+ "grad_norm": 2.7922022342681885,
+ "learning_rate": 3.4556998930211853e-06,
+ "loss": 0.4908,
+ "step": 4823
+ },
+ {
+ "epoch": 2.2808510638297874,
+ "grad_norm": 2.699180841445923,
+ "learning_rate": 3.4551234181055104e-06,
+ "loss": 0.4518,
+ "step": 4824
+ },
+ {
+ "epoch": 2.2813238770685578,
+ "grad_norm": 3.1200520992279053,
+ "learning_rate": 3.4545468837184885e-06,
+ "loss": 0.4877,
+ "step": 4825
+ },
+ {
+ "epoch": 2.2817966903073286,
+ "grad_norm": 2.56782603263855,
+ "learning_rate": 3.453970289896018e-06,
+ "loss": 0.4281,
+ "step": 4826
+ },
+ {
+ "epoch": 2.2822695035460994,
+ "grad_norm": 3.241356372833252,
+ "learning_rate": 3.4533936366740007e-06,
+ "loss": 0.4338,
+ "step": 4827
+ },
+ {
+ "epoch": 2.28274231678487,
+ "grad_norm": 3.560295343399048,
+ "learning_rate": 3.452816924088342e-06,
+ "loss": 0.4121,
+ "step": 4828
+ },
+ {
+ "epoch": 2.2832151300236405,
+ "grad_norm": 2.8512449264526367,
+ "learning_rate": 3.452240152174951e-06,
+ "loss": 0.4357,
+ "step": 4829
+ },
+ {
+ "epoch": 2.2836879432624113,
+ "grad_norm": 3.0332651138305664,
+ "learning_rate": 3.4516633209697408e-06,
+ "loss": 0.4985,
+ "step": 4830
+ },
+ {
+ "epoch": 2.284160756501182,
+ "grad_norm": 2.520930528640747,
+ "learning_rate": 3.451086430508629e-06,
+ "loss": 0.4021,
+ "step": 4831
+ },
+ {
+ "epoch": 2.284633569739953,
+ "grad_norm": 2.508227825164795,
+ "learning_rate": 3.4505094808275363e-06,
+ "loss": 0.3935,
+ "step": 4832
+ },
+ {
+ "epoch": 2.2851063829787233,
+ "grad_norm": 2.56752610206604,
+ "learning_rate": 3.449932471962385e-06,
+ "loss": 0.4689,
+ "step": 4833
+ },
+ {
+ "epoch": 2.285579196217494,
+ "grad_norm": 2.7757534980773926,
+ "learning_rate": 3.449355403949105e-06,
+ "loss": 0.4565,
+ "step": 4834
+ },
+ {
+ "epoch": 2.286052009456265,
+ "grad_norm": 3.364821195602417,
+ "learning_rate": 3.448778276823626e-06,
+ "loss": 0.4729,
+ "step": 4835
+ },
+ {
+ "epoch": 2.2865248226950357,
+ "grad_norm": 3.0045557022094727,
+ "learning_rate": 3.448201090621884e-06,
+ "loss": 0.4834,
+ "step": 4836
+ },
+ {
+ "epoch": 2.286997635933806,
+ "grad_norm": 2.9451794624328613,
+ "learning_rate": 3.4476238453798183e-06,
+ "loss": 0.489,
+ "step": 4837
+ },
+ {
+ "epoch": 2.287470449172577,
+ "grad_norm": 2.8307435512542725,
+ "learning_rate": 3.4470465411333708e-06,
+ "loss": 0.5079,
+ "step": 4838
+ },
+ {
+ "epoch": 2.2879432624113476,
+ "grad_norm": 2.7118136882781982,
+ "learning_rate": 3.4464691779184876e-06,
+ "loss": 0.4794,
+ "step": 4839
+ },
+ {
+ "epoch": 2.2884160756501184,
+ "grad_norm": 2.6724441051483154,
+ "learning_rate": 3.445891755771119e-06,
+ "loss": 0.4619,
+ "step": 4840
+ },
+ {
+ "epoch": 2.2888888888888888,
+ "grad_norm": 2.8161258697509766,
+ "learning_rate": 3.445314274727218e-06,
+ "loss": 0.4287,
+ "step": 4841
+ },
+ {
+ "epoch": 2.2893617021276595,
+ "grad_norm": 2.5681750774383545,
+ "learning_rate": 3.4447367348227433e-06,
+ "loss": 0.4167,
+ "step": 4842
+ },
+ {
+ "epoch": 2.2898345153664303,
+ "grad_norm": 2.8136284351348877,
+ "learning_rate": 3.444159136093654e-06,
+ "loss": 0.4195,
+ "step": 4843
+ },
+ {
+ "epoch": 2.290307328605201,
+ "grad_norm": 3.153651714324951,
+ "learning_rate": 3.443581478575915e-06,
+ "loss": 0.4821,
+ "step": 4844
+ },
+ {
+ "epoch": 2.2907801418439715,
+ "grad_norm": 2.980883836746216,
+ "learning_rate": 3.4430037623054953e-06,
+ "loss": 0.4627,
+ "step": 4845
+ },
+ {
+ "epoch": 2.2912529550827423,
+ "grad_norm": 2.786182403564453,
+ "learning_rate": 3.4424259873183664e-06,
+ "loss": 0.4342,
+ "step": 4846
+ },
+ {
+ "epoch": 2.291725768321513,
+ "grad_norm": 2.8938279151916504,
+ "learning_rate": 3.4418481536505026e-06,
+ "loss": 0.3997,
+ "step": 4847
+ },
+ {
+ "epoch": 2.2921985815602834,
+ "grad_norm": 2.5534510612487793,
+ "learning_rate": 3.4412702613378844e-06,
+ "loss": 0.3982,
+ "step": 4848
+ },
+ {
+ "epoch": 2.2926713947990542,
+ "grad_norm": 2.7907063961029053,
+ "learning_rate": 3.4406923104164956e-06,
+ "loss": 0.4484,
+ "step": 4849
+ },
+ {
+ "epoch": 2.293144208037825,
+ "grad_norm": 3.162702798843384,
+ "learning_rate": 3.4401143009223203e-06,
+ "loss": 0.4528,
+ "step": 4850
+ },
+ {
+ "epoch": 2.293617021276596,
+ "grad_norm": 2.4647393226623535,
+ "learning_rate": 3.4395362328913505e-06,
+ "loss": 0.3759,
+ "step": 4851
+ },
+ {
+ "epoch": 2.2940898345153666,
+ "grad_norm": 2.8219876289367676,
+ "learning_rate": 3.438958106359579e-06,
+ "loss": 0.4903,
+ "step": 4852
+ },
+ {
+ "epoch": 2.294562647754137,
+ "grad_norm": 2.827073097229004,
+ "learning_rate": 3.438379921363003e-06,
+ "loss": 0.4315,
+ "step": 4853
+ },
+ {
+ "epoch": 2.295035460992908,
+ "grad_norm": 2.472470283508301,
+ "learning_rate": 3.4378016779376244e-06,
+ "loss": 0.4478,
+ "step": 4854
+ },
+ {
+ "epoch": 2.2955082742316786,
+ "grad_norm": 3.3994734287261963,
+ "learning_rate": 3.4372233761194473e-06,
+ "loss": 0.5086,
+ "step": 4855
+ },
+ {
+ "epoch": 2.295981087470449,
+ "grad_norm": 3.030465602874756,
+ "learning_rate": 3.4366450159444796e-06,
+ "loss": 0.4159,
+ "step": 4856
+ },
+ {
+ "epoch": 2.2964539007092197,
+ "grad_norm": 2.5460705757141113,
+ "learning_rate": 3.4360665974487346e-06,
+ "loss": 0.4097,
+ "step": 4857
+ },
+ {
+ "epoch": 2.2969267139479905,
+ "grad_norm": 2.884469509124756,
+ "learning_rate": 3.4354881206682273e-06,
+ "loss": 0.4478,
+ "step": 4858
+ },
+ {
+ "epoch": 2.2973995271867613,
+ "grad_norm": 2.5139710903167725,
+ "learning_rate": 3.4349095856389765e-06,
+ "loss": 0.4286,
+ "step": 4859
+ },
+ {
+ "epoch": 2.297872340425532,
+ "grad_norm": 3.1628260612487793,
+ "learning_rate": 3.4343309923970053e-06,
+ "loss": 0.4617,
+ "step": 4860
+ },
+ {
+ "epoch": 2.2983451536643025,
+ "grad_norm": 2.6141695976257324,
+ "learning_rate": 3.4337523409783395e-06,
+ "loss": 0.3841,
+ "step": 4861
+ },
+ {
+ "epoch": 2.2988179669030733,
+ "grad_norm": 2.766834259033203,
+ "learning_rate": 3.43317363141901e-06,
+ "loss": 0.4484,
+ "step": 4862
+ },
+ {
+ "epoch": 2.299290780141844,
+ "grad_norm": 2.785491943359375,
+ "learning_rate": 3.4325948637550503e-06,
+ "loss": 0.4363,
+ "step": 4863
+ },
+ {
+ "epoch": 2.2997635933806144,
+ "grad_norm": 2.624929189682007,
+ "learning_rate": 3.4320160380224988e-06,
+ "loss": 0.4518,
+ "step": 4864
+ },
+ {
+ "epoch": 2.300236406619385,
+ "grad_norm": 2.895413398742676,
+ "learning_rate": 3.4314371542573944e-06,
+ "loss": 0.4745,
+ "step": 4865
+ },
+ {
+ "epoch": 2.300709219858156,
+ "grad_norm": 2.603816270828247,
+ "learning_rate": 3.430858212495783e-06,
+ "loss": 0.4444,
+ "step": 4866
+ },
+ {
+ "epoch": 2.301182033096927,
+ "grad_norm": 3.387360095977783,
+ "learning_rate": 3.4302792127737116e-06,
+ "loss": 0.4169,
+ "step": 4867
+ },
+ {
+ "epoch": 2.3016548463356976,
+ "grad_norm": 2.894054651260376,
+ "learning_rate": 3.4297001551272334e-06,
+ "loss": 0.4493,
+ "step": 4868
+ },
+ {
+ "epoch": 2.302127659574468,
+ "grad_norm": 3.0432028770446777,
+ "learning_rate": 3.4291210395924035e-06,
+ "loss": 0.4854,
+ "step": 4869
+ },
+ {
+ "epoch": 2.3026004728132388,
+ "grad_norm": 2.5144734382629395,
+ "learning_rate": 3.42854186620528e-06,
+ "loss": 0.4556,
+ "step": 4870
+ },
+ {
+ "epoch": 2.3030732860520096,
+ "grad_norm": 2.964812755584717,
+ "learning_rate": 3.427962635001926e-06,
+ "loss": 0.495,
+ "step": 4871
+ },
+ {
+ "epoch": 2.30354609929078,
+ "grad_norm": 2.9991118907928467,
+ "learning_rate": 3.4273833460184077e-06,
+ "loss": 0.4787,
+ "step": 4872
+ },
+ {
+ "epoch": 2.3040189125295507,
+ "grad_norm": 2.9424328804016113,
+ "learning_rate": 3.4268039992907955e-06,
+ "loss": 0.5006,
+ "step": 4873
+ },
+ {
+ "epoch": 2.3044917257683215,
+ "grad_norm": 2.792880058288574,
+ "learning_rate": 3.426224594855162e-06,
+ "loss": 0.4399,
+ "step": 4874
+ },
+ {
+ "epoch": 2.3049645390070923,
+ "grad_norm": 2.5308053493499756,
+ "learning_rate": 3.4256451327475838e-06,
+ "loss": 0.4843,
+ "step": 4875
+ },
+ {
+ "epoch": 2.305437352245863,
+ "grad_norm": 2.7937564849853516,
+ "learning_rate": 3.425065613004142e-06,
+ "loss": 0.4428,
+ "step": 4876
+ },
+ {
+ "epoch": 2.3059101654846335,
+ "grad_norm": 2.4231557846069336,
+ "learning_rate": 3.424486035660921e-06,
+ "loss": 0.4054,
+ "step": 4877
+ },
+ {
+ "epoch": 2.3063829787234043,
+ "grad_norm": 3.0622596740722656,
+ "learning_rate": 3.423906400754009e-06,
+ "loss": 0.4623,
+ "step": 4878
+ },
+ {
+ "epoch": 2.306855791962175,
+ "grad_norm": 2.6532933712005615,
+ "learning_rate": 3.4233267083194955e-06,
+ "loss": 0.4387,
+ "step": 4879
+ },
+ {
+ "epoch": 2.3073286052009454,
+ "grad_norm": 2.793325185775757,
+ "learning_rate": 3.422746958393477e-06,
+ "loss": 0.4047,
+ "step": 4880
+ },
+ {
+ "epoch": 2.307801418439716,
+ "grad_norm": 2.9178314208984375,
+ "learning_rate": 3.422167151012052e-06,
+ "loss": 0.4397,
+ "step": 4881
+ },
+ {
+ "epoch": 2.308274231678487,
+ "grad_norm": 3.463913917541504,
+ "learning_rate": 3.4215872862113214e-06,
+ "loss": 0.4347,
+ "step": 4882
+ },
+ {
+ "epoch": 2.308747044917258,
+ "grad_norm": 3.228403091430664,
+ "learning_rate": 3.421007364027392e-06,
+ "loss": 0.4405,
+ "step": 4883
+ },
+ {
+ "epoch": 2.3092198581560286,
+ "grad_norm": 2.896933078765869,
+ "learning_rate": 3.420427384496372e-06,
+ "loss": 0.4429,
+ "step": 4884
+ },
+ {
+ "epoch": 2.309692671394799,
+ "grad_norm": 2.5559937953948975,
+ "learning_rate": 3.4198473476543755e-06,
+ "loss": 0.4281,
+ "step": 4885
+ },
+ {
+ "epoch": 2.3101654846335697,
+ "grad_norm": 3.457918167114258,
+ "learning_rate": 3.419267253537517e-06,
+ "loss": 0.4495,
+ "step": 4886
+ },
+ {
+ "epoch": 2.3106382978723405,
+ "grad_norm": 2.6554839611053467,
+ "learning_rate": 3.418687102181918e-06,
+ "loss": 0.4682,
+ "step": 4887
+ },
+ {
+ "epoch": 2.311111111111111,
+ "grad_norm": 2.8171639442443848,
+ "learning_rate": 3.4181068936237024e-06,
+ "loss": 0.4184,
+ "step": 4888
+ },
+ {
+ "epoch": 2.3115839243498817,
+ "grad_norm": 2.9272499084472656,
+ "learning_rate": 3.4175266278989955e-06,
+ "loss": 0.5445,
+ "step": 4889
+ },
+ {
+ "epoch": 2.3120567375886525,
+ "grad_norm": 2.5928499698638916,
+ "learning_rate": 3.4169463050439284e-06,
+ "loss": 0.3808,
+ "step": 4890
+ },
+ {
+ "epoch": 2.3125295508274233,
+ "grad_norm": 2.6624577045440674,
+ "learning_rate": 3.4163659250946356e-06,
+ "loss": 0.4678,
+ "step": 4891
+ },
+ {
+ "epoch": 2.313002364066194,
+ "grad_norm": 2.666555643081665,
+ "learning_rate": 3.4157854880872553e-06,
+ "loss": 0.457,
+ "step": 4892
+ },
+ {
+ "epoch": 2.3134751773049644,
+ "grad_norm": 3.2987406253814697,
+ "learning_rate": 3.4152049940579278e-06,
+ "loss": 0.551,
+ "step": 4893
+ },
+ {
+ "epoch": 2.3139479905437352,
+ "grad_norm": 2.728119134902954,
+ "learning_rate": 3.414624443042799e-06,
+ "loss": 0.3935,
+ "step": 4894
+ },
+ {
+ "epoch": 2.314420803782506,
+ "grad_norm": 3.133005380630493,
+ "learning_rate": 3.4140438350780157e-06,
+ "loss": 0.4981,
+ "step": 4895
+ },
+ {
+ "epoch": 2.3148936170212764,
+ "grad_norm": 2.591252565383911,
+ "learning_rate": 3.4134631701997312e-06,
+ "loss": 0.4251,
+ "step": 4896
+ },
+ {
+ "epoch": 2.315366430260047,
+ "grad_norm": 3.007136344909668,
+ "learning_rate": 3.412882448444101e-06,
+ "loss": 0.4492,
+ "step": 4897
+ },
+ {
+ "epoch": 2.315839243498818,
+ "grad_norm": 2.6391026973724365,
+ "learning_rate": 3.412301669847284e-06,
+ "loss": 0.5151,
+ "step": 4898
+ },
+ {
+ "epoch": 2.3163120567375888,
+ "grad_norm": 7.453699111938477,
+ "learning_rate": 3.411720834445441e-06,
+ "loss": 0.4983,
+ "step": 4899
+ },
+ {
+ "epoch": 2.3167848699763596,
+ "grad_norm": 2.667712688446045,
+ "learning_rate": 3.41113994227474e-06,
+ "loss": 0.4581,
+ "step": 4900
+ },
+ {
+ "epoch": 2.31725768321513,
+ "grad_norm": 2.7727627754211426,
+ "learning_rate": 3.41055899337135e-06,
+ "loss": 0.4731,
+ "step": 4901
+ },
+ {
+ "epoch": 2.3177304964539007,
+ "grad_norm": 3.0096890926361084,
+ "learning_rate": 3.409977987771444e-06,
+ "loss": 0.4996,
+ "step": 4902
+ },
+ {
+ "epoch": 2.3182033096926715,
+ "grad_norm": 2.725830078125,
+ "learning_rate": 3.4093969255111993e-06,
+ "loss": 0.4544,
+ "step": 4903
+ },
+ {
+ "epoch": 2.318676122931442,
+ "grad_norm": 2.7596993446350098,
+ "learning_rate": 3.4088158066267945e-06,
+ "loss": 0.4846,
+ "step": 4904
+ },
+ {
+ "epoch": 2.3191489361702127,
+ "grad_norm": 2.702620029449463,
+ "learning_rate": 3.4082346311544156e-06,
+ "loss": 0.4849,
+ "step": 4905
+ },
+ {
+ "epoch": 2.3196217494089835,
+ "grad_norm": 2.725374460220337,
+ "learning_rate": 3.407653399130249e-06,
+ "loss": 0.4116,
+ "step": 4906
+ },
+ {
+ "epoch": 2.3200945626477543,
+ "grad_norm": 2.6770219802856445,
+ "learning_rate": 3.4070721105904847e-06,
+ "loss": 0.4606,
+ "step": 4907
+ },
+ {
+ "epoch": 2.320567375886525,
+ "grad_norm": 2.9249117374420166,
+ "learning_rate": 3.406490765571317e-06,
+ "loss": 0.461,
+ "step": 4908
+ },
+ {
+ "epoch": 2.3210401891252954,
+ "grad_norm": 2.7568278312683105,
+ "learning_rate": 3.405909364108944e-06,
+ "loss": 0.4065,
+ "step": 4909
+ },
+ {
+ "epoch": 2.321513002364066,
+ "grad_norm": 2.7231340408325195,
+ "learning_rate": 3.4053279062395676e-06,
+ "loss": 0.4173,
+ "step": 4910
+ },
+ {
+ "epoch": 2.321985815602837,
+ "grad_norm": 3.1401100158691406,
+ "learning_rate": 3.404746391999393e-06,
+ "loss": 0.4287,
+ "step": 4911
+ },
+ {
+ "epoch": 2.3224586288416074,
+ "grad_norm": 2.714853525161743,
+ "learning_rate": 3.404164821424627e-06,
+ "loss": 0.4552,
+ "step": 4912
+ },
+ {
+ "epoch": 2.322931442080378,
+ "grad_norm": 3.1509978771209717,
+ "learning_rate": 3.4035831945514825e-06,
+ "loss": 0.5296,
+ "step": 4913
+ },
+ {
+ "epoch": 2.323404255319149,
+ "grad_norm": 2.567194938659668,
+ "learning_rate": 3.403001511416174e-06,
+ "loss": 0.4306,
+ "step": 4914
+ },
+ {
+ "epoch": 2.3238770685579198,
+ "grad_norm": 2.7473888397216797,
+ "learning_rate": 3.402419772054922e-06,
+ "loss": 0.4009,
+ "step": 4915
+ },
+ {
+ "epoch": 2.3243498817966906,
+ "grad_norm": 2.8617780208587646,
+ "learning_rate": 3.401837976503947e-06,
+ "loss": 0.4545,
+ "step": 4916
+ },
+ {
+ "epoch": 2.324822695035461,
+ "grad_norm": 2.3650572299957275,
+ "learning_rate": 3.401256124799475e-06,
+ "loss": 0.4046,
+ "step": 4917
+ },
+ {
+ "epoch": 2.3252955082742317,
+ "grad_norm": 2.418407678604126,
+ "learning_rate": 3.4006742169777364e-06,
+ "loss": 0.4222,
+ "step": 4918
+ },
+ {
+ "epoch": 2.3257683215130025,
+ "grad_norm": 2.7232494354248047,
+ "learning_rate": 3.400092253074964e-06,
+ "loss": 0.4373,
+ "step": 4919
+ },
+ {
+ "epoch": 2.326241134751773,
+ "grad_norm": 2.702965497970581,
+ "learning_rate": 3.399510233127394e-06,
+ "loss": 0.437,
+ "step": 4920
+ },
+ {
+ "epoch": 2.3267139479905437,
+ "grad_norm": 2.8381760120391846,
+ "learning_rate": 3.3989281571712664e-06,
+ "loss": 0.4294,
+ "step": 4921
+ },
+ {
+ "epoch": 2.3271867612293144,
+ "grad_norm": 2.767131805419922,
+ "learning_rate": 3.398346025242823e-06,
+ "loss": 0.4673,
+ "step": 4922
+ },
+ {
+ "epoch": 2.3276595744680852,
+ "grad_norm": 2.5261805057525635,
+ "learning_rate": 3.3977638373783123e-06,
+ "loss": 0.4147,
+ "step": 4923
+ },
+ {
+ "epoch": 2.3281323877068556,
+ "grad_norm": 2.7176897525787354,
+ "learning_rate": 3.3971815936139836e-06,
+ "loss": 0.3885,
+ "step": 4924
+ },
+ {
+ "epoch": 2.3286052009456264,
+ "grad_norm": 2.849043130874634,
+ "learning_rate": 3.396599293986092e-06,
+ "loss": 0.4842,
+ "step": 4925
+ },
+ {
+ "epoch": 2.329078014184397,
+ "grad_norm": 2.550673484802246,
+ "learning_rate": 3.3960169385308927e-06,
+ "loss": 0.4049,
+ "step": 4926
+ },
+ {
+ "epoch": 2.329550827423168,
+ "grad_norm": 3.0821585655212402,
+ "learning_rate": 3.3954345272846477e-06,
+ "loss": 0.53,
+ "step": 4927
+ },
+ {
+ "epoch": 2.3300236406619383,
+ "grad_norm": 2.68658185005188,
+ "learning_rate": 3.3948520602836223e-06,
+ "loss": 0.4592,
+ "step": 4928
+ },
+ {
+ "epoch": 2.330496453900709,
+ "grad_norm": 2.7391903400421143,
+ "learning_rate": 3.394269537564082e-06,
+ "loss": 0.4773,
+ "step": 4929
+ },
+ {
+ "epoch": 2.33096926713948,
+ "grad_norm": 2.665114164352417,
+ "learning_rate": 3.393686959162299e-06,
+ "loss": 0.4671,
+ "step": 4930
+ },
+ {
+ "epoch": 2.3314420803782507,
+ "grad_norm": 2.6827399730682373,
+ "learning_rate": 3.3931043251145477e-06,
+ "loss": 0.4669,
+ "step": 4931
+ },
+ {
+ "epoch": 2.331914893617021,
+ "grad_norm": 3.1760666370391846,
+ "learning_rate": 3.392521635457106e-06,
+ "loss": 0.4729,
+ "step": 4932
+ },
+ {
+ "epoch": 2.332387706855792,
+ "grad_norm": 2.9686226844787598,
+ "learning_rate": 3.3919388902262555e-06,
+ "loss": 0.5017,
+ "step": 4933
+ },
+ {
+ "epoch": 2.3328605200945627,
+ "grad_norm": 2.471325397491455,
+ "learning_rate": 3.3913560894582818e-06,
+ "loss": 0.4195,
+ "step": 4934
+ },
+ {
+ "epoch": 2.3333333333333335,
+ "grad_norm": 2.4062955379486084,
+ "learning_rate": 3.3907732331894732e-06,
+ "loss": 0.3666,
+ "step": 4935
+ },
+ {
+ "epoch": 2.333806146572104,
+ "grad_norm": 2.6800320148468018,
+ "learning_rate": 3.3901903214561206e-06,
+ "loss": 0.4774,
+ "step": 4936
+ },
+ {
+ "epoch": 2.3342789598108746,
+ "grad_norm": 2.923741102218628,
+ "learning_rate": 3.389607354294521e-06,
+ "loss": 0.4546,
+ "step": 4937
+ },
+ {
+ "epoch": 2.3347517730496454,
+ "grad_norm": 3.0034096240997314,
+ "learning_rate": 3.3890243317409716e-06,
+ "loss": 0.5373,
+ "step": 4938
+ },
+ {
+ "epoch": 2.3352245862884162,
+ "grad_norm": 3.0757339000701904,
+ "learning_rate": 3.388441253831775e-06,
+ "loss": 0.4655,
+ "step": 4939
+ },
+ {
+ "epoch": 2.3356973995271866,
+ "grad_norm": 2.5352041721343994,
+ "learning_rate": 3.3878581206032373e-06,
+ "loss": 0.4391,
+ "step": 4940
+ },
+ {
+ "epoch": 2.3361702127659574,
+ "grad_norm": 2.9332237243652344,
+ "learning_rate": 3.3872749320916675e-06,
+ "loss": 0.4685,
+ "step": 4941
+ },
+ {
+ "epoch": 2.336643026004728,
+ "grad_norm": 2.4871222972869873,
+ "learning_rate": 3.386691688333379e-06,
+ "loss": 0.3952,
+ "step": 4942
+ },
+ {
+ "epoch": 2.337115839243499,
+ "grad_norm": 2.6384918689727783,
+ "learning_rate": 3.386108389364687e-06,
+ "loss": 0.4044,
+ "step": 4943
+ },
+ {
+ "epoch": 2.3375886524822693,
+ "grad_norm": 2.3545165061950684,
+ "learning_rate": 3.3855250352219102e-06,
+ "loss": 0.426,
+ "step": 4944
+ },
+ {
+ "epoch": 2.33806146572104,
+ "grad_norm": 2.972242593765259,
+ "learning_rate": 3.3849416259413735e-06,
+ "loss": 0.5033,
+ "step": 4945
+ },
+ {
+ "epoch": 2.338534278959811,
+ "grad_norm": 3.117351770401001,
+ "learning_rate": 3.384358161559401e-06,
+ "loss": 0.4695,
+ "step": 4946
+ },
+ {
+ "epoch": 2.3390070921985817,
+ "grad_norm": 2.888916492462158,
+ "learning_rate": 3.383774642112324e-06,
+ "loss": 0.437,
+ "step": 4947
+ },
+ {
+ "epoch": 2.339479905437352,
+ "grad_norm": 3.0677435398101807,
+ "learning_rate": 3.3831910676364753e-06,
+ "loss": 0.4293,
+ "step": 4948
+ },
+ {
+ "epoch": 2.339952718676123,
+ "grad_norm": 2.8571784496307373,
+ "learning_rate": 3.3826074381681916e-06,
+ "loss": 0.4574,
+ "step": 4949
+ },
+ {
+ "epoch": 2.3404255319148937,
+ "grad_norm": 2.907276153564453,
+ "learning_rate": 3.3820237537438127e-06,
+ "loss": 0.4731,
+ "step": 4950
+ },
+ {
+ "epoch": 2.3408983451536645,
+ "grad_norm": 2.923762559890747,
+ "learning_rate": 3.3814400143996823e-06,
+ "loss": 0.4648,
+ "step": 4951
+ },
+ {
+ "epoch": 2.341371158392435,
+ "grad_norm": 2.6206982135772705,
+ "learning_rate": 3.3808562201721473e-06,
+ "loss": 0.436,
+ "step": 4952
+ },
+ {
+ "epoch": 2.3418439716312056,
+ "grad_norm": 6.279088973999023,
+ "learning_rate": 3.380272371097558e-06,
+ "loss": 0.4461,
+ "step": 4953
+ },
+ {
+ "epoch": 2.3423167848699764,
+ "grad_norm": 2.785297155380249,
+ "learning_rate": 3.3796884672122684e-06,
+ "loss": 0.4619,
+ "step": 4954
+ },
+ {
+ "epoch": 2.342789598108747,
+ "grad_norm": 2.6241793632507324,
+ "learning_rate": 3.379104508552634e-06,
+ "loss": 0.4323,
+ "step": 4955
+ },
+ {
+ "epoch": 2.3432624113475176,
+ "grad_norm": 2.6052167415618896,
+ "learning_rate": 3.378520495155017e-06,
+ "loss": 0.3943,
+ "step": 4956
+ },
+ {
+ "epoch": 2.3437352245862884,
+ "grad_norm": 2.8247411251068115,
+ "learning_rate": 3.3779364270557818e-06,
+ "loss": 0.4689,
+ "step": 4957
+ },
+ {
+ "epoch": 2.344208037825059,
+ "grad_norm": 2.5348927974700928,
+ "learning_rate": 3.377352304291294e-06,
+ "loss": 0.4619,
+ "step": 4958
+ },
+ {
+ "epoch": 2.34468085106383,
+ "grad_norm": 2.906648874282837,
+ "learning_rate": 3.376768126897926e-06,
+ "loss": 0.5191,
+ "step": 4959
+ },
+ {
+ "epoch": 2.3451536643026003,
+ "grad_norm": 2.796870470046997,
+ "learning_rate": 3.3761838949120514e-06,
+ "loss": 0.4227,
+ "step": 4960
+ },
+ {
+ "epoch": 2.345626477541371,
+ "grad_norm": 2.789635419845581,
+ "learning_rate": 3.3755996083700464e-06,
+ "loss": 0.3927,
+ "step": 4961
+ },
+ {
+ "epoch": 2.346099290780142,
+ "grad_norm": 2.86641263961792,
+ "learning_rate": 3.375015267308295e-06,
+ "loss": 0.4097,
+ "step": 4962
+ },
+ {
+ "epoch": 2.3465721040189127,
+ "grad_norm": 2.8374414443969727,
+ "learning_rate": 3.374430871763178e-06,
+ "loss": 0.4566,
+ "step": 4963
+ },
+ {
+ "epoch": 2.347044917257683,
+ "grad_norm": 2.71951961517334,
+ "learning_rate": 3.3738464217710854e-06,
+ "loss": 0.4748,
+ "step": 4964
+ },
+ {
+ "epoch": 2.347517730496454,
+ "grad_norm": 2.6939785480499268,
+ "learning_rate": 3.373261917368408e-06,
+ "loss": 0.4499,
+ "step": 4965
+ },
+ {
+ "epoch": 2.3479905437352246,
+ "grad_norm": 2.862661600112915,
+ "learning_rate": 3.37267735859154e-06,
+ "loss": 0.415,
+ "step": 4966
+ },
+ {
+ "epoch": 2.3484633569739954,
+ "grad_norm": 2.3657119274139404,
+ "learning_rate": 3.3720927454768793e-06,
+ "loss": 0.4112,
+ "step": 4967
+ },
+ {
+ "epoch": 2.348936170212766,
+ "grad_norm": 3.701571464538574,
+ "learning_rate": 3.3715080780608277e-06,
+ "loss": 0.4735,
+ "step": 4968
+ },
+ {
+ "epoch": 2.3494089834515366,
+ "grad_norm": 2.894350528717041,
+ "learning_rate": 3.3709233563797895e-06,
+ "loss": 0.4278,
+ "step": 4969
+ },
+ {
+ "epoch": 2.3498817966903074,
+ "grad_norm": 3.0072877407073975,
+ "learning_rate": 3.3703385804701727e-06,
+ "loss": 0.4718,
+ "step": 4970
+ },
+ {
+ "epoch": 2.350354609929078,
+ "grad_norm": 2.9920408725738525,
+ "learning_rate": 3.369753750368389e-06,
+ "loss": 0.4636,
+ "step": 4971
+ },
+ {
+ "epoch": 2.3508274231678485,
+ "grad_norm": 2.381770372390747,
+ "learning_rate": 3.369168866110853e-06,
+ "loss": 0.3841,
+ "step": 4972
+ },
+ {
+ "epoch": 2.3513002364066193,
+ "grad_norm": 2.6195342540740967,
+ "learning_rate": 3.3685839277339825e-06,
+ "loss": 0.4422,
+ "step": 4973
+ },
+ {
+ "epoch": 2.35177304964539,
+ "grad_norm": 2.885852575302124,
+ "learning_rate": 3.3679989352741992e-06,
+ "loss": 0.4798,
+ "step": 4974
+ },
+ {
+ "epoch": 2.352245862884161,
+ "grad_norm": 2.820004940032959,
+ "learning_rate": 3.367413888767929e-06,
+ "loss": 0.4498,
+ "step": 4975
+ },
+ {
+ "epoch": 2.3527186761229313,
+ "grad_norm": 2.579680919647217,
+ "learning_rate": 3.366828788251599e-06,
+ "loss": 0.4894,
+ "step": 4976
+ },
+ {
+ "epoch": 2.353191489361702,
+ "grad_norm": 2.7509915828704834,
+ "learning_rate": 3.366243633761642e-06,
+ "loss": 0.4354,
+ "step": 4977
+ },
+ {
+ "epoch": 2.353664302600473,
+ "grad_norm": 3.061767339706421,
+ "learning_rate": 3.3656584253344917e-06,
+ "loss": 0.4651,
+ "step": 4978
+ },
+ {
+ "epoch": 2.3541371158392437,
+ "grad_norm": 2.6109485626220703,
+ "learning_rate": 3.365073163006587e-06,
+ "loss": 0.44,
+ "step": 4979
+ },
+ {
+ "epoch": 2.354609929078014,
+ "grad_norm": 3.4247376918792725,
+ "learning_rate": 3.36448784681437e-06,
+ "loss": 0.3993,
+ "step": 4980
+ },
+ {
+ "epoch": 2.355082742316785,
+ "grad_norm": 2.953695297241211,
+ "learning_rate": 3.363902476794285e-06,
+ "loss": 0.4763,
+ "step": 4981
+ },
+ {
+ "epoch": 2.3555555555555556,
+ "grad_norm": 2.836543083190918,
+ "learning_rate": 3.3633170529827806e-06,
+ "loss": 0.4755,
+ "step": 4982
+ },
+ {
+ "epoch": 2.3560283687943264,
+ "grad_norm": 2.944082021713257,
+ "learning_rate": 3.36273157541631e-06,
+ "loss": 0.472,
+ "step": 4983
+ },
+ {
+ "epoch": 2.3565011820330968,
+ "grad_norm": 2.891716957092285,
+ "learning_rate": 3.3621460441313262e-06,
+ "loss": 0.5259,
+ "step": 4984
+ },
+ {
+ "epoch": 2.3569739952718676,
+ "grad_norm": 2.8448829650878906,
+ "learning_rate": 3.3615604591642896e-06,
+ "loss": 0.4587,
+ "step": 4985
+ },
+ {
+ "epoch": 2.3574468085106384,
+ "grad_norm": 3.114393711090088,
+ "learning_rate": 3.36097482055166e-06,
+ "loss": 0.4352,
+ "step": 4986
+ },
+ {
+ "epoch": 2.357919621749409,
+ "grad_norm": 2.964851140975952,
+ "learning_rate": 3.360389128329904e-06,
+ "loss": 0.5015,
+ "step": 4987
+ },
+ {
+ "epoch": 2.3583924349881795,
+ "grad_norm": 2.4819815158843994,
+ "learning_rate": 3.3598033825354893e-06,
+ "loss": 0.3459,
+ "step": 4988
+ },
+ {
+ "epoch": 2.3588652482269503,
+ "grad_norm": 2.635754346847534,
+ "learning_rate": 3.359217583204889e-06,
+ "loss": 0.4367,
+ "step": 4989
+ },
+ {
+ "epoch": 2.359338061465721,
+ "grad_norm": 2.542482376098633,
+ "learning_rate": 3.358631730374576e-06,
+ "loss": 0.3978,
+ "step": 4990
+ },
+ {
+ "epoch": 2.359810874704492,
+ "grad_norm": 2.614018678665161,
+ "learning_rate": 3.358045824081031e-06,
+ "loss": 0.424,
+ "step": 4991
+ },
+ {
+ "epoch": 2.3602836879432623,
+ "grad_norm": 2.775373697280884,
+ "learning_rate": 3.3574598643607354e-06,
+ "loss": 0.4901,
+ "step": 4992
+ },
+ {
+ "epoch": 2.360756501182033,
+ "grad_norm": 3.091381311416626,
+ "learning_rate": 3.356873851250173e-06,
+ "loss": 0.4954,
+ "step": 4993
+ },
+ {
+ "epoch": 2.361229314420804,
+ "grad_norm": 2.440023422241211,
+ "learning_rate": 3.3562877847858337e-06,
+ "loss": 0.4053,
+ "step": 4994
+ },
+ {
+ "epoch": 2.3617021276595747,
+ "grad_norm": 2.8879518508911133,
+ "learning_rate": 3.3557016650042084e-06,
+ "loss": 0.4766,
+ "step": 4995
+ },
+ {
+ "epoch": 2.362174940898345,
+ "grad_norm": 3.1298391819000244,
+ "learning_rate": 3.355115491941793e-06,
+ "loss": 0.4743,
+ "step": 4996
+ },
+ {
+ "epoch": 2.362647754137116,
+ "grad_norm": 3.3325259685516357,
+ "learning_rate": 3.3545292656350845e-06,
+ "loss": 0.4703,
+ "step": 4997
+ },
+ {
+ "epoch": 2.3631205673758866,
+ "grad_norm": 2.7935359477996826,
+ "learning_rate": 3.353942986120587e-06,
+ "loss": 0.432,
+ "step": 4998
+ },
+ {
+ "epoch": 2.3635933806146574,
+ "grad_norm": 2.623624324798584,
+ "learning_rate": 3.3533566534348033e-06,
+ "loss": 0.4302,
+ "step": 4999
+ },
+ {
+ "epoch": 2.3640661938534278,
+ "grad_norm": 3.1467108726501465,
+ "learning_rate": 3.3527702676142426e-06,
+ "loss": 0.4661,
+ "step": 5000
+ },
+ {
+ "epoch": 2.3645390070921986,
+ "grad_norm": 2.5364840030670166,
+ "learning_rate": 3.352183828695418e-06,
+ "loss": 0.4134,
+ "step": 5001
+ },
+ {
+ "epoch": 2.3650118203309693,
+ "grad_norm": 3.002777338027954,
+ "learning_rate": 3.3515973367148415e-06,
+ "loss": 0.3771,
+ "step": 5002
+ },
+ {
+ "epoch": 2.36548463356974,
+ "grad_norm": 2.660043954849243,
+ "learning_rate": 3.3510107917090335e-06,
+ "loss": 0.4254,
+ "step": 5003
+ },
+ {
+ "epoch": 2.3659574468085105,
+ "grad_norm": 2.7041075229644775,
+ "learning_rate": 3.3504241937145148e-06,
+ "loss": 0.4651,
+ "step": 5004
+ },
+ {
+ "epoch": 2.3664302600472813,
+ "grad_norm": 2.7387280464172363,
+ "learning_rate": 3.349837542767811e-06,
+ "loss": 0.3874,
+ "step": 5005
+ },
+ {
+ "epoch": 2.366903073286052,
+ "grad_norm": 3.012188196182251,
+ "learning_rate": 3.349250838905449e-06,
+ "loss": 0.4508,
+ "step": 5006
+ },
+ {
+ "epoch": 2.3673758865248224,
+ "grad_norm": 2.3108484745025635,
+ "learning_rate": 3.3486640821639616e-06,
+ "loss": 0.3783,
+ "step": 5007
+ },
+ {
+ "epoch": 2.3678486997635932,
+ "grad_norm": 3.2188332080841064,
+ "learning_rate": 3.3480772725798837e-06,
+ "loss": 0.4879,
+ "step": 5008
+ },
+ {
+ "epoch": 2.368321513002364,
+ "grad_norm": 2.566087484359741,
+ "learning_rate": 3.3474904101897526e-06,
+ "loss": 0.3847,
+ "step": 5009
+ },
+ {
+ "epoch": 2.368794326241135,
+ "grad_norm": 2.5581698417663574,
+ "learning_rate": 3.3469034950301092e-06,
+ "loss": 0.4201,
+ "step": 5010
+ },
+ {
+ "epoch": 2.3692671394799056,
+ "grad_norm": 2.900296926498413,
+ "learning_rate": 3.3463165271374992e-06,
+ "loss": 0.4568,
+ "step": 5011
+ },
+ {
+ "epoch": 2.369739952718676,
+ "grad_norm": 2.8239312171936035,
+ "learning_rate": 3.34572950654847e-06,
+ "loss": 0.4583,
+ "step": 5012
+ },
+ {
+ "epoch": 2.370212765957447,
+ "grad_norm": 3.219465970993042,
+ "learning_rate": 3.3451424332995723e-06,
+ "loss": 0.5435,
+ "step": 5013
+ },
+ {
+ "epoch": 2.3706855791962176,
+ "grad_norm": 3.3111915588378906,
+ "learning_rate": 3.344555307427362e-06,
+ "loss": 0.435,
+ "step": 5014
+ },
+ {
+ "epoch": 2.371158392434988,
+ "grad_norm": 3.296668529510498,
+ "learning_rate": 3.3439681289683946e-06,
+ "loss": 0.4738,
+ "step": 5015
+ },
+ {
+ "epoch": 2.3716312056737587,
+ "grad_norm": 3.005722761154175,
+ "learning_rate": 3.343380897959234e-06,
+ "loss": 0.4267,
+ "step": 5016
+ },
+ {
+ "epoch": 2.3721040189125295,
+ "grad_norm": 2.7844085693359375,
+ "learning_rate": 3.3427936144364425e-06,
+ "loss": 0.4558,
+ "step": 5017
+ },
+ {
+ "epoch": 2.3725768321513003,
+ "grad_norm": 2.7532076835632324,
+ "learning_rate": 3.3422062784365884e-06,
+ "loss": 0.4144,
+ "step": 5018
+ },
+ {
+ "epoch": 2.373049645390071,
+ "grad_norm": 2.835764169692993,
+ "learning_rate": 3.3416188899962413e-06,
+ "loss": 0.4945,
+ "step": 5019
+ },
+ {
+ "epoch": 2.3735224586288415,
+ "grad_norm": 3.1513726711273193,
+ "learning_rate": 3.3410314491519767e-06,
+ "loss": 0.4971,
+ "step": 5020
+ },
+ {
+ "epoch": 2.3739952718676123,
+ "grad_norm": 3.0162220001220703,
+ "learning_rate": 3.3404439559403723e-06,
+ "loss": 0.4477,
+ "step": 5021
+ },
+ {
+ "epoch": 2.374468085106383,
+ "grad_norm": 2.676391363143921,
+ "learning_rate": 3.3398564103980073e-06,
+ "loss": 0.432,
+ "step": 5022
+ },
+ {
+ "epoch": 2.3749408983451534,
+ "grad_norm": 2.7806248664855957,
+ "learning_rate": 3.3392688125614663e-06,
+ "loss": 0.4818,
+ "step": 5023
+ },
+ {
+ "epoch": 2.3754137115839242,
+ "grad_norm": 2.968806505203247,
+ "learning_rate": 3.3386811624673373e-06,
+ "loss": 0.4893,
+ "step": 5024
+ },
+ {
+ "epoch": 2.375886524822695,
+ "grad_norm": 2.992684841156006,
+ "learning_rate": 3.3380934601522087e-06,
+ "loss": 0.4423,
+ "step": 5025
+ },
+ {
+ "epoch": 2.376359338061466,
+ "grad_norm": 2.578420639038086,
+ "learning_rate": 3.3375057056526762e-06,
+ "loss": 0.3682,
+ "step": 5026
+ },
+ {
+ "epoch": 2.3768321513002366,
+ "grad_norm": 2.7683115005493164,
+ "learning_rate": 3.336917899005335e-06,
+ "loss": 0.4038,
+ "step": 5027
+ },
+ {
+ "epoch": 2.377304964539007,
+ "grad_norm": 2.838812828063965,
+ "learning_rate": 3.336330040246786e-06,
+ "loss": 0.442,
+ "step": 5028
+ },
+ {
+ "epoch": 2.3777777777777778,
+ "grad_norm": 2.766136646270752,
+ "learning_rate": 3.335742129413633e-06,
+ "loss": 0.4745,
+ "step": 5029
+ },
+ {
+ "epoch": 2.3782505910165486,
+ "grad_norm": 2.862656593322754,
+ "learning_rate": 3.3351541665424812e-06,
+ "loss": 0.4324,
+ "step": 5030
+ },
+ {
+ "epoch": 2.378723404255319,
+ "grad_norm": 2.71425199508667,
+ "learning_rate": 3.3345661516699433e-06,
+ "loss": 0.4013,
+ "step": 5031
+ },
+ {
+ "epoch": 2.3791962174940897,
+ "grad_norm": 2.8404030799865723,
+ "learning_rate": 3.333978084832629e-06,
+ "loss": 0.5038,
+ "step": 5032
+ },
+ {
+ "epoch": 2.3796690307328605,
+ "grad_norm": 2.965851068496704,
+ "learning_rate": 3.3333899660671574e-06,
+ "loss": 0.4668,
+ "step": 5033
+ },
+ {
+ "epoch": 2.3801418439716313,
+ "grad_norm": 2.686452627182007,
+ "learning_rate": 3.3328017954101464e-06,
+ "loss": 0.4167,
+ "step": 5034
+ },
+ {
+ "epoch": 2.380614657210402,
+ "grad_norm": 2.8676156997680664,
+ "learning_rate": 3.3322135728982197e-06,
+ "loss": 0.4531,
+ "step": 5035
+ },
+ {
+ "epoch": 2.3810874704491725,
+ "grad_norm": 2.4456300735473633,
+ "learning_rate": 3.3316252985680026e-06,
+ "loss": 0.4173,
+ "step": 5036
+ },
+ {
+ "epoch": 2.3815602836879433,
+ "grad_norm": 2.5472559928894043,
+ "learning_rate": 3.331036972456124e-06,
+ "loss": 0.3926,
+ "step": 5037
+ },
+ {
+ "epoch": 2.382033096926714,
+ "grad_norm": 2.81900954246521,
+ "learning_rate": 3.330448594599218e-06,
+ "loss": 0.4785,
+ "step": 5038
+ },
+ {
+ "epoch": 2.3825059101654844,
+ "grad_norm": 3.0930590629577637,
+ "learning_rate": 3.329860165033919e-06,
+ "loss": 0.4587,
+ "step": 5039
+ },
+ {
+ "epoch": 2.382978723404255,
+ "grad_norm": 3.0553040504455566,
+ "learning_rate": 3.3292716837968673e-06,
+ "loss": 0.5285,
+ "step": 5040
+ },
+ {
+ "epoch": 2.383451536643026,
+ "grad_norm": 2.577580690383911,
+ "learning_rate": 3.328683150924704e-06,
+ "loss": 0.4184,
+ "step": 5041
+ },
+ {
+ "epoch": 2.383924349881797,
+ "grad_norm": 2.6430366039276123,
+ "learning_rate": 3.3280945664540735e-06,
+ "loss": 0.4636,
+ "step": 5042
+ },
+ {
+ "epoch": 2.3843971631205676,
+ "grad_norm": 3.228360891342163,
+ "learning_rate": 3.3275059304216255e-06,
+ "loss": 0.455,
+ "step": 5043
+ },
+ {
+ "epoch": 2.384869976359338,
+ "grad_norm": 2.776142120361328,
+ "learning_rate": 3.3269172428640125e-06,
+ "loss": 0.4785,
+ "step": 5044
+ },
+ {
+ "epoch": 2.3853427895981087,
+ "grad_norm": 2.755671739578247,
+ "learning_rate": 3.3263285038178882e-06,
+ "loss": 0.4625,
+ "step": 5045
+ },
+ {
+ "epoch": 2.3858156028368795,
+ "grad_norm": 3.061004400253296,
+ "learning_rate": 3.3257397133199114e-06,
+ "loss": 0.4641,
+ "step": 5046
+ },
+ {
+ "epoch": 2.38628841607565,
+ "grad_norm": 2.8391458988189697,
+ "learning_rate": 3.3251508714067432e-06,
+ "loss": 0.5003,
+ "step": 5047
+ },
+ {
+ "epoch": 2.3867612293144207,
+ "grad_norm": 2.390810966491699,
+ "learning_rate": 3.324561978115049e-06,
+ "loss": 0.4446,
+ "step": 5048
+ },
+ {
+ "epoch": 2.3872340425531915,
+ "grad_norm": 2.7760825157165527,
+ "learning_rate": 3.323973033481496e-06,
+ "loss": 0.4443,
+ "step": 5049
+ },
+ {
+ "epoch": 2.3877068557919623,
+ "grad_norm": 3.157893419265747,
+ "learning_rate": 3.3233840375427552e-06,
+ "loss": 0.4934,
+ "step": 5050
+ },
+ {
+ "epoch": 2.388179669030733,
+ "grad_norm": 2.7245349884033203,
+ "learning_rate": 3.3227949903355e-06,
+ "loss": 0.4254,
+ "step": 5051
+ },
+ {
+ "epoch": 2.3886524822695034,
+ "grad_norm": 2.6674044132232666,
+ "learning_rate": 3.322205891896409e-06,
+ "loss": 0.4116,
+ "step": 5052
+ },
+ {
+ "epoch": 2.3891252955082742,
+ "grad_norm": 3.1490554809570312,
+ "learning_rate": 3.3216167422621627e-06,
+ "loss": 0.4604,
+ "step": 5053
+ },
+ {
+ "epoch": 2.389598108747045,
+ "grad_norm": 2.725731134414673,
+ "learning_rate": 3.321027541469444e-06,
+ "loss": 0.4836,
+ "step": 5054
+ },
+ {
+ "epoch": 2.3900709219858154,
+ "grad_norm": 2.5378828048706055,
+ "learning_rate": 3.3204382895549407e-06,
+ "loss": 0.4228,
+ "step": 5055
+ },
+ {
+ "epoch": 2.390543735224586,
+ "grad_norm": 2.8191192150115967,
+ "learning_rate": 3.3198489865553427e-06,
+ "loss": 0.4371,
+ "step": 5056
+ },
+ {
+ "epoch": 2.391016548463357,
+ "grad_norm": 2.5676498413085938,
+ "learning_rate": 3.3192596325073433e-06,
+ "loss": 0.4463,
+ "step": 5057
+ },
+ {
+ "epoch": 2.391489361702128,
+ "grad_norm": 3.0846121311187744,
+ "learning_rate": 3.3186702274476397e-06,
+ "loss": 0.5049,
+ "step": 5058
+ },
+ {
+ "epoch": 2.3919621749408986,
+ "grad_norm": 2.6085152626037598,
+ "learning_rate": 3.3180807714129293e-06,
+ "loss": 0.4376,
+ "step": 5059
+ },
+ {
+ "epoch": 2.392434988179669,
+ "grad_norm": 3.0218591690063477,
+ "learning_rate": 3.3174912644399172e-06,
+ "loss": 0.4734,
+ "step": 5060
+ },
+ {
+ "epoch": 2.3929078014184397,
+ "grad_norm": 2.5904781818389893,
+ "learning_rate": 3.316901706565308e-06,
+ "loss": 0.4924,
+ "step": 5061
+ },
+ {
+ "epoch": 2.3933806146572105,
+ "grad_norm": 2.675478458404541,
+ "learning_rate": 3.3163120978258123e-06,
+ "loss": 0.4072,
+ "step": 5062
+ },
+ {
+ "epoch": 2.393853427895981,
+ "grad_norm": 2.7944445610046387,
+ "learning_rate": 3.3157224382581415e-06,
+ "loss": 0.4328,
+ "step": 5063
+ },
+ {
+ "epoch": 2.3943262411347517,
+ "grad_norm": 2.846224546432495,
+ "learning_rate": 3.315132727899012e-06,
+ "loss": 0.4447,
+ "step": 5064
+ },
+ {
+ "epoch": 2.3947990543735225,
+ "grad_norm": 2.6825828552246094,
+ "learning_rate": 3.3145429667851402e-06,
+ "loss": 0.4528,
+ "step": 5065
+ },
+ {
+ "epoch": 2.3952718676122933,
+ "grad_norm": 3.0305285453796387,
+ "learning_rate": 3.3139531549532505e-06,
+ "loss": 0.4538,
+ "step": 5066
+ },
+ {
+ "epoch": 2.395744680851064,
+ "grad_norm": 2.707540988922119,
+ "learning_rate": 3.313363292440067e-06,
+ "loss": 0.4412,
+ "step": 5067
+ },
+ {
+ "epoch": 2.3962174940898344,
+ "grad_norm": 3.0458385944366455,
+ "learning_rate": 3.3127733792823173e-06,
+ "loss": 0.4587,
+ "step": 5068
+ },
+ {
+ "epoch": 2.396690307328605,
+ "grad_norm": 2.7711992263793945,
+ "learning_rate": 3.312183415516733e-06,
+ "loss": 0.4157,
+ "step": 5069
+ },
+ {
+ "epoch": 2.397163120567376,
+ "grad_norm": 2.6953988075256348,
+ "learning_rate": 3.3115934011800494e-06,
+ "loss": 0.3828,
+ "step": 5070
+ },
+ {
+ "epoch": 2.3976359338061464,
+ "grad_norm": 3.033721923828125,
+ "learning_rate": 3.311003336309003e-06,
+ "loss": 0.5204,
+ "step": 5071
+ },
+ {
+ "epoch": 2.398108747044917,
+ "grad_norm": 2.6134517192840576,
+ "learning_rate": 3.3104132209403355e-06,
+ "loss": 0.4181,
+ "step": 5072
+ },
+ {
+ "epoch": 2.398581560283688,
+ "grad_norm": 2.8800251483917236,
+ "learning_rate": 3.30982305511079e-06,
+ "loss": 0.466,
+ "step": 5073
+ },
+ {
+ "epoch": 2.3990543735224588,
+ "grad_norm": 2.5043210983276367,
+ "learning_rate": 3.309232838857114e-06,
+ "loss": 0.4161,
+ "step": 5074
+ },
+ {
+ "epoch": 2.3995271867612296,
+ "grad_norm": 2.6577322483062744,
+ "learning_rate": 3.308642572216057e-06,
+ "loss": 0.465,
+ "step": 5075
+ },
+ {
+ "epoch": 2.4,
+ "grad_norm": 2.549098253250122,
+ "learning_rate": 3.3080522552243734e-06,
+ "loss": 0.4571,
+ "step": 5076
+ },
+ {
+ "epoch": 2.4004728132387707,
+ "grad_norm": 2.881958246231079,
+ "learning_rate": 3.3074618879188186e-06,
+ "loss": 0.4443,
+ "step": 5077
+ },
+ {
+ "epoch": 2.4009456264775415,
+ "grad_norm": 2.608397960662842,
+ "learning_rate": 3.3068714703361528e-06,
+ "loss": 0.3843,
+ "step": 5078
+ },
+ {
+ "epoch": 2.401418439716312,
+ "grad_norm": 2.8666789531707764,
+ "learning_rate": 3.306281002513139e-06,
+ "loss": 0.4857,
+ "step": 5079
+ },
+ {
+ "epoch": 2.4018912529550827,
+ "grad_norm": 2.9008588790893555,
+ "learning_rate": 3.3056904844865422e-06,
+ "loss": 0.4454,
+ "step": 5080
+ },
+ {
+ "epoch": 2.4023640661938535,
+ "grad_norm": 2.7446060180664062,
+ "learning_rate": 3.3050999162931315e-06,
+ "loss": 0.4522,
+ "step": 5081
+ },
+ {
+ "epoch": 2.4028368794326243,
+ "grad_norm": 2.787116765975952,
+ "learning_rate": 3.3045092979696804e-06,
+ "loss": 0.4714,
+ "step": 5082
+ },
+ {
+ "epoch": 2.403309692671395,
+ "grad_norm": 2.7494192123413086,
+ "learning_rate": 3.3039186295529613e-06,
+ "loss": 0.4107,
+ "step": 5083
+ },
+ {
+ "epoch": 2.4037825059101654,
+ "grad_norm": 2.733794927597046,
+ "learning_rate": 3.303327911079755e-06,
+ "loss": 0.4169,
+ "step": 5084
+ },
+ {
+ "epoch": 2.404255319148936,
+ "grad_norm": 2.7313334941864014,
+ "learning_rate": 3.3027371425868422e-06,
+ "loss": 0.4287,
+ "step": 5085
+ },
+ {
+ "epoch": 2.404728132387707,
+ "grad_norm": 2.7832977771759033,
+ "learning_rate": 3.3021463241110075e-06,
+ "loss": 0.5307,
+ "step": 5086
+ },
+ {
+ "epoch": 2.4052009456264773,
+ "grad_norm": 2.6615281105041504,
+ "learning_rate": 3.301555455689038e-06,
+ "loss": 0.4519,
+ "step": 5087
+ },
+ {
+ "epoch": 2.405673758865248,
+ "grad_norm": 2.343921422958374,
+ "learning_rate": 3.3009645373577264e-06,
+ "loss": 0.46,
+ "step": 5088
+ },
+ {
+ "epoch": 2.406146572104019,
+ "grad_norm": 2.6115355491638184,
+ "learning_rate": 3.300373569153864e-06,
+ "loss": 0.4782,
+ "step": 5089
+ },
+ {
+ "epoch": 2.4066193853427897,
+ "grad_norm": 2.730625629425049,
+ "learning_rate": 3.299782551114249e-06,
+ "loss": 0.4632,
+ "step": 5090
+ },
+ {
+ "epoch": 2.40709219858156,
+ "grad_norm": 2.4495043754577637,
+ "learning_rate": 3.2991914832756824e-06,
+ "loss": 0.4243,
+ "step": 5091
+ },
+ {
+ "epoch": 2.407565011820331,
+ "grad_norm": 2.8731648921966553,
+ "learning_rate": 3.2986003656749654e-06,
+ "loss": 0.4262,
+ "step": 5092
+ },
+ {
+ "epoch": 2.4080378250591017,
+ "grad_norm": 2.870342969894409,
+ "learning_rate": 3.2980091983489053e-06,
+ "loss": 0.4735,
+ "step": 5093
+ },
+ {
+ "epoch": 2.4085106382978725,
+ "grad_norm": 2.500786542892456,
+ "learning_rate": 3.297417981334312e-06,
+ "loss": 0.4007,
+ "step": 5094
+ },
+ {
+ "epoch": 2.408983451536643,
+ "grad_norm": 2.7787322998046875,
+ "learning_rate": 3.2968267146679978e-06,
+ "loss": 0.493,
+ "step": 5095
+ },
+ {
+ "epoch": 2.4094562647754136,
+ "grad_norm": 2.5229599475860596,
+ "learning_rate": 3.2962353983867783e-06,
+ "loss": 0.3676,
+ "step": 5096
+ },
+ {
+ "epoch": 2.4099290780141844,
+ "grad_norm": 3.1955904960632324,
+ "learning_rate": 3.2956440325274715e-06,
+ "loss": 0.4888,
+ "step": 5097
+ },
+ {
+ "epoch": 2.4104018912529552,
+ "grad_norm": 2.8580288887023926,
+ "learning_rate": 3.2950526171268995e-06,
+ "loss": 0.4892,
+ "step": 5098
+ },
+ {
+ "epoch": 2.4108747044917256,
+ "grad_norm": 2.6321749687194824,
+ "learning_rate": 3.294461152221887e-06,
+ "loss": 0.3823,
+ "step": 5099
+ },
+ {
+ "epoch": 2.4113475177304964,
+ "grad_norm": 2.881127119064331,
+ "learning_rate": 3.293869637849263e-06,
+ "loss": 0.4569,
+ "step": 5100
+ },
+ {
+ "epoch": 2.411820330969267,
+ "grad_norm": 2.7742316722869873,
+ "learning_rate": 3.293278074045857e-06,
+ "loss": 0.4445,
+ "step": 5101
+ },
+ {
+ "epoch": 2.412293144208038,
+ "grad_norm": 2.546701431274414,
+ "learning_rate": 3.2926864608485037e-06,
+ "loss": 0.3995,
+ "step": 5102
+ },
+ {
+ "epoch": 2.4127659574468083,
+ "grad_norm": 2.588226318359375,
+ "learning_rate": 3.292094798294041e-06,
+ "loss": 0.4081,
+ "step": 5103
+ },
+ {
+ "epoch": 2.413238770685579,
+ "grad_norm": 2.968689441680908,
+ "learning_rate": 3.2915030864193077e-06,
+ "loss": 0.4475,
+ "step": 5104
+ },
+ {
+ "epoch": 2.41371158392435,
+ "grad_norm": 2.9249184131622314,
+ "learning_rate": 3.290911325261148e-06,
+ "loss": 0.4763,
+ "step": 5105
+ },
+ {
+ "epoch": 2.4141843971631207,
+ "grad_norm": 2.817596673965454,
+ "learning_rate": 3.2903195148564083e-06,
+ "loss": 0.4451,
+ "step": 5106
+ },
+ {
+ "epoch": 2.414657210401891,
+ "grad_norm": 2.6465954780578613,
+ "learning_rate": 3.2897276552419377e-06,
+ "loss": 0.4665,
+ "step": 5107
+ },
+ {
+ "epoch": 2.415130023640662,
+ "grad_norm": 2.8613853454589844,
+ "learning_rate": 3.2891357464545885e-06,
+ "loss": 0.4398,
+ "step": 5108
+ },
+ {
+ "epoch": 2.4156028368794327,
+ "grad_norm": 2.756321907043457,
+ "learning_rate": 3.2885437885312175e-06,
+ "loss": 0.4634,
+ "step": 5109
+ },
+ {
+ "epoch": 2.4160756501182035,
+ "grad_norm": 2.8965282440185547,
+ "learning_rate": 3.287951781508682e-06,
+ "loss": 0.4319,
+ "step": 5110
+ },
+ {
+ "epoch": 2.416548463356974,
+ "grad_norm": 2.896756172180176,
+ "learning_rate": 3.287359725423844e-06,
+ "loss": 0.4771,
+ "step": 5111
+ },
+ {
+ "epoch": 2.4170212765957446,
+ "grad_norm": 2.952911376953125,
+ "learning_rate": 3.286767620313569e-06,
+ "loss": 0.5026,
+ "step": 5112
+ },
+ {
+ "epoch": 2.4174940898345154,
+ "grad_norm": 3.850515604019165,
+ "learning_rate": 3.2861754662147234e-06,
+ "loss": 0.4387,
+ "step": 5113
+ },
+ {
+ "epoch": 2.417966903073286,
+ "grad_norm": 3.0072689056396484,
+ "learning_rate": 3.2855832631641794e-06,
+ "loss": 0.4586,
+ "step": 5114
+ },
+ {
+ "epoch": 2.4184397163120566,
+ "grad_norm": 3.166790246963501,
+ "learning_rate": 3.2849910111988092e-06,
+ "loss": 0.4842,
+ "step": 5115
+ },
+ {
+ "epoch": 2.4189125295508274,
+ "grad_norm": 3.5397679805755615,
+ "learning_rate": 3.284398710355492e-06,
+ "loss": 0.5138,
+ "step": 5116
+ },
+ {
+ "epoch": 2.419385342789598,
+ "grad_norm": 2.779609441757202,
+ "learning_rate": 3.283806360671106e-06,
+ "loss": 0.4049,
+ "step": 5117
+ },
+ {
+ "epoch": 2.419858156028369,
+ "grad_norm": 2.5924575328826904,
+ "learning_rate": 3.283213962182535e-06,
+ "loss": 0.433,
+ "step": 5118
+ },
+ {
+ "epoch": 2.4203309692671393,
+ "grad_norm": 2.7429699897766113,
+ "learning_rate": 3.282621514926665e-06,
+ "loss": 0.4674,
+ "step": 5119
+ },
+ {
+ "epoch": 2.42080378250591,
+ "grad_norm": 2.8113889694213867,
+ "learning_rate": 3.2820290189403846e-06,
+ "loss": 0.3898,
+ "step": 5120
+ },
+ {
+ "epoch": 2.421276595744681,
+ "grad_norm": 2.867105722427368,
+ "learning_rate": 3.2814364742605863e-06,
+ "loss": 0.4439,
+ "step": 5121
+ },
+ {
+ "epoch": 2.4217494089834517,
+ "grad_norm": 2.428597927093506,
+ "learning_rate": 3.2808438809241654e-06,
+ "loss": 0.4339,
+ "step": 5122
+ },
+ {
+ "epoch": 2.422222222222222,
+ "grad_norm": 3.071735143661499,
+ "learning_rate": 3.2802512389680203e-06,
+ "loss": 0.4583,
+ "step": 5123
+ },
+ {
+ "epoch": 2.422695035460993,
+ "grad_norm": 3.046313762664795,
+ "learning_rate": 3.279658548429051e-06,
+ "loss": 0.5351,
+ "step": 5124
+ },
+ {
+ "epoch": 2.4231678486997636,
+ "grad_norm": 2.8412697315216064,
+ "learning_rate": 3.279065809344163e-06,
+ "loss": 0.5258,
+ "step": 5125
+ },
+ {
+ "epoch": 2.4236406619385344,
+ "grad_norm": 2.887169122695923,
+ "learning_rate": 3.278473021750263e-06,
+ "loss": 0.4568,
+ "step": 5126
+ },
+ {
+ "epoch": 2.424113475177305,
+ "grad_norm": 2.8316574096679688,
+ "learning_rate": 3.2778801856842624e-06,
+ "loss": 0.46,
+ "step": 5127
+ },
+ {
+ "epoch": 2.4245862884160756,
+ "grad_norm": 2.7660772800445557,
+ "learning_rate": 3.277287301183073e-06,
+ "loss": 0.4323,
+ "step": 5128
+ },
+ {
+ "epoch": 2.4250591016548464,
+ "grad_norm": 2.737682819366455,
+ "learning_rate": 3.276694368283611e-06,
+ "loss": 0.4296,
+ "step": 5129
+ },
+ {
+ "epoch": 2.425531914893617,
+ "grad_norm": 2.8807425498962402,
+ "learning_rate": 3.276101387022797e-06,
+ "loss": 0.4673,
+ "step": 5130
+ },
+ {
+ "epoch": 2.4260047281323875,
+ "grad_norm": 2.530526876449585,
+ "learning_rate": 3.275508357437552e-06,
+ "loss": 0.416,
+ "step": 5131
+ },
+ {
+ "epoch": 2.4264775413711583,
+ "grad_norm": 3.1189746856689453,
+ "learning_rate": 3.274915279564803e-06,
+ "loss": 0.4171,
+ "step": 5132
+ },
+ {
+ "epoch": 2.426950354609929,
+ "grad_norm": 2.6612462997436523,
+ "learning_rate": 3.274322153441477e-06,
+ "loss": 0.4104,
+ "step": 5133
+ },
+ {
+ "epoch": 2.4274231678487,
+ "grad_norm": 2.717973470687866,
+ "learning_rate": 3.2737289791045064e-06,
+ "loss": 0.479,
+ "step": 5134
+ },
+ {
+ "epoch": 2.4278959810874703,
+ "grad_norm": 2.764216661453247,
+ "learning_rate": 3.2731357565908247e-06,
+ "loss": 0.481,
+ "step": 5135
+ },
+ {
+ "epoch": 2.428368794326241,
+ "grad_norm": 2.5081393718719482,
+ "learning_rate": 3.272542485937369e-06,
+ "loss": 0.4592,
+ "step": 5136
+ },
+ {
+ "epoch": 2.428841607565012,
+ "grad_norm": 3.1380364894866943,
+ "learning_rate": 3.271949167181081e-06,
+ "loss": 0.4179,
+ "step": 5137
+ },
+ {
+ "epoch": 2.4293144208037827,
+ "grad_norm": 2.9275963306427,
+ "learning_rate": 3.2713558003589026e-06,
+ "loss": 0.5196,
+ "step": 5138
+ },
+ {
+ "epoch": 2.429787234042553,
+ "grad_norm": 2.8215506076812744,
+ "learning_rate": 3.270762385507781e-06,
+ "loss": 0.4081,
+ "step": 5139
+ },
+ {
+ "epoch": 2.430260047281324,
+ "grad_norm": 2.9185614585876465,
+ "learning_rate": 3.270168922664665e-06,
+ "loss": 0.4936,
+ "step": 5140
+ },
+ {
+ "epoch": 2.4307328605200946,
+ "grad_norm": 2.6507248878479004,
+ "learning_rate": 3.269575411866507e-06,
+ "loss": 0.4834,
+ "step": 5141
+ },
+ {
+ "epoch": 2.4312056737588654,
+ "grad_norm": 2.864741563796997,
+ "learning_rate": 3.2689818531502637e-06,
+ "loss": 0.4562,
+ "step": 5142
+ },
+ {
+ "epoch": 2.431678486997636,
+ "grad_norm": 2.806919813156128,
+ "learning_rate": 3.2683882465528917e-06,
+ "loss": 0.4645,
+ "step": 5143
+ },
+ {
+ "epoch": 2.4321513002364066,
+ "grad_norm": 2.733372211456299,
+ "learning_rate": 3.267794592111353e-06,
+ "loss": 0.4123,
+ "step": 5144
+ },
+ {
+ "epoch": 2.4326241134751774,
+ "grad_norm": 2.8005833625793457,
+ "learning_rate": 3.2672008898626116e-06,
+ "loss": 0.4343,
+ "step": 5145
+ },
+ {
+ "epoch": 2.433096926713948,
+ "grad_norm": 3.2339670658111572,
+ "learning_rate": 3.2666071398436354e-06,
+ "loss": 0.4017,
+ "step": 5146
+ },
+ {
+ "epoch": 2.4335697399527185,
+ "grad_norm": 2.510251760482788,
+ "learning_rate": 3.2660133420913932e-06,
+ "loss": 0.3882,
+ "step": 5147
+ },
+ {
+ "epoch": 2.4340425531914893,
+ "grad_norm": 3.5633628368377686,
+ "learning_rate": 3.26541949664286e-06,
+ "loss": 0.4766,
+ "step": 5148
+ },
+ {
+ "epoch": 2.43451536643026,
+ "grad_norm": 2.8246724605560303,
+ "learning_rate": 3.26482560353501e-06,
+ "loss": 0.3728,
+ "step": 5149
+ },
+ {
+ "epoch": 2.434988179669031,
+ "grad_norm": 2.4923641681671143,
+ "learning_rate": 3.264231662804823e-06,
+ "loss": 0.4346,
+ "step": 5150
+ },
+ {
+ "epoch": 2.4354609929078013,
+ "grad_norm": 3.180874824523926,
+ "learning_rate": 3.2636376744892827e-06,
+ "loss": 0.4351,
+ "step": 5151
+ },
+ {
+ "epoch": 2.435933806146572,
+ "grad_norm": 2.6933515071868896,
+ "learning_rate": 3.263043638625373e-06,
+ "loss": 0.4293,
+ "step": 5152
+ },
+ {
+ "epoch": 2.436406619385343,
+ "grad_norm": 2.584132194519043,
+ "learning_rate": 3.262449555250081e-06,
+ "loss": 0.4589,
+ "step": 5153
+ },
+ {
+ "epoch": 2.4368794326241137,
+ "grad_norm": 2.8103036880493164,
+ "learning_rate": 3.2618554244003985e-06,
+ "loss": 0.463,
+ "step": 5154
+ },
+ {
+ "epoch": 2.437352245862884,
+ "grad_norm": 2.809070587158203,
+ "learning_rate": 3.2612612461133197e-06,
+ "loss": 0.4629,
+ "step": 5155
+ },
+ {
+ "epoch": 2.437825059101655,
+ "grad_norm": 2.98148512840271,
+ "learning_rate": 3.2606670204258405e-06,
+ "loss": 0.451,
+ "step": 5156
+ },
+ {
+ "epoch": 2.4382978723404256,
+ "grad_norm": 2.691047191619873,
+ "learning_rate": 3.2600727473749614e-06,
+ "loss": 0.3878,
+ "step": 5157
+ },
+ {
+ "epoch": 2.4387706855791964,
+ "grad_norm": 2.900360345840454,
+ "learning_rate": 3.2594784269976856e-06,
+ "loss": 0.4216,
+ "step": 5158
+ },
+ {
+ "epoch": 2.4392434988179668,
+ "grad_norm": 2.8449952602386475,
+ "learning_rate": 3.258884059331019e-06,
+ "loss": 0.4268,
+ "step": 5159
+ },
+ {
+ "epoch": 2.4397163120567376,
+ "grad_norm": 2.7226388454437256,
+ "learning_rate": 3.258289644411969e-06,
+ "loss": 0.4381,
+ "step": 5160
+ },
+ {
+ "epoch": 2.4401891252955084,
+ "grad_norm": 2.513946056365967,
+ "learning_rate": 3.257695182277547e-06,
+ "loss": 0.4566,
+ "step": 5161
+ },
+ {
+ "epoch": 2.440661938534279,
+ "grad_norm": 2.9941394329071045,
+ "learning_rate": 3.2571006729647693e-06,
+ "loss": 0.4395,
+ "step": 5162
+ },
+ {
+ "epoch": 2.4411347517730495,
+ "grad_norm": 2.699094533920288,
+ "learning_rate": 3.2565061165106523e-06,
+ "loss": 0.4274,
+ "step": 5163
+ },
+ {
+ "epoch": 2.4416075650118203,
+ "grad_norm": 2.574193000793457,
+ "learning_rate": 3.255911512952216e-06,
+ "loss": 0.4187,
+ "step": 5164
+ },
+ {
+ "epoch": 2.442080378250591,
+ "grad_norm": 2.920766592025757,
+ "learning_rate": 3.2553168623264854e-06,
+ "loss": 0.4911,
+ "step": 5165
+ },
+ {
+ "epoch": 2.4425531914893615,
+ "grad_norm": 2.728421926498413,
+ "learning_rate": 3.2547221646704853e-06,
+ "loss": 0.4466,
+ "step": 5166
+ },
+ {
+ "epoch": 2.4430260047281322,
+ "grad_norm": 2.8171417713165283,
+ "learning_rate": 3.254127420021246e-06,
+ "loss": 0.4331,
+ "step": 5167
+ },
+ {
+ "epoch": 2.443498817966903,
+ "grad_norm": 2.4069135189056396,
+ "learning_rate": 3.2535326284157975e-06,
+ "loss": 0.389,
+ "step": 5168
+ },
+ {
+ "epoch": 2.443971631205674,
+ "grad_norm": 2.912405490875244,
+ "learning_rate": 3.2529377898911777e-06,
+ "loss": 0.4681,
+ "step": 5169
+ },
+ {
+ "epoch": 2.4444444444444446,
+ "grad_norm": 2.987558126449585,
+ "learning_rate": 3.2523429044844228e-06,
+ "loss": 0.4715,
+ "step": 5170
+ },
+ {
+ "epoch": 2.444917257683215,
+ "grad_norm": 2.5117199420928955,
+ "learning_rate": 3.251747972232574e-06,
+ "loss": 0.4531,
+ "step": 5171
+ },
+ {
+ "epoch": 2.445390070921986,
+ "grad_norm": 2.5405385494232178,
+ "learning_rate": 3.2511529931726752e-06,
+ "loss": 0.4323,
+ "step": 5172
+ },
+ {
+ "epoch": 2.4458628841607566,
+ "grad_norm": 2.989932060241699,
+ "learning_rate": 3.250557967341773e-06,
+ "loss": 0.4039,
+ "step": 5173
+ },
+ {
+ "epoch": 2.446335697399527,
+ "grad_norm": 2.6331627368927,
+ "learning_rate": 3.2499628947769186e-06,
+ "loss": 0.5147,
+ "step": 5174
+ },
+ {
+ "epoch": 2.4468085106382977,
+ "grad_norm": 2.71699857711792,
+ "learning_rate": 3.249367775515162e-06,
+ "loss": 0.3748,
+ "step": 5175
+ },
+ {
+ "epoch": 2.4472813238770685,
+ "grad_norm": 2.9508471488952637,
+ "learning_rate": 3.2487726095935606e-06,
+ "loss": 0.5145,
+ "step": 5176
+ },
+ {
+ "epoch": 2.4477541371158393,
+ "grad_norm": 2.8276431560516357,
+ "learning_rate": 3.2481773970491713e-06,
+ "loss": 0.4295,
+ "step": 5177
+ },
+ {
+ "epoch": 2.44822695035461,
+ "grad_norm": 2.5500540733337402,
+ "learning_rate": 3.2475821379190565e-06,
+ "loss": 0.4246,
+ "step": 5178
+ },
+ {
+ "epoch": 2.4486997635933805,
+ "grad_norm": 2.845641613006592,
+ "learning_rate": 3.246986832240281e-06,
+ "loss": 0.4211,
+ "step": 5179
+ },
+ {
+ "epoch": 2.4491725768321513,
+ "grad_norm": 3.1215856075286865,
+ "learning_rate": 3.2463914800499097e-06,
+ "loss": 0.4378,
+ "step": 5180
+ },
+ {
+ "epoch": 2.449645390070922,
+ "grad_norm": 2.4685606956481934,
+ "learning_rate": 3.2457960813850137e-06,
+ "loss": 0.4836,
+ "step": 5181
+ },
+ {
+ "epoch": 2.4501182033096924,
+ "grad_norm": 2.508028268814087,
+ "learning_rate": 3.245200636282666e-06,
+ "loss": 0.4377,
+ "step": 5182
+ },
+ {
+ "epoch": 2.4505910165484632,
+ "grad_norm": 2.899949312210083,
+ "learning_rate": 3.244605144779943e-06,
+ "loss": 0.501,
+ "step": 5183
+ },
+ {
+ "epoch": 2.451063829787234,
+ "grad_norm": 2.6494483947753906,
+ "learning_rate": 3.244009606913923e-06,
+ "loss": 0.4255,
+ "step": 5184
+ },
+ {
+ "epoch": 2.451536643026005,
+ "grad_norm": 2.4363760948181152,
+ "learning_rate": 3.243414022721686e-06,
+ "loss": 0.4402,
+ "step": 5185
+ },
+ {
+ "epoch": 2.4520094562647756,
+ "grad_norm": 2.4725022315979004,
+ "learning_rate": 3.242818392240317e-06,
+ "loss": 0.4388,
+ "step": 5186
+ },
+ {
+ "epoch": 2.452482269503546,
+ "grad_norm": 2.7010514736175537,
+ "learning_rate": 3.242222715506905e-06,
+ "loss": 0.4388,
+ "step": 5187
+ },
+ {
+ "epoch": 2.4529550827423168,
+ "grad_norm": 2.811464548110962,
+ "learning_rate": 3.241626992558539e-06,
+ "loss": 0.4634,
+ "step": 5188
+ },
+ {
+ "epoch": 2.4534278959810876,
+ "grad_norm": 2.6473052501678467,
+ "learning_rate": 3.2410312234323123e-06,
+ "loss": 0.4752,
+ "step": 5189
+ },
+ {
+ "epoch": 2.453900709219858,
+ "grad_norm": 2.5587213039398193,
+ "learning_rate": 3.24043540816532e-06,
+ "loss": 0.4458,
+ "step": 5190
+ },
+ {
+ "epoch": 2.4543735224586287,
+ "grad_norm": 2.6306557655334473,
+ "learning_rate": 3.239839546794662e-06,
+ "loss": 0.4081,
+ "step": 5191
+ },
+ {
+ "epoch": 2.4548463356973995,
+ "grad_norm": 2.4613633155822754,
+ "learning_rate": 3.23924363935744e-06,
+ "loss": 0.4165,
+ "step": 5192
+ },
+ {
+ "epoch": 2.4553191489361703,
+ "grad_norm": 2.7189204692840576,
+ "learning_rate": 3.238647685890757e-06,
+ "loss": 0.4822,
+ "step": 5193
+ },
+ {
+ "epoch": 2.455791962174941,
+ "grad_norm": 3.015977382659912,
+ "learning_rate": 3.238051686431722e-06,
+ "loss": 0.4964,
+ "step": 5194
+ },
+ {
+ "epoch": 2.4562647754137115,
+ "grad_norm": 2.8868937492370605,
+ "learning_rate": 3.2374556410174445e-06,
+ "loss": 0.4514,
+ "step": 5195
+ },
+ {
+ "epoch": 2.4567375886524823,
+ "grad_norm": 2.7959537506103516,
+ "learning_rate": 3.2368595496850375e-06,
+ "loss": 0.475,
+ "step": 5196
+ },
+ {
+ "epoch": 2.457210401891253,
+ "grad_norm": 3.0086777210235596,
+ "learning_rate": 3.2362634124716187e-06,
+ "loss": 0.4913,
+ "step": 5197
+ },
+ {
+ "epoch": 2.4576832151300234,
+ "grad_norm": 2.621335506439209,
+ "learning_rate": 3.2356672294143044e-06,
+ "loss": 0.4259,
+ "step": 5198
+ },
+ {
+ "epoch": 2.458156028368794,
+ "grad_norm": 3.1620380878448486,
+ "learning_rate": 3.235071000550218e-06,
+ "loss": 0.451,
+ "step": 5199
+ },
+ {
+ "epoch": 2.458628841607565,
+ "grad_norm": 2.7663278579711914,
+ "learning_rate": 3.234474725916484e-06,
+ "loss": 0.3854,
+ "step": 5200
+ },
+ {
+ "epoch": 2.459101654846336,
+ "grad_norm": 2.5187132358551025,
+ "learning_rate": 3.2338784055502288e-06,
+ "loss": 0.4068,
+ "step": 5201
+ },
+ {
+ "epoch": 2.4595744680851066,
+ "grad_norm": 2.6022701263427734,
+ "learning_rate": 3.233282039488583e-06,
+ "loss": 0.4484,
+ "step": 5202
+ },
+ {
+ "epoch": 2.460047281323877,
+ "grad_norm": 2.874750852584839,
+ "learning_rate": 3.2326856277686807e-06,
+ "loss": 0.45,
+ "step": 5203
+ },
+ {
+ "epoch": 2.4605200945626478,
+ "grad_norm": 2.671008586883545,
+ "learning_rate": 3.232089170427656e-06,
+ "loss": 0.4446,
+ "step": 5204
+ },
+ {
+ "epoch": 2.4609929078014185,
+ "grad_norm": 2.7365503311157227,
+ "learning_rate": 3.2314926675026498e-06,
+ "loss": 0.4402,
+ "step": 5205
+ },
+ {
+ "epoch": 2.461465721040189,
+ "grad_norm": 2.8163657188415527,
+ "learning_rate": 3.230896119030803e-06,
+ "loss": 0.3881,
+ "step": 5206
+ },
+ {
+ "epoch": 2.4619385342789597,
+ "grad_norm": 2.812433958053589,
+ "learning_rate": 3.2302995250492584e-06,
+ "loss": 0.4897,
+ "step": 5207
+ },
+ {
+ "epoch": 2.4624113475177305,
+ "grad_norm": 2.786033868789673,
+ "learning_rate": 3.2297028855951664e-06,
+ "loss": 0.4069,
+ "step": 5208
+ },
+ {
+ "epoch": 2.4628841607565013,
+ "grad_norm": 3.0247974395751953,
+ "learning_rate": 3.229106200705674e-06,
+ "loss": 0.4048,
+ "step": 5209
+ },
+ {
+ "epoch": 2.463356973995272,
+ "grad_norm": 3.3280487060546875,
+ "learning_rate": 3.2285094704179353e-06,
+ "loss": 0.5613,
+ "step": 5210
+ },
+ {
+ "epoch": 2.4638297872340424,
+ "grad_norm": 2.603219985961914,
+ "learning_rate": 3.2279126947691073e-06,
+ "loss": 0.432,
+ "step": 5211
+ },
+ {
+ "epoch": 2.4643026004728132,
+ "grad_norm": 3.1532180309295654,
+ "learning_rate": 3.2273158737963472e-06,
+ "loss": 0.4602,
+ "step": 5212
+ },
+ {
+ "epoch": 2.464775413711584,
+ "grad_norm": 2.7512969970703125,
+ "learning_rate": 3.2267190075368164e-06,
+ "loss": 0.5064,
+ "step": 5213
+ },
+ {
+ "epoch": 2.4652482269503544,
+ "grad_norm": 2.926992177963257,
+ "learning_rate": 3.22612209602768e-06,
+ "loss": 0.4753,
+ "step": 5214
+ },
+ {
+ "epoch": 2.465721040189125,
+ "grad_norm": 4.052840709686279,
+ "learning_rate": 3.2255251393061047e-06,
+ "loss": 0.5235,
+ "step": 5215
+ },
+ {
+ "epoch": 2.466193853427896,
+ "grad_norm": 2.8266959190368652,
+ "learning_rate": 3.2249281374092606e-06,
+ "loss": 0.3931,
+ "step": 5216
+ },
+ {
+ "epoch": 2.466666666666667,
+ "grad_norm": 2.564359426498413,
+ "learning_rate": 3.2243310903743196e-06,
+ "loss": 0.4146,
+ "step": 5217
+ },
+ {
+ "epoch": 2.4671394799054376,
+ "grad_norm": 2.387925148010254,
+ "learning_rate": 3.2237339982384576e-06,
+ "loss": 0.4142,
+ "step": 5218
+ },
+ {
+ "epoch": 2.467612293144208,
+ "grad_norm": 2.7045164108276367,
+ "learning_rate": 3.223136861038853e-06,
+ "loss": 0.4345,
+ "step": 5219
+ },
+ {
+ "epoch": 2.4680851063829787,
+ "grad_norm": 2.6963284015655518,
+ "learning_rate": 3.2225396788126872e-06,
+ "loss": 0.4243,
+ "step": 5220
+ },
+ {
+ "epoch": 2.4685579196217495,
+ "grad_norm": 2.8247268199920654,
+ "learning_rate": 3.221942451597144e-06,
+ "loss": 0.3919,
+ "step": 5221
+ },
+ {
+ "epoch": 2.46903073286052,
+ "grad_norm": 3.843836784362793,
+ "learning_rate": 3.2213451794294093e-06,
+ "loss": 0.4183,
+ "step": 5222
+ },
+ {
+ "epoch": 2.4695035460992907,
+ "grad_norm": 2.8579909801483154,
+ "learning_rate": 3.220747862346674e-06,
+ "loss": 0.4844,
+ "step": 5223
+ },
+ {
+ "epoch": 2.4699763593380615,
+ "grad_norm": 3.744027853012085,
+ "learning_rate": 3.2201505003861294e-06,
+ "loss": 0.4563,
+ "step": 5224
+ },
+ {
+ "epoch": 2.4704491725768323,
+ "grad_norm": 2.835108995437622,
+ "learning_rate": 3.219553093584971e-06,
+ "loss": 0.4394,
+ "step": 5225
+ },
+ {
+ "epoch": 2.470921985815603,
+ "grad_norm": 2.5681865215301514,
+ "learning_rate": 3.218955641980397e-06,
+ "loss": 0.3907,
+ "step": 5226
+ },
+ {
+ "epoch": 2.4713947990543734,
+ "grad_norm": 2.963172674179077,
+ "learning_rate": 3.2183581456096067e-06,
+ "loss": 0.5163,
+ "step": 5227
+ },
+ {
+ "epoch": 2.4718676122931442,
+ "grad_norm": 2.7840685844421387,
+ "learning_rate": 3.2177606045098047e-06,
+ "loss": 0.411,
+ "step": 5228
+ },
+ {
+ "epoch": 2.472340425531915,
+ "grad_norm": 2.7849979400634766,
+ "learning_rate": 3.2171630187181977e-06,
+ "loss": 0.4671,
+ "step": 5229
+ },
+ {
+ "epoch": 2.4728132387706854,
+ "grad_norm": 2.736406087875366,
+ "learning_rate": 3.216565388271994e-06,
+ "loss": 0.5225,
+ "step": 5230
+ },
+ {
+ "epoch": 2.473286052009456,
+ "grad_norm": 2.978271007537842,
+ "learning_rate": 3.215967713208406e-06,
+ "loss": 0.4668,
+ "step": 5231
+ },
+ {
+ "epoch": 2.473758865248227,
+ "grad_norm": 2.687560796737671,
+ "learning_rate": 3.2153699935646475e-06,
+ "loss": 0.4683,
+ "step": 5232
+ },
+ {
+ "epoch": 2.4742316784869978,
+ "grad_norm": 2.7096521854400635,
+ "learning_rate": 3.214772229377936e-06,
+ "loss": 0.4999,
+ "step": 5233
+ },
+ {
+ "epoch": 2.4747044917257686,
+ "grad_norm": 3.1861157417297363,
+ "learning_rate": 3.214174420685493e-06,
+ "loss": 0.4365,
+ "step": 5234
+ },
+ {
+ "epoch": 2.475177304964539,
+ "grad_norm": 2.623061418533325,
+ "learning_rate": 3.2135765675245394e-06,
+ "loss": 0.3717,
+ "step": 5235
+ },
+ {
+ "epoch": 2.4756501182033097,
+ "grad_norm": 2.680921792984009,
+ "learning_rate": 3.2129786699323016e-06,
+ "loss": 0.4688,
+ "step": 5236
+ },
+ {
+ "epoch": 2.4761229314420805,
+ "grad_norm": 2.80426025390625,
+ "learning_rate": 3.2123807279460096e-06,
+ "loss": 0.5043,
+ "step": 5237
+ },
+ {
+ "epoch": 2.476595744680851,
+ "grad_norm": 2.676156997680664,
+ "learning_rate": 3.211782741602893e-06,
+ "loss": 0.4486,
+ "step": 5238
+ },
+ {
+ "epoch": 2.4770685579196217,
+ "grad_norm": 2.700822591781616,
+ "learning_rate": 3.2111847109401855e-06,
+ "loss": 0.4097,
+ "step": 5239
+ },
+ {
+ "epoch": 2.4775413711583925,
+ "grad_norm": 2.735387086868286,
+ "learning_rate": 3.2105866359951254e-06,
+ "loss": 0.4357,
+ "step": 5240
+ },
+ {
+ "epoch": 2.4780141843971633,
+ "grad_norm": 2.961874485015869,
+ "learning_rate": 3.2099885168049507e-06,
+ "loss": 0.4942,
+ "step": 5241
+ },
+ {
+ "epoch": 2.478486997635934,
+ "grad_norm": 2.546588659286499,
+ "learning_rate": 3.209390353406904e-06,
+ "loss": 0.3852,
+ "step": 5242
+ },
+ {
+ "epoch": 2.4789598108747044,
+ "grad_norm": 2.6269772052764893,
+ "learning_rate": 3.208792145838231e-06,
+ "loss": 0.3935,
+ "step": 5243
+ },
+ {
+ "epoch": 2.479432624113475,
+ "grad_norm": 2.9009883403778076,
+ "learning_rate": 3.208193894136179e-06,
+ "loss": 0.4003,
+ "step": 5244
+ },
+ {
+ "epoch": 2.479905437352246,
+ "grad_norm": 2.772834300994873,
+ "learning_rate": 3.2075955983379982e-06,
+ "loss": 0.4742,
+ "step": 5245
+ },
+ {
+ "epoch": 2.4803782505910164,
+ "grad_norm": 2.728703737258911,
+ "learning_rate": 3.2069972584809423e-06,
+ "loss": 0.4405,
+ "step": 5246
+ },
+ {
+ "epoch": 2.480851063829787,
+ "grad_norm": 2.72868275642395,
+ "learning_rate": 3.206398874602268e-06,
+ "loss": 0.4714,
+ "step": 5247
+ },
+ {
+ "epoch": 2.481323877068558,
+ "grad_norm": 2.6804213523864746,
+ "learning_rate": 3.2058004467392323e-06,
+ "loss": 0.4106,
+ "step": 5248
+ },
+ {
+ "epoch": 2.4817966903073287,
+ "grad_norm": 2.6740739345550537,
+ "learning_rate": 3.205201974929098e-06,
+ "loss": 0.3855,
+ "step": 5249
+ },
+ {
+ "epoch": 2.482269503546099,
+ "grad_norm": 2.8131754398345947,
+ "learning_rate": 3.204603459209129e-06,
+ "loss": 0.418,
+ "step": 5250
+ },
+ {
+ "epoch": 2.48274231678487,
+ "grad_norm": 2.5242888927459717,
+ "learning_rate": 3.204004899616592e-06,
+ "loss": 0.4914,
+ "step": 5251
+ },
+ {
+ "epoch": 2.4832151300236407,
+ "grad_norm": 2.969191551208496,
+ "learning_rate": 3.2034062961887567e-06,
+ "loss": 0.4634,
+ "step": 5252
+ },
+ {
+ "epoch": 2.4836879432624115,
+ "grad_norm": 2.967968463897705,
+ "learning_rate": 3.2028076489628963e-06,
+ "loss": 0.456,
+ "step": 5253
+ },
+ {
+ "epoch": 2.484160756501182,
+ "grad_norm": 2.9006540775299072,
+ "learning_rate": 3.2022089579762845e-06,
+ "loss": 0.4203,
+ "step": 5254
+ },
+ {
+ "epoch": 2.4846335697399526,
+ "grad_norm": 2.6377336978912354,
+ "learning_rate": 3.2016102232662003e-06,
+ "loss": 0.4518,
+ "step": 5255
+ },
+ {
+ "epoch": 2.4851063829787234,
+ "grad_norm": 2.757749319076538,
+ "learning_rate": 3.201011444869925e-06,
+ "loss": 0.4314,
+ "step": 5256
+ },
+ {
+ "epoch": 2.4855791962174942,
+ "grad_norm": 2.571560859680176,
+ "learning_rate": 3.20041262282474e-06,
+ "loss": 0.427,
+ "step": 5257
+ },
+ {
+ "epoch": 2.4860520094562646,
+ "grad_norm": 3.1367194652557373,
+ "learning_rate": 3.1998137571679316e-06,
+ "loss": 0.4901,
+ "step": 5258
+ },
+ {
+ "epoch": 2.4865248226950354,
+ "grad_norm": 3.194042205810547,
+ "learning_rate": 3.1992148479367896e-06,
+ "loss": 0.466,
+ "step": 5259
+ },
+ {
+ "epoch": 2.486997635933806,
+ "grad_norm": 2.5546324253082275,
+ "learning_rate": 3.1986158951686052e-06,
+ "loss": 0.4182,
+ "step": 5260
+ },
+ {
+ "epoch": 2.487470449172577,
+ "grad_norm": 2.919783115386963,
+ "learning_rate": 3.198016898900672e-06,
+ "loss": 0.4234,
+ "step": 5261
+ },
+ {
+ "epoch": 2.4879432624113473,
+ "grad_norm": 2.865248918533325,
+ "learning_rate": 3.1974178591702877e-06,
+ "loss": 0.4291,
+ "step": 5262
+ },
+ {
+ "epoch": 2.488416075650118,
+ "grad_norm": 2.685737133026123,
+ "learning_rate": 3.196818776014752e-06,
+ "loss": 0.4548,
+ "step": 5263
+ },
+ {
+ "epoch": 2.488888888888889,
+ "grad_norm": 2.826974630355835,
+ "learning_rate": 3.196219649471365e-06,
+ "loss": 0.4152,
+ "step": 5264
+ },
+ {
+ "epoch": 2.4893617021276597,
+ "grad_norm": 2.764975070953369,
+ "learning_rate": 3.1956204795774336e-06,
+ "loss": 0.5209,
+ "step": 5265
+ },
+ {
+ "epoch": 2.48983451536643,
+ "grad_norm": 2.4184255599975586,
+ "learning_rate": 3.1950212663702662e-06,
+ "loss": 0.3969,
+ "step": 5266
+ },
+ {
+ "epoch": 2.490307328605201,
+ "grad_norm": 2.9361133575439453,
+ "learning_rate": 3.1944220098871713e-06,
+ "loss": 0.4589,
+ "step": 5267
+ },
+ {
+ "epoch": 2.4907801418439717,
+ "grad_norm": 2.377051830291748,
+ "learning_rate": 3.193822710165463e-06,
+ "loss": 0.4328,
+ "step": 5268
+ },
+ {
+ "epoch": 2.4912529550827425,
+ "grad_norm": 3.1302497386932373,
+ "learning_rate": 3.1932233672424563e-06,
+ "loss": 0.3918,
+ "step": 5269
+ },
+ {
+ "epoch": 2.491725768321513,
+ "grad_norm": 2.89577579498291,
+ "learning_rate": 3.192623981155471e-06,
+ "loss": 0.5004,
+ "step": 5270
+ },
+ {
+ "epoch": 2.4921985815602836,
+ "grad_norm": 2.7735235691070557,
+ "learning_rate": 3.1920245519418273e-06,
+ "loss": 0.4206,
+ "step": 5271
+ },
+ {
+ "epoch": 2.4926713947990544,
+ "grad_norm": 2.5424516201019287,
+ "learning_rate": 3.1914250796388493e-06,
+ "loss": 0.4419,
+ "step": 5272
+ },
+ {
+ "epoch": 2.493144208037825,
+ "grad_norm": 3.1216981410980225,
+ "learning_rate": 3.1908255642838628e-06,
+ "loss": 0.4552,
+ "step": 5273
+ },
+ {
+ "epoch": 2.4936170212765956,
+ "grad_norm": 3.044045925140381,
+ "learning_rate": 3.1902260059141978e-06,
+ "loss": 0.4967,
+ "step": 5274
+ },
+ {
+ "epoch": 2.4940898345153664,
+ "grad_norm": 2.5630741119384766,
+ "learning_rate": 3.189626404567186e-06,
+ "loss": 0.3908,
+ "step": 5275
+ },
+ {
+ "epoch": 2.494562647754137,
+ "grad_norm": 2.7177648544311523,
+ "learning_rate": 3.189026760280162e-06,
+ "loss": 0.4915,
+ "step": 5276
+ },
+ {
+ "epoch": 2.495035460992908,
+ "grad_norm": 2.653416395187378,
+ "learning_rate": 3.1884270730904632e-06,
+ "loss": 0.4633,
+ "step": 5277
+ },
+ {
+ "epoch": 2.4955082742316783,
+ "grad_norm": 3.7212321758270264,
+ "learning_rate": 3.1878273430354284e-06,
+ "loss": 0.4549,
+ "step": 5278
+ },
+ {
+ "epoch": 2.495981087470449,
+ "grad_norm": 2.4152729511260986,
+ "learning_rate": 3.187227570152402e-06,
+ "loss": 0.4674,
+ "step": 5279
+ },
+ {
+ "epoch": 2.49645390070922,
+ "grad_norm": 2.5354862213134766,
+ "learning_rate": 3.1866277544787284e-06,
+ "loss": 0.4135,
+ "step": 5280
+ },
+ {
+ "epoch": 2.4969267139479907,
+ "grad_norm": 3.1766583919525146,
+ "learning_rate": 3.186027896051754e-06,
+ "loss": 0.5656,
+ "step": 5281
+ },
+ {
+ "epoch": 2.497399527186761,
+ "grad_norm": 2.5636754035949707,
+ "learning_rate": 3.1854279949088313e-06,
+ "loss": 0.4138,
+ "step": 5282
+ },
+ {
+ "epoch": 2.497872340425532,
+ "grad_norm": 2.7615602016448975,
+ "learning_rate": 3.1848280510873124e-06,
+ "loss": 0.4936,
+ "step": 5283
+ },
+ {
+ "epoch": 2.4983451536643027,
+ "grad_norm": 2.964721918106079,
+ "learning_rate": 3.1842280646245543e-06,
+ "loss": 0.4865,
+ "step": 5284
+ },
+ {
+ "epoch": 2.4988179669030735,
+ "grad_norm": 2.6915178298950195,
+ "learning_rate": 3.1836280355579152e-06,
+ "loss": 0.4179,
+ "step": 5285
+ },
+ {
+ "epoch": 2.499290780141844,
+ "grad_norm": 2.820451259613037,
+ "learning_rate": 3.183027963924755e-06,
+ "loss": 0.4785,
+ "step": 5286
+ },
+ {
+ "epoch": 2.4997635933806146,
+ "grad_norm": 2.841719627380371,
+ "learning_rate": 3.1824278497624393e-06,
+ "loss": 0.4535,
+ "step": 5287
+ },
+ {
+ "epoch": 2.5002364066193854,
+ "grad_norm": 2.459167957305908,
+ "learning_rate": 3.181827693108333e-06,
+ "loss": 0.4353,
+ "step": 5288
+ },
+ {
+ "epoch": 2.500709219858156,
+ "grad_norm": 3.2538363933563232,
+ "learning_rate": 3.1812274939998066e-06,
+ "loss": 0.4037,
+ "step": 5289
+ },
+ {
+ "epoch": 2.5011820330969265,
+ "grad_norm": 2.6980504989624023,
+ "learning_rate": 3.180627252474231e-06,
+ "loss": 0.4181,
+ "step": 5290
+ },
+ {
+ "epoch": 2.5016548463356973,
+ "grad_norm": 2.9400012493133545,
+ "learning_rate": 3.1800269685689804e-06,
+ "loss": 0.4642,
+ "step": 5291
+ },
+ {
+ "epoch": 2.502127659574468,
+ "grad_norm": 2.7832958698272705,
+ "learning_rate": 3.1794266423214328e-06,
+ "loss": 0.3936,
+ "step": 5292
+ },
+ {
+ "epoch": 2.5026004728132385,
+ "grad_norm": 2.4017868041992188,
+ "learning_rate": 3.178826273768967e-06,
+ "loss": 0.3984,
+ "step": 5293
+ },
+ {
+ "epoch": 2.5030732860520093,
+ "grad_norm": 2.398120641708374,
+ "learning_rate": 3.1782258629489665e-06,
+ "loss": 0.4219,
+ "step": 5294
+ },
+ {
+ "epoch": 2.50354609929078,
+ "grad_norm": 2.973947763442993,
+ "learning_rate": 3.177625409898815e-06,
+ "loss": 0.4192,
+ "step": 5295
+ },
+ {
+ "epoch": 2.504018912529551,
+ "grad_norm": 3.1169888973236084,
+ "learning_rate": 3.1770249146559006e-06,
+ "loss": 0.5098,
+ "step": 5296
+ },
+ {
+ "epoch": 2.5044917257683217,
+ "grad_norm": 2.816964864730835,
+ "learning_rate": 3.1764243772576132e-06,
+ "loss": 0.4228,
+ "step": 5297
+ },
+ {
+ "epoch": 2.504964539007092,
+ "grad_norm": 2.5624163150787354,
+ "learning_rate": 3.1758237977413452e-06,
+ "loss": 0.4389,
+ "step": 5298
+ },
+ {
+ "epoch": 2.505437352245863,
+ "grad_norm": 2.7477777004241943,
+ "learning_rate": 3.175223176144494e-06,
+ "loss": 0.4564,
+ "step": 5299
+ },
+ {
+ "epoch": 2.5059101654846336,
+ "grad_norm": 3.1478309631347656,
+ "learning_rate": 3.174622512504456e-06,
+ "loss": 0.4859,
+ "step": 5300
+ },
+ {
+ "epoch": 2.506382978723404,
+ "grad_norm": 2.8400418758392334,
+ "learning_rate": 3.1740218068586315e-06,
+ "loss": 0.4476,
+ "step": 5301
+ },
+ {
+ "epoch": 2.506855791962175,
+ "grad_norm": 2.7097036838531494,
+ "learning_rate": 3.173421059244426e-06,
+ "loss": 0.4559,
+ "step": 5302
+ },
+ {
+ "epoch": 2.5073286052009456,
+ "grad_norm": 2.864760637283325,
+ "learning_rate": 3.172820269699243e-06,
+ "loss": 0.5124,
+ "step": 5303
+ },
+ {
+ "epoch": 2.5078014184397164,
+ "grad_norm": 2.877110004425049,
+ "learning_rate": 3.1722194382604926e-06,
+ "loss": 0.5083,
+ "step": 5304
+ },
+ {
+ "epoch": 2.508274231678487,
+ "grad_norm": 3.2369656562805176,
+ "learning_rate": 3.1716185649655844e-06,
+ "loss": 0.4894,
+ "step": 5305
+ },
+ {
+ "epoch": 2.5087470449172575,
+ "grad_norm": 2.7377753257751465,
+ "learning_rate": 3.171017649851934e-06,
+ "loss": 0.4324,
+ "step": 5306
+ },
+ {
+ "epoch": 2.5092198581560283,
+ "grad_norm": 2.883364200592041,
+ "learning_rate": 3.1704166929569564e-06,
+ "loss": 0.3731,
+ "step": 5307
+ },
+ {
+ "epoch": 2.509692671394799,
+ "grad_norm": 2.5724737644195557,
+ "learning_rate": 3.1698156943180716e-06,
+ "loss": 0.4768,
+ "step": 5308
+ },
+ {
+ "epoch": 2.5101654846335695,
+ "grad_norm": 2.7532460689544678,
+ "learning_rate": 3.1692146539727e-06,
+ "loss": 0.4385,
+ "step": 5309
+ },
+ {
+ "epoch": 2.5106382978723403,
+ "grad_norm": 2.786505699157715,
+ "learning_rate": 3.168613571958267e-06,
+ "loss": 0.4241,
+ "step": 5310
+ },
+ {
+ "epoch": 2.511111111111111,
+ "grad_norm": 3.1674118041992188,
+ "learning_rate": 3.1680124483121975e-06,
+ "loss": 0.4445,
+ "step": 5311
+ },
+ {
+ "epoch": 2.511583924349882,
+ "grad_norm": 2.7861545085906982,
+ "learning_rate": 3.167411283071923e-06,
+ "loss": 0.4264,
+ "step": 5312
+ },
+ {
+ "epoch": 2.5120567375886527,
+ "grad_norm": 2.7412493228912354,
+ "learning_rate": 3.1668100762748745e-06,
+ "loss": 0.4725,
+ "step": 5313
+ },
+ {
+ "epoch": 2.512529550827423,
+ "grad_norm": 2.710019588470459,
+ "learning_rate": 3.1662088279584858e-06,
+ "loss": 0.5207,
+ "step": 5314
+ },
+ {
+ "epoch": 2.513002364066194,
+ "grad_norm": 2.694812297821045,
+ "learning_rate": 3.165607538160194e-06,
+ "loss": 0.3666,
+ "step": 5315
+ },
+ {
+ "epoch": 2.5134751773049646,
+ "grad_norm": 2.4390623569488525,
+ "learning_rate": 3.1650062069174405e-06,
+ "loss": 0.4025,
+ "step": 5316
+ },
+ {
+ "epoch": 2.513947990543735,
+ "grad_norm": 3.055738925933838,
+ "learning_rate": 3.1644048342676663e-06,
+ "loss": 0.4288,
+ "step": 5317
+ },
+ {
+ "epoch": 2.5144208037825058,
+ "grad_norm": 3.065824508666992,
+ "learning_rate": 3.163803420248316e-06,
+ "loss": 0.4592,
+ "step": 5318
+ },
+ {
+ "epoch": 2.5148936170212766,
+ "grad_norm": 2.6011085510253906,
+ "learning_rate": 3.163201964896838e-06,
+ "loss": 0.4081,
+ "step": 5319
+ },
+ {
+ "epoch": 2.5153664302600474,
+ "grad_norm": 2.4833033084869385,
+ "learning_rate": 3.162600468250681e-06,
+ "loss": 0.4343,
+ "step": 5320
+ },
+ {
+ "epoch": 2.515839243498818,
+ "grad_norm": 2.9035534858703613,
+ "learning_rate": 3.161998930347299e-06,
+ "loss": 0.4972,
+ "step": 5321
+ },
+ {
+ "epoch": 2.5163120567375885,
+ "grad_norm": 2.788752317428589,
+ "learning_rate": 3.161397351224146e-06,
+ "loss": 0.4597,
+ "step": 5322
+ },
+ {
+ "epoch": 2.5167848699763593,
+ "grad_norm": 2.4344491958618164,
+ "learning_rate": 3.16079573091868e-06,
+ "loss": 0.359,
+ "step": 5323
+ },
+ {
+ "epoch": 2.51725768321513,
+ "grad_norm": 2.750150680541992,
+ "learning_rate": 3.160194069468361e-06,
+ "loss": 0.4596,
+ "step": 5324
+ },
+ {
+ "epoch": 2.5177304964539005,
+ "grad_norm": 2.826902389526367,
+ "learning_rate": 3.1595923669106526e-06,
+ "loss": 0.4377,
+ "step": 5325
+ },
+ {
+ "epoch": 2.5182033096926713,
+ "grad_norm": 2.554439067840576,
+ "learning_rate": 3.15899062328302e-06,
+ "loss": 0.4517,
+ "step": 5326
+ },
+ {
+ "epoch": 2.518676122931442,
+ "grad_norm": 3.0882742404937744,
+ "learning_rate": 3.158388838622931e-06,
+ "loss": 0.47,
+ "step": 5327
+ },
+ {
+ "epoch": 2.519148936170213,
+ "grad_norm": 2.918947696685791,
+ "learning_rate": 3.157787012967856e-06,
+ "loss": 0.522,
+ "step": 5328
+ },
+ {
+ "epoch": 2.5196217494089836,
+ "grad_norm": 2.8057637214660645,
+ "learning_rate": 3.1571851463552674e-06,
+ "loss": 0.4837,
+ "step": 5329
+ },
+ {
+ "epoch": 2.520094562647754,
+ "grad_norm": 2.66241455078125,
+ "learning_rate": 3.156583238822641e-06,
+ "loss": 0.3988,
+ "step": 5330
+ },
+ {
+ "epoch": 2.520567375886525,
+ "grad_norm": 2.9793803691864014,
+ "learning_rate": 3.155981290407456e-06,
+ "loss": 0.4737,
+ "step": 5331
+ },
+ {
+ "epoch": 2.5210401891252956,
+ "grad_norm": 2.847522258758545,
+ "learning_rate": 3.1553793011471924e-06,
+ "loss": 0.4394,
+ "step": 5332
+ },
+ {
+ "epoch": 2.521513002364066,
+ "grad_norm": 2.9561474323272705,
+ "learning_rate": 3.154777271079333e-06,
+ "loss": 0.47,
+ "step": 5333
+ },
+ {
+ "epoch": 2.5219858156028367,
+ "grad_norm": 2.8353018760681152,
+ "learning_rate": 3.154175200241365e-06,
+ "loss": 0.4015,
+ "step": 5334
+ },
+ {
+ "epoch": 2.5224586288416075,
+ "grad_norm": 2.609049081802368,
+ "learning_rate": 3.153573088670775e-06,
+ "loss": 0.4723,
+ "step": 5335
+ },
+ {
+ "epoch": 2.5229314420803783,
+ "grad_norm": 2.8538455963134766,
+ "learning_rate": 3.1529709364050556e-06,
+ "loss": 0.4665,
+ "step": 5336
+ },
+ {
+ "epoch": 2.523404255319149,
+ "grad_norm": 2.768310785293579,
+ "learning_rate": 3.1523687434816978e-06,
+ "loss": 0.4933,
+ "step": 5337
+ },
+ {
+ "epoch": 2.5238770685579195,
+ "grad_norm": 2.9300906658172607,
+ "learning_rate": 3.1517665099382e-06,
+ "loss": 0.4651,
+ "step": 5338
+ },
+ {
+ "epoch": 2.5243498817966903,
+ "grad_norm": 2.6984703540802,
+ "learning_rate": 3.1511642358120585e-06,
+ "loss": 0.4442,
+ "step": 5339
+ },
+ {
+ "epoch": 2.524822695035461,
+ "grad_norm": 2.8148467540740967,
+ "learning_rate": 3.1505619211407762e-06,
+ "loss": 0.4611,
+ "step": 5340
+ },
+ {
+ "epoch": 2.5252955082742314,
+ "grad_norm": 2.816436290740967,
+ "learning_rate": 3.1499595659618556e-06,
+ "loss": 0.5291,
+ "step": 5341
+ },
+ {
+ "epoch": 2.5257683215130022,
+ "grad_norm": 2.902805805206299,
+ "learning_rate": 3.149357170312802e-06,
+ "loss": 0.4394,
+ "step": 5342
+ },
+ {
+ "epoch": 2.526241134751773,
+ "grad_norm": 2.6443474292755127,
+ "learning_rate": 3.148754734231126e-06,
+ "loss": 0.4444,
+ "step": 5343
+ },
+ {
+ "epoch": 2.526713947990544,
+ "grad_norm": 2.6818583011627197,
+ "learning_rate": 3.148152257754336e-06,
+ "loss": 0.4256,
+ "step": 5344
+ },
+ {
+ "epoch": 2.5271867612293146,
+ "grad_norm": 2.5266945362091064,
+ "learning_rate": 3.1475497409199485e-06,
+ "loss": 0.4087,
+ "step": 5345
+ },
+ {
+ "epoch": 2.527659574468085,
+ "grad_norm": 2.6326711177825928,
+ "learning_rate": 3.146947183765477e-06,
+ "loss": 0.3842,
+ "step": 5346
+ },
+ {
+ "epoch": 2.5281323877068558,
+ "grad_norm": 3.122880697250366,
+ "learning_rate": 3.1463445863284413e-06,
+ "loss": 0.482,
+ "step": 5347
+ },
+ {
+ "epoch": 2.5286052009456266,
+ "grad_norm": 2.819258213043213,
+ "learning_rate": 3.145741948646362e-06,
+ "loss": 0.4628,
+ "step": 5348
+ },
+ {
+ "epoch": 2.529078014184397,
+ "grad_norm": 2.5842230319976807,
+ "learning_rate": 3.145139270756764e-06,
+ "loss": 0.4479,
+ "step": 5349
+ },
+ {
+ "epoch": 2.5295508274231677,
+ "grad_norm": 2.7257237434387207,
+ "learning_rate": 3.144536552697172e-06,
+ "loss": 0.473,
+ "step": 5350
+ },
+ {
+ "epoch": 2.5300236406619385,
+ "grad_norm": 2.6876981258392334,
+ "learning_rate": 3.143933794505115e-06,
+ "loss": 0.4615,
+ "step": 5351
+ },
+ {
+ "epoch": 2.5304964539007093,
+ "grad_norm": 2.7942895889282227,
+ "learning_rate": 3.143330996218124e-06,
+ "loss": 0.4982,
+ "step": 5352
+ },
+ {
+ "epoch": 2.53096926713948,
+ "grad_norm": 2.3150579929351807,
+ "learning_rate": 3.1427281578737327e-06,
+ "loss": 0.3905,
+ "step": 5353
+ },
+ {
+ "epoch": 2.5314420803782505,
+ "grad_norm": 2.7326138019561768,
+ "learning_rate": 3.142125279509478e-06,
+ "loss": 0.4076,
+ "step": 5354
+ },
+ {
+ "epoch": 2.5319148936170213,
+ "grad_norm": 2.46362566947937,
+ "learning_rate": 3.1415223611628976e-06,
+ "loss": 0.4043,
+ "step": 5355
+ },
+ {
+ "epoch": 2.532387706855792,
+ "grad_norm": 2.6670427322387695,
+ "learning_rate": 3.1409194028715323e-06,
+ "loss": 0.484,
+ "step": 5356
+ },
+ {
+ "epoch": 2.5328605200945624,
+ "grad_norm": 2.917771100997925,
+ "learning_rate": 3.140316404672926e-06,
+ "loss": 0.4539,
+ "step": 5357
+ },
+ {
+ "epoch": 2.533333333333333,
+ "grad_norm": 2.7964110374450684,
+ "learning_rate": 3.1397133666046254e-06,
+ "loss": 0.4706,
+ "step": 5358
+ },
+ {
+ "epoch": 2.533806146572104,
+ "grad_norm": 2.6481330394744873,
+ "learning_rate": 3.139110288704179e-06,
+ "loss": 0.4101,
+ "step": 5359
+ },
+ {
+ "epoch": 2.534278959810875,
+ "grad_norm": 2.859452962875366,
+ "learning_rate": 3.1385071710091365e-06,
+ "loss": 0.4842,
+ "step": 5360
+ },
+ {
+ "epoch": 2.5347517730496456,
+ "grad_norm": 2.686077356338501,
+ "learning_rate": 3.137904013557052e-06,
+ "loss": 0.4073,
+ "step": 5361
+ },
+ {
+ "epoch": 2.535224586288416,
+ "grad_norm": 3.7147045135498047,
+ "learning_rate": 3.137300816385482e-06,
+ "loss": 0.4536,
+ "step": 5362
+ },
+ {
+ "epoch": 2.5356973995271868,
+ "grad_norm": 2.51054048538208,
+ "learning_rate": 3.1366975795319856e-06,
+ "loss": 0.4171,
+ "step": 5363
+ },
+ {
+ "epoch": 2.5361702127659576,
+ "grad_norm": 3.043149471282959,
+ "learning_rate": 3.136094303034121e-06,
+ "loss": 0.5179,
+ "step": 5364
+ },
+ {
+ "epoch": 2.536643026004728,
+ "grad_norm": 2.398878812789917,
+ "learning_rate": 3.1354909869294548e-06,
+ "loss": 0.4144,
+ "step": 5365
+ },
+ {
+ "epoch": 2.5371158392434987,
+ "grad_norm": 2.969712257385254,
+ "learning_rate": 3.134887631255551e-06,
+ "loss": 0.3983,
+ "step": 5366
+ },
+ {
+ "epoch": 2.5375886524822695,
+ "grad_norm": 2.7707982063293457,
+ "learning_rate": 3.134284236049978e-06,
+ "loss": 0.4405,
+ "step": 5367
+ },
+ {
+ "epoch": 2.5380614657210403,
+ "grad_norm": 2.579742193222046,
+ "learning_rate": 3.1336808013503073e-06,
+ "loss": 0.4402,
+ "step": 5368
+ },
+ {
+ "epoch": 2.538534278959811,
+ "grad_norm": 2.6041927337646484,
+ "learning_rate": 3.1330773271941113e-06,
+ "loss": 0.396,
+ "step": 5369
+ },
+ {
+ "epoch": 2.5390070921985815,
+ "grad_norm": 2.7383856773376465,
+ "learning_rate": 3.1324738136189658e-06,
+ "loss": 0.4424,
+ "step": 5370
+ },
+ {
+ "epoch": 2.5394799054373522,
+ "grad_norm": 3.053644895553589,
+ "learning_rate": 3.13187026066245e-06,
+ "loss": 0.473,
+ "step": 5371
+ },
+ {
+ "epoch": 2.539952718676123,
+ "grad_norm": 2.684244155883789,
+ "learning_rate": 3.1312666683621428e-06,
+ "loss": 0.3963,
+ "step": 5372
+ },
+ {
+ "epoch": 2.5404255319148934,
+ "grad_norm": 2.6505017280578613,
+ "learning_rate": 3.130663036755629e-06,
+ "loss": 0.4292,
+ "step": 5373
+ },
+ {
+ "epoch": 2.540898345153664,
+ "grad_norm": 3.025965929031372,
+ "learning_rate": 3.1300593658804935e-06,
+ "loss": 0.4539,
+ "step": 5374
+ },
+ {
+ "epoch": 2.541371158392435,
+ "grad_norm": 2.72106671333313,
+ "learning_rate": 3.1294556557743237e-06,
+ "loss": 0.4519,
+ "step": 5375
+ },
+ {
+ "epoch": 2.541843971631206,
+ "grad_norm": 2.759995222091675,
+ "learning_rate": 3.12885190647471e-06,
+ "loss": 0.451,
+ "step": 5376
+ },
+ {
+ "epoch": 2.5423167848699766,
+ "grad_norm": 2.697950601577759,
+ "learning_rate": 3.1282481180192457e-06,
+ "loss": 0.4328,
+ "step": 5377
+ },
+ {
+ "epoch": 2.542789598108747,
+ "grad_norm": 2.6970415115356445,
+ "learning_rate": 3.127644290445526e-06,
+ "loss": 0.4489,
+ "step": 5378
+ },
+ {
+ "epoch": 2.5432624113475177,
+ "grad_norm": 2.5856997966766357,
+ "learning_rate": 3.127040423791148e-06,
+ "loss": 0.3848,
+ "step": 5379
+ },
+ {
+ "epoch": 2.5437352245862885,
+ "grad_norm": 2.9798166751861572,
+ "learning_rate": 3.1264365180937127e-06,
+ "loss": 0.5038,
+ "step": 5380
+ },
+ {
+ "epoch": 2.544208037825059,
+ "grad_norm": 3.413175106048584,
+ "learning_rate": 3.1258325733908224e-06,
+ "loss": 0.5247,
+ "step": 5381
+ },
+ {
+ "epoch": 2.5446808510638297,
+ "grad_norm": 2.838517904281616,
+ "learning_rate": 3.1252285897200818e-06,
+ "loss": 0.4652,
+ "step": 5382
+ },
+ {
+ "epoch": 2.5451536643026005,
+ "grad_norm": 2.8342528343200684,
+ "learning_rate": 3.1246245671190983e-06,
+ "loss": 0.4245,
+ "step": 5383
+ },
+ {
+ "epoch": 2.5456264775413713,
+ "grad_norm": 3.06026029586792,
+ "learning_rate": 3.124020505625482e-06,
+ "loss": 0.469,
+ "step": 5384
+ },
+ {
+ "epoch": 2.546099290780142,
+ "grad_norm": 2.633894681930542,
+ "learning_rate": 3.1234164052768452e-06,
+ "loss": 0.4509,
+ "step": 5385
+ },
+ {
+ "epoch": 2.5465721040189124,
+ "grad_norm": 2.634819984436035,
+ "learning_rate": 3.1228122661108023e-06,
+ "loss": 0.4879,
+ "step": 5386
+ },
+ {
+ "epoch": 2.5470449172576832,
+ "grad_norm": 3.9843504428863525,
+ "learning_rate": 3.1222080881649707e-06,
+ "loss": 0.4472,
+ "step": 5387
+ },
+ {
+ "epoch": 2.547517730496454,
+ "grad_norm": 2.5480258464813232,
+ "learning_rate": 3.1216038714769694e-06,
+ "loss": 0.4396,
+ "step": 5388
+ },
+ {
+ "epoch": 2.5479905437352244,
+ "grad_norm": 2.7461917400360107,
+ "learning_rate": 3.12099961608442e-06,
+ "loss": 0.4735,
+ "step": 5389
+ },
+ {
+ "epoch": 2.548463356973995,
+ "grad_norm": 3.167769193649292,
+ "learning_rate": 3.1203953220249493e-06,
+ "loss": 0.4196,
+ "step": 5390
+ },
+ {
+ "epoch": 2.548936170212766,
+ "grad_norm": 2.721696615219116,
+ "learning_rate": 3.1197909893361814e-06,
+ "loss": 0.4571,
+ "step": 5391
+ },
+ {
+ "epoch": 2.5494089834515368,
+ "grad_norm": 2.726668119430542,
+ "learning_rate": 3.1191866180557463e-06,
+ "loss": 0.4856,
+ "step": 5392
+ },
+ {
+ "epoch": 2.5498817966903076,
+ "grad_norm": 2.602205276489258,
+ "learning_rate": 3.1185822082212754e-06,
+ "loss": 0.4631,
+ "step": 5393
+ },
+ {
+ "epoch": 2.550354609929078,
+ "grad_norm": 2.7715859413146973,
+ "learning_rate": 3.1179777598704025e-06,
+ "loss": 0.4136,
+ "step": 5394
+ },
+ {
+ "epoch": 2.5508274231678487,
+ "grad_norm": 2.8081955909729004,
+ "learning_rate": 3.1173732730407647e-06,
+ "loss": 0.4963,
+ "step": 5395
+ },
+ {
+ "epoch": 2.5513002364066195,
+ "grad_norm": 2.946772336959839,
+ "learning_rate": 3.1167687477700006e-06,
+ "loss": 0.4443,
+ "step": 5396
+ },
+ {
+ "epoch": 2.55177304964539,
+ "grad_norm": 2.89345383644104,
+ "learning_rate": 3.1161641840957503e-06,
+ "loss": 0.4377,
+ "step": 5397
+ },
+ {
+ "epoch": 2.5522458628841607,
+ "grad_norm": 2.908317804336548,
+ "learning_rate": 3.115559582055659e-06,
+ "loss": 0.4702,
+ "step": 5398
+ },
+ {
+ "epoch": 2.5527186761229315,
+ "grad_norm": 2.554417848587036,
+ "learning_rate": 3.1149549416873704e-06,
+ "loss": 0.3738,
+ "step": 5399
+ },
+ {
+ "epoch": 2.5531914893617023,
+ "grad_norm": 2.3132457733154297,
+ "learning_rate": 3.1143502630285356e-06,
+ "loss": 0.4074,
+ "step": 5400
+ },
+ {
+ "epoch": 2.553664302600473,
+ "grad_norm": 2.751666784286499,
+ "learning_rate": 3.1137455461168026e-06,
+ "loss": 0.4697,
+ "step": 5401
+ },
+ {
+ "epoch": 2.5541371158392434,
+ "grad_norm": 2.7088871002197266,
+ "learning_rate": 3.113140790989826e-06,
+ "loss": 0.4754,
+ "step": 5402
+ },
+ {
+ "epoch": 2.554609929078014,
+ "grad_norm": 3.0633046627044678,
+ "learning_rate": 3.1125359976852605e-06,
+ "loss": 0.4874,
+ "step": 5403
+ },
+ {
+ "epoch": 2.555082742316785,
+ "grad_norm": 3.399456024169922,
+ "learning_rate": 3.111931166240764e-06,
+ "loss": 0.5529,
+ "step": 5404
+ },
+ {
+ "epoch": 2.5555555555555554,
+ "grad_norm": 2.7729690074920654,
+ "learning_rate": 3.1113262966939985e-06,
+ "loss": 0.4677,
+ "step": 5405
+ },
+ {
+ "epoch": 2.556028368794326,
+ "grad_norm": 2.81025767326355,
+ "learning_rate": 3.1107213890826244e-06,
+ "loss": 0.4954,
+ "step": 5406
+ },
+ {
+ "epoch": 2.556501182033097,
+ "grad_norm": 2.4837241172790527,
+ "learning_rate": 3.110116443444307e-06,
+ "loss": 0.3681,
+ "step": 5407
+ },
+ {
+ "epoch": 2.5569739952718678,
+ "grad_norm": 2.6406874656677246,
+ "learning_rate": 3.109511459816714e-06,
+ "loss": 0.4569,
+ "step": 5408
+ },
+ {
+ "epoch": 2.5574468085106385,
+ "grad_norm": 2.6093738079071045,
+ "learning_rate": 3.1089064382375155e-06,
+ "loss": 0.413,
+ "step": 5409
+ },
+ {
+ "epoch": 2.557919621749409,
+ "grad_norm": 2.6629011631011963,
+ "learning_rate": 3.108301378744383e-06,
+ "loss": 0.4286,
+ "step": 5410
+ },
+ {
+ "epoch": 2.5583924349881797,
+ "grad_norm": 2.694796323776245,
+ "learning_rate": 3.10769628137499e-06,
+ "loss": 0.4316,
+ "step": 5411
+ },
+ {
+ "epoch": 2.5588652482269505,
+ "grad_norm": 2.88023042678833,
+ "learning_rate": 3.107091146167015e-06,
+ "loss": 0.4378,
+ "step": 5412
+ },
+ {
+ "epoch": 2.559338061465721,
+ "grad_norm": 2.8804919719696045,
+ "learning_rate": 3.1064859731581365e-06,
+ "loss": 0.4971,
+ "step": 5413
+ },
+ {
+ "epoch": 2.5598108747044916,
+ "grad_norm": 2.850468397140503,
+ "learning_rate": 3.1058807623860353e-06,
+ "loss": 0.4686,
+ "step": 5414
+ },
+ {
+ "epoch": 2.5602836879432624,
+ "grad_norm": 3.0548019409179688,
+ "learning_rate": 3.1052755138883963e-06,
+ "loss": 0.4497,
+ "step": 5415
+ },
+ {
+ "epoch": 2.5607565011820332,
+ "grad_norm": 3.10168719291687,
+ "learning_rate": 3.1046702277029046e-06,
+ "loss": 0.569,
+ "step": 5416
+ },
+ {
+ "epoch": 2.561229314420804,
+ "grad_norm": 2.5887374877929688,
+ "learning_rate": 3.1040649038672494e-06,
+ "loss": 0.3812,
+ "step": 5417
+ },
+ {
+ "epoch": 2.5617021276595744,
+ "grad_norm": 2.9928438663482666,
+ "learning_rate": 3.1034595424191212e-06,
+ "loss": 0.4308,
+ "step": 5418
+ },
+ {
+ "epoch": 2.562174940898345,
+ "grad_norm": 2.7003073692321777,
+ "learning_rate": 3.102854143396214e-06,
+ "loss": 0.4967,
+ "step": 5419
+ },
+ {
+ "epoch": 2.562647754137116,
+ "grad_norm": 3.172868490219116,
+ "learning_rate": 3.102248706836222e-06,
+ "loss": 0.5311,
+ "step": 5420
+ },
+ {
+ "epoch": 2.5631205673758863,
+ "grad_norm": 3.0146191120147705,
+ "learning_rate": 3.101643232776844e-06,
+ "loss": 0.4714,
+ "step": 5421
+ },
+ {
+ "epoch": 2.563593380614657,
+ "grad_norm": 3.0683791637420654,
+ "learning_rate": 3.1010377212557806e-06,
+ "loss": 0.4047,
+ "step": 5422
+ },
+ {
+ "epoch": 2.564066193853428,
+ "grad_norm": 2.8260676860809326,
+ "learning_rate": 3.1004321723107334e-06,
+ "loss": 0.5282,
+ "step": 5423
+ },
+ {
+ "epoch": 2.5645390070921987,
+ "grad_norm": 3.0792388916015625,
+ "learning_rate": 3.0998265859794074e-06,
+ "loss": 0.5323,
+ "step": 5424
+ },
+ {
+ "epoch": 2.5650118203309695,
+ "grad_norm": 2.7332866191864014,
+ "learning_rate": 3.09922096229951e-06,
+ "loss": 0.4401,
+ "step": 5425
+ },
+ {
+ "epoch": 2.56548463356974,
+ "grad_norm": 2.9366047382354736,
+ "learning_rate": 3.098615301308751e-06,
+ "loss": 0.4495,
+ "step": 5426
+ },
+ {
+ "epoch": 2.5659574468085107,
+ "grad_norm": 2.982088565826416,
+ "learning_rate": 3.098009603044842e-06,
+ "loss": 0.495,
+ "step": 5427
+ },
+ {
+ "epoch": 2.5664302600472815,
+ "grad_norm": 3.1204755306243896,
+ "learning_rate": 3.0974038675454976e-06,
+ "loss": 0.4354,
+ "step": 5428
+ },
+ {
+ "epoch": 2.566903073286052,
+ "grad_norm": 2.835238218307495,
+ "learning_rate": 3.0967980948484333e-06,
+ "loss": 0.4161,
+ "step": 5429
+ },
+ {
+ "epoch": 2.5673758865248226,
+ "grad_norm": 2.8104958534240723,
+ "learning_rate": 3.096192284991369e-06,
+ "loss": 0.5045,
+ "step": 5430
+ },
+ {
+ "epoch": 2.5678486997635934,
+ "grad_norm": 3.1636080741882324,
+ "learning_rate": 3.0955864380120247e-06,
+ "loss": 0.4533,
+ "step": 5431
+ },
+ {
+ "epoch": 2.568321513002364,
+ "grad_norm": 2.980112314224243,
+ "learning_rate": 3.0949805539481247e-06,
+ "loss": 0.3998,
+ "step": 5432
+ },
+ {
+ "epoch": 2.568794326241135,
+ "grad_norm": 2.6379945278167725,
+ "learning_rate": 3.0943746328373953e-06,
+ "loss": 0.3785,
+ "step": 5433
+ },
+ {
+ "epoch": 2.5692671394799054,
+ "grad_norm": 2.780930757522583,
+ "learning_rate": 3.0937686747175627e-06,
+ "loss": 0.4801,
+ "step": 5434
+ },
+ {
+ "epoch": 2.569739952718676,
+ "grad_norm": 2.6608550548553467,
+ "learning_rate": 3.0931626796263585e-06,
+ "loss": 0.4047,
+ "step": 5435
+ },
+ {
+ "epoch": 2.570212765957447,
+ "grad_norm": 3.130584716796875,
+ "learning_rate": 3.0925566476015156e-06,
+ "loss": 0.5049,
+ "step": 5436
+ },
+ {
+ "epoch": 2.5706855791962173,
+ "grad_norm": 2.9699313640594482,
+ "learning_rate": 3.0919505786807687e-06,
+ "loss": 0.3847,
+ "step": 5437
+ },
+ {
+ "epoch": 2.571158392434988,
+ "grad_norm": 2.919260025024414,
+ "learning_rate": 3.091344472901855e-06,
+ "loss": 0.4631,
+ "step": 5438
+ },
+ {
+ "epoch": 2.571631205673759,
+ "grad_norm": 2.956587553024292,
+ "learning_rate": 3.0907383303025134e-06,
+ "loss": 0.4974,
+ "step": 5439
+ },
+ {
+ "epoch": 2.5721040189125297,
+ "grad_norm": 2.758542776107788,
+ "learning_rate": 3.090132150920486e-06,
+ "loss": 0.4785,
+ "step": 5440
+ },
+ {
+ "epoch": 2.5725768321513005,
+ "grad_norm": 2.678469657897949,
+ "learning_rate": 3.0895259347935175e-06,
+ "loss": 0.4453,
+ "step": 5441
+ },
+ {
+ "epoch": 2.573049645390071,
+ "grad_norm": 2.6508545875549316,
+ "learning_rate": 3.088919681959355e-06,
+ "loss": 0.4426,
+ "step": 5442
+ },
+ {
+ "epoch": 2.5735224586288417,
+ "grad_norm": 2.6156187057495117,
+ "learning_rate": 3.0883133924557453e-06,
+ "loss": 0.4445,
+ "step": 5443
+ },
+ {
+ "epoch": 2.5739952718676125,
+ "grad_norm": 2.484374761581421,
+ "learning_rate": 3.08770706632044e-06,
+ "loss": 0.4155,
+ "step": 5444
+ },
+ {
+ "epoch": 2.574468085106383,
+ "grad_norm": 2.7465295791625977,
+ "learning_rate": 3.087100703591193e-06,
+ "loss": 0.4085,
+ "step": 5445
+ },
+ {
+ "epoch": 2.5749408983451536,
+ "grad_norm": 2.771740198135376,
+ "learning_rate": 3.08649430430576e-06,
+ "loss": 0.4313,
+ "step": 5446
+ },
+ {
+ "epoch": 2.5754137115839244,
+ "grad_norm": 2.7480874061584473,
+ "learning_rate": 3.0858878685018984e-06,
+ "loss": 0.3471,
+ "step": 5447
+ },
+ {
+ "epoch": 2.575886524822695,
+ "grad_norm": 2.894913673400879,
+ "learning_rate": 3.085281396217368e-06,
+ "loss": 0.4888,
+ "step": 5448
+ },
+ {
+ "epoch": 2.576359338061466,
+ "grad_norm": 3.037628173828125,
+ "learning_rate": 3.0846748874899306e-06,
+ "loss": 0.3976,
+ "step": 5449
+ },
+ {
+ "epoch": 2.5768321513002364,
+ "grad_norm": 2.4811434745788574,
+ "learning_rate": 3.0840683423573526e-06,
+ "loss": 0.4822,
+ "step": 5450
+ },
+ {
+ "epoch": 2.577304964539007,
+ "grad_norm": 3.0078725814819336,
+ "learning_rate": 3.0834617608573998e-06,
+ "loss": 0.4999,
+ "step": 5451
+ },
+ {
+ "epoch": 2.5777777777777775,
+ "grad_norm": 3.174154043197632,
+ "learning_rate": 3.0828551430278413e-06,
+ "loss": 0.4626,
+ "step": 5452
+ },
+ {
+ "epoch": 2.5782505910165483,
+ "grad_norm": 2.8277535438537598,
+ "learning_rate": 3.082248488906449e-06,
+ "loss": 0.4633,
+ "step": 5453
+ },
+ {
+ "epoch": 2.578723404255319,
+ "grad_norm": 2.731767416000366,
+ "learning_rate": 3.0816417985309966e-06,
+ "loss": 0.4148,
+ "step": 5454
+ },
+ {
+ "epoch": 2.57919621749409,
+ "grad_norm": 2.5480549335479736,
+ "learning_rate": 3.0810350719392597e-06,
+ "loss": 0.4773,
+ "step": 5455
+ },
+ {
+ "epoch": 2.5796690307328607,
+ "grad_norm": 2.9755172729492188,
+ "learning_rate": 3.080428309169017e-06,
+ "loss": 0.5107,
+ "step": 5456
+ },
+ {
+ "epoch": 2.580141843971631,
+ "grad_norm": 2.6499290466308594,
+ "learning_rate": 3.079821510258048e-06,
+ "loss": 0.3982,
+ "step": 5457
+ },
+ {
+ "epoch": 2.580614657210402,
+ "grad_norm": 2.663214921951294,
+ "learning_rate": 3.079214675244136e-06,
+ "loss": 0.4419,
+ "step": 5458
+ },
+ {
+ "epoch": 2.5810874704491726,
+ "grad_norm": 2.595489263534546,
+ "learning_rate": 3.078607804165066e-06,
+ "loss": 0.3958,
+ "step": 5459
+ },
+ {
+ "epoch": 2.581560283687943,
+ "grad_norm": 3.031458854675293,
+ "learning_rate": 3.0780008970586255e-06,
+ "loss": 0.518,
+ "step": 5460
+ },
+ {
+ "epoch": 2.582033096926714,
+ "grad_norm": 2.827071189880371,
+ "learning_rate": 3.077393953962603e-06,
+ "loss": 0.4397,
+ "step": 5461
+ },
+ {
+ "epoch": 2.5825059101654846,
+ "grad_norm": 2.656111240386963,
+ "learning_rate": 3.0767869749147917e-06,
+ "loss": 0.4912,
+ "step": 5462
+ },
+ {
+ "epoch": 2.5829787234042554,
+ "grad_norm": 2.545365333557129,
+ "learning_rate": 3.076179959952984e-06,
+ "loss": 0.3991,
+ "step": 5463
+ },
+ {
+ "epoch": 2.583451536643026,
+ "grad_norm": 2.5794365406036377,
+ "learning_rate": 3.075572909114977e-06,
+ "loss": 0.4499,
+ "step": 5464
+ },
+ {
+ "epoch": 2.5839243498817965,
+ "grad_norm": 2.787140369415283,
+ "learning_rate": 3.074965822438568e-06,
+ "loss": 0.386,
+ "step": 5465
+ },
+ {
+ "epoch": 2.5843971631205673,
+ "grad_norm": 2.6406853199005127,
+ "learning_rate": 3.0743586999615594e-06,
+ "loss": 0.4853,
+ "step": 5466
+ },
+ {
+ "epoch": 2.584869976359338,
+ "grad_norm": 2.8082082271575928,
+ "learning_rate": 3.073751541721752e-06,
+ "loss": 0.4669,
+ "step": 5467
+ },
+ {
+ "epoch": 2.5853427895981085,
+ "grad_norm": 2.8808975219726562,
+ "learning_rate": 3.073144347756952e-06,
+ "loss": 0.4193,
+ "step": 5468
+ },
+ {
+ "epoch": 2.5858156028368793,
+ "grad_norm": 2.823352813720703,
+ "learning_rate": 3.072537118104968e-06,
+ "loss": 0.482,
+ "step": 5469
+ },
+ {
+ "epoch": 2.58628841607565,
+ "grad_norm": 2.6454555988311768,
+ "learning_rate": 3.0719298528036073e-06,
+ "loss": 0.4667,
+ "step": 5470
+ },
+ {
+ "epoch": 2.586761229314421,
+ "grad_norm": 2.871145486831665,
+ "learning_rate": 3.0713225518906826e-06,
+ "loss": 0.5125,
+ "step": 5471
+ },
+ {
+ "epoch": 2.5872340425531917,
+ "grad_norm": 3.1301417350769043,
+ "learning_rate": 3.070715215404007e-06,
+ "loss": 0.4827,
+ "step": 5472
+ },
+ {
+ "epoch": 2.587706855791962,
+ "grad_norm": 2.31062912940979,
+ "learning_rate": 3.070107843381398e-06,
+ "loss": 0.3954,
+ "step": 5473
+ },
+ {
+ "epoch": 2.588179669030733,
+ "grad_norm": 2.8366353511810303,
+ "learning_rate": 3.069500435860674e-06,
+ "loss": 0.4597,
+ "step": 5474
+ },
+ {
+ "epoch": 2.5886524822695036,
+ "grad_norm": 2.900143623352051,
+ "learning_rate": 3.068892992879654e-06,
+ "loss": 0.4294,
+ "step": 5475
+ },
+ {
+ "epoch": 2.589125295508274,
+ "grad_norm": 2.923313617706299,
+ "learning_rate": 3.0682855144761626e-06,
+ "loss": 0.505,
+ "step": 5476
+ },
+ {
+ "epoch": 2.5895981087470448,
+ "grad_norm": 2.726475954055786,
+ "learning_rate": 3.0676780006880242e-06,
+ "loss": 0.4208,
+ "step": 5477
+ },
+ {
+ "epoch": 2.5900709219858156,
+ "grad_norm": 4.115052223205566,
+ "learning_rate": 3.0670704515530654e-06,
+ "loss": 0.466,
+ "step": 5478
+ },
+ {
+ "epoch": 2.5905437352245864,
+ "grad_norm": 2.6018717288970947,
+ "learning_rate": 3.0664628671091163e-06,
+ "loss": 0.4697,
+ "step": 5479
+ },
+ {
+ "epoch": 2.591016548463357,
+ "grad_norm": 2.7393722534179688,
+ "learning_rate": 3.0658552473940085e-06,
+ "loss": 0.4618,
+ "step": 5480
+ },
+ {
+ "epoch": 2.5914893617021275,
+ "grad_norm": 2.8406929969787598,
+ "learning_rate": 3.065247592445575e-06,
+ "loss": 0.4806,
+ "step": 5481
+ },
+ {
+ "epoch": 2.5919621749408983,
+ "grad_norm": 2.9773001670837402,
+ "learning_rate": 3.0646399023016525e-06,
+ "loss": 0.4764,
+ "step": 5482
+ },
+ {
+ "epoch": 2.592434988179669,
+ "grad_norm": 3.374643325805664,
+ "learning_rate": 3.0640321770000804e-06,
+ "loss": 0.4481,
+ "step": 5483
+ },
+ {
+ "epoch": 2.5929078014184395,
+ "grad_norm": 2.5742013454437256,
+ "learning_rate": 3.0634244165786965e-06,
+ "loss": 0.432,
+ "step": 5484
+ },
+ {
+ "epoch": 2.5933806146572103,
+ "grad_norm": 2.9390289783477783,
+ "learning_rate": 3.062816621075346e-06,
+ "loss": 0.3941,
+ "step": 5485
+ },
+ {
+ "epoch": 2.593853427895981,
+ "grad_norm": 2.683414936065674,
+ "learning_rate": 3.062208790527871e-06,
+ "loss": 0.4268,
+ "step": 5486
+ },
+ {
+ "epoch": 2.594326241134752,
+ "grad_norm": 2.689647674560547,
+ "learning_rate": 3.06160092497412e-06,
+ "loss": 0.4569,
+ "step": 5487
+ },
+ {
+ "epoch": 2.5947990543735227,
+ "grad_norm": 3.1170310974121094,
+ "learning_rate": 3.060993024451943e-06,
+ "loss": 0.4387,
+ "step": 5488
+ },
+ {
+ "epoch": 2.595271867612293,
+ "grad_norm": 2.8732447624206543,
+ "learning_rate": 3.0603850889991894e-06,
+ "loss": 0.451,
+ "step": 5489
+ },
+ {
+ "epoch": 2.595744680851064,
+ "grad_norm": 3.0444157123565674,
+ "learning_rate": 3.0597771186537135e-06,
+ "loss": 0.4691,
+ "step": 5490
+ },
+ {
+ "epoch": 2.5962174940898346,
+ "grad_norm": 2.3791720867156982,
+ "learning_rate": 3.0591691134533714e-06,
+ "loss": 0.4771,
+ "step": 5491
+ },
+ {
+ "epoch": 2.596690307328605,
+ "grad_norm": 3.0677225589752197,
+ "learning_rate": 3.05856107343602e-06,
+ "loss": 0.459,
+ "step": 5492
+ },
+ {
+ "epoch": 2.5971631205673757,
+ "grad_norm": 3.1702635288238525,
+ "learning_rate": 3.05795299863952e-06,
+ "loss": 0.4816,
+ "step": 5493
+ },
+ {
+ "epoch": 2.5976359338061465,
+ "grad_norm": 2.964869499206543,
+ "learning_rate": 3.057344889101734e-06,
+ "loss": 0.4369,
+ "step": 5494
+ },
+ {
+ "epoch": 2.5981087470449173,
+ "grad_norm": 3.1333882808685303,
+ "learning_rate": 3.056736744860525e-06,
+ "loss": 0.4178,
+ "step": 5495
+ },
+ {
+ "epoch": 2.598581560283688,
+ "grad_norm": 2.4340405464172363,
+ "learning_rate": 3.05612856595376e-06,
+ "loss": 0.4359,
+ "step": 5496
+ },
+ {
+ "epoch": 2.5990543735224585,
+ "grad_norm": 2.638620615005493,
+ "learning_rate": 3.0555203524193083e-06,
+ "loss": 0.3915,
+ "step": 5497
+ },
+ {
+ "epoch": 2.5995271867612293,
+ "grad_norm": 2.8218815326690674,
+ "learning_rate": 3.054912104295039e-06,
+ "loss": 0.4684,
+ "step": 5498
+ },
+ {
+ "epoch": 2.6,
+ "grad_norm": 2.6696009635925293,
+ "learning_rate": 3.054303821618827e-06,
+ "loss": 0.4073,
+ "step": 5499
+ },
+ {
+ "epoch": 2.6004728132387704,
+ "grad_norm": 2.3880512714385986,
+ "learning_rate": 3.0536955044285465e-06,
+ "loss": 0.3576,
+ "step": 5500
+ },
+ {
+ "epoch": 2.6009456264775412,
+ "grad_norm": 2.762890100479126,
+ "learning_rate": 3.053087152762075e-06,
+ "loss": 0.3857,
+ "step": 5501
+ },
+ {
+ "epoch": 2.601418439716312,
+ "grad_norm": 2.729033946990967,
+ "learning_rate": 3.052478766657292e-06,
+ "loss": 0.3935,
+ "step": 5502
+ },
+ {
+ "epoch": 2.601891252955083,
+ "grad_norm": 2.630490303039551,
+ "learning_rate": 3.051870346152078e-06,
+ "loss": 0.3932,
+ "step": 5503
+ },
+ {
+ "epoch": 2.6023640661938536,
+ "grad_norm": 3.0335981845855713,
+ "learning_rate": 3.051261891284318e-06,
+ "loss": 0.4313,
+ "step": 5504
+ },
+ {
+ "epoch": 2.602836879432624,
+ "grad_norm": 2.969888687133789,
+ "learning_rate": 3.0506534020918963e-06,
+ "loss": 0.4698,
+ "step": 5505
+ },
+ {
+ "epoch": 2.603309692671395,
+ "grad_norm": 3.093996524810791,
+ "learning_rate": 3.050044878612703e-06,
+ "loss": 0.5338,
+ "step": 5506
+ },
+ {
+ "epoch": 2.6037825059101656,
+ "grad_norm": 2.759993314743042,
+ "learning_rate": 3.049436320884626e-06,
+ "loss": 0.4429,
+ "step": 5507
+ },
+ {
+ "epoch": 2.604255319148936,
+ "grad_norm": 2.979422092437744,
+ "learning_rate": 3.0488277289455587e-06,
+ "loss": 0.4489,
+ "step": 5508
+ },
+ {
+ "epoch": 2.6047281323877067,
+ "grad_norm": 2.8266701698303223,
+ "learning_rate": 3.048219102833396e-06,
+ "loss": 0.489,
+ "step": 5509
+ },
+ {
+ "epoch": 2.6052009456264775,
+ "grad_norm": 2.2582461833953857,
+ "learning_rate": 3.047610442586033e-06,
+ "loss": 0.3759,
+ "step": 5510
+ },
+ {
+ "epoch": 2.6056737588652483,
+ "grad_norm": 3.078152894973755,
+ "learning_rate": 3.0470017482413694e-06,
+ "loss": 0.5059,
+ "step": 5511
+ },
+ {
+ "epoch": 2.606146572104019,
+ "grad_norm": 2.7895498275756836,
+ "learning_rate": 3.0463930198373047e-06,
+ "loss": 0.4752,
+ "step": 5512
+ },
+ {
+ "epoch": 2.6066193853427895,
+ "grad_norm": 3.2307958602905273,
+ "learning_rate": 3.045784257411743e-06,
+ "loss": 0.4847,
+ "step": 5513
+ },
+ {
+ "epoch": 2.6070921985815603,
+ "grad_norm": 2.793661594390869,
+ "learning_rate": 3.0451754610025884e-06,
+ "loss": 0.4492,
+ "step": 5514
+ },
+ {
+ "epoch": 2.607565011820331,
+ "grad_norm": 2.4443132877349854,
+ "learning_rate": 3.0445666306477484e-06,
+ "loss": 0.4174,
+ "step": 5515
+ },
+ {
+ "epoch": 2.6080378250591014,
+ "grad_norm": 2.628769636154175,
+ "learning_rate": 3.0439577663851326e-06,
+ "loss": 0.3889,
+ "step": 5516
+ },
+ {
+ "epoch": 2.608510638297872,
+ "grad_norm": 2.9367563724517822,
+ "learning_rate": 3.0433488682526525e-06,
+ "loss": 0.437,
+ "step": 5517
+ },
+ {
+ "epoch": 2.608983451536643,
+ "grad_norm": 3.171353340148926,
+ "learning_rate": 3.04273993628822e-06,
+ "loss": 0.47,
+ "step": 5518
+ },
+ {
+ "epoch": 2.609456264775414,
+ "grad_norm": 2.856576442718506,
+ "learning_rate": 3.0421309705297513e-06,
+ "loss": 0.4797,
+ "step": 5519
+ },
+ {
+ "epoch": 2.6099290780141846,
+ "grad_norm": 2.4926068782806396,
+ "learning_rate": 3.041521971015165e-06,
+ "loss": 0.4294,
+ "step": 5520
+ },
+ {
+ "epoch": 2.610401891252955,
+ "grad_norm": 2.7897613048553467,
+ "learning_rate": 3.040912937782379e-06,
+ "loss": 0.4388,
+ "step": 5521
+ },
+ {
+ "epoch": 2.6108747044917258,
+ "grad_norm": 3.588188886642456,
+ "learning_rate": 3.0403038708693173e-06,
+ "loss": 0.4027,
+ "step": 5522
+ },
+ {
+ "epoch": 2.6113475177304966,
+ "grad_norm": 3.5394980907440186,
+ "learning_rate": 3.0396947703139017e-06,
+ "loss": 0.4866,
+ "step": 5523
+ },
+ {
+ "epoch": 2.611820330969267,
+ "grad_norm": 3.086865186691284,
+ "learning_rate": 3.03908563615406e-06,
+ "loss": 0.4344,
+ "step": 5524
+ },
+ {
+ "epoch": 2.6122931442080377,
+ "grad_norm": 2.649564504623413,
+ "learning_rate": 3.0384764684277194e-06,
+ "loss": 0.4571,
+ "step": 5525
+ },
+ {
+ "epoch": 2.6127659574468085,
+ "grad_norm": 2.945234775543213,
+ "learning_rate": 3.0378672671728105e-06,
+ "loss": 0.4885,
+ "step": 5526
+ },
+ {
+ "epoch": 2.6132387706855793,
+ "grad_norm": 2.625424861907959,
+ "learning_rate": 3.037258032427265e-06,
+ "loss": 0.4095,
+ "step": 5527
+ },
+ {
+ "epoch": 2.61371158392435,
+ "grad_norm": 2.7597248554229736,
+ "learning_rate": 3.0366487642290175e-06,
+ "loss": 0.4393,
+ "step": 5528
+ },
+ {
+ "epoch": 2.6141843971631205,
+ "grad_norm": 2.721189260482788,
+ "learning_rate": 3.0360394626160043e-06,
+ "loss": 0.3865,
+ "step": 5529
+ },
+ {
+ "epoch": 2.6146572104018913,
+ "grad_norm": 2.624056339263916,
+ "learning_rate": 3.0354301276261656e-06,
+ "loss": 0.4273,
+ "step": 5530
+ },
+ {
+ "epoch": 2.615130023640662,
+ "grad_norm": 2.7764177322387695,
+ "learning_rate": 3.034820759297439e-06,
+ "loss": 0.4756,
+ "step": 5531
+ },
+ {
+ "epoch": 2.6156028368794324,
+ "grad_norm": 3.0841729640960693,
+ "learning_rate": 3.0342113576677696e-06,
+ "loss": 0.4907,
+ "step": 5532
+ },
+ {
+ "epoch": 2.616075650118203,
+ "grad_norm": 2.678715705871582,
+ "learning_rate": 3.0336019227751017e-06,
+ "loss": 0.4478,
+ "step": 5533
+ },
+ {
+ "epoch": 2.616548463356974,
+ "grad_norm": 2.378679037094116,
+ "learning_rate": 3.032992454657382e-06,
+ "loss": 0.3678,
+ "step": 5534
+ },
+ {
+ "epoch": 2.617021276595745,
+ "grad_norm": 2.792079210281372,
+ "learning_rate": 3.0323829533525583e-06,
+ "loss": 0.4115,
+ "step": 5535
+ },
+ {
+ "epoch": 2.6174940898345156,
+ "grad_norm": 2.738133192062378,
+ "learning_rate": 3.0317734188985832e-06,
+ "loss": 0.4152,
+ "step": 5536
+ },
+ {
+ "epoch": 2.617966903073286,
+ "grad_norm": 2.6963796615600586,
+ "learning_rate": 3.0311638513334084e-06,
+ "loss": 0.4096,
+ "step": 5537
+ },
+ {
+ "epoch": 2.6184397163120567,
+ "grad_norm": 2.694145679473877,
+ "learning_rate": 3.03055425069499e-06,
+ "loss": 0.3793,
+ "step": 5538
+ },
+ {
+ "epoch": 2.6189125295508275,
+ "grad_norm": 2.762403964996338,
+ "learning_rate": 3.0299446170212855e-06,
+ "loss": 0.459,
+ "step": 5539
+ },
+ {
+ "epoch": 2.619385342789598,
+ "grad_norm": 2.804382562637329,
+ "learning_rate": 3.0293349503502522e-06,
+ "loss": 0.4853,
+ "step": 5540
+ },
+ {
+ "epoch": 2.6198581560283687,
+ "grad_norm": 2.7768518924713135,
+ "learning_rate": 3.0287252507198537e-06,
+ "loss": 0.4496,
+ "step": 5541
+ },
+ {
+ "epoch": 2.6203309692671395,
+ "grad_norm": 2.9075138568878174,
+ "learning_rate": 3.028115518168052e-06,
+ "loss": 0.4498,
+ "step": 5542
+ },
+ {
+ "epoch": 2.6208037825059103,
+ "grad_norm": 2.8966822624206543,
+ "learning_rate": 3.0275057527328126e-06,
+ "loss": 0.4434,
+ "step": 5543
+ },
+ {
+ "epoch": 2.621276595744681,
+ "grad_norm": 2.8140156269073486,
+ "learning_rate": 3.0268959544521027e-06,
+ "loss": 0.3935,
+ "step": 5544
+ },
+ {
+ "epoch": 2.6217494089834514,
+ "grad_norm": 2.8606276512145996,
+ "learning_rate": 3.0262861233638924e-06,
+ "loss": 0.4222,
+ "step": 5545
+ },
+ {
+ "epoch": 2.6222222222222222,
+ "grad_norm": 3.003610134124756,
+ "learning_rate": 3.0256762595061522e-06,
+ "loss": 0.428,
+ "step": 5546
+ },
+ {
+ "epoch": 2.622695035460993,
+ "grad_norm": 2.725907802581787,
+ "learning_rate": 3.025066362916857e-06,
+ "loss": 0.3975,
+ "step": 5547
+ },
+ {
+ "epoch": 2.6231678486997634,
+ "grad_norm": 2.5247902870178223,
+ "learning_rate": 3.024456433633982e-06,
+ "loss": 0.4584,
+ "step": 5548
+ },
+ {
+ "epoch": 2.623640661938534,
+ "grad_norm": 2.932798147201538,
+ "learning_rate": 3.0238464716955045e-06,
+ "loss": 0.4991,
+ "step": 5549
+ },
+ {
+ "epoch": 2.624113475177305,
+ "grad_norm": 2.693547010421753,
+ "learning_rate": 3.023236477139404e-06,
+ "loss": 0.4405,
+ "step": 5550
+ },
+ {
+ "epoch": 2.6245862884160758,
+ "grad_norm": 3.2600035667419434,
+ "learning_rate": 3.022626450003662e-06,
+ "loss": 0.4904,
+ "step": 5551
+ },
+ {
+ "epoch": 2.6250591016548466,
+ "grad_norm": 2.9471960067749023,
+ "learning_rate": 3.0220163903262627e-06,
+ "loss": 0.4487,
+ "step": 5552
+ },
+ {
+ "epoch": 2.625531914893617,
+ "grad_norm": 2.583944082260132,
+ "learning_rate": 3.0214062981451926e-06,
+ "loss": 0.3552,
+ "step": 5553
+ },
+ {
+ "epoch": 2.6260047281323877,
+ "grad_norm": 2.675062656402588,
+ "learning_rate": 3.0207961734984377e-06,
+ "loss": 0.4524,
+ "step": 5554
+ },
+ {
+ "epoch": 2.6264775413711585,
+ "grad_norm": 3.0126802921295166,
+ "learning_rate": 3.0201860164239887e-06,
+ "loss": 0.4124,
+ "step": 5555
+ },
+ {
+ "epoch": 2.626950354609929,
+ "grad_norm": 2.490734577178955,
+ "learning_rate": 3.019575826959838e-06,
+ "loss": 0.4095,
+ "step": 5556
+ },
+ {
+ "epoch": 2.6274231678486997,
+ "grad_norm": 2.72817063331604,
+ "learning_rate": 3.018965605143978e-06,
+ "loss": 0.4298,
+ "step": 5557
+ },
+ {
+ "epoch": 2.6278959810874705,
+ "grad_norm": 3.1298327445983887,
+ "learning_rate": 3.0183553510144064e-06,
+ "loss": 0.4961,
+ "step": 5558
+ },
+ {
+ "epoch": 2.6283687943262413,
+ "grad_norm": 3.2379956245422363,
+ "learning_rate": 3.0177450646091195e-06,
+ "loss": 0.4943,
+ "step": 5559
+ },
+ {
+ "epoch": 2.628841607565012,
+ "grad_norm": 2.5040571689605713,
+ "learning_rate": 3.017134745966117e-06,
+ "loss": 0.3701,
+ "step": 5560
+ },
+ {
+ "epoch": 2.6293144208037824,
+ "grad_norm": 3.047184944152832,
+ "learning_rate": 3.0165243951234025e-06,
+ "loss": 0.4587,
+ "step": 5561
+ },
+ {
+ "epoch": 2.629787234042553,
+ "grad_norm": 2.4926774501800537,
+ "learning_rate": 3.0159140121189783e-06,
+ "loss": 0.3723,
+ "step": 5562
+ },
+ {
+ "epoch": 2.630260047281324,
+ "grad_norm": 2.5434961318969727,
+ "learning_rate": 3.015303596990851e-06,
+ "loss": 0.4176,
+ "step": 5563
+ },
+ {
+ "epoch": 2.6307328605200944,
+ "grad_norm": 2.5117976665496826,
+ "learning_rate": 3.0146931497770284e-06,
+ "loss": 0.4218,
+ "step": 5564
+ },
+ {
+ "epoch": 2.631205673758865,
+ "grad_norm": 2.9408798217773438,
+ "learning_rate": 3.0140826705155196e-06,
+ "loss": 0.4473,
+ "step": 5565
+ },
+ {
+ "epoch": 2.631678486997636,
+ "grad_norm": 2.996422052383423,
+ "learning_rate": 3.0134721592443385e-06,
+ "loss": 0.4513,
+ "step": 5566
+ },
+ {
+ "epoch": 2.6321513002364068,
+ "grad_norm": 2.984356164932251,
+ "learning_rate": 3.0128616160014955e-06,
+ "loss": 0.4749,
+ "step": 5567
+ },
+ {
+ "epoch": 2.6326241134751776,
+ "grad_norm": 2.6075069904327393,
+ "learning_rate": 3.0122510408250095e-06,
+ "loss": 0.4707,
+ "step": 5568
+ },
+ {
+ "epoch": 2.633096926713948,
+ "grad_norm": 2.9463071823120117,
+ "learning_rate": 3.0116404337528972e-06,
+ "loss": 0.5125,
+ "step": 5569
+ },
+ {
+ "epoch": 2.6335697399527187,
+ "grad_norm": 2.98574161529541,
+ "learning_rate": 3.0110297948231787e-06,
+ "loss": 0.4487,
+ "step": 5570
+ },
+ {
+ "epoch": 2.6340425531914895,
+ "grad_norm": 2.6039397716522217,
+ "learning_rate": 3.010419124073876e-06,
+ "loss": 0.4516,
+ "step": 5571
+ },
+ {
+ "epoch": 2.63451536643026,
+ "grad_norm": 2.8480236530303955,
+ "learning_rate": 3.0098084215430124e-06,
+ "loss": 0.4962,
+ "step": 5572
+ },
+ {
+ "epoch": 2.6349881796690307,
+ "grad_norm": 2.527597427368164,
+ "learning_rate": 3.0091976872686133e-06,
+ "loss": 0.435,
+ "step": 5573
+ },
+ {
+ "epoch": 2.6354609929078014,
+ "grad_norm": 2.898303508758545,
+ "learning_rate": 3.0085869212887076e-06,
+ "loss": 0.4473,
+ "step": 5574
+ },
+ {
+ "epoch": 2.6359338061465722,
+ "grad_norm": 2.981414318084717,
+ "learning_rate": 3.007976123641324e-06,
+ "loss": 0.4203,
+ "step": 5575
+ },
+ {
+ "epoch": 2.636406619385343,
+ "grad_norm": 3.219064474105835,
+ "learning_rate": 3.0073652943644947e-06,
+ "loss": 0.4596,
+ "step": 5576
+ },
+ {
+ "epoch": 2.6368794326241134,
+ "grad_norm": 2.7287049293518066,
+ "learning_rate": 3.0067544334962532e-06,
+ "loss": 0.433,
+ "step": 5577
+ },
+ {
+ "epoch": 2.637352245862884,
+ "grad_norm": 2.6232664585113525,
+ "learning_rate": 3.0061435410746352e-06,
+ "loss": 0.4254,
+ "step": 5578
+ },
+ {
+ "epoch": 2.637825059101655,
+ "grad_norm": 2.908311605453491,
+ "learning_rate": 3.0055326171376788e-06,
+ "loss": 0.4349,
+ "step": 5579
+ },
+ {
+ "epoch": 2.6382978723404253,
+ "grad_norm": 2.8369064331054688,
+ "learning_rate": 3.0049216617234224e-06,
+ "loss": 0.4675,
+ "step": 5580
+ },
+ {
+ "epoch": 2.638770685579196,
+ "grad_norm": 2.659499406814575,
+ "learning_rate": 3.0043106748699085e-06,
+ "loss": 0.4073,
+ "step": 5581
+ },
+ {
+ "epoch": 2.639243498817967,
+ "grad_norm": 2.579765558242798,
+ "learning_rate": 3.00369965661518e-06,
+ "loss": 0.4536,
+ "step": 5582
+ },
+ {
+ "epoch": 2.6397163120567377,
+ "grad_norm": 3.572861909866333,
+ "learning_rate": 3.0030886069972827e-06,
+ "loss": 0.5227,
+ "step": 5583
+ },
+ {
+ "epoch": 2.6401891252955085,
+ "grad_norm": 2.6523196697235107,
+ "learning_rate": 3.002477526054263e-06,
+ "loss": 0.3846,
+ "step": 5584
+ },
+ {
+ "epoch": 2.640661938534279,
+ "grad_norm": 3.072181463241577,
+ "learning_rate": 3.001866413824173e-06,
+ "loss": 0.5399,
+ "step": 5585
+ },
+ {
+ "epoch": 2.6411347517730497,
+ "grad_norm": 2.7304325103759766,
+ "learning_rate": 3.0012552703450597e-06,
+ "loss": 0.4048,
+ "step": 5586
+ },
+ {
+ "epoch": 2.6416075650118205,
+ "grad_norm": 3.039491891860962,
+ "learning_rate": 3.0006440956549798e-06,
+ "loss": 0.5035,
+ "step": 5587
+ },
+ {
+ "epoch": 2.642080378250591,
+ "grad_norm": 2.7623798847198486,
+ "learning_rate": 3.000032889791988e-06,
+ "loss": 0.4369,
+ "step": 5588
+ },
+ {
+ "epoch": 2.6425531914893616,
+ "grad_norm": 3.391052722930908,
+ "learning_rate": 2.9994216527941394e-06,
+ "loss": 0.5308,
+ "step": 5589
+ },
+ {
+ "epoch": 2.6430260047281324,
+ "grad_norm": 3.0263915061950684,
+ "learning_rate": 2.9988103846994954e-06,
+ "loss": 0.4319,
+ "step": 5590
+ },
+ {
+ "epoch": 2.6434988179669032,
+ "grad_norm": 2.786607027053833,
+ "learning_rate": 2.998199085546115e-06,
+ "loss": 0.4695,
+ "step": 5591
+ },
+ {
+ "epoch": 2.643971631205674,
+ "grad_norm": 2.884674310684204,
+ "learning_rate": 2.9975877553720627e-06,
+ "loss": 0.4615,
+ "step": 5592
+ },
+ {
+ "epoch": 2.6444444444444444,
+ "grad_norm": 2.6100499629974365,
+ "learning_rate": 2.996976394215402e-06,
+ "loss": 0.4784,
+ "step": 5593
+ },
+ {
+ "epoch": 2.644917257683215,
+ "grad_norm": 2.6978676319122314,
+ "learning_rate": 2.9963650021142018e-06,
+ "loss": 0.3911,
+ "step": 5594
+ },
+ {
+ "epoch": 2.645390070921986,
+ "grad_norm": 2.8080835342407227,
+ "learning_rate": 2.9957535791065284e-06,
+ "loss": 0.4997,
+ "step": 5595
+ },
+ {
+ "epoch": 2.6458628841607563,
+ "grad_norm": 2.6639578342437744,
+ "learning_rate": 2.9951421252304537e-06,
+ "loss": 0.4066,
+ "step": 5596
+ },
+ {
+ "epoch": 2.646335697399527,
+ "grad_norm": 3.102456569671631,
+ "learning_rate": 2.9945306405240505e-06,
+ "loss": 0.5554,
+ "step": 5597
+ },
+ {
+ "epoch": 2.646808510638298,
+ "grad_norm": 2.6524150371551514,
+ "learning_rate": 2.993919125025392e-06,
+ "loss": 0.3881,
+ "step": 5598
+ },
+ {
+ "epoch": 2.6472813238770687,
+ "grad_norm": 2.926316499710083,
+ "learning_rate": 2.993307578772556e-06,
+ "loss": 0.4845,
+ "step": 5599
+ },
+ {
+ "epoch": 2.6477541371158395,
+ "grad_norm": 3.346550703048706,
+ "learning_rate": 2.9926960018036195e-06,
+ "loss": 0.4481,
+ "step": 5600
+ },
+ {
+ "epoch": 2.64822695035461,
+ "grad_norm": 2.6211020946502686,
+ "learning_rate": 2.9920843941566634e-06,
+ "loss": 0.4355,
+ "step": 5601
+ },
+ {
+ "epoch": 2.6486997635933807,
+ "grad_norm": 2.7479333877563477,
+ "learning_rate": 2.99147275586977e-06,
+ "loss": 0.4373,
+ "step": 5602
+ },
+ {
+ "epoch": 2.6491725768321515,
+ "grad_norm": 2.523385524749756,
+ "learning_rate": 2.9908610869810235e-06,
+ "loss": 0.4467,
+ "step": 5603
+ },
+ {
+ "epoch": 2.649645390070922,
+ "grad_norm": 2.93886137008667,
+ "learning_rate": 2.9902493875285086e-06,
+ "loss": 0.4956,
+ "step": 5604
+ },
+ {
+ "epoch": 2.6501182033096926,
+ "grad_norm": 2.7630443572998047,
+ "learning_rate": 2.989637657550315e-06,
+ "loss": 0.5012,
+ "step": 5605
+ },
+ {
+ "epoch": 2.6505910165484634,
+ "grad_norm": 2.6733906269073486,
+ "learning_rate": 2.989025897084531e-06,
+ "loss": 0.446,
+ "step": 5606
+ },
+ {
+ "epoch": 2.651063829787234,
+ "grad_norm": 2.8411107063293457,
+ "learning_rate": 2.9884141061692484e-06,
+ "loss": 0.4817,
+ "step": 5607
+ },
+ {
+ "epoch": 2.651536643026005,
+ "grad_norm": 2.8667192459106445,
+ "learning_rate": 2.987802284842562e-06,
+ "loss": 0.3909,
+ "step": 5608
+ },
+ {
+ "epoch": 2.6520094562647754,
+ "grad_norm": 3.4640755653381348,
+ "learning_rate": 2.987190433142565e-06,
+ "loss": 0.4379,
+ "step": 5609
+ },
+ {
+ "epoch": 2.652482269503546,
+ "grad_norm": 2.675121307373047,
+ "learning_rate": 2.9865785511073565e-06,
+ "loss": 0.4833,
+ "step": 5610
+ },
+ {
+ "epoch": 2.652955082742317,
+ "grad_norm": 2.4375529289245605,
+ "learning_rate": 2.9859666387750353e-06,
+ "loss": 0.3949,
+ "step": 5611
+ },
+ {
+ "epoch": 2.6534278959810873,
+ "grad_norm": 2.7312581539154053,
+ "learning_rate": 2.9853546961837026e-06,
+ "loss": 0.4546,
+ "step": 5612
+ },
+ {
+ "epoch": 2.653900709219858,
+ "grad_norm": 2.7695999145507812,
+ "learning_rate": 2.9847427233714617e-06,
+ "loss": 0.4696,
+ "step": 5613
+ },
+ {
+ "epoch": 2.654373522458629,
+ "grad_norm": 2.6313109397888184,
+ "learning_rate": 2.984130720376416e-06,
+ "loss": 0.4733,
+ "step": 5614
+ },
+ {
+ "epoch": 2.6548463356973997,
+ "grad_norm": 2.656864881515503,
+ "learning_rate": 2.9835186872366733e-06,
+ "loss": 0.3806,
+ "step": 5615
+ },
+ {
+ "epoch": 2.65531914893617,
+ "grad_norm": 2.720075845718384,
+ "learning_rate": 2.982906623990342e-06,
+ "loss": 0.4041,
+ "step": 5616
+ },
+ {
+ "epoch": 2.655791962174941,
+ "grad_norm": 2.6684951782226562,
+ "learning_rate": 2.9822945306755334e-06,
+ "loss": 0.4552,
+ "step": 5617
+ },
+ {
+ "epoch": 2.6562647754137116,
+ "grad_norm": 2.567751884460449,
+ "learning_rate": 2.9816824073303585e-06,
+ "loss": 0.465,
+ "step": 5618
+ },
+ {
+ "epoch": 2.656737588652482,
+ "grad_norm": 2.7490367889404297,
+ "learning_rate": 2.981070253992933e-06,
+ "loss": 0.4647,
+ "step": 5619
+ },
+ {
+ "epoch": 2.657210401891253,
+ "grad_norm": 2.548656463623047,
+ "learning_rate": 2.9804580707013715e-06,
+ "loss": 0.4226,
+ "step": 5620
+ },
+ {
+ "epoch": 2.6576832151300236,
+ "grad_norm": 2.5484731197357178,
+ "learning_rate": 2.9798458574937927e-06,
+ "loss": 0.382,
+ "step": 5621
+ },
+ {
+ "epoch": 2.6581560283687944,
+ "grad_norm": 2.7293949127197266,
+ "learning_rate": 2.979233614408317e-06,
+ "loss": 0.4418,
+ "step": 5622
+ },
+ {
+ "epoch": 2.658628841607565,
+ "grad_norm": 2.645036458969116,
+ "learning_rate": 2.9786213414830646e-06,
+ "loss": 0.414,
+ "step": 5623
+ },
+ {
+ "epoch": 2.6591016548463355,
+ "grad_norm": 2.5287609100341797,
+ "learning_rate": 2.9780090387561604e-06,
+ "loss": 0.3914,
+ "step": 5624
+ },
+ {
+ "epoch": 2.6595744680851063,
+ "grad_norm": 2.5570411682128906,
+ "learning_rate": 2.9773967062657293e-06,
+ "loss": 0.4431,
+ "step": 5625
+ },
+ {
+ "epoch": 2.660047281323877,
+ "grad_norm": 2.681749105453491,
+ "learning_rate": 2.9767843440498983e-06,
+ "loss": 0.4245,
+ "step": 5626
+ },
+ {
+ "epoch": 2.6605200945626475,
+ "grad_norm": 2.8629777431488037,
+ "learning_rate": 2.976171952146798e-06,
+ "loss": 0.4643,
+ "step": 5627
+ },
+ {
+ "epoch": 2.6609929078014183,
+ "grad_norm": 2.577148199081421,
+ "learning_rate": 2.9755595305945573e-06,
+ "loss": 0.43,
+ "step": 5628
+ },
+ {
+ "epoch": 2.661465721040189,
+ "grad_norm": 2.747218370437622,
+ "learning_rate": 2.97494707943131e-06,
+ "loss": 0.5194,
+ "step": 5629
+ },
+ {
+ "epoch": 2.66193853427896,
+ "grad_norm": 2.535604953765869,
+ "learning_rate": 2.9743345986951904e-06,
+ "loss": 0.4401,
+ "step": 5630
+ },
+ {
+ "epoch": 2.6624113475177307,
+ "grad_norm": 3.3341166973114014,
+ "learning_rate": 2.973722088424336e-06,
+ "loss": 0.4925,
+ "step": 5631
+ },
+ {
+ "epoch": 2.662884160756501,
+ "grad_norm": 2.9264349937438965,
+ "learning_rate": 2.973109548656884e-06,
+ "loss": 0.4787,
+ "step": 5632
+ },
+ {
+ "epoch": 2.663356973995272,
+ "grad_norm": 2.7132506370544434,
+ "learning_rate": 2.9724969794309742e-06,
+ "loss": 0.4138,
+ "step": 5633
+ },
+ {
+ "epoch": 2.6638297872340426,
+ "grad_norm": 2.7970192432403564,
+ "learning_rate": 2.9718843807847497e-06,
+ "loss": 0.4896,
+ "step": 5634
+ },
+ {
+ "epoch": 2.664302600472813,
+ "grad_norm": 2.610208749771118,
+ "learning_rate": 2.9712717527563545e-06,
+ "loss": 0.3997,
+ "step": 5635
+ },
+ {
+ "epoch": 2.6647754137115838,
+ "grad_norm": 3.5483577251434326,
+ "learning_rate": 2.9706590953839335e-06,
+ "loss": 0.5109,
+ "step": 5636
+ },
+ {
+ "epoch": 2.6652482269503546,
+ "grad_norm": 2.746933698654175,
+ "learning_rate": 2.9700464087056345e-06,
+ "loss": 0.4672,
+ "step": 5637
+ },
+ {
+ "epoch": 2.6657210401891254,
+ "grad_norm": 2.704436779022217,
+ "learning_rate": 2.969433692759607e-06,
+ "loss": 0.4402,
+ "step": 5638
+ },
+ {
+ "epoch": 2.666193853427896,
+ "grad_norm": 2.859520196914673,
+ "learning_rate": 2.9688209475840005e-06,
+ "loss": 0.4679,
+ "step": 5639
+ },
+ {
+ "epoch": 2.6666666666666665,
+ "grad_norm": 2.518580436706543,
+ "learning_rate": 2.968208173216971e-06,
+ "loss": 0.3772,
+ "step": 5640
+ },
+ {
+ "epoch": 2.6671394799054373,
+ "grad_norm": 2.7624926567077637,
+ "learning_rate": 2.967595369696671e-06,
+ "loss": 0.4753,
+ "step": 5641
+ },
+ {
+ "epoch": 2.667612293144208,
+ "grad_norm": 2.654003620147705,
+ "learning_rate": 2.966982537061257e-06,
+ "loss": 0.4583,
+ "step": 5642
+ },
+ {
+ "epoch": 2.6680851063829785,
+ "grad_norm": 2.8473968505859375,
+ "learning_rate": 2.966369675348888e-06,
+ "loss": 0.4623,
+ "step": 5643
+ },
+ {
+ "epoch": 2.6685579196217493,
+ "grad_norm": 2.5587947368621826,
+ "learning_rate": 2.9657567845977253e-06,
+ "loss": 0.4014,
+ "step": 5644
+ },
+ {
+ "epoch": 2.66903073286052,
+ "grad_norm": 2.572220802307129,
+ "learning_rate": 2.96514386484593e-06,
+ "loss": 0.4249,
+ "step": 5645
+ },
+ {
+ "epoch": 2.669503546099291,
+ "grad_norm": 2.7995707988739014,
+ "learning_rate": 2.964530916131665e-06,
+ "loss": 0.4575,
+ "step": 5646
+ },
+ {
+ "epoch": 2.6699763593380617,
+ "grad_norm": 2.8712687492370605,
+ "learning_rate": 2.963917938493097e-06,
+ "loss": 0.4353,
+ "step": 5647
+ },
+ {
+ "epoch": 2.670449172576832,
+ "grad_norm": 2.856473207473755,
+ "learning_rate": 2.963304931968393e-06,
+ "loss": 0.4345,
+ "step": 5648
+ },
+ {
+ "epoch": 2.670921985815603,
+ "grad_norm": 2.709198474884033,
+ "learning_rate": 2.9626918965957224e-06,
+ "loss": 0.4116,
+ "step": 5649
+ },
+ {
+ "epoch": 2.6713947990543736,
+ "grad_norm": 2.8144607543945312,
+ "learning_rate": 2.962078832413257e-06,
+ "loss": 0.4575,
+ "step": 5650
+ },
+ {
+ "epoch": 2.671867612293144,
+ "grad_norm": 3.131911039352417,
+ "learning_rate": 2.961465739459168e-06,
+ "loss": 0.4743,
+ "step": 5651
+ },
+ {
+ "epoch": 2.6723404255319148,
+ "grad_norm": 2.8487515449523926,
+ "learning_rate": 2.9608526177716316e-06,
+ "loss": 0.4314,
+ "step": 5652
+ },
+ {
+ "epoch": 2.6728132387706856,
+ "grad_norm": 2.613229751586914,
+ "learning_rate": 2.960239467388823e-06,
+ "loss": 0.4807,
+ "step": 5653
+ },
+ {
+ "epoch": 2.6732860520094563,
+ "grad_norm": 2.5049116611480713,
+ "learning_rate": 2.9596262883489213e-06,
+ "loss": 0.4708,
+ "step": 5654
+ },
+ {
+ "epoch": 2.673758865248227,
+ "grad_norm": 2.6347460746765137,
+ "learning_rate": 2.9590130806901052e-06,
+ "loss": 0.3689,
+ "step": 5655
+ },
+ {
+ "epoch": 2.6742316784869975,
+ "grad_norm": 3.3290371894836426,
+ "learning_rate": 2.9583998444505578e-06,
+ "loss": 0.4674,
+ "step": 5656
+ },
+ {
+ "epoch": 2.6747044917257683,
+ "grad_norm": 2.748403549194336,
+ "learning_rate": 2.957786579668462e-06,
+ "loss": 0.3852,
+ "step": 5657
+ },
+ {
+ "epoch": 2.675177304964539,
+ "grad_norm": 2.837573766708374,
+ "learning_rate": 2.957173286382003e-06,
+ "loss": 0.4541,
+ "step": 5658
+ },
+ {
+ "epoch": 2.6756501182033094,
+ "grad_norm": 3.0976510047912598,
+ "learning_rate": 2.9565599646293686e-06,
+ "loss": 0.4669,
+ "step": 5659
+ },
+ {
+ "epoch": 2.6761229314420802,
+ "grad_norm": 2.7059597969055176,
+ "learning_rate": 2.955946614448747e-06,
+ "loss": 0.3935,
+ "step": 5660
+ },
+ {
+ "epoch": 2.676595744680851,
+ "grad_norm": 2.6700541973114014,
+ "learning_rate": 2.9553332358783294e-06,
+ "loss": 0.4322,
+ "step": 5661
+ },
+ {
+ "epoch": 2.677068557919622,
+ "grad_norm": 2.9782698154449463,
+ "learning_rate": 2.9547198289563068e-06,
+ "loss": 0.4338,
+ "step": 5662
+ },
+ {
+ "epoch": 2.6775413711583926,
+ "grad_norm": 2.637876510620117,
+ "learning_rate": 2.9541063937208755e-06,
+ "loss": 0.4289,
+ "step": 5663
+ },
+ {
+ "epoch": 2.678014184397163,
+ "grad_norm": 3.421949863433838,
+ "learning_rate": 2.953492930210229e-06,
+ "loss": 0.5458,
+ "step": 5664
+ },
+ {
+ "epoch": 2.678486997635934,
+ "grad_norm": 2.8273842334747314,
+ "learning_rate": 2.952879438462567e-06,
+ "loss": 0.4529,
+ "step": 5665
+ },
+ {
+ "epoch": 2.6789598108747046,
+ "grad_norm": 2.9090168476104736,
+ "learning_rate": 2.9522659185160873e-06,
+ "loss": 0.444,
+ "step": 5666
+ },
+ {
+ "epoch": 2.679432624113475,
+ "grad_norm": 2.646710157394409,
+ "learning_rate": 2.9516523704089927e-06,
+ "loss": 0.4226,
+ "step": 5667
+ },
+ {
+ "epoch": 2.6799054373522457,
+ "grad_norm": 2.65915584564209,
+ "learning_rate": 2.951038794179486e-06,
+ "loss": 0.4307,
+ "step": 5668
+ },
+ {
+ "epoch": 2.6803782505910165,
+ "grad_norm": 3.004507303237915,
+ "learning_rate": 2.950425189865771e-06,
+ "loss": 0.4799,
+ "step": 5669
+ },
+ {
+ "epoch": 2.6808510638297873,
+ "grad_norm": 2.5210134983062744,
+ "learning_rate": 2.949811557506054e-06,
+ "loss": 0.3842,
+ "step": 5670
+ },
+ {
+ "epoch": 2.681323877068558,
+ "grad_norm": 2.8072893619537354,
+ "learning_rate": 2.9491978971385436e-06,
+ "loss": 0.435,
+ "step": 5671
+ },
+ {
+ "epoch": 2.6817966903073285,
+ "grad_norm": 2.5701990127563477,
+ "learning_rate": 2.9485842088014498e-06,
+ "loss": 0.4932,
+ "step": 5672
+ },
+ {
+ "epoch": 2.6822695035460993,
+ "grad_norm": 2.9368457794189453,
+ "learning_rate": 2.9479704925329854e-06,
+ "loss": 0.455,
+ "step": 5673
+ },
+ {
+ "epoch": 2.68274231678487,
+ "grad_norm": 2.8576247692108154,
+ "learning_rate": 2.947356748371362e-06,
+ "loss": 0.4254,
+ "step": 5674
+ },
+ {
+ "epoch": 2.6832151300236404,
+ "grad_norm": 2.8999195098876953,
+ "learning_rate": 2.946742976354795e-06,
+ "loss": 0.4159,
+ "step": 5675
+ },
+ {
+ "epoch": 2.6836879432624112,
+ "grad_norm": 2.8439736366271973,
+ "learning_rate": 2.946129176521502e-06,
+ "loss": 0.4035,
+ "step": 5676
+ },
+ {
+ "epoch": 2.684160756501182,
+ "grad_norm": 2.8525729179382324,
+ "learning_rate": 2.945515348909702e-06,
+ "loss": 0.4137,
+ "step": 5677
+ },
+ {
+ "epoch": 2.684633569739953,
+ "grad_norm": 2.6573562622070312,
+ "learning_rate": 2.9449014935576147e-06,
+ "loss": 0.4203,
+ "step": 5678
+ },
+ {
+ "epoch": 2.6851063829787236,
+ "grad_norm": 2.765794277191162,
+ "learning_rate": 2.9442876105034616e-06,
+ "loss": 0.5184,
+ "step": 5679
+ },
+ {
+ "epoch": 2.685579196217494,
+ "grad_norm": 2.694617748260498,
+ "learning_rate": 2.943673699785467e-06,
+ "loss": 0.417,
+ "step": 5680
+ },
+ {
+ "epoch": 2.6860520094562648,
+ "grad_norm": 2.740774393081665,
+ "learning_rate": 2.943059761441857e-06,
+ "loss": 0.4431,
+ "step": 5681
+ },
+ {
+ "epoch": 2.6865248226950356,
+ "grad_norm": 2.670642614364624,
+ "learning_rate": 2.942445795510859e-06,
+ "loss": 0.4298,
+ "step": 5682
+ },
+ {
+ "epoch": 2.686997635933806,
+ "grad_norm": 2.838907241821289,
+ "learning_rate": 2.9418318020307e-06,
+ "loss": 0.4529,
+ "step": 5683
+ },
+ {
+ "epoch": 2.6874704491725767,
+ "grad_norm": 2.562317371368408,
+ "learning_rate": 2.9412177810396135e-06,
+ "loss": 0.4251,
+ "step": 5684
+ },
+ {
+ "epoch": 2.6879432624113475,
+ "grad_norm": 2.5805928707122803,
+ "learning_rate": 2.9406037325758298e-06,
+ "loss": 0.4405,
+ "step": 5685
+ },
+ {
+ "epoch": 2.6884160756501183,
+ "grad_norm": 2.5701205730438232,
+ "learning_rate": 2.939989656677583e-06,
+ "loss": 0.4184,
+ "step": 5686
+ },
+ {
+ "epoch": 2.688888888888889,
+ "grad_norm": 2.7990400791168213,
+ "learning_rate": 2.939375553383111e-06,
+ "loss": 0.4866,
+ "step": 5687
+ },
+ {
+ "epoch": 2.6893617021276595,
+ "grad_norm": 3.063319206237793,
+ "learning_rate": 2.9387614227306487e-06,
+ "loss": 0.4202,
+ "step": 5688
+ },
+ {
+ "epoch": 2.6898345153664303,
+ "grad_norm": 3.0891315937042236,
+ "learning_rate": 2.938147264758437e-06,
+ "loss": 0.4344,
+ "step": 5689
+ },
+ {
+ "epoch": 2.690307328605201,
+ "grad_norm": 2.8982670307159424,
+ "learning_rate": 2.9375330795047165e-06,
+ "loss": 0.4548,
+ "step": 5690
+ },
+ {
+ "epoch": 2.6907801418439714,
+ "grad_norm": 2.7947235107421875,
+ "learning_rate": 2.9369188670077293e-06,
+ "loss": 0.5028,
+ "step": 5691
+ },
+ {
+ "epoch": 2.691252955082742,
+ "grad_norm": 3.1615960597991943,
+ "learning_rate": 2.9363046273057206e-06,
+ "loss": 0.4855,
+ "step": 5692
+ },
+ {
+ "epoch": 2.691725768321513,
+ "grad_norm": 2.669516086578369,
+ "learning_rate": 2.935690360436935e-06,
+ "loss": 0.3813,
+ "step": 5693
+ },
+ {
+ "epoch": 2.692198581560284,
+ "grad_norm": 2.8743274211883545,
+ "learning_rate": 2.935076066439622e-06,
+ "loss": 0.4302,
+ "step": 5694
+ },
+ {
+ "epoch": 2.6926713947990546,
+ "grad_norm": 2.6829612255096436,
+ "learning_rate": 2.9344617453520295e-06,
+ "loss": 0.4063,
+ "step": 5695
+ },
+ {
+ "epoch": 2.693144208037825,
+ "grad_norm": 2.776447057723999,
+ "learning_rate": 2.9338473972124097e-06,
+ "loss": 0.4921,
+ "step": 5696
+ },
+ {
+ "epoch": 2.6936170212765957,
+ "grad_norm": 2.7865772247314453,
+ "learning_rate": 2.9332330220590143e-06,
+ "loss": 0.4939,
+ "step": 5697
+ },
+ {
+ "epoch": 2.6940898345153665,
+ "grad_norm": 3.020526170730591,
+ "learning_rate": 2.932618619930098e-06,
+ "loss": 0.4839,
+ "step": 5698
+ },
+ {
+ "epoch": 2.694562647754137,
+ "grad_norm": 2.637057065963745,
+ "learning_rate": 2.932004190863918e-06,
+ "loss": 0.4343,
+ "step": 5699
+ },
+ {
+ "epoch": 2.6950354609929077,
+ "grad_norm": 2.7426512241363525,
+ "learning_rate": 2.9313897348987314e-06,
+ "loss": 0.3609,
+ "step": 5700
+ },
+ {
+ "epoch": 2.6955082742316785,
+ "grad_norm": 2.767186164855957,
+ "learning_rate": 2.9307752520727974e-06,
+ "loss": 0.3793,
+ "step": 5701
+ },
+ {
+ "epoch": 2.6959810874704493,
+ "grad_norm": 2.4791622161865234,
+ "learning_rate": 2.930160742424377e-06,
+ "loss": 0.4192,
+ "step": 5702
+ },
+ {
+ "epoch": 2.69645390070922,
+ "grad_norm": 2.661461591720581,
+ "learning_rate": 2.9295462059917336e-06,
+ "loss": 0.4758,
+ "step": 5703
+ },
+ {
+ "epoch": 2.6969267139479904,
+ "grad_norm": 2.896242380142212,
+ "learning_rate": 2.928931642813131e-06,
+ "loss": 0.42,
+ "step": 5704
+ },
+ {
+ "epoch": 2.6973995271867612,
+ "grad_norm": 2.783813238143921,
+ "learning_rate": 2.9283170529268366e-06,
+ "loss": 0.4726,
+ "step": 5705
+ },
+ {
+ "epoch": 2.697872340425532,
+ "grad_norm": 2.4347333908081055,
+ "learning_rate": 2.927702436371117e-06,
+ "loss": 0.4199,
+ "step": 5706
+ },
+ {
+ "epoch": 2.6983451536643024,
+ "grad_norm": 2.4643805027008057,
+ "learning_rate": 2.927087793184242e-06,
+ "loss": 0.3578,
+ "step": 5707
+ },
+ {
+ "epoch": 2.698817966903073,
+ "grad_norm": 2.6396660804748535,
+ "learning_rate": 2.9264731234044835e-06,
+ "loss": 0.4509,
+ "step": 5708
+ },
+ {
+ "epoch": 2.699290780141844,
+ "grad_norm": 2.7341182231903076,
+ "learning_rate": 2.925858427070113e-06,
+ "loss": 0.4331,
+ "step": 5709
+ },
+ {
+ "epoch": 2.699763593380615,
+ "grad_norm": 2.7578938007354736,
+ "learning_rate": 2.9252437042194058e-06,
+ "loss": 0.4508,
+ "step": 5710
+ },
+ {
+ "epoch": 2.7002364066193856,
+ "grad_norm": 2.557788133621216,
+ "learning_rate": 2.9246289548906375e-06,
+ "loss": 0.3775,
+ "step": 5711
+ },
+ {
+ "epoch": 2.700709219858156,
+ "grad_norm": 2.802851676940918,
+ "learning_rate": 2.924014179122086e-06,
+ "loss": 0.4518,
+ "step": 5712
+ },
+ {
+ "epoch": 2.7011820330969267,
+ "grad_norm": 2.4773001670837402,
+ "learning_rate": 2.9233993769520313e-06,
+ "loss": 0.4019,
+ "step": 5713
+ },
+ {
+ "epoch": 2.7016548463356975,
+ "grad_norm": 3.108971357345581,
+ "learning_rate": 2.922784548418754e-06,
+ "loss": 0.4715,
+ "step": 5714
+ },
+ {
+ "epoch": 2.702127659574468,
+ "grad_norm": 2.8596770763397217,
+ "learning_rate": 2.9221696935605366e-06,
+ "loss": 0.4361,
+ "step": 5715
+ },
+ {
+ "epoch": 2.7026004728132387,
+ "grad_norm": 2.570604085922241,
+ "learning_rate": 2.9215548124156633e-06,
+ "loss": 0.3982,
+ "step": 5716
+ },
+ {
+ "epoch": 2.7030732860520095,
+ "grad_norm": 2.3157799243927,
+ "learning_rate": 2.9209399050224206e-06,
+ "loss": 0.456,
+ "step": 5717
+ },
+ {
+ "epoch": 2.7035460992907803,
+ "grad_norm": 2.6865758895874023,
+ "learning_rate": 2.9203249714190952e-06,
+ "loss": 0.4441,
+ "step": 5718
+ },
+ {
+ "epoch": 2.704018912529551,
+ "grad_norm": 2.76723313331604,
+ "learning_rate": 2.919710011643978e-06,
+ "loss": 0.464,
+ "step": 5719
+ },
+ {
+ "epoch": 2.7044917257683214,
+ "grad_norm": 2.648792028427124,
+ "learning_rate": 2.9190950257353578e-06,
+ "loss": 0.3426,
+ "step": 5720
+ },
+ {
+ "epoch": 2.704964539007092,
+ "grad_norm": 2.878739833831787,
+ "learning_rate": 2.9184800137315276e-06,
+ "loss": 0.4431,
+ "step": 5721
+ },
+ {
+ "epoch": 2.705437352245863,
+ "grad_norm": 2.670567274093628,
+ "learning_rate": 2.917864975670783e-06,
+ "loss": 0.4347,
+ "step": 5722
+ },
+ {
+ "epoch": 2.7059101654846334,
+ "grad_norm": 2.7031569480895996,
+ "learning_rate": 2.9172499115914184e-06,
+ "loss": 0.4557,
+ "step": 5723
+ },
+ {
+ "epoch": 2.706382978723404,
+ "grad_norm": 2.5225696563720703,
+ "learning_rate": 2.9166348215317314e-06,
+ "loss": 0.4159,
+ "step": 5724
+ },
+ {
+ "epoch": 2.706855791962175,
+ "grad_norm": 2.8676085472106934,
+ "learning_rate": 2.916019705530021e-06,
+ "loss": 0.5018,
+ "step": 5725
+ },
+ {
+ "epoch": 2.7073286052009458,
+ "grad_norm": 2.576463460922241,
+ "learning_rate": 2.915404563624587e-06,
+ "loss": 0.4317,
+ "step": 5726
+ },
+ {
+ "epoch": 2.7078014184397166,
+ "grad_norm": 3.155565023422241,
+ "learning_rate": 2.9147893958537328e-06,
+ "loss": 0.5029,
+ "step": 5727
+ },
+ {
+ "epoch": 2.708274231678487,
+ "grad_norm": 2.604079008102417,
+ "learning_rate": 2.9141742022557622e-06,
+ "loss": 0.4324,
+ "step": 5728
+ },
+ {
+ "epoch": 2.7087470449172577,
+ "grad_norm": 2.6597228050231934,
+ "learning_rate": 2.913558982868979e-06,
+ "loss": 0.4335,
+ "step": 5729
+ },
+ {
+ "epoch": 2.7092198581560285,
+ "grad_norm": 2.811384439468384,
+ "learning_rate": 2.9129437377316923e-06,
+ "loss": 0.4031,
+ "step": 5730
+ },
+ {
+ "epoch": 2.709692671394799,
+ "grad_norm": 3.1041207313537598,
+ "learning_rate": 2.91232846688221e-06,
+ "loss": 0.481,
+ "step": 5731
+ },
+ {
+ "epoch": 2.7101654846335697,
+ "grad_norm": 2.5992188453674316,
+ "learning_rate": 2.9117131703588414e-06,
+ "loss": 0.4266,
+ "step": 5732
+ },
+ {
+ "epoch": 2.7106382978723405,
+ "grad_norm": 2.7726242542266846,
+ "learning_rate": 2.911097848199899e-06,
+ "loss": 0.4464,
+ "step": 5733
+ },
+ {
+ "epoch": 2.7111111111111112,
+ "grad_norm": 2.8683483600616455,
+ "learning_rate": 2.9104825004436966e-06,
+ "loss": 0.4248,
+ "step": 5734
+ },
+ {
+ "epoch": 2.711583924349882,
+ "grad_norm": 2.776386022567749,
+ "learning_rate": 2.9098671271285484e-06,
+ "loss": 0.4556,
+ "step": 5735
+ },
+ {
+ "epoch": 2.7120567375886524,
+ "grad_norm": 2.7612528800964355,
+ "learning_rate": 2.909251728292771e-06,
+ "loss": 0.455,
+ "step": 5736
+ },
+ {
+ "epoch": 2.712529550827423,
+ "grad_norm": 2.9223551750183105,
+ "learning_rate": 2.908636303974684e-06,
+ "loss": 0.4302,
+ "step": 5737
+ },
+ {
+ "epoch": 2.713002364066194,
+ "grad_norm": 2.898226022720337,
+ "learning_rate": 2.908020854212606e-06,
+ "loss": 0.4827,
+ "step": 5738
+ },
+ {
+ "epoch": 2.7134751773049643,
+ "grad_norm": 2.706361770629883,
+ "learning_rate": 2.9074053790448576e-06,
+ "loss": 0.4444,
+ "step": 5739
+ },
+ {
+ "epoch": 2.713947990543735,
+ "grad_norm": 2.8227248191833496,
+ "learning_rate": 2.9067898785097637e-06,
+ "loss": 0.4661,
+ "step": 5740
+ },
+ {
+ "epoch": 2.714420803782506,
+ "grad_norm": 2.597837448120117,
+ "learning_rate": 2.9061743526456474e-06,
+ "loss": 0.4646,
+ "step": 5741
+ },
+ {
+ "epoch": 2.7148936170212767,
+ "grad_norm": 2.5525131225585938,
+ "learning_rate": 2.9055588014908354e-06,
+ "loss": 0.4172,
+ "step": 5742
+ },
+ {
+ "epoch": 2.7153664302600475,
+ "grad_norm": 2.713071823120117,
+ "learning_rate": 2.904943225083655e-06,
+ "loss": 0.4893,
+ "step": 5743
+ },
+ {
+ "epoch": 2.715839243498818,
+ "grad_norm": 2.538623571395874,
+ "learning_rate": 2.9043276234624353e-06,
+ "loss": 0.3905,
+ "step": 5744
+ },
+ {
+ "epoch": 2.7163120567375887,
+ "grad_norm": 2.5190389156341553,
+ "learning_rate": 2.9037119966655076e-06,
+ "loss": 0.4318,
+ "step": 5745
+ },
+ {
+ "epoch": 2.7167848699763595,
+ "grad_norm": 2.6587612628936768,
+ "learning_rate": 2.903096344731204e-06,
+ "loss": 0.4153,
+ "step": 5746
+ },
+ {
+ "epoch": 2.71725768321513,
+ "grad_norm": 2.836731433868408,
+ "learning_rate": 2.902480667697859e-06,
+ "loss": 0.4779,
+ "step": 5747
+ },
+ {
+ "epoch": 2.7177304964539006,
+ "grad_norm": 2.8076045513153076,
+ "learning_rate": 2.9018649656038074e-06,
+ "loss": 0.5126,
+ "step": 5748
+ },
+ {
+ "epoch": 2.7182033096926714,
+ "grad_norm": 2.8930516242980957,
+ "learning_rate": 2.9012492384873865e-06,
+ "loss": 0.4561,
+ "step": 5749
+ },
+ {
+ "epoch": 2.7186761229314422,
+ "grad_norm": 2.7000370025634766,
+ "learning_rate": 2.9006334863869343e-06,
+ "loss": 0.4659,
+ "step": 5750
+ },
+ {
+ "epoch": 2.719148936170213,
+ "grad_norm": 2.927011251449585,
+ "learning_rate": 2.9000177093407926e-06,
+ "loss": 0.5123,
+ "step": 5751
+ },
+ {
+ "epoch": 2.7196217494089834,
+ "grad_norm": 3.0102779865264893,
+ "learning_rate": 2.8994019073873015e-06,
+ "loss": 0.3972,
+ "step": 5752
+ },
+ {
+ "epoch": 2.720094562647754,
+ "grad_norm": 2.778838634490967,
+ "learning_rate": 2.8987860805648054e-06,
+ "loss": 0.4922,
+ "step": 5753
+ },
+ {
+ "epoch": 2.720567375886525,
+ "grad_norm": 2.6150314807891846,
+ "learning_rate": 2.898170228911648e-06,
+ "loss": 0.4425,
+ "step": 5754
+ },
+ {
+ "epoch": 2.7210401891252953,
+ "grad_norm": 2.9329984188079834,
+ "learning_rate": 2.8975543524661777e-06,
+ "loss": 0.4872,
+ "step": 5755
+ },
+ {
+ "epoch": 2.721513002364066,
+ "grad_norm": 2.756803512573242,
+ "learning_rate": 2.8969384512667404e-06,
+ "loss": 0.4362,
+ "step": 5756
+ },
+ {
+ "epoch": 2.721985815602837,
+ "grad_norm": 2.600877285003662,
+ "learning_rate": 2.896322525351686e-06,
+ "loss": 0.4802,
+ "step": 5757
+ },
+ {
+ "epoch": 2.7224586288416077,
+ "grad_norm": 2.647069215774536,
+ "learning_rate": 2.8957065747593655e-06,
+ "loss": 0.4649,
+ "step": 5758
+ },
+ {
+ "epoch": 2.7229314420803785,
+ "grad_norm": 2.845388174057007,
+ "learning_rate": 2.895090599528132e-06,
+ "loss": 0.4533,
+ "step": 5759
+ },
+ {
+ "epoch": 2.723404255319149,
+ "grad_norm": 2.973881721496582,
+ "learning_rate": 2.8944745996963397e-06,
+ "loss": 0.4959,
+ "step": 5760
+ },
+ {
+ "epoch": 2.7238770685579197,
+ "grad_norm": 2.8995487689971924,
+ "learning_rate": 2.8938585753023435e-06,
+ "loss": 0.4597,
+ "step": 5761
+ },
+ {
+ "epoch": 2.7243498817966905,
+ "grad_norm": 2.903693437576294,
+ "learning_rate": 2.8932425263845004e-06,
+ "loss": 0.4521,
+ "step": 5762
+ },
+ {
+ "epoch": 2.724822695035461,
+ "grad_norm": 2.7609009742736816,
+ "learning_rate": 2.8926264529811702e-06,
+ "loss": 0.4399,
+ "step": 5763
+ },
+ {
+ "epoch": 2.7252955082742316,
+ "grad_norm": 2.788787603378296,
+ "learning_rate": 2.892010355130712e-06,
+ "loss": 0.4614,
+ "step": 5764
+ },
+ {
+ "epoch": 2.7257683215130024,
+ "grad_norm": 2.786498785018921,
+ "learning_rate": 2.8913942328714887e-06,
+ "loss": 0.4798,
+ "step": 5765
+ },
+ {
+ "epoch": 2.726241134751773,
+ "grad_norm": 2.9809393882751465,
+ "learning_rate": 2.8907780862418616e-06,
+ "loss": 0.5108,
+ "step": 5766
+ },
+ {
+ "epoch": 2.726713947990544,
+ "grad_norm": 2.6621177196502686,
+ "learning_rate": 2.8901619152801967e-06,
+ "loss": 0.4031,
+ "step": 5767
+ },
+ {
+ "epoch": 2.7271867612293144,
+ "grad_norm": 3.3092098236083984,
+ "learning_rate": 2.8895457200248607e-06,
+ "loss": 0.4671,
+ "step": 5768
+ },
+ {
+ "epoch": 2.727659574468085,
+ "grad_norm": 2.866306781768799,
+ "learning_rate": 2.8889295005142204e-06,
+ "loss": 0.4434,
+ "step": 5769
+ },
+ {
+ "epoch": 2.728132387706856,
+ "grad_norm": 2.6861231327056885,
+ "learning_rate": 2.888313256786646e-06,
+ "loss": 0.429,
+ "step": 5770
+ },
+ {
+ "epoch": 2.7286052009456263,
+ "grad_norm": 2.873180389404297,
+ "learning_rate": 2.8876969888805072e-06,
+ "loss": 0.4412,
+ "step": 5771
+ },
+ {
+ "epoch": 2.729078014184397,
+ "grad_norm": 2.511678695678711,
+ "learning_rate": 2.887080696834178e-06,
+ "loss": 0.4024,
+ "step": 5772
+ },
+ {
+ "epoch": 2.729550827423168,
+ "grad_norm": 2.6502726078033447,
+ "learning_rate": 2.88646438068603e-06,
+ "loss": 0.4357,
+ "step": 5773
+ },
+ {
+ "epoch": 2.7300236406619387,
+ "grad_norm": 2.7156145572662354,
+ "learning_rate": 2.8858480404744403e-06,
+ "loss": 0.4511,
+ "step": 5774
+ },
+ {
+ "epoch": 2.7304964539007095,
+ "grad_norm": 2.882582187652588,
+ "learning_rate": 2.8852316762377842e-06,
+ "loss": 0.4822,
+ "step": 5775
+ },
+ {
+ "epoch": 2.73096926713948,
+ "grad_norm": 2.7139666080474854,
+ "learning_rate": 2.8846152880144413e-06,
+ "loss": 0.4666,
+ "step": 5776
+ },
+ {
+ "epoch": 2.7314420803782506,
+ "grad_norm": 2.7453949451446533,
+ "learning_rate": 2.8839988758427907e-06,
+ "loss": 0.3927,
+ "step": 5777
+ },
+ {
+ "epoch": 2.731914893617021,
+ "grad_norm": 2.7859580516815186,
+ "learning_rate": 2.883382439761214e-06,
+ "loss": 0.4466,
+ "step": 5778
+ },
+ {
+ "epoch": 2.732387706855792,
+ "grad_norm": 2.695234537124634,
+ "learning_rate": 2.882765979808094e-06,
+ "loss": 0.4227,
+ "step": 5779
+ },
+ {
+ "epoch": 2.7328605200945626,
+ "grad_norm": 2.8081552982330322,
+ "learning_rate": 2.8821494960218148e-06,
+ "loss": 0.447,
+ "step": 5780
+ },
+ {
+ "epoch": 2.7333333333333334,
+ "grad_norm": 2.887643337249756,
+ "learning_rate": 2.881532988440762e-06,
+ "loss": 0.5018,
+ "step": 5781
+ },
+ {
+ "epoch": 2.733806146572104,
+ "grad_norm": 3.108212471008301,
+ "learning_rate": 2.8809164571033233e-06,
+ "loss": 0.4132,
+ "step": 5782
+ },
+ {
+ "epoch": 2.7342789598108745,
+ "grad_norm": 2.874328374862671,
+ "learning_rate": 2.880299902047886e-06,
+ "loss": 0.4618,
+ "step": 5783
+ },
+ {
+ "epoch": 2.7347517730496453,
+ "grad_norm": 3.089132308959961,
+ "learning_rate": 2.879683323312843e-06,
+ "loss": 0.4956,
+ "step": 5784
+ },
+ {
+ "epoch": 2.735224586288416,
+ "grad_norm": 2.5173206329345703,
+ "learning_rate": 2.879066720936583e-06,
+ "loss": 0.4087,
+ "step": 5785
+ },
+ {
+ "epoch": 2.7356973995271865,
+ "grad_norm": 2.6401286125183105,
+ "learning_rate": 2.8784500949575014e-06,
+ "loss": 0.3995,
+ "step": 5786
+ },
+ {
+ "epoch": 2.7361702127659573,
+ "grad_norm": 2.9371910095214844,
+ "learning_rate": 2.877833445413991e-06,
+ "loss": 0.5209,
+ "step": 5787
+ },
+ {
+ "epoch": 2.736643026004728,
+ "grad_norm": 3.218158006668091,
+ "learning_rate": 2.8772167723444498e-06,
+ "loss": 0.4275,
+ "step": 5788
+ },
+ {
+ "epoch": 2.737115839243499,
+ "grad_norm": 2.9072160720825195,
+ "learning_rate": 2.8766000757872736e-06,
+ "loss": 0.4244,
+ "step": 5789
+ },
+ {
+ "epoch": 2.7375886524822697,
+ "grad_norm": 3.0378096103668213,
+ "learning_rate": 2.8759833557808614e-06,
+ "loss": 0.507,
+ "step": 5790
+ },
+ {
+ "epoch": 2.73806146572104,
+ "grad_norm": 2.728353977203369,
+ "learning_rate": 2.8753666123636148e-06,
+ "loss": 0.413,
+ "step": 5791
+ },
+ {
+ "epoch": 2.738534278959811,
+ "grad_norm": 2.6869957447052,
+ "learning_rate": 2.874749845573935e-06,
+ "loss": 0.44,
+ "step": 5792
+ },
+ {
+ "epoch": 2.7390070921985816,
+ "grad_norm": 2.6381702423095703,
+ "learning_rate": 2.8741330554502263e-06,
+ "loss": 0.4708,
+ "step": 5793
+ },
+ {
+ "epoch": 2.739479905437352,
+ "grad_norm": 2.6944689750671387,
+ "learning_rate": 2.873516242030892e-06,
+ "loss": 0.4555,
+ "step": 5794
+ },
+ {
+ "epoch": 2.739952718676123,
+ "grad_norm": 3.168473243713379,
+ "learning_rate": 2.8728994053543396e-06,
+ "loss": 0.4538,
+ "step": 5795
+ },
+ {
+ "epoch": 2.7404255319148936,
+ "grad_norm": 2.7504515647888184,
+ "learning_rate": 2.872282545458976e-06,
+ "loss": 0.4628,
+ "step": 5796
+ },
+ {
+ "epoch": 2.7408983451536644,
+ "grad_norm": 2.896462917327881,
+ "learning_rate": 2.8716656623832114e-06,
+ "loss": 0.4946,
+ "step": 5797
+ },
+ {
+ "epoch": 2.741371158392435,
+ "grad_norm": 2.8053417205810547,
+ "learning_rate": 2.8710487561654547e-06,
+ "loss": 0.4893,
+ "step": 5798
+ },
+ {
+ "epoch": 2.7418439716312055,
+ "grad_norm": 2.63171124458313,
+ "learning_rate": 2.870431826844119e-06,
+ "loss": 0.4257,
+ "step": 5799
+ },
+ {
+ "epoch": 2.7423167848699763,
+ "grad_norm": 3.0963807106018066,
+ "learning_rate": 2.869814874457618e-06,
+ "loss": 0.5404,
+ "step": 5800
+ },
+ {
+ "epoch": 2.742789598108747,
+ "grad_norm": 2.591132164001465,
+ "learning_rate": 2.8691978990443664e-06,
+ "loss": 0.4015,
+ "step": 5801
+ },
+ {
+ "epoch": 2.7432624113475175,
+ "grad_norm": 3.0319552421569824,
+ "learning_rate": 2.8685809006427812e-06,
+ "loss": 0.4411,
+ "step": 5802
+ },
+ {
+ "epoch": 2.7437352245862883,
+ "grad_norm": 2.7791874408721924,
+ "learning_rate": 2.8679638792912784e-06,
+ "loss": 0.43,
+ "step": 5803
+ },
+ {
+ "epoch": 2.744208037825059,
+ "grad_norm": 3.530632495880127,
+ "learning_rate": 2.867346835028279e-06,
+ "loss": 0.4581,
+ "step": 5804
+ },
+ {
+ "epoch": 2.74468085106383,
+ "grad_norm": 3.2043099403381348,
+ "learning_rate": 2.8667297678922024e-06,
+ "loss": 0.4375,
+ "step": 5805
+ },
+ {
+ "epoch": 2.7451536643026007,
+ "grad_norm": 2.8442344665527344,
+ "learning_rate": 2.8661126779214716e-06,
+ "loss": 0.4059,
+ "step": 5806
+ },
+ {
+ "epoch": 2.745626477541371,
+ "grad_norm": 2.7561380863189697,
+ "learning_rate": 2.86549556515451e-06,
+ "loss": 0.4391,
+ "step": 5807
+ },
+ {
+ "epoch": 2.746099290780142,
+ "grad_norm": 3.229663848876953,
+ "learning_rate": 2.8648784296297418e-06,
+ "loss": 0.4579,
+ "step": 5808
+ },
+ {
+ "epoch": 2.7465721040189126,
+ "grad_norm": 2.8375027179718018,
+ "learning_rate": 2.864261271385593e-06,
+ "loss": 0.4566,
+ "step": 5809
+ },
+ {
+ "epoch": 2.747044917257683,
+ "grad_norm": 2.392998695373535,
+ "learning_rate": 2.863644090460493e-06,
+ "loss": 0.4123,
+ "step": 5810
+ },
+ {
+ "epoch": 2.7475177304964538,
+ "grad_norm": 2.707610607147217,
+ "learning_rate": 2.86302688689287e-06,
+ "loss": 0.4299,
+ "step": 5811
+ },
+ {
+ "epoch": 2.7479905437352246,
+ "grad_norm": 2.824042797088623,
+ "learning_rate": 2.8624096607211547e-06,
+ "loss": 0.3799,
+ "step": 5812
+ },
+ {
+ "epoch": 2.7484633569739954,
+ "grad_norm": 2.933102607727051,
+ "learning_rate": 2.861792411983779e-06,
+ "loss": 0.4416,
+ "step": 5813
+ },
+ {
+ "epoch": 2.748936170212766,
+ "grad_norm": 2.7000277042388916,
+ "learning_rate": 2.8611751407191757e-06,
+ "loss": 0.4346,
+ "step": 5814
+ },
+ {
+ "epoch": 2.7494089834515365,
+ "grad_norm": 2.6757142543792725,
+ "learning_rate": 2.86055784696578e-06,
+ "loss": 0.4559,
+ "step": 5815
+ },
+ {
+ "epoch": 2.7498817966903073,
+ "grad_norm": 2.5791053771972656,
+ "learning_rate": 2.8599405307620287e-06,
+ "loss": 0.4619,
+ "step": 5816
+ },
+ {
+ "epoch": 2.750354609929078,
+ "grad_norm": 3.0327374935150146,
+ "learning_rate": 2.859323192146359e-06,
+ "loss": 0.4163,
+ "step": 5817
+ },
+ {
+ "epoch": 2.7508274231678485,
+ "grad_norm": 2.5580220222473145,
+ "learning_rate": 2.8587058311572084e-06,
+ "loss": 0.4005,
+ "step": 5818
+ },
+ {
+ "epoch": 2.7513002364066192,
+ "grad_norm": 2.592179536819458,
+ "learning_rate": 2.85808844783302e-06,
+ "loss": 0.4404,
+ "step": 5819
+ },
+ {
+ "epoch": 2.75177304964539,
+ "grad_norm": 3.2779927253723145,
+ "learning_rate": 2.8574710422122342e-06,
+ "loss": 0.54,
+ "step": 5820
+ },
+ {
+ "epoch": 2.752245862884161,
+ "grad_norm": 2.4804370403289795,
+ "learning_rate": 2.8568536143332933e-06,
+ "loss": 0.4476,
+ "step": 5821
+ },
+ {
+ "epoch": 2.7527186761229316,
+ "grad_norm": 2.649477481842041,
+ "learning_rate": 2.8562361642346427e-06,
+ "loss": 0.4336,
+ "step": 5822
+ },
+ {
+ "epoch": 2.753191489361702,
+ "grad_norm": 3.138587474822998,
+ "learning_rate": 2.855618691954728e-06,
+ "loss": 0.5042,
+ "step": 5823
+ },
+ {
+ "epoch": 2.753664302600473,
+ "grad_norm": 2.75093412399292,
+ "learning_rate": 2.855001197531997e-06,
+ "loss": 0.4327,
+ "step": 5824
+ },
+ {
+ "epoch": 2.7541371158392436,
+ "grad_norm": 2.678809642791748,
+ "learning_rate": 2.854383681004898e-06,
+ "loss": 0.4409,
+ "step": 5825
+ },
+ {
+ "epoch": 2.754609929078014,
+ "grad_norm": 2.965386390686035,
+ "learning_rate": 2.853766142411881e-06,
+ "loss": 0.4716,
+ "step": 5826
+ },
+ {
+ "epoch": 2.7550827423167847,
+ "grad_norm": 2.6419436931610107,
+ "learning_rate": 2.853148581791398e-06,
+ "loss": 0.4367,
+ "step": 5827
+ },
+ {
+ "epoch": 2.7555555555555555,
+ "grad_norm": 3.205794095993042,
+ "learning_rate": 2.8525309991819004e-06,
+ "loss": 0.4869,
+ "step": 5828
+ },
+ {
+ "epoch": 2.7560283687943263,
+ "grad_norm": 3.041008472442627,
+ "learning_rate": 2.851913394621844e-06,
+ "loss": 0.5087,
+ "step": 5829
+ },
+ {
+ "epoch": 2.756501182033097,
+ "grad_norm": 2.6525566577911377,
+ "learning_rate": 2.851295768149684e-06,
+ "loss": 0.3951,
+ "step": 5830
+ },
+ {
+ "epoch": 2.7569739952718675,
+ "grad_norm": 2.732220411300659,
+ "learning_rate": 2.850678119803876e-06,
+ "loss": 0.4797,
+ "step": 5831
+ },
+ {
+ "epoch": 2.7574468085106383,
+ "grad_norm": 2.8965251445770264,
+ "learning_rate": 2.8500604496228797e-06,
+ "loss": 0.4938,
+ "step": 5832
+ },
+ {
+ "epoch": 2.757919621749409,
+ "grad_norm": 2.48020076751709,
+ "learning_rate": 2.849442757645154e-06,
+ "loss": 0.4172,
+ "step": 5833
+ },
+ {
+ "epoch": 2.7583924349881794,
+ "grad_norm": 2.4764912128448486,
+ "learning_rate": 2.8488250439091603e-06,
+ "loss": 0.4123,
+ "step": 5834
+ },
+ {
+ "epoch": 2.7588652482269502,
+ "grad_norm": 2.4547016620635986,
+ "learning_rate": 2.84820730845336e-06,
+ "loss": 0.4116,
+ "step": 5835
+ },
+ {
+ "epoch": 2.759338061465721,
+ "grad_norm": 2.55476975440979,
+ "learning_rate": 2.847589551316218e-06,
+ "loss": 0.4744,
+ "step": 5836
+ },
+ {
+ "epoch": 2.759810874704492,
+ "grad_norm": 2.3866238594055176,
+ "learning_rate": 2.846971772536199e-06,
+ "loss": 0.4406,
+ "step": 5837
+ },
+ {
+ "epoch": 2.7602836879432626,
+ "grad_norm": 2.855318784713745,
+ "learning_rate": 2.8463539721517687e-06,
+ "loss": 0.4517,
+ "step": 5838
+ },
+ {
+ "epoch": 2.760756501182033,
+ "grad_norm": 2.527198314666748,
+ "learning_rate": 2.8457361502013954e-06,
+ "loss": 0.3588,
+ "step": 5839
+ },
+ {
+ "epoch": 2.7612293144208038,
+ "grad_norm": 2.6761462688446045,
+ "learning_rate": 2.8451183067235476e-06,
+ "loss": 0.4192,
+ "step": 5840
+ },
+ {
+ "epoch": 2.7617021276595746,
+ "grad_norm": 2.5692319869995117,
+ "learning_rate": 2.8445004417566967e-06,
+ "loss": 0.4108,
+ "step": 5841
+ },
+ {
+ "epoch": 2.762174940898345,
+ "grad_norm": 2.5721096992492676,
+ "learning_rate": 2.8438825553393133e-06,
+ "loss": 0.3941,
+ "step": 5842
+ },
+ {
+ "epoch": 2.7626477541371157,
+ "grad_norm": 2.699430227279663,
+ "learning_rate": 2.843264647509872e-06,
+ "loss": 0.4418,
+ "step": 5843
+ },
+ {
+ "epoch": 2.7631205673758865,
+ "grad_norm": 2.6943318843841553,
+ "learning_rate": 2.842646718306846e-06,
+ "loss": 0.4505,
+ "step": 5844
+ },
+ {
+ "epoch": 2.7635933806146573,
+ "grad_norm": 2.661656379699707,
+ "learning_rate": 2.8420287677687107e-06,
+ "loss": 0.4413,
+ "step": 5845
+ },
+ {
+ "epoch": 2.764066193853428,
+ "grad_norm": 2.830467939376831,
+ "learning_rate": 2.8414107959339444e-06,
+ "loss": 0.5095,
+ "step": 5846
+ },
+ {
+ "epoch": 2.7645390070921985,
+ "grad_norm": 2.598053455352783,
+ "learning_rate": 2.840792802841024e-06,
+ "loss": 0.4029,
+ "step": 5847
+ },
+ {
+ "epoch": 2.7650118203309693,
+ "grad_norm": 2.641700029373169,
+ "learning_rate": 2.8401747885284316e-06,
+ "loss": 0.4237,
+ "step": 5848
+ },
+ {
+ "epoch": 2.76548463356974,
+ "grad_norm": 2.6672768592834473,
+ "learning_rate": 2.8395567530346454e-06,
+ "loss": 0.4181,
+ "step": 5849
+ },
+ {
+ "epoch": 2.7659574468085104,
+ "grad_norm": 2.5851705074310303,
+ "learning_rate": 2.838938696398149e-06,
+ "loss": 0.4165,
+ "step": 5850
+ },
+ {
+ "epoch": 2.766430260047281,
+ "grad_norm": 2.318120002746582,
+ "learning_rate": 2.8383206186574276e-06,
+ "loss": 0.3578,
+ "step": 5851
+ },
+ {
+ "epoch": 2.766903073286052,
+ "grad_norm": 2.6199793815612793,
+ "learning_rate": 2.8377025198509635e-06,
+ "loss": 0.4719,
+ "step": 5852
+ },
+ {
+ "epoch": 2.767375886524823,
+ "grad_norm": 2.7186086177825928,
+ "learning_rate": 2.837084400017245e-06,
+ "loss": 0.41,
+ "step": 5853
+ },
+ {
+ "epoch": 2.7678486997635936,
+ "grad_norm": 2.702514886856079,
+ "learning_rate": 2.8364662591947583e-06,
+ "loss": 0.4659,
+ "step": 5854
+ },
+ {
+ "epoch": 2.768321513002364,
+ "grad_norm": 2.612375259399414,
+ "learning_rate": 2.835848097421993e-06,
+ "loss": 0.4252,
+ "step": 5855
+ },
+ {
+ "epoch": 2.7687943262411348,
+ "grad_norm": 3.0127978324890137,
+ "learning_rate": 2.8352299147374394e-06,
+ "loss": 0.4084,
+ "step": 5856
+ },
+ {
+ "epoch": 2.7692671394799055,
+ "grad_norm": 2.6460049152374268,
+ "learning_rate": 2.83461171117959e-06,
+ "loss": 0.4035,
+ "step": 5857
+ },
+ {
+ "epoch": 2.769739952718676,
+ "grad_norm": 2.9844725131988525,
+ "learning_rate": 2.8339934867869357e-06,
+ "loss": 0.4912,
+ "step": 5858
+ },
+ {
+ "epoch": 2.7702127659574467,
+ "grad_norm": 2.731217861175537,
+ "learning_rate": 2.833375241597972e-06,
+ "loss": 0.4112,
+ "step": 5859
+ },
+ {
+ "epoch": 2.7706855791962175,
+ "grad_norm": 2.731194496154785,
+ "learning_rate": 2.832756975651193e-06,
+ "loss": 0.4516,
+ "step": 5860
+ },
+ {
+ "epoch": 2.7711583924349883,
+ "grad_norm": 3.0532076358795166,
+ "learning_rate": 2.8321386889850965e-06,
+ "loss": 0.3959,
+ "step": 5861
+ },
+ {
+ "epoch": 2.771631205673759,
+ "grad_norm": 3.5437800884246826,
+ "learning_rate": 2.831520381638181e-06,
+ "loss": 0.6055,
+ "step": 5862
+ },
+ {
+ "epoch": 2.7721040189125294,
+ "grad_norm": 2.4297714233398438,
+ "learning_rate": 2.830902053648944e-06,
+ "loss": 0.4038,
+ "step": 5863
+ },
+ {
+ "epoch": 2.7725768321513002,
+ "grad_norm": 2.696768045425415,
+ "learning_rate": 2.8302837050558876e-06,
+ "loss": 0.3983,
+ "step": 5864
+ },
+ {
+ "epoch": 2.773049645390071,
+ "grad_norm": 2.6574649810791016,
+ "learning_rate": 2.8296653358975122e-06,
+ "loss": 0.4937,
+ "step": 5865
+ },
+ {
+ "epoch": 2.7735224586288414,
+ "grad_norm": 2.9393341541290283,
+ "learning_rate": 2.8290469462123234e-06,
+ "loss": 0.4603,
+ "step": 5866
+ },
+ {
+ "epoch": 2.773995271867612,
+ "grad_norm": 2.7630696296691895,
+ "learning_rate": 2.828428536038824e-06,
+ "loss": 0.4663,
+ "step": 5867
+ },
+ {
+ "epoch": 2.774468085106383,
+ "grad_norm": 2.7354233264923096,
+ "learning_rate": 2.8278101054155183e-06,
+ "loss": 0.4444,
+ "step": 5868
+ },
+ {
+ "epoch": 2.774940898345154,
+ "grad_norm": 3.0489425659179688,
+ "learning_rate": 2.827191654380915e-06,
+ "loss": 0.4684,
+ "step": 5869
+ },
+ {
+ "epoch": 2.7754137115839246,
+ "grad_norm": 2.9602572917938232,
+ "learning_rate": 2.8265731829735226e-06,
+ "loss": 0.4571,
+ "step": 5870
+ },
+ {
+ "epoch": 2.775886524822695,
+ "grad_norm": 2.774132013320923,
+ "learning_rate": 2.825954691231851e-06,
+ "loss": 0.4458,
+ "step": 5871
+ },
+ {
+ "epoch": 2.7763593380614657,
+ "grad_norm": 2.696622133255005,
+ "learning_rate": 2.825336179194409e-06,
+ "loss": 0.4933,
+ "step": 5872
+ },
+ {
+ "epoch": 2.7768321513002365,
+ "grad_norm": 2.742184638977051,
+ "learning_rate": 2.8247176468997096e-06,
+ "loss": 0.4464,
+ "step": 5873
+ },
+ {
+ "epoch": 2.777304964539007,
+ "grad_norm": 2.7033183574676514,
+ "learning_rate": 2.824099094386266e-06,
+ "loss": 0.4369,
+ "step": 5874
+ },
+ {
+ "epoch": 2.7777777777777777,
+ "grad_norm": 2.7264044284820557,
+ "learning_rate": 2.8234805216925935e-06,
+ "loss": 0.4621,
+ "step": 5875
+ },
+ {
+ "epoch": 2.7782505910165485,
+ "grad_norm": 2.6417739391326904,
+ "learning_rate": 2.822861928857208e-06,
+ "loss": 0.4254,
+ "step": 5876
+ },
+ {
+ "epoch": 2.7787234042553193,
+ "grad_norm": 3.17209529876709,
+ "learning_rate": 2.8222433159186245e-06,
+ "loss": 0.5011,
+ "step": 5877
+ },
+ {
+ "epoch": 2.77919621749409,
+ "grad_norm": 3.1434381008148193,
+ "learning_rate": 2.8216246829153633e-06,
+ "loss": 0.4567,
+ "step": 5878
+ },
+ {
+ "epoch": 2.7796690307328604,
+ "grad_norm": 2.781608819961548,
+ "learning_rate": 2.821006029885943e-06,
+ "loss": 0.4723,
+ "step": 5879
+ },
+ {
+ "epoch": 2.780141843971631,
+ "grad_norm": 3.00079345703125,
+ "learning_rate": 2.820387356868885e-06,
+ "loss": 0.4796,
+ "step": 5880
+ },
+ {
+ "epoch": 2.780614657210402,
+ "grad_norm": 2.703555107116699,
+ "learning_rate": 2.819768663902712e-06,
+ "loss": 0.4577,
+ "step": 5881
+ },
+ {
+ "epoch": 2.7810874704491724,
+ "grad_norm": 2.5741801261901855,
+ "learning_rate": 2.8191499510259453e-06,
+ "loss": 0.4255,
+ "step": 5882
+ },
+ {
+ "epoch": 2.781560283687943,
+ "grad_norm": 2.9871208667755127,
+ "learning_rate": 2.8185312182771112e-06,
+ "loss": 0.4495,
+ "step": 5883
+ },
+ {
+ "epoch": 2.782033096926714,
+ "grad_norm": 2.525317668914795,
+ "learning_rate": 2.8179124656947343e-06,
+ "loss": 0.4428,
+ "step": 5884
+ },
+ {
+ "epoch": 2.7825059101654848,
+ "grad_norm": 2.525092840194702,
+ "learning_rate": 2.817293693317343e-06,
+ "loss": 0.4348,
+ "step": 5885
+ },
+ {
+ "epoch": 2.7829787234042556,
+ "grad_norm": 2.8485171794891357,
+ "learning_rate": 2.816674901183464e-06,
+ "loss": 0.4206,
+ "step": 5886
+ },
+ {
+ "epoch": 2.783451536643026,
+ "grad_norm": 2.6612746715545654,
+ "learning_rate": 2.8160560893316272e-06,
+ "loss": 0.396,
+ "step": 5887
+ },
+ {
+ "epoch": 2.7839243498817967,
+ "grad_norm": 2.7093865871429443,
+ "learning_rate": 2.815437257800364e-06,
+ "loss": 0.4468,
+ "step": 5888
+ },
+ {
+ "epoch": 2.7843971631205675,
+ "grad_norm": 2.6130900382995605,
+ "learning_rate": 2.814818406628206e-06,
+ "loss": 0.443,
+ "step": 5889
+ },
+ {
+ "epoch": 2.784869976359338,
+ "grad_norm": 2.8147552013397217,
+ "learning_rate": 2.8141995358536866e-06,
+ "loss": 0.4454,
+ "step": 5890
+ },
+ {
+ "epoch": 2.7853427895981087,
+ "grad_norm": 2.5621275901794434,
+ "learning_rate": 2.8135806455153395e-06,
+ "loss": 0.439,
+ "step": 5891
+ },
+ {
+ "epoch": 2.7858156028368795,
+ "grad_norm": 2.880228281021118,
+ "learning_rate": 2.812961735651701e-06,
+ "loss": 0.3895,
+ "step": 5892
+ },
+ {
+ "epoch": 2.7862884160756503,
+ "grad_norm": 2.5861377716064453,
+ "learning_rate": 2.8123428063013068e-06,
+ "loss": 0.4402,
+ "step": 5893
+ },
+ {
+ "epoch": 2.786761229314421,
+ "grad_norm": 2.9707765579223633,
+ "learning_rate": 2.811723857502696e-06,
+ "loss": 0.4461,
+ "step": 5894
+ },
+ {
+ "epoch": 2.7872340425531914,
+ "grad_norm": 2.923999309539795,
+ "learning_rate": 2.811104889294408e-06,
+ "loss": 0.4395,
+ "step": 5895
+ },
+ {
+ "epoch": 2.787706855791962,
+ "grad_norm": 2.846933603286743,
+ "learning_rate": 2.810485901714981e-06,
+ "loss": 0.5168,
+ "step": 5896
+ },
+ {
+ "epoch": 2.788179669030733,
+ "grad_norm": 4.1052350997924805,
+ "learning_rate": 2.8098668948029597e-06,
+ "loss": 0.5152,
+ "step": 5897
+ },
+ {
+ "epoch": 2.7886524822695034,
+ "grad_norm": 2.7391018867492676,
+ "learning_rate": 2.8092478685968856e-06,
+ "loss": 0.4515,
+ "step": 5898
+ },
+ {
+ "epoch": 2.789125295508274,
+ "grad_norm": 2.976088285446167,
+ "learning_rate": 2.8086288231353027e-06,
+ "loss": 0.5156,
+ "step": 5899
+ },
+ {
+ "epoch": 2.789598108747045,
+ "grad_norm": 2.6139633655548096,
+ "learning_rate": 2.8080097584567562e-06,
+ "loss": 0.4237,
+ "step": 5900
+ },
+ {
+ "epoch": 2.7900709219858157,
+ "grad_norm": 2.501654624938965,
+ "learning_rate": 2.807390674599792e-06,
+ "loss": 0.4349,
+ "step": 5901
+ },
+ {
+ "epoch": 2.7905437352245865,
+ "grad_norm": 2.8814525604248047,
+ "learning_rate": 2.8067715716029586e-06,
+ "loss": 0.4866,
+ "step": 5902
+ },
+ {
+ "epoch": 2.791016548463357,
+ "grad_norm": 2.7953200340270996,
+ "learning_rate": 2.8061524495048046e-06,
+ "loss": 0.3964,
+ "step": 5903
+ },
+ {
+ "epoch": 2.7914893617021277,
+ "grad_norm": 2.7362849712371826,
+ "learning_rate": 2.8055333083438808e-06,
+ "loss": 0.4181,
+ "step": 5904
+ },
+ {
+ "epoch": 2.7919621749408985,
+ "grad_norm": 2.9740512371063232,
+ "learning_rate": 2.8049141481587366e-06,
+ "loss": 0.4784,
+ "step": 5905
+ },
+ {
+ "epoch": 2.792434988179669,
+ "grad_norm": 2.595813274383545,
+ "learning_rate": 2.8042949689879262e-06,
+ "loss": 0.4421,
+ "step": 5906
+ },
+ {
+ "epoch": 2.7929078014184396,
+ "grad_norm": 2.886899948120117,
+ "learning_rate": 2.803675770870002e-06,
+ "loss": 0.4435,
+ "step": 5907
+ },
+ {
+ "epoch": 2.7933806146572104,
+ "grad_norm": 2.6057486534118652,
+ "learning_rate": 2.8030565538435196e-06,
+ "loss": 0.4472,
+ "step": 5908
+ },
+ {
+ "epoch": 2.7938534278959812,
+ "grad_norm": 2.7422802448272705,
+ "learning_rate": 2.802437317947034e-06,
+ "loss": 0.4799,
+ "step": 5909
+ },
+ {
+ "epoch": 2.794326241134752,
+ "grad_norm": 2.3904244899749756,
+ "learning_rate": 2.801818063219102e-06,
+ "loss": 0.4508,
+ "step": 5910
+ },
+ {
+ "epoch": 2.7947990543735224,
+ "grad_norm": 2.8434207439422607,
+ "learning_rate": 2.8011987896982835e-06,
+ "loss": 0.4473,
+ "step": 5911
+ },
+ {
+ "epoch": 2.795271867612293,
+ "grad_norm": 2.916088819503784,
+ "learning_rate": 2.8005794974231366e-06,
+ "loss": 0.464,
+ "step": 5912
+ },
+ {
+ "epoch": 2.795744680851064,
+ "grad_norm": 2.6483397483825684,
+ "learning_rate": 2.7999601864322236e-06,
+ "loss": 0.441,
+ "step": 5913
+ },
+ {
+ "epoch": 2.7962174940898343,
+ "grad_norm": 2.9287428855895996,
+ "learning_rate": 2.7993408567641033e-06,
+ "loss": 0.4551,
+ "step": 5914
+ },
+ {
+ "epoch": 2.796690307328605,
+ "grad_norm": 2.575024127960205,
+ "learning_rate": 2.798721508457342e-06,
+ "loss": 0.4494,
+ "step": 5915
+ },
+ {
+ "epoch": 2.797163120567376,
+ "grad_norm": 2.7156829833984375,
+ "learning_rate": 2.7981021415505015e-06,
+ "loss": 0.419,
+ "step": 5916
+ },
+ {
+ "epoch": 2.7976359338061467,
+ "grad_norm": 2.850553035736084,
+ "learning_rate": 2.7974827560821482e-06,
+ "loss": 0.4709,
+ "step": 5917
+ },
+ {
+ "epoch": 2.7981087470449175,
+ "grad_norm": 2.673846483230591,
+ "learning_rate": 2.796863352090847e-06,
+ "loss": 0.4224,
+ "step": 5918
+ },
+ {
+ "epoch": 2.798581560283688,
+ "grad_norm": 2.9093217849731445,
+ "learning_rate": 2.796243929615168e-06,
+ "loss": 0.468,
+ "step": 5919
+ },
+ {
+ "epoch": 2.7990543735224587,
+ "grad_norm": 2.4853813648223877,
+ "learning_rate": 2.7956244886936775e-06,
+ "loss": 0.4723,
+ "step": 5920
+ },
+ {
+ "epoch": 2.7995271867612295,
+ "grad_norm": 3.026428461074829,
+ "learning_rate": 2.795005029364946e-06,
+ "loss": 0.4721,
+ "step": 5921
+ },
+ {
+ "epoch": 2.8,
+ "grad_norm": 2.886295795440674,
+ "learning_rate": 2.794385551667546e-06,
+ "loss": 0.456,
+ "step": 5922
+ },
+ {
+ "epoch": 2.8004728132387706,
+ "grad_norm": 3.2260656356811523,
+ "learning_rate": 2.7937660556400486e-06,
+ "loss": 0.4499,
+ "step": 5923
+ },
+ {
+ "epoch": 2.8009456264775414,
+ "grad_norm": 2.7971982955932617,
+ "learning_rate": 2.793146541321027e-06,
+ "loss": 0.3982,
+ "step": 5924
+ },
+ {
+ "epoch": 2.801418439716312,
+ "grad_norm": 2.85461163520813,
+ "learning_rate": 2.7925270087490546e-06,
+ "loss": 0.4841,
+ "step": 5925
+ },
+ {
+ "epoch": 2.801891252955083,
+ "grad_norm": 3.0642316341400146,
+ "learning_rate": 2.7919074579627086e-06,
+ "loss": 0.4538,
+ "step": 5926
+ },
+ {
+ "epoch": 2.8023640661938534,
+ "grad_norm": 2.9053616523742676,
+ "learning_rate": 2.7912878890005657e-06,
+ "loss": 0.434,
+ "step": 5927
+ },
+ {
+ "epoch": 2.802836879432624,
+ "grad_norm": 2.7649240493774414,
+ "learning_rate": 2.7906683019012027e-06,
+ "loss": 0.414,
+ "step": 5928
+ },
+ {
+ "epoch": 2.803309692671395,
+ "grad_norm": 2.8717660903930664,
+ "learning_rate": 2.7900486967031987e-06,
+ "loss": 0.4337,
+ "step": 5929
+ },
+ {
+ "epoch": 2.8037825059101653,
+ "grad_norm": 2.6860995292663574,
+ "learning_rate": 2.789429073445135e-06,
+ "loss": 0.447,
+ "step": 5930
+ },
+ {
+ "epoch": 2.804255319148936,
+ "grad_norm": 2.67509126663208,
+ "learning_rate": 2.7888094321655918e-06,
+ "loss": 0.4955,
+ "step": 5931
+ },
+ {
+ "epoch": 2.804728132387707,
+ "grad_norm": 2.7426326274871826,
+ "learning_rate": 2.7881897729031514e-06,
+ "loss": 0.4564,
+ "step": 5932
+ },
+ {
+ "epoch": 2.8052009456264777,
+ "grad_norm": 2.7087252140045166,
+ "learning_rate": 2.7875700956963973e-06,
+ "loss": 0.4571,
+ "step": 5933
+ },
+ {
+ "epoch": 2.8056737588652485,
+ "grad_norm": 2.513526439666748,
+ "learning_rate": 2.7869504005839147e-06,
+ "loss": 0.4361,
+ "step": 5934
+ },
+ {
+ "epoch": 2.806146572104019,
+ "grad_norm": 3.2246084213256836,
+ "learning_rate": 2.7863306876042885e-06,
+ "loss": 0.4612,
+ "step": 5935
+ },
+ {
+ "epoch": 2.8066193853427897,
+ "grad_norm": 3.226325511932373,
+ "learning_rate": 2.7857109567961066e-06,
+ "loss": 0.4528,
+ "step": 5936
+ },
+ {
+ "epoch": 2.8070921985815604,
+ "grad_norm": 2.8861422538757324,
+ "learning_rate": 2.785091208197956e-06,
+ "loss": 0.5049,
+ "step": 5937
+ },
+ {
+ "epoch": 2.807565011820331,
+ "grad_norm": 2.76279616355896,
+ "learning_rate": 2.7844714418484257e-06,
+ "loss": 0.4714,
+ "step": 5938
+ },
+ {
+ "epoch": 2.8080378250591016,
+ "grad_norm": 2.9591920375823975,
+ "learning_rate": 2.7838516577861063e-06,
+ "loss": 0.4633,
+ "step": 5939
+ },
+ {
+ "epoch": 2.8085106382978724,
+ "grad_norm": 2.536916971206665,
+ "learning_rate": 2.7832318560495885e-06,
+ "loss": 0.4108,
+ "step": 5940
+ },
+ {
+ "epoch": 2.808983451536643,
+ "grad_norm": 3.2484991550445557,
+ "learning_rate": 2.7826120366774657e-06,
+ "loss": 0.4888,
+ "step": 5941
+ },
+ {
+ "epoch": 2.8094562647754135,
+ "grad_norm": 2.7129359245300293,
+ "learning_rate": 2.781992199708329e-06,
+ "loss": 0.4008,
+ "step": 5942
+ },
+ {
+ "epoch": 2.8099290780141843,
+ "grad_norm": 2.4176113605499268,
+ "learning_rate": 2.781372345180776e-06,
+ "loss": 0.3864,
+ "step": 5943
+ },
+ {
+ "epoch": 2.810401891252955,
+ "grad_norm": 2.6557252407073975,
+ "learning_rate": 2.7807524731334e-06,
+ "loss": 0.4295,
+ "step": 5944
+ },
+ {
+ "epoch": 2.8108747044917255,
+ "grad_norm": 2.9191324710845947,
+ "learning_rate": 2.7801325836047993e-06,
+ "loss": 0.4854,
+ "step": 5945
+ },
+ {
+ "epoch": 2.8113475177304963,
+ "grad_norm": 2.6325371265411377,
+ "learning_rate": 2.7795126766335705e-06,
+ "loss": 0.4332,
+ "step": 5946
+ },
+ {
+ "epoch": 2.811820330969267,
+ "grad_norm": 2.658337116241455,
+ "learning_rate": 2.778892752258314e-06,
+ "loss": 0.4276,
+ "step": 5947
+ },
+ {
+ "epoch": 2.812293144208038,
+ "grad_norm": 2.763782262802124,
+ "learning_rate": 2.778272810517627e-06,
+ "loss": 0.4246,
+ "step": 5948
+ },
+ {
+ "epoch": 2.8127659574468087,
+ "grad_norm": 2.407607078552246,
+ "learning_rate": 2.777652851450113e-06,
+ "loss": 0.3788,
+ "step": 5949
+ },
+ {
+ "epoch": 2.813238770685579,
+ "grad_norm": 3.0339951515197754,
+ "learning_rate": 2.7770328750943736e-06,
+ "loss": 0.477,
+ "step": 5950
+ },
+ {
+ "epoch": 2.81371158392435,
+ "grad_norm": 2.3475773334503174,
+ "learning_rate": 2.776412881489012e-06,
+ "loss": 0.4206,
+ "step": 5951
+ },
+ {
+ "epoch": 2.8141843971631206,
+ "grad_norm": 3.0455260276794434,
+ "learning_rate": 2.7757928706726318e-06,
+ "loss": 0.4301,
+ "step": 5952
+ },
+ {
+ "epoch": 2.814657210401891,
+ "grad_norm": 2.803920030593872,
+ "learning_rate": 2.7751728426838386e-06,
+ "loss": 0.3738,
+ "step": 5953
+ },
+ {
+ "epoch": 2.815130023640662,
+ "grad_norm": 3.1083319187164307,
+ "learning_rate": 2.77455279756124e-06,
+ "loss": 0.5365,
+ "step": 5954
+ },
+ {
+ "epoch": 2.8156028368794326,
+ "grad_norm": 3.180809497833252,
+ "learning_rate": 2.7739327353434427e-06,
+ "loss": 0.4789,
+ "step": 5955
+ },
+ {
+ "epoch": 2.8160756501182034,
+ "grad_norm": 2.975043773651123,
+ "learning_rate": 2.7733126560690543e-06,
+ "loss": 0.4798,
+ "step": 5956
+ },
+ {
+ "epoch": 2.816548463356974,
+ "grad_norm": 2.765475034713745,
+ "learning_rate": 2.772692559776685e-06,
+ "loss": 0.4206,
+ "step": 5957
+ },
+ {
+ "epoch": 2.8170212765957445,
+ "grad_norm": 2.48612380027771,
+ "learning_rate": 2.7720724465049463e-06,
+ "loss": 0.4234,
+ "step": 5958
+ },
+ {
+ "epoch": 2.8174940898345153,
+ "grad_norm": 2.7145729064941406,
+ "learning_rate": 2.77145231629245e-06,
+ "loss": 0.4713,
+ "step": 5959
+ },
+ {
+ "epoch": 2.817966903073286,
+ "grad_norm": 2.5993762016296387,
+ "learning_rate": 2.7708321691778074e-06,
+ "loss": 0.4144,
+ "step": 5960
+ },
+ {
+ "epoch": 2.8184397163120565,
+ "grad_norm": 3.0902538299560547,
+ "learning_rate": 2.770212005199633e-06,
+ "loss": 0.4822,
+ "step": 5961
+ },
+ {
+ "epoch": 2.8189125295508273,
+ "grad_norm": 2.849757671356201,
+ "learning_rate": 2.7695918243965424e-06,
+ "loss": 0.4449,
+ "step": 5962
+ },
+ {
+ "epoch": 2.819385342789598,
+ "grad_norm": 2.77148699760437,
+ "learning_rate": 2.768971626807151e-06,
+ "loss": 0.4448,
+ "step": 5963
+ },
+ {
+ "epoch": 2.819858156028369,
+ "grad_norm": 2.7865898609161377,
+ "learning_rate": 2.7683514124700757e-06,
+ "loss": 0.4944,
+ "step": 5964
+ },
+ {
+ "epoch": 2.8203309692671397,
+ "grad_norm": 2.9057955741882324,
+ "learning_rate": 2.767731181423934e-06,
+ "loss": 0.5074,
+ "step": 5965
+ },
+ {
+ "epoch": 2.82080378250591,
+ "grad_norm": 2.725837469100952,
+ "learning_rate": 2.7671109337073465e-06,
+ "loss": 0.4207,
+ "step": 5966
+ },
+ {
+ "epoch": 2.821276595744681,
+ "grad_norm": 3.078531265258789,
+ "learning_rate": 2.7664906693589315e-06,
+ "loss": 0.4835,
+ "step": 5967
+ },
+ {
+ "epoch": 2.8217494089834516,
+ "grad_norm": 2.8692002296447754,
+ "learning_rate": 2.765870388417312e-06,
+ "loss": 0.4284,
+ "step": 5968
+ },
+ {
+ "epoch": 2.822222222222222,
+ "grad_norm": 2.8519723415374756,
+ "learning_rate": 2.765250090921109e-06,
+ "loss": 0.541,
+ "step": 5969
+ },
+ {
+ "epoch": 2.8226950354609928,
+ "grad_norm": 3.2037532329559326,
+ "learning_rate": 2.7646297769089457e-06,
+ "loss": 0.4276,
+ "step": 5970
+ },
+ {
+ "epoch": 2.8231678486997636,
+ "grad_norm": 2.8637137413024902,
+ "learning_rate": 2.7640094464194468e-06,
+ "loss": 0.4904,
+ "step": 5971
+ },
+ {
+ "epoch": 2.8236406619385344,
+ "grad_norm": 2.681516408920288,
+ "learning_rate": 2.7633890994912372e-06,
+ "loss": 0.4942,
+ "step": 5972
+ },
+ {
+ "epoch": 2.824113475177305,
+ "grad_norm": 3.0035219192504883,
+ "learning_rate": 2.7627687361629434e-06,
+ "loss": 0.4556,
+ "step": 5973
+ },
+ {
+ "epoch": 2.8245862884160755,
+ "grad_norm": 2.8107759952545166,
+ "learning_rate": 2.7621483564731923e-06,
+ "loss": 0.4225,
+ "step": 5974
+ },
+ {
+ "epoch": 2.8250591016548463,
+ "grad_norm": 2.87276029586792,
+ "learning_rate": 2.7615279604606126e-06,
+ "loss": 0.5045,
+ "step": 5975
+ },
+ {
+ "epoch": 2.825531914893617,
+ "grad_norm": 2.687953233718872,
+ "learning_rate": 2.760907548163833e-06,
+ "loss": 0.4018,
+ "step": 5976
+ },
+ {
+ "epoch": 2.8260047281323875,
+ "grad_norm": 2.587979555130005,
+ "learning_rate": 2.760287119621486e-06,
+ "loss": 0.4407,
+ "step": 5977
+ },
+ {
+ "epoch": 2.8264775413711583,
+ "grad_norm": 2.805602550506592,
+ "learning_rate": 2.7596666748722e-06,
+ "loss": 0.4559,
+ "step": 5978
+ },
+ {
+ "epoch": 2.826950354609929,
+ "grad_norm": 2.320763111114502,
+ "learning_rate": 2.759046213954609e-06,
+ "loss": 0.3847,
+ "step": 5979
+ },
+ {
+ "epoch": 2.8274231678487,
+ "grad_norm": 2.6876401901245117,
+ "learning_rate": 2.758425736907347e-06,
+ "loss": 0.4528,
+ "step": 5980
+ },
+ {
+ "epoch": 2.8278959810874706,
+ "grad_norm": 2.6852915287017822,
+ "learning_rate": 2.757805243769046e-06,
+ "loss": 0.395,
+ "step": 5981
+ },
+ {
+ "epoch": 2.828368794326241,
+ "grad_norm": 2.808326005935669,
+ "learning_rate": 2.7571847345783447e-06,
+ "loss": 0.4647,
+ "step": 5982
+ },
+ {
+ "epoch": 2.828841607565012,
+ "grad_norm": 2.641479015350342,
+ "learning_rate": 2.7565642093738766e-06,
+ "loss": 0.3798,
+ "step": 5983
+ },
+ {
+ "epoch": 2.8293144208037826,
+ "grad_norm": 2.8066110610961914,
+ "learning_rate": 2.7559436681942803e-06,
+ "loss": 0.5072,
+ "step": 5984
+ },
+ {
+ "epoch": 2.829787234042553,
+ "grad_norm": 2.898375988006592,
+ "learning_rate": 2.7553231110781936e-06,
+ "loss": 0.5182,
+ "step": 5985
+ },
+ {
+ "epoch": 2.8302600472813237,
+ "grad_norm": 2.704890489578247,
+ "learning_rate": 2.7547025380642574e-06,
+ "loss": 0.3999,
+ "step": 5986
+ },
+ {
+ "epoch": 2.8307328605200945,
+ "grad_norm": 2.6024270057678223,
+ "learning_rate": 2.7540819491911106e-06,
+ "loss": 0.4302,
+ "step": 5987
+ },
+ {
+ "epoch": 2.8312056737588653,
+ "grad_norm": 2.8006081581115723,
+ "learning_rate": 2.7534613444973946e-06,
+ "loss": 0.4492,
+ "step": 5988
+ },
+ {
+ "epoch": 2.831678486997636,
+ "grad_norm": 2.9532058238983154,
+ "learning_rate": 2.752840724021752e-06,
+ "loss": 0.4552,
+ "step": 5989
+ },
+ {
+ "epoch": 2.8321513002364065,
+ "grad_norm": 3.1830217838287354,
+ "learning_rate": 2.7522200878028265e-06,
+ "loss": 0.5013,
+ "step": 5990
+ },
+ {
+ "epoch": 2.8326241134751773,
+ "grad_norm": 2.716176748275757,
+ "learning_rate": 2.7515994358792624e-06,
+ "loss": 0.4569,
+ "step": 5991
+ },
+ {
+ "epoch": 2.833096926713948,
+ "grad_norm": 2.6852715015411377,
+ "learning_rate": 2.7509787682897044e-06,
+ "loss": 0.4764,
+ "step": 5992
+ },
+ {
+ "epoch": 2.8335697399527184,
+ "grad_norm": 2.9383316040039062,
+ "learning_rate": 2.7503580850727985e-06,
+ "loss": 0.5205,
+ "step": 5993
+ },
+ {
+ "epoch": 2.8340425531914892,
+ "grad_norm": 2.703132152557373,
+ "learning_rate": 2.749737386267193e-06,
+ "loss": 0.4543,
+ "step": 5994
+ },
+ {
+ "epoch": 2.83451536643026,
+ "grad_norm": 2.4304885864257812,
+ "learning_rate": 2.7491166719115354e-06,
+ "loss": 0.4479,
+ "step": 5995
+ },
+ {
+ "epoch": 2.834988179669031,
+ "grad_norm": 2.975722551345825,
+ "learning_rate": 2.748495942044475e-06,
+ "loss": 0.4074,
+ "step": 5996
+ },
+ {
+ "epoch": 2.8354609929078016,
+ "grad_norm": 3.440208911895752,
+ "learning_rate": 2.7478751967046617e-06,
+ "loss": 0.4497,
+ "step": 5997
+ },
+ {
+ "epoch": 2.835933806146572,
+ "grad_norm": 2.734673261642456,
+ "learning_rate": 2.747254435930747e-06,
+ "loss": 0.437,
+ "step": 5998
+ },
+ {
+ "epoch": 2.8364066193853428,
+ "grad_norm": 3.1918959617614746,
+ "learning_rate": 2.7466336597613826e-06,
+ "loss": 0.4197,
+ "step": 5999
+ },
+ {
+ "epoch": 2.8368794326241136,
+ "grad_norm": 3.1440329551696777,
+ "learning_rate": 2.7460128682352216e-06,
+ "loss": 0.4425,
+ "step": 6000
+ },
+ {
+ "epoch": 2.837352245862884,
+ "grad_norm": 2.582993507385254,
+ "learning_rate": 2.7453920613909183e-06,
+ "loss": 0.4475,
+ "step": 6001
+ },
+ {
+ "epoch": 2.8378250591016547,
+ "grad_norm": 3.2682149410247803,
+ "learning_rate": 2.744771239267128e-06,
+ "loss": 0.4615,
+ "step": 6002
+ },
+ {
+ "epoch": 2.8382978723404255,
+ "grad_norm": 2.848477840423584,
+ "learning_rate": 2.7441504019025046e-06,
+ "loss": 0.4093,
+ "step": 6003
+ },
+ {
+ "epoch": 2.8387706855791963,
+ "grad_norm": 2.3582282066345215,
+ "learning_rate": 2.7435295493357067e-06,
+ "loss": 0.3911,
+ "step": 6004
+ },
+ {
+ "epoch": 2.839243498817967,
+ "grad_norm": 2.7707207202911377,
+ "learning_rate": 2.742908681605392e-06,
+ "loss": 0.4069,
+ "step": 6005
+ },
+ {
+ "epoch": 2.8397163120567375,
+ "grad_norm": 3.0763752460479736,
+ "learning_rate": 2.7422877987502183e-06,
+ "loss": 0.512,
+ "step": 6006
+ },
+ {
+ "epoch": 2.8401891252955083,
+ "grad_norm": 2.8027124404907227,
+ "learning_rate": 2.741666900808846e-06,
+ "loss": 0.4922,
+ "step": 6007
+ },
+ {
+ "epoch": 2.840661938534279,
+ "grad_norm": 2.487982988357544,
+ "learning_rate": 2.7410459878199353e-06,
+ "loss": 0.4368,
+ "step": 6008
+ },
+ {
+ "epoch": 2.8411347517730494,
+ "grad_norm": 2.8727993965148926,
+ "learning_rate": 2.7404250598221484e-06,
+ "loss": 0.4639,
+ "step": 6009
+ },
+ {
+ "epoch": 2.84160756501182,
+ "grad_norm": 2.5556678771972656,
+ "learning_rate": 2.739804116854147e-06,
+ "loss": 0.4217,
+ "step": 6010
+ },
+ {
+ "epoch": 2.842080378250591,
+ "grad_norm": 2.6306912899017334,
+ "learning_rate": 2.7391831589545948e-06,
+ "loss": 0.4816,
+ "step": 6011
+ },
+ {
+ "epoch": 2.842553191489362,
+ "grad_norm": 2.7340946197509766,
+ "learning_rate": 2.7385621861621557e-06,
+ "loss": 0.4113,
+ "step": 6012
+ },
+ {
+ "epoch": 2.8430260047281326,
+ "grad_norm": 2.834190607070923,
+ "learning_rate": 2.737941198515495e-06,
+ "loss": 0.4691,
+ "step": 6013
+ },
+ {
+ "epoch": 2.843498817966903,
+ "grad_norm": 2.7139697074890137,
+ "learning_rate": 2.737320196053281e-06,
+ "loss": 0.3798,
+ "step": 6014
+ },
+ {
+ "epoch": 2.8439716312056738,
+ "grad_norm": 2.7934985160827637,
+ "learning_rate": 2.736699178814177e-06,
+ "loss": 0.446,
+ "step": 6015
+ },
+ {
+ "epoch": 2.8444444444444446,
+ "grad_norm": 2.6941518783569336,
+ "learning_rate": 2.7360781468368534e-06,
+ "loss": 0.4787,
+ "step": 6016
+ },
+ {
+ "epoch": 2.844917257683215,
+ "grad_norm": 3.1530468463897705,
+ "learning_rate": 2.7354571001599792e-06,
+ "loss": 0.474,
+ "step": 6017
+ },
+ {
+ "epoch": 2.8453900709219857,
+ "grad_norm": 2.613875389099121,
+ "learning_rate": 2.7348360388222243e-06,
+ "loss": 0.4297,
+ "step": 6018
+ },
+ {
+ "epoch": 2.8458628841607565,
+ "grad_norm": 2.5481486320495605,
+ "learning_rate": 2.7342149628622587e-06,
+ "loss": 0.3762,
+ "step": 6019
+ },
+ {
+ "epoch": 2.8463356973995273,
+ "grad_norm": 2.6425609588623047,
+ "learning_rate": 2.7335938723187544e-06,
+ "loss": 0.4077,
+ "step": 6020
+ },
+ {
+ "epoch": 2.846808510638298,
+ "grad_norm": 2.6281731128692627,
+ "learning_rate": 2.7329727672303836e-06,
+ "loss": 0.466,
+ "step": 6021
+ },
+ {
+ "epoch": 2.8472813238770684,
+ "grad_norm": 2.8862180709838867,
+ "learning_rate": 2.7323516476358197e-06,
+ "loss": 0.4191,
+ "step": 6022
+ },
+ {
+ "epoch": 2.8477541371158392,
+ "grad_norm": 2.907731533050537,
+ "learning_rate": 2.7317305135737383e-06,
+ "loss": 0.4867,
+ "step": 6023
+ },
+ {
+ "epoch": 2.84822695035461,
+ "grad_norm": 2.825593948364258,
+ "learning_rate": 2.731109365082814e-06,
+ "loss": 0.4888,
+ "step": 6024
+ },
+ {
+ "epoch": 2.8486997635933804,
+ "grad_norm": 2.478163003921509,
+ "learning_rate": 2.730488202201722e-06,
+ "loss": 0.4714,
+ "step": 6025
+ },
+ {
+ "epoch": 2.849172576832151,
+ "grad_norm": 2.928899049758911,
+ "learning_rate": 2.7298670249691418e-06,
+ "loss": 0.4671,
+ "step": 6026
+ },
+ {
+ "epoch": 2.849645390070922,
+ "grad_norm": 2.778256893157959,
+ "learning_rate": 2.7292458334237488e-06,
+ "loss": 0.429,
+ "step": 6027
+ },
+ {
+ "epoch": 2.850118203309693,
+ "grad_norm": 3.0689055919647217,
+ "learning_rate": 2.7286246276042234e-06,
+ "loss": 0.4727,
+ "step": 6028
+ },
+ {
+ "epoch": 2.8505910165484636,
+ "grad_norm": 2.582066774368286,
+ "learning_rate": 2.7280034075492447e-06,
+ "loss": 0.4025,
+ "step": 6029
+ },
+ {
+ "epoch": 2.851063829787234,
+ "grad_norm": 3.6679015159606934,
+ "learning_rate": 2.7273821732974936e-06,
+ "loss": 0.4856,
+ "step": 6030
+ },
+ {
+ "epoch": 2.8515366430260047,
+ "grad_norm": 2.7222588062286377,
+ "learning_rate": 2.7267609248876516e-06,
+ "loss": 0.4255,
+ "step": 6031
+ },
+ {
+ "epoch": 2.8520094562647755,
+ "grad_norm": 2.455038547515869,
+ "learning_rate": 2.726139662358401e-06,
+ "loss": 0.4234,
+ "step": 6032
+ },
+ {
+ "epoch": 2.852482269503546,
+ "grad_norm": 2.8277318477630615,
+ "learning_rate": 2.7255183857484253e-06,
+ "loss": 0.4146,
+ "step": 6033
+ },
+ {
+ "epoch": 2.8529550827423167,
+ "grad_norm": 2.523615837097168,
+ "learning_rate": 2.724897095096409e-06,
+ "loss": 0.4227,
+ "step": 6034
+ },
+ {
+ "epoch": 2.8534278959810875,
+ "grad_norm": 3.353646755218506,
+ "learning_rate": 2.724275790441036e-06,
+ "loss": 0.5041,
+ "step": 6035
+ },
+ {
+ "epoch": 2.8539007092198583,
+ "grad_norm": 2.753981828689575,
+ "learning_rate": 2.7236544718209934e-06,
+ "loss": 0.4646,
+ "step": 6036
+ },
+ {
+ "epoch": 2.854373522458629,
+ "grad_norm": 2.954744577407837,
+ "learning_rate": 2.723033139274967e-06,
+ "loss": 0.5182,
+ "step": 6037
+ },
+ {
+ "epoch": 2.8548463356973994,
+ "grad_norm": 2.4814131259918213,
+ "learning_rate": 2.7224117928416462e-06,
+ "loss": 0.4626,
+ "step": 6038
+ },
+ {
+ "epoch": 2.8553191489361702,
+ "grad_norm": 2.7414886951446533,
+ "learning_rate": 2.721790432559717e-06,
+ "loss": 0.4111,
+ "step": 6039
+ },
+ {
+ "epoch": 2.855791962174941,
+ "grad_norm": 2.8743896484375,
+ "learning_rate": 2.7211690584678706e-06,
+ "loss": 0.4986,
+ "step": 6040
+ },
+ {
+ "epoch": 2.8562647754137114,
+ "grad_norm": 3.0691921710968018,
+ "learning_rate": 2.720547670604797e-06,
+ "loss": 0.4743,
+ "step": 6041
+ },
+ {
+ "epoch": 2.856737588652482,
+ "grad_norm": 2.7273411750793457,
+ "learning_rate": 2.7199262690091872e-06,
+ "loss": 0.4403,
+ "step": 6042
+ },
+ {
+ "epoch": 2.857210401891253,
+ "grad_norm": 2.8022944927215576,
+ "learning_rate": 2.7193048537197325e-06,
+ "loss": 0.4413,
+ "step": 6043
+ },
+ {
+ "epoch": 2.8576832151300238,
+ "grad_norm": 2.4883248805999756,
+ "learning_rate": 2.718683424775126e-06,
+ "loss": 0.4485,
+ "step": 6044
+ },
+ {
+ "epoch": 2.8581560283687946,
+ "grad_norm": 2.457249879837036,
+ "learning_rate": 2.718061982214062e-06,
+ "loss": 0.4167,
+ "step": 6045
+ },
+ {
+ "epoch": 2.858628841607565,
+ "grad_norm": 2.7210328578948975,
+ "learning_rate": 2.717440526075234e-06,
+ "loss": 0.4419,
+ "step": 6046
+ },
+ {
+ "epoch": 2.8591016548463357,
+ "grad_norm": 2.684483766555786,
+ "learning_rate": 2.7168190563973386e-06,
+ "loss": 0.4449,
+ "step": 6047
+ },
+ {
+ "epoch": 2.8595744680851065,
+ "grad_norm": 2.5305230617523193,
+ "learning_rate": 2.7161975732190706e-06,
+ "loss": 0.3829,
+ "step": 6048
+ },
+ {
+ "epoch": 2.860047281323877,
+ "grad_norm": 3.0284602642059326,
+ "learning_rate": 2.7155760765791278e-06,
+ "loss": 0.5164,
+ "step": 6049
+ },
+ {
+ "epoch": 2.8605200945626477,
+ "grad_norm": 3.154599189758301,
+ "learning_rate": 2.7149545665162085e-06,
+ "loss": 0.527,
+ "step": 6050
+ },
+ {
+ "epoch": 2.8609929078014185,
+ "grad_norm": 2.6798126697540283,
+ "learning_rate": 2.7143330430690113e-06,
+ "loss": 0.4379,
+ "step": 6051
+ },
+ {
+ "epoch": 2.8614657210401893,
+ "grad_norm": 2.9531302452087402,
+ "learning_rate": 2.7137115062762344e-06,
+ "loss": 0.4549,
+ "step": 6052
+ },
+ {
+ "epoch": 2.86193853427896,
+ "grad_norm": 2.779531240463257,
+ "learning_rate": 2.7130899561765787e-06,
+ "loss": 0.4037,
+ "step": 6053
+ },
+ {
+ "epoch": 2.8624113475177304,
+ "grad_norm": 2.786763906478882,
+ "learning_rate": 2.7124683928087466e-06,
+ "loss": 0.3986,
+ "step": 6054
+ },
+ {
+ "epoch": 2.862884160756501,
+ "grad_norm": 2.430415630340576,
+ "learning_rate": 2.7118468162114385e-06,
+ "loss": 0.4402,
+ "step": 6055
+ },
+ {
+ "epoch": 2.863356973995272,
+ "grad_norm": 3.027268409729004,
+ "learning_rate": 2.7112252264233596e-06,
+ "loss": 0.4737,
+ "step": 6056
+ },
+ {
+ "epoch": 2.8638297872340424,
+ "grad_norm": 3.024935483932495,
+ "learning_rate": 2.710603623483211e-06,
+ "loss": 0.3997,
+ "step": 6057
+ },
+ {
+ "epoch": 2.864302600472813,
+ "grad_norm": 2.8862195014953613,
+ "learning_rate": 2.7099820074296985e-06,
+ "loss": 0.4896,
+ "step": 6058
+ },
+ {
+ "epoch": 2.864775413711584,
+ "grad_norm": 2.595579147338867,
+ "learning_rate": 2.709360378301527e-06,
+ "loss": 0.4387,
+ "step": 6059
+ },
+ {
+ "epoch": 2.8652482269503547,
+ "grad_norm": 2.8046188354492188,
+ "learning_rate": 2.708738736137403e-06,
+ "loss": 0.4726,
+ "step": 6060
+ },
+ {
+ "epoch": 2.8657210401891255,
+ "grad_norm": 3.040304660797119,
+ "learning_rate": 2.708117080976033e-06,
+ "loss": 0.4642,
+ "step": 6061
+ },
+ {
+ "epoch": 2.866193853427896,
+ "grad_norm": 2.618128538131714,
+ "learning_rate": 2.7074954128561248e-06,
+ "loss": 0.3171,
+ "step": 6062
+ },
+ {
+ "epoch": 2.8666666666666667,
+ "grad_norm": 2.7966055870056152,
+ "learning_rate": 2.706873731816387e-06,
+ "loss": 0.4893,
+ "step": 6063
+ },
+ {
+ "epoch": 2.8671394799054375,
+ "grad_norm": 2.9198038578033447,
+ "learning_rate": 2.706252037895529e-06,
+ "loss": 0.4428,
+ "step": 6064
+ },
+ {
+ "epoch": 2.867612293144208,
+ "grad_norm": 2.417705774307251,
+ "learning_rate": 2.7056303311322617e-06,
+ "loss": 0.3704,
+ "step": 6065
+ },
+ {
+ "epoch": 2.8680851063829786,
+ "grad_norm": 3.143918752670288,
+ "learning_rate": 2.7050086115652953e-06,
+ "loss": 0.5247,
+ "step": 6066
+ },
+ {
+ "epoch": 2.8685579196217494,
+ "grad_norm": 2.620781183242798,
+ "learning_rate": 2.704386879233341e-06,
+ "loss": 0.4131,
+ "step": 6067
+ },
+ {
+ "epoch": 2.8690307328605202,
+ "grad_norm": 2.6929845809936523,
+ "learning_rate": 2.703765134175112e-06,
+ "loss": 0.4833,
+ "step": 6068
+ },
+ {
+ "epoch": 2.869503546099291,
+ "grad_norm": 2.695920944213867,
+ "learning_rate": 2.7031433764293214e-06,
+ "loss": 0.435,
+ "step": 6069
+ },
+ {
+ "epoch": 2.8699763593380614,
+ "grad_norm": 2.6184475421905518,
+ "learning_rate": 2.702521606034684e-06,
+ "loss": 0.3898,
+ "step": 6070
+ },
+ {
+ "epoch": 2.870449172576832,
+ "grad_norm": 3.130624532699585,
+ "learning_rate": 2.7018998230299136e-06,
+ "loss": 0.4934,
+ "step": 6071
+ },
+ {
+ "epoch": 2.870921985815603,
+ "grad_norm": 2.947936534881592,
+ "learning_rate": 2.701278027453727e-06,
+ "loss": 0.4167,
+ "step": 6072
+ },
+ {
+ "epoch": 2.8713947990543733,
+ "grad_norm": 2.389263391494751,
+ "learning_rate": 2.7006562193448406e-06,
+ "loss": 0.3854,
+ "step": 6073
+ },
+ {
+ "epoch": 2.871867612293144,
+ "grad_norm": 2.9040684700012207,
+ "learning_rate": 2.700034398741971e-06,
+ "loss": 0.4656,
+ "step": 6074
+ },
+ {
+ "epoch": 2.872340425531915,
+ "grad_norm": 2.8671910762786865,
+ "learning_rate": 2.6994125656838365e-06,
+ "loss": 0.4642,
+ "step": 6075
+ },
+ {
+ "epoch": 2.8728132387706857,
+ "grad_norm": 2.6957180500030518,
+ "learning_rate": 2.698790720209156e-06,
+ "loss": 0.4894,
+ "step": 6076
+ },
+ {
+ "epoch": 2.8732860520094565,
+ "grad_norm": 2.748342514038086,
+ "learning_rate": 2.698168862356648e-06,
+ "loss": 0.4552,
+ "step": 6077
+ },
+ {
+ "epoch": 2.873758865248227,
+ "grad_norm": 2.7459912300109863,
+ "learning_rate": 2.6975469921650344e-06,
+ "loss": 0.4244,
+ "step": 6078
+ },
+ {
+ "epoch": 2.8742316784869977,
+ "grad_norm": 2.515650987625122,
+ "learning_rate": 2.6969251096730366e-06,
+ "loss": 0.4178,
+ "step": 6079
+ },
+ {
+ "epoch": 2.8747044917257685,
+ "grad_norm": 2.747373342514038,
+ "learning_rate": 2.696303214919375e-06,
+ "loss": 0.4623,
+ "step": 6080
+ },
+ {
+ "epoch": 2.875177304964539,
+ "grad_norm": 2.72092604637146,
+ "learning_rate": 2.695681307942773e-06,
+ "loss": 0.4227,
+ "step": 6081
+ },
+ {
+ "epoch": 2.8756501182033096,
+ "grad_norm": 2.6925108432769775,
+ "learning_rate": 2.695059388781955e-06,
+ "loss": 0.3807,
+ "step": 6082
+ },
+ {
+ "epoch": 2.8761229314420804,
+ "grad_norm": 2.673546314239502,
+ "learning_rate": 2.6944374574756427e-06,
+ "loss": 0.424,
+ "step": 6083
+ },
+ {
+ "epoch": 2.876595744680851,
+ "grad_norm": 2.7018187046051025,
+ "learning_rate": 2.6938155140625636e-06,
+ "loss": 0.4367,
+ "step": 6084
+ },
+ {
+ "epoch": 2.877068557919622,
+ "grad_norm": 2.9420957565307617,
+ "learning_rate": 2.6931935585814416e-06,
+ "loss": 0.4223,
+ "step": 6085
+ },
+ {
+ "epoch": 2.8775413711583924,
+ "grad_norm": 2.6523385047912598,
+ "learning_rate": 2.6925715910710036e-06,
+ "loss": 0.4074,
+ "step": 6086
+ },
+ {
+ "epoch": 2.878014184397163,
+ "grad_norm": 2.6104063987731934,
+ "learning_rate": 2.691949611569978e-06,
+ "loss": 0.423,
+ "step": 6087
+ },
+ {
+ "epoch": 2.878486997635934,
+ "grad_norm": 2.6463685035705566,
+ "learning_rate": 2.691327620117091e-06,
+ "loss": 0.4354,
+ "step": 6088
+ },
+ {
+ "epoch": 2.8789598108747043,
+ "grad_norm": 2.5863583087921143,
+ "learning_rate": 2.6907056167510725e-06,
+ "loss": 0.4177,
+ "step": 6089
+ },
+ {
+ "epoch": 2.879432624113475,
+ "grad_norm": 2.6946942806243896,
+ "learning_rate": 2.690083601510651e-06,
+ "loss": 0.4176,
+ "step": 6090
+ },
+ {
+ "epoch": 2.879905437352246,
+ "grad_norm": 3.0649454593658447,
+ "learning_rate": 2.6894615744345575e-06,
+ "loss": 0.4827,
+ "step": 6091
+ },
+ {
+ "epoch": 2.8803782505910167,
+ "grad_norm": 2.6454906463623047,
+ "learning_rate": 2.6888395355615226e-06,
+ "loss": 0.4757,
+ "step": 6092
+ },
+ {
+ "epoch": 2.8808510638297875,
+ "grad_norm": 3.251805067062378,
+ "learning_rate": 2.688217484930278e-06,
+ "loss": 0.5651,
+ "step": 6093
+ },
+ {
+ "epoch": 2.881323877068558,
+ "grad_norm": 2.543999433517456,
+ "learning_rate": 2.687595422579555e-06,
+ "loss": 0.4196,
+ "step": 6094
+ },
+ {
+ "epoch": 2.8817966903073287,
+ "grad_norm": 3.1502909660339355,
+ "learning_rate": 2.686973348548088e-06,
+ "loss": 0.4376,
+ "step": 6095
+ },
+ {
+ "epoch": 2.8822695035460995,
+ "grad_norm": 2.7800376415252686,
+ "learning_rate": 2.686351262874611e-06,
+ "loss": 0.444,
+ "step": 6096
+ },
+ {
+ "epoch": 2.88274231678487,
+ "grad_norm": 3.1529603004455566,
+ "learning_rate": 2.685729165597858e-06,
+ "loss": 0.5137,
+ "step": 6097
+ },
+ {
+ "epoch": 2.8832151300236406,
+ "grad_norm": 2.6079602241516113,
+ "learning_rate": 2.685107056756564e-06,
+ "loss": 0.4213,
+ "step": 6098
+ },
+ {
+ "epoch": 2.8836879432624114,
+ "grad_norm": 2.8969249725341797,
+ "learning_rate": 2.6844849363894648e-06,
+ "loss": 0.4679,
+ "step": 6099
+ },
+ {
+ "epoch": 2.884160756501182,
+ "grad_norm": 2.5882437229156494,
+ "learning_rate": 2.6838628045352977e-06,
+ "loss": 0.3891,
+ "step": 6100
+ },
+ {
+ "epoch": 2.8846335697399526,
+ "grad_norm": 2.9458062648773193,
+ "learning_rate": 2.6832406612328007e-06,
+ "loss": 0.4802,
+ "step": 6101
+ },
+ {
+ "epoch": 2.8851063829787233,
+ "grad_norm": 2.8463058471679688,
+ "learning_rate": 2.6826185065207105e-06,
+ "loss": 0.4332,
+ "step": 6102
+ },
+ {
+ "epoch": 2.885579196217494,
+ "grad_norm": 2.8799285888671875,
+ "learning_rate": 2.6819963404377667e-06,
+ "loss": 0.4474,
+ "step": 6103
+ },
+ {
+ "epoch": 2.8860520094562645,
+ "grad_norm": 2.846860408782959,
+ "learning_rate": 2.681374163022709e-06,
+ "loss": 0.4317,
+ "step": 6104
+ },
+ {
+ "epoch": 2.8865248226950353,
+ "grad_norm": 2.7918877601623535,
+ "learning_rate": 2.6807519743142775e-06,
+ "loss": 0.4243,
+ "step": 6105
+ },
+ {
+ "epoch": 2.886997635933806,
+ "grad_norm": 2.9351487159729004,
+ "learning_rate": 2.6801297743512127e-06,
+ "loss": 0.5253,
+ "step": 6106
+ },
+ {
+ "epoch": 2.887470449172577,
+ "grad_norm": 2.9422426223754883,
+ "learning_rate": 2.6795075631722576e-06,
+ "loss": 0.4887,
+ "step": 6107
+ },
+ {
+ "epoch": 2.8879432624113477,
+ "grad_norm": 2.6837220191955566,
+ "learning_rate": 2.678885340816153e-06,
+ "loss": 0.4761,
+ "step": 6108
+ },
+ {
+ "epoch": 2.888416075650118,
+ "grad_norm": 2.6800777912139893,
+ "learning_rate": 2.6782631073216425e-06,
+ "loss": 0.4248,
+ "step": 6109
+ },
+ {
+ "epoch": 2.888888888888889,
+ "grad_norm": 2.9654436111450195,
+ "learning_rate": 2.6776408627274702e-06,
+ "loss": 0.487,
+ "step": 6110
+ },
+ {
+ "epoch": 2.8893617021276596,
+ "grad_norm": 2.7725181579589844,
+ "learning_rate": 2.6770186070723804e-06,
+ "loss": 0.4166,
+ "step": 6111
+ },
+ {
+ "epoch": 2.88983451536643,
+ "grad_norm": 2.6547815799713135,
+ "learning_rate": 2.676396340395118e-06,
+ "loss": 0.4039,
+ "step": 6112
+ },
+ {
+ "epoch": 2.890307328605201,
+ "grad_norm": 2.690997838973999,
+ "learning_rate": 2.6757740627344292e-06,
+ "loss": 0.4639,
+ "step": 6113
+ },
+ {
+ "epoch": 2.8907801418439716,
+ "grad_norm": 2.4693069458007812,
+ "learning_rate": 2.67515177412906e-06,
+ "loss": 0.4052,
+ "step": 6114
+ },
+ {
+ "epoch": 2.8912529550827424,
+ "grad_norm": 2.7137033939361572,
+ "learning_rate": 2.6745294746177576e-06,
+ "loss": 0.4442,
+ "step": 6115
+ },
+ {
+ "epoch": 2.891725768321513,
+ "grad_norm": 3.7417004108428955,
+ "learning_rate": 2.6739071642392712e-06,
+ "loss": 0.4809,
+ "step": 6116
+ },
+ {
+ "epoch": 2.8921985815602835,
+ "grad_norm": 2.707094669342041,
+ "learning_rate": 2.673284843032347e-06,
+ "loss": 0.411,
+ "step": 6117
+ },
+ {
+ "epoch": 2.8926713947990543,
+ "grad_norm": 2.7864158153533936,
+ "learning_rate": 2.672662511035736e-06,
+ "loss": 0.4939,
+ "step": 6118
+ },
+ {
+ "epoch": 2.893144208037825,
+ "grad_norm": 2.8753504753112793,
+ "learning_rate": 2.672040168288187e-06,
+ "loss": 0.4396,
+ "step": 6119
+ },
+ {
+ "epoch": 2.8936170212765955,
+ "grad_norm": 2.7581071853637695,
+ "learning_rate": 2.6714178148284516e-06,
+ "loss": 0.427,
+ "step": 6120
+ },
+ {
+ "epoch": 2.8940898345153663,
+ "grad_norm": 2.9754791259765625,
+ "learning_rate": 2.6707954506952803e-06,
+ "loss": 0.4255,
+ "step": 6121
+ },
+ {
+ "epoch": 2.894562647754137,
+ "grad_norm": 2.876939296722412,
+ "learning_rate": 2.670173075927426e-06,
+ "loss": 0.4699,
+ "step": 6122
+ },
+ {
+ "epoch": 2.895035460992908,
+ "grad_norm": 2.4875400066375732,
+ "learning_rate": 2.6695506905636397e-06,
+ "loss": 0.3568,
+ "step": 6123
+ },
+ {
+ "epoch": 2.8955082742316787,
+ "grad_norm": 2.703606128692627,
+ "learning_rate": 2.668928294642675e-06,
+ "loss": 0.3646,
+ "step": 6124
+ },
+ {
+ "epoch": 2.895981087470449,
+ "grad_norm": 2.8618338108062744,
+ "learning_rate": 2.6683058882032868e-06,
+ "loss": 0.378,
+ "step": 6125
+ },
+ {
+ "epoch": 2.89645390070922,
+ "grad_norm": 2.9756760597229004,
+ "learning_rate": 2.667683471284229e-06,
+ "loss": 0.4348,
+ "step": 6126
+ },
+ {
+ "epoch": 2.8969267139479906,
+ "grad_norm": 2.7861104011535645,
+ "learning_rate": 2.667061043924256e-06,
+ "loss": 0.4435,
+ "step": 6127
+ },
+ {
+ "epoch": 2.897399527186761,
+ "grad_norm": 2.7932238578796387,
+ "learning_rate": 2.6664386061621243e-06,
+ "loss": 0.4824,
+ "step": 6128
+ },
+ {
+ "epoch": 2.8978723404255318,
+ "grad_norm": 2.85483455657959,
+ "learning_rate": 2.6658161580365917e-06,
+ "loss": 0.4925,
+ "step": 6129
+ },
+ {
+ "epoch": 2.8983451536643026,
+ "grad_norm": 2.4242141246795654,
+ "learning_rate": 2.6651936995864136e-06,
+ "loss": 0.3466,
+ "step": 6130
+ },
+ {
+ "epoch": 2.8988179669030734,
+ "grad_norm": 3.385214328765869,
+ "learning_rate": 2.6645712308503473e-06,
+ "loss": 0.4751,
+ "step": 6131
+ },
+ {
+ "epoch": 2.899290780141844,
+ "grad_norm": 2.7109622955322266,
+ "learning_rate": 2.6639487518671525e-06,
+ "loss": 0.4469,
+ "step": 6132
+ },
+ {
+ "epoch": 2.8997635933806145,
+ "grad_norm": 2.6537814140319824,
+ "learning_rate": 2.6633262626755877e-06,
+ "loss": 0.4678,
+ "step": 6133
+ },
+ {
+ "epoch": 2.9002364066193853,
+ "grad_norm": 2.5992231369018555,
+ "learning_rate": 2.6627037633144124e-06,
+ "loss": 0.4206,
+ "step": 6134
+ },
+ {
+ "epoch": 2.900709219858156,
+ "grad_norm": 2.988940954208374,
+ "learning_rate": 2.6620812538223885e-06,
+ "loss": 0.4554,
+ "step": 6135
+ },
+ {
+ "epoch": 2.9011820330969265,
+ "grad_norm": 3.0678138732910156,
+ "learning_rate": 2.661458734238274e-06,
+ "loss": 0.4671,
+ "step": 6136
+ },
+ {
+ "epoch": 2.9016548463356973,
+ "grad_norm": 2.6902482509613037,
+ "learning_rate": 2.6608362046008335e-06,
+ "loss": 0.372,
+ "step": 6137
+ },
+ {
+ "epoch": 2.902127659574468,
+ "grad_norm": 3.031597375869751,
+ "learning_rate": 2.660213664948827e-06,
+ "loss": 0.4424,
+ "step": 6138
+ },
+ {
+ "epoch": 2.902600472813239,
+ "grad_norm": 2.8376755714416504,
+ "learning_rate": 2.6595911153210187e-06,
+ "loss": 0.4599,
+ "step": 6139
+ },
+ {
+ "epoch": 2.9030732860520096,
+ "grad_norm": 3.3164854049682617,
+ "learning_rate": 2.6589685557561707e-06,
+ "loss": 0.3897,
+ "step": 6140
+ },
+ {
+ "epoch": 2.90354609929078,
+ "grad_norm": 2.9535014629364014,
+ "learning_rate": 2.658345986293048e-06,
+ "loss": 0.4957,
+ "step": 6141
+ },
+ {
+ "epoch": 2.904018912529551,
+ "grad_norm": 2.821276903152466,
+ "learning_rate": 2.657723406970415e-06,
+ "loss": 0.4453,
+ "step": 6142
+ },
+ {
+ "epoch": 2.9044917257683216,
+ "grad_norm": 2.7314651012420654,
+ "learning_rate": 2.657100817827037e-06,
+ "loss": 0.4406,
+ "step": 6143
+ },
+ {
+ "epoch": 2.904964539007092,
+ "grad_norm": 2.9509520530700684,
+ "learning_rate": 2.6564782189016804e-06,
+ "loss": 0.4629,
+ "step": 6144
+ },
+ {
+ "epoch": 2.9054373522458627,
+ "grad_norm": 2.6234960556030273,
+ "learning_rate": 2.655855610233111e-06,
+ "loss": 0.4306,
+ "step": 6145
+ },
+ {
+ "epoch": 2.9059101654846335,
+ "grad_norm": 2.7209644317626953,
+ "learning_rate": 2.6552329918600962e-06,
+ "loss": 0.3643,
+ "step": 6146
+ },
+ {
+ "epoch": 2.9063829787234043,
+ "grad_norm": 2.9797747135162354,
+ "learning_rate": 2.654610363821404e-06,
+ "loss": 0.4616,
+ "step": 6147
+ },
+ {
+ "epoch": 2.906855791962175,
+ "grad_norm": 2.8179666996002197,
+ "learning_rate": 2.6539877261558016e-06,
+ "loss": 0.4526,
+ "step": 6148
+ },
+ {
+ "epoch": 2.9073286052009455,
+ "grad_norm": 2.7492244243621826,
+ "learning_rate": 2.653365078902059e-06,
+ "loss": 0.4862,
+ "step": 6149
+ },
+ {
+ "epoch": 2.9078014184397163,
+ "grad_norm": 3.0262451171875,
+ "learning_rate": 2.6527424220989457e-06,
+ "loss": 0.3728,
+ "step": 6150
+ },
+ {
+ "epoch": 2.908274231678487,
+ "grad_norm": 2.8092808723449707,
+ "learning_rate": 2.6521197557852315e-06,
+ "loss": 0.4668,
+ "step": 6151
+ },
+ {
+ "epoch": 2.9087470449172574,
+ "grad_norm": 2.915719985961914,
+ "learning_rate": 2.651497079999687e-06,
+ "loss": 0.5124,
+ "step": 6152
+ },
+ {
+ "epoch": 2.9092198581560282,
+ "grad_norm": 2.9794204235076904,
+ "learning_rate": 2.6508743947810834e-06,
+ "loss": 0.5207,
+ "step": 6153
+ },
+ {
+ "epoch": 2.909692671394799,
+ "grad_norm": 2.882453680038452,
+ "learning_rate": 2.650251700168193e-06,
+ "loss": 0.4382,
+ "step": 6154
+ },
+ {
+ "epoch": 2.91016548463357,
+ "grad_norm": 3.183680534362793,
+ "learning_rate": 2.6496289961997886e-06,
+ "loss": 0.5134,
+ "step": 6155
+ },
+ {
+ "epoch": 2.9106382978723406,
+ "grad_norm": 2.9374759197235107,
+ "learning_rate": 2.649006282914642e-06,
+ "loss": 0.4748,
+ "step": 6156
+ },
+ {
+ "epoch": 2.911111111111111,
+ "grad_norm": 2.8096041679382324,
+ "learning_rate": 2.648383560351527e-06,
+ "loss": 0.4672,
+ "step": 6157
+ },
+ {
+ "epoch": 2.911583924349882,
+ "grad_norm": 2.8799238204956055,
+ "learning_rate": 2.6477608285492196e-06,
+ "loss": 0.4679,
+ "step": 6158
+ },
+ {
+ "epoch": 2.9120567375886526,
+ "grad_norm": 2.689310073852539,
+ "learning_rate": 2.6471380875464923e-06,
+ "loss": 0.4069,
+ "step": 6159
+ },
+ {
+ "epoch": 2.912529550827423,
+ "grad_norm": 2.909323215484619,
+ "learning_rate": 2.6465153373821216e-06,
+ "loss": 0.4463,
+ "step": 6160
+ },
+ {
+ "epoch": 2.9130023640661937,
+ "grad_norm": 2.797724962234497,
+ "learning_rate": 2.6458925780948845e-06,
+ "loss": 0.4269,
+ "step": 6161
+ },
+ {
+ "epoch": 2.9134751773049645,
+ "grad_norm": 2.7533204555511475,
+ "learning_rate": 2.645269809723556e-06,
+ "loss": 0.453,
+ "step": 6162
+ },
+ {
+ "epoch": 2.9139479905437353,
+ "grad_norm": 2.6615989208221436,
+ "learning_rate": 2.6446470323069122e-06,
+ "loss": 0.3921,
+ "step": 6163
+ },
+ {
+ "epoch": 2.914420803782506,
+ "grad_norm": 3.0493314266204834,
+ "learning_rate": 2.644024245883733e-06,
+ "loss": 0.4779,
+ "step": 6164
+ },
+ {
+ "epoch": 2.9148936170212765,
+ "grad_norm": 2.649845600128174,
+ "learning_rate": 2.643401450492795e-06,
+ "loss": 0.454,
+ "step": 6165
+ },
+ {
+ "epoch": 2.9153664302600473,
+ "grad_norm": 2.7931838035583496,
+ "learning_rate": 2.642778646172877e-06,
+ "loss": 0.504,
+ "step": 6166
+ },
+ {
+ "epoch": 2.915839243498818,
+ "grad_norm": 2.9518136978149414,
+ "learning_rate": 2.64215583296276e-06,
+ "loss": 0.4767,
+ "step": 6167
+ },
+ {
+ "epoch": 2.9163120567375884,
+ "grad_norm": 2.6047427654266357,
+ "learning_rate": 2.6415330109012216e-06,
+ "loss": 0.4316,
+ "step": 6168
+ },
+ {
+ "epoch": 2.916784869976359,
+ "grad_norm": 2.7732112407684326,
+ "learning_rate": 2.640910180027044e-06,
+ "loss": 0.4213,
+ "step": 6169
+ },
+ {
+ "epoch": 2.91725768321513,
+ "grad_norm": 3.1157236099243164,
+ "learning_rate": 2.6402873403790068e-06,
+ "loss": 0.4559,
+ "step": 6170
+ },
+ {
+ "epoch": 2.917730496453901,
+ "grad_norm": 2.68424129486084,
+ "learning_rate": 2.6396644919958917e-06,
+ "loss": 0.3456,
+ "step": 6171
+ },
+ {
+ "epoch": 2.9182033096926716,
+ "grad_norm": 3.1093270778656006,
+ "learning_rate": 2.639041634916482e-06,
+ "loss": 0.4172,
+ "step": 6172
+ },
+ {
+ "epoch": 2.918676122931442,
+ "grad_norm": 2.9844655990600586,
+ "learning_rate": 2.6384187691795594e-06,
+ "loss": 0.4844,
+ "step": 6173
+ },
+ {
+ "epoch": 2.9191489361702128,
+ "grad_norm": 2.907151222229004,
+ "learning_rate": 2.637795894823906e-06,
+ "loss": 0.5126,
+ "step": 6174
+ },
+ {
+ "epoch": 2.9196217494089836,
+ "grad_norm": 2.804105520248413,
+ "learning_rate": 2.637173011888307e-06,
+ "loss": 0.3919,
+ "step": 6175
+ },
+ {
+ "epoch": 2.920094562647754,
+ "grad_norm": 2.8809266090393066,
+ "learning_rate": 2.636550120411547e-06,
+ "loss": 0.4468,
+ "step": 6176
+ },
+ {
+ "epoch": 2.9205673758865247,
+ "grad_norm": 2.686290979385376,
+ "learning_rate": 2.6359272204324087e-06,
+ "loss": 0.4352,
+ "step": 6177
+ },
+ {
+ "epoch": 2.9210401891252955,
+ "grad_norm": 2.448101758956909,
+ "learning_rate": 2.635304311989678e-06,
+ "loss": 0.4218,
+ "step": 6178
+ },
+ {
+ "epoch": 2.9215130023640663,
+ "grad_norm": 2.81024169921875,
+ "learning_rate": 2.6346813951221416e-06,
+ "loss": 0.5177,
+ "step": 6179
+ },
+ {
+ "epoch": 2.921985815602837,
+ "grad_norm": 2.7590086460113525,
+ "learning_rate": 2.6340584698685856e-06,
+ "loss": 0.3897,
+ "step": 6180
+ },
+ {
+ "epoch": 2.9224586288416075,
+ "grad_norm": 3.1226227283477783,
+ "learning_rate": 2.6334355362677965e-06,
+ "loss": 0.4595,
+ "step": 6181
+ },
+ {
+ "epoch": 2.9229314420803783,
+ "grad_norm": 2.673828125,
+ "learning_rate": 2.6328125943585607e-06,
+ "loss": 0.4932,
+ "step": 6182
+ },
+ {
+ "epoch": 2.923404255319149,
+ "grad_norm": 2.8297293186187744,
+ "learning_rate": 2.632189644179668e-06,
+ "loss": 0.3819,
+ "step": 6183
+ },
+ {
+ "epoch": 2.9238770685579194,
+ "grad_norm": 2.9661548137664795,
+ "learning_rate": 2.6315666857699056e-06,
+ "loss": 0.4419,
+ "step": 6184
+ },
+ {
+ "epoch": 2.92434988179669,
+ "grad_norm": 2.9745798110961914,
+ "learning_rate": 2.6309437191680627e-06,
+ "loss": 0.4423,
+ "step": 6185
+ },
+ {
+ "epoch": 2.924822695035461,
+ "grad_norm": 2.8351712226867676,
+ "learning_rate": 2.6303207444129285e-06,
+ "loss": 0.5043,
+ "step": 6186
+ },
+ {
+ "epoch": 2.925295508274232,
+ "grad_norm": 2.6442384719848633,
+ "learning_rate": 2.6296977615432927e-06,
+ "loss": 0.4431,
+ "step": 6187
+ },
+ {
+ "epoch": 2.9257683215130026,
+ "grad_norm": 2.4128029346466064,
+ "learning_rate": 2.6290747705979457e-06,
+ "loss": 0.3603,
+ "step": 6188
+ },
+ {
+ "epoch": 2.926241134751773,
+ "grad_norm": 2.730424642562866,
+ "learning_rate": 2.6284517716156786e-06,
+ "loss": 0.439,
+ "step": 6189
+ },
+ {
+ "epoch": 2.9267139479905437,
+ "grad_norm": 2.6215405464172363,
+ "learning_rate": 2.627828764635284e-06,
+ "loss": 0.4117,
+ "step": 6190
+ },
+ {
+ "epoch": 2.9271867612293145,
+ "grad_norm": 2.56585955619812,
+ "learning_rate": 2.627205749695552e-06,
+ "loss": 0.4404,
+ "step": 6191
+ },
+ {
+ "epoch": 2.927659574468085,
+ "grad_norm": 2.9587886333465576,
+ "learning_rate": 2.6265827268352763e-06,
+ "loss": 0.4295,
+ "step": 6192
+ },
+ {
+ "epoch": 2.9281323877068557,
+ "grad_norm": 2.6611828804016113,
+ "learning_rate": 2.625959696093249e-06,
+ "loss": 0.4441,
+ "step": 6193
+ },
+ {
+ "epoch": 2.9286052009456265,
+ "grad_norm": 2.4391369819641113,
+ "learning_rate": 2.6253366575082634e-06,
+ "loss": 0.4447,
+ "step": 6194
+ },
+ {
+ "epoch": 2.9290780141843973,
+ "grad_norm": 2.710763454437256,
+ "learning_rate": 2.6247136111191144e-06,
+ "loss": 0.4662,
+ "step": 6195
+ },
+ {
+ "epoch": 2.929550827423168,
+ "grad_norm": 2.770697593688965,
+ "learning_rate": 2.6240905569645952e-06,
+ "loss": 0.4263,
+ "step": 6196
+ },
+ {
+ "epoch": 2.9300236406619384,
+ "grad_norm": 2.5885732173919678,
+ "learning_rate": 2.623467495083501e-06,
+ "loss": 0.4303,
+ "step": 6197
+ },
+ {
+ "epoch": 2.9304964539007092,
+ "grad_norm": 2.5716748237609863,
+ "learning_rate": 2.6228444255146274e-06,
+ "loss": 0.3714,
+ "step": 6198
+ },
+ {
+ "epoch": 2.93096926713948,
+ "grad_norm": 3.0437910556793213,
+ "learning_rate": 2.6222213482967703e-06,
+ "loss": 0.4077,
+ "step": 6199
+ },
+ {
+ "epoch": 2.9314420803782504,
+ "grad_norm": 2.7861344814300537,
+ "learning_rate": 2.6215982634687253e-06,
+ "loss": 0.4157,
+ "step": 6200
+ },
+ {
+ "epoch": 2.931914893617021,
+ "grad_norm": 2.5265355110168457,
+ "learning_rate": 2.6209751710692905e-06,
+ "loss": 0.4586,
+ "step": 6201
+ },
+ {
+ "epoch": 2.932387706855792,
+ "grad_norm": 2.940112590789795,
+ "learning_rate": 2.6203520711372615e-06,
+ "loss": 0.4208,
+ "step": 6202
+ },
+ {
+ "epoch": 2.9328605200945628,
+ "grad_norm": 2.7124581336975098,
+ "learning_rate": 2.6197289637114363e-06,
+ "loss": 0.4173,
+ "step": 6203
+ },
+ {
+ "epoch": 2.9333333333333336,
+ "grad_norm": 2.818523406982422,
+ "learning_rate": 2.619105848830615e-06,
+ "loss": 0.4349,
+ "step": 6204
+ },
+ {
+ "epoch": 2.933806146572104,
+ "grad_norm": 2.7630393505096436,
+ "learning_rate": 2.6184827265335937e-06,
+ "loss": 0.5078,
+ "step": 6205
+ },
+ {
+ "epoch": 2.9342789598108747,
+ "grad_norm": 3.0554699897766113,
+ "learning_rate": 2.6178595968591726e-06,
+ "loss": 0.4712,
+ "step": 6206
+ },
+ {
+ "epoch": 2.9347517730496455,
+ "grad_norm": 2.721992254257202,
+ "learning_rate": 2.6172364598461507e-06,
+ "loss": 0.4847,
+ "step": 6207
+ },
+ {
+ "epoch": 2.935224586288416,
+ "grad_norm": 2.809663772583008,
+ "learning_rate": 2.6166133155333303e-06,
+ "loss": 0.4447,
+ "step": 6208
+ },
+ {
+ "epoch": 2.9356973995271867,
+ "grad_norm": 2.568394660949707,
+ "learning_rate": 2.6159901639595088e-06,
+ "loss": 0.4543,
+ "step": 6209
+ },
+ {
+ "epoch": 2.9361702127659575,
+ "grad_norm": 3.3670637607574463,
+ "learning_rate": 2.6153670051634884e-06,
+ "loss": 0.4901,
+ "step": 6210
+ },
+ {
+ "epoch": 2.9366430260047283,
+ "grad_norm": 3.082508087158203,
+ "learning_rate": 2.614743839184071e-06,
+ "loss": 0.4862,
+ "step": 6211
+ },
+ {
+ "epoch": 2.937115839243499,
+ "grad_norm": 2.692139148712158,
+ "learning_rate": 2.6141206660600566e-06,
+ "loss": 0.5199,
+ "step": 6212
+ },
+ {
+ "epoch": 2.9375886524822694,
+ "grad_norm": 3.231433391571045,
+ "learning_rate": 2.6134974858302504e-06,
+ "loss": 0.464,
+ "step": 6213
+ },
+ {
+ "epoch": 2.93806146572104,
+ "grad_norm": 3.224238157272339,
+ "learning_rate": 2.612874298533452e-06,
+ "loss": 0.4507,
+ "step": 6214
+ },
+ {
+ "epoch": 2.938534278959811,
+ "grad_norm": 2.812755584716797,
+ "learning_rate": 2.6122511042084663e-06,
+ "loss": 0.4527,
+ "step": 6215
+ },
+ {
+ "epoch": 2.9390070921985814,
+ "grad_norm": 2.837811231613159,
+ "learning_rate": 2.611627902894098e-06,
+ "loss": 0.4782,
+ "step": 6216
+ },
+ {
+ "epoch": 2.939479905437352,
+ "grad_norm": 3.093817710876465,
+ "learning_rate": 2.6110046946291476e-06,
+ "loss": 0.4933,
+ "step": 6217
+ },
+ {
+ "epoch": 2.939952718676123,
+ "grad_norm": 2.950119733810425,
+ "learning_rate": 2.6103814794524235e-06,
+ "loss": 0.4884,
+ "step": 6218
+ },
+ {
+ "epoch": 2.9404255319148938,
+ "grad_norm": 2.469681978225708,
+ "learning_rate": 2.6097582574027274e-06,
+ "loss": 0.4135,
+ "step": 6219
+ },
+ {
+ "epoch": 2.9408983451536646,
+ "grad_norm": 2.779238224029541,
+ "learning_rate": 2.609135028518866e-06,
+ "loss": 0.5165,
+ "step": 6220
+ },
+ {
+ "epoch": 2.941371158392435,
+ "grad_norm": 2.807705879211426,
+ "learning_rate": 2.608511792839645e-06,
+ "loss": 0.4046,
+ "step": 6221
+ },
+ {
+ "epoch": 2.9418439716312057,
+ "grad_norm": 2.6067750453948975,
+ "learning_rate": 2.607888550403871e-06,
+ "loss": 0.406,
+ "step": 6222
+ },
+ {
+ "epoch": 2.9423167848699765,
+ "grad_norm": 2.865766763687134,
+ "learning_rate": 2.607265301250349e-06,
+ "loss": 0.471,
+ "step": 6223
+ },
+ {
+ "epoch": 2.942789598108747,
+ "grad_norm": 2.977681875228882,
+ "learning_rate": 2.6066420454178876e-06,
+ "loss": 0.4666,
+ "step": 6224
+ },
+ {
+ "epoch": 2.9432624113475176,
+ "grad_norm": 2.870884418487549,
+ "learning_rate": 2.606018782945294e-06,
+ "loss": 0.4768,
+ "step": 6225
+ },
+ {
+ "epoch": 2.9437352245862884,
+ "grad_norm": 2.992851495742798,
+ "learning_rate": 2.6053955138713756e-06,
+ "loss": 0.4657,
+ "step": 6226
+ },
+ {
+ "epoch": 2.9442080378250592,
+ "grad_norm": 2.7279815673828125,
+ "learning_rate": 2.6047722382349406e-06,
+ "loss": 0.4087,
+ "step": 6227
+ },
+ {
+ "epoch": 2.94468085106383,
+ "grad_norm": 2.8587028980255127,
+ "learning_rate": 2.604148956074797e-06,
+ "loss": 0.4452,
+ "step": 6228
+ },
+ {
+ "epoch": 2.9451536643026004,
+ "grad_norm": 3.001694679260254,
+ "learning_rate": 2.6035256674297555e-06,
+ "loss": 0.4852,
+ "step": 6229
+ },
+ {
+ "epoch": 2.945626477541371,
+ "grad_norm": 2.858069896697998,
+ "learning_rate": 2.6029023723386237e-06,
+ "loss": 0.4281,
+ "step": 6230
+ },
+ {
+ "epoch": 2.946099290780142,
+ "grad_norm": 2.675856828689575,
+ "learning_rate": 2.602279070840213e-06,
+ "loss": 0.4545,
+ "step": 6231
+ },
+ {
+ "epoch": 2.9465721040189123,
+ "grad_norm": 2.530245065689087,
+ "learning_rate": 2.6016557629733334e-06,
+ "loss": 0.4619,
+ "step": 6232
+ },
+ {
+ "epoch": 2.947044917257683,
+ "grad_norm": 2.7533743381500244,
+ "learning_rate": 2.601032448776795e-06,
+ "loss": 0.4879,
+ "step": 6233
+ },
+ {
+ "epoch": 2.947517730496454,
+ "grad_norm": 3.130453109741211,
+ "learning_rate": 2.600409128289409e-06,
+ "loss": 0.4056,
+ "step": 6234
+ },
+ {
+ "epoch": 2.9479905437352247,
+ "grad_norm": 3.4736509323120117,
+ "learning_rate": 2.5997858015499867e-06,
+ "loss": 0.5063,
+ "step": 6235
+ },
+ {
+ "epoch": 2.9484633569739955,
+ "grad_norm": 2.871978282928467,
+ "learning_rate": 2.5991624685973406e-06,
+ "loss": 0.4562,
+ "step": 6236
+ },
+ {
+ "epoch": 2.948936170212766,
+ "grad_norm": 2.976503372192383,
+ "learning_rate": 2.5985391294702817e-06,
+ "loss": 0.5079,
+ "step": 6237
+ },
+ {
+ "epoch": 2.9494089834515367,
+ "grad_norm": 2.578122615814209,
+ "learning_rate": 2.597915784207623e-06,
+ "loss": 0.4069,
+ "step": 6238
+ },
+ {
+ "epoch": 2.9498817966903075,
+ "grad_norm": 2.885911226272583,
+ "learning_rate": 2.597292432848178e-06,
+ "loss": 0.4382,
+ "step": 6239
+ },
+ {
+ "epoch": 2.950354609929078,
+ "grad_norm": 2.9301681518554688,
+ "learning_rate": 2.5966690754307605e-06,
+ "loss": 0.4888,
+ "step": 6240
+ },
+ {
+ "epoch": 2.9508274231678486,
+ "grad_norm": 2.9912192821502686,
+ "learning_rate": 2.5960457119941834e-06,
+ "loss": 0.4699,
+ "step": 6241
+ },
+ {
+ "epoch": 2.9513002364066194,
+ "grad_norm": 2.6612601280212402,
+ "learning_rate": 2.5954223425772607e-06,
+ "loss": 0.3736,
+ "step": 6242
+ },
+ {
+ "epoch": 2.9517730496453902,
+ "grad_norm": 2.9325380325317383,
+ "learning_rate": 2.5947989672188067e-06,
+ "loss": 0.4771,
+ "step": 6243
+ },
+ {
+ "epoch": 2.952245862884161,
+ "grad_norm": 2.8143959045410156,
+ "learning_rate": 2.594175585957637e-06,
+ "loss": 0.5103,
+ "step": 6244
+ },
+ {
+ "epoch": 2.9527186761229314,
+ "grad_norm": 2.355078935623169,
+ "learning_rate": 2.5935521988325674e-06,
+ "loss": 0.44,
+ "step": 6245
+ },
+ {
+ "epoch": 2.953191489361702,
+ "grad_norm": 2.733156442642212,
+ "learning_rate": 2.5929288058824114e-06,
+ "loss": 0.4306,
+ "step": 6246
+ },
+ {
+ "epoch": 2.953664302600473,
+ "grad_norm": 3.182563304901123,
+ "learning_rate": 2.5923054071459865e-06,
+ "loss": 0.417,
+ "step": 6247
+ },
+ {
+ "epoch": 2.9541371158392433,
+ "grad_norm": 2.4162323474884033,
+ "learning_rate": 2.5916820026621094e-06,
+ "loss": 0.3802,
+ "step": 6248
+ },
+ {
+ "epoch": 2.954609929078014,
+ "grad_norm": 2.772706985473633,
+ "learning_rate": 2.591058592469595e-06,
+ "loss": 0.4654,
+ "step": 6249
+ },
+ {
+ "epoch": 2.955082742316785,
+ "grad_norm": 2.6011102199554443,
+ "learning_rate": 2.5904351766072616e-06,
+ "loss": 0.4619,
+ "step": 6250
+ },
+ {
+ "epoch": 2.9555555555555557,
+ "grad_norm": 2.5700361728668213,
+ "learning_rate": 2.589811755113926e-06,
+ "loss": 0.3991,
+ "step": 6251
+ },
+ {
+ "epoch": 2.9560283687943265,
+ "grad_norm": 2.6444971561431885,
+ "learning_rate": 2.589188328028407e-06,
+ "loss": 0.4388,
+ "step": 6252
+ },
+ {
+ "epoch": 2.956501182033097,
+ "grad_norm": 2.739567279815674,
+ "learning_rate": 2.588564895389521e-06,
+ "loss": 0.4193,
+ "step": 6253
+ },
+ {
+ "epoch": 2.9569739952718677,
+ "grad_norm": 2.7070045471191406,
+ "learning_rate": 2.5879414572360877e-06,
+ "loss": 0.4347,
+ "step": 6254
+ },
+ {
+ "epoch": 2.9574468085106385,
+ "grad_norm": 2.7811532020568848,
+ "learning_rate": 2.587318013606926e-06,
+ "loss": 0.43,
+ "step": 6255
+ },
+ {
+ "epoch": 2.957919621749409,
+ "grad_norm": 3.0036091804504395,
+ "learning_rate": 2.5866945645408537e-06,
+ "loss": 0.4855,
+ "step": 6256
+ },
+ {
+ "epoch": 2.9583924349881796,
+ "grad_norm": 2.948573112487793,
+ "learning_rate": 2.5860711100766918e-06,
+ "loss": 0.4594,
+ "step": 6257
+ },
+ {
+ "epoch": 2.9588652482269504,
+ "grad_norm": 2.6371593475341797,
+ "learning_rate": 2.5854476502532583e-06,
+ "loss": 0.446,
+ "step": 6258
+ },
+ {
+ "epoch": 2.959338061465721,
+ "grad_norm": 2.668677806854248,
+ "learning_rate": 2.5848241851093754e-06,
+ "loss": 0.3991,
+ "step": 6259
+ },
+ {
+ "epoch": 2.959810874704492,
+ "grad_norm": 3.1640663146972656,
+ "learning_rate": 2.5842007146838614e-06,
+ "loss": 0.5146,
+ "step": 6260
+ },
+ {
+ "epoch": 2.9602836879432624,
+ "grad_norm": 2.9412102699279785,
+ "learning_rate": 2.5835772390155382e-06,
+ "loss": 0.4798,
+ "step": 6261
+ },
+ {
+ "epoch": 2.960756501182033,
+ "grad_norm": 2.7674343585968018,
+ "learning_rate": 2.582953758143227e-06,
+ "loss": 0.4262,
+ "step": 6262
+ },
+ {
+ "epoch": 2.961229314420804,
+ "grad_norm": 3.5219457149505615,
+ "learning_rate": 2.582330272105749e-06,
+ "loss": 0.4905,
+ "step": 6263
+ },
+ {
+ "epoch": 2.9617021276595743,
+ "grad_norm": 2.4274468421936035,
+ "learning_rate": 2.5817067809419267e-06,
+ "loss": 0.4048,
+ "step": 6264
+ },
+ {
+ "epoch": 2.962174940898345,
+ "grad_norm": 2.6907944679260254,
+ "learning_rate": 2.5810832846905814e-06,
+ "loss": 0.388,
+ "step": 6265
+ },
+ {
+ "epoch": 2.962647754137116,
+ "grad_norm": 2.603151321411133,
+ "learning_rate": 2.5804597833905347e-06,
+ "loss": 0.4377,
+ "step": 6266
+ },
+ {
+ "epoch": 2.9631205673758867,
+ "grad_norm": 2.685837507247925,
+ "learning_rate": 2.57983627708061e-06,
+ "loss": 0.4409,
+ "step": 6267
+ },
+ {
+ "epoch": 2.963593380614657,
+ "grad_norm": 2.8281500339508057,
+ "learning_rate": 2.579212765799631e-06,
+ "loss": 0.4567,
+ "step": 6268
+ },
+ {
+ "epoch": 2.964066193853428,
+ "grad_norm": 2.6387875080108643,
+ "learning_rate": 2.57858924958642e-06,
+ "loss": 0.4061,
+ "step": 6269
+ },
+ {
+ "epoch": 2.9645390070921986,
+ "grad_norm": 2.64139986038208,
+ "learning_rate": 2.5779657284798017e-06,
+ "loss": 0.4539,
+ "step": 6270
+ },
+ {
+ "epoch": 2.965011820330969,
+ "grad_norm": 2.7384836673736572,
+ "learning_rate": 2.5773422025185983e-06,
+ "loss": 0.408,
+ "step": 6271
+ },
+ {
+ "epoch": 2.96548463356974,
+ "grad_norm": 2.262514352798462,
+ "learning_rate": 2.576718671741636e-06,
+ "loss": 0.3726,
+ "step": 6272
+ },
+ {
+ "epoch": 2.9659574468085106,
+ "grad_norm": 2.53800106048584,
+ "learning_rate": 2.5760951361877384e-06,
+ "loss": 0.4716,
+ "step": 6273
+ },
+ {
+ "epoch": 2.9664302600472814,
+ "grad_norm": 3.256701707839966,
+ "learning_rate": 2.57547159589573e-06,
+ "loss": 0.518,
+ "step": 6274
+ },
+ {
+ "epoch": 2.966903073286052,
+ "grad_norm": 2.9427342414855957,
+ "learning_rate": 2.574848050904436e-06,
+ "loss": 0.4255,
+ "step": 6275
+ },
+ {
+ "epoch": 2.9673758865248225,
+ "grad_norm": 2.5794098377227783,
+ "learning_rate": 2.574224501252682e-06,
+ "loss": 0.4412,
+ "step": 6276
+ },
+ {
+ "epoch": 2.9678486997635933,
+ "grad_norm": 2.5894877910614014,
+ "learning_rate": 2.573600946979294e-06,
+ "loss": 0.4356,
+ "step": 6277
+ },
+ {
+ "epoch": 2.968321513002364,
+ "grad_norm": 2.9597361087799072,
+ "learning_rate": 2.572977388123098e-06,
+ "loss": 0.4376,
+ "step": 6278
+ },
+ {
+ "epoch": 2.9687943262411345,
+ "grad_norm": 2.779303550720215,
+ "learning_rate": 2.5723538247229197e-06,
+ "loss": 0.3985,
+ "step": 6279
+ },
+ {
+ "epoch": 2.9692671394799053,
+ "grad_norm": 2.9173855781555176,
+ "learning_rate": 2.5717302568175866e-06,
+ "loss": 0.4581,
+ "step": 6280
+ },
+ {
+ "epoch": 2.969739952718676,
+ "grad_norm": 2.703721284866333,
+ "learning_rate": 2.5711066844459242e-06,
+ "loss": 0.3705,
+ "step": 6281
+ },
+ {
+ "epoch": 2.970212765957447,
+ "grad_norm": 2.5415029525756836,
+ "learning_rate": 2.5704831076467613e-06,
+ "loss": 0.4089,
+ "step": 6282
+ },
+ {
+ "epoch": 2.9706855791962177,
+ "grad_norm": 2.791780948638916,
+ "learning_rate": 2.5698595264589234e-06,
+ "loss": 0.4357,
+ "step": 6283
+ },
+ {
+ "epoch": 2.971158392434988,
+ "grad_norm": 2.887662887573242,
+ "learning_rate": 2.5692359409212392e-06,
+ "loss": 0.4093,
+ "step": 6284
+ },
+ {
+ "epoch": 2.971631205673759,
+ "grad_norm": 3.0309557914733887,
+ "learning_rate": 2.5686123510725364e-06,
+ "loss": 0.4461,
+ "step": 6285
+ },
+ {
+ "epoch": 2.9721040189125296,
+ "grad_norm": 2.6861515045166016,
+ "learning_rate": 2.5679887569516437e-06,
+ "loss": 0.4199,
+ "step": 6286
+ },
+ {
+ "epoch": 2.9725768321513,
+ "grad_norm": 2.7014012336730957,
+ "learning_rate": 2.5673651585973897e-06,
+ "loss": 0.4373,
+ "step": 6287
+ },
+ {
+ "epoch": 2.9730496453900708,
+ "grad_norm": 2.951265811920166,
+ "learning_rate": 2.5667415560486026e-06,
+ "loss": 0.4426,
+ "step": 6288
+ },
+ {
+ "epoch": 2.9735224586288416,
+ "grad_norm": 2.7664504051208496,
+ "learning_rate": 2.5661179493441106e-06,
+ "loss": 0.474,
+ "step": 6289
+ },
+ {
+ "epoch": 2.9739952718676124,
+ "grad_norm": 2.6081087589263916,
+ "learning_rate": 2.5654943385227445e-06,
+ "loss": 0.4058,
+ "step": 6290
+ },
+ {
+ "epoch": 2.974468085106383,
+ "grad_norm": 2.9416966438293457,
+ "learning_rate": 2.564870723623333e-06,
+ "loss": 0.506,
+ "step": 6291
+ },
+ {
+ "epoch": 2.9749408983451535,
+ "grad_norm": 2.9441659450531006,
+ "learning_rate": 2.564247104684706e-06,
+ "loss": 0.4505,
+ "step": 6292
+ },
+ {
+ "epoch": 2.9754137115839243,
+ "grad_norm": 2.7110862731933594,
+ "learning_rate": 2.563623481745693e-06,
+ "loss": 0.4493,
+ "step": 6293
+ },
+ {
+ "epoch": 2.975886524822695,
+ "grad_norm": 2.88459849357605,
+ "learning_rate": 2.562999854845125e-06,
+ "loss": 0.4462,
+ "step": 6294
+ },
+ {
+ "epoch": 2.9763593380614655,
+ "grad_norm": 3.0491793155670166,
+ "learning_rate": 2.5623762240218327e-06,
+ "loss": 0.4928,
+ "step": 6295
+ },
+ {
+ "epoch": 2.9768321513002363,
+ "grad_norm": 2.9475483894348145,
+ "learning_rate": 2.561752589314646e-06,
+ "loss": 0.4535,
+ "step": 6296
+ },
+ {
+ "epoch": 2.977304964539007,
+ "grad_norm": 2.879495859146118,
+ "learning_rate": 2.561128950762397e-06,
+ "loss": 0.4393,
+ "step": 6297
+ },
+ {
+ "epoch": 2.977777777777778,
+ "grad_norm": 2.8478336334228516,
+ "learning_rate": 2.560505308403916e-06,
+ "loss": 0.4363,
+ "step": 6298
+ },
+ {
+ "epoch": 2.9782505910165487,
+ "grad_norm": 2.5475094318389893,
+ "learning_rate": 2.5598816622780343e-06,
+ "loss": 0.3825,
+ "step": 6299
+ },
+ {
+ "epoch": 2.978723404255319,
+ "grad_norm": 2.85430908203125,
+ "learning_rate": 2.5592580124235838e-06,
+ "loss": 0.4226,
+ "step": 6300
+ },
+ {
+ "epoch": 2.97919621749409,
+ "grad_norm": 2.569775104522705,
+ "learning_rate": 2.5586343588793975e-06,
+ "loss": 0.4045,
+ "step": 6301
+ },
+ {
+ "epoch": 2.9796690307328606,
+ "grad_norm": 2.4482202529907227,
+ "learning_rate": 2.558010701684307e-06,
+ "loss": 0.4625,
+ "step": 6302
+ },
+ {
+ "epoch": 2.980141843971631,
+ "grad_norm": 2.9301230907440186,
+ "learning_rate": 2.5573870408771436e-06,
+ "loss": 0.4358,
+ "step": 6303
+ },
+ {
+ "epoch": 2.9806146572104018,
+ "grad_norm": 2.9865870475769043,
+ "learning_rate": 2.5567633764967416e-06,
+ "loss": 0.497,
+ "step": 6304
+ },
+ {
+ "epoch": 2.9810874704491725,
+ "grad_norm": 2.523524522781372,
+ "learning_rate": 2.556139708581933e-06,
+ "loss": 0.4141,
+ "step": 6305
+ },
+ {
+ "epoch": 2.9815602836879433,
+ "grad_norm": 2.8489344120025635,
+ "learning_rate": 2.5555160371715504e-06,
+ "loss": 0.4205,
+ "step": 6306
+ },
+ {
+ "epoch": 2.982033096926714,
+ "grad_norm": 2.417759895324707,
+ "learning_rate": 2.5548923623044274e-06,
+ "loss": 0.44,
+ "step": 6307
+ },
+ {
+ "epoch": 2.9825059101654845,
+ "grad_norm": 2.7626900672912598,
+ "learning_rate": 2.554268684019398e-06,
+ "loss": 0.4646,
+ "step": 6308
+ },
+ {
+ "epoch": 2.9829787234042553,
+ "grad_norm": 3.0916266441345215,
+ "learning_rate": 2.5536450023552956e-06,
+ "loss": 0.4443,
+ "step": 6309
+ },
+ {
+ "epoch": 2.983451536643026,
+ "grad_norm": 2.721992015838623,
+ "learning_rate": 2.5530213173509542e-06,
+ "loss": 0.4008,
+ "step": 6310
+ },
+ {
+ "epoch": 2.9839243498817964,
+ "grad_norm": 2.825334072113037,
+ "learning_rate": 2.552397629045208e-06,
+ "loss": 0.4513,
+ "step": 6311
+ },
+ {
+ "epoch": 2.9843971631205672,
+ "grad_norm": 2.912050485610962,
+ "learning_rate": 2.5517739374768915e-06,
+ "loss": 0.4104,
+ "step": 6312
+ },
+ {
+ "epoch": 2.984869976359338,
+ "grad_norm": 2.760650634765625,
+ "learning_rate": 2.551150242684838e-06,
+ "loss": 0.4372,
+ "step": 6313
+ },
+ {
+ "epoch": 2.985342789598109,
+ "grad_norm": 2.8926033973693848,
+ "learning_rate": 2.5505265447078838e-06,
+ "loss": 0.475,
+ "step": 6314
+ },
+ {
+ "epoch": 2.9858156028368796,
+ "grad_norm": 2.6279892921447754,
+ "learning_rate": 2.5499028435848633e-06,
+ "loss": 0.4589,
+ "step": 6315
+ },
+ {
+ "epoch": 2.98628841607565,
+ "grad_norm": 3.2147316932678223,
+ "learning_rate": 2.549279139354611e-06,
+ "loss": 0.4968,
+ "step": 6316
+ },
+ {
+ "epoch": 2.986761229314421,
+ "grad_norm": 2.4510674476623535,
+ "learning_rate": 2.5486554320559626e-06,
+ "loss": 0.4291,
+ "step": 6317
+ },
+ {
+ "epoch": 2.9872340425531916,
+ "grad_norm": 2.6919643878936768,
+ "learning_rate": 2.5480317217277544e-06,
+ "loss": 0.4704,
+ "step": 6318
+ },
+ {
+ "epoch": 2.987706855791962,
+ "grad_norm": 2.9832234382629395,
+ "learning_rate": 2.5474080084088215e-06,
+ "loss": 0.4129,
+ "step": 6319
+ },
+ {
+ "epoch": 2.9881796690307327,
+ "grad_norm": 2.893209218978882,
+ "learning_rate": 2.5467842921380004e-06,
+ "loss": 0.5099,
+ "step": 6320
+ },
+ {
+ "epoch": 2.9886524822695035,
+ "grad_norm": 2.6734580993652344,
+ "learning_rate": 2.5461605729541254e-06,
+ "loss": 0.4588,
+ "step": 6321
+ },
+ {
+ "epoch": 2.9891252955082743,
+ "grad_norm": 2.5591681003570557,
+ "learning_rate": 2.5455368508960343e-06,
+ "loss": 0.4162,
+ "step": 6322
+ },
+ {
+ "epoch": 2.989598108747045,
+ "grad_norm": 3.2619881629943848,
+ "learning_rate": 2.5449131260025626e-06,
+ "loss": 0.4412,
+ "step": 6323
+ },
+ {
+ "epoch": 2.9900709219858155,
+ "grad_norm": 2.897914409637451,
+ "learning_rate": 2.544289398312549e-06,
+ "loss": 0.5079,
+ "step": 6324
+ },
+ {
+ "epoch": 2.9905437352245863,
+ "grad_norm": 2.7891685962677,
+ "learning_rate": 2.5436656678648274e-06,
+ "loss": 0.42,
+ "step": 6325
+ },
+ {
+ "epoch": 2.991016548463357,
+ "grad_norm": 3.022341728210449,
+ "learning_rate": 2.5430419346982367e-06,
+ "loss": 0.4739,
+ "step": 6326
+ },
+ {
+ "epoch": 2.9914893617021274,
+ "grad_norm": 3.395775556564331,
+ "learning_rate": 2.542418198851614e-06,
+ "loss": 0.4822,
+ "step": 6327
+ },
+ {
+ "epoch": 2.9919621749408982,
+ "grad_norm": 3.0200490951538086,
+ "learning_rate": 2.541794460363795e-06,
+ "loss": 0.4755,
+ "step": 6328
+ },
+ {
+ "epoch": 2.992434988179669,
+ "grad_norm": 3.302020311355591,
+ "learning_rate": 2.541170719273619e-06,
+ "loss": 0.4603,
+ "step": 6329
+ },
+ {
+ "epoch": 2.99290780141844,
+ "grad_norm": 2.5985910892486572,
+ "learning_rate": 2.5405469756199226e-06,
+ "loss": 0.4475,
+ "step": 6330
+ },
+ {
+ "epoch": 2.9933806146572106,
+ "grad_norm": 2.9413928985595703,
+ "learning_rate": 2.5399232294415434e-06,
+ "loss": 0.4695,
+ "step": 6331
+ },
+ {
+ "epoch": 2.993853427895981,
+ "grad_norm": 2.942777156829834,
+ "learning_rate": 2.53929948077732e-06,
+ "loss": 0.4462,
+ "step": 6332
+ },
+ {
+ "epoch": 2.9943262411347518,
+ "grad_norm": 2.971120595932007,
+ "learning_rate": 2.53867572966609e-06,
+ "loss": 0.4546,
+ "step": 6333
+ },
+ {
+ "epoch": 2.9947990543735226,
+ "grad_norm": 2.8248138427734375,
+ "learning_rate": 2.5380519761466927e-06,
+ "loss": 0.453,
+ "step": 6334
+ },
+ {
+ "epoch": 2.995271867612293,
+ "grad_norm": 3.0819008350372314,
+ "learning_rate": 2.5374282202579647e-06,
+ "loss": 0.4774,
+ "step": 6335
+ },
+ {
+ "epoch": 2.9957446808510637,
+ "grad_norm": 2.742570161819458,
+ "learning_rate": 2.5368044620387466e-06,
+ "loss": 0.5059,
+ "step": 6336
+ },
+ {
+ "epoch": 2.9962174940898345,
+ "grad_norm": 2.9087419509887695,
+ "learning_rate": 2.5361807015278757e-06,
+ "loss": 0.3606,
+ "step": 6337
+ },
+ {
+ "epoch": 2.9966903073286053,
+ "grad_norm": 2.6887354850769043,
+ "learning_rate": 2.5355569387641908e-06,
+ "loss": 0.4247,
+ "step": 6338
+ },
+ {
+ "epoch": 2.997163120567376,
+ "grad_norm": 2.8516008853912354,
+ "learning_rate": 2.534933173786531e-06,
+ "loss": 0.4502,
+ "step": 6339
+ },
+ {
+ "epoch": 2.9976359338061465,
+ "grad_norm": 2.4463164806365967,
+ "learning_rate": 2.5343094066337366e-06,
+ "loss": 0.3883,
+ "step": 6340
+ },
+ {
+ "epoch": 2.9981087470449173,
+ "grad_norm": 2.87025785446167,
+ "learning_rate": 2.533685637344645e-06,
+ "loss": 0.4534,
+ "step": 6341
+ },
+ {
+ "epoch": 2.998581560283688,
+ "grad_norm": 3.0706169605255127,
+ "learning_rate": 2.5330618659580967e-06,
+ "loss": 0.5426,
+ "step": 6342
+ },
+ {
+ "epoch": 2.9990543735224584,
+ "grad_norm": 2.7185773849487305,
+ "learning_rate": 2.532438092512931e-06,
+ "loss": 0.497,
+ "step": 6343
+ },
+ {
+ "epoch": 2.999527186761229,
+ "grad_norm": 2.840207815170288,
+ "learning_rate": 2.531814317047988e-06,
+ "loss": 0.4073,
+ "step": 6344
+ },
+ {
+ "epoch": 3.0,
+ "grad_norm": 3.1592655181884766,
+ "learning_rate": 2.5311905396021063e-06,
+ "loss": 0.4728,
+ "step": 6345
+ },
+ {
+ "epoch": 3.000472813238771,
+ "grad_norm": 2.190042495727539,
+ "learning_rate": 2.530566760214127e-06,
+ "loss": 0.3588,
+ "step": 6346
+ },
+ {
+ "epoch": 3.000945626477541,
+ "grad_norm": 2.749516248703003,
+ "learning_rate": 2.5299429789228898e-06,
+ "loss": 0.3495,
+ "step": 6347
+ },
+ {
+ "epoch": 3.001418439716312,
+ "grad_norm": 2.6181938648223877,
+ "learning_rate": 2.5293191957672335e-06,
+ "loss": 0.3611,
+ "step": 6348
+ },
+ {
+ "epoch": 3.0018912529550827,
+ "grad_norm": 2.7235212326049805,
+ "learning_rate": 2.528695410786e-06,
+ "loss": 0.4173,
+ "step": 6349
+ },
+ {
+ "epoch": 3.0023640661938535,
+ "grad_norm": 2.5408031940460205,
+ "learning_rate": 2.528071624018029e-06,
+ "loss": 0.3651,
+ "step": 6350
+ },
+ {
+ "epoch": 3.002836879432624,
+ "grad_norm": 2.7824409008026123,
+ "learning_rate": 2.5274478355021615e-06,
+ "loss": 0.378,
+ "step": 6351
+ },
+ {
+ "epoch": 3.0033096926713947,
+ "grad_norm": 2.7671427726745605,
+ "learning_rate": 2.526824045277238e-06,
+ "loss": 0.446,
+ "step": 6352
+ },
+ {
+ "epoch": 3.0037825059101655,
+ "grad_norm": 2.6746346950531006,
+ "learning_rate": 2.526200253382098e-06,
+ "loss": 0.3831,
+ "step": 6353
+ },
+ {
+ "epoch": 3.0042553191489363,
+ "grad_norm": 2.437439441680908,
+ "learning_rate": 2.525576459855583e-06,
+ "loss": 0.352,
+ "step": 6354
+ },
+ {
+ "epoch": 3.0047281323877066,
+ "grad_norm": 2.7632546424865723,
+ "learning_rate": 2.5249526647365343e-06,
+ "loss": 0.4636,
+ "step": 6355
+ },
+ {
+ "epoch": 3.0052009456264774,
+ "grad_norm": 2.681955099105835,
+ "learning_rate": 2.524328868063793e-06,
+ "loss": 0.3978,
+ "step": 6356
+ },
+ {
+ "epoch": 3.0056737588652482,
+ "grad_norm": 2.9575345516204834,
+ "learning_rate": 2.523705069876199e-06,
+ "loss": 0.3803,
+ "step": 6357
+ },
+ {
+ "epoch": 3.006146572104019,
+ "grad_norm": 2.7368216514587402,
+ "learning_rate": 2.523081270212594e-06,
+ "loss": 0.3968,
+ "step": 6358
+ },
+ {
+ "epoch": 3.0066193853427894,
+ "grad_norm": 2.637592077255249,
+ "learning_rate": 2.522457469111821e-06,
+ "loss": 0.3629,
+ "step": 6359
+ },
+ {
+ "epoch": 3.00709219858156,
+ "grad_norm": 2.579331398010254,
+ "learning_rate": 2.5218336666127187e-06,
+ "loss": 0.4044,
+ "step": 6360
+ },
+ {
+ "epoch": 3.007565011820331,
+ "grad_norm": 3.014544725418091,
+ "learning_rate": 2.5212098627541296e-06,
+ "loss": 0.3518,
+ "step": 6361
+ },
+ {
+ "epoch": 3.0080378250591018,
+ "grad_norm": 2.5261058807373047,
+ "learning_rate": 2.520586057574896e-06,
+ "loss": 0.3763,
+ "step": 6362
+ },
+ {
+ "epoch": 3.008510638297872,
+ "grad_norm": 3.234910249710083,
+ "learning_rate": 2.519962251113858e-06,
+ "loss": 0.3691,
+ "step": 6363
+ },
+ {
+ "epoch": 3.008983451536643,
+ "grad_norm": 3.2930967807769775,
+ "learning_rate": 2.519338443409859e-06,
+ "loss": 0.4363,
+ "step": 6364
+ },
+ {
+ "epoch": 3.0094562647754137,
+ "grad_norm": 2.807910442352295,
+ "learning_rate": 2.51871463450174e-06,
+ "loss": 0.3984,
+ "step": 6365
+ },
+ {
+ "epoch": 3.0099290780141845,
+ "grad_norm": 3.1555075645446777,
+ "learning_rate": 2.518090824428342e-06,
+ "loss": 0.4006,
+ "step": 6366
+ },
+ {
+ "epoch": 3.010401891252955,
+ "grad_norm": 3.1793272495269775,
+ "learning_rate": 2.5174670132285084e-06,
+ "loss": 0.4966,
+ "step": 6367
+ },
+ {
+ "epoch": 3.0108747044917257,
+ "grad_norm": 2.7007548809051514,
+ "learning_rate": 2.5168432009410805e-06,
+ "loss": 0.3755,
+ "step": 6368
+ },
+ {
+ "epoch": 3.0113475177304965,
+ "grad_norm": 2.914792537689209,
+ "learning_rate": 2.5162193876048995e-06,
+ "loss": 0.39,
+ "step": 6369
+ },
+ {
+ "epoch": 3.0118203309692673,
+ "grad_norm": 2.935516119003296,
+ "learning_rate": 2.5155955732588093e-06,
+ "loss": 0.4045,
+ "step": 6370
+ },
+ {
+ "epoch": 3.0122931442080376,
+ "grad_norm": 2.8817989826202393,
+ "learning_rate": 2.5149717579416503e-06,
+ "loss": 0.3751,
+ "step": 6371
+ },
+ {
+ "epoch": 3.0127659574468084,
+ "grad_norm": 2.9181740283966064,
+ "learning_rate": 2.514347941692266e-06,
+ "loss": 0.3689,
+ "step": 6372
+ },
+ {
+ "epoch": 3.013238770685579,
+ "grad_norm": 3.052060604095459,
+ "learning_rate": 2.5137241245494982e-06,
+ "loss": 0.3874,
+ "step": 6373
+ },
+ {
+ "epoch": 3.01371158392435,
+ "grad_norm": 2.6931657791137695,
+ "learning_rate": 2.513100306552189e-06,
+ "loss": 0.3673,
+ "step": 6374
+ },
+ {
+ "epoch": 3.0141843971631204,
+ "grad_norm": 2.3422248363494873,
+ "learning_rate": 2.5124764877391824e-06,
+ "loss": 0.3753,
+ "step": 6375
+ },
+ {
+ "epoch": 3.014657210401891,
+ "grad_norm": 2.5826265811920166,
+ "learning_rate": 2.5118526681493186e-06,
+ "loss": 0.3661,
+ "step": 6376
+ },
+ {
+ "epoch": 3.015130023640662,
+ "grad_norm": 2.7407493591308594,
+ "learning_rate": 2.5112288478214415e-06,
+ "loss": 0.3887,
+ "step": 6377
+ },
+ {
+ "epoch": 3.0156028368794328,
+ "grad_norm": 2.7378315925598145,
+ "learning_rate": 2.510605026794393e-06,
+ "loss": 0.3623,
+ "step": 6378
+ },
+ {
+ "epoch": 3.016075650118203,
+ "grad_norm": 2.59541654586792,
+ "learning_rate": 2.5099812051070167e-06,
+ "loss": 0.3804,
+ "step": 6379
+ },
+ {
+ "epoch": 3.016548463356974,
+ "grad_norm": 3.1022770404815674,
+ "learning_rate": 2.509357382798154e-06,
+ "loss": 0.4092,
+ "step": 6380
+ },
+ {
+ "epoch": 3.0170212765957447,
+ "grad_norm": 2.521545648574829,
+ "learning_rate": 2.5087335599066476e-06,
+ "loss": 0.3509,
+ "step": 6381
+ },
+ {
+ "epoch": 3.0174940898345155,
+ "grad_norm": 2.949395179748535,
+ "learning_rate": 2.5081097364713407e-06,
+ "loss": 0.387,
+ "step": 6382
+ },
+ {
+ "epoch": 3.017966903073286,
+ "grad_norm": 2.4806487560272217,
+ "learning_rate": 2.507485912531077e-06,
+ "loss": 0.4004,
+ "step": 6383
+ },
+ {
+ "epoch": 3.0184397163120567,
+ "grad_norm": 2.6480894088745117,
+ "learning_rate": 2.506862088124698e-06,
+ "loss": 0.3366,
+ "step": 6384
+ },
+ {
+ "epoch": 3.0189125295508275,
+ "grad_norm": 2.62559175491333,
+ "learning_rate": 2.5062382632910463e-06,
+ "loss": 0.3676,
+ "step": 6385
+ },
+ {
+ "epoch": 3.0193853427895982,
+ "grad_norm": 2.694767951965332,
+ "learning_rate": 2.5056144380689657e-06,
+ "loss": 0.3438,
+ "step": 6386
+ },
+ {
+ "epoch": 3.0198581560283686,
+ "grad_norm": 2.808107614517212,
+ "learning_rate": 2.504990612497299e-06,
+ "loss": 0.3831,
+ "step": 6387
+ },
+ {
+ "epoch": 3.0203309692671394,
+ "grad_norm": 3.2392303943634033,
+ "learning_rate": 2.504366786614888e-06,
+ "loss": 0.3493,
+ "step": 6388
+ },
+ {
+ "epoch": 3.02080378250591,
+ "grad_norm": 2.6899030208587646,
+ "learning_rate": 2.5037429604605774e-06,
+ "loss": 0.3998,
+ "step": 6389
+ },
+ {
+ "epoch": 3.021276595744681,
+ "grad_norm": 2.5622799396514893,
+ "learning_rate": 2.503119134073208e-06,
+ "loss": 0.3443,
+ "step": 6390
+ },
+ {
+ "epoch": 3.0217494089834513,
+ "grad_norm": 2.716832399368286,
+ "learning_rate": 2.502495307491625e-06,
+ "loss": 0.4465,
+ "step": 6391
+ },
+ {
+ "epoch": 3.022222222222222,
+ "grad_norm": 2.8117692470550537,
+ "learning_rate": 2.501871480754669e-06,
+ "loss": 0.3513,
+ "step": 6392
+ },
+ {
+ "epoch": 3.022695035460993,
+ "grad_norm": 3.1260762214660645,
+ "learning_rate": 2.501247653901185e-06,
+ "loss": 0.4336,
+ "step": 6393
+ },
+ {
+ "epoch": 3.0231678486997637,
+ "grad_norm": 2.5076897144317627,
+ "learning_rate": 2.5006238269700137e-06,
+ "loss": 0.3437,
+ "step": 6394
+ },
+ {
+ "epoch": 3.023640661938534,
+ "grad_norm": 2.781937837600708,
+ "learning_rate": 2.5e-06,
+ "loss": 0.3583,
+ "step": 6395
+ },
+ {
+ "epoch": 3.024113475177305,
+ "grad_norm": 3.084050178527832,
+ "learning_rate": 2.499376173029987e-06,
+ "loss": 0.3785,
+ "step": 6396
+ },
+ {
+ "epoch": 3.0245862884160757,
+ "grad_norm": 3.2292473316192627,
+ "learning_rate": 2.498752346098816e-06,
+ "loss": 0.3858,
+ "step": 6397
+ },
+ {
+ "epoch": 3.0250591016548465,
+ "grad_norm": 2.738614797592163,
+ "learning_rate": 2.498128519245332e-06,
+ "loss": 0.4166,
+ "step": 6398
+ },
+ {
+ "epoch": 3.025531914893617,
+ "grad_norm": 2.940103054046631,
+ "learning_rate": 2.4975046925083764e-06,
+ "loss": 0.4117,
+ "step": 6399
+ },
+ {
+ "epoch": 3.0260047281323876,
+ "grad_norm": 2.5177032947540283,
+ "learning_rate": 2.4968808659267927e-06,
+ "loss": 0.3704,
+ "step": 6400
+ },
+ {
+ "epoch": 3.0264775413711584,
+ "grad_norm": 2.6969990730285645,
+ "learning_rate": 2.4962570395394243e-06,
+ "loss": 0.3721,
+ "step": 6401
+ },
+ {
+ "epoch": 3.0269503546099292,
+ "grad_norm": 2.9696028232574463,
+ "learning_rate": 2.495633213385112e-06,
+ "loss": 0.3934,
+ "step": 6402
+ },
+ {
+ "epoch": 3.0274231678486996,
+ "grad_norm": 3.4032552242279053,
+ "learning_rate": 2.495009387502702e-06,
+ "loss": 0.3877,
+ "step": 6403
+ },
+ {
+ "epoch": 3.0278959810874704,
+ "grad_norm": 2.6801865100860596,
+ "learning_rate": 2.4943855619310343e-06,
+ "loss": 0.3421,
+ "step": 6404
+ },
+ {
+ "epoch": 3.028368794326241,
+ "grad_norm": 2.827056884765625,
+ "learning_rate": 2.493761736708954e-06,
+ "loss": 0.3791,
+ "step": 6405
+ },
+ {
+ "epoch": 3.028841607565012,
+ "grad_norm": 2.6393566131591797,
+ "learning_rate": 2.4931379118753034e-06,
+ "loss": 0.3729,
+ "step": 6406
+ },
+ {
+ "epoch": 3.0293144208037823,
+ "grad_norm": 2.833519458770752,
+ "learning_rate": 2.4925140874689236e-06,
+ "loss": 0.3836,
+ "step": 6407
+ },
+ {
+ "epoch": 3.029787234042553,
+ "grad_norm": 2.8852169513702393,
+ "learning_rate": 2.4918902635286597e-06,
+ "loss": 0.4307,
+ "step": 6408
+ },
+ {
+ "epoch": 3.030260047281324,
+ "grad_norm": 2.7166404724121094,
+ "learning_rate": 2.491266440093354e-06,
+ "loss": 0.3825,
+ "step": 6409
+ },
+ {
+ "epoch": 3.0307328605200947,
+ "grad_norm": 2.5828018188476562,
+ "learning_rate": 2.4906426172018474e-06,
+ "loss": 0.3579,
+ "step": 6410
+ },
+ {
+ "epoch": 3.031205673758865,
+ "grad_norm": 2.915632724761963,
+ "learning_rate": 2.490018794892985e-06,
+ "loss": 0.4099,
+ "step": 6411
+ },
+ {
+ "epoch": 3.031678486997636,
+ "grad_norm": 2.7117249965667725,
+ "learning_rate": 2.489394973205607e-06,
+ "loss": 0.4063,
+ "step": 6412
+ },
+ {
+ "epoch": 3.0321513002364067,
+ "grad_norm": 2.3989102840423584,
+ "learning_rate": 2.488771152178559e-06,
+ "loss": 0.3377,
+ "step": 6413
+ },
+ {
+ "epoch": 3.0326241134751775,
+ "grad_norm": 2.6560115814208984,
+ "learning_rate": 2.488147331850682e-06,
+ "loss": 0.4072,
+ "step": 6414
+ },
+ {
+ "epoch": 3.033096926713948,
+ "grad_norm": 2.9466328620910645,
+ "learning_rate": 2.4875235122608184e-06,
+ "loss": 0.3559,
+ "step": 6415
+ },
+ {
+ "epoch": 3.0335697399527186,
+ "grad_norm": 2.765348196029663,
+ "learning_rate": 2.4868996934478114e-06,
+ "loss": 0.336,
+ "step": 6416
+ },
+ {
+ "epoch": 3.0340425531914894,
+ "grad_norm": 2.6021807193756104,
+ "learning_rate": 2.4862758754505017e-06,
+ "loss": 0.3861,
+ "step": 6417
+ },
+ {
+ "epoch": 3.03451536643026,
+ "grad_norm": 2.7293684482574463,
+ "learning_rate": 2.4856520583077344e-06,
+ "loss": 0.3926,
+ "step": 6418
+ },
+ {
+ "epoch": 3.0349881796690306,
+ "grad_norm": 2.9704763889312744,
+ "learning_rate": 2.485028242058351e-06,
+ "loss": 0.4303,
+ "step": 6419
+ },
+ {
+ "epoch": 3.0354609929078014,
+ "grad_norm": 3.385713815689087,
+ "learning_rate": 2.484404426741191e-06,
+ "loss": 0.44,
+ "step": 6420
+ },
+ {
+ "epoch": 3.035933806146572,
+ "grad_norm": 3.177983045578003,
+ "learning_rate": 2.4837806123951013e-06,
+ "loss": 0.4256,
+ "step": 6421
+ },
+ {
+ "epoch": 3.036406619385343,
+ "grad_norm": 2.6287200450897217,
+ "learning_rate": 2.4831567990589203e-06,
+ "loss": 0.3764,
+ "step": 6422
+ },
+ {
+ "epoch": 3.0368794326241133,
+ "grad_norm": 2.81823992729187,
+ "learning_rate": 2.4825329867714924e-06,
+ "loss": 0.3645,
+ "step": 6423
+ },
+ {
+ "epoch": 3.037352245862884,
+ "grad_norm": 3.1826934814453125,
+ "learning_rate": 2.4819091755716586e-06,
+ "loss": 0.3666,
+ "step": 6424
+ },
+ {
+ "epoch": 3.037825059101655,
+ "grad_norm": 3.0880346298217773,
+ "learning_rate": 2.481285365498261e-06,
+ "loss": 0.4339,
+ "step": 6425
+ },
+ {
+ "epoch": 3.0382978723404257,
+ "grad_norm": 3.1764965057373047,
+ "learning_rate": 2.480661556590142e-06,
+ "loss": 0.4804,
+ "step": 6426
+ },
+ {
+ "epoch": 3.038770685579196,
+ "grad_norm": 2.89469313621521,
+ "learning_rate": 2.480037748886142e-06,
+ "loss": 0.3875,
+ "step": 6427
+ },
+ {
+ "epoch": 3.039243498817967,
+ "grad_norm": 2.6043636798858643,
+ "learning_rate": 2.479413942425105e-06,
+ "loss": 0.3859,
+ "step": 6428
+ },
+ {
+ "epoch": 3.0397163120567376,
+ "grad_norm": 2.6570727825164795,
+ "learning_rate": 2.4787901372458712e-06,
+ "loss": 0.3508,
+ "step": 6429
+ },
+ {
+ "epoch": 3.0401891252955084,
+ "grad_norm": 2.914050579071045,
+ "learning_rate": 2.4781663333872825e-06,
+ "loss": 0.3904,
+ "step": 6430
+ },
+ {
+ "epoch": 3.040661938534279,
+ "grad_norm": 2.595606803894043,
+ "learning_rate": 2.47754253088818e-06,
+ "loss": 0.3753,
+ "step": 6431
+ },
+ {
+ "epoch": 3.0411347517730496,
+ "grad_norm": 2.68186616897583,
+ "learning_rate": 2.4769187297874065e-06,
+ "loss": 0.3545,
+ "step": 6432
+ },
+ {
+ "epoch": 3.0416075650118204,
+ "grad_norm": 2.956507921218872,
+ "learning_rate": 2.476294930123802e-06,
+ "loss": 0.3778,
+ "step": 6433
+ },
+ {
+ "epoch": 3.042080378250591,
+ "grad_norm": 2.8327226638793945,
+ "learning_rate": 2.475671131936209e-06,
+ "loss": 0.3205,
+ "step": 6434
+ },
+ {
+ "epoch": 3.0425531914893615,
+ "grad_norm": 2.594348430633545,
+ "learning_rate": 2.475047335263466e-06,
+ "loss": 0.3859,
+ "step": 6435
+ },
+ {
+ "epoch": 3.0430260047281323,
+ "grad_norm": 3.5030717849731445,
+ "learning_rate": 2.4744235401444177e-06,
+ "loss": 0.3611,
+ "step": 6436
+ },
+ {
+ "epoch": 3.043498817966903,
+ "grad_norm": 2.8478317260742188,
+ "learning_rate": 2.4737997466179034e-06,
+ "loss": 0.3927,
+ "step": 6437
+ },
+ {
+ "epoch": 3.043971631205674,
+ "grad_norm": 2.677827835083008,
+ "learning_rate": 2.4731759547227627e-06,
+ "loss": 0.3784,
+ "step": 6438
+ },
+ {
+ "epoch": 3.0444444444444443,
+ "grad_norm": 3.0059866905212402,
+ "learning_rate": 2.4725521644978393e-06,
+ "loss": 0.4279,
+ "step": 6439
+ },
+ {
+ "epoch": 3.044917257683215,
+ "grad_norm": 3.012500047683716,
+ "learning_rate": 2.4719283759819713e-06,
+ "loss": 0.4007,
+ "step": 6440
+ },
+ {
+ "epoch": 3.045390070921986,
+ "grad_norm": 2.758204936981201,
+ "learning_rate": 2.4713045892140007e-06,
+ "loss": 0.3668,
+ "step": 6441
+ },
+ {
+ "epoch": 3.0458628841607567,
+ "grad_norm": 2.9551615715026855,
+ "learning_rate": 2.4706808042327678e-06,
+ "loss": 0.3524,
+ "step": 6442
+ },
+ {
+ "epoch": 3.046335697399527,
+ "grad_norm": 2.8639965057373047,
+ "learning_rate": 2.4700570210771115e-06,
+ "loss": 0.3886,
+ "step": 6443
+ },
+ {
+ "epoch": 3.046808510638298,
+ "grad_norm": 2.718219757080078,
+ "learning_rate": 2.4694332397858738e-06,
+ "loss": 0.3693,
+ "step": 6444
+ },
+ {
+ "epoch": 3.0472813238770686,
+ "grad_norm": 3.050135612487793,
+ "learning_rate": 2.4688094603978933e-06,
+ "loss": 0.3979,
+ "step": 6445
+ },
+ {
+ "epoch": 3.0477541371158394,
+ "grad_norm": 2.786186456680298,
+ "learning_rate": 2.468185682952013e-06,
+ "loss": 0.3809,
+ "step": 6446
+ },
+ {
+ "epoch": 3.0482269503546098,
+ "grad_norm": 2.6462252140045166,
+ "learning_rate": 2.4675619074870697e-06,
+ "loss": 0.3746,
+ "step": 6447
+ },
+ {
+ "epoch": 3.0486997635933806,
+ "grad_norm": 2.984783887863159,
+ "learning_rate": 2.4669381340419037e-06,
+ "loss": 0.4092,
+ "step": 6448
+ },
+ {
+ "epoch": 3.0491725768321514,
+ "grad_norm": 2.936380624771118,
+ "learning_rate": 2.466314362655356e-06,
+ "loss": 0.4335,
+ "step": 6449
+ },
+ {
+ "epoch": 3.049645390070922,
+ "grad_norm": 2.730738639831543,
+ "learning_rate": 2.465690593366264e-06,
+ "loss": 0.364,
+ "step": 6450
+ },
+ {
+ "epoch": 3.0501182033096925,
+ "grad_norm": 2.7273590564727783,
+ "learning_rate": 2.4650668262134693e-06,
+ "loss": 0.3905,
+ "step": 6451
+ },
+ {
+ "epoch": 3.0505910165484633,
+ "grad_norm": 2.9588208198547363,
+ "learning_rate": 2.4644430612358105e-06,
+ "loss": 0.3936,
+ "step": 6452
+ },
+ {
+ "epoch": 3.051063829787234,
+ "grad_norm": 2.8721611499786377,
+ "learning_rate": 2.4638192984721247e-06,
+ "loss": 0.4279,
+ "step": 6453
+ },
+ {
+ "epoch": 3.051536643026005,
+ "grad_norm": 3.7179651260375977,
+ "learning_rate": 2.463195537961254e-06,
+ "loss": 0.427,
+ "step": 6454
+ },
+ {
+ "epoch": 3.0520094562647753,
+ "grad_norm": 2.651731491088867,
+ "learning_rate": 2.4625717797420353e-06,
+ "loss": 0.3471,
+ "step": 6455
+ },
+ {
+ "epoch": 3.052482269503546,
+ "grad_norm": 3.898737668991089,
+ "learning_rate": 2.4619480238533085e-06,
+ "loss": 0.4574,
+ "step": 6456
+ },
+ {
+ "epoch": 3.052955082742317,
+ "grad_norm": 2.916252374649048,
+ "learning_rate": 2.4613242703339108e-06,
+ "loss": 0.3622,
+ "step": 6457
+ },
+ {
+ "epoch": 3.0534278959810877,
+ "grad_norm": 3.122565507888794,
+ "learning_rate": 2.4607005192226806e-06,
+ "loss": 0.3954,
+ "step": 6458
+ },
+ {
+ "epoch": 3.053900709219858,
+ "grad_norm": 3.2377424240112305,
+ "learning_rate": 2.4600767705584575e-06,
+ "loss": 0.4082,
+ "step": 6459
+ },
+ {
+ "epoch": 3.054373522458629,
+ "grad_norm": 2.941102981567383,
+ "learning_rate": 2.459453024380079e-06,
+ "loss": 0.4324,
+ "step": 6460
+ },
+ {
+ "epoch": 3.0548463356973996,
+ "grad_norm": 2.964313507080078,
+ "learning_rate": 2.4588292807263816e-06,
+ "loss": 0.3037,
+ "step": 6461
+ },
+ {
+ "epoch": 3.0553191489361704,
+ "grad_norm": 2.824669599533081,
+ "learning_rate": 2.4582055396362055e-06,
+ "loss": 0.4076,
+ "step": 6462
+ },
+ {
+ "epoch": 3.0557919621749408,
+ "grad_norm": 2.7739884853363037,
+ "learning_rate": 2.457581801148387e-06,
+ "loss": 0.3615,
+ "step": 6463
+ },
+ {
+ "epoch": 3.0562647754137116,
+ "grad_norm": 3.2974464893341064,
+ "learning_rate": 2.456958065301764e-06,
+ "loss": 0.426,
+ "step": 6464
+ },
+ {
+ "epoch": 3.0567375886524824,
+ "grad_norm": 3.0801217555999756,
+ "learning_rate": 2.456334332135174e-06,
+ "loss": 0.3737,
+ "step": 6465
+ },
+ {
+ "epoch": 3.057210401891253,
+ "grad_norm": 2.788851022720337,
+ "learning_rate": 2.455710601687452e-06,
+ "loss": 0.4367,
+ "step": 6466
+ },
+ {
+ "epoch": 3.0576832151300235,
+ "grad_norm": 2.8078136444091797,
+ "learning_rate": 2.4550868739974378e-06,
+ "loss": 0.3796,
+ "step": 6467
+ },
+ {
+ "epoch": 3.0581560283687943,
+ "grad_norm": 2.9871349334716797,
+ "learning_rate": 2.4544631491039657e-06,
+ "loss": 0.3869,
+ "step": 6468
+ },
+ {
+ "epoch": 3.058628841607565,
+ "grad_norm": 2.9170174598693848,
+ "learning_rate": 2.453839427045875e-06,
+ "loss": 0.4591,
+ "step": 6469
+ },
+ {
+ "epoch": 3.059101654846336,
+ "grad_norm": 2.7316131591796875,
+ "learning_rate": 2.4532157078620013e-06,
+ "loss": 0.3723,
+ "step": 6470
+ },
+ {
+ "epoch": 3.0595744680851062,
+ "grad_norm": 3.047921657562256,
+ "learning_rate": 2.4525919915911793e-06,
+ "loss": 0.3804,
+ "step": 6471
+ },
+ {
+ "epoch": 3.060047281323877,
+ "grad_norm": 3.047934055328369,
+ "learning_rate": 2.4519682782722465e-06,
+ "loss": 0.3949,
+ "step": 6472
+ },
+ {
+ "epoch": 3.060520094562648,
+ "grad_norm": 2.4911186695098877,
+ "learning_rate": 2.4513445679440374e-06,
+ "loss": 0.3629,
+ "step": 6473
+ },
+ {
+ "epoch": 3.0609929078014186,
+ "grad_norm": 2.5353519916534424,
+ "learning_rate": 2.4507208606453895e-06,
+ "loss": 0.3417,
+ "step": 6474
+ },
+ {
+ "epoch": 3.061465721040189,
+ "grad_norm": 2.474622964859009,
+ "learning_rate": 2.4500971564151384e-06,
+ "loss": 0.3468,
+ "step": 6475
+ },
+ {
+ "epoch": 3.06193853427896,
+ "grad_norm": 2.7016963958740234,
+ "learning_rate": 2.4494734552921166e-06,
+ "loss": 0.3872,
+ "step": 6476
+ },
+ {
+ "epoch": 3.0624113475177306,
+ "grad_norm": 2.912144184112549,
+ "learning_rate": 2.4488497573151625e-06,
+ "loss": 0.3727,
+ "step": 6477
+ },
+ {
+ "epoch": 3.0628841607565014,
+ "grad_norm": 2.8234877586364746,
+ "learning_rate": 2.4482260625231093e-06,
+ "loss": 0.3472,
+ "step": 6478
+ },
+ {
+ "epoch": 3.0633569739952717,
+ "grad_norm": 2.6554179191589355,
+ "learning_rate": 2.447602370954793e-06,
+ "loss": 0.343,
+ "step": 6479
+ },
+ {
+ "epoch": 3.0638297872340425,
+ "grad_norm": 2.666419744491577,
+ "learning_rate": 2.446978682649047e-06,
+ "loss": 0.3932,
+ "step": 6480
+ },
+ {
+ "epoch": 3.0643026004728133,
+ "grad_norm": 2.968574285507202,
+ "learning_rate": 2.446354997644705e-06,
+ "loss": 0.4418,
+ "step": 6481
+ },
+ {
+ "epoch": 3.064775413711584,
+ "grad_norm": 2.692253589630127,
+ "learning_rate": 2.4457313159806028e-06,
+ "loss": 0.3141,
+ "step": 6482
+ },
+ {
+ "epoch": 3.0652482269503545,
+ "grad_norm": 2.5857295989990234,
+ "learning_rate": 2.445107637695574e-06,
+ "loss": 0.3392,
+ "step": 6483
+ },
+ {
+ "epoch": 3.0657210401891253,
+ "grad_norm": 3.2332825660705566,
+ "learning_rate": 2.4444839628284504e-06,
+ "loss": 0.4694,
+ "step": 6484
+ },
+ {
+ "epoch": 3.066193853427896,
+ "grad_norm": 2.7391014099121094,
+ "learning_rate": 2.4438602914180684e-06,
+ "loss": 0.3966,
+ "step": 6485
+ },
+ {
+ "epoch": 3.066666666666667,
+ "grad_norm": 2.7882139682769775,
+ "learning_rate": 2.4432366235032593e-06,
+ "loss": 0.3552,
+ "step": 6486
+ },
+ {
+ "epoch": 3.0671394799054372,
+ "grad_norm": 2.8907811641693115,
+ "learning_rate": 2.4426129591228573e-06,
+ "loss": 0.4478,
+ "step": 6487
+ },
+ {
+ "epoch": 3.067612293144208,
+ "grad_norm": 2.878929853439331,
+ "learning_rate": 2.4419892983156947e-06,
+ "loss": 0.3457,
+ "step": 6488
+ },
+ {
+ "epoch": 3.068085106382979,
+ "grad_norm": 2.7087442874908447,
+ "learning_rate": 2.441365641120603e-06,
+ "loss": 0.3491,
+ "step": 6489
+ },
+ {
+ "epoch": 3.0685579196217496,
+ "grad_norm": 3.2330431938171387,
+ "learning_rate": 2.4407419875764167e-06,
+ "loss": 0.3901,
+ "step": 6490
+ },
+ {
+ "epoch": 3.06903073286052,
+ "grad_norm": 3.0529370307922363,
+ "learning_rate": 2.440118337721966e-06,
+ "loss": 0.4059,
+ "step": 6491
+ },
+ {
+ "epoch": 3.0695035460992908,
+ "grad_norm": 2.4786794185638428,
+ "learning_rate": 2.439494691596085e-06,
+ "loss": 0.3153,
+ "step": 6492
+ },
+ {
+ "epoch": 3.0699763593380616,
+ "grad_norm": 2.956310510635376,
+ "learning_rate": 2.438871049237604e-06,
+ "loss": 0.3973,
+ "step": 6493
+ },
+ {
+ "epoch": 3.0704491725768324,
+ "grad_norm": 3.0816991329193115,
+ "learning_rate": 2.4382474106853543e-06,
+ "loss": 0.388,
+ "step": 6494
+ },
+ {
+ "epoch": 3.0709219858156027,
+ "grad_norm": 2.6103477478027344,
+ "learning_rate": 2.4376237759781686e-06,
+ "loss": 0.3656,
+ "step": 6495
+ },
+ {
+ "epoch": 3.0713947990543735,
+ "grad_norm": 2.974076271057129,
+ "learning_rate": 2.437000145154875e-06,
+ "loss": 0.3246,
+ "step": 6496
+ },
+ {
+ "epoch": 3.0718676122931443,
+ "grad_norm": 2.633605718612671,
+ "learning_rate": 2.4363765182543075e-06,
+ "loss": 0.3556,
+ "step": 6497
+ },
+ {
+ "epoch": 3.072340425531915,
+ "grad_norm": 2.49161434173584,
+ "learning_rate": 2.4357528953152953e-06,
+ "loss": 0.3506,
+ "step": 6498
+ },
+ {
+ "epoch": 3.0728132387706855,
+ "grad_norm": 2.6435935497283936,
+ "learning_rate": 2.4351292763766676e-06,
+ "loss": 0.3652,
+ "step": 6499
+ },
+ {
+ "epoch": 3.0732860520094563,
+ "grad_norm": 2.9710617065429688,
+ "learning_rate": 2.4345056614772563e-06,
+ "loss": 0.3713,
+ "step": 6500
+ },
+ {
+ "epoch": 3.073758865248227,
+ "grad_norm": 2.6947052478790283,
+ "learning_rate": 2.43388205065589e-06,
+ "loss": 0.378,
+ "step": 6501
+ },
+ {
+ "epoch": 3.0742316784869974,
+ "grad_norm": 2.9686238765716553,
+ "learning_rate": 2.433258443951398e-06,
+ "loss": 0.3936,
+ "step": 6502
+ },
+ {
+ "epoch": 3.074704491725768,
+ "grad_norm": 2.6008691787719727,
+ "learning_rate": 2.432634841402611e-06,
+ "loss": 0.3709,
+ "step": 6503
+ },
+ {
+ "epoch": 3.075177304964539,
+ "grad_norm": 2.595116376876831,
+ "learning_rate": 2.4320112430483563e-06,
+ "loss": 0.3884,
+ "step": 6504
+ },
+ {
+ "epoch": 3.07565011820331,
+ "grad_norm": 2.685241460800171,
+ "learning_rate": 2.431387648927464e-06,
+ "loss": 0.3751,
+ "step": 6505
+ },
+ {
+ "epoch": 3.0761229314420806,
+ "grad_norm": 2.8863797187805176,
+ "learning_rate": 2.430764059078762e-06,
+ "loss": 0.3765,
+ "step": 6506
+ },
+ {
+ "epoch": 3.076595744680851,
+ "grad_norm": 3.020766019821167,
+ "learning_rate": 2.430140473541077e-06,
+ "loss": 0.362,
+ "step": 6507
+ },
+ {
+ "epoch": 3.0770685579196217,
+ "grad_norm": 2.9521167278289795,
+ "learning_rate": 2.42951689235324e-06,
+ "loss": 0.41,
+ "step": 6508
+ },
+ {
+ "epoch": 3.0775413711583925,
+ "grad_norm": 2.5844924449920654,
+ "learning_rate": 2.4288933155540757e-06,
+ "loss": 0.3258,
+ "step": 6509
+ },
+ {
+ "epoch": 3.078014184397163,
+ "grad_norm": 3.052661657333374,
+ "learning_rate": 2.4282697431824138e-06,
+ "loss": 0.363,
+ "step": 6510
+ },
+ {
+ "epoch": 3.0784869976359337,
+ "grad_norm": 3.109342575073242,
+ "learning_rate": 2.427646175277081e-06,
+ "loss": 0.4105,
+ "step": 6511
+ },
+ {
+ "epoch": 3.0789598108747045,
+ "grad_norm": 3.3141326904296875,
+ "learning_rate": 2.427022611876903e-06,
+ "loss": 0.405,
+ "step": 6512
+ },
+ {
+ "epoch": 3.0794326241134753,
+ "grad_norm": 3.054673194885254,
+ "learning_rate": 2.426399053020707e-06,
+ "loss": 0.3532,
+ "step": 6513
+ },
+ {
+ "epoch": 3.079905437352246,
+ "grad_norm": 2.823489189147949,
+ "learning_rate": 2.425775498747318e-06,
+ "loss": 0.3762,
+ "step": 6514
+ },
+ {
+ "epoch": 3.0803782505910164,
+ "grad_norm": 2.6739792823791504,
+ "learning_rate": 2.425151949095565e-06,
+ "loss": 0.4044,
+ "step": 6515
+ },
+ {
+ "epoch": 3.0808510638297872,
+ "grad_norm": 2.7313177585601807,
+ "learning_rate": 2.4245284041042714e-06,
+ "loss": 0.3136,
+ "step": 6516
+ },
+ {
+ "epoch": 3.081323877068558,
+ "grad_norm": 3.1661181449890137,
+ "learning_rate": 2.4239048638122624e-06,
+ "loss": 0.44,
+ "step": 6517
+ },
+ {
+ "epoch": 3.0817966903073284,
+ "grad_norm": 3.326542377471924,
+ "learning_rate": 2.4232813282583647e-06,
+ "loss": 0.3798,
+ "step": 6518
+ },
+ {
+ "epoch": 3.082269503546099,
+ "grad_norm": 3.0194952487945557,
+ "learning_rate": 2.422657797481402e-06,
+ "loss": 0.423,
+ "step": 6519
+ },
+ {
+ "epoch": 3.08274231678487,
+ "grad_norm": 2.6704318523406982,
+ "learning_rate": 2.4220342715201995e-06,
+ "loss": 0.41,
+ "step": 6520
+ },
+ {
+ "epoch": 3.083215130023641,
+ "grad_norm": 3.057990312576294,
+ "learning_rate": 2.421410750413581e-06,
+ "loss": 0.4096,
+ "step": 6521
+ },
+ {
+ "epoch": 3.083687943262411,
+ "grad_norm": 2.6242079734802246,
+ "learning_rate": 2.4207872342003693e-06,
+ "loss": 0.3673,
+ "step": 6522
+ },
+ {
+ "epoch": 3.084160756501182,
+ "grad_norm": 2.933910846710205,
+ "learning_rate": 2.4201637229193904e-06,
+ "loss": 0.4018,
+ "step": 6523
+ },
+ {
+ "epoch": 3.0846335697399527,
+ "grad_norm": 2.6973681449890137,
+ "learning_rate": 2.4195402166094657e-06,
+ "loss": 0.3533,
+ "step": 6524
+ },
+ {
+ "epoch": 3.0851063829787235,
+ "grad_norm": 3.096013307571411,
+ "learning_rate": 2.4189167153094194e-06,
+ "loss": 0.3872,
+ "step": 6525
+ },
+ {
+ "epoch": 3.085579196217494,
+ "grad_norm": 3.0707414150238037,
+ "learning_rate": 2.4182932190580737e-06,
+ "loss": 0.3775,
+ "step": 6526
+ },
+ {
+ "epoch": 3.0860520094562647,
+ "grad_norm": 2.873190402984619,
+ "learning_rate": 2.417669727894251e-06,
+ "loss": 0.3144,
+ "step": 6527
+ },
+ {
+ "epoch": 3.0865248226950355,
+ "grad_norm": 2.316431999206543,
+ "learning_rate": 2.4170462418567732e-06,
+ "loss": 0.3238,
+ "step": 6528
+ },
+ {
+ "epoch": 3.0869976359338063,
+ "grad_norm": 2.3672494888305664,
+ "learning_rate": 2.4164227609844626e-06,
+ "loss": 0.3585,
+ "step": 6529
+ },
+ {
+ "epoch": 3.0874704491725766,
+ "grad_norm": 2.904538154602051,
+ "learning_rate": 2.415799285316139e-06,
+ "loss": 0.366,
+ "step": 6530
+ },
+ {
+ "epoch": 3.0879432624113474,
+ "grad_norm": 2.914602279663086,
+ "learning_rate": 2.415175814890626e-06,
+ "loss": 0.3793,
+ "step": 6531
+ },
+ {
+ "epoch": 3.088416075650118,
+ "grad_norm": 2.652005672454834,
+ "learning_rate": 2.4145523497467417e-06,
+ "loss": 0.362,
+ "step": 6532
+ },
+ {
+ "epoch": 3.088888888888889,
+ "grad_norm": 2.5137813091278076,
+ "learning_rate": 2.413928889923309e-06,
+ "loss": 0.2974,
+ "step": 6533
+ },
+ {
+ "epoch": 3.0893617021276594,
+ "grad_norm": 3.2166645526885986,
+ "learning_rate": 2.413305435459147e-06,
+ "loss": 0.4151,
+ "step": 6534
+ },
+ {
+ "epoch": 3.08983451536643,
+ "grad_norm": 3.0506820678710938,
+ "learning_rate": 2.412681986393075e-06,
+ "loss": 0.4223,
+ "step": 6535
+ },
+ {
+ "epoch": 3.090307328605201,
+ "grad_norm": 3.035275936126709,
+ "learning_rate": 2.412058542763913e-06,
+ "loss": 0.4841,
+ "step": 6536
+ },
+ {
+ "epoch": 3.0907801418439718,
+ "grad_norm": 3.3195009231567383,
+ "learning_rate": 2.4114351046104793e-06,
+ "loss": 0.4205,
+ "step": 6537
+ },
+ {
+ "epoch": 3.091252955082742,
+ "grad_norm": 2.8700361251831055,
+ "learning_rate": 2.410811671971594e-06,
+ "loss": 0.3704,
+ "step": 6538
+ },
+ {
+ "epoch": 3.091725768321513,
+ "grad_norm": 2.900595188140869,
+ "learning_rate": 2.410188244886075e-06,
+ "loss": 0.4184,
+ "step": 6539
+ },
+ {
+ "epoch": 3.0921985815602837,
+ "grad_norm": 2.88179349899292,
+ "learning_rate": 2.409564823392739e-06,
+ "loss": 0.4156,
+ "step": 6540
+ },
+ {
+ "epoch": 3.0926713947990545,
+ "grad_norm": 2.677568197250366,
+ "learning_rate": 2.408941407530406e-06,
+ "loss": 0.4084,
+ "step": 6541
+ },
+ {
+ "epoch": 3.093144208037825,
+ "grad_norm": 3.0236027240753174,
+ "learning_rate": 2.408317997337892e-06,
+ "loss": 0.4384,
+ "step": 6542
+ },
+ {
+ "epoch": 3.0936170212765957,
+ "grad_norm": 3.1708545684814453,
+ "learning_rate": 2.4076945928540143e-06,
+ "loss": 0.3876,
+ "step": 6543
+ },
+ {
+ "epoch": 3.0940898345153665,
+ "grad_norm": 3.248821973800659,
+ "learning_rate": 2.40707119411759e-06,
+ "loss": 0.3865,
+ "step": 6544
+ },
+ {
+ "epoch": 3.0945626477541373,
+ "grad_norm": 3.0961649417877197,
+ "learning_rate": 2.4064478011674334e-06,
+ "loss": 0.3982,
+ "step": 6545
+ },
+ {
+ "epoch": 3.0950354609929076,
+ "grad_norm": 3.1989805698394775,
+ "learning_rate": 2.4058244140423637e-06,
+ "loss": 0.4777,
+ "step": 6546
+ },
+ {
+ "epoch": 3.0955082742316784,
+ "grad_norm": 2.805640459060669,
+ "learning_rate": 2.4052010327811933e-06,
+ "loss": 0.3764,
+ "step": 6547
+ },
+ {
+ "epoch": 3.095981087470449,
+ "grad_norm": 2.7225050926208496,
+ "learning_rate": 2.40457765742274e-06,
+ "loss": 0.3286,
+ "step": 6548
+ },
+ {
+ "epoch": 3.09645390070922,
+ "grad_norm": 3.119915008544922,
+ "learning_rate": 2.4039542880058174e-06,
+ "loss": 0.4463,
+ "step": 6549
+ },
+ {
+ "epoch": 3.0969267139479904,
+ "grad_norm": 2.8503530025482178,
+ "learning_rate": 2.4033309245692403e-06,
+ "loss": 0.395,
+ "step": 6550
+ },
+ {
+ "epoch": 3.097399527186761,
+ "grad_norm": 2.947504758834839,
+ "learning_rate": 2.4027075671518225e-06,
+ "loss": 0.4024,
+ "step": 6551
+ },
+ {
+ "epoch": 3.097872340425532,
+ "grad_norm": 3.170905113220215,
+ "learning_rate": 2.402084215792377e-06,
+ "loss": 0.4302,
+ "step": 6552
+ },
+ {
+ "epoch": 3.0983451536643027,
+ "grad_norm": 2.910475492477417,
+ "learning_rate": 2.4014608705297195e-06,
+ "loss": 0.4037,
+ "step": 6553
+ },
+ {
+ "epoch": 3.098817966903073,
+ "grad_norm": 2.627511978149414,
+ "learning_rate": 2.400837531402661e-06,
+ "loss": 0.3972,
+ "step": 6554
+ },
+ {
+ "epoch": 3.099290780141844,
+ "grad_norm": 2.6485681533813477,
+ "learning_rate": 2.4002141984500133e-06,
+ "loss": 0.4044,
+ "step": 6555
+ },
+ {
+ "epoch": 3.0997635933806147,
+ "grad_norm": 2.930954694747925,
+ "learning_rate": 2.399590871710592e-06,
+ "loss": 0.4214,
+ "step": 6556
+ },
+ {
+ "epoch": 3.1002364066193855,
+ "grad_norm": 2.6014554500579834,
+ "learning_rate": 2.3989675512232063e-06,
+ "loss": 0.3493,
+ "step": 6557
+ },
+ {
+ "epoch": 3.100709219858156,
+ "grad_norm": 2.899001121520996,
+ "learning_rate": 2.398344237026667e-06,
+ "loss": 0.382,
+ "step": 6558
+ },
+ {
+ "epoch": 3.1011820330969266,
+ "grad_norm": 2.4698870182037354,
+ "learning_rate": 2.3977209291597876e-06,
+ "loss": 0.3558,
+ "step": 6559
+ },
+ {
+ "epoch": 3.1016548463356974,
+ "grad_norm": 3.2926251888275146,
+ "learning_rate": 2.3970976276613763e-06,
+ "loss": 0.4078,
+ "step": 6560
+ },
+ {
+ "epoch": 3.1021276595744682,
+ "grad_norm": 2.5306150913238525,
+ "learning_rate": 2.3964743325702454e-06,
+ "loss": 0.3657,
+ "step": 6561
+ },
+ {
+ "epoch": 3.1026004728132386,
+ "grad_norm": 2.727583408355713,
+ "learning_rate": 2.395851043925204e-06,
+ "loss": 0.3791,
+ "step": 6562
+ },
+ {
+ "epoch": 3.1030732860520094,
+ "grad_norm": 3.1403541564941406,
+ "learning_rate": 2.3952277617650602e-06,
+ "loss": 0.3934,
+ "step": 6563
+ },
+ {
+ "epoch": 3.10354609929078,
+ "grad_norm": 2.5816383361816406,
+ "learning_rate": 2.3946044861286256e-06,
+ "loss": 0.3703,
+ "step": 6564
+ },
+ {
+ "epoch": 3.104018912529551,
+ "grad_norm": 2.5742220878601074,
+ "learning_rate": 2.3939812170547067e-06,
+ "loss": 0.3628,
+ "step": 6565
+ },
+ {
+ "epoch": 3.1044917257683213,
+ "grad_norm": 2.7276530265808105,
+ "learning_rate": 2.393357954582113e-06,
+ "loss": 0.3789,
+ "step": 6566
+ },
+ {
+ "epoch": 3.104964539007092,
+ "grad_norm": 3.05595064163208,
+ "learning_rate": 2.3927346987496515e-06,
+ "loss": 0.3766,
+ "step": 6567
+ },
+ {
+ "epoch": 3.105437352245863,
+ "grad_norm": 2.786970615386963,
+ "learning_rate": 2.39211144959613e-06,
+ "loss": 0.3329,
+ "step": 6568
+ },
+ {
+ "epoch": 3.1059101654846337,
+ "grad_norm": 3.499018430709839,
+ "learning_rate": 2.391488207160356e-06,
+ "loss": 0.4175,
+ "step": 6569
+ },
+ {
+ "epoch": 3.106382978723404,
+ "grad_norm": 2.969735860824585,
+ "learning_rate": 2.3908649714811346e-06,
+ "loss": 0.3893,
+ "step": 6570
+ },
+ {
+ "epoch": 3.106855791962175,
+ "grad_norm": 3.1494929790496826,
+ "learning_rate": 2.3902417425972734e-06,
+ "loss": 0.4048,
+ "step": 6571
+ },
+ {
+ "epoch": 3.1073286052009457,
+ "grad_norm": 2.6393489837646484,
+ "learning_rate": 2.3896185205475782e-06,
+ "loss": 0.3216,
+ "step": 6572
+ },
+ {
+ "epoch": 3.1078014184397165,
+ "grad_norm": 3.6984152793884277,
+ "learning_rate": 2.3889953053708528e-06,
+ "loss": 0.3646,
+ "step": 6573
+ },
+ {
+ "epoch": 3.108274231678487,
+ "grad_norm": 3.518547534942627,
+ "learning_rate": 2.388372097105903e-06,
+ "loss": 0.3627,
+ "step": 6574
+ },
+ {
+ "epoch": 3.1087470449172576,
+ "grad_norm": 3.422043800354004,
+ "learning_rate": 2.3877488957915333e-06,
+ "loss": 0.4116,
+ "step": 6575
+ },
+ {
+ "epoch": 3.1092198581560284,
+ "grad_norm": 2.8088064193725586,
+ "learning_rate": 2.3871257014665486e-06,
+ "loss": 0.3477,
+ "step": 6576
+ },
+ {
+ "epoch": 3.109692671394799,
+ "grad_norm": 2.7877607345581055,
+ "learning_rate": 2.3865025141697513e-06,
+ "loss": 0.351,
+ "step": 6577
+ },
+ {
+ "epoch": 3.1101654846335696,
+ "grad_norm": 2.9446799755096436,
+ "learning_rate": 2.3858793339399433e-06,
+ "loss": 0.4025,
+ "step": 6578
+ },
+ {
+ "epoch": 3.1106382978723404,
+ "grad_norm": 2.886584758758545,
+ "learning_rate": 2.3852561608159304e-06,
+ "loss": 0.3765,
+ "step": 6579
+ },
+ {
+ "epoch": 3.111111111111111,
+ "grad_norm": 3.45711088180542,
+ "learning_rate": 2.384632994836513e-06,
+ "loss": 0.3744,
+ "step": 6580
+ },
+ {
+ "epoch": 3.111583924349882,
+ "grad_norm": 2.737441301345825,
+ "learning_rate": 2.3840098360404916e-06,
+ "loss": 0.4048,
+ "step": 6581
+ },
+ {
+ "epoch": 3.1120567375886523,
+ "grad_norm": 2.742567300796509,
+ "learning_rate": 2.383386684466671e-06,
+ "loss": 0.3717,
+ "step": 6582
+ },
+ {
+ "epoch": 3.112529550827423,
+ "grad_norm": 3.017970561981201,
+ "learning_rate": 2.382763540153849e-06,
+ "loss": 0.3922,
+ "step": 6583
+ },
+ {
+ "epoch": 3.113002364066194,
+ "grad_norm": 3.132004499435425,
+ "learning_rate": 2.3821404031408283e-06,
+ "loss": 0.3969,
+ "step": 6584
+ },
+ {
+ "epoch": 3.1134751773049647,
+ "grad_norm": 2.910820245742798,
+ "learning_rate": 2.3815172734664075e-06,
+ "loss": 0.4241,
+ "step": 6585
+ },
+ {
+ "epoch": 3.113947990543735,
+ "grad_norm": 3.0029842853546143,
+ "learning_rate": 2.380894151169386e-06,
+ "loss": 0.4007,
+ "step": 6586
+ },
+ {
+ "epoch": 3.114420803782506,
+ "grad_norm": 3.0309178829193115,
+ "learning_rate": 2.380271036288564e-06,
+ "loss": 0.3876,
+ "step": 6587
+ },
+ {
+ "epoch": 3.1148936170212767,
+ "grad_norm": 2.963204860687256,
+ "learning_rate": 2.379647928862739e-06,
+ "loss": 0.4017,
+ "step": 6588
+ },
+ {
+ "epoch": 3.1153664302600474,
+ "grad_norm": 3.0127944946289062,
+ "learning_rate": 2.3790248289307103e-06,
+ "loss": 0.3651,
+ "step": 6589
+ },
+ {
+ "epoch": 3.115839243498818,
+ "grad_norm": 2.557485580444336,
+ "learning_rate": 2.3784017365312755e-06,
+ "loss": 0.3419,
+ "step": 6590
+ },
+ {
+ "epoch": 3.1163120567375886,
+ "grad_norm": 2.8577969074249268,
+ "learning_rate": 2.3777786517032306e-06,
+ "loss": 0.372,
+ "step": 6591
+ },
+ {
+ "epoch": 3.1167848699763594,
+ "grad_norm": 2.450324058532715,
+ "learning_rate": 2.3771555744853735e-06,
+ "loss": 0.3442,
+ "step": 6592
+ },
+ {
+ "epoch": 3.11725768321513,
+ "grad_norm": 2.7939295768737793,
+ "learning_rate": 2.3765325049164996e-06,
+ "loss": 0.401,
+ "step": 6593
+ },
+ {
+ "epoch": 3.1177304964539005,
+ "grad_norm": 2.9690325260162354,
+ "learning_rate": 2.3759094430354056e-06,
+ "loss": 0.3962,
+ "step": 6594
+ },
+ {
+ "epoch": 3.1182033096926713,
+ "grad_norm": 2.7630631923675537,
+ "learning_rate": 2.375286388880887e-06,
+ "loss": 0.4126,
+ "step": 6595
+ },
+ {
+ "epoch": 3.118676122931442,
+ "grad_norm": 2.6259944438934326,
+ "learning_rate": 2.3746633424917366e-06,
+ "loss": 0.3285,
+ "step": 6596
+ },
+ {
+ "epoch": 3.119148936170213,
+ "grad_norm": 2.7107701301574707,
+ "learning_rate": 2.3740403039067516e-06,
+ "loss": 0.3636,
+ "step": 6597
+ },
+ {
+ "epoch": 3.1196217494089833,
+ "grad_norm": 2.985301971435547,
+ "learning_rate": 2.373417273164724e-06,
+ "loss": 0.3928,
+ "step": 6598
+ },
+ {
+ "epoch": 3.120094562647754,
+ "grad_norm": 3.2578976154327393,
+ "learning_rate": 2.3727942503044483e-06,
+ "loss": 0.3379,
+ "step": 6599
+ },
+ {
+ "epoch": 3.120567375886525,
+ "grad_norm": 3.1681406497955322,
+ "learning_rate": 2.372171235364717e-06,
+ "loss": 0.4023,
+ "step": 6600
+ },
+ {
+ "epoch": 3.1210401891252957,
+ "grad_norm": 3.120147705078125,
+ "learning_rate": 2.371548228384321e-06,
+ "loss": 0.4228,
+ "step": 6601
+ },
+ {
+ "epoch": 3.121513002364066,
+ "grad_norm": 2.7786099910736084,
+ "learning_rate": 2.3709252294020547e-06,
+ "loss": 0.4386,
+ "step": 6602
+ },
+ {
+ "epoch": 3.121985815602837,
+ "grad_norm": 2.698849678039551,
+ "learning_rate": 2.3703022384567086e-06,
+ "loss": 0.3861,
+ "step": 6603
+ },
+ {
+ "epoch": 3.1224586288416076,
+ "grad_norm": 2.7917959690093994,
+ "learning_rate": 2.3696792555870724e-06,
+ "loss": 0.3535,
+ "step": 6604
+ },
+ {
+ "epoch": 3.1229314420803784,
+ "grad_norm": 2.8249263763427734,
+ "learning_rate": 2.3690562808319385e-06,
+ "loss": 0.3415,
+ "step": 6605
+ },
+ {
+ "epoch": 3.123404255319149,
+ "grad_norm": 2.567458391189575,
+ "learning_rate": 2.368433314230095e-06,
+ "loss": 0.3827,
+ "step": 6606
+ },
+ {
+ "epoch": 3.1238770685579196,
+ "grad_norm": 2.9670443534851074,
+ "learning_rate": 2.3678103558203328e-06,
+ "loss": 0.4238,
+ "step": 6607
+ },
+ {
+ "epoch": 3.1243498817966904,
+ "grad_norm": 2.6893439292907715,
+ "learning_rate": 2.36718740564144e-06,
+ "loss": 0.3461,
+ "step": 6608
+ },
+ {
+ "epoch": 3.124822695035461,
+ "grad_norm": 3.2669708728790283,
+ "learning_rate": 2.3665644637322044e-06,
+ "loss": 0.3992,
+ "step": 6609
+ },
+ {
+ "epoch": 3.1252955082742315,
+ "grad_norm": 2.889340400695801,
+ "learning_rate": 2.3659415301314152e-06,
+ "loss": 0.3829,
+ "step": 6610
+ },
+ {
+ "epoch": 3.1257683215130023,
+ "grad_norm": 2.625603199005127,
+ "learning_rate": 2.3653186048778584e-06,
+ "loss": 0.3559,
+ "step": 6611
+ },
+ {
+ "epoch": 3.126241134751773,
+ "grad_norm": 2.8128650188446045,
+ "learning_rate": 2.3646956880103224e-06,
+ "loss": 0.4035,
+ "step": 6612
+ },
+ {
+ "epoch": 3.126713947990544,
+ "grad_norm": 3.1887412071228027,
+ "learning_rate": 2.3640727795675925e-06,
+ "loss": 0.3938,
+ "step": 6613
+ },
+ {
+ "epoch": 3.1271867612293143,
+ "grad_norm": 2.886514186859131,
+ "learning_rate": 2.363449879588454e-06,
+ "loss": 0.3504,
+ "step": 6614
+ },
+ {
+ "epoch": 3.127659574468085,
+ "grad_norm": 3.2149860858917236,
+ "learning_rate": 2.3628269881116937e-06,
+ "loss": 0.4137,
+ "step": 6615
+ },
+ {
+ "epoch": 3.128132387706856,
+ "grad_norm": 3.3155312538146973,
+ "learning_rate": 2.362204105176094e-06,
+ "loss": 0.3811,
+ "step": 6616
+ },
+ {
+ "epoch": 3.1286052009456267,
+ "grad_norm": 2.6228792667388916,
+ "learning_rate": 2.3615812308204415e-06,
+ "loss": 0.3511,
+ "step": 6617
+ },
+ {
+ "epoch": 3.129078014184397,
+ "grad_norm": 2.7686524391174316,
+ "learning_rate": 2.3609583650835187e-06,
+ "loss": 0.3722,
+ "step": 6618
+ },
+ {
+ "epoch": 3.129550827423168,
+ "grad_norm": 3.396368980407715,
+ "learning_rate": 2.3603355080041083e-06,
+ "loss": 0.4678,
+ "step": 6619
+ },
+ {
+ "epoch": 3.1300236406619386,
+ "grad_norm": 2.7329437732696533,
+ "learning_rate": 2.359712659620994e-06,
+ "loss": 0.3775,
+ "step": 6620
+ },
+ {
+ "epoch": 3.1304964539007094,
+ "grad_norm": 2.7633914947509766,
+ "learning_rate": 2.3590898199729567e-06,
+ "loss": 0.3306,
+ "step": 6621
+ },
+ {
+ "epoch": 3.1309692671394798,
+ "grad_norm": 3.020887613296509,
+ "learning_rate": 2.3584669890987792e-06,
+ "loss": 0.4121,
+ "step": 6622
+ },
+ {
+ "epoch": 3.1314420803782506,
+ "grad_norm": 2.8912103176116943,
+ "learning_rate": 2.3578441670372414e-06,
+ "loss": 0.4297,
+ "step": 6623
+ },
+ {
+ "epoch": 3.1319148936170214,
+ "grad_norm": 3.0654027462005615,
+ "learning_rate": 2.3572213538271234e-06,
+ "loss": 0.3856,
+ "step": 6624
+ },
+ {
+ "epoch": 3.132387706855792,
+ "grad_norm": 3.1126575469970703,
+ "learning_rate": 2.356598549507206e-06,
+ "loss": 0.3886,
+ "step": 6625
+ },
+ {
+ "epoch": 3.1328605200945625,
+ "grad_norm": 2.7066447734832764,
+ "learning_rate": 2.3559757541162687e-06,
+ "loss": 0.4212,
+ "step": 6626
+ },
+ {
+ "epoch": 3.1333333333333333,
+ "grad_norm": 2.876338243484497,
+ "learning_rate": 2.355352967693088e-06,
+ "loss": 0.3607,
+ "step": 6627
+ },
+ {
+ "epoch": 3.133806146572104,
+ "grad_norm": 2.9011716842651367,
+ "learning_rate": 2.3547301902764454e-06,
+ "loss": 0.428,
+ "step": 6628
+ },
+ {
+ "epoch": 3.134278959810875,
+ "grad_norm": 2.805656909942627,
+ "learning_rate": 2.3541074219051163e-06,
+ "loss": 0.4038,
+ "step": 6629
+ },
+ {
+ "epoch": 3.1347517730496453,
+ "grad_norm": 2.89546275138855,
+ "learning_rate": 2.353484662617879e-06,
+ "loss": 0.3798,
+ "step": 6630
+ },
+ {
+ "epoch": 3.135224586288416,
+ "grad_norm": 3.0290539264678955,
+ "learning_rate": 2.352861912453508e-06,
+ "loss": 0.3916,
+ "step": 6631
+ },
+ {
+ "epoch": 3.135697399527187,
+ "grad_norm": 2.848393440246582,
+ "learning_rate": 2.352239171450781e-06,
+ "loss": 0.3423,
+ "step": 6632
+ },
+ {
+ "epoch": 3.1361702127659576,
+ "grad_norm": 2.871372938156128,
+ "learning_rate": 2.3516164396484737e-06,
+ "loss": 0.3872,
+ "step": 6633
+ },
+ {
+ "epoch": 3.136643026004728,
+ "grad_norm": 3.120682716369629,
+ "learning_rate": 2.3509937170853585e-06,
+ "loss": 0.3952,
+ "step": 6634
+ },
+ {
+ "epoch": 3.137115839243499,
+ "grad_norm": 2.6936683654785156,
+ "learning_rate": 2.3503710038002127e-06,
+ "loss": 0.3643,
+ "step": 6635
+ },
+ {
+ "epoch": 3.1375886524822696,
+ "grad_norm": 3.749519109725952,
+ "learning_rate": 2.349748299831808e-06,
+ "loss": 0.4519,
+ "step": 6636
+ },
+ {
+ "epoch": 3.1380614657210404,
+ "grad_norm": 2.8034276962280273,
+ "learning_rate": 2.3491256052189175e-06,
+ "loss": 0.401,
+ "step": 6637
+ },
+ {
+ "epoch": 3.1385342789598107,
+ "grad_norm": 2.6201975345611572,
+ "learning_rate": 2.348502920000314e-06,
+ "loss": 0.3491,
+ "step": 6638
+ },
+ {
+ "epoch": 3.1390070921985815,
+ "grad_norm": 2.890552043914795,
+ "learning_rate": 2.347880244214769e-06,
+ "loss": 0.3439,
+ "step": 6639
+ },
+ {
+ "epoch": 3.1394799054373523,
+ "grad_norm": 2.899594306945801,
+ "learning_rate": 2.347257577901055e-06,
+ "loss": 0.3707,
+ "step": 6640
+ },
+ {
+ "epoch": 3.139952718676123,
+ "grad_norm": 2.8660130500793457,
+ "learning_rate": 2.346634921097942e-06,
+ "loss": 0.3582,
+ "step": 6641
+ },
+ {
+ "epoch": 3.1404255319148935,
+ "grad_norm": 2.9805452823638916,
+ "learning_rate": 2.346012273844199e-06,
+ "loss": 0.3466,
+ "step": 6642
+ },
+ {
+ "epoch": 3.1408983451536643,
+ "grad_norm": 3.162977457046509,
+ "learning_rate": 2.345389636178597e-06,
+ "loss": 0.3657,
+ "step": 6643
+ },
+ {
+ "epoch": 3.141371158392435,
+ "grad_norm": 2.838988780975342,
+ "learning_rate": 2.344767008139904e-06,
+ "loss": 0.3826,
+ "step": 6644
+ },
+ {
+ "epoch": 3.141843971631206,
+ "grad_norm": 3.8427252769470215,
+ "learning_rate": 2.3441443897668893e-06,
+ "loss": 0.3697,
+ "step": 6645
+ },
+ {
+ "epoch": 3.1423167848699762,
+ "grad_norm": 2.9233880043029785,
+ "learning_rate": 2.34352178109832e-06,
+ "loss": 0.3481,
+ "step": 6646
+ },
+ {
+ "epoch": 3.142789598108747,
+ "grad_norm": 2.5840606689453125,
+ "learning_rate": 2.342899182172963e-06,
+ "loss": 0.3746,
+ "step": 6647
+ },
+ {
+ "epoch": 3.143262411347518,
+ "grad_norm": 2.806793451309204,
+ "learning_rate": 2.3422765930295857e-06,
+ "loss": 0.419,
+ "step": 6648
+ },
+ {
+ "epoch": 3.1437352245862886,
+ "grad_norm": 2.803952693939209,
+ "learning_rate": 2.3416540137069522e-06,
+ "loss": 0.3965,
+ "step": 6649
+ },
+ {
+ "epoch": 3.144208037825059,
+ "grad_norm": 2.8416364192962646,
+ "learning_rate": 2.3410314442438297e-06,
+ "loss": 0.4317,
+ "step": 6650
+ },
+ {
+ "epoch": 3.1446808510638298,
+ "grad_norm": 2.9956440925598145,
+ "learning_rate": 2.3404088846789826e-06,
+ "loss": 0.4268,
+ "step": 6651
+ },
+ {
+ "epoch": 3.1451536643026006,
+ "grad_norm": 3.1649162769317627,
+ "learning_rate": 2.339786335051173e-06,
+ "loss": 0.4149,
+ "step": 6652
+ },
+ {
+ "epoch": 3.145626477541371,
+ "grad_norm": 2.909107208251953,
+ "learning_rate": 2.3391637953991673e-06,
+ "loss": 0.4085,
+ "step": 6653
+ },
+ {
+ "epoch": 3.1460992907801417,
+ "grad_norm": 2.416755199432373,
+ "learning_rate": 2.3385412657617264e-06,
+ "loss": 0.3585,
+ "step": 6654
+ },
+ {
+ "epoch": 3.1465721040189125,
+ "grad_norm": 3.1122629642486572,
+ "learning_rate": 2.3379187461776123e-06,
+ "loss": 0.3876,
+ "step": 6655
+ },
+ {
+ "epoch": 3.1470449172576833,
+ "grad_norm": 2.6854658126831055,
+ "learning_rate": 2.337296236685588e-06,
+ "loss": 0.3125,
+ "step": 6656
+ },
+ {
+ "epoch": 3.147517730496454,
+ "grad_norm": 2.779876708984375,
+ "learning_rate": 2.3366737373244127e-06,
+ "loss": 0.3688,
+ "step": 6657
+ },
+ {
+ "epoch": 3.1479905437352245,
+ "grad_norm": 3.1444761753082275,
+ "learning_rate": 2.3360512481328484e-06,
+ "loss": 0.4089,
+ "step": 6658
+ },
+ {
+ "epoch": 3.1484633569739953,
+ "grad_norm": 2.71445894241333,
+ "learning_rate": 2.335428769149654e-06,
+ "loss": 0.3532,
+ "step": 6659
+ },
+ {
+ "epoch": 3.148936170212766,
+ "grad_norm": 2.9788241386413574,
+ "learning_rate": 2.334806300413587e-06,
+ "loss": 0.4238,
+ "step": 6660
+ },
+ {
+ "epoch": 3.1494089834515364,
+ "grad_norm": 3.0118865966796875,
+ "learning_rate": 2.334183841963409e-06,
+ "loss": 0.4437,
+ "step": 6661
+ },
+ {
+ "epoch": 3.149881796690307,
+ "grad_norm": 3.2229537963867188,
+ "learning_rate": 2.3335613938378753e-06,
+ "loss": 0.3582,
+ "step": 6662
+ },
+ {
+ "epoch": 3.150354609929078,
+ "grad_norm": 2.734997034072876,
+ "learning_rate": 2.3329389560757447e-06,
+ "loss": 0.3737,
+ "step": 6663
+ },
+ {
+ "epoch": 3.150827423167849,
+ "grad_norm": 3.4746382236480713,
+ "learning_rate": 2.3323165287157724e-06,
+ "loss": 0.3516,
+ "step": 6664
+ },
+ {
+ "epoch": 3.1513002364066196,
+ "grad_norm": 2.9428153038024902,
+ "learning_rate": 2.3316941117967137e-06,
+ "loss": 0.3985,
+ "step": 6665
+ },
+ {
+ "epoch": 3.15177304964539,
+ "grad_norm": 2.6840944290161133,
+ "learning_rate": 2.3310717053573257e-06,
+ "loss": 0.3274,
+ "step": 6666
+ },
+ {
+ "epoch": 3.1522458628841608,
+ "grad_norm": 3.048335552215576,
+ "learning_rate": 2.3304493094363607e-06,
+ "loss": 0.4262,
+ "step": 6667
+ },
+ {
+ "epoch": 3.1527186761229316,
+ "grad_norm": 2.87381911277771,
+ "learning_rate": 2.329826924072575e-06,
+ "loss": 0.3867,
+ "step": 6668
+ },
+ {
+ "epoch": 3.153191489361702,
+ "grad_norm": 2.6236355304718018,
+ "learning_rate": 2.32920454930472e-06,
+ "loss": 0.3649,
+ "step": 6669
+ },
+ {
+ "epoch": 3.1536643026004727,
+ "grad_norm": 3.1326401233673096,
+ "learning_rate": 2.328582185171549e-06,
+ "loss": 0.3451,
+ "step": 6670
+ },
+ {
+ "epoch": 3.1541371158392435,
+ "grad_norm": 3.011826992034912,
+ "learning_rate": 2.327959831711814e-06,
+ "loss": 0.4118,
+ "step": 6671
+ },
+ {
+ "epoch": 3.1546099290780143,
+ "grad_norm": 2.834933280944824,
+ "learning_rate": 2.3273374889642646e-06,
+ "loss": 0.4378,
+ "step": 6672
+ },
+ {
+ "epoch": 3.155082742316785,
+ "grad_norm": 3.085756778717041,
+ "learning_rate": 2.326715156967654e-06,
+ "loss": 0.4389,
+ "step": 6673
+ },
+ {
+ "epoch": 3.1555555555555554,
+ "grad_norm": 2.7912232875823975,
+ "learning_rate": 2.3260928357607305e-06,
+ "loss": 0.3352,
+ "step": 6674
+ },
+ {
+ "epoch": 3.1560283687943262,
+ "grad_norm": 2.7643113136291504,
+ "learning_rate": 2.3254705253822424e-06,
+ "loss": 0.3449,
+ "step": 6675
+ },
+ {
+ "epoch": 3.156501182033097,
+ "grad_norm": 2.8984663486480713,
+ "learning_rate": 2.3248482258709405e-06,
+ "loss": 0.4231,
+ "step": 6676
+ },
+ {
+ "epoch": 3.1569739952718674,
+ "grad_norm": 3.214996814727783,
+ "learning_rate": 2.324225937265572e-06,
+ "loss": 0.4616,
+ "step": 6677
+ },
+ {
+ "epoch": 3.157446808510638,
+ "grad_norm": 2.58534836769104,
+ "learning_rate": 2.3236036596048827e-06,
+ "loss": 0.3264,
+ "step": 6678
+ },
+ {
+ "epoch": 3.157919621749409,
+ "grad_norm": 2.790714740753174,
+ "learning_rate": 2.322981392927621e-06,
+ "loss": 0.4086,
+ "step": 6679
+ },
+ {
+ "epoch": 3.15839243498818,
+ "grad_norm": 2.726029872894287,
+ "learning_rate": 2.32235913727253e-06,
+ "loss": 0.3344,
+ "step": 6680
+ },
+ {
+ "epoch": 3.1588652482269506,
+ "grad_norm": 2.8392906188964844,
+ "learning_rate": 2.3217368926783583e-06,
+ "loss": 0.3468,
+ "step": 6681
+ },
+ {
+ "epoch": 3.159338061465721,
+ "grad_norm": 2.9796900749206543,
+ "learning_rate": 2.321114659183848e-06,
+ "loss": 0.4051,
+ "step": 6682
+ },
+ {
+ "epoch": 3.1598108747044917,
+ "grad_norm": 3.0399303436279297,
+ "learning_rate": 2.320492436827743e-06,
+ "loss": 0.402,
+ "step": 6683
+ },
+ {
+ "epoch": 3.1602836879432625,
+ "grad_norm": 2.9295334815979004,
+ "learning_rate": 2.3198702256487877e-06,
+ "loss": 0.3975,
+ "step": 6684
+ },
+ {
+ "epoch": 3.160756501182033,
+ "grad_norm": 2.881552219390869,
+ "learning_rate": 2.319248025685723e-06,
+ "loss": 0.4342,
+ "step": 6685
+ },
+ {
+ "epoch": 3.1612293144208037,
+ "grad_norm": 3.0711705684661865,
+ "learning_rate": 2.3186258369772916e-06,
+ "loss": 0.3829,
+ "step": 6686
+ },
+ {
+ "epoch": 3.1617021276595745,
+ "grad_norm": 2.6614468097686768,
+ "learning_rate": 2.3180036595622345e-06,
+ "loss": 0.3473,
+ "step": 6687
+ },
+ {
+ "epoch": 3.1621749408983453,
+ "grad_norm": 3.0084400177001953,
+ "learning_rate": 2.3173814934792903e-06,
+ "loss": 0.4363,
+ "step": 6688
+ },
+ {
+ "epoch": 3.162647754137116,
+ "grad_norm": 2.9340786933898926,
+ "learning_rate": 2.3167593387672006e-06,
+ "loss": 0.4235,
+ "step": 6689
+ },
+ {
+ "epoch": 3.1631205673758864,
+ "grad_norm": 3.0765340328216553,
+ "learning_rate": 2.3161371954647023e-06,
+ "loss": 0.4601,
+ "step": 6690
+ },
+ {
+ "epoch": 3.1635933806146572,
+ "grad_norm": 2.816096067428589,
+ "learning_rate": 2.3155150636105356e-06,
+ "loss": 0.3764,
+ "step": 6691
+ },
+ {
+ "epoch": 3.164066193853428,
+ "grad_norm": 3.0476551055908203,
+ "learning_rate": 2.3148929432434372e-06,
+ "loss": 0.3956,
+ "step": 6692
+ },
+ {
+ "epoch": 3.1645390070921984,
+ "grad_norm": 2.628934860229492,
+ "learning_rate": 2.314270834402143e-06,
+ "loss": 0.3551,
+ "step": 6693
+ },
+ {
+ "epoch": 3.165011820330969,
+ "grad_norm": 3.3933539390563965,
+ "learning_rate": 2.31364873712539e-06,
+ "loss": 0.4523,
+ "step": 6694
+ },
+ {
+ "epoch": 3.16548463356974,
+ "grad_norm": 3.256176233291626,
+ "learning_rate": 2.313026651451912e-06,
+ "loss": 0.417,
+ "step": 6695
+ },
+ {
+ "epoch": 3.1659574468085108,
+ "grad_norm": 2.92926025390625,
+ "learning_rate": 2.312404577420445e-06,
+ "loss": 0.4365,
+ "step": 6696
+ },
+ {
+ "epoch": 3.166430260047281,
+ "grad_norm": 2.9514732360839844,
+ "learning_rate": 2.3117825150697233e-06,
+ "loss": 0.4632,
+ "step": 6697
+ },
+ {
+ "epoch": 3.166903073286052,
+ "grad_norm": 2.8635852336883545,
+ "learning_rate": 2.3111604644384778e-06,
+ "loss": 0.4018,
+ "step": 6698
+ },
+ {
+ "epoch": 3.1673758865248227,
+ "grad_norm": 2.5937020778656006,
+ "learning_rate": 2.3105384255654433e-06,
+ "loss": 0.3682,
+ "step": 6699
+ },
+ {
+ "epoch": 3.1678486997635935,
+ "grad_norm": 2.857851266860962,
+ "learning_rate": 2.3099163984893497e-06,
+ "loss": 0.3293,
+ "step": 6700
+ },
+ {
+ "epoch": 3.168321513002364,
+ "grad_norm": 2.5903947353363037,
+ "learning_rate": 2.3092943832489283e-06,
+ "loss": 0.3543,
+ "step": 6701
+ },
+ {
+ "epoch": 3.1687943262411347,
+ "grad_norm": 2.9783661365509033,
+ "learning_rate": 2.30867237988291e-06,
+ "loss": 0.3707,
+ "step": 6702
+ },
+ {
+ "epoch": 3.1692671394799055,
+ "grad_norm": 3.0133306980133057,
+ "learning_rate": 2.3080503884300225e-06,
+ "loss": 0.439,
+ "step": 6703
+ },
+ {
+ "epoch": 3.1697399527186763,
+ "grad_norm": 2.7119483947753906,
+ "learning_rate": 2.3074284089289968e-06,
+ "loss": 0.3956,
+ "step": 6704
+ },
+ {
+ "epoch": 3.1702127659574466,
+ "grad_norm": 3.0499672889709473,
+ "learning_rate": 2.3068064414185597e-06,
+ "loss": 0.434,
+ "step": 6705
+ },
+ {
+ "epoch": 3.1706855791962174,
+ "grad_norm": 2.862807512283325,
+ "learning_rate": 2.306184485937437e-06,
+ "loss": 0.3644,
+ "step": 6706
+ },
+ {
+ "epoch": 3.171158392434988,
+ "grad_norm": 2.9445149898529053,
+ "learning_rate": 2.305562542524358e-06,
+ "loss": 0.3894,
+ "step": 6707
+ },
+ {
+ "epoch": 3.171631205673759,
+ "grad_norm": 3.0442428588867188,
+ "learning_rate": 2.304940611218046e-06,
+ "loss": 0.3816,
+ "step": 6708
+ },
+ {
+ "epoch": 3.1721040189125294,
+ "grad_norm": 2.7101798057556152,
+ "learning_rate": 2.304318692057228e-06,
+ "loss": 0.3708,
+ "step": 6709
+ },
+ {
+ "epoch": 3.1725768321513,
+ "grad_norm": 2.7874515056610107,
+ "learning_rate": 2.303696785080626e-06,
+ "loss": 0.404,
+ "step": 6710
+ },
+ {
+ "epoch": 3.173049645390071,
+ "grad_norm": 3.0438833236694336,
+ "learning_rate": 2.303074890326964e-06,
+ "loss": 0.4342,
+ "step": 6711
+ },
+ {
+ "epoch": 3.1735224586288417,
+ "grad_norm": 2.6079208850860596,
+ "learning_rate": 2.302453007834966e-06,
+ "loss": 0.3725,
+ "step": 6712
+ },
+ {
+ "epoch": 3.173995271867612,
+ "grad_norm": 3.3353021144866943,
+ "learning_rate": 2.3018311376433523e-06,
+ "loss": 0.4372,
+ "step": 6713
+ },
+ {
+ "epoch": 3.174468085106383,
+ "grad_norm": 2.840771436691284,
+ "learning_rate": 2.3012092797908454e-06,
+ "loss": 0.3979,
+ "step": 6714
+ },
+ {
+ "epoch": 3.1749408983451537,
+ "grad_norm": 3.0474867820739746,
+ "learning_rate": 2.3005874343161648e-06,
+ "loss": 0.4077,
+ "step": 6715
+ },
+ {
+ "epoch": 3.1754137115839245,
+ "grad_norm": 2.849835157394409,
+ "learning_rate": 2.2999656012580296e-06,
+ "loss": 0.393,
+ "step": 6716
+ },
+ {
+ "epoch": 3.175886524822695,
+ "grad_norm": 2.6361217498779297,
+ "learning_rate": 2.29934378065516e-06,
+ "loss": 0.3894,
+ "step": 6717
+ },
+ {
+ "epoch": 3.1763593380614656,
+ "grad_norm": 3.139700174331665,
+ "learning_rate": 2.298721972546273e-06,
+ "loss": 0.36,
+ "step": 6718
+ },
+ {
+ "epoch": 3.1768321513002364,
+ "grad_norm": 2.987861156463623,
+ "learning_rate": 2.298100176970087e-06,
+ "loss": 0.4306,
+ "step": 6719
+ },
+ {
+ "epoch": 3.1773049645390072,
+ "grad_norm": 2.6403157711029053,
+ "learning_rate": 2.297478393965317e-06,
+ "loss": 0.3978,
+ "step": 6720
+ },
+ {
+ "epoch": 3.1777777777777776,
+ "grad_norm": 2.819519281387329,
+ "learning_rate": 2.296856623570679e-06,
+ "loss": 0.3467,
+ "step": 6721
+ },
+ {
+ "epoch": 3.1782505910165484,
+ "grad_norm": 2.7195916175842285,
+ "learning_rate": 2.296234865824889e-06,
+ "loss": 0.3685,
+ "step": 6722
+ },
+ {
+ "epoch": 3.178723404255319,
+ "grad_norm": 3.015488624572754,
+ "learning_rate": 2.2956131207666604e-06,
+ "loss": 0.3751,
+ "step": 6723
+ },
+ {
+ "epoch": 3.17919621749409,
+ "grad_norm": 2.9283792972564697,
+ "learning_rate": 2.2949913884347055e-06,
+ "loss": 0.3261,
+ "step": 6724
+ },
+ {
+ "epoch": 3.1796690307328603,
+ "grad_norm": 3.358991861343384,
+ "learning_rate": 2.294369668867739e-06,
+ "loss": 0.4505,
+ "step": 6725
+ },
+ {
+ "epoch": 3.180141843971631,
+ "grad_norm": 2.9143471717834473,
+ "learning_rate": 2.2937479621044712e-06,
+ "loss": 0.3612,
+ "step": 6726
+ },
+ {
+ "epoch": 3.180614657210402,
+ "grad_norm": 3.020519495010376,
+ "learning_rate": 2.2931262681836136e-06,
+ "loss": 0.4241,
+ "step": 6727
+ },
+ {
+ "epoch": 3.1810874704491727,
+ "grad_norm": 2.693737745285034,
+ "learning_rate": 2.2925045871438765e-06,
+ "loss": 0.366,
+ "step": 6728
+ },
+ {
+ "epoch": 3.181560283687943,
+ "grad_norm": 2.9427194595336914,
+ "learning_rate": 2.2918829190239677e-06,
+ "loss": 0.3741,
+ "step": 6729
+ },
+ {
+ "epoch": 3.182033096926714,
+ "grad_norm": 2.529383659362793,
+ "learning_rate": 2.291261263862598e-06,
+ "loss": 0.4469,
+ "step": 6730
+ },
+ {
+ "epoch": 3.1825059101654847,
+ "grad_norm": 3.0097804069519043,
+ "learning_rate": 2.290639621698473e-06,
+ "loss": 0.4167,
+ "step": 6731
+ },
+ {
+ "epoch": 3.1829787234042555,
+ "grad_norm": 2.7047014236450195,
+ "learning_rate": 2.290017992570302e-06,
+ "loss": 0.3615,
+ "step": 6732
+ },
+ {
+ "epoch": 3.183451536643026,
+ "grad_norm": 2.676964282989502,
+ "learning_rate": 2.2893963765167897e-06,
+ "loss": 0.3722,
+ "step": 6733
+ },
+ {
+ "epoch": 3.1839243498817966,
+ "grad_norm": 3.0529778003692627,
+ "learning_rate": 2.2887747735766413e-06,
+ "loss": 0.395,
+ "step": 6734
+ },
+ {
+ "epoch": 3.1843971631205674,
+ "grad_norm": 2.826725721359253,
+ "learning_rate": 2.288153183788562e-06,
+ "loss": 0.3713,
+ "step": 6735
+ },
+ {
+ "epoch": 3.184869976359338,
+ "grad_norm": 2.8689587116241455,
+ "learning_rate": 2.287531607191254e-06,
+ "loss": 0.4383,
+ "step": 6736
+ },
+ {
+ "epoch": 3.1853427895981086,
+ "grad_norm": 3.1835694313049316,
+ "learning_rate": 2.2869100438234217e-06,
+ "loss": 0.3908,
+ "step": 6737
+ },
+ {
+ "epoch": 3.1858156028368794,
+ "grad_norm": 3.227262020111084,
+ "learning_rate": 2.286288493723767e-06,
+ "loss": 0.3549,
+ "step": 6738
+ },
+ {
+ "epoch": 3.18628841607565,
+ "grad_norm": 2.7543468475341797,
+ "learning_rate": 2.2856669569309896e-06,
+ "loss": 0.351,
+ "step": 6739
+ },
+ {
+ "epoch": 3.186761229314421,
+ "grad_norm": 2.5381555557250977,
+ "learning_rate": 2.2850454334837923e-06,
+ "loss": 0.3473,
+ "step": 6740
+ },
+ {
+ "epoch": 3.1872340425531913,
+ "grad_norm": 2.785923957824707,
+ "learning_rate": 2.284423923420872e-06,
+ "loss": 0.4144,
+ "step": 6741
+ },
+ {
+ "epoch": 3.187706855791962,
+ "grad_norm": 2.583853006362915,
+ "learning_rate": 2.28380242678093e-06,
+ "loss": 0.3088,
+ "step": 6742
+ },
+ {
+ "epoch": 3.188179669030733,
+ "grad_norm": 2.604647159576416,
+ "learning_rate": 2.2831809436026627e-06,
+ "loss": 0.3474,
+ "step": 6743
+ },
+ {
+ "epoch": 3.1886524822695037,
+ "grad_norm": 6.13611364364624,
+ "learning_rate": 2.2825594739247662e-06,
+ "loss": 0.4089,
+ "step": 6744
+ },
+ {
+ "epoch": 3.189125295508274,
+ "grad_norm": 3.034011125564575,
+ "learning_rate": 2.281938017785939e-06,
+ "loss": 0.4569,
+ "step": 6745
+ },
+ {
+ "epoch": 3.189598108747045,
+ "grad_norm": 2.9352638721466064,
+ "learning_rate": 2.281316575224874e-06,
+ "loss": 0.4293,
+ "step": 6746
+ },
+ {
+ "epoch": 3.1900709219858157,
+ "grad_norm": 3.860957384109497,
+ "learning_rate": 2.280695146280268e-06,
+ "loss": 0.4082,
+ "step": 6747
+ },
+ {
+ "epoch": 3.1905437352245865,
+ "grad_norm": 2.8131468296051025,
+ "learning_rate": 2.280073730990814e-06,
+ "loss": 0.3194,
+ "step": 6748
+ },
+ {
+ "epoch": 3.191016548463357,
+ "grad_norm": 3.1310737133026123,
+ "learning_rate": 2.2794523293952033e-06,
+ "loss": 0.4454,
+ "step": 6749
+ },
+ {
+ "epoch": 3.1914893617021276,
+ "grad_norm": 3.065091133117676,
+ "learning_rate": 2.27883094153213e-06,
+ "loss": 0.3789,
+ "step": 6750
+ },
+ {
+ "epoch": 3.1919621749408984,
+ "grad_norm": 3.315216541290283,
+ "learning_rate": 2.278209567440284e-06,
+ "loss": 0.4037,
+ "step": 6751
+ },
+ {
+ "epoch": 3.192434988179669,
+ "grad_norm": 3.0228476524353027,
+ "learning_rate": 2.2775882071583546e-06,
+ "loss": 0.3652,
+ "step": 6752
+ },
+ {
+ "epoch": 3.1929078014184396,
+ "grad_norm": 3.703540802001953,
+ "learning_rate": 2.2769668607250336e-06,
+ "loss": 0.3477,
+ "step": 6753
+ },
+ {
+ "epoch": 3.1933806146572103,
+ "grad_norm": 2.952481508255005,
+ "learning_rate": 2.2763455281790065e-06,
+ "loss": 0.4026,
+ "step": 6754
+ },
+ {
+ "epoch": 3.193853427895981,
+ "grad_norm": 2.5798189640045166,
+ "learning_rate": 2.275724209558965e-06,
+ "loss": 0.3475,
+ "step": 6755
+ },
+ {
+ "epoch": 3.194326241134752,
+ "grad_norm": 2.599669933319092,
+ "learning_rate": 2.2751029049035923e-06,
+ "loss": 0.3499,
+ "step": 6756
+ },
+ {
+ "epoch": 3.1947990543735223,
+ "grad_norm": 3.0463781356811523,
+ "learning_rate": 2.2744816142515756e-06,
+ "loss": 0.3927,
+ "step": 6757
+ },
+ {
+ "epoch": 3.195271867612293,
+ "grad_norm": 3.134199380874634,
+ "learning_rate": 2.2738603376416003e-06,
+ "loss": 0.3957,
+ "step": 6758
+ },
+ {
+ "epoch": 3.195744680851064,
+ "grad_norm": 3.1326372623443604,
+ "learning_rate": 2.273239075112349e-06,
+ "loss": 0.4305,
+ "step": 6759
+ },
+ {
+ "epoch": 3.1962174940898347,
+ "grad_norm": 2.847128391265869,
+ "learning_rate": 2.2726178267025072e-06,
+ "loss": 0.3825,
+ "step": 6760
+ },
+ {
+ "epoch": 3.196690307328605,
+ "grad_norm": 2.697584629058838,
+ "learning_rate": 2.2719965924507566e-06,
+ "loss": 0.3517,
+ "step": 6761
+ },
+ {
+ "epoch": 3.197163120567376,
+ "grad_norm": 2.881446599960327,
+ "learning_rate": 2.271375372395777e-06,
+ "loss": 0.3791,
+ "step": 6762
+ },
+ {
+ "epoch": 3.1976359338061466,
+ "grad_norm": 3.085054874420166,
+ "learning_rate": 2.270754166576252e-06,
+ "loss": 0.4324,
+ "step": 6763
+ },
+ {
+ "epoch": 3.1981087470449174,
+ "grad_norm": 3.3494462966918945,
+ "learning_rate": 2.270132975030859e-06,
+ "loss": 0.4242,
+ "step": 6764
+ },
+ {
+ "epoch": 3.198581560283688,
+ "grad_norm": 2.8617660999298096,
+ "learning_rate": 2.2695117977982785e-06,
+ "loss": 0.3563,
+ "step": 6765
+ },
+ {
+ "epoch": 3.1990543735224586,
+ "grad_norm": 2.7437968254089355,
+ "learning_rate": 2.2688906349171873e-06,
+ "loss": 0.4042,
+ "step": 6766
+ },
+ {
+ "epoch": 3.1995271867612294,
+ "grad_norm": 3.1129143238067627,
+ "learning_rate": 2.268269486426262e-06,
+ "loss": 0.3761,
+ "step": 6767
+ },
+ {
+ "epoch": 3.2,
+ "grad_norm": 3.32441782951355,
+ "learning_rate": 2.2676483523641807e-06,
+ "loss": 0.4439,
+ "step": 6768
+ },
+ {
+ "epoch": 3.2004728132387705,
+ "grad_norm": 2.8744730949401855,
+ "learning_rate": 2.267027232769617e-06,
+ "loss": 0.4015,
+ "step": 6769
+ },
+ {
+ "epoch": 3.2009456264775413,
+ "grad_norm": 3.6283397674560547,
+ "learning_rate": 2.2664061276812465e-06,
+ "loss": 0.3634,
+ "step": 6770
+ },
+ {
+ "epoch": 3.201418439716312,
+ "grad_norm": 2.7826597690582275,
+ "learning_rate": 2.2657850371377426e-06,
+ "loss": 0.3178,
+ "step": 6771
+ },
+ {
+ "epoch": 3.201891252955083,
+ "grad_norm": 2.668173313140869,
+ "learning_rate": 2.265163961177776e-06,
+ "loss": 0.3662,
+ "step": 6772
+ },
+ {
+ "epoch": 3.2023640661938533,
+ "grad_norm": 2.868441104888916,
+ "learning_rate": 2.264542899840021e-06,
+ "loss": 0.4235,
+ "step": 6773
+ },
+ {
+ "epoch": 3.202836879432624,
+ "grad_norm": 3.2715935707092285,
+ "learning_rate": 2.263921853163147e-06,
+ "loss": 0.4741,
+ "step": 6774
+ },
+ {
+ "epoch": 3.203309692671395,
+ "grad_norm": 2.8647544384002686,
+ "learning_rate": 2.2633008211858233e-06,
+ "loss": 0.3885,
+ "step": 6775
+ },
+ {
+ "epoch": 3.2037825059101657,
+ "grad_norm": 3.070164680480957,
+ "learning_rate": 2.2626798039467207e-06,
+ "loss": 0.4191,
+ "step": 6776
+ },
+ {
+ "epoch": 3.204255319148936,
+ "grad_norm": 2.846686840057373,
+ "learning_rate": 2.262058801484505e-06,
+ "loss": 0.3619,
+ "step": 6777
+ },
+ {
+ "epoch": 3.204728132387707,
+ "grad_norm": 2.767031192779541,
+ "learning_rate": 2.261437813837845e-06,
+ "loss": 0.3248,
+ "step": 6778
+ },
+ {
+ "epoch": 3.2052009456264776,
+ "grad_norm": 2.6819260120391846,
+ "learning_rate": 2.2608168410454065e-06,
+ "loss": 0.3871,
+ "step": 6779
+ },
+ {
+ "epoch": 3.2056737588652484,
+ "grad_norm": 3.1176788806915283,
+ "learning_rate": 2.260195883145854e-06,
+ "loss": 0.3929,
+ "step": 6780
+ },
+ {
+ "epoch": 3.2061465721040188,
+ "grad_norm": 3.143209457397461,
+ "learning_rate": 2.2595749401778524e-06,
+ "loss": 0.4188,
+ "step": 6781
+ },
+ {
+ "epoch": 3.2066193853427896,
+ "grad_norm": 2.9685657024383545,
+ "learning_rate": 2.2589540121800647e-06,
+ "loss": 0.4049,
+ "step": 6782
+ },
+ {
+ "epoch": 3.2070921985815604,
+ "grad_norm": 2.6853368282318115,
+ "learning_rate": 2.258333099191155e-06,
+ "loss": 0.349,
+ "step": 6783
+ },
+ {
+ "epoch": 3.207565011820331,
+ "grad_norm": 2.8418309688568115,
+ "learning_rate": 2.257712201249783e-06,
+ "loss": 0.4121,
+ "step": 6784
+ },
+ {
+ "epoch": 3.2080378250591015,
+ "grad_norm": 2.9441449642181396,
+ "learning_rate": 2.2570913183946085e-06,
+ "loss": 0.3846,
+ "step": 6785
+ },
+ {
+ "epoch": 3.2085106382978723,
+ "grad_norm": 2.9956493377685547,
+ "learning_rate": 2.256470450664294e-06,
+ "loss": 0.3941,
+ "step": 6786
+ },
+ {
+ "epoch": 3.208983451536643,
+ "grad_norm": 3.1774401664733887,
+ "learning_rate": 2.255849598097496e-06,
+ "loss": 0.4252,
+ "step": 6787
+ },
+ {
+ "epoch": 3.209456264775414,
+ "grad_norm": 2.8948934078216553,
+ "learning_rate": 2.255228760732873e-06,
+ "loss": 0.3963,
+ "step": 6788
+ },
+ {
+ "epoch": 3.2099290780141843,
+ "grad_norm": 3.440021276473999,
+ "learning_rate": 2.2546079386090825e-06,
+ "loss": 0.3777,
+ "step": 6789
+ },
+ {
+ "epoch": 3.210401891252955,
+ "grad_norm": 3.1573195457458496,
+ "learning_rate": 2.253987131764779e-06,
+ "loss": 0.3896,
+ "step": 6790
+ },
+ {
+ "epoch": 3.210874704491726,
+ "grad_norm": 3.4218719005584717,
+ "learning_rate": 2.2533663402386183e-06,
+ "loss": 0.3979,
+ "step": 6791
+ },
+ {
+ "epoch": 3.2113475177304966,
+ "grad_norm": 3.3442487716674805,
+ "learning_rate": 2.252745564069253e-06,
+ "loss": 0.406,
+ "step": 6792
+ },
+ {
+ "epoch": 3.211820330969267,
+ "grad_norm": 2.6089327335357666,
+ "learning_rate": 2.2521248032953387e-06,
+ "loss": 0.3539,
+ "step": 6793
+ },
+ {
+ "epoch": 3.212293144208038,
+ "grad_norm": 3.8015971183776855,
+ "learning_rate": 2.251504057955526e-06,
+ "loss": 0.4184,
+ "step": 6794
+ },
+ {
+ "epoch": 3.2127659574468086,
+ "grad_norm": 3.797565460205078,
+ "learning_rate": 2.250883328088465e-06,
+ "loss": 0.3392,
+ "step": 6795
+ },
+ {
+ "epoch": 3.2132387706855794,
+ "grad_norm": 3.290762186050415,
+ "learning_rate": 2.2502626137328077e-06,
+ "loss": 0.3726,
+ "step": 6796
+ },
+ {
+ "epoch": 3.2137115839243497,
+ "grad_norm": 3.149158000946045,
+ "learning_rate": 2.2496419149272023e-06,
+ "loss": 0.3869,
+ "step": 6797
+ },
+ {
+ "epoch": 3.2141843971631205,
+ "grad_norm": 2.652902364730835,
+ "learning_rate": 2.2490212317102964e-06,
+ "loss": 0.3256,
+ "step": 6798
+ },
+ {
+ "epoch": 3.2146572104018913,
+ "grad_norm": 3.3039770126342773,
+ "learning_rate": 2.248400564120739e-06,
+ "loss": 0.4231,
+ "step": 6799
+ },
+ {
+ "epoch": 3.215130023640662,
+ "grad_norm": 3.0190038681030273,
+ "learning_rate": 2.247779912197174e-06,
+ "loss": 0.4319,
+ "step": 6800
+ },
+ {
+ "epoch": 3.2156028368794325,
+ "grad_norm": 2.861393690109253,
+ "learning_rate": 2.2471592759782485e-06,
+ "loss": 0.465,
+ "step": 6801
+ },
+ {
+ "epoch": 3.2160756501182033,
+ "grad_norm": 2.7796146869659424,
+ "learning_rate": 2.246538655502606e-06,
+ "loss": 0.3896,
+ "step": 6802
+ },
+ {
+ "epoch": 3.216548463356974,
+ "grad_norm": 3.1849005222320557,
+ "learning_rate": 2.24591805080889e-06,
+ "loss": 0.3782,
+ "step": 6803
+ },
+ {
+ "epoch": 3.217021276595745,
+ "grad_norm": 3.076164960861206,
+ "learning_rate": 2.2452974619357435e-06,
+ "loss": 0.4023,
+ "step": 6804
+ },
+ {
+ "epoch": 3.2174940898345152,
+ "grad_norm": 2.7006006240844727,
+ "learning_rate": 2.2446768889218064e-06,
+ "loss": 0.3902,
+ "step": 6805
+ },
+ {
+ "epoch": 3.217966903073286,
+ "grad_norm": 2.9310474395751953,
+ "learning_rate": 2.2440563318057205e-06,
+ "loss": 0.366,
+ "step": 6806
+ },
+ {
+ "epoch": 3.218439716312057,
+ "grad_norm": 3.057248592376709,
+ "learning_rate": 2.2434357906261246e-06,
+ "loss": 0.4042,
+ "step": 6807
+ },
+ {
+ "epoch": 3.2189125295508276,
+ "grad_norm": 3.3720197677612305,
+ "learning_rate": 2.242815265421656e-06,
+ "loss": 0.3816,
+ "step": 6808
+ },
+ {
+ "epoch": 3.219385342789598,
+ "grad_norm": 2.9626352787017822,
+ "learning_rate": 2.2421947562309545e-06,
+ "loss": 0.363,
+ "step": 6809
+ },
+ {
+ "epoch": 3.219858156028369,
+ "grad_norm": 2.7848782539367676,
+ "learning_rate": 2.2415742630926533e-06,
+ "loss": 0.3597,
+ "step": 6810
+ },
+ {
+ "epoch": 3.2203309692671396,
+ "grad_norm": 2.757319450378418,
+ "learning_rate": 2.2409537860453913e-06,
+ "loss": 0.3304,
+ "step": 6811
+ },
+ {
+ "epoch": 3.2208037825059104,
+ "grad_norm": 2.7765560150146484,
+ "learning_rate": 2.240333325127801e-06,
+ "loss": 0.3896,
+ "step": 6812
+ },
+ {
+ "epoch": 3.2212765957446807,
+ "grad_norm": 2.9882447719573975,
+ "learning_rate": 2.239712880378515e-06,
+ "loss": 0.4004,
+ "step": 6813
+ },
+ {
+ "epoch": 3.2217494089834515,
+ "grad_norm": 2.8551244735717773,
+ "learning_rate": 2.2390924518361673e-06,
+ "loss": 0.4167,
+ "step": 6814
+ },
+ {
+ "epoch": 3.2222222222222223,
+ "grad_norm": 2.8051679134368896,
+ "learning_rate": 2.2384720395393878e-06,
+ "loss": 0.3319,
+ "step": 6815
+ },
+ {
+ "epoch": 3.222695035460993,
+ "grad_norm": 3.1172873973846436,
+ "learning_rate": 2.2378516435268086e-06,
+ "loss": 0.379,
+ "step": 6816
+ },
+ {
+ "epoch": 3.2231678486997635,
+ "grad_norm": 3.0282177925109863,
+ "learning_rate": 2.237231263837058e-06,
+ "loss": 0.3855,
+ "step": 6817
+ },
+ {
+ "epoch": 3.2236406619385343,
+ "grad_norm": 2.7156803607940674,
+ "learning_rate": 2.236610900508763e-06,
+ "loss": 0.4062,
+ "step": 6818
+ },
+ {
+ "epoch": 3.224113475177305,
+ "grad_norm": 2.721327781677246,
+ "learning_rate": 2.235990553580554e-06,
+ "loss": 0.3726,
+ "step": 6819
+ },
+ {
+ "epoch": 3.2245862884160754,
+ "grad_norm": 2.881181240081787,
+ "learning_rate": 2.235370223091055e-06,
+ "loss": 0.421,
+ "step": 6820
+ },
+ {
+ "epoch": 3.225059101654846,
+ "grad_norm": 2.8074657917022705,
+ "learning_rate": 2.234749909078892e-06,
+ "loss": 0.3628,
+ "step": 6821
+ },
+ {
+ "epoch": 3.225531914893617,
+ "grad_norm": 2.8781638145446777,
+ "learning_rate": 2.234129611582689e-06,
+ "loss": 0.3857,
+ "step": 6822
+ },
+ {
+ "epoch": 3.226004728132388,
+ "grad_norm": 2.9473299980163574,
+ "learning_rate": 2.233509330641068e-06,
+ "loss": 0.4358,
+ "step": 6823
+ },
+ {
+ "epoch": 3.2264775413711586,
+ "grad_norm": 3.261209011077881,
+ "learning_rate": 2.2328890662926543e-06,
+ "loss": 0.4115,
+ "step": 6824
+ },
+ {
+ "epoch": 3.226950354609929,
+ "grad_norm": 3.2796943187713623,
+ "learning_rate": 2.232268818576067e-06,
+ "loss": 0.3846,
+ "step": 6825
+ },
+ {
+ "epoch": 3.2274231678486998,
+ "grad_norm": 3.1083059310913086,
+ "learning_rate": 2.2316485875299247e-06,
+ "loss": 0.3452,
+ "step": 6826
+ },
+ {
+ "epoch": 3.2278959810874706,
+ "grad_norm": 2.7947003841400146,
+ "learning_rate": 2.23102837319285e-06,
+ "loss": 0.3733,
+ "step": 6827
+ },
+ {
+ "epoch": 3.228368794326241,
+ "grad_norm": 2.792348861694336,
+ "learning_rate": 2.230408175603458e-06,
+ "loss": 0.411,
+ "step": 6828
+ },
+ {
+ "epoch": 3.2288416075650117,
+ "grad_norm": 2.8563876152038574,
+ "learning_rate": 2.229787994800368e-06,
+ "loss": 0.4303,
+ "step": 6829
+ },
+ {
+ "epoch": 3.2293144208037825,
+ "grad_norm": 2.9573659896850586,
+ "learning_rate": 2.2291678308221943e-06,
+ "loss": 0.4124,
+ "step": 6830
+ },
+ {
+ "epoch": 3.2297872340425533,
+ "grad_norm": 2.8554422855377197,
+ "learning_rate": 2.228547683707551e-06,
+ "loss": 0.3715,
+ "step": 6831
+ },
+ {
+ "epoch": 3.230260047281324,
+ "grad_norm": 2.9457242488861084,
+ "learning_rate": 2.227927553495054e-06,
+ "loss": 0.4339,
+ "step": 6832
+ },
+ {
+ "epoch": 3.2307328605200945,
+ "grad_norm": 2.799135684967041,
+ "learning_rate": 2.227307440223315e-06,
+ "loss": 0.3335,
+ "step": 6833
+ },
+ {
+ "epoch": 3.2312056737588652,
+ "grad_norm": 2.768529176712036,
+ "learning_rate": 2.2266873439309465e-06,
+ "loss": 0.3929,
+ "step": 6834
+ },
+ {
+ "epoch": 3.231678486997636,
+ "grad_norm": 3.124069929122925,
+ "learning_rate": 2.2260672646565585e-06,
+ "loss": 0.4205,
+ "step": 6835
+ },
+ {
+ "epoch": 3.2321513002364064,
+ "grad_norm": 2.8153982162475586,
+ "learning_rate": 2.2254472024387603e-06,
+ "loss": 0.3565,
+ "step": 6836
+ },
+ {
+ "epoch": 3.232624113475177,
+ "grad_norm": 3.1802141666412354,
+ "learning_rate": 2.224827157316162e-06,
+ "loss": 0.4614,
+ "step": 6837
+ },
+ {
+ "epoch": 3.233096926713948,
+ "grad_norm": 2.669651746749878,
+ "learning_rate": 2.2242071293273682e-06,
+ "loss": 0.3581,
+ "step": 6838
+ },
+ {
+ "epoch": 3.233569739952719,
+ "grad_norm": 3.073127269744873,
+ "learning_rate": 2.223587118510989e-06,
+ "loss": 0.3581,
+ "step": 6839
+ },
+ {
+ "epoch": 3.2340425531914896,
+ "grad_norm": 2.875955820083618,
+ "learning_rate": 2.222967124905627e-06,
+ "loss": 0.3905,
+ "step": 6840
+ },
+ {
+ "epoch": 3.23451536643026,
+ "grad_norm": 2.887744903564453,
+ "learning_rate": 2.2223471485498872e-06,
+ "loss": 0.4131,
+ "step": 6841
+ },
+ {
+ "epoch": 3.2349881796690307,
+ "grad_norm": 2.6957902908325195,
+ "learning_rate": 2.2217271894823735e-06,
+ "loss": 0.3631,
+ "step": 6842
+ },
+ {
+ "epoch": 3.2354609929078015,
+ "grad_norm": 2.7098400592803955,
+ "learning_rate": 2.221107247741688e-06,
+ "loss": 0.3959,
+ "step": 6843
+ },
+ {
+ "epoch": 3.235933806146572,
+ "grad_norm": 2.986271858215332,
+ "learning_rate": 2.22048732336643e-06,
+ "loss": 0.3515,
+ "step": 6844
+ },
+ {
+ "epoch": 3.2364066193853427,
+ "grad_norm": 3.0537121295928955,
+ "learning_rate": 2.2198674163952015e-06,
+ "loss": 0.438,
+ "step": 6845
+ },
+ {
+ "epoch": 3.2368794326241135,
+ "grad_norm": 2.8351151943206787,
+ "learning_rate": 2.2192475268666e-06,
+ "loss": 0.4069,
+ "step": 6846
+ },
+ {
+ "epoch": 3.2373522458628843,
+ "grad_norm": 2.6455280780792236,
+ "learning_rate": 2.218627654819225e-06,
+ "loss": 0.3626,
+ "step": 6847
+ },
+ {
+ "epoch": 3.237825059101655,
+ "grad_norm": 3.060352325439453,
+ "learning_rate": 2.2180078002916717e-06,
+ "loss": 0.3306,
+ "step": 6848
+ },
+ {
+ "epoch": 3.2382978723404254,
+ "grad_norm": 3.0178887844085693,
+ "learning_rate": 2.2173879633225355e-06,
+ "loss": 0.4111,
+ "step": 6849
+ },
+ {
+ "epoch": 3.2387706855791962,
+ "grad_norm": 2.895822763442993,
+ "learning_rate": 2.2167681439504123e-06,
+ "loss": 0.4053,
+ "step": 6850
+ },
+ {
+ "epoch": 3.239243498817967,
+ "grad_norm": 2.7295608520507812,
+ "learning_rate": 2.2161483422138945e-06,
+ "loss": 0.4021,
+ "step": 6851
+ },
+ {
+ "epoch": 3.2397163120567374,
+ "grad_norm": 3.1004912853240967,
+ "learning_rate": 2.2155285581515747e-06,
+ "loss": 0.3882,
+ "step": 6852
+ },
+ {
+ "epoch": 3.240189125295508,
+ "grad_norm": 2.927987813949585,
+ "learning_rate": 2.214908791802045e-06,
+ "loss": 0.4036,
+ "step": 6853
+ },
+ {
+ "epoch": 3.240661938534279,
+ "grad_norm": 3.1679599285125732,
+ "learning_rate": 2.2142890432038943e-06,
+ "loss": 0.3897,
+ "step": 6854
+ },
+ {
+ "epoch": 3.2411347517730498,
+ "grad_norm": 3.2094008922576904,
+ "learning_rate": 2.213669312395712e-06,
+ "loss": 0.4429,
+ "step": 6855
+ },
+ {
+ "epoch": 3.24160756501182,
+ "grad_norm": 4.637594223022461,
+ "learning_rate": 2.2130495994160857e-06,
+ "loss": 0.3708,
+ "step": 6856
+ },
+ {
+ "epoch": 3.242080378250591,
+ "grad_norm": 3.0063490867614746,
+ "learning_rate": 2.212429904303603e-06,
+ "loss": 0.3949,
+ "step": 6857
+ },
+ {
+ "epoch": 3.2425531914893617,
+ "grad_norm": 3.285444736480713,
+ "learning_rate": 2.21181022709685e-06,
+ "loss": 0.4236,
+ "step": 6858
+ },
+ {
+ "epoch": 3.2430260047281325,
+ "grad_norm": 3.02506422996521,
+ "learning_rate": 2.2111905678344086e-06,
+ "loss": 0.368,
+ "step": 6859
+ },
+ {
+ "epoch": 3.243498817966903,
+ "grad_norm": 2.9845006465911865,
+ "learning_rate": 2.2105709265548657e-06,
+ "loss": 0.4154,
+ "step": 6860
+ },
+ {
+ "epoch": 3.2439716312056737,
+ "grad_norm": 3.2537527084350586,
+ "learning_rate": 2.2099513032968013e-06,
+ "loss": 0.4385,
+ "step": 6861
+ },
+ {
+ "epoch": 3.2444444444444445,
+ "grad_norm": 2.8521063327789307,
+ "learning_rate": 2.2093316980987985e-06,
+ "loss": 0.384,
+ "step": 6862
+ },
+ {
+ "epoch": 3.2449172576832153,
+ "grad_norm": 3.186844825744629,
+ "learning_rate": 2.208712110999436e-06,
+ "loss": 0.4131,
+ "step": 6863
+ },
+ {
+ "epoch": 3.2453900709219856,
+ "grad_norm": 2.932058095932007,
+ "learning_rate": 2.208092542037292e-06,
+ "loss": 0.3341,
+ "step": 6864
+ },
+ {
+ "epoch": 3.2458628841607564,
+ "grad_norm": 3.0818707942962646,
+ "learning_rate": 2.2074729912509462e-06,
+ "loss": 0.4149,
+ "step": 6865
+ },
+ {
+ "epoch": 3.246335697399527,
+ "grad_norm": 2.9788503646850586,
+ "learning_rate": 2.2068534586789735e-06,
+ "loss": 0.3572,
+ "step": 6866
+ },
+ {
+ "epoch": 3.246808510638298,
+ "grad_norm": 2.84075665473938,
+ "learning_rate": 2.206233944359952e-06,
+ "loss": 0.3561,
+ "step": 6867
+ },
+ {
+ "epoch": 3.2472813238770684,
+ "grad_norm": 2.966459035873413,
+ "learning_rate": 2.2056144483324545e-06,
+ "loss": 0.3909,
+ "step": 6868
+ },
+ {
+ "epoch": 3.247754137115839,
+ "grad_norm": 2.892038106918335,
+ "learning_rate": 2.204994970635054e-06,
+ "loss": 0.3557,
+ "step": 6869
+ },
+ {
+ "epoch": 3.24822695035461,
+ "grad_norm": 2.7458810806274414,
+ "learning_rate": 2.2043755113063233e-06,
+ "loss": 0.3551,
+ "step": 6870
+ },
+ {
+ "epoch": 3.2486997635933808,
+ "grad_norm": 2.766803741455078,
+ "learning_rate": 2.2037560703848334e-06,
+ "loss": 0.3343,
+ "step": 6871
+ },
+ {
+ "epoch": 3.249172576832151,
+ "grad_norm": 2.9780561923980713,
+ "learning_rate": 2.2031366479091533e-06,
+ "loss": 0.4004,
+ "step": 6872
+ },
+ {
+ "epoch": 3.249645390070922,
+ "grad_norm": 2.8848516941070557,
+ "learning_rate": 2.202517243917853e-06,
+ "loss": 0.3467,
+ "step": 6873
+ },
+ {
+ "epoch": 3.2501182033096927,
+ "grad_norm": 2.9962213039398193,
+ "learning_rate": 2.201897858449499e-06,
+ "loss": 0.3796,
+ "step": 6874
+ },
+ {
+ "epoch": 3.2505910165484635,
+ "grad_norm": 2.838131904602051,
+ "learning_rate": 2.201278491542659e-06,
+ "loss": 0.3683,
+ "step": 6875
+ },
+ {
+ "epoch": 3.251063829787234,
+ "grad_norm": 3.0232505798339844,
+ "learning_rate": 2.200659143235897e-06,
+ "loss": 0.3793,
+ "step": 6876
+ },
+ {
+ "epoch": 3.2515366430260046,
+ "grad_norm": 3.0690126419067383,
+ "learning_rate": 2.2000398135677776e-06,
+ "loss": 0.417,
+ "step": 6877
+ },
+ {
+ "epoch": 3.2520094562647754,
+ "grad_norm": 3.1838719844818115,
+ "learning_rate": 2.1994205025768643e-06,
+ "loss": 0.4608,
+ "step": 6878
+ },
+ {
+ "epoch": 3.2524822695035462,
+ "grad_norm": 3.1187257766723633,
+ "learning_rate": 2.198801210301717e-06,
+ "loss": 0.3396,
+ "step": 6879
+ },
+ {
+ "epoch": 3.2529550827423166,
+ "grad_norm": 2.7608656883239746,
+ "learning_rate": 2.1981819367808984e-06,
+ "loss": 0.386,
+ "step": 6880
+ },
+ {
+ "epoch": 3.2534278959810874,
+ "grad_norm": 3.027456283569336,
+ "learning_rate": 2.197562682052968e-06,
+ "loss": 0.3941,
+ "step": 6881
+ },
+ {
+ "epoch": 3.253900709219858,
+ "grad_norm": 2.925515651702881,
+ "learning_rate": 2.1969434461564816e-06,
+ "loss": 0.3608,
+ "step": 6882
+ },
+ {
+ "epoch": 3.254373522458629,
+ "grad_norm": 2.946770668029785,
+ "learning_rate": 2.196324229129999e-06,
+ "loss": 0.4116,
+ "step": 6883
+ },
+ {
+ "epoch": 3.2548463356973993,
+ "grad_norm": 2.6497952938079834,
+ "learning_rate": 2.1957050310120746e-06,
+ "loss": 0.338,
+ "step": 6884
+ },
+ {
+ "epoch": 3.25531914893617,
+ "grad_norm": 2.6915128231048584,
+ "learning_rate": 2.195085851841264e-06,
+ "loss": 0.3372,
+ "step": 6885
+ },
+ {
+ "epoch": 3.255791962174941,
+ "grad_norm": 3.4022350311279297,
+ "learning_rate": 2.1944666916561205e-06,
+ "loss": 0.3844,
+ "step": 6886
+ },
+ {
+ "epoch": 3.2562647754137117,
+ "grad_norm": 2.7463366985321045,
+ "learning_rate": 2.1938475504951958e-06,
+ "loss": 0.3268,
+ "step": 6887
+ },
+ {
+ "epoch": 3.256737588652482,
+ "grad_norm": 2.828810691833496,
+ "learning_rate": 2.193228428397042e-06,
+ "loss": 0.3275,
+ "step": 6888
+ },
+ {
+ "epoch": 3.257210401891253,
+ "grad_norm": 3.4016268253326416,
+ "learning_rate": 2.192609325400208e-06,
+ "loss": 0.3916,
+ "step": 6889
+ },
+ {
+ "epoch": 3.2576832151300237,
+ "grad_norm": 2.4980733394622803,
+ "learning_rate": 2.191990241543245e-06,
+ "loss": 0.3636,
+ "step": 6890
+ },
+ {
+ "epoch": 3.2581560283687945,
+ "grad_norm": 3.0384702682495117,
+ "learning_rate": 2.191371176864698e-06,
+ "loss": 0.398,
+ "step": 6891
+ },
+ {
+ "epoch": 3.258628841607565,
+ "grad_norm": 2.8949527740478516,
+ "learning_rate": 2.190752131403115e-06,
+ "loss": 0.3919,
+ "step": 6892
+ },
+ {
+ "epoch": 3.2591016548463356,
+ "grad_norm": 2.765617609024048,
+ "learning_rate": 2.190133105197041e-06,
+ "loss": 0.3799,
+ "step": 6893
+ },
+ {
+ "epoch": 3.2595744680851064,
+ "grad_norm": 2.6149277687072754,
+ "learning_rate": 2.18951409828502e-06,
+ "loss": 0.3895,
+ "step": 6894
+ },
+ {
+ "epoch": 3.260047281323877,
+ "grad_norm": 2.9738945960998535,
+ "learning_rate": 2.1888951107055934e-06,
+ "loss": 0.3879,
+ "step": 6895
+ },
+ {
+ "epoch": 3.2605200945626476,
+ "grad_norm": 2.9438633918762207,
+ "learning_rate": 2.1882761424973053e-06,
+ "loss": 0.438,
+ "step": 6896
+ },
+ {
+ "epoch": 3.2609929078014184,
+ "grad_norm": 3.114243984222412,
+ "learning_rate": 2.1876571936986936e-06,
+ "loss": 0.4737,
+ "step": 6897
+ },
+ {
+ "epoch": 3.261465721040189,
+ "grad_norm": 3.017526388168335,
+ "learning_rate": 2.1870382643483e-06,
+ "loss": 0.4039,
+ "step": 6898
+ },
+ {
+ "epoch": 3.26193853427896,
+ "grad_norm": 3.1475703716278076,
+ "learning_rate": 2.1864193544846613e-06,
+ "loss": 0.3825,
+ "step": 6899
+ },
+ {
+ "epoch": 3.2624113475177303,
+ "grad_norm": 2.75502872467041,
+ "learning_rate": 2.1858004641463142e-06,
+ "loss": 0.3507,
+ "step": 6900
+ },
+ {
+ "epoch": 3.262884160756501,
+ "grad_norm": 3.0467209815979004,
+ "learning_rate": 2.1851815933717944e-06,
+ "loss": 0.3938,
+ "step": 6901
+ },
+ {
+ "epoch": 3.263356973995272,
+ "grad_norm": 2.993014097213745,
+ "learning_rate": 2.184562742199636e-06,
+ "loss": 0.3711,
+ "step": 6902
+ },
+ {
+ "epoch": 3.2638297872340427,
+ "grad_norm": 2.607309341430664,
+ "learning_rate": 2.183943910668373e-06,
+ "loss": 0.3689,
+ "step": 6903
+ },
+ {
+ "epoch": 3.264302600472813,
+ "grad_norm": 2.961653470993042,
+ "learning_rate": 2.1833250988165373e-06,
+ "loss": 0.3806,
+ "step": 6904
+ },
+ {
+ "epoch": 3.264775413711584,
+ "grad_norm": 2.8202552795410156,
+ "learning_rate": 2.1827063066826574e-06,
+ "loss": 0.391,
+ "step": 6905
+ },
+ {
+ "epoch": 3.2652482269503547,
+ "grad_norm": 3.032648801803589,
+ "learning_rate": 2.1820875343052666e-06,
+ "loss": 0.4011,
+ "step": 6906
+ },
+ {
+ "epoch": 3.2657210401891255,
+ "grad_norm": 2.8265180587768555,
+ "learning_rate": 2.1814687817228896e-06,
+ "loss": 0.3923,
+ "step": 6907
+ },
+ {
+ "epoch": 3.266193853427896,
+ "grad_norm": 3.1425564289093018,
+ "learning_rate": 2.1808500489740555e-06,
+ "loss": 0.4913,
+ "step": 6908
+ },
+ {
+ "epoch": 3.2666666666666666,
+ "grad_norm": 2.977809429168701,
+ "learning_rate": 2.18023133609729e-06,
+ "loss": 0.379,
+ "step": 6909
+ },
+ {
+ "epoch": 3.2671394799054374,
+ "grad_norm": 3.509551525115967,
+ "learning_rate": 2.1796126431311153e-06,
+ "loss": 0.4025,
+ "step": 6910
+ },
+ {
+ "epoch": 3.267612293144208,
+ "grad_norm": 2.9133846759796143,
+ "learning_rate": 2.178993970114058e-06,
+ "loss": 0.4209,
+ "step": 6911
+ },
+ {
+ "epoch": 3.2680851063829786,
+ "grad_norm": 2.945513963699341,
+ "learning_rate": 2.178375317084637e-06,
+ "loss": 0.3882,
+ "step": 6912
+ },
+ {
+ "epoch": 3.2685579196217494,
+ "grad_norm": 2.7868733406066895,
+ "learning_rate": 2.1777566840813763e-06,
+ "loss": 0.3456,
+ "step": 6913
+ },
+ {
+ "epoch": 3.26903073286052,
+ "grad_norm": 2.803220748901367,
+ "learning_rate": 2.1771380711427937e-06,
+ "loss": 0.3394,
+ "step": 6914
+ },
+ {
+ "epoch": 3.269503546099291,
+ "grad_norm": 3.1293554306030273,
+ "learning_rate": 2.176519478307407e-06,
+ "loss": 0.402,
+ "step": 6915
+ },
+ {
+ "epoch": 3.2699763593380613,
+ "grad_norm": 2.843971014022827,
+ "learning_rate": 2.1759009056137347e-06,
+ "loss": 0.3449,
+ "step": 6916
+ },
+ {
+ "epoch": 3.270449172576832,
+ "grad_norm": 2.9983274936676025,
+ "learning_rate": 2.1752823531002917e-06,
+ "loss": 0.4091,
+ "step": 6917
+ },
+ {
+ "epoch": 3.270921985815603,
+ "grad_norm": 2.686722993850708,
+ "learning_rate": 2.174663820805592e-06,
+ "loss": 0.4303,
+ "step": 6918
+ },
+ {
+ "epoch": 3.2713947990543737,
+ "grad_norm": 2.669349431991577,
+ "learning_rate": 2.1740453087681508e-06,
+ "loss": 0.3796,
+ "step": 6919
+ },
+ {
+ "epoch": 3.271867612293144,
+ "grad_norm": 2.992138624191284,
+ "learning_rate": 2.173426817026477e-06,
+ "loss": 0.4125,
+ "step": 6920
+ },
+ {
+ "epoch": 3.272340425531915,
+ "grad_norm": 3.332834243774414,
+ "learning_rate": 2.1728083456190852e-06,
+ "loss": 0.3885,
+ "step": 6921
+ },
+ {
+ "epoch": 3.2728132387706856,
+ "grad_norm": 2.869673013687134,
+ "learning_rate": 2.1721898945844825e-06,
+ "loss": 0.3941,
+ "step": 6922
+ },
+ {
+ "epoch": 3.2732860520094564,
+ "grad_norm": 2.804440975189209,
+ "learning_rate": 2.1715714639611774e-06,
+ "loss": 0.4007,
+ "step": 6923
+ },
+ {
+ "epoch": 3.273758865248227,
+ "grad_norm": 3.1751439571380615,
+ "learning_rate": 2.1709530537876774e-06,
+ "loss": 0.3981,
+ "step": 6924
+ },
+ {
+ "epoch": 3.2742316784869976,
+ "grad_norm": 2.6367175579071045,
+ "learning_rate": 2.1703346641024878e-06,
+ "loss": 0.3582,
+ "step": 6925
+ },
+ {
+ "epoch": 3.2747044917257684,
+ "grad_norm": 2.99164080619812,
+ "learning_rate": 2.1697162949441137e-06,
+ "loss": 0.3846,
+ "step": 6926
+ },
+ {
+ "epoch": 3.275177304964539,
+ "grad_norm": 3.3206982612609863,
+ "learning_rate": 2.169097946351057e-06,
+ "loss": 0.3689,
+ "step": 6927
+ },
+ {
+ "epoch": 3.2756501182033095,
+ "grad_norm": 2.927907943725586,
+ "learning_rate": 2.16847961836182e-06,
+ "loss": 0.3536,
+ "step": 6928
+ },
+ {
+ "epoch": 3.2761229314420803,
+ "grad_norm": 3.1950864791870117,
+ "learning_rate": 2.167861311014904e-06,
+ "loss": 0.4154,
+ "step": 6929
+ },
+ {
+ "epoch": 3.276595744680851,
+ "grad_norm": 2.888383388519287,
+ "learning_rate": 2.1672430243488073e-06,
+ "loss": 0.3702,
+ "step": 6930
+ },
+ {
+ "epoch": 3.277068557919622,
+ "grad_norm": 2.842287063598633,
+ "learning_rate": 2.166624758402029e-06,
+ "loss": 0.3623,
+ "step": 6931
+ },
+ {
+ "epoch": 3.2775413711583923,
+ "grad_norm": 2.84350323677063,
+ "learning_rate": 2.166006513213065e-06,
+ "loss": 0.3757,
+ "step": 6932
+ },
+ {
+ "epoch": 3.278014184397163,
+ "grad_norm": 3.105626344680786,
+ "learning_rate": 2.165388288820411e-06,
+ "loss": 0.3955,
+ "step": 6933
+ },
+ {
+ "epoch": 3.278486997635934,
+ "grad_norm": 3.273508071899414,
+ "learning_rate": 2.164770085262561e-06,
+ "loss": 0.4046,
+ "step": 6934
+ },
+ {
+ "epoch": 3.2789598108747047,
+ "grad_norm": 3.2530124187469482,
+ "learning_rate": 2.1641519025780066e-06,
+ "loss": 0.3141,
+ "step": 6935
+ },
+ {
+ "epoch": 3.279432624113475,
+ "grad_norm": 2.822849750518799,
+ "learning_rate": 2.163533740805242e-06,
+ "loss": 0.3973,
+ "step": 6936
+ },
+ {
+ "epoch": 3.279905437352246,
+ "grad_norm": 2.772097587585449,
+ "learning_rate": 2.162915599982756e-06,
+ "loss": 0.3606,
+ "step": 6937
+ },
+ {
+ "epoch": 3.2803782505910166,
+ "grad_norm": 3.150696039199829,
+ "learning_rate": 2.1622974801490365e-06,
+ "loss": 0.4709,
+ "step": 6938
+ },
+ {
+ "epoch": 3.2808510638297874,
+ "grad_norm": 3.2072134017944336,
+ "learning_rate": 2.1616793813425736e-06,
+ "loss": 0.3946,
+ "step": 6939
+ },
+ {
+ "epoch": 3.2813238770685578,
+ "grad_norm": 2.9922473430633545,
+ "learning_rate": 2.1610613036018515e-06,
+ "loss": 0.3263,
+ "step": 6940
+ },
+ {
+ "epoch": 3.2817966903073286,
+ "grad_norm": 2.7818009853363037,
+ "learning_rate": 2.1604432469653555e-06,
+ "loss": 0.3887,
+ "step": 6941
+ },
+ {
+ "epoch": 3.2822695035460994,
+ "grad_norm": 3.12998628616333,
+ "learning_rate": 2.15982521147157e-06,
+ "loss": 0.3522,
+ "step": 6942
+ },
+ {
+ "epoch": 3.28274231678487,
+ "grad_norm": 2.876678228378296,
+ "learning_rate": 2.159207197158976e-06,
+ "loss": 0.3643,
+ "step": 6943
+ },
+ {
+ "epoch": 3.2832151300236405,
+ "grad_norm": 2.825488805770874,
+ "learning_rate": 2.1585892040660565e-06,
+ "loss": 0.3223,
+ "step": 6944
+ },
+ {
+ "epoch": 3.2836879432624113,
+ "grad_norm": 2.8724498748779297,
+ "learning_rate": 2.1579712322312906e-06,
+ "loss": 0.3855,
+ "step": 6945
+ },
+ {
+ "epoch": 3.284160756501182,
+ "grad_norm": 2.841064691543579,
+ "learning_rate": 2.1573532816931547e-06,
+ "loss": 0.4106,
+ "step": 6946
+ },
+ {
+ "epoch": 3.284633569739953,
+ "grad_norm": 3.053391218185425,
+ "learning_rate": 2.1567353524901288e-06,
+ "loss": 0.4875,
+ "step": 6947
+ },
+ {
+ "epoch": 3.2851063829787233,
+ "grad_norm": 2.7294771671295166,
+ "learning_rate": 2.156117444660687e-06,
+ "loss": 0.3856,
+ "step": 6948
+ },
+ {
+ "epoch": 3.285579196217494,
+ "grad_norm": 3.0965659618377686,
+ "learning_rate": 2.155499558243304e-06,
+ "loss": 0.4104,
+ "step": 6949
+ },
+ {
+ "epoch": 3.286052009456265,
+ "grad_norm": 2.778923511505127,
+ "learning_rate": 2.1548816932764536e-06,
+ "loss": 0.3636,
+ "step": 6950
+ },
+ {
+ "epoch": 3.2865248226950357,
+ "grad_norm": 2.890679121017456,
+ "learning_rate": 2.1542638497986054e-06,
+ "loss": 0.4026,
+ "step": 6951
+ },
+ {
+ "epoch": 3.286997635933806,
+ "grad_norm": 3.0466806888580322,
+ "learning_rate": 2.1536460278482326e-06,
+ "loss": 0.3856,
+ "step": 6952
+ },
+ {
+ "epoch": 3.287470449172577,
+ "grad_norm": 3.1367077827453613,
+ "learning_rate": 2.1530282274638013e-06,
+ "loss": 0.3767,
+ "step": 6953
+ },
+ {
+ "epoch": 3.2879432624113476,
+ "grad_norm": 2.984694719314575,
+ "learning_rate": 2.1524104486837823e-06,
+ "loss": 0.4142,
+ "step": 6954
+ },
+ {
+ "epoch": 3.2884160756501184,
+ "grad_norm": 3.1542797088623047,
+ "learning_rate": 2.151792691546641e-06,
+ "loss": 0.4361,
+ "step": 6955
+ },
+ {
+ "epoch": 3.2888888888888888,
+ "grad_norm": 2.7306816577911377,
+ "learning_rate": 2.1511749560908405e-06,
+ "loss": 0.3692,
+ "step": 6956
+ },
+ {
+ "epoch": 3.2893617021276595,
+ "grad_norm": 3.6679904460906982,
+ "learning_rate": 2.150557242354847e-06,
+ "loss": 0.4496,
+ "step": 6957
+ },
+ {
+ "epoch": 3.2898345153664303,
+ "grad_norm": 3.2040863037109375,
+ "learning_rate": 2.1499395503771207e-06,
+ "loss": 0.3526,
+ "step": 6958
+ },
+ {
+ "epoch": 3.290307328605201,
+ "grad_norm": 3.2416043281555176,
+ "learning_rate": 2.1493218801961246e-06,
+ "loss": 0.3955,
+ "step": 6959
+ },
+ {
+ "epoch": 3.2907801418439715,
+ "grad_norm": 2.8164525032043457,
+ "learning_rate": 2.1487042318503174e-06,
+ "loss": 0.3727,
+ "step": 6960
+ },
+ {
+ "epoch": 3.2912529550827423,
+ "grad_norm": 2.5954513549804688,
+ "learning_rate": 2.148086605378156e-06,
+ "loss": 0.3315,
+ "step": 6961
+ },
+ {
+ "epoch": 3.291725768321513,
+ "grad_norm": 2.8068149089813232,
+ "learning_rate": 2.1474690008181e-06,
+ "loss": 0.3702,
+ "step": 6962
+ },
+ {
+ "epoch": 3.2921985815602834,
+ "grad_norm": 2.9063730239868164,
+ "learning_rate": 2.1468514182086025e-06,
+ "loss": 0.3357,
+ "step": 6963
+ },
+ {
+ "epoch": 3.2926713947990542,
+ "grad_norm": 2.7623207569122314,
+ "learning_rate": 2.1462338575881197e-06,
+ "loss": 0.381,
+ "step": 6964
+ },
+ {
+ "epoch": 3.293144208037825,
+ "grad_norm": 2.6818830966949463,
+ "learning_rate": 2.145616318995103e-06,
+ "loss": 0.3733,
+ "step": 6965
+ },
+ {
+ "epoch": 3.293617021276596,
+ "grad_norm": 2.7966864109039307,
+ "learning_rate": 2.1449988024680034e-06,
+ "loss": 0.3993,
+ "step": 6966
+ },
+ {
+ "epoch": 3.2940898345153666,
+ "grad_norm": 3.0644514560699463,
+ "learning_rate": 2.1443813080452728e-06,
+ "loss": 0.3541,
+ "step": 6967
+ },
+ {
+ "epoch": 3.294562647754137,
+ "grad_norm": 3.03204607963562,
+ "learning_rate": 2.1437638357653586e-06,
+ "loss": 0.3864,
+ "step": 6968
+ },
+ {
+ "epoch": 3.295035460992908,
+ "grad_norm": 2.980565071105957,
+ "learning_rate": 2.143146385666707e-06,
+ "loss": 0.36,
+ "step": 6969
+ },
+ {
+ "epoch": 3.2955082742316786,
+ "grad_norm": 3.1261661052703857,
+ "learning_rate": 2.1425289577877675e-06,
+ "loss": 0.4053,
+ "step": 6970
+ },
+ {
+ "epoch": 3.295981087470449,
+ "grad_norm": 3.0194897651672363,
+ "learning_rate": 2.1419115521669804e-06,
+ "loss": 0.4553,
+ "step": 6971
+ },
+ {
+ "epoch": 3.2964539007092197,
+ "grad_norm": 2.7620482444763184,
+ "learning_rate": 2.141294168842792e-06,
+ "loss": 0.3846,
+ "step": 6972
+ },
+ {
+ "epoch": 3.2969267139479905,
+ "grad_norm": 2.9575016498565674,
+ "learning_rate": 2.1406768078536427e-06,
+ "loss": 0.4415,
+ "step": 6973
+ },
+ {
+ "epoch": 3.2973995271867613,
+ "grad_norm": 3.17909574508667,
+ "learning_rate": 2.1400594692379717e-06,
+ "loss": 0.4514,
+ "step": 6974
+ },
+ {
+ "epoch": 3.297872340425532,
+ "grad_norm": 3.128613233566284,
+ "learning_rate": 2.1394421530342207e-06,
+ "loss": 0.3757,
+ "step": 6975
+ },
+ {
+ "epoch": 3.2983451536643025,
+ "grad_norm": 3.0247111320495605,
+ "learning_rate": 2.1388248592808243e-06,
+ "loss": 0.3881,
+ "step": 6976
+ },
+ {
+ "epoch": 3.2988179669030733,
+ "grad_norm": 2.8091228008270264,
+ "learning_rate": 2.1382075880162217e-06,
+ "loss": 0.3782,
+ "step": 6977
+ },
+ {
+ "epoch": 3.299290780141844,
+ "grad_norm": 2.985105514526367,
+ "learning_rate": 2.137590339278846e-06,
+ "loss": 0.3783,
+ "step": 6978
+ },
+ {
+ "epoch": 3.2997635933806144,
+ "grad_norm": 3.1862502098083496,
+ "learning_rate": 2.1369731131071304e-06,
+ "loss": 0.4776,
+ "step": 6979
+ },
+ {
+ "epoch": 3.300236406619385,
+ "grad_norm": 3.3138091564178467,
+ "learning_rate": 2.1363559095395075e-06,
+ "loss": 0.4056,
+ "step": 6980
+ },
+ {
+ "epoch": 3.300709219858156,
+ "grad_norm": 3.023695707321167,
+ "learning_rate": 2.135738728614407e-06,
+ "loss": 0.3716,
+ "step": 6981
+ },
+ {
+ "epoch": 3.301182033096927,
+ "grad_norm": 6.149252414703369,
+ "learning_rate": 2.135121570370259e-06,
+ "loss": 0.3713,
+ "step": 6982
+ },
+ {
+ "epoch": 3.3016548463356976,
+ "grad_norm": 2.689671754837036,
+ "learning_rate": 2.134504434845491e-06,
+ "loss": 0.3541,
+ "step": 6983
+ },
+ {
+ "epoch": 3.302127659574468,
+ "grad_norm": 3.241212844848633,
+ "learning_rate": 2.1338873220785284e-06,
+ "loss": 0.4328,
+ "step": 6984
+ },
+ {
+ "epoch": 3.3026004728132388,
+ "grad_norm": 3.6037068367004395,
+ "learning_rate": 2.133270232107798e-06,
+ "loss": 0.4091,
+ "step": 6985
+ },
+ {
+ "epoch": 3.3030732860520096,
+ "grad_norm": 3.300031900405884,
+ "learning_rate": 2.1326531649717216e-06,
+ "loss": 0.3742,
+ "step": 6986
+ },
+ {
+ "epoch": 3.30354609929078,
+ "grad_norm": 2.82257342338562,
+ "learning_rate": 2.1320361207087225e-06,
+ "loss": 0.3622,
+ "step": 6987
+ },
+ {
+ "epoch": 3.3040189125295507,
+ "grad_norm": 3.297513246536255,
+ "learning_rate": 2.1314190993572196e-06,
+ "loss": 0.4606,
+ "step": 6988
+ },
+ {
+ "epoch": 3.3044917257683215,
+ "grad_norm": 2.676440954208374,
+ "learning_rate": 2.130802100955634e-06,
+ "loss": 0.382,
+ "step": 6989
+ },
+ {
+ "epoch": 3.3049645390070923,
+ "grad_norm": 2.9548017978668213,
+ "learning_rate": 2.130185125542383e-06,
+ "loss": 0.3751,
+ "step": 6990
+ },
+ {
+ "epoch": 3.305437352245863,
+ "grad_norm": 2.800647020339966,
+ "learning_rate": 2.129568173155882e-06,
+ "loss": 0.3868,
+ "step": 6991
+ },
+ {
+ "epoch": 3.3059101654846335,
+ "grad_norm": 3.3789260387420654,
+ "learning_rate": 2.128951243834546e-06,
+ "loss": 0.4373,
+ "step": 6992
+ },
+ {
+ "epoch": 3.3063829787234043,
+ "grad_norm": 2.944807767868042,
+ "learning_rate": 2.12833433761679e-06,
+ "loss": 0.4205,
+ "step": 6993
+ },
+ {
+ "epoch": 3.306855791962175,
+ "grad_norm": 2.577975273132324,
+ "learning_rate": 2.127717454541025e-06,
+ "loss": 0.4197,
+ "step": 6994
+ },
+ {
+ "epoch": 3.3073286052009454,
+ "grad_norm": 3.0542666912078857,
+ "learning_rate": 2.127100594645661e-06,
+ "loss": 0.3811,
+ "step": 6995
+ },
+ {
+ "epoch": 3.307801418439716,
+ "grad_norm": 3.163015842437744,
+ "learning_rate": 2.1264837579691088e-06,
+ "loss": 0.415,
+ "step": 6996
+ },
+ {
+ "epoch": 3.308274231678487,
+ "grad_norm": 2.9161269664764404,
+ "learning_rate": 2.1258669445497746e-06,
+ "loss": 0.3714,
+ "step": 6997
+ },
+ {
+ "epoch": 3.308747044917258,
+ "grad_norm": 2.934483289718628,
+ "learning_rate": 2.1252501544260657e-06,
+ "loss": 0.4085,
+ "step": 6998
+ },
+ {
+ "epoch": 3.3092198581560286,
+ "grad_norm": 3.155613660812378,
+ "learning_rate": 2.1246333876363852e-06,
+ "loss": 0.4698,
+ "step": 6999
+ },
+ {
+ "epoch": 3.309692671394799,
+ "grad_norm": 2.648171901702881,
+ "learning_rate": 2.124016644219139e-06,
+ "loss": 0.3091,
+ "step": 7000
+ },
+ {
+ "epoch": 3.3101654846335697,
+ "grad_norm": 2.908219814300537,
+ "learning_rate": 2.123399924212728e-06,
+ "loss": 0.4063,
+ "step": 7001
+ },
+ {
+ "epoch": 3.3106382978723405,
+ "grad_norm": 3.138749361038208,
+ "learning_rate": 2.122783227655551e-06,
+ "loss": 0.4296,
+ "step": 7002
+ },
+ {
+ "epoch": 3.311111111111111,
+ "grad_norm": 3.044466018676758,
+ "learning_rate": 2.1221665545860094e-06,
+ "loss": 0.4424,
+ "step": 7003
+ },
+ {
+ "epoch": 3.3115839243498817,
+ "grad_norm": 2.6758792400360107,
+ "learning_rate": 2.121549905042499e-06,
+ "loss": 0.4073,
+ "step": 7004
+ },
+ {
+ "epoch": 3.3120567375886525,
+ "grad_norm": 2.8901989459991455,
+ "learning_rate": 2.1209332790634174e-06,
+ "loss": 0.3842,
+ "step": 7005
+ },
+ {
+ "epoch": 3.3125295508274233,
+ "grad_norm": 2.8179712295532227,
+ "learning_rate": 2.1203166766871582e-06,
+ "loss": 0.366,
+ "step": 7006
+ },
+ {
+ "epoch": 3.313002364066194,
+ "grad_norm": 2.6536550521850586,
+ "learning_rate": 2.1197000979521138e-06,
+ "loss": 0.3851,
+ "step": 7007
+ },
+ {
+ "epoch": 3.3134751773049644,
+ "grad_norm": 3.1277682781219482,
+ "learning_rate": 2.1190835428966775e-06,
+ "loss": 0.4249,
+ "step": 7008
+ },
+ {
+ "epoch": 3.3139479905437352,
+ "grad_norm": 2.924666166305542,
+ "learning_rate": 2.1184670115592383e-06,
+ "loss": 0.3873,
+ "step": 7009
+ },
+ {
+ "epoch": 3.314420803782506,
+ "grad_norm": 2.7921009063720703,
+ "learning_rate": 2.1178505039781856e-06,
+ "loss": 0.3754,
+ "step": 7010
+ },
+ {
+ "epoch": 3.3148936170212764,
+ "grad_norm": 2.5349879264831543,
+ "learning_rate": 2.1172340201919067e-06,
+ "loss": 0.3701,
+ "step": 7011
+ },
+ {
+ "epoch": 3.315366430260047,
+ "grad_norm": 2.849376678466797,
+ "learning_rate": 2.1166175602387866e-06,
+ "loss": 0.3963,
+ "step": 7012
+ },
+ {
+ "epoch": 3.315839243498818,
+ "grad_norm": 3.141280174255371,
+ "learning_rate": 2.11600112415721e-06,
+ "loss": 0.4158,
+ "step": 7013
+ },
+ {
+ "epoch": 3.3163120567375888,
+ "grad_norm": 2.922807455062866,
+ "learning_rate": 2.11538471198556e-06,
+ "loss": 0.3667,
+ "step": 7014
+ },
+ {
+ "epoch": 3.3167848699763596,
+ "grad_norm": 2.770400047302246,
+ "learning_rate": 2.114768323762216e-06,
+ "loss": 0.3674,
+ "step": 7015
+ },
+ {
+ "epoch": 3.31725768321513,
+ "grad_norm": 2.7706570625305176,
+ "learning_rate": 2.114151959525561e-06,
+ "loss": 0.3761,
+ "step": 7016
+ },
+ {
+ "epoch": 3.3177304964539007,
+ "grad_norm": 3.041755437850952,
+ "learning_rate": 2.1135356193139704e-06,
+ "loss": 0.4483,
+ "step": 7017
+ },
+ {
+ "epoch": 3.3182033096926715,
+ "grad_norm": 3.5757904052734375,
+ "learning_rate": 2.1129193031658227e-06,
+ "loss": 0.4094,
+ "step": 7018
+ },
+ {
+ "epoch": 3.318676122931442,
+ "grad_norm": 2.9292917251586914,
+ "learning_rate": 2.1123030111194936e-06,
+ "loss": 0.3514,
+ "step": 7019
+ },
+ {
+ "epoch": 3.3191489361702127,
+ "grad_norm": 3.1443874835968018,
+ "learning_rate": 2.111686743213355e-06,
+ "loss": 0.4098,
+ "step": 7020
+ },
+ {
+ "epoch": 3.3196217494089835,
+ "grad_norm": 2.9738030433654785,
+ "learning_rate": 2.1110704994857804e-06,
+ "loss": 0.3584,
+ "step": 7021
+ },
+ {
+ "epoch": 3.3200945626477543,
+ "grad_norm": 2.8961563110351562,
+ "learning_rate": 2.1104542799751397e-06,
+ "loss": 0.3736,
+ "step": 7022
+ },
+ {
+ "epoch": 3.320567375886525,
+ "grad_norm": 3.9264683723449707,
+ "learning_rate": 2.1098380847198037e-06,
+ "loss": 0.457,
+ "step": 7023
+ },
+ {
+ "epoch": 3.3210401891252954,
+ "grad_norm": 2.8742756843566895,
+ "learning_rate": 2.109221913758139e-06,
+ "loss": 0.4252,
+ "step": 7024
+ },
+ {
+ "epoch": 3.321513002364066,
+ "grad_norm": 3.7229559421539307,
+ "learning_rate": 2.108605767128512e-06,
+ "loss": 0.4451,
+ "step": 7025
+ },
+ {
+ "epoch": 3.321985815602837,
+ "grad_norm": 2.6417593955993652,
+ "learning_rate": 2.1079896448692884e-06,
+ "loss": 0.3658,
+ "step": 7026
+ },
+ {
+ "epoch": 3.3224586288416074,
+ "grad_norm": 2.8780412673950195,
+ "learning_rate": 2.10737354701883e-06,
+ "loss": 0.4225,
+ "step": 7027
+ },
+ {
+ "epoch": 3.322931442080378,
+ "grad_norm": 2.557816505432129,
+ "learning_rate": 2.1067574736155e-06,
+ "loss": 0.3812,
+ "step": 7028
+ },
+ {
+ "epoch": 3.323404255319149,
+ "grad_norm": 2.859062910079956,
+ "learning_rate": 2.106141424697658e-06,
+ "loss": 0.3629,
+ "step": 7029
+ },
+ {
+ "epoch": 3.3238770685579198,
+ "grad_norm": 2.4776878356933594,
+ "learning_rate": 2.1055254003036607e-06,
+ "loss": 0.3591,
+ "step": 7030
+ },
+ {
+ "epoch": 3.3243498817966906,
+ "grad_norm": 3.085066795349121,
+ "learning_rate": 2.1049094004718687e-06,
+ "loss": 0.4237,
+ "step": 7031
+ },
+ {
+ "epoch": 3.324822695035461,
+ "grad_norm": 2.862592935562134,
+ "learning_rate": 2.1042934252406345e-06,
+ "loss": 0.3185,
+ "step": 7032
+ },
+ {
+ "epoch": 3.3252955082742317,
+ "grad_norm": 2.965743064880371,
+ "learning_rate": 2.1036774746483145e-06,
+ "loss": 0.4058,
+ "step": 7033
+ },
+ {
+ "epoch": 3.3257683215130025,
+ "grad_norm": 2.7420589923858643,
+ "learning_rate": 2.103061548733261e-06,
+ "loss": 0.3566,
+ "step": 7034
+ },
+ {
+ "epoch": 3.326241134751773,
+ "grad_norm": 2.7824347019195557,
+ "learning_rate": 2.1024456475338235e-06,
+ "loss": 0.3553,
+ "step": 7035
+ },
+ {
+ "epoch": 3.3267139479905437,
+ "grad_norm": 3.0410704612731934,
+ "learning_rate": 2.1018297710883528e-06,
+ "loss": 0.3772,
+ "step": 7036
+ },
+ {
+ "epoch": 3.3271867612293144,
+ "grad_norm": 3.0811562538146973,
+ "learning_rate": 2.101213919435196e-06,
+ "loss": 0.3738,
+ "step": 7037
+ },
+ {
+ "epoch": 3.3276595744680852,
+ "grad_norm": 2.939445734024048,
+ "learning_rate": 2.100598092612699e-06,
+ "loss": 0.4107,
+ "step": 7038
+ },
+ {
+ "epoch": 3.3281323877068556,
+ "grad_norm": 3.05804705619812,
+ "learning_rate": 2.0999822906592086e-06,
+ "loss": 0.3972,
+ "step": 7039
+ },
+ {
+ "epoch": 3.3286052009456264,
+ "grad_norm": 2.803558111190796,
+ "learning_rate": 2.0993665136130657e-06,
+ "loss": 0.3487,
+ "step": 7040
+ },
+ {
+ "epoch": 3.329078014184397,
+ "grad_norm": 2.937675714492798,
+ "learning_rate": 2.0987507615126147e-06,
+ "loss": 0.4095,
+ "step": 7041
+ },
+ {
+ "epoch": 3.329550827423168,
+ "grad_norm": 2.853905439376831,
+ "learning_rate": 2.098135034396194e-06,
+ "loss": 0.3775,
+ "step": 7042
+ },
+ {
+ "epoch": 3.3300236406619383,
+ "grad_norm": 3.3520495891571045,
+ "learning_rate": 2.097519332302142e-06,
+ "loss": 0.4065,
+ "step": 7043
+ },
+ {
+ "epoch": 3.330496453900709,
+ "grad_norm": 2.8787078857421875,
+ "learning_rate": 2.096903655268797e-06,
+ "loss": 0.3452,
+ "step": 7044
+ },
+ {
+ "epoch": 3.33096926713948,
+ "grad_norm": 2.993896007537842,
+ "learning_rate": 2.096288003334493e-06,
+ "loss": 0.3814,
+ "step": 7045
+ },
+ {
+ "epoch": 3.3314420803782507,
+ "grad_norm": 3.5248336791992188,
+ "learning_rate": 2.0956723765375655e-06,
+ "loss": 0.3852,
+ "step": 7046
+ },
+ {
+ "epoch": 3.331914893617021,
+ "grad_norm": 3.2227890491485596,
+ "learning_rate": 2.0950567749163463e-06,
+ "loss": 0.3913,
+ "step": 7047
+ },
+ {
+ "epoch": 3.332387706855792,
+ "grad_norm": 3.390401601791382,
+ "learning_rate": 2.094441198509165e-06,
+ "loss": 0.3944,
+ "step": 7048
+ },
+ {
+ "epoch": 3.3328605200945627,
+ "grad_norm": 3.2057554721832275,
+ "learning_rate": 2.0938256473543534e-06,
+ "loss": 0.404,
+ "step": 7049
+ },
+ {
+ "epoch": 3.3333333333333335,
+ "grad_norm": 2.866708755493164,
+ "learning_rate": 2.0932101214902367e-06,
+ "loss": 0.4345,
+ "step": 7050
+ },
+ {
+ "epoch": 3.333806146572104,
+ "grad_norm": 3.4304039478302,
+ "learning_rate": 2.0925946209551428e-06,
+ "loss": 0.4209,
+ "step": 7051
+ },
+ {
+ "epoch": 3.3342789598108746,
+ "grad_norm": 3.996561288833618,
+ "learning_rate": 2.091979145787395e-06,
+ "loss": 0.4394,
+ "step": 7052
+ },
+ {
+ "epoch": 3.3347517730496454,
+ "grad_norm": 3.1932613849639893,
+ "learning_rate": 2.0913636960253166e-06,
+ "loss": 0.3837,
+ "step": 7053
+ },
+ {
+ "epoch": 3.3352245862884162,
+ "grad_norm": 2.908832311630249,
+ "learning_rate": 2.0907482717072293e-06,
+ "loss": 0.3526,
+ "step": 7054
+ },
+ {
+ "epoch": 3.3356973995271866,
+ "grad_norm": 2.7319607734680176,
+ "learning_rate": 2.090132872871452e-06,
+ "loss": 0.3686,
+ "step": 7055
+ },
+ {
+ "epoch": 3.3361702127659574,
+ "grad_norm": 2.9213504791259766,
+ "learning_rate": 2.0895174995563043e-06,
+ "loss": 0.4034,
+ "step": 7056
+ },
+ {
+ "epoch": 3.336643026004728,
+ "grad_norm": 2.8093936443328857,
+ "learning_rate": 2.0889021518001017e-06,
+ "loss": 0.4151,
+ "step": 7057
+ },
+ {
+ "epoch": 3.337115839243499,
+ "grad_norm": 3.1840829849243164,
+ "learning_rate": 2.0882868296411594e-06,
+ "loss": 0.3501,
+ "step": 7058
+ },
+ {
+ "epoch": 3.3375886524822693,
+ "grad_norm": 2.793567657470703,
+ "learning_rate": 2.087671533117791e-06,
+ "loss": 0.3911,
+ "step": 7059
+ },
+ {
+ "epoch": 3.33806146572104,
+ "grad_norm": 3.0820090770721436,
+ "learning_rate": 2.0870562622683077e-06,
+ "loss": 0.432,
+ "step": 7060
+ },
+ {
+ "epoch": 3.338534278959811,
+ "grad_norm": 2.774630546569824,
+ "learning_rate": 2.0864410171310213e-06,
+ "loss": 0.3434,
+ "step": 7061
+ },
+ {
+ "epoch": 3.3390070921985817,
+ "grad_norm": 2.70447039604187,
+ "learning_rate": 2.085825797744239e-06,
+ "loss": 0.3787,
+ "step": 7062
+ },
+ {
+ "epoch": 3.339479905437352,
+ "grad_norm": 3.1014437675476074,
+ "learning_rate": 2.0852106041462672e-06,
+ "loss": 0.4568,
+ "step": 7063
+ },
+ {
+ "epoch": 3.339952718676123,
+ "grad_norm": 3.312680244445801,
+ "learning_rate": 2.0845954363754133e-06,
+ "loss": 0.4285,
+ "step": 7064
+ },
+ {
+ "epoch": 3.3404255319148937,
+ "grad_norm": 2.7070534229278564,
+ "learning_rate": 2.0839802944699806e-06,
+ "loss": 0.4096,
+ "step": 7065
+ },
+ {
+ "epoch": 3.3408983451536645,
+ "grad_norm": 2.8172531127929688,
+ "learning_rate": 2.083365178468269e-06,
+ "loss": 0.3652,
+ "step": 7066
+ },
+ {
+ "epoch": 3.341371158392435,
+ "grad_norm": 2.896378517150879,
+ "learning_rate": 2.082750088408582e-06,
+ "loss": 0.3778,
+ "step": 7067
+ },
+ {
+ "epoch": 3.3418439716312056,
+ "grad_norm": 2.769805669784546,
+ "learning_rate": 2.0821350243292175e-06,
+ "loss": 0.3593,
+ "step": 7068
+ },
+ {
+ "epoch": 3.3423167848699764,
+ "grad_norm": 2.672520875930786,
+ "learning_rate": 2.0815199862684728e-06,
+ "loss": 0.3873,
+ "step": 7069
+ },
+ {
+ "epoch": 3.342789598108747,
+ "grad_norm": 2.841327428817749,
+ "learning_rate": 2.0809049742646435e-06,
+ "loss": 0.41,
+ "step": 7070
+ },
+ {
+ "epoch": 3.3432624113475176,
+ "grad_norm": 3.0540482997894287,
+ "learning_rate": 2.080289988356023e-06,
+ "loss": 0.32,
+ "step": 7071
+ },
+ {
+ "epoch": 3.3437352245862884,
+ "grad_norm": 3.471684217453003,
+ "learning_rate": 2.079675028580905e-06,
+ "loss": 0.3779,
+ "step": 7072
+ },
+ {
+ "epoch": 3.344208037825059,
+ "grad_norm": 2.8545875549316406,
+ "learning_rate": 2.07906009497758e-06,
+ "loss": 0.4645,
+ "step": 7073
+ },
+ {
+ "epoch": 3.34468085106383,
+ "grad_norm": 2.7771127223968506,
+ "learning_rate": 2.078445187584337e-06,
+ "loss": 0.3889,
+ "step": 7074
+ },
+ {
+ "epoch": 3.3451536643026003,
+ "grad_norm": 2.769188165664673,
+ "learning_rate": 2.0778303064394647e-06,
+ "loss": 0.3745,
+ "step": 7075
+ },
+ {
+ "epoch": 3.345626477541371,
+ "grad_norm": 2.739577531814575,
+ "learning_rate": 2.0772154515812467e-06,
+ "loss": 0.4402,
+ "step": 7076
+ },
+ {
+ "epoch": 3.346099290780142,
+ "grad_norm": 2.6124343872070312,
+ "learning_rate": 2.0766006230479696e-06,
+ "loss": 0.3595,
+ "step": 7077
+ },
+ {
+ "epoch": 3.3465721040189127,
+ "grad_norm": 2.7100563049316406,
+ "learning_rate": 2.0759858208779136e-06,
+ "loss": 0.3641,
+ "step": 7078
+ },
+ {
+ "epoch": 3.347044917257683,
+ "grad_norm": 2.8594000339508057,
+ "learning_rate": 2.075371045109363e-06,
+ "loss": 0.402,
+ "step": 7079
+ },
+ {
+ "epoch": 3.347517730496454,
+ "grad_norm": 3.2045278549194336,
+ "learning_rate": 2.0747562957805955e-06,
+ "loss": 0.4719,
+ "step": 7080
+ },
+ {
+ "epoch": 3.3479905437352246,
+ "grad_norm": 2.825594663619995,
+ "learning_rate": 2.0741415729298874e-06,
+ "loss": 0.4127,
+ "step": 7081
+ },
+ {
+ "epoch": 3.3484633569739954,
+ "grad_norm": 2.992403984069824,
+ "learning_rate": 2.0735268765955173e-06,
+ "loss": 0.3943,
+ "step": 7082
+ },
+ {
+ "epoch": 3.348936170212766,
+ "grad_norm": 3.0629165172576904,
+ "learning_rate": 2.072912206815758e-06,
+ "loss": 0.4132,
+ "step": 7083
+ },
+ {
+ "epoch": 3.3494089834515366,
+ "grad_norm": 2.7553658485412598,
+ "learning_rate": 2.0722975636288836e-06,
+ "loss": 0.3667,
+ "step": 7084
+ },
+ {
+ "epoch": 3.3498817966903074,
+ "grad_norm": 3.1556780338287354,
+ "learning_rate": 2.0716829470731647e-06,
+ "loss": 0.4383,
+ "step": 7085
+ },
+ {
+ "epoch": 3.350354609929078,
+ "grad_norm": 2.6693310737609863,
+ "learning_rate": 2.071068357186869e-06,
+ "loss": 0.363,
+ "step": 7086
+ },
+ {
+ "epoch": 3.3508274231678485,
+ "grad_norm": 2.738314628601074,
+ "learning_rate": 2.0704537940082673e-06,
+ "loss": 0.3493,
+ "step": 7087
+ },
+ {
+ "epoch": 3.3513002364066193,
+ "grad_norm": 3.2205989360809326,
+ "learning_rate": 2.069839257575624e-06,
+ "loss": 0.3802,
+ "step": 7088
+ },
+ {
+ "epoch": 3.35177304964539,
+ "grad_norm": 2.8969876766204834,
+ "learning_rate": 2.069224747927203e-06,
+ "loss": 0.3846,
+ "step": 7089
+ },
+ {
+ "epoch": 3.352245862884161,
+ "grad_norm": 2.833179473876953,
+ "learning_rate": 2.0686102651012694e-06,
+ "loss": 0.3892,
+ "step": 7090
+ },
+ {
+ "epoch": 3.3527186761229313,
+ "grad_norm": 3.303830623626709,
+ "learning_rate": 2.067995809136082e-06,
+ "loss": 0.4009,
+ "step": 7091
+ },
+ {
+ "epoch": 3.353191489361702,
+ "grad_norm": 3.3684141635894775,
+ "learning_rate": 2.0673813800699024e-06,
+ "loss": 0.4434,
+ "step": 7092
+ },
+ {
+ "epoch": 3.353664302600473,
+ "grad_norm": 2.6549112796783447,
+ "learning_rate": 2.066766977940987e-06,
+ "loss": 0.3941,
+ "step": 7093
+ },
+ {
+ "epoch": 3.3541371158392437,
+ "grad_norm": 2.852935314178467,
+ "learning_rate": 2.066152602787591e-06,
+ "loss": 0.4143,
+ "step": 7094
+ },
+ {
+ "epoch": 3.354609929078014,
+ "grad_norm": 2.9621706008911133,
+ "learning_rate": 2.0655382546479713e-06,
+ "loss": 0.4502,
+ "step": 7095
+ },
+ {
+ "epoch": 3.355082742316785,
+ "grad_norm": 3.2836413383483887,
+ "learning_rate": 2.064923933560378e-06,
+ "loss": 0.3993,
+ "step": 7096
+ },
+ {
+ "epoch": 3.3555555555555556,
+ "grad_norm": 2.8187968730926514,
+ "learning_rate": 2.0643096395630654e-06,
+ "loss": 0.3766,
+ "step": 7097
+ },
+ {
+ "epoch": 3.3560283687943264,
+ "grad_norm": 2.7965118885040283,
+ "learning_rate": 2.0636953726942803e-06,
+ "loss": 0.4258,
+ "step": 7098
+ },
+ {
+ "epoch": 3.3565011820330968,
+ "grad_norm": 3.002030611038208,
+ "learning_rate": 2.063081132992271e-06,
+ "loss": 0.3548,
+ "step": 7099
+ },
+ {
+ "epoch": 3.3569739952718676,
+ "grad_norm": 2.927603006362915,
+ "learning_rate": 2.0624669204952847e-06,
+ "loss": 0.3759,
+ "step": 7100
+ },
+ {
+ "epoch": 3.3574468085106384,
+ "grad_norm": 2.911393165588379,
+ "learning_rate": 2.061852735241563e-06,
+ "loss": 0.3599,
+ "step": 7101
+ },
+ {
+ "epoch": 3.357919621749409,
+ "grad_norm": 3.0596864223480225,
+ "learning_rate": 2.0612385772693517e-06,
+ "loss": 0.3557,
+ "step": 7102
+ },
+ {
+ "epoch": 3.3583924349881795,
+ "grad_norm": 2.7869808673858643,
+ "learning_rate": 2.0606244466168905e-06,
+ "loss": 0.3696,
+ "step": 7103
+ },
+ {
+ "epoch": 3.3588652482269503,
+ "grad_norm": 2.927715539932251,
+ "learning_rate": 2.060010343322417e-06,
+ "loss": 0.3309,
+ "step": 7104
+ },
+ {
+ "epoch": 3.359338061465721,
+ "grad_norm": 3.44653058052063,
+ "learning_rate": 2.059396267424171e-06,
+ "loss": 0.4453,
+ "step": 7105
+ },
+ {
+ "epoch": 3.359810874704492,
+ "grad_norm": 3.047652244567871,
+ "learning_rate": 2.0587822189603873e-06,
+ "loss": 0.3615,
+ "step": 7106
+ },
+ {
+ "epoch": 3.3602836879432623,
+ "grad_norm": 2.6640517711639404,
+ "learning_rate": 2.0581681979693002e-06,
+ "loss": 0.3716,
+ "step": 7107
+ },
+ {
+ "epoch": 3.360756501182033,
+ "grad_norm": 2.8253493309020996,
+ "learning_rate": 2.0575542044891424e-06,
+ "loss": 0.3485,
+ "step": 7108
+ },
+ {
+ "epoch": 3.361229314420804,
+ "grad_norm": 3.0512938499450684,
+ "learning_rate": 2.0569402385581433e-06,
+ "loss": 0.4582,
+ "step": 7109
+ },
+ {
+ "epoch": 3.3617021276595747,
+ "grad_norm": 2.935060739517212,
+ "learning_rate": 2.0563263002145333e-06,
+ "loss": 0.425,
+ "step": 7110
+ },
+ {
+ "epoch": 3.362174940898345,
+ "grad_norm": 3.2708780765533447,
+ "learning_rate": 2.0557123894965396e-06,
+ "loss": 0.4193,
+ "step": 7111
+ },
+ {
+ "epoch": 3.362647754137116,
+ "grad_norm": 2.758329391479492,
+ "learning_rate": 2.055098506442386e-06,
+ "loss": 0.3754,
+ "step": 7112
+ },
+ {
+ "epoch": 3.3631205673758866,
+ "grad_norm": 3.0359015464782715,
+ "learning_rate": 2.0544846510902987e-06,
+ "loss": 0.4207,
+ "step": 7113
+ },
+ {
+ "epoch": 3.3635933806146574,
+ "grad_norm": 3.096968412399292,
+ "learning_rate": 2.0538708234784983e-06,
+ "loss": 0.4303,
+ "step": 7114
+ },
+ {
+ "epoch": 3.3640661938534278,
+ "grad_norm": 3.0777673721313477,
+ "learning_rate": 2.053257023645206e-06,
+ "loss": 0.3904,
+ "step": 7115
+ },
+ {
+ "epoch": 3.3645390070921986,
+ "grad_norm": 2.9483232498168945,
+ "learning_rate": 2.0526432516286394e-06,
+ "loss": 0.3949,
+ "step": 7116
+ },
+ {
+ "epoch": 3.3650118203309693,
+ "grad_norm": 2.839067220687866,
+ "learning_rate": 2.0520295074670154e-06,
+ "loss": 0.3705,
+ "step": 7117
+ },
+ {
+ "epoch": 3.36548463356974,
+ "grad_norm": 3.0450778007507324,
+ "learning_rate": 2.0514157911985506e-06,
+ "loss": 0.3987,
+ "step": 7118
+ },
+ {
+ "epoch": 3.3659574468085105,
+ "grad_norm": 3.425318717956543,
+ "learning_rate": 2.0508021028614564e-06,
+ "loss": 0.3941,
+ "step": 7119
+ },
+ {
+ "epoch": 3.3664302600472813,
+ "grad_norm": 2.9509286880493164,
+ "learning_rate": 2.0501884424939465e-06,
+ "loss": 0.354,
+ "step": 7120
+ },
+ {
+ "epoch": 3.366903073286052,
+ "grad_norm": 2.799504518508911,
+ "learning_rate": 2.0495748101342303e-06,
+ "loss": 0.3891,
+ "step": 7121
+ },
+ {
+ "epoch": 3.3673758865248224,
+ "grad_norm": 2.9140994548797607,
+ "learning_rate": 2.048961205820515e-06,
+ "loss": 0.3638,
+ "step": 7122
+ },
+ {
+ "epoch": 3.3678486997635932,
+ "grad_norm": 2.8074216842651367,
+ "learning_rate": 2.0483476295910077e-06,
+ "loss": 0.3501,
+ "step": 7123
+ },
+ {
+ "epoch": 3.368321513002364,
+ "grad_norm": 2.770829677581787,
+ "learning_rate": 2.0477340814839126e-06,
+ "loss": 0.3774,
+ "step": 7124
+ },
+ {
+ "epoch": 3.368794326241135,
+ "grad_norm": 2.581655502319336,
+ "learning_rate": 2.047120561537434e-06,
+ "loss": 0.3523,
+ "step": 7125
+ },
+ {
+ "epoch": 3.3692671394799056,
+ "grad_norm": 3.4234209060668945,
+ "learning_rate": 2.046507069789772e-06,
+ "loss": 0.4191,
+ "step": 7126
+ },
+ {
+ "epoch": 3.369739952718676,
+ "grad_norm": 2.669860601425171,
+ "learning_rate": 2.045893606279126e-06,
+ "loss": 0.3542,
+ "step": 7127
+ },
+ {
+ "epoch": 3.370212765957447,
+ "grad_norm": 3.2426629066467285,
+ "learning_rate": 2.045280171043694e-06,
+ "loss": 0.4416,
+ "step": 7128
+ },
+ {
+ "epoch": 3.3706855791962176,
+ "grad_norm": 3.1318910121917725,
+ "learning_rate": 2.044666764121672e-06,
+ "loss": 0.3999,
+ "step": 7129
+ },
+ {
+ "epoch": 3.371158392434988,
+ "grad_norm": 2.7044012546539307,
+ "learning_rate": 2.044053385551254e-06,
+ "loss": 0.3907,
+ "step": 7130
+ },
+ {
+ "epoch": 3.3716312056737587,
+ "grad_norm": 2.9429895877838135,
+ "learning_rate": 2.0434400353706322e-06,
+ "loss": 0.3827,
+ "step": 7131
+ },
+ {
+ "epoch": 3.3721040189125295,
+ "grad_norm": 2.7258787155151367,
+ "learning_rate": 2.0428267136179973e-06,
+ "loss": 0.3688,
+ "step": 7132
+ },
+ {
+ "epoch": 3.3725768321513003,
+ "grad_norm": 2.765108108520508,
+ "learning_rate": 2.042213420331539e-06,
+ "loss": 0.4078,
+ "step": 7133
+ },
+ {
+ "epoch": 3.373049645390071,
+ "grad_norm": 3.2951347827911377,
+ "learning_rate": 2.0416001555494435e-06,
+ "loss": 0.4259,
+ "step": 7134
+ },
+ {
+ "epoch": 3.3735224586288415,
+ "grad_norm": 3.3917062282562256,
+ "learning_rate": 2.040986919309895e-06,
+ "loss": 0.5094,
+ "step": 7135
+ },
+ {
+ "epoch": 3.3739952718676123,
+ "grad_norm": 2.746434450149536,
+ "learning_rate": 2.04037371165108e-06,
+ "loss": 0.3513,
+ "step": 7136
+ },
+ {
+ "epoch": 3.374468085106383,
+ "grad_norm": 3.268731117248535,
+ "learning_rate": 2.0397605326111774e-06,
+ "loss": 0.3909,
+ "step": 7137
+ },
+ {
+ "epoch": 3.3749408983451534,
+ "grad_norm": 2.8498165607452393,
+ "learning_rate": 2.0391473822283692e-06,
+ "loss": 0.3657,
+ "step": 7138
+ },
+ {
+ "epoch": 3.3754137115839242,
+ "grad_norm": 2.855966567993164,
+ "learning_rate": 2.0385342605408325e-06,
+ "loss": 0.3927,
+ "step": 7139
+ },
+ {
+ "epoch": 3.375886524822695,
+ "grad_norm": 3.1839048862457275,
+ "learning_rate": 2.0379211675867438e-06,
+ "loss": 0.4476,
+ "step": 7140
+ },
+ {
+ "epoch": 3.376359338061466,
+ "grad_norm": 2.9379947185516357,
+ "learning_rate": 2.037308103404278e-06,
+ "loss": 0.3657,
+ "step": 7141
+ },
+ {
+ "epoch": 3.3768321513002366,
+ "grad_norm": 2.9251210689544678,
+ "learning_rate": 2.0366950680316073e-06,
+ "loss": 0.3975,
+ "step": 7142
+ },
+ {
+ "epoch": 3.377304964539007,
+ "grad_norm": 2.811885118484497,
+ "learning_rate": 2.036082061506904e-06,
+ "loss": 0.3064,
+ "step": 7143
+ },
+ {
+ "epoch": 3.3777777777777778,
+ "grad_norm": 2.755229949951172,
+ "learning_rate": 2.0354690838683363e-06,
+ "loss": 0.3328,
+ "step": 7144
+ },
+ {
+ "epoch": 3.3782505910165486,
+ "grad_norm": 3.006819725036621,
+ "learning_rate": 2.0348561351540706e-06,
+ "loss": 0.4168,
+ "step": 7145
+ },
+ {
+ "epoch": 3.378723404255319,
+ "grad_norm": 2.8788509368896484,
+ "learning_rate": 2.034243215402275e-06,
+ "loss": 0.4123,
+ "step": 7146
+ },
+ {
+ "epoch": 3.3791962174940897,
+ "grad_norm": 2.9732980728149414,
+ "learning_rate": 2.033630324651112e-06,
+ "loss": 0.3371,
+ "step": 7147
+ },
+ {
+ "epoch": 3.3796690307328605,
+ "grad_norm": 2.7731754779815674,
+ "learning_rate": 2.033017462938744e-06,
+ "loss": 0.382,
+ "step": 7148
+ },
+ {
+ "epoch": 3.3801418439716313,
+ "grad_norm": 2.766395092010498,
+ "learning_rate": 2.032404630303331e-06,
+ "loss": 0.3295,
+ "step": 7149
+ },
+ {
+ "epoch": 3.380614657210402,
+ "grad_norm": 3.197960138320923,
+ "learning_rate": 2.03179182678303e-06,
+ "loss": 0.354,
+ "step": 7150
+ },
+ {
+ "epoch": 3.3810874704491725,
+ "grad_norm": 3.048553228378296,
+ "learning_rate": 2.031179052416e-06,
+ "loss": 0.4027,
+ "step": 7151
+ },
+ {
+ "epoch": 3.3815602836879433,
+ "grad_norm": 3.1527998447418213,
+ "learning_rate": 2.0305663072403934e-06,
+ "loss": 0.4229,
+ "step": 7152
+ },
+ {
+ "epoch": 3.382033096926714,
+ "grad_norm": 3.0407028198242188,
+ "learning_rate": 2.029953591294366e-06,
+ "loss": 0.4254,
+ "step": 7153
+ },
+ {
+ "epoch": 3.3825059101654844,
+ "grad_norm": 2.7170357704162598,
+ "learning_rate": 2.0293409046160673e-06,
+ "loss": 0.3307,
+ "step": 7154
+ },
+ {
+ "epoch": 3.382978723404255,
+ "grad_norm": 3.0128726959228516,
+ "learning_rate": 2.028728247243646e-06,
+ "loss": 0.3873,
+ "step": 7155
+ },
+ {
+ "epoch": 3.383451536643026,
+ "grad_norm": 4.861877918243408,
+ "learning_rate": 2.0281156192152507e-06,
+ "loss": 0.4371,
+ "step": 7156
+ },
+ {
+ "epoch": 3.383924349881797,
+ "grad_norm": 2.890249252319336,
+ "learning_rate": 2.0275030205690257e-06,
+ "loss": 0.3899,
+ "step": 7157
+ },
+ {
+ "epoch": 3.3843971631205676,
+ "grad_norm": 3.0774779319763184,
+ "learning_rate": 2.026890451343117e-06,
+ "loss": 0.4151,
+ "step": 7158
+ },
+ {
+ "epoch": 3.384869976359338,
+ "grad_norm": 2.8705947399139404,
+ "learning_rate": 2.026277911575665e-06,
+ "loss": 0.4004,
+ "step": 7159
+ },
+ {
+ "epoch": 3.3853427895981087,
+ "grad_norm": 3.170760154724121,
+ "learning_rate": 2.0256654013048096e-06,
+ "loss": 0.4442,
+ "step": 7160
+ },
+ {
+ "epoch": 3.3858156028368795,
+ "grad_norm": 4.211156368255615,
+ "learning_rate": 2.0250529205686905e-06,
+ "loss": 0.4605,
+ "step": 7161
+ },
+ {
+ "epoch": 3.38628841607565,
+ "grad_norm": 2.513519287109375,
+ "learning_rate": 2.0244404694054435e-06,
+ "loss": 0.3506,
+ "step": 7162
+ },
+ {
+ "epoch": 3.3867612293144207,
+ "grad_norm": 3.1558821201324463,
+ "learning_rate": 2.023828047853203e-06,
+ "loss": 0.43,
+ "step": 7163
+ },
+ {
+ "epoch": 3.3872340425531915,
+ "grad_norm": 3.6770291328430176,
+ "learning_rate": 2.023215655950102e-06,
+ "loss": 0.3911,
+ "step": 7164
+ },
+ {
+ "epoch": 3.3877068557919623,
+ "grad_norm": 2.6544485092163086,
+ "learning_rate": 2.022603293734271e-06,
+ "loss": 0.3306,
+ "step": 7165
+ },
+ {
+ "epoch": 3.388179669030733,
+ "grad_norm": 3.34232759475708,
+ "learning_rate": 2.0219909612438405e-06,
+ "loss": 0.4233,
+ "step": 7166
+ },
+ {
+ "epoch": 3.3886524822695034,
+ "grad_norm": 3.388561725616455,
+ "learning_rate": 2.0213786585169363e-06,
+ "loss": 0.4171,
+ "step": 7167
+ },
+ {
+ "epoch": 3.3891252955082742,
+ "grad_norm": 2.8606953620910645,
+ "learning_rate": 2.020766385591684e-06,
+ "loss": 0.3864,
+ "step": 7168
+ },
+ {
+ "epoch": 3.389598108747045,
+ "grad_norm": 3.0135979652404785,
+ "learning_rate": 2.020154142506208e-06,
+ "loss": 0.3933,
+ "step": 7169
+ },
+ {
+ "epoch": 3.3900709219858154,
+ "grad_norm": 2.5003163814544678,
+ "learning_rate": 2.0195419292986294e-06,
+ "loss": 0.3852,
+ "step": 7170
+ },
+ {
+ "epoch": 3.390543735224586,
+ "grad_norm": 2.8591368198394775,
+ "learning_rate": 2.0189297460070685e-06,
+ "loss": 0.3962,
+ "step": 7171
+ },
+ {
+ "epoch": 3.391016548463357,
+ "grad_norm": 2.8830223083496094,
+ "learning_rate": 2.0183175926696427e-06,
+ "loss": 0.3632,
+ "step": 7172
+ },
+ {
+ "epoch": 3.391489361702128,
+ "grad_norm": 3.3904542922973633,
+ "learning_rate": 2.0177054693244674e-06,
+ "loss": 0.4284,
+ "step": 7173
+ },
+ {
+ "epoch": 3.3919621749408986,
+ "grad_norm": 3.0325920581817627,
+ "learning_rate": 2.0170933760096585e-06,
+ "loss": 0.4331,
+ "step": 7174
+ },
+ {
+ "epoch": 3.392434988179669,
+ "grad_norm": 2.60345196723938,
+ "learning_rate": 2.016481312763327e-06,
+ "loss": 0.4077,
+ "step": 7175
+ },
+ {
+ "epoch": 3.3929078014184397,
+ "grad_norm": 2.8146891593933105,
+ "learning_rate": 2.0158692796235845e-06,
+ "loss": 0.4224,
+ "step": 7176
+ },
+ {
+ "epoch": 3.3933806146572105,
+ "grad_norm": 2.8158490657806396,
+ "learning_rate": 2.0152572766285396e-06,
+ "loss": 0.3454,
+ "step": 7177
+ },
+ {
+ "epoch": 3.393853427895981,
+ "grad_norm": 3.2753400802612305,
+ "learning_rate": 2.0146453038162978e-06,
+ "loss": 0.3615,
+ "step": 7178
+ },
+ {
+ "epoch": 3.3943262411347517,
+ "grad_norm": 3.0527124404907227,
+ "learning_rate": 2.0140333612249655e-06,
+ "loss": 0.415,
+ "step": 7179
+ },
+ {
+ "epoch": 3.3947990543735225,
+ "grad_norm": 2.6813764572143555,
+ "learning_rate": 2.0134214488926435e-06,
+ "loss": 0.3391,
+ "step": 7180
+ },
+ {
+ "epoch": 3.3952718676122933,
+ "grad_norm": 2.809319496154785,
+ "learning_rate": 2.0128095668574356e-06,
+ "loss": 0.3123,
+ "step": 7181
+ },
+ {
+ "epoch": 3.395744680851064,
+ "grad_norm": 2.6619064807891846,
+ "learning_rate": 2.0121977151574396e-06,
+ "loss": 0.4222,
+ "step": 7182
+ },
+ {
+ "epoch": 3.3962174940898344,
+ "grad_norm": 2.9201200008392334,
+ "learning_rate": 2.0115858938307516e-06,
+ "loss": 0.3712,
+ "step": 7183
+ },
+ {
+ "epoch": 3.396690307328605,
+ "grad_norm": 3.2058637142181396,
+ "learning_rate": 2.0109741029154696e-06,
+ "loss": 0.4004,
+ "step": 7184
+ },
+ {
+ "epoch": 3.397163120567376,
+ "grad_norm": 2.821855306625366,
+ "learning_rate": 2.0103623424496862e-06,
+ "loss": 0.4053,
+ "step": 7185
+ },
+ {
+ "epoch": 3.3976359338061464,
+ "grad_norm": 3.0371549129486084,
+ "learning_rate": 2.009750612471492e-06,
+ "loss": 0.4246,
+ "step": 7186
+ },
+ {
+ "epoch": 3.398108747044917,
+ "grad_norm": 2.8827290534973145,
+ "learning_rate": 2.009138913018978e-06,
+ "loss": 0.3256,
+ "step": 7187
+ },
+ {
+ "epoch": 3.398581560283688,
+ "grad_norm": 3.168039560317993,
+ "learning_rate": 2.0085272441302305e-06,
+ "loss": 0.4233,
+ "step": 7188
+ },
+ {
+ "epoch": 3.3990543735224588,
+ "grad_norm": 3.259723663330078,
+ "learning_rate": 2.0079156058433374e-06,
+ "loss": 0.4168,
+ "step": 7189
+ },
+ {
+ "epoch": 3.3995271867612296,
+ "grad_norm": 2.456231117248535,
+ "learning_rate": 2.007303998196382e-06,
+ "loss": 0.3383,
+ "step": 7190
+ },
+ {
+ "epoch": 3.4,
+ "grad_norm": 2.735180377960205,
+ "learning_rate": 2.006692421227445e-06,
+ "loss": 0.3475,
+ "step": 7191
+ },
+ {
+ "epoch": 3.4004728132387707,
+ "grad_norm": 2.76263427734375,
+ "learning_rate": 2.006080874974609e-06,
+ "loss": 0.3651,
+ "step": 7192
+ },
+ {
+ "epoch": 3.4009456264775415,
+ "grad_norm": 3.36867094039917,
+ "learning_rate": 2.0054693594759504e-06,
+ "loss": 0.4479,
+ "step": 7193
+ },
+ {
+ "epoch": 3.401418439716312,
+ "grad_norm": 2.532167673110962,
+ "learning_rate": 2.004857874769547e-06,
+ "loss": 0.3818,
+ "step": 7194
+ },
+ {
+ "epoch": 3.4018912529550827,
+ "grad_norm": 2.8723537921905518,
+ "learning_rate": 2.0042464208934724e-06,
+ "loss": 0.3332,
+ "step": 7195
+ },
+ {
+ "epoch": 3.4023640661938535,
+ "grad_norm": 2.676460027694702,
+ "learning_rate": 2.0036349978857987e-06,
+ "loss": 0.3488,
+ "step": 7196
+ },
+ {
+ "epoch": 3.4028368794326243,
+ "grad_norm": 2.805851459503174,
+ "learning_rate": 2.0030236057845983e-06,
+ "loss": 0.3796,
+ "step": 7197
+ },
+ {
+ "epoch": 3.403309692671395,
+ "grad_norm": 2.688988447189331,
+ "learning_rate": 2.0024122446279377e-06,
+ "loss": 0.3707,
+ "step": 7198
+ },
+ {
+ "epoch": 3.4037825059101654,
+ "grad_norm": 3.118720293045044,
+ "learning_rate": 2.0018009144538853e-06,
+ "loss": 0.4064,
+ "step": 7199
+ },
+ {
+ "epoch": 3.404255319148936,
+ "grad_norm": 2.876507520675659,
+ "learning_rate": 2.001189615300506e-06,
+ "loss": 0.3543,
+ "step": 7200
+ },
+ {
+ "epoch": 3.404728132387707,
+ "grad_norm": 3.0043466091156006,
+ "learning_rate": 2.000578347205861e-06,
+ "loss": 0.3833,
+ "step": 7201
+ },
+ {
+ "epoch": 3.4052009456264773,
+ "grad_norm": 3.1057114601135254,
+ "learning_rate": 1.9999671102080133e-06,
+ "loss": 0.4154,
+ "step": 7202
+ },
+ {
+ "epoch": 3.405673758865248,
+ "grad_norm": 2.9791855812072754,
+ "learning_rate": 1.9993559043450202e-06,
+ "loss": 0.3865,
+ "step": 7203
+ },
+ {
+ "epoch": 3.406146572104019,
+ "grad_norm": 3.4403460025787354,
+ "learning_rate": 1.9987447296549407e-06,
+ "loss": 0.3883,
+ "step": 7204
+ },
+ {
+ "epoch": 3.4066193853427897,
+ "grad_norm": 2.9962027072906494,
+ "learning_rate": 1.998133586175829e-06,
+ "loss": 0.3796,
+ "step": 7205
+ },
+ {
+ "epoch": 3.40709219858156,
+ "grad_norm": 3.0613129138946533,
+ "learning_rate": 1.997522473945737e-06,
+ "loss": 0.3917,
+ "step": 7206
+ },
+ {
+ "epoch": 3.407565011820331,
+ "grad_norm": 3.065985679626465,
+ "learning_rate": 1.996911393002718e-06,
+ "loss": 0.3521,
+ "step": 7207
+ },
+ {
+ "epoch": 3.4080378250591017,
+ "grad_norm": 2.976177930831909,
+ "learning_rate": 1.996300343384821e-06,
+ "loss": 0.3852,
+ "step": 7208
+ },
+ {
+ "epoch": 3.4085106382978725,
+ "grad_norm": 3.3587961196899414,
+ "learning_rate": 1.995689325130092e-06,
+ "loss": 0.3947,
+ "step": 7209
+ },
+ {
+ "epoch": 3.408983451536643,
+ "grad_norm": 2.626983165740967,
+ "learning_rate": 1.995078338276578e-06,
+ "loss": 0.316,
+ "step": 7210
+ },
+ {
+ "epoch": 3.4094562647754136,
+ "grad_norm": 3.14713978767395,
+ "learning_rate": 1.9944673828623217e-06,
+ "loss": 0.4008,
+ "step": 7211
+ },
+ {
+ "epoch": 3.4099290780141844,
+ "grad_norm": 2.968918800354004,
+ "learning_rate": 1.993856458925365e-06,
+ "loss": 0.439,
+ "step": 7212
+ },
+ {
+ "epoch": 3.4104018912529552,
+ "grad_norm": 2.7724127769470215,
+ "learning_rate": 1.9932455665037476e-06,
+ "loss": 0.3941,
+ "step": 7213
+ },
+ {
+ "epoch": 3.4108747044917256,
+ "grad_norm": 2.963146448135376,
+ "learning_rate": 1.9926347056355057e-06,
+ "loss": 0.3893,
+ "step": 7214
+ },
+ {
+ "epoch": 3.4113475177304964,
+ "grad_norm": 2.791637420654297,
+ "learning_rate": 1.9920238763586765e-06,
+ "loss": 0.4068,
+ "step": 7215
+ },
+ {
+ "epoch": 3.411820330969267,
+ "grad_norm": 3.030275583267212,
+ "learning_rate": 1.9914130787112924e-06,
+ "loss": 0.3828,
+ "step": 7216
+ },
+ {
+ "epoch": 3.412293144208038,
+ "grad_norm": 3.113128900527954,
+ "learning_rate": 1.990802312731387e-06,
+ "loss": 0.3903,
+ "step": 7217
+ },
+ {
+ "epoch": 3.4127659574468083,
+ "grad_norm": 3.104170322418213,
+ "learning_rate": 1.9901915784569884e-06,
+ "loss": 0.4171,
+ "step": 7218
+ },
+ {
+ "epoch": 3.413238770685579,
+ "grad_norm": 3.1247572898864746,
+ "learning_rate": 1.989580875926125e-06,
+ "loss": 0.4022,
+ "step": 7219
+ },
+ {
+ "epoch": 3.41371158392435,
+ "grad_norm": 2.9487457275390625,
+ "learning_rate": 1.988970205176822e-06,
+ "loss": 0.3948,
+ "step": 7220
+ },
+ {
+ "epoch": 3.4141843971631207,
+ "grad_norm": 2.8763654232025146,
+ "learning_rate": 1.9883595662471028e-06,
+ "loss": 0.3588,
+ "step": 7221
+ },
+ {
+ "epoch": 3.414657210401891,
+ "grad_norm": 2.563152551651001,
+ "learning_rate": 1.987748959174991e-06,
+ "loss": 0.3509,
+ "step": 7222
+ },
+ {
+ "epoch": 3.415130023640662,
+ "grad_norm": 3.148759365081787,
+ "learning_rate": 1.9871383839985053e-06,
+ "loss": 0.4364,
+ "step": 7223
+ },
+ {
+ "epoch": 3.4156028368794327,
+ "grad_norm": 2.8187363147735596,
+ "learning_rate": 1.986527840755663e-06,
+ "loss": 0.3803,
+ "step": 7224
+ },
+ {
+ "epoch": 3.4160756501182035,
+ "grad_norm": 3.009376287460327,
+ "learning_rate": 1.985917329484481e-06,
+ "loss": 0.3841,
+ "step": 7225
+ },
+ {
+ "epoch": 3.416548463356974,
+ "grad_norm": 2.869291067123413,
+ "learning_rate": 1.985306850222972e-06,
+ "loss": 0.3877,
+ "step": 7226
+ },
+ {
+ "epoch": 3.4170212765957446,
+ "grad_norm": 3.108461856842041,
+ "learning_rate": 1.9846964030091497e-06,
+ "loss": 0.3767,
+ "step": 7227
+ },
+ {
+ "epoch": 3.4174940898345154,
+ "grad_norm": 3.096320629119873,
+ "learning_rate": 1.9840859878810226e-06,
+ "loss": 0.4603,
+ "step": 7228
+ },
+ {
+ "epoch": 3.417966903073286,
+ "grad_norm": 2.8519909381866455,
+ "learning_rate": 1.983475604876598e-06,
+ "loss": 0.3263,
+ "step": 7229
+ },
+ {
+ "epoch": 3.4184397163120566,
+ "grad_norm": 3.192051410675049,
+ "learning_rate": 1.9828652540338835e-06,
+ "loss": 0.4132,
+ "step": 7230
+ },
+ {
+ "epoch": 3.4189125295508274,
+ "grad_norm": 3.0398056507110596,
+ "learning_rate": 1.9822549353908817e-06,
+ "loss": 0.4038,
+ "step": 7231
+ },
+ {
+ "epoch": 3.419385342789598,
+ "grad_norm": 3.12247896194458,
+ "learning_rate": 1.9816446489855944e-06,
+ "loss": 0.409,
+ "step": 7232
+ },
+ {
+ "epoch": 3.419858156028369,
+ "grad_norm": 3.20316481590271,
+ "learning_rate": 1.9810343948560223e-06,
+ "loss": 0.4058,
+ "step": 7233
+ },
+ {
+ "epoch": 3.4203309692671393,
+ "grad_norm": 3.3397457599639893,
+ "learning_rate": 1.9804241730401625e-06,
+ "loss": 0.3657,
+ "step": 7234
+ },
+ {
+ "epoch": 3.42080378250591,
+ "grad_norm": 3.928691864013672,
+ "learning_rate": 1.979813983576012e-06,
+ "loss": 0.361,
+ "step": 7235
+ },
+ {
+ "epoch": 3.421276595744681,
+ "grad_norm": 3.5814051628112793,
+ "learning_rate": 1.9792038265015635e-06,
+ "loss": 0.3975,
+ "step": 7236
+ },
+ {
+ "epoch": 3.4217494089834517,
+ "grad_norm": 2.8578879833221436,
+ "learning_rate": 1.9785937018548086e-06,
+ "loss": 0.3915,
+ "step": 7237
+ },
+ {
+ "epoch": 3.422222222222222,
+ "grad_norm": 3.0343220233917236,
+ "learning_rate": 1.977983609673738e-06,
+ "loss": 0.3686,
+ "step": 7238
+ },
+ {
+ "epoch": 3.422695035460993,
+ "grad_norm": 3.2719056606292725,
+ "learning_rate": 1.977373549996338e-06,
+ "loss": 0.3905,
+ "step": 7239
+ },
+ {
+ "epoch": 3.4231678486997636,
+ "grad_norm": 2.6638169288635254,
+ "learning_rate": 1.976763522860597e-06,
+ "loss": 0.3631,
+ "step": 7240
+ },
+ {
+ "epoch": 3.4236406619385344,
+ "grad_norm": 2.7679927349090576,
+ "learning_rate": 1.9761535283044967e-06,
+ "loss": 0.377,
+ "step": 7241
+ },
+ {
+ "epoch": 3.424113475177305,
+ "grad_norm": 2.774540424346924,
+ "learning_rate": 1.975543566366019e-06,
+ "loss": 0.3509,
+ "step": 7242
+ },
+ {
+ "epoch": 3.4245862884160756,
+ "grad_norm": 2.811659336090088,
+ "learning_rate": 1.9749336370831438e-06,
+ "loss": 0.3835,
+ "step": 7243
+ },
+ {
+ "epoch": 3.4250591016548464,
+ "grad_norm": 2.8533360958099365,
+ "learning_rate": 1.9743237404938478e-06,
+ "loss": 0.3765,
+ "step": 7244
+ },
+ {
+ "epoch": 3.425531914893617,
+ "grad_norm": 2.712301015853882,
+ "learning_rate": 1.9737138766361084e-06,
+ "loss": 0.3797,
+ "step": 7245
+ },
+ {
+ "epoch": 3.4260047281323875,
+ "grad_norm": 2.9763426780700684,
+ "learning_rate": 1.9731040455478986e-06,
+ "loss": 0.4223,
+ "step": 7246
+ },
+ {
+ "epoch": 3.4264775413711583,
+ "grad_norm": 2.8802297115325928,
+ "learning_rate": 1.9724942472671882e-06,
+ "loss": 0.3666,
+ "step": 7247
+ },
+ {
+ "epoch": 3.426950354609929,
+ "grad_norm": 2.934107542037964,
+ "learning_rate": 1.9718844818319486e-06,
+ "loss": 0.3612,
+ "step": 7248
+ },
+ {
+ "epoch": 3.4274231678487,
+ "grad_norm": 3.0172696113586426,
+ "learning_rate": 1.9712747492801467e-06,
+ "loss": 0.3643,
+ "step": 7249
+ },
+ {
+ "epoch": 3.4278959810874703,
+ "grad_norm": 3.368419647216797,
+ "learning_rate": 1.970665049649748e-06,
+ "loss": 0.4511,
+ "step": 7250
+ },
+ {
+ "epoch": 3.428368794326241,
+ "grad_norm": 3.077819585800171,
+ "learning_rate": 1.9700553829787162e-06,
+ "loss": 0.4013,
+ "step": 7251
+ },
+ {
+ "epoch": 3.428841607565012,
+ "grad_norm": 2.690673828125,
+ "learning_rate": 1.96944574930501e-06,
+ "loss": 0.3776,
+ "step": 7252
+ },
+ {
+ "epoch": 3.4293144208037827,
+ "grad_norm": 3.1122169494628906,
+ "learning_rate": 1.9688361486665924e-06,
+ "loss": 0.3802,
+ "step": 7253
+ },
+ {
+ "epoch": 3.429787234042553,
+ "grad_norm": 2.9874207973480225,
+ "learning_rate": 1.968226581101417e-06,
+ "loss": 0.4492,
+ "step": 7254
+ },
+ {
+ "epoch": 3.430260047281324,
+ "grad_norm": 2.885493278503418,
+ "learning_rate": 1.967617046647442e-06,
+ "loss": 0.3958,
+ "step": 7255
+ },
+ {
+ "epoch": 3.4307328605200946,
+ "grad_norm": 2.953897476196289,
+ "learning_rate": 1.9670075453426195e-06,
+ "loss": 0.3973,
+ "step": 7256
+ },
+ {
+ "epoch": 3.4312056737588654,
+ "grad_norm": 2.685088634490967,
+ "learning_rate": 1.966398077224899e-06,
+ "loss": 0.393,
+ "step": 7257
+ },
+ {
+ "epoch": 3.431678486997636,
+ "grad_norm": 4.035208702087402,
+ "learning_rate": 1.9657886423322313e-06,
+ "loss": 0.4263,
+ "step": 7258
+ },
+ {
+ "epoch": 3.4321513002364066,
+ "grad_norm": 2.942042827606201,
+ "learning_rate": 1.965179240702562e-06,
+ "loss": 0.4319,
+ "step": 7259
+ },
+ {
+ "epoch": 3.4326241134751774,
+ "grad_norm": 3.0794999599456787,
+ "learning_rate": 1.9645698723738356e-06,
+ "loss": 0.4199,
+ "step": 7260
+ },
+ {
+ "epoch": 3.433096926713948,
+ "grad_norm": 3.0653584003448486,
+ "learning_rate": 1.963960537383996e-06,
+ "loss": 0.3723,
+ "step": 7261
+ },
+ {
+ "epoch": 3.4335697399527185,
+ "grad_norm": 3.1571545600891113,
+ "learning_rate": 1.963351235770983e-06,
+ "loss": 0.4211,
+ "step": 7262
+ },
+ {
+ "epoch": 3.4340425531914893,
+ "grad_norm": 2.6681735515594482,
+ "learning_rate": 1.962741967572736e-06,
+ "loss": 0.3333,
+ "step": 7263
+ },
+ {
+ "epoch": 3.43451536643026,
+ "grad_norm": 2.9747934341430664,
+ "learning_rate": 1.9621327328271907e-06,
+ "loss": 0.3896,
+ "step": 7264
+ },
+ {
+ "epoch": 3.434988179669031,
+ "grad_norm": 2.7994508743286133,
+ "learning_rate": 1.9615235315722814e-06,
+ "loss": 0.3642,
+ "step": 7265
+ },
+ {
+ "epoch": 3.4354609929078013,
+ "grad_norm": 2.933928966522217,
+ "learning_rate": 1.9609143638459405e-06,
+ "loss": 0.3955,
+ "step": 7266
+ },
+ {
+ "epoch": 3.435933806146572,
+ "grad_norm": 2.9577367305755615,
+ "learning_rate": 1.9603052296860983e-06,
+ "loss": 0.3437,
+ "step": 7267
+ },
+ {
+ "epoch": 3.436406619385343,
+ "grad_norm": 3.017282009124756,
+ "learning_rate": 1.959696129130684e-06,
+ "loss": 0.3784,
+ "step": 7268
+ },
+ {
+ "epoch": 3.4368794326241137,
+ "grad_norm": 3.2072815895080566,
+ "learning_rate": 1.959087062217622e-06,
+ "loss": 0.3901,
+ "step": 7269
+ },
+ {
+ "epoch": 3.437352245862884,
+ "grad_norm": 2.91153621673584,
+ "learning_rate": 1.9584780289848358e-06,
+ "loss": 0.4402,
+ "step": 7270
+ },
+ {
+ "epoch": 3.437825059101655,
+ "grad_norm": 2.846842050552368,
+ "learning_rate": 1.9578690294702495e-06,
+ "loss": 0.3804,
+ "step": 7271
+ },
+ {
+ "epoch": 3.4382978723404256,
+ "grad_norm": 3.0958521366119385,
+ "learning_rate": 1.957260063711781e-06,
+ "loss": 0.4103,
+ "step": 7272
+ },
+ {
+ "epoch": 3.4387706855791964,
+ "grad_norm": 2.9808530807495117,
+ "learning_rate": 1.9566511317473483e-06,
+ "loss": 0.4127,
+ "step": 7273
+ },
+ {
+ "epoch": 3.4392434988179668,
+ "grad_norm": 2.725851058959961,
+ "learning_rate": 1.9560422336148678e-06,
+ "loss": 0.3493,
+ "step": 7274
+ },
+ {
+ "epoch": 3.4397163120567376,
+ "grad_norm": 2.7861814498901367,
+ "learning_rate": 1.9554333693522515e-06,
+ "loss": 0.3703,
+ "step": 7275
+ },
+ {
+ "epoch": 3.4401891252955084,
+ "grad_norm": 3.128708839416504,
+ "learning_rate": 1.954824538997412e-06,
+ "loss": 0.3917,
+ "step": 7276
+ },
+ {
+ "epoch": 3.440661938534279,
+ "grad_norm": 3.117403268814087,
+ "learning_rate": 1.954215742588257e-06,
+ "loss": 0.3581,
+ "step": 7277
+ },
+ {
+ "epoch": 3.4411347517730495,
+ "grad_norm": 2.710076093673706,
+ "learning_rate": 1.9536069801626957e-06,
+ "loss": 0.3255,
+ "step": 7278
+ },
+ {
+ "epoch": 3.4416075650118203,
+ "grad_norm": 2.7732627391815186,
+ "learning_rate": 1.952998251758632e-06,
+ "loss": 0.375,
+ "step": 7279
+ },
+ {
+ "epoch": 3.442080378250591,
+ "grad_norm": 2.896050453186035,
+ "learning_rate": 1.9523895574139673e-06,
+ "loss": 0.4087,
+ "step": 7280
+ },
+ {
+ "epoch": 3.4425531914893615,
+ "grad_norm": 2.9051663875579834,
+ "learning_rate": 1.9517808971666048e-06,
+ "loss": 0.3423,
+ "step": 7281
+ },
+ {
+ "epoch": 3.4430260047281322,
+ "grad_norm": 3.0232038497924805,
+ "learning_rate": 1.9511722710544417e-06,
+ "loss": 0.364,
+ "step": 7282
+ },
+ {
+ "epoch": 3.443498817966903,
+ "grad_norm": 2.753870725631714,
+ "learning_rate": 1.9505636791153744e-06,
+ "loss": 0.3484,
+ "step": 7283
+ },
+ {
+ "epoch": 3.443971631205674,
+ "grad_norm": 2.944079637527466,
+ "learning_rate": 1.9499551213872983e-06,
+ "loss": 0.3354,
+ "step": 7284
+ },
+ {
+ "epoch": 3.4444444444444446,
+ "grad_norm": 3.1531970500946045,
+ "learning_rate": 1.949346597908104e-06,
+ "loss": 0.3394,
+ "step": 7285
+ },
+ {
+ "epoch": 3.444917257683215,
+ "grad_norm": 3.0357189178466797,
+ "learning_rate": 1.948738108715683e-06,
+ "loss": 0.4302,
+ "step": 7286
+ },
+ {
+ "epoch": 3.445390070921986,
+ "grad_norm": 3.3698086738586426,
+ "learning_rate": 1.948129653847923e-06,
+ "loss": 0.419,
+ "step": 7287
+ },
+ {
+ "epoch": 3.4458628841607566,
+ "grad_norm": 3.343132495880127,
+ "learning_rate": 1.947521233342709e-06,
+ "loss": 0.3895,
+ "step": 7288
+ },
+ {
+ "epoch": 3.446335697399527,
+ "grad_norm": 3.1905252933502197,
+ "learning_rate": 1.9469128472379257e-06,
+ "loss": 0.429,
+ "step": 7289
+ },
+ {
+ "epoch": 3.4468085106382977,
+ "grad_norm": 2.8517212867736816,
+ "learning_rate": 1.946304495571454e-06,
+ "loss": 0.3513,
+ "step": 7290
+ },
+ {
+ "epoch": 3.4472813238770685,
+ "grad_norm": 2.7713496685028076,
+ "learning_rate": 1.9456961783811735e-06,
+ "loss": 0.4331,
+ "step": 7291
+ },
+ {
+ "epoch": 3.4477541371158393,
+ "grad_norm": 2.8258652687072754,
+ "learning_rate": 1.945087895704962e-06,
+ "loss": 0.3539,
+ "step": 7292
+ },
+ {
+ "epoch": 3.44822695035461,
+ "grad_norm": 2.757322072982788,
+ "learning_rate": 1.9444796475806925e-06,
+ "loss": 0.3865,
+ "step": 7293
+ },
+ {
+ "epoch": 3.4486997635933805,
+ "grad_norm": 2.8410696983337402,
+ "learning_rate": 1.943871434046241e-06,
+ "loss": 0.3612,
+ "step": 7294
+ },
+ {
+ "epoch": 3.4491725768321513,
+ "grad_norm": 3.2297637462615967,
+ "learning_rate": 1.9432632551394753e-06,
+ "loss": 0.3956,
+ "step": 7295
+ },
+ {
+ "epoch": 3.449645390070922,
+ "grad_norm": 2.991351842880249,
+ "learning_rate": 1.9426551108982666e-06,
+ "loss": 0.3864,
+ "step": 7296
+ },
+ {
+ "epoch": 3.4501182033096924,
+ "grad_norm": 2.7942168712615967,
+ "learning_rate": 1.94204700136048e-06,
+ "loss": 0.4314,
+ "step": 7297
+ },
+ {
+ "epoch": 3.4505910165484632,
+ "grad_norm": 2.8188698291778564,
+ "learning_rate": 1.9414389265639805e-06,
+ "loss": 0.3585,
+ "step": 7298
+ },
+ {
+ "epoch": 3.451063829787234,
+ "grad_norm": 3.2826895713806152,
+ "learning_rate": 1.9408308865466295e-06,
+ "loss": 0.4614,
+ "step": 7299
+ },
+ {
+ "epoch": 3.451536643026005,
+ "grad_norm": 3.273867130279541,
+ "learning_rate": 1.9402228813462865e-06,
+ "loss": 0.3533,
+ "step": 7300
+ },
+ {
+ "epoch": 3.4520094562647756,
+ "grad_norm": 3.5334157943725586,
+ "learning_rate": 1.939614911000811e-06,
+ "loss": 0.4088,
+ "step": 7301
+ },
+ {
+ "epoch": 3.452482269503546,
+ "grad_norm": 2.983908176422119,
+ "learning_rate": 1.9390069755480583e-06,
+ "loss": 0.3725,
+ "step": 7302
+ },
+ {
+ "epoch": 3.4529550827423168,
+ "grad_norm": 2.893660306930542,
+ "learning_rate": 1.93839907502588e-06,
+ "loss": 0.3746,
+ "step": 7303
+ },
+ {
+ "epoch": 3.4534278959810876,
+ "grad_norm": 3.1762871742248535,
+ "learning_rate": 1.9377912094721295e-06,
+ "loss": 0.446,
+ "step": 7304
+ },
+ {
+ "epoch": 3.453900709219858,
+ "grad_norm": 3.3231537342071533,
+ "learning_rate": 1.9371833789246554e-06,
+ "loss": 0.4837,
+ "step": 7305
+ },
+ {
+ "epoch": 3.4543735224586287,
+ "grad_norm": 3.548333168029785,
+ "learning_rate": 1.936575583421304e-06,
+ "loss": 0.3911,
+ "step": 7306
+ },
+ {
+ "epoch": 3.4548463356973995,
+ "grad_norm": 3.0627071857452393,
+ "learning_rate": 1.9359678229999213e-06,
+ "loss": 0.3751,
+ "step": 7307
+ },
+ {
+ "epoch": 3.4553191489361703,
+ "grad_norm": 2.797663927078247,
+ "learning_rate": 1.9353600976983475e-06,
+ "loss": 0.41,
+ "step": 7308
+ },
+ {
+ "epoch": 3.455791962174941,
+ "grad_norm": 2.803269624710083,
+ "learning_rate": 1.9347524075544258e-06,
+ "loss": 0.3775,
+ "step": 7309
+ },
+ {
+ "epoch": 3.4562647754137115,
+ "grad_norm": 2.828010320663452,
+ "learning_rate": 1.934144752605993e-06,
+ "loss": 0.375,
+ "step": 7310
+ },
+ {
+ "epoch": 3.4567375886524823,
+ "grad_norm": 3.456477165222168,
+ "learning_rate": 1.933537132890884e-06,
+ "loss": 0.4764,
+ "step": 7311
+ },
+ {
+ "epoch": 3.457210401891253,
+ "grad_norm": 2.723670244216919,
+ "learning_rate": 1.9329295484469354e-06,
+ "loss": 0.3581,
+ "step": 7312
+ },
+ {
+ "epoch": 3.4576832151300234,
+ "grad_norm": 3.9723474979400635,
+ "learning_rate": 1.9323219993119766e-06,
+ "loss": 0.3951,
+ "step": 7313
+ },
+ {
+ "epoch": 3.458156028368794,
+ "grad_norm": 2.951300859451294,
+ "learning_rate": 1.931714485523838e-06,
+ "loss": 0.3865,
+ "step": 7314
+ },
+ {
+ "epoch": 3.458628841607565,
+ "grad_norm": 2.9265835285186768,
+ "learning_rate": 1.931107007120347e-06,
+ "loss": 0.3731,
+ "step": 7315
+ },
+ {
+ "epoch": 3.459101654846336,
+ "grad_norm": 3.271883249282837,
+ "learning_rate": 1.930499564139327e-06,
+ "loss": 0.3971,
+ "step": 7316
+ },
+ {
+ "epoch": 3.4595744680851066,
+ "grad_norm": 2.8716280460357666,
+ "learning_rate": 1.929892156618603e-06,
+ "loss": 0.3332,
+ "step": 7317
+ },
+ {
+ "epoch": 3.460047281323877,
+ "grad_norm": 2.9820191860198975,
+ "learning_rate": 1.929284784595993e-06,
+ "loss": 0.3907,
+ "step": 7318
+ },
+ {
+ "epoch": 3.4605200945626478,
+ "grad_norm": 3.313225269317627,
+ "learning_rate": 1.9286774481093183e-06,
+ "loss": 0.3678,
+ "step": 7319
+ },
+ {
+ "epoch": 3.4609929078014185,
+ "grad_norm": 3.365387439727783,
+ "learning_rate": 1.928070147196394e-06,
+ "loss": 0.4894,
+ "step": 7320
+ },
+ {
+ "epoch": 3.461465721040189,
+ "grad_norm": 3.1723599433898926,
+ "learning_rate": 1.927462881895033e-06,
+ "loss": 0.4607,
+ "step": 7321
+ },
+ {
+ "epoch": 3.4619385342789597,
+ "grad_norm": 2.7644999027252197,
+ "learning_rate": 1.9268556522430483e-06,
+ "loss": 0.3627,
+ "step": 7322
+ },
+ {
+ "epoch": 3.4624113475177305,
+ "grad_norm": 2.65572190284729,
+ "learning_rate": 1.9262484582782483e-06,
+ "loss": 0.3893,
+ "step": 7323
+ },
+ {
+ "epoch": 3.4628841607565013,
+ "grad_norm": 2.992037773132324,
+ "learning_rate": 1.9256413000384415e-06,
+ "loss": 0.4175,
+ "step": 7324
+ },
+ {
+ "epoch": 3.463356973995272,
+ "grad_norm": 3.020496368408203,
+ "learning_rate": 1.925034177561433e-06,
+ "loss": 0.42,
+ "step": 7325
+ },
+ {
+ "epoch": 3.4638297872340424,
+ "grad_norm": 2.780334234237671,
+ "learning_rate": 1.9244270908850236e-06,
+ "loss": 0.4195,
+ "step": 7326
+ },
+ {
+ "epoch": 3.4643026004728132,
+ "grad_norm": 2.863028049468994,
+ "learning_rate": 1.9238200400470166e-06,
+ "loss": 0.3706,
+ "step": 7327
+ },
+ {
+ "epoch": 3.464775413711584,
+ "grad_norm": 3.2766900062561035,
+ "learning_rate": 1.923213025085209e-06,
+ "loss": 0.4506,
+ "step": 7328
+ },
+ {
+ "epoch": 3.4652482269503544,
+ "grad_norm": 2.7300634384155273,
+ "learning_rate": 1.9226060460373975e-06,
+ "loss": 0.3463,
+ "step": 7329
+ },
+ {
+ "epoch": 3.465721040189125,
+ "grad_norm": 3.136104106903076,
+ "learning_rate": 1.921999102941376e-06,
+ "loss": 0.3839,
+ "step": 7330
+ },
+ {
+ "epoch": 3.466193853427896,
+ "grad_norm": 2.944932699203491,
+ "learning_rate": 1.921392195834934e-06,
+ "loss": 0.432,
+ "step": 7331
+ },
+ {
+ "epoch": 3.466666666666667,
+ "grad_norm": 3.428375005722046,
+ "learning_rate": 1.9207853247558647e-06,
+ "loss": 0.3407,
+ "step": 7332
+ },
+ {
+ "epoch": 3.4671394799054376,
+ "grad_norm": 3.3732450008392334,
+ "learning_rate": 1.9201784897419535e-06,
+ "loss": 0.361,
+ "step": 7333
+ },
+ {
+ "epoch": 3.467612293144208,
+ "grad_norm": 2.8291900157928467,
+ "learning_rate": 1.9195716908309836e-06,
+ "loss": 0.3805,
+ "step": 7334
+ },
+ {
+ "epoch": 3.4680851063829787,
+ "grad_norm": 3.3229610919952393,
+ "learning_rate": 1.9189649280607407e-06,
+ "loss": 0.3756,
+ "step": 7335
+ },
+ {
+ "epoch": 3.4685579196217495,
+ "grad_norm": 2.949416160583496,
+ "learning_rate": 1.918358201469004e-06,
+ "loss": 0.4316,
+ "step": 7336
+ },
+ {
+ "epoch": 3.46903073286052,
+ "grad_norm": 3.525501251220703,
+ "learning_rate": 1.9177515110935515e-06,
+ "loss": 0.4018,
+ "step": 7337
+ },
+ {
+ "epoch": 3.4695035460992907,
+ "grad_norm": 3.1439104080200195,
+ "learning_rate": 1.917144856972159e-06,
+ "loss": 0.4176,
+ "step": 7338
+ },
+ {
+ "epoch": 3.4699763593380615,
+ "grad_norm": 3.0022377967834473,
+ "learning_rate": 1.9165382391426006e-06,
+ "loss": 0.3962,
+ "step": 7339
+ },
+ {
+ "epoch": 3.4704491725768323,
+ "grad_norm": 3.2174794673919678,
+ "learning_rate": 1.9159316576426482e-06,
+ "loss": 0.441,
+ "step": 7340
+ },
+ {
+ "epoch": 3.470921985815603,
+ "grad_norm": 2.965123414993286,
+ "learning_rate": 1.9153251125100694e-06,
+ "loss": 0.4105,
+ "step": 7341
+ },
+ {
+ "epoch": 3.4713947990543734,
+ "grad_norm": 2.722904920578003,
+ "learning_rate": 1.9147186037826333e-06,
+ "loss": 0.4102,
+ "step": 7342
+ },
+ {
+ "epoch": 3.4718676122931442,
+ "grad_norm": 3.4894051551818848,
+ "learning_rate": 1.9141121314981033e-06,
+ "loss": 0.4225,
+ "step": 7343
+ },
+ {
+ "epoch": 3.472340425531915,
+ "grad_norm": 2.828497886657715,
+ "learning_rate": 1.913505695694241e-06,
+ "loss": 0.374,
+ "step": 7344
+ },
+ {
+ "epoch": 3.4728132387706854,
+ "grad_norm": 3.3046014308929443,
+ "learning_rate": 1.9128992964088077e-06,
+ "loss": 0.3568,
+ "step": 7345
+ },
+ {
+ "epoch": 3.473286052009456,
+ "grad_norm": 2.927281618118286,
+ "learning_rate": 1.9122929336795605e-06,
+ "loss": 0.4308,
+ "step": 7346
+ },
+ {
+ "epoch": 3.473758865248227,
+ "grad_norm": 2.9569990634918213,
+ "learning_rate": 1.911686607544256e-06,
+ "loss": 0.3226,
+ "step": 7347
+ },
+ {
+ "epoch": 3.4742316784869978,
+ "grad_norm": 3.1061038970947266,
+ "learning_rate": 1.9110803180406468e-06,
+ "loss": 0.4426,
+ "step": 7348
+ },
+ {
+ "epoch": 3.4747044917257686,
+ "grad_norm": 2.9609580039978027,
+ "learning_rate": 1.9104740652064825e-06,
+ "loss": 0.3835,
+ "step": 7349
+ },
+ {
+ "epoch": 3.475177304964539,
+ "grad_norm": 3.1547608375549316,
+ "learning_rate": 1.9098678490795147e-06,
+ "loss": 0.3814,
+ "step": 7350
+ },
+ {
+ "epoch": 3.4756501182033097,
+ "grad_norm": 2.869022846221924,
+ "learning_rate": 1.909261669697487e-06,
+ "loss": 0.4048,
+ "step": 7351
+ },
+ {
+ "epoch": 3.4761229314420805,
+ "grad_norm": 3.0565078258514404,
+ "learning_rate": 1.908655527098146e-06,
+ "loss": 0.3736,
+ "step": 7352
+ },
+ {
+ "epoch": 3.476595744680851,
+ "grad_norm": 2.893603563308716,
+ "learning_rate": 1.9080494213192317e-06,
+ "loss": 0.3906,
+ "step": 7353
+ },
+ {
+ "epoch": 3.4770685579196217,
+ "grad_norm": 2.818938732147217,
+ "learning_rate": 1.9074433523984844e-06,
+ "loss": 0.3958,
+ "step": 7354
+ },
+ {
+ "epoch": 3.4775413711583925,
+ "grad_norm": 2.675461769104004,
+ "learning_rate": 1.9068373203736419e-06,
+ "loss": 0.3371,
+ "step": 7355
+ },
+ {
+ "epoch": 3.4780141843971633,
+ "grad_norm": 2.5831551551818848,
+ "learning_rate": 1.9062313252824384e-06,
+ "loss": 0.3365,
+ "step": 7356
+ },
+ {
+ "epoch": 3.478486997635934,
+ "grad_norm": 3.299736738204956,
+ "learning_rate": 1.9056253671626054e-06,
+ "loss": 0.3923,
+ "step": 7357
+ },
+ {
+ "epoch": 3.4789598108747044,
+ "grad_norm": 2.508787155151367,
+ "learning_rate": 1.905019446051876e-06,
+ "loss": 0.3367,
+ "step": 7358
+ },
+ {
+ "epoch": 3.479432624113475,
+ "grad_norm": 2.980327606201172,
+ "learning_rate": 1.9044135619879753e-06,
+ "loss": 0.3842,
+ "step": 7359
+ },
+ {
+ "epoch": 3.479905437352246,
+ "grad_norm": 3.2114269733428955,
+ "learning_rate": 1.9038077150086317e-06,
+ "loss": 0.4625,
+ "step": 7360
+ },
+ {
+ "epoch": 3.4803782505910164,
+ "grad_norm": 3.2119715213775635,
+ "learning_rate": 1.9032019051515677e-06,
+ "loss": 0.4197,
+ "step": 7361
+ },
+ {
+ "epoch": 3.480851063829787,
+ "grad_norm": 3.2967300415039062,
+ "learning_rate": 1.9025961324545034e-06,
+ "loss": 0.4462,
+ "step": 7362
+ },
+ {
+ "epoch": 3.481323877068558,
+ "grad_norm": 3.132643461227417,
+ "learning_rate": 1.9019903969551589e-06,
+ "loss": 0.4355,
+ "step": 7363
+ },
+ {
+ "epoch": 3.4817966903073287,
+ "grad_norm": 2.9940602779388428,
+ "learning_rate": 1.9013846986912493e-06,
+ "loss": 0.3584,
+ "step": 7364
+ },
+ {
+ "epoch": 3.482269503546099,
+ "grad_norm": 2.901935577392578,
+ "learning_rate": 1.9007790377004907e-06,
+ "loss": 0.3987,
+ "step": 7365
+ },
+ {
+ "epoch": 3.48274231678487,
+ "grad_norm": 3.143404006958008,
+ "learning_rate": 1.9001734140205939e-06,
+ "loss": 0.3642,
+ "step": 7366
+ },
+ {
+ "epoch": 3.4832151300236407,
+ "grad_norm": 3.6701388359069824,
+ "learning_rate": 1.899567827689267e-06,
+ "loss": 0.4111,
+ "step": 7367
+ },
+ {
+ "epoch": 3.4836879432624115,
+ "grad_norm": 2.6821651458740234,
+ "learning_rate": 1.8989622787442202e-06,
+ "loss": 0.3827,
+ "step": 7368
+ },
+ {
+ "epoch": 3.484160756501182,
+ "grad_norm": 3.3916640281677246,
+ "learning_rate": 1.8983567672231562e-06,
+ "loss": 0.4214,
+ "step": 7369
+ },
+ {
+ "epoch": 3.4846335697399526,
+ "grad_norm": 3.3192927837371826,
+ "learning_rate": 1.8977512931637788e-06,
+ "loss": 0.4375,
+ "step": 7370
+ },
+ {
+ "epoch": 3.4851063829787234,
+ "grad_norm": 3.177884578704834,
+ "learning_rate": 1.8971458566037877e-06,
+ "loss": 0.3944,
+ "step": 7371
+ },
+ {
+ "epoch": 3.4855791962174942,
+ "grad_norm": 3.1554532051086426,
+ "learning_rate": 1.896540457580879e-06,
+ "loss": 0.3712,
+ "step": 7372
+ },
+ {
+ "epoch": 3.4860520094562646,
+ "grad_norm": 2.7773821353912354,
+ "learning_rate": 1.8959350961327516e-06,
+ "loss": 0.3534,
+ "step": 7373
+ },
+ {
+ "epoch": 3.4865248226950354,
+ "grad_norm": 2.4792168140411377,
+ "learning_rate": 1.8953297722970956e-06,
+ "loss": 0.3571,
+ "step": 7374
+ },
+ {
+ "epoch": 3.486997635933806,
+ "grad_norm": 2.676361083984375,
+ "learning_rate": 1.8947244861116044e-06,
+ "loss": 0.3365,
+ "step": 7375
+ },
+ {
+ "epoch": 3.487470449172577,
+ "grad_norm": 2.9340765476226807,
+ "learning_rate": 1.8941192376139655e-06,
+ "loss": 0.4656,
+ "step": 7376
+ },
+ {
+ "epoch": 3.4879432624113473,
+ "grad_norm": 3.7924742698669434,
+ "learning_rate": 1.8935140268418646e-06,
+ "loss": 0.3639,
+ "step": 7377
+ },
+ {
+ "epoch": 3.488416075650118,
+ "grad_norm": 2.798912286758423,
+ "learning_rate": 1.892908853832986e-06,
+ "loss": 0.3741,
+ "step": 7378
+ },
+ {
+ "epoch": 3.488888888888889,
+ "grad_norm": 3.1731197834014893,
+ "learning_rate": 1.8923037186250112e-06,
+ "loss": 0.4041,
+ "step": 7379
+ },
+ {
+ "epoch": 3.4893617021276597,
+ "grad_norm": 2.893725633621216,
+ "learning_rate": 1.8916986212556182e-06,
+ "loss": 0.3103,
+ "step": 7380
+ },
+ {
+ "epoch": 3.48983451536643,
+ "grad_norm": 3.2489001750946045,
+ "learning_rate": 1.891093561762486e-06,
+ "loss": 0.328,
+ "step": 7381
+ },
+ {
+ "epoch": 3.490307328605201,
+ "grad_norm": 2.8076415061950684,
+ "learning_rate": 1.8904885401832862e-06,
+ "loss": 0.426,
+ "step": 7382
+ },
+ {
+ "epoch": 3.4907801418439717,
+ "grad_norm": 3.076544761657715,
+ "learning_rate": 1.8898835565556938e-06,
+ "loss": 0.3664,
+ "step": 7383
+ },
+ {
+ "epoch": 3.4912529550827425,
+ "grad_norm": 2.7615935802459717,
+ "learning_rate": 1.8892786109173769e-06,
+ "loss": 0.3718,
+ "step": 7384
+ },
+ {
+ "epoch": 3.491725768321513,
+ "grad_norm": 2.9050116539001465,
+ "learning_rate": 1.8886737033060023e-06,
+ "loss": 0.3456,
+ "step": 7385
+ },
+ {
+ "epoch": 3.4921985815602836,
+ "grad_norm": 2.4928293228149414,
+ "learning_rate": 1.8880688337592366e-06,
+ "loss": 0.3487,
+ "step": 7386
+ },
+ {
+ "epoch": 3.4926713947990544,
+ "grad_norm": 2.773418426513672,
+ "learning_rate": 1.88746400231474e-06,
+ "loss": 0.3771,
+ "step": 7387
+ },
+ {
+ "epoch": 3.493144208037825,
+ "grad_norm": 2.7137296199798584,
+ "learning_rate": 1.886859209010175e-06,
+ "loss": 0.376,
+ "step": 7388
+ },
+ {
+ "epoch": 3.4936170212765956,
+ "grad_norm": 3.327976942062378,
+ "learning_rate": 1.886254453883199e-06,
+ "loss": 0.3481,
+ "step": 7389
+ },
+ {
+ "epoch": 3.4940898345153664,
+ "grad_norm": 3.8637235164642334,
+ "learning_rate": 1.8856497369714655e-06,
+ "loss": 0.3726,
+ "step": 7390
+ },
+ {
+ "epoch": 3.494562647754137,
+ "grad_norm": 3.1517951488494873,
+ "learning_rate": 1.88504505831263e-06,
+ "loss": 0.4459,
+ "step": 7391
+ },
+ {
+ "epoch": 3.495035460992908,
+ "grad_norm": 3.160130262374878,
+ "learning_rate": 1.884440417944342e-06,
+ "loss": 0.3918,
+ "step": 7392
+ },
+ {
+ "epoch": 3.4955082742316783,
+ "grad_norm": 2.6518726348876953,
+ "learning_rate": 1.8838358159042503e-06,
+ "loss": 0.3493,
+ "step": 7393
+ },
+ {
+ "epoch": 3.495981087470449,
+ "grad_norm": 2.7487380504608154,
+ "learning_rate": 1.8832312522300009e-06,
+ "loss": 0.3846,
+ "step": 7394
+ },
+ {
+ "epoch": 3.49645390070922,
+ "grad_norm": 3.062293291091919,
+ "learning_rate": 1.8826267269592355e-06,
+ "loss": 0.3792,
+ "step": 7395
+ },
+ {
+ "epoch": 3.4969267139479907,
+ "grad_norm": 3.3636794090270996,
+ "learning_rate": 1.8820222401295979e-06,
+ "loss": 0.4504,
+ "step": 7396
+ },
+ {
+ "epoch": 3.497399527186761,
+ "grad_norm": 3.230196237564087,
+ "learning_rate": 1.8814177917787246e-06,
+ "loss": 0.3953,
+ "step": 7397
+ },
+ {
+ "epoch": 3.497872340425532,
+ "grad_norm": 2.891002893447876,
+ "learning_rate": 1.8808133819442541e-06,
+ "loss": 0.3923,
+ "step": 7398
+ },
+ {
+ "epoch": 3.4983451536643027,
+ "grad_norm": 2.7478551864624023,
+ "learning_rate": 1.8802090106638196e-06,
+ "loss": 0.4115,
+ "step": 7399
+ },
+ {
+ "epoch": 3.4988179669030735,
+ "grad_norm": 3.0452797412872314,
+ "learning_rate": 1.8796046779750515e-06,
+ "loss": 0.4154,
+ "step": 7400
+ },
+ {
+ "epoch": 3.499290780141844,
+ "grad_norm": 3.0759124755859375,
+ "learning_rate": 1.87900038391558e-06,
+ "loss": 0.4277,
+ "step": 7401
+ },
+ {
+ "epoch": 3.4997635933806146,
+ "grad_norm": 2.7563929557800293,
+ "learning_rate": 1.8783961285230314e-06,
+ "loss": 0.3896,
+ "step": 7402
+ },
+ {
+ "epoch": 3.5002364066193854,
+ "grad_norm": 2.661916494369507,
+ "learning_rate": 1.87779191183503e-06,
+ "loss": 0.3625,
+ "step": 7403
+ },
+ {
+ "epoch": 3.500709219858156,
+ "grad_norm": 2.881241798400879,
+ "learning_rate": 1.877187733889199e-06,
+ "loss": 0.3724,
+ "step": 7404
+ },
+ {
+ "epoch": 3.5011820330969265,
+ "grad_norm": 3.2405693531036377,
+ "learning_rate": 1.8765835947231554e-06,
+ "loss": 0.3974,
+ "step": 7405
+ },
+ {
+ "epoch": 3.5016548463356973,
+ "grad_norm": 2.924288034439087,
+ "learning_rate": 1.8759794943745184e-06,
+ "loss": 0.3467,
+ "step": 7406
+ },
+ {
+ "epoch": 3.502127659574468,
+ "grad_norm": 3.031663656234741,
+ "learning_rate": 1.8753754328809027e-06,
+ "loss": 0.3995,
+ "step": 7407
+ },
+ {
+ "epoch": 3.5026004728132385,
+ "grad_norm": 3.028277635574341,
+ "learning_rate": 1.874771410279919e-06,
+ "loss": 0.3741,
+ "step": 7408
+ },
+ {
+ "epoch": 3.5030732860520093,
+ "grad_norm": 3.0211644172668457,
+ "learning_rate": 1.8741674266091782e-06,
+ "loss": 0.4018,
+ "step": 7409
+ },
+ {
+ "epoch": 3.50354609929078,
+ "grad_norm": 2.732234239578247,
+ "learning_rate": 1.8735634819062875e-06,
+ "loss": 0.313,
+ "step": 7410
+ },
+ {
+ "epoch": 3.504018912529551,
+ "grad_norm": 3.139596939086914,
+ "learning_rate": 1.8729595762088525e-06,
+ "loss": 0.4112,
+ "step": 7411
+ },
+ {
+ "epoch": 3.5044917257683217,
+ "grad_norm": 2.894230365753174,
+ "learning_rate": 1.8723557095544754e-06,
+ "loss": 0.3891,
+ "step": 7412
+ },
+ {
+ "epoch": 3.504964539007092,
+ "grad_norm": 2.850205659866333,
+ "learning_rate": 1.8717518819807547e-06,
+ "loss": 0.424,
+ "step": 7413
+ },
+ {
+ "epoch": 3.505437352245863,
+ "grad_norm": 3.047736644744873,
+ "learning_rate": 1.8711480935252907e-06,
+ "loss": 0.3757,
+ "step": 7414
+ },
+ {
+ "epoch": 3.5059101654846336,
+ "grad_norm": 3.0174455642700195,
+ "learning_rate": 1.8705443442256772e-06,
+ "loss": 0.3625,
+ "step": 7415
+ },
+ {
+ "epoch": 3.506382978723404,
+ "grad_norm": 2.840681552886963,
+ "learning_rate": 1.869940634119507e-06,
+ "loss": 0.3595,
+ "step": 7416
+ },
+ {
+ "epoch": 3.506855791962175,
+ "grad_norm": 3.067473888397217,
+ "learning_rate": 1.8693369632443713e-06,
+ "loss": 0.432,
+ "step": 7417
+ },
+ {
+ "epoch": 3.5073286052009456,
+ "grad_norm": 2.94655179977417,
+ "learning_rate": 1.8687333316378572e-06,
+ "loss": 0.4222,
+ "step": 7418
+ },
+ {
+ "epoch": 3.5078014184397164,
+ "grad_norm": 2.968548536300659,
+ "learning_rate": 1.868129739337551e-06,
+ "loss": 0.4098,
+ "step": 7419
+ },
+ {
+ "epoch": 3.508274231678487,
+ "grad_norm": 2.70094895362854,
+ "learning_rate": 1.867526186381034e-06,
+ "loss": 0.386,
+ "step": 7420
+ },
+ {
+ "epoch": 3.5087470449172575,
+ "grad_norm": 3.25897216796875,
+ "learning_rate": 1.8669226728058895e-06,
+ "loss": 0.4411,
+ "step": 7421
+ },
+ {
+ "epoch": 3.5092198581560283,
+ "grad_norm": 4.281215667724609,
+ "learning_rate": 1.866319198649694e-06,
+ "loss": 0.4011,
+ "step": 7422
+ },
+ {
+ "epoch": 3.509692671394799,
+ "grad_norm": 2.8394858837127686,
+ "learning_rate": 1.8657157639500223e-06,
+ "loss": 0.4162,
+ "step": 7423
+ },
+ {
+ "epoch": 3.5101654846335695,
+ "grad_norm": 2.732691764831543,
+ "learning_rate": 1.86511236874445e-06,
+ "loss": 0.3603,
+ "step": 7424
+ },
+ {
+ "epoch": 3.5106382978723403,
+ "grad_norm": 3.0152828693389893,
+ "learning_rate": 1.8645090130705463e-06,
+ "loss": 0.3811,
+ "step": 7425
+ },
+ {
+ "epoch": 3.511111111111111,
+ "grad_norm": 3.1762008666992188,
+ "learning_rate": 1.8639056969658793e-06,
+ "loss": 0.3985,
+ "step": 7426
+ },
+ {
+ "epoch": 3.511583924349882,
+ "grad_norm": 3.151123523712158,
+ "learning_rate": 1.863302420468016e-06,
+ "loss": 0.3582,
+ "step": 7427
+ },
+ {
+ "epoch": 3.5120567375886527,
+ "grad_norm": 2.738206386566162,
+ "learning_rate": 1.862699183614518e-06,
+ "loss": 0.3768,
+ "step": 7428
+ },
+ {
+ "epoch": 3.512529550827423,
+ "grad_norm": 3.235212564468384,
+ "learning_rate": 1.8620959864429487e-06,
+ "loss": 0.3964,
+ "step": 7429
+ },
+ {
+ "epoch": 3.513002364066194,
+ "grad_norm": 3.1113579273223877,
+ "learning_rate": 1.8614928289908648e-06,
+ "loss": 0.3979,
+ "step": 7430
+ },
+ {
+ "epoch": 3.5134751773049646,
+ "grad_norm": 2.6802520751953125,
+ "learning_rate": 1.860889711295822e-06,
+ "loss": 0.327,
+ "step": 7431
+ },
+ {
+ "epoch": 3.513947990543735,
+ "grad_norm": 2.9212403297424316,
+ "learning_rate": 1.860286633395375e-06,
+ "loss": 0.4104,
+ "step": 7432
+ },
+ {
+ "epoch": 3.5144208037825058,
+ "grad_norm": 2.868861198425293,
+ "learning_rate": 1.8596835953270742e-06,
+ "loss": 0.383,
+ "step": 7433
+ },
+ {
+ "epoch": 3.5148936170212766,
+ "grad_norm": 2.831655740737915,
+ "learning_rate": 1.8590805971284686e-06,
+ "loss": 0.3615,
+ "step": 7434
+ },
+ {
+ "epoch": 3.5153664302600474,
+ "grad_norm": 3.1540114879608154,
+ "learning_rate": 1.8584776388371039e-06,
+ "loss": 0.3914,
+ "step": 7435
+ },
+ {
+ "epoch": 3.515839243498818,
+ "grad_norm": 3.22031307220459,
+ "learning_rate": 1.8578747204905223e-06,
+ "loss": 0.4358,
+ "step": 7436
+ },
+ {
+ "epoch": 3.5163120567375885,
+ "grad_norm": 3.2922887802124023,
+ "learning_rate": 1.8572718421262677e-06,
+ "loss": 0.3894,
+ "step": 7437
+ },
+ {
+ "epoch": 3.5167848699763593,
+ "grad_norm": 2.936475992202759,
+ "learning_rate": 1.856669003781876e-06,
+ "loss": 0.3748,
+ "step": 7438
+ },
+ {
+ "epoch": 3.51725768321513,
+ "grad_norm": 3.4542860984802246,
+ "learning_rate": 1.8560662054948856e-06,
+ "loss": 0.3362,
+ "step": 7439
+ },
+ {
+ "epoch": 3.5177304964539005,
+ "grad_norm": 3.1532278060913086,
+ "learning_rate": 1.8554634473028288e-06,
+ "loss": 0.411,
+ "step": 7440
+ },
+ {
+ "epoch": 3.5182033096926713,
+ "grad_norm": 3.1678943634033203,
+ "learning_rate": 1.854860729243237e-06,
+ "loss": 0.4357,
+ "step": 7441
+ },
+ {
+ "epoch": 3.518676122931442,
+ "grad_norm": 2.608930826187134,
+ "learning_rate": 1.8542580513536385e-06,
+ "loss": 0.3851,
+ "step": 7442
+ },
+ {
+ "epoch": 3.519148936170213,
+ "grad_norm": 3.127915143966675,
+ "learning_rate": 1.853655413671559e-06,
+ "loss": 0.4227,
+ "step": 7443
+ },
+ {
+ "epoch": 3.5196217494089836,
+ "grad_norm": 3.0593245029449463,
+ "learning_rate": 1.8530528162345238e-06,
+ "loss": 0.4315,
+ "step": 7444
+ },
+ {
+ "epoch": 3.520094562647754,
+ "grad_norm": 2.7818729877471924,
+ "learning_rate": 1.852450259080053e-06,
+ "loss": 0.4018,
+ "step": 7445
+ },
+ {
+ "epoch": 3.520567375886525,
+ "grad_norm": 3.2635445594787598,
+ "learning_rate": 1.8518477422456639e-06,
+ "loss": 0.415,
+ "step": 7446
+ },
+ {
+ "epoch": 3.5210401891252956,
+ "grad_norm": 2.5713813304901123,
+ "learning_rate": 1.851245265768875e-06,
+ "loss": 0.3309,
+ "step": 7447
+ },
+ {
+ "epoch": 3.521513002364066,
+ "grad_norm": 2.6778969764709473,
+ "learning_rate": 1.8506428296871982e-06,
+ "loss": 0.3106,
+ "step": 7448
+ },
+ {
+ "epoch": 3.5219858156028367,
+ "grad_norm": 2.901095390319824,
+ "learning_rate": 1.8500404340381455e-06,
+ "loss": 0.3729,
+ "step": 7449
+ },
+ {
+ "epoch": 3.5224586288416075,
+ "grad_norm": 3.1000046730041504,
+ "learning_rate": 1.849438078859225e-06,
+ "loss": 0.438,
+ "step": 7450
+ },
+ {
+ "epoch": 3.5229314420803783,
+ "grad_norm": 2.901890993118286,
+ "learning_rate": 1.8488357641879417e-06,
+ "loss": 0.3934,
+ "step": 7451
+ },
+ {
+ "epoch": 3.523404255319149,
+ "grad_norm": 3.2212157249450684,
+ "learning_rate": 1.8482334900618009e-06,
+ "loss": 0.4359,
+ "step": 7452
+ },
+ {
+ "epoch": 3.5238770685579195,
+ "grad_norm": 3.3780901432037354,
+ "learning_rate": 1.847631256518303e-06,
+ "loss": 0.4022,
+ "step": 7453
+ },
+ {
+ "epoch": 3.5243498817966903,
+ "grad_norm": 2.9996445178985596,
+ "learning_rate": 1.847029063594945e-06,
+ "loss": 0.3989,
+ "step": 7454
+ },
+ {
+ "epoch": 3.524822695035461,
+ "grad_norm": 2.8581080436706543,
+ "learning_rate": 1.8464269113292255e-06,
+ "loss": 0.3401,
+ "step": 7455
+ },
+ {
+ "epoch": 3.5252955082742314,
+ "grad_norm": 2.9551661014556885,
+ "learning_rate": 1.8458247997586354e-06,
+ "loss": 0.4556,
+ "step": 7456
+ },
+ {
+ "epoch": 3.5257683215130022,
+ "grad_norm": 2.9672555923461914,
+ "learning_rate": 1.8452227289206672e-06,
+ "loss": 0.3575,
+ "step": 7457
+ },
+ {
+ "epoch": 3.526241134751773,
+ "grad_norm": 3.226273536682129,
+ "learning_rate": 1.8446206988528087e-06,
+ "loss": 0.3769,
+ "step": 7458
+ },
+ {
+ "epoch": 3.526713947990544,
+ "grad_norm": 2.994356155395508,
+ "learning_rate": 1.8440187095925443e-06,
+ "loss": 0.3653,
+ "step": 7459
+ },
+ {
+ "epoch": 3.5271867612293146,
+ "grad_norm": 2.489049196243286,
+ "learning_rate": 1.8434167611773595e-06,
+ "loss": 0.3454,
+ "step": 7460
+ },
+ {
+ "epoch": 3.527659574468085,
+ "grad_norm": 2.7897472381591797,
+ "learning_rate": 1.8428148536447333e-06,
+ "loss": 0.3526,
+ "step": 7461
+ },
+ {
+ "epoch": 3.5281323877068558,
+ "grad_norm": 2.947746992111206,
+ "learning_rate": 1.842212987032145e-06,
+ "loss": 0.3542,
+ "step": 7462
+ },
+ {
+ "epoch": 3.5286052009456266,
+ "grad_norm": 2.9303736686706543,
+ "learning_rate": 1.84161116137707e-06,
+ "loss": 0.3618,
+ "step": 7463
+ },
+ {
+ "epoch": 3.529078014184397,
+ "grad_norm": 2.81052827835083,
+ "learning_rate": 1.8410093767169807e-06,
+ "loss": 0.3833,
+ "step": 7464
+ },
+ {
+ "epoch": 3.5295508274231677,
+ "grad_norm": 3.4084126949310303,
+ "learning_rate": 1.840407633089348e-06,
+ "loss": 0.3868,
+ "step": 7465
+ },
+ {
+ "epoch": 3.5300236406619385,
+ "grad_norm": 2.8372802734375,
+ "learning_rate": 1.839805930531639e-06,
+ "loss": 0.3407,
+ "step": 7466
+ },
+ {
+ "epoch": 3.5304964539007093,
+ "grad_norm": 2.9218525886535645,
+ "learning_rate": 1.8392042690813205e-06,
+ "loss": 0.3772,
+ "step": 7467
+ },
+ {
+ "epoch": 3.53096926713948,
+ "grad_norm": 3.425274610519409,
+ "learning_rate": 1.8386026487758552e-06,
+ "loss": 0.3996,
+ "step": 7468
+ },
+ {
+ "epoch": 3.5314420803782505,
+ "grad_norm": 3.027423858642578,
+ "learning_rate": 1.8380010696527015e-06,
+ "loss": 0.3752,
+ "step": 7469
+ },
+ {
+ "epoch": 3.5319148936170213,
+ "grad_norm": 2.974896192550659,
+ "learning_rate": 1.8373995317493193e-06,
+ "loss": 0.3657,
+ "step": 7470
+ },
+ {
+ "epoch": 3.532387706855792,
+ "grad_norm": 2.837458610534668,
+ "learning_rate": 1.8367980351031628e-06,
+ "loss": 0.3949,
+ "step": 7471
+ },
+ {
+ "epoch": 3.5328605200945624,
+ "grad_norm": 2.8257288932800293,
+ "learning_rate": 1.8361965797516844e-06,
+ "loss": 0.3253,
+ "step": 7472
+ },
+ {
+ "epoch": 3.533333333333333,
+ "grad_norm": 2.8278095722198486,
+ "learning_rate": 1.8355951657323351e-06,
+ "loss": 0.3588,
+ "step": 7473
+ },
+ {
+ "epoch": 3.533806146572104,
+ "grad_norm": 2.641160249710083,
+ "learning_rate": 1.8349937930825601e-06,
+ "loss": 0.3423,
+ "step": 7474
+ },
+ {
+ "epoch": 3.534278959810875,
+ "grad_norm": 2.6909263134002686,
+ "learning_rate": 1.8343924618398065e-06,
+ "loss": 0.3973,
+ "step": 7475
+ },
+ {
+ "epoch": 3.5347517730496456,
+ "grad_norm": 3.0727429389953613,
+ "learning_rate": 1.8337911720415157e-06,
+ "loss": 0.4207,
+ "step": 7476
+ },
+ {
+ "epoch": 3.535224586288416,
+ "grad_norm": 3.218925714492798,
+ "learning_rate": 1.8331899237251265e-06,
+ "loss": 0.3955,
+ "step": 7477
+ },
+ {
+ "epoch": 3.5356973995271868,
+ "grad_norm": 3.163914918899536,
+ "learning_rate": 1.832588716928078e-06,
+ "loss": 0.4655,
+ "step": 7478
+ },
+ {
+ "epoch": 3.5361702127659576,
+ "grad_norm": 2.8622686862945557,
+ "learning_rate": 1.831987551687803e-06,
+ "loss": 0.4084,
+ "step": 7479
+ },
+ {
+ "epoch": 3.536643026004728,
+ "grad_norm": 2.8534188270568848,
+ "learning_rate": 1.831386428041734e-06,
+ "loss": 0.4144,
+ "step": 7480
+ },
+ {
+ "epoch": 3.5371158392434987,
+ "grad_norm": 2.8138554096221924,
+ "learning_rate": 1.8307853460273008e-06,
+ "loss": 0.3835,
+ "step": 7481
+ },
+ {
+ "epoch": 3.5375886524822695,
+ "grad_norm": 3.061960458755493,
+ "learning_rate": 1.830184305681929e-06,
+ "loss": 0.4128,
+ "step": 7482
+ },
+ {
+ "epoch": 3.5380614657210403,
+ "grad_norm": 2.8524835109710693,
+ "learning_rate": 1.8295833070430444e-06,
+ "loss": 0.3372,
+ "step": 7483
+ },
+ {
+ "epoch": 3.538534278959811,
+ "grad_norm": 3.2567028999328613,
+ "learning_rate": 1.8289823501480663e-06,
+ "loss": 0.4533,
+ "step": 7484
+ },
+ {
+ "epoch": 3.5390070921985815,
+ "grad_norm": 2.945634603500366,
+ "learning_rate": 1.8283814350344158e-06,
+ "loss": 0.3565,
+ "step": 7485
+ },
+ {
+ "epoch": 3.5394799054373522,
+ "grad_norm": 2.903287649154663,
+ "learning_rate": 1.8277805617395089e-06,
+ "loss": 0.349,
+ "step": 7486
+ },
+ {
+ "epoch": 3.539952718676123,
+ "grad_norm": 3.249272584915161,
+ "learning_rate": 1.827179730300757e-06,
+ "loss": 0.4076,
+ "step": 7487
+ },
+ {
+ "epoch": 3.5404255319148934,
+ "grad_norm": 2.9591739177703857,
+ "learning_rate": 1.8265789407555748e-06,
+ "loss": 0.3439,
+ "step": 7488
+ },
+ {
+ "epoch": 3.540898345153664,
+ "grad_norm": 3.8527538776397705,
+ "learning_rate": 1.8259781931413683e-06,
+ "loss": 0.4684,
+ "step": 7489
+ },
+ {
+ "epoch": 3.541371158392435,
+ "grad_norm": 2.7392261028289795,
+ "learning_rate": 1.8253774874955449e-06,
+ "loss": 0.3494,
+ "step": 7490
+ },
+ {
+ "epoch": 3.541843971631206,
+ "grad_norm": 2.880993127822876,
+ "learning_rate": 1.8247768238555069e-06,
+ "loss": 0.3546,
+ "step": 7491
+ },
+ {
+ "epoch": 3.5423167848699766,
+ "grad_norm": 2.9944894313812256,
+ "learning_rate": 1.8241762022586545e-06,
+ "loss": 0.3594,
+ "step": 7492
+ },
+ {
+ "epoch": 3.542789598108747,
+ "grad_norm": 3.0084292888641357,
+ "learning_rate": 1.8235756227423878e-06,
+ "loss": 0.408,
+ "step": 7493
+ },
+ {
+ "epoch": 3.5432624113475177,
+ "grad_norm": 2.75227689743042,
+ "learning_rate": 1.8229750853440998e-06,
+ "loss": 0.3515,
+ "step": 7494
+ },
+ {
+ "epoch": 3.5437352245862885,
+ "grad_norm": 3.041893243789673,
+ "learning_rate": 1.8223745901011856e-06,
+ "loss": 0.401,
+ "step": 7495
+ },
+ {
+ "epoch": 3.544208037825059,
+ "grad_norm": 2.8728370666503906,
+ "learning_rate": 1.8217741370510345e-06,
+ "loss": 0.3832,
+ "step": 7496
+ },
+ {
+ "epoch": 3.5446808510638297,
+ "grad_norm": 3.095460891723633,
+ "learning_rate": 1.8211737262310331e-06,
+ "loss": 0.3086,
+ "step": 7497
+ },
+ {
+ "epoch": 3.5451536643026005,
+ "grad_norm": 3.1869826316833496,
+ "learning_rate": 1.8205733576785678e-06,
+ "loss": 0.3666,
+ "step": 7498
+ },
+ {
+ "epoch": 3.5456264775413713,
+ "grad_norm": 3.307560443878174,
+ "learning_rate": 1.8199730314310204e-06,
+ "loss": 0.4489,
+ "step": 7499
+ },
+ {
+ "epoch": 3.546099290780142,
+ "grad_norm": 2.9531142711639404,
+ "learning_rate": 1.8193727475257697e-06,
+ "loss": 0.4017,
+ "step": 7500
+ },
+ {
+ "epoch": 3.5465721040189124,
+ "grad_norm": 3.2969162464141846,
+ "learning_rate": 1.8187725060001942e-06,
+ "loss": 0.4179,
+ "step": 7501
+ },
+ {
+ "epoch": 3.5470449172576832,
+ "grad_norm": 2.9434688091278076,
+ "learning_rate": 1.818172306891667e-06,
+ "loss": 0.3562,
+ "step": 7502
+ },
+ {
+ "epoch": 3.547517730496454,
+ "grad_norm": 3.070732355117798,
+ "learning_rate": 1.8175721502375616e-06,
+ "loss": 0.393,
+ "step": 7503
+ },
+ {
+ "epoch": 3.5479905437352244,
+ "grad_norm": 2.970898389816284,
+ "learning_rate": 1.8169720360752457e-06,
+ "loss": 0.4448,
+ "step": 7504
+ },
+ {
+ "epoch": 3.548463356973995,
+ "grad_norm": 2.7050931453704834,
+ "learning_rate": 1.8163719644420858e-06,
+ "loss": 0.3562,
+ "step": 7505
+ },
+ {
+ "epoch": 3.548936170212766,
+ "grad_norm": 3.0073063373565674,
+ "learning_rate": 1.8157719353754467e-06,
+ "loss": 0.4429,
+ "step": 7506
+ },
+ {
+ "epoch": 3.5494089834515368,
+ "grad_norm": 3.0240445137023926,
+ "learning_rate": 1.8151719489126874e-06,
+ "loss": 0.4073,
+ "step": 7507
+ },
+ {
+ "epoch": 3.5498817966903076,
+ "grad_norm": 3.558763265609741,
+ "learning_rate": 1.8145720050911695e-06,
+ "loss": 0.4025,
+ "step": 7508
+ },
+ {
+ "epoch": 3.550354609929078,
+ "grad_norm": 3.637258768081665,
+ "learning_rate": 1.8139721039482473e-06,
+ "loss": 0.5074,
+ "step": 7509
+ },
+ {
+ "epoch": 3.5508274231678487,
+ "grad_norm": 2.804719924926758,
+ "learning_rate": 1.8133722455212726e-06,
+ "loss": 0.3727,
+ "step": 7510
+ },
+ {
+ "epoch": 3.5513002364066195,
+ "grad_norm": 2.9034759998321533,
+ "learning_rate": 1.8127724298475984e-06,
+ "loss": 0.3642,
+ "step": 7511
+ },
+ {
+ "epoch": 3.55177304964539,
+ "grad_norm": 3.1304872035980225,
+ "learning_rate": 1.8121726569645714e-06,
+ "loss": 0.434,
+ "step": 7512
+ },
+ {
+ "epoch": 3.5522458628841607,
+ "grad_norm": 3.019956111907959,
+ "learning_rate": 1.8115729269095378e-06,
+ "loss": 0.4235,
+ "step": 7513
+ },
+ {
+ "epoch": 3.5527186761229315,
+ "grad_norm": 2.7984633445739746,
+ "learning_rate": 1.810973239719839e-06,
+ "loss": 0.3344,
+ "step": 7514
+ },
+ {
+ "epoch": 3.5531914893617023,
+ "grad_norm": 2.839709997177124,
+ "learning_rate": 1.8103735954328145e-06,
+ "loss": 0.3708,
+ "step": 7515
+ },
+ {
+ "epoch": 3.553664302600473,
+ "grad_norm": 2.766819477081299,
+ "learning_rate": 1.809773994085803e-06,
+ "loss": 0.3402,
+ "step": 7516
+ },
+ {
+ "epoch": 3.5541371158392434,
+ "grad_norm": 2.707942247390747,
+ "learning_rate": 1.8091744357161372e-06,
+ "loss": 0.4327,
+ "step": 7517
+ },
+ {
+ "epoch": 3.554609929078014,
+ "grad_norm": 3.512702465057373,
+ "learning_rate": 1.8085749203611516e-06,
+ "loss": 0.3965,
+ "step": 7518
+ },
+ {
+ "epoch": 3.555082742316785,
+ "grad_norm": 2.717024803161621,
+ "learning_rate": 1.8079754480581738e-06,
+ "loss": 0.3237,
+ "step": 7519
+ },
+ {
+ "epoch": 3.5555555555555554,
+ "grad_norm": 2.659001350402832,
+ "learning_rate": 1.8073760188445296e-06,
+ "loss": 0.3546,
+ "step": 7520
+ },
+ {
+ "epoch": 3.556028368794326,
+ "grad_norm": 2.615028142929077,
+ "learning_rate": 1.8067766327575445e-06,
+ "loss": 0.3232,
+ "step": 7521
+ },
+ {
+ "epoch": 3.556501182033097,
+ "grad_norm": 2.659428119659424,
+ "learning_rate": 1.8061772898345386e-06,
+ "loss": 0.3769,
+ "step": 7522
+ },
+ {
+ "epoch": 3.5569739952718678,
+ "grad_norm": 3.142369270324707,
+ "learning_rate": 1.8055779901128296e-06,
+ "loss": 0.4292,
+ "step": 7523
+ },
+ {
+ "epoch": 3.5574468085106385,
+ "grad_norm": 3.0832736492156982,
+ "learning_rate": 1.8049787336297352e-06,
+ "loss": 0.3871,
+ "step": 7524
+ },
+ {
+ "epoch": 3.557919621749409,
+ "grad_norm": 2.778411865234375,
+ "learning_rate": 1.8043795204225664e-06,
+ "loss": 0.3938,
+ "step": 7525
+ },
+ {
+ "epoch": 3.5583924349881797,
+ "grad_norm": 3.1651480197906494,
+ "learning_rate": 1.8037803505286355e-06,
+ "loss": 0.3315,
+ "step": 7526
+ },
+ {
+ "epoch": 3.5588652482269505,
+ "grad_norm": 3.266508102416992,
+ "learning_rate": 1.8031812239852498e-06,
+ "loss": 0.4156,
+ "step": 7527
+ },
+ {
+ "epoch": 3.559338061465721,
+ "grad_norm": 3.1345436573028564,
+ "learning_rate": 1.8025821408297127e-06,
+ "loss": 0.3813,
+ "step": 7528
+ },
+ {
+ "epoch": 3.5598108747044916,
+ "grad_norm": 3.1535425186157227,
+ "learning_rate": 1.8019831010993289e-06,
+ "loss": 0.3897,
+ "step": 7529
+ },
+ {
+ "epoch": 3.5602836879432624,
+ "grad_norm": 3.0934345722198486,
+ "learning_rate": 1.8013841048313952e-06,
+ "loss": 0.4074,
+ "step": 7530
+ },
+ {
+ "epoch": 3.5607565011820332,
+ "grad_norm": 3.224876642227173,
+ "learning_rate": 1.8007851520632108e-06,
+ "loss": 0.3969,
+ "step": 7531
+ },
+ {
+ "epoch": 3.561229314420804,
+ "grad_norm": 3.082303285598755,
+ "learning_rate": 1.8001862428320693e-06,
+ "loss": 0.3559,
+ "step": 7532
+ },
+ {
+ "epoch": 3.5617021276595744,
+ "grad_norm": 3.5289969444274902,
+ "learning_rate": 1.7995873771752608e-06,
+ "loss": 0.3961,
+ "step": 7533
+ },
+ {
+ "epoch": 3.562174940898345,
+ "grad_norm": 3.1893370151519775,
+ "learning_rate": 1.7989885551300762e-06,
+ "loss": 0.3721,
+ "step": 7534
+ },
+ {
+ "epoch": 3.562647754137116,
+ "grad_norm": 2.6911089420318604,
+ "learning_rate": 1.7983897767337999e-06,
+ "loss": 0.3801,
+ "step": 7535
+ },
+ {
+ "epoch": 3.5631205673758863,
+ "grad_norm": 3.0837483406066895,
+ "learning_rate": 1.797791042023716e-06,
+ "loss": 0.3886,
+ "step": 7536
+ },
+ {
+ "epoch": 3.563593380614657,
+ "grad_norm": 2.973459005355835,
+ "learning_rate": 1.7971923510371054e-06,
+ "loss": 0.438,
+ "step": 7537
+ },
+ {
+ "epoch": 3.564066193853428,
+ "grad_norm": 3.1537392139434814,
+ "learning_rate": 1.7965937038112435e-06,
+ "loss": 0.4022,
+ "step": 7538
+ },
+ {
+ "epoch": 3.5645390070921987,
+ "grad_norm": 3.2339680194854736,
+ "learning_rate": 1.795995100383409e-06,
+ "loss": 0.3883,
+ "step": 7539
+ },
+ {
+ "epoch": 3.5650118203309695,
+ "grad_norm": 2.5029079914093018,
+ "learning_rate": 1.7953965407908714e-06,
+ "loss": 0.3522,
+ "step": 7540
+ },
+ {
+ "epoch": 3.56548463356974,
+ "grad_norm": 3.1560211181640625,
+ "learning_rate": 1.7947980250709027e-06,
+ "loss": 0.4024,
+ "step": 7541
+ },
+ {
+ "epoch": 3.5659574468085107,
+ "grad_norm": 2.950477361679077,
+ "learning_rate": 1.7941995532607687e-06,
+ "loss": 0.3598,
+ "step": 7542
+ },
+ {
+ "epoch": 3.5664302600472815,
+ "grad_norm": 3.1263279914855957,
+ "learning_rate": 1.793601125397733e-06,
+ "loss": 0.3535,
+ "step": 7543
+ },
+ {
+ "epoch": 3.566903073286052,
+ "grad_norm": 2.986631393432617,
+ "learning_rate": 1.7930027415190587e-06,
+ "loss": 0.4251,
+ "step": 7544
+ },
+ {
+ "epoch": 3.5673758865248226,
+ "grad_norm": 2.6882247924804688,
+ "learning_rate": 1.7924044016620022e-06,
+ "loss": 0.3584,
+ "step": 7545
+ },
+ {
+ "epoch": 3.5678486997635934,
+ "grad_norm": 2.9358696937561035,
+ "learning_rate": 1.791806105863822e-06,
+ "loss": 0.3671,
+ "step": 7546
+ },
+ {
+ "epoch": 3.568321513002364,
+ "grad_norm": 2.774198055267334,
+ "learning_rate": 1.7912078541617704e-06,
+ "loss": 0.3505,
+ "step": 7547
+ },
+ {
+ "epoch": 3.568794326241135,
+ "grad_norm": 2.7384231090545654,
+ "learning_rate": 1.7906096465930964e-06,
+ "loss": 0.3992,
+ "step": 7548
+ },
+ {
+ "epoch": 3.5692671394799054,
+ "grad_norm": 2.8625354766845703,
+ "learning_rate": 1.7900114831950506e-06,
+ "loss": 0.3858,
+ "step": 7549
+ },
+ {
+ "epoch": 3.569739952718676,
+ "grad_norm": 2.737884044647217,
+ "learning_rate": 1.7894133640048761e-06,
+ "loss": 0.3973,
+ "step": 7550
+ },
+ {
+ "epoch": 3.570212765957447,
+ "grad_norm": 2.9817614555358887,
+ "learning_rate": 1.7888152890598154e-06,
+ "loss": 0.3613,
+ "step": 7551
+ },
+ {
+ "epoch": 3.5706855791962173,
+ "grad_norm": 2.760956287384033,
+ "learning_rate": 1.7882172583971081e-06,
+ "loss": 0.3645,
+ "step": 7552
+ },
+ {
+ "epoch": 3.571158392434988,
+ "grad_norm": 2.6867735385894775,
+ "learning_rate": 1.7876192720539908e-06,
+ "loss": 0.3771,
+ "step": 7553
+ },
+ {
+ "epoch": 3.571631205673759,
+ "grad_norm": 3.3362443447113037,
+ "learning_rate": 1.7870213300676986e-06,
+ "loss": 0.3989,
+ "step": 7554
+ },
+ {
+ "epoch": 3.5721040189125297,
+ "grad_norm": 2.8359227180480957,
+ "learning_rate": 1.7864234324754617e-06,
+ "loss": 0.3645,
+ "step": 7555
+ },
+ {
+ "epoch": 3.5725768321513005,
+ "grad_norm": 3.3070647716522217,
+ "learning_rate": 1.7858255793145076e-06,
+ "loss": 0.4128,
+ "step": 7556
+ },
+ {
+ "epoch": 3.573049645390071,
+ "grad_norm": 2.544879913330078,
+ "learning_rate": 1.7852277706220644e-06,
+ "loss": 0.3779,
+ "step": 7557
+ },
+ {
+ "epoch": 3.5735224586288417,
+ "grad_norm": 2.890796661376953,
+ "learning_rate": 1.7846300064353525e-06,
+ "loss": 0.373,
+ "step": 7558
+ },
+ {
+ "epoch": 3.5739952718676125,
+ "grad_norm": 2.9703400135040283,
+ "learning_rate": 1.7840322867915944e-06,
+ "loss": 0.3619,
+ "step": 7559
+ },
+ {
+ "epoch": 3.574468085106383,
+ "grad_norm": 3.0122430324554443,
+ "learning_rate": 1.7834346117280066e-06,
+ "loss": 0.4079,
+ "step": 7560
+ },
+ {
+ "epoch": 3.5749408983451536,
+ "grad_norm": 2.904963493347168,
+ "learning_rate": 1.7828369812818025e-06,
+ "loss": 0.38,
+ "step": 7561
+ },
+ {
+ "epoch": 3.5754137115839244,
+ "grad_norm": 3.0917439460754395,
+ "learning_rate": 1.7822393954901957e-06,
+ "loss": 0.383,
+ "step": 7562
+ },
+ {
+ "epoch": 3.575886524822695,
+ "grad_norm": 2.633920907974243,
+ "learning_rate": 1.7816418543903935e-06,
+ "loss": 0.3823,
+ "step": 7563
+ },
+ {
+ "epoch": 3.576359338061466,
+ "grad_norm": 2.9266390800476074,
+ "learning_rate": 1.781044358019604e-06,
+ "loss": 0.4642,
+ "step": 7564
+ },
+ {
+ "epoch": 3.5768321513002364,
+ "grad_norm": 2.878138780593872,
+ "learning_rate": 1.7804469064150299e-06,
+ "loss": 0.4056,
+ "step": 7565
+ },
+ {
+ "epoch": 3.577304964539007,
+ "grad_norm": 2.949370861053467,
+ "learning_rate": 1.7798494996138708e-06,
+ "loss": 0.3863,
+ "step": 7566
+ },
+ {
+ "epoch": 3.5777777777777775,
+ "grad_norm": 3.1444685459136963,
+ "learning_rate": 1.7792521376533264e-06,
+ "loss": 0.3611,
+ "step": 7567
+ },
+ {
+ "epoch": 3.5782505910165483,
+ "grad_norm": 3.0719716548919678,
+ "learning_rate": 1.7786548205705906e-06,
+ "loss": 0.3866,
+ "step": 7568
+ },
+ {
+ "epoch": 3.578723404255319,
+ "grad_norm": 3.155343770980835,
+ "learning_rate": 1.7780575484028566e-06,
+ "loss": 0.3896,
+ "step": 7569
+ },
+ {
+ "epoch": 3.57919621749409,
+ "grad_norm": 3.0447211265563965,
+ "learning_rate": 1.7774603211873138e-06,
+ "loss": 0.3713,
+ "step": 7570
+ },
+ {
+ "epoch": 3.5796690307328607,
+ "grad_norm": 2.8683619499206543,
+ "learning_rate": 1.7768631389611471e-06,
+ "loss": 0.3808,
+ "step": 7571
+ },
+ {
+ "epoch": 3.580141843971631,
+ "grad_norm": 3.1548070907592773,
+ "learning_rate": 1.776266001761543e-06,
+ "loss": 0.3714,
+ "step": 7572
+ },
+ {
+ "epoch": 3.580614657210402,
+ "grad_norm": 2.8699257373809814,
+ "learning_rate": 1.7756689096256816e-06,
+ "loss": 0.3694,
+ "step": 7573
+ },
+ {
+ "epoch": 3.5810874704491726,
+ "grad_norm": 2.834714412689209,
+ "learning_rate": 1.7750718625907398e-06,
+ "loss": 0.3935,
+ "step": 7574
+ },
+ {
+ "epoch": 3.581560283687943,
+ "grad_norm": 3.3828539848327637,
+ "learning_rate": 1.7744748606938957e-06,
+ "loss": 0.4783,
+ "step": 7575
+ },
+ {
+ "epoch": 3.582033096926714,
+ "grad_norm": 3.3892476558685303,
+ "learning_rate": 1.7738779039723202e-06,
+ "loss": 0.41,
+ "step": 7576
+ },
+ {
+ "epoch": 3.5825059101654846,
+ "grad_norm": 3.014289379119873,
+ "learning_rate": 1.7732809924631842e-06,
+ "loss": 0.3516,
+ "step": 7577
+ },
+ {
+ "epoch": 3.5829787234042554,
+ "grad_norm": 3.477212429046631,
+ "learning_rate": 1.772684126203654e-06,
+ "loss": 0.4144,
+ "step": 7578
+ },
+ {
+ "epoch": 3.583451536643026,
+ "grad_norm": 2.9156792163848877,
+ "learning_rate": 1.772087305230893e-06,
+ "loss": 0.3772,
+ "step": 7579
+ },
+ {
+ "epoch": 3.5839243498817965,
+ "grad_norm": 2.639169931411743,
+ "learning_rate": 1.7714905295820651e-06,
+ "loss": 0.3487,
+ "step": 7580
+ },
+ {
+ "epoch": 3.5843971631205673,
+ "grad_norm": 3.196894407272339,
+ "learning_rate": 1.7708937992943263e-06,
+ "loss": 0.4852,
+ "step": 7581
+ },
+ {
+ "epoch": 3.584869976359338,
+ "grad_norm": 2.9140779972076416,
+ "learning_rate": 1.7702971144048347e-06,
+ "loss": 0.3703,
+ "step": 7582
+ },
+ {
+ "epoch": 3.5853427895981085,
+ "grad_norm": 3.3844895362854004,
+ "learning_rate": 1.7697004749507418e-06,
+ "loss": 0.4227,
+ "step": 7583
+ },
+ {
+ "epoch": 3.5858156028368793,
+ "grad_norm": 3.080061912536621,
+ "learning_rate": 1.769103880969198e-06,
+ "loss": 0.4237,
+ "step": 7584
+ },
+ {
+ "epoch": 3.58628841607565,
+ "grad_norm": 3.037505865097046,
+ "learning_rate": 1.7685073324973506e-06,
+ "loss": 0.3902,
+ "step": 7585
+ },
+ {
+ "epoch": 3.586761229314421,
+ "grad_norm": 3.6563873291015625,
+ "learning_rate": 1.7679108295723436e-06,
+ "loss": 0.3956,
+ "step": 7586
+ },
+ {
+ "epoch": 3.5872340425531917,
+ "grad_norm": 3.158935546875,
+ "learning_rate": 1.76731437223132e-06,
+ "loss": 0.3898,
+ "step": 7587
+ },
+ {
+ "epoch": 3.587706855791962,
+ "grad_norm": 3.059199571609497,
+ "learning_rate": 1.7667179605114176e-06,
+ "loss": 0.4183,
+ "step": 7588
+ },
+ {
+ "epoch": 3.588179669030733,
+ "grad_norm": 2.8123233318328857,
+ "learning_rate": 1.7661215944497716e-06,
+ "loss": 0.3731,
+ "step": 7589
+ },
+ {
+ "epoch": 3.5886524822695036,
+ "grad_norm": 3.094287633895874,
+ "learning_rate": 1.7655252740835169e-06,
+ "loss": 0.4562,
+ "step": 7590
+ },
+ {
+ "epoch": 3.589125295508274,
+ "grad_norm": 2.886833667755127,
+ "learning_rate": 1.7649289994497822e-06,
+ "loss": 0.4178,
+ "step": 7591
+ },
+ {
+ "epoch": 3.5895981087470448,
+ "grad_norm": 3.3040647506713867,
+ "learning_rate": 1.764332770585696e-06,
+ "loss": 0.4311,
+ "step": 7592
+ },
+ {
+ "epoch": 3.5900709219858156,
+ "grad_norm": 2.7948951721191406,
+ "learning_rate": 1.7637365875283827e-06,
+ "loss": 0.3704,
+ "step": 7593
+ },
+ {
+ "epoch": 3.5905437352245864,
+ "grad_norm": 3.092221975326538,
+ "learning_rate": 1.7631404503149623e-06,
+ "loss": 0.4166,
+ "step": 7594
+ },
+ {
+ "epoch": 3.591016548463357,
+ "grad_norm": 3.6018600463867188,
+ "learning_rate": 1.7625443589825564e-06,
+ "loss": 0.4251,
+ "step": 7595
+ },
+ {
+ "epoch": 3.5914893617021275,
+ "grad_norm": 2.708017110824585,
+ "learning_rate": 1.7619483135682791e-06,
+ "loss": 0.3775,
+ "step": 7596
+ },
+ {
+ "epoch": 3.5919621749408983,
+ "grad_norm": 2.8069381713867188,
+ "learning_rate": 1.7613523141092438e-06,
+ "loss": 0.3929,
+ "step": 7597
+ },
+ {
+ "epoch": 3.592434988179669,
+ "grad_norm": 3.097787380218506,
+ "learning_rate": 1.7607563606425616e-06,
+ "loss": 0.3992,
+ "step": 7598
+ },
+ {
+ "epoch": 3.5929078014184395,
+ "grad_norm": 2.9691715240478516,
+ "learning_rate": 1.7601604532053385e-06,
+ "loss": 0.4001,
+ "step": 7599
+ },
+ {
+ "epoch": 3.5933806146572103,
+ "grad_norm": 2.5511624813079834,
+ "learning_rate": 1.7595645918346807e-06,
+ "loss": 0.3136,
+ "step": 7600
+ },
+ {
+ "epoch": 3.593853427895981,
+ "grad_norm": 2.4688427448272705,
+ "learning_rate": 1.7589687765676891e-06,
+ "loss": 0.3922,
+ "step": 7601
+ },
+ {
+ "epoch": 3.594326241134752,
+ "grad_norm": 3.004023790359497,
+ "learning_rate": 1.7583730074414613e-06,
+ "loss": 0.4203,
+ "step": 7602
+ },
+ {
+ "epoch": 3.5947990543735227,
+ "grad_norm": 2.902641773223877,
+ "learning_rate": 1.7577772844930957e-06,
+ "loss": 0.3855,
+ "step": 7603
+ },
+ {
+ "epoch": 3.595271867612293,
+ "grad_norm": 3.851375102996826,
+ "learning_rate": 1.7571816077596826e-06,
+ "loss": 0.3769,
+ "step": 7604
+ },
+ {
+ "epoch": 3.595744680851064,
+ "grad_norm": 3.03249192237854,
+ "learning_rate": 1.756585977278315e-06,
+ "loss": 0.3448,
+ "step": 7605
+ },
+ {
+ "epoch": 3.5962174940898346,
+ "grad_norm": 2.992363214492798,
+ "learning_rate": 1.7559903930860789e-06,
+ "loss": 0.3893,
+ "step": 7606
+ },
+ {
+ "epoch": 3.596690307328605,
+ "grad_norm": 2.9322855472564697,
+ "learning_rate": 1.7553948552200577e-06,
+ "loss": 0.4337,
+ "step": 7607
+ },
+ {
+ "epoch": 3.5971631205673757,
+ "grad_norm": 3.2564096450805664,
+ "learning_rate": 1.7547993637173347e-06,
+ "loss": 0.3943,
+ "step": 7608
+ },
+ {
+ "epoch": 3.5976359338061465,
+ "grad_norm": 2.9988484382629395,
+ "learning_rate": 1.7542039186149867e-06,
+ "loss": 0.3421,
+ "step": 7609
+ },
+ {
+ "epoch": 3.5981087470449173,
+ "grad_norm": 2.8188817501068115,
+ "learning_rate": 1.7536085199500914e-06,
+ "loss": 0.3657,
+ "step": 7610
+ },
+ {
+ "epoch": 3.598581560283688,
+ "grad_norm": 3.0583255290985107,
+ "learning_rate": 1.7530131677597206e-06,
+ "loss": 0.4036,
+ "step": 7611
+ },
+ {
+ "epoch": 3.5990543735224585,
+ "grad_norm": 2.8700921535491943,
+ "learning_rate": 1.7524178620809435e-06,
+ "loss": 0.3928,
+ "step": 7612
+ },
+ {
+ "epoch": 3.5995271867612293,
+ "grad_norm": 3.4497945308685303,
+ "learning_rate": 1.751822602950829e-06,
+ "loss": 0.3517,
+ "step": 7613
+ },
+ {
+ "epoch": 3.6,
+ "grad_norm": 3.334191083908081,
+ "learning_rate": 1.75122739040644e-06,
+ "loss": 0.3414,
+ "step": 7614
+ },
+ {
+ "epoch": 3.6004728132387704,
+ "grad_norm": 3.1435158252716064,
+ "learning_rate": 1.7506322244848387e-06,
+ "loss": 0.4075,
+ "step": 7615
+ },
+ {
+ "epoch": 3.6009456264775412,
+ "grad_norm": 3.178990125656128,
+ "learning_rate": 1.7500371052230824e-06,
+ "loss": 0.4688,
+ "step": 7616
+ },
+ {
+ "epoch": 3.601418439716312,
+ "grad_norm": 2.9292044639587402,
+ "learning_rate": 1.7494420326582267e-06,
+ "loss": 0.3882,
+ "step": 7617
+ },
+ {
+ "epoch": 3.601891252955083,
+ "grad_norm": 2.6899197101593018,
+ "learning_rate": 1.7488470068273256e-06,
+ "loss": 0.3916,
+ "step": 7618
+ },
+ {
+ "epoch": 3.6023640661938536,
+ "grad_norm": 2.8319191932678223,
+ "learning_rate": 1.7482520277674273e-06,
+ "loss": 0.3924,
+ "step": 7619
+ },
+ {
+ "epoch": 3.602836879432624,
+ "grad_norm": 2.74589204788208,
+ "learning_rate": 1.747657095515578e-06,
+ "loss": 0.2911,
+ "step": 7620
+ },
+ {
+ "epoch": 3.603309692671395,
+ "grad_norm": 2.857028007507324,
+ "learning_rate": 1.7470622101088233e-06,
+ "loss": 0.3618,
+ "step": 7621
+ },
+ {
+ "epoch": 3.6037825059101656,
+ "grad_norm": 3.3715617656707764,
+ "learning_rate": 1.746467371584203e-06,
+ "loss": 0.4186,
+ "step": 7622
+ },
+ {
+ "epoch": 3.604255319148936,
+ "grad_norm": 2.839526414871216,
+ "learning_rate": 1.745872579978755e-06,
+ "loss": 0.4088,
+ "step": 7623
+ },
+ {
+ "epoch": 3.6047281323877067,
+ "grad_norm": 3.7689156532287598,
+ "learning_rate": 1.7452778353295155e-06,
+ "loss": 0.4748,
+ "step": 7624
+ },
+ {
+ "epoch": 3.6052009456264775,
+ "grad_norm": 2.9345123767852783,
+ "learning_rate": 1.7446831376735152e-06,
+ "loss": 0.4117,
+ "step": 7625
+ },
+ {
+ "epoch": 3.6056737588652483,
+ "grad_norm": 2.7898924350738525,
+ "learning_rate": 1.7440884870477845e-06,
+ "loss": 0.3515,
+ "step": 7626
+ },
+ {
+ "epoch": 3.606146572104019,
+ "grad_norm": 3.4268569946289062,
+ "learning_rate": 1.7434938834893481e-06,
+ "loss": 0.4051,
+ "step": 7627
+ },
+ {
+ "epoch": 3.6066193853427895,
+ "grad_norm": 3.019066095352173,
+ "learning_rate": 1.7428993270352311e-06,
+ "loss": 0.4128,
+ "step": 7628
+ },
+ {
+ "epoch": 3.6070921985815603,
+ "grad_norm": 3.1277568340301514,
+ "learning_rate": 1.742304817722454e-06,
+ "loss": 0.37,
+ "step": 7629
+ },
+ {
+ "epoch": 3.607565011820331,
+ "grad_norm": 2.924818277359009,
+ "learning_rate": 1.7417103555880318e-06,
+ "loss": 0.3792,
+ "step": 7630
+ },
+ {
+ "epoch": 3.6080378250591014,
+ "grad_norm": 2.664699077606201,
+ "learning_rate": 1.7411159406689821e-06,
+ "loss": 0.3584,
+ "step": 7631
+ },
+ {
+ "epoch": 3.608510638297872,
+ "grad_norm": 3.223729133605957,
+ "learning_rate": 1.7405215730023144e-06,
+ "loss": 0.3956,
+ "step": 7632
+ },
+ {
+ "epoch": 3.608983451536643,
+ "grad_norm": 2.934225559234619,
+ "learning_rate": 1.7399272526250388e-06,
+ "loss": 0.4179,
+ "step": 7633
+ },
+ {
+ "epoch": 3.609456264775414,
+ "grad_norm": 2.833798885345459,
+ "learning_rate": 1.7393329795741603e-06,
+ "loss": 0.3283,
+ "step": 7634
+ },
+ {
+ "epoch": 3.6099290780141846,
+ "grad_norm": 3.008798837661743,
+ "learning_rate": 1.738738753886681e-06,
+ "loss": 0.3704,
+ "step": 7635
+ },
+ {
+ "epoch": 3.610401891252955,
+ "grad_norm": 2.8714520931243896,
+ "learning_rate": 1.7381445755996023e-06,
+ "loss": 0.3646,
+ "step": 7636
+ },
+ {
+ "epoch": 3.6108747044917258,
+ "grad_norm": 3.083554267883301,
+ "learning_rate": 1.7375504447499193e-06,
+ "loss": 0.3785,
+ "step": 7637
+ },
+ {
+ "epoch": 3.6113475177304966,
+ "grad_norm": 3.270347833633423,
+ "learning_rate": 1.7369563613746277e-06,
+ "loss": 0.4426,
+ "step": 7638
+ },
+ {
+ "epoch": 3.611820330969267,
+ "grad_norm": 2.7754862308502197,
+ "learning_rate": 1.7363623255107175e-06,
+ "loss": 0.3448,
+ "step": 7639
+ },
+ {
+ "epoch": 3.6122931442080377,
+ "grad_norm": 2.98140025138855,
+ "learning_rate": 1.7357683371951767e-06,
+ "loss": 0.4027,
+ "step": 7640
+ },
+ {
+ "epoch": 3.6127659574468085,
+ "grad_norm": 3.1640074253082275,
+ "learning_rate": 1.7351743964649908e-06,
+ "loss": 0.3913,
+ "step": 7641
+ },
+ {
+ "epoch": 3.6132387706855793,
+ "grad_norm": 2.758202075958252,
+ "learning_rate": 1.7345805033571417e-06,
+ "loss": 0.4148,
+ "step": 7642
+ },
+ {
+ "epoch": 3.61371158392435,
+ "grad_norm": 3.1030571460723877,
+ "learning_rate": 1.7339866579086074e-06,
+ "loss": 0.4002,
+ "step": 7643
+ },
+ {
+ "epoch": 3.6141843971631205,
+ "grad_norm": 3.2414135932922363,
+ "learning_rate": 1.733392860156366e-06,
+ "loss": 0.4732,
+ "step": 7644
+ },
+ {
+ "epoch": 3.6146572104018913,
+ "grad_norm": 2.8720390796661377,
+ "learning_rate": 1.7327991101373886e-06,
+ "loss": 0.4112,
+ "step": 7645
+ },
+ {
+ "epoch": 3.615130023640662,
+ "grad_norm": 3.0104875564575195,
+ "learning_rate": 1.7322054078886474e-06,
+ "loss": 0.3934,
+ "step": 7646
+ },
+ {
+ "epoch": 3.6156028368794324,
+ "grad_norm": 2.8615126609802246,
+ "learning_rate": 1.7316117534471091e-06,
+ "loss": 0.3437,
+ "step": 7647
+ },
+ {
+ "epoch": 3.616075650118203,
+ "grad_norm": 2.8283586502075195,
+ "learning_rate": 1.7310181468497369e-06,
+ "loss": 0.374,
+ "step": 7648
+ },
+ {
+ "epoch": 3.616548463356974,
+ "grad_norm": 3.2289321422576904,
+ "learning_rate": 1.7304245881334935e-06,
+ "loss": 0.3899,
+ "step": 7649
+ },
+ {
+ "epoch": 3.617021276595745,
+ "grad_norm": 3.126882791519165,
+ "learning_rate": 1.7298310773353356e-06,
+ "loss": 0.388,
+ "step": 7650
+ },
+ {
+ "epoch": 3.6174940898345156,
+ "grad_norm": 3.013657569885254,
+ "learning_rate": 1.7292376144922201e-06,
+ "loss": 0.379,
+ "step": 7651
+ },
+ {
+ "epoch": 3.617966903073286,
+ "grad_norm": 3.070192337036133,
+ "learning_rate": 1.7286441996410989e-06,
+ "loss": 0.3801,
+ "step": 7652
+ },
+ {
+ "epoch": 3.6184397163120567,
+ "grad_norm": 2.805380344390869,
+ "learning_rate": 1.7280508328189199e-06,
+ "loss": 0.3577,
+ "step": 7653
+ },
+ {
+ "epoch": 3.6189125295508275,
+ "grad_norm": 3.2853379249572754,
+ "learning_rate": 1.7274575140626318e-06,
+ "loss": 0.4168,
+ "step": 7654
+ },
+ {
+ "epoch": 3.619385342789598,
+ "grad_norm": 3.16316819190979,
+ "learning_rate": 1.7268642434091761e-06,
+ "loss": 0.425,
+ "step": 7655
+ },
+ {
+ "epoch": 3.6198581560283687,
+ "grad_norm": 3.2971179485321045,
+ "learning_rate": 1.7262710208954947e-06,
+ "loss": 0.3884,
+ "step": 7656
+ },
+ {
+ "epoch": 3.6203309692671395,
+ "grad_norm": 3.1823747158050537,
+ "learning_rate": 1.725677846558524e-06,
+ "loss": 0.3419,
+ "step": 7657
+ },
+ {
+ "epoch": 3.6208037825059103,
+ "grad_norm": 3.114654779434204,
+ "learning_rate": 1.7250847204351973e-06,
+ "loss": 0.3951,
+ "step": 7658
+ },
+ {
+ "epoch": 3.621276595744681,
+ "grad_norm": 3.0272440910339355,
+ "learning_rate": 1.7244916425624482e-06,
+ "loss": 0.4102,
+ "step": 7659
+ },
+ {
+ "epoch": 3.6217494089834514,
+ "grad_norm": 2.973611354827881,
+ "learning_rate": 1.7238986129772035e-06,
+ "loss": 0.3827,
+ "step": 7660
+ },
+ {
+ "epoch": 3.6222222222222222,
+ "grad_norm": 3.063713312149048,
+ "learning_rate": 1.7233056317163894e-06,
+ "loss": 0.3909,
+ "step": 7661
+ },
+ {
+ "epoch": 3.622695035460993,
+ "grad_norm": 3.203725576400757,
+ "learning_rate": 1.7227126988169283e-06,
+ "loss": 0.3933,
+ "step": 7662
+ },
+ {
+ "epoch": 3.6231678486997634,
+ "grad_norm": 2.945887327194214,
+ "learning_rate": 1.7221198143157386e-06,
+ "loss": 0.3722,
+ "step": 7663
+ },
+ {
+ "epoch": 3.623640661938534,
+ "grad_norm": 3.042691469192505,
+ "learning_rate": 1.7215269782497373e-06,
+ "loss": 0.4108,
+ "step": 7664
+ },
+ {
+ "epoch": 3.624113475177305,
+ "grad_norm": 2.8496763706207275,
+ "learning_rate": 1.720934190655837e-06,
+ "loss": 0.3867,
+ "step": 7665
+ },
+ {
+ "epoch": 3.6245862884160758,
+ "grad_norm": 2.7017154693603516,
+ "learning_rate": 1.7203414515709493e-06,
+ "loss": 0.3246,
+ "step": 7666
+ },
+ {
+ "epoch": 3.6250591016548466,
+ "grad_norm": 2.66630482673645,
+ "learning_rate": 1.7197487610319808e-06,
+ "loss": 0.365,
+ "step": 7667
+ },
+ {
+ "epoch": 3.625531914893617,
+ "grad_norm": 2.8724591732025146,
+ "learning_rate": 1.7191561190758348e-06,
+ "loss": 0.3361,
+ "step": 7668
+ },
+ {
+ "epoch": 3.6260047281323877,
+ "grad_norm": 3.1413803100585938,
+ "learning_rate": 1.7185635257394143e-06,
+ "loss": 0.3949,
+ "step": 7669
+ },
+ {
+ "epoch": 3.6264775413711585,
+ "grad_norm": 2.9866268634796143,
+ "learning_rate": 1.7179709810596163e-06,
+ "loss": 0.3728,
+ "step": 7670
+ },
+ {
+ "epoch": 3.626950354609929,
+ "grad_norm": 3.003497838973999,
+ "learning_rate": 1.717378485073336e-06,
+ "loss": 0.384,
+ "step": 7671
+ },
+ {
+ "epoch": 3.6274231678486997,
+ "grad_norm": 3.0043468475341797,
+ "learning_rate": 1.716786037817466e-06,
+ "loss": 0.3432,
+ "step": 7672
+ },
+ {
+ "epoch": 3.6278959810874705,
+ "grad_norm": 3.216550827026367,
+ "learning_rate": 1.7161936393288945e-06,
+ "loss": 0.3963,
+ "step": 7673
+ },
+ {
+ "epoch": 3.6283687943262413,
+ "grad_norm": 3.1091387271881104,
+ "learning_rate": 1.715601289644509e-06,
+ "loss": 0.4347,
+ "step": 7674
+ },
+ {
+ "epoch": 3.628841607565012,
+ "grad_norm": 3.2288286685943604,
+ "learning_rate": 1.7150089888011916e-06,
+ "loss": 0.4291,
+ "step": 7675
+ },
+ {
+ "epoch": 3.6293144208037824,
+ "grad_norm": 2.943941831588745,
+ "learning_rate": 1.7144167368358216e-06,
+ "loss": 0.3643,
+ "step": 7676
+ },
+ {
+ "epoch": 3.629787234042553,
+ "grad_norm": 2.819683313369751,
+ "learning_rate": 1.7138245337852774e-06,
+ "loss": 0.4051,
+ "step": 7677
+ },
+ {
+ "epoch": 3.630260047281324,
+ "grad_norm": 2.9988269805908203,
+ "learning_rate": 1.713232379686432e-06,
+ "loss": 0.4102,
+ "step": 7678
+ },
+ {
+ "epoch": 3.6307328605200944,
+ "grad_norm": 3.0041310787200928,
+ "learning_rate": 1.7126402745761566e-06,
+ "loss": 0.3854,
+ "step": 7679
+ },
+ {
+ "epoch": 3.631205673758865,
+ "grad_norm": 2.8700194358825684,
+ "learning_rate": 1.7120482184913192e-06,
+ "loss": 0.3441,
+ "step": 7680
+ },
+ {
+ "epoch": 3.631678486997636,
+ "grad_norm": 3.5275180339813232,
+ "learning_rate": 1.7114562114687833e-06,
+ "loss": 0.3808,
+ "step": 7681
+ },
+ {
+ "epoch": 3.6321513002364068,
+ "grad_norm": 3.182326078414917,
+ "learning_rate": 1.710864253545412e-06,
+ "loss": 0.4178,
+ "step": 7682
+ },
+ {
+ "epoch": 3.6326241134751776,
+ "grad_norm": 3.0514512062072754,
+ "learning_rate": 1.7102723447580627e-06,
+ "loss": 0.3527,
+ "step": 7683
+ },
+ {
+ "epoch": 3.633096926713948,
+ "grad_norm": 2.8293066024780273,
+ "learning_rate": 1.7096804851435922e-06,
+ "loss": 0.3723,
+ "step": 7684
+ },
+ {
+ "epoch": 3.6335697399527187,
+ "grad_norm": 2.9601097106933594,
+ "learning_rate": 1.709088674738853e-06,
+ "loss": 0.3704,
+ "step": 7685
+ },
+ {
+ "epoch": 3.6340425531914895,
+ "grad_norm": 2.8070995807647705,
+ "learning_rate": 1.7084969135806933e-06,
+ "loss": 0.346,
+ "step": 7686
+ },
+ {
+ "epoch": 3.63451536643026,
+ "grad_norm": 3.0162715911865234,
+ "learning_rate": 1.70790520170596e-06,
+ "loss": 0.39,
+ "step": 7687
+ },
+ {
+ "epoch": 3.6349881796690307,
+ "grad_norm": 3.018763780593872,
+ "learning_rate": 1.7073135391514967e-06,
+ "loss": 0.4621,
+ "step": 7688
+ },
+ {
+ "epoch": 3.6354609929078014,
+ "grad_norm": 2.963604688644409,
+ "learning_rate": 1.706721925954144e-06,
+ "loss": 0.339,
+ "step": 7689
+ },
+ {
+ "epoch": 3.6359338061465722,
+ "grad_norm": 2.8532896041870117,
+ "learning_rate": 1.7061303621507383e-06,
+ "loss": 0.3915,
+ "step": 7690
+ },
+ {
+ "epoch": 3.636406619385343,
+ "grad_norm": 3.248006820678711,
+ "learning_rate": 1.7055388477781133e-06,
+ "loss": 0.3712,
+ "step": 7691
+ },
+ {
+ "epoch": 3.6368794326241134,
+ "grad_norm": 3.2195777893066406,
+ "learning_rate": 1.7049473828731011e-06,
+ "loss": 0.4358,
+ "step": 7692
+ },
+ {
+ "epoch": 3.637352245862884,
+ "grad_norm": 2.7190768718719482,
+ "learning_rate": 1.7043559674725296e-06,
+ "loss": 0.341,
+ "step": 7693
+ },
+ {
+ "epoch": 3.637825059101655,
+ "grad_norm": 2.6047232151031494,
+ "learning_rate": 1.7037646016132223e-06,
+ "loss": 0.3513,
+ "step": 7694
+ },
+ {
+ "epoch": 3.6382978723404253,
+ "grad_norm": 3.0824201107025146,
+ "learning_rate": 1.7031732853320026e-06,
+ "loss": 0.4097,
+ "step": 7695
+ },
+ {
+ "epoch": 3.638770685579196,
+ "grad_norm": 2.845461130142212,
+ "learning_rate": 1.7025820186656883e-06,
+ "loss": 0.3395,
+ "step": 7696
+ },
+ {
+ "epoch": 3.639243498817967,
+ "grad_norm": 2.937863826751709,
+ "learning_rate": 1.7019908016510953e-06,
+ "loss": 0.395,
+ "step": 7697
+ },
+ {
+ "epoch": 3.6397163120567377,
+ "grad_norm": 3.349780559539795,
+ "learning_rate": 1.701399634325036e-06,
+ "loss": 0.3889,
+ "step": 7698
+ },
+ {
+ "epoch": 3.6401891252955085,
+ "grad_norm": 2.8527066707611084,
+ "learning_rate": 1.7008085167243187e-06,
+ "loss": 0.3753,
+ "step": 7699
+ },
+ {
+ "epoch": 3.640661938534279,
+ "grad_norm": 2.8112385272979736,
+ "learning_rate": 1.7002174488857517e-06,
+ "loss": 0.3912,
+ "step": 7700
+ },
+ {
+ "epoch": 3.6411347517730497,
+ "grad_norm": 2.731933832168579,
+ "learning_rate": 1.6996264308461363e-06,
+ "loss": 0.4142,
+ "step": 7701
+ },
+ {
+ "epoch": 3.6416075650118205,
+ "grad_norm": 3.70465350151062,
+ "learning_rate": 1.6990354626422744e-06,
+ "loss": 0.4089,
+ "step": 7702
+ },
+ {
+ "epoch": 3.642080378250591,
+ "grad_norm": 2.8656258583068848,
+ "learning_rate": 1.698444544310962e-06,
+ "loss": 0.3771,
+ "step": 7703
+ },
+ {
+ "epoch": 3.6425531914893616,
+ "grad_norm": 2.878830671310425,
+ "learning_rate": 1.697853675888993e-06,
+ "loss": 0.3754,
+ "step": 7704
+ },
+ {
+ "epoch": 3.6430260047281324,
+ "grad_norm": 3.440528154373169,
+ "learning_rate": 1.6972628574131586e-06,
+ "loss": 0.4543,
+ "step": 7705
+ },
+ {
+ "epoch": 3.6434988179669032,
+ "grad_norm": 2.70736026763916,
+ "learning_rate": 1.6966720889202451e-06,
+ "loss": 0.4049,
+ "step": 7706
+ },
+ {
+ "epoch": 3.643971631205674,
+ "grad_norm": 2.787992238998413,
+ "learning_rate": 1.6960813704470391e-06,
+ "loss": 0.3854,
+ "step": 7707
+ },
+ {
+ "epoch": 3.6444444444444444,
+ "grad_norm": 2.631490707397461,
+ "learning_rate": 1.6954907020303213e-06,
+ "loss": 0.3775,
+ "step": 7708
+ },
+ {
+ "epoch": 3.644917257683215,
+ "grad_norm": 3.052255392074585,
+ "learning_rate": 1.6949000837068685e-06,
+ "loss": 0.3873,
+ "step": 7709
+ },
+ {
+ "epoch": 3.645390070921986,
+ "grad_norm": 2.7443203926086426,
+ "learning_rate": 1.6943095155134586e-06,
+ "loss": 0.3362,
+ "step": 7710
+ },
+ {
+ "epoch": 3.6458628841607563,
+ "grad_norm": 2.931688070297241,
+ "learning_rate": 1.6937189974868618e-06,
+ "loss": 0.3839,
+ "step": 7711
+ },
+ {
+ "epoch": 3.646335697399527,
+ "grad_norm": 2.950242757797241,
+ "learning_rate": 1.6931285296638479e-06,
+ "loss": 0.3552,
+ "step": 7712
+ },
+ {
+ "epoch": 3.646808510638298,
+ "grad_norm": 2.940735340118408,
+ "learning_rate": 1.6925381120811823e-06,
+ "loss": 0.3881,
+ "step": 7713
+ },
+ {
+ "epoch": 3.6472813238770687,
+ "grad_norm": 2.771355390548706,
+ "learning_rate": 1.6919477447756273e-06,
+ "loss": 0.3578,
+ "step": 7714
+ },
+ {
+ "epoch": 3.6477541371158395,
+ "grad_norm": 2.919004201889038,
+ "learning_rate": 1.6913574277839435e-06,
+ "loss": 0.3971,
+ "step": 7715
+ },
+ {
+ "epoch": 3.64822695035461,
+ "grad_norm": 3.293705463409424,
+ "learning_rate": 1.6907671611428872e-06,
+ "loss": 0.422,
+ "step": 7716
+ },
+ {
+ "epoch": 3.6486997635933807,
+ "grad_norm": 2.744239091873169,
+ "learning_rate": 1.6901769448892103e-06,
+ "loss": 0.398,
+ "step": 7717
+ },
+ {
+ "epoch": 3.6491725768321515,
+ "grad_norm": 3.1726129055023193,
+ "learning_rate": 1.689586779059665e-06,
+ "loss": 0.39,
+ "step": 7718
+ },
+ {
+ "epoch": 3.649645390070922,
+ "grad_norm": 3.146743059158325,
+ "learning_rate": 1.688996663690997e-06,
+ "loss": 0.4059,
+ "step": 7719
+ },
+ {
+ "epoch": 3.6501182033096926,
+ "grad_norm": 2.941025495529175,
+ "learning_rate": 1.688406598819951e-06,
+ "loss": 0.3479,
+ "step": 7720
+ },
+ {
+ "epoch": 3.6505910165484634,
+ "grad_norm": 3.3480939865112305,
+ "learning_rate": 1.6878165844832679e-06,
+ "loss": 0.4141,
+ "step": 7721
+ },
+ {
+ "epoch": 3.651063829787234,
+ "grad_norm": 2.9145030975341797,
+ "learning_rate": 1.6872266207176833e-06,
+ "loss": 0.3497,
+ "step": 7722
+ },
+ {
+ "epoch": 3.651536643026005,
+ "grad_norm": 3.119502067565918,
+ "learning_rate": 1.686636707559934e-06,
+ "loss": 0.424,
+ "step": 7723
+ },
+ {
+ "epoch": 3.6520094562647754,
+ "grad_norm": 3.0867667198181152,
+ "learning_rate": 1.6860468450467497e-06,
+ "loss": 0.3998,
+ "step": 7724
+ },
+ {
+ "epoch": 3.652482269503546,
+ "grad_norm": 2.9128987789154053,
+ "learning_rate": 1.6854570332148602e-06,
+ "loss": 0.4043,
+ "step": 7725
+ },
+ {
+ "epoch": 3.652955082742317,
+ "grad_norm": 2.9973206520080566,
+ "learning_rate": 1.6848672721009896e-06,
+ "loss": 0.3395,
+ "step": 7726
+ },
+ {
+ "epoch": 3.6534278959810873,
+ "grad_norm": 2.824916124343872,
+ "learning_rate": 1.6842775617418591e-06,
+ "loss": 0.4102,
+ "step": 7727
+ },
+ {
+ "epoch": 3.653900709219858,
+ "grad_norm": 2.7984440326690674,
+ "learning_rate": 1.6836879021741887e-06,
+ "loss": 0.3823,
+ "step": 7728
+ },
+ {
+ "epoch": 3.654373522458629,
+ "grad_norm": 2.8412179946899414,
+ "learning_rate": 1.6830982934346917e-06,
+ "loss": 0.3755,
+ "step": 7729
+ },
+ {
+ "epoch": 3.6548463356973997,
+ "grad_norm": 3.1677138805389404,
+ "learning_rate": 1.6825087355600836e-06,
+ "loss": 0.4224,
+ "step": 7730
+ },
+ {
+ "epoch": 3.65531914893617,
+ "grad_norm": 3.097085475921631,
+ "learning_rate": 1.6819192285870718e-06,
+ "loss": 0.4103,
+ "step": 7731
+ },
+ {
+ "epoch": 3.655791962174941,
+ "grad_norm": 2.9802496433258057,
+ "learning_rate": 1.6813297725523613e-06,
+ "loss": 0.4297,
+ "step": 7732
+ },
+ {
+ "epoch": 3.6562647754137116,
+ "grad_norm": 3.0135059356689453,
+ "learning_rate": 1.680740367492657e-06,
+ "loss": 0.4526,
+ "step": 7733
+ },
+ {
+ "epoch": 3.656737588652482,
+ "grad_norm": 2.7776739597320557,
+ "learning_rate": 1.6801510134446575e-06,
+ "loss": 0.3924,
+ "step": 7734
+ },
+ {
+ "epoch": 3.657210401891253,
+ "grad_norm": 2.7500126361846924,
+ "learning_rate": 1.6795617104450595e-06,
+ "loss": 0.3785,
+ "step": 7735
+ },
+ {
+ "epoch": 3.6576832151300236,
+ "grad_norm": 3.494142770767212,
+ "learning_rate": 1.6789724585305566e-06,
+ "loss": 0.3483,
+ "step": 7736
+ },
+ {
+ "epoch": 3.6581560283687944,
+ "grad_norm": 3.055081605911255,
+ "learning_rate": 1.6783832577378377e-06,
+ "loss": 0.4481,
+ "step": 7737
+ },
+ {
+ "epoch": 3.658628841607565,
+ "grad_norm": 2.781412124633789,
+ "learning_rate": 1.6777941081035914e-06,
+ "loss": 0.3969,
+ "step": 7738
+ },
+ {
+ "epoch": 3.6591016548463355,
+ "grad_norm": 3.1672184467315674,
+ "learning_rate": 1.677205009664501e-06,
+ "loss": 0.3959,
+ "step": 7739
+ },
+ {
+ "epoch": 3.6595744680851063,
+ "grad_norm": 3.0597715377807617,
+ "learning_rate": 1.6766159624572458e-06,
+ "loss": 0.418,
+ "step": 7740
+ },
+ {
+ "epoch": 3.660047281323877,
+ "grad_norm": 3.2906267642974854,
+ "learning_rate": 1.676026966518505e-06,
+ "loss": 0.4335,
+ "step": 7741
+ },
+ {
+ "epoch": 3.6605200945626475,
+ "grad_norm": 3.2519290447235107,
+ "learning_rate": 1.6754380218849515e-06,
+ "loss": 0.3786,
+ "step": 7742
+ },
+ {
+ "epoch": 3.6609929078014183,
+ "grad_norm": 3.24716854095459,
+ "learning_rate": 1.6748491285932572e-06,
+ "loss": 0.3599,
+ "step": 7743
+ },
+ {
+ "epoch": 3.661465721040189,
+ "grad_norm": 3.2940993309020996,
+ "learning_rate": 1.6742602866800897e-06,
+ "loss": 0.3934,
+ "step": 7744
+ },
+ {
+ "epoch": 3.66193853427896,
+ "grad_norm": 2.917409896850586,
+ "learning_rate": 1.6736714961821124e-06,
+ "loss": 0.4197,
+ "step": 7745
+ },
+ {
+ "epoch": 3.6624113475177307,
+ "grad_norm": 3.005068063735962,
+ "learning_rate": 1.6730827571359887e-06,
+ "loss": 0.4239,
+ "step": 7746
+ },
+ {
+ "epoch": 3.662884160756501,
+ "grad_norm": 2.751880168914795,
+ "learning_rate": 1.6724940695783745e-06,
+ "loss": 0.4257,
+ "step": 7747
+ },
+ {
+ "epoch": 3.663356973995272,
+ "grad_norm": 3.090670585632324,
+ "learning_rate": 1.6719054335459273e-06,
+ "loss": 0.3686,
+ "step": 7748
+ },
+ {
+ "epoch": 3.6638297872340426,
+ "grad_norm": 3.250251293182373,
+ "learning_rate": 1.6713168490752974e-06,
+ "loss": 0.4249,
+ "step": 7749
+ },
+ {
+ "epoch": 3.664302600472813,
+ "grad_norm": 2.8662827014923096,
+ "learning_rate": 1.6707283162031335e-06,
+ "loss": 0.3692,
+ "step": 7750
+ },
+ {
+ "epoch": 3.6647754137115838,
+ "grad_norm": 2.8709118366241455,
+ "learning_rate": 1.6701398349660813e-06,
+ "loss": 0.3929,
+ "step": 7751
+ },
+ {
+ "epoch": 3.6652482269503546,
+ "grad_norm": 2.992035388946533,
+ "learning_rate": 1.6695514054007822e-06,
+ "loss": 0.4131,
+ "step": 7752
+ },
+ {
+ "epoch": 3.6657210401891254,
+ "grad_norm": 3.0427589416503906,
+ "learning_rate": 1.668963027543876e-06,
+ "loss": 0.387,
+ "step": 7753
+ },
+ {
+ "epoch": 3.666193853427896,
+ "grad_norm": 3.0147807598114014,
+ "learning_rate": 1.6683747014319987e-06,
+ "loss": 0.3648,
+ "step": 7754
+ },
+ {
+ "epoch": 3.6666666666666665,
+ "grad_norm": 2.5483829975128174,
+ "learning_rate": 1.6677864271017811e-06,
+ "loss": 0.3643,
+ "step": 7755
+ },
+ {
+ "epoch": 3.6671394799054373,
+ "grad_norm": 2.7661986351013184,
+ "learning_rate": 1.6671982045898544e-06,
+ "loss": 0.3731,
+ "step": 7756
+ },
+ {
+ "epoch": 3.667612293144208,
+ "grad_norm": 2.778036117553711,
+ "learning_rate": 1.666610033932843e-06,
+ "loss": 0.3744,
+ "step": 7757
+ },
+ {
+ "epoch": 3.6680851063829785,
+ "grad_norm": 2.9028329849243164,
+ "learning_rate": 1.6660219151673712e-06,
+ "loss": 0.4286,
+ "step": 7758
+ },
+ {
+ "epoch": 3.6685579196217493,
+ "grad_norm": 2.826687812805176,
+ "learning_rate": 1.6654338483300575e-06,
+ "loss": 0.318,
+ "step": 7759
+ },
+ {
+ "epoch": 3.66903073286052,
+ "grad_norm": 2.7063660621643066,
+ "learning_rate": 1.6648458334575186e-06,
+ "loss": 0.3351,
+ "step": 7760
+ },
+ {
+ "epoch": 3.669503546099291,
+ "grad_norm": 2.708361864089966,
+ "learning_rate": 1.664257870586368e-06,
+ "loss": 0.376,
+ "step": 7761
+ },
+ {
+ "epoch": 3.6699763593380617,
+ "grad_norm": 3.1139161586761475,
+ "learning_rate": 1.6636699597532141e-06,
+ "loss": 0.3572,
+ "step": 7762
+ },
+ {
+ "epoch": 3.670449172576832,
+ "grad_norm": 3.0858285427093506,
+ "learning_rate": 1.6630821009946658e-06,
+ "loss": 0.4204,
+ "step": 7763
+ },
+ {
+ "epoch": 3.670921985815603,
+ "grad_norm": 3.5593984127044678,
+ "learning_rate": 1.6624942943473252e-06,
+ "loss": 0.463,
+ "step": 7764
+ },
+ {
+ "epoch": 3.6713947990543736,
+ "grad_norm": 2.863851308822632,
+ "learning_rate": 1.6619065398477921e-06,
+ "loss": 0.4272,
+ "step": 7765
+ },
+ {
+ "epoch": 3.671867612293144,
+ "grad_norm": 2.833399772644043,
+ "learning_rate": 1.6613188375326638e-06,
+ "loss": 0.3509,
+ "step": 7766
+ },
+ {
+ "epoch": 3.6723404255319148,
+ "grad_norm": 2.988948345184326,
+ "learning_rate": 1.6607311874385346e-06,
+ "loss": 0.3572,
+ "step": 7767
+ },
+ {
+ "epoch": 3.6728132387706856,
+ "grad_norm": 2.7349398136138916,
+ "learning_rate": 1.6601435896019936e-06,
+ "loss": 0.3875,
+ "step": 7768
+ },
+ {
+ "epoch": 3.6732860520094563,
+ "grad_norm": 2.8544445037841797,
+ "learning_rate": 1.659556044059629e-06,
+ "loss": 0.4057,
+ "step": 7769
+ },
+ {
+ "epoch": 3.673758865248227,
+ "grad_norm": 3.0341904163360596,
+ "learning_rate": 1.6589685508480235e-06,
+ "loss": 0.3935,
+ "step": 7770
+ },
+ {
+ "epoch": 3.6742316784869975,
+ "grad_norm": 2.7495710849761963,
+ "learning_rate": 1.6583811100037595e-06,
+ "loss": 0.352,
+ "step": 7771
+ },
+ {
+ "epoch": 3.6747044917257683,
+ "grad_norm": 3.258525848388672,
+ "learning_rate": 1.6577937215634133e-06,
+ "loss": 0.4414,
+ "step": 7772
+ },
+ {
+ "epoch": 3.675177304964539,
+ "grad_norm": 3.0686328411102295,
+ "learning_rate": 1.657206385563558e-06,
+ "loss": 0.353,
+ "step": 7773
+ },
+ {
+ "epoch": 3.6756501182033094,
+ "grad_norm": 3.1168248653411865,
+ "learning_rate": 1.6566191020407668e-06,
+ "loss": 0.4064,
+ "step": 7774
+ },
+ {
+ "epoch": 3.6761229314420802,
+ "grad_norm": 2.7837352752685547,
+ "learning_rate": 1.6560318710316053e-06,
+ "loss": 0.3956,
+ "step": 7775
+ },
+ {
+ "epoch": 3.676595744680851,
+ "grad_norm": 3.1514039039611816,
+ "learning_rate": 1.6554446925726391e-06,
+ "loss": 0.4044,
+ "step": 7776
+ },
+ {
+ "epoch": 3.677068557919622,
+ "grad_norm": 3.010352611541748,
+ "learning_rate": 1.6548575667004285e-06,
+ "loss": 0.4162,
+ "step": 7777
+ },
+ {
+ "epoch": 3.6775413711583926,
+ "grad_norm": 3.1727633476257324,
+ "learning_rate": 1.6542704934515308e-06,
+ "loss": 0.411,
+ "step": 7778
+ },
+ {
+ "epoch": 3.678014184397163,
+ "grad_norm": 3.6771271228790283,
+ "learning_rate": 1.6536834728625018e-06,
+ "loss": 0.4562,
+ "step": 7779
+ },
+ {
+ "epoch": 3.678486997635934,
+ "grad_norm": 2.9793131351470947,
+ "learning_rate": 1.6530965049698908e-06,
+ "loss": 0.4039,
+ "step": 7780
+ },
+ {
+ "epoch": 3.6789598108747046,
+ "grad_norm": 3.193751096725464,
+ "learning_rate": 1.6525095898102478e-06,
+ "loss": 0.4064,
+ "step": 7781
+ },
+ {
+ "epoch": 3.679432624113475,
+ "grad_norm": 2.6643173694610596,
+ "learning_rate": 1.6519227274201169e-06,
+ "loss": 0.3731,
+ "step": 7782
+ },
+ {
+ "epoch": 3.6799054373522457,
+ "grad_norm": 3.4855685234069824,
+ "learning_rate": 1.6513359178360384e-06,
+ "loss": 0.3815,
+ "step": 7783
+ },
+ {
+ "epoch": 3.6803782505910165,
+ "grad_norm": 3.320537567138672,
+ "learning_rate": 1.6507491610945514e-06,
+ "loss": 0.4065,
+ "step": 7784
+ },
+ {
+ "epoch": 3.6808510638297873,
+ "grad_norm": 3.2793102264404297,
+ "learning_rate": 1.6501624572321895e-06,
+ "loss": 0.429,
+ "step": 7785
+ },
+ {
+ "epoch": 3.681323877068558,
+ "grad_norm": 2.8609631061553955,
+ "learning_rate": 1.6495758062854854e-06,
+ "loss": 0.3881,
+ "step": 7786
+ },
+ {
+ "epoch": 3.6817966903073285,
+ "grad_norm": 2.956533193588257,
+ "learning_rate": 1.6489892082909675e-06,
+ "loss": 0.3662,
+ "step": 7787
+ },
+ {
+ "epoch": 3.6822695035460993,
+ "grad_norm": 2.949092149734497,
+ "learning_rate": 1.6484026632851591e-06,
+ "loss": 0.4021,
+ "step": 7788
+ },
+ {
+ "epoch": 3.68274231678487,
+ "grad_norm": 2.802572250366211,
+ "learning_rate": 1.6478161713045831e-06,
+ "loss": 0.3862,
+ "step": 7789
+ },
+ {
+ "epoch": 3.6832151300236404,
+ "grad_norm": 3.0064797401428223,
+ "learning_rate": 1.6472297323857578e-06,
+ "loss": 0.3495,
+ "step": 7790
+ },
+ {
+ "epoch": 3.6836879432624112,
+ "grad_norm": 2.74037766456604,
+ "learning_rate": 1.646643346565197e-06,
+ "loss": 0.362,
+ "step": 7791
+ },
+ {
+ "epoch": 3.684160756501182,
+ "grad_norm": 3.423326253890991,
+ "learning_rate": 1.646057013879414e-06,
+ "loss": 0.4383,
+ "step": 7792
+ },
+ {
+ "epoch": 3.684633569739953,
+ "grad_norm": 2.7426443099975586,
+ "learning_rate": 1.645470734364915e-06,
+ "loss": 0.3741,
+ "step": 7793
+ },
+ {
+ "epoch": 3.6851063829787236,
+ "grad_norm": 2.879296064376831,
+ "learning_rate": 1.6448845080582077e-06,
+ "loss": 0.4003,
+ "step": 7794
+ },
+ {
+ "epoch": 3.685579196217494,
+ "grad_norm": 2.6821188926696777,
+ "learning_rate": 1.6442983349957924e-06,
+ "loss": 0.3545,
+ "step": 7795
+ },
+ {
+ "epoch": 3.6860520094562648,
+ "grad_norm": 2.714059829711914,
+ "learning_rate": 1.6437122152141665e-06,
+ "loss": 0.3702,
+ "step": 7796
+ },
+ {
+ "epoch": 3.6865248226950356,
+ "grad_norm": 2.593811511993408,
+ "learning_rate": 1.6431261487498274e-06,
+ "loss": 0.3512,
+ "step": 7797
+ },
+ {
+ "epoch": 3.686997635933806,
+ "grad_norm": 2.687533378601074,
+ "learning_rate": 1.6425401356392652e-06,
+ "loss": 0.36,
+ "step": 7798
+ },
+ {
+ "epoch": 3.6874704491725767,
+ "grad_norm": 3.1675431728363037,
+ "learning_rate": 1.6419541759189694e-06,
+ "loss": 0.3349,
+ "step": 7799
+ },
+ {
+ "epoch": 3.6879432624113475,
+ "grad_norm": 2.777310371398926,
+ "learning_rate": 1.6413682696254246e-06,
+ "loss": 0.317,
+ "step": 7800
+ },
+ {
+ "epoch": 3.6884160756501183,
+ "grad_norm": 3.0121655464172363,
+ "learning_rate": 1.640782416795112e-06,
+ "loss": 0.3612,
+ "step": 7801
+ },
+ {
+ "epoch": 3.688888888888889,
+ "grad_norm": 3.0532145500183105,
+ "learning_rate": 1.6401966174645113e-06,
+ "loss": 0.4065,
+ "step": 7802
+ },
+ {
+ "epoch": 3.6893617021276595,
+ "grad_norm": 2.8221664428710938,
+ "learning_rate": 1.6396108716700961e-06,
+ "loss": 0.3669,
+ "step": 7803
+ },
+ {
+ "epoch": 3.6898345153664303,
+ "grad_norm": 2.966357707977295,
+ "learning_rate": 1.6390251794483405e-06,
+ "loss": 0.391,
+ "step": 7804
+ },
+ {
+ "epoch": 3.690307328605201,
+ "grad_norm": 3.460252046585083,
+ "learning_rate": 1.6384395408357118e-06,
+ "loss": 0.429,
+ "step": 7805
+ },
+ {
+ "epoch": 3.6907801418439714,
+ "grad_norm": 2.8907718658447266,
+ "learning_rate": 1.637853955868674e-06,
+ "loss": 0.3761,
+ "step": 7806
+ },
+ {
+ "epoch": 3.691252955082742,
+ "grad_norm": 3.114612102508545,
+ "learning_rate": 1.6372684245836912e-06,
+ "loss": 0.4376,
+ "step": 7807
+ },
+ {
+ "epoch": 3.691725768321513,
+ "grad_norm": 2.9361326694488525,
+ "learning_rate": 1.6366829470172191e-06,
+ "loss": 0.3672,
+ "step": 7808
+ },
+ {
+ "epoch": 3.692198581560284,
+ "grad_norm": 3.2719476222991943,
+ "learning_rate": 1.6360975232057156e-06,
+ "loss": 0.4266,
+ "step": 7809
+ },
+ {
+ "epoch": 3.6926713947990546,
+ "grad_norm": 2.873952865600586,
+ "learning_rate": 1.635512153185631e-06,
+ "loss": 0.4056,
+ "step": 7810
+ },
+ {
+ "epoch": 3.693144208037825,
+ "grad_norm": 3.0273401737213135,
+ "learning_rate": 1.634926836993413e-06,
+ "loss": 0.3947,
+ "step": 7811
+ },
+ {
+ "epoch": 3.6936170212765957,
+ "grad_norm": 2.868738889694214,
+ "learning_rate": 1.634341574665509e-06,
+ "loss": 0.3935,
+ "step": 7812
+ },
+ {
+ "epoch": 3.6940898345153665,
+ "grad_norm": 3.3080437183380127,
+ "learning_rate": 1.6337563662383591e-06,
+ "loss": 0.3606,
+ "step": 7813
+ },
+ {
+ "epoch": 3.694562647754137,
+ "grad_norm": 2.8339016437530518,
+ "learning_rate": 1.6331712117484014e-06,
+ "loss": 0.4019,
+ "step": 7814
+ },
+ {
+ "epoch": 3.6950354609929077,
+ "grad_norm": 2.666815996170044,
+ "learning_rate": 1.6325861112320717e-06,
+ "loss": 0.3502,
+ "step": 7815
+ },
+ {
+ "epoch": 3.6955082742316785,
+ "grad_norm": 2.7624311447143555,
+ "learning_rate": 1.6320010647258008e-06,
+ "loss": 0.3481,
+ "step": 7816
+ },
+ {
+ "epoch": 3.6959810874704493,
+ "grad_norm": 2.7796332836151123,
+ "learning_rate": 1.6314160722660183e-06,
+ "loss": 0.3735,
+ "step": 7817
+ },
+ {
+ "epoch": 3.69645390070922,
+ "grad_norm": 2.954318046569824,
+ "learning_rate": 1.6308311338891484e-06,
+ "loss": 0.3933,
+ "step": 7818
+ },
+ {
+ "epoch": 3.6969267139479904,
+ "grad_norm": 2.821072816848755,
+ "learning_rate": 1.6302462496316115e-06,
+ "loss": 0.3437,
+ "step": 7819
+ },
+ {
+ "epoch": 3.6973995271867612,
+ "grad_norm": 3.436192750930786,
+ "learning_rate": 1.629661419529828e-06,
+ "loss": 0.4469,
+ "step": 7820
+ },
+ {
+ "epoch": 3.697872340425532,
+ "grad_norm": 3.1361067295074463,
+ "learning_rate": 1.629076643620211e-06,
+ "loss": 0.3887,
+ "step": 7821
+ },
+ {
+ "epoch": 3.6983451536643024,
+ "grad_norm": 3.355024576187134,
+ "learning_rate": 1.6284919219391732e-06,
+ "loss": 0.424,
+ "step": 7822
+ },
+ {
+ "epoch": 3.698817966903073,
+ "grad_norm": 2.7671639919281006,
+ "learning_rate": 1.6279072545231212e-06,
+ "loss": 0.3765,
+ "step": 7823
+ },
+ {
+ "epoch": 3.699290780141844,
+ "grad_norm": 2.9509360790252686,
+ "learning_rate": 1.6273226414084606e-06,
+ "loss": 0.4057,
+ "step": 7824
+ },
+ {
+ "epoch": 3.699763593380615,
+ "grad_norm": 2.9852921962738037,
+ "learning_rate": 1.6267380826315932e-06,
+ "loss": 0.4238,
+ "step": 7825
+ },
+ {
+ "epoch": 3.7002364066193856,
+ "grad_norm": 2.826594114303589,
+ "learning_rate": 1.626153578228915e-06,
+ "loss": 0.3958,
+ "step": 7826
+ },
+ {
+ "epoch": 3.700709219858156,
+ "grad_norm": 2.9103410243988037,
+ "learning_rate": 1.6255691282368228e-06,
+ "loss": 0.394,
+ "step": 7827
+ },
+ {
+ "epoch": 3.7011820330969267,
+ "grad_norm": 3.362992525100708,
+ "learning_rate": 1.6249847326917068e-06,
+ "loss": 0.4233,
+ "step": 7828
+ },
+ {
+ "epoch": 3.7016548463356975,
+ "grad_norm": 2.711280107498169,
+ "learning_rate": 1.624400391629954e-06,
+ "loss": 0.2977,
+ "step": 7829
+ },
+ {
+ "epoch": 3.702127659574468,
+ "grad_norm": 2.8354649543762207,
+ "learning_rate": 1.6238161050879497e-06,
+ "loss": 0.3549,
+ "step": 7830
+ },
+ {
+ "epoch": 3.7026004728132387,
+ "grad_norm": 3.096376895904541,
+ "learning_rate": 1.6232318731020743e-06,
+ "loss": 0.3486,
+ "step": 7831
+ },
+ {
+ "epoch": 3.7030732860520095,
+ "grad_norm": 2.918267250061035,
+ "learning_rate": 1.6226476957087064e-06,
+ "loss": 0.3659,
+ "step": 7832
+ },
+ {
+ "epoch": 3.7035460992907803,
+ "grad_norm": 2.705399513244629,
+ "learning_rate": 1.6220635729442195e-06,
+ "loss": 0.4301,
+ "step": 7833
+ },
+ {
+ "epoch": 3.704018912529551,
+ "grad_norm": 2.9192235469818115,
+ "learning_rate": 1.621479504844983e-06,
+ "loss": 0.3384,
+ "step": 7834
+ },
+ {
+ "epoch": 3.7044917257683214,
+ "grad_norm": 2.78623104095459,
+ "learning_rate": 1.6208954914473669e-06,
+ "loss": 0.3528,
+ "step": 7835
+ },
+ {
+ "epoch": 3.704964539007092,
+ "grad_norm": 3.0218069553375244,
+ "learning_rate": 1.6203115327877333e-06,
+ "loss": 0.3698,
+ "step": 7836
+ },
+ {
+ "epoch": 3.705437352245863,
+ "grad_norm": 3.019101619720459,
+ "learning_rate": 1.6197276289024422e-06,
+ "loss": 0.4398,
+ "step": 7837
+ },
+ {
+ "epoch": 3.7059101654846334,
+ "grad_norm": 2.9220848083496094,
+ "learning_rate": 1.6191437798278531e-06,
+ "loss": 0.3803,
+ "step": 7838
+ },
+ {
+ "epoch": 3.706382978723404,
+ "grad_norm": 3.2731969356536865,
+ "learning_rate": 1.6185599856003181e-06,
+ "loss": 0.4529,
+ "step": 7839
+ },
+ {
+ "epoch": 3.706855791962175,
+ "grad_norm": 2.85239577293396,
+ "learning_rate": 1.617976246256188e-06,
+ "loss": 0.3801,
+ "step": 7840
+ },
+ {
+ "epoch": 3.7073286052009458,
+ "grad_norm": 2.8250765800476074,
+ "learning_rate": 1.6173925618318092e-06,
+ "loss": 0.3267,
+ "step": 7841
+ },
+ {
+ "epoch": 3.7078014184397166,
+ "grad_norm": 2.9152321815490723,
+ "learning_rate": 1.616808932363525e-06,
+ "loss": 0.428,
+ "step": 7842
+ },
+ {
+ "epoch": 3.708274231678487,
+ "grad_norm": 2.912656545639038,
+ "learning_rate": 1.6162253578876766e-06,
+ "loss": 0.3802,
+ "step": 7843
+ },
+ {
+ "epoch": 3.7087470449172577,
+ "grad_norm": 3.0700762271881104,
+ "learning_rate": 1.6156418384405992e-06,
+ "loss": 0.377,
+ "step": 7844
+ },
+ {
+ "epoch": 3.7092198581560285,
+ "grad_norm": 2.873141050338745,
+ "learning_rate": 1.6150583740586274e-06,
+ "loss": 0.399,
+ "step": 7845
+ },
+ {
+ "epoch": 3.709692671394799,
+ "grad_norm": 2.899555206298828,
+ "learning_rate": 1.6144749647780906e-06,
+ "loss": 0.402,
+ "step": 7846
+ },
+ {
+ "epoch": 3.7101654846335697,
+ "grad_norm": 3.257697343826294,
+ "learning_rate": 1.6138916106353139e-06,
+ "loss": 0.4193,
+ "step": 7847
+ },
+ {
+ "epoch": 3.7106382978723405,
+ "grad_norm": 2.6879804134368896,
+ "learning_rate": 1.613308311666622e-06,
+ "loss": 0.3474,
+ "step": 7848
+ },
+ {
+ "epoch": 3.7111111111111112,
+ "grad_norm": 2.712491273880005,
+ "learning_rate": 1.6127250679083323e-06,
+ "loss": 0.3315,
+ "step": 7849
+ },
+ {
+ "epoch": 3.711583924349882,
+ "grad_norm": 2.9762673377990723,
+ "learning_rate": 1.6121418793967631e-06,
+ "loss": 0.3953,
+ "step": 7850
+ },
+ {
+ "epoch": 3.7120567375886524,
+ "grad_norm": 2.743668556213379,
+ "learning_rate": 1.6115587461682258e-06,
+ "loss": 0.381,
+ "step": 7851
+ },
+ {
+ "epoch": 3.712529550827423,
+ "grad_norm": 3.0545318126678467,
+ "learning_rate": 1.6109756682590288e-06,
+ "loss": 0.412,
+ "step": 7852
+ },
+ {
+ "epoch": 3.713002364066194,
+ "grad_norm": 3.0125906467437744,
+ "learning_rate": 1.61039264570548e-06,
+ "loss": 0.3931,
+ "step": 7853
+ },
+ {
+ "epoch": 3.7134751773049643,
+ "grad_norm": 2.809302806854248,
+ "learning_rate": 1.6098096785438794e-06,
+ "loss": 0.3943,
+ "step": 7854
+ },
+ {
+ "epoch": 3.713947990543735,
+ "grad_norm": 3.092452049255371,
+ "learning_rate": 1.6092267668105276e-06,
+ "loss": 0.3932,
+ "step": 7855
+ },
+ {
+ "epoch": 3.714420803782506,
+ "grad_norm": 2.9878969192504883,
+ "learning_rate": 1.608643910541719e-06,
+ "loss": 0.4289,
+ "step": 7856
+ },
+ {
+ "epoch": 3.7148936170212767,
+ "grad_norm": 2.693387508392334,
+ "learning_rate": 1.6080611097737444e-06,
+ "loss": 0.373,
+ "step": 7857
+ },
+ {
+ "epoch": 3.7153664302600475,
+ "grad_norm": 3.4097673892974854,
+ "learning_rate": 1.6074783645428945e-06,
+ "loss": 0.4487,
+ "step": 7858
+ },
+ {
+ "epoch": 3.715839243498818,
+ "grad_norm": 3.1466784477233887,
+ "learning_rate": 1.6068956748854525e-06,
+ "loss": 0.3648,
+ "step": 7859
+ },
+ {
+ "epoch": 3.7163120567375887,
+ "grad_norm": 3.062107563018799,
+ "learning_rate": 1.6063130408377015e-06,
+ "loss": 0.3899,
+ "step": 7860
+ },
+ {
+ "epoch": 3.7167848699763595,
+ "grad_norm": 3.2298364639282227,
+ "learning_rate": 1.6057304624359188e-06,
+ "loss": 0.4243,
+ "step": 7861
+ },
+ {
+ "epoch": 3.71725768321513,
+ "grad_norm": 3.0285773277282715,
+ "learning_rate": 1.6051479397163784e-06,
+ "loss": 0.3469,
+ "step": 7862
+ },
+ {
+ "epoch": 3.7177304964539006,
+ "grad_norm": 2.8438515663146973,
+ "learning_rate": 1.6045654727153525e-06,
+ "loss": 0.3363,
+ "step": 7863
+ },
+ {
+ "epoch": 3.7182033096926714,
+ "grad_norm": 3.1558034420013428,
+ "learning_rate": 1.6039830614691081e-06,
+ "loss": 0.4326,
+ "step": 7864
+ },
+ {
+ "epoch": 3.7186761229314422,
+ "grad_norm": 2.438640594482422,
+ "learning_rate": 1.603400706013909e-06,
+ "loss": 0.3647,
+ "step": 7865
+ },
+ {
+ "epoch": 3.719148936170213,
+ "grad_norm": 3.0443127155303955,
+ "learning_rate": 1.6028184063860168e-06,
+ "loss": 0.4295,
+ "step": 7866
+ },
+ {
+ "epoch": 3.7196217494089834,
+ "grad_norm": 2.8009512424468994,
+ "learning_rate": 1.602236162621688e-06,
+ "loss": 0.4055,
+ "step": 7867
+ },
+ {
+ "epoch": 3.720094562647754,
+ "grad_norm": 3.227698802947998,
+ "learning_rate": 1.6016539747571775e-06,
+ "loss": 0.4681,
+ "step": 7868
+ },
+ {
+ "epoch": 3.720567375886525,
+ "grad_norm": 2.8242595195770264,
+ "learning_rate": 1.601071842828735e-06,
+ "loss": 0.3707,
+ "step": 7869
+ },
+ {
+ "epoch": 3.7210401891252953,
+ "grad_norm": 3.602937698364258,
+ "learning_rate": 1.6004897668726067e-06,
+ "loss": 0.5201,
+ "step": 7870
+ },
+ {
+ "epoch": 3.721513002364066,
+ "grad_norm": 3.578422784805298,
+ "learning_rate": 1.599907746925037e-06,
+ "loss": 0.4514,
+ "step": 7871
+ },
+ {
+ "epoch": 3.721985815602837,
+ "grad_norm": 2.7365758419036865,
+ "learning_rate": 1.5993257830222635e-06,
+ "loss": 0.356,
+ "step": 7872
+ },
+ {
+ "epoch": 3.7224586288416077,
+ "grad_norm": 3.125636577606201,
+ "learning_rate": 1.5987438752005258e-06,
+ "loss": 0.4277,
+ "step": 7873
+ },
+ {
+ "epoch": 3.7229314420803785,
+ "grad_norm": 2.7157294750213623,
+ "learning_rate": 1.5981620234960549e-06,
+ "loss": 0.363,
+ "step": 7874
+ },
+ {
+ "epoch": 3.723404255319149,
+ "grad_norm": 2.90950083732605,
+ "learning_rate": 1.5975802279450793e-06,
+ "loss": 0.4027,
+ "step": 7875
+ },
+ {
+ "epoch": 3.7238770685579197,
+ "grad_norm": 2.659787178039551,
+ "learning_rate": 1.596998488583827e-06,
+ "loss": 0.3632,
+ "step": 7876
+ },
+ {
+ "epoch": 3.7243498817966905,
+ "grad_norm": 3.221623182296753,
+ "learning_rate": 1.5964168054485185e-06,
+ "loss": 0.4295,
+ "step": 7877
+ },
+ {
+ "epoch": 3.724822695035461,
+ "grad_norm": 2.6838672161102295,
+ "learning_rate": 1.595835178575374e-06,
+ "loss": 0.3413,
+ "step": 7878
+ },
+ {
+ "epoch": 3.7252955082742316,
+ "grad_norm": 2.804706335067749,
+ "learning_rate": 1.5952536080006084e-06,
+ "loss": 0.3801,
+ "step": 7879
+ },
+ {
+ "epoch": 3.7257683215130024,
+ "grad_norm": 2.7647509574890137,
+ "learning_rate": 1.5946720937604326e-06,
+ "loss": 0.3941,
+ "step": 7880
+ },
+ {
+ "epoch": 3.726241134751773,
+ "grad_norm": 2.8363754749298096,
+ "learning_rate": 1.5940906358910566e-06,
+ "loss": 0.3772,
+ "step": 7881
+ },
+ {
+ "epoch": 3.726713947990544,
+ "grad_norm": 3.4147698879241943,
+ "learning_rate": 1.5935092344286835e-06,
+ "loss": 0.399,
+ "step": 7882
+ },
+ {
+ "epoch": 3.7271867612293144,
+ "grad_norm": 2.984090805053711,
+ "learning_rate": 1.5929278894095162e-06,
+ "loss": 0.3373,
+ "step": 7883
+ },
+ {
+ "epoch": 3.727659574468085,
+ "grad_norm": 3.250173330307007,
+ "learning_rate": 1.5923466008697521e-06,
+ "loss": 0.3932,
+ "step": 7884
+ },
+ {
+ "epoch": 3.728132387706856,
+ "grad_norm": 3.2699649333953857,
+ "learning_rate": 1.5917653688455848e-06,
+ "loss": 0.4529,
+ "step": 7885
+ },
+ {
+ "epoch": 3.7286052009456263,
+ "grad_norm": 3.175934076309204,
+ "learning_rate": 1.591184193373206e-06,
+ "loss": 0.3726,
+ "step": 7886
+ },
+ {
+ "epoch": 3.729078014184397,
+ "grad_norm": 2.8128812313079834,
+ "learning_rate": 1.5906030744888024e-06,
+ "loss": 0.3648,
+ "step": 7887
+ },
+ {
+ "epoch": 3.729550827423168,
+ "grad_norm": 3.025012493133545,
+ "learning_rate": 1.5900220122285564e-06,
+ "loss": 0.3945,
+ "step": 7888
+ },
+ {
+ "epoch": 3.7300236406619387,
+ "grad_norm": 3.237680435180664,
+ "learning_rate": 1.5894410066286512e-06,
+ "loss": 0.3815,
+ "step": 7889
+ },
+ {
+ "epoch": 3.7304964539007095,
+ "grad_norm": 3.458033323287964,
+ "learning_rate": 1.5888600577252605e-06,
+ "loss": 0.4104,
+ "step": 7890
+ },
+ {
+ "epoch": 3.73096926713948,
+ "grad_norm": 2.718867540359497,
+ "learning_rate": 1.58827916555456e-06,
+ "loss": 0.3243,
+ "step": 7891
+ },
+ {
+ "epoch": 3.7314420803782506,
+ "grad_norm": 3.047157049179077,
+ "learning_rate": 1.5876983301527176e-06,
+ "loss": 0.3689,
+ "step": 7892
+ },
+ {
+ "epoch": 3.731914893617021,
+ "grad_norm": 3.2904715538024902,
+ "learning_rate": 1.5871175515558995e-06,
+ "loss": 0.4045,
+ "step": 7893
+ },
+ {
+ "epoch": 3.732387706855792,
+ "grad_norm": 2.956467866897583,
+ "learning_rate": 1.5865368298002692e-06,
+ "loss": 0.3806,
+ "step": 7894
+ },
+ {
+ "epoch": 3.7328605200945626,
+ "grad_norm": 3.3309173583984375,
+ "learning_rate": 1.5859561649219843e-06,
+ "loss": 0.4011,
+ "step": 7895
+ },
+ {
+ "epoch": 3.7333333333333334,
+ "grad_norm": 2.7853524684906006,
+ "learning_rate": 1.5853755569572018e-06,
+ "loss": 0.3239,
+ "step": 7896
+ },
+ {
+ "epoch": 3.733806146572104,
+ "grad_norm": 2.9832780361175537,
+ "learning_rate": 1.584795005942073e-06,
+ "loss": 0.4582,
+ "step": 7897
+ },
+ {
+ "epoch": 3.7342789598108745,
+ "grad_norm": 3.2866461277008057,
+ "learning_rate": 1.584214511912745e-06,
+ "loss": 0.3876,
+ "step": 7898
+ },
+ {
+ "epoch": 3.7347517730496453,
+ "grad_norm": 3.018526792526245,
+ "learning_rate": 1.5836340749053646e-06,
+ "loss": 0.3221,
+ "step": 7899
+ },
+ {
+ "epoch": 3.735224586288416,
+ "grad_norm": 2.9109885692596436,
+ "learning_rate": 1.583053694956072e-06,
+ "loss": 0.4225,
+ "step": 7900
+ },
+ {
+ "epoch": 3.7356973995271865,
+ "grad_norm": 3.104146718978882,
+ "learning_rate": 1.5824733721010051e-06,
+ "loss": 0.3843,
+ "step": 7901
+ },
+ {
+ "epoch": 3.7361702127659573,
+ "grad_norm": 3.0982813835144043,
+ "learning_rate": 1.5818931063762989e-06,
+ "loss": 0.4223,
+ "step": 7902
+ },
+ {
+ "epoch": 3.736643026004728,
+ "grad_norm": 2.7797579765319824,
+ "learning_rate": 1.5813128978180819e-06,
+ "loss": 0.3536,
+ "step": 7903
+ },
+ {
+ "epoch": 3.737115839243499,
+ "grad_norm": 2.870884656906128,
+ "learning_rate": 1.5807327464624835e-06,
+ "loss": 0.3053,
+ "step": 7904
+ },
+ {
+ "epoch": 3.7375886524822697,
+ "grad_norm": 2.896674633026123,
+ "learning_rate": 1.5801526523456251e-06,
+ "loss": 0.3806,
+ "step": 7905
+ },
+ {
+ "epoch": 3.73806146572104,
+ "grad_norm": 3.009662389755249,
+ "learning_rate": 1.5795726155036284e-06,
+ "loss": 0.3568,
+ "step": 7906
+ },
+ {
+ "epoch": 3.738534278959811,
+ "grad_norm": 2.6860599517822266,
+ "learning_rate": 1.578992635972609e-06,
+ "loss": 0.4392,
+ "step": 7907
+ },
+ {
+ "epoch": 3.7390070921985816,
+ "grad_norm": 2.9046099185943604,
+ "learning_rate": 1.578412713788679e-06,
+ "loss": 0.3756,
+ "step": 7908
+ },
+ {
+ "epoch": 3.739479905437352,
+ "grad_norm": 2.8035101890563965,
+ "learning_rate": 1.5778328489879488e-06,
+ "loss": 0.3576,
+ "step": 7909
+ },
+ {
+ "epoch": 3.739952718676123,
+ "grad_norm": 2.767514228820801,
+ "learning_rate": 1.5772530416065238e-06,
+ "loss": 0.4037,
+ "step": 7910
+ },
+ {
+ "epoch": 3.7404255319148936,
+ "grad_norm": 3.0867795944213867,
+ "learning_rate": 1.576673291680505e-06,
+ "loss": 0.4394,
+ "step": 7911
+ },
+ {
+ "epoch": 3.7408983451536644,
+ "grad_norm": 3.295976161956787,
+ "learning_rate": 1.5760935992459926e-06,
+ "loss": 0.3938,
+ "step": 7912
+ },
+ {
+ "epoch": 3.741371158392435,
+ "grad_norm": 2.725949287414551,
+ "learning_rate": 1.5755139643390794e-06,
+ "loss": 0.3633,
+ "step": 7913
+ },
+ {
+ "epoch": 3.7418439716312055,
+ "grad_norm": 3.0864083766937256,
+ "learning_rate": 1.5749343869958585e-06,
+ "loss": 0.3034,
+ "step": 7914
+ },
+ {
+ "epoch": 3.7423167848699763,
+ "grad_norm": 3.707273244857788,
+ "learning_rate": 1.5743548672524175e-06,
+ "loss": 0.4206,
+ "step": 7915
+ },
+ {
+ "epoch": 3.742789598108747,
+ "grad_norm": 2.9829516410827637,
+ "learning_rate": 1.573775405144839e-06,
+ "loss": 0.333,
+ "step": 7916
+ },
+ {
+ "epoch": 3.7432624113475175,
+ "grad_norm": 3.3303117752075195,
+ "learning_rate": 1.5731960007092056e-06,
+ "loss": 0.4558,
+ "step": 7917
+ },
+ {
+ "epoch": 3.7437352245862883,
+ "grad_norm": 2.63291335105896,
+ "learning_rate": 1.5726166539815925e-06,
+ "loss": 0.39,
+ "step": 7918
+ },
+ {
+ "epoch": 3.744208037825059,
+ "grad_norm": 3.0533673763275146,
+ "learning_rate": 1.572037364998075e-06,
+ "loss": 0.3586,
+ "step": 7919
+ },
+ {
+ "epoch": 3.74468085106383,
+ "grad_norm": 2.9185104370117188,
+ "learning_rate": 1.5714581337947216e-06,
+ "loss": 0.3809,
+ "step": 7920
+ },
+ {
+ "epoch": 3.7451536643026007,
+ "grad_norm": 3.1863298416137695,
+ "learning_rate": 1.5708789604075975e-06,
+ "loss": 0.4132,
+ "step": 7921
+ },
+ {
+ "epoch": 3.745626477541371,
+ "grad_norm": 3.2700514793395996,
+ "learning_rate": 1.5702998448727674e-06,
+ "loss": 0.4601,
+ "step": 7922
+ },
+ {
+ "epoch": 3.746099290780142,
+ "grad_norm": 3.4729206562042236,
+ "learning_rate": 1.5697207872262886e-06,
+ "loss": 0.4585,
+ "step": 7923
+ },
+ {
+ "epoch": 3.7465721040189126,
+ "grad_norm": 3.1432926654815674,
+ "learning_rate": 1.5691417875042182e-06,
+ "loss": 0.3128,
+ "step": 7924
+ },
+ {
+ "epoch": 3.747044917257683,
+ "grad_norm": 3.096121072769165,
+ "learning_rate": 1.5685628457426066e-06,
+ "loss": 0.3903,
+ "step": 7925
+ },
+ {
+ "epoch": 3.7475177304964538,
+ "grad_norm": 2.6897027492523193,
+ "learning_rate": 1.5679839619775023e-06,
+ "loss": 0.3707,
+ "step": 7926
+ },
+ {
+ "epoch": 3.7479905437352246,
+ "grad_norm": 2.8020687103271484,
+ "learning_rate": 1.5674051362449503e-06,
+ "loss": 0.3986,
+ "step": 7927
+ },
+ {
+ "epoch": 3.7484633569739954,
+ "grad_norm": 3.1278326511383057,
+ "learning_rate": 1.56682636858099e-06,
+ "loss": 0.3899,
+ "step": 7928
+ },
+ {
+ "epoch": 3.748936170212766,
+ "grad_norm": 2.907982587814331,
+ "learning_rate": 1.5662476590216613e-06,
+ "loss": 0.3422,
+ "step": 7929
+ },
+ {
+ "epoch": 3.7494089834515365,
+ "grad_norm": 3.1246347427368164,
+ "learning_rate": 1.5656690076029962e-06,
+ "loss": 0.452,
+ "step": 7930
+ },
+ {
+ "epoch": 3.7498817966903073,
+ "grad_norm": 2.9161367416381836,
+ "learning_rate": 1.565090414361024e-06,
+ "loss": 0.3284,
+ "step": 7931
+ },
+ {
+ "epoch": 3.750354609929078,
+ "grad_norm": 2.943183422088623,
+ "learning_rate": 1.564511879331773e-06,
+ "loss": 0.3478,
+ "step": 7932
+ },
+ {
+ "epoch": 3.7508274231678485,
+ "grad_norm": 3.2308566570281982,
+ "learning_rate": 1.563933402551266e-06,
+ "loss": 0.4143,
+ "step": 7933
+ },
+ {
+ "epoch": 3.7513002364066192,
+ "grad_norm": 2.6846251487731934,
+ "learning_rate": 1.5633549840555206e-06,
+ "loss": 0.3681,
+ "step": 7934
+ },
+ {
+ "epoch": 3.75177304964539,
+ "grad_norm": 3.0995283126831055,
+ "learning_rate": 1.562776623880554e-06,
+ "loss": 0.4642,
+ "step": 7935
+ },
+ {
+ "epoch": 3.752245862884161,
+ "grad_norm": 2.7406163215637207,
+ "learning_rate": 1.562198322062376e-06,
+ "loss": 0.3823,
+ "step": 7936
+ },
+ {
+ "epoch": 3.7527186761229316,
+ "grad_norm": 2.85732364654541,
+ "learning_rate": 1.5616200786369978e-06,
+ "loss": 0.3053,
+ "step": 7937
+ },
+ {
+ "epoch": 3.753191489361702,
+ "grad_norm": 2.812526226043701,
+ "learning_rate": 1.5610418936404223e-06,
+ "loss": 0.3944,
+ "step": 7938
+ },
+ {
+ "epoch": 3.753664302600473,
+ "grad_norm": 2.8886849880218506,
+ "learning_rate": 1.5604637671086499e-06,
+ "loss": 0.3936,
+ "step": 7939
+ },
+ {
+ "epoch": 3.7541371158392436,
+ "grad_norm": 2.831774950027466,
+ "learning_rate": 1.5598856990776801e-06,
+ "loss": 0.3146,
+ "step": 7940
+ },
+ {
+ "epoch": 3.754609929078014,
+ "grad_norm": 2.8853790760040283,
+ "learning_rate": 1.5593076895835052e-06,
+ "loss": 0.3286,
+ "step": 7941
+ },
+ {
+ "epoch": 3.7550827423167847,
+ "grad_norm": 3.2724483013153076,
+ "learning_rate": 1.5587297386621158e-06,
+ "loss": 0.3396,
+ "step": 7942
+ },
+ {
+ "epoch": 3.7555555555555555,
+ "grad_norm": 3.5077168941497803,
+ "learning_rate": 1.5581518463494983e-06,
+ "loss": 0.4528,
+ "step": 7943
+ },
+ {
+ "epoch": 3.7560283687943263,
+ "grad_norm": 3.031503915786743,
+ "learning_rate": 1.5575740126816346e-06,
+ "loss": 0.3803,
+ "step": 7944
+ },
+ {
+ "epoch": 3.756501182033097,
+ "grad_norm": 3.0939114093780518,
+ "learning_rate": 1.556996237694506e-06,
+ "loss": 0.3931,
+ "step": 7945
+ },
+ {
+ "epoch": 3.7569739952718675,
+ "grad_norm": 2.9404146671295166,
+ "learning_rate": 1.556418521424085e-06,
+ "loss": 0.3608,
+ "step": 7946
+ },
+ {
+ "epoch": 3.7574468085106383,
+ "grad_norm": 3.4363012313842773,
+ "learning_rate": 1.5558408639063465e-06,
+ "loss": 0.4335,
+ "step": 7947
+ },
+ {
+ "epoch": 3.757919621749409,
+ "grad_norm": 3.2819864749908447,
+ "learning_rate": 1.5552632651772575e-06,
+ "loss": 0.4147,
+ "step": 7948
+ },
+ {
+ "epoch": 3.7583924349881794,
+ "grad_norm": 2.917788505554199,
+ "learning_rate": 1.554685725272782e-06,
+ "loss": 0.3516,
+ "step": 7949
+ },
+ {
+ "epoch": 3.7588652482269502,
+ "grad_norm": 2.8425943851470947,
+ "learning_rate": 1.5541082442288818e-06,
+ "loss": 0.3596,
+ "step": 7950
+ },
+ {
+ "epoch": 3.759338061465721,
+ "grad_norm": 3.087005376815796,
+ "learning_rate": 1.5535308220815126e-06,
+ "loss": 0.3968,
+ "step": 7951
+ },
+ {
+ "epoch": 3.759810874704492,
+ "grad_norm": 2.743110179901123,
+ "learning_rate": 1.5529534588666298e-06,
+ "loss": 0.3802,
+ "step": 7952
+ },
+ {
+ "epoch": 3.7602836879432626,
+ "grad_norm": 2.914424180984497,
+ "learning_rate": 1.5523761546201825e-06,
+ "loss": 0.4055,
+ "step": 7953
+ },
+ {
+ "epoch": 3.760756501182033,
+ "grad_norm": 2.9691991806030273,
+ "learning_rate": 1.551798909378116e-06,
+ "loss": 0.3384,
+ "step": 7954
+ },
+ {
+ "epoch": 3.7612293144208038,
+ "grad_norm": 2.433657646179199,
+ "learning_rate": 1.5512217231763747e-06,
+ "loss": 0.3019,
+ "step": 7955
+ },
+ {
+ "epoch": 3.7617021276595746,
+ "grad_norm": 2.7904880046844482,
+ "learning_rate": 1.5506445960508957e-06,
+ "loss": 0.389,
+ "step": 7956
+ },
+ {
+ "epoch": 3.762174940898345,
+ "grad_norm": 2.9241607189178467,
+ "learning_rate": 1.5500675280376154e-06,
+ "loss": 0.4291,
+ "step": 7957
+ },
+ {
+ "epoch": 3.7626477541371157,
+ "grad_norm": 3.216491222381592,
+ "learning_rate": 1.549490519172465e-06,
+ "loss": 0.4065,
+ "step": 7958
+ },
+ {
+ "epoch": 3.7631205673758865,
+ "grad_norm": 2.8859689235687256,
+ "learning_rate": 1.548913569491371e-06,
+ "loss": 0.353,
+ "step": 7959
+ },
+ {
+ "epoch": 3.7635933806146573,
+ "grad_norm": 2.958773136138916,
+ "learning_rate": 1.5483366790302594e-06,
+ "loss": 0.3829,
+ "step": 7960
+ },
+ {
+ "epoch": 3.764066193853428,
+ "grad_norm": 2.868649482727051,
+ "learning_rate": 1.5477598478250505e-06,
+ "loss": 0.3591,
+ "step": 7961
+ },
+ {
+ "epoch": 3.7645390070921985,
+ "grad_norm": 2.6912996768951416,
+ "learning_rate": 1.5471830759116591e-06,
+ "loss": 0.3695,
+ "step": 7962
+ },
+ {
+ "epoch": 3.7650118203309693,
+ "grad_norm": 3.3318257331848145,
+ "learning_rate": 1.5466063633260004e-06,
+ "loss": 0.4126,
+ "step": 7963
+ },
+ {
+ "epoch": 3.76548463356974,
+ "grad_norm": 2.865525007247925,
+ "learning_rate": 1.5460297101039825e-06,
+ "loss": 0.4235,
+ "step": 7964
+ },
+ {
+ "epoch": 3.7659574468085104,
+ "grad_norm": 2.8639180660247803,
+ "learning_rate": 1.5454531162815123e-06,
+ "loss": 0.4392,
+ "step": 7965
+ },
+ {
+ "epoch": 3.766430260047281,
+ "grad_norm": 2.5752499103546143,
+ "learning_rate": 1.5448765818944902e-06,
+ "loss": 0.4113,
+ "step": 7966
+ },
+ {
+ "epoch": 3.766903073286052,
+ "grad_norm": 2.7622742652893066,
+ "learning_rate": 1.5443001069788155e-06,
+ "loss": 0.3785,
+ "step": 7967
+ },
+ {
+ "epoch": 3.767375886524823,
+ "grad_norm": 2.965579032897949,
+ "learning_rate": 1.5437236915703829e-06,
+ "loss": 0.335,
+ "step": 7968
+ },
+ {
+ "epoch": 3.7678486997635936,
+ "grad_norm": 3.0587408542633057,
+ "learning_rate": 1.5431473357050816e-06,
+ "loss": 0.4047,
+ "step": 7969
+ },
+ {
+ "epoch": 3.768321513002364,
+ "grad_norm": 3.2929413318634033,
+ "learning_rate": 1.5425710394188014e-06,
+ "loss": 0.4061,
+ "step": 7970
+ },
+ {
+ "epoch": 3.7687943262411348,
+ "grad_norm": 2.663043975830078,
+ "learning_rate": 1.541994802747424e-06,
+ "loss": 0.3478,
+ "step": 7971
+ },
+ {
+ "epoch": 3.7692671394799055,
+ "grad_norm": 3.0657591819763184,
+ "learning_rate": 1.5414186257268293e-06,
+ "loss": 0.3735,
+ "step": 7972
+ },
+ {
+ "epoch": 3.769739952718676,
+ "grad_norm": 2.963189125061035,
+ "learning_rate": 1.5408425083928939e-06,
+ "loss": 0.4743,
+ "step": 7973
+ },
+ {
+ "epoch": 3.7702127659574467,
+ "grad_norm": 3.1509387493133545,
+ "learning_rate": 1.540266450781489e-06,
+ "loss": 0.4164,
+ "step": 7974
+ },
+ {
+ "epoch": 3.7706855791962175,
+ "grad_norm": 3.4436306953430176,
+ "learning_rate": 1.539690452928485e-06,
+ "loss": 0.4583,
+ "step": 7975
+ },
+ {
+ "epoch": 3.7711583924349883,
+ "grad_norm": 3.1746156215667725,
+ "learning_rate": 1.5391145148697454e-06,
+ "loss": 0.4042,
+ "step": 7976
+ },
+ {
+ "epoch": 3.771631205673759,
+ "grad_norm": 3.531028985977173,
+ "learning_rate": 1.5385386366411304e-06,
+ "loss": 0.4304,
+ "step": 7977
+ },
+ {
+ "epoch": 3.7721040189125294,
+ "grad_norm": 2.867871046066284,
+ "learning_rate": 1.5379628182785e-06,
+ "loss": 0.4023,
+ "step": 7978
+ },
+ {
+ "epoch": 3.7725768321513002,
+ "grad_norm": 3.0504629611968994,
+ "learning_rate": 1.5373870598177051e-06,
+ "loss": 0.3785,
+ "step": 7979
+ },
+ {
+ "epoch": 3.773049645390071,
+ "grad_norm": 2.8188650608062744,
+ "learning_rate": 1.5368113612945983e-06,
+ "loss": 0.3808,
+ "step": 7980
+ },
+ {
+ "epoch": 3.7735224586288414,
+ "grad_norm": 3.0809133052825928,
+ "learning_rate": 1.5362357227450248e-06,
+ "loss": 0.3912,
+ "step": 7981
+ },
+ {
+ "epoch": 3.773995271867612,
+ "grad_norm": 3.223273277282715,
+ "learning_rate": 1.5356601442048257e-06,
+ "loss": 0.3802,
+ "step": 7982
+ },
+ {
+ "epoch": 3.774468085106383,
+ "grad_norm": 2.7513339519500732,
+ "learning_rate": 1.535084625709842e-06,
+ "loss": 0.3822,
+ "step": 7983
+ },
+ {
+ "epoch": 3.774940898345154,
+ "grad_norm": 3.085592031478882,
+ "learning_rate": 1.5345091672959074e-06,
+ "loss": 0.4348,
+ "step": 7984
+ },
+ {
+ "epoch": 3.7754137115839246,
+ "grad_norm": 3.315108299255371,
+ "learning_rate": 1.5339337689988525e-06,
+ "loss": 0.4196,
+ "step": 7985
+ },
+ {
+ "epoch": 3.775886524822695,
+ "grad_norm": 3.713372230529785,
+ "learning_rate": 1.533358430854507e-06,
+ "loss": 0.4292,
+ "step": 7986
+ },
+ {
+ "epoch": 3.7763593380614657,
+ "grad_norm": 2.7899155616760254,
+ "learning_rate": 1.532783152898692e-06,
+ "loss": 0.3874,
+ "step": 7987
+ },
+ {
+ "epoch": 3.7768321513002365,
+ "grad_norm": 2.918851852416992,
+ "learning_rate": 1.5322079351672297e-06,
+ "loss": 0.4073,
+ "step": 7988
+ },
+ {
+ "epoch": 3.777304964539007,
+ "grad_norm": 3.13395619392395,
+ "learning_rate": 1.5316327776959361e-06,
+ "loss": 0.3441,
+ "step": 7989
+ },
+ {
+ "epoch": 3.7777777777777777,
+ "grad_norm": 3.2320916652679443,
+ "learning_rate": 1.531057680520623e-06,
+ "loss": 0.372,
+ "step": 7990
+ },
+ {
+ "epoch": 3.7782505910165485,
+ "grad_norm": 3.1130621433258057,
+ "learning_rate": 1.5304826436770991e-06,
+ "loss": 0.3514,
+ "step": 7991
+ },
+ {
+ "epoch": 3.7787234042553193,
+ "grad_norm": 3.223207712173462,
+ "learning_rate": 1.5299076672011696e-06,
+ "loss": 0.44,
+ "step": 7992
+ },
+ {
+ "epoch": 3.77919621749409,
+ "grad_norm": 3.0757877826690674,
+ "learning_rate": 1.5293327511286366e-06,
+ "loss": 0.4051,
+ "step": 7993
+ },
+ {
+ "epoch": 3.7796690307328604,
+ "grad_norm": 2.936678409576416,
+ "learning_rate": 1.528757895495297e-06,
+ "loss": 0.3965,
+ "step": 7994
+ },
+ {
+ "epoch": 3.780141843971631,
+ "grad_norm": 2.993445873260498,
+ "learning_rate": 1.5281831003369435e-06,
+ "loss": 0.4222,
+ "step": 7995
+ },
+ {
+ "epoch": 3.780614657210402,
+ "grad_norm": 2.9140853881835938,
+ "learning_rate": 1.5276083656893679e-06,
+ "loss": 0.3662,
+ "step": 7996
+ },
+ {
+ "epoch": 3.7810874704491724,
+ "grad_norm": 3.2649893760681152,
+ "learning_rate": 1.5270336915883549e-06,
+ "loss": 0.4272,
+ "step": 7997
+ },
+ {
+ "epoch": 3.781560283687943,
+ "grad_norm": 3.0631372928619385,
+ "learning_rate": 1.5264590780696887e-06,
+ "loss": 0.4111,
+ "step": 7998
+ },
+ {
+ "epoch": 3.782033096926714,
+ "grad_norm": 2.791299343109131,
+ "learning_rate": 1.5258845251691463e-06,
+ "loss": 0.416,
+ "step": 7999
+ },
+ {
+ "epoch": 3.7825059101654848,
+ "grad_norm": 3.262294054031372,
+ "learning_rate": 1.5253100329225023e-06,
+ "loss": 0.4236,
+ "step": 8000
+ },
+ {
+ "epoch": 3.7829787234042556,
+ "grad_norm": 2.574486017227173,
+ "learning_rate": 1.5247356013655295e-06,
+ "loss": 0.4089,
+ "step": 8001
+ },
+ {
+ "epoch": 3.783451536643026,
+ "grad_norm": 3.1566531658172607,
+ "learning_rate": 1.5241612305339936e-06,
+ "loss": 0.3955,
+ "step": 8002
+ },
+ {
+ "epoch": 3.7839243498817967,
+ "grad_norm": 2.5845813751220703,
+ "learning_rate": 1.5235869204636602e-06,
+ "loss": 0.3672,
+ "step": 8003
+ },
+ {
+ "epoch": 3.7843971631205675,
+ "grad_norm": 2.877570629119873,
+ "learning_rate": 1.5230126711902876e-06,
+ "loss": 0.3919,
+ "step": 8004
+ },
+ {
+ "epoch": 3.784869976359338,
+ "grad_norm": 3.183061122894287,
+ "learning_rate": 1.5224384827496314e-06,
+ "loss": 0.3291,
+ "step": 8005
+ },
+ {
+ "epoch": 3.7853427895981087,
+ "grad_norm": 3.0778391361236572,
+ "learning_rate": 1.5218643551774451e-06,
+ "loss": 0.3571,
+ "step": 8006
+ },
+ {
+ "epoch": 3.7858156028368795,
+ "grad_norm": 3.2364399433135986,
+ "learning_rate": 1.5212902885094762e-06,
+ "loss": 0.4045,
+ "step": 8007
+ },
+ {
+ "epoch": 3.7862884160756503,
+ "grad_norm": 3.0571746826171875,
+ "learning_rate": 1.5207162827814687e-06,
+ "loss": 0.4181,
+ "step": 8008
+ },
+ {
+ "epoch": 3.786761229314421,
+ "grad_norm": 2.7215163707733154,
+ "learning_rate": 1.5201423380291652e-06,
+ "loss": 0.3328,
+ "step": 8009
+ },
+ {
+ "epoch": 3.7872340425531914,
+ "grad_norm": 3.0521233081817627,
+ "learning_rate": 1.5195684542883007e-06,
+ "loss": 0.4072,
+ "step": 8010
+ },
+ {
+ "epoch": 3.787706855791962,
+ "grad_norm": 2.541666269302368,
+ "learning_rate": 1.5189946315946104e-06,
+ "loss": 0.3293,
+ "step": 8011
+ },
+ {
+ "epoch": 3.788179669030733,
+ "grad_norm": 3.0041720867156982,
+ "learning_rate": 1.5184208699838232e-06,
+ "loss": 0.3998,
+ "step": 8012
+ },
+ {
+ "epoch": 3.7886524822695034,
+ "grad_norm": 3.0763001441955566,
+ "learning_rate": 1.5178471694916635e-06,
+ "loss": 0.38,
+ "step": 8013
+ },
+ {
+ "epoch": 3.789125295508274,
+ "grad_norm": 3.0788497924804688,
+ "learning_rate": 1.5172735301538544e-06,
+ "loss": 0.3986,
+ "step": 8014
+ },
+ {
+ "epoch": 3.789598108747045,
+ "grad_norm": 2.830225944519043,
+ "learning_rate": 1.5166999520061127e-06,
+ "loss": 0.3977,
+ "step": 8015
+ },
+ {
+ "epoch": 3.7900709219858157,
+ "grad_norm": 3.196078062057495,
+ "learning_rate": 1.5161264350841543e-06,
+ "loss": 0.4058,
+ "step": 8016
+ },
+ {
+ "epoch": 3.7905437352245865,
+ "grad_norm": 9.898200988769531,
+ "learning_rate": 1.5155529794236884e-06,
+ "loss": 0.3451,
+ "step": 8017
+ },
+ {
+ "epoch": 3.791016548463357,
+ "grad_norm": 3.0028066635131836,
+ "learning_rate": 1.514979585060421e-06,
+ "loss": 0.4029,
+ "step": 8018
+ },
+ {
+ "epoch": 3.7914893617021277,
+ "grad_norm": 2.984926223754883,
+ "learning_rate": 1.5144062520300562e-06,
+ "loss": 0.3995,
+ "step": 8019
+ },
+ {
+ "epoch": 3.7919621749408985,
+ "grad_norm": 2.938596487045288,
+ "learning_rate": 1.5138329803682925e-06,
+ "loss": 0.386,
+ "step": 8020
+ },
+ {
+ "epoch": 3.792434988179669,
+ "grad_norm": 2.992565393447876,
+ "learning_rate": 1.513259770110825e-06,
+ "loss": 0.3919,
+ "step": 8021
+ },
+ {
+ "epoch": 3.7929078014184396,
+ "grad_norm": 3.0182361602783203,
+ "learning_rate": 1.5126866212933453e-06,
+ "loss": 0.3506,
+ "step": 8022
+ },
+ {
+ "epoch": 3.7933806146572104,
+ "grad_norm": 3.2039108276367188,
+ "learning_rate": 1.5121135339515392e-06,
+ "loss": 0.3807,
+ "step": 8023
+ },
+ {
+ "epoch": 3.7938534278959812,
+ "grad_norm": 2.9290878772735596,
+ "learning_rate": 1.5115405081210927e-06,
+ "loss": 0.3596,
+ "step": 8024
+ },
+ {
+ "epoch": 3.794326241134752,
+ "grad_norm": 3.106152057647705,
+ "learning_rate": 1.510967543837683e-06,
+ "loss": 0.3703,
+ "step": 8025
+ },
+ {
+ "epoch": 3.7947990543735224,
+ "grad_norm": 2.9752190113067627,
+ "learning_rate": 1.510394641136989e-06,
+ "loss": 0.4049,
+ "step": 8026
+ },
+ {
+ "epoch": 3.795271867612293,
+ "grad_norm": 2.996206283569336,
+ "learning_rate": 1.5098218000546815e-06,
+ "loss": 0.4286,
+ "step": 8027
+ },
+ {
+ "epoch": 3.795744680851064,
+ "grad_norm": 2.9403493404388428,
+ "learning_rate": 1.5092490206264281e-06,
+ "loss": 0.3628,
+ "step": 8028
+ },
+ {
+ "epoch": 3.7962174940898343,
+ "grad_norm": 2.8101110458374023,
+ "learning_rate": 1.5086763028878943e-06,
+ "loss": 0.4016,
+ "step": 8029
+ },
+ {
+ "epoch": 3.796690307328605,
+ "grad_norm": 3.162264108657837,
+ "learning_rate": 1.5081036468747401e-06,
+ "loss": 0.4133,
+ "step": 8030
+ },
+ {
+ "epoch": 3.797163120567376,
+ "grad_norm": 2.6871988773345947,
+ "learning_rate": 1.5075310526226223e-06,
+ "loss": 0.3748,
+ "step": 8031
+ },
+ {
+ "epoch": 3.7976359338061467,
+ "grad_norm": 2.997924327850342,
+ "learning_rate": 1.5069585201671944e-06,
+ "loss": 0.4083,
+ "step": 8032
+ },
+ {
+ "epoch": 3.7981087470449175,
+ "grad_norm": 2.8266279697418213,
+ "learning_rate": 1.506386049544104e-06,
+ "loss": 0.4488,
+ "step": 8033
+ },
+ {
+ "epoch": 3.798581560283688,
+ "grad_norm": 2.7106378078460693,
+ "learning_rate": 1.5058136407889985e-06,
+ "loss": 0.363,
+ "step": 8034
+ },
+ {
+ "epoch": 3.7990543735224587,
+ "grad_norm": 2.8983304500579834,
+ "learning_rate": 1.5052412939375183e-06,
+ "loss": 0.4156,
+ "step": 8035
+ },
+ {
+ "epoch": 3.7995271867612295,
+ "grad_norm": 3.0333914756774902,
+ "learning_rate": 1.5046690090253001e-06,
+ "loss": 0.3694,
+ "step": 8036
+ },
+ {
+ "epoch": 3.8,
+ "grad_norm": 2.872662305831909,
+ "learning_rate": 1.5040967860879785e-06,
+ "loss": 0.3492,
+ "step": 8037
+ },
+ {
+ "epoch": 3.8004728132387706,
+ "grad_norm": 2.7279646396636963,
+ "learning_rate": 1.5035246251611835e-06,
+ "loss": 0.327,
+ "step": 8038
+ },
+ {
+ "epoch": 3.8009456264775414,
+ "grad_norm": 2.969326972961426,
+ "learning_rate": 1.5029525262805405e-06,
+ "loss": 0.3977,
+ "step": 8039
+ },
+ {
+ "epoch": 3.801418439716312,
+ "grad_norm": 3.073899745941162,
+ "learning_rate": 1.5023804894816723e-06,
+ "loss": 0.388,
+ "step": 8040
+ },
+ {
+ "epoch": 3.801891252955083,
+ "grad_norm": 3.026284694671631,
+ "learning_rate": 1.5018085148001953e-06,
+ "loss": 0.3761,
+ "step": 8041
+ },
+ {
+ "epoch": 3.8023640661938534,
+ "grad_norm": 3.0478618144989014,
+ "learning_rate": 1.5012366022717262e-06,
+ "loss": 0.4415,
+ "step": 8042
+ },
+ {
+ "epoch": 3.802836879432624,
+ "grad_norm": 2.801584005355835,
+ "learning_rate": 1.500664751931874e-06,
+ "loss": 0.4079,
+ "step": 8043
+ },
+ {
+ "epoch": 3.803309692671395,
+ "grad_norm": 3.4839112758636475,
+ "learning_rate": 1.5000929638162459e-06,
+ "loss": 0.4391,
+ "step": 8044
+ },
+ {
+ "epoch": 3.8037825059101653,
+ "grad_norm": 2.6945605278015137,
+ "learning_rate": 1.4995212379604446e-06,
+ "loss": 0.3564,
+ "step": 8045
+ },
+ {
+ "epoch": 3.804255319148936,
+ "grad_norm": 3.0870234966278076,
+ "learning_rate": 1.4989495744000687e-06,
+ "loss": 0.3801,
+ "step": 8046
+ },
+ {
+ "epoch": 3.804728132387707,
+ "grad_norm": 2.975332021713257,
+ "learning_rate": 1.4983779731707135e-06,
+ "loss": 0.3408,
+ "step": 8047
+ },
+ {
+ "epoch": 3.8052009456264777,
+ "grad_norm": 2.9920027256011963,
+ "learning_rate": 1.497806434307969e-06,
+ "loss": 0.3875,
+ "step": 8048
+ },
+ {
+ "epoch": 3.8056737588652485,
+ "grad_norm": 3.1974916458129883,
+ "learning_rate": 1.4972349578474244e-06,
+ "loss": 0.4492,
+ "step": 8049
+ },
+ {
+ "epoch": 3.806146572104019,
+ "grad_norm": 2.839503526687622,
+ "learning_rate": 1.4966635438246622e-06,
+ "loss": 0.3785,
+ "step": 8050
+ },
+ {
+ "epoch": 3.8066193853427897,
+ "grad_norm": 3.274502992630005,
+ "learning_rate": 1.4960921922752603e-06,
+ "loss": 0.4404,
+ "step": 8051
+ },
+ {
+ "epoch": 3.8070921985815604,
+ "grad_norm": 3.0852737426757812,
+ "learning_rate": 1.4955209032347967e-06,
+ "loss": 0.4047,
+ "step": 8052
+ },
+ {
+ "epoch": 3.807565011820331,
+ "grad_norm": 2.9251608848571777,
+ "learning_rate": 1.4949496767388417e-06,
+ "loss": 0.3654,
+ "step": 8053
+ },
+ {
+ "epoch": 3.8080378250591016,
+ "grad_norm": 2.518220901489258,
+ "learning_rate": 1.4943785128229635e-06,
+ "loss": 0.3157,
+ "step": 8054
+ },
+ {
+ "epoch": 3.8085106382978724,
+ "grad_norm": 3.3993279933929443,
+ "learning_rate": 1.4938074115227257e-06,
+ "loss": 0.4204,
+ "step": 8055
+ },
+ {
+ "epoch": 3.808983451536643,
+ "grad_norm": 3.2847096920013428,
+ "learning_rate": 1.4932363728736876e-06,
+ "loss": 0.339,
+ "step": 8056
+ },
+ {
+ "epoch": 3.8094562647754135,
+ "grad_norm": 2.7779417037963867,
+ "learning_rate": 1.492665396911407e-06,
+ "loss": 0.3538,
+ "step": 8057
+ },
+ {
+ "epoch": 3.8099290780141843,
+ "grad_norm": 2.958131790161133,
+ "learning_rate": 1.4920944836714353e-06,
+ "loss": 0.363,
+ "step": 8058
+ },
+ {
+ "epoch": 3.810401891252955,
+ "grad_norm": 3.1873440742492676,
+ "learning_rate": 1.491523633189319e-06,
+ "loss": 0.3785,
+ "step": 8059
+ },
+ {
+ "epoch": 3.8108747044917255,
+ "grad_norm": 3.132652759552002,
+ "learning_rate": 1.4909528455006055e-06,
+ "loss": 0.375,
+ "step": 8060
+ },
+ {
+ "epoch": 3.8113475177304963,
+ "grad_norm": 2.8598761558532715,
+ "learning_rate": 1.490382120640833e-06,
+ "loss": 0.4152,
+ "step": 8061
+ },
+ {
+ "epoch": 3.811820330969267,
+ "grad_norm": 3.115870952606201,
+ "learning_rate": 1.4898114586455399e-06,
+ "loss": 0.4609,
+ "step": 8062
+ },
+ {
+ "epoch": 3.812293144208038,
+ "grad_norm": 3.347944974899292,
+ "learning_rate": 1.4892408595502571e-06,
+ "loss": 0.3836,
+ "step": 8063
+ },
+ {
+ "epoch": 3.8127659574468087,
+ "grad_norm": 3.1747031211853027,
+ "learning_rate": 1.4886703233905132e-06,
+ "loss": 0.374,
+ "step": 8064
+ },
+ {
+ "epoch": 3.813238770685579,
+ "grad_norm": 2.945139169692993,
+ "learning_rate": 1.4880998502018345e-06,
+ "loss": 0.3652,
+ "step": 8065
+ },
+ {
+ "epoch": 3.81371158392435,
+ "grad_norm": 2.8911492824554443,
+ "learning_rate": 1.4875294400197403e-06,
+ "loss": 0.3683,
+ "step": 8066
+ },
+ {
+ "epoch": 3.8141843971631206,
+ "grad_norm": 3.080268383026123,
+ "learning_rate": 1.4869590928797491e-06,
+ "loss": 0.3919,
+ "step": 8067
+ },
+ {
+ "epoch": 3.814657210401891,
+ "grad_norm": 3.0834288597106934,
+ "learning_rate": 1.4863888088173734e-06,
+ "loss": 0.3988,
+ "step": 8068
+ },
+ {
+ "epoch": 3.815130023640662,
+ "grad_norm": 2.765702724456787,
+ "learning_rate": 1.4858185878681213e-06,
+ "loss": 0.3659,
+ "step": 8069
+ },
+ {
+ "epoch": 3.8156028368794326,
+ "grad_norm": 3.074059247970581,
+ "learning_rate": 1.4852484300674993e-06,
+ "loss": 0.3888,
+ "step": 8070
+ },
+ {
+ "epoch": 3.8160756501182034,
+ "grad_norm": 3.0009944438934326,
+ "learning_rate": 1.484678335451007e-06,
+ "loss": 0.417,
+ "step": 8071
+ },
+ {
+ "epoch": 3.816548463356974,
+ "grad_norm": 2.6661112308502197,
+ "learning_rate": 1.4841083040541438e-06,
+ "loss": 0.3544,
+ "step": 8072
+ },
+ {
+ "epoch": 3.8170212765957445,
+ "grad_norm": 2.7849514484405518,
+ "learning_rate": 1.4835383359124018e-06,
+ "loss": 0.3691,
+ "step": 8073
+ },
+ {
+ "epoch": 3.8174940898345153,
+ "grad_norm": 3.008070707321167,
+ "learning_rate": 1.4829684310612697e-06,
+ "loss": 0.4228,
+ "step": 8074
+ },
+ {
+ "epoch": 3.817966903073286,
+ "grad_norm": 2.649296998977661,
+ "learning_rate": 1.4823985895362348e-06,
+ "loss": 0.3642,
+ "step": 8075
+ },
+ {
+ "epoch": 3.8184397163120565,
+ "grad_norm": 2.6017661094665527,
+ "learning_rate": 1.4818288113727768e-06,
+ "loss": 0.3537,
+ "step": 8076
+ },
+ {
+ "epoch": 3.8189125295508273,
+ "grad_norm": 2.9071972370147705,
+ "learning_rate": 1.481259096606375e-06,
+ "loss": 0.3096,
+ "step": 8077
+ },
+ {
+ "epoch": 3.819385342789598,
+ "grad_norm": 3.0866518020629883,
+ "learning_rate": 1.4806894452725024e-06,
+ "loss": 0.4148,
+ "step": 8078
+ },
+ {
+ "epoch": 3.819858156028369,
+ "grad_norm": 3.2099499702453613,
+ "learning_rate": 1.4801198574066272e-06,
+ "loss": 0.4058,
+ "step": 8079
+ },
+ {
+ "epoch": 3.8203309692671397,
+ "grad_norm": 3.0204920768737793,
+ "learning_rate": 1.4795503330442176e-06,
+ "loss": 0.3427,
+ "step": 8080
+ },
+ {
+ "epoch": 3.82080378250591,
+ "grad_norm": 2.88667368888855,
+ "learning_rate": 1.478980872220734e-06,
+ "loss": 0.4075,
+ "step": 8081
+ },
+ {
+ "epoch": 3.821276595744681,
+ "grad_norm": 2.926673173904419,
+ "learning_rate": 1.4784114749716338e-06,
+ "loss": 0.3449,
+ "step": 8082
+ },
+ {
+ "epoch": 3.8217494089834516,
+ "grad_norm": 2.818936347961426,
+ "learning_rate": 1.4778421413323723e-06,
+ "loss": 0.3628,
+ "step": 8083
+ },
+ {
+ "epoch": 3.822222222222222,
+ "grad_norm": 2.960322380065918,
+ "learning_rate": 1.4772728713383983e-06,
+ "loss": 0.3669,
+ "step": 8084
+ },
+ {
+ "epoch": 3.8226950354609928,
+ "grad_norm": 2.940131902694702,
+ "learning_rate": 1.4767036650251584e-06,
+ "loss": 0.4357,
+ "step": 8085
+ },
+ {
+ "epoch": 3.8231678486997636,
+ "grad_norm": 2.9251785278320312,
+ "learning_rate": 1.4761345224280943e-06,
+ "loss": 0.4046,
+ "step": 8086
+ },
+ {
+ "epoch": 3.8236406619385344,
+ "grad_norm": 3.115590810775757,
+ "learning_rate": 1.475565443582643e-06,
+ "loss": 0.3712,
+ "step": 8087
+ },
+ {
+ "epoch": 3.824113475177305,
+ "grad_norm": 2.5968618392944336,
+ "learning_rate": 1.4749964285242408e-06,
+ "loss": 0.3432,
+ "step": 8088
+ },
+ {
+ "epoch": 3.8245862884160755,
+ "grad_norm": 3.195409059524536,
+ "learning_rate": 1.4744274772883148e-06,
+ "loss": 0.3717,
+ "step": 8089
+ },
+ {
+ "epoch": 3.8250591016548463,
+ "grad_norm": 2.8658018112182617,
+ "learning_rate": 1.4738585899102942e-06,
+ "loss": 0.3807,
+ "step": 8090
+ },
+ {
+ "epoch": 3.825531914893617,
+ "grad_norm": 2.9005510807037354,
+ "learning_rate": 1.4732897664255998e-06,
+ "loss": 0.3988,
+ "step": 8091
+ },
+ {
+ "epoch": 3.8260047281323875,
+ "grad_norm": 3.9155731201171875,
+ "learning_rate": 1.472721006869649e-06,
+ "loss": 0.3981,
+ "step": 8092
+ },
+ {
+ "epoch": 3.8264775413711583,
+ "grad_norm": 2.89312744140625,
+ "learning_rate": 1.4721523112778575e-06,
+ "loss": 0.3286,
+ "step": 8093
+ },
+ {
+ "epoch": 3.826950354609929,
+ "grad_norm": 3.006071090698242,
+ "learning_rate": 1.4715836796856332e-06,
+ "loss": 0.3901,
+ "step": 8094
+ },
+ {
+ "epoch": 3.8274231678487,
+ "grad_norm": 3.083411693572998,
+ "learning_rate": 1.4710151121283845e-06,
+ "loss": 0.3741,
+ "step": 8095
+ },
+ {
+ "epoch": 3.8278959810874706,
+ "grad_norm": 2.864989995956421,
+ "learning_rate": 1.4704466086415131e-06,
+ "loss": 0.3887,
+ "step": 8096
+ },
+ {
+ "epoch": 3.828368794326241,
+ "grad_norm": 2.4846417903900146,
+ "learning_rate": 1.4698781692604158e-06,
+ "loss": 0.33,
+ "step": 8097
+ },
+ {
+ "epoch": 3.828841607565012,
+ "grad_norm": 3.2497007846832275,
+ "learning_rate": 1.4693097940204893e-06,
+ "loss": 0.4011,
+ "step": 8098
+ },
+ {
+ "epoch": 3.8293144208037826,
+ "grad_norm": 3.0079777240753174,
+ "learning_rate": 1.4687414829571218e-06,
+ "loss": 0.4263,
+ "step": 8099
+ },
+ {
+ "epoch": 3.829787234042553,
+ "grad_norm": 2.8538410663604736,
+ "learning_rate": 1.4681732361057005e-06,
+ "loss": 0.3651,
+ "step": 8100
+ },
+ {
+ "epoch": 3.8302600472813237,
+ "grad_norm": 3.238163948059082,
+ "learning_rate": 1.4676050535016076e-06,
+ "loss": 0.392,
+ "step": 8101
+ },
+ {
+ "epoch": 3.8307328605200945,
+ "grad_norm": 2.9991304874420166,
+ "learning_rate": 1.46703693518022e-06,
+ "loss": 0.3643,
+ "step": 8102
+ },
+ {
+ "epoch": 3.8312056737588653,
+ "grad_norm": 2.9816839694976807,
+ "learning_rate": 1.466468881176914e-06,
+ "loss": 0.3803,
+ "step": 8103
+ },
+ {
+ "epoch": 3.831678486997636,
+ "grad_norm": 3.2009265422821045,
+ "learning_rate": 1.465900891527059e-06,
+ "loss": 0.3828,
+ "step": 8104
+ },
+ {
+ "epoch": 3.8321513002364065,
+ "grad_norm": 2.9479124546051025,
+ "learning_rate": 1.4653329662660201e-06,
+ "loss": 0.3683,
+ "step": 8105
+ },
+ {
+ "epoch": 3.8326241134751773,
+ "grad_norm": 2.938507080078125,
+ "learning_rate": 1.4647651054291614e-06,
+ "loss": 0.3703,
+ "step": 8106
+ },
+ {
+ "epoch": 3.833096926713948,
+ "grad_norm": 2.7777645587921143,
+ "learning_rate": 1.4641973090518397e-06,
+ "loss": 0.3982,
+ "step": 8107
+ },
+ {
+ "epoch": 3.8335697399527184,
+ "grad_norm": 3.2470149993896484,
+ "learning_rate": 1.4636295771694099e-06,
+ "loss": 0.3748,
+ "step": 8108
+ },
+ {
+ "epoch": 3.8340425531914892,
+ "grad_norm": 2.869310140609741,
+ "learning_rate": 1.4630619098172223e-06,
+ "loss": 0.3577,
+ "step": 8109
+ },
+ {
+ "epoch": 3.83451536643026,
+ "grad_norm": 3.1245369911193848,
+ "learning_rate": 1.4624943070306225e-06,
+ "loss": 0.4518,
+ "step": 8110
+ },
+ {
+ "epoch": 3.834988179669031,
+ "grad_norm": 3.0390701293945312,
+ "learning_rate": 1.4619267688449529e-06,
+ "loss": 0.5051,
+ "step": 8111
+ },
+ {
+ "epoch": 3.8354609929078016,
+ "grad_norm": 2.929943799972534,
+ "learning_rate": 1.4613592952955507e-06,
+ "loss": 0.4207,
+ "step": 8112
+ },
+ {
+ "epoch": 3.835933806146572,
+ "grad_norm": 3.17008376121521,
+ "learning_rate": 1.4607918864177523e-06,
+ "loss": 0.3836,
+ "step": 8113
+ },
+ {
+ "epoch": 3.8364066193853428,
+ "grad_norm": 3.0689237117767334,
+ "learning_rate": 1.460224542246886e-06,
+ "loss": 0.3413,
+ "step": 8114
+ },
+ {
+ "epoch": 3.8368794326241136,
+ "grad_norm": 2.9966423511505127,
+ "learning_rate": 1.4596572628182774e-06,
+ "loss": 0.4367,
+ "step": 8115
+ },
+ {
+ "epoch": 3.837352245862884,
+ "grad_norm": 3.0572052001953125,
+ "learning_rate": 1.45909004816725e-06,
+ "loss": 0.4089,
+ "step": 8116
+ },
+ {
+ "epoch": 3.8378250591016547,
+ "grad_norm": 2.911263942718506,
+ "learning_rate": 1.4585228983291203e-06,
+ "loss": 0.3848,
+ "step": 8117
+ },
+ {
+ "epoch": 3.8382978723404255,
+ "grad_norm": 2.9233853816986084,
+ "learning_rate": 1.4579558133392038e-06,
+ "loss": 0.4012,
+ "step": 8118
+ },
+ {
+ "epoch": 3.8387706855791963,
+ "grad_norm": 2.7813868522644043,
+ "learning_rate": 1.4573887932328097e-06,
+ "loss": 0.3898,
+ "step": 8119
+ },
+ {
+ "epoch": 3.839243498817967,
+ "grad_norm": 2.8727006912231445,
+ "learning_rate": 1.4568218380452436e-06,
+ "loss": 0.3965,
+ "step": 8120
+ },
+ {
+ "epoch": 3.8397163120567375,
+ "grad_norm": 3.0381174087524414,
+ "learning_rate": 1.4562549478118077e-06,
+ "loss": 0.4304,
+ "step": 8121
+ },
+ {
+ "epoch": 3.8401891252955083,
+ "grad_norm": 2.7406346797943115,
+ "learning_rate": 1.4556881225677982e-06,
+ "loss": 0.3636,
+ "step": 8122
+ },
+ {
+ "epoch": 3.840661938534279,
+ "grad_norm": 3.3900108337402344,
+ "learning_rate": 1.4551213623485111e-06,
+ "loss": 0.3863,
+ "step": 8123
+ },
+ {
+ "epoch": 3.8411347517730494,
+ "grad_norm": 2.885150909423828,
+ "learning_rate": 1.4545546671892354e-06,
+ "loss": 0.3679,
+ "step": 8124
+ },
+ {
+ "epoch": 3.84160756501182,
+ "grad_norm": 3.3361690044403076,
+ "learning_rate": 1.4539880371252555e-06,
+ "loss": 0.4333,
+ "step": 8125
+ },
+ {
+ "epoch": 3.842080378250591,
+ "grad_norm": 3.1547763347625732,
+ "learning_rate": 1.4534214721918545e-06,
+ "loss": 0.4477,
+ "step": 8126
+ },
+ {
+ "epoch": 3.842553191489362,
+ "grad_norm": 3.0337510108947754,
+ "learning_rate": 1.4528549724243095e-06,
+ "loss": 0.3647,
+ "step": 8127
+ },
+ {
+ "epoch": 3.8430260047281326,
+ "grad_norm": 2.8390069007873535,
+ "learning_rate": 1.452288537857893e-06,
+ "loss": 0.3698,
+ "step": 8128
+ },
+ {
+ "epoch": 3.843498817966903,
+ "grad_norm": 2.857513427734375,
+ "learning_rate": 1.451722168527876e-06,
+ "loss": 0.3842,
+ "step": 8129
+ },
+ {
+ "epoch": 3.8439716312056738,
+ "grad_norm": 3.015320062637329,
+ "learning_rate": 1.451155864469522e-06,
+ "loss": 0.4058,
+ "step": 8130
+ },
+ {
+ "epoch": 3.8444444444444446,
+ "grad_norm": 2.923957347869873,
+ "learning_rate": 1.450589625718094e-06,
+ "loss": 0.3976,
+ "step": 8131
+ },
+ {
+ "epoch": 3.844917257683215,
+ "grad_norm": 3.332338571548462,
+ "learning_rate": 1.4500234523088492e-06,
+ "loss": 0.4118,
+ "step": 8132
+ },
+ {
+ "epoch": 3.8453900709219857,
+ "grad_norm": 3.0403711795806885,
+ "learning_rate": 1.4494573442770381e-06,
+ "loss": 0.3715,
+ "step": 8133
+ },
+ {
+ "epoch": 3.8458628841607565,
+ "grad_norm": 3.2310287952423096,
+ "learning_rate": 1.4488913016579135e-06,
+ "loss": 0.4587,
+ "step": 8134
+ },
+ {
+ "epoch": 3.8463356973995273,
+ "grad_norm": 3.091282844543457,
+ "learning_rate": 1.448325324486718e-06,
+ "loss": 0.4234,
+ "step": 8135
+ },
+ {
+ "epoch": 3.846808510638298,
+ "grad_norm": 3.11161208152771,
+ "learning_rate": 1.4477594127986933e-06,
+ "loss": 0.4176,
+ "step": 8136
+ },
+ {
+ "epoch": 3.8472813238770684,
+ "grad_norm": 3.21042537689209,
+ "learning_rate": 1.4471935666290751e-06,
+ "loss": 0.4326,
+ "step": 8137
+ },
+ {
+ "epoch": 3.8477541371158392,
+ "grad_norm": 3.411543846130371,
+ "learning_rate": 1.4466277860130981e-06,
+ "loss": 0.4525,
+ "step": 8138
+ },
+ {
+ "epoch": 3.84822695035461,
+ "grad_norm": 3.0475308895111084,
+ "learning_rate": 1.4460620709859898e-06,
+ "loss": 0.3906,
+ "step": 8139
+ },
+ {
+ "epoch": 3.8486997635933804,
+ "grad_norm": 2.989367723464966,
+ "learning_rate": 1.4454964215829742e-06,
+ "loss": 0.3732,
+ "step": 8140
+ },
+ {
+ "epoch": 3.849172576832151,
+ "grad_norm": 2.8130393028259277,
+ "learning_rate": 1.4449308378392734e-06,
+ "loss": 0.3733,
+ "step": 8141
+ },
+ {
+ "epoch": 3.849645390070922,
+ "grad_norm": 12.2243013381958,
+ "learning_rate": 1.444365319790103e-06,
+ "loss": 0.3506,
+ "step": 8142
+ },
+ {
+ "epoch": 3.850118203309693,
+ "grad_norm": 3.075556516647339,
+ "learning_rate": 1.4437998674706743e-06,
+ "loss": 0.376,
+ "step": 8143
+ },
+ {
+ "epoch": 3.8505910165484636,
+ "grad_norm": 2.765650510787964,
+ "learning_rate": 1.4432344809161974e-06,
+ "loss": 0.3865,
+ "step": 8144
+ },
+ {
+ "epoch": 3.851063829787234,
+ "grad_norm": 3.171588897705078,
+ "learning_rate": 1.4426691601618747e-06,
+ "loss": 0.4391,
+ "step": 8145
+ },
+ {
+ "epoch": 3.8515366430260047,
+ "grad_norm": 2.8378992080688477,
+ "learning_rate": 1.4421039052429083e-06,
+ "loss": 0.3984,
+ "step": 8146
+ },
+ {
+ "epoch": 3.8520094562647755,
+ "grad_norm": 2.6588387489318848,
+ "learning_rate": 1.4415387161944929e-06,
+ "loss": 0.3961,
+ "step": 8147
+ },
+ {
+ "epoch": 3.852482269503546,
+ "grad_norm": 2.919325351715088,
+ "learning_rate": 1.4409735930518197e-06,
+ "loss": 0.4058,
+ "step": 8148
+ },
+ {
+ "epoch": 3.8529550827423167,
+ "grad_norm": 3.2239115238189697,
+ "learning_rate": 1.4404085358500778e-06,
+ "loss": 0.4018,
+ "step": 8149
+ },
+ {
+ "epoch": 3.8534278959810875,
+ "grad_norm": 3.2509875297546387,
+ "learning_rate": 1.4398435446244502e-06,
+ "loss": 0.4078,
+ "step": 8150
+ },
+ {
+ "epoch": 3.8539007092198583,
+ "grad_norm": 3.124782085418701,
+ "learning_rate": 1.4392786194101155e-06,
+ "loss": 0.4459,
+ "step": 8151
+ },
+ {
+ "epoch": 3.854373522458629,
+ "grad_norm": 2.924095392227173,
+ "learning_rate": 1.4387137602422512e-06,
+ "loss": 0.3686,
+ "step": 8152
+ },
+ {
+ "epoch": 3.8548463356973994,
+ "grad_norm": 2.9307191371917725,
+ "learning_rate": 1.4381489671560272e-06,
+ "loss": 0.4345,
+ "step": 8153
+ },
+ {
+ "epoch": 3.8553191489361702,
+ "grad_norm": 2.868488073348999,
+ "learning_rate": 1.4375842401866113e-06,
+ "loss": 0.366,
+ "step": 8154
+ },
+ {
+ "epoch": 3.855791962174941,
+ "grad_norm": 2.9893085956573486,
+ "learning_rate": 1.4370195793691661e-06,
+ "loss": 0.3401,
+ "step": 8155
+ },
+ {
+ "epoch": 3.8562647754137114,
+ "grad_norm": 3.0113472938537598,
+ "learning_rate": 1.4364549847388492e-06,
+ "loss": 0.4051,
+ "step": 8156
+ },
+ {
+ "epoch": 3.856737588652482,
+ "grad_norm": 3.4693121910095215,
+ "learning_rate": 1.4358904563308184e-06,
+ "loss": 0.4505,
+ "step": 8157
+ },
+ {
+ "epoch": 3.857210401891253,
+ "grad_norm": 2.9048118591308594,
+ "learning_rate": 1.4353259941802216e-06,
+ "loss": 0.3973,
+ "step": 8158
+ },
+ {
+ "epoch": 3.8576832151300238,
+ "grad_norm": 3.264910936355591,
+ "learning_rate": 1.434761598322208e-06,
+ "loss": 0.4317,
+ "step": 8159
+ },
+ {
+ "epoch": 3.8581560283687946,
+ "grad_norm": 2.973742723464966,
+ "learning_rate": 1.4341972687919186e-06,
+ "loss": 0.3896,
+ "step": 8160
+ },
+ {
+ "epoch": 3.858628841607565,
+ "grad_norm": 2.7802605628967285,
+ "learning_rate": 1.4336330056244906e-06,
+ "loss": 0.4063,
+ "step": 8161
+ },
+ {
+ "epoch": 3.8591016548463357,
+ "grad_norm": 3.1401731967926025,
+ "learning_rate": 1.433068808855061e-06,
+ "loss": 0.4068,
+ "step": 8162
+ },
+ {
+ "epoch": 3.8595744680851065,
+ "grad_norm": 3.132723331451416,
+ "learning_rate": 1.432504678518757e-06,
+ "loss": 0.4724,
+ "step": 8163
+ },
+ {
+ "epoch": 3.860047281323877,
+ "grad_norm": 2.94944167137146,
+ "learning_rate": 1.4319406146507068e-06,
+ "loss": 0.3666,
+ "step": 8164
+ },
+ {
+ "epoch": 3.8605200945626477,
+ "grad_norm": 2.972322463989258,
+ "learning_rate": 1.4313766172860311e-06,
+ "loss": 0.4226,
+ "step": 8165
+ },
+ {
+ "epoch": 3.8609929078014185,
+ "grad_norm": 2.9808123111724854,
+ "learning_rate": 1.430812686459847e-06,
+ "loss": 0.4079,
+ "step": 8166
+ },
+ {
+ "epoch": 3.8614657210401893,
+ "grad_norm": 2.9656291007995605,
+ "learning_rate": 1.4302488222072698e-06,
+ "loss": 0.3423,
+ "step": 8167
+ },
+ {
+ "epoch": 3.86193853427896,
+ "grad_norm": 2.886765241622925,
+ "learning_rate": 1.4296850245634073e-06,
+ "loss": 0.3577,
+ "step": 8168
+ },
+ {
+ "epoch": 3.8624113475177304,
+ "grad_norm": 3.0613043308258057,
+ "learning_rate": 1.4291212935633653e-06,
+ "loss": 0.4121,
+ "step": 8169
+ },
+ {
+ "epoch": 3.862884160756501,
+ "grad_norm": 2.842050313949585,
+ "learning_rate": 1.4285576292422445e-06,
+ "loss": 0.373,
+ "step": 8170
+ },
+ {
+ "epoch": 3.863356973995272,
+ "grad_norm": 3.0604517459869385,
+ "learning_rate": 1.4279940316351413e-06,
+ "loss": 0.3938,
+ "step": 8171
+ },
+ {
+ "epoch": 3.8638297872340424,
+ "grad_norm": 3.9742302894592285,
+ "learning_rate": 1.42743050077715e-06,
+ "loss": 0.4463,
+ "step": 8172
+ },
+ {
+ "epoch": 3.864302600472813,
+ "grad_norm": 2.8330607414245605,
+ "learning_rate": 1.4268670367033572e-06,
+ "loss": 0.4423,
+ "step": 8173
+ },
+ {
+ "epoch": 3.864775413711584,
+ "grad_norm": 2.953256607055664,
+ "learning_rate": 1.4263036394488497e-06,
+ "loss": 0.3553,
+ "step": 8174
+ },
+ {
+ "epoch": 3.8652482269503547,
+ "grad_norm": 2.865849018096924,
+ "learning_rate": 1.4257403090487065e-06,
+ "loss": 0.3348,
+ "step": 8175
+ },
+ {
+ "epoch": 3.8657210401891255,
+ "grad_norm": 2.712502956390381,
+ "learning_rate": 1.4251770455380027e-06,
+ "loss": 0.3896,
+ "step": 8176
+ },
+ {
+ "epoch": 3.866193853427896,
+ "grad_norm": 2.798898220062256,
+ "learning_rate": 1.4246138489518123e-06,
+ "loss": 0.4275,
+ "step": 8177
+ },
+ {
+ "epoch": 3.8666666666666667,
+ "grad_norm": 2.830899953842163,
+ "learning_rate": 1.4240507193252023e-06,
+ "loss": 0.3952,
+ "step": 8178
+ },
+ {
+ "epoch": 3.8671394799054375,
+ "grad_norm": 2.5789451599121094,
+ "learning_rate": 1.4234876566932348e-06,
+ "loss": 0.3483,
+ "step": 8179
+ },
+ {
+ "epoch": 3.867612293144208,
+ "grad_norm": 2.8513095378875732,
+ "learning_rate": 1.422924661090972e-06,
+ "loss": 0.3403,
+ "step": 8180
+ },
+ {
+ "epoch": 3.8680851063829786,
+ "grad_norm": 3.5031449794769287,
+ "learning_rate": 1.4223617325534664e-06,
+ "loss": 0.3964,
+ "step": 8181
+ },
+ {
+ "epoch": 3.8685579196217494,
+ "grad_norm": 2.7495479583740234,
+ "learning_rate": 1.4217988711157715e-06,
+ "loss": 0.3376,
+ "step": 8182
+ },
+ {
+ "epoch": 3.8690307328605202,
+ "grad_norm": 2.8609421253204346,
+ "learning_rate": 1.421236076812933e-06,
+ "loss": 0.3967,
+ "step": 8183
+ },
+ {
+ "epoch": 3.869503546099291,
+ "grad_norm": 3.0624637603759766,
+ "learning_rate": 1.420673349679994e-06,
+ "loss": 0.3764,
+ "step": 8184
+ },
+ {
+ "epoch": 3.8699763593380614,
+ "grad_norm": 3.3084404468536377,
+ "learning_rate": 1.4201106897519926e-06,
+ "loss": 0.4567,
+ "step": 8185
+ },
+ {
+ "epoch": 3.870449172576832,
+ "grad_norm": 3.164116382598877,
+ "learning_rate": 1.4195480970639624e-06,
+ "loss": 0.4217,
+ "step": 8186
+ },
+ {
+ "epoch": 3.870921985815603,
+ "grad_norm": 2.971390724182129,
+ "learning_rate": 1.4189855716509355e-06,
+ "loss": 0.3981,
+ "step": 8187
+ },
+ {
+ "epoch": 3.8713947990543733,
+ "grad_norm": 3.0537233352661133,
+ "learning_rate": 1.418423113547937e-06,
+ "loss": 0.4093,
+ "step": 8188
+ },
+ {
+ "epoch": 3.871867612293144,
+ "grad_norm": 3.698120594024658,
+ "learning_rate": 1.4178607227899877e-06,
+ "loss": 0.3158,
+ "step": 8189
+ },
+ {
+ "epoch": 3.872340425531915,
+ "grad_norm": 3.0320451259613037,
+ "learning_rate": 1.417298399412107e-06,
+ "loss": 0.3903,
+ "step": 8190
+ },
+ {
+ "epoch": 3.8728132387706857,
+ "grad_norm": 2.913296699523926,
+ "learning_rate": 1.4167361434493068e-06,
+ "loss": 0.3396,
+ "step": 8191
+ },
+ {
+ "epoch": 3.8732860520094565,
+ "grad_norm": 3.011906147003174,
+ "learning_rate": 1.4161739549365976e-06,
+ "loss": 0.3915,
+ "step": 8192
+ },
+ {
+ "epoch": 3.873758865248227,
+ "grad_norm": 3.2707724571228027,
+ "learning_rate": 1.4156118339089842e-06,
+ "loss": 0.4466,
+ "step": 8193
+ },
+ {
+ "epoch": 3.8742316784869977,
+ "grad_norm": 3.036747694015503,
+ "learning_rate": 1.4150497804014656e-06,
+ "loss": 0.4095,
+ "step": 8194
+ },
+ {
+ "epoch": 3.8747044917257685,
+ "grad_norm": 2.8851394653320312,
+ "learning_rate": 1.4144877944490411e-06,
+ "loss": 0.4235,
+ "step": 8195
+ },
+ {
+ "epoch": 3.875177304964539,
+ "grad_norm": 3.099785566329956,
+ "learning_rate": 1.4139258760867008e-06,
+ "loss": 0.4102,
+ "step": 8196
+ },
+ {
+ "epoch": 3.8756501182033096,
+ "grad_norm": 3.0752081871032715,
+ "learning_rate": 1.4133640253494347e-06,
+ "loss": 0.4165,
+ "step": 8197
+ },
+ {
+ "epoch": 3.8761229314420804,
+ "grad_norm": 2.842257261276245,
+ "learning_rate": 1.412802242272226e-06,
+ "loss": 0.3573,
+ "step": 8198
+ },
+ {
+ "epoch": 3.876595744680851,
+ "grad_norm": 2.93868350982666,
+ "learning_rate": 1.4122405268900547e-06,
+ "loss": 0.36,
+ "step": 8199
+ },
+ {
+ "epoch": 3.877068557919622,
+ "grad_norm": 2.674356460571289,
+ "learning_rate": 1.411678879237896e-06,
+ "loss": 0.3763,
+ "step": 8200
+ },
+ {
+ "epoch": 3.8775413711583924,
+ "grad_norm": 2.710617780685425,
+ "learning_rate": 1.411117299350721e-06,
+ "loss": 0.358,
+ "step": 8201
+ },
+ {
+ "epoch": 3.878014184397163,
+ "grad_norm": 3.0299410820007324,
+ "learning_rate": 1.4105557872634968e-06,
+ "loss": 0.3723,
+ "step": 8202
+ },
+ {
+ "epoch": 3.878486997635934,
+ "grad_norm": 3.1951241493225098,
+ "learning_rate": 1.4099943430111874e-06,
+ "loss": 0.4163,
+ "step": 8203
+ },
+ {
+ "epoch": 3.8789598108747043,
+ "grad_norm": 2.752410411834717,
+ "learning_rate": 1.4094329666287495e-06,
+ "loss": 0.3753,
+ "step": 8204
+ },
+ {
+ "epoch": 3.879432624113475,
+ "grad_norm": 3.1242496967315674,
+ "learning_rate": 1.40887165815114e-06,
+ "loss": 0.3694,
+ "step": 8205
+ },
+ {
+ "epoch": 3.879905437352246,
+ "grad_norm": 5.16750431060791,
+ "learning_rate": 1.4083104176133079e-06,
+ "loss": 0.3869,
+ "step": 8206
+ },
+ {
+ "epoch": 3.8803782505910167,
+ "grad_norm": 3.2995245456695557,
+ "learning_rate": 1.4077492450501978e-06,
+ "loss": 0.4194,
+ "step": 8207
+ },
+ {
+ "epoch": 3.8808510638297875,
+ "grad_norm": 3.506807804107666,
+ "learning_rate": 1.4071881404967541e-06,
+ "loss": 0.3873,
+ "step": 8208
+ },
+ {
+ "epoch": 3.881323877068558,
+ "grad_norm": 3.1201252937316895,
+ "learning_rate": 1.4066271039879123e-06,
+ "loss": 0.3625,
+ "step": 8209
+ },
+ {
+ "epoch": 3.8817966903073287,
+ "grad_norm": 2.870683193206787,
+ "learning_rate": 1.4060661355586073e-06,
+ "loss": 0.4039,
+ "step": 8210
+ },
+ {
+ "epoch": 3.8822695035460995,
+ "grad_norm": 3.177701234817505,
+ "learning_rate": 1.405505235243767e-06,
+ "loss": 0.3715,
+ "step": 8211
+ },
+ {
+ "epoch": 3.88274231678487,
+ "grad_norm": 3.0319771766662598,
+ "learning_rate": 1.4049444030783157e-06,
+ "loss": 0.3588,
+ "step": 8212
+ },
+ {
+ "epoch": 3.8832151300236406,
+ "grad_norm": 2.4598889350891113,
+ "learning_rate": 1.404383639097176e-06,
+ "loss": 0.2788,
+ "step": 8213
+ },
+ {
+ "epoch": 3.8836879432624114,
+ "grad_norm": 2.916987419128418,
+ "learning_rate": 1.4038229433352623e-06,
+ "loss": 0.4167,
+ "step": 8214
+ },
+ {
+ "epoch": 3.884160756501182,
+ "grad_norm": 3.005075216293335,
+ "learning_rate": 1.4032623158274872e-06,
+ "loss": 0.4251,
+ "step": 8215
+ },
+ {
+ "epoch": 3.8846335697399526,
+ "grad_norm": 3.1718621253967285,
+ "learning_rate": 1.4027017566087591e-06,
+ "loss": 0.383,
+ "step": 8216
+ },
+ {
+ "epoch": 3.8851063829787233,
+ "grad_norm": 2.954662322998047,
+ "learning_rate": 1.402141265713981e-06,
+ "loss": 0.373,
+ "step": 8217
+ },
+ {
+ "epoch": 3.885579196217494,
+ "grad_norm": 3.408008337020874,
+ "learning_rate": 1.4015808431780526e-06,
+ "loss": 0.4216,
+ "step": 8218
+ },
+ {
+ "epoch": 3.8860520094562645,
+ "grad_norm": 3.1599369049072266,
+ "learning_rate": 1.4010204890358675e-06,
+ "loss": 0.4544,
+ "step": 8219
+ },
+ {
+ "epoch": 3.8865248226950353,
+ "grad_norm": 2.8919107913970947,
+ "learning_rate": 1.4004602033223186e-06,
+ "loss": 0.3785,
+ "step": 8220
+ },
+ {
+ "epoch": 3.886997635933806,
+ "grad_norm": 3.522581100463867,
+ "learning_rate": 1.3998999860722918e-06,
+ "loss": 0.4276,
+ "step": 8221
+ },
+ {
+ "epoch": 3.887470449172577,
+ "grad_norm": 2.9278945922851562,
+ "learning_rate": 1.399339837320668e-06,
+ "loss": 0.409,
+ "step": 8222
+ },
+ {
+ "epoch": 3.8879432624113477,
+ "grad_norm": 3.032557725906372,
+ "learning_rate": 1.398779757102327e-06,
+ "loss": 0.3973,
+ "step": 8223
+ },
+ {
+ "epoch": 3.888416075650118,
+ "grad_norm": 2.843118667602539,
+ "learning_rate": 1.3982197454521423e-06,
+ "loss": 0.3418,
+ "step": 8224
+ },
+ {
+ "epoch": 3.888888888888889,
+ "grad_norm": 2.8620638847351074,
+ "learning_rate": 1.3976598024049815e-06,
+ "loss": 0.3751,
+ "step": 8225
+ },
+ {
+ "epoch": 3.8893617021276596,
+ "grad_norm": 2.532327175140381,
+ "learning_rate": 1.3970999279957124e-06,
+ "loss": 0.3541,
+ "step": 8226
+ },
+ {
+ "epoch": 3.88983451536643,
+ "grad_norm": 3.1074535846710205,
+ "learning_rate": 1.3965401222591935e-06,
+ "loss": 0.4706,
+ "step": 8227
+ },
+ {
+ "epoch": 3.890307328605201,
+ "grad_norm": 3.1558735370635986,
+ "learning_rate": 1.3959803852302839e-06,
+ "loss": 0.448,
+ "step": 8228
+ },
+ {
+ "epoch": 3.8907801418439716,
+ "grad_norm": 3.0862064361572266,
+ "learning_rate": 1.3954207169438344e-06,
+ "loss": 0.3308,
+ "step": 8229
+ },
+ {
+ "epoch": 3.8912529550827424,
+ "grad_norm": 2.9246280193328857,
+ "learning_rate": 1.3948611174346927e-06,
+ "loss": 0.3771,
+ "step": 8230
+ },
+ {
+ "epoch": 3.891725768321513,
+ "grad_norm": 2.7959492206573486,
+ "learning_rate": 1.394301586737704e-06,
+ "loss": 0.4248,
+ "step": 8231
+ },
+ {
+ "epoch": 3.8921985815602835,
+ "grad_norm": 2.787670373916626,
+ "learning_rate": 1.3937421248877075e-06,
+ "loss": 0.3416,
+ "step": 8232
+ },
+ {
+ "epoch": 3.8926713947990543,
+ "grad_norm": 3.0775792598724365,
+ "learning_rate": 1.393182731919538e-06,
+ "loss": 0.4345,
+ "step": 8233
+ },
+ {
+ "epoch": 3.893144208037825,
+ "grad_norm": 2.6338887214660645,
+ "learning_rate": 1.3926234078680268e-06,
+ "loss": 0.3995,
+ "step": 8234
+ },
+ {
+ "epoch": 3.8936170212765955,
+ "grad_norm": 2.9975900650024414,
+ "learning_rate": 1.392064152767999e-06,
+ "loss": 0.3997,
+ "step": 8235
+ },
+ {
+ "epoch": 3.8940898345153663,
+ "grad_norm": 2.8615779876708984,
+ "learning_rate": 1.3915049666542791e-06,
+ "loss": 0.3687,
+ "step": 8236
+ },
+ {
+ "epoch": 3.894562647754137,
+ "grad_norm": 3.0132436752319336,
+ "learning_rate": 1.3909458495616835e-06,
+ "loss": 0.4085,
+ "step": 8237
+ },
+ {
+ "epoch": 3.895035460992908,
+ "grad_norm": 3.141291379928589,
+ "learning_rate": 1.3903868015250278e-06,
+ "loss": 0.3903,
+ "step": 8238
+ },
+ {
+ "epoch": 3.8955082742316787,
+ "grad_norm": 2.6998603343963623,
+ "learning_rate": 1.3898278225791204e-06,
+ "loss": 0.3576,
+ "step": 8239
+ },
+ {
+ "epoch": 3.895981087470449,
+ "grad_norm": 3.212578535079956,
+ "learning_rate": 1.3892689127587656e-06,
+ "loss": 0.4321,
+ "step": 8240
+ },
+ {
+ "epoch": 3.89645390070922,
+ "grad_norm": 3.15732741355896,
+ "learning_rate": 1.3887100720987662e-06,
+ "loss": 0.4247,
+ "step": 8241
+ },
+ {
+ "epoch": 3.8969267139479906,
+ "grad_norm": 2.6001040935516357,
+ "learning_rate": 1.3881513006339168e-06,
+ "loss": 0.3376,
+ "step": 8242
+ },
+ {
+ "epoch": 3.897399527186761,
+ "grad_norm": 2.766188859939575,
+ "learning_rate": 1.3875925983990113e-06,
+ "loss": 0.3771,
+ "step": 8243
+ },
+ {
+ "epoch": 3.8978723404255318,
+ "grad_norm": 2.7471580505371094,
+ "learning_rate": 1.3870339654288372e-06,
+ "loss": 0.3311,
+ "step": 8244
+ },
+ {
+ "epoch": 3.8983451536643026,
+ "grad_norm": 3.577664375305176,
+ "learning_rate": 1.3864754017581769e-06,
+ "loss": 0.3725,
+ "step": 8245
+ },
+ {
+ "epoch": 3.8988179669030734,
+ "grad_norm": 2.8747243881225586,
+ "learning_rate": 1.3859169074218116e-06,
+ "loss": 0.3706,
+ "step": 8246
+ },
+ {
+ "epoch": 3.899290780141844,
+ "grad_norm": 2.5249671936035156,
+ "learning_rate": 1.3853584824545152e-06,
+ "loss": 0.3621,
+ "step": 8247
+ },
+ {
+ "epoch": 3.8997635933806145,
+ "grad_norm": 2.7290890216827393,
+ "learning_rate": 1.3848001268910589e-06,
+ "loss": 0.3209,
+ "step": 8248
+ },
+ {
+ "epoch": 3.9002364066193853,
+ "grad_norm": 3.0917534828186035,
+ "learning_rate": 1.3842418407662084e-06,
+ "loss": 0.3904,
+ "step": 8249
+ },
+ {
+ "epoch": 3.900709219858156,
+ "grad_norm": 3.099494695663452,
+ "learning_rate": 1.383683624114725e-06,
+ "loss": 0.3714,
+ "step": 8250
+ },
+ {
+ "epoch": 3.9011820330969265,
+ "grad_norm": 3.077505588531494,
+ "learning_rate": 1.3831254769713687e-06,
+ "loss": 0.4166,
+ "step": 8251
+ },
+ {
+ "epoch": 3.9016548463356973,
+ "grad_norm": 2.9983766078948975,
+ "learning_rate": 1.3825673993708915e-06,
+ "loss": 0.3909,
+ "step": 8252
+ },
+ {
+ "epoch": 3.902127659574468,
+ "grad_norm": 2.7958667278289795,
+ "learning_rate": 1.3820093913480415e-06,
+ "loss": 0.3966,
+ "step": 8253
+ },
+ {
+ "epoch": 3.902600472813239,
+ "grad_norm": 3.0938336849212646,
+ "learning_rate": 1.3814514529375656e-06,
+ "loss": 0.4118,
+ "step": 8254
+ },
+ {
+ "epoch": 3.9030732860520096,
+ "grad_norm": 3.2711637020111084,
+ "learning_rate": 1.3808935841742016e-06,
+ "loss": 0.4021,
+ "step": 8255
+ },
+ {
+ "epoch": 3.90354609929078,
+ "grad_norm": 3.23563814163208,
+ "learning_rate": 1.3803357850926885e-06,
+ "loss": 0.3679,
+ "step": 8256
+ },
+ {
+ "epoch": 3.904018912529551,
+ "grad_norm": 2.77942156791687,
+ "learning_rate": 1.3797780557277563e-06,
+ "loss": 0.3938,
+ "step": 8257
+ },
+ {
+ "epoch": 3.9044917257683216,
+ "grad_norm": 3.1273257732391357,
+ "learning_rate": 1.3792203961141313e-06,
+ "loss": 0.3579,
+ "step": 8258
+ },
+ {
+ "epoch": 3.904964539007092,
+ "grad_norm": 3.69164776802063,
+ "learning_rate": 1.378662806286539e-06,
+ "loss": 0.3712,
+ "step": 8259
+ },
+ {
+ "epoch": 3.9054373522458627,
+ "grad_norm": 2.8818306922912598,
+ "learning_rate": 1.3781052862796957e-06,
+ "loss": 0.3972,
+ "step": 8260
+ },
+ {
+ "epoch": 3.9059101654846335,
+ "grad_norm": 2.776651382446289,
+ "learning_rate": 1.377547836128318e-06,
+ "loss": 0.3605,
+ "step": 8261
+ },
+ {
+ "epoch": 3.9063829787234043,
+ "grad_norm": 3.1498706340789795,
+ "learning_rate": 1.376990455867115e-06,
+ "loss": 0.3995,
+ "step": 8262
+ },
+ {
+ "epoch": 3.906855791962175,
+ "grad_norm": 2.777390956878662,
+ "learning_rate": 1.3764331455307916e-06,
+ "loss": 0.3463,
+ "step": 8263
+ },
+ {
+ "epoch": 3.9073286052009455,
+ "grad_norm": 2.9953835010528564,
+ "learning_rate": 1.3758759051540496e-06,
+ "loss": 0.3881,
+ "step": 8264
+ },
+ {
+ "epoch": 3.9078014184397163,
+ "grad_norm": 3.737194538116455,
+ "learning_rate": 1.375318734771585e-06,
+ "loss": 0.4456,
+ "step": 8265
+ },
+ {
+ "epoch": 3.908274231678487,
+ "grad_norm": 3.1575849056243896,
+ "learning_rate": 1.374761634418092e-06,
+ "loss": 0.3613,
+ "step": 8266
+ },
+ {
+ "epoch": 3.9087470449172574,
+ "grad_norm": 3.140662908554077,
+ "learning_rate": 1.374204604128258e-06,
+ "loss": 0.4462,
+ "step": 8267
+ },
+ {
+ "epoch": 3.9092198581560282,
+ "grad_norm": 3.2106714248657227,
+ "learning_rate": 1.3736476439367663e-06,
+ "loss": 0.3801,
+ "step": 8268
+ },
+ {
+ "epoch": 3.909692671394799,
+ "grad_norm": 2.888345956802368,
+ "learning_rate": 1.3730907538782976e-06,
+ "loss": 0.4209,
+ "step": 8269
+ },
+ {
+ "epoch": 3.91016548463357,
+ "grad_norm": 2.8903355598449707,
+ "learning_rate": 1.3725339339875252e-06,
+ "loss": 0.3612,
+ "step": 8270
+ },
+ {
+ "epoch": 3.9106382978723406,
+ "grad_norm": 3.2661736011505127,
+ "learning_rate": 1.371977184299122e-06,
+ "loss": 0.4151,
+ "step": 8271
+ },
+ {
+ "epoch": 3.911111111111111,
+ "grad_norm": 3.1532459259033203,
+ "learning_rate": 1.3714205048477535e-06,
+ "loss": 0.3706,
+ "step": 8272
+ },
+ {
+ "epoch": 3.911583924349882,
+ "grad_norm": 2.907306432723999,
+ "learning_rate": 1.3708638956680804e-06,
+ "loss": 0.4113,
+ "step": 8273
+ },
+ {
+ "epoch": 3.9120567375886526,
+ "grad_norm": 2.7301599979400635,
+ "learning_rate": 1.3703073567947622e-06,
+ "loss": 0.355,
+ "step": 8274
+ },
+ {
+ "epoch": 3.912529550827423,
+ "grad_norm": 2.595625877380371,
+ "learning_rate": 1.3697508882624516e-06,
+ "loss": 0.3733,
+ "step": 8275
+ },
+ {
+ "epoch": 3.9130023640661937,
+ "grad_norm": 2.784294366836548,
+ "learning_rate": 1.369194490105796e-06,
+ "loss": 0.3366,
+ "step": 8276
+ },
+ {
+ "epoch": 3.9134751773049645,
+ "grad_norm": 3.0179800987243652,
+ "learning_rate": 1.3686381623594419e-06,
+ "loss": 0.3922,
+ "step": 8277
+ },
+ {
+ "epoch": 3.9139479905437353,
+ "grad_norm": 2.6641111373901367,
+ "learning_rate": 1.3680819050580291e-06,
+ "loss": 0.3324,
+ "step": 8278
+ },
+ {
+ "epoch": 3.914420803782506,
+ "grad_norm": 2.917741060256958,
+ "learning_rate": 1.3675257182361923e-06,
+ "loss": 0.3784,
+ "step": 8279
+ },
+ {
+ "epoch": 3.9148936170212765,
+ "grad_norm": 2.959599018096924,
+ "learning_rate": 1.3669696019285626e-06,
+ "loss": 0.3846,
+ "step": 8280
+ },
+ {
+ "epoch": 3.9153664302600473,
+ "grad_norm": 3.078824043273926,
+ "learning_rate": 1.3664135561697683e-06,
+ "loss": 0.4357,
+ "step": 8281
+ },
+ {
+ "epoch": 3.915839243498818,
+ "grad_norm": 3.0174930095672607,
+ "learning_rate": 1.3658575809944313e-06,
+ "loss": 0.3643,
+ "step": 8282
+ },
+ {
+ "epoch": 3.9163120567375884,
+ "grad_norm": 2.6805408000946045,
+ "learning_rate": 1.365301676437169e-06,
+ "loss": 0.3193,
+ "step": 8283
+ },
+ {
+ "epoch": 3.916784869976359,
+ "grad_norm": 2.6996054649353027,
+ "learning_rate": 1.3647458425325966e-06,
+ "loss": 0.3378,
+ "step": 8284
+ },
+ {
+ "epoch": 3.91725768321513,
+ "grad_norm": 2.7950546741485596,
+ "learning_rate": 1.3641900793153223e-06,
+ "loss": 0.3864,
+ "step": 8285
+ },
+ {
+ "epoch": 3.917730496453901,
+ "grad_norm": 2.9658634662628174,
+ "learning_rate": 1.363634386819951e-06,
+ "loss": 0.3452,
+ "step": 8286
+ },
+ {
+ "epoch": 3.9182033096926716,
+ "grad_norm": 3.0684404373168945,
+ "learning_rate": 1.363078765081084e-06,
+ "loss": 0.3278,
+ "step": 8287
+ },
+ {
+ "epoch": 3.918676122931442,
+ "grad_norm": 3.0293614864349365,
+ "learning_rate": 1.3625232141333164e-06,
+ "loss": 0.3827,
+ "step": 8288
+ },
+ {
+ "epoch": 3.9191489361702128,
+ "grad_norm": 2.9969890117645264,
+ "learning_rate": 1.3619677340112413e-06,
+ "loss": 0.3412,
+ "step": 8289
+ },
+ {
+ "epoch": 3.9196217494089836,
+ "grad_norm": 2.991654396057129,
+ "learning_rate": 1.3614123247494457e-06,
+ "loss": 0.3683,
+ "step": 8290
+ },
+ {
+ "epoch": 3.920094562647754,
+ "grad_norm": 3.032158374786377,
+ "learning_rate": 1.360856986382511e-06,
+ "loss": 0.421,
+ "step": 8291
+ },
+ {
+ "epoch": 3.9205673758865247,
+ "grad_norm": 3.1413731575012207,
+ "learning_rate": 1.3603017189450173e-06,
+ "loss": 0.3818,
+ "step": 8292
+ },
+ {
+ "epoch": 3.9210401891252955,
+ "grad_norm": 3.295527219772339,
+ "learning_rate": 1.3597465224715387e-06,
+ "loss": 0.4828,
+ "step": 8293
+ },
+ {
+ "epoch": 3.9215130023640663,
+ "grad_norm": 3.116053581237793,
+ "learning_rate": 1.359191396996643e-06,
+ "loss": 0.4108,
+ "step": 8294
+ },
+ {
+ "epoch": 3.921985815602837,
+ "grad_norm": 2.957446336746216,
+ "learning_rate": 1.3586363425548975e-06,
+ "loss": 0.3482,
+ "step": 8295
+ },
+ {
+ "epoch": 3.9224586288416075,
+ "grad_norm": 2.745471715927124,
+ "learning_rate": 1.3580813591808627e-06,
+ "loss": 0.4184,
+ "step": 8296
+ },
+ {
+ "epoch": 3.9229314420803783,
+ "grad_norm": 3.0920722484588623,
+ "learning_rate": 1.3575264469090943e-06,
+ "loss": 0.3826,
+ "step": 8297
+ },
+ {
+ "epoch": 3.923404255319149,
+ "grad_norm": 2.8719749450683594,
+ "learning_rate": 1.3569716057741444e-06,
+ "loss": 0.3953,
+ "step": 8298
+ },
+ {
+ "epoch": 3.9238770685579194,
+ "grad_norm": 3.1278762817382812,
+ "learning_rate": 1.3564168358105597e-06,
+ "loss": 0.3658,
+ "step": 8299
+ },
+ {
+ "epoch": 3.92434988179669,
+ "grad_norm": 2.7752785682678223,
+ "learning_rate": 1.3558621370528851e-06,
+ "loss": 0.3447,
+ "step": 8300
+ },
+ {
+ "epoch": 3.924822695035461,
+ "grad_norm": 2.948575735092163,
+ "learning_rate": 1.3553075095356575e-06,
+ "loss": 0.3803,
+ "step": 8301
+ },
+ {
+ "epoch": 3.925295508274232,
+ "grad_norm": 2.8164193630218506,
+ "learning_rate": 1.354752953293413e-06,
+ "loss": 0.3724,
+ "step": 8302
+ },
+ {
+ "epoch": 3.9257683215130026,
+ "grad_norm": 3.2431271076202393,
+ "learning_rate": 1.3541984683606798e-06,
+ "loss": 0.382,
+ "step": 8303
+ },
+ {
+ "epoch": 3.926241134751773,
+ "grad_norm": 2.8485286235809326,
+ "learning_rate": 1.353644054771983e-06,
+ "loss": 0.3632,
+ "step": 8304
+ },
+ {
+ "epoch": 3.9267139479905437,
+ "grad_norm": 3.334914445877075,
+ "learning_rate": 1.3530897125618456e-06,
+ "loss": 0.5286,
+ "step": 8305
+ },
+ {
+ "epoch": 3.9271867612293145,
+ "grad_norm": 3.3895132541656494,
+ "learning_rate": 1.3525354417647815e-06,
+ "loss": 0.3838,
+ "step": 8306
+ },
+ {
+ "epoch": 3.927659574468085,
+ "grad_norm": 3.141935110092163,
+ "learning_rate": 1.351981242415305e-06,
+ "loss": 0.3928,
+ "step": 8307
+ },
+ {
+ "epoch": 3.9281323877068557,
+ "grad_norm": 3.3013596534729004,
+ "learning_rate": 1.3514271145479225e-06,
+ "loss": 0.4046,
+ "step": 8308
+ },
+ {
+ "epoch": 3.9286052009456265,
+ "grad_norm": 2.8704745769500732,
+ "learning_rate": 1.3508730581971363e-06,
+ "loss": 0.3542,
+ "step": 8309
+ },
+ {
+ "epoch": 3.9290780141843973,
+ "grad_norm": 3.179405689239502,
+ "learning_rate": 1.3503190733974472e-06,
+ "loss": 0.3911,
+ "step": 8310
+ },
+ {
+ "epoch": 3.929550827423168,
+ "grad_norm": 3.1091885566711426,
+ "learning_rate": 1.3497651601833481e-06,
+ "loss": 0.3552,
+ "step": 8311
+ },
+ {
+ "epoch": 3.9300236406619384,
+ "grad_norm": 2.687678813934326,
+ "learning_rate": 1.3492113185893288e-06,
+ "loss": 0.3462,
+ "step": 8312
+ },
+ {
+ "epoch": 3.9304964539007092,
+ "grad_norm": 3.4954965114593506,
+ "learning_rate": 1.3486575486498749e-06,
+ "loss": 0.4358,
+ "step": 8313
+ },
+ {
+ "epoch": 3.93096926713948,
+ "grad_norm": 2.8652899265289307,
+ "learning_rate": 1.3481038503994652e-06,
+ "loss": 0.3434,
+ "step": 8314
+ },
+ {
+ "epoch": 3.9314420803782504,
+ "grad_norm": 3.927623748779297,
+ "learning_rate": 1.3475502238725797e-06,
+ "loss": 0.4662,
+ "step": 8315
+ },
+ {
+ "epoch": 3.931914893617021,
+ "grad_norm": 3.1166276931762695,
+ "learning_rate": 1.346996669103687e-06,
+ "loss": 0.3953,
+ "step": 8316
+ },
+ {
+ "epoch": 3.932387706855792,
+ "grad_norm": 3.140003204345703,
+ "learning_rate": 1.346443186127257e-06,
+ "loss": 0.3616,
+ "step": 8317
+ },
+ {
+ "epoch": 3.9328605200945628,
+ "grad_norm": 3.335466146469116,
+ "learning_rate": 1.3458897749777516e-06,
+ "loss": 0.3854,
+ "step": 8318
+ },
+ {
+ "epoch": 3.9333333333333336,
+ "grad_norm": 2.8305466175079346,
+ "learning_rate": 1.3453364356896282e-06,
+ "loss": 0.374,
+ "step": 8319
+ },
+ {
+ "epoch": 3.933806146572104,
+ "grad_norm": 2.9511806964874268,
+ "learning_rate": 1.344783168297343e-06,
+ "loss": 0.4235,
+ "step": 8320
+ },
+ {
+ "epoch": 3.9342789598108747,
+ "grad_norm": 3.1868233680725098,
+ "learning_rate": 1.3442299728353448e-06,
+ "loss": 0.4384,
+ "step": 8321
+ },
+ {
+ "epoch": 3.9347517730496455,
+ "grad_norm": 3.1358237266540527,
+ "learning_rate": 1.3436768493380766e-06,
+ "loss": 0.4011,
+ "step": 8322
+ },
+ {
+ "epoch": 3.935224586288416,
+ "grad_norm": 3.126192808151245,
+ "learning_rate": 1.343123797839982e-06,
+ "loss": 0.4061,
+ "step": 8323
+ },
+ {
+ "epoch": 3.9356973995271867,
+ "grad_norm": 2.9724647998809814,
+ "learning_rate": 1.3425708183754949e-06,
+ "loss": 0.3859,
+ "step": 8324
+ },
+ {
+ "epoch": 3.9361702127659575,
+ "grad_norm": 3.1526355743408203,
+ "learning_rate": 1.3420179109790485e-06,
+ "loss": 0.3543,
+ "step": 8325
+ },
+ {
+ "epoch": 3.9366430260047283,
+ "grad_norm": 3.1289172172546387,
+ "learning_rate": 1.3414650756850695e-06,
+ "loss": 0.3836,
+ "step": 8326
+ },
+ {
+ "epoch": 3.937115839243499,
+ "grad_norm": 2.851264715194702,
+ "learning_rate": 1.34091231252798e-06,
+ "loss": 0.3294,
+ "step": 8327
+ },
+ {
+ "epoch": 3.9375886524822694,
+ "grad_norm": 2.921872138977051,
+ "learning_rate": 1.3403596215421981e-06,
+ "loss": 0.3698,
+ "step": 8328
+ },
+ {
+ "epoch": 3.93806146572104,
+ "grad_norm": 2.947258234024048,
+ "learning_rate": 1.339807002762137e-06,
+ "loss": 0.3616,
+ "step": 8329
+ },
+ {
+ "epoch": 3.938534278959811,
+ "grad_norm": 3.011021375656128,
+ "learning_rate": 1.3392544562222077e-06,
+ "loss": 0.3387,
+ "step": 8330
+ },
+ {
+ "epoch": 3.9390070921985814,
+ "grad_norm": 3.5230746269226074,
+ "learning_rate": 1.3387019819568134e-06,
+ "loss": 0.4054,
+ "step": 8331
+ },
+ {
+ "epoch": 3.939479905437352,
+ "grad_norm": 3.120321035385132,
+ "learning_rate": 1.3381495800003536e-06,
+ "loss": 0.4389,
+ "step": 8332
+ },
+ {
+ "epoch": 3.939952718676123,
+ "grad_norm": 3.0090999603271484,
+ "learning_rate": 1.3375972503872259e-06,
+ "loss": 0.4158,
+ "step": 8333
+ },
+ {
+ "epoch": 3.9404255319148938,
+ "grad_norm": 3.4807989597320557,
+ "learning_rate": 1.3370449931518198e-06,
+ "loss": 0.4144,
+ "step": 8334
+ },
+ {
+ "epoch": 3.9408983451536646,
+ "grad_norm": 2.8535733222961426,
+ "learning_rate": 1.336492808328523e-06,
+ "loss": 0.4281,
+ "step": 8335
+ },
+ {
+ "epoch": 3.941371158392435,
+ "grad_norm": 2.9032745361328125,
+ "learning_rate": 1.3359406959517174e-06,
+ "loss": 0.3389,
+ "step": 8336
+ },
+ {
+ "epoch": 3.9418439716312057,
+ "grad_norm": 2.725823163986206,
+ "learning_rate": 1.3353886560557793e-06,
+ "loss": 0.369,
+ "step": 8337
+ },
+ {
+ "epoch": 3.9423167848699765,
+ "grad_norm": 3.1965179443359375,
+ "learning_rate": 1.3348366886750844e-06,
+ "loss": 0.4031,
+ "step": 8338
+ },
+ {
+ "epoch": 3.942789598108747,
+ "grad_norm": 2.6991076469421387,
+ "learning_rate": 1.3342847938439985e-06,
+ "loss": 0.3434,
+ "step": 8339
+ },
+ {
+ "epoch": 3.9432624113475176,
+ "grad_norm": 4.491400718688965,
+ "learning_rate": 1.3337329715968877e-06,
+ "loss": 0.4175,
+ "step": 8340
+ },
+ {
+ "epoch": 3.9437352245862884,
+ "grad_norm": 4.005452632904053,
+ "learning_rate": 1.3331812219681112e-06,
+ "loss": 0.4191,
+ "step": 8341
+ },
+ {
+ "epoch": 3.9442080378250592,
+ "grad_norm": 3.1575794219970703,
+ "learning_rate": 1.3326295449920238e-06,
+ "loss": 0.4135,
+ "step": 8342
+ },
+ {
+ "epoch": 3.94468085106383,
+ "grad_norm": 3.2383973598480225,
+ "learning_rate": 1.3320779407029755e-06,
+ "loss": 0.38,
+ "step": 8343
+ },
+ {
+ "epoch": 3.9451536643026004,
+ "grad_norm": 2.873703718185425,
+ "learning_rate": 1.3315264091353119e-06,
+ "loss": 0.4128,
+ "step": 8344
+ },
+ {
+ "epoch": 3.945626477541371,
+ "grad_norm": 2.947274923324585,
+ "learning_rate": 1.330974950323376e-06,
+ "loss": 0.3342,
+ "step": 8345
+ },
+ {
+ "epoch": 3.946099290780142,
+ "grad_norm": 3.2874088287353516,
+ "learning_rate": 1.330423564301504e-06,
+ "loss": 0.3849,
+ "step": 8346
+ },
+ {
+ "epoch": 3.9465721040189123,
+ "grad_norm": 2.885772466659546,
+ "learning_rate": 1.3298722511040275e-06,
+ "loss": 0.3562,
+ "step": 8347
+ },
+ {
+ "epoch": 3.947044917257683,
+ "grad_norm": 3.0031309127807617,
+ "learning_rate": 1.3293210107652753e-06,
+ "loss": 0.3593,
+ "step": 8348
+ },
+ {
+ "epoch": 3.947517730496454,
+ "grad_norm": 2.815854549407959,
+ "learning_rate": 1.3287698433195712e-06,
+ "loss": 0.3633,
+ "step": 8349
+ },
+ {
+ "epoch": 3.9479905437352247,
+ "grad_norm": 3.228415012359619,
+ "learning_rate": 1.328218748801232e-06,
+ "loss": 0.3869,
+ "step": 8350
+ },
+ {
+ "epoch": 3.9484633569739955,
+ "grad_norm": 2.8497684001922607,
+ "learning_rate": 1.3276677272445743e-06,
+ "loss": 0.3833,
+ "step": 8351
+ },
+ {
+ "epoch": 3.948936170212766,
+ "grad_norm": 3.2330706119537354,
+ "learning_rate": 1.3271167786839057e-06,
+ "loss": 0.4414,
+ "step": 8352
+ },
+ {
+ "epoch": 3.9494089834515367,
+ "grad_norm": 3.2179152965545654,
+ "learning_rate": 1.3265659031535332e-06,
+ "loss": 0.3821,
+ "step": 8353
+ },
+ {
+ "epoch": 3.9498817966903075,
+ "grad_norm": 3.0861377716064453,
+ "learning_rate": 1.3260151006877567e-06,
+ "loss": 0.4014,
+ "step": 8354
+ },
+ {
+ "epoch": 3.950354609929078,
+ "grad_norm": 3.118872880935669,
+ "learning_rate": 1.325464371320871e-06,
+ "loss": 0.3836,
+ "step": 8355
+ },
+ {
+ "epoch": 3.9508274231678486,
+ "grad_norm": 2.9787702560424805,
+ "learning_rate": 1.32491371508717e-06,
+ "loss": 0.3794,
+ "step": 8356
+ },
+ {
+ "epoch": 3.9513002364066194,
+ "grad_norm": 3.132089853286743,
+ "learning_rate": 1.3243631320209387e-06,
+ "loss": 0.3698,
+ "step": 8357
+ },
+ {
+ "epoch": 3.9517730496453902,
+ "grad_norm": 2.5304882526397705,
+ "learning_rate": 1.32381262215646e-06,
+ "loss": 0.3687,
+ "step": 8358
+ },
+ {
+ "epoch": 3.952245862884161,
+ "grad_norm": 2.9121861457824707,
+ "learning_rate": 1.3232621855280126e-06,
+ "loss": 0.3704,
+ "step": 8359
+ },
+ {
+ "epoch": 3.9527186761229314,
+ "grad_norm": 3.0885608196258545,
+ "learning_rate": 1.3227118221698688e-06,
+ "loss": 0.4303,
+ "step": 8360
+ },
+ {
+ "epoch": 3.953191489361702,
+ "grad_norm": 2.7274837493896484,
+ "learning_rate": 1.3221615321162979e-06,
+ "loss": 0.3556,
+ "step": 8361
+ },
+ {
+ "epoch": 3.953664302600473,
+ "grad_norm": 3.1329922676086426,
+ "learning_rate": 1.3216113154015625e-06,
+ "loss": 0.4042,
+ "step": 8362
+ },
+ {
+ "epoch": 3.9541371158392433,
+ "grad_norm": 2.937380313873291,
+ "learning_rate": 1.3210611720599243e-06,
+ "loss": 0.3358,
+ "step": 8363
+ },
+ {
+ "epoch": 3.954609929078014,
+ "grad_norm": 2.939194440841675,
+ "learning_rate": 1.3205111021256378e-06,
+ "loss": 0.3885,
+ "step": 8364
+ },
+ {
+ "epoch": 3.955082742316785,
+ "grad_norm": 2.9151997566223145,
+ "learning_rate": 1.3199611056329516e-06,
+ "loss": 0.4094,
+ "step": 8365
+ },
+ {
+ "epoch": 3.9555555555555557,
+ "grad_norm": 3.029733419418335,
+ "learning_rate": 1.3194111826161143e-06,
+ "loss": 0.3999,
+ "step": 8366
+ },
+ {
+ "epoch": 3.9560283687943265,
+ "grad_norm": 2.7899951934814453,
+ "learning_rate": 1.3188613331093653e-06,
+ "loss": 0.321,
+ "step": 8367
+ },
+ {
+ "epoch": 3.956501182033097,
+ "grad_norm": 3.1109507083892822,
+ "learning_rate": 1.3183115571469425e-06,
+ "loss": 0.4266,
+ "step": 8368
+ },
+ {
+ "epoch": 3.9569739952718677,
+ "grad_norm": 3.085594415664673,
+ "learning_rate": 1.3177618547630774e-06,
+ "loss": 0.4412,
+ "step": 8369
+ },
+ {
+ "epoch": 3.9574468085106385,
+ "grad_norm": 3.0980300903320312,
+ "learning_rate": 1.3172122259919968e-06,
+ "loss": 0.3385,
+ "step": 8370
+ },
+ {
+ "epoch": 3.957919621749409,
+ "grad_norm": 3.103438138961792,
+ "learning_rate": 1.3166626708679256e-06,
+ "loss": 0.3887,
+ "step": 8371
+ },
+ {
+ "epoch": 3.9583924349881796,
+ "grad_norm": 2.8235526084899902,
+ "learning_rate": 1.3161131894250812e-06,
+ "loss": 0.3759,
+ "step": 8372
+ },
+ {
+ "epoch": 3.9588652482269504,
+ "grad_norm": 2.8316404819488525,
+ "learning_rate": 1.3155637816976762e-06,
+ "loss": 0.3666,
+ "step": 8373
+ },
+ {
+ "epoch": 3.959338061465721,
+ "grad_norm": 2.7873756885528564,
+ "learning_rate": 1.3150144477199218e-06,
+ "loss": 0.3284,
+ "step": 8374
+ },
+ {
+ "epoch": 3.959810874704492,
+ "grad_norm": 3.355039119720459,
+ "learning_rate": 1.3144651875260218e-06,
+ "loss": 0.4197,
+ "step": 8375
+ },
+ {
+ "epoch": 3.9602836879432624,
+ "grad_norm": 3.477721929550171,
+ "learning_rate": 1.3139160011501761e-06,
+ "loss": 0.3298,
+ "step": 8376
+ },
+ {
+ "epoch": 3.960756501182033,
+ "grad_norm": 3.557152032852173,
+ "learning_rate": 1.3133668886265805e-06,
+ "loss": 0.3788,
+ "step": 8377
+ },
+ {
+ "epoch": 3.961229314420804,
+ "grad_norm": 3.06707763671875,
+ "learning_rate": 1.312817849989424e-06,
+ "loss": 0.3613,
+ "step": 8378
+ },
+ {
+ "epoch": 3.9617021276595743,
+ "grad_norm": 2.7702202796936035,
+ "learning_rate": 1.3122688852728956e-06,
+ "loss": 0.402,
+ "step": 8379
+ },
+ {
+ "epoch": 3.962174940898345,
+ "grad_norm": 2.8121016025543213,
+ "learning_rate": 1.3117199945111746e-06,
+ "loss": 0.3576,
+ "step": 8380
+ },
+ {
+ "epoch": 3.962647754137116,
+ "grad_norm": 2.809282064437866,
+ "learning_rate": 1.3111711777384403e-06,
+ "loss": 0.3741,
+ "step": 8381
+ },
+ {
+ "epoch": 3.9631205673758867,
+ "grad_norm": 3.1175687313079834,
+ "learning_rate": 1.3106224349888638e-06,
+ "loss": 0.3388,
+ "step": 8382
+ },
+ {
+ "epoch": 3.963593380614657,
+ "grad_norm": 2.930525064468384,
+ "learning_rate": 1.310073766296612e-06,
+ "loss": 0.3593,
+ "step": 8383
+ },
+ {
+ "epoch": 3.964066193853428,
+ "grad_norm": 3.0673177242279053,
+ "learning_rate": 1.3095251716958501e-06,
+ "loss": 0.402,
+ "step": 8384
+ },
+ {
+ "epoch": 3.9645390070921986,
+ "grad_norm": 2.9725706577301025,
+ "learning_rate": 1.3089766512207347e-06,
+ "loss": 0.3707,
+ "step": 8385
+ },
+ {
+ "epoch": 3.965011820330969,
+ "grad_norm": 2.9790916442871094,
+ "learning_rate": 1.3084282049054218e-06,
+ "loss": 0.3292,
+ "step": 8386
+ },
+ {
+ "epoch": 3.96548463356974,
+ "grad_norm": 3.257035493850708,
+ "learning_rate": 1.3078798327840598e-06,
+ "loss": 0.3753,
+ "step": 8387
+ },
+ {
+ "epoch": 3.9659574468085106,
+ "grad_norm": 3.0534379482269287,
+ "learning_rate": 1.307331534890792e-06,
+ "loss": 0.4134,
+ "step": 8388
+ },
+ {
+ "epoch": 3.9664302600472814,
+ "grad_norm": 2.919243812561035,
+ "learning_rate": 1.306783311259761e-06,
+ "loss": 0.4283,
+ "step": 8389
+ },
+ {
+ "epoch": 3.966903073286052,
+ "grad_norm": 2.7643322944641113,
+ "learning_rate": 1.306235161925101e-06,
+ "loss": 0.3454,
+ "step": 8390
+ },
+ {
+ "epoch": 3.9673758865248225,
+ "grad_norm": 3.0208916664123535,
+ "learning_rate": 1.3056870869209431e-06,
+ "loss": 0.385,
+ "step": 8391
+ },
+ {
+ "epoch": 3.9678486997635933,
+ "grad_norm": 2.8657243251800537,
+ "learning_rate": 1.3051390862814135e-06,
+ "loss": 0.3614,
+ "step": 8392
+ },
+ {
+ "epoch": 3.968321513002364,
+ "grad_norm": 3.2093591690063477,
+ "learning_rate": 1.3045911600406325e-06,
+ "loss": 0.3774,
+ "step": 8393
+ },
+ {
+ "epoch": 3.9687943262411345,
+ "grad_norm": 3.091618537902832,
+ "learning_rate": 1.3040433082327192e-06,
+ "loss": 0.4157,
+ "step": 8394
+ },
+ {
+ "epoch": 3.9692671394799053,
+ "grad_norm": 2.99763560295105,
+ "learning_rate": 1.3034955308917849e-06,
+ "loss": 0.4017,
+ "step": 8395
+ },
+ {
+ "epoch": 3.969739952718676,
+ "grad_norm": 3.063109874725342,
+ "learning_rate": 1.3029478280519364e-06,
+ "loss": 0.4568,
+ "step": 8396
+ },
+ {
+ "epoch": 3.970212765957447,
+ "grad_norm": 3.2660679817199707,
+ "learning_rate": 1.3024001997472791e-06,
+ "loss": 0.3999,
+ "step": 8397
+ },
+ {
+ "epoch": 3.9706855791962177,
+ "grad_norm": 2.860121250152588,
+ "learning_rate": 1.3018526460119088e-06,
+ "loss": 0.433,
+ "step": 8398
+ },
+ {
+ "epoch": 3.971158392434988,
+ "grad_norm": 3.1037673950195312,
+ "learning_rate": 1.3013051668799216e-06,
+ "loss": 0.4526,
+ "step": 8399
+ },
+ {
+ "epoch": 3.971631205673759,
+ "grad_norm": 2.9408578872680664,
+ "learning_rate": 1.3007577623854053e-06,
+ "loss": 0.3722,
+ "step": 8400
+ },
+ {
+ "epoch": 3.9721040189125296,
+ "grad_norm": 3.0684635639190674,
+ "learning_rate": 1.3002104325624436e-06,
+ "loss": 0.3789,
+ "step": 8401
+ },
+ {
+ "epoch": 3.9725768321513,
+ "grad_norm": 2.6469366550445557,
+ "learning_rate": 1.2996631774451187e-06,
+ "loss": 0.3409,
+ "step": 8402
+ },
+ {
+ "epoch": 3.9730496453900708,
+ "grad_norm": 3.3741610050201416,
+ "learning_rate": 1.2991159970675033e-06,
+ "loss": 0.3544,
+ "step": 8403
+ },
+ {
+ "epoch": 3.9735224586288416,
+ "grad_norm": 3.3716588020324707,
+ "learning_rate": 1.2985688914636701e-06,
+ "loss": 0.3747,
+ "step": 8404
+ },
+ {
+ "epoch": 3.9739952718676124,
+ "grad_norm": 3.000469923019409,
+ "learning_rate": 1.2980218606676837e-06,
+ "loss": 0.4506,
+ "step": 8405
+ },
+ {
+ "epoch": 3.974468085106383,
+ "grad_norm": 3.0139408111572266,
+ "learning_rate": 1.2974749047136057e-06,
+ "loss": 0.4156,
+ "step": 8406
+ },
+ {
+ "epoch": 3.9749408983451535,
+ "grad_norm": 2.9494218826293945,
+ "learning_rate": 1.2969280236354925e-06,
+ "loss": 0.3378,
+ "step": 8407
+ },
+ {
+ "epoch": 3.9754137115839243,
+ "grad_norm": 2.6061158180236816,
+ "learning_rate": 1.2963812174673948e-06,
+ "loss": 0.3887,
+ "step": 8408
+ },
+ {
+ "epoch": 3.975886524822695,
+ "grad_norm": 2.873987913131714,
+ "learning_rate": 1.295834486243362e-06,
+ "loss": 0.3202,
+ "step": 8409
+ },
+ {
+ "epoch": 3.9763593380614655,
+ "grad_norm": 3.0106539726257324,
+ "learning_rate": 1.2952878299974358e-06,
+ "loss": 0.4142,
+ "step": 8410
+ },
+ {
+ "epoch": 3.9768321513002363,
+ "grad_norm": 3.0011982917785645,
+ "learning_rate": 1.2947412487636527e-06,
+ "loss": 0.4121,
+ "step": 8411
+ },
+ {
+ "epoch": 3.977304964539007,
+ "grad_norm": 3.1321003437042236,
+ "learning_rate": 1.294194742576048e-06,
+ "loss": 0.4033,
+ "step": 8412
+ },
+ {
+ "epoch": 3.977777777777778,
+ "grad_norm": 2.812255382537842,
+ "learning_rate": 1.2936483114686487e-06,
+ "loss": 0.3414,
+ "step": 8413
+ },
+ {
+ "epoch": 3.9782505910165487,
+ "grad_norm": 2.9594221115112305,
+ "learning_rate": 1.2931019554754804e-06,
+ "loss": 0.3666,
+ "step": 8414
+ },
+ {
+ "epoch": 3.978723404255319,
+ "grad_norm": 3.119440793991089,
+ "learning_rate": 1.2925556746305612e-06,
+ "loss": 0.3902,
+ "step": 8415
+ },
+ {
+ "epoch": 3.97919621749409,
+ "grad_norm": 3.042102098464966,
+ "learning_rate": 1.2920094689679047e-06,
+ "loss": 0.344,
+ "step": 8416
+ },
+ {
+ "epoch": 3.9796690307328606,
+ "grad_norm": 2.8443872928619385,
+ "learning_rate": 1.2914633385215225e-06,
+ "loss": 0.372,
+ "step": 8417
+ },
+ {
+ "epoch": 3.980141843971631,
+ "grad_norm": 3.483201265335083,
+ "learning_rate": 1.2909172833254187e-06,
+ "loss": 0.4028,
+ "step": 8418
+ },
+ {
+ "epoch": 3.9806146572104018,
+ "grad_norm": 2.966996431350708,
+ "learning_rate": 1.2903713034135934e-06,
+ "loss": 0.3527,
+ "step": 8419
+ },
+ {
+ "epoch": 3.9810874704491725,
+ "grad_norm": 2.7813172340393066,
+ "learning_rate": 1.2898253988200437e-06,
+ "loss": 0.3873,
+ "step": 8420
+ },
+ {
+ "epoch": 3.9815602836879433,
+ "grad_norm": 3.24611234664917,
+ "learning_rate": 1.2892795695787602e-06,
+ "loss": 0.4783,
+ "step": 8421
+ },
+ {
+ "epoch": 3.982033096926714,
+ "grad_norm": 3.345573663711548,
+ "learning_rate": 1.2887338157237289e-06,
+ "loss": 0.4179,
+ "step": 8422
+ },
+ {
+ "epoch": 3.9825059101654845,
+ "grad_norm": 3.1726880073547363,
+ "learning_rate": 1.288188137288931e-06,
+ "loss": 0.3725,
+ "step": 8423
+ },
+ {
+ "epoch": 3.9829787234042553,
+ "grad_norm": 3.398966073989868,
+ "learning_rate": 1.2876425343083449e-06,
+ "loss": 0.4117,
+ "step": 8424
+ },
+ {
+ "epoch": 3.983451536643026,
+ "grad_norm": 2.615680456161499,
+ "learning_rate": 1.2870970068159423e-06,
+ "loss": 0.324,
+ "step": 8425
+ },
+ {
+ "epoch": 3.9839243498817964,
+ "grad_norm": 3.0505547523498535,
+ "learning_rate": 1.2865515548456893e-06,
+ "loss": 0.3698,
+ "step": 8426
+ },
+ {
+ "epoch": 3.9843971631205672,
+ "grad_norm": 3.077404260635376,
+ "learning_rate": 1.2860061784315514e-06,
+ "loss": 0.3592,
+ "step": 8427
+ },
+ {
+ "epoch": 3.984869976359338,
+ "grad_norm": 2.654080390930176,
+ "learning_rate": 1.2854608776074855e-06,
+ "loss": 0.3451,
+ "step": 8428
+ },
+ {
+ "epoch": 3.985342789598109,
+ "grad_norm": 3.1023523807525635,
+ "learning_rate": 1.284915652407444e-06,
+ "loss": 0.3809,
+ "step": 8429
+ },
+ {
+ "epoch": 3.9858156028368796,
+ "grad_norm": 3.0526652336120605,
+ "learning_rate": 1.2843705028653783e-06,
+ "loss": 0.3633,
+ "step": 8430
+ },
+ {
+ "epoch": 3.98628841607565,
+ "grad_norm": 2.7829604148864746,
+ "learning_rate": 1.2838254290152296e-06,
+ "loss": 0.3213,
+ "step": 8431
+ },
+ {
+ "epoch": 3.986761229314421,
+ "grad_norm": 3.2218687534332275,
+ "learning_rate": 1.28328043089094e-06,
+ "loss": 0.465,
+ "step": 8432
+ },
+ {
+ "epoch": 3.9872340425531916,
+ "grad_norm": 2.952998161315918,
+ "learning_rate": 1.2827355085264425e-06,
+ "loss": 0.4405,
+ "step": 8433
+ },
+ {
+ "epoch": 3.987706855791962,
+ "grad_norm": 2.81211519241333,
+ "learning_rate": 1.2821906619556667e-06,
+ "loss": 0.3444,
+ "step": 8434
+ },
+ {
+ "epoch": 3.9881796690307327,
+ "grad_norm": 3.1707375049591064,
+ "learning_rate": 1.281645891212539e-06,
+ "loss": 0.4019,
+ "step": 8435
+ },
+ {
+ "epoch": 3.9886524822695035,
+ "grad_norm": 2.791504383087158,
+ "learning_rate": 1.2811011963309788e-06,
+ "loss": 0.3606,
+ "step": 8436
+ },
+ {
+ "epoch": 3.9891252955082743,
+ "grad_norm": 2.954782247543335,
+ "learning_rate": 1.280556577344903e-06,
+ "loss": 0.3141,
+ "step": 8437
+ },
+ {
+ "epoch": 3.989598108747045,
+ "grad_norm": 2.718273878097534,
+ "learning_rate": 1.2800120342882223e-06,
+ "loss": 0.3715,
+ "step": 8438
+ },
+ {
+ "epoch": 3.9900709219858155,
+ "grad_norm": 3.2916250228881836,
+ "learning_rate": 1.2794675671948425e-06,
+ "loss": 0.4048,
+ "step": 8439
+ },
+ {
+ "epoch": 3.9905437352245863,
+ "grad_norm": 3.060060977935791,
+ "learning_rate": 1.2789231760986655e-06,
+ "loss": 0.4032,
+ "step": 8440
+ },
+ {
+ "epoch": 3.991016548463357,
+ "grad_norm": 2.8467273712158203,
+ "learning_rate": 1.2783788610335882e-06,
+ "loss": 0.4041,
+ "step": 8441
+ },
+ {
+ "epoch": 3.9914893617021274,
+ "grad_norm": 3.161790132522583,
+ "learning_rate": 1.2778346220335013e-06,
+ "loss": 0.4049,
+ "step": 8442
+ },
+ {
+ "epoch": 3.9919621749408982,
+ "grad_norm": 2.6512296199798584,
+ "learning_rate": 1.277290459132295e-06,
+ "loss": 0.3598,
+ "step": 8443
+ },
+ {
+ "epoch": 3.992434988179669,
+ "grad_norm": 2.792736291885376,
+ "learning_rate": 1.276746372363849e-06,
+ "loss": 0.3874,
+ "step": 8444
+ },
+ {
+ "epoch": 3.99290780141844,
+ "grad_norm": 2.887047052383423,
+ "learning_rate": 1.2762023617620433e-06,
+ "loss": 0.4255,
+ "step": 8445
+ },
+ {
+ "epoch": 3.9933806146572106,
+ "grad_norm": 3.0420780181884766,
+ "learning_rate": 1.275658427360751e-06,
+ "loss": 0.4489,
+ "step": 8446
+ },
+ {
+ "epoch": 3.993853427895981,
+ "grad_norm": 3.107618570327759,
+ "learning_rate": 1.2751145691938383e-06,
+ "loss": 0.4354,
+ "step": 8447
+ },
+ {
+ "epoch": 3.9943262411347518,
+ "grad_norm": 2.656224250793457,
+ "learning_rate": 1.2745707872951718e-06,
+ "loss": 0.4188,
+ "step": 8448
+ },
+ {
+ "epoch": 3.9947990543735226,
+ "grad_norm": 2.9895219802856445,
+ "learning_rate": 1.2740270816986079e-06,
+ "loss": 0.391,
+ "step": 8449
+ },
+ {
+ "epoch": 3.995271867612293,
+ "grad_norm": 2.919255018234253,
+ "learning_rate": 1.2734834524380025e-06,
+ "loss": 0.4058,
+ "step": 8450
+ },
+ {
+ "epoch": 3.9957446808510637,
+ "grad_norm": 3.4418535232543945,
+ "learning_rate": 1.2729398995472048e-06,
+ "loss": 0.3977,
+ "step": 8451
+ },
+ {
+ "epoch": 3.9962174940898345,
+ "grad_norm": 2.980224132537842,
+ "learning_rate": 1.272396423060058e-06,
+ "loss": 0.4417,
+ "step": 8452
+ },
+ {
+ "epoch": 3.9966903073286053,
+ "grad_norm": 3.6488101482391357,
+ "learning_rate": 1.2718530230104043e-06,
+ "loss": 0.4472,
+ "step": 8453
+ },
+ {
+ "epoch": 3.997163120567376,
+ "grad_norm": 2.725437641143799,
+ "learning_rate": 1.2713096994320774e-06,
+ "loss": 0.3125,
+ "step": 8454
+ },
+ {
+ "epoch": 3.9976359338061465,
+ "grad_norm": 3.453794002532959,
+ "learning_rate": 1.2707664523589076e-06,
+ "loss": 0.3792,
+ "step": 8455
+ },
+ {
+ "epoch": 3.9981087470449173,
+ "grad_norm": 2.8443076610565186,
+ "learning_rate": 1.270223281824721e-06,
+ "loss": 0.3627,
+ "step": 8456
+ },
+ {
+ "epoch": 3.998581560283688,
+ "grad_norm": 3.1851959228515625,
+ "learning_rate": 1.2696801878633372e-06,
+ "loss": 0.3745,
+ "step": 8457
+ },
+ {
+ "epoch": 3.9990543735224584,
+ "grad_norm": 2.897239923477173,
+ "learning_rate": 1.2691371705085743e-06,
+ "loss": 0.3817,
+ "step": 8458
+ },
+ {
+ "epoch": 3.999527186761229,
+ "grad_norm": 2.92111873626709,
+ "learning_rate": 1.2685942297942416e-06,
+ "loss": 0.3824,
+ "step": 8459
+ },
+ {
+ "epoch": 4.0,
+ "grad_norm": 2.9540340900421143,
+ "learning_rate": 1.268051365754148e-06,
+ "loss": 0.3656,
+ "step": 8460
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 12690,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 6,
+ "save_steps": 2115,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 2.1369638945418117e+19,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}
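
The log above records one entry per optimizer step (`logging_steps: 1`) and stops at step 8460 (epoch 4.0) of 12690 max steps over 6 epochs. A minimal sketch, assuming the standard Hugging Face Trainer `trainer_state.json` layout in which the entries shown sit under a `log_history` array, for summarizing the recorded loss:

```python
import json

# Minimal sketch: summarize the per-step training loss recorded in
# checkpoint-8460/trainer_state.json (assumes the standard HF Trainer layout,
# where the entries shown above live under the "log_history" key).
with open("checkpoint-8460/trainer_state.json") as f:
    state = json.load(f)

entries = [e for e in state["log_history"] if "loss" in e]
losses = [e["loss"] for e in entries]
last = entries[-1]
print(f"logged steps: {len(entries)}")
print(f"last step {last['step']} at epoch {last['epoch']:.3f}, loss {last['loss']:.4f}")
print(f"mean loss over final 100 steps: {sum(losses[-100:]) / len(losses[-100:]):.4f}")
```
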
diff --git a/checkpoint-8460/training_args.bin b/checkpoint-8460/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3885e6a27b204c834ff6fbc6f3a5a4c7f69fe2cc
--- /dev/null
+++ b/checkpoint-8460/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:51188d498a3fe1bf9ddf234271786a31e25e89b3aa68721a97f8264e1013c9b6
+size 8056
diff --git a/checkpoint-8460/zero_to_fp32.py b/checkpoint-8460/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8
--- /dev/null
+++ b/checkpoint-8460/zero_to_fp32.py
@@ -0,0 +1,604 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from ZeRO 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    buffers: dict
+    param_shapes: dict
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict
+    frozen_param_fragments: dict
+
+
+debug = 0
+
+# load to cpu
+device = torch.device('cpu')
+
+
+def atoi(text):
+ return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+ '''
+ alist.sort(key=natural_keys) sorts in human order
+ http://nedbatchelder.com/blog/200712/human_sorting.html
+ (See Toothy's implementation in the comments)
+ '''
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+ if not os.path.isdir(checkpoint_dir):
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+ # there should be only one file
+ if zero_stage <= 2:
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+ elif zero_stage == 3:
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+ if not os.path.exists(file):
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+ return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+ # XXX: need to test that this simple glob rule works for multi-node setup too
+ ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+ if len(ckpt_files) == 0:
+ raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+ return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+ zero_model_states = []
+ for file in files:
+ state_dict = torch.load(file, map_location=device)
+
+ if BUFFER_NAMES not in state_dict:
+ raise ValueError(f"{file} is not a model state checkpoint")
+ buffer_names = state_dict[BUFFER_NAMES]
+ if debug:
+ print("Found buffers:", buffer_names)
+
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
+ buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+ param_shapes = state_dict[PARAM_SHAPES]
+
+ # collect parameters that are included in param_shapes
+ param_names = []
+ for s in param_shapes:
+ for name in s.keys():
+ param_names.append(name)
+
+ # update with frozen parameters
+ frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+ if frozen_param_shapes is not None:
+ if debug:
+ print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+ param_names += list(frozen_param_shapes.keys())
+
+ # handle shared params
+ shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+ ds_version = state_dict.get(DS_VERSION, None)
+
+ frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+ z_model_state = zero_model_state(buffers=buffers,
+ param_shapes=param_shapes,
+ shared_params=shared_params,
+ ds_version=ds_version,
+ frozen_param_shapes=frozen_param_shapes,
+ frozen_param_fragments=frozen_param_fragments)
+ zero_model_states.append(z_model_state)
+
+ return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+
+ total_files = len(files)
+ state_dicts = []
+ for f in files:
+ state_dict = torch.load(f, map_location=device)
+        # immediately discard the two potentially huge optimizer states, since we only care about the
+        # fp32 master weights, and also handle the case where they were already removed by another helper script
+ state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+ state_dicts.append(state_dict)
+
+    if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
+ # use the max of the partition_count to get the dp world_size.
+
+ if type(world_size) is list:
+ world_size = max(world_size)
+
+ if world_size != total_files:
+ raise ValueError(
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+ )
+
+ # the groups are named differently in each stage
+ if zero_stage <= 2:
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+ elif zero_stage == 3:
+ fp32_groups_key = FP32_FLAT_GROUPS
+ else:
+ raise ValueError(f"unknown zero stage {zero_stage}")
+
+ if zero_stage <= 2:
+ fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+ elif zero_stage == 3:
+ # if there is more than one param group, there will be multiple flattened tensors - one
+ # flattened tensor per group - for simplicity merge them into a single tensor
+ #
+ # XXX: could make the script more memory efficient for when there are multiple groups - it
+ # will require matching the sub-lists of param_shapes for each param group flattened tensor
+
+ fp32_flat_groups = [
+ torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+ ]
+
+ return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+ """
+ Returns fp32 state_dict reconstructed from ds checkpoint
+
+ Args:
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+ """
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+ optim_files = get_optim_files(ds_checkpoint_dir)
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+ print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+ model_files = get_model_state_files(ds_checkpoint_dir)
+
+ zero_model_states = parse_model_states(model_files)
+ print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+ if zero_stage <= 2:
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+ elif zero_stage == 3:
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+ return
+
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+ frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+ if debug:
+ num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+ print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+ wanted_params = len(frozen_param_shapes)
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+ avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+ print(f'Frozen params: Have {avail_numel} numels to process.')
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+ total_params = 0
+ total_numel = 0
+ for name, shape in frozen_param_shapes.items():
+ total_params += 1
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+
+ state_dict[name] = frozen_param_fragments[name]
+
+ if debug:
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+ attr = getattr(obj, fn, None)
+ return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+ param_shapes = zero_model_states[0].param_shapes
+
+ # Reconstruction protocol:
+ #
+ # XXX: document this
+
+ if debug:
+ for i in range(world_size):
+ for j in range(len(fp32_flat_groups[0])):
+ print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+ # XXX: memory usage doubles here (zero2)
+ num_param_groups = len(fp32_flat_groups[0])
+ merged_single_partition_of_fp32_groups = []
+ for i in range(num_param_groups):
+ merged_partitions = [sd[i] for sd in fp32_flat_groups]
+ full_single_fp32_vector = torch.cat(merged_partitions, 0)
+ merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+ avail_numel = sum(
+ [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+ if debug:
+ wanted_params = sum([len(shapes) for shapes in param_shapes])
+ wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+ # not asserting if there is a mismatch due to possible padding
+ print(f"Have {avail_numel} numels to process.")
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+ # params
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+ # out-of-core computing solution
+ total_numel = 0
+ total_params = 0
+ for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+ offset = 0
+ avail_numel = full_single_fp32_vector.numel()
+ for name, shape in shapes.items():
+
+ unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+ total_numel += unpartitioned_numel
+ total_params += 1
+
+ if debug:
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+ state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+ offset += unpartitioned_numel
+
+ # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+ # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+ # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+ # live optimizer object, so we are checking that the numbers are within the right range
+ align_to = 2 * world_size
+
+ def zero2_align(x):
+ return align_to * math.ceil(x / align_to)
+
+ if debug:
+ print(f"original offset={offset}, avail_numel={avail_numel}")
+
+ offset = zero2_align(offset)
+ avail_numel = zero2_align(avail_numel)
+
+ if debug:
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
+
+ # Sanity check
+ if offset != avail_numel:
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+ print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters):
+ state_dict = OrderedDict()
+
+ # buffers
+ buffers = zero_model_states[0].buffers
+ state_dict.update(buffers)
+ if debug:
+ print(f"added {len(buffers)} buffers")
+
+ if not exclude_frozen_parameters:
+ _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+ _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+ # recover shared parameters
+ for pair in zero_model_states[0].shared_params:
+ if pair[1] in state_dict:
+ state_dict[pair[0]] = state_dict[pair[1]]
+
+ return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+ remainder = unpartitioned_numel % world_size
+ padding_numel = (world_size - remainder) if remainder else 0
+ partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+ return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+ return
+
+ if debug:
+ for i in range(world_size):
+ num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+ print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+ wanted_params = len(frozen_param_shapes)
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+ avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+ print(f'Frozen params: Have {avail_numel} numels to process.')
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+ total_params = 0
+ total_numel = 0
+ for name, shape in zero_model_states[0].frozen_param_shapes.items():
+ total_params += 1
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+
+ param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+ state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+ if debug:
+ print(
+ f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+ )
+
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+ param_shapes = zero_model_states[0].param_shapes
+ avail_numel = fp32_flat_groups[0].numel() * world_size
+ # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+ # param, re-consolidating each param, while dealing with padding if any
+
+ # merge list of dicts, preserving order
+ param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+ if debug:
+ for i in range(world_size):
+ print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+ wanted_params = len(param_shapes)
+ wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+ # not asserting if there is a mismatch due to possible padding
+ avail_numel = fp32_flat_groups[0].numel() * world_size
+ print(f"Trainable params: Have {avail_numel} numels to process.")
+ print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+ # params
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+ # out-of-core computing solution
+ offset = 0
+ total_numel = 0
+ total_params = 0
+ for name, shape in param_shapes.items():
+
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+ total_params += 1
+
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+ if debug:
+ print(
+ f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+ )
+
+ # XXX: memory usage doubles here
+ state_dict[name] = torch.cat(
+ tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+ 0).narrow(0, 0, unpartitioned_numel).view(shape)
+ offset += partitioned_numel
+
+ offset *= world_size
+
+ # Sanity check
+ if offset != avail_numel:
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+ print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters):
+ state_dict = OrderedDict()
+
+ # buffers
+ buffers = zero_model_states[0].buffers
+ state_dict.update(buffers)
+ if debug:
+ print(f"added {len(buffers)} buffers")
+
+ if not exclude_frozen_parameters:
+ _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+ _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+ # recover shared parameters
+ for pair in zero_model_states[0].shared_params:
+ if pair[1] in state_dict:
+ state_dict[pair[0]] = state_dict[pair[1]]
+
+ return state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
+ """
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+ ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+ via a model hub.
+
+ Args:
+ - ``checkpoint_dir``: path to the desired checkpoint folder
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+ - ``exclude_frozen_parameters``: exclude frozen parameters
+
+ Returns:
+ - pytorch ``state_dict``
+
+ Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+ you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+ the checkpoint.
+
+ A typical usage might be ::
+
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+ # do the training and checkpoint saving
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+ model = model.cpu() # move to cpu
+ model.load_state_dict(state_dict)
+ # submit to model hub or save the model to share with others
+
+ In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application, i.e. you will need to re-initialize the deepspeed engine, since
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+ If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+ """
+ if tag is None:
+ latest_path = os.path.join(checkpoint_dir, 'latest')
+ if os.path.isfile(latest_path):
+ with open(latest_path, 'r') as fd:
+ tag = fd.read().strip()
+ else:
+ raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+ ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+ if not os.path.isdir(ds_checkpoint_dir):
+ raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+ return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
+ """
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+ loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+ Args:
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+ - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+ - ``exclude_frozen_parameters``: exclude frozen parameters
+ """
+
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
+ print(f"Saving fp32 state dict to {output_file}")
+ torch.save(state_dict, output_file)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+ """
+ 1. Put the provided model to cpu
+ 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+ 3. Load it into the provided model
+
+ Args:
+ - ``model``: the model object to update
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+ Returns:
+        - ``model``: modified model
+
+ Make sure you have plenty of CPU memory available before you call this function. If you don't
+ have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+ conveniently placed for you in the checkpoint folder.
+
+ A typical usage might be ::
+
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+ model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+ # submit to model hub or save the model to share with others
+
+    Note that once this has been run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application, i.e. you will need to re-initialize the deepspeed engine, since
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+ """
+ logger.info(f"Extracting fp32 weights")
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+ logger.info(f"Overwriting model with fp32 weights")
+ model = model.cpu()
+ model.load_state_dict(state_dict, strict=False)
+
+ return model
+
+
+if __name__ == "__main__":
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("checkpoint_dir",
+ type=str,
+ help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+ parser.add_argument(
+ "output_file",
+ type=str,
+ help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+ parser.add_argument("-t",
+ "--tag",
+ type=str,
+ default=None,
+ help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+ parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+ parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+ args = parser.parse_args()
+
+ debug = args.debug
+
+ convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+ args.output_file,
+ tag=args.tag,
+ exclude_frozen_parameters=args.exclude_frozen_parameters)
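
The script above consolidates ZeRO-partitioned shards back into a single fp32 `state_dict`. As a usage sketch for this repository's layout, assuming `checkpoint-8460/` contains the DeepSpeed shard files and a `latest` tag file (which is why the script was copied into that directory):

```python
# Usage sketch for the helper defined above; paths assume a local clone of this
# repository where checkpoint-8460/ holds the DeepSpeed shards and a 'latest' file.
from zero_to_fp32 import convert_zero_checkpoint_to_fp32_state_dict

convert_zero_checkpoint_to_fp32_state_dict(
    "checkpoint-8460",                         # folder containing the tag sub-folder
    "checkpoint-8460/pytorch_model_fp32.bin",  # consolidated fp32 output file
)
```

The equivalent command-line form is the one given in the script header, `python zero_to_fp32.py . pytorch_model.bin`, run from inside the checkpoint directory.
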
diff --git a/config.json b/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bc1fab23ab76663db86132a9d92014a99265286d
--- /dev/null
+++ b/config.json
@@ -0,0 +1,52 @@
+{
+ "_attn_implementation_autoset": true,
+ "_name_or_path": "meta-llama/Llama-3.1-8B",
+ "architectures": [
+ "LlamaForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "bos_token_id": 128000,
+ "eos_token_id": 128001,
+ "head_dim": 128,
+ "hidden_act": "silu",
+ "hidden_size": 4096,
+ "initializer_range": 0.02,
+ "intermediate_size": 14336,
+ "max_position_embeddings": 131072,
+ "mlp_bias": false,
+ "model_type": "llama",
+ "num_attention_heads": 32,
+ "num_hidden_layers": 32,
+ "num_key_value_heads": 8,
+ "pretraining_tp": 1,
+ "quantization_config": {
+ "_load_in_4bit": true,
+ "_load_in_8bit": false,
+ "bnb_4bit_compute_dtype": "bfloat16",
+ "bnb_4bit_quant_storage": "bfloat16",
+ "bnb_4bit_quant_type": "nf4",
+ "bnb_4bit_use_double_quant": true,
+ "llm_int8_enable_fp32_cpu_offload": false,
+ "llm_int8_has_fp16_weight": false,
+ "llm_int8_skip_modules": null,
+ "llm_int8_threshold": 6.0,
+ "load_in_4bit": true,
+ "load_in_8bit": false,
+ "quant_method": "bitsandbytes"
+ },
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": {
+ "factor": 8.0,
+ "high_freq_factor": 4.0,
+ "low_freq_factor": 1.0,
+ "original_max_position_embeddings": 8192,
+ "rope_type": "llama3"
+ },
+ "rope_theta": 500000.0,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.49.0",
+ "use_cache": false,
+ "vocab_size": 128256
+}
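
config.json mirrors the Llama-3.1-8B architecture together with the bitsandbytes NF4 4-bit settings recorded under `quantization_config`. A hedged sketch of loading the base model with the same quantization settings and attaching the adapter weights from this repository; `ADAPTER_DIR` is a placeholder path, not something defined by the repo:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

ADAPTER_DIR = "."  # placeholder: local clone of this adapter repository

# Reproduce the quantization_config recorded in config.json above.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

base = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B",
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(ADAPTER_DIR)
model = PeftModel.from_pretrained(base, ADAPTER_DIR)
```
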
diff --git a/special_tokens_map.json b/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..e5b39b6305d89284b04934011c68dbb26bf588ca
--- /dev/null
+++ b/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
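
special_tokens_map.json reuses `<|end_of_text|>` (the EOS token, id 128001 in config.json) as the padding token. A small sketch, again with a placeholder local path, that checks this after loading the tokenizer:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")  # placeholder: local clone of this repo

# Padding reuses the end-of-text token, so pad and EOS share one id (128001).
assert tok.pad_token == "<|end_of_text|>"
assert tok.pad_token_id == tok.eos_token_id == 128001
```
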
diff --git a/tokenizer.json b/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2
--- /dev/null
+++ b/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
diff --git a/tokenizer_config.json b/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..81dd14db6632ad5b35b9d447732e37ac074873a5
--- /dev/null
+++ b/tokenizer_config.json
@@ -0,0 +1,2063 @@
+{
+ "added_tokens_decoder": {
+ "128000": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128001": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128002": {
+ "content": "<|reserved_special_token_0|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128003": {
+ "content": "<|reserved_special_token_1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128004": {
+ "content": "<|finetune_right_pad_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128005": {
+ "content": "<|reserved_special_token_2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128006": {
+ "content": "<|start_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128007": {
+ "content": "<|end_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128008": {
+ "content": "<|eom_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128009": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128010": {
+ "content": "<|python_tag|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128011": {
+ "content": "<|reserved_special_token_3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128012": {
+ "content": "<|reserved_special_token_4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128013": {
+ "content": "<|reserved_special_token_5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128014": {
+ "content": "<|reserved_special_token_6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128015": {
+ "content": "<|reserved_special_token_7|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128016": {
+ "content": "<|reserved_special_token_8|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128017": {
+ "content": "<|reserved_special_token_9|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128018": {
+ "content": "<|reserved_special_token_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128019": {
+ "content": "<|reserved_special_token_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128020": {
+ "content": "<|reserved_special_token_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128021": {
+ "content": "<|reserved_special_token_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128022": {
+ "content": "<|reserved_special_token_14|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128023": {
+ "content": "<|reserved_special_token_15|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128024": {
+ "content": "<|reserved_special_token_16|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128025": {
+ "content": "<|reserved_special_token_17|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128026": {
+ "content": "<|reserved_special_token_18|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128027": {
+ "content": "<|reserved_special_token_19|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128028": {
+ "content": "<|reserved_special_token_20|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128029": {
+ "content": "<|reserved_special_token_21|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128030": {
+ "content": "<|reserved_special_token_22|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128031": {
+ "content": "<|reserved_special_token_23|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128032": {
+ "content": "<|reserved_special_token_24|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128033": {
+ "content": "<|reserved_special_token_25|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128034": {
+ "content": "<|reserved_special_token_26|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128035": {
+ "content": "<|reserved_special_token_27|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128036": {
+ "content": "<|reserved_special_token_28|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128037": {
+ "content": "<|reserved_special_token_29|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128038": {
+ "content": "<|reserved_special_token_30|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128039": {
+ "content": "<|reserved_special_token_31|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128040": {
+ "content": "<|reserved_special_token_32|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128041": {
+ "content": "<|reserved_special_token_33|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128042": {
+ "content": "<|reserved_special_token_34|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128043": {
+ "content": "<|reserved_special_token_35|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128044": {
+ "content": "<|reserved_special_token_36|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128045": {
+ "content": "<|reserved_special_token_37|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128046": {
+ "content": "<|reserved_special_token_38|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128047": {
+ "content": "<|reserved_special_token_39|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128048": {
+ "content": "<|reserved_special_token_40|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128049": {
+ "content": "<|reserved_special_token_41|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128050": {
+ "content": "<|reserved_special_token_42|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128051": {
+ "content": "<|reserved_special_token_43|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128052": {
+ "content": "<|reserved_special_token_44|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128053": {
+ "content": "<|reserved_special_token_45|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128054": {
+ "content": "<|reserved_special_token_46|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128055": {
+ "content": "<|reserved_special_token_47|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128056": {
+ "content": "<|reserved_special_token_48|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128057": {
+ "content": "<|reserved_special_token_49|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128058": {
+ "content": "<|reserved_special_token_50|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128059": {
+ "content": "<|reserved_special_token_51|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128060": {
+ "content": "<|reserved_special_token_52|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128061": {
+ "content": "<|reserved_special_token_53|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128062": {
+ "content": "<|reserved_special_token_54|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128063": {
+ "content": "<|reserved_special_token_55|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128064": {
+ "content": "<|reserved_special_token_56|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128065": {
+ "content": "<|reserved_special_token_57|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128066": {
+ "content": "<|reserved_special_token_58|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128067": {
+ "content": "<|reserved_special_token_59|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128068": {
+ "content": "<|reserved_special_token_60|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128069": {
+ "content": "<|reserved_special_token_61|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128070": {
+ "content": "<|reserved_special_token_62|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128071": {
+ "content": "<|reserved_special_token_63|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128072": {
+ "content": "<|reserved_special_token_64|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128073": {
+ "content": "<|reserved_special_token_65|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128074": {
+ "content": "<|reserved_special_token_66|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128075": {
+ "content": "<|reserved_special_token_67|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128076": {
+ "content": "<|reserved_special_token_68|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128077": {
+ "content": "<|reserved_special_token_69|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128078": {
+ "content": "<|reserved_special_token_70|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128079": {
+ "content": "<|reserved_special_token_71|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128080": {
+ "content": "<|reserved_special_token_72|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128081": {
+ "content": "<|reserved_special_token_73|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128082": {
+ "content": "<|reserved_special_token_74|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128083": {
+ "content": "<|reserved_special_token_75|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128084": {
+ "content": "<|reserved_special_token_76|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128085": {
+ "content": "<|reserved_special_token_77|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128086": {
+ "content": "<|reserved_special_token_78|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128087": {
+ "content": "<|reserved_special_token_79|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128088": {
+ "content": "<|reserved_special_token_80|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128089": {
+ "content": "<|reserved_special_token_81|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128090": {
+ "content": "<|reserved_special_token_82|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128091": {
+ "content": "<|reserved_special_token_83|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128092": {
+ "content": "<|reserved_special_token_84|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128093": {
+ "content": "<|reserved_special_token_85|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128094": {
+ "content": "<|reserved_special_token_86|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128095": {
+ "content": "<|reserved_special_token_87|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128096": {
+ "content": "<|reserved_special_token_88|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128097": {
+ "content": "<|reserved_special_token_89|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128098": {
+ "content": "<|reserved_special_token_90|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128099": {
+ "content": "<|reserved_special_token_91|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128100": {
+ "content": "<|reserved_special_token_92|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128101": {
+ "content": "<|reserved_special_token_93|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128102": {
+ "content": "<|reserved_special_token_94|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128103": {
+ "content": "<|reserved_special_token_95|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128104": {
+ "content": "<|reserved_special_token_96|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128105": {
+ "content": "<|reserved_special_token_97|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128106": {
+ "content": "<|reserved_special_token_98|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128107": {
+ "content": "<|reserved_special_token_99|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128108": {
+ "content": "<|reserved_special_token_100|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128109": {
+ "content": "<|reserved_special_token_101|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128110": {
+ "content": "<|reserved_special_token_102|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128111": {
+ "content": "<|reserved_special_token_103|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128112": {
+ "content": "<|reserved_special_token_104|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128113": {
+ "content": "<|reserved_special_token_105|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128114": {
+ "content": "<|reserved_special_token_106|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128115": {
+ "content": "<|reserved_special_token_107|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128116": {
+ "content": "<|reserved_special_token_108|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128117": {
+ "content": "<|reserved_special_token_109|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128118": {
+ "content": "<|reserved_special_token_110|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128119": {
+ "content": "<|reserved_special_token_111|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128120": {
+ "content": "<|reserved_special_token_112|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128121": {
+ "content": "<|reserved_special_token_113|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128122": {
+ "content": "<|reserved_special_token_114|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128123": {
+ "content": "<|reserved_special_token_115|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128124": {
+ "content": "<|reserved_special_token_116|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128125": {
+ "content": "<|reserved_special_token_117|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128126": {
+ "content": "<|reserved_special_token_118|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128127": {
+ "content": "<|reserved_special_token_119|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128128": {
+ "content": "<|reserved_special_token_120|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128129": {
+ "content": "<|reserved_special_token_121|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128130": {
+ "content": "<|reserved_special_token_122|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128131": {
+ "content": "<|reserved_special_token_123|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128132": {
+ "content": "<|reserved_special_token_124|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128133": {
+ "content": "<|reserved_special_token_125|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128134": {
+ "content": "<|reserved_special_token_126|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128135": {
+ "content": "<|reserved_special_token_127|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128136": {
+ "content": "<|reserved_special_token_128|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128137": {
+ "content": "<|reserved_special_token_129|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128138": {
+ "content": "<|reserved_special_token_130|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128139": {
+ "content": "<|reserved_special_token_131|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128140": {
+ "content": "<|reserved_special_token_132|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128141": {
+ "content": "<|reserved_special_token_133|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128142": {
+ "content": "<|reserved_special_token_134|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128143": {
+ "content": "<|reserved_special_token_135|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128144": {
+ "content": "<|reserved_special_token_136|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128145": {
+ "content": "<|reserved_special_token_137|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128146": {
+ "content": "<|reserved_special_token_138|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128147": {
+ "content": "<|reserved_special_token_139|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128148": {
+ "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128149": {
+ "content": "<|reserved_special_token_141|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128150": {
+ "content": "<|reserved_special_token_142|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128151": {
+ "content": "<|reserved_special_token_143|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128152": {
+ "content": "<|reserved_special_token_144|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128153": {
+ "content": "<|reserved_special_token_145|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128154": {
+ "content": "<|reserved_special_token_146|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128155": {
+ "content": "<|reserved_special_token_147|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128156": {
+ "content": "<|reserved_special_token_148|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128157": {
+ "content": "<|reserved_special_token_149|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128158": {
+ "content": "<|reserved_special_token_150|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128159": {
+ "content": "<|reserved_special_token_151|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128160": {
+ "content": "<|reserved_special_token_152|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128161": {
+ "content": "<|reserved_special_token_153|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128162": {
+ "content": "<|reserved_special_token_154|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128163": {
+ "content": "<|reserved_special_token_155|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128164": {
+ "content": "<|reserved_special_token_156|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128165": {
+ "content": "<|reserved_special_token_157|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128166": {
+ "content": "<|reserved_special_token_158|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128167": {
+ "content": "<|reserved_special_token_159|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128168": {
+ "content": "<|reserved_special_token_160|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128169": {
+ "content": "<|reserved_special_token_161|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128170": {
+ "content": "<|reserved_special_token_162|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128171": {
+ "content": "<|reserved_special_token_163|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128172": {
+ "content": "<|reserved_special_token_164|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128173": {
+ "content": "<|reserved_special_token_165|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128174": {
+ "content": "<|reserved_special_token_166|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128175": {
+ "content": "<|reserved_special_token_167|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128176": {
+ "content": "<|reserved_special_token_168|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128177": {
+ "content": "<|reserved_special_token_169|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128178": {
+ "content": "<|reserved_special_token_170|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128179": {
+ "content": "<|reserved_special_token_171|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128180": {
+ "content": "<|reserved_special_token_172|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128181": {
+ "content": "<|reserved_special_token_173|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128182": {
+ "content": "<|reserved_special_token_174|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128183": {
+ "content": "<|reserved_special_token_175|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128184": {
+ "content": "<|reserved_special_token_176|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128185": {
+ "content": "<|reserved_special_token_177|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128186": {
+ "content": "<|reserved_special_token_178|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128187": {
+ "content": "<|reserved_special_token_179|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128188": {
+ "content": "<|reserved_special_token_180|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128189": {
+ "content": "<|reserved_special_token_181|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128190": {
+ "content": "<|reserved_special_token_182|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128191": {
+ "content": "<|reserved_special_token_183|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128192": {
+ "content": "<|reserved_special_token_184|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128193": {
+ "content": "<|reserved_special_token_185|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128194": {
+ "content": "<|reserved_special_token_186|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128195": {
+ "content": "<|reserved_special_token_187|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128196": {
+ "content": "<|reserved_special_token_188|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128197": {
+ "content": "<|reserved_special_token_189|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128198": {
+ "content": "<|reserved_special_token_190|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128199": {
+ "content": "<|reserved_special_token_191|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128200": {
+ "content": "<|reserved_special_token_192|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128201": {
+ "content": "<|reserved_special_token_193|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128202": {
+ "content": "<|reserved_special_token_194|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128203": {
+ "content": "<|reserved_special_token_195|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128204": {
+ "content": "<|reserved_special_token_196|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128205": {
+ "content": "<|reserved_special_token_197|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128206": {
+ "content": "<|reserved_special_token_198|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128207": {
+ "content": "<|reserved_special_token_199|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128208": {
+ "content": "<|reserved_special_token_200|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128209": {
+ "content": "<|reserved_special_token_201|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128210": {
+ "content": "<|reserved_special_token_202|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128211": {
+ "content": "<|reserved_special_token_203|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128212": {
+ "content": "<|reserved_special_token_204|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128213": {
+ "content": "<|reserved_special_token_205|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128214": {
+ "content": "<|reserved_special_token_206|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128215": {
+ "content": "<|reserved_special_token_207|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128216": {
+ "content": "<|reserved_special_token_208|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128217": {
+ "content": "<|reserved_special_token_209|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128218": {
+ "content": "<|reserved_special_token_210|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128219": {
+ "content": "<|reserved_special_token_211|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128220": {
+ "content": "<|reserved_special_token_212|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128221": {
+ "content": "<|reserved_special_token_213|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128222": {
+ "content": "<|reserved_special_token_214|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128223": {
+ "content": "<|reserved_special_token_215|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128224": {
+ "content": "<|reserved_special_token_216|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128225": {
+ "content": "<|reserved_special_token_217|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128226": {
+ "content": "<|reserved_special_token_218|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128227": {
+ "content": "<|reserved_special_token_219|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128228": {
+ "content": "<|reserved_special_token_220|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128229": {
+ "content": "<|reserved_special_token_221|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128230": {
+ "content": "<|reserved_special_token_222|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128231": {
+ "content": "<|reserved_special_token_223|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128232": {
+ "content": "<|reserved_special_token_224|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128233": {
+ "content": "<|reserved_special_token_225|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128234": {
+ "content": "<|reserved_special_token_226|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128235": {
+ "content": "<|reserved_special_token_227|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128236": {
+ "content": "<|reserved_special_token_228|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128237": {
+ "content": "<|reserved_special_token_229|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128238": {
+ "content": "<|reserved_special_token_230|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128239": {
+ "content": "<|reserved_special_token_231|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128240": {
+ "content": "<|reserved_special_token_232|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128241": {
+ "content": "<|reserved_special_token_233|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128242": {
+ "content": "<|reserved_special_token_234|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128243": {
+ "content": "<|reserved_special_token_235|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128244": {
+ "content": "<|reserved_special_token_236|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128245": {
+ "content": "<|reserved_special_token_237|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128246": {
+ "content": "<|reserved_special_token_238|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128247": {
+ "content": "<|reserved_special_token_239|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128248": {
+ "content": "<|reserved_special_token_240|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128249": {
+ "content": "<|reserved_special_token_241|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128250": {
+ "content": "<|reserved_special_token_242|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128251": {
+ "content": "<|reserved_special_token_243|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128252": {
+ "content": "<|reserved_special_token_244|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128253": {
+ "content": "<|reserved_special_token_245|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128254": {
+ "content": "<|reserved_special_token_246|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128255": {
+ "content": "<|reserved_special_token_247|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|begin_of_text|>",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|end_of_text|>",
+ "extra_special_tokens": {},
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 131072,
+ "pad_token": "<|end_of_text|>",
+ "tokenizer_class": "PreTrainedTokenizer"
+}